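Patch the flat %cs selector of the 16-bit entry points if the current %cs
differs from the value compiled in, and retrieve the flat %ds selector from
a global variable (CallTo16_DataSelector) instead of hard-coding it in the
generated code. This should avoid problems with Win4Lin kernels.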

diff --git a/tools/winebuild/relay.c b/tools/winebuild/relay.c
index 018dc45..aa2c550 100644
--- a/tools/winebuild/relay.c
+++ b/tools/winebuild/relay.c
@@ -120,8 +120,15 @@
         fprintf( outfile, "\taddl $_GLOBAL_OFFSET_TABLE_+[.-.LCallFrom16%s.getgot1], %%ecx\n", name );
     }
 
+    if (UsePIC)
+    {
+        fprintf( outfile, "\t.byte 0x2e\n\tmovl " PREFIX "CallTo16_DataSelector@GOT(%%ecx), %%edx\n" );
+        fprintf( outfile, "\t.byte 0x2e\n\tmovl (%%edx), %%edx\n" );
+    }
+    else
+        fprintf( outfile, "\t.byte 0x2e\n\tmovl " PREFIX "CallTo16_DataSelector,%%edx\n" );
+
     /* Load 32-bit segment registers */
-    fprintf( outfile, "\tmovw $0x%04x, %%dx\n", data_selector );
 #ifdef __svr4__
     fprintf( outfile, "\tdata16\n");
 #endif
@@ -690,7 +697,7 @@
 
     /* Restore 32-bit segment registers */
 
-    fprintf( outfile, "\tmovw $0x%04x,%%di\n", data_selector );
+    fprintf( outfile, "\t.byte 0x2e\n\tmovl " PREFIX "CallTo16_DataSelector-" PREFIX "Call16_Ret_Start,%%edi\n" );
 #ifdef __svr4__
     fprintf( outfile, "\tdata16\n");
 #endif
@@ -715,9 +722,12 @@
 
     fprintf( outfile, "\tlret\n" );
 
-    /* Declare the return address variable */
+    /* Declare the return address and data selector variables */
 
-    fprintf( outfile, "\n\t.globl " PREFIX "CallTo16_RetAddr\n" );
+    fprintf( outfile, "\n\t.align 4\n" );
+    fprintf( outfile, "\t.globl " PREFIX "CallTo16_DataSelector\n" );
+    fprintf( outfile, PREFIX "CallTo16_DataSelector:\t.long 0\n" );
+    fprintf( outfile, "\t.globl " PREFIX "CallTo16_RetAddr\n" );
     fprintf( outfile, PREFIX "CallTo16_RetAddr:\t.long 0\n" );
 }