Win32 register functions can now have arguments.
Simplified relay debugging for register functions.

diff --git a/relay32/builtin32.c b/relay32/builtin32.c
index 26a15be..c82c124 100644
--- a/relay32/builtin32.c
+++ b/relay32/builtin32.c
@@ -145,6 +145,7 @@
 };
 
 extern void RELAY_CallFrom32();
+extern void RELAY_CallFrom32Regs();
 
 /***********************************************************************
  *           BUILTIN32_DoLoadImage
@@ -362,29 +363,25 @@
 	}
         switch(args)
         {
-        case 0xfe:  /* register func */
-            debug->call       = 0xe8;
-            debug->callfrom32 = (DWORD)dll->descr->functions[i] -
-                                (DWORD)&debug->ret;
-            debug->ret        = 0x90;  /* nop */
-            debug->args       = 0;
-            *funcs = (LPVOID)((BYTE *)debug - addr);
-            break;
         case 0xfd:  /* forward */
         case 0xff:  /* stub or extern */
             break;
-        default:  /* normal function (stdcall or cdecl) */
+        default:  /* normal function (stdcall or cdecl or register) */
 	    if (TRACE_ON(relay)) {
 		debug->call       = 0xe8; /* lcall relative */
-		debug->callfrom32 = (DWORD)RELAY_CallFrom32 -
-				    (DWORD)&debug->ret;
+                if (args & 0x40)  /* register func */
+                    debug->callfrom32 = (DWORD)RELAY_CallFrom32Regs -
+                        (DWORD)&debug->ret;
+                else
+                    debug->callfrom32 = (DWORD)RELAY_CallFrom32 -
+                        (DWORD)&debug->ret;
 	    } else {
 		debug->call       = 0xe9; /* ljmp relative */
 		debug->callfrom32 = (DWORD)dll->descr->functions[i] -
 				    (DWORD)&debug->ret;
 	    }
 	    debug->ret        = (args & 0x80) ? 0xc3 : 0xc2; /*ret/ret $n*/
-	    debug->args       = (args & 0x7f) * sizeof(int);
+	    debug->args       = (args & 0x3f) * sizeof(int);
             *funcs = (LPVOID)((BYTE *)debug - addr);
             break;
         }
diff --git a/relay32/kernel32.spec b/relay32/kernel32.spec
index 186c66d..d3df4e9 100644
--- a/relay32/kernel32.spec
+++ b/relay32/kernel32.spec
@@ -719,7 +719,7 @@
 698 stdcall UTRegister(long str str str ptr ptr ptr) UTRegister
 699 stdcall UTUnRegister(long) UTUnRegister
 700 stdcall UnMapLS(long) UnMapLS
-701 register UnMapSLFixArray() UnMapSLFixArray
+701 register UnMapSLFixArray(long long) UnMapSLFixArray
 702 stdcall UnhandledExceptionFilter(ptr) UnhandledExceptionFilter
 703 stdcall UninitializeCriticalSection(ptr) UninitializeCriticalSection
 704 stdcall UnlockFile(long long long long long) UnlockFile
@@ -926,3 +926,8 @@
 
 #1599 wrong ordinal (249 in Win32s's W32SCOMB.DLL) !
 1599 stdcall Get16DLLAddress(long str) Get16DLLAddress
+
+# Wine internal functions
+1600 register SNOOP_Entry() SNOOP_Entry
+1601 register SNOOP_Return() SNOOP_Return
+1602 register RELAY_CallFrom32Regs() RELAY_CallFrom32Regs
diff --git a/relay32/relay386.c b/relay32/relay386.c
index ef96419..a783218 100644
--- a/relay32/relay386.c
+++ b/relay32/relay386.c
@@ -10,9 +10,10 @@
 #include "winnt.h"
 #include "builtin32.h"
 #include "selectors.h"
+#include "stackframe.h"
 #include "debugstr.h"
-#include "debug.h"
 #include "main.h"
+#include "debugtools.h"
 
 DEFAULT_DEBUG_CHANNEL(relay)
 
@@ -59,6 +60,29 @@
   return 1;
 }
 
+
+/***********************************************************************
+ *           RELAY_PrintArgs
+ * Print nb_args ints from args, comma-separated; typemask holds 2 bits per arg, lowest bits first. */
+static inline void RELAY_PrintArgs( int *args, int nb_args, unsigned int typemask )
+{
+    while (nb_args--)  /* after the decrement, nb_args is the count still left to print */
+    {
+	if ((typemask & 3) && HIWORD(*args))  /* typed arg; presumably HIWORD != 0 means a real pointer - confirm */
+        {
+	    if (typemask & 2)  /* bit 1 set: value is printed as a wide string */
+	    	DPRINTF( "%08x L%s", *args, debugstr_w((LPWSTR)*args) );
+            else  /* only bit 0 set: value is printed as a narrow string */
+	    	DPRINTF( "%08x %s", *args, debugstr_a((LPCSTR)*args) );
+	}
+        else DPRINTF( "%08x", *args );  /* untyped (or high word zero): raw hex only */
+        if (nb_args) DPRINTF( "," );  /* separator between args, none after the last */
+        args++;
+        typemask >>= 2;  /* move the next arg's 2-bit type field into the low bits */
+    }
+}
+
+
 /***********************************************************************
  *           RELAY_CallFrom32
  *
@@ -71,36 +95,23 @@
  */
 int RELAY_CallFrom32( int ret_addr, ... )
 {
-    int i, ret;
+    int ret;
     char buffer[80];
+    unsigned int typemask;
     FARPROC func;
-    unsigned int mask, typemask;
     WORD fs;
 
-    int *args = &ret_addr;
+    int *args = &ret_addr + 1;
     /* Relay addr is the return address for this function */
-    BYTE *relay_addr = (BYTE *)args[-1];
+    BYTE *relay_addr = (BYTE *)__builtin_return_address(0);
     WORD nb_args = *(WORD *)(relay_addr + 1) / sizeof(int);
 
     assert(TRACE_ON(relay));
-    func = (FARPROC)BUILTIN32_GetEntryPoint( buffer, relay_addr - 5,
-                                               &typemask );
-      DPRINTF( "Call %s(", buffer );
-      args++;
-      for (i = 0, mask = 3; i < nb_args; i++, mask <<= 2)
-      {
-        if (i) DPRINTF( "," );
-	if ((typemask & mask) && HIWORD(args[i]))
-        {
-	    if (typemask & (2<<(2*i)))
-	    	DPRINTF( "%08x L%s", args[i], debugstr_w((LPWSTR)args[i]) );
-            else
-	    	DPRINTF( "%08x %s", args[i], debugstr_a((LPCSTR)args[i]) );
-	}
-        else DPRINTF( "%08x", args[i] );
-      }
-      GET_FS( fs );
-      DPRINTF( ") ret=%08x fs=%04x\n", ret_addr, fs );
+    func = (FARPROC)BUILTIN32_GetEntryPoint( buffer, relay_addr - 5, &typemask );
+    DPRINTF( "Call %s(", buffer );
+    RELAY_PrintArgs( args, nb_args, typemask );
+    GET_FS( fs );
+    DPRINTF( ") ret=%08x fs=%04x\n", ret_addr, fs );
 
     if (*relay_addr == 0xc3) /* cdecl */
     {
@@ -141,8 +152,7 @@
                              args[6],args[7],args[8],args[9],args[10],args[11],
                              args[12],args[13],args[14],args[15]); break;
         default:
-            ERR(relay, "Unsupported nb args %d\n",
-                     nb_args );
+            ERR( "Unsupported nb of args %d\n", nb_args );
             assert(FALSE);
         }
     }
@@ -184,7 +194,7 @@
                             args[6],args[7],args[8],args[9],args[10],args[11],
                             args[12],args[13],args[14],args[15]); break;
         default:
-            ERR(relay, "Unsupported nb args %d\n",nb_args );
+            ERR( "Unsupported nb of args %d\n", nb_args );
             assert(FALSE);
         }
     }
@@ -197,101 +207,94 @@
 /***********************************************************************
  *           RELAY_CallFrom32Regs
  *
- * 'context' contains the register contents at the point of call of
- * the REG_ENTRY_POINT. The stack layout of the stack pointed to by
- * ESP_reg(&context) is as follows:
+ * Stack layout (esp is ESP_reg(context), not the current %esp):
  *
- * If debugmsg(relay) is OFF:
- *  ...    ...
- * (esp+4) args
+ * ...
+ * (esp+4) first arg
  * (esp)   return addr to caller
- * (esp-4) function entry point
- *
- * If debugmsg(relay) is ON:
- *  ...    ...
- * (esp+8) args
- * (esp+4) return addr to caller
- * (esp)   return addr to DEBUG_ENTRY_POINT
- * (esp-4) function entry point
- *
- * As the called function might change the stack layout
- * (e.g. FT_Prolog, FT_ExitNN), we remove all modifications to the stack,
- * so that the called function sees (in both cases):
- *
- *  ...    ...
- * (esp+4) args
- * (esp)   return addr to caller
+ * (esp-4) return addr to DEBUG_ENTRY_POINT
+ * (esp-8) ptr to relay entry code for RELAY_CallFrom32Regs
  *  ...    >128 bytes space free to be modified (ensured by the assembly glue)
- *
- * NOTE: This routine makes no assumption about the relative position of
- *       its own stack to the stack pointed to by ESP_reg(&context),
- *	 except that the latter must have >128 bytes space to grow.
- *	 This means the assembly glue could even switch stacks completely
- *	 (e.g. to allow for large stacks).
- *
  */
 
-void RELAY_CallFrom32Regs( CONTEXT context )
+void WINAPI REGS_FUNC(RELAY_CallFrom32Regs)( CONTEXT *context )
 {
-    typedef void (CALLBACK *entry_point_t)(CONTEXT *);
-    entry_point_t entry_point = *(entry_point_t*) (ESP_reg(&context) - 4);
+    unsigned int typemask;
+    char buffer[80];
+    int* args;
+    FARPROC func;
+    BYTE *entry_point;
 
-    __RESTORE_ES;
+    BYTE *relay_addr = *((BYTE **)ESP_reg(context) - 1);
+    WORD nb_args = *(WORD *)(relay_addr + 1) / sizeof(int);
 
-    if (!TRACE_ON(relay))
+    /* remove extra stuff from the stack */
+    EIP_reg(context) = STACK32_POP(context);
+    args = (int *)ESP_reg(context);
+    ESP_reg(context) += 4 * nb_args;
+
+    assert(TRACE_ON(relay));
+
+    entry_point = (BYTE *)BUILTIN32_GetEntryPoint( buffer, relay_addr - 5, &typemask );
+    assert( *entry_point == 0xe8 /* lcall */ );
+    func = *(FARPROC *)(entry_point + 5);
+
+    DPRINTF( "Call %s(", buffer );
+    RELAY_PrintArgs( args, nb_args, typemask );
+    DPRINTF( ") ret=%08lx fs=%04lx\n", EIP_reg(context), FS_reg(context) );
+
+    DPRINTF(" eax=%08lx ebx=%08lx ecx=%08lx edx=%08lx esi=%08lx edi=%08lx\n",
+            EAX_reg(context), EBX_reg(context), ECX_reg(context),
+            EDX_reg(context), ESI_reg(context), EDI_reg(context) );
+    DPRINTF(" ebp=%08lx esp=%08lx ds=%04lx es=%04lx gs=%04lx flags=%08lx\n",
+            EBP_reg(context), ESP_reg(context), DS_reg(context),
+            ES_reg(context), GS_reg(context), EFL_reg(context) );
+
+    /* Now call the real function */
+    switch(nb_args)
     {
-        /* Simply call the entry point */
-        entry_point( &context );
+    case 0: func(context); break;
+    case 1: func(args[0],context); break;
+    case 2: func(args[0],args[1],context); break;
+    case 3: func(args[0],args[1],args[2],context); break;
+    case 4: func(args[0],args[1],args[2],args[3],context); break;
+    case 5: func(args[0],args[1],args[2],args[3],args[4],context); break;
+    case 6: func(args[0],args[1],args[2],args[3],args[4],args[5],context); break;
+    case 7: func(args[0],args[1],args[2],args[3],args[4],args[5],args[6],context); break;
+    case 8: func(args[0],args[1],args[2],args[3],args[4],args[5],args[6],args[7],context); break;
+    case 9: func(args[0],args[1],args[2],args[3],args[4],args[5],args[6],args[7],args[8],
+                 context); break;
+    case 10: func(args[0],args[1],args[2],args[3],args[4],args[5],args[6],args[7],args[8],
+                  args[9],context); break;
+    case 11: func(args[0],args[1],args[2],args[3],args[4],args[5],args[6],args[7],args[8],
+                  args[9],args[10],context); break;
+    case 12: func(args[0],args[1],args[2],args[3],args[4],args[5],args[6],args[7],args[8],
+                  args[9],args[10],args[11],context); break;
+    case 13: func(args[0],args[1],args[2],args[3],args[4],args[5],args[6],args[7],args[8],
+                  args[9],args[10],args[11],args[12],context); break;
+    case 14: func(args[0],args[1],args[2],args[3],args[4],args[5],args[6],args[7],args[8],
+                  args[9],args[10],args[11],args[12],args[13],context); break;
+    case 15: func(args[0],args[1],args[2],args[3],args[4],args[5],args[6],args[7],args[8],
+                  args[9],args[10],args[11],args[12],args[13],args[14],context); break;
+    case 16: func(args[0],args[1],args[2],args[3],args[4],args[5], args[6],args[7],args[8],
+                  args[9],args[10],args[11],args[12],args[13],args[14],args[15],context); break;
+    default:
+        ERR( "Unsupported nb of args %d\n", nb_args );
+        assert(FALSE);
     }
-    else
-    {
-        char buffer[80];
-        unsigned int typemask;
-	BYTE *relay_addr;
 
-        /*
-	 * Fixup the context structure because of the extra parameter
-         * pushed by the relay debugging code.
-	 * Note that this implicitly does a RET on the CALL from the
-	 * DEBUG_ENTRY_POINT to the REG_ENTRY_POINT;  setting the EIP register
-	 * ensures that the assembly glue will directly return to the
-	 * caller, just as in the non-debugging case.
-	 */
-
-        relay_addr = *(BYTE **) ESP_reg(&context); 
-        if (BUILTIN32_GetEntryPoint( buffer, relay_addr - 5, &typemask )) {
-	    /* correct win32 spec generated register function found. 
-	     * remove extra call stuff from stack
-	     */
-            ESP_reg(&context) += sizeof(BYTE *);
-	    EIP_reg(&context) = *(DWORD *)ESP_reg(&context);
-	    DPRINTF("Call %s(regs) ret=%08x\n", buffer, *(int *)ESP_reg(&context) );
-	    DPRINTF(" EAX=%08lx EBX=%08lx ECX=%08lx EDX=%08lx ESI=%08lx EDI=%08lx\n",
-		    EAX_reg(&context), EBX_reg(&context), ECX_reg(&context),
-		    EDX_reg(&context), ESI_reg(&context), EDI_reg(&context) );
-	    DPRINTF(" EBP=%08lx ESP=%08lx EIP=%08lx DS=%04lx ES=%04lx FS=%04lx GS=%04lx EFL=%08lx\n",
-		    EBP_reg(&context), ESP_reg(&context), EIP_reg(&context),
-		    DS_reg(&context), ES_reg(&context), FS_reg(&context),
-		    GS_reg(&context), EFL_reg(&context) );
-
-	    /* Now call the real function */
-	    entry_point( &context );
-
-
-	    DPRINTF("Ret  %s() retval=regs ret=%08x\n", buffer, *(int *)ESP_reg(&context) );
-	    DPRINTF(" EAX=%08lx EBX=%08lx ECX=%08lx EDX=%08lx ESI=%08lx EDI=%08lx\n",
-		    EAX_reg(&context), EBX_reg(&context), ECX_reg(&context),
-		    EDX_reg(&context), ESI_reg(&context), EDI_reg(&context) );
-	    DPRINTF(" EBP=%08lx ESP=%08lx EIP=%08lx DS=%04lx ES=%04lx FS=%04lx GS=%04lx EFL=%08lx\n",
-		    EBP_reg(&context), ESP_reg(&context), EIP_reg(&context),
-		    DS_reg(&context), ES_reg(&context), FS_reg(&context),
-		    GS_reg(&context), EFL_reg(&context) );
-	} else
-	    /* WINE internal register function found. Do not remove anything.
-	     * Do not print any debuginfo (it is not a normal relayed one).
-	     * Currently only used for snooping.
-	     */
-	   entry_point( &context );
-    }
+    DPRINTF( "Ret  %s() retval=%08lx ret=%08lx fs=%04lx\n",
+             buffer, EAX_reg(context), EIP_reg(context), FS_reg(context) );
+    DPRINTF(" eax=%08lx ebx=%08lx ecx=%08lx edx=%08lx esi=%08lx edi=%08lx\n",
+            EAX_reg(context), EBX_reg(context), ECX_reg(context),
+            EDX_reg(context), ESI_reg(context), EDI_reg(context) );
+    DPRINTF(" ebp=%08lx esp=%08lx ds=%04lx es=%04lx gs=%04lx flags=%08lx\n",
+            EBP_reg(context), ESP_reg(context), DS_reg(context),
+            ES_reg(context), GS_reg(context), EFL_reg(context) );
 }
+
+#else  /* __i386__ */
+
+REGS_ENTRYPOINT(RELAY_CallFrom32Regs) { }
+
 #endif  /* __i386__ */
diff --git a/relay32/snoop.c b/relay32/snoop.c
index 7950984..afc7392 100644
--- a/relay32/snoop.c
+++ b/relay32/snoop.c
@@ -23,6 +23,9 @@
 
 char **debug_snoop_excludelist = NULL, **debug_snoop_includelist = NULL;
 
+extern void SNOOP_Entry();
+extern void SNOOP_Return();
+
 #ifdef __i386__
 
 #ifdef NEED_UNDERSCORE_PREFIX
@@ -31,27 +34,6 @@
 # define PREFIX
 #endif
 
-/* Well, not exactly extern since they are in the same file (in the lines
- * below). But the C Compiler doesn't see them there, so we have to help a bit.
- */
-extern void SNOOP_Return();
-extern void SNOOP_Entry();
-__asm__(".align 4\n\t"
-        ".globl "PREFIX"SNOOP_Entry\n\t"
-        ".type "PREFIX"SNOOP_Entry,@function\n\t"
-        PREFIX"SNOOP_Entry:\n\t"
-        "pushl $"PREFIX"__regs_SNOOP_Entry\n\t"
-        "pushl $"PREFIX"CALL32_Regs\n\t"
-        "ret\n\t"
-	".align 4\n\t"
-        ".globl "PREFIX"SNOOP_Return\n\t"
-        ".type "PREFIX"SNOOP_Return,@function\n\t"
-        PREFIX"SNOOP_Return:\n\t"
-        "pushl $"PREFIX"__regs_SNOOP_Return\n\t"
-        "pushl $"PREFIX"CALL32_Regs\n\t"
-        "ret"
-);
-
 #include "pshpack1.h"
 
 typedef	struct tagSNOOP_FUN {
@@ -267,8 +249,9 @@
 	return buf;
 }
 
-#define CALLER1REF (*(DWORD*)(ESP_reg(context)+4))
-REGS_ENTRYPOINT(SNOOP_Entry) {
+#define CALLER1REF (*(DWORD*)ESP_reg(context))
+void WINAPI REGS_FUNC(SNOOP_Entry)( CONTEXT *context )
+{
 	DWORD		ordinal=0,entry = EIP_reg(context)-5;
 	SNOOP_DLL	*dll = firstdll;
 	SNOOP_FUN	*fun = NULL;
@@ -336,18 +319,19 @@
 	if (fun->nrofargs>0) {
 		max = fun->nrofargs; if (max>16) max=16;
 		for (i=0;i<max;i++)
-			DPRINTF("%s%s",SNOOP_PrintArg(*(DWORD*)(ESP_reg(context)+8+sizeof(DWORD)*i)),(i<fun->nrofargs-1)?",":"");
+			DPRINTF("%s%s",SNOOP_PrintArg(*(DWORD*)(ESP_reg(context)+4+sizeof(DWORD)*i)),(i<fun->nrofargs-1)?",":"");
 		if (max!=fun->nrofargs)
 			DPRINTF(" ...");
 	} else if (fun->nrofargs<0) {
 		DPRINTF("<unknown, check return>");
 		ret->args = HeapAlloc(SystemHeap,0,16*sizeof(DWORD));
-		memcpy(ret->args,(LPBYTE)(ESP_reg(context)+8),sizeof(DWORD)*16);
+		memcpy(ret->args,(LPBYTE)(ESP_reg(context)+4),sizeof(DWORD)*16);
 	}
 	DPRINTF(") ret=%08lx fs=%04lx\n",(DWORD)ret->origreturn,FS_reg(context));
 }
 
-REGS_ENTRYPOINT(SNOOP_Return) {
+void WINAPI REGS_FUNC(SNOOP_Return)( CONTEXT *context )
+{
 	SNOOP_RETURNENTRY	*ret = (SNOOP_RETURNENTRY*)(EIP_reg(context)-5);
 
 	/* We haven't found out the nrofargs yet. If we called a cdecl
@@ -388,4 +372,8 @@
 FARPROC SNOOP_GetProcAddress(HMODULE hmod,LPCSTR name,DWORD ordinal,FARPROC origfun) {
 	return origfun;
 }
+
+REGS_ENTRYPOINT(SNOOP_Entry) { }
+REGS_ENTRYPOINT(SNOOP_Return) { }
+
 #endif	/* !__i386__ */
diff --git a/tools/build.c b/tools/build.c
index 1971212..4ea11ba 100644
--- a/tools/build.c
+++ b/tools/build.c
@@ -458,12 +458,6 @@
     odp->u.func.arg_types[i] = '\0';
     if ((odp->type == TYPE_STDCALL) && !i)
         odp->type = TYPE_CDECL; /* stdcall is the same as cdecl for 0 args */
-    if ((odp->type == TYPE_REGISTER) && (SpecType == SPEC_WIN32) && i)
-    {
-        fprintf( stderr, "%s:%d: register functions cannot have arguments in Win32\n",
-                 SpecName, Line );
-        return -1;
-    }
     strcpy(odp->u.func.link_name, GetToken());
     return 0;
 }
@@ -1074,11 +1068,13 @@
                  "        \".globl " PREFIX "%s\\n\\t\"\n"
                  "        \".type " PREFIX "%s,@function\\n\\t\"\n"
                  "        \"" PREFIX "%s:\\n\\t\"\n"
-                 "        \"pushl $" PREFIX "__regs_%s\\n\\t\"\n"
-                 "        \"pushl $" PREFIX "CALL32_Regs\\n\\t\"\n"
-                 "        \"ret\");\n",
+                 "        \"call " PREFIX "CALL32_Regs\\n\\t\"\n"
+                 "        \".long " PREFIX "__regs_%s\\n\\t\"\n"
+                 "        \".byte %d,%d\");\n",
                  odp->u.func.link_name, odp->u.func.link_name,
-                 odp->u.func.link_name, odp->u.func.link_name );
+                 odp->u.func.link_name, odp->u.func.link_name,
+                 4 * strlen(odp->u.func.arg_types),
+                 4 * strlen(odp->u.func.arg_types) );
     }
     fprintf( outfile, "#ifndef __GNUC__\n" );
     fprintf( outfile, "}\n" );
@@ -1173,7 +1169,8 @@
     for (i = Base, odp = OrdinalDefinitions + Base; i <= Limit; i++, odp++)
     {
     	unsigned int j, mask = 0;
-	if ((odp->type == TYPE_STDCALL) || (odp->type == TYPE_CDECL))
+	if ((odp->type == TYPE_STDCALL) || (odp->type == TYPE_CDECL) ||
+            (odp->type == TYPE_REGISTER))
 	    for (j = 0; odp->u.func.arg_types[j]; j++)
             {
                 if (odp->u.func.arg_types[j] == 't') mask |= 1<< (j*2);
@@ -1211,12 +1208,12 @@
         case TYPE_CDECL:
             args = 0x80 | (unsigned char)strlen(odp->u.func.arg_types);
             break;
+        case TYPE_REGISTER:
+            args = 0x40 | (unsigned char)strlen(odp->u.func.arg_types);
+            break;
         case TYPE_FORWARD:
             args = 0xfd;
             break;
-        case TYPE_REGISTER:
-            args = 0xfe;
-            break;
         default:
             args = 0xff;
             break;
@@ -2542,15 +2539,25 @@
  * 'args' is the number of dword arguments.
  *
  * Stack layout:
- *   ...     ...
- * (esp+336) ret addr (or relay addr when debugging(relay) is on)
- * (esp+332) entry point
- * (esp+204) buffer area to allow stack frame manipulation
- * (esp+0)   CONTEXT struct
+ *   ...
+ * (ebp+12)  first arg
+ * (ebp+8)   ret addr to user code
+ * (ebp+4)   ret addr to relay code
+ * (ebp+0)   saved ebp
+ * (ebp-128) buffer area to allow stack frame manipulation
+ * (ebp-332) CONTEXT struct
+ * (ebp-336) CONTEXT *argument
+ *  ....     other arguments copied from (ebp+12)
+ *
+ * The entry point routine is called with a CONTEXT* extra argument,
+ * following the normal args. In this context structure, EIP_reg
+ * contains the return address to user code, and ESP_reg the stack
+ * pointer on return (with the return address and arguments already
+ * removed).
  */
 static void BuildCallFrom32Regs( FILE *outfile )
 {
-#define STACK_SPACE 128
+    static const int STACK_SPACE = 128 + sizeof(CONTEXT);
 
     /* Function header */
 
@@ -2562,85 +2569,102 @@
     fprintf( outfile, PREFIX "CALL32_Regs:\n" );
 
     /* Allocate some buffer space on the stack */
-   
+
+    fprintf( outfile, "\tpushl %%ebp\n" );
+    fprintf( outfile, "\tmovl %%esp,%%ebp\n ");
     fprintf( outfile, "\tleal -%d(%%esp), %%esp\n", STACK_SPACE );
     
     /* Build the context structure */
 
-    fprintf( outfile, "\tpushw $0\n" );
-    fprintf( outfile, "\t.byte 0x66\n\tpushl %%ss\n" );
-    fprintf( outfile, "\tpushl %%eax\n" );  /* %esp place holder */
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(Eax) - STACK_SPACE );
     fprintf( outfile, "\tpushfl\n" );
-    fprintf( outfile, "\tpushw $0\n" );
-    fprintf( outfile, "\t.byte 0x66\n\tpushl %%cs\n" );
-    fprintf( outfile, "\tpushl %d(%%esp)\n", 16+STACK_SPACE+4 );  /* %eip at time of call */
-    fprintf( outfile, "\tpushl %%ebp\n" );
-
-    fprintf( outfile, "\tpushl %%eax\n" );
-    fprintf( outfile, "\tpushl %%ecx\n" );
-    fprintf( outfile, "\tpushl %%edx\n" );
-    fprintf( outfile, "\tpushl %%ebx\n" );
-    fprintf( outfile, "\tpushl %%esi\n" );
-    fprintf( outfile, "\tpushl %%edi\n" );
+    fprintf( outfile, "\tpopl %%eax\n" );
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(EFlags) - STACK_SPACE );
+    fprintf( outfile, "\tmovl 0(%%ebp),%%eax\n" );
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(Ebp) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %%ebx,%d(%%ebp)\n", CONTEXTOFFSET(Ebx) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %%ecx,%d(%%ebp)\n", CONTEXTOFFSET(Ecx) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %%edx,%d(%%ebp)\n", CONTEXTOFFSET(Edx) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %%esi,%d(%%ebp)\n", CONTEXTOFFSET(Esi) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %%edi,%d(%%ebp)\n", CONTEXTOFFSET(Edi) - STACK_SPACE );
 
     fprintf( outfile, "\txorl %%eax,%%eax\n" );
-    fprintf( outfile, "\tmovw %%ds,%%ax\n" );
-    fprintf( outfile, "\tpushl %%eax\n" );
+    fprintf( outfile, "\tmovw %%cs,%%ax\n" );
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(SegCs) - STACK_SPACE );
     fprintf( outfile, "\tmovw %%es,%%ax\n" );
-    fprintf( outfile, "\tpushl %%eax\n" );
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(SegEs) - STACK_SPACE );
     fprintf( outfile, "\tmovw %%fs,%%ax\n" );
-    fprintf( outfile, "\tpushl %%eax\n" );
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(SegFs) - STACK_SPACE );
     fprintf( outfile, "\tmovw %%gs,%%ax\n" );
-    fprintf( outfile, "\tpushl %%eax\n" );
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(SegGs) - STACK_SPACE );
+    fprintf( outfile, "\tmovw %%ss,%%ax\n" );
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(SegSs) - STACK_SPACE );
+    fprintf( outfile, "\tmovw %%ds,%%ax\n" );
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(SegDs) - STACK_SPACE );
+    fprintf( outfile, "\tmovw %%ax,%%es\n" );  /* set %es equal to %ds just in case */
 
-    fprintf( outfile, "\tleal -%d(%%esp),%%esp\n",
-             sizeof(FLOATING_SAVE_AREA) + 6 * sizeof(DWORD) /* DR regs */ );
-    fprintf( outfile, "\tpushl $0x0001001f\n" );  /* ContextFlags */
+    fprintf( outfile, "\tmovl $0x%x,%%eax\n", CONTEXT_FULL );
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(ContextFlags) - STACK_SPACE );
 
-    fprintf( outfile, "\tfsave %d(%%esp)\n", CONTEXTOFFSET(FloatSave) );
+    fprintf( outfile, "\tmovl 8(%%ebp),%%eax\n" ); /* Get %eip at time of call */
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(Eip) - STACK_SPACE );
 
-    fprintf( outfile, "\tleal %d(%%esp),%%eax\n",
-             sizeof(CONTEXT) + STACK_SPACE + 4 ); /* %esp at time of call */
-    fprintf( outfile, "\tmovl %%eax,%d(%%esp)\n", CONTEXTOFFSET(Esp) );
+    /* Transfer the arguments */
 
-    fprintf( outfile, "\tcall " PREFIX "RELAY_CallFrom32Regs\n" );
+    fprintf( outfile, "\tmovl 4(%%ebp),%%ebx\n" );   /* get relay code addr */
+    fprintf( outfile, "\tpushl %%esp\n" );           /* push ptr to context struct */
+    fprintf( outfile, "\tmovzbl 4(%%ebx),%%ecx\n" ); /* fetch number of args to copy */
+    fprintf( outfile, "\tjecxz 1f\n" );
+    fprintf( outfile, "\tsubl %%ecx,%%esp\n" );
+    fprintf( outfile, "\tleal 12(%%ebp),%%esi\n" );  /* get %esp at time of call */
+    fprintf( outfile, "\tmovl %%esp,%%edi\n" );
+    fprintf( outfile, "\tshrl $2,%%ecx\n" );
+    fprintf( outfile, "\tcld\n" );
+    fprintf( outfile, "\trep\n\tmovsl\n" );  /* copy args */
+
+    fprintf( outfile, "1:\tmovzbl 5(%%ebx),%%eax\n" ); /* fetch number of args to remove */
+    fprintf( outfile, "\tleal 12(%%ebp,%%eax),%%eax\n" );
+    fprintf( outfile, "\tmovl %%eax,%d(%%ebp)\n", CONTEXTOFFSET(Esp) - STACK_SPACE );
+
+    /* Call the entry point */
+
+    fprintf( outfile, "\tcall *0(%%ebx)\n" );
+
+    /* Store %eip and %ebp onto the new stack */
+
+    fprintf( outfile, "\tmovl %d(%%ebp),%%edx\n", CONTEXTOFFSET(Esp) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %d(%%ebp),%%eax\n", CONTEXTOFFSET(Eip) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %%eax,-4(%%edx)\n" );
+    fprintf( outfile, "\tmovl %d(%%ebp),%%eax\n", CONTEXTOFFSET(Ebp) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %%eax,-8(%%edx)\n" );
 
     /* Restore the context structure */
 
-    fprintf( outfile, "\tfrstor %d(%%esp)\n", CONTEXTOFFSET(FloatSave) );
-
-    /* Store %eip value onto the new stack */
-
-    fprintf( outfile, "\tmovl %d(%%esp),%%eax\n", CONTEXTOFFSET(Eip) );
-    fprintf( outfile, "\tmovl %d(%%esp),%%ebx\n", CONTEXTOFFSET(Esp) );
-    fprintf( outfile, "\tmovl %%eax,0(%%ebx)\n" );
-
-    /* Restore all registers */
-
-    fprintf( outfile, "\tleal %d(%%esp),%%esp\n",
-             sizeof(FLOATING_SAVE_AREA) + 7 * sizeof(DWORD) );
-    fprintf( outfile, "\tpopl %%eax\n" );
-    fprintf( outfile, "\tmovw %%ax,%%gs\n" );
-    fprintf( outfile, "\tpopl %%eax\n" );
-    fprintf( outfile, "\tmovw %%ax,%%fs\n" );
-    fprintf( outfile, "\tpopl %%eax\n" );
+    /* Note: we don't bother to restore %cs, %ds and %ss;
+     *       changing them in 32-bit code is a recipe for disaster anyway.
+     */
+    fprintf( outfile, "\tmovl %d(%%ebp),%%eax\n", CONTEXTOFFSET(SegEs) - STACK_SPACE );
     fprintf( outfile, "\tmovw %%ax,%%es\n" );
-    fprintf( outfile, "\tpopl %%eax\n" );
-    fprintf( outfile, "\tmovw %%ax,%%ds\n" );
+    fprintf( outfile, "\tmovl %d(%%ebp),%%eax\n", CONTEXTOFFSET(SegFs) - STACK_SPACE );
+    fprintf( outfile, "\tmovw %%ax,%%fs\n" );
+    fprintf( outfile, "\tmovl %d(%%ebp),%%eax\n", CONTEXTOFFSET(SegGs) - STACK_SPACE );
+    fprintf( outfile, "\tmovw %%ax,%%gs\n" );
 
-    fprintf( outfile, "\tpopl %%edi\n" );
-    fprintf( outfile, "\tpopl %%esi\n" );
-    fprintf( outfile, "\tpopl %%ebx\n" );
-    fprintf( outfile, "\tpopl %%edx\n" );
-    fprintf( outfile, "\tpopl %%ecx\n" );
-    fprintf( outfile, "\tpopl %%eax\n" );
-    fprintf( outfile, "\tpopl %%ebp\n" );
-    fprintf( outfile, "\tleal 8(%%esp),%%esp\n" );  /* skip %eip and %cs */
+    fprintf( outfile, "\tmovl %d(%%ebp),%%edi\n", CONTEXTOFFSET(Edi) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %d(%%ebp),%%esi\n", CONTEXTOFFSET(Esi) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %d(%%ebp),%%edx\n", CONTEXTOFFSET(Edx) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %d(%%ebp),%%ecx\n", CONTEXTOFFSET(Ecx) - STACK_SPACE );
+    fprintf( outfile, "\tmovl %d(%%ebp),%%ebx\n", CONTEXTOFFSET(Ebx) - STACK_SPACE );
+
+    fprintf( outfile, "\tmovl %d(%%ebp),%%eax\n", CONTEXTOFFSET(EFlags) - STACK_SPACE );
+    fprintf( outfile, "\tpushl %%eax\n" );
     fprintf( outfile, "\tpopfl\n" );
-    fprintf( outfile, "\tpopl %%esp\n" );
-    fprintf( outfile, "\tret\n" );
+    fprintf( outfile, "\tmovl %d(%%ebp),%%eax\n", CONTEXTOFFSET(Eax) - STACK_SPACE );
 
-#undef STACK_SPACE
+    fprintf( outfile, "\tmovl %d(%%ebp),%%ebp\n", CONTEXTOFFSET(Esp) - STACK_SPACE );
+    fprintf( outfile, "\tleal -8(%%ebp),%%esp\n" );
+    fprintf( outfile, "\tpopl %%ebp\n" );
+    fprintf( outfile, "\tret\n" );
 }