wined3d: Add support for shader model 3.0 I/O registers.

SM 3.0 can pack multiple "semantics" into 12 generic input/output registers.

To support that, declare GLSL temporaries named IN<n> and OUT<n>, and use
them as the pshader input and vshader output registers. At the end of the
vshader, unpack the OUT temps into the proper GL variables. At the
beginning of the pshader, pack the GL variables back into the IN registers.
diff --git a/dlls/wined3d/baseshader.c b/dlls/wined3d/baseshader.c
index 6f53ce2..7877bcd 100644
--- a/dlls/wined3d/baseshader.c
+++ b/dlls/wined3d/baseshader.c
@@ -343,10 +343,13 @@
 
                 if (!pshader)
                     reg_maps->attributes[regnum] = 1;
+                else
+                    reg_maps->packed_input[regnum] = 1;
 
                 shader_parse_decl_usage(reg_maps->semantics_in, usage, param);
 
             } else if (D3DSPR_OUTPUT == regtype) {
+                reg_maps->packed_output[regnum] = 1;
                 shader_parse_decl_usage(reg_maps->semantics_out, usage, param);
             }
 
@@ -723,6 +726,18 @@
         shader_addline(buffer, "vec4 T%lu = gl_TexCoord[%lu];\n", i, i);
     }
 
+    /* Declare input register temporaries */
+    for (i=0; i < This->baseShader.limits.packed_input; i++) {
+        if (reg_maps->packed_input[i])
+            shader_addline(buffer, "vec4 IN%lu;\n", i);
+    }
+
+    /* Declare output register temporaries */
+    for (i = 0; i < This->baseShader.limits.packed_output; i++) {
+        if (reg_maps->packed_output[i])
+            shader_addline(buffer, "vec4 OUT%lu;\n", i);
+    }
+
     /* Declare temporary variables */
     for(i = 0; i < This->baseShader.limits.temporary; i++) {
         if (reg_maps->temporary[i])
diff --git a/dlls/wined3d/glsl_shader.c b/dlls/wined3d/glsl_shader.c
index 9cc3f2b..017fe5e 100644
--- a/dlls/wined3d/glsl_shader.c
+++ b/dlls/wined3d/glsl_shader.c
@@ -177,10 +177,14 @@
     break;
     case D3DSPR_INPUT:
         if (pshader) {
-            if (reg==0) {
-                strcpy(tmpStr, "gl_Color");
-            } else {
-                strcpy(tmpStr, "gl_SecondaryColor");
+            /* Pixel shaders >= 3.0 */
+            if (D3DSHADER_VERSION_MAJOR(This->baseShader.hex_version) >= 3)
+                sprintf(tmpStr, "IN%lu", reg);
+             else {
+                if (reg==0)
+                    strcpy(tmpStr, "gl_Color");
+                else
+                    strcpy(tmpStr, "gl_SecondaryColor");
             }
         } else {
             IWineD3DVertexShaderImpl *vshader = (IWineD3DVertexShaderImpl*) arg->shader;
@@ -263,7 +267,11 @@
         }
     break;
     case D3DSPR_TEXCRDOUT:
-        sprintf(tmpStr, "gl_TexCoord[%lu]", reg);
+        /* Vertex shaders >= 3.0: D3DSPR_OUTPUT */
+        if (D3DSHADER_VERSION_MAJOR(This->baseShader.hex_version) >= 3)
+            sprintf(tmpStr, "OUT%lu", reg);
+        else
+            sprintf(tmpStr, "gl_TexCoord[%lu]", reg);
     break;
     default:
         FIXME("Unhandled register name Type(%ld)\n", regtype);
@@ -795,3 +803,117 @@
     shader_addline(buffer, "tmp0.y = dot(vec3(T%lu), vec3(%s));\n", reg, src0_str);
     shader_addline(buffer, "T%lu = texture2D(mytex%lu, tmp0.st);\n", reg, reg);
 }
+
+void pshader_glsl_input_pack(
+   SHADER_BUFFER* buffer,
+   DWORD* semantics_in) {
+   /* Append GLSL to buffer that fills each IN<n> register recorded in
+    * semantics_in (indexed by usage) from the GL built-in for that usage. */
+   unsigned int i;
+
+   for (i = 0; i < WINED3DSHADERDECLUSAGE_MAX_USAGE; i++) {
+       DWORD reg = semantics_in[i];
+       unsigned int regnum = reg & D3DSP_REGNUM_MASK;
+       char reg_mask[6];
+
+       /* Uninitialized: no register was recorded for this usage */
+       if (!reg) continue;
+
+       shader_glsl_get_output_register_swizzle(reg, reg_mask);
+       /* regnum and the texcoord index are unsigned int, so print with %u */
+       switch(i) {
+
+           case WINED3DSHADERDECLUSAGE_DIFFUSE:
+               shader_addline(buffer, "IN%u%s = vec4(gl_Color)%s;\n",
+                   regnum, reg_mask, reg_mask);
+               break;
+
+           case WINED3DSHADERDECLUSAGE_SPECULAR:
+               shader_addline(buffer, "IN%u%s = vec4(gl_SecondaryColor)%s;\n",
+                   regnum, reg_mask, reg_mask);
+               break;
+
+           case WINED3DSHADERDECLUSAGE_TEXCOORD0:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD1:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD2:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD3:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD4:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD5:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD6:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD7:
+               shader_addline(buffer, "IN%u%s = vec4(gl_TexCoord[%u])%s;\n",
+                   regnum, reg_mask, i - WINED3DSHADERDECLUSAGE_TEXCOORD0, reg_mask);
+               break;
+
+           case WINED3DSHADERDECLUSAGE_FOG:
+               shader_addline(buffer, "IN%u%s = vec4(gl_FogFragCoord)%s;\n",
+                   regnum, reg_mask, reg_mask);
+               break;
+
+           default:
+               /* NOTE(review): unsupported_input is not declared in the visible GLSL */
+               shader_addline(buffer, "IN%u%s = vec4(unsupported_input)%s;\n",
+                   regnum, reg_mask, reg_mask);
+       }
+   }
+}
+
+/*********************************************
+ * Vertex Shader Specific Code begins here
+ ********************************************/
+
+void vshader_glsl_output_unpack(
+   SHADER_BUFFER* buffer,
+   DWORD* semantics_out) {
+   /* Append GLSL to buffer that copies each OUT<n> register recorded in
+    * semantics_out (indexed by usage) to the GL built-in for that usage. */
+   unsigned int i;
+
+   for (i = 0; i < WINED3DSHADERDECLUSAGE_MAX_USAGE; i++) {
+       DWORD reg = semantics_out[i];
+       unsigned int regnum = reg & D3DSP_REGNUM_MASK;
+       char reg_mask[6];
+
+       /* Uninitialized: no register was recorded for this usage */
+       if (!reg) continue;
+
+       shader_glsl_get_output_register_swizzle(reg, reg_mask);
+       /* regnum and the texcoord index are unsigned int, so print with %u */
+       switch(i) {
+
+           case WINED3DSHADERDECLUSAGE_DIFFUSE:
+               shader_addline(buffer, "gl_FrontColor%s = OUT%u%s;\n", reg_mask, regnum, reg_mask);
+               break;
+
+           case WINED3DSHADERDECLUSAGE_SPECULAR:
+               shader_addline(buffer, "gl_FrontSecondaryColor%s = OUT%u%s;\n", reg_mask, regnum, reg_mask);
+               break;
+
+           case WINED3DSHADERDECLUSAGE_POSITION:
+               shader_addline(buffer, "gl_Position%s = OUT%u%s;\n", reg_mask, regnum, reg_mask);
+               break;
+
+           case WINED3DSHADERDECLUSAGE_TEXCOORD0:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD1:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD2:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD3:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD4:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD5:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD6:
+           case WINED3DSHADERDECLUSAGE_TEXCOORD7:
+               shader_addline(buffer, "gl_TexCoord[%u]%s = OUT%u%s;\n",
+                   i - WINED3DSHADERDECLUSAGE_TEXCOORD0, reg_mask, regnum, reg_mask);
+               break;
+
+           case WINED3DSHADERDECLUSAGE_PSIZE:
+               shader_addline(buffer, "gl_PointSize = OUT%u.x;\n", regnum);
+               break;
+
+           case WINED3DSHADERDECLUSAGE_FOG:
+               shader_addline(buffer, "gl_FogFragCoord%s = OUT%u%s;\n", reg_mask, regnum, reg_mask);
+               break;
+
+           default:
+               /* NOTE(review): unsupported_output is not declared in the visible GLSL */
+               shader_addline(buffer, "unsupported_output%s = OUT%u%s;\n", reg_mask, regnum, reg_mask);
+       }
+   }
+}
diff --git a/dlls/wined3d/pixelshader.c b/dlls/wined3d/pixelshader.c
index cf01e50..9b83e62 100644
--- a/dlls/wined3d/pixelshader.c
+++ b/dlls/wined3d/pixelshader.c
@@ -936,6 +936,7 @@
 
       This->baseShader.limits.attributes = 0;
       This->baseShader.limits.address = 0;
+      This->baseShader.limits.packed_output = 0;
 
       switch (This->baseShader.hex_version) {
           case D3DPS_VERSION(1,0):
@@ -947,6 +948,7 @@
                    This->baseShader.limits.constant_int = 0;
                    This->baseShader.limits.constant_bool = 0;
                    This->baseShader.limits.texture = 4;
+                   This->baseShader.limits.packed_input = 0;
                    break;
 
           case D3DPS_VERSION(1,4):
@@ -955,6 +957,7 @@
                    This->baseShader.limits.constant_int = 0;
                    This->baseShader.limits.constant_bool = 0;
                    This->baseShader.limits.texture = 6;
+                   This->baseShader.limits.packed_input = 0;
                    break;
                
           /* FIXME: temporaries must match D3DPSHADERCAPS2_0.NumTemps */ 
@@ -965,6 +968,7 @@
                    This->baseShader.limits.constant_int = 16;
                    This->baseShader.limits.constant_bool = 16;
                    This->baseShader.limits.texture = 8;
+                   This->baseShader.limits.packed_input = 0;
                    break;
 
           case D3DPS_VERSION(3,0):
@@ -973,6 +977,7 @@
                    This->baseShader.limits.constant_int = 16;
                    This->baseShader.limits.constant_bool = 16;
                    This->baseShader.limits.texture = 0;
+                   This->baseShader.limits.packed_input = 12;
                    break;
 
           default: This->baseShader.limits.temporary = 32;
@@ -980,6 +985,7 @@
                    This->baseShader.limits.constant_int = 0;
                    This->baseShader.limits.constant_bool = 0;
                    This->baseShader.limits.texture = 8;
+                   This->baseShader.limits.packed_input = 0;
                    FIXME("Unrecognized pixel shader version %#lx\n", 
                        This->baseShader.hex_version);
       }
@@ -1330,6 +1336,10 @@
         /* Base Declarations */
         shader_generate_glsl_declarations( (IWineD3DBaseShader*) This, &reg_maps, &buffer);
 
+        /* Pack 3.0 inputs */
+        if (This->baseShader.hex_version >= D3DPS_VERSION(3,0))
+            pshader_glsl_input_pack(&buffer, semantics_in);
+
         /* Base Shader Body */
         shader_generate_main( (IWineD3DBaseShader*) This, &buffer, &reg_maps, pFunction);
 
diff --git a/dlls/wined3d/vertexshader.c b/dlls/wined3d/vertexshader.c
index c116595..04a8337 100644
--- a/dlls/wined3d/vertexshader.c
+++ b/dlls/wined3d/vertexshader.c
@@ -714,6 +714,7 @@
 
       This->baseShader.limits.texture = 0;
       This->baseShader.limits.attributes = 16;
+      This->baseShader.limits.packed_input = 0;
 
       /* Must match D3DCAPS9.MaxVertexShaderConst: at least 256 for vs_2_0 */
       This->baseShader.limits.constant_float = WINED3D_VSHADER_MAX_CONSTANTS;
@@ -725,6 +726,7 @@
                    This->baseShader.limits.constant_bool = 0;
                    This->baseShader.limits.constant_int = 0;
                    This->baseShader.limits.address = 1;
+                   This->baseShader.limits.packed_output = 0;
                    break;
       
           case D3DVS_VERSION(2,0):
@@ -733,6 +735,7 @@
                    This->baseShader.limits.constant_bool = 16;
                    This->baseShader.limits.constant_int = 16;
                    This->baseShader.limits.address = 1;
+                   This->baseShader.limits.packed_output = 0;
                    break;
 
           case D3DVS_VERSION(3,0):
@@ -740,12 +743,14 @@
                    This->baseShader.limits.constant_bool = 32;
                    This->baseShader.limits.constant_int = 32;
                    This->baseShader.limits.address = 1;
+                   This->baseShader.limits.packed_output = 12;
                    break;
 
           default: This->baseShader.limits.temporary = 12;
                    This->baseShader.limits.constant_bool = 0;
                    This->baseShader.limits.constant_int = 0;
                    This->baseShader.limits.address = 1;
+                   This->baseShader.limits.packed_output = 0;
                    FIXME("Unrecognized vertex shader version %#lx\n",
                        This->baseShader.hex_version);
       }
@@ -870,6 +875,10 @@
         /* Base Shader Body */
         shader_generate_main( (IWineD3DBaseShader*) This, &buffer, &reg_maps, pFunction);
 
+        /* Unpack 3.0 outputs */
+        if (This->baseShader.hex_version >= D3DVS_VERSION(3,0))
+            vshader_glsl_output_unpack(&buffer, semantics_out);
+
         shader_addline(&buffer, "}\n\0");
 
         TRACE("Compiling shader object %u\n", shader_obj);
diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h
index ff81628..8b86d95 100644
--- a/dlls/wined3d/wined3d_private.h
+++ b/dlls/wined3d/wined3d_private.h
@@ -1251,6 +1251,8 @@
 #define MAX_REG_ADDR 1
 #define MAX_REG_TEMP 32
 #define MAX_REG_TEXCRD 8
+#define MAX_REG_INPUT 12
+#define MAX_REG_OUTPUT 12
 #define MAX_ATTRIBS 16
 #define MAX_CONST_F 256
 
@@ -1259,6 +1261,8 @@
     char texcoord[MAX_REG_TEXCRD];          /* pixel < 3.0 */
     char temporary[MAX_REG_TEMP];           /* pixel, vertex */
     char address[MAX_REG_ADDR];             /* vertex */
+    char packed_input[MAX_REG_INPUT];       /* pshader >= 3.0 */
+    char packed_output[MAX_REG_OUTPUT];     /* vertex >= 3.0 */
     char attributes[MAX_ATTRIBS];           /* vertex */
 
     char constantsF[MAX_CONST_F];           /* pixel, vertex */
@@ -1307,6 +1311,8 @@
     unsigned int constant_float;
     unsigned int constant_bool;
     unsigned int address;
+    unsigned int packed_output;
+    unsigned int packed_input;
     unsigned int attributes;
 } SHADER_LIMITS;
 
@@ -1346,11 +1352,20 @@
 extern void shader_glsl_compare(SHADER_OPCODE_ARG* arg);
 extern void shader_glsl_def(SHADER_OPCODE_ARG* arg);
 extern void shader_glsl_cmp(SHADER_OPCODE_ARG* arg);
+
 /** GLSL Pixel Shader Prototypes */
 extern void pshader_glsl_tex(SHADER_OPCODE_ARG* arg);
 extern void pshader_glsl_texcoord(SHADER_OPCODE_ARG* arg);
 extern void pshader_glsl_texm3x2pad(SHADER_OPCODE_ARG* arg);
 extern void pshader_glsl_texm3x2tex(SHADER_OPCODE_ARG* arg);
+extern void pshader_glsl_input_pack(   /* emits the IN<n> packing code */
+   SHADER_BUFFER* buffer,
+   DWORD* semantics_in);
+
+/** GLSL Vertex Shader Prototypes */
+extern void vshader_glsl_output_unpack(
+   SHADER_BUFFER* buffer,
+   DWORD* semantics_out);
 
 /*****************************************************************************
  * IDirect3DBaseShader implementation structure