U3D优化批处理-GPU Instancing了解一下

一、为什么要使用GPU Instancing？

以往我们优化CPU的时候，为了降低Drawcall的消耗，通常采用静态批处理、动态批处理等技术，然而这些技术也是有弊端的。通常一个大的场景中存在大量相同的植被等物件，静态批处理后，内存的增加是非常大的，动辄就是几十兆。而动态批处理对合批的要求挺多，同时还可能出现动态合批消耗过大、得不偿失的情况。如果我们自己在逻辑代码里面进行动态合批，则要求开启mesh的readwrite属性，这无疑也增大了内存的占用，而且复杂的合批处理可能会消耗更多的CPU时间。

Unity在5.4版本及之后，新增了一项功能，那就是_GPU Instancing_。_GPU Instancing_的出现，给我们提供了新的思路，对于大场景而言将所有的场景物件一次性都加载，对内存来说是很有压力的，我们可以将这些静态的物件如植被等全部从场景中剔除，而保存其位置、缩放、uv偏移、_lightmapindex_等相关信息，在需要渲染的时候，根据其保存的信息，通过_Instance_来渲染，这能够减少那些因为内存原因而不能合批的大批量相同物件的渲染时间。下面这两张图都是同个场景下渲染多个_gameobject_，图1开启了_GPU Instancing_，而图2没有。

**图 1**

图 2

在Unite2017大会上Unity的开发工程师为我们演示了关于_GPU Instancing_的一些实现，但目前它只支持标准的表面_instance_，同时不支持_lightmap_、灯光探测器、阴影、裁剪等功能。这些都需要我们自己来实现。（这里只指Unity5.6及前面的版本）

二、如何使用GPU Instancing？

首先我们来看看Unity自带的支持标准表面着色器，通过

Create->Shader->StandardSurfaceShader(Instanced)

可以创建一个标准表面着色器(instance)，下面是此着色器中的一段代码 (PS: 我所实验的是Unity 5.5的版本，而Unity5.6中已经没有这个选项，同时Unity5.6在材质属性面板中有一个Enable Instance Variants 勾选项，勾选表示支持Instance)

// Instanced variant of the standard surface shader: the tint colour (_Color) is declared as a
// per-instance property, while the texture, glossiness and metallic values are ordinary uniforms.
SubShader {
  Tags { "RenderType"="Opaque" }
  LOD 200

  CGPROGRAM
  // Physically based Standard lighting model, and enable shadows on all light types
  // And generate the shadow pass with instancing support
  #pragma surface surf Standard fullforwardshadows addshadow
  // Use shader model 3.0 target, to get nicer looking lighting
  #pragma target 3.0
  // Enable instancing for this shader
  #pragma multi_compile_instancing
  // Config maxcount. See manual page.
  // #pragma instancing_options
  sampler2D _MainTex;
  // Surface-function input: only the UV set used to sample _MainTex.
  struct Input {
                 float2 uv_MainTex;
               };
  half _Glossiness;
  half _Metallic;
  // Declare instanced properties inside a cbuffer.
  // Each instanced property is an array of, by default, 500 (D3D) / 128 (GL) elements. Since D3D and GL impose a
  // limit of 64KB and 16KB respectively on the size of a cbuffer, the default array size allows two matrix arrays
  // in one cbuffer. Use the maxcount option on the #pragma instancing_options directive to specify an array size
  // other than the default (divided by 4 when used for GL).
  UNITY_INSTANCING_CBUFFER_START(Props)
  UNITY_DEFINE_INSTANCED_PROP(fixed4, _Color) // Make _Color an instanced property (i.e. an array)
  UNITY_INSTANCING_CBUFFER_END
  // Surface function: writes albedo, metallic, smoothness and alpha for the Standard lighting model.
  void surf (Input IN, inout SurfaceOutputStandard o) {
      // Albedo comes from a texture tinted by color; _Color is per-instance, so it is read through the access macro
      fixed4 c = tex2D (_MainTex, IN.uv_MainTex) *  UNITY_ACCESS_INSTANCED_PROP(_Color);
      o.Albedo = c.rgb;
      // Metallic and smoothness come from slider variables
      o.Metallic = _Metallic;
      o.Smoothness = _Glossiness;
      o.Alpha = c.a;
    }
   ENDCG
  }

然后再来看看官网文档中Vertex/Fragment着色器的例子，shader代码如下

// Minimal vertex/fragment shader with GPU instancing: every instance is drawn in a flat colour
// taken from its own per-instance _Color value.
Shader "SimplestInstancedShader"
{
   Properties
   {
       _Color   ("Color", Color) = (1, 1, 1, 1)
   }
   SubShader
   {
       Tags   { "RenderType"="Opaque" }
       LOD   100
       Pass
       {
           CGPROGRAM
           #pragma   vertex vert
           #pragma   fragment frag
           // Compile the instancing variants of this pass.
           #pragma   multi_compile_instancing
           #include   "UnityCG.cginc"
           // Vertex input: object-space position plus the instance ID.
           struct appdata
           {
               float4   vertex : POSITION;
               UNITY_VERTEX_INPUT_INSTANCE_ID
           };
           // Vertex-to-fragment data: clip-space position plus the forwarded instance ID.
           struct v2f
           {
               float4   vertex : SV_POSITION;
               UNITY_VERTEX_INPUT_INSTANCE_ID // necessary only if you want to access instanced properties in the fragment shader.
           };
           // Per-instance properties live in this instancing constant buffer.
           UNITY_INSTANCING_CBUFFER_START(MyProperties)
           UNITY_DEFINE_INSTANCED_PROP(float4,   _Color)
           UNITY_INSTANCING_CBUFFER_END

           // Vertex stage: set up the instance ID, forward it to the fragment stage, and transform to clip space.
           v2f   vert(appdata v)
           {
               v2f   o;
               UNITY_SETUP_INSTANCE_ID(v);
               UNITY_TRANSFER_INSTANCE_ID(v,   o); // necessary only if you want to access instanced properties in the fragment shader.
               o.vertex   = UnityObjectToClipPos(v.vertex);
               return o;
           }

           // Fragment stage: output this instance's _Color.
           fixed4   frag(v2f i) : SV_Target
           {
               UNITY_SETUP_INSTANCE_ID(i); // necessary only if any instanced properties are going to be accessed in the fragment shader.
               return UNITY_ACCESS_INSTANCED_PROP(_Color);
           }
           ENDCG
       }
   }
}

最后针对上面的Shader来解释下其中的几条关键宏。

UNITY_VERTEX_INPUT_INSTANCE_ID

用于在_Vertex Shader_输入 / 输出结构中定义一个语义为_SV_InstanceID_的元素。

UNITY_INSTANCING_CBUFFER_START(name) / UNITY_INSTANCING_CBUFFER_END
每个Instance独有的属性必须定义在一个遵循特殊命名规则的Constant Buffer中。使用这对宏来定义这些Constant Buffer。“name”参数可以是任意字符串。

UNITY_DEFINE_INSTANCED_PROP(float4, _Color)
定义一个具有特定类型和名字的每个_Instance_独有的_Shader_属性。这个宏实际会定义一个_Uniform_数组。

UNITY_SETUP_INSTANCE_ID(v)
这个宏必须在Vertex Shader的最开始调用，如果你需要在Fragment Shader里访问_Instanced_属性，则需要在_Fragment Shader_的开始也用一下。这个宏的目的在于让_Instance ID_在_Shader_函数里也能够被访问到。

UNITY_TRANSFER_INSTANCE_ID(v, o)
在Vertex Shader中把Instance ID从输入结构拷贝至输出结构中。只有当你需要在Fragment Shader中访问每个Instance独有的属性时才需要写这个宏。

UNITY_ACCESS_INSTANCED_PROP(_Color)
访问每个Instance独有的属性。这个宏会使用Instance ID作为索引到Uniform数组中去取当前Instance对应的数据。（上面两个shader中都用到了这个宏来读取_Color，下面我自定义的shader中同样会用到）。

三、如何使用lightmap、阴影、裁剪功能？

当然首先我们还是得在我们的通道中包含指令，不然都是白搭。

#pragma   multi_compile_instancing

- lightmap的支持 -

对Unity内置_lightmap_的获取。我们定义两个编译开关，然后在自定义顶点输入输出结构包含_lightmap_的uv。

#pragma   multi_compile LIGHTMAP_OFF LIGHTMAP_ON  // compile both the lightmap-off and lightmap-on variants
// Vertex-to-fragment data. The lightmap UV is only present in the LIGHTMAP_ON variant.
struct v2f
{
   float4   pos : SV_POSITION;
   float3   lightDir : TEXCOORD0;
   float3   normal : TEXCOORD1;
   float2   uv : TEXCOORD2;
   LIGHTING_COORDS(3,   4)
#ifdef   LIGHTMAP_ON
   float2   uv_LightMap : TEXCOORD5;  // lightmap UV
#endif
   UNITY_VERTEX_INPUT_INSTANCE_ID
};

然后在顶点函数中进行如下处理

#ifdef   LIGHTMAP_ON
   o.uv_LightMap   = v.texcoord1.xy * _LightMap_ST.xy + _LightMap_ST.zw;
#endif

最后在像素函数中进行解码处理。

_DecodeLightmap_函数可以针对不同的平台对光照贴图进行解码。

#ifdef LIGHTMAP_ON  
   fixed3 lm = DecodeLightmap(UNITY_SAMPLE_TEX2D(_LightMap, i.uv_LightMap.xy));
   finalColor.rgb *= lm;
#endif

当然我们也可以通过属性来将lightmap传递给shader，这里就不写了。

- 阴影 -

当使用标准表面着色器时，Unity可以轻易的为我们提供阴影支持，但_Vertex/fragment_着色器中我们需要增加一些指令，同时还需要自己添加阴影投射通道。首先增加标签，表示接收正向基础光照为主光源。

Tags{ "LightMode" = "ForwardBase" }

然后增加如下指令，确保shader为所需要的通道执行正确的编译，同时也因为我们需要用到其中的光照处理。

// Compile the shader variants required by the ForwardBase pass.
#pragma multi_compile_fwdbase
// Lighting/shadow helper macros used by the pass come from this include.
#include "AutoLight.cginc"

同时在我们的输入输出结构中添加

_LIGHTING_COORDS_宏，这个宏指令定义了对阴影贴图和光照贴图采样所需的参数。

LIGHTING_COORDS(3,   4)

完整的代码如下：

// ForwardBase pass with GPU instancing: diffuse lighting from the main light with attenuation,
// a per-instance tint colour, and an optional lightmap passed in through _LightMap.
pass
{
  Tags{ "LightMode" = "ForwardBase" }
  CGPROGRAM
  #pragma target 3.0
  // The option name must be on the same line as the pragma.
  #pragma fragmentoption ARB_precision_hint_fastest
  #pragma vertex vertShadow
  #pragma fragment fragShadow
  #pragma multi_compile_fwdbase
  #pragma multi_compile_instancing
  #include "UnityCG.cginc"
  #include "AutoLight.cginc"
  #pragma multi_compile LIGHTMAP_OFF LIGHTMAP_ON  // compile both the lightmap-off and lightmap-on variants
  sampler2D _DiffuseTexture;
  float4 _DiffuseTint;
  float4 _LightColor0;
  // Lightmap supplied from script. Declared with UNITY_DECLARE_TEX2D so that it matches the
  // UNITY_SAMPLE_TEX2D call in the fragment function on every platform.
  UNITY_DECLARE_TEX2D(_LightMap);
  float4 _LightMap_ST; // lightmap scale (xy) and offset (zw)
  // Vertex-to-fragment data. The lightmap UV is only present in the LIGHTMAP_ON variant.
  struct v2f
   {
      float4 pos : SV_POSITION;
      float3 lightDir : TEXCOORD0;
      float3 normal : TEXCOORD1;
      float2 uv : TEXCOORD2;
      LIGHTING_COORDS(3, 4)
      #ifdef LIGHTMAP_ON
      float2 uv_LightMap : TEXCOORD5;
      #endif
      UNITY_VERTEX_INPUT_INSTANCE_ID
    };
  UNITY_INSTANCING_CBUFFER_START(Props)
  UNITY_DEFINE_INSTANCED_PROP(fixed4, _Color) // Make _Color an instanced property (i.e. an array)
  UNITY_INSTANCING_CBUFFER_END
  // Vertex stage. Takes appdata_full rather than appdata_base because the lightmap UV is read
  // from texcoord1, which appdata_base does not provide.
  v2f vertShadow(appdata_full v)
   {
      v2f o;
      // Must come first so the per-instance matrices and properties resolve for this vertex.
      UNITY_SETUP_INSTANCE_ID(v);
      UNITY_TRANSFER_INSTANCE_ID(v, o);
      o.pos = mul(UNITY_MATRIX_MVP, v.vertex);
      o.uv = v.texcoord;
      o.lightDir = normalize(ObjSpaceLightDir(v.vertex));
      o.normal = normalize(v.normal).xyz;
      #ifdef LIGHTMAP_ON
      // Alternative using Unity's built-in lightmap scale/offset:
      //o.uv_LightMap = v.texcoord1.xy * unity_LightmapST.xy + unity_LightmapST.zw;
      o.uv_LightMap = v.texcoord1.xy * _LightMap_ST.xy + _LightMap_ST.zw;
      #endif
      // Fills the interpolators declared by LIGHTING_COORDS.
      TRANSFER_VERTEX_TO_FRAGMENT(o);
      return o;
    }
  // Fragment stage: (ambient + N.L diffuse) * texture * per-instance colour, then lightmap if enabled.
  float4 fragShadow(v2f i) : SV_Target
    {
       UNITY_SETUP_INSTANCE_ID(i);
       float3 L = normalize(i.lightDir);
       float3 N = normalize(i.normal);
       float attenuation = LIGHT_ATTENUATION(i) * 2;
       float4 ambient =  UNITY_LIGHTMODEL_AMBIENT * 2;
       float NdotL = saturate(dot(N, L));
       float4 diffuseTerm = NdotL * _LightColor0 * _DiffuseTint * attenuation;
       // The per-instance colour is read through the instancing access macro.
       float4 diffuse = tex2D(_DiffuseTexture, i.uv)*UNITY_ACCESS_INSTANCED_PROP(_Color);
       float4 finalColor = (ambient + diffuseTerm) * diffuse;
       #ifdef LIGHTMAP_ON
       // Alternative using Unity's built-in lightmap:
       //fixed3 lm = DecodeLightmap(UNITY_SAMPLE_TEX2D(unity_Lightmap, i.uv_LightMap.xy));
       fixed3 lm = DecodeLightmap(UNITY_SAMPLE_TEX2D(_LightMap, i.uv_LightMap.xy));
       finalColor.rgb *= lm;
       #endif
       return finalColor;
    }
    ENDCG
  }

有了上面的通道还不够，那只是告诉着色器，我们能够捕获到其阴影所需的一切了；最后我们需要阴影投射通道

/* The shadow-caster pass has to be written by hand, otherwise GPU instancing is not supported.
   It must include the multi_compile_instancing directive and set up the instance ID in both
   the vert and frag functions; otherwise multiple objects will not get shadow casting.
*/
Pass{
  Tags{ "LightMode" = "ShadowCaster" }
  CGPROGRAM
   #pragma vertex vert 
   #pragma fragment frag 
   #pragma multi_compile_shadowcaster 
   #pragma multi_compile_instancing
   #include "UnityCG.cginc" 
   // Texture whose alpha channel is used for the cut-out test in frag.
   sampler2D _Shadow;
   // Vertex-to-fragment data: shadow-caster interpolators, the texture UV, and the instance ID.
   struct v2f {
                V2F_SHADOW_CASTER;
                float2 uv:TEXCOORD2;
                UNITY_VERTEX_INPUT_INSTANCE_ID
              };
  // Vertex stage: set up and forward the instance ID, pass the UV through, fill the shadow-caster outputs.
  v2f vert(appdata_base v) 
        {
          v2f o;
          UNITY_SETUP_INSTANCE_ID(v);
          UNITY_TRANSFER_INSTANCE_ID(v, o);// forward the instance ID to the fragment stage
          o.uv = v.texcoord.xy;
          TRANSFER_SHADOW_CASTER_NORMALOFFSET(o);
          return o;
        }
  // Fragment stage: discard fragments whose _Shadow alpha is below 0.5, then emit the shadow-caster output.
  float4 frag(v2f i) : SV_Target
        {
          UNITY_SETUP_INSTANCE_ID(i);
          fixed alpha = tex2D(_Shadow, i.uv).a;
          clip(alpha - 0.5);
          SHADOW_CASTER_FRAGMENT(i)
        }
  ENDCG
}

- 裁剪 -

裁剪，我们可以通过逻辑控制来进行处理，一是场景加载策略，如四叉树场景管理，根据当前所在区块来决定渲染目标，二是通过当前摄像机空间来裁剪目标，这里简单的说下通过摄像机视锥体空间裁剪的方法（四叉树动态场景管理网上搜索是有demo的）

/// <summary>
/// Decides whether an object can be culled, based on whether its position lies inside the
/// main camera's view volume.
/// </summary>
/// <param name="tran">Transform whose world position is tested.</param>
/// <returns>
/// <c>false</c> when the position is strictly inside the viewport rectangle and between the
/// near and far clip planes; <c>true</c> otherwise, including when there is no main camera.
/// </returns>
bool IsCanCulling(Transform tran)
{
  // Fetch the camera once per call instead of once per property access.
  // When many objects are tested per frame, hoist the camera lookup out of this method so it
  // happens only once per frame.
  Camera cam = Camera.main;
  if (cam == null)
      return true; // no main camera: nothing can be visible

  Vector3 viewVec = cam.WorldToViewportPoint(tran.position);
  bool insideViewport = viewVec.x > 0 && viewVec.x < 1 && viewVec.y > 0 && viewVec.y < 1;
  bool insideDepthRange = viewVec.z > cam.nearClipPlane && viewVec.z < cam.farClipPlane;
  return !(insideViewport && insideDepthRange);
}

四、C#端调用

在C#端，我们可以通过_Graphics.DrawMeshInstanced_ 接口直接向GPU输送绘制调用，这里在初始化阶段随机的生成了一些位置信息，然后在每帧更新阶段调用

Graphics.DrawMeshInstanced 接口进行绘制

/// <summary>
/// Demo component that draws <see cref="InstanceCount"/> copies of a prefab's mesh(es) every
/// frame through Graphics.DrawMeshInstanced, at random positions generated once in Start.
/// </summary>
public class testInstance : MonoBehaviour
{
   // Graphics.DrawMeshInstanced accepts at most this many instances per call.
   const int MaxInstancesPerCall = 1023;

   // Mesh and material used when the prefab itself carries a MeshFilter/Renderer.
   Mesh   mesh;
   Material   mat;
   public GameObject m_prefab;
   // One object-to-world matrix per instance.
   Matrix4x4[]   matrix;
   // Property block handed to DrawMeshInstanced (left empty here).
   MaterialPropertyBlock   props;
   ShadowCastingMode   castShadows; // shadow-casting option
   public int InstanceCount = 10;
   // Used when the prefab is made of several child meshes (e.g. a tree with trunk and leaves).
   MeshFilter[]   meshFs;
   Renderer[]   renders;

   // Plays a role similar to the "Enable Instance Variants" material checkbox in Unity 5.6.
   public bool turnOnInstance = true;

   void Start()
   {
       if (m_prefab == null)
           return;
       Shader.EnableKeyword("LIGHTMAP_ON"); // turn the lightmap variant on
       //Shader.DisableKeyword("LIGHTMAP_OFF");
       var   mf = m_prefab.GetComponent<MeshFilter>();
       if (mf)
       {
           mesh = mf.sharedMesh;
           var rend = m_prefab.GetComponent<Renderer>();
           if (rend)
               mat = rend.sharedMaterial;
       }
       // A prefab made of several meshes needs one draw per mesh.
       if (mesh == null)
       {
           meshFs = m_prefab.GetComponentsInChildren<MeshFilter>();
       }
       if (mat == null)
       {
           renders = m_prefab.GetComponentsInChildren<Renderer>();
       }

       // Keep the instance count inside what a single DrawMeshInstanced call accepts.
       int count = Mathf.Clamp(InstanceCount, 0, MaxInstancesPerCall);
       if (count != InstanceCount)
           Debug.LogWarning("testInstance: InstanceCount clamped to " + count);
       matrix = new Matrix4x4[count];
       props = new MaterialPropertyBlock();

       castShadows = ShadowCastingMode.On;

       // Generate a random position (and optionally scale) for every instance.
       for (int i = 0; i < matrix.Length; i++)
       {
           float x = Random.Range(-50, 50);
           float y = Random.Range(-3, 3); // only used by the optional scale code below
           float z = Random.Range(-50, 50);
           matrix[i] = Matrix4x4.identity;
           // Position lives in the 4th column.
           matrix[i].SetColumn(3, new Vector4(x, 0.5f, z, 1));
           // Optional scale:
           //matrix[i].m00   = Mathf.Max(1, x);
           //matrix[i].m11   = Mathf.Max(1, y);
           //matrix[i].m22   = Mathf.Max(1, z);
       }
   }

   void Update()
   {
       // matrix stays null when Start returned early because no prefab was assigned.
       if (!turnOnInstance || matrix == null)
           return;

       castShadows = ShadowCastingMode.On;
       if (mesh && mat)
       {
           Graphics.DrawMeshInstanced(mesh, 0, mat, matrix, matrix.Length, props, castShadows, true, 0, null);
           return;
       }

       if (meshFs == null || renders == null)
           return;

       // Pair each child mesh with the renderer at the same index; stop at the shorter array.
       int parts = Mathf.Min(meshFs.Length, renders.Length);
       for (int i = 0; i < parts; ++i)
       {
           Graphics.DrawMeshInstanced(meshFs[i].sharedMesh, 0, renders[i].sharedMaterial, matrix, matrix.Length, props, castShadows, true, 0, null);
       }
   }
}

五、效果展示

下面场景中使用了1023棵树，8*1023棵草。用1023这个数是因为_DrawMeshInstanced_传递的矩阵长度为1023，而1023个_mesh_其实是分成3个_drawcall_完成的。

_UnityInstance.cginc_中是这么定义的：

#define UNITY_MAX_INSTANCE_COUNT 500

所以一个_drawcall_只能允许最大500个实例。另外，这里草和树的_shader_是我用了js的资源，所以阴影和_lightmap_的我就没增加，我用_cube_这个模型做的demo里是有这方面处理的。

图片来源：游戏www.cungun.com游戏

六、结论

在OpenGL ES3.0及以上设备中，我们完全可以使用_GPU Instancing_技术来更好地提升我们的游戏性能，将更多的CPU时间留给复杂的逻辑，比如说战斗等游戏体验要求较高的模块；而在较旧的ES2.0的设备上，我们完全可以采用现有的做法来兼容，而这时候我们可能需要做的更多的就是精简模型，通过_Lod_等其他策略来进行优化。