Hybird3D

乔捷的技术博客

博客园 首页 新随笔 联系 订阅 管理

看到空明流转分享了他的SALVIA 0.5.2优化谈,我也来说说Hybird3D中和光栅化相关的一些优化技术。

Hybird3D的设计目标是打造一款准实时的软件高质量渲染器,采用了光栅化和光线跟踪混合算法,光栅化用于渲染eye ray,光线跟踪则用于阴影、反射、折射、全局光等次级光线的计算。由于渲染器是以准实时(一帧渲染时间在几十毫秒到几秒之间)为设计目标的,因此性能优化是非常重要的,但同时为了能够实现高质量的软件渲染,在渲染器的架构和支持的特性上也不会因为性能而缩水和妥协。

光栅化算法

Hybird3D的光栅化算法主要参考的是Michael Abrash写的《Rasterization on Larrabee》这篇paper,这个算法是目前我所知道的同时能够支持多线程和SIMD的最佳算法,这个算法保证了每个tile的光栅化可以由每个线程独立计算,而没有任何数据冲突,从而实现完全的无锁化计算。Larrabee有512bit的SIMD宽度,也就是一次可以计算16个float,在一般的支持SSE的CPU上我们只能使用128bit的SIMD指令一次计算4个float,虽然宽度少了,但不影响算法的应用。

Hybird3D也是采用的16x16的tile,光栅化的主要任务是计算某个像素上对应的三角图元和三角形重心坐标(另外为了后续的mipmap纹理插值我们还需要保存重心坐标的差分值),有了图元和重心坐标就可以送到后端的LightShader和PixelShader中做进一步的计算了。一个像素上不一定只存在一个图元,半透明物体以及抗锯齿(anti-aliasing)等都会使得一个像素上存在多个图元,可以将这些图元保存为一个单向链表,同时为每个图元设置一个alpha值作为混合权重就可以了。

内存优化技巧

内存的优化主要包括内存的分配和cache的合理利用。在光栅化过程中会产生很多的临时对象,对象的数量不可预估但是每种对象的生命周期可以很容易的知道,所以我们可以采取多次分配一次释放的策略来实现非常高效的对象分配,根据不同的生命周期需要多个内存分配器,同时为了防止多线程冲突,每个线程需要独立的内存分配器。对cache的利用则需要合理的设计我们的渲染流水线以及合理的组织数据结构SoA(Structure of Arrays,即把结构体数组按字段拆成多个数组),让数据访问尽可能的集中。SoA不但可以让数据访问变得集中而且对SIMD指令非常友好,不过SoA的编程难度很高,会让代码变得非常难写和难读。

SIMD编程技巧

SIMD的编程一直是件脏活和累活,15年前我就开始使用MMX指令来加速应用,那个时候只能内嵌汇编,而且指令与普通的运算指令差别很大,写完之后过段时间自己也看不懂了,代码的维护是一个非常让人头疼的问题。后来出了intrinsics指令,可以在C代码中用函数的形式来编写SIMD指令,免去了手工写汇编的痛苦,但是intrinsics指令同普通的C运算符差别还是很大,代码的可读性依然不佳,好在SSE指令集还是比较规整的,大部分运算指令可以用C++运算符重载来包装intrinsics指令,下面给出我的包装函数供大家参考。

 // Element-wise arithmetic on packed floats. Each operator forwards to the
 // matching SSE *_ps intrinsic, so all four lanes are processed at once.
 // NOTE(review): operators whose operands are all __m128 only compile where
 // __m128 is a class/union type (presumably MSVC here) -- confirm before
 // building with a compiler that treats it as a builtin vector type.

 // Lane-wise v1 + v2.
 1 inline __m128 operator + (__m128 v1, __m128 v2)
 2 {
 3     return _mm_add_ps(v1, v2);
 4 }
 5 
 // Lane-wise v1 - v2.
 6 inline __m128 operator - (__m128 v1, __m128 v2)
 7 {
 8     return _mm_sub_ps(v1, v2);
 9 }
10 
 // Lane-wise v1 * v2.
11 inline __m128 operator * (__m128 v1, __m128 v2)
12 {
13     return _mm_mul_ps(v1, v2);
14 }
15 
 // Lane-wise v1 / v2 (full-precision divide, not the reciprocal approximation).
16 inline __m128 operator / (__m128 v1, __m128 v2)
17 {
18     return _mm_div_ps(v1, v2);
19 }
20 
// Lane-wise comparisons. Unlike the scalar operators these do NOT return bool:
// each lane of the result is a mask (all bits set where the comparison holds,
// all bits clear otherwise). Combine the masks with & / |, feed them to
// Select(), or collapse them to an int with MoveMask().

// Mask of lanes where v1 == v2.
21 inline __m128 operator == (__m128 v1, __m128 v2)
22 {
23     return _mm_cmpeq_ps(v1, v2);
24 }
25 
// Mask of lanes where v1 != v2.
26 inline __m128 operator != (__m128 v1, __m128 v2)
27 {
28     return _mm_cmpneq_ps(v1, v2);
29 }
30 
// Mask of lanes where v1 > v2.
31 inline __m128 operator > (__m128 v1, __m128 v2)
32 {
33     return _mm_cmpgt_ps(v1, v2);
34 }
35 
// Mask of lanes where v1 >= v2.
36 inline __m128 operator >= (__m128 v1, __m128 v2)
37 {
38     return _mm_cmpge_ps(v1, v2);
39 }
40 
// Mask of lanes where v1 < v2.
41 inline __m128 operator < (__m128 v1, __m128 v2)
42 {
43     return _mm_cmplt_ps(v1, v2);
44 }
45 
// Mask of lanes where v1 <= v2.
46 inline __m128 operator <= (__m128 v1, __m128 v2)
47 {
48     return _mm_cmple_ps(v1, v2);
49 }
50 
// Bitwise AND of the raw 128 bits; used in this file to combine comparison
// masks and to mask a value by a comparison result.
51 inline __m128 operator & (__m128 v1, __m128 v2)
52 {
53     return _mm_and_ps(v1, v2);
54 }
55 
// Bitwise OR of the raw 128 bits.
56 inline __m128 operator | (__m128 v1, __m128 v2)
57 {
58     return _mm_or_ps(v1, v2);
59 }
60 
// Packs the sign bit of each of the four lanes into bits 0..3 of the result.
// For a comparison mask this is non-zero iff at least one lane passed.
61 inline int MoveMask(__m128 v)
62 {
63     return _mm_movemask_ps(v);
64 }
65 
// Lane-wise maximum of v1 and v2.
66 inline __m128 Max(__m128 v1, __m128 v2)
67 {
68     return _mm_max_ps(v1, v2);
69 }
70 
// Lane-wise minimum of v1 and v2.
71 inline __m128 Min(__m128 v1, __m128 v2)
72 {
73     return _mm_min_ps(v1, v2);
74 }
75 
// Bitwise blend: every bit of the result comes from `a` where the matching
// bit of `mask` is set and from `b` where it is clear. With a comparison mask
// (all-ones / all-zeros per lane) this is a per-lane conditional, and because
// it only moves bits it is also safe for non-float payloads stored in lanes.
76 //mask ? a : b
77 inline __m128 Select(__m128 mask, __m128 a, __m128 b)
78 {
79     return _mm_or_ps(_mm_and_ps(a, mask), _mm_andnot_ps(mask, b));
80 }
81 
// Broadcasts lane `n` (0..3) of `m` into all four lanes of the result.
// Any other value of `n` returns `m` unchanged.
inline __m128 Extract(__m128 m, int n)
{
    // _mm_shuffle_ps needs a compile-time immediate, hence one branch per lane.
    if(n == 0)
        return _mm_shuffle_ps(m, m, _MM_SHUFFLE(0, 0, 0, 0));
    if(n == 1)
        return _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));
    if(n == 2)
        return _mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2));
    if(n == 3)
        return _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 3, 3, 3));
    return m;
}

 

最后是干货时间,放出Hybird3D中光栅化相关的代码供大家参考。

   1 #include "stdafx.h"
   2 #include "RayTracer.h"
   3 #include "Clipper.h"
   4 #include "PrimitiveTile.h"
   5 
   6 #pragma warning(disable: 4018)
   7 
   // Dimensions, in pixels, of one screen tile. The rasterization routines in
   // this file process the render target tile by tile.
   8 #define TILE_WIDTH  16
   9 #define TILE_HEIGHT 16
  10 
   // NOTE(review): not referenced in the visible part of this file -- presumably
   // a threshold on a normal dot product; confirm at its use site.
  11 const float NORMAL_THRESHOLD = 0.9f;
   // Defined in another translation unit; presumably the maximum reflection
   // recursion depth -- confirm at its definition.
  12 extern int ReflectionDepth;
  13 
  // One node of a singly linked list of per-light lighting records
  // (PixelContext::light points at such a node). Declared 16-byte aligned.
  // NOTE(review): field meanings below are inferred from the names only --
  // confirm against the LightShader implementations that fill them in.
  14 _CRT_ALIGN(16) struct Illuminance
  15 {
  16     Float3 direction;       // presumably the direction towards the light
  17     float illuminance;      // presumably the scalar light intensity at the point
  18     Float3 color;           // presumably the light colour
  19     float shadowFactor;     // presumably the visibility / shadow attenuation
  20     Illuminance* next;      // next record in the list (list layout not shown here)
  21     LightShader* light;     // light shader this record belongs to
  22 };
  23 
  // Vertex as handed to the clipper / VertConvert(). VertConvert() reads `pos`
  // with an aligned 16-byte load and treats the memory starting at `normal` as
  // an array of 16-byte attribute channels, so instances must be 16-byte
  // aligned and any extra channels must directly follow `normal`.
  24 struct VertexOutput
  25 {
  26     Float4 pos;             // position; its w component is used for the perspective divide
  27     Float4 normal;          // first attribute channel
  // Zero-length trailing array: a compiler extension that marks where any
  // additional per-vertex attributes begin.
  28     float attributes[0];
  29 };
  30 
  31 struct PolyPrimitive;
  32 
  // One surface fragment seen through a pixel. Several fragments can share a
  // pixel (transparency, anti-aliasing samples, reflections); they are chained
  // through `next` and weighted by `alpha`.
  //
  // Layout note: the pixel-creation routines in this file write `uv` and
  // `duvdx` with a single 16-byte store starting at `uv`, and write `duvdy`
  // with a 16-byte store starting at `duvdy` (which therefore also overwrites
  // the bytes that follow it). Do not reorder uv / duvdx / duvdy / light
  // without updating those routines.
  33 _CRT_ALIGN(16) struct PixelContext
  34 {
  35     PixelContext* next;     // next fragment of the same pixel, or 0
  36     PolyPrimitive* prim;    // primitive the fragment lies on
  37     int triIndex;           // triangle index inside `prim`
  38     float alpha;            // blend weight of this fragment
  39     Float4 pos;             // surface position; pos.w is overwritten with the interpolated 1/w used for depth ordering
  40     Float4 view;            // normalized vector from the surface point towards the eye
  41     Float4 normal;          // surface normal
  42     Float2 uv;              // coordinates passed to GetPosNormal() to locate the point on the triangle
  43     Float2 duvdx;    //d(uv) / dx
  44     Float2 duvdy;    //d(uv) / dy
  45     Illuminance* light;     // head of the lighting records; initialised to 0 by the creators
  46     void* userData;         // not written by the creation routines visible here
  47 };
  48 
  // The six clip planes, one {a, b, c, d} row each. ClippingAndDraw() passes
  // each row to ClipPoly() reinterpreted as a Float4, hence the 16-byte
  // alignment.
  // NOTE(review): presumably each row keeps points with a*x + b*y + c*z + d*w >= 0
  // in homogeneous clip space (z >= 0, z <= w, -w <= x <= w, -w <= y <= w) --
  // confirm against ClipPoly.
  49 _CRT_ALIGN(16) const float FrustumClipPlane[6][4] = {
  50     { 0, 0, 1, 0},
  51     { 0, 0,-1, 1},
  52     { 1, 0, 0, 1},
  53     {-1, 0, 0, 1},
  54     { 0, 1, 0, 1},
  55     { 0,-1, 0, 1},
  56 };
  57 
  // File-level viewport transform shared by all RenderContext instances.
  // VertConvert() maps a perspective-divided position p to
  // (p + ScreenOffset) * ScreenScale.
  // Set to (1, -1, 0, 0) by the RenderContext constructor.
  58 __m128 ScreenOffset;
  // Set to (width / 2, -height / 2, 1, 1) by RenderContext::SetRenderTarget().
  59 __m128 ScreenScale;
  // Lane-wise reciprocal of ScreenScale, refreshed together with it.
  60 __m128 ScreenScaleInv;
  61 
  62 struct RenderContext : public IRenderContext
  63 {
  64     Float4x4        ViewProjMatrix;
  65     Float4x4        ViewInvMatrix;
  66     Float4            _eye;
  67     float ScreenWidth, ScreenHeight;
  68     PrimitiveTile*    _primTiles;
  69     Bitmap*            _renderTarget;
  70     int                _tileCol, _tileRow;
  71     BYTE*            _vertexTempBuf;
  72     size_t            _vertexTempSize;
  73     ICamera*        _camera;
  74     Accel            _accelStruct;
  75     int                _aaLevel;
  76     int                _primCount;
  77     DWORD            _bkColor;
  78     Float4            _bkColorF;
  79     vector<LightShader*> _lights;
  80 
  81     RenderContext()
  82     {
  83         _vertexTempSize = 0;
  84         _vertexTempBuf = 0;
  85         _primTiles = 0;
  86         _aaLevel = 0;
  87         _tileCol = 0;
  88         _tileRow = 0;
  89         ScreenOffset = _mm_setr_ps(1, -1, 0, 0);
  90     }
  91 
  92     void AddLight(LightShader* light)
  93     {
  94         _lights.push_back(light);
  95     }
  96 
  97     void ClearLights()
  98     {
  99         _lights.clear();
 100     }
 101 
 102     void SetRenderTarget(Bitmap* target)
 103     {
 104         ScreenWidth = target->width;
 105         ScreenHeight = target->height;
 106         int tileCount = _tileCol * _tileRow;
 107         _renderTarget = target;
 108         _tileCol = Align(target->width, TILE_WIDTH) / TILE_WIDTH;
 109         _tileRow = Align(target->height, TILE_HEIGHT) / TILE_HEIGHT;
 110         if(tileCount < _tileCol * _tileRow)
 111         {
 112             if(_primTiles)
 113                 delete[] _primTiles;
 114 
 115             _primTiles = new PrimitiveTile[_tileCol * _tileRow];
 116         }
 117         for(int i = 0; i < _tileCol * _tileRow; ++i)
 118             _primTiles[i].Clear();
 119 
 120         ScreenScale = _mm_setr_ps(ScreenWidth * 0.5f, -ScreenHeight * 0.5f, 1, 1);
 121         ScreenScaleInv = m128(1) / ScreenScale;
 122     }
 123 
 124     void SetAntiAliasQuality(int level)
 125     {
 126         _aaLevel = min(max(0, level), 4);
 127     }
 128 
 129     void SetCamera(ICamera* camera)
 130     {
 131         _camera = camera;
 132     }
 133 
 134     ICamera* GetCamera()
 135     {
 136         return _camera;
 137     }
 138 
 139     void VertConvert(Float4* dest, VertexOutput* vert, int vertChannels)
 140     {
 141         __m128 pos = _mm_load_ps(vert->pos);
 142         __m128 w = _mm_shuffle_ps(pos, pos, _MM_SHUFFLE(3, 3, 3, 3));
 143 
 144         __m128 rhw = _mm_div_ss(_mm_set_ss(1), w);
 145         rhw = _mm_shuffle_ps(rhw, rhw, 0);
 146 
 147         _mm_store_ps(dest[0], _mm_mul_ps(_mm_add_ps(_mm_mul_ps(pos, rhw), ScreenOffset), ScreenScale));
 148 
 149         __m128* attr = (__m128*)&vert->normal;
 150         for(int k = 0; k < vertChannels; k++)
 151             _mm_store_ps(dest[k + 1], _mm_mul_ps(attr[k], rhw));
 152 
 153         _mm_store_ss(&dest[0].w, rhw);
 154     }
 155 
 156     virtual void BeginScene()
 157     {
 158         _accelStruct.BeginBuild();
 159     }
 160 
 161     virtual void AddPolyons(VertexOutput* verts, int vertSize,
 162         int vertCount, DWORD* triangles, int count, Shader* shader)
 163     {
 164 
 165         _accelStruct.AddPolygons((BYTE*)verts, triangles, vertSize, vertCount, count, shader);
 166     }
 167 
 168     virtual void EndScene()
 169     {
 170         _accelStruct.Build();
 171     }
 172 
 173     virtual void SetBackground(DWORD color)
 174     {
 175         _bkColor = color;
 176         _bkColorF = Float4((float)(color & 0xff),
 177             (float)((color >> 8) & 0xff),
 178             (float)((color >> 16) & 0xff), 1);
 179 
 180         _bkColorF /= 255.f;
 181 
 182         _bkColorF = _bkColorF * _bkColorF;
 183     }
 184 
 185     void RasterTile(PrimitiveTile* tile, int x, int y,
 186         DWORD* target, int pitch, struct FGSampleTable* FGSamples = 0);
 187 
 188     void RasterFGSample(PrimitiveTile* tile, int x, int y, struct FGSampleMap& dest);
 189     
 190     void RasterFragmentSample(PrimitiveTile* tile, int x, int y, struct FragmentSampleMap& dest);
 191     
 192     void FGShader(struct FGSampleRef* samples, int count);
 193 
 194     void DrawPrimitive(TriVertex** vert, TrianglePrim& tri);
 195 
 196     void ClippingAndDraw(TriVertex** verts, TrianglePrim& tri);
 197 
 198     void DrawTriangle(TrianglePrim& tri);
 199 
 200     void Render();
 201 
 202     static void* operator new (size_t size)
 203     {
 204         return _aligned_malloc(sizeof(RenderContext), 16);
 205     }
 206 };
 207 
 // Shades one translucent triangle at four horizontally adjacent pixels and
 // inserts the resulting fragments into the per-pixel fragment lists.
 //
 //   pixels  four list heads, one per pixel; each list is kept ordered by
 //           descending pos.w
 //   prim    triangle setup; a / b / c are read as 4-lane interpolant planes
 //           relative to prim->p0
 //   eye     eye position, used for the view vector
 //   rhw     four per-pixel values; a pixel is skipped unless its entry is > 0
 //           (NOTE(review): presumably the depth written by the coverage pass
 //           -- confirm at the call site)
 //   x, y    sample position of the first pixel
 //   alloc   arena the new PixelContext objects are taken from
 //
 // A fragment is only created when the shader's TransprentShader() returns an
 // alpha above 0.01.
 208 void Create4TransPixels(PixelContext** pixels, TriPrimitive* prim, const Float4& eye,
 209                         float* rhw, float x, float y, Allocator& alloc)
 210 {
 211     __m128 ma = _mm_loadu_ps(prim->a);
 212     __m128 mb = _mm_loadu_ps(prim->b);
     // Interpolants at the first pixel: a * (x - p0.x) + b * (y - p0.y) + c.
 213     __m128 a0 =  ma * (m128(x - prim->p0.x)) + mb * (m128(y - prim->p0.y)) + _mm_loadu_ps(prim->c);
 214     for(int i = 0; i < 4; ++i)
 215     {
 216         __m128 a = a0;
 217         if(rhw[i] > 0)
 218         {
             // Scratch context: only pos.w, prim, triIndex, uv, duvdx and duvdy
             // are filled in before it is handed to the shader.
 219             PixelContext pixel;
             // Interpolants one pixel to the right / one pixel down.
 220             __m128 adx = a + ma;
 221             __m128 ady = a + mb;
             // Divide every lane by lane 0 (exact divide for this pixel, fast
             // reciprocal for the neighbours), then take the difference between
             // this pixel and each neighbour.
 222             __m128 r = _mm_div_ss(m128(1), a);
 223             a = a * Extract(r, 0);
 224             adx = a - adx * Extract(_mm_rcp_ss(adx), 0);
 225             ady = a - ady * Extract(_mm_rcp_ss(ady), 0);
             // Lane 0 of the undivided interpolants is kept as the depth key.
 226             _mm_store_ss(&pixel.pos.w, a0);
 227             pixel.prim = prim->prim;
 228             pixel.triIndex = prim->triIndex;
             // One 16-byte store writes lanes 1,2 of `a` and lanes 1,2 of `adx`
             // starting at uv (assumes duvdx directly follows uv in memory).
 229             _mm_storeu_ps(pixel.uv, _mm_shuffle_ps(a, adx, _MM_SHUFFLE(2, 1, 2, 1)));
             // Also a 16-byte store: the upper half spills into the fields that
             // follow duvdy, which are not used from this scratch object.
 230             _mm_storeu_ps(pixel.duvdy, _mm_shuffle_ps(ady, ady, _MM_SHUFFLE(2, 1, 2, 1)));
 231             float alpha = prim->prim->shader->TransprentShader(&pixel);
 232             if(alpha > 0.01f)
 233             {
 234                 //insert pixel
 235                 PixelContext* p = (PixelContext*)alloc.Alloc(sizeof(PixelContext), 16);
 236                 p->alpha = alpha;
 237                 p->prim = prim->prim;
 238                 p->triIndex = prim->triIndex;
 239                 p->uv = pixel.uv;
 240                 p->duvdx = pixel.duvdx;
 241                 p->duvdy = pixel.duvdy;
 242                 prim->prim->GetPosNormal(prim->triIndex, pixel.uv, &p->pos, &p->normal);
 243                 p->view = NormalizeFast(eye - p->pos);
 244                 p->light = 0;
 245                 p->next = 0;
                 // Restore the depth key after GetPosNormal() filled p->pos.
 246                 p->pos.w = pixel.pos.w;
 247 
                 // Weight still available for the new fragment: starts at 1 and
                 // is reduced by the alpha of every fragment sorted before it.
 248                 float alpha2 = 1;
 249                 if(pixels[i] == 0)
 250                     pixels[i] = p;
 251                 else
 252                 {
 253                     PixelContext* prev = 0;
 254                     PixelContext* pp = pixels[i];
                     // Walk past every fragment whose pos.w is >= the new one.
 255                     while(pp)
 256                     {
 257                         if(p->pos.w > pp->pos.w)
 258                             break;
 259                         alpha2 -= pp->alpha;
 260                         prev = pp;
 261                         pp = pp->next;
 262                     }
 263                     p->alpha = alpha * alpha2;
 264                     if(prev)
 265                     {
 266                         p->next = prev->next;
 267                         prev->next = p;
 268                     }
 269                     else
 270                     {
 271                         p->next = pixels[i];
 272                         pixels[i] = p;
 273                     }
 274 
                     // A (nearly) opaque fragment hides everything sorted after
                     // it, so the tail is dropped; otherwise the tail is
                     // attenuated by what this fragment lets through.
 275                     if(alpha > 0.99f)
 276                     {
 277                         p->next = 0;
 278                     }
 279                     else
 280                     {
 281                         alpha = 1 - alpha;
 282                         pp = p->next;
 283                         while(pp)
 284                         {
 285                             pp->alpha *= alpha;
 286                             pp = pp->next;
 287                         }
 288                     }
 289                 }
 290             }
 291         }
         // Step the interpolants one pixel to the right.
 292         a0 = a0 + ma;
 293     }
 294 }
 295 
 296 void CreateMainPixels(PixelContext** pixels, TriPrimitive** primBuf, const Float4& eye,
 297                       float startX, float startY, int tileSize, float alpha, Allocator& alloc)
 298 {
 299     __m128 px = m128(startX);
 300     __m128 py = m128(startY);
 301     for(int i = 0; i < tileSize; ++i)
 302     {
 303         if(i % 16 == 0 && i > 0)
 304         {
 305             py = py + m128(1);
 306             px = m128(startX);
 307         }
 308         TriPrimitive* prim = primBuf[i];
 309         if(prim)
 310         {
 311             PixelContext* p = (PixelContext*)alloc.Alloc(sizeof(PixelContext), 16);
 312             __m128 ma = _mm_loadu_ps(prim->a);
 313             __m128 mb = _mm_loadu_ps(prim->b);
 314             __m128 a =  ma * (px - m128(prim->p0.x)) +
 315                         mb * (py - m128(prim->p0.y)) + _mm_loadu_ps(prim->c);
 316             __m128 rhw = a;
 317 
 318             __m128 r = _mm_div_ss(m128(1), a);
 319             __m128 w = _mm_shuffle_ps(r, r, 0);
 320             __m128 adx = a + ma;
 321             __m128 ady = a + mb;
 322             a = a * w;
 323             r = _mm_rcp_ss(adx);
 324             adx = a - adx * _mm_shuffle_ps(r, r, 0);
 325             r = _mm_rcp_ss(ady);
 326             ady = a - ady * _mm_shuffle_ps(r, r, 0);
 327             _mm_storeu_ps(p->uv, _mm_shuffle_ps(a, adx,  _MM_SHUFFLE(2, 1, 2, 1)));
 328             _mm_storeu_ps(p->duvdy, _mm_shuffle_ps(ady, ady,  _MM_SHUFFLE(2, 1, 2, 1)));
 329             p->prim = prim->prim;
 330             p->triIndex = prim->triIndex;
 331             p->prim->GetPosNormal(p->triIndex, p->uv, &p->pos, &p->normal);
 332             p->view = NormalizeFast(eye - p->pos);
 333             p->light = 0;
 334             p->alpha = alpha;
 335             p->next = 0;
 336             pixels[i] = p;
 337             _mm_store_ss(&p->pos.w, rhw);
 338         }
 339         else
 340             pixels[i] = 0;
 341         px = px + m128(1);
 342     }
 343 }
 344 
 345 void InsertPixel(PixelContext** pixel, PixelContext* p)
 346 {
 347     if(*pixel == 0)
 348         *pixel = p;
 349     else
 350     {
 351         PixelContext* prev = 0;
 352         PixelContext* pp = *pixel;
 353         while(pp)
 354         {
 355             if(p->pos.w > pp->pos.w)
 356                 break;
 357             prev = pp;
 358             pp = pp->next;
 359         }
 360         if(prev)
 361         {
 362             p->next = prev->next;
 363             prev->next = p;
 364         }
 365         else
 366         {
 367             p->next = *pixel;
 368             *pixel = p;
 369         }
 370     }
 371 }
 372 
 373 void CreatePixels(PixelContext** pixels, TriPrimitive** primBuf, const Float4& eye,
 374                   float alpha, float startX, float startY, int tileSize, Allocator& alloc)
 375 {
 376     __m128 px = m128(startX);
 377     __m128 py = m128(startY);
 378     for(int i = 0; i < tileSize; ++i)
 379     {
 380         if(i % 16 == 0 && i > 0)
 381         {
 382             py = py + m128(1);
 383             px = m128(startX);
 384         }
 385         TriPrimitive* prim = primBuf[i];
 386         if(prim)
 387         {
 388             PixelContext* pixel = pixels[i];
 389             while(pixel)
 390             {
 391                 if(pixel->prim == prim->prim)
 392                 {
 393                     pixel->alpha += alpha;
 394                     goto _SkipPixel;
 395                 }
 396                 pixel = pixel->next;
 397             }
 398 
 399             PixelContext* p = (PixelContext*)alloc.Alloc(sizeof(PixelContext), 16);
 400             __m128 ma = _mm_loadu_ps(prim->a);
 401             __m128 mb = _mm_loadu_ps(prim->b);
 402 
 403             __m128 a =  ma * (px - m128(prim->p0.x)) +
 404                         mb * (py - m128(prim->p0.y)) + _mm_loadu_ps(prim->c);
 405             __m128 rhw = a;
 406 
 407             __m128 r = _mm_div_ss(m128(1), a);
 408             __m128 w = Extract(r, 0);
 409             __m128 adx = a + ma;
 410             __m128 ady = a + mb;
 411             a = a * w;
 412             r = _mm_rcp_ss(adx);
 413             adx = a - adx * Extract(r, 0);
 414             r = _mm_rcp_ss(ady);
 415             ady = a - ady * Extract(r, 0);
 416             _mm_storeu_ps(p->uv, _mm_shuffle_ps(a, adx,  _MM_SHUFFLE(2, 1, 2, 1)));
 417             _mm_storeu_ps(p->duvdy, _mm_shuffle_ps(ady, ady,  _MM_SHUFFLE(2, 1, 2, 1)));
 418             p->prim = prim->prim;
 419             p->triIndex = prim->triIndex;
 420             p->prim->GetPosNormal(p->triIndex, p->uv, &p->pos, &p->normal);
 421             p->view = NormalizeFast(eye - p->pos);
 422             p->light = 0;
 423             p->alpha = alpha;
 424             p->next = 0;
 425             _mm_store_ss(&p->pos.w, rhw);
 426             InsertPixel(&pixels[i], p);
 427         }
 428     _SkipPixel:
 429         px = px + m128(1);
 430     }
 431 }
 432 
 433 void RasterFullCoverPrim(TriPrimitive* prim, float startX,
 434                          float startY, float* primBuf, float* wBuf)
 435 {
 436     __m128 startW = m128((startX - prim->p0.x) * prim->a[0]
 437                     + (startY - prim->p0.y) * prim->b[0] + prim->c[0]);
 438     __m128 rhwDx = m128(prim->a[0] * 4);
 439     __m128 primData = m128(*(float*)&prim);
 440     startW = startW + m128(prim->a[0]) * _mm_set_ps(3, 2, 1, 0);
 441 
 442     for(int i = 0; i < TILE_HEIGHT; ++i)
 443     {
 444         __m128 rhw = startW;
 445         for(int j = 0; j < TILE_WIDTH; j += 4)
 446         {
 447             __m128 oldW = _mm_load_ps(wBuf + j);
 448             __m128 mask = rhw > oldW;
 449             _mm_store_ps(wBuf + j, Select(mask, rhw, oldW));
 450             _mm_store_ps(primBuf + j, Select(mask, primData, _mm_load_ps(primBuf + j)));
 451             rhw = rhw + rhwDx;
 452         }
 453         wBuf += TILE_WIDTH;
 454         primBuf += TILE_WIDTH;
 455         startW = startW + m128(prim->b[0]);
 456     }
 457 }
 458 
 459 void RasterPrim(TriPrimitive* prim, float x, float y,
 460                 float xs, float ys, TriPrimitive** primBuf, float* wBuf)
 461 {
 462     __m128 ex[3];
 463     __m128 ey[3];
 464     __m128 xOff[3];
 465     __m128 yOff[3];
 466     __m128 mask0[3];
 467     __m128 primData = m128(*(float*)&prim);
 468 
 469     for(int i = 0; i < 3; ++i)
 470     {
 471         ex[i] = m128(prim->ea[i]);
 472         ey[i] = m128(prim->eb[i]);
 473         xOff[i] = (ex[i] > m128(0)) & m128(4);
 474         yOff[i] = (ey[i] > m128(0)) & m128(4);
 475     }
 476     __m128 p0x = m128(prim->p0.x);
 477     __m128 p0y = m128(prim->p0.y);
 478     __m128 p1x = p0x - ey[0];
 479     __m128 p1y = p0y + ex[0];
 480 
 481     mask0[0] = (m128(x) - p0x) * ex[0] + (m128(y) - p0y) * ey[0];
 482     mask0[1] = (m128(x) - p1x) * ex[1] + (m128(y) - p1y) * ey[1];
 483     mask0[2] = (m128(x) - p0x) * ex[2] + (m128(y) - p0y) * ey[2];
 484 
 485     __m128 rhw0 = (_mm_set_ps(3, 2, 1, 0) + m128(x + xs) - p0x) * m128(prim->a[0]) +
 486                   (m128(y + ys) - p0y) * m128(prim->b[0]) + m128(prim->c[0]);
 487     __m128* mprimBuf = (__m128*)primBuf;
 488     __m128* mwBuf = (__m128*)wBuf;
 489 
 490     __m128 yStep = m128(0);
 491     for(int iy = 0; iy < 4; ++iy)
 492     {
 493         __m128 mask;
 494         __m128 xStep = _mm_set_ps(12, 8, 4, 0);
 495         mask =  ((mask0[0] + (xStep + xOff[0]) * ex[0] + (yStep + yOff[0]) * ey[0]) >= m128(0)) &
 496                 ((mask0[1] + (xStep + xOff[1]) * ex[1] + (yStep + yOff[1]) * ey[1]) >= m128(0)) &
 497                 ((mask0[2] + (xStep + xOff[2]) * ex[2] + (yStep + yOff[2]) * ey[2]) >= m128(0));
 498 
 499         int* imask = (int*)&mask;
 500         if(MoveMask(mask))
 501         {
 502             __m128 rhw1 = rhw0;
 503             for(int ix = 0; ix < 4; ++ix)
 504             {
 505                 if(imask[ix])
 506                 {
 507                     __m128 mask1[3];
 508                     __m128 xpos = _mm_set_ps(3, 2, 1, 0) + m128((float)(ix * 4) + xs);
 509                     __m128 ypos = yStep + m128(ys);
 510                     mask1[0] = mask0[0] + xpos * ex[0] + ypos * ey[0];
 511                     mask1[1] = mask0[1] + xpos * ex[1] + ypos * ey[1];
 512                     mask1[2] = mask0[2] + xpos * ex[2] + ypos * ey[2];
 513 
 514                     __m128* mprimBuf0 = mprimBuf + ix;
 515                     __m128* mwBuf0 = mwBuf + ix;
 516                     __m128 rhw = rhw1;
 517                     for(int j = 0; j < 4; ++j)
 518                     {
 519                         __m128 pmask =  (rhw > *mwBuf0) &
 520                                         (mask1[0] >= m128(0)) &
 521                                         (mask1[1] >= m128(0)) &
 522                                         (mask1[2] >= m128(0));
 523 
 524                         *mwBuf0 = Select(pmask, rhw, *mwBuf0);
 525                         *mprimBuf0 = Select(pmask, primData, *mprimBuf0);
 526                         mask1[0] = mask1[0] + ey[0];
 527                         mask1[1] = mask1[1] + ey[1];
 528                         mask1[2] = mask1[2] + ey[2];
 529                         mprimBuf0 += 4;
 530                         mwBuf0 += 4;
 531                         rhw = rhw + m128(prim->b[0]);
 532                     }
 533                 }
 534                 rhw1 = rhw1 + m128(prim->a[0]) * m128(4);
 535             }
 536         }
 537         rhw0 = rhw0 + m128(4) * m128(prim->b[0]);
 538         mprimBuf += 16;
 539         mwBuf += 16;
 540         yStep = yStep + m128(4);
 541     }
 542 }
 543 
 544 void CreateReflectRay(Ray* rays, int count, PixelContext* pixel, ReflectInfo* refInfo, const Float4& eye)
 545 {
 546     if(count == 1)
 547     {
 548         Float4 pos = pixel->pos;
 549         Float4 normal = pixel->normal;
 550         Ray& ray = rays[0];
 551 
 552         Float4 refVec = -Normalize(Reflect(pixel->view, normal));
 553         pos = pos + refVec * 0.02f;
 554         _mm_store_ps(ray.pos, pos.m);
 555         _mm_store_ps(ray.dir, refVec.m);
 556         ray.triIndex = -1;
 557         ray.tmin = 0;
 558         ray.tmax = 1e10;
 559         ray.userData = refInfo;
 560         return;
 561     }
 562 
 563     static const Float2 offset[] = {
 564         Float2(0, 0),
 565         Float2(-0.4f, -0.4f),
 566         Float2(0.4f, -0.4f),
 567         Float2(0, 0.4f)
 568     };
 569     for(int i = 0; i < count; ++i)
 570     {
 571         Float4 pos = pixel->pos;
 572         Float4 normal = pixel->normal;
 573         Ray& ray = rays[i];
 574 
 575         Float4 dpos, dnormal;
 576         pixel->prim->GetPosNormalDifferential(pixel->triIndex, pixel->duvdx * offset[i].x, &dpos, &dnormal);
 577         pos = pos + dpos;
 578         normal = normal + dnormal;
 579 
 580         pixel->prim->GetPosNormalDifferential(pixel->triIndex, pixel->duvdy * offset[i].y, &dpos, &dnormal);
 581         pos = pos + dpos;
 582         normal = NormalizeFast(normal + dnormal);
 583 
 584         Float4 Vn;
 585         //Vn = pixel->view;
 586         Vn = NormalizeFast(eye - pos);
 587 
 588         Float4 refVec = -Normalize(Reflect(Vn, normal));
 589         pos = pos + refVec * 0.02f;
 590         _mm_store_ps(ray.pos, pos.m);
 591         _mm_store_ps(ray.dir, refVec.m);
 592         ray.triIndex = -1;
 593         ray.tmin = 0;
 594         ray.tmax = 1e10;
 595         ray.userData = refInfo;
 596     }
 597 }
 598 
 599 void CreateReflectPixel(PixelContext** pixels, Ray& ray, Allocator* alloc)
 600 {
 601     ReflectInfo* refinfo = (ReflectInfo*)ray.userData;
 602     /*PixelContext* pixel = pixels[refinfo->index];
 603     while(pixel)
 604     {
 605         if(pixel->prim == ray.prim)
 606         {
 607             pixel->alpha += refinfo->strength;
 608             return;
 609         }
 610         pixel = pixel->next;
 611     }*/
 612     
 613     PixelContext* p = (PixelContext*)alloc->Alloc(sizeof(PixelContext), 16);
 614     p->prim = ray.prim;
 615     p->triIndex = ray.triIndex;
 616     p->uv.x = ray.u;
 617     p->uv.y = ray.v;
 618 
 619     Float4 posddx, normalddx;
 620     Float4 posddy, normalddy;
 621     refinfo->context->prim->GetPosNormalDifferential(refinfo->context->triIndex,
 622                                     refinfo->context->duvdx, &posddx, &normalddx);
 623     refinfo->context->prim->GetPosNormalDifferential(refinfo->context->triIndex,
 624                                     refinfo->context->duvdy, &posddy, &normalddy);
 625 
 626     Float2 uvdx, uvdy;
 627     p->prim->GetRayDifferential(ray.triIndex, *(Float4*)&ray.pos, *(Float4*)&ray.dir,
 628                                 posddx, posddy, normalddx, normalddy, &uvdx, &uvdy);
 629 
 630     p->duvdx = uvdx - p->uv;
 631     p->duvdy = uvdy - p->uv;
 632     p->alpha = refinfo->strength;
 633     ray.prim->GetPosition(ray.triIndex, p->uv, &p->pos);
 634     ray.prim->GetNormal(ray.triIndex, p->uv, &p->normal);
 635     p->view = NormalizeFast(Float4(ray.pos, 1) - p->pos);
 636     p->light = 0;
 637     p->next = pixels[refinfo->index];
 638     pixels[refinfo->index] = p;
 639 }
 640 
 // Copies `size` floats from wBuf to wBuf2 and `size` 32-bit lanes from
 // primBuf to primBuf2.
 //
 // All four buffers must be 16-byte aligned. Whole groups of four are copied
 // with aligned vector moves. A remainder (size not a multiple of 4) is copied
 // lane by lane, so nothing is read or written past `size` elements; the
 // original rounded the count up and overran both buffers in that case.
 // The primBuf lanes hold raw bit patterns rather than real floats, so they
 // are only ever moved with SSE loads / stores, which preserve the bits.
 void CopyBuf(float* wBuf, float* wBuf2, float* primBuf, float* primBuf2, int size)
 {
     const int vecEnd = size & ~3;
     int i = 0;
     for(; i < vecEnd; i += 4)
     {
         _mm_store_ps(wBuf2 + i, _mm_load_ps(wBuf + i));
         _mm_store_ps(primBuf2 + i, _mm_load_ps(primBuf + i));
     }
     for(; i < size; ++i)
     {
         _mm_store_ss(wBuf2 + i, _mm_load_ss(wBuf + i));
         _mm_store_ss(primBuf2 + i, _mm_load_ss(primBuf + i));
     }
 }
 649 
 650 float CopyBuf2(float* wBuf, float* wBuf2, float* primBuf, float* primBuf2, int size)
 651 {
 652     __m128 minRHW = m128(FLT_MAX);
 653     for(int i = 0; i < size; i += 4)
 654     {
 655         __m128 rhw = _mm_load_ps(wBuf + i);
 656         minRHW = Min(minRHW, rhw);
 657         _mm_store_ps((float*)primBuf2 + i, _mm_load_ps((float*)primBuf + i));
 658         _mm_store_ps(wBuf2 + i, rhw);
 659     }
 660 
 661     minRHW = Min(Min(Extract(minRHW, 0), Extract(minRHW, 1)),
 662         Min(Extract(minRHW, 2), Extract(minRHW, 3)));
 663     float m;
 664     _mm_store_ss(&m, minRHW);
 665     return m;
 666 }
 667 
 // Anti-aliasing tables, indexed by RenderContext::_aaLevel (clamped to 0..4
 // by SetAntiAliasQuality).

 // Number of additional sub-pixel samples rasterized per pixel on top of the
 // main sample taken at the pixel centre.
 668 int AACount[] = {0, 2, 4, 8, 16};
 // Blend weight of each sample: 1 / (AACount[level] + 1), i.e. the main sample
 // and every additional sample contribute equally.
 669 float AACfgAlpha[] = {1, 1 / 3.f, 1 / 5.f, 1 / 9.f, 1 / 17.f};
 // Sample positions inside a pixel, as (x, y) offsets from the pixel's
 // top-left corner; only the first AACount[level] entries of a row are used.
 670 Float2 AASampler[5][16] = {
 671     { Float2(0, 0) },
 672 
 673     { Float2(0.25f, 0.25f), Float2(0.75f, 0.75f) },
 674 
 675     { Float2(0.25f, 0.25f), Float2(0.75f, 0.75f), Float2(0.25f, 0.75f), Float2(0.75f, 0.25f) },
 676 
 677     { Float2(0.2f, 0.5f), Float2(0.8f, 0.5f), Float2(0.5f, 0.2f), Float2(0.5f, 0.8f),
 678       Float2(0.25f, 0.25f), Float2(0.75f, 0.25f), Float2(0.25f, 0.75f), Float2(0.75f, 0.75f)},
 679 
 680     { Float2(0.2f, 0.2f), Float2(0.4f, 0.2f), Float2(0.6f, 0.2f), Float2(0.8f, 0.2f),
 681       Float2(0.2f, 0.4f), Float2(0.4f, 0.4f), Float2(0.6f, 0.4f), Float2(0.8f, 0.4f),
 682       Float2(0.2f, 0.6f), Float2(0.4f, 0.6f), Float2(0.6f, 0.6f), Float2(0.8f, 0.6f),
 683       Float2(0.2f, 0.8f), Float2(0.4f, 0.8f), Float2(0.6f, 0.8f), Float2(0.8f, 0.8f) }
 684 };
 685 
 686 void RenderContext::ClippingAndDraw(TriVertex** verts, TrianglePrim& tri)
 687 {
 688     if(!BackCullTest((VertexOutput**)verts))
 689         return;
 690 
 691     Float4 vertTmpBuf[256];
 692     Float4* vertBuf = vertTmpBuf;
 693 
 694     int vertCount = 3;
 695     for(int i = 0; i < 6; i++)
 696     {
 697         vertCount = ClipPoly(*(const Float4*)FrustumClipPlane[i],
 698                             (VertexOutput**)verts, vertCount, 2, vertBuf);
 699         if(vertCount < 3)
 700             return;
 701     }
 702 
 703     Float4 vertsTmp[256];
 704     for(int i = 0; i < vertCount; i++)
 705         VertConvert(vertsTmp + i * 2, (VertexOutput*)verts[i], 1);
 706 
 707     TriVertex* triangle[3];
 708     for(int i = 0; i < vertCount - 2; i++)
 709     {
 710         triangle[0] = (TriVertex*)vertsTmp;
 711         triangle[1] = (TriVertex*)(vertsTmp + (i + 1) * 2);
 712         triangle[2] = (TriVertex*)(vertsTmp + (i + 2) * 2);
 713         DrawPrimitive(triangle, tri);
 714     }
 715 }
 716 
 717 void RenderContext::DrawTriangle(TrianglePrim& tri)
 718 {
 719     TriVertex verts[3];
 720     verts[0].pos = Mul(tri.p0, ViewProjMatrix);
 721     verts[1].pos = Mul(tri.p1, ViewProjMatrix);
 722     verts[2].pos = Mul(tri.p2, ViewProjMatrix);
 723     verts[0].uv = Float4(0, 0, 0, 0);
 724     verts[1].uv = Float4(1, 0, 0, 0);
 725     verts[2].uv = Float4(0, 1, 0, 0);
 726 
 727     TriVertex* verts2[36];
 728     verts2[0] = verts;
 729     verts2[1] = verts + 1;
 730     verts2[2] = verts + 2;
 731     ClippingAndDraw(verts2, tri);
 732 }
 733 
 734 void RenderContext::RasterTile(PrimitiveTile* tile, int x, int y, DWORD* target,
 735                                 int pitch, FGSampleTable* FGSamples)
 736 {
 737     const int tileSize = TILE_WIDTH * TILE_HEIGHT;
 738     _CRT_ALIGN(16) TriPrimitive* primBuf[tileSize];
 739     _CRT_ALIGN(16) float wBuf[tileSize];
 740     _CRT_ALIGN(16) PixelContext* pixels[tileSize];
 741     //_CRT_ALIGN(16) PixelContext* mainPixels[tileSize];
 742     //_CRT_ALIGN(16) PixelContext* transPixels[tileSize];
 743     _CRT_ALIGN(16) TriPrimitive* primBuf2[tileSize];
 744     _CRT_ALIGN(16) float wBuf2[tileSize];
 745     _CRT_ALIGN(16) Float4 colorBuf[tileSize];
 746 
 747     Allocator allocA(Align((BYTE*)_alloca(1024 * 1024), 16), 1024 * 1024 - 15);
 748     Allocator allocB(Align((BYTE*)_alloca(1024 * 1024), 16), 1024 * 1024 - 15);
 749     Allocator* alloc = &allocA;
 750 
 751     tile->MergePrimitives();
 752     
 753     if(!tile->HasPrimitive())
 754     {
 755         for(int i = 0; i < tileSize; ++i)
 756             *((DWORD*)((BYTE*)target + pitch * (i / 16)) + (i % 16)) = _bkColor;
 757         return;
 758     }
 759 
 760     float startX = (float)x + 0.5f;
 761     float startY = (float)y + 0.5f;
 762 
 763     for(int i = 0; i < tileSize; i += 4)
 764     {
 765         _mm_store_ps(wBuf + i, m128(0));
 766         _mm_store_ps((float*)primBuf + i, m128(0));
 767         //_mm_store_ps((float*)pixels + i, m128(0));
 768         //_mm_store_ps((float*)transPixels + i, m128(0));
 769     }
 770     float farRhw = 0;
 771     bool hasFullPrim = false;
 772     while(true)
 773     {
 774         TriPrimitive* prim = tile->NextFullPrimitive();
 775         if(!prim)
 776             break;
 777         hasFullPrim = true;
 778         RasterFullCoverPrim(prim, startX, startY, (float*)primBuf, wBuf);
 779     }
 780     if(hasFullPrim)
 781         farRhw = CopyBuf2(wBuf, wBuf2, (float*)primBuf, (float*)primBuf2, tileSize);
 782     else
 783     {
 784         for(int i = 0; i < tileSize; i += 4)
 785         {
 786             _mm_store_ps(wBuf2 + i, m128(0));
 787             _mm_store_ps((float*)primBuf2 + i, m128(0));
 788         }
 789     }
 790 
 791     int aaCount = AACount[_aaLevel];
 792     float alpha = AACfgAlpha[_aaLevel];
 793     Float2* sampler = AASampler[_aaLevel];
 794 
 795     while(true)
 796     {
 797         TriPrimitive* prim = tile->NextOpaquePrimitive();
 798         if(!prim)
 799             break;
 800         if(prim->maxRhw < farRhw)
 801             continue;
 802 
 803         RasterPrim(prim, x, y, 0.5f, 0.5f, primBuf2, wBuf2);
 804     }
 805     tile->Reset();
 806     CreateMainPixels(pixels, primBuf2, _eye, startX, startY, tileSize, alpha, *alloc);
 807     farRhw = CopyBuf2(wBuf, wBuf2, (float*)primBuf, (float*)primBuf2, tileSize);
 808     farRhw *= 0.99f;
 809 
 810     for(int aa = 0; aa < aaCount; ++aa)
 811     {
 812         float xs = sampler[aa].x;
 813         float ys = sampler[aa].y;
 814         while(true)
 815         {
 816             TriPrimitive* prim = tile->NextOpaquePrimitive();
 817             if(!prim)
 818                 break;
 819             if(prim->maxRhw < farRhw)
 820                 continue;
 821 
 822             RasterPrim(prim, x, y, xs, ys, primBuf2, wBuf2);
 823         }
 824         tile->Reset();
 825         CreatePixels(pixels, primBuf2, _eye, alpha, startX - 0.5f + xs, startY - 0.5f + ys, tileSize, *alloc);
 826         CopyBuf(wBuf, wBuf2, (float*)primBuf, (float*)primBuf2, tileSize);
 827     }
 828 
 829     do
 830     {
 831         bool fullScreen;
 832         TriPrimitive* prim = tile->NextTransPrimitive(fullScreen);
 833         if(!prim)
 834             break;
 835         while(prim)
 836         {
 837             if(prim->maxRhw < farRhw)
 838             {
 839                 prim = tile->NextTransPrimitive(fullScreen);
 840                 continue;
 841             }
 842             PixelContext** tpixels = pixels;
 843             __m128 ex[3];
 844             __m128 ey[3];
 845             __m128 mask0[3];
 846             __m128 xOff[3];
 847             for(int i = 0; i < 3; ++i)
 848             {
 849                 ex[i] = m128(prim->ea[i]);
 850                 ey[i] = m128(prim->eb[i]);
 851                 xOff[i] = (ex[i] > m128(0)) & m128(4);
 852             }
 853 
 854             __m128 p0x = m128(prim->p0.x);
 855             __m128 p0y = m128(prim->p0.y);
 856             __m128 p1x = p0x - ey[0];
 857             __m128 p1y = p0y + ex[0];
 858 
 859             mask0[0] = (m128(x) - p0x) * ex[0] + (m128(y + 0.5f) - p0y) * ey[0];
 860             mask0[1] = (m128(x) - p1x) * ex[1] + (m128(y + 0.5f) - p1y) * ey[1];
 861             mask0[2] = (m128(x) - p0x) * ex[2] + (m128(y + 0.5f) - p0y) * ey[2];
 862 
 863             __m128 rhw0 = (_mm_set_ps(3, 2, 1, 0) + m128(startX) - p0x) * m128(prim->a[0]) +
 864                           (m128(startY) - p0y) * m128(prim->b[0]) + m128(prim->c[0]);
 865             __m128* mwBuf = (__m128*)wBuf2;
 866             for(int iy = 0; iy < 16; ++iy)
 867             {
 868                 __m128 xStep = _mm_set_ps(12, 8, 4, 0);
 869                 __m128 mask =    ((mask0[0] + (xStep + xOff[0]) * ex[0]) >= m128(0)) &
 870                                 ((mask0[1] + (xStep + xOff[1]) * ex[1]) >= m128(0)) &
 871                                 ((mask0[2] + (xStep + xOff[2]) * ex[2]) >= m128(0));
 872                 if(MoveMask(mask))
 873                 {
 874                     __m128 mask1[3];
 875                     xStep = _mm_set_ps(3.5f, 2.5f, 1.5f, 0.5f);
 876                     mask1[0] = mask0[0] + xStep * ex[0];
 877                     mask1[1] = mask0[1] + xStep * ex[1];
 878                     mask1[2] = mask0[2] + xStep * ex[2];
 879                     __m128 rhw = rhw0;
 880 
 881                     for(int ix = 0; ix < 4; ++ix)
 882                     {
 883                         __m128 pmask = ((rhw > *mwBuf) &
 884                                         (mask1[0] >= m128(0)) &
 885                                         (mask1[1] >= m128(0)) &
 886                                         (mask1[2] >= m128(0)));
 887                         if(MoveMask(pmask))
 888                         {
 889                             __m128 rhw1 = rhw & pmask;
 890                             Create4TransPixels(tpixels, prim, _eye, (float*)&rhw1,
 891                                                x + ix * 4 + 0.5f, y + iy + 0.5f, *alloc);
 892                         }
 893                         rhw = rhw + m128(4) * m128(prim->a[0]);
 894                         mask1[0] = mask1[0] + m128(4) * ex[0];
 895                         mask1[1] = mask1[1]    + m128(4) * ex[1];
 896                         mask1[2] = mask1[2]    + m128(4) * ex[2];
 897                         mwBuf++;
 898                         tpixels += 4;
 899                     }
 900                 }
 901                 else
 902                 {
 903                     mwBuf += 4;
 904                     tpixels += 16;
 905                 }
 906                 rhw0 = rhw0 + m128(prim->b[0]);
 907                 mask0[0] = mask0[0] + ey[0];
 908                 mask0[1] = mask0[1] + ey[1];
 909                 mask0[2] = mask0[2] + ey[2];
 910             }
 911             prim = tile->NextTransPrimitive(fullScreen);
 912         }
 913     }while(0);
 914 
 915     for(int i = 0; i < tileSize; ++i)
 916         colorBuf[i].m = m128(0);
 917 
 918     Ray reflectRays[64];
 919     ReflectInfo refInfos[64];
 920     int refRayIndex = 0;
 921     int refInfoIndex = 0;
 922 
 923     for(int depth = 0; depth <= ReflectionDepth; ++depth)
 924     {
 925         bool hasReflection = false;
 926         for(int j = 0; j < _lights.size(); ++j)
 927         {
 928             int from, to;
 929             if(_lights[j]->Interpolate(&from, &to) && FGSamples)
 930                 continue;
 931             _lights[j]->DirectIlluminate(pixels, tileSize, &_accelStruct, alloc);
 932         }
 933         if(FGSamples && depth == 0)
 934         {
 935             for(int i = 0; i < tileSize; ++i)
 936             {
 937                 int sx = x + (i % 16);
 938                 int sy = y + i / 16;
 939                 PixelContext* pixel = pixels[i];
 940                 while(pixel)
 941                 {
 942                     Float4 norm;
 943                     pixel->prim->GetFaceNormal(pixel->triIndex, &norm);
 944                     
 945                     Float4 color = FGSamples->Lookup(pixel->prim, *(Float3*)&norm, sx, sy);
 946                     if(color.x + color.y + color.z > 0)
 947                     {
 948                         Illuminance* illum = (Illuminance*)alloc->Alloc(sizeof(Illuminance));
 949                         illum->color.x = color.x;
 950                         illum->color.y = color.y;
 951                         illum->color.z = color.z;
 952                         illum->direction.x = 0;
 953                         illum->direction.y = 0;
 954                         illum->direction.z = 1;
 955                         illum->illuminance = 1;
 956                         illum->light = 0;
 957                         illum->next = pixel->light;
 958                         illum->shadowFactor = 0;
 959                         pixel->light = illum;
 960                     }
 961                     pixel = pixel->next;
 962                 }
 963             }
 964         }
 965 
 966         if(alloc == &allocA)
 967             alloc = &allocB;
 968         else
 969             alloc = &allocA;
 970         alloc->Clear();
 971 
 972         for(int i = 0; i < tileSize; ++i)
 973         {
 974             PixelContext* pixel = pixels[i];
 975             pixels[i] = 0;
 976             Float4 color;
 977             color.m = m128(0);
 978             float alpha = 0;
 979 
 980             while(pixel)
 981             {
 982                 ReflectInfo reflect;
 983                 reflect.strength = 0;
 984                 alpha += pixel->alpha;
 985                 color += pixel->prim->shader->PixelShader(pixel, &reflect) * pixel->alpha;
 986                 reflect.strength *= pixel->alpha;
 987 
 988                 if(reflect.strength > 0.01f)
 989                 {
 990                     ReflectInfo& refInfo = refInfos[refInfoIndex++];
 991                     refInfo = reflect;
 992                     refInfo.context = pixel;
 993                     refInfo.index = i;
 994                     refInfo.strength *= 0.25f;
 995 
 996                     if(depth == 0)
 997                     {
 998                         CreateReflectRay(reflectRays + refRayIndex, 4, pixel, &refInfo, _eye);
 999                         refRayIndex += 4;
1000                     }
1001                     else
1002                     {
1003                         CreateReflectRay(reflectRays + refRayIndex, 4, pixel, &refInfo, _eye);
1004                         refRayIndex += 4;
1005                     }
1006                     hasReflection = true;
1007 
1008                     if(refRayIndex >= 64)
1009                     {
1010                         _accelStruct.TraceIntersect(reflectRays, refRayIndex);
1011                         for(int r = 0; r < refRayIndex; ++r)
1012                         {
1013                             Ray& ray = reflectRays[r];
1014                             if(ray.prim)
1015                                 CreateReflectPixel(pixels, ray, alloc);
1016                         }
1017                         refInfoIndex = 0;
1018                         refRayIndex = 0;
1019                     }
1020                 }
1021                 pixel = pixel->next;
1022             }
1023             if(refRayIndex > 0)
1024             {
1025                 _accelStruct.TraceIntersect(reflectRays, refRayIndex);
1026                 for(int r = 0; r < refRayIndex; ++r)
1027                 {
1028                     Ray& ray = reflectRays[r];
1029                     if(ray.prim)
1030                         CreateReflectPixel(pixels, ray, alloc);
1031                 }
1032                 refInfoIndex = 0;
1033                 refRayIndex = 0;
1034             }
1035             if(depth == 0)
1036             {
1037                 if(alpha < 0.99f)
1038                     color = color + _bkColorF * (1 - alpha);
1039                 colorBuf[i] = color;
1040             }
1041             else
1042                 colorBuf[i] += color;
1043         }
1044 
1045         if(!hasReflection)
1046             break;
1047     }
1048 
1049     for(int i = 0; i < tileSize; ++i)
1050     {
1051         __m128i icolor = _mm_cvttps_epi32(_mm_rsqrt_ps(colorBuf[i].m) * colorBuf[i].m * m128(255));
1052         icolor = _mm_packs_epi32(icolor, icolor);
1053         icolor = _mm_packus_epi16(icolor, icolor);
1054         
1055         *((DWORD*)((BYTE*)target + pitch * (i / 16)) + (i % 16)) = _mm_cvtsi128_si32(icolor);
1056     }
1057 }
1058 
1059 void RenderContext::DrawPrimitive(TriVertex** p, TrianglePrim& tri)
1060 {
1061     if((p[2]->pos.x - p[0]->pos.x) * (p[1]->pos.y - p[0]->pos.y)
1062         - (p[1]->pos.x - p[0]->pos.x) * (p[2]->pos.y - p[0]->pos.y) <= 0)
1063         return;
1064 
1065     Float3 edge[3];
1066     edge[0] = CalcEdge(p[0]->pos, p[1]->pos);
1067     edge[1] = CalcEdge(p[1]->pos, p[2]->pos);
1068     edge[2] = CalcEdge(p[2]->pos, p[0]->pos);
1069 
1070     TriPrimitive* prim = (TriPrimitive*)MemoryHeapMT::Alloc(sizeof(TriPrimitive));
1071     prim->prim = tri.prim;
1072     prim->maxRhw = max(max(p[0]->pos.w, p[1]->pos.w), p[2]->pos.w);
1073     prim->triIndex = tri.triIndex;
1074     prim->p0.x = p[0]->pos.x;
1075     prim->p0.y = p[0]->pos.y;
1076     for(int i = 0; i < 3; ++i)
1077     {
1078         prim->ea[i] = -edge[i].x;
1079         prim->eb[i] = -edge[i].y;
1080         //prim->edge[i].x = -edge[i].x;
1081         //prim->edge[i].y = -edge[i].y;
1082     }
1083     __m128 A = m128(1 / ((p[0]->pos.x - p[1]->pos.x) * (p[0]->pos.y - p[2]->pos.y)
1084                     - (p[0]->pos.y - p[1]->pos.y) * (p[0]->pos.x - p[2]->pos.x)));
1085     __m128 attr[3];
1086     for(int i = 0; i < 3; ++i)
1087         attr[i] = _mm_loadu_ps(&p[i]->pos.w);
1088 
1089     _mm_storeu_ps(prim->a, A * (m128(edge[0].x) * attr[2] + m128(edge[1].x) * attr[0] + m128(edge[2].x) * attr[1]));
1090     _mm_storeu_ps(prim->b, A * (m128(edge[0].y) * attr[2] + m128(edge[1].y) * attr[0] + m128(edge[2].y) * attr[1]));
1091     prim->c[0] = p[0]->pos.w;
1092     prim->c[1] = p[0]->uv.x;
1093     prim->c[2] = p[0]->uv.y;
1094 
1095     __m128 maxP = Min(Max(Max(p[0]->pos.m, p[1]->pos.m), p[2]->pos.m) + m128(1.5f),
1096                       _mm_set_ps(0, 0, ScreenHeight, ScreenWidth));
1097     __m128 minP = Max(Min(Min(p[0]->pos.m, p[1]->pos.m), p[2]->pos.m) - m128(0.5f), m128(0));
1098 
1099     __m128i bound = _mm_cvtps_epi32(_mm_unpacklo_ps(minP, maxP));
1100 
1101     bound = _mm_add_epi32(bound, _mm_set_epi32(TILE_HEIGHT - 1, 0, TILE_WIDTH - 1, 0));
1102     bound = _mm_and_si128(bound, _mm_set_epi32(~(TILE_HEIGHT - 1),
1103                           ~(TILE_HEIGHT - 1), ~(TILE_WIDTH - 1), ~(TILE_WIDTH - 1)));
1104 
1105     edge[0] = -edge[0];
1106     edge[1] = -edge[1];
1107     edge[2] = -edge[2];
1108 
1109     const int& minX = ((int*)&bound)[0];
1110     const int& maxX = ((int*)&bound)[1];
1111     const int& minY = ((int*)&bound)[2];
1112     const int& maxY = ((int*)&bound)[3];
1113 
1114     __m128 offX, offY;
1115     __m128 ex = _mm_set_ps(0, edge[2].x, edge[1].x, edge[0].x);
1116     __m128 ey = _mm_set_ps(0, edge[2].y, edge[1].y, edge[0].y);
1117     __m128 ez = _mm_set_ps(0, edge[2].z, edge[1].z, edge[0].z);
1118     offX = (ex > m128(0)) & m128(TILE_WIDTH);
1119     offY = (ey > m128(0)) & m128(TILE_HEIGHT);
1120 
1121     PrimitiveTile* tile = _primTiles + (minY / TILE_HEIGHT) * _tileCol + (minX / TILE_WIDTH);
1122 
1123     bool trans = tri.prim->shader->IsTransprency();
1124 
1125     for(int y = minY; y < maxY; y += TILE_HEIGHT)
1126     {
1127         PrimitiveTile* tile2 = tile;
1128         for(int x = minX; x < maxX; x += TILE_WIDTH)
1129         {
1130             if(MoveMask((m128(x) + offX) * ex + (m128(y) + offY) * ey + ez) == 0)
1131             {
1132                 bool fullCovered = MoveMask((m128(x + TILE_WIDTH) - offX) * ex
1133                                     + (m128(y + TILE_HEIGHT) - offY) * ey + ez) == 0;
1134                 if(trans)
1135                 {
1136                     tile2->AddPrimitive(prim, Tanslusent, fullCovered);
1137                 }
1138                 else
1139                 {
1140                     if(fullCovered)
1141                         tile2->InsertFullPrimitive(prim, prim->maxRhw);
1142                     else
1143                         tile2->AddPrimitive(prim, Opaque, false);
1144                 }
1145             }
1146             tile2++;
1147         }
1148         tile += _tileCol;
1149     }
1150 }
1151 
1152 void RenderContext::Render()
1153 {
1154     ViewProjMatrix = _camera->GetViewProjMatrix();
1155     _eye = Float4(_camera->GetEyePos(), 1);
1156     DWORD startTime = ::timeGetTime();
1157 
1158     struct VertexProcess
1159     {
1160         LONG index;
1161         RenderContext* rc;
1162         TrianglePrim* prims;
1163         int triCount;
1164 
1165         static void Run(int id, void* _context)
1166         {
1167             VertexProcess* context = (VertexProcess*)_context;
1168             RenderContext* rc = context->rc;
1169                 
1170             while(true)
1171             {
1172                 LONG index = ::InterlockedIncrement(&context->index) - 1;
1173                 int c = index * 64;
1174                 if(c >= context->triCount)
1175                     break;
1176                 int e = min(c + 64, context->triCount);
1177                 for(int i = c; i < e; ++i)
1178                     rc->DrawTriangle(context->prims[i]);
1179             }
1180         }
1181 
1182         VertexProcess()
1183         {
1184             index = 0;
1185         }
1186     };
1187 
1188     VertexProcess p;
1189     p.rc = this;
1190     p.prims = _accelStruct.GetPrims(&p.triCount);
1191     Parallel::Run(VertexProcess::Run, &p);
1192 
1193     LogInfo("Vertex Process time: %d\n", ::timeGetTime() - startTime);
1194     startTime = ::timeGetTime();
1195 
1196 
1197     struct PixelProcess
1198     {
1199         LONG index;
1200         RenderContext* rc;
1201 
1202         static void Run(int id, void* _context)
1203         {
1204             PixelProcess* context = (PixelProcess*)_context;
1205             RenderContext* rc = context->rc;
1206             while(true)
1207             {
1208                 LONG index = ::InterlockedIncrement(&context->index) - 1;
1209                 if(index >= (rc->_tileCol * rc->_tileRow))
1210                     break;
1211 
1212                 int col = index % (rc->_tileCol);
1213                 int row = index / (rc->_tileCol);
1214 
1215                 int x = col * TILE_WIDTH;
1216                 int y = row * TILE_HEIGHT;
1217                 index = row * rc->_tileCol + col;
1218 
1219                 rc->RasterTile(rc->_primTiles + index, x, y,
1220                     (*rc->_renderTarget)[y] + x, rc->_renderTarget->pitch);
1221             }
1222         }
1223 
1224         PixelProcess()
1225         {
1226             index = 0;
1227         }
1228     };
1229 
1230     PixelProcess pp;
1231     pp.rc = this;
1232     Parallel::Run(PixelProcess::Run, &pp);
1233 
1234     LogInfo("Pixel Process time: %d\n", ::timeGetTime() - startTime);
1235 }

 

 

 

 

posted on 2013-02-11 23:07  qiaojie  阅读(2617)  评论(1)  编辑  收藏  举报