SLIDE 37 Practical
Advanced Graphics – SIMD Recap 37
__m128 EPS4 = _mm_set_ps1( EPSILON ); __m128 MINUSEPS4 = _mm_set_ps1( -EPSILON ); __m128 ONE4 = _mm_set_ps1( 1.0f ); __m128 e1x4 = _mm_set_ps1( v2.x - v1.x ); __m128 e1y4 = _mm_set_ps1( v2.y - v1.y ); __m128 e1z4 = _mm_set_ps1( v2.z - v1.z ); __m128 e2x4 = _mm_set_ps1( v3.x - v1.x ); __m128 e2y4 = _mm_set_ps1( v3.y - v1.y ); __m128 e2z4 = _mm_set_ps1( v3.z - v1.z ); __m128 Px4 = _mm_sub_ps( _mm_mul_ps( r4.dy4, e2z4 ), _mm_mul_ps( r4.dz4, e2y4 ) ); __m128 Py4 = _mm_sub_ps( _mm_mul_ps( r4.dz4, e2x4 ), _mm_mul_ps( r4.dx4, e2z4 ) ); __m128 Pz4 = _mm_sub_ps( _mm_mul_ps( r4.dx4, e2y4 ), _mm_mul_ps( r4.dy4, e2x4 ) ); __m128 det4 = _mm_add_ps( _mm_add_ps( _mm_mul_ps( e1x4, Px4 ), _mm_mul_ps( e1y4, Py4 ) ), _mm_mul_ps( e1z4, Pz4 ) ); __m128 mask1 = _mm_or_ps( _mm_cmple_ps( det4, MINUSEPS4 ), _mm_cmpge_ps( det4, EPS4 ) ); __m128 inv_det4 = _mm_rcp_ps( det4 ); __m128 Tx4 = _mm_sub_ps( r4.ox4, _mm_set_ps1( v1.x ) ); __m128 Ty4 = _mm_sub_ps( r4.oy4, _mm_set_ps1( v1.y ) ); __m128 Tz4 = _mm_sub_ps( r4.oz4, _mm_set_ps1( v1.z ) ); __m128 u4 = _mm_mul_ps( _mm_add_ps( _mm_add_ps( _mm_mul_ps( Tx4, Px4 ), _mm_mul_ps( Ty4, Py4 ) ), _mm_mul_ps( Tz4, Pz4 ) ), inv_det4 ); __m128 mask2 = _mm_and_ps( _mm_cmpge_ps( u4, _mm_setzero_ps() ), _mm_cmple_ps( u4, ONE4 ) ); __m128 Qx4 = _mm_sub_ps( _mm_mul_ps( Ty4, e1z4 ), _mm_mul_ps( Tz4, e1y4 ) ); __m128 Qy4 = _mm_sub_ps( _mm_mul_ps( Tz4, e1x4 ), _mm_mul_ps( Tx4, e1z4 ) ); __m128 Qz4 = _mm_sub_ps( _mm_mul_ps( Tx4, e1y4 ), _mm_mul_ps( Ty4, e1x4 ) ); __m128 v4 = _mm_mul_ps( _mm_add_ps( _mm_add_ps( _mm_mul_ps( r4.dx4, Qx4 ), _mm_mul_ps( r4.dy4, Qy4 ) ), _mm_mul_ps( r4.dz4, Qz4 ) ), inv_det4 ); __m128 mask3 = _mm_and_ps( _mm_cmpge_ps( v4, _mm_setzero_ps() ), _mm_cmple_ps( _mm_add_ps( u4, v4 ), ONE4 ) ); __m128 t4 = _mm_mul_ps( _mm_add_ps( _mm_add_ps( _mm_mul_ps( e2x4, Qx4 ), _mm_mul_ps( e2y4, Qy4 ) ), _mm_mul_ps( e2z4, Qz4 ) ), inv_det4 ); __m128 mask4 = _mm_cmpgt_ps( t4, _mm_setzero_ps() ); __m128 mask5 = _mm_cmplt_ps( t4, r4.t4 ); __m128 combined = 
_mm_and_ps( _mm_and_ps( _mm_and_ps( _mm_and_ps( mask1, mask2 ), mask3 ), mask4 ), mask5 ); r4.t4 = _mm_blendv_ps( r4.t4, t4, combined );
Define these at global scope. Option 1: store with the triangle. Option 2: amortize over more rays. Do we continue even if all rays died? Not as accurate as _mm_div_ps( ONE4, … );