2 0 1 5 1 0 0 5 0 0 16 64 256 1k 4k 16k 64k 256k 1m 2 0 1

2.0 1.5 1.0 0.5 0.0 16 64 256 1k 4k 16k 64k 256k 1M - PDF document

2.0 1.5 1.0 0.5 0.0 16 64 256 1k 4k 16k 64k 256k 1M 2.0 1.5 1.0 0.5 0.0 16 64 256 1k 4k 16k 64k 256k 1M 40 35 30 25 20 15 10 5 0 16 64 256 1k 4k 16k 64k 256k 1M 40 ... t282 = _mm_addsub_ps(t268,


  1. 2.0 1.5 1.0 0.5 0.0 16 64 256 1k 4k 16k 64k 256k 1M

  2. 2.0 1.5 1.0 0.5 0.0 16 64 256 1k 4k 16k 64k 256k 1M 40 35 30 25 20 15 10 5 0 16 64 256 1k 4k 16k 64k 256k 1M

  3. 40 ... t282 = _mm_addsub_ps(t268, U247); t283 = _mm_add_ps(t282, _mm_addsub_ps(U247, _mm_shuffle_ps(t275, t275, _MM_SHUFFLE(2, 3, 0, 1)))); 35 t284 = _mm_add_ps(t282, _mm_addsub_ps(U247, _mm_sub_ps(_mm_setzero_ps (), ………) s217 = _mm_addsub_ps(t270, U247); s218 = _mm_addsub_ps(_mm_mul_ps(t277, _mm_set1_ps((- 0.70710678118654757))), ………) 30 t285 = _mm_add_ps(s217, s218); t286 = _mm_sub_ps(s217, s218); s219 = _mm_shuffle_ps(t278, t280, _MM_SHUFFLE(1, 0, 1, 0)); 25 s220 = _mm_shuffle_ps(t278, t280, _MM_SHUFFLE(3, 2, 3, 2)); s221 = _mm_shuffle_ps(t283, t285, _MM_SHUFFLE(1, 0, 1, 0)); ... 20 15 10 5 0 16 64 256 1k 4k 16k 64k 256k 1M   Matrix multiplication WiFi Receiver Performance [Gflop/s] Performance [Mbit/s] 70 50 60 40 50 30 40 30 20 20 10 10 0 0 0 2000 4000 6000 8000 10000 6 12 18 24 30 36 42 48 54

  4. 40 35 30 25 20 15 10 5 0 16 64 256 1k 4k 16k 64k 256k 1M // straightforward code // unrolling + scalar replacement for(i = 0; i < N; i += 1) for(i = 0; i < N; i += MU) { for(j = 0; j < N; j += 1) for(j = 0; j < N; j += NU) { for(k = 0; k < N; k += 1) for(k = 0; k < N; k += KU) { c[i][j] += a[i][k]*b[k][j]; t1 = A[i*N + k]; t2 = A[i*N + k + 1]; t3 = A[i*N + k + 2]; t4 = A[i*N + k + 3]; t5 = A[(i + 1)*N + k]; <more copies> t10 = t1 * t9; t17 = t17 + t10; t21 = t1 * t8; t18 = t18 + t21; t12 = t5 * t9; t19 = t19 + t12; t13 = t5 * t8; t20 = t20 + t13; <more ops> C[i*N + j] = t17; C[i*N + j + 1] = t18; C[(i+1)*N + j] = t19; • C[(i+1)*N + j + 1] = t20; } • } }

  5. • • • • • • • • • • • •

  6. T∙

  7. Correct code: easy fast code: very difficult void sub(double *y, double *x) { double f0, f1, f2, f3, f4, f7, f8, f10, f11; f0 = x[0] - x[3]; f1 = x[0] + x[3]; f2 = x[1] - x[2]; f3 = x[1] + x[2]; f4 = f1 - f3; y[0] = f1 + f3; y[2] = 0.7071067811865476 * f4; f7 = 0.9238795325112867 * f0; < more lines>

  8. A Processor 0 A Processor 1 A Processor 2 A Processor 3 x y x y

  9. void dft( int n, cpx *y, cpx *x) { if ( use_dft_base_case(n) ) dft_bc(n, y, x); else { int k = choose_dft_radix(n) ; for ( int i=0; i < k; ++i) dft_strided(m, k, t + m*i, x + m*i); for ( int i=0; i < m; ++i) dft_scaled(k, m, precomp_d[i], y + i, t + i); } }

  10. S- S S

  11. configure/make configure/make d = dft(n) d(x,y) d = dft(n) d(x,y)

  12. DFT on Sandybridge (3.3 GHz, 4 Cores, AVX) Performance [Gflop/s]

  13. S MKL MKL BTO

  14. SPL Data flow graph Scala function def f(x: Array[Double], y: Array[Double]) = { for (i <- 0 until 2) { y(2*i) = x(i*2) + x(i*2+1) y(2*i+1) = x(i*2) - x(i*2+1) } } def f(x: Array[ Rep[ Double ] ], y: Array[ Rep[ Double ] ]) = { t0 = s0 + s1; for (i <- 0 until 2) { t1 = s0 - s1; y(2*i) = x(i*2) + x(i*2+1) t2 = s2 + s3; y(2*i+1) = x(i*2) - x(i*2+1) t2 = s2 - s3; } } t0 = x[0]; t1 = x[1]; t2 = t0 + t1; def f(x: Rep[ Array[Double] ] , y[0] = t2; y: Rep[ Array[Double] ] ) = { t3 = t0 - t1; for (i <- 0 until 2) { y[1] = t3; y(2*i) = x(i*2) + x(i*2+1) t4 = x[0]; y(2*i+1) = x(i*2) - x(i*2+1) t5 = x[1]; } t6 = t4 + x5; } y[0] = t6; t7 = t4 – x5; y[3] = t7; for (int i=0; i < 2; i++) def f(x: Rep[ Array[Double] ] , { y: Rep[ Array[Double] ] ) = { t0 = x[i]; for (i <- 0 until 2: Rep[ Range ] ) { t1 = x[i+1]; y(2*i) = x(i*2) + x(i*2+1) t2 = t0 + t1; y(2*i+1) = x(i*2) - x(i*2+1) y[i] = t2; } t3 = t0 - t1; } y[i+1] = t3; }

  15. def f(x: Array[ Rep[ Double ] ], y: Array[ Rep[ Double ] ]) = { t0 = s0 + s1; for (i <- 0 until 2) { t1 = s0 - s1; y(2*i) = x(i*2) + x(i*2+1) t2 = s2 + s3; y(2*i+1) = x(i*2) - x(i*2+1) t2 = s2 - s3; } } t0 = x[0]; t1 = x[1]; t2 = t0 + t1; def f(x: Rep[ Array[Double] ] , y[0] = t2; y: Rep[ Array[Double] ] ) = { t3 = t0 - t1; for (i <- 0 until 2) { y[1] = t3; y(2*i) = x(i*2) + x(i*2+1) t4 = x[0]; y(2*i+1) = x(i*2) - x(i*2+1) t5 = x[1]; } t6 = t4 + x5; } def f[L[_],A[_],T](looptype: L, x: A[Array[T]], y: A[Array[T]]) = { y[0] = t6; for (i <- 0 until 2: L[Range]) { t7 = t4 – x5; y[3] = t7; y(2*i) = x(i*2) + x(i*2+1) y(2*i+1)= x(i*2) - x(i*2+1) } } for (int i=0; i < 2; i++) def f(x: Rep[ Array[Double] ] , { y: Rep[ Array[Double] ] ) = { t0 = x[i]; for (i <- 0 until 2: Rep[ Range ] ) { t1 = x[i+1]; y(2*i) = x(i*2) + x(i*2+1) t2 = t0 + t1; y[i] = t2; y(2*i+1) = x(i*2) - x(i*2+1) } t3 = t0 - t1; } y[i+1] = t3; } ≥ 70 papers on side threads

Recommend


More recommend