→
2.0 1.5 1.0 0.5 0.0 16 64 256 1k 4k 16k 64k 256k 1M
2.0 1.5 1.0 0.5 0.0 16 64 256 1k 4k 16k 64k 256k 1M 40 35 30 25 20 15 10 5 0 16 64 256 1k 4k 16k 64k 256k 1M
40 ... t282 = _mm_addsub_ps(t268, U247); t283 = _mm_add_ps(t282, _mm_addsub_ps(U247, _mm_shuffle_ps(t275, t275, _MM_SHUFFLE(2, 3, 0, 1)))); 35 t284 = _mm_add_ps(t282, _mm_addsub_ps(U247, _mm_sub_ps(_mm_setzero_ps (), ………) s217 = _mm_addsub_ps(t270, U247); s218 = _mm_addsub_ps(_mm_mul_ps(t277, _mm_set1_ps((- 0.70710678118654757))), ………) 30 t285 = _mm_add_ps(s217, s218); t286 = _mm_sub_ps(s217, s218); s219 = _mm_shuffle_ps(t278, t280, _MM_SHUFFLE(1, 0, 1, 0)); 25 s220 = _mm_shuffle_ps(t278, t280, _MM_SHUFFLE(3, 2, 3, 2)); s221 = _mm_shuffle_ps(t283, t285, _MM_SHUFFLE(1, 0, 1, 0)); ... 20 15 10 5 0 16 64 256 1k 4k 16k 64k 256k 1M Matrix multiplication WiFi Receiver Performance [Gflop/s] Performance [Mbit/s] 70 50 60 40 50 30 40 30 20 20 10 10 0 0 0 2000 4000 6000 8000 10000 6 12 18 24 30 36 42 48 54
40 35 30 25 20 15 10 5 0 16 64 256 1k 4k 16k 64k 256k 1M // straightforward code // unrolling + scalar replacement for(i = 0; i < N; i += 1) for(i = 0; i < N; i += MU) { for(j = 0; j < N; j += 1) for(j = 0; j < N; j += NU) { for(k = 0; k < N; k += 1) for(k = 0; k < N; k += KU) { c[i][j] += a[i][k]*b[k][j]; t1 = A[i*N + k]; t2 = A[i*N + k + 1]; t3 = A[i*N + k + 2]; t4 = A[i*N + k + 3]; t5 = A[(i + 1)*N + k]; <more copies> t10 = t1 * t9; t17 = t17 + t10; t21 = t1 * t8; t18 = t18 + t21; t12 = t5 * t9; t19 = t19 + t12; t13 = t5 * t8; t20 = t20 + t13; <more ops> C[i*N + j] = t17; C[i*N + j + 1] = t18; C[(i+1)*N + j] = t19; • C[(i+1)*N + j + 1] = t20; } • } }
• • • • • • • • • • • •
T∙
Correct code: easy fast code: very difficult void sub(double *y, double *x) { double f0, f1, f2, f3, f4, f7, f8, f10, f11; f0 = x[0] - x[3]; f1 = x[0] + x[3]; f2 = x[1] - x[2]; f3 = x[1] + x[2]; f4 = f1 - f3; y[0] = f1 + f3; y[2] = 0.7071067811865476 * f4; f7 = 0.9238795325112867 * f0; < more lines>
A Processor 0 A Processor 1 A Processor 2 A Processor 3 x y x y
void dft( int n, cpx *y, cpx *x) { if ( use_dft_base_case(n) ) dft_bc(n, y, x); else { int k = choose_dft_radix(n) ; for ( int i=0; i < k; ++i) dft_strided(m, k, t + m*i, x + m*i); for ( int i=0; i < m; ++i) dft_scaled(k, m, precomp_d[i], y + i, t + i); } }
S- S S
configure/make configure/make d = dft(n) d(x,y) d = dft(n) d(x,y)
DFT on Sandybridge (3.3 GHz, 4 Cores, AVX) Performance [Gflop/s]
S MKL MKL BTO
SPL Data flow graph Scala function def f(x: Array[Double], y: Array[Double]) = { for (i <- 0 until 2) { y(2*i) = x(i*2) + x(i*2+1) y(2*i+1) = x(i*2) - x(i*2+1) } } def f(x: Array[ Rep[ Double ] ], y: Array[ Rep[ Double ] ]) = { t0 = s0 + s1; for (i <- 0 until 2) { t1 = s0 - s1; y(2*i) = x(i*2) + x(i*2+1) t2 = s2 + s3; y(2*i+1) = x(i*2) - x(i*2+1) t2 = s2 - s3; } } t0 = x[0]; t1 = x[1]; t2 = t0 + t1; def f(x: Rep[ Array[Double] ] , y[0] = t2; y: Rep[ Array[Double] ] ) = { t3 = t0 - t1; for (i <- 0 until 2) { y[1] = t3; y(2*i) = x(i*2) + x(i*2+1) t4 = x[0]; y(2*i+1) = x(i*2) - x(i*2+1) t5 = x[1]; } t6 = t4 + x5; } y[0] = t6; t7 = t4 – x5; y[3] = t7; for (int i=0; i < 2; i++) def f(x: Rep[ Array[Double] ] , { y: Rep[ Array[Double] ] ) = { t0 = x[i]; for (i <- 0 until 2: Rep[ Range ] ) { t1 = x[i+1]; y(2*i) = x(i*2) + x(i*2+1) t2 = t0 + t1; y(2*i+1) = x(i*2) - x(i*2+1) y[i] = t2; } t3 = t0 - t1; } y[i+1] = t3; }
def f(x: Array[ Rep[ Double ] ], y: Array[ Rep[ Double ] ]) = { t0 = s0 + s1; for (i <- 0 until 2) { t1 = s0 - s1; y(2*i) = x(i*2) + x(i*2+1) t2 = s2 + s3; y(2*i+1) = x(i*2) - x(i*2+1) t2 = s2 - s3; } } t0 = x[0]; t1 = x[1]; t2 = t0 + t1; def f(x: Rep[ Array[Double] ] , y[0] = t2; y: Rep[ Array[Double] ] ) = { t3 = t0 - t1; for (i <- 0 until 2) { y[1] = t3; y(2*i) = x(i*2) + x(i*2+1) t4 = x[0]; y(2*i+1) = x(i*2) - x(i*2+1) t5 = x[1]; } t6 = t4 + x5; } def f[L[_],A[_],T](looptype: L, x: A[Array[T]], y: A[Array[T]]) = { y[0] = t6; for (i <- 0 until 2: L[Range]) { t7 = t4 – x5; y[3] = t7; y(2*i) = x(i*2) + x(i*2+1) y(2*i+1)= x(i*2) - x(i*2+1) } } for (int i=0; i < 2; i++) def f(x: Rep[ Array[Double] ] , { y: Rep[ Array[Double] ] ) = { t0 = x[i]; for (i <- 0 until 2: Rep[ Range ] ) { t1 = x[i+1]; y(2*i) = x(i*2) + x(i*2+1) t2 = t0 + t1; y[i] = t2; y(2*i+1) = x(i*2) - x(i*2+1) } t3 = t0 - t1; } y[i+1] = t3; } ≥ 70 papers on side threads
Recommend
More recommend