#include #include #include #include /********************************** * Pseudo-random number generator * **********************************/ static uint64_t mat_rng[2] = { 11ULL, 1181783497276652981ULL }; static inline uint64_t xorshift128plus(uint64_t s[2]) { uint64_t x, y; x = s[0], y = s[1]; s[0] = y; x ^= x << 23; s[1] = x ^ y ^ (x >> 17) ^ (y >> 26); y += s[1]; return y; } double mat_drand(void) { return (xorshift128plus(mat_rng)>>11) * (1.0/9007199254740992.0); } /******************************************* * Helper routines for matrix manipulation * *******************************************/ float **mat_init(int n_rows, int n_cols) { float **m; int i; m = (float**)malloc(n_rows * sizeof(float*)); m[0] = (float*)calloc(n_rows * n_cols, sizeof(float)); for (i = 1; i < n_rows; ++i) m[i] = m[i-1] + n_cols; return m; } void mat_destroy(float **m) { free(m[0]); free(m); } float **mat_gen_random(int n_rows, int n_cols) { float **m; int i, j; m = mat_init(n_rows, n_cols); for (i = 0; i < n_rows; ++i) for (j = 0; j < n_cols; ++j) m[i][j] = mat_drand(); return m; } float **mat_transpose(int n_rows, int n_cols, float *const* a) { int i, j; float **m; m = mat_init(n_cols, n_rows); for (i = 0; i < n_rows; ++i) for (j = 0; j < n_cols; ++j) m[j][i] = a[i][j]; return m; } float sdot_1(int n, const float *x, const float *y) { int i; float s = 0.0f; for (i = 0; i < n; ++i) s += x[i] * y[i]; return s; } float sdot_8(int n, const float *x, const float *y) { int i, n8 = n>>3<<3; float s, t[8]; t[0] = t[1] = t[2] = t[3] = t[4] = t[5] = t[6] = t[7] = 0.0f; for (i = 0; i < n8; i += 8) { t[0] += x[i+0] * y[i+0]; t[1] += x[i+1] * y[i+1]; t[2] += x[i+2] * y[i+2]; t[3] += x[i+3] * y[i+3]; t[4] += x[i+4] * y[i+4]; t[5] += x[i+5] * y[i+5]; t[6] += x[i+6] * y[i+6]; t[7] += x[i+7] * y[i+7]; } for (s = 0.0f; i < n; ++i) s += x[i] * y[i]; s += t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; return s; } #ifdef __SSE__ #include float sdot_sse(int n, const float *x, const float *y) { int i, n8 = n>>3<<3; __m128 vs1, vs2; float s, t[4]; vs1 = _mm_setzero_ps(); vs2 = _mm_setzero_ps(); for (i = 0; i < n8; i += 8) { __m128 vx1, vx2, vy1, vy2; vx1 = _mm_loadu_ps(&x[i]); vx2 = _mm_loadu_ps(&x[i+4]); vy1 = _mm_loadu_ps(&y[i]); vy2 = _mm_loadu_ps(&y[i+4]); vs1 = _mm_add_ps(vs1, _mm_mul_ps(vx1, vy1)); vs2 = _mm_add_ps(vs2, _mm_mul_ps(vx2, vy2)); } for (s = 0.0f; i < n; ++i) s += x[i] * y[i]; _mm_storeu_ps(t, vs1); s += t[0] + t[1] + t[2] + t[3]; _mm_storeu_ps(t, vs2); s += t[0] + t[1] + t[2] + t[3]; return s; } #endif /************************* * Matrix multiplication * *************************/ float **mat_mul0(int n_a_rows, int n_a_cols, float *const *a, int n_b_cols, float *const *b) { int i, j, k; float **m; m = mat_init(n_a_rows, n_b_cols); for (i = 0; i < n_a_rows; ++i) { for (j = 0; j < n_b_cols; ++j) { float t = 0.0; for (k = 0; k < n_a_cols; ++k) t += a[i][k] * b[k][j]; m[i][j] = t; } } return m; } float **mat_mul1(int n_a_rows, int n_a_cols, float *const *a, int n_b_cols, float *const *b) { int i, j, k, n_b_rows = n_a_cols; float **m, **bT; m = mat_init(n_a_rows, n_b_cols); bT = mat_transpose(n_b_rows, n_b_cols, b); for (i = 0; i < n_a_rows; ++i) { const float *ai = a[i]; float *mi = m[i]; for (j = 0; j < n_b_cols; ++j) { float t = 0.0f, *bTj = bT[j]; for (k = 0; k < n_a_cols; ++k) t += ai[k] * bTj[k]; mi[j] = t; } } mat_destroy(bT); return m; } #ifdef __SSE__ float **mat_mul2(int n_a_rows, int n_a_cols, float *const *a, int n_b_cols, float *const *b) { int i, j, n_b_rows = n_a_cols; float **m, **bT; m = mat_init(n_a_rows, n_b_cols); bT = mat_transpose(n_b_rows, n_b_cols, b); for (i = 0; i < n_a_rows; ++i) for (j = 0; j < n_b_cols; ++j) m[i][j] = sdot_sse(n_a_cols, a[i], bT[j]); mat_destroy(bT); return m; } float **mat_mul7(int n_a_rows, int n_a_cols, float *const *a, int n_b_cols, float *const *b) { int i, j, ii, jj, x = 16, n_b_rows = n_a_cols; float **m, **bT; m = mat_init(n_a_rows, n_b_cols); bT = mat_transpose(n_b_rows, n_b_cols, b); for (i = 0; i < n_a_rows; i += x) { for (j = 0; j < n_b_cols; j += x) { int je = n_b_cols < j + x? n_b_cols : j + x; int ie = n_a_rows < i + x? n_a_rows : i + x; for (ii = i; ii < ie; ++ii) for (jj = j; jj < je; ++jj) m[ii][jj] += sdot_sse(n_a_cols, a[ii], bT[jj]); } } mat_destroy(bT); return m; } #endif float **mat_mul3(int n_a_rows, int n_a_cols, float *const *a, int n_b_cols, float *const *b) { int i, j, n_b_rows = n_a_cols; float **m, **bT; m = mat_init(n_a_rows, n_b_cols); bT = mat_transpose(n_b_rows, n_b_cols, b); for (i = 0; i < n_a_rows; ++i) for (j = 0; j < n_b_cols; ++j) m[i][j] = sdot_8(n_a_cols, a[i], bT[j]); mat_destroy(bT); return m; } float **mat_mul4(int n_a_rows, int n_a_cols, float *const *a, int n_b_cols, float *const *b) { int i, j, n_b_rows = n_a_cols; float **m, **bT; m = mat_init(n_a_rows, n_b_cols); bT = mat_transpose(n_b_rows, n_b_cols, b); for (i = 0; i < n_a_rows; ++i) for (j = 0; j < n_b_cols; ++j) m[i][j] = sdot_1(n_a_cols, a[i], bT[j]); mat_destroy(bT); return m; } #ifdef HAVE_CBLAS #include float **mat_mul5(int n_a_rows, int n_a_cols, float *const *a, int n_b_cols, float *const *b) { int i, j, n_b_rows = n_a_cols; float **m, **bT; m = mat_init(n_a_rows, n_b_cols); bT = mat_transpose(n_b_rows, n_b_cols, b); for (i = 0; i < n_a_rows; ++i) for (j = 0; j < n_b_cols; ++j) m[i][j] = cblas_sdot(n_a_cols, a[i], 1, bT[j], 1); mat_destroy(bT); return m; } float **mat_mul6(int n_a_rows, int n_a_cols, float *const *a, int n_b_cols, float *const *b) { float **m, n_b_rows = n_a_cols; m = mat_init(n_a_rows, n_b_cols); cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_a_rows, n_b_cols, n_a_cols, 1.0f, a[0], n_a_rows, b[0], n_b_rows, 0.0f, m[0], n_a_rows); return m; } #endif /***************** * Main function * *****************/ #include #include int main(int argc, char *argv[]) { int c, n = 1000, algo = 2; clock_t t; float **a, **b, **m = 0; while ((c = getopt(argc, argv, "n:a:h")) >= 0) { if (c == 'n') n = atoi(optarg); else if (c == 'a') algo = atoi(optarg); else if (c == 'h') { fprintf(stderr, "Usage: mat-eval [options]\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -n INT size of the square matrix [%d]\n", n); fprintf(stderr, " -a INT matrix multiplication implementation [%d]\n", algo); fprintf(stderr, " 0: naive - no optimization\n"); fprintf(stderr, " 1: transposing the second matrix\n"); #ifdef __SSE__ fprintf(stderr, " 2: explicitly vectorized sdot() with SSE\n"); fprintf(stderr, " 7: explicitly SSE sdot() plus loop tiling\n"); #endif fprintf(stderr, " 3: implicitly vectorized sdot()\n"); fprintf(stderr, " 4: no vectorization hints\n"); #ifdef HAVE_CBLAS fprintf(stderr, " 5: with sdot() from an external CBLAS library\n"); fprintf(stderr, " 6: with sgemm() from an external CBLAS library\n"); #endif fprintf(stderr, " -h this help message\n"); return 1; } } a = mat_gen_random(n, n); b = mat_gen_random(n, n); t = clock(); if (algo == 0) { m = mat_mul0(n, n, a, n, b); } else if (algo == 1) { m = mat_mul1(n, n, a, n, b); #ifdef __SSE__ } else if (algo == 2) { m = mat_mul2(n, n, a, n, b); } else if (algo == 7) { m = mat_mul7(n, n, a, n, b); #endif } else if (algo == 3) { m = mat_mul3(n, n, a, n, b); } else if (algo == 4) { m = mat_mul4(n, n, a, n, b); #ifdef HAVE_CBLAS } else if (algo == 5) { m = mat_mul5(n, n, a, n, b); } else if (algo == 6) { m = mat_mul6(n, n, a, n, b); #endif } else { fprintf(stderr, "ERROR: unknown algorithm %d\n", algo); return 1; } fprintf(stderr, "CPU time: %g\n", (double)(clock() - t) / CLOCKS_PER_SEC); fprintf(stderr, "Central cell: %g\n", m[n/2][n/2]); if (m) mat_destroy(m); mat_destroy(b); mat_destroy(a); return 0; }