diff options
author | David Monniaux <david.monniaux@univ-grenoble-alpes.fr> | 2019-03-12 06:24:50 +0100 |
---|---|---|
committer | David Monniaux <david.monniaux@univ-grenoble-alpes.fr> | 2019-03-12 06:24:50 +0100 |
commit | 549e1f93c95e06d796bdd15ca0cf24c0a0128e70 (patch) | |
tree | 7efee67ed59f7bf35b5a5d848ac22c8f5a517031 /test/monniaux/complex/complex.c | |
parent | b0812f6f17e6943dee4743245befb20a3dc719f6 (diff) | |
download | compcert-kvx-549e1f93c95e06d796bdd15ca0cf24c0a0128e70.tar.gz compcert-kvx-549e1f93c95e06d796bdd15ca0cf24c0a0128e70.zip |
some more work on complex matrices
Diffstat (limited to 'test/monniaux/complex/complex.c')
-rw-r--r-- | test/monniaux/complex/complex.c | 203 |
1 files changed, 195 insertions, 8 deletions
diff --git a/test/monniaux/complex/complex.c b/test/monniaux/complex/complex.c index 536c321b..0b806b24 100644 --- a/test/monniaux/complex/complex.c +++ b/test/monniaux/complex/complex.c @@ -1,7 +1,13 @@ #include <stdio.h> +#include <stdbool.h> +#include <stdlib.h> +#include <inttypes.h> +#include "../clock.h" + +typedef double REAL; typedef struct { - double re, im; + REAL re, im; } COMPLEX; static inline void COMPLEX_zero(COMPLEX *r) { @@ -26,7 +32,7 @@ static inline void COMPLEX_mul(COMPLEX *r, r->im = im; } -static inline void COMPLEX_mul_add(COMPLEX *r, +static inline void COMPLEX_mul_addto(COMPLEX *r, const COMPLEX *x, const COMPLEX *y) { double re = r->re + x->re * y->re - x->im * y->im; @@ -35,6 +41,17 @@ static inline void COMPLEX_mul_add(COMPLEX *r, r->im = im; } +#define MACRO_COMPLEX_mul_addto(r, x, y) \ + { \ + double rre=(r).re, rim=(r).im, \ + xre = (x).re, xim=(x).im, \ + yre = (y).re, yim = (y).im; \ + double re = rre + xre * yre - xim * yim; \ + double im = rim + xim * yre + xre * yim; \ + r.re = re; \ + r.im = im; \ + } + void COMPLEX_mat_mul1(unsigned m, unsigned n, unsigned p, COMPLEX * restrict c, unsigned stride_c, @@ -48,16 +65,186 @@ void COMPLEX_mat_mul1(unsigned m, unsigned n, unsigned p, for(unsigned i=0; i<m; i++) { for(unsigned k=0; k<p; k++) { for(unsigned j=0; j<n; j++) { - COMPLEX_mul_add(c+i*stride_c+k, a+i*stride_a+j, b+j*stride_b+k); + COMPLEX_mul_addto(c+i*stride_c+k, a+i*stride_a+j, b+j*stride_b+k); + } + } + } +} + +#define UNROLL 4 + +#undef CHUNK +#define CHUNK \ + COMPLEX_mul_addto(&total, pa_i_j, pb_j_k); \ + pa_i_j ++; \ + pb_j_k = (COMPLEX*) ((char*) pb_j_k + stride_b_scaled); + +void COMPLEX_mat_mul8(unsigned m, unsigned n, unsigned p, + COMPLEX * c, unsigned stride_c, + const COMPLEX *a, unsigned stride_a, + const COMPLEX *b, unsigned stride_b) { + const COMPLEX *pa_i = a; + COMPLEX * pc_i = c; + size_t stride_b_scaled = sizeof(COMPLEX) * stride_b; + for(unsigned i=0; i<m; i++) { + for(unsigned k=0; k<p; k++) { + const COMPLEX *pb_j_k = b+k, *pa_i_j = pa_i; + COMPLEX total; + COMPLEX_zero(&total); + { + unsigned j4=0, n4=n/UNROLL; + if (n4 > 0) { + do { + CHUNK + CHUNK + CHUNK + CHUNK + j4++; + } while (j4 < n4); + } + } + { + unsigned j4=0, n4=n%UNROLL; + if (n4 > 0) { + do { + CHUNK + j4++; + } while (j4 < n4); + } + } + pc_i[k] = total; + } + pa_i += stride_a; + pc_i += stride_c; + } +} + +#undef CHUNK +#define CHUNK \ + MACRO_COMPLEX_mul_addto(total, *pa_i_j, *pb_j_k) \ + pa_i_j ++; \ + pb_j_k = (COMPLEX*) ((char*) pb_j_k + stride_b_scaled); + +void COMPLEX_mat_mul9(unsigned m, unsigned n, unsigned p, + COMPLEX * c, unsigned stride_c, + const COMPLEX *a, unsigned stride_a, + const COMPLEX *b, unsigned stride_b) { + const COMPLEX *pa_i = a; + COMPLEX * pc_i = c; + size_t stride_b_scaled = sizeof(COMPLEX) * stride_b; + for(unsigned i=0; i<m; i++) { + for(unsigned k=0; k<p; k++) { + const COMPLEX *pb_j_k = b+k, *pa_i_j = pa_i; + COMPLEX total; + COMPLEX_zero(&total); + { + unsigned j4=0, n4=n/UNROLL; + if (n4 > 0) { + do { + CHUNK + CHUNK + CHUNK + CHUNK + j4++; + } while (j4 < n4); + } + } + { + unsigned j4=0, n4=n%UNROLL; + if (n4 > 0) { + do { + CHUNK + j4++; + } while (j4 < n4); + } } + pc_i[k] = total; } + pa_i += stride_a; + pc_i += stride_c; } } +bool COMPLEX_equal(const COMPLEX *a, + const COMPLEX *b) { + return a->re==b->re && a->im==b->im; +} + +bool COMPLEX_mat_equal(unsigned m, + unsigned n, + const COMPLEX *a, unsigned stride_a, + const COMPLEX *b, unsigned stride_b) { + for(unsigned i=0; i<m; i++) { + for(unsigned j=0; j<n; j++) { + if (! COMPLEX_equal(a+i*stride_a + j, b+i*stride_b + j)) { + printf("at %u,%u: (%g,%g) vs (%g,%g)\n", i, j, + a[i*stride_a + j].re, a[i*stride_a + j].im, + b[i*stride_b + j].re, b[i*stride_b + j].im); + return false; + } + } + } + return true; +} + +REAL REAL_random(void) { + static uint64_t next = 1325997111; + next = next * 1103515249 + 12345; + return next % 1000; +} + +void COMPLEX_mat_random(unsigned m, + unsigned n, + COMPLEX *a, unsigned stride_a) { + for(unsigned i=0; i<m; i++) { + for(unsigned j=0; j<n; j++) { + a[i*stride_a + j].re = REAL_random(); + a[i*stride_a + j].im = REAL_random(); + } + } +} + + int main() { - COMPLEX a = { 1, 2 }; - COMPLEX b = { 7, 4 }; - COMPLEX r; - COMPLEX_mul(&r, &a, &b); - printf("%g %g\n", r.re, r.im); + const unsigned m = 60, n = 31, p = 50; + clock_prepare(); + COMPLEX *a = malloc(sizeof(COMPLEX) * m * n); + COMPLEX_mat_random(m, n, a, n); + COMPLEX *b = malloc(sizeof(COMPLEX) * n * p); + COMPLEX_mat_random(n, p, b, p); + + COMPLEX *c1 = malloc(sizeof(COMPLEX) * m * p); + cycle_t c1_time = get_current_cycle(); + COMPLEX_mat_mul1(m, n, p, c1, p, a, n, b, p); + c1_time = get_current_cycle()-c1_time; + + COMPLEX *c8 = malloc(sizeof(COMPLEX) * m * p); + cycle_t c8_time = get_current_cycle(); + COMPLEX_mat_mul8(m, n, p, c8, p, a, n, b, p); + c8_time = get_current_cycle()-c8_time; + + COMPLEX *c9 = malloc(sizeof(COMPLEX) * m * p); + cycle_t c9_time = get_current_cycle(); + COMPLEX_mat_mul9(m, n, p, c9, p, a, n, b, p); + c9_time = get_current_cycle()-c9_time; + + printf("c1==c8: %s\n" + "c1==c9: %s\n" + "c1_time = %" PRIu64 "\n" + "c8_time = %" PRIu64 "\n" + "c9_time = %" PRIu64 "\n", + + COMPLEX_mat_equal(m, n, c1, p, c8, p)?"true":"false", + COMPLEX_mat_equal(m, n, c1, p, c9, p)?"true":"false", + + c1_time, + c8_time, + c9_time); + + free(a); + free(b); + free(c1); + free(c8); + free(c9); + return 0; } |