/*
 * Matrix-multiply kernels REAL_mat_mul1, REAL_mat_mul7 and REAL_mat_mul8.
 *
 * NOTE(review): this text does not look like an intact translation unit and
 * should be restored from the original file before anyone relies on it.
 * Observed damage:
 *   - Everything sits on one physical line, so #include / #define / #undef
 *     are not on lines of their own, and the backslashes inside the CHUNK
 *     definition are not followed by a newline.
 *   - The second #include names no header.
 *   - Every outer loop header reads "for(unsigned i=0; i 0) {": the loop
 *     condition, the increment and the code that followed are absent.
 *     Presumably the text between a '<' and a later '>' was dropped during
 *     extraction -- TODO confirm against the original source.
 *   - After the REAL_mat_mul1 signature the text uses CHUNK, j2, n2, total,
 *     pa_i_j, pb_j_k, pa_i and pc_i without any visible declaration; this
 *     looks like the tail of a different function -- TODO confirm.
 *   - "#undef CHUNK" implies an earlier CHUNK definition that is not visible.
 *
 * What the visible fragments show:
 *   - ADD expands to "+=" and MUL to "*"; UNROLL is 4.
 *   - All three functions take (m, n, p, c, stride_c, a, stride_a, b,
 *     stride_b); only REAL_mat_mul1 qualifies c with restrict.
 *   - The surviving inner code accumulates "*pa_i_j MUL *pb_j_k" into total,
 *     stores total to pc_i[k], and then advances pa_i by stride_a and pc_i
 *     by stride_c.
 *   - The surviving tails run CHUNK in groups (two per pass with j2/n2, four
 *     per pass with j4/n4) and then handle the leftover: "if (n%2)" adds one
 *     more product, and "n4=n%UNROLL" runs CHUNK once per remaining element.
 *   - The visible CHUNK definition adds one product to total, increments
 *     pa_i_j, and advances pb_j_k by stride_b_scaled bytes via a char* cast;
 *     REAL_mat_mul8 sets stride_b_scaled = sizeof(REAL) * stride_b.
 *   - REAL presumably comes from float_mat.h -- verify.
 */
#include "float_mat.h" #include #define ADD += #define MUL * void REAL_mat_mul1(unsigned m, unsigned n, unsigned p, REAL * restrict c, unsigned stride_c, const REAL *a, unsigned stride_a, const REAL *b, unsigned stride_b) { for(unsigned i=0; i 0) { do { CHUNK CHUNK j2++; } while (j2 < n2); } if (n%2) { total ADD (*pa_i_j MUL *pb_j_k); } pc_i[k] = total; } pa_i += stride_a; pc_i += stride_c; } } #define UNROLL 4 void REAL_mat_mul7(unsigned m, unsigned n, unsigned p, REAL * c, unsigned stride_c, const REAL *a, unsigned stride_a, const REAL *b, unsigned stride_b) { const REAL *pa_i = a; REAL * pc_i = c; for(unsigned i=0; i 0) { do { CHUNK CHUNK CHUNK CHUNK j4++; } while (j4 < n4); } } { unsigned j4=0, n4=n%UNROLL; if (n4 > 0) { do { CHUNK j4++; } while (j4 < n4); } } pc_i[k] = total; } pa_i += stride_a; pc_i += stride_c; } } #undef CHUNK #define CHUNK \ total ADD (*pa_i_j MUL *pb_j_k); \ pa_i_j ++; \ pb_j_k = (REAL*) ((char*) pb_j_k + stride_b_scaled); void REAL_mat_mul8(unsigned m, unsigned n, unsigned p, REAL * c, unsigned stride_c, const REAL *a, unsigned stride_a, const REAL *b, unsigned stride_b) { const REAL *pa_i = a; REAL * pc_i = c; size_t stride_b_scaled = sizeof(REAL) * stride_b; for(unsigned i=0; i 0) { do { CHUNK CHUNK CHUNK CHUNK j4++; } while (j4 < n4); } } { unsigned j4=0, n4=n%UNROLL; if (n4 > 0) { do { CHUNK j4++; } while (j4 < n4); } } pc_i[k] = total; } pa_i += stride_a; pc_i += stride_c; } }