From cb1d47636f9bb9adbdfb3f18982417b746b2bf40 Mon Sep 17 00:00:00 2001 From: Jianyi Cheng Date: Fri, 12 Jun 2020 13:02:33 +0100 Subject: Adding benchmarks to branch_jc --- benchmarks/README.md | 14 + benchmarks/fft/fft.c | 432 + benchmarks/fft/fft.h | 61 + benchmarks/fft/generate.c | 43 + benchmarks/fft/test.c | 36 + benchmarks/getTanh/getTanh.cpp | 115 + benchmarks/jacob_2d/jacob_2d.cpp | 121 + benchmarks/jacob_2d/jacob_2d.h | 80 + benchmarks/kmeans/lloyds_algorithm_top.cpp | 326 + benchmarks/kmeans/lloyds_algorithm_top.h | 112 + benchmarks/kmeans/lloyds_algorithm_util.cpp | 258 + benchmarks/kmeans/lloyds_algorithm_util.h | 33 + benchmarks/sobel/input.h | 16967 ++++++++++++++++++++++++++ benchmarks/sobel/output.h | 514 + benchmarks/sobel/sobel.c | 77 + 15 files changed, 19189 insertions(+) create mode 100644 benchmarks/README.md create mode 100644 benchmarks/fft/fft.c create mode 100644 benchmarks/fft/fft.h create mode 100644 benchmarks/fft/generate.c create mode 100644 benchmarks/fft/test.c create mode 100644 benchmarks/getTanh/getTanh.cpp create mode 100644 benchmarks/jacob_2d/jacob_2d.cpp create mode 100644 benchmarks/jacob_2d/jacob_2d.h create mode 100644 benchmarks/kmeans/lloyds_algorithm_top.cpp create mode 100644 benchmarks/kmeans/lloyds_algorithm_top.h create mode 100644 benchmarks/kmeans/lloyds_algorithm_util.cpp create mode 100644 benchmarks/kmeans/lloyds_algorithm_util.h create mode 100644 benchmarks/sobel/input.h create mode 100644 benchmarks/sobel/output.h create mode 100644 benchmarks/sobel/sobel.c (limited to 'benchmarks') diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..7b58302 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,14 @@ +Hi, + +I have collected a set of benchmarks that you may be interested in. The main idea is to run the existing HLS benchmarks to see if they work - they can only test correctness for a single set of inputs using a customised test bench. 
+ +* jacob_2d: a benchmark from Polybench +* sobel: a benchmark from LegUp HLS +* getTanh: a benchmark from DASS +* fft: a benchmark from MachSuite +* KMeans: a benchmark from Felix's work + +Note that all the benchmarks above (links are included in the source code) should be synthesisable in your case, so you may be able to show coverage instead of a single benchmark. + +Best, +Jianyi diff --git a/benchmarks/fft/fft.c b/benchmarks/fft/fft.c new file mode 100644 index 0000000..00ae886 --- /dev/null +++ b/benchmarks/fft/fft.c @@ -0,0 +1,432 @@ +// source: https://github.com/tc466/machsuite/tree/a7d831882d931009d4135f0c4d3f642f782ef897/fft/transpose + +/* +Copyright (c) 2011, UT-Battelle, LLC +Copyright (c) 2014, the President and Fellows of Harvard College +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +* Neither the name of Oak Ridge National Laboratory, nor UT-Battelle, LLC, nor + the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Implementations based on: +V. Volkov and B. Kazian. Fitting fft onto the g80 architecture. 2008. +*/ + +#include "fft.h" + +//////BEGIN TWIDDLES //////// +#define THREADS 64 +#define cmplx_M_x(a_x, a_y, b_x, b_y) (a_x*b_x - a_y *b_y) +#define cmplx_M_y(a_x, a_y, b_x, b_y) (a_x*b_y + a_y *b_x) +#define cmplx_MUL_x(a_x, a_y, b_x, b_y ) (a_x*b_x - a_y*b_y) +#define cmplx_MUL_y(a_x, a_y, b_x, b_y ) (a_x*b_y + a_y*b_x) +#define cmplx_mul_x(a_x, a_y, b_x, b_y) (a_x*b_x - a_y*b_y) +#define cmplx_mul_y(a_x, a_y, b_x, b_y) (a_x*b_y + a_y*b_x) +#define cmplx_add_x(a_x, b_x) (a_x + b_x) +#define cmplx_add_y(a_y, b_y) (a_y + b_y) +#define cmplx_sub_x(a_x, b_x) (a_x - b_x) +#define cmplx_sub_y(a_y, b_y) (a_y - b_y) +#define cm_fl_mul_x(a_x, b) (b*a_x) +#define cm_fl_mul_y(a_y, b) (b*a_y) + +void twiddles8(TYPE a_x[8], TYPE a_y[8], int i, int n){ + int reversed8[8] = {0,4,2,6,1,5,3,7}; + int j; + TYPE phi, tmp, phi_x, phi_y; + + for(j=1; j < 8; j++){ + phi = ((-2*PI*reversed8[j]/n)*i); + phi_x = cos(phi); + phi_y = sin(phi); + tmp = a_x[j]; + a_x[j] = cmplx_M_x(a_x[j], a_y[j], phi_x, phi_y); + a_y[j] = cmplx_M_y(tmp, a_y[j], phi_x, phi_y); + } +} +////END TWIDDLES //// + +#define FF2(a0_x, a0_y, a1_x, a1_y){ \ + TYPE c0_x = *a0_x; \ + TYPE c0_y = *a0_y; \ + *a0_x = cmplx_add_x(c0_x, *a1_x); \ + *a0_y = cmplx_add_y(c0_y, *a1_y); \ + *a1_x = cmplx_sub_x(c0_x, *a1_x); \ + *a1_y = cmplx_sub_y(c0_y, *a1_y); \ +} + +#define FFT4(a0_x, a0_y, a1_x, a1_y, a2_x, a2_y, a3_x, 
a3_y){ \ + TYPE exp_1_44_x; \ + TYPE exp_1_44_y; \ + TYPE tmp; \ + exp_1_44_x = 0.0; \ + exp_1_44_y = -1.0; \ + FF2( a0_x, a0_y, a2_x, a2_y); \ + FF2( a1_x, a1_y, a3_x, a3_y); \ + tmp = *a3_x; \ + *a3_x = *a3_x*exp_1_44_x-*a3_y*exp_1_44_y; \ + *a3_y = tmp*exp_1_44_y - *a3_y*exp_1_44_x; \ + FF2( a0_x, a0_y, a1_x, a1_y ); \ + FF2( a2_x, a2_y, a3_x, a3_y ); \ +} + +#define FFT8(a_x, a_y) \ +{ \ + TYPE exp_1_8_x, exp_1_4_x, exp_3_8_x; \ + TYPE exp_1_8_y, exp_1_4_y, exp_3_8_y; \ + TYPE tmp_1, tmp_2; \ + exp_1_8_x = 1; \ + exp_1_8_y = -1; \ + exp_1_4_x = 0; \ + exp_1_4_y = -1; \ + exp_3_8_x = -1; \ + exp_3_8_y = -1; \ + FF2( &a_x[0], &a_y[0], &a_x[4], &a_y[4]); \ + FF2( &a_x[1], &a_y[1], &a_x[5], &a_y[5]); \ + FF2( &a_x[2], &a_y[2], &a_x[6], &a_y[6]); \ + FF2( &a_x[3], &a_y[3], &a_x[7], &a_y[7]); \ + tmp_1 = a_x[5]; \ + a_x[5] = cm_fl_mul_x( cmplx_mul_x(a_x[5], a_y[5], exp_1_8_x, exp_1_8_y), M_SQRT1_2 ); \ + a_y[5] = cm_fl_mul_y( cmplx_mul_y(tmp_1, a_y[5], exp_1_8_x, exp_1_8_y) , M_SQRT1_2 ); \ + tmp_1 = a_x[6]; \ + a_x[6] = cmplx_mul_x( a_x[6], a_y[6], exp_1_4_x , exp_1_4_y); \ + a_y[6] = cmplx_mul_y( tmp_1, a_y[6], exp_1_4_x , exp_1_4_y); \ + tmp_1 = a_x[7]; \ + a_x[7] = cm_fl_mul_x( cmplx_mul_x(a_x[7], a_y[7], exp_3_8_x, exp_3_8_y), M_SQRT1_2 ); \ + a_y[7] = cm_fl_mul_y( cmplx_mul_y(tmp_1, a_y[7], exp_3_8_x, exp_3_8_y) , M_SQRT1_2 ); \ + FFT4( &a_x[0], &a_y[0], &a_x[1], &a_y[1], &a_x[2], &a_y[2], &a_x[3], &a_y[3] ); \ + FFT4( &a_x[4], &a_y[4], &a_x[5], &a_y[5], &a_x[6], &a_y[6], &a_x[7], &a_y[7] ); \ +} + +void loadx8(TYPE a_x[], TYPE x[], int offset, int sx){ + a_x[0] = x[0*sx+offset]; + a_x[1] = x[1*sx+offset]; + a_x[2] = x[2*sx+offset]; + a_x[3] = x[3*sx+offset]; + a_x[4] = x[4*sx+offset]; + a_x[5] = x[5*sx+offset]; + a_x[6] = x[6*sx+offset]; + a_x[7] = x[7*sx+offset]; +} + +void loady8(TYPE a_y[], TYPE x[], int offset, int sx){ + a_y[0] = x[0*sx+offset]; + a_y[1] = x[1*sx+offset]; + a_y[2] = x[2*sx+offset]; + a_y[3] = x[3*sx+offset]; + a_y[4] = x[4*sx+offset]; + 
a_y[5] = x[5*sx+offset]; + a_y[6] = x[6*sx+offset]; + a_y[7] = x[7*sx+offset]; +} + +void fft1D_512(TYPE work_x[512], TYPE work_y[512]){ + int tid, hi, lo, i, j, stride; + int reversed[] = {0,4,2,6,1,5,3,7}; + TYPE DATA_x[THREADS*8]; + TYPE DATA_y[THREADS*8]; + + TYPE data_x[ 8 ]; + TYPE data_y[ 8 ]; + + TYPE smem[8*8*9]; + + stride = THREADS; + + //Do it all at once... +loop1 : for(tid = 0; tid < THREADS; tid++){ + //GLOBAL_LOAD... + data_x[0] = work_x[0*stride+tid]; + data_x[1] = work_x[1*stride+tid]; + data_x[2] = work_x[2*stride+tid]; + data_x[3] = work_x[3*stride+tid]; + data_x[4] = work_x[4*stride+tid]; + data_x[5] = work_x[5*stride+tid]; + data_x[6] = work_x[6*stride+tid]; + data_x[7] = work_x[7*stride+tid]; + + data_y[0] = work_y[0*stride+tid]; + data_y[1] = work_y[1*stride+tid]; + data_y[2] = work_y[2*stride+tid]; + data_y[3] = work_y[3*stride+tid]; + data_y[4] = work_y[4*stride+tid]; + data_y[5] = work_y[5*stride+tid]; + data_y[6] = work_y[6*stride+tid]; + data_y[7] = work_y[7*stride+tid]; + + //First 8 point FFT... 
+ FFT8(data_x, data_y); + + //First Twiddle + twiddles8(data_x, data_y, tid, 512); + + //save for fence + DATA_x[tid*8] = data_x[0]; + DATA_x[tid*8 + 1] = data_x[1]; + DATA_x[tid*8 + 2] = data_x[2]; + DATA_x[tid*8 + 3] = data_x[3]; + DATA_x[tid*8 + 4] = data_x[4]; + DATA_x[tid*8 + 5] = data_x[5]; + DATA_x[tid*8 + 6] = data_x[6]; + DATA_x[tid*8 + 7] = data_x[7]; + + DATA_y[tid*8] = data_y[0]; + DATA_y[tid*8 + 1] = data_y[1]; + DATA_y[tid*8 + 2] = data_y[2]; + DATA_y[tid*8 + 3] = data_y[3]; + DATA_y[tid*8 + 4] = data_y[4]; + DATA_y[tid*8 + 5] = data_y[5]; + DATA_y[tid*8 + 6] = data_y[6]; + DATA_y[tid*8 + 7] = data_y[7]; + } + int sx, offset; + sx = 66; +loop2 : for(tid = 0; tid < 64; tid++){ + hi = tid>>3; + lo = tid&7; + offset = hi*8+lo; + smem[0*sx+offset] = DATA_x[tid*8 + 0]; + smem[4*sx+offset] = DATA_x[tid*8 + 1]; + smem[1*sx+offset] = DATA_x[tid*8 + 4]; + smem[5*sx+offset] = DATA_x[tid*8 + 5]; + smem[2*sx+offset] = DATA_x[tid*8 + 2]; + smem[6*sx+offset] = DATA_x[tid*8 + 3]; + smem[3*sx+offset] = DATA_x[tid*8 + 6]; + smem[7*sx+offset] = DATA_x[tid*8 + 7]; + } + sx = 8; +loop3 : for(tid = 0; tid < 64; tid++){ + hi = tid>>3; + lo = tid&7; + offset = lo*66+hi; + + DATA_x[tid*8 +0] = smem[0*sx+offset]; + DATA_x[tid*8 +4] = smem[4*sx+offset]; + DATA_x[tid*8 +1] = smem[1*sx+offset]; + DATA_x[tid*8 +5] = smem[5*sx+offset]; + DATA_x[tid*8 +2] = smem[2*sx+offset]; + DATA_x[tid*8 +6] = smem[6*sx+offset]; + DATA_x[tid*8 +3] = smem[3*sx+offset]; + DATA_x[tid*8 +7] = smem[7*sx+offset]; + } + + sx = 66; +loop4 : for(tid = 0; tid < 64; tid++){ + hi = tid>>3; + lo = tid&7; + offset = hi*8+lo; + + smem[0*sx+offset] = DATA_y[tid*8 + 0]; + smem[4*sx+offset] = DATA_y[tid*8 + 1]; + smem[1*sx+offset] = DATA_y[tid*8 + 4]; + smem[5*sx+offset] = DATA_y[tid*8 + 5]; + smem[2*sx+offset] = DATA_y[tid*8 + 2]; + smem[6*sx+offset] = DATA_y[tid*8 + 3]; + smem[3*sx+offset] = DATA_y[tid*8 + 6]; + smem[7*sx+offset] = DATA_y[tid*8 + 7]; + } + +loop5 : for(tid = 0; tid < 64; tid++){ + data_y[0] = 
DATA_y[tid*8 + 0]; + data_y[1] = DATA_y[tid*8 + 1]; + data_y[2] = DATA_y[tid*8 + 2]; + data_y[3] = DATA_y[tid*8 + 3]; + data_y[4] = DATA_y[tid*8 + 4]; + data_y[5] = DATA_y[tid*8 + 5]; + data_y[6] = DATA_y[tid*8 + 6]; + data_y[7] = DATA_y[tid*8 + 7]; + + hi = tid>>3; + lo = tid&7; + + loady8(data_y, smem, lo*66+hi, 8); + + DATA_y[tid*8] = data_y[0]; + DATA_y[tid*8 + 1] = data_y[1]; + DATA_y[tid*8 + 2] = data_y[2]; + DATA_y[tid*8 + 3] = data_y[3]; + DATA_y[tid*8 + 4] = data_y[4]; + DATA_y[tid*8 + 5] = data_y[5]; + DATA_y[tid*8 + 6] = data_y[6]; + DATA_y[tid*8 + 7] = data_y[7]; + } + +loop6 : for(tid = 0; tid < 64; tid++){ + data_x[0] = DATA_x[tid*8 + 0]; + data_x[1] = DATA_x[tid*8 + 1]; + data_x[2] = DATA_x[tid*8 + 2]; + data_x[3] = DATA_x[tid*8 + 3]; + data_x[4] = DATA_x[tid*8 + 4]; + data_x[5] = DATA_x[tid*8 + 5]; + data_x[6] = DATA_x[tid*8 + 6]; + data_x[7] = DATA_x[tid*8 + 7]; + + data_y[0] = DATA_y[tid*8 + 0]; + data_y[1] = DATA_y[tid*8 + 1]; + data_y[2] = DATA_y[tid*8 + 2]; + data_y[3] = DATA_y[tid*8 + 3]; + data_y[4] = DATA_y[tid*8 + 4]; + data_y[5] = DATA_y[tid*8 + 5]; + data_y[6] = DATA_y[tid*8 + 6]; + data_y[7] = DATA_y[tid*8 + 7]; + + //Second FFT8... + FFT8(data_x, data_y); + + //Calculate hi for second twiddle calculation... + hi = tid>>3; + + //Second twiddles calc, use hi and 64 stride version as defined in G80/SHOC... + twiddles8(data_x, data_y, hi, 64); + + //Save for final transpose... + DATA_x[tid*8] = data_x[0]; + DATA_x[tid*8 + 1] = data_x[1]; + DATA_x[tid*8 + 2] = data_x[2]; + DATA_x[tid*8 + 3] = data_x[3]; + DATA_x[tid*8 + 4] = data_x[4]; + DATA_x[tid*8 + 5] = data_x[5]; + DATA_x[tid*8 + 6] = data_x[6]; + DATA_x[tid*8 + 7] = data_x[7]; + + DATA_y[tid*8] = data_y[0]; + DATA_y[tid*8 + 1] = data_y[1]; + DATA_y[tid*8 + 2] = data_y[2]; + DATA_y[tid*8 + 3] = data_y[3]; + DATA_y[tid*8 + 4] = data_y[4]; + DATA_y[tid*8 + 5] = data_y[5]; + DATA_y[tid*8 + 6] = data_y[6]; + DATA_y[tid*8 + 7] = data_y[7]; + } + + //Transpose.. 
+ sx = 72; +loop7 : for(tid = 0; tid < 64; tid++){ + hi = tid>>3; + lo = tid&7; + offset = hi*8+lo; + smem[0*sx+offset] = DATA_x[tid*8 + 0]; + smem[4*sx+offset] = DATA_x[tid*8 + 1]; + smem[1*sx+offset] = DATA_x[tid*8 + 4]; + smem[5*sx+offset] = DATA_x[tid*8 + 5]; + smem[2*sx+offset] = DATA_x[tid*8 + 2]; + smem[6*sx+offset] = DATA_x[tid*8 + 3]; + smem[3*sx+offset] = DATA_x[tid*8 + 6]; + smem[7*sx+offset] = DATA_x[tid*8 + 7]; + } + + sx = 8; +loop8 : for(tid = 0; tid < 64; tid++){ + hi = tid>>3; + lo = tid&7; + offset = hi*72+lo; + + DATA_x[tid*8 +0] = smem[0*sx+offset]; + DATA_x[tid*8 +4] = smem[4*sx+offset]; + DATA_x[tid*8 +1] = smem[1*sx+offset]; + DATA_x[tid*8 +5] = smem[5*sx+offset]; + DATA_x[tid*8 +2] = smem[2*sx+offset]; + DATA_x[tid*8 +6] = smem[6*sx+offset]; + DATA_x[tid*8 +3] = smem[3*sx+offset]; + DATA_x[tid*8 +7] = smem[7*sx+offset]; + } + + sx = 72; +loop9 : for(tid = 0; tid < 64; tid++){ + hi = tid>>3; + lo = tid&7; + offset = hi*8+lo; + + smem[0*sx+offset] = DATA_y[tid*8 + 0]; + smem[4*sx+offset] = DATA_y[tid*8 + 1]; + smem[1*sx+offset] = DATA_y[tid*8 + 4]; + smem[5*sx+offset] = DATA_y[tid*8 + 5]; + smem[2*sx+offset] = DATA_y[tid*8 + 2]; + smem[6*sx+offset] = DATA_y[tid*8 + 3]; + smem[3*sx+offset] = DATA_y[tid*8 + 6]; + smem[7*sx+offset] = DATA_y[tid*8 + 7]; + } + +loop10 : for(tid = 0; tid < 64; tid++){ + data_y[0] = DATA_y[tid*8 + 0]; + data_y[1] = DATA_y[tid*8 + 1]; + data_y[2] = DATA_y[tid*8 + 2]; + data_y[3] = DATA_y[tid*8 + 3]; + data_y[4] = DATA_y[tid*8 + 4]; + data_y[5] = DATA_y[tid*8 + 5]; + data_y[6] = DATA_y[tid*8 + 6]; + data_y[7] = DATA_y[tid*8 + 7]; + + hi = tid>>3; + lo = tid&7; + + loady8(data_y, smem, hi*72+lo, 8); + + DATA_y[tid*8 + 0] = data_y[0]; + DATA_y[tid*8 + 1] = data_y[1]; + DATA_y[tid*8 + 2] = data_y[2]; + DATA_y[tid*8 + 3] = data_y[3]; + DATA_y[tid*8 + 4] = data_y[4]; + DATA_y[tid*8 + 5] = data_y[5]; + DATA_y[tid*8 + 6] = data_y[6]; + DATA_y[tid*8 + 7] = data_y[7]; + } + +loop11 : for(tid = 0; tid < 64; tid++){ + //Load 
post-trans + data_y[0] = DATA_y[tid*8]; + data_y[1] = DATA_y[tid*8 + 1]; + data_y[2] = DATA_y[tid*8 + 2]; + data_y[3] = DATA_y[tid*8 + 3]; + data_y[4] = DATA_y[tid*8 + 4]; + data_y[5] = DATA_y[tid*8 + 5]; + data_y[6] = DATA_y[tid*8 + 6]; + data_y[7] = DATA_y[tid*8 + 7]; + + data_x[0] = DATA_x[tid*8]; + data_x[1] = DATA_x[tid*8 + 1]; + data_x[2] = DATA_x[tid*8 + 2]; + data_x[3] = DATA_x[tid*8 + 3]; + data_x[4] = DATA_x[tid*8 + 4]; + data_x[5] = DATA_x[tid*8 + 5]; + data_x[6] = DATA_x[tid*8 + 6]; + data_x[7] = DATA_x[tid*8 + 7]; + + //Final 8pt FFT... + FFT8(data_x, data_y); + + //Global store + work_x[0*stride+tid] = data_x[reversed[0]]; + work_x[1*stride+tid] = data_x[reversed[1]]; + work_x[2*stride+tid] = data_x[reversed[2]]; + work_x[3*stride+tid] = data_x[reversed[3]]; + work_x[4*stride+tid] = data_x[reversed[4]]; + work_x[5*stride+tid] = data_x[reversed[5]]; + work_x[6*stride+tid] = data_x[reversed[6]]; + work_x[7*stride+tid] = data_x[reversed[7]]; + + work_y[0*stride+tid] = data_y[reversed[0]]; + work_y[1*stride+tid] = data_y[reversed[1]]; + work_y[2*stride+tid] = data_y[reversed[2]]; + work_y[3*stride+tid] = data_y[reversed[3]]; + work_y[4*stride+tid] = data_y[reversed[4]]; + work_y[5*stride+tid] = data_y[reversed[5]]; + work_y[6*stride+tid] = data_y[reversed[6]]; + work_y[7*stride+tid] = data_y[reversed[7]]; + } +} diff --git a/benchmarks/fft/fft.h b/benchmarks/fft/fft.h new file mode 100644 index 0000000..4773894 --- /dev/null +++ b/benchmarks/fft/fft.h @@ -0,0 +1,61 @@ +/* +Copyright (c) 2011, UT-Battelle, LLC +Copyright (c) 2014, the President and Fellows of Harvard College +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +* Neither the name of Oak Ridge National Laboratory, nor UT-Battelle, LLC, nor + the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Implementations based on: +V. Volkov and B. Kazian. Fitting fft onto the g80 architecture. 2008. +*/ + +#include +#include +#include + +#define TYPE double +//#define TYPE int + +typedef struct complex_t { + TYPE x; + TYPE y; +} complex; + +#define PI 3.1415926535 +#ifndef M_SQRT1_2 +#define M_SQRT1_2 0.70710678118654752440f +#endif +void fft1D_512(TYPE work_x[512], TYPE work_y[512]); + +//////////////////////////////////////////////////////////////////////////////// +// Test harness interface code. 
+ +struct bench_args_t { + TYPE work_x[512]; + TYPE work_y[512]; +}; +int INPUT_SIZE = sizeof(struct bench_args_t); + +void run_benchmark( void *vargs ) { + struct bench_args_t *args = (struct bench_args_t *)vargs; + fft1D_512( args->work_x, args->work_y); +} + +//////////////////////////////////////////////////////////////////////////////// \ No newline at end of file diff --git a/benchmarks/fft/generate.c b/benchmarks/fft/generate.c new file mode 100644 index 0000000..f5135e2 --- /dev/null +++ b/benchmarks/fft/generate.c @@ -0,0 +1,43 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fft.h" +// Fake benchmark function to satisfy the extern +void fft1D_512(double data_x[512], double data_y[512]){ } + +void generate_binary() +{ + struct bench_args_t data; + char *ptr; + int status, i, fd, written=0; + + // Fill data structure + srandom(1); + for(i=0; i<512; i++){ + data.work_x[i] = (double)random(); + data.work_y[i] = (double)random(); + } + + // Open and write + fd = open("input.data", O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH); + assert( fd>0 && "Couldn't open input data file" ); + + ptr = (char *) &data; + while( written=0 && "Couldn't write input data file" ); + written += status; + } +} + +int main(int argc, char **argv) +{ + generate_binary(); + return 0; +} \ No newline at end of file diff --git a/benchmarks/fft/test.c b/benchmarks/fft/test.c new file mode 100644 index 0000000..3717e51 --- /dev/null +++ b/benchmarks/fft/test.c @@ -0,0 +1,36 @@ +#include "fft.h" + +int main(){ + TYPE a_x[512]; + TYPE a_y[512]; + int i; + float max, min; + max = 2147483646.0; + min = -2147483646.0; + + for( i = 0; i < 512; i++){ + a_x[i] = (TYPE)(i);//(TYPE)(((double) rand() / (RAND_MAX)) * (max-min) + min);//i; + a_y[i] = (TYPE)(i);//(((double) rand() / (RAND_MAX)) * (max-min) + min);//i; + } + + printf("ORIG!\n"); + for( i = 0; i < 512; i++){ + printf("x = %f y = %f \n", a_x[i], a_y[i]); + } + + 
fft1D_512(a_x, a_y); + + printf("OUTPUT\n"); + for( i = 0; i < 512; i++){ + printf("x = %f y = %f \n", a_x[i], a_y[i]); + } + + fft1D_512(a_x, a_y); + + printf("NORMAL\n"); + for( i = 0; i < 512; i++){ + printf("x = %f y = %f \n", a_x[i]/512, a_y[i]/512); + } + + return 0; +} \ No newline at end of file diff --git a/benchmarks/getTanh/getTanh.cpp b/benchmarks/getTanh/getTanh.cpp new file mode 100644 index 0000000..cdb1d24 --- /dev/null +++ b/benchmarks/getTanh/getTanh.cpp @@ -0,0 +1,115 @@ +/* + + Function Name: getTanh + Description: This function takes one input array and generates getTanh result for each elements. + Source: https://github.com/JianyiCheng/HLS-benchmarks/blob/master/DSS/getTanh/src/inlined.cpp + +*/ + +void getTanh(int A[1000], int atanh[12], int sinh[5], int cosh[5]){ + int i; + + // the result is positive or negative + int is_neg; + // Input angle + int beta; + // Output of the hyperbolic CORDIC block + int outputcosh, outputsinh; + // Result of tanh, sinh and cosh + int result; + // Token for the repetition of the 4th iteration + //char token4 = 0; + // Approximation of cosh(beta) and sinh(beta) + int x = 0x1351; + int y = 0; + int x_new; + int j, k; + int index_trigo; + int result_cosh, result_sinh; + + for (i = 0; i < 1000; i++){ + beta = A[i]; + + // Implement approximate range of the hyperbolic CORDIC block + if (beta >= 20480) { + result = 4096; // Saturation effect + } else { + if (beta >= 16384) { + index_trigo = 4; + } else if (beta >= 12288) { + index_trigo = 3; + } else if (beta >= 8192) { + index_trigo = 2; + } else if (beta >= 4096) { + index_trigo = 1; + } else { + index_trigo = 0; + } + beta = beta - index_trigo * 4096; + // Call to the hyperbolic CORDIC block + x = 0x1351; + y = 0; + fp_hyp_cordic: + for (k = 1; k <= 12; k++) { + // force the 3k+1 th iteration to be repeated + if (((k%3)==1) &&( k!=1 )){ + for (j=1;j<=2;j++){ + // beta<0 anti-clockwise rotation + if (beta < 0) { + x_new = x - (y >> k); + y -= x >> k; + beta += 
atanh[k - 1]; + } + // beta>0 clockwise rotation + else { + x_new = x + (y >> k); + y += (x >> k); + beta -= atanh[k - 1]; + } + x = x_new; + } + } + else { + if (beta < 0) { + x_new = x - (y >> k); + y -= x >> k; + beta += atanh[k - 1]; + } + // beta>0 clockwise rotation + else { + x_new = x + (y >> k); + y += (x >> k); + beta -= atanh[k - 1]; + } + x = x_new; + } + } + outputcosh = x; + outputsinh = y; + + // Trigonometric rules application + result_cosh = (sinh[index_trigo] * outputcosh + cosh[index_trigo] * outputsinh); + result_sinh = (cosh[index_trigo] * outputcosh + sinh[index_trigo] * outputsinh) >> 12; + result = result_cosh / result_sinh; + } + + A[i] = result; + } + +} + +int main(){ + + int atanh[12] = { 0x08C9, 0x0416, 0x0202, 0x0100, 0x0080, 0x0064, 0x0032, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 }; + int cosh[5] = { 0x1000, 0x18B0, 0x3C31, 0xA115, 0x1B4EE}; + int sinh[5] = { 0x0, 0x12CD, 0x3A07, 0xA049, 0x1B4A3}; + + int A[1000]; + + for (int i = 0; i < 1000; i++) + A[i] = i; // here you can change your input data as you wish. + + + getTanh(A, atanh, sinh, cosh); + +} \ No newline at end of file diff --git a/benchmarks/jacob_2d/jacob_2d.cpp b/benchmarks/jacob_2d/jacob_2d.cpp new file mode 100644 index 0000000..0902db6 --- /dev/null +++ b/benchmarks/jacob_2d/jacob_2d.cpp @@ -0,0 +1,121 @@ + +/** + * This version is stamped on May 10, 2016 + * + * Contact: + * Louis-Noel Pouchet + * Tomofumi Yuki + * + * Web address: http://polybench.sourceforge.net + */ +/* jacobi-2d.c: this file is part of PolyBench/C */ + +#include +#include +#include +#include + +/* Include polybench common header. */ +#include + +/* Include benchmark-specific header. */ +#include "jacobi-2d.h" + + +/* Array initialization. 
*/ +static +void init_array (int n, + DATA_TYPE POLYBENCH_2D(A,N,N,n,n), + DATA_TYPE POLYBENCH_2D(B,N,N,n,n)) +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + { + A[i][j] = ((DATA_TYPE) i*(j+2) + 2) / n; + B[i][j] = ((DATA_TYPE) i*(j+3) + 3) / n; + } +} + + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static +void print_array(int n, + DATA_TYPE POLYBENCH_2D(A,N,N,n,n)) + +{ + int i, j; + + POLYBENCH_DUMP_START; + POLYBENCH_DUMP_BEGIN("A"); + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + if ((i * n + j) % 20 == 0) fprintf(POLYBENCH_DUMP_TARGET, "\n"); + fprintf(POLYBENCH_DUMP_TARGET, DATA_PRINTF_MODIFIER, A[i][j]); + } + POLYBENCH_DUMP_END("A"); + POLYBENCH_DUMP_FINISH; +} + + +/* Main computational kernel. The whole function will be timed, + including the call and return. */ +static +void kernel_jacobi_2d(int tsteps, + int n, + DATA_TYPE POLYBENCH_2D(A,N,N,n,n), + DATA_TYPE POLYBENCH_2D(B,N,N,n,n)) +{ + int t, i, j; + +#pragma scop + for (t = 0; t < _PB_TSTEPS; t++) + { + for (i = 1; i < _PB_N - 1; i++) + for (j = 1; j < _PB_N - 1; j++) + B[i][j] = SCALAR_VAL(0.2) * (A[i][j] + A[i][j-1] + A[i][1+j] + A[1+i][j] + A[i-1][j]); + for (i = 1; i < _PB_N - 1; i++) + for (j = 1; j < _PB_N - 1; j++) + A[i][j] = SCALAR_VAL(0.2) * (B[i][j] + B[i][j-1] + B[i][1+j] + B[1+i][j] + B[i-1][j]); + } +#pragma endscop + +} + + +int main(int argc, char** argv) +{ + /* Retrieve problem size. */ + int n = N; + int tsteps = TSTEPS; + + /* Variable declaration/allocation. */ + POLYBENCH_2D_ARRAY_DECL(A, DATA_TYPE, N, N, n, n); + POLYBENCH_2D_ARRAY_DECL(B, DATA_TYPE, N, N, n, n); + + + /* Initialize array(s). */ + init_array (n, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B)); + + /* Start timer. */ + polybench_start_instruments; + + /* Run kernel. */ + kernel_jacobi_2d(tsteps, n, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B)); + + /* Stop and print timer. 
*/ + polybench_stop_instruments; + polybench_print_instruments; + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + polybench_prevent_dce(print_array(n, POLYBENCH_ARRAY(A))); + + /* Be clean. */ + POLYBENCH_FREE_ARRAY(A); + POLYBENCH_FREE_ARRAY(B); + + return 0; +} diff --git a/benchmarks/jacob_2d/jacob_2d.h b/benchmarks/jacob_2d/jacob_2d.h new file mode 100644 index 0000000..ca0d4d3 --- /dev/null +++ b/benchmarks/jacob_2d/jacob_2d.h @@ -0,0 +1,80 @@ +/** + * This version is stamped on May 10, 2016 + * + * Contact: + * Louis-Noel Pouchet + * Tomofumi Yuki + * + * Web address: http://polybench.sourceforge.net + */ +#ifndef _JACOBI_2D_H +# define _JACOBI_2D_H + +/* Default to LARGE_DATASET. */ +# if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(MEDIUM_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET) +# define LARGE_DATASET +# endif + +# if !defined(TSTEPS) && !defined(N) +/* Define sample dataset sizes. 
*/ +# ifdef MINI_DATASET +# define TSTEPS 20 +# define N 30 +# endif + +# ifdef SMALL_DATASET +# define TSTEPS 40 +# define N 90 +# endif + +# ifdef MEDIUM_DATASET +# define TSTEPS 100 +# define N 250 +# endif + +# ifdef LARGE_DATASET +# define TSTEPS 500 +# define N 1300 +# endif + +# ifdef EXTRALARGE_DATASET +# define TSTEPS 1000 +# define N 2800 +# endif + + +#endif /* !(TSTEPS N) */ + +# define _PB_TSTEPS POLYBENCH_LOOP_BOUND(TSTEPS,tsteps) +# define _PB_N POLYBENCH_LOOP_BOUND(N,n) + + +/* Default data type */ +# if !defined(DATA_TYPE_IS_INT) && !defined(DATA_TYPE_IS_FLOAT) && !defined(DATA_TYPE_IS_DOUBLE) +# define DATA_TYPE_IS_DOUBLE +# endif + +#ifdef DATA_TYPE_IS_INT +# define DATA_TYPE int +# define DATA_PRINTF_MODIFIER "%d " +#endif + +#ifdef DATA_TYPE_IS_FLOAT +# define DATA_TYPE float +# define DATA_PRINTF_MODIFIER "%0.2f " +# define SCALAR_VAL(x) x##f +# define SQRT_FUN(x) sqrtf(x) +# define EXP_FUN(x) expf(x) +# define POW_FUN(x,y) powf(x,y) +# endif + +#ifdef DATA_TYPE_IS_DOUBLE +# define DATA_TYPE double +# define DATA_PRINTF_MODIFIER "%0.2lf " +# define SCALAR_VAL(x) x +# define SQRT_FUN(x) sqrt(x) +# define EXP_FUN(x) exp(x) +# define POW_FUN(x,y) pow(x,y) +# endif + +#endif /* !_JACOBI_2D_H */ diff --git a/benchmarks/kmeans/lloyds_algorithm_top.cpp b/benchmarks/kmeans/lloyds_algorithm_top.cpp new file mode 100644 index 0000000..f2db1ce --- /dev/null +++ b/benchmarks/kmeans/lloyds_algorithm_top.cpp @@ -0,0 +1,326 @@ +// source: https://github.com/FelixWinterstein/Vivado-KMeans/tree/b1121f826bdac8db9502e4bf0c8f3b08425bc061/lloyds_algorithm_HLS/source + +/********************************************************************** +* Felix Winterstein, Imperial College London +* +* File: lloyds_algorithm_top.cpp +* +* Revision 1.01 +* Additional Comments: distributed under a BSD license, see LICENSE.txt +* +**********************************************************************/ + +#include "lloyds_algorithm_top.h" +#include "lloyds_algorithm_util.h" + + 
+// global array for the data (keep it local to this file) +data_type data_int_memory[N]; +data_type centre_positions[K*P]; +centre_type centre_buffer[K*P]; + + +// top-level function of the design +void lloyds_algorithm_top( volatile data_type *data, + volatile data_type *cntr_pos_init, + node_pointer n, + centre_index_type k, + volatile coord_type_ext *distortion_out, + volatile data_type *clusters_out) +{ + // set the interface properties + #pragma HLS interface ap_none register port=n + #pragma HLS interface ap_none register port=k + #pragma HLS interface ap_fifo port=data depth=256 + + #pragma HLS interface ap_fifo port=cntr_pos_init depth=256 + #pragma HLS interface ap_fifo port=distortion_out depth=256 + #pragma HLS interface ap_fifo port=clusters_out depth=256 + + /* + #pragma HLS data_pack variable=data + #pragma HLS data_pack variable=cntr_pos_init + #pragma HLS data_pack variable=clusters_out + */ + #pragma HLS data_pack variable=data_int_memory + #pragma HLS data_pack variable=centre_positions + #pragma HLS data_pack variable=centre_buffer + + // specify the type of memory instantiated for these arrays: two-port block ram + #pragma HLS resource variable=data_int_memory core=RAM_2P_BRAM + #pragma HLS resource variable=centre_positions core=RAM_2P_BRAM + #pragma HLS resource variable=centre_buffer core=RAM_2P_LUTRAM + + // partition the arrays according to the parallelism degree P + // NOTE: the part. factor must be updated if P is changed (in lloyds_alogrithm_top.h) ! 
+ #pragma HLS array_partition variable=centre_buffer block factor=40 dim=1 + #pragma HLS array_partition variable=centre_positions block factor=40 dim=1 + + init_node_memory(data,n); + + centre_type filt_centres_out[K]; + data_type new_centre_positions[K]; + // more struct-packing + #pragma HLS data_pack variable=filt_centres_out + #pragma HLS data_pack variable=filt_centres_out + + // iterate over a constant number of outer clustering iterations + it_loop: for (uint l=0; l=n-P+1) { + //if (i>=n) { + break; + } + } + + + // readout centres + read_out_centres_loop: for(centre_index_type i=0; i<=k; i++) { + #pragma HLS pipeline II=1 + + coord_type_ext arr_count[P]; + coord_type_ext arr_sum_sq[P]; + coord_type_vector_ext arr_wgtCent[P]; + #pragma HLS array_partition variable=arr_count complete + #pragma HLS array_partition variable=arr_sum_sq complete + #pragma HLS array_partition variable=arr_wgtCent complete + + for (uint p=0; p +#include "ap_int.h" // custom data types + +#define D 3 // data dimensionality +#define N 32768 // max. number of data points +#define K 256 // max. number of centres +#define L 6 // max. number of iterations +#define P 40 // parallelisation degree + +#define COORD_BITWIDTH 16 +#define COORD_BITWITDH_EXT 32 +#define NODE_POINTER_BITWIDTH 15 // log2(N) +#define CNTR_INDEX_BITWIDTH 8 // log2(K) + +// pointer types to tree nodes and centre lists +typedef ap_uint node_pointer; +typedef ap_uint centre_index_type; + +// force register insertion in the generated RTL for some signals +#define FORCE_REGISTERS +// ... 
used for saturation +#define MAX_FIXED_POINT_VAL_EXT (1<<(COORD_BITWITDH_EXT-1))-1 + +typedef unsigned int uint; +typedef ap_int coord_type; +typedef ap_int coord_type_vector; +typedef ap_int coord_type_ext; +typedef ap_int coord_type_vector_ext; + +//bit width definitions for multiplications +#define MUL_INTEGER_BITS 12 +#define MUL_FRACTIONAL_BITS 6 +#define MUL_MAX_VAL (1<<(MUL_INTEGER_BITS+MUL_FRACTIONAL_BITS-1))-1 +#define MUL_MIN_VAL -1*(1<<(MUL_INTEGER_BITS+MUL_FRACTIONAL_BITS-1)) +typedef ap_int mul_input_type; + +// this should be always 1 +#define FILE_INDEX 1 + +// data point types +struct data_type { + //coord_type value[D]; + coord_type_vector value; + data_type& operator=(const data_type& a); + data_type& operator=(const volatile data_type& a); +}; + + +// data point types ext +struct data_type_ext { + coord_type_vector_ext value; + data_type_ext& operator=(const data_type_ext& a); +}; + + +// centre types +struct centre_type { + data_type_ext wgtCent; // sum of all points assigned to this centre + coord_type_ext sum_sq; // sum of norm of all points assigned to this centre + coord_type count; + centre_type& operator=(const centre_type& a); +}; +typedef centre_type* centre_ptr; + + +#ifdef FORCE_REGISTERS +template +T Reg(T in) { + #pragma HLS INLINE off + #pragma HLS INTERFACE port=return register + return in; +} +#else +template +T Reg(T in) { + #pragma HLS INLINE + return in; +} +#endif + + + +void lloyds_algorithm_top( volatile data_type *data, + volatile data_type *cntr_pos_init, + node_pointer n, + centre_index_type k, + volatile coord_type_ext *distortion_out, + volatile data_type *clusters_out); + +void init_node_memory(volatile data_type *node_data, node_pointer n); + +void update_centres(centre_type *centres_in,centre_index_type k, data_type *centres_positions_out); + +void lloyds ( node_pointer n, + centre_index_type k, + centre_type *centres_out); + +#endif /* LLOYDS_ALGORITHM_TOP_H */ diff --git 
a/benchmarks/kmeans/lloyds_algorithm_util.cpp b/benchmarks/kmeans/lloyds_algorithm_util.cpp new file mode 100644 index 0000000..fa1db19 --- /dev/null +++ b/benchmarks/kmeans/lloyds_algorithm_util.cpp @@ -0,0 +1,258 @@
/**********************************************************************
* Felix Winterstein, Imperial College London
*
* File: lloyds_algorithm_util.cpp
*
* Revision 1.01
* Additional Comments: distributed under a BSD license, see LICENSE.txt
*
**********************************************************************/

// NOTE(review): the header name of the first #include is missing in this copy
// of the file; the line is left exactly as found.
#include
#include "lloyds_algorithm_util.h"


// Copy-assignment for data_type: copies the packed coordinate word.
data_type& data_type::operator=(const data_type& a)
{

    value = a.value;
    return *this;
}

// Same assignment, but accepting a volatile source object.
data_type& data_type::operator=(const volatile data_type& a)
{

    value = a.value;
    return *this;
}



// Copy-assignment for data_type_ext: copies the packed (extended) coordinate word.
data_type_ext& data_type_ext::operator=(const data_type_ext& a)
{
    value = a.value;
    return *this;
}



// Copy-assignment for centre_type: copies count, wgtCent and sum_sq.
centre_type& centre_type::operator=(const centre_type& a)
{
    count = a.count;
    wgtCent = a.wgtCent;
    sum_sq = a.sum_sq;
    //position = a.position;
    return *this;
}


// Writes b into field number idx of the packed vector *a, i.e. into bits
// idx*COORD_BITWIDTH .. (idx+1)*COORD_BITWIDTH-1. The other bits are untouched.
// The pragma names idx as the variable for HLS function instantiation
// (presumably one specialised instance per idx value - see the HLS tool docs).
void set_coord_type_vector_item(coord_type_vector *a, const coord_type b, const uint idx)
{
    #pragma HLS function_instantiate variable=idx
    a->range((idx+1)*COORD_BITWIDTH-1,idx*COORD_BITWIDTH) = b;
}


// Extended-width counterpart of set_coord_type_vector_item: writes b into bits
// idx*COORD_BITWITDH_EXT .. (idx+1)*COORD_BITWITDH_EXT-1 of *a.
void set_coord_type_vector_ext_item(coord_type_vector_ext *a, const coord_type_ext b, const uint idx)
{
    #pragma HLS function_instantiate variable=idx
    a->range((idx+1)*COORD_BITWITDH_EXT-1,idx*COORD_BITWITDH_EXT) = b;
}


// Returns field number idx of the packed vector a, i.e. bits
// idx*COORD_BITWIDTH .. (idx+1)*COORD_BITWIDTH-1.
coord_type get_coord_type_vector_item(const coord_type_vector a, const uint idx)
{
    #pragma HLS function_instantiate variable=idx
    coord_type tmp= a.range((idx+1)*COORD_BITWIDTH-1,idx*COORD_BITWIDTH);
    return tmp;
}


// Extended-width counterpart of get_coord_type_vector_item: returns bits
// idx*COORD_BITWITDH_EXT .. (idx+1)*COORD_BITWITDH_EXT-1 of a.
coord_type_ext get_coord_type_vector_ext_item(const coord_type_vector_ext a, const uint idx)
{
    #pragma HLS function_instantiate variable=idx
    coord_type_ext tmp= a.range((idx+1)*COORD_BITWITDH_EXT-1,idx*COORD_BITWITDH_EXT);
    return // the statement is completed at the start of the next line of this file
tmp; +} + + +/* ****** helper functions *******/ + + +// conversion from data_type_ext to data_type +data_type conv_long_to_short(data_type_ext p) +{ + #pragma HLS inline + data_type result; + for (uint d=0; d MUL_MAX_VAL) { + val = MUL_MAX_VAL; + } else if (val < MUL_MIN_VAL) { + val = MUL_MIN_VAL; + } + return (mul_input_type)val; +} + + +// fixed point multiplication with saturation and scaling +coord_type_ext fi_mul(coord_type_ext op1, coord_type_ext op2) +{ + #pragma HLS inline + mul_input_type tmp_op1 = saturate_mul_input(op1); + mul_input_type tmp_op2 = saturate_mul_input(op2); + + ap_int<2*(MUL_INTEGER_BITS+MUL_FRACTIONAL_BITS)> result_unscaled; + result_unscaled = tmp_op1*tmp_op2; + #pragma HLS resource variable=result_unscaled core=MulnS + + ap_int<2*(MUL_INTEGER_BITS+MUL_FRACTIONAL_BITS)> result_scaled; + result_scaled = result_unscaled >> MUL_FRACTIONAL_BITS; + coord_type_ext result; + result = (coord_type_ext)result_scaled; + return result; +} + +// tree adder +coord_type_ext tree_adder(coord_type_ext *input_array,const uint m) +{ + #pragma HLS inline + + for(uint j=0;j uint((m+m-uint(m/(1<<(j)))*(1<<(j)))/(1<<(j+1)))*(1<<(j+1))) { + input_array[uint(m/(1<<(j+1)))] = input_array[uint(m/(1<<(j+1))-1)*2+2]; + //printf("[%d] = [%d]\n",uint(m/(1<<(j+1))),uint(m/(1<<(j+1))-1)*2+2); + } + } + if (j== ceil(log2(m))-1) { + coord_type_ext tmp1 = input_array[0]; + coord_type_ext tmp2 = input_array[1]; + coord_type_ext tmp3 = tmp1+tmp2; + input_array[0] = tmp3; + //printf("[%d] = [%d]+[%d]\n",0,0,1); + #pragma HLS resource variable=tmp3 core=AddSubnS + } + } + return input_array[0]; +} + + +// tree compare select +void tree_cs(coord_type_ext *input_array,centre_index_type *index_array,coord_type_ext *res_val,centre_index_type *res_idx,const uint m) +{ + #pragma HLS inline + + for(uint j=0;j uint(m/(1<<(j+1)))*(1<<(j+1)) ) { + input_array[uint(m/(1<<(j+1)))] = (input_array[uint(m/(1<<(j+1))-1)*2+2]); + index_array[uint(m/(1<<(j+1)))] = 
(index_array[uint(m/(1<<(j+1))-1)*2+2]); + } + } + if (j== ceil(log2(m))-1) { + coord_type_ext tmp1 = input_array[0]; + coord_type_ext tmp1_idx = index_array[0]; + coord_type_ext tmp2 = input_array[1]; + coord_type_ext tmp2_idx = index_array[1]; + coord_type_ext tmp3; + centre_index_type tmp3_idx; + if (tmp1 < tmp2) { + tmp3 = tmp1; + tmp3_idx = tmp1_idx; + } else { + tmp3 = tmp2; + tmp3_idx = tmp2_idx; + } + input_array[0] = (tmp3); + index_array[0] = (tmp3_idx); + } + } + *res_val= input_array[0]; + *res_idx= index_array[0]; +} + + +// compute the Euclidean distance +void compute_distance(data_type p1, data_type p2, coord_type_ext *dist) +{ + #pragma HLS inline + + data_type tmp_p1 = p1; + data_type tmp_p2 = p2; + coord_type_ext tmp_mul_res[D]; + + for (uint d=0; d +#include +#include "input.h" +#include "output.h" + +#define WIDTH 512 +#define HEIGHT 512 + +void sobel_filter(unsigned char input[][WIDTH], + unsigned char output[][WIDTH]) { + int i, j; + int m, n; + + int gx_sum, gy_sum, sum; + + int gx[3][3] = {{-1, 0, 1}, + {-2, 0, 2}, + {-1, 0, 1}}; + + int gy[3][3] = {{ 1, 2, 1}, + { 0, 0, 0}, + {-1, -2, -1}}; + + for (i = 0; i < HEIGHT; i++) { + for (j = 0; j < WIDTH; j++) { + sum = 0; + int outofbounds = 0; + outofbounds |= (i < 1) | (i > (HEIGHT - 2)); + outofbounds |= (j < 1) | (j > (WIDTH - 2)); + + gx_sum = 0; + gy_sum = 0; + for (m = -1; m <= 1; m++) { + for (n = -1; n <= 1; n++) { + gx_sum += (outofbounds) ? 0 : + ((int)input[i + m][j + n]) * gx[m + 1][n + 1]; + gy_sum += (outofbounds) ? 0 : + ((int)input[i + m][j + n]) * gy[m + 1][n + 1]; + } + } + + gx_sum = (gx_sum < 0) ? -gx_sum : gx_sum; + gy_sum = (gy_sum < 0) ? -gy_sum : gy_sum; + + sum = gx_sum + gy_sum; + sum = (sum > 255) ? 
255 : sum; + + output[i][j] = (unsigned int)sum; + } + } +} + +int main() +{ + unsigned char sobel_output[HEIGHT][WIDTH]; + + sobel_filter(elaine_512_input, sobel_output); + + int result = 0; + int i, j; + + for(i = 0; i < HEIGHT; i++) { + for(j = 0; j < WIDTH; j++){ + if( sobel_output[i][j] != elaine_512_golden_output[i][j]) + result++; + } + } + + if (!result) + printf("PASS!\n"); + else + printf("FAIL with %d differences\n", result); + + return result; +} -- cgit