aboutsummaryrefslogtreecommitdiffstats
path: root/benchmarks/kmeans
diff options
context:
space:
mode:
authorJianyi Cheng <jcheng@Jianyis-Beast.home.ukb>2020-06-12 13:02:33 +0100
committerJianyi Cheng <jcheng@Jianyis-Beast.home.ukb>2020-06-12 13:02:33 +0100
commitcb1d47636f9bb9adbdfb3f18982417b746b2bf40 (patch)
tree56fb65f7590b4ed73d714d3bfc8ce8e6e4594e51 /benchmarks/kmeans
parentc54b32a91427e5342ce5ffa94b2398a2fcb8c144 (diff)
downloadvericert-kvx-cb1d47636f9bb9adbdfb3f18982417b746b2bf40.tar.gz
vericert-kvx-cb1d47636f9bb9adbdfb3f18982417b746b2bf40.zip
Adding benchmarks to branch_jc
Diffstat (limited to 'benchmarks/kmeans')
-rw-r--r--benchmarks/kmeans/lloyds_algorithm_top.cpp326
-rw-r--r--benchmarks/kmeans/lloyds_algorithm_top.h112
-rw-r--r--benchmarks/kmeans/lloyds_algorithm_util.cpp258
-rw-r--r--benchmarks/kmeans/lloyds_algorithm_util.h33
4 files changed, 729 insertions, 0 deletions
diff --git a/benchmarks/kmeans/lloyds_algorithm_top.cpp b/benchmarks/kmeans/lloyds_algorithm_top.cpp
new file mode 100644
index 0000000..f2db1ce
--- /dev/null
+++ b/benchmarks/kmeans/lloyds_algorithm_top.cpp
@@ -0,0 +1,326 @@
+// source: https://github.com/FelixWinterstein/Vivado-KMeans/tree/b1121f826bdac8db9502e4bf0c8f3b08425bc061/lloyds_algorithm_HLS/source
+
+/**********************************************************************
+* Felix Winterstein, Imperial College London
+*
+* File: lloyds_algorithm_top.cpp
+*
+* Revision 1.01
+* Additional Comments: distributed under a BSD license, see LICENSE.txt
+*
+**********************************************************************/
+
+#include "lloyds_algorithm_top.h"
+#include "lloyds_algorithm_util.h"
+
+
+// global array for the data (keep it local to this file)
+data_type data_int_memory[N];
+data_type centre_positions[K*P];
+centre_type centre_buffer[K*P];
+
+
+// top-level function of the design
+void lloyds_algorithm_top( volatile data_type *data,
+ volatile data_type *cntr_pos_init,
+ node_pointer n,
+ centre_index_type k,
+ volatile coord_type_ext *distortion_out,
+ volatile data_type *clusters_out)
+{
+ // set the interface properties
+ #pragma HLS interface ap_none register port=n
+ #pragma HLS interface ap_none register port=k
+ #pragma HLS interface ap_fifo port=data depth=256
+
+ #pragma HLS interface ap_fifo port=cntr_pos_init depth=256
+ #pragma HLS interface ap_fifo port=distortion_out depth=256
+ #pragma HLS interface ap_fifo port=clusters_out depth=256
+
+ /*
+ #pragma HLS data_pack variable=data
+ #pragma HLS data_pack variable=cntr_pos_init
+ #pragma HLS data_pack variable=clusters_out
+ */
+ #pragma HLS data_pack variable=data_int_memory
+ #pragma HLS data_pack variable=centre_positions
+ #pragma HLS data_pack variable=centre_buffer
+
+ // specify the type of memory instantiated for these arrays: two-port block ram
+ #pragma HLS resource variable=data_int_memory core=RAM_2P_BRAM
+ #pragma HLS resource variable=centre_positions core=RAM_2P_BRAM
+ #pragma HLS resource variable=centre_buffer core=RAM_2P_LUTRAM
+
+ // partition the arrays according to the parallelism degree P
+ // NOTE: the part. factor must be updated if P is changed (in lloyds_alogrithm_top.h) !
+ #pragma HLS array_partition variable=centre_buffer block factor=40 dim=1
+ #pragma HLS array_partition variable=centre_positions block factor=40 dim=1
+
+ init_node_memory(data,n);
+
+ centre_type filt_centres_out[K];
+ data_type new_centre_positions[K];
+ // more struct-packing
+ #pragma HLS data_pack variable=filt_centres_out
+ #pragma HLS data_pack variable=filt_centres_out
+
+ // iterate over a constant number of outer clustering iterations
+ it_loop: for (uint l=0; l<L; l++) {
+
+ for (centre_index_type i=0; i<=k; i++) {
+ data_type tmp_pos;
+ if (l==0) {
+ tmp_pos = cntr_pos_init[i];
+ } else {
+ tmp_pos = new_centre_positions[i];
+ }
+ for (uint p=0; p<P; p++) {
+ #pragma HLS unroll
+ centre_positions[p*K+i] = tmp_pos;
+ }
+ if (i==k) {
+ break;
+ }
+ }
+
+ // run the clustering kernel
+ lloyds(n, k, filt_centres_out);
+
+ // re-init centre positions
+ update_centres(filt_centres_out, k, new_centre_positions);
+
+ }
+
+ // write clustering output: new cluster centres and distortion
+ output_loop: for (centre_index_type i=0; i<=k; i++) {
+ #pragma HLS pipeline II=1
+ distortion_out[i] = filt_centres_out[i].sum_sq;
+ clusters_out[i].value = new_centre_positions[i].value;
+ if (i==k) {
+ break;
+ }
+ }
+}
+
+
+// load data points from interface into internal memory
+void init_node_memory(volatile data_type *node_data, node_pointer n)
+{
+ #pragma HLS inline
+ for (node_pointer i=0; i<=n; i++) {
+ #pragma HLS pipeline II=1
+ data_int_memory[i] = node_data[i];
+ if (i==n) {
+ break;
+ }
+ }
+}
+
+
+// update the new centre positions after one outer clustering iteration
+void update_centres(centre_type *centres_in,centre_index_type k, data_type *centres_positions_out)
+{
+ #pragma HLS inline
+ centre_update_loop: for (centre_index_type i=0; i<=k; i++) {
+ #pragma HLS pipeline II=1
+ centre_type tmp_cent = Reg(centres_in[i]);
+ coord_type tmp_count = tmp_cent.count;
+ if ( tmp_count == 0 )
+ tmp_count = 1;
+
+ data_type_ext tmp_wgtCent = tmp_cent.wgtCent;
+ data_type tmp_new_pos;
+ for (uint d=0; d<D; d++) {
+ #pragma HLS unroll
+ coord_type_ext tmp_div_ext = (get_coord_type_vector_ext_item(tmp_wgtCent.value,d) / tmp_count); //let's see what it does with that...
+ coord_type tmp_div = (coord_type) tmp_div_ext;
+ #pragma HLS resource variable=tmp_div core=DivnS
+ set_coord_type_vector_item(&tmp_new_pos.value,Reg(tmp_div),d);
+ }
+ centres_positions_out[i] = Reg(tmp_new_pos);
+ if (i==k) {
+ break;
+ }
+ }
+}
+
+
+// main clustering kernel
+void lloyds ( node_pointer n,
+ centre_index_type k,
+ centre_type *centres_out)
+{
+ // register ports of this entity
+ #pragma HLS interface port=n register
+ #pragma HLS interface port=k register
+ #pragma HLS interface port=return register
+ #pragma HLS interface ap_memory register port=data_int_memory
+ #pragma HLS interface ap_memory register port=centre_positions
+
+ // init centre buffer
+ init_centre_buffer_loop: for(centre_index_type i=0; i<=k; i++) {
+ #pragma HLS pipeline II=1
+
+ for (uint p=0; p<P;p++) {
+ #pragma HLS unroll
+ centre_buffer[i+K*p].count = 0;
+ centre_buffer[i+K*p].sum_sq = 0;
+ data_type_ext tmp;
+ for (uint d=0; d<D; d++) {
+ #pragma HLS unroll
+ set_coord_type_vector_ext_item(&tmp.value,0,d);
+ }
+ centre_buffer[i+K*p].wgtCent = tmp;
+ }
+
+ if (i==k) {
+ #pragma HLS occurrence cycle=2
+ break;
+ }
+ }
+
+ data_type u[P];
+ #pragma HLS array_partition variable=u complete
+
+ // iterate over all data points
+ process_data_points_loop: for (node_pointer i=0; i<=n; i+=P) {
+
+ data_fetch_loop: for (uint p=0; p<P; p++) {
+ #pragma HLS pipeline II=1
+ u[p] = data_int_memory[i+p];
+ }
+
+ centre_index_type final_centre_index[P];
+ coord_type_ext sum_sq_out[P];
+
+ // iterate over all centres
+ minsearch_loop: for (centre_index_type ii=0; ii<=k; ii++) {
+ #pragma HLS pipeline II=1
+
+ par_loop: for (uint p=0; p<P; p++) {
+ #pragma HLS unroll
+ coord_type_ext min_dist[P];
+
+ #pragma HLS array_partition variable=final_centre_index complete
+ #pragma HLS array_partition variable=sum_sq_out complete
+ #pragma HLS array_partition variable=min_dist complete
+
+ // help the scheduler by declaring inter-iteration dependencies for these variables as false
+ #pragma HLS dependence variable=u inter false
+ #pragma HLS dependence variable=final_centre_index inter false
+ #pragma HLS dependence variable=sum_sq_out inter false
+ #pragma HLS dependence variable=min_dist inter false
+ #pragma HLS dependence variable=centre_buffer inter false
+ #pragma HLS dependence variable=centre_positions inter false
+
+ if (ii==0) {
+ min_dist[p]=MAX_FIXED_POINT_VAL_EXT;
+ }
+
+ coord_type_ext tmp_dist;
+ compute_distance(centre_positions[p*K+ii], u[p], &tmp_dist);
+
+ // select the centre with the smallest distance to the data point
+ if (tmp_dist<min_dist[p]) {
+ min_dist[p] = tmp_dist;
+ final_centre_index[p] = ii;
+ sum_sq_out[p] = tmp_dist;
+ }
+
+ //printf("%d: i=%d, %d, %d\n",p,i.VAL+p,final_centre_index[p].VAL,sum_sq_out[p].VAL);
+
+ if (ii == k) {
+ // trigger at most every other cycle
+ #pragma HLS occurrence cycle=2
+ if (p==P-1) {
+ break;
+ }
+ }
+ }
+ }
+
+ // after minsearch loop: update centre buffer
+ for (uint p=0; p<P; p++) {
+ #pragma HLS unroll
+ data_type_ext tmp_wgtCent;
+ coord_type_ext tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ uint tmp_idx = (final_centre_index[p]+K*p);
+ // weighted centroid of this centre
+ for (uint d=0; d<D; d++) {
+ #pragma HLS unroll
+ tmp1 = get_coord_type_vector_ext_item(centre_buffer[(tmp_idx)].wgtCent.value,d);
+ //if (i+p<=n) {
+ tmp2 = (coord_type_ext)get_coord_type_vector_item(u[p].value,d);
+ //} else {
+ //tmp2 = 0;
+ //}
+ set_coord_type_vector_ext_item(&tmp_wgtCent.value,Reg(tmp1)+Reg(tmp2),d);
+ }
+ centre_buffer[tmp_idx].wgtCent = tmp_wgtCent;
+
+ // update number of points assigned to centre
+ tmp4 = centre_buffer[(tmp_idx)].count;
+ //if (i+p<=n) {
+ tmp3 = 1;
+ //} else {
+ //tmp3 = 0;
+ //}
+ centre_buffer[tmp_idx].count = Reg(tmp3) + Reg(tmp4);
+
+ //if (i+p<=n) {
+ tmp5 = sum_sq_out[p];
+ //} else {
+ //tmp5 = 0;
+ //}
+ tmp6 = centre_buffer[(tmp_idx)].sum_sq;
+ centre_buffer[tmp_idx].sum_sq = Reg(tmp5) + Reg(tmp6);
+ }
+
+ if (i>=n-P+1) {
+ //if (i>=n) {
+ break;
+ }
+ }
+
+
+ // readout centres
+ read_out_centres_loop: for(centre_index_type i=0; i<=k; i++) {
+ #pragma HLS pipeline II=1
+
+ coord_type_ext arr_count[P];
+ coord_type_ext arr_sum_sq[P];
+ coord_type_vector_ext arr_wgtCent[P];
+ #pragma HLS array_partition variable=arr_count complete
+ #pragma HLS array_partition variable=arr_sum_sq complete
+ #pragma HLS array_partition variable=arr_wgtCent complete
+
+ for (uint p=0; p<P; p++) {
+ #pragma HLS unroll
+ arr_count[p] = ((coord_type_ext)centre_buffer[i+K*p].count);
+ arr_sum_sq[p] = (centre_buffer[i+K*p].sum_sq);
+ arr_wgtCent[p] = (centre_buffer[i+K*p].wgtCent.value);
+ }
+
+ centres_out[i].count = tree_adder(arr_count,P);
+ //printf("%d\n",centres_out[i].count.VAL);
+ centres_out[i].sum_sq = tree_adder(arr_sum_sq,P);
+ coord_type_vector_ext tmp_sum;
+ for (uint d=0; d<D; d++) {
+ #pragma HLS unroll
+ coord_type_ext tmp_a[P];
+ for (uint p=0; p<P; p++) {
+ #pragma HLS unroll
+ tmp_a[p] = get_coord_type_vector_ext_item(arr_wgtCent[p],d);
+ }
+ coord_type_ext tmp = tree_adder(tmp_a,P);
+ set_coord_type_vector_ext_item(&tmp_sum,tmp,d);
+ }
+ centres_out[i].wgtCent.value = tmp_sum;
+
+ if (i==k) {
+ break;
+ }
+ }
+
+}
+
diff --git a/benchmarks/kmeans/lloyds_algorithm_top.h b/benchmarks/kmeans/lloyds_algorithm_top.h
new file mode 100644
index 0000000..1ea372e
--- /dev/null
+++ b/benchmarks/kmeans/lloyds_algorithm_top.h
@@ -0,0 +1,112 @@
+/**********************************************************************
+* Felix Winterstein, Imperial College London
+*
+* File: lloyds_algorithm_top.h
+*
+* Revision 1.01
+* Additional Comments: distributed under a BSD license, see LICENSE.txt
+*
+**********************************************************************/
+
+
+#ifndef LLOYDS_ALGORITHM_TOP_H
+#define LLOYDS_ALGORITHM_TOP_H
+
+#include <math.h>
+#include "ap_int.h" // custom data types
+
+#define D 3 // data dimensionality
+#define N 32768 // max. number of data points
+#define K 256 // max. number of centres
+#define L 6 // max. number of iterations
+#define P 40 // parallelisation degree
+
+#define COORD_BITWIDTH 16
+#define COORD_BITWITDH_EXT 32
+#define NODE_POINTER_BITWIDTH 15 // log2(N)
+#define CNTR_INDEX_BITWIDTH 8 // log2(K)
+
+// pointer types to tree nodes and centre lists
+typedef ap_uint<NODE_POINTER_BITWIDTH> node_pointer;
+typedef ap_uint<CNTR_INDEX_BITWIDTH> centre_index_type;
+
+// force register insertion in the generated RTL for some signals
+#define FORCE_REGISTERS
+// ... used for saturation
+#define MAX_FIXED_POINT_VAL_EXT (1<<(COORD_BITWITDH_EXT-1))-1
+
+typedef unsigned int uint;
+typedef ap_int<COORD_BITWIDTH> coord_type;
+typedef ap_int<D*COORD_BITWIDTH> coord_type_vector;
+typedef ap_int<COORD_BITWITDH_EXT> coord_type_ext;
+typedef ap_int<D*COORD_BITWITDH_EXT> coord_type_vector_ext;
+
+//bit width definitions for multiplications
+#define MUL_INTEGER_BITS 12
+#define MUL_FRACTIONAL_BITS 6
+#define MUL_MAX_VAL (1<<(MUL_INTEGER_BITS+MUL_FRACTIONAL_BITS-1))-1
+#define MUL_MIN_VAL -1*(1<<(MUL_INTEGER_BITS+MUL_FRACTIONAL_BITS-1))
+typedef ap_int<MUL_INTEGER_BITS+MUL_FRACTIONAL_BITS> mul_input_type;
+
+// this should be always 1
+#define FILE_INDEX 1
+
+// data point types
+struct data_type {
+ //coord_type value[D];
+ coord_type_vector value;
+ data_type& operator=(const data_type& a);
+ data_type& operator=(const volatile data_type& a);
+};
+
+
+// data point types ext
+struct data_type_ext {
+ coord_type_vector_ext value;
+ data_type_ext& operator=(const data_type_ext& a);
+};
+
+
+// centre types
+struct centre_type {
+ data_type_ext wgtCent; // sum of all points assigned to this centre
+ coord_type_ext sum_sq; // sum of norm of all points assigned to this centre
+ coord_type count;
+ centre_type& operator=(const centre_type& a);
+};
+typedef centre_type* centre_ptr;
+
+
+#ifdef FORCE_REGISTERS
+template<class T>
+T Reg(T in) {
+ #pragma HLS INLINE off
+ #pragma HLS INTERFACE port=return register
+ return in;
+}
+#else
+template<class T>
+T Reg(T in) {
+ #pragma HLS INLINE
+ return in;
+}
+#endif
+
+
+
+void lloyds_algorithm_top( volatile data_type *data,
+ volatile data_type *cntr_pos_init,
+ node_pointer n,
+ centre_index_type k,
+ volatile coord_type_ext *distortion_out,
+ volatile data_type *clusters_out);
+
+void init_node_memory(volatile data_type *node_data, node_pointer n);
+
+void update_centres(centre_type *centres_in,centre_index_type k, data_type *centres_positions_out);
+
+void lloyds ( node_pointer n,
+ centre_index_type k,
+ centre_type *centres_out);
+
+#endif /* LLOYDS_ALGORITHM_TOP_H */
diff --git a/benchmarks/kmeans/lloyds_algorithm_util.cpp b/benchmarks/kmeans/lloyds_algorithm_util.cpp
new file mode 100644
index 0000000..fa1db19
--- /dev/null
+++ b/benchmarks/kmeans/lloyds_algorithm_util.cpp
@@ -0,0 +1,258 @@
+/**********************************************************************
+* Felix Winterstein, Imperial College London
+*
+* File: lloyds_algorithm_util.cpp
+*
+* Revision 1.01
+* Additional Comments: distributed under a BSD license, see LICENSE.txt
+*
+**********************************************************************/
+
+#include <math.h>
+#include "lloyds_algorithm_util.h"
+
+
+data_type& data_type::operator=(const data_type& a)
+{
+
+ value = a.value;
+ return *this;
+}
+
+data_type& data_type::operator=(const volatile data_type& a)
+{
+
+ value = a.value;
+ return *this;
+}
+
+
+
+data_type_ext& data_type_ext::operator=(const data_type_ext& a)
+{
+ value = a.value;
+ return *this;
+}
+
+
+
+centre_type& centre_type::operator=(const centre_type& a)
+{
+ count = a.count;
+ wgtCent = a.wgtCent;
+ sum_sq = a.sum_sq;
+ //position = a.position;
+ return *this;
+}
+
+
+void set_coord_type_vector_item(coord_type_vector *a, const coord_type b, const uint idx)
+{
+ #pragma HLS function_instantiate variable=idx
+ a->range((idx+1)*COORD_BITWIDTH-1,idx*COORD_BITWIDTH) = b;
+}
+
+
+void set_coord_type_vector_ext_item(coord_type_vector_ext *a, const coord_type_ext b, const uint idx)
+{
+ #pragma HLS function_instantiate variable=idx
+ a->range((idx+1)*COORD_BITWITDH_EXT-1,idx*COORD_BITWITDH_EXT) = b;
+}
+
+
+coord_type get_coord_type_vector_item(const coord_type_vector a, const uint idx)
+{
+ #pragma HLS function_instantiate variable=idx
+ coord_type tmp= a.range((idx+1)*COORD_BITWIDTH-1,idx*COORD_BITWIDTH);
+ return tmp;
+}
+
+
+coord_type_ext get_coord_type_vector_ext_item(const coord_type_vector_ext a, const uint idx)
+{
+ #pragma HLS function_instantiate variable=idx
+ coord_type_ext tmp= a.range((idx+1)*COORD_BITWITDH_EXT-1,idx*COORD_BITWITDH_EXT);
+ return tmp;
+}
+
+
+/* ****** helper functions *******/
+
+
+// conversion from data_type_ext to data_type
+data_type conv_long_to_short(data_type_ext p)
+{
+ #pragma HLS inline
+ data_type result;
+ for (uint d=0; d<D; d++) {
+ #pragma HLS unroll
+ coord_type tmp = (coord_type)get_coord_type_vector_ext_item(p.value,d);
+ set_coord_type_vector_item(&result.value,tmp,d);
+ }
+ return result;
+}
+
+// conversion from data_type to data_type_ext
+data_type_ext conv_short_to_long(data_type p)
+{
+ #pragma HLS inline
+ data_type_ext result;
+ for (uint d=0; d<D; d++) {
+ #pragma HLS unroll
+ coord_type_ext tmp = (coord_type_ext)get_coord_type_vector_item(p.value,d);
+ set_coord_type_vector_ext_item(&result.value,tmp,d);
+ }
+ return result;
+}
+
+
+mul_input_type saturate_mul_input(coord_type_ext val)
+{
+ #pragma HLS inline
+ if (val > MUL_MAX_VAL) {
+ val = MUL_MAX_VAL;
+ } else if (val < MUL_MIN_VAL) {
+ val = MUL_MIN_VAL;
+ }
+ return (mul_input_type)val;
+}
+
+
+// fixed point multiplication with saturation and scaling
+coord_type_ext fi_mul(coord_type_ext op1, coord_type_ext op2)
+{
+ #pragma HLS inline
+ mul_input_type tmp_op1 = saturate_mul_input(op1);
+ mul_input_type tmp_op2 = saturate_mul_input(op2);
+
+ ap_int<2*(MUL_INTEGER_BITS+MUL_FRACTIONAL_BITS)> result_unscaled;
+ result_unscaled = tmp_op1*tmp_op2;
+ #pragma HLS resource variable=result_unscaled core=MulnS
+
+ ap_int<2*(MUL_INTEGER_BITS+MUL_FRACTIONAL_BITS)> result_scaled;
+ result_scaled = result_unscaled >> MUL_FRACTIONAL_BITS;
+ coord_type_ext result;
+ result = (coord_type_ext)result_scaled;
+ return result;
+}
+
+// tree adder
+coord_type_ext tree_adder(coord_type_ext *input_array,const uint m)
+{
+ #pragma HLS inline
+
+ for(uint j=0;j<ceil(log2(m));j++)
+ {
+ #pragma HLS unroll
+ //printf("j = %d\n",j);
+ if (j<ceil(log2(m))-1) {
+ for(uint i = 0; i < uint((m+m-uint(m/(1<<(j)))*(1<<(j)))/(1<<(j+1))); i++)
+ {
+ #pragma HLS unroll
+ coord_type_ext tmp1 = input_array[2*i];
+ coord_type_ext tmp2 = input_array[2*i+1];
+ coord_type_ext tmp3 = tmp1+tmp2;
+ input_array[i] = tmp3;
+ #pragma HLS resource variable=tmp3 core=AddSubnS
+ //printf("[%d] = [%d]+[%d]\n",i,2*i,2*i+1);
+ }
+ if (m > uint((m+m-uint(m/(1<<(j)))*(1<<(j)))/(1<<(j+1)))*(1<<(j+1))) {
+ input_array[uint(m/(1<<(j+1)))] = input_array[uint(m/(1<<(j+1))-1)*2+2];
+ //printf("[%d] = [%d]\n",uint(m/(1<<(j+1))),uint(m/(1<<(j+1))-1)*2+2);
+ }
+ }
+ if (j== ceil(log2(m))-1) {
+ coord_type_ext tmp1 = input_array[0];
+ coord_type_ext tmp2 = input_array[1];
+ coord_type_ext tmp3 = tmp1+tmp2;
+ input_array[0] = tmp3;
+ //printf("[%d] = [%d]+[%d]\n",0,0,1);
+ #pragma HLS resource variable=tmp3 core=AddSubnS
+ }
+ }
+ return input_array[0];
+}
+
+
+// tree compare select
+void tree_cs(coord_type_ext *input_array,centre_index_type *index_array,coord_type_ext *res_val,centre_index_type *res_idx,const uint m)
+{
+ #pragma HLS inline
+
+ for(uint j=0;j<ceil(log2(m));j++)
+ {
+ if (j<ceil(log2(m))-1) {
+ for(uint i = 0; i < uint(m/(1<<(j+1))); i++)
+ {
+ coord_type_ext tmp1 = input_array[2*i];
+ coord_type_ext tmp1_idx = index_array[2*i];
+ coord_type_ext tmp2 = input_array[2*i+1];
+ coord_type_ext tmp2_idx = index_array[2*i+1];
+ coord_type_ext tmp3;
+ centre_index_type tmp3_idx;
+ if (tmp1 < tmp2) {
+ tmp3 = tmp1;
+ tmp3_idx = tmp1_idx;
+ } else {
+ tmp3 = tmp2;
+ tmp3_idx = tmp2_idx;
+ }
+
+ input_array[i] = tmp3;
+ index_array[i] = (tmp3_idx);
+ }
+ if (m > uint(m/(1<<(j+1)))*(1<<(j+1)) ) {
+ input_array[uint(m/(1<<(j+1)))] = (input_array[uint(m/(1<<(j+1))-1)*2+2]);
+ index_array[uint(m/(1<<(j+1)))] = (index_array[uint(m/(1<<(j+1))-1)*2+2]);
+ }
+ }
+ if (j== ceil(log2(m))-1) {
+ coord_type_ext tmp1 = input_array[0];
+ coord_type_ext tmp1_idx = index_array[0];
+ coord_type_ext tmp2 = input_array[1];
+ coord_type_ext tmp2_idx = index_array[1];
+ coord_type_ext tmp3;
+ centre_index_type tmp3_idx;
+ if (tmp1 < tmp2) {
+ tmp3 = tmp1;
+ tmp3_idx = tmp1_idx;
+ } else {
+ tmp3 = tmp2;
+ tmp3_idx = tmp2_idx;
+ }
+ input_array[0] = (tmp3);
+ index_array[0] = (tmp3_idx);
+ }
+ }
+ *res_val= input_array[0];
+ *res_idx= index_array[0];
+}
+
+
+// compute the Euclidean distance
+void compute_distance(data_type p1, data_type p2, coord_type_ext *dist)
+{
+ #pragma HLS inline
+
+ data_type tmp_p1 = p1;
+ data_type tmp_p2 = p2;
+ coord_type_ext tmp_mul_res[D];
+
+ for (uint d=0; d<D; d++) {
+ #pragma HLS unroll
+ coord_type tmp_sub1 = get_coord_type_vector_item(tmp_p1.value,d);
+ coord_type tmp_sub2 = get_coord_type_vector_item(tmp_p2.value,d);
+ coord_type tmp = tmp_sub1 - tmp_sub2;
+ coord_type_ext tmp_ext = (coord_type_ext)tmp;
+ coord_type_ext tmp_mul = fi_mul(tmp_ext,tmp_ext);
+ tmp_mul_res[d] = tmp_mul;
+ #pragma HLS resource variable=tmp core=AddSubnS
+ //#pragma HLS resource variable=tmp_mul core=MulnS
+ }
+
+ *dist = tree_adder(tmp_mul_res,D);
+}
+
+
+
+
diff --git a/benchmarks/kmeans/lloyds_algorithm_util.h b/benchmarks/kmeans/lloyds_algorithm_util.h
new file mode 100644
index 0000000..3853c3d
--- /dev/null
+++ b/benchmarks/kmeans/lloyds_algorithm_util.h
@@ -0,0 +1,33 @@
+/**********************************************************************
+* Felix Winterstein, Imperial College London
+*
+* File: lloyds_algorithm_util.h
+*
+* Revision 1.01
+* Additional Comments: distributed under a BSD license, see LICENSE.txt
+*
+**********************************************************************/
+
+#ifndef LLOYDS_ALGORITHM_UTIL_H
+#define LLOYDS_ALGORITHM_UTIL_H
+
+#include "lloyds_algorithm_top.h"
+
+
+// helper functions
+void set_coord_type_vector_item(coord_type_vector *a, const coord_type b, const uint idx);
+void set_coord_type_vector_ext_item(coord_type_vector_ext *a, const coord_type_ext b, const uint idx);
+coord_type get_coord_type_vector_item(const coord_type_vector a, const uint idx);
+coord_type_ext get_coord_type_vector_ext_item(const coord_type_vector_ext a, const uint idx);
+
+data_type conv_long_to_short(data_type_ext p);
+data_type_ext conv_short_to_long(data_type p);
+mul_input_type saturate_mul_input(coord_type_ext val);
+coord_type_ext fi_mul(coord_type_ext op1, coord_type_ext op2);
+coord_type_ext tree_adder(coord_type_ext *input_array);
+void tree_cs(coord_type_ext *input_array,centre_index_type *index_array,coord_type_ext *res_val,centre_index_type *res_idx,const uint m);
+coord_type_ext tree_adder(coord_type_ext *input_array,const uint m);
+void compute_distance(data_type p1, data_type p2, coord_type_ext *dist);
+
+
+#endif /* LLOYDS_ALGORITHM_UTIL_H */