c++ - Optimizing 1D Convolution -
Is there a way to speed up this 1D convolution? I tried to make the dy pass cache-efficient, but compiling with g++ and -O3 gave worse performance.
I am convolving with [-1., 0., 1.] in both directions. This is not homework.
#include <iostream>
#include <cstdlib>
#include <sys/time.h>

// GCC/Clang spelling of the C99 `restrict` qualifier: promises the compiler
// that the input and output buffers do not overlap.
#define restrict __restrict__

// Prints a row-major height x width matrix, one row per line, with a ","
// after every element (including the last one of each row).
void print_matrix(int height, int width, float *matrix) {
    for (int j = 0; j < height; j++) {
        for (int i = 0; i < width; i++) {
            std::cout << matrix[j * width + i] << ",";
        }
        std::cout << std::endl;
    }
}

// Fills a row-major height x width matrix with rand() / RAND_MAX values.
void fill_matrix(int height, int width, float *matrix) {
    for (int j = 0; j < height; j++) {
        for (int i = 0; i < width; i++) {
            matrix[j * width + i] = ((float)rand() / (float)RAND_MAX);
        }
    }
}

// Horizontal [-1, 0, 1] convolution of a row-major height x width matrix.
//
// For every row j and every interior column i in [1, width-2] writes
//   out_matrix[j*width + i] = in_matrix[j*width + i+1] - in_matrix[j*width + i-1]
// The first and last column of out_matrix are not written.
// *min / *max receive the smallest / largest value written.
//
// Requires width >= 3 and height >= 1 (the seed reads in_matrix[2]).
void dx_matrix(int height, int width,
               float * restrict in_matrix, float * restrict out_matrix,
               float *min, float *max) {
    // Seed with the first value the loop actually produces (j=0, i=1) so the
    // reported range can never contain a value that is not in the output.
    float lo = in_matrix[2] - in_matrix[0];
    float hi = lo;

    for (int j = 0; j < height; j++) {
        const float *row = in_matrix + j * width;
        float *out = out_matrix + j * width;
        for (int i = 1; i < width - 1; i++) {
            // -1.f * row[i-1] + 0.f * row[i] + 1.f * row[i+1]
            const float res = row[i + 1] - row[i - 1];
            if (res > hi) hi = res;
            if (res < lo) lo = res;
            out[i] = res;
        }
    }

    // min/max are tracked in locals and stored once: the out-pointers are not
    // restrict-qualified, so updating them inside the loop would force a
    // reload/store on every iteration.
    *min = lo;
    *max = hi;
}

// Vertical [-1, 0, 1] convolution of a row-major height x width matrix.
//
// For every interior row j in [1, height-2] and every column i writes
//   out_matrix[j*width + i] = in_matrix[(j+1)*width + i] - in_matrix[(j-1)*width + i]
// The first and last row of out_matrix are not written.
// *min / *max receive the smallest / largest value written.
//
// Requires height >= 3 and width >= 1 (the seed reads in_matrix[2*width]).
void dy_matrix(int height, int width,
               float * restrict in_matrix, float * restrict out_matrix,
               float *min, float *max) {
    // Seed with the first value the loop actually produces (j=1, i=0).
    float lo = in_matrix[2 * width] - in_matrix[0];
    float hi = lo;

    for (int j = 1; j < height - 1; j++) {
        // Row base pointers are computed once per row so the inner loop is a
        // plain linear walk over three contiguous rows.
        const float *above = in_matrix + (j - 1) * width;
        const float *below = in_matrix + (j + 1) * width;
        float *out = out_matrix + j * width;
        for (int i = 0; i < width; i++) {
            const float res = below[i] - above[i];
            if (res > hi) hi = res;
            if (res < lo) lo = res;
            out[i] = res;
        }
    }

    *min = lo;
    *max = hi;
}

// Returns the current wall-clock time in seconds, as reported by gettimeofday().
double now(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}

// Usage: <program> width height
// Fills a random matrix, runs the dx and dy convolutions, and prints the
// value range of each result together with the elapsed time.
int main(int argc, char **argv) {
    int width, height;
    float *in_matrix;
    float *out_matrix;

    if (argc < 3) {
        std::cout << argv[0] << "usage: width height " << std::endl;
        return -1;
    }

    srand(123);  // fixed seed so runs are reproducible
    width = atoi(argv[1]);
    height = atoi(argv[2]);
    std::cout << "width:" << width << " height:" << height << std::endl;

    // Both kernels need at least one interior element in their direction.
    if (width < 3) {
        std::cout << "width short " << std::endl;
        return -1;
    }
    if (height < 3) {
        std::cout << "height short " << std::endl;
        return -1;
    }

    // Compute the byte count in size_t so large dimensions do not overflow int.
    const size_t bytes = (size_t)height * (size_t)width * sizeof(float);
    in_matrix = (float *) malloc(bytes);
    out_matrix = (float *) malloc(bytes);
    if (in_matrix == NULL || out_matrix == NULL) {
        std::cerr << "allocation failed" << std::endl;
        free(in_matrix);
        free(out_matrix);
        return -1;
    }

    fill_matrix(height, width, in_matrix);
    //print_matrix(height, width, in_matrix);

    float min, max;
    double a = now();
    dx_matrix(height, width, in_matrix, out_matrix, &min, &max);
    std::cout << "dx min:" << min << " max:" << max << std::endl;
    dy_matrix(height, width, in_matrix, out_matrix, &min, &max);
    double b = now();
    std::cout << "dy min:" << min << " max:" << max << std::endl;
    std::cout << "time: " << b - a << " sec" << std::endl;

    free(out_matrix);
    free(in_matrix);
    return 0;
}
First of all, I would rewrite the dy loop to get rid of "[ (j-1) * width + i]" and "in_matrix[ (j+1) * width + i]", like this:
float* p, *q, *out; p = &in_matrix[(j-1)*width]; q = &in_matrix[(j+1)*width]; out = &out_matrix[j*width]; (int i=0; < width; i++){ float res = -1.f * p[i] + q[i] ; if (res > *max ) *max = res; if (res < *min ) *min = res; out[i] = res; }
But this is a trivial optimization that the compiler may already be doing for you.
It may be faster to write "q[i]-p[i]" instead of "-1.f*p[i]+q[i]", but, again, the compiler may be smart enough to do that behind your back.
The whole thing would benefit considerably from SSE2 and multithreading. I'd bet on at least a 3x speedup from SSE2 right away. Multithreading can be added using OpenMP and would take only a few lines of code.
Comments
Post a Comment