c++ - Optimizing 1D Convolution -

- September 15, 2012

Is there a way to speed up this 1D convolution? I tried to make dy cache-efficient, but compiling with g++ and -O3 gave worse performance.

I am convolving with [-1., 0., 1.] in both directions. This is not homework.

#include <cstdlib>
#include <iostream>
#include <vector>

#include <sys/time.h>

// GCC/Clang spelling of C99 `restrict`: the caller promises that the input and
// output matrices passed to dx_matrix/dy_matrix do not overlap.
#define restrict __restrict__

/// Prints a row-major `height` x `width` matrix, one row per line, with a
/// comma after every element.
void print_matrix(int height, int width, float *matrix) {
  for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i++) {
      std::cout << matrix[j * width + i] << ",";
    }
    std::cout << std::endl;
  }
}

/// Fills a row-major `height` x `width` matrix with rand() / RAND_MAX,
/// i.e. pseudo-random values in [0, 1].
void fill_matrix(int height, int width, float *matrix) {
  for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i++) {
      matrix[j * width + i] =
          static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    }
  }
}

/// Horizontal convolution with the kernel [-1, 0, 1].
///
/// For every row and every interior column i (1 <= i <= width-2) writes
/// `in[i+1] - in[i-1]` to `out_matrix`; the first and last column of
/// `out_matrix` are not written. `*min` / `*max` receive the smallest and
/// largest value written.
///
/// Requires width >= 3 and height >= 1 (main() enforces both >= 3).
void dx_matrix(int height, int width, float * restrict in_matrix,
               float * restrict out_matrix, float *min, float *max) {
  // Seed min/max with the first value the loop actually produces (row 0,
  // column 1). Seeding with a difference that is never written to the output
  // could report a min/max that does not occur in the result.
  *min = *max = -1.f * in_matrix[0] + in_matrix[2];

  for (int j = 0; j < height; j++) {
    float *row = in_matrix + j * width;
    for (int i = 1; i < width - 1; i++) {
      // -1.f * left + 0.f * centre + 1.f * right
      float res = -1.f * row[i - 1] + row[i + 1];
      if (res > *max) *max = res;
      if (res < *min) *min = res;
      out_matrix[j * width + i] = res;
    }
  }
}

/// Vertical convolution with the kernel [-1, 0, 1].
///
/// For every interior row j (1 <= j <= height-2) and every column writes
/// `in[j+1][i] - in[j-1][i]` to `out_matrix`; the first and last row of
/// `out_matrix` are not written. `*min` / `*max` receive the smallest and
/// largest value written.
///
/// Requires height >= 3 and width >= 1 (main() enforces both >= 3).
void dy_matrix(int height, int width, float * restrict in_matrix,
               float * restrict out_matrix, float *min, float *max) {
  // Seed min/max with the first value the loop actually produces (row 1,
  // column 0), which reads rows 0 and 2.
  *min = *max = -1.f * in_matrix[0] + in_matrix[2 * width];

  for (int j = 1; j < height - 1; j++) {
    for (int i = 0; i < width; i++) {
      float res =
          -1.f * in_matrix[(j - 1) * width + i] + in_matrix[(j + 1) * width + i];
      if (res > *max) *max = res;
      if (res < *min) *min = res;
      out_matrix[j * width + i] = res;
    }
  }
}

/// Returns the current time of day from gettimeofday() in seconds, with the
/// microsecond field as the fractional part.
double now(void) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}

/// Usage: <program> width height
///
/// Fills a width x height matrix with pseudo-random values (fixed seed 123),
/// runs the horizontal and then the vertical convolution, and prints the
/// min/max of each pass and the elapsed time for both passes together.
/// Returns -1 on missing arguments or when either dimension is below 3.
int main(int argc, char **argv) {
  if (argc < 3) {
    std::cout << "usage: " << argv[0] << " width height" << std::endl;
    return -1;
  }

  srand(123);

  int width = atoi(argv[1]);
  int height = atoi(argv[2]);

  std::cout << "width:" << width << " height:" << height << std::endl;

  if (width < 3) {
    std::cout << "width too short " << std::endl;
    return -1;
  }
  if (height < 3) {
    std::cout << "height too short " << std::endl;
    return -1;
  }

  // Compute the element count in size_t so the product is not done in int;
  // std::vector releases the buffers on every return path.
  const std::size_t count =
      static_cast<std::size_t>(height) * static_cast<std::size_t>(width);
  std::vector<float> in_matrix(count);
  std::vector<float> out_matrix(count);

  fill_matrix(height, width, &in_matrix[0]);
  // print_matrix(height, width, &in_matrix[0]);

  float dx_min, dx_max;
  float dy_min, dy_max;

  // Only the two convolutions are timed; printing happens afterwards so that
  // console I/O does not pollute the measurement. Both passes write into the
  // same out_matrix.
  double a = now();
  dx_matrix(height, width, &in_matrix[0], &out_matrix[0], &dx_min, &dx_max);
  dy_matrix(height, width, &in_matrix[0], &out_matrix[0], &dy_min, &dy_max);
  double b = now();

  std::cout << "dx min:" << dx_min << " max:" << dx_max << std::endl;
  std::cout << "dy min:" << dy_min << " max:" << dy_max << std::endl;
  std::cout << "time: " << b - a << " sec" << std::endl;

  return 0;
}

First of all, rewrite the dy loop to get rid of "[ (j-1) * width + i]" and "in_matrix[ (j+1) * width + i]", like this:

  float *p, *q, *out;
  p = &in_matrix[(j-1)*width];
  q = &in_matrix[(j+1)*width];
  out = &out_matrix[j*width];
  for (int i = 0; i < width; i++) {
    float res = -1.f * p[i] + q[i];
    if (res > *max) *max = res;
    if (res < *min) *min = res;
    out[i] = res;
  }

But this is a trivial optimization that the compiler may already be doing for you.

It will be slightly faster to compute "q[i]-p[i]" instead of "-1.f*p[i]+q[i]", but, again, the compiler may be smart enough to do that behind your back.

The whole thing would benefit considerably from SSE2 and multithreading. I'd bet on at least a 3x speedup from SSE2 right away. Multithreading can be added using OpenMP, and it will take only a few lines of code.

Search This Blog

ERT

c++ - Optimizing 1D Convolution -

Comments

Post a Comment

Popular posts from this blog

ASP.NET/SQL find the element ID and update database -

c++ - Compiling static TagLib 1.6.3 libraries for Windows -

PostgreSQL 9.x - pg_read_binary_file & inserting files into bytea -