conv:conv
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revisionNext revisionBoth sides next revision | ||
conv:conv [2008/01/04 12:32] – deva | conv:conv [2008/01/24 12:08] – deva | ||
---|---|---|---|
Line 1: | Line 1: | ||
- | =====Convolution implementation pages===== | + | =====Convolution implementation pages (Spring 2008 studies group under Ole Caprani)===== |
+ | Our goal is to implement a realtime plugin to do convolution.\\ | ||
+ | The basic "Input Side" is way too slow (convolution implemented with two for loops), so we will be experimenting with a dft-multiply-idft algorithm (sometimes called "The Overlap-and-Save algorithm" | ||
+ | To do the actual fft we will use the fftw3 library (http:// | ||
+ | The next step is to implement at test the " | ||
+ | |||
+ | ====Group==== | ||
+ | * Jonas Suhr Christensen - // | ||
+ | * Bent Bisballe Nyeng - // | ||
+ | |||
+ | ====Analysis==== | ||
+ | The following graphs show the execution time of a single convolution iteration, using the overlap and save algorithm with various optimizations enabled.\\ | ||
+ | The timings are produced using a Intel(R) Core(TM)2 Duo CPU E6750 @ 2.66GHz, with 4096KB cache.\\ | ||
+ | The algorithm optimizations are named as follows: | ||
+ | * **Double** This is an implementation with no particular optimizations. | ||
+ | * **Float** This is a simple implementation using float as the internal datatype. | ||
+ | * **Real Double** This is an implementation using real ffts instead of complex. | ||
+ | * **Real Float** This is an implementation using real ffts instead of complex, with float as internal datatype. | ||
+ | * **MT Double** This is an implementation running on multiple CPUs. | ||
+ | * **MT Float** This is an implementation with float as internal datatype, running on multiple CPUs. | ||
+ | * **MT Real Double** This is an implementation using real ffts instead of complex, running on multiple CPUs. | ||
+ | * **MT Real Float** This is an implementation using real ffts instead of complex, with float as internal datatype, running on multiple CPUs. | ||
+ | {{: | ||
+ | {{: | ||
+ | {{: | ||
+ | {{: | ||
+ | {{: | ||
+ | As it is seen on the graphs the speed improves with the buffer size increasing, up to 1024 samples. At this point the iteration time starts fluctuating. This may be caused by internal cache size on the given CPU. | ||
====Links==== | ====Links==== | ||
- | * http:// | + | |
- | * http:// | + | |
- | * http:// | + | * http:// |
- | * http:// | + | * http:// |
+ | * http:// | ||
+ | * http:// | ||
+ | * http:// | ||
+ | * http:// | ||
====Code==== | ====Code==== | ||
Compile with: | Compile with: | ||
< | < | ||
- | g++ conv.cc -o conv -lfftw | + | g++ -O2 -mmmx -msse -msse2 conv_opt.cc -o conv_opt -lfftw3 -lsndfile |
+ | </ | ||
+ | or | ||
+ | < | ||
+ | g++ -O2 -mmmx -msse -msse2 conv_opt.cc -o conv_opt -DFFTW_FLOAT -lfftw3f | ||
</ | </ | ||
+ | to use floating point fftw instead of double.\\ | ||
+ | The actual code is shown below: | ||
<code c> | <code c> | ||
#include < | #include < | ||
#include < | #include < | ||
+ | #include < | ||
+ | #include < | ||
+ | #include < | ||
+ | #include " | ||
- | // | + | //#define BUFFER_SIZE_MS 113 //ms 113, // Find a size with small prime factors |
+ | #define BUFFER_SIZE_MS 1000 //ms 113, // Find a size with small prime factors | ||
+ | #define SAMPLE_RATE_HZ 44100 //hz | ||
+ | #define BUFFER_SIZE_SAMPLES 4096 | ||
+ | // | ||
+ | |||
+ | //#define MEASURE | ||
+ | |||
+ | #ifdef FFTW_FLOAT | ||
+ | #define FFTW_DATATYPE fftwf_complex | ||
+ | #define FFTW_PLAN_DFT fftwf_plan_dft_1d | ||
+ | #define FFTW_EXECUTE fftwf_execute | ||
+ | #define FFTW_PLAN fftwf_plan | ||
+ | #define FFTW_DESTROY_PLAN fftwf_destroy_plan | ||
+ | #define FFTW_FREE fftwf_free | ||
+ | #define FFTW_MALLOC fftwf_malloc | ||
+ | typedef | ||
+ | #else | ||
+ | #define FFTW_DATATYPE fftw_complex | ||
+ | #define FFTW_PLAN_DFT fftw_plan_dft_1d | ||
+ | #define FFTW_EXECUTE fftw_execute | ||
+ | #define FFTW_PLAN fftw_plan | ||
+ | #define FFTW_DESTROY_PLAN fftw_destroy_plan | ||
+ | #define FFTW_FREE fftw_free | ||
+ | #define FFTW_MALLOC fftw_malloc | ||
typedef double sample_t; | typedef double sample_t; | ||
+ | #endif | ||
typedef struct { | typedef struct { | ||
Line 34: | Line 100: | ||
} | } | ||
- | samples_t *read_data(const char *filename) | + | samples_t *read_data(const char *filename, double scale) |
{ | { | ||
- | size_t size = 3; | + | printf(" |
+ | |||
+ | SndfileHandle sfh(filename); | ||
+ | size_t size = sfh.seek(0, SEEK_END); | ||
+ | sfh.seek(0, | ||
+ | // size_t size = 3; | ||
+ | | ||
sample_t *data = (sample_t*)malloc(sizeof(sample_t) * size); | sample_t *data = (sample_t*)malloc(sizeof(sample_t) * size); | ||
- | data[0] = 1; | + | sfh.read(data, size); |
- | data[1] = 1; | + | |
- | data[2] = 1; | + | printf(" |
+ | |||
+ | for(int i = 0; i < size; i++) { | ||
+ | data[i] = scale * data[i]; | ||
+ | } | ||
+ | |||
+ | // for(int i = size / 2; i < (size / 2) +20; i++) printf(" | ||
samples_t *samples = new_samples(data, | samples_t *samples = new_samples(data, | ||
+ | printf(" | ||
+ | |||
return samples; | return samples; | ||
} | } | ||
Line 48: | Line 129: | ||
void write_data(const char *filename, samples_t *samples) | void write_data(const char *filename, samples_t *samples) | ||
{ | { | ||
- | for(int i = 0; i < samples-> | + | printf(" |
- | printf(" | + | |
- | } | + | // for(int i = samples-> |
+ | |||
+ | SndfileHandle sfh(filename, | ||
+ | |||
+ | sfh.write(samples-> | ||
+ | |||
+ | printf(" | ||
} | } | ||
Line 56: | Line 143: | ||
#define IM 1 | #define IM 1 | ||
- | samples_t | + | |
+ | static FFTW_DATATYPE | ||
+ | static FFTW_DATATYPE | ||
+ | static FFTW_DATATYPE *out = NULL; | ||
+ | static FFTW_DATATYPE *inx = NULL; | ||
+ | static FFTW_DATATYPE *xfft = NULL; | ||
+ | void init_fftw(samples_t *h) | ||
{ | { | ||
- | int size = x->size + h->size - 1; | + | FFTW_PLAN p; |
- | sample_t *data = (sample_t*)calloc(sizeof(sample_t), | + | |
- | samples_t | + | int size = BUFFER_SIZE_SAMPLES + h->size - 1; |
+ | hfft = (FFTW_DATATYPE*) FFTW_MALLOC(sizeof(FFTW_DATATYPE) * size); | ||
+ | FFTW_DATATYPE *inh = (FFTW_DATATYPE*) FFTW_MALLOC(sizeof(FFTW_DATATYPE) * size); | ||
- | // FFTW | + | for(int i = 0; i < size; i++) { |
- | fftw_plan p; | + | inh[i][RE] = inh[i][IM] = 0; |
- | + | } | |
- | int N = x->size; | + | for(int i = 0; i < h-> |
+ | inh[i][RE] = h-> | ||
+ | } | ||
- | fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N); | + | p = FFTW_PLAN_DFT(size, |
- | fftw_complex *xfft = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N); | + | |
- | fftw_complex *hfft = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N); | + | FFTW_DESTROY_PLAN(p); |
- | + | ||
- | p = fftw_plan_dft_r2c_1d(x->size, x->data, xfft, 0); | + | |
- | fftw_execute(p); | + | |
- | + | ||
- | /* | + | |
- | for(int i = 0; i < N; i++) { | + | |
- | printf(" | + | |
- | } | + | |
- | */ | + | |
- | + | ||
- | p = fftw_plan_dft_r2c_1d(h-> | + | |
- | fftw_execute(p); | + | |
- | + | ||
- | for(int i = 0; i < N; i++ ) { | + | |
- | out[i][RE] = hfft[i][RE] * xfft[i][RE] - hfft[i][IM] * xfft[i][IM]; | + | |
- | out[i][IM] = xfft[i][IM] * hfft[i][RE] + xfft[i][RE] * hfft[i][IM]; | + | |
- | } | + | |
- | + | ||
- | // invers FFTW | + | |
- | + | ||
- | p = fftw_plan_dft_c2r_1d(N, | + | |
- | fftw_execute(p); | + | |
- | for(int i = 0; i < N; i++ ) { | + | FFTW_FREE(inh); |
- | data[i] /= N; | + | |
- | } | + | |
- | fftw_destroy_plan(p); | + | outm = (FFTW_DATATYPE*) FFTW_MALLOC(sizeof(FFTW_DATATYPE) * size); |
- | fftw_free(xfft); | + | out = (FFTW_DATATYPE*) FFTW_MALLOC(sizeof(FFTW_DATATYPE) * size); |
- | fftw_free(hfft); | + | inx = (FFTW_DATATYPE*) FFTW_MALLOC(sizeof(FFTW_DATATYPE) * size); |
- | // end of FFTW | + | for(int i = 0; i < size; i++) { |
+ | inx[i][RE] = inx[i][IM] = 0; | ||
+ | } | ||
+ | |||
+ | | ||
+ | |||
+ | |||
+ | } | ||
+ | |||
+ | void deinit_fftw() | ||
+ | { | ||
+ | FFTW_FREE(hfft); | ||
+ | | ||
+ | FFTW_FREE(outm); | ||
+ | FFTW_FREE(inx); | ||
+ | FFTW_FREE(xfft); | ||
+ | } | ||
+ | |||
+ | #ifdef MEASURE | ||
+ | long static time_used; | ||
+ | #endif | ||
+ | samples_t *conv_fftwc_preprocess(samples_t *x, size_t hsize/*samples_t *h*/) | ||
+ | { | ||
+ | assert(hfft); | ||
+ | assert(out); | ||
+ | assert(outm); | ||
+ | assert(inx); | ||
+ | assert(xfft); | ||
+ | |||
+ | #ifdef MEASURE | ||
+ | struct timeb systime; | ||
+ | long begin = 0; | ||
+ | long end = 0; | ||
+ | ftime(& | ||
+ | begin = systime.time * 1000 + systime.millitm; | ||
+ | #endif | ||
+ | |||
+ | FFTW_PLAN p; | ||
+ | |||
+ | int size = x->size + hsize - 1; | ||
+ | sample_t *data = (sample_t*)calloc(sizeof(sample_t), | ||
+ | samples_t *y = new_samples(data, | ||
+ | for(int i = 0; i < x->size; i++) { | ||
+ | inx[i][RE] = x-> | ||
+ | } | ||
+ | |||
+ | p = FFTW_PLAN_DFT(size, | ||
+ | FFTW_EXECUTE(p); | ||
+ | |||
+ | |||
+ | for(int i = 0; i < size; i++ ) { | ||
+ | outm[i][RE] = hfft[i][RE] * xfft[i][RE] - hfft[i][IM] * xfft[i][IM]; | ||
+ | outm[i][IM] = xfft[i][IM] * hfft[i][RE] + xfft[i][RE] * hfft[i][IM]; | ||
+ | } | ||
+ | |||
+ | |||
+ | p = FFTW_PLAN_DFT(size, | ||
+ | FFTW_EXECUTE(p); | ||
+ | |||
+ | |||
+ | |||
+ | for(int i = 0; i < size; i++) { | ||
+ | data[i] = out[i][RE] / size; | ||
+ | } | ||
+ | |||
+ | FFTW_DESTROY_PLAN(p); | ||
+ | |||
+ | #ifdef MEASURE | ||
+ | ftime(& | ||
+ | end = systime.time * 1000 + systime.millitm; | ||
+ | printf(" | ||
+ | time_used = end - begin; | ||
+ | #endif | ||
+ | |||
+ | |||
return y; | return y; | ||
} | } | ||
- | samples_t | + | sample_t |
{ | { | ||
- | int size = x-> | + | unsigned |
- | sample_t *data = (sample_t*)calloc(sizeof(sample_t), size); | + | static |
+ | static unsigned int p = 0; | ||
- | samples_t *y = new_samples(data, size); | + | samples_t *input_s |
+ | samples_t *y; | ||
+ | |||
+ | y = conv_fftwc_preprocess(input_s, | ||
+ | |||
+ | for(int i = 0; i < BUFFER_SIZE_SAMPLES; | ||
+ | buffer[(i + p + filter_size - 1) % buffer_size] = 0; // | ||
+ | } | ||
+ | |||
+ | for(int i = 0/* BUFFER_SIZE_SAMPLES*/; | ||
+ | buffer[(i + p) % buffer_size] += y-> | ||
+ | } | ||
- | for(int xn = 0; xn < x->size; xn++) { | + | // alloker returbuffer |
- | for(int | + | sample_t *output_buffer = (sample_t*)malloc(BUFFER_SIZE_SAMPLES * sizeof(sample_t)); |
- | y-> | + | |
- | } | + | // kopier retsultat til output buffer |
+ | for(int | ||
+ | output_buffer[i] = buffer[(i + p) % buffer_size]; | ||
} | } | ||
- | return | + | p += BUFFER_SIZE_SAMPLES; |
+ | |||
+ | // frigør | ||
+ | free(input_s); | ||
+ | |||
+ | return output_buffer; | ||
} | } | ||
+ | sample_t puls = 0.5; | ||
int main(int argc, char *argv[]) | int main(int argc, char *argv[]) | ||
{ | { | ||
// | // | ||
+ | if(argc != 4) { | ||
+ | printf(" | ||
+ | exit(1); | ||
+ | } | ||
+ | |||
int xsz, hsz; | int xsz, hsz; | ||
- | samples_t *h = read_data("h.dat"); | + | const char *filterfile = argv[1]; |
- | samples_t *x = read_data("x.dat"); | + | const char *inputfile = argv[2]; |
+ | const char *outputfile = argv[3]; | ||
+ | |||
+ | samples_t *h = read_data(filterfile, 0.08); | ||
+ | // | ||
+ | samples_t *x = read_data(inputfile, 0.5); | ||
+ | // | ||
+ | |||
+ | |||
+ | size_t s = h->size + x->size - 1; | ||
+ | sample_t *d = (sample_t*)malloc(s | ||
+ | |||
+ | samples_t *y = new_samples(d, | ||
+ | |||
+ | |||
+ | // 0 padding | ||
+ | sample_t *pd = (sample_t*)malloc((y-> | ||
+ | for(int i = 0; i < x->size; i++) { | ||
+ | pd[i] = x-> | ||
+ | } | ||
+ | for(int i = x->size; i < y->size + (y->size % BUFFER_SIZE_SAMPLES); | ||
+ | pd[i] = 0; | ||
+ | } | ||
+ | x = new_samples(pd, | ||
+ | |||
+ | init_fftw(h); | ||
+ | |||
+ | int e = 0; | ||
+ | #ifdef MEASURE | ||
+ | // find a good delta value (ie. small prime factors) | ||
+ | int o = 20000; | ||
+ | for(int i = 0; i < 20; i++) { | ||
+ | printf("Measuring %d: ", i); | ||
+ | filter(& | ||
+ | if(time_used < o) { e = i; o = time_used; } | ||
+ | } | ||
+ | #endif | ||
+ | |||
+ | for(int i = 0; i < y->size; i += BUFFER_SIZE_SAMPLES) { | ||
+ | #ifdef MEASURE | ||
+ | printf(" | ||
+ | #endif | ||
+ | sample_t *data = filter(& | ||
+ | |||
+ | for(int j = 0; j < BUFFER_SIZE_SAMPLES; | ||
+ | y-> | ||
+ | } | ||
+ | |||
+ | free(data); | ||
+ | } | ||
- | printf(" | + | deinit_fftw(); |
- | samples_t *y = conv_simple(x, | + | |
- | write_data(" | + | |
- | free(y); | + | |
+ | write_data(outputfile, | ||
- | printf(" | ||
- | y = conv_fftw(x, | ||
- | write_data(" | ||
free(y); | free(y); | ||
conv/conv.txt · Last modified: 2008/09/04 12:09 by deva