diff options
Diffstat (limited to 'noatun-plugins/nexscope/convolve.c')
-rw-r--r-- | noatun-plugins/nexscope/convolve.c | 297 |
1 files changed, 297 insertions, 0 deletions
diff --git a/noatun-plugins/nexscope/convolve.c b/noatun-plugins/nexscope/convolve.c new file mode 100644 index 0000000..03509eb --- /dev/null +++ b/noatun-plugins/nexscope/convolve.c @@ -0,0 +1,297 @@ +/* Karatsuba convolution + * + * Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +/* The algorithm is based on the following. For the convolution of a pair + * of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four + * multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c - + * b.d. A similar relation enables us to compute a 2n by 2n convolution + * using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n + * multiplications (as opposed to the 4^n that the quadratic algorithm + * takes. */ + +/* For large n, this is slower than the O(n log n) that the FFT method + * takes, but we avoid using complex numbers, and we only have to compute + * one convolution, as opposed to 3 FFTs. We have good locality-of- + * reference as well, which will help on CPUs with tiny caches. */ + +/* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160 + * (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba + * algorithm. We actually want 257 outputs of a 256 x 512 convolution; + * that doesn't appear to give an easy advantage for the FFT algorithm, but + * for the Karatsuba algorithm, it's easy to use two 256 x 256 + * convolutions, taking 2 x 3^8 = 12312 multiplications. [This difference + * is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n, + * while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n + * - 1]. */ + +/* There's a big lie above, actually... for a 4x4 convolution, it's quicker + * to do it using 16 multiplications than the more complex Karatsuba + * algorithm... So the recursion bottoms out at 4x4s. This increases the + * number of multiplications by a factor of 16/9, but reduces the overheads + * dramatically. */ + +/* The convolution algorithm is implemented as a stack machine. We have a + * stack of commands, each in one of the forms "do a 2^n x 2^n + * convolution", or "combine these three length 2^n outputs into one + * 2^{n+1} output." */ + +#include <stdlib.h> +#include "convolve.h" + +/* + * Initialisation routine - sets up tables and space to work in. + * Returns a pointer to internal state, to be used when performing calls. + * On error, returns NULL. + * The pointer should be freed when it is finished with, by convolve_close(). + */ +convolve_state *convolve_init(void) +{ + return (convolve_state *) malloc (sizeof(convolve_state)); +} + +/* + * Free the state allocated with convolve_init(). + */ +void convolve_close(convolve_state *state) +{ + if (state) + free(state); +} + +static void convolve_4 (double * out, const double * left, const double * right) +/* This does a 4x4 -> 7 convolution. For what it's worth, the slightly odd + * ordering gives about a 1% speed up on my Pentium II. */ +{ + double l0, l1, l2, l3, r0, r1, r2, r3; + double a; + l0 = left[0]; + r0 = right[0]; + a = l0 * r0; + l1 = left[1]; + r1 = right[1]; + out[0] = a; + a = (l0 * r1) + (l1 * r0); + l2 = left[2]; + r2 = right[2]; + out[1] = a; + a = (l0 * r2) + (l1 * r1) + (l2 * r0); + l3 = left[3]; + r3 = right[3]; + out[2] = a; + + out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0); + out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1); + out[5] = (l2 * r3) + (l3 * r2); + out[6] = l3 * r3; +} + +static void convolve_run (stack_entry * top, unsigned size, double * scratch) +/* Interpret a stack of commands. The stack starts with two entries; the + * convolution to do, and an illegal entry used to mark the stack top. The + * size is the number of entries in each input, and must be a power of 2, + * and at least 8. It is OK to have out equal to left and/or right. + * scratch must have length 3*size. The number of stack entries needed is + * 3n-4 where size=2^n. */ +{ + do { + const double * left; + const double * right; + double * out; + + /* When we get here, the stack top is always a convolve, + * with size > 4. So we will split it. We repeatedly split + * the top entry until we get to size = 4. */ + + left = top->v.left; + right = top->v.right; + out = top->v.out; + top++; + + do { + double * s_left, * s_right; + int i; + + /* Halve the size. */ + size >>= 1; + + /* Allocate the scratch areas. */ + s_left = scratch + size * 3; + /* s_right is a length 2*size buffer also used for + * intermediate output. */ + s_right = scratch + size * 4; + + /* Create the intermediate factors. */ + for (i = 0; i < size; i++) { + double l = left[i] + left[i + size]; + double r = right[i] + right[i + size]; + s_left[i + size] = r; + s_left[i] = l; + } + + /* Push the combine entry onto the stack. */ + top -= 3; + top[2].b.main = out; + top[2].b.null = NULL; + + /* Push the low entry onto the stack. This must be + * the last of the three sub-convolutions, because + * it may overwrite the arguments. */ + top[1].v.left = left; + top[1].v.right = right; + top[1].v.out = out; + + /* Push the mid entry onto the stack. */ + top[0].v.left = s_left; + top[0].v.right = s_right; + top[0].v.out = s_right; + + /* Leave the high entry in variables. */ + left += size; + right += size; + out += size * 2; + + } while (size > 4); + + /* When we get here, the stack top is a group of 3 + * convolves, with size = 4, followed by some combines. */ + convolve_4 (out, left, right); + convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right); + convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right); + top += 2; + + /* Now process combines. */ + do { + /* b.main is the output buffer, mid is the middle + * part which needs to be adjusted in place, and + * then folded back into the output. We do this in + * a slightly strange way, so as to avoid having + * two loops. */ + double * out = top->b.main; + double * mid = scratch + size * 4; + unsigned int i; + top++; + out[size * 2 - 1] = 0; + for (i = 0; i < size-1; i++) { + double lo; + double hi; + lo = mid[0] - (out[0] + out[2 * size]) + out[size]; + hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size]; + out[size] = lo; + out[2 * size] = hi; + out++; + mid++; + } + size <<= 1; + } while (top->b.null == NULL); + } while (top->b.main != NULL); +} + +int convolve_match (float * lastchoice, + float * input, + convolve_state * state) +/* lastchoice is a 256 sized array. input is a 512 array. We find the + * contiguous length 256 sub-array of input that best matches lastchoice. + * A measure of how good a sub-array is compared with the lastchoice is + * given by the sum of the products of each pair of entries. We maximise + * that, by taking an appropriate convolution, and then finding the maximum + * entry in the convolutions. state is a (non-NULL) pointer returned by + * convolve_init. */ +{ + double avg; + double best; + int p; + int i; + double * left = state->left; + double * right = state->right; + double * scratch = state->scratch; + stack_entry * top = state->stack + STACK_SIZE - 1; + + for (i=0; i<512; i++) + left[i]=input[i]; + + avg = 0; + for (i = 0; i < 256; i++) + { + double a = lastchoice[255 - i]; + right[i] = a; + avg += a; + } + + /* We adjust the smaller of the two input arrays to have average + * value 0. This makes the eventual result insensitive to both + * constant offsets and positive multipliers of the inputs. */ + avg /= 256; + for (i = 0; i < 256; i++) + right[i] -= avg; + + /* End-of-stack marker. */ + top[1].b.null = scratch; + top[1].b.main = NULL; + + /* The low 256x256, of which we want the high 256 outputs. */ + top->v.left = left; + top->v.right = right; + top->v.out = right + 256; + convolve_run (top, 256, scratch); + + /* The high 256x256, of which we want the low 256 outputs. */ + top->v.left = left + 256; + top->v.right = right; + top->v.out = right; + convolve_run (top, 256, scratch); + + /* Now find the best position amoungs this. Apart from the first + * and last, the required convolution outputs are formed by adding + * outputs from the two convolutions above. */ + best = right[511]; + right[767] = 0; + p = -1; + for (i = 0; i < 256; i++) { + double a = right[i] + right[i + 512]; + if (a > best) { + best = a; + p = i; + } + } + p++; + +#if 0 + { + /* This is some debugging code... */ + int bad = 0; + best = 0; + for (i = 0; i < 256; i++) + best += ((double) input[i+p]) * ((double) lastchoice[i] - avg); + + for (i = 0; i < 257; i++) { + double tot = 0; + unsigned int j; + for (j = 0; j < 256; j++) + tot += ((double) input[i+j]) * ((double) lastchoice[j] - avg); + if (tot > best) + printf ("(%i)", i); + if (tot != left[i + 255]) + printf ("!"); + } + + printf ("%i\n", p); + } +#endif + + return p; +} |