Add FFTS v0.7

author: Anthony Blake <anthonix@me.com> 2014-09-06 01:35:32 -0500
committer: Timothy Pearson <kb9vqf@pearsoncomputing.net> 2014-09-06 01:35:32 -0500
commit: 14d918151bd447d854c3d0b34a9d542a5dff38ff (patch)
tree: 897331864b7f2061cbf54b74f8232b318e084f6b /lib/ffts/src/ffts.c
parent: c223a6a35f0b6a1b06aa4ff8029065711a986120 (diff)
download: ulab-14d918151bd447d854c3d0b34a9d542a5dff38ff.tar.gz
ulab-14d918151bd447d854c3d0b34a9d542a5dff38ff.zip
1 files changed, 398 insertions, 0 deletions
diff --git a/lib/ffts/src/ffts.c b/lib/ffts/src/ffts.c
new file mode 100644
index 0000000..bec2177
--- /dev/null
+++ b/lib/ffts/src/ffts.c
@@ -0,0 +1,398 @@
+/*
+ 
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+  
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato 
+ 
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ 	* Redistributions of source code must retain the above copyright
+ 		notice, this list of conditions and the following disclaimer.
+ 	* Redistributions in binary form must reproduce the above copyright
+ 		notice, this list of conditions and the following disclaimer in the
+ 		documentation and/or other materials provided with the distribution.
+ 	* Neither the name of the organization nor the
+	  names of its contributors may be used to endorse or promote products
+ 		derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "ffts.h"
+#include "macros.h"
+//#include "mini_macros.h"
+#include "patterns.h"
+#include "ffts_small.h"
+
+#ifdef DYNAMIC_DISABLED
+	#include "ffts_static.h"
+#else
+	#include "codegen.h"
+#endif
+
+#include <errno.h>
+  #include <sys/mman.h>
+  #include <string.h>
+  #include <limits.h>	   /* for PAGESIZE */
+
+#if __APPLE__
+  #include <libkern/OSCacheControl.h>
+#else
+#endif
+
+void ffts_execute(ffts_plan_t *p, const void *  in, void *  out) {
+	p->transform(p, (const float *)in, (float *)out);
+}
+
+void ffts_free(ffts_plan_t *p) {
+	p->destroy(p);
+}
+
+void ffts_free_1d(ffts_plan_t *p) {
+	
+	size_t i;
+
+	if(p->ws) {
+		FFTS_FREE(p->ws);
+	}
+	if(p->is) free(p->is);
+	if(p->ws_is) free(p->ws_is);
+	if(p->offsets)		free(p->offsets);
+	//free(p->transforms);
+	if(p->transforms) free(p->transforms);
+
+	if(p->transform_base) {
+		if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) {
+			perror("Couldn't mprotect");
+			exit(errno);
+		}
+		munmap(p->transform_base, p->transform_size);
+		//free(p->transform_base);
+	}
+	free(p);
+}
+
+ffts_plan_t *ffts_init_1d(size_t N, int sign) {
+	ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
+	size_t leafN = 8;	
+	size_t i;	
+
+#ifdef __arm__
+//#ifdef HAVE_NEON
+	V MULI_SIGN;
+	
+	if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
+	else         MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
+//#endif 
+#else
+	V MULI_SIGN;
+	
+	if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
+	else         MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
+#endif
+
+	p->transform = NULL;
+	p->transform_base = NULL;
+	p->transforms = NULL;
+	p->is = NULL;
+	p->ws_is = NULL;
+	p->ws = NULL;
+	p->offsets = NULL;
+	p->destroy = ffts_free_1d;
+
+	if(N >= 32) {
+		ffts_init_offsets(p, N, leafN);
+#ifdef __arm__
+#ifdef HAVE_NEON
+		ffts_init_is(p, N, leafN, 1);
+#else
+		ffts_init_is(p, N, leafN, 1);
+#endif
+#else
+		ffts_init_is(p, N, leafN, 1);
+#endif
+		
+		p->i0 = N/leafN/3+1;
+		p->i1 = N/leafN/3;
+		if((N/leafN) % 3 > 1) p->i1++;
+		p->i2 = N/leafN/3;
+		
+		#ifdef __arm__	
+		#ifdef HAVE_NEON
+		p->i0/=2;
+		p->i1/=2;
+		#endif
+		#else
+		p->i0/=2;
+		p->i1/=2;
+		#endif
+
+	}else{
+		p->transforms = malloc(2 * sizeof(transform_index_t));
+		p->transforms[0] = 0;
+		p->transforms[1] = 1;
+		if(N == 2) p->transform = &firstpass_2;
+		else if(N == 4 && sign == -1) p->transform = &firstpass_4_f;
+		else if(N == 4 && sign == 1) p->transform = &firstpass_4_b;
+		else if(N == 8 && sign == -1) p->transform = &firstpass_8_f;
+		else if(N == 8 && sign == 1) p->transform = &firstpass_8_b;
+		else if(N == 16 && sign == -1) p->transform = &firstpass_16_f;
+		else if(N == 16 && sign == 1) p->transform = &firstpass_16_b;
+
+		p->is = NULL;
+		p->offsets = NULL;
+	}
+
+		int hardcoded = 0;
+
+		/*      LUTS           */
+		size_t n_luts = __builtin_ctzl(N/leafN);
+		if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
+
+		if(n_luts >= 32) n_luts = 0;
+
+//		fprintf(stderr, "n_luts = %zu\n", n_luts);
+		
+		cdata_t *w;
+
+		int n = leafN*2;
+		if(hardcoded) n = 8;
+		
+		size_t lut_size = 0;
+
+		for(i=0;i<n_luts;i++) {
+			if(!i || hardcoded) {
+			#ifdef __arm__ 
+				if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
+				else lut_size += n/4 * sizeof(cdata_t);
+			#else
+				lut_size += n/4 * 2 * sizeof(cdata_t);
+			#endif
+				n *= 2;
+			} else {
+			#ifdef __arm__
+				lut_size += n/8 * 3 * sizeof(cdata_t);
+			#else
+				lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
+			#endif
+			}
+			n *= 2;
+		}
+		
+//		lut_size *= 16;
+		
+	//	fprintf(stderr, "lut size = %zu\n", lut_size);
+		if(n_luts) {
+			p->ws = FFTS_MALLOC(lut_size,32);
+			p->ws_is = malloc(n_luts * sizeof(size_t));
+		}else{
+			p->ws = NULL;
+			p->ws_is = NULL;
+		}
+		w = p->ws;
+
+		n = leafN*2;
+		if(hardcoded) n = 8;
+		
+		#ifdef HAVE_NEON
+			V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
+		#endif
+		
+		for(i=0;i<n_luts;i++) {
+			p->ws_is[i] = w - (cdata_t *)p->ws;	
+			//fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);	
+			
+			if(!i || hardcoded) {
+				cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
+
+				size_t j;
+				for(j=0;j<n/4;j++) {
+					w0[j][0]	= W_re(n,j);
+					w0[j][1]	= W_im(n,j);
+				}
+
+
+				float *fw0 = (float *)w0;
+				#ifdef __arm__
+					if(N < 32) {
+						//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
+						float *fw = (float *)w;
+						V temp0, temp1, temp2;
+						for(j=0;j<n/4;j+=2) {
+						//	#ifdef HAVE_NEON
+							temp0 = VLD(fw0 + j*2);
+							V re, im;
+							re = VDUPRE(temp0);
+							im = VDUPIM(temp0);
+							#ifdef HAVE_NEON 
+								im = VXOR(im, MULI_SIGN);
+								//im = IMULI(sign>0, im);
+							#else
+								im = MULI(sign>0, im);
+							#endif
+							VST(fw + j*4  , re);
+							VST(fw + j*4+4, im);
+					//		#endif
+						}
+						w += n/4 * 2;
+					}else{
+						//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
+						float *fw = (float *)w;
+						#ifdef HAVE_NEON
+							VS temp0, temp1, temp2;
+							for(j=0;j<n/4;j+=4) {
+								temp0 = VLD2(fw0 + j*2);
+								temp0.val[1] = VXOR(temp0.val[1], neg);
+								STORESPR(fw + j*2, temp0);
+							}
+						#else
+							for(j=0;j<n/4;j+=1) {
+								fw[j*2] = fw0[j*2];
+								fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
+							}
+						#endif
+						w += n/4;
+					}
+				#else
+					//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
+					float *fw = (float *)w;
+					V temp0, temp1, temp2;
+					for(j=0;j<n/4;j+=2) {
+						temp0 = VLD(fw0 + j*2);
+						V re, im;
+						re = VDUPRE(temp0);
+						im = VDUPIM(temp0);
+						im = VXOR(im, MULI_SIGN);
+						VST(fw + j*4  , re);
+						VST(fw + j*4+4, im);
+					}
+					w += n/4 * 2;
+				#endif
+
+				FFTS_FREE(w0);
+			}else{
+
+				cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
+				cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
+				cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
+
+				size_t j;
+				for(j=0;j<n/8;j++) {
+					w0[j][0]	= W_re(n,j*2);
+					w0[j][1]	= W_im(n,j*2);
+					w1[j][0]	= W_re(n,j);
+					w1[j][1]	= W_im(n,j);
+					w2[j][0]	= W_re(n,j + (n/8));
+					w2[j][1]	= W_im(n,j + (n/8));
+
+				}
+
+				float *fw0 = (float *)w0;
+				float *fw1 = (float *)w1;
+				float *fw2 = (float *)w2;
+				#ifdef __arm__
+					//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
+					float *fw = (float *)w;
+					#ifdef HAVE_NEON	
+						VS temp0, temp1, temp2;
+						for(j=0;j<n/8;j+=4) {
+							temp0 = VLD2(fw0 + j*2);
+							temp0.val[1] = VXOR(temp0.val[1], neg);
+							STORESPR(fw + j*2*3,      temp0);
+							temp1 = VLD2(fw1 + j*2);
+							temp1.val[1] = VXOR(temp1.val[1], neg);
+							STORESPR(fw + j*2*3 + 8,  temp1);
+							temp2 = VLD2(fw2 + j*2);
+							temp2.val[1] = VXOR(temp2.val[1], neg);
+							STORESPR(fw + j*2*3 + 16, temp2);
+						}
+					#else
+						for(j=0;j<n/8;j+=1) {
+								fw[j*6] = fw0[j*2];
+								fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
+								fw[j*6+2] = fw1[j*2+0];
+								fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1];
+								fw[j*6+4] = fw2[j*2+0];
+								fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1];
+						}
+					#endif
+					w += n/8 * 3;
+				#else
+					//w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32);
+					float *fw = (float *)w;
+					V temp0, temp1, temp2, re, im;
+					for(j=0;j<n/8;j+=2) {
+						temp0 = VLD(fw0 + j*2);
+						re = VDUPRE(temp0);
+						im = VDUPIM(temp0);
+						im = VXOR(im, MULI_SIGN);
+						VST(fw + j*2*6  , re);
+						VST(fw + j*2*6+4, im);
+
+						temp1 = VLD(fw1 + j*2);
+						re = VDUPRE(temp1);
+						im = VDUPIM(temp1);
+						im = VXOR(im, MULI_SIGN);
+						VST(fw + j*2*6+8 , re);
+						VST(fw + j*2*6+12, im);
+
+						temp2 = VLD(fw2 + j*2);
+						re = VDUPRE(temp2);
+						im = VDUPIM(temp2);
+						im = VXOR(im, MULI_SIGN);
+						VST(fw + j*2*6+16, re);
+						VST(fw + j*2*6+20, im);
+					}
+					w += n/8 * 3 * 2;
+				#endif
+
+				FFTS_FREE(w0);
+				FFTS_FREE(w1);
+				FFTS_FREE(w2);
+			}
+			///p->ws[i] = w;
+
+			n *= 2;
+		}
+
+	float *tmp = (float *)p->ws;
+
+	if(sign < 0) {
+		p->oe_ws = (void *)(&w_data[4]);
+		p->ee_ws = (void *)(w_data);
+		p->eo_ws = (void *)(&w_data[4]);
+	}else{
+		p->oe_ws = (void *)(w_data + 12);
+		p->ee_ws = (void *)(w_data + 8);
+		p->eo_ws = (void *)(w_data + 12);
+	}
+
+	p->N = N;
+	p->lastlut = w;
+	p->n_luts = n_luts;
+#ifdef DYNAMIC_DISABLED
+	if(sign < 0) { 
+		if(N >= 32) p->transform = ffts_static_transform_f; 
+	}else{
+		if(N >= 32) p->transform = ffts_static_transform_i; 
+	}
+
+#else
+	if(N>=32)  ffts_generate_func_code(p, N, leafN, sign);
+#endif
+
+	return p;
+}
+
author	Anthony Blake <anthonix@me.com>	2014-09-06 01:35:32 -0500
committer	Timothy Pearson <kb9vqf@pearsoncomputing.net>	2014-09-06 01:35:32 -0500
commit	14d918151bd447d854c3d0b34a9d542a5dff38ff (patch)
tree	897331864b7f2061cbf54b74f8232b318e084f6b /lib/ffts/src/ffts.c
parent	c223a6a35f0b6a1b06aa4ff8029065711a986120 (diff)
download	ulab-14d918151bd447d854c3d0b34a9d542a5dff38ff.tar.gz ulab-14d918151bd447d854c3d0b34a9d542a5dff38ff.zip