Diffstat (limited to 'lib/ffts/src')
-rw-r--r--   lib/ffts/src/Makefile.am          34
-rw-r--r--   lib/ffts/src/Makefile.in         666
-rw-r--r--   lib/ffts/src/codegen.c           731
-rw-r--r--   lib/ffts/src/codegen.h            49
-rw-r--r--   lib/ffts/src/codegen_arm.h       101
-rw-r--r--   lib/ffts/src/codegen_sse.h       195
-rw-r--r--   lib/ffts/src/ffts.c              398
-rw-r--r--   lib/ffts/src/ffts.h              177
-rw-r--r--   lib/ffts/src/ffts_nd.c           282
-rw-r--r--   lib/ffts/src/ffts_nd.h            58
-rw-r--r--   lib/ffts/src/ffts_real.c         226
-rw-r--r--   lib/ffts/src/ffts_real.h          53
-rw-r--r--   lib/ffts/src/ffts_real_nd.c      177
-rw-r--r--   lib/ffts/src/ffts_real_nd.h       53
-rw-r--r--   lib/ffts/src/ffts_small.c        156
-rw-r--r--   lib/ffts/src/ffts_small.h         13
-rw-r--r--   lib/ffts/src/ffts_static.c       101
-rw-r--r--   lib/ffts/src/ffts_static.h        46
-rw-r--r--   lib/ffts/src/macros-alpha.h      206
-rw-r--r--   lib/ffts/src/macros-altivec.h    137
-rw-r--r--   lib/ffts/src/macros-neon.h        96
-rw-r--r--   lib/ffts/src/macros-sse.h         84
-rw-r--r--   lib/ffts/src/macros.h            161
-rw-r--r--   lib/ffts/src/neon.h               65
-rw-r--r--   lib/ffts/src/neon.s              738
-rw-r--r--   lib/ffts/src/neon_float.h       1126
-rw-r--r--   lib/ffts/src/neon_static_f.s     956
-rw-r--r--   lib/ffts/src/neon_static_i.s     955
-rw-r--r--   lib/ffts/src/patterns.c          208
-rw-r--r--   lib/ffts/src/patterns.h           44
-rw-r--r--   lib/ffts/src/sse.s               878
-rw-r--r--   lib/ffts/src/types.h              49
-rw-r--r--   lib/ffts/src/vfp.h                45
-rw-r--r--   lib/ffts/src/vfp.s               473
34 files changed, 9737 insertions(+), 0 deletions(-)
diff --git a/lib/ffts/src/Makefile.am b/lib/ffts/src/Makefile.am
new file mode 100644
index 0000000..8547795
--- /dev/null
+++ b/lib/ffts/src/Makefile.am
@@ -0,0 +1,34 @@
+
+
+lib_LTLIBRARIES = libffts.la
+
+libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c ffts_real_nd.c patterns.c
+libffts_la_SOURCES += codegen.h codegen_arm.h codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h macros-neon.h macros-sse.h macros.h neon.h neon_float.h patterns.h types.h vfp.h
+
+if DYNAMIC_DISABLED
+libffts_la_SOURCES += ffts_static.c
+else
+libffts_la_SOURCES += codegen.c
+endif
+
+libffts_includedir=$(includedir)/ffts
+libffts_include_HEADERS = ../include/ffts.h
+
+
+if HAVE_VFP
+libffts_la_SOURCES += vfp.s
+else
+if HAVE_NEON
+
+if DYNAMIC_DISABLED
+libffts_la_SOURCES += neon_static_f.s neon_static_i.s
+else
+libffts_la_SOURCES += neon.s
+endif
+
+else
+if HAVE_SSE
+libffts_la_SOURCES += sse.s
+endif
+endif
+endif
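
The Makefile.am above installs ../include/ffts.h into $(includedir)/ffts, so consumers include it as <ffts/ffts.h>. For context, here is a minimal sketch of driving that installed API with the entry points defined later in this diff (ffts_init_1d, ffts_execute, ffts_free in ffts.c); the interleaved re/im float layout and the aligned allocation reflect typical FFTS usage and are assumptions, not something this Makefile states:

    /* sketch: forward 1-D complex FFT of 64 points against the installed header */
    #include <ffts/ffts.h>                 /* installed by libffts_include_HEADERS above */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
        size_t N = 64;
        /* assumed layout: interleaved complex floats, in[2*i] = re, in[2*i+1] = im */
        float *in  = aligned_alloc(32, 2 * N * sizeof(float));
        float *out = aligned_alloc(32, 2 * N * sizeof(float));
        for (size_t i = 0; i < 2 * N; i++) in[i] = 0.0f;
        in[0] = 1.0f;                      /* unit impulse -> flat spectrum */

        ffts_plan_t *p = ffts_init_1d(N, -1);  /* sign -1 = forward, +1 = inverse (see ffts.c) */
        if (!p) return 1;
        ffts_execute(p, in, out);
        printf("out[0] = %f %+fi\n", out[0], out[1]);
        ffts_free(p);
        free(in);
        free(out);
        return 0;
    }
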
diff --git a/lib/ffts/src/Makefile.in b/lib/ffts/src/Makefile.in
new file mode 100644
index 0000000..a1eefbc
--- /dev/null
+++ b/lib/ffts/src/Makefile.in
@@ -0,0 +1,666 @@
+# Makefile.in generated by automake 1.12.4 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2012 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+
+VPATH = @srcdir@
+am__make_dryrun = \
+ { \
+ am__dry=no; \
+ case $$MAKEFLAGS in \
+ *\\[\ \ ]*) \
+ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \
+ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+ *) \
+ for am__flg in $$MAKEFLAGS; do \
+ case $$am__flg in \
+ *=*|--*) ;; \
+ *n*) am__dry=yes; break;; \
+ esac; \
+ done;; \
+ esac; \
+ test $$am__dry = yes; \
+ }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+@DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c
+@DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c
+@HAVE_VFP_TRUE@am__append_3 = vfp.s
+@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_4 = neon_static_f.s neon_static_i.s
+@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_5 = neon.s
+@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__append_6 = sse.s
+subdir = src
+DIST_COMMON = $(libffts_include_HEADERS) $(srcdir)/Makefile.am \
+ $(srcdir)/Makefile.in $(top_srcdir)/depcomp
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_classpath.m4 \
+ $(top_srcdir)/m4/ax_check_java_home.m4 \
+ $(top_srcdir)/m4/ax_java_options.m4 \
+ $(top_srcdir)/m4/ax_jni_include_dir.m4 \
+ $(top_srcdir)/m4/ax_prog_jar.m4 \
+ $(top_srcdir)/m4/ax_prog_javac.m4 \
+ $(top_srcdir)/m4/ax_prog_javac_works.m4 \
+ $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+ $(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+ *) f=$$p;; \
+ esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+ for p in $$list; do echo "$$p $$p"; done | \
+ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+ if (++n[$$2] == $(am__install_max)) \
+ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+ END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+ test -z "$$files" \
+ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+ $(am__cd) "$$dir" && rm -f $$files; }; \
+ }
+am__installdirs = "$(DESTDIR)$(libdir)" \
+ "$(DESTDIR)$(libffts_includedir)"
+LTLIBRARIES = $(lib_LTLIBRARIES)
+libffts_la_LIBADD =
+am__libffts_la_SOURCES_DIST = ffts.c ffts_small.c ffts_nd.c \
+ ffts_real.c ffts_real_nd.c patterns.c codegen.h codegen_arm.h \
+ codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \
+ ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \
+ macros-neon.h macros-sse.h macros.h neon.h neon_float.h \
+ patterns.h types.h vfp.h ffts_static.c codegen.c vfp.s \
+ neon_static_f.s neon_static_i.s neon.s sse.s
+@DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
+@DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
+@HAVE_VFP_TRUE@am__objects_3 = vfp.lo
+@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_4 = neon_static_f.lo \
+@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@ neon_static_i.lo
+@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_5 = neon.lo
+@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__objects_6 = \
+@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@ sse.lo
+am_libffts_la_OBJECTS = ffts.lo ffts_small.lo ffts_nd.lo ffts_real.lo \
+ ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \
+ $(am__objects_3) $(am__objects_4) $(am__objects_5) \
+ $(am__objects_6)
+libffts_la_OBJECTS = $(am_libffts_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+ $(LDFLAGS) -o $@
+CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
+LTCCASCOMPILE = $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=compile $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
+SOURCES = $(libffts_la_SOURCES)
+DIST_SOURCES = $(am__libffts_la_SOURCES_DIST)
+am__can_run_installinfo = \
+ case $$AM_UPDATE_INFO_DIR in \
+ n|no|NO) false;; \
+ *) (install-info --version) >/dev/null 2>&1;; \
+ esac
+HEADERS = $(libffts_include_HEADERS)
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FGREP = @FGREP@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+JAR = @JAR@
+JAVA = @JAVA@
+JAVAC = @JAVAC@
+JAVACFLAGS = @JAVACFLAGS@
+JAVAFLAGS = @JAVAFLAGS@
+JAVAPREFIX = @JAVAPREFIX@
+JAVA_PATH_NAME = @JAVA_PATH_NAME@
+JNI_CPPFLAGS = @JNI_CPPFLAGS@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+VERSION = @VERSION@
+_ACJNI_JAVAC = @_ACJNI_JAVAC@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+lib_LTLIBRARIES = libffts.la
+libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c \
+ ffts_real_nd.c patterns.c codegen.h codegen_arm.h \
+ codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \
+ ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \
+ macros-neon.h macros-sse.h macros.h neon.h neon_float.h \
+ patterns.h types.h vfp.h $(am__append_1) $(am__append_2) \
+ $(am__append_3) $(am__append_4) $(am__append_5) \
+ $(am__append_6)
+libffts_includedir = $(includedir)/ffts
+libffts_include_HEADERS = ../include/ffts.h
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj .s
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
+ @for dep in $?; do \
+ case '$(am__configure_deps)' in \
+ *$$dep*) \
+ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+ && { if test -f $@; then exit 0; else break; fi; }; \
+ exit 1;; \
+ esac; \
+ done; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \
+ $(am__cd) $(top_srcdir) && \
+ $(AUTOMAKE) --gnu src/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ @case '$?' in \
+ *config.status*) \
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+ *) \
+ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+ esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: $(am__configure_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+ @$(NORMAL_INSTALL)
+ @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+ list2=; for p in $$list; do \
+ if test -f $$p; then \
+ list2="$$list2 $$p"; \
+ else :; fi; \
+ done; \
+ test -z "$$list2" || { \
+ echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
+ $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
+ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
+ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
+ }
+
+uninstall-libLTLIBRARIES:
+ @$(NORMAL_UNINSTALL)
+ @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+ for p in $$list; do \
+ $(am__strip_dir) \
+ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
+ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
+ done
+
+clean-libLTLIBRARIES:
+ -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+ @list='$(lib_LTLIBRARIES)'; \
+ locs=`for p in $$list; do echo $$p; done | \
+ sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+ sort -u`; \
+ test -z "$$locs" || { \
+ echo rm -f $${locs}; \
+ rm -f $${locs}; \
+ }
+libffts.la: $(libffts_la_OBJECTS) $(libffts_la_DEPENDENCIES) $(EXTRA_libffts_la_DEPENDENCIES)
+ $(LINK) -rpath $(libdir) $(libffts_la_OBJECTS) $(libffts_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+ -rm -f *.$(OBJEXT)
+
+distclean-compile:
+ -rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codegen.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_nd.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real_nd.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_small.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_static.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/patterns.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $<
+
+.s.o:
+ $(CCASCOMPILE) -c -o $@ $<
+
+.s.obj:
+ $(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.s.lo:
+ $(LTCCASCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+install-libffts_includeHEADERS: $(libffts_include_HEADERS)
+ @$(NORMAL_INSTALL)
+ @list='$(libffts_include_HEADERS)'; test -n "$(libffts_includedir)" || list=; \
+ if test -n "$$list"; then \
+ echo " $(MKDIR_P) '$(DESTDIR)$(libffts_includedir)'"; \
+ $(MKDIR_P) "$(DESTDIR)$(libffts_includedir)" || exit 1; \
+ fi; \
+ for p in $$list; do \
+ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+ echo "$$d$$p"; \
+ done | $(am__base_list) | \
+ while read files; do \
+ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libffts_includedir)'"; \
+ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libffts_includedir)" || exit $$?; \
+ done
+
+uninstall-libffts_includeHEADERS:
+ @$(NORMAL_UNINSTALL)
+ @list='$(libffts_include_HEADERS)'; test -n "$(libffts_includedir)" || list=; \
+ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+ dir='$(DESTDIR)$(libffts_includedir)'; $(am__uninstall_files_from_dir)
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in files) print i; }; }'`; \
+ mkid -fID $$unique
+tags: TAGS
+
+TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ set x; \
+ here=`pwd`; \
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in files) print i; }; }'`; \
+ shift; \
+ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+ test -n "$$unique" || unique=$$empty_fix; \
+ if test $$# -gt 0; then \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ "$$@" $$unique; \
+ else \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ $$unique; \
+ fi; \
+ fi
+ctags: CTAGS
+CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in files) print i; }; }'`; \
+ test -z "$(CTAGS_ARGS)$$unique" \
+ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+ $$unique
+
+GTAGS:
+ here=`$(am__cd) $(top_builddir) && pwd` \
+ && $(am__cd) $(top_srcdir) \
+ && gtags -i $(GTAGS_ARGS) "$$here"
+
+cscopelist: $(HEADERS) $(SOURCES) $(LISP)
+ list='$(SOURCES) $(HEADERS) $(LISP)'; \
+ case "$(srcdir)" in \
+ [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+ *) sdir=$(subdir)/$(srcdir) ;; \
+ esac; \
+ for i in $$list; do \
+ if test -f "$$i"; then \
+ echo "$(subdir)/$$i"; \
+ else \
+ echo "$$sdir/$$i"; \
+ fi; \
+ done >> $(top_builddir)/cscope.files
+
+distclean-tags:
+ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ list='$(DISTFILES)'; \
+ dist_files=`for file in $$list; do echo $$file; done | \
+ sed -e "s|^$$srcdirstrip/||;t" \
+ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+ case $$dist_files in \
+ */*) $(MKDIR_P) `echo "$$dist_files" | \
+ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+ sort -u` ;; \
+ esac; \
+ for file in $$dist_files; do \
+ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+ if test -d $$d/$$file; then \
+ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+ if test -d "$(distdir)/$$file"; then \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+ else \
+ test -f "$(distdir)/$$file" \
+ || cp -p $$d/$$file "$(distdir)/$$file" \
+ || exit 1; \
+ fi; \
+ done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES) $(HEADERS)
+installdirs:
+ for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(libffts_includedir)"; do \
+ test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+ done
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+ if test -z '$(STRIP)'; then \
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ install; \
+ else \
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+ fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+ @echo "This command is intended for maintainers to use"
+ @echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
+ mostlyclean-am
+
+distclean: distclean-am
+ -rm -rf ./$(DEPDIR)
+ -rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+ distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am: install-libffts_includeHEADERS
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am: install-libLTLIBRARIES
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+ -rm -rf ./$(DEPDIR)
+ -rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+ mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-libLTLIBRARIES \
+ uninstall-libffts_includeHEADERS
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+ clean-libLTLIBRARIES clean-libtool cscopelist ctags distclean \
+ distclean-compile distclean-generic distclean-libtool \
+ distclean-tags distdir dvi dvi-am html html-am info info-am \
+ install install-am install-data install-data-am install-dvi \
+ install-dvi-am install-exec install-exec-am install-html \
+ install-html-am install-info install-info-am \
+ install-libLTLIBRARIES install-libffts_includeHEADERS \
+ install-man install-pdf install-pdf-am install-ps \
+ install-ps-am install-strip installcheck installcheck-am \
+ installdirs maintainer-clean maintainer-clean-generic \
+ mostlyclean mostlyclean-compile mostlyclean-generic \
+ mostlyclean-libtool pdf pdf-am ps ps-am tags uninstall \
+ uninstall-am uninstall-libLTLIBRARIES \
+ uninstall-libffts_includeHEADERS
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/lib/ffts/src/codegen.c b/lib/ffts/src/codegen.c
new file mode 100644
index 0000000..a66ecda
--- /dev/null
+++ b/lib/ffts/src/codegen.c
@@ -0,0 +1,731 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "codegen.h"
+#include "macros.h"
+#include "ffts.h"
+
+#ifdef __APPLE__
+ #include <libkern/OSCacheControl.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/mman.h>
+
+#ifdef HAVE_NEON
+ #include "codegen_arm.h"
+ #include "neon.h"
+#elif HAVE_VFP
+ #include "codegen_arm.h"
+ #include "vfp.h"
+#else
+ #include "codegen_sse.h"
+ #include "macros-sse.h"
+#endif
+
+#ifdef __ANDROID__
+ #include <unistd.h>
+#endif
+
+int tree_count(int N, int leafN, int offset) {
+
+ if(N <= leafN) return 0;
+ int count = 0;
+ count += tree_count(N/4, leafN, offset);
+ count += tree_count(N/8, leafN, offset + N/4);
+ count += tree_count(N/8, leafN, offset + N/4 + N/8);
+ count += tree_count(N/4, leafN, offset + N/2);
+ count += tree_count(N/4, leafN, offset + 3*N/4);
+
+ return 1 + count;
+}
+
+void elaborate_tree(size_t **p, int N, int leafN, int offset) {
+
+ if(N <= leafN) return;
+ elaborate_tree(p, N/4, leafN, offset);
+ elaborate_tree(p, N/8, leafN, offset + N/4);
+ elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
+ elaborate_tree(p, N/4, leafN, offset + N/2);
+ elaborate_tree(p, N/4, leafN, offset + 3*N/4);
+
+ (*p)[0] = N;
+ (*p)[1] = offset*2;
+
+ (*p)+=2;
+}
+
+
+
+
+uint32_t LUT_offset(size_t N, size_t leafN) {
+ int i;
+ size_t p_lut_size = 0;
+ size_t lut_size = 0;
+ int hardcoded = 0;
+ size_t n_luts = __builtin_ctzl(N/leafN);
+ int n = leafN*2;
+ //if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
+
+ for(i=0;i<n_luts-1;i++) {
+ p_lut_size = lut_size;
+ if(!i || hardcoded) {
+ #ifdef __arm__
+ if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
+ else lut_size += n/4 * sizeof(cdata_t);
+ #else
+ lut_size += n/4 * 2 * sizeof(cdata_t);
+ #endif
+ // n *= 2;
+ } else {
+ #ifdef __arm__
+ lut_size += n/8 * 3 * sizeof(cdata_t);
+ #else
+ lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
+ #endif
+ }
+ n *= 2;
+ }
+ return lut_size;
+}
+
+#ifdef __arm__
+ typedef uint32_t insns_t;
+#else
+ typedef uint8_t insns_t;
+#endif
+
+#define P(x) (*(*p)++ = x)
+
+void insert_nops(uint8_t **p, uint32_t count) {
+ switch(count) {
+ case 0: break;
+ case 2: P(0x66);
+ case 1: P(0x90); break;
+ case 3: P(0x0F); P(0x1F); P(0x00); break;
+ case 4: P(0x0F); P(0x1F); P(0x40); P(0x00); break;
+ case 5: P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break;
+ case 6: P(0x66); P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break;
+ case 7: P(0x0F); P(0x1F); P(0x80); P(0x00); P(0x00); P(0x00); P(0x00); break;
+ case 8: P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break;
+ case 9: P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break;
+ default:
+ P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00);
+ insert_nops(p, count-9);
+ break;
+ }
+}
+
+
+void align_mem16(uint8_t **p, uint32_t offset) {
+#ifdef __x86_64__
+ int r = (16 - (offset & 0xf)) - ((size_t)(*p) & 0xf);
+ r = (16 + r) & 0xf;
+ insert_nops(p, r);
+#endif
+}
+
+void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
+ int count = tree_count(N, leafN, 0) + 1;
+ size_t *ps = malloc(count * 2 * sizeof(size_t));
+ size_t *pps = ps;
+
+#ifdef __x86_64__
+ if(sign < 0) p->constants = sse_constants;
+ else p->constants = sse_constants_inv;
+#endif
+
+ elaborate_tree(&pps, N, leafN, 0);
+ pps[0] = 0;
+ pps[1] = 0;
+
+ pps = ps;
+
+#ifdef __arm__
+ if(N < 8192) p->transform_size = 8192;
+ else p->transform_size = N;
+#else
+ if(N < 2048) p->transform_size = 16384;
+ else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N);
+#endif
+
+#ifdef __APPLE__
+ p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0);
+#else
+#define MAP_ANONYMOUS 0x20
+ p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+#endif
+
+/*
+ if(p->transform_base == MAP_FAILED) {
+ fprintf(stderr, "MAP FAILED\n");
+ exit(1);
+ }*/
+ insns_t *func = p->transform_base;//valloc(8192);
+ insns_t *fp = func;
+
+//fprintf(stderr, "Allocating %d bytes \n", p->transform_size);
+//fprintf(stderr, "Base address = %016p\n", func);
+
+ if(!func) {
+ fprintf(stderr, "NOMEM\n");
+ exit(1);
+ }
+
+ insns_t *x_8_addr = fp;
+#ifdef __arm__
+#ifdef HAVE_NEON
+ memcpy(fp, neon_x8, neon_x8_t - neon_x8);
+ /*
+ * Changes adds to subtracts and vice versa to allow the computation
+ * of both the IFFT and FFT
+ */
+ if(sign < 0) {
+ fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
+ fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
+ fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
+ }
+ fp += (neon_x8_t - neon_x8) / 4;
+#else
+ memcpy(fp, vfp_x8, vfp_end - vfp_x8);
+ if(sign > 0) {
+ fp[65] ^= 0x00000040;
+ fp[66] ^= 0x00000040;
+ fp[68] ^= 0x00000040;
+ fp[70] ^= 0x00000040;
+ fp[103] ^= 0x00000040;
+ fp[104] ^= 0x00000040;
+ fp[105] ^= 0x00000040;
+ fp[108] ^= 0x00000040;
+ fp[113] ^= 0x00000040;
+ fp[114] ^= 0x00000040;
+ fp[117] ^= 0x00000040;
+ fp[118] ^= 0x00000040;
+ }
+ fp += (vfp_end - vfp_x8) / 4;
+#endif
+#else
+ align_mem16(&fp, 0);
+ x_8_addr = fp;
+ align_mem16(&fp, 5);
+ memcpy(fp, x8_soft, x8_hard - x8_soft);
+ fp += (x8_hard - x8_soft);
+//fprintf(stderr, "X8 start address = %016p\n", x_8_addr);
+#endif
+//uint32_t *x_8_t_addr = fp;
+//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
+//fp += (neon_end - neon_x8_t) / 4;
+ insns_t *x_4_addr = fp;
+#ifdef __arm__
+ #ifdef HAVE_NEON
+ memcpy(fp, neon_x4, neon_x8 - neon_x4);
+ if(sign < 0) {
+ fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
+ }
+ fp += (neon_x8 - neon_x4) / 4;
+ #else
+ memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
+ if(sign > 0) {
+ fp[36] ^= 0x00000040;
+ fp[38] ^= 0x00000040;
+ fp[43] ^= 0x00000040;
+ fp[44] ^= 0x00000040;
+ }
+ fp += (vfp_x8 - vfp_x4) / 4;
+ #endif
+#else
+ align_mem16(&fp, 0);
+ x_4_addr = fp;
+ memcpy(fp, x4, x8_soft - x4);
+ fp += (x8_soft - x4);
+
+#endif
+ insns_t *start = fp;
+
+#ifdef __arm__
+ *fp = PUSH_LR(); fp++;
+ *fp = 0xed2d8b10; fp++;
+
+ ADDI(&fp, 3, 1, 0);
+ ADDI(&fp, 7, 1, N);
+ ADDI(&fp, 5, 1, 2*N);
+ ADDI(&fp, 10, 7, 2*N);
+ ADDI(&fp, 4, 5, 2*N);
+ ADDI(&fp, 8, 10, 2*N);
+ ADDI(&fp, 6, 4, 2*N);
+ ADDI(&fp, 9, 8, 2*N);
+
+ *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12
+// *fp++ = LDRI(1, 0, 4); // load ws into r1
+ ADDI(&fp, 1, 0, 0);
+
+ ADDI(&fp, 0, 2, 0); // mov out into r0
+#endif
+
+
+#ifdef __arm__
+ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
+ #ifdef HAVE_NEON
+ MOVI(&fp, 11, p->i0);
+ #else
+ MOVI(&fp, 11, p->i0);
+ #endif
+
+#else
+ align_mem16(&fp, 0);
+ start = fp;
+
+ *fp++ = 0x4c;
+ *fp++ = 0x8b;
+ *fp++ = 0x07;
+ uint32_t lp_cnt = p->i0 * 4;
+ MOVI(&fp, RCX, lp_cnt);
+
+ //LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p));
+#endif
+ //fp++;
+#ifdef __arm__
+#ifdef HAVE_NEON
+ memcpy(fp, neon_ee, neon_oo - neon_ee);
+ if(sign < 0) {
+ fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
+ fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
+ fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
+ }
+ fp += (neon_oo - neon_ee) / 4;
+#else
+ memcpy(fp, vfp_e, vfp_o - vfp_e);
+ if(sign > 0) {
+ fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
+ fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
+ fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
+ }
+ fp += (vfp_o - vfp_e) / 4;
+#endif
+#else
+//fprintf(stderr, "Body start address = %016p\n", start);
+
+ PUSH(&fp, RBP);
+ PUSH(&fp, RBX);
+ PUSH(&fp, R10);
+ PUSH(&fp, R11);
+ PUSH(&fp, R12);
+ PUSH(&fp, R13);
+ PUSH(&fp, R14);
+ PUSH(&fp, R15);
+
+ int i;
+ memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init);
+
+//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init);
+//fprintf(stderr, "Constants address = %016p\n", sse_constants);
+//fprintf(stderr, "Constants address = %016p\n", p->constants);
+
+//int32_t val = READ_IMM32(fp + 3);
+//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p));
+
+//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp );
+//fprintf(stderr, "IMM = 0x%llx\n", v2);
+
+//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp ));
+ fp += (leaf_ee - leaf_ee_init);
+
+//fprintf(stderr, "Leaf start address = %016p\n", fp);
+ align_mem16(&fp, 9);
+ memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
+
+
+ uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
+ uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4};
+ uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2};
+
+ for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4);
+
+ fp += (leaf_oo - leaf_ee);
+
+ if(__builtin_ctzl(N) & 1){
+
+ if(p->i1) {
+ lp_cnt += p->i1 * 4;
+ MOVI(&fp, RCX, lp_cnt);
+ align_mem16(&fp, 4);
+ memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
+ for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
+ fp += (leaf_eo - leaf_oo);
+ }
+
+
+ memcpy(fp, leaf_oe, leaf_end - leaf_oe);
+ lp_cnt += 4;
+ for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4);
+ fp += (leaf_end - leaf_oe);
+
+ }else{
+
+
+ memcpy(fp, leaf_eo, leaf_oe - leaf_eo);
+ lp_cnt += 4;
+ for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4);
+ fp += (leaf_oe - leaf_eo);
+
+ if(p->i1) {
+ lp_cnt += p->i1 * 4;
+ MOVI(&fp, RCX, lp_cnt);
+ align_mem16(&fp, 4);
+ memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
+ for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
+ fp += (leaf_eo - leaf_oo);
+ }
+
+ }
+ if(p->i1) {
+ lp_cnt += p->i1 * 4;
+ MOVI(&fp, RCX, lp_cnt);
+ align_mem16(&fp, 9);
+ memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
+ for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4);
+ fp += (leaf_oo - leaf_ee);
+
+ }
+
+//fprintf(stderr, "Body start address = %016p\n", fp);
+ //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p));
+ memcpy(fp, x_init, x4 - x_init);
+//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp ));
+ fp += (x4 - x_init);
+
+ int32_t pAddr = 0;
+ int32_t pN = 0;
+ int32_t pLUT = 0;
+ count = 2;
+ while(pps[0]) {
+
+ if(!pN) {
+ MOVI(&fp, RCX, pps[0] / 4);
+ }else{
+ if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr);
+ if(pps[0] > leafN && pps[0] - pN) {
+
+ int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN);
+ *fp++ = 0xc1;
+
+ if(diff > 0) {
+ *fp++ = 0xe1;
+ *fp++ = (diff & 0xff);
+ }else{
+ *fp++ = 0xe9;
+ *fp++ = ((-diff) & 0xff);
+ }
+ }
+ }
+
+ if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
+ ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
+
+
+ if(pps[0] == 2*leafN) {
+ CALL(&fp, x_4_addr);
+ // }else if(!pps[2]){
+ // //uint32_t *x_8_t_addr = fp;
+ // memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
+ // fp += (neon_ee - neon_x8_t) / 4;
+ // //*fp++ = BL(fp+2, x_8_t_addr);
+ }else{
+ CALL(&fp, x_8_addr);
+ }
+
+ pAddr = pps[1] * 4;
+ if(pps[0] > leafN)
+ pN = pps[0];
+ pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
+// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
+ count += 4;
+ pps += 2;
+ }
+#endif
+#ifdef __arm__
+#ifdef HAVE_NEON
+ if(__builtin_ctzl(N) & 1){
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ if(p->i1) {
+ MOVI(&fp, 11, p->i1);
+ memcpy(fp, neon_oo, neon_eo - neon_oo);
+ if(sign < 0) {
+ fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
+ fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
+ fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
+ }
+ fp += (neon_eo - neon_oo) / 4;
+ }
+
+ *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++;
+
+ memcpy(fp, neon_oe, neon_end - neon_oe);
+ if(sign < 0) {
+ fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000;
+ fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000;
+ fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000;
+ }
+ fp += (neon_end - neon_oe) / 4;
+
+ }else{
+
+ *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++;
+
+ memcpy(fp, neon_eo, neon_oe - neon_eo);
+ if(sign < 0) {
+ fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000;
+ fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000;
+ fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000;
+ }
+ fp += (neon_oe - neon_eo) / 4;
+
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ if(p->i1) {
+ MOVI(&fp, 11, p->i1);
+ memcpy(fp, neon_oo, neon_eo - neon_oo);
+ if(sign < 0) {
+ fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
+ fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
+ fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
+ }
+ fp += (neon_eo - neon_oo) / 4;
+ }
+
+ }
+
+
+ if(p->i1) {
+ ADDI(&fp, 2, 3, 0);
+ ADDI(&fp, 3, 7, 0);
+ ADDI(&fp, 7, 2, 0);
+
+ ADDI(&fp, 2, 4, 0);
+ ADDI(&fp, 4, 8, 0);
+ ADDI(&fp, 8, 2, 0);
+
+ ADDI(&fp, 2, 5, 0);
+ ADDI(&fp, 5, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 6, 0);
+ ADDI(&fp, 6, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ ADDI(&fp, 2, 9, 0);
+ ADDI(&fp, 9, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
+ MOVI(&fp, 11, p->i1);
+ memcpy(fp, neon_ee, neon_oo - neon_ee);
+ if(sign < 0) {
+ fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
+ fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
+ fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
+ }
+ fp += (neon_oo - neon_ee) / 4;
+
+ }
+#else
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
+ memcpy(fp, vfp_o, vfp_x4 - vfp_o);
+ if(sign > 0) {
+ fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040;
+ fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040;
+ }
+ fp += (vfp_x4 - vfp_o) / 4;
+
+ ADDI(&fp, 2, 3, 0);
+ ADDI(&fp, 3, 7, 0);
+ ADDI(&fp, 7, 2, 0);
+
+ ADDI(&fp, 2, 4, 0);
+ ADDI(&fp, 4, 8, 0);
+ ADDI(&fp, 8, 2, 0);
+
+ ADDI(&fp, 2, 5, 0);
+ ADDI(&fp, 5, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 6, 0);
+ ADDI(&fp, 6, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ ADDI(&fp, 2, 9, 0);
+ ADDI(&fp, 9, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
+ MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
+ memcpy(fp, vfp_e, vfp_o - vfp_e);
+ if(sign > 0) {
+ fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
+ fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
+ fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
+ }
+ fp += (vfp_o - vfp_e) / 4;
+
+#endif
+ *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load ws into r2
+ //ADDI(&fp, 2, 1, 0);
+ MOVI(&fp, 1, 0);
+
+ // args: r0 - out
+ // r1 - N
+ // r2 - ws
+// ADDI(&fp, 3, 1, 0); // put N into r3 for counter
+
+ int32_t pAddr = 0;
+ int32_t pN = 0;
+ int32_t pLUT = 0;
+ count = 2;
+ while(pps[0]) {
+
+// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
+ if(!pN) {
+ MOVI(&fp, 1, pps[0]);
+ }else{
+ if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
+ if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
+ }
+
+ if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
+ ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
+
+
+ if(pps[0] == 2*leafN) {
+ *fp = BL(fp+2, x_4_addr); fp++;
+ }else if(!pps[2]){
+ //uint32_t *x_8_t_addr = fp;
+#ifdef HAVE_NEON
+ memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
+ if(sign < 0) {
+ fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
+ fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
+ fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
+ }
+ fp += (neon_ee - neon_x8_t) / 4;
+ //*fp++ = BL(fp+2, x_8_t_addr);
+
+#else
+ *fp = BL(fp+2, x_8_addr); fp++;
+#endif
+ }else{
+ *fp = BL(fp+2, x_8_addr); fp++;
+ }
+
+ pAddr = pps[1] * 4;
+ pN = pps[0];
+ pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
+// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
+ count += 4;
+ pps += 2;
+ }
+
+ *fp++ = 0xecbd8b10;
+ *fp++ = POP_LR(); count++;
+#else
+ POP(&fp, R15);
+ POP(&fp, R14);
+ POP(&fp, R13);
+ POP(&fp, R12);
+ POP(&fp, R11);
+ POP(&fp, R10);
+ POP(&fp, RBX);
+ POP(&fp, RBP);
+ RET(&fp);
+
+
+//uint8_t *pp = func;
+//int counter = 0;
+//do{
+// printf("%02x ", *pp);
+// if(counter++ % 16 == 15) printf("\n");
+//} while(++pp < fp);
+
+//printf("\n");
+
+
+#endif
+
+
+// *fp++ = B(14); count++;
+
+//for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
+// fprintf(stderr, "%08x\n", x_4_addr[i]);
+//fprintf(stderr, "\n");
+//for(int i=0;i<count;i++)
+
+ free(ps);
+
+ if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
+ perror("Couldn't mprotect");
+ exit(1);
+ }
+#ifdef __APPLE__
+ sys_icache_invalidate(func, p->transform_size);
+#elif __ANDROID__
+ cacheflush((long)(func), (long)(func) + p->transform_size, 0);
+#elif __linux__
+#ifdef __GNUC__
+ __clear_cache((long)(func), (long)(func) + p->transform_size);
+#endif
+#endif
+
+//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4);
+
+ p->transform = (void *) (start);
+}
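
ffts_generate_func_code() above is driven by the flat schedule that tree_count() and elaborate_tree() build: each (size, offset*2) pair it walks becomes a call into the x_4 or x_8 body just copied into executable memory (x_4 when size == 2*leafN, x_8 otherwise). The recursion is easier to follow in isolation; the sketch below reuses those two routines verbatim to print the schedule for N=64, leafN=8 and assumes nothing beyond the code shown above:

    /* sketch: print the transform schedule produced by elaborate_tree() for N=64, leafN=8 */
    #include <stdio.h>
    #include <stdlib.h>

    /* same recursion as codegen.c above */
    static int tree_count(int N, int leafN, int offset) {
        if (N <= leafN) return 0;
        int count = 0;
        count += tree_count(N/4, leafN, offset);
        count += tree_count(N/8, leafN, offset + N/4);
        count += tree_count(N/8, leafN, offset + N/4 + N/8);
        count += tree_count(N/4, leafN, offset + N/2);
        count += tree_count(N/4, leafN, offset + 3*N/4);
        return 1 + count;
    }

    static void elaborate_tree(size_t **p, int N, int leafN, int offset) {
        if (N <= leafN) return;
        elaborate_tree(p, N/4, leafN, offset);
        elaborate_tree(p, N/8, leafN, offset + N/4);
        elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
        elaborate_tree(p, N/4, leafN, offset + N/2);
        elaborate_tree(p, N/4, leafN, offset + 3*N/4);
        (*p)[0] = N;
        (*p)[1] = offset * 2;
        *p += 2;
    }

    int main(void) {
        int N = 64, leafN = 8;
        int count = tree_count(N, leafN, 0) + 1;
        size_t *ps = malloc(count * 2 * sizeof(size_t));
        size_t *pp = ps;
        elaborate_tree(&pp, N, leafN, 0);
        pp[0] = pp[1] = 0;                 /* terminator, as in ffts_generate_func_code() */
        for (pp = ps; pp[0]; pp += 2)
            printf("call %s: size=%zu, stored offset*2=%zu\n",
                   pp[0] == 2 * (size_t)leafN ? "x_4" : "x_8", pp[0], pp[1]);
        free(ps);
        return 0;
    }
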
diff --git a/lib/ffts/src/codegen.h b/lib/ffts/src/codegen.h
new file mode 100644
index 0000000..f592907
--- /dev/null
+++ b/lib/ffts/src/codegen.h
@@ -0,0 +1,49 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __CODEGEN_H__
+#define __CODEGEN_H__
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <limits.h> /* for PAGESIZE */
+
+#include "ffts.h"
+
+void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN, int sign);
+
+#endif
diff --git a/lib/ffts/src/codegen_arm.h b/lib/ffts/src/codegen_arm.h
new file mode 100644
index 0000000..ad8a9d8
--- /dev/null
+++ b/lib/ffts/src/codegen_arm.h
@@ -0,0 +1,101 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __CODEGEN_ARM_H__
+#define __CODEGEN_ARM_H__
+
+
+
+uint32_t BL(void *pos, void *target) {
+ return 0xeb000000 | (((target - pos) / 4) & 0xffffff);
+}
+
+uint32_t B(uint8_t r) {
+ return 0xe12fff10 | r;
+}
+
+uint32_t MOV(uint8_t dst, uint8_t src) {
+ return 0xe1a00000 | (src & 0xf) | ((dst & 0xf) << 12);
+}
+
+void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) {
+ int32_t oimm = imm;
+ if(imm < 0) {
+ imm = -imm;
+ uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm);
+ if(shamt & 1) shamt -= 1;
+ imm >>= shamt;
+ shamt = (32 - shamt)/2;
+
+ // if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm);
+ *(*p)++ = 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
+
+ if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2))));
+
+ }else{
+ uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm);
+ if(shamt & 1) shamt -= 1;
+ imm >>= shamt;
+ shamt = (32 - shamt)/2;
+
+// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm);
+
+ *(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
+
+ if(imm > 255) ADDI(p, dst, src, (oimm - ((imm & 0xff) << (32-shamt*2))));
+ }
+}
+
+uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) {
+ return 0xe5900000 | ((dst & 0xf) << 12)
+ | ((base & 0xf) << 16) | (offset & 0xfff) ;
+}
+
+void MOVI(uint32_t **p, uint8_t dst, uint32_t imm) {
+ uint32_t oimm = imm;
+
+ uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm);
+ if(shamt & 1) shamt -= 1;
+ imm >>= shamt;
+ shamt = (32 - shamt)/2;
+ *(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ;
+ if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2))));
+}
+
+uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; }
+uint32_t POP_LR() { return 0xe8bd8ff0; } //0xe8bd8000; }
+
+
+
+
+#endif
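
The shift arithmetic in MOVI() and ADDI() above is ARM's standard data-processing immediate scheme: an 8-bit value rotated right by twice a 4-bit rotate field, with constants that do not fit emitted as a MOV followed by ADD fix-ups (the recursive ADDI() call). The sketch below shows that encoding on its own; it is general ARM knowledge rather than anything additional from this header, and the sample constant is arbitrary:

    /* sketch: encode a constant as ARM's "imm8 rotated right by 2*rot" immediate */
    #include <stdio.h>
    #include <stdint.h>

    /* try each rotate amount; returns 1 if v fits a single immediate */
    static int arm_imm_encode(uint32_t v, uint32_t *rot, uint32_t *imm8) {
        for (uint32_t r = 0; r < 16; r++) {
            /* undo a right-rotation by 2*r: rotate v left by 2*r, then check it fits in 8 bits */
            uint32_t undone = r ? ((v << (2 * r)) | (v >> (32 - 2 * r))) : v;
            if (undone <= 0xffu) { *rot = r; *imm8 = undone; return 1; }
        }
        return 0;   /* would need a MOV + ADD sequence, as ADDI() generates */
    }

    int main(void) {
        uint32_t rot, imm8, v = 0x3f00;
        if (arm_imm_encode(v, &rot, &imm8)) {
            /* same field layout MOVI() writes: MOV (cond AL), dst = r0 */
            uint32_t insn = 0xe3a00000u | (0u << 12) | (rot << 8) | imm8;
            printf("mov r0, #0x%x -> %08x (imm8=0x%02x, rot=%u)\n",
                   (unsigned)v, (unsigned)insn, (unsigned)imm8, (unsigned)rot);
        }
        return 0;
    }
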
diff --git a/lib/ffts/src/codegen_sse.h b/lib/ffts/src/codegen_sse.h
new file mode 100644
index 0000000..ec85667
--- /dev/null
+++ b/lib/ffts/src/codegen_sse.h
@@ -0,0 +1,195 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+#ifndef __CODEGEN_SSE_H__
+#define __CODEGEN_SSE_H__
+
+void neon_x4(float *, size_t, float *);
+void neon_x8(float *, size_t, float *);
+void neon_x8_t(float *, size_t, float *);
+void leaf_ee_init();
+void leaf_ee();
+void leaf_oo();
+void leaf_eo();
+void leaf_oe();
+void leaf_end();
+void x_init();
+void x4();
+void x8_soft();
+void x8_hard();
+
+void sse_constants();
+void sse_constants_inv();
+
+// typedef uint8_t insns_t;
+
+extern const uint32_t sse_leaf_ee_offsets[8];
+extern const uint32_t sse_leaf_oo_offsets[8];
+extern const uint32_t sse_leaf_eo_offsets[8];
+extern const uint32_t sse_leaf_oe_offsets[8];
+
+#define EAX 0
+#define ECX 1
+#define EDX 2
+#define EBX 3
+#define ESI 6
+#define EDI 7
+#define EBP 5
+
+#define RAX 0
+#define RCX 1
+#define RDX 2
+#define RBX 3
+#define RSI 6
+#define RDI 7
+#define RBP 5
+#define R8 8
+#define R9 9
+#define R10 10
+#define R11 11
+#define R12 12
+#define R13 13
+#define R14 14
+#define R15 15
+
+void IMM8(uint8_t **p, int32_t imm) {
+ *(*p)++ = (imm & 0xff);
+}
+
+void IMM16(uint8_t **p, int32_t imm) {
+ int i;
+ for(i=0;i<2;i++) {
+ *(*p)++ = (imm & (0xff << (i*8))) >> (i*8);
+ }
+}
+void IMM32(uint8_t **p, int32_t imm) {
+ int i;
+ for(i=0;i<4;i++) {
+ *(*p)++ = (imm & (0xff << (i*8))) >> (i*8);
+ }
+}
+void IMM32_NI(uint8_t *p, int32_t imm) {
+ int i;
+ for(i=0;i<4;i++) {
+ *(p+i) = (imm & (0xff << (i*8))) >> (i*8);
+ }
+}
+
+int32_t READ_IMM32(uint8_t *p) {
+ int32_t rval = 0;
+ int i;
+ for(i=0;i<4;i++) {
+ rval |= *(p+i) << (i*8);
+ }
+ return rval;
+}
+
+void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) {
+// if(imm < 65536) *(*p)++ = 0x66;
+ if(dst >= 8) *(*p)++ = 0x41;
+
+ //if(imm < 65536 && imm >= 256) *(*p)++ = 0x66;
+
+ //if(imm >= 256)
+ *(*p)++ = 0xb8 | (dst & 0x7);
+// else *(*p)++ = 0xb0 | (dst & 0x7);
+
+ // if(imm < 256) IMM8(p, imm);
+// else
+//if(imm < 65536) IMM16(p, imm);
+//else
+ IMM32(p, imm);
+
+//if(dst < 8) {
+// *(*p)++ = 0xb8 + dst;
+//}else{
+// *(*p)++ = 0x49;
+// *(*p)++ = 0xc7;
+// *(*p)++ = 0xc0 | (dst - 8);
+//}
+//IMM32(p, imm);
+}
+
+void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) {
+ if(disp == 0) {
+ *(*p)++ = (rm & 7) | ((reg & 7) << 3);
+ }else if(disp <= 127 && disp >= -128) {
+ *(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3);
+ IMM8(p, disp);
+ }else{
+ *(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3);
+ IMM32(p, disp);
+ }
+}
+
+void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) {
+
+ *(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1);
+ *(*p)++ = 0x8d;
+ ADDRMODE(p, dst, base, disp);
+}
+
+void RET(uint8_t **p) {
+ *(*p)++ = 0xc3;
+}
+
+void ADDI(uint8_t **p, uint8_t dst, int32_t imm) {
+
+ if(dst >= 8) *(*p)++ = 0x49;
+ else *(*p)++ = 0x48;
+
+ if(imm > 127 || imm <= -128) *(*p)++ = 0x81;
+ else *(*p)++ = 0x83;
+
+ *(*p)++ = 0xc0 | (dst & 0x7);
+
+ if(imm > 127 || imm <= -128) IMM32(p, imm);
+ else IMM8(p, imm);
+}
+
+void CALL(uint8_t **p, uint8_t *func) {
+ *(*p)++ = 0xe8;
+ IMM32(p, ((void *)func) - (void *)(*p) - 4);
+}
+
+void PUSH(uint8_t **p, uint8_t reg) {
+ if(reg >= 8) *(*p)++ = 0x41;
+ *(*p)++ = 0x50 | (reg & 7);
+}
+void POP(uint8_t **p, uint8_t reg) {
+ if(reg >= 8) *(*p)++ = 0x41;
+ *(*p)++ = 0x58 | (reg & 7);
+}
+
+#endif
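
CALL() above emits a 5-byte x86 near call: the 0xE8 opcode followed by a little-endian rel32 measured from the end of the instruction, which is why the displacement is computed as target - *p - 4 once the opcode byte has been written. Below is a small sketch of just that encoding, using the same byte order as IMM32(); the buffer and target addresses are made up for illustration:

    /* sketch: emit CALL rel32 the way CALL()/IMM32() above do */
    #include <stdio.h>
    #include <stdint.h>

    static void imm32(uint8_t **p, int32_t imm) {        /* little-endian, as IMM32() */
        uint32_t u = (uint32_t)imm;
        for (int i = 0; i < 4; i++) *(*p)++ = (uint8_t)(u >> (i * 8));
    }

    static void call_rel(uint8_t **p, uint8_t *target) {
        *(*p)++ = 0xe8;                                   /* CALL rel32 opcode */
        imm32(p, (int32_t)(target - (*p + 4)));           /* rel32 = target - next-instruction address */
    }

    int main(void) {
        uint8_t buf[16], *fp = buf;
        uint8_t *fake_target = buf + 0x40;                /* pretend x_4/x_8 sits 0x40 bytes ahead */
        call_rel(&fp, fake_target);
        for (uint8_t *q = buf; q < fp; q++) printf("%02x ", *q);
        printf("\n");                                     /* prints: e8 3b 00 00 00 */
        return 0;
    }
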
diff --git a/lib/ffts/src/ffts.c b/lib/ffts/src/ffts.c
new file mode 100644
index 0000000..bec2177
--- /dev/null
+++ b/lib/ffts/src/ffts.c
@@ -0,0 +1,398 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "ffts.h"
+#include "macros.h"
+//#include "mini_macros.h"
+#include "patterns.h"
+#include "ffts_small.h"
+
+#ifdef DYNAMIC_DISABLED
+ #include "ffts_static.h"
+#else
+ #include "codegen.h"
+#endif
+
+#include <errno.h>
+ #include <sys/mman.h>
+ #include <string.h>
+ #include <limits.h> /* for PAGESIZE */
+
+#if __APPLE__
+ #include <libkern/OSCacheControl.h>
+#else
+#endif
+
+void ffts_execute(ffts_plan_t *p, const void * in, void * out) {
+ p->transform(p, (const float *)in, (float *)out);
+}
+
+void ffts_free(ffts_plan_t *p) {
+ p->destroy(p);
+}
+
+void ffts_free_1d(ffts_plan_t *p) {
+
+ size_t i;
+
+ if(p->ws) {
+ FFTS_FREE(p->ws);
+ }
+ if(p->is) free(p->is);
+ if(p->ws_is) free(p->ws_is);
+ if(p->offsets) free(p->offsets);
+ //free(p->transforms);
+ if(p->transforms) free(p->transforms);
+
+ if(p->transform_base) {
+ if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) {
+ perror("Couldn't mprotect");
+ exit(errno);
+ }
+ munmap(p->transform_base, p->transform_size);
+ //free(p->transform_base);
+ }
+ free(p);
+}
+
+ffts_plan_t *ffts_init_1d(size_t N, int sign) {
+ ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
+ size_t leafN = 8;
+ size_t i;
+
+#ifdef __arm__
+//#ifdef HAVE_NEON
+ V MULI_SIGN;
+
+ if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
+ else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
+//#endif
+#else
+ V MULI_SIGN;
+
+ if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
+ else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
+#endif
+
+ p->transform = NULL;
+ p->transform_base = NULL;
+ p->transforms = NULL;
+ p->is = NULL;
+ p->ws_is = NULL;
+ p->ws = NULL;
+ p->offsets = NULL;
+ p->destroy = ffts_free_1d;
+
+ if(N >= 32) {
+ ffts_init_offsets(p, N, leafN);
+#ifdef __arm__
+#ifdef HAVE_NEON
+ ffts_init_is(p, N, leafN, 1);
+#else
+ ffts_init_is(p, N, leafN, 1);
+#endif
+#else
+ ffts_init_is(p, N, leafN, 1);
+#endif
+
+ p->i0 = N/leafN/3+1;
+ p->i1 = N/leafN/3;
+ if((N/leafN) % 3 > 1) p->i1++;
+ p->i2 = N/leafN/3;
+
+ #ifdef __arm__
+ #ifdef HAVE_NEON
+ p->i0/=2;
+ p->i1/=2;
+ #endif
+ #else
+ p->i0/=2;
+ p->i1/=2;
+ #endif
+
+ }else{
+ p->transforms = malloc(2 * sizeof(transform_index_t));
+ p->transforms[0] = 0;
+ p->transforms[1] = 1;
+ if(N == 2) p->transform = &firstpass_2;
+ else if(N == 4 && sign == -1) p->transform = &firstpass_4_f;
+ else if(N == 4 && sign == 1) p->transform = &firstpass_4_b;
+ else if(N == 8 && sign == -1) p->transform = &firstpass_8_f;
+ else if(N == 8 && sign == 1) p->transform = &firstpass_8_b;
+ else if(N == 16 && sign == -1) p->transform = &firstpass_16_f;
+ else if(N == 16 && sign == 1) p->transform = &firstpass_16_b;
+
+ p->is = NULL;
+ p->offsets = NULL;
+ }
+
+ int hardcoded = 0;
+
+ /* LUTS */
+ size_t n_luts = __builtin_ctzl(N/leafN);
+ if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
+
+ if(n_luts >= 32) n_luts = 0;
+
+// fprintf(stderr, "n_luts = %zu\n", n_luts);
+
+ cdata_t *w;
+
+ int n = leafN*2;
+ if(hardcoded) n = 8;
+
+ size_t lut_size = 0;
+
+ for(i=0;i<n_luts;i++) {
+ if(!i || hardcoded) {
+ #ifdef __arm__
+ if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
+ else lut_size += n/4 * sizeof(cdata_t);
+ #else
+ lut_size += n/4 * 2 * sizeof(cdata_t);
+ #endif
+ n *= 2;
+ } else {
+ #ifdef __arm__
+ lut_size += n/8 * 3 * sizeof(cdata_t);
+ #else
+ lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
+ #endif
+ }
+ n *= 2;
+ }
+
+// lut_size *= 16;
+
+ // fprintf(stderr, "lut size = %zu\n", lut_size);
+ if(n_luts) {
+ p->ws = FFTS_MALLOC(lut_size,32);
+ p->ws_is = malloc(n_luts * sizeof(size_t));
+ }else{
+ p->ws = NULL;
+ p->ws_is = NULL;
+ }
+ w = p->ws;
+
+ n = leafN*2;
+ if(hardcoded) n = 8;
+
+ #ifdef HAVE_NEON
+ V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
+ #endif
+
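+	/* Build one twiddle LUT per radix level: the first (or hardcoded)
+	   level holds n/4 factors W(n,j); the remaining levels hold three
+	   interleaved factor streams W(n,2j), W(n,j) and W(n,j+n/8), each
+	   duplicated/sign-adjusted to suit the SIMD kernels. */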
+ for(i=0;i<n_luts;i++) {
+ p->ws_is[i] = w - (cdata_t *)p->ws;
+ //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);
+
+ if(!i || hardcoded) {
+ cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
+
+ size_t j;
+ for(j=0;j<n/4;j++) {
+ w0[j][0] = W_re(n,j);
+ w0[j][1] = W_im(n,j);
+ }
+
+
+ float *fw0 = (float *)w0;
+ #ifdef __arm__
+ if(N < 32) {
+ //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
+ float *fw = (float *)w;
+ V temp0, temp1, temp2;
+ for(j=0;j<n/4;j+=2) {
+ // #ifdef HAVE_NEON
+ temp0 = VLD(fw0 + j*2);
+ V re, im;
+ re = VDUPRE(temp0);
+ im = VDUPIM(temp0);
+ #ifdef HAVE_NEON
+ im = VXOR(im, MULI_SIGN);
+ //im = IMULI(sign>0, im);
+ #else
+ im = MULI(sign>0, im);
+ #endif
+ VST(fw + j*4 , re);
+ VST(fw + j*4+4, im);
+ // #endif
+ }
+ w += n/4 * 2;
+ }else{
+ //w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
+ float *fw = (float *)w;
+ #ifdef HAVE_NEON
+ VS temp0, temp1, temp2;
+ for(j=0;j<n/4;j+=4) {
+ temp0 = VLD2(fw0 + j*2);
+ temp0.val[1] = VXOR(temp0.val[1], neg);
+ STORESPR(fw + j*2, temp0);
+ }
+ #else
+ for(j=0;j<n/4;j+=1) {
+ fw[j*2] = fw0[j*2];
+ fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
+ }
+ #endif
+ w += n/4;
+ }
+ #else
+ //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
+ float *fw = (float *)w;
+ V temp0, temp1, temp2;
+ for(j=0;j<n/4;j+=2) {
+ temp0 = VLD(fw0 + j*2);
+ V re, im;
+ re = VDUPRE(temp0);
+ im = VDUPIM(temp0);
+ im = VXOR(im, MULI_SIGN);
+ VST(fw + j*4 , re);
+ VST(fw + j*4+4, im);
+ }
+ w += n/4 * 2;
+ #endif
+
+ FFTS_FREE(w0);
+ }else{
+
+ cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
+ cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
+ cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
+
+ size_t j;
+ for(j=0;j<n/8;j++) {
+ w0[j][0] = W_re(n,j*2);
+ w0[j][1] = W_im(n,j*2);
+ w1[j][0] = W_re(n,j);
+ w1[j][1] = W_im(n,j);
+ w2[j][0] = W_re(n,j + (n/8));
+ w2[j][1] = W_im(n,j + (n/8));
+
+ }
+
+ float *fw0 = (float *)w0;
+ float *fw1 = (float *)w1;
+ float *fw2 = (float *)w2;
+ #ifdef __arm__
+ //w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
+ float *fw = (float *)w;
+ #ifdef HAVE_NEON
+ VS temp0, temp1, temp2;
+ for(j=0;j<n/8;j+=4) {
+ temp0 = VLD2(fw0 + j*2);
+ temp0.val[1] = VXOR(temp0.val[1], neg);
+ STORESPR(fw + j*2*3, temp0);
+ temp1 = VLD2(fw1 + j*2);
+ temp1.val[1] = VXOR(temp1.val[1], neg);
+ STORESPR(fw + j*2*3 + 8, temp1);
+ temp2 = VLD2(fw2 + j*2);
+ temp2.val[1] = VXOR(temp2.val[1], neg);
+ STORESPR(fw + j*2*3 + 16, temp2);
+ }
+ #else
+ for(j=0;j<n/8;j+=1) {
+ fw[j*6] = fw0[j*2];
+ fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
+ fw[j*6+2] = fw1[j*2+0];
+ fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1];
+ fw[j*6+4] = fw2[j*2+0];
+ fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1];
+ }
+ #endif
+ w += n/8 * 3;
+ #else
+ //w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32);
+ float *fw = (float *)w;
+ V temp0, temp1, temp2, re, im;
+ for(j=0;j<n/8;j+=2) {
+ temp0 = VLD(fw0 + j*2);
+ re = VDUPRE(temp0);
+ im = VDUPIM(temp0);
+ im = VXOR(im, MULI_SIGN);
+ VST(fw + j*2*6 , re);
+ VST(fw + j*2*6+4, im);
+
+ temp1 = VLD(fw1 + j*2);
+ re = VDUPRE(temp1);
+ im = VDUPIM(temp1);
+ im = VXOR(im, MULI_SIGN);
+ VST(fw + j*2*6+8 , re);
+ VST(fw + j*2*6+12, im);
+
+ temp2 = VLD(fw2 + j*2);
+ re = VDUPRE(temp2);
+ im = VDUPIM(temp2);
+ im = VXOR(im, MULI_SIGN);
+ VST(fw + j*2*6+16, re);
+ VST(fw + j*2*6+20, im);
+ }
+ w += n/8 * 3 * 2;
+ #endif
+
+ FFTS_FREE(w0);
+ FFTS_FREE(w1);
+ FFTS_FREE(w2);
+ }
+ ///p->ws[i] = w;
+
+ n *= 2;
+ }
+
+ float *tmp = (float *)p->ws;
+
+ if(sign < 0) {
+ p->oe_ws = (void *)(&w_data[4]);
+ p->ee_ws = (void *)(w_data);
+ p->eo_ws = (void *)(&w_data[4]);
+ }else{
+ p->oe_ws = (void *)(w_data + 12);
+ p->ee_ws = (void *)(w_data + 8);
+ p->eo_ws = (void *)(w_data + 12);
+ }
+
+ p->N = N;
+ p->lastlut = w;
+ p->n_luts = n_luts;
+#ifdef DYNAMIC_DISABLED
+ if(sign < 0) {
+ if(N >= 32) p->transform = ffts_static_transform_f;
+ }else{
+ if(N >= 32) p->transform = ffts_static_transform_i;
+ }
+
+#else
+ if(N>=32) ffts_generate_func_code(p, N, leafN, sign);
+#endif
+
+ return p;
+}
+
diff --git a/lib/ffts/src/ffts.h b/lib/ffts/src/ffts.h
new file mode 100644
index 0000000..4409029
--- /dev/null
+++ b/lib/ffts/src/ffts.h
@@ -0,0 +1,177 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifndef __CP_SSE_H__
+#define __CP_SSE_H__
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+//#include <stdalign.h>
+
+//#include "codegen.h"
+#include "types.h"
+
+#define PI 3.1415926535897932384626433832795028841971693993751058209
+
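+/* Hard-coded leaf twiddle constants (+-1/sqrt(2) pairs); the forward
+   transform uses the first half of the table and the inverse the second
+   half (see the oe_ws/ee_ws/eo_ws pointers set up in ffts_init_1d). */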
+static const __attribute__ ((aligned(64))) float w_data[16] = {
+ 0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
+ -0.70710678118654757273731092936941, -0.70710678118654746171500846685376,
+ 1.0f, 0.70710678118654757273731092936941f,
+ -0.0f, -0.70710678118654746171500846685376,
+ 0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
+ 0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
+ 1.0f, 0.70710678118654757273731092936941f,
+ 0.0f, 0.70710678118654746171500846685376
+};
+
+__INLINE float W_re(float N, float k) { return cos(-2.0f * PI * k / N); }
+__INLINE float W_im(float N, float k) { return sin(-2.0f * PI * k / N); }
+
+typedef size_t transform_index_t;
+
+//typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
+typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
+
+typedef struct _ffts_plan_t ffts_plan_t;
+
+/**
+ * Contains all the information needed to perform an FFT.
+ *
+ * DO NOT CHANGE THE ORDER OF MEMBERS:
+ * THE ASSEMBLY CODE USES HARD-CODED OFFSETS TO REFERENCE
+ * SOME OF THESE VARIABLES!!
+ */
+struct _ffts_plan_t {
+
+ /**
+	 * Precomputed offsets for the leaf transforms (set up by ffts_init_offsets)
+ */
+ ptrdiff_t *offsets;
+#ifdef DYNAMIC_DISABLED
+ /**
+ * Twiddle factors
+ */
+ void *ws;
+ /**
+ * ee - 2 size x size8
+ * oo - 2 x size4 in parallel
+ * oe -
+ */
+ void *oe_ws, *eo_ws, *ee_ws;
+#else
+ void __attribute__((aligned(32))) *ws;
+ void __attribute__((aligned(32))) *oe_ws, *eo_ws, *ee_ws;
+#endif
+ /**
+ * Pointer into an array of precomputed indexes for the input data array
+ */
+ ptrdiff_t *is;
+
+ /**
+ * Twiddle Factor Indexes
+ */
+ size_t *ws_is;
+
+ /**
+ * Size of the loops for the base cases
+ */
+ size_t i0, i1, n_luts;
+
+ /**
+	 * Size of the transform
+ */
+ size_t N;
+ void *lastlut;
+ /**
+	 * Used by the multidimensional code
+ */
+ transform_index_t *transforms;
+ //transform_func_t transform;
+
+ /**
+ * Pointer to the dynamically generated function
+ * that will execute the FFT
+ */
+ void (*transform)(ffts_plan_t * , const void * , void * );
+
+ /**
+	 * Pointer to the base memory address of
+	 * the transform function
+ */
+ void *transform_base;
+
+ /**
+	 * Size of the memory block containing the
+ * generated code
+ */
+ size_t transform_size;
+
+ /**
+	 * Points to the constant variables used by
+	 * the assembly code
+ */
+ void *constants;
+
+ // multi-dimensional stuff:
+ struct _ffts_plan_t **plans;
+ int rank;
+ size_t *Ns, *Ms;
+ void *buf;
+
+ void *transpose_buf;
+
+ /**
+ * Pointer to the destroy function
+ * to clean up the plan after use
+	 * (differs for real and multidimensional transforms)
+ */
+ void (*destroy)(ffts_plan_t *);
+
+ /**
+	 * Coefficients for the real-valued transforms
+ */
+ float *A, *B;
+
+ size_t i2;
+};
+
+
+void ffts_free(ffts_plan_t *);
+ffts_plan_t *ffts_init_1d(size_t N, int sign);
+void ffts_execute(ffts_plan_t *, const void *, void *);
+#endif
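
The three declarations above (ffts_init_1d, ffts_execute, ffts_free) make up the public 1-D complex API. As an illustrative usage sketch, not part of this patch (the size N, the 32-byte alignment choice and posix_memalign are assumptions for the example): data is interleaved re/im single-precision floats, sign -1 selects the forward transform and +1 the backward transform, and the buffers should be SIMD-aligned.

    #include <stdlib.h>
    #include "ffts.h"

    int main(void) {
        size_t N = 64;                         /* power-of-two transform size */
        float *in, *out;                       /* interleaved re,im pairs     */
        if (posix_memalign((void **)&in,  32, 2 * N * sizeof(float)) ||
            posix_memalign((void **)&out, 32, 2 * N * sizeof(float)))
            return 1;

        for (size_t i = 0; i < 2 * N; i++) in[i] = 0.0f;
        in[0] = 1.0f;                          /* unit impulse -> flat spectrum */

        ffts_plan_t *p = ffts_init_1d(N, -1);  /* -1 = forward, +1 = backward */
        ffts_execute(p, in, out);
        ffts_free(p);

        free(in);
        free(out);
        return 0;
    }
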
diff --git a/lib/ffts/src/ffts_nd.c b/lib/ffts/src/ffts_nd.c
new file mode 100644
index 0000000..ae9b148
--- /dev/null
+++ b/lib/ffts/src/ffts_nd.c
@@ -0,0 +1,282 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_nd.h"
+
+#ifdef HAVE_NEON
+#include "neon.h"
+#endif
+
+void ffts_free_nd(ffts_plan_t *p) {
+
+ int i;
+ for(i=0;i<p->rank;i++) {
+
+ ffts_plan_t *x = p->plans[i];
+ int k;
+ for(k=0;k<i;k++) {
+ if(p->Ms[i] == p->Ms[k]) x = NULL;
+ }
+
+ if(x) ffts_free(x);
+ }
+
+ free(p->Ns);
+ free(p->Ms);
+ free(p->plans);
+ free(p->buf);
+ free(p->transpose_buf);
+ free(p);
+}
+#define TSIZE 8
+#include <string.h>
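+/* Transpose a w x h matrix of 8-byte elements (one complex float each)
+   in 8x8 tiles, staging each tile through a small aligned buffer so the
+   scattered column writes stay cache-friendly. */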
+void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
+
+#ifdef HAVE_NEON
+ size_t i,j,k;
+ int linebytes = w*8;
+
+ for(j=0;j<h;j+=8) {
+ for(i=0;i<w;i+=8) {
+ neon_transpose_to_buf(in + j*w + i, buf, w);
+
+ uint64_t *p = out + i*h + j;
+ uint64_t *pbuf = buf;
+ uint64_t *ptemp;
+
+ __asm__ __volatile__(
+ "mov %[ptemp], %[p]\n\t"
+ "add %[p], %[p], %[w], lsl #3\n\t"
+ "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
+ "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
+ "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
+ "mov %[ptemp], %[p]\n\t"
+ "add %[p], %[p], %[w], lsl #3\n\t"
+ "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
+ "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
+ "mov %[ptemp], %[p]\n\t"
+ "add %[p], %[p], %[w], lsl #3\n\t"
+ "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
+ "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
+ "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
+ "mov %[ptemp], %[p]\n\t"
+ "add %[p], %[p], %[w], lsl #3\n\t"
+ "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
+ "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
+ "mov %[ptemp], %[p]\n\t"
+ "add %[p], %[p], %[w], lsl #3\n\t"
+ "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
+ "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
+ "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
+ "mov %[ptemp], %[p]\n\t"
+ "add %[p], %[p], %[w], lsl #3\n\t"
+ "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
+ "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
+ "mov %[ptemp], %[p]\n\t"
+ "add %[p], %[p], %[w], lsl #3\n\t"
+ "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
+ "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
+ "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
+ "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
+ "mov %[ptemp], %[p]\n\t"
+ "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
+ "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
+
+ : [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp)
+ : [w] "r" (w)
+			  : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+// out[i*h + j] = in[j*w + i];
+ }
+ }
+#else
+#ifdef HAVE_SSE
+ uint64_t tmp[TSIZE*TSIZE] __attribute__((aligned(64)));
+ int tx, ty;
+ int x, y;
+ int tw = w / TSIZE;
+ int th = h / TSIZE;
+ for (ty=0;ty<th;ty++) {
+ for (tx=0;tx<tw;tx++) {
+ uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
+ uint64_t *op0 = tmp;//out + h*TSIZE*tx + ty*TSIZE;
+
+ // Copy/transpose to tmp
+ for (y=0;y<TSIZE;y+=2) {
+ //for (x=0;x<TSIZE;x+=2) {
+ //op[x*TSIZE] = ip[x];
+ __m128d q0 = _mm_load_pd((double *)(ip0 + 0*w));
+ __m128d q1 = _mm_load_pd((double *)(ip0 + 1*w));
+ __m128d q2 = _mm_load_pd((double *)(ip0 + 2*w));
+ __m128d q3 = _mm_load_pd((double *)(ip0 + 3*w));
+ __m128d q4 = _mm_load_pd((double *)(ip0 + 4*w));
+ __m128d q5 = _mm_load_pd((double *)(ip0 + 5*w));
+ __m128d q6 = _mm_load_pd((double *)(ip0 + 6*w));
+ __m128d q7 = _mm_load_pd((double *)(ip0 + 7*w));
+ ip0 += 2;
+
+ __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
+ __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
+ __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
+ __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
+ __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
+ __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
+ __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
+ __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
+ //_mm_store_pd((double *)(op0 + y*h + x), t0);
+ //_mm_store_pd((double *)(op0 + y*h + x + h), t1);
+ _mm_store_pd((double *)(op0 + 0), t0);
+ _mm_store_pd((double *)(op0 + 0 + TSIZE), t1);
+ _mm_store_pd((double *)(op0 + 2 ), t2);
+ _mm_store_pd((double *)(op0 + 2 + TSIZE), t3);
+ _mm_store_pd((double *)(op0 + 4 ), t4);
+ _mm_store_pd((double *)(op0 + 4 + TSIZE), t5);
+ _mm_store_pd((double *)(op0 + 6 ), t6);
+ _mm_store_pd((double *)(op0 + 6 + TSIZE), t7);
+ //}
+ op0 += 2*TSIZE;
+ }
+
+ op0 = out + h*tx*TSIZE + ty*TSIZE;
+ ip0 = tmp;
+ for (y=0;y<TSIZE;y+=1) {
+ // memcpy(op0, ip0, TSIZE * sizeof(*ip0));
+
+ __m128d q0 = _mm_load_pd((double *)(ip0 + 0));
+ __m128d q1 = _mm_load_pd((double *)(ip0 + 2));
+ __m128d q2 = _mm_load_pd((double *)(ip0 + 4));
+ __m128d q3 = _mm_load_pd((double *)(ip0 + 6));
+ _mm_store_pd((double *)(op0 + 0), q0);
+ _mm_store_pd((double *)(op0 + 2), q1);
+ _mm_store_pd((double *)(op0 + 4), q2);
+ _mm_store_pd((double *)(op0 + 6), q3);
+
+ op0 += h;
+ ip0 += TSIZE;
+ }
+
+ }
+ }
+/*
+ size_t i,j;
+ for(i=0;i<w;i+=2) {
+ for(j=0;j<h;j+=2) {
+// out[i*h + j] = in[j*w + i];
+ __m128d q0 = _mm_load_pd((double *)(in + j*w + i));
+ __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
+ __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
+ __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
+ _mm_store_pd((double *)(out + i*h + j), t0);
+ _mm_store_pd((double *)(out + i*h + j + h), t1);
+ }
+ }
+*/
+#endif
+#endif
+
+}
+
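+/* Apply the 1-D plan along the rows, then transpose, once per dimension,
+   so that every pass operates on contiguous data. */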
+void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out) {
+
+ uint64_t *din = (uint64_t *)in;
+ uint64_t *buf = p->buf;
+ uint64_t *dout = (uint64_t *)out;
+
+ size_t i,j;
+ for(i=0;i<p->Ns[0];i++) {
+ p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * p->Ms[0]));
+ }
+ ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf);
+
+ for(i=1;i<p->rank;i++) {
+ for(j=0;j<p->Ns[i];j++) {
+ p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));
+ }
+ ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
+ }
+}
+
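+/* One 1-D sub-plan per dimension; dimensions with equal transform
+   lengths share a single plan (ffts_free_nd takes care not to free the
+   shared ones twice). */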
+ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign) {
+ size_t vol = 1;
+
+ ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
+
+ p->transform = &ffts_execute_nd;
+ p->destroy = &ffts_free_nd;
+
+ p->rank = rank;
+ p->Ns = malloc(sizeof(size_t) * rank);
+ p->Ms = malloc(sizeof(size_t) * rank);
+ p->plans = malloc(sizeof(ffts_plan_t **) * rank);
+ int i;
+ for(i=0;i<rank;i++) {
+ p->Ns[i] = Ns[i];
+ vol *= Ns[i];
+ }
+ p->buf = valloc(sizeof(float) * 2 * vol);
+
+ for(i=0;i<rank;i++) {
+ p->Ms[i] = vol / p->Ns[i];
+
+ p->plans[i] = NULL;
+ int k;
+ for(k=0;k<i;k++) {
+ if(p->Ms[k] == p->Ms[i])
+ p->plans[i] = p->plans[k];
+ }
+
+ if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
+ }
+
+ p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
+ return p;
+}
+
+
+ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign) {
+ size_t Ns[2];
+ Ns[0] = N1;
+ Ns[1] = N2;
+ return ffts_init_nd(2, Ns, sign);
+}
diff --git a/lib/ffts/src/ffts_nd.h b/lib/ffts/src/ffts_nd.h
new file mode 100644
index 0000000..8f0c855
--- /dev/null
+++ b/lib/ffts/src/ffts_nd.h
@@ -0,0 +1,58 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __FFTS_ND_H__
+#define __FFTS_ND_H__
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include "ffts.h"
+
+#ifdef HAVE_NEON
+ #include <arm_neon.h>
+#endif
+#ifdef HAVE_SSE
+ #include <xmmintrin.h>
+#endif
+
+void ffts_free_nd(ffts_plan_t *p);
+void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf);
+
+void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out);
+ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign);
+ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign);
+
+#endif
+
diff --git a/lib/ffts/src/ffts_real.c b/lib/ffts/src/ffts_real.c
new file mode 100644
index 0000000..bdb6eac
--- /dev/null
+++ b/lib/ffts/src/ffts_real.c
@@ -0,0 +1,226 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_real.h"
+
+void ffts_free_1d_real(ffts_plan_t *p) {
+ ffts_free(p->plans[0]);
+ free(p->A);
+ free(p->B);
+ free(p->plans);
+ free(p->buf);
+ free(p);
+}
+
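+/* The length-N real forward transform is computed as a length-N/2
+   complex transform followed by a post-processing pass that combines
+   bin k with bin N/2-k using the precomputed A/B coefficient tables,
+   yielding the N/2+1 non-redundant output bins. */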
+void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) {
+ float *out = (float *)vout;
+ float *buf = (float *)p->buf;
+ float *A = p->A;
+ float *B = p->B;
+
+ p->plans[0]->transform(p->plans[0], vin, buf);
+
+ size_t N = p->N;
+ buf[N] = buf[0];
+ buf[N+1] = buf[1];
+
+ float *p_buf0 = buf;
+ float *p_buf1 = buf + N - 2;
+ float *p_out = out;
+
+ size_t i;
+#ifdef __ARM_NEON__
+ for(i=0;i<N/2;i+=2) {
+ __asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t"
+ "vld1.32 {q9}, [%[pb], :128]!\n\t"
+ "vld1.32 {q10}, [%[buf0], :128]!\n\t"
+ "vld1.32 {q11}, [%[buf1], :64]\n\t"
+ "sub %[buf1], %[buf1], #16\n\t"
+
+ "vdup.32 d26, d16[1]\n\t"
+ "vdup.32 d27, d17[1]\n\t"
+ "vdup.32 d24, d16[0]\n\t"
+ "vdup.32 d25, d17[0]\n\t"
+
+ "vdup.32 d30, d23[1]\n\t"
+ "vdup.32 d31, d22[1]\n\t"
+ "vdup.32 d28, d23[0]\n\t"
+ "vdup.32 d29, d22[0]\n\t"
+
+ "vmul.f32 q13, q13, q10\n\t"
+ "vmul.f32 q15, q15, q9\n\t"
+ "vmul.f32 q12, q12, q10\n\t"
+ "vmul.f32 q14, q14, q9\n\t"
+ "vrev64.f32 q13, q13\n\t"
+ "vrev64.f32 q15, q15\n\t"
+
+ "vtrn.32 d26, d27\n\t"
+ "vtrn.32 d30, d31\n\t"
+ "vneg.f32 d26, d26\n\t"
+ "vneg.f32 d31, d31\n\t"
+ "vtrn.32 d26, d27\n\t"
+ "vtrn.32 d30, d31\n\t"
+
+ "vadd.f32 q12, q12, q14\n\t"
+ "vadd.f32 q13, q13, q15\n\t"
+ "vadd.f32 q12, q12, q13\n\t"
+ "vst1.32 {q12}, [%[pout], :128]!\n\t"
+ : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
+ [pout] "+r" (p_out)
+ :
+ : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+#else
+ for(i=0;i<N/2;i++) {
+ out[2*i] = buf[2*i]*A[2*i] - buf[2*i+1]*A[2*i+1] + buf[N-2*i]*B[2*i] + buf[N-2*i+1]*B[2*i+1];
+ out[2*i+1] = buf[2*i+1]*A[2*i] + buf[2*i]*A[2*i+1] + buf[N-2*i]*B[2*i+1] - buf[N-2*i+1]*B[2*i];
+
+// out[2*N-2*i] = out[2*i];
+// out[2*N-2*i+1] = -out[2*i+1];
+
+#endif
+ }
+
+ out[N] = buf[0] - buf[1];
+ out[N+1] = 0.0f;
+
+}
+
+void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) {
+ float *out = (float *)vout;
+ float *in = (float *)vin;
+ float *buf = (float *)p->buf;
+ float *A = p->A;
+ float *B = p->B;
+ size_t N = p->N;
+
+ float *p_buf0 = in;
+ float *p_buf1 = in + N - 2;
+
+ float *p_out = buf;
+
+ size_t i;
+#ifdef __ARM_NEON__
+ for(i=0;i<N/2;i+=2) {
+ __asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t"
+ "vld1.32 {q9}, [%[pb], :128]!\n\t"
+ "vld1.32 {q10}, [%[buf0], :128]!\n\t"
+ "vld1.32 {q11}, [%[buf1], :64]\n\t"
+ "sub %[buf1], %[buf1], #16\n\t"
+
+ "vdup.32 d26, d16[1]\n\t"
+ "vdup.32 d27, d17[1]\n\t"
+ "vdup.32 d24, d16[0]\n\t"
+ "vdup.32 d25, d17[0]\n\t"
+
+ "vdup.32 d30, d23[1]\n\t"
+ "vdup.32 d31, d22[1]\n\t"
+ "vdup.32 d28, d23[0]\n\t"
+ "vdup.32 d29, d22[0]\n\t"
+
+ "vmul.f32 q13, q13, q10\n\t"
+ "vmul.f32 q15, q15, q9\n\t"
+ "vmul.f32 q12, q12, q10\n\t"
+ "vmul.f32 q14, q14, q9\n\t"
+ "vrev64.f32 q13, q13\n\t"
+ "vrev64.f32 q15, q15\n\t"
+
+ "vtrn.32 d26, d27\n\t"
+ "vtrn.32 d28, d29\n\t"
+ "vneg.f32 d27, d27\n\t"
+ "vneg.f32 d29, d29\n\t"
+ "vtrn.32 d26, d27\n\t"
+ "vtrn.32 d28, d29\n\t"
+
+ "vadd.f32 q12, q12, q14\n\t"
+ "vsub.f32 q13, q13, q15\n\t"
+ "vadd.f32 q12, q12, q13\n\t"
+ "vst1.32 {q12}, [%[pout], :128]!\n\t"
+ : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
+ [pout] "+r" (p_out)
+ :
+ : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+
+
+#else
+ for(i=0;i<N/2;i++) {
+ buf[2*i] = in[2*i]*A[2*i] + in[2*i+1]*A[2*i+1] + in[N-2*i]*B[2*i] - in[N-2*i+1]*B[2*i+1];
+ buf[2*i+1] = in[2*i+1]*A[2*i] - in[2*i]*A[2*i+1] - in[N-2*i]*B[2*i+1] - in[N-2*i+1]*B[2*i];
+#endif
+}
+
+ p->plans[0]->transform(p->plans[0], buf, out);
+
+}
+
+ffts_plan_t *ffts_init_1d_real(size_t N, int sign) {
+ ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
+
+ if(sign < 0) p->transform = &ffts_execute_1d_real;
+ else p->transform = &ffts_execute_1d_real_inv;
+
+ p->destroy = &ffts_free_1d_real;
+ p->N = N;
+ p->rank = 1;
+ p->plans = malloc(sizeof(ffts_plan_t **) * 1);
+
+ p->plans[0] = ffts_init_1d(N/2, sign);
+
+ p->buf = valloc(sizeof(float) * 2 * ((N/2) + 1));
+
+ p->A = valloc(sizeof(float) * N);
+ p->B = valloc(sizeof(float) * N);
+
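+	/* With theta = 2*pi*k/N these are the standard real-FFT split tables
+	   A[k] = 0.5*(1 - i*e^(-i*theta)) and B[k] = 0.5*(1 + i*e^(-i*theta)),
+	   stored as interleaved re/im; the inverse tables drop the 0.5 scale. */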
+ if(sign < 0) {
+ int i;
+ for (i = 0; i < N/2; i++) {
+ p->A[2 * i] = 0.5 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
+ p->A[2 * i + 1] = 0.5 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
+ p->B[2 * i] = 0.5 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
+ p->B[2 * i + 1] = 0.5 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
+ }
+ }else{
+ int i;
+ for (i = 0; i < N/2; i++) {
+ p->A[2 * i] = 1.0 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
+ p->A[2 * i + 1] = 1.0 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
+ p->B[2 * i] = 1.0 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
+ p->B[2 * i + 1] = 1.0 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
+ }
+ }
+
+ return p;
+}
+
+
diff --git a/lib/ffts/src/ffts_real.h b/lib/ffts/src/ffts_real.h
new file mode 100644
index 0000000..bf8834d
--- /dev/null
+++ b/lib/ffts/src/ffts_real.h
@@ -0,0 +1,53 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __FFTS_REAL_H__
+#define __FFTS_REAL_H__
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include "ffts.h"
+
+#ifdef HAVE_NEON
+ #include <arm_neon.h>
+#endif
+#ifdef HAVE_SSE
+ #include <xmmintrin.h>
+#endif
+
+ffts_plan_t *ffts_init_1d_real(size_t N, int sign);
+
+#endif
+
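As a usage note (an illustrative sketch, not part of this patch; N and posix_memalign are assumptions for the example): the real forward transform consumes N real samples and produces the N/2+1 non-redundant complex bins, i.e. N+2 output floats, with the Nyquist bin stored last.

    #include <stdlib.h>
    #include "ffts_real.h"

    int main(void) {
        size_t N = 64;
        float *in, *out;
        if (posix_memalign((void **)&in,  32, N * sizeof(float)) ||      /* N real samples     */
            posix_memalign((void **)&out, 32, (N + 2) * sizeof(float)))  /* N/2+1 complex bins */
            return 1;

        for (size_t i = 0; i < N; i++) in[i] = (float)i;

        ffts_plan_t *p = ffts_init_1d_real(N, -1);  /* -1 = forward */
        ffts_execute(p, in, out);
        ffts_free(p);

        free(in);
        free(out);
        return 0;
    }
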
diff --git a/lib/ffts/src/ffts_real_nd.c b/lib/ffts/src/ffts_real_nd.c
new file mode 100644
index 0000000..bf46254
--- /dev/null
+++ b/lib/ffts/src/ffts_real_nd.c
@@ -0,0 +1,177 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_real_nd.h"
+
+#ifdef __ARM_NEON__
+#include "neon.h"
+#endif
+
+void ffts_free_nd_real(ffts_plan_t *p) {
+
+ int i;
+ for(i=0;i<p->rank;i++) {
+
+ ffts_plan_t *x = p->plans[i];
+
+ int k;
+ for(k=i+1;k<p->rank;k++) {
+ if(x == p->plans[k]) p->plans[k] = NULL;
+ }
+
+ if(x) ffts_free(x);
+ }
+
+ free(p->Ns);
+ free(p->Ms);
+ free(p->plans);
+ free(p->buf);
+ free(p->transpose_buf);
+ free(p);
+}
+
+void ffts_scalar_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
+
+ size_t i,j;
+ for(i=0;i<w;i+=1) {
+ for(j=0;j<h;j+=1) {
+ out[i*h + j] = in[j*w + i];
+ }
+ }
+
+}
+
+void ffts_execute_nd_real(ffts_plan_t *p, const void * in, void * out) {
+
+ uint32_t *din = (uint32_t *)in;
+ uint64_t *buf = p->buf;
+ uint64_t *dout = (uint64_t *)out;
+
+ size_t i,j;
+ for(i=0;i<p->Ns[0];i++) {
+ p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * (p->Ms[0] / 2 + 1)));
+ }
+ ffts_scalar_transpose(buf, dout, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf);
+
+ for(i=1;i<p->rank;i++) {
+ for(j=0;j<p->Ns[i];j++) {
+ p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));
+ }
+ ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
+ }
+}
+
+void ffts_execute_nd_real_inv(ffts_plan_t *p, const void * in, void * out) {
+
+ uint64_t *din = (uint64_t *)in;
+ uint64_t *buf = p->buf;
+ uint64_t *dout = (uint64_t *)out;
+
+ float *bufr = (float *)(p->buf);
+ float *doutr = (float *)out;
+
+ size_t i,j;
+ ffts_scalar_transpose(din, buf, p->Ms[0], p->Ns[0], p->transpose_buf);
+
+ for(i=0;i<p->Ms[0];i++) {
+ p->plans[0]->transform(p->plans[0], buf + (i * p->Ns[0]), dout + (i * p->Ns[0]));
+ }
+
+ ffts_scalar_transpose(dout, buf, p->Ns[0], p->Ms[0], p->transpose_buf);
+ for(j=0;j<p->Ms[1];j++) {
+ p->plans[1]->transform(p->plans[1], buf + (j * (p->Ms[0])), &doutr[j * p->Ns[1]]);
+ }
+}
+
+ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) {
+ size_t vol = 1;
+
+ ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
+
+ if(sign < 0) p->transform = &ffts_execute_nd_real;
+ else p->transform = &ffts_execute_nd_real_inv;
+
+ p->destroy = &ffts_free_nd_real;
+
+ p->rank = rank;
+ p->Ns = malloc(sizeof(size_t) * rank);
+ p->Ms = malloc(sizeof(size_t) * rank);
+ p->plans = malloc(sizeof(ffts_plan_t **) * rank);
+ int i;
+ for(i=0;i<rank;i++) {
+ p->Ns[i] = Ns[i];
+ vol *= Ns[i];
+ }
+ p->buf = valloc(sizeof(float) * 2 * vol);
+
+ for(i=0;i<rank;i++) {
+ p->Ms[i] = vol / p->Ns[i];
+
+ p->plans[i] = NULL;
+ int k;
+
+ if(sign < 0) {
+ for(k=1;k<i;k++) {
+ if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k];
+ }
+ if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
+ else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
+ }else{
+ for(k=0;k<i;k++) {
+ if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k];
+ }
+ if(i==rank-1) p->plans[i] = ffts_init_1d_real(p->Ns[i], sign);
+ else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign);
+ }
+ }
+ if(sign < 0) {
+ for(i=1;i<rank;i++) {
+ p->Ns[i] = p->Ns[i] / 2 + 1;
+ }
+ }else{
+ for(i=0;i<rank-1;i++) {
+ p->Ms[i] = p->Ms[i] / 2 + 1;
+ }
+ }
+
+ p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
+ return p;
+}
+
+
+ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) {
+ size_t Ns[2];
+ Ns[0] = N1;
+ Ns[1] = N2;
+ return ffts_init_nd_real(2, Ns, sign);
+}
diff --git a/lib/ffts/src/ffts_real_nd.h b/lib/ffts/src/ffts_real_nd.h
new file mode 100644
index 0000000..d777d42
--- /dev/null
+++ b/lib/ffts/src/ffts_real_nd.h
@@ -0,0 +1,53 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __FFTS_REAL_ND_H__
+#define __FFTS_REAL_ND_H__
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include "ffts_nd.h"
+#include "ffts_real.h"
+#include "ffts.h"
+
+#ifdef HAVE_NEON
+ #include <arm_neon.h>
+#endif
+#ifdef HAVE_SSE
+ #include <xmmintrin.h>
+#endif
+
+#endif
+
diff --git a/lib/ffts/src/ffts_small.c b/lib/ffts/src/ffts_small.c
new file mode 100644
index 0000000..ddd2d3e
--- /dev/null
+++ b/lib/ffts/src/ffts_small.c
@@ -0,0 +1,156 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+ Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts.h"
+#include "macros.h"
+
+#include <stdlib.h>
+
+#define DEBUG(x)
+
+#include "ffts_small.h"
+
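+/* Hard-coded transforms for the small sizes (N <= 16) that bypass the
+   generated/recursive code path: the 8- and 16-point versions are built
+   from the L_*, K_N and S_4 vector macros, while the 2- and 4-point
+   versions are written out in scalar form. */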
+ void firstpass_16_f(ffts_plan_t * p, const void * in, void * out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
+ float *LUT8 = p->ws;
+
+ L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
+ L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
+ K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
+ S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
+ K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
+ S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
+}
+
+ void firstpass_16_b(ffts_plan_t * p, const void * in, void * out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
+ float *LUT8 = p->ws;
+
+ L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
+ L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
+ K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
+ S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
+ K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
+ S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
+}
+
+
+ void firstpass_8_f(ffts_plan_t *p, const void *in, void *out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ V r0_1, r2_3, r4_5, r6_7;
+ float *LUT8 = p->ws + p->ws_is[0];
+
+ L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
+ K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
+ S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
+}
+
+ void firstpass_8_b(ffts_plan_t *p, const void *in, void *out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ V r0_1, r2_3, r4_5, r6_7;
+ float *LUT8 = p->ws + p->ws_is[0];
+
+ L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
+ K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
+ S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
+}
+
+
+ void firstpass_4_f(ffts_plan_t *p, const void *in, void *out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
+ t0[0] = din[0]; t0[1] = din[1];
+ t1[0] = din[4]; t1[1] = din[5];
+ t2[0] = din[2]; t2[1] = din[3];
+ t3[0] = din[6]; t3[1] = din[7];
+
+ t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
+ t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
+ t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
+ t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
+
+ dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
+ dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
+ dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0];
+ dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0];
+}
+
+ void firstpass_4_b(ffts_plan_t *p, const void *in, void *out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
+ t0[0] = din[0]; t0[1] = din[1];
+ t1[0] = din[4]; t1[1] = din[5];
+ t2[0] = din[2]; t2[1] = din[3];
+ t3[0] = din[6]; t3[1] = din[7];
+
+ t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
+ t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
+ t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
+ t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
+
+ dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
+ dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
+ dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0];
+ dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0];
+}
+
+ void firstpass_2(ffts_plan_t *p, const void *in, void *out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ cdata_t t0, t1, r0,r1;
+ t0[0] = din[0]; t0[1] = din[1];
+ t1[0] = din[2]; t1[1] = din[3];
+ r0[0] = t0[0] + t1[0];
+ r0[1] = t0[1] + t1[1];
+ r1[0] = t0[0] - t1[0];
+ r1[1] = t0[1] - t1[1];
+ dout[0] = r0[0]; dout[1] = r0[1];
+ dout[2] = r1[0]; dout[3] = r1[1];
+}
diff --git a/lib/ffts/src/ffts_small.h b/lib/ffts/src/ffts_small.h
new file mode 100644
index 0000000..76cadf5
--- /dev/null
+++ b/lib/ffts/src/ffts_small.h
@@ -0,0 +1,13 @@
+#ifndef __FFTS_SMALL_H__
+#define __FFTS_SMALL_H__
+
+
+void firstpass_16_f(ffts_plan_t * p, const void * in, void * out);
+void firstpass_16_b(ffts_plan_t * p, const void * in, void * out);
+void firstpass_8_f(ffts_plan_t * p, const void * in, void * out);
+void firstpass_8_b(ffts_plan_t * p, const void * in, void * out);
+void firstpass_4_f(ffts_plan_t * p, const void * in, void * out);
+void firstpass_4_b(ffts_plan_t * p, const void * in, void * out);
+void firstpass_2(ffts_plan_t * p, const void * in, void * out);
+
+#endif
diff --git a/lib/ffts/src/ffts_static.c b/lib/ffts/src/ffts_static.c
new file mode 100644
index 0000000..3edf2ea
--- /dev/null
+++ b/lib/ffts/src/ffts_static.c
@@ -0,0 +1,101 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "ffts_static.h"
+
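+/* Static (non-JIT) driver: recurse over the same N/4, N/8, N/8, N/4, N/4
+   decomposition the code generator would emit, calling the hand-written
+   NEON x8 passes at each level and the x4 kernel for the 16-point base
+   case. */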
+void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) {
+ if(N > 16) {
+ size_t N1 = N >> 1;
+ size_t N2 = N >> 2;
+ size_t N3 = N >> 3;
+ float *ws = ((float *)(p->ws)) + (p->ws_is[__builtin_ctzl(N)-4] << 1);
+
+ ffts_static_rec_i(p, data, N2);
+ ffts_static_rec_i(p, data + N1, N3);
+ ffts_static_rec_i(p, data + N1 + N2, N3);
+ ffts_static_rec_i(p, data + N, N2);
+ ffts_static_rec_i(p, data + N + N1, N2);
+
+ if(N == p->N) {
+ neon_static_x8_t_i(data, N, ws);
+ }else{
+ neon_static_x8_i(data, N, ws);
+ }
+
+ }else if(N==16){
+ neon_static_x4_i(data, N, p->ws);
+ }
+
+}
+void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) {
+ if(N > 16) {
+ size_t N1 = N >> 1;
+ size_t N2 = N >> 2;
+ size_t N3 = N >> 3;
+ float *ws = ((float *)(p->ws)) + (p->ws_is[__builtin_ctzl(N)-4] << 1);
+
+ ffts_static_rec_f(p, data, N2);
+ ffts_static_rec_f(p, data + N1, N3);
+ ffts_static_rec_f(p, data + N1 + N2, N3);
+ ffts_static_rec_f(p, data + N, N2);
+ ffts_static_rec_f(p, data + N + N1, N2);
+
+ if(N == p->N) {
+ neon_static_x8_t_f(data, N, ws);
+ }else{
+ neon_static_x8_f(data, N, ws);
+ }
+
+ }else if(N==16){
+ neon_static_x4_f(data, N, p->ws);
+ }
+
+}
+
+void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out) {
+
+ if(__builtin_ctzl(p->N) & 1)
+ neon_static_o_f(p, in, out);
+ else
+ neon_static_e_f(p, in, out);
+ ffts_static_rec_f(p, out, p->N);
+}
+
+
+void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out) {
+
+ if(__builtin_ctzl(p->N) & 1)
+ neon_static_o_i(p, in, out);
+ else
+ neon_static_e_i(p, in, out);
+ ffts_static_rec_i(p, out, p->N);
+}
diff --git a/lib/ffts/src/ffts_static.h b/lib/ffts/src/ffts_static.h
new file mode 100644
index 0000000..4490bde
--- /dev/null
+++ b/lib/ffts/src/ffts_static.h
@@ -0,0 +1,46 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __FFTS_STATIC_H__
+#define __FFTS_STATIC_H__
+
+#include "ffts.h"
+#include "neon.h"
+
+void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) ;
+void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out);
+
+void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) ;
+void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out);
+
+#endif
diff --git a/lib/ffts/src/macros-alpha.h b/lib/ffts/src/macros-alpha.h
new file mode 100644
index 0000000..06daf4a
--- /dev/null
+++ b/lib/ffts/src/macros-alpha.h
@@ -0,0 +1,206 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+ Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __MACROS_ALPHA_H__
+#define __MACROS_ALPHA_H__
+
+#include <math.h>
+
+#ifdef __alpha__
+#define restrict
+#endif
+
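+/* Scalar fallback for the SIMD macro layer: a V packs two complex floats
+   (r1,i1,r2,i2) and the inline functions below mirror the 4-float vector
+   operations provided by the NEON/SSE/AltiVec headers. */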
+typedef struct {float r1, i1, r2, i2;} V;
+
+#define FFTS_MALLOC(d,a) malloc(d)
+#define FFTS_FREE(d) free(d)
+
+#define VLIT4(f3,f2,f1,f0) ((V){f0,f1,f2,f3})
+
+static inline V VADD(V x, V y)
+{
+ V z;
+ z.r1 = x.r1 + y.r1;
+ z.i1 = x.i1 + y.i1;
+ z.r2 = x.r2 + y.r2;
+ z.i2 = x.i2 + y.i2;
+ return z;
+}
+
+
+static inline V VSUB(V x, V y)
+{
+ V z;
+ z.r1 = x.r1 - y.r1;
+ z.i1 = x.i1 - y.i1;
+ z.r2 = x.r2 - y.r2;
+ z.i2 = x.i2 - y.i2;
+ return z;
+}
+
+
+static inline V VMUL(V x, V y)
+{
+ V z;
+ z.r1 = x.r1 * y.r1;
+ z.i1 = x.i1 * y.i1;
+ z.r2 = x.r2 * y.r2;
+ z.i2 = x.i2 * y.i2;
+ return z;
+}
+
+static inline V VXOR(V x, V y)
+{
+ V r;
+ r.r1 = (uint32_t)x.r1 ^ (uint32_t)y.r1;
+ r.i1 = (uint32_t)x.i1 ^ (uint32_t)y.i1;
+ r.r2 = (uint32_t)x.r2 ^ (uint32_t)y.r2;
+ r.i2 = (uint32_t)x.i2 ^ (uint32_t)y.i2;
+ return r;
+}
+
+static inline V VSWAPPAIRS(V x)
+{
+ V z;
+ z.r1 = x.i1;
+ z.i1 = x.r1;
+ z.r2 = x.i2;
+ z.i2 = x.r2;
+ return z;
+}
+
+
+static inline V VBLEND(V x, V y)
+{
+ V z;
+ z.r1 = x.r1;
+ z.i1 = x.i1;
+ z.r2 = y.r2;
+ z.i2 = y.i2;
+ return z;
+}
+
+static inline V VUNPACKHI(V x, V y)
+{
+ V z;
+ z.r1 = x.r2;
+ z.i1 = x.i2;
+ z.r2 = y.r2;
+ z.i2 = y.i2;
+ return z;
+}
+
+static inline V VUNPACKLO(V x, V y)
+{
+ V z;
+ z.r1 = x.r1;
+ z.i1 = x.i1;
+ z.r2 = y.r1;
+ z.i2 = y.i1;
+ return z;
+}
+
+static inline V VDUPRE(V x)
+{
+ V z;
+ z.r1 = x.r1;
+ z.i1 = x.r1;
+ z.r2 = x.r2;
+ z.i2 = x.r2;
+ return z;
+}
+
+static inline V VDUPIM(V x)
+{
+ V z;
+ z.r1 = x.i1;
+ z.i1 = x.i1;
+ z.r2 = x.i2;
+ z.i2 = x.i2;
+ return z;
+}
+
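+/* Complex helpers mirroring the SIMD back ends. IMUL computes re*d - im*swap(d)
+   and IMULJ computes re*d + im*swap(d): the twiddle multiplications (factor and
+   conjugate) used by the butterflies, with re/im holding the duplicated real and
+   imaginary parts of the twiddle. MULI negates the real or imaginary lanes
+   depending on inv, and IMULI (pair-swap of MULI) multiplies each pair by +/-i. */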
+static inline V IMUL(V d, V re, V im)
+{
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VSUB(re, im);
+}
+
+
+static inline V IMULJ(V d, V re, V im)
+{
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VADD(re, im);
+}
+
+static inline V MULI(int inv, V x)
+{
+ V z;
+
+ if (inv) {
+ z.r1 = -x.r1;
+ z.i1 = x.i1;
+ z.r2 = -x.r2;
+ z.i2 = x.i2;
+ }else{
+ z.r1 = x.r1;
+ z.i1 = -x.i1;
+ z.r2 = x.r2;
+ z.i2 = -x.i2;
+ }
+ return z;
+}
+
+
+static inline V IMULI(int inv, V x)
+{
+ return VSWAPPAIRS(MULI(inv, x));
+}
+
+
+static inline V VLD(const void *s)
+{
+ V *d = (V *)s;
+ return *d;
+}
+
+
+static inline void VST(void *d, V s)
+{
+ V *r = (V *)d;
+ *r = s;
+}
+
+#endif
diff --git a/lib/ffts/src/macros-altivec.h b/lib/ffts/src/macros-altivec.h
new file mode 100644
index 0000000..0d148a5
--- /dev/null
+++ b/lib/ffts/src/macros-altivec.h
@@ -0,0 +1,137 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+ Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __MACROS_ALTIVEC_H__
+#define __MACROS_ALTIVEC_H__
+
+#include <math.h>
+#include <altivec.h>
+
+#define restrict
+
+typedef vector float V;
+typedef vector unsigned char VUC;
+
+#ifdef __APPLE__
+#define FFTS_MALLOC(d,a) vec_malloc(d)
+#define FFTS_FREE(d) vec_free(d)
+#else
+/* It appears vec_malloc() and friends are not implemented on Linux */
+#include <malloc.h>
+#define FFTS_MALLOC(d,a) memalign(16,d)
+#define FFTS_FREE(d) free(d)
+#endif
+
+#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3})
+
+#define VADD(x,y) vec_add(x,y)
+#define VSUB(x,y) vec_sub(x,y)
+#define VMUL(x,y) vec_madd(x,y,(V){0})
+#define VMULADD(x,y,z) vec_madd(x,y,z)
+#define VNMULSUB(x,y,z) vec_nmsub(x,y,z)
+#define VXOR(x,y) vec_xor((x),(y))
+#define VSWAPPAIRS(x) \
+ vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03, \
+ 0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b})
+
+#define VBLEND(x,y) \
+ vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
+ 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
+
+#define VUNPACKHI(x,y) \
+ vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \
+ 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
+
+#define VUNPACKLO(x,y) \
+ vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
+ 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17})
+
+#define VDUPRE(x) \
+ vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03, \
+ 0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b})
+
+#define VDUPIM(x) \
+ vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07, \
+ 0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f})
+
+
+static inline V IMUL(V d, V re, V im)
+{
+ im = VMUL(im, VSWAPPAIRS(d));
+ re = VMUL(re, d);
+ return VSUB(re, im);
+}
+
+
+static inline V IMULJ(V d, V re, V im)
+{
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VMULADD(re, d, im);
+}
+
+#ifndef __GNUC__
+/* gcc (4.6 and 4.7) ICEs on this code! */
+static inline V MULI(int inv, V x)
+{
+ return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f));
+}
+#else
+/* but compiles this fine... */
+static inline V MULI(int inv, V x)
+{
+ V t;
+ t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f);
+ return VXOR(x, t);
+}
+#endif
+
+
+static inline V IMULI(int inv, V x)
+{
+ return VSWAPPAIRS(MULI(inv, x));
+}
+
+
+static inline V VLD(const void *s)
+{
+ V *d = (V *)s;
+ return *d;
+}
+
+
+static inline void VST(void *d, V s)
+{
+ V *r = (V *)d;
+ *r = s;
+}
+#endif
diff --git a/lib/ffts/src/macros-neon.h b/lib/ffts/src/macros-neon.h
new file mode 100644
index 0000000..0750b75
--- /dev/null
+++ b/lib/ffts/src/macros-neon.h
@@ -0,0 +1,96 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifndef __MACROS_NEON_H__
+#define __MACROS_NEON_H__
+
+#include "neon.h"
+#include <arm_neon.h>
+
+typedef float32x4_t V;
+
+typedef float32x4x2_t VS;
+
+#define ADD vaddq_f32
+#define SUB vsubq_f32
+#define MUL vmulq_f32
+#define VADD vaddq_f32
+#define VSUB vsubq_f32
+#define VMUL vmulq_f32
+#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y))))
+#define VST vst1q_f32
+#define VLD vld1q_f32
+#define VST2 vst2q_f32
+#define VLD2 vld2q_f32
+
+#define VSWAPPAIRS(x) (vrev64q_f32(x))
+
+#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b)))
+#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b)))
+
+#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y)))
+
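+/* Note the argument order: VLIT4(f3,f2,f1,f0) produces lanes {f0,f1,f2,f3},
+   matching the _mm_set_ps convention used by the SSE back end. */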
+__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
+ data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3};
+ return VLD(d);
+}
+
+#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0))
+#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1))
+
+#define FFTS_MALLOC(d,a) (valloc(d))
+#define FFTS_FREE(d) (free(d))
+
+__INLINE void STORESPR(data_t * addr, VS p) {
+
+ vst1q_f32(addr, p.val[0]);
+ vst1q_f32(addr + 4, p.val[1]);
+
+}
+
+__INLINE V IMULI(int inv, V a) {
+ if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
+ else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
+}
+
+__INLINE V IMUL(V d, V re, V im) {
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VSUB(re, im);
+}
+
+__INLINE V IMULJ(V d, V re, V im) {
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VADD(re, im);
+}
+
+#endif
diff --git a/lib/ffts/src/macros-sse.h b/lib/ffts/src/macros-sse.h
new file mode 100644
index 0000000..229477c
--- /dev/null
+++ b/lib/ffts/src/macros-sse.h
@@ -0,0 +1,84 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __SSE_FLOAT_H__
+#define __SSE_FLOAT_H__
+
+#include <xmmintrin.h>
+
+//#define VL 4
+
+typedef __m128 V;
+
+#define VADD _mm_add_ps
+#define VSUB _mm_sub_ps
+#define VMUL _mm_mul_ps
+//#define VLIT4 _mm_set_ps
+#define VXOR _mm_xor_ps
+#define VST _mm_store_ps
+#define VLD _mm_load_ps
+
+#define VSWAPPAIRS(x) (_mm_shuffle_ps(x,x,_MM_SHUFFLE(2,3,0,1)))
+
+#define VUNPACKHI(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,3,2)))
+#define VUNPACKLO(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(1,0,1,0)))
+
+#define VBLEND(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,1,0)))
+
+#define VLIT4 _mm_set_ps
+
+#define VDUPRE(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(2,2,0,0)))
+#define VDUPIM(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(3,3,1,1)))
+
+#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
+#define FFTS_FREE(d) (_mm_free(d))
+
+__INLINE V IMULI(int inv, V a) {
+ if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
+ else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
+}
+
+
+__INLINE V IMUL(V d, V re, V im) {
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VSUB(re, im);
+}
+
+__INLINE V IMULJ(V d, V re, V im) {
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VADD(re, im);
+}
+
+#endif
diff --git a/lib/ffts/src/macros.h b/lib/ffts/src/macros.h
new file mode 100644
index 0000000..d304cec
--- /dev/null
+++ b/lib/ffts/src/macros.h
@@ -0,0 +1,161 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+ Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __MACROS_H__
+#define __MACROS_H__
+
+#ifdef HAVE_NEON
+#include "macros-neon.h"
+#else
+#ifdef __alpha__
+#include "macros-alpha.h"
+#else
+#ifdef __powerpc__
+#include "macros-altivec.h"
+#endif
+#endif
+
+#endif
+
+
+#ifdef HAVE_VFP
+#include "macros-alpha.h"
+#endif
+#ifdef HAVE_SSE
+ #include "macros-sse.h"
+#endif
+
+static inline void TX2(V *a, V *b)
+{
+ V TX2_t0 = VUNPACKLO(*a, *b);
+ V TX2_t1 = VUNPACKHI(*a, *b);
+ *a = TX2_t0; *b = TX2_t1;
+}
+
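+/* Conjugate-pair (split-radix) butterfly: r2 and r3 are multiplied by the
+   twiddle factor and its conjugate (duplicated real/imaginary parts in re/im),
+   combined, and recombined with r0/r1; inv selects the +/-i rotation used for
+   the inverse transform. */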
+static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3)
+{
+ V uk, uk2, zk_p, zk_n, zk, zk_d;
+ uk = *r0; uk2 = *r1;
+ zk_p = IMUL(*r2, re, im);
+ zk_n = IMULJ(*r3, re, im);
+
+ zk = VADD(zk_p, zk_n);
+ zk_d = IMULI(inv, VSUB(zk_p, zk_n));
+
+ *r2 = VSUB(uk, zk);
+ *r0 = VADD(uk, zk);
+ *r3 = VADD(uk2, zk_d);
+ *r1 = VSUB(uk2, zk_d);
+}
+
+
+static inline void S_4(V r0, V r1, V r2, V r3,
+ data_t * restrict o0, data_t * restrict o1,
+ data_t * restrict o2, data_t * restrict o3)
+{
+ VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3);
+}
+
+
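+/* Leaf transforms: load four vectors of interleaved complex input, compute the
+   size-2/size-4 base cases, and leave the results transposed into the register
+   layout expected by the following passes. */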
+static inline void L_2_4(int inv,
+ const data_t * restrict i0, const data_t * restrict i1,
+ const data_t * restrict i2, const data_t * restrict i3,
+ V *r0, V *r1, V *r2, V *r3)
+{
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+
+ t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
+ t4 = VADD(t0, t1);
+ t5 = VSUB(t0, t1);
+ t6 = VADD(t2, t3);
+ t7 = VSUB(t2, t3);
+ *r0 = VUNPACKLO(t4, t5);
+ *r1 = VUNPACKLO(t6, t7);
+ t5 = IMULI(inv, t5);
+ t0 = VADD(t6, t4);
+ t2 = VSUB(t6, t4);
+ t1 = VSUB(t7, t5);
+ t3 = VADD(t7, t5);
+ *r3 = VUNPACKHI(t0, t1);
+ *r2 = VUNPACKHI(t2, t3);
+}
+
+
+static inline void L_4_4(int inv,
+ const data_t * restrict i0, const data_t * restrict i1,
+ const data_t * restrict i2, const data_t * restrict i3,
+ V *r0, V *r1, V *r2, V *r3)
+{
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+
+ t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
+ t4 = VADD(t0, t1);
+ t5 = VSUB(t0, t1);
+ t6 = VADD(t2, t3);
+ t7 = IMULI(inv, VSUB(t2, t3));
+ t0 = VADD(t4, t6);
+ t2 = VSUB(t4, t6);
+ t1 = VSUB(t5, t7);
+ t3 = VADD(t5, t7);
+ TX2(&t0, &t1);
+ TX2(&t2, &t3);
+ *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3;
+}
+
+
+
+static inline void L_4_2(int inv,
+ const data_t * restrict i0, const data_t * restrict i1,
+ const data_t * restrict i2, const data_t * restrict i3,
+ V *r0, V *r1, V *r2, V *r3)
+{
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+
+ t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3);
+ t2 = VBLEND(t6, t7);
+ t3 = VBLEND(t7, t6);
+ t4 = VADD(t0, t1);
+ t5 = VSUB(t0, t1);
+ t6 = VADD(t2, t3);
+ t7 = VSUB(t2, t3);
+ *r2 = VUNPACKHI(t4, t5);
+ *r3 = VUNPACKHI(t6, t7);
+ t7 = IMULI(inv, t7);
+ t0 = VADD(t4, t6);
+ t2 = VSUB(t4, t6);
+ t1 = VSUB(t5, t7);
+ t3 = VADD(t5, t7);
+ *r0 = VUNPACKLO(t0, t1);
+ *r1 = VUNPACKLO(t2, t3);
+}
+#endif
diff --git a/lib/ffts/src/neon.h b/lib/ffts/src/neon.h
new file mode 100644
index 0000000..f3132c2
--- /dev/null
+++ b/lib/ffts/src/neon.h
@@ -0,0 +1,65 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __NEON_H__
+#define __NEON_H__
+
+#include "ffts.h"
+
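+/* Hand-written NEON routines (neon.s / neon_static_*.s): x4/x8 butterflies, the
+   ee/oo/eo/oe leaf passes, transposes, and the static (non-codegen) transform
+   bodies; the _f/_i suffixes distinguish the forward and inverse variants. */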
+void neon_x4(float *, size_t, float *);
+void neon_x8(float *, size_t, float *);
+void neon_x8_t(float *, size_t, float *);
+void neon_ee();
+void neon_oo();
+void neon_eo();
+void neon_oe();
+void neon_end();
+
+void neon_transpose(uint64_t *in, uint64_t *out, int w, int h);
+void neon_transpose_to_buf(uint64_t *in, uint64_t *out, int w);
+
+//typedef struct _ffts_plan_t ffts_plan_t;
+
+void neon_static_e_f(ffts_plan_t * , const void * , void * );
+void neon_static_o_f(ffts_plan_t * , const void * , void * );
+void neon_static_x4_f(float *, size_t, float *);
+void neon_static_x8_f(float *, size_t, float *);
+void neon_static_x8_t_f(float *, size_t, float *);
+
+void neon_static_e_i(ffts_plan_t * , const void * , void * );
+void neon_static_o_i(ffts_plan_t * , const void * , void * );
+void neon_static_x4_i(float *, size_t, float *);
+void neon_static_x8_i(float *, size_t, float *);
+void neon_static_x8_t_i(float *, size_t, float *);
+
+#endif
diff --git a/lib/ffts/src/neon.s b/lib/ffts/src/neon.s
new file mode 100644
index 0000000..6995066
--- /dev/null
+++ b/lib/ffts/src/neon.s
@@ -0,0 +1,738 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_x4
+_neon_x4:
+#else
+ .globl neon_x4
+neon_x4:
+#endif
+@ add r3, r0, #0
+
+ vld1.32 {q8,q9}, [r0, :128]
+ add r4, r0, r1, lsl #1
+ vld1.32 {q10,q11}, [r4, :128]
+ add r5, r0, r1, lsl #2
+ vld1.32 {q12,q13}, [r5, :128]
+ add r6, r4, r1, lsl #2
+ vld1.32 {q14,q15}, [r6, :128]
+ vld1.32 {q2,q3}, [r2, :128]
+
+ vmul.f32 q0, q13, q3
+ vmul.f32 q5, q12, q2
+ vmul.f32 q1, q14, q2
+ vmul.f32 q4, q14, q3
+ vmul.f32 q14, q12, q3
+ vmul.f32 q13, q13, q2
+ vmul.f32 q12, q15, q3
+ vmul.f32 q2, q15, q2
+ vsub.f32 q0, q5, q0
+ vadd.f32 q13, q13, q14
+ vadd.f32 q12, q12, q1
+ vsub.f32 q1, q2, q4
+ vadd.f32 q15, q0, q12
+ vsub.f32 q12, q0, q12
+ vadd.f32 q14, q13, q1
+ vsub.f32 q13, q13, q1
+ vadd.f32 q0, q8, q15
+ vadd.f32 q1, q9, q14
+ vsub.f32 q2, q10, q13 @
+ vsub.f32 q4, q8, q15
+ vadd.f32 q3, q11, q12 @
+ vst1.32 {q0,q1}, [r0, :128]
+ vsub.f32 q5, q9, q14
+ vadd.f32 q6, q10, q13 @
+ vsub.f32 q7, q11, q12 @
+ vst1.32 {q2,q3}, [r4, :128]
+ vst1.32 {q4,q5}, [r5, :128]
+ vst1.32 {q6,q7}, [r6, :128]
+ bx lr
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_x8
+_neon_x8:
+#else
+ .globl neon_x8
+neon_x8:
+#endif
+ mov r11, #0
+ add r3, r0, #0 @ data0
+ add r5, r0, r1, lsl #1 @ data2
+ add r4, r0, r1 @ data1
+ add r7, r5, r1, lsl #1 @ data4
+ add r6, r5, r1 @ data3
+ add r9, r7, r1, lsl #1 @ data6
+ add r8, r7, r1 @ data5
+ add r10, r9, r1 @ data7
+ add r12, r2, #0 @ LUT
+
+ sub r11, r11, r1, lsr #5
+neon_x8_loop:
+ vld1.32 {q2,q3}, [r12, :128]!
+ vld1.32 {q14,q15}, [r6, :128]
+ vld1.32 {q10,q11}, [r5, :128]
+ adds r11, r11, #1
+ vmul.f32 q12, q15, q2
+ vmul.f32 q8, q14, q3
+ vmul.f32 q13, q14, q2
+ vmul.f32 q9, q10, q3
+ vmul.f32 q1, q10, q2
+ vmul.f32 q0, q11, q2
+ vmul.f32 q14, q11, q3
+ vmul.f32 q15, q15, q3
+ vld1.32 {q2,q3}, [r12, :128]!
+ vsub.f32 q10, q12, q8
+ vadd.f32 q11, q0, q9
+ vadd.f32 q8, q15, q13
+ vld1.32 {q12,q13}, [r4, :128]
+ vsub.f32 q9, q1, q14
+ vsub.f32 q15, q11, q10
+ vsub.f32 q14, q9, q8
+ vsub.f32 q4, q12, q15 @
+ vadd.f32 q6, q12, q15 @
+ vadd.f32 q5, q13, q14 @
+ vsub.f32 q7, q13, q14 @
+ vld1.32 {q14,q15}, [r9, :128]
+ vld1.32 {q12,q13}, [r7, :128]
+ vmul.f32 q1, q14, q2
+ vmul.f32 q0, q14, q3
+ vst1.32 {q4,q5}, [r4, :128]
+ vmul.f32 q14, q15, q3
+ vmul.f32 q4, q15, q2
+ vadd.f32 q15, q9, q8
+ vst1.32 {q6,q7}, [r6, :128]
+ vmul.f32 q8, q12, q3
+ vmul.f32 q5, q13, q3
+ vmul.f32 q12, q12, q2
+ vmul.f32 q9, q13, q2
+ vadd.f32 q14, q14, q1
+ vsub.f32 q13, q4, q0
+ vadd.f32 q0, q9, q8
+ vld1.32 {q8,q9}, [r3, :128]
+ vadd.f32 q1, q11, q10
+ vsub.f32 q12, q12, q5
+ vadd.f32 q11, q8, q15
+ vsub.f32 q8, q8, q15
+ vadd.f32 q2, q12, q14
+ vsub.f32 q10, q0, q13
+ vadd.f32 q15, q0, q13
+ vadd.f32 q13, q9, q1
+ vsub.f32 q9, q9, q1
+ vsub.f32 q12, q12, q14
+ vadd.f32 q0, q11, q2
+ vadd.f32 q1, q13, q15
+ vsub.f32 q4, q11, q2
+ vsub.f32 q2, q8, q10 @
+ vadd.f32 q3, q9, q12 @
+ vst1.32 {q0,q1}, [r3, :128]!
+ vsub.f32 q5, q13, q15
+ vld1.32 {q14,q15}, [r10, :128]
+ vsub.f32 q7, q9, q12 @
+ vld1.32 {q12,q13}, [r8, :128]
+ vst1.32 {q2,q3}, [r5, :128]!
+ vld1.32 {q2,q3}, [r12, :128]!
+ vadd.f32 q6, q8, q10 @
+ vmul.f32 q8, q14, q2
+ vst1.32 {q4,q5}, [r7, :128]!
+ vmul.f32 q10, q15, q3
+ vmul.f32 q9, q13, q3
+ vmul.f32 q11, q12, q2
+ vmul.f32 q14, q14, q3
+ vst1.32 {q6,q7}, [r9, :128]!
+ vmul.f32 q15, q15, q2
+ vmul.f32 q12, q12, q3
+ vmul.f32 q13, q13, q2
+ vadd.f32 q10, q10, q8
+ vsub.f32 q11, q11, q9
+ vld1.32 {q8,q9}, [r4, :128]
+ vsub.f32 q14, q15, q14
+ vadd.f32 q15, q13, q12
+ vadd.f32 q13, q11, q10
+ vadd.f32 q12, q15, q14
+ vsub.f32 q15, q15, q14
+ vsub.f32 q14, q11, q10
+ vld1.32 {q10,q11}, [r6, :128]
+ vadd.f32 q0, q8, q13
+ vadd.f32 q1, q9, q12
+ vsub.f32 q2, q10, q15 @
+ vadd.f32 q3, q11, q14 @
+ vsub.f32 q4, q8, q13
+ vst1.32 {q0,q1}, [r4, :128]!
+ vsub.f32 q5, q9, q12
+ vadd.f32 q6, q10, q15 @
+ vst1.32 {q2,q3}, [r6, :128]!
+ vsub.f32 q7, q11, q14 @
+ vst1.32 {q4,q5}, [r8, :128]!
+ vst1.32 {q6,q7}, [r10, :128]!
+ bne neon_x8_loop
+
+ bx lr
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_x8_t
+_neon_x8_t:
+#else
+ .globl neon_x8_t
+neon_x8_t:
+#endif
+ mov r11, #0
+ add r3, r0, #0 @ data0
+ add r5, r0, r1, lsl #1 @ data2
+ add r4, r0, r1 @ data1
+ add r7, r5, r1, lsl #1 @ data4
+ add r6, r5, r1 @ data3
+ add r9, r7, r1, lsl #1 @ data6
+ add r8, r7, r1 @ data5
+ add r10, r9, r1 @ data7
+ add r12, r2, #0 @ LUT
+
+ sub r11, r11, r1, lsr #5
+neon_x8_t_loop:
+ vld1.32 {q2,q3}, [r12, :128]!
+ vld1.32 {q14,q15}, [r6, :128]
+ vld1.32 {q10,q11}, [r5, :128]
+ adds r11, r11, #1
+ vmul.f32 q12, q15, q2
+ vmul.f32 q8, q14, q3
+ vmul.f32 q13, q14, q2
+ vmul.f32 q9, q10, q3
+ vmul.f32 q1, q10, q2
+ vmul.f32 q0, q11, q2
+ vmul.f32 q14, q11, q3
+ vmul.f32 q15, q15, q3
+ vld1.32 {q2,q3}, [r12, :128]!
+ vsub.f32 q10, q12, q8
+ vadd.f32 q11, q0, q9
+ vadd.f32 q8, q15, q13
+ vld1.32 {q12,q13}, [r4, :128]
+ vsub.f32 q9, q1, q14
+ vsub.f32 q15, q11, q10
+ vsub.f32 q14, q9, q8
+ vsub.f32 q4, q12, q15 @
+ vadd.f32 q6, q12, q15 @
+ vadd.f32 q5, q13, q14 @
+ vsub.f32 q7, q13, q14 @
+ vld1.32 {q14,q15}, [r9, :128]
+ vld1.32 {q12,q13}, [r7, :128]
+ vmul.f32 q1, q14, q2
+ vmul.f32 q0, q14, q3
+ vst1.32 {q4,q5}, [r4, :128]
+ vmul.f32 q14, q15, q3
+ vmul.f32 q4, q15, q2
+ vadd.f32 q15, q9, q8
+ vst1.32 {q6,q7}, [r6, :128]
+ vmul.f32 q8, q12, q3
+ vmul.f32 q5, q13, q3
+ vmul.f32 q12, q12, q2
+ vmul.f32 q9, q13, q2
+ vadd.f32 q14, q14, q1
+ vsub.f32 q13, q4, q0
+ vadd.f32 q0, q9, q8
+ vld1.32 {q8,q9}, [r3, :128]
+ vadd.f32 q1, q11, q10
+ vsub.f32 q12, q12, q5
+ vadd.f32 q11, q8, q15
+ vsub.f32 q8, q8, q15
+ vadd.f32 q2, q12, q14
+ vsub.f32 q10, q0, q13
+ vadd.f32 q15, q0, q13
+ vadd.f32 q13, q9, q1
+ vsub.f32 q9, q9, q1
+ vsub.f32 q12, q12, q14
+ vadd.f32 q0, q11, q2
+ vadd.f32 q1, q13, q15
+ vsub.f32 q4, q11, q2
+ vsub.f32 q2, q8, q10 @
+ vadd.f32 q3, q9, q12 @
+ vst2.32 {q0,q1}, [r3, :128]!
+ vsub.f32 q5, q13, q15
+ vld1.32 {q14,q15}, [r10, :128]
+ vsub.f32 q7, q9, q12 @
+ vld1.32 {q12,q13}, [r8, :128]
+ vst2.32 {q2,q3}, [r5, :128]!
+ vld1.32 {q2,q3}, [r12, :128]!
+ vadd.f32 q6, q8, q10 @
+ vmul.f32 q8, q14, q2
+ vst2.32 {q4,q5}, [r7, :128]!
+ vmul.f32 q10, q15, q3
+ vmul.f32 q9, q13, q3
+ vmul.f32 q11, q12, q2
+ vmul.f32 q14, q14, q3
+ vst2.32 {q6,q7}, [r9, :128]!
+ vmul.f32 q15, q15, q2
+ vmul.f32 q12, q12, q3
+ vmul.f32 q13, q13, q2
+ vadd.f32 q10, q10, q8
+ vsub.f32 q11, q11, q9
+ vld1.32 {q8,q9}, [r4, :128]
+ vsub.f32 q14, q15, q14
+ vadd.f32 q15, q13, q12
+ vadd.f32 q13, q11, q10
+ vadd.f32 q12, q15, q14
+ vsub.f32 q15, q15, q14
+ vsub.f32 q14, q11, q10
+ vld1.32 {q10,q11}, [r6, :128]
+ vadd.f32 q0, q8, q13
+ vadd.f32 q1, q9, q12
+ vsub.f32 q2, q10, q15 @
+ vadd.f32 q3, q11, q14 @
+ vsub.f32 q4, q8, q13
+ vst2.32 {q0,q1}, [r4, :128]!
+ vsub.f32 q5, q9, q12
+ vadd.f32 q6, q10, q15 @
+ vst2.32 {q2,q3}, [r6, :128]!
+ vsub.f32 q7, q11, q14 @
+ vst2.32 {q4,q5}, [r8, :128]!
+ vst2.32 {q6,q7}, [r10, :128]!
+ bne neon_x8_t_loop
+
+ @bx lr
+
+@ assumes r0 = out
+@ r1 = in ?
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = loop iterations
+@ r2 & lr = temps
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_ee
+_neon_ee:
+#else
+ .globl neon_ee
+neon_ee:
+#endif
+ vld1.32 {d16, d17}, [r2, :128]
+_neon_ee_loop:
+ vld2.32 {q15}, [r10, :128]!
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vld2.32 {q9}, [r4, :128]!
+ vld2.32 {q10}, [r3, :128]!
+ vld2.32 {q11}, [r6, :128]!
+ vld2.32 {q12}, [r5, :128]!
+ vsub.f32 q1, q14, q13
+ vld2.32 {q0}, [r9, :128]!
+ subs r11, r11, #1
+ vsub.f32 q2, q0, q15
+ vadd.f32 q0, q0, q15
+ vmul.f32 d10, d2, d17
+ vmul.f32 d11, d3, d16
+ vmul.f32 d12, d3, d17
+ vmul.f32 d6, d4, d17
+ vmul.f32 d7, d5, d16
+ vmul.f32 d8, d4, d16
+ vmul.f32 d9, d5, d17
+ vmul.f32 d13, d2, d16
+ vsub.f32 d7, d7, d6
+ vadd.f32 d11, d11, d10
+ vsub.f32 q1, q12, q11
+ vsub.f32 q2, q10, q9
+ vadd.f32 d6, d9, d8
+ vadd.f32 q4, q14, q13
+ vadd.f32 q11, q12, q11
+ vadd.f32 q12, q10, q9
+ vsub.f32 d10, d13, d12
+ vsub.f32 q7, q4, q0
+ vsub.f32 q9, q12, q11
+ vsub.f32 q13, q5, q3
+ vadd.f32 d29, d5, d2 @
+ vadd.f32 q5, q5, q3
+ vadd.f32 q10, q4, q0
+ vadd.f32 q11, q12, q11
+ vsub.f32 d31, d5, d2 @
+ vsub.f32 d28, d4, d3 @
+ vadd.f32 d30, d4, d3 @
+ vadd.f32 d5, d19, d14 @-
+ vadd.f32 d7, d31, d26 @-
+ vadd.f32 q1, q14, q5
+ vadd.f32 q0, q11, q10
+ vsub.f32 d6, d30, d27 @-
+ vsub.f32 d4, d18, d15 @-
+ vsub.f32 d13, d19, d14 @-
+ vadd.f32 d12, d18, d15 @-
+ vsub.f32 d15, d31, d26 @-
+ ldr r2, [r12], #4
+ vtrn.32 q1, q3
+ ldr lr, [r12], #4
+ vtrn.32 q0, q2
+ add r2, r0, r2, lsl #2
+ vsub.f32 q4, q11, q10
+ add lr, r0, lr, lsl #2
+ vsub.f32 q5, q14, q5
+ vadd.f32 d14, d30, d27 @-
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_ee_loop
+
+@ assumes r0 = out
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = loop iterations
+@ r2 & lr = temps
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_oo
+_neon_oo:
+#else
+ .globl neon_oo
+neon_oo:
+#endif
+_neon_oo_loop:
+ vld2.32 {q8}, [r6, :128]!
+ vld2.32 {q9}, [r5, :128]!
+ vld2.32 {q10}, [r4, :128]!
+ vld2.32 {q13}, [r3, :128]!
+ vadd.f32 q11, q9, q8
+ vsub.f32 q8, q9, q8
+ vsub.f32 q9, q13, q10
+ vadd.f32 q12, q13, q10
+ subs r11, r11, #1
+ vld2.32 {q10}, [r7, :128]!
+ vld2.32 {q13}, [r9, :128]!
+ vsub.f32 q2, q12, q11
+ vsub.f32 d7, d19, d16 @
+ vadd.f32 d3, d19, d16 @
+ vadd.f32 d6, d18, d17 @
+ vsub.f32 d2, d18, d17 @
+ vld2.32 {q9}, [r8, :128]!
+ vld2.32 {q8}, [r10, :128]!
+ vadd.f32 q0, q12, q11
+ vadd.f32 q11, q13, q8
+ vadd.f32 q12, q10, q9
+ vsub.f32 q8, q13, q8
+ vsub.f32 q9, q10, q9
+ vsub.f32 q6, q12, q11
+ vadd.f32 q4, q12, q11
+ vtrn.32 q0, q2
+ ldr r2, [r12], #4
+ vsub.f32 d15, d19, d16 @
+ ldr lr, [r12], #4
+ vadd.f32 d11, d19, d16 @
+ vadd.f32 d14, d18, d17 @
+ vsub.f32 d10, d18, d17 @
+ add r2, r0, r2, lsl #2
+ vtrn.32 q1, q3
+ add lr, r0, lr, lsl #2
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_oo_loop
+
+@ assumes r0 = out
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = addr of twiddle
+@ r2 & lr = temps
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_eo
+_neon_eo:
+#else
+ .globl neon_eo
+neon_eo:
+#endif
+ vld2.32 {q9}, [r5, :128]! @tag2
+ vld2.32 {q13}, [r3, :128]! @tag0
+ vld2.32 {q12}, [r4, :128]! @tag1
+ vld2.32 {q0}, [r7, :128]! @tag4
+ vsub.f32 q11, q13, q12
+ vld2.32 {q8}, [r6, :128]! @tag3
+ vadd.f32 q12, q13, q12
+ vsub.f32 q10, q9, q8
+ vadd.f32 q8, q9, q8
+ vadd.f32 q9, q12, q8
+ vadd.f32 d9, d23, d20 @
+ vsub.f32 d11, d23, d20 @
+ vsub.f32 q8, q12, q8
+ vsub.f32 d8, d22, d21 @
+ vadd.f32 d10, d22, d21 @
+ ldr r2, [r12], #4
+ vld1.32 {d20, d21}, [r11, :128]
+ ldr lr, [r12], #4
+ vtrn.32 q9, q4
+ add r2, r0, r2, lsl #2
+ vtrn.32 q8, q5
+ add lr, r0, lr, lsl #2
+ vswp d9,d10
+ vst1.32 {d8,d9,d10,d11}, [lr, :128]!
+ vld2.32 {q13}, [r10, :128]! @tag7
+ vld2.32 {q15}, [r9, :128]! @tag6
+ vld2.32 {q11}, [r8, :128]! @tag5
+ vsub.f32 q14, q15, q13
+ vsub.f32 q12, q0, q11
+ vadd.f32 q11, q0, q11
+ vadd.f32 q13, q15, q13
+ vadd.f32 d13, d29, d24 @
+ vadd.f32 q15, q13, q11
+ vsub.f32 d12, d28, d25 @
+ vsub.f32 d15, d29, d24 @
+ vadd.f32 d14, d28, d25 @
+ vtrn.32 q15, q6
+ vsub.f32 q15, q13, q11
+ vtrn.32 q15, q7
+ vswp d13, d14
+ vst1.32 {d12,d13,d14,d15}, [lr, :128]!
+ vtrn.32 q13, q14
+ vtrn.32 q11, q12
+ vmul.f32 d24, d26, d21
+ vmul.f32 d28, d27, d20
+ vmul.f32 d25, d26, d20
+ vmul.f32 d26, d27, d21
+ vmul.f32 d27, d22, d21
+ vmul.f32 d30, d23, d20
+ vmul.f32 d29, d23, d21
+ vmul.f32 d22, d22, d20
+ vsub.f32 d21, d28, d24
+ vadd.f32 d20, d26, d25
+ vadd.f32 d25, d30, d27
+ vsub.f32 d24, d22, d29
+ vadd.f32 q11, q12, q10
+ vsub.f32 q10, q12, q10
+ vadd.f32 q0, q9, q11
+ vsub.f32 q2, q9, q11
+ vadd.f32 d3, d17, d20 @
+ vsub.f32 d7, d17, d20 @
+ vsub.f32 d2, d16, d21 @
+ vadd.f32 d6, d16, d21 @
+ vswp d1, d2
+ vswp d5, d6
+ vstmia r2!, {q0-q3}
+
+
+@ assumes r0 = out
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = addr of twiddle
+@ r2 & lr = temps
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_oe
+_neon_oe:
+#else
+ .globl neon_oe
+neon_oe:
+#endif
+ vld1.32 {q8}, [r5, :128]!
+ vld1.32 {q10}, [r6, :128]!
+ vld2.32 {q11}, [r4, :128]!
+ vld2.32 {q13}, [r3, :128]!
+ vld2.32 {q15}, [r10, :128]!
+ vorr d25, d17, d17
+ vorr d24, d20, d20
+ vorr d20, d16, d16
+ vsub.f32 q9, q13, q11
+ vadd.f32 q11, q13, q11
+ ldr r2, [r12], #4
+ vtrn.32 d24, d25
+ ldr lr, [r12], #4
+ vtrn.32 d20, d21
+ add r2, r0, r2, lsl #2
+ vsub.f32 q8, q10, q12
+ add lr, r0, lr, lsl #2
+ vadd.f32 q10, q10, q12
+ vadd.f32 q0, q11, q10
+ vadd.f32 d25, d19, d16 @
+ vsub.f32 d27, d19, d16 @
+ vsub.f32 q1, q11, q10
+ vsub.f32 d24, d18, d17 @
+ vadd.f32 d26, d18, d17 @
+ vtrn.32 q0, q12
+ vtrn.32 q1, q13
+ vld1.32 {d24, d25}, [r11, :128]
+ vswp d1, d2
+ vst1.32 {q0, q1}, [r2, :128]!
+ vld2.32 {q0}, [r9, :128]!
+ vadd.f32 q1, q0, q15
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vsub.f32 q15, q0, q15
+ vsub.f32 q0, q14, q13
+ vadd.f32 q3, q14, q13
+ vadd.f32 q2, q3, q1
+ vadd.f32 d29, d1, d30 @
+ vsub.f32 d27, d1, d30 @
+ vsub.f32 q3, q3, q1
+ vsub.f32 d28, d0, d31 @
+ vadd.f32 d26, d0, d31 @
+ vtrn.32 q2, q14
+ vtrn.32 q3, q13
+ vswp d5, d6
+ vst1.32 {q2, q3}, [r2, :128]!
+ vtrn.32 q11, q9
+ vtrn.32 q10, q8
+ vmul.f32 d20, d18, d25
+ vmul.f32 d22, d19, d24
+ vmul.f32 d21, d19, d25
+ vmul.f32 d18, d18, d24
+ vmul.f32 d19, d16, d25
+ vmul.f32 d30, d17, d24
+ vmul.f32 d23, d16, d24
+ vmul.f32 d24, d17, d25
+ vadd.f32 d17, d22, d20
+ vsub.f32 d16, d18, d21
+ vsub.f32 d21, d30, d19
+ vadd.f32 d20, d24, d23
+ vadd.f32 q9, q8, q10
+ vsub.f32 q8, q8, q10
+ vadd.f32 q4, q14, q9
+ vsub.f32 q6, q14, q9
+ vadd.f32 d11, d27, d16 @
+ vsub.f32 d15, d27, d16 @
+ vsub.f32 d10, d26, d17 @
+ vadd.f32 d14, d26, d17 @
+ vswp d9, d10
+ vswp d13, d14
+ vstmia lr!, {q4-q7}
+
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_end
+_neon_end:
+#else
+ .globl neon_end
+neon_end:
+#endif
+ bx lr
+
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_transpose
+_neon_transpose:
+#else
+ .globl neon_transpose
+neon_transpose:
+#endif
+ push {r4-r8}
+ @ vpush {q8-q9}
+ mov r5, r3
+_neon_transpose_col:
+ mov r7, r1
+ add r8, r1, r3, lsl #3
+ mov r4, r2
+ add r6, r0, r2, lsl #3
+_neon_transpose_row:
+ vld1.32 {q8,q9}, [r0, :128]!
+@ vld1.32 {q10,q11}, [r0, :128]!
+ vld1.32 {q12,q13}, [r6, :128]!
+@ vld1.32 {q14,q15}, [r6, :128]!
+ sub r4, r4, #4
+ cmp r4, #0
+ vswp d17,d24
+ vswp d19,d26
+ vswp d21,d28
+ vswp d23,d30
+ vst1.32 {q8}, [r7, :128]
+ vst1.32 {q12}, [r8, :128]
+ add r7, r7, r3, lsl #4
+ add r8, r8, r3, lsl #4
+ vst1.32 {q9}, [r7, :128]
+ vst1.32 {q13}, [r8, :128]
+ add r7, r7, r3, lsl #4
+ add r8, r8, r3, lsl #4
+@@vst1.32 {q10}, [r7, :128]
+@@vst1.32 {q14}, [r8, :128]
+@@add r7, r7, r3, lsl #4
+@@add r8, r8, r3, lsl #4
+@@vst1.32 {q11}, [r7, :128]
+@@vst1.32 {q15}, [r8, :128]
+@@add r7, r7, r3, lsl #4
+@@add r8, r8, r3, lsl #4
+ bne _neon_transpose_row
+ sub r5, r5, #2
+ cmp r5, #0
+ add r0, r0, r2, lsl #3
+ add r1, r1, #16
+ bne _neon_transpose_col
+ @ vpop {q8-q9}
+ pop {r4-r8}
+ bx lr
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_transpose_to_buf
+_neon_transpose_to_buf:
+#else
+ .globl neon_transpose_to_buf
+neon_transpose_to_buf:
+#endif
+ push {r4-r10}
+ mov r5, #8
+_neon_transpose_to_buf_col:
+ mov r4, #8
+ add r6, r0, r2, lsl #3
+ mov r7, r1
+ add r8, r1, #64
+ add r9, r1, #128
+ add r10, r1, #192
+_neon_transpose_to_buf_row:
+ vld1.32 {q8,q9}, [r0, :128]!
+ vld1.32 {q12,q13}, [r6, :128]!
+ sub r4, r4, #4
+ cmp r4, #0
+ vswp d17,d24
+ vswp d19,d26
+ vst1.32 {q8}, [r7, :128]
+ vst1.32 {q12}, [r8, :128]
+ vst1.32 {q9}, [r9, :128]
+ vst1.32 {q13}, [r10, :128]
+ add r7, r7, #256
+ add r8, r8, #256
+ add r9, r9, #256
+ add r10, r10, #256
+ bne _neon_transpose_to_buf_row
+ sub r5, r5, #2
+ cmp r5, #0
+ sub r0, r0, #64
+ add r0, r0, r2, lsl #4
+ add r1, r1, #16
+ bne _neon_transpose_to_buf_col
+ pop {r4-r10}
+ bx lr
diff --git a/lib/ffts/src/neon_float.h b/lib/ffts/src/neon_float.h
new file mode 100644
index 0000000..a958b8a
--- /dev/null
+++ b/lib/ffts/src/neon_float.h
@@ -0,0 +1,1126 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifndef __NEON_FLOAT_H__
+#define __NEON_FLOAT_H__
+
+#include <arm_neon.h>
+
+//#define VL 4
+#define __INLINE static inline __attribute__((always_inline))
+
+typedef float32x4_t V;
+
+typedef float32x4x2_t VS;
+
+#if defined(complex)
+ typedef complex float cdata_t;
+#else
+ typedef float cdata_t[2];
+#endif
+ typedef float data_t;
+
+#define ADD vaddq_f32
+#define SUB vsubq_f32
+#define MUL vmulq_f32
+#define VADD vaddq_f32
+#define VSUB vsubq_f32
+#define VMUL vmulq_f32
+#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y))))
+#define VST vst1q_f32
+#define VLD vld1q_f32
+#define VST2 vst2q_f32
+#define VLD2 vld2q_f32
+
+#define VSWAPPAIRS(x) (vrev64q_f32(x))
+
+#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b)))
+#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b)))
+
+#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y)))
+
+__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
+ data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3};
+ return VLD(d);
+}
+
+#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0))
+#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1))
+
+#define FFTS_MALLOC(d,a) (valloc(d))
+#define FFTS_FREE(d) (free(d))
+__INLINE void FMA(V *Rd, V Rn, V Rm) {
+ *Rd = vmlaq_f32(*Rd, Rn, Rm);
+// __asm__ ("vmla.f32 %q0,%q1,%q2\n\t"
+// : "+w" (*Rd)
+// : "w" (Rn), "w" (Rm)
+// //: "0"
+// );
+
+}
+__INLINE void FMS(V *Rd, V Rn, V Rm) {
+ *Rd = vmlsq_f32(*Rd, Rn, Rm);
+// __asm__ ("vmls.f32 %q0,%q1,%q2\n\t"
+// : "+w" (*Rd)
+// : "w" (Rn), "w" (Rm)
+// // : "0"
+// );
+}
+
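+/* Split (planar) complex format: val[0] holds four real parts, val[1] the four
+   imaginary parts. VSMUL computes d*w and VSMULJ computes d*conj(w) in this
+   format using fused multiply-add/subtract. */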
+__INLINE VS VSMUL(VS *d, VS *w) {
+ VS t;
+ t.val[0] = vmulq_f32(d->val[0], w->val[0]);
+ t.val[1] = vmulq_f32(d->val[0], w->val[1]);
+// t.val[0] = vmlsq_f32(t.val[0], d->val[1], w->val[1]);
+// t.val[1] = vmlaq_f32(t.val[1], d->val[1], w->val[0]);
+ FMS(&t.val[0], d->val[1], w->val[1]);
+ FMA(&t.val[1], d->val[1], w->val[0]);
+ return t;
+}
+__INLINE VS VSMULJ(VS *d, VS *w) {
+ VS t;
+ t.val[0] = vmulq_f32(d->val[0], w->val[0]);
+ t.val[1] = vmulq_f32(d->val[1], w->val[0]);
+// t.val[0] = vmlaq_f32(t.val[0], d->val[1], w->val[1]);
+// t.val[1] = vmlsq_f32(t.val[1], d->val[0], w->val[1]);
+ FMA(&t.val[0], d->val[1], w->val[1]);
+ FMS(&t.val[1], d->val[0], w->val[1]);
+ return t;
+}
+__INLINE VS VSADD(VS *a, VS *b) {
+ VS r;
+ r.val[0] = vaddq_f32(a->val[0], b->val[0]);
+ r.val[1] = vaddq_f32(a->val[1], b->val[1]);
+ return r;
+}
+__INLINE VS VSSUB(VS *a, VS *b) {
+ VS r;
+ r.val[0] = vsubq_f32(a->val[0], b->val[0]);
+ r.val[1] = vsubq_f32(a->val[1], b->val[1]);
+ return r;
+}
+__INLINE VS VSSUB_MULI(VS *a, VS *b) {
+ VS r;
+ r.val[0] = vaddq_f32(a->val[0], b->val[1]);
+ r.val[1] = vsubq_f32(a->val[1], b->val[0]);
+ return r;
+}
+__INLINE VS VSADD_MULI(VS *a, VS *b) {
+ VS r;
+ r.val[0] = vsubq_f32(a->val[0], b->val[1]);
+ r.val[1] = vaddq_f32(a->val[1], b->val[0]);
+ return r;
+}
+
+__INLINE void VSK_N(VS w, VS *r0, VS *r1, VS *r2, VS *r3) {
+ VS uk, uk2, zk_p, zk_n, zk, zk_d;
+ uk = *r0; uk2 = *r1;
+ zk_p = VSMUL(r2, &w);
+ zk_n = VSMULJ(r3, &w);
+
+ zk = VSADD(&zk_p, &zk_n);
+ zk_d = VSSUB(&zk_p, &zk_n);
+
+ *r2 = VSSUB(&uk, &zk);
+ *r0 = VSADD(&uk, &zk);
+ *r3 = VSADD_MULI(&uk2, &zk_d);
+ *r1 = VSSUB_MULI(&uk2, &zk_d);
+}
+
+
+__INLINE float32x2x2_t HVS_ADD(float32x2x2_t a, float32x2x2_t b) {
+ float32x2x2_t rval;
+ rval.val[0] = vadd_f32(a.val[0], b.val[0]);
+ rval.val[1] = vadd_f32(a.val[1], b.val[1]);
+ return rval;
+}
+__INLINE float32x2x2_t HVS_SUB(float32x2x2_t a, float32x2x2_t b) {
+ float32x2x2_t rval;
+ rval.val[0] = vsub_f32(a.val[0], b.val[0]);
+ rval.val[1] = vsub_f32(a.val[1], b.val[1]);
+ return rval;
+}
+__INLINE float32x2x2_t HVS_SUB_MULI(float32x2x2_t a, float32x2x2_t b) {
+ float32x2x2_t rval;
+ rval.val[0] = vadd_f32(a.val[0], b.val[1]);
+ rval.val[1] = vsub_f32(a.val[1], b.val[0]);
+ return rval;
+}
+__INLINE float32x2x2_t HVS_ADD_MULI(float32x2x2_t a, float32x2x2_t b) {
+ float32x2x2_t rval;
+ rval.val[0] = vsub_f32(a.val[0], b.val[1]);
+ rval.val[1] = vadd_f32(a.val[1], b.val[0]);
+ return rval;
+}
+__INLINE float32x2x2_t HVS_MUL(float32x2x2_t d, float32x2x2_t w) {
+ float32x2x2_t t;
+ t.val[0] = vmul_f32(d.val[0], w.val[0]);
+ t.val[1] = vmul_f32(d.val[0], w.val[1]);
+ t.val[0] = vmls_f32(t.val[0], d.val[1], w.val[1]);
+ t.val[1] = vmla_f32(t.val[1], d.val[1], w.val[0]);
+ return t;
+}
+__INLINE float32x2x2_t HVS_MULJ(float32x2x2_t d, float32x2x2_t w) {
+ float32x2x2_t t;
+ t.val[0] = vmul_f32(d.val[0], w.val[0]);
+ t.val[1] = vmul_f32(d.val[1], w.val[0]);
+ t.val[0] = vmla_f32(t.val[0], d.val[1], w.val[1]);
+ t.val[1] = vmls_f32(t.val[1], d.val[0], w.val[1]);
+ return t;
+}
+__INLINE void HVS_K_N(float32x2x2_t w, float32x2x2_t *r0, float32x2x2_t *r1, float32x2x2_t *r2, float32x2x2_t *r3) {
+ float32x2x2_t uk, uk2, zk_p, zk_n, zk, zk_d;
+ uk = *r0; uk2 = *r1;
+ zk_p = HVS_MUL(*r2, w);
+ zk_n = HVS_MULJ(*r3, w);
+ zk = HVS_ADD(zk_p, zk_n);
+ zk_d = HVS_SUB(zk_p, zk_n);
+
+ *r2 = HVS_SUB(uk, zk);
+ *r0 = HVS_ADD(uk, zk);
+ *r3 = HVS_ADD_MULI(uk2, zk_d);
+ *r1 = HVS_SUB_MULI(uk2, zk_d);
+}
+
+typedef union {
+ float32x4_t f32x4;
+ float32x2x2_t f32x2x2;
+} float_mixed_t;
+
+__INLINE void VSWP(float32x2x2_t *a, float32x2x2_t *b) {
+//float32x2_t tmp = a->val[1];
+//a->val[1] = b->val[0];
+//b->val[0] = tmp;
+ __asm__ ("vswp %0,%1\n\t"
+ : "+w" (a->val[1]), "+w" (b->val[0])
+ :
+ );
+}
+
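+/* Twiddle constants +/-1/sqrt(2) for the size-8 leaf (LEAF_EE8_SPLIT) below. */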
+static const __attribute__ ((aligned(16))) float ee_w_data[4] = {0.70710678118654757273731092936941,0.70710678118654746171500846685376,
+ -0.70710678118654757273731092936941,-0.70710678118654746171500846685376};
+__INLINE void LEAF_EE8_SPLIT(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) {
+ data_t *out0 = out + (*out_offsets)[0];
+ data_t *out1 = out + (*out_offsets)[1];
+ *out_offsets += 2;
+
+ float32x2x2_t r0, r1, r2, r3, r4, r5, r6, r7;
+ float32x2x2_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+ t0 = vld2_f32(in + (*is)[0]); t1 = vld2_f32(in + (*is)[1]); t2 = vld2_f32(in + (*is)[2]); t3 = vld2_f32(in + (*is)[3]);
+
+ t4 = HVS_ADD (t0, t1);
+ t5 = HVS_SUB (t0, t1);
+ t6 = HVS_ADD (t2, t3);
+ t7 = HVS_SUB (t2, t3);
+ r0 = HVS_ADD (t4, t6);
+ r2 = HVS_SUB (t4, t6);
+ r1 = HVS_SUB_MULI(t5, t7);
+ r3 = HVS_ADD_MULI(t5, t7);
+
+ t0 = vld2_f32(in + (*is)[4]); t1 = vld2_f32(in + (*is)[5]); t2 = vld2_f32(in + (*is)[6]); t3 = vld2_f32(in + (*is)[7]);
+ r4 = HVS_ADD (t0, t1);
+ r5 = HVS_SUB (t0, t1);
+ r6 = HVS_ADD (t2, t3);
+ r7 = HVS_SUB (t2, t3);
+ t0 = r0; t1 = r2;
+ t2 = HVS_ADD(r4, r6);
+ t3 = HVS_SUB(r4, r6);
+ r0 = HVS_ADD(t0, t2);
+ r4 = HVS_SUB(t0, t2);
+ r2 = HVS_SUB_MULI(t1, t3);
+ r6 = HVS_ADD_MULI(t1, t3);
+
+ float32x4_t w = vld1q_f32(ee_w_data);
+ float32x2x2_t ww;
+ ww.val[0] = vget_low_f32(w);
+ ww.val[1] = vget_high_f32(w);
+
+ HVS_K_N(ww,&r1,&r3,&r5,&r7);
+
+//vst2_f32(out0, r0);
+//vst2_f32(out0+4, r2);
+//vst2_f32(out0+8, r4);
+//vst2_f32(out0+12, r6);
+
+//vst2_f32(out1, r1);
+//vst2_f32(out1+4, r3);
+//vst2_f32(out1+8, r5);
+//vst2_f32(out1+12, r7);
+
+ float32x2x2_t tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
+
+ tt0 = vtrn_f32(r0.val[0], r0.val[1]);
+ tt1 = vtrn_f32(r1.val[0], r1.val[1]);
+ tt2 = vtrn_f32(r2.val[0], r2.val[1]);
+ tt3 = vtrn_f32(r3.val[0], r3.val[1]);
+ tt4 = vtrn_f32(r4.val[0], r4.val[1]);
+ tt5 = vtrn_f32(r5.val[0], r5.val[1]);
+ tt6 = vtrn_f32(r6.val[0], r6.val[1]);
+ tt7 = vtrn_f32(r7.val[0], r7.val[1]);
+
+//VSWP(&tt0.f32x2x2, &tt1.f32x2x2);
+//VSWP(&tt2.f32x2x2, &tt3.f32x2x2);
+//VSWP(&tt4.f32x2x2, &tt5.f32x2x2);
+//VSWP(&tt6.f32x2x2, &tt7.f32x2x2);
+
+ float32x4_t z0, z1, z2, z3, z4, z5, z6, z7;
+
+ z0 = vcombine_f32(tt0.val[0], tt1.val[0]);
+ z1 = vcombine_f32(tt0.val[1], tt1.val[1]);
+ z2 = vcombine_f32(tt2.val[0], tt3.val[0]);
+ z3 = vcombine_f32(tt2.val[1], tt3.val[1]);
+ z4 = vcombine_f32(tt4.val[0], tt5.val[0]);
+ z5 = vcombine_f32(tt4.val[1], tt5.val[1]);
+ z6 = vcombine_f32(tt6.val[0], tt7.val[0]);
+ z7 = vcombine_f32(tt6.val[1], tt7.val[1]);
+
+
+ vst1q_f32(out0, z0);
+ vst1q_f32(out0+4, z2);
+ vst1q_f32(out0+8, z4);
+ vst1q_f32(out0+12, z6);
+
+ vst1q_f32(out1, z1);
+ vst1q_f32(out1+4, z3);
+ vst1q_f32(out1+8, z5);
+ vst1q_f32(out1+12, z7);
+/*
+ vst1_f32(out0, tt0.val[0]);
+ vst1_f32(out0+2, tt1.val[0]);
+ vst1_f32(out0+4, tt2.val[0]);
+ vst1_f32(out0+6, tt3.val[0]);
+ vst1_f32(out0+8, tt4.val[0]);
+ vst1_f32(out0+10, tt5.val[0]);
+ vst1_f32(out0+12, tt6.val[0]);
+ vst1_f32(out0+14, tt7.val[0]);
+
+ vst1_f32(out1, tt0.val[1]);
+ vst1_f32(out1+2, tt1.val[1]);
+ vst1_f32(out1+4, tt2.val[1]);
+ vst1_f32(out1+6, tt3.val[1]);
+ vst1_f32(out1+8, tt4.val[1]);
+ vst1_f32(out1+10, tt5.val[1]);
+ vst1_f32(out1+12, tt6.val[1]);
+ vst1_f32(out1+14, tt7.val[1]);
+ */
+/*
+ float32x4_t rr0 = vcombine_f32(r0.val[0], r0.val[1]);
+ float32x4_t rr1 = vcombine_f32(r1.val[0], r1.val[1]);
+ float32x4_t rr2 = vcombine_f32(r2.val[0], r2.val[1]);
+ float32x4_t rr3 = vcombine_f32(r3.val[0], r3.val[1]);
+
+ float32x4x2_t tmp0, tmp1, tmp2, tmp3;
+ tmp0 = vtrnq_f32(rr0, rr2);
+ tmp1 = vtrnq_f32(rr1, rr3);
+
+
+ float32x2x2_t v0, v1, v2, v3;
+ v0.val[0] = vget_low_f32(tmp0.val[0]);
+ v0.val[1] = vget_high_f32(tmp0.val[0]);
+ v1.val[0] = vget_low_f32(tmp0.val[1]);
+ v1.val[1] = vget_high_f32(tmp0.val[1]);
+ v2.val[0] = vget_low_f32(tmp1.val[0]);
+ v2.val[1] = vget_high_f32(tmp1.val[0]);
+ v3.val[0] = vget_low_f32(tmp1.val[1]);
+ v3.val[1] = vget_high_f32(tmp1.val[1]);
+
+ tmp2.val[0] = tmp0.val[0];
+ tmp2.val[1] = tmp1.val[0];
+ tmp3.val[0] = tmp0.val[1];
+ tmp3.val[1] = tmp1.val[1];
+
+//vst2q_f32(out0 , tmp2);
+//vst2q_f32(out1 , tmp3);
+ vst2_f32(out0, v0);
+ vst2_f32(out0+4, v1);
+ vst2_f32(out1, v2);
+ vst2_f32(out1+4, v3);
+
+ float32x4_t rr4 = vcombine_f32(r4.val[0], r4.val[1]);
+ float32x4_t rr5 = vcombine_f32(r5.val[0], r5.val[1]);
+ float32x4_t rr6 = vcombine_f32(r6.val[0], r6.val[1]);
+ float32x4_t rr7 = vcombine_f32(r7.val[0], r7.val[1]);
+
+ tmp0 = vtrnq_f32(rr4, rr6);
+ tmp1 = vtrnq_f32(rr5, rr7);
+
+ tmp2.val[0] = tmp0.val[0];
+ tmp2.val[1] = tmp1.val[0];
+ tmp3.val[0] = tmp0.val[1];
+ tmp3.val[1] = tmp1.val[1];
+ v0.val[0] = vget_low_f32(tmp0.val[0]);
+ v0.val[1] = vget_high_f32(tmp0.val[0]);
+ v1.val[0] = vget_low_f32(tmp0.val[1]);
+ v1.val[1] = vget_high_f32(tmp0.val[1]);
+ v2.val[0] = vget_low_f32(tmp1.val[0]);
+ v2.val[1] = vget_high_f32(tmp1.val[0]);
+ v3.val[0] = vget_low_f32(tmp1.val[1]);
+ v3.val[1] = vget_high_f32(tmp1.val[1]);
+ vst2_f32(out0+8, v0);
+ vst2_f32(out0+12, v1);
+ vst2_f32(out1+8, v1);
+ vst2_f32(out1+12, v3);
+
+//vst2q_f32(out0 + 8, tmp2);
+//vst2q_f32(out1 + 8, tmp3);
+//vst1q_f32(out0+8, tmp0.val[0]);
+//vst1q_f32(out0+12,tmp0.val[1]);
+//vst1q_f32(out1+8, tmp1.val[0]);
+//vst1q_f32(out1+12,tmp1.val[1]);
+ */
+ *is += 8;
+}
+
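+/* The numbered LOADSPRn/STORESPRn variants pin their operands to fixed NEON
+   quad registers (q0..q15) via inline asm; each variant uses a distinct
+   register pair so consecutive calls in the X_*_SPLIT routines below keep
+   their values in separate registers. */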
+__INLINE void STORESPR(data_t * addr, VS p) {
+ __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+ :
+ : "r" (addr), "w" (p.val[0]), "w" (p.val[1])
+ : "memory");
+}
+__INLINE void STORESPRI(data_t * restrict * addr, V p0, V p1) {
+ __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+ : "+r" (*addr)
+ : "w" (p0), "w" (p1)
+ : "memory");
+}
+__INLINE void STORESPRI0(data_t * restrict *addr, VS r) {
+ register V p0 __asm__ ("q0") = r.val[0];
+ register V p1 __asm__ ("q1") = r.val[1];
+ __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+ : "+r" (*addr)
+ : "w" (p0), "w" (p1)
+ : "memory");
+ //STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRI1(data_t **addr, VS r) {
+ register V p0 __asm__ ("q2") = r.val[0];
+ register V p1 __asm__ ("q3") = r.val[1];
+ __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+ : "+r" (*addr)
+ : "w" (p0), "w" (p1)
+ : "memory");
+ //STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRI2(data_t **addr, VS r) {
+ register V p0 __asm__ ("q4") = r.val[0];
+ register V p1 __asm__ ("q5") = r.val[1];
+ __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+ : "+r" (*addr)
+ : "w" (p0), "w" (p1)
+ : "memory");
+ //STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRI3(data_t **addr, VS r) {
+ register V p0 __asm__ ("q6") = r.val[0];
+ register V p1 __asm__ ("q7") = r.val[1];
+ __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+ : "+r" (*addr)
+ : "w" (p0), "w" (p1)
+ : "memory");
+ //STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT0(data_t * restrict *addr, VS r) {
+ register V p0 __asm__ ("q0") = r.val[0];
+ register V p1 __asm__ ("q1") = r.val[1];
+ __asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+ : "+r" (*addr)
+ : "w" (p0), "w" (p1)
+ : "memory");
+ //STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT1(data_t **addr, VS r) {
+ register V p0 __asm__ ("q2") = r.val[0];
+ register V p1 __asm__ ("q3") = r.val[1];
+ __asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+ : "+r" (*addr)
+ : "w" (p0), "w" (p1)
+ : "memory");
+ //STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT2(data_t **addr, VS r) {
+ register V p0 __asm__ ("q4") = r.val[0];
+ register V p1 __asm__ ("q5") = r.val[1];
+ __asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+ : "+r" (*addr)
+ : "w" (p0), "w" (p1)
+ : "memory");
+ //STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT3(data_t **addr, VS r) {
+ register V p0 __asm__ ("q6") = r.val[0];
+ register V p1 __asm__ ("q7") = r.val[1];
+ __asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+ : "+r" (*addr)
+ : "w" (p0), "w" (p1)
+ : "memory");
+ //STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPR0(data_t *addr, VS r) {
+ register V p0 __asm__ ("q0") = r.val[0];
+ register V p1 __asm__ ("q1") = r.val[1];
+ __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+ :
+ : "r" (addr), "w" (p0), "w" (p1)
+ : "memory");
+}
+__INLINE void STORESPR1(data_t *addr, VS r) {
+ register V p0 __asm__ ("q2") = r.val[0];
+ register V p1 __asm__ ("q3") = r.val[1];
+ __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+ :
+ : "r" (addr), "w" (p0), "w" (p1)
+ : "memory");
+}
+__INLINE void STORESPR2(data_t *addr, VS r) {
+ register V p0 __asm__ ("q4") = r.val[0];
+ register V p1 __asm__ ("q5") = r.val[1];
+ __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+ :
+ : "r" (addr), "w" (p0), "w" (p1)
+ : "memory");
+}
+__INLINE void STORESPR3(data_t *addr, VS r) {
+ register V p0 __asm__ ("q6") = r.val[0];
+ register V p1 __asm__ ("q7") = r.val[1];
+ __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+ :
+ : "r" (addr), "w" (p0), "w" (p1)
+ : "memory");
+}
+__INLINE VS LOADSPR0(data_t *addr) {
+ VS r;
+ register V p0 __asm__ ("q8") ;
+ register V p1 __asm__ ("q9") ;
+ __asm__ __volatile__("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+ : "=&w" (p0), "=&w" (p1)
+ : "r" (addr)
+ );
+ r.val[0] = p0; r.val[1] = p1;
+ return r;
+}
+__INLINE VS LOADSPR1(data_t *addr) {
+ VS r;
+ register V p0 __asm__ ("q10") ;
+ register V p1 __asm__ ("q11") ;
+ __asm__ __volatile__("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+ : "=&w" (p0), "=&w" (p1)
+ : "r" (addr)
+ );
+ r.val[0] = p0; r.val[1] = p1;
+ return r;
+}
+__INLINE VS LOADSPR2(data_t *addr) {
+ VS r;
+ register V p0 __asm__ ("q12") ;
+ register V p1 __asm__ ("q13") ;
+ __asm__ __volatile__("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+ : "=&w" (p0), "=&w" (p1)
+ : "r" (addr)
+ );
+ r.val[0] = p0; r.val[1] = p1;
+ return r;
+}
+__INLINE VS LOADSPR3(data_t *addr) {
+ VS r;
+ register V p0 __asm__ ("q14") ;
+ register V p1 __asm__ ("q15") ;
+ __asm__ __volatile__("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+ : "=&w" (p0), "=&w" (p1)
+ : "r" (addr)
+ );
+ r.val[0] = p0; r.val[1] = p1;
+ return r;
+}
+__INLINE VS LOADSPRI(data_t * restrict * addr) {
+ VS r;
+ register V p0 __asm__ ("q2") ;
+ register V p1 __asm__ ("q3") ;
+ __asm__ __volatile__("vld1.32 {%q0,%q1}, [%2, :128]!\n\t"
+ : "=&w" (p0), "=&w" (p1), "+r" (*addr)
+ :
+ );
+ r.val[0] = p0; r.val[1] = p1;
+ return r;
+}
+
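+/* Recursion bodies for split-format data: X_4_SPLIT performs one butterfly pass
+   over the four quarters of a length-N transform, X_8_SPLIT (and the transposing
+   X_8_SPLIT_T) one pass over eight blocks, consuming twiddles from LUT as it
+   goes. */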
+__INLINE void X_4_SPLIT(data_t * restrict data, size_t N, data_t * restrict LUT) {
+
+//size_t i;
+//for(i=0;i<N/4/2/2;i++) {
+ VS uk = LOADSPR0(data);
+ VS uk2 = LOADSPR1(data + 2*N/4);
+ VS zk_p = LOADSPR2(data + 4*N/4);
+ VS zk_n = LOADSPR3(data + 6*N/4);
+
+ VSK_N(LOADSPRI(&LUT), &uk, &uk2, &zk_p, &zk_n);
+
+ STORESPR0(data, uk);
+ STORESPR1(data + 2*N/4, uk2);
+ STORESPR2(data + 4*N/4, zk_p);
+ STORESPR3(data + 6*N/4, zk_n);
+
+// LUT += 8;
+// data += 8;
+// }
+}
+
+__INLINE void X_8_SPLIT(data_t * restrict data0, size_t N, data_t * restrict LUT) {
+ data_t *data2 = data0 + 2*N/4;
+ data_t *data4 = data0 + 4*N/4;
+ data_t *data6 = data0 + 6*N/4;
+ data_t *data1 = data0 + 1*N/4;
+ data_t *data3 = data0 + 3*N/4;
+ data_t *data5 = data0 + 5*N/4;
+ data_t *data7 = data0 + 7*N/4;
+ size_t k, n4 = N/4;
+
+ for(k=N/8/2/2;k>0;--k) {
+ VS r0, r1, r2, r3, r4, r5, r6, r7,w;
+ r0 = LOADSPR0(data0);
+ r2 = LOADSPR1(data2);
+ r1 = LOADSPR2(data1);
+ r3 = LOADSPR3(data3);
+ VSK_N(LOADSPRI(&LUT), &r0, &r1, &r2, &r3);
+ STORESPR2(data1, r1);
+ STORESPR3(data3, r3);
+ r4 = LOADSPR2(data4);
+ r6 = LOADSPR3(data6);
+ VSK_N(LOADSPRI(&LUT), &r0, &r2, &r4, &r6);
+ STORESPRI0(&data0, r0); //data0 += 8;
+ STORESPRI1(&data2, r2); //data2 += 8;
+ STORESPRI2(&data4, r4); //data4 += 8;
+ STORESPRI3(&data6, r6); //data6 += 8;
+ r1 = LOADSPR0(data1);
+ r3 = LOADSPR1(data3);
+ r5 = LOADSPR2(data5);
+ r7 = LOADSPR3(data7);
+ VSK_N(LOADSPRI(&LUT), &r1, &r3, &r5, &r7);
+ // LUT += 24;
+ STORESPRI0(&data1, r1); //data1 += 8;
+ STORESPRI1(&data3, r3); //data3 += 8;
+ STORESPRI2(&data5, r5); //data5 += 8;
+ STORESPRI3(&data7, r7); //data7 += 8;
+ }
+}
+
+__INLINE void X_8_SPLIT_T(data_t * restrict data0, size_t N, data_t * restrict LUT) {
+ data_t *data2 = data0 + 2*N/4;
+ data_t *data4 = data0 + 4*N/4;
+ data_t *data6 = data0 + 6*N/4;
+ data_t *data1 = data0 + 1*N/4;
+ data_t *data3 = data0 + 3*N/4;
+ data_t *data5 = data0 + 5*N/4;
+ data_t *data7 = data0 + 7*N/4;
+ size_t k, n4 = N/4;
+
+ for(k=N/8/2/2;k>0;--k) {
+ VS r0, r1, r2, r3, r4, r5, r6, r7,w;
+ r0 = LOADSPR0(data0);
+ r2 = LOADSPR1(data2);
+ r1 = LOADSPR2(data1);
+ r3 = LOADSPR3(data3);
+ VSK_N(LOADSPRI(&LUT), &r0, &r1, &r2, &r3);
+ STORESPR2(data1, r1);
+ STORESPR3(data3, r3);
+ r4 = LOADSPR2(data4);
+ r6 = LOADSPR3(data6);
+ VSK_N(LOADSPRI(&LUT), &r0, &r2, &r4, &r6);
+ STORESPRIT0(&data0, r0); //data0 += 8;
+ STORESPRIT1(&data2, r2); //data2 += 8;
+ STORESPRIT2(&data4, r4); //data4 += 8;
+ STORESPRIT3(&data6, r6); //data6 += 8;
+ r1 = LOADSPR0(data1);
+ r3 = LOADSPR1(data3);
+ r5 = LOADSPR2(data5);
+ r7 = LOADSPR3(data7);
+ VSK_N(LOADSPRI(&LUT), &r1, &r3, &r5, &r7);
+ STORESPRIT0(&data1, r1); //data1 += 8;
+ STORESPRIT1(&data3, r3); //data3 += 8;
+ STORESPRIT2(&data5, r5); //data5 += 8;
+ STORESPRIT3(&data7, r7); //data7 += 8;
+ }
+}
+__INLINE V LOAD2I(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]!\n\t"
+ : "=w" (o), "+r" (*addr)
+ :
+ );
+
+ return o;
+}
+__INLINE V LOAD2I_0(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag0\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_1(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag1\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_2(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag2\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_3(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag3\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_4(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag4\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_5(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag5\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_6(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag6\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_7(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag7\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+
+
+
+__INLINE V LOADI(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOADI_2(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t @tag2" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOADI_3(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t @tag3" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
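+/*
+ * The HSP_* helpers treat each q register as two complex values in split
+ * layout: the low 64 bits hold the two real parts, the high 64 bits the
+ * matching imaginary parts.
+ *   HSP_MUL(d, w)      = d * w          (complex multiply)
+ *   HSP_MULJ(d, w)     = d * conj(w)
+ *   HSP_SUB_MULI(a, b) = a - i*b
+ *   HSP_ADD_MULI(a, b) = a + i*b
+ */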
+__INLINE V HSP_MUL(V *d, const V *w) {
+ V t;
+ t = vcombine_f32(vmul_f32(vget_low_f32(*d), vget_low_f32(*w)),
+ vmul_f32(vget_low_f32(*d), vget_high_f32(*w)));
+ t = vcombine_f32(vmls_f32(vget_low_f32(t), vget_high_f32(*d), vget_high_f32(*w)),
+ vmla_f32(vget_high_f32(t), vget_high_f32(*d), vget_low_f32(*w)));
+ return t;
+}
+__INLINE V HSP_MULJ(V *d, const V *w) {
+ V t;
+ t = vcombine_f32(vmul_f32(vget_low_f32(*d), vget_low_f32(*w)),
+ vmul_f32(vget_high_f32(*d), vget_low_f32(*w)));
+ t = vcombine_f32(vmla_f32(vget_low_f32(t), vget_high_f32(*d), vget_high_f32(*w)),
+ vmls_f32(vget_high_f32(t), vget_low_f32(*d), vget_high_f32(*w)));
+ return t;
+}
+__INLINE V HSP_SUB_MULI(V *a, V *b) {
+ return vcombine_f32(vadd_f32(vget_low_f32(*a), vget_high_f32(*b)), vsub_f32(vget_high_f32(*a), vget_low_f32(*b)));
+}
+__INLINE V HSP_ADD_MULI(V *a, V *b) {
+ return vcombine_f32(vsub_f32(vget_low_f32(*a), vget_high_f32(*b)), vadd_f32(vget_high_f32(*a), vget_low_f32(*b)));
+}
+
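+/*
+ * K_N_HSP: split-radix butterfly on four half-split vectors.  r2 and r3
+ * are rotated by w and conj(w); their sum (zk) and difference (zk_d) are
+ * then combined with uk = r0 and uk2 = r1:
+ *   r0 = uk + zk,  r2 = uk - zk,  r1 = uk2 - i*zk_d,  r3 = uk2 + i*zk_d.
+ */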
+__INLINE void K_N_HSP(const V *w, V *r0, V *r1, V *r2, V *r3) {
+ V uk, uk2, zk_p, zk_n, zk, zk_d;
+
+ uk = *r0;
+ uk2 = *r1;
+ zk_p = HSP_MUL(r2, w);
+ zk_n = HSP_MULJ(r3, w);
+ zk = ADD(zk_p, zk_n);
+ zk_d = SUB(zk_p, zk_n);
+
+ *r2 = SUB(uk, zk);
+ *r0 = ADD(uk, zk);
+ *r3 = HSP_ADD_MULI(&uk2, &zk_d);
+ *r1 = HSP_SUB_MULI(&uk2, &zk_d);
+}
+
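+/*
+ * neon_shl8_ee: size-8 leaf transform.  The eight input streams are
+ * deinterleaved with vld2, pushed through radix-2/4 sub-stages plus a
+ * K_N_HSP stage twiddled by ee_w_data, then transposed and stored
+ * interleaved (vst2) to out0 and out1.
+ */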
+__INLINE void neon_shl8_ee(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) {
+
+ V r0, r1, r2, r3, r4, r5, r6, r7;
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+
+
+ t0 = LOAD2I_0(i0);
+ t1 = LOAD2I_1(i1);
+ t2 = LOAD2I_2(i2);
+ t3 = LOAD2I_3(i3);
+ t4 = ADD (t0, t1);
+ t5 = SUB (t0, t1);
+ t6 = ADD (t2, t3);
+ t7 = SUB (t2, t3);
+ r0 = ADD (t4, t6);
+ r2 = SUB (t4, t6);
+ r1 = HSP_SUB_MULI(&t5, &t7);
+ r3 = HSP_ADD_MULI(&t5, &t7);
+
+ t0 = LOAD2I_4(i4);
+ t1 = LOAD2I_5(i5);
+ t2 = LOAD2I_6(i6);
+ t3 = LOAD2I_7(i7);
+ r4 = ADD (t0, t1);
+ r5 = SUB (t0, t1);
+ r6 = ADD (t2, t3);
+ r7 = SUB (t2, t3);
+
+ t0 = r0; t1 = r2;
+ t2 = ADD(r4, r6);
+ t3 = SUB(r4, r6);
+ r0 = ADD(t0, t2);
+ r4 = SUB(t0, t2);
+ r2 = HSP_SUB_MULI(&t1, &t3);
+ r6 = HSP_ADD_MULI(&t1, &t3);
+
+ V w = vld1q_f32(ee_w_data);
+
+ K_N_HSP(&w,&r1,&r3,&r5,&r7);
+
+ float32x4x2_t tmp1 = vtrnq_f32(r0, r2);
+ r0 = tmp1.val[0];
+ r2 = tmp1.val[1];
+ float32x4x2_t tmp4 = vtrnq_f32(r1, r3);
+ r1 = tmp4.val[0];
+ r3 = tmp4.val[1];
+ register V tt0 __asm__ ("q0") = r0;
+ register V tt1 __asm__ ("q1") = r1;
+ register V tt2 __asm__ ("q2") = r2;
+ register V tt3 __asm__ ("q3") = r3;
+ __asm__ __volatile__ ("vst2.32 {q0,q1}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt0), "w"(tt1) : "memory");
+ __asm__ __volatile__ ("vst2.32 {q2,q3}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt2), "w"(tt3) : "memory");
+
+ float32x4x2_t tmp2 = vtrnq_f32(r4, r6);
+ r4 = tmp2.val[0];
+ r6 = tmp2.val[1];
+ float32x4x2_t tmp3 = vtrnq_f32(r5, r7);
+ r5 = tmp3.val[0];
+ r7 = tmp3.val[1];
+ register V tt4 __asm__ ("q4") = r4;
+ register V tt5 __asm__ ("q5") = r5;
+ register V tt6 __asm__ ("q6") = r6;
+ register V tt7 __asm__ ("q7") = r7;
+
+ __asm__ __volatile__ ("vst2.32 {q4,q5}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt4), "w"(tt5) : "memory");
+ __asm__ __volatile__ ("vst2.32 {q6,q7}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt6), "w"(tt7) : "memory");
+
+}
+
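+/*
+ * neon_shl8_oo: leaf transform built from two independent radix-4 blocks
+ * (i0..i3 and i4..i7).  There is no twiddled combine stage; each block is
+ * transposed and stored interleaved to out0/out1 as it is produced.
+ */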
+__INLINE void neon_shl8_oo(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) {
+
+ V r0, r1, r2, r3, r4, r5, r6, r7;
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+
+ t0 = LOAD2I_0(i0);
+ t1 = LOAD2I_1(i1);
+ t2 = LOAD2I_2(i2);
+ t3 = LOAD2I_3(i3);
+ t4 = ADD (t0, t1);
+ t5 = SUB (t0, t1);
+ t6 = ADD (t2, t3);
+ t7 = SUB (t2, t3);
+ r0 = ADD (t4, t6);
+ r2 = SUB (t4, t6);
+ r1 = HSP_SUB_MULI(&t5, &t7);
+ r3 = HSP_ADD_MULI(&t5, &t7);
+
+ float32x4x2_t tmp1 = vtrnq_f32(r0, r2);
+ r0 = tmp1.val[0];
+ r2 = tmp1.val[1];
+ float32x4x2_t tmp4 = vtrnq_f32(r1, r3);
+ r1 = tmp4.val[0];
+ r3 = tmp4.val[1];
+ register V tt0 __asm__ ("q0") = r0;
+ register V tt1 __asm__ ("q1") = r1;
+ register V tt2 __asm__ ("q2") = r2;
+ register V tt3 __asm__ ("q3") = r3;
+ __asm__ __volatile__ ("vst2.32 {q0,q1}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt0), "w"(tt1) : "memory");
+ __asm__ __volatile__ ("vst2.32 {q2,q3}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt2), "w"(tt3) : "memory");
+
+
+
+ t0 = LOAD2I_4(i4);
+ t1 = LOAD2I_5(i5);
+ t2 = LOAD2I_6(i6);
+ t3 = LOAD2I_7(i7);
+ t4 = ADD (t0, t1);
+ t5 = SUB (t0, t1);
+ t6 = ADD (t2, t3);
+ t7 = SUB (t2, t3);
+ r4 = ADD (t4, t6);
+ r6 = SUB (t4, t6);
+ r5 = HSP_SUB_MULI(&t5, &t7);
+ r7 = HSP_ADD_MULI(&t5, &t7);
+
+ float32x4x2_t tmp2 = vtrnq_f32(r4, r6);
+ r4 = tmp2.val[0];
+ r6 = tmp2.val[1];
+ float32x4x2_t tmp3 = vtrnq_f32(r5, r7);
+ r5 = tmp3.val[0];
+ r7 = tmp3.val[1];
+
+
+ register V tt4 __asm__ ("q4") = r4;
+ register V tt5 __asm__ ("q5") = r5;
+ register V tt6 __asm__ ("q6") = r6;
+ register V tt7 __asm__ ("q7") = r7;
+
+ __asm__ __volatile__ ("vst2.32 {q4,q5}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt4), "w"(tt5) : "memory");
+ __asm__ __volatile__ ("vst2.32 {q6,q7}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt6), "w"(tt7) : "memory");
+
+
+
+}
+
+static const __attribute__ ((aligned(16))) data_t eo_w_data[4] = {1.0f, 0.70710678118654757273731092936941f, 0.0f, -0.70710678118654746171500846685376f};
+
+
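+/*
+ * neon_shl8_eo: mixed leaf.  The first four inputs go through a radix-4
+ * block and the last four through radix-2 combines; half of the results
+ * are stored to out1 immediately, while the remaining register pairs are
+ * combined with the eo_w_data twiddles via K_N_HSP and written to out0.
+ */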
+__INLINE void neon_shl8_eo(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) {
+ /*
+ register V r0_1 __asm__ ("q0");
+ register V r2_3 __asm__ ("q1");
+ register V r4_5 __asm__ ("q2");
+ register V r6_7 __asm__ ("q3");
+ */
+ const V w = vld1q_f32(eo_w_data);
+
+ V r0_1, r2_3, r4_5, r6_7;
+
+ register V r8_9 __asm__ ("q4");
+ register V r10_11 __asm__ ("q5");
+ register V r12_13 __asm__ ("q6");
+ register V r14_15 __asm__ ("q7");
+
+ {
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+ t0 = LOAD2I_0(i0);
+ t1 = LOAD2I_1(i1);
+ t2 = LOAD2I_2(i2);
+ t3 = LOAD2I_3(i3);
+ t4 = ADD(t0, t1);
+ t5 = SUB(t0, t1);
+ t6 = ADD(t2, t3);
+ t7 = SUB(t2, t3);
+
+ t0 = ADD(t4, t6);
+ t2 = SUB(t4, t6);
+ t1 = HSP_SUB_MULI(&t5, &t7);
+ t3 = HSP_ADD_MULI(&t5, &t7);
+
+ float32x4x2_t tmp1 = vtrnq_f32(t0, t1);
+ t0 = tmp1.val[0];
+ t1 = tmp1.val[1];
+ float32x4x2_t tmp2 = vtrnq_f32(t2, t3);
+ t2 = tmp2.val[0];
+ t3 = tmp2.val[1];
+
+ r0_1 = t0;
+ r2_3 = t2;
+ r8_9 = t1;
+ r10_11 = t3;
+ __asm__ __volatile__ ("vswp d9,d10\n\t"
+ "vst1.32 {d8,d9,d10,d11}, [%0, :128]!\n\t"
+// "vst1.32 {d8,d9}, [%0, :128]!\n\t"
+// "vst1.32 {d10,d11}, [%0, :128]!\n\t"
+ : "+&r" (out1)
+ : "w" (r8_9), "w" (r10_11)
+ : "memory");
+
+ }
+ {
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+ t0 = LOAD2I_4(i4);
+ t1 = LOAD2I_5(i5);
+ t2 = LOAD2I_6(i6);
+ t3 = LOAD2I_7(i7);
+ //t2 = HALFBLEND(t6, t7);
+ //t3 = HALFBLEND(t7, t6);
+ t4 = ADD(t0, t1);
+ t5 = SUB(t0, t1);
+ t6 = ADD(t2, t3);
+ t7 = SUB(t2, t3);
+ float32x4x2_t tmp1 = vtrnq_f32(t4, t5);
+ r4_5 = tmp1.val[0];
+ float32x4x2_t tmp2 = vtrnq_f32(t6, t7);
+ r6_7 = tmp2.val[0];
+ //t5 = MULI(t5);
+ t0 = ADD(t6, t4);
+ t2 = SUB(t6, t4);
+ t1 = HSP_SUB_MULI(&t7, &t5);
+ t3 = HSP_ADD_MULI(&t7, &t5);
+
+ float32x4x2_t tmp3 = vtrnq_f32(t0, t1);
+ r12_13 = tmp3.val[1];
+ float32x4x2_t tmp4 = vtrnq_f32(t2, t3);
+ r14_15 = tmp4.val[1];
+
+
+ __asm__ __volatile__ ("vswp d13, d14\n\t"
+ "vst1.32 {d12,d13,d14,d15}, [%0, :128]!\n\t"
+// "vst1.32 {d12,d13}, [%0, :128]!\n\t"
+// "vst1.32 {d14,d15}, [%0, :128]!\n\t"
+ : "+&r" (out1)
+ : "w" (r12_13), "w" (r14_15)
+ : "memory");
+
+
+ }
+
+ K_N_HSP(&w,&r0_1,&r2_3,&r4_5,&r6_7);
+
+ register V t0 __asm__ ("q0") = r0_1;
+ register V t1 __asm__ ("q1") = r2_3;
+ register V t2 __asm__ ("q2") = r4_5;
+ register V t3 __asm__ ("q3") = r6_7;
+
+ __asm__ __volatile__ ("vswp d1, d2\n\t"
+ "vswp d5, d6\n\t"
+ "vstmia %0!, {q0-q3}\n\t"
+// "vst1.32 {d0,d1}, [%0, :128]!\n\t"
+// "vst1.32 {d2,d3}, [%0, :128]!\n\t"
+// "vst1.32 {d4,d5}, [%0, :128]!\n\t"
+// "vst1.32 {d6,d7}, [%0, :128]\n\t"
+ : "+&r" (out0)
+ : "w" (t0), "w" (t1), "w" (t2), "w" (t3)
+ : "memory");
+
+}
+static const __attribute__ ((aligned(16))) data_t oe_w_data[4] = {1.0f, 0.70710678118654757273731092936941f, 0.0f, -0.70710678118654746171500846685376f};
+
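+/* neon_shl8_oe: mirror of neon_shl8_eo -- the untwiddled results are
+   written to out0, and the K_N_HSP stage (oe_w_data twiddles) feeds the
+   stores to out1. */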
+__INLINE void neon_shl8_oe(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) {
+ register V r0_1 __asm__ ("q0");
+ register V r2_3 __asm__ ("q1");
+ register V r4_5 __asm__ ("q2");
+ register V r6_7 __asm__ ("q3");
+
+ V r8_9, r10_11, r12_13, r14_15;
+ const V w = vld1q_f32(oe_w_data);
+
+ {
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+ t0 = LOAD2I_0(i0);
+ t1 = LOAD2I_1(i1);
+ t6 = LOADI_2(i2);
+ t7 = LOADI_3(i3);
+
+ float32x2x2_t tmp0 = vtrn_f32(vget_low_f32(t6), vget_high_f32(t7));
+ float32x2x2_t tmp1 = vtrn_f32(vget_low_f32(t7), vget_high_f32(t6));
+ t2 = vcombine_f32(tmp0.val[0], tmp0.val[1]);
+ t3 = vcombine_f32(tmp1.val[0], tmp1.val[1]);
+
+ t4 = ADD(t0, t1);
+ t5 = SUB(t0, t1);
+ t6 = ADD(t2, t3);
+ t7 = SUB(t2, t3);
+ float32x4x2_t tmp2 = vtrnq_f32(t4, t5);
+ r12_13 = tmp2.val[1];
+ float32x4x2_t tmp3 = vtrnq_f32(t6, t7);
+ r14_15 = tmp3.val[1];
+
+ t0 = ADD(t4, t6);
+ t2 = SUB(t4, t6);
+ t1 = HSP_SUB_MULI(&t5, &t7);
+ t3 = HSP_ADD_MULI(&t5, &t7);
+ float32x4x2_t tmp4 = vtrnq_f32(t0, t1);
+ r0_1 = tmp4.val[0];
+ float32x4x2_t tmp5 = vtrnq_f32(t2, t3);
+ r2_3 = tmp5.val[0];
+ __asm__ __volatile__ ("vswp d1, d2\n\t"
+ "vst1.32 {q0, q1}, [%0, :128]!\n\t"
+// "vst1.32 {q1}, [%0, :128]!\n\t"
+ : "+&r" (out0)
+ : "w" (r0_1), "w" (r2_3)
+ : "memory");
+ }
+ {
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+ t0 = LOAD2I_4(i4);
+ t1 = LOAD2I_5(i5);
+ t2 = LOAD2I_6(i6);
+ t3 = LOAD2I_7(i7);
+ t4 = ADD(t0, t1);
+ t5 = SUB(t0, t1);
+ t6 = ADD(t2, t3);
+ t7 = SUB(t2, t3);
+ t0 = ADD(t4, t6);
+ t2 = SUB(t4, t6);
+ t1 = HSP_SUB_MULI(&t5, &t7);
+ t3 = HSP_ADD_MULI(&t5, &t7);
+
+ float32x4x2_t tmp0 = vtrnq_f32(t0, t1);
+ r4_5 = tmp0.val[0];
+ r8_9 = tmp0.val[1];
+ float32x4x2_t tmp1 = vtrnq_f32(t2, t3);
+ r6_7 = tmp1.val[0];
+ r10_11 = tmp1.val[1];
+
+
+ __asm__ __volatile__ ("vswp d5, d6\n\t"
+ "vst1.32 {q2, q3}, [%0, :128]!\n\t"
+// "vst1.32 {q3}, [%0, :128]!\n\t"
+ : "+&r" (out0)
+ : "w" (r4_5), "w" (r6_7)
+ : "memory");
+
+ }
+
+ K_N_HSP(&w,&r8_9,&r10_11,&r12_13,&r14_15);
+ register V t0 __asm__ ("q4") = r8_9;
+ register V t1 __asm__ ("q5") = r10_11;
+ register V t2 __asm__ ("q6") = r12_13;
+ register V t3 __asm__ ("q7") = r14_15;
+
+ __asm__ __volatile__ ("vswp d9, d10\n\t"
+ "vswp d13, d14\n\t"
+ "vstmia %0!, {q4-q7}\n\t"
+// "vst1.32 {q4}, [%0, :128]!\n\t"
+// "vst1.32 {q5}, [%0, :128]!\n\t"
+// "vst1.32 {q6}, [%0, :128]!\n\t"
+// "vst1.32 {q7}, [%0, :128]\n\t"
+ : "+&r" (out1)
+ : "w" (t0), "w" (t1), "w" (t2), "w" (t3)
+ : "memory");
+
+
+}
+#endif
diff --git a/lib/ffts/src/neon_static_f.s b/lib/ffts/src/neon_static_f.s
new file mode 100644
index 0000000..920d13c
--- /dev/null
+++ b/lib/ffts/src/neon_static_f.s
@@ -0,0 +1,956 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_static_e_f
+_neon_static_e_f:
+#else
+ .globl neon_static_e_f
+neon_static_e_f:
+#endif
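+	@ arguments: r0 = plan (p), r1 = input, r2 = output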
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ vstmdb sp!, {d8-d15}
+ ldr lr, [r0, #40] @ this is p->N
+ add r3, r1, #0
+ add r7, r1, lr
+ add r5, r7, lr
+ add r10, r5, lr
+ add r4, r10, lr
+ add r8, r4, lr
+ add r6, r8, lr
+ add r9, r6, lr
+ ldr r12, [r0]
+ add r1, r0, #0
+ add r0, r2, #0
+ ldr r2, [r1, #16] @ this is p->ee_ws
+ ldr r11, [r1, #28] @ this is p->i0
+
+ vld1.32 {d16, d17}, [r2, :128]
+_neon_ee_loop:
+ vld2.32 {q15}, [r10, :128]!
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vld2.32 {q9}, [r4, :128]!
+ vld2.32 {q10}, [r3, :128]!
+ vld2.32 {q11}, [r6, :128]!
+ vld2.32 {q12}, [r5, :128]!
+ vsub.f32 q1, q14, q13
+ vld2.32 {q0}, [r9, :128]!
+ subs r11, r11, #1
+ vsub.f32 q2, q0, q15
+ vadd.f32 q0, q0, q15
+ vmul.f32 d10, d2, d17
+ vmul.f32 d11, d3, d16
+ vmul.f32 d12, d3, d17
+ vmul.f32 d6, d4, d17
+ vmul.f32 d7, d5, d16
+ vmul.f32 d8, d4, d16
+ vmul.f32 d9, d5, d17
+ vmul.f32 d13, d2, d16
+ vsub.f32 d7, d7, d6
+ vadd.f32 d11, d11, d10
+ vsub.f32 q1, q12, q11
+ vsub.f32 q2, q10, q9
+ vadd.f32 d6, d9, d8
+ vadd.f32 q4, q14, q13
+ vadd.f32 q11, q12, q11
+ vadd.f32 q12, q10, q9
+ vsub.f32 d10, d13, d12
+ vsub.f32 q7, q4, q0
+ vsub.f32 q9, q12, q11
+ vsub.f32 q13, q5, q3
+ vsub.f32 d29, d5, d2 @
+ vadd.f32 q5, q5, q3
+ vadd.f32 q10, q4, q0
+ vadd.f32 q11, q12, q11
+ vadd.f32 d31, d5, d2 @
+ vadd.f32 d28, d4, d3 @
+ vsub.f32 d30, d4, d3 @
+ vsub.f32 d5, d19, d14 @
+ vsub.f32 d7, d31, d26 @
+ vadd.f32 q1, q14, q5
+ vadd.f32 q0, q11, q10
+ vadd.f32 d6, d30, d27 @
+ vadd.f32 d4, d18, d15 @
+ vadd.f32 d13, d19, d14 @
+ vsub.f32 d12, d18, d15 @
+ vadd.f32 d15, d31, d26 @
+ ldr r2, [r12], #4
+ vtrn.32 q1, q3
+ ldr lr, [r12], #4
+ vtrn.32 q0, q2
+ add r2, r0, r2, lsl #2
+ vsub.f32 q4, q11, q10
+ add lr, r0, lr, lsl #2
+ vsub.f32 q5, q14, q5
+ vsub.f32 d14, d30, d27 @
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_ee_loop
+
+ ldr r11, [r1, #12]
+ vld2.32 {q9}, [r5, :128]! @tag2
+ vld2.32 {q13}, [r3, :128]! @tag0
+ vld2.32 {q12}, [r4, :128]! @tag1
+ vld2.32 {q0}, [r7, :128]! @tag4
+ vsub.f32 q11, q13, q12
+ vld2.32 {q8}, [r6, :128]! @tag3
+ vadd.f32 q12, q13, q12
+ vsub.f32 q10, q9, q8
+ vadd.f32 q8, q9, q8
+ vadd.f32 q9, q12, q8
+ vsub.f32 d9, d23, d20 @
+ vadd.f32 d11, d23, d20 @
+ vsub.f32 q8, q12, q8
+ vadd.f32 d8, d22, d21 @
+ vsub.f32 d10, d22, d21 @
+ ldr r2, [r12], #4
+ vld1.32 {d20, d21}, [r11, :128]
+ ldr lr, [r12], #4
+ vtrn.32 q9, q4
+ add r2, r0, r2, lsl #2
+ vtrn.32 q8, q5
+ add lr, r0, lr, lsl #2
+ vswp d9,d10
+ vst1.32 {d8,d9,d10,d11}, [lr, :128]!
+ vld2.32 {q13}, [r10, :128]! @tag7
+ vld2.32 {q15}, [r9, :128]! @tag6
+ vld2.32 {q11}, [r8, :128]! @tag5
+ vsub.f32 q14, q15, q13
+ vsub.f32 q12, q0, q11
+ vadd.f32 q11, q0, q11
+ vadd.f32 q13, q15, q13
+ vsub.f32 d13, d29, d24 @
+ vadd.f32 q15, q13, q11
+ vadd.f32 d12, d28, d25 @
+ vadd.f32 d15, d29, d24 @
+ vsub.f32 d14, d28, d25 @
+ vtrn.32 q15, q6
+ vsub.f32 q15, q13, q11
+ vtrn.32 q15, q7
+ vswp d13, d14
+ vst1.32 {d12,d13,d14,d15}, [lr, :128]!
+ vtrn.32 q13, q14
+ vtrn.32 q11, q12
+ vmul.f32 d24, d26, d21
+ vmul.f32 d28, d27, d20
+ vmul.f32 d25, d26, d20
+ vmul.f32 d26, d27, d21
+ vmul.f32 d27, d22, d21
+ vmul.f32 d30, d23, d20
+ vmul.f32 d29, d23, d21
+ vmul.f32 d22, d22, d20
+ vsub.f32 d21, d28, d24
+ vadd.f32 d20, d26, d25
+ vadd.f32 d25, d30, d27
+ vsub.f32 d24, d22, d29
+ vadd.f32 q11, q12, q10
+ vsub.f32 q10, q12, q10
+ vadd.f32 q0, q9, q11
+ vsub.f32 q2, q9, q11
+ vsub.f32 d3, d17, d20 @
+ vadd.f32 d7, d17, d20 @
+ vadd.f32 d2, d16, d21 @
+ vsub.f32 d6, d16, d21 @
+ vswp d1, d2
+ vswp d5, d6
+ vstmia r2!, {q0-q3}
+
+ add r2, r7, #0
+ add r7, r9, #0
+ add r9, r2, #0
+ add r2, r8, #0
+ add r8, r10, #0
+ add r10, r2, #0
+ ldr r11, [r1, #32] @ this is p->i1
+ cmp r11, #0
+ beq _neon_oo_loop_exit
+_neon_oo_loop:
+ vld2.32 {q8}, [r6, :128]!
+ vld2.32 {q9}, [r5, :128]!
+ vld2.32 {q10}, [r4, :128]!
+ vld2.32 {q13}, [r3, :128]!
+ vadd.f32 q11, q9, q8
+ vsub.f32 q8, q9, q8
+ vsub.f32 q9, q13, q10
+ vadd.f32 q12, q13, q10
+ subs r11, r11, #1
+ vld2.32 {q10}, [r7, :128]!
+ vld2.32 {q13}, [r9, :128]!
+ vsub.f32 q2, q12, q11
+ vadd.f32 d7, d19, d16 @
+ vsub.f32 d3, d19, d16 @
+ vsub.f32 d6, d18, d17 @
+ vadd.f32 d2, d18, d17 @
+ vld2.32 {q9}, [r8, :128]!
+ vld2.32 {q8}, [r10, :128]!
+ vadd.f32 q0, q12, q11
+ vadd.f32 q11, q13, q8
+ vadd.f32 q12, q10, q9
+ vsub.f32 q8, q13, q8
+ vsub.f32 q9, q10, q9
+ vsub.f32 q6, q12, q11
+ vadd.f32 q4, q12, q11
+ vtrn.32 q0, q2
+ ldr r2, [r12], #4
+ vadd.f32 d15, d19, d16 @
+ ldr lr, [r12], #4
+ vsub.f32 d11, d19, d16 @
+ vsub.f32 d14, d18, d17 @
+ vadd.f32 d10, d18, d17 @
+ add r2, r0, r2, lsl #2
+ vtrn.32 q1, q3
+ add lr, r0, lr, lsl #2
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_oo_loop
+_neon_oo_loop_exit:
+
+
+ add r2, r3, #0
+ add r3, r7, #0
+ add r7, r2, #0
+ add r2, r4, #0
+ add r4, r8, #0
+ add r8, r2, #0
+ add r2, r5, #0
+ add r5, r9, #0
+ add r9, r2, #0
+ add r2, r6, #0
+ add r6, r10, #0
+ add r10, r2, #0
+ add r2, r9, #0
+ add r9, r10, #0
+ add r10, r2, #0
+ ldr r2, [r1, #16]
+ ldr r11, [r1, #32] @ this is p->i1
+ cmp r11, #0
+ beq _neon_ee_loop2_exit
+
+ vld1.32 {d16, d17}, [r2, :128]
+_neon_ee_loop2:
+ vld2.32 {q15}, [r10, :128]!
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vld2.32 {q9}, [r4, :128]!
+ vld2.32 {q10}, [r3, :128]!
+ vld2.32 {q11}, [r6, :128]!
+ vld2.32 {q12}, [r5, :128]!
+ vsub.f32 q1, q14, q13
+ vld2.32 {q0}, [r9, :128]!
+ subs r11, r11, #1
+ vsub.f32 q2, q0, q15
+ vadd.f32 q0, q0, q15
+ vmul.f32 d10, d2, d17
+ vmul.f32 d11, d3, d16
+ vmul.f32 d12, d3, d17
+ vmul.f32 d6, d4, d17
+ vmul.f32 d7, d5, d16
+ vmul.f32 d8, d4, d16
+ vmul.f32 d9, d5, d17
+ vmul.f32 d13, d2, d16
+ vsub.f32 d7, d7, d6
+ vadd.f32 d11, d11, d10
+ vsub.f32 q1, q12, q11
+ vsub.f32 q2, q10, q9
+ vadd.f32 d6, d9, d8
+ vadd.f32 q4, q14, q13
+ vadd.f32 q11, q12, q11
+ vadd.f32 q12, q10, q9
+ vsub.f32 d10, d13, d12
+ vsub.f32 q7, q4, q0
+ vsub.f32 q9, q12, q11
+ vsub.f32 q13, q5, q3
+ vsub.f32 d29, d5, d2 @
+ vadd.f32 q5, q5, q3
+ vadd.f32 q10, q4, q0
+ vadd.f32 q11, q12, q11
+ vadd.f32 d31, d5, d2 @
+ vadd.f32 d28, d4, d3 @
+ vsub.f32 d30, d4, d3 @
+ vsub.f32 d5, d19, d14 @
+ vsub.f32 d7, d31, d26 @
+ vadd.f32 q1, q14, q5
+ vadd.f32 q0, q11, q10
+ vadd.f32 d6, d30, d27 @
+ vadd.f32 d4, d18, d15 @
+ vadd.f32 d13, d19, d14 @
+ vsub.f32 d12, d18, d15 @
+ vadd.f32 d15, d31, d26 @
+ ldr r2, [r12], #4
+ vtrn.32 q1, q3
+ ldr lr, [r12], #4
+ vtrn.32 q0, q2
+ add r2, r0, r2, lsl #2
+ vsub.f32 q4, q11, q10
+ add lr, r0, lr, lsl #2
+ vsub.f32 q5, q14, q5
+ vsub.f32 d14, d30, d27 @
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_ee_loop2
+_neon_ee_loop2_exit:
+
+ vldmia sp!, {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+
+
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_static_o_f
+_neon_static_o_f:
+#else
+ .globl neon_static_o_f
+neon_static_o_f:
+#endif
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ vstmdb sp!, {d8-d15}
+ ldr lr, [r0, #40] @ this is p->N
+ add r3, r1, #0
+ add r7, r1, lr
+ add r5, r7, lr
+ add r10, r5, lr
+ add r4, r10, lr
+ add r8, r4, lr
+ add r6, r8, lr
+ add r9, r6, lr
+ ldr r12, [r0]
+ add r1, r0, #0
+ add r0, r2, #0
+ ldr r2, [r1, #16] @ this is p->ee_ws
+ ldr r11, [r1, #28] @ this is p->i0
+
+ vld1.32 {d16, d17}, [r2, :128]
+_neon_ee_o_loop:
+ vld2.32 {q15}, [r10, :128]!
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vld2.32 {q9}, [r4, :128]!
+ vld2.32 {q10}, [r3, :128]!
+ vld2.32 {q11}, [r6, :128]!
+ vld2.32 {q12}, [r5, :128]!
+ vsub.f32 q1, q14, q13
+ vld2.32 {q0}, [r9, :128]!
+ subs r11, r11, #1
+ vsub.f32 q2, q0, q15
+ vadd.f32 q0, q0, q15
+ vmul.f32 d10, d2, d17
+ vmul.f32 d11, d3, d16
+ vmul.f32 d12, d3, d17
+ vmul.f32 d6, d4, d17
+ vmul.f32 d7, d5, d16
+ vmul.f32 d8, d4, d16
+ vmul.f32 d9, d5, d17
+ vmul.f32 d13, d2, d16
+ vsub.f32 d7, d7, d6
+ vadd.f32 d11, d11, d10
+ vsub.f32 q1, q12, q11
+ vsub.f32 q2, q10, q9
+ vadd.f32 d6, d9, d8
+ vadd.f32 q4, q14, q13
+ vadd.f32 q11, q12, q11
+ vadd.f32 q12, q10, q9
+ vsub.f32 d10, d13, d12
+ vsub.f32 q7, q4, q0
+ vsub.f32 q9, q12, q11
+ vsub.f32 q13, q5, q3
+ vsub.f32 d29, d5, d2 @
+ vadd.f32 q5, q5, q3
+ vadd.f32 q10, q4, q0
+ vadd.f32 q11, q12, q11
+ vadd.f32 d31, d5, d2 @
+ vadd.f32 d28, d4, d3 @
+ vsub.f32 d30, d4, d3 @
+ vsub.f32 d5, d19, d14 @
+ vsub.f32 d7, d31, d26 @
+ vadd.f32 q1, q14, q5
+ vadd.f32 q0, q11, q10
+ vadd.f32 d6, d30, d27 @
+ vadd.f32 d4, d18, d15 @
+ vadd.f32 d13, d19, d14 @
+ vsub.f32 d12, d18, d15 @
+ vadd.f32 d15, d31, d26 @
+ ldr r2, [r12], #4
+ vtrn.32 q1, q3
+ ldr lr, [r12], #4
+ vtrn.32 q0, q2
+ add r2, r0, r2, lsl #2
+ vsub.f32 q4, q11, q10
+ add lr, r0, lr, lsl #2
+ vsub.f32 q5, q14, q5
+ vsub.f32 d14, d30, d27 @
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_ee_o_loop
+
+ add r2, r7, #0
+ add r7, r9, #0
+ add r9, r2, #0
+ add r2, r8, #0
+ add r8, r10, #0
+ add r10, r2, #0
+ ldr r11, [r1, #32] @ this is p->i1
+ cmp r11, #0
+ beq _neon_oo_o_loop_exit
+_neon_oo_o_loop:
+ vld2.32 {q8}, [r6, :128]!
+ vld2.32 {q9}, [r5, :128]!
+ vld2.32 {q10}, [r4, :128]!
+ vld2.32 {q13}, [r3, :128]!
+ vadd.f32 q11, q9, q8
+ vsub.f32 q8, q9, q8
+ vsub.f32 q9, q13, q10
+ vadd.f32 q12, q13, q10
+ subs r11, r11, #1
+ vld2.32 {q10}, [r7, :128]!
+ vld2.32 {q13}, [r9, :128]!
+ vsub.f32 q2, q12, q11
+ vadd.f32 d7, d19, d16 @
+ vsub.f32 d3, d19, d16 @
+ vsub.f32 d6, d18, d17 @
+ vadd.f32 d2, d18, d17 @
+ vld2.32 {q9}, [r8, :128]!
+ vld2.32 {q8}, [r10, :128]!
+ vadd.f32 q0, q12, q11
+ vadd.f32 q11, q13, q8
+ vadd.f32 q12, q10, q9
+ vsub.f32 q8, q13, q8
+ vsub.f32 q9, q10, q9
+ vsub.f32 q6, q12, q11
+ vadd.f32 q4, q12, q11
+ vtrn.32 q0, q2
+ ldr r2, [r12], #4
+ vadd.f32 d15, d19, d16 @
+ ldr lr, [r12], #4
+ vsub.f32 d11, d19, d16 @
+ vsub.f32 d14, d18, d17 @
+ vadd.f32 d10, d18, d17 @
+ add r2, r0, r2, lsl #2
+ vtrn.32 q1, q3
+ add lr, r0, lr, lsl #2
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_oo_o_loop
+_neon_oo_o_loop_exit:
+
+ ldr r11, [r1, #8]
+ vld1.32 {q8}, [r5, :128]!
+ vld1.32 {q10}, [r6, :128]!
+ vld2.32 {q11}, [r4, :128]!
+ vld2.32 {q13}, [r3, :128]!
+ vld2.32 {q15}, [r10, :128]!
+ vorr d25, d17, d17
+ vorr d24, d20, d20
+ vorr d20, d16, d16
+ vsub.f32 q9, q13, q11
+ vadd.f32 q11, q13, q11
+ ldr r2, [r12], #4
+ vtrn.32 d24, d25
+ ldr lr, [r12], #4
+ vtrn.32 d20, d21
+ add r2, r0, r2, lsl #2
+ vsub.f32 q8, q10, q12
+ add lr, r0, lr, lsl #2
+ vadd.f32 q10, q10, q12
+ vadd.f32 q0, q11, q10
+ vsub.f32 d25, d19, d16 @
+ vadd.f32 d27, d19, d16 @
+ vsub.f32 q1, q11, q10
+ vadd.f32 d24, d18, d17 @
+ vsub.f32 d26, d18, d17 @
+ vtrn.32 q0, q12
+ vtrn.32 q1, q13
+ vld1.32 {d24, d25}, [r11, :128]
+ vswp d1, d2
+ vst1.32 {q0, q1}, [r2, :128]!
+ vld2.32 {q0}, [r9, :128]!
+ vadd.f32 q1, q0, q15
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vsub.f32 q15, q0, q15
+ vsub.f32 q0, q14, q13
+ vadd.f32 q3, q14, q13
+ vadd.f32 q2, q3, q1
+ vsub.f32 d29, d1, d30 @
+ vadd.f32 d27, d1, d30 @
+ vsub.f32 q3, q3, q1
+ vadd.f32 d28, d0, d31 @
+ vsub.f32 d26, d0, d31 @
+ vtrn.32 q2, q14
+ vtrn.32 q3, q13
+ vswp d5, d6
+ vst1.32 {q2, q3}, [r2, :128]!
+ vtrn.32 q11, q9
+ vtrn.32 q10, q8
+ vmul.f32 d20, d18, d25
+ vmul.f32 d22, d19, d24
+ vmul.f32 d21, d19, d25
+ vmul.f32 d18, d18, d24
+ vmul.f32 d19, d16, d25
+ vmul.f32 d30, d17, d24
+ vmul.f32 d23, d16, d24
+ vmul.f32 d24, d17, d25
+ vadd.f32 d17, d22, d20
+ vsub.f32 d16, d18, d21
+ vsub.f32 d21, d30, d19
+ vadd.f32 d20, d24, d23
+ vadd.f32 q9, q8, q10
+ vsub.f32 q8, q8, q10
+ vadd.f32 q4, q14, q9
+ vsub.f32 q6, q14, q9
+ vsub.f32 d11, d27, d16 @
+ vadd.f32 d15, d27, d16 @
+ vadd.f32 d10, d26, d17 @
+ vsub.f32 d14, d26, d17 @
+ vswp d9, d10
+ vswp d13, d14
+ vstmia lr!, {q4-q7}
+
+
+ add r2, r3, #0
+ add r3, r7, #0
+ add r7, r2, #0
+ add r2, r4, #0
+ add r4, r8, #0
+ add r8, r2, #0
+ add r2, r5, #0
+ add r5, r9, #0
+ add r9, r2, #0
+ add r2, r6, #0
+ add r6, r10, #0
+ add r10, r2, #0
+ add r2, r9, #0
+ add r9, r10, #0
+ add r10, r2, #0
+ ldr r2, [r1, #16]
+ ldr r11, [r1, #32] @ this is p->i1
+ cmp r11, #0
+ beq _neon_ee_o_loop2_exit
+
+ vld1.32 {d16, d17}, [r2, :128]
+_neon_ee_o_loop2:
+ vld2.32 {q15}, [r10, :128]!
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vld2.32 {q9}, [r4, :128]!
+ vld2.32 {q10}, [r3, :128]!
+ vld2.32 {q11}, [r6, :128]!
+ vld2.32 {q12}, [r5, :128]!
+ vsub.f32 q1, q14, q13
+ vld2.32 {q0}, [r9, :128]!
+ subs r11, r11, #1
+ vsub.f32 q2, q0, q15
+ vadd.f32 q0, q0, q15
+ vmul.f32 d10, d2, d17
+ vmul.f32 d11, d3, d16
+ vmul.f32 d12, d3, d17
+ vmul.f32 d6, d4, d17
+ vmul.f32 d7, d5, d16
+ vmul.f32 d8, d4, d16
+ vmul.f32 d9, d5, d17
+ vmul.f32 d13, d2, d16
+ vsub.f32 d7, d7, d6
+ vadd.f32 d11, d11, d10
+ vsub.f32 q1, q12, q11
+ vsub.f32 q2, q10, q9
+ vadd.f32 d6, d9, d8
+ vadd.f32 q4, q14, q13
+ vadd.f32 q11, q12, q11
+ vadd.f32 q12, q10, q9
+ vsub.f32 d10, d13, d12
+ vsub.f32 q7, q4, q0
+ vsub.f32 q9, q12, q11
+ vsub.f32 q13, q5, q3
+ vsub.f32 d29, d5, d2 @
+ vadd.f32 q5, q5, q3
+ vadd.f32 q10, q4, q0
+ vadd.f32 q11, q12, q11
+ vadd.f32 d31, d5, d2 @
+ vadd.f32 d28, d4, d3 @
+ vsub.f32 d30, d4, d3 @
+ vsub.f32 d5, d19, d14 @
+ vsub.f32 d7, d31, d26 @
+ vadd.f32 q1, q14, q5
+ vadd.f32 q0, q11, q10
+ vadd.f32 d6, d30, d27 @
+ vadd.f32 d4, d18, d15 @
+ vadd.f32 d13, d19, d14 @
+ vsub.f32 d12, d18, d15 @
+ vadd.f32 d15, d31, d26 @
+ ldr r2, [r12], #4
+ vtrn.32 q1, q3
+ ldr lr, [r12], #4
+ vtrn.32 q0, q2
+ add r2, r0, r2, lsl #2
+ vsub.f32 q4, q11, q10
+ add lr, r0, lr, lsl #2
+ vsub.f32 q5, q14, q5
+ vsub.f32 d14, d30, d27 @
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_ee_o_loop2
+_neon_ee_o_loop2_exit:
+
+ vldmia sp!, {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_static_x4_f
+_neon_static_x4_f:
+#else
+ .globl neon_static_x4_f
+neon_static_x4_f:
+#endif
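+	@ arguments: r0 = data (quarters at r0, r0+2*r1, r0+4*r1, r0+6*r1),
+	@            r1 = stride, r2 = twiddle factors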
+@ add r3, r0, #0
+ push {r4, r5, r6, lr}
+ vstmdb sp!, {d8-d15}
+
+ vld1.32 {q8,q9}, [r0, :128]
+ add r4, r0, r1, lsl #1
+ vld1.32 {q10,q11}, [r4, :128]
+ add r5, r0, r1, lsl #2
+ vld1.32 {q12,q13}, [r5, :128]
+ add r6, r4, r1, lsl #2
+ vld1.32 {q14,q15}, [r6, :128]
+ vld1.32 {q2,q3}, [r2, :128]
+
+ vmul.f32 q0, q13, q3
+ vmul.f32 q5, q12, q2
+ vmul.f32 q1, q14, q2
+ vmul.f32 q4, q14, q3
+ vmul.f32 q14, q12, q3
+ vmul.f32 q13, q13, q2
+ vmul.f32 q12, q15, q3
+ vmul.f32 q2, q15, q2
+ vsub.f32 q0, q5, q0
+ vadd.f32 q13, q13, q14
+ vadd.f32 q12, q12, q1
+ vsub.f32 q1, q2, q4
+ vadd.f32 q15, q0, q12
+ vsub.f32 q12, q0, q12
+ vadd.f32 q14, q13, q1
+ vsub.f32 q13, q13, q1
+ vadd.f32 q0, q8, q15
+ vadd.f32 q1, q9, q14
+ vadd.f32 q2, q10, q13 @
+ vsub.f32 q4, q8, q15
+ vsub.f32 q3, q11, q12 @
+ vst1.32 {q0,q1}, [r0, :128]
+ vsub.f32 q5, q9, q14
+ vsub.f32 q6, q10, q13 @
+ vadd.f32 q7, q11, q12 @
+ vst1.32 {q2,q3}, [r4, :128]
+ vst1.32 {q4,q5}, [r5, :128]
+ vst1.32 {q6,q7}, [r6, :128]
+ vldmia sp!, {d8-d15}
+ pop {r4, r5, r6, pc}
+
+
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_static_x8_f
+_neon_static_x8_f:
+#else
+ .globl neon_static_x8_f
+neon_static_x8_f:
+#endif
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ vstmdb sp!, {d8-d15}
+ mov r11, #0
+ add r3, r0, #0 @ data0
+ add r5, r0, r1, lsl #1 @ data2
+ add r4, r0, r1 @ data1
+ add r7, r5, r1, lsl #1 @ data4
+ add r6, r5, r1 @ data3
+ add r9, r7, r1, lsl #1 @ data6
+ add r8, r7, r1 @ data5
+ add r10, r9, r1 @ data7
+ add r12, r2, #0 @ LUT
+
+ sub r11, r11, r1, lsr #5
+neon_x8_loop:
+ vld1.32 {q2,q3}, [r12, :128]!
+ vld1.32 {q14,q15}, [r6, :128]
+ vld1.32 {q10,q11}, [r5, :128]
+ adds r11, r11, #1
+ vmul.f32 q12, q15, q2
+ vmul.f32 q8, q14, q3
+ vmul.f32 q13, q14, q2
+ vmul.f32 q9, q10, q3
+ vmul.f32 q1, q10, q2
+ vmul.f32 q0, q11, q2
+ vmul.f32 q14, q11, q3
+ vmul.f32 q15, q15, q3
+ vld1.32 {q2,q3}, [r12, :128]!
+ vsub.f32 q10, q12, q8
+ vadd.f32 q11, q0, q9
+ vadd.f32 q8, q15, q13
+ vld1.32 {q12,q13}, [r4, :128]
+ vsub.f32 q9, q1, q14
+ vsub.f32 q15, q11, q10
+ vsub.f32 q14, q9, q8
+ vadd.f32 q4, q12, q15 @
+ vsub.f32 q6, q12, q15 @
+ vsub.f32 q5, q13, q14 @
+ vadd.f32 q7, q13, q14 @
+ vld1.32 {q14,q15}, [r9, :128]
+ vld1.32 {q12,q13}, [r7, :128]
+ vmul.f32 q1, q14, q2
+ vmul.f32 q0, q14, q3
+ vst1.32 {q4,q5}, [r4, :128]
+ vmul.f32 q14, q15, q3
+ vmul.f32 q4, q15, q2
+ vadd.f32 q15, q9, q8
+ vst1.32 {q6,q7}, [r6, :128]
+ vmul.f32 q8, q12, q3
+ vmul.f32 q5, q13, q3
+ vmul.f32 q12, q12, q2
+ vmul.f32 q9, q13, q2
+ vadd.f32 q14, q14, q1
+ vsub.f32 q13, q4, q0
+ vadd.f32 q0, q9, q8
+ vld1.32 {q8,q9}, [r3, :128]
+ vadd.f32 q1, q11, q10
+ vsub.f32 q12, q12, q5
+ vadd.f32 q11, q8, q15
+ vsub.f32 q8, q8, q15
+ vadd.f32 q2, q12, q14
+ vsub.f32 q10, q0, q13
+ vadd.f32 q15, q0, q13
+ vadd.f32 q13, q9, q1
+ vsub.f32 q9, q9, q1
+ vsub.f32 q12, q12, q14
+ vadd.f32 q0, q11, q2
+ vadd.f32 q1, q13, q15
+ vsub.f32 q4, q11, q2
+ vadd.f32 q2, q8, q10 @
+ vsub.f32 q3, q9, q12 @
+ vst1.32 {q0,q1}, [r3, :128]!
+ vsub.f32 q5, q13, q15
+ vld1.32 {q14,q15}, [r10, :128]
+ vadd.f32 q7, q9, q12 @
+ vld1.32 {q12,q13}, [r8, :128]
+ vst1.32 {q2,q3}, [r5, :128]!
+ vld1.32 {q2,q3}, [r12, :128]!
+ vsub.f32 q6, q8, q10 @
+ vmul.f32 q8, q14, q2
+ vst1.32 {q4,q5}, [r7, :128]!
+ vmul.f32 q10, q15, q3
+ vmul.f32 q9, q13, q3
+ vmul.f32 q11, q12, q2
+ vmul.f32 q14, q14, q3
+ vst1.32 {q6,q7}, [r9, :128]!
+ vmul.f32 q15, q15, q2
+ vmul.f32 q12, q12, q3
+ vmul.f32 q13, q13, q2
+ vadd.f32 q10, q10, q8
+ vsub.f32 q11, q11, q9
+ vld1.32 {q8,q9}, [r4, :128]
+ vsub.f32 q14, q15, q14
+ vadd.f32 q15, q13, q12
+ vadd.f32 q13, q11, q10
+ vadd.f32 q12, q15, q14
+ vsub.f32 q15, q15, q14
+ vsub.f32 q14, q11, q10
+ vld1.32 {q10,q11}, [r6, :128]
+ vadd.f32 q0, q8, q13
+ vadd.f32 q1, q9, q12
+ vadd.f32 q2, q10, q15 @
+ vsub.f32 q3, q11, q14 @
+ vsub.f32 q4, q8, q13
+ vst1.32 {q0,q1}, [r4, :128]!
+ vsub.f32 q5, q9, q12
+ vsub.f32 q6, q10, q15 @
+ vst1.32 {q2,q3}, [r6, :128]!
+ vadd.f32 q7, q11, q14 @
+ vst1.32 {q4,q5}, [r8, :128]!
+ vst1.32 {q6,q7}, [r10, :128]!
+ bne neon_x8_loop
+
+ vldmia sp!, {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_static_x8_t_f
+_neon_static_x8_t_f:
+#else
+ .globl neon_static_x8_t_f
+neon_static_x8_t_f:
+#endif
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ vstmdb sp!, {d8-d15}
+ mov r11, #0
+ add r3, r0, #0 @ data0
+ add r5, r0, r1, lsl #1 @ data2
+ add r4, r0, r1 @ data1
+ add r7, r5, r1, lsl #1 @ data4
+ add r6, r5, r1 @ data3
+ add r9, r7, r1, lsl #1 @ data6
+ add r8, r7, r1 @ data5
+ add r10, r9, r1 @ data7
+ add r12, r2, #0 @ LUT
+
+ sub r11, r11, r1, lsr #5
+neon_x8_t_loop:
+ vld1.32 {q2,q3}, [r12, :128]!
+ vld1.32 {q14,q15}, [r6, :128]
+ vld1.32 {q10,q11}, [r5, :128]
+ adds r11, r11, #1
+ vmul.f32 q12, q15, q2
+ vmul.f32 q8, q14, q3
+ vmul.f32 q13, q14, q2
+ vmul.f32 q9, q10, q3
+ vmul.f32 q1, q10, q2
+ vmul.f32 q0, q11, q2
+ vmul.f32 q14, q11, q3
+ vmul.f32 q15, q15, q3
+ vld1.32 {q2,q3}, [r12, :128]!
+ vsub.f32 q10, q12, q8
+ vadd.f32 q11, q0, q9
+ vadd.f32 q8, q15, q13
+ vld1.32 {q12,q13}, [r4, :128]
+ vsub.f32 q9, q1, q14
+ vsub.f32 q15, q11, q10
+ vsub.f32 q14, q9, q8
+ vadd.f32 q4, q12, q15 @
+ vsub.f32 q6, q12, q15 @
+ vsub.f32 q5, q13, q14 @
+ vadd.f32 q7, q13, q14 @
+ vld1.32 {q14,q15}, [r9, :128]
+ vld1.32 {q12,q13}, [r7, :128]
+ vmul.f32 q1, q14, q2
+ vmul.f32 q0, q14, q3
+ vst1.32 {q4,q5}, [r4, :128]
+ vmul.f32 q14, q15, q3
+ vmul.f32 q4, q15, q2
+ vadd.f32 q15, q9, q8
+ vst1.32 {q6,q7}, [r6, :128]
+ vmul.f32 q8, q12, q3
+ vmul.f32 q5, q13, q3
+ vmul.f32 q12, q12, q2
+ vmul.f32 q9, q13, q2
+ vadd.f32 q14, q14, q1
+ vsub.f32 q13, q4, q0
+ vadd.f32 q0, q9, q8
+ vld1.32 {q8,q9}, [r3, :128]
+ vadd.f32 q1, q11, q10
+ vsub.f32 q12, q12, q5
+ vadd.f32 q11, q8, q15
+ vsub.f32 q8, q8, q15
+ vadd.f32 q2, q12, q14
+ vsub.f32 q10, q0, q13
+ vadd.f32 q15, q0, q13
+ vadd.f32 q13, q9, q1
+ vsub.f32 q9, q9, q1
+ vsub.f32 q12, q12, q14
+ vadd.f32 q0, q11, q2
+ vadd.f32 q1, q13, q15
+ vsub.f32 q4, q11, q2
+ vadd.f32 q2, q8, q10 @
+ vsub.f32 q3, q9, q12 @
+ vst2.32 {q0,q1}, [r3, :128]!
+ vsub.f32 q5, q13, q15
+ vld1.32 {q14,q15}, [r10, :128]
+ vadd.f32 q7, q9, q12 @
+ vld1.32 {q12,q13}, [r8, :128]
+ vst2.32 {q2,q3}, [r5, :128]!
+ vld1.32 {q2,q3}, [r12, :128]!
+ vsub.f32 q6, q8, q10 @
+ vmul.f32 q8, q14, q2
+ vst2.32 {q4,q5}, [r7, :128]!
+ vmul.f32 q10, q15, q3
+ vmul.f32 q9, q13, q3
+ vmul.f32 q11, q12, q2
+ vmul.f32 q14, q14, q3
+ vst2.32 {q6,q7}, [r9, :128]!
+ vmul.f32 q15, q15, q2
+ vmul.f32 q12, q12, q3
+ vmul.f32 q13, q13, q2
+ vadd.f32 q10, q10, q8
+ vsub.f32 q11, q11, q9
+ vld1.32 {q8,q9}, [r4, :128]
+ vsub.f32 q14, q15, q14
+ vadd.f32 q15, q13, q12
+ vadd.f32 q13, q11, q10
+ vadd.f32 q12, q15, q14
+ vsub.f32 q15, q15, q14
+ vsub.f32 q14, q11, q10
+ vld1.32 {q10,q11}, [r6, :128]
+ vadd.f32 q0, q8, q13
+ vadd.f32 q1, q9, q12
+ vadd.f32 q2, q10, q15 @
+ vsub.f32 q3, q11, q14 @
+ vsub.f32 q4, q8, q13
+ vst2.32 {q0,q1}, [r4, :128]!
+ vsub.f32 q5, q9, q12
+ vsub.f32 q6, q10, q15 @
+ vst2.32 {q2,q3}, [r6, :128]!
+ vadd.f32 q7, q11, q14 @
+ vst2.32 {q4,q5}, [r8, :128]!
+ vst2.32 {q6,q7}, [r10, :128]!
+ bne neon_x8_t_loop
+
+ vldmia sp!, {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+
diff --git a/lib/ffts/src/neon_static_i.s b/lib/ffts/src/neon_static_i.s
new file mode 100644
index 0000000..cfa766c
--- /dev/null
+++ b/lib/ffts/src/neon_static_i.s
@@ -0,0 +1,955 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
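+	@ Inverse-direction versions of the routines in neon_static_f.s: the
+	@ structure is identical, with the add/sub pairs on the @-marked lines
+	@ swapped for the opposite transform direction.
+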
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_static_e_i
+_neon_static_e_i:
+#else
+ .globl neon_static_e_i
+neon_static_e_i:
+#endif
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ vstmdb sp!, {d8-d15}
+ ldr lr, [r0, #40] @ this is p->N
+ add r3, r1, #0
+ add r7, r1, lr
+ add r5, r7, lr
+ add r10, r5, lr
+ add r4, r10, lr
+ add r8, r4, lr
+ add r6, r8, lr
+ add r9, r6, lr
+ ldr r12, [r0]
+ add r1, r0, #0
+ add r0, r2, #0
+ ldr r2, [r1, #16] @ this is p->ee_ws
+ ldr r11, [r1, #28] @ this is p->i0
+
+ vld1.32 {d16, d17}, [r2, :128]
+_neon_ee_loop:
+ vld2.32 {q15}, [r10, :128]!
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vld2.32 {q9}, [r4, :128]!
+ vld2.32 {q10}, [r3, :128]!
+ vld2.32 {q11}, [r6, :128]!
+ vld2.32 {q12}, [r5, :128]!
+ vsub.f32 q1, q14, q13
+ vld2.32 {q0}, [r9, :128]!
+ subs r11, r11, #1
+ vsub.f32 q2, q0, q15
+ vadd.f32 q0, q0, q15
+ vmul.f32 d10, d2, d17
+ vmul.f32 d11, d3, d16
+ vmul.f32 d12, d3, d17
+ vmul.f32 d6, d4, d17
+ vmul.f32 d7, d5, d16
+ vmul.f32 d8, d4, d16
+ vmul.f32 d9, d5, d17
+ vmul.f32 d13, d2, d16
+ vsub.f32 d7, d7, d6
+ vadd.f32 d11, d11, d10
+ vsub.f32 q1, q12, q11
+ vsub.f32 q2, q10, q9
+ vadd.f32 d6, d9, d8
+ vadd.f32 q4, q14, q13
+ vadd.f32 q11, q12, q11
+ vadd.f32 q12, q10, q9
+ vsub.f32 d10, d13, d12
+ vsub.f32 q7, q4, q0
+ vsub.f32 q9, q12, q11
+ vsub.f32 q13, q5, q3
+ vadd.f32 d29, d5, d2 @
+ vadd.f32 q5, q5, q3
+ vadd.f32 q10, q4, q0
+ vadd.f32 q11, q12, q11
+ vsub.f32 d31, d5, d2 @
+ vsub.f32 d28, d4, d3 @
+ vadd.f32 d30, d4, d3 @
+ vadd.f32 d5, d19, d14 @
+ vadd.f32 d7, d31, d26 @
+ vadd.f32 q1, q14, q5
+ vadd.f32 q0, q11, q10
+ vsub.f32 d6, d30, d27 @
+ vsub.f32 d4, d18, d15 @
+ vsub.f32 d13, d19, d14 @
+ vadd.f32 d12, d18, d15 @
+ vsub.f32 d15, d31, d26 @
+ ldr r2, [r12], #4
+ vtrn.32 q1, q3
+ ldr lr, [r12], #4
+ vtrn.32 q0, q2
+ add r2, r0, r2, lsl #2
+ vsub.f32 q4, q11, q10
+ add lr, r0, lr, lsl #2
+ vsub.f32 q5, q14, q5
+ vadd.f32 d14, d30, d27 @
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_ee_loop
+
+ ldr r11, [r1, #12]
+ vld2.32 {q9}, [r5, :128]! @tag2
+ vld2.32 {q13}, [r3, :128]! @tag0
+ vld2.32 {q12}, [r4, :128]! @tag1
+ vld2.32 {q0}, [r7, :128]! @tag4
+ vsub.f32 q11, q13, q12
+ vld2.32 {q8}, [r6, :128]! @tag3
+ vadd.f32 q12, q13, q12
+ vsub.f32 q10, q9, q8
+ vadd.f32 q8, q9, q8
+ vadd.f32 q9, q12, q8
+ vadd.f32 d9, d23, d20 @
+ vsub.f32 d11, d23, d20 @
+ vsub.f32 q8, q12, q8
+ vsub.f32 d8, d22, d21 @
+ vadd.f32 d10, d22, d21 @
+ ldr r2, [r12], #4
+ vld1.32 {d20, d21}, [r11, :128]
+ ldr lr, [r12], #4
+ vtrn.32 q9, q4
+ add r2, r0, r2, lsl #2
+ vtrn.32 q8, q5
+ add lr, r0, lr, lsl #2
+ vswp d9,d10
+ vst1.32 {d8,d9,d10,d11}, [lr, :128]!
+ vld2.32 {q13}, [r10, :128]! @tag7
+ vld2.32 {q15}, [r9, :128]! @tag6
+ vld2.32 {q11}, [r8, :128]! @tag5
+ vsub.f32 q14, q15, q13
+ vsub.f32 q12, q0, q11
+ vadd.f32 q11, q0, q11
+ vadd.f32 q13, q15, q13
+ vadd.f32 d13, d29, d24 @
+ vadd.f32 q15, q13, q11
+ vsub.f32 d12, d28, d25 @
+ vsub.f32 d15, d29, d24 @
+ vadd.f32 d14, d28, d25 @
+ vtrn.32 q15, q6
+ vsub.f32 q15, q13, q11
+ vtrn.32 q15, q7
+ vswp d13, d14
+ vst1.32 {d12,d13,d14,d15}, [lr, :128]!
+ vtrn.32 q13, q14
+ vtrn.32 q11, q12
+ vmul.f32 d24, d26, d21
+ vmul.f32 d28, d27, d20
+ vmul.f32 d25, d26, d20
+ vmul.f32 d26, d27, d21
+ vmul.f32 d27, d22, d21
+ vmul.f32 d30, d23, d20
+ vmul.f32 d29, d23, d21
+ vmul.f32 d22, d22, d20
+ vsub.f32 d21, d28, d24
+ vadd.f32 d20, d26, d25
+ vadd.f32 d25, d30, d27
+ vsub.f32 d24, d22, d29
+ vadd.f32 q11, q12, q10
+ vsub.f32 q10, q12, q10
+ vadd.f32 q0, q9, q11
+ vsub.f32 q2, q9, q11
+ vadd.f32 d3, d17, d20 @
+ vsub.f32 d7, d17, d20 @
+ vsub.f32 d2, d16, d21 @
+ vadd.f32 d6, d16, d21 @
+ vswp d1, d2
+ vswp d5, d6
+ vstmia r2!, {q0-q3}
+
+ add r2, r7, #0
+ add r7, r9, #0
+ add r9, r2, #0
+ add r2, r8, #0
+ add r8, r10, #0
+ add r10, r2, #0
+ ldr r11, [r1, #32] @ this is p->i1
+ cmp r11, #0
+ beq _neon_oo_loop_exit
+_neon_oo_loop:
+ vld2.32 {q8}, [r6, :128]!
+ vld2.32 {q9}, [r5, :128]!
+ vld2.32 {q10}, [r4, :128]!
+ vld2.32 {q13}, [r3, :128]!
+ vadd.f32 q11, q9, q8
+ vsub.f32 q8, q9, q8
+ vsub.f32 q9, q13, q10
+ vadd.f32 q12, q13, q10
+ subs r11, r11, #1
+ vld2.32 {q10}, [r7, :128]!
+ vld2.32 {q13}, [r9, :128]!
+ vsub.f32 q2, q12, q11
+ vsub.f32 d7, d19, d16 @
+ vadd.f32 d3, d19, d16 @
+ vadd.f32 d6, d18, d17 @
+ vsub.f32 d2, d18, d17 @
+ vld2.32 {q9}, [r8, :128]!
+ vld2.32 {q8}, [r10, :128]!
+ vadd.f32 q0, q12, q11
+ vadd.f32 q11, q13, q8
+ vadd.f32 q12, q10, q9
+ vsub.f32 q8, q13, q8
+ vsub.f32 q9, q10, q9
+ vsub.f32 q6, q12, q11
+ vadd.f32 q4, q12, q11
+ vtrn.32 q0, q2
+ ldr r2, [r12], #4
+ vsub.f32 d15, d19, d16 @
+ ldr lr, [r12], #4
+ vadd.f32 d11, d19, d16 @
+ vadd.f32 d14, d18, d17 @
+ vsub.f32 d10, d18, d17 @
+ add r2, r0, r2, lsl #2
+ vtrn.32 q1, q3
+ add lr, r0, lr, lsl #2
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_oo_loop
+_neon_oo_loop_exit:
+
+ add r2, r3, #0
+ add r3, r7, #0
+ add r7, r2, #0
+ add r2, r4, #0
+ add r4, r8, #0
+ add r8, r2, #0
+ add r2, r5, #0
+ add r5, r9, #0
+ add r9, r2, #0
+ add r2, r6, #0
+ add r6, r10, #0
+ add r10, r2, #0
+ add r2, r9, #0
+ add r9, r10, #0
+ add r10, r2, #0
+ ldr r2, [r1, #16]
+ ldr r11, [r1, #32] @ this is p->i1
+ cmp r11, #0
+ beq _neon_ee_loop2_exit
+
+ vld1.32 {d16, d17}, [r2, :128]
+_neon_ee_loop2:
+ vld2.32 {q15}, [r10, :128]!
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vld2.32 {q9}, [r4, :128]!
+ vld2.32 {q10}, [r3, :128]!
+ vld2.32 {q11}, [r6, :128]!
+ vld2.32 {q12}, [r5, :128]!
+ vsub.f32 q1, q14, q13
+ vld2.32 {q0}, [r9, :128]!
+ subs r11, r11, #1
+ vsub.f32 q2, q0, q15
+ vadd.f32 q0, q0, q15
+ vmul.f32 d10, d2, d17
+ vmul.f32 d11, d3, d16
+ vmul.f32 d12, d3, d17
+ vmul.f32 d6, d4, d17
+ vmul.f32 d7, d5, d16
+ vmul.f32 d8, d4, d16
+ vmul.f32 d9, d5, d17
+ vmul.f32 d13, d2, d16
+ vsub.f32 d7, d7, d6
+ vadd.f32 d11, d11, d10
+ vsub.f32 q1, q12, q11
+ vsub.f32 q2, q10, q9
+ vadd.f32 d6, d9, d8
+ vadd.f32 q4, q14, q13
+ vadd.f32 q11, q12, q11
+ vadd.f32 q12, q10, q9
+ vsub.f32 d10, d13, d12
+ vsub.f32 q7, q4, q0
+ vsub.f32 q9, q12, q11
+ vsub.f32 q13, q5, q3
+ vadd.f32 d29, d5, d2 @
+ vadd.f32 q5, q5, q3
+ vadd.f32 q10, q4, q0
+ vadd.f32 q11, q12, q11
+ vsub.f32 d31, d5, d2 @
+ vsub.f32 d28, d4, d3 @
+ vadd.f32 d30, d4, d3 @
+ vadd.f32 d5, d19, d14 @
+ vadd.f32 d7, d31, d26 @
+ vadd.f32 q1, q14, q5
+ vadd.f32 q0, q11, q10
+ vsub.f32 d6, d30, d27 @
+ vsub.f32 d4, d18, d15 @
+ vsub.f32 d13, d19, d14 @
+ vadd.f32 d12, d18, d15 @
+ vsub.f32 d15, d31, d26 @
+ ldr r2, [r12], #4
+ vtrn.32 q1, q3
+ ldr lr, [r12], #4
+ vtrn.32 q0, q2
+ add r2, r0, r2, lsl #2
+ vsub.f32 q4, q11, q10
+ add lr, r0, lr, lsl #2
+ vsub.f32 q5, q14, q5
+ vadd.f32 d14, d30, d27 @
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_ee_loop2
+_neon_ee_loop2_exit:
+
+ vldmia sp!, {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+
+
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_static_o_i
+_neon_static_o_i:
+#else
+ .globl neon_static_o_i
+neon_static_o_i:
+#endif
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ vstmdb sp!, {d8-d15}
+ ldr lr, [r0, #40] @ this is p->N
+ add r3, r1, #0
+ add r7, r1, lr
+ add r5, r7, lr
+ add r10, r5, lr
+ add r4, r10, lr
+ add r8, r4, lr
+ add r6, r8, lr
+ add r9, r6, lr
+ ldr r12, [r0]
+ add r1, r0, #0
+ add r0, r2, #0
+ ldr r2, [r1, #16] @ this is p->ee_ws
+ ldr r11, [r1, #28] @ this is p->i0
+
+ vld1.32 {d16, d17}, [r2, :128]
+_neon_ee_o_loop:
+ vld2.32 {q15}, [r10, :128]!
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vld2.32 {q9}, [r4, :128]!
+ vld2.32 {q10}, [r3, :128]!
+ vld2.32 {q11}, [r6, :128]!
+ vld2.32 {q12}, [r5, :128]!
+ vsub.f32 q1, q14, q13
+ vld2.32 {q0}, [r9, :128]!
+ subs r11, r11, #1
+ vsub.f32 q2, q0, q15
+ vadd.f32 q0, q0, q15
+ vmul.f32 d10, d2, d17
+ vmul.f32 d11, d3, d16
+ vmul.f32 d12, d3, d17
+ vmul.f32 d6, d4, d17
+ vmul.f32 d7, d5, d16
+ vmul.f32 d8, d4, d16
+ vmul.f32 d9, d5, d17
+ vmul.f32 d13, d2, d16
+ vsub.f32 d7, d7, d6
+ vadd.f32 d11, d11, d10
+ vsub.f32 q1, q12, q11
+ vsub.f32 q2, q10, q9
+ vadd.f32 d6, d9, d8
+ vadd.f32 q4, q14, q13
+ vadd.f32 q11, q12, q11
+ vadd.f32 q12, q10, q9
+ vsub.f32 d10, d13, d12
+ vsub.f32 q7, q4, q0
+ vsub.f32 q9, q12, q11
+ vsub.f32 q13, q5, q3
+ vadd.f32 d29, d5, d2 @
+ vadd.f32 q5, q5, q3
+ vadd.f32 q10, q4, q0
+ vadd.f32 q11, q12, q11
+ vsub.f32 d31, d5, d2 @
+ vsub.f32 d28, d4, d3 @
+ vadd.f32 d30, d4, d3 @
+ vadd.f32 d5, d19, d14 @
+ vadd.f32 d7, d31, d26 @
+ vadd.f32 q1, q14, q5
+ vadd.f32 q0, q11, q10
+ vsub.f32 d6, d30, d27 @
+ vsub.f32 d4, d18, d15 @
+ vsub.f32 d13, d19, d14 @
+ vadd.f32 d12, d18, d15 @
+ vsub.f32 d15, d31, d26 @
+ ldr r2, [r12], #4
+ vtrn.32 q1, q3
+ ldr lr, [r12], #4
+ vtrn.32 q0, q2
+ add r2, r0, r2, lsl #2
+ vsub.f32 q4, q11, q10
+ add lr, r0, lr, lsl #2
+ vsub.f32 q5, q14, q5
+ vadd.f32 d14, d30, d27 @
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_ee_o_loop
+
+ add r2, r7, #0
+ add r7, r9, #0
+ add r9, r2, #0
+ add r2, r8, #0
+ add r8, r10, #0
+ add r10, r2, #0
+ ldr r11, [r1, #32] @ this is p->i1
+ cmp r11, #0
+ beq _neon_oo_o_loop_exit
+_neon_oo_o_loop:
+ vld2.32 {q8}, [r6, :128]!
+ vld2.32 {q9}, [r5, :128]!
+ vld2.32 {q10}, [r4, :128]!
+ vld2.32 {q13}, [r3, :128]!
+ vadd.f32 q11, q9, q8
+ vsub.f32 q8, q9, q8
+ vsub.f32 q9, q13, q10
+ vadd.f32 q12, q13, q10
+ subs r11, r11, #1
+ vld2.32 {q10}, [r7, :128]!
+ vld2.32 {q13}, [r9, :128]!
+ vsub.f32 q2, q12, q11
+ vsub.f32 d7, d19, d16 @
+ vadd.f32 d3, d19, d16 @
+ vadd.f32 d6, d18, d17 @
+ vsub.f32 d2, d18, d17 @
+ vld2.32 {q9}, [r8, :128]!
+ vld2.32 {q8}, [r10, :128]!
+ vadd.f32 q0, q12, q11
+ vadd.f32 q11, q13, q8
+ vadd.f32 q12, q10, q9
+ vsub.f32 q8, q13, q8
+ vsub.f32 q9, q10, q9
+ vsub.f32 q6, q12, q11
+ vadd.f32 q4, q12, q11
+ vtrn.32 q0, q2
+ ldr r2, [r12], #4
+ vsub.f32 d15, d19, d16 @
+ ldr lr, [r12], #4
+ vadd.f32 d11, d19, d16 @
+ vadd.f32 d14, d18, d17 @
+ vsub.f32 d10, d18, d17 @
+ add r2, r0, r2, lsl #2
+ vtrn.32 q1, q3
+ add lr, r0, lr, lsl #2
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_oo_o_loop
+_neon_oo_o_loop_exit:
+
+ ldr r11, [r1, #8]
+ vld1.32 {q8}, [r5, :128]!
+ vld1.32 {q10}, [r6, :128]!
+ vld2.32 {q11}, [r4, :128]!
+ vld2.32 {q13}, [r3, :128]!
+ vld2.32 {q15}, [r10, :128]!
+ vorr d25, d17, d17
+ vorr d24, d20, d20
+ vorr d20, d16, d16
+ vsub.f32 q9, q13, q11
+ vadd.f32 q11, q13, q11
+ ldr r2, [r12], #4
+ vtrn.32 d24, d25
+ ldr lr, [r12], #4
+ vtrn.32 d20, d21
+ add r2, r0, r2, lsl #2
+ vsub.f32 q8, q10, q12
+ add lr, r0, lr, lsl #2
+ vadd.f32 q10, q10, q12
+ vadd.f32 q0, q11, q10
+ vadd.f32 d25, d19, d16 @
+ vsub.f32 d27, d19, d16 @
+ vsub.f32 q1, q11, q10
+ vsub.f32 d24, d18, d17 @
+ vadd.f32 d26, d18, d17 @
+ vtrn.32 q0, q12
+ vtrn.32 q1, q13
+ vld1.32 {d24, d25}, [r11, :128]
+ vswp d1, d2
+ vst1.32 {q0, q1}, [r2, :128]!
+ vld2.32 {q0}, [r9, :128]!
+ vadd.f32 q1, q0, q15
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vsub.f32 q15, q0, q15
+ vsub.f32 q0, q14, q13
+ vadd.f32 q3, q14, q13
+ vadd.f32 q2, q3, q1
+ vadd.f32 d29, d1, d30 @
+ vsub.f32 d27, d1, d30 @
+ vsub.f32 q3, q3, q1
+ vsub.f32 d28, d0, d31 @
+ vadd.f32 d26, d0, d31 @
+ vtrn.32 q2, q14
+ vtrn.32 q3, q13
+ vswp d5, d6
+ vst1.32 {q2, q3}, [r2, :128]!
+ vtrn.32 q11, q9
+ vtrn.32 q10, q8
+ vmul.f32 d20, d18, d25
+ vmul.f32 d22, d19, d24
+ vmul.f32 d21, d19, d25
+ vmul.f32 d18, d18, d24
+ vmul.f32 d19, d16, d25
+ vmul.f32 d30, d17, d24
+ vmul.f32 d23, d16, d24
+ vmul.f32 d24, d17, d25
+ vadd.f32 d17, d22, d20
+ vsub.f32 d16, d18, d21
+ vsub.f32 d21, d30, d19
+ vadd.f32 d20, d24, d23
+ vadd.f32 q9, q8, q10
+ vsub.f32 q8, q8, q10
+ vadd.f32 q4, q14, q9
+ vsub.f32 q6, q14, q9
+ vadd.f32 d11, d27, d16 @
+ vsub.f32 d15, d27, d16 @
+ vsub.f32 d10, d26, d17 @
+ vadd.f32 d14, d26, d17 @
+ vswp d9, d10
+ vswp d13, d14
+ vstmia lr!, {q4-q7}
+
+
+ add r2, r3, #0
+ add r3, r7, #0
+ add r7, r2, #0
+ add r2, r4, #0
+ add r4, r8, #0
+ add r8, r2, #0
+ add r2, r5, #0
+ add r5, r9, #0
+ add r9, r2, #0
+ add r2, r6, #0
+ add r6, r10, #0
+ add r10, r2, #0
+ add r2, r9, #0
+ add r9, r10, #0
+ add r10, r2, #0
+ ldr r2, [r1, #16]
+ ldr r11, [r1, #32] @ this is p->i1
+ cmp r11, #0
+ beq _neon_ee_o_loop2_exit
+
+ vld1.32 {d16, d17}, [r2, :128]
+_neon_ee_o_loop2:
+ vld2.32 {q15}, [r10, :128]!
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vld2.32 {q9}, [r4, :128]!
+ vld2.32 {q10}, [r3, :128]!
+ vld2.32 {q11}, [r6, :128]!
+ vld2.32 {q12}, [r5, :128]!
+ vsub.f32 q1, q14, q13
+ vld2.32 {q0}, [r9, :128]!
+ subs r11, r11, #1
+ vsub.f32 q2, q0, q15
+ vadd.f32 q0, q0, q15
+ vmul.f32 d10, d2, d17
+ vmul.f32 d11, d3, d16
+ vmul.f32 d12, d3, d17
+ vmul.f32 d6, d4, d17
+ vmul.f32 d7, d5, d16
+ vmul.f32 d8, d4, d16
+ vmul.f32 d9, d5, d17
+ vmul.f32 d13, d2, d16
+ vsub.f32 d7, d7, d6
+ vadd.f32 d11, d11, d10
+ vsub.f32 q1, q12, q11
+ vsub.f32 q2, q10, q9
+ vadd.f32 d6, d9, d8
+ vadd.f32 q4, q14, q13
+ vadd.f32 q11, q12, q11
+ vadd.f32 q12, q10, q9
+ vsub.f32 d10, d13, d12
+ vsub.f32 q7, q4, q0
+ vsub.f32 q9, q12, q11
+ vsub.f32 q13, q5, q3
+ vadd.f32 d29, d5, d2 @
+ vadd.f32 q5, q5, q3
+ vadd.f32 q10, q4, q0
+ vadd.f32 q11, q12, q11
+ vsub.f32 d31, d5, d2 @
+ vsub.f32 d28, d4, d3 @
+ vadd.f32 d30, d4, d3 @
+ vadd.f32 d5, d19, d14 @
+ vadd.f32 d7, d31, d26 @
+ vadd.f32 q1, q14, q5
+ vadd.f32 q0, q11, q10
+ vsub.f32 d6, d30, d27 @
+ vsub.f32 d4, d18, d15 @
+ vsub.f32 d13, d19, d14 @
+ vadd.f32 d12, d18, d15 @
+ vsub.f32 d15, d31, d26 @
+ ldr r2, [r12], #4
+ vtrn.32 q1, q3
+ ldr lr, [r12], #4
+ vtrn.32 q0, q2
+ add r2, r0, r2, lsl #2
+ vsub.f32 q4, q11, q10
+ add lr, r0, lr, lsl #2
+ vsub.f32 q5, q14, q5
+ vadd.f32 d14, d30, d27 @
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_ee_o_loop2
+_neon_ee_o_loop2_exit:
+
+ vldmia sp!, {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_static_x4_i
+_neon_static_x4_i:
+#else
+ .globl neon_static_x4_i
+neon_static_x4_i:
+#endif
+@ add r3, r0, #0
+ push {r4, r5, r6, lr}
+ vstmdb sp!, {d8-d15}
+
+ vld1.32 {q8,q9}, [r0, :128]
+ add r4, r0, r1, lsl #1
+ vld1.32 {q10,q11}, [r4, :128]
+ add r5, r0, r1, lsl #2
+ vld1.32 {q12,q13}, [r5, :128]
+ add r6, r4, r1, lsl #2
+ vld1.32 {q14,q15}, [r6, :128]
+ vld1.32 {q2,q3}, [r2, :128]
+
+ vmul.f32 q0, q13, q3
+ vmul.f32 q5, q12, q2
+ vmul.f32 q1, q14, q2
+ vmul.f32 q4, q14, q3
+ vmul.f32 q14, q12, q3
+ vmul.f32 q13, q13, q2
+ vmul.f32 q12, q15, q3
+ vmul.f32 q2, q15, q2
+ vsub.f32 q0, q5, q0
+ vadd.f32 q13, q13, q14
+ vadd.f32 q12, q12, q1
+ vsub.f32 q1, q2, q4
+ vadd.f32 q15, q0, q12
+ vsub.f32 q12, q0, q12
+ vadd.f32 q14, q13, q1
+ vsub.f32 q13, q13, q1
+ vadd.f32 q0, q8, q15
+ vadd.f32 q1, q9, q14
+ vsub.f32 q2, q10, q13 @
+ vsub.f32 q4, q8, q15
+ vadd.f32 q3, q11, q12 @
+ vst1.32 {q0,q1}, [r0, :128]
+ vsub.f32 q5, q9, q14
+ vadd.f32 q6, q10, q13 @
+ vsub.f32 q7, q11, q12 @
+ vst1.32 {q2,q3}, [r4, :128]
+ vst1.32 {q4,q5}, [r5, :128]
+ vst1.32 {q6,q7}, [r6, :128]
+ vldmia sp!, {d8-d15}
+ pop {r4, r5, r6, pc}
+
+
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_static_x8_i
+_neon_static_x8_i:
+#else
+ .globl neon_static_x8_i
+neon_static_x8_i:
+#endif
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ vstmdb sp!, {d8-d15}
+ mov r11, #0
+ add r3, r0, #0 @ data0
+ add r5, r0, r1, lsl #1 @ data2
+ add r4, r0, r1 @ data1
+ add r7, r5, r1, lsl #1 @ data4
+ add r6, r5, r1 @ data3
+ add r9, r7, r1, lsl #1 @ data6
+ add r8, r7, r1 @ data5
+ add r10, r9, r1 @ data7
+ add r12, r2, #0 @ LUT
+
+ sub r11, r11, r1, lsr #5
+neon_x8_loop:
+ vld1.32 {q2,q3}, [r12, :128]!
+ vld1.32 {q14,q15}, [r6, :128]
+ vld1.32 {q10,q11}, [r5, :128]
+ adds r11, r11, #1
+ vmul.f32 q12, q15, q2
+ vmul.f32 q8, q14, q3
+ vmul.f32 q13, q14, q2
+ vmul.f32 q9, q10, q3
+ vmul.f32 q1, q10, q2
+ vmul.f32 q0, q11, q2
+ vmul.f32 q14, q11, q3
+ vmul.f32 q15, q15, q3
+ vld1.32 {q2,q3}, [r12, :128]!
+ vsub.f32 q10, q12, q8
+ vadd.f32 q11, q0, q9
+ vadd.f32 q8, q15, q13
+ vld1.32 {q12,q13}, [r4, :128]
+ vsub.f32 q9, q1, q14
+ vsub.f32 q15, q11, q10
+ vsub.f32 q14, q9, q8
+ vsub.f32 q4, q12, q15 @
+ vadd.f32 q6, q12, q15 @
+ vadd.f32 q5, q13, q14 @
+ vsub.f32 q7, q13, q14 @
+ vld1.32 {q14,q15}, [r9, :128]
+ vld1.32 {q12,q13}, [r7, :128]
+ vmul.f32 q1, q14, q2
+ vmul.f32 q0, q14, q3
+ vst1.32 {q4,q5}, [r4, :128]
+ vmul.f32 q14, q15, q3
+ vmul.f32 q4, q15, q2
+ vadd.f32 q15, q9, q8
+ vst1.32 {q6,q7}, [r6, :128]
+ vmul.f32 q8, q12, q3
+ vmul.f32 q5, q13, q3
+ vmul.f32 q12, q12, q2
+ vmul.f32 q9, q13, q2
+ vadd.f32 q14, q14, q1
+ vsub.f32 q13, q4, q0
+ vadd.f32 q0, q9, q8
+ vld1.32 {q8,q9}, [r3, :128]
+ vadd.f32 q1, q11, q10
+ vsub.f32 q12, q12, q5
+ vadd.f32 q11, q8, q15
+ vsub.f32 q8, q8, q15
+ vadd.f32 q2, q12, q14
+ vsub.f32 q10, q0, q13
+ vadd.f32 q15, q0, q13
+ vadd.f32 q13, q9, q1
+ vsub.f32 q9, q9, q1
+ vsub.f32 q12, q12, q14
+ vadd.f32 q0, q11, q2
+ vadd.f32 q1, q13, q15
+ vsub.f32 q4, q11, q2
+ vsub.f32 q2, q8, q10 @
+ vadd.f32 q3, q9, q12 @
+ vst1.32 {q0,q1}, [r3, :128]!
+ vsub.f32 q5, q13, q15
+ vld1.32 {q14,q15}, [r10, :128]
+ vsub.f32 q7, q9, q12 @
+ vld1.32 {q12,q13}, [r8, :128]
+ vst1.32 {q2,q3}, [r5, :128]!
+ vld1.32 {q2,q3}, [r12, :128]!
+ vadd.f32 q6, q8, q10 @
+ vmul.f32 q8, q14, q2
+ vst1.32 {q4,q5}, [r7, :128]!
+ vmul.f32 q10, q15, q3
+ vmul.f32 q9, q13, q3
+ vmul.f32 q11, q12, q2
+ vmul.f32 q14, q14, q3
+ vst1.32 {q6,q7}, [r9, :128]!
+ vmul.f32 q15, q15, q2
+ vmul.f32 q12, q12, q3
+ vmul.f32 q13, q13, q2
+ vadd.f32 q10, q10, q8
+ vsub.f32 q11, q11, q9
+ vld1.32 {q8,q9}, [r4, :128]
+ vsub.f32 q14, q15, q14
+ vadd.f32 q15, q13, q12
+ vadd.f32 q13, q11, q10
+ vadd.f32 q12, q15, q14
+ vsub.f32 q15, q15, q14
+ vsub.f32 q14, q11, q10
+ vld1.32 {q10,q11}, [r6, :128]
+ vadd.f32 q0, q8, q13
+ vadd.f32 q1, q9, q12
+ vsub.f32 q2, q10, q15 @
+ vadd.f32 q3, q11, q14 @
+ vsub.f32 q4, q8, q13
+ vst1.32 {q0,q1}, [r4, :128]!
+ vsub.f32 q5, q9, q12
+ vadd.f32 q6, q10, q15 @
+ vst1.32 {q2,q3}, [r6, :128]!
+ vsub.f32 q7, q11, q14 @
+ vst1.32 {q4,q5}, [r8, :128]!
+ vst1.32 {q6,q7}, [r10, :128]!
+ bne neon_x8_loop
+
+ vldmia sp!, {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+ .align 4
+#ifdef __APPLE__
+ .globl _neon_static_x8_t_i
+_neon_static_x8_t_i:
+#else
+ .globl neon_static_x8_t_i
+neon_static_x8_t_i:
+#endif
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ vstmdb sp!, {d8-d15}
+ mov r11, #0
+ add r3, r0, #0 @ data0
+ add r5, r0, r1, lsl #1 @ data2
+ add r4, r0, r1 @ data1
+ add r7, r5, r1, lsl #1 @ data4
+ add r6, r5, r1 @ data3
+ add r9, r7, r1, lsl #1 @ data6
+ add r8, r7, r1 @ data5
+ add r10, r9, r1 @ data7
+ add r12, r2, #0 @ LUT
+
+ sub r11, r11, r1, lsr #5
+neon_x8_t_loop:
+ vld1.32 {q2,q3}, [r12, :128]!
+ vld1.32 {q14,q15}, [r6, :128]
+ vld1.32 {q10,q11}, [r5, :128]
+ adds r11, r11, #1
+ vmul.f32 q12, q15, q2
+ vmul.f32 q8, q14, q3
+ vmul.f32 q13, q14, q2
+ vmul.f32 q9, q10, q3
+ vmul.f32 q1, q10, q2
+ vmul.f32 q0, q11, q2
+ vmul.f32 q14, q11, q3
+ vmul.f32 q15, q15, q3
+ vld1.32 {q2,q3}, [r12, :128]!
+ vsub.f32 q10, q12, q8
+ vadd.f32 q11, q0, q9
+ vadd.f32 q8, q15, q13
+ vld1.32 {q12,q13}, [r4, :128]
+ vsub.f32 q9, q1, q14
+ vsub.f32 q15, q11, q10
+ vsub.f32 q14, q9, q8
+ vsub.f32 q4, q12, q15 @
+ vadd.f32 q6, q12, q15 @
+ vadd.f32 q5, q13, q14 @
+ vsub.f32 q7, q13, q14 @
+ vld1.32 {q14,q15}, [r9, :128]
+ vld1.32 {q12,q13}, [r7, :128]
+ vmul.f32 q1, q14, q2
+ vmul.f32 q0, q14, q3
+ vst1.32 {q4,q5}, [r4, :128]
+ vmul.f32 q14, q15, q3
+ vmul.f32 q4, q15, q2
+ vadd.f32 q15, q9, q8
+ vst1.32 {q6,q7}, [r6, :128]
+ vmul.f32 q8, q12, q3
+ vmul.f32 q5, q13, q3
+ vmul.f32 q12, q12, q2
+ vmul.f32 q9, q13, q2
+ vadd.f32 q14, q14, q1
+ vsub.f32 q13, q4, q0
+ vadd.f32 q0, q9, q8
+ vld1.32 {q8,q9}, [r3, :128]
+ vadd.f32 q1, q11, q10
+ vsub.f32 q12, q12, q5
+ vadd.f32 q11, q8, q15
+ vsub.f32 q8, q8, q15
+ vadd.f32 q2, q12, q14
+ vsub.f32 q10, q0, q13
+ vadd.f32 q15, q0, q13
+ vadd.f32 q13, q9, q1
+ vsub.f32 q9, q9, q1
+ vsub.f32 q12, q12, q14
+ vadd.f32 q0, q11, q2
+ vadd.f32 q1, q13, q15
+ vsub.f32 q4, q11, q2
+ vsub.f32 q2, q8, q10 @
+ vadd.f32 q3, q9, q12 @
+ vst2.32 {q0,q1}, [r3, :128]!
+ vsub.f32 q5, q13, q15
+ vld1.32 {q14,q15}, [r10, :128]
+ vsub.f32 q7, q9, q12 @
+ vld1.32 {q12,q13}, [r8, :128]
+ vst2.32 {q2,q3}, [r5, :128]!
+ vld1.32 {q2,q3}, [r12, :128]!
+ vadd.f32 q6, q8, q10 @
+ vmul.f32 q8, q14, q2
+ vst2.32 {q4,q5}, [r7, :128]!
+ vmul.f32 q10, q15, q3
+ vmul.f32 q9, q13, q3
+ vmul.f32 q11, q12, q2
+ vmul.f32 q14, q14, q3
+ vst2.32 {q6,q7}, [r9, :128]!
+ vmul.f32 q15, q15, q2
+ vmul.f32 q12, q12, q3
+ vmul.f32 q13, q13, q2
+ vadd.f32 q10, q10, q8
+ vsub.f32 q11, q11, q9
+ vld1.32 {q8,q9}, [r4, :128]
+ vsub.f32 q14, q15, q14
+ vadd.f32 q15, q13, q12
+ vadd.f32 q13, q11, q10
+ vadd.f32 q12, q15, q14
+ vsub.f32 q15, q15, q14
+ vsub.f32 q14, q11, q10
+ vld1.32 {q10,q11}, [r6, :128]
+ vadd.f32 q0, q8, q13
+ vadd.f32 q1, q9, q12
+ vsub.f32 q2, q10, q15 @
+ vadd.f32 q3, q11, q14 @
+ vsub.f32 q4, q8, q13
+ vst2.32 {q0,q1}, [r4, :128]!
+ vsub.f32 q5, q9, q12
+ vadd.f32 q6, q10, q15 @
+ vst2.32 {q2,q3}, [r6, :128]!
+ vsub.f32 q7, q11, q14 @
+ vst2.32 {q4,q5}, [r8, :128]!
+ vst2.32 {q6,q7}, [r10, :128]!
+ bne neon_x8_t_loop
+
+ vldmia sp!, {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+
diff --git a/lib/ffts/src/patterns.c b/lib/ffts/src/patterns.c
new file mode 100644
index 0000000..93fe7f7
--- /dev/null
+++ b/lib/ffts/src/patterns.c
@@ -0,0 +1,208 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "patterns.h"
+
+void permute_addr(int N, int offset, int stride, int *d) {
+ int i, a[4] = {0,2,1,3};
+ for(i=0;i<4;i++) {
+ d[i] = offset + (a[i] << stride);
+ if(d[i] < 0) d[i] += N;
+ }
+}
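+/* For illustration: with N = 64, offset = 0 and stride = 2 the {0,2,1,3}
+   pattern gives d = {0, 8, 4, 12}. */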
+
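+/* Recursively walks the split-radix decomposition and, for each leaf whose
+   first (doubled) input index is aligned to the SIMD vector length, appends
+   that leaf's four input indices to *is, doubled so they address interleaved
+   re/im floats. */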
+void ffts_hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) {
+
+ if(N > 4) {
+ ffts_hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL);
+ if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride),offset+(N/2), stride + 2, 0, VL);
+ if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride),offset+(3*N/4), stride + 2, 0, VL);
+ else {
+ int temp = poffset+(1<<stride);
+ if(temp < 0) temp += bigN;
+ temp *= 2;
+
+ if(!(temp % (VL*2))) {
+ (*is)[0] = poffset+(1<<stride);
+ (*is)[1] = poffset+(1<<stride)+(1<<(stride+2));
+ (*is)[2] = poffset-(1<<stride);
+ (*is)[3] = poffset-(1<<stride)+(1<<(stride+2));
+ int i;
+ for(i=0;i<4;i++) if((*is)[i] < 0) (*is)[i] += bigN;
+ for(i=0;i<4;i++) (*is)[i] *= 2;
+ *is += 4;
+ }
+ }
+ }else if(N == 4) {
+ int perm[4];
+ permute_addr(bigN, poffset, stride, perm);
+ if(!((perm[0]*2) % (VL*2))) {
+ int i;
+ for(i=0;i<4;i++) {
+ (*is)[i] = perm[i] * 2;
+ }
+ *is += 4;
+ }
+ }
+}
+
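+/* Builds p->is, the input-index list consumed by the leaf routines. The
+   N/leafN leaves are emitted in three runs: i0 leaves of size leafN, i1
+   positions handled as two half-size leaves, and i2 leaves whose base offsets
+   are negative (wrapped modulo N inside the recursion). */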
+void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL) {
+ int i, i0 = N/leafN/3+1, i1=N/leafN/3, i2 = N/leafN/3;
+ int stride = log(N/leafN)/log(2);
+
+ p->is = malloc(N/VL * sizeof(ptrdiff_t));
+
+ ptrdiff_t *is = p->is;
+
+ if((N/leafN) % 3 > 1) i1++;
+
+ for(i=0;i<i0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
+ for(i=i0;i<i0+i1;i++) {
+ ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL);
+ ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL);
+ }
+ for(i=0-i2;i<0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
+
+
+//for(i=0;i<N/VL;i++) {
+// printf("%td ", p->is[i]);
+// if(i % 16 == 15) printf("\n");
+//}
+
+ p->i0 = i0; p->i1 = i1;
+}
+/**
+ * Recursively walks the split-radix decomposition, recording for each leaf a
+ * pair: the leaf's input offset (doubled, to index interleaved re/im floats)
+ * and its output offset in complex elements. Negative input offsets are
+ * wrapped into [0, N) by the caller.
+ */
+void ffts_elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
+ if((even && N == leafN) || (!even && N <= leafN)) {
+ offsets[2*(ooffset/leafN)] = ioffset*2;
+ offsets[2*(ooffset/leafN)+1] = ooffset;
+ }else if(N > 4) {
+ ffts_elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even);
+ ffts_elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0);
+ if(N/4 >= leafN)
+ ffts_elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0);
+ }
+
+}
+
+int compare_offsets(const void *a, const void *b) {
+ return ((ptrdiff_t *)a)[0] - ((ptrdiff_t *)b)[0];
+}
+
+uint32_t reverse_bits(uint32_t a, int n) {
+ uint32_t x = 0;
+
+ int i;
+ for(i=0;i<n;i++) {
+ if(a & (1 << i)) x |= 1 << (n-i-1);
+ }
+ return x;
+}
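+/* Example: reverse_bits(0x3, 4) mirrors the low four bits and returns 0xC. */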
+
+
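+/* Computes the (input, output) offset pairs for every leaf, sorts them by
+   input offset, and keeps only the output offsets in p->offsets, doubled so
+   they index interleaved floats. */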
+void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) {
+
+ ptrdiff_t *offsets = malloc(2 * N/leafN * sizeof(ptrdiff_t));
+
+ ffts_elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1);
+
+ size_t i;
+ for(i=0;i<2*N/leafN;i+=2) {
+ if(offsets[i] < 0) offsets[i] = N + offsets[i];
+ }
+
+ qsort(offsets, N/leafN, 2 * sizeof(ptrdiff_t), compare_offsets);
+ //elaborate_is(p, N, 0, 0, 1);
+ p->offsets = malloc(N/leafN * sizeof(ptrdiff_t));
+ for(i=0;i<N/leafN;i++) {
+ p->offsets[i] = offsets[i*2+1]*2;
+ }
+//for(i=0;i<N/leafN;i++) {
+// printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N)));
+//}
+ free(offsets);
+}
+
+/*
+int tree_count(int N, int leafN, int offset) {
+
+ if(N <= leafN) return 0;
+ int count = 0;
+ count += tree_count(N/4, leafN, offset);
+ count += tree_count(N/8, leafN, offset + N/4);
+ count += tree_count(N/8, leafN, offset + N/4 + N/8);
+ count += tree_count(N/4, leafN, offset + N/2);
+ count += tree_count(N/4, leafN, offset + 3*N/4);
+
+ return 1 + count;
+}
+
+void elaborate_tree(transform_index_t **p, int N, int leafN, int offset) {
+
+ if(N <= leafN) return;
+ elaborate_tree(p, N/4, leafN, offset);
+ elaborate_tree(p, N/8, leafN, offset + N/4);
+ elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
+ elaborate_tree(p, N/4, leafN, offset + N/2);
+ elaborate_tree(p, N/4, leafN, offset + 3*N/4);
+
+ (*p)[0] = N;
+ (*p)[1] = offset*2;
+
+ (*p)+=2;
+}
+
+void ffts_init_tree(ffts_plan_t *p, int N, int leafN) {
+
+ int count = tree_count(N, leafN, 0) + 1;
+ transform_index_t *ps = p->transforms = malloc(count * 2 * sizeof(transform_index_t));
+
+//printf("count = %d\n", count);
+
+ elaborate_tree(&ps, N, leafN, 0);
+ #ifdef __ARM_NEON__
+ ps -= 2;
+ #endif
+ ps[0] = 0;
+ ps[1] = 0;
+//int i;
+//for(i=0;i<count;i++) {
+// fprintf(stderr, "%lu %lu - %d\n", p->transforms[i*2], p->transforms[i*2+1],
+// __builtin_ctzl(p->transforms[i*2]) - 5);
+//}
+
+}
+*/
diff --git a/lib/ffts/src/patterns.h b/lib/ffts/src/patterns.h
new file mode 100644
index 0000000..6e2d6bb
--- /dev/null
+++ b/lib/ffts/src/patterns.h
@@ -0,0 +1,44 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+#ifndef __PATTERNS_H__
+#define __PATTERNS_H__
+
+#include "ffts.h"
+
+void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL);
+void ffts_init_offsets(ffts_plan_t *p, int N, int leafN);
+//void ffts_init_tree(ffts_plan_t *p, int N, int leafN);
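+/* Typical use (a sketch; the leafN and VL values are illustrative, not taken
+   from this header): during plan construction something like
+       ffts_init_offsets(p, N, 8);
+       ffts_init_is(p, N, 8, 4);
+   fills p->offsets and p->is for the leaf passes. */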
+
+#endif
diff --git a/lib/ffts/src/sse.s b/lib/ffts/src/sse.s
new file mode 100644
index 0000000..79dd6ec
--- /dev/null
+++ b/lib/ffts/src/sse.s
@@ -0,0 +1,878 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+ .globl _neon_x4
+ .align 4
+_neon_x4:
+
+ .globl _neon_x8
+ .align 4
+_neon_x8:
+
+ .globl _neon_x8_t
+ .align 4
+_neon_x8_t:
+
+
+#ifdef __APPLE__
+ .globl _leaf_ee_init
+_leaf_ee_init:
+#else
+ .globl leaf_ee_init
+leaf_ee_init:
+#endif
+ #lea L_sse_constants(%rip), %r9
+ movq 0xe0(%rdi), %r9
+ xorl %eax, %eax
+# eax is loop counter (init to 0)
+# rcx is loop max count
+# rsi is 'in' base pointer
+# rdx is 'out' base pointer
+# r8 is offsets pointer
+# r9 is constants pointer
+# scratch: rax r11 r12
+# .align 4, 0x90
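+# Note: the 0xFECA displacements used throughout the leaf bodies below are
+# placeholders; the sse_leaf_*_offsets tables at the end of this file record
+# the byte position of each displacement so that, presumably, the code
+# generator can patch in the real input offsets when a plan is built.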
+
+# _leaf_ee + 9 needs 16 byte alignment
+#ifdef __APPLE__
+ .globl _leaf_ee
+_leaf_ee:
+#else
+ .globl leaf_ee
+leaf_ee:
+#endif
+ movaps 32(%r9), %xmm0 #83.5
+ movaps (%r9), %xmm8 #83.5
+LEAF_EE_1:
+LEAF_EE_const_0:
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5
+LEAF_EE_const_2:
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5
+ movaps %xmm7, %xmm6 #83.5
+LEAF_EE_const_3:
+ movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
+ movaps %xmm12, %xmm11 #83.5
+ subps %xmm10, %xmm12 #83.5
+ addps %xmm10, %xmm11 #83.5
+ xorps %xmm8, %xmm12 #83.5
+LEAF_EE_const_1:
+ movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5
+LEAF_EE_const_4:
+ movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
+ addps %xmm9, %xmm6 #83.5
+ subps %xmm9, %xmm7 #83.5
+LEAF_EE_const_5:
+ movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5
+ movaps %xmm10, %xmm9 #83.5
+LEAF_EE_const_6:
+ movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5
+ movaps %xmm6, %xmm5 #83.5
+LEAF_EE_const_7:
+ movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5
+ movaps %xmm3, %xmm15 #83.5
+ shufps $177, %xmm12, %xmm12 #83.5
+ movaps %xmm7, %xmm4 #83.5
+ movslq (%r8, %rax, 4), %r11 #83.44
+ subps %xmm13, %xmm10 #83.5
+ subps %xmm14, %xmm3 #83.5
+ addps %xmm11, %xmm5 #83.5
+ subps %xmm11, %xmm6 #83.5
+ subps %xmm12, %xmm4 #83.5
+ addps %xmm12, %xmm7 #83.5
+ addps %xmm13, %xmm9 #83.5
+ addps %xmm14, %xmm15 #83.5
+ movaps 16(%r9), %xmm12 #83.5
+ movaps %xmm9, %xmm1 #83.5
+ movaps 16(%r9), %xmm11 #83.5
+ movaps %xmm5, %xmm2 #83.5
+ mulps %xmm10, %xmm12 #83.5
+ subps %xmm15, %xmm9 #83.5
+ addps %xmm15, %xmm1 #83.5
+ mulps %xmm3, %xmm11 #83.5
+ addps %xmm1, %xmm2 #83.5
+ subps %xmm1, %xmm5 #83.5
+ shufps $177, %xmm10, %xmm10 #83.5
+ xorps %xmm8, %xmm9 #83.5
+ shufps $177, %xmm3, %xmm3 #83.5
+ movaps %xmm6, %xmm1 #83.5
+ mulps %xmm0, %xmm10 #83.5
+ movaps %xmm4, %xmm13 #83.5
+ mulps %xmm0, %xmm3 #83.5
+ subps %xmm10, %xmm12 #83.5
+ addps %xmm3, %xmm11 #83.5
+ movaps %xmm12, %xmm3 #83.5
+ movaps %xmm7, %xmm14 #83.5
+ shufps $177, %xmm9, %xmm9 #83.5
+ subps %xmm11, %xmm12 #83.5
+ addps %xmm11, %xmm3 #83.5
+ subps %xmm9, %xmm1 #83.5
+ addps %xmm9, %xmm6 #83.5
+ addps %xmm3, %xmm4 #83.5
+ subps %xmm3, %xmm13 #83.5
+ xorps %xmm8, %xmm12 #83.5
+ movaps %xmm2, %xmm3 #83.5
+ shufps $177, %xmm12, %xmm12 #83.5
+ movaps %xmm6, %xmm9 #83.5
+ movslq 8(%r8, %rax, 4), %r12 #83.59
+ movlhps %xmm4, %xmm3 #83.5
+ addq $4, %rax
+ shufps $238, %xmm4, %xmm2 #83.5
+ movaps %xmm1, %xmm4 #83.5
+ #movntdq %xmm3, (%rdx,%r11,4) #83.5
+ subps %xmm12, %xmm7 #83.5
+ addps %xmm12, %xmm14 #83.5
+ movlhps %xmm7, %xmm4 #83.5
+ shufps $238, %xmm7, %xmm1 #83.5
+ movaps %xmm5, %xmm7 #83.5
+ movlhps %xmm13, %xmm7 #83.5
+ movlhps %xmm14, %xmm9 #83.5
+ shufps $238, %xmm13, %xmm5 #83.5
+ shufps $238, %xmm14, %xmm6 #83.5
+ movaps %xmm3, (%rdx,%r11,4) #83.5
+ movaps %xmm4, 16(%rdx,%r11,4) #83.5
+ movaps %xmm7, 32(%rdx,%r11,4) #83.5
+ movaps %xmm9, 48(%rdx,%r11,4) #83.5
+ movaps %xmm2, (%rdx,%r12,4) #83.5
+ movaps %xmm1, 16(%rdx,%r12,4) #83.5
+ movaps %xmm5, 32(%rdx,%r12,4) #83.5
+ movaps %xmm6, 48(%rdx,%r12,4) #83.5
+ cmpq %rcx, %rax
+ jne LEAF_EE_1
+
+
+
+# _leaf_oo + 4 needs to be 16 byte aligned
+#ifdef __APPLE__
+ .globl _leaf_oo
+_leaf_oo:
+#else
+ .globl leaf_oo
+leaf_oo:
+#endif
+ movaps (%r9), %xmm5 #92.7
+LEAF_OO_1:
+LEAF_OO_const_0:
+ movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5
+ movaps %xmm4, %xmm6 #93.5
+LEAF_OO_const_1:
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5
+LEAF_OO_const_2:
+ movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5
+ addps %xmm7, %xmm6 #93.5
+ subps %xmm7, %xmm4 #93.5
+LEAF_OO_const_3:
+ movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5
+ movaps %xmm10, %xmm9 #93.5
+LEAF_OO_const_4:
+ movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5
+ movaps %xmm6, %xmm3 #93.5
+LEAF_OO_const_5:
+ movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5
+ movaps %xmm1, %xmm2 #93.5
+LEAF_OO_const_6:
+ movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5
+ movaps %xmm4, %xmm15 #93.5
+LEAF_OO_const_7:
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5
+ movaps %xmm14, %xmm13 #93.5
+ movslq (%r8, %rax, 4), %r11 #83.44
+ subps %xmm8, %xmm10 #93.5
+ addps %xmm8, %xmm9 #93.5
+ addps %xmm11, %xmm2 #93.5
+ subps %xmm12, %xmm14 #93.5
+ subps %xmm11, %xmm1 #93.5
+ addps %xmm12, %xmm13 #93.5
+ addps %xmm9, %xmm3 #93.5
+ subps %xmm9, %xmm6 #93.5
+ xorps %xmm5, %xmm10 #93.5
+ xorps %xmm5, %xmm14 #93.5
+ shufps $177, %xmm10, %xmm10 #93.5
+ movaps %xmm2, %xmm9 #93.5
+ shufps $177, %xmm14, %xmm14 #93.5
+ movaps %xmm6, %xmm7 #93.5
+ movslq 8(%r8, %rax, 4), %r12 #83.59
+ addq $4, %rax #92.18
+ addps %xmm10, %xmm4 #93.5
+ addps %xmm13, %xmm9 #93.5
+ subps %xmm13, %xmm2 #93.5
+ subps %xmm10, %xmm15 #93.5
+ movaps %xmm1, %xmm13 #93.5
+ movaps %xmm2, %xmm8 #93.5
+ movlhps %xmm4, %xmm7 #93.5
+ subps %xmm14, %xmm13 #93.5
+ addps %xmm14, %xmm1 #93.5
+ shufps $238, %xmm4, %xmm6 #93.5
+ movaps %xmm3, %xmm14 #93.5
+ movaps %xmm9, %xmm4 #93.5
+ movlhps %xmm15, %xmm14 #93.5
+ movlhps %xmm13, %xmm4 #93.5
+ movlhps %xmm1, %xmm8 #93.5
+ shufps $238, %xmm15, %xmm3 #93.5
+ shufps $238, %xmm13, %xmm9 #93.5
+ shufps $238, %xmm1, %xmm2 #93.5
+ movaps %xmm14, (%rdx,%r11,4) #93.5
+ movaps %xmm7, 16(%rdx,%r11,4) #93.5
+ movaps %xmm4, 32(%rdx,%r11,4) #93.5
+ movaps %xmm8, 48(%rdx,%r11,4) #93.5
+ movaps %xmm3, (%rdx,%r12,4) #93.5
+ movaps %xmm6, 16(%rdx,%r12,4) #93.5
+ movaps %xmm9, 32(%rdx,%r12,4) #93.5
+ movaps %xmm2, 48(%rdx,%r12,4) #93.5
+ cmpq %rcx, %rax
+ jne LEAF_OO_1 # Prob 95% #92.14
+
+#ifdef __APPLE__
+ .globl _leaf_eo
+_leaf_eo:
+#else
+ .globl leaf_eo
+leaf_eo:
+#endif
+LEAF_EO_const_0:
+ movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5
+LEAF_EO_const_2:
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5
+ movaps %xmm9, %xmm11 #88.5
+LEAF_EO_const_3:
+ movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5
+ movaps %xmm7, %xmm6 #88.5
+LEAF_EO_const_1:
+ movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
+ subps %xmm5, %xmm7 #88.5
+ addps %xmm4, %xmm11 #88.5
+ subps %xmm4, %xmm9 #88.5
+ addps %xmm5, %xmm6 #88.5
+ movaps (%r9), %xmm3 #88.5
+ movaps %xmm11, %xmm10 #88.5
+ xorps %xmm3, %xmm7 #88.5
+ movaps %xmm9, %xmm8 #88.5
+ shufps $177, %xmm7, %xmm7 #88.5
+ addps %xmm6, %xmm10 #88.5
+ subps %xmm6, %xmm11 #88.5
+ subps %xmm7, %xmm8 #88.5
+ addps %xmm7, %xmm9 #88.5
+ movslq 8(%r8, %rax, 4), %r12 #83.59
+ movaps %xmm10, %xmm2 #88.5
+ movslq (%r8, %rax, 4), %r11 #83.44
+ movaps %xmm11, %xmm1 #88.5
+ shufps $238, %xmm8, %xmm10 #88.5
+ shufps $238, %xmm9, %xmm11 #88.5
+ movaps %xmm10, (%rdx,%r12,4) #88.5
+ movaps %xmm11, 16(%rdx,%r12,4) #88.5
+LEAF_EO_const_4:
+ movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5
+LEAF_EO_const_5:
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5
+ movaps %xmm15, %xmm14 #88.5
+LEAF_EO_const_6:
+ movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
+ addps %xmm12, %xmm14 #88.5
+ subps %xmm12, %xmm15 #88.5
+LEAF_EO_const_7:
+ movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5
+ movaps %xmm4, %xmm5 #88.5
+ movaps %xmm14, %xmm7 #88.5
+ addps %xmm13, %xmm5 #88.5
+ subps %xmm13, %xmm4 #88.5
+ movlhps %xmm8, %xmm2 #88.5
+ movaps %xmm5, %xmm8 #88.5
+ movlhps %xmm15, %xmm7 #88.5
+ xorps %xmm3, %xmm15 #88.5
+ movaps %xmm5, %xmm6 #88.5
+ subps %xmm14, %xmm5 #88.5
+ addps %xmm14, %xmm6 #88.5
+ movlhps %xmm9, %xmm1 #88.5
+ movaps %xmm4, %xmm14 #88.5
+ movlhps %xmm4, %xmm8 #88.5
+ movaps %xmm1, %xmm12 #88.5
+ shufps $177, %xmm15, %xmm15 #88.5
+ movaps 0x30(%r9), %xmm11 #88.5
+ addq $4, %rax #90.5
+ subps %xmm15, %xmm14 #88.5
+ mulps %xmm7, %xmm11 #88.5
+ addps %xmm15, %xmm4 #88.5
+ movaps 0x30(%r9), %xmm9 #88.5
+ movaps 0x40(%r9), %xmm15 #88.5
+ shufps $177, %xmm7, %xmm7 #88.5
+ mulps %xmm8, %xmm9 #88.5
+ mulps %xmm15, %xmm7 #88.5
+ shufps $177, %xmm8, %xmm8 #88.5
+ subps %xmm7, %xmm11 #88.5
+ mulps %xmm15, %xmm8 #88.5
+ movaps %xmm11, %xmm10 #88.5
+ addps %xmm8, %xmm9 #88.5
+ shufps $238, %xmm14, %xmm6 #88.5
+ subps %xmm9, %xmm11 #88.5
+ addps %xmm9, %xmm10 #88.5
+ xorps %xmm3, %xmm11 #88.5
+ movaps %xmm2, %xmm3 #88.5
+ shufps $177, %xmm11, %xmm11 #88.5
+ subps %xmm10, %xmm3 #88.5
+ addps %xmm10, %xmm2 #88.5
+ addps %xmm11, %xmm12 #88.5
+ subps %xmm11, %xmm1 #88.5
+ shufps $238, %xmm4, %xmm5 #88.5
+ movaps %xmm5, 48(%rdx,%r12,4) #88.5
+ movaps %xmm6, 32(%rdx,%r12,4) #88.5
+ movaps %xmm2, (%rdx,%r11,4) #88.5
+ movaps %xmm1, 16(%rdx,%r11,4) #88.5
+ movaps %xmm3, 32(%rdx,%r11,4) #88.5
+ movaps %xmm12, 48(%rdx,%r11,4) #88.5
+
+
+#ifdef __APPLE__
+ .globl _leaf_oe
+_leaf_oe:
+#else
+ .globl leaf_oe
+leaf_oe:
+#endif
+ movaps (%r9), %xmm0 #59.5
+ #movaps 0x20(%r9), %xmm1 #59.5
+LEAF_OE_const_2:
+ movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5
+LEAF_OE_const_3:
+ movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5
+ movaps %xmm6, %xmm10 #70.5
+ shufps $228, %xmm8, %xmm10 #70.5
+ movaps %xmm10, %xmm9 #70.5
+ shufps $228, %xmm6, %xmm8 #70.5
+LEAF_OE_const_0:
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5
+LEAF_OE_const_1:
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
+ movaps %xmm12, %xmm14 #70.5
+ movslq (%r8, %rax, 4), %r11 #83.44
+ addps %xmm8, %xmm9 #70.5
+ subps %xmm8, %xmm10 #70.5
+ addps %xmm7, %xmm14 #70.5
+ subps %xmm7, %xmm12 #70.5
+ movaps %xmm9, %xmm4 #70.5
+ movaps %xmm14, %xmm13 #70.5
+ shufps $238, %xmm10, %xmm4 #70.5
+ xorps %xmm0, %xmm10 #70.5
+ shufps $177, %xmm10, %xmm10 #70.5
+ movaps %xmm12, %xmm11 #70.5
+ movaps %xmm14, %xmm5 #70.5
+ addps %xmm9, %xmm13 #70.5
+ subps %xmm10, %xmm11 #70.5
+ subps %xmm9, %xmm14 #70.5
+ shufps $238, %xmm12, %xmm5 #70.5
+ addps %xmm10, %xmm12 #70.5
+ movslq 8(%r8, %rax, 4), %r12 #83.59
+ movlhps %xmm11, %xmm13 #70.5
+ movaps %xmm13, (%rdx,%r11,4) #70.5
+ movaps 0x30(%r9), %xmm13 #70.5
+ movlhps %xmm12, %xmm14 #70.5
+ movaps 0x40(%r9), %xmm12 #70.5
+ mulps %xmm5, %xmm13 #70.5
+ shufps $177, %xmm5, %xmm5 #70.5
+ mulps %xmm12, %xmm5 #70.5
+ movaps %xmm14, 16(%rdx,%r11,4) #70.5
+ subps %xmm5, %xmm13 #70.5
+ movaps 0x30(%r9), %xmm5 #70.5
+ mulps %xmm4, %xmm5 #70.5
+ shufps $177, %xmm4, %xmm4 #70.5
+ mulps %xmm12, %xmm4 #70.5
+LEAF_OE_const_4:
+ movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5
+ addps %xmm4, %xmm5 #70.5
+LEAF_OE_const_6:
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
+ movaps %xmm9, %xmm3 #70.5
+LEAF_OE_const_7:
+ movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5
+ movaps %xmm7, %xmm6 #70.5
+LEAF_OE_const_5:
+ movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5
+ movaps %xmm13, %xmm4 #70.5
+ subps %xmm2, %xmm7 #70.5
+ addps %xmm15, %xmm3 #70.5
+ subps %xmm15, %xmm9 #70.5
+ addps %xmm2, %xmm6 #70.5
+ subps %xmm5, %xmm13 #70.5
+ addps %xmm5, %xmm4 #70.5
+ xorps %xmm0, %xmm7 #70.5
+ addq $4, %rax #72.5
+ movaps %xmm3, %xmm2 #70.5
+ shufps $177, %xmm7, %xmm7 #70.5
+ movaps %xmm9, %xmm8 #70.5
+ xorps %xmm0, %xmm13 #70.5
+ addps %xmm6, %xmm2 #70.5
+ subps %xmm7, %xmm8 #70.5
+ subps %xmm6, %xmm3 #70.5
+ addps %xmm7, %xmm9 #70.5
+ movaps %xmm2, %xmm10 #70.5
+ movaps %xmm3, %xmm11 #70.5
+ shufps $238, %xmm8, %xmm2 #70.5
+ shufps $238, %xmm9, %xmm3 #70.5
+ movaps %xmm2, %xmm14 #70.5
+ shufps $177, %xmm13, %xmm13 #70.5
+ subps %xmm4, %xmm14 #70.5
+ addps %xmm4, %xmm2 #70.5
+ movaps %xmm3, %xmm4 #70.5
+ subps %xmm13, %xmm3 #70.5
+ addps %xmm13, %xmm4 #70.5
+ movlhps %xmm8, %xmm10 #70.5
+ movlhps %xmm9, %xmm11 #70.5
+ movaps %xmm10, 32(%rdx,%r11,4) #70.5
+ movaps %xmm11, 48(%rdx,%r11,4) #70.5
+ movaps %xmm2, (%rdx,%r12,4) #70.5
+ movaps %xmm3, 16(%rdx,%r12,4) #70.5
+ movaps %xmm14, 32(%rdx,%r12,4) #70.5
+ movaps %xmm4, 48(%rdx,%r12,4) #70.5
+
+
+#ifdef __APPLE__
+ .globl _leaf_end
+_leaf_end:
+#else
+ .globl leaf_end
+leaf_end:
+#endif
+
+#ifdef __APPLE__
+ .globl _x_init
+_x_init:
+#else
+ .globl x_init
+x_init:
+#endif
+ #movaps L_sse_constants(%rip), %xmm3 #34.3
+ movaps (%r9), %xmm3 #34.3
+ movq 0x20(%rdi),%r8
+#ifdef __APPLE__
+ .globl _x4
+_x4:
+#else
+ .globl x4
+x4:
+#endif
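+# x4: in-place radix-4 pass over the four 32-byte rows at 0/32/64/96(%rdx).
+# The factors at (%r8) (loaded from the plan in x_init above, presumably the
+# twiddles) multiply the upper two rows; xmm3 is the sign-mask constant used
+# with xorps.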
+ movaps 64(%rdx), %xmm0 #34.3
+ movaps 96(%rdx), %xmm1 #34.3
+ movaps (%rdx), %xmm7 #34.3
+ movaps (%r8), %xmm4 #const
+ movaps %xmm7, %xmm9 #34.3
+ movaps %xmm4, %xmm6 #34.3
+ movaps 16(%r8), %xmm2 #const
+ mulps %xmm0, %xmm6 #34.3
+ mulps %xmm1, %xmm4 #34.3
+ shufps $177, %xmm0, %xmm0 #34.3
+ shufps $177, %xmm1, %xmm1 #34.3
+ mulps %xmm2, %xmm0 #34.3
+ mulps %xmm1, %xmm2 #34.3
+ subps %xmm0, %xmm6 #34.3
+ addps %xmm2, %xmm4 #34.3
+ movaps %xmm6, %xmm5 #34.3
+ subps %xmm4, %xmm6 #34.3
+ addps %xmm4, %xmm5 #34.3
+ movaps 32(%rdx), %xmm8 #34.3
+ xorps %xmm3, %xmm6 #34.3
+ shufps $177, %xmm6, %xmm6 #34.3
+ movaps %xmm8, %xmm10 #34.3
+ movaps 112(%rdx), %xmm12 #34.3
+ subps %xmm5, %xmm9 #34.3
+ addps %xmm5, %xmm7 #34.3
+ addps %xmm6, %xmm10 #34.3
+ subps %xmm6, %xmm8 #34.3
+ movaps %xmm7, (%rdx) #34.3
+ movaps %xmm8, 32(%rdx) #34.3
+ movaps %xmm9, 64(%rdx) #34.3
+ movaps %xmm10, 96(%rdx) #34.3
+ movaps 32(%r8), %xmm14 #const #34.3
+ movaps 80(%rdx), %xmm11 #34.3
+ movaps %xmm14, %xmm0 #34.3
+ movaps 48(%r8), %xmm13 #const #34.3
+ mulps %xmm11, %xmm0 #34.3
+ mulps %xmm12, %xmm14 #34.3
+ shufps $177, %xmm11, %xmm11 #34.3
+ shufps $177, %xmm12, %xmm12 #34.3
+ mulps %xmm13, %xmm11 #34.3
+ mulps %xmm12, %xmm13 #34.3
+ subps %xmm11, %xmm0 #34.3
+ addps %xmm13, %xmm14 #34.3
+ movaps %xmm0, %xmm15 #34.3
+ subps %xmm14, %xmm0 #34.3
+ addps %xmm14, %xmm15 #34.3
+ xorps %xmm3, %xmm0 #34.3
+ movaps 16(%rdx), %xmm1 #34.3
+ movaps 48(%rdx), %xmm2 #34.3
+ movaps %xmm1, %xmm4 #34.3
+ shufps $177, %xmm0, %xmm0 #34.3
+ movaps %xmm2, %xmm5 #34.3
+ addps %xmm15, %xmm1 #34.3
+ subps %xmm0, %xmm2 #34.3
+ subps %xmm15, %xmm4 #34.3
+ addps %xmm0, %xmm5 #34.3
+ movaps %xmm1, 16(%rdx) #34.3
+ movaps %xmm2, 48(%rdx) #34.3
+ movaps %xmm4, 80(%rdx) #34.3
+ movaps %xmm5, 112(%rdx) #34.3
+ ret
+
+# _x8_soft + 5 needs to be 16 byte aligned
+#ifdef __APPLE__
+ .globl _x8_soft
+_x8_soft:
+#else
+ .globl x8_soft
+x8_soft:
+#endif
+ xorl %eax, %eax
+ movq %rdx, %rbx
+ movq %r8, %rsi
+ leaq (%rdx,%rcx,4), %r9
+ leaq (%r9,%rcx,4), %r10
+ leaq (%r10,%rcx,4), %r11
+ leaq (%r11,%rcx,4), %r12
+ leaq (%r12,%rcx,4), %r13
+ leaq (%r13,%rcx,4), %r14
+ leaq (%r14,%rcx,4), %r15
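+# rbx and r9-r15 hold eight row pointers spaced rcx*4 bytes apart starting at
+# rdx; rsi walks the constant stream handed in via r8, advancing 96 bytes per
+# iteration of the loop below.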
+X8_soft_loop:
+ movaps (%rsi), %xmm9
+ movaps (%r10,%rax,4), %xmm6
+ movaps %xmm9, %xmm11
+ movaps (%r11,%rax,4), %xmm7
+ movaps 16(%rsi), %xmm8
+ mulps %xmm6, %xmm11
+ mulps %xmm7, %xmm9
+ shufps $177, %xmm6, %xmm6
+ mulps %xmm8, %xmm6
+ shufps $177, %xmm7, %xmm7
+ subps %xmm6, %xmm11
+ mulps %xmm7, %xmm8
+ movaps %xmm11, %xmm10
+ addps %xmm8, %xmm9
+ movaps 32(%rsi), %xmm15
+ addps %xmm9, %xmm10
+ subps %xmm9, %xmm11
+ movaps (%rbx,%rax,4), %xmm5
+ movaps %xmm15, %xmm6
+ movaps (%r12,%rax,4), %xmm12
+ movaps %xmm5, %xmm2
+ movaps (%r14,%rax,4), %xmm13
+ xorps %xmm3, %xmm11 #const
+ movaps 48(%rsi), %xmm14
+ subps %xmm10, %xmm2
+ mulps %xmm12, %xmm6
+ addps %xmm10, %xmm5
+ mulps %xmm13, %xmm15
+ movaps 64(%rsi), %xmm10
+ movaps %xmm5, %xmm0
+ shufps $177, %xmm12, %xmm12
+ shufps $177, %xmm13, %xmm13
+ mulps %xmm14, %xmm12
+ mulps %xmm13, %xmm14
+ subps %xmm12, %xmm6
+ addps %xmm14, %xmm15
+ movaps (%r13,%rax,4), %xmm7
+ movaps %xmm10, %xmm13
+ movaps (%r15,%rax,4), %xmm8
+ movaps %xmm6, %xmm12
+ movaps 80(%rsi), %xmm9
+ addq $96, %rsi
+ mulps %xmm7, %xmm13
+ subps %xmm15, %xmm6
+ addps %xmm15, %xmm12
+ mulps %xmm8, %xmm10
+ subps %xmm12, %xmm0
+ addps %xmm12, %xmm5
+ shufps $177, %xmm7, %xmm7
+ xorps %xmm3, %xmm6 #const
+ shufps $177, %xmm8, %xmm8
+ movaps %xmm2, %xmm12
+ mulps %xmm9, %xmm7
+ mulps %xmm8, %xmm9
+ subps %xmm7, %xmm13
+ addps %xmm9, %xmm10
+ movaps (%r9,%rax,4), %xmm4
+ shufps $177, %xmm11, %xmm11
+ movaps %xmm4, %xmm1
+ shufps $177, %xmm6, %xmm6
+ addps %xmm11, %xmm1
+ subps %xmm11, %xmm4
+ addps %xmm6, %xmm12
+ subps %xmm6, %xmm2
+ movaps %xmm13, %xmm11
+ movaps %xmm4, %xmm14
+ movaps %xmm1, %xmm6
+ subps %xmm10, %xmm13
+ addps %xmm10, %xmm11
+ xorps %xmm3, %xmm13 #const
+ addps %xmm11, %xmm4
+ subps %xmm11, %xmm14
+ shufps $177, %xmm13, %xmm13
+ movaps %xmm5, (%rbx,%rax,4)
+ movaps %xmm4, (%r9,%rax,4)
+ movaps %xmm2, (%r10,%rax,4)
+ subps %xmm13, %xmm1
+ addps %xmm13, %xmm6
+ movaps %xmm1, (%r11,%rax,4)
+ movaps %xmm0, (%r12,%rax,4)
+ movaps %xmm14, (%r13,%rax,4)
+ movaps %xmm12, (%r14,%rax,4)
+ movaps %xmm6, (%r15,%rax,4)
+ addq $4, %rax
+ cmpq %rcx, %rax
+ jne X8_soft_loop
+ ret
+
+#ifdef __APPLE__
+ .globl _x8_hard
+_x8_hard:
+#else
+ .globl x8_hard
+x8_hard:
+#endif
+ movaps (%r9), %xmm5
+X8_loop:
+ movaps (%r8), %xmm9
+X8_const_2:
+ movaps 0xFECA(%rdx,%rax,4), %xmm6
+ movaps %xmm9, %xmm11
+X8_const_3:
+ movaps 0xFECA(%rdx,%rax,4), %xmm7
+ movaps 16(%r8), %xmm8
+ mulps %xmm6, %xmm11
+ mulps %xmm7, %xmm9
+ shufps $177, %xmm6, %xmm6
+ mulps %xmm8, %xmm6
+ shufps $177, %xmm7, %xmm7
+ subps %xmm6, %xmm11
+ mulps %xmm7, %xmm8
+ movaps %xmm11, %xmm10
+ addps %xmm8, %xmm9
+ movaps 32(%r8), %xmm15
+ addps %xmm9, %xmm10
+ subps %xmm9, %xmm11
+X8_const_0:
+ movaps 0xFECA(%rdx,%rax,4), %xmm3
+ movaps %xmm15, %xmm6
+X8_const_4:
+ movaps 0xFECA(%rdx,%rax,4), %xmm12
+ movaps %xmm3, %xmm2
+X8_const_6:
+ movaps 0xFECA(%rdx,%rax,4), %xmm13
+ xorps %xmm5, %xmm11
+ movaps 48(%r8), %xmm14
+ subps %xmm10, %xmm2
+ mulps %xmm12, %xmm6
+ addps %xmm10, %xmm3
+ mulps %xmm13, %xmm15
+ movaps 64(%r8), %xmm10
+ movaps %xmm3, %xmm0
+ shufps $177, %xmm12, %xmm12
+ shufps $177, %xmm13, %xmm13
+ mulps %xmm14, %xmm12
+ mulps %xmm13, %xmm14
+ subps %xmm12, %xmm6
+ addps %xmm14, %xmm15
+X8_const_5:
+ movaps 0xFECA(%rdx,%rax,4), %xmm7
+ movaps %xmm10, %xmm13
+X8_const_7:
+ movaps 0xFECA(%rdx,%rax,4), %xmm8
+ movaps %xmm6, %xmm12
+ movaps 80(%r8), %xmm9
+ addq $96, %r8
+ mulps %xmm7, %xmm13
+ subps %xmm15, %xmm6
+ addps %xmm15, %xmm12
+ mulps %xmm8, %xmm10
+ subps %xmm12, %xmm0
+ addps %xmm12, %xmm3
+ shufps $177, %xmm7, %xmm7
+ xorps %xmm5, %xmm6
+ shufps $177, %xmm8, %xmm8
+ movaps %xmm2, %xmm12
+ mulps %xmm9, %xmm7
+ mulps %xmm8, %xmm9
+ subps %xmm7, %xmm13
+ addps %xmm9, %xmm10
+X8_const_1:
+ movaps 0xFECA(%rdx,%rax,4), %xmm4
+ shufps $177, %xmm11, %xmm11
+ movaps %xmm4, %xmm1
+ shufps $177, %xmm6, %xmm6
+ addps %xmm11, %xmm1
+ subps %xmm11, %xmm4
+ addps %xmm6, %xmm12
+ subps %xmm6, %xmm2
+ movaps %xmm13, %xmm11
+ movaps %xmm4, %xmm14
+ movaps %xmm1, %xmm6
+ subps %xmm10, %xmm13
+ addps %xmm10, %xmm11
+ xorps %xmm5, %xmm13
+ addps %xmm11, %xmm4
+ subps %xmm11, %xmm14
+ shufps $177, %xmm13, %xmm13
+X8_const1_0:
+ movaps %xmm3, 0xFECA(%rdx,%rax,4)
+X8_const1_1:
+ movaps %xmm4, 0xFECA(%rdx,%rax,4)
+X8_const1_2:
+ movaps %xmm2, 0xFECA(%rdx,%rax,4)
+ subps %xmm13, %xmm1
+ addps %xmm13, %xmm6
+X8_const1_3:
+ movaps %xmm1, 0xFECA(%rdx,%rax,4)
+X8_const1_4:
+ movaps %xmm0, 0xFECA(%rdx,%rax,4)
+X8_const1_5:
+ movaps %xmm14, 0xFECA(%rdx,%rax,4)
+X8_const1_6:
+ movaps %xmm12, 0xFECA(%rdx,%rax,4)
+X8_const1_7:
+ movaps %xmm6, 0xFECA(%rdx,%rax,4)
+ addq $4, %rax
+ cmpq %rcx, %rax
+ jne X8_loop
+
+#ifdef __APPLE__
+ .globl _sse_leaf_ee_offsets
+ .globl _sse_leaf_oo_offsets
+ .globl _sse_leaf_eo_offsets
+ .globl _sse_leaf_oe_offsets
+ .align 4
+_sse_leaf_ee_offsets:
+ .long LEAF_EE_const_0-_leaf_ee+0x4
+ .long LEAF_EE_const_1-_leaf_ee+0x5
+ .long LEAF_EE_const_2-_leaf_ee+0x5
+ .long LEAF_EE_const_3-_leaf_ee+0x5
+ .long LEAF_EE_const_4-_leaf_ee+0x5
+ .long LEAF_EE_const_5-_leaf_ee+0x5
+ .long LEAF_EE_const_6-_leaf_ee+0x4
+ .long LEAF_EE_const_7-_leaf_ee+0x5
+_sse_leaf_oo_offsets:
+ .long LEAF_OO_const_0-_leaf_oo+0x4
+ .long LEAF_OO_const_1-_leaf_oo+0x4
+ .long LEAF_OO_const_2-_leaf_oo+0x5
+ .long LEAF_OO_const_3-_leaf_oo+0x5
+ .long LEAF_OO_const_4-_leaf_oo+0x4
+ .long LEAF_OO_const_5-_leaf_oo+0x5
+ .long LEAF_OO_const_6-_leaf_oo+0x5
+ .long LEAF_OO_const_7-_leaf_oo+0x5
+_sse_leaf_eo_offsets:
+ .long LEAF_EO_const_0-_leaf_eo+0x5
+ .long LEAF_EO_const_1-_leaf_eo+0x4
+ .long LEAF_EO_const_2-_leaf_eo+0x4
+ .long LEAF_EO_const_3-_leaf_eo+0x4
+ .long LEAF_EO_const_4-_leaf_eo+0x5
+ .long LEAF_EO_const_5-_leaf_eo+0x5
+ .long LEAF_EO_const_6-_leaf_eo+0x4
+ .long LEAF_EO_const_7-_leaf_eo+0x5
+_sse_leaf_oe_offsets:
+ .long LEAF_OE_const_0-_leaf_oe+0x5
+ .long LEAF_OE_const_1-_leaf_oe+0x4
+ .long LEAF_OE_const_2-_leaf_oe+0x4
+ .long LEAF_OE_const_3-_leaf_oe+0x5
+ .long LEAF_OE_const_4-_leaf_oe+0x5
+ .long LEAF_OE_const_5-_leaf_oe+0x5
+ .long LEAF_OE_const_6-_leaf_oe+0x4
+ .long LEAF_OE_const_7-_leaf_oe+0x4
+#else
+ .globl sse_leaf_ee_offsets
+ .globl sse_leaf_oo_offsets
+ .globl sse_leaf_eo_offsets
+ .globl sse_leaf_oe_offsets
+ .align 4
+sse_leaf_ee_offsets:
+ .long LEAF_EE_const_0-leaf_ee+0x4
+ .long LEAF_EE_const_1-leaf_ee+0x5
+ .long LEAF_EE_const_2-leaf_ee+0x5
+ .long LEAF_EE_const_3-leaf_ee+0x5
+ .long LEAF_EE_const_4-leaf_ee+0x5
+ .long LEAF_EE_const_5-leaf_ee+0x5
+ .long LEAF_EE_const_6-leaf_ee+0x4
+ .long LEAF_EE_const_7-leaf_ee+0x5
+sse_leaf_oo_offsets:
+ .long LEAF_OO_const_0-leaf_oo+0x4
+ .long LEAF_OO_const_1-leaf_oo+0x4
+ .long LEAF_OO_const_2-leaf_oo+0x5
+ .long LEAF_OO_const_3-leaf_oo+0x5
+ .long LEAF_OO_const_4-leaf_oo+0x4
+ .long LEAF_OO_const_5-leaf_oo+0x5
+ .long LEAF_OO_const_6-leaf_oo+0x5
+ .long LEAF_OO_const_7-leaf_oo+0x5
+sse_leaf_eo_offsets:
+ .long LEAF_EO_const_0-leaf_eo+0x5
+ .long LEAF_EO_const_1-leaf_eo+0x4
+ .long LEAF_EO_const_2-leaf_eo+0x4
+ .long LEAF_EO_const_3-leaf_eo+0x4
+ .long LEAF_EO_const_4-leaf_eo+0x5
+ .long LEAF_EO_const_5-leaf_eo+0x5
+ .long LEAF_EO_const_6-leaf_eo+0x4
+ .long LEAF_EO_const_7-leaf_eo+0x5
+sse_leaf_oe_offsets:
+ .long LEAF_OE_const_0-leaf_oe+0x5
+ .long LEAF_OE_const_1-leaf_oe+0x4
+ .long LEAF_OE_const_2-leaf_oe+0x4
+ .long LEAF_OE_const_3-leaf_oe+0x5
+ .long LEAF_OE_const_4-leaf_oe+0x5
+ .long LEAF_OE_const_5-leaf_oe+0x5
+ .long LEAF_OE_const_6-leaf_oe+0x4
+ .long LEAF_OE_const_7-leaf_oe+0x4
+#endif
+
+#ifdef __APPLE__
+ .data
+#else
+ .section .data
+#endif
+ .p2align 4
+#ifdef __APPLE__
+ .globl _sse_constants
+_sse_constants:
+#else
+ .globl sse_constants
+sse_constants:
+#endif
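+# 0x80000000 is the float sign bit (applied with xorps to negate lanes),
+# 0x3f3504f3 = +0.70710678f (1/sqrt(2)), 0xbf3504f3 = -0.70710678f and
+# 0x3f800000 = 1.0f.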
+ .long 0x00000000,0x80000000,0x00000000,0x80000000
+ .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
+ .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
+ .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
+ .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
+#ifdef __APPLE__
+ .globl _sse_constants_inv
+_sse_constants_inv:
+#else
+ .globl sse_constants_inv
+sse_constants_inv:
+#endif
+ .long 0x80000000,0x00000000,0x80000000,0x00000000
+ .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
+ .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
+ .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
+ .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3
diff --git a/lib/ffts/src/types.h b/lib/ffts/src/types.h
new file mode 100644
index 0000000..04cbf61
--- /dev/null
+++ b/lib/ffts/src/types.h
@@ -0,0 +1,49 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+#ifndef __TYPES_H__
+#define __TYPES_H__
+
+#define __INLINE static inline __attribute__((always_inline))
+
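+/* cdata_t is one complex sample: the native complex float when a `complex`
+   macro is already visible (e.g. from <complex.h>), otherwise a two-float
+   {re, im} array. data_t is the underlying scalar type. */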
+#if defined(complex)
+ typedef complex float cdata_t;
+#else
+ typedef float cdata_t[2];
+#endif
+ typedef float data_t;
+
+#endif
+
+
diff --git a/lib/ffts/src/vfp.h b/lib/ffts/src/vfp.h
new file mode 100644
index 0000000..f733a3f
--- /dev/null
+++ b/lib/ffts/src/vfp.h
@@ -0,0 +1,45 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, 2013 The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __VFP_H__
+#define __VFP_H__
+
+#include "ffts.h"
+
+void vfp_e();
+void vfp_o();
+void vfp_x4();
+void vfp_x8();
+void vfp_end();
+
+#endif
diff --git a/lib/ffts/src/vfp.s b/lib/ffts/src/vfp.s
new file mode 100644
index 0000000..8ced89d
--- /dev/null
+++ b/lib/ffts/src/vfp.s
@@ -0,0 +1,473 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, 2013 The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+@ assumes r0 = out
+@ r1 = in ?
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = loop iterations
+@ r2 = const pointer
+@ & lr = temps
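+@ vfp_e: each iteration gathers one complex value from each of the eight input
+@ pointers r3-r10, combines them, and scatters the sixteen resulting floats to
+@ the output block whose float offset is read from the table at r12.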
+
+ .align 4
+#ifdef __APPLE__
+ .globl _vfp_e
+_vfp_e:
+#else
+ .globl vfp_e
+vfp_e:
+#endif
+_vfp_e_loop:
+ vldr s15, [r2, #8]
+ vldr s2, [r3] @ x0
+ vldr s0, [r3, #4]
+ vldr s4, [r4] @ x1
+ vldr s11, [r2]
+ vldr s10, [r7] @ x4
+ vldr s3, [r7, #4]
+ vldr s8, [r8] @ x5
+ vldr s1, [r8, #4]
+ vldr s14, [r9] @ x6
+ vldr s9, [r9, #4]
+ vldr s6, [r10] @ x7
+ vldr s12, [r10, #4]
+ vsub.f32 s18, s3, s1
+ vsub.f32 s7, s10, s8
+ vsub.f32 s5, s14, s6
+ vadd.f32 s6, s14, s6
+ vldr s24, [r5, #4]
+ vsub.f32 s14, s9, s12
+ vldr s22, [r6, #4]
+ vadd.f32 s8, s10, s8
+ vldr s28, [r6] @ x3
+ vldr s17, [r5] @ x2
+ vadd.f32 s10, s9, s12
+ vmul.f32 s13, s18, s15
+ vmul.f32 s9, s7, s11
+ vmul.f32 s16, s5, s11
+ vmul.f32 s18, s18, s11
+ vmul.f32 s30, s14, s11
+ vldr s11, [r4, #4]
+ add r3, r3, #8
+ add r4, r4, #8
+ add r5, r5, #8
+ add r6, r6, #8
+ add r7, r7, #8
+ add r8, r8, #8
+ add r9, r9, #8
+ add r10, r10, #8
+ vmul.f32 s12, s5, s15
+ vmul.f32 s20, s14, s15
+ vadd.f32 s5, s2, s4
+ vadd.f32 s3, s3, s1
+ vmul.f32 s15, s7, s15
+ vadd.f32 s1, s24, s22
+ vsub.f32 s7, s24, s22
+ vadd.f32 s24, s17, s28
+ vadd.f32 s26, s0, s11
+ vsub.f32 s14, s9, s13
+ vsub.f32 s2, s2, s4
+ vadd.f32 s4, s16, s20
+ vsub.f32 s22, s0, s11
+ vsub.f32 s16, s17, s28
+ vadd.f32 s9, s5, s24
+ vadd.f32 s28, s18, s15
+ vadd.f32 s13, s8, s6
+ vsub.f32 s5, s5, s24
+ vsub.f32 s24, s8, s6
+ vadd.f32 s11, s26, s1
+ vsub.f32 s12, s30, s12
+ vadd.f32 s20, s3, s10
+ vsub.f32 s15, s3, s10
+ vsub.f32 s3, s26, s1
+ vadd.f32 s18, s9, s13
+ vadd.f32 s10, s14, s4
+ vadd.f32 s6, s2, s7 @
+ vsub.f32 s0, s2, s7 @
+ vadd.f32 s26, s11, s20
+ vsub.f32 s4, s14, s4
+ vsub.f32 s8, s22, s16 @
+ vadd.f32 s1, s28, s12
+	ldr lr, [r12], #4
+	add lr, r0, lr, lsl #2
+	subs r11, r11, #1
+ vstr s18, [lr]
+ vsub.f32 s2, s28, s12
+ vadd.f32 s12, s22, s16 @
+ vsub.f32 s16, s3, s24 @
+ vsub.f32 s13, s9, s13
+ vstr s26, [lr, #4]
+ vadd.f32 s28, s5, s15 @
+ vsub.f32 s7, s5, s15 @
+ vadd.f32 s14, s6, s10
+ vadd.f32 s5, s8, s1
+ vadd.f32 s9, s0, s2 @
+ vsub.f32 s2, s0, s2 @
+ vsub.f32 s11, s11, s20
+ vstr s28, [lr, #16]
+ vadd.f32 s3, s3, s24 @
+ vstr s16, [lr, #20]
+ vsub.f32 s6, s6, s10
+ vstr s13, [lr, #32]
+ vsub.f32 s13, s12, s4 @
+ vsub.f32 s8, s8, s1
+ vadd.f32 s0, s12, s4 @
+ vstr s11, [lr, #36]
+ vstr s7, [lr, #48]
+ vstr s3, [lr, #52]
+ vstr s14, [lr, #8]
+ vstr s5, [lr, #12]
+ vstr s9, [lr, #24]
+ vstr s13, [lr, #28]
+ vstr s6, [lr, #40]
+ vstr s8, [lr, #44]
+ vstr s2, [lr, #56]
+ vstr s0, [lr, #60]
+ bne _vfp_e_loop
+
+@ assumes r0 = out
+@ r1 = in ?
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = loop iterations
+@ r2 & lr = temps
+ .align 4
+#ifdef __APPLE__
+ .globl _vfp_o
+_vfp_o:
+#else
+ .globl vfp_o
+vfp_o:
+#endif
+_vfp_o_loop:
+ vldr s4, [r3] @ x0
+ vldr s0, [r3, #4]
+ vldr s6, [r4] @ x1
+ vldr s5, [r4, #4]
+ vldr s7, [r5] @ x2
+ vldr s1, [r5, #4]
+ vldr s3, [r6] @ x3
+ vldr s8, [r6, #4]
+ subs r11, r11, #1
+ ldr r2, [r12], #4
+ add r2, r0, r2, lsl #2
+ vadd.f32 s2, s4, s6
+ vadd.f32 s14, s0, s5
+ vadd.f32 s10, s1, s8
+ vsub.f32 s4, s4, s6
+ vsub.f32 s0, s0, s5
+ vadd.f32 s12, s7, s3
+ vsub.f32 s6, s7, s3
+ vsub.f32 s8, s1, s8
+ vadd.f32 s5, s14, s10
+ vsub.f32 s10, s14, s10
+ vadd.f32 s7, s2, s12
+ vsub.f32 s1, s0, s6 @
+ vsub.f32 s12, s2, s12
+ vadd.f32 s3, s4, s8 @
+ vsub.f32 s2, s4, s8 @
+ vadd.f32 s0, s0, s6 @
+ vstr s7, [r2]
+ vldr s7, [r9] @ x2
+ vstr s5, [r2, #4]
+ vstr s3, [r2, #8]
+ vstr s1, [r2, #12]
+ vstr s12, [r2, #16]
+ vstr s10, [r2, #20]
+ vstr s2, [r2, #24]
+ vstr s0, [r2, #28]
+ vldr s4, [r7] @ x0
+ vldr s0, [r7, #4]
+ vldr s6, [r8] @ x1
+ vldr s5, [r8, #4]
+ vldr s3, [r10] @ x3
+ vldr s8, [r10, #4]
+ vldr s1, [r9, #4]
+ add r3, r3, #8
+ add r4, r4, #8
+ add r5, r5, #8
+ add r6, r6, #8
+ add r7, r7, #8
+ add r8, r8, #8
+ add r9, r9, #8
+ add r10, r10, #8
+ vadd.f32 s2, s4, s6
+ vadd.f32 s14, s0, s5
+ vadd.f32 s10, s1, s8
+ vsub.f32 s4, s4, s6
+ vsub.f32 s0, s0, s5
+ vadd.f32 s12, s7, s3
+ vsub.f32 s6, s7, s3
+ vsub.f32 s8, s1, s8
+ vadd.f32 s5, s14, s10
+ vsub.f32 s10, s14, s10
+ vadd.f32 s7, s2, s12
+ vsub.f32 s1, s0, s6 @
+ vsub.f32 s12, s2, s12
+ vadd.f32 s3, s4, s8 @
+ vsub.f32 s2, s4, s8 @
+ vadd.f32 s0, s0, s6 @
+ vstr s7, [r2, #32]
+ vstr s5, [r2, #36]
+ vstr s3, [r2, #40]
+ vstr s1, [r2, #44]
+ vstr s12, [r2, #48]
+ vstr s10, [r2, #52]
+ vstr s2, [r2, #56]
+ vstr s0, [r2, #60]
+ bne _vfp_o_loop
+
+ .align 4
+#ifdef __APPLE__
+ .globl _vfp_x4
+_vfp_x4:
+#else
+ .globl vfp_x4
+vfp_x4:
+#endif
+ add r3, r0, #0
+ add r7, r2, #0
+ add r4, r0, r1, lsl #1
+ add r5, r0, r1, lsl #2
+ add r6, r4, r1, lsl #2
+ mov r11, #4
+_vfp_x4_loop:
+
+ vldr s8, [r3, #0]
+ vldr s9, [r3, #4]
+ vldr s10, [r4, #0]
+ vldr s11, [r4, #4]
+ vldr s12, [r5, #0]
+ vldr s13, [r5, #4]
+ vldr s14, [r6, #0]
+ vldr s15, [r6, #4]
+ vldr s2, [r7, #0]
+ vldr s3, [r7, #4]
+ add r7, r7, #8
+ subs r11, r11, #1
+ vmul.f32 s0, s13, s3
+ vmul.f32 s5, s12, s2
+ vmul.f32 s1, s14, s2
+ vmul.f32 s4, s14, s3
+ vmul.f32 s14, s12, s3
+ vmul.f32 s13, s13, s2
+ vmul.f32 s12, s15, s3
+ vmul.f32 s2, s15, s2
+ vsub.f32 s0, s5, s0
+ vadd.f32 s13, s13, s14
+ vadd.f32 s12, s12, s1
+ vsub.f32 s1, s2, s4
+ vadd.f32 s15, s0, s12
+ vsub.f32 s12, s0, s12
+ vadd.f32 s14, s13, s1
+ vsub.f32 s13, s13, s1
+ vadd.f32 s0, s8, s15
+ vadd.f32 s1, s9, s14
+ vadd.f32 s2, s10, s13 @
+ vsub.f32 s4, s8, s15
+ vsub.f32 s3, s11, s12 @
+ vstr s0, [r3, #0]
+ vstr s1, [r3, #4]
+ add r3, r3, #8
+ vsub.f32 s5, s9, s14
+ vsub.f32 s6, s10, s13 @
+ vadd.f32 s7, s11, s12 @
+ vstr s2, [r4, #0]
+ vstr s3, [r4, #4]
+ add r4, r4, #8
+ vstr s4, [r5, #0]
+ vstr s5, [r5, #4]
+ add r5, r5, #8
+ vstr s6, [r6, #0]
+ vstr s7, [r6, #4]
+ add r6, r6, #8
+ bne _vfp_x4_loop
+ bx lr
+
+ .align 4
+#ifdef __APPLE__
+ .globl _vfp_x8
+_vfp_x8:
+#else
+ .globl vfp_x8
+vfp_x8:
+#endif
+ mov r11, #0
+ add r3, r0, #0 @ data0
+ add r5, r0, r1, lsl #1 @ data2
+ add r4, r0, r1 @ data1
+ add r7, r5, r1, lsl #1 @ data4
+ add r6, r5, r1 @ data3
+ add r9, r7, r1, lsl #1 @ data6
+ add r8, r7, r1 @ data5
+ add r10, r9, r1 @ data7
+ add r12, r2, #0 @ LUT
+
+ sub r11, r11, r1, lsr #3
+_vfp_x8_loop:
+ vldr s10, [r3, #0] @ x0-re
+ vldr s8, [r3, #4] @ x0-im
+ vldr s2, [r4, #0] @ x1-re
+ vldr s0, [r4, #4] @ x1-im
+ vldr s6, [r5, #0] @ x2-re
+ vldr s4, [r5, #4] @ x2-im
+ vldr s13, [r6, #0] @ x3-re
+ vldr s15, [r6, #4] @ x3-im
+ vldr s7, [r12]
+ vldr s11, [r12, #4]
+ vldr s5, [r7, #0] @ x4-re
+ vldr s1, [r7, #4] @ x4-im
+ vldr s28, [r9, #0] @ x6-re
+ vldr s18, [r9, #4] @ x6-im
+ adds r11, r11, #1
+ vmul.f32 s14, s15, s7
+ vldr s24, [r12, #12]
+ vmul.f32 s12, s13, s11
+ vmul.f32 s26, s13, s7
+ vldr s13, [r12, #8]
+ vmul.f32 s3, s4, s11
+ vmul.f32 s15, s15, s11
+ vmul.f32 s16, s4, s7
+ vmul.f32 s9, s6, s7
+ vmul.f32 s11, s6, s11
+ vmul.f32 s7, s18, s24
+ vmul.f32 s20, s1, s24
+ vmul.f32 s30, s5, s13
+ vadd.f32 s4, s26, s15
+ vsub.f32 s12, s14, s12
+ vsub.f32 s6, s9, s3
+ vadd.f32 s14, s16, s11
+ vmul.f32 s22, s28, s13
+ vmul.f32 s26, s28, s24
+ vmul.f32 s18, s18, s13
+ vmul.f32 s5, s5, s24
+ vmul.f32 s1, s1, s13
+ vsub.f32 s9, s30, s20
+ vadd.f32 s16, s14, s12
+ vadd.f32 s3, s22, s7
+ vadd.f32 s15, s6, s4
+ vsub.f32 s11, s18, s26
+ vadd.f32 s18, s1, s5
+ vadd.f32 s13, s8, s16
+ vadd.f32 s1, s9, s3
+ vadd.f32 s7, s10, s15
+ vsub.f32 s15, s10, s15
+ vsub.f32 s10, s9, s3
+ vadd.f32 s5, s18, s11
+ vsub.f32 s11, s18, s11
+ vsub.f32 s8, s8, s16
+ vadd.f32 s20, s7, s1
+ vsub.f32 s7, s7, s1
+ vadd.f32 s18, s13, s5
+ vadd.f32 s16, s15, s11 @
+ vsub.f32 s9, s8, s10 @
+ vsub.f32 s3, s13, s5
+ vsub.f32 s1, s15, s11 @
+ vstr s20, [r3]
+ vadd.f32 s8, s8, s10 @
+ vstr s18, [r3, #4]
+ add r3, r3, #8
+ vstr s16, [r5]
+ vstr s9, [r5, #4]
+ add r5, r5, #8
+ vstr s7, [r7]
+ vstr s3, [r7, #4]
+ add r7, r7, #8
+ vstr s1, [r9]
+ vstr s8, [r9, #4]
+ add r9, r9, #8
+ vldr s10, [r8, #0] @ x5-re
+ vldr s8, [r8, #4] @ x5-im
+ vldr s5, [r10, #0] @ x7-re
+ vldr s11, [r10, #4] @ x7-im
+ vldr s1, [r12, #16]
+ vldr s15, [r12, #20]
+ add r12, r12, #24
+ vmul.f32 s9, s5, s1
+ vmul.f32 s3, s11, s15
+ vmul.f32 s13, s10, s1
+ vmul.f32 s7, s8, s15
+ vmul.f32 s5, s5, s15
+ vmul.f32 s11, s11, s1
+ vmul.f32 s10, s10, s15
+ vmul.f32 s15, s8, s1
+ vsub.f32 s1, s14, s12
+ vadd.f32 s8, s9, s3
+ vsub.f32 s3, s6, s4
+ vsub.f32 s12, s13, s7
+ vsub.f32 s5, s11, s5
+ vadd.f32 s7, s15, s10
+ vadd.f32 s4, s2, s1 @
+ vsub.f32 s2, s2, s1 @
+ vsub.f32 s6, s0, s3 @
+ vadd.f32 s10, s12, s8
+ vsub.f32 s9, s12, s8
+ vadd.f32 s0, s0, s3 @
+ vsub.f32 s1, s7, s5
+ vadd.f32 s14, s7, s5
+ vadd.f32 s7, s4, s10
+ vsub.f32 s8, s4, s10
+ vsub.f32 s12, s0, s9 @
+ vadd.f32 s3, s2, s1 @
+ vadd.f32 s5, s6, s14
+ vsub.f32 s4, s6, s14
+ vsub.f32 s2, s2, s1 @
+ vadd.f32 s0, s0, s9 @
+ vstr s7, [r4]
+ vstr s5, [r4, #4]
+ add r4, r4, #8
+ vstr s3, [r6]
+ vstr s12, [r6, #4]
+ add r6, r6, #8
+ vstr s8, [r8]
+ vstr s4, [r8, #4]
+ add r8, r8, #8
+ vstr s2, [r10]
+ vstr s0, [r10, #4]
+ add r10, r10, #8
+ bne _vfp_x8_loop
+ bx lr
+
+
+ .align 4
+#ifdef __APPLE__
+ .globl _vfp_end
+_vfp_end:
+#else
+ .globl vfp_end
+vfp_end:
+#endif
+ bx lr