Diffstat (limited to 'lib/ffts/src')
34 files changed, 9737 insertions, 0 deletions
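The diff below adds the FFTS single-precision FFT core (plan setup, runtime code generation, and the static/NEON/VFP/SSE kernels). For orientation only, a minimal caller of the public API added here (ffts_init_1d, ffts_execute, ffts_free from ffts.c, header installed as ffts/ffts.h per the Makefile.am rules) might look like the sketch below. The interleaved re/im float layout, the 32-byte buffer alignment, and the sign convention (-1 selecting the forward transform, as in firstpass_4_f) are inferred from the code in this diff rather than from documented guarantees.

    #include <stdlib.h>
    #include <ffts/ffts.h>   /* installed via libffts_include_HEADERS */

    int main(void)
    {
        size_t N = 64;                                           /* power-of-two transform size */
        float *in  = aligned_alloc(32, 2 * N * sizeof(float));   /* interleaved re,im pairs */
        float *out = aligned_alloc(32, 2 * N * sizeof(float));
        size_t i;

        for (i = 0; i < 2 * N; i++) in[i] = 0.0f;                /* real impulse as test input */
        in[0] = 1.0f;

        ffts_plan_t *p = ffts_init_1d(N, -1);                    /* sign -1: forward transform (assumption) */
        if (!p) return 1;

        ffts_execute(p, in, out);                                /* out now holds the complex spectrum */
        ffts_free(p);

        free(in);
        free(out);
        return 0;
    }

For sizes of 32 and above the plan dispatches to generated or static transform code; smaller sizes use the hardcoded firstpass_* routines, as seen in ffts_init_1d further down in this diff.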
diff --git a/lib/ffts/src/Makefile.am b/lib/ffts/src/Makefile.am new file mode 100644 index 0000000..8547795 --- /dev/null +++ b/lib/ffts/src/Makefile.am @@ -0,0 +1,34 @@ + + +lib_LTLIBRARIES = libffts.la + +libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c ffts_real_nd.c patterns.c +libffts_la_SOURCES += codegen.h codegen_arm.h codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h macros-neon.h macros-sse.h macros.h neon.h neon_float.h patterns.h types.h vfp.h + +if DYNAMIC_DISABLED +libffts_la_SOURCES += ffts_static.c +else +libffts_la_SOURCES += codegen.c +endif + +libffts_includedir=$(includedir)/ffts +libffts_include_HEADERS = ../include/ffts.h + + +if HAVE_VFP +libffts_la_SOURCES += vfp.s +else +if HAVE_NEON + +if DYNAMIC_DISABLED +libffts_la_SOURCES += neon_static_f.s neon_static_i.s +else +libffts_la_SOURCES += neon.s +endif + +else +if HAVE_SSE +libffts_la_SOURCES += sse.s +endif +endif +endif diff --git a/lib/ffts/src/Makefile.in b/lib/ffts/src/Makefile.in new file mode 100644 index 0000000..a1eefbc --- /dev/null +++ b/lib/ffts/src/Makefile.in @@ -0,0 +1,666 @@ +# Makefile.in generated by automake 1.12.4 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2012 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + + +VPATH = @srcdir@ +am__make_dryrun = \ + { \ + am__dry=no; \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ + | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ + *) \ + for am__flg in $$MAKEFLAGS; do \ + case $$am__flg in \ + *=*|--*) ;; \ + *n*) am__dry=yes; break;; \ + esac; \ + done;; \ + esac; \ + test $$am__dry = yes; \ + } +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +@DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c +@DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c +@HAVE_VFP_TRUE@am__append_3 = vfp.s +@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_4 = neon_static_f.s neon_static_i.s +@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_5 = neon.s +@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__append_6 = sse.s +subdir = src +DIST_COMMON = $(libffts_include_HEADERS) $(srcdir)/Makefile.am \ + $(srcdir)/Makefile.in $(top_srcdir)/depcomp +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_classpath.m4 \ + $(top_srcdir)/m4/ax_check_java_home.m4 \ + $(top_srcdir)/m4/ax_java_options.m4 \ + $(top_srcdir)/m4/ax_jni_include_dir.m4 \ + $(top_srcdir)/m4/ax_prog_jar.m4 \ + $(top_srcdir)/m4/ax_prog_javac.m4 \ + 
$(top_srcdir)/m4/ax_prog_javac_works.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir = { \ + test -z "$$files" \ + || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ + || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ + $(am__cd) "$$dir" && rm -f $$files; }; \ + } +am__installdirs = "$(DESTDIR)$(libdir)" \ + "$(DESTDIR)$(libffts_includedir)" +LTLIBRARIES = $(lib_LTLIBRARIES) +libffts_la_LIBADD = +am__libffts_la_SOURCES_DIST = ffts.c ffts_small.c ffts_nd.c \ + ffts_real.c ffts_real_nd.c patterns.c codegen.h codegen_arm.h \ + codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \ + ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \ + macros-neon.h macros-sse.h macros.h neon.h neon_float.h \ + patterns.h types.h vfp.h ffts_static.c codegen.c vfp.s \ + neon_static_f.s neon_static_i.s neon.s sse.s +@DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo +@DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo +@HAVE_VFP_TRUE@am__objects_3 = vfp.lo +@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_4 = neon_static_f.lo \ +@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@ neon_static_i.lo +@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_5 = neon.lo +@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__objects_6 = \ +@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@ sse.lo +am_libffts_la_OBJECTS = ffts.lo ffts_small.lo ffts_nd.lo ffts_real.lo \ + ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \ + $(am__objects_3) $(am__objects_4) $(am__objects_5) \ + $(am__objects_6) +libffts_la_OBJECTS = $(am_libffts_la_OBJECTS) +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS) +LTCCASCOMPILE = $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(CCAS) $(AM_CCASFLAGS) 
$(CCASFLAGS) +SOURCES = $(libffts_la_SOURCES) +DIST_SOURCES = $(am__libffts_la_SOURCES_DIST) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +HEADERS = $(libffts_include_HEADERS) +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +JAR = @JAR@ +JAVA = @JAVA@ +JAVAC = @JAVAC@ +JAVACFLAGS = @JAVACFLAGS@ +JAVAFLAGS = @JAVAFLAGS@ +JAVAPREFIX = @JAVAPREFIX@ +JAVA_PATH_NAME = @JAVA_PATH_NAME@ +JNI_CPPFLAGS = @JNI_CPPFLAGS@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +_ACJNI_JAVAC = @_ACJNI_JAVAC@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +lib_LTLIBRARIES = libffts.la +libffts_la_SOURCES = ffts.c ffts_small.c 
ffts_nd.c ffts_real.c \ + ffts_real_nd.c patterns.c codegen.h codegen_arm.h \ + codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \ + ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \ + macros-neon.h macros-sse.h macros.h neon.h neon_float.h \ + patterns.h types.h vfp.h $(am__append_1) $(am__append_2) \ + $(am__append_3) $(am__append_4) $(am__append_5) \ + $(am__append_6) +libffts_includedir = $(includedir)/ffts +libffts_include_HEADERS = ../include/ffts.h +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .lo .o .obj .s +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu src/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): +install-libLTLIBRARIES: $(lib_LTLIBRARIES) + @$(NORMAL_INSTALL) + @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ + list2=; for p in $$list; do \ + if test -f $$p; then \ + list2="$$list2 $$p"; \ + else :; fi; \ + done; \ + test -z "$$list2" || { \ + echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \ + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \ + } + +uninstall-libLTLIBRARIES: + @$(NORMAL_UNINSTALL) + @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ + for p in $$list; do \ + $(am__strip_dir) \ + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \ + done + +clean-libLTLIBRARIES: + -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES) + @list='$(lib_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } +libffts.la: $(libffts_la_OBJECTS) $(libffts_la_DEPENDENCIES) $(EXTRA_libffts_la_DEPENDENCIES) + $(LINK) -rpath $(libdir) $(libffts_la_OBJECTS) $(libffts_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codegen.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_nd.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/ffts_real.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real_nd.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_small.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_static.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/patterns.Plo@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< + +.s.o: + $(CCASCOMPILE) -c -o $@ $< + +.s.obj: + $(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.s.lo: + $(LTCCASCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs +install-libffts_includeHEADERS: $(libffts_include_HEADERS) + @$(NORMAL_INSTALL) + @list='$(libffts_include_HEADERS)'; test -n "$(libffts_includedir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(libffts_includedir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(libffts_includedir)" || exit 1; \ + fi; \ + for p in $$list; do \ + if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ + echo "$$d$$p"; \ + done | $(am__base_list) | \ + while read files; do \ + echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libffts_includedir)'"; \ + $(INSTALL_HEADER) $$files "$(DESTDIR)$(libffts_includedir)" || exit $$?; \ + done + +uninstall-libffts_includeHEADERS: + @$(NORMAL_UNINSTALL) + @list='$(libffts_include_HEADERS)'; test -n "$(libffts_includedir)" || list=; \ + files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ + dir='$(DESTDIR)$(libffts_includedir)'; $(am__uninstall_files_from_dir) + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + set x; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) 
$(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" + +cscopelist: $(HEADERS) $(SOURCES) $(LISP) + list='$(SOURCES) $(HEADERS) $(LISP)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) $(HEADERS) +installdirs: + for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(libffts_includedir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . 
= "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: install-libffts_includeHEADERS + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: install-libLTLIBRARIES + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-libLTLIBRARIES \ + uninstall-libffts_includeHEADERS + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libLTLIBRARIES clean-libtool cscopelist ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am \ + install-libLTLIBRARIES install-libffts_includeHEADERS \ + install-man install-pdf install-pdf-am install-ps \ + install-ps-am install-strip installcheck installcheck-am \ + installdirs maintainer-clean maintainer-clean-generic \ + mostlyclean mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool pdf pdf-am ps ps-am tags uninstall \ + uninstall-am uninstall-libLTLIBRARIES \ + uninstall-libffts_includeHEADERS + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/lib/ffts/src/codegen.c b/lib/ffts/src/codegen.c new file mode 100644 index 0000000..a66ecda --- /dev/null +++ b/lib/ffts/src/codegen.c @@ -0,0 +1,731 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "codegen.h" +#include "macros.h" +#include "ffts.h" + +#ifdef __APPLE__ + #include <libkern/OSCacheControl.h> +#endif + +#include <sys/types.h> +#include <sys/mman.h> + +#ifdef HAVE_NEON + #include "codegen_arm.h" + #include "neon.h" +#elif HAVE_VFP + #include "codegen_arm.h" + #include "vfp.h" +#else + #include "codegen_sse.h" + #include "macros-sse.h" +#endif + +#ifdef __ANDROID__ + #include <unistd.h> +#endif + +int tree_count(int N, int leafN, int offset) { + + if(N <= leafN) return 0; + int count = 0; + count += tree_count(N/4, leafN, offset); + count += tree_count(N/8, leafN, offset + N/4); + count += tree_count(N/8, leafN, offset + N/4 + N/8); + count += tree_count(N/4, leafN, offset + N/2); + count += tree_count(N/4, leafN, offset + 3*N/4); + + return 1 + count; +} + +void elaborate_tree(size_t **p, int N, int leafN, int offset) { + + if(N <= leafN) return; + elaborate_tree(p, N/4, leafN, offset); + elaborate_tree(p, N/8, leafN, offset + N/4); + elaborate_tree(p, N/8, leafN, offset + N/4 + N/8); + elaborate_tree(p, N/4, leafN, offset + N/2); + elaborate_tree(p, N/4, leafN, offset + 3*N/4); + + (*p)[0] = N; + (*p)[1] = offset*2; + + (*p)+=2; +} + + + + +uint32_t LUT_offset(size_t N, size_t leafN) { + int i; + size_t p_lut_size = 0; + size_t lut_size = 0; + int hardcoded = 0; + size_t n_luts = __builtin_ctzl(N/leafN); + int n = leafN*2; + //if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } + + for(i=0;i<n_luts-1;i++) { + p_lut_size = lut_size; + if(!i || hardcoded) { + #ifdef __arm__ + if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t); + else lut_size += n/4 * sizeof(cdata_t); + #else + lut_size += n/4 * 2 * sizeof(cdata_t); + #endif + // n *= 2; + } else { + #ifdef __arm__ + lut_size += n/8 * 3 * sizeof(cdata_t); + #else + lut_size += n/8 * 3 * 2 * sizeof(cdata_t); + #endif + } + n *= 2; + } + return lut_size; +} + +#ifdef __arm__ + typedef uint32_t insns_t; +#else + typedef uint8_t insns_t; +#endif + +#define P(x) (*(*p)++ = x) + +void insert_nops(uint8_t **p, uint32_t count) { + switch(count) { + case 0: break; + case 2: P(0x66); + case 1: P(0x90); break; + case 3: P(0x0F); P(0x1F); P(0x00); break; + case 4: P(0x0F); P(0x1F); P(0x40); P(0x00); break; + case 5: P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break; + case 6: P(0x66); P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break; + case 7: P(0x0F); P(0x1F); P(0x80); P(0x00); P(0x00); P(0x00); P(0x00); break; + case 8: P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break; + case 9: P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break; + default: + P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); + insert_nops(p, count-9); + break; + } +} + + +void align_mem16(uint8_t 
**p, uint32_t offset) { +#ifdef __x86_64__ + int r = (16 - (offset & 0xf)) - ((uint32_t)(*p) & 0xf); + r = (16 + r) & 0xf; + insert_nops(p, r); +#endif +} + +void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) { + int count = tree_count(N, leafN, 0) + 1; + size_t *ps = malloc(count * 2 * sizeof(size_t)); + size_t *pps = ps; + +#ifdef __x86_64__ + if(sign < 0) p->constants = sse_constants; + else p->constants = sse_constants_inv; +#endif + + elaborate_tree(&pps, N, leafN, 0); + pps[0] = 0; + pps[1] = 0; + + pps = ps; + +#ifdef __arm__ + if(N < 8192) p->transform_size = 8192; + else p->transform_size = N; +#else + if(N < 2048) p->transform_size = 16384; + else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N); +#endif + +#ifdef __APPLE__ + p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0); +#else +#define MAP_ANONYMOUS 0x20 + p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0); +#endif + +/* + if(p->transform_base == MAP_FAILED) { + fprintf(stderr, "MAP FAILED\n"); + exit(1); + }*/ + insns_t *func = p->transform_base;//valloc(8192); + insns_t *fp = func; + +//fprintf(stderr, "Allocating %d bytes \n", p->transform_size); +//fprintf(stderr, "Base address = %016p\n", func); + + if(!func) { + fprintf(stderr, "NOMEM\n"); + exit(1); + } + + insns_t *x_8_addr = fp; +#ifdef __arm__ +#ifdef HAVE_NEON + memcpy(fp, neon_x8, neon_x8_t - neon_x8); + /* + * Changes adds to subtracts and vice versa to allow the computation + * of both the IFFT and FFT + */ + if(sign < 0) { + fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; + fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; + fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; + } + fp += (neon_x8_t - neon_x8) / 4; +#else + memcpy(fp, vfp_x8, vfp_end - vfp_x8); + if(sign > 0) { + fp[65] ^= 0x00000040; + fp[66] ^= 0x00000040; + fp[68] ^= 0x00000040; + fp[70] ^= 0x00000040; + fp[103] ^= 0x00000040; + fp[104] ^= 0x00000040; + fp[105] ^= 0x00000040; + fp[108] ^= 0x00000040; + fp[113] ^= 0x00000040; + fp[114] ^= 0x00000040; + fp[117] ^= 0x00000040; + fp[118] ^= 0x00000040; + } + fp += (vfp_end - vfp_x8) / 4; +#endif +#else + align_mem16(&fp, 0); + x_8_addr = fp; + align_mem16(&fp, 5); + memcpy(fp, x8_soft, x8_hard - x8_soft); + fp += (x8_hard - x8_soft); +//fprintf(stderr, "X8 start address = %016p\n", x_8_addr); +#endif +//uint32_t *x_8_t_addr = fp; +//memcpy(fp, neon_x8_t, neon_end - neon_x8_t); +//fp += (neon_end - neon_x8_t) / 4; + insns_t *x_4_addr = fp; +#ifdef __arm__ + #ifdef HAVE_NEON + memcpy(fp, neon_x4, neon_x8 - neon_x4); + if(sign < 0) { + fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; + } + fp += (neon_x8 - neon_x4) / 4; + #else + memcpy(fp, vfp_x4, vfp_x8 - vfp_x4); + if(sign > 0) { + fp[36] ^= 0x00000040; + fp[38] ^= 0x00000040; + fp[43] ^= 0x00000040; + fp[44] ^= 0x00000040; + } + fp += (vfp_x8 - vfp_x4) / 4; + #endif +#else + align_mem16(&fp, 0); + x_4_addr = fp; + memcpy(fp, x4, x8_soft - x4); + fp += (x8_soft - x4); + +#endif + insns_t *start = fp; + +#ifdef __arm__ + *fp = PUSH_LR(); fp++; + *fp = 0xed2d8b10; fp++; + + ADDI(&fp, 3, 1, 0); + ADDI(&fp, 7, 1, N); + ADDI(&fp, 5, 1, 2*N); + ADDI(&fp, 10, 7, 2*N); + ADDI(&fp, 4, 5, 2*N); + ADDI(&fp, 8, 10, 2*N); + ADDI(&fp, 6, 4, 2*N); + ADDI(&fp, 9, 8, 2*N); + + *fp = LDRI(12, 0, 
((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12 +// *fp++ = LDRI(1, 0, 4); // load ws into r1 + ADDI(&fp, 1, 0, 0); + + ADDI(&fp, 0, 2, 0), // mov out into r0 +#endif + + +#ifdef __arm__ + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; + #ifdef HAVE_NEON + MOVI(&fp, 11, p->i0); + #else + MOVI(&fp, 11, p->i0); + #endif + +#else + align_mem16(&fp, 0); + start = fp; + + *fp++ = 0x4c; + *fp++ = 0x8b; + *fp++ = 0x07; + uint32_t lp_cnt = p->i0 * 4; + MOVI(&fp, RCX, lp_cnt); + + //LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p)); +#endif + //fp++; +#ifdef __arm__ +#ifdef HAVE_NEON + memcpy(fp, neon_ee, neon_oo - neon_ee); + if(sign < 0) { + fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; + fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; + fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; + } + fp += (neon_oo - neon_ee) / 4; +#else + memcpy(fp, vfp_e, vfp_o - vfp_e); + if(sign > 0) { + fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; + fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; + fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; + } + fp += (vfp_o - vfp_e) / 4; +#endif +#else +//fprintf(stderr, "Body start address = %016p\n", start); + + PUSH(&fp, RBP); + PUSH(&fp, RBX); + PUSH(&fp, R10); + PUSH(&fp, R11); + PUSH(&fp, R12); + PUSH(&fp, R13); + PUSH(&fp, R14); + PUSH(&fp, R15); + + int i; + memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init); + +//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init); +//fprintf(stderr, "Constants address = %016p\n", sse_constants); +//fprintf(stderr, "Constants address = %016p\n", p->constants); + +//int32_t val = READ_IMM32(fp + 3); +//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p)); + +//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp ); +//fprintf(stderr, "IMM = 0x%llx\n", v2); + +//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp )); + fp += (leaf_ee - leaf_ee_init); + +//fprintf(stderr, "Leaf start address = %016p\n", fp); + align_mem16(&fp, 9); + memcpy(fp, leaf_ee, leaf_oo - leaf_ee); + + + uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4}; + uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4}; + uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2}; + + for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4); + + fp += (leaf_oo - leaf_ee); + + if(__builtin_ctzl(N) & 1){ + + if(p->i1) { + lp_cnt += p->i1 * 4; + MOVI(&fp, RCX, lp_cnt); + align_mem16(&fp, 4); + memcpy(fp, leaf_oo, leaf_eo - leaf_oo); + for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); + fp += (leaf_eo - leaf_oo); + } + + + memcpy(fp, leaf_oe, leaf_end - leaf_oe); + lp_cnt += 4; + for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4); + fp += (leaf_end - leaf_oe); + + }else{ + + + memcpy(fp, leaf_eo, leaf_oe - leaf_eo); + lp_cnt += 4; + for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4); + fp += (leaf_oe - leaf_eo); + + if(p->i1) { + lp_cnt += p->i1 * 4; + MOVI(&fp, RCX, lp_cnt); + align_mem16(&fp, 4); + memcpy(fp, leaf_oo, leaf_eo - leaf_oo); + for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); + fp += (leaf_eo - leaf_oo); + } + + } + if(p->i1) { + lp_cnt += p->i1 * 4; + MOVI(&fp, 
RCX, lp_cnt); + align_mem16(&fp, 9); + memcpy(fp, leaf_ee, leaf_oo - leaf_ee); + for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4); + fp += (leaf_oo - leaf_ee); + + } + +//fprintf(stderr, "Body start address = %016p\n", fp); + //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); + memcpy(fp, x_init, x4 - x_init); +//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); + fp += (x4 - x_init); + + int32_t pAddr = 0; + int32_t pN = 0; + int32_t pLUT = 0; + count = 2; + while(pps[0]) { + + if(!pN) { + MOVI(&fp, RCX, pps[0] / 4); + }else{ + if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr); + if(pps[0] > leafN && pps[0] - pN) { + + int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN); + *fp++ = 0xc1; + + if(diff > 0) { + *fp++ = 0xe1; + *fp++ = (diff & 0xff); + }else{ + *fp++ = 0xe9; + *fp++ = ((-diff) & 0xff); + } + } + } + + if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) + ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); + + + if(pps[0] == 2*leafN) { + CALL(&fp, x_4_addr); + // }else if(!pps[2]){ + // //uint32_t *x_8_t_addr = fp; + // memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); + // fp += (neon_ee - neon_x8_t) / 4; + // //*fp++ = BL(fp+2, x_8_t_addr); + }else{ + CALL(&fp, x_8_addr); + } + + pAddr = pps[1] * 4; + if(pps[0] > leafN) + pN = pps[0]; + pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); +// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); + count += 4; + pps += 2; + } +#endif +#ifdef __arm__ +#ifdef HAVE_NEON + if(__builtin_ctzl(N) & 1){ + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + if(p->i1) { + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_oo, neon_eo - neon_oo); + if(sign < 0) { + fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; + fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; + fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; + } + fp += (neon_eo - neon_oo) / 4; + } + + *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++; + + memcpy(fp, neon_oe, neon_end - neon_oe); + if(sign < 0) { + fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000; + fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; + fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000; + } + fp += (neon_end - neon_oe) / 4; + + }else{ + + *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++; + + memcpy(fp, neon_eo, neon_oe - neon_eo); + if(sign < 0) { + fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; + fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000; + fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000; + } + fp += (neon_oe - neon_eo) / 4; + + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + if(p->i1) { + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_oo, neon_eo - neon_oo); + if(sign < 0) { + fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; + fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; + fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; + } + fp += (neon_eo - 
neon_oo) / 4; + } + + } + + + if(p->i1) { + ADDI(&fp, 2, 3, 0); + ADDI(&fp, 3, 7, 0); + ADDI(&fp, 7, 2, 0); + + ADDI(&fp, 2, 4, 0); + ADDI(&fp, 4, 8, 0); + ADDI(&fp, 8, 2, 0); + + ADDI(&fp, 2, 5, 0); + ADDI(&fp, 5, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 6, 0); + ADDI(&fp, 6, 10, 0); + ADDI(&fp, 10, 2, 0); + + ADDI(&fp, 2, 9, 0); + ADDI(&fp, 9, 10, 0); + ADDI(&fp, 10, 2, 0); + + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_ee, neon_oo - neon_ee); + if(sign < 0) { + fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; + fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; + fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; + } + fp += (neon_oo - neon_ee) / 4; + + } +#else + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1); + memcpy(fp, vfp_o, vfp_x4 - vfp_o); + if(sign > 0) { + fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040; + fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040; + } + fp += (vfp_x4 - vfp_o) / 4; + + ADDI(&fp, 2, 3, 0); + ADDI(&fp, 3, 7, 0); + ADDI(&fp, 7, 2, 0); + + ADDI(&fp, 2, 4, 0); + ADDI(&fp, 4, 8, 0); + ADDI(&fp, 8, 2, 0); + + ADDI(&fp, 2, 5, 0); + ADDI(&fp, 5, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 6, 0); + ADDI(&fp, 6, 10, 0); + ADDI(&fp, 10, 2, 0); + + ADDI(&fp, 2, 9, 0); + ADDI(&fp, 9, 10, 0); + ADDI(&fp, 10, 2, 0); + + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; + MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1); + memcpy(fp, vfp_e, vfp_o - vfp_e); + if(sign > 0) { + fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; + fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; + fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; + } + fp += (vfp_o - vfp_e) / 4; + +#endif + *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12 + //ADDI(&fp, 2, 1, 0); + MOVI(&fp, 1, 0); + + // args: r0 - out + // r1 - N + // r2 - ws +// ADDI(&fp, 3, 1, 0); // put N into r3 for counter + + int32_t pAddr = 0; + int32_t pN = 0; + int32_t pLUT = 0; + count = 2; + while(pps[0]) { + +// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); + if(!pN) { + MOVI(&fp, 1, pps[0]); + }else{ + if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); + if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); + } + + if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) + ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); + + + if(pps[0] == 2*leafN) { + *fp = BL(fp+2, x_4_addr); fp++; + }else if(!pps[2]){ + //uint32_t *x_8_t_addr = fp; +#ifdef HAVE_NEON + memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); + if(sign < 0) { + fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; + fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; + fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; + } + fp += (neon_ee - neon_x8_t) / 4; + //*fp++ = BL(fp+2, x_8_t_addr); + +#else + *fp = BL(fp+2, x_8_addr); fp++; +#endif + }else{ + *fp = BL(fp+2, x_8_addr); fp++; + } + + pAddr = pps[1] * 4; + pN = pps[0]; + pLUT = 
p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); +// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); + count += 4; + pps += 2; + } + + *fp++ = 0xecbd8b10; + *fp++ = POP_LR(); count++; +#else + POP(&fp, R15); + POP(&fp, R14); + POP(&fp, R13); + POP(&fp, R12); + POP(&fp, R11); + POP(&fp, R10); + POP(&fp, RBX); + POP(&fp, RBP); + RET(&fp); + + +//uint8_t *pp = func; +//int counter = 0; +//do{ +// printf("%02x ", *pp); +// if(counter++ % 16 == 15) printf("\n"); +//} while(++pp < fp); + +//printf("\n"); + + +#endif + + +// *fp++ = B(14); count++; + +//for(int i=0;i<(neon_x8 - neon_x4)/4;i++) +// fprintf(stderr, "%08x\n", x_4_addr[i]); +//fprintf(stderr, "\n"); +//for(int i=0;i<count;i++) + + free(ps); + + if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) { + perror("Couldn't mprotect"); + exit(1); + } +#ifdef __APPLE__ + sys_icache_invalidate(func, p->transform_size); +#elif __ANDROID__ + cacheflush((long)(func), (long)(func) + p->transform_size, 0); +#elif __linux__ +#ifdef __GNUC__ + __clear_cache((long)(func), (long)(func) + p->transform_size); +#endif +#endif + +//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4); + + p->transform = (void *) (start); +} diff --git a/lib/ffts/src/codegen.h b/lib/ffts/src/codegen.h new file mode 100644 index 0000000..f592907 --- /dev/null +++ b/lib/ffts/src/codegen.h @@ -0,0 +1,49 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef __CODEGEN_H__ +#define __CODEGEN_H__ + +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <sys/mman.h> +#include <string.h> +#include <limits.h> /* for PAGESIZE */ + +#include "ffts.h" + +void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN, int sign); + +#endif diff --git a/lib/ffts/src/codegen_arm.h b/lib/ffts/src/codegen_arm.h new file mode 100644 index 0000000..ad8a9d8 --- /dev/null +++ b/lib/ffts/src/codegen_arm.h @@ -0,0 +1,101 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef __CODEGEN_ARM_H__ +#define __CODEGEN_ARM_H__ + + + +uint32_t BL(void *pos, void *target) { + return 0xeb000000 | (((target - pos) / 4) & 0xffffff); +} + +uint32_t B(uint8_t r) { + return 0xe12fff10 | r; +} + +uint32_t MOV(uint8_t dst, uint8_t src) { + return 0xe1a00000 | (src & 0xf) | ((dst & 0xf) << 12); +} + +void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) { + int32_t oimm = imm; + if(imm < 0) { + imm = -imm; + uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm); + if(shamt & 1) shamt -= 1; + imm >>= shamt; + shamt = (32 - shamt)/2; + + // if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm); + *(*p)++ = 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); + + if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2)))); + + }else{ + uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm); + if(shamt & 1) shamt -= 1; + imm >>= shamt; + shamt = (32 - shamt)/2; + +// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm); + + *(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); + + if(imm > 255) ADDI(p, dst, src, (oimm - ((imm & 0xff) << (32-shamt*2)))); + } +} + +uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) { + return 0xe5900000 | ((dst & 0xf) << 12) + | ((base & 0xf) << 16) | (offset & 0xfff) ; +} + +void MOVI(uint32_t **p, uint8_t dst, uint32_t imm) { + uint32_t oimm = imm; + + uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm); + if(shamt & 1) shamt -= 1; + imm >>= shamt; + shamt = (32 - shamt)/2; + *(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ; + if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2)))); +} + +uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; } +uint32_t POP_LR() { return 0xe8bd8ff0; } //0xe8bd8000; } + + + + +#endif diff --git a/lib/ffts/src/codegen_sse.h b/lib/ffts/src/codegen_sse.h new file mode 100644 index 0000000..ec85667 --- /dev/null +++ b/lib/ffts/src/codegen_sse.h @@ -0,0 +1,195 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. 
BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +#ifndef __CODEGEN_SSE_H__ +#define __CODEGEN_SSE_H__ + +void neon_x4(float *, size_t, float *); +void neon_x8(float *, size_t, float *); +void neon_x8_t(float *, size_t, float *); +void leaf_ee_init(); +void leaf_ee(); +void leaf_oo(); +void leaf_eo(); +void leaf_oe(); +void leaf_end(); +void x_init(); +void x4(); +void x8_soft(); +void x8_hard(); + +void sse_constants(); +void sse_constants_inv(); + +// typedef uint8_t insns_t; + +extern const uint32_t sse_leaf_ee_offsets[8]; +extern const uint32_t sse_leaf_oo_offsets[8]; +extern const uint32_t sse_leaf_eo_offsets[8]; +extern const uint32_t sse_leaf_oe_offsets[8]; + +#define EAX 0 +#define ECX 1 +#define EDX 2 +#define EBX 3 +#define ESI 6 +#define EDI 7 +#define EBP 5 + +#define RAX 0 +#define RCX 1 +#define RDX 2 +#define RBX 3 +#define RSI 6 +#define RDI 7 +#define RBP 5 +#define R8 8 +#define R9 9 +#define R10 10 +#define R11 11 +#define R12 12 +#define R13 13 +#define R14 14 +#define R15 15 + +void IMM8(uint8_t **p, int32_t imm) { + *(*p)++ = (imm & 0xff); +} + +void IMM16(uint8_t **p, int32_t imm) { + int i; + for(i=0;i<2;i++) { + *(*p)++ = (imm & (0xff << (i*8))) >> (i*8); + } +} +void IMM32(uint8_t **p, int32_t imm) { + int i; + for(i=0;i<4;i++) { + *(*p)++ = (imm & (0xff << (i*8))) >> (i*8); + } +} +void IMM32_NI(uint8_t *p, int32_t imm) { + int i; + for(i=0;i<4;i++) { + *(p+i) = (imm & (0xff << (i*8))) >> (i*8); + } +} + +int32_t READ_IMM32(uint8_t *p) { + int32_t rval = 0; + int i; + for(i=0;i<4;i++) { + rval |= *(p+i) << (i*8); + } + return rval; +} + +void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) { +// if(imm < 65536) *(*p)++ = 0x66; + if(dst >= 8) *(*p)++ = 0x41; + + //if(imm < 65536 && imm >= 256) *(*p)++ = 0x66; + + //if(imm >= 256) + *(*p)++ = 0xb8 | (dst & 0x7); +// else *(*p)++ = 0xb0 | (dst & 0x7); + + // if(imm < 256) IMM8(p, imm); +// else +//if(imm < 65536) IMM16(p, imm); +//else + IMM32(p, imm); + +//if(dst < 8) { +// *(*p)++ = 0xb8 + dst; +//}else{ +// *(*p)++ = 0x49; +// *(*p)++ = 0xc7; +// *(*p)++ = 0xc0 | (dst - 8); +//} +//IMM32(p, imm); +} + +void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) { + if(disp == 0) { + *(*p)++ = (rm & 7) | ((reg & 7) << 3); + }else if(disp <= 127 || disp >= -128) { + *(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3); + IMM8(p, disp); + }else{ + *(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3); + IMM32(p, disp); + } +} + +void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) { + + *(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1); + *(*p)++ = 0x8d; + ADDRMODE(p, dst, base, disp); +} + +void RET(uint8_t **p) { + *(*p)++ = 0xc3; +} + +void ADDI(uint8_t **p, uint8_t dst, int32_t imm) { + + if(dst >= 8) *(*p)++ = 0x49; + else *(*p)++ = 0x48; + + if(imm > 127 || imm <= -128) *(*p)++ = 0x81; + else *(*p)++ = 0x83; + + *(*p)++ = 0xc0 | (dst & 0x7); + + if(imm > 127 || imm <= -128) IMM32(p, imm); + else IMM8(p, imm); +} + +void CALL(uint8_t **p, uint8_t *func) { + *(*p)++ = 0xe8; + IMM32(p, ((void *)func) - (void *)(*p) - 4); +} + +void PUSH(uint8_t **p, uint8_t reg) { + if(reg >= 8) 
*(*p)++ = 0x41; + *(*p)++ = 0x50 | (reg & 7); +} +void POP(uint8_t **p, uint8_t reg) { + if(reg >= 8) *(*p)++ = 0x41; + *(*p)++ = 0x58 | (reg & 7); +} + +#endif diff --git a/lib/ffts/src/ffts.c b/lib/ffts/src/ffts.c new file mode 100644 index 0000000..bec2177 --- /dev/null +++ b/lib/ffts/src/ffts.c @@ -0,0 +1,398 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +#include "ffts.h" +#include "macros.h" +//#include "mini_macros.h" +#include "patterns.h" +#include "ffts_small.h" + +#ifdef DYNAMIC_DISABLED + #include "ffts_static.h" +#else + #include "codegen.h" +#endif + +#include <errno.h> + #include <sys/mman.h> + #include <string.h> + #include <limits.h> /* for PAGESIZE */ + +#if __APPLE__ + #include <libkern/OSCacheControl.h> +#else +#endif + +void ffts_execute(ffts_plan_t *p, const void * in, void * out) { + p->transform(p, (const float *)in, (float *)out); +} + +void ffts_free(ffts_plan_t *p) { + p->destroy(p); +} + +void ffts_free_1d(ffts_plan_t *p) { + + size_t i; + + if(p->ws) { + FFTS_FREE(p->ws); + } + if(p->is) free(p->is); + if(p->ws_is) free(p->ws_is); + if(p->offsets) free(p->offsets); + //free(p->transforms); + if(p->transforms) free(p->transforms); + + if(p->transform_base) { + if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) { + perror("Couldn't mprotect"); + exit(errno); + } + munmap(p->transform_base, p->transform_size); + //free(p->transform_base); + } + free(p); +} + +ffts_plan_t *ffts_init_1d(size_t N, int sign) { + ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); + size_t leafN = 8; + size_t i; + +#ifdef __arm__ +//#ifdef HAVE_NEON + V MULI_SIGN; + + if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); + else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); +//#endif +#else + V MULI_SIGN; + + if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); + else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); +#endif + + p->transform = NULL; + p->transform_base = NULL; + p->transforms = NULL; + p->is = NULL; + p->ws_is = NULL; + p->ws = NULL; + p->offsets = NULL; + p->destroy = ffts_free_1d; + + if(N >= 32) { + ffts_init_offsets(p, N, leafN); +#ifdef __arm__ +#ifdef HAVE_NEON + ffts_init_is(p, N, leafN, 1); +#else + ffts_init_is(p, N, leafN, 1); +#endif +#else + ffts_init_is(p, N, leafN, 1); +#endif + + p->i0 = N/leafN/3+1; + p->i1 = N/leafN/3; + if((N/leafN) % 3 > 1) p->i1++; + p->i2 = N/leafN/3; + + #ifdef __arm__ + #ifdef HAVE_NEON + p->i0/=2; + p->i1/=2; + #endif + #else + p->i0/=2; + p->i1/=2; + #endif + + }else{ + p->transforms = malloc(2 * sizeof(transform_index_t)); + p->transforms[0] = 0; + p->transforms[1] = 1; + if(N == 2) p->transform = &firstpass_2; + else if(N == 4 && sign == -1) p->transform = &firstpass_4_f; + else if(N == 4 && sign == 1) p->transform = &firstpass_4_b; + else if(N == 8 && sign == -1) p->transform = &firstpass_8_f; + else if(N == 8 && sign == 1) p->transform = &firstpass_8_b; + else if(N == 16 && sign == -1) p->transform = &firstpass_16_f; + else if(N == 16 && sign == 1) p->transform = &firstpass_16_b; + + p->is = NULL; + p->offsets = NULL; + } + + int hardcoded = 0; + + /* LUTS */ + size_t n_luts = __builtin_ctzl(N/leafN); + if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } + + if(n_luts >= 32) n_luts = 0; + +// fprintf(stderr, "n_luts = %zu\n", n_luts); + + cdata_t *w; + + int n = leafN*2; + if(hardcoded) n = 8; + + size_t lut_size = 0; + + for(i=0;i<n_luts;i++) { + if(!i || hardcoded) { + #ifdef __arm__ + if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t); + else lut_size += n/4 * sizeof(cdata_t); + #else + lut_size += n/4 * 2 * sizeof(cdata_t); + #endif + n *= 2; + } else { + #ifdef __arm__ + lut_size += n/8 * 3 * sizeof(cdata_t); + #else + lut_size += n/8 * 3 * 2 * sizeof(cdata_t); + #endif + } + n *= 2; + } + +// lut_size *= 16; + + // fprintf(stderr, "lut size = %zu\n", lut_size); + if(n_luts) { + p->ws = FFTS_MALLOC(lut_size,32); + p->ws_is = malloc(n_luts 
* sizeof(size_t)); + }else{ + p->ws = NULL; + p->ws_is = NULL; + } + w = p->ws; + + n = leafN*2; + if(hardcoded) n = 8; + + #ifdef HAVE_NEON + V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); + #endif + + for(i=0;i<n_luts;i++) { + p->ws_is[i] = w - (cdata_t *)p->ws; + //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); + + if(!i || hardcoded) { + cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); + + size_t j; + for(j=0;j<n/4;j++) { + w0[j][0] = W_re(n,j); + w0[j][1] = W_im(n,j); + } + + + float *fw0 = (float *)w0; + #ifdef __arm__ + if(N < 32) { + //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2; + for(j=0;j<n/4;j+=2) { + // #ifdef HAVE_NEON + temp0 = VLD(fw0 + j*2); + V re, im; + re = VDUPRE(temp0); + im = VDUPIM(temp0); + #ifdef HAVE_NEON + im = VXOR(im, MULI_SIGN); + //im = IMULI(sign>0, im); + #else + im = MULI(sign>0, im); + #endif + VST(fw + j*4 , re); + VST(fw + j*4+4, im); + // #endif + } + w += n/4 * 2; + }else{ + //w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); + float *fw = (float *)w; + #ifdef HAVE_NEON + VS temp0, temp1, temp2; + for(j=0;j<n/4;j+=4) { + temp0 = VLD2(fw0 + j*2); + temp0.val[1] = VXOR(temp0.val[1], neg); + STORESPR(fw + j*2, temp0); + } + #else + for(j=0;j<n/4;j+=1) { + fw[j*2] = fw0[j*2]; + fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; + } + #endif + w += n/4; + } + #else + //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2; + for(j=0;j<n/4;j+=2) { + temp0 = VLD(fw0 + j*2); + V re, im; + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*4 , re); + VST(fw + j*4+4, im); + } + w += n/4 * 2; + #endif + + FFTS_FREE(w0); + }else{ + + cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + + size_t j; + for(j=0;j<n/8;j++) { + w0[j][0] = W_re(n,j*2); + w0[j][1] = W_im(n,j*2); + w1[j][0] = W_re(n,j); + w1[j][1] = W_im(n,j); + w2[j][0] = W_re(n,j + (n/8)); + w2[j][1] = W_im(n,j + (n/8)); + + } + + float *fw0 = (float *)w0; + float *fw1 = (float *)w1; + float *fw2 = (float *)w2; + #ifdef __arm__ + //w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32); + float *fw = (float *)w; + #ifdef HAVE_NEON + VS temp0, temp1, temp2; + for(j=0;j<n/8;j+=4) { + temp0 = VLD2(fw0 + j*2); + temp0.val[1] = VXOR(temp0.val[1], neg); + STORESPR(fw + j*2*3, temp0); + temp1 = VLD2(fw1 + j*2); + temp1.val[1] = VXOR(temp1.val[1], neg); + STORESPR(fw + j*2*3 + 8, temp1); + temp2 = VLD2(fw2 + j*2); + temp2.val[1] = VXOR(temp2.val[1], neg); + STORESPR(fw + j*2*3 + 16, temp2); + } + #else + for(j=0;j<n/8;j+=1) { + fw[j*6] = fw0[j*2]; + fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; + fw[j*6+2] = fw1[j*2+0]; + fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1]; + fw[j*6+4] = fw2[j*2+0]; + fw[j*6+5] = (sign < 0) ? 
fw2[j*2+1] : -fw2[j*2+1]; + } + #endif + w += n/8 * 3; + #else + //w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2, re, im; + for(j=0;j<n/8;j+=2) { + temp0 = VLD(fw0 + j*2); + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6 , re); + VST(fw + j*2*6+4, im); + + temp1 = VLD(fw1 + j*2); + re = VDUPRE(temp1); + im = VDUPIM(temp1); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6+8 , re); + VST(fw + j*2*6+12, im); + + temp2 = VLD(fw2 + j*2); + re = VDUPRE(temp2); + im = VDUPIM(temp2); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6+16, re); + VST(fw + j*2*6+20, im); + } + w += n/8 * 3 * 2; + #endif + + FFTS_FREE(w0); + FFTS_FREE(w1); + FFTS_FREE(w2); + } + ///p->ws[i] = w; + + n *= 2; + } + + float *tmp = (float *)p->ws; + + if(sign < 0) { + p->oe_ws = (void *)(&w_data[4]); + p->ee_ws = (void *)(w_data); + p->eo_ws = (void *)(&w_data[4]); + }else{ + p->oe_ws = (void *)(w_data + 12); + p->ee_ws = (void *)(w_data + 8); + p->eo_ws = (void *)(w_data + 12); + } + + p->N = N; + p->lastlut = w; + p->n_luts = n_luts; +#ifdef DYNAMIC_DISABLED + if(sign < 0) { + if(N >= 32) p->transform = ffts_static_transform_f; + }else{ + if(N >= 32) p->transform = ffts_static_transform_i; + } + +#else + if(N>=32) ffts_generate_func_code(p, N, leafN, sign); +#endif + + return p; +} + diff --git a/lib/ffts/src/ffts.h b/lib/ffts/src/ffts.h new file mode 100644 index 0000000..4409029 --- /dev/null +++ b/lib/ffts/src/ffts.h @@ -0,0 +1,177 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
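The plan/execute/free entry points defined above in ffts.c (and declared in ffts.h, which follows) form the public 1-D complex API. A minimal usage sketch, illustrative only and not part of this patch: the buffers are assumed to hold interleaved complex floats, and 32-byte alignment is assumed to match the alignment the planner uses for its own tables via FFTS_MALLOC(lut_size, 32); a negative sign selects the forward transform, as in the firstpass_*_f dispatch above.

/* Usage sketch (illustrative only, not part of this patch). */
#include "ffts.h"

static int demo_forward_64(void)
{
    /* interleaved complex floats: re,im,re,im,... ; the alignment is an assumption */
    float __attribute__((aligned(32))) in[64 * 2];
    float __attribute__((aligned(32))) out[64 * 2];
    int i;

    for (i = 0; i < 64; i++) {
        in[2 * i]     = (float)i;  /* real part */
        in[2 * i + 1] = 0.0f;      /* imaginary part */
    }

    ffts_plan_t *p = ffts_init_1d(64, -1);  /* sign < 0: forward transform */
    ffts_execute(p, in, out);               /* runs p->transform() */
    ffts_free(p);                           /* runs p->destroy() */
    return 0;
}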
+ +*/ +#ifndef __CP_SSE_H__ +#define __CP_SSE_H__ + +#include "config.h" + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stddef.h> +#include <stdint.h> +//#include <stdalign.h> + +//#include "codegen.h" +#include "types.h" + +#define PI 3.1415926535897932384626433832795028841971693993751058209 + +static const __attribute__ ((aligned(64))) float w_data[16] = { + 0.70710678118654757273731092936941, 0.70710678118654746171500846685376, + -0.70710678118654757273731092936941, -0.70710678118654746171500846685376, + 1.0f, 0.70710678118654757273731092936941f, + -0.0f, -0.70710678118654746171500846685376, + 0.70710678118654757273731092936941, 0.70710678118654746171500846685376, + 0.70710678118654757273731092936941, 0.70710678118654746171500846685376, + 1.0f, 0.70710678118654757273731092936941f, + 0.0f, 0.70710678118654746171500846685376 +}; + +__INLINE float W_re(float N, float k) { return cos(-2.0f * PI * k / N); } +__INLINE float W_im(float N, float k) { return sin(-2.0f * PI * k / N); } + +typedef size_t transform_index_t; + +//typedef void (*transform_func_t)(float *data, size_t N, float *LUT); +typedef void (*transform_func_t)(float *data, size_t N, float *LUT); + +typedef struct _ffts_plan_t ffts_plan_t; + +/** + * Contains all the Information need to perform FFT + * + * + * DO NOT CHANGE THE ORDER OF MEMBERS + * ASSEMBLY CODE USES HARD CODED OFFSETS TO REFERENCE + * SOME OF THESE VARIABES!! + */ +struct _ffts_plan_t { + + /** + * + */ + ptrdiff_t *offsets; +#ifdef DYNAMIC_DISABLED + /** + * Twiddle factors + */ + void *ws; + /** + * ee - 2 size x size8 + * oo - 2 x size4 in parallel + * oe - + */ + void *oe_ws, *eo_ws, *ee_ws; +#else + void __attribute__((aligned(32))) *ws; + void __attribute__((aligned(32))) *oe_ws, *eo_ws, *ee_ws; +#endif + /** + * Pointer into an array of precomputed indexes for the input data array + */ + ptrdiff_t *is; + + /** + * Twiddle Factor Indexes + */ + size_t *ws_is; + + /** + * Size of the loops for the base cases + */ + size_t i0, i1, n_luts; + + /** + * Size fo the Transform + */ + size_t N; + void *lastlut; + /** + * Used in multidimensional Code ?? + */ + transform_index_t *transforms; + //transform_func_t transform; + + /** + * Pointer to the dynamically generated function + * that will execute the FFT + */ + void (*transform)(ffts_plan_t * , const void * , void * ); + + /** + * Pointer to the base memory address of + * of the transform function + */ + void *transform_base; + + /** + * Size of the memory block contain the + * generated code + */ + size_t transform_size; + + /** + * Points to the cosnant variables used by + * the Assembly Code + */ + void *constants; + + // multi-dimensional stuff: + struct _ffts_plan_t **plans; + int rank; + size_t *Ns, *Ms; + void *buf; + + void *transpose_buf; + + /** + * Pointer to the destroy function + * to clean up the plan after use + * (differs for real and multi dimension transforms + */ + void (*destroy)(ffts_plan_t *); + + /** + * Coefficiants for the real valued transforms + */ + float *A, *B; + + size_t i2; +}; + + +void ffts_free(ffts_plan_t *); +ffts_plan_t *ffts_init_1d(size_t N, int sign); +void ffts_execute(ffts_plan_t *, const void *, void *); +#endif diff --git a/lib/ffts/src/ffts_nd.c b/lib/ffts/src/ffts_nd.c new file mode 100644 index 0000000..ae9b148 --- /dev/null +++ b/lib/ffts/src/ffts_nd.c @@ -0,0 +1,282 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. 
Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "ffts_nd.h" + +#ifdef HAVE_NEON +#include "neon.h" +#endif + +void ffts_free_nd(ffts_plan_t *p) { + + int i; + for(i=0;i<p->rank;i++) { + + ffts_plan_t *x = p->plans[i]; + int k; + for(k=0;k<i;k++) { + if(p->Ms[i] == p->Ms[k]) x = NULL; + } + + if(x) ffts_free(x); + } + + free(p->Ns); + free(p->Ms); + free(p->plans); + free(p->buf); + free(p->transpose_buf); + free(p); +} +#define TSIZE 8 +#include <string.h> +void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) { + +#ifdef HAVE_NEON + size_t i,j,k; + int linebytes = w*8; + + for(j=0;j<h;j+=8) { + for(i=0;i<w;i+=8) { + neon_transpose_to_buf(in + j*w + i, buf, w); + + uint64_t *p = out + i*h + j; + uint64_t *pbuf = buf; + uint64_t *ptemp; + + __asm__ __volatile__( + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" + "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" + "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" + "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" + "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" + "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" + "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" + "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" + "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" + "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" + "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" + "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" + "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" + "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" + "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" + "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" + "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" + 
"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" + "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" + "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" + "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" + "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" + "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" + "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" + "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" + + : [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp) + : [w] "r" (w) + : "memory", "q8", "q9", "q10", "q11" + ); +// out[i*h + j] = in[j*w + i]; + } + } +#else +#ifdef HAVE_SSE + uint64_t tmp[TSIZE*TSIZE] __attribute__((aligned(64))); + int tx, ty; + int x, y; + int tw = w / TSIZE; + int th = h / TSIZE; + for (ty=0;ty<th;ty++) { + for (tx=0;tx<tw;tx++) { + uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE; + uint64_t *op0 = tmp;//out + h*TSIZE*tx + ty*TSIZE; + + // Copy/transpose to tmp + for (y=0;y<TSIZE;y+=2) { + //for (x=0;x<TSIZE;x+=2) { + //op[x*TSIZE] = ip[x]; + __m128d q0 = _mm_load_pd((double *)(ip0 + 0*w)); + __m128d q1 = _mm_load_pd((double *)(ip0 + 1*w)); + __m128d q2 = _mm_load_pd((double *)(ip0 + 2*w)); + __m128d q3 = _mm_load_pd((double *)(ip0 + 3*w)); + __m128d q4 = _mm_load_pd((double *)(ip0 + 4*w)); + __m128d q5 = _mm_load_pd((double *)(ip0 + 5*w)); + __m128d q6 = _mm_load_pd((double *)(ip0 + 6*w)); + __m128d q7 = _mm_load_pd((double *)(ip0 + 7*w)); + ip0 += 2; + + __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); + __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); + __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0)); + __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1)); + __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0)); + __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1)); + __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0)); + __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1)); + //_mm_store_pd((double *)(op0 + y*h + x), t0); + //_mm_store_pd((double *)(op0 + y*h + x + h), t1); + _mm_store_pd((double *)(op0 + 0), t0); + _mm_store_pd((double *)(op0 + 0 + TSIZE), t1); + _mm_store_pd((double *)(op0 + 2 ), t2); + _mm_store_pd((double *)(op0 + 2 + TSIZE), t3); + _mm_store_pd((double *)(op0 + 4 ), t4); + _mm_store_pd((double *)(op0 + 4 + TSIZE), t5); + _mm_store_pd((double *)(op0 + 6 ), t6); + _mm_store_pd((double *)(op0 + 6 + TSIZE), t7); + //} + op0 += 2*TSIZE; + } + + op0 = out + h*tx*TSIZE + ty*TSIZE; + ip0 = tmp; + for (y=0;y<TSIZE;y+=1) { + // memcpy(op0, ip0, TSIZE * sizeof(*ip0)); + + __m128d q0 = _mm_load_pd((double *)(ip0 + 0)); + __m128d q1 = _mm_load_pd((double *)(ip0 + 2)); + __m128d q2 = _mm_load_pd((double *)(ip0 + 4)); + __m128d q3 = _mm_load_pd((double *)(ip0 + 6)); + _mm_store_pd((double *)(op0 + 0), q0); + _mm_store_pd((double *)(op0 + 2), q1); + _mm_store_pd((double *)(op0 + 4), q2); + _mm_store_pd((double *)(op0 + 6), q3); + + op0 += h; + ip0 += TSIZE; + } + + } + } +/* + size_t i,j; + for(i=0;i<w;i+=2) { + for(j=0;j<h;j+=2) { +// out[i*h + j] = in[j*w + i]; + __m128d q0 = _mm_load_pd((double *)(in + j*w + i)); + __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w)); + __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); + __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); + _mm_store_pd((double *)(out + i*h + j), t0); + _mm_store_pd((double *)(out + i*h + j + h), t1); 
+ } + } +*/ +#endif +#endif + +} + +void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out) { + + uint64_t *din = (uint64_t *)in; + uint64_t *buf = p->buf; + uint64_t *dout = (uint64_t *)out; + + size_t i,j; + for(i=0;i<p->Ns[0];i++) { + p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * p->Ms[0])); + } + ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf); + + for(i=1;i<p->rank;i++) { + for(j=0;j<p->Ns[i];j++) { + p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i])); + } + ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf); + } +} + +ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign) { + size_t vol = 1; + + ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); + + p->transform = &ffts_execute_nd; + p->destroy = &ffts_free_nd; + + p->rank = rank; + p->Ns = malloc(sizeof(size_t) * rank); + p->Ms = malloc(sizeof(size_t) * rank); + p->plans = malloc(sizeof(ffts_plan_t **) * rank); + int i; + for(i=0;i<rank;i++) { + p->Ns[i] = Ns[i]; + vol *= Ns[i]; + } + p->buf = valloc(sizeof(float) * 2 * vol); + + for(i=0;i<rank;i++) { + p->Ms[i] = vol / p->Ns[i]; + + p->plans[i] = NULL; + int k; + for(k=0;k<i;k++) { + if(p->Ms[k] == p->Ms[i]) + p->plans[i] = p->plans[k]; + } + + if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); + } + + p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8); + return p; +} + + +ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign) { + size_t Ns[2]; + Ns[0] = N1; + Ns[1] = N2; + return ffts_init_nd(2, Ns, sign); +} diff --git a/lib/ffts/src/ffts_nd.h b/lib/ffts/src/ffts_nd.h new file mode 100644 index 0000000..8f0c855 --- /dev/null +++ b/lib/ffts/src/ffts_nd.h @@ -0,0 +1,58 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
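ffts_init_nd()/ffts_init_2d() above assemble a multi-dimensional plan out of 1-D plans, transforming along one rank at a time and transposing between ranks. A small usage sketch, illustrative only: row-major interleaved complex floats are assumed, with buffers sized 2*N1*N2 floats as in the valloc() call in ffts_init_nd().

/* Usage sketch (illustrative only, not part of this patch). */
#include "ffts.h"
#include "ffts_nd.h"

static float in2d [2 * 32 * 32] __attribute__((aligned(32)));  /* 32x32 complex, row-major */
static float out2d[2 * 32 * 32] __attribute__((aligned(32)));

static int demo_forward_2d(void)
{
    ffts_plan_t *p = ffts_init_2d(32, 32, -1);  /* sign < 0: forward */
    ffts_execute(p, in2d, out2d);  /* dispatches to ffts_execute_nd() via p->transform */
    ffts_free(p);                  /* dispatches to ffts_free_nd() via p->destroy */
    return 0;
}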
+ +*/ + +#ifndef __FFTS_ND_H__ +#define __FFTS_ND_H__ + +#include <stdint.h> +#include <stddef.h> +#include <stdio.h> + +#include "ffts.h" + +#ifdef HAVE_NEON + #include <arm_neon.h> +#endif +#ifdef HAVE_SSE + #include <xmmintrin.h> +#endif + +void ffts_free_nd(ffts_plan_t *p); +void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf); + +void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out); +ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign); +ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign); + +#endif + diff --git a/lib/ffts/src/ffts_real.c b/lib/ffts/src/ffts_real.c new file mode 100644 index 0000000..bdb6eac --- /dev/null +++ b/lib/ffts/src/ffts_real.c @@ -0,0 +1,226 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "ffts_real.h" + +void ffts_free_1d_real(ffts_plan_t *p) { + ffts_free(p->plans[0]); + free(p->A); + free(p->B); + free(p->plans); + free(p->buf); + free(p); +} + +void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) { + float *out = (float *)vout; + float *buf = (float *)p->buf; + float *A = p->A; + float *B = p->B; + + p->plans[0]->transform(p->plans[0], vin, buf); + + size_t N = p->N; + buf[N] = buf[0]; + buf[N+1] = buf[1]; + + float *p_buf0 = buf; + float *p_buf1 = buf + N - 2; + float *p_out = out; + + size_t i; +#ifdef __ARM_NEON__ + for(i=0;i<N/2;i+=2) { + __asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t" + "vld1.32 {q9}, [%[pb], :128]!\n\t" + "vld1.32 {q10}, [%[buf0], :128]!\n\t" + "vld1.32 {q11}, [%[buf1], :64]\n\t" + "sub %[buf1], %[buf1], #16\n\t" + + "vdup.32 d26, d16[1]\n\t" + "vdup.32 d27, d17[1]\n\t" + "vdup.32 d24, d16[0]\n\t" + "vdup.32 d25, d17[0]\n\t" + + "vdup.32 d30, d23[1]\n\t" + "vdup.32 d31, d22[1]\n\t" + "vdup.32 d28, d23[0]\n\t" + "vdup.32 d29, d22[0]\n\t" + + "vmul.f32 q13, q13, q10\n\t" + "vmul.f32 q15, q15, q9\n\t" + "vmul.f32 q12, q12, q10\n\t" + "vmul.f32 q14, q14, q9\n\t" + "vrev64.f32 q13, q13\n\t" + "vrev64.f32 q15, q15\n\t" + + "vtrn.32 d26, d27\n\t" + "vtrn.32 d30, d31\n\t" + "vneg.f32 d26, d26\n\t" + "vneg.f32 d31, d31\n\t" + "vtrn.32 d26, d27\n\t" + "vtrn.32 d30, d31\n\t" + + "vadd.f32 q12, q12, q14\n\t" + "vadd.f32 q13, q13, q15\n\t" + "vadd.f32 q12, q12, q13\n\t" + "vst1.32 {q12}, [%[pout], :128]!\n\t" + : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), + [pout] "+r" (p_out) + : + : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +#else + for(i=0;i<N/2;i++) { + out[2*i] = buf[2*i]*A[2*i] - buf[2*i+1]*A[2*i+1] + buf[N-2*i]*B[2*i] + buf[N-2*i+1]*B[2*i+1]; + out[2*i+1] = buf[2*i+1]*A[2*i] + buf[2*i]*A[2*i+1] + buf[N-2*i]*B[2*i+1] - buf[N-2*i+1]*B[2*i]; + +// out[2*N-2*i] = out[2*i]; +// out[2*N-2*i+1] = -out[2*i+1]; + +#endif + } + + out[N] = buf[0] - buf[1]; + out[N+1] = 0.0f; + +} + +void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) { + float *out = (float *)vout; + float *in = (float *)vin; + float *buf = (float *)p->buf; + float *A = p->A; + float *B = p->B; + size_t N = p->N; + + float *p_buf0 = in; + float *p_buf1 = in + N - 2; + + float *p_out = buf; + + size_t i; +#ifdef __ARM_NEON__ + for(i=0;i<N/2;i+=2) { + __asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t" + "vld1.32 {q9}, [%[pb], :128]!\n\t" + "vld1.32 {q10}, [%[buf0], :128]!\n\t" + "vld1.32 {q11}, [%[buf1], :64]\n\t" + "sub %[buf1], %[buf1], #16\n\t" + + "vdup.32 d26, d16[1]\n\t" + "vdup.32 d27, d17[1]\n\t" + "vdup.32 d24, d16[0]\n\t" + "vdup.32 d25, d17[0]\n\t" + + "vdup.32 d30, d23[1]\n\t" + "vdup.32 d31, d22[1]\n\t" + "vdup.32 d28, d23[0]\n\t" + "vdup.32 d29, d22[0]\n\t" + + "vmul.f32 q13, q13, q10\n\t" + "vmul.f32 q15, q15, q9\n\t" + "vmul.f32 q12, q12, q10\n\t" + "vmul.f32 q14, q14, q9\n\t" + "vrev64.f32 q13, q13\n\t" + "vrev64.f32 q15, q15\n\t" + + "vtrn.32 d26, d27\n\t" + "vtrn.32 d28, d29\n\t" + "vneg.f32 d27, d27\n\t" + "vneg.f32 d29, d29\n\t" + "vtrn.32 d26, d27\n\t" + "vtrn.32 d28, d29\n\t" + + "vadd.f32 q12, q12, q14\n\t" + "vsub.f32 q13, q13, q15\n\t" + "vadd.f32 q12, q12, q13\n\t" + "vst1.32 {q12}, [%[pout], :128]!\n\t" + : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), + [pout] "+r" (p_out) + : + : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + + +#else + for(i=0;i<N/2;i++) { + buf[2*i] = in[2*i]*A[2*i] + 
in[2*i+1]*A[2*i+1] + in[N-2*i]*B[2*i] - in[N-2*i+1]*B[2*i+1]; + buf[2*i+1] = in[2*i+1]*A[2*i] - in[2*i]*A[2*i+1] - in[N-2*i]*B[2*i+1] - in[N-2*i+1]*B[2*i]; +#endif +} + + p->plans[0]->transform(p->plans[0], buf, out); + +} + +ffts_plan_t *ffts_init_1d_real(size_t N, int sign) { + ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); + + if(sign < 0) p->transform = &ffts_execute_1d_real; + else p->transform = &ffts_execute_1d_real_inv; + + p->destroy = &ffts_free_1d_real; + p->N = N; + p->rank = 1; + p->plans = malloc(sizeof(ffts_plan_t **) * 1); + + p->plans[0] = ffts_init_1d(N/2, sign); + + p->buf = valloc(sizeof(float) * 2 * ((N/2) + 1)); + + p->A = valloc(sizeof(float) * N); + p->B = valloc(sizeof(float) * N); + + if(sign < 0) { + int i; + for (i = 0; i < N/2; i++) { + p->A[2 * i] = 0.5 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i)); + p->A[2 * i + 1] = 0.5 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i)); + p->B[2 * i] = 0.5 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i)); + p->B[2 * i + 1] = 0.5 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i)); + } + }else{ + int i; + for (i = 0; i < N/2; i++) { + p->A[2 * i] = 1.0 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i)); + p->A[2 * i + 1] = 1.0 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i)); + p->B[2 * i] = 1.0 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i)); + p->B[2 * i + 1] = 1.0 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i)); + } + } + + return p; +} + + diff --git a/lib/ffts/src/ffts_real.h b/lib/ffts/src/ffts_real.h new file mode 100644 index 0000000..bf8834d --- /dev/null +++ b/lib/ffts/src/ffts_real.h @@ -0,0 +1,53 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
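ffts_init_1d_real() above wraps a half-length complex plan together with the A/B post-processing coefficients. A usage sketch for the forward (real-to-complex) direction, illustrative only: the input is assumed to be N real floats and the output N/2+1 interleaved complex values, which matches the 2*((N/2)+1) floats the plan allocates for its internal buffer and the out[N], out[N+1] Nyquist write in ffts_execute_1d_real().

/* Usage sketch (illustrative only, not part of this patch). */
#include "ffts_real.h"

static int demo_real_forward_64(void)
{
    float __attribute__((aligned(32))) in[64];                 /* N real samples */
    float __attribute__((aligned(32))) out[2 * (64 / 2 + 1)];  /* N/2+1 complex bins */
    int i;

    for (i = 0; i < 64; i++)
        in[i] = (float)i;

    ffts_plan_t *p = ffts_init_1d_real(64, -1);  /* sign < 0: real -> half spectrum */
    ffts_execute(p, in, out);
    ffts_free(p);                                /* dispatches to ffts_free_1d_real() */
    return 0;
}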
+ +*/ + +#ifndef __FFTS_REAL_H__ +#define __FFTS_REAL_H__ + +#include <stdint.h> +#include <stddef.h> +#include <stdio.h> + +#include "ffts.h" + +#ifdef HAVE_NEON + #include <arm_neon.h> +#endif +#ifdef HAVE_SSE + #include <xmmintrin.h> +#endif + +ffts_plan_t *ffts_init_1d_real(size_t N, int sign); + +#endif + diff --git a/lib/ffts/src/ffts_real_nd.c b/lib/ffts/src/ffts_real_nd.c new file mode 100644 index 0000000..bf46254 --- /dev/null +++ b/lib/ffts/src/ffts_real_nd.c @@ -0,0 +1,177 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
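The opposite direction goes through ffts_execute_1d_real_inv() above: a plan created with a positive sign takes N/2+1 interleaved complex values back to N real samples. A sketch, illustrative only; note that none of the routines above divide by N, so any normalisation is left to the caller.

/* Usage sketch (illustrative only, not part of this patch). */
#include "ffts_real.h"

static int demo_real_inverse_64(void)
{
    float __attribute__((aligned(32))) spectrum[2 * (64 / 2 + 1)] = { 0 };  /* N/2+1 complex bins */
    float __attribute__((aligned(32))) samples[64];                         /* N real samples */

    /* spectrum[] would normally come from a forward real transform */
    ffts_plan_t *p = ffts_init_1d_real(64, 1);  /* sign > 0: half spectrum -> real */
    ffts_execute(p, spectrum, samples);
    ffts_free(p);
    return 0;
}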
+ +*/ + +#include "ffts_real_nd.h" + +#ifdef __ARM_NEON__ +#include "neon.h" +#endif + +void ffts_free_nd_real(ffts_plan_t *p) { + + int i; + for(i=0;i<p->rank;i++) { + + ffts_plan_t *x = p->plans[i]; + + int k; + for(k=i+1;k<p->rank;k++) { + if(x == p->plans[k]) p->plans[k] = NULL; + } + + if(x) ffts_free(x); + } + + free(p->Ns); + free(p->Ms); + free(p->plans); + free(p->buf); + free(p->transpose_buf); + free(p); +} + +void ffts_scalar_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) { + + size_t i,j; + for(i=0;i<w;i+=1) { + for(j=0;j<h;j+=1) { + out[i*h + j] = in[j*w + i]; + } + } + +} + +void ffts_execute_nd_real(ffts_plan_t *p, const void * in, void * out) { + + uint32_t *din = (uint32_t *)in; + uint64_t *buf = p->buf; + uint64_t *dout = (uint64_t *)out; + + size_t i,j; + for(i=0;i<p->Ns[0];i++) { + p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * (p->Ms[0] / 2 + 1))); + } + ffts_scalar_transpose(buf, dout, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf); + + for(i=1;i<p->rank;i++) { + for(j=0;j<p->Ns[i];j++) { + p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i])); + } + ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf); + } +} + +void ffts_execute_nd_real_inv(ffts_plan_t *p, const void * in, void * out) { + + uint64_t *din = (uint64_t *)in; + uint64_t *buf = p->buf; + uint64_t *dout = (uint64_t *)out; + + float *bufr = (float *)(p->buf); + float *doutr = (float *)out; + + size_t i,j; + ffts_scalar_transpose(din, buf, p->Ms[0], p->Ns[0], p->transpose_buf); + + for(i=0;i<p->Ms[0];i++) { + p->plans[0]->transform(p->plans[0], buf + (i * p->Ns[0]), dout + (i * p->Ns[0])); + } + + ffts_scalar_transpose(dout, buf, p->Ns[0], p->Ms[0], p->transpose_buf); + for(j=0;j<p->Ms[1];j++) { + p->plans[1]->transform(p->plans[1], buf + (j * (p->Ms[0])), &doutr[j * p->Ns[1]]); + } +} + +ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) { + size_t vol = 1; + + ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); + + if(sign < 0) p->transform = &ffts_execute_nd_real; + else p->transform = &ffts_execute_nd_real_inv; + + p->destroy = &ffts_free_nd_real; + + p->rank = rank; + p->Ns = malloc(sizeof(size_t) * rank); + p->Ms = malloc(sizeof(size_t) * rank); + p->plans = malloc(sizeof(ffts_plan_t **) * rank); + int i; + for(i=0;i<rank;i++) { + p->Ns[i] = Ns[i]; + vol *= Ns[i]; + } + p->buf = valloc(sizeof(float) * 2 * vol); + + for(i=0;i<rank;i++) { + p->Ms[i] = vol / p->Ns[i]; + + p->plans[i] = NULL; + int k; + + if(sign < 0) { + for(k=1;k<i;k++) { + if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k]; + } + if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign); + else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); + }else{ + for(k=0;k<i;k++) { + if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k]; + } + if(i==rank-1) p->plans[i] = ffts_init_1d_real(p->Ns[i], sign); + else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign); + } + } + if(sign < 0) { + for(i=1;i<rank;i++) { + p->Ns[i] = p->Ns[i] / 2 + 1; + } + }else{ + for(i=0;i<rank-1;i++) { + p->Ms[i] = p->Ms[i] / 2 + 1; + } + } + + p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8); + return p; +} + + +ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) { + size_t Ns[2]; + Ns[0] = N1; + Ns[1] = N2; + return ffts_init_nd_real(2, Ns, sign); +} diff --git a/lib/ffts/src/ffts_real_nd.h b/lib/ffts/src/ffts_real_nd.h new file mode 100644 index 0000000..d777d42 --- /dev/null +++ b/lib/ffts/src/ffts_real_nd.h @@ -0,0 +1,53 @@ +/* + + This file 
is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __FFTS_REAL_ND_H__ +#define __FFTS_REAL_ND_H__ + +#include <stdint.h> +#include <stddef.h> +#include <stdio.h> + +#include "ffts_nd.h" +#include "ffts_real.h" +#include "ffts.h" + +#ifdef HAVE_NEON + #include <arm_neon.h> +#endif +#ifdef HAVE_SSE + #include <xmmintrin.h> +#endif + +#endif + diff --git a/lib/ffts/src/ffts_small.c b/lib/ffts/src/ffts_small.c new file mode 100644 index 0000000..ddd2d3e --- /dev/null +++ b/lib/ffts/src/ffts_small.c @@ -0,0 +1,156 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> + Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. 
BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "ffts.h" +#include "macros.h" + +#include <stdlib.h> + +#define DEBUG(x) + +#include "ffts_small.h" + + void firstpass_16_f(ffts_plan_t * p, const void * in, void * out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + float *LUT8 = p->ws; + + L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11); + L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13); + K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13); + S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24); + K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15); + S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28); +} + + void firstpass_16_b(ffts_plan_t * p, const void * in, void * out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + float *LUT8 = p->ws; + + L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11); + L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13); + K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13); + S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24); + K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15); + S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28); +} + + + void firstpass_8_f(ffts_plan_t *p, const void *in, void *out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + V r0_1, r2_3, r4_5, r6_7; + float *LUT8 = p->ws + p->ws_is[0]; + + L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); + K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); + S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12); +} + + void firstpass_8_b(ffts_plan_t *p, const void *in, void *out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + V r0_1, r2_3, r4_5, r6_7; + float *LUT8 = p->ws + p->ws_is[0]; + + L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); + K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); + S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12); +} + + + void firstpass_4_f(ffts_plan_t *p, const void *in, void *out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + cdata_t t0, t1, t2, t3, t4, t5, t6, t7; + t0[0] = din[0]; t0[1] = din[1]; + t1[0] = din[4]; t1[1] = din[5]; + t2[0] = din[2]; t2[1] = din[3]; + t3[0] = din[6]; t3[1] = din[7]; + + t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1]; + t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1]; + t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1]; + t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1]; + + dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1]; + dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1]; + dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0]; + dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0]; +} + + void firstpass_4_b(ffts_plan_t *p, const void *in, void *out) +{ + const data_t *din = 
(const data_t *)in; + data_t *dout = (data_t *)out; + cdata_t t0, t1, t2, t3, t4, t5, t6, t7; + t0[0] = din[0]; t0[1] = din[1]; + t1[0] = din[4]; t1[1] = din[5]; + t2[0] = din[2]; t2[1] = din[3]; + t3[0] = din[6]; t3[1] = din[7]; + + t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1]; + t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1]; + t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1]; + t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1]; + + dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1]; + dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1]; + dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0]; + dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0]; +} + + void firstpass_2(ffts_plan_t *p, const void *in, void *out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + cdata_t t0, t1, r0,r1; + t0[0] = din[0]; t0[1] = din[1]; + t1[0] = din[2]; t1[1] = din[3]; + r0[0] = t0[0] + t1[0]; + r0[1] = t0[1] + t1[1]; + r1[0] = t0[0] - t1[0]; + r1[1] = t0[1] - t1[1]; + dout[0] = r0[0]; dout[1] = r0[1]; + dout[2] = r1[0]; dout[3] = r1[1]; +} diff --git a/lib/ffts/src/ffts_small.h b/lib/ffts/src/ffts_small.h new file mode 100644 index 0000000..76cadf5 --- /dev/null +++ b/lib/ffts/src/ffts_small.h @@ -0,0 +1,13 @@ +#ifndef __FFTS_SMALL_H__ +#define __FFTS_SMALL_H__ + + +void firstpass_16_f(ffts_plan_t * p, const void * in, void * out); +void firstpass_16_b(ffts_plan_t * p, const void * in, void * out); +void firstpass_8_f(ffts_plan_t * p, const void * in, void * out); +void firstpass_8_b(ffts_plan_t * p, const void * in, void * out); +void firstpass_4_f(ffts_plan_t * p, const void * in, void * out); +void firstpass_4_b(ffts_plan_t * p, const void * in, void * out); +void firstpass_2(ffts_plan_t * p, const void * in, void * out); + +#endif diff --git a/lib/ffts/src/ffts_static.c b/lib/ffts/src/ffts_static.c new file mode 100644 index 0000000..3edf2ea --- /dev/null +++ b/lib/ffts/src/ffts_static.c @@ -0,0 +1,101 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. 
BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +#include "ffts_static.h" + +void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) { + if(N > 16) { + size_t N1 = N >> 1; + size_t N2 = N >> 2; + size_t N3 = N >> 3; + float *ws = ((float *)(p->ws)) + (p->ws_is[__builtin_ctzl(N)-4] << 1); + + ffts_static_rec_i(p, data, N2); + ffts_static_rec_i(p, data + N1, N3); + ffts_static_rec_i(p, data + N1 + N2, N3); + ffts_static_rec_i(p, data + N, N2); + ffts_static_rec_i(p, data + N + N1, N2); + + if(N == p->N) { + neon_static_x8_t_i(data, N, ws); + }else{ + neon_static_x8_i(data, N, ws); + } + + }else if(N==16){ + neon_static_x4_i(data, N, p->ws); + } + +} +void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) { + if(N > 16) { + size_t N1 = N >> 1; + size_t N2 = N >> 2; + size_t N3 = N >> 3; + float *ws = ((float *)(p->ws)) + (p->ws_is[__builtin_ctzl(N)-4] << 1); + + ffts_static_rec_f(p, data, N2); + ffts_static_rec_f(p, data + N1, N3); + ffts_static_rec_f(p, data + N1 + N2, N3); + ffts_static_rec_f(p, data + N, N2); + ffts_static_rec_f(p, data + N + N1, N2); + + if(N == p->N) { + neon_static_x8_t_f(data, N, ws); + }else{ + neon_static_x8_f(data, N, ws); + } + + }else if(N==16){ + neon_static_x4_f(data, N, p->ws); + } + +} + +void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out) { + + if(__builtin_ctzl(p->N) & 1) + neon_static_o_f(p, in, out); + else + neon_static_e_f(p, in, out); + ffts_static_rec_f(p, out, p->N); +} + + +void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out) { + + if(__builtin_ctzl(p->N) & 1) + neon_static_o_i(p, in, out); + else + neon_static_e_i(p, in, out); + ffts_static_rec_i(p, out, p->N); +} diff --git a/lib/ffts/src/ffts_static.h b/lib/ffts/src/ffts_static.h new file mode 100644 index 0000000..4490bde --- /dev/null +++ b/lib/ffts/src/ffts_static.h @@ -0,0 +1,46 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. 
BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __FFTS_STATIC_H__ +#define __FFTS_STATIC_H__ + +#include "ffts.h" +#include "neon.h" + +void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) ; +void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out); + +void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) ; +void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out); + +#endif diff --git a/lib/ffts/src/macros-alpha.h b/lib/ffts/src/macros-alpha.h new file mode 100644 index 0000000..06daf4a --- /dev/null +++ b/lib/ffts/src/macros-alpha.h @@ -0,0 +1,206 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> + Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
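When DYNAMIC_DISABLED is set, ffts_static_transform_f/_i above pick the first pass from the parity of log2(N), computed with __builtin_ctzl() since N is a power of two, before handing the rest of the work to ffts_static_rec_*. A small stand-alone illustration of that dispatch rule (illustrative only, not part of this patch):

/* Illustration only, not part of this patch. */
#include <stdio.h>

int main(void)
{
    unsigned long N;
    for (N = 32; N <= 512; N *= 2) {
        int log2N = __builtin_ctzl(N);  /* valid because N is a power of two */
        printf("N=%4lu  log2(N)=%d  first pass: %s\n", N, log2N,
               (log2N & 1) ? "odd  -> neon_static_o_*" : "even -> neon_static_e_*");
    }
    return 0;
}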
+ +*/ + +#ifndef __MACROS_ALPHA_H__ +#define __MACROS_ALPHA_H__ + +#include <math.h> + +#ifdef __alpha__ +#define restrict +#endif + +typedef struct {float r1, i1, r2, i2;} V; + +#define FFTS_MALLOC(d,a) malloc(d) +#define FFTS_FREE(d) free(d) + +#define VLIT4(f3,f2,f1,f0) ((V){f0,f1,f2,f3}) + +static inline V VADD(V x, V y) +{ + V z; + z.r1 = x.r1 + y.r1; + z.i1 = x.i1 + y.i1; + z.r2 = x.r2 + y.r2; + z.i2 = x.i2 + y.i2; + return z; +} + + +static inline V VSUB(V x, V y) +{ + V z; + z.r1 = x.r1 - y.r1; + z.i1 = x.i1 - y.i1; + z.r2 = x.r2 - y.r2; + z.i2 = x.i2 - y.i2; + return z; +} + + +static inline V VMUL(V x, V y) +{ + V z; + z.r1 = x.r1 * y.r1; + z.i1 = x.i1 * y.i1; + z.r2 = x.r2 * y.r2; + z.i2 = x.i2 * y.i2; + return z; +} + +static inline V VXOR(V x, V y) +{ + V r; + r.r1 = (uint32_t)x.r1 ^ (uint32_t)y.r1; + r.i1 = (uint32_t)x.i1 ^ (uint32_t)y.i1; + r.r2 = (uint32_t)x.r2 ^ (uint32_t)y.r2; + r.i2 = (uint32_t)x.i2 ^ (uint32_t)y.i2; + return r; +} + +static inline V VSWAPPAIRS(V x) +{ + V z; + z.r1 = x.i1; + z.i1 = x.r1; + z.r2 = x.i2; + z.i2 = x.r2; + return z; +} + + +static inline V VBLEND(V x, V y) +{ + V z; + z.r1 = x.r1; + z.i1 = x.i1; + z.r2 = y.r2; + z.i2 = y.i2; + return z; +} + +static inline V VUNPACKHI(V x, V y) +{ + V z; + z.r1 = x.r2; + z.i1 = x.i2; + z.r2 = y.r2; + z.i2 = y.i2; + return z; +} + +static inline V VUNPACKLO(V x, V y) +{ + V z; + z.r1 = x.r1; + z.i1 = x.i1; + z.r2 = y.r1; + z.i2 = y.i1; + return z; +} + +static inline V VDUPRE(V x) +{ + V z; + z.r1 = x.r1; + z.i1 = x.r1; + z.r2 = x.r2; + z.i2 = x.r2; + return z; +} + +static inline V VDUPIM(V x) +{ + V z; + z.r1 = x.i1; + z.i1 = x.i1; + z.r2 = x.i2; + z.i2 = x.i2; + return z; +} + +static inline V IMUL(V d, V re, V im) +{ + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VSUB(re, im); +} + + +static inline V IMULJ(V d, V re, V im) +{ + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VADD(re, im); +} + +static inline V MULI(int inv, V x) +{ + V z; + + if (inv) { + z.r1 = -x.r1; + z.i1 = x.i1; + z.r2 = -x.r2; + z.i2 = x.i2; + }else{ + z.r1 = x.r1; + z.i1 = -x.i1; + z.r2 = x.r2; + z.i2 = -x.i2; + } + return z; +} + + +static inline V IMULI(int inv, V x) +{ + return VSWAPPAIRS(MULI(inv, x)); +} + + +static inline V VLD(const void *s) +{ + V *d = (V *)s; + return *d; +} + + +static inline void VST(void *d, V s) +{ + V *r = (V *)d; + *r = s; +} + +#endif diff --git a/lib/ffts/src/macros-altivec.h b/lib/ffts/src/macros-altivec.h new file mode 100644 index 0000000..0d148a5 --- /dev/null +++ b/lib/ffts/src/macros-altivec.h @@ -0,0 +1,137 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> + Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
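macros-alpha.h above is the plain-C fallback for the SIMD abstraction: a V value packs two interleaved complex floats (r1, i1, r2, i2), so one VADD/VSUB pair can perform a radix-2 butterfly on two complex values at once, the vector counterpart of the t0+t1 / t0-t1 pair in firstpass_2(). A tiny self-contained check, illustrative only; <stdint.h> is included explicitly here because the header's VXOR helper uses uint32_t without including it.

/* Illustration only, not part of this patch. */
#include <stdint.h>   /* macros-alpha.h uses uint32_t in VXOR but does not include this */
#include <stdio.h>
#include "macros-alpha.h"

int main(void)
{
    /* VLIT4(f3,f2,f1,f0) stores {f0,f1,f2,f3}: a = (1+2i, 3+4i), b = (5+6i, 7+8i) */
    V a = VLIT4(4.0f, 3.0f, 2.0f, 1.0f);
    V b = VLIT4(8.0f, 7.0f, 6.0f, 5.0f);

    V sum  = VADD(a, b);   /* (6+8i, 10+12i) */
    V diff = VSUB(a, b);   /* (-4-4i, -4-4i) */

    printf("sum  = (%g%+gi, %g%+gi)\n", sum.r1,  sum.i1,  sum.r2,  sum.i2);
    printf("diff = (%g%+gi, %g%+gi)\n", diff.r1, diff.i1, diff.r2, diff.i2);
    return 0;
}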
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __MACROS_ALTIVEC_H__ +#define __MACROS_ALTIVEC_H__ + +#include <math.h> +#include <altivec.h> + +#define restrict + +typedef vector float V; +typedef vector unsigned char VUC; + +#ifdef __apple__ +#define FFTS_MALLOC(d,a) vec_malloc(d) +#define FFTS_FREE(d) vec_free(d) +#else +/* It appears vec_malloc() and friends are not implemented on Linux */ +#include <malloc.h> +#define FFTS_MALLOC(d,a) memalign(16,d) +#define FFTS_FREE(d) free(d) +#endif + +#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3}) + +#define VADD(x,y) vec_add(x,y) +#define VSUB(x,y) vec_sub(x,y) +#define VMUL(x,y) vec_madd(x,y,(V){0}) +#define VMULADD(x,y,z) vec_madd(x,y,z) +#define VNMULSUB(x,y,z) vec_nmsub(x,y,z) +#define VXOR(x,y) vec_xor((x),(y)) +#define VSWAPPAIRS(x) \ + vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03, \ + 0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b}) + +#define VBLEND(x,y) \ + vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \ + 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}) + +#define VUNPACKHI(x,y) \ + vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \ + 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}) + +#define VUNPACKLO(x,y) \ + vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \ + 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}) + +#define VDUPRE(x) \ + vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03, \ + 0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b}) + +#define VDUPIM(x) \ + vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07, \ + 0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f}) + + +static inline V IMUL(V d, V re, V im) +{ + im = VMUL(im, VSWAPPAIRS(d)); + re = VMUL(re, d); + return VSUB(re, im); +} + + +static inline V IMULJ(V d, V re, V im) +{ + im = VMUL(im, VSWAPPAIRS(d)); + return VMULADD(re, d, im); +} + +#ifndef __GNUC__ +/* gcc (4.6 and 4.7) ICEs on this code! */ +static inline V MULI(int inv, V x) +{ + return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f)); +} +#else +/* but compiles this fine... */ +static inline V MULI(int inv, V x) +{ + V t; + t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f); + return VXOR(x, t); +} +#endif + + +static inline V IMULI(int inv, V x) +{ + return VSWAPPAIRS(MULI(inv, x)); +} + + +static inline V VLD(const void *s) +{ + V *d = (V *)s; + return *d; +} + + +static inline void VST(void *d, V s) +{ + V *r = (V *)d; + *r = s; +} +#endif diff --git a/lib/ffts/src/macros-neon.h b/lib/ffts/src/macros-neon.h new file mode 100644 index 0000000..0750b75 --- /dev/null +++ b/lib/ffts/src/macros-neon.h @@ -0,0 +1,96 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> + + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +#ifndef __MACROS_NEON_H__ +#define __MACROS_NEON_H__ + +#include "neon.h" +#include <arm_neon.h> + +typedef float32x4_t V; + +typedef float32x4x2_t VS; + +#define ADD vaddq_f32 +#define SUB vsubq_f32 +#define MUL vmulq_f32 +#define VADD vaddq_f32 +#define VSUB vsubq_f32 +#define VMUL vmulq_f32 +#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y)))) +#define VST vst1q_f32 +#define VLD vld1q_f32 +#define VST2 vst2q_f32 +#define VLD2 vld2q_f32 + +#define VSWAPPAIRS(x) (vrev64q_f32(x)) + +#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b))) +#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b))) + +#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y))) + +__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) { + data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3}; + return VLD(d); +} + +#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0)) +#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1)) + +#define FFTS_MALLOC(d,a) (valloc(d)) +#define FFTS_FREE(d) (free(d)) + +__INLINE void STORESPR(data_t * addr, VS p) { + + vst1q_f32(addr, p.val[0]); + vst1q_f32(addr + 4, p.val[1]); + +} + +__INLINE V IMULI(int inv, V a) { + if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); + else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); +} + +__INLINE V IMUL(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VSUB(re, im); +} + +__INLINE V IMULJ(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VADD(re, im); +} + +#endif diff --git a/lib/ffts/src/macros-sse.h b/lib/ffts/src/macros-sse.h new file mode 100644 index 0000000..229477c --- /dev/null +++ b/lib/ffts/src/macros-sse.h @@ -0,0 +1,84 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. 
Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __SSE_FLOAT_H__ +#define __SSE_FLOAT_H__ + +#include <xmmintrin.h> + +//#define VL 4 + +typedef __m128 V; + +#define VADD _mm_add_ps +#define VSUB _mm_sub_ps +#define VMUL _mm_mul_ps +//#define VLIT4 _mm_set_ps +#define VXOR _mm_xor_ps +#define VST _mm_store_ps +#define VLD _mm_load_ps + +#define VSWAPPAIRS(x) (_mm_shuffle_ps(x,x,_MM_SHUFFLE(2,3,0,1))) + +#define VUNPACKHI(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,3,2))) +#define VUNPACKLO(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(1,0,1,0))) + +#define VBLEND(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,1,0))) + +#define VLIT4 _mm_set_ps + +#define VDUPRE(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(2,2,0,0))) +#define VDUPIM(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(3,3,1,1))) + +#define FFTS_MALLOC(d,a) (_mm_malloc(d,a)) +#define FFTS_FREE(d) (_mm_free(d)) + +__INLINE V IMULI(int inv, V a) { + if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); + else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); +} + + +__INLINE V IMUL(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VSUB(re, im); +} + +__INLINE V IMULJ(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VADD(re, im); +} + +#endif diff --git a/lib/ffts/src/macros.h b/lib/ffts/src/macros.h new file mode 100644 index 0000000..d304cec --- /dev/null +++ b/lib/ffts/src/macros.h @@ -0,0 +1,161 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> + Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __MACROS_H__ +#define __MACROS_H__ + +#ifdef HAVE_NEON +#include "macros-neon.h" +#else +#ifdef __alpha__ +#include "macros-alpha.h" +#else +#ifdef __powerpc__ +#include "macros-altivec.h" +#endif +#endif + +#endif + + +#ifdef HAVE_VFP +#include "macros-alpha.h" +#endif +#ifdef HAVE_SSE + #include "macros-sse.h" +#endif + +static inline void TX2(V *a, V *b) +{ + V TX2_t0 = VUNPACKLO(*a, *b); + V TX2_t1 = VUNPACKHI(*a, *b); + *a = TX2_t0; *b = TX2_t1; +} + +static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3) +{ + V uk, uk2, zk_p, zk_n, zk, zk_d; + uk = *r0; uk2 = *r1; + zk_p = IMUL(*r2, re, im); + zk_n = IMULJ(*r3, re, im); + + zk = VADD(zk_p, zk_n); + zk_d = IMULI(inv, VSUB(zk_p, zk_n)); + + *r2 = VSUB(uk, zk); + *r0 = VADD(uk, zk); + *r3 = VADD(uk2, zk_d); + *r1 = VSUB(uk2, zk_d); +} + + +static inline void S_4(V r0, V r1, V r2, V r3, + data_t * restrict o0, data_t * restrict o1, + data_t * restrict o2, data_t * restrict o3) +{ + VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3); +} + + +static inline void L_2_4(int inv, + const data_t * restrict i0, const data_t * restrict i1, + const data_t * restrict i2, const data_t * restrict i3, + V *r0, V *r1, V *r2, V *r3) +{ + V t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = VSUB(t2, t3); + *r0 = VUNPACKLO(t4, t5); + *r1 = VUNPACKLO(t6, t7); + t5 = IMULI(inv, t5); + t0 = VADD(t6, t4); + t2 = VSUB(t6, t4); + t1 = VSUB(t7, t5); + t3 = VADD(t7, t5); + *r3 = VUNPACKHI(t0, t1); + *r2 = VUNPACKHI(t2, t3); +} + + +static inline void L_4_4(int inv, + const data_t * restrict i0, const data_t * restrict i1, + const data_t * restrict i2, const data_t * restrict i3, + V *r0, V *r1, V *r2, V *r3) +{ + V t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = IMULI(inv, VSUB(t2, t3)); + t0 = VADD(t4, t6); + t2 = VSUB(t4, t6); + t1 = VSUB(t5, t7); + t3 = VADD(t5, t7); + TX2(&t0, &t1); + TX2(&t2, &t3); + *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; +} + + + +static inline void L_4_2(int inv, + const data_t * restrict i0, const data_t * restrict i1, + const data_t * restrict i2, const data_t * restrict i3, + V *r0, V *r1, V *r2, V *r3) +{ + V t0, t1, t2, t3, t4, t5, t6, t7; + 
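+    /* Note: the VBLEND pair below appears to exchange the upper halves of
+       the vectors loaded from i2 and i3, so that the following add/sub
+       stages compute this leaf's two half-size sub-transforms in the
+       expected lanes. */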
+ t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3); + t2 = VBLEND(t6, t7); + t3 = VBLEND(t7, t6); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = VSUB(t2, t3); + *r2 = VUNPACKHI(t4, t5); + *r3 = VUNPACKHI(t6, t7); + t7 = IMULI(inv, t7); + t0 = VADD(t4, t6); + t2 = VSUB(t4, t6); + t1 = VSUB(t5, t7); + t3 = VADD(t5, t7); + *r0 = VUNPACKLO(t0, t1); + *r1 = VUNPACKLO(t2, t3); +} +#endif diff --git a/lib/ffts/src/neon.h b/lib/ffts/src/neon.h new file mode 100644 index 0000000..f3132c2 --- /dev/null +++ b/lib/ffts/src/neon.h @@ -0,0 +1,65 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __NEON_H__ +#define __NEON_H__ + +#include "ffts.h" + +void neon_x4(float *, size_t, float *); +void neon_x8(float *, size_t, float *); +void neon_x8_t(float *, size_t, float *); +void neon_ee(); +void neon_oo(); +void neon_eo(); +void neon_oe(); +void neon_end(); + +void neon_transpose(uint64_t *in, uint64_t *out, int w, int h); +void neon_transpose_to_buf(uint64_t *in, uint64_t *out, int w); + +//typedef struct _ffts_plan_t ffts_plan_t; + +void neon_static_e_f(ffts_plan_t * , const void * , void * ); +void neon_static_o_f(ffts_plan_t * , const void * , void * ); +void neon_static_x4_f(float *, size_t, float *); +void neon_static_x8_f(float *, size_t, float *); +void neon_static_x8_t_f(float *, size_t, float *); + +void neon_static_e_i(ffts_plan_t * , const void * , void * ); +void neon_static_o_i(ffts_plan_t * , const void * , void * ); +void neon_static_x4_i(float *, size_t, float *); +void neon_static_x8_i(float *, size_t, float *); +void neon_static_x8_t_i(float *, size_t, float *); + +#endif diff --git a/lib/ffts/src/neon.s b/lib/ffts/src/neon.s new file mode 100644 index 0000000..6995066 --- /dev/null +++ b/lib/ffts/src/neon.s @@ -0,0 +1,738 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. 
Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + .align 4 +#ifdef __APPLE__ + .globl _neon_x4 +_neon_x4: +#else + .globl neon_x4 +neon_x4: +#endif +@ add r3, r0, #0 + + vld1.32 {q8,q9}, [r0, :128] + add r4, r0, r1, lsl #1 + vld1.32 {q10,q11}, [r4, :128] + add r5, r0, r1, lsl #2 + vld1.32 {q12,q13}, [r5, :128] + add r6, r4, r1, lsl #2 + vld1.32 {q14,q15}, [r6, :128] + vld1.32 {q2,q3}, [r2, :128] + + vmul.f32 q0, q13, q3 + vmul.f32 q5, q12, q2 + vmul.f32 q1, q14, q2 + vmul.f32 q4, q14, q3 + vmul.f32 q14, q12, q3 + vmul.f32 q13, q13, q2 + vmul.f32 q12, q15, q3 + vmul.f32 q2, q15, q2 + vsub.f32 q0, q5, q0 + vadd.f32 q13, q13, q14 + vadd.f32 q12, q12, q1 + vsub.f32 q1, q2, q4 + vadd.f32 q15, q0, q12 + vsub.f32 q12, q0, q12 + vadd.f32 q14, q13, q1 + vsub.f32 q13, q13, q1 + vadd.f32 q0, q8, q15 + vadd.f32 q1, q9, q14 + vsub.f32 q2, q10, q13 @ + vsub.f32 q4, q8, q15 + vadd.f32 q3, q11, q12 @ + vst1.32 {q0,q1}, [r0, :128] + vsub.f32 q5, q9, q14 + vadd.f32 q6, q10, q13 @ + vsub.f32 q7, q11, q12 @ + vst1.32 {q2,q3}, [r4, :128] + vst1.32 {q4,q5}, [r5, :128] + vst1.32 {q6,q7}, [r6, :128] + bx lr + + .align 4 +#ifdef __APPLE__ + .globl _neon_x8 +_neon_x8: +#else + .globl neon_x8 +neon_x8: +#endif + mov r11, #0 + add r3, r0, #0 @ data0 + add r5, r0, r1, lsl #1 @ data2 + add r4, r0, r1 @ data1 + add r7, r5, r1, lsl #1 @ data4 + add r6, r5, r1 @ data3 + add r9, r7, r1, lsl #1 @ data6 + add r8, r7, r1 @ data5 + add r10, r9, r1 @ data7 + add r12, r2, #0 @ LUT + + sub r11, r11, r1, lsr #5 +neon_x8_loop: + vld1.32 {q2,q3}, [r12, :128]! + vld1.32 {q14,q15}, [r6, :128] + vld1.32 {q10,q11}, [r5, :128] + adds r11, r11, #1 + vmul.f32 q12, q15, q2 + vmul.f32 q8, q14, q3 + vmul.f32 q13, q14, q2 + vmul.f32 q9, q10, q3 + vmul.f32 q1, q10, q2 + vmul.f32 q0, q11, q2 + vmul.f32 q14, q11, q3 + vmul.f32 q15, q15, q3 + vld1.32 {q2,q3}, [r12, :128]! 
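+@ Note: the eight vmul.f32 above, combined with the vsub/vadd pairs that
+@ follow, appear to form the complex multiplications of the data vectors by
+@ the twiddle factors taken from the LUT (r12); the vld1.32 just above
+@ already fetches the twiddles used later in this iteration.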
+ vsub.f32 q10, q12, q8 + vadd.f32 q11, q0, q9 + vadd.f32 q8, q15, q13 + vld1.32 {q12,q13}, [r4, :128] + vsub.f32 q9, q1, q14 + vsub.f32 q15, q11, q10 + vsub.f32 q14, q9, q8 + vsub.f32 q4, q12, q15 @ + vadd.f32 q6, q12, q15 @ + vadd.f32 q5, q13, q14 @ + vsub.f32 q7, q13, q14 @ + vld1.32 {q14,q15}, [r9, :128] + vld1.32 {q12,q13}, [r7, :128] + vmul.f32 q1, q14, q2 + vmul.f32 q0, q14, q3 + vst1.32 {q4,q5}, [r4, :128] + vmul.f32 q14, q15, q3 + vmul.f32 q4, q15, q2 + vadd.f32 q15, q9, q8 + vst1.32 {q6,q7}, [r6, :128] + vmul.f32 q8, q12, q3 + vmul.f32 q5, q13, q3 + vmul.f32 q12, q12, q2 + vmul.f32 q9, q13, q2 + vadd.f32 q14, q14, q1 + vsub.f32 q13, q4, q0 + vadd.f32 q0, q9, q8 + vld1.32 {q8,q9}, [r3, :128] + vadd.f32 q1, q11, q10 + vsub.f32 q12, q12, q5 + vadd.f32 q11, q8, q15 + vsub.f32 q8, q8, q15 + vadd.f32 q2, q12, q14 + vsub.f32 q10, q0, q13 + vadd.f32 q15, q0, q13 + vadd.f32 q13, q9, q1 + vsub.f32 q9, q9, q1 + vsub.f32 q12, q12, q14 + vadd.f32 q0, q11, q2 + vadd.f32 q1, q13, q15 + vsub.f32 q4, q11, q2 + vsub.f32 q2, q8, q10 @ + vadd.f32 q3, q9, q12 @ + vst1.32 {q0,q1}, [r3, :128]! + vsub.f32 q5, q13, q15 + vld1.32 {q14,q15}, [r10, :128] + vsub.f32 q7, q9, q12 @ + vld1.32 {q12,q13}, [r8, :128] + vst1.32 {q2,q3}, [r5, :128]! + vld1.32 {q2,q3}, [r12, :128]! + vadd.f32 q6, q8, q10 @ + vmul.f32 q8, q14, q2 + vst1.32 {q4,q5}, [r7, :128]! + vmul.f32 q10, q15, q3 + vmul.f32 q9, q13, q3 + vmul.f32 q11, q12, q2 + vmul.f32 q14, q14, q3 + vst1.32 {q6,q7}, [r9, :128]! + vmul.f32 q15, q15, q2 + vmul.f32 q12, q12, q3 + vmul.f32 q13, q13, q2 + vadd.f32 q10, q10, q8 + vsub.f32 q11, q11, q9 + vld1.32 {q8,q9}, [r4, :128] + vsub.f32 q14, q15, q14 + vadd.f32 q15, q13, q12 + vadd.f32 q13, q11, q10 + vadd.f32 q12, q15, q14 + vsub.f32 q15, q15, q14 + vsub.f32 q14, q11, q10 + vld1.32 {q10,q11}, [r6, :128] + vadd.f32 q0, q8, q13 + vadd.f32 q1, q9, q12 + vsub.f32 q2, q10, q15 @ + vadd.f32 q3, q11, q14 @ + vsub.f32 q4, q8, q13 + vst1.32 {q0,q1}, [r4, :128]! + vsub.f32 q5, q9, q12 + vadd.f32 q6, q10, q15 @ + vst1.32 {q2,q3}, [r6, :128]! + vsub.f32 q7, q11, q14 @ + vst1.32 {q4,q5}, [r8, :128]! + vst1.32 {q6,q7}, [r10, :128]! + bne neon_x8_loop + + bx lr + + .align 4 +#ifdef __APPLE__ + .globl _neon_x8_t +_neon_x8_t: +#else + .globl neon_x8_t +neon_x8_t: +#endif + mov r11, #0 + add r3, r0, #0 @ data0 + add r5, r0, r1, lsl #1 @ data2 + add r4, r0, r1 @ data1 + add r7, r5, r1, lsl #1 @ data4 + add r6, r5, r1 @ data3 + add r9, r7, r1, lsl #1 @ data6 + add r8, r7, r1 @ data5 + add r10, r9, r1 @ data7 + add r12, r2, #0 @ LUT + + sub r11, r11, r1, lsr #5 +neon_x8_t_loop: + vld1.32 {q2,q3}, [r12, :128]! + vld1.32 {q14,q15}, [r6, :128] + vld1.32 {q10,q11}, [r5, :128] + adds r11, r11, #1 + vmul.f32 q12, q15, q2 + vmul.f32 q8, q14, q3 + vmul.f32 q13, q14, q2 + vmul.f32 q9, q10, q3 + vmul.f32 q1, q10, q2 + vmul.f32 q0, q11, q2 + vmul.f32 q14, q11, q3 + vmul.f32 q15, q15, q3 + vld1.32 {q2,q3}, [r12, :128]! 
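+@ Note: this transposed variant appears to mirror neon_x8 above; the
+@ butterfly body is identical, the visible difference being that results
+@ are written back with vst2.32 (interleaving on store) rather than vst1.32.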
+ vsub.f32 q10, q12, q8 + vadd.f32 q11, q0, q9 + vadd.f32 q8, q15, q13 + vld1.32 {q12,q13}, [r4, :128] + vsub.f32 q9, q1, q14 + vsub.f32 q15, q11, q10 + vsub.f32 q14, q9, q8 + vsub.f32 q4, q12, q15 @ + vadd.f32 q6, q12, q15 @ + vadd.f32 q5, q13, q14 @ + vsub.f32 q7, q13, q14 @ + vld1.32 {q14,q15}, [r9, :128] + vld1.32 {q12,q13}, [r7, :128] + vmul.f32 q1, q14, q2 + vmul.f32 q0, q14, q3 + vst1.32 {q4,q5}, [r4, :128] + vmul.f32 q14, q15, q3 + vmul.f32 q4, q15, q2 + vadd.f32 q15, q9, q8 + vst1.32 {q6,q7}, [r6, :128] + vmul.f32 q8, q12, q3 + vmul.f32 q5, q13, q3 + vmul.f32 q12, q12, q2 + vmul.f32 q9, q13, q2 + vadd.f32 q14, q14, q1 + vsub.f32 q13, q4, q0 + vadd.f32 q0, q9, q8 + vld1.32 {q8,q9}, [r3, :128] + vadd.f32 q1, q11, q10 + vsub.f32 q12, q12, q5 + vadd.f32 q11, q8, q15 + vsub.f32 q8, q8, q15 + vadd.f32 q2, q12, q14 + vsub.f32 q10, q0, q13 + vadd.f32 q15, q0, q13 + vadd.f32 q13, q9, q1 + vsub.f32 q9, q9, q1 + vsub.f32 q12, q12, q14 + vadd.f32 q0, q11, q2 + vadd.f32 q1, q13, q15 + vsub.f32 q4, q11, q2 + vsub.f32 q2, q8, q10 @ + vadd.f32 q3, q9, q12 @ + vst2.32 {q0,q1}, [r3, :128]! + vsub.f32 q5, q13, q15 + vld1.32 {q14,q15}, [r10, :128] + vsub.f32 q7, q9, q12 @ + vld1.32 {q12,q13}, [r8, :128] + vst2.32 {q2,q3}, [r5, :128]! + vld1.32 {q2,q3}, [r12, :128]! + vadd.f32 q6, q8, q10 @ + vmul.f32 q8, q14, q2 + vst2.32 {q4,q5}, [r7, :128]! + vmul.f32 q10, q15, q3 + vmul.f32 q9, q13, q3 + vmul.f32 q11, q12, q2 + vmul.f32 q14, q14, q3 + vst2.32 {q6,q7}, [r9, :128]! + vmul.f32 q15, q15, q2 + vmul.f32 q12, q12, q3 + vmul.f32 q13, q13, q2 + vadd.f32 q10, q10, q8 + vsub.f32 q11, q11, q9 + vld1.32 {q8,q9}, [r4, :128] + vsub.f32 q14, q15, q14 + vadd.f32 q15, q13, q12 + vadd.f32 q13, q11, q10 + vadd.f32 q12, q15, q14 + vsub.f32 q15, q15, q14 + vsub.f32 q14, q11, q10 + vld1.32 {q10,q11}, [r6, :128] + vadd.f32 q0, q8, q13 + vadd.f32 q1, q9, q12 + vsub.f32 q2, q10, q15 @ + vadd.f32 q3, q11, q14 @ + vsub.f32 q4, q8, q13 + vst2.32 {q0,q1}, [r4, :128]! + vsub.f32 q5, q9, q12 + vadd.f32 q6, q10, q15 @ + vst2.32 {q2,q3}, [r6, :128]! + vsub.f32 q7, q11, q14 @ + vst2.32 {q4,q5}, [r8, :128]! + vst2.32 {q6,q7}, [r10, :128]! + bne neon_x8_t_loop + + @bx lr + +@ assumes r0 = out +@ r1 = in ? +@ +@ r12 = offsets +@ r3-r10 = data pointers +@ r11 = loop iterations +@ r2 & lr = temps + .align 4 +#ifdef __APPLE__ + .globl _neon_ee +_neon_ee: +#else + .globl neon_ee +neon_ee: +#endif + vld1.32 {d16, d17}, [r2, :128] +_neon_ee_loop: + vld2.32 {q15}, [r10, :128]! + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vld2.32 {q9}, [r4, :128]! + vld2.32 {q10}, [r3, :128]! + vld2.32 {q11}, [r6, :128]! + vld2.32 {q12}, [r5, :128]! + vsub.f32 q1, q14, q13 + vld2.32 {q0}, [r9, :128]! 
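+@ Note: the vld2.32 loads above de-interleave each input block into separate
+@ real/imaginary halves; per the register notes preceding this routine, r11
+@ appears to carry the remaining loop count (decremented by the subs below)
+@ and r12 the table of output offsets read by the ldr instructions further on.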
+ subs r11, r11, #1 + vsub.f32 q2, q0, q15 + vadd.f32 q0, q0, q15 + vmul.f32 d10, d2, d17 + vmul.f32 d11, d3, d16 + vmul.f32 d12, d3, d17 + vmul.f32 d6, d4, d17 + vmul.f32 d7, d5, d16 + vmul.f32 d8, d4, d16 + vmul.f32 d9, d5, d17 + vmul.f32 d13, d2, d16 + vsub.f32 d7, d7, d6 + vadd.f32 d11, d11, d10 + vsub.f32 q1, q12, q11 + vsub.f32 q2, q10, q9 + vadd.f32 d6, d9, d8 + vadd.f32 q4, q14, q13 + vadd.f32 q11, q12, q11 + vadd.f32 q12, q10, q9 + vsub.f32 d10, d13, d12 + vsub.f32 q7, q4, q0 + vsub.f32 q9, q12, q11 + vsub.f32 q13, q5, q3 + vadd.f32 d29, d5, d2 @ + vadd.f32 q5, q5, q3 + vadd.f32 q10, q4, q0 + vadd.f32 q11, q12, q11 + vsub.f32 d31, d5, d2 @ + vsub.f32 d28, d4, d3 @ + vadd.f32 d30, d4, d3 @ + vadd.f32 d5, d19, d14 @- + vadd.f32 d7, d31, d26 @- + vadd.f32 q1, q14, q5 + vadd.f32 q0, q11, q10 + vsub.f32 d6, d30, d27 @- + vsub.f32 d4, d18, d15 @- + vsub.f32 d13, d19, d14 @- + vadd.f32 d12, d18, d15 @- + vsub.f32 d15, d31, d26 @- + ldr r2, [r12], #4 + vtrn.32 q1, q3 + ldr lr, [r12], #4 + vtrn.32 q0, q2 + add r2, r0, r2, lsl #2 + vsub.f32 q4, q11, q10 + add lr, r0, lr, lsl #2 + vsub.f32 q5, q14, q5 + vadd.f32 d14, d30, d27 @- + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_ee_loop + +@ assumes r0 = out +@ +@ r12 = offsets +@ r3-r10 = data pointers +@ r11 = loop iterations +@ r2 & lr = temps + .align 4 +#ifdef __APPLE__ + .globl _neon_oo +_neon_oo: +#else + .globl neon_oo +neon_oo: +#endif +_neon_oo_loop: + vld2.32 {q8}, [r6, :128]! + vld2.32 {q9}, [r5, :128]! + vld2.32 {q10}, [r4, :128]! + vld2.32 {q13}, [r3, :128]! + vadd.f32 q11, q9, q8 + vsub.f32 q8, q9, q8 + vsub.f32 q9, q13, q10 + vadd.f32 q12, q13, q10 + subs r11, r11, #1 + vld2.32 {q10}, [r7, :128]! + vld2.32 {q13}, [r9, :128]! + vsub.f32 q2, q12, q11 + vsub.f32 d7, d19, d16 @ + vadd.f32 d3, d19, d16 @ + vadd.f32 d6, d18, d17 @ + vsub.f32 d2, d18, d17 @ + vld2.32 {q9}, [r8, :128]! + vld2.32 {q8}, [r10, :128]! + vadd.f32 q0, q12, q11 + vadd.f32 q11, q13, q8 + vadd.f32 q12, q10, q9 + vsub.f32 q8, q13, q8 + vsub.f32 q9, q10, q9 + vsub.f32 q6, q12, q11 + vadd.f32 q4, q12, q11 + vtrn.32 q0, q2 + ldr r2, [r12], #4 + vsub.f32 d15, d19, d16 @ + ldr lr, [r12], #4 + vadd.f32 d11, d19, d16 @ + vadd.f32 d14, d18, d17 @ + vsub.f32 d10, d18, d17 @ + add r2, r0, r2, lsl #2 + vtrn.32 q1, q3 + add lr, r0, lr, lsl #2 + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_oo_loop + +@ assumes r0 = out +@ +@ r12 = offsets +@ r3-r10 = data pointers +@ r11 = addr of twiddle +@ r2 & lr = temps + .align 4 +#ifdef __APPLE__ + .globl _neon_eo +_neon_eo: +#else + .globl neon_eo +neon_eo: +#endif + vld2.32 {q9}, [r5, :128]! @tag2 + vld2.32 {q13}, [r3, :128]! @tag0 + vld2.32 {q12}, [r4, :128]! @tag1 + vld2.32 {q0}, [r7, :128]! @tag4 + vsub.f32 q11, q13, q12 + vld2.32 {q8}, [r6, :128]! @tag3 + vadd.f32 q12, q13, q12 + vsub.f32 q10, q9, q8 + vadd.f32 q8, q9, q8 + vadd.f32 q9, q12, q8 + vadd.f32 d9, d23, d20 @ + vsub.f32 d11, d23, d20 @ + vsub.f32 q8, q12, q8 + vsub.f32 d8, d22, d21 @ + vadd.f32 d10, d22, d21 @ + ldr r2, [r12], #4 + vld1.32 {d20, d21}, [r11, :128] + ldr lr, [r12], #4 + vtrn.32 q9, q4 + add r2, r0, r2, lsl #2 + vtrn.32 q8, q5 + add lr, r0, lr, lsl #2 + vswp d9,d10 + vst1.32 {d8,d9,d10,d11}, [lr, :128]! + vld2.32 {q13}, [r10, :128]! @tag7 + vld2.32 {q15}, [r9, :128]! @tag6 + vld2.32 {q11}, [r8, :128]! 
@tag5 + vsub.f32 q14, q15, q13 + vsub.f32 q12, q0, q11 + vadd.f32 q11, q0, q11 + vadd.f32 q13, q15, q13 + vadd.f32 d13, d29, d24 @ + vadd.f32 q15, q13, q11 + vsub.f32 d12, d28, d25 @ + vsub.f32 d15, d29, d24 @ + vadd.f32 d14, d28, d25 @ + vtrn.32 q15, q6 + vsub.f32 q15, q13, q11 + vtrn.32 q15, q7 + vswp d13, d14 + vst1.32 {d12,d13,d14,d15}, [lr, :128]! + vtrn.32 q13, q14 + vtrn.32 q11, q12 + vmul.f32 d24, d26, d21 + vmul.f32 d28, d27, d20 + vmul.f32 d25, d26, d20 + vmul.f32 d26, d27, d21 + vmul.f32 d27, d22, d21 + vmul.f32 d30, d23, d20 + vmul.f32 d29, d23, d21 + vmul.f32 d22, d22, d20 + vsub.f32 d21, d28, d24 + vadd.f32 d20, d26, d25 + vadd.f32 d25, d30, d27 + vsub.f32 d24, d22, d29 + vadd.f32 q11, q12, q10 + vsub.f32 q10, q12, q10 + vadd.f32 q0, q9, q11 + vsub.f32 q2, q9, q11 + vadd.f32 d3, d17, d20 @ + vsub.f32 d7, d17, d20 @ + vsub.f32 d2, d16, d21 @ + vadd.f32 d6, d16, d21 @ + vswp d1, d2 + vswp d5, d6 + vstmia r2!, {q0-q3} + + +@ assumes r0 = out +@ +@ r12 = offsets +@ r3-r10 = data pointers +@ r11 = addr of twiddle +@ r2 & lr = temps + .align 4 +#ifdef __APPLE__ + .globl _neon_oe +_neon_oe: +#else + .globl neon_oe +neon_oe: +#endif + vld1.32 {q8}, [r5, :128]! + vld1.32 {q10}, [r6, :128]! + vld2.32 {q11}, [r4, :128]! + vld2.32 {q13}, [r3, :128]! + vld2.32 {q15}, [r10, :128]! + vorr d25, d17, d17 + vorr d24, d20, d20 + vorr d20, d16, d16 + vsub.f32 q9, q13, q11 + vadd.f32 q11, q13, q11 + ldr r2, [r12], #4 + vtrn.32 d24, d25 + ldr lr, [r12], #4 + vtrn.32 d20, d21 + add r2, r0, r2, lsl #2 + vsub.f32 q8, q10, q12 + add lr, r0, lr, lsl #2 + vadd.f32 q10, q10, q12 + vadd.f32 q0, q11, q10 + vadd.f32 d25, d19, d16 @ + vsub.f32 d27, d19, d16 @ + vsub.f32 q1, q11, q10 + vsub.f32 d24, d18, d17 @ + vadd.f32 d26, d18, d17 @ + vtrn.32 q0, q12 + vtrn.32 q1, q13 + vld1.32 {d24, d25}, [r11, :128] + vswp d1, d2 + vst1.32 {q0, q1}, [r2, :128]! + vld2.32 {q0}, [r9, :128]! + vadd.f32 q1, q0, q15 + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vsub.f32 q15, q0, q15 + vsub.f32 q0, q14, q13 + vadd.f32 q3, q14, q13 + vadd.f32 q2, q3, q1 + vadd.f32 d29, d1, d30 @ + vsub.f32 d27, d1, d30 @ + vsub.f32 q3, q3, q1 + vsub.f32 d28, d0, d31 @ + vadd.f32 d26, d0, d31 @ + vtrn.32 q2, q14 + vtrn.32 q3, q13 + vswp d5, d6 + vst1.32 {q2, q3}, [r2, :128]! + vtrn.32 q11, q9 + vtrn.32 q10, q8 + vmul.f32 d20, d18, d25 + vmul.f32 d22, d19, d24 + vmul.f32 d21, d19, d25 + vmul.f32 d18, d18, d24 + vmul.f32 d19, d16, d25 + vmul.f32 d30, d17, d24 + vmul.f32 d23, d16, d24 + vmul.f32 d24, d17, d25 + vadd.f32 d17, d22, d20 + vsub.f32 d16, d18, d21 + vsub.f32 d21, d30, d19 + vadd.f32 d20, d24, d23 + vadd.f32 q9, q8, q10 + vsub.f32 q8, q8, q10 + vadd.f32 q4, q14, q9 + vsub.f32 q6, q14, q9 + vadd.f32 d11, d27, d16 @ + vsub.f32 d15, d27, d16 @ + vsub.f32 d10, d26, d17 @ + vadd.f32 d14, d26, d17 @ + vswp d9, d10 + vswp d13, d14 + vstmia lr!, {q4-q7} + + + .align 4 +#ifdef __APPLE__ + .globl _neon_end +_neon_end: +#else + .globl neon_end +neon_end: +#endif + bx lr + + + .align 4 +#ifdef __APPLE__ + .globl _neon_transpose +_neon_transpose: +#else + .globl neon_transpose +neon_transpose: +#endif + push {r4-r8} + @ vpush {q8-q9} + mov r5, r3 +_neon_transpose_col: + mov r7, r1 + add r8, r1, r3, lsl #3 + mov r4, r2 + add r6, r0, r2, lsl #3 +_neon_transpose_row: + vld1.32 {q8,q9}, [r0, :128]! +@ vld1.32 {q10,q11}, [r0, :128]! + vld1.32 {q12,q13}, [r6, :128]! +@ vld1.32 {q14,q15}, [r6, :128]! 
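+@ Note: each pass appears to transpose a 2x2 block of complex (64-bit)
+@ elements by swapping d-register halves between the two loaded rows before
+@ storing them column-wise; the commented-out q10-q15 loads and stores look
+@ like a wider unrolling that has been disabled.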
+ sub r4, r4, #4 + cmp r4, #0 + vswp d17,d24 + vswp d19,d26 + vswp d21,d28 + vswp d23,d30 + vst1.32 {q8}, [r7, :128] + vst1.32 {q12}, [r8, :128] + add r7, r7, r3, lsl #4 + add r8, r8, r3, lsl #4 + vst1.32 {q9}, [r7, :128] + vst1.32 {q13}, [r8, :128] + add r7, r7, r3, lsl #4 + add r8, r8, r3, lsl #4 +@@vst1.32 {q10}, [r7, :128] +@@vst1.32 {q14}, [r8, :128] +@@add r7, r7, r3, lsl #4 +@@add r8, r8, r3, lsl #4 +@@vst1.32 {q11}, [r7, :128] +@@vst1.32 {q15}, [r8, :128] +@@add r7, r7, r3, lsl #4 +@@add r8, r8, r3, lsl #4 + bne _neon_transpose_row + sub r5, r5, #2 + cmp r5, #0 + add r0, r0, r2, lsl #3 + add r1, r1, #16 + bne _neon_transpose_col + @ vpop {q8-q9} + pop {r4-r8} + bx lr + + .align 4 +#ifdef __APPLE__ + .globl _neon_transpose_to_buf +_neon_transpose_to_buf: +#else + .globl neon_transpose_to_buf +neon_transpose_to_buf: +#endif + push {r4-r10} + mov r5, #8 +_neon_transpose_to_buf_col: + mov r4, #8 + add r6, r0, r2, lsl #3 + mov r7, r1 + add r8, r1, #64 + add r9, r1, #128 + add r10, r1, #192 +_neon_transpose_to_buf_row: + vld1.32 {q8,q9}, [r0, :128]! + vld1.32 {q12,q13}, [r6, :128]! + sub r4, r4, #4 + cmp r4, #0 + vswp d17,d24 + vswp d19,d26 + vst1.32 {q8}, [r7, :128] + vst1.32 {q12}, [r8, :128] + vst1.32 {q9}, [r9, :128] + vst1.32 {q13}, [r10, :128] + add r7, r7, #256 + add r8, r8, #256 + add r9, r9, #256 + add r10, r10, #256 + bne _neon_transpose_to_buf_row + sub r5, r5, #2 + cmp r5, #0 + sub r0, r0, #64 + add r0, r0, r2, lsl #4 + add r1, r1, #16 + bne _neon_transpose_to_buf_col + pop {r4-r10} + bx lr diff --git a/lib/ffts/src/neon_float.h b/lib/ffts/src/neon_float.h new file mode 100644 index 0000000..a958b8a --- /dev/null +++ b/lib/ffts/src/neon_float.h @@ -0,0 +1,1126 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +#ifndef __NEON_FLOAT_H__ +#define __NEON_FLOAT_H__ + +#include <arm_neon.h> + +//#define VL 4 +#define __INLINE static inline __attribute__((always_inline)) + +typedef float32x4_t V; + +typedef float32x4x2_t VS; + +#if defined(complex) + typedef complex float cdata_t; +#else + typedef float cdata_t[2]; +#endif + typedef float data_t; + +#define ADD vaddq_f32 +#define SUB vsubq_f32 +#define MUL vmulq_f32 +#define VADD vaddq_f32 +#define VSUB vsubq_f32 +#define VMUL vmulq_f32 +#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y)))) +#define VST vst1q_f32 +#define VLD vld1q_f32 +#define VST2 vst2q_f32 +#define VLD2 vld2q_f32 + +#define VSWAPPAIRS(x) (vrev64q_f32(x)) + +#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b))) +#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b))) + +#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y))) + +__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) { + data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3}; + return VLD(d); +} + +#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0)) +#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1)) + +#define FFTS_MALLOC(d,a) (valloc(d)) +#define FFTS_FREE(d) (free(d)) +__INLINE void FMA(V *Rd, V Rn, V Rm) { + *Rd = vmlaq_f32(*Rd, Rn, Rm); +// __asm__ ("vmla.f32 %q0,%q1,%q2\n\t" +// : "+w" (*Rd) +// : "w" (Rn), "w" (Rm) +// //: "0" +// ); + +} +__INLINE void FMS(V *Rd, V Rn, V Rm) { + *Rd = vmlsq_f32(*Rd, Rn, Rm); +// __asm__ ("vmls.f32 %q0,%q1,%q2\n\t" +// : "+w" (*Rd) +// : "w" (Rn), "w" (Rm) +// // : "0" +// ); +} + +__INLINE VS VSMUL(VS *d, VS *w) { + VS t; + t.val[0] = vmulq_f32(d->val[0], w->val[0]); + t.val[1] = vmulq_f32(d->val[0], w->val[1]); +// t.val[0] = vmlsq_f32(t.val[0], d->val[1], w->val[1]); +// t.val[1] = vmlaq_f32(t.val[1], d->val[1], w->val[0]); + FMS(&t.val[0], d->val[1], w->val[1]); + FMA(&t.val[1], d->val[1], w->val[0]); + return t; +} +__INLINE VS VSMULJ(VS *d, VS *w) { + VS t; + t.val[0] = vmulq_f32(d->val[0], w->val[0]); + t.val[1] = vmulq_f32(d->val[1], w->val[0]); +// t.val[0] = vmlaq_f32(t.val[0], d->val[1], w->val[1]); +// t.val[1] = vmlsq_f32(t.val[1], d->val[0], w->val[1]); + FMA(&t.val[0], d->val[1], w->val[1]); + FMS(&t.val[1], d->val[0], w->val[1]); + return t; +} +__INLINE VS VSADD(VS *a, VS *b) { + VS r; + r.val[0] = vaddq_f32(a->val[0], b->val[0]); + r.val[1] = vaddq_f32(a->val[1], b->val[1]); + return r; +} +__INLINE VS VSSUB(VS *a, VS *b) { + VS r; + r.val[0] = vsubq_f32(a->val[0], b->val[0]); + r.val[1] = vsubq_f32(a->val[1], b->val[1]); + return r; +} +__INLINE VS VSSUB_MULI(VS *a, VS *b) { + VS r; + r.val[0] = vaddq_f32(a->val[0], b->val[1]); + r.val[1] = vsubq_f32(a->val[1], b->val[0]); + return r; +} +__INLINE VS VSADD_MULI(VS *a, VS *b) { + VS r; + r.val[0] = vsubq_f32(a->val[0], b->val[1]); + r.val[1] = vaddq_f32(a->val[1], b->val[0]); + return r; +} + +__INLINE void VSK_N(VS w, VS *r0, VS *r1, VS *r2, VS *r3) { + VS uk, uk2, zk_p, zk_n, zk, zk_d; + uk = *r0; uk2 = *r1; + zk_p = VSMUL(r2, &w); + zk_n = VSMULJ(r3, &w); + + zk = VSADD(&zk_p, &zk_n); + zk_d = VSSUB(&zk_p, &zk_n); + + *r2 = VSSUB(&uk, &zk); + *r0 = VSADD(&uk, &zk); + *r3 = VSADD_MULI(&uk2, &zk_d); + *r1 = VSSUB_MULI(&uk2, &zk_d); +} + + +__INLINE float32x2x2_t HVS_ADD(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vadd_f32(a.val[0], b.val[0]); + rval.val[1] = 
vadd_f32(a.val[1], b.val[1]); + return rval; +} +__INLINE float32x2x2_t HVS_SUB(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vsub_f32(a.val[0], b.val[0]); + rval.val[1] = vsub_f32(a.val[1], b.val[1]); + return rval; +} +__INLINE float32x2x2_t HVS_SUB_MULI(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vadd_f32(a.val[0], b.val[1]); + rval.val[1] = vsub_f32(a.val[1], b.val[0]); + return rval; +} +__INLINE float32x2x2_t HVS_ADD_MULI(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vsub_f32(a.val[0], b.val[1]); + rval.val[1] = vadd_f32(a.val[1], b.val[0]); + return rval; +} +__INLINE float32x2x2_t HVS_MUL(float32x2x2_t d, float32x2x2_t w) { + float32x2x2_t t; + t.val[0] = vmul_f32(d.val[0], w.val[0]); + t.val[1] = vmul_f32(d.val[0], w.val[1]); + t.val[0] = vmls_f32(t.val[0], d.val[1], w.val[1]); + t.val[1] = vmla_f32(t.val[1], d.val[1], w.val[0]); + return t; +} +__INLINE float32x2x2_t HVS_MULJ(float32x2x2_t d, float32x2x2_t w) { + float32x2x2_t t; + t.val[0] = vmul_f32(d.val[0], w.val[0]); + t.val[1] = vmul_f32(d.val[1], w.val[0]); + t.val[0] = vmla_f32(t.val[0], d.val[1], w.val[1]); + t.val[1] = vmls_f32(t.val[1], d.val[0], w.val[1]); + return t; +} +__INLINE void HVS_K_N(float32x2x2_t w, float32x2x2_t *r0, float32x2x2_t *r1, float32x2x2_t *r2, float32x2x2_t *r3) { + float32x2x2_t uk, uk2, zk_p, zk_n, zk, zk_d; + uk = *r0; uk2 = *r1; + zk_p = HVS_MUL(*r2, w); + zk_n = HVS_MULJ(*r3, w); + zk = HVS_ADD(zk_p, zk_n); + zk_d = HVS_SUB(zk_p, zk_n); + + *r2 = HVS_SUB(uk, zk); + *r0 = HVS_ADD(uk, zk); + *r3 = HVS_ADD_MULI(uk2, zk_d); + *r1 = HVS_SUB_MULI(uk2, zk_d); +} + +typedef union { + float32x4_t f32x4; + float32x2x2_t f32x2x2; +} float_mixed_t; + +__INLINE void VSWP(float32x2x2_t *a, float32x2x2_t *b) { +//float32x2_t tmp = a->val[1]; +//a->val[1] = b->val[0]; +//b->val[0] = tmp; + __asm__ ("vswp %0,%1\n\t" + : "+w" (a->val[1]), "+w" (b->val[0]) + : + ); +} + +static const __attribute__ ((aligned(16))) float ee_w_data[4] = {0.70710678118654757273731092936941,0.70710678118654746171500846685376, + -0.70710678118654757273731092936941,-0.70710678118654746171500846685376}; +__INLINE void LEAF_EE8_SPLIT(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + float32x2x2_t r0, r1, r2, r3, r4, r5, r6, r7; + float32x2x2_t t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = vld2_f32(in + (*is)[0]); t1 = vld2_f32(in + (*is)[1]); t2 = vld2_f32(in + (*is)[2]); t3 = vld2_f32(in + (*is)[3]); + + t4 = HVS_ADD (t0, t1); + t5 = HVS_SUB (t0, t1); + t6 = HVS_ADD (t2, t3); + t7 = HVS_SUB (t2, t3); + r0 = HVS_ADD (t4, t6); + r2 = HVS_SUB (t4, t6); + r1 = HVS_SUB_MULI(t5, t7); + r3 = HVS_ADD_MULI(t5, t7); + + t0 = vld2_f32(in + (*is)[4]); t1 = vld2_f32(in + (*is)[5]); t2 = vld2_f32(in + (*is)[6]); t3 = vld2_f32(in + (*is)[7]); + r4 = HVS_ADD (t0, t1); + r5 = HVS_SUB (t0, t1); + r6 = HVS_ADD (t2, t3); + r7 = HVS_SUB (t2, t3); + t0 = r0; t1 = r2; + t2 = HVS_ADD(r4, r6); + t3 = HVS_SUB(r4, r6); + r0 = HVS_ADD(t0, t2); + r4 = HVS_SUB(t0, t2); + r2 = HVS_SUB_MULI(t1, t3); + r6 = HVS_ADD_MULI(t1, t3); + + float32x4_t w = vld1q_f32(ee_w_data); + float32x2x2_t ww; + ww.val[0] = vget_low_f32(w); + ww.val[1] = vget_high_f32(w); + + HVS_K_N(ww,&r1,&r3,&r5,&r7); + +//vst2_f32(out0, r0); +//vst2_f32(out0+4, r2); +//vst2_f32(out0+8, r4); +//vst2_f32(out0+12, r6); + +//vst2_f32(out1, r1); 
+//vst2_f32(out1+4, r3); +//vst2_f32(out1+8, r5); +//vst2_f32(out1+12, r7); + + float32x2x2_t tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7; + + tt0 = vtrn_f32(r0.val[0], r0.val[1]); + tt1 = vtrn_f32(r1.val[0], r1.val[1]); + tt2 = vtrn_f32(r2.val[0], r2.val[1]); + tt3 = vtrn_f32(r3.val[0], r3.val[1]); + tt4 = vtrn_f32(r4.val[0], r4.val[1]); + tt5 = vtrn_f32(r5.val[0], r5.val[1]); + tt6 = vtrn_f32(r6.val[0], r6.val[1]); + tt7 = vtrn_f32(r7.val[0], r7.val[1]); + +//VSWP(&tt0.f32x2x2, &tt1.f32x2x2); +//VSWP(&tt2.f32x2x2, &tt3.f32x2x2); +//VSWP(&tt4.f32x2x2, &tt5.f32x2x2); +//VSWP(&tt6.f32x2x2, &tt7.f32x2x2); + + float32x4_t z0, z1, z2, z3, z4, z5, z6, z7; + + z0 = vcombine_f32(tt0.val[0], tt1.val[0]); + z1 = vcombine_f32(tt0.val[1], tt1.val[1]); + z2 = vcombine_f32(tt2.val[0], tt3.val[0]); + z3 = vcombine_f32(tt2.val[1], tt3.val[1]); + z4 = vcombine_f32(tt4.val[0], tt5.val[0]); + z5 = vcombine_f32(tt4.val[1], tt5.val[1]); + z6 = vcombine_f32(tt6.val[0], tt7.val[0]); + z7 = vcombine_f32(tt6.val[1], tt7.val[1]); + + + vst1q_f32(out0, z0); + vst1q_f32(out0+4, z2); + vst1q_f32(out0+8, z4); + vst1q_f32(out0+12, z6); + + vst1q_f32(out1, z1); + vst1q_f32(out1+4, z3); + vst1q_f32(out1+8, z5); + vst1q_f32(out1+12, z7); +/* + vst1_f32(out0, tt0.val[0]); + vst1_f32(out0+2, tt1.val[0]); + vst1_f32(out0+4, tt2.val[0]); + vst1_f32(out0+6, tt3.val[0]); + vst1_f32(out0+8, tt4.val[0]); + vst1_f32(out0+10, tt5.val[0]); + vst1_f32(out0+12, tt6.val[0]); + vst1_f32(out0+14, tt7.val[0]); + + vst1_f32(out1, tt0.val[1]); + vst1_f32(out1+2, tt1.val[1]); + vst1_f32(out1+4, tt2.val[1]); + vst1_f32(out1+6, tt3.val[1]); + vst1_f32(out1+8, tt4.val[1]); + vst1_f32(out1+10, tt5.val[1]); + vst1_f32(out1+12, tt6.val[1]); + vst1_f32(out1+14, tt7.val[1]); + */ +/* + float32x4_t rr0 = vcombine_f32(r0.val[0], r0.val[1]); + float32x4_t rr1 = vcombine_f32(r1.val[0], r1.val[1]); + float32x4_t rr2 = vcombine_f32(r2.val[0], r2.val[1]); + float32x4_t rr3 = vcombine_f32(r3.val[0], r3.val[1]); + + float32x4x2_t tmp0, tmp1, tmp2, tmp3; + tmp0 = vtrnq_f32(rr0, rr2); + tmp1 = vtrnq_f32(rr1, rr3); + + + float32x2x2_t v0, v1, v2, v3; + v0.val[0] = vget_low_f32(tmp0.val[0]); + v0.val[1] = vget_high_f32(tmp0.val[0]); + v1.val[0] = vget_low_f32(tmp0.val[1]); + v1.val[1] = vget_high_f32(tmp0.val[1]); + v2.val[0] = vget_low_f32(tmp1.val[0]); + v2.val[1] = vget_high_f32(tmp1.val[0]); + v3.val[0] = vget_low_f32(tmp1.val[1]); + v3.val[1] = vget_high_f32(tmp1.val[1]); + + tmp2.val[0] = tmp0.val[0]; + tmp2.val[1] = tmp1.val[0]; + tmp3.val[0] = tmp0.val[1]; + tmp3.val[1] = tmp1.val[1]; + +//vst2q_f32(out0 , tmp2); +//vst2q_f32(out1 , tmp3); + vst2_f32(out0, v0); + vst2_f32(out0+4, v1); + vst2_f32(out1, v2); + vst2_f32(out1+4, v3); + + float32x4_t rr4 = vcombine_f32(r4.val[0], r4.val[1]); + float32x4_t rr5 = vcombine_f32(r5.val[0], r5.val[1]); + float32x4_t rr6 = vcombine_f32(r6.val[0], r6.val[1]); + float32x4_t rr7 = vcombine_f32(r7.val[0], r7.val[1]); + + tmp0 = vtrnq_f32(rr4, rr6); + tmp1 = vtrnq_f32(rr5, rr7); + + tmp2.val[0] = tmp0.val[0]; + tmp2.val[1] = tmp1.val[0]; + tmp3.val[0] = tmp0.val[1]; + tmp3.val[1] = tmp1.val[1]; + v0.val[0] = vget_low_f32(tmp0.val[0]); + v0.val[1] = vget_high_f32(tmp0.val[0]); + v1.val[0] = vget_low_f32(tmp0.val[1]); + v1.val[1] = vget_high_f32(tmp0.val[1]); + v2.val[0] = vget_low_f32(tmp1.val[0]); + v2.val[1] = vget_high_f32(tmp1.val[0]); + v3.val[0] = vget_low_f32(tmp1.val[1]); + v3.val[1] = vget_high_f32(tmp1.val[1]); + vst2_f32(out0+8, v0); + vst2_f32(out0+12, v1); + vst2_f32(out1+8, v1); + vst2_f32(out1+12, v3); + 
+//vst2q_f32(out0 + 8, tmp2); +//vst2q_f32(out1 + 8, tmp3); +//vst1q_f32(out0+8, tmp0.val[0]); +//vst1q_f32(out0+12,tmp0.val[1]); +//vst1q_f32(out1+8, tmp1.val[0]); +//vst1q_f32(out1+12,tmp1.val[1]); + */ + *is += 8; +} + +__INLINE void STORESPR(data_t * addr, VS p) { + __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t" + : + : "r" (addr), "w" (p.val[0]), "w" (p.val[1]) + : "memory"); +} +__INLINE void STORESPRI(data_t * restrict * addr, V p0, V p1) { + __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t" + : "+r" (*addr) + : "w" (p0), "w" (p1) + : "memory"); +} +__INLINE void STORESPRI0(data_t * restrict *addr, VS r) { + register V p0 __asm__ ("q0") = r.val[0]; + register V p1 __asm__ ("q1") = r.val[1]; + __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t" + : "+r" (*addr) + : "w" (p0), "w" (p1) + : "memory"); + //STORESPRI(addr, p0, p1); +} +__INLINE void STORESPRI1(data_t **addr, VS r) { + register V p0 __asm__ ("q2") = r.val[0]; + register V p1 __asm__ ("q3") = r.val[1]; + __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t" + : "+r" (*addr) + : "w" (p0), "w" (p1) + : "memory"); + //STORESPRI(addr, p0, p1); +} +__INLINE void STORESPRI2(data_t **addr, VS r) { + register V p0 __asm__ ("q4") = r.val[0]; + register V p1 __asm__ ("q5") = r.val[1]; + __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t" + : "+r" (*addr) + : "w" (p0), "w" (p1) + : "memory"); + //STORESPRI(addr, p0, p1); +} +__INLINE void STORESPRI3(data_t **addr, VS r) { + register V p0 __asm__ ("q6") = r.val[0]; + register V p1 __asm__ ("q7") = r.val[1]; + __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t" + : "+r" (*addr) + : "w" (p0), "w" (p1) + : "memory"); + //STORESPRI(addr, p0, p1); +} +__INLINE void STORESPRIT0(data_t * restrict *addr, VS r) { + register V p0 __asm__ ("q0") = r.val[0]; + register V p1 __asm__ ("q1") = r.val[1]; + __asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t" + : "+r" (*addr) + : "w" (p0), "w" (p1) + : "memory"); + //STORESPRI(addr, p0, p1); +} +__INLINE void STORESPRIT1(data_t **addr, VS r) { + register V p0 __asm__ ("q2") = r.val[0]; + register V p1 __asm__ ("q3") = r.val[1]; + __asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t" + : "+r" (*addr) + : "w" (p0), "w" (p1) + : "memory"); + //STORESPRI(addr, p0, p1); +} +__INLINE void STORESPRIT2(data_t **addr, VS r) { + register V p0 __asm__ ("q4") = r.val[0]; + register V p1 __asm__ ("q5") = r.val[1]; + __asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t" + : "+r" (*addr) + : "w" (p0), "w" (p1) + : "memory"); + //STORESPRI(addr, p0, p1); +} +__INLINE void STORESPRIT3(data_t **addr, VS r) { + register V p0 __asm__ ("q6") = r.val[0]; + register V p1 __asm__ ("q7") = r.val[1]; + __asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t" + : "+r" (*addr) + : "w" (p0), "w" (p1) + : "memory"); + //STORESPRI(addr, p0, p1); +} +__INLINE void STORESPR0(data_t *addr, VS r) { + register V p0 __asm__ ("q0") = r.val[0]; + register V p1 __asm__ ("q1") = r.val[1]; + __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t" + : + : "r" (addr), "w" (p0), "w" (p1) + : "memory"); +} +__INLINE void STORESPR1(data_t *addr, VS r) { + register V p0 __asm__ ("q2") = r.val[0]; + register V p1 __asm__ ("q3") = r.val[1]; + __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t" + : + : "r" (addr), "w" (p0), "w" (p1) + : "memory"); +} +__INLINE void STORESPR2(data_t *addr, VS r) { + register V p0 __asm__ ("q4") = r.val[0]; + register V p1 __asm__ ("q5") = r.val[1]; + __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, 
:128]\n\t" + : + : "r" (addr), "w" (p0), "w" (p1) + : "memory"); +} +__INLINE void STORESPR3(data_t *addr, VS r) { + register V p0 __asm__ ("q6") = r.val[0]; + register V p1 __asm__ ("q7") = r.val[1]; + __asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t" + : + : "r" (addr), "w" (p0), "w" (p1) + : "memory"); +} +__INLINE VS LOADSPR0(data_t *addr) { + VS r; + register V p0 __asm__ ("q8") ; + register V p1 __asm__ ("q9") ; + __asm__ __volatile__("vld1.32 {%q0,%q1}, [%2, :128]\n\t" + : "=&w" (p0), "=&w" (p1) + : "r" (addr) + ); + r.val[0] = p0; r.val[1] = p1; + return r; +} +__INLINE VS LOADSPR1(data_t *addr) { + VS r; + register V p0 __asm__ ("q10") ; + register V p1 __asm__ ("q11") ; + __asm__ __volatile__("vld1.32 {%q0,%q1}, [%2, :128]\n\t" + : "=&w" (p0), "=&w" (p1) + : "r" (addr) + ); + r.val[0] = p0; r.val[1] = p1; + return r; +} +__INLINE VS LOADSPR2(data_t *addr) { + VS r; + register V p0 __asm__ ("q12") ; + register V p1 __asm__ ("q13") ; + __asm__ __volatile__("vld1.32 {%q0,%q1}, [%2, :128]\n\t" + : "=&w" (p0), "=&w" (p1) + : "r" (addr) + ); + r.val[0] = p0; r.val[1] = p1; + return r; +} +__INLINE VS LOADSPR3(data_t *addr) { + VS r; + register V p0 __asm__ ("q14") ; + register V p1 __asm__ ("q15") ; + __asm__ __volatile__("vld1.32 {%q0,%q1}, [%2, :128]\n\t" + : "=&w" (p0), "=&w" (p1) + : "r" (addr) + ); + r.val[0] = p0; r.val[1] = p1; + return r; +} +__INLINE VS LOADSPRI(data_t * restrict * addr) { + VS r; + register V p0 __asm__ ("q2") ; + register V p1 __asm__ ("q3") ; + __asm__ __volatile__("vld1.32 {%q0,%q1}, [%2, :128]!\n\t" + : "=&w" (p0), "=&w" (p1), "+r" (*addr) + : + ); + r.val[0] = p0; r.val[1] = p1; + return r; +} + +__INLINE void X_4_SPLIT(data_t * restrict data, size_t N, data_t * restrict LUT) { + +//size_t i; +//for(i=0;i<N/4/2/2;i++) { + VS uk = LOADSPR0(data); + VS uk2 = LOADSPR1(data + 2*N/4); + VS zk_p = LOADSPR2(data + 4*N/4); + VS zk_n = LOADSPR3(data + 6*N/4); + + VSK_N(LOADSPRI(&LUT), &uk, &uk2, &zk_p, &zk_n); + + STORESPR0(data, uk); + STORESPR1(data + 2*N/4, uk2); + STORESPR2(data + 4*N/4, zk_p); + STORESPR3(data + 6*N/4, zk_n); + +// LUT += 8; +// data += 8; +// } +} + +__INLINE void X_8_SPLIT(data_t * restrict data0, size_t N, data_t * restrict LUT) { + data_t *data2 = data0 + 2*N/4; + data_t *data4 = data0 + 4*N/4; + data_t *data6 = data0 + 6*N/4; + data_t *data1 = data0 + 1*N/4; + data_t *data3 = data0 + 3*N/4; + data_t *data5 = data0 + 5*N/4; + data_t *data7 = data0 + 7*N/4; + size_t k, n4 = N/4; + + for(k=N/8/2/2;k>0;--k) { + VS r0, r1, r2, r3, r4, r5, r6, r7,w; + r0 = LOADSPR0(data0); + r2 = LOADSPR1(data2); + r1 = LOADSPR2(data1); + r3 = LOADSPR3(data3); + VSK_N(LOADSPRI(&LUT), &r0, &r1, &r2, &r3); + STORESPR2(data1, r1); + STORESPR3(data3, r3); + r4 = LOADSPR2(data4); + r6 = LOADSPR3(data6); + VSK_N(LOADSPRI(&LUT), &r0, &r2, &r4, &r6); + STORESPRI0(&data0, r0); //data0 += 8; + STORESPRI1(&data2, r2); //data2 += 8; + STORESPRI2(&data4, r4); //data4 += 8; + STORESPRI3(&data6, r6); //data6 += 8; + r1 = LOADSPR0(data1); + r3 = LOADSPR1(data3); + r5 = LOADSPR2(data5); + r7 = LOADSPR3(data7); + VSK_N(LOADSPRI(&LUT), &r1, &r3, &r5, &r7); + // LUT += 24; + STORESPRI0(&data1, r1); //data1 += 8; + STORESPRI1(&data3, r3); //data3 += 8; + STORESPRI2(&data5, r5); //data5 += 8; + STORESPRI3(&data7, r7); //data7 += 8; + } +} + +__INLINE void X_8_SPLIT_T(data_t * restrict data0, size_t N, data_t * restrict LUT) { + data_t *data2 = data0 + 2*N/4; + data_t *data4 = data0 + 4*N/4; + data_t *data6 = data0 + 6*N/4; + data_t *data1 = data0 + 1*N/4; + data_t *data3 = 
data0 + 3*N/4; + data_t *data5 = data0 + 5*N/4; + data_t *data7 = data0 + 7*N/4; + size_t k, n4 = N/4; + + for(k=N/8/2/2;k>0;--k) { + VS r0, r1, r2, r3, r4, r5, r6, r7,w; + r0 = LOADSPR0(data0); + r2 = LOADSPR1(data2); + r1 = LOADSPR2(data1); + r3 = LOADSPR3(data3); + VSK_N(LOADSPRI(&LUT), &r0, &r1, &r2, &r3); + STORESPR2(data1, r1); + STORESPR3(data3, r3); + r4 = LOADSPR2(data4); + r6 = LOADSPR3(data6); + VSK_N(LOADSPRI(&LUT), &r0, &r2, &r4, &r6); + STORESPRIT0(&data0, r0); //data0 += 8; + STORESPRIT1(&data2, r2); //data2 += 8; + STORESPRIT2(&data4, r4); //data4 += 8; + STORESPRIT3(&data6, r6); //data6 += 8; + r1 = LOADSPR0(data1); + r3 = LOADSPR1(data3); + r5 = LOADSPR2(data5); + r7 = LOADSPR3(data7); + VSK_N(LOADSPRI(&LUT), &r1, &r3, &r5, &r7); + STORESPRIT0(&data1, r1); //data1 += 8; + STORESPRIT1(&data3, r3); //data3 += 8; + STORESPRIT2(&data5, r5); //data5 += 8; + STORESPRIT3(&data7, r7); //data7 += 8; + } +} +__INLINE V LOAD2I(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]!\n\t" + : "=w" (o), "+r" (*addr) + : + ); + + return o; +} +__INLINE V LOAD2I_0(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag0\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_1(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag1\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_2(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag2\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_3(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag3\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_4(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag4\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_5(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag5\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_6(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag6\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_7(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! 
@tag7\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} + + + +__INLINE V LOADI(const data_t **addr) { + float32x4_t o; + __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOADI_2(const data_t **addr) { + float32x4_t o; + __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t @tag2" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOADI_3(const data_t **addr) { + float32x4_t o; + __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t @tag3" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V HSP_MUL(V *d, const V *w) { + V t; + t = vcombine_f32(vmul_f32(vget_low_f32(*d), vget_low_f32(*w)), + vmul_f32(vget_low_f32(*d), vget_high_f32(*w))); + t = vcombine_f32(vmls_f32(vget_low_f32(t), vget_high_f32(*d), vget_high_f32(*w)), + vmla_f32(vget_high_f32(t), vget_high_f32(*d), vget_low_f32(*w))); + return t; +} +__INLINE V HSP_MULJ(V *d, const V *w) { + V t; + t = vcombine_f32(vmul_f32(vget_low_f32(*d), vget_low_f32(*w)), + vmul_f32(vget_high_f32(*d), vget_low_f32(*w))); + t = vcombine_f32(vmla_f32(vget_low_f32(t), vget_high_f32(*d), vget_high_f32(*w)), + vmls_f32(vget_high_f32(t), vget_low_f32(*d), vget_high_f32(*w))); + return t; +} +__INLINE V HSP_SUB_MULI(V *a, V *b) { + return vcombine_f32(vadd_f32(vget_low_f32(*a), vget_high_f32(*b)), vsub_f32(vget_high_f32(*a), vget_low_f32(*b))); +} +__INLINE V HSP_ADD_MULI(V *a, V *b) { + return vcombine_f32(vsub_f32(vget_low_f32(*a), vget_high_f32(*b)), vadd_f32(vget_high_f32(*a), vget_low_f32(*b))); +} + +__INLINE void K_N_HSP(const V *w, V *r0, V *r1, V *r2, V *r3) { + V uk, uk2, zk_p, zk_n, zk, zk_d; + + uk = *r0; + uk2 = *r1; + zk_p = HSP_MUL(r2, w); + zk_n = HSP_MULJ(r3, w); + zk = ADD(zk_p, zk_n); + zk_d = SUB(zk_p, zk_n); + + *r2 = SUB(uk, zk); + *r0 = ADD(uk, zk); + *r3 = HSP_ADD_MULI(&uk2, &zk_d); + *r1 = HSP_SUB_MULI(&uk2, &zk_d); +} + +__INLINE void neon_shl8_ee(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) { + + V r0, r1, r2, r3, r4, r5, r6, r7; + V t0, t1, t2, t3, t4, t5, t6, t7; + + + t0 = LOAD2I_0(i0); + t1 = LOAD2I_1(i1); + t2 = LOAD2I_2(i2); + t3 = LOAD2I_3(i3); + t4 = ADD (t0, t1); + t5 = SUB (t0, t1); + t6 = ADD (t2, t3); + t7 = SUB (t2, t3); + r0 = ADD (t4, t6); + r2 = SUB (t4, t6); + r1 = HSP_SUB_MULI(&t5, &t7); + r3 = HSP_ADD_MULI(&t5, &t7); + + t0 = LOAD2I_4(i4); + t1 = LOAD2I_5(i5); + t2 = LOAD2I_6(i6); + t3 = LOAD2I_7(i7); + r4 = ADD (t0, t1); + r5 = SUB (t0, t1); + r6 = ADD (t2, t3); + r7 = SUB (t2, t3); + + t0 = r0; t1 = r2; + t2 = ADD(r4, r6); + t3 = SUB(r4, r6); + r0 = ADD(t0, t2); + r4 = SUB(t0, t2); + r2 = HSP_SUB_MULI(&t1, &t3); + r6 = HSP_ADD_MULI(&t1, &t3); + + V w = vld1q_f32(ee_w_data); + + K_N_HSP(&w,&r1,&r3,&r5,&r7); + V uk, uk2, zk, zk_d; + + float32x4x2_t tmp1 = vtrnq_f32(r0, r2); + r0 = tmp1.val[0]; + r2 = tmp1.val[1]; + float32x4x2_t tmp4 = vtrnq_f32(r1, r3); + r1 = tmp4.val[0]; + r3 = tmp4.val[1]; + register V tt0 __asm__ ("q0") = r0; + register V tt1 __asm__ ("q1") = r1; + register V tt2 __asm__ ("q2") = r2; + register V tt3 __asm__ ("q3") = r3; + __asm__ __volatile__ ("vst2.32 {q0,q1}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt0), "w"(tt1) : "memory"); + __asm__ __volatile__ ("vst2.32 {q2,q3}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt2), "w"(tt3) : "memory"); + + float32x4x2_t tmp2 = vtrnq_f32(r4, r6); + r4 = tmp2.val[0]; + r6 = tmp2.val[1]; + float32x4x2_t tmp3 = 
vtrnq_f32(r5, r7); + r5 = tmp3.val[0]; + r7 = tmp3.val[1]; + register V tt4 __asm__ ("q4") = r4; + register V tt5 __asm__ ("q5") = r5; + register V tt6 __asm__ ("q6") = r6; + register V tt7 __asm__ ("q7") = r7; + + __asm__ __volatile__ ("vst2.32 {q4,q5}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt4), "w"(tt5) : "memory"); + __asm__ __volatile__ ("vst2.32 {q6,q7}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt6), "w"(tt7) : "memory"); + +} + +__INLINE void neon_shl8_oo(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) { + + V r0, r1, r2, r3, r4, r5, r6, r7; + V t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = LOAD2I_0(i0); + t1 = LOAD2I_1(i1); + t2 = LOAD2I_2(i2); + t3 = LOAD2I_3(i3); + t4 = ADD (t0, t1); + t5 = SUB (t0, t1); + t6 = ADD (t2, t3); + t7 = SUB (t2, t3); + r0 = ADD (t4, t6); + r2 = SUB (t4, t6); + r1 = HSP_SUB_MULI(&t5, &t7); + r3 = HSP_ADD_MULI(&t5, &t7); + + float32x4x2_t tmp1 = vtrnq_f32(r0, r2); + r0 = tmp1.val[0]; + r2 = tmp1.val[1]; + float32x4x2_t tmp4 = vtrnq_f32(r1, r3); + r1 = tmp4.val[0]; + r3 = tmp4.val[1]; + register V tt0 __asm__ ("q0") = r0; + register V tt1 __asm__ ("q1") = r1; + register V tt2 __asm__ ("q2") = r2; + register V tt3 __asm__ ("q3") = r3; + __asm__ __volatile__ ("vst2.32 {q0,q1}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt0), "w"(tt1) : "memory"); + __asm__ __volatile__ ("vst2.32 {q2,q3}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt2), "w"(tt3) : "memory"); + + + + t0 = LOAD2I_4(i4); + t1 = LOAD2I_5(i5); + t2 = LOAD2I_6(i6); + t3 = LOAD2I_7(i7); + t4 = ADD (t0, t1); + t5 = SUB (t0, t1); + t6 = ADD (t2, t3); + t7 = SUB (t2, t3); + r4 = ADD (t4, t6); + r6 = SUB (t4, t6); + r5 = HSP_SUB_MULI(&t5, &t7); + r7 = HSP_ADD_MULI(&t5, &t7); + + float32x4x2_t tmp2 = vtrnq_f32(r4, r6); + r4 = tmp2.val[0]; + r6 = tmp2.val[1]; + float32x4x2_t tmp3 = vtrnq_f32(r5, r7); + r5 = tmp3.val[0]; + r7 = tmp3.val[1]; + + + register V tt4 __asm__ ("q4") = r4; + register V tt5 __asm__ ("q5") = r5; + register V tt6 __asm__ ("q6") = r6; + register V tt7 __asm__ ("q7") = r7; + + __asm__ __volatile__ ("vst2.32 {q4,q5}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt4), "w"(tt5) : "memory"); + __asm__ __volatile__ ("vst2.32 {q6,q7}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt6), "w"(tt7) : "memory"); + + + +} + +static const __attribute__ ((aligned(16))) data_t eo_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376}; + + +__INLINE void neon_shl8_eo(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) { + /* + register V r0_1 __asm__ ("q0"); + register V r2_3 __asm__ ("q1"); + register V r4_5 __asm__ ("q2"); + register V r6_7 __asm__ ("q3"); + */ + const V w = vld1q_f32(eo_w_data); + + V r0_1, r2_3, r4_5, r6_7; + + register V r8_9 __asm__ ("q4"); + register V r10_11 __asm__ ("q5"); + register V r12_13 __asm__ ("q6"); + register V r14_15 __asm__ ("q7"); + + { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = LOAD2I_0(i0); + t1 = LOAD2I_1(i1); + t2 = LOAD2I_2(i2); + t3 = LOAD2I_3(i3); + t4 = ADD(t0, t1); + t5 = SUB(t0, t1); + t6 = ADD(t2, t3); + t7 = SUB(t2, t3); + + t0 = ADD(t4, t6); + t2 = SUB(t4, t6); + t1 = HSP_SUB_MULI(&t5, &t7); + t3 = HSP_ADD_MULI(&t5, &t7); + + 
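+    /* Note: the vtrnq/vswp steps below appear to re-pack the butterfly
+       results into the output ordering; r8_9/r10_11 are then written
+       straight to out1 via inline assembly, while r0_1/r2_3 are kept for
+       the K_N_HSP stage at the end of this function. */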
float32x4x2_t tmp1 = vtrnq_f32(t0, t1); + t0 = tmp1.val[0]; + t1 = tmp1.val[1]; + float32x4x2_t tmp2 = vtrnq_f32(t2, t3); + t2 = tmp2.val[0]; + t3 = tmp2.val[1]; + + r0_1 = t0; + r2_3 = t2; + r8_9 = t1; + r10_11 = t3; + __asm__ __volatile__ ("vswp d9,d10\n\t" + "vst1.32 {d8,d9,d10,d11}, [%0, :128]!\n\t" +// "vst1.32 {d8,d9}, [%0, :128]!\n\t" +// "vst1.32 {d10,d11}, [%0, :128]!\n\t" + : "+&r" (out1) + : "w" (r8_9), "w" (r10_11) + : "memory"); + + } + { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = LOAD2I_4(i4); + t1 = LOAD2I_5(i5); + t2 = LOAD2I_6(i6); + t3 = LOAD2I_7(i7); + //t2 = HALFBLEND(t6, t7); + //t3 = HALFBLEND(t7, t6); + t4 = ADD(t0, t1); + t5 = SUB(t0, t1); + t6 = ADD(t2, t3); + t7 = SUB(t2, t3); + float32x4x2_t tmp1 = vtrnq_f32(t4, t5); + r4_5 = tmp1.val[0]; + float32x4x2_t tmp2 = vtrnq_f32(t6, t7); + r6_7 = tmp2.val[0]; + //t5 = MULI(t5); + t0 = ADD(t6, t4); + t2 = SUB(t6, t4); + t1 = HSP_SUB_MULI(&t7, &t5); + t3 = HSP_ADD_MULI(&t7, &t5); + + float32x4x2_t tmp3 = vtrnq_f32(t0, t1); + r12_13 = tmp3.val[1]; + float32x4x2_t tmp4 = vtrnq_f32(t2, t3); + r14_15 = tmp4.val[1]; + + + __asm__ __volatile__ ("vswp d13, d14\n\t" + "vst1.32 {d12,d13,d14,d15}, [%0, :128]!\n\t" +// "vst1.32 {d12,d13}, [%0, :128]!\n\t" +// "vst1.32 {d14,d15}, [%0, :128]!\n\t" + : "+&r" (out1) + : "w" (r12_13), "w" (r14_15) + : "memory"); + + + } + + K_N_HSP(&w,&r0_1,&r2_3,&r4_5,&r6_7); + + register V t0 __asm__ ("q0") = r0_1; + register V t1 __asm__ ("q1") = r2_3; + register V t2 __asm__ ("q2") = r4_5; + register V t3 __asm__ ("q3") = r6_7; + + __asm__ __volatile__ ("vswp d1, d2\n\t" + "vswp d5, d6\n\t" + "vstmia %0!, {q0-q3}\n\t" +// "vst1.32 {d0,d1}, [%0, :128]!\n\t" +// "vst1.32 {d2,d3}, [%0, :128]!\n\t" +// "vst1.32 {d4,d5}, [%0, :128]!\n\t" +// "vst1.32 {d6,d7}, [%0, :128]\n\t" + : "+&r" (out0) + : "w" (t0), "w" (t1), "w" (t2), "w" (t3) + : "memory"); + +} +static const __attribute__ ((aligned(16))) data_t oe_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376}; + +__INLINE void neon_shl8_oe(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) { + register V r0_1 __asm__ ("q0"); + register V r2_3 __asm__ ("q1"); + register V r4_5 __asm__ ("q2"); + register V r6_7 __asm__ ("q3"); + + V r8_9, r10_11, r12_13, r14_15; + const V w = vld1q_f32(oe_w_data); + + { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = LOAD2I_0(i0); + t1 = LOAD2I_1(i1); + t6 = LOADI_2(i2); + t7 = LOADI_3(i3); + + float32x2x2_t tmp0 = vtrn_f32(vget_low_f32(t6), vget_high_f32(t7)); + float32x2x2_t tmp1 = vtrn_f32(vget_low_f32(t7), vget_high_f32(t6)); + t2 = vcombine_f32(tmp0.val[0], tmp0.val[1]); + t3 = vcombine_f32(tmp1.val[0], tmp1.val[1]); + + t4 = ADD(t0, t1); + t5 = SUB(t0, t1); + t6 = ADD(t2, t3); + t7 = SUB(t2, t3); + float32x4x2_t tmp2 = vtrnq_f32(t4, t5); + r12_13 = tmp2.val[1]; + float32x4x2_t tmp3 = vtrnq_f32(t6, t7); + r14_15 = tmp3.val[1]; + + t0 = ADD(t4, t6); + t2 = SUB(t4, t6); + t1 = HSP_SUB_MULI(&t5, &t7); + t3 = HSP_ADD_MULI(&t5, &t7); + float32x4x2_t tmp4 = vtrnq_f32(t0, t1); + r0_1 = tmp4.val[0]; + float32x4x2_t tmp5 = vtrnq_f32(t2, t3); + r2_3 = tmp5.val[0]; + __asm__ __volatile__ ("vswp d1, d2\n\t" + "vst1.32 {q0, q1}, [%0, :128]!\n\t" +// "vst1.32 {q1}, [%0, :128]!\n\t" + : "+&r" (out0) + : "w" (r0_1), "w" (r2_3) + : "memory"); + } + { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = 
LOAD2I_4(i4); + t1 = LOAD2I_5(i5); + t2 = LOAD2I_6(i6); + t3 = LOAD2I_7(i7); + t4 = ADD(t0, t1); + t5 = SUB(t0, t1); + t6 = ADD(t2, t3); + t7 = SUB(t2, t3); + t0 = ADD(t4, t6); + t2 = SUB(t4, t6); + t1 = HSP_SUB_MULI(&t5, &t7); + t3 = HSP_ADD_MULI(&t5, &t7); + + float32x4x2_t tmp0 = vtrnq_f32(t0, t1); + r4_5 = tmp0.val[0]; + r8_9 = tmp0.val[1]; + float32x4x2_t tmp1 = vtrnq_f32(t2, t3); + r6_7 = tmp1.val[0]; + r10_11 = tmp1.val[1]; + + + __asm__ __volatile__ ("vswp d5, d6\n\t" + "vst1.32 {q2, q3}, [%0, :128]!\n\t" +// "vst1.32 {q3}, [%0, :128]!\n\t" + : "+&r" (out0) + : "w" (r4_5), "w" (r6_7) + : "memory"); + + } + + K_N_HSP(&w,&r8_9,&r10_11,&r12_13,&r14_15); + register V t0 __asm__ ("q4") = r8_9; + register V t1 __asm__ ("q5") = r10_11; + register V t2 __asm__ ("q6") = r12_13; + register V t3 __asm__ ("q7") = r14_15; + + __asm__ __volatile__ ("vswp d9, d10\n\t" + "vswp d13, d14\n\t" + "vstmia %0!, {q4-q7}\n\t" +// "vst1.32 {q4}, [%0, :128]!\n\t" +// "vst1.32 {q5}, [%0, :128]!\n\t" +// "vst1.32 {q6}, [%0, :128]!\n\t" +// "vst1.32 {q7}, [%0, :128]\n\t" + : "+&r" (out1) + : "w" (t0), "w" (t1), "w" (t2), "w" (t3) + : "memory"); + + +} +#endif diff --git a/lib/ffts/src/neon_static_f.s b/lib/ffts/src/neon_static_f.s new file mode 100644 index 0000000..920d13c --- /dev/null +++ b/lib/ffts/src/neon_static_f.s @@ -0,0 +1,956 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + .align 4 +#ifdef __APPLE__ + .globl _neon_static_e_f +_neon_static_e_f: +#else + .globl neon_static_e_f +neon_static_e_f: +#endif + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + vstmdb sp!, {d8-d15} + ldr lr, [r0, #40] @ this is p->N + add r3, r1, #0 + add r7, r1, lr + add r5, r7, lr + add r10, r5, lr + add r4, r10, lr + add r8, r4, lr + add r6, r8, lr + add r9, r6, lr + ldr r12, [r0] + add r1, r0, #0 + add r0, r2, #0 + ldr r2, [r1, #16] @ this is p->ee_ws + ldr r11, [r1, #28] @ this is p->i0 + + vld1.32 {d16, d17}, [r2, :128] +_neon_ee_loop: + vld2.32 {q15}, [r10, :128]! + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vld2.32 {q9}, [r4, :128]! + vld2.32 {q10}, [r3, :128]! + vld2.32 {q11}, [r6, :128]! + vld2.32 {q12}, [r5, :128]! + vsub.f32 q1, q14, q13 + vld2.32 {q0}, [r9, :128]! + subs r11, r11, #1 + vsub.f32 q2, q0, q15 + vadd.f32 q0, q0, q15 + vmul.f32 d10, d2, d17 + vmul.f32 d11, d3, d16 + vmul.f32 d12, d3, d17 + vmul.f32 d6, d4, d17 + vmul.f32 d7, d5, d16 + vmul.f32 d8, d4, d16 + vmul.f32 d9, d5, d17 + vmul.f32 d13, d2, d16 + vsub.f32 d7, d7, d6 + vadd.f32 d11, d11, d10 + vsub.f32 q1, q12, q11 + vsub.f32 q2, q10, q9 + vadd.f32 d6, d9, d8 + vadd.f32 q4, q14, q13 + vadd.f32 q11, q12, q11 + vadd.f32 q12, q10, q9 + vsub.f32 d10, d13, d12 + vsub.f32 q7, q4, q0 + vsub.f32 q9, q12, q11 + vsub.f32 q13, q5, q3 + vsub.f32 d29, d5, d2 @ + vadd.f32 q5, q5, q3 + vadd.f32 q10, q4, q0 + vadd.f32 q11, q12, q11 + vadd.f32 d31, d5, d2 @ + vadd.f32 d28, d4, d3 @ + vsub.f32 d30, d4, d3 @ + vsub.f32 d5, d19, d14 @ + vsub.f32 d7, d31, d26 @ + vadd.f32 q1, q14, q5 + vadd.f32 q0, q11, q10 + vadd.f32 d6, d30, d27 @ + vadd.f32 d4, d18, d15 @ + vadd.f32 d13, d19, d14 @ + vsub.f32 d12, d18, d15 @ + vadd.f32 d15, d31, d26 @ + ldr r2, [r12], #4 + vtrn.32 q1, q3 + ldr lr, [r12], #4 + vtrn.32 q0, q2 + add r2, r0, r2, lsl #2 + vsub.f32 q4, q11, q10 + add lr, r0, lr, lsl #2 + vsub.f32 q5, q14, q5 + vsub.f32 d14, d30, d27 @ + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_ee_loop + + ldr r11, [r1, #12] + vld2.32 {q9}, [r5, :128]! @tag2 + vld2.32 {q13}, [r3, :128]! @tag0 + vld2.32 {q12}, [r4, :128]! @tag1 + vld2.32 {q0}, [r7, :128]! @tag4 + vsub.f32 q11, q13, q12 + vld2.32 {q8}, [r6, :128]! @tag3 + vadd.f32 q12, q13, q12 + vsub.f32 q10, q9, q8 + vadd.f32 q8, q9, q8 + vadd.f32 q9, q12, q8 + vsub.f32 d9, d23, d20 @ + vadd.f32 d11, d23, d20 @ + vsub.f32 q8, q12, q8 + vadd.f32 d8, d22, d21 @ + vsub.f32 d10, d22, d21 @ + ldr r2, [r12], #4 + vld1.32 {d20, d21}, [r11, :128] + ldr lr, [r12], #4 + vtrn.32 q9, q4 + add r2, r0, r2, lsl #2 + vtrn.32 q8, q5 + add lr, r0, lr, lsl #2 + vswp d9,d10 + vst1.32 {d8,d9,d10,d11}, [lr, :128]! + vld2.32 {q13}, [r10, :128]! @tag7 + vld2.32 {q15}, [r9, :128]! @tag6 + vld2.32 {q11}, [r8, :128]! @tag5 + vsub.f32 q14, q15, q13 + vsub.f32 q12, q0, q11 + vadd.f32 q11, q0, q11 + vadd.f32 q13, q15, q13 + vsub.f32 d13, d29, d24 @ + vadd.f32 q15, q13, q11 + vadd.f32 d12, d28, d25 @ + vadd.f32 d15, d29, d24 @ + vsub.f32 d14, d28, d25 @ + vtrn.32 q15, q6 + vsub.f32 q15, q13, q11 + vtrn.32 q15, q7 + vswp d13, d14 + vst1.32 {d12,d13,d14,d15}, [lr, :128]! 
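+	@ The counted _neon_ee_loop above covers the even/even leaf blocks
+	@ (p->i0 iterations); the straight-line code around here appears to be
+	@ the hand-unrolled tail for the remaining leaves, using the extra
+	@ constant pair loaded from [r11] and the vtrn/vswp shuffles to
+	@ interleave the real/imaginary lanes before the vst1.32 stores.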
+ vtrn.32 q13, q14 + vtrn.32 q11, q12 + vmul.f32 d24, d26, d21 + vmul.f32 d28, d27, d20 + vmul.f32 d25, d26, d20 + vmul.f32 d26, d27, d21 + vmul.f32 d27, d22, d21 + vmul.f32 d30, d23, d20 + vmul.f32 d29, d23, d21 + vmul.f32 d22, d22, d20 + vsub.f32 d21, d28, d24 + vadd.f32 d20, d26, d25 + vadd.f32 d25, d30, d27 + vsub.f32 d24, d22, d29 + vadd.f32 q11, q12, q10 + vsub.f32 q10, q12, q10 + vadd.f32 q0, q9, q11 + vsub.f32 q2, q9, q11 + vsub.f32 d3, d17, d20 @ + vadd.f32 d7, d17, d20 @ + vadd.f32 d2, d16, d21 @ + vsub.f32 d6, d16, d21 @ + vswp d1, d2 + vswp d5, d6 + vstmia r2!, {q0-q3} + + add r2, r7, #0 + add r7, r9, #0 + add r9, r2, #0 + add r2, r8, #0 + add r8, r10, #0 + add r10, r2, #0 + ldr r11, [r1, #32] @ this is p->i1 + cmp r11, #0 + beq _neon_oo_loop_exit +_neon_oo_loop: + vld2.32 {q8}, [r6, :128]! + vld2.32 {q9}, [r5, :128]! + vld2.32 {q10}, [r4, :128]! + vld2.32 {q13}, [r3, :128]! + vadd.f32 q11, q9, q8 + vsub.f32 q8, q9, q8 + vsub.f32 q9, q13, q10 + vadd.f32 q12, q13, q10 + subs r11, r11, #1 + vld2.32 {q10}, [r7, :128]! + vld2.32 {q13}, [r9, :128]! + vsub.f32 q2, q12, q11 + vadd.f32 d7, d19, d16 @ + vsub.f32 d3, d19, d16 @ + vsub.f32 d6, d18, d17 @ + vadd.f32 d2, d18, d17 @ + vld2.32 {q9}, [r8, :128]! + vld2.32 {q8}, [r10, :128]! + vadd.f32 q0, q12, q11 + vadd.f32 q11, q13, q8 + vadd.f32 q12, q10, q9 + vsub.f32 q8, q13, q8 + vsub.f32 q9, q10, q9 + vsub.f32 q6, q12, q11 + vadd.f32 q4, q12, q11 + vtrn.32 q0, q2 + ldr r2, [r12], #4 + vadd.f32 d15, d19, d16 @ + ldr lr, [r12], #4 + vsub.f32 d11, d19, d16 @ + vsub.f32 d14, d18, d17 @ + vadd.f32 d10, d18, d17 @ + add r2, r0, r2, lsl #2 + vtrn.32 q1, q3 + add lr, r0, lr, lsl #2 + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_oo_loop +_neon_oo_loop_exit: + + + add r2, r3, #0 + add r3, r7, #0 + add r7, r2, #0 + add r2, r4, #0 + add r4, r8, #0 + add r8, r2, #0 + add r2, r5, #0 + add r5, r9, #0 + add r9, r2, #0 + add r2, r6, #0 + add r6, r10, #0 + add r10, r2, #0 + add r2, r9, #0 + add r9, r10, #0 + add r10, r2, #0 + ldr r2, [r1, #16] + ldr r11, [r1, #32] @ this is p->i1 + cmp r11, #0 + beq _neon_ee_loop2_exit + + vld1.32 {d16, d17}, [r2, :128] +_neon_ee_loop2: + vld2.32 {q15}, [r10, :128]! + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vld2.32 {q9}, [r4, :128]! + vld2.32 {q10}, [r3, :128]! + vld2.32 {q11}, [r6, :128]! + vld2.32 {q12}, [r5, :128]! + vsub.f32 q1, q14, q13 + vld2.32 {q0}, [r9, :128]! 
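+	@ _neon_ee_loop2 appears to repeat the same even/even butterfly as
+	@ _neon_ee_loop, re-run over the p->i1 blocks after the base pointers
+	@ were rotated above.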
+ subs r11, r11, #1 + vsub.f32 q2, q0, q15 + vadd.f32 q0, q0, q15 + vmul.f32 d10, d2, d17 + vmul.f32 d11, d3, d16 + vmul.f32 d12, d3, d17 + vmul.f32 d6, d4, d17 + vmul.f32 d7, d5, d16 + vmul.f32 d8, d4, d16 + vmul.f32 d9, d5, d17 + vmul.f32 d13, d2, d16 + vsub.f32 d7, d7, d6 + vadd.f32 d11, d11, d10 + vsub.f32 q1, q12, q11 + vsub.f32 q2, q10, q9 + vadd.f32 d6, d9, d8 + vadd.f32 q4, q14, q13 + vadd.f32 q11, q12, q11 + vadd.f32 q12, q10, q9 + vsub.f32 d10, d13, d12 + vsub.f32 q7, q4, q0 + vsub.f32 q9, q12, q11 + vsub.f32 q13, q5, q3 + vsub.f32 d29, d5, d2 @ + vadd.f32 q5, q5, q3 + vadd.f32 q10, q4, q0 + vadd.f32 q11, q12, q11 + vadd.f32 d31, d5, d2 @ + vadd.f32 d28, d4, d3 @ + vsub.f32 d30, d4, d3 @ + vsub.f32 d5, d19, d14 @ + vsub.f32 d7, d31, d26 @ + vadd.f32 q1, q14, q5 + vadd.f32 q0, q11, q10 + vadd.f32 d6, d30, d27 @ + vadd.f32 d4, d18, d15 @ + vadd.f32 d13, d19, d14 @ + vsub.f32 d12, d18, d15 @ + vadd.f32 d15, d31, d26 @ + ldr r2, [r12], #4 + vtrn.32 q1, q3 + ldr lr, [r12], #4 + vtrn.32 q0, q2 + add r2, r0, r2, lsl #2 + vsub.f32 q4, q11, q10 + add lr, r0, lr, lsl #2 + vsub.f32 q5, q14, q5 + vsub.f32 d14, d30, d27 @ + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_ee_loop2 +_neon_ee_loop2_exit: + + vldmia sp!, {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + + + + + .align 4 +#ifdef __APPLE__ + .globl _neon_static_o_f +_neon_static_o_f: +#else + .globl neon_static_o_f +neon_static_o_f: +#endif + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + vstmdb sp!, {d8-d15} + ldr lr, [r0, #40] @ this is p->N + add r3, r1, #0 + add r7, r1, lr + add r5, r7, lr + add r10, r5, lr + add r4, r10, lr + add r8, r4, lr + add r6, r8, lr + add r9, r6, lr + ldr r12, [r0] + add r1, r0, #0 + add r0, r2, #0 + ldr r2, [r1, #16] @ this is p->ee_ws + ldr r11, [r1, #28] @ this is p->i0 + + vld1.32 {d16, d17}, [r2, :128] +_neon_ee_o_loop: + vld2.32 {q15}, [r10, :128]! + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vld2.32 {q9}, [r4, :128]! + vld2.32 {q10}, [r3, :128]! + vld2.32 {q11}, [r6, :128]! + vld2.32 {q12}, [r5, :128]! + vsub.f32 q1, q14, q13 + vld2.32 {q0}, [r9, :128]! + subs r11, r11, #1 + vsub.f32 q2, q0, q15 + vadd.f32 q0, q0, q15 + vmul.f32 d10, d2, d17 + vmul.f32 d11, d3, d16 + vmul.f32 d12, d3, d17 + vmul.f32 d6, d4, d17 + vmul.f32 d7, d5, d16 + vmul.f32 d8, d4, d16 + vmul.f32 d9, d5, d17 + vmul.f32 d13, d2, d16 + vsub.f32 d7, d7, d6 + vadd.f32 d11, d11, d10 + vsub.f32 q1, q12, q11 + vsub.f32 q2, q10, q9 + vadd.f32 d6, d9, d8 + vadd.f32 q4, q14, q13 + vadd.f32 q11, q12, q11 + vadd.f32 q12, q10, q9 + vsub.f32 d10, d13, d12 + vsub.f32 q7, q4, q0 + vsub.f32 q9, q12, q11 + vsub.f32 q13, q5, q3 + vsub.f32 d29, d5, d2 @ + vadd.f32 q5, q5, q3 + vadd.f32 q10, q4, q0 + vadd.f32 q11, q12, q11 + vadd.f32 d31, d5, d2 @ + vadd.f32 d28, d4, d3 @ + vsub.f32 d30, d4, d3 @ + vsub.f32 d5, d19, d14 @ + vsub.f32 d7, d31, d26 @ + vadd.f32 q1, q14, q5 + vadd.f32 q0, q11, q10 + vadd.f32 d6, d30, d27 @ + vadd.f32 d4, d18, d15 @ + vadd.f32 d13, d19, d14 @ + vsub.f32 d12, d18, d15 @ + vadd.f32 d15, d31, d26 @ + ldr r2, [r12], #4 + vtrn.32 q1, q3 + ldr lr, [r12], #4 + vtrn.32 q0, q2 + add r2, r0, r2, lsl #2 + vsub.f32 q4, q11, q10 + add lr, r0, lr, lsl #2 + vsub.f32 q5, q14, q5 + vsub.f32 d14, d30, d27 @ + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! 
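+	@ Note the output addressing used by these leaf loops: each iteration
+	@ pops two 32-bit entries from the table at r12 (loaded from [r0] on
+	@ entry, presumably the plan's offset table), scales them by 4 and adds
+	@ the output base, so results are scattered straight into their
+	@ permuted positions in the output buffer.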
+ bne _neon_ee_o_loop + + add r2, r7, #0 + add r7, r9, #0 + add r9, r2, #0 + add r2, r8, #0 + add r8, r10, #0 + add r10, r2, #0 + ldr r11, [r1, #32] @ this is p->i1 + cmp r11, #0 + beq _neon_oo_o_loop_exit +_neon_oo_o_loop: + vld2.32 {q8}, [r6, :128]! + vld2.32 {q9}, [r5, :128]! + vld2.32 {q10}, [r4, :128]! + vld2.32 {q13}, [r3, :128]! + vadd.f32 q11, q9, q8 + vsub.f32 q8, q9, q8 + vsub.f32 q9, q13, q10 + vadd.f32 q12, q13, q10 + subs r11, r11, #1 + vld2.32 {q10}, [r7, :128]! + vld2.32 {q13}, [r9, :128]! + vsub.f32 q2, q12, q11 + vadd.f32 d7, d19, d16 @ + vsub.f32 d3, d19, d16 @ + vsub.f32 d6, d18, d17 @ + vadd.f32 d2, d18, d17 @ + vld2.32 {q9}, [r8, :128]! + vld2.32 {q8}, [r10, :128]! + vadd.f32 q0, q12, q11 + vadd.f32 q11, q13, q8 + vadd.f32 q12, q10, q9 + vsub.f32 q8, q13, q8 + vsub.f32 q9, q10, q9 + vsub.f32 q6, q12, q11 + vadd.f32 q4, q12, q11 + vtrn.32 q0, q2 + ldr r2, [r12], #4 + vadd.f32 d15, d19, d16 @ + ldr lr, [r12], #4 + vsub.f32 d11, d19, d16 @ + vsub.f32 d14, d18, d17 @ + vadd.f32 d10, d18, d17 @ + add r2, r0, r2, lsl #2 + vtrn.32 q1, q3 + add lr, r0, lr, lsl #2 + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_oo_o_loop +_neon_oo_o_loop_exit: + + ldr r11, [r1, #8] + vld1.32 {q8}, [r5, :128]! + vld1.32 {q10}, [r6, :128]! + vld2.32 {q11}, [r4, :128]! + vld2.32 {q13}, [r3, :128]! + vld2.32 {q15}, [r10, :128]! + vorr d25, d17, d17 + vorr d24, d20, d20 + vorr d20, d16, d16 + vsub.f32 q9, q13, q11 + vadd.f32 q11, q13, q11 + ldr r2, [r12], #4 + vtrn.32 d24, d25 + ldr lr, [r12], #4 + vtrn.32 d20, d21 + add r2, r0, r2, lsl #2 + vsub.f32 q8, q10, q12 + add lr, r0, lr, lsl #2 + vadd.f32 q10, q10, q12 + vadd.f32 q0, q11, q10 + vsub.f32 d25, d19, d16 @ + vadd.f32 d27, d19, d16 @ + vsub.f32 q1, q11, q10 + vadd.f32 d24, d18, d17 @ + vsub.f32 d26, d18, d17 @ + vtrn.32 q0, q12 + vtrn.32 q1, q13 + vld1.32 {d24, d25}, [r11, :128] + vswp d1, d2 + vst1.32 {q0, q1}, [r2, :128]! + vld2.32 {q0}, [r9, :128]! + vadd.f32 q1, q0, q15 + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vsub.f32 q15, q0, q15 + vsub.f32 q0, q14, q13 + vadd.f32 q3, q14, q13 + vadd.f32 q2, q3, q1 + vsub.f32 d29, d1, d30 @ + vadd.f32 d27, d1, d30 @ + vsub.f32 q3, q3, q1 + vadd.f32 d28, d0, d31 @ + vsub.f32 d26, d0, d31 @ + vtrn.32 q2, q14 + vtrn.32 q3, q13 + vswp d5, d6 + vst1.32 {q2, q3}, [r2, :128]! + vtrn.32 q11, q9 + vtrn.32 q10, q8 + vmul.f32 d20, d18, d25 + vmul.f32 d22, d19, d24 + vmul.f32 d21, d19, d25 + vmul.f32 d18, d18, d24 + vmul.f32 d19, d16, d25 + vmul.f32 d30, d17, d24 + vmul.f32 d23, d16, d24 + vmul.f32 d24, d17, d25 + vadd.f32 d17, d22, d20 + vsub.f32 d16, d18, d21 + vsub.f32 d21, d30, d19 + vadd.f32 d20, d24, d23 + vadd.f32 q9, q8, q10 + vsub.f32 q8, q8, q10 + vadd.f32 q4, q14, q9 + vsub.f32 q6, q14, q9 + vsub.f32 d11, d27, d16 @ + vadd.f32 d15, d27, d16 @ + vadd.f32 d10, d26, d17 @ + vsub.f32 d14, d26, d17 @ + vswp d9, d10 + vswp d13, d14 + vstmia lr!, {q4-q7} + + + add r2, r3, #0 + add r3, r7, #0 + add r7, r2, #0 + add r2, r4, #0 + add r4, r8, #0 + add r8, r2, #0 + add r2, r5, #0 + add r5, r9, #0 + add r9, r2, #0 + add r2, r6, #0 + add r6, r10, #0 + add r10, r2, #0 + add r2, r9, #0 + add r9, r10, #0 + add r10, r2, #0 + ldr r2, [r1, #16] + ldr r11, [r1, #32] @ this is p->i1 + cmp r11, #0 + beq _neon_ee_o_loop2_exit + + vld1.32 {d16, d17}, [r2, :128] +_neon_ee_o_loop2: + vld2.32 {q15}, [r10, :128]! + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! 
+ vld2.32 {q9}, [r4, :128]! + vld2.32 {q10}, [r3, :128]! + vld2.32 {q11}, [r6, :128]! + vld2.32 {q12}, [r5, :128]! + vsub.f32 q1, q14, q13 + vld2.32 {q0}, [r9, :128]! + subs r11, r11, #1 + vsub.f32 q2, q0, q15 + vadd.f32 q0, q0, q15 + vmul.f32 d10, d2, d17 + vmul.f32 d11, d3, d16 + vmul.f32 d12, d3, d17 + vmul.f32 d6, d4, d17 + vmul.f32 d7, d5, d16 + vmul.f32 d8, d4, d16 + vmul.f32 d9, d5, d17 + vmul.f32 d13, d2, d16 + vsub.f32 d7, d7, d6 + vadd.f32 d11, d11, d10 + vsub.f32 q1, q12, q11 + vsub.f32 q2, q10, q9 + vadd.f32 d6, d9, d8 + vadd.f32 q4, q14, q13 + vadd.f32 q11, q12, q11 + vadd.f32 q12, q10, q9 + vsub.f32 d10, d13, d12 + vsub.f32 q7, q4, q0 + vsub.f32 q9, q12, q11 + vsub.f32 q13, q5, q3 + vsub.f32 d29, d5, d2 @ + vadd.f32 q5, q5, q3 + vadd.f32 q10, q4, q0 + vadd.f32 q11, q12, q11 + vadd.f32 d31, d5, d2 @ + vadd.f32 d28, d4, d3 @ + vsub.f32 d30, d4, d3 @ + vsub.f32 d5, d19, d14 @ + vsub.f32 d7, d31, d26 @ + vadd.f32 q1, q14, q5 + vadd.f32 q0, q11, q10 + vadd.f32 d6, d30, d27 @ + vadd.f32 d4, d18, d15 @ + vadd.f32 d13, d19, d14 @ + vsub.f32 d12, d18, d15 @ + vadd.f32 d15, d31, d26 @ + ldr r2, [r12], #4 + vtrn.32 q1, q3 + ldr lr, [r12], #4 + vtrn.32 q0, q2 + add r2, r0, r2, lsl #2 + vsub.f32 q4, q11, q10 + add lr, r0, lr, lsl #2 + vsub.f32 q5, q14, q5 + vsub.f32 d14, d30, d27 @ + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_ee_o_loop2 +_neon_ee_o_loop2_exit: + + vldmia sp!, {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + + .align 4 +#ifdef __APPLE__ + .globl _neon_static_x4_f +_neon_static_x4_f: +#else + .globl neon_static_x4_f +neon_static_x4_f: +#endif +@ add r3, r0, #0 + push {r4, r5, r6, lr} + vstmdb sp!, {d8-d15} + + vld1.32 {q8,q9}, [r0, :128] + add r4, r0, r1, lsl #1 + vld1.32 {q10,q11}, [r4, :128] + add r5, r0, r1, lsl #2 + vld1.32 {q12,q13}, [r5, :128] + add r6, r4, r1, lsl #2 + vld1.32 {q14,q15}, [r6, :128] + vld1.32 {q2,q3}, [r2, :128] + + vmul.f32 q0, q13, q3 + vmul.f32 q5, q12, q2 + vmul.f32 q1, q14, q2 + vmul.f32 q4, q14, q3 + vmul.f32 q14, q12, q3 + vmul.f32 q13, q13, q2 + vmul.f32 q12, q15, q3 + vmul.f32 q2, q15, q2 + vsub.f32 q0, q5, q0 + vadd.f32 q13, q13, q14 + vadd.f32 q12, q12, q1 + vsub.f32 q1, q2, q4 + vadd.f32 q15, q0, q12 + vsub.f32 q12, q0, q12 + vadd.f32 q14, q13, q1 + vsub.f32 q13, q13, q1 + vadd.f32 q0, q8, q15 + vadd.f32 q1, q9, q14 + vadd.f32 q2, q10, q13 @ + vsub.f32 q4, q8, q15 + vsub.f32 q3, q11, q12 @ + vst1.32 {q0,q1}, [r0, :128] + vsub.f32 q5, q9, q14 + vsub.f32 q6, q10, q13 @ + vadd.f32 q7, q11, q12 @ + vst1.32 {q2,q3}, [r4, :128] + vst1.32 {q4,q5}, [r5, :128] + vst1.32 {q6,q7}, [r6, :128] + vldmia sp!, {d8-d15} + pop {r4, r5, r6, pc} + + + + .align 4 +#ifdef __APPLE__ + .globl _neon_static_x8_f +_neon_static_x8_f: +#else + .globl neon_static_x8_f +neon_static_x8_f: +#endif + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + vstmdb sp!, {d8-d15} + mov r11, #0 + add r3, r0, #0 @ data0 + add r5, r0, r1, lsl #1 @ data2 + add r4, r0, r1 @ data1 + add r7, r5, r1, lsl #1 @ data4 + add r6, r5, r1 @ data3 + add r9, r7, r1, lsl #1 @ data6 + add r8, r7, r1 @ data5 + add r10, r9, r1 @ data7 + add r12, r2, #0 @ LUT + + sub r11, r11, r1, lsr #5 +neon_x8_loop: + vld1.32 {q2,q3}, [r12, :128]! 
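+	@ neon_static_x8_f appears to perform one size-8 cross butterfly per
+	@ iteration: r3..r10 point at eight rows spaced r1 bytes apart, r12
+	@ walks the twiddle LUT (three q-register pairs per pass), and the
+	@ counter is initialised to -(r1 >> 5) so the adds/bne pair iterates
+	@ once per 32-byte chunk of the stride.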
+ vld1.32 {q14,q15}, [r6, :128] + vld1.32 {q10,q11}, [r5, :128] + adds r11, r11, #1 + vmul.f32 q12, q15, q2 + vmul.f32 q8, q14, q3 + vmul.f32 q13, q14, q2 + vmul.f32 q9, q10, q3 + vmul.f32 q1, q10, q2 + vmul.f32 q0, q11, q2 + vmul.f32 q14, q11, q3 + vmul.f32 q15, q15, q3 + vld1.32 {q2,q3}, [r12, :128]! + vsub.f32 q10, q12, q8 + vadd.f32 q11, q0, q9 + vadd.f32 q8, q15, q13 + vld1.32 {q12,q13}, [r4, :128] + vsub.f32 q9, q1, q14 + vsub.f32 q15, q11, q10 + vsub.f32 q14, q9, q8 + vadd.f32 q4, q12, q15 @ + vsub.f32 q6, q12, q15 @ + vsub.f32 q5, q13, q14 @ + vadd.f32 q7, q13, q14 @ + vld1.32 {q14,q15}, [r9, :128] + vld1.32 {q12,q13}, [r7, :128] + vmul.f32 q1, q14, q2 + vmul.f32 q0, q14, q3 + vst1.32 {q4,q5}, [r4, :128] + vmul.f32 q14, q15, q3 + vmul.f32 q4, q15, q2 + vadd.f32 q15, q9, q8 + vst1.32 {q6,q7}, [r6, :128] + vmul.f32 q8, q12, q3 + vmul.f32 q5, q13, q3 + vmul.f32 q12, q12, q2 + vmul.f32 q9, q13, q2 + vadd.f32 q14, q14, q1 + vsub.f32 q13, q4, q0 + vadd.f32 q0, q9, q8 + vld1.32 {q8,q9}, [r3, :128] + vadd.f32 q1, q11, q10 + vsub.f32 q12, q12, q5 + vadd.f32 q11, q8, q15 + vsub.f32 q8, q8, q15 + vadd.f32 q2, q12, q14 + vsub.f32 q10, q0, q13 + vadd.f32 q15, q0, q13 + vadd.f32 q13, q9, q1 + vsub.f32 q9, q9, q1 + vsub.f32 q12, q12, q14 + vadd.f32 q0, q11, q2 + vadd.f32 q1, q13, q15 + vsub.f32 q4, q11, q2 + vadd.f32 q2, q8, q10 @ + vsub.f32 q3, q9, q12 @ + vst1.32 {q0,q1}, [r3, :128]! + vsub.f32 q5, q13, q15 + vld1.32 {q14,q15}, [r10, :128] + vadd.f32 q7, q9, q12 @ + vld1.32 {q12,q13}, [r8, :128] + vst1.32 {q2,q3}, [r5, :128]! + vld1.32 {q2,q3}, [r12, :128]! + vsub.f32 q6, q8, q10 @ + vmul.f32 q8, q14, q2 + vst1.32 {q4,q5}, [r7, :128]! + vmul.f32 q10, q15, q3 + vmul.f32 q9, q13, q3 + vmul.f32 q11, q12, q2 + vmul.f32 q14, q14, q3 + vst1.32 {q6,q7}, [r9, :128]! + vmul.f32 q15, q15, q2 + vmul.f32 q12, q12, q3 + vmul.f32 q13, q13, q2 + vadd.f32 q10, q10, q8 + vsub.f32 q11, q11, q9 + vld1.32 {q8,q9}, [r4, :128] + vsub.f32 q14, q15, q14 + vadd.f32 q15, q13, q12 + vadd.f32 q13, q11, q10 + vadd.f32 q12, q15, q14 + vsub.f32 q15, q15, q14 + vsub.f32 q14, q11, q10 + vld1.32 {q10,q11}, [r6, :128] + vadd.f32 q0, q8, q13 + vadd.f32 q1, q9, q12 + vadd.f32 q2, q10, q15 @ + vsub.f32 q3, q11, q14 @ + vsub.f32 q4, q8, q13 + vst1.32 {q0,q1}, [r4, :128]! + vsub.f32 q5, q9, q12 + vsub.f32 q6, q10, q15 @ + vst1.32 {q2,q3}, [r6, :128]! + vadd.f32 q7, q11, q14 @ + vst1.32 {q4,q5}, [r8, :128]! + vst1.32 {q6,q7}, [r10, :128]! + bne neon_x8_loop + + vldmia sp!, {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + + .align 4 +#ifdef __APPLE__ + .globl _neon_static_x8_t_f +_neon_static_x8_t_f: +#else + .globl neon_static_x8_t_f +neon_static_x8_t_f: +#endif + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + vstmdb sp!, {d8-d15} + mov r11, #0 + add r3, r0, #0 @ data0 + add r5, r0, r1, lsl #1 @ data2 + add r4, r0, r1 @ data1 + add r7, r5, r1, lsl #1 @ data4 + add r6, r5, r1 @ data3 + add r9, r7, r1, lsl #1 @ data6 + add r8, r7, r1 @ data5 + add r10, r9, r1 @ data7 + add r12, r2, #0 @ LUT + + sub r11, r11, r1, lsr #5 +neon_x8_t_loop: + vld1.32 {q2,q3}, [r12, :128]! + vld1.32 {q14,q15}, [r6, :128] + vld1.32 {q10,q11}, [r5, :128] + adds r11, r11, #1 + vmul.f32 q12, q15, q2 + vmul.f32 q8, q14, q3 + vmul.f32 q13, q14, q2 + vmul.f32 q9, q10, q3 + vmul.f32 q1, q10, q2 + vmul.f32 q0, q11, q2 + vmul.f32 q14, q11, q3 + vmul.f32 q15, q15, q3 + vld1.32 {q2,q3}, [r12, :128]! 
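+	@ This _t ("transposed") variant repeats the x8 butterfly above almost
+	@ verbatim; the visible difference is that the final results are written
+	@ back with interleaving vst2.32 stores instead of vst1.32, folding the
+	@ last re/im transpose into this pass.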
+ vsub.f32 q10, q12, q8 + vadd.f32 q11, q0, q9 + vadd.f32 q8, q15, q13 + vld1.32 {q12,q13}, [r4, :128] + vsub.f32 q9, q1, q14 + vsub.f32 q15, q11, q10 + vsub.f32 q14, q9, q8 + vadd.f32 q4, q12, q15 @ + vsub.f32 q6, q12, q15 @ + vsub.f32 q5, q13, q14 @ + vadd.f32 q7, q13, q14 @ + vld1.32 {q14,q15}, [r9, :128] + vld1.32 {q12,q13}, [r7, :128] + vmul.f32 q1, q14, q2 + vmul.f32 q0, q14, q3 + vst1.32 {q4,q5}, [r4, :128] + vmul.f32 q14, q15, q3 + vmul.f32 q4, q15, q2 + vadd.f32 q15, q9, q8 + vst1.32 {q6,q7}, [r6, :128] + vmul.f32 q8, q12, q3 + vmul.f32 q5, q13, q3 + vmul.f32 q12, q12, q2 + vmul.f32 q9, q13, q2 + vadd.f32 q14, q14, q1 + vsub.f32 q13, q4, q0 + vadd.f32 q0, q9, q8 + vld1.32 {q8,q9}, [r3, :128] + vadd.f32 q1, q11, q10 + vsub.f32 q12, q12, q5 + vadd.f32 q11, q8, q15 + vsub.f32 q8, q8, q15 + vadd.f32 q2, q12, q14 + vsub.f32 q10, q0, q13 + vadd.f32 q15, q0, q13 + vadd.f32 q13, q9, q1 + vsub.f32 q9, q9, q1 + vsub.f32 q12, q12, q14 + vadd.f32 q0, q11, q2 + vadd.f32 q1, q13, q15 + vsub.f32 q4, q11, q2 + vadd.f32 q2, q8, q10 @ + vsub.f32 q3, q9, q12 @ + vst2.32 {q0,q1}, [r3, :128]! + vsub.f32 q5, q13, q15 + vld1.32 {q14,q15}, [r10, :128] + vadd.f32 q7, q9, q12 @ + vld1.32 {q12,q13}, [r8, :128] + vst2.32 {q2,q3}, [r5, :128]! + vld1.32 {q2,q3}, [r12, :128]! + vsub.f32 q6, q8, q10 @ + vmul.f32 q8, q14, q2 + vst2.32 {q4,q5}, [r7, :128]! + vmul.f32 q10, q15, q3 + vmul.f32 q9, q13, q3 + vmul.f32 q11, q12, q2 + vmul.f32 q14, q14, q3 + vst2.32 {q6,q7}, [r9, :128]! + vmul.f32 q15, q15, q2 + vmul.f32 q12, q12, q3 + vmul.f32 q13, q13, q2 + vadd.f32 q10, q10, q8 + vsub.f32 q11, q11, q9 + vld1.32 {q8,q9}, [r4, :128] + vsub.f32 q14, q15, q14 + vadd.f32 q15, q13, q12 + vadd.f32 q13, q11, q10 + vadd.f32 q12, q15, q14 + vsub.f32 q15, q15, q14 + vsub.f32 q14, q11, q10 + vld1.32 {q10,q11}, [r6, :128] + vadd.f32 q0, q8, q13 + vadd.f32 q1, q9, q12 + vadd.f32 q2, q10, q15 @ + vsub.f32 q3, q11, q14 @ + vsub.f32 q4, q8, q13 + vst2.32 {q0,q1}, [r4, :128]! + vsub.f32 q5, q9, q12 + vsub.f32 q6, q10, q15 @ + vst2.32 {q2,q3}, [r6, :128]! + vadd.f32 q7, q11, q14 @ + vst2.32 {q4,q5}, [r8, :128]! + vst2.32 {q6,q7}, [r10, :128]! + bne neon_x8_t_loop + + vldmia sp!, {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + + diff --git a/lib/ffts/src/neon_static_i.s b/lib/ffts/src/neon_static_i.s new file mode 100644 index 0000000..cfa766c --- /dev/null +++ b/lib/ffts/src/neon_static_i.s @@ -0,0 +1,955 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + .align 4 +#ifdef __APPLE__ + .globl _neon_static_e_i +_neon_static_e_i: +#else + .globl neon_static_e_i +neon_static_e_i: +#endif + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + vstmdb sp!, {d8-d15} + ldr lr, [r0, #40] @ this is p->N + add r3, r1, #0 + add r7, r1, lr + add r5, r7, lr + add r10, r5, lr + add r4, r10, lr + add r8, r4, lr + add r6, r8, lr + add r9, r6, lr + ldr r12, [r0] + add r1, r0, #0 + add r0, r2, #0 + ldr r2, [r1, #16] @ this is p->ee_ws + ldr r11, [r1, #28] @ this is p->i0 + + vld1.32 {d16, d17}, [r2, :128] +_neon_ee_loop: + vld2.32 {q15}, [r10, :128]! + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vld2.32 {q9}, [r4, :128]! + vld2.32 {q10}, [r3, :128]! + vld2.32 {q11}, [r6, :128]! + vld2.32 {q12}, [r5, :128]! + vsub.f32 q1, q14, q13 + vld2.32 {q0}, [r9, :128]! + subs r11, r11, #1 + vsub.f32 q2, q0, q15 + vadd.f32 q0, q0, q15 + vmul.f32 d10, d2, d17 + vmul.f32 d11, d3, d16 + vmul.f32 d12, d3, d17 + vmul.f32 d6, d4, d17 + vmul.f32 d7, d5, d16 + vmul.f32 d8, d4, d16 + vmul.f32 d9, d5, d17 + vmul.f32 d13, d2, d16 + vsub.f32 d7, d7, d6 + vadd.f32 d11, d11, d10 + vsub.f32 q1, q12, q11 + vsub.f32 q2, q10, q9 + vadd.f32 d6, d9, d8 + vadd.f32 q4, q14, q13 + vadd.f32 q11, q12, q11 + vadd.f32 q12, q10, q9 + vsub.f32 d10, d13, d12 + vsub.f32 q7, q4, q0 + vsub.f32 q9, q12, q11 + vsub.f32 q13, q5, q3 + vadd.f32 d29, d5, d2 @ + vadd.f32 q5, q5, q3 + vadd.f32 q10, q4, q0 + vadd.f32 q11, q12, q11 + vsub.f32 d31, d5, d2 @ + vsub.f32 d28, d4, d3 @ + vadd.f32 d30, d4, d3 @ + vadd.f32 d5, d19, d14 @ + vadd.f32 d7, d31, d26 @ + vadd.f32 q1, q14, q5 + vadd.f32 q0, q11, q10 + vsub.f32 d6, d30, d27 @ + vsub.f32 d4, d18, d15 @ + vsub.f32 d13, d19, d14 @ + vadd.f32 d12, d18, d15 @ + vsub.f32 d15, d31, d26 @ + ldr r2, [r12], #4 + vtrn.32 q1, q3 + ldr lr, [r12], #4 + vtrn.32 q0, q2 + add r2, r0, r2, lsl #2 + vsub.f32 q4, q11, q10 + add lr, r0, lr, lsl #2 + vsub.f32 q5, q14, q5 + vadd.f32 d14, d30, d27 @ + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_ee_loop + + ldr r11, [r1, #12] + vld2.32 {q9}, [r5, :128]! @tag2 + vld2.32 {q13}, [r3, :128]! @tag0 + vld2.32 {q12}, [r4, :128]! @tag1 + vld2.32 {q0}, [r7, :128]! @tag4 + vsub.f32 q11, q13, q12 + vld2.32 {q8}, [r6, :128]! @tag3 + vadd.f32 q12, q13, q12 + vsub.f32 q10, q9, q8 + vadd.f32 q8, q9, q8 + vadd.f32 q9, q12, q8 + vadd.f32 d9, d23, d20 @ + vsub.f32 d11, d23, d20 @ + vsub.f32 q8, q12, q8 + vsub.f32 d8, d22, d21 @ + vadd.f32 d10, d22, d21 @ + ldr r2, [r12], #4 + vld1.32 {d20, d21}, [r11, :128] + ldr lr, [r12], #4 + vtrn.32 q9, q4 + add r2, r0, r2, lsl #2 + vtrn.32 q8, q5 + add lr, r0, lr, lsl #2 + vswp d9,d10 + vst1.32 {d8,d9,d10,d11}, [lr, :128]! 
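+	@ neon_static_e_i mirrors neon_static_e_f almost line for line; the
+	@ @-marked vadd/vsub pairs have their signs swapped relative to the
+	@ forward kernel, which matches the sign flip on the imaginary cross
+	@ terms one would expect for the inverse (conjugate-twiddle) transform.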
+ vld2.32 {q13}, [r10, :128]! @tag7 + vld2.32 {q15}, [r9, :128]! @tag6 + vld2.32 {q11}, [r8, :128]! @tag5 + vsub.f32 q14, q15, q13 + vsub.f32 q12, q0, q11 + vadd.f32 q11, q0, q11 + vadd.f32 q13, q15, q13 + vadd.f32 d13, d29, d24 @ + vadd.f32 q15, q13, q11 + vsub.f32 d12, d28, d25 @ + vsub.f32 d15, d29, d24 @ + vadd.f32 d14, d28, d25 @ + vtrn.32 q15, q6 + vsub.f32 q15, q13, q11 + vtrn.32 q15, q7 + vswp d13, d14 + vst1.32 {d12,d13,d14,d15}, [lr, :128]! + vtrn.32 q13, q14 + vtrn.32 q11, q12 + vmul.f32 d24, d26, d21 + vmul.f32 d28, d27, d20 + vmul.f32 d25, d26, d20 + vmul.f32 d26, d27, d21 + vmul.f32 d27, d22, d21 + vmul.f32 d30, d23, d20 + vmul.f32 d29, d23, d21 + vmul.f32 d22, d22, d20 + vsub.f32 d21, d28, d24 + vadd.f32 d20, d26, d25 + vadd.f32 d25, d30, d27 + vsub.f32 d24, d22, d29 + vadd.f32 q11, q12, q10 + vsub.f32 q10, q12, q10 + vadd.f32 q0, q9, q11 + vsub.f32 q2, q9, q11 + vadd.f32 d3, d17, d20 @ + vsub.f32 d7, d17, d20 @ + vsub.f32 d2, d16, d21 @ + vadd.f32 d6, d16, d21 @ + vswp d1, d2 + vswp d5, d6 + vstmia r2!, {q0-q3} + + add r2, r7, #0 + add r7, r9, #0 + add r9, r2, #0 + add r2, r8, #0 + add r8, r10, #0 + add r10, r2, #0 + ldr r11, [r1, #32] @ this is p->i1 + cmp r11, #0 + beq _neon_oo_loop_exit +_neon_oo_loop: + vld2.32 {q8}, [r6, :128]! + vld2.32 {q9}, [r5, :128]! + vld2.32 {q10}, [r4, :128]! + vld2.32 {q13}, [r3, :128]! + vadd.f32 q11, q9, q8 + vsub.f32 q8, q9, q8 + vsub.f32 q9, q13, q10 + vadd.f32 q12, q13, q10 + subs r11, r11, #1 + vld2.32 {q10}, [r7, :128]! + vld2.32 {q13}, [r9, :128]! + vsub.f32 q2, q12, q11 + vsub.f32 d7, d19, d16 @ + vadd.f32 d3, d19, d16 @ + vadd.f32 d6, d18, d17 @ + vsub.f32 d2, d18, d17 @ + vld2.32 {q9}, [r8, :128]! + vld2.32 {q8}, [r10, :128]! + vadd.f32 q0, q12, q11 + vadd.f32 q11, q13, q8 + vadd.f32 q12, q10, q9 + vsub.f32 q8, q13, q8 + vsub.f32 q9, q10, q9 + vsub.f32 q6, q12, q11 + vadd.f32 q4, q12, q11 + vtrn.32 q0, q2 + ldr r2, [r12], #4 + vsub.f32 d15, d19, d16 @ + ldr lr, [r12], #4 + vadd.f32 d11, d19, d16 @ + vadd.f32 d14, d18, d17 @ + vsub.f32 d10, d18, d17 @ + add r2, r0, r2, lsl #2 + vtrn.32 q1, q3 + add lr, r0, lr, lsl #2 + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_oo_loop +_neon_oo_loop_exit: + + add r2, r3, #0 + add r3, r7, #0 + add r7, r2, #0 + add r2, r4, #0 + add r4, r8, #0 + add r8, r2, #0 + add r2, r5, #0 + add r5, r9, #0 + add r9, r2, #0 + add r2, r6, #0 + add r6, r10, #0 + add r10, r2, #0 + add r2, r9, #0 + add r9, r10, #0 + add r10, r2, #0 + ldr r2, [r1, #16] + ldr r11, [r1, #32] @ this is p->i1 + cmp r11, #0 + beq _neon_ee_loop2_exit + + vld1.32 {d16, d17}, [r2, :128] +_neon_ee_loop2: + vld2.32 {q15}, [r10, :128]! + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vld2.32 {q9}, [r4, :128]! + vld2.32 {q10}, [r3, :128]! + vld2.32 {q11}, [r6, :128]! + vld2.32 {q12}, [r5, :128]! + vsub.f32 q1, q14, q13 + vld2.32 {q0}, [r9, :128]! 
+ subs r11, r11, #1 + vsub.f32 q2, q0, q15 + vadd.f32 q0, q0, q15 + vmul.f32 d10, d2, d17 + vmul.f32 d11, d3, d16 + vmul.f32 d12, d3, d17 + vmul.f32 d6, d4, d17 + vmul.f32 d7, d5, d16 + vmul.f32 d8, d4, d16 + vmul.f32 d9, d5, d17 + vmul.f32 d13, d2, d16 + vsub.f32 d7, d7, d6 + vadd.f32 d11, d11, d10 + vsub.f32 q1, q12, q11 + vsub.f32 q2, q10, q9 + vadd.f32 d6, d9, d8 + vadd.f32 q4, q14, q13 + vadd.f32 q11, q12, q11 + vadd.f32 q12, q10, q9 + vsub.f32 d10, d13, d12 + vsub.f32 q7, q4, q0 + vsub.f32 q9, q12, q11 + vsub.f32 q13, q5, q3 + vadd.f32 d29, d5, d2 @ + vadd.f32 q5, q5, q3 + vadd.f32 q10, q4, q0 + vadd.f32 q11, q12, q11 + vsub.f32 d31, d5, d2 @ + vsub.f32 d28, d4, d3 @ + vadd.f32 d30, d4, d3 @ + vadd.f32 d5, d19, d14 @ + vadd.f32 d7, d31, d26 @ + vadd.f32 q1, q14, q5 + vadd.f32 q0, q11, q10 + vsub.f32 d6, d30, d27 @ + vsub.f32 d4, d18, d15 @ + vsub.f32 d13, d19, d14 @ + vadd.f32 d12, d18, d15 @ + vsub.f32 d15, d31, d26 @ + ldr r2, [r12], #4 + vtrn.32 q1, q3 + ldr lr, [r12], #4 + vtrn.32 q0, q2 + add r2, r0, r2, lsl #2 + vsub.f32 q4, q11, q10 + add lr, r0, lr, lsl #2 + vsub.f32 q5, q14, q5 + vadd.f32 d14, d30, d27 @ + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_ee_loop2 +_neon_ee_loop2_exit: + + vldmia sp!, {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + + + + + .align 4 +#ifdef __APPLE__ + .globl _neon_static_o_i +_neon_static_o_i: +#else + .globl neon_static_o_i +neon_static_o_i: +#endif + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + vstmdb sp!, {d8-d15} + ldr lr, [r0, #40] @ this is p->N + add r3, r1, #0 + add r7, r1, lr + add r5, r7, lr + add r10, r5, lr + add r4, r10, lr + add r8, r4, lr + add r6, r8, lr + add r9, r6, lr + ldr r12, [r0] + add r1, r0, #0 + add r0, r2, #0 + ldr r2, [r1, #16] @ this is p->ee_ws + ldr r11, [r1, #28] @ this is p->i0 + + vld1.32 {d16, d17}, [r2, :128] +_neon_ee_o_loop: + vld2.32 {q15}, [r10, :128]! + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vld2.32 {q9}, [r4, :128]! + vld2.32 {q10}, [r3, :128]! + vld2.32 {q11}, [r6, :128]! + vld2.32 {q12}, [r5, :128]! + vsub.f32 q1, q14, q13 + vld2.32 {q0}, [r9, :128]! + subs r11, r11, #1 + vsub.f32 q2, q0, q15 + vadd.f32 q0, q0, q15 + vmul.f32 d10, d2, d17 + vmul.f32 d11, d3, d16 + vmul.f32 d12, d3, d17 + vmul.f32 d6, d4, d17 + vmul.f32 d7, d5, d16 + vmul.f32 d8, d4, d16 + vmul.f32 d9, d5, d17 + vmul.f32 d13, d2, d16 + vsub.f32 d7, d7, d6 + vadd.f32 d11, d11, d10 + vsub.f32 q1, q12, q11 + vsub.f32 q2, q10, q9 + vadd.f32 d6, d9, d8 + vadd.f32 q4, q14, q13 + vadd.f32 q11, q12, q11 + vadd.f32 q12, q10, q9 + vsub.f32 d10, d13, d12 + vsub.f32 q7, q4, q0 + vsub.f32 q9, q12, q11 + vsub.f32 q13, q5, q3 + vadd.f32 d29, d5, d2 @ + vadd.f32 q5, q5, q3 + vadd.f32 q10, q4, q0 + vadd.f32 q11, q12, q11 + vsub.f32 d31, d5, d2 @ + vsub.f32 d28, d4, d3 @ + vadd.f32 d30, d4, d3 @ + vadd.f32 d5, d19, d14 @ + vadd.f32 d7, d31, d26 @ + vadd.f32 q1, q14, q5 + vadd.f32 q0, q11, q10 + vsub.f32 d6, d30, d27 @ + vsub.f32 d4, d18, d15 @ + vsub.f32 d13, d19, d14 @ + vadd.f32 d12, d18, d15 @ + vsub.f32 d15, d31, d26 @ + ldr r2, [r12], #4 + vtrn.32 q1, q3 + ldr lr, [r12], #4 + vtrn.32 q0, q2 + add r2, r0, r2, lsl #2 + vsub.f32 q4, q11, q10 + add lr, r0, lr, lsl #2 + vsub.f32 q5, q14, q5 + vadd.f32 d14, d30, d27 @ + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! 
+ bne _neon_ee_o_loop + + add r2, r7, #0 + add r7, r9, #0 + add r9, r2, #0 + add r2, r8, #0 + add r8, r10, #0 + add r10, r2, #0 + ldr r11, [r1, #32] @ this is p->i1 + cmp r11, #0 + beq _neon_oo_o_loop_exit +_neon_oo_o_loop: + vld2.32 {q8}, [r6, :128]! + vld2.32 {q9}, [r5, :128]! + vld2.32 {q10}, [r4, :128]! + vld2.32 {q13}, [r3, :128]! + vadd.f32 q11, q9, q8 + vsub.f32 q8, q9, q8 + vsub.f32 q9, q13, q10 + vadd.f32 q12, q13, q10 + subs r11, r11, #1 + vld2.32 {q10}, [r7, :128]! + vld2.32 {q13}, [r9, :128]! + vsub.f32 q2, q12, q11 + vsub.f32 d7, d19, d16 @ + vadd.f32 d3, d19, d16 @ + vadd.f32 d6, d18, d17 @ + vsub.f32 d2, d18, d17 @ + vld2.32 {q9}, [r8, :128]! + vld2.32 {q8}, [r10, :128]! + vadd.f32 q0, q12, q11 + vadd.f32 q11, q13, q8 + vadd.f32 q12, q10, q9 + vsub.f32 q8, q13, q8 + vsub.f32 q9, q10, q9 + vsub.f32 q6, q12, q11 + vadd.f32 q4, q12, q11 + vtrn.32 q0, q2 + ldr r2, [r12], #4 + vsub.f32 d15, d19, d16 @ + ldr lr, [r12], #4 + vadd.f32 d11, d19, d16 @ + vadd.f32 d14, d18, d17 @ + vsub.f32 d10, d18, d17 @ + add r2, r0, r2, lsl #2 + vtrn.32 q1, q3 + add lr, r0, lr, lsl #2 + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_oo_o_loop +_neon_oo_o_loop_exit: + + ldr r11, [r1, #8] + vld1.32 {q8}, [r5, :128]! + vld1.32 {q10}, [r6, :128]! + vld2.32 {q11}, [r4, :128]! + vld2.32 {q13}, [r3, :128]! + vld2.32 {q15}, [r10, :128]! + vorr d25, d17, d17 + vorr d24, d20, d20 + vorr d20, d16, d16 + vsub.f32 q9, q13, q11 + vadd.f32 q11, q13, q11 + ldr r2, [r12], #4 + vtrn.32 d24, d25 + ldr lr, [r12], #4 + vtrn.32 d20, d21 + add r2, r0, r2, lsl #2 + vsub.f32 q8, q10, q12 + add lr, r0, lr, lsl #2 + vadd.f32 q10, q10, q12 + vadd.f32 q0, q11, q10 + vadd.f32 d25, d19, d16 @ + vsub.f32 d27, d19, d16 @ + vsub.f32 q1, q11, q10 + vsub.f32 d24, d18, d17 @ + vadd.f32 d26, d18, d17 @ + vtrn.32 q0, q12 + vtrn.32 q1, q13 + vld1.32 {d24, d25}, [r11, :128] + vswp d1, d2 + vst1.32 {q0, q1}, [r2, :128]! + vld2.32 {q0}, [r9, :128]! + vadd.f32 q1, q0, q15 + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vsub.f32 q15, q0, q15 + vsub.f32 q0, q14, q13 + vadd.f32 q3, q14, q13 + vadd.f32 q2, q3, q1 + vadd.f32 d29, d1, d30 @ + vsub.f32 d27, d1, d30 @ + vsub.f32 q3, q3, q1 + vsub.f32 d28, d0, d31 @ + vadd.f32 d26, d0, d31 @ + vtrn.32 q2, q14 + vtrn.32 q3, q13 + vswp d5, d6 + vst1.32 {q2, q3}, [r2, :128]! + vtrn.32 q11, q9 + vtrn.32 q10, q8 + vmul.f32 d20, d18, d25 + vmul.f32 d22, d19, d24 + vmul.f32 d21, d19, d25 + vmul.f32 d18, d18, d24 + vmul.f32 d19, d16, d25 + vmul.f32 d30, d17, d24 + vmul.f32 d23, d16, d24 + vmul.f32 d24, d17, d25 + vadd.f32 d17, d22, d20 + vsub.f32 d16, d18, d21 + vsub.f32 d21, d30, d19 + vadd.f32 d20, d24, d23 + vadd.f32 q9, q8, q10 + vsub.f32 q8, q8, q10 + vadd.f32 q4, q14, q9 + vsub.f32 q6, q14, q9 + vadd.f32 d11, d27, d16 @ + vsub.f32 d15, d27, d16 @ + vsub.f32 d10, d26, d17 @ + vadd.f32 d14, d26, d17 @ + vswp d9, d10 + vswp d13, d14 + vstmia lr!, {q4-q7} + + + add r2, r3, #0 + add r3, r7, #0 + add r7, r2, #0 + add r2, r4, #0 + add r4, r8, #0 + add r8, r2, #0 + add r2, r5, #0 + add r5, r9, #0 + add r9, r2, #0 + add r2, r6, #0 + add r6, r10, #0 + add r10, r2, #0 + add r2, r9, #0 + add r9, r10, #0 + add r10, r2, #0 + ldr r2, [r1, #16] + ldr r11, [r1, #32] @ this is p->i1 + cmp r11, #0 + beq _neon_ee_o_loop2_exit + + vld1.32 {d16, d17}, [r2, :128] +_neon_ee_o_loop2: + vld2.32 {q15}, [r10, :128]! + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! 
+ vld2.32 {q9}, [r4, :128]! + vld2.32 {q10}, [r3, :128]! + vld2.32 {q11}, [r6, :128]! + vld2.32 {q12}, [r5, :128]! + vsub.f32 q1, q14, q13 + vld2.32 {q0}, [r9, :128]! + subs r11, r11, #1 + vsub.f32 q2, q0, q15 + vadd.f32 q0, q0, q15 + vmul.f32 d10, d2, d17 + vmul.f32 d11, d3, d16 + vmul.f32 d12, d3, d17 + vmul.f32 d6, d4, d17 + vmul.f32 d7, d5, d16 + vmul.f32 d8, d4, d16 + vmul.f32 d9, d5, d17 + vmul.f32 d13, d2, d16 + vsub.f32 d7, d7, d6 + vadd.f32 d11, d11, d10 + vsub.f32 q1, q12, q11 + vsub.f32 q2, q10, q9 + vadd.f32 d6, d9, d8 + vadd.f32 q4, q14, q13 + vadd.f32 q11, q12, q11 + vadd.f32 q12, q10, q9 + vsub.f32 d10, d13, d12 + vsub.f32 q7, q4, q0 + vsub.f32 q9, q12, q11 + vsub.f32 q13, q5, q3 + vadd.f32 d29, d5, d2 @ + vadd.f32 q5, q5, q3 + vadd.f32 q10, q4, q0 + vadd.f32 q11, q12, q11 + vsub.f32 d31, d5, d2 @ + vsub.f32 d28, d4, d3 @ + vadd.f32 d30, d4, d3 @ + vadd.f32 d5, d19, d14 @ + vadd.f32 d7, d31, d26 @ + vadd.f32 q1, q14, q5 + vadd.f32 q0, q11, q10 + vsub.f32 d6, d30, d27 @ + vsub.f32 d4, d18, d15 @ + vsub.f32 d13, d19, d14 @ + vadd.f32 d12, d18, d15 @ + vsub.f32 d15, d31, d26 @ + ldr r2, [r12], #4 + vtrn.32 q1, q3 + ldr lr, [r12], #4 + vtrn.32 q0, q2 + add r2, r0, r2, lsl #2 + vsub.f32 q4, q11, q10 + add lr, r0, lr, lsl #2 + vsub.f32 q5, q14, q5 + vadd.f32 d14, d30, d27 @ + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + bne _neon_ee_o_loop2 +_neon_ee_o_loop2_exit: + + vldmia sp!, {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + + .align 4 +#ifdef __APPLE__ + .globl _neon_static_x4_i +_neon_static_x4_i: +#else + .globl neon_static_x4_i +neon_static_x4_i: +#endif +@ add r3, r0, #0 + push {r4, r5, r6, lr} + vstmdb sp!, {d8-d15} + + vld1.32 {q8,q9}, [r0, :128] + add r4, r0, r1, lsl #1 + vld1.32 {q10,q11}, [r4, :128] + add r5, r0, r1, lsl #2 + vld1.32 {q12,q13}, [r5, :128] + add r6, r4, r1, lsl #2 + vld1.32 {q14,q15}, [r6, :128] + vld1.32 {q2,q3}, [r2, :128] + + vmul.f32 q0, q13, q3 + vmul.f32 q5, q12, q2 + vmul.f32 q1, q14, q2 + vmul.f32 q4, q14, q3 + vmul.f32 q14, q12, q3 + vmul.f32 q13, q13, q2 + vmul.f32 q12, q15, q3 + vmul.f32 q2, q15, q2 + vsub.f32 q0, q5, q0 + vadd.f32 q13, q13, q14 + vadd.f32 q12, q12, q1 + vsub.f32 q1, q2, q4 + vadd.f32 q15, q0, q12 + vsub.f32 q12, q0, q12 + vadd.f32 q14, q13, q1 + vsub.f32 q13, q13, q1 + vadd.f32 q0, q8, q15 + vadd.f32 q1, q9, q14 + vsub.f32 q2, q10, q13 @ + vsub.f32 q4, q8, q15 + vadd.f32 q3, q11, q12 @ + vst1.32 {q0,q1}, [r0, :128] + vsub.f32 q5, q9, q14 + vadd.f32 q6, q10, q13 @ + vsub.f32 q7, q11, q12 @ + vst1.32 {q2,q3}, [r4, :128] + vst1.32 {q4,q5}, [r5, :128] + vst1.32 {q6,q7}, [r6, :128] + vldmia sp!, {d8-d15} + pop {r4, r5, r6, pc} + + + + .align 4 +#ifdef __APPLE__ + .globl _neon_static_x8_i +_neon_static_x8_i: +#else + .globl neon_static_x8_i +neon_static_x8_i: +#endif + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + vstmdb sp!, {d8-d15} + mov r11, #0 + add r3, r0, #0 @ data0 + add r5, r0, r1, lsl #1 @ data2 + add r4, r0, r1 @ data1 + add r7, r5, r1, lsl #1 @ data4 + add r6, r5, r1 @ data3 + add r9, r7, r1, lsl #1 @ data6 + add r8, r7, r1 @ data5 + add r10, r9, r1 @ data7 + add r12, r2, #0 @ LUT + + sub r11, r11, r1, lsr #5 +neon_x8_loop: + vld1.32 {q2,q3}, [r12, :128]! 
+ vld1.32 {q14,q15}, [r6, :128] + vld1.32 {q10,q11}, [r5, :128] + adds r11, r11, #1 + vmul.f32 q12, q15, q2 + vmul.f32 q8, q14, q3 + vmul.f32 q13, q14, q2 + vmul.f32 q9, q10, q3 + vmul.f32 q1, q10, q2 + vmul.f32 q0, q11, q2 + vmul.f32 q14, q11, q3 + vmul.f32 q15, q15, q3 + vld1.32 {q2,q3}, [r12, :128]! + vsub.f32 q10, q12, q8 + vadd.f32 q11, q0, q9 + vadd.f32 q8, q15, q13 + vld1.32 {q12,q13}, [r4, :128] + vsub.f32 q9, q1, q14 + vsub.f32 q15, q11, q10 + vsub.f32 q14, q9, q8 + vsub.f32 q4, q12, q15 @ + vadd.f32 q6, q12, q15 @ + vadd.f32 q5, q13, q14 @ + vsub.f32 q7, q13, q14 @ + vld1.32 {q14,q15}, [r9, :128] + vld1.32 {q12,q13}, [r7, :128] + vmul.f32 q1, q14, q2 + vmul.f32 q0, q14, q3 + vst1.32 {q4,q5}, [r4, :128] + vmul.f32 q14, q15, q3 + vmul.f32 q4, q15, q2 + vadd.f32 q15, q9, q8 + vst1.32 {q6,q7}, [r6, :128] + vmul.f32 q8, q12, q3 + vmul.f32 q5, q13, q3 + vmul.f32 q12, q12, q2 + vmul.f32 q9, q13, q2 + vadd.f32 q14, q14, q1 + vsub.f32 q13, q4, q0 + vadd.f32 q0, q9, q8 + vld1.32 {q8,q9}, [r3, :128] + vadd.f32 q1, q11, q10 + vsub.f32 q12, q12, q5 + vadd.f32 q11, q8, q15 + vsub.f32 q8, q8, q15 + vadd.f32 q2, q12, q14 + vsub.f32 q10, q0, q13 + vadd.f32 q15, q0, q13 + vadd.f32 q13, q9, q1 + vsub.f32 q9, q9, q1 + vsub.f32 q12, q12, q14 + vadd.f32 q0, q11, q2 + vadd.f32 q1, q13, q15 + vsub.f32 q4, q11, q2 + vsub.f32 q2, q8, q10 @ + vadd.f32 q3, q9, q12 @ + vst1.32 {q0,q1}, [r3, :128]! + vsub.f32 q5, q13, q15 + vld1.32 {q14,q15}, [r10, :128] + vsub.f32 q7, q9, q12 @ + vld1.32 {q12,q13}, [r8, :128] + vst1.32 {q2,q3}, [r5, :128]! + vld1.32 {q2,q3}, [r12, :128]! + vadd.f32 q6, q8, q10 @ + vmul.f32 q8, q14, q2 + vst1.32 {q4,q5}, [r7, :128]! + vmul.f32 q10, q15, q3 + vmul.f32 q9, q13, q3 + vmul.f32 q11, q12, q2 + vmul.f32 q14, q14, q3 + vst1.32 {q6,q7}, [r9, :128]! + vmul.f32 q15, q15, q2 + vmul.f32 q12, q12, q3 + vmul.f32 q13, q13, q2 + vadd.f32 q10, q10, q8 + vsub.f32 q11, q11, q9 + vld1.32 {q8,q9}, [r4, :128] + vsub.f32 q14, q15, q14 + vadd.f32 q15, q13, q12 + vadd.f32 q13, q11, q10 + vadd.f32 q12, q15, q14 + vsub.f32 q15, q15, q14 + vsub.f32 q14, q11, q10 + vld1.32 {q10,q11}, [r6, :128] + vadd.f32 q0, q8, q13 + vadd.f32 q1, q9, q12 + vsub.f32 q2, q10, q15 @ + vadd.f32 q3, q11, q14 @ + vsub.f32 q4, q8, q13 + vst1.32 {q0,q1}, [r4, :128]! + vsub.f32 q5, q9, q12 + vadd.f32 q6, q10, q15 @ + vst1.32 {q2,q3}, [r6, :128]! + vsub.f32 q7, q11, q14 @ + vst1.32 {q4,q5}, [r8, :128]! + vst1.32 {q6,q7}, [r10, :128]! + bne neon_x8_loop + + vldmia sp!, {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + + .align 4 +#ifdef __APPLE__ + .globl _neon_static_x8_t_i +_neon_static_x8_t_i: +#else + .globl neon_static_x8_t_i +neon_static_x8_t_i: +#endif + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + vstmdb sp!, {d8-d15} + mov r11, #0 + add r3, r0, #0 @ data0 + add r5, r0, r1, lsl #1 @ data2 + add r4, r0, r1 @ data1 + add r7, r5, r1, lsl #1 @ data4 + add r6, r5, r1 @ data3 + add r9, r7, r1, lsl #1 @ data6 + add r8, r7, r1 @ data5 + add r10, r9, r1 @ data7 + add r12, r2, #0 @ LUT + + sub r11, r11, r1, lsr #5 +neon_x8_t_loop: + vld1.32 {q2,q3}, [r12, :128]! + vld1.32 {q14,q15}, [r6, :128] + vld1.32 {q10,q11}, [r5, :128] + adds r11, r11, #1 + vmul.f32 q12, q15, q2 + vmul.f32 q8, q14, q3 + vmul.f32 q13, q14, q2 + vmul.f32 q9, q10, q3 + vmul.f32 q1, q10, q2 + vmul.f32 q0, q11, q2 + vmul.f32 q14, q11, q3 + vmul.f32 q15, q15, q3 + vld1.32 {q2,q3}, [r12, :128]! 
+ vsub.f32 q10, q12, q8 + vadd.f32 q11, q0, q9 + vadd.f32 q8, q15, q13 + vld1.32 {q12,q13}, [r4, :128] + vsub.f32 q9, q1, q14 + vsub.f32 q15, q11, q10 + vsub.f32 q14, q9, q8 + vsub.f32 q4, q12, q15 @ + vadd.f32 q6, q12, q15 @ + vadd.f32 q5, q13, q14 @ + vsub.f32 q7, q13, q14 @ + vld1.32 {q14,q15}, [r9, :128] + vld1.32 {q12,q13}, [r7, :128] + vmul.f32 q1, q14, q2 + vmul.f32 q0, q14, q3 + vst1.32 {q4,q5}, [r4, :128] + vmul.f32 q14, q15, q3 + vmul.f32 q4, q15, q2 + vadd.f32 q15, q9, q8 + vst1.32 {q6,q7}, [r6, :128] + vmul.f32 q8, q12, q3 + vmul.f32 q5, q13, q3 + vmul.f32 q12, q12, q2 + vmul.f32 q9, q13, q2 + vadd.f32 q14, q14, q1 + vsub.f32 q13, q4, q0 + vadd.f32 q0, q9, q8 + vld1.32 {q8,q9}, [r3, :128] + vadd.f32 q1, q11, q10 + vsub.f32 q12, q12, q5 + vadd.f32 q11, q8, q15 + vsub.f32 q8, q8, q15 + vadd.f32 q2, q12, q14 + vsub.f32 q10, q0, q13 + vadd.f32 q15, q0, q13 + vadd.f32 q13, q9, q1 + vsub.f32 q9, q9, q1 + vsub.f32 q12, q12, q14 + vadd.f32 q0, q11, q2 + vadd.f32 q1, q13, q15 + vsub.f32 q4, q11, q2 + vsub.f32 q2, q8, q10 @ + vadd.f32 q3, q9, q12 @ + vst2.32 {q0,q1}, [r3, :128]! + vsub.f32 q5, q13, q15 + vld1.32 {q14,q15}, [r10, :128] + vsub.f32 q7, q9, q12 @ + vld1.32 {q12,q13}, [r8, :128] + vst2.32 {q2,q3}, [r5, :128]! + vld1.32 {q2,q3}, [r12, :128]! + vadd.f32 q6, q8, q10 @ + vmul.f32 q8, q14, q2 + vst2.32 {q4,q5}, [r7, :128]! + vmul.f32 q10, q15, q3 + vmul.f32 q9, q13, q3 + vmul.f32 q11, q12, q2 + vmul.f32 q14, q14, q3 + vst2.32 {q6,q7}, [r9, :128]! + vmul.f32 q15, q15, q2 + vmul.f32 q12, q12, q3 + vmul.f32 q13, q13, q2 + vadd.f32 q10, q10, q8 + vsub.f32 q11, q11, q9 + vld1.32 {q8,q9}, [r4, :128] + vsub.f32 q14, q15, q14 + vadd.f32 q15, q13, q12 + vadd.f32 q13, q11, q10 + vadd.f32 q12, q15, q14 + vsub.f32 q15, q15, q14 + vsub.f32 q14, q11, q10 + vld1.32 {q10,q11}, [r6, :128] + vadd.f32 q0, q8, q13 + vadd.f32 q1, q9, q12 + vsub.f32 q2, q10, q15 @ + vadd.f32 q3, q11, q14 @ + vsub.f32 q4, q8, q13 + vst2.32 {q0,q1}, [r4, :128]! + vsub.f32 q5, q9, q12 + vadd.f32 q6, q10, q15 @ + vst2.32 {q2,q3}, [r6, :128]! + vsub.f32 q7, q11, q14 @ + vst2.32 {q4,q5}, [r8, :128]! + vst2.32 {q6,q7}, [r10, :128]! + bne neon_x8_t_loop + + vldmia sp!, {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + + diff --git a/lib/ffts/src/patterns.c b/lib/ffts/src/patterns.c new file mode 100644 index 0000000..93fe7f7 --- /dev/null +++ b/lib/ffts/src/patterns.c @@ -0,0 +1,208 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "patterns.h" + +void permute_addr(int N, int offset, int stride, int *d) { + int i, a[4] = {0,2,1,3}; + for(i=0;i<4;i++) { + d[i] = offset + (a[i] << stride); + if(d[i] < 0) d[i] += N; + } +} + +void ffts_hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) { + + if(N > 4) { + ffts_hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL); + if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride),offset+(N/2), stride + 2, 0, VL); + if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride),offset+(3*N/4), stride + 2, 0, VL); + else { + int temp = poffset+(1<<stride); + if(temp < 0) temp += bigN; + temp *= 2; + + if(!(temp % (VL*2))) { + (*is)[0] = poffset+(1<<stride); + (*is)[1] = poffset+(1<<stride)+(1<<(stride+2)); + (*is)[2] = poffset-(1<<stride); + (*is)[3] = poffset-(1<<stride)+(1<<(stride+2)); + int i; + for(i=0;i<4;i++) if((*is)[i] < 0) (*is)[i] += bigN; + for(i=0;i<4;i++) (*is)[i] *= 2; + *is += 4; + } + } + }else if(N == 4) { + int perm[4]; + permute_addr(bigN, poffset, stride, perm); + if(!((perm[0]*2) % (VL*2))) { + int i; + for(i=0;i<4;i++) { + (*is)[i] = perm[i] * 2; + } + *is += 4; + } + } +} + +void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL) { + int i, i0 = N/leafN/3+1, i1=N/leafN/3, i2 = N/leafN/3; + int stride = log(N/leafN)/log(2); + + p->is = malloc(N/VL * sizeof(ptrdiff_t)); + + ptrdiff_t *is = p->is; + + if((N/leafN) % 3 > 1) i1++; + + for(i=0;i<i0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL); + for(i=i0;i<i0+i1;i++) { + ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL); + ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL); + } + for(i=0-i2;i<0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL); + + +//for(i=0;i<N/VL;i++) { +// printf("%td ", p->is[i]); +// if(i % 16 == 15) printf("\n"); +//} + + p->i0 = i0; p->i1 = i1; +} +/** + * + * + */ +void ffts_elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) { + if((even && N == leafN) || (!even && N <= leafN)) { + offsets[2*(ooffset/leafN)] = ioffset*2; + offsets[2*(ooffset/leafN)+1] = ooffset; + }else if(N > 4) { + ffts_elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even); + ffts_elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0); + if(N/4 >= leafN) + ffts_elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0); + } + +} + +int compare_offsets(const void *a, const void *b) { + return ((ptrdiff_t *)a)[0] - ((ptrdiff_t *)b)[0]; +} + +uint32_t reverse_bits(uint32_t a, int n) { + uint32_t x = 0; + + int i; + for(i=0;i<n;i++) { + if(a & (1 << i)) x |= 1 << (n-i-1); + } + return x; +} + + +void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) { + + ptrdiff_t *offsets = malloc(2 * N/leafN * sizeof(ptrdiff_t)); + + ffts_elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1); + + size_t 
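+	/* ffts_elaborate_offsets above appears to split the size-N transform
+	   recursively the same way the leaf kernels consume it (an N/2 "even"
+	   half plus two N/4 "odd" quarters), recording one (input index,
+	   output offset) pair per leaf.  The loop below wraps negative input
+	   indices back into range, the pairs are then sorted by input index,
+	   and only the output offsets (doubled for interleaved re/im floats)
+	   are kept in p->offsets. */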
i; + for(i=0;i<2*N/leafN;i+=2) { + if(offsets[i] < 0) offsets[i] = N + offsets[i]; + } + + qsort(offsets, N/leafN, 2 * sizeof(ptrdiff_t), compare_offsets); + //elaborate_is(p, N, 0, 0, 1); + p->offsets = malloc(N/leafN * sizeof(ptrdiff_t)); + for(i=0;i<N/leafN;i++) { + p->offsets[i] = offsets[i*2+1]*2; + } +//for(i=0;i<N/leafN;i++) { +// printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N))); +//} + free(offsets); +} + +/* +int tree_count(int N, int leafN, int offset) { + + if(N <= leafN) return 0; + int count = 0; + count += tree_count(N/4, leafN, offset); + count += tree_count(N/8, leafN, offset + N/4); + count += tree_count(N/8, leafN, offset + N/4 + N/8); + count += tree_count(N/4, leafN, offset + N/2); + count += tree_count(N/4, leafN, offset + 3*N/4); + + return 1 + count; +} + +void elaborate_tree(transform_index_t **p, int N, int leafN, int offset) { + + if(N <= leafN) return; + elaborate_tree(p, N/4, leafN, offset); + elaborate_tree(p, N/8, leafN, offset + N/4); + elaborate_tree(p, N/8, leafN, offset + N/4 + N/8); + elaborate_tree(p, N/4, leafN, offset + N/2); + elaborate_tree(p, N/4, leafN, offset + 3*N/4); + + (*p)[0] = N; + (*p)[1] = offset*2; + + (*p)+=2; +} + +void ffts_init_tree(ffts_plan_t *p, int N, int leafN) { + + int count = tree_count(N, leafN, 0) + 1; + transform_index_t *ps = p->transforms = malloc(count * 2 * sizeof(transform_index_t)); + +//printf("count = %d\n", count); + + elaborate_tree(&ps, N, leafN, 0); + #ifdef __ARM_NEON__ + ps -= 2; + #endif + ps[0] = 0; + ps[1] = 0; +//int i; +//for(i=0;i<count;i++) { +// fprintf(stderr, "%lu %lu - %d\n", p->transforms[i*2], p->transforms[i*2+1], +// __builtin_ctzl(p->transforms[i*2]) - 5); +//} + +} +*/ diff --git a/lib/ffts/src/patterns.h b/lib/ffts/src/patterns.h new file mode 100644 index 0000000..6e2d6bb --- /dev/null +++ b/lib/ffts/src/patterns.h @@ -0,0 +1,44 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + +#ifndef __PATTERNS_H__ +#define __PATTERNS_H__ + +#include "ffts.h" + +void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL); +void ffts_init_offsets(ffts_plan_t *p, int N, int leafN); +//void ffts_init_tree(ffts_plan_t *p, int N, int leafN); + +#endif diff --git a/lib/ffts/src/sse.s b/lib/ffts/src/sse.s new file mode 100644 index 0000000..79dd6ec --- /dev/null +++ b/lib/ffts/src/sse.s @@ -0,0 +1,878 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + + .globl _neon_x4 + .align 4 +_neon_x4: + + .globl _neon_x8 + .align 4 +_neon_x8: + + .globl _neon_x8_t + .align 4 +_neon_x8_t: + + +#ifdef __APPLE__ + .globl _leaf_ee_init +_leaf_ee_init: +#else + .globl leaf_ee_init +leaf_ee_init: +#endif + #lea L_sse_constants(%rip), %r9 + movq 0xe0(%rdi), %r9 + xorl %eax, %eax +# eax is loop counter (init to 0) +# rcx is loop max count +# rsi is 'in' base pointer +# rdx is 'out' base pointer +# r8 is offsets pointer +# r9 is constants pointer +# scratch: rax r11 r12 +# .align 4, 0x90 + +# _leaf_ee + 9 needs 16 byte alignment +#ifdef __APPLE__ + .globl _leaf_ee +_leaf_ee: +#else + .globl leaf_ee +leaf_ee: +#endif + movaps 32(%r9), %xmm0 #83.5 + movaps (%r9), %xmm8 #83.5 +LEAF_EE_1: +LEAF_EE_const_0: + movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5 +LEAF_EE_const_2: + movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5 + movaps %xmm7, %xmm6 #83.5 +LEAF_EE_const_3: + movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5 + movaps %xmm12, %xmm11 #83.5 + subps %xmm10, %xmm12 #83.5 + addps %xmm10, %xmm11 #83.5 + xorps %xmm8, %xmm12 #83.5 +LEAF_EE_const_1: + movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5 +LEAF_EE_const_4: + movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5 + addps %xmm9, %xmm6 #83.5 + subps %xmm9, %xmm7 #83.5 +LEAF_EE_const_5: + movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5 + movaps %xmm10, %xmm9 #83.5 +LEAF_EE_const_6: + movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5 + movaps %xmm6, %xmm5 #83.5 +LEAF_EE_const_7: + movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5 + movaps %xmm3, %xmm15 #83.5 + shufps $177, %xmm12, %xmm12 #83.5 + movaps %xmm7, %xmm4 #83.5 + movslq (%r8, %rax, 4), %r11 #83.44 + subps %xmm13, %xmm10 #83.5 + subps %xmm14, %xmm3 #83.5 + addps %xmm11, %xmm5 #83.5 + subps %xmm11, %xmm6 #83.5 + subps %xmm12, %xmm4 #83.5 + addps %xmm12, %xmm7 #83.5 + addps %xmm13, %xmm9 #83.5 + addps %xmm14, %xmm15 #83.5 + movaps 16(%r9), %xmm12 #83.5 + movaps %xmm9, %xmm1 #83.5 + movaps 16(%r9), %xmm11 #83.5 + movaps %xmm5, %xmm2 #83.5 + mulps %xmm10, %xmm12 #83.5 + subps %xmm15, %xmm9 #83.5 + addps %xmm15, %xmm1 #83.5 + mulps %xmm3, %xmm11 #83.5 + addps %xmm1, %xmm2 #83.5 + subps %xmm1, %xmm5 #83.5 + shufps $177, %xmm10, %xmm10 #83.5 + xorps %xmm8, %xmm9 #83.5 + shufps $177, %xmm3, %xmm3 #83.5 + movaps %xmm6, %xmm1 #83.5 + mulps %xmm0, %xmm10 #83.5 + movaps %xmm4, %xmm13 #83.5 + mulps %xmm0, %xmm3 #83.5 + subps %xmm10, %xmm12 #83.5 + addps %xmm3, %xmm11 #83.5 + movaps %xmm12, %xmm3 #83.5 + movaps %xmm7, %xmm14 #83.5 + shufps $177, %xmm9, %xmm9 #83.5 + subps %xmm11, %xmm12 #83.5 + addps %xmm11, %xmm3 #83.5 + subps %xmm9, %xmm1 #83.5 + addps %xmm9, %xmm6 #83.5 + addps %xmm3, %xmm4 #83.5 + subps %xmm3, %xmm13 #83.5 + xorps %xmm8, %xmm12 #83.5 + movaps %xmm2, %xmm3 #83.5 + shufps $177, %xmm12, %xmm12 #83.5 + movaps %xmm6, %xmm9 #83.5 + movslq 8(%r8, %rax, 4), %r12 #83.59 + movlhps %xmm4, %xmm3 #83.5 + addq $4, %rax + shufps $238, %xmm4, %xmm2 #83.5 + movaps %xmm1, %xmm4 #83.5 + #movntdq %xmm3, (%rdx,%r11,4) #83.5 + subps %xmm12, %xmm7 #83.5 + addps %xmm12, %xmm14 #83.5 + movlhps %xmm7, %xmm4 #83.5 + shufps $238, %xmm7, %xmm1 #83.5 + movaps %xmm5, %xmm7 #83.5 + movlhps %xmm13, %xmm7 #83.5 + movlhps %xmm14, %xmm9 #83.5 + shufps $238, %xmm13, %xmm5 #83.5 + shufps $238, %xmm14, %xmm6 #83.5 + movaps %xmm3, (%rdx,%r11,4) #83.5 + movaps %xmm4, 16(%rdx,%r11,4) #83.5 + movaps %xmm7, 32(%rdx,%r11,4) #83.5 + movaps %xmm9, 48(%rdx,%r11,4) #83.5 + movaps %xmm2, (%rdx,%r12,4) #83.5 + movaps %xmm1, 16(%rdx,%r12,4) #83.5 + movaps %xmm5, 32(%rdx,%r12,4) #83.5 + movaps %xmm6, 48(%rdx,%r12,4) #83.5 + cmpq %rcx, %rax + jne 
LEAF_EE_1 + + + +# _leaf_oo + 4 needs to be 16 byte aligned +#ifdef __APPLE__ + .globl _leaf_oo +_leaf_oo: +#else + .globl leaf_oo +leaf_oo: +#endif + movaps (%r9), %xmm5 #92.7 +LEAF_OO_1: +LEAF_OO_const_0: + movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5 + movaps %xmm4, %xmm6 #93.5 +LEAF_OO_const_1: + movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5 +LEAF_OO_const_2: + movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5 + addps %xmm7, %xmm6 #93.5 + subps %xmm7, %xmm4 #93.5 +LEAF_OO_const_3: + movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5 + movaps %xmm10, %xmm9 #93.5 +LEAF_OO_const_4: + movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5 + movaps %xmm6, %xmm3 #93.5 +LEAF_OO_const_5: + movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5 + movaps %xmm1, %xmm2 #93.5 +LEAF_OO_const_6: + movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5 + movaps %xmm4, %xmm15 #93.5 +LEAF_OO_const_7: + movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5 + movaps %xmm14, %xmm13 #93.5 + movslq (%r8, %rax, 4), %r11 #83.44 + subps %xmm8, %xmm10 #93.5 + addps %xmm8, %xmm9 #93.5 + addps %xmm11, %xmm2 #93.5 + subps %xmm12, %xmm14 #93.5 + subps %xmm11, %xmm1 #93.5 + addps %xmm12, %xmm13 #93.5 + addps %xmm9, %xmm3 #93.5 + subps %xmm9, %xmm6 #93.5 + xorps %xmm5, %xmm10 #93.5 + xorps %xmm5, %xmm14 #93.5 + shufps $177, %xmm10, %xmm10 #93.5 + movaps %xmm2, %xmm9 #93.5 + shufps $177, %xmm14, %xmm14 #93.5 + movaps %xmm6, %xmm7 #93.5 + movslq 8(%r8, %rax, 4), %r12 #83.59 + addq $4, %rax #92.18 + addps %xmm10, %xmm4 #93.5 + addps %xmm13, %xmm9 #93.5 + subps %xmm13, %xmm2 #93.5 + subps %xmm10, %xmm15 #93.5 + movaps %xmm1, %xmm13 #93.5 + movaps %xmm2, %xmm8 #93.5 + movlhps %xmm4, %xmm7 #93.5 + subps %xmm14, %xmm13 #93.5 + addps %xmm14, %xmm1 #93.5 + shufps $238, %xmm4, %xmm6 #93.5 + movaps %xmm3, %xmm14 #93.5 + movaps %xmm9, %xmm4 #93.5 + movlhps %xmm15, %xmm14 #93.5 + movlhps %xmm13, %xmm4 #93.5 + movlhps %xmm1, %xmm8 #93.5 + shufps $238, %xmm15, %xmm3 #93.5 + shufps $238, %xmm13, %xmm9 #93.5 + shufps $238, %xmm1, %xmm2 #93.5 + movaps %xmm14, (%rdx,%r11,4) #93.5 + movaps %xmm7, 16(%rdx,%r11,4) #93.5 + movaps %xmm4, 32(%rdx,%r11,4) #93.5 + movaps %xmm8, 48(%rdx,%r11,4) #93.5 + movaps %xmm3, (%rdx,%r12,4) #93.5 + movaps %xmm6, 16(%rdx,%r12,4) #93.5 + movaps %xmm9, 32(%rdx,%r12,4) #93.5 + movaps %xmm2, 48(%rdx,%r12,4) #93.5 + cmpq %rcx, %rax + jne LEAF_OO_1 # Prob 95% #92.14 + +#ifdef __APPLE__ + .globl _leaf_eo +_leaf_eo: +#else + .globl leaf_eo +leaf_eo: +#endif +LEAF_EO_const_0: + movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5 +LEAF_EO_const_2: + movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5 + movaps %xmm9, %xmm11 #88.5 +LEAF_EO_const_3: + movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5 + movaps %xmm7, %xmm6 #88.5 +LEAF_EO_const_1: + movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5 + subps %xmm5, %xmm7 #88.5 + addps %xmm4, %xmm11 #88.5 + subps %xmm4, %xmm9 #88.5 + addps %xmm5, %xmm6 #88.5 + movaps (%r9), %xmm3 #88.5 + movaps %xmm11, %xmm10 #88.5 + xorps %xmm3, %xmm7 #88.5 + movaps %xmm9, %xmm8 #88.5 + shufps $177, %xmm7, %xmm7 #88.5 + addps %xmm6, %xmm10 #88.5 + subps %xmm6, %xmm11 #88.5 + subps %xmm7, %xmm8 #88.5 + addps %xmm7, %xmm9 #88.5 + movslq 8(%r8, %rax, 4), %r12 #83.59 + movaps %xmm10, %xmm2 #88.5 + movslq (%r8, %rax, 4), %r11 #83.44 + movaps %xmm11, %xmm1 #88.5 + shufps $238, %xmm8, %xmm10 #88.5 + shufps $238, %xmm9, %xmm11 #88.5 + movaps %xmm10, (%rdx,%r12,4) #88.5 + movaps %xmm11, 16(%rdx,%r12,4) #88.5 +LEAF_EO_const_4: + movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5 +LEAF_EO_const_5: + movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5 + movaps %xmm15, %xmm14 #88.5 +LEAF_EO_const_6: + movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5 + addps 
%xmm12, %xmm14 #88.5 + subps %xmm12, %xmm15 #88.5 +LEAF_EO_const_7: + movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5 + movaps %xmm4, %xmm5 #88.5 + movaps %xmm14, %xmm7 #88.5 + addps %xmm13, %xmm5 #88.5 + subps %xmm13, %xmm4 #88.5 + movlhps %xmm8, %xmm2 #88.5 + movaps %xmm5, %xmm8 #88.5 + movlhps %xmm15, %xmm7 #88.5 + xorps %xmm3, %xmm15 #88.5 + movaps %xmm5, %xmm6 #88.5 + subps %xmm14, %xmm5 #88.5 + addps %xmm14, %xmm6 #88.5 + movlhps %xmm9, %xmm1 #88.5 + movaps %xmm4, %xmm14 #88.5 + movlhps %xmm4, %xmm8 #88.5 + movaps %xmm1, %xmm12 #88.5 + shufps $177, %xmm15, %xmm15 #88.5 + movaps 0x30(%r9), %xmm11 #88.5 + addq $4, %rax #90.5 + subps %xmm15, %xmm14 #88.5 + mulps %xmm7, %xmm11 #88.5 + addps %xmm15, %xmm4 #88.5 + movaps 0x30(%r9), %xmm9 #88.5 + movaps 0x40(%r9), %xmm15 #88.5 + shufps $177, %xmm7, %xmm7 #88.5 + mulps %xmm8, %xmm9 #88.5 + mulps %xmm15, %xmm7 #88.5 + shufps $177, %xmm8, %xmm8 #88.5 + subps %xmm7, %xmm11 #88.5 + mulps %xmm15, %xmm8 #88.5 + movaps %xmm11, %xmm10 #88.5 + addps %xmm8, %xmm9 #88.5 + shufps $238, %xmm14, %xmm6 #88.5 + subps %xmm9, %xmm11 #88.5 + addps %xmm9, %xmm10 #88.5 + xorps %xmm3, %xmm11 #88.5 + movaps %xmm2, %xmm3 #88.5 + shufps $177, %xmm11, %xmm11 #88.5 + subps %xmm10, %xmm3 #88.5 + addps %xmm10, %xmm2 #88.5 + addps %xmm11, %xmm12 #88.5 + subps %xmm11, %xmm1 #88.5 + shufps $238, %xmm4, %xmm5 #88.5 + movaps %xmm5, 48(%rdx,%r12,4) #88.5 + movaps %xmm6, 32(%rdx,%r12,4) #88.5 + movaps %xmm2, (%rdx,%r11,4) #88.5 + movaps %xmm1, 16(%rdx,%r11,4) #88.5 + movaps %xmm3, 32(%rdx,%r11,4) #88.5 + movaps %xmm12, 48(%rdx,%r11,4) #88.5 + + +#ifdef __APPLE__ + .globl _leaf_oe +_leaf_oe: +#else + .globl leaf_oe +leaf_oe: +#endif + movaps (%r9), %xmm0 #59.5 + #movaps 0x20(%r9), %xmm1 #59.5 +LEAF_OE_const_2: + movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5 +LEAF_OE_const_3: + movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5 + movaps %xmm6, %xmm10 #70.5 + shufps $228, %xmm8, %xmm10 #70.5 + movaps %xmm10, %xmm9 #70.5 + shufps $228, %xmm6, %xmm8 #70.5 +LEAF_OE_const_0: + movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5 +LEAF_OE_const_1: + movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5 + movaps %xmm12, %xmm14 #70.5 + movslq (%r8, %rax, 4), %r11 #83.44 + addps %xmm8, %xmm9 #70.5 + subps %xmm8, %xmm10 #70.5 + addps %xmm7, %xmm14 #70.5 + subps %xmm7, %xmm12 #70.5 + movaps %xmm9, %xmm4 #70.5 + movaps %xmm14, %xmm13 #70.5 + shufps $238, %xmm10, %xmm4 #70.5 + xorps %xmm0, %xmm10 #70.5 + shufps $177, %xmm10, %xmm10 #70.5 + movaps %xmm12, %xmm11 #70.5 + movaps %xmm14, %xmm5 #70.5 + addps %xmm9, %xmm13 #70.5 + subps %xmm10, %xmm11 #70.5 + subps %xmm9, %xmm14 #70.5 + shufps $238, %xmm12, %xmm5 #70.5 + addps %xmm10, %xmm12 #70.5 + movslq 8(%r8, %rax, 4), %r12 #83.59 + movlhps %xmm11, %xmm13 #70.5 + movaps %xmm13, (%rdx,%r11,4) #70.5 + movaps 0x30(%r9), %xmm13 #70.5 + movlhps %xmm12, %xmm14 #70.5 + movaps 0x40(%r9), %xmm12 #70.5 + mulps %xmm5, %xmm13 #70.5 + shufps $177, %xmm5, %xmm5 #70.5 + mulps %xmm12, %xmm5 #70.5 + movaps %xmm14, 16(%rdx,%r11,4) #70.5 + subps %xmm5, %xmm13 #70.5 + movaps 0x30(%r9), %xmm5 #70.5 + mulps %xmm4, %xmm5 #70.5 + shufps $177, %xmm4, %xmm4 #70.5 + mulps %xmm12, %xmm4 #70.5 +LEAF_OE_const_4: + movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5 + addps %xmm4, %xmm5 #70.5 +LEAF_OE_const_6: + movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5 + movaps %xmm9, %xmm3 #70.5 +LEAF_OE_const_7: + movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5 + movaps %xmm7, %xmm6 #70.5 +LEAF_OE_const_5: + movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5 + movaps %xmm13, %xmm4 #70.5 + subps %xmm2, %xmm7 #70.5 + addps %xmm15, %xmm3 #70.5 + subps %xmm15, %xmm9 #70.5 
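+# second half of leaf_oe: butterfly the four vectors just loaded (LEAF_OE_const_4..7)
+# and fill in the remaining output rows of the r11 and r12 blocks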
+ addps %xmm2, %xmm6 #70.5 + subps %xmm5, %xmm13 #70.5 + addps %xmm5, %xmm4 #70.5 + xorps %xmm0, %xmm7 #70.5 + addq $4, %rax #72.5 + movaps %xmm3, %xmm2 #70.5 + shufps $177, %xmm7, %xmm7 #70.5 + movaps %xmm9, %xmm8 #70.5 + xorps %xmm0, %xmm13 #70.5 + addps %xmm6, %xmm2 #70.5 + subps %xmm7, %xmm8 #70.5 + subps %xmm6, %xmm3 #70.5 + addps %xmm7, %xmm9 #70.5 + movaps %xmm2, %xmm10 #70.5 + movaps %xmm3, %xmm11 #70.5 + shufps $238, %xmm8, %xmm2 #70.5 + shufps $238, %xmm9, %xmm3 #70.5 + movaps %xmm2, %xmm14 #70.5 + shufps $177, %xmm13, %xmm13 #70.5 + subps %xmm4, %xmm14 #70.5 + addps %xmm4, %xmm2 #70.5 + movaps %xmm3, %xmm4 #70.5 + subps %xmm13, %xmm3 #70.5 + addps %xmm13, %xmm4 #70.5 + movlhps %xmm8, %xmm10 #70.5 + movlhps %xmm9, %xmm11 #70.5 + movaps %xmm10, 32(%rdx,%r11,4) #70.5 + movaps %xmm11, 48(%rdx,%r11,4) #70.5 + movaps %xmm2, (%rdx,%r12,4) #70.5 + movaps %xmm3, 16(%rdx,%r12,4) #70.5 + movaps %xmm14, 32(%rdx,%r12,4) #70.5 + movaps %xmm4, 48(%rdx,%r12,4) #70.5 + + +#ifdef __APPLE__ + .globl _leaf_end +_leaf_end: +#else + .globl leaf_end +leaf_end: +#endif + +#ifdef __APPLE__ + .globl _x_init +_x_init: +#else + .globl x_init +x_init: +#endif + #movaps L_sse_constants(%rip), %xmm3 #34.3 + movaps (%r9), %xmm3 #34.3 + movq 0x20(%rdi),%r8 +#ifdef __APPLE__ + .globl _x4 +_x4: +#else + .globl x4 +x4: +#endif + movaps 64(%rdx), %xmm0 #34.3 + movaps 96(%rdx), %xmm1 #34.3 + movaps (%rdx), %xmm7 #34.3 + movaps (%r8), %xmm4 #const + movaps %xmm7, %xmm9 #34.3 + movaps %xmm4, %xmm6 #34.3 + movaps 16(%r8), %xmm2 #const + mulps %xmm0, %xmm6 #34.3 + mulps %xmm1, %xmm4 #34.3 + shufps $177, %xmm0, %xmm0 #34.3 + shufps $177, %xmm1, %xmm1 #34.3 + mulps %xmm2, %xmm0 #34.3 + mulps %xmm1, %xmm2 #34.3 + subps %xmm0, %xmm6 #34.3 + addps %xmm2, %xmm4 #34.3 + movaps %xmm6, %xmm5 #34.3 + subps %xmm4, %xmm6 #34.3 + addps %xmm4, %xmm5 #34.3 + movaps 32(%rdx), %xmm8 #34.3 + xorps %xmm3, %xmm6 #34.3 + shufps $177, %xmm6, %xmm6 #34.3 + movaps %xmm8, %xmm10 #34.3 + movaps 112(%rdx), %xmm12 #34.3 + subps %xmm5, %xmm9 #34.3 + addps %xmm5, %xmm7 #34.3 + addps %xmm6, %xmm10 #34.3 + subps %xmm6, %xmm8 #34.3 + movaps %xmm7, (%rdx) #34.3 + movaps %xmm8, 32(%rdx) #34.3 + movaps %xmm9, 64(%rdx) #34.3 + movaps %xmm10, 96(%rdx) #34.3 + movaps 32(%r8), %xmm14 #const #34.3 + movaps 80(%rdx), %xmm11 #34.3 + movaps %xmm14, %xmm0 #34.3 + movaps 48(%r8), %xmm13 #const #34.3 + mulps %xmm11, %xmm0 #34.3 + mulps %xmm12, %xmm14 #34.3 + shufps $177, %xmm11, %xmm11 #34.3 + shufps $177, %xmm12, %xmm12 #34.3 + mulps %xmm13, %xmm11 #34.3 + mulps %xmm12, %xmm13 #34.3 + subps %xmm11, %xmm0 #34.3 + addps %xmm13, %xmm14 #34.3 + movaps %xmm0, %xmm15 #34.3 + subps %xmm14, %xmm0 #34.3 + addps %xmm14, %xmm15 #34.3 + xorps %xmm3, %xmm0 #34.3 + movaps 16(%rdx), %xmm1 #34.3 + movaps 48(%rdx), %xmm2 #34.3 + movaps %xmm1, %xmm4 #34.3 + shufps $177, %xmm0, %xmm0 #34.3 + movaps %xmm2, %xmm5 #34.3 + addps %xmm15, %xmm1 #34.3 + subps %xmm0, %xmm2 #34.3 + subps %xmm15, %xmm4 #34.3 + addps %xmm0, %xmm5 #34.3 + movaps %xmm1, 16(%rdx) #34.3 + movaps %xmm2, 48(%rdx) #34.3 + movaps %xmm4, 80(%rdx) #34.3 + movaps %xmm5, 112(%rdx) #34.3 + ret + +# _x8_soft + 5 needs to be 16 byte aligned +#ifdef __APPLE__ + .globl _x8_soft +_x8_soft: +#else + .globl x8_soft +x8_soft: +#endif + xorl %eax, %eax + movq %rdx, %rbx + movq %r8, %rsi + leaq (%rdx,%rcx,4), %r9 + leaq (%r9,%rcx,4), %r10 + leaq (%r10,%rcx,4), %r11 + leaq (%r11,%rcx,4), %r12 + leaq (%r12,%rcx,4), %r13 + leaq (%r13,%rcx,4), %r14 + leaq (%r14,%rcx,4), %r15 +X8_soft_loop: + movaps (%rsi), %xmm9 + movaps (%r10,%rax,4), 
%xmm6 + movaps %xmm9, %xmm11 + movaps (%r11,%rax,4), %xmm7 + movaps 16(%rsi), %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm9 + shufps $177, %xmm6, %xmm6 + mulps %xmm8, %xmm6 + shufps $177, %xmm7, %xmm7 + subps %xmm6, %xmm11 + mulps %xmm7, %xmm8 + movaps %xmm11, %xmm10 + addps %xmm8, %xmm9 + movaps 32(%rsi), %xmm15 + addps %xmm9, %xmm10 + subps %xmm9, %xmm11 + movaps (%rbx,%rax,4), %xmm5 + movaps %xmm15, %xmm6 + movaps (%r12,%rax,4), %xmm12 + movaps %xmm5, %xmm2 + movaps (%r14,%rax,4), %xmm13 + xorps %xmm3, %xmm11 #const + movaps 48(%rsi), %xmm14 + subps %xmm10, %xmm2 + mulps %xmm12, %xmm6 + addps %xmm10, %xmm5 + mulps %xmm13, %xmm15 + movaps 64(%rsi), %xmm10 + movaps %xmm5, %xmm0 + shufps $177, %xmm12, %xmm12 + shufps $177, %xmm13, %xmm13 + mulps %xmm14, %xmm12 + mulps %xmm13, %xmm14 + subps %xmm12, %xmm6 + addps %xmm14, %xmm15 + movaps (%r13,%rax,4), %xmm7 + movaps %xmm10, %xmm13 + movaps (%r15,%rax,4), %xmm8 + movaps %xmm6, %xmm12 + movaps 80(%rsi), %xmm9 + addq $96, %rsi + mulps %xmm7, %xmm13 + subps %xmm15, %xmm6 + addps %xmm15, %xmm12 + mulps %xmm8, %xmm10 + subps %xmm12, %xmm0 + addps %xmm12, %xmm5 + shufps $177, %xmm7, %xmm7 + xorps %xmm3, %xmm6 #const + shufps $177, %xmm8, %xmm8 + movaps %xmm2, %xmm12 + mulps %xmm9, %xmm7 + mulps %xmm8, %xmm9 + subps %xmm7, %xmm13 + addps %xmm9, %xmm10 + movaps (%r9,%rax,4), %xmm4 + shufps $177, %xmm11, %xmm11 + movaps %xmm4, %xmm1 + shufps $177, %xmm6, %xmm6 + addps %xmm11, %xmm1 + subps %xmm11, %xmm4 + addps %xmm6, %xmm12 + subps %xmm6, %xmm2 + movaps %xmm13, %xmm11 + movaps %xmm4, %xmm14 + movaps %xmm1, %xmm6 + subps %xmm10, %xmm13 + addps %xmm10, %xmm11 + xorps %xmm3, %xmm13 #const + addps %xmm11, %xmm4 + subps %xmm11, %xmm14 + shufps $177, %xmm13, %xmm13 + movaps %xmm5, (%rbx,%rax,4) + movaps %xmm4, (%r9,%rax,4) + movaps %xmm2, (%r10,%rax,4) + subps %xmm13, %xmm1 + addps %xmm13, %xmm6 + movaps %xmm1, (%r11,%rax,4) + movaps %xmm0, (%r12,%rax,4) + movaps %xmm14, (%r13,%rax,4) + movaps %xmm12, (%r14,%rax,4) + movaps %xmm6, (%r15,%rax,4) + addq $4, %rax + cmpq %rcx, %rax + jne X8_soft_loop + ret + +#ifdef __APPLE__ + .globl _x8_hard +_x8_hard: +#else + .globl x8_hard +x8_hard: +#endif + movaps (%r9), %xmm5 +X8_loop: + movaps (%r8), %xmm9 +X8_const_2: + movaps 0xFECA(%rdx,%rax,4), %xmm6 + movaps %xmm9, %xmm11 +X8_const_3: + movaps 0xFECA(%rdx,%rax,4), %xmm7 + movaps 16(%r8), %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm9 + shufps $177, %xmm6, %xmm6 + mulps %xmm8, %xmm6 + shufps $177, %xmm7, %xmm7 + subps %xmm6, %xmm11 + mulps %xmm7, %xmm8 + movaps %xmm11, %xmm10 + addps %xmm8, %xmm9 + movaps 32(%r8), %xmm15 + addps %xmm9, %xmm10 + subps %xmm9, %xmm11 +X8_const_0: + movaps 0xFECA(%rdx,%rax,4), %xmm3 + movaps %xmm15, %xmm6 +X8_const_4: + movaps 0xFECA(%rdx,%rax,4), %xmm12 + movaps %xmm3, %xmm2 +X8_const_6: + movaps 0xFECA(%rdx,%rax,4), %xmm13 + xorps %xmm5, %xmm11 + movaps 48(%r8), %xmm14 + subps %xmm10, %xmm2 + mulps %xmm12, %xmm6 + addps %xmm10, %xmm3 + mulps %xmm13, %xmm15 + movaps 64(%r8), %xmm10 + movaps %xmm3, %xmm0 + shufps $177, %xmm12, %xmm12 + shufps $177, %xmm13, %xmm13 + mulps %xmm14, %xmm12 + mulps %xmm13, %xmm14 + subps %xmm12, %xmm6 + addps %xmm14, %xmm15 +X8_const_5: + movaps 0xFECA(%rdx,%rax,4), %xmm7 + movaps %xmm10, %xmm13 +X8_const_7: + movaps 0xFECA(%rdx,%rax,4), %xmm8 + movaps %xmm6, %xmm12 + movaps 80(%r8), %xmm9 + addq $96, %r8 + mulps %xmm7, %xmm13 + subps %xmm15, %xmm6 + addps %xmm15, %xmm12 + mulps %xmm8, %xmm10 + subps %xmm12, %xmm0 + addps %xmm12, %xmm3 + shufps $177, %xmm7, %xmm7 + xorps %xmm5, %xmm6 + shufps $177, 
%xmm8, %xmm8 + movaps %xmm2, %xmm12 + mulps %xmm9, %xmm7 + mulps %xmm8, %xmm9 + subps %xmm7, %xmm13 + addps %xmm9, %xmm10 +X8_const_1: + movaps 0xFECA(%rdx,%rax,4), %xmm4 + shufps $177, %xmm11, %xmm11 + movaps %xmm4, %xmm1 + shufps $177, %xmm6, %xmm6 + addps %xmm11, %xmm1 + subps %xmm11, %xmm4 + addps %xmm6, %xmm12 + subps %xmm6, %xmm2 + movaps %xmm13, %xmm11 + movaps %xmm4, %xmm14 + movaps %xmm1, %xmm6 + subps %xmm10, %xmm13 + addps %xmm10, %xmm11 + xorps %xmm5, %xmm13 + addps %xmm11, %xmm4 + subps %xmm11, %xmm14 + shufps $177, %xmm13, %xmm13 +X8_const1_0: + movaps %xmm3, 0xFECA(%rdx,%rax,4) +X8_const1_1: + movaps %xmm4, 0xFECA(%rdx,%rax,4) +X8_const1_2: + movaps %xmm2, 0xFECA(%rdx,%rax,4) + subps %xmm13, %xmm1 + addps %xmm13, %xmm6 +X8_const1_3: + movaps %xmm1, 0xFECA(%rdx,%rax,4) +X8_const1_4: + movaps %xmm0, 0xFECA(%rdx,%rax,4) +X8_const1_5: + movaps %xmm14, 0xFECA(%rdx,%rax,4) +X8_const1_6: + movaps %xmm12, 0xFECA(%rdx,%rax,4) +X8_const1_7: + movaps %xmm6, 0xFECA(%rdx,%rax,4) + addq $4, %rax + cmpq %rcx, %rax + jne X8_loop + +#ifdef __APPLE__ + .globl _sse_leaf_ee_offsets + .globl _sse_leaf_oo_offsets + .globl _sse_leaf_eo_offsets + .globl _sse_leaf_oe_offsets + .align 4 +_sse_leaf_ee_offsets: + .long LEAF_EE_const_0-_leaf_ee+0x4 + .long LEAF_EE_const_1-_leaf_ee+0x5 + .long LEAF_EE_const_2-_leaf_ee+0x5 + .long LEAF_EE_const_3-_leaf_ee+0x5 + .long LEAF_EE_const_4-_leaf_ee+0x5 + .long LEAF_EE_const_5-_leaf_ee+0x5 + .long LEAF_EE_const_6-_leaf_ee+0x4 + .long LEAF_EE_const_7-_leaf_ee+0x5 +_sse_leaf_oo_offsets: + .long LEAF_OO_const_0-_leaf_oo+0x4 + .long LEAF_OO_const_1-_leaf_oo+0x4 + .long LEAF_OO_const_2-_leaf_oo+0x5 + .long LEAF_OO_const_3-_leaf_oo+0x5 + .long LEAF_OO_const_4-_leaf_oo+0x4 + .long LEAF_OO_const_5-_leaf_oo+0x5 + .long LEAF_OO_const_6-_leaf_oo+0x5 + .long LEAF_OO_const_7-_leaf_oo+0x5 +_sse_leaf_eo_offsets: + .long LEAF_EO_const_0-_leaf_eo+0x5 + .long LEAF_EO_const_1-_leaf_eo+0x4 + .long LEAF_EO_const_2-_leaf_eo+0x4 + .long LEAF_EO_const_3-_leaf_eo+0x4 + .long LEAF_EO_const_4-_leaf_eo+0x5 + .long LEAF_EO_const_5-_leaf_eo+0x5 + .long LEAF_EO_const_6-_leaf_eo+0x4 + .long LEAF_EO_const_7-_leaf_eo+0x5 +_sse_leaf_oe_offsets: + .long LEAF_OE_const_0-_leaf_oe+0x5 + .long LEAF_OE_const_1-_leaf_oe+0x4 + .long LEAF_OE_const_2-_leaf_oe+0x4 + .long LEAF_OE_const_3-_leaf_oe+0x5 + .long LEAF_OE_const_4-_leaf_oe+0x5 + .long LEAF_OE_const_5-_leaf_oe+0x5 + .long LEAF_OE_const_6-_leaf_oe+0x4 + .long LEAF_OE_const_7-_leaf_oe+0x4 +#else + .globl sse_leaf_ee_offsets + .globl sse_leaf_oo_offsets + .globl sse_leaf_eo_offsets + .globl sse_leaf_oe_offsets + .align 4 +sse_leaf_ee_offsets: + .long LEAF_EE_const_0-leaf_ee+0x4 + .long LEAF_EE_const_1-leaf_ee+0x5 + .long LEAF_EE_const_2-leaf_ee+0x5 + .long LEAF_EE_const_3-leaf_ee+0x5 + .long LEAF_EE_const_4-leaf_ee+0x5 + .long LEAF_EE_const_5-leaf_ee+0x5 + .long LEAF_EE_const_6-leaf_ee+0x4 + .long LEAF_EE_const_7-leaf_ee+0x5 +sse_leaf_oo_offsets: + .long LEAF_OO_const_0-leaf_oo+0x4 + .long LEAF_OO_const_1-leaf_oo+0x4 + .long LEAF_OO_const_2-leaf_oo+0x5 + .long LEAF_OO_const_3-leaf_oo+0x5 + .long LEAF_OO_const_4-leaf_oo+0x4 + .long LEAF_OO_const_5-leaf_oo+0x5 + .long LEAF_OO_const_6-leaf_oo+0x5 + .long LEAF_OO_const_7-leaf_oo+0x5 +sse_leaf_eo_offsets: + .long LEAF_EO_const_0-leaf_eo+0x5 + .long LEAF_EO_const_1-leaf_eo+0x4 + .long LEAF_EO_const_2-leaf_eo+0x4 + .long LEAF_EO_const_3-leaf_eo+0x4 + .long LEAF_EO_const_4-leaf_eo+0x5 + .long LEAF_EO_const_5-leaf_eo+0x5 + .long LEAF_EO_const_6-leaf_eo+0x4 + .long LEAF_EO_const_7-leaf_eo+0x5 
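+# Each entry in these sse_leaf_*_offsets tables is the byte offset of a 0xFECA
+# placeholder displacement inside the corresponding leaf routine; codegen.c
+# presumably rewrites those displacements with the real input offsets when the
+# leaf code is copied into a generated transform.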
+sse_leaf_oe_offsets: + .long LEAF_OE_const_0-leaf_oe+0x5 + .long LEAF_OE_const_1-leaf_oe+0x4 + .long LEAF_OE_const_2-leaf_oe+0x4 + .long LEAF_OE_const_3-leaf_oe+0x5 + .long LEAF_OE_const_4-leaf_oe+0x5 + .long LEAF_OE_const_5-leaf_oe+0x5 + .long LEAF_OE_const_6-leaf_oe+0x4 + .long LEAF_OE_const_7-leaf_oe+0x4 +#endif + +#ifdef __APPLE__ + .data +#else + .section .data +#endif + .p2align 4 +#ifdef __APPLE__ + .globl _sse_constants +_sse_constants: +#else + .globl sse_constants +sse_constants: +#endif + .long 0x00000000,0x80000000,0x00000000,0x80000000 + .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3 + .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3 + .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3 + .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3 +#ifdef __APPLE__ + .globl _sse_constants_inv +_sse_constants_inv: +#else + .globl sse_constants_inv +sse_constants_inv: +#endif + .long 0x80000000,0x00000000,0x80000000,0x00000000 + .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3 + .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3 + .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3 + .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3 diff --git a/lib/ffts/src/types.h b/lib/ffts/src/types.h new file mode 100644 index 0000000..04cbf61 --- /dev/null +++ b/lib/ffts/src/types.h @@ -0,0 +1,49 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +#ifndef __TYPES_H__ +#define __TYPES_H__ + +#define __INLINE static inline __attribute__((always_inline)) + +#if defined(complex) + typedef complex float cdata_t; +#else + typedef float cdata_t[2]; +#endif + typedef float data_t; + +#endif + + diff --git a/lib/ffts/src/vfp.h b/lib/ffts/src/vfp.h new file mode 100644 index 0000000..f733a3f --- /dev/null +++ b/lib/ffts/src/vfp.h @@ -0,0 +1,45 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, 2013 Anthony M. 
Blake <amb@anthonix.com> + Copyright (c) 2012, 2013 The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __VFP_H__ +#define __VFP_H__ + +#include "ffts.h" + +void vfp_e(); +void vfp_o(); +void vfp_x4(); +void vfp_x8(); +void vfp_end(); + +#endif diff --git a/lib/ffts/src/vfp.s b/lib/ffts/src/vfp.s new file mode 100644 index 0000000..8ced89d --- /dev/null +++ b/lib/ffts/src/vfp.s @@ -0,0 +1,473 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, 2013 The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + +@ assumes r0 = out +@ r1 = in ? +@ +@ r12 = offsets +@ r3-r10 = data pointers +@ r11 = loop iterations +@ r2 = const pointer +@ & lr = temps + + .align 4 +#ifdef __APPLE__ + .globl _vfp_e +_vfp_e: +#else + .globl vfp_e +vfp_e: +#endif +_vfp_e_loop: + vldr s15, [r2, #8] + vldr s2, [r3] @ x0 + vldr s0, [r3, #4] + vldr s4, [r4] @ x1 + vldr s11, [r2] + vldr s10, [r7] @ x4 + vldr s3, [r7, #4] + vldr s8, [r8] @ x5 + vldr s1, [r8, #4] + vldr s14, [r9] @ x6 + vldr s9, [r9, #4] + vldr s6, [r10] @ x7 + vldr s12, [r10, #4] + vsub.f32 s18, s3, s1 + vsub.f32 s7, s10, s8 + vsub.f32 s5, s14, s6 + vadd.f32 s6, s14, s6 + vldr s24, [r5, #4] + vsub.f32 s14, s9, s12 + vldr s22, [r6, #4] + vadd.f32 s8, s10, s8 + vldr s28, [r6] @ x3 + vldr s17, [r5] @ x2 + vadd.f32 s10, s9, s12 + vmul.f32 s13, s18, s15 + vmul.f32 s9, s7, s11 + vmul.f32 s16, s5, s11 + vmul.f32 s18, s18, s11 + vmul.f32 s30, s14, s11 + vldr s11, [r4, #4] + add r3, r3, #8 + add r4, r4, #8 + add r5, r5, #8 + add r6, r6, #8 + add r7, r7, #8 + add r8, r8, #8 + add r9, r9, #8 + add r10, r10, #8 + vmul.f32 s12, s5, s15 + vmul.f32 s20, s14, s15 + vadd.f32 s5, s2, s4 + vadd.f32 s3, s3, s1 + vmul.f32 s15, s7, s15 + vadd.f32 s1, s24, s22 + vsub.f32 s7, s24, s22 + vadd.f32 s24, s17, s28 + vadd.f32 s26, s0, s11 + vsub.f32 s14, s9, s13 + vsub.f32 s2, s2, s4 + vadd.f32 s4, s16, s20 + vsub.f32 s22, s0, s11 + vsub.f32 s16, s17, s28 + vadd.f32 s9, s5, s24 + vadd.f32 s28, s18, s15 + vadd.f32 s13, s8, s6 + vsub.f32 s5, s5, s24 + vsub.f32 s24, s8, s6 + vadd.f32 s11, s26, s1 + vsub.f32 s12, s30, s12 + vadd.f32 s20, s3, s10 + vsub.f32 s15, s3, s10 + vsub.f32 s3, s26, s1 + vadd.f32 s18, s9, s13 + vadd.f32 s10, s14, s4 + vadd.f32 s6, s2, s7 @ + vsub.f32 s0, s2, s7 @ + vadd.f32 s26, s11, s20 + vsub.f32 s4, s14, s4 + vsub.f32 s8, s22, s16 @ + vadd.f32 s1, s28, s12 +ldr lr, [r12], #4 +add lr, r0, lr, lsl #2 +subs r11, r11, #1 + vstr s18, [lr] + vsub.f32 s2, s28, s12 + vadd.f32 s12, s22, s16 @ + vsub.f32 s16, s3, s24 @ + vsub.f32 s13, s9, s13 + vstr s26, [lr, #4] + vadd.f32 s28, s5, s15 @ + vsub.f32 s7, s5, s15 @ + vadd.f32 s14, s6, s10 + vadd.f32 s5, s8, s1 + vadd.f32 s9, s0, s2 @ + vsub.f32 s2, s0, s2 @ + vsub.f32 s11, s11, s20 + vstr s28, [lr, #16] + vadd.f32 s3, s3, s24 @ + vstr s16, [lr, #20] + vsub.f32 s6, s6, s10 + vstr s13, [lr, #32] + vsub.f32 s13, s12, s4 @ + vsub.f32 s8, s8, s1 + vadd.f32 s0, s12, s4 @ + vstr s11, [lr, #36] + vstr s7, [lr, #48] + vstr s3, [lr, #52] + vstr s14, [lr, #8] + vstr s5, [lr, #12] + vstr s9, [lr, #24] + vstr s13, [lr, #28] + vstr s6, [lr, #40] + vstr s8, [lr, #44] + vstr s2, [lr, #56] + vstr s0, [lr, #60] + bne _vfp_e_loop + +@ assumes r0 = out +@ r1 = in ? 
+@ +@ r12 = offsets +@ r3-r10 = data pointers +@ r11 = loop iterations +@ r2 & lr = temps + .align 4 +#ifdef __APPLE__ + .globl _vfp_o +_vfp_o: +#else + .globl vfp_o +vfp_o: +#endif + _vfp_o_loop: + vldr s4, [r3] @ x0 + vldr s0, [r3, #4] + vldr s6, [r4] @ x1 + vldr s5, [r4, #4] + vldr s7, [r5] @ x2 + vldr s1, [r5, #4] + vldr s3, [r6] @ x3 + vldr s8, [r6, #4] + subs r11, r11, #1 + ldr r2, [r12], #4 + add r2, r0, r2, lsl #2 + vadd.f32 s2, s4, s6 + vadd.f32 s14, s0, s5 + vadd.f32 s10, s1, s8 + vsub.f32 s4, s4, s6 + vsub.f32 s0, s0, s5 + vadd.f32 s12, s7, s3 + vsub.f32 s6, s7, s3 + vsub.f32 s8, s1, s8 + vadd.f32 s5, s14, s10 + vsub.f32 s10, s14, s10 + vadd.f32 s7, s2, s12 + vsub.f32 s1, s0, s6 @ + vsub.f32 s12, s2, s12 + vadd.f32 s3, s4, s8 @ + vsub.f32 s2, s4, s8 @ + vadd.f32 s0, s0, s6 @ + vstr s7, [r2] + vldr s7, [r9] @ x2 + vstr s5, [r2, #4] + vstr s3, [r2, #8] + vstr s1, [r2, #12] + vstr s12, [r2, #16] + vstr s10, [r2, #20] + vstr s2, [r2, #24] + vstr s0, [r2, #28] + vldr s4, [r7] @ x0 + vldr s0, [r7, #4] + vldr s6, [r8] @ x1 + vldr s5, [r8, #4] + vldr s3, [r10] @ x3 + vldr s8, [r10, #4] + vldr s1, [r9, #4] + add r3, r3, #8 + add r4, r4, #8 + add r5, r5, #8 + add r6, r6, #8 + add r7, r7, #8 + add r8, r8, #8 + add r9, r9, #8 + add r10, r10, #8 + vadd.f32 s2, s4, s6 + vadd.f32 s14, s0, s5 + vadd.f32 s10, s1, s8 + vsub.f32 s4, s4, s6 + vsub.f32 s0, s0, s5 + vadd.f32 s12, s7, s3 + vsub.f32 s6, s7, s3 + vsub.f32 s8, s1, s8 + vadd.f32 s5, s14, s10 + vsub.f32 s10, s14, s10 + vadd.f32 s7, s2, s12 + vsub.f32 s1, s0, s6 @ + vsub.f32 s12, s2, s12 + vadd.f32 s3, s4, s8 @ + vsub.f32 s2, s4, s8 @ + vadd.f32 s0, s0, s6 @ + vstr s7, [r2, #32] + vstr s5, [r2, #36] + vstr s3, [r2, #40] + vstr s1, [r2, #44] + vstr s12, [r2, #48] + vstr s10, [r2, #52] + vstr s2, [r2, #56] + vstr s0, [r2, #60] + bne _vfp_o_loop + + .align 4 +#ifdef __APPLE__ + .globl _vfp_x4 +_vfp_x4: +#else + .globl vfp_x4 +vfp_x4: +#endif + add r3, r0, #0 + add r7, r2, #0 + add r4, r0, r1, lsl #1 + add r5, r0, r1, lsl #2 + add r6, r4, r1, lsl #2 + mov r11, #4 +_vfp_x4_loop: + + vldr s8, [r3, #0] + vldr s9, [r3, #4] + vldr s10, [r4, #0] + vldr s11, [r4, #4] + vldr s12, [r5, #0] + vldr s13, [r5, #4] + vldr s14, [r6, #0] + vldr s15, [r6, #4] + vldr s2, [r7, #0] + vldr s3, [r7, #4] + add r7, r7, #8 + subs r11, r11, #1 + vmul.f32 s0, s13, s3 + vmul.f32 s5, s12, s2 + vmul.f32 s1, s14, s2 + vmul.f32 s4, s14, s3 + vmul.f32 s14, s12, s3 + vmul.f32 s13, s13, s2 + vmul.f32 s12, s15, s3 + vmul.f32 s2, s15, s2 + vsub.f32 s0, s5, s0 + vadd.f32 s13, s13, s14 + vadd.f32 s12, s12, s1 + vsub.f32 s1, s2, s4 + vadd.f32 s15, s0, s12 + vsub.f32 s12, s0, s12 + vadd.f32 s14, s13, s1 + vsub.f32 s13, s13, s1 + vadd.f32 s0, s8, s15 + vadd.f32 s1, s9, s14 + vadd.f32 s2, s10, s13 @ + vsub.f32 s4, s8, s15 + vsub.f32 s3, s11, s12 @ + vstr s0, [r3, #0] + vstr s1, [r3, #4] + add r3, r3, #8 + vsub.f32 s5, s9, s14 + vsub.f32 s6, s10, s13 @ + vadd.f32 s7, s11, s12 @ + vstr s2, [r4, #0] + vstr s3, [r4, #4] + add r4, r4, #8 + vstr s4, [r5, #0] + vstr s5, [r5, #4] + add r5, r5, #8 + vstr s6, [r6, #0] + vstr s7, [r6, #4] + add r6, r6, #8 + bne _vfp_x4_loop + bx lr + + .align 4 +#ifdef __APPLE__ + .globl _vfp_x8 +_vfp_x8: +#else + .globl vfp_x8 +vfp_x8: +#endif + mov r11, #0 + add r3, r0, #0 @ data0 + add r5, r0, r1, lsl #1 @ data2 + add r4, r0, r1 @ data1 + add r7, r5, r1, lsl #1 @ data4 + add r6, r5, r1 @ data3 + add r9, r7, r1, lsl #1 @ data6 + add r8, r7, r1 @ data5 + add r10, r9, r1 @ data7 + add r12, r2, #0 @ LUT + + sub r11, r11, r1, lsr #3 +_vfp_x8_loop: + vldr s10, [r3, 
#0] @ x0-re + vldr s8, [r3, #4] @ x0-im + vldr s2, [r4, #0] @ x1-re + vldr s0, [r4, #4] @ x1-im + vldr s6, [r5, #0] @ x2-re + vldr s4, [r5, #4] @ x2-im + vldr s13, [r6, #0] @ x3-re + vldr s15, [r6, #4] @ x3-im + vldr s7, [r12] + vldr s11, [r12, #4] + vldr s5, [r7, #0] @ x4-re + vldr s1, [r7, #4] @ x4-im + vldr s28, [r9, #0] @ x6-re + vldr s18, [r9, #4] @ x6-im + adds r11, r11, #1 + vmul.f32 s14, s15, s7 + vldr s24, [r12, #12] + vmul.f32 s12, s13, s11 + vmul.f32 s26, s13, s7 + vldr s13, [r12, #8] + vmul.f32 s3, s4, s11 + vmul.f32 s15, s15, s11 + vmul.f32 s16, s4, s7 + vmul.f32 s9, s6, s7 + vmul.f32 s11, s6, s11 + vmul.f32 s7, s18, s24 + vmul.f32 s20, s1, s24 + vmul.f32 s30, s5, s13 + vadd.f32 s4, s26, s15 + vsub.f32 s12, s14, s12 + vsub.f32 s6, s9, s3 + vadd.f32 s14, s16, s11 + vmul.f32 s22, s28, s13 + vmul.f32 s26, s28, s24 + vmul.f32 s18, s18, s13 + vmul.f32 s5, s5, s24 + vmul.f32 s1, s1, s13 + vsub.f32 s9, s30, s20 + vadd.f32 s16, s14, s12 + vadd.f32 s3, s22, s7 + vadd.f32 s15, s6, s4 + vsub.f32 s11, s18, s26 + vadd.f32 s18, s1, s5 + vadd.f32 s13, s8, s16 + vadd.f32 s1, s9, s3 + vadd.f32 s7, s10, s15 + vsub.f32 s15, s10, s15 + vsub.f32 s10, s9, s3 + vadd.f32 s5, s18, s11 + vsub.f32 s11, s18, s11 + vsub.f32 s8, s8, s16 + vadd.f32 s20, s7, s1 + vsub.f32 s7, s7, s1 + vadd.f32 s18, s13, s5 + vadd.f32 s16, s15, s11 @ + vsub.f32 s9, s8, s10 @ + vsub.f32 s3, s13, s5 + vsub.f32 s1, s15, s11 @ + vstr s20, [r3] + vadd.f32 s8, s8, s10 @ + vstr s18, [r3, #4] + add r3, r3, #8 + vstr s16, [r5] + vstr s9, [r5, #4] + add r5, r5, #8 + vstr s7, [r7] + vstr s3, [r7, #4] + add r7, r7, #8 + vstr s1, [r9] + vstr s8, [r9, #4] + add r9, r9, #8 + vldr s10, [r8, #0] @ x5-re + vldr s8, [r8, #4] @ x5-im + vldr s5, [r10, #0] @ x7-re + vldr s11, [r10, #4] @ x7-im + vldr s1, [r12, #16] + vldr s15, [r12, #20] + add r12, r12, #24 + vmul.f32 s9, s5, s1 + vmul.f32 s3, s11, s15 + vmul.f32 s13, s10, s1 + vmul.f32 s7, s8, s15 + vmul.f32 s5, s5, s15 + vmul.f32 s11, s11, s1 + vmul.f32 s10, s10, s15 + vmul.f32 s15, s8, s1 + vsub.f32 s1, s14, s12 + vadd.f32 s8, s9, s3 + vsub.f32 s3, s6, s4 + vsub.f32 s12, s13, s7 + vsub.f32 s5, s11, s5 + vadd.f32 s7, s15, s10 + vadd.f32 s4, s2, s1 @ + vsub.f32 s2, s2, s1 @ + vsub.f32 s6, s0, s3 @ + vadd.f32 s10, s12, s8 + vsub.f32 s9, s12, s8 + vadd.f32 s0, s0, s3 @ + vsub.f32 s1, s7, s5 + vadd.f32 s14, s7, s5 + vadd.f32 s7, s4, s10 + vsub.f32 s8, s4, s10 + vsub.f32 s12, s0, s9 @ + vadd.f32 s3, s2, s1 @ + vadd.f32 s5, s6, s14 + vsub.f32 s4, s6, s14 + vsub.f32 s2, s2, s1 @ + vadd.f32 s0, s0, s9 @ + vstr s7, [r4] + vstr s5, [r4, #4] + add r4, r4, #8 + vstr s3, [r6] + vstr s12, [r6, #4] + add r6, r6, #8 + vstr s8, [r8] + vstr s4, [r8, #4] + add r8, r8, #8 + vstr s2, [r10] + vstr s0, [r10, #4] + add r10, r10, #8 + bne _vfp_x8_loop + bx lr + + + .align 4 +#ifdef __APPLE__ + .globl _vfp_end +_vfp_end: +#else + .globl vfp_end +vfp_end: +#endif + bx lr |
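
Note: the .long words in the sse_constants / sse_constants_inv tables of sse.s are raw IEEE-754 single-precision bit patterns. A minimal, self-contained C sketch (illustrative only, not part of FFTS) that decodes them:

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Reinterpret a raw IEEE-754 bit pattern as a float without violating
       strict aliasing. */
    static float bits_to_float(uint32_t bits) {
        float f;
        memcpy(&f, &bits, sizeof f);
        return f;
    }

    int main(void) {
        /* Bit patterns taken from the sse_constants table in sse.s. */
        const uint32_t words[] = { 0x00000000u, 0x80000000u, 0x3f3504f3u,
                                   0xbf3504f3u, 0x3f800000u };
        for (size_t i = 0; i < sizeof words / sizeof words[0]; i++)
            printf("0x%08x -> %+.8f\n", (unsigned)words[i],
                   bits_to_float(words[i]));
        printf("1/sqrt(2)   =  %.8f\n", 1.0 / sqrt(2.0));
        return 0;
    }

Read this way, the {0x00000000, 0x80000000, ...} row acts as a sign mask for xorps (negating alternate float lanes), and the 0x3f3504f3 / 0xbf3504f3 rows appear to be the +/- sqrt(2)/2 twiddle factors used by the size-8 butterflies; sse_constants_inv carries the same values with the sign pattern of the mask and twiddle rows mirrored for the inverse transform.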