summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Timothy Pearson <tpearson@raptorengineering.com> 2024-08-24 13:04:45 -0500
committer Timothy Pearson <tpearson@raptorengineering.com> 2024-08-24 13:51:05 -0500
commit 2ef6dba8728db2437def9a4fc1d3e20e0aa44c31 (patch)
tree 5211805789c78639d6b96a89bd0a4a96078d0fd9
parent c40a208abbc778da4271485eba06a89d05c69b5e (diff)
download ulab-2ef6dba8728db2437def9a4fc1d3e20e0aa44c31.tar.gz
ulab-2ef6dba8728db2437def9a4fc1d3e20e0aa44c31.zip
Revup FFTS to latest upstream version
Taken from https://github.com/linkotec/ffts
Fixes ppc64el support and a handful of other bugs
-rw-r--r-- lib/ffts/CMakeLists.txt 158
-rwxr-xr-x lib/ffts/config.guess 463
-rwxr-xr-x lib/ffts/config.sub 139
-rw-r--r-- lib/ffts/ffts.pc.cmake.in 6
-rw-r--r-- lib/ffts/include/ffts.h 4
-rw-r--r-- lib/ffts/src/Makefile.am 4
-rw-r--r-- lib/ffts/src/codegen.c 4
-rw-r--r-- lib/ffts/src/codegen_sse.h 64
-rw-r--r-- lib/ffts/src/ffts.c 292
-rw-r--r-- lib/ffts/src/ffts_chirp_z.c 225
-rw-r--r-- lib/ffts/src/ffts_chirp_z.h 45
-rw-r--r-- lib/ffts/src/ffts_cpu.c 371
-rw-r--r-- lib/ffts/src/ffts_cpu.h 54
-rw-r--r-- lib/ffts/src/ffts_internal.h 123
-rw-r--r-- lib/ffts/src/ffts_real.c 218
-rw-r--r-- lib/ffts/src/ffts_static.c 586
-rw-r--r-- lib/ffts/src/ffts_static.h 24
-rw-r--r-- lib/ffts/src/ffts_trig.c 1057
-rw-r--r-- lib/ffts/src/ffts_trig.h 12
-rw-r--r-- lib/ffts/src/macros-alpha.h 3
-rw-r--r-- lib/ffts/src/macros-altivec.h 77
-rw-r--r-- lib/ffts/src/macros-neon.h 3
-rw-r--r-- lib/ffts/src/macros-sse.h 223
-rw-r--r-- lib/ffts/src/macros.h 172
24 files changed, 3620 insertions, 707 deletions
diff --git a/lib/ffts/CMakeLists.txt b/lib/ffts/CMakeLists.txt
index 459655e..748f412 100644
--- a/lib/ffts/CMakeLists.txt
+++ b/lib/ffts/CMakeLists.txt
@@ -7,7 +7,7 @@ set(FFTS_MAJOR 0)
set(FFTS_MINOR 9)
set(FFTS_MICRO 0)
-set(FFTS_VERSION "ffts-${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}")
+set(FFTS_VERSION "${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}")
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
@@ -22,6 +22,16 @@ set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/include/ffts)
set(LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib)
# common options
+
+# !!!! FOR TESTING ONLY !!!!
+option(ENABLE_AVX
+ "Enables AVX instructions." OFF
+)
+# !!!! FOR TESTING ONLY !!!!
+option(ENABLE_DOUBLE
+ "Enables double precision" OFF
+)
+
option(ENABLE_NEON
"Enables the use of NEON instructions." OFF
)
@@ -48,24 +58,36 @@ option(ENABLE_STATIC
include(CheckCSourceCompiles)
include(CheckCSourceRuns)
+include(CheckFunctionExists)
include(CheckIncludeFile)
+include(CheckSymbolExists)
# Ensure defined when building FFTS (as opposed to using it from
# another project). Used to export functions from Windows DLL.
add_definitions(-DFFTS_BUILD)
# check existence of various headers
-check_include_file(malloc.h HAVE_MALLOC_H)
-check_include_file(stdint.h HAVE_STDINT_H)
-check_include_file(stdlib.h HAVE_STDLIB_H)
-check_include_file(string.h HAVE_STRING_H)
-check_include_file(sys/mman.h HAVE_SYS_MMAN_H)
-check_include_file(unistd.h HAVE_UNISTD_H)
+check_include_file(inttypes.h HAVE_INTTYPES_H)
+check_include_file(malloc.h HAVE_MALLOC_H)
+check_include_file(mm_malloc.h HAVE_MM_MALLOC_H)
+check_include_file(stdint.h HAVE_STDINT_H)
+check_include_file(stdlib.h HAVE_STDLIB_H)
+check_include_file(string.h HAVE_STRING_H)
+check_include_file(sys/mman.h HAVE_SYS_MMAN_H)
+check_include_file(unistd.h HAVE_UNISTD_H)
+
+if(HAVE_INTTYPES_H)
+ add_definitions(-DHAVE_INTTYPES_H)
+endif(HAVE_INTTYPES_H)
if(HAVE_MALLOC_H)
add_definitions(-DHAVE_MALLOC_H)
endif(HAVE_MALLOC_H)
+if(HAVE_MM_MALLOC_H)
+ add_definitions(-DHAVE_MM_MALLOC_H)
+endif(HAVE_MM_MALLOC_H)
+
if(HAVE_STDINT_H)
add_definitions(-DHAVE_STDINT_H)
endif(HAVE_STDINT_H)
@@ -86,6 +108,50 @@ if(HAVE_UNISTD_H)
add_definitions(-DHAVE_UNISTD_H)
endif(HAVE_UNISTD_H)
+# check existence of various declarations
+check_symbol_exists(memalign malloc.h HAVE_DECL_MEMALIGN)
+check_symbol_exists(posix_memalign stdlib.h HAVE_DECL_POSIX_MEMALIGN)
+check_symbol_exists(valloc stdlib.h HAVE_DECL_VALLOC)
+check_symbol_exists(_mm_malloc malloc.h HAVE_DECL__MM_MALLOC)
+
+if(HAVE_DECL_MEMALIGN)
+ add_definitions(-DHAVE_DECL_MEMALIGN)
+endif(HAVE_DECL_MEMALIGN)
+
+if(HAVE_DECL_POSIX_MEMALIGN)
+ add_definitions(-DHAVE_DECL_POSIX_MEMALIGN)
+endif(HAVE_DECL_POSIX_MEMALIGN)
+
+if(HAVE_DECL_VALLOC)
+ add_definitions(-DHAVE_DECL_VALLOC)
+endif(HAVE_DECL_VALLOC)
+
+if(HAVE_DECL__MM_MALLOC)
+ add_definitions(-DHAVE_DECL__MM_MALLOC)
+endif(HAVE_DECL__MM_MALLOC)
+
+# check existence of various functions
+check_function_exists(memalign HAVE_MEMALIGN)
+check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN)
+check_function_exists(valloc HAVE_VALLOC)
+check_function_exists(_mm_malloc HAVE__MM_MALLOC)
+
+if(HAVE_MEMALIGN)
+ add_definitions(-DHAVE_MEMALIGN)
+endif(HAVE_MEMALIGN)
+
+if(HAVE_POSIX_MEMALIGN)
+ add_definitions(-DHAVE_POSIX_MEMALIGN)
+endif(HAVE_POSIX_MEMALIGN)
+
+if(HAVE_VALLOC)
+ add_definitions(-DHAVE_VALLOC)
+endif(HAVE_VALLOC)
+
+if(HAVE__MM_MALLOC)
+ add_definitions(-DHAVE__MM_MALLOC)
+endif(HAVE__MM_MALLOC)
+
# backup flags
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
@@ -246,6 +312,14 @@ if(NOT CMAKE_CROSSCOMPILING)
if(HAVE_XMMINTRIN_H)
add_definitions(-DHAVE_SSE)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+
+ # TODO: not the right place
+ if(ENABLE_AVX)
+ add_definitions(-DHAVE_AVX)
+ endif(ENABLE_AVX)
+ if(ENABLE_DOUBLE)
+ add_definitions(-DFFTS_DOUBLE)
+ endif(ENABLE_DOUBLE)
endif(HAVE_XMMINTRIN_H)
# enable SSE2 code generation
@@ -351,6 +425,10 @@ set(FFTS_HEADERS
set(FFTS_SOURCES
src/ffts_attributes.h
src/ffts.c
+ src/ffts_chirp_z.c
+ src/ffts_chirp_z.h
+ src/ffts_cpu.c
+ src/ffts_cpu.h
src/ffts_internal.h
src/ffts_nd.c
src/ffts_nd.h
@@ -369,6 +447,17 @@ set(FFTS_SOURCES
src/types.h
)
+if(NOT DISABLE_DYNAMIC_CODE)
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+ list(APPEND FFTS_SOURCES
+ src/codegen_sse.h
+ )
+ else()
+ message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.")
+ set(DISABLE_DYNAMIC_CODE ON)
+ endif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+endif(NOT DISABLE_DYNAMIC_CODE)
+
if(ENABLE_NEON)
list(APPEND FFTS_SOURCES
src/neon.s
@@ -393,19 +482,9 @@ elseif(HAVE_XMMINTRIN_H)
add_definitions(-DHAVE_SSE)
list(APPEND FFTS_SOURCES
+ src/macros-avx.h
src/macros-sse.h
)
-
- if(NOT DISABLE_DYNAMIC_CODE)
- if(CMAKE_SIZEOF_VOID_P EQUAL 8)
- list(APPEND FFTS_SOURCES
- src/codegen_sse.h
- )
- else()
- message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.")
- set(DISABLE_DYNAMIC_CODE ON)
- endif(CMAKE_SIZEOF_VOID_P EQUAL 8)
- endif(NOT DISABLE_DYNAMIC_CODE)
endif(ENABLE_NEON)
if(DISABLE_DYNAMIC_CODE)
@@ -452,6 +531,41 @@ if(ENABLE_STATIC)
endif(ENABLE_STATIC)
if(ENABLE_STATIC OR ENABLE_SHARED)
+ find_path(MPFR_INCLUDES
+ NAMES mpfr.h
+ PATHS ${INCLUDE_INSTALL_DIR}
+ )
+ find_library(MPFR_LIBRARIES mpfr PATHS ${LIB_INSTALL_DIR})
+ find_package(OpenMP)
+
+ if(MPFR_INCLUDES)
+ add_definitions(-DHAVE_MPFR_H)
+ include_directories(${MPFR_INCLUDES})
+ endif(MPFR_INCLUDES)
+
+ add_executable(ffts_trig_test
+ tests/trig_test.c
+ )
+
+ target_link_libraries(ffts_trig_test ffts)
+ if(MPFR_LIBRARIES)
+ target_link_libraries(ffts_trig_test ${MPFR_LIBRARIES})
+ endif(MPFR_LIBRARIES)
+
+ if(OPENMP_FOUND)
+ if(MSVC)
+ set_target_properties(ffts_trig_test PROPERTIES
+ COMPILE_FLAGS "${OpenMP_C_FLAGS}"
+ LINK_FLAGS "${OpenMP_EXE_LINKER_FLAGS}"
+ )
+ else()
+ set_target_properties(ffts_trig_test PROPERTIES
+ COMPILE_FLAGS "${OpenMP_C_FLAGS}"
+ LINK_FLAGS "${OpenMP_C_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}"
+ )
+ endif(MSVC)
+ endif(OPENMP_FOUND)
+
add_executable(ffts_test
tests/test.c
)
@@ -467,6 +581,14 @@ if(ENABLE_STATIC OR ENABLE_SHARED)
ffts
${FFTS_EXTRA_LIBRARIES}
)
+
+ add_executable(ffts_cpu_test
+ src/ffts_cpu.c
+ src/ffts_cpu.h
+ tests/cpu_test.c
+ )
+
+ set_target_properties(ffts_cpu_test PROPERTIES COMPILE_DEFINITIONS FFTS_BUILDING_CPU_TEST)
endif(ENABLE_STATIC OR ENABLE_SHARED)
# generate packageconfig file
diff --git a/lib/ffts/config.guess b/lib/ffts/config.guess
index 0967f2a..137bedf 100755
--- a/lib/ffts/config.guess
+++ b/lib/ffts/config.guess
@@ -1,12 +1,14 @@
#! /bin/sh
# Attempt to guess a canonical system name.
-# Copyright 1992-2016 Free Software Foundation, Inc.
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+# 2011, 2012 Free Software Foundation, Inc.
-timestamp='2016-04-02'
+timestamp='2012-08-14'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
+# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
@@ -20,17 +22,19 @@ timestamp='2016-04-02'
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that
-# program. This Exception is an additional permission under section 7
-# of the GNU General Public License, version 3 ("GPLv3").
+# the same distribution terms that you use for the rest of that program.
+
+
+# Originally written by Per Bothner. Please send patches (context
+# diff format) to <config-patches@gnu.org> and include a ChangeLog
+# entry.
#
-# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
+# This script attempts to guess a canonical system name similar to
+# config.sub. If it succeeds, it prints the system name on stdout, and
+# exits with 0. Otherwise, it exits with 1.
#
# You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
-#
-# Please send patches to <config-patches@gnu.org>.
-
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
me=`echo "$0" | sed -e 's,.*/,,'`
@@ -50,7 +54,9 @@ version="\
GNU config.guess ($timestamp)
Originally written by Per Bothner.
-Copyright 1992-2016 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -132,27 +138,6 @@ UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
-case "${UNAME_SYSTEM}" in
-Linux|GNU|GNU/*)
- # If the system lacks a compiler, then just pick glibc.
- # We could probably try harder.
- LIBC=gnu
-
- eval $set_cc_for_build
- cat <<-EOF > $dummy.c
- #include <features.h>
- #if defined(__UCLIBC__)
- LIBC=uclibc
- #elif defined(__dietlibc__)
- LIBC=dietlibc
- #else
- LIBC=gnu
- #endif
- EOF
- eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
- ;;
-esac
-
# Note: order is significant - the case branches are not exclusive.
case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
@@ -168,27 +153,20 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
# Note: NetBSD doesn't particularly care about the vendor
# portion of the name. We always set it to "unknown".
sysctl="sysctl -n hw.machine_arch"
- UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \
- /sbin/$sysctl 2>/dev/null || \
- /usr/sbin/$sysctl 2>/dev/null || \
- echo unknown)`
+ UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
+ /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
case "${UNAME_MACHINE_ARCH}" in
armeb) machine=armeb-unknown ;;
arm*) machine=arm-unknown ;;
sh3el) machine=shl-unknown ;;
sh3eb) machine=sh-unknown ;;
sh5el) machine=sh5le-unknown ;;
- earmv*)
- arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
- endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'`
- machine=${arch}${endian}-unknown
- ;;
*) machine=${UNAME_MACHINE_ARCH}-unknown ;;
esac
# The Operating System including object format, if it has switched
# to ELF recently, or will in the future.
case "${UNAME_MACHINE_ARCH}" in
- arm*|earm*|i386|m68k|ns32k|sh3*|sparc|vax)
+ arm*|i386|m68k|ns32k|sh3*|sparc|vax)
eval $set_cc_for_build
if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
| grep -q __ELF__
@@ -204,13 +182,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
os=netbsd
;;
esac
- # Determine ABI tags.
- case "${UNAME_MACHINE_ARCH}" in
- earm*)
- expr='s/^earmv[0-9]/-eabi/;s/eb$//'
- abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"`
- ;;
- esac
# The OS release
# Debian GNU/NetBSD machines have a different userland, and
# thus, need a distinct triplet. However, they do not need
@@ -221,13 +192,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
release='-gnu'
;;
*)
- release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2`
+ release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
;;
esac
# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
# contains redundant information, the shorter form:
# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
- echo "${machine}-${os}${release}${abi}"
+ echo "${machine}-${os}${release}"
exit ;;
*:Bitrig:*:*)
UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
@@ -237,10 +208,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
exit ;;
- *:LibertyBSD:*:*)
- UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'`
- echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE}
- exit ;;
*:ekkoBSD:*:*)
echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
exit ;;
@@ -253,9 +220,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
*:MirBSD:*:*)
echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
exit ;;
- *:Sortix:*:*)
- echo ${UNAME_MACHINE}-unknown-sortix
- exit ;;
alpha:OSF1:*:*)
case $UNAME_RELEASE in
*4.0)
@@ -272,42 +236,42 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1`
case "$ALPHA_CPU_TYPE" in
"EV4 (21064)")
- UNAME_MACHINE=alpha ;;
+ UNAME_MACHINE="alpha" ;;
"EV4.5 (21064)")
- UNAME_MACHINE=alpha ;;
+ UNAME_MACHINE="alpha" ;;
"LCA4 (21066/21068)")
- UNAME_MACHINE=alpha ;;
+ UNAME_MACHINE="alpha" ;;
"EV5 (21164)")
- UNAME_MACHINE=alphaev5 ;;
+ UNAME_MACHINE="alphaev5" ;;
"EV5.6 (21164A)")
- UNAME_MACHINE=alphaev56 ;;
+ UNAME_MACHINE="alphaev56" ;;
"EV5.6 (21164PC)")
- UNAME_MACHINE=alphapca56 ;;
+ UNAME_MACHINE="alphapca56" ;;
"EV5.7 (21164PC)")
- UNAME_MACHINE=alphapca57 ;;
+ UNAME_MACHINE="alphapca57" ;;
"EV6 (21264)")
- UNAME_MACHINE=alphaev6 ;;
+ UNAME_MACHINE="alphaev6" ;;
"EV6.7 (21264A)")
- UNAME_MACHINE=alphaev67 ;;
+ UNAME_MACHINE="alphaev67" ;;
"EV6.8CB (21264C)")
- UNAME_MACHINE=alphaev68 ;;
+ UNAME_MACHINE="alphaev68" ;;
"EV6.8AL (21264B)")
- UNAME_MACHINE=alphaev68 ;;
+ UNAME_MACHINE="alphaev68" ;;
"EV6.8CX (21264D)")
- UNAME_MACHINE=alphaev68 ;;
+ UNAME_MACHINE="alphaev68" ;;
"EV6.9A (21264/EV69A)")
- UNAME_MACHINE=alphaev69 ;;
+ UNAME_MACHINE="alphaev69" ;;
"EV7 (21364)")
- UNAME_MACHINE=alphaev7 ;;
+ UNAME_MACHINE="alphaev7" ;;
"EV7.9 (21364A)")
- UNAME_MACHINE=alphaev79 ;;
+ UNAME_MACHINE="alphaev79" ;;
esac
# A Pn.n version is a patched version.
# A Vn.n version is a released version.
# A Tn.n version is a released field test version.
# A Xn.n version is an unreleased experimental baselevel.
# 1.2 uses "1.2" for uname -r.
- echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
+ echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
exitcode=$?
trap '' 0
@@ -342,7 +306,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
echo arm-acorn-riscix${UNAME_RELEASE}
exit ;;
- arm*:riscos:*:*|arm*:RISCOS:*:*)
+ arm:riscos:*:*|arm:RISCOS:*:*)
echo arm-unknown-riscos
exit ;;
SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
@@ -380,16 +344,16 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
exit ;;
i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
eval $set_cc_for_build
- SUN_ARCH=i386
+ SUN_ARCH="i386"
# If there is a compiler, see if it is configured for 64-bit objects.
# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
# This test works for both compilers.
- if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
+ if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
- (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
grep IS_64BIT_ARCH >/dev/null
then
- SUN_ARCH=x86_64
+ SUN_ARCH="x86_64"
fi
fi
echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
@@ -414,7 +378,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
exit ;;
sun*:*:4.2BSD:*)
UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
- test "x${UNAME_RELEASE}" = x && UNAME_RELEASE=3
+ test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
case "`/bin/arch`" in
sun3)
echo m68k-sun-sunos${UNAME_RELEASE}
@@ -600,9 +564,8 @@ EOF
else
IBM_ARCH=powerpc
fi
- if [ -x /usr/bin/lslpp ] ; then
- IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc |
- awk -F: '{ print $3 }' | sed s/[0-9]*$/0/`
+ if [ -x /usr/bin/oslevel ] ; then
+ IBM_REV=`/usr/bin/oslevel`
else
IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
fi
@@ -639,13 +602,13 @@ EOF
sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
case "${sc_cpu_version}" in
- 523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0
- 528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1
+ 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+ 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
532) # CPU_PA_RISC2_0
case "${sc_kernel_bits}" in
- 32) HP_ARCH=hppa2.0n ;;
- 64) HP_ARCH=hppa2.0w ;;
- '') HP_ARCH=hppa2.0 ;; # HP-UX 10.20
+ 32) HP_ARCH="hppa2.0n" ;;
+ 64) HP_ARCH="hppa2.0w" ;;
+ '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20
esac ;;
esac
fi
@@ -684,11 +647,11 @@ EOF
exit (0);
}
EOF
- (CCOPTS="" $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+ (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
test -z "$HP_ARCH" && HP_ARCH=hppa
fi ;;
esac
- if [ ${HP_ARCH} = hppa2.0w ]
+ if [ ${HP_ARCH} = "hppa2.0w" ]
then
eval $set_cc_for_build
@@ -701,12 +664,12 @@ EOF
# $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
# => hppa64-hp-hpux11.23
- if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) |
+ if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
grep -q __LP64__
then
- HP_ARCH=hppa2.0w
+ HP_ARCH="hppa2.0w"
else
- HP_ARCH=hppa64
+ HP_ARCH="hppa64"
fi
fi
echo ${HP_ARCH}-hp-hpux${HPUX_REV}
@@ -811,14 +774,14 @@ EOF
echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
exit ;;
F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
- FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
- FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
+ FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+ FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
exit ;;
5000:UNIX_System_V:4.*:*)
- FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
- FUJITSU_REL=`echo ${UNAME_RELEASE} | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'`
+ FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+ FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
exit ;;
i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
@@ -848,7 +811,7 @@ EOF
*:MINGW*:*)
echo ${UNAME_MACHINE}-pc-mingw32
exit ;;
- *:MSYS*:*)
+ i*:MSYS*:*)
echo ${UNAME_MACHINE}-pc-msys
exit ;;
i*:windows32*:*)
@@ -896,21 +859,21 @@ EOF
exit ;;
*:GNU:*:*)
# the GNU system
- echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+ echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
exit ;;
*:GNU/*:*:*)
# other systems with GNU libc and userland
- echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
exit ;;
i*86:Minix:*:*)
echo ${UNAME_MACHINE}-pc-minix
exit ;;
aarch64:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
aarch64_be:Linux:*:*)
UNAME_MACHINE=aarch64_be
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
alpha:Linux:*:*)
case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
@@ -923,60 +886,59 @@ EOF
EV68*) UNAME_MACHINE=alphaev68 ;;
esac
objdump --private-headers /bin/sh | grep -q ld.so.1
- if test "$?" = 0 ; then LIBC=gnulibc1 ; fi
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
- exit ;;
- arc:Linux:*:* | arceb:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+ echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
exit ;;
arm*:Linux:*:*)
eval $set_cc_for_build
if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
| grep -q __ARM_EABI__
then
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
else
if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
| grep -q __ARM_PCS_VFP
then
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi
+ echo ${UNAME_MACHINE}-unknown-linux-gnueabi
else
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf
+ echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
fi
fi
exit ;;
avr32*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
cris:Linux:*:*)
- echo ${UNAME_MACHINE}-axis-linux-${LIBC}
+ echo ${UNAME_MACHINE}-axis-linux-gnu
exit ;;
crisv32:Linux:*:*)
- echo ${UNAME_MACHINE}-axis-linux-${LIBC}
- exit ;;
- e2k:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-axis-linux-gnu
exit ;;
frv:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
hexagon:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
i*86:Linux:*:*)
- echo ${UNAME_MACHINE}-pc-linux-${LIBC}
+ LIBC=gnu
+ eval $set_cc_for_build
+ sed 's/^ //' << EOF >$dummy.c
+ #ifdef __dietlibc__
+ LIBC=dietlibc
+ #endif
+EOF
+ eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
+ echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
exit ;;
ia64:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
- exit ;;
- k1om:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
m32r*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
m68*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
mips:Linux:*:* | mips64:Linux:*:*)
eval $set_cc_for_build
@@ -995,63 +957,54 @@ EOF
#endif
EOF
eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
- test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
+ test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
;;
- openrisc*:Linux:*:*)
- echo or1k-unknown-linux-${LIBC}
- exit ;;
- or32:Linux:*:* | or1k*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ or32:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
padre:Linux:*:*)
- echo sparc-unknown-linux-${LIBC}
+ echo sparc-unknown-linux-gnu
exit ;;
parisc64:Linux:*:* | hppa64:Linux:*:*)
- echo hppa64-unknown-linux-${LIBC}
+ echo hppa64-unknown-linux-gnu
exit ;;
parisc:Linux:*:* | hppa:Linux:*:*)
# Look for CPU level
case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
- PA7*) echo hppa1.1-unknown-linux-${LIBC} ;;
- PA8*) echo hppa2.0-unknown-linux-${LIBC} ;;
- *) echo hppa-unknown-linux-${LIBC} ;;
+ PA7*) echo hppa1.1-unknown-linux-gnu ;;
+ PA8*) echo hppa2.0-unknown-linux-gnu ;;
+ *) echo hppa-unknown-linux-gnu ;;
esac
exit ;;
ppc64:Linux:*:*)
- echo powerpc64-unknown-linux-${LIBC}
+ echo powerpc64-unknown-linux-gnu
exit ;;
ppc:Linux:*:*)
- echo powerpc-unknown-linux-${LIBC}
- exit ;;
- ppc64le:Linux:*:*)
- echo powerpc64le-unknown-linux-${LIBC}
- exit ;;
- ppcle:Linux:*:*)
- echo powerpcle-unknown-linux-${LIBC}
+ echo powerpc-unknown-linux-gnu
exit ;;
s390:Linux:*:* | s390x:Linux:*:*)
- echo ${UNAME_MACHINE}-ibm-linux-${LIBC}
+ echo ${UNAME_MACHINE}-ibm-linux
exit ;;
sh64*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
sh*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
sparc:Linux:*:* | sparc64:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
tile*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
vax:Linux:*:*)
- echo ${UNAME_MACHINE}-dec-linux-${LIBC}
+ echo ${UNAME_MACHINE}-dec-linux-gnu
exit ;;
x86_64:Linux:*:*)
- echo ${UNAME_MACHINE}-pc-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
xtensa*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
i*86:DYNIX/ptx:4*:*)
# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
@@ -1127,7 +1080,7 @@ EOF
# uname -m prints for DJGPP always 'pc', but it prints nothing about
# the processor, so we play safe by assuming i586.
# Note: whatever this is, it MUST be the same as what config.sub
- # prints for the "djgpp" host, or else GDB configure will decide that
+ # prints for the "djgpp" host, or else GDB configury will decide that
# this is a cross-build.
echo i586-pc-msdosdjgpp
exit ;;
@@ -1276,9 +1229,6 @@ EOF
SX-8R:SUPER-UX:*:*)
echo sx8r-nec-superux${UNAME_RELEASE}
exit ;;
- SX-ACE:SUPER-UX:*:*)
- echo sxace-nec-superux${UNAME_RELEASE}
- exit ;;
Power*:Rhapsody:*:*)
echo powerpc-apple-rhapsody${UNAME_RELEASE}
exit ;;
@@ -1287,36 +1237,24 @@ EOF
exit ;;
*:Darwin:*:*)
UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
- eval $set_cc_for_build
- if test "$UNAME_PROCESSOR" = unknown ; then
- UNAME_PROCESSOR=powerpc
- fi
- if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then
- if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
- if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
- (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
- grep IS_64BIT_ARCH >/dev/null
- then
- case $UNAME_PROCESSOR in
- i386) UNAME_PROCESSOR=x86_64 ;;
- powerpc) UNAME_PROCESSOR=powerpc64 ;;
- esac
- fi
- fi
- elif test "$UNAME_PROCESSOR" = i386 ; then
- # Avoid executing cc on OS X 10.9, as it ships with a stub
- # that puts up a graphical alert prompting to install
- # developer tools. Any system running Mac OS X 10.7 or
- # later (Darwin 11 and later) is required to have a 64-bit
- # processor. This is not true of the ARM version of Darwin
- # that Apple uses in portable devices.
- UNAME_PROCESSOR=x86_64
- fi
+ case $UNAME_PROCESSOR in
+ i386)
+ eval $set_cc_for_build
+ if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+ if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+ grep IS_64BIT_ARCH >/dev/null
+ then
+ UNAME_PROCESSOR="x86_64"
+ fi
+ fi ;;
+ unknown) UNAME_PROCESSOR=powerpc ;;
+ esac
echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
exit ;;
*:procnto*:*:* | *:QNX:[0123456789]*:*)
UNAME_PROCESSOR=`uname -p`
- if test "$UNAME_PROCESSOR" = x86; then
+ if test "$UNAME_PROCESSOR" = "x86"; then
UNAME_PROCESSOR=i386
UNAME_MACHINE=pc
fi
@@ -1347,7 +1285,7 @@ EOF
# "uname -m" is not consistent, so use $cputype instead. 386
# is converted to i386 for consistency with other x86
# operating systems.
- if test "$cputype" = 386; then
+ if test "$cputype" = "386"; then
UNAME_MACHINE=i386
else
UNAME_MACHINE="$cputype"
@@ -1389,7 +1327,7 @@ EOF
echo i386-pc-xenix
exit ;;
i*86:skyos:*:*)
- echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE} | sed -e 's/ .*$//'`
+ echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
exit ;;
i*86:rdos:*:*)
echo ${UNAME_MACHINE}-pc-rdos
@@ -1400,11 +1338,156 @@ EOF
x86_64:VMkernel:*:*)
echo ${UNAME_MACHINE}-unknown-esx
exit ;;
- amd64:Isilon\ OneFS:*:*)
- echo x86_64-unknown-onefs
- exit ;;
esac
+eval $set_cc_for_build
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+ /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed,
+ I don't know.... */
+ printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+ printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+ "4"
+#else
+ ""
+#endif
+ ); exit (0);
+#endif
+#endif
+
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+ printf ("arm-acorn-riscix\n"); exit (0);
+#endif
+
+#if defined (hp300) && !defined (hpux)
+ printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+ int version;
+ version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+ if (version < 4)
+ printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+ else
+ printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+ exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+ printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+ printf ("ns32k-encore-mach\n"); exit (0);
+#else
+ printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+ printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+ printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+ printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+ struct utsname un;
+
+ uname(&un);
+
+ if (strncmp(un.version, "V2", 2) == 0) {
+ printf ("i386-sequent-ptx2\n"); exit (0);
+ }
+ if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+ printf ("i386-sequent-ptx1\n"); exit (0);
+ }
+ printf ("i386-sequent-ptx\n"); exit (0);
+
+#endif
+
+#if defined (vax)
+# if !defined (ultrix)
+# include <sys/param.h>
+# if defined (BSD)
+# if BSD == 43
+ printf ("vax-dec-bsd4.3\n"); exit (0);
+# else
+# if BSD == 199006
+ printf ("vax-dec-bsd4.3reno\n"); exit (0);
+# else
+ printf ("vax-dec-bsd\n"); exit (0);
+# endif
+# endif
+# else
+ printf ("vax-dec-bsd\n"); exit (0);
+# endif
+# else
+ printf ("vax-dec-ultrix\n"); exit (0);
+# endif
+#endif
+
+#if defined (alliant) && defined (i860)
+ printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+ exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
+ { echo "$SYSTEM_NAME"; exit; }
+
+# Apollos put the system type in the environment.
+
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
+
+# Convex versions that predate uname can use getsysinfo(1)
+
+if [ -x /usr/convex/getsysinfo ]
+then
+ case `getsysinfo -f cpu_type` in
+ c1*)
+ echo c1-convex-bsd
+ exit ;;
+ c2*)
+ if getsysinfo -f scalar_acc
+ then echo c32-convex-bsd
+ else echo c2-convex-bsd
+ fi
+ exit ;;
+ c34*)
+ echo c34-convex-bsd
+ exit ;;
+ c38*)
+ echo c38-convex-bsd
+ exit ;;
+ c4*)
+ echo c4-convex-bsd
+ exit ;;
+ esac
+fi
+
cat >&2 <<EOF
$0: unable to guess system type
@@ -1412,9 +1495,9 @@ This script, last modified $timestamp, has failed to recognize
the operating system you are using. It is advised that you
download the most up to date version of the config scripts from
- http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
+ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
and
- http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
If the version you run ($0) is already up to date, please
send the following data and any information you think might be
diff --git a/lib/ffts/config.sub b/lib/ffts/config.sub
index 8d39c4b..bdda9e4 100755
--- a/lib/ffts/config.sub
+++ b/lib/ffts/config.sub
@@ -1,18 +1,24 @@
#! /bin/sh
# Configuration validation subroutine script.
-# Copyright 1992-2016 Free Software Foundation, Inc.
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+# 2011, 2012 Free Software Foundation, Inc.
-timestamp='2016-03-30'
+timestamp='2012-08-18'
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
+# This file is (in principle) common to ALL GNU software.
+# The presence of a machine in this file suggests that SOME GNU software
+# can handle that machine. It does not imply ALL GNU software can.
+#
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
@@ -20,12 +26,11 @@ timestamp='2016-03-30'
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that
-# program. This Exception is an additional permission under section 7
-# of the GNU General Public License, version 3 ("GPLv3").
+# the same distribution terms that you use for the rest of that program.
-# Please send patches to <config-patches@gnu.org>.
+# Please send patches to <config-patches@gnu.org>. Submit a context
+# diff and a properly formatted GNU ChangeLog entry.
#
# Configuration subroutine to validate and canonicalize a configuration type.
# Supply the specified configuration type as an argument.
@@ -33,7 +38,7 @@ timestamp='2016-03-30'
# Otherwise, we print the canonical config type on stdout and succeed.
# You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
# This file is supposed to be the same for all GNU packages
# and recognize all the CPU types, system types and aliases
@@ -53,7 +58,8 @@ timestamp='2016-03-30'
me=`echo "$0" | sed -e 's,.*/,,'`
usage="\
-Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
+Usage: $0 [OPTION] CPU-MFR-OPSYS
+ $0 [OPTION] ALIAS
Canonicalize a configuration name.
@@ -67,7 +73,9 @@ Report bugs and patches to <config-patches@gnu.org>."
version="\
GNU config.sub ($timestamp)
-Copyright 1992-2016 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -116,7 +124,7 @@ maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
case $maybe_os in
nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
- knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \
+ knetbsd*-gnu* | netbsd*-gnu* | \
kopensolaris*-gnu* | \
storm-chaos* | os2-emx* | rtmk-nova*)
os=-$maybe_os
@@ -148,7 +156,7 @@ case $os in
-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
- -apple | -axis | -knuth | -cray | -microblaze*)
+ -apple | -axis | -knuth | -cray | -microblaze)
os=
basic_machine=$1
;;
@@ -251,25 +259,21 @@ case $basic_machine in
| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
| am33_2.0 \
- | arc | arceb \
- | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
- | avr | avr32 \
- | ba \
- | be32 | be64 \
+ | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
+ | be32 | be64 \
| bfin \
- | c4x | c8051 | clipper \
+ | c4x | clipper \
| d10v | d30v | dlx | dsp16xx \
- | e2k | epiphany \
- | fido | fr30 | frv | ft32 \
+ | epiphany \
+ | fido | fr30 | frv \
| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
| hexagon \
| i370 | i860 | i960 | ia64 \
| ip2k | iq2000 \
- | k1om \
| le32 | le64 \
| lm32 \
| m32c | m32r | m32rle | m68000 | m68k | m88k \
- | maxq | mb | microblaze | microblazeel | mcore | mep | metag \
+ | maxq | mb | microblaze | mcore | mep | metag \
| mips | mipsbe | mipseb | mipsel | mipsle \
| mips16 \
| mips64 | mips64el \
@@ -283,29 +287,26 @@ case $basic_machine in
| mips64vr5900 | mips64vr5900el \
| mipsisa32 | mipsisa32el \
| mipsisa32r2 | mipsisa32r2el \
- | mipsisa32r6 | mipsisa32r6el \
| mipsisa64 | mipsisa64el \
| mipsisa64r2 | mipsisa64r2el \
- | mipsisa64r6 | mipsisa64r6el \
| mipsisa64sb1 | mipsisa64sb1el \
| mipsisa64sr71k | mipsisa64sr71kel \
- | mipsr5900 | mipsr5900el \
| mipstx39 | mipstx39el \
| mn10200 | mn10300 \
| moxie \
| mt \
| msp430 \
| nds32 | nds32le | nds32be \
- | nios | nios2 | nios2eb | nios2el \
+ | nios | nios2 \
| ns16k | ns32k \
- | open8 | or1k | or1knd | or32 \
+ | open8 \
+ | or32 \
| pdp10 | pdp11 | pj | pjl \
| powerpc | powerpc64 | powerpc64le | powerpcle \
| pyramid \
- | riscv32 | riscv64 \
| rl78 | rx \
| score \
- | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+ | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
| sh64 | sh64le \
| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
@@ -313,7 +314,6 @@ case $basic_machine in
| tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
| ubicom32 \
| v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
- | visium \
| we32k \
| x86 | xc16x | xstormy16 | xtensa \
| z8k | z80)
@@ -328,10 +328,7 @@ case $basic_machine in
c6x)
basic_machine=tic6x-unknown
;;
- leon|leon[3-9])
- basic_machine=sparc-$basic_machine
- ;;
- m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip)
+ m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip)
basic_machine=$basic_machine-unknown
os=-none
;;
@@ -373,29 +370,26 @@ case $basic_machine in
| aarch64-* | aarch64_be-* \
| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
- | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \
+ | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
| arm-* | armbe-* | armle-* | armeb-* | armv*-* \
| avr-* | avr32-* \
- | ba-* \
| be32-* | be64-* \
| bfin-* | bs2000-* \
| c[123]* | c30-* | [cjt]90-* | c4x-* \
- | c8051-* | clipper-* | craynv-* | cydra-* \
+ | clipper-* | craynv-* | cydra-* \
| d10v-* | d30v-* | dlx-* \
- | e2k-* | elxsi-* \
+ | elxsi-* \
| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
| h8300-* | h8500-* \
| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
| hexagon-* \
| i*86-* | i860-* | i960-* | ia64-* \
| ip2k-* | iq2000-* \
- | k1om-* \
| le32-* | le64-* \
| lm32-* \
| m32c-* | m32r-* | m32rle-* \
| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
- | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \
- | microblaze-* | microblazeel-* \
+ | m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
| mips16-* \
| mips64-* | mips64el-* \
@@ -409,33 +403,28 @@ case $basic_machine in
| mips64vr5900-* | mips64vr5900el-* \
| mipsisa32-* | mipsisa32el-* \
| mipsisa32r2-* | mipsisa32r2el-* \
- | mipsisa32r6-* | mipsisa32r6el-* \
| mipsisa64-* | mipsisa64el-* \
| mipsisa64r2-* | mipsisa64r2el-* \
- | mipsisa64r6-* | mipsisa64r6el-* \
| mipsisa64sb1-* | mipsisa64sb1el-* \
| mipsisa64sr71k-* | mipsisa64sr71kel-* \
- | mipsr5900-* | mipsr5900el-* \
| mipstx39-* | mipstx39el-* \
| mmix-* \
| mt-* \
| msp430-* \
| nds32-* | nds32le-* | nds32be-* \
- | nios-* | nios2-* | nios2eb-* | nios2el-* \
+ | nios-* | nios2-* \
| none-* | np1-* | ns16k-* | ns32k-* \
| open8-* \
- | or1k*-* \
| orion-* \
| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
| pyramid-* \
- | riscv32-* | riscv64-* \
| rl78-* | romp-* | rs6000-* | rx-* \
| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
| sparclite-* \
- | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \
+ | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
| tahoe-* \
| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
| tile*-* \
@@ -443,7 +432,6 @@ case $basic_machine in
| ubicom32-* \
| v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
| vax-* \
- | visium-* \
| we32k-* \
| x86-* | x86_64-* | xc16x-* | xps100-* \
| xstormy16-* | xtensa*-* \
@@ -520,9 +508,6 @@ case $basic_machine in
basic_machine=i386-pc
os=-aros
;;
- asmjs)
- basic_machine=asmjs-unknown
- ;;
aux)
basic_machine=m68k-apple
os=-aux
@@ -784,9 +769,6 @@ case $basic_machine in
basic_machine=m68k-isi
os=-sysv
;;
- leon-*|leon[3-9]-*)
- basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'`
- ;;
m68knommu)
basic_machine=m68k-unknown
os=-linux
@@ -806,7 +788,7 @@ case $basic_machine in
basic_machine=ns32k-utek
os=-sysv
;;
- microblaze*)
+ microblaze)
basic_machine=microblaze-xilinx
;;
mingw64)
@@ -814,7 +796,7 @@ case $basic_machine in
os=-mingw64
;;
mingw32)
- basic_machine=i686-pc
+ basic_machine=i386-pc
os=-mingw32
;;
mingw32ce)
@@ -842,10 +824,6 @@ case $basic_machine in
basic_machine=powerpc-unknown
os=-morphos
;;
- moxiebox)
- basic_machine=moxie-unknown
- os=-moxiebox
- ;;
msdos)
basic_machine=i386-pc
os=-msdos
@@ -854,7 +832,7 @@ case $basic_machine in
basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
;;
msys)
- basic_machine=i686-pc
+ basic_machine=i386-pc
os=-msys
;;
mvs)
@@ -1045,11 +1023,7 @@ case $basic_machine in
basic_machine=i586-unknown
os=-pw32
;;
- rdos | rdos64)
- basic_machine=x86_64-pc
- os=-rdos
- ;;
- rdos32)
+ rdos)
basic_machine=i386-pc
os=-rdos
;;
@@ -1376,13 +1350,13 @@ case $os in
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
| -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
| -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
- | -sym* | -kopensolaris* | -plan9* \
+ | -sym* | -kopensolaris* \
| -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
- | -aos* | -aros* | -cloudabi* | -sortix* \
+ | -aos* | -aros* \
| -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
| -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
| -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
- | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \
+ | -bitrig* | -openbsd* | -solidbsd* \
| -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
| -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
| -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
@@ -1391,15 +1365,14 @@ case $os in
| -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
| -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
| -linux-newlib* | -linux-musl* | -linux-uclibc* \
- | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \
+ | -uxpv* | -beos* | -mpeix* | -udk* \
| -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
| -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
| -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
| -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
| -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
| -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
- | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \
- | -onefs* | -tirtos*)
+ | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
# Remember, each alternative MUST END IN *, to match a version number.
;;
-qnx*)
@@ -1523,6 +1496,9 @@ case $os in
-aros*)
os=-aros
;;
+ -kaos*)
+ os=-kaos
+ ;;
-zvmoe)
os=-zvmoe
;;
@@ -1531,8 +1507,6 @@ case $os in
;;
-nacl*)
;;
- -ios)
- ;;
-none)
;;
*)
@@ -1573,9 +1547,6 @@ case $basic_machine in
c4x-* | tic4x-*)
os=-coff
;;
- c8051-*)
- os=-elf
- ;;
hexagon-*)
os=-elf
;;
diff --git a/lib/ffts/ffts.pc.cmake.in b/lib/ffts/ffts.pc.cmake.in
index 43f38e9..63d4cc0 100644
--- a/lib/ffts/ffts.pc.cmake.in
+++ b/lib/ffts/ffts.pc.cmake.in
@@ -1,7 +1,7 @@
prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${exec_prefix}
-libdir=${libdir}
-includedir=${includedir}
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
Name: @CMAKE_PROJECT_NAME@
Description: fast Fourier transform library
diff --git a/lib/ffts/include/ffts.h b/lib/ffts/include/ffts.h
index cc85a88..b13316f 100644
--- a/lib/ffts/include/ffts.h
+++ b/lib/ffts/include/ffts.h
@@ -3,6 +3,7 @@
This file is part of FFTS.
Copyright (c) 2012, Anthony M. Blake
+ Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -76,6 +77,9 @@ FFTS_API ffts_plan_t*
ffts_init_1d(size_t N, int sign);
FFTS_API ffts_plan_t*
+ffts_init_1d_64f(size_t N, int sign);
+
+FFTS_API ffts_plan_t*
ffts_init_2d(size_t N1, size_t N2, int sign);
FFTS_API ffts_plan_t*
diff --git a/lib/ffts/src/Makefile.am b/lib/ffts/src/Makefile.am
index 28c7879..ff6b0cc 100644
--- a/lib/ffts/src/Makefile.am
+++ b/lib/ffts/src/Makefile.am
@@ -2,7 +2,7 @@
lib_LTLIBRARIES = libffts.la
-libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c ffts_transpose.c ffts_trig.c ffts_static.c
+libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c ffts_transpose.c ffts_trig.c ffts_static.c ffts_chirp_z.c
libffts_la_SOURCES += codegen.h codegen_arm.h codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h macros-neon.h macros-sse.h macros.h neon.h neon_float.h patterns.h types.h vfp.h
if DYNAMIC_DISABLED
@@ -14,7 +14,7 @@ endif
libffts_includedir=$(includedir)/ffts
libffts_include_HEADERS = ../include/ffts.h
-AM_CFLAGS = -I$(top_srcdir)/include
+AM_CFLAGS = -I$(top_srcdir)/include -DAUTOTOOLS_BUILD=yes
if HAVE_VFP
libffts_la_SOURCES += vfp.s
diff --git a/lib/ffts/src/codegen.c b/lib/ffts/src/codegen.c
index c4e19e6..0bce616 100644
--- a/lib/ffts/src/codegen.c
+++ b/lib/ffts/src/codegen.c
@@ -139,9 +139,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
#ifdef HAVE_SSE
if (sign < 0) {
- p->constants = sse_constants;
+ p->constants = (const void*) sse_constants;
} else {
- p->constants = sse_constants_inv;
+ p->constants = (const void*) sse_constants_inv;
}
#endif
diff --git a/lib/ffts/src/codegen_sse.h b/lib/ffts/src/codegen_sse.h
index e9819f1..2ca540e 100644
--- a/lib/ffts/src/codegen_sse.h
+++ b/lib/ffts/src/codegen_sse.h
@@ -488,7 +488,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[0], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[2], X64_RAX, 2);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[3], X64_RAX, 2);
@@ -507,14 +507,14 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[6], X64_RAX, 2);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[7], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM8);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
@@ -530,7 +530,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0 ? 8 : 0);
extend--;
x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
@@ -538,10 +538,10 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM8);
- x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+ x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
- x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
+ x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
@@ -551,7 +551,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
@@ -580,7 +580,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
@@ -588,7 +588,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0 ? 8 : 0);
extend--;
x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
@@ -620,7 +620,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[0], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RSI, offsets[2], X64_RAX, 2);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[3], X64_RAX, 2);
@@ -640,14 +640,14 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
x64_sse_movaps_reg_memindex(ins, X64_XMM3, X64_RSI, offsets[6], X64_RAX, 2);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[7], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM3);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_movsxd_reg_memindex(ins, X64_R11, X64_R8, 0, X64_RAX, 2);
@@ -663,7 +663,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0 ? 8 : 0);
extend--;
x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
@@ -671,10 +671,10 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM3);
- x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+ x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
- x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
+ x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
@@ -684,7 +684,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM3, 0xB1);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
@@ -713,7 +713,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
@@ -721,7 +721,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0 ? 8 : 0);
extend--;
x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
@@ -1157,28 +1157,28 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[0], X64_RAX, 2);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[2], X64_RAX, 2);
- x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+ x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
- x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+ x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RDX, offsets[4], X64_RAX, 2);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RDX, offsets[5], X64_RAX, 2);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[6], X64_RAX, 2);
@@ -1206,7 +1206,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
@@ -1218,7 +1218,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
- x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
+ x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
@@ -1257,28 +1257,28 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RSI, offsets[0], X64_RAX, 2);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[1], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[2], X64_RAX, 2);
- x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+ x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
- x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+ x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RSI, offsets[3], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RSI, offsets[4], X64_RAX, 2);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RSI, offsets[5], X64_RAX, 2);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[6], X64_RAX, 2);
@@ -1306,7 +1306,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
- x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_movsxd_reg_memindex(ins, X64_R12, X64_R8, 8, X64_RAX, 2);
@@ -1318,7 +1318,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
- x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
+ x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
diff --git a/lib/ffts/src/ffts.c b/lib/ffts/src/ffts.c
index 7fa675a..35c5cad 100644
--- a/lib/ffts/src/ffts.c
+++ b/lib/ffts/src/ffts.c
@@ -34,6 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ffts.h"
#include "ffts_internal.h"
+#include "ffts_chirp_z.h"
#include "ffts_static.h"
#include "ffts_trig.h"
#include "macros.h"
@@ -76,7 +77,8 @@ static const FFTS_ALIGN(64) float w_data[16] = {
};
#endif
-static FFTS_INLINE int ffts_allow_execute(void *start, size_t len)
+static FFTS_INLINE int
+ffts_allow_execute(void *start, size_t len)
{
int result;
@@ -90,7 +92,8 @@ static FFTS_INLINE int ffts_allow_execute(void *start, size_t len)
return result;
}
-static FFTS_INLINE int ffts_deny_execute(void *start, size_t len)
+static FFTS_INLINE int
+ffts_deny_execute(void *start, size_t len)
{
int result;
@@ -104,7 +107,8 @@ static FFTS_INLINE int ffts_deny_execute(void *start, size_t len)
return result;
}
-static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length)
+static FFTS_INLINE int
+ffts_flush_instruction_cache(void *start, size_t length)
{
#ifdef _WIN32
return !FlushInstructionCache(GetCurrentProcess(), start, length);
@@ -124,7 +128,8 @@ static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length)
#endif
}
-static FFTS_INLINE void *ffts_vmem_alloc(size_t length)
+static FFTS_INLINE void*
+ffts_vmem_alloc(size_t length)
{
#if __APPLE__
return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
@@ -139,7 +144,8 @@ static FFTS_INLINE void *ffts_vmem_alloc(size_t length)
#endif
}
-static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length)
+static FFTS_INLINE void
+ffts_vmem_free(void *addr, size_t length)
{
#ifdef _WIN32
(void) length;
@@ -174,7 +180,8 @@ ffts_free(ffts_plan_t *p)
}
}
-void ffts_free_1d(ffts_plan_t *p)
+static void
+ffts_free_1d(ffts_plan_t *p)
{
#if !defined(DYNAMIC_DISABLED)
if (p->transform_base) {
@@ -188,7 +195,7 @@ void ffts_free_1d(ffts_plan_t *p)
}
if (p->ws) {
- FFTS_FREE(p->ws);
+ ffts_aligned_free(p->ws);
}
if (p->is) {
@@ -233,7 +240,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f);
#endif
- p->ws = FFTS_MALLOC(lut_size, 32);
+ p->ws = ffts_aligned_malloc(lut_size);
if (!p->ws) {
goto cleanup;
}
@@ -253,7 +260,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
/* calculate factors */
m = leaf_N << (n_luts - 2);
- tmp = FFTS_MALLOC(m * sizeof(ffts_cpx_32f), 32);
+ tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_32f));
ffts_generate_cosine_sine_pow2_32f(tmp, m);
@@ -263,7 +270,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
p->ws_is[i] = w - (ffts_cpx_32f*) p->ws;
if (!i) {
- ffts_cpx_32f *w0 = FFTS_MALLOC(n/4 * sizeof(ffts_cpx_32f), 32);
+ ffts_cpx_32f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_32f));
float *fw0 = (float*) w0;
float *fw = (float*) w;
@@ -300,11 +307,11 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
w += n/4 * 2;
#endif
- FFTS_FREE(w0);
+ ffts_aligned_free(w0);
} else {
- ffts_cpx_32f *w0 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
- ffts_cpx_32f *w1 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
- ffts_cpx_32f *w2 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
+ ffts_cpx_32f *w0 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
+ ffts_cpx_32f *w1 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
+ ffts_cpx_32f *w2 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
float *fw0 = (float*) w0;
float *fw1 = (float*) w1;
@@ -380,9 +387,9 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
w += n/8 * 3 * 2;
#endif
- FFTS_FREE(w0);
- FFTS_FREE(w1);
- FFTS_FREE(w2);
+ ffts_aligned_free(w0);
+ ffts_aligned_free(w1);
+ ffts_aligned_free(w2);
}
n *= 2;
@@ -401,7 +408,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
}
#endif
- FFTS_FREE(tmp);
+ ffts_aligned_free(tmp);
p->lastlut = w;
p->n_luts = n_luts;
@@ -411,18 +418,166 @@ cleanup:
return -1;
}
+#ifdef FFTS_DOUBLE
+/*
+ * Double precision counterpart of ffts_generate_luts(): fills the plan with
+ * the twiddle factor lookup tables read by the static 64-bit transforms.
+ *
+ * p      - plan to fill; p->ws, p->ws_is, p->lastlut and p->n_luts are written
+ * N      - transform size
+ * leaf_N - leaf size; the number of table levels is ctz(N / leaf_N)
+ * sign   - negative selects one sign mask for the imaginary parts,
+ *          anything else selects the opposite one
+ *
+ * Returns 0 on success and -1 when an allocation fails. Buffers already
+ * attached to the plan (p->ws, p->ws_is) are not released here.
+ * NOTE(review): presumably the caller's plan destructor frees them -- confirm.
+ * NOTE(review): the scratch table size "leaf_N << (n_luts - 2)" assumes at
+ * least two table levels -- confirm every caller guarantees that.
+ */
+static int
+ffts_generate_luts_64f(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
+{
+    V4DF MULI_SIGN;
+    size_t n_luts;
+    ffts_cpx_64f *w;
+    ffts_cpx_64f *tmp;
+    size_t i, j, m, n;
+    int stride;
+
+    /* mask XORed into the duplicated imaginary parts to flip their sign */
+    if (sign < 0) {
+        MULI_SIGN = V4DF_LIT4(-0.0, 0.0, -0.0, 0.0);
+    } else {
+        MULI_SIGN = V4DF_LIT4(0.0, -0.0, 0.0, -0.0);
+    }
+
+    /* LUTS */
+    n_luts = ffts_ctzl(N / leaf_N);
+    if (n_luts >= 32) {
+        n_luts = 0;
+    }
+
+    if (n_luts) {
+        size_t lut_size;
+
+        lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_64f);
+
+        p->ws = ffts_aligned_malloc(lut_size);
+        if (!p->ws) {
+            goto cleanup;
+        }
+
+        p->ws_is = (size_t*) malloc(n_luts * sizeof(*p->ws_is));
+        if (!p->ws_is) {
+            goto cleanup;
+        }
+    }
+
+    w = p->ws;
+    n = leaf_N * 2;
+
+    /* calculate factors */
+    m = leaf_N << (n_luts - 2);
+    tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_64f));
+    if (!tmp) {
+        /* the generator below writes through tmp unconditionally */
+        goto cleanup;
+    }
+
+    ffts_generate_cosine_sine_pow2_64f(tmp, m);
+
+    /* generate lookup tables */
+    stride = 1 << (n_luts - 1);
+    for (i = 0; i < n_luts; i++) {
+        /* offset of this level's table, counted in complex elements */
+        p->ws_is[i] = w - (ffts_cpx_64f*) p->ws;
+
+        if (!i) {
+            ffts_cpx_64f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_64f));
+            double *fw0 = (double*) w0;
+            double *fw = (double*) w;
+
+            if (!w0) {
+                ffts_aligned_free(tmp);
+                goto cleanup;
+            }
+
+            /* gather every stride-th factor for the first level */
+            for (j = 0; j < n/4; j++) {
+                w0[j][0] = tmp[j * stride][0];
+                w0[j][1] = tmp[j * stride][1];
+            }
+
+            /* store duplicated real parts followed by sign-adjusted
+               duplicated imaginary parts, two complex values at a time */
+            for (j = 0; j < n/4; j += 2) {
+                V4DF re, im, temp0;
+                temp0 = V4DF_LD(fw0 + j*2);
+                re = V4DF_DUPLICATE_RE(temp0);
+                im = V4DF_DUPLICATE_IM(temp0);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*4 + 0, re);
+                V4DF_ST(fw + j*4 + 4, im);
+            }
+
+            w += n/4 * 2;
+            ffts_aligned_free(w0);
+        } else {
+            ffts_cpx_64f *w0 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
+            ffts_cpx_64f *w1 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
+            ffts_cpx_64f *w2 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
+
+            double *fw0 = (double*) w0;
+            double *fw1 = (double*) w1;
+            double *fw2 = (double*) w2;
+
+            double *fw = (double*)w;
+
+            if (!w0 || !w1 || !w2) {
+                /* release whatever part of the scratch set was obtained */
+                if (w0) {
+                    ffts_aligned_free(w0);
+                }
+                if (w1) {
+                    ffts_aligned_free(w1);
+                }
+                if (w2) {
+                    ffts_aligned_free(w2);
+                }
+                ffts_aligned_free(tmp);
+                goto cleanup;
+            }
+
+            /* gather the three factor sequences used by this level */
+            for (j = 0; j < n/8; j++) {
+                w0[j][0] = tmp[2 * j * stride][0];
+                w0[j][1] = tmp[2 * j * stride][1];
+
+                w1[j][0] = tmp[j * stride][0];
+                w1[j][1] = tmp[j * stride][1];
+
+                w2[j][0] = tmp[(j + (n/8)) * stride][0];
+                w2[j][1] = tmp[(j + (n/8)) * stride][1];
+            }
+
+            /* interleave the three sequences as re/im vector pairs */
+            for (j = 0; j < n/8; j += 2) {
+                V4DF temp0, temp1, temp2, re, im;
+
+                temp0 = V4DF_LD(fw0 + j*2);
+                re = V4DF_DUPLICATE_RE(temp0);
+                im = V4DF_DUPLICATE_IM(temp0);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*2*6+0, re);
+                V4DF_ST(fw + j*2*6+4, im);
+
+                temp1 = V4DF_LD(fw1 + j*2);
+                re = V4DF_DUPLICATE_RE(temp1);
+                im = V4DF_DUPLICATE_IM(temp1);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*2*6+8 , re);
+                V4DF_ST(fw + j*2*6+12, im);
+
+                temp2 = V4DF_LD(fw2 + j*2);
+                re = V4DF_DUPLICATE_RE(temp2);
+                im = V4DF_DUPLICATE_IM(temp2);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*2*6+16, re);
+                V4DF_ST(fw + j*2*6+20, im);
+            }
+
+            w += n/8 * 3 * 2;
+            ffts_aligned_free(w0);
+            ffts_aligned_free(w1);
+            ffts_aligned_free(w2);
+        }
+
+        n *= 2;
+        stride >>= 1;
+    }
+
+    ffts_aligned_free(tmp);
+
+    p->lastlut = w;
+    p->n_luts = n_luts;
+    return 0;
+
+cleanup:
+    return -1;
+}
+#endif
+
FFTS_API ffts_plan_t*
ffts_init_1d(size_t N, int sign)
{
const size_t leaf_N = 8;
ffts_plan_t *p;
- if (N < 2 || (N & (N - 1)) != 0) {
- LOG("FFT size must be a power of two\n");
+ if (N < 2) {
+ LOG("FFT size must be greater than 1");
return NULL;
}
- p = calloc(1, sizeof(*p));
+ /* check if size is not a power of two */
+ if (N & (N - 1)) {
+ return ffts_chirp_z_init(N, sign);
+ }
+
+ p = (ffts_plan_t*) calloc(1, sizeof(*p));
if (!p) {
return NULL;
}
@@ -537,3 +692,98 @@ cleanup:
ffts_free_1d(p);
return NULL;
}
+
+#ifdef FFTS_DOUBLE
+/*
+ * Create a plan for a one-dimensional double precision complex FFT.
+ *
+ * N    - transform size; must be a power of two and at least 2
+ * sign - negative selects the forward transform, positive the inverse
+ *
+ * Returns the new plan, or NULL if N is not supported or an allocation
+ * fails. The plan's destroy callback is ffts_free_1d.
+ */
+FFTS_API ffts_plan_t*
+ffts_init_1d_64f(size_t N, int sign)
+{
+    const size_t leaf_N = 8;
+    ffts_plan_t *p;
+
+    if (N < 2) {
+        LOG("FFT size must be greater than 1");
+        return NULL;
+    }
+
+    /* Unlike the single precision path there is no chirp-z fallback for
+       doubles, and both the lookup table generation and the small-size
+       kernels below are only valid for power-of-two sizes. */
+    if (N & (N - 1)) {
+        LOG("FFT size must be a power of two");
+        return NULL;
+    }
+
+    p = (ffts_plan_t*) calloc(1, sizeof(*p));
+    if (!p) {
+        return NULL;
+    }
+
+    p->destroy = ffts_free_1d;
+    p->N = N;
+
+    if (N >= 32) {
+        /* generate lookup tables */
+        if (ffts_generate_luts_64f(p, N, leaf_N, sign)) {
+            goto cleanup;
+        }
+
+        p->offsets = ffts_init_offsets(N, leaf_N);
+        if (!p->offsets) {
+            goto cleanup;
+        }
+
+        p->is = ffts_init_is(N, leaf_N, 1);
+        if (!p->is) {
+            goto cleanup;
+        }
+
+        /* split the N/leaf_N leaves into three groups */
+        p->i0 = N/leaf_N/3 + 1;
+        p->i1 = p->i2 = N/leaf_N/3;
+        if ((N/leaf_N) % 3 > 1) {
+            p->i1++;
+        }
+
+        p->i0 /= 2;
+        p->i1 /= 2;
+
+        if (sign < 0) {
+            p->transform = ffts_static_transform_f_64f;
+        } else {
+            p->transform = ffts_static_transform_i_64f;
+        }
+    } else {
+        /* sizes below 32 use dedicated fixed-size kernels */
+        switch (N) {
+        case 2:
+            p->transform = &ffts_small_2_64f;
+            break;
+        case 4:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward4_64f;
+            } else if (sign == 1) {
+                p->transform = &ffts_small_backward4_64f;
+            }
+            break;
+        case 8:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward8_64f;
+            } else if (sign == 1) {
+                p->transform = &ffts_small_backward8_64f;
+            }
+            break;
+        case 16:
+        default:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward16_64f;
+            } else {
+                p->transform = &ffts_small_backward16_64f;
+            }
+            break;
+        }
+    }
+
+    return p;
+
+cleanup:
+    ffts_free_1d(p);
+    return NULL;
+}
+#else
+/* Variant compiled when FFTS_DOUBLE is not defined: double precision plans
+   are unavailable, so this ignores its arguments and always returns NULL. */
+FFTS_API ffts_plan_t*
+ffts_init_1d_64f(size_t N, int sign)
+{
+ /* disabled */
+ return NULL;
+}
+#endif \ No newline at end of file
diff --git a/lib/ffts/src/ffts_chirp_z.c b/lib/ffts/src/ffts_chirp_z.c
new file mode 100644
index 0000000..e463a55
--- /dev/null
+++ b/lib/ffts/src/ffts_chirp_z.c
@@ -0,0 +1,225 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_chirp_z.h"
+
+#include "ffts_internal.h"
+#include "ffts_trig.h"
+
+/*
+* For more information on algorithms:
+*
+* L. I. Bluestein, A linear filtering approach to the computation of
+* the discrete Fourier transform, 1968 NEREM Rec., pp. 218-219
+*
+* Lawrence R. Rabiner, Ronald W. Schafer, Charles M. Rader,
+* The Chirp z-Transform Algorithm and Its Application
+* Bell Sys. Tech. J., vol. 48, pp. 1249-1292, May 1969.
+*
+* Rick Lyons, Four Ways to Compute an Inverse FFT Using the Forward FFT Algorithm
+* https://www.dsprelated.com/showarticle/800.php, July 7, 2015
+*/
+
+/* forward declarations */
+static void
+ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out);
+
+static void
+ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out);
+
+/*
+ * Destroy callback for plans created by ffts_chirp_z_init(): releases the
+ * B, A and work buffers, the power-of-two sub-plan, and the plan itself.
+ * Safe on a partially initialized plan, since unset members are skipped.
+ */
+static void
+ffts_chirp_z_free(ffts_plan_t *p)
+{
+    ffts_plan_t *sub_plan = p->plans[0];
+
+    if (p->B) {
+        ffts_aligned_free(p->B);
+    }
+
+    if (p->A) {
+        ffts_aligned_free(p->A);
+    }
+
+    if (p->buf) {
+        ffts_aligned_free(p->buf);
+    }
+
+    if (sub_plan) {
+        ffts_free(sub_plan);
+    }
+
+    free(p);
+}
+
+/*
+ * Create a single precision plan that computes an FFT of length N through a
+ * convolution carried out with power-of-two FFTs of length M >= 2*N-1.
+ *
+ * N    - transform size (callers are expected to pass N > 2)
+ * sign - negative selects the forward transform, anything else the inverse
+ *
+ * Returns the new plan, or NULL if the sub-plan or any buffer could not be
+ * allocated.
+ */
+ffts_plan_t*
+ffts_chirp_z_init(size_t N, int sign)
+{
+    float *A, *B, reciprocal_M, *tmp;
+    ffts_plan_t *p;
+    size_t i, M;
+
+    FFTS_ASSUME(N > 2);
+
+    /* the one-element sub-plan array lives directly behind the plan */
+    p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
+    if (!p)
+        return NULL;
+
+    p->destroy = ffts_chirp_z_free;
+    p->N = N;
+    p->rank = 1;
+    p->plans = (ffts_plan_t**) &p[1];
+
+    if (sign < 0)
+        p->transform = ffts_chirp_z_transform_f_32f;
+    else
+        p->transform = ffts_chirp_z_transform_i_32f;
+
+    /* determine next power of two such that M >= 2*N-1 */
+    M = ffts_next_power_of_2(2*N-1);
+    p->plans[0] = ffts_init_1d(M, FFTS_FORWARD);
+    if (!p->plans[0])
+        goto cleanup;
+
+    p->A = A = (float*) ffts_aligned_malloc(2 * N * sizeof(float));
+    if (!p->A)
+        goto cleanup;
+
+    p->B = B = (float*) ffts_aligned_malloc(2 * M * sizeof(float));
+    if (!p->B)
+        goto cleanup;
+
+    /* two complex work arrays of length M, also used as scratch below */
+    p->buf = tmp = (float*) ffts_aligned_malloc(2 * 2 * M * sizeof(float));
+    if (!p->buf)
+        goto cleanup;
+
+    ffts_generate_chirp_32f((ffts_cpx_32f*) A, N);
+
+    /* scale with reciprocal of length */
+    reciprocal_M = 1.0f / M;
+    tmp[0] = A[0] * reciprocal_M;
+    tmp[1] = A[1] * reciprocal_M;
+    for (i = 1; i < N; ++i) {
+        /* mirror each value to index M - i */
+        tmp[2 * i + 0] = tmp[2 * (M - i) + 0] = A[2 * i + 0] * reciprocal_M;
+        tmp[2 * i + 1] = tmp[2 * (M - i) + 1] = A[2 * i + 1] * reciprocal_M;
+    }
+
+    /* zero pad */
+    for (; i <= M - N; ++i)
+        tmp[2 * i] = tmp[2 * i + 1] = 0.0f;
+
+    /* FFT */
+    p->plans[0]->transform(p->plans[0], tmp, B);
+    return p;
+
+cleanup:
+    ffts_chirp_z_free(p);
+    return NULL;
+}
+
+/*
+ * Forward transform for chirp-z plans. Reads p->N interleaved complex
+ * floats from in and writes p->N interleaved complex floats to out, using
+ * the sub-plan p->plans[0] and the plan's A, B and work buffers.
+ */
+static void
+ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out)
+{
+    const float *chirp = FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *filter = FFTS_ASSUME_ALIGNED_32(p->B);
+    size_t k, padded = p->plans[0]->N, len = p->N;
+    float *work = (float*) FFTS_ASSUME_ALIGNED_32(p->buf);
+    float *spec = FFTS_ASSUME_ALIGNED_32(&work[2 * padded]);
+    const float *src = (const float*) in;
+    float *dst = (float*) out;
+
+    /* we know this */
+    FFTS_ASSUME(padded >= 8);
+
+    /* input times the conjugate of the chirp sequence */
+    for (k = 0; k < len; k++) {
+        const size_t re = 2 * k, im = re + 1;
+
+        work[re] = src[re] * chirp[re] + src[im] * chirp[im];
+        work[im] = src[im] * chirp[re] - src[re] * chirp[im];
+    }
+
+    /* pad the remainder of the work array with zeros */
+    for (; k < padded; k++) {
+        work[2 * k + 1] = 0.0f;
+        work[2 * k] = 0.0f;
+    }
+
+    /* convolution using FFT */
+    p->plans[0]->transform(p->plans[0], work, spec);
+
+    /* multiply by B; the product is stored with real and imaginary parts
+       swapped so that the forward FFT below performs the inverse */
+    for (k = 0; k < padded; k++) {
+        const size_t re = 2 * k, im = re + 1;
+
+        work[re] = spec[im] * filter[re] + spec[re] * filter[im];
+        work[im] = spec[re] * filter[re] - spec[im] * filter[im];
+    }
+
+    /* IFFT using FFT with real and imaginary parts swapped */
+    p->plans[0]->transform(p->plans[0], work, spec);
+
+    /* undo the swap while multiplying with the conjugated chirp sequence */
+    for (k = 0; k < len; k++) {
+        const size_t re = 2 * k, im = re + 1;
+
+        dst[re] = spec[im] * chirp[re] + spec[re] * chirp[im];
+        dst[im] = spec[re] * chirp[re] - spec[im] * chirp[im];
+    }
+}
+
+/* IFFT using FFT with real and imaginary parts swapped */
+/*
+ * Inverse transform for chirp-z plans. Same data flow as the forward
+ * variant, but the input product is stored with real and imaginary parts
+ * swapped and the output product is formed without conjugation.
+ */
+static void
+ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out)
+{
+    const float *chirp = FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *filter = FFTS_ASSUME_ALIGNED_32(p->B);
+    size_t k, padded = p->plans[0]->N, len = p->N;
+    float *work = (float*) FFTS_ASSUME_ALIGNED_32(p->buf);
+    float *spec = FFTS_ASSUME_ALIGNED_32(&work[2 * padded]);
+    const float *src = (const float*) in;
+    float *dst = (float*) out;
+
+    /* we know this */
+    FFTS_ASSUME(padded >= 8);
+
+    /* input times the chirp sequence, stored with real and imaginary
+       parts swapped */
+    for (k = 0; k < len; k++) {
+        const size_t re = 2 * k, im = re + 1;
+
+        work[re] = src[im] * chirp[re] + src[re] * chirp[im];
+        work[im] = src[re] * chirp[re] - src[im] * chirp[im];
+    }
+
+    /* pad the remainder of the work array with zeros */
+    for (; k < padded; k++) {
+        work[2 * k + 1] = 0.0f;
+        work[2 * k] = 0.0f;
+    }
+
+    /* convolution using FFT */
+    p->plans[0]->transform(p->plans[0], work, spec);
+
+    /* multiply by B, again storing the product with real and imaginary
+       parts swapped */
+    for (k = 0; k < padded; k++) {
+        const size_t re = 2 * k, im = re + 1;
+
+        work[re] = spec[im] * filter[re] + spec[re] * filter[im];
+        work[im] = spec[re] * filter[re] - spec[im] * filter[im];
+    }
+
+    /* IFFT using FFT with real and imaginary parts swapped */
+    p->plans[0]->transform(p->plans[0], work, spec);
+
+    /* plain complex product of the result with the chirp sequence */
+    for (k = 0; k < len; k++) {
+        const size_t re = 2 * k, im = re + 1;
+
+        dst[re] = spec[re] * chirp[re] - spec[im] * chirp[im];
+        dst[im] = spec[im] * chirp[re] + spec[re] * chirp[im];
+    }
+}
diff --git a/lib/ffts/src/ffts_chirp_z.h b/lib/ffts/src/ffts_chirp_z.h
new file mode 100644
index 0000000..2a6aa7b
--- /dev/null
+++ b/lib/ffts/src/ffts_chirp_z.h
@@ -0,0 +1,45 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_CHIRP_Z_H
+#define FFTS_CHIRP_Z_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts.h"
+
+/* Create a single precision chirp-z plan for a transform of size N.
+   A negative sign selects the forward transform, anything else the inverse.
+   Returns NULL when the plan could not be created. */
+ffts_plan_t*
+ffts_chirp_z_init(size_t N, int sign);
+
+#endif /* FFTS_CHIRP_Z_H */
diff --git a/lib/ffts/src/ffts_cpu.c b/lib/ffts/src/ffts_cpu.c
new file mode 100644
index 0000000..daf92c8
--- /dev/null
+++ b/lib/ffts/src/ffts_cpu.c
@@ -0,0 +1,371 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_cpu.h"
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+#include <stdio.h>
+#endif
+
+#if defined(_WIN32)
+#include <intrin.h>
+#include <windows.h>
+#endif
+
+/* TODO: add detection/declaration of these to CMake phase */
+#if !defined(FFTS_CPU_X64)
+#if defined(_M_AMD64) || defined(__amd64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64) || defined(__x86_64__)
+/* 64 bit x86 detected */
+#define FFTS_CPU_X64
+#endif
+#endif
+
+#if !defined(FFTS_CPU_X64) && !defined(FFTS_CPU_X86)
+#if defined(i386) || defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__X86__) || defined(_X86_)
+/* 32 bit x86 detected */
+#define FFTS_CPU_X86
+#endif
+#endif
+
+/* check if build is 32 bit or 64 bit x86 */
+#if defined(FFTS_CPU_X64) || defined(FFTS_CPU_X86)
+
+/* Built and tested on
+CentOS 6.8 2.6.32-642.11.1.el6.x86_64 - gcc version 4.4.7 20120313
+Mac OSX 10.9 - Apple Clang 6.0
+Ubuntu 14.04 LTS 4.2.0-42 x86_64 - gcc version 4.8.4
+Windows XP SP3 - Visual Studio 2005 SP1 x86/x64
+Windows Vista SP2 - Visual Studio 2010 SP1 x86/x64
+Windows 7 Ultimate SP1 - Visual Studio 2015 x86/x64
+Windows 7 Ultimate SP1 - gcc version 4.9.2 (i686-posix-dwarf-rev1)
+Windows 7 Ultimate SP1 - gcc version 4.9.2 (x86_64-posix-seh-rev3)
+Windows 10 Pro - Visual Studio 2017 x86/x64
+*/
+
+/* Visual Studio 2010 SP1 or newer have _xgetbv intrinsic */
+#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219)
+#define FFTS_HAVE_XGETBV
+#endif
+
+#ifndef BIT
+/* argument is parenthesized so that expressions such as BIT(a + b) work */
+#define BIT(n) (1u << (n))
+#endif
+
+/* bit masks */
+/* cpuid leaf 1, edx: FPU, CMOV, MMX, FXSR and SSE */
+#define FFTS_CPU_X86_SSE_BITS     (BIT(0) | BIT(15) | BIT(23) | BIT(24) | BIT(25))
+/* cpuid leaf 1, edx: SSE2 */
+#define FFTS_CPU_X86_SSE2_BITS    (BIT(26))
+/* cpuid leaf 1, ecx: SSE3 */
+#define FFTS_CPU_X86_SSE3_BITS    (BIT(0))
+/* cpuid leaf 1, ecx: SSSE3 */
+#define FFTS_CPU_X86_SSSE3_BITS   (BIT(9))
+/* cpuid leaf 1, ecx: SSE4.1 */
+#define FFTS_CPU_X86_SSE4_1_BITS  (BIT(19))
+/* cpuid leaf 1, ecx: SSE4.2 and POPCNT */
+#define FFTS_CPU_X86_SSE4_2_BITS  (BIT(20) | BIT(23))
+/* cpuid leaf 1, ecx: XSAVE, OSXSAVE and AVX */
+#define FFTS_CPU_X86_AVX_BITS     (BIT(26) | BIT(27) | BIT(28))
+/* XCR0: SSE (XMM) and AVX (YMM) state enabled by the operating system */
+#define FFTS_CPU_X86_XCR0_BITS    (BIT(1) | BIT(2))
+/* cpuid leaf 7 subleaf 0, ebx: AVX2 */
+#define FFTS_CPU_X86_AVX2_BITS    (BIT(5))
+/* cpuid leaf 7 subleaf 0, ebx: AVX512F */
+#define FFTS_CPU_X86_AVX512_BITS  (BIT(16))
+
+/* Visual Studio 2008 or older */
+#if defined(FFTS_CPU_X64) && defined(_MSC_VER) && _MSC_VER <= 1500
+#pragma optimize("", off)
+/* Stand-in for __cpuidex, which the ffts_cpuid() wrapper only calls when
+   _MSC_VER > 1500. Note the unusual argument order: subleaf comes first. */
+static void __fastcall ffts_cpuidex(int subleaf, int regs[4], int leaf)
+{
+ /* x64 uses a four register fast-call calling convention by default and
+ arguments are passed in registers RCX, RDX, R8, and R9. By disabling
+ optimization and passing subleaf as first argument we get __cpuidex
+ */
+ (void) subleaf;
+ __cpuid(regs, leaf);
+}
+#pragma optimize("", on)
+#endif
+
+/* Execute the cpuid instruction with eax = leaf and ecx = subleaf and store
+   the resulting eax, ebx, ecx and edx in regs[0..3]. With a compiler that
+   is neither MSVC nor GNU-compatible all four values are set to zero. */
+static FFTS_INLINE void ffts_cpuid(int regs[4], int leaf, int subleaf)
+{
+#if defined(_MSC_VER)
+#if defined(FFTS_CPU_X64)
+ /* Visual Studio 2010 or newer */
+#if _MSC_VER > 1500
+ __cpuidex(regs, leaf, subleaf);
+#else
+ ffts_cpuidex(subleaf, regs, leaf);
+#endif
+#else
+ /* 32-bit MSVC: inline assembly, results written through esi */
+ __asm {
+ mov eax, leaf
+ mov ecx, subleaf
+ mov esi, regs
+ cpuid
+ mov [esi + 0x0], eax
+ mov [esi + 0x4], ebx
+ mov [esi + 0x8], ecx
+ mov [esi + 0xc], edx
+ }
+#endif
+#elif defined(__GNUC__) && __GNUC__
+#if defined(FFTS_CPU_X64)
+ __asm__ __volatile__(
+ "cpuid\n\t"
+ : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
+ : "a"(leaf), "c"(subleaf));
+#elif defined(__PIC__)
+ /* ebx is swapped with a scratch register around cpuid instead of being
+ named as an output; presumably because ebx is reserved in 32-bit
+ position independent code -- confirm */
+ __asm__ __volatile__(
+ "xchgl %%ebx, %1\n\t"
+ "cpuid \n\t"
+ "xchgl %%ebx, %1\n\t"
+ : "=a"(regs[0]), "=r"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
+ : "a"(leaf), "c"(subleaf));
+#else
+ __asm__ __volatile__(
+ "cpuid\n\t"
+ : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
+ : "a"(leaf), "c"(subleaf));
+#endif
+#else
+ /* unknown compiler for x86 */
+ regs[0] = regs[1] = regs[2] = regs[3] = 0;
+#endif
+}
+
+/* at least Visual Studio 2010 generates invalid code for an optimized _xgetbv */
+#if defined(FFTS_HAVE_XGETBV)
+#pragma optimize("", off)
+#endif
+/* Return the low 32 bits of extended control register XCR0 (xgetbv with
+   ecx = 0), or 0 when it cannot be read with the current compiler.
+   The caller must already know that xgetbv is usable. */
+static FFTS_INLINE unsigned int ffts_get_xcr0(void)
+{
+#if defined(FFTS_HAVE_XGETBV)
+ return (unsigned int) _xgetbv(0);
+#elif defined(_MSC_VER)
+#if defined(FFTS_CPU_X64)
+ /* emulate xgetbv(0) on Windows 7 SP1 or newer */
+ /* the function is looked up at run time; if it is missing, 0 is returned */
+ typedef DWORD64 (WINAPI *PGETENABLEDXSTATEFEATURES)(VOID);
+ PGETENABLEDXSTATEFEATURES pfnGetEnabledXStateFeatures =
+ (PGETENABLEDXSTATEFEATURES) GetProcAddress(
+ GetModuleHandle(TEXT("kernel32.dll")), "GetEnabledXStateFeatures");
+ return pfnGetEnabledXStateFeatures ? (unsigned int) pfnGetEnabledXStateFeatures() : 0;
+#else
+ /* note that we have to touch edx register to tell compiler it's used by emitted xgetbv */
+ unsigned __int32 hi, lo;
+ __asm {
+ xor ecx, ecx
+ _emit 0x0f
+ _emit 0x01
+ _emit 0xd0
+ mov lo, eax
+ mov hi, edx
+ }
+ return (unsigned int) lo;
+#endif
+#elif defined(__GNUC__) && __GNUC__
+ /* xgetbv is emitted as raw opcode bytes (0f 01 d0); edx is listed as
+ clobbered because the instruction also writes it */
+ unsigned int lo;
+ __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(lo)
+ : "c"(0)
+ : "edx");
+ return lo;
+#else
+ /* unknown x86 compiler */
+ return 0;
+#endif
+}
+#if defined(FFTS_HAVE_XGETBV)
+#pragma optimize("", on)
+#endif
+
+/*
+ * Detect the x86 SIMD instruction sets usable by this process.
+ *
+ * Returns a bit mask of FFTS_CPU_X86_* flags. Detection is cumulative: it
+ * stops at the first missing level, so a set flag implies that all lower
+ * levels are set as well. The result is computed on the first call and
+ * cached in function-local statics for later calls.
+ *
+ * extra_flags - optional (may be NULL); receives the cached extra flags,
+ *               which this implementation always leaves at 0
+ */
+int
+ffts_cpu_detect(int *extra_flags)
+{
+    static int cpu_flags = -1;
+    static int cpu_extra_flags = -1;
+    int max_basic_func;
+    int regs[4];
+    unsigned int xcr0;
+
+    /* already detected */
+    if (cpu_flags >= 0) {
+        goto exit;
+    }
+
+    /* initialize */
+    cpu_flags = cpu_extra_flags = 0;
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid check: ");
+#endif
+#if defined(FFTS_CPU_X64)
+    /* cpuid is always supported on x64 */
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("skipped\n");
+#endif
+#else
+    /* try to flip bit 21 of EFLAGS; regs[0] receives the value read back
+       after the attempt, regs[1] the original value (which is restored) */
+#if defined(_MSC_VER)
+    _asm {
+        pushfd
+        pop eax
+        mov ebx,eax
+        xor eax,200000h
+        push eax
+        popfd
+        pushfd
+        pop eax
+        push ebx
+        popfd
+        mov regs[0 * TYPE regs],eax
+        mov regs[1 * TYPE regs],ebx
+    }
+#else
+    __asm__ (
+        "pushfl\n\t"
+        "pop %0\n\t"
+        "movl %0,%1\n\t"
+        "xorl $0x200000,%0\n\t"
+        "pushl %0\n\t"
+        "popfl\n\t"
+        "pushfl\n\t"
+        "popl %0\n\t"
+        "pushl %1\n\t"
+        "popfl\n\t"
+        : "=r" (regs[0]), "=r" (regs[1])
+    );
+#endif
+    /* check CPUID bit (bit 21) in EFLAGS register can be toggled */
+    if (((regs[0] ^ regs[1]) & 0x200000) == 0) {
+#if defined(FFTS_BUILDING_CPU_TEST)
+        printf("not supported\n");
+#endif
+        goto exit;
+    }
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("supported\n");
+#endif
+#endif
+
+    /* get the number of basic functions */
+    ffts_cpuid(regs, 0, 0);
+    max_basic_func = regs[0];
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid eax=0, ecx=0: %d\n", max_basic_func);
+#endif
+    if (max_basic_func == 0)
+        goto exit;
+
+    /* get feature flags */
+    ffts_cpuid(regs, 1, 0);
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid eax=1, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
+#endif
+
+#if defined(FFTS_CPU_X64)
+    /* minimum for any x64 */
+    cpu_flags = FFTS_CPU_X86_SSE | FFTS_CPU_X86_SSE2;
+#else
+    /* test if SSE is supported */
+    if ((regs[3] & FFTS_CPU_X86_SSE_BITS) != FFTS_CPU_X86_SSE_BITS)
+        goto exit;
+    cpu_flags = FFTS_CPU_X86_SSE;
+
+    /* test if SSE2 is supported */
+    if (!(regs[3] & FFTS_CPU_X86_SSE2_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE2;
+#endif
+
+    /* test if SSE3 is supported */
+    if (!(regs[2] & FFTS_CPU_X86_SSE3_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE3;
+
+    /* test if SSSE3 is supported */
+    if (!(regs[2] & FFTS_CPU_X86_SSSE3_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSSE3;
+
+    /* test if SSE4.1 is supported */
+    if (!(regs[2] & FFTS_CPU_X86_SSE4_1_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE4_1;
+
+    /* test if SSE4.2 is supported */
+    if ((regs[2] & FFTS_CPU_X86_SSE4_2_BITS) != FFTS_CPU_X86_SSE4_2_BITS)
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE4_2;
+
+    /* test if AVX is supported; the mask includes OSXSAVE, so xgetbv is
+       known to be usable once this check has passed */
+    if ((regs[2] & FFTS_CPU_X86_AVX_BITS) != FFTS_CPU_X86_AVX_BITS)
+        goto exit;
+
+    /* test if 128-bit SSE (bit 1) and 256-bit AVX (bit 2) states are enabled in XCR0 */
+    xcr0 = ffts_get_xcr0();
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("xcr0: %u\n", xcr0);
+#endif
+    if ((xcr0 & 0x6) != 0x6)
+        goto exit;
+
+    cpu_flags |= FFTS_CPU_X86_AVX;
+
+    /* check that cpuid extended features exist */
+    if (max_basic_func < 7)
+        goto exit;
+
+    /* get extended features */
+    ffts_cpuid(regs, 7, 0);
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid eax=7, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
+#endif
+
+    /* test if AVX2 is supported */
+    if ((regs[1] & FFTS_CPU_X86_AVX2_BITS) != FFTS_CPU_X86_AVX2_BITS)
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_AVX2;
+
+    /* test if AVX512 is supported */
+    if ((regs[1] & FFTS_CPU_X86_AVX512_BITS) != FFTS_CPU_X86_AVX512_BITS)
+        goto exit;
+
+    /* the CPU bit alone is not enough: the operating system must also have
+       enabled the opmask, ZMM_Hi256 and Hi16_ZMM states (XCR0 bits 5..7),
+       otherwise executing AVX-512 instructions faults */
+    if ((xcr0 & 0xe0) != 0xe0)
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_AVX512;
+
+exit:
+    if (extra_flags) {
+        *extra_flags = cpu_extra_flags;
+    }
+    return cpu_flags;
+}
+#else
+/*
+ * Fallback for targets without CPU detection support: reports no features.
+ *
+ * extra_flags - optional (may be NULL); set to 0 so that callers never read
+ *               an uninitialized value, matching the x86 implementation
+ *
+ * Always returns 0.
+ */
+int
+ffts_cpu_detect(int *extra_flags)
+{
+    /* not implemented */
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("CPU detection not implemented!!\n");
+#endif
+    if (extra_flags) {
+        *extra_flags = 0;
+    }
+    return 0;
+}
+#endif \ No newline at end of file
diff --git a/lib/ffts/src/ffts_cpu.h b/lib/ffts/src/ffts_cpu.h
new file mode 100644
index 0000000..37d77e4
--- /dev/null
+++ b/lib/ffts/src/ffts_cpu.h
@@ -0,0 +1,54 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_CPU_H
+#define FFTS_CPU_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts_internal.h"
+
+/* bit flags returned by ffts_cpu_detect() */
+#define FFTS_CPU_X86_SSE    0x001
+#define FFTS_CPU_X86_SSE2   0x002
+#define FFTS_CPU_X86_SSE3   0x004
+#define FFTS_CPU_X86_SSSE3  0x008
+#define FFTS_CPU_X86_SSE4_1 0x010
+#define FFTS_CPU_X86_SSE4_2 0x020
+#define FFTS_CPU_X86_AVX    0x040
+#define FFTS_CPU_X86_AVX2   0x080
+#define FFTS_CPU_X86_AVX512 0x100
+
+/* Returns a mask of the FFTS_CPU_X86_* flags above (0 when nothing was
+   detected). extra_flags may be NULL. */
+int
+ffts_cpu_detect(int *extra_flags);
+
+#endif /* FFTS_CPU_H */
diff --git a/lib/ffts/src/ffts_internal.h b/lib/ffts/src/ffts_internal.h
index 157c283..04ebb9c 100644
--- a/lib/ffts/src/ffts_internal.h
+++ b/lib/ffts/src/ffts_internal.h
@@ -2,6 +2,7 @@
This file is part of FFTS -- The Fastest Fourier Transform in the South
+Copyright (c) 2015-2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
@@ -34,7 +35,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef FFTS_INTERNAL_H
#define FFTS_INTERNAL_H
+#ifdef AUTOTOOLS_BUILD
#include "config.h"
+#endif
+
#include "ffts_attributes.h"
#include "types.h"
@@ -42,18 +46,59 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <malloc.h>
#endif
+#ifdef HAVE_MM_ALLOC_H
+#include <mm_malloc.h>
+#ifndef HAVE__MM_MALLOC
+#define HAVE__MM_MALLOC
+#endif
+#endif
+
#include <stddef.h>
-#ifdef HAVE_STDINT_H
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#elif HAVE_STDINT_H
#include <stdint.h>
+#elif _MSC_VER
+typedef __int32 int32_t;
+typedef __int64 int64_t;
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+#else
+typedef signed long int int32_t;
+typedef unsigned long int uint32_t;
+typedef signed long long int int64_t;
+typedef unsigned long long int uint64_t;
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
#include <stdio.h>
+#if defined(HAVE_DECL_MEMALIGN) && !HAVE_DECL_MEMALIGN
+extern void *memalign(size_t, size_t);
+#endif
+
+#if defined(HAVE_DECL_POSIX_MEMALIGN) && !HAVE_DECL_POSIX_MEMALIGN
+extern int posix_memalign(void **, size_t, size_t);
+#endif
+
+#if defined(HAVE_DECL_VALLOC) && !HAVE_DECL_VALLOC
+extern void *valloc(size_t);
+#endif
+
+#ifdef _mm_malloc
+#ifndef HAVE__MM_MALLOC
+#define HAVE__MM_MALLOC
+#endif
+#endif
+
#ifdef ENABLE_LOG
#ifdef __ANDROID__
#include <android/log.h>
@@ -142,11 +187,9 @@ struct _ffts_plan_t {
*/
size_t transform_size;
- /**
- * Points to the cosnant variables used by
- * the Assembly Code
- */
- void *constants;
+ /* pointer to the constant variable used by SSE for sign change */
+ /* TODO: #ifdef HAVE_SSE */
+ const void *constants;
// multi-dimensional stuff:
struct _ffts_plan_t **plans;
@@ -171,44 +214,96 @@ struct _ffts_plan_t {
size_t i2;
};
-static FFTS_INLINE void *ffts_aligned_malloc(size_t size)
+static FFTS_INLINE void*
+ffts_aligned_malloc(size_t size)
{
-#if defined(_WIN32)
- return _aligned_malloc(size, 32);
+ void *p = NULL;
+
+ /* various ways to allocate aligned memory in order of preferance */
+#if defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
+ p = (void*) _mm_malloc(size, 32);
+#elif defined(HAVE_POSIX_MEMALIGN)
+ if (posix_memalign(&p, 32, size))
+ p = NULL;
+#elif defined(HAVE_MEMALIGN)
+ p = memalign(32, size);
+#elif defined(__ALTIVEC__)
+ p = vec_malloc(size);
+#elif defined(_MSC_VER) || defined(WIN32)
+ p = _aligned_malloc(size, 32);
+#elif defined(HAVE_VALLOC)
+ p = valloc(size);
#else
- return valloc(size);
+ p = malloc(size);
#endif
+
+ return p;
}
-static FFTS_INLINE void ffts_aligned_free(void *p)
+static FFTS_INLINE
+void ffts_aligned_free(void *p)
{
-#if defined(_WIN32)
+ /* order must match with ffts_aligned_malloc */
+#if defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
+ _mm_free(p);
+#elif defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_MEMALIGN)
+ free(p);
+#elif defined(__ALTIVEC__)
+ vec_free(p);
+#elif defined(_MSC_VER) || defined(WIN32)
_aligned_free(p);
#else
+ /* valloc or malloc */
free(p);
#endif
}
#if GCC_VERSION_AT_LEAST(3,3)
#define ffts_ctzl __builtin_ctzl
+
+/* Returns the smallest power of two strictly greater than N, that is
+   2^(floor(log2(N)) + 1). N must be nonzero and its most significant
+   unsigned long bit must be clear. */
+static FFTS_INLINE size_t
+ffts_next_power_of_2(size_t N)
+{
+    /* __builtin_clzl counts leading zeros of an unsigned long, which is 64
+       bits wide on LP64 targets; derive the width instead of assuming 32,
+       and shift a size_t so the result is not truncated to int */
+    return (size_t) 1 << (8 * sizeof(unsigned long) - __builtin_clzl(N));
+}
#elif defined(_MSC_VER)
#include <intrin.h>
#ifdef _M_X64
#pragma intrinsic(_BitScanForward64)
-static __inline unsigned long ffts_ctzl(size_t N)
+static FFTS_INLINE unsigned long
+ffts_ctzl(size_t N)
{
unsigned long count;
_BitScanForward64((unsigned long*) &count, N);
return count;
}
+
+#pragma intrinsic(_BitScanReverse64)
+/* Returns 2^(log_2 + 1), where log_2 is the index of the highest set bit
+   of N as reported by _BitScanReverse64.
+   NOTE(review): assumes N is nonzero -- confirm callers guarantee this. */
+static FFTS_INLINE size_t
+ffts_next_power_of_2(size_t N)
+{
+ unsigned long log_2;
+ _BitScanReverse64((unsigned long*)&log_2, N);
+ return 1ULL << (log_2 + 1);
+}
#else
#pragma intrinsic(_BitScanForward)
-static __inline unsigned long ffts_ctzl(size_t N)
+static FFTS_INLINE unsigned long
+ffts_ctzl(size_t N)
{
unsigned long count;
_BitScanForward((unsigned long*) &count, N);
return count;
}
+
+#pragma intrinsic(_BitScanReverse)
+/* Returns 2^(log_2 + 1), where log_2 is the index of the highest set bit
+   of N as reported by _BitScanReverse.
+   NOTE(review): assumes N is nonzero -- confirm callers guarantee this. */
+static FFTS_INLINE size_t
+ffts_next_power_of_2(size_t N)
+{
+ unsigned long log_2;
+ _BitScanReverse((unsigned long*)&log_2, N);
+ return 1 << (log_2 + 1);
+}
#endif /* _WIN64 */
#endif /* _MSC_VER */
diff --git a/lib/ffts/src/ffts_real.c b/lib/ffts/src/ffts_real.c
index 0f87a12..e0f0e1f 100644
--- a/lib/ffts/src/ffts_real.c
+++ b/lib/ffts/src/ffts_real.c
@@ -4,7 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
-Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2015 - 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@@ -33,6 +33,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_real.h"
+#include "ffts_cpu.h"
#include "ffts_internal.h"
#include "ffts_trig.h"
@@ -46,7 +47,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <pmmintrin.h>
#elif HAVE_INTRIN_H
#include <intrin.h>
-#else
+#endif
+
/* avoid using negative zero as some configurations have problems with those */
static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = {
0x80000000, 0, 0x80000000, 0
@@ -55,7 +57,6 @@ static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = {
0, 0x80000000, 0, 0x80000000
};
#endif
-#endif
static void
ffts_free_1d_real(ffts_plan_t *p)
@@ -79,8 +80,9 @@ ffts_free_1d_real(ffts_plan_t *p)
free(p);
}
+#ifdef __ARM_NEON__
static void
-ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
+ffts_execute_1d_real_neon(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT out =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
@@ -91,25 +93,19 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
- int i;
-
-#ifdef __ARM_NEON__
float *p_buf0 = buf;
float *p_buf1 = buf + N - 2;
float *p_out = out;
-#endif
+ int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
p->plans[0]->transform(p->plans[0], input, buf);
-#ifndef HAVE_SSE
buf[N + 0] = buf[0];
buf[N + 1] = buf[1];
-#endif
-#ifdef __ARM_NEON__
for (i = 0; i < N; i += 4) {
__asm__ __volatile__ (
"vld1.32 {q8}, [%[pa]]!\n\t"
@@ -151,7 +147,35 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#elif HAVE_SSE3
+
+ out[N + 0] = buf[0] - buf[1];
+ out[N + 1] = 0.0f;
+}
+#endif
+
+#if HAVE_SSE3
+static void
+ffts_execute_1d_real_sse3(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT out =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
+ p->plans[0]->transform(p->plans[0], input, buf);
+
+ buf[N + 0] = buf[0];
+ buf[N + 1] = buf[1];
+
if (FFTS_UNLIKELY(N <= 8)) {
__m128 t0 = _mm_load_ps(buf);
__m128 t1 = _mm_load_ps(buf + N - 4);
@@ -235,7 +259,32 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
}
}
-#elif HAVE_SSE
+
+ out[N + 0] = buf[0] - buf[1];
+ out[N + 1] = 0.0f;
+}
+#endif
+
+#ifdef HAVE_SSE
+static void
+ffts_execute_1d_real_sse(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT out =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
+ p->plans[0]->transform(p->plans[0], input, buf);
+
if (FFTS_UNLIKELY(N <= 8)) {
__m128 c0 = _mm_load_ps((const float*) sign_mask_even);
__m128 t0 = _mm_load_ps(buf);
@@ -327,7 +376,34 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
_MM_SHUFFLE(2,3,0,1)))));
}
}
-#else
+
+ out[N + 0] = buf[0] - buf[1];
+ out[N + 1] = 0.0f;
+}
+#endif
+
+static void
+ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT out =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
+ p->plans[0]->transform(p->plans[0], input, buf);
+
+ buf[N + 0] = buf[0];
+ buf[N + 1] = buf[1];
+
for (i = 0; i < N/2; i++) {
out[2*i + 0] =
buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] +
@@ -336,14 +412,14 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] +
buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
}
-#endif
out[N + 0] = buf[0] - buf[1];
out[N + 1] = 0.0f;
}
+#ifdef __ARM_NEON__
static void
-ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
+ffts_execute_1d_real_inv_neon(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT in =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
@@ -354,18 +430,14 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
- int i;
-
-#ifdef __ARM_NEON__
float *p_buf0 = in;
float *p_buf1 = in + N - 2;
float *p_out = buf;
-#endif
+ int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
-#ifdef __ARM_NEON__
for (i = 0; i < N/2; i += 2) {
__asm__ __volatile__ (
"vld1.32 {q8}, [%[pa]]!\n\t"
@@ -407,7 +479,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#elif HAVE_SSE3
+
+ p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+#if HAVE_SSE3
+static void
+ffts_execute_1d_real_inv_sse3(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT in =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
if (FFTS_UNLIKELY(N <= 8)) {
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
__m128 t1 = _mm_load_ps(in);
@@ -492,7 +586,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
}
}
-#elif HAVE_SSE
+
+ p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+#if HAVE_SSE
+static void
+ffts_execute_1d_real_inv_sse(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT in =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
if (FFTS_UNLIKELY(N <= 8)) {
__m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
@@ -585,7 +701,28 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
_mm_xor_ps(t4, c0))));
}
}
-#else
+
+ p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+static void
+ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
+{
+ float *const FFTS_RESTRICT in =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+ float *const FFTS_RESTRICT buf =
+ (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+ const float *const FFTS_RESTRICT A =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+ const float *const FFTS_RESTRICT B =
+ (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+ const int N = (const int) p->N;
+ int i;
+
+ /* we know this */
+ FFTS_ASSUME(N/2 > 0);
+
for (i = 0; i < N/2; i++) {
buf[2*i + 0] =
in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] +
@@ -594,7 +731,6 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] -
in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
}
-#endif
p->plans[0]->transform(p->plans[0], buf, output);
}
@@ -602,18 +738,35 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
FFTS_API ffts_plan_t*
ffts_init_1d_real(size_t N, int sign)
{
+#ifndef __ARM_NEON__
+ int cpu_flags = ffts_cpu_detect(NULL);
+#endif
ffts_plan_t *p;
+ int invert = 0;
p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
if (!p) {
return NULL;
}
- if (sign < 0) {
- p->transform = &ffts_execute_1d_real;
- } else {
- p->transform = &ffts_execute_1d_real_inv;
+#ifdef __ARM_NEON__
+    /*
+     * NEON builds use the NEON kernels in both directions. The inverse
+     * previously pointed at the scalar ffts_execute_1d_real_inv, which
+     * left ffts_execute_1d_real_inv_neon defined but never called.
+     */
+    p->transform = (sign < 0) ? &ffts_execute_1d_real_neon : &ffts_execute_1d_real_inv_neon;
+#else
+#ifdef HAVE_SSE3
+    /* only the SSE3 kernels request the inverted table layout (invert = 1) */
+    if (cpu_flags & FFTS_CPU_X86_SSE3) {
+        p->transform = (sign < 0) ? &ffts_execute_1d_real_sse3 : &ffts_execute_1d_real_inv_sse3;
+        invert = 1;
+    } else
+#endif
+#ifdef HAVE_SSE
+    if (cpu_flags & FFTS_CPU_X86_SSE) {
+        p->transform = (sign < 0) ? &ffts_execute_1d_real_sse : &ffts_execute_1d_real_inv_sse;
+    } else
+#endif
+    {
+        /* portable C fallback when no SIMD kernel was selected above */
+        p->transform = (sign < 0) ? &ffts_execute_1d_real : &ffts_execute_1d_real_inv;
    }
+#endif
p->destroy = &ffts_free_1d_real;
p->N = N;
@@ -640,12 +793,7 @@ ffts_init_1d_real(size_t N, int sign)
goto cleanup;
}
-#ifdef HAVE_SSE3
- ffts_generate_table_1d_real_32f(p, sign, 1);
-#else
- ffts_generate_table_1d_real_32f(p, sign, 0);
-#endif
-
+ ffts_generate_table_1d_real_32f(p, sign, invert);
return p;
cleanup:
diff --git a/lib/ffts/src/ffts_static.c b/lib/ffts/src/ffts_static.c
index 09de6d7..87d8b23 100644
--- a/lib/ffts/src/ffts_static.c
+++ b/lib/ffts/src/ffts_static.c
@@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@@ -258,6 +259,29 @@ static const FFTS_ALIGN(16) double ffts_constants_inv_64f[16] = {
-0.7071067811865475244008443621048490392848359376884740
};
+#ifdef FFTS_DOUBLE /* V4DF (double) variant of V4SF_K_0; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+V4DF_K_0(int inv,
+ V4DF *r0,
+ V4DF *r1,
+ V4DF *r2,
+ V4DF *r3)
+{
+ V4DF t0, t1, t2, t3;
+ /* copy r0/r1 first because all four outputs are written back in place */
+ t0 = *r0;
+ t1 = *r1;
+ /* t2 = r2 + r3; t3 = IMULI(inv, r2 - r3) (IMULI is defined elsewhere; presumably a multiply by +/-i selected by inv - confirm) */
+ t2 = V4DF_ADD(*r2, *r3);
+ t3 = V4DF_IMULI(inv, V4DF_SUB(*r2, *r3));
+ /* r0/r2 receive t0 +/- t2, r1/r3 receive t1 -/+ t3 */
+ *r0 = V4DF_ADD(t0, t2);
+ *r2 = V4DF_SUB(t0, t2);
+ *r1 = V4DF_SUB(t1, t3);
+ *r3 = V4DF_ADD(t1, t3);
+}
+#endif
+
static FFTS_INLINE void
V4SF_K_0(int inv,
V4SF *r0,
@@ -279,6 +303,31 @@ V4SF_K_0(int inv,
*r3 = V4SF_ADD(t1, t3);
}
+#ifdef FFTS_DOUBLE /* V4DF (double) variant of V4SF_L_2; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+V4DF_L_2(const double *FFTS_RESTRICT i0,
+ const double *FFTS_RESTRICT i1,
+ const double *FFTS_RESTRICT i2,
+ const double *FFTS_RESTRICT i3,
+ V4DF *r0,
+ V4DF *r1,
+ V4DF *r2,
+ V4DF *r3)
+{
+ V4DF t0, t1, t2, t3;
+ /* load one vector from each of the four input pointers */
+ t0 = V4DF_LD(i0);
+ t1 = V4DF_LD(i1);
+ t2 = V4DF_LD(i2);
+ t3 = V4DF_LD(i3);
+ /* r0/r1 = sum/difference of (i0, i1); r2/r3 = sum/difference of (i2, i3) */
+ *r0 = V4DF_ADD(t0, t1);
+ *r1 = V4DF_SUB(t0, t1);
+ *r2 = V4DF_ADD(t2, t3);
+ *r3 = V4DF_SUB(t2, t3);
+}
+#endif
+
static FFTS_INLINE void
V4SF_L_2(const float *FFTS_RESTRICT i0,
const float *FFTS_RESTRICT i1,
@@ -302,6 +351,37 @@ V4SF_L_2(const float *FFTS_RESTRICT i0,
*r3 = V4SF_SUB(t2, t3);
}
+#ifdef FFTS_DOUBLE /* V4DF (double) variant of V4SF_L_4; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+V4DF_L_4(int inv,
+ const double *FFTS_RESTRICT i0,
+ const double *FFTS_RESTRICT i1,
+ const double *FFTS_RESTRICT i2,
+ const double *FFTS_RESTRICT i3,
+ V4DF *r0,
+ V4DF *r1,
+ V4DF *r2,
+ V4DF *r3)
+{
+ V4DF t0, t1, t2, t3, t4, t5, t6, t7;
+ /* load one vector from each of the four input pointers */
+ t0 = V4DF_LD(i0);
+ t1 = V4DF_LD(i1);
+ t2 = V4DF_LD(i2);
+ t3 = V4DF_LD(i3);
+ /* first stage: sums and differences per pair; the (i2 - i3) difference additionally goes through IMULI(inv, ...) */
+ t4 = V4DF_ADD(t0, t1);
+ t5 = V4DF_SUB(t0, t1);
+ t6 = V4DF_ADD(t2, t3);
+ t7 = V4DF_IMULI(inv, V4DF_SUB(t2, t3));
+ /* second stage: r0/r2 = t4 +/- t6, r1/r3 = t5 -/+ t7 */
+ *r0 = V4DF_ADD(t4, t6);
+ *r2 = V4DF_SUB(t4, t6);
+ *r1 = V4DF_SUB(t5, t7);
+ *r3 = V4DF_ADD(t5, t7);
+}
+#endif
+
static FFTS_INLINE void
V4SF_L_4(int inv,
const float *FFTS_RESTRICT i0,
@@ -331,6 +411,36 @@ V4SF_L_4(int inv,
*r3 = V4SF_ADD(t5, t7);
}
+#ifdef FFTS_DOUBLE /* V4DF (double) variant of V4SF_LEAF_EE; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+V4DF_LEAF_EE(double *const FFTS_RESTRICT out,
+ const ptrdiff_t *FFTS_RESTRICT os,
+ const double *FFTS_RESTRICT in,
+ const ptrdiff_t *FFTS_RESTRICT is,
+ int inv)
+{
+ const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
+ /* inv selects between the two constant tables above */
+ V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+ /* two output bases, taken from the first two entries of os */
+ double *out0 = out + os[0];
+ double *out1 = out + os[1];
+ /* inputs at in + is[0..3] go through L_4, inputs at in + is[4..7] through L_2 */
+ V4DF_L_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+ V4DF_L_2(in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7);
+ /* even registers are combined by K_0, odd registers by K_N with LUT + 0 and LUT + 4; TX2 (defined elsewhere) is then applied to each even/odd pair */
+ V4DF_K_0(inv, &r0, &r2, &r4, &r6);
+ V4DF_K_N(inv, V4DF_LD(LUT + 0), V4DF_LD(LUT + 4), &r1, &r3, &r5, &r7);
+ V4DF_TX2(&r0, &r1);
+ V4DF_TX2(&r2, &r3);
+ V4DF_TX2(&r4, &r5);
+ V4DF_TX2(&r6, &r7);
+ /* even registers are stored at out0, odd registers at out1, at offsets 0/4/8/12 */
+ V4DF_S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
+ V4DF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
+}
+#endif
+
static FFTS_INLINE void
V4SF_LEAF_EE(float *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
@@ -359,6 +469,36 @@ V4SF_LEAF_EE(float *const FFTS_RESTRICT out,
V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
+#ifdef FFTS_DOUBLE /* V4DF (double) variant of V4SF_LEAF_EE2; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+V4DF_LEAF_EE2(double *const FFTS_RESTRICT out,
+ const ptrdiff_t *FFTS_RESTRICT os,
+ const double *FFTS_RESTRICT in,
+ const ptrdiff_t *FFTS_RESTRICT is,
+ int inv)
+{
+ const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
+ /* inv selects between the two constant tables above */
+ V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+ /* two output bases, taken from the first two entries of os */
+ double *out0 = out + os[0];
+ double *out1 = out + os[1];
+ /* same pipeline as V4DF_LEAF_EE but with a different input order: L_4 reads is[6], is[7], is[4], is[5]; L_2 reads is[0], is[1], is[3], is[2] */
+ V4DF_L_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r0, &r1, &r2, &r3);
+ V4DF_L_2(in + is[0], in + is[1], in + is[3], in + is[2], &r4, &r5, &r6, &r7);
+ /* even registers are combined by K_0, odd registers by K_N with LUT + 0 and LUT + 4; TX2 (defined elsewhere) is then applied to each even/odd pair */
+ V4DF_K_0(inv, &r0, &r2, &r4, &r6);
+ V4DF_K_N(inv, V4DF_LD(LUT + 0), V4DF_LD(LUT + 4), &r1, &r3, &r5, &r7);
+ V4DF_TX2(&r0, &r1);
+ V4DF_TX2(&r2, &r3);
+ V4DF_TX2(&r4, &r5);
+ V4DF_TX2(&r6, &r7);
+ /* even registers are stored at out0, odd registers at out1, at offsets 0/4/8/12 */
+ V4DF_S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
+ V4DF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
+}
+#endif
+
static FFTS_INLINE void
V4SF_LEAF_EE2(float *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
@@ -387,6 +527,30 @@ V4SF_LEAF_EE2(float *const FFTS_RESTRICT out,
V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
+#ifdef FFTS_DOUBLE /* V4DF (double) variant of V4SF_LEAF_EO; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+V4DF_LEAF_EO(double *const FFTS_RESTRICT out,
+ const ptrdiff_t *FFTS_RESTRICT os,
+ const double *FFTS_RESTRICT in,
+ const ptrdiff_t *FFTS_RESTRICT is,
+ int inv)
+{
+ const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
+ /* inv selects between the two constant tables above */
+ V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+ /* two output bases, taken from the first two entries of os */
+ double *out0 = out + os[0];
+ double *out1 = out + os[1];
+ /* inputs at in + is[0..3] go through L_4_4, inputs at in + is[4..7] through L_2_4 (both defined elsewhere) */
+ V4DF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+ V4DF_L_2_4(inv, in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7);
+ /* r2, r3, r7, r6 are stored to out1 as loaded; r0, r1, r4, r5 pass through K_N with LUT + 8 and LUT + 12 before being stored to out0 */
+ V4DF_S_4(r2, r3, r7, r6, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
+ V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r0, &r1, &r4, &r5);
+ V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
+}
+#endif
+
static FFTS_INLINE void
V4SF_LEAF_EO(float *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
@@ -409,6 +573,30 @@ V4SF_LEAF_EO(float *const FFTS_RESTRICT out,
V4SF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
}
+#ifdef FFTS_DOUBLE /* V4DF (double) variant of V4SF_LEAF_OE; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+V4DF_LEAF_OE(double *const FFTS_RESTRICT out,
+ const ptrdiff_t *FFTS_RESTRICT os,
+ const double *FFTS_RESTRICT in,
+ const ptrdiff_t *FFTS_RESTRICT is,
+ int inv)
+{
+ const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
+ /* inv selects between the two constant tables above */
+ V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+ /* two output bases, taken from the first two entries of os */
+ double *out0 = out + os[0];
+ double *out1 = out + os[1];
+ /* inputs at in + is[0..3] go through L_4_2; inputs at is[6], is[7], is[4], is[5] (in that order) go through L_4_4 */
+ V4DF_L_4_2(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+ V4DF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7);
+ /* r0, r1, r4, r5 are stored to out0 as loaded; r6, r7, r2, r3 pass through K_N with LUT + 8 and LUT + 12 before being stored to out1 */
+ V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
+ V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r6, &r7, &r2, &r3);
+ V4DF_S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
+}
+#endif
+
static FFTS_INLINE void
V4SF_LEAF_OE(float *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
@@ -431,6 +619,27 @@ V4SF_LEAF_OE(float *const FFTS_RESTRICT out,
V4SF_S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
+#ifdef FFTS_DOUBLE /* V4DF (double) variant of V4SF_LEAF_OO; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+V4DF_LEAF_OO(double *const FFTS_RESTRICT out,
+ const ptrdiff_t *FFTS_RESTRICT os,
+ const double *FFTS_RESTRICT in,
+ const ptrdiff_t *FFTS_RESTRICT is,
+ int inv)
+{
+ V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+ /* two output bases, taken from the first two entries of os; no constant table is used by this leaf */
+ double *out0 = out + os[0];
+ double *out1 = out + os[1];
+ /* two L_4_4 loads: is[0..3] in order, then is[6], is[7], is[4], is[5] */
+ V4DF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+ V4DF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7);
+ /* results are stored without a further K_N stage: r0, r1, r4, r5 to out0 and r2, r3, r6, r7 to out1 */
+ V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
+ V4DF_S_4(r2, r3, r6, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
+}
+#endif
+
static FFTS_INLINE void
V4SF_LEAF_OO(float *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
@@ -450,6 +659,34 @@ V4SF_LEAF_OO(float *const FFTS_RESTRICT out,
V4SF_S_4(r2, r3, r6, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
+#ifdef FFTS_DOUBLE /* V4DF (double) variant of V4SF_X_4; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+V4DF_X_4(int inv,
+ double *FFTS_RESTRICT data,
+ size_t N,
+ const double *FFTS_RESTRICT LUT)
+{
+ size_t i;
+ /* N/8 iterations; each one updates four vectors in place, spaced 2*N/4 doubles apart */
+ for (i = 0; i < N/8; i++) {
+ V4DF r0 = V4DF_LD(data);
+ V4DF r1 = V4DF_LD(data + 2*N/4);
+ V4DF r2 = V4DF_LD(data + 4*N/4);
+ V4DF r3 = V4DF_LD(data + 6*N/4);
+ /* K_N consumes the two LUT vectors at LUT and LUT + 4 */
+ V4DF_K_N(inv, V4DF_LD(LUT), V4DF_LD(LUT + 4), &r0, &r1, &r2, &r3);
+ /* results go back to the addresses they were loaded from */
+ V4DF_ST(data , r0);
+ V4DF_ST(data + 2*N/4, r1);
+ V4DF_ST(data + 4*N/4, r2);
+ V4DF_ST(data + 6*N/4, r3);
+ /* advance: 8 LUT doubles and 4 data doubles were consumed by this iteration */
+ LUT += 8;
+ data += 4;
+ }
+}
+#endif
+
static FFTS_INLINE void
V4SF_X_4(int inv,
float *FFTS_RESTRICT data,
@@ -536,6 +773,68 @@ V4SF_X_8(int inv,
}
}
+#ifdef FFTS_DOUBLE /* V4DF (double) variant of V4SF_X_8; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+V4DF_X_8(int inv,
+ double *FFTS_RESTRICT data0,
+ size_t N,
+ const double *FFTS_RESTRICT LUT)
+{
+ double *data1 = data0 + 1*N/4; /* data0..data7 are eight cursors spaced N/4 doubles apart */
+ double *data2 = data0 + 2*N/4;
+ double *data3 = data0 + 3*N/4;
+ double *data4 = data0 + 4*N/4;
+ double *data5 = data0 + 5*N/4;
+ double *data6 = data0 + 6*N/4;
+ double *data7 = data0 + 7*N/4;
+ size_t i;
+ /* N/16 iterations; each one runs three K_N steps over eight vectors and consumes 24 LUT doubles */
+ for (i = 0; i < N/16; i++) {
+ V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+ /* first K_N: r0..r3 with LUT + 0 and LUT + 4 */
+ r0 = V4DF_LD(data0);
+ r1 = V4DF_LD(data1);
+ r2 = V4DF_LD(data2);
+ r3 = V4DF_LD(data3);
+
+ V4DF_K_N(inv, V4DF_LD(LUT), V4DF_LD(LUT + 4), &r0, &r1, &r2, &r3);
+ r4 = V4DF_LD(data4);
+ r6 = V4DF_LD(data6);
+ /* second K_N: even registers r0, r2, r4, r6 with LUT + 8 and LUT + 12 */
+ V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r0, &r2, &r4, &r6);
+ r5 = V4DF_LD(data5);
+ r7 = V4DF_LD(data7);
+ /* third K_N: odd registers r1, r3, r5, r7 with LUT + 16 and LUT + 20 */
+ V4DF_K_N(inv, V4DF_LD(LUT + 16), V4DF_LD(LUT + 20), &r1, &r3, &r5, &r7);
+ LUT += 24;
+ /* store each register back to its cursor and advance that cursor by one vector (4 doubles) */
+ V4DF_ST(data0, r0);
+ data0 += 4;
+
+ V4DF_ST(data1, r1);
+ data1 += 4;
+
+ V4DF_ST(data2, r2);
+ data2 += 4;
+
+ V4DF_ST(data3, r3);
+ data3 += 4;
+
+ V4DF_ST(data4, r4);
+ data4 += 4;
+
+ V4DF_ST(data5, r5);
+ data5 += 4;
+
+ V4DF_ST(data6, r6);
+ data6 += 4;
+
+ V4DF_ST(data7, r7);
+ data7 += 4;
+ }
+}
+#endif
+
static FFTS_INLINE void
ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out,
const float *FFTS_RESTRICT in,
@@ -569,6 +868,41 @@ ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out,
}
}
+#ifdef FFTS_DOUBLE /* double variant of ffts_static_firstpass_odd_32f; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+ffts_static_firstpass_odd_64f(double *const FFTS_RESTRICT out,
+ const double *FFTS_RESTRICT in,
+ const ffts_plan_t *FFTS_RESTRICT p,
+ int inv)
+{
+ size_t i, i0 = p->i0, i1 = p->i1;
+ const ptrdiff_t *is = (const ptrdiff_t*) p->is;
+ const ptrdiff_t *os = (const ptrdiff_t*) p->offsets;
+ /* leaf sequence: i0 x EE, i1 x OO, one OE, i1 x EE2; every leaf advances in by 4 doubles and os by 2 entries */
+ for (i = i0; i > 0; --i) {
+ V4DF_LEAF_EE(out, os, in, is, inv);
+ in += 4;
+ os += 2;
+ }
+
+ for (i = i1; i > 0; --i) {
+ V4DF_LEAF_OO(out, os, in, is, inv);
+ in += 4;
+ os += 2;
+ }
+ /* single OE leaf between the OO run and the EE2 run */
+ V4DF_LEAF_OE(out, os, in, is, inv);
+ in += 4;
+ os += 2;
+
+ for (i = i1; i > 0; --i) {
+ V4DF_LEAF_EE2(out, os, in, is, inv);
+ in += 4;
+ os += 2;
+ }
+}
+#endif
+
void
ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out)
{
@@ -789,23 +1123,23 @@ ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out)
V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
}
+#ifdef FFTS_DOUBLE /* the 64f small transforms are now real implementations and are compiled only when FFTS_DOUBLE is defined */
void
ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out)
{
+ const double *FFTS_RESTRICT lut = ffts_constants_small_64f;
const double *din = (const double*) in;
double *dout = (double*) out;
-// V4SF r0_1, r2_3, r4_5, r6_7;
-// double *LUT8 = (double*) p->ws + p->ws_is[0];
+ V4DF r0_1, r2_3, r4_5, r6_7;
+ /* constants come from the static table above, so the plan is not consulted */
+ /* unreferenced parameter */
(void) p;
- (void) din;
- (void) dout;
-#if MACROS_READY
- L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
- K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
- S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
-#endif
+ V4DF_L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
+ V4DF_K_N(0, V4DF_LD(lut), V4DF_LD(lut + 4), &r0_1, &r2_3, &r4_5, &r6_7);
+ V4DF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
}
+#endif
void
ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out)
@@ -823,24 +1157,23 @@ ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out)
V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
}
+#ifdef FFTS_DOUBLE /* same sequence as ffts_small_forward8_64f, but with inv = 1 and the ffts_constants_small_inv_64f table */
void
ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out)
{
+ const double *FFTS_RESTRICT lut = ffts_constants_small_inv_64f;
const double *din = (const double*) in;
double *dout = (double*) out;
-// V4SF r0_1, r2_3, r4_5, r6_7;
-// double *LUT8 = (double*) p->ws + p->ws_is[0];
- (void) p;
- (void) din;
- (void) dout;
+ V4DF r0_1, r2_3, r4_5, r6_7;
+ /* unreferenced parameter */
+ (void) p;
-#if MACROS_READY
- L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
- K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
- S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
-#endif
+ V4DF_L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
+ V4DF_K_N(1, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7);
+ V4DF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
}
+#endif
void
ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out)
@@ -862,27 +1195,27 @@ ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out)
V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
}
+#ifdef FFTS_DOUBLE /* the 64f small transforms are now real implementations and are compiled only when FFTS_DOUBLE is defined */
void
ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out)
{
+ const double *FFTS_RESTRICT lut = ffts_constants_small_64f;
const double *din = (const double*) in;
double *dout = (double*) out;
-// double *LUT8 = (double*) p->ws;
-// V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+ V4DF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+ /* constants come from the static table above, so the plan is not consulted */
+ /* unreferenced parameter */
(void) p;
- (void) din;
- (void) dout;
-
-#ifdef MACROS_READY
- L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
- L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
- K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
- K_N(0, VLD(LUT8+8), VLD(LUT8+12), &r0_1, &r4_5, &r8_9, &r12_13);
- S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
- K_N(0, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15);
- S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
-#endif
+ /* two loads, then three K_N steps using lut offsets 0/4, 8/12 and 16/20; each half of the registers is stored right after its last K_N */
+ V4DF_L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
+ V4DF_L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
+ V4DF_K_N(0, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7);
+ V4DF_K_N(0, V4DF_LD(lut+8), V4DF_LD(lut+12), &r0_1, &r4_5, &r8_9, &r12_13);
+ V4DF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
+ V4DF_K_N(0, V4DF_LD(lut+16), V4DF_LD(lut+20), &r2_3, &r6_7, &r10_11, &r14_15);
+ V4DF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
}
+#endif
void
ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out)
@@ -904,27 +1237,27 @@ ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out)
V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
}
+#ifdef FFTS_DOUBLE /* same sequence as ffts_small_forward16_64f, but with inv = 1 and the ffts_constants_small_inv_64f table */
void
ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out)
{
+ const double *FFTS_RESTRICT lut = ffts_constants_small_inv_64f;
const double *din = (const double*) in;
double *dout = (double*) out;
-// double *LUT8 = (double*) p->ws;
-// V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+ V4DF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+ /* constants come from the static table above, so the plan is not consulted */
+ /* unreferenced parameter */
(void) p;
- (void) din;
- (void) dout;
-
-#ifdef MACROS_READY
- L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
- L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
- K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
- K_N(1, VLD(LUT8+8), VLD(LUT8+12),&r0_1, &r4_5, &r8_9, &r12_13);
- S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
- K_N(1, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15);
- S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
-#endif
+ /* two loads, then three K_N steps using lut offsets 0/4, 8/12 and 16/20; each half of the registers is stored right after its last K_N */
+ V4DF_L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
+ V4DF_L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
+ V4DF_K_N(1, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7);
+ V4DF_K_N(1, V4DF_LD(lut+8), V4DF_LD(lut+12), &r0_1, &r4_5, &r8_9, &r12_13);
+ V4DF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
+ V4DF_K_N(1, V4DF_LD(lut+16), V4DF_LD(lut+20), &r2_3, &r6_7, &r10_11, &r14_15);
+ V4DF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
}
+#endif
static FFTS_INLINE void
ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out,
@@ -959,6 +1292,41 @@ ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out,
}
}
+#ifdef FFTS_DOUBLE /* double variant of ffts_static_firstpass_even_32f; compiled only when FFTS_DOUBLE is defined */
+static FFTS_INLINE void
+ffts_static_firstpass_even_64f(double *FFTS_RESTRICT out,
+ const double *FFTS_RESTRICT in,
+ const ffts_plan_t *FFTS_RESTRICT p,
+ int inv)
+{
+ size_t i, i0 = p->i0, i1 = p->i1;
+ const ptrdiff_t *is = (const ptrdiff_t*) p->is;
+ const ptrdiff_t *os = (const ptrdiff_t*) p->offsets;
+ /* leaf sequence: i0 x EE, one EO, i1 x OO, i1 x EE2; every leaf advances in by 4 doubles and os by 2 entries */
+ for(i = i0; i > 0; --i) {
+ V4DF_LEAF_EE(out, os, in, is, inv);
+ in += 4;
+ os += 2;
+ }
+ /* single EO leaf between the EE run and the OO run */
+ V4DF_LEAF_EO(out, os, in, is, inv);
+ in += 4;
+ os += 2;
+
+ for (i = i1; i > 0; --i) {
+ V4DF_LEAF_OO(out, os, in, is, inv);
+ in += 4;
+ os += 2;
+ }
+
+ for (i = i1; i > 0; --i) {
+ V4DF_LEAF_EE2(out, os, in, is, inv);
+ in += 4;
+ os += 2;
+ }
+}
+#endif
+
static void
ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
{
@@ -1035,6 +1403,47 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
#endif
}
+#ifdef FFTS_DOUBLE /* double variant of ffts_static_rec_f_32f (all passes run with inv = 0); compiled only when FFTS_DOUBLE is defined */
+static void
+ffts_static_rec_f_64f(const ffts_plan_t *p, double *data, size_t N)
+{
+ const double *ws = (const double*) p->ws;
+ /* sizes above 128 recurse into five sub-blocks; 128, 64 and 32 are unrolled by hand below */
+ if (N > 128) {
+ const size_t N1 = N >> 1;
+ const size_t N2 = N >> 2;
+ const size_t N3 = N >> 3;
+ /* sub-block sizes are N/4, N/8, N/8, N/4, N/4 at offsets 0, N/2, 3N/4, N and 3N/2 doubles */
+ ffts_static_rec_f_64f(p, data , N2);
+ ffts_static_rec_f_64f(p, data + N1 , N3);
+ ffts_static_rec_f_64f(p, data + N1 + N2, N3);
+ ffts_static_rec_f_64f(p, data + N , N2);
+ ffts_static_rec_f_64f(p, data + N + N1 , N2);
+ /* combine the sub-blocks; the table offset is ws_is[log2(N) - 4] doubled */
+ V4DF_X_8(0, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1));
+ } else if (N == 128) {
+ const double *ws1 = ws + (p->ws_is[1] << 1);
+ /* same five sub-blocks as the recursive case, expanded inline: sizes 32, 16, 16, 32, 32 */
+ V4DF_X_8(0, data + 0, 32, ws1);
+ V4DF_X_4(0, data + 64, 16, ws);
+ V4DF_X_4(0, data + 96, 16, ws);
+ V4DF_X_8(0, data + 128, 32, ws1);
+ V4DF_X_8(0, data + 192, 32, ws1);
+
+ V4DF_X_8(0, data, 128, ws + (p->ws_is[3] << 1));
+ } else if (N == 64) {
+ V4DF_X_4(0, data + 0, 16, ws);
+ V4DF_X_4(0, data + 64, 16, ws);
+ V4DF_X_4(0, data + 96, 16, ws);
+ /* three size-16 X_4 passes followed by one size-64 X_8 pass */
+ V4DF_X_8(0, data, 64, ws + (p->ws_is[2] << 1));
+ } else {
+ assert(N == 32); /* NOTE(review): sizes below 32 are not handled here - confirm callers route smaller N elsewhere */
+ V4DF_X_8(0, data, 32, ws + (p->ws_is[1] << 1));
+ }
+}
+#endif
+
static void
ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
{
@@ -1111,6 +1520,47 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
#endif
}
+#ifdef FFTS_DOUBLE /* double variant of ffts_static_rec_i_32f (all passes run with inv = 1); compiled only when FFTS_DOUBLE is defined */
+static void
+ffts_static_rec_i_64f(const ffts_plan_t *p, double *data, size_t N)
+{
+ const double *ws = (const double*) p->ws;
+ /* sizes above 128 recurse into five sub-blocks; 128, 64 and 32 are unrolled by hand below */
+ if (N > 128) {
+ const size_t N1 = N >> 1;
+ const size_t N2 = N >> 2;
+ const size_t N3 = N >> 3;
+ /* sub-block sizes are N/4, N/8, N/8, N/4, N/4 at offsets 0, N/2, 3N/4, N and 3N/2 doubles */
+ ffts_static_rec_i_64f(p, data , N2);
+ ffts_static_rec_i_64f(p, data + N1 , N3);
+ ffts_static_rec_i_64f(p, data + N1 + N2, N3);
+ ffts_static_rec_i_64f(p, data + N , N2);
+ ffts_static_rec_i_64f(p, data + N + N1 , N2);
+ /* combine the sub-blocks; the table offset is ws_is[log2(N) - 4] doubled */
+ V4DF_X_8(1, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1));
+ } else if (N == 128) {
+ const double *ws1 = ws + (p->ws_is[1] << 1);
+ /* same five sub-blocks as the recursive case, expanded inline: sizes 32, 16, 16, 32, 32 */
+ V4DF_X_8(1, data + 0, 32, ws1);
+ V4DF_X_4(1, data + 64, 16, ws);
+ V4DF_X_4(1, data + 96, 16, ws);
+ V4DF_X_8(1, data + 128, 32, ws1);
+ V4DF_X_8(1, data + 192, 32, ws1);
+
+ V4DF_X_8(1, data, 128, ws + (p->ws_is[3] << 1));
+ } else if (N == 64) {
+ V4DF_X_4(1, data + 0, 16, ws);
+ V4DF_X_4(1, data + 64, 16, ws);
+ V4DF_X_4(1, data + 96, 16, ws);
+ /* three size-16 X_4 passes followed by one size-64 X_8 pass */
+ V4DF_X_8(1, data, 64, ws + (p->ws_is[2] << 1));
+ } else {
+ assert(N == 32); /* NOTE(review): sizes below 32 are not handled here - confirm callers route smaller N elsewhere */
+ V4DF_X_8(1, data, 32, ws + (p->ws_is[1] << 1));
+ }
+}
+#endif
+
void
ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
{
@@ -1172,6 +1622,26 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
#endif
}
+#ifdef FFTS_DOUBLE /* double-precision static transform entry point using inv = 0; compiled only when FFTS_DOUBLE is defined */
+void
+ffts_static_transform_f_64f(ffts_plan_t *p, const void *in, void *out)
+{
+ const double *din = (const double*) in;
+ double *dout = (double*) out;
+ /* N_log_2 is the trailing-zero count of N; its parity picks the first-pass variant */
+ const size_t N = p->N;
+ const int N_log_2 = ffts_ctzl(N);
+ /* first pass reads din and writes dout */
+ if (N_log_2 & 1) {
+ ffts_static_firstpass_odd_64f(dout, din, p, 0);
+ } else {
+ ffts_static_firstpass_even_64f(dout, din, p, 0);
+ }
+ /* remaining passes run in place on dout; NOTE(review): the recursion asserts N >= 32 - confirm smaller sizes are dispatched to the ffts_small_* routines */
+ ffts_static_rec_f_64f(p, dout, N);
+}
+#endif
+
void
ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
{
@@ -1231,4 +1701,24 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
ffts_static_rec_i_32f(p, dout, N);
#endif
-} \ No newline at end of file
+}
+
+#ifdef FFTS_DOUBLE
+void
+ffts_static_transform_i_64f(ffts_plan_t *p, const void *in, void *out)
+{
+ const double *din = (const double*) in;
+ double *dout = (double*) out;
+ /* double-precision static transform entry point using inv = 1; N_log_2 is the trailing-zero count of N */
+ const size_t N = p->N;
+ const int N_log_2 = ffts_ctzl(N);
+ /* parity of N_log_2 picks the first-pass variant, which reads din and writes dout */
+ if (N_log_2 & 1) {
+ ffts_static_firstpass_odd_64f(dout, din, p, 1);
+ } else {
+ ffts_static_firstpass_even_64f(dout, din, p, 1);
+ }
+ /* remaining passes run in place on dout; NOTE(review): the recursion asserts N >= 32 - confirm smaller sizes are dispatched to the ffts_small_* routines */
+ ffts_static_rec_i_64f(p, dout, N);
+}
+#endif \ No newline at end of file
diff --git a/lib/ffts/src/ffts_static.h b/lib/ffts/src/ffts_static.h
index 5a42fc2..5de0059 100644
--- a/lib/ffts/src/ffts_static.h
+++ b/lib/ffts/src/ffts_static.h
@@ -43,49 +43,73 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
void
ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out);
+#ifdef FFTS_DOUBLE
void
ffts_small_2_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
void
ffts_small_forward4_32f(ffts_plan_t *p, const void *in, void *out);
+#ifdef FFTS_DOUBLE
void
ffts_small_forward4_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
void
ffts_small_backward4_32f(ffts_plan_t *p, const void *in, void *out);
+#ifdef FFTS_DOUBLE
void
ffts_small_backward4_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
void
ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out);
+#ifdef FFTS_DOUBLE
void
ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
void
ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out);
+#ifdef FFTS_DOUBLE
void
ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
void
ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out);
+#ifdef FFTS_DOUBLE
void
ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
void
ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out);
+#ifdef FFTS_DOUBLE
void
ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
void
ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out);
+#ifdef FFTS_DOUBLE
+void
+ffts_static_transform_f_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
+
void
ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out);
+#ifdef FFTS_DOUBLE
+void
+ffts_static_transform_i_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
+
#endif /* FFTS_STATIC_H */
diff --git a/lib/ffts/src/ffts_trig.c b/lib/ffts/src/ffts_trig.c
index 74ebfd2..65efa86 100644
--- a/lib/ffts/src/ffts_trig.c
+++ b/lib/ffts/src/ffts_trig.c
@@ -2,7 +2,7 @@
This file is part of FFTS -- The Fastest Fourier Transform in the South
-Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2015-2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@@ -33,193 +33,707 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ffts_trig.h"
#include "ffts_dd.h"
+/*
+* For more information on algorithms:
+*
+* D. Potts, G. Steidl, M. Tasche, Numerical stability of fast
+* trigonometric transforms — a worst case study,
+* J. Concrete Appl. Math. 1 (2003) 1–36
+*
+* O. Buneman, Stable on–line creation of sines and cosines of
+* successive angles, Proc. IEEE 75, 1434 – 1435 (1987).
+*/
+
+/* A union to initialize doubles from their bit representation
+* (two 32-bit words), and to avoid breaking strict-aliasing rules
+*/
+
+/* TODO: we need macros to take care of endianness (initializers list the low word first); words are unsigned so constants >= 0x80000000 fit */
+typedef union ffts_double {
+ uint32_t i[2];
+ double d;
+} ffts_double_t;
+
/* 1/(2*cos(pow(2,-p)*pi)) */
-static const FFTS_ALIGN(16) unsigned int half_secant[132] = {
- 0x00000000, 0x3fe00000, 0xc9be45de, 0x3be3bd3c,
- 0x00000000, 0x3fe00000, 0xc9be45de, 0x3c03bd3c,
- 0x00000000, 0x3fe00000, 0xc9be45de, 0x3c23bd3c,
- 0x00000000, 0x3fe00000, 0xc9be45de, 0x3c43bd3c,
- 0x00000000, 0x3fe00000, 0xc9be45de, 0x3c63bd3c,
- 0x00000000, 0x3fe00000, 0xc9be45df, 0x3c83bd3c,
- 0x00000001, 0x3fe00000, 0x4df22efd, 0x3c7de9e6,
- 0x00000005, 0x3fe00000, 0x906e8725, 0xbc60b0cd,
- 0x00000014, 0x3fe00000, 0x906e8357, 0xbc80b0cd,
- 0x0000004f, 0x3fe00000, 0x0dce83c9, 0xbc5619b2,
- 0x0000013c, 0x3fe00000, 0x0dc6e79a, 0xbc7619b2,
- 0x000004ef, 0x3fe00000, 0xe4af1240, 0x3c83cc9b,
- 0x000013bd, 0x3fe00000, 0x2d14c08a, 0x3c7e64df,
- 0x00004ef5, 0x3fe00000, 0x47a85465, 0xbc59b20b,
- 0x00013bd4, 0x3fe00000, 0xab79c897, 0xbc79b203,
- 0x0004ef4f, 0x3fe00000, 0x15019a96, 0x3c79386b,
- 0x0013bd3d, 0x3fe00000, 0x7d6dbf4b, 0xbc7b16b7,
- 0x004ef4f3, 0x3fe00000, 0xf30832e0, 0x3c741ee4,
- 0x013bd3cd, 0x3fe00000, 0xd3bcd4bb, 0xbc83f41e,
- 0x04ef4f34, 0x3fe00000, 0xdd75aebb, 0xbc82ef06,
- 0x13bd3cde, 0x3fe00000, 0xb2b41b3d, 0x3c52d979,
- 0x4ef4f46c, 0x3fe00000, 0x4f0fb458, 0xbc851db3,
- 0x3bd3e0e7, 0x3fe00001, 0x8a0ce3f0, 0x3c58dbab,
- 0xef507722, 0x3fe00004, 0x2a8ec295, 0x3c83e351,
- 0xbd5114f9, 0x3fe00013, 0xc4c0d92d, 0x3c8b3ca4,
- 0xf637de7d, 0x3fe0004e, 0xb74de729, 0x3c45974e,
- 0xe8190891, 0x3fe0013b, 0x26edf4da, 0xbc814c20,
- 0x9436640e, 0x3fe004f0, 0xe2b34b50, 0x3c8091ab,
- 0x9c61d971, 0x3fe013d1, 0x6ce01b8e, 0x3c7f7df7,
- 0xd17cba53, 0x3fe0503e, 0x74ad7633, 0xbc697609,
- 0x7bdb3895, 0x3fe1517a, 0x82f9091b, 0xbc8008d1,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000
+static const FFTS_ALIGN(16) ffts_double_t half_secant[66] = {
+ { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3be3bd3c } },
+ { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c03bd3c } },
+ { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c23bd3c } },
+ { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c43bd3c } },
+ { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45de, 0x3c63bd3c } },
+ { { 0x00000000, 0x3fe00000 } }, { { 0xc9be45df, 0x3c83bd3c } },
+ { { 0x00000001, 0x3fe00000 } }, { { 0x4df22efd, 0x3c7de9e6 } },
+ { { 0x00000005, 0x3fe00000 } }, { { 0x906e8725, 0xbc60b0cd } },
+ { { 0x00000014, 0x3fe00000 } }, { { 0x906e8357, 0xbc80b0cd } },
+ { { 0x0000004f, 0x3fe00000 } }, { { 0x0dce83c9, 0xbc5619b2 } },
+ { { 0x0000013c, 0x3fe00000 } }, { { 0x0dc6e79a, 0xbc7619b2 } },
+ { { 0x000004ef, 0x3fe00000 } }, { { 0xe4af1240, 0x3c83cc9b } },
+ { { 0x000013bd, 0x3fe00000 } }, { { 0x2d14c08a, 0x3c7e64df } },
+ { { 0x00004ef5, 0x3fe00000 } }, { { 0x47a85465, 0xbc59b20b } },
+ { { 0x00013bd4, 0x3fe00000 } }, { { 0xab79c897, 0xbc79b203 } },
+ { { 0x0004ef4f, 0x3fe00000 } }, { { 0x15019a96, 0x3c79386b } },
+ { { 0x0013bd3d, 0x3fe00000 } }, { { 0x7d6dbf4b, 0xbc7b16b7 } },
+ { { 0x004ef4f3, 0x3fe00000 } }, { { 0xf30832e0, 0x3c741ee4 } },
+ { { 0x013bd3cd, 0x3fe00000 } }, { { 0xd3bcd4bb, 0xbc83f41e } },
+ { { 0x04ef4f34, 0x3fe00000 } }, { { 0xdd75aebb, 0xbc82ef06 } },
+ { { 0x13bd3cde, 0x3fe00000 } }, { { 0xb2b41b3d, 0x3c52d979 } },
+ { { 0x4ef4f46c, 0x3fe00000 } }, { { 0x4f0fb458, 0xbc851db3 } },
+ { { 0x3bd3e0e7, 0x3fe00001 } }, { { 0x8a0ce3f0, 0x3c58dbab } },
+ { { 0xef507722, 0x3fe00004 } }, { { 0x2a8ec295, 0x3c83e351 } },
+ { { 0xbd5114f9, 0x3fe00013 } }, { { 0xc4c0d92d, 0x3c8b3ca4 } },
+ { { 0xf637de7d, 0x3fe0004e } }, { { 0xb74de729, 0x3c45974e } },
+ { { 0xe8190891, 0x3fe0013b } }, { { 0x26edf4da, 0xbc814c20 } },
+ { { 0x9436640e, 0x3fe004f0 } }, { { 0xe2b34b50, 0x3c8091ab } },
+ { { 0x9c61d971, 0x3fe013d1 } }, { { 0x6ce01b8e, 0x3c7f7df7 } },
+ { { 0xd17cba53, 0x3fe0503e } }, { { 0x74ad7633, 0xbc697609 } },
+ { { 0x7bdb3895, 0x3fe1517a } }, { { 0x82f9091b, 0xbc8008d1 } },
+ { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } }
};
/* cos(pow(2,-p)*pi), sin(pow(2,-p)*pi) */
-static const FFTS_ALIGN(16) unsigned int cos_sin_pi_table[264] = {
- 0x00000000, 0x3ff00000, 0x54442d18, 0x3df921fb,
- 0xc9be45de, 0xbbf3bd3c, 0xbb77974f, 0x3a91a390,
- 0x00000000, 0x3ff00000, 0x54442d18, 0x3e0921fb,
- 0xc9be45de, 0xbc13bd3c, 0x54a14928, 0x3aa19bd0,
- 0x00000000, 0x3ff00000, 0x54442d18, 0x3e1921fb,
- 0xc9be45de, 0xbc33bd3c, 0xb948108a, 0x3ab17cce,
- 0x00000000, 0x3ff00000, 0x54442d18, 0x3e2921fb,
- 0xc9be45de, 0xbc53bd3c, 0x4be32e14, 0x3ac100c8,
- 0x00000000, 0x3ff00000, 0x54442d18, 0x3e3921fb,
- 0xc9be45de, 0xbc73bd3c, 0x2c9f4879, 0x3ace215d,
- 0xffffffff, 0x3fefffff, 0x54442d18, 0x3e4921fb,
- 0x6c837443, 0x3c888586, 0x0005f376, 0x3acd411f,
- 0xfffffffe, 0x3fefffff, 0x54442d18, 0x3e5921fb,
- 0x4df22ef1, 0xbc8de9e6, 0x9937209e, 0xbaf7b153,
- 0xfffffff6, 0x3fefffff, 0x54442d16, 0x3e6921fb,
- 0x906e88aa, 0x3c70b0cd, 0xfe19968a, 0xbb03b7c0,
- 0xffffffd9, 0x3fefffff, 0x54442d0e, 0x3e7921fb,
- 0xdf22ed26, 0xbc8e9e64, 0x8d1b6ffb, 0xbaee8bb4,
- 0xffffff62, 0x3fefffff, 0x54442cef, 0x3e8921fb,
- 0x0dd18f0f, 0x3c6619b2, 0x7f2b20fb, 0xbb00e133,
- 0xfffffd88, 0x3fefffff, 0x54442c73, 0x3e9921fb,
- 0x0dd314b2, 0x3c8619b2, 0x619fdf6e, 0xbb174e98,
- 0xfffff621, 0x3fefffff, 0x54442a83, 0x3ea921fb,
- 0x3764acf5, 0x3c8866c8, 0xf5b2407f, 0xbb388215,
- 0xffffd886, 0x3fefffff, 0x544422c2, 0x3eb921fb,
- 0x20e7a944, 0xbc8e64df, 0x7b9b9f23, 0x3b5a0961,
- 0xffff6216, 0x3fefffff, 0x544403c1, 0x3ec921fb,
- 0x52ee25ea, 0x3c69b20e, 0x4df6a86a, 0xbb5999d9,
- 0xfffd8858, 0x3fefffff, 0x544387ba, 0x3ed921fb,
- 0xd8910ead, 0x3c89b20f, 0x0809d04d, 0x3b77d9db,
- 0xfff62162, 0x3fefffff, 0x544197a1, 0x3ee921fb,
- 0x438d3925, 0xbc8937a8, 0xa5d27f7a, 0xbb858b02,
- 0xffd88586, 0x3fefffff, 0x5439d73a, 0x3ef921fb,
- 0x94b3ddd2, 0x3c8b22e4, 0xf8a3b73d, 0xbb863c7f,
- 0xff62161a, 0x3fefffff, 0x541ad59e, 0x3f0921fb,
- 0x7ea469b2, 0xbc835c13, 0xb8cee262, 0x3bae9860,
- 0xfd885867, 0x3fefffff, 0x539ecf31, 0x3f1921fb,
- 0x23a32e63, 0xbc77d556, 0xfcd23a30, 0x3b96b111,
- 0xf621619c, 0x3fefffff, 0x51aeb57c, 0x3f2921fb,
- 0xbbbd8fe6, 0xbc87507d, 0x4916c435, 0xbbca6e1d,
- 0xd8858675, 0x3fefffff, 0x49ee4ea6, 0x3f3921fb,
- 0x54748eab, 0xbc879f0e, 0x744a453e, 0x3bde894d,
- 0x62161a34, 0x3fefffff, 0x2aecb360, 0x3f4921fb,
- 0xb1f9b9c4, 0xbc6136dc, 0x7e566b4c, 0x3be87615,
- 0x88586ee6, 0x3feffffd, 0xaee6472e, 0x3f5921fa,
- 0xf173ae5b, 0x3c81af64, 0x284a9df8, 0xbbfee52e,
- 0x21621d02, 0x3feffff6, 0xbecca4ba, 0x3f6921f8,
- 0xebc82813, 0xbc76acfc, 0x7bcab5b2, 0x3c02ba40,
- 0x858e8a92, 0x3fefffd8, 0xfe670071, 0x3f7921f0,
- 0x1883bcf7, 0x3c8359c7, 0xfe6b7a9b, 0x3bfab967,
- 0x169b92db, 0x3fefff62, 0xfcdec784, 0x3f8921d1,
- 0xc81fbd0d, 0x3c85dda3, 0xbe836d9d, 0x3c29878e,
- 0x6084cd0d, 0x3feffd88, 0xf7a3667e, 0x3f992155,
- 0x4556e4cb, 0xbc81354d, 0x091a0130, 0xbbfb1d63,
- 0xe3796d7e, 0x3feff621, 0xf10dd814, 0x3fa91f65,
- 0x2e24aa15, 0xbc6c57bc, 0x0d569a90, 0xbc2912bd,
- 0xa3d12526, 0x3fefd88d, 0xbc29b42c, 0x3fb917a6,
- 0x378811c7, 0xbc887df6, 0xd26ed688, 0xbc3e2718,
- 0xcff75cb0, 0x3fef6297, 0x3c69a60b, 0x3fc8f8b8,
- 0x2a361fd3, 0x3c756217, 0xb9ff8d82, 0xbc626d19,
- 0xcf328d46, 0x3fed906b, 0xa6aea963, 0x3fd87de2,
- 0x10231ac2, 0x3c7457e6, 0xd3d5a610, 0xbc672ced,
- 0x667f3bcd, 0x3fe6a09e, 0x667f3bcd, 0x3fe6a09e,
- 0x13b26456, 0xbc8bdd34, 0x13b26456, 0xbc8bdd34,
- 0x00000000, 0x00000000, 0x00000000, 0x3ff00000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000
+static const FFTS_ALIGN(32) ffts_double_t cos_sin_pi_table[132] = {
+ { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3df921fb } },
+ { { 0xc9be45de, 0xbbf3bd3c } }, { { 0xbb77974f, 0x3a91a390 } },
+ { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e0921fb } },
+ { { 0xc9be45de, 0xbc13bd3c } }, { { 0x54a14928, 0x3aa19bd0 } },
+ { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e1921fb } },
+ { { 0xc9be45de, 0xbc33bd3c } }, { { 0xb948108a, 0x3ab17cce } },
+ { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e2921fb } },
+ { { 0xc9be45de, 0xbc53bd3c } }, { { 0x4be32e14, 0x3ac100c8 } },
+ { { 0x00000000, 0x3ff00000 } }, { { 0x54442d18, 0x3e3921fb } },
+ { { 0xc9be45de, 0xbc73bd3c } }, { { 0x2c9f4879, 0x3ace215d } },
+ { { 0xffffffff, 0x3fefffff } }, { { 0x54442d18, 0x3e4921fb } },
+ { { 0x6c837443, 0x3c888586 } }, { { 0x0005f376, 0x3acd411f } },
+ { { 0xfffffffe, 0x3fefffff } }, { { 0x54442d18, 0x3e5921fb } },
+ { { 0x4df22ef1, 0xbc8de9e6 } }, { { 0x9937209e, 0xbaf7b153 } },
+ { { 0xfffffff6, 0x3fefffff } }, { { 0x54442d16, 0x3e6921fb } },
+ { { 0x906e88aa, 0x3c70b0cd } }, { { 0xfe19968a, 0xbb03b7c0 } },
+ { { 0xffffffd9, 0x3fefffff } }, { { 0x54442d0e, 0x3e7921fb } },
+ { { 0xdf22ed26, 0xbc8e9e64 } }, { { 0x8d1b6ffb, 0xbaee8bb4 } },
+ { { 0xffffff62, 0x3fefffff } }, { { 0x54442cef, 0x3e8921fb } },
+ { { 0x0dd18f0f, 0x3c6619b2 } }, { { 0x7f2b20fb, 0xbb00e133 } },
+ { { 0xfffffd88, 0x3fefffff } }, { { 0x54442c73, 0x3e9921fb } },
+ { { 0x0dd314b2, 0x3c8619b2 } }, { { 0x619fdf6e, 0xbb174e98 } },
+ { { 0xfffff621, 0x3fefffff } }, { { 0x54442a83, 0x3ea921fb } },
+ { { 0x3764acf5, 0x3c8866c8 } }, { { 0xf5b2407f, 0xbb388215 } },
+ { { 0xffffd886, 0x3fefffff } }, { { 0x544422c2, 0x3eb921fb } },
+ { { 0x20e7a944, 0xbc8e64df } }, { { 0x7b9b9f23, 0x3b5a0961 } },
+ { { 0xffff6216, 0x3fefffff } }, { { 0x544403c1, 0x3ec921fb } },
+ { { 0x52ee25ea, 0x3c69b20e } }, { { 0x4df6a86a, 0xbb5999d9 } },
+ { { 0xfffd8858, 0x3fefffff } }, { { 0x544387ba, 0x3ed921fb } },
+ { { 0xd8910ead, 0x3c89b20f } }, { { 0x0809d04d, 0x3b77d9db } },
+ { { 0xfff62162, 0x3fefffff } }, { { 0x544197a1, 0x3ee921fb } },
+ { { 0x438d3925, 0xbc8937a8 } }, { { 0xa5d27f7a, 0xbb858b02 } },
+ { { 0xffd88586, 0x3fefffff } }, { { 0x5439d73a, 0x3ef921fb } },
+ { { 0x94b3ddd2, 0x3c8b22e4 } }, { { 0xf8a3b73d, 0xbb863c7f } },
+ { { 0xff62161a, 0x3fefffff } }, { { 0x541ad59e, 0x3f0921fb } },
+ { { 0x7ea469b2, 0xbc835c13 } }, { { 0xb8cee262, 0x3bae9860 } },
+ { { 0xfd885867, 0x3fefffff } }, { { 0x539ecf31, 0x3f1921fb } },
+ { { 0x23a32e63, 0xbc77d556 } }, { { 0xfcd23a30, 0x3b96b111 } },
+ { { 0xf621619c, 0x3fefffff } }, { { 0x51aeb57c, 0x3f2921fb } },
+ { { 0xbbbd8fe6, 0xbc87507d } }, { { 0x4916c435, 0xbbca6e1d } },
+ { { 0xd8858675, 0x3fefffff } }, { { 0x49ee4ea6, 0x3f3921fb } },
+ { { 0x54748eab, 0xbc879f0e } }, { { 0x744a453e, 0x3bde894d } },
+ { { 0x62161a34, 0x3fefffff } }, { { 0x2aecb360, 0x3f4921fb } },
+ { { 0xb1f9b9c4, 0xbc6136dc } }, { { 0x7e566b4c, 0x3be87615 } },
+ { { 0x88586ee6, 0x3feffffd } }, { { 0xaee6472e, 0x3f5921fa } },
+ { { 0xf173ae5b, 0x3c81af64 } }, { { 0x284a9df8, 0xbbfee52e } },
+ { { 0x21621d02, 0x3feffff6 } }, { { 0xbecca4ba, 0x3f6921f8 } },
+ { { 0xebc82813, 0xbc76acfc } }, { { 0x7bcab5b2, 0x3c02ba40 } },
+ { { 0x858e8a92, 0x3fefffd8 } }, { { 0xfe670071, 0x3f7921f0 } },
+ { { 0x1883bcf7, 0x3c8359c7 } }, { { 0xfe6b7a9b, 0x3bfab967 } },
+ { { 0x169b92db, 0x3fefff62 } }, { { 0xfcdec784, 0x3f8921d1 } },
+ { { 0xc81fbd0d, 0x3c85dda3 } }, { { 0xbe836d9d, 0x3c29878e } },
+ { { 0x6084cd0d, 0x3feffd88 } }, { { 0xf7a3667e, 0x3f992155 } },
+ { { 0x4556e4cb, 0xbc81354d } }, { { 0x091a0130, 0xbbfb1d63 } },
+ { { 0xe3796d7e, 0x3feff621 } }, { { 0xf10dd814, 0x3fa91f65 } },
+ { { 0x2e24aa15, 0xbc6c57bc } }, { { 0x0d569a90, 0xbc2912bd } },
+ { { 0xa3d12526, 0x3fefd88d } }, { { 0xbc29b42c, 0x3fb917a6 } },
+ { { 0x378811c7, 0xbc887df6 } }, { { 0xd26ed688, 0xbc3e2718 } },
+ { { 0xcff75cb0, 0x3fef6297 } }, { { 0x3c69a60b, 0x3fc8f8b8 } },
+ { { 0x2a361fd3, 0x3c756217 } }, { { 0xb9ff8d82, 0xbc626d19 } },
+ { { 0xcf328d46, 0x3fed906b } }, { { 0xa6aea963, 0x3fd87de2 } },
+ { { 0x10231ac2, 0x3c7457e6 } }, { { 0xd3d5a610, 0xbc672ced } },
+ { { 0x667f3bcd, 0x3fe6a09e } }, { { 0x667f3bcd, 0x3fe6a09e } },
+ { { 0x13b26456, 0xbc8bdd34 } }, { { 0x13b26456, 0xbc8bdd34 } },
+ { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x3ff00000 } },
+ { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } }
+};
+
+#define COS_SIN_TABLE_SIZE 260
+
+/* cos(pi*k/256), sin(pi*k/256) */
+static const FFTS_ALIGN(32) ffts_double_t cos_sin_table[COS_SIN_TABLE_SIZE] = {
+ { { 0x00000000, 0x3FF00000 } }, { { 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000 } },
+ { { 0x169B92DB, 0x3FEFFF62 } }, { { 0xFCDEC784, 0x3F8921D1 } },
+ { { 0xC81FBD0D, 0x3C85DDA3 } }, { { 0xBE836D9D, 0x3C29878E } },
+ { { 0x6084CD0D, 0x3FEFFD88 } }, { { 0xF7A3667E, 0x3F992155 } },
+ { { 0x4556E4CB, 0xBC81354D } }, { { 0x091A0130, 0xBBFB1D63 } },
+ { { 0xEFFEF75D, 0x3FEFFA72 } }, { { 0x759455CD, 0x3FA2D865 } },
+ { { 0xCDB25956, 0xBC88B4CD } }, { { 0x5BA93AC0, 0x3C2686F6 } },
+ { { 0xE3796D7E, 0x3FEFF621 } }, { { 0xF10DD814, 0x3FA91F65 } },
+ { { 0x2E24AA15, 0xBC6C57BC } }, { { 0x0D569A90, 0xBC2912BD } },
+ { { 0x658E71AD, 0x3FEFF095 } }, { { 0x79F820E0, 0x3FAF656E } },
+ { { 0xE18A4B9E, 0x3C801A8C } }, { { 0xE392BFFE, 0xBC22E1EB } },
+ { { 0xAD01883A, 0x3FEFE9CD } }, { { 0x92CE19F6, 0x3FB2D520 } },
+ { { 0xD0C67E35, 0x3C6521EC } }, { { 0xA8BF6B2C, 0xBC49A088 } },
+ { { 0xFCBD5B09, 0x3FEFE1CA } }, { { 0x0A9AA419, 0x3FB5F6D0 } },
+ { { 0x202A884E, 0x3C6A23E3 } }, { { 0xD03F6C9A, 0xBC4F4022 } },
+ { { 0xA3D12526, 0x3FEFD88D } }, { { 0xBC29B42C, 0x3FB917A6 } },
+ { { 0x378811C7, 0xBC887DF6 } }, { { 0xD26ED688, 0xBC3E2718 } },
+ { { 0xFD6DA67B, 0x3FEFCE15 } }, { { 0xC79EC2D5, 0x3FBC3785 } },
+ { { 0x830D4C09, 0xBC75DD6F } }, { { 0xF133FB21, 0xBC24F39D } },
+ { { 0x70E19FD3, 0x3FEFC264 } }, { { 0x56A9730E, 0x3FBF564E } },
+ { { 0x68ECACEE, 0x3C81EC86 } }, { { 0x729AE56D, 0x3C4A2704 } },
+ { { 0x7195D741, 0x3FEFB579 } }, { { 0xCEDAF577, 0x3FC139F0 } },
+ { { 0x7397CC08, 0x3C71BFAC } }, { { 0x4D1B3CFA, 0xBC652343 } },
+ { { 0x7F08A517, 0x3FEFA755 } }, { { 0x6E8E613A, 0x3FC2C810 } },
+ { { 0xCA13571F, 0xBC87A0A8 } }, { { 0xA89A11E0, 0x3C513000 } },
+ { { 0x24C9099B, 0x3FEF97F9 } }, { { 0xB1293E5A, 0x3FC45576 } },
+ { { 0xEEA5963B, 0xBC8E2AE0 } }, { { 0x4119F7B1, 0xBC5285A2 } },
+ { { 0xFA714BA9, 0x3FEF8764 } }, { { 0x448B3FC6, 0x3FC5E214 } },
+ { { 0x778FFCB6, 0x3C7AB256 } }, { { 0x779DDAC6, 0x3C6531FF } },
+ { { 0xA3A12077, 0x3FEF7599 } }, { { 0xDE50BF31, 0x3FC76DD9 } },
+ { { 0xD743195C, 0x3C884F31 } }, { { 0xEC501B2F, 0x3C61D5EE } },
+ { { 0xCFF75CB0, 0x3FEF6297 } }, { { 0x3C69A60B, 0x3FC8F8B8 } },
+ { { 0x2A361FD3, 0x3C756217 } }, { { 0xB9FF8D82, 0xBC626D19 } },
+ { { 0x3B0B2F2D, 0x3FEF4E60 } }, { { 0x25B00451, 0x3FCA82A0 } },
+ { { 0xE695AC05, 0xBC78EE01 } }, { { 0xFFD084AD, 0xBC687905 } },
+ { { 0xAC64E589, 0x3FEF38F3 } }, { { 0x6A7E4F63, 0x3FCC0B82 } },
+ { { 0xB51F72E6, 0xBC7D7BAF } }, { { 0x9E521935, 0xBC1AF143 } },
+ { { 0xF7763ADA, 0x3FEF2252 } }, { { 0xE5454311, 0x3FCD934F } },
+ { { 0x1C8D94AB, 0xBC820CB8 } }, { { 0x277107AD, 0x3C675B92 } },
+ { { 0xFB9230D7, 0x3FEF0A7E } }, { { 0x7B215F1B, 0x3FCF19F9 } },
+ { { 0xDC6B4989, 0x3C752C7A } }, { { 0xF11DA2C4, 0xBC642DEE } },
+ { { 0xA3E473C2, 0x3FEEF178 } }, { { 0x0E37FDAE, 0x3FD04FB8 } },
+ { { 0x67FE774F, 0x3C86310A } }, { { 0xB72583CC, 0xBC0412CD } },
+ { { 0xE7684963, 0x3FEED740 } }, { { 0x62B1F677, 0x3FD111D2 } },
+ { { 0x91F59CC2, 0x3C7E82C7 } }, { { 0x0AB7AA9A, 0x3C7824C2 } },
+ { { 0xC8DF0B74, 0x3FEEBBD8 } }, { { 0x3F4CDB3E, 0x3FD1D344 } },
+ { { 0x615E7277, 0x3C7C6C8C } }, { { 0x1C13519E, 0xBC6720D4 } },
+ { { 0x56C62DDA, 0x3FEE9F41 } }, { { 0x2ED59F06, 0x3FD29406 } },
+ { { 0xE2E3F81E, 0x3C8760B1 } }, { { 0xA2C4612D, 0xBC75D28D } },
+ { { 0xAB4CD10D, 0x3FEE817B } }, { { 0xC2E18152, 0x3FD35410 } },
+ { { 0x686B5E0A, 0xBC7D0AFE } }, { { 0x2F96E062, 0xBC73CB00 } },
+ { { 0xEC48E112, 0x3FEE6288 } }, { { 0x94176601, 0x3FD4135C } },
+ { { 0xF2847754, 0xBC616B56 } }, { { 0x4AFA2518, 0x3C70C97C } },
+ { { 0x4B2BC17E, 0x3FEE426A } }, { { 0x4278E76A, 0x3FD4D1E2 } },
+ { { 0x89744882, 0x3C8A8738 } }, { { 0x18792858, 0x3C624172 } },
+ { { 0x04F686E5, 0x3FEE2121 } }, { { 0x75AB1FDD, 0x3FD58F9A } },
+ { { 0x6C126527, 0xBC8014C7 } }, { { 0xD58CF620, 0xBC1EFDC0 } },
+ { { 0x622DBE2B, 0x3FEDFEAE } }, { { 0xDD3F27C6, 0x3FD64C7D } },
+ { { 0x88425567, 0xBC8514EA } }, { { 0x4A664121, 0x3C510D2B } },
+ { { 0xB6CCC23C, 0x3FEDDB13 } }, { { 0x30FA459F, 0x3FD70885 } },
+ { { 0xC6107DB3, 0x3C883C37 } }, { { 0xE0864C5D, 0xBC744B19 } },
+ { { 0x6238A09B, 0x3FEDB652 } }, { { 0x311DCCE7, 0x3FD7C3A9 } },
+ { { 0xEAE69460, 0xBC7ADEE7 } }, { { 0x1EF3E8D9, 0x3C19A3F2 } },
+ { { 0xCF328D46, 0x3FED906B } }, { { 0xA6AEA963, 0x3FD87DE2 } },
+ { { 0x10231AC2, 0x3C7457E6 } }, { { 0xD3D5A610, 0xBC672CED } },
+ { { 0x73C9E68B, 0x3FED6961 } }, { { 0x63BC93D7, 0x3FD9372A } },
+ { { 0xC6393D55, 0xBC7E8C61 } }, { { 0x9E5AD5B1, 0x3C668431 } },
+ { { 0xD14DC93A, 0x3FED4134 } }, { { 0x43A8ED8A, 0x3FD9EF79 } },
+ { { 0x95D25AF2, 0xBC84EF52 } }, { { 0x290BDBAB, 0x3C66DA81 } },
+ { { 0x743E35DC, 0x3FED17E7 } }, { { 0x2B6D3FCA, 0x3FDAA6C8 } },
+ { { 0x3540130A, 0xBC5101DA } }, { { 0x6EE5CCF7, 0xBC7D5F10 } },
+ { { 0xF43CC773, 0x3FECED7A } }, { { 0x09E15CC0, 0x3FDB5D10 } },
+ { { 0xB5AB58AE, 0xBC5E7B6B } }, { { 0xCB974183, 0x3C65B362 } },
+ { { 0xF3FCFC5C, 0x3FECC1F0 } }, { { 0xD8011EE7, 0x3FDC1249 } },
+ { { 0x3B68F6AB, 0x3C7E5761 } }, { { 0xBB515206, 0xBC7813AA } },
+ { { 0x213411F5, 0x3FEC954B } }, { { 0x9931C45E, 0x3FDCC66E } },
+ { { 0x1E946603, 0xBC52FB76 } }, { { 0x59C37F8F, 0x3C56850E } },
+ { { 0x3488739B, 0x3FEC678B } }, { { 0x5B86E389, 0x3FDD7977 } },
+ { { 0xC7C5FF5B, 0x3C6D86CA } }, { { 0x87BC0575, 0x3C7550EC } },
+ { { 0xF180BDB1, 0x3FEC38B2 } }, { { 0x3806F63B, 0x3FDE2B5D } },
+ { { 0x757C8D07, 0xBC76E0B1 } }, { { 0x1D3C6841, 0x3C5E0D89 } },
+ { { 0x26725549, 0x3FEC08C4 } }, { { 0x52EF78D6, 0x3FDEDC19 } },
+ { { 0xD80E2946, 0x3C5B157F } }, { { 0xC33EDEE6, 0xBC7DD0F7 } },
+ { { 0xAC6F952A, 0x3FEBD7C0 } }, { { 0xDBF89ABA, 0x3FDF8BA4 } },
+ { { 0x32AC700A, 0xBC8825A7 } }, { { 0xC1B776B8, 0xBC32EC1F } },
+ { { 0x673590D2, 0x3FEBA5AA } }, { { 0x874C3EB7, 0x3FE01CFC } },
+ { { 0x370753B6, 0x3C87EA4E } }, { { 0xE7C2368C, 0xBC734A35 } },
+ { { 0x45196E3E, 0x3FEB7283 } }, { { 0x9922FFEE, 0x3FE07387 } },
+ { { 0x324E6D61, 0xBC8BC69F } }, { { 0x4347406C, 0xBC8A5A01 } },
+ { { 0x3EF55712, 0x3FEB3E4D } }, { { 0x4D5D898F, 0x3FE0C970 } },
+ { { 0xBF11A493, 0xBC8EB6B8 } }, { { 0xDE6EE9B2, 0xBC88D3D7 } },
+ { { 0x58150200, 0x3FEB090A } }, { { 0x541B4B23, 0x3FE11EB3 } },
+ { { 0x300FFCCE, 0xBC8926DA } }, { { 0x69ABE4F1, 0xBC8EF23B } },
+ { { 0x9E21D511, 0x3FEAD2BC } }, { { 0x63DEDB49, 0x3FE1734D } },
+ { { 0x07BEA548, 0xBC847FBE } }, { { 0xCCC50575, 0xBC87EEF2 } },
+ { { 0x290EA1A3, 0x3FEA9B66 } }, { { 0x39AE68C8, 0x3FE1C73B } },
+ { { 0xE8B6DAC8, 0x3C39F630 } }, { { 0x267F6600, 0x3C8B25DD } },
+ { { 0x1B02FAE2, 0x3FEA6309 } }, { { 0x9933EB59, 0x3FE21A79 } },
+ { { 0x52248D10, 0xBC7E9111 } }, { { 0x77C68FB2, 0xBC83A7B1 } },
+ { { 0xA0462782, 0x3FEA29A7 } }, { { 0x4CDD12DF, 0x3FE26D05 } },
+ { { 0x015DF175, 0xBC7128BB } }, { { 0x3EF3770C, 0xBC85DA74 } },
+ { { 0xEF29AF94, 0x3FE9EF43 } }, { { 0x25FAF3EA, 0x3FE2BEDB } },
+ { { 0xB60445C2, 0x3C7B1DFC } }, { { 0xC796EE46, 0xBC514981 } },
+ { { 0x47F38741, 0x3FE9B3E0 } }, { { 0xFCE17035, 0x3FE30FF7 } },
+ { { 0x86712474, 0xBC830EE2 } }, { { 0x26F74A6F, 0xBC6EFCC6 } },
+ { { 0xF4C7D742, 0x3FE9777E } }, { { 0xB10659F3, 0x3FE36058 } },
+ { { 0xA240665E, 0xBC815479 } }, { { 0xA35857E7, 0xBC81FCB3 } },
+ { { 0x499263FB, 0x3FE93A22 } }, { { 0x292050B9, 0x3FE3AFFA } },
+ { { 0xA920DF0B, 0x3C83D419 } }, { { 0xE3954964, 0x3C7E3E25 } },
+ { { 0xA3EF940D, 0x3FE8FBCC } }, { { 0x534556D4, 0x3FE3FED9 } },
+ { { 0x9C86F2F1, 0xBC66DFA9 } }, { { 0x608C5061, 0x3C836916 } },
+ { { 0x6B151741, 0x3FE8BC80 } }, { { 0x25091DD6, 0x3FE44CF3 } },
+ { { 0x2ED1336D, 0xBC82C5E1 } }, { { 0x2CFDC6B3, 0x3C68076A } },
+ { { 0x0FBA2EBF, 0x3FE87C40 } }, { { 0x9B9B0939, 0x3FE49A44 } },
+ { { 0x0C3F64CD, 0xBC82DABC } }, { { 0x6D719B94, 0xBC827EE1 } },
+ { { 0x0BFF976E, 0x3FE83B0E } }, { { 0xBBE3E5E9, 0x3FE4E6CA } },
+ { { 0xF8EA3475, 0xBC76F420 } }, { { 0xEDCEB327, 0x3C63C293 } },
+ { { 0xE3571771, 0x3FE7F8EC } }, { { 0x92A35596, 0x3FE53282 } },
+ { { 0xCE93C917, 0xBC89C8D8 } }, { { 0x89DA0257, 0xBC7A12EB } },
+ { { 0x226AAFAF, 0x3FE7B5DF } }, { { 0x348CECA0, 0x3FE57D69 } },
+ { { 0xACDF0AD7, 0xBC70F537 } }, { { 0x992BFBB2, 0xBC875720 } },
+ { { 0x5F037261, 0x3FE771E7 } }, { { 0xBE65018C, 0x3FE5C77B } },
+ { { 0x8D84068F, 0x3C75CFCE } }, { { 0x9C0BC32A, 0x3C8069EA } },
+ { { 0x37EFFF96, 0x3FE72D08 } }, { { 0x551D2CDF, 0x3FE610B7 } },
+ { { 0x0F1D915C, 0x3C80D4EF } }, { { 0x52FF2A37, 0xBC7251B3 } },
+ { { 0x54EAA8AF, 0x3FE6E744 } }, { { 0x25F0783D, 0x3FE65919 } },
+ { { 0xC84E226E, 0xBC8DBC03 } }, { { 0xFBF5DE23, 0x3C8C3D64 } },
+ { { 0x667F3BCD, 0x3FE6A09E } }, { { 0x667F3BCD, 0x3FE6A09E } },
+ { { 0x13B26456, 0xBC8BDD34 } }, { { 0x13B26456, 0xBC8BDD34 } }
};
+/* coefficients of the polynomial in z = x * x used to approximate cos(pi * x) - 1, x=[0;1/512] */
+static const FFTS_ALIGN(16) ffts_double_t cos_coeff[3] = {
+ { { 0xC9BE45DE, 0xC013BD3C } },
+ { { 0x081749FA, 0x40103C1F } },
+ { { 0x047EE98B, 0xBFF55D10 } }
+};
+
+/* coefficients of the polynomial in z = x * x that is multiplied by x to approximate sin(pi * x), x=[0;1/512] */
+static const FFTS_ALIGN(16) ffts_double_t sin_coeff[4] = {
+ { { 0x54442D18, 0x400921FB } },
+ { { 0xE62154CA, 0xC014ABBC } },
+ { { 0xCEF16BFE, 0x40046675 } },
+ { { 0xADE54A87, 0x40339228 } }
+};
+
+#ifndef M_1_256
+#define M_1_256 3.90625e-3
+#endif
+
+static int
+ffts_cexp_32f64f(size_t n, size_t d, double *out);
+
+/* calculate cos(2 * pi * n / d) and sin(2 * pi * n / d) with maximum error less than 1 ULP, average ~0.5 ULP; returns 0, or -1 if d is zero or output is NULL */
int
-ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, int table_size)
+ffts_cexp_32f(size_t n, size_t d, float *output)
{
- double alpha, beta;
- double c[2], s[2];
- double x, z;
- int i;
+ double FFTS_ALIGN(16) z[2];
- if (!table || !table_size) {
+ if (!d || !output)
return -1;
+
+ /* reduction: bring n into [0, d), as the double precision helper assumes n < d */
+ if (FFTS_UNLIKELY(n >= d))
+ n %= d;
+
+ ffts_cexp_32f64f(n, d, z);
+
+ output[0] = (float) z[0];
+ output[1] = (float) z[1];
+ return 0;
+}
+
+/* compute cos(2*pi*n/d) into output[0] and sin(2*pi*n/d) into output[1] in double precision (requires n < d); used as intermediate result for single precision calculations */
+static int
+ffts_cexp_32f64f(size_t n, size_t d, double *output)
+{
+ const ffts_double_t *ct = (const ffts_double_t*) FFTS_ASSUME_ALIGNED_32(cos_sin_table);
+ const ffts_double_t *cc = (const ffts_double_t*) FFTS_ASSUME_ALIGNED_16(cos_coeff);
+ const ffts_double_t *sc = (const ffts_double_t*) FFTS_ASSUME_ALIGNED_16(sin_coeff);
+ double *out = FFTS_ASSUME_ALIGNED_16(output);
+ double c, s, cos_a, cos_b, sin_a, sin_b;
+ double cos_sign, sin_sign, sign, x, z;
+ int i, j, swap;
+
+ /* we know this */
+ FFTS_ASSUME(d > 0);
+ FFTS_ASSUME(n < d);
+
+ /* determine octant: fold the angle into the first octant, remembering signs and cos/sin swap */
+ if (n > d - n) {
+ sin_sign = -1.0;
+ n = d - n;
+ } else {
+ sin_sign = 1.0;
}
- /* the first */
- table[0][0] = 1.0f;
- table[0][1] = -0.0f;
+ n <<= 1;
+ if (n > d - n) {
+ cos_sign = -1.0;
+ swap = 1;
+ n += n - d;
+ } else {
+ cos_sign = 1.0;
+ swap = 0;
+ n <<= 1;
+ }
- if (FFTS_UNLIKELY(table_size == 1)) {
- goto exit;
+ if (n > d - n) {
+ swap ^= 1;
+ n = d - n;
}
- if (FFTS_UNLIKELY(table_size == 2)) {
- /* skip over */
- i = 1;
- goto mid_point;
+ /* "binary long division": extract six bits of n/d as the table index i */
+ for (i = 0, j = (1 << 5), n <<= 1; j && n; j >>= 1) {
+ if (n > d - n) {
+ n += n - d;
+ i += j;
+ } else {
+ n <<= 1;
+ }
+ }
+
+ /* decide between two table values */
+ if (n > d - n) {
+ i++;
+ n = d - n;
+ sign = -1.0;
+ } else {
+ sign = 1.0;
}
- /* polynomial approximations calculated using Sollya */
- x = 1.0 / table_size;
+ /* divide by 256 is exact (as is the multiply with its reciprocal) */
+ x = ((double) n / d) * M_1_256;
+
+ /* 0 <= x <= 1/512 */
z = x * x;
- /* alpha = 2 * sin(M_PI_4 / m) * sin(M_PI_4 / m) */
- alpha = x * (1.1107207345394952717884501203293686870741139540138 +
- z * (-0.114191397993514079911985272577099412137126013186879 +
- z * 3.52164670852685621720746817665316575239342815885835e-3));
- alpha = alpha * alpha;
+ /* table lookup: cosine is at offset 0 and sine at offset 1 of each 4-element entry */
+ cos_a = ct[4 * i + 0].d;
+ sin_a = ct[4 * i + 1].d;
- /* beta = sin(M_PI_2 / m) */
- beta = x * (1.57079632679489455959753740899031981825828552246094 +
- z * (-0.64596409735041482313988581154262647032737731933593 +
- z * 7.9690915468332887416913479228242067620158195495605e-2));
+ /* evaluate polynomials */
+ cos_b = 1.0 + z * (cc[0].d + z * (cc[1].d + z * cc[2].d));
+ sin_b = x * (sc[0].d + z * (sc[1].d + z * (sc[2].d + z * sc[3].d)));
- /* cos(0) = 1.0, sin(0) = 0.0 */
- c[0] = 1.0;
- s[0] = 0.0;
+ /* sum or difference of angles */
+ c = cos_a * cos_b - sign * sin_a * sin_b;
+ s = sin_a * cos_b + sign * cos_a * sin_b;
+
+ if (swap) {
+ double tmp = c;
+ c = s;
+ s = tmp;
+ }
+
+ out[0] = cos_sign * c;
+ out[1] = sin_sign * s;
+ return 0;
+}
+
+int
+ffts_generate_chirp_32f(ffts_cpx_32f *const table, size_t table_size)
+{
+ ffts_cpx_32f *lut;
+ size_t i, j, n;
- /* generate sine and cosine tables with maximum error less than 1 ULP */
- for (i = 1; i < (table_size + 1)/2; i++) {
- c[1] = c[0] - ((alpha * c[0]) + (beta * s[0]));
- s[1] = s[0] - ((alpha * s[0]) - (beta * c[0]));
+ if (!table || !table_size)
+ return -1;
+
+ n = 2 * table_size;
+ lut = ffts_aligned_malloc(n * sizeof(*lut));
+ if (!lut)
+ return -1;
- table[i + 0][0] = (float) c[1];
- table[i + 0][1] = (float) -s[1];
- table[table_size - i][0] = (float) s[1];
- table[table_size - i][1] = (float) -c[1];
+ /* initialize LUT with a cosine/sine table of 2 * table_size entries (return value is not checked) */
+ ffts_generate_cosine_sine_32f(lut, n);
- c[0] = c[1];
- s[0] = s[1];
+ /* generate CZT sequence: entry i is lut[(i * i) mod n] */
+ for (i = 0, j = 0; i < table_size; ++i) {
+ table[i][0] = lut[j][0];
+ table[i][1] = lut[j][1];
+ /* (i + 1)^2 = i^2 + 2 * i + 1; one subtraction suffices because 2 * i + 1 < n */
+ j += 2 * i + 1;
+ if (j >= n)
+ j -= n;
}
- if (FFTS_UNLIKELY(table_size & 1)) {
+ ffts_aligned_free(lut);
+ return 0;
+}
+
+/* generate cosine and sine tables with maximum error less than 1 ULP, average ~0.5 ULP
+* using repeated subvector scaling algorithm, 16 - 20 times faster than
+* direct library calling algorithm.
+*/
+int
+ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, size_t table_size)
+{
+ ffts_cpx_64f *const tmp = (ffts_cpx_64f *const) table;
+ double FFTS_ALIGN(16) z[2], zz[2], x[2], xx[2];
+ size_t i, j, k, len;
+
+ if (!table || !table_size)
+ return -1;
+
+ if (FFTS_UNLIKELY(table_size == 1))
goto exit;
- }
-mid_point:
- table[i][0] = 0.70710677f;
- table[i][1] = -0.70710677f;
+ /* check if table size is a power of two */
+ if (!(table_size & (table_size - 1)))
+ return ffts_generate_cosine_sine_pow2_32f(table, table_size);
+
+ if (!(table_size & 1)) {
+ /* even table size -- check if multiple of four */
+ if (!(table_size & 3)) {
+ /* multiple of four */
+ len = table_size >> 2;
+ for (j = 1; 4 * j <= len; j <<= 1) {
+ ffts_cexp_32f64f(j, table_size, z);
+
+ tmp[j][0] = z[0];
+ tmp[j][1] = z[1];
+
+ tmp[len - j][0] = z[1];
+ tmp[len - j][1] = z[0];
+
+ for (i = 1; i < j; i++) {
+ zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+ zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+ tmp[j + i][0] = zz[0];
+ tmp[j + i][1] = zz[1];
+
+ tmp[len - j - i][0] = zz[1];
+ tmp[len - j - i][1] = zz[0];
+ }
+ }
+
+ /* this loops zero or one times */
+ for (k = j << 1; k <= len; j <<= 1) {
+ ffts_cexp_32f64f(j, table_size, z);
+
+ tmp[j][0] = z[0];
+ tmp[j][1] = z[1];
+ if (k++ == len)
+ break;
+
+ tmp[len - j][0] = z[1];
+ tmp[len - j][1] = z[0];
+ if (k++ == len)
+ break;
+
+ for (i = 1; i < j; i++) {
+ zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+ zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+ tmp[j + i][0] = zz[0];
+ tmp[j + i][1] = zz[1];
+ if (k++ == len)
+ break;
+
+ tmp[len - j - i][0] = zz[1];
+ tmp[len - j - i][1] = zz[0];
+ if (k++ == len)
+ break;
+ }
+ }
+
+ /* convert doubles to floats */
+ for (i = 1; i < len; i++) {
+ table[i][0] = (float) tmp[i][0];
+ table[i][1] = (float) tmp[i][1];
+ }
+
+ table[len][0] = 0.0f;
+ table[len][1] = 1.0f;
+
+ for (i = 1; i <= len; i++) {
+ table[len + i][0] = -table[i][1];
+ table[len + i][1] = table[i][0];
+ }
+ } else {
+ /* multiple of two */
+ len = table_size >> 1;
+ for (j = 1; 4 * j <= len; j <<= 1) {
+ ffts_cexp_32f64f(j, table_size, z);
+
+ tmp[j][0] = z[0];
+ tmp[j][1] = z[1];
+
+ tmp[len - j][0] = -z[0];
+ tmp[len - j][1] = z[1];
+
+ for (i = 1; i < j; i++) {
+ zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+ zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+ tmp[j + i][0] = zz[0];
+ tmp[j + i][1] = zz[1];
+
+ tmp[len - j - i][0] = -zz[0];
+ tmp[len - j - i][1] = zz[1];
+ }
+ }
+
+ /* this loops zero or one times */
+ for (k = j << 1; k <= len; j <<= 1) {
+ ffts_cexp_32f64f(j, table_size, z);
+
+ tmp[j][0] = z[0];
+ tmp[j][1] = z[1];
+ if (k++ == len)
+ break;
+
+ tmp[len - j][0] = -z[0];
+ tmp[len - j][1] = z[1];
+ if (k++ == len)
+ break;
+
+ for (i = 1; i < j; i++) {
+ zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+ zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+ tmp[j + i][0] = zz[0];
+ tmp[j + i][1] = zz[1];
+ if (k++ == len)
+ break;
+
+ tmp[len - j - i][0] = -zz[0];
+ tmp[len - j - i][1] = zz[1];
+ if (k++ == len)
+ break;
+ }
+ }
+
+ /* convert doubles to floats */
+ for (i = 1; i < len; i++) {
+ table[i][0] = (float) tmp[i][0];
+ table[i][1] = (float) tmp[i][1];
+ }
+
+ table[len][0] = -1.0f;
+ table[len][1] = 0.0f;
+ }
+
+ /* duplicate lower half to higher */
+ len = table_size >> 1;
+ for (i = 1; i < len; i++) {
+ table[table_size - i][0] = table[i][0];
+ table[table_size - i][1] = -table[i][1];
+ }
+ } else {
+ /* odd table size */
+
+ /* to avoid using temporary tables, generate the first 1/8 of table in
+ * double precision on lower half (and using the symmetry store
+ * the last 1/8 of table in single precision on higher half)
+ */
+ for (j = 1; 8 * j < table_size; j <<= 1) {
+ ffts_cexp_32f64f(j, table_size, z);
+
+ /* store double precision to lower half */
+ tmp[j][0] = z[0];
+ tmp[j][1] = z[1];
+
+ /* store single precision to higher half */
+ table[table_size - j][0] = (float) z[0];
+ table[table_size - j][1] = (float) -z[1];
+
+ for (i = 1; i < j; i++) {
+ /* use double precision for intermediate calculations */
+ zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+ zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+ tmp[i + j][0] = zz[0];
+ tmp[i + j][1] = zz[1];
+
+ table[table_size - i - j][0] = (float) zz[0];
+ table[table_size - i - j][1] = (float) -zz[1];
+ }
+ }
+
+ /* now generate 1/2 of table in single precision on higher half */
+ k = j << 1;
+ ffts_cexp_32f64f(j, table_size, z);
+ ffts_cexp_32f64f(k, table_size, x);
+
+ /* store single precision to higher half */
+ table[table_size - j][0] = (float) z[0];
+ table[table_size - j][1] = (float) -z[1];
+
+ table[table_size - k][0] = (float) x[0];
+ table[table_size - k][1] = (float) -x[1];
+
+ i = 1;
+ len = ((table_size + 1) >> 1) - k;
+ if (len > j) {
+ len -= j;
+
+ xx[0] = x[0] * z[0] - x[1] * z[1];
+ xx[1] = x[1] * z[0] + x[0] * z[1];
+
+ table[table_size - k - j][0] = (float) xx[0];
+ table[table_size - k - j][1] = (float) -xx[1];
+
+ for (; i < len; i++) {
+ zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+ zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+ table[table_size - i - j][0] = (float) zz[0];
+ table[table_size - i - j][1] = (float) -zz[1];
+
+ xx[0] = x[0] * tmp[i][0] - x[1] * tmp[i][1];
+ xx[1] = x[1] * tmp[i][0] + x[0] * tmp[i][1];
+
+ table[table_size - i - k][0] = (float) xx[0];
+ table[table_size - i - k][1] = (float) -xx[1];
+
+ xx[0] = x[0] * zz[0] - x[1] * zz[1];
+ xx[1] = x[1] * zz[0] + x[0] * zz[1];
+
+ table[table_size - i - k - j][0] = (float) xx[0];
+ table[table_size - i - k - j][1] = (float) -xx[1];
+ }
+
+ len = j;
+ }
+
+ for (; i < len; i++) {
+ zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+ zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+ table[table_size - i - j][0] = (float) zz[0];
+ table[table_size - i - j][1] = (float) -zz[1];
+
+ xx[0] = x[0] * tmp[i][0] - x[1] * tmp[i][1];
+ xx[1] = x[1] * tmp[i][0] + x[0] * tmp[i][1];
+
+ table[table_size - i - k][0] = (float) xx[0];
+ table[table_size - i - k][1] = (float) -xx[1];
+ }
+
+ for (; i < j; i++) {
+ zz[0] = z[0] * tmp[i][0] - z[1] * tmp[i][1];
+ zz[1] = z[1] * tmp[i][0] + z[0] * tmp[i][1];
+
+ table[table_size - i - j][0] = (float) zz[0];
+ table[table_size - i - j][1] = (float) -zz[1];
+ }
+
+ /* duplicate higher half to lower */
+ len = table_size >> 1;
+ for (i = 1; i <= len; i++) {
+ table[i][0] = table[table_size - i][0];
+ table[i][1] = -table[table_size - i][1];
+ }
+ }
exit:
+ /* cos(0) = 1.0, sin(0) = 0.0 */
+ table[0][0] = 1.0f;
+ table[0][1] = 0.0f;
return 0;
}
/* Oscar Buneman's method for generating a sequence of sines and cosines.
* Expired US Patent 4,878,187 A
-*
-* D. Potts, G. Steidl, M. Tasche, Numerical stability of fast
-* trigonometric transforms — a worst case study,
-* J. Concrete Appl. Math. 1 (2003) 1–36
-*
-* O. Buneman, Stable on–line creation of sines and cosines of
-* successive angles, Proc. IEEE 75, 1434 – 1435 (1987).
*/
#if HAVE_SSE2
int
@@ -227,10 +741,11 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
{
static const __m128d sign_swap = { 0.0, -0.0 };
const __m128d *FFTS_RESTRICT ct;
- const double *FFTS_RESTRICT hs;
+ const ffts_double_t *FFTS_RESTRICT cst;
+ const ffts_double_t *FFTS_RESTRICT hs;
__m128d FFTS_ALIGN(16) w[32];
__m128d FFTS_ALIGN(16) h[32];
- int i, log_2, offset;
+ int i, log_2, offset, step;
/* size must be a power of two */
if (!table || !table_size || (table_size & (table_size - 1))) {
@@ -251,21 +766,42 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
goto mid_point;
}
+ cst = (const ffts_double_t*)
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_table);
+
+ /* generate small tables from lookup table */
+ if (table_size <= 128) {
+ step = 128 / table_size;
+
+ for (i = 1; i < table_size/2; i++) {
+ float cosine = (float) cst[4 * i * step + 0].d;
+ float sine = (float) cst[4 * i * step + 1].d;
+
+ table[i + 0][0] = cosine;
+ table[i + 0][1] = -sine;
+ table[table_size - i][0] = sine;
+ table[table_size - i][1] = -cosine;
+ }
+
+ goto mid_point;
+ }
+
/* calculate table offset */
- FFTS_ASSUME(table_size/2 > 1);
+ FFTS_ASSUME(table_size/2 > 64);
log_2 = ffts_ctzl(table_size);
FFTS_ASSUME(log_2 > 1);
offset = 32 - log_2;
+ step = log_2 - 8;
ct = (const __m128d*)
- FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
- hs = (const double*) &half_secant[4 * offset];
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]);
+ hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]);
/* initialize from lookup table */
for (i = 0; i <= log_2; i++) {
w[i] = ct[2*i];
/* duplicate the high part */
- h[i] = _mm_set1_pd(hs[2*i]);
+ h[i] = _mm_set1_pd(hs[2*i].d);
}
/* generate sine and cosine tables with maximum error less than 0.5 ULP */
@@ -279,9 +815,20 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
_mm_storel_pi((__m64*) &table[table_size - i], _mm_cvtpd_ps(
_mm_or_pd(_mm_shuffle_pd(w[log_2], w[log_2], 1), sign_swap)));
- /* skip and find next trailing zero */
- offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
- w[log_2] = _mm_mul_pd(h[log_2], _mm_add_pd(w[log_2 + 1], w[offset]));
+ /* use lookup table when possible */
+ if (log_2 > step) {
+ offset = ((2 * i) >> step) + (4 << (log_2 - step));
+ if (offset >= COS_SIN_TABLE_SIZE) {
+ offset = COS_SIN_TABLE_SIZE - (2 << (log_2 - step)) - 4;
+ w[log_2] = _mm_loadr_pd(&cst[offset].d);
+ } else {
+ w[log_2] = _mm_load_pd(&cst[offset].d);
+ }
+ } else {
+ /* skip and find next trailing zero */
+ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+ w[log_2] = _mm_mul_pd(h[log_2], _mm_add_pd(w[log_2 + 1], w[offset]));
+ }
}
mid_point:
@@ -297,11 +844,12 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
{
static const __m128d sign_swap = { 0.0, -0.0 };
const struct ffts_dd2_t *FFTS_RESTRICT ct;
- const double *FFTS_RESTRICT hs;
+ const ffts_double_t *FFTS_RESTRICT cst;
+ const ffts_double_t *FFTS_RESTRICT hs;
struct ffts_dd2_t FFTS_ALIGN(16) w[32];
struct ffts_dd2_t FFTS_ALIGN(16) h[32];
struct ffts_dd2_t FFTS_ALIGN(16) sum;
- int i, log_2, offset;
+ int i, log_2, offset, step;
/* size must be a power of two */
if (!table || !table_size || (table_size & (table_size - 1))) {
@@ -322,22 +870,43 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
goto mid_point;
}
+ cst = (const ffts_double_t*)
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_table);
+
+ /* generate small tables from lookup table */
+ if (table_size <= 128) {
+ step = 128 / table_size;
+
+ for (i = 1; i < table_size/2; i++) {
+ double cosine = cst[4 * i * step + 0].d;
+ double sine = cst[4 * i * step + 1].d;
+
+ table[i + 0][0] = cosine;
+ table[i + 0][1] = -sine;
+ table[table_size - i][0] = sine;
+ table[table_size - i][1] = -cosine;
+ }
+
+ goto mid_point;
+ }
+
/* calculate table offset */
- FFTS_ASSUME(table_size/2 > 1);
+ FFTS_ASSUME(table_size/2 > 64);
log_2 = ffts_ctzl(table_size);
FFTS_ASSUME(log_2 > 1);
offset = 32 - log_2;
+ step = log_2 - 8;
ct = (const struct ffts_dd2_t*)
- FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
- hs = (const double*) &half_secant[4 * offset];
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]);
+ hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]);
/* initialize from lookup table */
for (i = 0; i <= log_2; i++) {
w[i] = ct[i];
/* duplicate the high and low parts */
- h[i].hi = _mm_set1_pd(hs[2*i + 0]);
- h[i].lo = _mm_set1_pd(hs[2*i + 1]);
+ h[i].hi = _mm_set1_pd(hs[2*i + 0].d);
+ h[i].lo = _mm_set1_pd(hs[2*i + 1].d);
}
/* generate sine and cosine tables with maximum error less than 0.5 ULP */
@@ -351,10 +920,23 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
_mm_store_pd((double*) &table[table_size - i],
_mm_or_pd(_mm_shuffle_pd(w[log_2].hi, w[log_2].hi, 1), sign_swap));
- /* skip and find next trailing zero */
- offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
- sum = ffts_dd2_add_dd2_unnormalized(&w[log_2 + 1], &w[offset]);
- w[log_2] = ffts_dd2_mul_dd2(&h[log_2], &sum);
+ /* use lookup table when possible */
+ if (log_2 > step) {
+ offset = ((2 * i) >> step) + (4 << (log_2 - step));
+ if (offset >= COS_SIN_TABLE_SIZE) {
+ offset = COS_SIN_TABLE_SIZE - (2 << (log_2 - step)) - 4;
+ w[log_2].hi = _mm_loadr_pd(&cst[offset + 0].d);
+ w[log_2].lo = _mm_loadr_pd(&cst[offset + 2].d);
+ } else {
+ w[log_2].hi = _mm_load_pd(&cst[offset + 0].d);
+ w[log_2].lo = _mm_load_pd(&cst[offset + 2].d);
+ }
+ } else {
+ /* skip and find next trailing zero */
+ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+ sum = ffts_dd2_add_dd2_unnormalized(&w[log_2 + 1], &w[offset]);
+ w[log_2] = ffts_dd2_mul_dd2(&h[log_2], &sum);
+ }
}
mid_point:
@@ -369,9 +951,10 @@ int
ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
{
const ffts_cpx_64f *FFTS_RESTRICT ct;
- const double *FFTS_RESTRICT hs;
+ const ffts_double_t *FFTS_RESTRICT cst;
+ const ffts_double_t *FFTS_RESTRICT hs;
ffts_cpx_64f FFTS_ALIGN(16) w[32];
- int i, log_2, offset;
+ int i, log_2, offset, step;
/* size must be a power of two */
if (!table || !table_size || (table_size & (table_size - 1))) {
@@ -392,14 +975,35 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
goto mid_point;
}
+ cst = (const ffts_double_t*)
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_table);
+
+ /* generate small tables from lookup table */
+ if (table_size <= 128) {
+ step = 128 / table_size;
+
+ for (i = 1; i < table_size/2; i++) {
+ float cosine = (float) cst[4 * i * step + 0].d;
+ float sine = (float) cst[4 * i * step + 1].d;
+
+ table[i + 0][0] = cosine;
+ table[i + 0][1] = -sine;
+ table[table_size - i][0] = sine;
+ table[table_size - i][1] = -cosine;
+ }
+
+ goto mid_point;
+ }
+
/* calculate table offset */
- FFTS_ASSUME(table_size/2 > 1);
+ FFTS_ASSUME(table_size/2 > 64);
log_2 = ffts_ctzl(table_size);
FFTS_ASSUME(log_2 > 1);
offset = 32 - log_2;
+ step = log_2 - 8;
ct = (const ffts_cpx_64f*)
- FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
- hs = (const double*) &half_secant[4 * offset];
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]);
+ hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]);
/* initialize from lookup table */
for (i = 0; i <= log_2; i++) {
@@ -417,10 +1021,23 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
table[table_size - i][0] = (float) w[log_2][1];
table[table_size - i][1] = (float) -w[log_2][0];
- /* skip and find next trailing zero */
- offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
- w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
- w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+        /* use lookup table when possible */
+        if (log_2 > step) {
+            offset = ((2 * i) >> step) + (4 << (log_2 - step));
+            if (offset >= 260) {
+                /* The requested entry lies past the end of the lookup table:
+                 * read the mirrored entry instead and swap cosine and sine.
+                 * This matches the reversed load (_mm_loadr_pd) used by the
+                 * SSE2 variant and the swapped reads of the 64f variant.
+                 * NOTE(review): 260 is presumably COS_SIN_TABLE_SIZE -- confirm. */
+                offset = 260 - (2 << (log_2 - step)) - 4;
+                w[log_2][0] = cst[offset + 1].d;
+                w[log_2][1] = cst[offset + 0].d;
+            } else {
+                /* in range: w[..][0] takes entry +0, w[..][1] takes entry +1 */
+                w[log_2][0] = cst[offset + 0].d;
+                w[log_2][1] = cst[offset + 1].d;
+            }
+        } else {
+            /* skip and find next trailing zero */
+            offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+            w[log_2][0] = hs[2 * log_2].d * (w[log_2 + 1][0] + w[offset][0]);
+            w[log_2][1] = hs[2 * log_2].d * (w[log_2 + 1][1] + w[offset][1]);
+        }
}
mid_point:
@@ -435,9 +1052,10 @@ int
ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
{
const struct ffts_dd_t *FFTS_RESTRICT ct;
+ const ffts_double_t *FFTS_RESTRICT cst;
const struct ffts_dd_t *FFTS_RESTRICT hs;
struct ffts_dd_t FFTS_ALIGN(16) w[32][2];
- int i, log_2, offset;
+ int i, log_2, offset, step;
/* size must be a power of two */
if (!table || !table_size || (table_size & (table_size - 1))) {
@@ -458,14 +1076,35 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
goto mid_point;
}
+ cst = (const ffts_double_t*)
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_table);
+
+ /* generate small tables from lookup table */
+ if (table_size <= 128) {
+ step = 128 / table_size;
+
+ for (i = 1; i < table_size/2; i++) {
+ double cosine = cst[4 * i * step + 0].d;
+ double sine = cst[4 * i * step + 1].d;
+
+ table[i + 0][0] = cosine;
+ table[i + 0][1] = -sine;
+ table[table_size - i][0] = sine;
+ table[table_size - i][1] = -cosine;
+ }
+
+ goto mid_point;
+ }
+
/* calculate table offset */
- FFTS_ASSUME(table_size/2 > 1);
+ FFTS_ASSUME(table_size/2 > 64);
log_2 = ffts_ctzl(table_size);
FFTS_ASSUME(log_2 > 1);
offset = 32 - log_2;
+ step = log_2 - 8;
ct = (const struct ffts_dd_t*)
- FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
- hs = (const struct ffts_dd_t*) &half_secant[4 * offset];
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]);
+ hs = (const struct ffts_dd_t*) &half_secant[2 * offset];
/* initialize from lookup table */
for (i = 0; i <= log_2; i++) {
@@ -486,12 +1125,29 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
table[table_size - i][0] = w[log_2][1].hi;
table[table_size - i][1] = -w[log_2][0].hi;
- /* skip and find next trailing zero */
- offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
- w[log_2][0] = ffts_dd_mul_dd(hs[log_2],
- ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0]));
- w[log_2][1] = ffts_dd_mul_dd(hs[log_2],
- ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1]));
+ /* use lookup table when possible */
+ if (log_2 > step) {
+ offset = ((2 * i) >> step) + (4 << (log_2 - step));
+ if (offset >= 260) {
+ offset = 260 - (2 << (log_2 - step)) - 4;
+ w[log_2][0].hi = cst[offset + 1].d;
+ w[log_2][1].hi = cst[offset + 0].d;
+ w[log_2][0].lo = cst[offset + 3].d;
+ w[log_2][1].lo = cst[offset + 2].d;
+ } else {
+ w[log_2][0].hi = cst[offset + 0].d;
+ w[log_2][1].hi = cst[offset + 1].d;
+ w[log_2][0].lo = cst[offset + 2].d;
+ w[log_2][1].lo = cst[offset + 3].d;
+ }
+ } else {
+ /* skip and find next trailing zero */
+ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+ w[log_2][0] = ffts_dd_mul_dd(hs[log_2],
+ ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0]));
+ w[log_2][1] = ffts_dd_mul_dd(hs[log_2],
+ ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1]));
+ }
}
mid_point:
@@ -509,7 +1165,7 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
int invert)
{
const ffts_cpx_64f *FFTS_RESTRICT ct;
- const double *FFTS_RESTRICT hs;
+ const ffts_double_t *FFTS_RESTRICT hs;
ffts_cpx_64f FFTS_ALIGN(16) w[32];
int i, log_2, offset, N;
float *A, *B;
@@ -547,8 +1203,8 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
FFTS_ASSUME(log_2 > 2);
offset = 34 - log_2;
ct = (const ffts_cpx_64f*)
- FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
- hs = (const double*) &half_secant[4 * offset];
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[4 * offset]);
+ hs = FFTS_ASSUME_ALIGNED_16(&half_secant[2 * offset]);
/* initialize from lookup table */
for (i = 0; i <= log_2; i++) {
@@ -556,7 +1212,6 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
w[i][1] = ct[2*i][1];
}
- /* generate sine and cosine tables with maximum error less than 0.5 ULP */
if (sign < 0) {
for (i = 1; i < N/4; i++) {
float t0, t1, t2;
@@ -580,8 +1235,8 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
/* skip and find next trailing zero */
offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
- w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
- w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+ w[log_2][0] = hs[2 * log_2].d * (w[log_2 + 1][0] + w[offset][0]);
+ w[log_2][1] = hs[2 * log_2].d * (w[log_2 + 1][1] + w[offset][1]);
}
} else {
for (i = 1; i < N/4; i++) {
@@ -606,8 +1261,8 @@ ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
/* skip and find next trailing zero */
offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
- w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
- w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+ w[log_2][0] = hs[2 * log_2].d * (w[log_2 + 1][0] + w[offset][0]);
+ w[log_2][1] = hs[2 * log_2].d * (w[log_2 + 1][1] + w[offset][1]);
}
}
@@ -625,4 +1280,4 @@ last:
}
return 0;
-} \ No newline at end of file
+}
diff --git a/lib/ffts/src/ffts_trig.h b/lib/ffts/src/ffts_trig.h
index 0b22738..f988340 100644
--- a/lib/ffts/src/ffts_trig.h
+++ b/lib/ffts/src/ffts_trig.h
@@ -2,7 +2,7 @@
This file is part of FFTS -- The Fastest Fourier Transform in the South
-Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2015-2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@@ -39,8 +39,16 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ffts_internal.h"
+/* calculate cos(pi * n / d) and sin(pi * n / d) with maximum error less than 1 ULP, average ~0.5 ULP */
int
-ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, int table_size);
+ffts_cexp_32f(size_t n, size_t d, float *output);
+
+int
+ffts_generate_chirp_32f(ffts_cpx_32f *const table, size_t table_size);
+
+/* generate cosine and sine tables with maximum error less than 1 ULP, average ~0.5 ULP */
+int
+ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, size_t table_size);
int
ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size);
diff --git a/lib/ffts/src/macros-alpha.h b/lib/ffts/src/macros-alpha.h
index f7795d4..c32d1e9 100644
--- a/lib/ffts/src/macros-alpha.h
+++ b/lib/ffts/src/macros-alpha.h
@@ -58,9 +58,6 @@ typedef union {
uint32_t u[4];
} V4SF;
-#define FFTS_MALLOC(d,a) (malloc(d))
-#define FFTS_FREE(d) (free(d))
-
static FFTS_ALWAYS_INLINE V4SF
V4SF_LIT4(float f3, float f2, float f1, float f0)
{
diff --git a/lib/ffts/src/macros-altivec.h b/lib/ffts/src/macros-altivec.h
index 28f552f..33f2346 100644
--- a/lib/ffts/src/macros-altivec.h
+++ b/lib/ffts/src/macros-altivec.h
@@ -4,6 +4,7 @@
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2019, Timothy Pearson <tpearson@raptorengineering.com>
All rights reserved.
@@ -39,99 +40,89 @@
#define restrict
-typedef vector float V;
+typedef vector float V4SF;
typedef vector unsigned char VUC;
-#ifdef __apple__
-#define FFTS_MALLOC(d,a) vec_malloc(d)
-#define FFTS_FREE(d) vec_free(d)
-#else
-/* It appears vec_malloc() and friends are not implemented on Linux */
-#include <malloc.h>
-#define FFTS_MALLOC(d,a) memalign(16,d)
-#define FFTS_FREE(d) free(d)
-#endif
-
-#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3})
+#define V4SF_LIT4(f0,f1,f2,f3) ((V4SF){f0, f1, f2, f3})
-#define VADD(x,y) vec_add(x,y)
-#define VSUB(x,y) vec_sub(x,y)
-#define VMUL(x,y) vec_madd(x,y,(V){0})
-#define VMULADD(x,y,z) vec_madd(x,y,z)
-#define VNMULSUB(x,y,z) vec_nmsub(x,y,z)
-#define VXOR(x,y) vec_xor((x),(y))
-#define VSWAPPAIRS(x) \
+#define V4SF_ADD(x,y) vec_add(x,y)
+#define V4SF_SUB(x,y) vec_sub(x,y)
+#define V4SF_MUL(x,y) vec_madd(x,y,(V4SF){0})
+#define V4SF_MULADD(x,y,z) vec_madd(x,y,z)
+#define V4SF_NMULSUB(x,y,z) vec_nmsub(x,y,z)
+#define V4SF_XOR(x,y) vec_xor((x),(y))
+#define V4SF_SWAPPAIRS(x) \
vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03, \
0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b})
-#define VBLEND(x,y) \
+#define V4SF_BLEND(x,y) \
vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
-#define VUNPACKHI(x,y) \
+#define V4SF_UNPACK_HI(x,y) \
vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
-#define VUNPACKLO(x,y) \
+#define V4SF_UNPACK_LO(x,y) \
vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17})
-#define VDUPRE(x) \
+#define V4SF_DUPLICATE_RE(x) \
vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03, \
0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b})
-#define VDUPIM(x) \
+#define V4SF_DUPLICATE_IM(x) \
vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07, \
0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f})
-static inline V IMUL(V d, V re, V im)
+static inline V4SF V4SF_IMUL(V4SF d, V4SF re, V4SF im)
{
- im = VMUL(im, VSWAPPAIRS(d));
- re = VMUL(re, d);
- return VSUB(re, im);
+ im = V4SF_MUL(im, V4SF_SWAPPAIRS(d));
+ re = V4SF_MUL(re, d);
+ return V4SF_SUB(re, im);
}
-static inline V IMULJ(V d, V re, V im)
+static inline V4SF V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
{
- im = VMUL(im, VSWAPPAIRS(d));
- return VMULADD(re, d, im);
+ im = V4SF_MUL(im, V4SF_SWAPPAIRS(d));
+ return V4SF_MULADD(re, d, im);
}
#ifndef __GNUC__
/* gcc (4.6 and 4.7) ICEs on this code! */
-static inline V MULI(int inv, V x)
+static inline V4SF MULI(int inv, V4SF x)
{
- return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f));
+ return V4SF_XOR(x, inv ? V4SF_LIT4(-0.0f,0.0f,-0.0f,0.0f) : V4SF_LIT4(0.0f,-0.0f,0.0f,-0.0f));
}
#else
/* but compiles this fine... */
-static inline V MULI(int inv, V x)
+static inline V4SF MULI(int inv, V4SF x)
{
- V t;
- t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f);
- return VXOR(x, t);
+ V4SF t;
+ t = inv ? V4SF_LIT4(-0.0f,0.0f,-0.0f,0.0f) : V4SF_LIT4(0.0f,-0.0f,0.0f,-0.0f);
+ return V4SF_XOR(x, t);
}
#endif
-static inline V IMULI(int inv, V x)
+static inline V4SF V4SF_IMULI(int inv, V4SF x)
{
- return VSWAPPAIRS(MULI(inv, x));
+ return V4SF_SWAPPAIRS(MULI(inv, x));
}
-static inline V VLD(const void *s)
+static inline V4SF V4SF_LD(const void *s)
{
- V *d = (V *)s;
+ V4SF *d = (V4SF *)s;
return *d;
}
-static inline void VST(void *d, V s)
+static inline void V4SF_ST(void *d, V4SF s)
{
- V *r = (V *)d;
+ V4SF *r = (V4SF *)d;
*r = s;
}
#endif
diff --git a/lib/ffts/src/macros-neon.h b/lib/ffts/src/macros-neon.h
index 29aa49f..f0d1fff 100644
--- a/lib/ffts/src/macros-neon.h
+++ b/lib/ffts/src/macros-neon.h
@@ -39,9 +39,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdlib.h>
#endif
-#define FFTS_MALLOC(d,a) (valloc(d))
-#define FFTS_FREE(d) (free(d))
-
typedef float32x4_t V4SF;
typedef float32x4x2_t V4SF2;
diff --git a/lib/ffts/src/macros-sse.h b/lib/ffts/src/macros-sse.h
index 827aa67..46e1f29 100644
--- a/lib/ffts/src/macros-sse.h
+++ b/lib/ffts/src/macros-sse.h
@@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@@ -40,9 +41,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <xmmintrin.h>
-#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
-#define FFTS_FREE(d) (_mm_free(d))
-
typedef __m128 V4SF;
#define V4SF_ADD _mm_add_ps
@@ -56,8 +54,9 @@ typedef __m128 V4SF;
#define V4SF_SWAP_PAIRS(x) \
(_mm_shuffle_ps(x, x, _MM_SHUFFLE(2,3,0,1)))
+/* note: order is swapped */
#define V4SF_UNPACK_HI(x,y) \
- (_mm_shuffle_ps(x, y, _MM_SHUFFLE(3,2,3,2)))
+ (_mm_movehl_ps(y, x))
#define V4SF_UNPACK_LO(x,y) \
(_mm_movelh_ps(x, y))
@@ -97,4 +96,220 @@ V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
return V4SF_ADD(re, im);
}
+#ifdef FFTS_DOUBLE
+/* Scalar stand-in for a vector of four doubles. The same storage can be
+ * accessed as four named doubles (r) or as eight 32-bit words (u); the word
+ * view is what V4DF_XOR operates on. */
+typedef union {
+ struct {
+ double r1; /* presumably the real part of the first complex value */
+ double i1; /* presumably the imaginary part of the first complex value */
+ double r2; /* presumably the real part of the second complex value */
+ double i2; /* presumably the imaginary part of the second complex value */
+ } r;
+ uint32_t u[8]; /* raw 32-bit words overlaying the four doubles */
+} V4DF;
+
+/* Build a V4DF from four doubles. Arguments are listed last lane first:
+ * f0 is stored in r1, f1 in i1, f2 in r2 and f3 in i2. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_LIT4(double f3, double f2, double f1, double f0)
+{
+    V4DF packed;
+
+    packed.r.i2 = f3;
+    packed.r.r2 = f2;
+    packed.r.i1 = f1;
+    packed.r.r1 = f0;
+
+    return packed;
+}
+
+/* Lane-wise sum: returns x + y for each of the four doubles. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_ADD(V4DF x, V4DF y)
+{
+    /* x is a by-value copy, so accumulate into it directly */
+    x.r.r1 += y.r.r1;
+    x.r.i1 += y.r.i1;
+    x.r.r2 += y.r.r2;
+    x.r.i2 += y.r.i2;
+
+    return x;
+}
+
+/* Lane-wise difference: returns x - y for each of the four doubles. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_SUB(V4DF x, V4DF y)
+{
+    /* x is a by-value copy, so subtract in place */
+    x.r.r1 -= y.r.r1;
+    x.r.i1 -= y.r.i1;
+    x.r.r2 -= y.r.r2;
+    x.r.i2 -= y.r.i2;
+
+    return x;
+}
+
+/* Lane-wise product: returns x * y for each of the four doubles
+ * (an element-by-element multiply, not a complex multiply). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_MUL(V4DF x, V4DF y)
+{
+    /* x is a by-value copy, so scale it in place */
+    x.r.r1 *= y.r.r1;
+    x.r.i1 *= y.r.i1;
+    x.r.r2 *= y.r.r2;
+    x.r.i2 *= y.r.i2;
+
+    return x;
+}
+
+/* Bitwise XOR of x and y, applied to each of the eight 32-bit words that
+ * overlay the four doubles. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_XOR(V4DF x, V4DF y)
+{
+    int word;
+
+    /* x is a by-value copy; fold y into it word by word */
+    for (word = 0; word < 8; word++) {
+        x.u[word] ^= y.u[word];
+    }
+
+    return x;
+}
+
+/* Exchange the two lanes inside each pair: r1 <-> i1 and r2 <-> i2. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_SWAP_PAIRS(V4DF x)
+{
+    double held;
+
+    /* first pair */
+    held = x.r.r1;
+    x.r.r1 = x.r.i1;
+    x.r.i1 = held;
+
+    /* second pair */
+    held = x.r.r2;
+    x.r.r2 = x.r.i2;
+    x.r.i2 = held;
+
+    return x;
+}
+
+/* Combine the first pair (r1, i1) of x with the second pair (r2, i2) of y. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_BLEND(V4DF x, V4DF y)
+{
+    /* x already holds the wanted first pair; overwrite only its second pair */
+    x.r.r2 = y.r.r2;
+    x.r.i2 = y.r.i2;
+
+    return x;
+}
+
+/* Gather the second pairs: result is (x.r2, x.i2, y.r2, y.i2). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_UNPACK_HI(V4DF x, V4DF y)
+{
+    /* y already carries its own second pair in the right place;
+     * move x's second pair into y's first pair */
+    y.r.r1 = x.r.r2;
+    y.r.i1 = x.r.i2;
+
+    return y;
+}
+
+/* Gather the first pairs: result is (x.r1, x.i1, y.r1, y.i1). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_UNPACK_LO(V4DF x, V4DF y)
+{
+    /* x already carries its own first pair in the right place;
+     * move y's first pair into x's second pair */
+    x.r.r2 = y.r.r1;
+    x.r.i2 = y.r.i1;
+
+    return x;
+}
+
+/* Copy r1 over i1 and r2 over i2: result is (r1, r1, r2, r2). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_DUPLICATE_RE(V4DF x)
+{
+    x.r.i1 = x.r.r1;
+    x.r.i2 = x.r.r2;
+
+    return x;
+}
+
+/* Copy i1 over r1 and i2 over r2: result is (i1, i1, i2, i2). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_DUPLICATE_IM(V4DF x)
+{
+    x.r.r1 = x.r.i1;
+    x.r.r2 = x.r.i2;
+
+    return x;
+}
+
+/* Returns re * d - im * swap_pairs(d), all lane-wise.
+ * Presumably a complex multiply of d by a factor supplied as separate
+ * real and imaginary vectors -- TODO confirm against callers. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMUL(V4DF d, V4DF re, V4DF im)
+{
+    const V4DF direct = V4DF_MUL(re, d);
+    const V4DF crossed = V4DF_MUL(im, V4DF_SWAP_PAIRS(d));
+
+    return V4DF_SUB(direct, crossed);
+}
+
+/* Returns re * d + im * swap_pairs(d), all lane-wise; identical to
+ * V4DF_IMUL except that the two products are added instead of subtracted. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMULJ(V4DF d, V4DF re, V4DF im)
+{
+    const V4DF direct = V4DF_MUL(re, d);
+    const V4DF crossed = V4DF_MUL(im, V4DF_SWAP_PAIRS(d));
+
+    return V4DF_ADD(direct, crossed);
+}
+
+/* Negate alternating lanes of x: the r1/r2 lanes when inv is non-zero,
+ * the i1/i2 lanes otherwise. The other two lanes are copied unchanged. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_MULI(int inv, V4DF x)
+{
+    V4DF out;
+
+    out.r.r1 = inv ? -x.r.r1 : x.r.r1;
+    out.r.i1 = inv ? x.r.i1 : -x.r.i1;
+    out.r.r2 = inv ? -x.r.r2 : x.r.r2;
+    out.r.i2 = inv ? x.r.i2 : -x.r.i2;
+
+    return out;
+}
+
+/* Apply V4DF_MULI(inv, x), then swap the lanes within each pair. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMULI(int inv, V4DF x)
+{
+    const V4DF negated = V4DF_MULI(inv, x);
+
+    return V4DF_SWAP_PAIRS(negated);
+}
+
+/* Load one V4DF (sizeof(V4DF) bytes) from s. The bytes are copied with
+ * memcpy, so s does not have to point at an actual V4DF object. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_LD(const void *s)
+{
+    V4DF loaded;
+
+    memcpy(&loaded, s, sizeof(V4DF));
+
+    return loaded;
+}
+
+/* Store all of s (sizeof(V4DF) bytes) to d.
+ * The bytes are copied with memcpy, mirroring V4DF_LD, so the store does not
+ * rely on d being suitably aligned for, or actually pointing at, a V4DF. */
+static FFTS_ALWAYS_INLINE void
+V4DF_ST(void *d, V4DF s)
+{
+    memcpy(d, &s, sizeof(s));
+}
+#endif
+
#endif /* FFTS_MACROS_SSE_H */
diff --git a/lib/ffts/src/macros.h b/lib/ffts/src/macros.h
index e7e349f..99b0c53 100644
--- a/lib/ffts/src/macros.h
+++ b/lib/ffts/src/macros.h
@@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@@ -41,14 +42,29 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef HAVE_NEON
#include "macros-neon.h"
#elif HAVE_SSE
+#ifdef HAVE_AVX
+#include "macros-avx.h"
+#else
#include "macros-sse.h"
+#endif
-// NOTE: AltiVec support disabled until updated to provide new V4SF variable type
+// NOTE: AltiVec support is enabled again; macros-altivec.h now provides the V4SF variable type
-//#elif __powerpc__
-//#include "macros-altivec.h"
+#elif __powerpc__
+#include "macros-altivec.h"
#else
#include "macros-alpha.h"
#endif
+#ifdef FFTS_DOUBLE
+/* In-place exchange between two vectors: afterwards *a holds the first
+ * pairs of the original (*a, *b) and *b holds their second pairs. */
+static FFTS_INLINE void
+V4DF_TX2(V4DF *a, V4DF *b)
+{
+    /* snapshot both inputs before either destination is overwritten */
+    const V4DF first = *a;
+    const V4DF second = *b;
+
+    *a = V4DF_UNPACK_LO(first, second);
+    *b = V4DF_UNPACK_HI(first, second);
+}
+#endif
+
static FFTS_INLINE void
V4SF_TX2(V4SF *a, V4SF *b)
{
@@ -58,6 +74,34 @@ V4SF_TX2(V4SF *a, V4SF *b)
*b = t1;
}
+#ifdef FFTS_DOUBLE
+/* Butterfly step on four V4DF values, updated in place through r0..r3.
+ * re and im are handed to V4DF_IMUL / V4DF_IMULJ together with *r2 and *r3
+ * (presumably split twiddle factors -- TODO confirm); inv selects the lane
+ * negation performed by V4DF_IMULI. */
+static FFTS_INLINE void
+V4DF_K_N(int inv,
+         V4DF re,
+         V4DF im,
+         V4DF *r0,
+         V4DF *r1,
+         V4DF *r2,
+         V4DF *r3)
+{
+    /* read every input before any output is written */
+    const V4DF top0 = *r0;
+    const V4DF top1 = *r1;
+    const V4DF tw_pos = V4DF_IMUL(*r2, re, im);
+    const V4DF tw_neg = V4DF_IMULJ(*r3, re, im);
+
+    const V4DF tw_sum = V4DF_ADD(tw_pos, tw_neg);
+    const V4DF tw_rot = V4DF_IMULI(inv, V4DF_SUB(tw_pos, tw_neg));
+
+    *r2 = V4DF_SUB(top0, tw_sum);
+    *r0 = V4DF_ADD(top0, tw_sum);
+    *r3 = V4DF_ADD(top1, tw_rot);
+    *r1 = V4DF_SUB(top1, tw_rot);
+}
+#endif
+
static FFTS_INLINE void
V4SF_K_N(int inv,
V4SF re,
@@ -84,6 +128,45 @@ V4SF_K_N(int inv,
*r1 = V4SF_SUB(uk2, zk_d);
}
+#ifdef FFTS_DOUBLE
+/* Leaf step: loads one V4DF from each of i0..i3 and writes four results
+ * through r0..r3.
+ * inv is forwarded to V4DF_IMULI. The first pairs of the sums/differences
+ * go to *r0 and *r1 unchanged; the second pairs go through a further
+ * add/subtract stage before landing in *r3 and *r2.
+ * NOTE(review): the name suggests a mixed size-2 / size-4 leaf -- confirm. */
+static FFTS_INLINE void
+V4DF_L_2_4(int inv,
+ const double *FFTS_RESTRICT i0,
+ const double *FFTS_RESTRICT i1,
+ const double *FFTS_RESTRICT i2,
+ const double *FFTS_RESTRICT i3,
+ V4DF *r0,
+ V4DF *r1,
+ V4DF *r2,
+ V4DF *r3)
+{
+ V4DF t0, t1, t2, t3, t4, t5, t6, t7;
+
+ /* load the four input vectors */
+ t0 = V4DF_LD(i0);
+ t1 = V4DF_LD(i1);
+ t2 = V4DF_LD(i2);
+ t3 = V4DF_LD(i3);
+
+ /* sums and differences of (i0, i1) and of (i2, i3) */
+ t4 = V4DF_ADD(t0, t1);
+ t5 = V4DF_SUB(t0, t1);
+ t6 = V4DF_ADD(t2, t3);
+ t7 = V4DF_SUB(t2, t3);
+
+ /* first pairs are stored as they are, before t5 is modified below */
+ *r0 = V4DF_UNPACK_LO(t4, t5);
+ *r1 = V4DF_UNPACK_LO(t6, t7);
+
+ /* t5 is overwritten here; *r0 above used its previous value */
+ t5 = V4DF_IMULI(inv, t5);
+
+ /* second stage; t0..t3 are reused as scratch for the combined values */
+ t0 = V4DF_ADD(t6, t4);
+ t2 = V4DF_SUB(t6, t4);
+ t1 = V4DF_SUB(t7, t5);
+ t3 = V4DF_ADD(t7, t5);
+
+ /* only the second pairs of the second-stage values are kept */
+ *r3 = V4DF_UNPACK_HI(t0, t1);
+ *r2 = V4DF_UNPACK_HI(t2, t3);
+}
+#endif
+
static FFTS_INLINE void
V4SF_L_2_4(int inv,
const float *FFTS_RESTRICT i0,
@@ -121,6 +204,46 @@ V4SF_L_2_4(int inv,
*r2 = V4SF_UNPACK_HI(t2, t3);
}
+#ifdef FFTS_DOUBLE
+/* Leaf step: loads one V4DF from each of i0..i3, combines them with two
+ * add/subtract stages (the (i2 - i3) difference goes through V4DF_IMULI
+ * with inv), exchanges pairs with V4DF_TX2 and writes the four results
+ * through r0..r3.
+ * NOTE(review): the name suggests two size-4 leaves -- confirm. */
+static FFTS_INLINE void
+V4DF_L_4_4(int inv,
+           const double *FFTS_RESTRICT i0,
+           const double *FFTS_RESTRICT i1,
+           const double *FFTS_RESTRICT i2,
+           const double *FFTS_RESTRICT i3,
+           V4DF *r0,
+           V4DF *r1,
+           V4DF *r2,
+           V4DF *r3)
+{
+    /* inputs */
+    const V4DF in0 = V4DF_LD(i0);
+    const V4DF in1 = V4DF_LD(i1);
+    const V4DF in2 = V4DF_LD(i2);
+    const V4DF in3 = V4DF_LD(i3);
+
+    /* first stage */
+    const V4DF sum01 = V4DF_ADD(in0, in1);
+    const V4DF diff01 = V4DF_SUB(in0, in1);
+    const V4DF sum23 = V4DF_ADD(in2, in3);
+    const V4DF rot23 = V4DF_IMULI(inv, V4DF_SUB(in2, in3));
+
+    /* second stage */
+    V4DF out0 = V4DF_ADD(sum01, sum23);
+    V4DF out1 = V4DF_SUB(diff01, rot23);
+    V4DF out2 = V4DF_SUB(sum01, sum23);
+    V4DF out3 = V4DF_ADD(diff01, rot23);
+
+    V4DF_TX2(&out0, &out1);
+    V4DF_TX2(&out2, &out3);
+
+    /* note the crossed destinations: out1 goes to r2 and out2 to r1 */
+    *r0 = out0;
+    *r2 = out1;
+    *r1 = out2;
+    *r3 = out3;
+}
+#endif
+
static FFTS_INLINE void
V4SF_L_4_4(int inv,
const float *FFTS_RESTRICT i0,
@@ -159,6 +282,48 @@ V4SF_L_4_4(int inv,
*r3 = t3;
}
+#ifdef FFTS_DOUBLE
+/* Leaf step: loads one V4DF from each of i0..i3 and writes four results
+ * through r0..r3.
+ * The i2/i3 vectors are first recombined with V4DF_BLEND. The second pairs
+ * of the first-stage sums/differences go to *r2 and *r3 unchanged; the
+ * first pairs go through a further add/subtract stage into *r0 and *r1.
+ * inv is forwarded to V4DF_IMULI.
+ * NOTE(review): the name suggests a mixed size-4 / size-2 leaf -- confirm. */
+static FFTS_INLINE void
+V4DF_L_4_2(int inv,
+ const double *FFTS_RESTRICT i0,
+ const double *FFTS_RESTRICT i1,
+ const double *FFTS_RESTRICT i2,
+ const double *FFTS_RESTRICT i3,
+ V4DF *r0,
+ V4DF *r1,
+ V4DF *r2,
+ V4DF *r3)
+{
+ V4DF t0, t1, t2, t3, t4, t5, t6, t7;
+
+ /* load the four input vectors; i2/i3 go to t6/t7 because they are
+ * blended before use */
+ t0 = V4DF_LD(i0);
+ t1 = V4DF_LD(i1);
+ t6 = V4DF_LD(i2);
+ t7 = V4DF_LD(i3);
+
+ /* t2 = first pair of i2 with second pair of i3; t3 = the opposite mix */
+ t2 = V4DF_BLEND(t6, t7);
+ t3 = V4DF_BLEND(t7, t6);
+
+ /* first stage; t6/t7 are overwritten with the (t2, t3) sum and difference */
+ t4 = V4DF_ADD(t0, t1);
+ t5 = V4DF_SUB(t0, t1);
+ t6 = V4DF_ADD(t2, t3);
+ t7 = V4DF_SUB(t2, t3);
+
+ /* second pairs are stored as they are, before t7 is modified below */
+ *r2 = V4DF_UNPACK_HI(t4, t5);
+ *r3 = V4DF_UNPACK_HI(t6, t7);
+
+ /* t7 is overwritten here; *r3 above used its previous value */
+ t7 = V4DF_IMULI(inv, t7);
+
+ /* second stage; t0..t3 are reused as scratch for the combined values */
+ t0 = V4DF_ADD(t4, t6);
+ t2 = V4DF_SUB(t4, t6);
+ t1 = V4DF_SUB(t5, t7);
+ t3 = V4DF_ADD(t5, t7);
+
+ /* only the first pairs of the second-stage values are kept */
+ *r0 = V4DF_UNPACK_LO(t0, t1);
+ *r1 = V4DF_UNPACK_LO(t2, t3);
+}
+#endif
+
static FFTS_INLINE void
V4SF_L_4_2(int inv,
const float *FFTS_RESTRICT i0,
@@ -199,6 +364,9 @@ V4SF_L_4_2(int inv,
*r1 = V4SF_UNPACK_LO(t2, t3);
}
+/* Store four V4DF values: r0 to o0, r1 to o1, r2 to o2 and r3 to o3.
+ * Expands to four separate V4DF_ST statements, including a trailing
+ * semicolon, so it is not a single statement: do not use it as the
+ * unbraced body of an if/else or loop. */
+#define V4DF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
+ V4DF_ST(o0, r0); V4DF_ST(o1, r1); V4DF_ST(o2, r2); V4DF_ST(o3, r3);
+
#define V4SF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
V4SF_ST(o0, r0); V4SF_ST(o1, r1); V4SF_ST(o2, r2); V4SF_ST(o3, r3);