author | Anthony Blake <anthonix@me.com> | 2014-09-06 01:35:32 -0500 |
committer | Timothy Pearson <kb9vqf@pearsoncomputing.net> | 2014-09-06 01:35:32 -0500 |
commit | 14d918151bd447d854c3d0b34a9d542a5dff38ff (patch) | |
tree | 897331864b7f2061cbf54b74f8232b318e084f6b /lib/ffts/src/vfp.s | |
parent | c223a6a35f0b6a1b06aa4ff8029065711a986120 (diff) | |
download | ulab-14d918151bd447d854c3d0b34a9d542a5dff38ff.tar.gz ulab-14d918151bd447d854c3d0b34a9d542a5dff38ff.zip |
Add FFTS v0.7
Diffstat (limited to 'lib/ffts/src/vfp.s')
-rw-r--r-- | lib/ffts/src/vfp.s | 473 |
1 file changed, 473 insertions, 0 deletions
diff --git a/lib/ffts/src/vfp.s b/lib/ffts/src/vfp.s
new file mode 100644
index 0000000..8ced89d
--- /dev/null
+++ b/lib/ffts/src/vfp.s
@@ -0,0 +1,473 @@
+/*
+
+  This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+  Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
+  Copyright (c) 2012, 2013 The University of Waikato
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the organization nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+@ assumes r0 = out
+@         r1 = in ?
+@
+@         r12 = offsets
+@         r3-r10 = data pointers
+@         r11 = loop iterations
+@         r2 = const pointer
+@         & lr = temps
+
+    .align 4
+#ifdef __APPLE__
+    .globl _vfp_e
+_vfp_e:
+#else
+    .globl vfp_e
+vfp_e:
+#endif
+_vfp_e_loop:
+    vldr s15, [r2, #8]
+    vldr s2, [r3]   @ x0
+    vldr s0, [r3, #4]
+    vldr s4, [r4]   @ x1
+    vldr s11, [r2]
+    vldr s10, [r7]  @ x4
+    vldr s3, [r7, #4]
+    vldr s8, [r8]   @ x5
+    vldr s1, [r8, #4]
+    vldr s14, [r9]  @ x6
+    vldr s9, [r9, #4]
+    vldr s6, [r10]  @ x7
+    vldr s12, [r10, #4]
+    vsub.f32 s18, s3, s1
+    vsub.f32 s7, s10, s8
+    vsub.f32 s5, s14, s6
+    vadd.f32 s6, s14, s6
+    vldr s24, [r5, #4]
+    vsub.f32 s14, s9, s12
+    vldr s22, [r6, #4]
+    vadd.f32 s8, s10, s8
+    vldr s28, [r6]  @ x3
+    vldr s17, [r5]  @ x2
+    vadd.f32 s10, s9, s12
+    vmul.f32 s13, s18, s15
+    vmul.f32 s9, s7, s11
+    vmul.f32 s16, s5, s11
+    vmul.f32 s18, s18, s11
+    vmul.f32 s30, s14, s11
+    vldr s11, [r4, #4]
+    add r3, r3, #8
+    add r4, r4, #8
+    add r5, r5, #8
+    add r6, r6, #8
+    add r7, r7, #8
+    add r8, r8, #8
+    add r9, r9, #8
+    add r10, r10, #8
+    vmul.f32 s12, s5, s15
+    vmul.f32 s20, s14, s15
+    vadd.f32 s5, s2, s4
+    vadd.f32 s3, s3, s1
+    vmul.f32 s15, s7, s15
+    vadd.f32 s1, s24, s22
+    vsub.f32 s7, s24, s22
+    vadd.f32 s24, s17, s28
+    vadd.f32 s26, s0, s11
+    vsub.f32 s14, s9, s13
+    vsub.f32 s2, s2, s4
+    vadd.f32 s4, s16, s20
+    vsub.f32 s22, s0, s11
+    vsub.f32 s16, s17, s28
+    vadd.f32 s9, s5, s24
+    vadd.f32 s28, s18, s15
+    vadd.f32 s13, s8, s6
+    vsub.f32 s5, s5, s24
+    vsub.f32 s24, s8, s6
+    vadd.f32 s11, s26, s1
+    vsub.f32 s12, s30, s12
+    vadd.f32 s20, s3, s10
+    vsub.f32 s15, s3, s10
+    vsub.f32 s3, s26, s1
+    vadd.f32 s18, s9, s13
+    vadd.f32 s10, s14, s4
+    vadd.f32 s6, s2, s7  @
+    vsub.f32 s0, s2, s7  @
+    vadd.f32 s26, s11, s20
+    vsub.f32 s4, s14, s4
+    vsub.f32 s8, s22, s16  @
+    vadd.f32 s1, s28, s12
+ldr lr, [r12], #4
+add lr, r0, lr, lsl #2
+subs r11, r11, #1
+    vstr s18, [lr]
+    vsub.f32 s2, s28, s12
+    vadd.f32 s12, s22, s16  @
+    vsub.f32 s16, s3, s24  @
+    vsub.f32 s13, s9, s13
+    vstr s26, [lr, #4]
+    vadd.f32 s28, s5, s15  @
+    vsub.f32 s7, s5, s15  @
+    vadd.f32 s14, s6, s10
+    vadd.f32 s5, s8, s1
+    vadd.f32 s9, s0, s2  @
+    vsub.f32 s2, s0, s2  @
+    vsub.f32 s11, s11, s20
+    vstr s28, [lr, #16]
+    vadd.f32 s3, s3, s24  @
+    vstr s16, [lr, #20]
+    vsub.f32 s6, s6, s10
+    vstr s13, [lr, #32]
+    vsub.f32 s13, s12, s4  @
+    vsub.f32 s8, s8, s1
+    vadd.f32 s0, s12, s4  @
+    vstr s11, [lr, #36]
+    vstr s7, [lr, #48]
+    vstr s3, [lr, #52]
+    vstr s14, [lr, #8]
+    vstr s5, [lr, #12]
+    vstr s9, [lr, #24]
+    vstr s13, [lr, #28]
+    vstr s6, [lr, #40]
+    vstr s8, [lr, #44]
+    vstr s2, [lr, #56]
+    vstr s0, [lr, #60]
+    bne _vfp_e_loop
+
+@ assumes r0 = out
+@         r1 = in ?
+@
+@         r12 = offsets
+@         r3-r10 = data pointers
+@         r11 = loop iterations
+@         r2 & lr = temps
+    .align 4
+#ifdef __APPLE__
+    .globl _vfp_o
+_vfp_o:
+#else
+    .globl vfp_o
+vfp_o:
+#endif
+    _vfp_o_loop:
+    vldr s4, [r3]   @ x0
+    vldr s0, [r3, #4]
+    vldr s6, [r4]   @ x1
+    vldr s5, [r4, #4]
+    vldr s7, [r5]   @ x2
+    vldr s1, [r5, #4]
+    vldr s3, [r6]   @ x3
+    vldr s8, [r6, #4]
+    subs r11, r11, #1
+    ldr r2, [r12], #4
+    add r2, r0, r2, lsl #2
+    vadd.f32 s2, s4, s6
+    vadd.f32 s14, s0, s5
+    vadd.f32 s10, s1, s8
+    vsub.f32 s4, s4, s6
+    vsub.f32 s0, s0, s5
+    vadd.f32 s12, s7, s3
+    vsub.f32 s6, s7, s3
+    vsub.f32 s8, s1, s8
+    vadd.f32 s5, s14, s10
+    vsub.f32 s10, s14, s10
+    vadd.f32 s7, s2, s12
+    vsub.f32 s1, s0, s6  @
+    vsub.f32 s12, s2, s12
+    vadd.f32 s3, s4, s8  @
+    vsub.f32 s2, s4, s8  @
+    vadd.f32 s0, s0, s6  @
+    vstr s7, [r2]
+    vldr s7, [r9]   @ x2
+    vstr s5, [r2, #4]
+    vstr s3, [r2, #8]
+    vstr s1, [r2, #12]
+    vstr s12, [r2, #16]
+    vstr s10, [r2, #20]
+    vstr s2, [r2, #24]
+    vstr s0, [r2, #28]
+    vldr s4, [r7]   @ x0
+    vldr s0, [r7, #4]
+    vldr s6, [r8]   @ x1
+    vldr s5, [r8, #4]
+    vldr s3, [r10]  @ x3
+    vldr s8, [r10, #4]
+    vldr s1, [r9, #4]
+    add r3, r3, #8
+    add r4, r4, #8
+    add r5, r5, #8
+    add r6, r6, #8
+    add r7, r7, #8
+    add r8, r8, #8
+    add r9, r9, #8
+    add r10, r10, #8
+    vadd.f32 s2, s4, s6
+    vadd.f32 s14, s0, s5
+    vadd.f32 s10, s1, s8
+    vsub.f32 s4, s4, s6
+    vsub.f32 s0, s0, s5
+    vadd.f32 s12, s7, s3
+    vsub.f32 s6, s7, s3
+    vsub.f32 s8, s1, s8
+    vadd.f32 s5, s14, s10
+    vsub.f32 s10, s14, s10
+    vadd.f32 s7, s2, s12
+    vsub.f32 s1, s0, s6  @
+    vsub.f32 s12, s2, s12
+    vadd.f32 s3, s4, s8  @
+    vsub.f32 s2, s4, s8  @
+    vadd.f32 s0, s0, s6  @
+    vstr s7, [r2, #32]
+    vstr s5, [r2, #36]
+    vstr s3, [r2, #40]
+    vstr s1, [r2, #44]
+    vstr s12, [r2, #48]
+    vstr s10, [r2, #52]
+    vstr s2, [r2, #56]
+    vstr s0, [r2, #60]
+    bne _vfp_o_loop
+
+    .align 4
+#ifdef __APPLE__
+    .globl _vfp_x4
+_vfp_x4:
+#else
+    .globl vfp_x4
+vfp_x4:
+#endif
+    add r3, r0, #0
+    add r7, r2, #0
+    add r4, r0, r1, lsl #1
+    add r5, r0, r1, lsl #2
+    add r6, r4, r1, lsl #2
+    mov r11, #4
+_vfp_x4_loop:
+
+    vldr s8, [r3, #0]
+    vldr s9, [r3, #4]
+    vldr s10, [r4, #0]
+    vldr s11, [r4, #4]
+    vldr s12, [r5, #0]
+    vldr s13, [r5, #4]
+    vldr s14, [r6, #0]
+    vldr s15, [r6, #4]
+    vldr s2, [r7, #0]
+    vldr s3, [r7, #4]
+    add r7, r7, #8
+    subs r11, r11, #1
+    vmul.f32 s0, s13, s3
+    vmul.f32 s5, s12, s2
+    vmul.f32 s1, s14, s2
+    vmul.f32 s4, s14, s3
+    vmul.f32 s14, s12, s3
+    vmul.f32 s13, s13, s2
+    vmul.f32 s12, s15, s3
+    vmul.f32 s2, s15, s2
+    vsub.f32 s0, s5, s0
+    vadd.f32 s13, s13, s14
+    vadd.f32 s12, s12, s1
+    vsub.f32 s1, s2, s4
+    vadd.f32 s15, s0, s12
+    vsub.f32 s12, s0, s12
+    vadd.f32 s14, s13, s1
+    vsub.f32 s13, s13, s1
+    vadd.f32 s0, s8, s15
+    vadd.f32 s1, s9, s14
+    vadd.f32 s2, s10, s13  @
+    vsub.f32 s4, s8, s15
+    vsub.f32 s3, s11, s12  @
+    vstr s0, [r3, #0]
+    vstr s1, [r3, #4]
+    add r3, r3, #8
+    vsub.f32 s5, s9, s14
+    vsub.f32 s6, s10, s13  @
+    vadd.f32 s7, s11, s12  @
+    vstr s2, [r4, #0]
+    vstr s3, [r4, #4]
+    add r4, r4, #8
+    vstr s4, [r5, #0]
+    vstr s5, [r5, #4]
+    add r5, r5, #8
+    vstr s6, [r6, #0]
+    vstr s7, [r6, #4]
+    add r6, r6, #8
+    bne _vfp_x4_loop
+    bx lr
+
+    .align 4
+#ifdef __APPLE__
+    .globl _vfp_x8
+_vfp_x8:
+#else
+    .globl vfp_x8
+vfp_x8:
+#endif
+    mov r11, #0
+    add r3, r0, #0          @ data0
+    add r5, r0, r1, lsl #1  @ data2
+    add r4, r0, r1          @ data1
+    add r7, r5, r1, lsl #1  @ data4
+    add r6, r5, r1          @ data3
+    add r9, r7, r1, lsl #1  @ data6
+    add r8, r7, r1          @ data5
+    add r10, r9, r1         @ data7
+    add r12, r2, #0         @ LUT
+
+    sub r11, r11, r1, lsr #3
+_vfp_x8_loop:
+    vldr s10, [r3, #0]  @ x0-re
+    vldr s8, [r3, #4]   @ x0-im
+    vldr s2, [r4, #0]   @ x1-re
+    vldr s0, [r4, #4]   @ x1-im
+    vldr s6, [r5, #0]   @ x2-re
+    vldr s4, [r5, #4]   @ x2-im
+    vldr s13, [r6, #0]  @ x3-re
+    vldr s15, [r6, #4]  @ x3-im
+    vldr s7, [r12]
+    vldr s11, [r12, #4]
+    vldr s5, [r7, #0]   @ x4-re
+    vldr s1, [r7, #4]   @ x4-im
+    vldr s28, [r9, #0]  @ x6-re
+    vldr s18, [r9, #4]  @ x6-im
+    adds r11, r11, #1
+    vmul.f32 s14, s15, s7
+    vldr s24, [r12, #12]
+    vmul.f32 s12, s13, s11
+    vmul.f32 s26, s13, s7
+    vldr s13, [r12, #8]
+    vmul.f32 s3, s4, s11
+    vmul.f32 s15, s15, s11
+    vmul.f32 s16, s4, s7
+    vmul.f32 s9, s6, s7
+    vmul.f32 s11, s6, s11
+    vmul.f32 s7, s18, s24
+    vmul.f32 s20, s1, s24
+    vmul.f32 s30, s5, s13
+    vadd.f32 s4, s26, s15
+    vsub.f32 s12, s14, s12
+    vsub.f32 s6, s9, s3
+    vadd.f32 s14, s16, s11
+    vmul.f32 s22, s28, s13
+    vmul.f32 s26, s28, s24
+    vmul.f32 s18, s18, s13
+    vmul.f32 s5, s5, s24
+    vmul.f32 s1, s1, s13
+    vsub.f32 s9, s30, s20
+    vadd.f32 s16, s14, s12
+    vadd.f32 s3, s22, s7
+    vadd.f32 s15, s6, s4
+    vsub.f32 s11, s18, s26
+    vadd.f32 s18, s1, s5
+    vadd.f32 s13, s8, s16
+    vadd.f32 s1, s9, s3
+    vadd.f32 s7, s10, s15
+    vsub.f32 s15, s10, s15
+    vsub.f32 s10, s9, s3
+    vadd.f32 s5, s18, s11
+    vsub.f32 s11, s18, s11
+    vsub.f32 s8, s8, s16
+    vadd.f32 s20, s7, s1
+    vsub.f32 s7, s7, s1
+    vadd.f32 s18, s13, s5
+    vadd.f32 s16, s15, s11  @
+    vsub.f32 s9, s8, s10  @
+    vsub.f32 s3, s13, s5
+    vsub.f32 s1, s15, s11  @
+    vstr s20, [r3]
+    vadd.f32 s8, s8, s10  @
+    vstr s18, [r3, #4]
+    add r3, r3, #8
+    vstr s16, [r5]
+    vstr s9, [r5, #4]
+    add r5, r5, #8
+    vstr s7, [r7]
+    vstr s3, [r7, #4]
+    add r7, r7, #8
+    vstr s1, [r9]
+    vstr s8, [r9, #4]
+    add r9, r9, #8
+    vldr s10, [r8, #0]   @ x5-re
+    vldr s8, [r8, #4]    @ x5-im
+    vldr s5, [r10, #0]   @ x7-re
+    vldr s11, [r10, #4]  @ x7-im
+    vldr s1, [r12, #16]
+    vldr s15, [r12, #20]
+    add r12, r12, #24
+    vmul.f32 s9, s5, s1
+    vmul.f32 s3, s11, s15
+    vmul.f32 s13, s10, s1
+    vmul.f32 s7, s8, s15
+    vmul.f32 s5, s5, s15
+    vmul.f32 s11, s11, s1
+    vmul.f32 s10, s10, s15
+    vmul.f32 s15, s8, s1
+    vsub.f32 s1, s14, s12
+    vadd.f32 s8, s9, s3
+    vsub.f32 s3, s6, s4
+    vsub.f32 s12, s13, s7
+    vsub.f32 s5, s11, s5
+    vadd.f32 s7, s15, s10
+    vadd.f32 s4, s2, s1  @
+    vsub.f32 s2, s2, s1  @
+    vsub.f32 s6, s0, s3  @
+    vadd.f32 s10, s12, s8
+    vsub.f32 s9, s12, s8
+    vadd.f32 s0, s0, s3  @
+    vsub.f32 s1, s7, s5
+    vadd.f32 s14, s7, s5
+    vadd.f32 s7, s4, s10
+    vsub.f32 s8, s4, s10
+    vsub.f32 s12, s0, s9  @
+    vadd.f32 s3, s2, s1  @
+    vadd.f32 s5, s6, s14
+    vsub.f32 s4, s6, s14
+    vsub.f32 s2, s2, s1  @
+    vadd.f32 s0, s0, s9  @
+    vstr s7, [r4]
+    vstr s5, [r4, #4]
+    add r4, r4, #8
+    vstr s3, [r6]
+    vstr s12, [r6, #4]
+    add r6, r6, #8
+    vstr s8, [r8]
+    vstr s4, [r8, #4]
+    add r8, r8, #8
+    vstr s2, [r10]
+    vstr s0, [r10, #4]
+    add r10, r10, #8
+    bne _vfp_x8_loop
+    bx lr
+
+
+    .align 4
+#ifdef __APPLE__
+    .globl _vfp_end
+_vfp_end:
+#else
+    .globl vfp_end
+vfp_end:
+#endif
+    bx lr
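The inner loops of vfp_e and vfp_o address their output indirectly: each iteration pops a 32-bit entry from the offsets table in r12, scales it by four (`add lr, r0, lr, lsl #2`), and stores 16 consecutive floats (eight complex values) at the resulting address. Below is a minimal C sketch of that addressing scheme only; the names `store_block`, `out`, `offsets_cursor` and `result` are illustrative and do not appear in FFTS, which sets these registers up from generated code rather than from C like this.

```c
#include <stdint.h>

/* Sketch of the output addressing used by the vfp_e/vfp_o loops above
 * (hypothetical names; only the addressing mirrors the assembly). */
static void store_block(float *out, const uint32_t **offsets_cursor,
                        const float result[16])
{
    const uint32_t *offsets = *offsets_cursor;  /* r12 walks a table of 32-bit offsets */
    float *dst = out + offsets[0];              /* add lr, r0, lr, lsl #2: each entry is
                                                   an index in floats, scaled by 4 bytes */
    *offsets_cursor = offsets + 1;              /* ldr lr, [r12], #4 (post-increment)   */

    for (int i = 0; i < 16; ++i)                /* vstr s.., [lr, #0] .. [lr, #60]      */
        dst[i] = result[i];
}
```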
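The vfp_x4 and vfp_x8 prologues derive their working pointers from the first three argument registers. The sketch below re-expresses that pointer arithmetic in C under the assumption that r0 is a data pointer, r1 a byte stride, and r2 a twiddle/LUT pointer; the names `data`, `stride_bytes` and `ws` are assumptions, not taken from this diff, and only the arithmetic mirrors the assembly.

```c
#include <stddef.h>

/* Sketch of the vfp_x4 prologue: four sub-blocks spaced 2*r1 bytes apart,
 * plus the twiddle pointer from r2 (hypothetical parameter names). */
static void vfp_x4_setup_sketch(float *data, size_t stride_bytes, const float *ws)
{
    char *base = (char *)data;
    float *blk0 = (float *)(base + 0 * stride_bytes);  /* r3 = r0              */
    float *blk1 = (float *)(base + 2 * stride_bytes);  /* r4 = r0 + r1, lsl #1 */
    float *blk2 = (float *)(base + 4 * stride_bytes);  /* r5 = r0 + r1, lsl #2 */
    float *blk3 = (float *)(base + 6 * stride_bytes);  /* r6 = r4 + r1, lsl #2 */
    const float *twiddles = ws;                        /* r7 = r2              */

    /* mov r11, #4: the loop body runs exactly four times, advancing every
       pointer by 8 bytes (one complex float) per iteration. */
    (void)blk0; (void)blk1; (void)blk2; (void)blk3; (void)twiddles;
}

/* Sketch of the vfp_x8 prologue: eight sub-blocks spaced r1 bytes apart,
 * with the LUT pointer taken from r2. */
static void vfp_x8_setup_sketch(float *data, size_t stride_bytes, const float *ws)
{
    char *base = (char *)data;
    float *blk[8];
    const float *lut = ws;                              /* r12 = r2 (LUT)         */

    for (int k = 0; k < 8; ++k)                         /* r3..r10 = data0..data7 */
        blk[k] = (float *)(base + (size_t)k * stride_bytes);

    /* r11 starts at -(r1 >> 3) and the loop adds 1 until it reaches zero,
       so the body executes stride_bytes / 8 times: once per complex float
       in each of the eight sub-blocks. */
    size_t iterations = stride_bytes >> 3;

    (void)blk; (void)lut; (void)iterations;
}
```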