Diffstat (limited to 'mpg123_artsplugin/mpg123/decode_i586.s')
-rw-r--r-- | mpg123_artsplugin/mpg123/decode_i586.s | 323
1 file changed, 323 insertions, 0 deletions
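The diff below adds synth_1to1_pent, a hand-scheduled i586 replacement for the decoder's C synth_1to1; it still calls the shared dct64 and reads the decwin window table. Each pass through the first FPU loop (.L55) computes one output sample as a 16-tap windowed dot product with alternating signs, which the reordered flds/fmuls/fxch/faddp/fsubrp chains keep flowing through the Pentium's pipelined FPU. A minimal C sketch of that arithmetic, matching the C synth_1to1 it mirrors (the helper name synth_sample and the double accumulator are illustrative, not taken from the source):

/* One sample of the .L55 loop: 16 consecutive window taps,
 * even taps added, odd taps subtracted, as in the C synth_1to1.
 * window points into decwin (%ecx), b0 into the dct64 output (%ebx). */
static double synth_sample(const float *window, const float *b0)
{
    double sum = 0.0;
    for (int i = 0; i < 16; i += 2) {
        sum += (double)window[i]     * b0[i];     /* faddp taps  */
        sum -= (double)window[i + 1] * b0[i + 1]; /* fsubrp taps */
    }
    return sum; /* rounded, clipped, and stored; see the note after the diff */
}

Between samples the pointers advance as in the assembly: b0 by 16 floats (addl $64,%ebx) and window by 32 floats (subl $-128,%ecx).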
diff --git a/mpg123_artsplugin/mpg123/decode_i586.s b/mpg123_artsplugin/mpg123/decode_i586.s
new file mode 100644
index 00000000..5b5169a5
--- /dev/null
+++ b/mpg123_artsplugin/mpg123/decode_i586.s
@@ -0,0 +1,323 @@
+#
+# synth_1to1 works the same way as the C version of this
+# file. only two types of changes have been made:
+# - reordered floating point instructions to
+#   prevent pipeline stalls
+# - made WRITE_SAMPLE use integer instead of
+#   (slower) floating point
+# all kinds of x86 processors should benefit from these
+# modifications.
+#
+# useful sources of information on optimizing x86 code include:
+#
+# Intel Architecture Optimization Manual
+# http://www.intel.com/design/pentium/manuals/242816.htm
+#
+# Cyrix 6x86 Instruction Set Summary
+# ftp://ftp.cyrix.com/6x86/6x-dbch6.pdf
+#
+# AMD-K5 Processor Software Development
+# http://www.amd.com/products/cpg/techdocs/appnotes/20007e.pdf
+#
+# Stefan Bieschewski <stb@acm.org>
+#
+# You can use this part under GPL.
+#
+# $Id$
+#
+.bss
+    .comm buffs,4352,4
+.data
+    .align 4
+bo:
+    .long 1
+.section .rodata
+    .align 8
+.LC0:
+    .long 0x0,0x40dfffc0
+    .align 8
+.LC1:
+    .long 0x0,0xc0e00000
+    .align 8
+.text
+.globl synth_1to1_pent
+synth_1to1_pent:
+    subl $12,%esp
+    pushl %ebp
+    pushl %edi
+    pushl %esi
+    pushl %ebx
+    movl 32(%esp),%eax
+    movl 40(%esp),%esi
+    xorl %edi,%edi
+    movl bo,%ebp
+    cmpl %edi,36(%esp)
+    jne .L48
+    decl %ebp
+    andl $15,%ebp
+    movl %ebp,bo
+    movl $buffs,%ecx
+    jmp .L49
+.L48:
+    addl $2,%esi
+    movl $buffs+2176,%ecx
+.L49:
+    testl $1,%ebp
+    je .L50
+    movl %ecx,%ebx
+    movl %ebp,16(%esp)
+    pushl %eax
+    movl 20(%esp),%edx
+    leal (%ebx,%edx,4),%eax
+    pushl %eax
+    movl 24(%esp),%eax
+    incl %eax
+    andl $15,%eax
+    leal 1088(,%eax,4),%eax
+    addl %ebx,%eax
+    jmp .L74
+.L50:
+    leal 1088(%ecx),%ebx
+    leal 1(%ebp),%edx
+    movl %edx,16(%esp)
+    pushl %eax
+    leal 1092(%ecx,%ebp,4),%eax
+    pushl %eax
+    leal (%ecx,%ebp,4),%eax
+.L74:
+    pushl %eax
+    call dct64
+    addl $12,%esp
+    movl 16(%esp),%edx
+    leal 0(,%edx,4),%edx
+    movl $decwin+64,%eax
+    movl %eax,%ecx
+    subl %edx,%ecx
+    movl $16,%ebp
+.L55:
+    flds (%ecx)
+    fmuls (%ebx)
+    flds 4(%ecx)
+    fmuls 4(%ebx)
+    fxch %st(1)
+    flds 8(%ecx)
+    fmuls 8(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds 12(%ecx)
+    fmuls 12(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 16(%ecx)
+    fmuls 16(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds 20(%ecx)
+    fmuls 20(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 24(%ecx)
+    fmuls 24(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds 28(%ecx)
+    fmuls 28(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 32(%ecx)
+    fmuls 32(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds 36(%ecx)
+    fmuls 36(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 40(%ecx)
+    fmuls 40(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds 44(%ecx)
+    fmuls 44(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 48(%ecx)
+    fmuls 48(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds 52(%ecx)
+    fmuls 52(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 56(%ecx)
+    fmuls 56(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds 60(%ecx)
+    fmuls 60(%ebx)
+    fxch %st(2)
+    subl $4,%esp
+    faddp %st,%st(1)
+    fxch %st(1)
+    fsubrp %st,%st(1)
+    fistpl (%esp)
+    popl %eax
+    cmpl $32767,%eax
+    jg 1f
+    cmpl $-32768,%eax
+    jl 2f
+    movw %ax,(%esi)
+    jmp 4f
+1:  movw $32767,(%esi)
+    jmp 3f
+2:  movw $-32768,(%esi)
+3:  incl %edi
+4:
+.L54:
+    addl $64,%ebx
+    subl $-128,%ecx
+    addl $4,%esi
+    decl %ebp
+    jnz .L55
+    flds (%ecx)
+    fmuls (%ebx)
+    flds 8(%ecx)
+    fmuls 8(%ebx)
+    flds 16(%ecx)
+    fmuls 16(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 24(%ecx)
+    fmuls 24(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 32(%ecx)
+    fmuls 32(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 40(%ecx)
+    fmuls 40(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 48(%ecx)
+    fmuls 48(%ebx)
+    fxch %st(2)
+    faddp %st,%st(1)
+    flds 56(%ecx)
+    fmuls 56(%ebx)
+    fxch %st(2)
+    subl $4,%esp
+    faddp %st,%st(1)
+    fxch %st(1)
+    faddp %st,%st(1)
+    fistpl (%esp)
+    popl %eax
+    cmpl $32767,%eax
+    jg 1f
+    cmpl $-32768,%eax
+    jl 2f
+    movw %ax,(%esi)
+    jmp 4f
+1:  movw $32767,(%esi)
+    jmp 3f
+2:  movw $-32768,(%esi)
+3:  incl %edi
+4:
+.L62:
+    addl $-64,%ebx
+    addl $4,%esi
+    movl 16(%esp),%edx
+    leal -128(%ecx,%edx,8),%ecx
+    movl $15,%ebp
+.L68:
+    flds -4(%ecx)
+    fchs
+    fmuls (%ebx)
+    flds -8(%ecx)
+    fmuls 4(%ebx)
+    fxch %st(1)
+    flds -12(%ecx)
+    fmuls 8(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -16(%ecx)
+    fmuls 12(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -20(%ecx)
+    fmuls 16(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -24(%ecx)
+    fmuls 20(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -28(%ecx)
+    fmuls 24(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -32(%ecx)
+    fmuls 28(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -36(%ecx)
+    fmuls 32(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -40(%ecx)
+    fmuls 36(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -44(%ecx)
+    fmuls 40(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -48(%ecx)
+    fmuls 44(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -52(%ecx)
+    fmuls 48(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -56(%ecx)
+    fmuls 52(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds -60(%ecx)
+    fmuls 56(%ebx)
+    fxch %st(2)
+    fsubrp %st,%st(1)
+    flds (%ecx)
+    fmuls 60(%ebx)
+    fxch %st(2)
+    subl $4,%esp
+    fsubrp %st,%st(1)
+    fxch %st(1)
+    fsubrp %st,%st(1)
+    fistpl (%esp)
+    popl %eax
+    cmpl $32767,%eax
+    jg 1f
+    cmpl $-32768,%eax
+    jl 2f
+    movw %ax,(%esi)
+    jmp 4f
+1:  movw $32767,(%esi)
+    jmp 3f
+2:  movw $-32768,(%esi)
+3:  incl %edi
+4:
+.L67:
+    addl $-64,%ebx
+    addl $-128,%ecx
+    addl $4,%esi
+    decl %ebp
+    jnz .L68
+    movl %edi,%eax
+    popl %ebx
+    popl %esi
+    popl %edi
+    popl %ebp
+    addl $12,%esp
+    ret
+
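The three fistpl / cmpl $32767 / cmpl $-32768 sequences above are the integer WRITE_SAMPLE announced in the header comment: the FPU rounds the accumulated sum straight to a 32-bit integer, and clipping is then two integer compares rather than the C macro's slower floating-point compares. A minimal C sketch of the same store; the helper name write_sample_clipped is hypothetical, and the return value stands in for the clip counter the assembly keeps in %edi:

#include <math.h>
#include <stdint.h>

/* Saturating 16-bit store performed after each dot product.
 * lrint() is the closest portable stand-in for fistpl, which
 * rounds in the FPU's current mode (a plain cast would truncate). */
static int write_sample_clipped(double sum, int16_t *out)
{
    long v = lrint(sum);   /* fistpl (%esp); popl %eax     */
    if (v > 32767) {       /* cmpl $32767,%eax; jg 1f      */
        *out = 32767;
        return 1;          /* 3: incl %edi counts the clip */
    }
    if (v < -32768) {      /* cmpl $-32768,%eax; jl 2f     */
        *out = -32768;
        return 1;
    }
    *out = (int16_t)v;     /* movw %ax,(%esi)              */
    return 0;
}

Summing the return values over all 32 samples reproduces the clip count that synth_1to1_pent hands back in %eax (movl %edi,%eax before the epilogue).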