diff options
Diffstat (limited to 'xorg/server/module/amd64')
-rw-r--r-- | xorg/server/module/amd64/cpuid_amd64.asm | 41 | ||||
-rw-r--r-- | xorg/server/module/amd64/funcs_amd64.h | 39 | ||||
-rw-r--r-- | xorg/server/module/amd64/i420_to_rgb32_amd64_sse2.asm | 248 | ||||
-rw-r--r-- | xorg/server/module/amd64/uyvy_to_rgb32_amd64_sse2.asm | 17 | ||||
-rw-r--r-- | xorg/server/module/amd64/yuy2_to_rgb32_amd64_sse2.asm | 17 | ||||
-rw-r--r-- | xorg/server/module/amd64/yv12_to_rgb32_amd64_sse2.asm | 248 |
6 files changed, 610 insertions, 0 deletions
diff --git a/xorg/server/module/amd64/cpuid_amd64.asm b/xorg/server/module/amd64/cpuid_amd64.asm new file mode 100644 index 00000000..b97937ad --- /dev/null +++ b/xorg/server/module/amd64/cpuid_amd64.asm @@ -0,0 +1,41 @@ + +SECTION .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +;The first six integer or pointer arguments are passed in registers +;RDI, RSI, RDX, RCX, R8, and R9 + +;int +;cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx) + +PROC cpuid_amd64 + ; save registers + push rbx + + push rdx + push rcx + push r8 + push r9 + + mov rax, rdi + mov rcx, rsi + cpuid + pop rdi + mov [rdi], edx + pop rdi + mov [rdi], ecx + pop rdi + mov [rdi], ebx + pop rdi + mov [rdi], eax + mov eax, 0 + ; restore registers + pop rbx + ret; + align 16 + diff --git a/xorg/server/module/amd64/funcs_amd64.h b/xorg/server/module/amd64/funcs_amd64.h new file mode 100644 index 00000000..10cffe0d --- /dev/null +++ b/xorg/server/module/amd64/funcs_amd64.h @@ -0,0 +1,39 @@ +/* +Copyright 2014 Jay Sorg + +Permission to use, copy, modify, distribute, and sell this software and its +documentation for any purpose is hereby granted without fee, provided that +the above copyright notice appear in all copies and that both that +copyright notice and this permission notice appear in supporting +documentation. + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +amd64 asm functions + +*/ + +#ifndef __FUNCS_AMD64_H +#define __FUNCS_AMD64_H + +int +cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx); +int +yv12_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs); +int +i420_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs); +int +yuy2_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs); +int +uyvy_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs); + +#endif + diff --git a/xorg/server/module/amd64/i420_to_rgb32_amd64_sse2.asm b/xorg/server/module/amd64/i420_to_rgb32_amd64_sse2.asm new file mode 100644 index 00000000..d9760caa --- /dev/null +++ b/xorg/server/module/amd64/i420_to_rgb32_amd64_sse2.asm @@ -0,0 +1,248 @@ +; +;Copyright 2014 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;I420 to RGB32 +;amd64 SSE2 32 bit +; +; RGB to YUV +; 0.299 0.587 0.114 +; -0.14713 -0.28886 0.436 +; 0.615 -0.51499 -0.10001 +; YUV to RGB +; 1 0 1.13983 +; 1 -0.39465 -0.58060 +; 1 2.03211 0 +; shift left 12 +; 4096 0 4669 +; 4096 -1616 -2378 +; 4096 9324 0 + +SECTION .data +align 16 +c128 times 8 dw 128 +c4669 times 8 dw 4669 +c1616 times 8 dw 1616 +c2378 times 8 dw 2378 +c9324 times 8 dw 9324 + +SECTION .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +do8_uv: + + ; v + movd xmm1, [rbx] ; 4 at a time + lea rbx, [rbx + 4] + punpcklbw xmm1, xmm1 + pxor xmm6, xmm6 + punpcklbw xmm1, xmm6 + movdqa xmm7, [rel c128] + psubw xmm1, xmm7 + psllw xmm1, 4 + + ; v + movd xmm2, [rdx] ; 4 at a time + lea rdx, [rdx + 4] + punpcklbw xmm2, xmm2 + punpcklbw xmm2, xmm6 + psubw xmm2, xmm7 + psllw xmm2, 4 + +do8: + + ; y + movq xmm0, [rsi] ; 8 at a time + lea rsi, [rsi + 8] + pxor xmm6, xmm6 + punpcklbw xmm0, xmm6 + + ; r = y + hiword(4669 * (v << 4)) + movdqa xmm4, [rel c4669] + pmulhw xmm4, xmm1 + movdqa xmm3, xmm0 + paddw xmm3, xmm4 + + ; g = y - hiword(1616 * (u << 4)) - hiword(2378 * (v << 4)) + movdqa xmm5, [rel c1616] + pmulhw xmm5, xmm2 + movdqa xmm6, [rel c2378] + pmulhw xmm6, xmm1 + movdqa xmm4, xmm0 + psubw xmm4, xmm5 + psubw xmm4, xmm6 + + ; b = y + hiword(9324 * (u << 4)) + movdqa xmm6, [rel c9324] + pmulhw xmm6, xmm2 + movdqa xmm5, xmm0 + paddw xmm5, xmm6 + + packuswb xmm3, xmm3 ; b + packuswb xmm4, xmm4 ; g + punpcklbw xmm3, xmm4 ; gb + + pxor xmm4, xmm4 ; a + packuswb xmm5, xmm5 ; r + punpcklbw xmm5, xmm4 ; ar + + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 ; argb + movdqa [rdi], xmm3 + lea rdi, [rdi + 16] + punpckhwd xmm4, xmm5 ; argb + movdqa [rdi], xmm4 + lea rdi, [rdi + 16] + + ret; + +;The first six integer or pointer arguments are passed in registers +; RDI, RSI, RDX, RCX, R8, and R9 + +;int +;i420_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs) + +PROC i420_to_rgb32_amd64_sse2 + push rbx + push rsi + push rdi + push rbp + + push rdi + push rdx + mov rdi, rcx ; rgbs + + mov rcx, rsi ; width + mov rdx, rcx + pop rbp ; height + mov rax, rbp + shr rbp, 1 + imul rax, rcx ; rax = width * height + + pop rsi ; y + + mov rbx, rsi ; u = y + width * height + add rbx, rax + + ; local vars + ; char* yptr1 + ; char* yptr2 + ; char* uptr + ; char* vptr + ; int* rgbs1 + ; int* rgbs2 + ; int width + sub rsp, 56 ; local vars, 56 bytes + mov [rsp + 0], rsi ; save y1 + add rsi, rdx + mov [rsp + 8], rsi ; save y2 + mov [rsp + 16], rbx ; save u + shr rax, 2 + add rbx, rax ; v = u + (width * height / 4) + mov [rsp + 24], rbx ; save v + + mov [rsp + 32], rdi ; save rgbs1 + mov rax, rdx + shl rax, 2 + add rdi, rax + mov [rsp + 40], rdi ; save rgbs2 + +loop_y: + + mov rcx, rdx ; width + shr rcx, 3 + + ; save rdx + mov [rsp + 48], rdx + + ;prefetchnta 4096[rsp + 0] ; y + ;prefetchnta 1024[rsp + 16] ; u + ;prefetchnta 1024[rsp + 24] ; v + +loop_x: + + mov rsi, [rsp + 0] ; y1 + mov rbx, [rsp + 16] ; u + mov rdx, [rsp + 24] ; v + mov rdi, [rsp + 32] ; rgbs1 + + ; y1 + call do8_uv + + mov [rsp + 0], rsi ; y1 + mov [rsp + 32], rdi ; rgbs1 + + mov rsi, [rsp + 8] ; y2 + mov rdi, [rsp + 40] ; rgbs2 + + ; y2 + call do8 + + mov [rsp + 8], rsi ; y2 + mov [rsp + 16], rbx ; u + mov [rsp + 24], rdx ; v + mov [rsp + 40], rdi ; rgbs2 + + dec rcx ; width + jnz loop_x + + ; restore rdx + mov rdx, [rsp + 48] + + ; update y1 and 2 + mov rax, [rsp + 0] + mov rbx, rdx + add rax, rbx + mov [rsp + 0], rax + + mov rax, [rsp + 8] + add rax, rbx + mov [rsp + 8], rax + + ; update rgb1 and 2 + mov rax, [rsp + 32] + mov rbx, rdx + shl rbx, 2 + add rax, rbx + mov [rsp + 32], rax + + mov rax, [rsp + 40] + add rax, rbx + mov [rsp + 40], rax + + mov rcx, rbp + dec rcx ; height + mov rbp, rcx + jnz loop_y + + add rsp, 56 + + mov rax, 0 + pop rbp + pop rdi + pop rsi + pop rbx + ret + align 16 + + diff --git a/xorg/server/module/amd64/uyvy_to_rgb32_amd64_sse2.asm b/xorg/server/module/amd64/uyvy_to_rgb32_amd64_sse2.asm new file mode 100644 index 00000000..8866fd0f --- /dev/null +++ b/xorg/server/module/amd64/uyvy_to_rgb32_amd64_sse2.asm @@ -0,0 +1,17 @@ + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +;int +;uyvy_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs) + +PROC uyvy_to_rgb32_amd64_sse2 + push rbx + mov rax, 0 + pop rbx + ret + align 16 + diff --git a/xorg/server/module/amd64/yuy2_to_rgb32_amd64_sse2.asm b/xorg/server/module/amd64/yuy2_to_rgb32_amd64_sse2.asm new file mode 100644 index 00000000..c0ac5c1b --- /dev/null +++ b/xorg/server/module/amd64/yuy2_to_rgb32_amd64_sse2.asm @@ -0,0 +1,17 @@ + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +;int +;yuy2_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs) + +PROC yuy2_to_rgb32_amd64_sse2 + push rbx + mov rax, 0 + pop rbx + ret + align 16 + diff --git a/xorg/server/module/amd64/yv12_to_rgb32_amd64_sse2.asm b/xorg/server/module/amd64/yv12_to_rgb32_amd64_sse2.asm new file mode 100644 index 00000000..13e46878 --- /dev/null +++ b/xorg/server/module/amd64/yv12_to_rgb32_amd64_sse2.asm @@ -0,0 +1,248 @@ +; +;Copyright 2014 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;YV12 to RGB32 +;amd64 SSE2 32 bit +; +; RGB to YUV +; 0.299 0.587 0.114 +; -0.14713 -0.28886 0.436 +; 0.615 -0.51499 -0.10001 +; YUV to RGB +; 1 0 1.13983 +; 1 -0.39465 -0.58060 +; 1 2.03211 0 +; shift left 12 +; 4096 0 4669 +; 4096 -1616 -2378 +; 4096 9324 0 + +SECTION .data +align 16 +c128 times 8 dw 128 +c4669 times 8 dw 4669 +c1616 times 8 dw 1616 +c2378 times 8 dw 2378 +c9324 times 8 dw 9324 + +SECTION .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +do8_uv: + + ; u + movd xmm1, [rbx] ; 4 at a time + lea rbx, [rbx + 4] + punpcklbw xmm1, xmm1 + pxor xmm6, xmm6 + punpcklbw xmm1, xmm6 + movdqa xmm7, [rel c128] + psubw xmm1, xmm7 + psllw xmm1, 4 + + ; v + movd xmm2, [rdx] ; 4 at a time + lea rdx, [rdx + 4] + punpcklbw xmm2, xmm2 + punpcklbw xmm2, xmm6 + psubw xmm2, xmm7 + psllw xmm2, 4 + +do8: + + ; y + movq xmm0, [rsi] ; 8 at a time + lea rsi, [rsi + 8] + pxor xmm6, xmm6 + punpcklbw xmm0, xmm6 + + ; r = y + hiword(4669 * (v << 4)) + movdqa xmm4, [rel c4669] + pmulhw xmm4, xmm2 + movdqa xmm3, xmm0 + paddw xmm3, xmm4 + + ; g = y - hiword(1616 * (u << 4)) - hiword(2378 * (v << 4)) + movdqa xmm5, [rel c1616] + pmulhw xmm5, xmm1 + movdqa xmm6, [rel c2378] + pmulhw xmm6, xmm2 + movdqa xmm4, xmm0 + psubw xmm4, xmm5 + psubw xmm4, xmm6 + + ; b = y + hiword(9324 * (u << 4)) + movdqa xmm6, [rel c9324] + pmulhw xmm6, xmm1 + movdqa xmm5, xmm0 + paddw xmm5, xmm6 + + packuswb xmm3, xmm3 ; b + packuswb xmm4, xmm4 ; g + punpcklbw xmm3, xmm4 ; gb + + pxor xmm4, xmm4 ; a + packuswb xmm5, xmm5 ; r + punpcklbw xmm5, xmm4 ; ar + + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 ; argb + movdqa [rdi], xmm3 + lea rdi, [rdi + 16] + punpckhwd xmm4, xmm5 ; argb + movdqa [rdi], xmm4 + lea rdi, [rdi + 16] + + ret; + +;The first six integer or pointer arguments are passed in registers +; RDI, RSI, RDX, RCX, R8, and R9 + +;int +;yv12_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs) + +PROC yv12_to_rgb32_amd64_sse2 + push rbx + push rsi + push rdi + push rbp + + push rdi + push rdx + mov rdi, rcx ; rgbs + + mov rcx, rsi ; width + mov rdx, rcx + pop rbp ; height + mov rax, rbp + shr rbp, 1 + imul rax, rcx ; rax = width * height + + pop rsi ; y + + mov rbx, rsi ; u = y + width * height + add rbx, rax + + ; local vars + ; char* yptr1 + ; char* yptr2 + ; char* uptr + ; char* vptr + ; int* rgbs1 + ; int* rgbs2 + ; int width + sub rsp, 56 ; local vars, 56 bytes + mov [rsp + 0], rsi ; save y1 + add rsi, rdx + mov [rsp + 8], rsi ; save y2 + mov [rsp + 16], rbx ; save u + shr rax, 2 + add rbx, rax ; v = u + (width * height / 4) + mov [rsp + 24], rbx ; save v + + mov [rsp + 32], rdi ; save rgbs1 + mov rax, rdx + shl rax, 2 + add rdi, rax + mov [rsp + 40], rdi ; save rgbs2 + +loop_y: + + mov rcx, rdx ; width + shr rcx, 3 + + ; save rdx + mov [rsp + 48], rdx + + ;prefetchnta 4096[rsp + 0] ; y + ;prefetchnta 1024[rsp + 16] ; u + ;prefetchnta 1024[rsp + 24] ; v + +loop_x: + + mov rsi, [rsp + 0] ; y1 + mov rbx, [rsp + 16] ; u + mov rdx, [rsp + 24] ; v + mov rdi, [rsp + 32] ; rgbs1 + + ; y1 + call do8_uv + + mov [rsp + 0], rsi ; y1 + mov [rsp + 32], rdi ; rgbs1 + + mov rsi, [rsp + 8] ; y2 + mov rdi, [rsp + 40] ; rgbs2 + + ; y2 + call do8 + + mov [rsp + 8], rsi ; y2 + mov [rsp + 16], rbx ; u + mov [rsp + 24], rdx ; v + mov [rsp + 40], rdi ; rgbs2 + + dec rcx ; width + jnz loop_x + + ; restore rdx + mov rdx, [rsp + 48] + + ; update y1 and 2 + mov rax, [rsp + 0] + mov rbx, rdx + add rax, rbx + mov [rsp + 0], rax + + mov rax, [rsp + 8] + add rax, rbx + mov [rsp + 8], rax + + ; update rgb1 and 2 + mov rax, [rsp + 32] + mov rbx, rdx + shl rbx, 2 + add rax, rbx + mov [rsp + 32], rax + + mov rax, [rsp + 40] + add rax, rbx + mov [rsp + 40], rax + + mov rcx, rbp + dec rcx ; height + mov rbp, rcx + jnz loop_y + + add rsp, 56 + + mov rax, 0 + pop rbp + pop rdi + pop rsi + pop rbx + ret + align 16 + + |