summary refs log tree commit diff stats
path: root/xorg
diff options
context:
space:
mode:
Diffstat (limited to 'xorg')
-rw-r--r-- xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm 185
1 files changed, 64 insertions, 121 deletions
diff --git a/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm b/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
index 1e67ff68..4d7638ef 100644
--- a/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
+++ b/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
@@ -1,15 +1,24 @@
+; RGB to YUV
+; 0.299 0.587 0.114
+; -0.14713 -0.28886 0.436
+; 0.615 -0.51499 -0.10001
+; YUV to RGB
+; 1 0 1.13983
+; 1 -0.39465 -0.58060
+; 1 2.03211 0
+; shift left 12
+; 4096 0 4669
+; 4096 -1616 -2378
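+; 4096 9324 0 ; NOTE(review): 2.03211 * 4096 = 8323.5, i.e. about 8324, not 9324 -- confirm this entry and the c9324 constant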
+
SECTION .data
align 16
-c8 times 4 dd 8
-c16 times 4 dd 16
-c100 times 4 dd 100
-c128 times 4 dd 128
-c208 times 4 dd 208
-c255 times 4 dd 255
-c298 times 4 dd 298
-c409 times 4 dd 409
-c516 times 4 dd 516
+c128 times 8 dw 128
+c4669 times 8 dw 4669
+c1616 times 8 dw 1616
+c2378 times 8 dw 2378
+c9324 times 8 dw 9324
SECTION .text
@@ -19,136 +28,70 @@ SECTION .text
%1:
%endmacro
-y1_do4:
- ; y
- movd xmm0, [esi] ; 4 at a time
- add esi, 4
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6
- punpcklwd xmm0, xmm6
- movdqa xmm7, [c16]
- psubd xmm0, xmm7
+do8_uv:
; u
- movd xmm1, [ebx] ; read 4 but only using 2
- add ebx, 2
+ movd xmm1, [ebx] ; 4 at a time
+ add ebx, 4
punpcklbw xmm1, xmm1
+ pxor xmm6, xmm6
punpcklbw xmm1, xmm6
- punpcklwd xmm1, xmm6
movdqa xmm7, [c128]
- psubd xmm1, xmm7
+ psubw xmm1, xmm7
+ psllw xmm1, 4
; v
- movd xmm2, [edx] ; read 4 but only using 2
- add edx, 2
+ movd xmm2, [edx] ; 4 at a time
+ add edx, 4
punpcklbw xmm2, xmm2
punpcklbw xmm2, xmm6
- punpcklwd xmm2, xmm6
- psubd xmm2, xmm7
-
- ; t = (298 * c + 409 * e + 128) >> 8;
- movdqa xmm3, [c298]
- pmulld xmm3, xmm0
- movdqa xmm4, [c409]
- pmulld xmm4, xmm2
- paddd xmm3, xmm4
- paddd xmm3, xmm7
- psrad xmm3, 8
-
- ; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
- movdqa xmm4, [c298]
- pmulld xmm4, xmm0
- movdqa xmm5, [c100]
- pmulld xmm5, xmm1
- movdqa xmm6, [c208]
- pmulld xmm6, xmm2
- psubd xmm4, xmm5
- psubd xmm4, xmm6
- paddd xmm4, xmm7
- psrad xmm4, 8
-
- ; t = (298 * c + 516 * d + 128) >> 8;
- movdqa xmm5, [c298]
- pmulld xmm5, xmm0
- movdqa xmm6, [c516]
- pmulld xmm6, xmm1
- paddd xmm5, xmm6
- paddd xmm5, xmm7
- psrad xmm5, 8
-
- packusdw xmm3, xmm3 ; b
- packuswb xmm3, xmm3
- packusdw xmm4, xmm4 ; g
- packuswb xmm4, xmm4
- punpcklbw xmm3, xmm4 ; gb
+ psubw xmm2, xmm7
+ psllw xmm2, 4
- pxor xmm4, xmm4 ; a
- packusdw xmm5, xmm5 ; b
- packuswb xmm5, xmm5
- punpcklbw xmm5, xmm4 ; ar
+do8:
- punpcklwd xmm3, xmm5 ; argb
- movdqu [edi], xmm3
- add edi, 16
-
- ret;
-
-y2_do4:
; y
- movd xmm0, [esi] ; read 4 but only using 2
- add esi, 4
+ movq xmm0, [esi] ; 8 at a time
+ add esi, 8
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
- punpcklwd xmm0, xmm6
- movdqa xmm7, [c16]
- psubd xmm0, xmm7
-
- movdqa xmm7, [c128]
- ; t = (298 * c + 409 * e + 128) >> 8;
- movdqa xmm3, [c298]
- pmulld xmm3, xmm0
- movdqa xmm4, [c409]
- pmulld xmm4, xmm2
- paddd xmm3, xmm4
- paddd xmm3, xmm7
- psrad xmm3, 8
-
- ; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
- movdqa xmm4, [c298]
- pmulld xmm4, xmm0
- movdqa xmm5, [c100]
- pmulld xmm5, xmm1
- movdqa xmm6, [c208]
- pmulld xmm6, xmm2
- psubd xmm4, xmm5
- psubd xmm4, xmm6
- paddd xmm4, xmm7
- psrad xmm4, 8
-
- ; t = (298 * c + 516 * d + 128) >> 8;
- movdqa xmm5, [c298]
- pmulld xmm5, xmm0
- movdqa xmm6, [c516]
- pmulld xmm6, xmm1
- paddd xmm5, xmm6
- paddd xmm5, xmm7
- psrad xmm5, 8
-
- packusdw xmm3, xmm3 ; b
- packuswb xmm3, xmm3
- packusdw xmm4, xmm4 ; g
- packuswb xmm4, xmm4
+ ; r = y + hiword(4669 * (v << 4))
+ movdqa xmm4, [c4669]
+ pmulhw xmm4, xmm2
+ movdqa xmm3, xmm0
+ paddw xmm3, xmm4
+
+ ; g = y - hiword(1616 * (u << 4)) - hiword(2378 * (v << 4))
+ movdqa xmm5, [c1616]
+ pmulhw xmm5, xmm1
+ movdqa xmm6, [c2378]
+ pmulhw xmm6, xmm2
+ movdqa xmm4, xmm0
+ psubw xmm4, xmm5
+ psubw xmm4, xmm6
+
+ ; b = y + hiword(9324 * (u << 4))
+ movdqa xmm6, [c9324]
+ pmulhw xmm6, xmm1
+ movdqa xmm5, xmm0
+ paddw xmm5, xmm6
+
+ packuswb xmm3, xmm3 ; b
+ packuswb xmm4, xmm4 ; g
punpcklbw xmm3, xmm4 ; gb
pxor xmm4, xmm4 ; a
- packusdw xmm5, xmm5 ; b
- packuswb xmm5, xmm5
+ packuswb xmm5, xmm5 ; r
punpcklbw xmm5, xmm4 ; ar
+ movdqa xmm4, xmm3
punpcklwd xmm3, xmm5 ; argb
movdqu [edi], xmm3
add edi, 16
+ punpckhwd xmm4, xmm5 ; argb
+ movdqu [edi], xmm4
+ add edi, 16
ret;
@@ -201,14 +144,14 @@ PROC yv12_to_rgb32_x86_sse2
loop_y:
mov ecx, edx ; width
- shr ecx, 2
+ shr ecx, 3
; save edx
mov [esp + 24], edx
prefetchnta 4096[esp + 0] ; y
- prefetchnta 4096[esp + 8] ; u
- prefetchnta 4096[esp + 12] ; v
+ prefetchnta 1024[esp + 8] ; u
+ prefetchnta 1024[esp + 12] ; v
loop_x:
@@ -218,7 +161,7 @@ loop_x:
mov edi, [esp + 16] ; rgbs1
; y1
- call y1_do4
+ call do8_uv
mov [esp + 0], esi ; y1
mov [esp + 16], edi ; rgbs1
@@ -227,7 +170,7 @@ loop_x:
mov edi, [esp + 20] ; rgbs2
; y2
- call y2_do4
+ call do8
mov [esp + 4], esi ; y2
mov [esp + 8], ebx ; u