diff options
Diffstat (limited to 'xorg')
-rw-r--r-- | xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm | 185 |
1 files changed, 64 insertions, 121 deletions
diff --git a/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm b/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm index 1e67ff68..4d7638ef 100644 --- a/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm +++ b/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm @@ -1,15 +1,24 @@ +; RGB to YUV +; 0.299 0.587 0.114 +; -0.14713 -0.28886 0.436 +; 0.615 -0.51499 -0.10001 +; YUV to RGB +; 1 0 1.13983 +; 1 -0.39465 -0.58060 +; 1 2.03211 0 +; shift left 12 +; 4096 0 4669 +; 4096 -1616 -2378 +; 4096 9324 0 + SECTION .data align 16 -c8 times 4 dd 8 -c16 times 4 dd 16 -c100 times 4 dd 100 -c128 times 4 dd 128 -c208 times 4 dd 208 -c255 times 4 dd 255 -c298 times 4 dd 298 -c409 times 4 dd 409 -c516 times 4 dd 516 +c128 times 8 dw 128 +c4669 times 8 dw 4669 +c1616 times 8 dw 1616 +c2378 times 8 dw 2378 +c9324 times 8 dw 9324 SECTION .text @@ -19,136 +28,70 @@ SECTION .text %1: %endmacro -y1_do4: - ; y - movd xmm0, [esi] ; 4 at a time - add esi, 4 - pxor xmm6, xmm6 - punpcklbw xmm0, xmm6 - punpcklwd xmm0, xmm6 - movdqa xmm7, [c16] - psubd xmm0, xmm7 +do8_uv: ; u - movd xmm1, [ebx] ; read 4 but only using 2 - add ebx, 2 + movd xmm1, [ebx] ; 4 at a time + add ebx, 4 punpcklbw xmm1, xmm1 + pxor xmm6, xmm6 punpcklbw xmm1, xmm6 - punpcklwd xmm1, xmm6 movdqa xmm7, [c128] - psubd xmm1, xmm7 + psubw xmm1, xmm7 + psllw xmm1, 4 ; v - movd xmm2, [edx] ; read 4 but only using 2 - add edx, 2 + movd xmm2, [edx] ; 4 at a time + add edx, 4 punpcklbw xmm2, xmm2 punpcklbw xmm2, xmm6 - punpcklwd xmm2, xmm6 - psubd xmm2, xmm7 - - ; t = (298 * c + 409 * e + 128) >> 8; - movdqa xmm3, [c298] - pmulld xmm3, xmm0 - movdqa xmm4, [c409] - pmulld xmm4, xmm2 - paddd xmm3, xmm4 - paddd xmm3, xmm7 - psrad xmm3, 8 - - ; t = (298 * c - 100 * d - 208 * e + 128) >> 8; - movdqa xmm4, [c298] - pmulld xmm4, xmm0 - movdqa xmm5, [c100] - pmulld xmm5, xmm1 - movdqa xmm6, [c208] - pmulld xmm6, xmm2 - psubd xmm4, xmm5 - psubd xmm4, xmm6 - paddd xmm4, xmm7 - psrad xmm4, 8 - - ; t = (298 * c + 516 * d + 128) >> 8; - movdqa xmm5, [c298] - pmulld xmm5, xmm0 - movdqa xmm6, [c516] - pmulld xmm6, xmm1 - paddd xmm5, xmm6 - paddd xmm5, xmm7 - psrad xmm5, 8 - - packusdw xmm3, xmm3 ; b - packuswb xmm3, xmm3 - packusdw xmm4, xmm4 ; g - packuswb xmm4, xmm4 - punpcklbw xmm3, xmm4 ; gb + psubw xmm2, xmm7 + psllw xmm2, 4 - pxor xmm4, xmm4 ; a - packusdw xmm5, xmm5 ; b - packuswb xmm5, xmm5 - punpcklbw xmm5, xmm4 ; ar +do8: - punpcklwd xmm3, xmm5 ; argb - movdqu [edi], xmm3 - add edi, 16 - - ret; - -y2_do4: ; y - movd xmm0, [esi] ; read 4 but only using 2 - add esi, 4 + movq xmm0, [esi] ; 8 at a time + add esi, 8 pxor xmm6, xmm6 punpcklbw xmm0, xmm6 - punpcklwd xmm0, xmm6 - movdqa xmm7, [c16] - psubd xmm0, xmm7 - - movdqa xmm7, [c128] - ; t = (298 * c + 409 * e + 128) >> 8; - movdqa xmm3, [c298] - pmulld xmm3, xmm0 - movdqa xmm4, [c409] - pmulld xmm4, xmm2 - paddd xmm3, xmm4 - paddd xmm3, xmm7 - psrad xmm3, 8 - - ; t = (298 * c - 100 * d - 208 * e + 128) >> 8; - movdqa xmm4, [c298] - pmulld xmm4, xmm0 - movdqa xmm5, [c100] - pmulld xmm5, xmm1 - movdqa xmm6, [c208] - pmulld xmm6, xmm2 - psubd xmm4, xmm5 - psubd xmm4, xmm6 - paddd xmm4, xmm7 - psrad xmm4, 8 - - ; t = (298 * c + 516 * d + 128) >> 8; - movdqa xmm5, [c298] - pmulld xmm5, xmm0 - movdqa xmm6, [c516] - pmulld xmm6, xmm1 - paddd xmm5, xmm6 - paddd xmm5, xmm7 - psrad xmm5, 8 - - packusdw xmm3, xmm3 ; b - packuswb xmm3, xmm3 - packusdw xmm4, xmm4 ; g - packuswb xmm4, xmm4 + ; r = y + hiword(4669 * (v << 4)) + movdqa xmm4, [c4669] + pmulhw xmm4, xmm2 + movdqa xmm3, xmm0 + paddw xmm3, xmm4 + + ; g = y - hiword(1616 * (u << 4)) - hiword(2378 * (v << 4)) + movdqa xmm5, [c1616] + pmulhw xmm5, xmm1 + movdqa xmm6, [c2378] + pmulhw xmm6, xmm2 + movdqa xmm4, xmm0 + psubw xmm4, xmm5 + psubw xmm4, xmm6 + + ; b = y + hiword(9324 * (u << 4)) + movdqa xmm6, [c9324] + pmulhw xmm6, xmm1 + movdqa xmm5, xmm0 + paddw xmm5, xmm6 + + packuswb xmm3, xmm3 ; b + packuswb xmm4, xmm4 ; g punpcklbw xmm3, xmm4 ; gb pxor xmm4, xmm4 ; a - packusdw xmm5, xmm5 ; b - packuswb xmm5, xmm5 + packuswb xmm5, xmm5 ; r punpcklbw xmm5, xmm4 ; ar + movdqa xmm4, xmm3 punpcklwd xmm3, xmm5 ; argb movdqu [edi], xmm3 add edi, 16 + punpckhwd xmm4, xmm5 ; argb + movdqu [edi], xmm4 + add edi, 16 ret; @@ -201,14 +144,14 @@ PROC yv12_to_rgb32_x86_sse2 loop_y: mov ecx, edx ; width - shr ecx, 2 + shr ecx, 3 ; save edx mov [esp + 24], edx prefetchnta 4096[esp + 0] ; y - prefetchnta 4096[esp + 8] ; u - prefetchnta 4096[esp + 12] ; v + prefetchnta 1024[esp + 8] ; u + prefetchnta 1024[esp + 12] ; v loop_x: @@ -218,7 +161,7 @@ loop_x: mov edi, [esp + 16] ; rgbs1 ; y1 - call y1_do4 + call do8_uv mov [esp + 0], esi ; y1 mov [esp + 16], edi ; rgbs1 @@ -227,7 +170,7 @@ loop_x: mov edi, [esp + 20] ; rgbs2 ; y2 - call y2_do4 + call do8 mov [esp + 4], esi ; y2 mov [esp + 8], ebx ; u |