summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Sorg <jay.sorg@gmail.com>2014-09-26 03:37:04 -0700
committerJay Sorg <jay.sorg@gmail.com>2014-09-26 03:37:04 -0700
commit656e6eae1ffc1d92ca717ca0b9b9928f6a263183 (patch)
treebc76fdb4ef622e0331c2692aaf2d28abf0a88582
parentebb00172b79f6dd61bff3bdebe9f246d034ff05c (diff)
downloadxrdp-proprietary-656e6eae1ffc1d92ca717ca0b9b9928f6a263183.tar.gz
xrdp-proprietary-656e6eae1ffc1d92ca717ca0b9b9928f6a263183.zip
Xorg: asm work on yv12
-rw-r--r--xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm356
1 files changed, 356 insertions, 0 deletions
diff --git a/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm b/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
index 2d0533ea..85c6170d 100644
--- a/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
+++ b/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
@@ -1,10 +1,260 @@
+SECTION .data
+align 16
+c16 times 4 dd 16
+c100 times 4 dd 100
+c128 times 4 dd 128
+c208 times 4 dd 208
+c255 times 4 dd 255
+c298 times 4 dd 298
+c409 times 4 dd 409
+c516 times 4 dd 516
+
+SECTION .text
+
%macro PROC 1
align 16
global %1
%1:
%endmacro
+y1_do4:
+ ; y
+ mov eax, 0
+ mov al, [esi]
+ add esi, 1
+ pinsrd xmm0, eax, 0
+ mov al, [esi]
+ add esi, 1
+ pinsrd xmm0, eax, 1
+ mov al, [esi]
+ add esi, 1
+ pinsrd xmm0, eax, 2
+ mov al, [esi]
+ add esi, 1
+ pinsrd xmm0, eax, 3
+ movdqa xmm7, [c16]
+ psubd xmm0, xmm7
+
+ ; u
+ mov eax, 0
+ mov al, [ebx]
+ add ebx, 1
+ pinsrd xmm1, eax, 0
+ pinsrd xmm1, eax, 1
+ mov al, [ebx]
+ add ebx, 1
+ pinsrd xmm1, eax, 2
+ pinsrd xmm1, eax, 3
+ movdqa xmm7, [c128]
+ psubd xmm1, xmm7
+
+ ; v
+ mov eax, 0
+ mov al, [edx]
+ add edx, 1
+ pinsrd xmm2, eax, 0
+ pinsrd xmm2, eax, 1
+ mov al, [edx]
+ add edx, 1
+ pinsrd xmm2, eax, 2
+ pinsrd xmm2, eax, 3
+ psubd xmm2, xmm7
+
+ ; t = (298 * c + 409 * e + 128) >> 8;
+ movdqa xmm3, [c298]
+ pmulld xmm3, xmm0
+ movdqa xmm4, [c409]
+ pmulld xmm4, xmm2
+ paddd xmm3, xmm4
+ paddd xmm3, xmm7
+ psrad xmm3, 8
+ ; b = RDPCLAMP(t, 0, 255);
+ pxor xmm4, xmm4
+ pmaxsd xmm3, xmm4
+ movdqa xmm4, [c255]
+ pminsd xmm3, xmm4
+
+ ; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
+ movdqa xmm4, [c298]
+ pmulld xmm4, xmm0
+ movdqa xmm5, [c100]
+ pmulld xmm5, xmm1
+ movdqa xmm6, [c208]
+ pmulld xmm6, xmm2
+ psubd xmm4, xmm5
+ psubd xmm4, xmm6
+ paddd xmm4, xmm7
+ psrad xmm4, 8
+ ; g = RDPCLAMP(t, 0, 255);
+ pxor xmm5, xmm5
+ pmaxsd xmm4, xmm5
+ movdqa xmm5, [c255]
+ pminsd xmm4, xmm5
+
+ ; t = (298 * c + 516 * d + 128) >> 8;
+ movdqa xmm5, [c298]
+ pmulld xmm5, xmm0
+ movdqa xmm6, [c516]
+ pmulld xmm6, xmm1
+ paddd xmm5, xmm6
+ paddd xmm5, xmm7
+ psrad xmm5, 8
+ ; r = RDPCLAMP(t, 0, 255);
+ pxor xmm6, xmm6
+ pmaxsd xmm5, xmm6
+ movdqa xmm6, [c255]
+ pminsd xmm5, xmm6
+
+ pextrd eax, xmm3, 0
+ mov [edi], al
+ pextrd eax, xmm4, 0
+ mov [edi + 1], al
+ pextrd eax, xmm5, 0
+ mov [edi + 2], al
+ mov eax, 0
+ mov [edi + 3], al
+ add edi, 4
+
+ pextrd eax, xmm3, 1
+ mov [edi], al
+ pextrd eax, xmm4, 1
+ mov [edi + 1], al
+ pextrd eax, xmm5, 1
+ mov [edi + 2], al
+ mov eax, 0
+ mov [edi + 3], al
+ add edi, 4
+
+ pextrd eax, xmm3, 2
+ mov [edi], al
+ pextrd eax, xmm4, 2
+ mov [edi + 1], al
+ pextrd eax, xmm5, 2
+ mov [edi + 2], al
+ mov eax, 0
+ mov [edi + 3], al
+ add edi, 4
+
+ pextrd eax, xmm3, 3
+ mov [edi], al
+ pextrd eax, xmm4, 3
+ mov [edi + 1], al
+ pextrd eax, xmm5, 3
+ mov [edi + 2], al
+ mov eax, 0
+ mov [edi + 3], al
+ add edi, 4
+
+ ret;
+
+y2_do4:
+ ; y
+ mov eax, 0
+ mov al, [esi]
+ add esi, 1
+ pinsrd xmm0, eax, 0
+ mov al, [esi]
+ add esi, 1
+ pinsrd xmm0, eax, 1
+ mov al, [esi]
+ add esi, 1
+ pinsrd xmm0, eax, 2
+ mov al, [esi]
+ add esi, 1
+ pinsrd xmm0, eax, 3
+ movdqa xmm7, [c16]
+ psubd xmm0, xmm7
+
+ movdqa xmm7, [c128]
+
+ ; t = (298 * c + 409 * e + 128) >> 8;
+ movdqa xmm3, [c298]
+ pmulld xmm3, xmm0
+ movdqa xmm4, [c409]
+ pmulld xmm4, xmm2
+ paddd xmm3, xmm4
+ paddd xmm3, xmm7
+ psrad xmm3, 8
+ ; b = RDPCLAMP(t, 0, 255);
+ pxor xmm4, xmm4
+ pmaxsd xmm3, xmm4
+ movdqa xmm4, [c255]
+ pminsd xmm3, xmm4
+
+ ; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
+ movdqa xmm4, [c298]
+ pmulld xmm4, xmm0
+ movdqa xmm5, [c100]
+ pmulld xmm5, xmm1
+ movdqa xmm6, [c208]
+ pmulld xmm6, xmm2
+ psubd xmm4, xmm5
+ psubd xmm4, xmm6
+ paddd xmm4, xmm7
+ psrad xmm4, 8
+ ; g = RDPCLAMP(t, 0, 255);
+ pxor xmm5, xmm5
+ pmaxsd xmm4, xmm5
+ movdqa xmm5, [c255]
+ pminsd xmm4, xmm5
+
+ ; t = (298 * c + 516 * d + 128) >> 8;
+ movdqa xmm5, [c298]
+ pmulld xmm5, xmm0
+ movdqa xmm6, [c516]
+ pmulld xmm6, xmm1
+ paddd xmm5, xmm6
+ paddd xmm5, xmm7
+ psrad xmm5, 8
+ ; r = RDPCLAMP(t, 0, 255);
+ pxor xmm6, xmm6
+ pmaxsd xmm5, xmm6
+ movdqa xmm6, [c255]
+ pminsd xmm5, xmm6
+
+ pextrd eax, xmm3, 0
+ mov [edi], al
+ pextrd eax, xmm4, 0
+ mov [edi + 1], al
+ pextrd eax, xmm5, 0
+ mov [edi + 2], al
+ mov eax, 0
+ mov [edi + 3], al
+ add edi, 4
+
+ pextrd eax, xmm3, 1
+ mov [edi], al
+ pextrd eax, xmm4, 1
+ mov [edi + 1], al
+ pextrd eax, xmm5, 1
+ mov [edi + 2], al
+ mov eax, 0
+ mov [edi + 3], al
+ add edi, 4
+
+ pextrd eax, xmm3, 2
+ mov [edi], al
+ pextrd eax, xmm4, 2
+ mov [edi + 1], al
+ pextrd eax, xmm5, 2
+ mov [edi + 2], al
+ mov eax, 0
+ mov [edi + 3], al
+ add edi, 4
+
+ pextrd eax, xmm3, 3
+ mov [edi], al
+ pextrd eax, xmm4, 3
+ mov [edi + 1], al
+ pextrd eax, xmm5, 3
+ mov [edi + 2], al
+ mov eax, 0
+ mov [edi + 3], al
+ add edi, 4
+
+ ret;
+
;int
;yv12_to_rgb32_x86_sse2(unsigned char *yuvs, int width, int height, int *rgbs)
@@ -12,11 +262,117 @@ PROC yv12_to_rgb32_x86_sse2
push ebx
push esi
push edi
+ push ebp
+
+ mov edi, [esp + 32] ; rgbs
+
+ mov ecx, [esp + 24] ; width
+ mov edx, ecx
+ mov ebp, [esp + 28] ; height
+ mov eax, ebp
+ shr ebp, 1
+ imul eax, ecx ; eax = width * height
+
+ mov esi, [esp + 20] ; y
+
+ mov ebx, esi ; u = y + width * height
+ add ebx, eax
+
+ ; local vars
+ ; char* yptr1;
+ ; char* yptr2;
+ ; char* uptr;
+ ; char* vptr;
+ ; int* rgbs1;
+ ; int* rgbs2;
+ ; int width
+ sub esp, 28 ; local vars, 28 bytes
+ mov [esp + 0], esi ; save y1
+ add esi, edx
+ mov [esp + 4], esi ; save y2
+ mov [esp + 8], ebx ; save u
+ shr eax, 2
+ add ebx, eax ; v = u + (width * height / 4)
+ mov [esp + 12], ebx ; save v
+
+ mov [esp + 16], edi ; save rgbs1
+ mov eax, edx
+ shl eax, 2
+ add edi, eax
+ mov [esp + 20], edi ; save rgbs2
+
+loop_y:
+
+ mov ecx, edx ; width
+ shr ecx, 2
+
+ ; save edx
+ mov [esp + 24], edx
+
+loop_x:
+
+ mov esi, [esp + 0] ; y1
+ mov ebx, [esp + 8] ; u
+ mov edx, [esp + 12] ; v
+ mov edi, [esp + 16] ; rgbs1
+
+ ; y1
+ call y1_do4
+
+ mov [esp + 0], esi ; y1
+ mov [esp + 16], edi ; rgbs1
+
+ mov esi, [esp + 4] ; y2
+ mov edi, [esp + 20] ; rgbs2
+
+ ; y2
+ call y2_do4
+
+ mov [esp + 4], esi ; y2
+ mov [esp + 8], ebx ; u
+ mov [esp + 12], edx ; v
+ mov [esp + 20], edi ; rgbs2
+
+ dec ecx ; width
+ jnz loop_x
+
+ ; restore edx
+ mov edx, [esp + 24]
+
+ ; update y1 and 2
+ mov eax, [esp + 0]
+ mov ebx, edx
+ add eax, ebx
+ mov [esp + 0], eax
+
+ mov eax, [esp + 4]
+ add eax, ebx
+ mov [esp + 4], eax
+
+ ; update rgb1 and 2
+ mov eax, [esp + 16]
+ mov ebx, edx
+ shl ebx, 2
+ add eax, ebx
+ mov [esp + 16], eax
+
+ mov eax, [esp + 20]
+ add eax, ebx
+ mov [esp + 20], eax
+
+ mov ecx, ebp
+ dec ecx ; height
+ mov ebp, ecx
+ jnz loop_y
+
+ add esp, 28
mov eax, 0
+ pop ebp
pop edi
pop esi
pop ebx
ret
align 16
+