If you're interested in the maths, my code is the following. I got the information from the net/Wikipedia; I hope the values are right.
| CODE |
// Builds the fixed-point tables and offsets used by the RGB <-> YUV converters.
//
// Reads  : mData.full_range, mData.color_matrix, convertion_mode.
// Writes : Min_Y/U/V, Max_Y/U/V, Coeff_Y/U/V, Offset_Y/U/V, Offset_R/G/B and
//          lookup[0..2303] (nine consecutive tables of 256 entries).
//
// convertion_mode 0 fills the RGB -> YUV tables (6 fractional bits):
//   [0]    R->Y   [256]  G->Y   [512]  B->Y
//   [768]  R->U   [1024] G->U   [1280] B->U
//   [1536] R->V   [1792] G->V   [2048] B->V
// convertion_mode 1..5 fills the YUV -> RGB tables (5 fractional bits):
//   [0]    Y gain [256]  V->R   [512]  U->G   [768] V->G   [1024] U->B
//   and zeroes the four remaining tables.
// Any other convertion_mode leaves lookup[] untouched.
void JPSDR_RGBConvert::Compute_Lookup(void)
{
	double kr,kb;

	// Legal output range of each plane and the matching scale factors.
	if (mData.full_range)
	{
		Min_Y=0;  Max_Y=255;
		Min_U=0;  Max_U=255;
		Min_V=0;  Max_V=255;
		Coeff_Y=255.0;
		Coeff_U=255.0;
		Coeff_V=255.0;
	}
	else
	{
		Min_Y=16; Max_Y=235;
		Min_U=16; Max_U=240;
		Min_V=16; Max_V=240;
		Coeff_Y=219.0;
		Coeff_U=224.0;
		Coeff_V=224.0;
	}

	// Luma weights of red and blue for the selected matrix.
	switch (mData.color_matrix)
	{
		case 1 : kr=0.299; kb=0.114; break;		// BT601
		case 2 : kr=0.212; kb=0.087; break;		// SMPTE_240M
		case 3 : kr=0.3; kb=0.11; break;		// FCC
		case 0 :								// BT709
		default :
			// An unknown index used to leave kr/kb uninitialised; fall back
			// to the case-0 coefficients instead of computing with garbage.
			kr=0.2126; kb=0.0722;
			break;
	}
	const double kg=1.0-kr-kb;

	// Forward matrix: U=0.5*(u1*R+u2*G+B), V=0.5*(R+v1*G+v2*B).
	const double u1=-kr/(1.0-kb);
	const double u2=-kg/(1.0-kb);
	const double v1=-kg/(1.0-kr);
	const double v2=-kb/(1.0-kr);

	// Inverse matrix: R=Y+r1*V, G=Y+g1*U+g2*V, B=Y+b1*U (U,V centred on 0).
	const double r1=2.0*(1.0-kr);
	const double g1=-(2.0*(1.0-kb)*kb)/kg;
	const double g2=-(2.0*(1.0-kr)*kr)/kg;
	const double b1=2.0*(1.0-kb);

	// Forward offsets: plane bias in 6-bit fixed point plus 32 (= 0.5) so the
	// final ">> 6" rounds to nearest.
	Offset_Y=(Min_Y << 6)+32;
	Offset_U=(128 << 6)+32;
	Offset_V=(128 << 6)+32;

	// Inverse offsets: remove the luma bias and the 128 chroma bias, and ADD
	// 16 (= 0.5 in 5-bit fixed point) so the final ">> 5" rounds to nearest.
	// The rounding term used to be subtracted together with the biases, which
	// made every decoded channel one level too low (e.g. full-range white
	// decoded to 254 instead of 255).
	Offset_R=(signed short)round(16.0-((32.0*255.0*Min_Y)/Coeff_Y)-((128.0*r1*255.0*32.0)/Coeff_V));
	Offset_G=(signed short)round(16.0-((32.0*255.0*Min_Y)/Coeff_Y)-((128.0*g1*255.0*32.0)/Coeff_U)-((128.0*g2*255.0*32.0)/Coeff_V));
	Offset_B=(signed short)round(16.0-((32.0*255.0*Min_Y)/Coeff_Y)-((128.0*b1*255.0*32.0)/Coeff_U));

	switch (convertion_mode)
	{
		case 0 :
			// RGB -> YUV
			for (int i=0; i<=255; i++)
			{
				lookup[i]=(signed short)round((i*kr*Coeff_Y*64.0)/255.0);
				lookup[i+256]=(signed short)round((i*kg*Coeff_Y*64.0)/255.0);
				lookup[i+512]=(signed short)round((i*kb*Coeff_Y*64.0)/255.0);
				lookup[i+768]=(signed short)round((i*u1*Coeff_U*0.5*64.0)/255.0);
				lookup[i+1024]=(signed short)round((i*u2*Coeff_U*0.5*64.0)/255.0);
				lookup[i+1280]=(signed short)round((i*Coeff_U*0.5*64.0)/255.0);
				lookup[i+1536]=(signed short)round((i*Coeff_V*0.5*64.0)/255.0);
				lookup[i+1792]=(signed short)round((i*v1*Coeff_V*0.5*64.0)/255.0);
				lookup[i+2048]=(signed short)round((i*v2*Coeff_V*0.5*64.0)/255.0);
			}
			break;
		case 1 :
		case 2 :
		case 3 :
		case 4 :
		case 5 :
			// YUV -> RGB
			for (int i=0; i<=255; i++)
			{
				lookup[i]=(signed short)round((i*255.0*32.0)/Coeff_Y);
				lookup[i+256]=(signed short)round((i*r1*255.0*32.0)/Coeff_V);
				lookup[i+512]=(signed short)round((i*g1*255.0*32.0)/Coeff_U);
				lookup[i+768]=(signed short)round((i*g2*255.0*32.0)/Coeff_V);
				lookup[i+1024]=(signed short)round((i*b1*255.0*32.0)/Coeff_U);
				lookup[i+1280]=0;
				lookup[i+1536]=0;
				lookup[i+1792]=0;
				lookup[i+2048]=0;
			}
			break;
	}
}
void JPSDR_RGBConvert::RGB32toYV24(const void *src_,void *dst_y_,void *dst_u_,void *dst_v_,sint32 w,sint32 h,ptrdiff_t src_pitch,ptrdiff_t dst_pitch_y, ptrdiff_t dst_pitch_u,ptrdiff_t dst_pitch_v) { const RGB32 *src; unsigned char *dst_y,*dst_u,*dst_v; sint32 i,j; signed short y,u,v; unsigned short r,g,b;
src=(RGB32 *)src_; dst_y=(unsigned char *)dst_y_; dst_u=(unsigned char *)dst_u_; dst_v=(unsigned char *)dst_v_;
for (i=0; i<h; i++) { for (j=0; j<w; j++) { b=src[j].b; g=src[j].g; r=src[j].r; y=(Offset_Y+lookup[r]+lookup[g+256]+lookup[b+512]) >> 6; u=(Offset_U+lookup[r+768]+lookup[g+1024]+lookup[b+1280]) >> 6; v=(Offset_V+lookup[r+1536]+lookup[g+1792]+lookup[b+2048]) >> 6; if (y<Min_Y) y=Min_Y; if (y>Max_Y) y=Max_Y; if (u<Min_U) u=Min_U; if (u>Max_U) u=Max_U; if (v<Min_V) v=Min_V; if (v>Max_V) v=Max_V; dst_y[j]=(unsigned char)y; dst_u[j]=(unsigned char)u; dst_v[j]=(unsigned char)v; } src=(RGB32 *)((char *)src+src_pitch); dst_y+=dst_pitch_y; dst_u+=dst_pitch_u; dst_v+=dst_pitch_v; } }
void JPSDR_RGBConvert::YV24toRGB32(const void *src_y_,const void *src_u_,const void *src_v_, void *dst_,sint32 w,sint32 h,ptrdiff_t src_pitch_y,ptrdiff_t src_pitch_u, ptrdiff_t src_pitch_v,ptrdiff_t dst_pitch) { RGB32 *dst; const unsigned char *src_y,*src_u,*src_v; sint32 i,j; signed short r,g,b; unsigned short y,u,v;
dst=(RGB32 *)dst_; src_y=(unsigned char *)src_y_; src_u=(unsigned char *)src_u_; src_v=(unsigned char *)src_v_;
for (i=0; i<h; i++) { for (j=0; j<w; j++) { y=src_y[j]; u=src_u[j]; v=src_v[j]; r=(lookup[y]+lookup[v+256]+Offset_R) >> 5; g=(lookup[y]+lookup[u+512]+lookup[v+768]+Offset_G) >> 5; b=(lookup[y]+lookup[u+1024]+Offset_B) >> 5; if (r<0) r=0; if (r>255) r=255; if (g<0) g=0; if (g>255) g=255; if (b<0) b=0; if (b>255) b=255; dst[j].b=(unsigned char)b; dst[j].g=(unsigned char)g; dst[j].r=(unsigned char)r; dst[j].alpha=0; } dst=(RGB32 *)((char *)dst+dst_pitch); src_y+=src_pitch_y; src_u+=src_pitch_u; src_v+=src_pitch_v; }
}
|
Values of color_matrix: 0 : BT709 1 : BT601 2 : SMPTE_240M 3 : FCC
Optimized code x86 :
| CODE |
;-----------------------------------------------------------------------------
; JPSDR_RGBConvert_RGB32toYV24_SSE2 (32-bit)
;
; Converts packed 4-byte pixels (byte 0 = B, byte 1 = G, byte 2 = R; byte 3 is
; not read) into three separate 8-bit planes, two pixels per inner iteration.
;
;   src                  source pixels
;   dst_y, dst_u, dst_v  destination planes
;   w                    inner iterations per row; each one handles 2 pixels
;   h                    number of rows
;   offset_Y/U/V         16-bit values added to the table sums before ">> 6"
;   lookup               table of 16-bit entries, nine groups of 256
;   src_modulo           bytes added to src after the last pixel of a row
;   dst_modulo_y/u/v     bytes added to each plane pointer after a row
;   Min_*/Max_*          clamping bounds applied to each component
;
; xmm0 word lanes: 0/1/2 = Y/U/V of the first pixel, 4/5/6 = Y/U/V of the
; second pixel; lanes 3 and 7 stay zero and are never stored.
;-----------------------------------------------------------------------------
JPSDR_RGBConvert_RGB32toYV24_SSE2 proc src:dword,dst_y:dword,dst_u:dword,dst_v:dword,w:dword,h:dword,offset_Y:word,
	offset_U:word,offset_V:word,lookup:dword,src_modulo:dword,dst_modulo_y:dword,dst_modulo_u:dword,dst_modulo_v:dword,
	Min_Y:word,Max_Y:word,Min_U:word,Max_U:word,Min_V:word,Max_V:word

	public JPSDR_RGBConvert_RGB32toYV24_SSE2

	local pairs_left:dword

	push esi
	push edi
	push ebx

	pxor xmm0,xmm0			; working register
	pxor xmm1,xmm1			; offsets
	pxor xmm2,xmm2			; lower bounds
	pxor xmm3,xmm3			; upper bounds
	xor eax,eax

	; xmm1 = offsets, replicated for both pixels
	movzx eax,offset_Y
	pinsrw xmm1,eax,0
	pinsrw xmm1,eax,4
	movzx eax,offset_U
	pinsrw xmm1,eax,1
	pinsrw xmm1,eax,5
	movzx eax,offset_V
	pinsrw xmm1,eax,2
	pinsrw xmm1,eax,6

	; xmm2 = lower clamping bounds
	movzx eax,Min_Y
	pinsrw xmm2,eax,0
	pinsrw xmm2,eax,4
	movzx eax,Min_U
	pinsrw xmm2,eax,1
	pinsrw xmm2,eax,5
	movzx eax,Min_V
	pinsrw xmm2,eax,2
	pinsrw xmm2,eax,6

	; xmm3 = upper clamping bounds
	movzx eax,Max_Y
	pinsrw xmm3,eax,0
	pinsrw xmm3,eax,4
	movzx eax,Max_U
	pinsrw xmm3,eax,1
	pinsrw xmm3,eax,5
	movzx eax,Max_V
	pinsrw xmm3,eax,2
	pinsrw xmm3,eax,6

	mov esi,src

NextRow_2:
	mov eax,w
	mov pairs_left,eax

NextPair_2:
	; ---- first pixel of the pair ----
	movzx edx,byte ptr[esi]
	movzx ecx,byte ptr[esi+1]
	movzx ebx,byte ptr[esi+2]		; ebx=R ecx=G edx=B
	mov esi,lookup
	movzx eax,word ptr[esi+2*ebx]		; Y
	add ax,word ptr[esi+2*ecx+512]
	add ax,word ptr[esi+2*edx+1024]
	pinsrw xmm0,eax,0
	movzx eax,word ptr[esi+2*ebx+1536]	; U
	add ax,word ptr[esi+2*ecx+2048]
	add ax,word ptr[esi+2*edx+2560]
	pinsrw xmm0,eax,1
	movzx eax,word ptr[esi+2*ebx+3072]	; V
	add ax,word ptr[esi+2*ecx+3584]
	add ax,word ptr[esi+2*edx+4096]
	pinsrw xmm0,eax,2

	; ---- second pixel of the pair ----
	mov esi,src
	movzx edx,byte ptr[esi+4]
	movzx ecx,byte ptr[esi+5]
	movzx ebx,byte ptr[esi+6]		; ebx=R ecx=G edx=B
	mov esi,lookup
	movzx eax,word ptr[esi+2*ebx]		; Y
	add ax,word ptr[esi+2*ecx+512]
	add ax,word ptr[esi+2*edx+1024]
	pinsrw xmm0,eax,4
	movzx eax,word ptr[esi+2*ebx+1536]	; U
	add ax,word ptr[esi+2*ecx+2048]
	add ax,word ptr[esi+2*edx+2560]
	pinsrw xmm0,eax,5
	movzx eax,word ptr[esi+2*ebx+3072]	; V
	add ax,word ptr[esi+2*ecx+3584]
	add ax,word ptr[esi+2*edx+4096]
	pinsrw xmm0,eax,6

	; ---- add offsets, drop the 6 fractional bits, clamp ----
	paddsw xmm0,xmm1
	psraw xmm0,6
	pmaxsw xmm0,xmm2
	pminsw xmm0,xmm3

	; ---- store two bytes to each plane ----
	mov edi,dst_y
	pextrw eax,xmm0,0
	mov byte ptr[edi],al
	pextrw eax,xmm0,4
	mov byte ptr[edi+1],al
	mov edi,dst_u
	add dst_y,2
	pextrw eax,xmm0,1
	mov byte ptr[edi],al
	pextrw eax,xmm0,5
	mov byte ptr[edi+1],al
	mov edi,dst_v
	add dst_u,2
	pextrw eax,xmm0,2
	mov byte ptr[edi],al
	pextrw eax,xmm0,6
	mov byte ptr[edi+1],al
	add dst_v,2

	add src,8				; two source pixels consumed
	mov esi,src
	dec pairs_left
	jnz NextPair_2

	; ---- end of row: skip to the start of the next one ----
	add esi,src_modulo
	mov src,esi
	mov eax,dst_y
	add eax,dst_modulo_y
	mov dst_y,eax
	mov eax,dst_u
	add eax,dst_modulo_u
	mov dst_u,eax
	mov eax,dst_v
	add eax,dst_modulo_v
	mov dst_v,eax
	dec h
	jnz NextRow_2

	pop ebx
	pop edi
	pop esi

	ret

JPSDR_RGBConvert_RGB32toYV24_SSE2 endp
;-----------------------------------------------------------------------------
; JPSDR_RGBConvert_YV24toRGB32_SSE2 (32-bit)
;
; Converts three separate 8-bit planes into packed 4-byte pixels
; (byte 0 = B, byte 1 = G, byte 2 = R, byte 3 = 0), two pixels per inner
; iteration.
;
;   src_y, src_u, src_v  source planes
;   dst                  destination pixels
;   w                    inner iterations per row; each one handles 2 pixels
;   h                    number of rows
;   offset_R/G/B         16-bit values added to the table sums before ">> 5"
;   lookup               table of 16-bit entries, groups of 256
;   src_modulo_y/u/v     bytes added to each plane pointer after a row
;   dst_modulo           bytes added to dst after the last pixel of a row
;
; xmm0 word lanes: 0/1/2 = B/G/R of the first pixel, 4/5/6 = B/G/R of the
; second pixel, 3 and 7 = the two alpha bytes.
;-----------------------------------------------------------------------------
JPSDR_RGBConvert_YV24toRGB32_SSE2 proc src_y:dword,src_u:dword,src_v:dword,dst:dword,w:dword,h:dword,offset_R:word,
	offset_G:word,offset_B:word,lookup:dword,src_modulo_y:dword,src_modulo_u:dword,src_modulo_v:dword,dst_modulo:dword

	public JPSDR_RGBConvert_YV24toRGB32_SSE2

	local i:dword

	push esi
	push edi
	push ebx

	xor eax,eax
	pxor xmm2,xmm2			; stays zero: second operand of packuswb
	pxor xmm1,xmm1
	pxor xmm0,xmm0

	; xmm1 = offsets, replicated for both pixels
	movzx eax,offset_B
	pinsrw xmm1,eax,0
	pinsrw xmm1,eax,4
	movzx eax,offset_G
	pinsrw xmm1,eax,1
	pinsrw xmm1,eax,5
	movzx eax,offset_R
	pinsrw xmm1,eax,2
	pinsrw xmm1,eax,6

	mov edi,dst

Boucle0_4:
	mov eax,w
	mov i,eax

Boucle1_4:
	; Start every pair from an all-zero register. packuswb below leaves the
	; bytes it produced in the low half of xmm0, so without this word lane 3
	; would still contain the red byte of the previous pair's second pixel and
	; the alpha byte of the first pixel would come out as (that red) >> 5
	; instead of 0.
	pxor xmm0,xmm0

	; ---- first pixel of the pair ----
	mov esi,src_y
	movzx ebx,byte ptr[esi]
	mov esi,src_u
	movzx ecx,byte ptr[esi]
	mov esi,src_v
	movzx edx,byte ptr[esi]			; ebx=Y ecx=U edx=V
	mov esi,lookup
	movzx eax,word ptr[esi+2*ebx]		; R
	add ax,word ptr[esi+2*edx+512]
	pinsrw xmm0,eax,2
	movzx eax,word ptr[esi+2*ebx]		; G
	add ax,word ptr[esi+2*ecx+1024]
	add ax,word ptr[esi+2*edx+1536]
	pinsrw xmm0,eax,1
	movzx eax,word ptr[esi+2*ebx]		; B
	add ax,word ptr[esi+2*ecx+2048]
	pinsrw xmm0,eax,0

	; ---- second pixel of the pair (plane pointers advance by 2) ----
	mov esi,src_y
	movzx ebx,byte ptr[esi+1]
	mov esi,src_u
	add src_y,2
	movzx ecx,byte ptr[esi+1]
	mov esi,src_v
	add src_u,2
	movzx edx,byte ptr[esi+1]		; ebx=Y ecx=U edx=V
	mov esi,lookup
	add src_v,2
	movzx eax,word ptr[esi+2*ebx]		; R
	add ax,word ptr[esi+2*edx+512]
	pinsrw xmm0,eax,6
	movzx eax,word ptr[esi+2*ebx]		; G
	add ax,word ptr[esi+2*ecx+1024]
	add ax,word ptr[esi+2*edx+1536]
	pinsrw xmm0,eax,5
	movzx eax,word ptr[esi+2*ebx]		; B
	add ax,word ptr[esi+2*ecx+2048]
	pinsrw xmm0,eax,4

	; ---- add offsets, drop the 5 fractional bits, saturate to bytes ----
	paddsw xmm0,xmm1
	psraw xmm0,5
	packuswb xmm0,xmm2
	movq qword ptr[edi],xmm0		; two 4-byte pixels
	add edi,8
	dec i
	jnz Boucle1_4

	; ---- end of row: skip to the start of the next one ----
	add edi,dst_modulo
	mov eax,src_y
	add eax,src_modulo_y
	mov src_y,eax
	mov eax,src_u
	add eax,src_modulo_u
	mov src_u,eax
	mov eax,src_v
	add eax,src_modulo_v
	mov src_v,eax
	dec h
	jnz Boucle0_4

	pop ebx
	pop edi
	pop esi

	ret

JPSDR_RGBConvert_YV24toRGB32_SSE2 endp
|
Optimized x64 :
| CODE |
;-----------------------------------------------------------------------------
; JPSDR_RGBConvert_RGB32toYV24_SSE2 (64-bit)
;
; Converts packed 4-byte pixels (byte 0 = B, byte 1 = G, byte 2 = R; byte 3 is
; not read) into three separate 8-bit planes, two pixels per inner iteration.
;
; Register arguments as used below: rcx = source pixels, rdx = Y plane,
; r8 = U plane, r9 = V plane. The remaining arguments are read from the
; caller's stack through rbp (see the equates).
; NOTE(review): this matches the Microsoft x64 calling convention - confirm
; against the C++ prototype.
;
;   w                 inner iterations per row; each one handles 2 pixels
;   h                 number of rows
;   offset_Y/U/V      16-bit values added to the table sums before ">> 6"
;   lookup            table of 16-bit entries, nine groups of 256
;   src_modulo        bytes added to the source pointer after each row
;   dst_modulo_y/u/v  bytes added to each plane pointer after each row
;   Min_*/Max_*       clamping bounds applied to each component
;
; xmm0 word lanes: 0/1/2 = Y/U/V of the first pixel, 4/5/6 = Y/U/V of the
; second pixel; lanes 3 and 7 stay zero and are never stored.
;-----------------------------------------------------------------------------
JPSDR_RGBConvert_RGB32toYV24_SSE2 proc public frame

w equ dword ptr[rbp+48]
h equ dword ptr[rbp+56]
offset_Y equ word ptr[rbp+64]
offset_U equ word ptr[rbp+72]
offset_V equ word ptr[rbp+80]
lookup equ qword ptr[rbp+88]
src_modulo equ qword ptr[rbp+96]
dst_modulo_y equ qword ptr[rbp+104]
dst_modulo_u equ qword ptr[rbp+112]
dst_modulo_v equ qword ptr[rbp+120]
Min_Y equ word ptr[rbp+128]
Max_Y equ word ptr[rbp+136]
Min_U equ word ptr[rbp+144]
Max_U equ word ptr[rbp+152]
Min_V equ word ptr[rbp+160]
Max_V equ word ptr[rbp+168]

	push rbp
	.pushreg rbp
	mov rbp,rsp
	push rdi
	.pushreg rdi
	push rsi
	.pushreg rsi
	push rbx
	.pushreg rbx
	push r12
	.pushreg r12
	push r13
	.pushreg r13
	push r14
	.pushreg r14
	push r15
	.pushreg r15
	.endprolog

	pxor xmm0,xmm0			; working register
	pxor xmm1,xmm1			; offsets
	pxor xmm2,xmm2			; lower bounds
	pxor xmm3,xmm3			; upper bounds
	xor rax,rax

	; xmm1 = offsets, replicated for both pixels
	movzx eax,offset_Y
	pinsrw xmm1,eax,0
	pinsrw xmm1,eax,4
	movzx eax,offset_U
	pinsrw xmm1,eax,1
	pinsrw xmm1,eax,5
	movzx eax,offset_V
	pinsrw xmm1,eax,2
	pinsrw xmm1,eax,6

	; xmm2 = lower clamping bounds
	movzx eax,Min_Y
	pinsrw xmm2,eax,0
	pinsrw xmm2,eax,4
	movzx eax,Min_U
	pinsrw xmm2,eax,1
	pinsrw xmm2,eax,5
	movzx eax,Min_V
	pinsrw xmm2,eax,2
	pinsrw xmm2,eax,6

	; xmm3 = upper clamping bounds
	movzx eax,Max_Y
	pinsrw xmm3,eax,0
	pinsrw xmm3,eax,4
	movzx eax,Max_U
	pinsrw xmm3,eax,1
	pinsrw xmm3,eax,5
	movzx eax,Max_V
	pinsrw xmm3,eax,2
	pinsrw xmm3,eax,6

	; Move the register arguments out of the way before r8/r9 are reused.
	mov rsi,rcx			; rsi=src
	mov rdi,rdx			; rdi=dst_y
	mov r11,r8			; r11=dst_u
	mov r12,r9			; r12=dst_v
	mov r8d,w			; r8d=inner iterations per row
	mov r9d,h			; r9d=rows left
	mov r10,lookup			; r10=table base
	mov r13,2			; plane step per iteration
	mov r14,8			; source step per iteration
	xor rcx,rcx
	xor rdx,rdx
	xor rbx,rbx
	xor r15,r15

NextRow_2:
	mov ecx,r8d

NextPair_2:
	; ---- first pixel of the pair ----
	movzx edx,byte ptr[rsi]
	movzx r15d,byte ptr[rsi+1]
	movzx ebx,byte ptr[rsi+2]		; rbx=R r15=G rdx=B
	movzx eax,word ptr[r10+2*rbx]		; Y
	add ax,word ptr[r10+2*r15+512]
	add ax,word ptr[r10+2*rdx+1024]
	pinsrw xmm0,eax,0
	movzx eax,word ptr[r10+2*rbx+1536]	; U
	add ax,word ptr[r10+2*r15+2048]
	add ax,word ptr[r10+2*rdx+2560]
	pinsrw xmm0,eax,1
	movzx eax,word ptr[r10+2*rbx+3072]	; V
	add ax,word ptr[r10+2*r15+3584]
	add ax,word ptr[r10+2*rdx+4096]
	pinsrw xmm0,eax,2

	; ---- second pixel of the pair ----
	movzx edx,byte ptr[rsi+4]
	movzx r15d,byte ptr[rsi+5]
	movzx ebx,byte ptr[rsi+6]		; rbx=R r15=G rdx=B
	movzx eax,word ptr[r10+2*rbx]		; Y
	add ax,word ptr[r10+2*r15+512]
	add ax,word ptr[r10+2*rdx+1024]
	pinsrw xmm0,eax,4
	movzx eax,word ptr[r10+2*rbx+1536]	; U
	add ax,word ptr[r10+2*r15+2048]
	add ax,word ptr[r10+2*rdx+2560]
	pinsrw xmm0,eax,5
	movzx eax,word ptr[r10+2*rbx+3072]	; V
	add ax,word ptr[r10+2*r15+3584]
	add ax,word ptr[r10+2*rdx+4096]
	pinsrw xmm0,eax,6

	; ---- add offsets, drop the 6 fractional bits, clamp ----
	paddsw xmm0,xmm1
	psraw xmm0,6
	pmaxsw xmm0,xmm2
	pminsw xmm0,xmm3

	; ---- store two bytes to each plane ----
	pextrw eax,xmm0,0
	mov byte ptr[rdi],al
	pextrw eax,xmm0,4
	mov byte ptr[rdi+1],al
	pextrw eax,xmm0,1
	add rdi,r13
	mov byte ptr[r11],al
	pextrw eax,xmm0,5
	mov byte ptr[r11+1],al
	pextrw eax,xmm0,2
	add r11,r13
	mov byte ptr[r12],al
	pextrw eax,xmm0,6
	mov byte ptr[r12+1],al
	add rsi,r14
	add r12,r13
	dec ecx
	jnz NextPair_2

	; ---- end of row: skip to the start of the next one ----
	add rsi,src_modulo
	add rdi,dst_modulo_y
	add r11,dst_modulo_u
	add r12,dst_modulo_v
	dec r9d
	jnz NextRow_2

	pop r15
	pop r14
	pop r13
	pop r12
	pop rbx
	pop rsi
	pop rdi
	pop rbp

	ret

JPSDR_RGBConvert_RGB32toYV24_SSE2 endp
;-----------------------------------------------------------------------------
; JPSDR_RGBConvert_YV24toRGB32_SSE2 (64-bit)
;
; Converts three separate 8-bit planes into packed 4-byte pixels
; (byte 0 = B, byte 1 = G, byte 2 = R, byte 3 = 0), two pixels per inner
; iteration.
;
; Register arguments as used below: rcx = Y plane, rdx = U plane,
; r8 = V plane, r9 = destination pixels. The remaining arguments are read
; from the caller's stack through rbp (see the equates).
; NOTE(review): this matches the Microsoft x64 calling convention - confirm
; against the C++ prototype.
;
;   w                 inner iterations per row; each one handles 2 pixels
;   h                 number of rows
;   offset_R/G/B      16-bit values added to the table sums before ">> 5"
;   lookup            table of 16-bit entries, groups of 256
;   src_modulo_y/u/v  bytes added to each plane pointer after each row
;   dst_modulo        bytes added to the destination pointer after each row
;
; xmm0 word lanes: 0/1/2 = B/G/R of the first pixel, 4/5/6 = B/G/R of the
; second pixel, 3 and 7 = the two alpha bytes.
;-----------------------------------------------------------------------------
JPSDR_RGBConvert_YV24toRGB32_SSE2 proc public frame

w equ dword ptr[rbp+48]
h equ dword ptr[rbp+56]
offset_R equ word ptr[rbp+64]
offset_G equ word ptr[rbp+72]
offset_B equ word ptr[rbp+80]
lookup equ qword ptr[rbp+88]
src_modulo_y equ qword ptr[rbp+96]
src_modulo_u equ qword ptr[rbp+104]
src_modulo_v equ qword ptr[rbp+112]
dst_modulo equ qword ptr[rbp+120]

	push rbp
	.pushreg rbp
	mov rbp,rsp
	push rdi
	.pushreg rdi
	push rsi
	.pushreg rsi
	push rbx
	.pushreg rbx
	push r12
	.pushreg r12
	push r13
	.pushreg r13
	push r14
	.pushreg r14
	push r15
	.pushreg r15
	.endprolog

	xor rax,rax
	pxor xmm2,xmm2			; stays zero: second operand of packuswb
	pxor xmm1,xmm1
	pxor xmm0,xmm0

	; xmm1 = offsets, replicated for both pixels
	movzx eax,offset_B
	pinsrw xmm1,eax,0
	pinsrw xmm1,eax,4
	movzx eax,offset_G
	pinsrw xmm1,eax,1
	pinsrw xmm1,eax,5
	movzx eax,offset_R
	pinsrw xmm1,eax,2
	pinsrw xmm1,eax,6

	; Move the register arguments out of the way before r8/r9 are reused.
	mov rsi,rcx			; rsi=src_y
	mov r11,rdx			; r11=src_u
	mov r12,r8			; r12=src_v
	mov rdi,r9			; rdi=dst
	mov r8d,w			; r8d=inner iterations per row
	mov r9d,h			; r9d=rows left
	mov r10,lookup			; r10=table base
	mov r13,2			; plane step per iteration
	mov r14,8			; destination step per iteration
	xor rcx,rcx
	xor rdx,rdx
	xor rbx,rbx
	xor r15,r15

Boucle0_4:
	mov ecx,r8d

Boucle1_4:
	; Start every pair from an all-zero register. packuswb below leaves the
	; bytes it produced in the low half of xmm0, so without this word lane 3
	; would still contain the red byte of the previous pair's second pixel and
	; the alpha byte of the first pixel would come out as (that red) >> 5
	; instead of 0.
	pxor xmm0,xmm0

	; ---- first pixel of the pair ----
	movzx ebx,byte ptr[rsi]
	movzx r15d,byte ptr[r11]
	movzx edx,byte ptr[r12]			; rbx=Y r15=U rdx=V
	movzx eax,word ptr[r10+2*rbx]		; R
	add ax,word ptr[r10+2*rdx+512]
	pinsrw xmm0,eax,2
	movzx eax,word ptr[r10+2*rbx]		; G
	add ax,word ptr[r10+2*r15+1024]
	add ax,word ptr[r10+2*rdx+1536]
	pinsrw xmm0,eax,1
	movzx eax,word ptr[r10+2*rbx]		; B
	add ax,word ptr[r10+2*r15+2048]
	pinsrw xmm0,eax,0

	; ---- second pixel of the pair (plane pointers advance by 2) ----
	movzx ebx,byte ptr[rsi+1]
	add rsi,r13
	movzx r15d,byte ptr[r11+1]
	add r11,r13
	movzx edx,byte ptr[r12+1]		; rbx=Y r15=U rdx=V
	add r12,r13
	movzx eax,word ptr[r10+2*rbx]		; R
	add ax,word ptr[r10+2*rdx+512]
	pinsrw xmm0,eax,6
	movzx eax,word ptr[r10+2*rbx]		; G
	add ax,word ptr[r10+2*r15+1024]
	add ax,word ptr[r10+2*rdx+1536]
	pinsrw xmm0,eax,5
	movzx eax,word ptr[r10+2*rbx]		; B
	add ax,word ptr[r10+2*r15+2048]
	pinsrw xmm0,eax,4

	; ---- add offsets, drop the 5 fractional bits, saturate to bytes ----
	paddsw xmm0,xmm1
	psraw xmm0,5
	packuswb xmm0,xmm2
	movq qword ptr[rdi],xmm0		; two 4-byte pixels
	add rdi,r14
	dec ecx
	jnz Boucle1_4

	; ---- end of row: skip to the start of the next one ----
	add rsi,src_modulo_y
	add r11,src_modulo_u
	add r12,src_modulo_v
	add rdi,dst_modulo
	dec r9d
	jnz Boucle0_4

	pop r15
	pop r14
	pop r13
	pop r12
	pop rbx
	pop rsi
	pop rdi
	pop rbp
	ret

JPSDR_RGBConvert_YV24toRGB32_SSE2 endp
| |