;
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                             JDIMENSION output_row, int num_rows);
;

; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows

%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM  8

        align   16

        global  EXTN(jsimd_rgb_ycc_convert_sse2)

EXTN(jsimd_rgb_ycc_convert_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = original rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax
        mov     rbp,rsp                         ; rbp = aligned rbp
        lea     rsp, [wk(0)]
        collect_args
        push    rbx

        mov     ecx, r10d
        test    rcx,rcx
        jz      near .return

        push    rcx

        mov     rsi, r12
        mov     ecx, r13d
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]
        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]

        pop     rcx

        mov     rsi, r11
        mov     eax, r14d
        test    rax,rax
        jle     near .return
.rowloop:
        push    rdx
        push    rbx
        push    rdi
        push    rsi
        push    rcx                     ; col

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr0
        mov     rbx, JSAMPROW [rbx]     ; outptr1
        mov     rdx, JSAMPROW [rdx]     ; outptr2

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

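; When fewer than SIZEOF_XMMWORD (16) columns remain, the .column_ld* chain
; below assembles the leftover RGB bytes into xmmA/xmmF/xmmB piecewise
; (1 byte, then 2, 4, and 8 bytes, then whole XMMWORDs), so the same
; .rgb_ycc_cnv transpose can handle the row tail without reading past the
; end of the input row.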
.column_ld1:
        push    rax
        push    rdx
        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_BYTE
        movzx   rax, BYTE [rsi+rcx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_WORD
        movzx   rdx, WORD [rsi+rcx]
        shl     rax, WORD_BIT
        or      rax,rdx
.column_ld4:
        movd    xmmA,eax
        pop     rdx
        pop     rax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_DWORD
        movd    xmmF, XMM_DWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_DWORD
        por     xmmA,xmmF
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        sub     rcx, byte SIZEOF_MMWORD
        movq    xmmB, XMM_MMWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmB
.column_ld16:
        test    cl, SIZEOF_XMMWORD
        jz      short .column_ld32
        movdqa  xmmF,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .rgb_ycc_cnv
.column_ld32:
        test    cl, 2*SIZEOF_XMMWORD
        mov     rcx, SIZEOF_XMMWORD
        jz      short .rgb_ycc_cnv
        movdqa  xmmB,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa    xmmG,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa    xmmD,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa    xmmE,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor      xmmH,xmmH

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmB,xmmE
        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa    xmmF,xmmD
        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

.column_ld1:
        test    cl, SIZEOF_XMMWORD/16
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_XMMWORD/16
        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_XMMWORD/8
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_XMMWORD/8
        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmE
.column_ld4:
        test    cl, SIZEOF_XMMWORD/4
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_XMMWORD/4
        movdqa  xmmE,xmmA
        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test    cl, SIZEOF_XMMWORD/2
        mov     rcx, SIZEOF_XMMWORD
        jz      short .rgb_ycc_cnv
        movdqa  xmmF,xmmA
        movdqa  xmmH,xmmE
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa    xmmD,xmmA
        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa    xmmC,xmmF
        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa    xmmB,xmmA
        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa    xmmG,xmmD
        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa    xmmE,xmmA
        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa    xmmH,xmmB
        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor      xmmF,xmmF

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmD,xmmB
        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa    xmmG,xmmE
        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)

        punpcklbw xmmF,xmmH
        punpckhbw xmmH,xmmH
        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE

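; The PW_* word constants used below are signed 16-bit fixed-point fractions.
; Assuming SCALEBITS == 16 (as in the scalar jccolor.c code), FIX(x) is
; x * 65536 rounded to the nearest integer; e.g. FIX(0.29900) = 19595 and
; FIX(0.33700) = 22086.  FIX(0.58700) would be 38470, which does not fit in
; a signed 16-bit pmaddwd operand (maximum 32767), so the G coefficient of Y
; is split into 0.33700 + 0.25000 to keep every constant in range.  Each
; pmaddwd then computes R*FIX(0.299)+G*FIX(0.337) or B*FIX(0.114)+G*FIX(0.250)
; for four pixels at a time.  The PW_*/PD_* constant tables are presumably
; defined by the color-conversion module that %includes this file.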
        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO

        movdqa    xmm6,xmm1
        punpcklwd xmm1,xmm3
        punpckhwd xmm6,xmm3
        movdqa    xmm7,xmm1
        movdqa    xmm4,xmm6
        pmaddwd   xmm1,[rel PW_F0299_F0337]     ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337]     ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
        pmaddwd   xmm7,[rel PW_MF016_MF033]     ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
        pmaddwd   xmm4,[rel PW_MF016_MF033]     ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

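; FIX(0.50000) = 32768 also does not fit in a signed pmaddwd word, so the
; B*0.5 (and later R*0.5) terms are formed by a shift instead: interleaving
; the samples into the high halves of each dword of a zeroed register
; (punpcklwd/punpckhwd) yields sample*65536, and psrld by 1 then gives
; sample*FIX(0.5).  PD_ONEHALFM1_CJ presumably packs the Cb/Cr rounding bias
; (1 << (SCALEBITS-1)) - 1 together with CENTERJSAMPLE << SCALEBITS
; (CENTERJSAMPLE = 128 for 8-bit samples), so a single paddd applies both
; the rounding and the chroma center offset before descaling by SCALEBITS.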
        pxor      xmm1,xmm1
        pxor      xmm6,xmm6
        punpcklwd xmm1,xmm5             ; xmm1=BOL
        punpckhwd xmm6,xmm5             ; xmm6=BOH
        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)

        movdqa    xmm5,[rel PD_ONEHALFM1_CJ]    ; xmm5=[PD_ONEHALFM1_CJ]

        paddd     xmm7,xmm1
        paddd     xmm4,xmm6
        paddd     xmm7,xmm5
        paddd     xmm4,xmm5
        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
        packssdw  xmm7,xmm4             ; xmm7=CbO

        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE

        movdqa    xmm6,xmm0
        punpcklwd xmm0,xmm2
        punpckhwd xmm6,xmm2
        movdqa    xmm5,xmm0
        movdqa    xmm4,xmm6
        pmaddwd   xmm0,[rel PW_F0299_F0337]     ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337]     ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
        pmaddwd   xmm5,[rel PW_MF016_MF033]     ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
        pmaddwd   xmm4,[rel PW_MF016_MF033]     ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

        pxor      xmm0,xmm0
        pxor      xmm6,xmm6
        punpcklwd xmm0,xmm1             ; xmm0=BEL
        punpckhwd xmm6,xmm1             ; xmm6=BEH
        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)

        movdqa    xmm1,[rel PD_ONEHALFM1_CJ]    ; xmm1=[PD_ONEHALFM1_CJ]

        paddd     xmm5,xmm0
        paddd     xmm4,xmm6
        paddd     xmm5,xmm1
        paddd     xmm4,xmm1
        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
        packssdw  xmm5,xmm4             ; xmm5=CbE

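; Here (and again for Y and Cr below) the even-column results occupy the low
; byte of each word and the odd-column results are shifted into the high
; byte, so the bitwise OR interleaves them back into natural column order
; (Cb0 Cb1 Cb2 ...).  The store uses movdqa, so the output rows are assumed
; to be 16-byte aligned, as the library's buffer allocator is expected to
; guarantee.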
        psllw     xmm7,BYTE_BIT
        por       xmm5,xmm7             ; xmm5=Cb
        movdqa    XMMWORD [rbx], xmm5   ; Save Cb

        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO

        movdqa    xmm4,xmm0
        punpcklwd xmm0,xmm3
        punpckhwd xmm4,xmm3
        movdqa    xmm7,xmm0
        movdqa    xmm5,xmm4
        pmaddwd   xmm0,[rel PW_F0114_F0250]     ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250]     ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
        pmaddwd   xmm7,[rel PW_MF008_MF041]     ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
        pmaddwd   xmm5,[rel PW_MF008_MF041]     ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

        movdqa    xmm3,[rel PD_ONEHALF]         ; xmm3=[PD_ONEHALF]

        paddd     xmm0, XMMWORD [wk(4)]
        paddd     xmm4, XMMWORD [wk(5)]
        paddd     xmm0,xmm3
        paddd     xmm4,xmm3
        psrld     xmm0,SCALEBITS        ; xmm0=YOL
        psrld     xmm4,SCALEBITS        ; xmm4=YOH
        packssdw  xmm0,xmm4             ; xmm0=YO

        pxor      xmm3,xmm3
        pxor      xmm4,xmm4
        punpcklwd xmm3,xmm1             ; xmm3=ROL
        punpckhwd xmm4,xmm1             ; xmm4=ROH
        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)

        movdqa    xmm1,[rel PD_ONEHALFM1_CJ]    ; xmm1=[PD_ONEHALFM1_CJ]

        paddd     xmm7,xmm3
        paddd     xmm5,xmm4
        paddd     xmm7,xmm1
        paddd     xmm5,xmm1
        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
        packssdw  xmm7,xmm5             ; xmm7=CrO

        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE

        movdqa    xmm4,xmm6
        punpcklwd xmm6,xmm2
        punpckhwd xmm4,xmm2
        movdqa    xmm1,xmm6
        movdqa    xmm5,xmm4
        pmaddwd   xmm6,[rel PW_F0114_F0250]     ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250]     ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
        pmaddwd   xmm1,[rel PW_MF008_MF041]     ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
        pmaddwd   xmm5,[rel PW_MF008_MF041]     ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

        movdqa    xmm2,[rel PD_ONEHALF]         ; xmm2=[PD_ONEHALF]

        paddd     xmm6, XMMWORD [wk(6)]
        paddd     xmm4, XMMWORD [wk(7)]
        paddd     xmm6,xmm2
        paddd     xmm4,xmm2
        psrld     xmm6,SCALEBITS        ; xmm6=YEL
        psrld     xmm4,SCALEBITS        ; xmm4=YEH
        packssdw  xmm6,xmm4             ; xmm6=YE

        psllw     xmm0,BYTE_BIT
        por       xmm6,xmm0             ; xmm6=Y
        movdqa    XMMWORD [rdi], xmm6   ; Save Y

        pxor      xmm2,xmm2
        pxor      xmm4,xmm4
        punpcklwd xmm2,xmm3             ; xmm2=REL
        punpckhwd xmm4,xmm3             ; xmm4=REH
        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)

        movdqa    xmm0,[rel PD_ONEHALFM1_CJ]    ; xmm0=[PD_ONEHALFM1_CJ]

        paddd     xmm1,xmm2
        paddd     xmm5,xmm4
        paddd     xmm1,xmm0
        paddd     xmm5,xmm0
        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
        packssdw  xmm1,xmm5             ; xmm1=CrE

        psllw     xmm7,BYTE_BIT
        por       xmm1,xmm7             ; xmm1=Cr
        movdqa    XMMWORD [rdx], xmm1   ; Save Cr

        sub       rcx, byte SIZEOF_XMMWORD
        add       rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add       rdi, byte SIZEOF_XMMWORD                ; outptr0
        add       rbx, byte SIZEOF_XMMWORD                ; outptr1
        add       rdx, byte SIZEOF_XMMWORD                ; outptr2
        cmp       rcx, byte SIZEOF_XMMWORD
        jae       near .columnloop
        test      rcx,rcx
        jnz       near .column_ld1

        pop       rcx                   ; col
        pop       rsi
        pop       rdi
        pop       rbx
        pop       rdx

        add       rsi, byte SIZEOF_JSAMPROW     ; input_buf
        add       rdi, byte SIZEOF_JSAMPROW
        add       rbx, byte SIZEOF_JSAMPROW
        add       rdx, byte SIZEOF_JSAMPROW
        dec       rax                           ; num_rows
        jg        near .rowloop

.return:
        pop       rbx
        uncollect_args
        mov       rsp,rbp               ; rsp <- aligned rbp
        pop       rsp                   ; rsp <- original rbp
        pop       rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align     16