;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2011, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                              JDIMENSION output_row, int num_rows);
;

; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows

%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM  2

        align   16

        global  EXTN(jsimd_rgb_gray_convert_sse2)

EXTN(jsimd_rgb_gray_convert_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = original rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax
        mov     rbp,rsp                         ; rbp = aligned rbp
        lea     rsp, [wk(0)]
        collect_args
        push    rbx

        mov     ecx, r10d
        test    rcx,rcx
        jz      near .return

        push    rcx

        mov     rsi, r12
        mov     ecx, r13d
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]

        pop     rcx

        mov     rsi, r11
        mov     eax, r14d
        test    rax,rax
        jle     near .return
.rowloop:
        push    rdi
        push    rsi
        push    rcx                     ; col

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr0

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

.column_ld1:
        push    rax
        push    rdx
        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_BYTE
        movzx   rax, BYTE [rsi+rcx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_WORD
        movzx   rdx, WORD [rsi+rcx]
        shl     rax, WORD_BIT
        or      rax,rdx
.column_ld4:
        movd    xmmA,eax
        pop     rdx
        pop     rax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_DWORD
        movd    xmmF, XMM_DWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_DWORD
        por     xmmA,xmmF
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        sub     rcx, byte SIZEOF_MMWORD
        movq    xmmB, XMM_MMWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmB
.column_ld16:
        test    cl, SIZEOF_XMMWORD
        jz      short .column_ld32
        movdqa  xmmF,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .rgb_gray_cnv
.column_ld32:
        test    cl, 2*SIZEOF_XMMWORD
        mov     rcx, SIZEOF_XMMWORD
        jz      short .rgb_gray_cnv
        movdqa  xmmB,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_gray_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

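        ; In the layout comments above and below, each two-digit label gives
        ; the colour component index (first digit) and the pixel index within
        ; this group of 16 (second digit).  The shift/unpack sequence that
        ; follows transposes the three packed-RGB registers so that, by the
        ; end of this %if branch, each register holds a single component of
        ; either the even- or the odd-numbered pixels, zero-extended to
        ; 16-bit words for the pmaddwd stage.
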
        movdqa  xmmG,xmmA
        pslldq  xmmA,8          ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq  xmmG,8          ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq  xmmF,8          ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa  xmmD,xmmA
        pslldq  xmmA,8          ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq  xmmD,8          ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq  xmmG,8          ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa  xmmE,xmmA
        pslldq  xmmA,8          ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq  xmmE,8          ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq  xmmD,8          ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor    xmmH,xmmH

        movdqa  xmmC,xmmA
        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa  xmmB,xmmE
        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa  xmmF,xmmD
        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

.column_ld1:
        test    cl, SIZEOF_XMMWORD/16
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_XMMWORD/16
        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_XMMWORD/8
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_XMMWORD/8
        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmE
.column_ld4:
        test    cl, SIZEOF_XMMWORD/4
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_XMMWORD/4
        movdqa  xmmE,xmmA
        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test    cl, SIZEOF_XMMWORD/2
        mov     rcx, SIZEOF_XMMWORD
        jz      short .rgb_gray_cnv
        movdqa  xmmF,xmmA
        movdqa  xmmH,xmmE
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_gray_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

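        ; Same notation as above: first digit = colour component (3 being the
        ; fourth byte of each 4-byte pixel), second digit = pixel index.
        ; Because each pixel is a full dword here, a series of byte and word
        ; unpacks is enough to separate the components and zero-extend them
        ; to 16-bit words; the fourth component ends up in registers that the
        ; grayscale computation below never reads.
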
        movdqa  xmmD,xmmA
        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa  xmmC,xmmF
        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa  xmmB,xmmA
        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa  xmmG,xmmD
        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa  xmmE,xmmA
        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa  xmmH,xmmB
        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor    xmmF,xmmF

        movdqa  xmmC,xmmA
        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa  xmmD,xmmB
        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa  xmmG,xmmE
        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)

        punpcklbw xmmF,xmmH
        punpckhbw xmmH,xmmH
        psrlw   xmmF,BYTE_BIT   ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw   xmmH,BYTE_BIT   ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
        ;
        ; (This implementation)
        ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G

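        ; Splitting the 0.587 green coefficient into 0.337 + 0.250 keeps every
        ; scaled coefficient within the signed 16-bit range that pmaddwd
        ; requires, and lets each pmaddwd constant pair G with R
        ; (PW_F0299_F0337) or with B (PW_F0114_F0250).  The constants,
        ; SCALEBITS, and PD_ONEHALF are defined by the including file; the
        ; figures below assume the usual FIX(x) = x * 2^16 scaling
        ; (SCALEBITS = 16, PD_ONEHALF = 32768) and are shown only as an
        ; illustrative cross-check.  With those values, FIX(0.587) = 38470
        ; would overflow a signed word, while FIX(0.299) = 19595,
        ; FIX(0.337) = 22086, FIX(0.114) = 7471, and FIX(0.250) = 16384 all
        ; fit, and their sum is exactly 65536, so a white pixel stays white:
        ;   Y = (65536*255 + 32768) >> 16 = 255
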
        movdqa  xmm6,xmm1
        punpcklwd xmm1,xmm3
        punpckhwd xmm6,xmm3
        pmaddwd xmm1,[rel PW_F0299_F0337]       ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd xmm6,[rel PW_F0299_F0337]       ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

        movdqa  xmm7, xmm6                      ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

        movdqa  xmm6,xmm0
        punpcklwd xmm0,xmm2
        punpckhwd xmm6,xmm2
        pmaddwd xmm0,[rel PW_F0299_F0337]       ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd xmm6,[rel PW_F0299_F0337]       ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

        movdqa  XMMWORD [wk(0)], xmm0           ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa  XMMWORD [wk(1)], xmm6           ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

        movdqa  xmm0, xmm5                      ; xmm0=BO
        movdqa  xmm6, xmm4                      ; xmm6=BE

        movdqa  xmm4,xmm0
        punpcklwd xmm0,xmm3
        punpckhwd xmm4,xmm3
        pmaddwd xmm0,[rel PW_F0114_F0250]       ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd xmm4,[rel PW_F0114_F0250]       ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

        movdqa  xmm3,[rel PD_ONEHALF]           ; xmm3=[PD_ONEHALF]

        paddd   xmm0, xmm1
        paddd   xmm4, xmm7
        paddd   xmm0,xmm3
        paddd   xmm4,xmm3
        psrld   xmm0,SCALEBITS                  ; xmm0=YOL
        psrld   xmm4,SCALEBITS                  ; xmm4=YOH
        packssdw xmm0,xmm4                      ; xmm0=YO

        movdqa  xmm4,xmm6
        punpcklwd xmm6,xmm2
        punpckhwd xmm4,xmm2
        pmaddwd xmm6,[rel PW_F0114_F0250]       ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd xmm4,[rel PW_F0114_F0250]       ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

        movdqa  xmm2,[rel PD_ONEHALF]           ; xmm2=[PD_ONEHALF]

        paddd   xmm6, XMMWORD [wk(0)]
        paddd   xmm4, XMMWORD [wk(1)]
        paddd   xmm6,xmm2
        paddd   xmm4,xmm2
        psrld   xmm6,SCALEBITS                  ; xmm6=YEL
        psrld   xmm4,SCALEBITS                  ; xmm4=YEH
        packssdw xmm6,xmm4                      ; xmm6=YE

        psllw   xmm0,BYTE_BIT                   ; xmm0=YO<<8 (odd samples moved to high bytes)
        por     xmm6,xmm0                       ; xmm6=Y (even/odd samples interleaved)
        movdqa  XMMWORD [rdi], xmm6             ; Save Y

        sub     rcx, byte SIZEOF_XMMWORD
        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx,rcx
        jnz     near .column_ld1

        pop     rcx                     ; col
        pop     rsi
        pop     rdi

        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
        add     rdi, byte SIZEOF_JSAMPROW
        dec     rax                             ; num_rows
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp,rbp         ; rsp <- aligned rbp
        pop     rsp             ; rsp <- original rbp
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16