;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
;                             int num_rows);
;
; For each of num_rows rows, reads the interleaved pixels of one input_buf
; row and writes one luminance (Y) sample per pixel to the corresponding row
; of output_buf[0], starting at row output_row.  Returns immediately when
; img_width is 0 or num_rows <= 0.
;
; Notes:
;   - Pixels are converted SIZEOF_XMMWORD at a time, and every pass ends with
;     a full-width movdqa store, including the final partial pass.  Each
;     output row must therefore be XMMWORD-aligned and have room for the
;     row width rounded up to a multiple of SIZEOF_XMMWORD samples.
;   - Two XMMWORD scratch slots (wk(0), wk(1)) are carved out of the stack
;     just below the aligned frame pointer.
;   - xmmA..xmmH, RGB_PIXELSIZE, SCALEBITS, collect_args/uncollect_args and
;     the PW_*/PD_* constants are not defined in this file; presumably they
;     come from jcolsamp.inc and the file that includes this one -- confirm
;     there before changing register usage.
;

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)

EXTN(jsimd_rgb_gray_convert_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push (points at saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp at the frame base
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve WK_NUM xmmwords below rbp
    collect_args 5
    push        rbx

    mov         ecx, r10d                    ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return

    push        rcx

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]

    pop         rcx

    mov         rsi, r11
    mov         eax, r14d                    ; rax = num_rows (upper 32 bits zeroed)
    test        eax, eax                     ; num_rows is a signed int: test the
    jle         near .return                 ;   32-bit value so negatives exit here
.rowloop:
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr0

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial final pass: gather the remaining (col * 3) bytes piecewise,
    ; from the end of the row backwards, so that no load reads past the
    ; last pixel of the row.
.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, BYTE [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, WORD [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    movdqa      xmmF, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov         rcx, SIZEOF_XMMWORD     ; treat the tail as one full pass
    jmp         short .rgb_gray_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD
    mov         rcx, SIZEOF_XMMWORD     ; treat the tail as one full pass
    jz          short .rgb_gray_cnv
    movdqa      xmmB, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    pxor        xmmH, xmmH

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial final pass: gather the remaining pixels (4 bytes each)
    ; piecewise, from the end of the row backwards, so that no load reads
    ; past the last pixel of the row.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    mov         rcx, SIZEOF_XMMWORD     ; treat the tail as one full pass
    jz          short .rgb_gray_cnv
    movdqa      xmmF, xmmA
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE  ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE  ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH  ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH  ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF  ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC  ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC  ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD  ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG  ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG  ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    pxor        xmmF, xmmF

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmF  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF  ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmF  ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmG, xmmF  ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; xmmF is still zero here: interleave so each sample lands in the high
    ; byte of its word, then shift down to zero-extend it.
    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G

    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3
    punpckhwd   xmm6, xmm3
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa      xmm0, xmm5              ; xmm0=BO
    movdqa      xmm6, xmm4              ; xmm6=BE

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    movdqa      xmm3, [rel PD_ONEHALF]  ; xmm3=[PD_ONEHALF]

    paddd       xmm0, xmm1
    paddd       xmm4, xmm7
    paddd       xmm0, xmm3
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    packssdw    xmm0, xmm4              ; xmm0=YO

    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    movdqa      xmm2, [rel PD_ONEHALF]  ; xmm2=[PD_ONEHALF]

    paddd       xmm6, XMMWORD [wk(0)]
    paddd       xmm4, XMMWORD [wk(1)]
    paddd       xmm6, xmm2
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    packssdw    xmm6, xmm4              ; xmm6=YE

    psllw       xmm0, BYTE_BIT          ; odd samples -> high byte of each word
    por         xmm6, xmm0              ; xmm6=Y
    movdqa      XMMWORD [rdi], xmm6     ; Save Y

    sub         rcx, byte SIZEOF_XMMWORD
    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1

    pop         rcx                     ; col
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- unaligned rsp saved in the prologue
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32