;
; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;

; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
;
; NOTE(review): collect_args / uncollect_args are macros from the include
; files; per the list above they are expected to leave the four arguments
; in r10-r13 on every supported ABI -- confirm against jsimdext.inc.
;
; Register roles once the setup code has run:
;   rcx = number of output columns still to be written
;   rsi = inptr0 (Y row), rbx = inptr1 (Cb row), rdx = inptr2 (Cr row)
;   rdi = outptr (output_buf[0])
;   al  = Yctr: two 16-sample Y passes are made per 16 chroma samples
;
; wk(0..2) are three XMMWORD scratch slots just below the aligned rbp.  They
; carry the high-half (B-Y), (R-Y) and (G-Y) terms from the chroma
; computation to the second Y pass.
;
; All three input rows are loaded with movdqa, so the row pointers must be
; 16-byte aligned.  A full XMMWORD is loaded from each row on every pass,
; even when fewer than 16 columns remain (presumably the row buffers are
; padded accordingly -- TODO confirm with the allocator).

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  3

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)

EXTN(jsimd_h2v1_merged_upsample_sse2):
    ; Build a 16-byte-aligned frame.  [rbp] keeps the pre-alignment rsp so
    ; that the epilogue can undo the alignment with "pop rsp".
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push (address of saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
    collect_args 4
    push        rbx

    mov         ecx, r10d               ; col
    test        rcx, rcx
    jz          near .return            ; nothing to do for a zero-width row

    push        rcx                     ; rcx is reused as the row index below

    mov         rdi, r11
    mov         ecx, r12d
    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdi, r13
    mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
    mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
    mov         rdi, JSAMPROW [rdi]                      ; outptr

    pop         rcx                     ; col

    ; Each iteration consumes 16 Cb and 16 Cr samples and (through the two
    ; Y passes) up to 32 Y samples.
.columnloop:

    movdqa      xmm6, XMMWORD [rbx]     ; xmm6=Cb(0123456789ABCDEF)
    movdqa      xmm7, XMMWORD [rdx]     ; xmm7=Cr(0123456789ABCDEF)

    pxor        xmm1, xmm1              ; xmm1=(all 0's)
    pcmpeqw     xmm3, xmm3
    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    ; Zero-extend the chroma bytes to words, split into low/high halves.
    movdqa      xmm4, xmm6
    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
    movdqa      xmm0, xmm7
    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL

    ; Adding 0xFF80 (-128 as a signed word) re-centres the chroma on zero.
    paddw       xmm6, xmm3
    paddw       xmm4, xmm3
    paddw       xmm7, xmm3
    paddw       xmm0, xmm3

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movdqa      xmm5, xmm6              ; xmm5=CbH
    movdqa      xmm2, xmm4              ; xmm2=CbL
    paddw       xmm6, xmm6              ; xmm6=2*CbH
    paddw       xmm4, xmm4              ; xmm4=2*CbL
    movdqa      xmm1, xmm7              ; xmm1=CrH
    movdqa      xmm3, xmm0              ; xmm3=CrL
    paddw       xmm7, xmm7              ; xmm7=2*CrH
    paddw       xmm0, xmm0              ; xmm0=2*CrL

    pmulhw      xmm6, [rel PW_MF0228]   ; xmm6=(2*CbH * -FIX(0.22800))
    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbL * -FIX(0.22800))
    pmulhw      xmm7, [rel PW_F0402]    ; xmm7=(2*CrH * FIX(0.40200))
    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrL * FIX(0.40200))

    ; The products were formed from doubled inputs; add 1 and shift right
    ; by one to halve them with rounding.
    paddw       xmm6, [rel PW_ONE]
    paddw       xmm4, [rel PW_ONE]
    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
    paddw       xmm7, [rel PW_ONE]
    paddw       xmm0, [rel PW_ONE]
    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))

    paddw       xmm6, xmm5
    paddw       xmm4, xmm2
    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L

    ; Park the high halves; xmm6/xmm7 are needed as temporaries below.
    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H

    ; Interleave Cb/Cr words so that one pmaddwd yields
    ; Cb * -FIX(0.344) + Cr * FIX(0.285) per pixel as a dword.
    movdqa      xmm6, xmm5
    movdqa      xmm7, xmm2
    punpcklwd   xmm5, xmm1
    punpckhwd   xmm6, xmm1
    pmaddwd     xmm5, [rel PW_MF0344_F0285]
    pmaddwd     xmm6, [rel PW_MF0344_F0285]
    punpcklwd   xmm2, xmm3
    punpckhwd   xmm7, xmm3
    pmaddwd     xmm2, [rel PW_MF0344_F0285]
    pmaddwd     xmm7, [rel PW_MF0344_F0285]

    paddd       xmm5, [rel PD_ONEHALF]
    paddd       xmm6, [rel PD_ONEHALF]
    psrad       xmm5, SCALEBITS
    psrad       xmm6, SCALEBITS
    paddd       xmm2, [rel PD_ONEHALF]
    paddd       xmm7, [rel PD_ONEHALF]
    psrad       xmm2, SCALEBITS
    psrad       xmm7, SCALEBITS

    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H

    ; First pass uses the low-half terms still live in xmm0/xmm2/xmm4; the
    ; second pass reloads the high-half terms from wk[].
    mov         al, 2                   ; Yctr
    jmp         short .Yloop_1st

.Yloop_2nd:
    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H

.Yloop_1st:
    movdqa      xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)

    ; Split Y into even and odd samples; each chroma word is applied to one
    ; even and one odd Y sample (the 2:1 horizontal upsampling).
    pcmpeqw     xmm6, xmm6
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO

    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)

    ; packuswb saturates each sum to an unsigned byte.
    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)

    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)

    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)

    ; NOTE(review): xmmA..xmmH are aliases defined outside this file
    ; (presumably in jcolsamp.inc, selected by the RGB component order);
    ; the byte maps below use <component><pixel> notation.

%if RGB_PIXELSIZE == 3  ; ---------------

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmB              ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    punpcklbw   xmmD, xmmF              ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

    movdqa      xmmG, xmmA
    movdqa      xmmH, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    punpckhwd   xmmG, xmmE              ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

    psrldq      xmmH, 2                 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    psrldq      xmmE, 2                 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

    movdqa      xmmC, xmmD
    movdqa      xmmB, xmmD
    punpcklwd   xmmD, xmmH              ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    punpckhwd   xmmC, xmmH              ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

    psrldq      xmmB, 2                 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

    movdqa      xmmF, xmmE
    punpcklwd   xmmE, xmmB              ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    punpckhwd   xmmF, xmmB              ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

    pshufd      xmmH, xmmA, 0x4E        ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    movdqa      xmmB, xmmE
    punpckldq   xmmA, xmmD              ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    punpckldq   xmmE, xmmH              ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    punpckhdq   xmmD, xmmB              ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

    pshufd      xmmH, xmmG, 0x4E        ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    movdqa      xmmB, xmmF
    punpckldq   xmmG, xmmC              ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    punpckldq   xmmF, xmmH              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    punpckhdq   xmmC, xmmB              ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

    punpcklqdq  xmmA, xmmE              ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    punpcklqdq  xmmD, xmmG              ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    punpcklqdq  xmmF, xmmC              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    ; xmmA:xmmD:xmmF now hold 16 packed 3-byte pixels (48 bytes).
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32      ; fewer than 16 columns left: partial store

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .endcolumn

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .Yloop_2nd

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

    ; Tail: rcx (< 16 columns) is converted to a byte count (< 48) and the
    ; remainder is written in 32/16/8/4/2/1-byte pieces, shifting the
    ; unwritten bytes down into the low end of xmmA after each piece.
.column_st32:
    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
    cmp         rcx, byte 2*SIZEOF_XMMWORD
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmF
    sub         rcx, byte 2*SIZEOF_XMMWORD
    jmp         short .column_st15
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD
.column_st15:
    ; Store the lower 8 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_MMWORD
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_MMWORD
    sub         rcx, byte SIZEOF_MMWORD
    psrldq      xmmA, SIZEOF_MMWORD
.column_st7:
    ; Store the lower 4 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_DWORD
    jb          short .column_st3
    movd        XMM_DWORD [rdi], xmmA
    add         rdi, byte SIZEOF_DWORD
    sub         rcx, byte SIZEOF_DWORD
    psrldq      xmmA, SIZEOF_DWORD
.column_st3:
    ; Store the lower 2 bytes of rax to the output when it has enough
    ; space.  (This overwrites Yctr in al, which is no longer needed on
    ; the tail path.)
    movd        eax, xmmA
    cmp         rcx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         WORD [rdi], ax
    add         rdi, byte SIZEOF_WORD
    sub         rcx, byte SIZEOF_WORD
    shr         rax, 16
.column_st1:
    ; Store the lower 1 byte of rax to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .endcolumn
    mov         BYTE [rdi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; The fourth (filler) byte of every pixel is either 0xFF or 0x00.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%else
    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%endif
    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmG              ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    punpcklbw   xmmB, xmmD              ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    punpcklbw   xmmF, xmmH              ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

    movdqa      xmmC, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    punpckhwd   xmmC, xmmE              ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    movdqa      xmmG, xmmB
    punpcklwd   xmmB, xmmF              ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    punpckhwd   xmmG, xmmF              ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpckldq   xmmA, xmmB              ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    punpckhdq   xmmD, xmmB              ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    movdqa      xmmH, xmmC
    punpckldq   xmmC, xmmG              ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    punpckhdq   xmmH, xmmG              ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    ; xmmA:xmmD:xmmC:xmmH now hold 16 packed 4-byte pixels (64 bytes).
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32      ; fewer than 16 columns left: partial store

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .endcolumn

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .Yloop_2nd

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

    ; Tail: here rcx stays a pixel count (< 16); the remainder is written
    ; as 8, 4, 2 and finally 1 pixel(s).
.column_st32:
    cmp         rcx, byte SIZEOF_XMMWORD/2
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmC
    movdqa      xmmD, xmmH
    sub         rcx, byte SIZEOF_XMMWORD/2
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD/4
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD/4
.column_st15:
    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_XMMWORD/8
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_XMMWORD/8*4
    sub         rcx, byte SIZEOF_XMMWORD/8
    psrldq      xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .endcolumn
    movd        XMM_DWORD [rdi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

.endcolumn:
    sfence                              ; flush the write buffer

.return:
    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- pre-alignment rsp saved at [rbp]
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;

; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
;
; This routine does no pixel work itself.  It calls
; jsimd_h2v1_merged_upsample_sse2 twice, each time passing the address of a
; three-entry pointer array built on the stack in place of input_buf:
;
;   { input_buf[0] + in_row_group_ctr * SIZEOF_JSAMPROW (+ SIZEOF_JSAMPROW
;     for the second call), input_buf[1], input_buf[2] }
;
; The h2v1 routine indexes entry 0 by in_row_group_ctr again, so the two
; calls read Y rows 2*ctr and 2*ctr+1 while sharing chroma row ctr.  The
; second call also receives output_buf advanced by one row pointer.

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)

EXTN(jsimd_h2v2_merged_upsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4
    push        rbx

    mov         eax, r10d

    mov         rdi, r11
    mov         ecx, r12d
    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdi, r13
    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]

    ; Pushed in reverse so that [rsp] is entry 0 of the temporary array.
    push        rdx                     ; inptr2
    push        rbx                     ; inptr1
    push        rsi                     ; inptr00
    mov         rbx, rsp                ; rbx = address of the temporary array

    ; Saved so they can be restored after the call (popped below).
    push        rdi
    push        rcx
    push        rax

    ; Marshal (output_width, array, in_row_group_ctr, output_buf) into the
    ; argument registers.  The move order matters: each source is read
    ; before it is overwritten.
    %ifdef WIN64
    mov         r8, rcx
    mov         r9, rdi
    mov         rcx, rax
    mov         rdx, rbx
    %else
    mov         rdx, rcx
    mov         rcx, rdi
    mov         rdi, rax
    mov         rsi, rbx
    %endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    pop         rax
    pop         rcx
    pop         rdi
    pop         rsi
    pop         rbx
    pop         rdx

    ; Second row: next output row pointer and next Y row pointer.
    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01

    push        rdx                     ; inptr2
    push        rbx                     ; inptr1
    push        rsi                     ; inptr00
    mov         rbx, rsp                ; rbx = address of the temporary array

    push        rdi
    push        rcx
    push        rax

    %ifdef WIN64
    mov         r8, rcx
    mov         r9, rdi
    mov         rcx, rax
    mov         rdx, rbx
    %else
    mov         rdx, rcx
    mov         rcx, rdi
    mov         rdi, rax
    mov         rsi, rbx
    %endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    pop         rax
    pop         rcx
    pop         rdi
    pop         rsi
    pop         rbx
    pop         rdx

    pop         rbx
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32