;
; jcsample.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Processing happens in two phases:
;   1. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      the sample at column image_width-1 is replicated into columns
;      [image_width, 2*output_cols), where output_cols = width_in_blocks * 8.
;   2. h2v1_downsample: for each of v_samp_factor rows, every horizontal pair
;      of input samples (a, b) becomes one output sample (a + b + bias) >> 1,
;      with bias alternating 0, 1, 0, 1, ... across output columns.
;
; Notes derived from the instructions used below:
;   - Row data is accessed with movdqa, so every input and output row pointer
;     must be aligned to SIZEOF_XMMWORD.
;   - Each pass of the column loop stores a full XMMWORD to the output row,
;     including the pass that handles a final half-width remainder, so the
;     output row must have room for that store.
;   - The 32-bit arguments are read through their 32-bit register views only;
;     the upper halves of the argument registers are never relied upon.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): presumably consumed by
                                        ; collect_args - confirm in jsimdext.inc
    mov         rbp, rsp
    collect_args 6                      ; expected to leave the six arguments
                                        ; in r10-r15 as listed above

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero-width output

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for phase 2
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding columns
    jle         short .expand_end       ; row already wide enough

    ; max_v_samp_factor is a 32-bit int; read only r11d so that the sign test
    ; and the loop counter do not depend on the upper half of r11.
    mov         eax, r11d
    test        eax, eax
    jle         short .expand_end

    cld                                 ; rep stosb must advance forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value below
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = &row[image_width]
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it across the padding

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is reloaded for each row
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than SIZEOF_XMMWORD output columns remain: load a single input
    ; vector, zero the second one, and pretend a full vector is left so the
    ; shared code below terminates the loop after this pass.
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-indexed samples, as words
    psrlw       xmm2, BYTE_BIT          ; odd-indexed samples, as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; pairwise horizontal sums
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; add alternating rounding bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; average of two samples
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; words -> bytes

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                        ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Processing happens in two phases:
;   1. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      the sample at column image_width-1 is replicated into columns
;      [image_width, 2*output_cols), where output_cols = width_in_blocks * 8.
;   2. h2v2_downsample: for each of v_samp_factor output rows, two consecutive
;      input rows are consumed and every 2x2 group of input samples becomes
;      one output sample (a + b + c + d + bias) >> 2, with bias alternating
;      1, 2, 1, 2, ... across output columns.
;
; Notes derived from the instructions used below:
;   - Row data is accessed with movdqa, so every input and output row pointer
;     must be aligned to SIZEOF_XMMWORD.
;   - Each pass of the column loop stores a full XMMWORD to the output row,
;     including the pass that handles a final half-width remainder, so the
;     output row must have room for that store.
;   - The 32-bit arguments are read through their 32-bit register views only;
;     the upper halves of the argument registers are never relied upon.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): presumably consumed by
                                        ; collect_args - confirm in jsimdext.inc
    mov         rbp, rsp
    collect_args 6                      ; expected to leave the six arguments
                                        ; in r10-r15 as listed above

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero-width output

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for phase 2
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding columns
    jle         short .expand_end       ; row already wide enough

    ; max_v_samp_factor is a 32-bit int; read only r11d so that the sign test
    ; and the loop counter do not depend on the upper half of r11.
    mov         eax, r11d
    test        eax, eax
    jle         short .expand_end

    cld                                 ; rep stosb must advance forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value below
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = &row[image_width]
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it across the padding

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax                ; 32-bit test, same as the h2v1 routine
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is reloaded for each row
    push        rdi
    push        rsi

    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdi, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than SIZEOF_XMMWORD output columns remain: load one vector from
    ; each input row, zero the second pair, and pretend a full vector is left
    ; so the shared code below terminates the loop after this pass.
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    ; First input vector of each row: horizontal pair sums, as words.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second input vector of each row: horizontal pair sums, as words.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; add the two rows together
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; add alternating rounding bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; average of four samples
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; words -> bytes

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32