;
; jcsample.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32

; --------------------------------------------------------------------------
; Stack-argument accessors, relative to the frame pointer set up in each
; prologue (b = ebp).  Both entry points in this file take the same six
; arguments in the same order, so the offsets are declared once.
;
%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component: 2:1 horizontally,
; 1:1 vertically, no smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; In:    all arguments on the stack (see the accessors above).
; Out:   nothing is returned in a register.  For each of v_samp_factor rows,
;        every output sample is (in[2k] + in[2k+1] + bias) >> 1, where the
;        bias alternates 0,1 across successive output columns.
; Steps: 1. If width_blocks*8*2 exceeds image_width, the first
;           max_v_samp_factor input rows are padded on the right by
;           replicating their last real sample (this writes into input_data).
;        2. Rows are then reduced in chunks of SIZEOF_XMMWORD output samples.
; Saved: ebp, esi, edi.
; Clobb: eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags; DF is cleared when the
;        padding step runs.
; Note:  row data is accessed with movdqa, so every row pointer has to be
;        aligned for XMMWORD access.
;
        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks*DCTSIZE)
        jz      near .done              ; zero blocks: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; -- pad the right edge of the input rows

        push    ecx                     ; park output_cols
        shl     ecx, 1                  ; columns the main loop will read
        sub     ecx, edx                ; ecx = bytes missing per row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; row pointer cursor
        cld                             ; stosb must walk upwards
        alignx  16,7
.pad_row:
        push    ecx                     ; rep stosb consumes ecx ...
        push    eax                     ; ... and al carries the fill value

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first byte past the real data
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb                       ; replicate it across the gap

        pop     eax
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; -- h2v1 downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows left to produce
        test    eax, eax
        jle     near .done

        pcmpeqw xmm6, xmm6
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00010000         ; bias pattern
        movd    xmm7, edx
        pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}

        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output row cursor
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input row cursor
        alignx  16,7
.next_row:
        push    esi                     ; row cursors and the column count are
        push    edi                     ; reused as working registers below
        push    ecx

        mov     edi, JSAMPROW [edi]     ; edi = outptr
        mov     esi, JSAMPROW [esi]     ; esi = inptr

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_chunk
        alignx  16,7

.tail_chunk:
        ; Fewer than SIZEOF_XMMWORD output columns remain: read a single
        ; input XMMWORD and substitute zeros for the second one.  A whole
        ; XMMWORD is still stored, so ecx is forced to one chunk and the
        ; loop ends right after it.
        mov     ecx, SIZEOF_XMMWORD
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        jmp     short .average
        alignx  16,7

.full_chunk:
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
        ; first input XMMWORD -> low half of the result
        movdqa  xmm2, xmm0
        pand    xmm0, xmm6              ; even-numbered samples, as words
        psrlw   xmm2, BYTE_BIT          ; odd-numbered samples, as words
        paddw   xmm0, xmm2
        paddw   xmm0, xmm7              ; + bias
        psrlw   xmm0, 1

        ; second input XMMWORD -> high half of the result
        movdqa  xmm3, xmm1
        pand    xmm1, xmm6
        psrlw   xmm3, BYTE_BIT
        paddw   xmm1, xmm3
        paddw   xmm1, xmm7
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; words back to bytes

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
        sub     ecx, byte SIZEOF_XMMWORD        ; output columns still to do
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_chunk
        test    ecx, ecx
        jnz     short .tail_chunk

        pop     ecx
        pop     edi
        pop     esi

        add     edi, byte SIZEOF_JSAMPROW       ; next output row
        add     esi, byte SIZEOF_JSAMPROW       ; next input row
        dec     eax                             ; one row finished
        jg      near .next_row

.done:
        pop     esi
        pop     edi
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component: 2:1 horizontally,
; 2:1 vertically, no smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; In:    all arguments on the stack (see the accessors near the top of the
;        file).
; Out:   nothing is returned in a register.  Each of the v_samp_factor output
;        rows is built from two consecutive input rows; every output sample
;        is (r0[2k] + r0[2k+1] + r1[2k] + r1[2k+1] + bias) >> 2, where the
;        bias alternates 1,2 across successive output columns.
; Steps: 1. If width_blocks*8*2 exceeds image_width, the first
;           max_v_samp_factor input rows are padded on the right by
;           replicating their last real sample (this writes into input_data).
;        2. Row pairs are then reduced in chunks of SIZEOF_XMMWORD output
;           samples.
; Saved: ebp, esi, edi.
; Clobb: eax, ecx, edx, xmm0-xmm7, flags; DF is cleared when the padding
;        step runs.
; Note:  row data is accessed with movdqa, so every row pointer has to be
;        aligned for XMMWORD access.
;
        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks*DCTSIZE)
        jz      near .done              ; zero blocks: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; -- pad the right edge of the input rows

        push    ecx                     ; park output_cols
        shl     ecx, 1                  ; columns the main loop will read
        sub     ecx, edx                ; ecx = bytes missing per row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; row pointer cursor
        cld                             ; stosb must walk upwards
        alignx  16,7
.pad_row:
        push    ecx                     ; rep stosb consumes ecx ...
        push    eax                     ; ... and al carries the fill value

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first byte past the real data
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb                       ; replicate it across the gap

        pop     eax
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; -- h2v2 downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows left
        test    eax, eax
        jle     near .done

        pcmpeqw xmm6, xmm6
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00020001         ; bias pattern
        movd    xmm7, edx
        pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}

        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output row cursor
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input row cursor
        alignx  16,7
.next_row:
        push    esi                     ; row cursors and the column count are
        push    edi                     ; reused as working registers below
        push    ecx

        mov     edi, JSAMPROW [edi]                     ; edi = outptr
        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = inptr1

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_chunk
        alignx  16,7

.tail_chunk:
        ; Fewer than SIZEOF_XMMWORD output columns remain: read a single
        ; XMMWORD from each input row and substitute zeros for the second
        ; pair.  A whole XMMWORD is still stored, so ecx is forced to one
        ; chunk and the loop ends right after it.
        mov     ecx, SIZEOF_XMMWORD
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        jmp     short .average
        alignx  16,7

.full_chunk:
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
        ; horizontal pair sums: upper row, first XMMWORD
        movdqa  xmm4, xmm0
        pand    xmm0, xmm6              ; even-numbered samples, as words
        psrlw   xmm4, BYTE_BIT          ; odd-numbered samples, as words
        paddw   xmm0, xmm4

        ; lower row, first XMMWORD
        movdqa  xmm5, xmm1
        pand    xmm1, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm1, xmm5

        ; upper row, second XMMWORD
        movdqa  xmm4, xmm2
        pand    xmm2, xmm6
        psrlw   xmm4, BYTE_BIT
        paddw   xmm2, xmm4

        ; lower row, second XMMWORD
        movdqa  xmm5, xmm3
        pand    xmm3, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm3, xmm5

        ; merge the two rows, add the bias and divide by four
        paddw   xmm0, xmm1
        paddw   xmm0, xmm7
        psrlw   xmm0, 2

        paddw   xmm2, xmm3
        paddw   xmm2, xmm7
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; words back to bytes

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
        sub     ecx, byte SIZEOF_XMMWORD        ; output columns still to do
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     near .full_chunk
        test    ecx, ecx
        jnz     near .tail_chunk

        pop     ecx
        pop     edi
        pop     esi

        add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row
        add     esi, byte 2*SIZEOF_JSAMPROW     ; skip the input row pair
        dec     eax                             ; one output row finished
        jg      near .next_row

.done:
        pop     esi
        pop     edi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16