;
; jcsample.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Syntax: NASM, 32-bit code.  All six arguments are read from the stack
; through ebp (offsets +8 .. +28, see the %defines below).
;
; Outline:
;   1. Pad each of the first max_v_samp_factor input rows on the right, by
;      replicating the sample at column image_width-1, until the row is
;      2 * output_cols samples wide (output_cols = width_blocks * 8).
;   2. For each of v_samp_factor rows, replace every horizontal pair of
;      input samples by (a + b + bias) >> 1, where the bias alternates
;      0, 1, 0, 1, ... across output columns.
;
; Register roles:
;   ecx      output_cols / remaining output columns in the current row
;   edx      image_width during padding; scratch for the bias afterwards
;   eax      row counter (al doubles as the fill byte for rep stosb)
;   esi/edi  row-pointer arrays, and inside the loops the row pointers
;   mm6      word mask 0x00FF in each of the four words
;   mm7      rounding bias words {0, 1, 0, 1}
;
; Preserved: ebp, esi, edi.  ebx is not touched.
; Clobbered: eax, ecx, edx, mm0-mm3, mm6, mm7, flags (ecx and edx are
;            deliberately not saved, as in the original prologue notes).
; emms is executed before returning on the path that used MMX registers.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * DCTSIZE)
        jz      near .done              ; zero blocks: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- step 1: pad the right edge of the input rows ----

        push    ecx                     ; keep output_cols for step 2
        shl     ecx, 1                  ; ecx = output_cols * 2
        sub     ecx, edx                ; ecx = samples to append per row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]
        test    eax, eax
        jle     short .pad_done         ; no rows to pad

        cld                             ; rep stosb must walk upwards
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16, 7
.pad_row:
        push    eax                     ; al is about to become the fill byte
        push    ecx                     ; rep stosb consumes ecx

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first column past the image
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb                       ; append ecx copies of al

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- step 2: 2:1 horizontal averaging ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows to produce
        test    eax, eax
        jle     near .done              ; MMX state untouched, no emms needed

        pcmpeqw   mm6, mm6
        psrlw     mm6, BYTE_BIT         ; mm6 = {0x00FF, 0x00FF, 0x00FF, 0x00FF}
        mov       edx, 0x00010000       ; low word 0, high word 1
        movd      mm7, edx
        punpckldq mm7, mm7              ; mm7 = {0, 1, 0, 1}

        mov     edi, JSAMPARRAY [output_data(ebp)]
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16, 7
.next_row:
        push    ecx                     ; column count is consumed per row
        push    edi                     ; save position in output_data
        push    esi                     ; save position in input_data

        mov     edi, JSAMPROW [edi]     ; edi = output row
        mov     esi, JSAMPROW [esi]     ; esi = input row
        alignx  16, 7
.next_block:
        ; 16 input samples in, 8 output samples out.

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm2, mm0
        movq    mm3, mm1

        psrlw   mm2, BYTE_BIT           ; mm2/mm3 = high byte of each word
        psrlw   mm3, BYTE_BIT
        pand    mm0, mm6                ; mm0/mm1 = low byte of each word
        pand    mm1, mm6

        paddw   mm0, mm2                ; pair sums
        paddw   mm1, mm3
        paddw   mm0, mm7                ; + alternating bias
        paddw   mm1, mm7
        psrlw   mm0, 1                  ; / 2
        psrlw   mm1, 1

        packuswb mm0, mm1               ; 8 words -> 8 bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edi, byte 1*SIZEOF_MMWORD       ; output row pointer
        add     esi, byte 2*SIZEOF_MMWORD       ; input row pointer
        sub     ecx, byte SIZEOF_MMWORD         ; columns left in this row
        jnz     short .next_block

        pop     esi
        pop     edi
        pop     ecx

        add     edi, byte SIZEOF_JSAMPROW       ; next output row
        add     esi, byte SIZEOF_JSAMPROW       ; next input row
        dec     eax                             ; rows left
        jg      short .next_row

        emms                            ; empty MMX state

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Syntax: NASM, 32-bit code.  All six arguments are read from the stack
; through ebp (offsets +8 .. +28, see the %defines below).
;
; Outline:
;   1. Pad each of the first max_v_samp_factor input rows on the right, by
;      replicating the sample at column image_width-1, until the row is
;      2 * output_cols samples wide (output_cols = width_blocks * 8).
;   2. For each of v_samp_factor output rows, read two consecutive input
;      rows and replace every 2x2 group of samples by
;      (a + b + c + d + bias) >> 2, where the bias alternates
;      1, 2, 1, 2, ... across output columns.
;
; Register roles:
;   ecx      output_cols / remaining output columns in the current row
;   edx      image_width during padding, bias scratch, then upper input row
;   eax      row counter (al doubles as the fill byte for rep stosb)
;   esi      input row-pointer array; inside the loops the lower input row
;   edi      output row-pointer array; inside the loops the output row
;   mm6      word mask 0x00FF in each of the four words
;   mm7      rounding bias words {1, 2, 1, 2}
;
; Preserved: ebp, esi, edi.  ebx is not touched.
; Clobbered: eax, ecx, edx, mm0-mm7, flags (ecx and edx are deliberately
;            not saved, as in the original prologue notes).
; emms is executed before returning on the path that used MMX registers.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * DCTSIZE)
        jz      near .done              ; zero blocks: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- step 1: pad the right edge of the input rows ----

        push    ecx                     ; keep output_cols for step 2
        shl     ecx, 1                  ; ecx = output_cols * 2
        sub     ecx, edx                ; ecx = samples to append per row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]
        test    eax, eax
        jle     short .pad_done         ; no rows to pad

        cld                             ; rep stosb must walk upwards
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16, 7
.pad_row:
        push    eax                     ; al is about to become the fill byte
        push    ecx                     ; rep stosb consumes ecx

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first column past the image
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb                       ; append ecx copies of al

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- step 2: 2x2 box averaging ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows to produce
        test    eax, eax
        jle     near .done              ; MMX state untouched, no emms needed

        pcmpeqw   mm6, mm6
        psrlw     mm6, BYTE_BIT         ; mm6 = {0x00FF, 0x00FF, 0x00FF, 0x00FF}
        mov       edx, 0x00020001       ; low word 1, high word 2
        movd      mm7, edx
        punpckldq mm7, mm7              ; mm7 = {1, 2, 1, 2}

        mov     edi, JSAMPARRAY [output_data(ebp)]
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16, 7
.next_row:
        push    ecx                     ; column count is consumed per row
        push    edi                     ; save position in output_data
        push    esi                     ; save position in input_data

        mov     edi, JSAMPROW [edi]                     ; edi = output row
        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = upper input row
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = lower input row
        alignx  16, 7
.next_block:
        ; 2 rows x 16 input samples in, 8 output samples out.

        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]       ; upper row, first half
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]       ; lower row, first half
        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]       ; upper row, second half
        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]       ; lower row, second half

        ; Horizontal pair sums for the first half (mm0 upper, mm1 lower).
        movq    mm4, mm0
        movq    mm5, mm1
        psrlw   mm4, BYTE_BIT           ; high byte of each word
        psrlw   mm5, BYTE_BIT
        pand    mm0, mm6                ; low byte of each word
        pand    mm1, mm6
        paddw   mm0, mm4
        paddw   mm1, mm5

        ; Horizontal pair sums for the second half (mm2 upper, mm3 lower).
        movq    mm4, mm2
        movq    mm5, mm3
        psrlw   mm4, BYTE_BIT
        psrlw   mm5, BYTE_BIT
        pand    mm2, mm6
        pand    mm3, mm6
        paddw   mm2, mm4
        paddw   mm3, mm5

        paddw   mm0, mm1                ; add upper and lower rows
        paddw   mm2, mm3
        paddw   mm0, mm7                ; + alternating bias
        paddw   mm2, mm7
        psrlw   mm0, 2                  ; / 4
        psrlw   mm2, 2

        packuswb mm0, mm2               ; 8 words -> 8 bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edi, byte 1*SIZEOF_MMWORD       ; output row pointer
        add     edx, byte 2*SIZEOF_MMWORD       ; upper input row pointer
        add     esi, byte 2*SIZEOF_MMWORD       ; lower input row pointer
        sub     ecx, byte SIZEOF_MMWORD         ; columns left in this row
        jnz     near .next_block

        pop     esi
        pop     edi
        pop     ecx

        add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row
        add     esi, byte 2*SIZEOF_JSAMPROW     ; skip the two consumed input rows
        dec     eax                             ; output rows left
        jg      near .next_row

        emms                            ; empty MMX state

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16