;
; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
;                             int num_rows);
;
; Behavior visible in this file:
;   * Returns immediately when img_width == 0 or num_rows <= 0.
;   * Only output_buf[0] is written, starting at row output_row.
;   * Each pass through .rgb_gray_cnv consumes RGB_PIXELSIZE*SIZEOF_YMMWORD
;     input bytes and stores one full YMMWORD of Y samples.  The final,
;     partial chunk of a row is gathered by the .column_ld* ladder but is
;     still stored as a full YMMWORD.
;     NOTE(review): this presumes output rows are padded to a multiple of
;     SIZEOF_YMMWORD samples -- confirm against the caller.
;   * wk(0)/wk(1) are two YMMWORD scratch slots on the realigned stack.
;   * rbx is saved and restored, although nothing in this file references it.
;
; collect_args / uncollect_args, EXTN, GLOBAL_FUNCTION, the ymmA..ymmH
; register aliases and the PW_*/PD_* constants are defined outside this file.
;

; Register assignment after "collect_args 5":
; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)

EXTN(jsimd_rgb_gray_convert_avx2):
    ; --- Prologue: build a YMMWORD-aligned frame with WK_NUM scratch slots.
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push
                                             ;       (address of the saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1] below rbp
    collect_args 5
    push        rbx

    mov         ecx, r10d                    ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return

    push        rcx                          ; rcx is borrowed for output_row

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]

    pop         rcx                          ; rcx = img_width again

    mov         rsi, r11                     ; rsi = input_buf
    mov         eax, r14d                    ; rax = num_rows (zero-extended)
    ; num_rows is a signed int.  The 32-bit value must be tested: the mov
    ; above clears bits 63:32, so a 64-bit test could never observe a
    ; negative count and "jle" would only ever catch zero.
    test        eax, eax
    jle         near .return
.rowloop:
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr0

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Tail loader (fewer than SIZEOF_YMMWORD pixels left).  rcx becomes the
    ; number of remaining input bytes.  Each stage tests one bit of that
    ; count and, when set, loads that many bytes from the end of the
    ; remaining data and merges the bytes gathered so far above them.
.column_ld1:
    push        rax                     ; rax/rdx are scratch here; rax holds
    push        rdx                     ; the row counter and is restored below
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, BYTE [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, WORD [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    vmovd       xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    vpor        xmmA, xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    sub         rcx, byte SIZEOF_XMMWORD
    vmovdqu     xmmB, XMMWORD [rsi+rcx]  ; 16-byte load (same size tag as the
                                         ; RGB_PIXELSIZE == 4 path uses)
    vperm2i128  ymmA, ymmA, ymmA, 1      ; move gathered bytes to the high lane
    vpor        ymmA, ymmA, ymmB
.column_ld32:
    test        cl, SIZEOF_YMMWORD
    jz          short .column_ld64
    sub         rcx, byte SIZEOF_YMMWORD
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
.column_ld64:
    test        cl, 2*SIZEOF_YMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; mov leaves the flags from "test"
                                        ; intact; rcx is set so that this is
                                        ; the last chunk of the row
    jz          short .rgb_gray_cnv
    vmovdqa     ymmB, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]

.rgb_gray_cnv:
    ; Byte-layout notation: first digit = component index within the pixel,
    ; second character = pixel index (0-9, A-V).
    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    vmovdqu     ymmC, ymmA
    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)

    vmovdqa     ymmG, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)

    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)

    vmovdqa     ymmD, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)

    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)

    vmovdqa     ymmE, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)

    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)

    vpxor       ymmH, ymmH, ymmH  ; zero, used to widen bytes to words

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmB, ymmE
    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)

    vmovdqa     ymmF, ymmD
    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Tail loader (fewer than SIZEOF_YMMWORD pixels left).  rcx stays a pixel
    ; count here; addresses are scaled by RGB_PIXELSIZE.  Each stage tests
    ; one bit of the count and, when set, loads that many pixels from the end
    ; of the remaining data.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmF
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    vmovdqa     xmmF, xmmA
    vperm2i128  ymmF, ymmF, ymmF, 1     ; move gathered pixels to the high lane
    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpor        ymmA, ymmA, ymmF
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_XMMWORD/2
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; mov leaves the flags from "test"
                                        ; intact; rcx is set so that this is
                                        ; the last chunk of the row
    jz          short .rgb_gray_cnv
    vmovdqa     ymmE, ymmA
    vmovdqa     ymmH, ymmF
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]

.rgb_gray_cnv:
    ; Byte-layout notation: first digit = component index within the pixel,
    ; second character = pixel index (0-9, A-V).
    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmB, ymmA
    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)

    vmovdqa     ymmB, ymmF
    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmD, ymmA
    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)

    vmovdqa     ymmC, ymmF
    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)

    vmovdqa     ymmB, ymmA
    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)

    vmovdqa     ymmG, ymmD
    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)

    vmovdqa     ymmE, ymmA
    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)

    vmovdqa     ymmH, ymmB
    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)

    vpxor       ymmF, ymmF, ymmF      ; zero, used to widen bytes to words

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmD, ymmB
    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)

    vmovdqa     ymmG, ymmE
    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)

    ; ymmF (the zero register) is consumed here: the data bytes land in the
    ; high byte of each word and are then shifted down by BYTE_BIT.
    vpunpcklbw  ymmF, ymmF, ymmH
    vpunpckhbw  ymmH, ymmH, ymmH
    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G

    ; --- Odd pixels: R*0.299 + G*0.337 partial sums.
    vmovdqa     ymm6, ymm1
    vpunpcklwd  ymm1, ymm1, ymm3
    vpunpckhwd  ymm6, ymm6, ymm3
    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; --- Even pixels: R*0.299 + G*0.337 partial sums, spilled to wk[].
    vmovdqa     ymm6, ymm0
    vpunpcklwd  ymm0, ymm0, ymm2
    vpunpckhwd  ymm6, ymm6, ymm2
    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)

    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    vmovdqa     ymm0, ymm5              ; ymm0=BO
    vmovdqa     ymm6, ymm4              ; ymm6=BE

    ; --- Odd pixels: add B*0.114 + G*0.250, round, scale, pack to words.
    vmovdqa     ymm4, ymm0
    vpunpcklwd  ymm0, ymm0, ymm3
    vpunpckhwd  ymm4, ymm4, ymm3
    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    vmovdqa     ymm3, [rel PD_ONEHALF]  ; ymm3=[PD_ONEHALF]

    vpaddd      ymm0, ymm0, ymm1
    vpaddd      ymm4, ymm4, ymm7
    vpaddd      ymm0, ymm0, ymm3
    vpaddd      ymm4, ymm4, ymm3
    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO

    ; --- Even pixels: same, using the partial sums saved in wk[].
    vmovdqa     ymm4, ymm6
    vpunpcklwd  ymm6, ymm6, ymm2
    vpunpckhwd  ymm4, ymm4, ymm2
    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    vmovdqa     ymm2, [rel PD_ONEHALF]  ; ymm2=[PD_ONEHALF]

    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
    vpaddd      ymm6, ymm6, ymm2
    vpaddd      ymm4, ymm4, ymm2
    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE

    ; --- Interleave: odd results go to the high byte of each word, even
    ;     results stay in the low byte.
    vpsllw      ymm0, ymm0, BYTE_BIT
    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y

    sub         rcx, byte SIZEOF_YMMWORD
    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop        ; at least one full chunk remains
    test        rcx, rcx
    jnz         near .column_ld1        ; partial chunk remains

    pop         rcx                     ; col
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- value saved in the prologue
                                        ;        (address of the saved rbp)
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32