;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; void vp9_post_proc_down_and_across_xmm
; (
;     unsigned char *src_ptr,
;     unsigned char *dst_ptr,
;     int src_pixels_per_line,
;     int dst_pixels_per_line,
;     int rows,
;     int cols,
;     int flimit
; )
;
; Processes one row at a time, 8 pixels per step, in two passes:
;   1. down pass:   combines five vertically adjacent source pixels
;                   (rows -2..+2) and writes the result to dst.
;   2. across pass: filters the dst row just written, in place, using the
;                   five horizontally adjacent pixels (columns -2..+2).
; In both passes the blurred value is
;       (4*centre + four neighbours + 4) >> 3
; and it replaces the centre only when no |centre - neighbour| exceeds
; flimit; otherwise the centre pixel is kept.
;
; Register roles inside the row loop:
;   rsi = src row, rdi = dst row, rax = src pitch, rcx = rows remaining,
;   rdx = column offset, xmm0 = zero, xmm2 = flimit in all 8 words.
;
; Besides columns [0, cols) of the current row the code reads the source
; rows at -2..+2 pitches, dst bytes -8..-1 (reloaded and stored back), and up
; to 10 bytes past the current 8-pixel group of dst, so the caller must make
; that memory accessible. Both column loops test after the body, so at least
; one 8-pixel group is processed per row.
;
; mm0 is used to delay stores in the across pass and no emms is executed
; here. NOTE(review): callers presumably reset the MMX/x87 state - confirm.
;-----------------------------------------------------------------------------
global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
sym(vp9_post_proc_down_and_across_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ALIGN_STACK 16, rax
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movdqa      xmm0, [GLOBAL(rd42)]
    sub         rsp, 16
    movdqa      [rsp], xmm0
%define RD42 [rsp]
%else
%define RD42 [GLOBAL(rd42)]
%endif


        movd        xmm2,       dword ptr arg(6) ;flimit
        punpcklwd   xmm2,       xmm2
        punpckldq   xmm2,       xmm2
        punpcklqdq  xmm2,       xmm2              ; xmm2 = flimit in all 8 words

        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(1) ;dst_ptr

        movsxd      rcx,        DWORD PTR arg(4) ;rows
        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
        pxor        xmm0,       xmm0              ; xmm0 = 0, used to widen bytes to words

.nextrow:

        xor         rdx,        rdx       ; clear out rdx for use as loop counter
.nextcol:
        ; ---- down pass: rows r-2 .. r+2 of src, 8 pixels at a time ----
        movq        xmm3,       QWORD PTR [rsi]         ; r0 p0..p7 (bytes)
        punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 (words)
        movdqa      xmm1,       xmm3                    ; xmm1 = r0, kept as the centre value
        psllw       xmm3,       2                       ; xmm3 = 4 * r0

        movq        xmm5,       QWORD PTR [rsi + rax]   ; r1 p0..p7 (bytes)
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 (words)
        paddusw     xmm3,       xmm5                    ; xmm3 += r1

        ; thresholding
        movdqa      xmm7,       xmm1                    ; xmm7 = r0
        psubusw     xmm7,       xmm5                    ; xmm7 = sat(r0 - r1)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r1 - r0)
        paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
        pcmpgtw     xmm7,       xmm2                    ; mask: abs(r0 - r1) > flimit

        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; r2 p0..p7 (bytes)
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 (words)
        paddusw     xmm3,       xmm5                    ; xmm3 += r2

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r2)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r2 - r0)
        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds


        neg         rax                                 ; rax = -pitch for the rows above
        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; r-2 p0..p7 (bytes)
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 (words)
        paddusw     xmm3,       xmm5                    ; xmm3 += r-2

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r-2)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r-2 - r0)
        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds

        movq        xmm4,       QWORD PTR [rsi+rax]     ; r-1 p0..p7 (bytes)
        punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 (words)
        paddusw     xmm3,       xmm4                    ; xmm3 += r-1

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm4                    ; xmm6 = sat(r0 - r-1)
        psubusw     xmm4,       xmm1                    ; xmm4 = sat(r-1 - r0)
        paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds


        paddusw     xmm3,       RD42                    ; xmm3 += rounding value (4)
        psraw       xmm3,       3                       ; xmm3 /= 8

        pand        xmm1,       xmm7                    ; keep source where a threshold was exceeded
        pandn       xmm7,       xmm3                    ; keep blurred value everywhere else
        paddusw     xmm1,       xmm7                    ; combination

        packuswb    xmm1,       xmm0                    ; pack to bytes
        movq        QWORD PTR [rdi], xmm1             ;

        neg         rax                   ; pitch is positive again
        add         rsi,        8
        add         rdi,        8

        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols

        jl          .nextcol

        ; done with the all cols, start the across filtering in place
        sub         rsi,        rdx       ; rewind both pointers to the row start
        sub         rdi,        rdx

        xor         rdx,        rdx
        ; mm0 holds the 8 bytes to store one step later; seed it with the
        ; bytes already at dst[-8..-1] so the first delayed store rewrites them.
        movq        mm0,        QWORD PTR [rdi-8];

.acrossnextcol:
        movq        xmm7,       QWORD PTR [rdi +rdx -2]     ; p-2..p5
        movd        xmm4,       DWORD PTR [rdi +rdx +6]     ; p6..p9

        pslldq      xmm4,       8
        por         xmm4,       xmm7              ; xmm4 = p-2..p9 (12 bytes)

        movdqa      xmm3,       xmm4
        psrldq      xmm3,       2
        punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 (words)
        movdqa      xmm1,       xmm3              ; xmm1 = p0..p7, kept as the centre value
        psllw       xmm3,       2                 ; xmm3 = 4 * centre


        movdqa      xmm5,       xmm4
        psrldq      xmm5,       3
        punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8
        paddusw     xmm3,       xmm5              ; xmm3 += right neighbour

        ; thresholding
        movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
        psubusw     xmm7,       xmm5              ; xmm7 = sat(p0..p7 - p1..p8)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p1..p8 - p0..p7)
        paddusw     xmm7,       xmm5              ; xmm7 = abs(p0..p7 - p1..p8)
        pcmpgtw     xmm7,       xmm2

        movdqa      xmm5,       xmm4
        psrldq      xmm5,       4
        punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9
        paddusw     xmm3,       xmm5              ; xmm3 += second right neighbour

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p2..p9)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p2..p9 - p0..p7)
        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p2..p9)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds


        movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9 (bytes)
        punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5
        paddusw     xmm3,       xmm5              ; xmm3 += second left neighbour

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p-2..p5)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p-2..p5 - p0..p7)
        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p-2..p5)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds

        psrldq      xmm4,       1                   ; xmm4 = p-1..p9 (bytes)
        punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6
        paddusw     xmm3,       xmm4              ; xmm3 += left neighbour

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm4              ; xmm6 = sat(p0..p7 - p-1..p6)
        psubusw     xmm4,       xmm1              ; xmm4 = sat(p-1..p6 - p0..p7)
        paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p7 - p-1..p6)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds

        paddusw     xmm3,       RD42              ; xmm3 += rounding value (4)
        psraw       xmm3,       3                 ; xmm3 /= 8

        pand        xmm1,       xmm7              ; keep source where a threshold was exceeded
        pandn       xmm7,       xmm3              ; keep blurred value everywhere else
        paddusw     xmm1,       xmm7              ; combination

        packuswb    xmm1,       xmm0              ; pack to bytes
        ; Store the previous group's result only now: the loads above reach
        ; two bytes back, and they must still see unfiltered-across data.
        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous eight bytes
        movdq2q     mm0,        xmm1

        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols
        jl          .acrossnextcol;

        ; last 8 pixels
        movq        QWORD PTR [rdi+rdx-8],  mm0

        ; done with this row
        add         rsi,rax               ; next source line (rax = src pitch)
        ; Reload the pitches sign-extended, exactly as before the first row,
        ; so that negative strides keep working on 64-bit targets.
        movsxd      rax, dword arg(3) ;dst_pixels_per_line
        add         rdi,rax               ; next destination line
        movsxd      rax, dword arg(2) ;src_pixels_per_line, for the next row

        dec         rcx                   ; decrement count
        jnz         .nextrow              ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add rsp,16                            ; drop the stack copy of rd42
    pop rsp                               ; undo ALIGN_STACK
%endif
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD42


;-----------------------------------------------------------------------------
; void vp9_mbpost_proc_down_xmm(unsigned char *dst,
;                               int pitch, int rows, int cols, int flimit)
;
; Vertical filter working on strips of 8 columns. For every column a running
; sum and sum of squares over a 15-row window (rows r-7 .. r+7) is kept:
;   xmm5        = 8 word sums
;   xmm6, xmm7  = dword sums of squares for columns 0-3 and 4-7
; A pixel is replaced by (pixel + sum + rv) >> 4, where rv is taken from the
; external table vp9_rv at index (row & 127), when
;       15 * sumsq - sum * sum < flimit
; and is left unchanged otherwise.
;
; Results are parked in a 16-entry ring of 8-byte slots on the stack
; (d[16][8] at [rsp]) and written back 8 rows late, so the window always reads
; unfiltered pixels. That is why rows is bumped by 8, and why every strip also
; reads and writes the 8 rows above dst. The bytes written to those 8 rows
; come from ring slots that the current strip has not filled yet.
;
; arg(0), arg(2) and arg(3) (the shadowed copies of dst, rows, cols) are
; modified in place. mm0 is used and no emms is executed here.
; NOTE(review): callers presumably reset the MMX/x87 state - confirm.
;-----------------------------------------------------------------------------
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_xmm) PRIVATE
sym(vp9_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 (flimit in four dwords) at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp9_rv))]   ; r8 = &vp9_rv[0], kept for the whole call
%endif

    ;rows +=8;  (extra iterations flush the delayed stores)
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
            mov         rsi,        arg(0) ; s
            pxor        xmm0,       xmm0        ; xmm0 = 0

            movsxd      rax,        dword ptr arg(1) ;pitch       ;
            neg         rax                                     ; rax = -pitch

            lea         rsi,        [rsi + rax*8];              ; rsi = &s[-pitch*8]
            neg         rax                                     ; rax = pitch


            pxor        xmm5,       xmm5        ; sum
            pxor        xmm6,       xmm6        ; sumsq, columns 0-3

            pxor        xmm7,       xmm7        ; sumsq, columns 4-7
            mov         rdi,        rsi

            mov         rcx,        15          ; rows s[-8*pitch] .. s[6*pitch]

.loop_initvar:
            movq        xmm1,       QWORD PTR [rdi];
            punpcklbw   xmm1,       xmm0        ; 8 pixels as words

            paddw       xmm5,       xmm1        ; sum += x
            pmullw      xmm1,       xmm1        ; x * x (fits in 16 bits)

            movdqa      xmm2,       xmm1        ;
            punpcklwd   xmm1,       xmm0        ; squares 0-3 as dwords

            punpckhwd   xmm2,       xmm0        ; squares 4-7 as dwords
            paddd       xmm6,       xmm1        ;

            paddd       xmm7,       xmm2        ;
            lea         rdi,        [rdi+rax]   ; next row

            dec         rcx
            jne         .loop_initvar
            ; here rdi = &s[7*pitch]; rdx counts rows
            xor         rdx,        rdx
.loop_row:
            movq        xmm1,       QWORD PTR [rsi]     ; row r-8, leaves the window
            movq        xmm2,       QWORD PTR [rdi]     ; row r+7, enters the window

            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0

            paddw       xmm5,       xmm2        ; sum += entering
            psubw       xmm5,       xmm1        ; sum -= leaving

            pmullw      xmm2,       xmm2
            movdqa      xmm4,       xmm2

            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0

            paddd       xmm6,       xmm2        ; sumsq += entering^2
            paddd       xmm7,       xmm4

            pmullw      xmm1,       xmm1
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1        ; sumsq -= leaving^2

            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2


            movdqa      xmm3,       xmm6
            pslld       xmm3,       4

            psubd       xmm3,       xmm6        ; xmm3 = 15 * sumsq (columns 0-3)
            movdqa      xmm1,       xmm5

            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1        ; low  16 bits of sum*sum

            pmulhw      xmm4,       xmm4        ; high 16 bits of sum*sum
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm4        ; sum*sum as dwords, columns 0-3
            punpckhwd   xmm2,       xmm4        ; sum*sum as dwords, columns 4-7

            movdqa      xmm4,       xmm7
            pslld       xmm4,       4

            psubd       xmm4,       xmm7        ; xmm4 = 15 * sumsq (columns 4-7)

            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2

            psubd       xmm3,       flimit4
            psubd       xmm4,       flimit4

            psrad       xmm3,       31          ; all ones where 15*sumsq - sum^2 < flimit
            psrad       xmm4,       31

            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0        ; xmm3 = 8 byte masks

            movq        xmm1,       QWORD PTR [rsi+rax*8]   ; row r (rsi points at row r-8)

            movq        xmm2,       xmm1        ; xmm2 = unfiltered copy
            punpcklbw   xmm1,       xmm0

            paddw       xmm1,       xmm5        ; pixel + sum
            mov         rcx,        rdx

            and         rcx,        127         ; table index = row & 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp9_rv))]
            movdqu      xmm4,       [rax + rcx*2] ; 8 words starting at vp9_rv[rcx]
            pop         rax
%elif ABI_IS_32BIT=0
            movdqu      xmm4,       [r8 + rcx*2] ; 8 words starting at vp9_rv[rcx]
%else
            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
%endif

            paddw       xmm1,       xmm4
            ;paddw     xmm1,       eight8s
            psraw       xmm1,       4

            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm3        ; filtered value where the mask is set

            pandn       xmm3,       xmm2        ; original value elsewhere
            por         xmm1,       xmm3

            and         rcx,        15
            movq        QWORD PTR   [rsp + rcx*8], xmm1 ; d[row & 15] = result for row r

            mov         rcx,        rdx
            sub         rcx,        8

            and         rcx,        15
            movq        mm0,        [rsp + rcx*8] ; d[(row - 8) & 15]

            movq        [rsi],      mm0         ; delayed write-back to row r-8
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row

        ; s += 8, done in a pointer-sized register so that a carry out of the
        ; low 32 bits is not lost on 64-bit targets (rsi is reloaded from
        ; arg(0) at .loop_col and restored in the epilog).
        mov         rsi,        arg(0)
        add         rsi,        8
        mov         arg(0),     rsi
        sub         dword arg(3), 8 ; cols -= 8
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 128+16
    pop         rsp                 ; undo ALIGN_STACK

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------------
; void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                    int pitch, int rows, int cols, int flimit)
;
; Horizontal in-place filter. For each row a running sum and sum of squares
; over a 15-pixel window (s[c-7] .. s[c+7]) is maintained, four pixels per
; step. A pixel is replaced by (pixel + sum + 8) >> 4 when
;       15 * sumsq - sum * sum < flimit
; and is left unchanged otherwise.
;
; Results are written back 8 pixels late through mm0/mm1 so the window keeps
; reading unfiltered pixels. Consequences visible in the code below:
;   * the column loop runs to cols + 8 to flush the last results;
;   * the first two stores write the zero-initialised mm0/mm1 to s[-8..-1];
;   * pixels from s[-8] up to 10 bytes past the current column are read.
;
; arg(0) and arg(2) (the shadowed copies of src and rows) are modified in
; place. The row loop tests at the bottom, so one row is always processed.
; No emms is executed here.
; NOTE(review): callers presumably reset the MMX/x87 state - confirm.
;-----------------------------------------------------------------------------
global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp9_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 (flimit in four dwords) at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

        xor         rdx,    rdx ;sumsq=0;
        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s
        mov         rdi,    -8
.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;    sumsq += s[i]*s[i];
        ;    sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al                  ; ax = al * al; upper half of eax is already 0
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         .ip_var_loop


            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,   edx     ; xmm7 lane 0 = sumsq, other lanes 0

            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,   ecx     ; xmm6 lane 0 = sum, other lanes 0

            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx     ; rcx = column c

            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8       ; loop bound = cols + 8 (flushes delayed stores)
            pxor        mm0,    mm0     ; delayed-store pipeline starts out as zero
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0
.nextcol4:

            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; y = s[c-8 .. c-5], leaving the windows
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; x = s[c+7 .. c+10], entering the windows

            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; xmm2 = d = x - y
            paddd       xmm1,   xmm1                    ; xmm1 = 2y

            paddd       xmm1,   xmm2                    ; xmm1 = x + y
            pmaddwd     xmm1,   xmm2                    ; xmm1 = q = (x+y)*(x-y) = x^2 - y^2

            paddd       xmm6,   xmm2                    ; lane 0: sum   for pixel c
            paddd       xmm7,   xmm1                    ; lane 0: sumsq for pixel c

            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 to all lanes
            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 to all lanes

            psrldq      xmm1,       4                   ; xmm1 = q1 q2 q3 0
            psrldq      xmm2,       4                   ; xmm2 = d1 d2 d3 0

            pshufd      xmm3,   xmm1,   3               ; xmm3 = 0  q1 q1 q1
            pshufd      xmm4,   xmm2,   3               ; xmm4 = 0  d1 d1 d1

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            pshufd      xmm3,   xmm1,   01011111b       ; xmm3 = 0  0  q2 q2
            pshufd      xmm4,   xmm2,   01011111b       ; xmm4 = 0  0  d2 d2

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            pshufd      xmm3,   xmm1,   10111111b       ; xmm3 = 0  0  0  q3
            pshufd      xmm4,   xmm2,   10111111b       ; xmm4 = 0  0  0  d3

            paddd       xmm7,   xmm3                    ; lane i: sumsq for pixel c+i
            paddd       xmm6,   xmm4                    ; lane i: sum   for pixel c+i

            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3                    ; sum * sum

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4

            psubd       xmm5,   xmm7                    ; 15 * sumsq
            psubd       xmm5,   xmm3                    ; - sum * sum

            psubd       xmm5,   flimit4
            psrad       xmm5,   31                      ; all ones where below flimit

            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0                    ; four byte masks

            movd        xmm1,   DWORD PTR [rsi+rcx]     ; s[c .. c+3]
            movq        xmm2,   xmm1                    ; unfiltered copy

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6                    ; pixel + sum
            paddd       xmm1,   [GLOBAL(four8s)]        ; + 8 (rounding)

            psrad       xmm1,   4
            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5                    ; filtered value where the mask is set

            pandn       xmm5,   xmm2                    ; original value elsewhere
            por         xmm5,   xmm1                    ; xmm5 = result for s[c .. c+3]

            movd        [rsi+rcx-8],  mm0               ; store the result from two steps ago
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5
            psrldq      xmm7,   12                      ; lane 0 = sumsq of pixel c+3, rest 0

            psrldq      xmm6,   12                      ; lane 0 = sum   of pixel c+3, rest 0
            add         rcx,    4

            cmp         rcx,    rdx
            jl          .nextcol4

        ;s+=pitch;
        movsxd rax, dword arg(1)
        add    arg(0), rax

        sub dword arg(2), 1 ;rows-=1 (sets the flags tested by jg)
        jg .ip_row_loop

    add         rsp, 16
    pop         rsp                 ; undo ALIGN_STACK

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------------
; void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
;                               unsigned char blackclamp[16],
;                               unsigned char whiteclamp[16],
;                               unsigned char bothclamp[16],
;                               unsigned int width, unsigned int height, int pitch)
;
; For every row: rand() is called and its low 8 bits are used as an offset
; into the noise buffer. The row is then processed 16 bytes at a time: each
; pixel is clamped with the black/both/white vectors and the noise bytes are
; added with wrap-around (paddb).
;
; Only blackclamp (arg 2) is loaded; the other two vectors are addressed as
; [blackclamp+16] (white) and [blackclamp+32] (both), so the three arrays
; must be contiguous in that order. They are used as direct SSE memory
; operands, which requires 16-byte alignment.
;
; The inner loop always runs at least once and advances in 16-byte steps, so
; up to 15 bytes past width may be read and written in each row, and up to
; 255 + that many bytes of the noise buffer are read.
;
; arg(0) and arg(6) (the shadowed copies of start and height) are modified in
; place.
;-----------------------------------------------------------------------------
extern sym(rand)
global sym(vp9_plane_add_noise_wmt) PRIVATE
sym(vp9_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Keep 32 bytes of scratch space below the saved registers while rand()
    ; is called. The Microsoft x64 convention lets a callee overwrite the 32
    ; bytes above its return address; without this gap that would be the
    ; saved rdi/rsi/rbp and our own return address. On the other ABIs the
    ; gap is unused, and being a multiple of 16 it leaves the stack
    ; alignment at the call unchanged.
    sub         rsp, 32

.addnoise_loop:
    call sym(rand) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff           ; offset into the noise buffer: 0..255
    add     rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp


            mov     rdi, rcx                    ; rdi = noise + offset
            movsxd  rcx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax                 ; rax = byte offset within the row

.addnoise_nextset:
            movdqu      xmm1,[rsi+rax]         ; get the source

            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
            paddusb     xmm1, [rdx+32] ;bothclamp
            psubusb     xmm1, [rdx+16] ;whiteclamp

            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
            paddb       xmm1,xmm2              ; add it in
            movdqu      [rsi+rax],xmm1         ; store the result

            add         rax,16                 ; move to the next 16 pixels

            cmp         rax, rcx
            jl          .addnoise_nextset

    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      .addnoise_loop

    add         rsp, 32         ; release the scratch space reserved in the prolog

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


; Read-only constants, 16-byte aligned so they can be used directly as SSE
; memory operands (rd42 is 16 bytes long, so four8s is aligned as well).
SECTION_RODATA
align 16
; Rounding term added before the ">> 3" in vp9_post_proc_down_and_across_xmm:
; the value 4 in each of the 8 words.
rd42:
    times 8 dw 0x04
; Rounding term added before the ">> 4" in vp9_mbpost_proc_across_ip_xmm:
; the value 8 in each of the 4 dwords.
four8s:
    times 4 dd 8
