1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20@**
21
22@**
23@******************************************************************************
24@*
25@*
26@* @brief
27@*  This file contains definitions of routines that compute distortion
28@*  between two macro/sub blocks of identical dimensions
29@*
30@* @author
31@*  Ittiam
32@*
33@* @par List of Functions:
34@*  - ime_compute_sad_16x16_a9q()
35@*  - ime_compute_sad_16x16_fast_a9q()
36@*  - ime_compute_sad_16x8_a9q()
37@*  - ime_compute_sad_16x16_ea8_a9q()
38@*  - ime_calculate_sad2_prog_a9q()
39@*  - ime_calculate_sad3_prog_a9q()
40@*  - ime_calculate_sad4_prog_a9q()
41@*  - ime_sub_pel_compute_sad_16x16_a9q()
42@*  - ime_compute_satqd_16x16_lumainter_a9q()
43@*  -
44@* @remarks
45@*  None
46@*
47@*******************************************************************************
48@
49
50
51@**
52@******************************************************************************
53@*
54@* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
55@*
56@* @par   Description
57@*   This functions computes SAD between 2 16x16 blocks. There is a provision
58@*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
59@*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
60@*
61@* @param[in] pu1_src
62@*  UWORD8 pointer to the source
63@*
64@* @param[out] pu1_dst
65@*  UWORD8 pointer to the destination
66@*
67@* @param[in] src_strd
68@*  integer source stride
69@*
70@* @param[in] dst_strd
71@*  integer destination stride
72@*
73@* @param[in] i4_max_sad
74@*  integer maximum allowed distortion
75@*
76@* @param[in] pi4_mb_distortion
77@*  integer evaluated sad
78@*
79@* @remarks
80@*
81@******************************************************************************
82@*
83.text
84.p2align 2
85
    .global ime_compute_sad_16x16_fast_a9q

@ void ime_compute_sad_16x16_fast_a9q(UWORD8 *pu1_src, UWORD8 *pu1_est,
@                                     WORD32 src_strd, WORD32 est_strd,
@                                     WORD32 i4_max_sad, WORD32 *pi4_mb_distortion)
@
@ Fast approximate 16x16 SAD: both strides are doubled so only the eight
@ even rows are visited, and the accumulated SAD is doubled before the
@ final store to approximate the full-block value.  i4_max_sad (entry
@ [sp,#0]) is not read by this variant; the result is written to
@ *pi4_mb_distortion.
ime_compute_sad_16x16_fast_a9q:

    stmfd         sp!, {r12, lr}        @ 8 bytes pushed: stack args now at [sp,#8]
    vpush         {d8-d15}              @ d8-d15 are callee-saved per AAPCS
    lsl           r2, r2, #1            @ src stride *= 2 -> skip odd rows
    lsl           r3, r3, #1            @ est stride *= 2

    @for bringing buffer2 into cache..., dummy load instructions
    @LDR         r12,[r1]

    vld1.8        {d4, d5}, [r0], r2    @ src row 0 (16 pixels)
    vld1.8        {d6, d7}, [r1], r3    @ est row 0
    mov           r12, #6               @ 6 even rows left for the loop (2 per pass)
    vld1.8        {d8, d9}, [r0], r2    @ src row 2
    vabdl.u8      q0, d6, d4            @ q0 = |est - src|, pixels 0..7
    vabdl.u8      q1, d7, d5            @ q1 = |est - src|, pixels 8..15
    vld1.8        {d10, d11}, [r1], r3  @ est row 2

loop_sad_16x16_fast:

    vld1.8        {d4, d5}, [r0], r2    @ next even src row
    vabal.u8      q0, d10, d8           @ accumulate abs diff of the pipelined row
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r12, #2               @ two rows consumed per iteration
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x16_fast

    vabal.u8      q0, d10, d8           @ fold in the final pipelined row
    vabal.u8      q1, d11, d9

    vadd.i16      q0, q0, q1            @ horizontal reduction of 16-bit partial sums
    vadd.i16      d0, d1, d0
    vpop          {d8-d15}              @ restore callee-saved NEON registers
    ldr           r12, [sp, #12]        @ pi4_mb_distortion: entry [sp,#4] + 8-byte stmfd
    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0                @ d0[0] = SAD over the eight even rows
    vshl.u32      d0, d0, #1            @ double -> approximate 16-row SAD
    vst1.32       {d0[0]}, [r12]        @ store the result

    ldmfd         sp!, {r12, pc}        @ restore and return
133
134
135
136
137@**
138@******************************************************************************
139@*
140@*  @brief computes distortion (SAD) between 2 16x8  blocks
141@*
142@*
143@*  @par   Description
144@*   This functions computes SAD between 2 16x8 blocks. There is a provision
145@*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
146@*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
147@*
148@* @param[in] pu1_src
149@*  UWORD8 pointer to the source
150@*
151@* @param[out] pu1_dst
152@*  UWORD8 pointer to the destination
153@*
154@* @param[in] src_strd
155@*  integer source stride
156@*
157@* @param[in] dst_strd
158@*  integer destination stride
159@*
160@* @param[in] u4_max_sad
161@*  integer maximum allowed distortion
162@*
163@* @param[in] pi4_mb_distortion
164@*  integer evaluated sad
165@*
166@* @remarks
167@*
168@******************************************************************************
169@*
170@
    .global ime_compute_sad_16x8_a9q

@ void ime_compute_sad_16x8_a9q(UWORD8 *pu1_src, UWORD8 *pu1_est,
@                               WORD32 src_strd, WORD32 est_strd,
@                               UWORD32 u4_max_sad, WORD32 *pi4_mb_distortion)
@
@ Exact SAD between two 16x8 blocks.  u4_max_sad (entry [sp,#0]) is not
@ read by this implementation (no early exit); the SAD is written to
@ *pi4_mb_distortion.
ime_compute_sad_16x8_a9q:

    stmfd         sp!, {r12, lr}        @ 8 bytes pushed: stack args now at [sp,#8]

    @for bringing buffer2 into cache..., dummy load instructions
    @LDR      r12,[r1]

    vld1.8        {d4, d5}, [r0], r2    @ src row 0
    vld1.8        {d6, d7}, [r1], r3    @ est row 0
    mov           r12, #6               @ 6 rows left for the loop (2 per pass)
    vpush         {d8-d15}              @ d8-d15 are callee-saved per AAPCS
    vld1.8        {d8, d9}, [r0], r2    @ src row 1
    vabdl.u8      q0, d6, d4            @ q0 = |est - src|, pixels 0..7
    vabdl.u8      q1, d7, d5            @ q1 = |est - src|, pixels 8..15
    vld1.8        {d10, d11}, [r1], r3  @ est row 1

loop_sad_16x8:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8           @ accumulate abs diff of the pipelined row
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r12, #2               @ two rows consumed per iteration
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x8

    vabal.u8      q0, d10, d8           @ fold in the final pipelined row
    vabal.u8      q1, d11, d9

    vadd.i16      q0, q0, q1            @ horizontal reduction of 16-bit partial sums
    vadd.i16      d0, d1, d0
    vpop          {d8-d15}              @ restore callee-saved NEON registers
    ldr           r12, [sp, #12]        @ pi4_mb_distortion: entry [sp,#4] + 8-byte stmfd
    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0                @ d0[0] = total 16x8 SAD

    vst1.32       {d0[0]}, [r12]        @ store the result

    ldmfd         sp!, {r12, pc}
216
217
218
219@**
220@******************************************************************************
221@*
222@* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
223@*
224@* @par   Description
225@*   This functions computes SAD between 2 16x16 blocks. There is a provision
226@*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
227@*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
228@*
229@* @param[in] pu1_src
230@*  UWORD8 pointer to the source
231@*
232@* @param[out] pu1_dst
233@*  UWORD8 pointer to the destination
234@*
235@* @param[in] src_strd
236@*  integer source stride
237@*
238@* @param[in] dst_strd
239@*  integer destination stride
240@*
241@* @param[in] i4_max_sad
242@*  integer maximum allowed distortion
243@*
244@* @param[in] pi4_mb_distortion
245@*  integer evaluated sad
246@*
247@* @remarks
248@*
249@******************************************************************************
250@*
251
    .global ime_compute_sad_16x16_ea8_a9q

@ void ime_compute_sad_16x16_ea8_a9q(UWORD8 *pu1_src, UWORD8 *pu1_est,
@                                    WORD32 src_strd, WORD32 est_strd,
@                                    WORD32 i4_max_sad, WORD32 *pi4_mb_distortion)
@
@ 16x16 SAD with an early exit after eight alternate rows: strides are
@ doubled so the even rows are accumulated first; if that partial SAD
@ already exceeds i4_max_sad the function returns immediately, leaving
@ the partial SAD in *pi4_mb_distortion.  Otherwise the odd rows are
@ accumulated and the full SAD is stored.
ime_compute_sad_16x16_ea8_a9q:

    stmfd         sp!, {r5-r7, lr}      @ 16 bytes pushed: stack args now at [sp,#16]
    lsl           r2, r2, #1            @ src stride *= 2 -> even rows first
    lsl           r3, r3, #1            @ est stride *= 2

    @for bringing buffer2 into cache..., dummy load instructions
    @LDR         r12,[r1]

    vld1.8        {d4, d5}, [r0], r2    @ src row 0
    vld1.8        {d6, d7}, [r1], r3    @ est row 0
    mov           r5, #6                @ 6 even rows left for pass 1 (2 per iteration)
    ldrd          r6, r7, [sp, #16]     @ r6 = i4_max_sad, r7 = pi4_mb_distortion (before vpush)
    vpush         {d8-d15}              @ d8-d15 are callee-saved per AAPCS
    vld1.8        {d8, d9}, [r0], r2    @ src row 2
    vabdl.u8      q0, d6, d4            @ q0/q1 = running |est - src| sums
    vabdl.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3  @ est row 2

    @r6 = i4_max_sad, r7 = pi4_mb_distortion

loop_sad_16x16_ea8_1:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8           @ accumulate abs diff of the pipelined row
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r5, #2                @ two even rows per iteration
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x16_ea8_1

    vabal.u8      q0, d10, d8           @ fold in the last even row
    sub           r0, r0, r2, lsl #3    @ rewind src by 8 doubled rows (16 original rows)
    vabal.u8      q1, d11, d9
    sub           r1, r1, r3, lsl #3    @ rewind est likewise

    vadd.i16      q6, q0, q1            @ reduce the even-row SAD into d12 (q0/q1 keep accumulating)
    add           r0, r0, r2, asr #1    @ advance one original row -> odd rows
    vadd.i16      d12, d12, d13
    add           r1, r1, r3, asr #1

    vpaddl.u16    d12, d12
    vld1.8        {d4, d5}, [r0], r2    @ start loading odd rows for pass 2
    vld1.8        {d6, d7}, [r1], r3
    vpaddl.u32    d12, d12              @ d12[0] = partial SAD (even rows)
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4            @ continue accumulating into the same sums
    vabal.u8      q1, d7, d5

    vst1.32       {d12[0]}, [r7]        @ publish the partial SAD
    ldr           r5, [r7]
    cmp           r5, r6                @ early exit if partial SAD > i4_max_sad
    bgt           end_func_16x16_ea8

    vld1.8        {d10, d11}, [r1], r3
    mov           r5, #6                @ 6 odd rows left for pass 2

loop_sad_16x16_ea8_2:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r5, #2                @ two odd rows per iteration
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x16_ea8_2

    vabal.u8      q0, d10, d8           @ fold in the final pipelined row
    vabal.u8      q1, d11, d9

    vadd.i16      q0, q0, q1            @ final horizontal reduction
    vadd.i16      d0, d1, d0

    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0                @ d0[0] = full 16x16 SAD

    vst1.32       {d0[0]}, [r7]         @ store the final SAD

end_func_16x16_ea8:
    vpop          {d8-d15}              @ restore callee-saved NEON registers
    ldmfd         sp!, {r5-r7, pc}
343
344
345
346@*
347@//---------------------------------------------------------------------------
348@// Function Name      : Calculate_Mad2_prog()
349@//
@// Detail Description : This function finds the SAD values of 2 reference
@//                        MBs against the source MB in one shot
352@//
353@// Platform           : CortexA8/NEON            .
354@//
355@//-----------------------------------------------------------------------------
356@*
357
358    .global ime_calculate_sad2_prog_a9q
359
360ime_calculate_sad2_prog_a9q:
361
362    @ r0    = ref1     <UWORD8 *>
363    @ r1    = ref2     <UWORD8 *>
364    @ r2    = src     <UWORD8 *>
365    @ r3    = RefBufferWidth <UWORD32>
366    @ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>
367
368    stmfd         sp!, {r4-r5, lr}
369
370    ldr           r4, [sp, #8]          @ load src stride to r4
371    mov           r5, #14
372    vpush         {d8-d15}
373    @Row 1
374    vld1.8        {d0, d1}, [r2], r4    @ load src Row 1
375    vld1.8        {d2, d3}, [r0], r3    @ load ref1 Row 1
376    vld1.8        {d4, d5}, [r1], r3    @ load ref2 Row 1
377
378    @Row 2
379    vld1.8        {d6, d7}, [r2], r4    @ load src Row 2
380    vabdl.u8      q6, d2, d0
381    vabdl.u8      q7, d3, d1
382    vld1.8        {d8, d9}, [r0], r3    @ load ref1 Row 2
383    vabdl.u8      q8, d4, d0
384    vabdl.u8      q9, d5, d1
385    vld1.8        {d10, d11}, [r1], r3  @ load ref2 Row 2
386
387loop_sad2_prog:
388
389    subs          r5, #2
390    @Row 1
391    vld1.8        {d0, d1}, [r2], r4    @ load src Row 1
392    vabal.u8      q6, d8, d6
393    vabal.u8      q7, d9, d7
394    vld1.8        {d2, d3}, [r0], r3    @ load ref1 Row 1
395    vabal.u8      q8, d10, d6
396    vabal.u8      q9, d11, d7
397    vld1.8        {d4, d5}, [r1], r3    @ load ref2 Row 1
398
399    @Row 2
400    vld1.8        {d6, d7}, [r2], r4    @ load src Row 2
401    vabal.u8      q6, d2, d0
402    vabal.u8      q7, d3, d1
403    vld1.8        {d8, d9}, [r0], r3    @ load ref1 Row 2
404    vabal.u8      q8, d4, d0
405    vabal.u8      q9, d5, d1
406    vld1.8        {d10, d11}, [r1], r3  @ load ref2 Row 2
407
408    bne           loop_sad2_prog
409
410    vabal.u8      q6, d8, d6
411    vabal.u8      q7, d9, d7
412    vabal.u8      q8, d10, d6
413    vabal.u8      q9, d11, d7
414
415    @ Compute SAD
416
417    vadd.u16      q6, q6, q7            @ Q6  : sad_ref1
418    vadd.u16      q8, q8, q9            @ Q8  : sad_ref2
419
420    vadd.u16      d12, d12, d13
421    ldr           r5, [sp, #16]         @ loading pi4_sad to r5
422    vadd.u16      d16, d16, d17
423
424    vpadd.u16     d12, d12, d16
425    vpaddl.u16    d12, d12
426
427    vst1.64       {d12}, [r5]!
428    vpop          {d8-d15}
429    ldmfd         sp!, {r4-r5, pc}
430
431
432
433@*
434@//---------------------------------------------------------------------------
435@// Function Name      : Calculate_Mad3_prog()
436@//
@// Detail Description : This function finds the SAD values of 3 reference
@//                        MBs against the source MB in one shot
439@//
440@// Platform           : CortexA8/NEON            .
441@//
442@//-----------------------------------------------------------------------------
443@*
444
445    .global ime_calculate_sad3_prog_a9q
446
447ime_calculate_sad3_prog_a9q:
448
449    @ r0    = ref1     <UWORD8 *>
450    @ r1    = ref2     <UWORD8 *>
451    @ r2    = ref3     <UWORD8 *>
452    @ r3    = src      <UWORD8 *>
453    @ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>
454
455
456    stmfd         sp!, {r4-r6, lr}
457
458    ldrd          r4, r5, [sp, #16]     @ load ref stride to r4, src stride to r5
459    mov           r6, #14
460    vpush         {d8-d15}
461    @Row 1
462    vld1.8        {d0, d1}, [r3], r5    @ load src Row 1
463    vld1.8        {d2, d3}, [r0], r4    @ load ref1 Row 1
464    vld1.8        {d4, d5}, [r1], r4    @ load ref2 Row 1
465    vabdl.u8      q8, d2, d0
466    vabdl.u8      q9, d3, d1
467    vld1.8        {d6, d7}, [r2], r4    @ load ref3 Row 1
468    vabdl.u8      q10, d4, d0
469    vabdl.u8      q11, d5, d1
470
471    @Row 2
472    vld1.8        {d8, d9}, [r3], r5    @ load src Row 1
473    vabdl.u8      q12, d6, d0
474    vabdl.u8      q13, d7, d1
475    vld1.8        {d10, d11}, [r0], r4  @ load ref1 Row 1
476    vld1.8        {d12, d13}, [r1], r4  @ load ref2 Row 1
477    vabal.u8      q8, d10, d8
478    vabal.u8      q9, d11, d9
479    vld1.8        {d14, d15}, [r2], r4  @ load ref3 Row 1
480    vabal.u8      q10, d12, d8
481    vabal.u8      q11, d13, d9
482
483loop_sad3_prog:
484
485    @Row 1
486    vld1.8        {d0, d1}, [r3], r5    @ load src Row 1
487    vabal.u8      q12, d14, d8
488    vabal.u8      q13, d15, d9
489    vld1.8        {d2, d3}, [r0], r4    @ load ref1 Row 1
490    vld1.8        {d4, d5}, [r1], r4    @ load ref2 Row 1
491    vabal.u8      q8, d2, d0
492    vabal.u8      q9, d3, d1
493    vld1.8        {d6, d7}, [r2], r4    @ load ref3 Row 1
494    vabal.u8      q10, d4, d0
495    vabal.u8      q11, d5, d1
496
497    @Row 2
498    vld1.8        {d8, d9}, [r3], r5    @ load src Row 1
499    vabal.u8      q12, d6, d0
500    vabal.u8      q13, d7, d1
501    vld1.8        {d10, d11}, [r0], r4  @ load ref1 Row 1
502    subs          r6, #2
503    vld1.8        {d12, d13}, [r1], r4  @ load ref2 Row 1
504    vabal.u8      q8, d10, d8
505    vabal.u8      q9, d11, d9
506    vld1.8        {d14, d15}, [r2], r4  @ load ref3 Row 1
507    vabal.u8      q10, d12, d8
508    vabal.u8      q11, d13, d9
509
510    bne           loop_sad3_prog
511
512    vabal.u8      q12, d14, d8
513    vabal.u8      q13, d15, d9
514
515    @ Compute SAD
516
517    vadd.u16      q8, q8, q9            @ Q8  : sad_ref1
518    vadd.u16      q10, q10, q11         @ Q10 : sad_ref2
519    vadd.u16      q12, q12, q13         @ Q12 : sad_ref3
520
521    vadd.u16      d16, d16, d17
522    vadd.u16      d20, d20, d21
523    vadd.u16      d24, d24, d25
524
525    vpadd.u16     d16, d16, d20
526    vpadd.u16     d24, d24, d24
527
528    ldr           r6, [sp, #24]         @ loading pi4_sad to r6
529    vpaddl.u16    d16, d16
530    vpaddl.u16    d24, d24
531
532    vst1.64       {d16}, [r6]!
533    vst1.32       {d24[0]}, [r6]
534    vpop          {d8-d15}
535    ldmfd         sp!, {r4-r6, pc}
536
537
538
539@**
540@******************************************************************************
541@*
542@* @brief computes distortion (SAD) for sub-pel motion estimation
543@*
544@* @par   Description
545@*   This functions computes SAD for all the 8 half pel points
546@*
547@* @param[out] pi4_sad
548@*  integer evaluated sad
549@*  pi4_sad[0] - half x
550@*  pi4_sad[1] - half x - 1
551@*  pi4_sad[2] - half y
552@*  pi4_sad[3] - half y - 1
553@*  pi4_sad[4] - half xy
554@*  pi4_sad[5] - half xy - 1
555@*  pi4_sad[6] - half xy - strd
556@*  pi4_sad[7] - half xy - 1 - strd
557@*
558@* @remarks
559@*
560@******************************************************************************
561@*
562
563.text
564.p2align 2
565
    .global ime_sub_pel_compute_sad_16x16_a9q

@ void ime_sub_pel_compute_sad_16x16_a9q(UWORD8 *pu1_src, UWORD8 *pu1_ref_half_x,
@                                        UWORD8 *pu1_ref_half_y, UWORD8 *pu1_ref_half_xy,
@                                        WORD32 src_strd, WORD32 ref_strd,
@                                        WORD32 *pi4_sad)
@
@ Computes the eight half-pel SADs for a 16x16 block in a single pass.
@ The "top" pointers (r5/r7/r8) are read for the first row only; from
@ then on the previous row's "y"/"xy"/"xy left" loads are one row above
@ the current src row, so they double as the top variants and the loop
@ only needs five loads per row.
@ Store order (see function header above): pi4_sad[0..7] =
@ {x, x-1, y, y-strd, xy, xy-1, xy-strd, xy-1-strd}.
ime_sub_pel_compute_sad_16x16_a9q:

    stmfd         sp!, {r4-r11, lr}     @store register values to stack (36 bytes)

    ldr           r9, [sp, #36]         @ r9  = src_strd (entry [sp,#0] + 36)
    ldr           r10, [sp, #40]        @ r10 = ref_strd (entry [sp,#4] + 36)
    vpush         {d8-d15}              @ d8-d15 are callee-saved per AAPCS
    sub           r4, r1, #1            @ x left
    sub           r5, r2, r10           @ y top

    sub           r6, r3, #1            @ xy left
    sub           r7, r3, r10           @ xy top

    sub           r8, r7, #1            @ xy top-left
    mov           r11, #15              @ 15 rows after the software-pipelined first row

    @for bringing buffer2 into cache..., dummy load instructions
    @ LDR         r12,[r1]
    @ LDR         r12,[sp,#12]

    vld1.8        {d0, d1}, [r0], r9    @ src
    vld1.8        {d2, d3}, [r5], r10   @ y top LOAD (only read via r5)
    vld1.8        {d4, d5}, [r7], r10   @ xy top LOAD (only read via r7)
    vld1.8        {d6, d7}, [r8], r10   @ xy top-left LOAD (only read via r8)

    vabdl.u8      q6, d2, d0            @ y top ABS1
    vabdl.u8      q7, d4, d0            @ xy top ABS1
    vld1.8        {d8, d9}, [r1], r10   @ x LOAD
    vabdl.u8      q8, d6, d0            @ xy top-left ABS1
    vabdl.u8      q9, d8, d0            @ x ABS1
    vld1.8        {d10, d11}, [r4], r10 @ x left LOAD

    vabal.u8      q6, d3, d1            @ y top ABS2
    vabal.u8      q7, d5, d1            @ xy top ABS2
    vld1.8        {d2, d3}, [r2], r10   @ y LOAD
    vabal.u8      q8, d7, d1            @ xy top-left ABS2
    vabal.u8      q9, d9, d1            @ x ABS2
    vld1.8        {d4, d5}, [r3], r10   @ xy LOAD

    vabdl.u8      q10, d10, d0          @ x left ABS1
    vabdl.u8      q11, d2, d0           @ y ABS1
    vld1.8        {d6, d7}, [r6], r10   @ xy left LOAD
    vabdl.u8      q12, d4, d0           @ xy ABS1
    vabdl.u8      q13, d6, d0           @ xy left ABS1

loop_sub_pel_16x16:

    @ d2..d7 still hold last row's y/xy/xy-left data, which sits one row
    @ above the src row loaded below -> it serves as the "top" input here.
    vabal.u8      q10, d11, d1          @ x left ABS2
    vabal.u8      q11, d3, d1           @ y ABS2
    subs          r11, #1               @ one row per iteration
    vabal.u8      q12, d5, d1           @ xy ABS2
    vabal.u8      q13, d7, d1           @ xy left ABS2

    vld1.8        {d0, d1}, [r0], r9    @ src
    vabal.u8      q6, d2, d0            @ y top ABS1
    vabal.u8      q7, d4, d0            @ xy top ABS1
    vld1.8        {d8, d9}, [r1], r10   @ x LOAD
    vabal.u8      q8, d6, d0            @ xy top-left ABS1
    vabal.u8      q9, d8, d0            @ x ABS1
    vld1.8        {d10, d11}, [r4], r10 @ x left LOAD

    vabal.u8      q6, d3, d1            @ y top ABS2
    vabal.u8      q7, d5, d1            @ xy top ABS2
    vld1.8        {d2, d3}, [r2], r10   @ y LOAD
    vabal.u8      q8, d7, d1            @ xy top-left ABS2
    vabal.u8      q9, d9, d1            @ x ABS2
    vld1.8        {d4, d5}, [r3], r10   @ xy LOAD

    vabal.u8      q10, d10, d0          @ x left ABS1
    vabal.u8      q11, d2, d0           @ y ABS1
    vld1.8        {d6, d7}, [r6], r10   @ xy left LOAD
    vabal.u8      q12, d4, d0           @ xy ABS1
    vabal.u8      q13, d6, d0           @ xy left ABS1

    bne           loop_sub_pel_16x16

    vabal.u8      q10, d11, d1          @ x left ABS2 (final row)
    vabal.u8      q11, d3, d1           @ y ABS2
    vabal.u8      q12, d5, d1           @ xy ABS2
    vabal.u8      q13, d7, d1           @ xy left ABS2

    vadd.i16      d0, d18, d19          @ x
    vadd.i16      d3, d12, d13          @ y top
    vadd.i16      d6, d14, d15          @ xy top
    vadd.i16      d5, d26, d27          @ xy left
    vadd.i16      d1, d20, d21          @ x left
    vadd.i16      d2, d22, d23          @ y
    vadd.i16      d4, d24, d25          @ xy
    vadd.i16      d7, d16, d17          @ xy top left

    vpadd.i16     d0, d0, d1            @ {x, x left} pair sums
    vpadd.i16     d2, d2, d3            @ {y, y top} pair sums
    vpadd.i16     d4, d4, d5            @ {xy, xy left} pair sums
    vpadd.i16     d6, d6, d7            @ {xy top, xy top-left} pair sums

    vpaddl.u16    d0, d0                @ two u32 SADs per d-register from here on
    vpaddl.u16    d2, d2
    vpop          {d8-d15}              @ restore callee-saved NEON registers
    ldr           r11, [sp, #44]        @ pi4_sad: entry [sp,#8] + 36-byte stmfd
    vpaddl.u16    d4, d4
    vpaddl.u16    d6, d6

    vst1.32       {d0}, [r11]!          @ pi4_sad[0..1] = {x, x left}
    vst1.32       {d2}, [r11]!          @ pi4_sad[2..3] = {y, y top}
    vst1.32       {d4}, [r11]!          @ pi4_sad[4..5] = {xy, xy left}
    vst1.32       {d6}, [r11]!          @ pi4_sad[6..7] = {xy top, xy top-left}

    ldmfd         sp!, {r4-r11, pc}     @Restoring registers from stack
676
677
678
679@**
680@******************************************************************************
681@*
682@* @brief computes distortion (SAD) between 2 16x16 blocks
683@*
684@* @par   Description
685@*   This functions computes SAD between 2 16x16 blocks. There is a provision
686@*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
687@*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
688@*
689@* @param[in] pu1_src
690@*  UWORD8 pointer to the source
691@*
692@* @param[out] pu1_dst
693@*  UWORD8 pointer to the destination
694@*
695@* @param[in] src_strd
696@*  integer source stride
697@*
698@* @param[in] dst_strd
699@*  integer destination stride
700@*
701@* @param[in] i4_max_sad
702@*  integer maximum allowed distortion
703@*
704@* @param[in] pi4_mb_distortion
705@*  integer evaluated sad
706@*
707@* @remarks
708@*
709@******************************************************************************
710@*
711
712.text
713.p2align 2
714
    .global ime_compute_sad_16x16_a9q

@ void ime_compute_sad_16x16_a9q(UWORD8 *pu1_src, UWORD8 *pu1_est,
@                                WORD32 src_strd, WORD32 est_strd,
@                                WORD32 i4_max_sad, WORD32 *pi4_mb_distortion)
@
@ Exact SAD between two 16x16 blocks.  i4_max_sad (entry [sp,#0]) is not
@ read by this implementation (no early exit); the SAD is written to
@ *pi4_mb_distortion.
ime_compute_sad_16x16_a9q:


    @STMFD       sp!,{r12,lr}
    stmfd         sp!, {r12, r14}       @store register values to stack (8 bytes)

    @for bringing buffer2 into cache..., dummy load instructions
    @ LDR         r12,[r1]
    @ LDR         r12,[sp,#12]

    vld1.8        {d4, d5}, [r0], r2    @ src row 0
    vld1.8        {d6, d7}, [r1], r3    @ est row 0
    vpush         {d8-d15}              @ d8-d15 are callee-saved per AAPCS
    mov           r12, #14              @ 14 rows left for the loop (2 per pass)
    vld1.8        {d8, d9}, [r0], r2    @ src row 1
    vabdl.u8      q0, d4, d6            @ q0 = |src - est|, pixels 0..7
    vld1.8        {d10, d11}, [r1], r3  @ est row 1
    vabdl.u8      q1, d5, d7            @ q1 = |src - est|, pixels 8..15

loop_sad_16x16:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d8, d10           @ accumulate abs diff of the pipelined row
    vld1.8        {d6, d7}, [r1], r3
    vabal.u8      q1, d9, d11

    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d4, d6
    subs          r12, #2               @ two rows consumed per iteration
    vld1.8        {d10, d11}, [r1], r3
    vabal.u8      q1, d5, d7

    bne           loop_sad_16x16

    vabal.u8      q0, d8, d10           @ fold in the final pipelined row
    vabal.u8      q1, d9, d11

    vadd.i16      q0, q0, q1            @ horizontal reduction of 16-bit partial sums
    vadd.i16      d0, d1, d0
    vpop          {d8-d15}              @ restore callee-saved NEON registers
    ldr           r12, [sp, #12]        @ pi4_mb_distortion: entry [sp,#4] + 8-byte stmfd

    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0                @ d0[0] = total 16x16 SAD
    vst1.32       {d0[0]}, [r12]        @ store the result

    ldmfd         sp!, {r12, pc}        @Restoring registers from stack
764
765
766@*
767@//---------------------------------------------------------------------------
768@// Function Name      : Calculate_Mad4_prog()
769@//
@// Detail Description : This function finds the SAD values of the 4 neighbouring
@//                        progressive MBs in one shot
772@//
773@// Platform           : CortexA8/NEON            .
774@//
775@//-----------------------------------------------------------------------------
776@*
777
    .global ime_calculate_sad4_prog_a9q

@ void ime_calculate_sad4_prog_a9q(UWORD8 *temp_frame, UWORD8 *buffer_ptr,
@                                  UWORD32 ref_buf_width, UWORD32 cur_buf_width,
@                                  WORD32 *psad)
@
@ One-pass SAD of the 16x16 source MB (buffer_ptr) against the four
@ one-pel neighbours of temp_frame: left (-1), right (+1), top (-stride)
@ and bottom (+stride).
@ Store order: psad[0..3] = {sad_left, sad_right, sad_top, sad_bot}.
ime_calculate_sad4_prog_a9q:
    @ r0    = temp_frame     <UWORD8 *>
    @ r1    = buffer_ptr     <UWORD8 *>
    @ r2    = RefBufferWidth <UWORD32>
    @ r3    = CurBufferWidth <UWORD32>
    @ stack = psad           <UWORD32 *> {at 0x34}

    stmfd         sp!, {r4-r7, lr}      @ 20 bytes pushed

    @UWORD8 *left_ptr       = temp_frame - 1;
    @UWORD8 *right_ptr      = temp_frame + 1;
    @UWORD8 *top_ptr        = temp_frame - RefBufferWidth;
    @UWORD8 *bot_ptr        = temp_frame + RefBufferWidth;

    mov           r7, #14               @ 14 rows left for the loop (2 per pass)
    sub           r4, r0, #0x01         @r4 = left_ptr
    add           r5, r0, #0x1          @r5 = right_ptr
    sub           r6, r0, r2            @r6 = top_ptr
    add           r0, r0, r2            @r0 = bot_ptr
                                        @r1 = buffer_ptr
    vpush         {d8-d15}              @ d8-d15 are callee-saved; psad now at [sp,#84]
    @D0:D1  : buffer
    @D2:D3  : top
    @D4:D5  : left
    @D6:D7  : right
    @D8:D9  : bottom

    @Row 1
    vld1.8        {d0, d1}, [r1], r3    @ load src Row 1
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 1
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 1

    vabdl.u8      q5, d2, d0            @ q5/q6   accumulate |src - top|
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 1
    vabdl.u8      q6, d3, d1

    vabdl.u8      q7, d0, d4            @ q7/q8   accumulate |src - left|
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 1
    vabdl.u8      q8, d1, d5

    @Row 2
    vabdl.u8      q9, d0, d6            @ q9/q10  accumulate |src - right|
    vld1.8        {d26, d27}, [r1], r3  @ load src Row 2
    vabdl.u8      q10, d1, d7

    vabdl.u8      q11, d0, d8           @ q11/q12 accumulate |src - bottom|
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 2
    vabdl.u8      q12, d1, d9

loop_sad4_prog:

    vabal.u8      q5, d26, d2
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 2
    vabal.u8      q6, d27, d3

    vabal.u8      q7, d26, d4
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 2
    vabal.u8      q8, d27, d5

    vabal.u8      q9, d26, d6
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 2
    vabal.u8      q10, d27, d7

    @Row 1
    vabal.u8      q11, d26, d8
    vld1.8        {d0, d1}, [r1], r3    @ load src Row 1
    vabal.u8      q12, d27, d9

    vld1.8        {d2, d3}, [r6], r2    @ load top Row 1
    subs          r7, #2                @ two rows per iteration
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 1

    vabal.u8      q5, d0, d2
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 1
    vabal.u8      q6, d1, d3

    vabal.u8      q7, d0, d4
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 1
    vabal.u8      q8, d1, d5

    @Row 2
    vabal.u8      q9, d0, d6
    vld1.8        {d26, d27}, [r1], r3  @ load src Row 2
    vabal.u8      q10, d1, d7

    vabal.u8      q11, d0, d8
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 2
    vabal.u8      q12, d1, d9

    bne           loop_sad4_prog

    @ Drain the software pipeline: final row pair.
    vabal.u8      q5, d26, d2
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 2
    vabal.u8      q6, d27, d3

    vabal.u8      q7, d26, d4
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 2
    vabal.u8      q8, d27, d5

    vabal.u8      q9, d26, d6
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 2
    vabal.u8      q10, d27, d7

    vabal.u8      q11, d26, d8
    vabal.u8      q12, d27, d9

    @;Q5:Q6   : sad_top
    @;Q7:Q8   : sad_left
    @;Q9:Q10  : sad_right
    @;Q11:Q12 : sad_bot

    vadd.u16      q5, q5, q6
    vadd.u16      q7, q7, q8
    vadd.u16      q9, q9, q10
    vadd.u16      q11, q11, q12

    @; Free :-
    @; Q6,Q8,Q10,Q12

    @;Q5  -> D10:D11
    @;Q7  -> D14:D15
    @;Q9  -> D18:D19
    @;Q11 -> D22:D23

    vadd.u16      d10, d10, d11
    vadd.u16      d14, d14, d15
    vadd.u16      d18, d18, d19
    vadd.u16      d22, d22, d23

    @;D10  : sad_top
    @;D14  : sad_left
    @;D18  : sad_right
    @;D22  : sad_bot


    vpaddl.u16    d11, d10
    vpaddl.u16    d15, d14
    vpaddl.u16    d19, d18
    vpaddl.u16    d23, d22

    @;D11  : sad_top
    @;D15  : sad_left
    @;D19  : sad_right
    @;D23  : sad_bot

    vpaddl.u32    d10, d11
    vpaddl.u32    d22, d23
    vpaddl.u32    d14, d15
    vpaddl.u32    d18, d19

    @;D10  : sad_top
    @;D14  : sad_left
    @;D18  : sad_right
    @;D22  : sad_bot

    ldr           r4, [sp, #84]         @ psad: entry [sp,#0] + 20 (stmfd) + 64 (vpush)

    vsli.64       d10, d22, #32         @ d10 = {sad_top, sad_bot}
    vsli.64       d14, d18, #32         @ d14 = {sad_left, sad_right}

    vst1.64       {d14}, [r4]!          @ psad[0..1] = {left, right}
    vst1.64       {d10}, [r4]!          @ psad[2..3] = {top, bot}
    vpop          {d8-d15}              @ restore callee-saved NEON registers
    ldmfd         sp!, {r4-r7, pc}
944
945
946
947
948@*****************************************************************************
949@*
950@* Function Name        : ime_compute_satqd_16x16_lumainter_a9
@* Description          : This function computes SAD for a 16x16 block.
@                       : It also computes if any 4x4 block will have a nonzero coefficient after transform and quant
953@
954@  Arguments            :   R0 :pointer to src buffer
955@                           R1 :pointer to est buffer
956@                           R2 :source stride
957@                           R3 :est stride
@                           STACK :Threshold, distortion, is_nonzero
959@*
960@* Values Returned   : NONE
961@*
962@* Register Usage    : R0-R11
963@* Stack Usage       :
964@* Cycles            : Around
@* Interruptibility  : Interruptible
966@*
967@* Known Limitations
968@*   \Assumptions    :
969@*
970@* Revision History  :
971@*         DD MM YYYY    Author(s)          Changes
972@*         14 04 2014    Harinarayanan K K  First version
973@*
974@*****************************************************************************
    .global ime_compute_satqd_16x16_lumainter_a9q
ime_compute_satqd_16x16_lumainter_a9q:
    @ Computes distortion between a 16x16 source block and a 16x16 predicted
    @ block.  The macroblock is processed as eight 4x8 sub-blocks.  For each
    @ sub-block, partial sums of absolute differences are transformed
    @ (transpose + add/sub network below) and compared against a caller
    @ supplied threshold table; any comparison failure marks the block
    @ non-zero.  Once a non-zero sub-block is seen, the remaining sub-blocks
    @ are processed by a cheaper SAD-only loop (compute_sad_only).
    @ Outputs: total SAD is stored to *R5, the non-zero flag to *R6.
    @
    @R0 :pointer to src buffer
    @R1 :pointer to est buffer
    @R2 :Source stride
    @R3 :Pred stride
    @R4 :Threshold pointer (5th argument, loaded from the stack below)
    @R5 :Distortion, ie SAD (pointer, 6th argument, loaded from stack at exit)
    @R6 :is nonzero (pointer, 7th argument, loaded from stack at exit)

    push          {r4-r12, lr}          @push all the variables first
    @ADD      SP,SP,#40         ;decrement stack pointer,to accommodate two variables
    ldr           r4, [sp, #40]         @load the threshold address (5th arg; 40 = 10 pushed regs * 4)
    vpush         {d8-d15}              @save callee-saved NEON registers
    mov           r8, #8                @Number of 4x8 blocks to be processed
    mov           r10, #0               @Running SAD accumulator
    mov           r7, #0                @Non-zero flag (0 => all blocks zero so far)
    @----------------------------------------------------
    @ Prologue: load the first 4x8 block (stage I of the software pipeline)

    vld1.u8       d30, [r0], r2         @I  load 8 pix src row 1

    vld1.u8       d31, [r1], r3         @I  load 8 pix pred row 1

    vld1.u8       d28, [r0], r2         @I  load 8 pix src row 2

    vld1.u8       d29, [r1], r3         @I  load 8 pix pred row 2

    vld1.u8       d26, [r0], r2         @I  load 8 pix src row 3
    vabdl.u8      q0, d30, d31          @I  Abs diff row 1 (covers 4x4 blocks 1 and 2)

    vld1.u8       d27, [r1], r3         @I  load 8 pix pred row 3

    vld1.u8       d24, [r0], r2         @I  load 8 pix src row 4

    vld1.u8       d25, [r1], r3         @I  load 8 pix pred row 4
    vabdl.u8      q1, d28, d29          @I  Abs diff row 2 (blocks 1 and 2)

    vld1.u16      {q11}, [r4]           @I  load the threshold table
    vabdl.u8      q2, d26, d27          @I  Abs diff row 3 (blocks 1 and 2)

    vabdl.u8      q3, d24, d25          @I  Abs diff row 4 (blocks 1 and 2)



core_loop:
                                        @S1  S2  S3  S4     A1  A2  A3  A4
                                        @S5  S6  S7  S8     A5  A6  A7  A8
                                        @S9  S10 S11 S12    A9  A10 A11 A12
                                        @S13 S14 S15 S16    A13 A14 A15 A16
    ands          r11, r8, #1           @II See if we are at even or odd block (only the flags matter;
                                        @   r11 is overwritten on the next line, the EQ/NE conditions below
                                        @   still use the flags set here)
    vadd.u16      q4 , q0, q3           @I  Add r1 r4
    lsl           r11, r2, #2           @II r11 = 4*src_stride, to move back src 4 rows

    subeq         r0, r0, r11           @II Move back src 4 rows if we are at even block
    vadd.u16      q5 , q1, q2           @I  Add r2 r3
    addeq         r0, r0, #8            @II Move src 8 cols forward if we are at even block

    lsl           r11, r3, #2           @II r11 = 4*pred_stride, to move back pred 4 rows
    vtrn.16       d8 , d10              @I transpose 1
    subeq         r1, r1, r11           @II Move back pred 4 rows if we are at even block

    addeq         r1, r1, #8            @II Move pred 8 cols forward if we are at even block
    vtrn.16       d9 , d11              @I transpose 2
    subne         r0, r0, #8            @II Src 8 cols back for odd blocks

    subne         r1, r1, #8            @II Pred 8 cols back for odd blocks
    vtrn.32       d10, d11              @I transpose 4


    vtrn.32       d8 , d9               @I transpose 3
    vswp          d10, d11              @I rearrange so that the q4 and q5 add properly
                                        @D8     S1 S4 A1 A4
                                        @D9     S2 S3 A2 A3
                                        @D11    S1 S4 A1 A4
                                        @D10    S2 S3 A2 A3

    vadd.s16      q6, q4, q5            @I  Get s1 s4
    vld1.u8       d30, [r0], r2         @II load first 8 pix src row 1

    vtrn.s16      d12, d13              @I  Get s2 s3
                                        @D12 S1 S4 A1 A4
                                        @D13 S2 S3 A2 A3

    vshl.s16      q7, q6 , #1           @I  si  = si<<1
    vld1.u8       d31, [r1], r3         @II load first 8 pix pred row 1

    vpadd.s16     d16, d12, d13         @I  (s1 + s4) (s2 + s3)
    vld1.u8       d28, [r0], r2         @II load first 8 pix src row 2
                                        @   D16  S14 A14 S23 A23
    vrev32.16     d0, d16               @I  swap 16-bit pairs within each 32-bit lane
    vuzp.s16      d16, d0               @I  unzip to group src sums before pred sums
                                        @D16  S14 S23 A14 A23
    vadd.s16      d17, d12, d13         @I  (s1 + s2) (s3 + s4)
    vld1.u8       d29, [r1], r3         @II load first 8 pix pred row 2
                                        @D17  S12 S34 A12 A34

    vrev32.16     q9, q7                @I  Rearrange si's
                                        @Q9  Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2

                                        @D12    S1 S4 A1 A4
                                        @D19    Z3 Z2 Y3 Y2
    vsub.s16      d8, d12, d19          @I  (s1 - (s3<<1)) (s4 - (s2<<1))
    vld1.u8       d26, [r0], r2         @II load first 8 pix src row 3
                                        @D13    S2 S3 A2 A3
                                        @D18    Z4 Z1 Y4 Y1
    vsub.s16      d9, d13, d18          @I  (s2 - (s4<<1)) (s3 - (s1<<1))
    vld1.u8       d27, [r1], r3         @II load first 8 pix pred row 3
                                        @Q10    S8 S5 A8 A5 S7 S4 A7 A4

                                        @D16  S14 S23 A14 A23
    vpadd.s16     d10, d16, d17         @I  Get sad by adding s1 s2 s3 s4
    vld1.u8       d24, [r0], r2         @II load first 8 pix src row 4
                                        @D22 SAD1 SAD2 junk junk


                                        @Q8     S2 S1 A2 A1 S6 S3 A6 A3
                                        @Q10    S8 S5 A8 A5 S7 S4 A7 A4
    vtrn.32       q8, q4                @I  Rearrange to put the ls terms of each block together
                                        @Q8     S2 S1 S8 S5 S6 S3 S7 S4
                                        @Q10    A2 A1 A8 A5 A6 A3 A7 A4


    ldrh          r11, [r4, #16]        @I  Load the threshold for the DC value (9th u16 entry)
    vdup.s16      q6, d10[0]            @I  Broadcast the sad of blk 1
    vabdl.u8      q0, d30, d31          @II Abs diff row 1 (blocks 1 and 2)

    vshl.s16      q7, q6, #1            @I  sad_2 = sad_1<<1
    vmov.s16      r9, d10[0]            @I  Get the sad for block 1

    vsub.s16      q9, q7, q8            @I  Subtract the ls terms from 2*sad (blk 1)
    vmov.s16      r5, d10[1]            @I  Get the sad for block 2

    vcle.s16      q7, q11, q9           @I  Compare against the thresholds (blk 1)
    vld1.u8       d25, [r1], r3         @II load first 8 pix pred row 4

    vdup.s16      q15, d10[1]           @I  Broadcast the sad of blk 2
    vabdl.u8      q1, d28, d29          @II Abs diff row 2 (blocks 1 and 2)


    vshl.s16      q14, q15, #1          @I  sad_2 = sad_1<<1
    vsub.s16      q3, q14, q4           @I  Subtract the ls terms from 2*sad (blk 2)
    vcle.s16      q15, q11, q3          @I  Compare against the thresholds (blk 2)

    ADD           R10, R10, R9          @I  Add to  the global sad blk 1
    vtrn.u8       q15, q7               @I  get all comparison bits to one reg
    vabdl.u8      q2, d26, d27          @II Abs diff row 3 (blocks 1 and 2)

    ADD           R10, R10, R5          @I  Add to  the global sad blk 2
    vshr.u8       q14, q15, #7          @I  Shift the bits so that no  overflow occurs
    cmp           r11, r9               @I  Compare with threshold blk 1

    movle         r7, #0xf              @I  If not met mark it by moving a non zero val to R7 blk 1
    vadd.u8       d28, d28, d29         @I  Add the bits
    cmp           r11, r5               @I  Compare with threshold blk 2

    movle         r7, #0xf              @I  If not met mark it by moving a non zero val to R7 blk 2
    vpadd.u8      d28, d28, d29         @I  Add the bits

    vmov.u32      r11, d28[0]           @I  A set bit now represents an unsatisfied condition; store it in r11
    vabdl.u8      q3, d24, d25          @II Abs diff row 4 (blocks 1 and 2)

    orr           r7, r7, r11           @I  Merge the comparison result into the non-zero flag


    sub           r8, r8, #1            @I  Decrement block count

    cmp           r7, #0                @I  If we have at least one non zero block
    bne           compute_sad_only      @I  if a non zero block is there, from now on compute sad only

    cmp           r8, #1                @I  See if we are at the last block
    bne           core_loop             @I  If the blocks are zero, lets continue the satqd


    @EPILOGUE for core loop: process the final block without starting a new
    @stage-II load (same transform/threshold sequence as above)
                                        @S1  S2  S3  S4     A1  A2  A3  A4
                                        @S5  S6  S7  S8     A5  A6  A7  A8
                                        @S9  S10 S11 S12    A9  A10 A11 A12
                                        @S13 S14 S15 S16    A13 A14 A15 A16
    vadd.u16      q4 , q0, q3           @Add r1 r4
    vadd.u16      q5 , q1, q2           @Add r2 r3
                                        @D8     S1 S2 S2 S1
                                        @D10    S4 S3 S3 S4
                                        @D9     A1 A2 A2 A1
                                        @D11    A4 A3 A3 A4
    vtrn.16       d8 , d10              @transpose 1
    vtrn.16       d9 , d11              @transpose 2
    vtrn.32       d8 , d9               @transpose 3
    vtrn.32       d10, d11              @transpose 4

    vswp          d10, d11              @rearrange so that the q4 and q5 add properly
                                        @D8     S1 S4 A1 A4
                                        @D9     S2 S3 A2 A3
                                        @D11    S1 S4 A1 A4
                                        @D10    S2 S3 A2 A3
    vadd.s16      q6, q4, q5            @Get s1 s4
    vtrn.s16      d12, d13              @Get s2 s3
                                        @D12 S1 S4 A1 A4
                                        @D13 S2 S3 A2 A3

    vshl.s16      q7, q6 , #1           @si  = si<<1
    vmov.s16      r9, d10[0]            @Get the sad for block 1

    vpadd.s16     d16, d12, d13         @(s1 + s4) (s2 + s3)
    vmov.s16      r5, d10[1]            @Get the sad for block 2
                                        @D16  S14 A14 S23 A23
    vrev32.16     d30, d16              @swap 16-bit pairs within each 32-bit lane
    vuzp.s16      d16, d30              @unzip to group src sums before pred sums
                                        @D16  S14 S23 A14 A23
    vadd.s16      d17, d12, d13         @(s1 + s2) (s3 + s4)
                                        @D17  S12 S34 A12 A34

    vrev32.16     q9, q7                @Rearrange si's
                                        @Q9  Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2

                                        @D12    S1 S4 A1 A4
                                        @D19    Z3 Z2 Y3 Y2
    vsub.s16      d8, d12, d19          @(s1 - (s3<<1)) (s4 - (s2<<1))
                                        @D13    S2 S3 A2 A3
                                        @D18    Z4 Z1 Y4 Y1
    vsub.s16      d9, d13, d18          @(s2 - (s4<<1)) (s3 - (s1<<1))
                                        @Q10    S8 S5 A8 A5 S7 S4 A7 A4

                                        @D16  S14 S23 A14 A23
    vpadd.s16     d10, d16, d17         @Get sad by adding s1 s2 s3 s4
                                        @D22 SAD1 SAD2 junk junk
    vmov.u16      r9, d10[0]            @Get the sad for block 1
    vmov.u16      r5, d10[1]            @Get the sad for block 2

                                        @Q8     S2 S1 A2 A1 S6 S3 A6 A3
                                        @Q10    S8 S5 A8 A5 S7 S4 A7 A4
    ldrh          r11, [r4, #16]        @Load the threshold for the DC value (9th u16 entry)
    vtrn.32       q8, q4                @Rearrange to put the ls terms of each block together
    ADD           R10, R10, R9          @Add to  the global sad blk 1

                                        @Q8     S2 S1 S8 S5 S6 S3 S7 S4
                                        @Q10    A2 A1 A8 A5 A6 A3 A7 A4

    vld1.u16      {q11}, [r4]           @load the threshold table
    ADD           R10, R10, R5          @Add to  the global sad blk 2

    vdup.u16      q6, d10[0]            @Broadcast the sad of blk 1

    cmp           r11, r9               @Compare with threshold blk 1
    vshl.u16      q7, q6, #1            @sad_2 = sad_1<<1

    vsub.s16      q9, q7, q8            @Subtract the ls terms from 2*sad (blk 1)

    vcle.s16      q15, q11, q9          @Compare against the thresholds (blk 1)
    movle         r7, #0xf              @If not met mark it by moving a non zero val to R7 blk 1

    cmp           r11, r5               @Compare with threshold blk 2
    vdup.u16      q14, d10[1]           @Broadcast the sad of blk 2

    vshl.u16      q13, q14, #1          @sad_2 = sad_1<<1
    vsub.s16      q12, q13, q4          @Subtract the ls terms from 2*sad (blk 2)
    vcle.s16      q14, q11, q12         @Compare against the thresholds (blk 2)
    movle         r7, #0xf              @If not met mark it by moving a non zero val to R7 blk 2

    vtrn.u8       q14, q15              @get all comparison bits to one reg
    vshr.u8       q14, q14, #7          @Shift the bits so that no  overflow occurs
    vadd.u8       d28, d28, d29         @Add the bits
    vpadd.u8      d28, d28, d29         @Add the bits
    vmov.u32      r11, d28[0]           @A set bit now represents an unsatisfied condition; store it in r11
    orr           r7, r7, r11           @Merge the comparison result into the non-zero flag

    b             funcend_sad_16x16     @Since all blocks are processed now, go to end

compute_sad_only:                       @This block computes SAD only, so will be lighter
                                        @It will start processing at an odd block
                                        @It will compute sad for the odd block,
                                        @and then for two blocks at a time
                                        @The counter is r8, hence r8 blocks will be processed

    and           r11, r8, #1           @Get the last bit of counter
    cmp           r11, #0               @See if we are at even or odd block
                                        @if the blk is even we just have to set the pointer to the
                                        @start of current row

    lsleq         r11, r2, #2           @Move back src 4 rows
    subeq         r0, r0, r11           @Move back src 4 rows if we are at even block

    lsleq         r11, r3, #2           @Move back pred 4 rows
    subeq         r1, r1, r11           @Move back pred 4 rows if we are at even block
    @ADDEQ R8,R8,#2         ;Inc counter
    beq           skip_odd_blk          @If the blk is even, the stage-II abs diffs already loaded are not needed;
                                        @if odd we must first fold them into the sad


    vadd.u16      q4, q0, q1            @Add SAD of row1 and row2
    vadd.u16      q5, q2, q3            @Add SAD of row3 and row4
    vadd.u16      q6, q4, q5            @Add SAD of row 1-4
    vadd.u16      d14, d12, d13         @Add Blk1 and blk2
    vpadd.u16     d16, d14, d15         @Add col 1-2 and 3-4
    vpadd.u16     d18, d16, d17         @Add col 12-34

    vmov.u16      r9, d18[0]            @Move sad to arm
    ADD           R10, R10, R9          @Add to  the global sad

    sub           r8, r8, #1            @Dec counter
    cmp           r8, #0                @See if we processed last block
    beq           funcend_sad_16x16     @if we processed the last block goto end of func

    sub           r0, r0, #8            @Since we processed an odd block move back src by 8 cols
    sub           r1, r1, #8            @Since we processed an odd block move back pred by 8 cols

skip_odd_blk:
    @From here on: plain 16-wide SAD over the remaining rows, two 4x8 blocks
    @(one full 16-pixel row pair) per iteration

    vmov.s16      q0, #0                @Initialize the accumulator
    vmov.s16      q1, #0                @Initialize the accumulator

    vld1.u8       {q15}, [r0], r2       @load src r1
    vld1.u8       {q14}, [r1], r3       @load pred r1

    vld1.u8       {q13}, [r0], r2       @load src r2
    vld1.u8       {q12}, [r1], r3       @load pred r2

    vld1.u8       {q11}, [r0], r2       @load src r3
    vld1.u8       {q10}, [r1], r3       @load pred r3

    vld1.u8       {q9}, [r0], r2        @load src r4
    vld1.u8       {q8}, [r1], r3        @load pred r4

    cmp           r8, #2                @Only one 16x4 strip left? Skip the loop
    beq           sad_epilouge

sad_loop:

    vabal.u8      q0, d30, d28          @I  accumulate Abs diff R1
    vabal.u8      q1, d31, d29          @I  accumulate Abs diff R1

    vld1.u8       {q15}, [r0], r2       @II load r1 src
    vabal.u8      q0, d26, d24          @I  accumulate Abs diff R2

    vld1.u8       {q14}, [r1], r3       @II load r1 pred
    vabal.u8      q1, d27, d25          @I  accumulate Abs diff R2

    vld1.u8       {q13}, [r0], r2       @II load r2 src
    vabal.u8      q0, d22, d20          @I  accumulate Abs diff R3

    vld1.u8       {q12}, [r1], r3       @II load r2 pred
    vabal.u8      q1, d23, d21          @I  accumulate Abs diff R3

    vld1.u8       {q11}, [r0], r2       @II load r3 src
    vabal.u8      q0, d18, d16          @I  accumulate Abs diff R4


    sub           r8, r8, #2            @Since we process 16 pix at a time, dec by 2
    vld1.u8       {q10}, [r1], r3       @II load r3 pred
    vabal.u8      q1, d19, d17          @I  accumulate Abs diff R4

    cmp           r8, #2                @Check if last loop
    vld1.u8       {q9}, [r0], r2        @II load r4 src
    vld1.u8       {q8}, [r1], r3        @II load r4 pred

    bne           sad_loop              @Go back to SAD computation

sad_epilouge:
    @Drain the pipeline: accumulate the last loaded 16x4 strip
    vabal.u8      q0, d30, d28          @Accumulate Abs diff R1
    vabal.u8      q1, d31, d29          @Accumulate Abs diff R1

    vabal.u8      q0, d26, d24          @Accumulate Abs diff R2
    vabal.u8      q1, d27, d25          @Accumulate Abs diff R2

    vabal.u8      q0, d22, d20          @Accumulate Abs diff R3
    vabal.u8      q1, d23, d21          @Accumulate Abs diff R3

    vabal.u8      q0, d18, d16          @Accumulate Abs diff R4
    vabal.u8      q1, d19, d17          @Accumulate Abs diff R4

    vadd.u16      q2, q0, q1            @ADD two accumulators
    vadd.u16      d6, d4, d5            @Add two blk sad
    vpadd.u16     d8, d6, d7            @Add col 1-2 and 3-4 sad
    vpadd.u16     d10, d8, d9           @Add col 12-34 sad

    vmov.u16      r9, d10[0]            @move SAD to ARM
    ADD           R10, R10, R9          @Add to  the global sad

funcend_sad_16x16:                      @End of function processing

    vpop          {d8-d15}              @restore callee-saved NEON registers
    ldr           r5, [sp, #44]         @load pointer for distortion output (6th arg)
    ldr           r6, [sp, #48]         @load pointer for is-nonzero output (7th arg)

    str           r7, [r6]              @Store the is-nonzero flag
    str           r10, [r5]             @Store sad

    @SUB SP,SP,#40
    pop           {r4-r12, pc}
1362
1363
1364