1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20//**
21
22///**
23//******************************************************************************
24//*
25//*
26//* @brief
27//*  This file contains definitions of routines that compute distortion
28//*  between two macro/sub blocks of identical dimensions
29//*
30//* @author
31//*  Ittiam
32//*
33//* @par List of Functions:
34//*  - ime_compute_sad_16x16()
35//*  - ime_compute_sad_8x8()
36//*  - ime_compute_sad_4x4()
37//*  - ime_compute_sad_16x8()
38//*  - ime_compute_satqd_16x16_lumainter_av8()
39//*
40//* @remarks
41//*  None
42//*
43//*******************************************************************************
44//
45
46
47///**
48//******************************************************************************
49//*
50//* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
51//*
52//* @par   Description
//*   This function computes SAD between 2 16x16 blocks. There is a provision
54//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
55//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
56//*
57//* @param[in] pu1_src
58//*  UWORD8 pointer to the source
59//*
60//* @param[out] pu1_dst
61//*  UWORD8 pointer to the destination
62//*
63//* @param[in] src_strd
64//*  integer source stride
65//*
66//* @param[in] dst_strd
67//*  integer destination stride
68//*
69//* @param[in] i4_max_sad
70//*  integer maximum allowed distortion
71//*
72//* @param[in] pi4_mb_distortion
73//*  integer evaluated sad
74//*
75//* @remarks
76//*
77//******************************************************************************
78//*/
79.text
80.p2align 2
81
// Save the callee-saved SIMD registers d8-d15 in a 64-byte frame below sp.
// d8/d9 land at the highest address and d14/d15 at the new sp, the same
// layout that four successive 16-byte pre-indexed pushes would produce.
.macro push_v_regs
    stp       d14, d15, [sp, #-64]!     // allocate the whole frame at once
    stp       d12, d13, [sp, #16]
    stp       d10, d11, [sp, #32]
    stp       d8, d9, [sp, #48]
.endm
// Reload d8-d15 from the 64-byte frame at sp (d14/d15 at sp, d8/d9 at
// sp+48) and release the frame.
.macro pop_v_regs
    ldp       d8, d9, [sp, #48]
    ldp       d10, d11, [sp, #32]
    ldp       d12, d13, [sp, #16]
    ldp       d14, d15, [sp], #64       // last load also frees the frame
.endm
94
95    .global ime_compute_sad_16x16_fast_av8
96ime_compute_sad_16x16_fast_av8:
97    push_v_regs
98    sxtw      x2, w2
99    sxtw      x3, w3
100    lsl       x2, x2, #1
101    lsl       x3, x3, #1
102
103    mov       x6, #2
104    movi      v30.8h, #0
105
106core_loop_ime_compute_sad_16x16_fast_av8:
107
108    ld1       {v0.16b}, [x0], x2
109    ld1       {v1.16b}, [x1], x3
110    ld1       {v2.16b}, [x0], x2
111    ld1       {v3.16b}, [x1], x3
112
113    uabal     v30.8h, v0.8b, v1.8b
114    uabal2    v30.8h, v0.16b, v1.16b
115
116    uabal     v30.8h, v2.8b, v3.8b
117    uabal2    v30.8h, v2.16b, v3.16b
118
119    ld1       {v4.16b}, [x0], x2
120    ld1       {v5.16b}, [x1], x3
121    ld1       {v6.16b}, [x0], x2
122    ld1       {v7.16b}, [x1], x3
123
124    uabal     v30.8h, v4.8b, v5.8b
125    uabal2    v30.8h, v4.16b, v5.16b
126
127    uabal     v30.8h, v6.8b, v7.8b
128    uabal2    v30.8h, v6.16b, v7.16b
129
130    subs      x6, x6, #1
131    bne       core_loop_ime_compute_sad_16x16_fast_av8
132
133
134    addp      v30.8h, v30.8h, v30.8h
135    uaddlp    v30.4s, v30.8h
136    addp      v30.2s, v30.2s, v30.2s
137    shl       v30.2s, v30.2s, #1
138
139    st1       {v30.s}[0], [x5]
140    pop_v_regs
141    ret
142
143
144///**
145//******************************************************************************
146//*
147//*  @brief computes distortion (SAD) between 2 16x8  blocks
148//*
149//*
150//*  @par   Description
//*   This function computes SAD between 2 16x8 blocks. There is a provision
152//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
153//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
154//*
155//* @param[in] pu1_src
156//*  UWORD8 pointer to the source
157//*
158//* @param[out] pu1_dst
159//*  UWORD8 pointer to the destination
160//*
161//* @param[in] src_strd
162//*  integer source stride
163//*
164//* @param[in] dst_strd
165//*  integer destination stride
166//*
167//* @param[in] u4_max_sad
168//*  integer maximum allowed distortion
169//*
170//* @param[in] pi4_mb_distortion
171//*  integer evaluated sad
172//*
173//* @remarks
174//*
175//******************************************************************************
176//*/
177//
178    .global ime_compute_sad_16x8_av8
179ime_compute_sad_16x8_av8:
180
181    //chheck what stride incremtn to use
182    //earlier code did not have this lsl
183    push_v_regs
184    sxtw      x2, w2
185    sxtw      x3, w3
186    mov       x6, #2
187    movi      v30.8h, #0
188
189core_loop_ime_compute_sad_16x8_av8:
190
191    ld1       {v0.16b}, [x0], x2
192    ld1       {v1.16b}, [x1], x3
193    ld1       {v2.16b}, [x0], x2
194    ld1       {v3.16b}, [x1], x3
195
196    uabal     v30.8h, v0.8b, v1.8b
197    uabal2    v30.8h, v0.16b, v1.16b
198
199    uabal     v30.8h, v2.8b, v3.8b
200    uabal2    v30.8h, v2.16b, v3.16b
201
202    ld1       {v4.16b}, [x0], x2
203    ld1       {v5.16b}, [x1], x3
204    ld1       {v6.16b}, [x0], x2
205    ld1       {v7.16b}, [x1], x3
206
207    uabal     v30.8h, v4.8b, v5.8b
208    uabal2    v30.8h, v4.16b, v5.16b
209
210    uabal     v30.8h, v6.8b, v7.8b
211    uabal2    v30.8h, v6.16b, v7.16b
212
213    subs      x6, x6, #1
214    bne       core_loop_ime_compute_sad_16x8_av8
215
216
217    addp      v30.8h, v30.8h, v30.8h
218    uaddlp    v30.4s, v30.8h
219    addp      v30.2s, v30.2s, v30.2s
220
221    st1       {v30.s}[0], [x5]
222    pop_v_regs
223    ret
224
225///**
226//******************************************************************************
227//*
228//* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
229//*
230//* @par   Description
//*   This function computes SAD between 2 16x16 blocks. There is a provision
232//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
233//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
234//*
235//* @param[in] pu1_src
236//*  UWORD8 pointer to the source
237//*
238//* @param[out] pu1_dst
239//*  UWORD8 pointer to the destination
240//*
241//* @param[in] src_strd
242//*  integer source stride
243//*
244//* @param[in] dst_strd
245//*  integer destination stride
246//*
247//* @param[in] i4_max_sad
248//*  integer maximum allowed distortion
249//*
250//* @param[in] pi4_mb_distortion
251//*  integer evaluated sad
252//*
253//* @remarks
254//*
255//******************************************************************************
256//*/
257
258    .global ime_compute_sad_16x16_ea8_av8
259ime_compute_sad_16x16_ea8_av8:
260
261    push_v_regs
262    sxtw      x2, w2
263    sxtw      x3, w3
264    movi      v30.8h, #0
265
266    add       x7, x0, x2
267    add       x8, x1, x3
268
269    lsl       x2, x2, #1
270    lsl       x3, x3, #1
271
272    ld1       {v0.16b}, [x0], x2
273    ld1       {v1.16b}, [x1], x3
274    ld1       {v2.16b}, [x0], x2
275    ld1       {v3.16b}, [x1], x3
276    ld1       {v8.16b}, [x0], x2
277    ld1       {v9.16b}, [x1], x3
278    ld1       {v10.16b}, [x0], x2
279    ld1       {v11.16b}, [x1], x3
280    ld1       {v12.16b}, [x0], x2
281    ld1       {v13.16b}, [x1], x3
282    ld1       {v14.16b}, [x0], x2
283    ld1       {v15.16b}, [x1], x3
284    ld1       {v16.16b}, [x0], x2
285    ld1       {v17.16b}, [x1], x3
286    ld1       {v18.16b}, [x0], x2
287    ld1       {v19.16b}, [x1], x3
288
289    uabal     v30.8h, v0.8b, v1.8b
290    uabal2    v30.8h, v0.16b, v1.16b
291
292    uabal     v30.8h, v2.8b, v3.8b
293    uabal2    v30.8h, v2.16b, v3.16b
294
295    uabal     v30.8h, v8.8b, v9.8b
296    uabal2    v30.8h, v8.16b, v9.16b
297
298    uabal     v30.8h, v10.8b, v11.8b
299    uabal2    v30.8h, v10.16b, v11.16b
300
301    uabal     v30.8h, v12.8b, v13.8b
302    uabal2    v30.8h, v12.16b, v13.16b
303
304    uabal     v30.8h, v14.8b, v15.8b
305    uabal2    v30.8h, v14.16b, v15.16b
306
307    uabal     v30.8h, v16.8b, v17.8b
308    uabal2    v30.8h, v16.16b, v17.16b
309
310    uabal     v30.8h, v18.8b, v19.8b
311    uabal2    v30.8h, v18.16b, v19.16b
312
313    addp      v31.8h, v30.8h, v30.8h
314    uaddlp    v31.4s, v31.8h
315    addp      v31.2s, v31.2s, v31.2s
316    mov       w6, v31.s[0]
317    cmp       w6, w4
318    bgt       end_func_16x16
319
320    //do the stuff again
321    ld1       {v0.16b}, [x7], x2
322    ld1       {v1.16b}, [x8], x3
323    ld1       {v2.16b}, [x7], x2
324    ld1       {v3.16b}, [x8], x3
325    ld1       {v8.16b}, [x7], x2
326    ld1       {v9.16b}, [x8], x3
327    ld1       {v10.16b}, [x7], x2
328    ld1       {v11.16b}, [x8], x3
329    ld1       {v12.16b}, [x7], x2
330    ld1       {v13.16b}, [x8], x3
331    ld1       {v14.16b}, [x7], x2
332    ld1       {v15.16b}, [x8], x3
333    ld1       {v16.16b}, [x7], x2
334    ld1       {v17.16b}, [x8], x3
335    ld1       {v18.16b}, [x7], x2
336    ld1       {v19.16b}, [x8], x3
337
338    uabal     v30.8h, v0.8b, v1.8b
339    uabal2    v30.8h, v0.16b, v1.16b
340
341    uabal     v30.8h, v2.8b, v3.8b
342    uabal2    v30.8h, v2.16b, v3.16b
343
344    uabal     v30.8h, v8.8b, v9.8b
345    uabal2    v30.8h, v8.16b, v9.16b
346
347    uabal     v30.8h, v10.8b, v11.8b
348    uabal2    v30.8h, v10.16b, v11.16b
349
350    uabal     v30.8h, v12.8b, v13.8b
351    uabal2    v30.8h, v12.16b, v13.16b
352
353    uabal     v30.8h, v14.8b, v15.8b
354    uabal2    v30.8h, v14.16b, v15.16b
355
356    uabal     v30.8h, v16.8b, v17.8b
357    uabal2    v30.8h, v16.16b, v17.16b
358
359    uabal     v30.8h, v18.8b, v19.8b
360    uabal2    v30.8h, v18.16b, v19.16b
361
362    addp      v31.8h, v30.8h, v30.8h
363    uaddlp    v31.4s, v31.8h
364    addp      v31.2s, v31.2s, v31.2s
365
366end_func_16x16:
367    st1       {v31.s}[0], [x5]
368    pop_v_regs
369    ret
370
371
372///*
373////---------------------------------------------------------------------------
374//// Function Name      : ime_calculate_sad2_prog_av8()
375////
376//// Detail Description : This function find the sad values of 4 Progressive MBs
377////                        at one shot
378////
379//// Platform           : CortexAv8/NEON            .
380////
381////-----------------------------------------------------------------------------
382//*/
383
384    .global ime_calculate_sad2_prog_av8
385ime_calculate_sad2_prog_av8:
386
387    // x0    = ref1     <UWORD8 *>
388    // x1    = ref2     <UWORD8 *>
389    // x2    = src     <UWORD8 *>
390    // w3    = RefBufferWidth <UWORD32>
391    // w4    = CurBufferWidth <UWORD32>
392    // x5    = psad <UWORD32 *>
393    push_v_regs
394    sxtw      x3, w3
395    sxtw      x4, w4
396    mov       x6, #8
397    movi      v30.8h, #0
398    movi      v31.8h, #0
399
400core_loop_ime_calculate_sad2_prog_av8:
401
402    ld1       {v0.16b}, [x0], x3
403    ld1       {v1.16b}, [x1], x3
404    ld1       {v2.16b}, [x3], x4
405
406    ld1       {v3.16b}, [x0], x3
407    ld1       {v4.16b}, [x1], x3
408    ld1       {v5.16b}, [x3], x4
409
410
411    uabal     v30.8h, v0.8b, v2.8b
412    uabal2    v30.8h, v0.16b, v2.16b
413    uabal     v31.8h, v1.8b, v2.8b
414    uabal2    v31.8h, v1.16b, v2.16b
415
416    uabal     v30.8h, v3.8b, v5.8b
417    uabal2    v30.8h, v3.16b, v5.16b
418    uabal     v31.8h, v4.8b, v5.8b
419    uabal2    v31.8h, v4.16b, v5.16b
420
421
422    ld1       {v6.16b}, [x0], x3
423    ld1       {v7.16b}, [x1], x3
424    ld1       {v8.16b}, [x3], x4
425
426    ld1       {v9.16b}, [x0], x3
427    ld1       {v10.16b}, [x1], x3
428    ld1       {v11.16b}, [x3], x4
429
430    uabal     v30.8h, v6.8b, v8.8b
431    uabal2    v30.8h, v6.16b, v8.16b
432    uabal     v31.8h, v7.8b, v8.8b
433    uabal2    v31.8h, v7.16b, v8.16b
434
435    uabal     v30.8h, v9.8b, v11.8b
436    uabal2    v30.8h, v9.16b, v11.16b
437    uabal     v31.8h, v10.8b, v11.8b
438    uabal2    v31.8h, v0.16b, v11.16b
439
440    subs      x6, x6, #1
441    bne       core_loop_ime_calculate_sad2_prog_av8
442
443    addp      v30.8h, v30.8h, v31.8h
444    uaddlp    v30.4s, v30.8h
445    addp      v30.2s, v30.2s, v30.2s
446    shl       v30.2s, v30.2s, #1
447
448    st1       {v30.2s}, [x5]
449    pop_v_regs
450    ret
451
452///*
453////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad3_prog_av8()
455////
456//// Detail Description : This function find the sad values of 4 Progressive MBs
457////                        at one shot
458////
459//// Platform           : CortexA8/NEON            .
460////
461////-----------------------------------------------------------------------------
462//*/
463
464    .global ime_calculate_sad3_prog_av8
465ime_calculate_sad3_prog_av8:
466
467    // x0    = ref1     <UWORD8 *>
468    // x1    = ref2     <UWORD8 *>
469    // x2    = ref3     <UWORD8 *>
470    // x3    = src     <UWORD8 *>
471    // w4    = RefBufferWidth <UWORD32>
472    // w5    = CurBufferWidth <UWORD32>
473    // x6    = psad <UWORD32 *>
474
475
476    push_v_regs
477    sxtw      x4, w4
478    sxtw      x5, w5
479    mov       x7, #16
480    movi      v29.8h, #0
481    movi      v30.8h, #0
482    movi      v31.8h, #0
483
484core_loop_ime_calculate_sad3_prog_av8:
485
486    ld1       {v0.16b}, [x0], x4
487    ld1       {v1.16b}, [x1], x4
488    ld1       {v2.16b}, [x2], x4
489    ld1       {v3.16b}, [x3], x5
490
491    uabal     v29.8h, v0.8b, v3.8b
492    uabal2    v29.8h, v0.16b, v3.16b
493    uabal     v30.8h, v1.8b, v3.8b
494    uabal2    v30.8h, v1.16b, v3.16b
495    uabal     v31.8h, v2.8b, v3.8b
496    uabal2    v31.8h, v2.16b, v3.16b
497
498    ld1       {v4.16b}, [x0], x4
499    ld1       {v5.16b}, [x1], x4
500    ld1       {v6.16b}, [x2], x4
501    ld1       {v7.16b}, [x3], x5
502
503    uabal     v29.8h, v4.8b, v7.8b
504    uabal2    v29.8h, v4.16b, v7.16b
505    uabal     v30.8h, v5.8b, v7.8b
506    uabal2    v30.8h, v5.16b, v7.16b
507    uabal     v31.8h, v6.8b, v7.8b
508    uabal2    v31.8h, v6.16b, v7.16b
509
510    subs      x7, x7, #1
511    bne       core_loop_ime_calculate_sad3_prog_av8
512
513    addp      v30.8h, v30.8h, v31.8h
514    uaddlp    v30.4s, v30.8h
515    addp      v30.2s, v30.2s, v30.2s
516    shl       v30.2s, v30.2s, #1
517
518    st1       {v30.2s}, [x6]
519    pop_v_regs
520    ret
521
522
523
524
525///**
526//******************************************************************************
527//*
528//* @brief computes distortion (SAD) for sub-pel motion estimation
529//*
530//* @par   Description
//*   This function computes SAD for all the 8 half pel points
532//*
533//* @param[out] pi4_sad
534//*  integer evaluated sad
535//*  pi4_sad[0] - half x
536//*  pi4_sad[1] - half x - 1
537//*  pi4_sad[2] - half y
538//*  pi4_sad[3] - half y - 1
539//*  pi4_sad[4] - half xy
540//*  pi4_sad[5] - half xy - 1
541//*  pi4_sad[6] - half xy - strd
542//*  pi4_sad[7] - half xy - 1 - strd
543//*
544//* @remarks
545//*
546//******************************************************************************
547//*/
548
549.text
550.p2align 2
551
552    .global ime_sub_pel_compute_sad_16x16_av8
553ime_sub_pel_compute_sad_16x16_av8:
554    push_v_regs
555    sxtw      x4, w4
556    sxtw      x5, w5
557    sub       x7, x1, #1                //x left
558    sub       x8, x2, x5                //y top
559    sub       x9, x3, #1                //xy  left
560    sub       x10, x3, x5               //xy top
561    sub       x11, x10, #1              //xy top left
562
563    movi      v24.8h, #0
564    movi      v25.8h, #0
565    movi      v26.8h, #0
566    movi      v27.8h, #0
567    movi      v28.8h, #0
568    movi      v29.8h, #0
569    movi      v30.8h, #0
570    movi      v31.8h, #0
571
572    mov       x12, #16
573core_loop_ime_sub_pel_compute_sad_16x16_av8:
574
575    ld1       {v0.16b}, [x0], x4        //src
576    ld1       {v1.16b}, [x1], x5        //x
577    ld1       {v2.16b}, [x7], x5        //x left
578    ld1       {v3.16b}, [x2], x5        //y
579    ld1       {v9.16b}, [x8], x5        //y top
580    ld1       {v10.16b}, [x3], x5       //xy
581    ld1       {v11.16b}, [x9], x5       //xy left
582    ld1       {v12.16b}, [x10], x5      //xy top
583    ld1       {v13.16b}, [x11], x5      //xy top left
584
585    uabal     v24.8h, v0.8b, v1.8b
586    uabal2    v24.8h, v0.16b, v1.16b
587    uabal     v25.8h, v0.8b, v2.8b
588    uabal2    v25.8h, v0.16b, v2.16b
589    uabal     v26.8h, v0.8b, v3.8b
590    uabal2    v26.8h, v0.16b, v3.16b
591    uabal     v27.8h, v0.8b, v9.8b
592    uabal2    v27.8h, v0.16b, v9.16b
593    uabal     v28.8h, v0.8b, v10.8b
594    uabal2    v28.8h, v0.16b, v10.16b
595    uabal     v29.8h, v0.8b, v11.8b
596    uabal2    v29.8h, v0.16b, v11.16b
597    uabal     v30.8h, v0.8b, v12.8b
598    uabal2    v30.8h, v0.16b, v12.16b
599    uabal     v31.8h, v0.8b, v13.8b
600    uabal2    v31.8h, v0.16b, v13.16b
601
602    subs      x12, x12, #1
603    bne       core_loop_ime_sub_pel_compute_sad_16x16_av8
604
605    addp      v24.8h, v24.8h, v25.8h
606    addp      v26.8h, v26.8h, v27.8h
607    addp      v28.8h, v28.8h, v29.8h
608    addp      v30.8h, v30.8h, v31.8h
609
610    uaddlp    v24.4s, v24.8h
611    uaddlp    v26.4s, v26.8h
612    uaddlp    v28.4s, v28.8h
613    uaddlp    v30.4s, v30.8h
614
615    addp      v24.4s, v24.4s, v26.4s
616    addp      v25.4s, v28.4s, v30.4s
617
618    st1       {v24.4s-v25.4s}, [x6]
619
620
621    pop_v_regs
622    ret
623
624
625///**
626//******************************************************************************
627//*
628//* @brief computes distortion (SAD) between 2 16x16 blocks
629//*
630//* @par   Description
//*   This function computes SAD between 2 16x16 blocks. There is a provision
632//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
633//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
634//*
635//* @param[in] pu1_src
636//*  UWORD8 pointer to the source
637//*
638//* @param[out] pu1_dst
639//*  UWORD8 pointer to the destination
640//*
641//* @param[in] src_strd
642//*  integer source stride
643//*
644//* @param[in] dst_strd
645//*  integer destination stride
646//*
647//* @param[in] i4_max_sad
648//*  integer maximum allowed distortion
649//*
650//* @param[in] pi4_mb_distortion
651//*  integer evaluated sad
652//*
653//* @remarks
654//*
655//******************************************************************************
656//*/
657    .global ime_compute_sad_16x16_av8
658ime_compute_sad_16x16_av8:
659    push_v_regs
660    sxtw      x2, w2
661    sxtw      x3, w3
662    mov       x6, #4
663    movi      v30.8h, #0
664
665core_loop_ime_compute_sad_16x16_av8:
666
667    ld1       {v0.16b}, [x0], x2
668    ld1       {v1.16b}, [x1], x3
669    ld1       {v2.16b}, [x0], x2
670    ld1       {v3.16b}, [x1], x3
671
672    uabal     v30.8h, v0.8b, v1.8b
673    uabal2    v30.8h, v0.16b, v1.16b
674
675    uabal     v30.8h, v2.8b, v3.8b
676    uabal2    v30.8h, v2.16b, v3.16b
677
678    ld1       {v4.16b}, [x0], x2
679    ld1       {v5.16b}, [x1], x3
680    ld1       {v6.16b}, [x0], x2
681    ld1       {v7.16b}, [x1], x3
682
683    uabal     v30.8h, v4.8b, v5.8b
684    uabal2    v30.8h, v4.16b, v5.16b
685
686    uabal     v30.8h, v6.8b, v7.8b
687    uabal2    v30.8h, v6.16b, v7.16b
688
689    subs      x6, x6, #1
690    bne       core_loop_ime_compute_sad_16x16_av8
691
692
693    addp      v30.8h, v30.8h, v30.8h
694    uaddlp    v30.4s, v30.8h
695    addp      v30.2s, v30.2s, v30.2s
696
697    st1       {v30.s}[0], [x5]
698    pop_v_regs
699    ret
700
701
702///*
703////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad4_prog_av8()
705////
706//// Detail Description : This function find the sad values of 4 Progressive MBs
707////                        at one shot
708////
709//// Platform           : CortexA8/NEON            .
710////
711////-----------------------------------------------------------------------------
712//*/
713
714    .global ime_calculate_sad4_prog_av8
715ime_calculate_sad4_prog_av8:
716    push_v_regs
717    sxtw      x2, w2
718    sxtw      x3, w3
719    sub       x5, x0, #1                //left
720    add       x6, x0, #1                //right
721    sub       x7, x0, x2                //top
722    add       x8, x0, x2                //bottom
723
724    movi      v28.8h, #0
725    movi      v29.8h, #0
726    movi      v30.8h, #0
727    movi      v31.8h, #0
728
729    mov       x9, #16
730core_loop_ime_calculate_sad4_prog_av8:
731
732    ld1       {v0.16b}, [x1], x3
733    ld1       {v1.16b}, [x5], x2
734    ld1       {v2.16b}, [x6], x2
735    ld1       {v3.16b}, [x7], x2
736    ld1       {v9.16b}, [x8], x2
737
738    uabal     v28.8h, v0.8b, v1.8b
739    uabal2    v28.8h, v0.16b, v1.16b
740    uabal     v29.8h, v0.8b, v2.8b
741    uabal2    v29.8h, v0.16b, v2.16b
742    uabal     v30.8h, v0.8b, v3.8b
743    uabal2    v30.8h, v0.16b, v3.16b
744    uabal     v31.8h, v0.8b, v9.8b
745    uabal2    v31.8h, v0.16b, v9.16b
746
747    subs      x9, x9, #1
748    bne       core_loop_ime_calculate_sad4_prog_av8
749
750    addp      v28.8h, v28.8h, v29.8h
751    addp      v30.8h, v30.8h, v31.8h
752
753    uaddlp    v28.4s, v28.8h
754    uaddlp    v30.4s, v30.8h
755
756    addp      v28.4s, v28.4s, v30.4s
757    st1       {v28.4s}, [x4]
758    pop_v_regs
759    ret
760
761
762
763//*****************************************************************************
764//*
765//* Function Name         : ime_compute_satqd_16x16_lumainter_av8
//* Description           : This function computes SAD for a 16x16 block.
//                        : It also computes if any 4x4 block will have a nonzero coefficient after transform and quant
768//
769//  Arguments             :   x0 :pointer to src buffer
770//                            x1 :pointer to est buffer
771//                            x2 :source stride
772//                            x3 :est stride
//                            x4, x5, x6 :threshold pointer, distortion pointer, is_nonzero pointer
774//*
775//* Values Returned   : NONE
776//*
777//* Register Usage    : x0-x11
778//* Stack Usage       :
779//* Cycles            : Around
//* Interruptibility  : Interruptible
781//*
782//* Known Limitations
783//*   \Assumptions    :
784//*
785//* Revision History  :
786//*         DD MM YYYY    Author(s)           Changes
787//*         14 04 2014    Harinarayanan K K  First version
788//*
789//*****************************************************************************
790    .global ime_compute_satqd_16x16_lumainter_av8
791ime_compute_satqd_16x16_lumainter_av8:
792    //x0 :pointer to src buffer
793    //x1 :pointer to est buffer
794    //w2 :Source stride
795    //w3 :Pred stride
796    //x4 :Threshold pointer
797    //x5 :Distortion,ie SAD
798    //x6 :is nonzero
799    //x7 :loop counter
800    push_v_regs
801    sxtw      x2, w2
802    sxtw      x3, w3
803    stp       d8, d9, [sp, #-16]!
804    stp       d10, d11, [sp, #-16]!
805    stp       d12, d13, [sp, #-16]!
806    stp       d14, d15, [sp, #-16]!
807
808    ld1       {v30.8h}, [x4]
809
810    dup       v20.4h, v30.h[1]          //ls1
811    dup       v24.4h, v30.h[0]          //ls2
812    dup       v21.4h, v30.h[5]          //ls3
813    dup       v25.4h, v30.h[7]          //ls4
814    dup       v22.4h, v30.h[3]          //ls5
815    dup       v26.4h, v30.h[4]          //ls6
816    dup       v23.4h, v30.h[6]          //ls7
817    dup       v27.4h, v30.h[2]          //ls8
818
819    mov       v20.d[1], v24.d[0]
820    mov       v21.d[1], v25.d[0]
821    mov       v22.d[1], v26.d[0]
822    mov       v23.d[1], v27.d[0]
823
824    add       x4, x4, #16
825    ld1       {v29.h}[0], [x4]
826    dup       v29.4h, v29.h[0]
827
828    movi      v31.8h, #0
829
830    mov       x7, #4
831core_loop_satqd_ime_compute_satqd_16x16_lumainter:
832    ld1       {v0.16b}, [x0], x2
833    ld1       {v1.16b}, [x1], x3
834    ld1       {v2.16b}, [x0], x2
835    ld1       {v3.16b}, [x1], x3
836    ld1       {v4.16b}, [x0], x2
837    ld1       {v5.16b}, [x1], x3
838    ld1       {v6.16b}, [x0], x2
839    ld1       {v7.16b}, [x1], x3
840
841    uabdl     v10.8h, v0.8b, v1.8b
842    uabdl2    v15.8h, v0.16b, v1.16b
843    uabdl     v11.8h, v2.8b, v3.8b
844    uabdl2    v16.8h, v2.16b, v3.16b
845    uabdl     v12.8h, v4.8b, v5.8b
846    uabdl2    v17.8h, v4.16b, v5.16b
847    uabdl     v13.8h, v6.8b, v7.8b
848    uabdl2    v18.8h, v6.16b, v7.16b
849
850    add       v0.8h, v10.8h, v13.8h
851    add       v1.8h, v11.8h, v12.8h
852    add       v2.8h, v15.8h, v18.8h
853    add       v3.8h, v16.8h, v17.8h
854
855    //v0 : S1     S4     S4     S1        A1    A4    A4    A1
856    //v1 : S2     S3     S3     S2        A2    A3    A3    A2
857    //v2 : B1     B4     B4     B1        X1    X4    X4    X1
858    //v3 : B3     B2     B2     B3        X3    X2    X2    X3
859
860    trn1      v4.8h, v0.8h, v1.8h
861    trn2      v5.8h, v0.8h, v1.8h
862    trn1      v6.8h, v2.8h, v3.8h
863    trn2      v7.8h, v2.8h, v3.8h
864
865    trn1      v0.4s, v4.4s, v6.4s
866    trn2      v2.4s, v4.4s, v6.4s
867    trn1      v1.4s, v5.4s, v7.4s
868    trn2      v3.4s, v5.4s, v7.4s
869
870    add       v4.8h, v0.8h, v3.8h
871    add       v5.8h, v1.8h, v2.8h
872    //v4 : S1     S2     B1     B2      A1    A2    X1    X2
873    //v5 : S4     S3     B4     B3      A4    A3    X4    X3
874
875    //compute sad for each 4x4 block
876    add       v6.8h, v4.8h, v5.8h
877    addp      v19.8h, v6.8h, v6.8h
878    //duplicate the sad into 128 bit so that we can compare using 128bit
879    add       v31.4h, v31.4h, v19.4h
880
881    //sad_2 = sad_1<<1;
882    shl       v28.8h, v19.8h, #1
883
884    //sad_2 - pu2_thrsh
885    sub       v24.8h, v28.8h, v20.8h
886    sub       v25.8h, v28.8h, v21.8h
887    sub       v26.8h, v28.8h, v22.8h
888    sub       v27.8h, v28.8h, v23.8h
889
890    trn1      v0.4s, v4.4s, v5.4s
891    trn2      v1.4s, v4.4s, v5.4s
892    //v0 : S1     S2     S4     S3      A1    A2    A4    A3
893    //v1 : B1     B2     B4     B3      X1    X2    X4    X3
894
895    trn1      v4.8h, v0.8h, v1.8h
896    trn2      v5.8h, v0.8h, v1.8h
897    //v4 : S1     B1     S4     B4      A1    X1    A4    X4
898    //v5 : S2     B2     S3     B3      A2    X2    A3    X3
899
900    mov       v7.s[0], v4.s[1]
901    mov       v7.s[1], v4.s[3]
902    mov       v6.s[0], v5.s[1]          // V4 //S1 B1 A1 X1
903    mov       v6.s[1], v5.s[3]          // V5 //S2 B2 A2 X2
904    mov       v4.s[1], v4.s[2]          // V6 //S3 B3 A3 X3
905    mov       v5.s[1], v5.s[2]          // V7 //S4 B4 A4 X4
906
907    shl       v0.4h, v4.4h, #1          //S1<<1
908    shl       v1.4h, v5.4h, #1          //S2<<1
909    shl       v2.4h, v6.4h, #1          //S3<<1
910    shl       v3.4h, v7.4h, #1          //S4<<1
911
912    add       v8.4h, v5.4h, v6.4h       //(s2[j] + s3[j]))
913    add       v9.4h, v4.4h, v7.4h       //(s1[j] + s4[j]))
914    add       v10.4h, v6.4h, v7.4h      //(s3[j] + s4[j]))
915    sub       v11.4h, v6.4h, v0.4h      //(s3[j] - (s1[j]<<1))
916    sub       v12.4h, v7.4h, v1.4h      //(s4[j] - (s2[j]<<1))
917    add       v13.4h, v4.4h, v5.4h      //(s1[j] + s2[j]))
918    sub       v14.4h, v5.4h, v3.4h      //(s2[j] - (s4[j]<<1)))
919    sub       v15.4h, v4.4h, v2.4h      //(s1[j] - (s3[j]<<1)))
920
921    mov       v8.d[1], v9.d[0]
922    mov       v10.d[1], v11.d[0]
923    mov       v12.d[1], v13.d[0]
924    mov       v14.d[1], v15.d[0]
925
926    cmge      v0.8h, v24.8h, v8.8h      //ls1 ls2
927    cmge      v1.8h, v25.8h, v10.8h     //ls3 ls4
928    cmge      v2.8h, v26.8h, v12.8h     //ls5 ls6
929    cmge      v3.8h, v27.8h, v14.8h     //ls7 ls8
930    cmge      v4.4h, v19.4h, v29.4h     //sad
931
932    orr       v0.16b, v0.16b, v1.16b
933    orr       v2.16b, v2.16b, v3.16b
934    orr       v2.16b, v0.16b, v2.16b
935    xtn       v2.8b, v2.8h
936    orr       v2.8b, v2.8b, v4.8b
937
938    //if the comparison is non zero, out
939    mov       x4, v2.d[0]
940    cmp       x4, #0
941    bne       core_loop_compute_sad_pre
942
943    subs      x7, x7, #1
944    bne       core_loop_satqd_ime_compute_satqd_16x16_lumainter
945    b         satdq_end_func
946
947
948core_loop_compute_sad:
949    ld1       {v0.16b}, [x0], x2
950    ld1       {v1.16b}, [x1], x3
951    ld1       {v2.16b}, [x0], x2
952    ld1       {v3.16b}, [x1], x3
953
954    uabal     v31.8h, v0.8b, v1.8b
955    uabal2    v31.8h, v0.16b, v1.16b
956
957    uabal     v31.8h, v2.8b, v3.8b
958    uabal2    v31.8h, v2.16b, v3.16b
959
960    ld1       {v4.16b}, [x0], x2
961    ld1       {v5.16b}, [x1], x3
962    ld1       {v6.16b}, [x0], x2
963    ld1       {v7.16b}, [x1], x3
964
965    uabal     v31.8h, v4.8b, v5.8b
966    uabal2    v31.8h, v4.16b, v5.16b
967
968    uabal     v31.8h, v6.8b, v7.8b
969    uabal2    v31.8h, v6.16b, v7.16b
970
971core_loop_compute_sad_pre:
972    subs      x7, x7, #1
973    bne       core_loop_compute_sad
974
975satdq_end_func:
976
977    mov       x7, #1
978    cmp       x4, #0
979    csel      x7, x4, x7, eq
980    str       w7, [x6]
981
982    addp      v31.8h, v31.8h, v31.8h
983    uaddlp    v31.4s, v31.8h
984    addp      v31.2s, v31.2s, v31.2s
985    st1       {v31.s}[0], [x5]
986
987
988    ldp       d14, d15, [sp], #16
989    ldp       d12, d13, [sp], #16
990    ldp       d10, d11, [sp], #16
991    ldp       d8, d9, [sp], #16
992    pop_v_regs
993    ret
994