1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20//**
21
22///**
23//******************************************************************************
24//*
25//*
26//* @brief
27//*  This file contains definitions of routines that compute distortion
28//*  between two macro/sub blocks of identical dimensions
29//*
30//* @author
31//*  Ittiam
32//*
33//* @par List of Functions:
34//*  - ime_compute_sad_16x16()
35//*  - ime_compute_sad_8x8()
36//*  - ime_compute_sad_4x4()
37//*  - ime_compute_sad_16x8()
38//*  - ime_compute_satqd_16x16_lumainter_av8()
39//*
40//* @remarks
41//*  None
42//*
43//*******************************************************************************
44//
45
46
47///**
48//******************************************************************************
49//*
50//* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
51//*
52//* @par   Description
53//*   This functions computes SAD between 2 16x16 blocks. There is a provision
54//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
55//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
56//*
57//* @param[in] pu1_src
58//*  UWORD8 pointer to the source
59//*
60//* @param[out] pu1_dst
61//*  UWORD8 pointer to the destination
62//*
63//* @param[in] src_strd
64//*  integer source stride
65//*
66//* @param[in] dst_strd
67//*  integer destination stride
68//*
69//* @param[in] i4_max_sad
70//*  integer maximum allowed distortion
71//*
72//* @param[in] pi4_mb_distortion
73//*  integer evaluated sad
74//*
75//* @remarks
76//*
77//******************************************************************************
78//*/
79.text
80.p2align 2
81
.macro push_v_regs
    // Spill d8-d15 (the callee-saved halves of v8-v15) into a 64-byte frame.
    // SP is lowered once; the slots end up exactly where pop_v_regs expects
    // them: d14/d15 at [sp], d12/d13 at [sp, #16], d10/d11 at [sp, #32] and
    // d8/d9 at [sp, #48].
    stp       d14, d15, [sp, #-64]!
    stp       d12, d13, [sp, #16]
    stp       d10, d11, [sp, #32]
    stp       d8, d9, [sp, #48]
.endm
.macro pop_v_regs
    // Reload d8-d15 from the 64-byte frame written by push_v_regs
    // (d14/d15 at [sp] ... d8/d9 at [sp, #48]) and release the frame with a
    // single SP update on the last load.
    ldp       d8, d9, [sp, #48]
    ldp       d10, d11, [sp, #32]
    ldp       d12, d13, [sp, #16]
    ldp       d14, d15, [sp], #64
.endm
94
95    .global ime_compute_sad_16x16_fast_av8
96ime_compute_sad_16x16_fast_av8:
97    push_v_regs
98    lsl       x2, x2, #1
99    lsl       x3, x3, #1
100
101    mov       x6, #2
102    movi      v30.8h, #0
103
104core_loop_ime_compute_sad_16x16_fast_av8:
105
106    ld1       {v0.16b}, [x0], x2
107    ld1       {v1.16b}, [x1], x3
108    ld1       {v2.16b}, [x0], x2
109    ld1       {v3.16b}, [x1], x3
110
111    uabal     v30.8h, v0.8b, v1.8b
112    uabal2    v30.8h, v0.16b, v1.16b
113
114    uabal     v30.8h, v2.8b, v3.8b
115    uabal2    v30.8h, v2.16b, v3.16b
116
117    ld1       {v4.16b}, [x0], x2
118    ld1       {v5.16b}, [x1], x3
119    ld1       {v6.16b}, [x0], x2
120    ld1       {v7.16b}, [x1], x3
121
122    uabal     v30.8h, v4.8b, v5.8b
123    uabal2    v30.8h, v4.16b, v5.16b
124
125    uabal     v30.8h, v6.8b, v7.8b
126    uabal2    v30.8h, v6.16b, v7.16b
127
128    subs      x6, x6, #1
129    bne       core_loop_ime_compute_sad_16x16_fast_av8
130
131
132    addp      v30.8h, v30.8h, v30.8h
133    uaddlp    v30.4s, v30.8h
134    addp      v30.2s, v30.2s, v30.2s
135    shl       v30.2s, v30.2s, #1
136
137    st1       {v30.s}[0], [x5]
138    pop_v_regs
139    ret
140
141
142///**
143//******************************************************************************
144//*
145//*  @brief computes distortion (SAD) between 2 16x8  blocks
146//*
147//*
148//*  @par   Description
149//*   This functions computes SAD between 2 16x8 blocks. There is a provision
150//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
151//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
152//*
153//* @param[in] pu1_src
154//*  UWORD8 pointer to the source
155//*
156//* @param[out] pu1_dst
157//*  UWORD8 pointer to the destination
158//*
159//* @param[in] src_strd
160//*  integer source stride
161//*
162//* @param[in] dst_strd
163//*  integer destination stride
164//*
165//* @param[in] u4_max_sad
166//*  integer maximum allowed distortion
167//*
168//* @param[in] pi4_mb_distortion
169//*  integer evaluated sad
170//*
171//* @remarks
172//*
173//******************************************************************************
174//*/
175//
176    .global ime_compute_sad_16x8_av8
177ime_compute_sad_16x8_av8:
178
179    //chheck what stride incremtn to use
180    //earlier code did not have this lsl
181    push_v_regs
182    mov       x6, #2
183    movi      v30.8h, #0
184
185core_loop_ime_compute_sad_16x8_av8:
186
187    ld1       {v0.16b}, [x0], x2
188    ld1       {v1.16b}, [x1], x3
189    ld1       {v2.16b}, [x0], x2
190    ld1       {v3.16b}, [x1], x3
191
192    uabal     v30.8h, v0.8b, v1.8b
193    uabal2    v30.8h, v0.16b, v1.16b
194
195    uabal     v30.8h, v2.8b, v3.8b
196    uabal2    v30.8h, v2.16b, v3.16b
197
198    ld1       {v4.16b}, [x0], x2
199    ld1       {v5.16b}, [x1], x3
200    ld1       {v6.16b}, [x0], x2
201    ld1       {v7.16b}, [x1], x3
202
203    uabal     v30.8h, v4.8b, v5.8b
204    uabal2    v30.8h, v4.16b, v5.16b
205
206    uabal     v30.8h, v6.8b, v7.8b
207    uabal2    v30.8h, v6.16b, v7.16b
208
209    subs      x6, x6, #1
210    bne       core_loop_ime_compute_sad_16x8_av8
211
212
213    addp      v30.8h, v30.8h, v30.8h
214    uaddlp    v30.4s, v30.8h
215    addp      v30.2s, v30.2s, v30.2s
216
217    st1       {v30.s}[0], [x5]
218    pop_v_regs
219    ret
220
221///**
222//******************************************************************************
223//*
224//* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
225//*
226//* @par   Description
227//*   This functions computes SAD between 2 16x16 blocks. There is a provision
228//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
229//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
230//*
231//* @param[in] pu1_src
232//*  UWORD8 pointer to the source
233//*
234//* @param[out] pu1_dst
235//*  UWORD8 pointer to the destination
236//*
237//* @param[in] src_strd
238//*  integer source stride
239//*
240//* @param[in] dst_strd
241//*  integer destination stride
242//*
243//* @param[in] i4_max_sad
244//*  integer maximum allowed distortion
245//*
246//* @param[in] pi4_mb_distortion
247//*  integer evaluated sad
248//*
249//* @remarks
250//*
251//******************************************************************************
252//*/
253
254    .global ime_compute_sad_16x16_ea8_av8
255ime_compute_sad_16x16_ea8_av8:
256
257    push_v_regs
258    movi      v30.8h, #0
259
260    add       x7, x0, x2
261    add       x8, x1, x3
262
263    lsl       x2, x2, #1
264    lsl       x3, x3, #1
265
266    ld1       {v0.16b}, [x0], x2
267    ld1       {v1.16b}, [x1], x3
268    ld1       {v2.16b}, [x0], x2
269    ld1       {v3.16b}, [x1], x3
270    ld1       {v8.16b}, [x0], x2
271    ld1       {v9.16b}, [x1], x3
272    ld1       {v10.16b}, [x0], x2
273    ld1       {v11.16b}, [x1], x3
274    ld1       {v12.16b}, [x0], x2
275    ld1       {v13.16b}, [x1], x3
276    ld1       {v14.16b}, [x0], x2
277    ld1       {v15.16b}, [x1], x3
278    ld1       {v16.16b}, [x0], x2
279    ld1       {v17.16b}, [x1], x3
280    ld1       {v18.16b}, [x0], x2
281    ld1       {v19.16b}, [x1], x3
282
283    uabal     v30.8h, v0.8b, v1.8b
284    uabal2    v30.8h, v0.16b, v1.16b
285
286    uabal     v30.8h, v2.8b, v3.8b
287    uabal2    v30.8h, v2.16b, v3.16b
288
289    uabal     v30.8h, v8.8b, v9.8b
290    uabal2    v30.8h, v8.16b, v9.16b
291
292    uabal     v30.8h, v10.8b, v11.8b
293    uabal2    v30.8h, v10.16b, v11.16b
294
295    uabal     v30.8h, v12.8b, v13.8b
296    uabal2    v30.8h, v12.16b, v13.16b
297
298    uabal     v30.8h, v14.8b, v15.8b
299    uabal2    v30.8h, v14.16b, v15.16b
300
301    uabal     v30.8h, v16.8b, v17.8b
302    uabal2    v30.8h, v16.16b, v17.16b
303
304    uabal     v30.8h, v18.8b, v19.8b
305    uabal2    v30.8h, v18.16b, v19.16b
306
307    addp      v31.8h, v30.8h, v30.8h
308    uaddlp    v31.4s, v31.8h
309    addp      v31.2s, v31.2s, v31.2s
310    mov       w6, v31.s[0]
311    cmp       w6, w4
312    bgt       end_func_16x16
313
314    //do the stuff again
315    ld1       {v0.16b}, [x7], x2
316    ld1       {v1.16b}, [x8], x3
317    ld1       {v2.16b}, [x7], x2
318    ld1       {v3.16b}, [x8], x3
319    ld1       {v8.16b}, [x7], x2
320    ld1       {v9.16b}, [x8], x3
321    ld1       {v10.16b}, [x7], x2
322    ld1       {v11.16b}, [x8], x3
323    ld1       {v12.16b}, [x7], x2
324    ld1       {v13.16b}, [x8], x3
325    ld1       {v14.16b}, [x7], x2
326    ld1       {v15.16b}, [x8], x3
327    ld1       {v16.16b}, [x7], x2
328    ld1       {v17.16b}, [x8], x3
329    ld1       {v18.16b}, [x7], x2
330    ld1       {v19.16b}, [x8], x3
331
332    uabal     v30.8h, v0.8b, v1.8b
333    uabal2    v30.8h, v0.16b, v1.16b
334
335    uabal     v30.8h, v2.8b, v3.8b
336    uabal2    v30.8h, v2.16b, v3.16b
337
338    uabal     v30.8h, v8.8b, v9.8b
339    uabal2    v30.8h, v8.16b, v9.16b
340
341    uabal     v30.8h, v10.8b, v11.8b
342    uabal2    v30.8h, v10.16b, v11.16b
343
344    uabal     v30.8h, v12.8b, v13.8b
345    uabal2    v30.8h, v12.16b, v13.16b
346
347    uabal     v30.8h, v14.8b, v15.8b
348    uabal2    v30.8h, v14.16b, v15.16b
349
350    uabal     v30.8h, v16.8b, v17.8b
351    uabal2    v30.8h, v16.16b, v17.16b
352
353    uabal     v30.8h, v18.8b, v19.8b
354    uabal2    v30.8h, v18.16b, v19.16b
355
356    addp      v31.8h, v30.8h, v30.8h
357    uaddlp    v31.4s, v31.8h
358    addp      v31.2s, v31.2s, v31.2s
359
360end_func_16x16:
361    st1       {v31.s}[0], [x5]
362    pop_v_regs
363    ret
364
365
366///*
367////---------------------------------------------------------------------------
368//// Function Name      : ime_calculate_sad2_prog_av8()
369////
//// Detail Description : This function finds the SAD values of the source MB
////                        against 2 reference MBs in one pass
372////
373//// Platform           : CortexAv8/NEON            .
374////
375////-----------------------------------------------------------------------------
376//*/
377
378    .global ime_calculate_sad2_prog_av8
379ime_calculate_sad2_prog_av8:
380
381    // x0    = ref1     <UWORD8 *>
382    // x1    = ref2     <UWORD8 *>
383    // x2    = src     <UWORD8 *>
384    // x3    = RefBufferWidth <UWORD32>
385    // stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>
386    push_v_regs
387    mov       x6, #8
388    movi      v30.8h, #0
389    movi      v31.8h, #0
390
391core_loop_ime_calculate_sad2_prog_av8:
392
393    ld1       {v0.16b}, [x0], x3
394    ld1       {v1.16b}, [x1], x3
395    ld1       {v2.16b}, [x3], x4
396
397    ld1       {v3.16b}, [x0], x3
398    ld1       {v4.16b}, [x1], x3
399    ld1       {v5.16b}, [x3], x4
400
401
402    uabal     v30.8h, v0.8b, v2.8b
403    uabal2    v30.8h, v0.16b, v2.16b
404    uabal     v31.8h, v1.8b, v2.8b
405    uabal2    v31.8h, v1.16b, v2.16b
406
407    uabal     v30.8h, v3.8b, v5.8b
408    uabal2    v30.8h, v3.16b, v5.16b
409    uabal     v31.8h, v4.8b, v5.8b
410    uabal2    v31.8h, v4.16b, v5.16b
411
412
413    ld1       {v6.16b}, [x0], x3
414    ld1       {v7.16b}, [x1], x3
415    ld1       {v8.16b}, [x3], x4
416
417    ld1       {v9.16b}, [x0], x3
418    ld1       {v10.16b}, [x1], x3
419    ld1       {v11.16b}, [x3], x4
420
421    uabal     v30.8h, v6.8b, v8.8b
422    uabal2    v30.8h, v6.16b, v8.16b
423    uabal     v31.8h, v7.8b, v8.8b
424    uabal2    v31.8h, v7.16b, v8.16b
425
426    uabal     v30.8h, v9.8b, v11.8b
427    uabal2    v30.8h, v9.16b, v11.16b
428    uabal     v31.8h, v10.8b, v11.8b
429    uabal2    v31.8h, v0.16b, v11.16b
430
431    subs      x6, x6, #1
432    bne       core_loop_ime_calculate_sad2_prog_av8
433
434    addp      v30.8h, v30.8h, v31.8h
435    uaddlp    v30.4s, v30.8h
436    addp      v30.2s, v30.2s, v30.2s
437    shl       v30.2s, v30.2s, #1
438
439    st1       {v30.2s}, [x5]
440    pop_v_regs
441    ret
442
443///*
444////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad3_prog_av8()
////
//// Detail Description : This function finds the SAD values of the source MB
////                        against 3 reference MBs in one pass
449////
450//// Platform           : CortexA8/NEON            .
451////
452////-----------------------------------------------------------------------------
453//*/
454
455    .global ime_calculate_sad3_prog_av8
456ime_calculate_sad3_prog_av8:
457
458    // x0    = ref1     <UWORD8 *>
459    // x1    = ref2     <UWORD8 *>
460    // x2    = ref3     <UWORD8 *>
461    // x3    = src     <UWORD8 *>
462    // stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>
463
464
465    // x0    = ref1     <UWORD8 *>
466    // x1    = ref2     <UWORD8 *>
467    // x2    = src     <UWORD8 *>
468    // x3    = RefBufferWidth <UWORD32>
469    // stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>
470    push_v_regs
471    mov       x6, #16
472    movi      v29.8h, #0
473    movi      v30.8h, #0
474    movi      v31.8h, #0
475
476core_loop_ime_calculate_sad3_prog_av8:
477
478    ld1       {v0.16b}, [x0], x4
479    ld1       {v1.16b}, [x1], x4
480    ld1       {v2.16b}, [x2], x4
481    ld1       {v3.16b}, [x3], x5
482
483    uabal     v29.8h, v0.8b, v3.8b
484    uabal2    v29.8h, v0.16b, v3.16b
485    uabal     v30.8h, v1.8b, v3.8b
486    uabal2    v30.8h, v1.16b, v3.16b
487    uabal     v31.8h, v2.8b, v3.8b
488    uabal2    v31.8h, v2.16b, v3.16b
489
490    ld1       {v4.16b}, [x0], x4
491    ld1       {v5.16b}, [x1], x4
492    ld1       {v6.16b}, [x2], x4
493    ld1       {v7.16b}, [x3], x5
494
495    uabal     v29.8h, v4.8b, v7.8b
496    uabal2    v29.8h, v4.16b, v7.16b
497    uabal     v30.8h, v5.8b, v7.8b
498    uabal2    v30.8h, v5.16b, v7.16b
499    uabal     v31.8h, v6.8b, v7.8b
500    uabal2    v31.8h, v6.16b, v7.16b
501
502    subs      x6, x6, #1
503    bne       core_loop_ime_calculate_sad2_prog_av8
504
505    addp      v30.8h, v30.8h, v31.8h
506    uaddlp    v30.4s, v30.8h
507    addp      v30.2s, v30.2s, v30.2s
508    shl       v30.2s, v30.2s, #1
509
510    st1       {v30.2s}, [x5]
511    pop_v_regs
512    ret
513
514
515
516
517///**
518//******************************************************************************
519//*
520//* @brief computes distortion (SAD) for sub-pel motion estimation
521//*
522//* @par   Description
523//*   This functions computes SAD for all the 8 half pel points
524//*
525//* @param[out] pi4_sad
526//*  integer evaluated sad
527//*  pi4_sad[0] - half x
528//*  pi4_sad[1] - half x - 1
529//*  pi4_sad[2] - half y
//*  pi4_sad[3] - half y - strd
531//*  pi4_sad[4] - half xy
532//*  pi4_sad[5] - half xy - 1
533//*  pi4_sad[6] - half xy - strd
534//*  pi4_sad[7] - half xy - 1 - strd
535//*
536//* @remarks
537//*
538//******************************************************************************
539//*/
540
541.text
542.p2align 2
543
544    .global ime_sub_pel_compute_sad_16x16_av8
545ime_sub_pel_compute_sad_16x16_av8:
546    push_v_regs
547    sub       x7, x1, #1                //x left
548    sub       x8, x2, x5                //y top
549    sub       x9, x3, #1                //xy  left
550    sub       x10, x3, x5               //xy top
551    sub       x11, x10, #1              //xy top left
552
553    movi      v24.8h, #0
554    movi      v25.8h, #0
555    movi      v26.8h, #0
556    movi      v27.8h, #0
557    movi      v28.8h, #0
558    movi      v29.8h, #0
559    movi      v30.8h, #0
560    movi      v31.8h, #0
561
562    mov       x12, #16
563core_loop_ime_sub_pel_compute_sad_16x16_av8:
564
565    ld1       {v0.16b}, [x0], x4        //src
566    ld1       {v1.16b}, [x1], x5        //x
567    ld1       {v2.16b}, [x7], x5        //x left
568    ld1       {v3.16b}, [x2], x5        //y
569    ld1       {v9.16b}, [x8], x5        //y top
570    ld1       {v10.16b}, [x3], x5       //xy
571    ld1       {v11.16b}, [x9], x5       //xy left
572    ld1       {v12.16b}, [x10], x5      //xy top
573    ld1       {v13.16b}, [x11], x5      //xy top left
574
575    uabal     v24.8h, v0.8b, v1.8b
576    uabal2    v24.8h, v0.16b, v1.16b
577    uabal     v25.8h, v0.8b, v2.8b
578    uabal2    v25.8h, v0.16b, v2.16b
579    uabal     v26.8h, v0.8b, v3.8b
580    uabal2    v26.8h, v0.16b, v3.16b
581    uabal     v27.8h, v0.8b, v9.8b
582    uabal2    v27.8h, v0.16b, v9.16b
583    uabal     v28.8h, v0.8b, v10.8b
584    uabal2    v28.8h, v0.16b, v10.16b
585    uabal     v29.8h, v0.8b, v11.8b
586    uabal2    v29.8h, v0.16b, v11.16b
587    uabal     v30.8h, v0.8b, v12.8b
588    uabal2    v30.8h, v0.16b, v12.16b
589    uabal     v31.8h, v0.8b, v13.8b
590    uabal2    v31.8h, v0.16b, v13.16b
591
592    subs      x12, x12, #1
593    bne       core_loop_ime_sub_pel_compute_sad_16x16_av8
594
595    addp      v24.8h, v24.8h, v25.8h
596    addp      v26.8h, v26.8h, v27.8h
597    addp      v28.8h, v28.8h, v29.8h
598    addp      v30.8h, v30.8h, v31.8h
599
600    uaddlp    v24.4s, v24.8h
601    uaddlp    v26.4s, v26.8h
602    uaddlp    v28.4s, v28.8h
603    uaddlp    v30.4s, v30.8h
604
605    addp      v24.4s, v24.4s, v26.4s
606    addp      v25.4s, v28.4s, v30.4s
607
608    st1       {v24.4s-v25.4s}, [x6]
609
610
611    pop_v_regs
612    ret
613
614
615///**
616//******************************************************************************
617//*
618//* @brief computes distortion (SAD) between 2 16x16 blocks
619//*
620//* @par   Description
621//*   This functions computes SAD between 2 16x16 blocks. There is a provision
622//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
623//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
624//*
625//* @param[in] pu1_src
626//*  UWORD8 pointer to the source
627//*
628//* @param[out] pu1_dst
629//*  UWORD8 pointer to the destination
630//*
631//* @param[in] src_strd
632//*  integer source stride
633//*
634//* @param[in] dst_strd
635//*  integer destination stride
636//*
637//* @param[in] i4_max_sad
638//*  integer maximum allowed distortion
639//*
640//* @param[in] pi4_mb_distortion
641//*  integer evaluated sad
642//*
643//* @remarks
644//*
645//******************************************************************************
646//*/
647    .global ime_compute_sad_16x16_av8
648ime_compute_sad_16x16_av8:
649    push_v_regs
650    mov       x6, #4
651    movi      v30.8h, #0
652
653core_loop_ime_compute_sad_16x16_av8:
654
655    ld1       {v0.16b}, [x0], x2
656    ld1       {v1.16b}, [x1], x3
657    ld1       {v2.16b}, [x0], x2
658    ld1       {v3.16b}, [x1], x3
659
660    uabal     v30.8h, v0.8b, v1.8b
661    uabal2    v30.8h, v0.16b, v1.16b
662
663    uabal     v30.8h, v2.8b, v3.8b
664    uabal2    v30.8h, v2.16b, v3.16b
665
666    ld1       {v4.16b}, [x0], x2
667    ld1       {v5.16b}, [x1], x3
668    ld1       {v6.16b}, [x0], x2
669    ld1       {v7.16b}, [x1], x3
670
671    uabal     v30.8h, v4.8b, v5.8b
672    uabal2    v30.8h, v4.16b, v5.16b
673
674    uabal     v30.8h, v6.8b, v7.8b
675    uabal2    v30.8h, v6.16b, v7.16b
676
677    subs      x6, x6, #1
678    bne       core_loop_ime_compute_sad_16x16_av8
679
680
681    addp      v30.8h, v30.8h, v30.8h
682    uaddlp    v30.4s, v30.8h
683    addp      v30.2s, v30.2s, v30.2s
684
685    st1       {v30.s}[0], [x5]
686    pop_v_regs
687    ret
688
689
690///*
691////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad4_prog_av8()
////
//// Detail Description : This function finds the SAD values of the source MB
////                        against the 4 neighbouring reference MBs (left,
////                        right, top, bottom) in one pass
696////
697//// Platform           : CortexA8/NEON            .
698////
699////-----------------------------------------------------------------------------
700//*/
701
702    .global ime_calculate_sad4_prog_av8
703ime_calculate_sad4_prog_av8:
704    push_v_regs
705    sub       x5, x0, #1                //left
706    add       x6, x0, #1                //right
707    sub       x7, x0, x2                //top
708    add       x8, x0, x2                //bottom
709
710    movi      v28.8h, #0
711    movi      v29.8h, #0
712    movi      v30.8h, #0
713    movi      v31.8h, #0
714
715    mov       x9, #16
716core_loop_ime_calculate_sad4_prog_av8:
717
718    ld1       {v0.16b}, [x1], x3
719    ld1       {v1.16b}, [x5], x2
720    ld1       {v2.16b}, [x6], x2
721    ld1       {v3.16b}, [x7], x2
722    ld1       {v9.16b}, [x8], x2
723
724    uabal     v28.8h, v0.8b, v1.8b
725    uabal2    v28.8h, v0.16b, v1.16b
726    uabal     v29.8h, v0.8b, v2.8b
727    uabal2    v29.8h, v0.16b, v2.16b
728    uabal     v30.8h, v0.8b, v3.8b
729    uabal2    v30.8h, v0.16b, v3.16b
730    uabal     v31.8h, v0.8b, v9.8b
731    uabal2    v31.8h, v0.16b, v9.16b
732
733    subs      x9, x9, #1
734    bne       core_loop_ime_calculate_sad4_prog_av8
735
736    addp      v28.8h, v28.8h, v29.8h
737    addp      v30.8h, v30.8h, v31.8h
738
739    uaddlp    v28.4s, v28.8h
740    uaddlp    v30.4s, v30.8h
741
742    addp      v28.4s, v28.4s, v30.4s
743    st1       {v28.4s}, [x4]
744    pop_v_regs
745    ret
746
747
748
749//*****************************************************************************
750//*
751//* Function Name         : ime_compute_satqd_16x16_lumainter_av8
//* Description           : This function computes SAD for a 16x16 block.
//                        : It also computes if any 4x4 block will have a nonzero coefficient after transform and quant
754//
755//  Arguments             :   x0 :pointer to src buffer
756//                            x1 :pointer to est buffer
757//                            x2 :source stride
758//                            x3 :est stride
759//                            STACk :Threshold,distotion,is_nonzero
760//*
761//* Values Returned   : NONE
762//*
763//* Register Usage    : x0-x11
764//* Stack Usage       :
765//* Cycles            : Around
766//* Interruptiaility  : Interruptable
767//*
768//* Known Limitations
769//*   \Assumptions    :
770//*
771//* Revision History  :
772//*         DD MM YYYY    Author(s)           Changes
773//*         14 04 2014    Harinarayanan K K  First version
774//*
775//*****************************************************************************
776    .global ime_compute_satqd_16x16_lumainter_av8
777ime_compute_satqd_16x16_lumainter_av8:
778    //x0 :pointer to src buffer
779    //x1 :pointer to est buffer
780    //x2 :Source stride
781    //x3 :Pred stride
782    //x4 :Threshold pointer
783    //x5 :Distortion,ie SAD
784    //x6 :is nonzero
785    //x7 :loop counter
786    push_v_regs
787    stp       d8, d9, [sp, #-16]!
788    stp       d10, d11, [sp, #-16]!
789    stp       d12, d13, [sp, #-16]!
790    stp       d14, d15, [sp, #-16]!
791
792    ld1       {v30.8h}, [x4]
793
794    dup       v20.4h, v30.h[1]          //ls1
795    dup       v24.4h, v30.h[0]          //ls2
796    dup       v21.4h, v30.h[5]          //ls3
797    dup       v25.4h, v30.h[7]          //ls4
798    dup       v22.4h, v30.h[3]          //ls5
799    dup       v26.4h, v30.h[4]          //ls6
800    dup       v23.4h, v30.h[6]          //ls7
801    dup       v27.4h, v30.h[2]          //ls8
802
803    mov       v20.d[1], v24.d[0]
804    mov       v21.d[1], v25.d[0]
805    mov       v22.d[1], v26.d[0]
806    mov       v23.d[1], v27.d[0]
807
808    add       x4, x4, #16
809    ld1       {v29.h}[0], [x4]
810    dup       v29.4h, v29.h[0]
811
812    movi      v31.8h, #0
813
814    mov       x7, #4
815core_loop_satqd_ime_compute_satqd_16x16_lumainter:
816    ld1       {v0.16b}, [x0], x2
817    ld1       {v1.16b}, [x1], x3
818    ld1       {v2.16b}, [x0], x2
819    ld1       {v3.16b}, [x1], x3
820    ld1       {v4.16b}, [x0], x2
821    ld1       {v5.16b}, [x1], x3
822    ld1       {v6.16b}, [x0], x2
823    ld1       {v7.16b}, [x1], x3
824
825    uabdl     v10.8h, v0.8b, v1.8b
826    uabdl2    v15.8h, v0.16b, v1.16b
827    uabdl     v11.8h, v2.8b, v3.8b
828    uabdl2    v16.8h, v2.16b, v3.16b
829    uabdl     v12.8h, v4.8b, v5.8b
830    uabdl2    v17.8h, v4.16b, v5.16b
831    uabdl     v13.8h, v6.8b, v7.8b
832    uabdl2    v18.8h, v6.16b, v7.16b
833
834    add       v0.8h, v10.8h, v13.8h
835    add       v1.8h, v11.8h, v12.8h
836    add       v2.8h, v15.8h, v18.8h
837    add       v3.8h, v16.8h, v17.8h
838
839    //v0 : S1     S4     S4     S1        A1    A4    A4    A1
840    //v1 : S2     S3     S3     S2        A2    A3    A3    A2
841    //v2 : B1     B4     B4     B1        X1    X4    X4    X1
842    //v3 : B3     B2     B2     B3        X3    X2    X2    X3
843
844    trn1      v4.8h, v0.8h, v1.8h
845    trn2      v5.8h, v0.8h, v1.8h
846    trn1      v6.8h, v2.8h, v3.8h
847    trn2      v7.8h, v2.8h, v3.8h
848
849    trn1      v0.4s, v4.4s, v6.4s
850    trn2      v2.4s, v4.4s, v6.4s
851    trn1      v1.4s, v5.4s, v7.4s
852    trn2      v3.4s, v5.4s, v7.4s
853
854    add       v4.8h, v0.8h, v3.8h
855    add       v5.8h, v1.8h, v2.8h
856    //v4 : S1     S2     B1     B2      A1    A2    X1    X2
857    //v5 : S4     S3     B4     B3      A4    A3    X4    X3
858
859    //compute sad for each 4x4 block
860    add       v6.8h, v4.8h, v5.8h
861    addp      v19.8h, v6.8h, v6.8h
862    //duplicate the sad into 128 bit so that we can compare using 128bit
863    add       v31.4h, v31.4h, v19.4h
864
865    //sad_2 = sad_1<<1;
866    shl       v28.8h, v19.8h, #1
867
868    //sad_2 - pu2_thrsh
869    sub       v24.8h, v28.8h, v20.8h
870    sub       v25.8h, v28.8h, v21.8h
871    sub       v26.8h, v28.8h, v22.8h
872    sub       v27.8h, v28.8h, v23.8h
873
874    trn1      v0.4s, v4.4s, v5.4s
875    trn2      v1.4s, v4.4s, v5.4s
876    //v0 : S1     S2     S4     S3      A1    A2    A4    A3
877    //v1 : B1     B2     B4     B3      X1    X2    X4    X3
878
879    trn1      v4.8h, v0.8h, v1.8h
880    trn2      v5.8h, v0.8h, v1.8h
881    //v4 : S1     B1     S4     B4      A1    X1    A4    X4
882    //v5 : S2     B2     S3     B3      A2    X2    A3    X3
883
884    mov       v7.s[0], v4.s[1]
885    mov       v7.s[1], v4.s[3]
886    mov       v6.s[0], v5.s[1]          // V4 //S1 B1 A1 X1
887    mov       v6.s[1], v5.s[3]          // V5 //S2 B2 A2 X2
888    mov       v4.s[1], v4.s[2]          // V6 //S3 B3 A3 X3
889    mov       v5.s[1], v5.s[2]          // V7 //S4 B4 A4 X4
890
891    shl       v0.4h, v4.4h, #1          //S1<<1
892    shl       v1.4h, v5.4h, #1          //S2<<1
893    shl       v2.4h, v6.4h, #1          //S3<<1
894    shl       v3.4h, v7.4h, #1          //S4<<1
895
896    add       v8.4h, v5.4h, v6.4h       //(s2[j] + s3[j]))
897    add       v9.4h, v4.4h, v7.4h       //(s1[j] + s4[j]))
898    add       v10.4h, v6.4h, v7.4h      //(s3[j] + s4[j]))
899    sub       v11.4h, v6.4h, v0.4h      //(s3[j] - (s1[j]<<1))
900    sub       v12.4h, v7.4h, v1.4h      //(s4[j] - (s2[j]<<1))
901    add       v13.4h, v4.4h, v5.4h      //(s1[j] + s2[j]))
902    sub       v14.4h, v5.4h, v3.4h      //(s2[j] - (s4[j]<<1)))
903    sub       v15.4h, v4.4h, v2.4h      //(s1[j] - (s3[j]<<1)))
904
905    mov       v8.d[1], v9.d[0]
906    mov       v10.d[1], v11.d[0]
907    mov       v12.d[1], v13.d[0]
908    mov       v14.d[1], v15.d[0]
909
910    cmge      v0.8h, v24.8h, v8.8h      //ls1 ls2
911    cmge      v1.8h, v25.8h, v10.8h     //ls3 ls4
912    cmge      v2.8h, v26.8h, v12.8h     //ls5 ls6
913    cmge      v3.8h, v27.8h, v14.8h     //ls7 ls8
914    cmge      v4.4h, v19.4h, v29.4h     //sad
915
916    orr       v0.16b, v0.16b, v1.16b
917    orr       v2.16b, v2.16b, v3.16b
918    orr       v2.16b, v0.16b, v2.16b
919    xtn       v2.8b, v2.8h
920    orr       v2.8b, v2.8b, v4.8b
921
922    //if the comparison is non zero, out
923    mov       x4, v2.d[0]
924    cmp       x4, #0
925    bne       core_loop_compute_sad_pre
926
927    subs      x7, x7, #1
928    bne       core_loop_satqd_ime_compute_satqd_16x16_lumainter
929    b         satdq_end_func
930
931
932core_loop_compute_sad:
933    ld1       {v0.16b}, [x0], x2
934    ld1       {v1.16b}, [x1], x3
935    ld1       {v2.16b}, [x0], x2
936    ld1       {v3.16b}, [x1], x3
937
938    uabal     v31.8h, v0.8b, v1.8b
939    uabal2    v31.8h, v0.16b, v1.16b
940
941    uabal     v31.8h, v2.8b, v3.8b
942    uabal2    v31.8h, v2.16b, v3.16b
943
944    ld1       {v4.16b}, [x0], x2
945    ld1       {v5.16b}, [x1], x3
946    ld1       {v6.16b}, [x0], x2
947    ld1       {v7.16b}, [x1], x3
948
949    uabal     v31.8h, v4.8b, v5.8b
950    uabal2    v31.8h, v4.16b, v5.16b
951
952    uabal     v31.8h, v6.8b, v7.8b
953    uabal2    v31.8h, v6.16b, v7.16b
954
955core_loop_compute_sad_pre:
956    subs      x7, x7, #1
957    bne       core_loop_compute_sad
958
959satdq_end_func:
960
961    mov       x7, #1
962    cmp       x4, #0
963    csel      x7, x4, x7, eq
964    str       w7, [x6]
965
966    addp      v31.8h, v31.8h, v31.8h
967    uaddlp    v31.4s, v31.8h
968    addp      v31.2s, v31.2s, v31.2s
969    st1       {v31.s}[0], [x5]
970
971
972    ldp       d14, d15, [sp], #16
973    ldp       d12, d13, [sp], #16
974    ldp       d10, d11, [sp], #16
975    ldp       d8, d9, [sp], #16
976    pop_v_regs
977    ret
978