//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

///**
//******************************************************************************
//*
//* @brief  Evaluate the intra 16x16 VERT, HORZ and DC modes and write the
//*         prediction of the cheapest one.
//*
//* @par Description
//*   Computes the SAD between the 16x16 source block and each of the three
//*   predictors, replaces the SAD of every mode that is not enabled in
//*   u4_valid_intra_modes by (1 << 30), stores the smallest SAD together with
//*   its mode index and writes the winning 16x16 prediction to pu1_dst.
//*   Ties are resolved in the order VERT, HORZ, DC; consequently, when no mode
//*   is enabled, VERT is reported with a SAD of (1 << 30).
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source block (16 rows of 16 bytes are read)
//*
//* @param[in] pu1_ngbr_pels_i16
//*  UWORD8 pointer to the neighbouring pels. As read by this routine:
//*  bytes 0..15 hold the left column with byte 15 beside row 0 and byte 0
//*  beside row 15; bytes 17..32 hold the top row. Byte 16 is not read here.
//*  All 32 bytes are read even when an edge is flagged unavailable.
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination (16 rows of 16 bytes are written)
//*
//* @param[in] src_strd
//*  source stride in bytes
//*
//* @param[in] dst_strd
//*  destination stride in bytes
//*
//* @param[in] u4_n_avblty
//*  availability of neighbouring pixels: bit 0 = left, bit 2 = top.
//*  Only these two bits are examined.
//*
//* @param[out] u4_intra_mode
//*  Pointer to the 32-bit variable in which the best mode is returned
//*  (0 = VERT, 1 = HORZ, 2 = DC)
//*
//* @param[out] pu4_sadmin
//*  Pointer to the 32-bit variable in which the minimum SAD is returned
//*
//* @param[in] u4_valid_intra_modes
//*  Modes that may be selected: bit 0 = VERT, bit 1 = HORZ, bit 2 = DC
//*
//* @return      none
//*
//******************************************************************************
//*/
//
//void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
//                                      UWORD8 *pu1_ngbr_pels_i16,
//                                      UWORD8 *pu1_dst,
//                                      UWORD32 src_strd,
//                                      UWORD32 dst_strd,
//                                      WORD32 u4_n_avblty,
//                                      UWORD32 *u4_intra_mode,
//                                      WORD32 *pu4_sadmin,
//                                      UWORD32 u4_valid_intra_modes)
//
.text
.p2align 2
.include "ih264_neon_macros.s"

// Bit positions inside u4_n_avblty that this routine looks at.
.equ I16X16_LEFT_AVAIL_BIT, 0
.equ I16X16_TOP_AVAIL_BIT,  2

// Byte offset of the top row inside pu1_ngbr_pels_i16.
.equ I16X16_TOP_OFFSET,     17

// Mode indices written to *u4_intra_mode.
.equ I16X16_MODE_VERT,      0
.equ I16X16_MODE_HORZ,      1
.equ I16X16_MODE_DC,        2

// Mode enable masks inside u4_valid_intra_modes.
.equ I16X16_VALID_VERT,     0x01
.equ I16X16_VALID_HORZ,     0x02
.equ I16X16_VALID_DC,       0x04

//------------------------------------------------------------------------------
// i16x16_sad_row row, lane
//   Accumulate the absolute differences of one 16-byte source row against
//   the three predictors.
//   In:    \row  = vector register holding the source row (e.g. v0)
//          \lane = byte index in v5 of the left pixel beside this row
//          v4 = top row, v5 = left column, v7 = DC value in every byte
//   Out:   v16/v17 += |row - vert| (low/high 8 pixels)
//          v18/v19 += |row - horz|
//          v20/v21 += |row - dc|
//   Clobb: v6
//------------------------------------------------------------------------------
.macro i16x16_sad_row row, lane
    dup       v6.16b, v5.b[\lane]               // horizontal predictor of this row
    uabal     v16.8h, \row\().8b,  v4.8b
    uabal2    v17.8h, \row\().16b, v4.16b
    uabal     v18.8h, \row\().8b,  v6.8b
    uabal2    v19.8h, \row\().16b, v6.16b
    uabal     v20.8h, \row\().8b,  v7.8b
    uabal2    v21.8h, \row\().16b, v7.16b
.endm

.global ih264e_evaluate_intra16x16_modes_av8

//------------------------------------------------------------------------------
// ABI:   AAPCS64, leaf function, no stack frame.
// In:    x0 = pu1_src            x1 = pu1_ngbr_pels_i16   x2 = pu1_dst
//        w3 = src_strd           w4 = dst_strd            w5 = u4_n_avblty
//        x6 = u4_intra_mode      x7 = pu4_sadmin
//        [sp] = u4_valid_intra_modes (ninth argument, first stack slot)
// Clobb: x0, x2, x3, x4, x8-x13, x16, v0-v7, v16-v21, flags
//        (only caller-saved registers, so nothing has to be spilled)
//------------------------------------------------------------------------------
ih264e_evaluate_intra16x16_modes_av8:

    ldr       w16, [sp]                 // w16 = u4_valid_intra_modes

    // The strides arrive as 32-bit values; the upper halves of x3/x4 are not
    // guaranteed to be zero, so widen them before they are used as 64-bit
    // post-index offsets (zero-extension: the prototype declares UWORD32).
    mov       w3, w3
    mov       w4, w4

    //--------------------------------------------------------------------------
    // DC predictor
    //   edges = number of available edges among {left, top}
    //   dc    = (sum of available edge pixels + 8 * edges) >> (3 + edges)
    //   dc    = 128 when neither edge is available
    //--------------------------------------------------------------------------
    movi      v2.16b, #0                // left column, stays 0 if unavailable
    movi      v3.16b, #0                // top row, stays 0 if unavailable
    mov       w10, #0                   // w10 = rounding term (8 * edges)
    mov       w11, #3                   // w11 = shift (3 + edges)

    tbz       w5, #I16X16_LEFT_AVAIL_BIT, .Li16x16_top_check
    ld1       {v2.16b}, [x1]
    add       w10, w10, #8
    add       w11, w11, #1
.Li16x16_top_check:
    tbz       w5, #I16X16_TOP_AVAIL_BIT, .Li16x16_dc_value
    add       x12, x1, #I16X16_TOP_OFFSET
    ld1       {v3.16b}, [x12]
    add       w10, w10, #8
    add       w11, w11, #1
.Li16x16_dc_value:
    mov       w12, #128                 // default when no edge is available
    // w10 is non-zero exactly when at least one of left/top was available.
    // Testing it (rather than the whole availability word) keeps the 128
    // default even if unrelated bits of u4_n_avblty are set.
    cbz       w10, .Li16x16_dc_ready
    uaddlv    h2, v2.16b                // sum of left pixels  (<= 4080)
    uaddlv    h3, v3.16b                // sum of top pixels   (<= 4080)
    umov      w12, v2.h[0]
    umov      w13, v3.h[0]
    add       w12, w12, w13
    add       w12, w12, w10
    lsr       w12, w12, w11             // w12 = dc, always <= 255
.Li16x16_dc_ready:
    dup       v7.16b, w12               // v7 = DC predictor row

    //--------------------------------------------------------------------------
    // Predictor inputs for the SAD pass. They are loaded unconditionally:
    // a mode whose edge is missing is expected to be disabled through
    // u4_valid_intra_modes, which discards its SAD below.
    //--------------------------------------------------------------------------
    add       x12, x1, #I16X16_TOP_OFFSET
    ld1       {v4.16b}, [x12]           // v4 = top row  (vertical predictor)
    ld1       {v5.16b}, [x1]            // v5 = left column, byte 15 = row 0

    movi      v16.16b, #0               // vertical   SAD, pixels 0..7
    movi      v17.16b, #0               // vertical   SAD, pixels 8..15
    movi      v18.16b, #0               // horizontal SAD, pixels 0..7
    movi      v19.16b, #0               // horizontal SAD, pixels 8..15
    movi      v20.16b, #0               // dc         SAD, pixels 0..7
    movi      v21.16b, #0               // dc         SAD, pixels 8..15

    //--------------------------------------------------------------------------
    // SAD pass. Source rows alternate between v0 and v1 so that the load of
    // a later row is issued before the current row is consumed.
    // Each 16-bit accumulator lane gathers at most 16 * 255 = 4080.
    //--------------------------------------------------------------------------
    ld1       {v0.16b}, [x0], x3        // row 0
    ld1       {v1.16b}, [x0], x3        // row 1
    i16x16_sad_row v0, 15               // row 0
    ld1       {v0.16b}, [x0], x3        // row 2
    i16x16_sad_row v1, 14               // row 1
    ld1       {v1.16b}, [x0], x3        // row 3
    i16x16_sad_row v0, 13               // row 2
    ld1       {v0.16b}, [x0], x3        // row 4
    i16x16_sad_row v1, 12               // row 3
    ld1       {v1.16b}, [x0], x3        // row 5
    i16x16_sad_row v0, 11               // row 4
    ld1       {v0.16b}, [x0], x3        // row 6
    i16x16_sad_row v1, 10               // row 5
    ld1       {v1.16b}, [x0], x3        // row 7
    i16x16_sad_row v0, 9                // row 6
    ld1       {v0.16b}, [x0], x3        // row 8
    i16x16_sad_row v1, 8                // row 7
    ld1       {v1.16b}, [x0], x3        // row 9
    i16x16_sad_row v0, 7                // row 8
    ld1       {v0.16b}, [x0], x3        // row 10
    i16x16_sad_row v1, 6                // row 9
    ld1       {v1.16b}, [x0], x3        // row 11
    i16x16_sad_row v0, 5                // row 10
    ld1       {v0.16b}, [x0], x3        // row 12
    i16x16_sad_row v1, 4                // row 11
    ld1       {v1.16b}, [x0], x3        // row 13
    i16x16_sad_row v0, 3                // row 12
    ld1       {v0.16b}, [x0], x3        // row 14
    i16x16_sad_row v1, 2                // row 13
    ld1       {v1.16b}, [x0], x3        // row 15
    i16x16_sad_row v0, 1                // row 14
    i16x16_sad_row v1, 0                // row 15

    //--------------------------------------------------------------------------
    // Horizontal reduction: w8 = vertical, w9 = horizontal, w10 = dc SAD.
    // Lanes hold <= 8160 after the pairwise add; each total is <= 65280.
    //--------------------------------------------------------------------------
    add       v16.8h, v16.8h, v17.8h
    add       v18.8h, v18.8h, v19.8h
    add       v20.8h, v20.8h, v21.8h
    uaddlv    s16, v16.8h
    uaddlv    s18, v18.8h
    uaddlv    s20, v20.8h
    mov       w8,  v16.s[0]             // vertical SAD
    mov       w9,  v18.s[0]             // horizontal SAD
    mov       w10, v20.s[0]             // dc SAD

    //--------------------------------------------------------------------------
    // A disabled mode gets a SAD of (1 << 30), larger than any real SAD.
    //--------------------------------------------------------------------------
    mov       w11, #(1 << 30)
    tst       w16, #I16X16_VALID_VERT
    csel      w8, w11, w8, eq
    tst       w16, #I16X16_VALID_HORZ
    csel      w9, w11, w9, eq
    tst       w16, #I16X16_VALID_DC
    csel      w10, w11, w10, eq

    //--------------------------------------------------------------------------
    // Mode decision; on equal SADs the order of preference is VERT, HORZ, DC.
    //--------------------------------------------------------------------------
    cmp       w8, w9
    b.gt      .Li16x16_not_vert
    cmp       w8, w10
    b.gt      .Li16x16_dc

    // VERTICAL: every output row is a copy of the top row
    str       w8, [x7]                  // *pu4_sadmin
    mov       w12, #I16X16_MODE_VERT
    str       w12, [x6]                 // *u4_intra_mode
    mov       v7.16b, v4.16b            // top row is still live in v4
    b         .Li16x16_fill_rows

.Li16x16_not_vert:
    cmp       w9, w10
    b.gt      .Li16x16_dc

    // HORIZONTAL: output row r is filled with left-column byte (15 - r)
    str       w9, [x7]                  // *pu4_sadmin
    mov       w12, #I16X16_MODE_HORZ
    str       w12, [x6]                 // *u4_intra_mode
.irp lane, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    dup       v6.16b, v5.b[\lane]
    st1       {v6.16b}, [x2], x4
.endr
    b         .Li16x16_done

.Li16x16_dc:
    // DC: every output row is the DC value already held in v7
    str       w10, [x7]                 // *pu4_sadmin
    mov       w12, #I16X16_MODE_DC
    str       w12, [x6]                 // *u4_intra_mode

.Li16x16_fill_rows:
    // Shared by VERT and DC: write the same 16-byte row sixteen times.
.rept 16
    st1       {v7.16b}, [x2], x4
.endr

.Li16x16_done:
    ret