//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

///**
//******************************************************************************
//*
//* @brief :Evaluate best intra 16x16 mode (among VERT, HORZ and DC)
//*                and do the prediction.
//*
//* @par Description
//*   This function evaluates the first three 16x16 modes, computes the
//*   corresponding SADs and returns the buffer predicted with the best mode.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_ngbr_pels_i16
//*  UWORD8 pointer to neighbouring pels
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u4_n_avblty
//*  availability of neighbouring pixels
//*
//* @param[out] u4_intra_mode
//*  Pointer to the variable in which best mode is returned
//*
//* @param[out] pu4_sadmin
//*  Pointer to the variable in which minimum sad is returned
//*
//* @param[in] u4_valid_intra_modes
//*  Says which modes are valid
//*
//*
//* @return      none
//*
//******************************************************************************
//*/
//
//void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
//                                      UWORD8 *pu1_ngbr_pels_i16,
//                                      UWORD8 *pu1_dst,
//                                      UWORD32 src_strd,
//                                      UWORD32 dst_strd,
//                                      WORD32 u4_n_avblty,
//                                      UWORD32 *u4_intra_mode,
//                                      WORD32 *pu4_sadmin,
//                                      UWORD32 u4_valid_intra_modes)
//
.text
.p2align 2
.include "ih264_neon_macros.s"

//------------------------------------------------------------------------------
// Constants used by ih264e_evaluate_intra16x16_modes_av8
//------------------------------------------------------------------------------
.equ I16_LEFT_AVAIL_BIT,    0           // u4_n_avblty bit 0 (mask 0x01): left column usable
.equ I16_TOP_AVAIL_BIT,     2           // u4_n_avblty bit 2 (mask 0x04): top row usable
.equ I16_TOP_NGBR_OFFSET,   17          // byte offset of the 16 top pels in pu1_ngbr_pels_i16
.equ I16_DC_DEFAULT,        128         // DC value when neither left nor top is usable
.equ I16_VERT_VALID_MASK,   0x01        // u4_valid_intra_modes bits
.equ I16_HORZ_VALID_MASK,   0x02
.equ I16_DC_VALID_MASK,     0x04
.equ I16_MODE_VERT,         0           // values written to *u4_intra_mode
.equ I16_MODE_HORZ,         1
.equ I16_MODE_DC,           2
.equ I16_INVALID_COST,      0x40000000  // cost given to a disabled mode; a real
                                        // 16x16 SAD is at most 16*16*255 = 65280

//------------------------------------------------------------------------------
// i16_sad_row row, lane
//   Accumulate the absolute differences of one 16-pixel source row against the
//   three candidate predictions.
//     row  : vector register holding the 16 source bytes of the row
//     lane : byte index in v4 (left neighbours) that predicts this row
//   In : v4 = left neighbours, v5 = top neighbours, v6 = DC splat
//   Out: v16/v17 += |row - vert|  (low / high 8 pixels, 16-bit lanes)
//        v18/v19 += |row - horz|
//        v20/v21 += |row - dc|
//   Clobbers: v7
//------------------------------------------------------------------------------
.macro i16_sad_row row, lane
    dup       v7.16b, v4.b[\lane]               // horizontal prediction of this row
    uabal     v16.8h, \row\().8b,  v5.8b        // vertical
    uabal2    v17.8h, \row\().16b, v5.16b
    uabal     v18.8h, \row\().8b,  v7.8b        // horizontal
    uabal2    v19.8h, \row\().16b, v7.16b
    uabal     v20.8h, \row\().8b,  v6.8b        // dc
    uabal2    v21.8h, \row\().16b, v6.16b
.endm

.global ih264e_evaluate_intra16x16_modes_av8

//------------------------------------------------------------------------------
// void ih264e_evaluate_intra16x16_modes_av8(...)   -- see prototype above
//
// In:   x0   = pu1_src
//       x1   = pu1_ngbr_pels_i16 (bytes 0..15 are read as the left column with
//              byte 15 predicting row 0 and byte 0 predicting row 15; bytes
//              17..32 are read as the top row; byte 16 is not read here)
//       x2   = pu1_dst
//       w3   = src_strd   (sign-extended to 64 bits before use)
//       w4   = dst_strd   (sign-extended to 64 bits before use)
//       w5   = u4_n_avblty
//       x6   = u4_intra_mode (pointer, written)
//       x7   = pu4_sadmin    (pointer, written)
//       [sp] = u4_valid_intra_modes (ninth argument, passed on the stack)
// Out:  *pu4_sadmin    = SAD of the selected mode
//       *u4_intra_mode = 0 (vertical), 1 (horizontal) or 2 (dc)
//       16 rows of 16 predicted bytes written at pu1_dst, dst_strd apart
//
// Selection: a mode whose bit is clear in u4_valid_intra_modes gets the cost
// I16_INVALID_COST.  Ties are resolved vertical, then horizontal, then dc.
// If no mode is valid, vertical is reported with cost I16_INVALID_COST.
//
// Leaf routine: only x0-x13, v0-v7 and v16-v22 are written, all of which are
// caller-saved, so nothing is spilled and sp is never moved.
//------------------------------------------------------------------------------
ih264e_evaluate_intra16x16_modes_av8:

    sxtw      x3, w3                            // strides arrive as 32-bit values
    sxtw      x4, w4
    ldr       w13, [sp]                         // w13 = u4_valid_intra_modes

    // Neighbours are loaded regardless of availability (they are needed for
    // the vertical / horizontal SADs below in every case).
    ld1       {v4.16b}, [x1]                    // v4 = left column
    add       x12, x1, #I16_TOP_NGBR_OFFSET
    ld1       {v5.16b}, [x12]                   // v5 = top row

    //--------------------------------------------------------------------------
    // DC value:
    //   left and top : (sum_left + sum_top + 16) >> 5
    //   one of them  : (sum + 8) >> 4
    //   neither      : 128, whatever other availability bits are set
    //--------------------------------------------------------------------------
    mov       w10, #0                           // sum of usable neighbours + rounding
    mov       w11, #3                           // shift; +1 for every usable side

    tbz       w5, #I16_LEFT_AVAIL_BIT, i16_dc_check_top
    uaddlv    h22, v4.16b
    umov      w12, v22.h[0]
    add       w10, w10, w12
    add       w10, w10, #8
    add       w11, w11, #1
i16_dc_check_top:
    tbz       w5, #I16_TOP_AVAIL_BIT, i16_dc_finish
    uaddlv    h22, v5.16b
    umov      w12, v22.h[0]
    add       w10, w10, w12
    add       w10, w10, #8
    add       w11, w11, #1
i16_dc_finish:
    lsr       w10, w10, w11
    mov       w12, #I16_DC_DEFAULT
    cmp       w11, #3                           // shift untouched => no usable side
    csel      w10, w12, w10, eq
    dup       v6.16b, w10                       // v6 = DC prediction

    //--------------------------------------------------------------------------
    // SAD of the 16x16 source block against the three predictions.
    // Four rows are kept in flight in v0-v3; each register is refilled with
    // the row four below as soon as it has been consumed.  Exactly 16 rows
    // are read from pu1_src.
    //--------------------------------------------------------------------------
    movi      v16.16b, #0
    movi      v17.16b, #0
    movi      v18.16b, #0
    movi      v19.16b, #0
    movi      v20.16b, #0
    movi      v21.16b, #0

    ld1       {v0.16b}, [x0], x3                // row 0
    ld1       {v1.16b}, [x0], x3                // row 1
    ld1       {v2.16b}, [x0], x3                // row 2
    ld1       {v3.16b}, [x0], x3                // row 3

    i16_sad_row v0, 15                          // row 0
    ld1       {v0.16b}, [x0], x3                // row 4
    i16_sad_row v1, 14                          // row 1
    ld1       {v1.16b}, [x0], x3                // row 5
    i16_sad_row v2, 13                          // row 2
    ld1       {v2.16b}, [x0], x3                // row 6
    i16_sad_row v3, 12                          // row 3
    ld1       {v3.16b}, [x0], x3                // row 7

    i16_sad_row v0, 11                          // row 4
    ld1       {v0.16b}, [x0], x3                // row 8
    i16_sad_row v1, 10                          // row 5
    ld1       {v1.16b}, [x0], x3                // row 9
    i16_sad_row v2, 9                           // row 6
    ld1       {v2.16b}, [x0], x3                // row 10
    i16_sad_row v3, 8                           // row 7
    ld1       {v3.16b}, [x0], x3                // row 11

    i16_sad_row v0, 7                           // row 8
    ld1       {v0.16b}, [x0], x3                // row 12
    i16_sad_row v1, 6                           // row 9
    ld1       {v1.16b}, [x0], x3                // row 13
    i16_sad_row v2, 5                           // row 10
    ld1       {v2.16b}, [x0], x3                // row 14
    i16_sad_row v3, 4                           // row 11
    ld1       {v3.16b}, [x0], x3                // row 15

    i16_sad_row v0, 3                           // row 12
    i16_sad_row v1, 2                           // row 13
    i16_sad_row v2, 1                           // row 14
    i16_sad_row v3, 0                           // row 15

    //--------------------------------------------------------------------------
    // Horizontal reduction.  Each 16-bit lane holds at most 16*255 = 4080, so
    // adding the two halves cannot overflow; uaddlv widens to 32 bits.
    //--------------------------------------------------------------------------
    add       v16.8h, v16.8h, v17.8h
    uaddlv    s16, v16.8h
    mov       w8, v16.s[0]                      // w8  = vertical SAD

    add       v18.8h, v18.8h, v19.8h
    uaddlv    s18, v18.8h
    mov       w9, v18.s[0]                      // w9  = horizontal SAD

    add       v20.8h, v20.8h, v21.8h
    uaddlv    s20, v20.8h
    mov       w10, v20.s[0]                     // w10 = dc SAD

    //--------------------------------------------------------------------------
    // Price out the modes the caller did not allow.
    //--------------------------------------------------------------------------
    mov       w11, #I16_INVALID_COST

    tst       w13, #I16_VERT_VALID_MASK
    csel      w8, w11, w8, eq

    tst       w13, #I16_HORZ_VALID_MASK
    csel      w9, w11, w9, eq

    tst       w13, #I16_DC_VALID_MASK
    csel      w10, w11, w10, eq

    //--------------------------------------------------------------------------
    // Pick the cheapest mode (vertical <= horizontal <= dc on ties).
    //--------------------------------------------------------------------------
    cmp       w8, w9
    bgt       i16_not_vert
    cmp       w8, w10
    bgt       i16_do_dc

    // Vertical: every output row is the top neighbour row.
    str       w8, [x7]                          // *pu4_sadmin
    str       wzr, [x6]                         // *u4_intra_mode = I16_MODE_VERT (0)
    mov       v6.16b, v5.16b                    // reuse the common row-store loop
    b         i16_store_rows

i16_not_vert:
    cmp       w9, w10
    bgt       i16_do_dc

    // Horizontal: output row r is left neighbour byte (15 - r) replicated.
    str       w9, [x7]                          // *pu4_sadmin
    mov       w12, #I16_MODE_HORZ
    str       w12, [x6]                         // *u4_intra_mode
    .irp lane, 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
    dup       v7.16b, v4.b[\lane]
    st1       {v7.16b}, [x2], x4
    .endr
    b         i16_end_func

i16_do_dc:
    // DC: every output byte is the DC value already splatted in v6.
    str       w10, [x7]                         // *pu4_sadmin
    mov       w12, #I16_MODE_DC
    str       w12, [x6]                         // *u4_intra_mode

i16_store_rows:
    // Shared by vertical and dc: write v6 to all 16 destination rows.
    .rept 16
    st1       {v6.16b}, [x2], x4
    .endr

i16_end_func:
    ret
