1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21///**
22//******************************************************************************
23//*
//* @brief :Evaluate the best intra chroma mode (among VERT, HORZ and DC)
//*                and do the prediction.
26//*
27//* @par Description
//*   This function evaluates the first three intra chroma modes, computes the
//*   corresponding SAD of each, and returns the buffer predicted with the best mode.
30//*
31//* @param[in] pu1_src
32//*  UWORD8 pointer to the source
33//*
//* @param[in] pu1_ngbr_pels
35//*  UWORD8 pointer to neighbouring pels
36//*
37//* @param[out] pu1_dst
38//*  UWORD8 pointer to the destination
39//*
40//* @param[in] src_strd
41//*  integer source stride
42//*
43//* @param[in] dst_strd
44//*  integer destination stride
45//*
46//* @param[in] u4_n_avblty
47//* availability of neighbouring pixels
48//*
//* @param[out] u4_intra_mode
//* Pointer to the variable in which best mode is returned
51//*
//* @param[out] pu4_sadmin
//* Pointer to the variable in which minimum sad is returned
54//*
55//* @param[in] u4_valid_intra_modes
56//* Says what all modes are valid
57//*
58//*
59//* @return      none
60//*
61//******************************************************************************
62//*/
63//
64//void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
65//                                      UWORD8 *pu1_ngbr_pels_i16,
66//                                      UWORD8 *pu1_dst,
67//                                      UWORD32 src_strd,
68//                                      UWORD32 dst_strd,
69//                                      WORD32 u4_n_avblty,
70//                                      UWORD32 *u4_intra_mode,
71//                                      WORD32 *pu4_sadmin,
72//                                       UWORD32 u4_valid_intra_modes)
73//
74.text
75.p2align 2
76.include "ih264_neon_macros.s"
77
78.global ih264e_evaluate_intra_chroma_modes_av8
79
ih264e_evaluate_intra_chroma_modes_av8:

//------------------------------------------------------------------------------
// Evaluates the DC, HORZ and VERT intra chroma predictors against the source
// block (8 rows x 16 bytes), writes the smallest SAD and the matching mode
// number, and stores the winning prediction to the destination buffer.
//
// Arguments on entry:
//   x0 = pu1_src
//   x1 = pu1_ngbr_pels_i16  (16 left bytes at offset 0, 16 top bytes at
//                            offset 18; the left bytes are read so that the
//                            last halfword belongs to row 0)
//   x2 = pu1_dst
//   w3 = src_strd
//   w4 = dst_strd
//   w5 = u4_n_avblty        (bit 0 = left available, bit 2 = top available)
//   x6 = u4_intra_mode      (out: 0 = DC, 1 = HORZ, 2 = VERT)
//   x7 = pu4_sadmin         (out: SAD of the selected mode)
//   [sp] = u4_valid_intra_modes (bit 0 = DC, bit 1 = HORZ, bit 2 = VERT)
//
// Internal register roles:
//   x14/x15  copies of x6/x7 (both get reused as scratch below)
//   w16      u4_valid_intra_modes
//   x17      copy of the sign-extended dst_strd
//   x8/x9/x10  SAD of VERT / HORZ / DC
//   v28,v29  DC prediction for rows 0-3 (first / second 8 bytes of a row)
//   v30,v31  DC prediction for rows 4-7 (first / second 8 bytes of a row)
//
// x18 is deliberately left untouched: it is the platform register under
// AAPCS64 and must not be written by this routine.
//------------------------------------------------------------------------------

    push_v_regs                         // save v8-v15 (macro from ih264_neon_macros.s)
    sxtw      x3, w3                    // strides are used as 64-bit post-increments
    sxtw      x4, w4
    stp       x19, x20, [sp, #-16]!     // w19 / w20 are used as temporaries below

    // Ninth argument. The offset skips the 16 bytes pushed just above plus
    // the bytes pushed by push_v_regs (assumed to be 64 - the macro body is
    // not visible in this file).
    ldr       w16, [sp, #80]
    mov       x17, x4
    mov       x14, x6
    mov       x15, x7

    // Keep only the "left" (bit 0) and "top" (bit 2) availability bits.
    // 5 is not encodable as a logical immediate, hence the register operand.
    mov       w19, #5
    ands      w6, w5, w19
    beq       none_available
    cmp       w6, #1
    beq       left_only_available
    cmp       w6, #4
    beq       top_only_available

all_available:
    // v0 = left bytes 0-7 (rows 7-4), v1 = left bytes 8-15 (rows 3-0)
    // v2 = top bytes 0-7,             v3 = top bytes 8-15
    ld1       {v0.8b, v1.8b}, [x1]
    add       x6, x1, #18
    ld1       {v2.8b, v3.8b}, [x6]

    // Widen to 16 bit, then fold the four 32-bit lanes together twice. Each
    // 32-bit lane carries two 16-bit running sums (even and odd bytes) that
    // cannot carry into each other, so afterwards every lane holds both
    // 4-sample sums of its source vector.
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s

    // rows 4-7, first half : lower-left neighbours only, (sum + 2) >> 2
    rshrn     v5.8b, v0.8h, #2
    dup       v21.8h, v5.h[0]
    // rows 0-3, second half: right-hand top neighbours only, (sum + 2) >> 2
    rshrn     v6.8b, v3.8h, #2
    dup       v20.8h, v6.h[0]
    // rows 0-3, first half : upper-left + left-hand top, (sum + 4) >> 3
    add       v1.8h, v1.8h, v2.8h
    rshrn     v1.8b, v1.8h, #3
    dup       v23.8h, v1.h[0]
    mov       v20.d[0], v23.d[0]        // v20 = {rows 0-3 first half, rows 0-3 second half}
    // rows 4-7, second half: lower-left + right-hand top, (sum + 4) >> 3
    add       v0.8h, v0.8h, v3.8h
    rshrn     v0.8b, v0.8h, #3
    dup       v23.8h, v0.h[0]
    mov       v31.d[0], v23.d[0]
    mov       v28.d[0], v20.d[0]
    mov       v29.d[0], v20.d[1]
    mov       v30.d[0], v21.d[0]
    b         sad_comp

left_only_available:
    // Same folding as above, left neighbours only.
    ld1       {v0.8b, v1.8b}, [x1]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2

    // Both halves of a row share the DC of the left neighbours beside it.
    dup       v28.8h , v1.h[0]          // rows 0-3 <- left bytes 8-15
    dup       v29.8h , v1.h[0]
    dup       v30.8h, v0.h[0]           // rows 4-7 <- left bytes 0-7
    dup       v31.8h, v0.h[0]
    b         sad_comp

top_only_available:
    // Same folding as above, top neighbours only.
    add       x6, x1, #18
    ld1       {v0.8b, v1.8b}, [x6]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    // Every row uses the DC of the top neighbours above its own half.
    dup       v28.8h , v0.h[0]          // first half  <- top bytes 0-7
    dup       v30.8h, v1.h[0]           // second half <- top bytes 8-15 (staged in v30)
    mov       v29.d[0], v30.d[1]
    mov       v30.d[0], v28.d[0]
    mov       v31.d[0], v30.d[1]
    b         sad_comp

none_available:
    // No neighbours: every predicted byte is 128.
    mov       w20, #128
    dup       v28.16b, w20
    dup       v29.16b, w20
    dup       v30.16b, w20
    dup       v31.16b, w20

sad_comp:
    // Accumulate the three SADs over the eight source rows.
    //   v16/v18 : VERT (first / second half of the row)
    //   v26/v14 : HORZ
    //   v22/v24 : DC
    // Source-row loads are interleaved with the arithmetic.
    add       x6, x1, #18
    ld1       {v10.8b, v11.8b}, [x6]    // VERT predictor = top neighbours

    ld1       {v27.8h}, [x1]            // left neighbours, one halfword per row

    dup       v20.8h, v27.h[7]          // HORZ predictor, row 0
    dup       v21.8h, v27.h[7]

    ld1       { v0.8b, v1.8b}, [x0], x3 // source row 0

    // VERT, row 0 (uabdl initialises the accumulators)
    uabdl     v16.8h, v0.8b, v10.8b
    uabdl     v18.8h, v1.8b, v11.8b

    // HORZ, row 0
    uabdl     v26.8h, v0.8b, v20.8b
    uabdl     v14.8h, v1.8b, v21.8b

    ld1       {v2.8b, v3.8b}, [x0], x3  // source row 1

    // DC, row 0
    uabdl     v22.8h, v0.8b, v28.8b
    uabdl     v24.8h, v1.8b, v29.8b

    dup       v20.8h, v27.h[6]          // HORZ predictor, row 1
    dup       v21.8h, v27.h[6]

    // VERT, row 1
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b

    ld1       { v4.8b, v5.8b}, [x0], x3 // source row 2

    // HORZ, row 1
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v14.8h, v3.8b, v21.8b

    // DC, row 1
    uabal     v22.8h, v2.8b, v28.8b
    uabal     v24.8h, v3.8b, v29.8b

    dup       v20.8h, v27.h[5]          // HORZ predictor, row 2
    dup       v21.8h, v27.h[5]

    // VERT, row 2
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b

    ld1       { v6.8b, v7.8b}, [x0], x3 // source row 3

    // HORZ, row 2
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v14.8h, v5.8b, v21.8b

    // DC, row 2
    uabal     v22.8h, v4.8b, v28.8b
    uabal     v24.8h, v5.8b, v29.8b

    dup       v20.8h, v27.h[4]          // HORZ predictor, row 3
    dup       v21.8h, v27.h[4]

    // VERT, row 3
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b

    // HORZ, row 3
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v14.8h, v7.8b, v21.8b

    // DC, row 3
    uabal     v22.8h, v6.8b, v28.8b
    uabal     v24.8h, v7.8b, v29.8b

    //--------------------------------------------------------------------------
    // Rows 4-7: the DC predictor switches to v30/v31.
    //--------------------------------------------------------------------------
    ld1       { v0.8b, v1.8b}, [x0], x3 // source row 4

    dup       v20.8h, v27.h[3]          // HORZ predictor, row 4
    dup       v21.8h, v27.h[3]

    // VERT, row 4
    uabal     v16.8h, v0.8b, v10.8b
    uabal     v18.8h, v1.8b, v11.8b

    // HORZ, row 4
    uabal     v26.8h, v0.8b, v20.8b
    uabal     v14.8h, v1.8b, v21.8b

    ld1       { v2.8b, v3.8b}, [x0], x3 // source row 5

    // DC, row 4
    uabal     v22.8h, v0.8b, v30.8b
    uabal     v24.8h, v1.8b, v31.8b

    dup       v20.8h, v27.h[2]          // HORZ predictor, row 5
    dup       v21.8h, v27.h[2]

    // VERT, row 5
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b

    // HORZ, row 5
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v14.8h, v3.8b, v21.8b

    ld1       { v4.8b, v5.8b}, [x0], x3 // source row 6

    // DC, row 5
    uabal     v22.8h, v2.8b, v30.8b
    uabal     v24.8h, v3.8b, v31.8b

    dup       v20.8h, v27.h[1]          // HORZ predictor, row 6
    dup       v21.8h, v27.h[1]

    // VERT, row 6
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b

    // HORZ, row 6
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v14.8h, v5.8b, v21.8b

    ld1       {v6.8b, v7.8b}, [x0], x3  // source row 7

    // DC, row 6
    uabal     v22.8h, v4.8b, v30.8b
    uabal     v24.8h, v5.8b, v31.8b

    dup       v20.8h, v27.h[0]          // HORZ predictor, row 7
    dup       v21.8h, v27.h[0]

    // VERT, row 7
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b

    // HORZ, row 7
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v14.8h, v7.8b, v21.8b

    // DC, row 7
    uabal     v22.8h, v6.8b, v30.8b
    uabal     v24.8h, v7.8b, v31.8b

    //--------------------------------------------------------------------------
    // Horizontal reduction: merge the two accumulators of each mode, fold the
    // high half onto the low half, then widen and add down to a single scalar.
    //--------------------------------------------------------------------------

    // VERT total -> x8
    add       v16.8h, v16.8h , v18.8h
    mov       v18.d[0], v16.d[1]
    add       v16.4h, v16.4h , v18.4h
    uaddlp    v16.2s, v16.4h
    addp      v16.2s, v16.2s, v16.2s
    smov      x8, v16.s[0]

    // HORZ total -> x9
    add       v26.8h, v26.8h , v14.8h
    mov       v14.d[0], v26.d[1]
    add       v26.4h, v26.4h , v14.4h
    uaddlp    v26.2s, v26.4h
    addp      v26.2s, v26.2s, v26.2s
    smov      x9, v26.s[0]

    // DC total -> x10
    add       v24.8h, v22.8h , v24.8h
    mov       v25.d[0], v24.d[1]
    add       v24.4h, v24.4h , v25.4h
    uaddlp    v24.2s, v24.4h
    addp      v24.2s, v24.2s, v24.2s
    smov      x10, v24.s[0]

    //--------------------------------------------------------------------------
    // Replace the SAD of every mode that is not enabled in
    // u4_valid_intra_modes by 1 << 30 so that it loses the comparisons below.
    //--------------------------------------------------------------------------
    mov       x11, #1
    mov       w0, w16                   // u4_valid_intra_modes
    lsl       x11, x11, #30

    ands      w7, w0, #04               // VERT enabled?
    csel      x8, x11, x8, eq

    ands      w6, w0, #02               // HORZ enabled?
    csel      x9, x11, x9, eq

    ands      w6, w0, #01               // DC enabled?
    csel      x10, x11, x10, eq

    // Bring back the arguments that were parked at entry (x6 and x7 were
    // used as scratch above).
    mov       x4, x17                   // dst_strd
    mov       x6, x14                   // u4_intra_mode
    mov       x7, x15                   // pu4_sadmin

    //--------------------------------------------------------------------------
    // Mode decision. Ties resolve as DC first, then HORZ, then VERT.
    //--------------------------------------------------------------------------
    cmp       x10, x9
    bgt       not_dc                    // DC > HORZ
    cmp       x10, x8
    bgt       do_vert                   // DC <= HORZ but DC > VERT

    // DC wins: v28-v31 already hold the prediction.
    str       w10 , [x7]                // *pu4_sadmin
    mov       w10, #0
    str       w10 , [x6]                // *u4_intra_mode = 0 (DC)
    b         do_dc_vert

not_dc:
    cmp       x9, x8
    bgt       do_vert                   // HORZ > VERT

    // HORZ wins: each destination row is its left-neighbour halfword
    // replicated across 16 bytes (row 0 comes from the last halfword).
    str       w9 , [x7]                 // *pu4_sadmin
    mov       w10, #1
    str       w10 , [x6]                // *u4_intra_mode = 1 (HORZ)
    ld1       {v0.8h}, [x1]

    dup       v10.8h, v0.h[7]
    dup       v11.8h, v0.h[6]
    dup       v12.8h, v0.h[5]
    dup       v13.8h, v0.h[4]
    st1       {v10.8h}, [x2], x4        // row 0
    dup       v14.8h, v0.h[3]
    st1       {v11.8h}, [x2], x4        // row 1
    dup       v15.8h, v0.h[2]
    st1       {v12.8h}, [x2], x4        // row 2
    dup       v16.8h, v0.h[1]
    st1       {v13.8h}, [x2], x4        // row 3
    dup       v17.8h, v0.h[0]
    st1       {v14.8h}, [x2], x4        // row 4
    st1       {v15.8h}, [x2], x4        // row 5
    st1       {v16.8h}, [x2], x4        // row 6
    st1       {v17.8h}, [x2], x4        // row 7

    b         end_func

do_vert:
    // VERT wins: load the top neighbours into both predictor pairs so the
    // store sequence shared with DC can be reused unchanged.
    str       w8 , [x7]                 // *pu4_sadmin
    mov       w8, #2
    str       w8 , [x6]                 // *u4_intra_mode = 2 (VERT)
    add       x6, x1, #18
    ld1       {v28.8b, v29.8b}, [x6]    // rows 0-3
    ld1       {v30.8b, v31.8b}, [x6]    // rows 4-7

do_dc_vert:
    // Store eight rows of 16 bytes: rows 0-3 from v28/v29, rows 4-7 from
    // v30/v31.
    st1       {v28.2s, v29.2s} , [x2], x4 //0
    st1       {v28.2s, v29.2s} , [x2], x4 //1
    st1       {v28.2s, v29.2s} , [x2], x4 //2
    st1       {v28.2s, v29.2s} , [x2], x4 //3
    st1       {v30.2s, v31.2s} , [x2], x4 //4
    st1       {v30.2s, v31.2s} , [x2], x4 //5
    st1       {v30.2s, v31.2s} , [x2], x4 //6
    st1       {v30.2s, v31.2s} , [x2], x4 //7

end_func:
    // Undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
467
468
469