///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* @file
//*  ihevcd_fmt_conv_420sp_to_rgba8888.s
//*
//* @brief
//*  contains function definitions for format conversions
//*
//* @author
//*  ittiam
//*
//* @par list of functions:
//*  - ihevcd_fmt_conv_420sp_to_rgba8888_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************/

37    .equ DO1STROUNDING, 0
38
39    // ARM
40    //
41    // PRESERVE8
42
43.text
44.p2align 2
45
46.include "ihevc_neon_macros.s"
47
48
49
///*****************************************************************************
//*
//*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888_av8()
//*
//*  Description      : Converts a 4:2:0 semi-planar picture (one luma plane
//*                     plus one plane of interleaved chroma bytes) into
//*                     32-bit pixels. Two picture rows are converted per
//*                     outer iteration (they share one chroma row) and 16
//*                     pixels of each row per inner iteration.
//*
//*                     Per pixel, with C1..C4 the signed Q13 constants below:
//*                       R = sat8(Y + (((V-128)*C1)              >> 13))
//*                       G = sat8(Y + (((U-128)*C2 + (V-128)*C3) >> 13))
//*                       B = sat8(Y + (((U-128)*C4)              >> 13))
//*                     The shift truncates (no rounding). Bytes are written
//*                     to memory in the order B, G, R, 0 - the fourth byte
//*                     of every pixel is always zero.
//*
//*  Arguments        : x0   luma pointer (first row)
//*                     x1   chroma pointer; even bytes are used as U and
//*                          odd bytes as V
//*                     x2   destination pointer (first row)
//*                     w3   width in pixels
//*                     w4   height in rows
//*                     w5   luma stride in bytes
//*                     w6   chroma stride in bytes
//*                     w7   destination stride in pixels (4 bytes each)
//*                     NOTE(review): w3-w7 are assumed to be 32-bit signed
//*                     ints in the C prototype (not visible in this file) -
//*                     confirm. They are sign-extended on entry because
//*                     AAPCS64 does not define the upper 32 bits of such
//*                     arguments. No stack-passed argument is read, so a
//*                     ninth parameter, if the prototype has one, is ignored.
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x0-x3, x5-x14 and flags; v0-v7, v12, v14-v31.
//*                     d12, d14 and d15 (callee-saved) are preserved.
//*
//*  Stack Usage      : 32 bytes
//*
//*  Known Limitations
//*       Assumptions: Image Width:  multiple of 16 and >= 16. Anything
//*                                  smaller than 16 returns immediately;
//*                                  other non-multiples are NOT supported.
//*                    Image Height: assumed even (an odd last row is not
//*                                  converted). Heights below 2 return
//*                                  immediately.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*         05 08 2013   Naveen K P      Modified for HEVC
//*****************************************************************************/

    // Signed 16-bit conversion constants in Q13 (value / 8192).
    .equ YUV2RGB_C1_V_TO_R, 0x3311          // +13073
    .equ YUV2RGB_C2_U_TO_G, 0xF379          //  -3207
    .equ YUV2RGB_C3_V_TO_G, 0xE5F8          //  -6664
    .equ YUV2RGB_C4_U_TO_B, 0x4092          // +16530
    .equ YUV2RGB_Q_SHIFT,   13

    // -------------------------------------------------------------------------
    // Turns 8 interleaved chroma pairs into the three colour offsets that are
    // later added to luma.
    //   in      : v2/v3 = 16 chroma bytes as loaded by LD1
    //             v0.h[0..3] = C1..C4, v1.8b = 128
    //   out     : v5.8h = B offsets, v7.8h = R offsets, v12.8h = G offsets
    //   clobbers: v3, v4, v6, v14, v20, v22, v27 (v2 is left untouched)
    // -------------------------------------------------------------------------
.macro fmt_conv_rgba8888_chroma_offsets
    uzp1        v27.8b, v2.8b, v3.8b        // even bytes -> U0..U7
    uzp2        v3.8b,  v2.8b, v3.8b        // odd bytes  -> V0..V7

    usubl       v4.8h, v27.8b, v1.8b        // U - 128 (16-bit two's complement)
    usubl       v6.8h, v3.8b,  v1.8b        // V - 128

    smull       v5.4s,  v4.4h, v0.h[3]      // (U-128)*C4, lanes 0-3
    smull2      v7.4s,  v4.8h, v0.h[3]      // (U-128)*C4, lanes 4-7

    smull       v20.4s, v6.4h, v0.h[0]      // (V-128)*C1, lanes 0-3
    smull2      v22.4s, v6.8h, v0.h[0]      // (V-128)*C1, lanes 4-7

    smull       v12.4s, v4.4h, v0.h[1]      // (U-128)*C2
    smlal       v12.4s, v6.4h, v0.h[2]      //          + (V-128)*C3, lanes 0-3
    smull2      v14.4s, v4.8h, v0.h[1]
    smlal2      v14.4s, v6.8h, v0.h[2]      // same for lanes 4-7

    // Drop the Q13 fraction and saturate to 16 bits. v7 is consumed as a
    // source before it is reused as the R destination.
    sqshrn      v5.4h,  v5.4s,  #YUV2RGB_Q_SHIFT
    sqshrn2     v5.8h,  v7.4s,  #YUV2RGB_Q_SHIFT    // v5  = B offsets
    sqshrn      v7.4h,  v20.4s, #YUV2RGB_Q_SHIFT
    sqshrn2     v7.8h,  v22.4s, #YUV2RGB_Q_SHIFT    // v7  = R offsets
    sqshrn      v12.4h, v12.4s, #YUV2RGB_Q_SHIFT
    sqshrn2     v12.8h, v14.4s, #YUV2RGB_Q_SHIFT    // v12 = G offsets
.endm

    // -------------------------------------------------------------------------
    // Converts 16 luma samples of one row and stores 16 pixels (64 bytes).
    //   y_even  : 8-byte vector holding Y0,Y2,..,Y14 (e.g. v30.8b)
    //   y_odd   : 8-byte vector holding Y1,Y3,..,Y15 (e.g. v31.8b)
    //   dst     : X register with the store address, post-incremented by 64
    //   in      : v5/v7/v12 = B/R/G offsets, v17 = all zero
    //   clobbers: v14-v16, v18-v27 (v17 is only read)
    // Lane i of both luma vectors uses chroma lane i, i.e. two horizontally
    // adjacent pixels share one U/V pair.
    // -------------------------------------------------------------------------
.macro fmt_conv_rgba8888_store_16px y_even, y_odd, dst
    uaddw       v14.8h, v5.8h,  \y_even     // Y + B offset, even pixels
    uaddw       v16.8h, v7.8h,  \y_even     // Y + R offset
    uaddw       v18.8h, v12.8h, \y_even     // Y + G offset

    uaddw       v20.8h, v5.8h,  \y_odd      // same for the odd pixels
    uaddw       v22.8h, v7.8h,  \y_odd
    uaddw       v24.8h, v12.8h, \y_odd

    // Clamp the signed sums to 0..255.
    sqxtun      v14.8b, v14.8h              // B even
    sqxtun      v15.8b, v18.8h              // G even
    sqxtun      v16.8b, v16.8h              // R even
    sqxtun      v20.8b, v20.8h              // B odd
    sqxtun      v21.8b, v24.8h              // G odd
    sqxtun      v22.8b, v22.8h              // R odd

    // Interleave bytes -> {B,G} and {R,0} halfwords (8 of each per vector).
    zip1        v14.16b, v14.16b, v15.16b   // B,G  even pixels
    zip1        v16.16b, v16.16b, v17.16b   // R,0  even pixels
    zip1        v20.16b, v20.16b, v21.16b   // B,G  odd pixels
    zip1        v22.16b, v22.16b, v17.16b   // R,0  odd pixels

    // Interleave halfwords -> complete 32-bit pixels.
    zip1        v27.8h, v14.8h, v16.8h      // even pixels 0,2,4,6
    zip2        v26.8h, v14.8h, v16.8h      // even pixels 8,10,12,14
    zip1        v25.8h, v20.8h, v22.8h      // odd pixels  1,3,5,7
    zip2        v19.8h, v20.8h, v22.8h      // odd pixels  9,11,13,15

    // Interleave words -> pixels back in raster order.
    zip1        v20.4s, v27.4s, v25.4s      // pixels 0-3
    zip2        v21.4s, v27.4s, v25.4s      // pixels 4-7
    zip1        v22.4s, v26.4s, v19.4s      // pixels 8-11
    zip2        v23.4s, v26.4s, v19.4s      // pixels 12-15

    st1         {v20.4s, v21.4s, v22.4s, v23.4s}, [\dst], #64
.endm

    .global ihevcd_fmt_conv_420sp_to_rgba8888_av8
.type ihevcd_fmt_conv_420sp_to_rgba8888_av8, function
ihevcd_fmt_conv_420sp_to_rgba8888_av8:

    // Register roles after this setup block:
    //   x0  luma pointer, row 2k          x7  luma pointer, row 2k+1
    //   x2  dest pointer, row 2k          x8  dest pointer, row 2k+1
    //   x1  chroma pointer                x3  width in pixels
    //   x5  remaining row pairs           x6  remaining 16-pixel groups
    //   x10 luma   stride - width  (bytes to skip after a row)
    //   x11 chroma stride - width
    //   x12 luma stride in bytes          x13 dest stride in bytes
    //   x14 (dest stride - width) * 4

    // Widen the 32-bit arguments before any 64-bit arithmetic uses them.
    // w5 must be read before x5 is overwritten with the height.
    sxtw        x3,  w3                     // width
    sxtw        x12, w5                     // luma stride
    sxtw        x5,  w4                     // height
    sxtw        x11, w6                     // chroma stride
    sxtw        x13, w7                     // dest stride (pixels)

    // Both loop counters are decremented before they are tested, so a zero
    // count would wrap around. Reject such sizes while nothing has been
    // pushed yet.
    cmp         x3, #16
    b.lt        LABEL_YUV420SP_TO_RGB8888_EXIT
    cmp         x5, #2
    b.lt        LABEL_YUV420SP_TO_RGB8888_EXIT

    // v12, v14 and v15 are used below and their low halves are callee-saved.
    // One 32-byte frame keeps sp 16-byte aligned without a dummy register.
    stp         d12, d14, [sp, #-32]!
    str         d15, [sp, #16]

    // v0.h[0..3] = C1..C4
    movz        w9, #YUV2RGB_C1_V_TO_R
    mov         v0.h[0], w9
    movz        w9, #YUV2RGB_C2_U_TO_G
    mov         v0.h[1], w9
    movz        w9, #YUV2RGB_C3_V_TO_G
    mov         v0.h[2], w9
    movz        w9, #YUV2RGB_C4_U_TO_B
    mov         v0.h[3], w9

    movi        v1.8b,  #128                // chroma bias
    movi        v17.16b, #0                 // fourth byte of every pixel

    // offset = stride - width: what is left of a row after width was consumed
    sub         x10, x12, x3                // luma
    sub         x11, x11, x3                // chroma (one U/V pair per two pixels => width bytes per row)
    sub         x14, x13, x3                // dest, still in pixels
    lsl         x13, x13, #2                // dest stride in bytes
    lsl         x14, x14, #2                // dest offset in bytes

    lsr         x5, x5, #1                  // row pairs = height / 2

    add         x7, x0, x12                 // second luma row
    add         x8, x2, x13                 // second dest row

LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:

    // Prime the pipeline with the first group of this row pair.
    ld1         {v2.8b, v3.8b}, [x1], #16   // 8 U/V pairs
    lsr         x6, x3, #4                  // groups = width / 16
    ld2         {v30.8b, v31.8b}, [x0], #16 // row 2k  : v30 = even Y, v31 = odd Y
    ld2         {v28.8b, v29.8b}, [x7], #16 // row 2k+1: v28 = even Y, v29 = odd Y

    subs        x6, x6, #1
    b.eq        LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP   // only one group in the row

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    fmt_conv_rgba8888_chroma_offsets

    // v2/v3 are free again: fetch the chroma of the next group.
    ld1         {v2.8b, v3.8b}, [x1], #16
    prfm        PLDL1KEEP, [x1]

    fmt_conv_rgba8888_store_16px v30.8b, v31.8b, x2
    fmt_conv_rgba8888_store_16px v28.8b, v29.8b, x8

    // Luma of the next group for both rows.
    ld2         {v30.8b, v31.8b}, [x0], #16
    ld2         {v28.8b, v29.8b}, [x7], #16
    prfm        PLDL1KEEP, [x0]
    prfm        PLDL1KEEP, [x7]

    subs        x6, x6, #1
    b.ne        LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    // Last group of the row pair: same work, but nothing further is loaded,
    // so no bytes beyond the current rows are read.
    fmt_conv_rgba8888_chroma_offsets
    fmt_conv_rgba8888_store_16px v30.8b, v31.8b, x2
    fmt_conv_rgba8888_store_16px v28.8b, v29.8b, x8

    // x7/x8 now sit at the end of row 2k+1; skipping the rest of that row
    // lands on row 2k+2.
    add         x0, x7, x10                 // luma, row 2k+2
    add         x2, x8, x14                 // dest, row 2k+2
    add         x1, x1, x11                 // next chroma row
    add         x7, x0, x12                 // luma, row 2k+3
    add         x8, x2, x13                 // dest, row 2k+3

    subs        x5, x5, #1
    b.ne        LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP

    ldr         d15, [sp, #16]
    ldp         d12, d14, [sp], #32

LABEL_YUV420SP_TO_RGB8888_EXIT:
    ret




527    .section .note.GNU-stack,"",%progbits
528
529