1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@/*******************************************************************************
20@* @file
21@*  ihevcd_fmt_conv_420sp_to_rgba8888.s
22@*
23@* @brief
24@*  contains function definitions for format conversions
25@*
26@* @author
27@*  ittiam
28@*
29@* @par list of functions:
30@*
31@*
32@* @remarks
33@*  none
34@*
35@*******************************************************************************/
36    .equ DO1STROUNDING, 0                 @ assemble-time constant; not referenced in the visible part of this file
37
38    @ ARM
39    @
40    @ PRESERVE8
    @ NOTE(review): the two commented-out words above look like leftover
    @ armasm directives from an earlier toolchain - confirm before removing.
41
    @ Everything below is emitted into the code section, with the entry point
    @ aligned to a 2^2 = 4 byte boundary.
42.text
43.p2align 2
44
45
46
47
48@/*****************************************************************************
49@*                                                                            *
50@*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888()                    *
51@*                                                                            *
52@*  Description      : This function converts an image from the YUV420        *
53@*                     semi-planar color format to RGBA8888. The function     *
54@*                     can be invoked at the MB level.                        *
55@*                                                                            *
56@*  Arguments        : R0           pubY                                      *
57@*                     R1           pubUV                                     *
58@*                     R2           pusRGB                                    *
59@*                     R3           usWidth                                   *
60@*                     [R13 #104]   usHeight                                  *
61@*                     [R13 #108]   usStrideY                                 *
62@*                     [R13 #112]   usStrideUV                                *
63@*                     [R13 #116]   usStrideRGB                               *
64@*                     (stack offsets are relative to SP after the prologue   *
65@*                     has pushed R4-R12,LR and D8-D15, i.e. 104 bytes)       *
66@*                                                                            *
67@*  Values Returned  : None                                                   *
68@*                                                                            *
69@*  Register Usage   : R0 - R14                                               *
70@*                                                                            *
71@*  Stack Usage      : 104 Bytes                                              *
72@*                                                                            *
73@*  Interruptibility : Interruptible                                          *
74@*                                                                            *
75@*  Known Limitations                                                         *
76@*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
77@*                     greater than or equal to 16                            *
78@*                     Image Height:    Assumed to be even.                   *
79@*                                                                            *
80@*  Revision History :                                                        *
81@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
82@*         07 06 2010   Varshita        Draft                                 *
83@*         07 06 2010   Naveen Kr T     Completed                             *
84@*         05 08 2013   Naveen K P      Modified for HEVC                     *
85@*         30 10 2018   Saurabh Sood    Store D registers to stack            *
86@*****************************************************************************/
87    .global ihevcd_fmt_conv_420sp_to_rgba8888_a9q
88.type ihevcd_fmt_conv_420sp_to_rgba8888_a9q, function
89ihevcd_fmt_conv_420sp_to_rgba8888_a9q:
90
    @// Converts a YUV 4:2:0 semi-planar picture (one luma plane plus one plane
    @// of interleaved U,V bytes) into 4-byte-per-pixel output.
    @//
    @// Each pass of the height loop consumes two luma rows and one chroma row.
    @// Each pass of the width loop consumes 16 luma bytes from both rows and
    @// 8 U,V pairs, and writes 16 pixels (64 bytes) to each of the two output
    @// rows. One U,V pair is shared by a 2x2 group of luma samples.
    @//
    @// Every output pixel is written as the four bytes
    @//     sat8(Y + B weight), sat8(Y + G weight), sat8(Y + R weight), 0
    @// in increasing address order.
    @//
    @// Width must be a non-zero multiple of 16 and height a non-zero multiple
    @// of 2: both loop counters are decremented before they are tested, so a
    @// count of zero wraps around instead of terminating the loop.
91    @// push the registers on the stack
92    STMFD       SP!,{R4-R12,LR}             @// 10 core registers = 40 bytes
93    VPUSH       {d8-d15}                    @// 8 D registers = 64 bytes; D8-D15 (Q4-Q7) are overwritten below
94
95    @//R0 - Y PTR (FIRST ROW OF THE CURRENT ROW PAIR)
96    @//R1 - UV PTR
97    @//R2 - RGB PTR (FIRST ROW OF THE CURRENT ROW PAIR)
98    @//R3 - PIC WIDTH
99    @//R4 - NOT USED (THE WIDTH ARRIVES IN R3)
100    @//R5 - PIC HT, THEN THE ROW-PAIR LOOP COUNTER
101    @//R6 - STRIDE Y, THEN THE WIDTH LOOP COUNTER
102    @//R7 - STRIDE UV, THEN THE Y PTR FOR THE SECOND ROW
103    @//R8 - RGB PTR FOR THE SECOND ROW (NO SEPARATE V STRIDE IS LOADED)
104    @//R9 - STRIDE RGB (IN PIXELS)
    @//R10/R11/R14 - END-OF-ROW SKIPS (STRIDE - WIDTH) FOR Y, UV AND RGB
105
106    @//TWO LUMA ROWS (AND ONE CHROMA ROW) ARE PROCESSED PER HEIGHT ITERATION
107
108    @//THE FOUR CONSTANTS ARE:
109    @//C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
    @//THEY ARE USED AS SIGNED 16-BIT MULTIPLIERS (VMULL.S16) AND THE PRODUCTS
    @//ARE SHIFTED RIGHT BY 13, SO C2 AND C3 ACT AS NEGATIVE COEFFICIENTS
110
111    @PLD        [R0]
112    @PLD        [R1]
113    @PLD        [R2]
114
115
116    @/* can be loaded from a defined const type */
117    MOVW        R10,#0x3311
118    VMOV.16     D0[0],R10                   @//C1
119
120    MOVW        R10,#0xF379
121    VMOV.16     D0[1],R10                   @//C2
122
123    MOVW        R10,#0xE5F8
124    VMOV.16     D0[2],R10                   @//C3
125
126    MOVW        R10,#0x4092
127    VMOV.16     D0[3],R10                   @//C4
128
129    @//LOAD CONSTANT 128 INTO EVERY BYTE LANE OF NEON REGISTER D1
130    MOV         R10,#128
131    VDUP.8      D1,R10
132
133    @//D0 HAS C1-C2-C3-C4
134    @// load other parameters from stack
    @// (offset 104 = the 40 + 64 bytes pushed by the prologue, so #104 is the
    @//  first stack argument as seen by the caller)
135    LDR         R5,[sp,#104]                @// height
136    @LDR  R4,[sp,#44]
137    LDR         R6,[sp,#108]                @// luma stride
138    LDR         R7,[sp,#112]                @// chroma (interleaved UV) stride
139    @LDR  R8,[sp,#52]
140    LDR         R9,[sp,#116]                @// rgb stride, in pixels
141
142    @// calculate offsets, offset = stride - width
143    SUB         R10,R6,R3                   @// luma offset
144    SUB         R11,R7,R3                   @// chroma offset; one chroma row is 'width' bytes (width/2 UV pairs)
145    @, LSR #1   @// u offset
146    @SUB     R12,R8,R3, LSR #1  @// v offset
147    SUB         R14,R9,R3                   @// rgb offset in pixels
148
149    @// calculate height loop count
150    MOV         R5,R5, LSR #1               @// height_cnt = height / 2 (two rows per iteration)
151
152    @// create next row pointers for rgb and luma data
153    ADD         R7,R0,R6                    @// luma_next_row = luma + luma_stride
154    ADD         R8,R2,R9,LSL #2             @// rgb_next_row = rgb + rgb_stride * 4 bytes per pixel
155
156LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
157
158    @//LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
159    VLD1.8      {D2,D3},[R1]!               @//LOAD 16 BYTES = 8 INTERLEAVED U,V PAIRS
160    @//VLD1.8 {D3},[R2]!            @//LOAD 8 VALUES OF V
161
162    @// calculate width loop count
163    MOV         R6,R3, LSR #4               @// width_cnt = width / 16
164
165    @//COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
166    @//LOAD VALUES OF Y 8-BIT VALUES
167    VLD2.8      {D30,D31},[R0]!             @//D30 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
168                                            @//D31 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
169    VLD2.8      {D28,D29},[R7]!             @//D28 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
170                                            @//D29 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
171
    @// The last 16-pixel group of a row is handled by the copy of the loop
    @// body at WIDTH_LOOP_SKIP, which issues no further loads; the loop below
    @// therefore runs width_cnt - 1 times.
172    SUBS        R6,R6,#1
173    BEQ         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP
174
175LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
176    @VMOV.I8 Q1,#128
177    VUZP.8      D2,D3                       @//DE-INTERLEAVE: D2 = EVEN BYTES (U), D3 = ODD BYTES (V)
178
179
180    @//NEED TO SUBTRACT (U-128) AND (V-128)
181    @//(D2-D1),(D3-D1)
182    VSUBL.U8    Q2,D2,D1                    @//(U-128)
183    VSUBL.U8    Q3,D3,D1                    @//(V-128)
184
185    @//LOAD VALUES OF U&V for the next 16-pixel group (D2,D3 are free again)
186    VLD1.8      {D2,D3},[R1]!               @//LOAD NEXT 8 INTERLEAVED U,V PAIRS
187    @//VLD1.8 {D3},[R2]!            @//LOAD 8 VALUES OF V
188
189    @PLD        [R0]
190    PLD         [R1]
191
192    @//NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
193    VMULL.S16   Q4,D4,D0[3]                 @//(U-128)*C4 FOR B
194    VMULL.S16   Q5,D5,D0[3]                 @//(U-128)*C4 FOR B
195
196    VMULL.S16   Q10,D6,D0[0]                @//(V-128)*C1 FOR R
197    VMULL.S16   Q11,D7,D0[0]                @//(V-128)*C1 FOR R
198
199    VMULL.S16   Q6,D4,D0[1]                 @//(U-128)*C2 FOR G
200    VMLAL.S16   Q6,D6,D0[2]                 @//Q6 = (U-128)*C2 + (V-128)*C3
201    VMULL.S16   Q7,D5,D0[1]                 @//(U-128)*C2 FOR G
202    VMLAL.S16   Q7,D7,D0[2]                 @//Q7 = (U-128)*C2 + (V-128)*C3
203
    @//THE NARROWING SHIFTS WRITE D8-D13, I.E. THEY OVERWRITE Q4,Q5,Q6 IN
    @//PLACE; EACH 32-BIT SOURCE IS READ BEFORE ITS REGISTER IS REUSED
204    @//NARROW RIGHT SHIFT BY 13 FOR R&B
205    VQSHRN.S32  D8,Q4,#13                   @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
206    VQSHRN.S32  D9,Q5,#13                   @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
207    @//Q4 - WEIGHT FOR B
208
209    @//NARROW RIGHT SHIFT BY 13 FOR R&B
210    VQSHRN.S32  D10,Q10,#13                 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
211    VQSHRN.S32  D11,Q11,#13                 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
212    @//Q5 - WEIGHT FOR R
213
214    @//NARROW RIGHT SHIFT BY 13 FOR G
215    VQSHRN.S32  D12,Q6,#13                  @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
216    VQSHRN.S32  D13,Q7,#13                  @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
217    @//Q6 - WEIGHT FOR G
218
    @//ROW 1: THE SAME 8 WEIGHTS ARE ADDED TO THE EVEN PIXELS (D30) AND TO
    @//THE ODD PIXELS (D31), SO EACH U,V PAIR COVERS TWO ADJACENT PIXELS
219    VADDW.U8    Q7,Q4,D30                   @//Q7 - HAS Y + B
220    VADDW.U8    Q8,Q5,D30                   @//Q8 - HAS Y + R
221    VADDW.U8    Q9,Q6,D30                   @//Q9 - HAS Y + G
222
223    VADDW.U8    Q10,Q4,D31                  @//Q10 - HAS Y + B
224    VADDW.U8    Q11,Q5,D31                  @//Q11 - HAS Y + R
225    VADDW.U8    Q12,Q6,D31                  @//Q12 - HAS Y + G
226
    @//SATURATE TO UNSIGNED 8 BITS AND BUILD 4-BYTE PIXELS FOR THE EVEN
    @//PIXELS: (Q7 = D14,D15  Q8 = D16,D17)
227    VQMOVUN.S16 D14,Q7                      @//D14 = B, 8 EVEN PIXELS
228    VQMOVUN.S16 D15,Q9                      @//D15 = G
229    VQMOVUN.S16 D16,Q8                      @//D16 = R
230    VMOV.I8     D17,#0                      @//D17 = 0, THE FOURTH BYTE OF EVERY PIXEL
231
232    VZIP.8      D14,D15                     @//D14,D15 = B,G BYTE PAIRS
233    VZIP.8      D16,D17                     @//D16,D17 = R,0 BYTE PAIRS
234    VZIP.16     Q7,Q8                       @//Q7,Q8 = B,G,R,0 FOR THE 8 EVEN PIXELS
235
236
    @//SAME FOR THE ODD PIXELS: (Q10 = D20,D21  Q11 = D22,D23)
237    VQMOVUN.S16 D20,Q10
238    VQMOVUN.S16 D21,Q12
239    VQMOVUN.S16 D22,Q11
240    VMOV.I8     D23,#0
241
242    VZIP.8      D20,D21
243    VZIP.8      D22,D23
244    VZIP.16     Q10,Q11                     @//Q10,Q11 = B,G,R,0 FOR THE 8 ODD PIXELS
245
    @//INTERLEAVE EVEN AND ODD PIXELS BACK INTO RASTER ORDER:
    @//Q7 = PIXELS 0-3, Q10 = 4-7, Q8 = 8-11, Q11 = 12-15
246    VZIP.32     Q7,Q10
247    VZIP.32     Q8,Q11
248
    @//STORE 16 PIXELS (64 BYTES) OF ROW 1
249    VST1.32     D14,[R2]!
250    VST1.32     D15,[R2]!
251    VST1.32     D20,[R2]!
252    VST1.32     D21,[R2]!
253    VST1.32     D16,[R2]!
254    VST1.32     D17,[R2]!
255    VST1.32     D22,[R2]!
256    VST1.32     D23,[R2]!
257
258    @//ROW 2: REUSE THE SAME B,R,G WEIGHTS (Q4,Q5,Q6) WITH THE LUMA OF THE
259    @//SECOND ROW (D28 = EVEN PIXELS, D29 = ODD PIXELS)
260    VADDW.U8    Q7,Q4,D28                   @//Q7 - HAS Y + B
261    VADDW.U8    Q8,Q5,D28                   @//Q8 - HAS Y + R
262    VADDW.U8    Q9,Q6,D28                   @//Q9 - HAS Y + G
263
264    VADDW.U8    Q10,Q4,D29                  @//Q10 - HAS Y + B
265    VADDW.U8    Q11,Q5,D29                  @//Q11 - HAS Y + R
266    VADDW.U8    Q12,Q6,D29                  @//Q12 - HAS Y + G
267
268    @//D28-D31 HAVE ALL BEEN CONSUMED, SO THE LUMA OF THE NEXT 16-PIXEL
269    @//GROUP IS LOADED HERE FOR BOTH ROWS
270    VLD2.8      {D30,D31},[R0]!             @//D30 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
271                                            @//D31 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
272    VLD2.8      {D28,D29},[R7]!             @//D28 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
273                                            @//D29 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
274
275    PLD         [R0]
276    PLD         [R7]
277
    @//SATURATE, INTERLEAVE AND STORE ROW 2 EXACTLY AS FOR ROW 1
278    VQMOVUN.S16 D14,Q7
279    VQMOVUN.S16 D15,Q9
280    VQMOVUN.S16 D16,Q8
281    VMOV.I8     D17,#0
282
283    VZIP.8      D14,D15
284    VZIP.8      D16,D17
285    VZIP.16     Q7,Q8
286
287
288    VQMOVUN.S16 D20,Q10
289    VQMOVUN.S16 D21,Q12
290    VQMOVUN.S16 D22,Q11
291    VMOV.I8     D23,#0
292
293    VZIP.8      D20,D21
294    VZIP.8      D22,D23
295    VZIP.16     Q10,Q11
296
297    VZIP.32     Q7,Q10
298    VZIP.32     Q8,Q11
299
300    VST1.32     D14,[R8]!
301    VST1.32     D15,[R8]!
302    VST1.32     D20,[R8]!
303    VST1.32     D21,[R8]!
304    VST1.32     D16,[R8]!
305    VST1.32     D17,[R8]!
306    VST1.32     D22,[R8]!
307    VST1.32     D23,[R8]!
308
309    SUBS        R6,R6,#1                    @// width_cnt -= 1
310    BNE         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
311
312LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    @// Last 16-pixel group of the row pair: same arithmetic as the loop body
    @// above, but without the loads of the next chroma/luma data, so nothing
    @// is read past the end of the current rows.
313    @VMOV.I8 Q1,#128
314    VUZP.8      D2,D3                       @//DE-INTERLEAVE: D2 = EVEN BYTES (U), D3 = ODD BYTES (V)
315
316
317    @//NEED TO SUBTRACT (U-128) AND (V-128)
318    @//(D2-D1),(D3-D1)
319    VSUBL.U8    Q2,D2,D1                    @//(U-128)
320    VSUBL.U8    Q3,D3,D1                    @//(V-128)
321
322
323    @//NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
324    VMULL.S16   Q4,D4,D0[3]                 @//(U-128)*C4 FOR B
325    VMULL.S16   Q5,D5,D0[3]                 @//(U-128)*C4 FOR B
326
327    VMULL.S16   Q10,D6,D0[0]                @//(V-128)*C1 FOR R
328    VMULL.S16   Q11,D7,D0[0]                @//(V-128)*C1 FOR R
329
330    VMULL.S16   Q6,D4,D0[1]                 @//(U-128)*C2 FOR G
331    VMLAL.S16   Q6,D6,D0[2]                 @//Q6 = (U-128)*C2 + (V-128)*C3
332    VMULL.S16   Q7,D5,D0[1]                 @//(U-128)*C2 FOR G
333    VMLAL.S16   Q7,D7,D0[2]                 @//Q7 = (U-128)*C2 + (V-128)*C3
334
335    @//NARROW RIGHT SHIFT BY 13 FOR R&B
336    VQSHRN.S32  D8,Q4,#13                   @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
337    VQSHRN.S32  D9,Q5,#13                   @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
338    @//Q4 - WEIGHT FOR B
339
340    @//NARROW RIGHT SHIFT BY 13 FOR R&B
341    VQSHRN.S32  D10,Q10,#13                 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
342    VQSHRN.S32  D11,Q11,#13                 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
343    @//Q5 - WEIGHT FOR R
344
345    @//NARROW RIGHT SHIFT BY 13 FOR G
346    VQSHRN.S32  D12,Q6,#13                  @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
347    VQSHRN.S32  D13,Q7,#13                  @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
348    @//Q6 - WEIGHT FOR G
349
    @//ROW 1: D30 = EVEN PIXELS, D31 = ODD PIXELS
350    VADDW.U8    Q7,Q4,D30                   @//Q7 - HAS Y + B
351    VADDW.U8    Q8,Q5,D30                   @//Q8 - HAS Y + R
352    VADDW.U8    Q9,Q6,D30                   @//Q9 - HAS Y + G
353
354    VADDW.U8    Q10,Q4,D31                  @//Q10 - HAS Y + B
355    VADDW.U8    Q11,Q5,D31                  @//Q11 - HAS Y + R
356    VADDW.U8    Q12,Q6,D31                  @//Q12 - HAS Y + G
357
358    VQMOVUN.S16 D14,Q7
359    VQMOVUN.S16 D15,Q9
360    VQMOVUN.S16 D16,Q8
361    VMOV.I8     D17,#0
362
363    VZIP.8      D14,D15
364    VZIP.8      D16,D17
365    VZIP.16     Q7,Q8
366
367
368    VQMOVUN.S16 D20,Q10
369    VQMOVUN.S16 D21,Q12
370    VQMOVUN.S16 D22,Q11
371    VMOV.I8     D23,#0
372
373    VZIP.8      D20,D21
374    VZIP.8      D22,D23
375    VZIP.16     Q10,Q11
376
377    VZIP.32     Q7,Q10
378    VZIP.32     Q8,Q11
379
380    VST1.32     D14,[R2]!
381    VST1.32     D15,[R2]!
382    VST1.32     D20,[R2]!
383    VST1.32     D21,[R2]!
384    VST1.32     D16,[R2]!
385    VST1.32     D17,[R2]!
386    VST1.32     D22,[R2]!
387    VST1.32     D23,[R2]!
388
389    @//ROW 2: REUSE THE SAME B,R,G WEIGHTS (Q4,Q5,Q6) WITH THE LUMA OF THE
390    @//SECOND ROW (D28 = EVEN PIXELS, D29 = ODD PIXELS)
391    VADDW.U8    Q7,Q4,D28                   @//Q7 - HAS Y + B
392    VADDW.U8    Q8,Q5,D28                   @//Q8 - HAS Y + R
393    VADDW.U8    Q9,Q6,D28                   @//Q9 - HAS Y + G
394
395    VADDW.U8    Q10,Q4,D29                  @//Q10 - HAS Y + B
396    VADDW.U8    Q11,Q5,D29                  @//Q11 - HAS Y + R
397    VADDW.U8    Q12,Q6,D29                  @//Q12 - HAS Y + G
398
399
400    VQMOVUN.S16 D14,Q7
401    VQMOVUN.S16 D15,Q9
402    VQMOVUN.S16 D16,Q8
403    VMOV.I8     D17,#0
404
405    VZIP.8      D14,D15
406    VZIP.8      D16,D17
407    VZIP.16     Q7,Q8
408
409
410    VQMOVUN.S16 D20,Q10
411    VQMOVUN.S16 D21,Q12
412    VQMOVUN.S16 D22,Q11
413    VMOV.I8     D23,#0
414
415    VZIP.8      D20,D21
416    VZIP.8      D22,D23
417    VZIP.16     Q10,Q11
418
419    VZIP.32     Q7,Q10
420    VZIP.32     Q8,Q11
421
422    VST1.32     D14,[R8]!
423    VST1.32     D15,[R8]!
424    VST1.32     D20,[R8]!
425    VST1.32     D21,[R8]!
426    VST1.32     D16,[R8]!
427    VST1.32     D17,[R8]!
428    VST1.32     D22,[R8]!
429    VST1.32     D23,[R8]!
430
431    @// Adjust the address pointers
    @// R7/R8 now point just past the last pixel of the second row, so adding
    @// the end-of-row skip lands on the first row of the next row pair.
432    ADD         R0,R7,R10                   @// luma = luma_next + offset
433    ADD         R2,R8,R14,LSL #2            @// rgb = rgb_next + offset (pixels * 4 bytes)
434
435    ADD         R7,R0,R3                    @// luma_next = luma + width
436    ADD         R8,R2,R3,LSL #2             @// rgb_next_row = rgb + width
437
438    ADD         R1,R1,R11                   @// adjust uv pointer to the start of the next chroma row
439    @ADD        R2,R2,R12           @// adjust v pointer
440
    @// width + offset == stride; the strides themselves are no longer held
    @// in registers (R6 and R7 were reused), hence the two-step addition.
441    ADD         R7,R7,R10                   @// luma_next = luma + width + offset (because of register crunch)
442    ADD         R8,R8,R14,LSL #2            @// rgb_next_row = rgb + width + offset
443
444    SUBS        R5,R5,#1                    @// height_cnt -= 1
445
446    BNE         LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
447
448    @//POP THE REGISTERS
449    VPOP        {d8-d15}
450    LDMFD       SP!,{R4-R12,PC}             @// loading PC with the saved LR returns to the caller
451
452
    @ Empty note section declared with no flags (in particular without "x").
    @ GNU linkers treat its presence as a statement that this object does not
    @ need an executable stack.
453    .section .note.GNU-stack,"",%progbits
454