1@/******************************************************************************
2@ *
3@ * Copyright (C) 2018 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20
21.text
22.align 4
23
24@/**
25@/*******************************************************************************
26@/*
27@/* @brief
28@/*  Residue calculation and Forward Transform for 4x4 block with 8-bit input
29@/*
30@/* @par Description:
31@/*  Performs residue calculation by subtracting source and  prediction and
32@/*  followed by forward transform
33@/*
34@/* @param[in] pu1_src
35@/*  Input 4x4 pixels
36@/*
37@/* @param[in] pu1_pred
38@/*  Prediction data
39@/*
40@/* @param[in] pi4_tmp
41@/*  Temporary buffer of size 4x4
42@/*
43@/* @param[out] pi2_dst
44@/*  Output 4x4 coefficients
45@/*
46@/* @param[in] src_strd
47@/*  Input stride
48@/*
49@/* @param[in] pred_strd
50@/*  Prediction Stride
51@/*
52@/* @param[in] dst_strd_chr_flag
53@/*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
54@/*
55@/* @returns  Void
56@/*
57@/* @remarks
58@/*  None
59@/*
60@/*******************************************************************************
61@/*/
62
63@/**************Variables Vs Registers*****************************************
64@    r0 => *pu1_src
65@    r1 => *pu1_pred
66@    r2 => *pi4_temp
67@    r3 => *pi2_dst
68@    r4 => src_strd
69@    r5 => pred_strd
70@    r6 => dst_strd_chr_flag
71
72    .global ihevc_resi_trans_4x4_a9q
73
74ihevc_resi_trans_4x4_a9q:
75
76    STMFD          sp!, {r4-r7, r14}   @ store all the register components from caller function to memory
77    LDR            r4, [sp,#20]        @ r4 contains src_strd
78    LDR            r5, [sp,#24]        @ r5 contains pred_strd
79    LDR            r6, [sp,#28]        @ r6 contains dst_strd_chr_flag
80
81    ANDS           r7, r6, #1          @check for chroma flag, if present interleaved data
82    CMP            r7, #0
83    BEQ            NON_INTERLEAVE_LOAD @if flag == 0, use non-interleaving loads
84
85    VLD1.64        d0, [r0], r4        @ load row 0 src
86    VLD1.64        d4, [r0], r4        @ load row 1 src
87    VLD1.64        d1, [r0], r4        @ load row 2 src
88    VLD1.64        d5, [r0], r4        @ load row 3 src
89    VUZP.8         d0, d4              @ de-interleaving unzip instruction to get luma data of pu1_src in d0
90    VUZP.8         d1, d5              @ de-interleaving unzip instruction to get luma data of pu1_src in d1
91
92    VLD1.64        d2, [r1], r5        @ load row 0 pred
93    VLD1.64        d6, [r1], r5        @ load row 1 pred
94    VLD1.64        d3, [r1], r5        @ load row 2 pred
95    VLD1.64        d7, [r1], r5        @ load row 3 pred
96    VUZP.8         d2, d6              @ de-interleaving unzip instruction to get luma data of pu1_pred in d2
97    VUZP.8         d3, d7              @ de-interleaving unzip instruction to get luma data of pu1_pred in d3
98
99    B LOAD_END
100
101NON_INTERLEAVE_LOAD:
102    VLD1.U32     d0[0], [r0], r4       @ load row 0 src
103    VLD1.U32     d0[1], [r0], r4       @ load row 1 src
104    VLD1.U32     d1[0], [r0], r4       @ load row 2 src
105    VLD1.U32     d1[1], [r0], r4       @ load row 3 src
106
107    VLD1.U32     d2[0], [r1], r5       @ load row 0 pred
108    VLD1.U32     d2[1], [r1], r5       @ load row 1 pred
109    VLD1.U32     d3[0], [r1], r5       @ load row 2 pred
110    VLD1.U32     d3[1], [r1], r5       @ load row 3 pred
111
112LOAD_END:
113    @ Finding the residue
114    VSUBL.U8    q2, d0, d2             @ q2 contains 1st 16-bit 8 residues
115    VSUBL.U8    q3, d1, d3             @ q3 contains 2nd 16-bit 8 residues
116
117    @ SAD caculation
118    VABDL.U8    q12, d0, d2            @ q12 contains absolute differences
119    VABAL.U8    q12, d1, d3            @ q12 accumulates absolute differences
120    VADD.U16    d26, d24, d25          @ add d-registers of q12
121    VPADDL.U16  d27, d26               @ d27 contains 2 32-bit values that have to be added
122    VPADDL.U32  d28, d27               @ d28 contains 64-bit SAD, only LSB important
123    VMOV.32     r0, d28[0]             @ SAD stored in r0 for return
124    @ SAD caculation ends
125
126    @ Forward transform - step 1
127    VMOV.I16    d2, #64                @ generate immediate constant in d2 for even row multiplication
128    VTRN.16     d4, d5                 @ 3-step transpose of residue matrix starts
129    VTRN.16     d6, d7                 @ 2nd step of the 3-step matrix transpose
130    VMOV.I16    d0, #83                @ generate immediate constant in d0 for odd row multiplication
131    VTRN.32     q2, q3                 @ Final step of matrix transpose
132
133    VMOV.I16    d1, #36                @ generate immediate constant in d1 for odd row multiplication
134    VSWP        d6, d7                 @ vector swap to allow even and odd row calculation using Q registers
135    VADD.S16    q10, q2, q3            @ q4 has the even array
136    VSUB.S16    q11, q2, q3            @ q5 has the odd array
137    VMULL.S16   q12, d20, d2           @ e[0]*64
138    VMLAL.S16   q12, d21, d2[0]        @ row 1 of results: e[0]*64 + e[1]*64
139    VMULL.S16   q13, d20, d2           @ e[0]*64
140    VMLSL.S16   q13, d21, d2[0]        @ row 3 of results: e[0]*64 - e[1]*64
141    VMULL.S16   q8, d22, d0            @ o[0]*83
142    VMLAL.S16   q8, d23, d1[0]         @ row 2 of results: o[0]*83 + o[1]*36
143    VMULL.S16   q9, d22, d1            @ o[0]*36
144    VMLSL.S16   q9, d23, d0[0]         @ row 4 of results: o[0]*36 - o[1]*83
145
146    @ Forward transform - step 2
147    VMOV.I32    d2, #64                @ generate immediate constant in d2 for even row multiplication
148    VMOV.I32    d0, #83                @ generate immediate constant in d0 for odd row multiplication
149    VTRN.32     q12, q8                @ 4-step transpose of residue matrix starts
150    VTRN.32     q13, q9                @ 2nd step of the 4-step matrix transpose
151
152    VMOV.I32    d1, #36                @ generate immediate constant in d1 for odd row multiplication
153    VSWP        d25, d26               @ 3rd step of the 4-step matrix transpose
154    VSWP        d17, d18               @ 4th step of the 4-step matrix transpose
155    VADD.S32    q2, q12, q9            @ e[0]
156    VADD.S32    q3, q8, q13            @ e[1]
157    VSUB.S32    q10, q12, q9           @ o[0]
158    VSUB.S32    q11, q8, q13           @ o[1]
159
160    VMUL.S32    q12, q2, d2[0]         @ e[0]*64
161    VMLA.S32    q12, q3, d2[0]         @ row 1 of results: e[0]*64 + e[1]*64
162    VMUL.S32    q13, q2, d2[0]         @ e[1]*64
163    VMLS.S32    q13, q3, d2[0]         @ row 3 of results: e[0]*64 - e[1]*64
164    VMUL.S32    q8, q10, d0[0]         @ o[0]*83
165    VMLA.S32    q8, q11, d1[0]         @ row 2 of results: o[0]*83 + o[1]*36
166    VMUL.S32    q9, q10, d1[0]         @ o[0]*36
167    VMLS.S32    q9, q11, d0[0]         @ row 4 of results: o[0]*36 - o[1]*83
168
169    VRSHRN.S32  d0, q12, #9            @ (row1 + 256)/512
170    VRSHRN.S32  d1, q8, #9             @ (row2 + 256)/512
171    VRSHRN.S32  d2, q13, #9            @ (row3 + 256)/512
172    VRSHRN.S32  d3, q9, #9             @ (row4 + 256)/512
173
174    LSR         r7, r6, #15            @ r7 = 2*dst_strd, as pi2_dst contains 2-bit integers
175    VST1.U16    d0, [r3], r7           @ store 1st row of result
176    VST1.U16    d1, [r3], r7           @ store 2nd row of result
177    VST1.U16    d2, [r3], r7           @ store 3rd row of result
178    VST1.U16    d3, [r3], r7           @ store 4th row of result
179
180    LDMFD       sp!,{r4-r7,r15}        @ Reload the registers from SP
181
182    @ Function End
183
184@/**
185@*******************************************************************************
186@*
187@* @brief
188@*  This function performs residue calculation and forward  transform type 1
189@*  on input pixels
190@*
191@* @description
192@*  Performs residue calculation by subtracting source and  prediction and
193@*  followed by forward transform
194@*
195@* @param[in] pu1_src
196@*  Input 4x4 pixels
197@*
198@* @param[in] pu1_pred
199@*  Prediction data
200@*
201@* @param[in] pi2_tmp
202@*  Temporary buffer of size 4x4
203@*
204@* @param[out] pi2_dst
205@*  Output 4x4 coefficients
206@*
207@* @param[in] src_strd
208@*  Input stride
209@*
210@* @param[in] pred_strd
211@*  Prediction Stride
212@*
213@* @param[in] dst_strd_chr_flag
214@*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
215@*
216@* @returns void
217@*
218@* @remarks
219@*  None
220@*
221@*******************************************************************************
222@*/
223@ UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
224@                                     UWORD8 *pu1_pred,
225@                                        WORD32 *pi4_temp,
226@                                     WORD16 *pi2_dst,
227@                                     WORD32 src_strd,
228@                                     WORD32 pred_strd,
229@                                       WORD32 dst_strd_chr_flag);
230@
231@**************Variables Vs Registers*******************************************
232@
233@ r0 - pu1_src
234@ r1 - pu1_pred
235@ r2 - pi4_temp
236@ r3 - pi2_dst
237@
238@ [sp]   - src_strd
239@ [sp+4] - pred_strd
240@ [sp+8] - dst_strd_chr_flag
241@
242@*******************************************************************************
243
244    .global ihevc_resi_trans_4x4_ttype1_a9q
245
246ihevc_resi_trans_4x4_ttype1_a9q:
247
248    PUSH {r4}
249    vpush {d8 - d15}
250
251    LDR r2,[sp,#68]                 @ r2 = src_strd
252    LDR r4,[sp,#72]                 @ r4 = pred_strd
253
254    VLD1.32 d2[0],[r0],r2           @ Row 1 of source in d2[0]
255    VLD1.32 d3[0],[r1],r4           @ Row 1 of prediction in d3[0]
256    VLD1.32 d2[1],[r0],r2           @ Row 2 of source in d2[1]
257    VLD1.32 d3[1],[r1],r4           @ Row 2 of prediction in d3[1]
258
259    VLD1.32 d8[0],[r0],r2           @ Row 3 of source in d8[0]
260    VABDL.U8 q0,d2,d3               @ Absolute differences of rows 1 and 2 in d0
261                                    @ R2:[d11[3] d11[2] d11[1] d11[0]] => Row 2 of residue
262    VLD1.32 d9[0],[r1],r4           @ Row 3 of prediction in d9[0]
263    VSUBL.U8 q5,d2,d3               @ R1:[d10[3] d10[2] d10[1] d10[0]] => Row 1 of residue
264    VLD1.32 d8[1],[r0]              @ Row 4 of source in d8[1]
265    VTRN.16 d10,d11                 @ Transpose step 1
266    VLD1.32 d9[1],[r1]              @ Row 4 of prediction in d9[1]
267
268    VSUBL.U8 q6,d8,d9               @ R3:[d12[3] d12[2] d12[1] d12[0]] => Row 3 of residue
269                                    @ R4:[d13[3] d13[2] d13[1] d13[0]] => Row 4 of residue
270    VABAL.U8 q0,d8,d9               @ Absolute differences of rows 3 and 4 in d1
271    VTRN.16 d12,d13                 @ Transpose step 2
272    VTRN.32 q5,q6                   @ Transpose step 3, Residue block transposed
273                                    @ Columns are in C1:d10, C2:d11, C3:d12 and C4:d13
274    VADD.S16 d23,d11,d13            @ d23 = C2 + C4
275    VMOV.I32 d6,#55                 @ Constant used for multiplication
276    VADD.S16 d22,d10,d13            @ d22 = C1 + C4
277    VADD.U16 d0,d1,d0               @ Accumulating SAD step 1
278    VMOV.I32 d7,#84                 @ Constant used for multiplication
279    VMULL.S16 q7,d23,d6[0]          @ q7  = 55*C2 + 55*C4
280    VMOV.I32 d4,#74                 @ Constant used for multiplication
281    VMULL.S16 q9,d22,d7[0]          @ q9  = 84*C1 + 84*C4
282    VADD.S16 d16,d10,d11            @ d16 = C1 + C2
283    VMUL.S16 d12,d12,d4[0]          @ d12 = 74*C3
284    VMOV.I32 d5,#29                 @ Constant used for multiplication
285    VPADDL.U16 d0,d0                @ Accumulating SAD step 2
286    VSUB.S16 d16,d16,d13            @ d16 = C1 + C2 - C4
287    VMLAL.S16 q7,d22,d5[0]          @ q7  = 29*C1 + 55*C2 + 84*C4
288    VMLSL.S16 q9,d23,d5[0]          @ q9  = 84*C1 - 29*C2 + 55*C4
289    VMULL.S16 q8,d16,d4[0]          @ q8  = 74*C1 + 74*C2 - 74*C4
290    VPADDL.U32 d0,d0                @ Accumulating SAD step 3, SAD in d0
291    VSUB.S32 q10,q9,q7              @ q10 = q9 - q7 = 55*C1 - 84*C2 - 29*C4
292    VMOV.32 r0,d0[0]                @ Return SAD value
293    VRSHR.S32 q8,q8,#1              @ Truncating the 1 bit in q8
294
295    VADDW.S16 q7,q7,d12             @ q7  = 29*C1 + 55*C2 + 74*C3 + 84*C4
296    VSUBW.S16 q9,q9,d12             @ q9  = 84*C1 - 29*C2 - 74*C3 + 55*C4
297    VADDW.S16 q10,q10,d12           @ q10 = 55*C1 - 84*C2 + 74*C3 - 29*C4
298
299    VRSHR.S32 q7,q7,#1              @ Truncating the 1 bit in q7
300    VRSHR.S32 q9,q9,#1              @ Truncating the 1 bit in q9
301    VRSHR.S32 q10,q10,#1            @ Truncating the 1 bit in q10
302                                    @ Transform stage 1 is in P1:q7, P2:q8, P3:q9 and P4:q10
303    VTRN.32 q7,q8
304    VTRN.32 q9,q10
305    VSWP d15,d18
306    VSWP d17,d20                    @ Residue block transposed
307                                    @ Corresponding columns are in S1:q7, S2:q8, S3:q9 and S4:q10
308    VADD.S32 q13,q7,q8              @ q13 = S1 + S2
309    VADD.S32 q1,q7,q10              @ q1 = S1 + S4
310    VADD.S32 q4,q8,q10              @ q4 = S2 + S4
311    VSUB.S32 q13,q13,q10            @ q13 = S1 + S2 - S4
312    VMUL.S32 q12,q1,d5[0]           @ q12 = 29*S1 + 29*S4
313    VMUL.S32 q14,q1,d7[0]           @ q14 = 84*S1 + 84*S4
314    VMUL.S32 q13,q13,d4[0]          @ q13 = 74*S1 + 74*S2 - 74*S4
315
316    VMLA.S32 q12,q4,d6[0]           @ q12 = 29*S1 + 55*S2 + 84*S4
317    VMLS.S32 q14,q4,d5[0]           @ q14 = 84*S1 - 29*S2 + 55*S4
318    VMUL.S32 q9,q9,d4[0]            @ q9 = 74*S3
319
320    LDR r4,[sp,#76]                 @ r4 = dst_strd_chr_flag
321    ASR r4,r4,#16                   @ r4 = dst_strd
322    LSL r4,r4,#1                    @ r4 = 2*dst_strd
323
324    VRSHRN.S32 d26,q13,#8
325    VSUB.S32 q15,q14,q12            @ q15 = q14 - q12 = 55*S1 - 84*S2 - 29*S4
326
327    VADD.S32 q12,q12,q9             @ q12 = 29*S1 + 55*S2 + 74*S3 + 84*S4
328    VSUB.S32 q14,q14,q9             @ q14 = 84*S1 - 29*S2 - 74*S3 + 55*S4
329    VADD.S32 q15,q15,q9             @ q15 = 55*S1 - 84*S2 + 74*S3 - 29*S4
330
331    VRSHRN.S32 d24,q12,#8
332    VRSHRN.S32 d28,q14,#8
333    VRSHRN.S32 d30,q15,#8           @ Truncating the last 8 bits
334                                    @ Transform stage 2 is in U1:d24, U2:d26, U3:d28 and U4:d30
335    VST1.64 d24,[r3],r4             @ Storing row 1 of transform stage 2
336    VST1.64 d26,[r3],r4             @ Storing row 2 of transform stage 2
337    VST1.64 d28,[r3],r4             @ Storing row 3 of transform stage 2
338    VST1.64 d30,[r3]                @ Storing row 4 of transform stage 2
339
340    vpop {d8 - d15}
341    POP {r4}
342    MOV pc,lr
343
344@/**
345@*******************************************************************************
346@*
347@* @brief
348@*  This function performs residue calculation and DCT integer forward transform
349@*  on 8x8 block
350@*
351@* @description
352@*  Performs residue calculation by subtracting source and prediction and
353@*  followed by DCT integer forward transform
354@*
355@* @param[in] pu1_src
@*  Input 8x8 pixels
357@*
358@* @param[in] pu1_pred
359@*  Prediction data
360@*
361@* @param[in] pi2_tmp
362@*  Temporary buffer of size 8x8
363@*
364@* @param[out] pi2_dst
365@*  Output 8x8 coefficients
366@*
367@* @param[in] src_strd
368@*  Input stride
369@*
370@* @param[in] pred_strd
371@*  Prediction Stride
372@*
373@* @param[in] dst_strd_chr_flag
374@*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
375@*
376@* @returns void
377@*
378@* @remarks
379@*  None
380@*
381@*******************************************************************************
382@*/
@ UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
@                              UWORD8 *pu1_pred,
@                              WORD32 *pi4_temp,
@                              WORD16 *pi2_dst,
@                              WORD32 src_strd,
@                              WORD32 pred_strd,
@                              WORD32 dst_strd_chr_flag);
390@
391@**************Variables Vs Registers*******************************************
392@
393@ r0 - pu1_src
394@ r1 - pu1_pred
395@ r2 - pi4_temp
396@ r3 - pi2_dst
397@
398@ [sp]   - src_strd
399@ [sp+4] - pred_strd
400@ [sp+8] - dst_strd_chr_flag
401@
402@*******************************************************************************
403
404    .global ihevc_resi_trans_8x8_a9q
405
406ihevc_resi_trans_8x8_a9q:
407
408    PUSH {r4,r5}
409    vpush {d8 - d15}
410
411    @ Loading Prediction and Source blocks of sixe 8x8
412
413    LDR r4,[sp,#80]                 @ r4 = dst_strd_chr_flag
414    AND r4,r4,#1                    @ r4 = chr_flag
415    CMP r4,#1
416    BNE CHROMA_LOAD
417
418LUMA_LOAD:
419
420    LDR r5,[sp,#72]                 @ r5 = src_strd
421    LDR r4,[sp,#76]                 @ r4 = pred_strd
422
423    VLD2.8 {d0,d2},[r1],r4          @ Row 1 of prediction in d0
424    VLD2.8 {d1,d3},[r0],r5          @ Row 1 of source in d1
425
426    VABDL.U8 q15,d1,d0              @ Row 1 of absolute difference in q15
427    VLD2.8 {d2,d4},[r1],r4          @ Row 2 of prediction in d2
428    VSUBL.U8 q0,d1,d0               @ Row 1 of residue in q0
429    VLD2.8 {d3,d5},[r0],r5          @ Row 2 of source in d3
430
431    VABDL.U8 q9,d3,d2               @ Row 2 of absolute difference in q9
432    VLD2.8 {d4,d6},[r1],r4          @ Row 3 of prediction in d4
433    VSUBL.U8 q1,d3,d2               @ Row 2 of residue in q1
434    VLD2.8 {d5,d7},[r0],r5          @ Row 3 of source in d5
435
436    VABAL.U8 q15,d5,d4              @ Row 3 of absolute difference accumulated in q15
437    VLD2.8 {d6,d8},[r1],r4          @ Row 4 of prediction in d6
438    VSUBL.U8 q2,d5,d4               @ Row 3 of residue in q2
439    VLD2.8 {d7,d9},[r0],r5          @ Row 4 of source in d7
440
441    VABAL.U8 q9,d7,d6               @ Row 4 of absolute difference accumulated in q9
442    VLD2.8 {d8,d10},[r1],r4         @ Row 5 of prediction in d8
443    VSUBL.U8 q3,d7,d6               @ Row 4 of residue in q3
444    VLD2.8 {d9,d11},[r0],r5         @ Row 5 of source in d9
445
446    VABDL.U8 q10,d9,d8              @ Row 5 of absolute difference in q10
447    VLD2.8 {d10,d12},[r1],r4        @ Row 6 of prediction in d10
448    VSUBL.U8 q4,d9,d8               @ Row 5 of residue in q4
449    VLD2.8 {d11,d13},[r0],r5        @ Row 6 of source in d11
450
451    VABAL.U8 q15,d11,d10            @ Row 6 of absolute difference accumulated in q15
452    VLD2.8 {d12,d14},[r1],r4        @ Row 7 of prediction in d12
453    VSUBL.U8 q5,d11,d10             @ Row 6 of residue in q5
454    VLD2.8 {d13,d15},[r0],r5        @ Row 7 of source in d13
455
456    VABAL.U8 q9,d13,d12             @ Row 7 of absolute difference accumulated in q9
457    VLD2.8 {d14,d16},[r1]           @ Row 8 of prediction in d14
458    VSUBL.U8 q6,d13,d12             @ Row 7 of residue in q6
459    VLD2.8 {d15,d17},[r0]           @ Row 8 of source in d15
460
461    B CHROMA_LOAD_END
462
463CHROMA_LOAD:
464
465    LDR r5,[sp,#72]                 @ r5 = src_strd
466    LDR r4,[sp,#76]                 @ r4 = pred_strd
467
468    VLD1.64 d0,[r1],r4              @ Row 1 of prediction in d0
469    VLD1.64 d1,[r0],r5              @ Row 1 of source in d1
470
471    VABDL.U8 q15,d1,d0              @ Row 1 of absolute difference in q15
472    VLD1.64 d2,[r1],r4              @ Row 2 of prediction in d2
473    VSUBL.U8 q0,d1,d0               @ Row 1 of residue in q0
474    VLD1.64 d3,[r0],r5              @ Row 2 of source in d3
475
476    VABDL.U8 q9,d3,d2               @ Row 2 of absolute difference in q9
477    VLD1.64 d4,[r1],r4              @ Row 3 of prediction in d4
478    VSUBL.U8 q1,d3,d2               @ Row 2 of residue in q1
479    VLD1.64 d5,[r0],r5              @ Row 3 of source in d5
480
481    VABAL.U8 q15,d5,d4              @ Row 3 of absolute difference accumulated in q15
482    VLD1.64 d6,[r1],r4              @ Row 4 of prediction in d6
483    VSUBL.U8 q2,d5,d4               @ Row 3 of residue in q2
484    VLD1.64 d7,[r0],r5              @ Row 4 of source in d7
485
486    VABAL.U8 q9,d7,d6               @ Row 4 of absolute difference accumulated in q9
487    VLD1.64 d8,[r1],r4              @ Row 5 of prediction in d8
488    VSUBL.U8 q3,d7,d6               @ Row 4 of residue in q3
489    VLD1.64 d9,[r0],r5              @ Row 5 of source in d9
490
491    VABDL.U8 q10,d9,d8              @ Row 5 of absolute difference in q10
492    VLD1.64 d10,[r1],r4             @ Row 6 of prediction in d10
493    VSUBL.U8 q4,d9,d8               @ Row 5 of residue in q4
494    VLD1.64 d11,[r0],r5             @ Row 6 of source in d11
495
496    VABAL.U8 q15,d11,d10            @ Row 6 of absolute difference accumulated in q15
497    VLD1.64 d12,[r1],r4             @ Row 7 of prediction in d12
498    VSUBL.U8 q5,d11,d10             @ Row 6 of residue in q5
499    VLD1.64 d13,[r0],r5             @ Row 7 of source in d13
500
501    VABAL.U8 q9,d13,d12             @ Row 7 of absolute difference accumulated in q9
502    VLD1.64 d14,[r1]                @ Row 8 of prediction in d14
503    VSUBL.U8 q6,d13,d12             @ Row 7 of residue in q6
504    VLD1.64 d15,[r0]                @ Row 8 of source in d15
505
506CHROMA_LOAD_END:
507
508    @ Transform stage 1
509    @ Transposing residue matrix
510
511    VABAL.U8 q10,d15,d14            @ Row 8 of absolute difference accumulated in q10
512    VTRN.16 q0,q1                   @ Transpose residue matrix step (1a)
513    VSUBL.U8 q7,d15,d14             @ Row 8 of residue in q7
514    VTRN.16 q2,q3                   @ Transpose residue matrix step (1b)
515
516    VTRN.16 q4,q5                   @ Transpose residue matrix step (1c)
517    VTRN.16 q6,q7                   @ Transpose residue matrix step (1d)
518    VTRN.32 q0,q2                   @ Transpose residue matrix step (2a)
519    VTRN.32 q1,q3                   @ Transpose residue matrix step (2b)
520
521    VADD.U16 q8,q15,q9              @ SAD calculation (1)
522    VTRN.32 q4,q6                   @ Transpose residue matrix step (2c)
523    VTRN.32 q5,q7                   @ Transpose residue matrix step (2d)
524
525    VADD.U16 q8,q8,q10              @ SAD calculation (2)
526    VSWP d1,d8                      @ Transpose residue matrix step (3a)
527    VSWP d3,d10                     @ Transpose residue matrix step (3b)
528
529    VADD.U16 d16,d16,d17            @ SAD calculation (3)
530    VSWP d7,d14                     @ Transpose residue matrix step (3c)
531    VSWP d5,d12                     @ Transpose residue matrix step (3d)
532                                    @ Columns of residue C0-C7 (8x8 matrix) in q0-q7
533    VPADDL.U16 d16,d16              @ SAD calculation (4)
534
535    @ Evaluating first step in Butterfly diagram
536
537    VADD.S16 q10,q0,q7              @ q10 = C0 + C7
538    VADD.S16 q11,q1,q6              @ q11 = C1 + C6
539    VPADDL.U32 d16,d16              @ SAD calculation (5)
540    VADD.S16 q12,q2,q5              @ q12 = C2 + C5
541    VADD.S16 q13,q3,q4              @ q13 = C3 + C4
542
543    VSUB.S16 q4,q3,q4               @ q4  = C3 - C4
544    VSUB.S16 q5,q2,q5               @ q5  = C2 - C5
545    VSUB.S16 q6,q1,q6               @ q6  = C1 - C6
546    VSUB.S16 q7,q0,q7               @ q7  = C0 - C7
547
548    @ Calculating F0, F2, F4 and F6
549
550    VADD.S16 q1,q11,q12             @ q1  = C1 + C2 + C5 + C6
551    VADD.S16 q2,q10,q13             @ q2  = C0 + C3 + C4 + C7
552
553    MOV r4,#50
554    LSL r4,r4,#16
555    ADD r4,r4,#18
556    MOV r5,#89
557    LSL r5,r5,#16
558    ADD r5,r5,#75
559    VMOV d0,r4,r5                   @ 16-bit aligned, d0[3] = 89, d0[2] = 75, d0[1] = 50, d0[0]=18
560
561    MOV r4,#83
562    LSL r4,r4,#16
563    ADD r4,r4,#36
564    VMOV d1,r4,r4                   @ 16-bit aligned, d1[3] = 83, d1[2] = 36, d1[1] = 83, d1[0]=36
565
566    VSUB.S16 q10,q10,q13            @ q10 = C0 - C3 - C4 + C7
567    VSUB.S16 q11,q11,q12            @ q11 = C1 - C2 - C5 + C6
568    VMOV.32 r0,d16[0]               @ SAD calculation (6) : Return value = SAD
569
570    VSUB.S16 q3,q2,q1               @ q3 = C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7
571    VADD.S16 q2,q2,q1               @ q2 = C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7
572
573    VMULL.S16 q14,d20,d1[1]         @ q14 = [0] of 83*(C0 - C3 - C4 + C7)
574    VMULL.S16 q15,d21,d1[1]         @ q15 = [1] of 83*(C0 - C3 - C4 + C7)
575    VMULL.S16 q9,d20,d1[0]          @ q9  = [0] of 36*(C0 - C3 - C4 + C7)
576    VMULL.S16 q10,d21,d1[0]         @ q10 = [1] of 36*(C0 - C3 - C4 + C7)
577
578    VMLAL.S16 q14,d22,d1[0]         @ q14 = F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
579    VSHLL.S16 q13,d6,#6             @ q13 = F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
580    VMLAL.S16 q15,d23,d1[0]         @ q15 = F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
581    VSHLL.S16 q3,d7,#6              @ q3  = F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
582    VMLSL.S16 q9,d22,d1[1]          @ q9  = F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
583    VSHLL.S16 q12,d4,#6             @ q12 = F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
584    VMLSL.S16 q10,d23,d1[1]         @ q10 = F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
585    VSHLL.S16 q2,d5,#6              @ q2  = F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
586
587    @ Calculating F1, F3, F5 and F7
588
589    MOV r4,#48
590    VST1.64 {d24,d25},[r2]!         @ Row 1 of transform stage 1 F0[0] stored
591    VST1.64 {d4,d5},[r2],r4         @ Row 1 of transform stage 1 F0[1] stored
592    VST1.64 {d28,d29},[r2]!         @ Row 3 of transform stage 1 F2[0] stored
593    VST1.64 {d30,d31},[r2],r4       @ Row 3 of transform stage 1 F2[1] stored
594
595    VST1.64 {d26,d27},[r2]!         @ Row 5 of transform stage 1 F4[0] stored
596    VMULL.S16 q1,d14,d0[3]          @ q1  = [0] of 89*(C0 - C7)
597    VMULL.S16 q8,d15,d0[3]          @ q8  = [1] of 89*(C0 - C7)
598    VST1.64 {d6,d7},[r2],r4         @ Row 5 of transform stage 1 F4[1] stored
599    VMULL.S16 q11,d14,d0[2]         @ q11 = [0] of 75*(C0 - C7)
600    VMULL.S16 q13,d15,d0[2]         @ q13 = [1] of 75*(C0 - C7)
601    VST1.64 {d18,d19},[r2]!         @ Row 7 of transform stage 1 F6[0] stored
602    VMULL.S16 q3,d14,d0[1]          @ q3  = [0] of 50*(C0 - C7)
603    VMULL.S16 q9,d15,d0[1]          @ q9  = [1] of 50*(C0 - C7)
604    VST1.64 {d20,d21},[r2]          @ Row 7 of transform stage 1 F6[1] stored
605    VMULL.S16 q10,d14,d0[0]         @ q10 = [0] of 18*(C0 - C7)
606    VMULL.S16 q7,d15,d0[0]          @ q7  = [1] of 18*(C0 - C7)
607
608    VMLAL.S16 q1,d12,d0[2]          @ q1  = [0] of 89*(C0 - C7) + 75*(C1 - C6)
609    VMLAL.S16 q8,d13,d0[2]          @ q8  = [1] of 89*(C0 - C7) + 75*(C1 - C6)
610    VMLSL.S16 q11,d12,d0[0]         @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6)
611    VMLSL.S16 q13,d13,d0[0]         @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6)
612    VMLSL.S16 q3,d12,d0[3]          @ q3  = [0] of 50*(C0 - C7) - 89*(C1 - C6)
613    VMLSL.S16 q9,d13,d0[3]          @ q9  = [1] of 50*(C0 - C7) - 89*(C1 - C6)
614    VMLSL.S16 q10,d12,d0[1]         @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6)
615    VMLSL.S16 q7,d13,d0[1]          @ q7  = [1] of 18*(C0 - C7) - 50*(C1 - C6)
616
617    VMLAL.S16 q1,d10,d0[1]          @ q1  = [0] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
618    VMLAL.S16 q8,d11,d0[1]          @ q8  = [1] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
619    VMLSL.S16 q11,d10,d0[3]         @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
620    VMLSL.S16 q13,d11,d0[3]         @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
621    VMLAL.S16 q3,d10,d0[0]          @ q3  = [0] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
622    VMLAL.S16 q9,d11,d0[0]          @ q9  = [1] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
623    VMLAL.S16 q10,d10,d0[2]         @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
624    VMLAL.S16 q7,d11,d0[2]          @ q7  = [1] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
625
626    VMLAL.S16 q1,d8,d0[0]           @ q1  = F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
627    VMLAL.S16 q8,d9,d0[0]           @ q8  = F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
628    VMLSL.S16 q11,d8,d0[1]          @ q11 = F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
629    VMLSL.S16 q13,d9,d0[1]          @ q13 = F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
630    SUB r2,r2,#176                  @ r2 now points to the second row
631    VMLAL.S16 q3,d8,d0[2]           @ q3  = F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
632    VMLAL.S16 q9,d9,d0[2]           @ q9  = F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
633    VST1.64 {d2,d3},[r2]!           @ Row 2 of transform stage 1 F1[0] stored
634    VMLSL.S16 q10,d8,d0[3]          @ q10 = F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
635    VMLSL.S16 q7,d9,d0[3]           @ q7  = F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
636
637    VST1.64 {d16,d17},[r2],r4       @ Row 2 of transform stage 1 F1[1] stored
638    VST1.64 {d22,d23},[r2]!         @ Row 4 of transform stage 1 F3[0] stored
639    VST1.64 {d26,d27},[r2],r4       @ Row 4 of transform stage 1 F3[1] stored
640    VST1.64 {d6,d7},[r2]!           @ Row 6 of transform stage 1 F5[0] stored
641    VST1.64 {d18,d19},[r2],r4       @ Row 6 of transform stage 1 F5[1] stored
642    VST1.64 {d20,d21},[r2]!         @ Row 8 of transform stage 1 F7[0] stored
643    VST1.64 {d14,d15},[r2]          @ Row 8 of transform stage 1 F7[1] stored
644
645    @ Transform stage 2 (for rows 1-4 of transform stage 1)
646    @ Transposing the 4 rows (F0, F1, F2, F3)
647    @ F0 = {q2,q12},  F1 = {q8,q1}, F2 = {q15,q14} and F3 = {q13,q11}
648
649    VTRN.32 q12,q1                  @ Transposing first half of transform stage 1 (1a)
650    VTRN.32 q14,q11                 @ Transposing first half of transform stage 1 (1b)
651    VSWP d25,d28                    @ Transposing first half of transform stage 1 (2a)
652    VSWP d22,d3                     @ Transposing first half of transform stage 1 (2b)
653
654    VTRN.32 q2,q8                   @ Transposing first half of transform stage 1 (3a)
655    VTRN.32 q15,q13                 @ Transposing first half of transform stage 1 (3b)
656    VSWP d5,d30                     @ Transposing first half of transform stage 1 (4a)
657    VSWP d26,d17                    @ Transposing first half of transform stage 1 (4b)
658                                    @ B0:q12, B1:q1, B2:q14, B3:q11, B4:q2, B5:q8, B6:q15 and B7:q13
659
660    @ Evaluating first step in Butterfly diagram
661
662    VADD.S32 q0,q12,q13             @ q0  = B0 + B7
663    VADD.S32 q5,q11,q2              @ q5  = B3 + B4
664    VADD.S32 q3,q1,q15              @ q3  = B1 + B6
665    VADD.S32 q4,q14,q8              @ q4  = B2 + B5
666
667    VSUB.S32 q7,q14,q8              @ q7  = B2 - B5
668    VSUB.S32 q8,q1,q15              @ q8  = B1 - B6
669    VSUB.S32 q6,q11,q2              @ q6  = B3 - B4
670    VSUB.S32 q9,q12,q13             @ q9  = B0 - B7
671
672    @ Calculating G0, G2, G4 and G6
673
674    MOV r4,#18
675    MOV r5,#50
676    VMOV d2,r4,r5                   @ 32-bit aligned, d2[1] = 50, d2[0] = 18
677    VSUB.S32 q2,q0,q5               @ q2  = B0 - B3 - B4 + B7
678
679    MOV r4,#75
680    MOV r5,#89
681    VMOV d3,r4,r5                   @ 32-bit aligned, d3[1] = 89, d3[0] = 75
682    VADD.S32 q10,q0,q5              @ q10 = B0 + B3 + B4 + B7
683
684    MOV r4,#36
685    MOV r5,#83
686    VMOV d0,r4,r5                   @ 32-bit aligned, d0[1] = 83, d0[0] = 36
687    VSUB.S32 q11,q3,q4              @ q11 = B1 - B2 - B5 + B6
688    VADD.S32 q3,q3,q4               @ q3  = B1 + B2 + B5 + B6
689
690    VMUL.S32 q12,q2,d0[1]           @ q12 = 83*(B0 - B3 - B4 + B7)
691    VMUL.S32 q2,q2,d0[0]            @ q2  = 36*(B0 - B3 - B4 + B7)
692    VMUL.S32 q5,q9,d3[1]            @ q5 = 89*(B0 - B7)
693    VADD.S32 q14,q10,q3             @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
694    VMUL.S32 q4,q9,d3[0]            @ q4 = 75*(B0 - B7)
695    VSUB.S32 q15,q10,q3             @ q15 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
696@    VSHL.S32 q14,q14,#6             ; q14 = G0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
697@    VSHL.S32 q15,q15,#6             ; q15 = G4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
698
699    VMLA.S32 q12,q11,d0[0]          @ q12 = G2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
700    VRSHRN.I32 d28,q14,#5           @ Truncating last 11 bits in G0
701    VMLS.S32 q2,q11,d0[1]           @ q2  = G6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
702    VRSHRN.I32 d30,q15,#5           @ Truncating last 11 bits in G4
703
704    LDR r4,[sp,#80]                 @ r4 = dst_strd_chr_flag
705    ASR r4,r4,#16                   @ r4 = dst_strd
706    LSL r4,r4,#2                    @ r4 = 2*dst_strd*2
707
708    VMUL.S32 q3,q9,d2[1]            @ q3 = 50*(B0 - B7)
709    VRSHRN.I32 d24,q12,#11          @ Truncating last 11 bits in G2
710    VMUL.S32 q9,q9,d2[0]            @ q9 = 18*(B0 - B7)
711    VRSHRN.I32 d4,q2,#11            @ Truncating last 11 bits in G6
712
713    VMLA.S32 q5,q8,d3[0]            @ q5 = 89*(B0 - B7) + 75*(B1 - B6)
714    VST1.64 d28,[r3],r4             @ First half-row of row 1 of transform stage 2 (G0) stored
715    VMLS.S32 q4,q8,d2[0]            @ q4 = 75*(B0 - B7) - 18*(B1 - B6)
716
717    VMLS.S32 q3,q8,d3[1]            @ q3 = 50*(B0 - B7) - 89*(B1 - B6)
718    VST1.64 d24,[r3],r4             @ First half-row of row 3 of transform stage 2 (G2) stored
719    VMLS.S32 q9,q8,d2[1]            @ q9 = 18*(B0 - B7) - 50*(B1 - B6)
720
721    VMLA.S32 q5,q7,d2[1]            @ q5 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
722    VST1.64 d30,[r3],r4             @ First half-row of row 5 of transform stage 2 (G4) stored
723    VMLS.S32 q4,q7,d3[1]            @ q4 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
724
725    VMLA.S32 q3,q7,d2[0]            @ q3 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
726    VST1.64 d4,[r3]                 @ First half-row of row 7 of transform stage 2 (G6) stored
727    VMLA.S32 q9,q7,d3[0]            @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
728
729    VMLA.S32 q5,q6,d2[0]            @ q5 = G1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
730    VMLS.S32 q4,q6,d2[1]            @ q4 = G3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
731    VMLA.S32 q3,q6,d3[0]            @ q3 = G5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
732    VMLS.S32 q9,q6,d3[1]            @ q9 = G7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
733
734    SUB r3,r3,r4,LSL #1
735    SUB r3,r3,r4,ASR #1             @ r3 = r3 - 5*dst_strd*2
736                                    @ r3 is moved from row 7 to row 2
737    VRSHRN.I32 d10,q5,#11           @ Truncating last 11 bits in G1
738    VRSHRN.I32 d8,q4,#11            @ Truncating last 11 bits in G3
739    VRSHRN.I32 d6,q3,#11            @ Truncating last 11 bits in G5
740    VST1.64 d10,[r3],r4             @ First half-row of row 2 of transform stage 2 (G1) stored
741    VRSHRN.I32 d18,q9,#11           @ Truncating last 11 bits in G7
742
743    VST1.64 d8,[r3],r4              @ First half-row of row 4 of transform stage 2 (G3) stored
744    VST1.64 d6,[r3],r4              @ First half-row of row 6 of transform stage 2 (G5) stored
745    VST1.64 d18,[r3]!               @ First half-row of row 8 of transform stage 2 (G7) stored
746
747    @ Transform stage 2 (for rows 5-8 of transform stage 1)
748    @ Loading the 4 rows (F4, F5, F6, F7)
749
750    SUB r2,r2,#112                  @ r2 jumps from row 8 to row 5 in temporary memory
751    VLD1.64 {d20,d21},[r2]!         @ q10 = F4[0]
752    VLD1.64 {d22,d23},[r2]!         @ q11 = F4[1]
753    VLD1.64 {d8,d9},[r2]!           @ q4  = F5[0]
754    @ Transposing the 4 rows
755    @ F4 = {q11,q10}, F5 = {q5,q4}, F6 = {q3,q2} and F7 = {q13,q12}
756
757    VTRN.32 q10,q4                  @ Transposing second half of transform stage 1 (1a)
758    VLD1.64 {d10,d11},[r2]!         @ q5  = F5[1]
759    VLD1.64 {d4,d5},[r2]!           @ q2  = F6[0]
760    VLD1.64 {d6,d7},[r2]!           @ q3  = F6[1]
761    VLD1.64 {d24,d25},[r2]!         @ q12 = F7[0]
762    VTRN.32 q2,q12                  @ Transposing second half of transform stage 1 (1b)
763    VLD1.64 {d26,d27},[r2]          @ q13 = F7[1]
764
765    VSWP d21,d4                     @ Transposing second half of transform stage 1 (2a)
766    VSWP d24,d9                     @ Transposing second half of transform stage 1 (2b)
767
768    VTRN.32 q11,q5                  @ Transposing second half of transform stage 1 (3a)
769    VTRN.32 q3,q13                  @ Transposing second half of transform stage 1 (3b)
770    VSWP d26,d11                    @ Transposing second half of transform stage 1 (4b)
771    VSWP d23,d6                     @ Transposing second half of transform stage 1 (4a)
772                                    @ B0:q10, B1:q4, B2:q2, B3:q12, B4:q11, B5:q5, B6:q3 and B7:q13
773
774    @ Evaluating first step in Butterfly diagram
775
776    VADD.S32 q0,q10,q13             @ q0  = B0 + B7
777    VADD.S32 q15,q12,q11            @ q15 = B3 + B4
778    VADD.S32 q1,q4,q3               @ q1  = B1 + B6
779    VADD.S32 q14,q2,q5              @ q14 = B2 + B5
780
781    VSUB.S32 q9,q10,q13             @ q9  = B0 - B7
782    VSUB.S32 q6,q12,q11             @ q6  = B3 - B4
783    VSUB.S32 q7,q2,q5               @ q7  = B2 - B5
784    VSUB.S32 q8,q4,q3               @ q8  = B1 - B6
785
786    @ Calculating H0, H2, H4 and H6
787
788    VADD.S32 q3,q1,q14              @ q3 = B1 + B2 + B5 + B6
789    VSUB.S32 q5,q1,q14              @ q5 = B1 - B2 - B5 + B6
790
791    MOV r4,#18
792    MOV r5,#50
793    VSUB.S32 q4,q0,q15              @ q4 = B0 - B3 - B4 + B7
794    VMOV d2,r4,r5                   @ 32-bit aligned, d2[1] = 50, d2[0] = 18
795
796    MOV r4,#75
797    MOV r5,#89
798    VADD.S32 q2,q0,q15              @ q2 = B0 + B3 + B4 + B7
799    VMOV d3,r4,r5                   @ 32-bit aligned, d3[1] = 89, d3[0] = 75
800
801    MOV r4,#36
802    MOV r5,#83
803
804    @ Calculating H1, H3, H5 and H7
805
806    VMUL.S32 q10,q9,d3[1]           @ q10 = 89*(B0 - B7)
807    VMOV d0,r4,r5                   @ 32-bit aligned, d0[1] = 83, d0[0] = 36
808
809    VMUL.S32 q13,q9,d3[0]           @ q13 = 75*(B0 - B7)
810
811    VMUL.S32 q12,q4,d0[1]           @ q12 = 83*(B0 - B3 - B4 + B7)
812    VADD.S32 q14,q2,q3              @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
813    VMUL.S32 q4,q4,d0[0]            @ q4  = 36*(B0 - B3 - B4 + B7)
814    VSUB.S32 q2,q2,q3               @ q2  = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
815
816
817    VMLA.S32 q12,q5,d0[0]           @ q12 = H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
818@    VSHL.S32 q14,q14,#6             ; q14 = H0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
819    VMLS.S32 q4,q5,d0[1]            @ q4 = H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
820@    VSHL.S32 q2,q15,#6              ; q2 = H4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
821
822    VMUL.S32 q11,q9,d2[1]           @ q11 = 50*(B0 - B7)
823    VRSHRN.I32 d28,q14,#5           @ Truncating last 11 bits in H0
824    VMUL.S32 q9,q9,d2[0]            @ q9  = 18*(B0 - B7)
825    VRSHRN.I32 d24,q12,#11          @ Truncating last 11 bits in H2
826
827    VMLA.S32 q10,q8,d3[0]           @ q10 = 89*(B0 - B7) + 75*(B1 - B6)
828    VRSHRN.I32 d4,q2,#5             @ Truncating last 11 bits in H4
829    VMLS.S32 q13,q8,d2[0]           @ q13 = 75*(B0 - B7) - 18*(B1 - B6)
830    VRSHRN.I32 d8,q4,#11            @ Truncating last 11 bits in H6
831
832    LDR r4,[sp,#80]                 @ r4 = dst_strd_chr_flag
833    ASR r4,r4,#16                   @ r4 = dst_strd
834    LSL r4,r4,#2                    @ r4 = 2*dst_strd*2
835
836    SUB r3,r3,r4,LSL #2
837    ADD r3,r3,r4,ASR #1             @ r3 = r3 - 7*dst_strd*2
838                                    @ r3 is moved from row 8 to row 1
839    VMLS.S32 q11,q8,d3[1]           @ q11 = 50*(B0 - B7) - 89*(B1 - B6)
840    VST1.64 d28,[r3],r4             @ Second half-row of row 1 of transform stage 2 (H0) stored
841    VMLS.S32 q9,q8,d2[1]            @ q9  = 18*(B0 - B7) - 50*(B1 - B6)
842
843    VMLA.S32 q10,q7,d2[1]           @ q10 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
844    VST1.64 d24,[r3],r4             @ Second half-row of row 3 of transform stage 2 (H2) stored
845    VMLS.S32 q13,q7,d3[1]           @ q13 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
846
847    VMLA.S32 q11,q7,d2[0]           @ q11 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
848    VST1.64 d4,[r3],r4              @ Second half-row of row 5 of transform stage 2 (H4) stored
849    VMLA.S32 q9,q7,d3[0]            @ q9  = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
850
851    VMLA.S32 q10,q6,d2[0]           @ q10 = H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
852    VST1.64 d8,[r3]                 @ Second half-row of row 7 of transform stage 2 (H6) stored
853    VMLS.S32 q13,q6,d2[1]           @ q13 = H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
854
855    VMLA.S32 q11,q6,d3[0]           @ q11 = H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
856    VMLS.S32 q9,q6,d3[1]            @ q9  = H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
857
858    SUB r3,r3,r4,LSL #1
859    SUB r3,r3,r4,ASR #1             @ r3 = r3 - 5*dst_strd
860                                    @ r3 is moved from row 7 to row 2
861    VRSHRN.I32 d20,q10,#11          @ Truncating last 11 bits in H1
862    VRSHRN.I32 d26,q13,#11          @ Truncating last 11 bits in H3
863    VRSHRN.I32 d22,q11,#11          @ Truncating last 11 bits in H5
864    VST1.64 d20,[r3],r4             @ Second half-row of row 2 of transform stage 2 (H1) stored
865    VRSHRN.I32 d18,q9,#11           @ Truncating last 11 bits in H7
866
867    VST1.64 d26,[r3],r4             @ Second half-row of row 4 of transform stage 2 (H3) stored
868    VST1.64 d22,[r3],r4             @ Second half-row of row 6 of transform stage 2 (H5) stored
869    VST1.64 d18,[r3]                @ Second half-row of row 8 of transform stage 2 (H7) stored
870
871    vpop {d8 - d15}
872    POP {r4,r5}
873    MOV pc,lr
874
875@/**
876@/*******************************************************************************
877@/*
878@/* @brief
879@/*  Residue calculation and forward transform for a 16x16 block with 8-bit
880@/*  input
881@/*
882@/* @par Description:
883@/*  Computes the residue by subtracting the prediction from the source, then
884@/*  applies the forward transform to the residue
885@/*
886@/* @param[in] pu1_src
887@/*  Input 16x16 pixels
888@/*
889@/* @param[in] pu1_pred
890@/*  Prediction data
891@/*
892@/* @param[in] pi4_tmp
893@/*  Temporary buffer of size 16x16 (32-bit entries)
894@/*
895@/* @param[out] pi2_dst
896@/*  Output 16x16 coefficients
897@/*
898@/* @param[in] src_strd
899@/*  Input stride
900@/*
901@/* @param[in] pred_strd
902@/*  Prediction Stride
903@/*
904@/* @param[in] dst_strd_chr_flag
905@/*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
906@/*
907@/* @returns  Void
908@/*
909@/* @remarks
910@/*  None
911@/*
912@/*******************************************************************************
913@/*/
914
915.extern g_ai2_ihevc_trans_16        @ 16-bit transform matrix, used by the horizontal pass
916.extern g_ai4_ihevc_trans_16        @ 32-bit transform matrix, used by the vertical pass
917@ PC-relative table offsets: each word holds (table - ulblN - 8); the "ADD R9, R9, PC" at ulblN turns it into the table address (assumes ARM state, where PC reads as instruction address + 8)
918g_ai2_ihevc_trans_16_addr_1:
919.long g_ai2_ihevc_trans_16 - ulbl1 - 8        @ resolved by the ADD at ulbl1
920
921g_ai2_ihevc_trans_16_addr_2:
922.long g_ai2_ihevc_trans_16 - ulbl2 - 8        @ resolved by the ADD at ulbl2
923
924g_ai4_ihevc_trans_16_addr:
925.long g_ai4_ihevc_trans_16 - ulbl3 - 8        @ resolved by the ADD at ulbl3
926
927    .global ihevc_resi_trans_16x16_a9q
928
929ihevc_resi_trans_16x16_a9q:
930
931.equ TMP_STRIDE        ,  64            @16*4, Stride of tmp register
932.equ SHIFT             ,  13            @shift = 13; // log2(iWidth) - 1 + g_uiBitIncrement
933.equ RADD              ,  4096          @1 << (shift - 1);
934
935.equ COFF_STD_2B       ,  32            @Stride for g_ai2_ihevc_trans_16 in bytes
936.equ COFF_STD_W        ,  32            @Stride for g_ai4_ihevc_trans_16 in bytes
937
938@ Function prologue: save registers and reserve 32 bytes of stack scratch space
939    STMFD          SP!,{r4-r12,LR}      @stack store values of the arguments
940    vpush          {d8 - d15}
941    SUB            SP,SP,#32
942
943    LDR             R4,[SP,#136]            @get src_strd
944    LDR             R5,[SP,#140]         @get pred_strd
945    LDR             R6,[SP,#144]         @get dst_strd_chr_flag
946
947    MOV R8,#0                           @Set loop counter
948    LDR R9,g_ai2_ihevc_trans_16_addr_1    @get 16 bit transform matrix
949ulbl1:
950    ADD R9, R9, PC
951    @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] values of g_ai2_ihevc_trans_16
952    @and write to stack
953    MOV R12,#COFF_STD_2B
954    LSL R12,#2
955
956    VLD1.S32 D30[0],[R9],R12
957    VLD1.S32 D30[1],[R9],R12
958    VLD1.S32 D31[0],[R9],R12
959    VLD1.S32 D31[1],[R9],R12
960
961    VTRN.S32 D30,D31
962    VTRN.S16 D30,D31
963    VST1.S16 {d30,d31},[SP]
964
965    LDR R9,g_ai2_ihevc_trans_16_addr_2      @get back 16 bit transform matrix
966ulbl2:
967    ADD R9, R9, PC
968
969    MOV R7,#TMP_STRIDE
970    AND R14,R6,#0x1
971
972    VMOV.S32 Q14,#0
973
974@R0         pu1_src
975@R1         pu1_pred
976@R2         pi4_tmp
977@R3         pi2_dst
978@R4         src_strd
979@R5         pred_strd
980@R6         dst_strd_chr_flag
981@R7         tmp_dst Nx4 block stride
982@R8         loop cntr
983@R9         g_ai2_ihevc_trans_16
984@R10        tmp_dst Nx4 block offset
985@R11        tmp register
986@R12        coefficient table stride (scratch)
987@R14        chroma flag (dst_strd_chr_flag & 1)
988@q14        block SAD accumulator, 32 bit
989@q15        absolute-difference scratch for block SAD, 16 bit
990
991CORE_LOOP_16X16_HORIZ:
992
993    CMP R14,#1
994    BEQ INTERLEAVED_LOAD_S1
995
996    VLD1.U8 {d0,d1},[R0],R4             @LOAD 1-16 src row 1
997    VLD1.U8 {d2,d3},[R1],R5             @LOAD 1-16 pred row 1
998    VLD1.U8 {d4,d5},[R0],R4             @LOAD 1-16 src row 2
999    VLD1.U8 {d6,d7},[R1],R5             @LOAD 1-16 pred row 2
1000    B    LOAD_DONE
1001
1002INTERLEAVED_LOAD_S1:
1003
1004    VLD2.U8 {Q0,Q1},[R0],R4             @LOAD 1-16 src row 1
1005    VLD2.U8 {Q1,Q2},[R1],R5             @LOAD 1-16 pred row 1
1006    VLD2.U8 {Q2,Q3},[R0],R4             @LOAD 1-16 src row 2
1007    VLD2.U8 {Q3,Q4},[R1],R5             @LOAD 1-16 pred row 2
1008LOAD_DONE:
1009
1010    VSUBL.U8 Q4,D0,D2                   @Get residue 1-8 row 1
1011    VSUBL.U8 Q5,D1,D3                   @Get residue 9-16 row 1
1012    VSUBL.U8 Q6,D4,D6                   @Get residue 1-8 row 2
1013    VSUBL.U8 Q7,D5,D7                   @Get residue 9-16 row 2
1014
1015    @Get blk sads
1016    VABDL.U8 Q15,D0,D2
1017    VABAL.U8 Q15,D1,D3
1018    VABAL.U8 Q15,D4,D6
1019    VABAL.U8 Q15,D5,D7
1020    VADDW.S16 Q14,Q14,D30
1021    VADDW.S16 Q14,Q14,D31
1022
1023    VREV64.S16 Q5,Q5                    @Rev row 1
1024    VREV64.S16 Q7,Q7                    @Rev row 2
1025    VSWP D10,D11
1026    VSWP D14,D15
1027
1028    VADD.S16 Q8 ,Q4,Q5                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-8 row 1
1029    VSUB.S16 Q9 ,Q4,Q5                  @o[k] = resi_tmp_1 - resi_tmp_2     k ->9-16 row 1
1030    VADD.S16 Q10,Q6,Q7                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-8 row 2
1031    VSUB.S16 Q11,Q6,Q7                  @o[k] = resi_tmp_1 - resi_tmp_2     k ->9-16 row 2
1032
1033    VREV64.S16    D24,D17               @rev e[k] k-> 4-7 row 1
1034    VREV64.S16    D25,D21               @rev e[k] k-> 4-7 row 2
1035    VMOV.S16    D17,D20
1036
1037    @arrangement OF DATA
1038    @Q8     A1 A2 A3 A4 B1 B2 B3 B4
1039    @Q12    A8 A7 A6 A5 B8 B7 B6 B5
1040
1041    VADD.S16 Q13,Q8,Q12                 @ee[k] = e[k] + e[7 - k] row 1 & 2
1042    VSUB.S16 Q0,Q8,Q12                  @eo[k] = e[k] - e[7 - k] row 1 & 2
1043
1044    @D26 R1ee[0] R1ee[1] R1ee[2] R1ee[3]
1045    @D27 R2ee[0] R2ee[1] R2ee[2] R2ee[3]
1046    VTRN.S32 D26,D27                    @1-cycle stall before it?
1047    @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
1048    @D27 R1ee[2] R1ee[3] R2ee[2] R2ee[3]
1049    VREV32.16 D2,D27                    @1-cycle stall before it?
1050    @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
1051    @D2 R1ee[3] R1ee[2] R2ee[3] R2ee[2]
1052    VMOV.S16 D27,D26
1053    VNEG.S16 D3,D2
1054    @Q13 R1ee[0] R1ee[1] R2ee[0] R2ee[1]  R1ee[0]  R1ee[1]  R2ee[0]  R2ee[1]
1055    @Q1  R1ee[3] R1ee[2] R2ee[3] R2ee[2] -R1ee[3] -R1ee[2] -R2ee[3] -R2ee[2]
1056
1057    @D8 : [0 0] [4 0] [8 0] [12 0]
1058    @D9 : [0 1] [4 1] [8 1] [12 1]
1059    VLD1.S16 {d8,d9},[SP]               @[0 0] [4 0] [8 0] [12 0] [0 1] [4 1] [8 1] [12 1]
1060    VADD.S16 Q1,Q13,Q1                  @ 1-cycle stall before it?
1061    @Q15 R1eee[0] R1eee[1] R2eee[0] R2eee[1] R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
1062
1063    @Q1  R1eee[0] R1eee[1] R2eee[0] R2eee[1]
1064    @    R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
1065    VTRN.S16 D2,D3                      @2-cycle stall before it?
1066    @Q1  R1eee[0] R1eeo[0] R2eee[0] R2eeo[0]
1067    @     R1eee[1] R1eeo[1] R2eee[1] R2eeo[1]
1068
1069    VDUP.S32 D4,D2[0]    @R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]    ;1-cycle stall?
1070    VDUP.S32 D5,D2[1]    @R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
1071    VDUP.S32 D6,D3[0]    @R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
1072    VDUP.S32 D7,D3[1]    @R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
1073
1074    @---------------Process EO--------------------
1075    @ Early start to avoid stalls
1076    MOV R12,#COFF_STD_2B                @Get stride of coeffs
1077
1078    VMULL.S16 Q5,D4,D8                  @   g_ai2_ihevc_trans_16 * R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]
1079    VMLAL.S16 Q5,D6,D9                  @ + g_ai2_ihevc_trans_16 * R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
1080    VMULL.S16 Q6,D5,D8                  @   g_ai2_ihevc_trans_16 * R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
1081    VMLAL.S16 Q6,D7,D9                  @ + g_ai2_ihevc_trans_16 * R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
1082
1083    ADD R11,R9,R12,LSL #1               @Load address of g_ai2_ihevc_trans_16[2]
1084    LSL R12,R12,#2
1085
1086    VLD1.S16 D26,[R11],R12              @LOAD g_ai2_ihevc_trans_16[2][0-4]]
1087
1088    VLD1.S16 D27,[R11],R12              @LOAD g_ai2_ihevc_trans_16[6][0-4]
1089    VMULL.S16 Q1,D26,D0                 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]    R1
1090
1091    VMULL.S16 Q2,D26,D1                 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]    R2
1092
1093    VZIP.S32 Q5,Q6                      @3-cycle instruction
1094    VMULL.S16 Q3,D27,D0                 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]    R1
1095
1096
1097    VLD1.S16 D26,[R11],R12              @LOAD g_ai2_ihevc_trans_16[10][0-4]
1098    VMULL.S16 Q4,D27,D1                 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]    R2
1099
1100    @These values must go to 0 4 8 12 colums hence we need stride *4
1101    LSL R10,R7,#2
1102
1103    VLD1.S16 D27,[R11],R12              @LOAD g_ai2_ihevc_trans_16[14][0-4]
1104
1105    VST1.32 D10,[R2],R10
1106    VMULL.S16 Q8,D27,D1                 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R2
1107
1108    VST1.32 D11,[R2],R10
1109    VMULL.S16 Q7,D27,D0                 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R1
1110
1111    VST1.32 D12,[R2],R10
1112    VMULL.S16 Q5,D26,D0                 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R1
1113
1114    VST1.32 D13,[R2],R10
1115    VMULL.S16 Q6,D26,D1                 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R2
1116
1117    SUB R2,R2,R10,LSL #2
1118
1119    @transpose the 4x4 matrix row1
1120    VTRN.32 Q1, Q3                      @R1 transpose1 -- 2 cycles
1121
1122    @transpose the 4x4 matrix row2
1123    VTRN.32 Q2,Q4                       @R2 transpose1 -- 2 cycles
1124
1125    VTRN.32 Q5, Q7                      @R1 transpose1 -- 2 cycles
1126
1127    VTRN.32 Q6,Q8                       @R2 transpose1 -- 2 cycles
1128
1129    VSWP    D10,D3                      @R1 transpose2
1130    VSWP    D14,D7                      @R1 transpose2
1131
1132    VSWP    D12,D5                      @R2 transpose2
1133    VSWP    D16,D9                      @R2 transpose2
1134
1135    VADD.S32 Q5,Q5,Q1                   @R1 add
1136    VADD.S32 Q3,Q3,Q7                   @R1 add
1137
1138    VADD.S32 Q2,Q2,Q4                   @R2 add
1139    VADD.S32 Q6,Q6,Q8                   @R2 add
1140
1141    VADD.S32 Q5,Q5,Q3                   @R1 add
1142
1143    VADD.S32 Q4,Q6,Q2                   @R2 add
1144
1145    @-----------------------Processing O ----------------------------
1146    @ Early start to avoid stalls
1147    MOV R12,#COFF_STD_2B                @Get coeffs stride
1148    LSL R12,R12,#1
1149    ADD R11,R9,#COFF_STD_2B             @Get address of g_ai2_ihevc_trans_16[1]
1150
1151    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7] -- 2 cycles
1152
1153    VZIP.S32 Q5,Q4                      @ 3 cycle instruction
1154    VMULL.S16 Q6,D18,D4                 @o[0][0-3]*  R1
1155
1156
1157    VMLAL.S16 Q6,D19,D5                 @o[0][4-7]*  R1     ; follows MULL instruction: Multiplier accumulator forwarding
1158    @write to memory
1159    @this should go to 2 6 10 14
1160    LSL R10,R7,#2
1161    ADD R2,R2,R7,LSL #1                 @move to third row
1162    VST1.32 D10,[R2],R10
1163    VMULL.S16 Q7,D22,D4                 @o[0][0-3]*  R2
1164
1165    VST1.32 D11,[R2],R10
1166    VMLAL.S16 Q7,D23,D5                 @o[0][4-7]*  R2
1167
1168    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
1169
1170    VST1.32 D8,[R2],R10
1171    VMULL.S16 Q8,D18,D4                 @o[1][0-3]*  R1
1172
1173    VST1.32 D9,[R2],R10
1174    VMLAL.S16 Q8,D19,D5                 @o[1][4-7]*  R1
1175    SUB R2,R2,R10,LSL #2
1176    SUB R2,R2,R7,LSL #1
1177
1178    @--------------------Done processing EO -------------------------
1179
1180    @ -----------------Processing O continues------------------------
1181
1182    VMULL.S16 Q10,D22,D4                @o[1][0-3]*  R2
1183    VMLAL.S16 Q10,D23,D5                @o[1][4-7]*  R2
1184
1185    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[5][0-7]
1186
1187    VLD1.S16 {d6,d7},[R11],R12          @g_ai2_ihevc_trans_16[7][0-7]
1188    VMULL.S16 Q12,D18,D4                @o[2][0-3]*  R1
1189
1190    VMLAL.S16 Q12,D19,D5                @o[2][4-7]*  R1
1191    VMULL.S16 Q0,D18,D6                 @o[3][0-3]*  R1
1192    VMLAL.S16 Q0,D19,D7                 @o[3][4-7]*  R1
1193
1194    VMULL.S16 Q13,D22,D4                @o[2][0-3]*  R2
1195    VMLAL.S16 Q13,D23,D5                @o[2][4-7]*  R2
1196    VMULL.S16 Q1,D22,D6                 @o[3][0-3]*  R2
1197    VMLAL.S16 Q1,D23,D7                 @o[3][4-7]*  R2
1198
1199    @transpose the 4x4 matrix R1
1200    VTRN.32 Q6, Q8                      @ 2-cycle instruction
1201
1202    VTRN.32 Q12,Q0                      @ 2-cycle instruction
1203
1204    @transpose the 4x4 matrix R2
1205    VTRN.32 Q7,Q10                      @ 2-cycle instruction
1206
1207    VTRN.32 Q13,Q1                      @ 2-cycle instruction
1208
1209    VSWP    D24,D13
1210    VSWP    D0, D17
1211
1212    VSWP     D26,D15
1213    VSWP    D2,D21
1214
1215    VADD.S32 Q8 ,Q8 ,Q6
1216    VADD.S32 Q12,Q12,Q0
1217
1218    VADD.S32 Q10,Q10,Q7
1219    VADD.S32 Q13,Q13,Q1
1220
1221    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[9][0-7]
1222    VADD.S32 Q12 ,Q12 ,Q8
1223
1224    VADD.S32 Q13,Q13,Q10
1225    VMULL.S16 Q3,D18,D4                 @o[4][0-3]*  R1
1226    VMLAL.S16 Q3,D19,D5                 @o[4][4-7]*  R1
1227
1228    VZIP.S32 Q12,Q13
1229    VMULL.S16 Q4,D22,D4                 @o[0][0-3]*  R2
1230
1231
1232    VMLAL.S16 Q4,D23,D5                 @o[0][4-7]*  R2
1233    @write to memory
1234    @this should go to 1 3 5 7
1235    ADD R2,R2,R7
1236    LSL R7,R7,#1
1237    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[11][0-7]
1238
1239    VST1.32 D24,[R2],R7
1240    VMULL.S16 Q5,D18,D4                 @o[5][0-3]*  R1
1241
1242    VST1.32 D25,[R2],R7
1243    VMLAL.S16 Q5,D19,D5                 @o[5][4-7]*  R1
1244
1245    VST1.32 D26,[R2],R7
1246    VMULL.S16 Q6,D22,D4                 @o[0][0-3]*  R2
1247
1248    VST1.32 D27,[R2],R7
1249    VMLAL.S16 Q6,D23,D5                 @o[0][4-7]*  R2
1250
1251    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[13][0-7]
1252
1253    VLD1.S16 {d2,d3},[R11],R12          @g_ai2_ihevc_trans_16[15][0-7]
1254    VMULL.S16 Q7,D18,D4                 @o[6][0-3]*  R1
1255
1256    VMLAL.S16 Q7,D19,D5                 @o[6][4-7]*  R1
1257    VMULL.S16 Q10,D18,D2                @o[7][0-3]*  R1
1258    VMLAL.S16 Q10,D19,D3                @o[7][4-7]*  R1
1259
1260    VMULL.S16 Q8,D22,D4                 @o[0][0-3]*  R2
1261    VMLAL.S16 Q8,D23,D5                 @o[0][4-7]*  R2
1262    VMULL.S16 Q12,D22,D2                @o[0][0-3]*  R2
1263    VMLAL.S16 Q12,D23,D3                @o[0][4-7]*  R2
1264
1265
1266    @transpose the 4x4 matrix R1
1267    VTRN.32 Q3 ,Q5                      @ 2-cycle instruction
1268
1269    VTRN.32 Q7 ,Q10                     @ transpose step 2 R1 , 2-cycle instruction
1270
1271    @transpose the 4x4 matrix R2
1272    VTRN.32 Q4 ,Q6                      @ 2-cycle instruction
1273
1274    VTRN.32 Q8 ,Q12                     @ transpose step 2 R2 , 2-cycle instruction
1275
1276    VSWP    D14,D7                      @ transpose step 3, R1
1277    VSWP    D20,D11                     @ transpose step 4, R1
1278    VSWP    D16,D9                      @ transpose step 3, R2
1279    VSWP    D24,D13                     @ transpose step 4, R2
1280
1281    VADD.S32 Q5 ,Q5 ,Q3
1282    VADD.S32 Q10,Q10,Q7
1283    VADD.S32 Q6 ,Q6 ,Q4
1284    VADD.S32 Q12,Q12,Q8
1285    VADD.S32 Q10,Q10,Q5
1286    VADD.S32 Q12,Q12,Q6
1287
1288    @ 2-cycle stall
1289    VZIP.S32 Q10,Q12                    @ 3-cycle instruction
1290
1291    @ 2-cycle stall
1292    @this should go to 9 11 13 15
1293    VST1.32 D20,[R2],R7
1294
1295    VST1.32 D21,[R2],R7
1296
1297    VST1.32 D24,[R2],R7
1298
1299    VST1.32 D25,[R2],R7
1300
1301    SUB R2,R2,R7,LSL #3
1302    LSR R7,R7,#1
1303    SUB R2,R2,R7
1304
1305    ADD R2,R2,#8                        @MOVE TO NEXT to next COLUMN - pi4_tmp
1306
1307    ADD R8,R8,#2                        @increment loop cntr
1308    CMP R8,#16                          @check lllop cntr
1309    BNE CORE_LOOP_16X16_HORIZ           @jump acc
1310
1311
1312@*****************Vertical transform************************************
1313
1314@Initialization for vert transform
1315@pi4_tmp will be the new src
1316@tmp stride will be new src stride
1317@dst will be new pi4_tmp
1318@dst stride will be new tmp stride
1319@trans table will be of 32 bit
1320
1321    LDR R9,g_ai4_ihevc_trans_16_addr        @get 32 bit transform matrix
1322ulbl3:
1323    ADD R9, R9, PC
1324
1325    SUB R0,R2,#64                       @set tmp as src [-32 to move back to orgin]
1326    MOV R2,R3                           @set dst as tmp
1327    MOV R4,#TMP_STRIDE                  @set tmp stride as src stride
1328    LSR R7,R6,#15                       @Set dst stride as tmp stride
1329    SUB R4,#48                          @Adjust stride 3 previous loads
1330
1331    @Block SAD
1332    VADD.S32 D28,D28,D29
1333    VPADD.S32 D28,D28,D29
1334    VMOV.S32 R3,D28[0]
1335    @ SAD calculation ends -- final value in R3.
1336
1337    @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1]
1338    @values of g_ai4_ihevc_trans_16 and write to stack
1339    MOV R12,#COFF_STD_W
1340    LSL R12,R12,#2
1341    VLD1.S32 D28,[R9],R12
1342    VLD1.S32 D29,[R9],R12
1343    VLD1.S32 D30,[R9],R12
1344    VLD1.S32 D31,[R9],R12
1345    SUB R9,R9,R12,LSL #2
1346
1347    VREV64.32 Q15,Q15
1348    VTRN.S32 Q14,Q15
1349    VST1.S32 {Q14-Q15},[SP]
1350
1351    VMOV.U32 Q14,#RADD                  @get the round factor to q14
1352    VMOV.U32 Q15,#SHIFT                 @Get the shift to neon
1353
1354    MOV R8,#0                           @INIT LOOP
1355
1356CORE_LOOP_16X16_VERT:
1357
1358    VLD1.S32 {D0,D1},[R0]!              @LOAD 1-4 src R1
1359    VLD1.S32 {D2,D3},[R0]!              @LOAD 5-8 pred R1
1360    VLD1.S32 {D4,D5},[R0]!              @LOAD 9-12 src R1
1361    VLD1.S32 {D6,D7},[R0],R4            @LOAD 12-16 pred R1
1362
1363    VLD1.S32 {D8,D9},[R0]!              @LOAD 1-4 src R2
1364    VLD1.S32 {D10,D11},[R0]!            @LOAD 5-8 pred R2
1365    VLD1.S32 {D12,D13},[R0]!            @LOAD 9-12 src R2
1366    VLD1.S32 {D14,D15},[R0],R4          @LOAD 12-16 pred R2
1367
1368    VREV64.S32 Q2,Q2                    @Rev 9-12 R1
1369    VREV64.S32 Q3,Q3                    @Rev 12-16 R1
1370    VREV64.S32 Q6,Q6                    @Rev 9-12 R2
1371    VREV64.S32 Q7,Q7                    @Rev 12-16 R2
1372
1373    VSWP D6,D7
1374    VSWP D4,D5
1375    VADD.S32 Q8 ,Q0,Q3                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-4  R1
1376    VSWP D12,D13                        @ dual issued with prev. instruction
1377    VADD.S32 Q9 ,Q1,Q2                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 5-8  R1
1378    VSWP D14,D15                        @ dual issued with prev. instruction
1379    VSUB.S32 Q10,Q0,Q3                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 1-4  R1
1380    VSUB.S32 Q11,Q1,Q2                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 5-8  R1
1381
1382    VADD.S32 Q12,Q4,Q7                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-4  R2
1383    VREV64.S32    Q9 ,Q9                @rev e[k] k-> 4-7 R1, dual issued with prev. instruction
1384    VADD.S32 Q13,Q5,Q6                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 5-8  R2
1385    VSUB.S32 Q0 ,Q4,Q7                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 1-4  R2
1386    VSWP D18,D19                        @ dual issued with prev. instruction
1387    VSUB.S32 Q1 ,Q5,Q6                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 5-8  R2
1388    VREV64.S32    Q13,Q13               @rev e[k] k-> 4-7 R2, dual issued with prev. instruction
1389
1390    VADD.S32 Q2,Q8,Q9                   @ee[k] = e[k] + e[7 - k] row R1
1391    VSUB.S32 Q3,Q8,Q9                   @eo[k] = e[k] - e[7 - k] row R1
1392    VSWP D26,D27
1393
1394
1395    VADD.S32 Q4,Q12,Q13                 @ee[k] = e[k] + e[7 - k] row R2
1396    VSUB.S32 Q5,Q12,Q13                 @eo[k] = e[k] - e[7 - k] row R2
1397    VREV64.S32 D5,D5                    @rev ee[k] 4-7 R1, dual issued with prev. instruction
1398
1399    VADD.S32 D12,D4,D5                  @eee[0] eee[1]    R1
1400    VSUB.S32 D13,D4,D5                  @eeo[0] eeo[1]    R1
1401    VREV64.S32 D9,D9                    @rev ee[k] 4-7 R2, dual issued with prev. instruction
1402
1403
1404    VADD.S32 D14,D8,D9                  @eee[0] eee[1]    R2
1405    VSUB.S32 D15,D8,D9                  @eeo[0] eeo[1]    R2
1406
1407    VLD1.S32 {Q12,Q13},[SP]             @Load g_ai2_ihevc_trans_16[xx]->  Q12 : [0 0] [8 0] [4 0] [12 0]  Q13 : [0 1] [8 1] [4 1] [12 1]
1408    VREV64.S32 Q8,Q6                    @Q6 : eee[0] eee[1] eeo[0] eeo[1] R1   ->     ;Q8 : eee[1] eee[0] eeo[1] eeo[0] R1
1409
1410    VREV64.S32 Q9,Q7                    @Q7 : eee[0] eee[1] eeo[0] eeo[1] R2     ->    ;Q9 : eee[1] eee[0] eeo[1] eeo[0] R2
1411
1412
1413    VMUL.S32 Q4,Q6,Q12                  @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1]    R1
1414    VMLA.S32 Q4,Q8,Q13                  @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0]    R1
1415
1416    VMUL.S32 Q6,Q7,Q12                  @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1]    R2
1417    VMLA.S32 Q6,Q9,Q13                  @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R2
1418
1419                                        @Q3    :R1E00 R1E01 R1E02 R1E03
1420                                        @Q5    :R2E00 R2E01 R2E02 R2E03
1421    VSWP D7,D10                         @ dual issued with prev. instruction
1422                                        @Q3    :R1E00 R1E01 R2E00 R2E01
1423                                        @Q5    :R1E02 R1E03 R2E02 R2E03
1424    VSWP D7,D11
1425                                        @Q3    :R1E00 R1E01 R2E02 R2E03
1426                                        @Q5    :R1E02 R1E03 R2E00 R2E01
1427
1428    MOV R12,#COFF_STD_W
1429    ADD R11,R9,R12,LSL #1               @Get to the 2nd row of src
1430    LSL R12,R12,#2
1431
1432    VLD1.S32  {D14,D15},[R11],R12       @LOAD g_ai2_ihevc_trans_16[2][0-4] -> 2G0 2G1 2G2 2G3, 2-cycle instr.
1433
1434    VADD.S32  Q4,Q4,Q14                 @ROUND  R1
1435    VMUL.S32  Q12,Q3,Q7                 @2G0 2G1 2G2 2G3 * R1E00 R1E01 R2E02 R2E03, 4-cycle instruction
1436    VSWP      D14,D15                   @2G0 2G1 2G2 2G3 -> 2G2 2G3 2G0 2G1, dual issued with prev. instruction
1437
1438    VADD.S32 Q6,Q6,Q14                  @ROUND  R2
1439
1440    VSHRN.S32 D8,Q4,#SHIFT              @NARROW R1
1441
1442    VLD1.S32  {D16,D17},[R11],R12       @LOAD g_ai2_ihevc_trans_16[6][0-4]
1443    VSHRN.S32 D9,Q6,#SHIFT              @NARROW R2, dual issued in 2nd cycle
1444
1445    VMUL.S32  Q2,Q3,Q8                  @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4], 4-cycle instruction
1446    VSWP      D16,D17                   @dual issued with prev. instr.
1447
1448    VZIP.S16 D8,D9                      @INTERLEAVE R1 R2 R1 R2 R1 R2 to write
1449    VMLA.S32  Q12,Q5,Q7                 @2G2 2G3 2G0 2G1 * R1E02 R1E03 R2E00 R2E01, 4-cycle instruction
1450
1451
1452    @WRITE INTO MEM the values or wait to be shuffled
    @These values must go to 0 4 8 12 columns
1454    LSL R10,R7,#2
1455    VST1.S32 D8[0],[R2],R10
1456
1457    VST1.S32 D9[0],[R2],R10
1458
1459    VST1.S32 D8[1],[R2],R10
1460    VPADD.S32 D18,D24,D25               @D18[0] -> 2G0*R1E00+2G1*R1E01 2G2*R2E02+2G3*R2E03
1461                                        @D18[1] -> 2G2*R1E02+2G3*R1E03 2G0*R2E00+*2G1R2E01
1462
1463    VST1.S32 D9[1],[R2],R10
1464    VMLA.S32  Q2,Q5,Q8                  @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]
1465    LSL R10,R10,#2
1466    SUB R2,R2,R10
1467
1468    VLD1.S32  {D14,D15},[R11],R12       @LOAD g_ai2_ihevc_trans_16[10][0-4]
1469
1470    VMUL.S32  Q6,Q3,Q7                  @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4]
1471    VSWP      D14,D15                   @ dual issued with prev. instruction
1472    VPADD.S32 D19,D4,D5
1473
1474    VLD1.S32  {D16,D17},[R11],R12       @LOAD g_ai2_ihevc_trans_16[14][0-4]
1475    VMUL.S32  Q2,Q3,Q8                  @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4]
1476    VSWP      D16,D17
1477
1478    VMLA.S32  Q6,Q5,Q7                  @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]
1479    VADD.S32 Q9,Q9,Q14                  @Round by RADD R1
1480    VMLA.S32  Q2,Q5,Q8                  @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]
1481    VSHRN.S32 D8,Q9,#SHIFT              @Shift by SHIFT
1482    VPADD.S32 D24,D12,D13
1483    @---------------Processing O, Row 1 and Row 2--------------------------------------
1484    @ Early start to avoid stalls
1485    MOV R12,#COFF_STD_W
1486    ADD R11,R9,R12                      @Get 1ST row
1487    LSL R12,R12,#1
1488
1489    LSL R10,R7,#2
1490    ADD R2,R2,R7,LSL #1                 @move to third row
1491    @this should go to 2  6 10 14
1492    VST1.S32 D8[0],[R2],R10
1493
1494    VST1.S32 D8[1],[R2],R10
1495    VPADD.S32 D25,D4,D5                 @ dual issued with prev. instruction in 2nd cycle
1496
1497    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7]
1498    VADD.S32 Q12,Q12,Q14                @Round by RADD R2, dual issued with prev. instruction in 2nd cycle
1499    VMUL.S32 Q6,Q2,Q0                   @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R2
1500    VMLA.S32 Q6,Q3,Q1                   @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R2
1501    VSHRN.S32 D9,Q12,#SHIFT             @Shift by SHIFT
1502
1503    VMUL.S32 Q2,Q2,Q10                  @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R1
1504    VMLA.S32 Q2,Q3,Q11                  @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R1
1505    VADD.S32 D11,D12,D13                @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R2, dual issued with prev. instr.
1506    VST1.S32 D9[0],[R2],R10
1507
1508    VST1.S32 D9[1],[R2],R10
1509    VADD.S32 D10,D4,D5                  @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R1, dual issued with prev. instr.
1510    LSL R10,R10,#2                      @go back to orgin
1511    SUB R2,R2,R10
1512    SUB R2,R2,R7,LSL #1
1513
1514    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
1515
1516    VMUL.S32 Q7,Q2,Q10                  @o[0][0-3]
1517    VMLA.S32 Q7,Q3,Q11                  @o[0][4-7]
1518    VMUL.S32 Q8,Q2,Q0                   @o[0][0-3]
1519    VMLA.S32 Q8,Q3,Q1                   @o[0][4-7]
1520
1521    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[5][0-7]
1522    VADD.S32 D18,D14,D15
1523    VMUL.S32 Q12,Q2,Q10                 @o[0][0-3]
1524    VMLA.S32 Q12,Q3,Q11                 @o[0][4-7]
1525    VADD.S32 D19,D16,D17
1526    VMUL.S32 Q4,Q2,Q0
1527    VMLA.S32 Q4,Q3,Q1
1528    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[7][0-7]
1529    VADD.S32 D26,D24,D25                @ dual issued with prev. instr.
1530    VMUL.S32 Q6,Q2,Q10                  @o[0][0-3]
1531    VMLA.S32 Q6,Q3,Q11                  @o[0][4-7]
1532    VADD.S32 D27,D8,D9
1533    VMUL.S32 Q4,Q2,Q0
1534    VMLA.S32 Q4,Q3,Q1
1535    VADD.S32 D12,D12,D13
1536    @Q5 Q9 Q13 Q6
1537    VPADD.S32 D14,D10,D11
1538    VPADD.S32 D15,D18,D19
1539    VPADD.S32 D16,D26,D27
1540    VADD.S32  D13,D8,D9
1541    VADD.S32 Q9,Q7,Q14
1542    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[0][0-7]
1543    VPADD.S32 D17,D12,D13               @ dual issued with prev. instr. in 2nd cycle
1544
1545    VMUL.S32 Q4,Q2,Q10                  @o[0][0-3]
1546    VMLA.S32 Q4,Q3,Q11                  @o[0][4-7]
1547
1548    VADD.S32 Q12,Q8,Q14
1549
1550    VMUL.S32 Q6,Q2,Q0                   @o[0][0-3]
1551    VMLA.S32 Q6,Q3,Q1                   @o[0][4-7]
1552
1553    VSHRN.S32 D26,Q9,#SHIFT
1554    VSHRN.S32 D27,Q12,#SHIFT
1555    VADD.S32 D10,D8,D9
1556    @write to memory this should go to 1 3 5 7
1557    ADD R2,R2,R7
1558    LSL R7,R7,#1
1559    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7]
1560    VADD.S32 D11,D12,D13                @ dual issued with prev. instr.
1561
1562    VST1.S32 D26[0],[R2],R7
1563    VMUL.S32 Q7,Q2,Q10                  @o[0][0-3]
1564    VMLA.S32 Q7,Q3,Q11                  @o[0][4-7]
1565    VST1.S32 D26[1],[R2],R7
1566    VMUL.S32 Q8,Q2,Q0                   @o[0][0-3]
1567    VMLA.S32 Q8,Q3,Q1                   @o[0][4-7]
1568    VST1.S32 D27[0],[R2],R7
1569    VADD.S32 D18,D14,D15
1570    VST1.S32 D27[1],[R2],R7
1571
1572    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[2][0-7]
1573    VADD.S32 D19,D16,D17                @ dual issued with prev. instr.
1574
1575    VMUL.S32 Q12,Q2,Q10                 @o[0][0-3]
1576    VMLA.S32 Q12,Q3,Q11                 @o[0][4-7]
1577    VMUL.S32 Q4,Q2,Q0
1578    VMLA.S32 Q4,Q3,Q1
1579
1580    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
1581    VADD.S32 D26,D24,D25
1582
1583    VMUL.S32 Q6,Q2,Q10                  @o[0][0-3]
1584    VMLA.S32 Q6,Q3,Q11                  @o[0][4-7]
1585    VADD.S32  D27,D8,D9
1586
1587    VMUL.S32 Q4,Q2,Q0
1588    VMLA.S32 Q4,Q3,Q1
1589    VADD.S32 D12,D12,D13
1590    @Q5 Q9 Q13 Q6
1591    VPADD.S32 D14,D10,D11
1592    VPADD.S32 D15,D18,D19
1593    VPADD.S32 D16,D26,D27
1594    VADD.S32  D13,D8,D9
1595    VADD.S32 Q9,Q7,Q14
1596    @ 1- cycle stall?
1597    VPADD.S32 D17,D12,D13
1598    VSHRN.S32 D22,Q9,#SHIFT
1599    VADD.S32 Q10,Q8,Q14
1600    @ 2-cycle stall?
1601    VSHRN.S32 D23,Q10,#SHIFT
1602
1603    @this should go to 9 11 13 15
1604    @LSL R11,R7,#1
1605    VST1.S32 D22[0],[R2],R7
1606    VST1.S32 D22[1],[R2],R7
1607    VST1.S32 D23[0],[R2],R7
1608    VST1.S32 D23[1],[R2],R7
1609
1610    SUB R2,R2,R7,LSL #3
1611    LSR R7,R7,#1
1612    SUB R2,R2,R7
1613
1614    ADD R2,R2,#4                        @MOVE TO NEXT to next COLUMN
1615
1616    ADD R8,R8,#2                        @increment loop cntr by 2 since we process loop as 2 cols
1617    CMP R8,#16                          @check loop cntr
1618    BNE CORE_LOOP_16X16_VERT            @jump acc
1619
1620    MOV R0,R3
1621
1622    ADD SP,SP,#32
1623    vpop {d8 - d15}
1624    LDMFD          sp!,{r4-r12,PC}      @stack store values of the arguments
1625
1626