1//.include "ihevc_neon_macros.s"
// push_v_regs: spill x8-x17, x19-x22 and x29/x30 into a 128-byte frame
// carved out below the incoming stack pointer.
//
// Frame layout, as offsets from the sp value left behind by this macro:
//   #0   x29, x30        #64  x14, x15
//   #16  x21, x22        #80  x12, x13
//   #32  x19, x20        #96  x10, x11
//   #48  x16, x17        #112 x8,  x9
// sp moves down by exactly 128 bytes, so it keeps its 16-byte alignment.
.macro push_v_regs
    stp             x29, x30, [sp, #-128]!  // allocate the whole frame first, then fill it
    stp             x21, x22, [sp, #16]
    stp             x19, x20, [sp, #32]
    stp             x16, x17, [sp, #48]
    stp             x14, x15, [sp, #64]
    stp             x12, x13, [sp, #80]
    stp             x10, x11, [sp, #96]
    stp             x8, x9, [sp, #112]
.endm
// pop_v_regs: reload x8-x17, x19-x22 and x29/x30 from the 128-byte frame
// that starts at the current sp, then release that frame.
//
// Expected frame layout, as offsets from sp on entry to this macro:
//   #0   x29, x30        #64  x14, x15
//   #16  x21, x22        #80  x12, x13
//   #32  x19, x20        #96  x10, x11
//   #48  x16, x17        #112 x8,  x9
.macro pop_v_regs
    ldp             x8, x9, [sp, #112]
    ldp             x10, x11, [sp, #96]
    ldp             x12, x13, [sp, #80]
    ldp             x14, x15, [sp, #64]
    ldp             x16, x17, [sp, #48]
    ldp             x19, x20, [sp, #32]
    ldp             x21, x22, [sp, #16]
    ldp             x29, x30, [sp], #128    // last load also frees the frame (sp += 128)
.endm
22
23.text
24.p2align 2
25.global ixheaacd_scale_factor_process_armv8
26
//-----------------------------------------------------------------------
// ixheaacd_scale_factor_process_armv8
//
// Walks Tbands bands. For every band one 16-bit scale factor and one
// 8-bit width ("Offset") are consumed, and the 32-bit values at
// x_invquant are rewritten in place:
//   * scale_factor <  24 : the band is cleared to zero
//   * scale_factor >= 24 : each value is multiplied by the low 16 bits of
//       scale_table_ptr[scale_factor & 3], the product is shifted right
//       by 16, and the result is shifted by
//       shift = bias - (scale_factor >> 2), bias = 37 or 34.
//
// In:  x0 = x_invquant      (32-bit values, updated in place)
//      x1 = Scfactor        (signed 16-bit, one per band)
//      w2 = Tbands          (band count; nothing is done when <= 0)
//      x3 = Offset/width    (one unsigned byte per band)
//      x4 = scale_table_ptr (32-bit entries, indexed by scale_factor & 3)
//      w5 = selector: bias is 34 when w5 > 2, otherwise 37
//      x6, x7 = not read by this routine
// Clobbers: x0-x7, x9, x11, x14, x17, flags (callee-saved registers are
//      saved and restored by push_v_regs / pop_v_regs).
//
// NOTE(review): Tbands and the x5 selector are treated as 32-bit ints, so
// only the w views are tested. Under AAPCS64 the upper 32 bits of a
// register carrying a 32-bit argument are unspecified. Confirm against
// the C prototype.
// NOTE(review): the width is rounded up by the loops (4 values per pass
// when clearing / shifting right, 2 otherwise), and only the clearing
// path skips a zero width. Presumably callers guarantee suitable widths;
// verify.
//-----------------------------------------------------------------------
ixheaacd_scale_factor_process_armv8:

    push_v_regs

    MOV             x9, x4              // keep scale_table_ptr; x4 is reused for the band width

    CMP             w2, #0              // Tbands (32-bit compare, see header note)
    BGT             lbl17

    pop_v_regs
    ret
lbl17:
    CMP             w5, #2              // selector (32-bit compare, see header note)
    BGT             ADD_34
    MOV             x11, #0x25          // bias = 37
    B               TBANDS_LOOP
ADD_34:
    MOV             x11, #0x22          // bias = 34

TBANDS_LOOP:
    LDRSH           x5, [x1], #2        // scale_factor = *Scfactor++;
    LDRB            w4, [x3], #1        // Offset; the w write already clears the upper half of x4

    CMP             x5, #0x18           // if(scale_factor < 24)
    BGE             SCALE_FACTOR_GE_12

    CMP             x4, #0
    BLE             OFFSET_ZERO         // nothing to clear for a zero width

SCALE_FACTOR_LT_12:

    STP             xzr, xzr, [x0], #16 // clear four 32-bit values per pass
    SUBS            x4, x4, #4
    BGT             SCALE_FACTOR_LT_12
    B               OFFSET_ZERO

SCALE_FACTOR_GE_12:

    SUBS            x6, x11, x5, ASR #2 // shift = bias - (scale_factor >> 2); flags feed the BLE below
    AND             x5, x5, #3          // scale_factor & 0x0003

    // None of the next two instructions touch the flags set by SUBS.
    LDR             w5, [x9, x5, LSL #2] // scale_table_ptr[(scale_factor & 0x0003)]
    SXTH            w17, w5             // scale_short = low 16 bits, sign-extended to 32 bits
    BLE             SHIFT_LE_ZERO       // if shift less than or equal to zero

    SUB             x14, x6, #1         // shift - 1: the extra doubling of the product is folded in

SHIFT_POSITIVE: // four values per pass
    LDP             w6, w7, [x0, #0]    // temp1 = *x_invquant
    LDP             w19, w20, [x0, #8]

    SMULL           x6, w6, w17         // 64-bit product temp1 * scale_short
    SMULL           x7, w7, w17
    SMULL           x19, w19, w17
    SMULL           x20, w20, w17

    ASR             x6, x6, #16         // buffex1 = product >> 16
    ASR             x7, x7, #16
    ASR             x19, x19, #16
    ASR             x20, x20, #16

    ASR             x6, x6, x14         // buffex1 >>= (shift - 1)
    ASR             x7, x7, x14
    ASR             x19, x19, x14
    ASR             x20, x20, x14

    STP             w6, w7, [x0], #8    // *x_invquant++ = buffex1
    STP             w19, w20, [x0], #8

    SUBS            x4, x4, #4

    BGT             SHIFT_POSITIVE
    B               OFFSET_ZERO
SHIFT_LE_ZERO:

    NEGS            x14, x6             // x14 = -shift
    BGT             SHIFT_NEGTIVE1      // shift < 0

SHIFT_ZERO: // shift == 0, two values per pass
    LDP             w6, w7, [x0, #0]    // temp1 = *x_invquant;

    SMULL           x6, w6, w17         // 64-bit product temp1 * scale_short
    SMULL           x7, w7, w17

    ASR             x6, x6, #16         // buffex1 = product >> 16
    ASR             x7, x7, #16

    LSL             x6, x6, #1          // buffex1 <<= 1
    LSL             x7, x7, #1

    STP             w6, w7, [x0], #8    // *x_invquant++ = buffex1;

    SUBS            x4, x4, #2

    BGT             SHIFT_ZERO
    B               OFFSET_ZERO

SHIFT_NEGTIVE1:
    SUB             x14, x14, #1        // x14 = (-shift) - 1
SHIFT_NEGTIVE: // shift < 0, two values per pass

    LDP             w6, w7, [x0, #0]
    LSL             w6, w6, w14         // pre-shift the input left by (-shift) - 1
    LSL             w7, w7, w14

    SMULL           x6, w6, w17         // 64-bit product with scale_short
    SMULL           x7, w7, w17
    ASR             x6, x6, #16         // buffex1 = product >> 16
    ASR             x7, x7, #16

    LSL             x6, x6, #2          // remaining two doublings
    LSL             x7, x7, #2

    STP             w6, w7, [x0], #8    // *x_invquant++ = buffex1;

    SUBS            x4, x4, #2

    BGT             SHIFT_NEGTIVE

OFFSET_ZERO:
    SUBS            w2, w2, #1          // next band (32-bit counter)
    BGT             TBANDS_LOOP

    pop_v_regs
    ret
167