; Copyright (c) 2007-2008 CSIRO
; Copyright (c) 2007-2009 Xiph.Org Foundation
; Copyright (c) 2013      Parrot
; Written by Aurélien Zanelli
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  AREA  |.text|, CODE, READONLY

  GET    celt/arm/armopts.s

IF OPUS_ARM_MAY_HAVE_EDSP
  EXPORT celt_pitch_xcorr_edsp
ENDIF

IF OPUS_ARM_MAY_HAVE_NEON
  EXPORT celt_pitch_xcorr_neon
ENDIF

IF OPUS_ARM_MAY_HAVE_NEON

; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
  ; input:
  ;   r3     = int         len
  ;   r4     = opus_val16 *x
  ;   r5     = opus_val16 *y
  ;   q0     = opus_val32  sum[4]
  ; output:
  ;   q0     = opus_val32  sum[4]
  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
  ; internal usage:
  ;   r12 = int j
  ;   d3  = y_3|y_2|y_1|y_0
  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
  ;   q8  = scratch
  ;
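  ; As a rough orientation, a plain C sketch of what this kernel computes
  ; (a sketch only, assuming MAC16_16 is the usual 16x16+32 multiply-accumulate
  ; macro from the CELT fixed-point headers; the real reference is in the C
  ; sources, not in this file):
  ;
  ;   for (j = 0; j < len; j++)
  ;     for (k = 0; k < 4; k++)
  ;       sum[k] = MAC16_16(sum[k], x[j], y[j+k]);
  ;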
  ; Load y[0...3]
  ; This requires len>0 to always be valid (which we assert in the C code).
  VLD1.16      {d5}, [r5]!
  SUBS         r12, r3, #8
  BLE xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need. Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
  ; - 2 cycles of ARM instructions,
  ; - 10 cycles of load/store/byte permute instructions, and
  ; - 9 cycles of data processing instructions.
  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
  ; latter two categories, meaning the whole loop should run in 10 cycles per
  ; iteration, barring cache misses.
  ;
  ; Load x[0...7]
  VLD1.16      {d6, d7}, [r4]!
  ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  VAND         d3, d5, d5
  SUBS         r12, r12, #8
  ; Load y[4...11]
  VLD1.16      {d4, d5}, [r5]!
  VMLAL.S16    q0, d3, d6[0]
  VEXT.16      d16, d3, d4, #1
  VMLAL.S16    q0, d4, d7[0]
  VEXT.16      d17, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d3, d4, #2
  VMLAL.S16    q0, d17, d7[1]
  VEXT.16      d17, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d3, d4, #3
  VMLAL.S16    q0, d17, d7[2]
  VEXT.16      d17, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
  VMLAL.S16    q0, d17, d7[3]
  BGT xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
  ADDS         r12, r12, #4
  BLE xcorr_kernel_neon_process2
  ; Load x[0...3]
  VLD1.16      d6, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #4
  ; Load y[4...7]
  VLD1.16      d5, [r5]!
  VMLAL.S16    q0, d4, d6[0]
  VEXT.16      d16, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
  ADDS         r12, r12, #2
  BLE xcorr_kernel_neon_process1
  ; Load x[0...1]
  VLD2.16      {d6[],d7[]}, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #2
  ; Load y[4...5]
  VLD1.32      {d5[]}, [r5]!
  VMLAL.S16    q0, d4, d6
  VEXT.16      d16, d4, d5, #1
  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
  ; instead of VEXT, since it's a data-processing instruction.
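  ; (Assumed lane picture for the VSRI below, given the register contents
  ; described above: d5 = y_5|y_4|y_5|y_4 and d4 = y_3|y_2|y_1|y_0 going in,
  ; d5 = y_5|y_4|y_3|y_2 coming out.)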
  VSRI.64      d5, d4, #32
  VMLAL.S16    q0, d16, d7
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
  ; Load next *x
  VLD1.16      {d6[]}, [r4]!
  ADDS         r12, r12, #1
  ; y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16    q0, d5, d6
  MOVLE        pc, lr
; Now process 1 last sample, not reading ahead.
  ; Load last *y
  VLD1.16      {d4[]}, [r5]!
  VSRI.64      d4, d5, #16
  ; Load last *x
  VLD1.16      {d6[]}, [r4]!
  VMLAL.S16    q0, d4, d6
  MOV          pc, lr
  ENDP

; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
;  opus_val32 *xcorr, int len, int max_pitch)
celt_pitch_xcorr_neon PROC
  ; input:
  ;   r0  = opus_val16 *_x
  ;   r1  = opus_val16 *_y
  ;   r2  = opus_val32 *xcorr
  ;   r3  = int         len
  ; output:
  ;   r0  = int         maxcorr
  ; internal usage:
  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
  ;   r6  = int         max_pitch
  ;   r12 = int         j
  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
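  ;
  ; A hedged C sketch of the structure this routine follows (MAX32, MAC16_16,
  ; and xcorr_kernel stand in for the fixed-point macros and kernel in the C
  ; sources; this is an orientation aid, not the exact reference code):
  ;
  ;   opus_val32 maxcorr = 1;
  ;   for (i = 0; i < max_pitch - 3; i += 4) {
  ;     opus_val32 sum[4] = {0, 0, 0, 0};
  ;     xcorr_kernel(_x, _y + i, sum, len);
  ;     for (k = 0; k < 4; k++) {
  ;       xcorr[i + k] = sum[k];
  ;       maxcorr = MAX32(maxcorr, sum[k]);
  ;     }
  ;   }
  ;   for (; i < max_pitch; i++) {
  ;     opus_val32 sum = 0;
  ;     for (j = 0; j < len; j++)
  ;       sum = MAC16_16(sum, _x[j], _y[i + j]);
  ;     xcorr[i] = sum;
  ;     maxcorr = MAX32(maxcorr, sum);
  ;   }
  ;   return maxcorr;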
  STMFD        sp!, {r4-r6, lr}
  LDR          r6, [sp, #16]
  VMOV.S32     q15, #1
  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  SUBS         r6, r6, #4
  BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
  ; xcorr_kernel_neon parameters:
  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV          r4, r0
  MOV          r5, r1
  VEOR         q0, q0, q0
  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
  ; So we don't save/restore any other registers.
  BL xcorr_kernel_neon
  SUBS         r6, r6, #4
  VST1.32      {q0}, [r2]!
  ; _y += 4
  ADD          r1, r1, #8
  VMAX.S32     q15, q15, q0
  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  BGE celt_pitch_xcorr_neon_process4
; We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
  ADDS         r6, r6, #4
  ; Reduce maxcorr to a single value
  VMAX.S32     d30, d30, d31
  VPMAX.S32    d30, d30, d30
  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
  MOV          r4, r0
  MOV          r5, r1
  VMOV.I32     q0, #0
  SUBS         r12, r3, #8
  BLT celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
  ; Load x[0...7]
  VLD1.16      {q1}, [r4]!
  ; Load y[0...7]
  VLD1.16      {q2}, [r5]!
  SUBS         r12, r12, #8
  VMLAL.S16    q0, d4, d2
  VMLAL.S16    q0, d5, d3
  BGE celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
  ADDS         r12, r12, #4
  BLT celt_pitch_xcorr_neon_process_remaining4_done
  ; Load x[0...3]
  VLD1.16      {d2}, [r4]!
  ; Load y[0...3]
  VLD1.16      {d3}, [r5]!
  SUB          r12, r12, #4
  VMLAL.S16    q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done
  ; Reduce the sum to a single value.
  VADD.S32     d0, d0, d1
  VPADDL.S32   d0, d0
  ADDS         r12, r12, #4
  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
  VLD1.16      {d2[]}, [r4]!
  VLD1.16      {d3[]}, [r5]!
  SUBS         r12, r12, #1
  VMLAL.S16    q0, d2, d3
  BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
  VST1.32      {d0[0]}, [r2]!
  VMAX.S32     d30, d30, d0
  SUBS         r6, r6, #1
  ; _y++
  ADD          r1, r1, #2
  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
  VMOV.32      r0, d30[0]
  LDMFD        sp!, {r4-r6, pc}
  ENDP

ENDIF

IF OPUS_ARM_MAY_HAVE_EDSP

; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
xcorr_kernel_edsp PROC
  ; input:
  ;   r3      = int         len
  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
  ;   r6...r9 = opus_val32  sum[4]
  ; output:
  ;   r6...r9 = opus_val32  sum[4]
  ; preserved: r0-r5
  ; internal usage
  ;   r2      = int         j
  ;   r12,r14 = opus_val16  x[4]
  ;   r10,r11 = opus_val16  y[4]
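  ;
  ; A note on the SMLA<x><y> forms used below (standard ARMv5E EDSP semantics):
  ; SMLABB rd, rn, rm, ra computes ra + bottom16(rn)*bottom16(rm),
  ; SMLABT uses bottom16(rn)*top16(rm), SMLATB uses top16(rn)*bottom16(rm),
  ; and SMLATT uses top16(rn)*top16(rm), all as signed 16x16+32
  ; multiply-accumulates. That is why each 32-bit load of two packed
  ; opus_val16 values can feed four accumulations without any unpacking.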
  STMFD        sp!, {r2,r4,r5,lr}
  LDR          r10, [r5], #4      ; Load y[0...1]
  SUBS         r2, r3, #4         ; j = len-4
  LDR          r11, [r5], #4      ; Load y[2...3]
  BLE xcorr_kernel_edsp_process4_done
  LDR          r12, [r4], #4      ; Load x[0...1]
  ; Stall
xcorr_kernel_edsp_process4
  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
  ; other. Every other instruction here dual-issues with a multiply, and is
  ; thus "free". There should be no stalls in the body of the loop.
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR          r14, [r4], #4      ; Load x[2...3]
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS         r2, r2, #4         ; j-=4
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR          r10, [r5], #4      ; Load y[4...5]
  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT        r12, [r4], #4      ; Load x[0...1]
  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR          r11, [r5], #4      ; Load y[6...7]
  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done
  ADDS         r2, r2, #4
  BLE xcorr_kernel_edsp_done
  LDRH         r12, [r4], #2      ; r12 = *x++
  SUBS         r2, r2, #1         ; j--
  ; Stall
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
  LDRGTH       r14, [r4], #2      ; r14 = *x++
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS         r2, r2, #1         ; j--
  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
  LDRGTH       r12, [r4], #2      ; r12 = *x++
  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
  BLE xcorr_kernel_edsp_done
  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
  CMP          r2, #1             ; j--
  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRGTH       r14, [r4]          ; r14 = *x
  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH         r11, [r5]          ; r11 = y_6 = *y
  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
  LDMFD        sp!, {r2,r4,r5,pc}
  ENDP

celt_pitch_xcorr_edsp PROC
  ; input:
  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
  ;   r2  = opus_val32 *xcorr
  ;   r3  = int         len
  ; output:
  ;   r0  = maxcorr
  ; internal usage
  ;   r4  = opus_val16 *x
  ;   r5  = opus_val16 *y
  ;   r6  = opus_val32  sum0
  ;   r7  = opus_val32  sum1
  ;   r8  = opus_val32  sum2
  ;   r9  = opus_val32  sum3
  ;   r1  = int         max_pitch
  ;   r12 = int         j
  STMFD        sp!, {r4-r11, lr}
  MOV          r5, r1
  LDR          r1, [sp, #36]
  MOV          r4, r0
  TST          r5, #3
  ; maxcorr = 1
  MOV          r0, #1
  BEQ          celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
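; Roughly, in C terms (a hedged sketch of this fix-up only, not the reference
; code; MAC16_16 and MAX32 are the usual CELT fixed-point macros):
;   if (((uintptr_t)_y & 3) != 0) {
;     opus_val32 sum = 0;
;     for (j = 0; j < len; j++)
;       sum = MAC16_16(sum, _x[j], _y[j]);
;     maxcorr = MAX32(maxcorr, sum);
;     *xcorr++ = sum;
;     _y++;          /* y is now 32-bit aligned */
;     max_pitch--;
;   }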
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  LDRH         r8, [r5], #2
  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR          r6, [r4], #4
  MOV          r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4
  LDR          r9, [r5], #4
  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  LDR          r7, [r4], #4
  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDR          r8, [r5], #4
  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  SUBS         r12, r12, #4         ; j-=4
  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGT        r6, [r4], #4
  BGT celt_pitch_xcorr_edsp_process1u_loop4
  MOV          r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done
  ADDS         r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
  LDRGEH       r6, [r4], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
  SUBGES       r12, r12, #1
  LDRGTH       r8, [r5], #2
  BGT celt_pitch_xcorr_edsp_process1u_loop1
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ADD          r5, r5, #2
  MOVLT        r0, r14
  SUBS         r1, r1, #1
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  SUBS         r1, r1, #4
  BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
  ; xcorr_kernel_edsp parameters:
  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV          r6, #0
  MOV          r7, #0
  MOV          r8, #0
  MOV          r9, #0
  BL xcorr_kernel_edsp  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP          r0, r6
  ; _y+=4
  ADD          r5, r5, #8
  MOVLT        r0, r6
  CMP          r0, r7
  MOVLT        r0, r7
  CMP          r0, r8
  MOVLT        r0, r8
  CMP          r0, r9
  MOVLT        r0, r9
  STMIA        r2!, {r6-r9}
  SUBS         r1, r1, #4
  BGE celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2
  ADDS         r1, r1, #2
  BLT celt_pitch_xcorr_edsp_process1a
  SUBS         r12, r3, #4
  ; {r10, r11} = {sum0, sum1} = {0, 0}
  MOV          r10, #0
  MOV          r11, #0
  LDR          r8, [r5], #4
  BLE celt_pitch_xcorr_edsp_process2_loop_done
  LDR          r6, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r7, [r4], #4
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  LDR          r8, [r5], #4
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT        r6, [r4], #4
  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT        r9, [r5], #4
  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
  BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
  ADDS         r12, r12, #2
  BLE  celt_pitch_xcorr_edsp_process2_1
  LDR          r6, [r4], #4
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r9, [r5], #4
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUB          r12, r12, #2
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  MOV          r8, r9
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1
  LDRH         r6, [r4], #2
  ADDS         r12, r12, #1
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDRGTH       r7, [r4], #2
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  BLE celt_pitch_xcorr_edsp_process2_done
  LDRH         r9, [r5], #2
  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum0)
  CMP          r0, r10
  ADD          r5, r5, #2
  MOVLT        r0, r10
  SUB          r1, r1, #2
  ; maxcorr = max(maxcorr, sum1)
  CMP          r0, r11
  ; xcorr[i] = sum
  STR          r10, [r2], #4
  MOVLT        r0, r11
  STR          r11, [r2], #4
celt_pitch_xcorr_edsp_process1a
  ADDS         r1, r1, #1
  BLT celt_pitch_xcorr_edsp_done
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  BLT celt_pitch_xcorr_edsp_process1a_loop_done
  LDR          r6, [r4], #4
  LDR          r8, [r5], #4
  LDR          r7, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4
  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDRGE        r6, [r4], #4
  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  LDRGE        r8, [r5], #4
  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGE        r7, [r4], #4
  LDRGE        r9, [r5], #4
  BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
  ADDS         r12, r12, #2
  LDRGE        r6, [r4], #4
  LDRGE        r8, [r5], #4
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBGE        r12, r12, #2
  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  ADDS         r12, r12, #1
  LDRGEH       r6, [r4], #2
  LDRGEH       r8, [r5], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  MOVLT        r0, r14
celt_pitch_xcorr_edsp_done
  LDMFD        sp!, {r4-r11, pc}
  ENDP

ENDIF

END