1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "asm_support_x86.S"
18
/* Symbol name passed to DEFINE_FUNCTION / END_FUNCTION for the routine below. */
19#define MEMCMP  __memcmp16
20
21/* int32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count); */
22
/*
 * L(label) pastes ".L" in front of the label name, unless L is already
 * defined. In GAS a ".L" prefix keeps the label assembler-local.
 */
23#ifndef L
24# define L(label)    .L##label
25#endif
26
/*
 * Unwind-info bookkeeping for a 4-byte push of REG: CFA offset +4, then
 * REG recorded at relative offset 0. CFI_ADJUST_CFA_OFFSET and
 * CFI_REL_OFFSET are not defined in this part of the file.
 */
27#define CFI_PUSH(REG)    \
28    CFI_ADJUST_CFA_OFFSET(4);    \
29    CFI_REL_OFFSET(REG, 0)
30
/* Unwind-info bookkeeping for a 4-byte pop of REG: CFA offset -4, REG restored. */
31#define CFI_POP(REG)    \
32    CFI_ADJUST_CFA_OFFSET(-4);    \
33    CFI_RESTORE(REG)
34
/* pushl / popl paired with the matching CFI annotation above. */
35#define PUSH(REG)    pushl REG; CFI_PUSH (REG)
36#define POP(REG)    popl REG; CFI_POP (REG)
37
/*
 * Offsets of the three stack arguments relative to %esp at function entry
 * (before any register has been pushed): BLK1 = 4, BLK2 = 8, LEN = 12.
 */
38#define PARMS        4
39#define BLK1        PARMS
40#define BLK2        BLK1+4
41#define LEN        BLK2+4
/* Epilogue for paths that have pushed %ebx, %esi and %edi (popped in reverse order). */
42#define RETURN_END    POP (%edi); POP (%esi); POP (%ebx); ret
/*
 * RETURN: emits RETURN_END, then re-establishes the CFI state (CFA = esp + 16)
 * and remembers it again for the code that follows the ret in the file.
 * MACRO0 / END_MACRO are not defined in this part of the file.
 */
43MACRO0(RETURN)
44    RETURN_END
45    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
46    CFI_REMEMBER_STATE
47END_MACRO
48
/*
 * MEMCMP (__memcmp16): compares two arrays of 16-bit units.
 *
 * Stack arguments, read relative to %esp on entry:
 *   BLK1(%esp) - first array,  loaded into %eax
 *   BLK2(%esp) - second array, loaded into %edx
 *   LEN(%esp)  - count, doubled below to obtain a byte length in %ecx
 * Result in %eax: 0 when the count is zero or no difference is found;
 * otherwise the zero-extended 16-bit unit of the first array minus the one
 * of the second array at the first differing position.
 *
 * %ebx, %esi and %edi are saved with PUSH and popped again before each ret
 * on the paths that use them. The vector paths use SSE2 instructions
 * (movdqu/movdqa/pcmpeqb/pmovmskb) and SSSE3 palignr.
 */
49DEFINE_FUNCTION MEMCMP
50    movl       LEN(%esp), %ecx
51
    /* Convert the count of 16-bit units to bytes; a zero length returns 0. */
52    shl        $1, %ecx
53    jz         L(zero)
54
55    movl       BLK1(%esp), %eax
56    cmp        $48, %ecx
    /* movl does not modify the flags, so the jae below still tests the cmp. */
57    movl       BLK2(%esp), %edx
58    jae        L(48bytesormore)
59
    /*
     * Fewer than 48 bytes: save %ebx (used as scratch by the scalar tail),
     * advance both pointers to the end of their arrays and let
     * L(less48bytes) dispatch on %ecx; the tail reads at negative offsets.
     */
60    PUSH       (%ebx)
61    add        %ecx, %edx
62    add        %ecx, %eax
63    jmp        L(less48bytes)
64
    /* CFI only: L(zero) is reached before the push above, so undo its annotation. */
65    CFI_POP    (%ebx)
66
67    .p2align 4
68L(zero):
69    xor        %eax, %eax
70    ret
71
72    .p2align 4
    /*
     * At least 48 bytes to compare.
     * Entry: %eax = first array, %edx = second array, %ecx = length in bytes;
     * nothing has been pushed yet.
     * Exit:  %edi / %esi are 16-byte aligned cursors into the first / second
     * array, %edx holds the amount by which %esi was rounded down, and control
     * moves to the L(shr_N) variant matching that amount.
     */
73L(48bytesormore):
74    PUSH       (%ebx)
75    PUSH       (%esi)
76    PUSH       (%edi)
77    CFI_REMEMBER_STATE
    /* Compare the first 16 bytes of both arrays with unaligned loads. */
78    movdqu     (%eax), %xmm3
79    movdqu     (%edx), %xmm0
80    movl       %eax, %edi
81    movl       %edx, %esi
82    pcmpeqb    %xmm0, %xmm3
83    pmovmskb   %xmm3, %edx
84    lea        16(%edi), %edi
85
    /*
     * A mask of 0xffff means all 16 bytes were equal. lea does not modify the
     * flags, so jnz tests the sub. On a difference L(less16bytes) picks the
     * differing 16-bit unit relative to the already advanced %edi / %esi.
     */
86    sub        $0xffff, %edx
87    lea        16(%esi), %esi
88    jnz        L(less16bytes)
    /*
     * Round %edi down to a 16-byte boundary. %esi moves back by the same
     * amount and %ecx grows by it, so the bytes that are revisited are
     * included in the count.
     */
89    mov        %edi, %edx
90    and        $0xf, %edx
91    xor        %edx, %edi
92    sub        %edx, %esi
93    add        %edx, %ecx
    /* %edx = misalignment of %esi; zero means both cursors are aligned. */
94    mov        %esi, %edx
95    and        $0xf, %edx
96    jz         L(shr_0)
    /* Round %esi down as well; %edx keeps the number of bytes removed. */
97    xor        %edx, %esi
98
    /*
     * %edx is non-zero here (the zero case was taken by the jz above and the
     * xor does not change %edx), so no test against 0 is needed.
     * NOTE(review): only even values are dispatched explicitly and anything
     * else lands in L(shr_14); this presumes both arrays have the same
     * alignment modulo 2 - confirm with callers.
     */
101    cmp        $2, %edx
102    je         L(shr_2)
103    cmp        $4, %edx
104    je         L(shr_4)
105    cmp        $6, %edx
106    je         L(shr_6)
107    cmp        $8, %edx
108    je         L(shr_8)
109    cmp        $10, %edx
110    je         L(shr_10)
111    cmp        $12, %edx
112    je         L(shr_12)
113    jmp        L(shr_14)
114
115    .p2align 4
    /*
     * L(shr_0): both %edi and %esi are 16-byte aligned, so aligned loads can
     * be compared directly. Lengths of 80 or more (in %ecx) take the loop in
     * L(shr_0_gobble); otherwise one 32-byte step is done here.
     */
116L(shr_0):
117    cmp        $80, %ecx
118    jae        L(shr_0_gobble)
119    lea        -48(%ecx), %ecx
    /* %eax = 0: L(exit) adds %eax to %esi, and no shift applies on this path. */
120    xor        %eax, %eax
    /* %xmm1 = equality result for the first 16 bytes, %xmm2 = both chunks combined. */
121    movaps     (%esi), %xmm1
122    pcmpeqb    (%edi), %xmm1
123    movaps     16(%esi), %xmm2
124    pcmpeqb    16(%edi), %xmm2
125    pand       %xmm1, %xmm2
126    pmovmskb   %xmm2, %edx
127    add        $32, %edi
128    add        $32, %esi
    /* Non-zero after subtracting 0xffff means at least one byte differed. */
129    sub        $0xffff, %edx
130    jnz        L(exit)
131
    /*
     * No difference: %eax = %edi + %ecx and %edx = %esi + %ecx become the
     * pointers used by the scalar tail, which dispatches on %ecx. %edi and
     * %esi are restored here; %ebx stays pushed for the tail.
     */
132    lea        (%ecx, %edi,1), %eax
133    lea        (%ecx, %esi,1), %edx
134    POP        (%edi)
135    POP        (%esi)
136    jmp        L(less48bytes)
137
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
138    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
139    CFI_REMEMBER_STATE
140    .p2align 4
    /*
     * L(shr_0_gobble): aligned path for %ecx >= 80; compares 32 bytes per
     * loop iteration. %eax = 0 because no shift has to be undone in L(exit).
     */
141L(shr_0_gobble):
142    lea        -48(%ecx), %ecx
143    movdqa     (%esi), %xmm0
144    xor        %eax, %eax
145    pcmpeqb    (%edi), %xmm0
146    sub        $32, %ecx
147    movdqa     16(%esi), %xmm2
148    pcmpeqb    16(%edi), %xmm2
149L(shr_0_gobble_loop):
    /* %xmm2 = combined result of the current pair of 16-byte chunks. */
150    pand       %xmm0, %xmm2
    /* CF is set if this subtraction borrows; the sbb below consumes it. */
151    sub        $32, %ecx
152    pmovmskb   %xmm2, %edx
    /* Keep the first-chunk result in %xmm1 for L(exit). */
153    movdqa     %xmm0, %xmm1
    /* Pre-load the next pair. */
154    movdqa     32(%esi), %xmm0
155    movdqa     48(%esi), %xmm2
    /* Zero only if the mask was 0xffff (all 32 bytes equal) and the sub did not borrow. */
156    sbb        $0xffff, %edx
    /* The pcmpeqb and lea instructions below do not modify the flags, so jz tests the sbb. */
157    pcmpeqb    32(%edi), %xmm0
158    pcmpeqb    48(%edi), %xmm2
159    lea        32(%edi), %edi
160    lea        32(%esi), %esi
161    jz         L(shr_0_gobble_loop)
162
163    pand       %xmm0, %xmm2
    /* If %ecx went negative, undo the borrow taken by sbb and give the 32 bytes back to the count. */
164    cmp        $0, %ecx
165    jge        L(shr_0_gobble_loop_next)
166    inc        %edx
167    add        $32, %ecx
168L(shr_0_gobble_loop_next):
    /* Non-zero: the pair tested in the last iteration contains a difference. */
169    test       %edx, %edx
170    jnz        L(exit)
171
    /* Otherwise check the pair that was pre-loaded by the last iteration. */
172    pmovmskb %xmm2, %edx
173    movdqa     %xmm0, %xmm1
174    lea        32(%edi), %edi
175    lea        32(%esi), %esi
176    sub        $0xffff, %edx
177    jnz        L(exit)
    /* No difference: set up %eax / %edx for the scalar tail and restore %edi / %esi. */
178    lea        (%ecx, %edi,1), %eax
179    lea        (%ecx, %esi,1), %edx
180    POP        (%edi)
181    POP        (%esi)
182    jmp        L(less48bytes)
183
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
184    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
185    CFI_REMEMBER_STATE
186    .p2align 4
    /*
     * L(shr_2): %esi has been rounded down by 2 bytes (%edx = 2 on entry), so
     * each 16-byte chunk of the second array is rebuilt with palignr $2 from
     * two consecutive aligned loads, giving the bytes that start at %esi + 2.
     */
187L(shr_2):
188    cmp        $80, %ecx
    /* lea and mov do not modify the flags; the jae below tests the cmp above. */
189    lea        -48(%ecx), %ecx
    /* %eax = shift amount; L(exit) adds it back to %esi. */
190    mov        %edx, %eax
191    jae        L(shr_2_gobble)
192
    /* First 16 bytes; equality result in %xmm1 (%xmm2 keeps the raw load for the next palignr). */
193    movdqa     16(%esi), %xmm1
194    movdqa     %xmm1, %xmm2
195    palignr    $2,(%esi), %xmm1
196    pcmpeqb    (%edi), %xmm1
197
    /* Next 16 bytes; equality result in %xmm3. */
198    movdqa     32(%esi), %xmm3
199    palignr    $2,%xmm2, %xmm3
200    pcmpeqb    16(%edi), %xmm3
201
    /* Combine both results; non-zero after subtracting 0xffff means a byte differed. */
202    pand       %xmm1, %xmm3
203    pmovmskb   %xmm3, %edx
204    lea        32(%edi), %edi
205    lea        32(%esi), %esi
206    sub        $0xffff, %edx
207    jnz        L(exit)
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 2 + %ecx for the scalar tail. */
208    lea        (%ecx, %edi,1), %eax
209    lea        2(%ecx, %esi,1), %edx
210    POP        (%edi)
211    POP        (%esi)
212    jmp        L(less48bytes)
213
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
214    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
215    CFI_REMEMBER_STATE
216    .p2align 4
    /*
     * L(shr_2_gobble): shift-by-2 path for lengths of 80 or more; compares
     * 32 bytes per loop iteration. L(shr_2) has already subtracted 48 from
     * %ecx and copied the shift amount into %eax.
     */
217L(shr_2_gobble):
218    sub        $32, %ecx
    /* Prime the loop: results for the first two chunks in %xmm0 and %xmm3. */
219    movdqa     16(%esi), %xmm0
220    palignr    $2,(%esi), %xmm0
221    pcmpeqb    (%edi), %xmm0
222
223    movdqa     32(%esi), %xmm3
224    palignr    $2,16(%esi), %xmm3
225    pcmpeqb    16(%edi), %xmm3
226
227L(shr_2_gobble_loop):
    /* %xmm3 = combined result of the current pair of chunks. */
228    pand       %xmm0, %xmm3
    /* CF is set if this subtraction borrows; the sbb below consumes it. */
229    sub        $32, %ecx
230    pmovmskb   %xmm3, %edx
    /* Keep the first-chunk result in %xmm1 for L(exit). */
231    movdqa     %xmm0, %xmm1
232
    /* Pre-load and compare the next pair; none of these SSE instructions or lea modify the flags. */
233    movdqa     64(%esi), %xmm3
234    palignr    $2,48(%esi), %xmm3
    /* Zero only if the mask was 0xffff (all 32 bytes equal) and the sub did not borrow. */
235    sbb        $0xffff, %edx
236    movdqa     48(%esi), %xmm0
237    palignr    $2,32(%esi), %xmm0
238    pcmpeqb    32(%edi), %xmm0
239    lea        32(%esi), %esi
240    pcmpeqb    48(%edi), %xmm3
241
242    lea        32(%edi), %edi
243    jz         L(shr_2_gobble_loop)
244    pand       %xmm0, %xmm3
245
    /* If %ecx went negative, undo the borrow taken by sbb and give the 32 bytes back to the count. */
246    cmp        $0, %ecx
247    jge        L(shr_2_gobble_next)
248    inc        %edx
249    add        $32, %ecx
250L(shr_2_gobble_next):
    /* Non-zero: the pair tested in the last iteration contains a difference. */
251    test       %edx, %edx
252    jnz        L(exit)
253
    /* Otherwise check the pair that was pre-loaded by the last iteration. */
254    pmovmskb   %xmm3, %edx
255    movdqa     %xmm0, %xmm1
256    lea        32(%edi), %edi
257    lea        32(%esi), %esi
258    sub        $0xffff, %edx
259    jnz        L(exit)
260
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 2 + %ecx for the scalar tail. */
261    lea        (%ecx, %edi,1), %eax
262    lea        2(%ecx, %esi,1), %edx
263    POP        (%edi)
264    POP        (%esi)
265    jmp        L(less48bytes)
266
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
267    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
268    CFI_REMEMBER_STATE
269    .p2align 4
    /*
     * L(shr_4): %esi has been rounded down by 4 bytes (%edx = 4 on entry), so
     * each 16-byte chunk of the second array is rebuilt with palignr $4 from
     * two consecutive aligned loads, giving the bytes that start at %esi + 4.
     */
270L(shr_4):
271    cmp        $80, %ecx
    /* lea and mov do not modify the flags; the jae below tests the cmp above. */
272    lea        -48(%ecx), %ecx
    /* %eax = shift amount; L(exit) adds it back to %esi. */
273    mov        %edx, %eax
274    jae        L(shr_4_gobble)
275
    /* First 16 bytes; equality result in %xmm1 (%xmm2 keeps the raw load for the next palignr). */
276    movdqa     16(%esi), %xmm1
277    movdqa     %xmm1, %xmm2
278    palignr    $4,(%esi), %xmm1
279    pcmpeqb    (%edi), %xmm1
280
    /* Next 16 bytes; equality result in %xmm3. */
281    movdqa     32(%esi), %xmm3
282    palignr    $4,%xmm2, %xmm3
283    pcmpeqb    16(%edi), %xmm3
284
    /* Combine both results; non-zero after subtracting 0xffff means a byte differed. */
285    pand       %xmm1, %xmm3
286    pmovmskb   %xmm3, %edx
287    lea        32(%edi), %edi
288    lea        32(%esi), %esi
289    sub        $0xffff, %edx
290    jnz        L(exit)
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 4 + %ecx for the scalar tail. */
291    lea        (%ecx, %edi,1), %eax
292    lea        4(%ecx, %esi,1), %edx
293    POP        (%edi)
294    POP        (%esi)
295    jmp        L(less48bytes)
296
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
297    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
298    CFI_REMEMBER_STATE
299    .p2align 4
    /*
     * L(shr_4_gobble): shift-by-4 path for lengths of 80 or more; compares
     * 32 bytes per loop iteration. L(shr_4) has already subtracted 48 from
     * %ecx and copied the shift amount into %eax.
     */
300L(shr_4_gobble):
301    sub        $32, %ecx
    /* Prime the loop: results for the first two chunks in %xmm0 and %xmm3. */
302    movdqa     16(%esi), %xmm0
303    palignr    $4,(%esi), %xmm0
304    pcmpeqb    (%edi), %xmm0
305
306    movdqa     32(%esi), %xmm3
307    palignr    $4,16(%esi), %xmm3
308    pcmpeqb    16(%edi), %xmm3
309
310L(shr_4_gobble_loop):
    /* %xmm3 = combined result of the current pair of chunks. */
311    pand       %xmm0, %xmm3
    /* CF is set if this subtraction borrows; the sbb below consumes it. */
312    sub        $32, %ecx
313    pmovmskb   %xmm3, %edx
    /* Keep the first-chunk result in %xmm1 for L(exit). */
314    movdqa     %xmm0, %xmm1
315
    /* Pre-load and compare the next pair; none of these SSE instructions or lea modify the flags. */
316    movdqa     64(%esi), %xmm3
317    palignr    $4,48(%esi), %xmm3
    /* Zero only if the mask was 0xffff (all 32 bytes equal) and the sub did not borrow. */
318    sbb        $0xffff, %edx
319    movdqa     48(%esi), %xmm0
320    palignr    $4,32(%esi), %xmm0
321    pcmpeqb    32(%edi), %xmm0
322    lea        32(%esi), %esi
323    pcmpeqb    48(%edi), %xmm3
324
325    lea        32(%edi), %edi
326    jz         L(shr_4_gobble_loop)
327    pand       %xmm0, %xmm3
328
    /* If %ecx went negative, undo the borrow taken by sbb and give the 32 bytes back to the count. */
329    cmp        $0, %ecx
330    jge        L(shr_4_gobble_next)
331    inc        %edx
332    add        $32, %ecx
333L(shr_4_gobble_next):
    /* Non-zero: the pair tested in the last iteration contains a difference. */
334    test       %edx, %edx
335    jnz        L(exit)
336
    /* Otherwise check the pair that was pre-loaded by the last iteration. */
337    pmovmskb   %xmm3, %edx
338    movdqa     %xmm0, %xmm1
339    lea        32(%edi), %edi
340    lea        32(%esi), %esi
341    sub        $0xffff, %edx
342    jnz        L(exit)
343
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 4 + %ecx for the scalar tail. */
344    lea        (%ecx, %edi,1), %eax
345    lea        4(%ecx, %esi,1), %edx
346    POP        (%edi)
347    POP        (%esi)
348    jmp        L(less48bytes)
349
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
350    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
351    CFI_REMEMBER_STATE
352    .p2align 4
    /*
     * L(shr_6): %esi has been rounded down by 6 bytes (%edx = 6 on entry), so
     * each 16-byte chunk of the second array is rebuilt with palignr $6 from
     * two consecutive aligned loads, giving the bytes that start at %esi + 6.
     */
353L(shr_6):
354    cmp        $80, %ecx
    /* lea and mov do not modify the flags; the jae below tests the cmp above. */
355    lea        -48(%ecx), %ecx
    /* %eax = shift amount; L(exit) adds it back to %esi. */
356    mov        %edx, %eax
357    jae        L(shr_6_gobble)
358
    /* First 16 bytes; equality result in %xmm1 (%xmm2 keeps the raw load for the next palignr). */
359    movdqa     16(%esi), %xmm1
360    movdqa     %xmm1, %xmm2
361    palignr    $6,(%esi), %xmm1
362    pcmpeqb    (%edi), %xmm1
363
    /* Next 16 bytes; equality result in %xmm3. */
364    movdqa     32(%esi), %xmm3
365    palignr    $6,%xmm2, %xmm3
366    pcmpeqb    16(%edi), %xmm3
367
    /* Combine both results; non-zero after subtracting 0xffff means a byte differed. */
368    pand       %xmm1, %xmm3
369    pmovmskb   %xmm3, %edx
370    lea        32(%edi), %edi
371    lea        32(%esi), %esi
372    sub        $0xffff, %edx
373    jnz        L(exit)
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 6 + %ecx for the scalar tail. */
374    lea        (%ecx, %edi,1), %eax
375    lea        6(%ecx, %esi,1), %edx
376    POP        (%edi)
377    POP        (%esi)
378    jmp        L(less48bytes)
379
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
380    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
381    CFI_REMEMBER_STATE
382    .p2align 4
    /*
     * L(shr_6_gobble): shift-by-6 path for lengths of 80 or more; compares
     * 32 bytes per loop iteration. L(shr_6) has already subtracted 48 from
     * %ecx and copied the shift amount into %eax.
     */
383L(shr_6_gobble):
384    sub        $32, %ecx
    /* Prime the loop: results for the first two chunks in %xmm0 and %xmm3. */
385    movdqa     16(%esi), %xmm0
386    palignr    $6,(%esi), %xmm0
387    pcmpeqb    (%edi), %xmm0
388
389    movdqa     32(%esi), %xmm3
390    palignr    $6,16(%esi), %xmm3
391    pcmpeqb    16(%edi), %xmm3
392
393L(shr_6_gobble_loop):
    /* %xmm3 = combined result of the current pair of chunks. */
394    pand       %xmm0, %xmm3
    /* CF is set if this subtraction borrows; the sbb below consumes it. */
395    sub        $32, %ecx
396    pmovmskb   %xmm3, %edx
    /* Keep the first-chunk result in %xmm1 for L(exit). */
397    movdqa     %xmm0, %xmm1
398
    /* Pre-load and compare the next pair; none of these SSE instructions or lea modify the flags. */
399    movdqa     64(%esi), %xmm3
400    palignr    $6,48(%esi), %xmm3
    /* Zero only if the mask was 0xffff (all 32 bytes equal) and the sub did not borrow. */
401    sbb        $0xffff, %edx
402    movdqa     48(%esi), %xmm0
403    palignr    $6,32(%esi), %xmm0
404    pcmpeqb    32(%edi), %xmm0
405    lea        32(%esi), %esi
406    pcmpeqb    48(%edi), %xmm3
407
408    lea        32(%edi), %edi
409    jz         L(shr_6_gobble_loop)
410    pand       %xmm0, %xmm3
411
    /* If %ecx went negative, undo the borrow taken by sbb and give the 32 bytes back to the count. */
412    cmp        $0, %ecx
413    jge        L(shr_6_gobble_next)
414    inc        %edx
415    add        $32, %ecx
416L(shr_6_gobble_next):
    /* Non-zero: the pair tested in the last iteration contains a difference. */
417    test       %edx, %edx
418    jnz        L(exit)
419
    /* Otherwise check the pair that was pre-loaded by the last iteration. */
420    pmovmskb   %xmm3, %edx
421    movdqa     %xmm0, %xmm1
422    lea        32(%edi), %edi
423    lea        32(%esi), %esi
424    sub        $0xffff, %edx
425    jnz        L(exit)
426
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 6 + %ecx for the scalar tail. */
427    lea        (%ecx, %edi,1), %eax
428    lea        6(%ecx, %esi,1), %edx
429    POP        (%edi)
430    POP        (%esi)
431    jmp        L(less48bytes)
432
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
433    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
434    CFI_REMEMBER_STATE
435    .p2align 4
    /*
     * L(shr_8): %esi has been rounded down by 8 bytes (%edx = 8 on entry), so
     * each 16-byte chunk of the second array is rebuilt with palignr $8 from
     * two consecutive aligned loads, giving the bytes that start at %esi + 8.
     */
436L(shr_8):
437    cmp        $80, %ecx
    /* lea and mov do not modify the flags; the jae below tests the cmp above. */
438    lea        -48(%ecx), %ecx
    /* %eax = shift amount; L(exit) adds it back to %esi. */
439    mov        %edx, %eax
440    jae        L(shr_8_gobble)
441
    /* First 16 bytes; equality result in %xmm1 (%xmm2 keeps the raw load for the next palignr). */
442    movdqa     16(%esi), %xmm1
443    movdqa     %xmm1, %xmm2
444    palignr    $8,(%esi), %xmm1
445    pcmpeqb    (%edi), %xmm1
446
    /* Next 16 bytes; equality result in %xmm3. */
447    movdqa     32(%esi), %xmm3
448    palignr    $8,%xmm2, %xmm3
449    pcmpeqb    16(%edi), %xmm3
450
    /* Combine both results; non-zero after subtracting 0xffff means a byte differed. */
451    pand       %xmm1, %xmm3
452    pmovmskb   %xmm3, %edx
453    lea        32(%edi), %edi
454    lea        32(%esi), %esi
455    sub        $0xffff, %edx
456    jnz        L(exit)
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 8 + %ecx for the scalar tail. */
457    lea        (%ecx, %edi,1), %eax
458    lea        8(%ecx, %esi,1), %edx
459    POP        (%edi)
460    POP        (%esi)
461    jmp        L(less48bytes)
462
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
463    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
464    CFI_REMEMBER_STATE
465    .p2align 4
    /*
     * L(shr_8_gobble): shift-by-8 path for lengths of 80 or more; compares
     * 32 bytes per loop iteration. L(shr_8) has already subtracted 48 from
     * %ecx and copied the shift amount into %eax.
     */
466L(shr_8_gobble):
467    sub        $32, %ecx
    /* Prime the loop: results for the first two chunks in %xmm0 and %xmm3. */
468    movdqa     16(%esi), %xmm0
469    palignr    $8,(%esi), %xmm0
470    pcmpeqb    (%edi), %xmm0
471
472    movdqa     32(%esi), %xmm3
473    palignr    $8,16(%esi), %xmm3
474    pcmpeqb    16(%edi), %xmm3
475
476L(shr_8_gobble_loop):
    /* %xmm3 = combined result of the current pair of chunks. */
477    pand       %xmm0, %xmm3
    /* CF is set if this subtraction borrows; the sbb below consumes it. */
478    sub        $32, %ecx
479    pmovmskb   %xmm3, %edx
    /* Keep the first-chunk result in %xmm1 for L(exit). */
480    movdqa     %xmm0, %xmm1
481
    /* Pre-load and compare the next pair; none of these SSE instructions or lea modify the flags. */
482    movdqa     64(%esi), %xmm3
483    palignr    $8,48(%esi), %xmm3
    /* Zero only if the mask was 0xffff (all 32 bytes equal) and the sub did not borrow. */
484    sbb        $0xffff, %edx
485    movdqa     48(%esi), %xmm0
486    palignr    $8,32(%esi), %xmm0
487    pcmpeqb    32(%edi), %xmm0
488    lea        32(%esi), %esi
489    pcmpeqb    48(%edi), %xmm3
490
491    lea        32(%edi), %edi
492    jz         L(shr_8_gobble_loop)
493    pand       %xmm0, %xmm3
494
    /* If %ecx went negative, undo the borrow taken by sbb and give the 32 bytes back to the count. */
495    cmp        $0, %ecx
496    jge        L(shr_8_gobble_next)
497    inc        %edx
498    add        $32, %ecx
499L(shr_8_gobble_next):
    /* Non-zero: the pair tested in the last iteration contains a difference. */
500    test       %edx, %edx
501    jnz        L(exit)
502
    /* Otherwise check the pair that was pre-loaded by the last iteration. */
503    pmovmskb   %xmm3, %edx
504    movdqa     %xmm0, %xmm1
505    lea        32(%edi), %edi
506    lea        32(%esi), %esi
507    sub        $0xffff, %edx
508    jnz        L(exit)
509
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 8 + %ecx for the scalar tail. */
510    lea        (%ecx, %edi,1), %eax
511    lea        8(%ecx, %esi,1), %edx
512    POP        (%edi)
513    POP        (%esi)
514    jmp        L(less48bytes)
515
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
516    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
517    CFI_REMEMBER_STATE
518    .p2align 4
    /*
     * L(shr_10): %esi has been rounded down by 10 bytes (%edx = 10 on entry),
     * so each 16-byte chunk of the second array is rebuilt with palignr $10
     * from two consecutive aligned loads, giving the bytes that start at
     * %esi + 10.
     */
519L(shr_10):
520    cmp        $80, %ecx
    /* lea and mov do not modify the flags; the jae below tests the cmp above. */
521    lea        -48(%ecx), %ecx
    /* %eax = shift amount; L(exit) adds it back to %esi. */
522    mov        %edx, %eax
523    jae        L(shr_10_gobble)
524
    /* First 16 bytes; equality result in %xmm1 (%xmm2 keeps the raw load for the next palignr). */
525    movdqa     16(%esi), %xmm1
526    movdqa     %xmm1, %xmm2
527    palignr    $10, (%esi), %xmm1
528    pcmpeqb    (%edi), %xmm1
529
    /* Next 16 bytes; equality result in %xmm3. */
530    movdqa     32(%esi), %xmm3
531    palignr    $10,%xmm2, %xmm3
532    pcmpeqb    16(%edi), %xmm3
533
    /* Combine both results; non-zero after subtracting 0xffff means a byte differed. */
534    pand       %xmm1, %xmm3
535    pmovmskb   %xmm3, %edx
536    lea        32(%edi), %edi
537    lea        32(%esi), %esi
538    sub        $0xffff, %edx
539    jnz        L(exit)
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 10 + %ecx for the scalar tail. */
540    lea        (%ecx, %edi,1), %eax
541    lea        10(%ecx, %esi,1), %edx
542    POP        (%edi)
543    POP        (%esi)
544    jmp        L(less48bytes)
545
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
546    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
547    CFI_REMEMBER_STATE
548    .p2align 4
    /*
     * L(shr_10_gobble): shift-by-10 path for lengths of 80 or more; compares
     * 32 bytes per loop iteration. L(shr_10) has already subtracted 48 from
     * %ecx and copied the shift amount into %eax.
     */
549L(shr_10_gobble):
550    sub        $32, %ecx
    /* Prime the loop: results for the first two chunks in %xmm0 and %xmm3. */
551    movdqa     16(%esi), %xmm0
552    palignr    $10, (%esi), %xmm0
553    pcmpeqb    (%edi), %xmm0
554
555    movdqa     32(%esi), %xmm3
556    palignr    $10, 16(%esi), %xmm3
557    pcmpeqb    16(%edi), %xmm3
558
559L(shr_10_gobble_loop):
    /* %xmm3 = combined result of the current pair of chunks. */
560    pand       %xmm0, %xmm3
    /* CF is set if this subtraction borrows; the sbb below consumes it. */
561    sub        $32, %ecx
562    pmovmskb   %xmm3, %edx
    /* Keep the first-chunk result in %xmm1 for L(exit). */
563    movdqa     %xmm0, %xmm1
564
    /* Pre-load and compare the next pair; none of these SSE instructions or lea modify the flags. */
565    movdqa     64(%esi), %xmm3
566    palignr    $10,48(%esi), %xmm3
    /* Zero only if the mask was 0xffff (all 32 bytes equal) and the sub did not borrow. */
567    sbb        $0xffff, %edx
568    movdqa     48(%esi), %xmm0
569    palignr    $10,32(%esi), %xmm0
570    pcmpeqb    32(%edi), %xmm0
571    lea        32(%esi), %esi
572    pcmpeqb    48(%edi), %xmm3
573
574    lea        32(%edi), %edi
575    jz         L(shr_10_gobble_loop)
576    pand       %xmm0, %xmm3
577
    /* If %ecx went negative, undo the borrow taken by sbb and give the 32 bytes back to the count. */
578    cmp        $0, %ecx
579    jge        L(shr_10_gobble_next)
580    inc        %edx
581    add        $32, %ecx
582L(shr_10_gobble_next):
    /* Non-zero: the pair tested in the last iteration contains a difference. */
583    test       %edx, %edx
584    jnz        L(exit)
585
    /* Otherwise check the pair that was pre-loaded by the last iteration. */
586    pmovmskb   %xmm3, %edx
587    movdqa     %xmm0, %xmm1
588    lea        32(%edi), %edi
589    lea        32(%esi), %esi
590    sub        $0xffff, %edx
591    jnz        L(exit)
592
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 10 + %ecx for the scalar tail. */
593    lea        (%ecx, %edi,1), %eax
594    lea        10(%ecx, %esi,1), %edx
595    POP        (%edi)
596    POP        (%esi)
597    jmp        L(less48bytes)
598
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
599    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
600    CFI_REMEMBER_STATE
601    .p2align 4
    /*
     * L(shr_12): %esi has been rounded down by 12 bytes (%edx = 12 on entry),
     * so each 16-byte chunk of the second array is rebuilt with palignr $12
     * from two consecutive aligned loads, giving the bytes that start at
     * %esi + 12.
     */
602L(shr_12):
603    cmp        $80, %ecx
    /* lea and mov do not modify the flags; the jae below tests the cmp above. */
604    lea        -48(%ecx), %ecx
    /* %eax = shift amount; L(exit) adds it back to %esi. */
605    mov        %edx, %eax
606    jae        L(shr_12_gobble)
607
    /* First 16 bytes; equality result in %xmm1 (%xmm2 keeps the raw load for the next palignr). */
608    movdqa     16(%esi), %xmm1
609    movdqa     %xmm1, %xmm2
610    palignr    $12, (%esi), %xmm1
611    pcmpeqb    (%edi), %xmm1
612
    /* Next 16 bytes; equality result in %xmm3. */
613    movdqa     32(%esi), %xmm3
614    palignr    $12, %xmm2, %xmm3
615    pcmpeqb    16(%edi), %xmm3
616
    /* Combine both results; non-zero after subtracting 0xffff means a byte differed. */
617    pand       %xmm1, %xmm3
618    pmovmskb   %xmm3, %edx
619    lea        32(%edi), %edi
620    lea        32(%esi), %esi
621    sub        $0xffff, %edx
622    jnz        L(exit)
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 12 + %ecx for the scalar tail. */
623    lea        (%ecx, %edi,1), %eax
624    lea        12(%ecx, %esi,1), %edx
625    POP        (%edi)
626    POP        (%esi)
627    jmp        L(less48bytes)
628
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
629    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
630    CFI_REMEMBER_STATE
631    .p2align 4
    /*
     * L(shr_12_gobble): shift-by-12 path for lengths of 80 or more; compares
     * 32 bytes per loop iteration. L(shr_12) has already subtracted 48 from
     * %ecx and copied the shift amount into %eax.
     */
632L(shr_12_gobble):
633    sub        $32, %ecx
    /* Prime the loop: results for the first two chunks in %xmm0 and %xmm3. */
634    movdqa     16(%esi), %xmm0
635    palignr    $12, (%esi), %xmm0
636    pcmpeqb    (%edi), %xmm0
637
638    movdqa     32(%esi), %xmm3
639    palignr    $12, 16(%esi), %xmm3
640    pcmpeqb    16(%edi), %xmm3
641
642L(shr_12_gobble_loop):
    /* %xmm3 = combined result of the current pair of chunks. */
643    pand       %xmm0, %xmm3
    /* CF is set if this subtraction borrows; the sbb below consumes it. */
644    sub        $32, %ecx
645    pmovmskb   %xmm3, %edx
    /* Keep the first-chunk result in %xmm1 for L(exit). */
646    movdqa     %xmm0, %xmm1
647
    /* Pre-load and compare the next pair; none of these SSE instructions or lea modify the flags. */
648    movdqa     64(%esi), %xmm3
649    palignr    $12,48(%esi), %xmm3
    /* Zero only if the mask was 0xffff (all 32 bytes equal) and the sub did not borrow. */
650    sbb        $0xffff, %edx
651    movdqa     48(%esi), %xmm0
652    palignr    $12,32(%esi), %xmm0
653    pcmpeqb    32(%edi), %xmm0
654    lea        32(%esi), %esi
655    pcmpeqb    48(%edi), %xmm3
656
657    lea        32(%edi), %edi
658    jz         L(shr_12_gobble_loop)
659    pand       %xmm0, %xmm3
660
    /* If %ecx went negative, undo the borrow taken by sbb and give the 32 bytes back to the count. */
661    cmp        $0, %ecx
662    jge        L(shr_12_gobble_next)
663    inc        %edx
664    add        $32, %ecx
665L(shr_12_gobble_next):
    /* Non-zero: the pair tested in the last iteration contains a difference. */
666    test       %edx, %edx
667    jnz        L(exit)
668
    /* Otherwise check the pair that was pre-loaded by the last iteration. */
669    pmovmskb   %xmm3, %edx
670    movdqa     %xmm0, %xmm1
671    lea        32(%edi), %edi
672    lea        32(%esi), %esi
673    sub        $0xffff, %edx
674    jnz        L(exit)
675
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 12 + %ecx for the scalar tail. */
676    lea        (%ecx, %edi,1), %eax
677    lea        12(%ecx, %esi,1), %edx
678    POP        (%edi)
679    POP        (%esi)
680    jmp        L(less48bytes)
681
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
682    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
683    CFI_REMEMBER_STATE
684    .p2align 4
    /*
     * L(shr_14): reached by the final unconditional jmp of the shift dispatch,
     * i.e. for every shift amount not matched earlier. The code treats %esi as
     * rounded down by 14 bytes and rebuilds each 16-byte chunk of the second
     * array with palignr $14 from two consecutive aligned loads.
     */
685L(shr_14):
686    cmp        $80, %ecx
    /* lea and mov do not modify the flags; the jae below tests the cmp above. */
687    lea        -48(%ecx), %ecx
    /* %eax = shift amount; L(exit) adds it back to %esi. */
688    mov        %edx, %eax
689    jae        L(shr_14_gobble)
690
    /* First 16 bytes; equality result in %xmm1 (%xmm2 keeps the raw load for the next palignr). */
691    movdqa     16(%esi), %xmm1
692    movdqa     %xmm1, %xmm2
693    palignr    $14, (%esi), %xmm1
694    pcmpeqb    (%edi), %xmm1
695
    /* Next 16 bytes; equality result in %xmm3. */
696    movdqa     32(%esi), %xmm3
697    palignr    $14, %xmm2, %xmm3
698    pcmpeqb    16(%edi), %xmm3
699
    /* Combine both results; non-zero after subtracting 0xffff means a byte differed. */
700    pand       %xmm1, %xmm3
701    pmovmskb   %xmm3, %edx
702    lea        32(%edi), %edi
703    lea        32(%esi), %esi
704    sub        $0xffff, %edx
705    jnz        L(exit)
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 14 + %ecx for the scalar tail. */
706    lea        (%ecx, %edi,1), %eax
707    lea        14(%ecx, %esi,1), %edx
708    POP        (%edi)
709    POP        (%esi)
710    jmp        L(less48bytes)
711
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
712    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
713    CFI_REMEMBER_STATE
714    .p2align 4
    /*
     * L(shr_14_gobble): shift-by-14 path for lengths of 80 or more; compares
     * 32 bytes per loop iteration. L(shr_14) has already subtracted 48 from
     * %ecx and copied the shift amount into %eax.
     */
715L(shr_14_gobble):
716    sub        $32, %ecx
    /* Prime the loop: results for the first two chunks in %xmm0 and %xmm3. */
717    movdqa     16(%esi), %xmm0
718    palignr    $14, (%esi), %xmm0
719    pcmpeqb    (%edi), %xmm0
720
721    movdqa     32(%esi), %xmm3
722    palignr    $14, 16(%esi), %xmm3
723    pcmpeqb    16(%edi), %xmm3
724
725L(shr_14_gobble_loop):
    /* %xmm3 = combined result of the current pair of chunks. */
726    pand       %xmm0, %xmm3
    /* CF is set if this subtraction borrows; the sbb below consumes it. */
727    sub        $32, %ecx
728    pmovmskb   %xmm3, %edx
    /* Keep the first-chunk result in %xmm1 for L(exit). */
729    movdqa     %xmm0, %xmm1
730
    /* Pre-load and compare the next pair; none of these SSE instructions or lea modify the flags. */
731    movdqa     64(%esi), %xmm3
732    palignr    $14,48(%esi), %xmm3
    /* Zero only if the mask was 0xffff (all 32 bytes equal) and the sub did not borrow. */
733    sbb        $0xffff, %edx
734    movdqa     48(%esi), %xmm0
735    palignr    $14,32(%esi), %xmm0
736    pcmpeqb    32(%edi), %xmm0
737    lea        32(%esi), %esi
738    pcmpeqb    48(%edi), %xmm3
739
740    lea        32(%edi), %edi
741    jz         L(shr_14_gobble_loop)
742    pand       %xmm0, %xmm3
743
    /* If %ecx went negative, undo the borrow taken by sbb and give the 32 bytes back to the count. */
744    cmp        $0, %ecx
745    jge        L(shr_14_gobble_next)
746    inc        %edx
747    add        $32, %ecx
748L(shr_14_gobble_next):
    /* Non-zero: the pair tested in the last iteration contains a difference. */
749    test       %edx, %edx
750    jnz        L(exit)
751
    /* Otherwise check the pair that was pre-loaded by the last iteration. */
752    pmovmskb   %xmm3, %edx
753    movdqa     %xmm0, %xmm1
754    lea        32(%edi), %edi
755    lea        32(%esi), %esi
756    sub        $0xffff, %edx
757    jnz        L(exit)
758
    /* No difference: %eax = %edi + %ecx, %edx = %esi + 14 + %ecx for the scalar tail. */
759    lea        (%ecx, %edi,1), %eax
760    lea        14(%ecx, %esi,1), %edx
761    POP        (%edi)
762    POP        (%esi)
763    jmp        L(less48bytes)
764
    /* CFI directives re-establishing the unwind state (CFA = esp + 16) for the code that follows. */
765    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
766    CFI_REMEMBER_STATE
767    .p2align 4
    /*
     * L(exit): a difference was found in the last 32-byte step.
     * On entry %edi / %esi point just past those 32 bytes, %xmm1 holds the
     * equality result of the first 16 of them, %edx holds the combined mask
     * minus 0xffff and %eax holds the shift amount of the L(shr_N) path.
     */
768L(exit):
769    pmovmskb   %xmm1, %ebx
770    sub        $0xffff, %ebx
    /* Zero: the first 16 bytes were all equal, so %edx already describes the second 16. */
771    jz         L(first16bytes)
    /* Otherwise step both cursors back by 16 and use the first-chunk mask instead. */
772    lea        -16(%esi), %esi
773    lea        -16(%edi), %edi
774    mov        %ebx, %edx
775
776L(first16bytes):
    /* Add the shift amount back so %esi addresses the actual bytes of the second array. */
777    add        %eax, %esi
    /*
     * L(less16bytes): %edx = (16-bit equality mask) - 0xffff, which is
     * non-zero here; its lowest set bit is at the index of the first
     * differing byte of the 16-byte chunk ending at %edi / %esi. The tests
     * below narrow that down to one of the eight 16-bit units: %dl covers
     * bytes 0-7 and %dh bytes 8-15.
     */
778L(less16bytes):
779    test       %dl, %dl
780    jz         L(next_four_words)
781    test       $15, %dl
782    jz         L(second_two_words)
783    test       $3, %dl
784    jz         L(second_word)
    /* First unit: return the zero-extended unit of the first array minus that of the second. */
785    movzwl     -16(%edi), %eax
786    movzwl     -16(%esi), %ebx
787    subl       %ebx, %eax
788    RETURN
789
790    .p2align 4
791L(second_word):
792    movzwl     -14(%edi), %eax
793    movzwl     -14(%esi), %ebx
794    subl       %ebx, %eax
795    RETURN
796
797    .p2align 4
    /* Bytes 0-3 equal: decide between the third unit (bytes 4-5) and the fourth (bytes 6-7). */
798L(second_two_words):
799    test       $63, %dl
800    jz         L(fourth_word)
801    movzwl     -12(%edi), %eax
802    movzwl     -12(%esi), %ebx
803    subl       %ebx, %eax
804    RETURN
805
806    .p2align 4
807L(fourth_word):
808    movzwl     -10(%edi), %eax
809    movzwl     -10(%esi), %ebx
810    subl       %ebx, %eax
811    RETURN
812
813    .p2align 4
    /* Bytes 0-7 equal: repeat the same selection on %dh for the fifth to eighth units. */
814L(next_four_words):
815    test       $15, %dh
816    jz         L(fourth_two_words)
817    test       $3, %dh
818    jz         L(sixth_word)
819    movzwl     -8(%edi), %eax
820    movzwl     -8(%esi), %ebx
821    subl       %ebx, %eax
822    RETURN
823
824    .p2align 4
825L(sixth_word):
826    movzwl     -6(%edi), %eax
827    movzwl     -6(%esi), %ebx
828    subl       %ebx, %eax
829    RETURN
830
831    .p2align 4
832L(fourth_two_words):
833    test       $63, %dh
834    jz         L(eighth_word)
835    movzwl     -4(%edi), %eax
836    movzwl     -4(%esi), %ebx
837    subl       %ebx, %eax
838    RETURN
839
840    .p2align 4
841L(eighth_word):
842    movzwl     -2(%edi), %eax
843    movzwl     -2(%esi), %ebx
844    subl       %ebx, %eax
845    RETURN
846
847    # Unreachable, but needed for static analysis in the check_cfi.py script,
848    # since it does just single forward pass, but the code below is only
849    # reachable via a backward branch.
850    CFI_DEF_CFA (esp, 4)
851    PUSH       (%ebx)
852
    /*
     * Scalar tail dispatch. Entered at L(less48bytes) with %ecx = number of
     * bytes left, %eax / %edx = end-relative pointers into the first / second
     * array, and only %ebx pushed. %ecx is narrowed in steps of 8 and then
     * matched exactly; the last size of each group is reached by an
     * unconditional jmp.
     */
853    .p2align 4
    /* 8 <= %ecx: handles 8..14 here, larger values go to L(more16bytes). */
854L(more8bytes):
855    cmp        $16, %ecx
856    jae        L(more16bytes)
857    cmp        $8, %ecx
858    je         L(8bytes)
859    cmp        $10, %ecx
860    je         L(10bytes)
861    cmp        $12, %ecx
862    je         L(12bytes)
863    jmp        L(14bytes)
864
865    .p2align 4
    /* 16 <= %ecx: handles 16..22 here, larger values go to L(more24bytes). */
866L(more16bytes):
867    cmp        $24, %ecx
868    jae        L(more24bytes)
869    cmp        $16, %ecx
870    je         L(16bytes)
871    cmp        $18, %ecx
872    je         L(18bytes)
873    cmp        $20, %ecx
874    je         L(20bytes)
875    jmp        L(22bytes)
876
877    .p2align 4
    /* 24 <= %ecx: handles 24..30 here, larger values go to L(more32bytes). */
878L(more24bytes):
879    cmp        $32, %ecx
880    jae        L(more32bytes)
881    cmp        $24, %ecx
882    je         L(24bytes)
883    cmp        $26, %ecx
884    je         L(26bytes)
885    cmp        $28, %ecx
886    je         L(28bytes)
887    jmp        L(30bytes)
888
889    .p2align 4
    /* 32 <= %ecx: handles 32..38 here, larger values go to L(more40bytes). */
890L(more32bytes):
891    cmp        $40, %ecx
892    jae        L(more40bytes)
893    cmp        $32, %ecx
894    je         L(32bytes)
895    cmp        $34, %ecx
896    je         L(34bytes)
897    cmp        $36, %ecx
898    je         L(36bytes)
899    jmp        L(38bytes)
900
901    .p2align 4
    /*
     * Entry point of the dispatch. Values of 8 and above branch backwards to
     * L(more8bytes); 2 and 4 are matched exactly and every other value below
     * 8 falls through to L(6bytes).
     */
902L(less48bytes):
903    cmp        $8, %ecx
904    jae        L(more8bytes)
905    cmp        $2, %ecx
906    je         L(2bytes)
907    cmp        $4, %ecx
908    je         L(4bytes)
909    jmp        L(6bytes)
910
911    .p2align 4
    /* 40 <= %ecx: 40, 42 and 44 are matched exactly, everything else goes to L(46bytes). */
912L(more40bytes):
913    cmp        $40, %ecx
914    je         L(40bytes)
915    cmp        $42, %ecx
916    je         L(42bytes)
917    cmp        $44, %ecx
918    je         L(44bytes)
919    jmp        L(46bytes)
920
921    .p2align 4
    /*
     * Scalar tail chain. %eax / %edx point at the end of the first / second
     * array; L(Nbytes) compares the 16-bit unit located N bytes before the
     * end and falls through to the next label, so entering at L(Nbytes)
     * compares the last N bytes in ascending address order. Each step leaves
     * the difference of the two zero-extended units in %ecx and leaves for
     * L(memcmp16_exit) as soon as it is non-zero. %ebx is scratch (it was
     * pushed before the chain is entered).
     */
922L(46bytes):
923    movzwl     -46(%eax), %ecx
924    movzwl     -46(%edx), %ebx
925    subl       %ebx, %ecx
926    jne        L(memcmp16_exit)
927L(44bytes):
928    movzwl     -44(%eax), %ecx
929    movzwl     -44(%edx), %ebx
930    subl       %ebx, %ecx
931    jne        L(memcmp16_exit)
932L(42bytes):
933    movzwl     -42(%eax), %ecx
934    movzwl     -42(%edx), %ebx
935    subl       %ebx, %ecx
936    jne        L(memcmp16_exit)
937L(40bytes):
938    movzwl     -40(%eax), %ecx
939    movzwl     -40(%edx), %ebx
940    subl       %ebx, %ecx
941    jne        L(memcmp16_exit)
942L(38bytes):
943    movzwl     -38(%eax), %ecx
944    movzwl     -38(%edx), %ebx
945    subl       %ebx, %ecx
946    jne        L(memcmp16_exit)
947L(36bytes):
948    movzwl     -36(%eax), %ecx
949    movzwl     -36(%edx), %ebx
950    subl       %ebx, %ecx
951    jne        L(memcmp16_exit)
952L(34bytes):
953    movzwl     -34(%eax), %ecx
954    movzwl     -34(%edx), %ebx
955    subl       %ebx, %ecx
956    jne        L(memcmp16_exit)
957L(32bytes):
958    movzwl     -32(%eax), %ecx
959    movzwl     -32(%edx), %ebx
960    subl       %ebx, %ecx
961    jne        L(memcmp16_exit)
962L(30bytes):
963    movzwl     -30(%eax), %ecx
964    movzwl     -30(%edx), %ebx
965    subl       %ebx, %ecx
966    jne        L(memcmp16_exit)
967L(28bytes):
968    movzwl     -28(%eax), %ecx
969    movzwl     -28(%edx), %ebx
970    subl       %ebx, %ecx
971    jne        L(memcmp16_exit)
972L(26bytes):
973    movzwl     -26(%eax), %ecx
974    movzwl     -26(%edx), %ebx
975    subl       %ebx, %ecx
976    jne        L(memcmp16_exit)
977L(24bytes):
978    movzwl     -24(%eax), %ecx
979    movzwl     -24(%edx), %ebx
980    subl       %ebx, %ecx
981    jne        L(memcmp16_exit)
982L(22bytes):
983    movzwl     -22(%eax), %ecx
984    movzwl     -22(%edx), %ebx
985    subl       %ebx, %ecx
986    jne        L(memcmp16_exit)
987L(20bytes):
988    movzwl     -20(%eax), %ecx
989    movzwl     -20(%edx), %ebx
990    subl       %ebx, %ecx
991    jne        L(memcmp16_exit)
992L(18bytes):
993    movzwl     -18(%eax), %ecx
994    movzwl     -18(%edx), %ebx
995    subl       %ebx, %ecx
996    jne        L(memcmp16_exit)
997L(16bytes):
998    movzwl     -16(%eax), %ecx
999    movzwl     -16(%edx), %ebx
1000    subl       %ebx, %ecx
1001    jne        L(memcmp16_exit)
1002L(14bytes):
1003    movzwl     -14(%eax), %ecx
1004    movzwl     -14(%edx), %ebx
1005    subl       %ebx, %ecx
1006    jne        L(memcmp16_exit)
1007L(12bytes):
1008    movzwl     -12(%eax), %ecx
1009    movzwl     -12(%edx), %ebx
1010    subl       %ebx, %ecx
1011    jne        L(memcmp16_exit)
1012L(10bytes):
1013    movzwl     -10(%eax), %ecx
1014    movzwl     -10(%edx), %ebx
1015    subl       %ebx, %ecx
1016    jne        L(memcmp16_exit)
1017L(8bytes):
1018    movzwl     -8(%eax), %ecx
1019    movzwl     -8(%edx), %ebx
1020    subl       %ebx, %ecx
1021    jne        L(memcmp16_exit)
1022L(6bytes):
1023    movzwl     -6(%eax), %ecx
1024    movzwl     -6(%edx), %ebx
1025    subl       %ebx, %ecx
1026    jne        L(memcmp16_exit)
1027L(4bytes):
1028    movzwl     -4(%eax), %ecx
1029    movzwl     -4(%edx), %ebx
1030    subl       %ebx, %ecx
1031    jne        L(memcmp16_exit)
    /* Last unit: the difference is computed directly in %eax and returned (0 when equal). */
1032L(2bytes):
1033    movzwl     -2(%eax), %eax
1034    movzwl     -2(%edx), %ebx
1035    subl       %ebx, %eax
1036    POP        (%ebx)
1037    ret
    /* CFI only: L(memcmp16_exit) is entered with %ebx still pushed. */
1038    CFI_PUSH   (%ebx)
1039
1040    .p2align 4
    /* Early exit of the chain: return the non-zero difference held in %ecx. */
1041L(memcmp16_exit):
1042    POP        (%ebx)
1043    mov        %ecx, %eax
1044    ret
1045END_FUNCTION MEMCMP
1046