/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "asm_support_x86.S"

/* Symbol name of the routine defined by DEFINE_FUNCTION MEMCMP. */
#define MEMCMP  __memcmp16

/* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */

/* L(name) spells an assembler-local label: L(foo) becomes .Lfoo. */
#ifndef L
# define L(label)    .L##label
#endif

/* Unwind bookkeeping for one 4-byte push: CFA moves by 4, REG is saved at 0(%esp). */
#define CFI_PUSH(REG)    \
    CFI_ADJUST_CFA_OFFSET(4);    \
    CFI_REL_OFFSET(REG, 0)

/* Unwind bookkeeping for one 4-byte pop: CFA moves back by 4, REG is no longer saved. */
#define CFI_POP(REG)    \
    CFI_ADJUST_CFA_OFFSET(-4);    \
    CFI_RESTORE(REG)

/* Push/pop a register and keep the unwind information in sync. */
#define PUSH(REG)    pushl REG; CFI_PUSH (REG)
#define POP(REG)    popl REG; CFI_POP (REG)

/*
 * Offsets of the three stack arguments relative to %esp at function entry,
 * i.e. before anything has been pushed (0(%esp) is the return address).
 */
#define PARMS        4
#define BLK1        PARMS
#define BLK2        BLK1+4
#define LEN        BLK2+4

/* Epilogue for paths that saved %ebx, %esi and %edi (in that push order). */
#define RETURN_END    POP (%edi); POP (%esi); POP (%ebx); ret
/*
 * RETURN_END followed by restoring the remembered CFI state (and remembering
 * it again) so that the code placed after the return is still described with
 * the three-register frame.
 */
#define RETURN        RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE

/*
 * MEMCMP: compare two buffers of 16-bit units (32-bit x86, AT&T syntax).
 *
 * Arguments are read from the stack relative to %esp at entry:
 *   BLK1(%esp)   first buffer  (s0)
 *   BLK2(%esp)   second buffer (s1)
 *   LEN(%esp)    number of 16-bit units
 *
 * Result (%eax): 0 if every unit matches, otherwise
 * (unit of s0) - (unit of s1) for the lowest-addressed differing pair, with
 * both units zero-extended to 32 bits before the subtraction.
 *
 * Scratch: %ecx, %edx, %xmm0-%xmm3, flags.  %ebx, %esi and %edi are pushed
 * and popped on every path that uses them.
 *
 * Requires SSE2 and SSSE3 (palignr).
 *
 * Structure:
 *   - length 0: return 0.
 *   - fewer than 48 bytes: unrolled word-by-word compare (L(less48bytes)).
 *   - 48 bytes or more: the first 16 bytes are compared unaligned, then s0
 *     is aligned down to 16 bytes and the code dispatches on the resulting
 *     misalignment of s1 (L(shr_0) .. L(shr_14)).  Each shr_N variant
 *     compares 32 bytes per step, rebuilding the s1 data with palignr $N,
 *     and hands the remaining tail to L(less48bytes).
 *
 * NOTE(review): the shr_N dispatch only tests even residues, so it presumes
 * both pointers are 2-byte aligned; an odd residue would end up in
 * L(shr_14).  Confirm against the callers.
 */
DEFINE_FUNCTION MEMCMP
    movl       LEN(%esp), %ecx

    /* Convert the unit count to a byte count; nothing to compare if zero. */
    shl        $1, %ecx
    jz         L(zero)

    movl       BLK1(%esp), %eax
    cmp        $48, %ecx
    movl       BLK2(%esp), %edx          /* mov does not disturb the flags of cmp */
    jae        L(48bytesormore)

    /*
     * Short input: advance both pointers to the end of the buffers; the tail
     * code addresses the data with negative offsets from there.
     */
    PUSH       (%ebx)
    add        %ecx, %edx
    add        %ecx, %eax
    jmp        L(less48bytes)

    /* Code below is reached without %ebx pushed. */
    CFI_POP    (%ebx)

    .p2align 4
L(zero):
    xor        %eax, %eax
    ret

    .p2align 4
L(48bytesormore):
    PUSH       (%ebx)
    PUSH       (%esi)
    PUSH       (%edi)
    CFI_REMEMBER_STATE
    /* Compare the first 16 bytes with unaligned loads. */
    movdqu     (%eax), %xmm3
    movdqu     (%edx), %xmm0
    movl       %eax, %edi
    movl       %edx, %esi
    pcmpeqb    %xmm0, %xmm3
    pmovmskb   %xmm3, %edx
    lea        16(%edi), %edi

    /* The mask is 0xffff when all 16 bytes match, so the result is 0 then. */
    sub        $0xffff, %edx
    lea        16(%esi), %esi            /* lea keeps the flags of sub */
    jnz        L(less16bytes)

    /*
     * Align %edi down to 16 bytes, move %esi back by the same amount and
     * enlarge the byte count accordingly (those bytes were already found
     * equal and are simply compared again).
     */
    mov        %edi, %edx
    and        $0xf, %edx
    xor        %edx, %edi
    sub        %edx, %esi
    add        %edx, %ecx

    /* %edx = misalignment of %esi; 0 means both streams are aligned. */
    mov        %esi, %edx
    and        $0xf, %edx
    jz         L(shr_0)
    xor        %edx, %esi                /* align %esi down; %edx keeps the shift */

    /* %edx is non-zero here; dispatch on the shift amount. */
    cmp        $2, %edx
    je         L(shr_2)
    cmp        $4, %edx
    je         L(shr_4)
    cmp        $6, %edx
    je         L(shr_6)
    cmp        $8, %edx
    je         L(shr_8)
    cmp        $10, %edx
    je         L(shr_10)
    cmp        $12, %edx
    je         L(shr_12)
    jmp        L(shr_14)

    /*
     * shr_0: both %edi and %esi are 16-byte aligned.
     * %eax is cleared because L(exit) adds it to %esi as the shift amount.
     */
    .p2align 4
L(shr_0):
    cmp        $80, %ecx
    jae        L(shr_0_gobble)
    lea        -48(%ecx), %ecx
    xor        %eax, %eax
    movaps     (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1
    movaps     16(%esi), %xmm2
    pcmpeqb    16(%edi), %xmm2
    pand       %xmm1, %xmm2
    pmovmskb   %xmm2, %edx
    add        $32, %edi
    add        $32, %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    /* 32 bytes matched: point %eax/%edx at the buffer ends for the tail. */
    lea        (%ecx, %edi,1), %eax
    lea        (%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_0_gobble):
    lea        -48(%ecx), %ecx
    movdqa     (%esi), %xmm0
    xor        %eax, %eax
    pcmpeqb    (%edi), %xmm0
    sub        $32, %ecx
    movdqa     16(%esi), %xmm2
    pcmpeqb    16(%edi), %xmm2
    /*
     * Loop: at the top %xmm0/%xmm2 hold the byte-equality results for the
     * 32 bytes at %edi.  The carry of "sub $32, %ecx" is folded into the
     * mask test by sbb, so the loop ends either on a mismatch or when the
     * counter borrows.  The SSE instructions and lea in between leave the
     * flags untouched.
     */
L(shr_0_gobble_loop):
    pand       %xmm0, %xmm2
    sub        $32, %ecx
    pmovmskb   %xmm2, %edx
    movdqa     %xmm0, %xmm1              /* keep first-half result for L(exit) */
    movdqa     32(%esi), %xmm0
    movdqa     48(%esi), %xmm2
    sbb        $0xffff, %edx
    pcmpeqb    32(%edi), %xmm0
    pcmpeqb    48(%edi), %xmm2
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    jz         L(shr_0_gobble_loop)

    pand       %xmm0, %xmm2
    cmp        $0, %ecx
    jge        L(shr_0_gobble_loop_next)
    /* Counter borrowed: undo the borrow in %edx and in %ecx. */
    inc        %edx
    add        $32, %ecx
L(shr_0_gobble_loop_next):
    test       %edx, %edx
    jnz        L(exit)

    /* Check the 32 bytes whose comparison was already computed. */
    pmovmskb   %xmm2, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        (%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /*
     * shr_2: s1 data starts 2 bytes above the aligned %esi.  Each 16-byte
     * chunk of s1 is rebuilt from two aligned loads with palignr $2.
     * %eax = shift amount (used by L(exit) to correct %esi).
     */
    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_2):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx           /* lea and mov keep the flags of cmp */
    mov        %edx, %eax
    jae        L(shr_2_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $2, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $2, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        2(%ecx, %esi,1), %edx     /* +2 restores the real s1 position */
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_2_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $2, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $2, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

    /* Same loop shape as L(shr_0_gobble_loop), with %xmm3 as second half. */
L(shr_2_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $2, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $2, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_2_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_2_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_2_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        2(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* shr_4: same as shr_2 with a 4-byte shift. */
    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_4):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_4_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $4, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $4, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        4(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_4_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $4, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $4, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_4_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $4, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $4, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_4_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_4_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_4_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        4(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* shr_6: same as shr_2 with a 6-byte shift. */
    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_6):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_6_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $6, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $6, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        6(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_6_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $6, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $6, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_6_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $6, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $6, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_6_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_6_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_6_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        6(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* shr_8: same as shr_2 with an 8-byte shift. */
    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_8):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_8_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $8, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $8, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        8(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_8_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $8, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $8, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_8_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $8, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $8, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_8_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_8_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_8_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        8(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* shr_10: same as shr_2 with a 10-byte shift. */
    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_10):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_10_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $10, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $10, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        10(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_10_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $10, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $10, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_10_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $10, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $10, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_10_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_10_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_10_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        10(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* shr_12: same as shr_2 with a 12-byte shift. */
    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_12):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_12_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $12, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $12, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        12(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_12_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $12, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $12, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_12_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $12, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $12, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_12_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_12_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_12_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        12(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* shr_14: same as shr_2 with a 14-byte shift (also the dispatch default). */
    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_14):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_14_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $14, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $14, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        14(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_14_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $14, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $14, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_14_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $14, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $14, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_14_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_14_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_14_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        14(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /*
     * exit: a mismatch lies in the 32 bytes that end at %edi.
     *   %xmm1 = byte-equality result of the first 16 of those bytes
     *   %edx  = (combined mask) - 0xffff, non-zero
     *   %eax  = shift amount of the shr_N variant (0 for shr_0)
     * If the first half differs, step both pointers back by 16 and use its
     * mask; afterwards the differing 16 bytes end at %edi / %esi.
     */
    CFI_RESTORE_STATE
    CFI_REMEMBER_STATE
    .p2align 4
L(exit):
    pmovmskb   %xmm1, %ebx
    sub        $0xffff, %ebx
    jz         L(first16bytes)
    lea        -16(%esi), %esi
    lea        -16(%edi), %edi
    mov        %ebx, %edx

L(first16bytes):
    add        %eax, %esi                /* undo the alignment of %esi */
    /*
     * less16bytes: the lowest set bit in the low 16 bits of %edx is the
     * index of the first differing byte within the 16 bytes ending at
     * %edi / %esi.  Narrow it down to one of the eight words and return the
     * difference of that word pair.
     */
L(less16bytes):
    test       %dl, %dl
    jz         L(next_four_words)
    test       $15, %dl
    jz         L(second_two_words)
    test       $3, %dl
    jz         L(second_word)
    movzwl     -16(%edi), %eax
    movzwl     -16(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(second_word):
    movzwl     -14(%edi), %eax
    movzwl     -14(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(second_two_words):
    test       $63, %dl
    jz         L(fourth_word)
    movzwl     -12(%edi), %eax
    movzwl     -12(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(fourth_word):
    movzwl     -10(%edi), %eax
    movzwl     -10(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(next_four_words):
    test       $15, %dh
    jz         L(fourth_two_words)
    test       $3, %dh
    jz         L(sixth_word)
    movzwl     -8(%edi), %eax
    movzwl     -8(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(sixth_word):
    movzwl     -6(%edi), %eax
    movzwl     -6(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(fourth_two_words):
    test       $63, %dh
    jz         L(eighth_word)
    movzwl     -4(%edi), %eax
    movzwl     -4(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(eighth_word):
    movzwl     -2(%edi), %eax
    movzwl     -2(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    /*
     * RETURN left the CFI state at the three-register frame (%ebx, %esi,
     * %edi saved).  Everything from here to the end of the function runs
     * with only %ebx on the stack, so drop %edi and %esi from the unwind
     * description.
     */
    CFI_POP    (%edi)
    CFI_POP    (%esi)

    /*
     * Tail compare.  On entry to L(less48bytes):
     *   %eax / %edx = one past the last byte of s0 / s1
     *   %ecx        = number of trailing bytes still to compare
     *   %ebx        = saved on the stack, free as scratch
     * The L(moreNbytes) ladders select the L(Nbytes) entry of the unrolled
     * chain below that matches %ecx.
     */
    .p2align 4
L(more8bytes):
    cmp        $16, %ecx
    jae        L(more16bytes)
    cmp        $8, %ecx
    je         L(8bytes)
    cmp        $10, %ecx
    je         L(10bytes)
    cmp        $12, %ecx
    je         L(12bytes)
    jmp        L(14bytes)

    .p2align 4
L(more16bytes):
    cmp        $24, %ecx
    jae        L(more24bytes)
    cmp        $16, %ecx
    je         L(16bytes)
    cmp        $18, %ecx
    je         L(18bytes)
    cmp        $20, %ecx
    je         L(20bytes)
    jmp        L(22bytes)

    .p2align 4
L(more24bytes):
    cmp        $32, %ecx
    jae        L(more32bytes)
    cmp        $24, %ecx
    je         L(24bytes)
    cmp        $26, %ecx
    je         L(26bytes)
    cmp        $28, %ecx
    je         L(28bytes)
    jmp        L(30bytes)

    .p2align 4
L(more32bytes):
    cmp        $40, %ecx
    jae        L(more40bytes)
    cmp        $32, %ecx
    je         L(32bytes)
    cmp        $34, %ecx
    je         L(34bytes)
    cmp        $36, %ecx
    je         L(36bytes)
    jmp        L(38bytes)

    .p2align 4
L(less48bytes):
    cmp        $8, %ecx
    jae        L(more8bytes)
    cmp        $2, %ecx
    je         L(2bytes)
    cmp        $4, %ecx
    je         L(4bytes)
    /* Any other value below 8 is handled by the 6-byte entry. */
    jmp        L(6bytes)

    .p2align 4
L(more40bytes):
    cmp        $40, %ecx
    je         L(40bytes)
    cmp        $42, %ecx
    je         L(42bytes)
    cmp        $44, %ecx
    je         L(44bytes)
    jmp        L(46bytes)

    /*
     * Unrolled chain: each step compares one 16-bit unit at a fixed negative
     * offset from the buffer ends and falls through to the next unit while
     * they are equal.  A non-zero difference is left in %ecx for
     * L(memcmp16_exit).
     */
    .p2align 4
L(46bytes):
    movzwl     -46(%eax), %ecx
    movzwl     -46(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(44bytes):
    movzwl     -44(%eax), %ecx
    movzwl     -44(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(42bytes):
    movzwl     -42(%eax), %ecx
    movzwl     -42(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(40bytes):
    movzwl     -40(%eax), %ecx
    movzwl     -40(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(38bytes):
    movzwl     -38(%eax), %ecx
    movzwl     -38(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(36bytes):
    movzwl     -36(%eax), %ecx
    movzwl     -36(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(34bytes):
    movzwl     -34(%eax), %ecx
    movzwl     -34(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(32bytes):
    movzwl     -32(%eax), %ecx
    movzwl     -32(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(30bytes):
    movzwl     -30(%eax), %ecx
    movzwl     -30(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(28bytes):
    movzwl     -28(%eax), %ecx
    movzwl     -28(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(26bytes):
    movzwl     -26(%eax), %ecx
    movzwl     -26(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(24bytes):
    movzwl     -24(%eax), %ecx
    movzwl     -24(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(22bytes):
    movzwl     -22(%eax), %ecx
    movzwl     -22(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(20bytes):
    movzwl     -20(%eax), %ecx
    movzwl     -20(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(18bytes):
    movzwl     -18(%eax), %ecx
    movzwl     -18(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(16bytes):
    movzwl     -16(%eax), %ecx
    movzwl     -16(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(14bytes):
    movzwl     -14(%eax), %ecx
    movzwl     -14(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(12bytes):
    movzwl     -12(%eax), %ecx
    movzwl     -12(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(10bytes):
    movzwl     -10(%eax), %ecx
    movzwl     -10(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(8bytes):
    movzwl     -8(%eax), %ecx
    movzwl     -8(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(6bytes):
    movzwl     -6(%eax), %ecx
    movzwl     -6(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(4bytes):
    movzwl     -4(%eax), %ecx
    movzwl     -4(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
    /* Last unit: its difference (possibly 0) is the return value. */
L(2bytes):
    movzwl     -2(%eax), %eax
    movzwl     -2(%edx), %ebx
    subl       %ebx, %eax
    POP        (%ebx)
    ret
    /* L(memcmp16_exit) is entered with %ebx still pushed. */
    CFI_PUSH   (%ebx)

    .p2align 4
L(memcmp16_exit):
    POP        (%ebx)
    mov        %ecx, %eax
    ret
END_FUNCTION MEMCMP
