/*
 * memchr - find a character in a memory zone
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/*
 * Arguments and results.
 *
 *   srcin   address of the first byte to examine
 *   chrin   value to look for; it is replicated with a byte-lane dup, so
 *           only its low 8 bits take part in the comparison
 *   cntin   number of bytes to examine
 *   result  address of the first matching byte, or 0 when there is none
 *           (shares x0 with srcin)
 */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

/*
 * General-purpose temporaries.
 *
 *   src     32-byte-aligned read pointer, post-incremented by every load
 *   tmp     scratch for shift amounts and for adjusting cntin
 *   wtmp2   holds the 0x40100401 lane-identification constant
 *   synd    syndrome of the 32-byte chunk currently being examined
 *   soff    srcin & 31: offset of the first byte inside its aligned chunk
 *   cntrem  cntin & 31, sampled at entry and used to mask the final chunk
 */
#define src		x3
#define tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

/*
 * Vector registers.
 *
 *   vrepchr      chrin replicated into all 16 byte lanes
 *   vdata1/2     the 32 bytes of the chunk just loaded
 *   vhas_chr1/2  per-byte comparison results for vdata1 / vdata2
 *   vrepmask     wtmp2 replicated into all four 32-bit lanes
 *   vend         reduction result from which synd is extracted
 */
#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32-bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows us to
 * identify exactly which byte has matched.
 */

/*
 * __memchr_aarch64
 *
 * Finds the first byte equal to the low 8 bits of chrin among the cntin
 * bytes starting at srcin.  Presumably called with the C memchr prototype
 * (const void *, int, size_t); only the register usage is visible here.
 *
 * In:    x0 = srcin, w1 = chrin, x2 = cntin
 * Out:   x0 = address of the first match, or 0 if cntin is 0 or no byte in
 *             range matched
 * Uses:  x3-x6, x9, x10, v0-v6 and the NZCV flags; no stack, no calls.
 *
 * Memory is always read as whole 32-byte-aligned chunks.  Bytes that share a
 * chunk with the buffer but lie before srcin or after the last requested
 * byte are therefore loaded as well, and are masked out of the syndrome
 * before it is inspected.
 */
ENTRY (__memchr_aarch64)
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte: replicated per 32-bit lane it gives the four
	 * bytes of each lane the values 0x01, 0x04, 0x10 and 0x40, i.e. one
	 * distinct even bit per byte, so two rounds of pairwise byte addition
	 * pack four bytes into one syndrome byte without carries.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	/* soff == 0 (Z set) means srcin is already 32-byte aligned.  */
	ands	soff, srcin, #31
	/* "and" leaves the flags from "ands" intact for the b.eq below.  */
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	/*
	 * cntin -= (32 - soff), the number of requested bytes covered by this
	 * first chunk.  The flags set here are consumed by the b.ls further
	 * down; none of the instructions in between writes NZCV.
	 */
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits (bytes located before srcin) */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/*
	 * The first block can also be the last: taken when the original
	 * cntin was <= 32 - soff (flags from the adds above).
	 */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	/* Flags from this subs drive both b.ls below and b.hi in L(end).  */
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/*
	 * Use a fast check for the termination condition: merge both
	 * comparison vectors and fold 128 bits into 64, which is non-zero
	 * when any byte of the chunk matched.
	 */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/*
	 * Only do the clear for the last possible block.  The flags are
	 * still those of the subs in L(loop): "hi" means data remains after
	 * this chunk, so every byte of it is inside the requested range.
	 */
	b.hi	L(tail)

L(masklast):
	/*
	 * Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits.
	 * When (cntrem + soff) % 32 is 0 the computed amount is 64; register
	 * shifts use the amount modulo 64, so nothing is cleared and the
	 * whole chunk stays valid.
	 */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/*
	 * Check that we have found a character.  The flags set here reach
	 * the csel below: clz and add do not write NZCV.
	 */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result (two syndrome bits per byte) */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)
