/*
 * strlen - calculate the length of a string
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 */

#include "../asmdefs.h"

/* Arguments and results.  srcin and len are both x0: the result
   overwrites the argument, so srcin must not be read once len has
   been written.  */
#define srcin		x0
#define len		x0

/* Locals and temporaries.  has_nul1/tmp1 share x4 and has_nul2/tmp2
   share x5, so writing one name of a pair destroys the other.  */
#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8
#define offset		x9

	/* NUL detection works on the principle that, for each byte X,
	   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff
	   the byte is zero, and can be done in parallel across the entire
	   word. A faster check, (X - 1) & 0x80, is zero for non-NUL ASCII
	   characters, but gives false hits for characters 129..255.  */

/* Byte-replicated 64-bit constants for the word-at-a-time NUL check:
   REP8_01 is the per-byte 1 that gets subtracted, REP8_7f is OR-ed in
   so that only bit 7 of each byte survives the final bic.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

	/* This implementation is compatible with Memory Tagging. All loads
	   are 16 bytes in size and 16 bytes aligned. This also avoids the
	   need for page boundary checks. This implementation is correct
	   even without Memory Tagging, but other implementations could be
	   more beneficial if Memory Tagging is not enabled.

	   First load is aligned down and can contain bytes that are located
	   before the string. This is handled by modifying the "zeroones"
	   mask. The bytes that need to be ignored are set to zero.
	   If the string is aligned in such a way that 8 or more bytes from
	   the first load should be ignored, there is a special case
	   (skip_first_8_bytes) which only compares the second 8 bytes.

	   If there is a NUL byte in the first load, we calculate the length
	   from the 2 8-byte words using conditional select to reduce branch
	   mispredictions.

	   If the string is longer than 16 bytes, we check 32 bytes per
	   iteration using the fast NUL check (main_loop). If we encounter
	   non-ASCII characters, we fall back to a second loop
	   (nonascii_loop) using the full NUL check.  */

/* __strlen_aarch64_mte

   In:	srcin (x0) = address of the first byte of the string.
   Out:	len (x0)   = number of bytes before the first NUL byte.
   Writes x1-x9 and the condition flags; does not touch the stack and
   makes no calls.

   src bookkeeping: the first ldp post-decrements src to 16 bytes below
   the first aligned chunk; after that, every pre-indexed ldp leaves src
   pointing at the chunk it has just loaded.

   Flag convention used before every length calculation: the sequence
   "bics has_nul1 / bic has_nul2 / ccmp has_nul2, 0, 0, eq" leaves
   Z set iff neither word contains a NUL, and C set iff has_nul1 == 0
   (i.e. the NUL, if any, is in the second word).  */
ENTRY(__strlen_aarch64_mte)
	bic	src, srcin, 15	/* Align down to 16 bytes.  */
	mov	zeroones, REP8_01
	/* (offset & 63) holds number of bits to ignore in a register.  */
	lsl	offset, srcin, 3
	ldp	data1, data2, [src], -16
	/* Zero the 0x01 bytes that line up with bytes located before the
	   string: nothing is subtracted from those bytes, so they cannot
	   be reported as NUL.  */
	lsl	tmp1, zeroones, offset	/* Shift (offset & 63).  */
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   e.g. 0x0100 - 0x0101 = 0xffff, so 0x01 will be mistaken for NUL.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* Bit 3 of the address set means the whole first word precedes
	   the string; only data2 needs checking.  */
	tbnz	srcin, 3, L(skip_first_8_bytes)
	/* Accurate check on both words; data1 uses the shifted mask.  */
	sub	tmp1, data1, tmp1
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* If comparison happens, C flag is always set. */
	ccmp	has_nul2, 0, 0, eq
	beq	L(main_loop)

	/* Enter with C = has_nul1 == 0.  None of the instructions below
	   set flags, so C is still valid at the second csel.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	and	tmp2, srcin, 7	/* Bytes to ignore. */
	/* Byte-reverse so the first NUL in memory order becomes the most
	   significant set bit, which clz can then locate.  */
	rev	has_nul1, has_nul1
	neg	tmp2, tmp2
	clz	tmp1, has_nul1	/* Count bits before NUL. */
	/* Add 8 if NUL byte is not in first register. */
	add	tmp3, tmp2, 8
	csel	len, tmp2, tmp3, cc
	add	len, len, tmp1, lsr 3	/* Bits -> bytes.  */
	ret

L(skip_first_8_bytes):
	/* tmp1 still holds the shifted mask; here (offset & 63) is
	   8 * (srcin & 7), the ignored bytes within data2.  */
	sub	tmp1, data2, tmp1
	orr	tmp2, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	beq	L(main_loop)

	rev	has_nul1, has_nul1
	lsl	tmp1, has_nul1, offset	/* Ignore bytes before string. */
	clz	tmp1, tmp1	/* Count bits before NUL. */
	lsr	len, tmp1, 3
	ret

	/* The inner loop processes 32 bytes per iteration and uses the fast
	   NUL check.  If we encounter non-ASCII characters, use a second
	   loop with the accurate NUL check.  */
	.p2align 4
L(main_loop):
	/* First half: advance src by 32 and test that chunk.  */
	ldp	data1, data2, [src, 32]!
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7	/* Any bit 7 set in (X - 1)?  */
	bne	1f
	/* Second half: test the chunk at src + 16 without writeback.  */
	ldp	data1, data2, [src, 16]
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	beq	L(main_loop)
	/* Hit in the second half: make src point at that chunk.  */
	add	src, src, 16
1:
	/* The fast check failed, so do the slower, accurate NUL check.
	   tmp1/tmp3 still hold data1/data2 minus zeroones.  */
	orr	tmp2, data1, REP8_7f
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)

	/* Enter with C = has_nul1 == 0, src = address of the chunk that
	   contains the NUL.  */
L(tail):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, cc
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, cc
#endif
	/* len = bytes from the string start to this chunk; srcin is dead
	   after this instruction because len shares its register.  */
	sub	len, src, srcin
	rev	has_nul1, has_nul1
	add	tmp2, len, 8	/* Length if the NUL is in the second word.  */
	clz	tmp1, has_nul1
	csel	len, len, tmp2, cc
	add	len, len, tmp1, lsr 3	/* Bits -> bytes.  */
	ret

	/* Accurate-check loop, unrolled twice; each half advances src by
	   16 and leaves through L(tail) when a NUL is found.  */
L(nonascii_loop):
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	bne	L(tail)
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)
	b	L(tail)

END(__strlen_aarch64_mte)
