1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *  Copyright (c) 2016-2019 FUJITSU LIMITED. All rights reserved.
4  *  Author: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
5  *  Ported: Guangwen Feng <fenggw-fnst@cn.fujitsu.com>
6  *  Ported: Xiao Yang <yangx.jy@cn.fujitsu.com>
 *  Ported: Yang Xu <xuyang2018.jy@cn.fujitsu.com>
8  */
9 
10 /*
11  * Description:
12  *
13  * Test #1:
14  *  This is a regression test for the race condition between move_pages()
15  *  and freeing hugepages, where move_pages() calls follow_page(FOLL_GET)
16  *  for hugepages internally and tries to get its refcount without
17  *  preventing concurrent freeing.
18  *
19  *  This test can crash the buggy kernel, and the bug was fixed in:
20  *
21  *   commit e66f17ff71772b209eed39de35aaa99ba819c93d
22  *   Author: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
23  *   Date:   Wed Feb 11 15:25:22 2015 -0800
24  *
25  *   mm/hugetlb: take page table lock in follow_huge_pmd()
26  *
27  *  Test #2:
28  *   #2.1:
29  *   This is a regression test for the race condition, where move_pages()
30  *   and soft offline are called on a single hugetlb page concurrently.
31  *
32  *   This bug can crash the buggy kernel, and was fixed by:
33  *
34  *   commit c9d398fa237882ea07167e23bcfc5e6847066518
35  *   Author: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
36  *   Date:   Fri Mar 31 15:11:55 2017 -0700
37  *
38  *   mm, hugetlb: use pte_present() instead of pmd_present() in
39  *   follow_huge_pmd()
40  *
41  *   #2.2:
 *   This is also a regression test for a race condition causing SIGBUS
43  *   in hugepage migration/fault.
44  *
45  *   This bug was fixed by:
46  *
47  *   commit 4643d67e8cb0b3536ef0ab5cddd1cedc73fa14ad
48  *   Author: Mike Kravetz <mike.kravetz@oracle.com>
49  *   Date:   Tue Aug 13 15:38:00 2019 -0700
50  *
51  *   hugetlbfs: fix hugetlb page migration/fault race causing SIGBUS
52  *
53  */
54 
55 #include <errno.h>
56 #include <unistd.h>
57 #include <string.h>
58 #include <stdio.h>
59 #include <sys/types.h>
60 #include <sys/wait.h>
61 
62 #include "tst_test.h"
63 #include "move_pages_support.h"
64 #include "lapi/mmap.h"
65 
66 #ifdef HAVE_NUMA_V2
67 
/* Upper bound on the parent's map/touch/unmap iterations per test case. */
#define LOOPS			1000
/* Number of NUMA nodes the test pages are migrated between. */
#define TEST_NODES		2

/* procfs/sysfs entries used to inspect and resize the hugepage pool. */
#define PATH_MEMINFO		"/proc/meminfo"
#define PATH_NR_HUGEPAGES	"/proc/sys/vm/nr_hugepages"
#define PATH_HUGEPAGES		"/sys/kernel/mm/hugepages/"
73 
/*
 * Test cases, indexed by the number handed to do_test().
 *
 * tpages:  how many hugepages the parent maps on every iteration
 * offline: non-zero to additionally soft-offline the mapping before it is
 *          unmapped, zero to only map, touch and unmap it
 */
static struct tcase {
	int tpages;
	int offline;
} tcases[] = {
	{ .tpages = 2, .offline = 0 },
	{ .tpages = 2, .offline = 1 },
};
81 
/*
 * Base page size in bytes, and the default hugepage size. hpsz holds the
 * value in kB as read from /proc/meminfo until setup() scales it to bytes.
 */
static int pgsz;
static int hpsz;

/*
 * Pool sizes found before the test resized them; -1 means "not touched",
 * which is how cleanup() decides what needs to be written back.
 */
static long orig_hugepages = -1;
static long orig_hugepages_node1 = -1;
static long orig_hugepages_node2 = -1;

/* Per-node nr_hugepages sysfs paths, filled in by setup(). */
static char path_hugepages_node1[PATH_MAX];
static char path_hugepages_node2[PATH_MAX];

/* The two NUMA nodes the pages are bounced between. */
static unsigned int node1;
static unsigned int node2;

/* Address of the hugepage mapping shared by do_test() and do_child(). */
static void *addr;
90 
do_soft_offline(int tpgs)91 static int do_soft_offline(int tpgs)
92 {
93 	if (madvise(addr, tpgs * hpsz, MADV_SOFT_OFFLINE) == -1) {
94 		if (errno != EINVAL && errno != EBUSY)
95 			tst_res(TFAIL | TERRNO, "madvise failed");
96 		return errno;
97 	}
98 	return 0;
99 }
100 
do_child(int tpgs)101 static void do_child(int tpgs)
102 {
103 	int test_pages = tpgs * hpsz / pgsz;
104 	int i, j;
105 	int *nodes, *status;
106 	void **pages;
107 	pid_t ppid = getppid();
108 
109 	pages = SAFE_MALLOC(sizeof(char *) * test_pages);
110 	nodes = SAFE_MALLOC(sizeof(int) * test_pages);
111 	status = SAFE_MALLOC(sizeof(int) * test_pages);
112 
113 	for (i = 0; i < test_pages; i++)
114 		pages[i] = addr + i * pgsz;
115 
116 	for (i = 0; ; i++) {
117 		for (j = 0; j < test_pages; j++) {
118 			if (i % 2 == 0)
119 				nodes[j] = node1;
120 			else
121 				nodes[j] = node2;
122 			status[j] = 0;
123 		}
124 
125 		TEST(numa_move_pages(ppid, test_pages,
126 			pages, nodes, status, MPOL_MF_MOVE_ALL));
127 		if (TST_RET < 0) {
128 			if (errno == ENOMEM)
129 				continue;
130 
131 			tst_res(TFAIL | TTERRNO, "move_pages failed");
132 			break;
133 		}
134 	}
135 
136 	exit(0);
137 }
138 
do_test(unsigned int n)139 static void do_test(unsigned int n)
140 {
141 	int i, ret;
142 	void *ptr;
143 	pid_t cpid = -1;
144 	int status;
145 	unsigned int twenty_percent = (tst_timeout_remaining() / 5);
146 
147 	addr = SAFE_MMAP(NULL, tcases[n].tpages * hpsz, PROT_READ | PROT_WRITE,
148 		MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
149 
150 	SAFE_MUNMAP(addr, tcases[n].tpages * hpsz);
151 
152 	cpid = SAFE_FORK();
153 	if (cpid == 0)
154 		do_child(tcases[n].tpages);
155 
156 	for (i = 0; i < LOOPS; i++) {
157 		ptr = mmap(NULL, tcases[n].tpages * hpsz,
158 				PROT_READ | PROT_WRITE,
159 				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
160 		if (ptr == MAP_FAILED) {
161 			if (i == 0)
162 				tst_brk(TBROK | TERRNO, "Cannot allocate hugepage");
163 
164 			if (errno == ENOMEM) {
165 				usleep(1000);
166 				continue;
167 			}
168 		}
169 
170 		if (ptr != addr)
171 			tst_brk(TBROK, "Failed to mmap at desired addr");
172 
173 		memset(addr, 0, tcases[n].tpages * hpsz);
174 
175 		if (tcases[n].offline) {
176 			ret = do_soft_offline(tcases[n].tpages);
177 
178 			if (ret == EINVAL) {
179 				SAFE_KILL(cpid, SIGKILL);
180 				SAFE_WAITPID(cpid, &status, 0);
181 				SAFE_MUNMAP(addr, tcases[n].tpages * hpsz);
182 				tst_res(TCONF,
183 					"madvise() didn't support MADV_SOFT_OFFLINE");
184 				return;
185 			}
186 		}
187 
188 		SAFE_MUNMAP(addr, tcases[n].tpages * hpsz);
189 
190 		if (tst_timeout_remaining() < twenty_percent)
191 			break;
192 	}
193 
194 	SAFE_KILL(cpid, SIGKILL);
195 	SAFE_WAITPID(cpid, &status, 0);
196 	if (!WIFEXITED(status))
197 		tst_res(TPASS, "Bug not reproduced");
198 }
199 
alloc_free_huge_on_node(unsigned int node,size_t size)200 static void alloc_free_huge_on_node(unsigned int node, size_t size)
201 {
202 	char *mem;
203 	long ret;
204 	struct bitmask *bm;
205 
206 	tst_res(TINFO, "Allocating and freeing %zu hugepages on node %u",
207 		size / hpsz, node);
208 
209 	mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
210 		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
211 	if (mem == MAP_FAILED) {
212 		if (errno == ENOMEM)
213 			tst_brk(TCONF, "Cannot allocate huge pages");
214 
215 		tst_brk(TBROK | TERRNO, "mmap(..., MAP_HUGETLB, ...) failed");
216 	}
217 
218 	bm = numa_bitmask_alloc(numa_max_possible_node() + 1);
219 	if (!bm)
220 		tst_brk(TBROK | TERRNO, "numa_bitmask_alloc() failed");
221 
222 	numa_bitmask_setbit(bm, node);
223 
224 	ret = mbind(mem, size, MPOL_BIND, bm->maskp, bm->size + 1, 0);
225 	if (ret) {
226 		if (errno == ENOMEM)
227 			tst_brk(TCONF, "Cannot mbind huge pages");
228 
229 		tst_brk(TBROK | TERRNO, "mbind() failed");
230 	}
231 
232 	TEST(mlock(mem, size));
233 	if (TST_RET) {
234 		SAFE_MUNMAP(mem, size);
235 		if (TST_ERR == ENOMEM || TST_ERR == EAGAIN)
236 			tst_brk(TCONF, "Cannot lock huge pages");
237 		tst_brk(TBROK | TTERRNO, "mlock failed");
238 	}
239 
240 	numa_bitmask_free(bm);
241 
242 	SAFE_MUNMAP(mem, size);
243 }
244 
setup(void)245 static void setup(void)
246 {
247 	int ret;
248 	long memfree;
249 
250 	check_config(TEST_NODES);
251 
252 	if (access(PATH_HUGEPAGES, F_OK))
253 		tst_brk(TCONF, "Huge page not supported");
254 
255 	ret = get_allowed_nodes(NH_MEMS, TEST_NODES, &node1, &node2);
256 	if (ret < 0)
257 		tst_brk(TBROK | TERRNO, "get_allowed_nodes: %d", ret);
258 
259 	pgsz = (int)get_page_size();
260 	SAFE_FILE_LINES_SCANF(PATH_MEMINFO, "Hugepagesize: %d", &hpsz);
261 
262 	SAFE_FILE_LINES_SCANF(PATH_MEMINFO, "MemFree: %ld", &memfree);
263 	tst_res(TINFO, "Free RAM %ld kB", memfree);
264 
265 	if (4 * hpsz > memfree)
266 		tst_brk(TBROK, "Not enough free RAM");
267 
268 	snprintf(path_hugepages_node1, sizeof(path_hugepages_node1),
269 		 "/sys/devices/system/node/node%u/hugepages/hugepages-%dkB/nr_hugepages",
270 		 node1, hpsz);
271 
272 	snprintf(path_hugepages_node2, sizeof(path_hugepages_node2),
273 		 "/sys/devices/system/node/node%u/hugepages/hugepages-%dkB/nr_hugepages",
274 		 node2, hpsz);
275 
276 	if (!access(path_hugepages_node1, F_OK)) {
277 		SAFE_FILE_SCANF(path_hugepages_node1,
278 				"%ld", &orig_hugepages_node1);
279 		tst_res(TINFO,
280 			"Increasing %dkB hugepages pool on node %u to %ld",
281 			hpsz, node1, orig_hugepages_node1 + 4);
282 		SAFE_FILE_PRINTF(path_hugepages_node1,
283 				 "%ld", orig_hugepages_node1 + 4);
284 	}
285 
286 	if (!access(path_hugepages_node2, F_OK)) {
287 		SAFE_FILE_SCANF(path_hugepages_node2,
288 				"%ld", &orig_hugepages_node2);
289 		tst_res(TINFO,
290 			"Increasing %dkB hugepages pool on node %u to %ld",
291 			hpsz, node2, orig_hugepages_node2 + 4);
292 		SAFE_FILE_PRINTF(path_hugepages_node2,
293 				 "%ld", orig_hugepages_node2 + 4);
294 	}
295 
296 	hpsz *= 1024;
297 
298 	if (orig_hugepages_node1 == -1 || orig_hugepages_node2 == -1) {
299 		SAFE_FILE_SCANF(PATH_NR_HUGEPAGES, "%ld", &orig_hugepages);
300 		tst_res(TINFO, "Increasing global hugepages pool to %ld",
301 			orig_hugepages + 8);
302 		SAFE_FILE_PRINTF(PATH_NR_HUGEPAGES, "%ld", orig_hugepages + 8);
303 	}
304 
305 	alloc_free_huge_on_node(node1, 4L * hpsz);
306 	alloc_free_huge_on_node(node2, 4L * hpsz);
307 }
308 
cleanup(void)309 static void cleanup(void)
310 {
311 	if (orig_hugepages != -1)
312 		SAFE_FILE_PRINTF(PATH_NR_HUGEPAGES, "%ld", orig_hugepages);
313 
314 	if (orig_hugepages_node1 != -1) {
315 		SAFE_FILE_PRINTF(path_hugepages_node1,
316 				 "%ld", orig_hugepages_node1);
317 	}
318 
319 	if (orig_hugepages_node2 != -1) {
320 		SAFE_FILE_PRINTF(path_hugepages_node2,
321 				 "%ld", orig_hugepages_node2);
322 	}
323 }
324 
325 static struct tst_test test = {
326 	.min_kver = "2.6.32",
327 	.needs_root = 1,
328 	.forks_child = 1,
329 	.setup = setup,
330 	.cleanup = cleanup,
331 	.test = do_test,
332 	.tcnt = ARRAY_SIZE(tcases),
333 	.tags = (const struct tst_tag[]) {
334 		{"linux-git", "e66f17ff7177"},
335 		{"linux-git", "c9d398fa2378"},
336 		{"linux-git", "4643d67e8cb0"},
337 		{}
338 	}
339 };
340 
341 #else
342 	TST_TEST_TCONF(NUMA_ERROR_MSG);
343 #endif
344