1 /*
2  * Copyright (C) 2012 Linux Test Project, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it would be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11  *
12  * Further, this software is distributed without any warranty that it
13  * is free of the rightful claim of any third person regarding
14  * infringement or the like.  Any license provided herein, whether
15  * implied or otherwise, applies only to this software file.  Patent
16  * licenses, if any, provided herein do not apply to combinations of
17  * this program with other software, or any other product whatsoever.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22  * 02110-1301, USA.
23  */
24 
25 /*
26  * use migrate_pages() and check that address is on correct node
27  * 1. process A can migrate its non-shared mem with CAP_SYS_NICE
28  * 2. process A can migrate its non-shared mem without CAP_SYS_NICE
29  * 3. process A can migrate shared mem only with CAP_SYS_NICE
30  * 4. process A can migrate non-shared mem in process B with same effective uid
31  * 5. process A can migrate non-shared mem in process B with CAP_SYS_NICE
32  */
33 #include <sys/types.h>
34 #include <sys/syscall.h>
35 #include <sys/wait.h>
36 #include <sys/mman.h>
37 #include <sys/prctl.h>
38 #include <errno.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <unistd.h>
42 #include <pwd.h>
43 
44 #include "tst_test.h"
45 #include "lapi/syscalls.h"
46 #include "numa_helper.h"
47 #include "migrate_pages_common.h"
48 
49 /*
50  * This is an estimated minimum of free mem required to migrate this
51  * process to another node as migrate_pages will fail if there is not
52  * enough free space on node. While running this test on x86_64
53  * it used ~2048 pages (total VM, not just RSS). Considering ia64 as
54  * architecture with largest (non-huge) page size (16k), this limit
55  * is set to 2048*16k == 32M.
56  */
57 #define NODE_MIN_FREEMEM (32*1024*1024)
58 
59 #ifdef HAVE_NUMA_V2
60 
/* Account the test processes switch to when they must run unprivileged. */
static const char nobody_uid[] = "nobody";

/* passwd entry of nobody_uid; resolved once in setup(). */
static struct passwd *ltpuser;

/* Allowed memory nodes and their count, filled in by setup(). */
static int *nodes;
static int num_nodes;

/* The two nodes chosen in setup() that pages are migrated between. */
static int nodeA;
static int nodeB;

/*
 * Files handed to the test library through tst_test.save_restore.
 * NOTE(review): the leading '?' is interpreted by the LTP library
 * (presumably "skip if the file is missing") - confirm against the
 * library documentation.
 */
static const char * const save_restore[] = {
	"?/proc/sys/kernel/numa_balancing",
	NULL,
};
70 
print_mem_stats(pid_t pid,int node)71 static void print_mem_stats(pid_t pid, int node)
72 {
73 	char s[64];
74 	long long node_size, freep;
75 
76 	if (pid == 0)
77 		pid = getpid();
78 
79 	tst_res(TINFO, "mem_stats pid: %d, node: %d", pid, node);
80 
81 	/* dump pid's VM info */
82 	sprintf(s, "cat /proc/%d/status", pid);
83 	system(s);
84 	sprintf(s, "cat /proc/%d/numa_maps", pid);
85 	system(s);
86 
87 	/* dump node free mem */
88 	node_size = numa_node_size64(node, &freep);
89 	tst_res(TINFO, "Node id: %d, size: %lld, free: %lld",
90 		 node, node_size, freep);
91 }
92 
migrate_to_node(pid_t pid,int node)93 static int migrate_to_node(pid_t pid, int node)
94 {
95 	unsigned long nodemask_size, max_node;
96 	unsigned long *old_nodes, *new_nodes;
97 	int i;
98 
99 	tst_res(TINFO, "pid(%d) migrate pid %d to node -> %d",
100 		 getpid(), pid, node);
101 	max_node = LTP_ALIGN(get_max_node(), sizeof(unsigned long)*8);
102 	nodemask_size = max_node / 8;
103 	old_nodes = SAFE_MALLOC(nodemask_size);
104 	new_nodes = SAFE_MALLOC(nodemask_size);
105 
106 	memset(old_nodes, 0, nodemask_size);
107 	memset(new_nodes, 0, nodemask_size);
108 	for (i = 0; i < num_nodes; i++)
109 		set_bit(old_nodes, nodes[i], 1);
110 	set_bit(new_nodes, node, 1);
111 
112 	TEST(tst_syscall(__NR_migrate_pages, pid, max_node, old_nodes,
113 		new_nodes));
114 	if (TST_RET != 0) {
115 		if (TST_RET < 0) {
116 			tst_res(TFAIL | TERRNO, "migrate_pages failed "
117 				 "ret: %ld, ", TST_RET);
118 			print_mem_stats(pid, node);
119 		} else {
120 			tst_res(TINFO, "migrate_pages could not migrate all "
121 				 "pages, not migrated: %ld", TST_RET);
122 		}
123 	}
124 	free(old_nodes);
125 	free(new_nodes);
126 	return TST_RET;
127 }
128 
addr_on_node(void * addr)129 static int addr_on_node(void *addr)
130 {
131 	int node;
132 	int ret;
133 
134 	ret = tst_syscall(__NR_get_mempolicy, &node, NULL, (unsigned long)0,
135 		      (unsigned long)addr, MPOL_F_NODE | MPOL_F_ADDR);
136 	if (ret == -1) {
137 		tst_res(TBROK | TERRNO, "error getting memory policy "
138 			 "for page %p", addr);
139 	}
140 	return node;
141 }
142 
check_addr_on_node(void * addr,int exp_node)143 static int check_addr_on_node(void *addr, int exp_node)
144 {
145 	int node;
146 
147 	node = addr_on_node(addr);
148 	if (node == exp_node) {
149 		tst_res(TPASS, "pid(%d) addr %p is on expected node: %d",
150 			 getpid(), addr, exp_node);
151 		return TPASS;
152 	} else {
153 		tst_res(TFAIL, "pid(%d) addr %p not on expected node: %d "
154 			 ", expected %d", getpid(), addr, node, exp_node);
155 		print_mem_stats(0, exp_node);
156 		return TFAIL;
157 	}
158 }
159 
test_migrate_current_process(int node1,int node2,int cap_sys_nice)160 static void test_migrate_current_process(int node1, int node2, int cap_sys_nice)
161 {
162 	char *private, *shared;
163 	int ret;
164 	pid_t child;
165 
166 	/* parent can migrate its non-shared memory */
167 	tst_res(TINFO, "current_process, cap_sys_nice: %d", cap_sys_nice);
168 	private =  SAFE_MMAP(NULL, getpagesize(), PROT_READ | PROT_WRITE,
169 		MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
170 	private[0] = 0;
171 	tst_res(TINFO, "private anonymous: %p", private);
172 
173 	migrate_to_node(0, node2);
174 	check_addr_on_node(private, node2);
175 	migrate_to_node(0, node1);
176 	check_addr_on_node(private, node1);
177 	SAFE_MUNMAP(private, getpagesize());
178 
179 	/* parent can migrate shared memory with CAP_SYS_NICE */
180 	shared = SAFE_MMAP(NULL, getpagesize(), PROT_READ | PROT_WRITE,
181 		      MAP_ANONYMOUS | MAP_SHARED, 0, 0);
182 	shared[0] = 1;
183 	tst_res(TINFO, "shared anonymous: %p", shared);
184 	migrate_to_node(0, node2);
185 	check_addr_on_node(shared, node2);
186 
187 	/* shared mem is on node2, try to migrate in child to node1 */
188 	fflush(stdout);
189 	child = SAFE_FORK();
190 	if (child == 0) {
191 		tst_res(TINFO, "child shared anonymous, cap_sys_nice: %d",
192 			 cap_sys_nice);
193 		private =  SAFE_MMAP(NULL, getpagesize(),
194 			PROT_READ | PROT_WRITE,
195 			MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
196 		private[0] = 1;
197 		shared[0] = 1;
198 		if (!cap_sys_nice)
199 			SAFE_SETEUID(ltpuser->pw_uid);
200 
201 		migrate_to_node(0, node1);
202 		/* child can migrate non-shared memory */
203 		ret = check_addr_on_node(private, node1);
204 
205 		exit(ret);
206 	}
207 
208 	SAFE_WAITPID(child, NULL, 0);
209 	if (cap_sys_nice)
210 		/* child can migrate shared memory only
211 		 * with CAP_SYS_NICE */
212 		check_addr_on_node(shared, node1);
213 	else
214 		check_addr_on_node(shared, node2);
215 	SAFE_MUNMAP(shared, getpagesize());
216 }
217 
test_migrate_other_process(int node1,int node2,int cap_sys_nice)218 static void test_migrate_other_process(int node1, int node2, int cap_sys_nice)
219 {
220 	char *private;
221 	int ret;
222 	pid_t child1, child2;
223 
224 	tst_res(TINFO, "other_process, cap_sys_nice: %d", cap_sys_nice);
225 
226 	fflush(stdout);
227 	child1 = SAFE_FORK();
228 	if (child1 == 0) {
229 		private =  SAFE_MMAP(NULL, getpagesize(),
230 			PROT_READ | PROT_WRITE,
231 			MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
232 		private[0] = 0;
233 
234 		/* make sure we are on node1 */
235 		migrate_to_node(0, node1);
236 		check_addr_on_node(private, node1);
237 
238 		SAFE_SETUID(ltpuser->pw_uid);
239 
240 		/* commit_creds() will clear dumpable, restore it */
241 		if (prctl(PR_SET_DUMPABLE, 1))
242 			tst_brk(TBROK | TERRNO, "prctl");
243 
244 		/* signal child2 it's OK to migrate child1 and wait */
245 		TST_CHECKPOINT_WAKE(0);
246 		TST_CHECKPOINT_WAIT(1);
247 
248 		/* child2 can migrate child1 process if it's privileged */
249 		/* child2 can migrate child1 process if it has same uid */
250 		ret = check_addr_on_node(private, node2);
251 
252 		exit(ret);
253 	}
254 
255 	fflush(stdout);
256 	child2 = SAFE_FORK();
257 	if (child2 == 0) {
258 		if (!cap_sys_nice)
259 			SAFE_SETUID(ltpuser->pw_uid);
260 
261 		/* wait until child1 is ready on node1, then migrate and
262 		 * signal to check current node */
263 		TST_CHECKPOINT_WAIT(0);
264 		migrate_to_node(child1, node2);
265 		TST_CHECKPOINT_WAKE(1);
266 
267 		exit(TPASS);
268 	}
269 
270 	SAFE_WAITPID(child1, NULL, 0);
271 	SAFE_WAITPID(child2, NULL, 0);
272 }
273 
run(void)274 static void run(void)
275 {
276 	test_migrate_current_process(nodeA, nodeB, 1);
277 	test_migrate_current_process(nodeA, nodeB, 0);
278 	test_migrate_other_process(nodeA, nodeB, 1);
279 	test_migrate_other_process(nodeA, nodeB, 0);
280 }
281 
setup(void)282 static void setup(void)
283 {
284 	int ret, i, j;
285 	int pagesize = getpagesize();
286 	void *p;
287 
288 	tst_syscall(__NR_migrate_pages, 0, 0, NULL, NULL);
289 
290 	if (numa_available() == -1)
291 		tst_brk(TCONF, "NUMA not available");
292 
293 	ret = get_allowed_nodes_arr(NH_MEMS, &num_nodes, &nodes);
294 	if (ret < 0)
295 		tst_brk(TBROK | TERRNO, "get_allowed_nodes(): %d", ret);
296 
297 	if (num_nodes < 2)
298 		tst_brk(TCONF, "at least 2 allowed NUMA nodes"
299 			 " are required");
300 	else if (tst_kvercmp(2, 6, 18) < 0)
301 		tst_brk(TCONF, "2.6.18 or greater kernel required");
302 
303 	FILE_PRINTF("/proc/sys/kernel/numa_balancing", "0");
304 	/*
305 	 * find 2 nodes, which can hold NODE_MIN_FREEMEM bytes
306 	 * The reason is that:
307 	 * 1. migrate_pages() is expected to succeed
308 	 * 2. this test avoids hitting:
309 	 *    Bug 870326 - migrate_pages() reports success, but pages are
310 	 *                 not moved to desired node
311 	 *    https://bugzilla.redhat.com/show_bug.cgi?id=870326
312 	 */
313 	nodeA = nodeB = -1;
314 	for (i = 0; i < num_nodes; i++) {
315 		p = numa_alloc_onnode(NODE_MIN_FREEMEM, nodes[i]);
316 		if (p == NULL)
317 			break;
318 		memset(p, 0xff, NODE_MIN_FREEMEM);
319 
320 		j = 0;
321 		while (j < NODE_MIN_FREEMEM) {
322 			if (addr_on_node(p + j) != nodes[i])
323 				break;
324 			j += pagesize;
325 		}
326 		numa_free(p, NODE_MIN_FREEMEM);
327 
328 		if (j >= NODE_MIN_FREEMEM) {
329 			if (nodeA == -1)
330 				nodeA = nodes[i];
331 			else if (nodeB == -1)
332 				nodeB = nodes[i];
333 			else
334 				break;
335 		}
336 	}
337 
338 	if (nodeA == -1 || nodeB == -1)
339 		tst_brk(TCONF, "at least 2 NUMA nodes with "
340 			 "free mem > %d are needed", NODE_MIN_FREEMEM);
341 	tst_res(TINFO, "Using nodes: %d %d", nodeA, nodeB);
342 
343 	ltpuser = getpwnam(nobody_uid);
344 	if (ltpuser == NULL)
345 		tst_brk(TBROK | TERRNO, "getpwnam failed");
346 }
347 
348 static struct tst_test test = {
349 	.needs_root = 1,
350 	.needs_checkpoints = 1,
351 	.forks_child = 1,
352 	.test_all = run,
353 	.setup = setup,
354 	.save_restore = save_restore,
355 };
356 #else
357 TST_TEST_TCONF(NUMA_ERROR_MSG);
358 #endif
359