1 /*
2  * cpuset user library implementation.
3  *
4  * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved.
5  *
6  * Paul Jackson <pj@sgi.com>
7  */
8 
9 /*
10  *  This program is free software; you can redistribute it and/or modify
11  *  it under the terms of the GNU Lesser General Public License as published by
12  *  the Free Software Foundation; either version 2.1 of the License, or
13  *  (at your option) any later version.
14  *
15  *  This program is distributed in the hope that it will be useful,
16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *  GNU Lesser General Public License for more details.
19  *
20  *  You should have received a copy of the GNU Lesser General Public License
21  *  along with this program; if not, write to the Free Software
22  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
23  */
24 
25 #define _XOPEN_SOURCE 500	/* need to see pread() */
26 #define _BSD_SOURCE 1		/* need to see syscall() */
27 #include <unistd.h>
28 
29 #include <ctype.h>
30 #include <dirent.h>
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <fts.h>
34 #include <limits.h>
35 #include <signal.h>
36 #include <stdint.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <sys/stat.h>
41 #include <sys/syscall.h>
42 #include <sys/types.h>
43 #include <time.h>
44 #include <utime.h>
45 #include <sys/utsname.h>	/* for cpuset_would_crash_kernel() */
46 
47 #include "bitmask.h"
48 #include "cpuset.h"
49 #include "common.h"
50 #include "test.h"
51 #include "linux_syscall_numbers.h"
52 #include "config.h"
53 #if HAVE_LINUX_MEMPOLICY_H
54 #include <linux/mempolicy.h>
55 
/* Bump version, and update Change History, when libcpuset API changes */
#define CPUSET_VERSION 3

/*
 * For a history of what changed in each version, see the "Change
 * History" section, at the end of the libcpuset master document.
 */

/* Report the libcpuset API version this library was built with. */
int cpuset_version(void)
{
	const int api_version = CPUSET_VERSION;

	return api_version;
}
68 
/*
 * In-memory description of one cpuset: its CPU and Memory Node masks,
 * its option values, and per-field bookkeeping flags.
 */
struct cpuset {
	/* Resource masks */
	struct bitmask *cpus;
	struct bitmask *mems;

	/* Option values (boolean options are stored as 0 or 1) */
	char cpu_exclusive;
	char mem_exclusive;
	char mem_hardwall;
	char notify_on_release;
	char memory_migrate;
	char memory_pressure_enabled;
	char memory_spread_page;
	char memory_spread_slab;
	char sched_load_balance;
	int sched_relax_domain_level;

	/*
	 * Every field 'x' above has a matching 'x_valid' flag here.
	 * apply_cpuset_settings() only writes out the fields whose
	 * *_valid flag is set.  cpuset_alloc() leaves all of them
	 * clear (courtesy of calloc()), and each cpuset_set*() routine
	 * raises the flag for the value it stores.
	 *
	 * This keeps a newly created cpuset from clobbering a non-zero
	 * kernel default, such as an inherited memory_spread_* flag,
	 * merely because the application never touched the zeroed
	 * field handed back by cpuset_alloc().
	 *
	 * Using 'char' for the values above but bitfields for the
	 * flags below is somewhat capricious.
	 */
	unsigned int cpus_valid : 1;
	unsigned int mems_valid : 1;
	unsigned int cpu_exclusive_valid : 1;
	unsigned int mem_exclusive_valid : 1;
	unsigned int mem_hardwall_valid : 1;
	unsigned int notify_on_release_valid : 1;
	unsigned int memory_migrate_valid : 1;
	unsigned int memory_pressure_enabled_valid : 1;
	unsigned int memory_spread_page_valid : 1;
	unsigned int memory_spread_slab_valid : 1;
	unsigned int sched_load_balance_valid : 1;
	unsigned int sched_relax_domain_level_valid : 1;

	/*
	 * Every field 'x' above also has an 'x_dirty' flag, which
	 * marks that the corresponding value has been modified.
	 */
	unsigned int cpus_dirty : 1;
	unsigned int mems_dirty : 1;
	unsigned int cpu_exclusive_dirty : 1;
	unsigned int mem_exclusive_dirty : 1;
	unsigned int mem_hardwall_dirty : 1;
	unsigned int notify_on_release_dirty : 1;
	unsigned int memory_migrate_dirty : 1;
	unsigned int memory_pressure_enabled_dirty : 1;
	unsigned int memory_spread_page_dirty : 1;
	unsigned int memory_spread_slab_dirty : 1;
	unsigned int sched_load_balance_dirty : 1;
	unsigned int sched_relax_domain_level_dirty : 1;
};
131 
/* Presumed cpuset file system mount point */
static const char *cpusetmnt = "/dev/cpuset";

/* Stashed copy of cpunodemap[], mapping each cpu to its node. */
static const char *mapfile = "/var/run/cpunodemap";

/* The primary source for the cpunodemap[] is available below here. */
static const char *sysdevices = "/sys/devices/system";

/*
 * Larger / smaller of two values.  Each argument may be evaluated
 * more than once, so do not pass expressions with side effects.
 */
#define max(a,b) ((a) > (b) ? (a) : (b))
#define min(a,b) ((a) < (b) ? (a) : (b))

/* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
#define SMALL_BUFSZ 16
146 
/*
 * The 'mask_size_file' is used to ferret out the kernel cpumask_t
 * and nodemask_t sizes.  The lines in this file that begin with the
 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
 * and nodemask string, respectively.  The lengths of these strings
 * reflect the kernel's internal cpumask_t and nodemask_t sizes,
 * which sizes are needed to correctly call the sched_setaffinity
 * and set_mempolicy system calls, and to size user level
 * bitmasks to match the kernel's.
 */

static const char *mask_size_file = "/proc/self/status";
static const char *cpumask_prefix = "Cpus_allowed:\t";
static const char *nodemask_prefix = "Mems_allowed:\t";

/*
 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
 *
 * The first time we need these, we parse the Cpus_allowed and
 * Mems_allowed lines from mask_size_file ("/proc/self/status").
 * A value of zero means "not determined yet".
 */

static int cpumask_sz;
static int nodemask_sz;

/*
 * These defaults only kick in if we fail to size the kernel
 * cpumask and nodemask by reading the Cpus_allowed and
 * Mems_allowed fields from the /proc/self/status file.
 */

#define DEFCPUBITS (512)
#define DEFNODEBITS (DEFCPUBITS/2)
180 
/*
 * Arch-neutral API for obtaining NUMA distances between CPUs
 * and Memory Nodes, via the files:
 *	/sys/devices/system/node/nodeN/distance
 * which have lines such as:
 *	46 66 10 20
 * which say that for cpu on node N (from the path above), the
 * distance to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
 * respectively.
 */

static const char *distance_directory = "/sys/devices/system/node";

/*
 * Someday, we should disable, then later discard, the SN code
 * marked ALTERNATE_SN_DISTMAP.
 */

#define ALTERNATE_SN_DISTMAP 1
#ifdef ALTERNATE_SN_DISTMAP

/*
 * Alternative SN (SGI ia64) architecture specific API for obtaining
 * NUMA distances between CPUs and Memory Nodes is via the file
 * /proc/sgi_sn/sn_topology, which has lines such as:
 *
 *   node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
 *
 * which says that for each CPU on node 2, the distance to nodes
 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
 *
 * This file has other lines as well, which start with other
 * keywords than "node".  Ignore these other lines.
 */

static const char *sn_topology = "/proc/sgi_sn/sn_topology";
static const char *sn_top_node_prefix = "node ";

#endif
220 
/*
 * Check that cpusets are supported and that /dev/cpuset is mounted.
 *
 * The probe is performed once; its outcome is remembered in
 * check_state and replayed on every later call.
 *
 * Returns 0 if all is well, otherwise -1 with errno set to:
 *	ENOSYS - kernel doesn't support cpusets
 *	ENODEV - /dev/cpuset not mounted
 */

static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

static int check()
{
	struct stat sb;

	if (check_state == check_notdone) {
		if (stat("/proc/self/cpuset", &sb) < 0)
			check_state = check_enosys;
		else if (stat("/dev/cpuset/tasks", &sb) < 0)
			check_state = check_enodev;
		else
			check_state = check_ok;
	}

	if (check_state == check_enosys) {
		errno = ENOSYS;
		return -1;
	}
	if (check_state == check_enodev) {
		errno = ENODEV;
		return -1;
	}
	return 0;
}
266 
/*
 * Strip every trailing '\n' and '\r' from string s, in place.
 *
 * Works from a length rather than a pointer to the last character,
 * so an empty string never forms a pointer before the start of s.
 */
static void chomp(char *s)
{
	size_t len = strlen(s);

	while (len > 0 && (s[len - 1] == '\n' || s[len - 1] == '\r'))
		s[--len] = '\0';
}
278 
/*
 * Determine the number of bytes in a seekable open file, without
 * assuming that stat(2) on that file reports a useful size.
 * Has the side effect of leaving the file rewound to the beginning.
 */
static int filesize(FILE * fp)
{
	int nbytes;

	rewind(fp);
	/* Count by reading one character at a time until EOF. */
	for (nbytes = 0; fgetc(fp) != EOF; nbytes++)
		continue;
	rewind(fp);
	return nbytes;
}
293 
/* Return 1 if strings s1 and s2 have identical contents, else 0. */
static int streq(const char *s1, const char *s2)
{
	return !strcmp(s1, s2);
}
299 
/* Return 1 if string 's' begins with string 'pre', else 0. */
static int strprefix(const char *s, const char *pre)
{
	size_t prelen = strlen(pre);

	return !strncmp(s, pre, prelen);
}
305 
/*
 * char *flgets(char *buf, int buflen, FILE *fp)
 *
 * Obtain one line from input file fp.  Copy up to the first
 * buflen-1 chars of the line into buffer buf, discarding any
 * remainder of the line.  Stop reading at newline, discarding the
 * newline.  Nul terminate the result and return pointer to buffer
 * buf on success, or NULL if nothing could be stored (end of file,
 * read failure, or buflen too small to hold even one character).
 */

static char *flgets(char *buf, int buflen, FILE * fp)
{
	int c = -1;
	char *bp = buf;

	while ((--buflen > 0) && ((c = getc(fp)) >= 0)) {
		if (c == '\n') {
			*bp = '\0';
			return buf;
		}
		*bp++ = c;
	}
	if ((c < 0) && (bp == buf))
		return NULL;

	/*
	 * c >= 0 here means the buffer filled up before the line ended,
	 * so the rest of the line must be thrown away.  Testing "c > 0"
	 * would wrongly skip this when the last byte stored was a NUL.
	 */
	if (c >= 0) {
		while ((c = getc(fp)) >= 0) {
			if (c == '\n')
				break;
		}
	}

	*bp = '\0';
	return buf;
}
341 
/*
 * sgetc(const char *inputbuf, int *offsetptr)
 *
 * Return next char from nul-terminated input buffer inputbuf,
 * starting at offset *offsetptr.  Increment *offsetptr.
 * If next char would be nul ('\0'), return EOF and don't
 * increment *offsetptr.
 *
 * Like getc(), the character is returned as an unsigned char
 * converted to int, so bytes with the high bit set are never
 * negative and cannot be mistaken for EOF.
 */

static int sgetc(const char *inputbuf, int *offsetptr)
{
	unsigned char c = (unsigned char)inputbuf[*offsetptr];

	if (c == '\0')
		return EOF;
	*offsetptr = *offsetptr + 1;
	return c;
}
362 
/*
 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
 *
 * String flavored line reader: fetch the next line from the
 * nul-terminated buffer 'inputbuf', beginning at offset *offsetptr
 * and advancing *offsetptr past whatever is consumed.  At most
 * buflen-1 chars of the line are stored in buf; the rest of an
 * over-long line is consumed and dropped.  The newline itself is
 * consumed but not stored.  The result is nul terminated.
 *
 * Returns buf on success, or NULL if nothing more could be read.
 */

static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
	char *out = buf;
	int room = buflen;
	int ch = -1;

	for (;;) {
		if (--room <= 0)
			break;		/* output buffer is full */
		ch = sgetc(inputbuf, offsetptr);
		if (ch < 0)
			break;		/* end of input */
		if (ch == '\n') {
			*out = '\0';
			return buf;
		}
		*out++ = ch;
	}

	if (ch < 0 && out == buf)
		return NULL;

	/* Buffer filled mid-line: swallow the remainder of the line. */
	if (ch > 0) {
		do {
			ch = sgetc(inputbuf, offsetptr);
		} while (ch >= 0 && ch != '\n');
	}

	*out = '\0';
	return buf;
}
399 
/*
 * time_t get_mtime(const char *path)
 *
 * Return the modification time of the file at 'path', or 0 if
 * the file can not be stat'd.
 */

static time_t get_mtime(const char *path)
{
	struct stat sb;

	return (stat(path, &sb) == 0) ? sb.st_mtime : 0;
}
414 
/*
 * int set_mtime(const char *path, time_t mtime)
 *
 * Stamp file 'path' with time 'mtime'.  Both the access time and
 * the modification time handed to utime() are set to 'mtime'.
 * Return 0 on success, or -1 on error, setting errno.
 */

static int set_mtime(const char *path, time_t mtime)
{
	struct utimbuf stamp;

	stamp.modtime = mtime;
	stamp.actime = mtime;
	return utime(path, &stamp);
}
430 
/*
 * Do two pathnames resolve to the same file?
 *
 * Returns 1 if both paths can be stat'd and share the same device
 * and inode numbers.  Returns 0 if either stat fails or the two
 * paths name different files.
 */

static int samefile(const char *path1, const char *path2)
{
	struct stat first, second;

	if (stat(path1, &first) != 0 || stat(path2, &second) != 0)
		return 0;
	if (first.st_ino != second.st_ino)
		return 0;
	return first.st_dev == second.st_dev;
}
447 
/* Character classes used by pathcomp() only; undefined again below. */
#define PC_SEP(ch) ((ch) == '/')
#define PC_END(ch) (PC_SEP(ch) || (ch) == '\0')

/*
 * In place path compression.  Remove extra dots and slashes.
 *
 * Runs of slashes collapse to a single slash, "." components are
 * dropped, and a trailing slash is removed.  A leading slash is
 * preserved.  If nothing is left, the result is ".".  ".." components
 * are copied through unchanged.  NULL and "" are returned untouched.
 *
 * The result is never longer than the input.  Returns p.
 */
static char *pathcomp(char *p)
{
	char *src = p;		/* next character to examine */
	char *dst = p;		/* next position to write */

	if (p == NULL || *p == '\0')
		return p;

	/* An absolute path keeps exactly one leading slash. */
	if (PC_SEP(*src))
		*dst++ = *src++;

	while (1) {
		/* Skip over any run of separators. */
		while (PC_SEP(*src))
			src++;

		if (*src == '\0')
			break;

		/* A lone "." component contributes nothing. */
		if (src[0] == '.' && PC_END(src[1])) {
			src++;
			continue;
		}

		/* Separate from the previous component, then copy this one. */
		if (dst != p && !PC_SEP(dst[-1]))
			*dst++ = '/';
		while (!PC_END(*src))
			*dst++ = *src++;
	}

	/* Everything was squeezed out: the path means "here". */
	if (dst == p)
		*dst++ = '.';
	*dst = '\0';
	return p;
}

#undef PC_SEP
#undef PC_END
485 
/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Format "name1/name2" into buf (at most buflen bytes, including
 * the terminating nul), compress the result in place with
 * pathcomp(), and return buf.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	(void)snprintf(buf, buflen, "%s/%s", name1, name2);
	pathcomp(buf);
	return buf;
}
498 
/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Format "name1/name2/name3" into buf (at most buflen bytes,
 * including the terminating nul), compress the result in place
 * with pathcomp(), and return buf.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	(void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	pathcomp(buf);
	return buf;
}
511 
512 /*
513  * fullpath(buf, buflen, name)
514  *
515  * Put full path of cpuset 'name' in buffer 'buf'.  If name
516  * starts with a slash (``/``) character, then this a path
517  * relative to ``/dev/cpuset``, otherwise it is relative to
518  * the current tasks cpuset.  Return 0 on success, else
519  * -1 on error, setting errno.
520  */
521 
fullpath(char * buf,int buflen,const char * name)522 static int fullpath(char *buf, int buflen, const char *name)
523 {
524 	int len;
525 
526 	/* easy case */
527 	if (*name == '/') {
528 		pathcat2(buf, buflen, cpusetmnt, name);
529 		pathcomp(buf);
530 		return 0;
531 	}
532 
533 	/* hard case */
534 	snprintf(buf, buflen, "%s/", cpusetmnt);
535 	len = strlen(buf);
536 	if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
537 		return -1;
538 	if (strlen(buf) >= buflen - 1 - strlen(name)) {
539 		errno = E2BIG;
540 		return -1;
541 	}
542 	strcat(buf, "/");
543 	strcat(buf, name);
544 	pathcomp(buf);
545 	return 0;
546 }
547 
/*
 * fullpath2(buf, buflen, name1, name2)
 *
 * Like fullpath(), only concatenate two pathname components on end.
 * Return 0 on success, else -1 on error, setting errno (E2BIG if
 * the result would not fit in buflen bytes).
 */

static int fullpath2(char *buf, int buflen, const char *name1,
		     const char *name2)
{
	if (fullpath(buf, buflen, name1) < 0)
		return -1;
	/*
	 * Need room for buf + "/" + name2 + nul.  Summing the lengths
	 * (rather than subtracting from buflen) keeps the unsigned
	 * arithmetic from wrapping when name2 is longer than the buffer.
	 */
	if (buflen <= 0 || strlen(buf) + 1 + strlen(name2) >= (size_t)buflen) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name2);
	pathcomp(buf);
	return 0;
}
568 
/*
 * Convert the string length of an ascii hex mask to the number
 * of bits represented by that mask.
 *
 * The cpumask and nodemask values in /proc/self/status are in an
 * ascii format that uses 9 characters for each 32 bits of mask,
 * so the bit count is the string length scaled by 32/9.
 */
static int s2nbits(const char *s)
{
	size_t nchars = strlen(s);

	return nchars * 32 / 9;
}
580 
update_mask_sizes()581 static void update_mask_sizes()
582 {
583 	FILE *fp = NULL;
584 	char *buf = NULL;
585 	int fsize;
586 
587 	if ((fp = fopen(mask_size_file, "r")) == NULL)
588 		goto done;
589 	fsize = filesize(fp);
590 	if ((buf = malloc(fsize)) == NULL)
591 		goto done;
592 
593 	/*
594 	 * Beware: mask sizing arithmetic is fussy.
595 	 * The trailing newline left by fgets() is required.
596 	 */
597 	while (fgets(buf, fsize, fp)) {
598 		if (strprefix(buf, cpumask_prefix))
599 			cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
600 		if (strprefix(buf, nodemask_prefix))
601 			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
602 	}
603 done:
604 	free(buf);
605 	if (fp != NULL)
606 		fclose(fp);
607 	if (cpumask_sz == 0)
608 		cpumask_sz = DEFCPUBITS;
609 	if (nodemask_sz == 0)
610 		nodemask_sz = DEFNODEBITS;
611 }
612 
613 /* Allocate a new struct cpuset */
cpuset_alloc()614 struct cpuset *cpuset_alloc()
615 {
616 	struct cpuset *cp = NULL;
617 	int nbits;
618 
619 	if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
620 		goto err;
621 
622 	nbits = cpuset_cpus_nbits();
623 	if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
624 		goto err;
625 
626 	nbits = cpuset_mems_nbits();
627 	if ((cp->mems = bitmask_alloc(nbits)) == NULL)
628 		goto err;
629 
630 	return cp;
631 err:
632 	if (cp && cp->cpus)
633 		bitmask_free(cp->cpus);
634 	if (cp && cp->mems)
635 		bitmask_free(cp->mems);
636 	free(cp);
637 	return NULL;
638 }
639 
640 /* Free struct cpuset *cp */
cpuset_free(struct cpuset * cp)641 void cpuset_free(struct cpuset *cp)
642 {
643 	if (!cp)
644 		return;
645 	if (cp->cpus)
646 		bitmask_free(cp->cpus);
647 	if (cp->mems)
648 		bitmask_free(cp->mems);
649 	free(cp);
650 }
651 
652 /* Number of bits in a CPU bitmask on current system */
cpuset_cpus_nbits()653 int cpuset_cpus_nbits()
654 {
655 	if (cpumask_sz == 0)
656 		update_mask_sizes();
657 	return cpumask_sz;
658 }
659 
660 /* Number of bits in a Memory bitmask on current system */
cpuset_mems_nbits()661 int cpuset_mems_nbits()
662 {
663 	if (nodemask_sz == 0)
664 		update_mask_sizes();
665 	return nodemask_sz;
666 }
667 
668 /* Set CPUs in cpuset cp to bitmask cpus */
cpuset_setcpus(struct cpuset * cp,const struct bitmask * cpus)669 int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
670 {
671 	if (cp->cpus)
672 		bitmask_free(cp->cpus);
673 	cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
674 	if (cp->cpus == NULL)
675 		return -1;
676 	bitmask_copy(cp->cpus, cpus);
677 	cp->cpus_valid = 1;
678 	cp->cpus_dirty = 1;
679 	return 0;
680 }
681 
682 /* Set Memory Nodes in cpuset cp to bitmask mems */
cpuset_setmems(struct cpuset * cp,const struct bitmask * mems)683 int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems)
684 {
685 	if (cp->mems)
686 		bitmask_free(cp->mems);
687 	cp->mems = bitmask_alloc(bitmask_nbits(mems));
688 	if (cp->mems == NULL)
689 		return -1;
690 	bitmask_copy(cp->mems, mems);
691 	cp->mems_valid = 1;
692 	cp->mems_dirty = 1;
693 	return 0;
694 }
695 
696 /* Set integer value optname of cpuset cp */
cpuset_set_iopt(struct cpuset * cp,const char * optionname,int value)697 int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value)
698 {
699 	if (streq(optionname, "cpu_exclusive")) {
700 		cp->cpu_exclusive = ! !value;
701 		cp->cpu_exclusive_valid = 1;
702 		cp->cpu_exclusive_dirty = 1;
703 	} else if (streq(optionname, "mem_exclusive")) {
704 		cp->mem_exclusive = ! !value;
705 		cp->mem_exclusive_valid = 1;
706 		cp->mem_exclusive_dirty = 1;
707 	} else if (streq(optionname, "mem_hardwall")) {
708 		cp->mem_hardwall = ! !value;
709 		cp->mem_hardwall_valid = 1;
710 		cp->mem_hardwall_dirty = 1;
711 	} else if (streq(optionname, "notify_on_release")) {
712 		cp->notify_on_release = ! !value;
713 		cp->notify_on_release_valid = 1;
714 		cp->notify_on_release_dirty = 1;
715 	} else if (streq(optionname, "memory_pressure_enabled")) {
716 		cp->memory_pressure_enabled = ! !value;
717 		cp->memory_pressure_enabled_valid = 1;
718 		cp->memory_pressure_enabled_dirty = 1;
719 	} else if (streq(optionname, "memory_migrate")) {
720 		cp->memory_migrate = ! !value;
721 		cp->memory_migrate_valid = 1;
722 		cp->memory_migrate_dirty = 1;
723 	} else if (streq(optionname, "memory_spread_page")) {
724 		cp->memory_spread_page = ! !value;
725 		cp->memory_spread_page_valid = 1;
726 		cp->memory_spread_page_dirty = 1;
727 	} else if (streq(optionname, "memory_spread_slab")) {
728 		cp->memory_spread_slab = ! !value;
729 		cp->memory_spread_slab_valid = 1;
730 		cp->memory_spread_slab_dirty = 1;
731 	} else if (streq(optionname, "sched_load_balance")) {
732 		cp->sched_load_balance = ! !value;
733 		cp->sched_load_balance_valid = 1;
734 		cp->sched_load_balance_dirty = 1;
735 	} else if (streq(optionname, "sched_relax_domain_level")) {
736 		cp->sched_relax_domain_level = value;
737 		cp->sched_relax_domain_level_valid = 1;
738 		cp->sched_relax_domain_level_dirty = 1;
739 	} else
740 		return -2;	/* optionname not recognized */
741 	return 0;
742 }
743 
744 /* [optional] Set string value optname */
cpuset_set_sopt(UNUSED struct cpuset * cp,UNUSED const char * optionname,UNUSED const char * value)745 int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname,
746 		    UNUSED const char *value)
747 {
748 	return -2;		/* For now, all string options unrecognized */
749 }
750 
751 /* Return handle for reading memory_pressure. */
cpuset_open_memory_pressure(const char * cpusetpath)752 int cpuset_open_memory_pressure(const char *cpusetpath)
753 {
754 	char buf[PATH_MAX];
755 
756 	fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure");
757 	return open(buf, O_RDONLY);
758 }
759 
760 /* Return current memory_pressure of cpuset. */
cpuset_read_memory_pressure(int han)761 int cpuset_read_memory_pressure(int han)
762 {
763 	char buf[SMALL_BUFSZ];
764 
765 	if (pread(han, buf, sizeof(buf), 0L) < 0)
766 		return -1;
767 	return atoi(buf);
768 }
769 
/*
 * Close handle 'han' for reading memory pressure.
 * Any error reported by close() is ignored.
 */
void cpuset_close_memory_pressure(int han)
{
	(void)close(han);
}
775 
/*
 * Resolve cpuset pointer (to that of current task if cp == NULL).
 *
 * A non-NULL cp is handed straight back and *cp_tofree is not
 * touched.  For a NULL cp, a temporary cpuset is allocated and
 * filled in from the current task (pid 0); it is returned and
 * also stored in *cp_tofree so that the caller can free it later.
 *
 * Return NULL and set errno on error.  Errors can occur when
 * resolving the current tasks cpuset; the temporary cpuset is
 * freed here in that case.
 */
static const struct cpuset *resolve_cp(const struct cpuset *cp,
				       struct cpuset **cp_tofree)
{
	struct cpuset *tmp;

	if (cp != NULL)
		return cp;

	tmp = cpuset_alloc();
	if (tmp == NULL)
		return NULL;

	if (cpuset_cpusetofpid(tmp, 0) < 0) {
		cpuset_free(tmp);
		return NULL;
	}

	*cp_tofree = tmp;
	return tmp;
}
808 
809 /* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
cpuset_getcpus(const struct cpuset * cp,struct bitmask * cpus)810 int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
811 {
812 	struct cpuset *cp_tofree = NULL;
813 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
814 
815 	if (!cp1)
816 		goto err;
817 	if (cp1->cpus == NULL) {
818 		errno = EINVAL;
819 		goto err;
820 	}
821 	bitmask_copy(cpus, cp1->cpus);
822 	cpuset_free(cp_tofree);
823 	return 0;
824 err:
825 	cpuset_free(cp_tofree);
826 	return -1;
827 }
828 
829 /* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
cpuset_getmems(const struct cpuset * cp,struct bitmask * mems)830 int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
831 {
832 	struct cpuset *cp_tofree = NULL;
833 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
834 
835 	if (!cp1)
836 		goto err;
837 	if (cp1->mems == NULL) {
838 		errno = EINVAL;
839 		goto err;
840 	}
841 	bitmask_copy(mems, cp1->mems);
842 	cpuset_free(cp_tofree);
843 	return 0;
844 err:
845 	cpuset_free(cp_tofree);
846 	return -1;
847 }
848 
849 /* Return number of CPUs in cpuset cp (current task if cp == NULL) */
cpuset_cpus_weight(const struct cpuset * cp)850 int cpuset_cpus_weight(const struct cpuset *cp)
851 {
852 	struct cpuset *cp_tofree = NULL;
853 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
854 	int w = -1;
855 
856 	if (!cp1)
857 		goto err;
858 	if (cp1->cpus == NULL) {
859 		errno = EINVAL;
860 		goto err;
861 	}
862 	w = bitmask_weight(cp1->cpus);
863 	/* fall into ... */
864 err:
865 	cpuset_free(cp_tofree);
866 	return w;
867 }
868 
869 /* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
cpuset_mems_weight(const struct cpuset * cp)870 int cpuset_mems_weight(const struct cpuset *cp)
871 {
872 	struct cpuset *cp_tofree = NULL;
873 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
874 	int w = -1;
875 
876 	if (!cp1)
877 		goto err;
878 	if (cp1->mems == NULL) {
879 		errno = EINVAL;
880 		goto err;
881 	}
882 	w = bitmask_weight(cp1->mems);
883 	/* fall into ... */
884 err:
885 	cpuset_free(cp_tofree);
886 	return w;
887 }
888 
889 /* Return integer value of option optname in cp */
cpuset_get_iopt(const struct cpuset * cp,const char * optionname)890 int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
891 {
892 	if (streq(optionname, "cpu_exclusive"))
893 		return cp->cpu_exclusive;
894 	else if (streq(optionname, "mem_exclusive"))
895 		return cp->mem_exclusive;
896 	else if (streq(optionname, "mem_hardwall"))
897 		return cp->mem_hardwall;
898 	else if (streq(optionname, "notify_on_release"))
899 		return cp->notify_on_release;
900 	else if (streq(optionname, "memory_pressure_enabled"))
901 		return cp->memory_pressure_enabled;
902 	else if (streq(optionname, "memory_migrate"))
903 		return cp->memory_migrate;
904 	else if (streq(optionname, "memory_spread_page"))
905 		return cp->memory_spread_page;
906 	else if (streq(optionname, "memory_spread_slab"))
907 		return cp->memory_spread_slab;
908 	else if (streq(optionname, "sched_load_balance"))
909 		return cp->sched_load_balance;
910 	else if (streq(optionname, "sched_relax_domain_level"))
911 		return cp->sched_relax_domain_level;
912 	else
913 		return -2;	/* optionname not recognized */
914 }
915 
916 /* [optional] Return string value of optname */
cpuset_get_sopt(UNUSED const struct cpuset * cp,UNUSED const char * optionname)917 const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
918 			    UNUSED const char *optionname)
919 {
920 	return NULL;		/* For now, all string options unrecognized */
921 }
922 
read_flag(const char * filepath,char * flagp)923 static int read_flag(const char *filepath, char *flagp)
924 {
925 	char buf[SMALL_BUFSZ];	/* buffer a "0" or "1" flag line */
926 	int fd = -1;
927 
928 	if ((fd = open(filepath, O_RDONLY)) < 0)
929 		goto err;
930 	if (read(fd, buf, sizeof(buf)) < 1)
931 		goto err;
932 	if (atoi(buf))
933 		*flagp = 1;
934 	else
935 		*flagp = 0;
936 	close(fd);
937 	return 0;
938 err:
939 	if (fd >= 0)
940 		close(fd);
941 	return -1;
942 }
943 
load_flag(const char * path,char * flagp,const char * flag)944 static int load_flag(const char *path, char *flagp, const char *flag)
945 {
946 	char buf[PATH_MAX];
947 
948 	pathcat2(buf, sizeof(buf), path, flag);
949 	return read_flag(buf, flagp);
950 }
951 
read_number(const char * filepath,int * numberp)952 static int read_number(const char *filepath, int *numberp)
953 {
954 	char buf[SMALL_BUFSZ];
955 	int fd = -1;
956 
957 	if ((fd = open(filepath, O_RDONLY)) < 0)
958 		goto err;
959 	if (read(fd, buf, sizeof(buf)) < 1)
960 		goto err;
961 	*numberp = atoi(buf);
962 	close(fd);
963 	return 0;
964 err:
965 	if (fd >= 0)
966 		close(fd);
967 	return -1;
968 }
969 
load_number(const char * path,int * numberp,const char * file)970 static int load_number(const char *path, int *numberp, const char *file)
971 {
972 	char buf[PATH_MAX];
973 
974 	pathcat2(buf, sizeof(buf), path, file);
975 	return read_number(buf, numberp);
976 }
977 
/*
 * Read a bitmask, in list format, from the first line of the file
 * at 'filepath'.
 *
 * A fresh bitmask of 'nbits' bits is allocated and, unless the line
 * is empty, filled in by bitmask_parselist().  On success any
 * bitmask previously held in *bmpp is freed, *bmpp is pointed at
 * the new one, and 0 is returned.  On any failure -1 is returned
 * and *bmpp is left alone.
 */
static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits)
{
	FILE *fp = NULL;
	char *line = NULL;
	struct bitmask *parsed = NULL;
	int linesz;
	int rc = -1;

	fp = fopen(filepath, "r");
	if (fp == NULL)
		goto out;

	linesz = filesize(fp) + 1;	/* + 1 for nul term */
	line = malloc(linesz);
	if (line == NULL)
		goto out;
	if (flgets(line, linesz, fp) == NULL)
		goto out;
	fclose(fp);
	fp = NULL;

	parsed = bitmask_alloc(nbits);
	if (parsed == NULL)
		goto out;
	/* An empty line means an empty mask: nothing to parse. */
	if (line[0] != '\0' && bitmask_parselist(line, parsed) < 0)
		goto out;

	if (*bmpp != NULL)
		bitmask_free(*bmpp);
	*bmpp = parsed;
	parsed = NULL;		/* now owned by the caller */
	rc = 0;
out:
	if (line != NULL)
		free(line);
	if (fp != NULL)
		fclose(fp);
	if (parsed != NULL)
		bitmask_free(parsed);
	return rc;
}
1014 
load_mask(const char * path,struct bitmask ** bmpp,int nbits,const char * mask)1015 static int load_mask(const char *path, struct bitmask **bmpp,
1016 		     int nbits, const char *mask)
1017 {
1018 	char buf[PATH_MAX];
1019 
1020 	pathcat2(buf, sizeof(buf), path, mask);
1021 	return read_mask(buf, bmpp, nbits);
1022 }
1023 
/*
 * Write string to file at given filepath.  Create or truncate file.
 *
 * The string is handed to the file in a single write() call.
 * Returns 0 if all of it was written, else -1 with errno set
 * (EIO if write() accepted only part of the string).
 */
static int write_string_file(const char *filepath, const char *str)
{
	size_t len = strlen(str);
	ssize_t nwritten;
	int saved_errno;
	int fd;

	/* O_TRUNC so a shorter string never leaves stale bytes behind */
	fd = open(filepath, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return -1;

	nwritten = write(fd, str, len);
	if (nwritten < 0) {
		/* keep write()'s errno across the close() */
		saved_errno = errno;
		close(fd);
		errno = saved_errno;
		return -1;
	}
	close(fd);

	/* A short write must not be reported as success */
	if ((size_t)nwritten != len) {
		errno = EIO;
		return -1;
	}
	return 0;
}
1040 
/*
 * Format bitmask 'bmp' as a list string in a freshly malloc'd
 * buffer of exactly the needed size.  Returns the buffer, which
 * the caller must free, or NULL if the allocation fails.
 */
static char *sprint_mask_buf(const struct bitmask *bmp)
{
	char probe;
	char *text;
	int textsz;

	/* Dry run into a one byte buffer, just to learn the length */
	textsz = bitmask_displaylist(&probe, 1, bmp) + 1;	/* "+ 1" for nul */
	text = malloc(textsz);
	if (text != NULL)
		bitmask_displaylist(text, textsz, bmp);
	return text;
}
1055 
exists_flag(const char * path,const char * flag)1056 static int exists_flag(const char *path, const char *flag)
1057 {
1058 	char buf[PATH_MAX];
1059 	struct stat statbuf;
1060 	int rc;
1061 
1062 	pathcat2(buf, sizeof(buf), path, flag);
1063 	rc = (stat(buf, &statbuf) == 0);
1064 	errno = 0;
1065 	return rc;
1066 }
1067 
store_flag(const char * path,const char * flag,int val)1068 static int store_flag(const char *path, const char *flag, int val)
1069 {
1070 	char buf[PATH_MAX];
1071 
1072 	pathcat2(buf, sizeof(buf), path, flag);
1073 	return write_string_file(buf, val ? "1" : "0");
1074 }
1075 
store_number(const char * path,const char * file,int val)1076 static int store_number(const char *path, const char *file, int val)
1077 {
1078 	char buf[PATH_MAX];
1079 	char data[SMALL_BUFSZ];
1080 
1081 	memset(data, 0, sizeof(data));
1082 	pathcat2(buf, sizeof(buf), path, file);
1083 	snprintf(data, sizeof(data), "%d", val);
1084 	return write_string_file(buf, data);
1085 }
1086 
store_mask(const char * path,const char * mask,const struct bitmask * bmp)1087 static int store_mask(const char *path, const char *mask,
1088 		      const struct bitmask *bmp)
1089 {
1090 	char maskpath[PATH_MAX];
1091 	char *bp = NULL;
1092 	int rc;
1093 
1094 	if (bmp == NULL)
1095 		return 0;
1096 	pathcat2(maskpath, sizeof(maskpath), path, mask);
1097 	if ((bp = sprint_mask_buf(bmp)) == NULL)
1098 		return -1;
1099 	rc = write_string_file(maskpath, bp);
1100 	free(bp);
1101 	return rc;
1102 }
1103 
1104 /*
1105  * Return 1 if 'cpu' is online, else 0 if offline.  Tests the file
1106  * /sys/devices/system/cpu/cpuN/online file for 0 or 1 contents
1107  * were N == cpu number.
1108  */
1109 
cpu_online(unsigned int cpu)1110 char cpu_online(unsigned int cpu)
1111 {
1112 	char online;
1113 	char cpupath[PATH_MAX];
1114 
1115 	(void)snprintf(cpupath, sizeof(cpupath),
1116 		       "/sys/devices/system/cpu/cpu%d/online", cpu);
1117 	if (read_flag(cpupath, &online) < 0)
1118 		return 0;	/* oops - guess that cpu's not there */
1119 	return online;
1120 }
1121 
1122 /*
1123  * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()),
1124  * to the node on which that cpu resides or cpuset_mems_nbits().
1125  *
1126  * To avoid every user having to recalculate this relation
1127  * from various clues in the sysfs file system (below the
1128  * path /sys/devices/system) a copy of this map is kept at
1129  * /var/run/cpunodemap.
1130  *
1131  * The system automatically cleans out files below
1132  * /var/run on each system reboot (see the init script
1133  * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry
1134  * about stale data in this file across reboots.  If the file
1135  * is missing, let the first process that needs it, and has
1136  * permission to write in the /var/run directory, rebuild it.
1137  *
1138  * If using this cached data, remember the mtime of the mapfile
1139  * the last time we read it in case something like a hotplug
1140  * event results in the file being removed and rebuilt, so we
1141  * can detect if we're using a stale cache, and need to reload.
1142  *
 * The mtime of this file is set to the time when we did
 * the recalculation of the map, from the clues beneath
 * /sys/devices/system.  This is done so that a program
 * won't see the mapfile it just wrote as being newer than what
 * it just wrote out (store_map) and read the same map back in
 * (load_map).
1149  */
1150 
1151 /*
1152  * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
1153  *
1154  * Note on locking and flockfile(FILE *):
1155  *
1156  *  We use flockfile() and funlockfile() instead of directly
1157  *  calling pthread_mutex_lock and pthread_mutex_unlock on
1158  *  a pthread_mutex_t, because this avoids forcing the app
1159  *  to link with libpthread.  The glibc implementation of
1160  *  flockfile/funlockfile will fall back to no-ops if libpthread
1161  *  doesn't happen to be linked.
1162  *
 *  Since flockfile already has the moderately convoluted
 *  combination of weak and strong symbols required to accomplish
 *  this, it is easier to use flockfile() on some handy FILE *
 *  stream as a surrogate for pthread locking than it is to
 *  re-invent that wheel.
1168  *
1169  *  Forcing all apps that use cpusets to link with libpthread
1170  *  would force non-transparent initialization on apps that
1171  *  might not be prepared to handle it.
1172  *
1173  *  The application using libcpuset should never notice this
1174  *  odd use of flockfile(), because we never return to the
1175  *  application from any libcpuset call with any such lock held.
1176  *  We just use this locking for guarding some non-atomic cached
1177  *  data updates and accesses, internal to some libcpuset calls.
1178  *  Also, flockfile() allows recursive nesting, so if the app
1179  *  calls libcpuset holding such a file lock, we won't deadlock
1180  *  if we go to acquire the same lock.  We'll just get the lock
1181  *  and increment its counter while we hold it.
1182  */
1183 
/*
 * In-memory copy of the <cpu, node> map, shared by the routines below.
 *
 * Being a file scope object, it starts out with 'map' NULL and 'mtime'
 * zero; get_map() allocates 'map' with cpuset_cpus_nbits() entries the
 * first time it finds it NULL.
 */
struct cpunodemap {
	int *map;		/* map[cpu] is the node of that cpu */
	time_t mtime;		/* modtime of mapfile when last read */
};

static struct cpunodemap cpunodemap;
1188 
1189 /*
1190  * rebuild_map() - Rebuild cpunodemap[] from scratch.
1191  *
1192  * Situation:
1193  *	Neither our in-memory cpunodemap[] array nor the
1194  *	cache of it in mapfile is current.
1195  * Action:
1196  *	Rebuild it from first principles and the information
1197  *	available below /sys/devices/system.
1198  */
1199 
rebuild_map()1200 static void rebuild_map()
1201 {
1202 	char buf[PATH_MAX];
1203 	DIR *dir1, *dir2;
1204 	struct dirent *dent1, *dent2;
1205 	int ncpus = cpuset_cpus_nbits();
1206 	int nmems = cpuset_mems_nbits();
1207 	unsigned int cpu, mem;
1208 
1209 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
1210 		cpunodemap.map[cpu] = -1;
1211 	pathcat2(buf, sizeof(buf), sysdevices, "node");
1212 	if ((dir1 = opendir(buf)) == NULL)
1213 		return;
1214 	while ((dent1 = readdir(dir1)) != NULL) {
1215 		if (sscanf(dent1->d_name, "node%u", &mem) < 1)
1216 			continue;
1217 		pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
1218 		if ((dir2 = opendir(buf)) == NULL)
1219 			continue;
1220 		while ((dent2 = readdir(dir2)) != NULL) {
1221 			if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
1222 				continue;
1223 			if (cpu >= (unsigned int)ncpus
1224 			    || mem >= (unsigned int)nmems)
1225 				continue;
1226 			cpunodemap.map[cpu] = mem;
1227 		}
1228 		closedir(dir2);
1229 	}
1230 	closedir(dir1);
1231 	cpunodemap.mtime = time(0);
1232 }
1233 
1234 /*
1235  * load_map() - Load cpunodemap[] from mapfile.
1236  *
1237  * Situation:
1238  *	The cpunodemap in mapfile is more recent than
1239  *	what we have in the cpunodemap[] array.
1240  * Action:
1241  *	Reload the cpunodemap[] array from the file.
1242  */
1243 
load_map()1244 static void load_map()
1245 {
1246 	char buf[SMALL_BUFSZ];	/* buffer 1 line of mapfile */
1247 	FILE *mapfp;		/* File stream on mapfile */
1248 	int ncpus = cpuset_cpus_nbits();
1249 	int nmems = cpuset_mems_nbits();
1250 	unsigned int cpu, mem;
1251 
1252 	if ((cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL)
1253 		return;
1254 	cpunodemap.mtime = get_mtime(mapfile);
1255 	if ((mapfp = fopen(mapfile, "r")) == NULL)
1256 		return;
1257 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
1258 		cpunodemap.map[cpu] = nmems;
1259 	while (flgets(buf, sizeof(buf), mapfp) != NULL) {
1260 		if (sscanf(buf, "%u %u", &cpu, &mem) < 2)
1261 			continue;
1262 		if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems)
1263 			continue;
1264 		cpunodemap.map[cpu] = mem;
1265 	}
1266 	fclose(mapfp);
1267 }
1268 
1269 /*
1270  * store_map() - Write cpunodemap[] out to mapfile.
1271  *
1272  * Situation:
1273  *	The cpunodemap in the cpunodemap[] array is
1274  *	more recent than the one in mapfile.
1275  * Action:
1276  *	Write cpunodemap[] out to mapfile.
1277  */
1278 
store_map()1279 static void store_map()
1280 {
1281 	char buf[PATH_MAX];
1282 	int fd = -1;
1283 	FILE *mapfp = NULL;
1284 	int ncpus = cpuset_cpus_nbits();
1285 	int nmems = cpuset_mems_nbits();
1286 	unsigned int cpu, mem;
1287 
1288 	snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX");
1289 	if ((fd = mkstemp(buf)) < 0)
1290 		goto err;
1291 	if ((mapfp = fdopen(fd, "w")) == NULL)
1292 		goto err;
1293 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1294 		mem = cpunodemap.map[cpu];
1295 		if (mem < (unsigned int)nmems)
1296 			fprintf(mapfp, "%u %u\n", cpu, mem);
1297 	}
1298 	fclose(mapfp);
1299 	set_mtime(buf, cpunodemap.mtime);
1300 	if (rename(buf, mapfile) < 0)
1301 		goto err;
1302 	/* mkstemp() creates mode 0600 - change to world readable */
1303 	(void)chmod(mapfile, 0444);
1304 	return;
1305 err:
1306 	if (mapfp != NULL) {
1307 		fclose(mapfp);
1308 		fd = -1;
1309 	}
1310 	if (fd >= 0)
1311 		close(fd);
1312 	(void)unlink(buf);
1313 }
1314 
1315 /*
1316  * Load and gain thread safe access to the <cpu, node> map.
1317  *
1318  * Return 0 on success with flockfile(stdin) held.
1319  * Each successful get_map() call must be matched with a
1320  * following put_map() call to release the lock.
1321  *
1322  * On error, return -1 with errno set and no lock held.
1323  */
1324 
get_map()1325 static int get_map()
1326 {
1327 	time_t file_mtime;
1328 
1329 	flockfile(stdin);
1330 
1331 	if (cpunodemap.map == NULL) {
1332 		cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int));
1333 		if (cpunodemap.map == NULL)
1334 			goto err;
1335 	}
1336 
1337 	/* If no one has a good cpunodemap, rebuild from scratch */
1338 	file_mtime = get_mtime(mapfile);
1339 	if (cpunodemap.mtime == 0 && file_mtime == 0)
1340 		rebuild_map();
1341 
1342 	/* If either cpunodemap[] or mapfile newer, update other with it */
1343 	file_mtime = get_mtime(mapfile);
1344 	if (cpunodemap.mtime < file_mtime)
1345 		load_map();
1346 	else if (cpunodemap.mtime > file_mtime)
1347 		store_map();
1348 	return 0;
1349 err:
1350 	funlockfile(stdin);
1351 	return -1;
1352 }
1353 
/*
 * Release the flockfile(stdin) lock taken by a successful get_map().
 */
static void put_map()
{
	FILE *lock_stream = stdin;	/* stream used as the map lock */

	funlockfile(lock_stream);
}
1358 
1359 /* Set cpus to those local to Memory Nodes mems */
cpuset_localcpus(const struct bitmask * mems,struct bitmask * cpus)1360 int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus)
1361 {
1362 	int ncpus = cpuset_cpus_nbits();
1363 	unsigned int cpu;
1364 
1365 	if (check() < 0)
1366 		return -1;
1367 
1368 	get_map();
1369 	bitmask_clearall(cpus);
1370 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1371 		if (bitmask_isbitset(mems, cpunodemap.map[cpu]))
1372 			bitmask_setbit(cpus, cpu);
1373 	}
1374 	put_map();
1375 	return 0;
1376 }
1377 
1378 /* Set mems to those local to CPUs cpus */
cpuset_localmems(const struct bitmask * cpus,struct bitmask * mems)1379 int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems)
1380 {
1381 	int ncpus = cpuset_cpus_nbits();
1382 	unsigned int cpu;
1383 
1384 	if (check() < 0)
1385 		return -1;
1386 
1387 	get_map();
1388 	bitmask_clearall(mems);
1389 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1390 		if (bitmask_isbitset(cpus, cpu))
1391 			bitmask_setbit(mems, cpunodemap.map[cpu]);
1392 	}
1393 	put_map();
1394 	return 0;
1395 }
1396 
/*
 * distmap[]
 *
 * Array of distmap_entry_t (unsigned char) entries, of size
 * cpumask_sz by nodemask_sz, stored as one flat allocation.
 *
 * Element distmap[cpu][mem] is the distance between CPU cpu
 * and Memory Node mem.  Distances are weighted to roughly
 * approximate the cost of memory references, and scaled so that
 * the distance from a CPU to its local Memory Node is ten (10).
 *
 * The first call to cpuset_cpumemdist() builds this map, from
 * whatever means the kernel provides to obtain these distances.
 *
 * These distances derive from ACPI SLIT table entries, which are
 * eight bits in size.
 *
 * Hold flockfile(stdout) while using distmap for posix thread safety.
 */

typedef unsigned char distmap_entry_t;	/* type of distmap[] entries */

/* NULL until one of the build_distmap*() routines has allocated it */
static distmap_entry_t *distmap;	/* maps <cpu, mem> to distance */

/* Also the value of every entry for which no distance was parsed */
#define DISTMAP_MAX UCHAR_MAX	/* maximum value in distmap[] */

/*
 * Flat index of element [i][j].  Expands to an expression that uses a
 * variable named 'nmems', which must be in scope wherever I() is used.
 */
#define I(i,j) ((i) * nmems + (j))	/* 2-D array index simulation */
1423 
1424 /*
1425  * Parse arch neutral lines from 'distance' files of form:
1426  *
1427  *	46 66 10 20
1428  *
1429  * The lines contain a space separated list of distances, which is parsed
1430  * into array dists[] of each nodes distance from the specified node.
1431  *
1432  * Result is placed in distmap[ncpus][nmems]:
1433  *
1434  *	For each cpu c on node:
1435  *		For each node position n in list of distances:
1436  *			distmap[c][n] = dists[n]
1437  */
1438 
parse_distmap_line(unsigned int node,char * buf)1439 static int parse_distmap_line(unsigned int node, char *buf)
1440 {
1441 	char *p, *q;
1442 	int ncpus = cpuset_cpus_nbits();
1443 	int nmems = cpuset_mems_nbits();
1444 	unsigned int c, n;
1445 	distmap_entry_t *dists = NULL;
1446 	struct bitmask *cpus = NULL, *mems = NULL;
1447 	int ret = -1;
1448 
1449 	p = buf;
1450 	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
1451 		goto err;
1452 	for (n = 0; n < (unsigned int)nmems; n++)
1453 		dists[n] = DISTMAP_MAX;
1454 
1455 	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
1456 		unsigned int d;
1457 
1458 		if ((p = strpbrk(p, "0123456789")) == NULL)
1459 			break;
1460 		d = strtoul(p, &q, 10);
1461 		if (p == q)
1462 			break;
1463 		if (d < DISTMAP_MAX)
1464 			dists[n] = (distmap_entry_t) d;
1465 	}
1466 
1467 	if ((mems = bitmask_alloc(nmems)) == NULL)
1468 		goto err;
1469 	bitmask_setbit(mems, node);
1470 
1471 	if ((cpus = bitmask_alloc(ncpus)) == NULL)
1472 		goto err;
1473 	cpuset_localcpus(mems, cpus);
1474 
1475 	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
1476 	     c = bitmask_next(cpus, c + 1))
1477 		for (n = 0; n < (unsigned int)nmems; n++)
1478 			distmap[I(c, n)] = dists[n];
1479 	ret = 0;
1480 	/* fall into ... */
1481 err:
1482 	bitmask_free(mems);
1483 	bitmask_free(cpus);
1484 	free(dists);
1485 	return ret;
1486 }
1487 
/*
 * Read the first line of the 'distance' file at 'path' and feed it,
 * together with 'node', to parse_distmap_line().
 *
 * The line buffer is sized by what filesize() reports for the file.
 *
 * Return 0 on success.  Return -1 if the file cannot be opened, the
 * buffer cannot be allocated, no line can be read, or the line fails
 * to parse.
 */
static int parse_distance_file(unsigned int node, const char *path)
{
	FILE *fp;
	char *line;
	int linesz;
	int rc = -1;

	fp = fopen(path, "r");
	if (fp == NULL)
		return -1;

	linesz = filesize(fp);
	line = malloc(linesz);
	if (line != NULL
	    && flgets(line, linesz, fp) != NULL
	    && parse_distmap_line(node, line) >= 0)
		rc = 0;

	free(line);
	fclose(fp);
	return rc;
}
1517 
build_distmap()1518 static void build_distmap()
1519 {
1520 	static int tried_before = 0;
1521 	int ncpus = cpuset_cpus_nbits();
1522 	int nmems = cpuset_mems_nbits();
1523 	int c, m;
1524 	DIR *dir = NULL;
1525 	struct dirent *dent;
1526 
1527 	if (tried_before)
1528 		goto err;
1529 	tried_before = 1;
1530 
1531 	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
1532 		goto err;
1533 
1534 	for (c = 0; c < ncpus; c++)
1535 		for (m = 0; m < nmems; m++)
1536 			distmap[I(c, m)] = DISTMAP_MAX;
1537 
1538 	if ((dir = opendir(distance_directory)) == NULL)
1539 		goto err;
1540 	while ((dent = readdir(dir)) != NULL) {
1541 		char buf[PATH_MAX];
1542 		unsigned int node;
1543 
1544 		if (sscanf(dent->d_name, "node%u", &node) < 1)
1545 			continue;
1546 		pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
1547 			 "distance");
1548 		if (parse_distance_file(node, buf) < 0)
1549 			goto err;
1550 	}
1551 	closedir(dir);
1552 	return;
1553 err:
1554 	if (dir)
1555 		closedir(dir);
1556 	free(distmap);
1557 	distmap = NULL;
1558 }
1559 
1560 #ifdef ALTERNATE_SN_DISTMAP
1561 
1562 /*
1563  * Parse SN architecture specific line of form:
1564  *
1565  *	node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
1566  *
1567  * Second field is node number.  The "dist" field is the colon separated list
1568  * of distances, which is parsed into array dists[] of each nodes distance
1569  * from that node.
1570  *
1571  * Result is placed in distmap[ncpus][nmems]:
1572  *
1573  *	For each cpu c on that node:
1574  *		For each node position n in list of distances:
1575  *			distmap[c][n] = dists[n]
1576  */
1577 
parse_distmap_line_sn(char * buf)1578 static void parse_distmap_line_sn(char *buf)
1579 {
1580 	char *p, *pend, *q;
1581 	int ncpus = cpuset_cpus_nbits();
1582 	int nmems = cpuset_mems_nbits();
1583 	unsigned long c, n, node;
1584 	distmap_entry_t *dists = NULL;
1585 	struct bitmask *cpus = NULL, *mems = NULL;
1586 
1587 	if ((p = strchr(buf, ' ')) == NULL)
1588 		goto err;
1589 	if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
1590 		goto err;
1591 	if ((p = strstr(q, " dist ")) == NULL)
1592 		goto err;
1593 	p += strlen(" dist ");
1594 	if ((pend = strchr(p, ' ')) != NULL)
1595 		*pend = '\0';
1596 	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
1597 		goto err;
1598 	for (n = 0; n < (unsigned int)nmems; n++)
1599 		dists[n] = DISTMAP_MAX;
1600 
1601 	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
1602 		unsigned long d;
1603 
1604 		if ((p = strpbrk(p, "0123456789")) == NULL)
1605 			break;
1606 		d = strtoul(p, &q, 10);
1607 		if (p == q)
1608 			break;
1609 		if (d < DISTMAP_MAX)
1610 			dists[n] = (distmap_entry_t) d;
1611 	}
1612 
1613 	if ((mems = bitmask_alloc(nmems)) == NULL)
1614 		goto err;
1615 	bitmask_setbit(mems, node);
1616 
1617 	if ((cpus = bitmask_alloc(ncpus)) == NULL)
1618 		goto err;
1619 	cpuset_localcpus(mems, cpus);
1620 
1621 	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
1622 	     c = bitmask_next(cpus, c + 1))
1623 		for (n = 0; n < (unsigned int)nmems; n++)
1624 			distmap[I(c, n)] = dists[n];
1625 	/* fall into ... */
1626 err:
1627 	bitmask_free(mems);
1628 	bitmask_free(cpus);
1629 	free(dists);
1630 }
1631 
build_distmap_sn()1632 static void build_distmap_sn()
1633 {
1634 	int ncpus = cpuset_cpus_nbits();
1635 	int nmems = cpuset_mems_nbits();
1636 	int c, m;
1637 	static int tried_before = 0;
1638 	FILE *fp = NULL;
1639 	char *buf = NULL;
1640 	int buflen;
1641 
1642 	if (tried_before)
1643 		goto err;
1644 	tried_before = 1;
1645 
1646 	if ((fp = fopen(sn_topology, "r")) == NULL)
1647 		goto err;
1648 
1649 	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
1650 		goto err;
1651 
1652 	for (c = 0; c < ncpus; c++)
1653 		for (m = 0; m < nmems; m++)
1654 			distmap[I(c, m)] = DISTMAP_MAX;
1655 
1656 	buflen = filesize(fp);
1657 	if ((buf = malloc(buflen)) == NULL)
1658 		goto err;
1659 
1660 	while (flgets(buf, buflen, fp) != NULL)
1661 		if (strprefix(buf, sn_top_node_prefix))
1662 			parse_distmap_line_sn(buf);
1663 
1664 	free(buf);
1665 	fclose(fp);
1666 	return;
1667 err:
1668 	free(buf);
1669 	free(distmap);
1670 	distmap = NULL;
1671 	if (fp)
1672 		fclose(fp);
1673 }
1674 
1675 #endif
1676 
1677 /* [optional] Hardware distance from CPU to Memory Node */
cpuset_cpumemdist(int cpu,int mem)1678 unsigned int cpuset_cpumemdist(int cpu, int mem)
1679 {
1680 	int ncpus = cpuset_cpus_nbits();
1681 	int nmems = cpuset_mems_nbits();
1682 	distmap_entry_t r = DISTMAP_MAX;
1683 
1684 	flockfile(stdout);
1685 
1686 	if (check() < 0)
1687 		goto err;
1688 
1689 	if (distmap == NULL)
1690 		build_distmap();
1691 
1692 #ifdef ALTERNATE_SN_DISTMAP
1693 	if (distmap == NULL)
1694 		build_distmap_sn();
1695 #endif
1696 
1697 	if (distmap == NULL)
1698 		goto err;
1699 
1700 	if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
1701 		goto err;
1702 
1703 	r = distmap[I(cpu, mem)];
1704 	/* fall into ... */
1705 err:
1706 	funlockfile(stdout);
1707 	return r;
1708 }
1709 
1710 /* [optional] Return Memory Node closest to cpu */
cpuset_cpu2node(int cpu)1711 int cpuset_cpu2node(int cpu)
1712 {
1713 	int ncpus = cpuset_cpus_nbits();
1714 	int nmems = cpuset_mems_nbits();
1715 	struct bitmask *cpus = NULL, *mems = NULL;
1716 	int r = -1;
1717 
1718 	if (check() < 0)
1719 		goto err;
1720 
1721 	if ((cpus = bitmask_alloc(ncpus)) == NULL)
1722 		goto err;
1723 	bitmask_setbit(cpus, cpu);
1724 
1725 	if ((mems = bitmask_alloc(nmems)) == NULL)
1726 		goto err;
1727 	cpuset_localmems(cpus, mems);
1728 	r = bitmask_first(mems);
1729 	/* fall into ... */
1730 err:
1731 	bitmask_free(cpus);
1732 	bitmask_free(mems);
1733 	return r;
1734 }
1735 
apply_cpuset_settings(const char * path,const struct cpuset * cp)1736 static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
1737 {
1738 	if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
1739 		if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
1740 			goto err;
1741 	}
1742 
1743 	if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
1744 		if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
1745 			goto err;
1746 	}
1747 
1748 	if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
1749 		if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
1750 			goto err;
1751 	}
1752 
1753 	if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
1754 		if (store_flag(path, "notify_on_release", cp->notify_on_release)
1755 		    < 0)
1756 			goto err;
1757 	}
1758 
1759 	if (cp->memory_migrate_valid &&
1760 	    cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) {
1761 		if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
1762 			goto err;
1763 	}
1764 
1765 	if (cp->memory_pressure_enabled_valid &&
1766 	    cp->memory_pressure_enabled_dirty &&
1767 	    exists_flag(path, "memory_pressure_enabled")) {
1768 		if (store_flag
1769 		    (path, "memory_pressure_enabled",
1770 		     cp->memory_pressure_enabled) < 0)
1771 			goto err;
1772 	}
1773 
1774 	if (cp->memory_spread_page_valid &&
1775 	    cp->memory_spread_page_dirty &&
1776 	    exists_flag(path, "memory_spread_page")) {
1777 		if (store_flag
1778 		    (path, "memory_spread_page", cp->memory_spread_page) < 0)
1779 			goto err;
1780 	}
1781 
1782 	if (cp->memory_spread_slab_valid &&
1783 	    cp->memory_spread_slab_dirty &&
1784 	    exists_flag(path, "memory_spread_slab")) {
1785 		if (store_flag
1786 		    (path, "memory_spread_slab", cp->memory_spread_slab) < 0)
1787 			goto err;
1788 	}
1789 
1790 	if (cp->sched_load_balance_valid &&
1791 	    cp->sched_load_balance_dirty &&
1792 	    exists_flag(path, "sched_load_balance")) {
1793 		if (store_flag
1794 		    (path, "sched_load_balance", cp->sched_load_balance) < 0)
1795 			goto err;
1796 	}
1797 
1798 	if (cp->sched_relax_domain_level_valid &&
1799 	    cp->sched_relax_domain_level_dirty &&
1800 	    exists_flag(path, "sched_relax_domain_level")) {
1801 		if (store_number
1802 		    (path, "sched_relax_domain_level",
1803 		     cp->sched_relax_domain_level) < 0)
1804 			goto err;
1805 	}
1806 
1807 	if (cp->cpus_valid && cp->cpus_dirty) {
1808 		if (store_mask(path, "cpus", cp->cpus) < 0)
1809 			goto err;
1810 	}
1811 
1812 	if (cp->mems_valid && cp->mems_dirty) {
1813 		if (store_mask(path, "mems", cp->mems) < 0)
1814 			goto err;
1815 	}
1816 	return 0;
1817 err:
1818 	return -1;
1819 }
1820 
/*
 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
 *
 * Return the largest value found in any 'siblings' line of
 * /proc/cpuinfo.  The answer is cached, so the file is scanned at
 * most once per task that gets an answer from it.
 *
 * 'siblings' is the number of logical CPUs in one physical processor
 * package: the number of cores in the package times the number of
 * hyper-threads per core.  The kernel bug that
 * cpuset_would_crash_kernel() looks for is triggered by a
 * cpu_exclusive cpuset holding some, but not all, of the sibling
 * logical CPUs of a package.
 *
 * Should packages report different sibling counts (say because
 * Hyper-Threading is only partly enabled), the largest count is
 * used.  That is the worst case, and possibly overkill - it is not
 * known here whether the kernel bug looks at each package on its
 * own - but it is by far the simplest thing to do.
 *
 * Special cases:
 *   - /proc/cpuinfo cannot be opened: return 4 as a guess, without
 *     caching it.
 *   - no 'siblings' line found (old kernel): use and cache 1.
 *
 * The first call takes about 0.7 msecs, from open to close, on a
 * 4 CPU 2.8 GHz Xeon.
 */

static int get_siblings()
{
	static int siblings;
	char line[32];		/* big enough for one 'siblings' line */
	FILE *fp;
	int widest = 0;

	if (siblings != 0)
		return siblings;

	fp = fopen("/proc/cpuinfo", "r");
	if (fp == NULL)
		return 4;	/* wing it - /proc not mounted ? */

	while (flgets(line, sizeof(line), fp) != NULL) {
		int count;

		if (sscanf(line, "siblings : %d", &count) >= 1
		    && count > widest)
			widest = count;
	}
	fclose(fp);

	/* old kernel, no siblings, default to 1 */
	siblings = (widest == 0) ? 1 : widest;
	return siblings;
}
1869 
1870 /*
1871  * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic
1872  * scheduler domain code invoked for cpu_exclusive cpusets that causes
1873  * the kernel to freeze, requiring a hardware reset.
1874  *
 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive'
 * cpuset is defined where that cpuset's 'cpus' are not on package
 * boundaries then the kernel will freeze, usually as soon as this
 * cpuset is created, requiring a hardware reset.
1879  *
 * A cpuset's 'cpus' are not on package boundaries if the cpuset
 * includes a proper non-empty subset (some, but not all) of the
 * logical cpus on a processor package.  This requires multiple
1883  * logical CPUs per package, available with either Hyper-Thread or
1884  * Multi-Core support.  Without one of these features, there is only
1885  * one logical CPU per physical package, and it's not possible to
1886  * have a proper, non-empty subset of a set of cardinality one.
1887  *
1888  * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC
1889  * on i386 and x86_64 arch's.
1890  *
1891  * The objective of this routine cpuset_would_crash_kernel() is to
1892  * determine if a proposed cpuset setting would crash the kernel due
1893  * to this bug, so that the caller can avoid the crash.
1894  *
1895  * Ideally we'd check for exactly these conditions here, but computing
1896  * the package (identified by the 'physical id' field of /proc/cpuinfo)
1897  * of each cpu in a cpuset is more effort than it's worth here.
1898  *
1899  * Also there is no obvious way to identify exactly whether the kernel
1900  * one is executing on has this bug, short of trying it, and seeing
1901  * if the kernel just crashed.
1902  *
1903  * So for now, we look for a simpler set of conditions, that meets
1904  * our immediate need - avoid this crash on SUSE SLES10 systems that
1905  * are susceptible to it.  We look for the kernel version 2.6.16.*,
1906  * which is the base kernel of SUSE SLES10, and for i386 or x86_64
1907  * processors, which had CONFIG_SCHED_MC enabled.
1908  *
1909  * If these simpler conditions are met, we further simplify the check,
1910  * by presuming that the logical CPUs are numbered on processor
1911  * package boundaries.  If each package has S siblings, we assume
1912  * that CPUs numbered N through N + S -1 are on the same package,
1913  * for any CPU N such that N mod S == 0.
1914  *
1915  * Yes, this is a hack, focused on avoiding kernel freezes on
1916  * susceptible SUSE SLES10 systems.
1917  */
1918 
cpuset_would_crash_kernel(const struct cpuset * cp)1919 static int cpuset_would_crash_kernel(const struct cpuset *cp)
1920 {
1921 	static int susceptible_system = -1;
1922 
1923 	if (!cp->cpu_exclusive)
1924 		goto ok;
1925 
1926 	if (susceptible_system == -1) {
1927 		struct utsname u;
1928 		int rel_2_6_16, arch_i386, arch_x86_64;
1929 
1930 		if (uname(&u) < 0)
1931 			goto fail;
1932 		rel_2_6_16 = strprefix(u.release, "2.6.16.");
1933 		arch_i386 = streq(u.machine, "i386");
1934 		arch_x86_64 = streq(u.machine, "x86_64");
1935 		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
1936 	}
1937 
1938 	if (susceptible_system) {
1939 		int ncpus = cpuset_cpus_nbits();
1940 		int siblings = get_siblings();
1941 		unsigned int cpu;
1942 
1943 		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
1944 			int s, num_set = 0;
1945 
1946 			for (s = 0; s < siblings; s++) {
1947 				if (bitmask_isbitset(cp->cpus, cpu + s))
1948 					num_set++;
1949 			}
1950 
1951 			/* If none or all siblings set, we're still ok */
1952 			if (num_set == 0 || num_set == siblings)
1953 				continue;
1954 
1955 			/* Found one that would crash kernel.  Fail.  */
1956 			errno = ENXIO;
1957 			goto fail;
1958 		}
1959 	}
1960 	/* If not susceptible, or if all ok, fall into "ok" ... */
1961 ok:
1962 	return 0;		/* would not crash */
1963 fail:
1964 	return 1;		/* would crash */
1965 }
1966 
1967 /* compare two cpuset and mark the dirty variable */
mark_dirty_variable(struct cpuset * cp1,const struct cpuset * cp2)1968 static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
1969 {
1970 	if (cp1->cpu_exclusive_valid &&
1971 	    cp1->cpu_exclusive != cp2->cpu_exclusive)
1972 		cp1->cpu_exclusive_dirty = 1;
1973 
1974 	if (cp1->mem_exclusive_valid &&
1975 	    cp1->mem_exclusive != cp2->mem_exclusive)
1976 		cp1->mem_exclusive_dirty = 1;
1977 
1978 	if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall)
1979 		cp1->mem_hardwall_dirty = 1;
1980 
1981 	if (cp1->notify_on_release_valid &&
1982 	    cp1->notify_on_release != cp2->notify_on_release)
1983 		cp1->notify_on_release_dirty = 1;
1984 
1985 	if (cp1->memory_migrate_valid &&
1986 	    cp1->memory_migrate != cp2->memory_migrate)
1987 		cp1->memory_migrate_dirty = 1;
1988 
1989 	if (cp1->memory_pressure_enabled_valid &&
1990 	    cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
1991 		cp1->memory_pressure_enabled_dirty = 1;
1992 
1993 	if (cp1->memory_spread_page_valid &&
1994 	    cp1->memory_spread_page != cp2->memory_spread_page)
1995 		cp1->memory_spread_page_dirty = 1;
1996 
1997 	if (cp1->memory_spread_slab_valid &&
1998 	    cp1->memory_spread_slab != cp2->memory_spread_slab)
1999 		cp1->memory_spread_slab_dirty = 1;
2000 
2001 	if (cp1->sched_load_balance_valid &&
2002 	    cp1->sched_load_balance != cp2->sched_load_balance)
2003 		cp1->sched_load_balance_dirty = 1;
2004 
2005 	if (cp1->sched_relax_domain_level_valid &&
2006 	    cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
2007 		cp1->sched_relax_domain_level_dirty = 1;
2008 
2009 	if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
2010 		cp1->cpus_dirty = 1;
2011 	if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
2012 		cp1->mems_dirty = 1;
2013 }
2014 
2015 /* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
cr_or_mod(const char * relpath,const struct cpuset * cp,int new)2016 static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
2017 {
2018 	char buf[PATH_MAX];
2019 	int do_rmdir_on_err = 0;
2020 	int do_restore_cp_sav_on_err = 0;
2021 	struct cpuset *cp_sav = NULL;
2022 	int sav_errno;
2023 
2024 	if (check() < 0)
2025 		goto err;
2026 
2027 	if (cpuset_would_crash_kernel(cp))
2028 		goto err;
2029 
2030 	fullpath(buf, sizeof(buf), relpath);
2031 
2032 	if (new) {
2033 		if (mkdir(buf, 0755) < 0)
2034 			goto err;
2035 		/* we made it, so we should remove it on error */
2036 		do_rmdir_on_err = 1;
2037 	}
2038 
2039 	if ((cp_sav = cpuset_alloc()) == NULL)
2040 		goto err;
2041 	if (cpuset_query(cp_sav, relpath) < 0)
2042 		goto err;
2043 	/* we have old settings to restore on error */
2044 	do_restore_cp_sav_on_err = 1;
2045 
2046 	/* check which variable need to restore on error */
2047 	mark_dirty_variable(cp_sav, cp);
2048 
2049 	if (apply_cpuset_settings(buf, cp) < 0)
2050 		goto err;
2051 
2052 	cpuset_free(cp_sav);
2053 	return 0;
2054 err:
2055 	sav_errno = errno;
2056 	if (do_restore_cp_sav_on_err)
2057 		(void)apply_cpuset_settings(buf, cp_sav);
2058 	if (cp_sav)
2059 		cpuset_free(cp_sav);
2060 	if (do_rmdir_on_err)
2061 		(void)rmdir(buf);
2062 	errno = sav_errno;
2063 	return -1;
2064 }
2065 
/* Create a new cpuset at 'relpath', configured from 'cp'. */
int cpuset_create(const char *relpath, const struct cpuset *cp)
{
	const int make_new = 1;	/* tell cr_or_mod() to mkdir the cpuset */

	return cr_or_mod(relpath, cp, make_new);
}
2071 
2072 /* Delete cpuset at location 'path' (if empty) */
cpuset_delete(const char * relpath)2073 int cpuset_delete(const char *relpath)
2074 {
2075 	char buf[PATH_MAX];
2076 
2077 	if (check() < 0)
2078 		goto err;
2079 
2080 	fullpath(buf, sizeof(buf), relpath);
2081 	if (rmdir(buf) < 0)
2082 		goto err;
2083 
2084 	return 0;
2085 err:
2086 	return -1;
2087 }
2088 
2089 /* Set cpuset cp to the cpuset at location 'path' */
cpuset_query(struct cpuset * cp,const char * relpath)2090 int cpuset_query(struct cpuset *cp, const char *relpath)
2091 {
2092 	char buf[PATH_MAX];
2093 
2094 	if (check() < 0)
2095 		goto err;
2096 
2097 	fullpath(buf, sizeof(buf), relpath);
2098 
2099 	if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0)
2100 		goto err;
2101 	cp->cpu_exclusive_valid = 1;
2102 
2103 	if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0)
2104 		goto err;
2105 	cp->mem_exclusive_valid = 1;
2106 
2107 	if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
2108 		goto err;
2109 	cp->notify_on_release_valid = 1;
2110 
2111 	if (exists_flag(buf, "memory_migrate")) {
2112 		if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0)
2113 			goto err;
2114 		cp->memory_migrate_valid = 1;
2115 	}
2116 
2117 	if (exists_flag(buf, "mem_hardwall")) {
2118 		if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0)
2119 			goto err;
2120 		cp->mem_hardwall_valid = 1;
2121 	}
2122 
2123 	if (exists_flag(buf, "memory_pressure_enabled")) {
2124 		if (load_flag
2125 		    (buf, &cp->memory_pressure_enabled,
2126 		     "memory_pressure_enabled") < 0)
2127 			goto err;
2128 		cp->memory_pressure_enabled_valid = 1;
2129 	}
2130 
2131 	if (exists_flag(buf, "memory_spread_page")) {
2132 		if (load_flag
2133 		    (buf, &cp->memory_spread_page, "memory_spread_page") < 0)
2134 			goto err;
2135 		cp->memory_spread_page_valid = 1;
2136 	}
2137 
2138 	if (exists_flag(buf, "memory_spread_slab")) {
2139 		if (load_flag
2140 		    (buf, &cp->memory_spread_slab, "memory_spread_slab") < 0)
2141 			goto err;
2142 		cp->memory_spread_slab_valid = 1;
2143 	}
2144 
2145 	if (exists_flag(buf, "sched_load_balance")) {
2146 		if (load_flag
2147 		    (buf, &cp->sched_load_balance, "sched_load_balance") < 0)
2148 			goto err;
2149 		cp->sched_load_balance_valid = 1;
2150 	}
2151 
2152 	if (exists_flag(buf, "sched_relax_domain_level")) {
2153 		if (load_number
2154 		    (buf, &cp->sched_relax_domain_level,
2155 		     "sched_relax_domain_level") < 0)
2156 			goto err;
2157 		cp->sched_relax_domain_level_valid = 1;
2158 	}
2159 
2160 	if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0)
2161 		goto err;
2162 	cp->cpus_valid = 1;
2163 
2164 	if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0)
2165 		goto err;
2166 	cp->mems_valid = 1;
2167 
2168 	return 0;
2169 err:
2170 	return -1;
2171 }
2172 
/* Apply the settings in 'cp' to the already existing cpuset at 'relpath'. */
int cpuset_modify(const char *relpath, const struct cpuset *cp)
{
	const int make_new = 0;	/* cpuset must already exist; no mkdir */

	return cr_or_mod(relpath, cp, make_new);
}
2178 
/*
 * Get cpuset path of pid into buf.
 *
 * pid   - task to look up; 0 means the calling task (/proc/self/cpuset)
 * buf   - result buffer; also borrowed to build the /proc file name
 * size  - size of buf in bytes
 *
 * Returns buf on success.  Returns NULL on failure with errno set:
 *  E2BIG  - buf too small for the /proc file name or for its contents
 *  ESRCH  - the pid's cpuset file does not exist (open gave ENOENT)
 *  ENOSYS - not even /proc/self/cpuset can be opened
 *  otherwise the errno left by check(), open() or read()
 */
char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
{
	int fd;			/* dual use: cpuset file for pid and self */
	int rc;			/* dual use: snprintf and read return codes */
	int sav_errno;		/* read() errno, kept safe across close() */

	if (check() < 0)
		return NULL;

	/* borrow result buf[] to build cpuset file path */
	if (pid == 0)
		rc = snprintf(buf, size, "/proc/self/cpuset");
	else
		rc = snprintf(buf, size, "/proc/%d/cpuset", pid);
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	if ((fd = open(buf, O_RDONLY)) < 0) {
		int e = errno;
		if (e == ENOENT)
			e = ESRCH;
		/* distinguish "no such task" from "no cpuset support" */
		if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0)
			e = ENOSYS;
		else
			close(fd);
		errno = e;
		return NULL;
	}
	rc = read(fd, buf, size);
	/* close() may overwrite errno; keep the value read() left */
	sav_errno = errno;
	close(fd);
	if (rc < 0) {
		errno = sav_errno;
		return NULL;
	}
	/* a completely filled buffer leaves no room for the terminator */
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	buf[rc] = 0;
	chomp(buf);
	return buf;

}
2221 
2222 /* Get cpuset 'cp' of pid */
cpuset_cpusetofpid(struct cpuset * cp,pid_t pid)2223 int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
2224 {
2225 	char buf[PATH_MAX];
2226 
2227 	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
2228 		return -1;
2229 	if (cpuset_query(cp, buf) < 0)
2230 		return -1;
2231 	return 0;
2232 }
2233 
2234 /* [optional] Return mountpoint of cpuset filesystem */
cpuset_mountpoint()2235 const char *cpuset_mountpoint()
2236 {
2237 	if (check() < 0) {
2238 		switch (errno) {
2239 		case ENODEV:
2240 			return "[cpuset filesystem not mounted]";
2241 		default:
2242 			return "[cpuset filesystem not supported]";
2243 		}
2244 	}
2245 	return cpusetmnt;
2246 }
2247 
/*
 * Return non-zero if 'path' names a directory, 0 if it does not or
 * if stat() on it fails.
 */
static int isdir(const char *path)
{
	struct stat st;
	int rc = stat(path, &st);

	return (rc < 0) ? 0 : S_ISDIR(st.st_mode);
}
2257 
2258 /*
2259  * [optional] cpuset_collides_exclusive() - True if would collide exclusive.
2260  *
2261  * Return true iff the specified cpuset would overlap with any
2262  * sibling cpusets in either cpus or mems, where either this
2263  * cpuset or the sibling is cpu_exclusive or mem_exclusive.
2264  *
2265  * cpuset_create() fails with errno == EINVAL if the requested cpuset
2266  * would overlap with any sibling, where either one is cpu_exclusive or
2267  * mem_exclusive.  This is a common, and not obvious error.  The
2268  * following routine checks for this particular case, so that code
2269  * creating cpusets can better identify the situation, perhaps to issue
2270  * a more informative error message.
2271  *
2272  * Can also be used to diagnose cpuset_modify failures.  This
2273  * routine ignores any existing cpuset with the same path as the
2274  * given 'cpusetpath', and only looks for exclusive collisions with
2275  * sibling cpusets of that path.
2276  *
2277  * In case of any error, returns (0) -- does not collide.  Presumably
2278  * any actual attempt to create or modify a cpuset will encounter the
2279  * same error, and report it usefully.
2280  *
2281  * This routine is not particularly efficient; most likely code creating or
2282  * modifying a cpuset will want to try the operation first, and then if that
2283  * fails with errno EINVAL, perhaps call this routine to determine if an
2284  * exclusive cpuset collision caused the error.
2285  */
2286 
cpuset_collides_exclusive(const char * cpusetpath,const struct cpuset * cp1)2287 int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
2288 {
2289 	char parent[PATH_MAX];
2290 	char *p;
2291 	char *pathcopy = NULL;
2292 	char *base;
2293 	DIR *dir = NULL;
2294 	struct dirent *dent;
2295 	struct cpuset *cp2 = NULL;
2296 	struct bitmask *cpus1 = NULL, *cpus2 = NULL;
2297 	struct bitmask *mems1 = NULL, *mems2 = NULL;
2298 	int ret;
2299 
2300 	if (check() < 0)
2301 		goto err;
2302 
2303 	fullpath(parent, sizeof(parent), cpusetpath);
2304 	if (streq(parent, cpusetmnt))
2305 		goto err;	/* only one cpuset root - can't collide */
2306 	pathcopy = strdup(parent);
2307 	p = strrchr(parent, '/');
2308 	if (!p)
2309 		goto err;	/* huh? - impossible - run and hide */
2310 	*p = 0;			/* now parent is dirname of fullpath */
2311 
2312 	p = strrchr(pathcopy, '/');
2313 	base = p + 1;		/* now base is basename of fullpath */
2314 	if (!*base)
2315 		goto err;	/* this is also impossible - run away */
2316 
2317 	if ((dir = opendir(parent)) == NULL)
2318 		goto err;
2319 	if ((cp2 = cpuset_alloc()) == NULL)
2320 		goto err;
2321 	if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2322 		goto err;
2323 	if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2324 		goto err;
2325 	if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2326 		goto err;
2327 	if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2328 		goto err;
2329 
2330 	while ((dent = readdir(dir)) != NULL) {
2331 		char child[PATH_MAX];
2332 
2333 		if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
2334 			continue;
2335 		if (streq(dent->d_name, base))
2336 			continue;
2337 		pathcat2(child, sizeof(child), parent, dent->d_name);
2338 		if (!isdir(child))
2339 			continue;
2340 		if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
2341 			goto err;
2342 		if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
2343 			cpuset_getcpus(cp1, cpus1);
2344 			cpuset_getcpus(cp2, cpus2);
2345 			if (bitmask_intersects(cpus1, cpus2))
2346 				goto collides;
2347 		}
2348 		if (cp1->mem_exclusive || cp2->mem_exclusive) {
2349 			cpuset_getmems(cp1, mems1);
2350 			cpuset_getmems(cp2, mems2);
2351 			if (bitmask_intersects(mems1, mems2))
2352 				goto collides;
2353 		}
2354 	}
2355 err:
2356 	/* error, or did not collide */
2357 	ret = 0;
2358 	goto done;
2359 collides:
2360 	/* collides */
2361 	ret = 1;
2362 	/* fall into ... */
2363 done:
2364 	if (dir)
2365 		closedir(dir);
2366 	cpuset_free(cp2);
2367 	free(pathcopy);
2368 	bitmask_free(cpus1);
2369 	bitmask_free(cpus2);
2370 	bitmask_free(mems1);
2371 	bitmask_free(mems2);
2372 	return ret;
2373 }
2374 
2375 /*
 * [optional] cpuset_nuke() - Remove cpuset any way possible
2377  *
2378  * Remove a cpuset, including killing tasks in it, and
2379  * removing any descendent cpusets and killing their tasks.
2380  *
2381  * Tasks can take a long time (minutes on some configurations)
2382  * to exit.  Loop up to 'seconds' seconds, trying to kill them.
2383  *
2384  * How we do it:
2385  *	1) First, kill all the pids, looping until there are
2386  *	   no more pids in this cpuset or below, or until the
2387  *	   'seconds' timeout limit is exceeded.
2388  *	2) Then depth first recursively rmdir the cpuset directories.
2389  *	3) If by this point the original cpuset is gone, we succeeded.
2390  *
2391  * If the timeout is exceeded, and tasks still exist, fail with
2392  * errno == ETIME.
2393  *
2394  * We sleep a variable amount of time.  After the first attempt to
2395  * kill all the tasks in the cpuset or its descendents, we sleep 1
2396  * second, the next time 2 seconds, increasing 1 second each loop
2397  * up to a max of 10 seconds.  If more loops past 10 are required
2398  * to kill all the tasks, we sleep 10 seconds each subsequent loop.
2399  * In any case, before the last loop, we sleep however many seconds
2400  * remain of the original timeout 'seconds' requested.  The total
2401  * time of all sleeps will be no more than the requested 'seconds'.
2402  *
2403  * If the cpuset started out empty of any tasks, or if the passed in
2404  * 'seconds' was zero, then this routine will return quickly, having
2405  * not slept at all.  Otherwise, this routine will at a minimum send
2406  * a SIGKILL to all the tasks in this cpuset subtree, then sleep one
2407  * second, before looking to see if any tasks remain.  If tasks remain
2408  * in the cpuset subtree, and a longer 'seconds' timeout was requested
2409  * (more than one), it will continue to kill remaining tasks and sleep,
2410  * in a loop, for as long as time and tasks remain.
2411  *
2412  * The signal sent for the kill is hardcoded to SIGKILL (9).  If some
2413  * other signal should be sent first, use a separate code loop,
2414  * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to
2415  * scan the task pids in a cpuset.  If SIGKILL should -not- be sent,
2416  * this cpuset_nuke() routine can still be called to recursively
2417  * remove a cpuset subtree, by specifying a timeout of zero 'seconds'.
2418  *
2419  * On success, returns 0 with errno == 0.
2420  *
2421  * On failure, returns -1, with errno possibly one of:
2422  *  EACCES - search permission denied on intervening directory
2423  *  ETIME - timed out - tasks remain after 'seconds' timeout
2424  *  EMFILE - too many open files
2425  *  ENODEV - /dev/cpuset not mounted
2426  *  ENOENT - component of cpuset path doesn't exist
2427  *  ENOMEM - out of memory
2428  *  ENOSYS - kernel doesn't support cpusets
2429  *  ENOTDIR - component of cpuset path is not a directory
2430  *  EPERM - lacked permission to kill a task
2431  *  EPERM - lacked permission to read cpusets or files therein
2432  */
2433 
2434 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree);
2435 
cpuset_nuke(const char * relpath,unsigned int seconds)2436 int cpuset_nuke(const char *relpath, unsigned int seconds)
2437 {
2438 	unsigned int secs_left = seconds;	/* total sleep seconds left */
2439 	unsigned int secs_loop = 1;	/* how much sleep next loop */
2440 	unsigned int secs_slept;	/* seconds slept in sleep() */
2441 	struct cpuset_pidlist *pl = NULL;	/* pids in cpuset subtree */
2442 	struct cpuset_fts_tree *cs_tree;
2443 	const struct cpuset_fts_entry *cs_entry;
2444 	int ret, sav_errno = 0;
2445 
2446 	if (check() < 0)
2447 		return -1;
2448 
2449 	if (seconds == 0)
2450 		goto rmdir_cpusets;
2451 
2452 	while (1) {
2453 		int plen, j;
2454 
2455 		if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) {
2456 			/* missing cpuset is as good as if already nuked */
2457 			if (errno == ENOENT) {
2458 				ret = 0;
2459 				goto no_more_cpuset;
2460 			}
2461 
2462 			/* other problems reading cpuset are bad news */
2463 			sav_errno = errno;
2464 			goto failed;
2465 		}
2466 
2467 		if ((plen = cpuset_pidlist_length(pl)) == 0)
2468 			goto rmdir_cpusets;
2469 
2470 		for (j = 0; j < plen; j++) {
2471 			pid_t pid;
2472 
2473 			if ((pid = cpuset_get_pidlist(pl, j)) > 1) {
2474 				if (kill(pid, SIGKILL) < 0 && errno != ESRCH) {
2475 					sav_errno = errno;
2476 					goto failed;
2477 				}
2478 			}
2479 		}
2480 
2481 		if (secs_left == 0)
2482 			goto took_too_long;
2483 
2484 		cpuset_freepidlist(pl);
2485 		pl = NULL;
2486 
2487 		secs_slept = secs_loop - sleep(secs_loop);
2488 
2489 		/* Ensure forward progress */
2490 		if (secs_slept == 0)
2491 			secs_slept = 1;
2492 
2493 		/* Ensure sane sleep() return (unnecessary?) */
2494 		if (secs_slept > secs_loop)
2495 			secs_slept = secs_loop;
2496 
2497 		secs_left -= secs_slept;
2498 
2499 		if (secs_loop < 10)
2500 			secs_loop++;
2501 
2502 		secs_loop = min(secs_left, secs_loop);
2503 	}
2504 
2505 took_too_long:
2506 	sav_errno = ETIME;
2507 	/* fall into ... */
2508 failed:
2509 	cpuset_freepidlist(pl);
2510 	errno = sav_errno;
2511 	return -1;
2512 
2513 rmdir_cpusets:
2514 	/* Let's try removing cpuset(s) now. */
2515 	cpuset_freepidlist(pl);
2516 
2517 	if ((cs_tree = cpuset_fts_open(relpath)) == NULL && errno != ENOENT)
2518 		return -1;
2519 	ret = 0;
2520 	cpuset_fts_reverse(cs_tree);	/* rmdir's must be done bottom up */
2521 	while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2522 		char buf[PATH_MAX];
2523 
2524 		fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry));
2525 		if (rmdir(buf) < 0 && errno != ENOENT) {
2526 			sav_errno = errno;
2527 			ret = -1;
2528 		}
2529 	}
2530 	cpuset_fts_close(cs_tree);
2531 	/* fall into ... */
2532 no_more_cpuset:
2533 	if (ret == 0)
2534 		errno = 0;
2535 	else
2536 		errno = sav_errno;
2537 	return ret;
2538 }
2539 
2540 /*
2541  * When recursively reading all the tasks files from a subtree,
2542  * chain together the read results, one pidblock per tasks file,
2543  * containing the raw unprocessed ascii as read(2) in.  After
2544  * we gather up this raw data, we then go back to count how
2545  * many pid's there are in total, allocate an array of pid_t
2546  * of that size, and transform the raw ascii data into this
2547  * array of pid_t's.
2548  */
2549 
/* One link in the chain of raw tasks-file contents. */
struct pidblock {
	char *buf;		/* raw ascii read from one tasks file */
	int buflen;		/* count of file data bytes held in buf */
	struct pidblock *next;	/* next block in the chain, NULL at the end */
};
2555 
2556 /*
2557  * Chain the raw contents of a file onto the pbhead list.
2558  *
2559  * We malloc "+ 1" extra byte for a nul-terminator, so that
2560  * the strtoul() loop in pid_transform() won't scan past
2561  * the end of pb->buf[] and accidentally find more pids.
2562  */
add_pidblock(const char * file,struct pidblock ** ppbhead)2563 static void add_pidblock(const char *file, struct pidblock **ppbhead)
2564 {
2565 	FILE *fp = NULL;
2566 	struct pidblock *pb = NULL;
2567 	int fsz;
2568 
2569 	if ((fp = fopen(file, "r")) == NULL)
2570 		goto err;
2571 	fsz = filesize(fp);
2572 	if (fsz == 0)
2573 		goto err;
2574 	if ((pb = calloc(1, sizeof(*pb))) == NULL)
2575 		goto err;
2576 	pb->buflen = fsz;
2577 	if ((pb->buf = malloc(pb->buflen + 1)) == NULL)
2578 		goto err;
2579 	if (fread(pb->buf, 1, pb->buflen, fp) > 0) {
2580 		pb->buf[pb->buflen] = '\0';
2581 		pb->next = *ppbhead;
2582 		*ppbhead = pb;
2583 	}
2584 	fclose(fp);
2585 	return;
2586 err:
2587 	if (fp)
2588 		fclose(fp);
2589 	free(pb);
2590 }
2591 
read_task_file(const char * relpath,struct pidblock ** ppbhead)2592 static void read_task_file(const char *relpath, struct pidblock **ppbhead)
2593 {
2594 	char buf[PATH_MAX];
2595 
2596 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2597 	add_pidblock(buf, ppbhead);
2598 }
2599 
/* List of task pids, as built by cpuset_init_pidlist(). */
struct cpuset_pidlist {
	pid_t *pids;	/* array of pids, sorted numerically by cpuset_init_pidlist() */
	int npids;	/* number of entries in pids[] */
};
2604 
/*
 * Count the pids in the first 'buflen' bytes of 'buf'.  There is one
 * pid per line, so this is simply the number of newline characters.
 */
static int pidcount(const char *buf, int buflen)
{
	int lines = 0;
	int i;

	for (i = 0; i < buflen; i++)
		lines += (buf[i] == '\n');
	return lines;
}
2617 
2618 /* Transform one-per-line ascii pids in pb to pid_t entries in pl */
pid_transform(struct pidblock * pb,struct cpuset_pidlist * pl,int n)2619 static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n)
2620 {
2621 	char *a, *b;
2622 
2623 	for (a = pb->buf; a < pb->buf + pb->buflen; a = b) {
2624 		pid_t p = strtoul(a, &b, 10);
2625 		if (a == b)
2626 			break;
2627 		pl->pids[n++] = p;
2628 	}
2629 	return n;
2630 }
2631 
free_pidblocks(struct pidblock * pbhead)2632 static void free_pidblocks(struct pidblock *pbhead)
2633 {
2634 	struct pidblock *pb, *nextpb;
2635 
2636 	for (pb = pbhead; pb; pb = nextpb) {
2637 		nextpb = pb->next;
2638 		free(pb->buf);
2639 		free(pb);
2640 	}
2641 }
2642 
/*
 * Numeric comparison routine for qsort() over an array of pid_t.
 *
 * Returns a negative, zero or positive value as *m1 is less than,
 * equal to or greater than *m2.  The result is built from comparisons
 * rather than a subtraction, which could overflow for operands of
 * opposite sign and report the wrong order.
 */
static int numericsort(const void *m1, const void *m2)
{
	const pid_t p1 = *(const pid_t *)m1;
	const pid_t p2 = *(const pid_t *)m2;

	return (p1 > p2) - (p1 < p2);
}
2651 
2652 /* Return list pids in cpuset 'path' */
cpuset_init_pidlist(const char * relpath,int recursiveflag)2653 struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath,
2654 					   int recursiveflag)
2655 {
2656 	struct pidblock *pb = NULL;
2657 	struct cpuset_pidlist *pl = NULL;
2658 	struct pidblock *pbhead = NULL;
2659 	int n;
2660 
2661 	if (check() < 0)
2662 		goto err;
2663 
2664 	if (recursiveflag) {
2665 		struct cpuset_fts_tree *cs_tree;
2666 		const struct cpuset_fts_entry *cs_entry;
2667 
2668 		if ((cs_tree = cpuset_fts_open(relpath)) == NULL)
2669 			goto err;
2670 		while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2671 			if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET)
2672 				continue;
2673 			read_task_file(cpuset_fts_get_path(cs_entry), &pbhead);
2674 		}
2675 		cpuset_fts_close(cs_tree);
2676 	} else {
2677 		read_task_file(relpath, &pbhead);
2678 	}
2679 
2680 	if ((pl = calloc(1, sizeof(*pl))) == NULL)
2681 		goto err;
2682 	pl->npids = 0;
2683 	for (pb = pbhead; pb; pb = pb->next)
2684 		pl->npids += pidcount(pb->buf, pb->buflen);
2685 	if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL)
2686 		goto err;
2687 	n = 0;
2688 	for (pb = pbhead; pb; pb = pb->next)
2689 		n = pid_transform(pb, pl, n);
2690 	free_pidblocks(pbhead);
2691 	qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort);
2692 	return pl;
2693 err:
2694 	cpuset_freepidlist(pl);
2695 	free_pidblocks(pbhead);
2696 	return NULL;
2697 }
2698 
2699 /* Return number of elements in pidlist */
cpuset_pidlist_length(const struct cpuset_pidlist * pl)2700 int cpuset_pidlist_length(const struct cpuset_pidlist *pl)
2701 {
2702 	if (pl)
2703 		return pl->npids;
2704 	else
2705 		return 0;
2706 }
2707 
2708 /* Return i'th element of pidlist */
cpuset_get_pidlist(const struct cpuset_pidlist * pl,int i)2709 pid_t cpuset_get_pidlist(const struct cpuset_pidlist * pl, int i)
2710 {
2711 	if (pl && i >= 0 && i < pl->npids)
2712 		return pl->pids[i];
2713 	else
2714 		return (pid_t) - 1;
2715 }
2716 
2717 /* Free pidlist */
cpuset_freepidlist(struct cpuset_pidlist * pl)2718 void cpuset_freepidlist(struct cpuset_pidlist *pl)
2719 {
2720 	if (pl && pl->pids)
2721 		free(pl->pids);
2722 	free(pl);
2723 }
2724 
__cpuset_move(pid_t pid,const char * path)2725 static int __cpuset_move(pid_t pid, const char *path)
2726 {
2727 	char buf[SMALL_BUFSZ];
2728 
2729 	snprintf(buf, sizeof(buf), "%u", pid);
2730 	return write_string_file(path, buf);
2731 }
2732 
2733 /* Move task (pid == 0 for current) to a cpuset */
cpuset_move(pid_t pid,const char * relpath)2734 int cpuset_move(pid_t pid, const char *relpath)
2735 {
2736 	char buf[PATH_MAX];
2737 
2738 	if (check() < 0)
2739 		return -1;
2740 
2741 	if (pid == 0)
2742 		pid = getpid();
2743 
2744 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2745 	return __cpuset_move(pid, buf);
2746 }
2747 
2748 /* Move all tasks in pidlist to a cpuset */
cpuset_move_all(struct cpuset_pidlist * pl,const char * relpath)2749 int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath)
2750 {
2751 	int i;
2752 	char buf[PATH_MAX];
2753 	int ret;
2754 
2755 	if (check() < 0)
2756 		return -1;
2757 
2758 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2759 
2760 	ret = 0;
2761 	for (i = 0; i < pl->npids; i++)
2762 		if (__cpuset_move(pl->pids[i], buf) < 0)
2763 			ret = -1;
2764 	return ret;
2765 }
2766 
2767 /*
2768  * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a
2769  *                                      cpuset to another cpuset
2770  *
2771  * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may
2772  * race with tasks being added to or forking into fromrelpath. Loop
2773  * repeatedly, reading the tasks file of cpuset fromrelpath and writing
2774  * any task pid's found there to the tasks file of cpuset torelpath,
2775  * up to ten attempts, or until the tasks file of cpuset fromrelpath
2776  * is empty, or until fromrelpath is no longer present.
2777  *
2778  * Returns 0 with errno == 0 if able to empty the tasks file of cpuset
2779  * fromrelpath. Of course it is still possible that some independent
2780  * task could add another task to cpuset fromrelpath at the same time
2781  * that such a successful result is being returned, so there can be
2782  * no guarantee that a successful return means that fromrelpath is
2783  * still empty of tasks.
2784  *
2785  * We are careful to allow for the possibility that the cpuset
2786  * fromrelpath might disappear out from under us, perhaps because it
2787  * has notify_on_release set and gets automatically removed as soon
2788  * as we detach its last task from it.  Consider a missing fromrelpath
2789  * to be a successful move.
2790  *
2791  * If called with fromrelpath and torelpath pathnames that evaluate to
2792  * the same cpuset, then treat that as if cpuset_reattach() was called,
2793  * rebinding each task in this cpuset one time, and return success or
2794  * failure depending on the return of that cpuset_reattach() call.
2795  *
2796  * On failure, returns -1, with errno possibly one of:
2797  *  EACCES - search permission denied on intervening directory
2798  *  ENOTEMPTY - tasks remain after multiple attempts to move them
2799  *  EMFILE - too many open files
2800  *  ENODEV - /dev/cpuset not mounted
2801  *  ENOENT - component of cpuset path doesn't exist
2802  *  ENOMEM - out of memory
2803  *  ENOSYS - kernel doesn't support cpusets
2804  *  ENOTDIR - component of cpuset path is not a directory
2805  *  EPERM - lacked permission to kill a task
2806  *  EPERM - lacked permission to read cpusets or files therein
2807  *
2808  * This is an [optional] function. Use cpuset_function to invoke it.
2809  */
2810 
2811 #define NUMBER_MOVE_TASK_ATTEMPTS 10
2812 
cpuset_move_cpuset_tasks(const char * fromrelpath,const char * torelpath)2813 int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
2814 {
2815 	char fromfullpath[PATH_MAX];
2816 	char tofullpath[PATH_MAX];
2817 	int i;
2818 	struct cpuset_pidlist *pl = NULL;
2819 	int sav_errno;
2820 
2821 	fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
2822 	fullpath(tofullpath, sizeof(tofullpath), torelpath);
2823 
2824 	if (samefile(fromfullpath, tofullpath))
2825 		return cpuset_reattach(fromrelpath);
2826 
2827 	for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
2828 		int plen, j;
2829 
2830 		if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
2831 			/* missing cpuset is as good as if all moved */
2832 			if (errno == ENOENT)
2833 				goto no_more_cpuset;
2834 
2835 			/* other problems reading cpuset are bad news */
2836 			sav_errno = errno;
2837 			goto failed;
2838 		}
2839 
2840 		if ((plen = cpuset_pidlist_length(pl)) == 0)
2841 			goto no_more_pids;
2842 
2843 		for (j = 0; j < plen; j++) {
2844 			pid_t pid;
2845 
2846 			pid = cpuset_get_pidlist(pl, j);
2847 			if (cpuset_move(pid, torelpath) < 0) {
2848 				/* missing task is as good as if moved */
2849 				if (errno == ESRCH)
2850 					continue;
2851 
2852 				/* other per-task errors are bad news */
2853 				sav_errno = errno;
2854 				goto failed;
2855 			}
2856 		}
2857 
2858 		cpuset_freepidlist(pl);
2859 		pl = NULL;
2860 	}
2861 
2862 	sav_errno = ENOTEMPTY;
2863 	/* fall into ... */
2864 failed:
2865 	cpuset_freepidlist(pl);
2866 	errno = sav_errno;
2867 	return -1;
2868 
2869 no_more_pids:
2870 no_more_cpuset:
2871 	/* Success - all tasks (or entire cpuset ;) gone. */
2872 	cpuset_freepidlist(pl);
2873 	errno = 0;
2874 	return 0;
2875 }
2876 
2877 /* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
cpuset_migrate(pid_t pid,const char * relpath)2878 int cpuset_migrate(pid_t pid, const char *relpath)
2879 {
2880 	char buf[PATH_MAX];
2881 	char buf2[PATH_MAX];
2882 	char memory_migrate_flag;
2883 	int r;
2884 
2885 	if (check() < 0)
2886 		return -1;
2887 
2888 	if (pid == 0)
2889 		pid = getpid();
2890 
2891 	fullpath(buf2, sizeof(buf2), relpath);
2892 
2893 	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2894 		return -1;
2895 	if (store_flag(buf2, "memory_migrate", 1) < 0)
2896 		return -1;
2897 
2898 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2899 
2900 	r = __cpuset_move(pid, buf);
2901 
2902 	store_flag(buf2, "memory_migrate", memory_migrate_flag);
2903 	return r;
2904 }
2905 
2906 /* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
cpuset_migrate_all(struct cpuset_pidlist * pl,const char * relpath)2907 int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
2908 {
2909 	int i;
2910 	char buf[PATH_MAX];
2911 	char buf2[PATH_MAX];
2912 	char memory_migrate_flag;
2913 	int ret;
2914 
2915 	if (check() < 0)
2916 		return -1;
2917 
2918 	fullpath(buf2, sizeof(buf2), relpath);
2919 
2920 	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2921 		return -1;
2922 	if (store_flag(buf2, "memory_migrate", 1) < 0)
2923 		return -1;
2924 
2925 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2926 
2927 	ret = 0;
2928 	for (i = 0; i < pl->npids; i++)
2929 		if (__cpuset_move(pl->pids[i], buf) < 0)
2930 			ret = -1;
2931 
2932 	if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
2933 		ret = -1;
2934 	return ret;
2935 }
2936 
2937 /* Rebind cpus_allowed of each task in cpuset 'path' */
cpuset_reattach(const char * relpath)2938 int cpuset_reattach(const char *relpath)
2939 {
2940 	struct cpuset_pidlist *pl;
2941 	int rc;
2942 
2943 	if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL)
2944 		return -1;
2945 	rc = cpuset_move_all(pl, relpath);
2946 	cpuset_freepidlist(pl);
2947 	return rc;
2948 }
2949 
2950 /* Map cpuset relative cpu number to system wide cpu number */
cpuset_c_rel_to_sys_cpu(const struct cpuset * cp,int cpu)2951 int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
2952 {
2953 	struct cpuset *cp_tofree = NULL;
2954 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2955 	int pos = -1;
2956 
2957 	if (!cp1)
2958 		goto err;
2959 	pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
2960 	/* fall into ... */
2961 err:
2962 	cpuset_free(cp_tofree);
2963 	return pos;
2964 }
2965 
2966 /* Map system wide cpu number to cpuset relative cpu number */
cpuset_c_sys_to_rel_cpu(const struct cpuset * cp,int cpu)2967 int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
2968 {
2969 	struct cpuset *cp_tofree = NULL;
2970 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2971 	int pos = -1;
2972 
2973 	if (!cp1)
2974 		goto err;
2975 	pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
2976 	/* fall into ... */
2977 err:
2978 	cpuset_free(cp_tofree);
2979 	return pos;
2980 }
2981 
2982 /* Map cpuset relative mem number to system wide mem number */
cpuset_c_rel_to_sys_mem(const struct cpuset * cp,int mem)2983 int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
2984 {
2985 	struct cpuset *cp_tofree = NULL;
2986 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2987 	int pos = -1;
2988 
2989 	if (!cp1)
2990 		goto err;
2991 	pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
2992 	/* fall into ... */
2993 err:
2994 	cpuset_free(cp_tofree);
2995 	return pos;
2996 }
2997 
2998 /* Map system wide mem number to cpuset relative mem number */
cpuset_c_sys_to_rel_mem(const struct cpuset * cp,int mem)2999 int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
3000 {
3001 	struct cpuset *cp_tofree = NULL;
3002 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
3003 	int pos = -1;
3004 
3005 	if (!cp1)
3006 		goto err;
3007 	pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
3008 	/* fall into ... */
3009 err:
3010 	cpuset_free(cp_tofree);
3011 	return pos;
3012 }
3013 
3014 /* Map pid's cpuset relative cpu number to system wide cpu number */
cpuset_p_rel_to_sys_cpu(pid_t pid,int cpu)3015 int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
3016 {
3017 	struct cpuset *cp;
3018 	int rc = -1;
3019 
3020 	if ((cp = cpuset_alloc()) == NULL)
3021 		goto done;
3022 	if (cpuset_cpusetofpid(cp, pid) < 0)
3023 		goto done;
3024 	rc = cpuset_c_rel_to_sys_cpu(cp, cpu);
3025 done:
3026 	cpuset_free(cp);
3027 	return rc;
3028 }
3029 
3030 /* Map system wide cpu number to pid's cpuset relative cpu number */
cpuset_p_sys_to_rel_cpu(pid_t pid,int cpu)3031 int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
3032 {
3033 	struct cpuset *cp;
3034 	int rc = -1;
3035 
3036 	if ((cp = cpuset_alloc()) == NULL)
3037 		goto done;
3038 	if (cpuset_cpusetofpid(cp, pid) < 0)
3039 		goto done;
3040 	rc = cpuset_c_sys_to_rel_cpu(cp, cpu);
3041 done:
3042 	cpuset_free(cp);
3043 	return rc;
3044 }
3045 
3046 /* Map pid's cpuset relative mem number to system wide mem number */
cpuset_p_rel_to_sys_mem(pid_t pid,int mem)3047 int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
3048 {
3049 	struct cpuset *cp;
3050 	int rc = -1;
3051 
3052 	if ((cp = cpuset_alloc()) == NULL)
3053 		goto done;
3054 	if (cpuset_cpusetofpid(cp, pid) < 0)
3055 		goto done;
3056 	rc = cpuset_c_rel_to_sys_mem(cp, mem);
3057 done:
3058 	cpuset_free(cp);
3059 	return rc;
3060 }
3061 
/*
 * Map system wide mem number to pid's cpuset relative mem number.
 *
 * Looks up the cpuset of task pid into a temporary struct cpuset and
 * delegates to cpuset_c_sys_to_rel_mem().  Returns -1 if the temporary
 * cpuset cannot be allocated or the lookup fails.
 */
int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
{
	struct cpuset *pidset = cpuset_alloc();
	int relmem = -1;

	if (pidset != NULL && cpuset_cpusetofpid(pidset, pid) >= 0)
		relmem = cpuset_c_sys_to_rel_mem(pidset, mem);

	cpuset_free(pidset);
	return relmem;
}
3077 
3078 /*
3079  * Override glibc's calls for get/set affinity - they have
3080  * something using cpu_set_t that will die when NR_CPUS > 1024.
3081  * Go directly to the 'real' system calls.  Also override calls
3082  * for get_mempolicy and set_mempolicy.  None of these
3083  * calls are yet (July 2004) guaranteed to be in all glibc versions
3084  * that we care about.
3085  */
3086 
sched_setaffinity(pid_t pid,unsigned len,unsigned long * mask)3087 static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
3088 {
3089 	return ltp_syscall(__NR_sched_setaffinity, pid, len, mask);
3090 }
3091 
#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
/* Issue the get_mempolicy system call directly through ltp_syscall() */
static int get_mempolicy(int *policy, unsigned long *nmask,
			 unsigned long maxnode, void *addr, int flags)
{
	int rc;

	rc = ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode, addr,
			 flags);
	return rc;
}
#endif
3100 
#if HAVE_DECL_MPOL_BIND || HAVE_DECL_MPOL_DEFAULT
/* Issue the set_mempolicy system call directly through ltp_syscall() */
static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
{
	int rc;

	rc = ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode);
	return rc;
}
#endif
3107 
/*
 * Snapshot of a task's cpuset placement, as filled in by
 * cpuset_get_placement() and released by cpuset_free_placement().
 */
struct cpuset_placement {
	struct bitmask *cpus;	/* copy of the cpuset's cpus mask */
	struct bitmask *mems;	/* copy of the cpuset's mems mask */
	char *path;		/* heap copy of the task's cpuset path */
};
3113 
3114 /* Allocate and fill in a placement struct - cpatures current placement */
cpuset_get_placement(pid_t pid)3115 struct cpuset_placement *cpuset_get_placement(pid_t pid)
3116 {
3117 	struct cpuset_placement *plc;
3118 	struct cpuset *cp = NULL;
3119 	char buf[PATH_MAX];
3120 	int nbits;
3121 
3122 	if ((plc = calloc(1, sizeof(*plc))) == NULL)
3123 		goto err;
3124 
3125 	nbits = cpuset_cpus_nbits();
3126 	if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
3127 		goto err;
3128 
3129 	nbits = cpuset_mems_nbits();
3130 	if ((plc->mems = bitmask_alloc(nbits)) == NULL)
3131 		goto err;
3132 
3133 	if ((cp = cpuset_alloc()) == NULL)
3134 		goto err;
3135 	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
3136 		goto err;
3137 	if (cpuset_query(cp, buf) < 0)
3138 		goto err;
3139 
3140 	bitmask_copy(plc->cpus, cp->cpus);
3141 	bitmask_copy(plc->mems, cp->mems);
3142 	plc->path = strdup(buf);
3143 
3144 	cpuset_free(cp);
3145 	return plc;
3146 err:
3147 	cpuset_free(cp);
3148 	cpuset_free_placement(plc);
3149 	return NULL;
3150 }
3151 
3152 /* Compare two placement structs - use to detect changes in placement */
cpuset_equal_placement(const struct cpuset_placement * plc1,const struct cpuset_placement * plc2)3153 int cpuset_equal_placement(const struct cpuset_placement *plc1,
3154 			   const struct cpuset_placement *plc2)
3155 {
3156 	return bitmask_equal(plc1->cpus, plc2->cpus) &&
3157 	    bitmask_equal(plc1->mems, plc2->mems) &&
3158 	    streq(plc1->path, plc2->path);
3159 }
3160 
3161 /* Free a placement struct */
cpuset_free_placement(struct cpuset_placement * plc)3162 void cpuset_free_placement(struct cpuset_placement *plc)
3163 {
3164 	if (!plc)
3165 		return;
3166 	bitmask_free(plc->cpus);
3167 	bitmask_free(plc->mems);
3168 	free(plc->path);
3169 	free(plc);
3170 }
3171 
3172 /*
3173  * A cpuset_fts_open() call constructs a linked list of entries
3174  * called a "cpuset_fts_tree", with one entry per cpuset below
3175  * the specified path.  The cpuset_fts_read() routine returns the
3176  * next entry on this list.  The various cpuset_fts_get_*() calls
3177  * return attributes of the specified entry.  The cpuset_fts_close()
3178  * call frees the linked list and all associated data.  All cpuset
3179  * entries and attributes for the cpuset_fts_tree returned from a
3180  * given cpuset_fts_open() call remain allocated and unchanged until
3181  * that cpuset_fts_tree is closed by a cpuset_fts_close() call.  Any
3182  * subsequent changes to the cpuset filesystem will go unnoticed
3183  * (not affect open cpuset_fts_tree's.)
3184  */
3185 
/* Forward declarations needed by the definitions that follow */
struct cpuset_fts_entry;
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);

/* Handle on one opened cpuset hierarchy: a singly linked list plus cursor */
struct cpuset_fts_tree {
	struct cpuset_fts_entry *head;	/* first entry of the linked list */
	struct cpuset_fts_entry *next;	/* entry cpuset_fts_read() returns next */
};
3193 
/* One cpuset found while walking the hierarchy in cpuset_fts_open() */
struct cpuset_fts_entry {
	struct cpuset_fts_entry *next;	/* following entry in the list, or NULL */
	struct cpuset *cpuset;		/* queried cpuset settings, if allocated */
	struct stat *stat;		/* stat(2) of the cpuset directory, if allocated */
	char *path;			/* heap copy of the cpuset path */
	int info;			/* one of the CPUSET_FTS_* codes */
	int err;			/* errno value recorded with an error code */
};
3202 
3203 /* Open a handle on a cpuset hierarchy.  All the real work is done here. */
cpuset_fts_open(const char * cpusetpath)3204 struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
3205 {
3206 	FTS *fts = NULL;
3207 	FTSENT *ftsent;
3208 	char *path_argv[2];
3209 	char buf[PATH_MAX];
3210 	struct cpuset_fts_tree *cs_tree = NULL;
3211 	struct cpuset_fts_entry *ep;	/* the latest new list entry */
3212 	struct cpuset_fts_entry **pnlep;	/* ptr to next list entry ptr */
3213 	char *relpath;
3214 	int fts_flags;
3215 
3216 	fullpath(buf, sizeof(buf), cpusetpath);
3217 	path_argv[0] = buf;
3218 	path_argv[1] = NULL;
3219 
3220 	fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
3221 	fts = fts_open(path_argv, fts_flags, NULL);
3222 	if (fts == NULL)
3223 		goto err;
3224 
3225 	cs_tree = malloc(sizeof(*cs_tree));
3226 	if (cs_tree == NULL)
3227 		goto err;
3228 	pnlep = &cs_tree->head;
3229 	*pnlep = NULL;
3230 
3231 	while ((ftsent = fts_read(fts)) != NULL) {
3232 		if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
3233 			continue;
3234 
3235 		/* ftsent is a directory (perhaps unreadable) ==> cpuset */
3236 		ep = calloc(1, sizeof(*ep));
3237 		if (ep == NULL)
3238 			goto err;
3239 		*pnlep = ep;
3240 		pnlep = &ep->next;
3241 
3242 		/* Set entry's path, and if DNR, error */
3243 		relpath = ftsent->fts_path + strlen(cpusetmnt);
3244 		if (strlen(relpath) == 0)
3245 			relpath = "/";
3246 		ep->path = strdup(relpath);
3247 		if (ep->path == NULL)
3248 			goto err;
3249 		if (ftsent->fts_info == FTS_DNR) {
3250 			ep->info = CPUSET_FTS_ERR_DNR;
3251 			ep->err = ftsent->fts_errno;
3252 			continue;
3253 		}
3254 
3255 		/* ftsent is a -readable- cpuset: set entry's stat, etc */
3256 		ep->stat = calloc(1, sizeof(struct stat));
3257 		if (ep->stat == NULL)
3258 			goto err;
3259 		if (stat(ftsent->fts_path, ep->stat) < 0) {
3260 			ep->info = CPUSET_FTS_ERR_STAT;
3261 			ep->err = ftsent->fts_errno;
3262 			continue;
3263 		}
3264 
3265 		ep->cpuset = calloc(1, sizeof(struct cpuset));
3266 		if (ep->cpuset == NULL)
3267 			goto err;
3268 		if (cpuset_query(ep->cpuset, relpath) < 0) {
3269 			ep->info = CPUSET_FTS_ERR_CPUSET;
3270 			ep->err = errno;
3271 			continue;
3272 		}
3273 		ep->info = CPUSET_FTS_CPUSET;
3274 	}
3275 
3276 	(void)fts_close(fts);
3277 	cpuset_fts_rewind(cs_tree);
3278 	return cs_tree;
3279 
3280 err:
3281 	if (cs_tree)
3282 		cpuset_fts_close(cs_tree);
3283 	if (fts)
3284 		(void)fts_close(fts);
3285 	return NULL;
3286 }
3287 
3288 /* Return pointer to next cpuset entry in hierarchy */
cpuset_fts_read(struct cpuset_fts_tree * cs_tree)3289 const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
3290 {
3291 	const struct cpuset_fts_entry *cs_entry = cs_tree->next;
3292 	if (cs_tree->next != NULL)	/* seek to next entry */
3293 		cs_tree->next = cs_tree->next->next;
3294 	return cs_entry;
3295 }
3296 
3297 /* Reverse list of cpusets, in place.  Simulates pre-order/post-order flip. */
cpuset_fts_reverse(struct cpuset_fts_tree * cs_tree)3298 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
3299 {
3300 	struct cpuset_fts_entry *cs1, *cs2, *cs3;
3301 
3302 	/*
3303 	 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
3304 	 * is redirected from cs3 to cs1.
3305 	 */
3306 
3307 	cs1 = cs2 = NULL;
3308 	cs3 = cs_tree->head;
3309 	while (cs3) {
3310 		cs1 = cs2;
3311 		cs2 = cs3;
3312 		cs3 = cs3->next;
3313 		cs2->next = cs1;
3314 	}
3315 	cs_tree->head = cs2;
3316 	cpuset_fts_rewind(cs_tree);
3317 }
3318 
3319 /* Rewind cpuset list to beginning */
cpuset_fts_rewind(struct cpuset_fts_tree * cs_tree)3320 void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
3321 {
3322 	cs_tree->next = cs_tree->head;
3323 }
3324 
3325 /* Return pointer to nul-terminated cpuset path of entry in hierarchy */
cpuset_fts_get_path(const struct cpuset_fts_entry * cs_entry)3326 const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
3327 {
3328 	return cs_entry->path;
3329 }
3330 
3331 /* Return pointer to stat(2) structure of a cpuset entry's directory */
cpuset_fts_get_stat(const struct cpuset_fts_entry * cs_entry)3332 const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
3333 {
3334 	return cs_entry->stat;
3335 }
3336 
3337 /* Return pointer to cpuset structure of a cpuset entry */
cpuset_fts_get_cpuset(const struct cpuset_fts_entry * cs_entry)3338 const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
3339 					   *cs_entry)
3340 {
3341 	return cs_entry->cpuset;
3342 }
3343 
3344 /* Return value of errno (0 if no error) on attempted cpuset operations */
cpuset_fts_get_errno(const struct cpuset_fts_entry * cs_entry)3345 int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry)
3346 {
3347 	return cs_entry->err;
3348 }
3349 
3350 /* Return operation identity causing error */
cpuset_fts_get_info(const struct cpuset_fts_entry * cs_entry)3351 int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry)
3352 {
3353 	return cs_entry->info;
3354 }
3355 
3356 /* Close a cpuset hierarchy handle (free's all associated memory) */
cpuset_fts_close(struct cpuset_fts_tree * cs_tree)3357 void cpuset_fts_close(struct cpuset_fts_tree *cs_tree)
3358 {
3359 	struct cpuset_fts_entry *cs_entry = cs_tree->head;
3360 
3361 	while (cs_entry) {
3362 		struct cpuset_fts_entry *ep = cs_entry;
3363 
3364 		cs_entry = cs_entry->next;
3365 		free(ep->path);
3366 		free(ep->stat);
3367 		cpuset_free(ep->cpuset);
3368 		free(ep);
3369 	}
3370 	free(cs_tree);
3371 }
3372 
/*
 * Bind current task to cpu (uses sched_setaffinity(2)).
 *
 * Builds a mask of cpuset_cpus_nbits() bits with only bit 'cpu' set and
 * applies it to the calling task.  Returns -1 if the mask cannot be
 * allocated, otherwise the result of the affinity call.
 */
int cpuset_cpubind(int cpu)
{
	struct bitmask *mask = bitmask_alloc(cpuset_cpus_nbits());
	int rc;

	if (mask == NULL)
		return -1;

	bitmask_setbit(mask, cpu);
	rc = sched_setaffinity(0, bitmask_nbytes(mask), bitmask_mask(mask));
	bitmask_free(mask);
	return rc;
}
3386 
3387 /*
3388  * int cpuset_latestcpu(pid_t pid)
3389  *
3390  * Return most recent CPU on which task pid executed.  If pid == 0,
3391  * examine current task.
3392  *
3393  * The last used CPU is visible for a given pid as field #39 (starting
3394  * with #1) in the file /proc/pid/stat.  Currently this file has 41
3395  * fields, in which case this is the 3rd to the last field.
3396  *
3397  * Unfortunately field #2 is a command name and might have embedded
3398  * whitespace.  So we can't just count white space separated fields.
3399  * Fortunately, this command name is surrounded by parentheses, as
3400  * for example "(sh)", and that closing parenthesis is the last ')'
3401  * character in the line.  No remaining fields can have embedded
3402  * whitespace or parentheses.  So instead of looking for the 39th
3403  * white space separated field, we can look for the 37th white space
3404  * separated field past the last ')' character on the line.
3405  */
3406 
3407 /* Return most recent CPU on which task pid executed */
cpuset_latestcpu(pid_t pid)3408 int cpuset_latestcpu(pid_t pid)
3409 {
3410 	char buf[PATH_MAX];
3411 	char *bp;
3412 	int fd = -1;
3413 	int cpu = -1;
3414 
3415 	if (pid == 0)
3416 		snprintf(buf, sizeof(buf), "/proc/self/stat");
3417 	else
3418 		snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
3419 
3420 	if ((fd = open(buf, O_RDONLY)) < 0)
3421 		goto err;
3422 	if (read(fd, buf, sizeof(buf)) < 1)
3423 		goto err;
3424 	close(fd);
3425 
3426 	bp = strrchr(buf, ')');
3427 	if (bp)
3428 		sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %u",	/* 37th field past ')' */
3429 		       &cpu);
3430 	if (cpu < 0)
3431 		errno = EINVAL;
3432 	return cpu;
3433 err:
3434 	if (fd >= 0)
3435 		close(fd);
3436 	return -1;
3437 }
3438 
/*
 * Bind current task to memory (uses set_mempolicy(2)).
 *
 * Builds a mask of cpuset_mems_nbits() bits with only bit 'mem' set and
 * installs it with MPOL_BIND.  Returns -1 if the mask cannot be
 * allocated; when MPOL_BIND is not available at build time, returns -1
 * with errno set to ENOSYS.
 */
int cpuset_membind(int mem)
{
	struct bitmask *nodes = bitmask_alloc(cpuset_mems_nbits());
	int rc = -1;

	if (nodes == NULL)
		return -1;

	bitmask_setbit(nodes, mem);
#if HAVE_DECL_MPOL_BIND
	rc = set_mempolicy(MPOL_BIND, bitmask_mask(nodes),
			   bitmask_nbits(nodes) + 1);
#else
	errno = ENOSYS;
#endif
	bitmask_free(nodes);
	return rc;
}
3457 
/*
 * [optional] Return Memory Node holding page at specified addr.
 *
 * Returns -1 if the get_mempolicy() query fails, or if the MPOL_F_ADDR
 * and MPOL_F_NODE flags were not available at build time.
 */
int cpuset_addr2node(void *addr)
{
#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
	int node = -1;

	/* On failure report -1 no matter what was left in node */
	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) != 0)
		return -1;
	return node;
#else
	return -1;
#endif
}
3472 
3473 /*
3474  * Transform cpuset into Text Format Representation in buffer 'buf',
3475  * of length 'buflen', nul-terminated if space allows.  Return number
3476  * of characters that would have been written, if enough space had
3477  * been available, in the same way that snprintf() does.
3478  */
3479 
3480 /* Export cpuset settings to a regular file */
cpuset_export(const struct cpuset * cp,char * buf,int buflen)3481 int cpuset_export(const struct cpuset *cp, char *buf, int buflen)
3482 {
3483 	char *tmp = NULL;
3484 	int n = 0;
3485 
3486 	if (cp->cpu_exclusive)
3487 		n += snprintf(buf + n, max(buflen - n, 0), "cpu_exclusive\n");
3488 
3489 	if (cp->mem_exclusive)
3490 		n += snprintf(buf + n, max(buflen - n, 0), "mem_exclusive\n");
3491 
3492 	if (cp->notify_on_release)
3493 		n += snprintf(buf + n, max(buflen - n, 0),
3494 			      "notify_on_release\n");
3495 
3496 	if (cp->memory_pressure_enabled)
3497 		n += snprintf(buf + n, max(buflen - n, 0),
3498 			      "memory_pressure_enabled\n");
3499 
3500 	if (cp->memory_migrate)
3501 		n += snprintf(buf + n, max(buflen - n, 0), "memory_migrate\n");
3502 
3503 	if (cp->memory_spread_page)
3504 		n += snprintf(buf + n, max(buflen - n, 0),
3505 			      "memory_spread_page\n");
3506 
3507 	if (cp->memory_spread_slab)
3508 		n += snprintf(buf + n, max(buflen - n, 0),
3509 			      "memory_spread_slab\n");
3510 
3511 	if ((tmp = sprint_mask_buf(cp->cpus)) == NULL)
3512 		return -1;
3513 	n += snprintf(buf + n, max(buflen - n, 0), "cpus %s\n", tmp);
3514 	free(tmp);
3515 	tmp = NULL;
3516 
3517 	if ((tmp = sprint_mask_buf(cp->mems)) == NULL)
3518 		return -1;
3519 	n += snprintf(buf + n, max(buflen - n, 0), "mems %s\n", tmp);
3520 	free(tmp);
3521 	tmp = NULL;
3522 
3523 	return n;
3524 }
3525 
/*
 * Parse the list argument of a "cpus"/"mems" import line into bmp.
 *
 * tok is the keyword of the line (currently unused), arg the list text
 * that followed it, or NULL if the line had no argument.  On failure
 * returns -1 and, if emsg is non-NULL, writes a message of at most
 * elen bytes into it.  Returns 0 on success.
 */
static int import_list(UNUSED const char *tok, const char *arg,
		       struct bitmask *bmp, char *emsg, int elen)
{
	/* A keyword with nothing after it: don't hand NULL to the parser */
	if (arg == NULL) {
		if (emsg)
			snprintf(emsg, elen, "Missing list argument");
		return -1;
	}
	if (bitmask_parselist(arg, bmp) < 0) {
		if (emsg)
			snprintf(emsg, elen, "Invalid list format: %s", arg);
		return -1;
	}
	return 0;
}
3536 
/* Convert the nul-terminated string s to lower case, in place */
static void stolower(char *s)
{
	char *p;

	/* Go through unsigned char: tolower() must not see negative values */
	for (p = s; *p != '\0'; p++)
		*p = tolower((unsigned char)*p);
}
3545 
3546 /* Import cpuset settings from a regular file */
cpuset_import(struct cpuset * cp,const char * buf,int * elinenum,char * emsg,int elen)3547 int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
3548 		  char *emsg, int elen)
3549 {
3550 	char *linebuf = NULL;
3551 	int linebuflen;
3552 	int linenum = 0;
3553 	int offset = 0;
3554 
3555 	linebuflen = strlen(buf) + 1;
3556 	if ((linebuf = malloc(linebuflen)) == NULL) {
3557 		if (emsg)
3558 			snprintf(emsg, elen, "Insufficient memory");
3559 		goto err;
3560 	}
3561 
3562 	while (slgets(linebuf, linebuflen, buf, &offset)) {
3563 		char *tok, *arg;
3564 		char *ptr;	/* for strtok_r */
3565 
3566 		linenum++;
3567 		if ((tok = strchr(linebuf, '#')) != NULL)
3568 			*tok = 0;
3569 		if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
3570 			continue;
3571 		stolower(tok);
3572 
3573 		arg = strtok_r(0, " \t", &ptr);
3574 
3575 		if (streq(tok, "cpu_exclusive")) {
3576 			cp->cpu_exclusive = 1;
3577 			goto eol;
3578 		}
3579 		if (streq(tok, "mem_exclusive")) {
3580 			cp->mem_exclusive = 1;
3581 			goto eol;
3582 		}
3583 		if (streq(tok, "notify_on_release")) {
3584 			cp->notify_on_release = 1;
3585 			goto eol;
3586 		}
3587 		if (streq(tok, "memory_pressure_enabled")) {
3588 			cp->memory_pressure_enabled = 1;
3589 			goto eol;
3590 		}
3591 		if (streq(tok, "memory_migrate")) {
3592 			cp->memory_migrate = 1;
3593 			goto eol;
3594 		}
3595 		if (streq(tok, "memory_spread_page")) {
3596 			cp->memory_spread_page = 1;
3597 			goto eol;
3598 		}
3599 		if (streq(tok, "memory_spread_slab")) {
3600 			cp->memory_spread_slab = 1;
3601 			goto eol;
3602 		}
3603 		if (streq(tok, "cpu") || streq(tok, "cpus")) {
3604 			if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
3605 				goto err;
3606 			goto eol;
3607 		}
3608 		if (streq(tok, "mem") || streq(tok, "mems")) {
3609 			if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
3610 				goto err;
3611 			goto eol;
3612 		}
3613 		if (emsg)
3614 			snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
3615 		goto err;
3616 eol:
3617 		if ((tok = strtok_r(0, " \t", &ptr)) != NULL) {
3618 			if (emsg)
3619 				snprintf(emsg, elen, "Surplus token: '%s'",
3620 					 tok);
3621 			goto err;
3622 		}
3623 		continue;
3624 	}
3625 
3626 	free(linebuf);
3627 
3628 	if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
3629 		cpuset_localcpus(cp->mems, cp->cpus);
3630 	else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
3631 		cpuset_localmems(cp->cpus, cp->mems);
3632 
3633 	/*
3634 	 * All cpuset attributes are determined in an import.
3635 	 * Those that aren't explicitly specified are presumed
3636 	 * to be unchanged (zero, if it's a freshly allocated
3637 	 * struct cpuset.)
3638 	 */
3639 
3640 	cp->cpus_valid = 1;
3641 	cp->mems_valid = 1;
3642 	cp->cpu_exclusive_valid = 1;
3643 	cp->mem_exclusive_valid = 1;
3644 	cp->notify_on_release_valid = 1;
3645 	cp->memory_migrate_valid = 1;
3646 	cp->memory_pressure_enabled_valid = 1;
3647 	cp->memory_spread_page_valid = 1;
3648 	cp->memory_spread_slab_valid = 1;
3649 
3650 	return 0;
3651 err:
3652 	if (elinenum)
3653 		*elinenum = linenum;
3654 	free(linebuf);
3655 	return -1;
3656 }
3657 
/*
 * Pin current task to the CPU with cpuset-relative number relcpu.
 *
 * The task is first unpinned, then relcpu is translated to a system
 * wide CPU number and the task is bound to it.  The task's placement is
 * sampled before and after; if it changed in between, the whole
 * sequence is repeated so the translation is not based on a stale
 * cpuset.
 *
 * Returns 0 on success, -1 if check() fails, the placement cannot be
 * obtained, or any of the unpin/translate/bind steps fails.
 */
int cpuset_pin(int relcpu)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int cpu, r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);
		if (plc1 == NULL) {
			/* Can't compare placements: give up, don't crash */
			r = -1;
			break;
		}

		r = 0;
		if (cpuset_unpin() < 0)
			r = -1;
		cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
		/* A negative cpu means the translation failed: don't bind */
		if (cpu < 0 || cpuset_cpubind(cpu) < 0)
			r = -1;

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
		if (plc2 == NULL) {
			r = -1;
			break;
		}
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}
3686 
/*
 * Return number of CPUs in current task's cpuset.
 *
 * The task's placement is sampled before and after the count is taken;
 * if it changed in between, the count is taken again.
 *
 * Returns the value of cpuset_cpus_weight(0), or -1 if check() fails
 * or the placement cannot be obtained.
 */
int cpuset_size(void)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);
		if (plc1 == NULL) {
			/* Can't compare placements: give up, don't crash */
			r = -1;
			break;
		}

		r = cpuset_cpus_weight(0);

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
		if (plc2 == NULL) {
			r = -1;
			break;
		}
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}
3710 
/*
 * Return relative CPU number, within current cpuset, last executed on.
 *
 * The CPU reported by cpuset_latestcpu(0) is translated to a cpuset
 * relative number.  The task's placement is sampled before and after;
 * if it changed in between, the lookup is repeated.
 *
 * Returns the translated CPU number, or -1 if check() fails or the
 * placement cannot be obtained.
 */
int cpuset_where(void)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);
		if (plc1 == NULL) {
			/* Can't compare placements: give up, don't crash */
			r = -1;
			break;
		}

		r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
		if (plc2 == NULL) {
			r = -1;
			break;
		}
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}
3734 
/*
 * Undo cpuset_pin - let current task have the run of all CPUs in its cpuset.
 *
 * Sets the task's CPU affinity to a mask with every bit set and resets
 * its memory policy to MPOL_DEFAULT.
 *
 * Returns 0 on success, -1 on failure.  When MPOL_DEFAULT is not
 * available at build time the memory policy cannot be reset, and -1 is
 * returned with errno set to ENOSYS.
 */
int cpuset_unpin(void)
{
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	/*
	 * Don't need cpuset_*_placement() guard against concurrent
	 * cpuset migration, because none of the following depends
	 * on the tasks cpuset placement.
	 */

	if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	bitmask_setall(cpus);
	if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
		goto err;

	if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;
#if HAVE_DECL_MPOL_DEFAULT
	if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
			  bitmask_nbits(mems) + 1) < 0)
		goto err;
	r = 0;
#else
	/* Same convention as cpuset_membind() when the policy is missing */
	errno = ENOSYS;
#endif
	/* fall into ... */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;
}
3771 
3772 struct cpuset_function_list {
3773 	const char *fname;
3774 	void *func;
3775 } flist[] = {
3776 	{
3777 	"cpuset_version", cpuset_version}, {
3778 	"cpuset_alloc", cpuset_alloc}, {
3779 	"cpuset_free", cpuset_free}, {
3780 	"cpuset_cpus_nbits", cpuset_cpus_nbits}, {
3781 	"cpuset_mems_nbits", cpuset_mems_nbits}, {
3782 	"cpuset_setcpus", cpuset_setcpus}, {
3783 	"cpuset_setmems", cpuset_setmems}, {
3784 	"cpuset_set_iopt", cpuset_set_iopt}, {
3785 	"cpuset_set_sopt", cpuset_set_sopt}, {
3786 	"cpuset_getcpus", cpuset_getcpus}, {
3787 	"cpuset_getmems", cpuset_getmems}, {
3788 	"cpuset_cpus_weight", cpuset_cpus_weight}, {
3789 	"cpuset_mems_weight", cpuset_mems_weight}, {
3790 	"cpuset_get_iopt", cpuset_get_iopt}, {
3791 	"cpuset_get_sopt", cpuset_get_sopt}, {
3792 	"cpuset_localcpus", cpuset_localcpus}, {
3793 	"cpuset_localmems", cpuset_localmems}, {
3794 	"cpuset_cpumemdist", cpuset_cpumemdist}, {
3795 	"cpuset_cpu2node", cpuset_cpu2node}, {
3796 	"cpuset_addr2node", cpuset_addr2node}, {
3797 	"cpuset_create", cpuset_create}, {
3798 	"cpuset_delete", cpuset_delete}, {
3799 	"cpuset_query", cpuset_query}, {
3800 	"cpuset_modify", cpuset_modify}, {
3801 	"cpuset_getcpusetpath", cpuset_getcpusetpath}, {
3802 	"cpuset_cpusetofpid", cpuset_cpusetofpid}, {
3803 	"cpuset_mountpoint", cpuset_mountpoint}, {
3804 	"cpuset_collides_exclusive", cpuset_collides_exclusive}, {
3805 	"cpuset_nuke", cpuset_nuke}, {
3806 	"cpuset_init_pidlist", cpuset_init_pidlist}, {
3807 	"cpuset_pidlist_length", cpuset_pidlist_length}, {
3808 	"cpuset_get_pidlist", cpuset_get_pidlist}, {
3809 	"cpuset_freepidlist", cpuset_freepidlist}, {
3810 	"cpuset_move", cpuset_move}, {
3811 	"cpuset_move_all", cpuset_move_all}, {
3812 	"cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks}, {
3813 	"cpuset_migrate", cpuset_migrate}, {
3814 	"cpuset_migrate_all", cpuset_migrate_all}, {
3815 	"cpuset_reattach", cpuset_reattach}, {
3816 	"cpuset_open_memory_pressure", cpuset_open_memory_pressure}, {
3817 	"cpuset_read_memory_pressure", cpuset_read_memory_pressure}, {
3818 	"cpuset_close_memory_pressure", cpuset_close_memory_pressure}, {
3819 	"cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu}, {
3820 	"cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu}, {
3821 	"cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem}, {
3822 	"cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem}, {
3823 	"cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu}, {
3824 	"cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu}, {
3825 	"cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem}, {
3826 	"cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem}, {
3827 	"cpuset_get_placement", cpuset_get_placement}, {
3828 	"cpuset_equal_placement", cpuset_equal_placement}, {
3829 	"cpuset_free_placement", cpuset_free_placement}, {
3830 	"cpuset_fts_open", cpuset_fts_open}, {
3831 	"cpuset_fts_read", cpuset_fts_read}, {
3832 	"cpuset_fts_reverse", cpuset_fts_reverse}, {
3833 	"cpuset_fts_rewind", cpuset_fts_rewind}, {
3834 	"cpuset_fts_get_path", cpuset_fts_get_path}, {
3835 	"cpuset_fts_get_stat", cpuset_fts_get_stat}, {
3836 	"cpuset_fts_get_cpuset", cpuset_fts_get_cpuset}, {
3837 	"cpuset_fts_get_errno", cpuset_fts_get_errno}, {
3838 	"cpuset_fts_get_info", cpuset_fts_get_info}, {
3839 	"cpuset_fts_close", cpuset_fts_close}, {
3840 	"cpuset_cpubind", cpuset_cpubind}, {
3841 	"cpuset_latestcpu", cpuset_latestcpu}, {
3842 	"cpuset_membind", cpuset_membind}, {
3843 	"cpuset_export", cpuset_export}, {
3844 	"cpuset_import", cpuset_import}, {
3845 	"cpuset_function", cpuset_function}, {
3846 	"cpuset_pin", cpuset_pin}, {
3847 	"cpuset_size", cpuset_size}, {
3848 	"cpuset_where", cpuset_where}, {
3849 "cpuset_unpin", cpuset_unpin},};
3850 
3851 /* Return pointer to a libcpuset.so function, or NULL */
cpuset_function(const char * function_name)3852 void *cpuset_function(const char *function_name)
3853 {
3854 	unsigned int i;
3855 
3856 	for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++)
3857 		if (streq(function_name, flist[i].fname))
3858 			return flist[i].func;
3859 	return NULL;
3860 }
3861 
/* Fortran interface to basic cpuset routines */

/* cpuset_pin() taking the relative CPU number through a pointer */
int cpuset_pin_(int *ptr_relcpu)
{
	int relcpu = *ptr_relcpu;

	return cpuset_pin(relcpu);
}
3867 
/* Trailing-underscore alias that forwards to cpuset_size() */
int cpuset_size_(void)
{
	int ncpus = cpuset_size();

	return ncpus;
}
3872 
/* Trailing-underscore alias that forwards to cpuset_where() */
int cpuset_where_(void)
{
	int relcpu = cpuset_where();

	return relcpu;
}
3877 
/* Trailing-underscore alias that forwards to cpuset_unpin() */
int cpuset_unpin_(void)
{
	int rc = cpuset_unpin();

	return rc;
}
3882 
3883 #endif /* HAVE_LINUX_MEMPOLICY_H */
3884