libcpuset.c revision 359980f68b19c77c698b121b57a071dfe6e3ca31
/*
 * cpuset user library implementation.
 *
 * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved.
 *
 * Paul Jackson <pj@sgi.com>
 */

/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation; either version 2.1 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#define _XOPEN_SOURCE 500	/* need to see pread() */
#define _BSD_SOURCE 1		/* need to see syscall() */
#include <unistd.h>

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fts.h>
#include <limits.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <sys/utsname.h>	/* for cpuset_would_crash_kernel() */

#include "bitmask.h"
#include "cpuset.h"
#include "common.h"
#include "test.h"
#include "linux_syscall_numbers.h"
#include "config.h"
#if HAVE_LINUX_MEMPOLICY_H
#include <linux/mempolicy.h>

/* Bump version, and update Change History, when libcpuset API changes */
#define CPUSET_VERSION 3

/*
 * For a history of what changed in each version, see the "Change
 * History" section, at the end of the libcpuset master document.
 */

int cpuset_version(void)
{
	return CPUSET_VERSION;
}
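
/*
 * Illustrative usage sketch (comment only, not compiled): create a
 * child cpuset holding just CPU 0, using calls defined in this
 * library.  Error handling is abbreviated, "demo" is a made-up name,
 * and a real caller would normally also set 'mems':
 *
 *	struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *	struct cpuset *cp = cpuset_alloc();
 *
 *	bitmask_setbit(cpus, 0);
 *	cpuset_setcpus(cp, cpus);
 *	if (cpuset_create("demo", cp) < 0)
 *		perror("cpuset_create");
 *	cpuset_free(cp);
 *	bitmask_free(cpus);
 */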

struct cpuset {
	struct bitmask *cpus;
	struct bitmask *mems;
	char cpu_exclusive;
	char mem_exclusive;
	char mem_hardwall;
	char notify_on_release;
	char memory_migrate;
	char memory_pressure_enabled;
	char memory_spread_page;
	char memory_spread_slab;
	char sched_load_balance;
	int sched_relax_domain_level;

	/*
	 * Each field 'x' above gets an 'x_valid' field below.
	 * The apply_cpuset_settings() routine will only set those
	 * fields whose corresponding *_valid flags are set.  The
	 * cpuset_alloc() routine clears these flags as part of the
	 * clear in calloc(), and the various cpuset_set*() routines
	 * set these flags when setting the corresponding value.
	 *
	 * The purpose of these valid fields is to ensure that when
	 * we create a new cpuset, we don't accidentally overwrite
	 * some non-zero kernel default, such as an inherited
	 * memory_spread_* flag, just because the user application
	 * code didn't override the default zero settings resulting
	 * from the calloc() call in cpuset_alloc().
	 *
	 * The choice of 'char' for the type of the flags above,
	 * but a bitfield for the flags below, is somewhat capricious.
	 */
	unsigned cpus_valid:1;
	unsigned mems_valid:1;
	unsigned cpu_exclusive_valid:1;
	unsigned mem_exclusive_valid:1;
	unsigned mem_hardwall_valid:1;
	unsigned notify_on_release_valid:1;
	unsigned memory_migrate_valid:1;
	unsigned memory_pressure_enabled_valid:1;
	unsigned memory_spread_page_valid:1;
	unsigned memory_spread_slab_valid:1;
	unsigned sched_load_balance_valid:1;
	unsigned sched_relax_domain_level_valid:1;

	/*
	 * If the corresponding field above has been modified, the
	 * following flags mark it dirty, so that only changed
	 * settings are written back to the kernel.
	 */
	unsigned cpus_dirty:1;
	unsigned mems_dirty:1;
	unsigned cpu_exclusive_dirty:1;
	unsigned mem_exclusive_dirty:1;
	unsigned mem_hardwall_dirty:1;
	unsigned notify_on_release_dirty:1;
	unsigned memory_migrate_dirty:1;
	unsigned memory_pressure_enabled_dirty:1;
	unsigned memory_spread_page_dirty:1;
	unsigned memory_spread_slab_dirty:1;
	unsigned sched_load_balance_dirty:1;
	unsigned sched_relax_domain_level_dirty:1;
};

/* Presumed cpuset file system mount point */
static const char *cpusetmnt = "/dev/cpuset";

/* Stashed copy of cpunodemap[], mapping each cpu to its node. */
static const char *mapfile = "/var/run/cpunodemap";

/* The primary source for the cpunodemap[] is available below here. */
static const char *sysdevices = "/sys/devices/system";

#define max(a,b) ((a) > (b) ? (a) : (b))
#define min(a,b) ((a) < (b) ? (a) : (b))

/* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
#define SMALL_BUFSZ 16

/*
 * The 'mask_size_file' is used to ferret out the kernel cpumask_t
 * and nodemask_t sizes.  The lines in this file that begin with the
 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
 * and nodemask string, respectively.  The lengths of these strings
 * reflect the kernel's internal cpumask_t and nodemask_t sizes,
 * sizes which are needed to correctly call the sched_setaffinity
 * and set_mempolicy system calls, and to size user level
 * bitmasks to match the kernel's.
 */

static const char *mask_size_file = "/proc/self/status";
static const char *cpumask_prefix = "Cpus_allowed:\t";
static const char *nodemask_prefix = "Mems_allowed:\t";

/*
 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
 *
 * The first time we need these, we parse the Cpus_allowed and
 * Mems_allowed lines from mask_size_file ("/proc/self/status").
 */

static int cpumask_sz;
static int nodemask_sz;

/*
 * These defaults only kick in if we fail to size the kernel
 * cpumask and nodemask by reading the Cpus_allowed and
 * Mems_allowed fields from the /proc/self/status file.
 */

#define DEFCPUBITS (512)
#define DEFNODEBITS (DEFCPUBITS/2)

/*
 * Arch-neutral API for obtaining NUMA distances between CPUs
 * and Memory Nodes, via the files:
 *	/sys/devices/system/node/nodeN/distance
 * which have lines such as:
 *	46 66 10 20
 * which say that for a cpu on node N (from the path above), the
 * distances to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
 * respectively.
 */

static const char *distance_directory = "/sys/devices/system/node";

/*
 * Someday, we should disable, then later discard, the SN code
 * marked ALTERNATE_SN_DISTMAP.
 */

#define ALTERNATE_SN_DISTMAP 1
#ifdef ALTERNATE_SN_DISTMAP

/*
 * Alternative SN (SGI ia64) architecture specific API for obtaining
 * NUMA distances between CPUs and Memory Nodes is via the file
 * /proc/sgi_sn/sn_topology, which has lines such as:
 *
 *   node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
 *
 * which says that for each CPU on node 2, the distance to nodes
 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
 *
 * This file has other lines as well, which start with other
 * keywords than "node".  Ignore these other lines.
 */

static const char *sn_topology = "/proc/sgi_sn/sn_topology";
static const char *sn_top_node_prefix = "node ";

#endif

/*
 * Check that cpusets supported, /dev/cpuset mounted.
 * If ok, return 0.
 * If not, return -1 and set errno:
 *	ENOSYS - kernel doesn't support cpusets
 *	ENODEV - /dev/cpuset not mounted
 */

static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

static int check()
{
	if (check_state == check_notdone) {
		struct stat statbuf;

		if (stat("/proc/self/cpuset", &statbuf) < 0) {
			check_state = check_enosys;
			goto done;
		}

		if (stat("/dev/cpuset/tasks", &statbuf) < 0) {
			check_state = check_enodev;
			goto done;
		}

		check_state = check_ok;
	}
done:
	switch (check_state) {
	case check_enosys:
		errno = ENOSYS;
		return -1;
	case check_enodev:
		errno = ENODEV;
		return -1;
	default:
		break;
	}
	return 0;
}

static void chomp(char *s)
{
	char *t;

	for (t = s + strlen(s) - 1; t >= s; t--) {
		if (*t == '\n' || *t == '\r')
			*t = '\0';
		else
			break;
	}
}

/*
 * Determine number of bytes in a seekable open file, without
 * assuming that stat(2) on that file has a useful size.
 * Has the side effect of leaving the file rewound to the beginning.
 */
static int filesize(FILE * fp)
{
	int sz = 0;
	rewind(fp);
	while (fgetc(fp) != EOF)
		sz++;
	rewind(fp);
	return sz;
}

/* Are strings s1 and s2 equal? */
static int streq(const char *s1, const char *s2)
{
	return strcmp(s1, s2) == 0;
}

/* Is string 'pre' a prefix of string 's'? */
static int strprefix(const char *s, const char *pre)
{
	return strncmp(s, pre, strlen(pre)) == 0;
}

/*
 * char *flgets(char *buf, int buflen, FILE *fp)
 *
 * Obtain one line from input file fp.  Copy up to first
 * buflen-1 chars of line into buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to buffer buf
 * on success, or NULL if nothing more to read or failure.
 */

static char *flgets(char *buf, int buflen, FILE * fp)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = getc(fp)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	if ((c < 0) && (bp == buf))
		return NULL;

	if (c > 0) {
		while ((c = getc(fp)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}

/*
 * sgetc(const char *inputbuf, int *offsetptr)
 *
 * Return next char from nul-terminated input buffer inputbuf,
 * starting at offset *offsetptr.  Increment *offsetptr.
 * If next char would be nul ('\0'), return EOF and don't
 * increment *offsetptr.
 */

static int sgetc(const char *inputbuf, int *offsetptr)
{
	char c;

	if ((c = inputbuf[*offsetptr]) != 0) {
		*offsetptr = *offsetptr + 1;
		return c;
	} else {
		return EOF;
	}
}

/*
 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
 *
 * Obtain next line from nul-terminated input buffer 'inputbuf',
 * starting at offset *offsetptr.  Copy up to first buflen-1
 * chars of line into output buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to output buffer
 * buf on success, or NULL if nothing more to read.
 */

static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	if ((c < 0) && (bp == buf))
		return NULL;

	if (c > 0) {
		while ((c = sgetc(inputbuf, offsetptr)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}

/*
 * time_t get_mtime(char *path)
 *
 * Return modtime of file at location path, else return 0.
 */

static time_t get_mtime(const char *path)
{
	struct stat statbuf;

	if (stat(path, &statbuf) != 0)
		return 0;
	return statbuf.st_mtime;
}

/*
 * int set_mtime(const char *path, time_t mtime)
 *
 * Set modtime of file 'path' to 'mtime'.  Return 0 on success,
 * or -1 on error, setting errno.
 */

static int set_mtime(const char *path, time_t mtime)
{
	struct utimbuf times;

	times.actime = mtime;
	times.modtime = mtime;
	return utime(path, &times);
}

/*
 * True if two pathnames resolve to the same file.
 * False if either path cannot be stat'd,
 * or if the two paths resolve to different files.
 */

static int samefile(const char *path1, const char *path2)
{
	struct stat sb1, sb2;

	if (stat(path1, &sb1) != 0)
		return 0;
	if (stat(path2, &sb2) != 0)
		return 0;
	return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
}

#define slash(c) (*(c) == '/')
#define eocomp(c) (slash(c) || !*(c))
#define dot1(c) (*(c) == '.' && eocomp(c+1))

/* In place path compression.  Remove extra dots and slashes. */
static char *pathcomp(char *p)
{
	char *a = p;
	char *b = p;

	if (!p || !*p)
		return p;
	if (slash(p))
		*b++ = *a++;
	for (;;) {
		if (slash(a))
			while (slash(++a))
				continue;
		if (!*a) {
			if (b == p)
				*b++ = '.';
			*b = '\0';
			return (p);
		} else if (dot1(a)) {
			a++;
		} else {
			if ((b != p) && !slash(b - 1))
				*b++ = '/';
			while (!eocomp(a))
				*b++ = *a++;
		}
	}
}

#undef slash
#undef eocomp
#undef dot1
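
/*
 * A few worked examples of pathcomp() (illustrative comment only):
 *
 *	"//a/./b//"  compresses to  "/a/b"
 *	"./x"        compresses to  "x"
 *	"/."         compresses to  "/"
 *
 * Note that it does not resolve ".." components or symlinks; it only
 * removes redundant slashes and single-dot components.
 */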

/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Return buf, of length buflen, with name1/name2 stored in it.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	(void)snprintf(buf, buflen, "%s/%s", name1, name2);
	return pathcomp(buf);
}

/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Return buf, of length buflen, with name1/name2/name3 stored in it.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	(void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	return pathcomp(buf);
}

/*
 * fullpath(buf, buflen, name)
 *
 * Put full path of cpuset 'name' in buffer 'buf'.  If name
 * starts with a slash (``/``) character, then this is a path
 * relative to ``/dev/cpuset``, otherwise it is relative to
 * the current task's cpuset.  Return 0 on success, else
 * -1 on error, setting errno.
 */

static int fullpath(char *buf, int buflen, const char *name)
{
	int len;

	/* easy case */
	if (*name == '/') {
		pathcat2(buf, buflen, cpusetmnt, name);
		pathcomp(buf);
		return 0;
	}

	/* hard case */
	snprintf(buf, buflen, "%s/", cpusetmnt);
	len = strlen(buf);
	if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
		return -1;
	if (strlen(buf) >= buflen - 1 - strlen(name)) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name);
	pathcomp(buf);
	return 0;
}
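
/*
 * For example (illustrative only): fullpath() maps "/foo" to
 * "/dev/cpuset/foo", while a relative name "foo", called from a task
 * whose cpuset is "/bar", maps to "/dev/cpuset/bar/foo".
 */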

/*
 * fullpath2(buf, buflen, name1, name2)
 *
 * Like fullpath(), only concatenate two pathname components on end.
 */

static int fullpath2(char *buf, int buflen, const char *name1,
		     const char *name2)
{
	if (fullpath(buf, buflen, name1) < 0)
		return -1;
	if (strlen(buf) >= buflen - 1 - strlen(name2)) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name2);
	pathcomp(buf);
	return 0;
}

/*
 * Convert the string length of an ascii hex mask to the number
 * of bits represented by that mask.
 *
 * The cpumask and nodemask values in /proc/self/status are in an
 * ascii format that uses 9 characters for each 32 bits of mask.
 */
static int s2nbits(const char *s)
{
	return strlen(s) * 32 / 9;
}
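
/*
 * Worked example: on a system with a 64-bit kernel cpumask_t,
 * /proc/self/status contains a line such as:
 *
 *	Cpus_allowed:	ffffffff,ffffffff
 *
 * Each 32-bit word is 8 hex digits plus one separator (a comma, or
 * the trailing newline on the last word), so the string seen by
 * s2nbits() - "ffffffff,ffffffff\n" - is 18 characters long, and
 * 18 * 32 / 9 == 64 bits.  This is why update_mask_sizes() below
 * must keep the newline that fgets() leaves in place.
 */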

static void update_mask_sizes()
{
	FILE *fp = NULL;
	char *buf = NULL;
	int fsize;

	if ((fp = fopen(mask_size_file, "r")) == NULL)
		goto done;
	fsize = filesize(fp);
	if ((buf = malloc(fsize)) == NULL)
		goto done;

	/*
	 * Beware: mask sizing arithmetic is fussy.
	 * The trailing newline left by fgets() is required.
	 */
	while (fgets(buf, fsize, fp)) {
		if (strprefix(buf, cpumask_prefix))
			cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
		if (strprefix(buf, nodemask_prefix))
			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
	}
done:
	if (buf != NULL)
		free(buf);
	if (fp != NULL)
		fclose(fp);
	if (cpumask_sz == 0)
		cpumask_sz = DEFCPUBITS;
	if (nodemask_sz == 0)
		nodemask_sz = DEFNODEBITS;
}

/* Allocate a new struct cpuset */
struct cpuset *cpuset_alloc()
{
	struct cpuset *cp = NULL;
	int nbits;

	if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
		goto err;

	nbits = cpuset_cpus_nbits();
	if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
		goto err;

	nbits = cpuset_mems_nbits();
	if ((cp->mems = bitmask_alloc(nbits)) == NULL)
		goto err;

	return cp;
err:
	if (cp && cp->cpus)
		bitmask_free(cp->cpus);
	if (cp && cp->mems)
		bitmask_free(cp->mems);
	if (cp)
		free(cp);
	return NULL;
}

/* Free struct cpuset *cp */
void cpuset_free(struct cpuset *cp)
{
	if (!cp)
		return;
	if (cp->cpus)
		bitmask_free(cp->cpus);
	if (cp->mems)
		bitmask_free(cp->mems);
	free(cp);
}

/* Number of bits in a CPU bitmask on current system */
int cpuset_cpus_nbits()
{
	if (cpumask_sz == 0)
		update_mask_sizes();
	return cpumask_sz;
}

/* Number of bits in a Memory bitmask on current system */
int cpuset_mems_nbits()
{
	if (nodemask_sz == 0)
		update_mask_sizes();
	return nodemask_sz;
}

/* Set CPUs in cpuset cp to bitmask cpus */
int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
{
	if (cp->cpus)
		bitmask_free(cp->cpus);
	cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
	if (cp->cpus == NULL)
		return -1;
	bitmask_copy(cp->cpus, cpus);
	cp->cpus_valid = 1;
	cp->cpus_dirty = 1;
	return 0;
}

/* Set Memory Nodes in cpuset cp to bitmask mems */
int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems)
{
	if (cp->mems)
		bitmask_free(cp->mems);
	cp->mems = bitmask_alloc(bitmask_nbits(mems));
	if (cp->mems == NULL)
		return -1;
	bitmask_copy(cp->mems, mems);
	cp->mems_valid = 1;
	cp->mems_dirty = 1;
	return 0;
}

/* Set integer value optname of cpuset cp */
int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value)
{
	if (streq(optionname, "cpu_exclusive")) {
		cp->cpu_exclusive = !!value;
		cp->cpu_exclusive_valid = 1;
		cp->cpu_exclusive_dirty = 1;
	} else if (streq(optionname, "mem_exclusive")) {
		cp->mem_exclusive = !!value;
		cp->mem_exclusive_valid = 1;
		cp->mem_exclusive_dirty = 1;
	} else if (streq(optionname, "mem_hardwall")) {
		cp->mem_hardwall = !!value;
		cp->mem_hardwall_valid = 1;
		cp->mem_hardwall_dirty = 1;
	} else if (streq(optionname, "notify_on_release")) {
		cp->notify_on_release = !!value;
		cp->notify_on_release_valid = 1;
		cp->notify_on_release_dirty = 1;
	} else if (streq(optionname, "memory_pressure_enabled")) {
		cp->memory_pressure_enabled = !!value;
		cp->memory_pressure_enabled_valid = 1;
		cp->memory_pressure_enabled_dirty = 1;
	} else if (streq(optionname, "memory_migrate")) {
		cp->memory_migrate = !!value;
		cp->memory_migrate_valid = 1;
		cp->memory_migrate_dirty = 1;
	} else if (streq(optionname, "memory_spread_page")) {
		cp->memory_spread_page = !!value;
		cp->memory_spread_page_valid = 1;
		cp->memory_spread_page_dirty = 1;
	} else if (streq(optionname, "memory_spread_slab")) {
		cp->memory_spread_slab = !!value;
		cp->memory_spread_slab_valid = 1;
		cp->memory_spread_slab_dirty = 1;
	} else if (streq(optionname, "sched_load_balance")) {
		cp->sched_load_balance = !!value;
		cp->sched_load_balance_valid = 1;
		cp->sched_load_balance_dirty = 1;
	} else if (streq(optionname, "sched_relax_domain_level")) {
		cp->sched_relax_domain_level = value;
		cp->sched_relax_domain_level_valid = 1;
		cp->sched_relax_domain_level_dirty = 1;
	} else
		return -2;	/* optionname not recognized */
	return 0;
}
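
/*
 * For example (illustrative only), enabling kernel page-cache
 * spreading on a cpuset handle before creating it:
 *
 *	cpuset_set_iopt(cp, "memory_spread_page", 1);
 *
 * returns 0 and marks that one field valid and dirty, while a
 * misspelled option name returns -2 and changes nothing.
 */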

/* [optional] Set string value optname */
int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname,
		    UNUSED const char *value)
{
	return -2;		/* For now, all string options unrecognized */
}

/* Return handle for reading memory_pressure. */
int cpuset_open_memory_pressure(const char *cpusetpath)
{
	char buf[PATH_MAX];

	fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure");
	return open(buf, O_RDONLY);
}

/* Return current memory_pressure of cpuset. */
int cpuset_read_memory_pressure(int han)
{
	char buf[SMALL_BUFSZ];
	ssize_t n;

	/* nul terminate what we read before handing it to atoi() */
	if ((n = pread(han, buf, sizeof(buf) - 1, 0L)) < 0)
		return -1;
	buf[n] = '\0';
	return atoi(buf);
}

/* Close handle for reading memory pressure. */
void cpuset_close_memory_pressure(int han)
{
	close(han);
}

/*
 * Resolve cpuset pointer (to that of current task if cp == NULL).
 *
 * If cp is not NULL, just return it.  If cp is NULL, return a pointer
 * to a temporary cpuset for the current task, and set *cp_tofree to
 * point to that same temporary cpuset, to be freed later.
 *
 * Return NULL and set errno on error.  Errors can occur when
 * resolving the current task's cpuset.
 */
static const struct cpuset *resolve_cp(const struct cpuset *cp,
				       struct cpuset **cp_tofree)
{
	const struct cpuset *rcp;

	if (cp) {
		rcp = cp;
	} else {
		struct cpuset *cp1 = cpuset_alloc();
		if (cp1 == NULL)
			goto err;
		if (cpuset_cpusetofpid(cp1, 0) < 0) {
			cpuset_free(cp1);
			goto err;
		}
		*cp_tofree = cp1;
		rcp = cp1;
	}
	return rcp;
err:
	return NULL;
}

/* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(cpus, cp1->cpus);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(mems, cp1->mems);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Return number of CPUs in cpuset cp (current task if cp == NULL) */
int cpuset_cpus_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->cpus);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
int cpuset_mems_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->mems);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return integer value of option optname in cp */
int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
{
	if (streq(optionname, "cpu_exclusive"))
		return cp->cpu_exclusive;
	else if (streq(optionname, "mem_exclusive"))
		return cp->mem_exclusive;
	else if (streq(optionname, "mem_hardwall"))
		return cp->mem_hardwall;
	else if (streq(optionname, "notify_on_release"))
		return cp->notify_on_release;
	else if (streq(optionname, "memory_pressure_enabled"))
		return cp->memory_pressure_enabled;
	else if (streq(optionname, "memory_migrate"))
		return cp->memory_migrate;
	else if (streq(optionname, "memory_spread_page"))
		return cp->memory_spread_page;
	else if (streq(optionname, "memory_spread_slab"))
		return cp->memory_spread_slab;
	else if (streq(optionname, "sched_load_balance"))
		return cp->sched_load_balance;
	else if (streq(optionname, "sched_relax_domain_level"))
		return cp->sched_relax_domain_level;
	else
		return -2;	/* optionname not recognized */
}

/* [optional] Return string value of optname */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
			    UNUSED const char *optionname)
{
	return NULL;		/* For now, all string options unrecognized */
}

static int read_flag(const char *filepath, char *flagp)
{
	char buf[SMALL_BUFSZ];	/* buffer a "0" or "1" flag line */
	int fd = -1;
	ssize_t n;

	if ((fd = open(filepath, O_RDONLY)) < 0)
		goto err;
	/* nul terminate what we read before handing it to atoi() */
	if ((n = read(fd, buf, sizeof(buf) - 1)) < 1)
		goto err;
	buf[n] = '\0';
	if (atoi(buf))
		*flagp = 1;
	else
		*flagp = 0;
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

static int load_flag(const char *path, char *flagp, const char *flag)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, flag);
	return read_flag(buf, flagp);
}

static int read_number(const char *filepath, int *numberp)
{
	char buf[SMALL_BUFSZ];
	int fd = -1;
	ssize_t n;

	if ((fd = open(filepath, O_RDONLY)) < 0)
		goto err;
	/* nul terminate what we read before handing it to atoi() */
	if ((n = read(fd, buf, sizeof(buf) - 1)) < 1)
		goto err;
	buf[n] = '\0';
	*numberp = atoi(buf);
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

static int load_number(const char *path, int *numberp, const char *file)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, file);
	return read_number(buf, numberp);
}

static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits)
{
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;
	struct bitmask *bmp = NULL;

	if ((fp = fopen(filepath, "r")) == NULL)
		goto err;
	buflen = filesize(fp) + 1;	/* + 1 for nul term */
	if ((buf = malloc(buflen)) == NULL)
		goto err;
	if (flgets(buf, buflen, fp) == NULL)
		goto err;
	fclose(fp);
	fp = NULL;

	if ((bmp = bitmask_alloc(nbits)) == NULL)
		goto err;
	if (*buf && bitmask_parselist(buf, bmp) < 0)
		goto err;
	if (*bmpp)
		bitmask_free(*bmpp);
	*bmpp = bmp;
	free(buf);
	buf = NULL;
	return 0;
err:
	if (buf != NULL)
		free(buf);
	if (fp != NULL)
		fclose(fp);
	if (bmp != NULL)
		bitmask_free(bmp);
	return -1;
}

static int load_mask(const char *path, struct bitmask **bmpp,
		     int nbits, const char *mask)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, mask);
	return read_mask(buf, bmpp, nbits);
}

/* Write string to file at given filepath.  Create or truncate file. */
static int write_string_file(const char *filepath, const char *str)
{
	int fd = -1;

	if ((fd = open(filepath, O_WRONLY | O_CREAT, 0644)) < 0)
		goto err;
	if (write(fd, str, strlen(str)) < 0)
		goto err;
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

/* Size and allocate buffer.  Write bitmask into it.  Caller must free */
static char *sprint_mask_buf(const struct bitmask *bmp)
{
	char *buf = NULL;
	int buflen;
	char c;

	/* First bitmask_displaylist() call just to get the length */
	buflen = bitmask_displaylist(&c, 1, bmp) + 1;	/* "+ 1" for nul */
	if ((buf = malloc(buflen)) == NULL)
		return NULL;
	bitmask_displaylist(buf, buflen, bmp);
	return buf;
}
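
/*
 * This uses the common two-pass snprintf()-style idiom: the first
 * bitmask_displaylist() call, given a 1-byte buffer, only reports the
 * length needed, and the second call does the real formatting.  The
 * result is in cpuset "list" format, e.g. a mask with bits 0, 1, 2
 * and 8 set prints as "0-2,8".
 */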

static int exists_flag(const char *path, const char *flag)
{
	char buf[PATH_MAX];
	struct stat statbuf;
	int rc;

	pathcat2(buf, sizeof(buf), path, flag);
	rc = (stat(buf, &statbuf) == 0);
	errno = 0;
	return rc;
}

static int store_flag(const char *path, const char *flag, int val)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, flag);
	return write_string_file(buf, val ? "1" : "0");
}

static int store_number(const char *path, const char *file, int val)
{
	char buf[PATH_MAX];
	char data[SMALL_BUFSZ];

	memset(data, 0, sizeof(data));
	pathcat2(buf, sizeof(buf), path, file);
	snprintf(data, sizeof(data), "%d", val);
	return write_string_file(buf, data);
}

static int store_mask(const char *path, const char *mask,
		      const struct bitmask *bmp)
{
	char maskpath[PATH_MAX];
	char *bp = NULL;
	int rc;

	if (bmp == NULL)
		return 0;
	pathcat2(maskpath, sizeof(maskpath), path, mask);
	if ((bp = sprint_mask_buf(bmp)) == NULL)
		return -1;
	rc = write_string_file(maskpath, bp);
	free(bp);
	return rc;
}

/*
 * Return 1 if 'cpu' is online, else 0 if offline.  Tests the file
 * /sys/devices/system/cpu/cpuN/online for contents 0 or 1,
 * where N is the cpu number.
 */

char cpu_online(unsigned int cpu)
{
	char online;
	char cpupath[PATH_MAX];

	(void)snprintf(cpupath, sizeof(cpupath),
		       "/sys/devices/system/cpu/cpu%u/online", cpu);
	if (read_flag(cpupath, &online) < 0)
		return 0;	/* oops - guess that cpu's not there */
	return online;
}

/*
 * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()) to
 * the node on which that cpu resides, or to cpuset_mems_nbits() if
 * the node is not known.
 *
 * To avoid every user having to recalculate this relation
 * from various clues in the sysfs file system (below the
 * path /sys/devices/system) a copy of this map is kept at
 * /var/run/cpunodemap.
 *
 * The system automatically cleans out files below
 * /var/run on each system reboot (see the init script
 * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry
 * about stale data in this file across reboots.  If the file
 * is missing, let the first process that needs it, and has
 * permission to write in the /var/run directory, rebuild it.
 *
 * If using this cached data, remember the mtime of the mapfile
 * the last time we read it in case something like a hotplug
 * event results in the file being removed and rebuilt, so we
 * can detect if we're using a stale cache, and need to reload.
 *
 * The mtime of this file is set to the time when we did
 * the recalculation of the map, from the clues beneath
 * /sys/devices/system.  This is done so that a program
 * won't see the mapfile it just wrote as being newer than what
 * it just wrote out (store_map) and read the same map back in
 * (load_map).
 */
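
/*
 * The mapfile itself is a simple text file of "<cpu> <node>" pairs,
 * one per line, as written by store_map() below.  For example, a
 * two-node, four-cpu system might cache (illustrative values):
 *
 *	0 0
 *	1 0
 *	2 1
 *	3 1
 */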

/*
 * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
 *
 * Note on locking and flockfile(FILE *):
 *
 *  We use flockfile() and funlockfile() instead of directly
 *  calling pthread_mutex_lock and pthread_mutex_unlock on
 *  a pthread_mutex_t, because this avoids forcing the app
 *  to link with libpthread.  The glibc implementation of
 *  flockfile/funlockfile will fall back to no-ops if libpthread
 *  doesn't happen to be linked.
 *
 *  Since flockfile already has the moderately convoluted
 *  combination of weak and strong symbols required to accomplish
 *  this, it is easier to use flockfile() on some handy FILE *
 *  stream as a surrogate for pthread locking than to re-invent
 *  that wheel.
 *
 *  Forcing all apps that use cpusets to link with libpthread
 *  would force non-transparent initialization on apps that
 *  might not be prepared to handle it.
 *
 *  The application using libcpuset should never notice this
 *  odd use of flockfile(), because we never return to the
 *  application from any libcpuset call with any such lock held.
 *  We just use this locking for guarding some non-atomic cached
 *  data updates and accesses, internal to some libcpuset calls.
 *  Also, flockfile() allows recursive nesting, so if the app
 *  calls libcpuset holding such a file lock, we won't deadlock
 *  if we go to acquire the same lock.  We'll just get the lock
 *  and increment its counter while we hold it.
 */

static struct cpunodemap {
	int *map;		/* map[cpumask_sz]: maps cpu to its node */
	time_t mtime;		/* modtime of mapfile when last read */
} cpunodemap;

/*
 * rebuild_map() - Rebuild cpunodemap[] from scratch.
 *
 * Situation:
 *	Neither our in-memory cpunodemap[] array nor the
 *	cache of it in mapfile is current.
 * Action:
 *	Rebuild it from first principles and the information
 *	available below /sys/devices/system.
 */

static void rebuild_map()
{
	char buf[PATH_MAX];
	DIR *dir1, *dir2;
	struct dirent *dent1, *dent2;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
		cpunodemap.map[cpu] = -1;
	pathcat2(buf, sizeof(buf), sysdevices, "node");
	if ((dir1 = opendir(buf)) == NULL)
		return;
	while ((dent1 = readdir(dir1)) != NULL) {
		if (sscanf(dent1->d_name, "node%u", &mem) < 1)
			continue;
		pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
		if ((dir2 = opendir(buf)) == NULL)
			continue;
		while ((dent2 = readdir(dir2)) != NULL) {
			if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
				continue;
			if (cpu >= (unsigned int)ncpus
			    || mem >= (unsigned int)nmems)
				continue;
			cpunodemap.map[cpu] = mem;
		}
		closedir(dir2);
	}
	closedir(dir1);
	cpunodemap.mtime = time(0);
}

/*
 * load_map() - Load cpunodemap[] from mapfile.
 *
 * Situation:
 *	The cpunodemap in mapfile is more recent than
 *	what we have in the cpunodemap[] array.
 * Action:
 *	Reload the cpunodemap[] array from the file.
 */

static void load_map()
{
	char buf[SMALL_BUFSZ];	/* buffer 1 line of mapfile */
	FILE *mapfp;		/* File stream on mapfile */
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	/* Don't leak the array that get_map() may already have allocated */
	if (cpunodemap.map == NULL &&
	    (cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL)
		return;
	cpunodemap.mtime = get_mtime(mapfile);
	if ((mapfp = fopen(mapfile, "r")) == NULL)
		return;
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
		cpunodemap.map[cpu] = nmems;
	while (flgets(buf, sizeof(buf), mapfp) != NULL) {
		if (sscanf(buf, "%u %u", &cpu, &mem) < 2)
			continue;
		if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems)
			continue;
		cpunodemap.map[cpu] = mem;
	}
	fclose(mapfp);
}

/*
 * store_map() - Write cpunodemap[] out to mapfile.
 *
 * Situation:
 *	The cpunodemap in the cpunodemap[] array is
 *	more recent than the one in mapfile.
 * Action:
 *	Write cpunodemap[] out to mapfile.
 */

static void store_map()
{
	char buf[PATH_MAX];
	int fd = -1;
	FILE *mapfp = NULL;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX");
	if ((fd = mkstemp(buf)) < 0)
		goto err;
	if ((mapfp = fdopen(fd, "w")) == NULL)
		goto err;
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		mem = cpunodemap.map[cpu];
		if (mem < (unsigned int)nmems)
			fprintf(mapfp, "%u %u\n", cpu, mem);
	}
	fclose(mapfp);
	set_mtime(buf, cpunodemap.mtime);
	if (rename(buf, mapfile) < 0)
		goto err;
	/* mkstemp() creates mode 0600 - change to world readable */
	(void)chmod(mapfile, 0444);
	return;
err:
	if (mapfp != NULL) {
		fclose(mapfp);
		fd = -1;
	}
	if (fd >= 0)
		close(fd);
	(void)unlink(buf);
}

/*
 * Load and gain thread safe access to the <cpu, node> map.
 *
 * Return 0 on success with flockfile(stdin) held.
 * Each successful get_map() call must be matched with a
 * following put_map() call to release the lock.
 *
 * On error, return -1 with errno set and no lock held.
 */

static int get_map()
{
	time_t file_mtime;

	flockfile(stdin);

	if (cpunodemap.map == NULL) {
		cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int));
		if (cpunodemap.map == NULL)
			goto err;
	}

	/* If no one has a good cpunodemap, rebuild from scratch */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime == 0 && file_mtime == 0)
		rebuild_map();

	/* If either cpunodemap[] or mapfile newer, update other with it */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime < file_mtime)
		load_map();
	else if (cpunodemap.mtime > file_mtime)
		store_map();
	return 0;
err:
	funlockfile(stdin);
	return -1;
}

static void put_map()
{
	funlockfile(stdin);
}

/* Set cpus to those local to Memory Nodes mems */
int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus)
{
	int ncpus = cpuset_cpus_nbits();
	unsigned int cpu;

	if (check() < 0)
		return -1;

	if (get_map() < 0)
		return -1;
	bitmask_clearall(cpus);
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		if (bitmask_isbitset(mems, cpunodemap.map[cpu]))
			bitmask_setbit(cpus, cpu);
	}
	put_map();
	return 0;
}

/* Set mems to those local to CPUs cpus */
int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems)
{
	int ncpus = cpuset_cpus_nbits();
	unsigned int cpu;

	if (check() < 0)
		return -1;

	if (get_map() < 0)
		return -1;
	bitmask_clearall(mems);
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		if (bitmask_isbitset(cpus, cpu))
			bitmask_setbit(mems, cpunodemap.map[cpu]);
	}
	put_map();
	return 0;
}

/*
 * distmap[]
 *
 * Two dimensional array, cpumask_sz rows by nodemask_sz columns,
 * of distmap_entry_t entries.
 *
 * Element distmap[cpu][mem] is the distance between CPU cpu
 * and Memory Node mem.  Distances are weighted to roughly
 * approximate the cost of memory references, and scaled so that
 * the distance from a CPU to its local Memory Node is ten (10).
 *
 * The first call to cpuset_cpumemdist() builds this map, from
 * whatever means the kernel provides to obtain these distances.
 *
 * These distances derive from ACPI SLIT table entries, which are
 * eight bits in size.
 *
 * Hold flockfile(stdout) while using distmap for posix thread safety.
 */

typedef unsigned char distmap_entry_t;	/* type of distmap[] entries */

static distmap_entry_t *distmap;	/* maps <cpu, mem> to distance */

#define DISTMAP_MAX UCHAR_MAX	/* maximum value in distmap[] */

#define I(i,j) ((i) * nmems + (j))	/* 2-D array index simulation */
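
/*
 * The I() macro linearizes the conceptual distmap[cpu][mem] into the
 * flat array: with nmems == 4, for example, I(2, 3) == 2 * 4 + 3 == 11,
 * the usual row-major layout.  It expects a variable 'nmems' to be in
 * scope at each use.
 */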

/*
 * Parse arch neutral lines from 'distance' files of form:
 *
 *	46 66 10 20
 *
 * The lines contain a space separated list of distances, which is parsed
 * into array dists[] of each node's distance from the specified node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 *	For each cpu c on node:
 *		For each node position n in list of distances:
 *			distmap[c][n] = dists[n]
 */

static int parse_distmap_line(unsigned int node, char *buf)
{
	char *p, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int c, n;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;
	int ret = -1;

	p = buf;
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned int d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	ret = 0;
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
	return ret;
}

static int parse_distance_file(unsigned int node, const char *path)
{
	FILE *fp;
	char *buf = NULL;
	int buflen;

	if ((fp = fopen(path, "r")) == NULL)
		goto err;

	buflen = filesize(fp);

	if ((buf = malloc(buflen)) == NULL)
		goto err;

	if (flgets(buf, buflen, fp) == NULL)
		goto err;

	if (parse_distmap_line(node, buf) < 0)
		goto err;

	free(buf);
	fclose(fp);
	return 0;
err:
	free(buf);
	if (fp)
		fclose(fp);
	return -1;
}

static void build_distmap()
{
	static int tried_before = 0;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	DIR *dir = NULL;
	struct dirent *dent;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	if ((dir = opendir(distance_directory)) == NULL)
		goto err;
	while ((dent = readdir(dir)) != NULL) {
		char buf[PATH_MAX];
		unsigned int node;

		if (sscanf(dent->d_name, "node%u", &node) < 1)
			continue;
		pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
			 "distance");
		if (parse_distance_file(node, buf) < 0)
			goto err;
	}
	closedir(dir);
	return;
err:
	if (dir)
		closedir(dir);
	free(distmap);
	distmap = NULL;
}

#ifdef ALTERNATE_SN_DISTMAP

/*
 * Parse SN architecture specific line of form:
 *
 *	node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
 *
 * Second field is node number.  The "dist" field is the colon separated list
 * of distances, which is parsed into array dists[] of each node's distance
 * from that node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 *	For each cpu c on that node:
 *		For each node position n in list of distances:
 *			distmap[c][n] = dists[n]
 */

static void parse_distmap_line_sn(char *buf)
{
	char *p, *pend, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned long c, n, node;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;

	if ((p = strchr(buf, ' ')) == NULL)
		goto err;
	if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
		goto err;
	if ((p = strstr(q, " dist ")) == NULL)
		goto err;
	p += strlen(" dist ");
	if ((pend = strchr(p, ' ')) != NULL)
		*pend = '\0';
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned long d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
}

static void build_distmap_sn()
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	static int tried_before = 0;
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((fp = fopen(sn_topology, "r")) == NULL)
		goto err;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	buflen = filesize(fp);
	if ((buf = malloc(buflen)) == NULL)
		goto err;

	while (flgets(buf, buflen, fp) != NULL)
		if (strprefix(buf, sn_top_node_prefix))
			parse_distmap_line_sn(buf);

	free(buf);
	fclose(fp);
	return;
err:
	free(buf);
	free(distmap);
	distmap = NULL;
	if (fp)
		fclose(fp);
}

#endif

/* [optional] Hardware distance from CPU to Memory Node */
unsigned int cpuset_cpumemdist(int cpu, int mem)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	distmap_entry_t r = DISTMAP_MAX;

	flockfile(stdout);

	if (check() < 0)
		goto err;

	if (distmap == NULL)
		build_distmap();

#ifdef ALTERNATE_SN_DISTMAP
	if (distmap == NULL)
		build_distmap_sn();
#endif

	if (distmap == NULL)
		goto err;

	if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
		goto err;

	r = distmap[I(cpu, mem)];
	/* fall into ... */
err:
	funlockfile(stdout);
	return r;
}

/* [optional] Return Memory Node closest to cpu */
int cpuset_cpu2node(int cpu)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	bitmask_setbit(cpus, cpu);

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	cpuset_localmems(cpus, mems);
	r = bitmask_first(mems);
	/* fall into ... */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;
}

static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
{
	if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
		if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
			goto err;
	}

	if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
		if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
			goto err;
	}

	if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
		if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
			goto err;
	}

	if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
		if (store_flag(path, "notify_on_release", cp->notify_on_release)
		    < 0)
			goto err;
	}

	if (cp->memory_migrate_valid &&
	    cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) {
		if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
			goto err;
	}

	if (cp->memory_pressure_enabled_valid &&
	    cp->memory_pressure_enabled_dirty &&
	    exists_flag(path, "memory_pressure_enabled")) {
		if (store_flag
		    (path, "memory_pressure_enabled",
		     cp->memory_pressure_enabled) < 0)
			goto err;
	}

	if (cp->memory_spread_page_valid &&
	    cp->memory_spread_page_dirty &&
	    exists_flag(path, "memory_spread_page")) {
		if (store_flag
		    (path, "memory_spread_page", cp->memory_spread_page) < 0)
			goto err;
	}

	if (cp->memory_spread_slab_valid &&
	    cp->memory_spread_slab_dirty &&
	    exists_flag(path, "memory_spread_slab")) {
		if (store_flag
		    (path, "memory_spread_slab", cp->memory_spread_slab) < 0)
			goto err;
	}

	if (cp->sched_load_balance_valid &&
	    cp->sched_load_balance_dirty &&
	    exists_flag(path, "sched_load_balance")) {
		if (store_flag
		    (path, "sched_load_balance", cp->sched_load_balance) < 0)
			goto err;
	}

	if (cp->sched_relax_domain_level_valid &&
	    cp->sched_relax_domain_level_dirty &&
	    exists_flag(path, "sched_relax_domain_level")) {
		if (store_number
		    (path, "sched_relax_domain_level",
		     cp->sched_relax_domain_level) < 0)
			goto err;
	}

	if (cp->cpus_valid && cp->cpus_dirty) {
		if (store_mask(path, "cpus", cp->cpus) < 0)
			goto err;
	}

	if (cp->mems_valid && cp->mems_dirty) {
		if (store_mask(path, "mems", cp->mems) < 0)
			goto err;
	}
	return 0;
err:
	return -1;
}

/*
 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
 *
 * Extract max value of any 'siblings' field in /proc/cpuinfo.
 * Cache the result - only need to extract once in lifetime of task.
 *
 * The siblings field is the number of logical CPUs in a physical
 * processor package.  It is equal to the product of the number of
 * cores in that package, times the number of hyper-threads per core.
 * The bug that cpuset_would_crash_kernel() is detecting arises
 * when a cpu_exclusive cpuset tries to include just some, not all,
 * of the sibling logical CPUs available in a processor package.
 *
 * In the improbable case that a system has mixed values of siblings
 * (some processor packages have more than others, perhaps due to
 * partially enabling Hyper-Threading), we take the worst-case value,
 * the largest siblings value.  This might be overkill.  I don't know
 * if this kernel bug considers each processor package's siblings
 * separately or not.  But it sure is easier this way ...
 *
 * This routine takes about 0.7 msecs on a 4 CPU 2.8 GHz Xeon, from
 * open to close, the first time called.
 */

static int get_siblings()
{
	static int siblings;
	char buf[32];		/* big enough for one 'siblings' line */
	FILE *fp;

	if (siblings)
		return siblings;

	if ((fp = fopen("/proc/cpuinfo", "r")) == NULL)
		return 4;	/* wing it - /proc not mounted ? */
	while (flgets(buf, sizeof(buf), fp) != NULL) {
		int s;

		if (sscanf(buf, "siblings : %d", &s) < 1)
			continue;
		if (s > siblings)
			siblings = s;
	}
	fclose(fp);
	if (siblings == 0)
		siblings = 1;	/* old kernel, no siblings, default to 1 */
	return siblings;
}

/*
 * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic
 * scheduler domain code invoked for cpu_exclusive cpusets that causes
 * the kernel to freeze, requiring a hardware reset.
 *
 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive'
 * cpuset is defined where that cpuset's 'cpus' are not on package
 * boundaries then the kernel will freeze, usually as soon as this
 * cpuset is created, requiring a hardware reset.
 *
 * A cpuset's 'cpus' are not on package boundaries if the cpuset
 * includes a proper non-empty subset (some, but not all) of the
 * logical cpus on a processor package.  This requires multiple
 * logical CPUs per package, available with either Hyper-Thread or
 * Multi-Core support.  Without one of these features, there is only
 * one logical CPU per physical package, and it's not possible to
 * have a proper, non-empty subset of a set of cardinality one.
 *
 * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC
 * on i386 and x86_64 arch's.
 *
 * The objective of this routine cpuset_would_crash_kernel() is to
 * determine if a proposed cpuset setting would crash the kernel due
 * to this bug, so that the caller can avoid the crash.
 *
 * Ideally we'd check for exactly these conditions here, but computing
 * the package (identified by the 'physical id' field of /proc/cpuinfo)
 * of each cpu in a cpuset is more effort than it's worth here.
 *
 * Also there is no obvious way to identify exactly whether the kernel
 * one is executing on has this bug, short of trying it, and seeing
 * if the kernel just crashed.
 *
 * So for now, we look for a simpler set of conditions, that meets
 * our immediate need - avoid this crash on SUSE SLES10 systems that
 * are susceptible to it.  We look for the kernel version 2.6.16.*,
 * which is the base kernel of SUSE SLES10, and for i386 or x86_64
 * processors, which had CONFIG_SCHED_MC enabled.
 *
 * If these simpler conditions are met, we further simplify the check,
 * by presuming that the logical CPUs are numbered on processor
 * package boundaries.  If each package has S siblings, we assume
 * that CPUs numbered N through N + S - 1 are on the same package,
 * for any CPU N such that N mod S == 0.
 *
 * Yes, this is a hack, focused on avoiding kernel freezes on
 * susceptible SUSE SLES10 systems.
 */
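
/*
 * Worked example of the check below (illustrative only): with
 * siblings S == 2, packages are presumed to be CPUs {0,1}, {2,3}, ...
 * A cpu_exclusive cpuset with cpus 0-2 covers all of package {0,1}
 * but only half of package {2,3}, so it trips the partial-package
 * case and is reported as one that would crash a susceptible kernel,
 * while cpus 0-1 or 0-3 pass.
 */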

static int cpuset_would_crash_kernel(const struct cpuset *cp)
{
	static int susceptible_system = -1;

	if (!cp->cpu_exclusive)
		goto ok;

	if (susceptible_system == -1) {
		struct utsname u;
		int rel_2_6_16, arch_i386, arch_x86_64;

		if (uname(&u) < 0)
			goto fail;
		rel_2_6_16 = strprefix(u.release, "2.6.16.");
		arch_i386 = streq(u.machine, "i386");
		arch_x86_64 = streq(u.machine, "x86_64");
		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
	}

	if (susceptible_system) {
		int ncpus = cpuset_cpus_nbits();
		int siblings = get_siblings();
		unsigned int cpu;

		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
			int s, num_set = 0;

			for (s = 0; s < siblings; s++) {
				if (bitmask_isbitset(cp->cpus, cpu + s))
					num_set++;
			}

			/* If none or all siblings set, we're still ok */
			if (num_set == 0 || num_set == siblings)
				continue;

			/* Found one that would crash kernel.  Fail.  */
			errno = ENXIO;
			goto fail;
		}
	}
	/* If not susceptible, or if all ok, fall into "ok" ... */
ok:
	return 0;		/* would not crash */
fail:
	return 1;		/* would crash */
}

/* Compare two cpusets and mark the dirty variables */
static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
{
	if (cp1->cpu_exclusive_valid &&
	    cp1->cpu_exclusive != cp2->cpu_exclusive)
		cp1->cpu_exclusive_dirty = 1;

	if (cp1->mem_exclusive_valid &&
	    cp1->mem_exclusive != cp2->mem_exclusive)
		cp1->mem_exclusive_dirty = 1;

	if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall)
		cp1->mem_hardwall_dirty = 1;

	if (cp1->notify_on_release_valid &&
	    cp1->notify_on_release != cp2->notify_on_release)
		cp1->notify_on_release_dirty = 1;

	if (cp1->memory_migrate_valid &&
	    cp1->memory_migrate != cp2->memory_migrate)
		cp1->memory_migrate_dirty = 1;

	if (cp1->memory_pressure_enabled_valid &&
	    cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
		cp1->memory_pressure_enabled_dirty = 1;

	if (cp1->memory_spread_page_valid &&
	    cp1->memory_spread_page != cp2->memory_spread_page)
		cp1->memory_spread_page_dirty = 1;

	if (cp1->memory_spread_slab_valid &&
	    cp1->memory_spread_slab != cp2->memory_spread_slab)
		cp1->memory_spread_slab_dirty = 1;

	if (cp1->sched_load_balance_valid &&
	    cp1->sched_load_balance != cp2->sched_load_balance)
		cp1->sched_load_balance_dirty = 1;

	if (cp1->sched_relax_domain_level_valid &&
	    cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
		cp1->sched_relax_domain_level_dirty = 1;

	if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
		cp1->cpus_dirty = 1;
	if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
		cp1->mems_dirty = 1;
}
2016
2017/* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
2018static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
2019{
2020	char buf[PATH_MAX];
2021	int do_rmdir_on_err = 0;
2022	int do_restore_cp_sav_on_err = 0;
2023	struct cpuset *cp_sav = NULL;
2024	int sav_errno;
2025
2026	if (check() < 0)
2027		goto err;
2028
2029	if (cpuset_would_crash_kernel(cp))
2030		goto err;
2031
2032	fullpath(buf, sizeof(buf), relpath);
2033
2034	if (new) {
2035		if (mkdir(buf, 0755) < 0)
2036			goto err;
2037		/* we made it, so we should remove it on error */
2038		do_rmdir_on_err = 1;
2039	}
2040
2041	if ((cp_sav = cpuset_alloc()) == NULL)
2042		goto err;
2043	if (cpuset_query(cp_sav, relpath) < 0)
2044		goto err;
2045	/* we have old settings to restore on error */
2046	do_restore_cp_sav_on_err = 1;
2047
2048	/* check which variables need to be restored on error */
2049	mark_dirty_variable(cp_sav, cp);
2050
2051	if (apply_cpuset_settings(buf, cp) < 0)
2052		goto err;
2053
2054	cpuset_free(cp_sav);
2055	return 0;
2056err:
2057	sav_errno = errno;
2058	if (do_restore_cp_sav_on_err)
2059		(void)apply_cpuset_settings(buf, cp_sav);
2060	if (cp_sav)
2061		cpuset_free(cp_sav);
2062	if (do_rmdir_on_err)
2063		(void)rmdir(buf);
2064	errno = sav_errno;
2065	return -1;
2066}
2067
2068/* Create cpuset 'cp' at location 'relpath' */
2069int cpuset_create(const char *relpath, const struct cpuset *cp)
2070{
2071	return cr_or_mod(relpath, cp, 1);
2072}
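
/*
 * Example (an illustrative sketch, not part of the library; the
 * cpuset name "/demo" is hypothetical and error checks are mostly
 * omitted for brevity): build a cpuset holding cpu 0 and memory
 * node 0, then create it below the cpuset root.
 *
 *	struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *	struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits());
 *	struct cpuset *cp = cpuset_alloc();
 *
 *	bitmask_setbit(cpus, 0);
 *	bitmask_setbit(mems, 0);
 *	cpuset_setcpus(cp, cpus);
 *	cpuset_setmems(cp, mems);
 *	if (cpuset_create("/demo", cp) < 0)
 *		perror("cpuset_create");
 *	cpuset_free(cp);
 *	bitmask_free(cpus);
 *	bitmask_free(mems);
 */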
2073
2074/* Delete cpuset at location 'relpath' (if empty) */
2075int cpuset_delete(const char *relpath)
2076{
2077	char buf[PATH_MAX];
2078
2079	if (check() < 0)
2080		goto err;
2081
2082	fullpath(buf, sizeof(buf), relpath);
2083	if (rmdir(buf) < 0)
2084		goto err;
2085
2086	return 0;
2087err:
2088	return -1;
2089}
2090
2091/* Set cpuset cp to the cpuset at location 'relpath' */
2092int cpuset_query(struct cpuset *cp, const char *relpath)
2093{
2094	char buf[PATH_MAX];
2095
2096	if (check() < 0)
2097		goto err;
2098
2099	fullpath(buf, sizeof(buf), relpath);
2100
2101	if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0)
2102		goto err;
2103	cp->cpu_exclusive_valid = 1;
2104
2105	if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0)
2106		goto err;
2107	cp->mem_exclusive_valid = 1;
2108
2109	if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
2110		goto err;
2111	cp->notify_on_release_valid = 1;
2112
2113	if (exists_flag(buf, "memory_migrate")) {
2114		if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0)
2115			goto err;
2116		cp->memory_migrate_valid = 1;
2117	}
2118
2119	if (exists_flag(buf, "mem_hardwall")) {
2120		if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0)
2121			goto err;
2122		cp->mem_hardwall_valid = 1;
2123	}
2124
2125	if (exists_flag(buf, "memory_pressure_enabled")) {
2126		if (load_flag
2127		    (buf, &cp->memory_pressure_enabled,
2128		     "memory_pressure_enabled") < 0)
2129			goto err;
2130		cp->memory_pressure_enabled_valid = 1;
2131	}
2132
2133	if (exists_flag(buf, "memory_spread_page")) {
2134		if (load_flag
2135		    (buf, &cp->memory_spread_page, "memory_spread_page") < 0)
2136			goto err;
2137		cp->memory_spread_page_valid = 1;
2138	}
2139
2140	if (exists_flag(buf, "memory_spread_slab")) {
2141		if (load_flag
2142		    (buf, &cp->memory_spread_slab, "memory_spread_slab") < 0)
2143			goto err;
2144		cp->memory_spread_slab_valid = 1;
2145	}
2146
2147	if (exists_flag(buf, "sched_load_balance")) {
2148		if (load_flag
2149		    (buf, &cp->sched_load_balance, "sched_load_balance") < 0)
2150			goto err;
2151		cp->sched_load_balance_valid = 1;
2152	}
2153
2154	if (exists_flag(buf, "sched_relax_domain_level")) {
2155		if (load_number
2156		    (buf, &cp->sched_relax_domain_level,
2157		     "sched_relax_domain_level") < 0)
2158			goto err;
2159		cp->sched_relax_domain_level_valid = 1;
2160	}
2161
2162	if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0)
2163		goto err;
2164	cp->cpus_valid = 1;
2165
2166	if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0)
2167		goto err;
2168	cp->mems_valid = 1;
2169
2170	return 0;
2171err:
2172	return -1;
2173}
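
/*
 * Example (an illustrative sketch, not part of the library; "/demo"
 * is a hypothetical cpuset name): read back an existing cpuset and
 * report how many cpus it holds.
 *
 *	struct cpuset *cp = cpuset_alloc();
 *
 *	if (cp != NULL && cpuset_query(cp, "/demo") == 0)
 *		printf("/demo has %d cpus\n", cpuset_cpus_weight(cp));
 *	cpuset_free(cp);
 */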
2174
2175/* Modify cpuset at location 'relpath' to values of 'cp' */
2176int cpuset_modify(const char *relpath, const struct cpuset *cp)
2177{
2178	return cr_or_mod(relpath, cp, 0);
2179}
2180
2181/* Get cpuset path of pid into buf */
2182char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
2183{
2184	int fd;			/* dual use: cpuset file for pid and self */
2185	int rc;			/* dual use: snprintf and read return codes */
2186
2187	if (check() < 0)
2188		return NULL;
2189
2190	/* borrow result buf[] to build cpuset file path */
2191	if (pid == 0)
2192		rc = snprintf(buf, size, "/proc/self/cpuset");
2193	else
2194		rc = snprintf(buf, size, "/proc/%d/cpuset", pid);
2195	if (rc >= (int)size) {
2196		errno = E2BIG;
2197		return NULL;
2198	}
2199	if ((fd = open(buf, O_RDONLY)) < 0) {
2200		int e = errno;
2201		if (e == ENOENT)
2202			e = ESRCH;
2203		if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0)
2204			e = ENOSYS;
2205		else
2206			close(fd);
2207		errno = e;
2208		return NULL;
2209	}
2210	rc = read(fd, buf, size);
2211	close(fd);
2212	if (rc < 0)
2213		return NULL;
2214	if (rc >= (int)size) {
2215		errno = E2BIG;
2216		return NULL;
2217	}
2218	buf[rc] = 0;
2219	chomp(buf);
2220	return buf;
2221
2222}
2223
2224/* Get cpuset 'cp' of pid */
2225int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
2226{
2227	char buf[PATH_MAX];
2228
2229	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
2230		return -1;
2231	if (cpuset_query(cp, buf) < 0)
2232		return -1;
2233	return 0;
2234}
2235
2236/* [optional] Return mountpoint of cpuset filesystem */
2237const char *cpuset_mountpoint()
2238{
2239	if (check() < 0) {
2240		switch (errno) {
2241		case ENODEV:
2242			return "[cpuset filesystem not mounted]";
2243		default:
2244			return "[cpuset filesystem not supported]";
2245		}
2246	}
2247	return cpusetmnt;
2248}
2249
2250/* Return true if path is a directory. */
2251static int isdir(const char *path)
2252{
2253	struct stat statbuf;
2254
2255	if (stat(path, &statbuf) < 0)
2256		return 0;
2257	return S_ISDIR(statbuf.st_mode);
2258}
2259
2260/*
2261 * [optional] cpuset_collides_exclusive() - True if would collide exclusive.
2262 *
2263 * Return true iff the specified cpuset would overlap with any
2264 * sibling cpusets in either cpus or mems, where either this
2265 * cpuset or the sibling is cpu_exclusive or mem_exclusive.
2266 *
2267 * cpuset_create() fails with errno == EINVAL if the requested cpuset
2268 * would overlap with any sibling, where either one is cpu_exclusive or
2269 * mem_exclusive.  This is a common, and not obvious, error.  The
2270 * following routine checks for this particular case, so that code
2271 * creating cpusets can better identify the situation, perhaps to issue
2272 * a more informative error message.
2273 *
2274 * Can also be used to diagnose cpuset_modify failures.  This
2275 * routine ignores any existing cpuset with the same path as the
2276 * given 'cpusetpath', and only looks for exclusive collisions with
2277 * sibling cpusets of that path.
2278 *
2279 * In case of any error, returns (0) -- does not collide.  Presumably
2280 * any actual attempt to create or modify a cpuset will encounter the
2281 * same error, and report it usefully.
2282 *
2283 * This routine is not particularly efficient; most likely code creating or
2284 * modifying a cpuset will want to try the operation first, and then if that
2285 * fails with errno EINVAL, perhaps call this routine to determine if an
2286 * exclusive cpuset collision caused the error.
2287 */
2288
2289int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
2290{
2291	char parent[PATH_MAX];
2292	char *p;
2293	char *pathcopy = NULL;
2294	char *base;
2295	DIR *dir = NULL;
2296	struct dirent *dent;
2297	struct cpuset *cp2 = NULL;
2298	struct bitmask *cpus1 = NULL, *cpus2 = NULL;
2299	struct bitmask *mems1 = NULL, *mems2 = NULL;
2300	int ret;
2301
2302	if (check() < 0)
2303		goto err;
2304
2305	fullpath(parent, sizeof(parent), cpusetpath);
2306	if (streq(parent, cpusetmnt))
2307		goto err;	/* only one cpuset root - can't collide */
2308	pathcopy = strdup(parent);
2309	p = strrchr(parent, '/');
2310	if (!p)
2311		goto err;	/* huh? - impossible - run and hide */
2312	*p = 0;			/* now parent is dirname of fullpath */
2313
2314	p = strrchr(pathcopy, '/');
2315	base = p + 1;		/* now base is basename of fullpath */
2316	if (!*base)
2317		goto err;	/* this is also impossible - run away */
2318
2319	if ((dir = opendir(parent)) == NULL)
2320		goto err;
2321	if ((cp2 = cpuset_alloc()) == NULL)
2322		goto err;
2323	if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2324		goto err;
2325	if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2326		goto err;
2327	if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2328		goto err;
2329	if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2330		goto err;
2331
2332	while ((dent = readdir(dir)) != NULL) {
2333		char child[PATH_MAX];
2334
2335		if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
2336			continue;
2337		if (streq(dent->d_name, base))
2338			continue;
2339		pathcat2(child, sizeof(child), parent, dent->d_name);
2340		if (!isdir(child))
2341			continue;
2342		if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
2343			goto err;
2344		if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
2345			cpuset_getcpus(cp1, cpus1);
2346			cpuset_getcpus(cp2, cpus2);
2347			if (bitmask_intersects(cpus1, cpus2))
2348				goto collides;
2349		}
2350		if (cp1->mem_exclusive || cp2->mem_exclusive) {
2351			cpuset_getmems(cp1, mems1);
2352			cpuset_getmems(cp2, mems2);
2353			if (bitmask_intersects(mems1, mems2))
2354				goto collides;
2355		}
2356	}
2357err:
2358	/* error, or did not collide */
2359	ret = 0;
2360	goto done;
2361collides:
2362	/* collides */
2363	ret = 1;
2364	/* fall into ... */
2365done:
2366	if (dir)
2367		closedir(dir);
2368	cpuset_free(cp2);
2369	free(pathcopy);
2370	bitmask_free(cpus1);
2371	bitmask_free(cpus2);
2372	bitmask_free(mems1);
2373	bitmask_free(mems2);
2374	return ret;
2375}
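
/*
 * Example of the usage pattern suggested above (a sketch, not part
 * of the library; "/demo" is hypothetical and cp is an
 * already-configured struct cpuset): attempt the create first, and
 * only on EINVAL ask whether an exclusive collision was the cause.
 *
 *	if (cpuset_create("/demo", cp) < 0) {
 *		if (errno == EINVAL &&
 *		    cpuset_collides_exclusive("/demo", cp))
 *			fprintf(stderr, "exclusive collision\n");
 *		else
 *			perror("cpuset_create");
 *	}
 */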
2376
2377/*
2378 * [optional] cpuset_nuke() - Remove cpuset any way possible
2379 *
2380 * Remove a cpuset, including killing tasks in it, and
2381 * removing any descendant cpusets and killing their tasks.
2382 *
2383 * Tasks can take a long time (minutes on some configurations)
2384 * to exit.  Loop up to 'seconds' seconds, trying to kill them.
2385 *
2386 * How we do it:
2387 *	1) First, kill all the pids, looping until there are
2388 *	   no more pids in this cpuset or below, or until the
2389 *	   'seconds' timeout limit is exceeded.
2390 *	2) Then depth first recursively rmdir the cpuset directories.
2391 *	3) If by this point the original cpuset is gone, we succeeded.
2392 *
2393 * If the timeout is exceeded, and tasks still exist, fail with
2394 * errno == ETIME.
2395 *
2396 * We sleep a variable amount of time.  After the first attempt to
2397 * kill all the tasks in the cpuset or its descendants, we sleep 1
2398 * second, the next time 2 seconds, increasing 1 second each loop
2399 * up to a max of 10 seconds.  If more loops past 10 are required
2400 * to kill all the tasks, we sleep 10 seconds each subsequent loop.
2401 * In any case, before the last loop, we sleep however many seconds
2402 * remain of the original timeout 'seconds' requested.  The total
2403 * time of all sleeps will be no more than the requested 'seconds'.
2404 *
2405 * If the cpuset started out empty of any tasks, or if the passed in
2406 * 'seconds' was zero, then this routine will return quickly, having
2407 * not slept at all.  Otherwise, this routine will at a minimum send
2408 * a SIGKILL to all the tasks in this cpuset subtree, then sleep one
2409 * second, before looking to see if any tasks remain.  If tasks remain
2410 * in the cpuset subtree, and a longer 'seconds' timeout was requested
2411 * (more than one), it will continue to kill remaining tasks and sleep,
2412 * in a loop, for as long as time and tasks remain.
2413 *
2414 * The signal sent for the kill is hardcoded to SIGKILL (9).  If some
2415 * other signal should be sent first, use a separate code loop,
2416 * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to
2417 * scan the task pids in a cpuset.  If SIGKILL should -not- be sent,
2418 * this cpuset_nuke() routine can still be called to recursively
2419 * remove a cpuset subtree, by specifying a timeout of zero 'seconds'.
2420 *
2421 * On success, returns 0 with errno == 0.
2422 *
2423 * On failure, returns -1, with errno possibly one of:
2424 *  EACCES - search permission denied on intervening directory
2425 *  ETIME - timed out - tasks remain after 'seconds' timeout
2426 *  EMFILE - too many open files
2427 *  ENODEV - /dev/cpuset not mounted
2428 *  ENOENT - component of cpuset path doesn't exist
2429 *  ENOMEM - out of memory
2430 *  ENOSYS - kernel doesn't support cpusets
2431 *  ENOTDIR - component of cpuset path is not a directory
2432 *  EPERM - lacked permission to kill a task
2433 *  EPERM - lacked permission to read cpusets or files therein
2434 */
2435
2436void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree);
2437
2438int cpuset_nuke(const char *relpath, unsigned int seconds)
2439{
2440	unsigned int secs_left = seconds;	/* total sleep seconds left */
2441	unsigned int secs_loop = 1;	/* how much sleep next loop */
2442	unsigned int secs_slept;	/* seconds slept in sleep() */
2443	struct cpuset_pidlist *pl = NULL;	/* pids in cpuset subtree */
2444	struct cpuset_fts_tree *cs_tree;
2445	const struct cpuset_fts_entry *cs_entry;
2446	int ret, sav_errno = 0;
2447
2448	if (check() < 0)
2449		return -1;
2450
2451	if (seconds == 0)
2452		goto rmdir_cpusets;
2453
2454	while (1) {
2455		int plen, j;
2456
2457		if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) {
2458			/* missing cpuset is as good as if already nuked */
2459			if (errno == ENOENT) {
2460				ret = 0;
2461				goto no_more_cpuset;
2462			}
2463
2464			/* other problems reading cpuset are bad news */
2465			sav_errno = errno;
2466			goto failed;
2467		}
2468
2469		if ((plen = cpuset_pidlist_length(pl)) == 0)
2470			goto rmdir_cpusets;
2471
2472		for (j = 0; j < plen; j++) {
2473			pid_t pid;
2474
2475			if ((pid = cpuset_get_pidlist(pl, j)) > 1) {
2476				if (kill(pid, SIGKILL) < 0 && errno != ESRCH) {
2477					sav_errno = errno;
2478					goto failed;
2479				}
2480			}
2481		}
2482
2483		if (secs_left == 0)
2484			goto took_too_long;
2485
2486		cpuset_freepidlist(pl);
2487		pl = NULL;
2488
2489		secs_slept = secs_loop - sleep(secs_loop);
2490
2491		/* Ensure forward progress */
2492		if (secs_slept == 0)
2493			secs_slept = 1;
2494
2495		/* Ensure sane sleep() return (unnecessary?) */
2496		if (secs_slept > secs_loop)
2497			secs_slept = secs_loop;
2498
2499		secs_left -= secs_slept;
2500
2501		if (secs_loop < 10)
2502			secs_loop++;
2503
2504		secs_loop = min(secs_left, secs_loop);
2505	}
2506
2507took_too_long:
2508	sav_errno = ETIME;
2509	/* fall into ... */
2510failed:
2511	cpuset_freepidlist(pl);
2512	errno = sav_errno;
2513	return -1;
2514
2515rmdir_cpusets:
2516	/* Let's try removing cpuset(s) now. */
2517	cpuset_freepidlist(pl);
2518
	if ((cs_tree = cpuset_fts_open(relpath)) == NULL) {
		/* missing cpuset is as good as if already nuked */
		if (errno != ENOENT)
			return -1;
		ret = 0;
		goto no_more_cpuset;
	}
	ret = 0;
2522	cpuset_fts_reverse(cs_tree);	/* rmdir's must be done bottom up */
2523	while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2524		char buf[PATH_MAX];
2525
2526		fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry));
2527		if (rmdir(buf) < 0 && errno != ENOENT) {
2528			sav_errno = errno;
2529			ret = -1;
2530		}
2531	}
2532	cpuset_fts_close(cs_tree);
2533	/* fall into ... */
2534no_more_cpuset:
2535	if (ret == 0)
2536		errno = 0;
2537	else
2538		errno = sav_errno;
2539	return ret;
2540}
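
/*
 * Example (a sketch, not part of the library; "/demo" is
 * hypothetical): allow up to 30 seconds for tasks to exit, then
 * remove the subtree.  Passing 0 seconds would skip the SIGKILLs
 * and just attempt the recursive rmdir's.
 *
 *	if (cpuset_nuke("/demo", 30) < 0) {
 *		if (errno == ETIME)
 *			fprintf(stderr, "tasks would not exit\n");
 *		else
 *			perror("cpuset_nuke");
 *	}
 */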
2541
2542/*
2543 * When recursively reading all the tasks files from a subtree,
2544 * chain together the read results, one pidblock per tasks file,
2545 * containing the raw unprocessed ASCII exactly as read(2) returned
2546 * it.  After we gather up this raw data, we go back to count how
2547 * many pids there are in total, allocate an array of pid_t
2548 * of that size, and transform the raw ASCII data into that
2549 * array of pid_t's.
2550 */
2551
2552struct pidblock {
2553	char *buf;
2554	int buflen;
2555	struct pidblock *next;
2556};
2557
2558/*
2559 * Chain the raw contents of a file onto the pbhead list.
2560 *
2561 * We malloc "+ 1" extra byte for a nul-terminator, so that
2562 * the strtoul() loop in pid_transform() won't scan past
2563 * the end of pb->buf[] and accidentally find more pids.
2564 */
2565static void add_pidblock(const char *file, struct pidblock **ppbhead)
2566{
2567	FILE *fp = NULL;
2568	struct pidblock *pb = NULL;
2569	int fsz;
2570
2571	if ((fp = fopen(file, "r")) == NULL)
2572		goto err;
2573	fsz = filesize(fp);
2574	if (fsz == 0)
2575		goto err;
2576	if ((pb = calloc(1, sizeof(*pb))) == NULL)
2577		goto err;
2578	pb->buflen = fsz;
2579	if ((pb->buf = malloc(pb->buflen + 1)) == NULL)
2580		goto err;
	if (fread(pb->buf, 1, pb->buflen, fp) <= 0)
		goto err;	/* short read - discard the block, don't leak it */
	pb->buf[pb->buflen] = '\0';
	pb->next = *ppbhead;
	*ppbhead = pb;
	fclose(fp);
	return;
err:
	if (fp)
		fclose(fp);
	if (pb) {
		free(pb->buf);	/* free(NULL) is harmless */
		free(pb);
	}
}
2594
2595static void read_task_file(const char *relpath, struct pidblock **ppbhead)
2596{
2597	char buf[PATH_MAX];
2598
2599	fullpath2(buf, sizeof(buf), relpath, "tasks");
2600	add_pidblock(buf, ppbhead);
2601}
2602
2603struct cpuset_pidlist {
2604	pid_t *pids;
2605	int npids;
2606};
2607
2608/* Count how many pids in buf (one per line - just count newlines) */
2609static int pidcount(const char *buf, int buflen)
2610{
2611	int n = 0;
2612	const char *cp;
2613
2614	for (cp = buf; cp < buf + buflen; cp++) {
2615		if (*cp == '\n')
2616			n++;
2617	}
2618	return n;
2619}
2620
2621/* Transform one-per-line ascii pids in pb to pid_t entries in pl */
2622static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n)
2623{
2624	char *a, *b;
2625
2626	for (a = pb->buf; a < pb->buf + pb->buflen; a = b) {
2627		pid_t p = strtoul(a, &b, 10);
2628		if (a == b)
2629			break;
2630		pl->pids[n++] = p;
2631	}
2632	return n;
2633}
2634
2635static void free_pidblocks(struct pidblock *pbhead)
2636{
2637	struct pidblock *pb, *nextpb;
2638
2639	for (pb = pbhead; pb; pb = nextpb) {
2640		nextpb = pb->next;
2641		free(pb->buf);
2642		free(pb);
2643	}
2644}
2645
2646/* numeric comparison routine for qsort */
2647static int numericsort(const void *m1, const void *m2)
2648{
2649	pid_t p1 = *(pid_t *) m1;
2650	pid_t p2 = *(pid_t *) m2;
2651
2652	return p1 - p2;
2653}
2654
2655/* Return list of pids in cpuset 'relpath' */
2656struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath,
2657					   int recursiveflag)
2658{
2659	struct pidblock *pb = NULL;
2660	struct cpuset_pidlist *pl = NULL;
2661	struct pidblock *pbhead = NULL;
2662	int n;
2663
2664	if (check() < 0)
2665		goto err;
2666
2667	if (recursiveflag) {
2668		struct cpuset_fts_tree *cs_tree;
2669		const struct cpuset_fts_entry *cs_entry;
2670
2671		if ((cs_tree = cpuset_fts_open(relpath)) == NULL)
2672			goto err;
2673		while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2674			if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET)
2675				continue;
2676			read_task_file(cpuset_fts_get_path(cs_entry), &pbhead);
2677		}
2678		cpuset_fts_close(cs_tree);
2679	} else {
2680		read_task_file(relpath, &pbhead);
2681	}
2682
2683	if ((pl = calloc(1, sizeof(*pl))) == NULL)
2684		goto err;
2685	pl->npids = 0;
2686	for (pb = pbhead; pb; pb = pb->next)
2687		pl->npids += pidcount(pb->buf, pb->buflen);
2688	if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL)
2689		goto err;
2690	n = 0;
2691	for (pb = pbhead; pb; pb = pb->next)
2692		n = pid_transform(pb, pl, n);
2693	free_pidblocks(pbhead);
2694	qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort);
2695	return pl;
2696err:
2697	cpuset_freepidlist(pl);
2698	free_pidblocks(pbhead);
2699	return NULL;
2700}
2701
2702/* Return number of elements in pidlist */
2703int cpuset_pidlist_length(const struct cpuset_pidlist *pl)
2704{
2705	if (pl)
2706		return pl->npids;
2707	else
2708		return 0;
2709}
2710
2711/* Return i'th element of pidlist */
2712pid_t cpuset_get_pidlist(const struct cpuset_pidlist *pl, int i)
2713{
2714	if (pl && i >= 0 && i < pl->npids)
2715		return pl->pids[i];
2716	else
2717		return (pid_t)-1;
2718}
2719
2720/* Free pidlist */
2721void cpuset_freepidlist(struct cpuset_pidlist *pl)
2722{
2723	if (pl && pl->pids)
2724		free(pl->pids);
2725	if (pl)
2726		free(pl);
2727}
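
/*
 * Example (a sketch, not part of the library; "/demo" is
 * hypothetical): print every pid found in a cpuset subtree, using
 * the pidlist calls above with recursiveflag set.
 *
 *	struct cpuset_pidlist *pl;
 *	int i;
 *
 *	if ((pl = cpuset_init_pidlist("/demo", 1)) != NULL) {
 *		for (i = 0; i < cpuset_pidlist_length(pl); i++)
 *			printf("%d\n", (int)cpuset_get_pidlist(pl, i));
 *		cpuset_freepidlist(pl);
 *	}
 */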
2728
2729static int __cpuset_move(pid_t pid, const char *path)
2730{
2731	char buf[SMALL_BUFSZ];
2732
2733	snprintf(buf, sizeof(buf), "%d", (int)pid);
2734	return write_string_file(path, buf);
2735}
2736
2737/* Move task (pid == 0 for current) to a cpuset */
2738int cpuset_move(pid_t pid, const char *relpath)
2739{
2740	char buf[PATH_MAX];
2741
2742	if (check() < 0)
2743		return -1;
2744
2745	if (pid == 0)
2746		pid = getpid();
2747
2748	fullpath2(buf, sizeof(buf), relpath, "tasks");
2749	return __cpuset_move(pid, buf);
2750}
2751
2752/* Move all tasks in pidlist to a cpuset */
2753int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath)
2754{
2755	int i;
2756	char buf[PATH_MAX];
2757	int ret;
2758
2759	if (check() < 0)
2760		return -1;
2761
2762	fullpath2(buf, sizeof(buf), relpath, "tasks");
2763
2764	ret = 0;
2765	for (i = 0; i < pl->npids; i++)
2766		if (__cpuset_move(pl->pids[i], buf) < 0)
2767			ret = -1;
2768	return ret;
2769}
2770
2771/*
2772 * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a
2773 *                                      cpuset to another cpuset
2774 *
2775 * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may
2776 * race with tasks being added to or forking into fromrelpath. Loop
2777 * repeatedly, reading the tasks file of cpuset fromrelpath and writing
2778 * any task pid's found there to the tasks file of cpuset torelpath,
2779 * up to ten attempts, or until the tasks file of cpuset fromrelpath
2780 * is empty, or until fromrelpath is no longer present.
2781 *
2782 * Returns 0 with errno == 0 if able to empty the tasks file of cpuset
2783 * fromrelpath. Of course it is still possible that some independent
2784 * task could add another task to cpuset fromrelpath at the same time
2785 * that such a successful result is being returned, so there can be
2786 * no guarantee that a successful return means that fromrelpath is
2787 * still empty of tasks.
2788 *
2789 * We are careful to allow for the possibility that the cpuset
2790 * fromrelpath might disappear out from under us, perhaps because it
2791 * has notify_on_release set and gets automatically removed as soon
2792 * as we detach its last task from it.  Consider a missing fromrelpath
2793 * to be a successful move.
2794 *
2795 * If called with fromrelpath and torelpath pathnames that evaluate to
2796 * the same cpuset, then treat that as if cpuset_reattach() was called,
2797 * rebinding each task in this cpuset one time, and return success or
2798 * failure depending on the return of that cpuset_reattach() call.
2799 *
2800 * On failure, returns -1, with errno possibly one of:
2801 *  EACCES - search permission denied on intervening directory
2802 *  ENOTEMPTY - tasks remain after multiple attempts to move them
2803 *  EMFILE - too many open files
2804 *  ENODEV - /dev/cpuset not mounted
2805 *  ENOENT - component of cpuset path doesn't exist
2806 *  ENOMEM - out of memory
2807 *  ENOSYS - kernel doesn't support cpusets
2808 *  ENOTDIR - component of cpuset path is not a directory
2809 *  EPERM - lacked permission to kill a task
2810 *  EPERM - lacked permission to read cpusets or files therein
2811 *
2812 * This is an [optional] function. Use cpuset_function to invoke it.
2813 */
2814
2815#define NUMBER_MOVE_TASK_ATTEMPTS 10
2816
2817int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
2818{
2819	char fromfullpath[PATH_MAX];
2820	char tofullpath[PATH_MAX];
2821	int i;
2822	struct cpuset_pidlist *pl = NULL;
2823	int sav_errno;
2824
2825	fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
2826	fullpath(tofullpath, sizeof(tofullpath), torelpath);
2827
2828	if (samefile(fromfullpath, tofullpath))
2829		return cpuset_reattach(fromrelpath);
2830
2831	for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
2832		int plen, j;
2833
2834		if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
2835			/* missing cpuset is as good as if all moved */
2836			if (errno == ENOENT)
2837				goto no_more_cpuset;
2838
2839			/* other problems reading cpuset are bad news */
2840			sav_errno = errno;
2841			goto failed;
2842		}
2843
2844		if ((plen = cpuset_pidlist_length(pl)) == 0)
2845			goto no_more_pids;
2846
2847		for (j = 0; j < plen; j++) {
2848			pid_t pid;
2849
2850			pid = cpuset_get_pidlist(pl, j);
2851			if (cpuset_move(pid, torelpath) < 0) {
2852				/* missing task is as good as if moved */
2853				if (errno == ESRCH)
2854					continue;
2855
2856				/* other per-task errors are bad news */
2857				sav_errno = errno;
2858				goto failed;
2859			}
2860		}
2861
2862		cpuset_freepidlist(pl);
2863		pl = NULL;
2864	}
2865
2866	sav_errno = ENOTEMPTY;
2867	/* fall into ... */
2868failed:
2869	cpuset_freepidlist(pl);
2870	errno = sav_errno;
2871	return -1;
2872
2873no_more_pids:
2874no_more_cpuset:
2875	/* Success - all tasks (or entire cpuset ;) gone. */
2876	cpuset_freepidlist(pl);
2877	errno = 0;
2878	return 0;
2879}
2880
2881/* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
2882int cpuset_migrate(pid_t pid, const char *relpath)
2883{
2884	char buf[PATH_MAX];
2885	char buf2[PATH_MAX];
2886	char memory_migrate_flag;
2887	int r;
2888
2889	if (check() < 0)
2890		return -1;
2891
2892	if (pid == 0)
2893		pid = getpid();
2894
2895	fullpath(buf2, sizeof(buf2), relpath);
2896
2897	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2898		return -1;
2899	if (store_flag(buf2, "memory_migrate", 1) < 0)
2900		return -1;
2901
2902	fullpath2(buf, sizeof(buf), relpath, "tasks");
2903
2904	r = __cpuset_move(pid, buf);
2905
2906	store_flag(buf2, "memory_migrate", memory_migrate_flag);
2907	return r;
2908}
2909
2910/* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
2911int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
2912{
2913	int i;
2914	char buf[PATH_MAX];
2915	char buf2[PATH_MAX];
2916	char memory_migrate_flag;
2917	int ret;
2918
2919	if (check() < 0)
2920		return -1;
2921
2922	fullpath(buf2, sizeof(buf2), relpath);
2923
2924	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2925		return -1;
2926	if (store_flag(buf2, "memory_migrate", 1) < 0)
2927		return -1;
2928
2929	fullpath2(buf, sizeof(buf), relpath, "tasks");
2930
2931	ret = 0;
2932	for (i = 0; i < pl->npids; i++)
2933		if (__cpuset_move(pl->pids[i], buf) < 0)
2934			ret = -1;
2935
2936	if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
2937		ret = -1;
2938	return ret;
2939}
2940
2941/* Rebind cpus_allowed of each task in cpuset 'path' */
2942int cpuset_reattach(const char *relpath)
2943{
2944	struct cpuset_pidlist *pl;
2945	int rc;
2946
2947	if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL)
2948		return -1;
2949	rc = cpuset_move_all(pl, relpath);
2950	cpuset_freepidlist(pl);
2951	return rc;
2952}
2953
2954/* Map cpuset relative cpu number to system wide cpu number */
2955int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
2956{
2957	struct cpuset *cp_tofree = NULL;
2958	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2959	int pos = -1;
2960
2961	if (!cp1)
2962		goto err;
2963	pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
2964	/* fall into ... */
2965err:
2966	cpuset_free(cp_tofree);
2967	return pos;
2968}
2969
2970/* Map system wide cpu number to cpuset relative cpu number */
2971int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
2972{
2973	struct cpuset *cp_tofree = NULL;
2974	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2975	int pos = -1;
2976
2977	if (!cp1)
2978		goto err;
2979	pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
2980	/* fall into ... */
2981err:
2982	cpuset_free(cp_tofree);
2983	return pos;
2984}
2985
2986/* Map cpuset relative mem number to system wide mem number */
2987int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
2988{
2989	struct cpuset *cp_tofree = NULL;
2990	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2991	int pos = -1;
2992
2993	if (!cp1)
2994		goto err;
2995	pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
2996	/* fall into ... */
2997err:
2998	cpuset_free(cp_tofree);
2999	return pos;
3000}
3001
3002/* Map system wide mem number to cpuset relative mem number */
3003int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
3004{
3005	struct cpuset *cp_tofree = NULL;
3006	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
3007	int pos = -1;
3008
3009	if (!cp1)
3010		goto err;
3011	pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
3012	/* fall into ... */
3013err:
3014	cpuset_free(cp_tofree);
3015	return pos;
3016}
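
/*
 * Worked example of the four mappings above (added for clarity):
 * for a cpuset whose cpus mask is {4,6,7}, relative cpu 1 is system
 * cpu 6, so cpuset_c_rel_to_sys_cpu(cp, 1) == 6, and system cpu 7
 * is relative cpu 2, so cpuset_c_sys_to_rel_cpu(cp, 7) == 2.  The
 * mems variants behave the same way for memory nodes.
 */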
3017
3018/* Map pid's cpuset relative cpu number to system wide cpu number */
3019int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
3020{
3021	struct cpuset *cp;
3022	int rc = -1;
3023
3024	if ((cp = cpuset_alloc()) == NULL)
3025		goto done;
3026	if (cpuset_cpusetofpid(cp, pid) < 0)
3027		goto done;
3028	rc = cpuset_c_rel_to_sys_cpu(cp, cpu);
3029done:
3030	cpuset_free(cp);
3031	return rc;
3032}
3033
3034/* Map system wide cpu number to pid's cpuset relative cpu number */
3035int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
3036{
3037	struct cpuset *cp;
3038	int rc = -1;
3039
3040	if ((cp = cpuset_alloc()) == NULL)
3041		goto done;
3042	if (cpuset_cpusetofpid(cp, pid) < 0)
3043		goto done;
3044	rc = cpuset_c_sys_to_rel_cpu(cp, cpu);
3045done:
3046	cpuset_free(cp);
3047	return rc;
3048}
3049
3050/* Map pid's cpuset relative mem number to system wide mem number */
3051int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
3052{
3053	struct cpuset *cp;
3054	int rc = -1;
3055
3056	if ((cp = cpuset_alloc()) == NULL)
3057		goto done;
3058	if (cpuset_cpusetofpid(cp, pid) < 0)
3059		goto done;
3060	rc = cpuset_c_rel_to_sys_mem(cp, mem);
3061done:
3062	cpuset_free(cp);
3063	return rc;
3064}
3065
3066/* Map system wide mem number to pid's cpuset relative mem number */
3067int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
3068{
3069	struct cpuset *cp;
3070	int rc = -1;
3071
3072	if ((cp = cpuset_alloc()) == NULL)
3073		goto done;
3074	if (cpuset_cpusetofpid(cp, pid) < 0)
3075		goto done;
3076	rc = cpuset_c_sys_to_rel_mem(cp, mem);
3077done:
3078	cpuset_free(cp);
3079	return rc;
3080}
3081
3082/*
3083 * Override glibc's calls for get/set affinity - they have
3084 * something using cpu_set_t that will die when NR_CPUS > 1024.
3085 * Go directly to the 'real' system calls.  Also override calls
3086 * for get_mempolicy and set_mempolicy.  None of these
3087 * calls are yet (July 2004) guaranteed to be in all glibc versions
3088 * that we care about.
3089 */
3090
3091static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
3092{
3093	return ltp_syscall(__NR_sched_setaffinity, pid, len, mask);
3094}
3095
3096#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
3097static int get_mempolicy(int *policy, unsigned long *nmask,
3098			 unsigned long maxnode, void *addr, int flags)
3099{
3100	return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode,
3101		addr, flags);
3102}
3103#endif
3104
3105#if HAVE_DECL_MPOL_BIND || HAVE_DECL_MPOL_DEFAULT
3106static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
3107{
3108	return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode);
3109}
3110#endif
3111
3112struct cpuset_placement {
3113	struct bitmask *cpus;
3114	struct bitmask *mems;
3115	char *path;
3116};
3117
3118/* Allocate and fill in a placement struct - captures current placement */
3119struct cpuset_placement *cpuset_get_placement(pid_t pid)
3120{
3121	struct cpuset_placement *plc;
3122	struct cpuset *cp = NULL;
3123	char buf[PATH_MAX];
3124	int nbits;
3125
3126	if ((plc = calloc(1, sizeof(*plc))) == NULL)
3127		goto err;
3128
3129	nbits = cpuset_cpus_nbits();
3130	if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
3131		goto err;
3132
3133	nbits = cpuset_mems_nbits();
3134	if ((plc->mems = bitmask_alloc(nbits)) == NULL)
3135		goto err;
3136
3137	if ((cp = cpuset_alloc()) == NULL)
3138		goto err;
3139	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
3140		goto err;
3141	if (cpuset_query(cp, buf) < 0)
3142		goto err;
3143
3144	bitmask_copy(plc->cpus, cp->cpus);
3145	bitmask_copy(plc->mems, cp->mems);
3146	plc->path = strdup(buf);
3147
3148	cpuset_free(cp);
3149	return plc;
3150err:
3151	cpuset_free(cp);
3152	cpuset_free_placement(plc);
3153	return NULL;
3154}
3155
3156/* Compare two placement structs - use to detect changes in placement */
3157int cpuset_equal_placement(const struct cpuset_placement *plc1,
3158			   const struct cpuset_placement *plc2)
3159{
3160	return bitmask_equal(plc1->cpus, plc2->cpus) &&
3161	    bitmask_equal(plc1->mems, plc2->mems) &&
3162	    streq(plc1->path, plc2->path);
3163}
3164
3165/* Free a placement struct */
3166void cpuset_free_placement(struct cpuset_placement *plc)
3167{
3168	if (!plc)
3169		return;
3170	bitmask_free(plc->cpus);
3171	bitmask_free(plc->mems);
3172	free(plc->path);
3173	free(plc);
3174}
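
/*
 * Example of the double-snapshot idiom these placement calls enable
 * (a sketch, not part of the library; error checks omitted): sample
 * the placement before and after a query, and retry if the task was
 * migrated in between -- the same pattern cpuset_pin(), cpuset_size()
 * and cpuset_where() use below.
 *
 *	struct cpuset_placement *p1 = NULL, *p2 = NULL;
 *	int cpu;
 *
 *	do {
 *		cpuset_free_placement(p1);
 *		p1 = cpuset_get_placement(0);
 *		cpu = cpuset_latestcpu(0);
 *		cpuset_free_placement(p2);
 *		p2 = cpuset_get_placement(0);
 *	} while (!cpuset_equal_placement(p1, p2));
 *	cpuset_free_placement(p1);
 *	cpuset_free_placement(p2);
 */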
3175
3176/*
3177 * A cpuset_fts_open() call constructs a linked list of entries
3178 * called a "cpuset_fts_tree", with one entry per cpuset below
3179 * the specified path.  The cpuset_fts_read() routine returns the
3180 * next entry on this list.  The various cpuset_fts_get_*() calls
3181 * return attributes of the specified entry.  The cpuset_fts_close()
3182 * call frees the linked list and all associated data.  All cpuset
3183 * entries and attributes for the cpuset_fts_tree returned from a
3184 * given cpuset_fts_open() call remain allocated and unchanged until
3185 * that cpuset_fts_tree is closed by a cpuset_fts_close() call.  Any
3186 * subsequent changes to the cpuset filesystem will go unnoticed
3187 * (they will not affect open cpuset_fts_trees).
3188 */
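
/*
 * Example walk (a sketch, not part of the library): visit every
 * cpuset below the root and print the path of each one that could
 * be read successfully.
 *
 *	struct cpuset_fts_tree *t;
 *	const struct cpuset_fts_entry *e;
 *
 *	if ((t = cpuset_fts_open("/")) != NULL) {
 *		while ((e = cpuset_fts_read(t)) != NULL)
 *			if (cpuset_fts_get_info(e) == CPUSET_FTS_CPUSET)
 *				printf("%s\n", cpuset_fts_get_path(e));
 *		cpuset_fts_close(t);
 *	}
 */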
3189
3190struct cpuset_fts_entry;
3191void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);
3192
3193struct cpuset_fts_tree {
3194	struct cpuset_fts_entry *head;	/* head of linked entry list */
3195	struct cpuset_fts_entry *next;	/* cpuset_fts_read() offset */
3196};
3197
3198struct cpuset_fts_entry {
3199	struct cpuset_fts_entry *next;	/* linked entry list chain */
3200	struct cpuset *cpuset;
3201	struct stat *stat;
3202	char *path;
3203	int info;
3204	int err;
3205};
3206
3207/* Open a handle on a cpuset hierarchy.  All the real work is done here. */
3208struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
3209{
3210	FTS *fts = NULL;
3211	FTSENT *ftsent;
3212	char *path_argv[2];
3213	char buf[PATH_MAX];
3214	struct cpuset_fts_tree *cs_tree = NULL;
3215	struct cpuset_fts_entry *ep;	/* the latest new list entry */
3216	struct cpuset_fts_entry **pnlep;	/* ptr to next list entry ptr */
3217	char *relpath;
3218	int fts_flags;
3219
3220	fullpath(buf, sizeof(buf), cpusetpath);
3221	path_argv[0] = buf;
3222	path_argv[1] = NULL;
3223
3224	fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
3225	fts = fts_open(path_argv, fts_flags, NULL);
3226	if (fts == NULL)
3227		goto err;
3228
3229	cs_tree = malloc(sizeof(*cs_tree));
3230	if (cs_tree == NULL)
3231		goto err;
3232	pnlep = &cs_tree->head;
3233	*pnlep = NULL;
3234
3235	while ((ftsent = fts_read(fts)) != NULL) {
3236		if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
3237			continue;
3238
3239		/* ftsent is a directory (perhaps unreadable) ==> cpuset */
3240		ep = calloc(1, sizeof(*ep));
3241		if (ep == NULL)
3242			goto err;
3243		*pnlep = ep;
3244		pnlep = &ep->next;
3245
3246		/* Set entry's path, and if DNR, error */
3247		relpath = ftsent->fts_path + strlen(cpusetmnt);
3248		if (strlen(relpath) == 0)
3249			relpath = "/";
3250		ep->path = strdup(relpath);
3251		if (ep->path == NULL)
3252			goto err;
3253		if (ftsent->fts_info == FTS_DNR) {
3254			ep->info = CPUSET_FTS_ERR_DNR;
3255			ep->err = ftsent->fts_errno;
3256			continue;
3257		}
3258
3259		/* ftsent is a -readable- cpuset: set entry's stat, etc */
3260		ep->stat = calloc(1, sizeof(struct stat));
3261		if (ep->stat == NULL)
3262			goto err;
3263		if (stat(ftsent->fts_path, ep->stat) < 0) {
3264			ep->info = CPUSET_FTS_ERR_STAT;
3265			ep->err = ftsent->fts_errno;
3266			continue;
3267		}
3268
3269		ep->cpuset = calloc(1, sizeof(struct cpuset));
3270		if (ep->cpuset == NULL)
3271			goto err;
3272		if (cpuset_query(ep->cpuset, relpath) < 0) {
3273			ep->info = CPUSET_FTS_ERR_CPUSET;
3274			ep->err = errno;
3275			continue;
3276		}
3277		ep->info = CPUSET_FTS_CPUSET;
3278	}
3279
3280	(void)fts_close(fts);
3281	cpuset_fts_rewind(cs_tree);
3282	return cs_tree;
3283
3284err:
3285	if (cs_tree)
3286		cpuset_fts_close(cs_tree);
3287	if (fts)
3288		(void)fts_close(fts);
3289	return NULL;
3290}
3291
3292/* Return pointer to next cpuset entry in hierarchy */
3293const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
3294{
3295	const struct cpuset_fts_entry *cs_entry = cs_tree->next;
3296	if (cs_tree->next != NULL)	/* seek to next entry */
3297		cs_tree->next = cs_tree->next->next;
3298	return cs_entry;
3299}
3300
3301/* Reverse list of cpusets, in place.  Simulates pre-order/post-order flip. */
3302void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
3303{
3304	struct cpuset_fts_entry *cs1, *cs2, *cs3;
3305
3306	/*
3307	 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
3308	 * is redirected from cs3 to cs1.
3309	 */
3310
3311	cs1 = cs2 = NULL;
3312	cs3 = cs_tree->head;
3313	while (cs3) {
3314		cs1 = cs2;
3315		cs2 = cs3;
3316		cs3 = cs3->next;
3317		cs2->next = cs1;
3318	}
3319	cs_tree->head = cs2;
3320	cpuset_fts_rewind(cs_tree);
3321}
3322
3323/* Rewind cpuset list to beginning */
3324void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
3325{
3326	cs_tree->next = cs_tree->head;
3327}
3328
3329/* Return pointer to nul-terminated cpuset path of entry in hierarchy */
3330const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
3331{
3332	return cs_entry->path;
3333}
3334
3335/* Return pointer to stat(2) structure of a cpuset entry's directory */
3336const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
3337{
3338	return cs_entry->stat;
3339}
3340
3341/* Return pointer to cpuset structure of a cpuset entry */
3342const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
3343					   *cs_entry)
3344{
3345	return cs_entry->cpuset;
3346}
3347
3348/* Return value of errno (0 if no error) on attempted cpuset operations */
3349int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry)
3350{
3351	return cs_entry->err;
3352}
3353
3354/* Return operation identity causing error */
3355int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry)
3356{
3357	return cs_entry->info;
3358}
3359
3360/* Close a cpuset hierarchy handle (free's all associated memory) */
3361void cpuset_fts_close(struct cpuset_fts_tree *cs_tree)
3362{
3363	struct cpuset_fts_entry *cs_entry = cs_tree->head;
3364
3365	while (cs_entry) {
3366		struct cpuset_fts_entry *ep = cs_entry;
3367
3368		cs_entry = cs_entry->next;
3369		free(ep->path);
3370		free(ep->stat);
3371		cpuset_free(ep->cpuset);
3372		free(ep);
3373	}
3374	free(cs_tree);
3375}
3376
3377/* Bind current task to cpu (uses sched_setaffinity(2)) */
3378int cpuset_cpubind(int cpu)
3379{
3380	struct bitmask *bmp;
3381	int r;
3382
3383	if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
3384		return -1;
3385	bitmask_setbit(bmp, cpu);
3386	r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp));
3387	bitmask_free(bmp);
3388	return r;
3389}
3390
3391/*
3392 * int cpuset_latestcpu(pid_t pid)
3393 *
3394 * Return most recent CPU on which task pid executed.  If pid == 0,
3395 * examine current task.
3396 *
3397 * The last used CPU is visible for a given pid as field #39 (starting
3398 * with #1) in the file /proc/pid/stat.  Currently this file has 41
3399 * fields, so this is the 3rd-from-last field.
3400 *
3401 * Unfortunately field #2 is a command name and might have embedded
3402 * whitespace.  So we can't just count white space separated fields.
3403 * Fortunately, this command name is surrounded by parentheses, as
3404 * for example "(sh)", and that closing parenthesis is the last ')'
3405 * character in the line.  No remaining fields can have embedded
3406 * whitespace or parentheses.  So instead of looking for the 39th
3407 * white space separated field, we can look for the 37th white space
3408 * separated field past the last ')' character on the line.
3409 */
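
/*
 * For example, a task named "my prog" produces a stat line that
 * begins "1234 (my prog) R 1 ...": counting whitespace separated
 * fields from the start would be thrown off by the embedded space,
 * but counting from the last ')' is safe.
 */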
3410
3411/* Return most recent CPU on which task pid executed */
3412int cpuset_latestcpu(pid_t pid)
3413{
3414	char buf[PATH_MAX];
	char *bp;
	int fd = -1;
	int cpu = -1;
	ssize_t n;
3418
3419	if (pid == 0)
3420		snprintf(buf, sizeof(buf), "/proc/self/stat");
3421	else
3422		snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
3423
3424	if ((fd = open(buf, O_RDONLY)) < 0)
3425		goto err;
	if ((n = read(fd, buf, sizeof(buf) - 1)) < 1)
		goto err;
	buf[n] = '\0';	/* nul-terminate before strrchr()/sscanf() below */
	close(fd);
3429
3430	bp = strrchr(buf, ')');
3431	if (bp)
		sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u %*u %*u "
		       "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u "
		       "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u "
		       "%*u %*u %*u %*u %*u %*u %u",	/* 37th field past ')' */
		       &cpu);
3434	if (cpu < 0)
3435		errno = EINVAL;
3436	return cpu;
3437err:
3438	if (fd >= 0)
3439		close(fd);
3440	return -1;
3441}
3442
3443/* Bind current task to memory (uses set_mempolicy(2)) */
3444int cpuset_membind(int mem)
3445{
3446	struct bitmask *bmp;
3447	int r;
3448
3449	if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL)
3450		return -1;
3451	bitmask_setbit(bmp, mem);
3452#if HAVE_DECL_MPOL_BIND
3453	r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1);
3454#else
3455	r = -1;
3456	errno = ENOSYS;
3457#endif
3458	bitmask_free(bmp);
3459	return r;
3460}
3461
3462/* [optional] Return Memory Node holding page at specified addr */
3463int cpuset_addr2node(void *addr)
3464{
3465	int node = -1;
3466
3467#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
3468	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) {
3469		/* I realize this seems redundant, but I _want_ to make sure
3470		 * that this value is -1. */
3471		node = -1;
3472	}
3473#endif
3474	return node;
3475}
3476
3477/*
3478 * Transform cpuset into Text Format Representation in buffer 'buf',
3479 * of length 'buflen', nul-terminated if space allows.  Return number
3480 * of characters that would have been written, if enough space had
3481 * been available, in the same way that snprintf() does.
3482 */
3483
3484/* Export cpuset settings to a regular file */
3485int cpuset_export(const struct cpuset *cp, char *buf, int buflen)
3486{
3487	char *tmp = NULL;
3488	int n = 0;
3489
3490	if (cp->cpu_exclusive)
3491		n += snprintf(buf + n, max(buflen - n, 0), "cpu_exclusive\n");
3492
3493	if (cp->mem_exclusive)
3494		n += snprintf(buf + n, max(buflen - n, 0), "mem_exclusive\n");
3495
3496	if (cp->notify_on_release)
3497		n += snprintf(buf + n, max(buflen - n, 0),
3498			      "notify_on_release\n");
3499
3500	if (cp->memory_pressure_enabled)
3501		n += snprintf(buf + n, max(buflen - n, 0),
3502			      "memory_pressure_enabled\n");
3503
3504	if (cp->memory_migrate)
3505		n += snprintf(buf + n, max(buflen - n, 0), "memory_migrate\n");
3506
3507	if (cp->memory_spread_page)
3508		n += snprintf(buf + n, max(buflen - n, 0),
3509			      "memory_spread_page\n");
3510
3511	if (cp->memory_spread_slab)
3512		n += snprintf(buf + n, max(buflen - n, 0),
3513			      "memory_spread_slab\n");
3514
3515	if ((tmp = sprint_mask_buf(cp->cpus)) == NULL)
3516		return -1;
3517	n += snprintf(buf + n, max(buflen - n, 0), "cpus %s\n", tmp);
3518	free(tmp);
3519	tmp = NULL;
3520
3521	if ((tmp = sprint_mask_buf(cp->mems)) == NULL)
3522		return -1;
3523	n += snprintf(buf + n, max(buflen - n, 0), "mems %s\n", tmp);
3524	free(tmp);
3525	tmp = NULL;
3526
3527	return n;
3528}
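
/*
 * Example of the resulting Text Format Representation (values
 * illustrative), which cpuset_import() below accepts back:
 *
 *	cpu_exclusive
 *	memory_migrate
 *	cpus 1-3,7
 *	mems 0-1
 */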
3529
3530static int import_list(UNUSED const char *tok, const char *arg,
3531		       struct bitmask *bmp, char *emsg, int elen)
3532{
3533	if (bitmask_parselist(arg, bmp) < 0) {
3534		if (emsg)
3535			snprintf(emsg, elen, "Invalid list format: %s", arg);
3536		return -1;
3537	}
3538	return 0;
3539}
3540
3541static void stolower(char *s)
3542{
3543	while (*s) {
3544		unsigned char c = *s;
3545		*s = tolower(c);
3546		s++;
3547	}
3548}
3549
3550/* Import cpuset settings from a regular file */
3551int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
3552		  char *emsg, int elen)
3553{
3554	char *linebuf = NULL;
3555	int linebuflen;
3556	int linenum = 0;
3557	int offset = 0;
3558
3559	linebuflen = strlen(buf) + 1;
3560	if ((linebuf = malloc(linebuflen)) == NULL) {
3561		if (emsg)
3562			snprintf(emsg, elen, "Insufficient memory");
3563		goto err;
3564	}
3565
3566	while (slgets(linebuf, linebuflen, buf, &offset)) {
3567		char *tok, *arg;
3568		char *ptr;	/* for strtok_r */
3569
3570		linenum++;
3571		if ((tok = strchr(linebuf, '#')) != NULL)
3572			*tok = 0;
3573		if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
3574			continue;
3575		stolower(tok);
3576
3577		arg = strtok_r(0, " \t", &ptr);
3578
3579		if (streq(tok, "cpu_exclusive")) {
3580			cp->cpu_exclusive = 1;
3581			goto eol;
3582		}
3583		if (streq(tok, "mem_exclusive")) {
3584			cp->mem_exclusive = 1;
3585			goto eol;
3586		}
3587		if (streq(tok, "notify_on_release")) {
3588			cp->notify_on_release = 1;
3589			goto eol;
3590		}
3591		if (streq(tok, "memory_pressure_enabled")) {
3592			cp->memory_pressure_enabled = 1;
3593			goto eol;
3594		}
3595		if (streq(tok, "memory_migrate")) {
3596			cp->memory_migrate = 1;
3597			goto eol;
3598		}
3599		if (streq(tok, "memory_spread_page")) {
3600			cp->memory_spread_page = 1;
3601			goto eol;
3602		}
3603		if (streq(tok, "memory_spread_slab")) {
3604			cp->memory_spread_slab = 1;
3605			goto eol;
3606		}
3607		if (streq(tok, "cpu") || streq(tok, "cpus")) {
3608			if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
3609				goto err;
3610			goto eol;
3611		}
3612		if (streq(tok, "mem") || streq(tok, "mems")) {
3613			if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
3614				goto err;
3615			goto eol;
3616		}
3617		if (emsg)
3618			snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
3619		goto err;
3620eol:
3621		if ((tok = strtok_r(0, " \t", &ptr)) != NULL) {
3622			if (emsg)
3623				snprintf(emsg, elen, "Surplus token: '%s'",
3624					 tok);
3625			goto err;
3626		}
3627		continue;
3628	}
3629
3630	free(linebuf);
3631
3632	if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
3633		cpuset_localcpus(cp->mems, cp->cpus);
3634	else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
3635		cpuset_localmems(cp->cpus, cp->mems);
3636
3637	/*
3638	 * All cpuset attributes are determined in an import.
3639	 * Those that aren't explicitly specified are presumed
3640	 * to be unchanged (zero, if it's a freshly allocated
3641	 * struct cpuset.)
3642	 */
3643
3644	cp->cpus_valid = 1;
3645	cp->mems_valid = 1;
3646	cp->cpu_exclusive_valid = 1;
3647	cp->mem_exclusive_valid = 1;
3648	cp->notify_on_release_valid = 1;
3649	cp->memory_migrate_valid = 1;
3650	cp->memory_pressure_enabled_valid = 1;
3651	cp->memory_spread_page_valid = 1;
3652	cp->memory_spread_slab_valid = 1;
3653
3654	return 0;
3655err:
3656	if (elinenum)
3657		*elinenum = linenum;
3658	if (linebuf)
3659		free(linebuf);
3660	return -1;
3661}
3662
3663/* Pin current task CPU (and memory) */
3664int cpuset_pin(int relcpu)
3665{
3666	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3667	int cpu, r;
3668
3669	if (check() < 0)
3670		return -1;
3671
3672	do {
3673		cpuset_free_placement(plc1);
3674		plc1 = cpuset_get_placement(0);
3675
3676		r = 0;
3677		if (cpuset_unpin() < 0)
3678			r = -1;
3679		cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
3680		if (cpuset_cpubind(cpu) < 0)
3681			r = -1;
3682
3683		cpuset_free_placement(plc2);
3684		plc2 = cpuset_get_placement(0);
3685	} while (!cpuset_equal_placement(plc1, plc2));
3686
3687	cpuset_free_placement(plc1);
3688	cpuset_free_placement(plc2);
3689	return r;
3690}
3691
3692/* Return number CPUs in current tasks cpuset */
3693int cpuset_size()
3694{
3695	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3696	int r;
3697
3698	if (check() < 0)
3699		return -1;
3700
3701	do {
3702		cpuset_free_placement(plc1);
3703		plc1 = cpuset_get_placement(0);
3704
3705		r = cpuset_cpus_weight(0);
3706
3707		cpuset_free_placement(plc2);
3708		plc2 = cpuset_get_placement(0);
3709	} while (!cpuset_equal_placement(plc1, plc2));
3710
3711	cpuset_free_placement(plc1);
3712	cpuset_free_placement(plc2);
3713	return r;
3714}
3715
3716/* Return relative CPU number, within current cpuset, last executed on */
3717int cpuset_where()
3718{
3719	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3720	int r;
3721
3722	if (check() < 0)
3723		return -1;
3724
3725	do {
3726		cpuset_free_placement(plc1);
3727		plc1 = cpuset_get_placement(0);
3728
3729		r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));
3730
3731		cpuset_free_placement(plc2);
3732		plc2 = cpuset_get_placement(0);
3733	} while (!cpuset_equal_placement(plc1, plc2));
3734
3735	cpuset_free_placement(plc1);
3736	cpuset_free_placement(plc2);
3737	return r;
3738}
3739
3740/* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */
3741int cpuset_unpin()
3742{
3743	struct bitmask *cpus = NULL, *mems = NULL;
3744	int r = -1;
3745
3746	if (check() < 0)
3747		goto err;
3748
3749	/*
3750	 * Don't need cpuset_*_placement() guard against concurrent
3751	 * cpuset migration, because none of the following depends
3752	 * on the tasks cpuset placement.
3753	 */
3754
3755	if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
3756		goto err;
3757	bitmask_setall(cpus);
3758	if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
3759		goto err;
3760
3761	if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
3762		goto err;
3763#if HAVE_DECL_MPOL_DEFAULT
3764	if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
3765			  bitmask_nbits(mems) + 1) < 0)
3766		goto err;
3767	r = 0;
3768#endif
3769	/* fall into ... */
3770err:
3771	bitmask_free(cpus);
3772	bitmask_free(mems);
3773	return r;
3774
3775}
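
/*
 * Example (a sketch, not part of the library): pin to the second
 * cpu of the current cpuset, report where we ran, then release the
 * pin again.
 *
 *	if (cpuset_size() >= 2 && cpuset_pin(1) == 0) {
 *		printf("on relative cpu %d\n", cpuset_where());
 *		cpuset_unpin();
 *	}
 */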
3776
3777struct cpuset_function_list {
3778	const char *fname;
3779	void *func;
3780} flist[] = {
3781	{"cpuset_version", cpuset_version},
3782	{"cpuset_alloc", cpuset_alloc},
3783	{"cpuset_free", cpuset_free},
3784	{"cpuset_cpus_nbits", cpuset_cpus_nbits},
3785	{"cpuset_mems_nbits", cpuset_mems_nbits},
3786	{"cpuset_setcpus", cpuset_setcpus},
3787	{"cpuset_setmems", cpuset_setmems},
3788	{"cpuset_set_iopt", cpuset_set_iopt},
3789	{"cpuset_set_sopt", cpuset_set_sopt},
3790	{"cpuset_getcpus", cpuset_getcpus},
3791	{"cpuset_getmems", cpuset_getmems},
3792	{"cpuset_cpus_weight", cpuset_cpus_weight},
3793	{"cpuset_mems_weight", cpuset_mems_weight},
3794	{"cpuset_get_iopt", cpuset_get_iopt},
3795	{"cpuset_get_sopt", cpuset_get_sopt},
3796	{"cpuset_localcpus", cpuset_localcpus},
3797	{"cpuset_localmems", cpuset_localmems},
3798	{"cpuset_cpumemdist", cpuset_cpumemdist},
3799	{"cpuset_cpu2node", cpuset_cpu2node},
3800	{"cpuset_addr2node", cpuset_addr2node},
3801	{"cpuset_create", cpuset_create},
3802	{"cpuset_delete", cpuset_delete},
3803	{"cpuset_query", cpuset_query},
3804	{"cpuset_modify", cpuset_modify},
3805	{"cpuset_getcpusetpath", cpuset_getcpusetpath},
3806	{"cpuset_cpusetofpid", cpuset_cpusetofpid},
3807	{"cpuset_mountpoint", cpuset_mountpoint},
3808	{"cpuset_collides_exclusive", cpuset_collides_exclusive},
3809	{"cpuset_nuke", cpuset_nuke},
3810	{"cpuset_init_pidlist", cpuset_init_pidlist},
3811	{"cpuset_pidlist_length", cpuset_pidlist_length},
3812	{"cpuset_get_pidlist", cpuset_get_pidlist},
3813	{"cpuset_freepidlist", cpuset_freepidlist},
3814	{"cpuset_move", cpuset_move},
3815	{"cpuset_move_all", cpuset_move_all},
3816	{"cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks},
3817	{"cpuset_migrate", cpuset_migrate},
3818	{"cpuset_migrate_all", cpuset_migrate_all},
3819	{"cpuset_reattach", cpuset_reattach},
3820	{"cpuset_open_memory_pressure", cpuset_open_memory_pressure},
3821	{"cpuset_read_memory_pressure", cpuset_read_memory_pressure},
3822	{"cpuset_close_memory_pressure", cpuset_close_memory_pressure},
3823	{"cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu},
3824	{"cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu},
3825	{"cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem},
3826	{"cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem},
3827	{"cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu},
3828	{"cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu},
3829	{"cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem},
3830	{"cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem},
3831	{"cpuset_get_placement", cpuset_get_placement},
3832	{"cpuset_equal_placement", cpuset_equal_placement},
3833	{"cpuset_free_placement", cpuset_free_placement},
3834	{"cpuset_fts_open", cpuset_fts_open},
3835	{"cpuset_fts_read", cpuset_fts_read},
3836	{"cpuset_fts_reverse", cpuset_fts_reverse},
3837	{"cpuset_fts_rewind", cpuset_fts_rewind},
3838	{"cpuset_fts_get_path", cpuset_fts_get_path},
3839	{"cpuset_fts_get_stat", cpuset_fts_get_stat},
3840	{"cpuset_fts_get_cpuset", cpuset_fts_get_cpuset},
3841	{"cpuset_fts_get_errno", cpuset_fts_get_errno},
3842	{"cpuset_fts_get_info", cpuset_fts_get_info},
3843	{"cpuset_fts_close", cpuset_fts_close},
3844	{"cpuset_cpubind", cpuset_cpubind},
3845	{"cpuset_latestcpu", cpuset_latestcpu},
3846	{"cpuset_membind", cpuset_membind},
3847	{"cpuset_export", cpuset_export},
3848	{"cpuset_import", cpuset_import},
3849	{"cpuset_function", cpuset_function},
3850	{"cpuset_pin", cpuset_pin},
3851	{"cpuset_size", cpuset_size},
3852	{"cpuset_where", cpuset_where},
3853	{"cpuset_unpin", cpuset_unpin},
3854};
3855
3856/* Return pointer to a libcpuset.so function, or NULL */
3857void *cpuset_function(const char *function_name)
3858{
3859	unsigned int i;
3860
3861	for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++)
3862		if (streq(function_name, flist[i].fname))
3863			return flist[i].func;
3864	return NULL;
3865}
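
/*
 * Example (a sketch, not part of the library; "/demo" is
 * hypothetical): bind to the [optional] cpuset_nuke() through
 * cpuset_function(), as recommended above for [optional] calls, so
 * the code still links against older libcpuset versions that lack
 * it.
 *
 *	int (*nuke)(const char *, unsigned int);
 *
 *	nuke = (int (*)(const char *, unsigned int))
 *	    cpuset_function("cpuset_nuke");
 *	if (nuke != NULL)
 *		(*nuke)("/demo", 10);
 */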
3866
3867/* Fortran interface to basic cpuset routines */
3868int cpuset_pin_(int *ptr_relcpu)
3869{
3870	return cpuset_pin(*ptr_relcpu);
3871}
3872
3873int cpuset_size_(void)
3874{
3875	return cpuset_size();
3876}
3877
3878int cpuset_where_(void)
3879{
3880	return cpuset_where();
3881}
3882
3883int cpuset_unpin_(void)
3884{
3885	return cpuset_unpin();
3886}
3887
3888#endif /* HAVE_LINUX_MEMPOLICY_H */
3889