libcpuset.c revision 1e6f5a673655551de5734ff31ef48cd63b604e6d
/*
 * cpuset user library implementation.
 *
 * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved.
 *
 * Paul Jackson <pj@sgi.com>
 */

/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation; either version 2.1 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#define _XOPEN_SOURCE 500	/* need to see pread() */
#define _BSD_SOURCE 1		/* need to see syscall() */
#include <unistd.h>

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fts.h>
#include <limits.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <sys/utsname.h> /* for cpuset_would_crash_kernel() */

#include "bitmask.h"
#include "cpuset.h"
#include "common.h"
#include "test.h"
#include "linux_syscall_numbers.h"
#include "config.h"
#if HAVE_LINUX_MEMPOLICY_H
#include <linux/mempolicy.h>

/* Bump version, and update Change History, when libcpuset API changes */
#define CPUSET_VERSION 3

/*
 * For a history of what changed in each version, see the "Change
 * History" section, at the end of the libcpuset master document.
 */

int cpuset_version(void)
{
	return CPUSET_VERSION;
}
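
/*
 * Example (illustrative sketch, not exercised by this library): an
 * application written against version 3 of this API might verify at
 * startup that the runtime library is new enough:
 *
 *	if (cpuset_version() < 3)
 *		fprintf(stderr, "libcpuset is too old\n");
 */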

struct cpuset {
	struct bitmask *cpus;
	struct bitmask *mems;
	char cpu_exclusive;
	char mem_exclusive;
	char mem_hardwall;
	char notify_on_release;
	char memory_migrate;
	char memory_pressure_enabled;
	char memory_spread_page;
	char memory_spread_slab;
	char sched_load_balance;
	int sched_relax_domain_level;

	/*
	 * Each field 'x' above gets an 'x_valid' field below.
	 * The apply_cpuset_settings() routine will only set those
	 * fields whose corresponding *_valid flags are set.  The
	 * cpuset_alloc() routine clears these flags as part of the
	 * clear in calloc(), and the various cpuset_set*() routines
	 * set these flags when setting the corresponding value.
	 *
	 * The purpose of these valid fields is to ensure that when
	 * we create a new cpuset, we don't accidentally overwrite
	 * some non-zero kernel default, such as an inherited
	 * memory_spread_* flag, just because the user application
	 * code didn't override the default zero settings resulting
	 * from the calloc() call in cpuset_alloc().
	 *
	 * The choice of 'char' for the type of the flags above,
	 * but a bitfield for the flags below, is somewhat capricious.
	 */
	unsigned cpus_valid:1;
	unsigned mems_valid:1;
	unsigned cpu_exclusive_valid:1;
	unsigned mem_exclusive_valid:1;
	unsigned mem_hardwall_valid:1;
	unsigned notify_on_release_valid:1;
	unsigned memory_migrate_valid:1;
	unsigned memory_pressure_enabled_valid:1;
	unsigned memory_spread_page_valid:1;
	unsigned memory_spread_slab_valid:1;
	unsigned sched_load_balance_valid:1;
	unsigned sched_relax_domain_level_valid:1;

	/*
	 * Each field 'x' above also gets an 'x_dirty' flag below,
	 * set when the corresponding value is modified, marking it
	 * as needing to be written back.
	 */
	unsigned cpus_dirty:1;
	unsigned mems_dirty:1;
	unsigned cpu_exclusive_dirty:1;
	unsigned mem_exclusive_dirty:1;
	unsigned mem_hardwall_dirty:1;
	unsigned notify_on_release_dirty:1;
	unsigned memory_migrate_dirty:1;
	unsigned memory_pressure_enabled_dirty:1;
	unsigned memory_spread_page_dirty:1;
	unsigned memory_spread_slab_dirty:1;
	unsigned sched_load_balance_dirty:1;
	unsigned sched_relax_domain_level_dirty:1;
};

/* Presumed cpuset file system mount point */
static const char *cpusetmnt = "/dev/cpuset";

/* Stashed copy of cpunodemap[], mapping each cpu to its node. */
static const char *mapfile = "/var/run/cpunodemap";

/* The primary source for the cpunodemap[] is available below here. */
static const char *sysdevices = "/sys/devices/system";

#define max(a,b) ((a) > (b) ? (a) : (b))
#define min(a,b) ((a) < (b) ? (a) : (b))

/* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
#define SMALL_BUFSZ 16

/*
 * The 'mask_size_file' is used to ferret out the kernel cpumask_t
 * and nodemask_t sizes.  The lines in this file that begin with the
 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
 * and nodemask string, respectively.  The lengths of these strings
 * reflect the kernel's internal cpumask_t and nodemask_t sizes,
 * sizes that are needed to correctly call the sched_setaffinity
 * and set_mempolicy system calls, and to size user level
 * bitmasks to match the kernel's.
 */

static const char *mask_size_file = "/proc/self/status";
static const char *cpumask_prefix = "Cpus_allowed:\t";
static const char *nodemask_prefix = "Mems_allowed:\t";
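
/*
 * For example, on a kernel configured with 64-bit cpumask_t and
 * nodemask_t, /proc/self/status carries lines such as (values
 * illustrative):
 *
 *	Cpus_allowed:	ffffffff,ffffffff
 *	Mems_allowed:	00000000,00000001
 *
 * i.e. comma separated groups of eight hex digits, one group per
 * 32 bits of kernel mask.
 */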

/*
 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
 *
 * The first time we need these, we parse the Cpus_allowed and
 * Mems_allowed lines from mask_size_file ("/proc/self/status").
 */

static int cpumask_sz;
static int nodemask_sz;

/*
 * These defaults only kick in if we fail to size the kernel
 * cpumask and nodemask by reading the Cpus_allowed and
 * Mems_allowed fields from the /proc/self/status file.
 */

#define DEFCPUBITS (512)
#define DEFNODEBITS (DEFCPUBITS/2)

/*
 * Arch-neutral API for obtaining NUMA distances between CPUs
 * and Memory Nodes, via the files:
 *	/sys/devices/system/node/nodeN/distance
 * which have lines such as:
 *	46 66 10 20
 * which say that for a cpu on node N (from the path above), the
 * distances to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
 * respectively.
 */

static const char *distance_directory = "/sys/devices/system/node";

/*
 * Someday, we should disable, then later discard, the SN code
 * marked ALTERNATE_SN_DISTMAP.
 */

#define ALTERNATE_SN_DISTMAP 1
#ifdef ALTERNATE_SN_DISTMAP

/*
 * Alternative SN (SGI ia64) architecture specific API for obtaining
 * NUMA distances between CPUs and Memory Nodes is via the file
 * /proc/sgi_sn/sn_topology, which has lines such as:
 *
 *   node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
 *
 * which says that for each CPU on node 2, the distances to nodes
 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
 *
 * This file has other lines as well, which start with keywords
 * other than "node".  Ignore these other lines.
 */

static const char *sn_topology = "/proc/sgi_sn/sn_topology";
static const char *sn_top_node_prefix = "node ";

#endif

/*
 * Check that cpusets are supported and /dev/cpuset is mounted.
 * If ok, return 0.
 * If not, return -1 and set errno:
 *	ENOSYS - kernel doesn't support cpusets
 *	ENODEV - /dev/cpuset not mounted
 */

static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

static int check()
{
	if (check_state == check_notdone) {
		struct stat statbuf;

		if (stat("/proc/self/cpuset", &statbuf) < 0) {
			check_state = check_enosys;
			goto done;
		}

		if (stat("/dev/cpuset/tasks", &statbuf) < 0) {
			check_state = check_enodev;
			goto done;
		}

		check_state = check_ok;
	}
done:
	switch (check_state) {
	case check_enosys:
		errno = ENOSYS;
		return -1;
	case check_enodev:
		errno = ENODEV;
		return -1;
	default:
		break;
	}
	return 0;
}

static void chomp(char *s)
{
	char *t;

	for (t = s + strlen(s) - 1; t >= s; t--) {
		if (*t == '\n' || *t == '\r')
			*t = '\0';
		else
			break;
	}
}

/*
 * Determine number of bytes in a seekable open file, without
 * assuming that stat(2) on that file has a useful size.
 * Has the side effect of leaving the file rewound to the beginning.
 */
static int filesize(FILE *fp)
{
	int sz = 0;
	rewind(fp);
	while (fgetc(fp) != EOF)
		sz++;
	rewind(fp);
	return sz;
}

/* Are strings s1 and s2 equal? */
static int streq(const char *s1, const char *s2)
{
	return strcmp(s1, s2) == 0;
}

/* Is string 'pre' a prefix of string 's'? */
static int strprefix(const char *s, const char *pre)
{
	return strncmp(s, pre, strlen(pre)) == 0;
}

/*
 * char *flgets(char *buf, int buflen, FILE *fp)
 *
 * Obtain one line from input file fp.  Copy up to first
 * buflen-1 chars of line into buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to buffer buf
 * on success, or NULL if nothing more to read or failure.
 */

static char *flgets(char *buf, int buflen, FILE * fp)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = getc(fp)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	if ((c < 0) && (bp == buf))
		return NULL;

	if (c > 0) {
		while ((c = getc(fp)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}

/*
 * sgetc(const char *inputbuf, int *offsetptr)
 *
 * Return next char from nul-terminated input buffer inputbuf,
 * starting at offset *offsetptr.  Increment *offsetptr.
 * If next char would be nul ('\0'), return EOF and don't
 * increment *offsetptr.
 */

static int sgetc(const char *inputbuf, int *offsetptr)
{
	char c;

	if ((c = inputbuf[*offsetptr]) != 0) {
		*offsetptr = *offsetptr + 1;
		return c;
	} else {
		return EOF;
	}
}

/*
 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
 *
 * Obtain next line from nul-terminated input buffer 'inputbuf',
 * starting at offset *offsetptr.  Copy up to first buflen-1
 * chars of line into output buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to output buffer
 * buf on success, or NULL if nothing more to read.
 */

static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	if ((c < 0) && (bp == buf))
		return NULL;

	if (c > 0) {
		while ((c = sgetc(inputbuf, offsetptr)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}

/*
 * time_t get_mtime(const char *path)
 *
 * Return modtime of file at location path, else return 0.
 */

static time_t get_mtime(const char *path)
{
	struct stat statbuf;

	if (stat(path, &statbuf) != 0)
		return 0;
	return statbuf.st_mtime;
}

/*
 * int set_mtime(const char *path, time_t mtime)
 *
 * Set modtime of file 'path' to 'mtime'.  Return 0 on success,
 * or -1 on error, setting errno.
 */

static int set_mtime(const char *path, time_t mtime)
{
	struct utimbuf times;

	times.actime = mtime;
	times.modtime = mtime;
	return utime(path, &times);
}

/*
 * True if two pathnames resolve to the same file.
 * False if either path cannot be stat'd,
 * or if the two paths resolve to different files.
 */

static int samefile(const char *path1, const char *path2)
{
	struct stat sb1, sb2;

	if (stat(path1, &sb1) != 0)
		return 0;
	if (stat(path2, &sb2) != 0)
		return 0;
	return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
}

#define slash(c) (*(c) == '/')
#define eocomp(c) (slash(c) || !*(c))
#define dot1(c) (*(c) == '.' && eocomp(c+1))

/* In place path compression.  Remove extra dots and slashes. */
static char *pathcomp(char *p)
{
	char *a = p;
	char *b = p;

	if (!p || !*p)
		return p;
	if (slash(p))
		*b++ = *a++;
	for (;;) {
		if (slash(a))
			while (slash(++a))
				continue;
		if (!*a) {
			if (b == p)
				*b++ = '.';
			*b = '\0';
			return (p);
		} else if (dot1(a)) {
			a++;
		} else {
			if ((b != p) && !slash(b - 1))
				*b++ = '/';
			while (!eocomp(a))
				*b++ = *a++;
		}
	}
}

#undef slash
#undef eocomp
#undef dot1
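
/*
 * Worked examples (illustrative) of pathcomp() behavior:
 *
 *	"/dev/cpuset//a/./b"  compresses to  "/dev/cpuset/a/b"
 *	"./x"                 compresses to  "x"
 *	"/a/"                 compresses to  "/a"
 *
 * Note that ".." components are deliberately left alone.
 */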

/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Return buf, of length buflen, with name1/name2 stored in it.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
					const char *name2)
{
	(void) snprintf(buf, buflen, "%s/%s", name1, name2);
	return pathcomp(buf);
}

/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Return buf, of length buflen, with name1/name2/name3 stored in it.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
					const char *name2, const char *name3)
{
	(void) snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	return pathcomp(buf);
}

/*
 * fullpath(buf, buflen, name)
 *
 * Put full path of cpuset 'name' in buffer 'buf'.  If name
 * starts with a slash (``/``) character, then this is a path
 * relative to ``/dev/cpuset``, otherwise it is relative to
 * the current task's cpuset.  Return 0 on success, else
 * -1 on error, setting errno.
 */

static int fullpath(char *buf, int buflen, const char *name)
{
	int len;

	/* easy case */
	if (*name == '/') {
		pathcat2(buf, buflen, cpusetmnt, name);
		pathcomp(buf);
		return 0;
	}

	/* hard case */
	snprintf(buf, buflen, "%s/", cpusetmnt);
	len = strlen(buf);
	if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
		return -1;
	if (strlen(buf) >= buflen - 1 - strlen(name)) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name);
	pathcomp(buf);
	return 0;
}
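
/*
 * For example (cpuset names illustrative): fullpath() maps the
 * absolute cpuset name "/Apps" to "/dev/cpuset/Apps", while the
 * relative name "sub", for a task currently in cpuset "/Apps",
 * maps to "/dev/cpuset/Apps/sub".
 */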

/*
 * fullpath2(buf, buflen, name1, name2)
 *
 * Like fullpath(), only concatenate two pathname components on end.
 */

static int fullpath2(char *buf, int buflen, const char *name1,
							const char *name2)
{
	if (fullpath(buf, buflen, name1) < 0)
		return -1;
	if (strlen(buf) >= buflen - 1 - strlen(name2)) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name2);
	pathcomp(buf);
	return 0;
}

/*
 * Convert the string length of an ascii hex mask to the number
 * of bits represented by that mask.
 *
 * The cpumask and nodemask values in /proc/self/status are in an
 * ascii format that uses 9 characters for each 32 bits of mask.
 */
static int s2nbits(const char *s)
{
	return strlen(s) * 32 / 9;
}
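
/*
 * Worked example (illustrative): a 64-bit mask prints as
 * "ffffffff,ffffffff" followed by the newline that fgets() leaves
 * in place - 18 characters in all - and 18 * 32 / 9 == 64 bits.
 * This is why update_mask_sizes() below depends on that trailing
 * newline.
 */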

static void update_mask_sizes()
{
	FILE *fp = NULL;
	char *buf = NULL;
	int fsize;

	if ((fp = fopen(mask_size_file, "r")) == NULL)
		goto done;
	fsize = filesize(fp);
	if ((buf = malloc(fsize)) == NULL)
		goto done;

	/*
	 * Beware: mask sizing arithmetic is fussy.
	 * The trailing newline left by fgets() is required.
	 */
	while (fgets(buf, fsize, fp)) {
		if (strprefix(buf, cpumask_prefix))
			cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
		if (strprefix(buf, nodemask_prefix))
			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
	}
done:
	if (buf != NULL)
		free(buf);
	if (fp != NULL)
		fclose(fp);
	if (cpumask_sz == 0)
		cpumask_sz = DEFCPUBITS;
	if (nodemask_sz == 0)
		nodemask_sz = DEFNODEBITS;
}

/* Allocate a new struct cpuset */
struct cpuset *cpuset_alloc()
{
	struct cpuset *cp = NULL;
	int nbits;

	if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
		goto err;

	nbits = cpuset_cpus_nbits();
	if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
		goto err;

	nbits = cpuset_mems_nbits();
	if ((cp->mems = bitmask_alloc(nbits)) == NULL)
		goto err;

	return cp;
err:
	if (cp && cp->cpus)
		bitmask_free(cp->cpus);
	if (cp && cp->mems)
		bitmask_free(cp->mems);
	if (cp)
		free(cp);
	return NULL;
}

/* Free struct cpuset *cp */
void cpuset_free(struct cpuset *cp)
{
	if (!cp)
		return;
	if (cp->cpus)
		bitmask_free(cp->cpus);
	if (cp->mems)
		bitmask_free(cp->mems);
	free(cp);
}
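
/*
 * Typical allocate/populate/free sequence (illustrative sketch,
 * error handling omitted; the cpuset name "/example" is made up):
 *
 *	struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *	struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits());
 *	struct cpuset *cp = cpuset_alloc();
 *
 *	bitmask_setbit(cpus, 0);
 *	bitmask_setbit(mems, 0);
 *	cpuset_setcpus(cp, cpus);
 *	cpuset_setmems(cp, mems);
 *	cpuset_create("/example", cp);
 *
 *	cpuset_free(cp);
 *	bitmask_free(cpus);
 *	bitmask_free(mems);
 */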

/* Number of bits in a CPU bitmask on current system */
int cpuset_cpus_nbits()
{
	if (cpumask_sz == 0)
		update_mask_sizes();
	return cpumask_sz;
}

/* Number of bits in a Memory bitmask on current system */
int cpuset_mems_nbits()
{
	if (nodemask_sz == 0)
		update_mask_sizes();
	return nodemask_sz;
}

/* Set CPUs in cpuset cp to bitmask cpus */
int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
{
	if (cp->cpus)
		bitmask_free(cp->cpus);
	cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
	if (cp->cpus == NULL)
		return -1;
	bitmask_copy(cp->cpus, cpus);
	cp->cpus_valid = 1;
	cp->cpus_dirty = 1;
	return 0;
}

/* Set Memory Nodes in cpuset cp to bitmask mems */
int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems)
{
	if (cp->mems)
		bitmask_free(cp->mems);
	cp->mems = bitmask_alloc(bitmask_nbits(mems));
	if (cp->mems == NULL)
		return -1;
	bitmask_copy(cp->mems, mems);
	cp->mems_valid = 1;
	cp->mems_dirty = 1;
	return 0;
}

/* Set integer value optname of cpuset cp */
int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value)
{
	if (streq(optionname, "cpu_exclusive")) {
		cp->cpu_exclusive = !!value;
		cp->cpu_exclusive_valid = 1;
		cp->cpu_exclusive_dirty = 1;
	} else if (streq(optionname, "mem_exclusive")) {
		cp->mem_exclusive = !!value;
		cp->mem_exclusive_valid = 1;
		cp->mem_exclusive_dirty = 1;
	} else if (streq(optionname, "mem_hardwall")) {
		cp->mem_hardwall = !!value;
		cp->mem_hardwall_valid = 1;
		cp->mem_hardwall_dirty = 1;
	} else if (streq(optionname, "notify_on_release")) {
		cp->notify_on_release = !!value;
		cp->notify_on_release_valid = 1;
		cp->notify_on_release_dirty = 1;
	} else if (streq(optionname, "memory_pressure_enabled")) {
		cp->memory_pressure_enabled = !!value;
		cp->memory_pressure_enabled_valid = 1;
		cp->memory_pressure_enabled_dirty = 1;
	} else if (streq(optionname, "memory_migrate")) {
		cp->memory_migrate = !!value;
		cp->memory_migrate_valid = 1;
		cp->memory_migrate_dirty = 1;
	} else if (streq(optionname, "memory_spread_page")) {
		cp->memory_spread_page = !!value;
		cp->memory_spread_page_valid = 1;
		cp->memory_spread_page_dirty = 1;
	} else if (streq(optionname, "memory_spread_slab")) {
		cp->memory_spread_slab = !!value;
		cp->memory_spread_slab_valid = 1;
		cp->memory_spread_slab_dirty = 1;
	} else if (streq(optionname, "sched_load_balance")) {
		cp->sched_load_balance = !!value;
		cp->sched_load_balance_valid = 1;
		cp->sched_load_balance_dirty = 1;
	} else if (streq(optionname, "sched_relax_domain_level")) {
		cp->sched_relax_domain_level = value;
		cp->sched_relax_domain_level_valid = 1;
		cp->sched_relax_domain_level_dirty = 1;
	} else
		return -2;	/* optionname not recognized */
	return 0;
}
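
/*
 * For example (illustrative): to request that a cpuset spread page
 * cache allocations over its memory nodes,
 *
 *	cpuset_set_iopt(cp, "memory_spread_page", 1);
 *
 * A return of -2 means the option name was not recognized.
 */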

/* [optional] Set string value optname */
int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname,
						UNUSED const char *value)
{
	return -2;	/* For now, all string options unrecognized */
}

/* Return handle for reading memory_pressure. */
int cpuset_open_memory_pressure(const char *cpusetpath)
{
	char buf[PATH_MAX];

	fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure");
	return open(buf, O_RDONLY);
}

/* Return current memory_pressure of cpuset. */
int cpuset_read_memory_pressure(int han)
{
	char buf[SMALL_BUFSZ];
	ssize_t n;

	/* nul terminate what was read before handing it to atoi() */
	if ((n = pread(han, buf, sizeof(buf) - 1, 0L)) < 0)
		return -1;
	buf[n] = '\0';
	return atoi(buf);
}

/* Close handle for reading memory pressure. */
void cpuset_close_memory_pressure(int han)
{
	close(han);
}
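
/*
 * Sketch of the open/read/close protocol (illustrative; assumes a
 * cpuset named "/example" and memory_pressure_enabled set in the
 * top cpuset):
 *
 *	int han = cpuset_open_memory_pressure("/example");
 *	if (han >= 0) {
 *		int pressure = cpuset_read_memory_pressure(han);
 *		... use pressure ...
 *		cpuset_close_memory_pressure(han);
 *	}
 */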

/*
 * Resolve cpuset pointer (to that of current task if cp == NULL).
 *
 * If cp not NULL, just return it.  If cp is NULL, return pointer
 * to temporary cpuset for current task, and set *cp_tofree to
 * pointer to that same temporary cpuset, to be freed later.
 *
 * Return NULL and set errno on error.  Errors can occur when
 * resolving the current task's cpuset.
 */
static const struct cpuset *resolve_cp(const struct cpuset *cp,
			struct cpuset **cp_tofree)
{
	const struct cpuset *rcp;

	if (cp) {
		rcp = cp;
	} else {
		struct cpuset *cp1 = cpuset_alloc();
		if (cp1 == NULL)
			goto err;
		if (cpuset_cpusetofpid(cp1, 0) < 0) {
			cpuset_free(cp1);
			goto err;
		}
		*cp_tofree = cp1;
		rcp = cp1;
	}
	return rcp;
err:
	return NULL;
}

/* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(cpus, cp1->cpus);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(mems, cp1->mems);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Return number of CPUs in cpuset cp (current task if cp == NULL) */
int cpuset_cpus_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->cpus);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
int cpuset_mems_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->mems);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return integer value of option optname in cp */
int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
{
	if (streq(optionname, "cpu_exclusive"))
		return cp->cpu_exclusive;
	else if (streq(optionname, "mem_exclusive"))
		return cp->mem_exclusive;
	else if (streq(optionname, "mem_hardwall"))
		return cp->mem_hardwall;
	else if (streq(optionname, "notify_on_release"))
		return cp->notify_on_release;
	else if (streq(optionname, "memory_pressure_enabled"))
		return cp->memory_pressure_enabled;
	else if (streq(optionname, "memory_migrate"))
		return cp->memory_migrate;
	else if (streq(optionname, "memory_spread_page"))
		return cp->memory_spread_page;
	else if (streq(optionname, "memory_spread_slab"))
		return cp->memory_spread_slab;
	else if (streq(optionname, "sched_load_balance"))
		return cp->sched_load_balance;
	else if (streq(optionname, "sched_relax_domain_level"))
		return cp->sched_relax_domain_level;
	else
		return -2;	/* optionname not recognized */
}

/* [optional] Return string value of optname */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
				UNUSED const char *optionname)
{
	return NULL;	/* For now, all string options unrecognized */
}

static int read_flag(const char *filepath, char *flagp)
{
	char buf[SMALL_BUFSZ];		/* buffer a "0" or "1" flag line */
	int fd = -1;
	ssize_t n;

	if ((fd = open(filepath, O_RDONLY)) < 0)
		goto err;
	if ((n = read(fd, buf, sizeof(buf) - 1)) < 1)
		goto err;
	buf[n] = '\0';			/* nul terminate for atoi() */
	if (atoi(buf))
		*flagp = 1;
	else
		*flagp = 0;
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

static int load_flag(const char *path, char *flagp, const char *flag)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, flag);
	return read_flag(buf, flagp);
}

static int read_number(const char *filepath, int *numberp)
{
	char buf[SMALL_BUFSZ];
	int fd = -1;
	ssize_t n;

	if ((fd = open(filepath, O_RDONLY)) < 0)
		goto err;
	if ((n = read(fd, buf, sizeof(buf) - 1)) < 1)
		goto err;
	buf[n] = '\0';			/* nul terminate for atoi() */
	*numberp = atoi(buf);
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

static int load_number(const char *path, int *numberp, const char *file)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, file);
	return read_number(buf, numberp);
}

static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits)
{
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;
	struct bitmask *bmp = NULL;

	if ((fp = fopen(filepath, "r")) == NULL)
		goto err;
	buflen = filesize(fp) + 1;	/* + 1 for nul term */
	if ((buf = malloc(buflen)) == NULL)
		goto err;
	if (flgets(buf, buflen, fp) == NULL)
		goto err;
	fclose(fp);
	fp = NULL;

	if ((bmp = bitmask_alloc(nbits)) == NULL)
		goto err;
	if (*buf && bitmask_parselist(buf, bmp) < 0)
		goto err;
	if (*bmpp)
		bitmask_free(*bmpp);
	*bmpp = bmp;
	free(buf);
	buf = NULL;
	return 0;
err:
	if (buf != NULL)
		free(buf);
	if (fp != NULL)
		fclose(fp);
	if (bmp != NULL)
		bitmask_free(bmp);
	return -1;
}

static int load_mask(const char *path, struct bitmask **bmpp,
						int nbits, const char *mask)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, mask);
	return read_mask(buf, bmpp, nbits);
}

/* Write string to file at given filepath.  Create the file if needed. */
static int write_string_file(const char *filepath, const char *str)
{
	int fd = -1;

	if ((fd = open(filepath, O_WRONLY|O_CREAT, 0644)) < 0)
		goto err;
	if (write(fd, str, strlen(str)) < 0)
		goto err;
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

/* Size and allocate buffer.  Write bitmask into it.  Caller must free */
static char *sprint_mask_buf(const struct bitmask *bmp)
{
	char *buf = NULL;
	int buflen;
	char c;

	/* First bitmask_displaylist() call just to get the length */
	buflen = bitmask_displaylist(&c, 1, bmp) + 1;	/* "+ 1" for nul */
	if ((buf = malloc(buflen)) == NULL)
		return NULL;
	bitmask_displaylist(buf, buflen, bmp);
	return buf;
}

static int exists_flag(const char *path, const char *flag)
{
	char buf[PATH_MAX];
	struct stat statbuf;
	int rc;

	pathcat2(buf, sizeof(buf), path, flag);
	rc = (stat(buf, &statbuf) == 0);
	errno = 0;
	return rc;
}

static int store_flag(const char *path, const char *flag, int val)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, flag);
	return write_string_file(buf, val ? "1" : "0");
}

static int store_number(const char *path, const char *file, int val)
{
	char buf[PATH_MAX];
	char data[SMALL_BUFSZ];

	memset(data, 0, sizeof(data));
	pathcat2(buf, sizeof(buf), path, file);
	snprintf(data, sizeof(data), "%d", val);
	return write_string_file(buf, data);
}

static int store_mask(const char *path, const char *mask,
						const struct bitmask *bmp)
{
	char maskpath[PATH_MAX];
	char *bp = NULL;
	int rc;

	if (bmp == NULL)
		return 0;
	pathcat2(maskpath, sizeof(maskpath), path, mask);
	if ((bp = sprint_mask_buf(bmp)) == NULL)
		return -1;
	rc = write_string_file(maskpath, bp);
	free(bp);
	return rc;
}

/*
 * Return 1 if 'cpu' is online, else 0 if offline.  Tests the file
 * /sys/devices/system/cpu/cpuN/online for contents 0 or 1,
 * where N == cpu number.
 */

char cpu_online(unsigned int cpu)
{
	char online;
	char cpupath[PATH_MAX];

	(void) snprintf(cpupath, sizeof(cpupath),
		"/sys/devices/system/cpu/cpu%u/online", cpu);
	if (read_flag(cpupath, &online) < 0)
		return 0;	/* oops - guess that cpu's not there */
	return online;
}

/*
 * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()) to
 * the node on which that cpu resides, or to cpuset_mems_nbits() if
 * that is not known.
 *
 * To avoid every user having to recalculate this relation
 * from various clues in the sysfs file system (below the
 * path /sys/devices/system) a copy of this map is kept at
 * /var/run/cpunodemap.
 *
 * The system automatically cleans out files below
 * /var/run on each system reboot (see the init script
 * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry
 * about stale data in this file across reboots.  If the file
 * is missing, let the first process that needs it, and has
 * permission to write in the /var/run directory, rebuild it.
 *
 * If using this cached data, remember the mtime of the mapfile
 * the last time we read it in case something like a hotplug
 * event results in the file being removed and rebuilt, so we
 * can detect if we're using a stale cache, and need to reload.
 *
 * The mtime of this file is set to the time when we did
 * the recalculation of the map, from the clues beneath
 * /sys/devices/system.  This is done so that a program
 * won't see the mapfile it just wrote as being newer than what
 * it just wrote out (store_map) and read the same map back in
 * (load_map).
 */

/*
 * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
 *
 * Note on locking and flockfile(FILE *):
 *
 *  We use flockfile() and funlockfile() instead of directly
 *  calling pthread_mutex_lock and pthread_mutex_unlock on
 *  a pthread_mutex_t, because this avoids forcing the app
 *  to link with libpthread.  The glibc implementation of
 *  flockfile/funlockfile will fall back to no-ops if libpthread
 *  doesn't happen to be linked.
 *
 *  Since flockfile already has the moderately convoluted
 *  combination of weak and strong symbols required to accomplish
 *  this, it is easier to use flockfile() on some handy FILE *
 *  stream as a surrogate for pthread locking than it is to
 *  re-invent that wheel.
 *
 *  Forcing all apps that use cpusets to link with libpthread
 *  would force non-transparent initialization on apps that
 *  might not be prepared to handle it.
 *
 *  The application using libcpuset should never notice this
 *  odd use of flockfile(), because we never return to the
 *  application from any libcpuset call with any such lock held.
 *  We just use this locking for guarding some non-atomic cached
 *  data updates and accesses, internal to some libcpuset calls.
 *  Also, flockfile() allows recursive nesting, so if the app
 *  calls libcpuset holding such a file lock, we won't deadlock
 *  if we go to acquire the same lock.  We'll just get the lock
 *  and increment its counter while we hold it.
 */

static struct cpunodemap {
	int *map;	/* map[cpumask_sz]: maps cpu to its node */
	time_t mtime;	/* modtime of mapfile when last read */
} cpunodemap;

/*
 * rebuild_map() - Rebuild cpunodemap[] from scratch.
 *
 * Situation:
 *	Neither our in-memory cpunodemap[] array nor the
 *	cache of it in mapfile is current.
 * Action:
 *	Rebuild it from first principles and the information
 *	available below /sys/devices/system.
 */

static void rebuild_map()
{
	char buf[PATH_MAX];
	DIR *dir1, *dir2;
	struct dirent *dent1, *dent2;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
		cpunodemap.map[cpu] = -1;
	pathcat2(buf, sizeof(buf), sysdevices, "node");
	if ((dir1 = opendir(buf)) == NULL)
		return;
	while ((dent1 = readdir(dir1)) != NULL) {
		if (sscanf(dent1->d_name, "node%u", &mem) < 1)
			continue;
		pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
		if ((dir2 = opendir(buf)) == NULL)
			continue;
		while ((dent2 = readdir(dir2)) != NULL) {
			if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
				continue;
			if (cpu >= (unsigned int)ncpus
				|| mem >= (unsigned int)nmems)
				continue;
			cpunodemap.map[cpu] = mem;
		}
		closedir(dir2);
	}
	closedir(dir1);
	cpunodemap.mtime = time(0);
}

/*
 * load_map() - Load cpunodemap[] from mapfile.
 *
 * Situation:
 *	The cpunodemap in mapfile is more recent than
 *	what we have in the cpunodemap[] array.
 * Action:
 *	Reload the cpunodemap[] array from the file.
 */

static void load_map()
{
	char buf[SMALL_BUFSZ];		/* buffer 1 line of mapfile */
	FILE *mapfp;			/* File stream on mapfile */
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	/* get_map() may have allocated the array already - don't leak it */
	if (cpunodemap.map == NULL &&
	    (cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL)
		return;
	cpunodemap.mtime = get_mtime(mapfile);
	if ((mapfp = fopen(mapfile, "r")) == NULL)
		return;
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
		cpunodemap.map[cpu] = nmems;
	while (flgets(buf, sizeof(buf), mapfp) != NULL) {
		if (sscanf(buf, "%u %u", &cpu, &mem) < 2)
			continue;
		if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems)
			continue;
		cpunodemap.map[cpu] = mem;
	}
	fclose(mapfp);
}

/*
 * store_map() - Write cpunodemap[] out to mapfile.
 *
 * Situation:
 *	The cpunodemap in the cpunodemap[] array is
 *	more recent than the one in mapfile.
 * Action:
 *	Write cpunodemap[] out to mapfile.
 */

static void store_map()
{
	char buf[PATH_MAX];
	int fd = -1;
	FILE *mapfp = NULL;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX");
	if ((fd = mkstemp(buf)) < 0)
		goto err;
	if ((mapfp = fdopen(fd, "w")) == NULL)
		goto err;
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		mem = cpunodemap.map[cpu];
		if (mem < (unsigned int)nmems)
			fprintf(mapfp, "%u %u\n", cpu, mem);
	}
	fclose(mapfp);
	set_mtime(buf, cpunodemap.mtime);
	if (rename(buf, mapfile) < 0)
		goto err;
	/* mkstemp() creates mode 0600 - change to world readable */
	(void) chmod(mapfile, 0444);
	return;
err:
	if (mapfp != NULL) {
		fclose(mapfp);
		fd = -1;
	}
	if (fd >= 0)
		close(fd);
	(void) unlink(buf);
}

/*
 * Load and gain thread safe access to the <cpu, node> map.
 *
 * Return 0 on success with flockfile(stdin) held.
 * Each successful get_map() call must be matched with a
 * following put_map() call to release the lock.
 *
 * On error, return -1 with errno set and no lock held.
 */

static int get_map()
{
	time_t file_mtime;

	flockfile(stdin);

	if (cpunodemap.map == NULL) {
		cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int));
		if (cpunodemap.map == NULL)
			goto err;
	}

	/* If no one has a good cpunodemap, rebuild from scratch */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime == 0 && file_mtime == 0)
		rebuild_map();

	/* If either cpunodemap[] or mapfile newer, update other with it */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime < file_mtime)
		load_map();
	else if (cpunodemap.mtime > file_mtime)
		store_map();
	return 0;
err:
	funlockfile(stdin);
	return -1;
}

static void put_map()
{
	funlockfile(stdin);
}

/* Set cpus to those local to Memory Nodes mems */
int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus)
{
	int ncpus = cpuset_cpus_nbits();
	unsigned int cpu;

	if (check() < 0)
		return -1;

	get_map();
	bitmask_clearall(cpus);
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		if (bitmask_isbitset(mems, cpunodemap.map[cpu]))
			bitmask_setbit(cpus, cpu);
	}
	put_map();
	return 0;
}

/* Set mems to those local to CPUs cpus */
int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems)
{
	int ncpus = cpuset_cpus_nbits();
	unsigned int cpu;

	if (check() < 0)
		return -1;

	get_map();
	bitmask_clearall(mems);
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		if (bitmask_isbitset(cpus, cpu))
			bitmask_setbit(mems, cpunodemap.map[cpu]);
	}
	put_map();
	return 0;
}
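
/*
 * For example (illustrative): to find the CPUs attached to Memory
 * Node 0,
 *
 *	struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits());
 *	struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *
 *	bitmask_setbit(mems, 0);
 *	cpuset_localcpus(mems, cpus);
 *
 * after which 'cpus' holds the CPUs local to node 0.
 */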

/*
 * distmap[]
 *
 * Array of distmap_entry_t, of size cpumask_sz by nodemask_sz.
 *
 * Element distmap[cpu][mem] is the distance between CPU cpu
 * and Memory Node mem.  Distances are weighted to roughly
 * approximate the cost of memory references, and scaled so that
 * the distance from a CPU to its local Memory Node is ten (10).
 *
 * The first call to cpuset_cpumemdist() builds this map, from
 * whatever means the kernel provides to obtain these distances.
 *
 * These distances derive from ACPI SLIT table entries, which are
 * eight bits in size.
 *
 * Hold flockfile(stdout) while using distmap for posix thread safety.
 */

typedef unsigned char distmap_entry_t;	/* type of distmap[] entries */

static distmap_entry_t *distmap;	/* maps <cpu, mem> to distance */

#define DISTMAP_MAX UCHAR_MAX		/* maximum value in distmap[] */

#define I(i,j) ((i) * nmems + (j))	/* 2-D array index simulation */
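
/*
 * For example (illustrative): with nmems == 4, distmap[I(2, 3)]
 * addresses row 2, column 3, i.e. element 2 * 4 + 3 == 11 of the
 * flat array.
 */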

/*
 * Parse arch neutral lines from 'distance' files of form:
 *
 *	46 66 10 20
 *
 * The lines contain a space separated list of distances, which is parsed
 * into array dists[] of each node's distance from the specified node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 *	For each cpu c on node:
 *		For each node position n in list of distances:
 *			distmap[c][n] = dists[n]
 */

static int parse_distmap_line(unsigned int node, char *buf)
{
	char *p, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int c, n;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;
	int ret = -1;

	p = buf;
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned int d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t)d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
		c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	ret = 0;
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
	return ret;
}

static int parse_distance_file(unsigned int node, const char *path)
{
	FILE *fp;
	char *buf = NULL;
	int buflen;

	if ((fp = fopen(path, "r")) == NULL)
		goto err;

	buflen = filesize(fp);

	if ((buf = malloc(buflen)) == NULL)
		goto err;

	if (flgets(buf, buflen, fp) == NULL)
		goto err;

	if (parse_distmap_line(node, buf) < 0)
		goto err;

	free(buf);
	fclose(fp);
	return 0;
err:
	free(buf);
	if (fp)
		fclose(fp);
	return -1;
}

static void build_distmap()
{
	static int tried_before = 0;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	DIR *dir = NULL;
	struct dirent *dent;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	if ((dir = opendir(distance_directory)) == NULL)
		goto err;
	while ((dent = readdir(dir)) != NULL) {
		char buf[PATH_MAX];
		unsigned int node;

		if (sscanf(dent->d_name, "node%u", &node) < 1)
			continue;
		pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
								"distance");
		if (parse_distance_file(node, buf) < 0)
			goto err;
	}
	closedir(dir);
	return;
err:
	if (dir)
		closedir(dir);
	free(distmap);
	distmap = NULL;
}

#ifdef ALTERNATE_SN_DISTMAP

/*
 * Parse SN architecture specific line of form:
 *
 *	node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
 *
 * Second field is node number.  The "dist" field is the colon separated list
 * of distances, which is parsed into array dists[] of each node's distance
 * from that node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 *	For each cpu c on that node:
 *		For each node position n in list of distances:
 *			distmap[c][n] = dists[n]
 */

static void parse_distmap_line_sn(char *buf)
{
	char *p, *pend, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned long c, n, node;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;

	if ((p = strchr(buf, ' ')) == NULL)
		goto err;
	if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
		goto err;
	if ((p = strstr(q, " dist ")) == NULL)
		goto err;
	p += strlen(" dist ");
	if ((pend = strchr(p, ' ')) != NULL)
		*pend = '\0';
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned long d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t)d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
			c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
}

static void build_distmap_sn()
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	static int tried_before = 0;
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((fp = fopen(sn_topology, "r")) == NULL)
		goto err;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	buflen = filesize(fp);
	if ((buf = malloc(buflen)) == NULL)
		goto err;

	while (flgets(buf, buflen, fp) != NULL)
		if (strprefix(buf, sn_top_node_prefix))
			parse_distmap_line_sn(buf);

	free(buf);
	fclose(fp);
	return;
err:
	free(buf);
	free(distmap);
	distmap = NULL;
	if (fp)
		fclose(fp);
}

#endif

/* [optional] Hardware distance from CPU to Memory Node */
unsigned int cpuset_cpumemdist(int cpu, int mem)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	distmap_entry_t r = DISTMAP_MAX;

	flockfile(stdout);

	if (check() < 0)
		goto err;

	if (distmap == NULL)
		build_distmap();

#ifdef ALTERNATE_SN_DISTMAP
	if (distmap == NULL)
		build_distmap_sn();
#endif

	if (distmap == NULL)
		goto err;

	if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
		goto err;

	r = distmap[I(cpu, mem)];
	/* fall into ... */
err:
	funlockfile(stdout);
	return r;
}

/* [optional] Return Memory Node closest to cpu */
int cpuset_cpu2node(int cpu)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	bitmask_setbit(cpus, cpu);

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	cpuset_localmems(cpus, mems);
	r = bitmask_first(mems);
	/* fall into ... */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;
}
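
/*
 * For example (illustrative): given the SLIT-style scaling noted
 * above, a CPU's distance to its own node is normally 10:
 *
 *	int c = 0;
 *	unsigned int d = cpuset_cpumemdist(c, cpuset_cpu2node(c));
 *	... d is typically 10 for the local node ...
 */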

static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
{
	if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
		if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
			goto err;
	}

	if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
		if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
			goto err;
	}

	if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
		if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
			goto err;
	}

	if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
		if (store_flag(path, "notify_on_release", cp->notify_on_release) < 0)
			goto err;
	}

	if (cp->memory_migrate_valid &&
	    cp->memory_migrate_dirty &&
	    exists_flag(path, "memory_migrate")) {
		if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
			goto err;
	}

	if (cp->memory_pressure_enabled_valid &&
	    cp->memory_pressure_enabled_dirty &&
	    exists_flag(path, "memory_pressure_enabled")) {
		if (store_flag(path, "memory_pressure_enabled", cp->memory_pressure_enabled) < 0)
			goto err;
	}

	if (cp->memory_spread_page_valid &&
	    cp->memory_spread_page_dirty &&
	    exists_flag(path, "memory_spread_page")) {
		if (store_flag(path, "memory_spread_page", cp->memory_spread_page) < 0)
			goto err;
	}

	if (cp->memory_spread_slab_valid &&
	    cp->memory_spread_slab_dirty &&
	    exists_flag(path, "memory_spread_slab")) {
		if (store_flag(path, "memory_spread_slab", cp->memory_spread_slab) < 0)
			goto err;
	}

	if (cp->sched_load_balance_valid &&
	    cp->sched_load_balance_dirty &&
	    exists_flag(path, "sched_load_balance")) {
		if (store_flag(path, "sched_load_balance", cp->sched_load_balance) < 0)
			goto err;
	}

	if (cp->sched_relax_domain_level_valid &&
	    cp->sched_relax_domain_level_dirty &&
	    exists_flag(path, "sched_relax_domain_level")) {
		if (store_number(path, "sched_relax_domain_level", cp->sched_relax_domain_level) < 0)
			goto err;
	}

	if (cp->cpus_valid && cp->cpus_dirty) {
		if (store_mask(path, "cpus", cp->cpus) < 0)
			goto err;
	}

	if (cp->mems_valid && cp->mems_dirty) {
		if (store_mask(path, "mems", cp->mems) < 0)
			goto err;
	}
	return 0;
err:
	return -1;
}

/*
 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
 *
 * Extract max value of any 'siblings' field in /proc/cpuinfo.
 * Cache the result - only need to extract once in lifetime of task.
 *
 * The siblings field is the number of logical CPUs in a physical
 * processor package.  It is equal to the product of the number of
 * cores in that package, times the number of hyper-threads per core.
 * The bug that cpuset_would_crash_kernel() is detecting arises
 * when a cpu_exclusive cpuset tries to include just some, not all,
 * of the sibling logical CPUs available in a processor package.
 *
 * In the improbable case that a system has mixed values of siblings
 * (some processor packages have more than others, perhaps due to
 * partially enabling Hyper-Threading), we take the worst case value,
 * the largest siblings value.  This might be overkill.  I don't know
 * if this kernel bug considers each processor package's siblings
 * separately or not.  But it sure is easier this way ...
 *
 * This routine takes about 0.7 msecs on a 4 CPU 2.8 GHz Xeon, from
 * open to close, the first time called.
 */

static int get_siblings()
{
	static int siblings;
	char buf[32];		/* big enough for one 'siblings' line */
	FILE *fp;

	if (siblings)
		return siblings;

	if ((fp = fopen("/proc/cpuinfo", "r")) == NULL)
		return 4;	/* wing it - /proc not mounted ? */
	while (flgets(buf, sizeof(buf), fp) != NULL) {
		int s;

		if (sscanf(buf, "siblings : %d", &s) < 1)
			continue;
		if (s > siblings)
			siblings = s;
	}
	fclose(fp);
	if (siblings == 0)
		siblings = 1;	/* old kernel, no siblings, default to 1 */
	return siblings;
}
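
/*
 * A matching /proc/cpuinfo line looks like (value illustrative):
 *
 *	siblings	: 4
 *
 * The whitespace in the sscanf() format above matches any run of
 * whitespace, so the tab before the colon parses fine.
 */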

/*
 * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic
 * scheduler domain code invoked for cpu_exclusive cpusets that causes
 * the kernel to freeze, requiring a hardware reset.
 *
 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive'
 * cpuset is defined where that cpuset's 'cpus' are not on package
 * boundaries then the kernel will freeze, usually as soon as this
 * cpuset is created, requiring a hardware reset.
 *
 * A cpuset's 'cpus' are not on package boundaries if the cpuset
 * includes a proper non-empty subset (some, but not all) of the
 * logical cpus on a processor package.  This requires multiple
 * logical CPUs per package, available with either Hyper-Thread or
 * Multi-Core support.  Without one of these features, there is only
 * one logical CPU per physical package, and it's not possible to
 * have a proper, non-empty subset of a set of cardinality one.
 *
 * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC
 * on i386 and x86_64 arch's.
 *
 * The objective of this routine cpuset_would_crash_kernel() is to
 * determine if a proposed cpuset setting would crash the kernel due
 * to this bug, so that the caller can avoid the crash.
 *
 * Ideally we'd check for exactly these conditions here, but computing
 * the package (identified by the 'physical id' field of /proc/cpuinfo)
 * of each cpu in a cpuset is more effort than it's worth here.
 *
 * Also there is no obvious way to identify exactly whether the kernel
 * one is executing on has this bug, short of trying it, and seeing
 * if the kernel just crashed.
 *
 * So for now, we look for a simpler set of conditions, that meets
 * our immediate need - avoid this crash on SUSE SLES10 systems that
 * are susceptible to it.  We look for the kernel version 2.6.16.*,
 * which is the base kernel of SUSE SLES10, and for i386 or x86_64
 * processors, which had CONFIG_SCHED_MC enabled.
 *
 * If these simpler conditions are met, we further simplify the check,
 * by presuming that the logical CPUs are numbered on processor
 * package boundaries.  If each package has S siblings, we assume
 * that CPUs numbered N through N + S - 1 are on the same package,
 * for any CPU N such that N mod S == 0.
 *
 * Yes, this is a hack, focused on avoiding kernel freezes on
 * susceptible SUSE SLES10 systems.
 */
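
/*
 * Worked example (illustrative): with S == 2 siblings per package,
 * a cpu_exclusive cpuset whose 'cpus' are 0-2 covers package {0,1}
 * completely but package {2,3} only partially, so the check below
 * flags it as one that would crash a susceptible kernel.
 */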

static int cpuset_would_crash_kernel(const struct cpuset *cp)
{
	static int susceptible_system = -1;

	if (!cp->cpu_exclusive)
		goto ok;

	if (susceptible_system == -1) {
		struct utsname u;
		int rel_2_6_16, arch_i386, arch_x86_64;

		if (uname(&u) < 0)
			goto fail;
		rel_2_6_16 = strprefix(u.release, "2.6.16.");
		arch_i386 = streq(u.machine, "i386");
		arch_x86_64 = streq(u.machine, "x86_64");
		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
	}

	if (susceptible_system) {
		int ncpus = cpuset_cpus_nbits();
		int siblings = get_siblings();
		unsigned int cpu;

		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
			int s, num_set = 0;

			for (s = 0; s < siblings; s++) {
				if (bitmask_isbitset(cp->cpus, cpu + s))
					num_set++;
			}

			/* If none or all siblings set, we're still ok */
			if (num_set == 0 || num_set == siblings)
				continue;

			/* Found one that would crash kernel.  Fail.  */
			errno = ENXIO;
			goto fail;
		}
	}
	/* If not susceptible, or if all ok, fall into "ok" ... */
ok:
	return 0;		/* would not crash */
fail:
	return 1;		/* would crash */
}

/* Compare two cpusets and mark the variables that differ as dirty */
static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
{
	if (cp1->cpu_exclusive_valid &&
	    cp1->cpu_exclusive != cp2->cpu_exclusive)
		cp1->cpu_exclusive_dirty = 1;

	if (cp1->mem_exclusive_valid &&
	    cp1->mem_exclusive != cp2->mem_exclusive)
		cp1->mem_exclusive_dirty = 1;

	if (cp1->mem_hardwall_valid &&
	    cp1->mem_hardwall != cp2->mem_hardwall)
		cp1->mem_hardwall_dirty = 1;

	if (cp1->notify_on_release_valid &&
	    cp1->notify_on_release != cp2->notify_on_release)
		cp1->notify_on_release_dirty = 1;

	if (cp1->memory_migrate_valid &&
	    cp1->memory_migrate != cp2->memory_migrate)
		cp1->memory_migrate_dirty = 1;

	if (cp1->memory_pressure_enabled_valid &&
	    cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
		cp1->memory_pressure_enabled_dirty = 1;

	if (cp1->memory_spread_page_valid &&
	    cp1->memory_spread_page != cp2->memory_spread_page)
		cp1->memory_spread_page_dirty = 1;

	if (cp1->memory_spread_slab_valid &&
	    cp1->memory_spread_slab != cp2->memory_spread_slab)
		cp1->memory_spread_slab_dirty = 1;

	if (cp1->sched_load_balance_valid &&
	    cp1->sched_load_balance != cp2->sched_load_balance)
		cp1->sched_load_balance_dirty = 1;

	if (cp1->sched_relax_domain_level_valid &&
	    cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
		cp1->sched_relax_domain_level_dirty = 1;

	if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
		cp1->cpus_dirty = 1;
	if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
		cp1->mems_dirty = 1;
}
2010
2011/* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
2012static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
2013{
2014	char buf[PATH_MAX];
2015	int do_rmdir_on_err = 0;
2016	int do_restore_cp_sav_on_err = 0;
2017	struct cpuset *cp_sav = NULL;
2018	int sav_errno;
2019
2020	if (check() < 0)
2021		goto err;
2022
2023	if (cpuset_would_crash_kernel(cp))
2024		goto err;
2025
2026	fullpath(buf, sizeof(buf), relpath);
2027
2028	if (new) {
2029		if (mkdir(buf, 0755) < 0)
2030			goto err;
2031		/* we made it, so we should remove it on error */
2032		do_rmdir_on_err = 1;
2033	}
2034
2035	if ((cp_sav = cpuset_alloc()) == NULL)
2036		goto err;
2037	if (cpuset_query(cp_sav, relpath) < 0)
2038		goto err;
2039	/* we have old settings to restore on error */
2040	do_restore_cp_sav_on_err = 1;
2041
	/* check which variables need to be restored on error */
2043	mark_dirty_variable(cp_sav, cp);
2044
2045	if (apply_cpuset_settings(buf, cp) < 0)
2046		goto err;
2047
2048	cpuset_free(cp_sav);
2049	return 0;
2050err:
2051	sav_errno = errno;
2052	if (do_restore_cp_sav_on_err)
2053		(void) apply_cpuset_settings(buf, cp_sav);
2054	if (cp_sav)
2055		cpuset_free(cp_sav);
2056	if (do_rmdir_on_err)
2057		(void) rmdir(buf);
2058	errno = sav_errno;
2059	return -1;
2060}
2061
2062/* Create cpuset 'cp' at location 'relpath' */
2063int cpuset_create(const char *relpath, const struct cpuset *cp)
2064{
2065	return cr_or_mod(relpath, cp, 1);
2066}
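
/*
 * Example: create a cpuset "/mygrp" on cpu 1 and mem node 0.  A
 * minimal sketch - "/mygrp" is illustrative and error handling and
 * frees are omitted:
 *
 *	struct cpuset *cp = cpuset_alloc();
 *	struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *	struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits());
 *
 *	bitmask_setbit(cpus, 1);
 *	bitmask_setbit(mems, 0);
 *	cpuset_setcpus(cp, cpus);
 *	cpuset_setmems(cp, mems);
 *	if (cpuset_create("/mygrp", cp) < 0)
 *		perror("cpuset_create");
 */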
2067
/* Delete cpuset at location 'relpath' (if empty) */
2069int cpuset_delete(const char *relpath)
2070{
2071	char buf[PATH_MAX];
2072
2073	if (check() < 0)
2074		goto err;
2075
2076	fullpath(buf, sizeof(buf), relpath);
2077	if (rmdir(buf) < 0)
2078		goto err;
2079
2080	return 0;
2081err:
2082	return -1;
2083}
2084
/* Set cpuset cp to the cpuset at location 'relpath' */
2086int cpuset_query(struct cpuset *cp, const char *relpath)
2087{
2088	char buf[PATH_MAX];
2089
2090	if (check() < 0)
2091		goto err;
2092
2093	fullpath(buf, sizeof(buf), relpath);
2094
2095	if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0)
2096		goto err;
2097	cp->cpu_exclusive_valid = 1;
2098
2099	if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0)
2100		goto err;
2101	cp->mem_exclusive_valid = 1;
2102
2103	if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
2104		goto err;
2105	cp->notify_on_release_valid = 1;
2106
2107	if (exists_flag(buf, "memory_migrate")) {
2108		if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0)
2109			goto err;
2110		cp->memory_migrate_valid = 1;
2111	}
2112
2113	if (exists_flag(buf, "mem_hardwall")) {
2114		if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0)
2115			goto err;
2116		cp->mem_hardwall_valid = 1;
2117	}
2118
2119	if (exists_flag(buf, "memory_pressure_enabled")) {
2120		if (load_flag(buf, &cp->memory_pressure_enabled, "memory_pressure_enabled") < 0)
2121			goto err;
2122		cp->memory_pressure_enabled_valid = 1;
2123	}
2124
2125	if (exists_flag(buf, "memory_spread_page")) {
2126		if (load_flag(buf, &cp->memory_spread_page, "memory_spread_page") < 0)
2127			goto err;
2128		cp->memory_spread_page_valid = 1;
2129	}
2130
2131	if (exists_flag(buf, "memory_spread_slab")) {
2132		if (load_flag(buf, &cp->memory_spread_slab, "memory_spread_slab") < 0)
2133			goto err;
2134		cp->memory_spread_slab_valid = 1;
2135	}
2136
2137	if (exists_flag(buf, "sched_load_balance")) {
2138		if (load_flag(buf, &cp->sched_load_balance, "sched_load_balance") < 0)
2139			goto err;
2140		cp->sched_load_balance_valid = 1;
2141	}
2142
2143	if (exists_flag(buf, "sched_relax_domain_level")) {
2144		if (load_number(buf, &cp->sched_relax_domain_level, "sched_relax_domain_level") < 0)
2145			goto err;
2146		cp->sched_relax_domain_level_valid = 1;
2147	}
2148
2149	if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0)
2150		goto err;
2151	cp->cpus_valid = 1;
2152
2153	if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0)
2154		goto err;
2155	cp->mems_valid = 1;
2156
2157	return 0;
2158err:
2159	return -1;
2160}
2161
2162/* Modify cpuset at location 'relpath' to values of 'cp' */
2163int cpuset_modify(const char *relpath, const struct cpuset *cp)
2164{
2165	return cr_or_mod(relpath, cp, 0);
2166}
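
/*
 * Example: set notify_on_release on an existing cpuset, leaving
 * its other settings untouched (sketch; "/mygrp" is illustrative):
 *
 *	struct cpuset *cp = cpuset_alloc();
 *
 *	if (cpuset_query(cp, "/mygrp") == 0) {
 *		cpuset_set_iopt(cp, "notify_on_release", 1);
 *		cpuset_modify("/mygrp", cp);
 *	}
 *	cpuset_free(cp);
 */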
2167
2168/* Get cpuset path of pid into buf */
2169char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
2170{
2171	int fd;		     /* dual use: cpuset file for pid and self */
2172	int rc;		     /* dual use: snprintf and read return codes */
2173
2174	if (check() < 0)
2175		return NULL;
2176
2177	/* borrow result buf[] to build cpuset file path */
2178	if (pid == 0)
2179		rc = snprintf(buf, size, "/proc/self/cpuset");
2180	else
2181		rc = snprintf(buf, size, "/proc/%d/cpuset", pid);
2182	if (rc >= (int)size) {
2183		errno = E2BIG;
2184		return NULL;
2185	}
2186	if ((fd = open(buf, O_RDONLY)) < 0) {
2187		int e = errno;
2188		if (e == ENOENT)
2189			e = ESRCH;
2190		if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0)
2191			e = ENOSYS;
2192		else
2193			close(fd);
2194		errno = e;
2195		return NULL;
2196	}
2197	rc = read(fd, buf, size);
2198	close(fd);
2199	if (rc < 0)
2200		return NULL;
2201	if (rc >= (int)size) {
2202		errno = E2BIG;
2203		return NULL;
2204	}
2205	buf[rc] = 0;
2206	chomp(buf);
2207	return buf;
2209}
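
/*
 * Example: print the cpuset path of the current task (sketch,
 * error handling omitted; pid 0 means the current task):
 *
 *	char buf[PATH_MAX];
 *
 *	if (cpuset_getcpusetpath(0, buf, sizeof(buf)) != NULL)
 *		printf("current cpuset: %s\n", buf);
 */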
2210
2211/* Get cpuset 'cp' of pid */
2212int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
2213{
2214	char buf[PATH_MAX];
2215
2216	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
2217		return -1;
2218	if (cpuset_query(cp, buf) < 0)
2219		return -1;
2220	return 0;
2221}
2222
2223/* [optional] Return mountpoint of cpuset filesystem */
2224const char *cpuset_mountpoint()
2225{
2226	if (check() < 0) {
2227		switch (errno) {
2228		case ENODEV:
2229			return "[cpuset filesystem not mounted]";
2230		default:
2231			return "[cpuset filesystem not supported]";
2232		}
2233	}
2234	return cpusetmnt;
2235}
2236
2237/* Return true if path is a directory. */
2238static int isdir(const char *path)
2239{
2240	struct stat statbuf;
2241
2242	if (stat(path, &statbuf) < 0)
2243		return 0;
2244	return S_ISDIR(statbuf.st_mode);
2245}
2246
2247/*
 * [optional] cpuset_collides_exclusive() - True if it would collide with an exclusive sibling.
2249 *
2250 * Return true iff the specified cpuset would overlap with any
2251 * sibling cpusets in either cpus or mems, where either this
2252 * cpuset or the sibling is cpu_exclusive or mem_exclusive.
2253 *
2254 * cpuset_create() fails with errno == EINVAL if the requested cpuset
2255 * would overlap with any sibling, where either one is cpu_exclusive or
 * mem_exclusive.  This is a common, and not obvious, error.  The
2257 * following routine checks for this particular case, so that code
2258 * creating cpusets can better identify the situation, perhaps to issue
2259 * a more informative error message.
2260 *
2261 * Can also be used to diagnose cpuset_modify failures.  This
2262 * routine ignores any existing cpuset with the same path as the
2263 * given 'cpusetpath', and only looks for exclusive collisions with
2264 * sibling cpusets of that path.
2265 *
2266 * In case of any error, returns (0) -- does not collide.  Presumably
2267 * any actual attempt to create or modify a cpuset will encounter the
2268 * same error, and report it usefully.
2269 *
2270 * This routine is not particularly efficient; most likely code creating or
2271 * modifying a cpuset will want to try the operation first, and then if that
2272 * fails with errno EINVAL, perhaps call this routine to determine if an
2273 * exclusive cpuset collision caused the error.
2274 */
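
/*
 * Example use, per the above (sketch; "relpath" and "cp" stand for
 * the arguments of a cpuset_create() call that just failed):
 *
 *	if (cpuset_create(relpath, cp) < 0 && errno == EINVAL &&
 *	    cpuset_collides_exclusive(relpath, cp))
 *		fprintf(stderr, "cpus or mems collide with an "
 *				"exclusive sibling\n");
 */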
2275
2276int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
2277{
2278	char parent[PATH_MAX];
2279	char *p;
2280	char *pathcopy = NULL;
2281	char *base;
2282	DIR *dir = NULL;
2283	struct dirent *dent;
2284	struct cpuset *cp2 = NULL;
2285	struct bitmask *cpus1 = NULL, *cpus2 = NULL;
2286	struct bitmask *mems1 = NULL, *mems2 = NULL;
2287	int ret;
2288
2289	if (check() < 0)
2290		goto err;
2291
2292	fullpath(parent, sizeof(parent), cpusetpath);
2293	if (streq(parent, cpusetmnt))
2294		goto err;	/* only one cpuset root - can't collide */
2295	pathcopy = strdup(parent);
2296	p = strrchr(parent, '/');
2297	if (!p)
2298		goto err;	/* huh? - impossible - run and hide */
2299	*p = 0;			/* now parent is dirname of fullpath */
2300
2301	p = strrchr(pathcopy, '/');
2302	base = p + 1;		/* now base is basename of fullpath */
2303	if (!*base)
2304		goto err;	/* this is also impossible - run away */
2305
2306	if ((dir = opendir(parent)) == NULL)
2307		goto err;
2308	if ((cp2 = cpuset_alloc()) == NULL)
2309		goto err;
2310	if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2311		goto err;
2312	if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2313		goto err;
2314	if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2315		goto err;
2316	if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2317		goto err;
2318
2319	while ((dent = readdir(dir)) != NULL) {
2320		char child[PATH_MAX];
2321
2322		if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
2323			continue;
2324		if (streq(dent->d_name, base))
2325			continue;
2326		pathcat2(child, sizeof(child), parent, dent->d_name);
2327		if (!isdir(child))
2328			continue;
2329		if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
2330			goto err;
2331		if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
2332			cpuset_getcpus(cp1, cpus1);
2333			cpuset_getcpus(cp2, cpus2);
2334			if (bitmask_intersects(cpus1, cpus2))
2335				goto collides;
2336		}
2337		if (cp1->mem_exclusive || cp2->mem_exclusive) {
2338			cpuset_getmems(cp1, mems1);
2339			cpuset_getmems(cp2, mems2);
2340			if (bitmask_intersects(mems1, mems2))
2341				goto collides;
2342		}
2343	}
2344err:
2345	/* error, or did not collide */
2346	ret = 0;
2347	goto done;
2348collides:
2349	/* collides */
2350	ret = 1;
2351	/* fall into ... */
2352done:
2353	if (dir)
2354		closedir(dir);
2355	cpuset_free(cp2);
2356	free(pathcopy);
2357	bitmask_free(cpus1);
2358	bitmask_free(cpus2);
2359	bitmask_free(mems1);
2360	bitmask_free(mems2);
2361	return ret;
2362}
2363
2364/*
 * [optional] cpuset_nuke() - Remove cpuset any way possible
2366 *
2367 * Remove a cpuset, including killing tasks in it, and
 * removing any descendant cpusets and killing their tasks.
2369 *
2370 * Tasks can take a long time (minutes on some configurations)
2371 * to exit.  Loop up to 'seconds' seconds, trying to kill them.
2372 *
2373 * How we do it:
2374 *	1) First, kill all the pids, looping until there are
2375 *	   no more pids in this cpuset or below, or until the
2376 *	   'seconds' timeout limit is exceeded.
2377 *	2) Then depth first recursively rmdir the cpuset directories.
2378 *	3) If by this point the original cpuset is gone, we succeeded.
2379 *
2380 * If the timeout is exceeded, and tasks still exist, fail with
2381 * errno == ETIME.
2382 *
2383 * We sleep a variable amount of time.  After the first attempt to
 * kill all the tasks in the cpuset or its descendants, we sleep 1
2385 * second, the next time 2 seconds, increasing 1 second each loop
2386 * up to a max of 10 seconds.  If more loops past 10 are required
2387 * to kill all the tasks, we sleep 10 seconds each subsequent loop.
2388 * In any case, before the last loop, we sleep however many seconds
2389 * remain of the original timeout 'seconds' requested.  The total
 * time of all sleeps will be no more than the requested 'seconds'.
 * For example, a timeout of 30 seconds results in sleeps of 1, 2,
 * 3, 4, 5, 6, 7 and finally 2 seconds, killing any remaining tasks
 * between sleeps.
2391 *
2392 * If the cpuset started out empty of any tasks, or if the passed in
2393 * 'seconds' was zero, then this routine will return quickly, having
2394 * not slept at all.  Otherwise, this routine will at a minimum send
2395 * a SIGKILL to all the tasks in this cpuset subtree, then sleep one
2396 * second, before looking to see if any tasks remain.  If tasks remain
2397 * in the cpuset subtree, and a longer 'seconds' timeout was requested
2398 * (more than one), it will continue to kill remaining tasks and sleep,
2399 * in a loop, for as long as time and tasks remain.
2400 *
2401 * The signal sent for the kill is hardcoded to SIGKILL (9).  If some
2402 * other signal should be sent first, use a separate code loop,
2403 * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to
2404 * scan the task pids in a cpuset.  If SIGKILL should -not- be sent,
2405 * this cpuset_nuke() routine can still be called to recursively
2406 * remove a cpuset subtree, by specifying a timeout of zero 'seconds'.
2407 *
2408 * On success, returns 0 with errno == 0.
2409 *
2410 * On failure, returns -1, with errno possibly one of:
2411 *  EACCES - search permission denied on intervening directory
2412 *  ETIME - timed out - tasks remain after 'seconds' timeout
2413 *  EMFILE - too many open files
2414 *  ENODEV - /dev/cpuset not mounted
2415 *  ENOENT - component of cpuset path doesn't exist
2416 *  ENOMEM - out of memory
2417 *  ENOSYS - kernel doesn't support cpusets
2418 *  ENOTDIR - component of cpuset path is not a directory
2419 *  EPERM - lacked permission to kill a task
2420 *  EPERM - lacked permission to read cpusets or files therein
2421 */
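
/*
 * Example: kill everything in "/mygrp" and below and remove the
 * cpusets, allowing tasks up to 10 seconds to exit (illustrative;
 * "/mygrp" is a hypothetical path):
 *
 *	if (cpuset_nuke("/mygrp", 10) < 0 && errno == ETIME)
 *		fprintf(stderr, "tasks remain in /mygrp\n");
 */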
2422
2423void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree);
2424
2425int cpuset_nuke(const char *relpath, unsigned int seconds)
2426{
2427	unsigned int secs_left = seconds;	/* total sleep seconds left */
2428	unsigned int secs_loop = 1;		/* how much sleep next loop */
2429	unsigned int secs_slept;		/* seconds slept in sleep() */
2430	struct cpuset_pidlist *pl = NULL;	/* pids in cpuset subtree */
2431	struct cpuset_fts_tree *cs_tree;
2432	const struct cpuset_fts_entry *cs_entry;
2433	int ret, sav_errno = 0;
2434
2435	if (check() < 0)
2436		return -1;
2437
2438	if (seconds == 0)
2439		goto rmdir_cpusets;
2440
2441	while (1) {
2442		int plen, j;
2443
2444		if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) {
2445			/* missing cpuset is as good as if already nuked */
2446			if (errno == ENOENT) {
2447				ret = 0;
2448				goto no_more_cpuset;
2449			}
2450
2451			/* other problems reading cpuset are bad news */
2452			sav_errno = errno;
2453			goto failed;
2454		}
2455
2456		if ((plen = cpuset_pidlist_length(pl)) == 0)
2457			goto rmdir_cpusets;
2458
2459		for (j = 0; j < plen; j++) {
2460			pid_t pid;
2461
2462			if ((pid = cpuset_get_pidlist(pl, j)) > 1) {
2463				if (kill(pid, SIGKILL) < 0 && errno != ESRCH) {
2464					sav_errno = errno;
2465					goto failed;
2466				}
2467			}
2468		}
2469
2470		if (secs_left == 0)
2471			goto took_too_long;
2472
2473		cpuset_freepidlist(pl);
2474		pl = NULL;
2475
2476		secs_slept = secs_loop - sleep(secs_loop);
2477
2478		/* Ensure forward progress */
2479		if (secs_slept == 0)
2480			secs_slept = 1;
2481
2482		/* Ensure sane sleep() return (unnecessary?) */
2483		if (secs_slept > secs_loop)
2484			secs_slept = secs_loop;
2485
2486		secs_left -= secs_slept;
2487
2488		if (secs_loop < 10)
2489			secs_loop++;
2490
2491		secs_loop = min(secs_left, secs_loop);
2492	}
2493
2494took_too_long:
2495	sav_errno = ETIME;
2496	/* fall into ... */
2497failed:
2498	cpuset_freepidlist(pl);
2499	errno = sav_errno;
2500	return -1;
2501
2502rmdir_cpusets:
2503	/* Let's try removing cpuset(s) now. */
2504	cpuset_freepidlist(pl);
2505
	if ((cs_tree = cpuset_fts_open(relpath)) == NULL) {
		if (errno != ENOENT)
			return -1;
		ret = 0;	/* missing cpuset - as good as nuked */
		goto no_more_cpuset;
	}
	ret = 0;
	cpuset_fts_reverse(cs_tree);	/* rmdir's must be done bottom up */
2510	while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2511		char buf[PATH_MAX];
2512
2513		fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry));
2514		if (rmdir(buf) < 0 && errno != ENOENT) {
2515			sav_errno = errno;
2516			ret = -1;
2517		}
2518	}
2519	cpuset_fts_close(cs_tree);
2520	/* fall into ... */
2521no_more_cpuset:
2522	if (ret == 0)
2523		errno = 0;
2524	else
2525		errno = sav_errno;
2526	return ret;
2527}
2528
2529/*
2530 * When recursively reading all the tasks files from a subtree,
2531 * chain together the read results, one pidblock per tasks file,
 * containing the raw unprocessed ASCII as read(2) returned it.
 * After we gather up this raw data, we go back and count how
 * many pids there are in total, allocate an array of pid_t
 * of that size, and transform the raw ASCII data into this
2536 * array of pid_t's.
2537 */
2538
2539struct pidblock {
2540	char *buf;
2541	int buflen;
2542	struct pidblock *next;
2543};
2544
2545/*
2546 * Chain the raw contents of a file onto the pbhead list.
2547 *
2548 * We malloc "+ 1" extra byte for a nul-terminator, so that
2549 * the strtoul() loop in pid_transform() won't scan past
2550 * the end of pb->buf[] and accidentally find more pids.
2551 */
2552static void add_pidblock(const char *file, struct pidblock **ppbhead)
2553{
2554	FILE *fp = NULL;
2555	struct pidblock *pb = NULL;
2556	int fsz;
2557
2558	if ((fp = fopen(file, "r")) == NULL)
2559		goto err;
2560	fsz = filesize(fp);
2561	if (fsz == 0)
2562		goto err;
2563	if ((pb = calloc(1, sizeof(*pb))) == NULL)
2564		goto err;
2565	pb->buflen = fsz;
2566	if ((pb->buf = malloc(pb->buflen + 1)) == NULL)
2567		goto err;
2568	if (fread(pb->buf, 1, pb->buflen, fp) > 0) {
2569		pb->buf[pb->buflen] = '\0';
2570		pb->next = *ppbhead;
2571		*ppbhead = pb;
2572	}
2573	fclose(fp);
2574	return;
2575err:
2576	if (fp)
2577		fclose(fp);
2578	if (pb)
2579		free(pb);
2580}
2581
2582static void read_task_file(const char *relpath, struct pidblock **ppbhead)
2583{
2584	char buf[PATH_MAX];
2585
2586	fullpath2(buf, sizeof(buf), relpath, "tasks");
2587	add_pidblock(buf, ppbhead);
2588}
2589
2590struct cpuset_pidlist {
2591	pid_t *pids;
2592	int npids;
2593};
2594
2595/* Count how many pids in buf (one per line - just count newlines) */
2596static int pidcount(const char *buf, int buflen)
2597{
2598	int n = 0;
2599	const char *cp;
2600
2601	for (cp = buf; cp < buf + buflen; cp++) {
2602		if (*cp == '\n')
2603			n++;
2604	}
2605	return n;
2606}
2607
2608/* Transform one-per-line ascii pids in pb to pid_t entries in pl */
2609static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n)
2610{
2611	char *a, *b;
2612
2613	for (a = pb->buf; a < pb->buf + pb->buflen; a = b) {
2614		pid_t p = strtoul(a, &b, 10);
2615		if (a == b)
2616			break;
2617		pl->pids[n++] = p;
2618	}
2619	return n;
2620}
2621
2622static void free_pidblocks(struct pidblock *pbhead)
2623{
2624	struct pidblock *pb, *nextpb;
2625
2626	for (pb = pbhead; pb; pb = nextpb) {
2627		nextpb = pb->next;
2628		free(pb->buf);
2629		free(pb);
2630	}
2631}
2632
2633/* numeric comparison routine for qsort */
2634static int numericsort(const void *m1, const void *m2)
2635{
2636	pid_t p1 = * (pid_t *) m1;
2637	pid_t p2 = * (pid_t *) m2;
2638
2639	return p1 - p2;
2640}
2641
/* Return list of pids in cpuset 'relpath' */
2643struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath,
2644							int recursiveflag)
2645{
2646	struct pidblock *pb = NULL;
2647	struct cpuset_pidlist *pl = NULL;
2648	struct pidblock *pbhead = NULL;
2649	int n;
2650
2651	if (check() < 0)
2652		goto err;
2653
2654	if (recursiveflag) {
2655		struct cpuset_fts_tree *cs_tree;
2656		const struct cpuset_fts_entry *cs_entry;
2657
2658		if ((cs_tree = cpuset_fts_open(relpath)) == NULL)
2659			goto err;
2660		while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2661			if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET)
2662				continue;
2663			read_task_file(cpuset_fts_get_path(cs_entry), &pbhead);
2664		}
2665		cpuset_fts_close(cs_tree);
2666	} else {
2667		read_task_file(relpath, &pbhead);
2668	}
2669
2670	if ((pl = calloc(1, sizeof(*pl))) == NULL)
2671		goto err;
2672	pl->npids = 0;
2673	for (pb = pbhead; pb; pb = pb->next)
2674		pl->npids += pidcount(pb->buf, pb->buflen);
2675	if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL)
2676		goto err;
2677	n = 0;
2678	for (pb = pbhead; pb; pb = pb->next)
2679		n = pid_transform(pb, pl, n);
2680	free_pidblocks(pbhead);
2681	qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort);
2682	return pl;
2683err:
2684	cpuset_freepidlist(pl);
2685	free_pidblocks(pbhead);
2686	return NULL;
2687}
2688
2689/* Return number of elements in pidlist */
2690int cpuset_pidlist_length(const struct cpuset_pidlist *pl)
2691{
2692	if (pl)
2693		return pl->npids;
2694	else
2695		return 0;
2696}
2697
2698/* Return i'th element of pidlist */
2699pid_t cpuset_get_pidlist(const struct cpuset_pidlist *pl, int i)
2700{
2701	if (pl && i >= 0 && i < pl->npids)
2702		return pl->pids[i];
2703	else
2704		return (pid_t)-1;
2705}
2706
2707/* Free pidlist */
2708void cpuset_freepidlist(struct cpuset_pidlist *pl)
2709{
2710	if (pl && pl->pids)
2711		free(pl->pids);
2712	if (pl)
2713		free(pl);
2714}
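
/*
 * Example: list the pids in cpuset "/mygrp", non-recursively
 * (sketch; "/mygrp" is illustrative, error handling omitted):
 *
 *	struct cpuset_pidlist *pl = cpuset_init_pidlist("/mygrp", 0);
 *	int i;
 *
 *	for (i = 0; i < cpuset_pidlist_length(pl); i++)
 *		printf("%d\n", (int)cpuset_get_pidlist(pl, i));
 *	cpuset_freepidlist(pl);
 */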
2715
2716static int __cpuset_move(pid_t pid, const char *path)
2717{
2718	char buf[SMALL_BUFSZ];
2719
	snprintf(buf, sizeof(buf), "%d", (int)pid);	/* pid_t is signed */
2721	return write_string_file(path, buf);
2722}
2723
2724/* Move task (pid == 0 for current) to a cpuset */
2725int cpuset_move(pid_t pid, const char *relpath)
2726{
2727	char buf[PATH_MAX];
2728
2729	if (check() < 0)
2730		return -1;
2731
2732	if (pid == 0)
2733		pid = getpid();
2734
2735	fullpath2(buf, sizeof(buf), relpath, "tasks");
2736	return __cpuset_move(pid, buf);
2737}
2738
2739/* Move all tasks in pidlist to a cpuset */
2740int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath)
2741{
2742	int i;
2743	char buf[PATH_MAX];
2744	int ret;
2745
2746	if (check() < 0)
2747		return -1;
2748
2749	fullpath2(buf, sizeof(buf), relpath, "tasks");
2750
2751	ret = 0;
2752	for (i = 0; i < pl->npids; i++)
2753		if (__cpuset_move(pl->pids[i], buf) < 0)
2754			ret = -1;
2755	return ret;
2756}
2757
2758/*
2759 * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a
2760 *                                      cpuset to another cpuset
2761 *
2762 * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may
2763 * race with tasks being added to or forking into fromrelpath. Loop
2764 * repeatedly, reading the tasks file of cpuset fromrelpath and writing
 * any task pids found there to the tasks file of cpuset torelpath,
2766 * up to ten attempts, or until the tasks file of cpuset fromrelpath
2767 * is empty, or until fromrelpath is no longer present.
2768 *
2769 * Returns 0 with errno == 0 if able to empty the tasks file of cpuset
2770 * fromrelpath. Of course it is still possible that some independent
2771 * task could add another task to cpuset fromrelpath at the same time
2772 * that such a successful result is being returned, so there can be
2773 * no guarantee that a successful return means that fromrelpath is
2774 * still empty of tasks.
2775 *
2776 * We are careful to allow for the possibility that the cpuset
2777 * fromrelpath might disappear out from under us, perhaps because it
2778 * has notify_on_release set and gets automatically removed as soon
2779 * as we detach its last task from it.  Consider a missing fromrelpath
2780 * to be a successful move.
2781 *
2782 * If called with fromrelpath and torelpath pathnames that evaluate to
2783 * the same cpuset, then treat that as if cpuset_reattach() was called,
2784 * rebinding each task in this cpuset one time, and return success or
2785 * failure depending on the return of that cpuset_reattach() call.
2786 *
2787 * On failure, returns -1, with errno possibly one of:
2788 *  EACCES - search permission denied on intervening directory
2789 *  ENOTEMPTY - tasks remain after multiple attempts to move them
2790 *  EMFILE - too many open files
2791 *  ENODEV - /dev/cpuset not mounted
2792 *  ENOENT - component of cpuset path doesn't exist
2793 *  ENOMEM - out of memory
2794 *  ENOSYS - kernel doesn't support cpusets
2795 *  ENOTDIR - component of cpuset path is not a directory
2796 *  EPERM - lacked permission to kill a task
2797 *  EPERM - lacked permission to read cpusets or files therein
2798 *
2799 * This is an [optional] function. Use cpuset_function to invoke it.
2800 */
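
/*
 * Example: drain "/mygrp" into the root cpuset, then remove it
 * (sketch; "/mygrp" is a hypothetical path):
 *
 *	if (cpuset_move_cpuset_tasks("/mygrp", "/") == 0)
 *		cpuset_delete("/mygrp");
 */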
2801
2802#define NUMBER_MOVE_TASK_ATTEMPTS 10
2803
2804int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
2805{
2806	char fromfullpath[PATH_MAX];
2807	char tofullpath[PATH_MAX];
2808	int i;
2809	struct cpuset_pidlist *pl = NULL;
2810	int sav_errno;
2811
2812	fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
2813	fullpath(tofullpath, sizeof(tofullpath), torelpath);
2814
2815	if (samefile(fromfullpath, tofullpath))
2816		return cpuset_reattach(fromrelpath);
2817
2818	for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
2819		int plen, j;
2820
2821		if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
2822			/* missing cpuset is as good as if all moved */
2823			if (errno == ENOENT)
2824				goto no_more_cpuset;
2825
2826			/* other problems reading cpuset are bad news */
2827			sav_errno = errno;
2828			goto failed;
2829		}
2830
2831		if ((plen = cpuset_pidlist_length(pl)) == 0)
2832			goto no_more_pids;
2833
2834		for (j = 0; j < plen; j++) {
2835			pid_t pid;
2836
2837			pid = cpuset_get_pidlist(pl, j);
2838			if (cpuset_move(pid, torelpath) < 0) {
2839				/* missing task is as good as if moved */
2840				if (errno == ESRCH)
2841					continue;
2842
2843				/* other per-task errors are bad news */
2844				sav_errno = errno;
2845				goto failed;
2846			}
2847		}
2848
2849		cpuset_freepidlist(pl);
2850		pl = NULL;
2851	}
2852
2853	sav_errno = ENOTEMPTY;
2854	/* fall into ... */
2855failed:
2856	cpuset_freepidlist(pl);
2857	errno = sav_errno;
2858	return -1;
2859
2860no_more_pids:
2861no_more_cpuset:
2862	/* Success - all tasks (or entire cpuset ;) gone. */
2863	cpuset_freepidlist(pl);
2864	errno = 0;
2865	return 0;
2866}
2867
2868/* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
2869int cpuset_migrate(pid_t pid, const char *relpath)
2870{
2871	char buf[PATH_MAX];
2872	char buf2[PATH_MAX];
2873	char memory_migrate_flag;
2874	int r;
2875
2876	if (check() < 0)
2877		return -1;
2878
2879	if (pid == 0)
2880		pid = getpid();
2881
2882	fullpath(buf2, sizeof(buf2), relpath);
2883
2884	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2885		return -1;
2886	if (store_flag(buf2, "memory_migrate", 1) < 0)
2887		return -1;
2888
2889	fullpath2(buf, sizeof(buf), relpath, "tasks");
2890
2891	r = __cpuset_move(pid, buf);
2892
2893	store_flag(buf2, "memory_migrate", memory_migrate_flag);
2894	return r;
2895}
2896
2897/* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
2898int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
2899{
2900	int i;
2901	char buf[PATH_MAX];
2902	char buf2[PATH_MAX];
2903	char memory_migrate_flag;
2904	int ret;
2905
2906	if (check() < 0)
2907		return -1;
2908
2909	fullpath(buf2, sizeof(buf2), relpath);
2910
2911	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2912		return -1;
2913	if (store_flag(buf2, "memory_migrate", 1) < 0)
2914		return -1;
2915
	fullpath2(buf, sizeof(buf), relpath, "tasks");
2917
2918	ret = 0;
2919	for (i = 0; i < pl->npids; i++)
2920		if (__cpuset_move(pl->pids[i], buf) < 0)
2921			ret = -1;
2922
2923	if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
2924		ret = -1;
2925	return ret;
2926}
2927
/* Rebind cpus_allowed of each task in cpuset 'relpath' */
2929int cpuset_reattach(const char *relpath)
2930{
2931	struct cpuset_pidlist *pl;
2932	int rc;
2933
2934	if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL)
2935		return -1;
2936	rc = cpuset_move_all(pl, relpath);
2937	cpuset_freepidlist(pl);
2938	return rc;
2939}
2940
2941/* Map cpuset relative cpu number to system wide cpu number */
2942int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
2943{
2944	struct cpuset *cp_tofree = NULL;
2945	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2946	int pos = -1;
2947
2948	if (!cp1)
2949		goto err;
2950	pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
2951	/* fall into ... */
2952err:
2953	cpuset_free(cp_tofree);
2954	return pos;
2955}
2956
2957/* Map system wide cpu number to cpuset relative cpu number */
2958int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
2959{
2960	struct cpuset *cp_tofree = NULL;
2961	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2962	int pos = -1;
2963
2964	if (!cp1)
2965		goto err;
2966	pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
2967	/* fall into ... */
2968err:
2969	cpuset_free(cp_tofree);
2970	return pos;
2971}
2972
2973/* Map cpuset relative mem number to system wide mem number */
2974int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
2975{
2976	struct cpuset *cp_tofree = NULL;
2977	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2978	int pos = -1;
2979
2980	if (!cp1)
2981		goto err;
2982	pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
2983	/* fall into ... */
2984err:
2985	cpuset_free(cp_tofree);
2986	return pos;
2987}
2988
2989/* Map system wide mem number to cpuset relative mem number */
2990int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
2991{
2992	struct cpuset *cp_tofree = NULL;
2993	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2994	int pos = -1;
2995
2996	if (!cp1)
2997		goto err;
2998	pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
2999	/* fall into ... */
3000err:
3001	cpuset_free(cp_tofree);
3002	return pos;
3003}
3004
3005/* Map pid's cpuset relative cpu number to system wide cpu number */
3006int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
3007{
3008	struct cpuset *cp;
3009	int rc = -1;
3010
3011	if ((cp = cpuset_alloc()) == NULL)
3012		goto done;
3013	if (cpuset_cpusetofpid(cp, pid) < 0)
3014		goto done;
3015	rc = cpuset_c_rel_to_sys_cpu(cp, cpu);
3016done:
3017	cpuset_free(cp);
3018	return rc;
3019}
3020
3021/* Map system wide cpu number to pid's cpuset relative cpu number */
3022int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
3023{
3024	struct cpuset *cp;
3025	int rc = -1;
3026
3027	if ((cp = cpuset_alloc()) == NULL)
3028		goto done;
3029	if (cpuset_cpusetofpid(cp, pid) < 0)
3030		goto done;
3031	rc = cpuset_c_sys_to_rel_cpu(cp, cpu);
3032done:
3033	cpuset_free(cp);
3034	return rc;
3035}
3036
3037/* Map pid's cpuset relative mem number to system wide mem number */
3038int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
3039{
3040	struct cpuset *cp;
3041	int rc = -1;
3042
3043	if ((cp = cpuset_alloc()) == NULL)
3044		goto done;
3045	if (cpuset_cpusetofpid(cp, pid) < 0)
3046		goto done;
3047	rc = cpuset_c_rel_to_sys_mem(cp, mem);
3048done:
3049	cpuset_free(cp);
3050	return rc;
3051}
3052
3053/* Map system wide mem number to pid's cpuset relative mem number */
3054int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
3055{
3056	struct cpuset *cp;
3057	int rc = -1;
3058
3059	if ((cp = cpuset_alloc()) == NULL)
3060		goto done;
3061	if (cpuset_cpusetofpid(cp, pid) < 0)
3062		goto done;
3063	rc = cpuset_c_sys_to_rel_mem(cp, mem);
3064done:
3065	cpuset_free(cp);
3066	return rc;
3067}
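
/*
 * Example: for a task whose cpuset holds system cpus 4-7,
 * cpuset_p_rel_to_sys_cpu(pid, 0) returns 4, and
 * cpuset_p_sys_to_rel_cpu(pid, 6) returns 2.
 */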
3068
3069/*
3070 * Override glibc's calls for get/set affinity - they have
3071 * something using cpu_set_t that will die when NR_CPUS > 1024.
3072 * Go directly to the 'real' system calls.  Also override calls
3073 * for get_mempolicy and set_mempolicy.  None of these
3074 * calls are yet (July 2004) guaranteed to be in all glibc versions
3075 * that we care about.
3076 */
3077
3078static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
3079{
3080	return syscall(__NR_sched_setaffinity, pid, len, mask);
3081}
3082
3083#if HAVE_DECL_MEMPOLICY
3084static int get_mempolicy(int *policy, unsigned long *nmask,
3085			unsigned long maxnode, void *addr, int flags)
3086{
3087	return syscall(__NR_get_mempolicy, policy, nmask, maxnode, addr, flags);
3088}
3089
3090static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
3091{
3092	return syscall(__NR_set_mempolicy, mode, nmask, maxnode);
3093}
3094#endif
3095
3096struct cpuset_placement {
3097	struct bitmask *cpus;
3098	struct bitmask *mems;
3099	char *path;
3100};
3101
/* Allocate and fill in a placement struct - captures current placement */
3103struct cpuset_placement *cpuset_get_placement(pid_t pid)
3104{
3105	struct cpuset_placement *plc;
3106	struct cpuset *cp = NULL;
3107	char buf[PATH_MAX];
3108	int nbits;
3109
3110	if ((plc = calloc(1, sizeof(*plc))) == NULL)
3111		goto err;
3112
3113	nbits = cpuset_cpus_nbits();
3114	if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
3115		goto err;
3116
3117	nbits = cpuset_mems_nbits();
3118	if ((plc->mems = bitmask_alloc(nbits)) == NULL)
3119		goto err;
3120
3121	if ((cp = cpuset_alloc()) == NULL)
3122		goto err;
3123	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
3124		goto err;
3125	if (cpuset_query(cp, buf) < 0)
3126		goto err;
3127
3128	bitmask_copy(plc->cpus, cp->cpus);
3129	bitmask_copy(plc->mems, cp->mems);
3130	plc->path = strdup(buf);
3131
3132	cpuset_free(cp);
3133	return plc;
3134err:
3135	cpuset_free(cp);
3136	cpuset_free_placement(plc);
3137	return NULL;
3138}
3139
3140/* Compare two placement structs - use to detect changes in placement */
3141int cpuset_equal_placement(const struct cpuset_placement *plc1,
3142					const struct cpuset_placement *plc2)
3143{
3144	return	bitmask_equal(plc1->cpus, plc2->cpus) &&
3145		bitmask_equal(plc1->mems, plc2->mems) &&
3146		streq(plc1->path, plc2->path);
3147}
3148
3149/* Free a placement struct */
3150void cpuset_free_placement(struct cpuset_placement *plc)
3151{
3152	if (!plc)
3153		return;
3154	bitmask_free(plc->cpus);
3155	bitmask_free(plc->mems);
3156	free(plc->path);
3157	free(plc);
3158}
3159
3160/*
3161 * A cpuset_fts_open() call constructs a linked list of entries
3162 * called a "cpuset_fts_tree", with one entry per cpuset below
3163 * the specified path.  The cpuset_fts_read() routine returns the
3164 * next entry on this list.  The various cpuset_fts_get_*() calls
3165 * return attributes of the specified entry.  The cpuset_fts_close()
3166 * call frees the linked list and all associated data.  All cpuset
3167 * entries and attributes for the cpuset_fts_tree returned from a
3168 * given cpuset_fts_open() call remain allocated and unchanged until
3169 * that cpuset_fts_tree is closed by a cpuset_fts_close() call.  Any
 * subsequent changes to the cpuset filesystem will go unnoticed
 * (they will not affect open cpuset_fts_trees).
3172 */
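
/*
 * Example: print the paths of all cpusets below the root (sketch;
 * error handling omitted):
 *
 *	struct cpuset_fts_tree *cs_tree = cpuset_fts_open("/");
 *	const struct cpuset_fts_entry *cs_entry;
 *
 *	while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL)
 *		printf("%s\n", cpuset_fts_get_path(cs_entry));
 *	cpuset_fts_close(cs_tree);
 */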
3173
3174struct cpuset_fts_entry;
3175void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);
3176
3177struct cpuset_fts_tree {
3178	struct cpuset_fts_entry *head;	/* head of linked entry list */
3179	struct cpuset_fts_entry *next;	/* cpuset_fts_read() offset */
3180};
3181
3182struct cpuset_fts_entry {
3183	struct cpuset_fts_entry *next;	/* linked entry list chain */
3184	struct cpuset *cpuset;
3185	struct stat *stat;
3186	char *path;
3187	int info;
3188	int err;
3189};
3190
3191/* Open a handle on a cpuset hierarchy.  All the real work is done here. */
3192struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
3193{
3194	FTS* fts = NULL;
3195	FTSENT *ftsent;
3196	char *path_argv[2];
3197	char buf[PATH_MAX];
3198	struct cpuset_fts_tree *cs_tree = NULL;
3199	struct cpuset_fts_entry *ep;	  /* the latest new list entry */
3200	struct cpuset_fts_entry **pnlep;  /* ptr to next list entry ptr */
3201	char *relpath;
3202	int fts_flags;
3203
3204	fullpath(buf, sizeof(buf), cpusetpath);
3205	path_argv[0] = buf;
3206	path_argv[1] = NULL;
3207
3208	fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
3209	fts = fts_open(path_argv, fts_flags, NULL);
3210	if (fts == NULL)
3211		goto err;
3212
3213	cs_tree = malloc(sizeof(*cs_tree));
3214	if (cs_tree == NULL)
3215		goto err;
3216	pnlep = &cs_tree->head;
3217	*pnlep = NULL;
3218
3219	while ((ftsent = fts_read(fts)) != NULL) {
3220		if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
3221			continue;
3222
3223		/* ftsent is a directory (perhaps unreadable) ==> cpuset */
3224		ep = calloc(1, sizeof(*ep));
3225		if (ep == NULL)
3226			goto err;
3227		*pnlep = ep;
3228		pnlep = &ep->next;
3229
3230		/* Set entry's path, and if DNR, error */
3231		relpath = ftsent->fts_path + strlen(cpusetmnt);
3232		if (strlen(relpath) == 0)
3233			relpath = "/";
3234		ep->path = strdup(relpath);
3235		if (ep->path == NULL)
3236			goto err;
3237		if (ftsent->fts_info == FTS_DNR) {
3238			ep->info = CPUSET_FTS_ERR_DNR;
3239			ep->err = ftsent->fts_errno;
3240			continue;
3241		}
3242
3243		/* ftsent is a -readable- cpuset: set entry's stat, etc */
3244		ep->stat = calloc(1, sizeof(struct stat));
3245		if (ep->stat == NULL)
3246			goto err;
3247		if (stat(ftsent->fts_path, ep->stat) < 0) {
3248			ep->info = CPUSET_FTS_ERR_STAT;
3249			ep->err = ftsent->fts_errno;
3250			continue;
3251		}
3252
3253		ep->cpuset = calloc(1, sizeof(struct cpuset));
3254		if (ep->cpuset == NULL)
3255			goto err;
3256		if (cpuset_query(ep->cpuset, relpath) < 0) {
3257			ep->info = CPUSET_FTS_ERR_CPUSET;
3258			ep->err = errno;
3259			continue;
3260		}
3261		ep->info = CPUSET_FTS_CPUSET;
3262	}
3263
3264	(void) fts_close(fts);
3265	cpuset_fts_rewind(cs_tree);
3266	return cs_tree;
3267
3268err:
3269	if (cs_tree)
3270		cpuset_fts_close(cs_tree);
3271	if (fts)
3272		(void) fts_close(fts);
3273	return NULL;
3274}
3275
3276/* Return pointer to next cpuset entry in hierarchy */
3277const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
3278{
3279	const struct cpuset_fts_entry *cs_entry = cs_tree->next;
3280	if (cs_tree->next != NULL)		/* seek to next entry */
3281		cs_tree->next = cs_tree->next->next;
3282	return cs_entry;
3283}
3284
3285/* Reverse list of cpusets, in place.  Simulates pre-order/post-order flip. */
3286void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
3287{
3288	struct cpuset_fts_entry *cs1, *cs2, *cs3;
3289
3290	/*
3291	 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
3292	 * is redirected from cs3 to cs1.
3293	 */
3294
3295	cs1 = cs2 = NULL;
3296	cs3 = cs_tree->head;
3297	while (cs3) {
3298		cs1 = cs2;
3299		cs2 = cs3;
3300		cs3 = cs3->next;
3301		cs2->next = cs1;
3302	}
3303	cs_tree->head = cs2;
3304	cpuset_fts_rewind(cs_tree);
3305}
3306
3307/* Rewind cpuset list to beginning */
3308void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
3309{
3310	cs_tree->next = cs_tree->head;
3311}
3312
3313/* Return pointer to nul-terminated cpuset path of entry in hierarchy */
3314const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
3315{
3316	return cs_entry->path;
3317}
3318
3319/* Return pointer to stat(2) structure of a cpuset entry's directory */
3320const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
3321{
3322	return cs_entry->stat;
3323}
3324
3325/* Return pointer to cpuset structure of a cpuset entry */
3326const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry *cs_entry)
3327{
3328	return cs_entry->cpuset;
3329}
3330
3331/* Return value of errno (0 if no error) on attempted cpuset operations */
3332int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry)
3333{
3334	return cs_entry->err;
3335}
3336
3337/* Return operation identity causing error */
3338int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry)
3339{
3340	return cs_entry->info;
3341}
3342
/* Close a cpuset hierarchy handle (frees all associated memory) */
3344void cpuset_fts_close(struct cpuset_fts_tree *cs_tree)
3345{
3346	struct cpuset_fts_entry *cs_entry = cs_tree->head;
3347
3348	while (cs_entry) {
3349		struct cpuset_fts_entry *ep = cs_entry;
3350
3351		cs_entry = cs_entry->next;
3352		free(ep->path);
3353		free(ep->stat);
3354		cpuset_free(ep->cpuset);
3355		free(ep);
3356	}
3357	free(cs_tree);
3358}
3359
3360/* Bind current task to cpu (uses sched_setaffinity(2)) */
3361int cpuset_cpubind(int cpu)
3362{
3363	struct bitmask *bmp;
3364	int r;
3365
3366	if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
3367		return -1;
3368	bitmask_setbit(bmp, cpu);
3369	r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp));
3370	bitmask_free(bmp);
3371	return r;
3372}
3373
3374/*
3375 * int cpuset_latestcpu(pid_t pid)
3376 *
3377 * Return most recent CPU on which task pid executed.  If pid == 0,
3378 * examine current task.
3379 *
3380 * The last used CPU is visible for a given pid as field #39 (starting
 * with #1) in the file /proc/pid/stat.  Currently this file has 41
 * fields, which makes it the 3rd from the last field.
3383 *
3384 * Unfortunately field #2 is a command name and might have embedded
3385 * whitespace.  So we can't just count white space separated fields.
3386 * Fortunately, this command name is surrounded by parentheses, as
3387 * for example "(sh)", and that closing parenthesis is the last ')'
3388 * character in the line.  No remaining fields can have embedded
3389 * whitespace or parentheses.  So instead of looking for the 39th
3390 * white space separated field, we can look for the 37th white space
3391 * separated field past the last ')' character on the line.
3392 */
3393
3394/* Return most recent CPU on which task pid executed */
3395int cpuset_latestcpu(pid_t pid)
3396{
3397	char buf[PATH_MAX];
	char *bp;
	int n;
3399	int fd = -1;
3400	int cpu = -1;
3401
3402	if (pid == 0)
3403		snprintf(buf, sizeof(buf), "/proc/self/stat");
3404	else
3405		snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
3406
3407	if ((fd = open(buf, O_RDONLY)) < 0)
3408		goto err;
	if ((n = read(fd, buf, sizeof(buf) - 1)) < 1)
		goto err;
	close(fd);
	buf[n] = '\0';	/* nul-terminate for the strrchr() below */
3412
3413	bp = strrchr(buf, ')');
3414	if (bp)
3415	     sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u "
3416		    "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u "
3417		    "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u "
3418		    "%*u %*u %*u %*u %*u %*u %*u %*u %u", /* 37th field past ')' */
3419		    &cpu);
3420	if (cpu < 0)
3421		errno = EINVAL;
3422	return cpu;
3423err:
3424	if (fd >= 0)
3425		close(fd);
3426	return -1;
3427}
3428
3429/* Bind current task to memory (uses set_mempolicy(2)) */
3430int cpuset_membind(int mem)
3431{
3432	struct bitmask *bmp;
3433	int r;
3434
3435	if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL)
3436		return -1;
3437	bitmask_setbit(bmp, mem);
3438#if HAVE_DECL_MPOL_BIND
3439	r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp),
3440		bitmask_nbits(bmp) + 1);
3441#else
3442	r = -1;
3443	errno = ENOSYS;
3444#endif
3445	bitmask_free(bmp);
3446	return r;
3447}
3448
3449/* [optional] Return Memory Node holding page at specified addr */
3450int cpuset_addr2node(void *addr)
3451{
3452	int node = -1;
3453
3454#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
3455	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE|MPOL_F_ADDR)) {
3456		/* I realize this seems redundant, but I _want_ to make sure
3457		 * that this value is -1. */
3458		node = -1;
3459	}
3460#endif
3461	return node;
3462}
3463
3464/*
3465 * Transform cpuset into Text Format Representation in buffer 'buf',
3466 * of length 'buflen', nul-terminated if space allows.  Return number
3467 * of characters that would have been written, if enough space had
3468 * been available, in the same way that snprintf() does.
3469 */
3470
3471/* Export cpuset settings to a regular file */
3472int cpuset_export(const struct cpuset *cp, char *buf, int buflen)
3473{
3474	char *tmp = NULL;
3475	int n = 0;
3476
3477	if (cp->cpu_exclusive)
3478		n += snprintf(buf + n, max(buflen - n, 0), "cpu_exclusive\n");
3479
3480	if (cp->mem_exclusive)
3481		n += snprintf(buf + n, max(buflen - n, 0), "mem_exclusive\n");
3482
3483	if (cp->notify_on_release)
3484		n += snprintf(buf + n, max(buflen - n, 0),
3485							"notify_on_release\n");
3486
3487	if (cp->memory_pressure_enabled)
3488		n += snprintf(buf + n, max(buflen - n, 0),
3489							"memory_pressure_enabled\n");
3490
3491	if (cp->memory_migrate)
3492		n += snprintf(buf + n, max(buflen - n, 0),
3493							"memory_migrate\n");
3494
3495	if (cp->memory_spread_page)
3496		n += snprintf(buf + n, max(buflen - n, 0),
3497							"memory_spread_page\n");
3498
3499	if (cp->memory_spread_slab)
3500		n += snprintf(buf + n, max(buflen - n, 0),
3501							"memory_spread_slab\n");
3502
3503	if ((tmp = sprint_mask_buf(cp->cpus)) == NULL)
3504		return -1;
3505	n += snprintf(buf + n, max(buflen - n, 0), "cpus %s\n", tmp);
3506	free(tmp);
3507	tmp = NULL;
3508
3509	if ((tmp = sprint_mask_buf(cp->mems)) == NULL)
3510		return -1;
3511	n += snprintf(buf + n, max(buflen - n, 0), "mems %s\n", tmp);
3512	free(tmp);
3513	tmp = NULL;
3514
3515	return n;
3516}
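
/*
 * Example: the Text Format Representation written by cpuset_export()
 * for a cpu_exclusive cpuset on cpus 0-3 and mem node 0 produces
 * lines such as (flags appear only if set):
 *
 *	cpu_exclusive
 *	cpus 0-3
 *	mems 0
 *
 * cpuset_import() below parses this same format, also accepting
 * '#' comments and the singular tokens "cpu" and "mem".
 */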
3517
3518static int import_list(UNUSED const char *tok, const char *arg,
3519				struct bitmask *bmp, char *emsg, int elen)
3520{
3521	if (bitmask_parselist(arg, bmp) < 0) {
3522		if (emsg)
3523			snprintf(emsg, elen, "Invalid list format: %s", arg);
3524		return -1;
3525	}
3526	return 0;
3527}
3528
3529static void stolower(char *s)
3530{
3531	while (*s) {
3532		unsigned char c = *s;
3533		*s = tolower(c);
3534		s++;
3535	}
3536}
3537
3538/* Import cpuset settings from a regular file */
3539int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
3540							char *emsg, int elen)
3541{
3542	char *linebuf = NULL;
3543	int linebuflen;
3544	int linenum = 0;
3545	int offset = 0;
3546
3547	linebuflen = strlen(buf) + 1;
3548	if ((linebuf = malloc(linebuflen)) == NULL) {
3549		if (emsg)
3550			snprintf(emsg, elen, "Insufficient memory");
3551		goto err;
3552	}
3553
3554	while (slgets(linebuf, linebuflen, buf, &offset)) {
3555		char *tok, *arg;
3556		char *ptr; 		/* for strtok_r */
3557
3558		linenum++;
3559		if ((tok = strchr(linebuf, '#')) != NULL)
3560			*tok = 0;
3561		if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
3562			continue;
3563		stolower(tok);
3564
3565		arg = strtok_r(0, " \t", &ptr);
3566
3567		if (streq(tok, "cpu_exclusive")) {
3568			cp->cpu_exclusive = 1;
3569			goto eol;
3570		}
3571		if (streq(tok, "mem_exclusive")) {
3572			cp->mem_exclusive = 1;
3573			goto eol;
3574		}
3575		if (streq(tok, "notify_on_release")) {
3576			cp->notify_on_release = 1;
3577			goto eol;
3578		}
3579		if (streq(tok, "memory_pressure_enabled")) {
3580			cp->memory_pressure_enabled = 1;
3581			goto eol;
3582		}
3583		if (streq(tok, "memory_migrate")) {
3584			cp->memory_migrate = 1;
3585			goto eol;
3586		}
3587		if (streq(tok, "memory_spread_page")) {
3588			cp->memory_spread_page = 1;
3589			goto eol;
3590		}
3591		if (streq(tok, "memory_spread_slab")) {
3592			cp->memory_spread_slab = 1;
3593			goto eol;
3594		}
3595		if (streq(tok, "cpu") || streq(tok, "cpus")) {
3596			if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
3597				goto err;
3598			goto eol;
3599		}
3600		if (streq(tok, "mem") || streq(tok, "mems")) {
3601			if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
3602				goto err;
3603			goto eol;
3604		}
3605		if (emsg)
3606			snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
3607		goto err;
3608eol:
3609		if ((tok = strtok_r(0, " \t", &ptr)) != NULL) {
3610			if (emsg)
3611				snprintf(emsg, elen, "Surplus token: '%s'",
3612							tok);
3613			goto err;
3614		}
3615		continue;
3616	}
3617
3618	free(linebuf);
3619
3620	if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
3621		cpuset_localcpus(cp->mems, cp->cpus);
3622	else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
3623		cpuset_localmems(cp->cpus, cp->mems);
3624
3625	/*
3626	 * All cpuset attributes are determined in an import.
3627	 * Those that aren't explicitly specified are presumed
3628	 * to be unchanged (zero, if it's a freshly allocated
3629	 * struct cpuset.)
3630	 */
3631
3632	cp->cpus_valid = 1;
3633	cp->mems_valid = 1;
3634	cp->cpu_exclusive_valid = 1;
3635	cp->mem_exclusive_valid = 1;
3636	cp->notify_on_release_valid = 1;
3637	cp->memory_migrate_valid = 1;
3638	cp->memory_pressure_enabled_valid = 1;
3639	cp->memory_spread_page_valid = 1;
3640	cp->memory_spread_slab_valid = 1;
3641
3642	return 0;
3643err:
3644	if (elinenum)
3645		*elinenum = linenum;
3646	if (linebuf)
3647		free(linebuf);
3648	return -1;
3649}
3650
3651/* Pin current task CPU (and memory) */
3652int cpuset_pin(int relcpu)
3653{
3654	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3655	int cpu, r;
3656
3657	if (check() < 0)
3658		return -1;
3659
3660	do {
3661		cpuset_free_placement(plc1);
3662		plc1 = cpuset_get_placement(0);
3663
3664		r = 0;
3665		if (cpuset_unpin() < 0)
3666			r = -1;
3667		cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
3668		if (cpuset_cpubind(cpu) < 0)
3669			r = -1;
3670
3671		cpuset_free_placement(plc2);
3672		plc2 = cpuset_get_placement(0);
3673	} while (!cpuset_equal_placement(plc1, plc2));
3674
3675	cpuset_free_placement(plc1);
3676	cpuset_free_placement(plc2);
3677	return r;
}
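
/*
 * Example: pin the current task to the second cpu of its current
 * cpuset (sketch):
 *
 *	if (cpuset_pin(1) < 0)
 *		perror("cpuset_pin");
 */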
3679
/* Return number of CPUs in current task's cpuset */
3681int cpuset_size()
3682{
3683	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3684	int r;
3685
3686	if (check() < 0)
3687		return -1;
3688
3689	do {
3690		cpuset_free_placement(plc1);
3691		plc1 = cpuset_get_placement(0);
3692
3693		r = cpuset_cpus_weight(0);
3694
3695		cpuset_free_placement(plc2);
3696		plc2 = cpuset_get_placement(0);
3697	} while (!cpuset_equal_placement(plc1, plc2));
3698
3699	cpuset_free_placement(plc1);
3700	cpuset_free_placement(plc2);
3701	return r;
3702}
3703
3704/* Return relative CPU number, within current cpuset, last executed on */
3705int cpuset_where()
3706{
3707	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3708	int r;
3709
3710	if (check() < 0)
3711		return -1;
3712
3713	do {
3714		cpuset_free_placement(plc1);
3715		plc1 = cpuset_get_placement(0);
3716
3717		r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));
3718
3719		cpuset_free_placement(plc2);
3720		plc2 = cpuset_get_placement(0);
3721	} while (!cpuset_equal_placement(plc1, plc2));
3722
3723	cpuset_free_placement(plc1);
3724	cpuset_free_placement(plc2);
3725	return r;
3726}
3727
3728/* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */
3729int cpuset_unpin()
3730{
3731	struct bitmask *cpus = NULL, *mems = NULL;
3732	int r = -1;
3733
3734	if (check() < 0)
3735		goto err;
3736
3737	/*
3738	 * Don't need cpuset_*_placement() guard against concurrent
3739	 * cpuset migration, because none of the following depends
	 * on the task's cpuset placement.
3741	 */
3742
3743	if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
3744		goto err;
3745	bitmask_setall(cpus);
3746	if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
3747		goto err;
3748
3749	if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
3750		goto err;
3751#if HAVE_DECL_MPOL_DEFAULT
3752	if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
3753						bitmask_nbits(mems) + 1) < 0)
3754		goto err;
3755	r = 0;
3756#endif
3757	/* fall into ... */
3758err:
3759	bitmask_free(cpus);
3760	bitmask_free(mems);
3761	return r;
3763}
3764
3765struct cpuset_function_list {
3766	const char *fname;
3767	void *func;
3768} flist[] = {
3769	{ "cpuset_version", cpuset_version },
3770	{ "cpuset_alloc", cpuset_alloc },
3771	{ "cpuset_free", cpuset_free },
3772	{ "cpuset_cpus_nbits", cpuset_cpus_nbits },
3773	{ "cpuset_mems_nbits", cpuset_mems_nbits },
3774	{ "cpuset_setcpus", cpuset_setcpus },
3775	{ "cpuset_setmems", cpuset_setmems },
3776	{ "cpuset_set_iopt", cpuset_set_iopt },
3777	{ "cpuset_set_sopt", cpuset_set_sopt },
3778	{ "cpuset_getcpus", cpuset_getcpus },
3779	{ "cpuset_getmems", cpuset_getmems },
3780	{ "cpuset_cpus_weight", cpuset_cpus_weight },
3781	{ "cpuset_mems_weight", cpuset_mems_weight },
3782	{ "cpuset_get_iopt", cpuset_get_iopt },
3783	{ "cpuset_get_sopt", cpuset_get_sopt },
3784	{ "cpuset_localcpus", cpuset_localcpus },
3785	{ "cpuset_localmems", cpuset_localmems },
3786	{ "cpuset_cpumemdist", cpuset_cpumemdist },
3787	{ "cpuset_cpu2node", cpuset_cpu2node },
3788	{ "cpuset_addr2node", cpuset_addr2node },
3789	{ "cpuset_create", cpuset_create },
3790	{ "cpuset_delete", cpuset_delete },
3791	{ "cpuset_query", cpuset_query },
3792	{ "cpuset_modify", cpuset_modify },
3793	{ "cpuset_getcpusetpath", cpuset_getcpusetpath },
3794	{ "cpuset_cpusetofpid", cpuset_cpusetofpid },
3795	{ "cpuset_mountpoint", cpuset_mountpoint },
3796	{ "cpuset_collides_exclusive", cpuset_collides_exclusive },
3797	{ "cpuset_nuke", cpuset_nuke },
3798	{ "cpuset_init_pidlist", cpuset_init_pidlist },
3799	{ "cpuset_pidlist_length", cpuset_pidlist_length },
3800	{ "cpuset_get_pidlist", cpuset_get_pidlist },
3801	{ "cpuset_freepidlist", cpuset_freepidlist },
3802	{ "cpuset_move", cpuset_move },
3803	{ "cpuset_move_all", cpuset_move_all },
3804	{ "cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks },
3805	{ "cpuset_migrate", cpuset_migrate },
3806	{ "cpuset_migrate_all", cpuset_migrate_all },
3807	{ "cpuset_reattach", cpuset_reattach },
3808	{ "cpuset_open_memory_pressure", cpuset_open_memory_pressure },
3809	{ "cpuset_read_memory_pressure", cpuset_read_memory_pressure },
3810	{ "cpuset_close_memory_pressure", cpuset_close_memory_pressure },
3811	{ "cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu },
3812	{ "cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu },
3813	{ "cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem },
3814	{ "cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem },
3815	{ "cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu },
3816	{ "cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu },
3817	{ "cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem },
3818	{ "cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem },
3819	{ "cpuset_get_placement", cpuset_get_placement },
3820	{ "cpuset_equal_placement", cpuset_equal_placement },
3821	{ "cpuset_free_placement", cpuset_free_placement },
3822	{ "cpuset_fts_open", cpuset_fts_open },
3823	{ "cpuset_fts_read", cpuset_fts_read },
3824	{ "cpuset_fts_reverse", cpuset_fts_reverse },
3825	{ "cpuset_fts_rewind", cpuset_fts_rewind },
3826	{ "cpuset_fts_get_path", cpuset_fts_get_path },
3827	{ "cpuset_fts_get_stat", cpuset_fts_get_stat },
3828	{ "cpuset_fts_get_cpuset", cpuset_fts_get_cpuset },
3829	{ "cpuset_fts_get_errno", cpuset_fts_get_errno },
3830	{ "cpuset_fts_get_info", cpuset_fts_get_info },
3831	{ "cpuset_fts_close", cpuset_fts_close },
3832	{ "cpuset_cpubind", cpuset_cpubind },
3833	{ "cpuset_latestcpu", cpuset_latestcpu },
3834	{ "cpuset_membind", cpuset_membind },
3835	{ "cpuset_export", cpuset_export },
3836	{ "cpuset_import", cpuset_import },
3837	{ "cpuset_function", cpuset_function },
3838	{ "cpuset_pin", cpuset_pin },
3839	{ "cpuset_size", cpuset_size },
3840	{ "cpuset_where", cpuset_where },
3841	{ "cpuset_unpin", cpuset_unpin },
3842};
3843
3844/* Return pointer to a libcpuset.so function, or NULL */
void *cpuset_function(const char *function_name)
3846{
3847	unsigned int i;
3848
3849	for (i = 0; i < sizeof(flist)/sizeof(flist[0]); i++)
3850		if (streq(function_name, flist[i].fname))
3851			return flist[i].func;
3852	return NULL;
3853}
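
/*
 * Example: bind an [optional] function at runtime, as the comments
 * above recommend (sketch; "/mygrp" is illustrative):
 *
 *	int (*nuke)(const char *, unsigned int);
 *
 *	nuke = (int (*)(const char *, unsigned int))
 *				cpuset_function("cpuset_nuke");
 *	if (nuke)
 *		(*nuke)("/mygrp", 10);
 */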
3854
3855/* Fortran interface to basic cpuset routines */
3856int cpuset_pin_(int *ptr_relcpu) {return cpuset_pin(*ptr_relcpu);}
3857int cpuset_size_(void) { return cpuset_size(); }
3858int cpuset_where_(void) { return cpuset_where(); }
3859int cpuset_unpin_(void) { return cpuset_unpin(); }
3860
3861#endif /* HAVE_LINUX_MEMPOLICY_H */