libcpuset.c revision 1e6f5a673655551de5734ff31ef48cd63b604e6d
1/* 2 * cpuset user library implementation. 3 * 4 * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved. 5 * 6 * Paul Jackson <pj@sgi.com> 7 */ 8 9/* 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU Lesser General Public License as published by 12 * the Free Software Foundation; either version 2.1 of the License, or 13 * (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public License 21 * along with this program; if not, write to the Free Software 22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 */ 24 25#define _XOPEN_SOURCE 500 /* need to see pread() */ 26#define _BSD_SOURCE 1 /* need to see syscall() */ 27#include <unistd.h> 28 29#include <ctype.h> 30#include <dirent.h> 31#include <errno.h> 32#include <fcntl.h> 33#include <fts.h> 34#include <limits.h> 35#include <signal.h> 36#include <stdint.h> 37#include <stdio.h> 38#include <stdlib.h> 39#include <string.h> 40#include <sys/stat.h> 41#include <sys/syscall.h> 42#include <sys/types.h> 43#include <time.h> 44#include <utime.h> 45#include <sys/utsname.h> /* for cpuset_would_crash_kernel() */ 46 47#include "bitmask.h" 48#include "cpuset.h" 49#include "common.h" 50#include "test.h" 51#include "linux_syscall_numbers.h" 52#include "config.h" 53#if HAVE_LINUX_MEMPOLICY_H 54#include <linux/mempolicy.h> 55 56/* Bump version, and update Change History, when libcpuset API changes */ 57#define CPUSET_VERSION 3 58 59/* 60 * For a history of what changed in each version, see the "Change 61 * History" section, at the end of the libcpuset master document. 
62 */ 63 64int cpuset_version(void) 65{ 66 return CPUSET_VERSION; 67} 68 69struct cpuset { 70 struct bitmask *cpus; 71 struct bitmask *mems; 72 char cpu_exclusive; 73 char mem_exclusive; 74 char mem_hardwall; 75 char notify_on_release; 76 char memory_migrate; 77 char memory_pressure_enabled; 78 char memory_spread_page; 79 char memory_spread_slab; 80 char sched_load_balance; 81 int sched_relax_domain_level; 82 83 /* 84 * Each field 'x' above gets an 'x_valid' field below. 85 * The apply_cpuset_settings() will only set those fields whose 86 * corresponding *_valid flags are set. The cpuset_alloc() 87 * routine clears these flags as part of the clear in calloc(), 88 * and the various cpuset_set*() routines set these flags when 89 * setting the corresponding value. 90 * 91 * The purpose of these valid fields is to ensure that when 92 * we create a new cpuset, we don't accidentally overwrite 93 * some non-zero kernel default, such as an inherited 94 * memory_spread_* flag, just because the user application 95 * code didn't override the default zero settings resulting 96 * from the calloc() call in cpuset_alloc(). 97 * 98 * The choice of 'char' for the type of the flags above, 99 * but a bitfield for the flags below, is somewhat capricious. 
100 */ 101 unsigned cpus_valid:1; 102 unsigned mems_valid:1; 103 unsigned cpu_exclusive_valid:1; 104 unsigned mem_exclusive_valid:1; 105 unsigned mem_hardwall_valid:1; 106 unsigned notify_on_release_valid:1; 107 unsigned memory_migrate_valid:1; 108 unsigned memory_pressure_enabled_valid:1; 109 unsigned memory_spread_page_valid:1; 110 unsigned memory_spread_slab_valid:1; 111 unsigned sched_load_balance_valid:1; 112 unsigned sched_relax_domain_level_valid:1; 113 114 /* 115 * if the relative variable was modified, use following flags 116 * to put a mark 117 */ 118 unsigned cpus_dirty:1; 119 unsigned mems_dirty:1; 120 unsigned cpu_exclusive_dirty:1; 121 unsigned mem_exclusive_dirty:1; 122 unsigned mem_hardwall_dirty:1; 123 unsigned notify_on_release_dirty:1; 124 unsigned memory_migrate_dirty:1; 125 unsigned memory_pressure_enabled_dirty:1; 126 unsigned memory_spread_page_dirty:1; 127 unsigned memory_spread_slab_dirty:1; 128 unsigned sched_load_balance_dirty:1; 129 unsigned sched_relax_domain_level_dirty:1; 130}; 131 132/* Presumed cpuset file system mount point */ 133static const char *cpusetmnt = "/dev/cpuset"; 134 135/* Stashed copy of cpunodemap[], mapping each cpu to its node. */ 136static const char *mapfile = "/var/run/cpunodemap"; 137 138/* The primary source for the cpunodemap[] is available below here. */ 139static const char *sysdevices = "/sys/devices/system"; 140 141#define max(a,b) ((a) > (b) ? (a) : (b)) 142#define min(a,b) ((a) < (b) ? (a) : (b)) 143 144/* small buffer size - for reading boolean flags or map file (1 or 2 ints) */ 145#define SMALL_BUFSZ 16 146 147/* 148 * The 'mask_size_file' is used to ferrit out the kernel cpumask_t 149 * and nodemask_t sizes. The lines in this file that begin with the 150 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask 151 * and nodemask string, respectively. 
The lengths of these strings 152 * reflect the kernel's internal cpumask_t and nodemask_t sizes, 153 * which sizes are needed to correctly call the sched_setaffinity 154 * and set_mempolicy system calls, and to size user level 155 * bitmasks to match the kernels. 156 */ 157 158static const char *mask_size_file = "/proc/self/status"; 159static const char *cpumask_prefix = "Cpus_allowed:\t"; 160static const char *nodemask_prefix = "Mems_allowed:\t"; 161 162/* 163 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits. 164 * 165 * The first time we need these, we parse the Cpus_allowed and 166 * Mems_allowed lines from mask_size_file ("/proc/self/status"). 167 */ 168 169static int cpumask_sz; 170static int nodemask_sz; 171 172/* 173 * These defaults only kick in if we fail to size the kernel 174 * cpumask and nodemask by reading the Cpus_allowed and 175 * Mems_allowed fields from the /proc/self/status file. 176 */ 177 178#define DEFCPUBITS (512) 179#define DEFNODEBITS (DEFCPUBITS/2) 180 181/* 182 * Arch-neutral API for obtaining NUMA distances between CPUs 183 * and Memory Nodes, via the files: 184 * /sys/devices/system/node/nodeN/distance 185 * which have lines such as: 186 * 46 66 10 20 187 * which say that for cpu on node N (from the path above), the 188 * distance to nodes 0, 1, 2, and 3 are 44, 66, 10, and 20, 189 * respectively. 190 */ 191 192static const char *distance_directory = "/sys/devices/system/node"; 193 194/* 195 * Someday, we should disable, then later discard, the SN code 196 * marked ALTERNATE_SN_DISTMAP. 
/*
 * Someday, we should disable, then later discard, the SN code
 * marked ALTERNATE_SN_DISTMAP.
 */

#define ALTERNATE_SN_DISTMAP 1
#ifdef ALTERNATE_SN_DISTMAP

/*
 * Alternative SN (SGI ia64) architecture specific API for obtaining
 * NUMA distances between CPUs and Memory Nodes is via the file
 * /proc/sgi_sn/sn_topology, which has lines such as:
 *
 *   node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
 *
 * which says that for each CPU on node 2, the distance to nodes
 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
 *
 * This file has other lines as well, which start with other
 * keywords than "node".  Ignore these other lines.
 */

static const char *sn_topology = "/proc/sgi_sn/sn_topology";
static const char *sn_top_node_prefix = "node ";

#endif

/*
 * Check that cpusets supported, /dev/cpuset mounted.
 * If ok, return 0.
 * If not, return -1 and set errno:
 *	ENOSYS - kernel doesn't support cpusets
 *	ENODEV - /dev/cpuset not mounted
 */

static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

static int check(void)
{
	/* Probe kernel support and the mount point once; cache verdict. */
	if (check_state == check_notdone) {
		struct stat statbuf;

		if (stat("/proc/self/cpuset", &statbuf) < 0)
			check_state = check_enosys;
		else if (stat("/dev/cpuset/tasks", &statbuf) < 0)
			check_state = check_enodev;
		else
			check_state = check_ok;
	}

	if (check_state == check_enosys) {
		errno = ENOSYS;
		return -1;
	}
	if (check_state == check_enodev) {
		errno = ENODEV;
		return -1;
	}
	return 0;
}

/* Strip trailing newline and carriage-return characters from s, in place. */
static void chomp(char *s)
{
	size_t len = strlen(s);

	while (len > 0 && (s[len - 1] == '\n' || s[len - 1] == '\r'))
		s[--len] = '\0';
}
/*
 * Determine number of bytes in a seekable open file, without
 * assuming that stat(2) on that file has a useful size.
 * Has the side effect of leaving the file rewound to the beginning.
 */
static int filesize(FILE *fp)
{
	int sz;

	rewind(fp);
	for (sz = 0; fgetc(fp) != EOF; sz++)
		;
	rewind(fp);
	return sz;
}

/* Are strings s1 and s2 equal? */
static int streq(const char *s1, const char *s2)
{
	return !strcmp(s1, s2);
}

/* Is string 'pre' a prefix of string 's'? */
static int strprefix(const char *s, const char *pre)
{
	return !strncmp(s, pre, strlen(pre));
}

/*
 * char *flgets(char *buf, int buflen, FILE *fp)
 *
 * Obtain one line from input file fp.  Copy up to first
 * buflen-1 chars of line into buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to buffer buf
 * on success, or NULL if nothing more to read or failure.
 */

static char *flgets(char *buf, int buflen, FILE * fp)
{
	int c = -1;
	char *bp = buf;

	while (--buflen > 0) {
		c = getc(fp);
		if (c < 0)
			break;
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	/* EOF before any character was copied: nothing more to read. */
	if (c < 0 && bp == buf)
		return NULL;

	/* Buffer filled mid-line: discard the remainder of the line. */
	if (c > 0)
		while ((c = getc(fp)) >= 0 && c != '\n')
			;

newline:
	*bp = '\0';
	return buf;
}
/*
 * sgetc(const char *inputbuf, int *offsetptr)
 *
 * Return next char from nul-terminated input buffer inputbuf,
 * starting at offset *offsetptr.  Increment *offsetptr.
 * If next char would be nul ('\0'), return EOF and don't
 * increment *offsetptr.
 */

static int sgetc(const char *inputbuf, int *offsetptr)
{
	char c = inputbuf[*offsetptr];

	if (c == '\0')
		return EOF;
	(*offsetptr)++;
	return c;
}

/*
 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
 *
 * Obtain next line from nul-terminated input buffer 'inputbuf',
 * starting at offset *offsetptr.  Copy up to first buflen-1
 * chars of line into output buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to output buffer
 * buf on success, or NULL if nothing more to read.
 */

static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
	int c = -1;
	char *bp = buf;

	while (--buflen > 0) {
		c = sgetc(inputbuf, offsetptr);
		if (c < 0)
			break;
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	/* End of input before any character was copied. */
	if (c < 0 && bp == buf)
		return NULL;

	/* Buffer filled mid-line: discard the remainder of the line. */
	if (c > 0)
		while ((c = sgetc(inputbuf, offsetptr)) >= 0 && c != '\n')
			;

newline:
	*bp = '\0';
	return buf;
}

/*
 * time_t get_mtime(char *path)
 *
 * Return modtime of file at location path, else return 0.
 */

static time_t get_mtime(const char *path)
{
	struct stat statbuf;

	return stat(path, &statbuf) == 0 ? statbuf.st_mtime : 0;
}

/*
 * int set_mtime(const char *path, time_t mtime)
 *
 * Set modtime of file 'path' to 'mtime'.  Return 0 on success,
 * or -1 on error, setting errno.
 */

static int set_mtime(const char *path, time_t mtime)
{
	struct utimbuf times = { .actime = mtime, .modtime = mtime };

	return utime(path, &times);
}
/*
 * True if two pathnames resolve to same file.
 * False if either path can not be stat'd,
 * or if the two paths resolve to a different file.
 */

static int samefile(const char *path1, const char *path2)
{
	struct stat sb1, sb2;

	if (stat(path1, &sb1) != 0 || stat(path2, &sb2) != 0)
		return 0;
	return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
}

/* In place path compression.  Remove extra dots and slashes. */
static char *pathcomp(char *p)
{
	char *src = p;		/* read cursor */
	char *dst = p;		/* write cursor (dst <= src always) */

	if (!p || !*p)
		return p;
	/* Preserve exactly one leading slash of an absolute path. */
	if (*src == '/')
		*dst++ = *src++;
	for (;;) {
		/* Skip runs of slashes between components. */
		if (*src == '/')
			do {
				src++;
			} while (*src == '/');
		if (!*src) {
			/* Nothing was emitted: path collapses to ".". */
			if (dst == p)
				*dst++ = '.';
			*dst = '\0';
			return p;
		} else if (src[0] == '.' && (src[1] == '/' || !src[1])) {
			/* Drop a lone "." component. */
			src++;
		} else {
			/* Emit a separator, then copy the component. */
			if (dst != p && dst[-1] != '/')
				*dst++ = '/';
			while (*src && *src != '/')
				*dst++ = *src++;
		}
	}
}

/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Return buf, of length buflen, with name1/name2 stored in it.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	(void)snprintf(buf, buflen, "%s/%s", name1, name2);
	return pathcomp(buf);
}

/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Return buf, of length buflen, with name1/name2/name3 stored in it.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	(void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	return pathcomp(buf);
}
520 */ 521 522static int fullpath(char *buf, int buflen, const char *name) 523{ 524 int len; 525 526 /* easy case */ 527 if (*name == '/') { 528 pathcat2(buf, buflen, cpusetmnt, name); 529 pathcomp(buf); 530 return 0; 531 } 532 533 /* hard case */ 534 snprintf(buf, buflen, "%s/", cpusetmnt); 535 len = strlen(buf); 536 if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL) 537 return -1; 538 if (strlen(buf) >= buflen - 1 - strlen(name)) { 539 errno = E2BIG; 540 return -1; 541 } 542 strcat(buf, "/"); 543 strcat(buf, name); 544 pathcomp(buf); 545 return 0; 546} 547 548/* 549 * fullpath2(buf, buflen, name1, name2) 550 * 551 * Like fullpath(), only concatenate two pathname components on end. 552 */ 553 554static int fullpath2(char *buf, int buflen, const char *name1, 555 const char *name2) 556{ 557 if (fullpath(buf, buflen, name1) < 0) 558 return -1; 559 if (strlen(buf) >= buflen - 1 - strlen(name2)) { 560 errno = E2BIG; 561 return -1; 562 } 563 strcat(buf, "/"); 564 strcat(buf, name2); 565 pathcomp(buf); 566 return 0; 567} 568 569/* 570 * Convert the string length of an ascii hex mask to the number 571 * of bits represented by that mask. 572 * 573 * The cpumask and nodemask values in /proc/self/status are in an 574 * ascii format that uses 9 characters for each 32 bits of mask. 575 */ 576static int s2nbits(const char *s) 577{ 578 return strlen(s) * 32 / 9; 579} 580 581static void update_mask_sizes() 582{ 583 FILE *fp = NULL; 584 char *buf = NULL; 585 int fsize; 586 587 if ((fp = fopen(mask_size_file, "r")) == NULL) 588 goto done; 589 fsize = filesize(fp); 590 if ((buf = malloc(fsize)) == NULL) 591 goto done; 592 593 /* 594 * Beware: mask sizing arithmetic is fussy. 595 * The trailing newline left by fgets() is required. 
596 */ 597 while (fgets(buf, fsize, fp)) { 598 if (strprefix(buf, cpumask_prefix)) 599 cpumask_sz = s2nbits(buf + strlen(cpumask_prefix)); 600 if (strprefix(buf, nodemask_prefix)) 601 nodemask_sz = s2nbits(buf + strlen(nodemask_prefix)); 602 } 603done: 604 if (buf != NULL) 605 free(buf); 606 if (fp != NULL) 607 fclose(fp); 608 if (cpumask_sz == 0) 609 cpumask_sz = DEFCPUBITS; 610 if (nodemask_sz == 0) 611 nodemask_sz = DEFNODEBITS; 612} 613 614/* Allocate a new struct cpuset */ 615struct cpuset *cpuset_alloc() 616{ 617 struct cpuset *cp = NULL; 618 int nbits; 619 620 if ((cp = calloc(1, sizeof(struct cpuset))) == NULL) 621 goto err; 622 623 nbits = cpuset_cpus_nbits(); 624 if ((cp->cpus = bitmask_alloc(nbits)) == NULL) 625 goto err; 626 627 nbits = cpuset_mems_nbits(); 628 if ((cp->mems = bitmask_alloc(nbits)) == NULL) 629 goto err; 630 631 return cp; 632err: 633 if (cp && cp->cpus) 634 bitmask_free(cp->cpus); 635 if (cp && cp->mems) 636 bitmask_free(cp->mems); 637 if (cp) 638 free(cp); 639 return NULL; 640} 641 642/* Free struct cpuset *cp */ 643void cpuset_free(struct cpuset *cp) 644{ 645 if (!cp) 646 return; 647 if (cp->cpus) 648 bitmask_free(cp->cpus); 649 if (cp->mems) 650 bitmask_free(cp->mems); 651 free(cp); 652} 653 654/* Number of bits in a CPU bitmask on current system */ 655int cpuset_cpus_nbits() 656{ 657 if (cpumask_sz == 0) 658 update_mask_sizes(); 659 return cpumask_sz; 660} 661 662/* Number of bits in a Memory bitmask on current system */ 663int cpuset_mems_nbits() 664{ 665 if (nodemask_sz == 0) 666 update_mask_sizes(); 667 return nodemask_sz; 668} 669 670/* Set CPUs in cpuset cp to bitmask cpus */ 671int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus) 672{ 673 if (cp->cpus) 674 bitmask_free(cp->cpus); 675 cp->cpus = bitmask_alloc(bitmask_nbits(cpus)); 676 if (cp->cpus == NULL) 677 return -1; 678 bitmask_copy(cp->cpus, cpus); 679 cp->cpus_valid = 1; 680 cp->cpus_dirty = 1; 681 return 0; 682} 683 684/* Set Memory Nodes in cpuset cp to 
bitmask mems */ 685int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems) 686{ 687 if (cp->mems) 688 bitmask_free(cp->mems); 689 cp->mems = bitmask_alloc(bitmask_nbits(mems)); 690 if (cp->mems == NULL) 691 return -1; 692 bitmask_copy(cp->mems, mems); 693 cp->mems_valid = 1; 694 cp->mems_dirty = 1; 695 return 0; 696} 697 698/* Set integer value optname of cpuset cp */ 699int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value) 700{ 701 if (streq(optionname, "cpu_exclusive")) { 702 cp->cpu_exclusive = !!value; 703 cp->cpu_exclusive_valid = 1; 704 cp->cpu_exclusive_dirty = 1; 705 } else if (streq(optionname, "mem_exclusive")) { 706 cp->mem_exclusive = !!value; 707 cp->mem_exclusive_valid = 1; 708 cp->mem_exclusive_dirty = 1; 709 } else if (streq(optionname, "mem_hardwall")) { 710 cp->mem_hardwall = !!value; 711 cp->mem_hardwall_valid = 1; 712 cp->mem_hardwall_dirty = 1; 713 } else if (streq(optionname, "notify_on_release")) { 714 cp->notify_on_release = !!value; 715 cp->notify_on_release_valid = 1; 716 cp->notify_on_release_dirty = 1; 717 } else if (streq(optionname, "memory_pressure_enabled")) { 718 cp->memory_pressure_enabled = !!value; 719 cp->memory_pressure_enabled_valid = 1; 720 cp->memory_pressure_enabled_dirty = 1; 721 } else if (streq(optionname, "memory_migrate")) { 722 cp->memory_migrate = !!value; 723 cp->memory_migrate_valid = 1; 724 cp->memory_migrate_dirty = 1; 725 } else if (streq(optionname, "memory_spread_page")) { 726 cp->memory_spread_page = !!value; 727 cp->memory_spread_page_valid = 1; 728 cp->memory_spread_page_dirty = 1; 729 } else if (streq(optionname, "memory_spread_slab")) { 730 cp->memory_spread_slab = !!value; 731 cp->memory_spread_slab_valid = 1; 732 cp->memory_spread_slab_dirty = 1; 733 } else if (streq(optionname, "sched_load_balance")) { 734 cp->sched_load_balance = !!value; 735 cp->sched_load_balance_valid = 1; 736 cp->sched_load_balance_dirty = 1; 737 } else if (streq(optionname, 
"sched_relax_domain_level")) { 738 cp->sched_relax_domain_level = value; 739 cp->sched_relax_domain_level_valid = 1; 740 cp->sched_relax_domain_level_dirty = 1; 741 } else 742 return -2; /* optionname not recognized */ 743 return 0; 744} 745 746/* [optional] Set string value optname */ 747int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname, 748 UNUSED const char *value) 749{ 750 return -2; /* For now, all string options unrecognized */ 751} 752 753/* Return handle for reading memory_pressure. */ 754int cpuset_open_memory_pressure(const char *cpusetpath) 755{ 756 char buf[PATH_MAX]; 757 758 fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure"); 759 return open(buf, O_RDONLY); 760} 761 762/* Return current memory_pressure of cpuset. */ 763int cpuset_read_memory_pressure(int han) 764{ 765 char buf[SMALL_BUFSZ]; 766 767 if (pread(han, buf, sizeof(buf), 0L) < 0) 768 return -1; 769 return atoi(buf); 770} 771 772/* Close handle for reading memory pressure. */ 773void cpuset_close_memory_pressure(int han) 774{ 775 close(han); 776} 777 778/* 779 * Resolve cpuset pointer (to that of current task if cp == NULL). 780 * 781 * If cp not NULL, just return it. If cp is NULL, return pointer 782 * to temporary cpuset for current task, and set *cp_tofree to 783 * pointer to that same temporary cpuset, to be freed later. 784 * 785 * Return NULL and set errno on error. Errors can occur when 786 * resolving the current tasks cpuset. 
787 */ 788static const struct cpuset *resolve_cp(const struct cpuset *cp, 789 struct cpuset **cp_tofree) 790{ 791 const struct cpuset *rcp; 792 793 if (cp) { 794 rcp = cp; 795 } else { 796 struct cpuset *cp1 = cpuset_alloc(); 797 if (cp1 == NULL) 798 goto err; 799 if (cpuset_cpusetofpid(cp1, 0) < 0) { 800 cpuset_free(cp1); 801 goto err; 802 } 803 *cp_tofree = cp1; 804 rcp = cp1; 805 } 806 return rcp; 807err: 808 return NULL; 809} 810 811/* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */ 812int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus) 813{ 814 struct cpuset *cp_tofree = NULL; 815 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 816 817 if (!cp1) 818 goto err; 819 if (cp1->cpus == NULL) { 820 errno = EINVAL; 821 goto err; 822 } 823 bitmask_copy(cpus, cp1->cpus); 824 cpuset_free(cp_tofree); 825 return 0; 826err: 827 cpuset_free(cp_tofree); 828 return -1; 829} 830 831/* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */ 832int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems) 833{ 834 struct cpuset *cp_tofree = NULL; 835 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 836 837 if (!cp1) 838 goto err; 839 if (cp1->mems == NULL) { 840 errno = EINVAL; 841 goto err; 842 } 843 bitmask_copy(mems, cp1->mems); 844 cpuset_free(cp_tofree); 845 return 0; 846err: 847 cpuset_free(cp_tofree); 848 return -1; 849} 850 851/* Return number of CPUs in cpuset cp (current task if cp == NULL) */ 852int cpuset_cpus_weight(const struct cpuset *cp) 853{ 854 struct cpuset *cp_tofree = NULL; 855 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 856 int w = -1; 857 858 if (!cp1) 859 goto err; 860 if (cp1->cpus == NULL) { 861 errno = EINVAL; 862 goto err; 863 } 864 w = bitmask_weight(cp1->cpus); 865 /* fall into ... 
*/ 866err: 867 cpuset_free(cp_tofree); 868 return w; 869} 870 871/* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */ 872int cpuset_mems_weight(const struct cpuset *cp) 873{ 874 struct cpuset *cp_tofree = NULL; 875 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 876 int w = -1; 877 878 if (!cp1) 879 goto err; 880 if (cp1->mems == NULL) { 881 errno = EINVAL; 882 goto err; 883 } 884 w = bitmask_weight(cp1->mems); 885 /* fall into ... */ 886err: 887 cpuset_free(cp_tofree); 888 return w; 889} 890 891/* Return integer value of option optname in cp */ 892int cpuset_get_iopt(const struct cpuset *cp, const char *optionname) 893{ 894 if (streq(optionname, "cpu_exclusive")) 895 return cp->cpu_exclusive; 896 else if (streq(optionname, "mem_exclusive")) 897 return cp->mem_exclusive; 898 else if (streq(optionname, "mem_hardwall")) 899 return cp->mem_hardwall; 900 else if (streq(optionname, "notify_on_release")) 901 return cp->notify_on_release; 902 else if (streq(optionname, "memory_pressure_enabled")) 903 return cp->memory_pressure_enabled; 904 else if (streq(optionname, "memory_migrate")) 905 return cp->memory_migrate; 906 else if (streq(optionname, "memory_spread_page")) 907 return cp->memory_spread_page; 908 else if (streq(optionname, "memory_spread_slab")) 909 return cp->memory_spread_slab; 910 else if (streq(optionname, "sched_load_balance")) 911 return cp->sched_load_balance; 912 else if (streq(optionname, "sched_relax_domain_level")) 913 return cp->sched_relax_domain_level; 914 else 915 return -2; /* optionname not recognized */ 916} 917 918/* [optional] Return string value of optname */ 919const char *cpuset_get_sopt(UNUSED const struct cpuset *cp, 920 UNUSED const char *optionname) 921{ 922 return NULL; /* For now, all string options unrecognized */ 923} 924 925static int read_flag(const char *filepath, char *flagp) 926{ 927 char buf[SMALL_BUFSZ]; /* buffer a "0" or "1" flag line */ 928 int fd = -1; 929 930 if ((fd = open(filepath, 
O_RDONLY)) < 0) 931 goto err; 932 if (read(fd, buf, sizeof(buf)) < 1) 933 goto err; 934 if (atoi(buf)) 935 *flagp = 1; 936 else 937 *flagp = 0; 938 close(fd); 939 return 0; 940err: 941 if (fd >= 0) 942 close(fd); 943 return -1; 944} 945 946static int load_flag(const char *path, char *flagp, const char *flag) 947{ 948 char buf[PATH_MAX]; 949 950 pathcat2(buf, sizeof(buf), path, flag); 951 return read_flag(buf, flagp); 952} 953 954static int read_number(const char *filepath, int *numberp) 955{ 956 char buf[SMALL_BUFSZ]; 957 int fd = -1; 958 959 if ((fd = open(filepath, O_RDONLY)) < 0) 960 goto err; 961 if (read(fd, buf, sizeof(buf)) < 1) 962 goto err; 963 *numberp = atoi(buf); 964 close(fd); 965 return 0; 966err: 967 if (fd >= 0) 968 close(fd); 969 return -1; 970} 971 972static int load_number(const char *path, int *numberp, const char *file) 973{ 974 char buf[PATH_MAX]; 975 976 pathcat2(buf, sizeof(buf), path, file); 977 return read_number(buf, numberp); 978} 979 980static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits) 981{ 982 FILE *fp = NULL; 983 char *buf = NULL; 984 int buflen; 985 struct bitmask *bmp = NULL; 986 987 if ((fp = fopen(filepath, "r")) == NULL) 988 goto err; 989 buflen = filesize(fp) + 1; /* + 1 for nul term */ 990 if ((buf = malloc(buflen)) == NULL) 991 goto err; 992 if (flgets(buf, buflen, fp) == NULL) 993 goto err; 994 fclose(fp); 995 fp = NULL; 996 997 if ((bmp = bitmask_alloc(nbits)) == NULL) 998 goto err; 999 if (*buf && bitmask_parselist(buf, bmp) < 0) 1000 goto err; 1001 if (*bmpp) 1002 bitmask_free(*bmpp); 1003 *bmpp = bmp; 1004 free(buf); 1005 buf = NULL; 1006 return 0; 1007err: 1008 if (buf != NULL) 1009 free(buf); 1010 if (fp != NULL) 1011 fclose(fp); 1012 if (bmp != NULL) 1013 bitmask_free(bmp); 1014 return -1; 1015} 1016 1017static int load_mask(const char *path, struct bitmask **bmpp, 1018 int nbits, const char *mask) 1019{ 1020 char buf[PATH_MAX]; 1021 1022 pathcat2(buf, sizeof(buf), path, mask); 1023 return 
read_mask(buf, bmpp, nbits); 1024} 1025 1026/* Write string to file at given filepath. Create or truncate file. */ 1027static int write_string_file(const char *filepath, const char *str) 1028{ 1029 int fd = -1; 1030 1031 if ((fd = open(filepath, O_WRONLY|O_CREAT, 0644)) < 0) 1032 goto err; 1033 if (write(fd, str, strlen(str)) < 0) 1034 goto err; 1035 close(fd); 1036 return 0; 1037err: 1038 if (fd >= 0) 1039 close(fd); 1040 return -1; 1041} 1042 1043/* Size and allocate buffer. Write bitmask into it. Caller must free */ 1044static char *sprint_mask_buf(const struct bitmask *bmp) 1045{ 1046 char *buf = NULL; 1047 int buflen; 1048 char c; 1049 1050 /* First bitmask_displaylist() call just to get the length */ 1051 buflen = bitmask_displaylist(&c, 1, bmp) + 1; /* "+ 1" for nul */ 1052 if ((buf = malloc(buflen)) == NULL) 1053 return NULL; 1054 bitmask_displaylist(buf, buflen, bmp); 1055 return buf; 1056} 1057 1058static int exists_flag(const char *path, const char *flag) 1059{ 1060 char buf[PATH_MAX]; 1061 struct stat statbuf; 1062 int rc; 1063 1064 pathcat2(buf, sizeof(buf), path, flag); 1065 rc = (stat(buf, &statbuf) == 0); 1066 errno = 0; 1067 return rc; 1068} 1069 1070static int store_flag(const char *path, const char *flag, int val) 1071{ 1072 char buf[PATH_MAX]; 1073 1074 pathcat2(buf, sizeof(buf), path, flag); 1075 return write_string_file(buf, val ? 
"1" : "0"); 1076} 1077 1078static int store_number(const char *path, const char *file, int val) 1079{ 1080 char buf[PATH_MAX]; 1081 char data[SMALL_BUFSZ]; 1082 1083 memset(data, 0, sizeof(data)); 1084 pathcat2(buf, sizeof(buf), path, file); 1085 snprintf(data, sizeof(data), "%d", val); 1086 return write_string_file(buf, data); 1087} 1088 1089static int store_mask(const char *path, const char *mask, 1090 const struct bitmask *bmp) 1091{ 1092 char maskpath[PATH_MAX]; 1093 char *bp = NULL; 1094 int rc; 1095 1096 if (bmp == NULL) 1097 return 0; 1098 pathcat2(maskpath, sizeof(maskpath), path, mask); 1099 if ((bp = sprint_mask_buf(bmp)) == NULL) 1100 return -1; 1101 rc = write_string_file(maskpath, bp); 1102 free(bp); 1103 return rc; 1104} 1105 1106/* 1107 * Return 1 if 'cpu' is online, else 0 if offline. Tests the file 1108 * /sys/devices/system/cpu/cpuN/online file for 0 or 1 contents 1109 * were N == cpu number. 1110 */ 1111 1112char cpu_online(unsigned int cpu) 1113{ 1114 char online; 1115 char cpupath[PATH_MAX]; 1116 1117 (void) snprintf(cpupath, sizeof(cpupath), 1118 "/sys/devices/system/cpu/cpu%d/online", cpu); 1119 if (read_flag(cpupath, &online) < 0) 1120 return 0; /* oops - guess that cpu's not there */ 1121 return online; 1122} 1123 1124/* 1125 * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()), 1126 * to the node on which that cpu resides or cpuset_mems_nbits(). 1127 * 1128 * To avoid every user having to recalculate this relation 1129 * from various clues in the sysfs file system (below the 1130 * path /sys/devices/system) a copy of this map is kept at 1131 * /var/run/cpunodemap. 1132 * 1133 * The system automatically cleans out files below 1134 * /var/run on each system reboot (see the init script 1135 * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry 1136 * about stale data in this file across reboots. 
If the file 1137 * is missing, let the first process that needs it, and has 1138 * permission to write in the /var/run directory, rebuild it. 1139 * 1140 * If using this cached data, remember the mtime of the mapfile 1141 * the last time we read it in case something like a hotplug 1142 * event results in the file being removed and rebuilt, so we 1143 * can detect if we're using a stale cache, and need to reload. 1144 * 1145 * The mtime of this file is set to the time when we did 1146 * the recalculation of the map, from the clues beneath 1147 * /sys/devices/system. This is done so that a program 1148 * won't see the mapfile it just wrote as being newer than what 1149 * it just wrote out (store_map) and read the same map back in 1150 * (load_file). 1151 */ 1152 1153/* 1154 * Hold flockfile(stdin) while using cpunodemap for posix thread safety. 1155 * 1156 * Note on locking and flockfile(FILE *): 1157 * 1158 * We use flockfile() and funlockfile() instead of directly 1159 * calling pthread_mutex_lock and pthread_mutex_unlock on 1160 * a pthread_mutex_t, because this avoids forcing the app 1161 * to link with libpthread. The glibc implementation of 1162 * flockfile/funlockfile will fall back to no-ops if libpthread 1163 * doesn't happen to be linked. 1164 * 1165 * Since flockfile already has the moderately convoluted 1166 * combination of weak and strong symbols required to accomplish 1167 * this, it is easier to use flockfile() on some handy FILE * 1168 * stream as a surrogate for pthread locking than it is to so 1169 * re-invent that wheel. 1170 * 1171 * Forcing all apps that use cpusets to link with libpthread 1172 * would force non-transparent initialization on apps that 1173 * might not be prepared to handle it. 1174 * 1175 * The application using libcpuset should never notice this 1176 * odd use of flockfile(), because we never return to the 1177 * application from any libcpuset call with any such lock held. 
1178 * We just use this locking for guarding some non-atomic cached 1179 * data updates and accesses, internal to some libcpuset calls. 1180 * Also, flockfile() allows recursive nesting, so if the app 1181 * calls libcpuset holding such a file lock, we won't deadlock 1182 * if we go to acquire the same lock. We'll just get the lock 1183 * and increment its counter while we hold it. 1184 */ 1185 1186static struct cpunodemap { 1187 int *map; /* map[cpumask_sz]: maps cpu to its node */ 1188 time_t mtime; /* modtime of mapfile when last read */ 1189} cpunodemap; 1190 1191/* 1192 * rebuild_map() - Rebuild cpunodemap[] from scratch. 1193 * 1194 * Situation: 1195 * Neither our in-memory cpunodemap[] array nor the 1196 * cache of it in mapfile is current. 1197 * Action: 1198 * Rebuild it from first principles and the information 1199 * available below /sys/devices/system. 1200 */ 1201 1202static void rebuild_map() 1203{ 1204 char buf[PATH_MAX]; 1205 DIR *dir1, *dir2; 1206 struct dirent *dent1, *dent2; 1207 int ncpus = cpuset_cpus_nbits(); 1208 int nmems = cpuset_mems_nbits(); 1209 unsigned int cpu, mem; 1210 1211 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) 1212 cpunodemap.map[cpu] = -1; 1213 pathcat2(buf, sizeof(buf), sysdevices, "node"); 1214 if ((dir1 = opendir(buf)) == NULL) 1215 return; 1216 while ((dent1 = readdir(dir1)) != NULL) { 1217 if (sscanf(dent1->d_name, "node%u", &mem) < 1) 1218 continue; 1219 pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name); 1220 if ((dir2 = opendir(buf)) == NULL) 1221 continue; 1222 while ((dent2 = readdir(dir2)) != NULL) { 1223 if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1) 1224 continue; 1225 if (cpu >= (unsigned int)ncpus 1226 || mem >= (unsigned int)nmems) 1227 continue; 1228 cpunodemap.map[cpu] = mem; 1229 } 1230 closedir(dir2); 1231 } 1232 closedir(dir1); 1233 cpunodemap.mtime = time(0); 1234} 1235 1236/* 1237 * load_map() - Load cpunodemap[] from mapfile. 
1238 * 1239 * Situation: 1240 * The cpunodemap in mapfile is more recent than 1241 * what we have in the cpunodemap[] array. 1242 * Action: 1243 * Reload the cpunodemap[] array from the file. 1244 */ 1245 1246static void load_map() 1247{ 1248 char buf[SMALL_BUFSZ]; /* buffer 1 line of mapfile */ 1249 FILE *mapfp; /* File stream on mapfile */ 1250 int ncpus = cpuset_cpus_nbits(); 1251 int nmems = cpuset_mems_nbits(); 1252 unsigned int cpu, mem; 1253 1254 if ((cpunodemap.map = calloc(ncpus, sizeof (int))) == NULL) 1255 return; 1256 cpunodemap.mtime = get_mtime(mapfile); 1257 if ((mapfp = fopen(mapfile, "r")) == NULL) 1258 return; 1259 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) 1260 cpunodemap.map[cpu] = nmems; 1261 while (flgets(buf, sizeof(buf), mapfp) != NULL) { 1262 if (sscanf(buf, "%u %u", &cpu, &mem) < 2) 1263 continue; 1264 if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems) 1265 continue; 1266 cpunodemap.map[cpu] = mem; 1267 } 1268 fclose(mapfp); 1269} 1270 1271/* 1272 * store_map() - Write cpunodemap[] out to mapfile. 1273 * 1274 * Situation: 1275 * The cpunodemap in the cpunodemap[] array is 1276 * more recent than the one in mapfile. 1277 * Action: 1278 * Write cpunodemap[] out to mapfile. 
1279 */ 1280 1281static void store_map() 1282{ 1283 char buf[PATH_MAX]; 1284 int fd = -1; 1285 FILE *mapfp = NULL; 1286 int ncpus = cpuset_cpus_nbits(); 1287 int nmems = cpuset_mems_nbits(); 1288 unsigned int cpu, mem; 1289 1290 snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX"); 1291 if ((fd = mkstemp(buf)) < 0) 1292 goto err; 1293 if ((mapfp = fdopen(fd, "w")) == NULL) 1294 goto err; 1295 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1296 mem = cpunodemap.map[cpu]; 1297 if (mem < (unsigned int)nmems) 1298 fprintf(mapfp, "%u %u\n", cpu, mem); 1299 } 1300 fclose(mapfp); 1301 set_mtime(buf, cpunodemap.mtime); 1302 if (rename(buf, mapfile) < 0) 1303 goto err; 1304 /* mkstemp() creates mode 0600 - change to world readable */ 1305 (void) chmod(mapfile, 0444); 1306 return; 1307err: 1308 if (mapfp != NULL) { 1309 fclose(mapfp); 1310 fd = -1; 1311 } 1312 if (fd >= 0) 1313 close(fd); 1314 (void) unlink(buf); 1315} 1316 1317/* 1318 * Load and gain thread safe access to the <cpu, node> map. 1319 * 1320 * Return 0 on success with flockfile(stdin) held. 1321 * Each successful get_map() call must be matched with a 1322 * following put_map() call to release the lock. 1323 * 1324 * On error, return -1 with errno set and no lock held. 
1325 */ 1326 1327static int get_map() 1328{ 1329 time_t file_mtime; 1330 1331 flockfile(stdin); 1332 1333 if (cpunodemap.map == NULL) { 1334 cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int)); 1335 if (cpunodemap.map == NULL) 1336 goto err; 1337 } 1338 1339 /* If no one has a good cpunodemap, rebuild from scratch */ 1340 file_mtime = get_mtime(mapfile); 1341 if (cpunodemap.mtime == 0 && file_mtime == 0) 1342 rebuild_map(); 1343 1344 /* If either cpunodemap[] or mapfile newer, update other with it */ 1345 file_mtime = get_mtime(mapfile); 1346 if (cpunodemap.mtime < file_mtime) 1347 load_map(); 1348 else if (cpunodemap.mtime > file_mtime) 1349 store_map(); 1350 return 0; 1351err: 1352 funlockfile(stdin); 1353 return -1; 1354} 1355 1356static void put_map() 1357{ 1358 funlockfile(stdin); 1359} 1360 1361/* Set cpus to those local to Memory Nodes mems */ 1362int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus) 1363{ 1364 int ncpus = cpuset_cpus_nbits(); 1365 unsigned int cpu; 1366 1367 if (check() < 0) 1368 return -1; 1369 1370 get_map(); 1371 bitmask_clearall(cpus); 1372 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1373 if (bitmask_isbitset(mems, cpunodemap.map[cpu])) 1374 bitmask_setbit(cpus, cpu); 1375 } 1376 put_map(); 1377 return 0; 1378} 1379 1380/* Set mems to those local to CPUs cpus */ 1381int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems) 1382{ 1383 int ncpus = cpuset_cpus_nbits(); 1384 unsigned int cpu; 1385 1386 if (check() < 0) 1387 return -1; 1388 1389 get_map(); 1390 bitmask_clearall(mems); 1391 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1392 if (bitmask_isbitset(cpus, cpu)) 1393 bitmask_setbit(mems, cpunodemap.map[cpu]); 1394 } 1395 put_map(); 1396 return 0; 1397} 1398 1399/* 1400 * distmap[] 1401 * 1402 * Array of ints of size cpumask_sz by nodemask_sz. 1403 * 1404 * Element distmap[cpu][mem] is the distance between CPU cpu 1405 * and Memory Node mem. 
   Distances are weighted to roughly
 * approximate the cost of memory references, and scaled so that
 * the distance from a CPU to its local Memory Node is ten (10).
 *
 * The first call to cpuset_cpumemdist() builds this map, from
 * whatever means the kernel provides to obtain these distances.
 *
 * These distances derive from ACPI SLIT table entries, which are
 * eight bits in size.
 *
 * Hold flockfile(stdout) while using distmap for posix thread safety.
 */

typedef unsigned char distmap_entry_t;	/* type of distmap[] entries */

static distmap_entry_t *distmap;	/* maps <cpu, mem> to distance */

#define DISTMAP_MAX UCHAR_MAX	/* maximum value in distmap[] */

/* 2-D array index simulation; relies on a local 'nmems' being in scope */
#define I(i,j) ((i) * nmems + (j))

/*
 * Parse arch neutral lines from 'distance' files of form:
 *
 *	46 66 10 20
 *
 * The lines contain a space separated list of distances, which is parsed
 * into array dists[] of each nodes distance from the specified node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 * For each cpu c on node:
 *	For each node position n in list of distances:
 *		distmap[c][n] = dists[n]
 */

static int parse_distmap_line(unsigned int node, char *buf)
{
	char *p, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int c, n;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;
	int ret = -1;

	p = buf;
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	/* default every node's distance to "unreachable" */
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	/* walk the line, pulling out one decimal distance per node slot */
	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned int d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;	/* no digits consumed - malformed tail */
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t)d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	/* copy this node's distance row to every cpu local to the node */
	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	ret = 0;
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
	return ret;
}

/*
 * Read the first line of the 'distance' file at 'path' and feed it to
 * parse_distmap_line() for node 'node'.  Returns 0 on success, -1 on
 * any open/alloc/read/parse failure.
 */
static int parse_distance_file(unsigned int node, const char *path)
{
	FILE *fp;
	char *buf = NULL;
	int buflen;

	if ((fp = fopen(path, "r")) == NULL)
		goto err;

	/* NOTE(review): assumes filesize() returns a length large enough
	 * for flgets() including its terminator - TODO confirm helper */
	buflen = filesize(fp);

	if ((buf = malloc(buflen)) == NULL)
		goto err;

	if (flgets(buf, buflen, fp) == NULL)
		goto err;

	if (parse_distmap_line(node, buf) < 0)
		goto err;

	free(buf);
	fclose(fp);
	return 0;
err:
	free(buf);
	if (fp)
		fclose(fp);
	return -1;
}

/*
 * Build distmap[] by scanning the per-node 'distance' files below
 * distance_directory.  Only tried once per process; on any failure the
 * partially built map is discarded (distmap left NULL) rather than
 * kept half-filled.
 */
static void build_distmap()
{
	static int tried_before = 0;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	DIR *dir = NULL;
	struct dirent *dent;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	if ((dir = opendir(distance_directory)) == NULL)
		goto err;
	while ((dent = readdir(dir)) != NULL) {
		char buf[PATH_MAX];
		unsigned int node;

		if (sscanf(dent->d_name, "node%u", &node) < 1)
			continue;
		pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
			 "distance");
		if (parse_distance_file(node, buf) < 0)
			goto err;
	}
	closedir(dir);
	return;
err:
	if (dir)
		closedir(dir);
	free(distmap);
	distmap = NULL;
}

#ifdef ALTERNATE_SN_DISTMAP

/*
 * Parse SN architecture specific line of form:
 *
 *	node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
 *
 * Second field is node number.
The "dist" field is the colon separated list 1570 * of distances, which is parsed into array dists[] of each nodes distance 1571 * from that node. 1572 * 1573 * Result is placed in distmap[ncpus][nmems]: 1574 * 1575 * For each cpu c on that node: 1576 * For each node position n in list of distances: 1577 * distmap[c][n] = dists[n] 1578 */ 1579 1580static void parse_distmap_line_sn(char *buf) 1581{ 1582 char *p, *pend, *q; 1583 int ncpus = cpuset_cpus_nbits(); 1584 int nmems = cpuset_mems_nbits(); 1585 unsigned long c, n, node; 1586 distmap_entry_t *dists = NULL; 1587 struct bitmask *cpus = NULL, *mems = NULL; 1588 1589 if ((p = strchr(buf, ' ')) == NULL) 1590 goto err; 1591 if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems) 1592 goto err; 1593 if ((p = strstr(q, " dist ")) == NULL) 1594 goto err; 1595 p += strlen(" dist "); 1596 if ((pend = strchr(p, ' ')) != NULL) 1597 *pend = '\0'; 1598 if ((dists = calloc(nmems, sizeof(*dists))) == NULL) 1599 goto err; 1600 for (n = 0; n < (unsigned int)nmems; n++) 1601 dists[n] = DISTMAP_MAX; 1602 1603 for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) { 1604 unsigned long d; 1605 1606 if ((p = strpbrk(p, "0123456789")) == NULL) 1607 break; 1608 d = strtoul(p, &q, 10); 1609 if (p == q) 1610 break; 1611 if (d < DISTMAP_MAX) 1612 dists[n] = (distmap_entry_t)d; 1613 } 1614 1615 if ((mems = bitmask_alloc(nmems)) == NULL) 1616 goto err; 1617 bitmask_setbit(mems, node); 1618 1619 if ((cpus = bitmask_alloc(ncpus)) == NULL) 1620 goto err; 1621 cpuset_localcpus(mems, cpus); 1622 1623 for (c = bitmask_first(cpus); c < (unsigned int)ncpus; 1624 c = bitmask_next(cpus, c + 1)) 1625 for (n = 0; n < (unsigned int)nmems; n++) 1626 distmap[I(c, n)] = dists[n]; 1627 /* fall into ... 
*/ 1628err: 1629 bitmask_free(mems); 1630 bitmask_free(cpus); 1631 free(dists); 1632} 1633 1634static void build_distmap_sn() 1635{ 1636 int ncpus = cpuset_cpus_nbits(); 1637 int nmems = cpuset_mems_nbits(); 1638 int c, m; 1639 static int tried_before = 0; 1640 FILE *fp = NULL; 1641 char *buf = NULL; 1642 int buflen; 1643 1644 if (tried_before) 1645 goto err; 1646 tried_before = 1; 1647 1648 if ((fp = fopen(sn_topology, "r")) == NULL) 1649 goto err; 1650 1651 if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL) 1652 goto err; 1653 1654 for (c = 0; c < ncpus; c++) 1655 for (m = 0; m < nmems; m++) 1656 distmap[I(c, m)] = DISTMAP_MAX; 1657 1658 buflen = filesize(fp); 1659 if ((buf = malloc(buflen)) == NULL) 1660 goto err; 1661 1662 while (flgets(buf, buflen, fp) != NULL) 1663 if (strprefix(buf, sn_top_node_prefix)) 1664 parse_distmap_line_sn(buf); 1665 1666 free(buf); 1667 fclose(fp); 1668 return; 1669err: 1670 free(buf); 1671 free(distmap); 1672 distmap = NULL; 1673 if (fp) 1674 fclose(fp); 1675} 1676 1677#endif 1678 1679/* [optional] Hardware distance from CPU to Memory Node */ 1680unsigned int cpuset_cpumemdist(int cpu, int mem) 1681{ 1682 int ncpus = cpuset_cpus_nbits(); 1683 int nmems = cpuset_mems_nbits(); 1684 distmap_entry_t r = DISTMAP_MAX; 1685 1686 flockfile(stdout); 1687 1688 if (check() < 0) 1689 goto err; 1690 1691 if (distmap == NULL) 1692 build_distmap(); 1693 1694#ifdef ALTERNATE_SN_DISTMAP 1695 if (distmap == NULL) 1696 build_distmap_sn(); 1697#endif 1698 1699 if (distmap == NULL) 1700 goto err; 1701 1702 if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems) 1703 goto err; 1704 1705 r = distmap[I(cpu, mem)]; 1706 /* fall into ... 
*/ 1707err: 1708 funlockfile(stdout); 1709 return r; 1710} 1711 1712/* [optional] Return Memory Node closest to cpu */ 1713int cpuset_cpu2node(int cpu) 1714{ 1715 int ncpus = cpuset_cpus_nbits(); 1716 int nmems = cpuset_mems_nbits(); 1717 struct bitmask *cpus = NULL, *mems = NULL; 1718 int r = -1; 1719 1720 if (check() < 0) 1721 goto err; 1722 1723 if ((cpus = bitmask_alloc(ncpus)) == NULL) 1724 goto err; 1725 bitmask_setbit(cpus, cpu); 1726 1727 if ((mems = bitmask_alloc(nmems)) == NULL) 1728 goto err; 1729 cpuset_localmems(cpus, mems); 1730 r = bitmask_first(mems); 1731 /* fall into ... */ 1732err: 1733 bitmask_free(cpus); 1734 bitmask_free(mems); 1735 return r; 1736} 1737 1738static int apply_cpuset_settings(const char *path, const struct cpuset *cp) 1739{ 1740 if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) { 1741 if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0) 1742 goto err; 1743 } 1744 1745 if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) { 1746 if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0) 1747 goto err; 1748 } 1749 1750 if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) { 1751 if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0) 1752 goto err; 1753 } 1754 1755 if (cp->notify_on_release_valid && cp->notify_on_release_dirty) { 1756 if (store_flag(path, "notify_on_release", cp->notify_on_release) < 0) 1757 goto err; 1758 } 1759 1760 if (cp->memory_migrate_valid && 1761 cp->memory_migrate_dirty && 1762 exists_flag(path, "memory_migrate")) { 1763 if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0) 1764 goto err; 1765 } 1766 1767 if (cp->memory_pressure_enabled_valid && 1768 cp->memory_pressure_enabled_dirty && 1769 exists_flag(path, "memory_pressure_enabled")) { 1770 if (store_flag(path, "memory_pressure_enabled", cp->memory_pressure_enabled) < 0) 1771 goto err; 1772 } 1773 1774 if (cp->memory_spread_page_valid && 1775 cp->memory_spread_page_dirty && 1776 exists_flag(path, "memory_spread_page")) 
{ 1777 if (store_flag(path, "memory_spread_page", cp->memory_spread_page) < 0) 1778 goto err; 1779 } 1780 1781 if (cp->memory_spread_slab_valid && 1782 cp->memory_spread_slab_dirty && 1783 exists_flag(path, "memory_spread_slab")) { 1784 if (store_flag(path, "memory_spread_slab", cp->memory_spread_slab) < 0) 1785 goto err; 1786 } 1787 1788 if (cp->sched_load_balance_valid && 1789 cp->sched_load_balance_dirty && 1790 exists_flag(path, "sched_load_balance")) { 1791 if (store_flag(path, "sched_load_balance", cp->sched_load_balance) < 0) 1792 goto err; 1793 } 1794 1795 if (cp->sched_relax_domain_level_valid && 1796 cp->sched_relax_domain_level_dirty && 1797 exists_flag(path, "sched_relax_domain_level")) { 1798 if (store_number(path, "sched_relax_domain_level", cp->sched_relax_domain_level) < 0) 1799 goto err; 1800 } 1801 1802 if (cp->cpus_valid && cp->cpus_dirty) { 1803 if (store_mask(path, "cpus", cp->cpus) < 0) 1804 goto err; 1805 } 1806 1807 if (cp->mems_valid && cp->mems_dirty) { 1808 if (store_mask(path, "mems", cp->mems) < 0) 1809 goto err; 1810 } 1811 return 0; 1812err: 1813 return -1; 1814} 1815 1816/* 1817 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below. 1818 * 1819 * Extract max value of any 'siblings' field in /proc/cpuinfo. 1820 * Cache the result - only need to extract once in lifetime of task. 1821 * 1822 * The siblings field is the number of logical CPUs in a physical 1823 * processor package. It is equal to the product of the number of 1824 * cores in that package, times the number of hyper-threads per core. 1825 * The bug that cpuset_would_crash_kernel() is detecting arises 1826 * when a cpu_exclusive cpuset tries to include just some, not all, 1827 * of the sibling logical CPUs available in a processor package. 
1828 * 1829 * In the improbable case that a system has mixed values of siblings 1830 * (some processor packages have more than others, perhaps due to 1831 * partially enabling Hyper-Threading), we take the worse case value, 1832 * the largest siblings value. This might be overkill. I don't know 1833 * if this kernel bug considers each processor package's siblings 1834 * separately or not. But it sure is easier this way ... 1835 * 1836 * This routine takes about 0.7 msecs on a 4 CPU 2.8 MHz Xeon, from 1837 * open to close, the first time called. 1838 */ 1839 1840static int get_siblings() 1841{ 1842 static int siblings; 1843 char buf[32]; /* big enough for one 'siblings' line */ 1844 FILE *fp; 1845 1846 if (siblings) 1847 return siblings; 1848 1849 if ((fp = fopen("/proc/cpuinfo", "r")) == NULL) 1850 return 4; /* wing it - /proc not mounted ? */ 1851 while (flgets(buf, sizeof(buf), fp) != NULL) { 1852 int s; 1853 1854 if (sscanf(buf, "siblings : %d", &s) < 1) 1855 continue; 1856 if (s > siblings) 1857 siblings = s; 1858 } 1859 fclose(fp); 1860 if (siblings == 0) 1861 siblings = 1; /* old kernel, no siblings, default to 1 */ 1862 return siblings; 1863} 1864 1865/* 1866 * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic 1867 * scheduler domain code invoked for cpu_exclusive cpusets that causes 1868 * the kernel to freeze, requiring a hardware reset. 1869 * 1870 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive' 1871 * cpuset is defined where that cpusets 'cpus' are not on package 1872 * boundaries then the kernel will freeze, usually as soon as this 1873 * cpuset is created, requiring a hardware reset. 1874 * 1875 * A cpusets 'cpus' are not on package boundaries if the cpuset 1876 * includes a proper non-empty subset (some, but not all) of the 1877 * logical cpus on a processor package. This requires multiple 1878 * logical CPUs per package, available with either Hyper-Thread or 1879 * Multi-Core support. 
Without one of these features, there is only 1880 * one logical CPU per physical package, and it's not possible to 1881 * have a proper, non-empty subset of a set of cardinality one. 1882 * 1883 * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC 1884 * on i386 and x86_64 arch's. 1885 * 1886 * The objective of this routine cpuset_would_crash_kernel() is to 1887 * determine if a proposed cpuset setting would crash the kernel due 1888 * to this bug, so that the caller can avoid the crash. 1889 * 1890 * Ideally we'd check for exactly these conditions here, but computing 1891 * the package (identified by the 'physical id' field of /proc/cpuinfo) 1892 * of each cpu in a cpuset is more effort than it's worth here. 1893 * 1894 * Also there is no obvious way to identify exactly whether the kernel 1895 * one is executing on has this bug, short of trying it, and seeing 1896 * if the kernel just crashed. 1897 * 1898 * So for now, we look for a simpler set of conditions, that meets 1899 * our immediate need - avoid this crash on SUSE SLES10 systems that 1900 * are susceptible to it. We look for the kernel version 2.6.16.*, 1901 * which is the base kernel of SUSE SLES10, and for i386 or x86_64 1902 * processors, which had CONFIG_SCHED_MC enabled. 1903 * 1904 * If these simpler conditions are met, we further simplify the check, 1905 * by presuming that the logical CPUs are numbered on processor 1906 * package boundaries. If each package has S siblings, we assume 1907 * that CPUs numbered N through N + S -1 are on the same package, 1908 * for any CPU N such that N mod S == 0. 1909 * 1910 * Yes, this is a hack, focused on avoiding kernel freezes on 1911 * susceptible SUSE SLES10 systems. 
 */

static int cpuset_would_crash_kernel(const struct cpuset *cp)
{
	static int susceptible_system = -1;	/* -1: not yet determined */

	if (!cp->cpu_exclusive)
		goto ok;

	if (susceptible_system == -1) {
		struct utsname u;
		int rel_2_6_16, arch_i386, arch_x86_64;

		/* conservatively report "would crash" if uname fails */
		if (uname(&u) < 0)
			goto fail;
		rel_2_6_16 = strprefix(u.release, "2.6.16.");
		arch_i386 = streq(u.machine, "i386");
		arch_x86_64 = streq(u.machine, "x86_64");
		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
	}

	if (susceptible_system) {
		int ncpus = cpuset_cpus_nbits();
		int siblings = get_siblings();
		unsigned int cpu;

		/* check each presumed package [cpu, cpu+siblings) */
		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
			int s, num_set = 0;

			for (s = 0; s < siblings; s++) {
				if (bitmask_isbitset(cp->cpus, cpu + s))
					num_set++;
			}

			/* If none or all siblings set, we're still ok */
			if (num_set == 0 || num_set == siblings)
				continue;

			/* Found one that would crash kernel.  Fail.  */
			errno = ENXIO;
			goto fail;
		}
	}
	/* If not susceptible, or if all ok, fall into "ok" ... */
ok:
	return 0;		/* would not crash */
fail:
	return 1;		/* would crash */
}

/* compare two cpuset and mark the dirty variable */
static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
{
	if (cp1->cpu_exclusive_valid &&
	    cp1->cpu_exclusive != cp2->cpu_exclusive)
		cp1->cpu_exclusive_dirty = 1;

	if (cp1->mem_exclusive_valid &&
	    cp1->mem_exclusive != cp2->mem_exclusive)
		cp1->mem_exclusive_dirty = 1;

	if (cp1->mem_hardwall_valid &&
	    cp1->mem_hardwall != cp2->mem_hardwall)
		cp1->mem_hardwall_dirty = 1;

	if (cp1->notify_on_release_valid &&
	    cp1->notify_on_release != cp2->notify_on_release)
		cp1->notify_on_release_dirty = 1;

	if (cp1->memory_migrate_valid &&
	    cp1->memory_migrate != cp2->memory_migrate)
		cp1->memory_migrate_dirty = 1;

	if (cp1->memory_pressure_enabled_valid &&
	    cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
		cp1->memory_pressure_enabled_dirty = 1;

	if (cp1->memory_spread_page_valid &&
	    cp1->memory_spread_page != cp2->memory_spread_page)
		cp1->memory_spread_page_dirty = 1;

	if (cp1->memory_spread_slab_valid &&
	    cp1->memory_spread_slab != cp2->memory_spread_slab)
		cp1->memory_spread_slab_dirty = 1;

	if (cp1->sched_load_balance_valid &&
	    cp1->sched_load_balance != cp2->sched_load_balance)
		cp1->sched_load_balance_dirty = 1;

	if (cp1->sched_relax_domain_level_valid &&
	    cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
		cp1->sched_relax_domain_level_dirty = 1;

	if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
		cp1->cpus_dirty = 1;
	if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
		cp1->mems_dirty = 1;
}

/* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
{
	char buf[PATH_MAX];
	int do_rmdir_on_err = 0;
	int do_restore_cp_sav_on_err = 0;
	struct cpuset *cp_sav = NULL;
	int sav_errno;

	if (check() < 0)
		goto err;

	if (cpuset_would_crash_kernel(cp))
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (new) {
		if (mkdir(buf, 0755) < 0)
			goto err;
		/* we made it, so we should remove it on error */
		do_rmdir_on_err = 1;
	}

	if ((cp_sav = cpuset_alloc()) == NULL)
		goto err;
	if (cpuset_query(cp_sav, relpath) < 0)
		goto err;
	/* we have old settings to restore on error */
	do_restore_cp_sav_on_err = 1;

	/* check which variable need to restore on error */
	mark_dirty_variable(cp_sav, cp);

	if (apply_cpuset_settings(buf, cp) < 0)
		goto err;

	cpuset_free(cp_sav);
	return 0;
err:
	/* preserve the errno of the first failure across cleanup calls */
	sav_errno = errno;
	if (do_restore_cp_sav_on_err)
		(void) apply_cpuset_settings(buf, cp_sav);
	if (cp_sav)
		cpuset_free(cp_sav);
	if (do_rmdir_on_err)
		(void) rmdir(buf);
	errno = sav_errno;
	return -1;
}

/* Create cpuset 'cp' at location 'relpath' */
int cpuset_create(const char *relpath, const struct cpuset *cp)
{
	return cr_or_mod(relpath, cp, 1);
}

/* Delete cpuset at location 'path' (if empty) */
int cpuset_delete(const char *relpath)
{
	char buf[PATH_MAX];

	if (check() < 0)
		goto err;

	fullpath(buf, sizeof(buf), relpath);
	if (rmdir(buf) < 0)
		goto err;

	return 0;
err:
	return -1;
}

/* Set cpuset cp to the cpuset at location 'path' */
int cpuset_query(struct cpuset *cp, const char *relpath)
{
	char buf[PATH_MAX];

	if (check() < 0)
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0)
		goto err;
	cp->cpu_exclusive_valid = 1;

	if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0)
		goto err;
	cp->mem_exclusive_valid = 1;

	if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
		goto err;
	cp->notify_on_release_valid = 1;

	/* optional features: only loaded if their special file exists */
	if (exists_flag(buf, "memory_migrate")) {
		if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0)
			goto err;
		cp->memory_migrate_valid = 1;
	}

	if (exists_flag(buf, "mem_hardwall")) {
		if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0)
			goto err;
		cp->mem_hardwall_valid = 1;
	}

	if (exists_flag(buf, "memory_pressure_enabled")) {
		if (load_flag(buf, &cp->memory_pressure_enabled, "memory_pressure_enabled") < 0)
			goto err;
		cp->memory_pressure_enabled_valid = 1;
	}

	if (exists_flag(buf, "memory_spread_page")) {
		if (load_flag(buf, &cp->memory_spread_page, "memory_spread_page") < 0)
			goto err;
		cp->memory_spread_page_valid = 1;
	}

	if (exists_flag(buf, "memory_spread_slab")) {
		if (load_flag(buf, &cp->memory_spread_slab, "memory_spread_slab") < 0)
			goto err;
		cp->memory_spread_slab_valid = 1;
	}

	if (exists_flag(buf, "sched_load_balance")) {
		if (load_flag(buf, &cp->sched_load_balance, "sched_load_balance") < 0)
			goto err;
		cp->sched_load_balance_valid = 1;
	}

	if (exists_flag(buf, "sched_relax_domain_level")) {
		if (load_number(buf, &cp->sched_relax_domain_level, "sched_relax_domain_level") < 0)
			goto err;
		cp->sched_relax_domain_level_valid = 1;
	}

	if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0)
		goto err;
	cp->cpus_valid = 1;

	if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0)
		goto err;
	cp->mems_valid = 1;

	return 0;
err:
	return -1;
}

/* Modify cpuset at location 'relpath' to values of 'cp' */
int cpuset_modify(const char *relpath, const struct cpuset *cp)
{
	return cr_or_mod(relpath, cp, 0);
}

/* Get cpuset path of pid into buf */
char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
{
	int fd;			/* dual use: cpuset file for pid and self */
	int rc;			/* dual use: snprintf and read return codes */

	if (check() < 0)
		return NULL;

	/* borrow result buf[] to build cpuset file path */
	if (pid == 0)
		rc = snprintf(buf, size, "/proc/self/cpuset");
	else
		rc = snprintf(buf, size, "/proc/%d/cpuset", pid);
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	if ((fd = open(buf, O_RDONLY)) < 0) {
		int e = errno;
		/* distinguish "no such pid" from "no cpuset support" */
		if (e == ENOENT)
			e = ESRCH;
		if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0)
			e = ENOSYS;
		else
			close(fd);
		errno = e;
		return NULL;
	}
	rc = read(fd, buf, size);
	close(fd);
	if (rc < 0)
		return NULL;
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	buf[rc] = 0;
	chomp(buf);		/* strip trailing newline */
	return buf;

}

/* Get cpuset 'cp' of pid */
int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
{
	char buf[PATH_MAX];

	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
		return -1;
	if (cpuset_query(cp, buf) < 0)
		return -1;
	return 0;
}

/* [optional] Return mountpoint of cpuset filesystem */
const char *cpuset_mountpoint()
{
	if (check() < 0) {
		switch (errno) {
		case ENODEV:
			return "[cpuset filesystem not mounted]";
		default:
			return "[cpuset filesystem not supported]";
		}
	}
	return cpusetmnt;
}

/* Return true if path is a directory. */
static int isdir(const char *path)
{
	struct stat statbuf;

	if (stat(path, &statbuf) < 0)
		return 0;
	return S_ISDIR(statbuf.st_mode);
}

/*
 * [optional] cpuset_collides_exclusive() - True if would collide exclusive.
2249 * 2250 * Return true iff the specified cpuset would overlap with any 2251 * sibling cpusets in either cpus or mems, where either this 2252 * cpuset or the sibling is cpu_exclusive or mem_exclusive. 2253 * 2254 * cpuset_create() fails with errno == EINVAL if the requested cpuset 2255 * would overlap with any sibling, where either one is cpu_exclusive or 2256 * mem_exclusive. This is a common, and not obvious error. The 2257 * following routine checks for this particular case, so that code 2258 * creating cpusets can better identify the situation, perhaps to issue 2259 * a more informative error message. 2260 * 2261 * Can also be used to diagnose cpuset_modify failures. This 2262 * routine ignores any existing cpuset with the same path as the 2263 * given 'cpusetpath', and only looks for exclusive collisions with 2264 * sibling cpusets of that path. 2265 * 2266 * In case of any error, returns (0) -- does not collide. Presumably 2267 * any actual attempt to create or modify a cpuset will encounter the 2268 * same error, and report it usefully. 2269 * 2270 * This routine is not particularly efficient; most likely code creating or 2271 * modifying a cpuset will want to try the operation first, and then if that 2272 * fails with errno EINVAL, perhaps call this routine to determine if an 2273 * exclusive cpuset collision caused the error. 
2274 */ 2275 2276int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1) 2277{ 2278 char parent[PATH_MAX]; 2279 char *p; 2280 char *pathcopy = NULL; 2281 char *base; 2282 DIR *dir = NULL; 2283 struct dirent *dent; 2284 struct cpuset *cp2 = NULL; 2285 struct bitmask *cpus1 = NULL, *cpus2 = NULL; 2286 struct bitmask *mems1 = NULL, *mems2 = NULL; 2287 int ret; 2288 2289 if (check() < 0) 2290 goto err; 2291 2292 fullpath(parent, sizeof(parent), cpusetpath); 2293 if (streq(parent, cpusetmnt)) 2294 goto err; /* only one cpuset root - can't collide */ 2295 pathcopy = strdup(parent); 2296 p = strrchr(parent, '/'); 2297 if (!p) 2298 goto err; /* huh? - impossible - run and hide */ 2299 *p = 0; /* now parent is dirname of fullpath */ 2300 2301 p = strrchr(pathcopy, '/'); 2302 base = p + 1; /* now base is basename of fullpath */ 2303 if (!*base) 2304 goto err; /* this is also impossible - run away */ 2305 2306 if ((dir = opendir(parent)) == NULL) 2307 goto err; 2308 if ((cp2 = cpuset_alloc()) == NULL) 2309 goto err; 2310 if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 2311 goto err; 2312 if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 2313 goto err; 2314 if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL) 2315 goto err; 2316 if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL) 2317 goto err; 2318 2319 while ((dent = readdir(dir)) != NULL) { 2320 char child[PATH_MAX]; 2321 2322 if (streq(dent->d_name, ".") || streq(dent->d_name, "..")) 2323 continue; 2324 if (streq(dent->d_name, base)) 2325 continue; 2326 pathcat2(child, sizeof(child), parent, dent->d_name); 2327 if (!isdir(child)) 2328 continue; 2329 if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0) 2330 goto err; 2331 if (cp1->cpu_exclusive || cp2->cpu_exclusive) { 2332 cpuset_getcpus(cp1, cpus1); 2333 cpuset_getcpus(cp2, cpus2); 2334 if (bitmask_intersects(cpus1, cpus2)) 2335 goto collides; 2336 } 2337 if (cp1->mem_exclusive || cp2->mem_exclusive) { 2338 
cpuset_getmems(cp1, mems1); 2339 cpuset_getmems(cp2, mems2); 2340 if (bitmask_intersects(mems1, mems2)) 2341 goto collides; 2342 } 2343 } 2344err: 2345 /* error, or did not collide */ 2346 ret = 0; 2347 goto done; 2348collides: 2349 /* collides */ 2350 ret = 1; 2351 /* fall into ... */ 2352done: 2353 if (dir) 2354 closedir(dir); 2355 cpuset_free(cp2); 2356 free(pathcopy); 2357 bitmask_free(cpus1); 2358 bitmask_free(cpus2); 2359 bitmask_free(mems1); 2360 bitmask_free(mems2); 2361 return ret; 2362} 2363 2364/* 2365 * [optional] cpuset_nuke() - Remove cpuset anyway possible 2366 * 2367 * Remove a cpuset, including killing tasks in it, and 2368 * removing any descendent cpusets and killing their tasks. 2369 * 2370 * Tasks can take a long time (minutes on some configurations) 2371 * to exit. Loop up to 'seconds' seconds, trying to kill them. 2372 * 2373 * How we do it: 2374 * 1) First, kill all the pids, looping until there are 2375 * no more pids in this cpuset or below, or until the 2376 * 'seconds' timeout limit is exceeded. 2377 * 2) Then depth first recursively rmdir the cpuset directories. 2378 * 3) If by this point the original cpuset is gone, we succeeded. 2379 * 2380 * If the timeout is exceeded, and tasks still exist, fail with 2381 * errno == ETIME. 2382 * 2383 * We sleep a variable amount of time. After the first attempt to 2384 * kill all the tasks in the cpuset or its descendents, we sleep 1 2385 * second, the next time 2 seconds, increasing 1 second each loop 2386 * up to a max of 10 seconds. If more loops past 10 are required 2387 * to kill all the tasks, we sleep 10 seconds each subsequent loop. 2388 * In any case, before the last loop, we sleep however many seconds 2389 * remain of the original timeout 'seconds' requested. The total 2390 * time of all sleeps will be no more than the requested 'seconds'. 
2391 * 2392 * If the cpuset started out empty of any tasks, or if the passed in 2393 * 'seconds' was zero, then this routine will return quickly, having 2394 * not slept at all. Otherwise, this routine will at a minimum send 2395 * a SIGKILL to all the tasks in this cpuset subtree, then sleep one 2396 * second, before looking to see if any tasks remain. If tasks remain 2397 * in the cpuset subtree, and a longer 'seconds' timeout was requested 2398 * (more than one), it will continue to kill remaining tasks and sleep, 2399 * in a loop, for as long as time and tasks remain. 2400 * 2401 * The signal sent for the kill is hardcoded to SIGKILL (9). If some 2402 * other signal should be sent first, use a separate code loop, 2403 * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to 2404 * scan the task pids in a cpuset. If SIGKILL should -not- be sent, 2405 * this cpuset_nuke() routine can still be called to recursively 2406 * remove a cpuset subtree, by specifying a timeout of zero 'seconds'. 2407 * 2408 * On success, returns 0 with errno == 0. 
2409 * 2410 * On failure, returns -1, with errno possibly one of: 2411 * EACCES - search permission denied on intervening directory 2412 * ETIME - timed out - tasks remain after 'seconds' timeout 2413 * EMFILE - too many open files 2414 * ENODEV - /dev/cpuset not mounted 2415 * ENOENT - component of cpuset path doesn't exist 2416 * ENOMEM - out of memory 2417 * ENOSYS - kernel doesn't support cpusets 2418 * ENOTDIR - component of cpuset path is not a directory 2419 * EPERM - lacked permission to kill a task 2420 * EPERM - lacked permission to read cpusets or files therein 2421 */ 2422 2423void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree); 2424 2425int cpuset_nuke(const char *relpath, unsigned int seconds) 2426{ 2427 unsigned int secs_left = seconds; /* total sleep seconds left */ 2428 unsigned int secs_loop = 1; /* how much sleep next loop */ 2429 unsigned int secs_slept; /* seconds slept in sleep() */ 2430 struct cpuset_pidlist *pl = NULL; /* pids in cpuset subtree */ 2431 struct cpuset_fts_tree *cs_tree; 2432 const struct cpuset_fts_entry *cs_entry; 2433 int ret, sav_errno = 0; 2434 2435 if (check() < 0) 2436 return -1; 2437 2438 if (seconds == 0) 2439 goto rmdir_cpusets; 2440 2441 while (1) { 2442 int plen, j; 2443 2444 if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) { 2445 /* missing cpuset is as good as if already nuked */ 2446 if (errno == ENOENT) { 2447 ret = 0; 2448 goto no_more_cpuset; 2449 } 2450 2451 /* other problems reading cpuset are bad news */ 2452 sav_errno = errno; 2453 goto failed; 2454 } 2455 2456 if ((plen = cpuset_pidlist_length(pl)) == 0) 2457 goto rmdir_cpusets; 2458 2459 for (j = 0; j < plen; j++) { 2460 pid_t pid; 2461 2462 if ((pid = cpuset_get_pidlist(pl, j)) > 1) { 2463 if (kill(pid, SIGKILL) < 0 && errno != ESRCH) { 2464 sav_errno = errno; 2465 goto failed; 2466 } 2467 } 2468 } 2469 2470 if (secs_left == 0) 2471 goto took_too_long; 2472 2473 cpuset_freepidlist(pl); 2474 pl = NULL; 2475 2476 secs_slept = secs_loop - 
sleep(secs_loop); 2477 2478 /* Ensure forward progress */ 2479 if (secs_slept == 0) 2480 secs_slept = 1; 2481 2482 /* Ensure sane sleep() return (unnecessary?) */ 2483 if (secs_slept > secs_loop) 2484 secs_slept = secs_loop; 2485 2486 secs_left -= secs_slept; 2487 2488 if (secs_loop < 10) 2489 secs_loop++; 2490 2491 secs_loop = min(secs_left, secs_loop); 2492 } 2493 2494took_too_long: 2495 sav_errno = ETIME; 2496 /* fall into ... */ 2497failed: 2498 cpuset_freepidlist(pl); 2499 errno = sav_errno; 2500 return -1; 2501 2502rmdir_cpusets: 2503 /* Let's try removing cpuset(s) now. */ 2504 cpuset_freepidlist(pl); 2505 2506 if ((cs_tree = cpuset_fts_open(relpath)) == NULL && errno != ENOENT) 2507 return -1; 2508 ret = 0; 2509 cpuset_fts_reverse(cs_tree); /* rmdir's must be done bottom up */ 2510 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) { 2511 char buf[PATH_MAX]; 2512 2513 fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry)); 2514 if (rmdir(buf) < 0 && errno != ENOENT) { 2515 sav_errno = errno; 2516 ret = -1; 2517 } 2518 } 2519 cpuset_fts_close(cs_tree); 2520 /* fall into ... */ 2521no_more_cpuset: 2522 if (ret == 0) 2523 errno = 0; 2524 else 2525 errno = sav_errno; 2526 return ret; 2527} 2528 2529/* 2530 * When recursively reading all the tasks files from a subtree, 2531 * chain together the read results, one pidblock per tasks file, 2532 * containing the raw unprocessed ascii as read(2) in. After 2533 * we gather up this raw data, we then go back to count how 2534 * many pid's there are in total, allocate an array of pid_t 2535 * of that size, and transform the raw ascii data into this 2536 * array of pid_t's. 2537 */ 2538 2539struct pidblock { 2540 char *buf; 2541 int buflen; 2542 struct pidblock *next; 2543}; 2544 2545/* 2546 * Chain the raw contents of a file onto the pbhead list. 
2547 * 2548 * We malloc "+ 1" extra byte for a nul-terminator, so that 2549 * the strtoul() loop in pid_transform() won't scan past 2550 * the end of pb->buf[] and accidentally find more pids. 2551 */ 2552static void add_pidblock(const char *file, struct pidblock **ppbhead) 2553{ 2554 FILE *fp = NULL; 2555 struct pidblock *pb = NULL; 2556 int fsz; 2557 2558 if ((fp = fopen(file, "r")) == NULL) 2559 goto err; 2560 fsz = filesize(fp); 2561 if (fsz == 0) 2562 goto err; 2563 if ((pb = calloc(1, sizeof(*pb))) == NULL) 2564 goto err; 2565 pb->buflen = fsz; 2566 if ((pb->buf = malloc(pb->buflen + 1)) == NULL) 2567 goto err; 2568 if (fread(pb->buf, 1, pb->buflen, fp) > 0) { 2569 pb->buf[pb->buflen] = '\0'; 2570 pb->next = *ppbhead; 2571 *ppbhead = pb; 2572 } 2573 fclose(fp); 2574 return; 2575err: 2576 if (fp) 2577 fclose(fp); 2578 if (pb) 2579 free(pb); 2580} 2581 2582static void read_task_file(const char *relpath, struct pidblock **ppbhead) 2583{ 2584 char buf[PATH_MAX]; 2585 2586 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2587 add_pidblock(buf, ppbhead); 2588} 2589 2590struct cpuset_pidlist { 2591 pid_t *pids; 2592 int npids; 2593}; 2594 2595/* Count how many pids in buf (one per line - just count newlines) */ 2596static int pidcount(const char *buf, int buflen) 2597{ 2598 int n = 0; 2599 const char *cp; 2600 2601 for (cp = buf; cp < buf + buflen; cp++) { 2602 if (*cp == '\n') 2603 n++; 2604 } 2605 return n; 2606} 2607 2608/* Transform one-per-line ascii pids in pb to pid_t entries in pl */ 2609static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n) 2610{ 2611 char *a, *b; 2612 2613 for (a = pb->buf; a < pb->buf + pb->buflen; a = b) { 2614 pid_t p = strtoul(a, &b, 10); 2615 if (a == b) 2616 break; 2617 pl->pids[n++] = p; 2618 } 2619 return n; 2620} 2621 2622static void free_pidblocks(struct pidblock *pbhead) 2623{ 2624 struct pidblock *pb, *nextpb; 2625 2626 for (pb = pbhead; pb; pb = nextpb) { 2627 nextpb = pb->next; 2628 free(pb->buf); 2629 
free(pb); 2630 } 2631} 2632 2633/* numeric comparison routine for qsort */ 2634static int numericsort(const void *m1, const void *m2) 2635{ 2636 pid_t p1 = * (pid_t *) m1; 2637 pid_t p2 = * (pid_t *) m2; 2638 2639 return p1 - p2; 2640} 2641 2642/* Return list pids in cpuset 'path' */ 2643struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath, 2644 int recursiveflag) 2645{ 2646 struct pidblock *pb = NULL; 2647 struct cpuset_pidlist *pl = NULL; 2648 struct pidblock *pbhead = NULL; 2649 int n; 2650 2651 if (check() < 0) 2652 goto err; 2653 2654 if (recursiveflag) { 2655 struct cpuset_fts_tree *cs_tree; 2656 const struct cpuset_fts_entry *cs_entry; 2657 2658 if ((cs_tree = cpuset_fts_open(relpath)) == NULL) 2659 goto err; 2660 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) { 2661 if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET) 2662 continue; 2663 read_task_file(cpuset_fts_get_path(cs_entry), &pbhead); 2664 } 2665 cpuset_fts_close(cs_tree); 2666 } else { 2667 read_task_file(relpath, &pbhead); 2668 } 2669 2670 if ((pl = calloc(1, sizeof(*pl))) == NULL) 2671 goto err; 2672 pl->npids = 0; 2673 for (pb = pbhead; pb; pb = pb->next) 2674 pl->npids += pidcount(pb->buf, pb->buflen); 2675 if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL) 2676 goto err; 2677 n = 0; 2678 for (pb = pbhead; pb; pb = pb->next) 2679 n = pid_transform(pb, pl, n); 2680 free_pidblocks(pbhead); 2681 qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort); 2682 return pl; 2683err: 2684 cpuset_freepidlist(pl); 2685 free_pidblocks(pbhead); 2686 return NULL; 2687} 2688 2689/* Return number of elements in pidlist */ 2690int cpuset_pidlist_length(const struct cpuset_pidlist *pl) 2691{ 2692 if (pl) 2693 return pl->npids; 2694 else 2695 return 0; 2696} 2697 2698/* Return i'th element of pidlist */ 2699pid_t cpuset_get_pidlist(const struct cpuset_pidlist *pl, int i) 2700{ 2701 if (pl && i >= 0 && i < pl->npids) 2702 return pl->pids[i]; 2703 else 2704 return (pid_t)-1; 2705} 2706 
2707/* Free pidlist */ 2708void cpuset_freepidlist(struct cpuset_pidlist *pl) 2709{ 2710 if (pl && pl->pids) 2711 free(pl->pids); 2712 if (pl) 2713 free(pl); 2714} 2715 2716static int __cpuset_move(pid_t pid, const char *path) 2717{ 2718 char buf[SMALL_BUFSZ]; 2719 2720 snprintf(buf, sizeof(buf), "%u", pid); 2721 return write_string_file(path, buf); 2722} 2723 2724/* Move task (pid == 0 for current) to a cpuset */ 2725int cpuset_move(pid_t pid, const char *relpath) 2726{ 2727 char buf[PATH_MAX]; 2728 2729 if (check() < 0) 2730 return -1; 2731 2732 if (pid == 0) 2733 pid = getpid(); 2734 2735 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2736 return __cpuset_move(pid, buf); 2737} 2738 2739/* Move all tasks in pidlist to a cpuset */ 2740int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath) 2741{ 2742 int i; 2743 char buf[PATH_MAX]; 2744 int ret; 2745 2746 if (check() < 0) 2747 return -1; 2748 2749 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2750 2751 ret = 0; 2752 for (i = 0; i < pl->npids; i++) 2753 if (__cpuset_move(pl->pids[i], buf) < 0) 2754 ret = -1; 2755 return ret; 2756} 2757 2758/* 2759 * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a 2760 * cpuset to another cpuset 2761 * 2762 * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may 2763 * race with tasks being added to or forking into fromrelpath. Loop 2764 * repeatedly, reading the tasks file of cpuset fromrelpath and writing 2765 * any task pid's found there to the tasks file of cpuset torelpath, 2766 * up to ten attempts, or until the tasks file of cpuset fromrelpath 2767 * is empty, or until fromrelpath is no longer present. 2768 * 2769 * Returns 0 with errno == 0 if able to empty the tasks file of cpuset 2770 * fromrelpath. 
Of course it is still possible that some independent 2771 * task could add another task to cpuset fromrelpath at the same time 2772 * that such a successful result is being returned, so there can be 2773 * no guarantee that a successful return means that fromrelpath is 2774 * still empty of tasks. 2775 * 2776 * We are careful to allow for the possibility that the cpuset 2777 * fromrelpath might disappear out from under us, perhaps because it 2778 * has notify_on_release set and gets automatically removed as soon 2779 * as we detach its last task from it. Consider a missing fromrelpath 2780 * to be a successful move. 2781 * 2782 * If called with fromrelpath and torelpath pathnames that evaluate to 2783 * the same cpuset, then treat that as if cpuset_reattach() was called, 2784 * rebinding each task in this cpuset one time, and return success or 2785 * failure depending on the return of that cpuset_reattach() call. 2786 * 2787 * On failure, returns -1, with errno possibly one of: 2788 * EACCES - search permission denied on intervening directory 2789 * ENOTEMPTY - tasks remain after multiple attempts to move them 2790 * EMFILE - too many open files 2791 * ENODEV - /dev/cpuset not mounted 2792 * ENOENT - component of cpuset path doesn't exist 2793 * ENOMEM - out of memory 2794 * ENOSYS - kernel doesn't support cpusets 2795 * ENOTDIR - component of cpuset path is not a directory 2796 * EPERM - lacked permission to kill a task 2797 * EPERM - lacked permission to read cpusets or files therein 2798 * 2799 * This is an [optional] function. Use cpuset_function to invoke it. 
2800 */ 2801 2802#define NUMBER_MOVE_TASK_ATTEMPTS 10 2803 2804int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath) 2805{ 2806 char fromfullpath[PATH_MAX]; 2807 char tofullpath[PATH_MAX]; 2808 int i; 2809 struct cpuset_pidlist *pl = NULL; 2810 int sav_errno; 2811 2812 fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath); 2813 fullpath(tofullpath, sizeof(tofullpath), torelpath); 2814 2815 if (samefile(fromfullpath, tofullpath)) 2816 return cpuset_reattach(fromrelpath); 2817 2818 for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) { 2819 int plen, j; 2820 2821 if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) { 2822 /* missing cpuset is as good as if all moved */ 2823 if (errno == ENOENT) 2824 goto no_more_cpuset; 2825 2826 /* other problems reading cpuset are bad news */ 2827 sav_errno = errno; 2828 goto failed; 2829 } 2830 2831 if ((plen = cpuset_pidlist_length(pl)) == 0) 2832 goto no_more_pids; 2833 2834 for (j = 0; j < plen; j++) { 2835 pid_t pid; 2836 2837 pid = cpuset_get_pidlist(pl, j); 2838 if (cpuset_move(pid, torelpath) < 0) { 2839 /* missing task is as good as if moved */ 2840 if (errno == ESRCH) 2841 continue; 2842 2843 /* other per-task errors are bad news */ 2844 sav_errno = errno; 2845 goto failed; 2846 } 2847 } 2848 2849 cpuset_freepidlist(pl); 2850 pl = NULL; 2851 } 2852 2853 sav_errno = ENOTEMPTY; 2854 /* fall into ... */ 2855failed: 2856 cpuset_freepidlist(pl); 2857 errno = sav_errno; 2858 return -1; 2859 2860no_more_pids: 2861no_more_cpuset: 2862 /* Success - all tasks (or entire cpuset ;) gone. 
*/ 2863 cpuset_freepidlist(pl); 2864 errno = 0; 2865 return 0; 2866} 2867 2868/* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */ 2869int cpuset_migrate(pid_t pid, const char *relpath) 2870{ 2871 char buf[PATH_MAX]; 2872 char buf2[PATH_MAX]; 2873 char memory_migrate_flag; 2874 int r; 2875 2876 if (check() < 0) 2877 return -1; 2878 2879 if (pid == 0) 2880 pid = getpid(); 2881 2882 fullpath(buf2, sizeof(buf2), relpath); 2883 2884 if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0) 2885 return -1; 2886 if (store_flag(buf2, "memory_migrate", 1) < 0) 2887 return -1; 2888 2889 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2890 2891 r = __cpuset_move(pid, buf); 2892 2893 store_flag(buf2, "memory_migrate", memory_migrate_flag); 2894 return r; 2895} 2896 2897/* Migrate all tasks in pidlist to a cpuset (moves task and memory) */ 2898int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath) 2899{ 2900 int i; 2901 char buf[PATH_MAX]; 2902 char buf2[PATH_MAX]; 2903 char memory_migrate_flag; 2904 int ret; 2905 2906 if (check() < 0) 2907 return -1; 2908 2909 fullpath(buf2, sizeof(buf2), relpath); 2910 2911 if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0) 2912 return -1; 2913 if (store_flag(buf2, "memory_migrate", 1) < 0) 2914 return -1; 2915 2916 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2917 2918 ret = 0; 2919 for (i = 0; i < pl->npids; i++) 2920 if (__cpuset_move(pl->pids[i], buf) < 0) 2921 ret = -1; 2922 2923 if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0) 2924 ret = -1; 2925 return ret; 2926} 2927 2928/* Rebind cpus_allowed of each task in cpuset 'path' */ 2929int cpuset_reattach(const char *relpath) 2930{ 2931 struct cpuset_pidlist *pl; 2932 int rc; 2933 2934 if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL) 2935 return -1; 2936 rc = cpuset_move_all(pl, relpath); 2937 cpuset_freepidlist(pl); 2938 return rc; 2939} 2940 2941/* Map cpuset relative cpu number to system wide cpu number */ 
2942int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu) 2943{ 2944 struct cpuset *cp_tofree = NULL; 2945 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 2946 int pos = -1; 2947 2948 if (!cp1) 2949 goto err; 2950 pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu); 2951 /* fall into ... */ 2952err: 2953 cpuset_free(cp_tofree); 2954 return pos; 2955} 2956 2957/* Map system wide cpu number to cpuset relative cpu number */ 2958int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu) 2959{ 2960 struct cpuset *cp_tofree = NULL; 2961 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 2962 int pos = -1; 2963 2964 if (!cp1) 2965 goto err; 2966 pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu); 2967 /* fall into ... */ 2968err: 2969 cpuset_free(cp_tofree); 2970 return pos; 2971} 2972 2973/* Map cpuset relative mem number to system wide mem number */ 2974int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem) 2975{ 2976 struct cpuset *cp_tofree = NULL; 2977 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 2978 int pos = -1; 2979 2980 if (!cp1) 2981 goto err; 2982 pos = bitmask_rel_to_abs_pos(cp1->mems, mem); 2983 /* fall into ... */ 2984err: 2985 cpuset_free(cp_tofree); 2986 return pos; 2987} 2988 2989/* Map system wide mem number to cpuset relative mem number */ 2990int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem) 2991{ 2992 struct cpuset *cp_tofree = NULL; 2993 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 2994 int pos = -1; 2995 2996 if (!cp1) 2997 goto err; 2998 pos = bitmask_abs_to_rel_pos(cp1->mems, mem); 2999 /* fall into ... 
*/ 3000err: 3001 cpuset_free(cp_tofree); 3002 return pos; 3003} 3004 3005/* Map pid's cpuset relative cpu number to system wide cpu number */ 3006int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu) 3007{ 3008 struct cpuset *cp; 3009 int rc = -1; 3010 3011 if ((cp = cpuset_alloc()) == NULL) 3012 goto done; 3013 if (cpuset_cpusetofpid(cp, pid) < 0) 3014 goto done; 3015 rc = cpuset_c_rel_to_sys_cpu(cp, cpu); 3016done: 3017 cpuset_free(cp); 3018 return rc; 3019} 3020 3021/* Map system wide cpu number to pid's cpuset relative cpu number */ 3022int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu) 3023{ 3024 struct cpuset *cp; 3025 int rc = -1; 3026 3027 if ((cp = cpuset_alloc()) == NULL) 3028 goto done; 3029 if (cpuset_cpusetofpid(cp, pid) < 0) 3030 goto done; 3031 rc = cpuset_c_sys_to_rel_cpu(cp, cpu); 3032done: 3033 cpuset_free(cp); 3034 return rc; 3035} 3036 3037/* Map pid's cpuset relative mem number to system wide mem number */ 3038int cpuset_p_rel_to_sys_mem(pid_t pid, int mem) 3039{ 3040 struct cpuset *cp; 3041 int rc = -1; 3042 3043 if ((cp = cpuset_alloc()) == NULL) 3044 goto done; 3045 if (cpuset_cpusetofpid(cp, pid) < 0) 3046 goto done; 3047 rc = cpuset_c_rel_to_sys_mem(cp, mem); 3048done: 3049 cpuset_free(cp); 3050 return rc; 3051} 3052 3053/* Map system wide mem number to pid's cpuset relative mem number */ 3054int cpuset_p_sys_to_rel_mem(pid_t pid, int mem) 3055{ 3056 struct cpuset *cp; 3057 int rc = -1; 3058 3059 if ((cp = cpuset_alloc()) == NULL) 3060 goto done; 3061 if (cpuset_cpusetofpid(cp, pid) < 0) 3062 goto done; 3063 rc = cpuset_c_sys_to_rel_mem(cp, mem); 3064done: 3065 cpuset_free(cp); 3066 return rc; 3067} 3068 3069/* 3070 * Override glibc's calls for get/set affinity - they have 3071 * something using cpu_set_t that will die when NR_CPUS > 1024. 3072 * Go directly to the 'real' system calls. Also override calls 3073 * for get_mempolicy and set_mempolicy. 
None of these 3074 * calls are yet (July 2004) guaranteed to be in all glibc versions 3075 * that we care about. 3076 */ 3077 3078static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask) 3079{ 3080 return syscall(__NR_sched_setaffinity, pid, len, mask); 3081} 3082 3083#if HAVE_DECL_MEMPOLICY 3084static int get_mempolicy(int *policy, unsigned long *nmask, 3085 unsigned long maxnode, void *addr, int flags) 3086{ 3087 return syscall(__NR_get_mempolicy, policy, nmask, maxnode, addr, flags); 3088} 3089 3090static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode) 3091{ 3092 return syscall(__NR_set_mempolicy, mode, nmask, maxnode); 3093} 3094#endif 3095 3096struct cpuset_placement { 3097 struct bitmask *cpus; 3098 struct bitmask *mems; 3099 char *path; 3100}; 3101 3102/* Allocate and fill in a placement struct - cpatures current placement */ 3103struct cpuset_placement *cpuset_get_placement(pid_t pid) 3104{ 3105 struct cpuset_placement *plc; 3106 struct cpuset *cp = NULL; 3107 char buf[PATH_MAX]; 3108 int nbits; 3109 3110 if ((plc = calloc(1, sizeof(*plc))) == NULL) 3111 goto err; 3112 3113 nbits = cpuset_cpus_nbits(); 3114 if ((plc->cpus = bitmask_alloc(nbits)) == NULL) 3115 goto err; 3116 3117 nbits = cpuset_mems_nbits(); 3118 if ((plc->mems = bitmask_alloc(nbits)) == NULL) 3119 goto err; 3120 3121 if ((cp = cpuset_alloc()) == NULL) 3122 goto err; 3123 if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL) 3124 goto err; 3125 if (cpuset_query(cp, buf) < 0) 3126 goto err; 3127 3128 bitmask_copy(plc->cpus, cp->cpus); 3129 bitmask_copy(plc->mems, cp->mems); 3130 plc->path = strdup(buf); 3131 3132 cpuset_free(cp); 3133 return plc; 3134err: 3135 cpuset_free(cp); 3136 cpuset_free_placement(plc); 3137 return NULL; 3138} 3139 3140/* Compare two placement structs - use to detect changes in placement */ 3141int cpuset_equal_placement(const struct cpuset_placement *plc1, 3142 const struct cpuset_placement *plc2) 3143{ 3144 return 
bitmask_equal(plc1->cpus, plc2->cpus) && 3145 bitmask_equal(plc1->mems, plc2->mems) && 3146 streq(plc1->path, plc2->path); 3147} 3148 3149/* Free a placement struct */ 3150void cpuset_free_placement(struct cpuset_placement *plc) 3151{ 3152 if (!plc) 3153 return; 3154 bitmask_free(plc->cpus); 3155 bitmask_free(plc->mems); 3156 free(plc->path); 3157 free(plc); 3158} 3159 3160/* 3161 * A cpuset_fts_open() call constructs a linked list of entries 3162 * called a "cpuset_fts_tree", with one entry per cpuset below 3163 * the specified path. The cpuset_fts_read() routine returns the 3164 * next entry on this list. The various cpuset_fts_get_*() calls 3165 * return attributes of the specified entry. The cpuset_fts_close() 3166 * call frees the linked list and all associated data. All cpuset 3167 * entries and attributes for the cpuset_fts_tree returned from a 3168 * given cpuset_fts_open() call remain allocated and unchanged until 3169 * that cpuset_fts_tree is closed by a cpuset_fts_close() call. Any 3170 * subsequent changes to the cpuset filesystem will go unnoticed 3171 * (not affect open cpuset_fts_tree's.) 3172 */ 3173 3174struct cpuset_fts_entry; 3175void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree); 3176 3177struct cpuset_fts_tree { 3178 struct cpuset_fts_entry *head; /* head of linked entry list */ 3179 struct cpuset_fts_entry *next; /* cpuset_fts_read() offset */ 3180}; 3181 3182struct cpuset_fts_entry { 3183 struct cpuset_fts_entry *next; /* linked entry list chain */ 3184 struct cpuset *cpuset; 3185 struct stat *stat; 3186 char *path; 3187 int info; 3188 int err; 3189}; 3190 3191/* Open a handle on a cpuset hierarchy. All the real work is done here. 
 */
struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
{
	FTS *fts = NULL;
	FTSENT *ftsent;
	char *path_argv[2];
	char buf[PATH_MAX];
	struct cpuset_fts_tree *cs_tree = NULL;
	struct cpuset_fts_entry *ep;	/* the latest new list entry */
	struct cpuset_fts_entry **pnlep;	/* ptr to next list entry ptr */
	char *relpath;
	int fts_flags;

	/* walk the absolute path below the cpuset mount point */
	fullpath(buf, sizeof(buf), cpusetpath);
	path_argv[0] = buf;
	path_argv[1] = NULL;

	fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
	fts = fts_open(path_argv, fts_flags, NULL);
	if (fts == NULL)
		goto err;

	cs_tree = malloc(sizeof(*cs_tree));
	if (cs_tree == NULL)
		goto err;
	/* build the singly linked entry list via a tail pointer */
	pnlep = &cs_tree->head;
	*pnlep = NULL;

	while ((ftsent = fts_read(fts)) != NULL) {
		/* only pre-order directory visits; skip files and post-order */
		if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
			continue;

		/* ftsent is a directory (perhaps unreadable) ==> cpuset */
		ep = calloc(1, sizeof(*ep));
		if (ep == NULL)
			goto err;
		/* link in before filling, so err path can free via cs_tree */
		*pnlep = ep;
		pnlep = &ep->next;

		/* Set entry's path, and if DNR, error */
		relpath = ftsent->fts_path + strlen(cpusetmnt);
		if (strlen(relpath) == 0)
			relpath = "/";	/* cpuset root itself */
		ep->path = strdup(relpath);
		if (ep->path == NULL)
			goto err;
		if (ftsent->fts_info == FTS_DNR) {
			/* unreadable directory: record error, keep entry */
			ep->info = CPUSET_FTS_ERR_DNR;
			ep->err = ftsent->fts_errno;
			continue;
		}

		/* ftsent is a -readable- cpuset: set entry's stat, etc */
		ep->stat = calloc(1, sizeof(struct stat));
		if (ep->stat == NULL)
			goto err;
		if (stat(ftsent->fts_path, ep->stat) < 0) {
			ep->info = CPUSET_FTS_ERR_STAT;
			ep->err = ftsent->fts_errno;
			continue;
		}

		ep->cpuset = calloc(1, sizeof(struct cpuset));
		if (ep->cpuset == NULL)
			goto err;
		if (cpuset_query(ep->cpuset, relpath) < 0) {
			ep->info = CPUSET_FTS_ERR_CPUSET;
			ep->err = errno;
			continue;
		}
		/* fully populated entry */
		ep->info = CPUSET_FTS_CPUSET;
	}

	(void)fts_close(fts);
	cpuset_fts_rewind(cs_tree);	/* position read cursor at head */
	return cs_tree;

err:
	/* cs_tree may hold partially built entries; close frees them all */
	if (cs_tree)
		cpuset_fts_close(cs_tree);
	if (fts)
		(void)fts_close(fts);
	return NULL;
}

/* Return pointer to next cpuset entry in hierarchy (NULL at end) */
const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
{
	const struct cpuset_fts_entry *cs_entry = cs_tree->next;
	if (cs_tree->next != NULL)	/* seek to next entry */
		cs_tree->next = cs_tree->next->next;
	return cs_entry;
}

/* Reverse list of cpusets, in place.  Simulates pre-order/post-order flip. */
void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
{
	struct cpuset_fts_entry *cs1, *cs2, *cs3;

	/*
	 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
	 * is redirected from cs3 to cs1.
	 */

	cs1 = cs2 = NULL;
	cs3 = cs_tree->head;
	while (cs3) {
		cs1 = cs2;
		cs2 = cs3;
		cs3 = cs3->next;
		cs2->next = cs1;
	}
	cs_tree->head = cs2;	/* old tail is the new head */
	cpuset_fts_rewind(cs_tree);
}

/* Rewind cpuset list to beginning */
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
{
	cs_tree->next = cs_tree->head;
}

/* Return pointer to nul-terminated cpuset path of entry in hierarchy */
const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->path;
}

/* Return pointer to stat(2) structure of a cpuset entry's directory */
const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->stat;
}

/* Return pointer to cpuset structure of a cpuset entry */
const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
					   *cs_entry)
{
	return cs_entry->cpuset;
}

/* Return value of errno (0 if no error) on attempted cpuset operations */
3332int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry) 3333{ 3334 return cs_entry->err; 3335} 3336 3337/* Return operation identity causing error */ 3338int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry) 3339{ 3340 return cs_entry->info; 3341} 3342 3343/* Close a cpuset hierarchy handle (free's all associated memory) */ 3344void cpuset_fts_close(struct cpuset_fts_tree *cs_tree) 3345{ 3346 struct cpuset_fts_entry *cs_entry = cs_tree->head; 3347 3348 while (cs_entry) { 3349 struct cpuset_fts_entry *ep = cs_entry; 3350 3351 cs_entry = cs_entry->next; 3352 free(ep->path); 3353 free(ep->stat); 3354 cpuset_free(ep->cpuset); 3355 free(ep); 3356 } 3357 free(cs_tree); 3358} 3359 3360/* Bind current task to cpu (uses sched_setaffinity(2)) */ 3361int cpuset_cpubind(int cpu) 3362{ 3363 struct bitmask *bmp; 3364 int r; 3365 3366 if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 3367 return -1; 3368 bitmask_setbit(bmp, cpu); 3369 r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp)); 3370 bitmask_free(bmp); 3371 return r; 3372} 3373 3374/* 3375 * int cpuset_latestcpu(pid_t pid) 3376 * 3377 * Return most recent CPU on which task pid executed. If pid == 0, 3378 * examine current task. 3379 * 3380 * The last used CPU is visible for a given pid as field #39 (starting 3381 * with #1) in the file /proc/pid/stat. Currently this file has 41 3382 * fields, in which case this is the 3rd to the last field. 3383 * 3384 * Unfortunately field #2 is a command name and might have embedded 3385 * whitespace. So we can't just count white space separated fields. 3386 * Fortunately, this command name is surrounded by parentheses, as 3387 * for example "(sh)", and that closing parenthesis is the last ')' 3388 * character in the line. No remaining fields can have embedded 3389 * whitespace or parentheses. 
So instead of looking for the 39th 3390 * white space separated field, we can look for the 37th white space 3391 * separated field past the last ')' character on the line. 3392 */ 3393 3394/* Return most recent CPU on which task pid executed */ 3395int cpuset_latestcpu(pid_t pid) 3396{ 3397 char buf[PATH_MAX]; 3398 char *bp; 3399 int fd = -1; 3400 int cpu = -1; 3401 3402 if (pid == 0) 3403 snprintf(buf, sizeof(buf), "/proc/self/stat"); 3404 else 3405 snprintf(buf, sizeof(buf), "/proc/%d/stat", pid); 3406 3407 if ((fd = open(buf, O_RDONLY)) < 0) 3408 goto err; 3409 if (read(fd, buf, sizeof(buf)) < 1) 3410 goto err; 3411 close(fd); 3412 3413 bp = strrchr(buf, ')'); 3414 if (bp) 3415 sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u " 3416 "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " 3417 "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " 3418 "%*u %*u %*u %*u %*u %*u %*u %*u %u", /* 37th field past ')' */ 3419 &cpu); 3420 if (cpu < 0) 3421 errno = EINVAL; 3422 return cpu; 3423err: 3424 if (fd >= 0) 3425 close(fd); 3426 return -1; 3427} 3428 3429/* Bind current task to memory (uses set_mempolicy(2)) */ 3430int cpuset_membind(int mem) 3431{ 3432 struct bitmask *bmp; 3433 int r; 3434 3435 if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL) 3436 return -1; 3437 bitmask_setbit(bmp, mem); 3438#if HAVE_DECL_MPOL_BIND 3439 r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), 3440 bitmask_nbits(bmp) + 1); 3441#else 3442 r = -1; 3443 errno = ENOSYS; 3444#endif 3445 bitmask_free(bmp); 3446 return r; 3447} 3448 3449/* [optional] Return Memory Node holding page at specified addr */ 3450int cpuset_addr2node(void *addr) 3451{ 3452 int node = -1; 3453 3454#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE 3455 if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE|MPOL_F_ADDR)) { 3456 /* I realize this seems redundant, but I _want_ to make sure 3457 * that this value is -1. 
*/ 3458 node = -1; 3459 } 3460#endif 3461 return node; 3462} 3463 3464/* 3465 * Transform cpuset into Text Format Representation in buffer 'buf', 3466 * of length 'buflen', nul-terminated if space allows. Return number 3467 * of characters that would have been written, if enough space had 3468 * been available, in the same way that snprintf() does. 3469 */ 3470 3471/* Export cpuset settings to a regular file */ 3472int cpuset_export(const struct cpuset *cp, char *buf, int buflen) 3473{ 3474 char *tmp = NULL; 3475 int n = 0; 3476 3477 if (cp->cpu_exclusive) 3478 n += snprintf(buf + n, max(buflen - n, 0), "cpu_exclusive\n"); 3479 3480 if (cp->mem_exclusive) 3481 n += snprintf(buf + n, max(buflen - n, 0), "mem_exclusive\n"); 3482 3483 if (cp->notify_on_release) 3484 n += snprintf(buf + n, max(buflen - n, 0), 3485 "notify_on_release\n"); 3486 3487 if (cp->memory_pressure_enabled) 3488 n += snprintf(buf + n, max(buflen - n, 0), 3489 "memory_pressure_enabled\n"); 3490 3491 if (cp->memory_migrate) 3492 n += snprintf(buf + n, max(buflen - n, 0), 3493 "memory_migrate\n"); 3494 3495 if (cp->memory_spread_page) 3496 n += snprintf(buf + n, max(buflen - n, 0), 3497 "memory_spread_page\n"); 3498 3499 if (cp->memory_spread_slab) 3500 n += snprintf(buf + n, max(buflen - n, 0), 3501 "memory_spread_slab\n"); 3502 3503 if ((tmp = sprint_mask_buf(cp->cpus)) == NULL) 3504 return -1; 3505 n += snprintf(buf + n, max(buflen - n, 0), "cpus %s\n", tmp); 3506 free(tmp); 3507 tmp = NULL; 3508 3509 if ((tmp = sprint_mask_buf(cp->mems)) == NULL) 3510 return -1; 3511 n += snprintf(buf + n, max(buflen - n, 0), "mems %s\n", tmp); 3512 free(tmp); 3513 tmp = NULL; 3514 3515 return n; 3516} 3517 3518static int import_list(UNUSED const char *tok, const char *arg, 3519 struct bitmask *bmp, char *emsg, int elen) 3520{ 3521 if (bitmask_parselist(arg, bmp) < 0) { 3522 if (emsg) 3523 snprintf(emsg, elen, "Invalid list format: %s", arg); 3524 return -1; 3525 } 3526 return 0; 3527} 3528 3529static void 
stolower(char *s)
{
	/* Go through unsigned char: tolower() on a negative char is UB. */
	while (*s) {
		unsigned char c = *s;
		*s = tolower(c);
		s++;
	}
}

/*
 * Import cpuset settings from a regular file.
 *
 * 'buf' holds the file contents; lines are parsed one at a time via
 * slgets().  On error returns -1, storing the failing line number via
 * elinenum (if non-NULL) and a diagnostic in emsg (if non-NULL).
 */
int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
		  char *emsg, int elen)
{
	char *linebuf = NULL;
	int linebuflen;
	int linenum = 0;
	int offset = 0;		/* read position within buf, for slgets() */

	linebuflen = strlen(buf) + 1;
	if ((linebuf = malloc(linebuflen)) == NULL) {
		if (emsg)
			snprintf(emsg, elen, "Insufficient memory");
		goto err;
	}

	while (slgets(linebuf, linebuflen, buf, &offset)) {
		char *tok, *arg;
		char *ptr;	/* for strtok_r */

		linenum++;
		/* Strip trailing '#' comment, then grab the keyword. */
		if ((tok = strchr(linebuf, '#')) != NULL)
			*tok = 0;
		if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
			continue;	/* blank or comment-only line */
		stolower(tok);

		arg = strtok_r(0, " \t", &ptr);	/* optional argument */

		if (streq(tok, "cpu_exclusive")) {
			cp->cpu_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "mem_exclusive")) {
			cp->mem_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "notify_on_release")) {
			cp->notify_on_release = 1;
			goto eol;
		}
		if (streq(tok, "memory_pressure_enabled")) {
			cp->memory_pressure_enabled = 1;
			goto eol;
		}
		if (streq(tok, "memory_migrate")) {
			cp->memory_migrate = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_page")) {
			cp->memory_spread_page = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_slab")) {
			cp->memory_spread_slab = 1;
			goto eol;
		}
		if (streq(tok, "cpu") || streq(tok, "cpus")) {
			if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (streq(tok, "mem") || streq(tok, "mems")) {
			if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (emsg)
			snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
		goto err;
eol:
		/* Anything left over on the line is an error. */
		if ((tok = strtok_r(0, " \t",
				    &ptr)) != NULL) {
			if (emsg)
				snprintf(emsg, elen, "Surplus token: '%s'",
					 tok);
			goto err;
		}
		continue;
	}

	free(linebuf);

	/* If only one of cpus/mems was given, derive the other locally. */
	if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
		cpuset_localcpus(cp->mems, cp->cpus);
	else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
		cpuset_localmems(cp->cpus, cp->mems);

	/*
	 * All cpuset attributes are determined in an import.
	 * Those that aren't explicitly specified are presumed
	 * to be unchanged (zero, if it's a freshly allocated
	 * struct cpuset.)
	 */

	cp->cpus_valid = 1;
	cp->mems_valid = 1;
	cp->cpu_exclusive_valid = 1;
	cp->mem_exclusive_valid = 1;
	cp->notify_on_release_valid = 1;
	cp->memory_migrate_valid = 1;
	cp->memory_pressure_enabled_valid = 1;
	cp->memory_spread_page_valid = 1;
	cp->memory_spread_slab_valid = 1;

	return 0;
err:
	if (elinenum)
		*elinenum = linenum;
	if (linebuf)
		free(linebuf);
	return -1;
}

/*
 * Pin current task CPU (and memory).
 *
 * The snapshot/compare loop retries until the task's cpuset placement
 * is identical before and after the operation, so the result is not
 * based on a placement that changed mid-flight.
 */
int cpuset_pin(int relcpu)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int cpu, r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);

		r = 0;
		if (cpuset_unpin() < 0)
			r = -1;
		cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
		if (cpuset_cpubind(cpu) < 0)
			r = -1;

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}

/* Return number CPUs in current tasks cpuset */
int cpuset_size()
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int r;

	if (check() < 0)
		return -1;

	/* Same snapshot/compare retry loop as cpuset_pin() above. */
	do {
		cpuset_free_placement(plc1);
		plc1 =
		    cpuset_get_placement(0);

		r = cpuset_cpus_weight(0);

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}

/*
 * Return relative CPU number, within current cpuset, last executed on.
 * Retries until the task's cpuset placement is unchanged across the
 * query, so the relative number matches a consistent placement.
 */
int cpuset_where()
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);

		r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}

/* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */
int cpuset_unpin()
{
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	/*
	 * Don't need cpuset_*_placement() guard against concurrent
	 * cpuset migration, because none of the following depends
	 * on the tasks cpuset placement.
	 */

	/* Reset CPU affinity to an all-set mask. */
	if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	bitmask_setall(cpus);
	if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
		goto err;

	/* Reset memory policy to the default (mems mask stays all-clear). */
	if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;
#if HAVE_DECL_MPOL_DEFAULT
	if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
			  bitmask_nbits(mems) + 1) < 0)
		goto err;
	r = 0;
#endif
	/* fall into ...
	 */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;

}

/*
 * Table mapping libcpuset API names to their entry points; scanned by
 * cpuset_function() to look an entry point up by name.
 */
struct cpuset_function_list {
	const char *fname;	/* public function name */
	void *func;		/* its address */
} flist[] = {
	{ "cpuset_version", cpuset_version },
	{ "cpuset_alloc", cpuset_alloc },
	{ "cpuset_free", cpuset_free },
	{ "cpuset_cpus_nbits", cpuset_cpus_nbits },
	{ "cpuset_mems_nbits", cpuset_mems_nbits },
	{ "cpuset_setcpus", cpuset_setcpus },
	{ "cpuset_setmems", cpuset_setmems },
	{ "cpuset_set_iopt", cpuset_set_iopt },
	{ "cpuset_set_sopt", cpuset_set_sopt },
	{ "cpuset_getcpus", cpuset_getcpus },
	{ "cpuset_getmems", cpuset_getmems },
	{ "cpuset_cpus_weight", cpuset_cpus_weight },
	{ "cpuset_mems_weight", cpuset_mems_weight },
	{ "cpuset_get_iopt", cpuset_get_iopt },
	{ "cpuset_get_sopt", cpuset_get_sopt },
	{ "cpuset_localcpus", cpuset_localcpus },
	{ "cpuset_localmems", cpuset_localmems },
	{ "cpuset_cpumemdist", cpuset_cpumemdist },
	{ "cpuset_cpu2node", cpuset_cpu2node },
	{ "cpuset_addr2node", cpuset_addr2node },
	{ "cpuset_create", cpuset_create },
	{ "cpuset_delete", cpuset_delete },
	{ "cpuset_query", cpuset_query },
	{ "cpuset_modify", cpuset_modify },
	{ "cpuset_getcpusetpath", cpuset_getcpusetpath },
	{ "cpuset_cpusetofpid", cpuset_cpusetofpid },
	{ "cpuset_mountpoint", cpuset_mountpoint },
	{ "cpuset_collides_exclusive", cpuset_collides_exclusive },
	{ "cpuset_nuke", cpuset_nuke },
	{ "cpuset_init_pidlist", cpuset_init_pidlist },
	{ "cpuset_pidlist_length", cpuset_pidlist_length },
	{ "cpuset_get_pidlist", cpuset_get_pidlist },
	{ "cpuset_freepidlist", cpuset_freepidlist },
	{ "cpuset_move", cpuset_move },
	{ "cpuset_move_all", cpuset_move_all },
	{ "cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks },
	{ "cpuset_migrate", cpuset_migrate },
	{ "cpuset_migrate_all", cpuset_migrate_all },
	{ "cpuset_reattach",
	  cpuset_reattach },
	{ "cpuset_open_memory_pressure", cpuset_open_memory_pressure },
	{ "cpuset_read_memory_pressure", cpuset_read_memory_pressure },
	{ "cpuset_close_memory_pressure", cpuset_close_memory_pressure },
	{ "cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu },
	{ "cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu },
	{ "cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem },
	{ "cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem },
	{ "cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu },
	{ "cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu },
	{ "cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem },
	{ "cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem },
	{ "cpuset_get_placement", cpuset_get_placement },
	{ "cpuset_equal_placement", cpuset_equal_placement },
	{ "cpuset_free_placement", cpuset_free_placement },
	{ "cpuset_fts_open", cpuset_fts_open },
	{ "cpuset_fts_read", cpuset_fts_read },
	{ "cpuset_fts_reverse", cpuset_fts_reverse },
	{ "cpuset_fts_rewind", cpuset_fts_rewind },
	{ "cpuset_fts_get_path", cpuset_fts_get_path },
	{ "cpuset_fts_get_stat", cpuset_fts_get_stat },
	{ "cpuset_fts_get_cpuset", cpuset_fts_get_cpuset },
	{ "cpuset_fts_get_errno", cpuset_fts_get_errno },
	{ "cpuset_fts_get_info", cpuset_fts_get_info },
	{ "cpuset_fts_close", cpuset_fts_close },
	{ "cpuset_cpubind", cpuset_cpubind },
	{ "cpuset_latestcpu", cpuset_latestcpu },
	{ "cpuset_membind", cpuset_membind },
	{ "cpuset_export", cpuset_export },
	{ "cpuset_import", cpuset_import },
	{ "cpuset_function", cpuset_function },
	{ "cpuset_pin", cpuset_pin },
	{ "cpuset_size", cpuset_size },
	{ "cpuset_where", cpuset_where },
	{ "cpuset_unpin", cpuset_unpin },
};

/* Return pointer to a libcpuset.so function, or NULL if name unknown */
void *cpuset_function(const char * function_name)
{
	unsigned int i;

	/* Linear scan of flist; the table is small. */
	for (i = 0; i <
	     sizeof(flist)/sizeof(flist[0]); i++)
		if (streq(function_name, flist[i].fname))
			return flist[i].func;
	return NULL;
}

/*
 * Fortran interface to basic cpuset routines: trailing-underscore
 * names, arguments passed by reference, as the wrappers below show.
 */
int cpuset_pin_(int *ptr_relcpu) {return cpuset_pin(*ptr_relcpu);}
int cpuset_size_(void) { return cpuset_size(); }
int cpuset_where_(void) { return cpuset_where(); }
int cpuset_unpin_(void) { return cpuset_unpin(); }

#endif /* HAVE_LINUX_MEMPOLICY_H */