libcpuset.c revision 359980f68b19c77c698b121b57a071dfe6e3ca31
1/* 2 * cpuset user library implementation. 3 * 4 * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved. 5 * 6 * Paul Jackson <pj@sgi.com> 7 */ 8 9/* 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU Lesser General Public License as published by 12 * the Free Software Foundation; either version 2.1 of the License, or 13 * (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public License 21 * along with this program; if not, write to the Free Software 22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 */ 24 25#define _XOPEN_SOURCE 500 /* need to see pread() */ 26#define _BSD_SOURCE 1 /* need to see syscall() */ 27#include <unistd.h> 28 29#include <ctype.h> 30#include <dirent.h> 31#include <errno.h> 32#include <fcntl.h> 33#include <fts.h> 34#include <limits.h> 35#include <signal.h> 36#include <stdint.h> 37#include <stdio.h> 38#include <stdlib.h> 39#include <string.h> 40#include <sys/stat.h> 41#include <sys/syscall.h> 42#include <sys/types.h> 43#include <time.h> 44#include <utime.h> 45#include <sys/utsname.h> /* for cpuset_would_crash_kernel() */ 46 47#include "bitmask.h" 48#include "cpuset.h" 49#include "common.h" 50#include "test.h" 51#include "linux_syscall_numbers.h" 52#include "config.h" 53#if HAVE_LINUX_MEMPOLICY_H 54#include <linux/mempolicy.h> 55 56/* Bump version, and update Change History, when libcpuset API changes */ 57#define CPUSET_VERSION 3 58 59/* 60 * For a history of what changed in each version, see the "Change 61 * History" section, at the end of the libcpuset master document. 
 */

int cpuset_version(void)
{
	return CPUSET_VERSION;
}

/*
 * In-memory (client side) image of a kernel cpuset.  Fields are only
 * pushed to / pulled from the kernel by other routines in this file;
 * this struct itself holds cached values plus per-field bookkeeping.
 */
struct cpuset {
	struct bitmask *cpus;
	struct bitmask *mems;
	char cpu_exclusive;
	char mem_exclusive;
	char mem_hardwall;
	char notify_on_release;
	char memory_migrate;
	char memory_pressure_enabled;
	char memory_spread_page;
	char memory_spread_slab;
	char sched_load_balance;
	int sched_relax_domain_level;

	/*
	 * Each field 'x' above gets an 'x_valid' field below.
	 * The apply_cpuset_settings() will only set those fields whose
	 * corresponding *_valid flags are set.  The cpuset_alloc()
	 * routine clears these flags as part of the clear in calloc(),
	 * and the various cpuset_set*() routines set these flags when
	 * setting the corresponding value.
	 *
	 * The purpose of these valid fields is to ensure that when
	 * we create a new cpuset, we don't accidentally overwrite
	 * some non-zero kernel default, such as an inherited
	 * memory_spread_* flag, just because the user application
	 * code didn't override the default zero settings resulting
	 * from the calloc() call in cpuset_alloc().
	 *
	 * The choice of 'char' for the type of the flags above,
	 * but a bitfield for the flags below, is somewhat capricious.
	 */
	unsigned cpus_valid:1;
	unsigned mems_valid:1;
	unsigned cpu_exclusive_valid:1;
	unsigned mem_exclusive_valid:1;
	unsigned mem_hardwall_valid:1;
	unsigned notify_on_release_valid:1;
	unsigned memory_migrate_valid:1;
	unsigned memory_pressure_enabled_valid:1;
	unsigned memory_spread_page_valid:1;
	unsigned memory_spread_slab_valid:1;
	unsigned sched_load_balance_valid:1;
	unsigned sched_relax_domain_level_valid:1;

	/*
	 * If the corresponding field above was modified since the last
	 * load from / store to the kernel, the flag below marks it dirty.
	 */
	unsigned cpus_dirty:1;
	unsigned mems_dirty:1;
	unsigned cpu_exclusive_dirty:1;
	unsigned mem_exclusive_dirty:1;
	unsigned mem_hardwall_dirty:1;
	unsigned notify_on_release_dirty:1;
	unsigned memory_migrate_dirty:1;
	unsigned memory_pressure_enabled_dirty:1;
	unsigned memory_spread_page_dirty:1;
	unsigned memory_spread_slab_dirty:1;
	unsigned sched_load_balance_dirty:1;
	unsigned sched_relax_domain_level_dirty:1;
};

/* Presumed cpuset file system mount point */
static const char *cpusetmnt = "/dev/cpuset";

/* Stashed copy of cpunodemap[], mapping each cpu to its node. */
static const char *mapfile = "/var/run/cpunodemap";

/* The primary source for the cpunodemap[] is available below here. */
static const char *sysdevices = "/sys/devices/system";

#define max(a,b) ((a) > (b) ? (a) : (b))
#define min(a,b) ((a) < (b) ? (a) : (b))

/* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
#define SMALL_BUFSZ 16

/*
 * The 'mask_size_file' is used to ferret out the kernel cpumask_t
 * and nodemask_t sizes.  The lines in this file that begin with the
 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
 * and nodemask string, respectively.  The lengths of these strings
 * reflect the kernel's internal cpumask_t and nodemask_t sizes,
 * which sizes are needed to correctly call the sched_setaffinity
 * and set_mempolicy system calls, and to size user level
 * bitmasks to match the kernels.
 */

static const char *mask_size_file = "/proc/self/status";
static const char *cpumask_prefix = "Cpus_allowed:\t";
static const char *nodemask_prefix = "Mems_allowed:\t";

/*
 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
 *
 * The first time we need these, we parse the Cpus_allowed and
 * Mems_allowed lines from mask_size_file ("/proc/self/status").
 */

static int cpumask_sz;
static int nodemask_sz;

/*
 * These defaults only kick in if we fail to size the kernel
 * cpumask and nodemask by reading the Cpus_allowed and
 * Mems_allowed fields from the /proc/self/status file.
 */

#define DEFCPUBITS (512)
#define DEFNODEBITS (DEFCPUBITS/2)

/*
 * Arch-neutral API for obtaining NUMA distances between CPUs
 * and Memory Nodes, via the files:
 *	/sys/devices/system/node/nodeN/distance
 * which have lines such as:
 *	46 66 10 20
 * which say that for cpu on node N (from the path above), the
 * distance to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
 * respectively.
 */

static const char *distance_directory = "/sys/devices/system/node";

/*
 * Someday, we should disable, then later discard, the SN code
 * marked ALTERNATE_SN_DISTMAP.
 */

#define ALTERNATE_SN_DISTMAP 1
#ifdef ALTERNATE_SN_DISTMAP

/*
 * Alternative SN (SGI ia64) architecture specific API for obtaining
 * NUMA distances between CPUs and Memory Nodes is via the file
 * /proc/sgi_sn/sn_topology, which has lines such as:
 *
 *   node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
 *
 * which says that for each CPU on node 2, the distance to nodes
 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
 *
 * This file has other lines as well, which start with other
 * keywords than "node".  Ignore these other lines.
 */

static const char *sn_topology = "/proc/sgi_sn/sn_topology";
static const char *sn_top_node_prefix = "node ";

#endif

/*
 * Check that cpusets supported, /dev/cpuset mounted.
 * If ok, return 0.
 * If not, return -1 and set errno:
 *	ENOSYS - kernel doesn't support cpusets
 *	ENODEV - /dev/cpuset not mounted
 *
 * The result of the first probe is latched in check_state, so the
 * two stat() calls happen at most once per process.
 */

static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

static int check()
{
	if (check_state == check_notdone) {
		struct stat statbuf;

		/* /proc/self/cpuset exists iff the kernel supports cpusets */
		if (stat("/proc/self/cpuset", &statbuf) < 0) {
			check_state = check_enosys;
			goto done;
		}

		/* the 'tasks' file exists iff /dev/cpuset is mounted */
		if (stat("/dev/cpuset/tasks", &statbuf) < 0) {
			check_state = check_enodev;
			goto done;
		}

		check_state = check_ok;
	}
done:
	switch (check_state) {
	case check_enosys:
		errno = ENOSYS;
		return -1;
	case check_enodev:
		errno = ENODEV;
		return -1;
	default:
		break;
	}
	return 0;
}

/* Strip any trailing newline and/or carriage-return chars from s, in place. */
static void chomp(char *s)
{
	char *t;

	for (t = s + strlen(s) - 1; t >= s; t--) {
		if (*t == '\n' || *t == '\r')
			*t = '\0';
		else
			break;
	}
}

/*
 * Determine number of bytes in a seekable open file, without
 * assuming that stat(2) on that file has a useful size.
 * Has side effect of leaving the file rewound to the beginning.
 */
static int filesize(FILE * fp)
{
	int sz = 0;
	rewind(fp);
	while (fgetc(fp) != EOF)
		sz++;
	rewind(fp);
	return sz;
}

/* Are strings s1 and s2 equal? */
static int streq(const char *s1, const char *s2)
{
	return strcmp(s1, s2) == 0;
}

/* Is string 'pre' a prefix of string 's'? */
static int strprefix(const char *s, const char *pre)
{
	return strncmp(s, pre, strlen(pre)) == 0;
}

/*
 * char *flgets(char *buf, int buflen, FILE *fp)
 *
 * Obtain one line from input file fp.  Copy up to first
 * buflen-1 chars of line into buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to buffer buf
 * on success, or NULL if nothing more to read or failure.
 */

static char *flgets(char *buf, int buflen, FILE * fp)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = getc(fp)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	/* EOF before any char was copied: no more lines */
	if ((c < 0) && (bp == buf))
		return NULL;

	/* buffer filled before newline: discard the rest of the line */
	if (c > 0) {
		while ((c = getc(fp)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}

/*
 * sgetc(const char *inputbuf, int *offsetptr)
 *
 * Return next char from nul-terminated input buffer inputbuf,
 * starting at offset *offsetptr.  Increment *offsetptr.
 * If next char would be nul ('\0'), return EOF and don't
 * increment *offsetptr.
349 */ 350 351static int sgetc(const char *inputbuf, int *offsetptr) 352{ 353 char c; 354 355 if ((c = inputbuf[*offsetptr]) != 0) { 356 *offsetptr = *offsetptr + 1; 357 return c; 358 } else { 359 return EOF; 360 } 361} 362 363/* 364 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr) 365 * 366 * Obtain next line from nul-terminated input buffer 'inputbuf', 367 * starting at offset *offsetptr. Copy up to first buflen-1 368 * chars of line into output buffer buf, discarding any remainder 369 * of line. Stop reading at newline, discarding newline. 370 * Nul terminate result and return pointer to output buffer 371 * buf on success, or NULL if nothing more to read. 372 */ 373 374static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr) 375{ 376 int c = -1; 377 char *bp; 378 379 bp = buf; 380 while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) { 381 if (c == '\n') 382 goto newline; 383 *bp++ = c; 384 } 385 if ((c < 0) && (bp == buf)) 386 return NULL; 387 388 if (c > 0) { 389 while ((c = sgetc(inputbuf, offsetptr)) >= 0) { 390 if (c == '\n') 391 break; 392 } 393 } 394 395newline: 396 *bp++ = '\0'; 397 return buf; 398} 399 400/* 401 * time_t get_mtime(char *path) 402 * 403 * Return modtime of file at location path, else return 0. 404 */ 405 406static time_t get_mtime(const char *path) 407{ 408 struct stat statbuf; 409 410 if (stat(path, &statbuf) != 0) 411 return 0; 412 return statbuf.st_mtime; 413} 414 415/* 416 * int set_mtime(const char *path, time_t mtime) 417 * 418 * Set modtime of file 'path' to 'mtime'. Return 0 on success, 419 * or -1 on error, setting errno. 420 */ 421 422static int set_mtime(const char *path, time_t mtime) 423{ 424 struct utimbuf times; 425 426 times.actime = mtime; 427 times.modtime = mtime; 428 return utime(path, ×); 429} 430 431/* 432 * True if two pathnames resolve to same file. 433 * False if either path can not be stat'd, 434 * or if the two paths resolve to a different file. 
 */

static int samefile(const char *path1, const char *path2)
{
	struct stat sb1, sb2;

	if (stat(path1, &sb1) != 0)
		return 0;
	if (stat(path2, &sb2) != 0)
		return 0;
	/* same inode on same device <=> same file */
	return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
}

#define slash(c) (*(c) == '/')
#define eocomp(c) (slash(c) || !*(c))
#define dot1(c) (*(c) == '.' && eocomp(c+1))

/*
 * In place path compression.  Remove extra dots and slashes.
 * Collapses runs of slashes and drops single-dot components; 'b' chases
 * 'a' through the buffer, so the result is never longer than the input.
 * An empty result becomes ".".  Does NOT resolve ".." components.
 */
static char *pathcomp(char *p)
{
	char *a = p;
	char *b = p;

	if (!p || !*p)
		return p;
	if (slash(p))
		*b++ = *a++;
	for (;;) {
		if (slash(a))
			while (slash(++a))
				continue;
		if (!*a) {
			if (b == p)
				*b++ = '.';
			*b = '\0';
			return (p);
		} else if (dot1(a)) {
			a++;
		} else {
			if ((b != p) && !slash(b - 1))
				*b++ = '/';
			while (!eocomp(a))
				*b++ = *a++;
		}
	}
}

#undef slash
#undef eocomp
#undef dot1

/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Return buf, of length buflen, with compressed "name1/name2" stored in it.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	(void)snprintf(buf, buflen, "%s/%s", name1, name2);
	return pathcomp(buf);
}

/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Return buf, of length buflen, with compressed "name1/name2/name3"
 * stored in it.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	(void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	return pathcomp(buf);
}

/*
 * fullpath(buf, buflen, name)
 *
 * Put full path of cpuset 'name' in buffer 'buf'.  If name
 * starts with a slash (``/``) character, then this a path
 * relative to ``/dev/cpuset``, otherwise it is relative to
 * the current tasks cpuset.  Return 0 on success, else
 * -1 on error, setting errno.
 */

static int fullpath(char *buf, int buflen, const char *name)
{
	int len;

	/* easy case: absolute name is relative to the cpuset mount point */
	if (*name == '/') {
		pathcat2(buf, buflen, cpusetmnt, name);
		pathcomp(buf);
		return 0;
	}

	/* hard case: name is relative to the current task's cpuset */
	snprintf(buf, buflen, "%s/", cpusetmnt);
	len = strlen(buf);
	if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
		return -1;
	/*
	 * NOTE(review): 'buflen' is int while strlen() is size_t, so the
	 * RHS is evaluated in unsigned arithmetic; if strlen(name) exceeds
	 * buflen-1 the subtraction wraps — confirm inputs are bounded.
	 */
	if (strlen(buf) >= buflen - 1 - strlen(name)) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name);
	pathcomp(buf);
	return 0;
}

/*
 * fullpath2(buf, buflen, name1, name2)
 *
 * Like fullpath(), only concatenate two pathname components on end.
 */

static int fullpath2(char *buf, int buflen, const char *name1,
		     const char *name2)
{
	if (fullpath(buf, buflen, name1) < 0)
		return -1;
	if (strlen(buf) >= buflen - 1 - strlen(name2)) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name2);
	pathcomp(buf);
	return 0;
}

/*
 * Convert the string length of an ascii hex mask to the number
 * of bits represented by that mask.
 *
 * The cpumask and nodemask values in /proc/self/status are in an
 * ascii format that uses 9 characters for each 32 bits of mask.
 */
static int s2nbits(const char *s)
{
	return strlen(s) * 32 / 9;
}

/*
 * Parse /proc/self/status to size cpumask_sz and nodemask_sz,
 * falling back to DEFCPUBITS/DEFNODEBITS if that fails.
 */
static void update_mask_sizes()
{
	FILE *fp = NULL;
	char *buf = NULL;
	int fsize;

	if ((fp = fopen(mask_size_file, "r")) == NULL)
		goto done;
	fsize = filesize(fp);
	if ((buf = malloc(fsize)) == NULL)
		goto done;

	/*
	 * Beware: mask sizing arithmetic is fussy.
	 * The trailing newline left by fgets() is required.
	 */
	while (fgets(buf, fsize, fp)) {
		if (strprefix(buf, cpumask_prefix))
			cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
		if (strprefix(buf, nodemask_prefix))
			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
	}
done:
	if (buf != NULL)
		free(buf);
	if (fp != NULL)
		fclose(fp);
	if (cpumask_sz == 0)
		cpumask_sz = DEFCPUBITS;
	if (nodemask_sz == 0)
		nodemask_sz = DEFNODEBITS;
}

/* Allocate a new struct cpuset.  Returns NULL on allocation failure. */
struct cpuset *cpuset_alloc()
{
	struct cpuset *cp = NULL;
	int nbits;

	if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
		goto err;

	nbits = cpuset_cpus_nbits();
	if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
		goto err;

	nbits = cpuset_mems_nbits();
	if ((cp->mems = bitmask_alloc(nbits)) == NULL)
		goto err;

	return cp;
err:
	if (cp && cp->cpus)
		bitmask_free(cp->cpus);
	if (cp && cp->mems)
		bitmask_free(cp->mems);
	if (cp)
		free(cp);
	return NULL;
}

/* Free struct cpuset *cp (and its masks).  NULL is a no-op. */
void cpuset_free(struct cpuset *cp)
{
	if (!cp)
		return;
	if (cp->cpus)
		bitmask_free(cp->cpus);
	if (cp->mems)
		bitmask_free(cp->mems);
	free(cp);
}

/* Number of bits in a CPU bitmask on current system (lazily sized) */
int cpuset_cpus_nbits()
{
	if (cpumask_sz == 0)
		update_mask_sizes();
	return cpumask_sz;
}

/* Number of bits in a Memory bitmask on current system (lazily sized) */
int cpuset_mems_nbits()
{
	if (nodemask_sz == 0)
		update_mask_sizes();
	return nodemask_sz;
}

/* Set CPUs in cpuset cp to bitmask cpus (copies; marks field dirty) */
int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
{
	if (cp->cpus)
		bitmask_free(cp->cpus);
	cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
	if (cp->cpus == NULL)
		return -1;
	bitmask_copy(cp->cpus, cpus);
	cp->cpus_valid = 1;
	cp->cpus_dirty = 1;
	return 0;
}

/* Set Memory Nodes in cpuset cp to
bitmask mems */ 685int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems) 686{ 687 if (cp->mems) 688 bitmask_free(cp->mems); 689 cp->mems = bitmask_alloc(bitmask_nbits(mems)); 690 if (cp->mems == NULL) 691 return -1; 692 bitmask_copy(cp->mems, mems); 693 cp->mems_valid = 1; 694 cp->mems_dirty = 1; 695 return 0; 696} 697 698/* Set integer value optname of cpuset cp */ 699int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value) 700{ 701 if (streq(optionname, "cpu_exclusive")) { 702 cp->cpu_exclusive = ! !value; 703 cp->cpu_exclusive_valid = 1; 704 cp->cpu_exclusive_dirty = 1; 705 } else if (streq(optionname, "mem_exclusive")) { 706 cp->mem_exclusive = ! !value; 707 cp->mem_exclusive_valid = 1; 708 cp->mem_exclusive_dirty = 1; 709 } else if (streq(optionname, "mem_hardwall")) { 710 cp->mem_hardwall = ! !value; 711 cp->mem_hardwall_valid = 1; 712 cp->mem_hardwall_dirty = 1; 713 } else if (streq(optionname, "notify_on_release")) { 714 cp->notify_on_release = ! !value; 715 cp->notify_on_release_valid = 1; 716 cp->notify_on_release_dirty = 1; 717 } else if (streq(optionname, "memory_pressure_enabled")) { 718 cp->memory_pressure_enabled = ! !value; 719 cp->memory_pressure_enabled_valid = 1; 720 cp->memory_pressure_enabled_dirty = 1; 721 } else if (streq(optionname, "memory_migrate")) { 722 cp->memory_migrate = ! !value; 723 cp->memory_migrate_valid = 1; 724 cp->memory_migrate_dirty = 1; 725 } else if (streq(optionname, "memory_spread_page")) { 726 cp->memory_spread_page = ! !value; 727 cp->memory_spread_page_valid = 1; 728 cp->memory_spread_page_dirty = 1; 729 } else if (streq(optionname, "memory_spread_slab")) { 730 cp->memory_spread_slab = ! !value; 731 cp->memory_spread_slab_valid = 1; 732 cp->memory_spread_slab_dirty = 1; 733 } else if (streq(optionname, "sched_load_balance")) { 734 cp->sched_load_balance = ! 
!value; 735 cp->sched_load_balance_valid = 1; 736 cp->sched_load_balance_dirty = 1; 737 } else if (streq(optionname, "sched_relax_domain_level")) { 738 cp->sched_relax_domain_level = value; 739 cp->sched_relax_domain_level_valid = 1; 740 cp->sched_relax_domain_level_dirty = 1; 741 } else 742 return -2; /* optionname not recognized */ 743 return 0; 744} 745 746/* [optional] Set string value optname */ 747int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname, 748 UNUSED const char *value) 749{ 750 return -2; /* For now, all string options unrecognized */ 751} 752 753/* Return handle for reading memory_pressure. */ 754int cpuset_open_memory_pressure(const char *cpusetpath) 755{ 756 char buf[PATH_MAX]; 757 758 fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure"); 759 return open(buf, O_RDONLY); 760} 761 762/* Return current memory_pressure of cpuset. */ 763int cpuset_read_memory_pressure(int han) 764{ 765 char buf[SMALL_BUFSZ]; 766 767 if (pread(han, buf, sizeof(buf), 0L) < 0) 768 return -1; 769 return atoi(buf); 770} 771 772/* Close handle for reading memory pressure. */ 773void cpuset_close_memory_pressure(int han) 774{ 775 close(han); 776} 777 778/* 779 * Resolve cpuset pointer (to that of current task if cp == NULL). 780 * 781 * If cp not NULL, just return it. If cp is NULL, return pointer 782 * to temporary cpuset for current task, and set *cp_tofree to 783 * pointer to that same temporary cpuset, to be freed later. 784 * 785 * Return NULL and set errno on error. Errors can occur when 786 * resolving the current tasks cpuset. 
 */
static const struct cpuset *resolve_cp(const struct cpuset *cp,
				       struct cpuset **cp_tofree)
{
	const struct cpuset *rcp;

	if (cp) {
		rcp = cp;
	} else {
		/* snapshot the current task's cpuset; caller frees via *cp_tofree */
		struct cpuset *cp1 = cpuset_alloc();
		if (cp1 == NULL)
			goto err;
		if (cpuset_cpusetofpid(cp1, 0) < 0) {
			cpuset_free(cp1);
			goto err;
		}
		*cp_tofree = cp1;
		rcp = cp1;
	}
	return rcp;
err:
	return NULL;
}

/* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(cpus, cp1->cpus);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(mems, cp1->mems);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Return number of CPUs in cpuset cp (current task if cp == NULL), -1 on error */
int cpuset_cpus_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->cpus);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
int cpuset_mems_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->mems);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/*
 * Return integer value of option optname in cp.
 * Reads the cached in-memory struct, not the kernel.
 * Returns -2 if optionname is not recognized.
 */
int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
{
	if (streq(optionname, "cpu_exclusive"))
		return cp->cpu_exclusive;
	else if (streq(optionname, "mem_exclusive"))
		return cp->mem_exclusive;
	else if (streq(optionname, "mem_hardwall"))
		return cp->mem_hardwall;
	else if (streq(optionname, "notify_on_release"))
		return cp->notify_on_release;
	else if (streq(optionname, "memory_pressure_enabled"))
		return cp->memory_pressure_enabled;
	else if (streq(optionname, "memory_migrate"))
		return cp->memory_migrate;
	else if (streq(optionname, "memory_spread_page"))
		return cp->memory_spread_page;
	else if (streq(optionname, "memory_spread_slab"))
		return cp->memory_spread_slab;
	else if (streq(optionname, "sched_load_balance"))
		return cp->sched_load_balance;
	else if (streq(optionname, "sched_relax_domain_level"))
		return cp->sched_relax_domain_level;
	else
		return -2;	/* optionname not recognized */
}

/* [optional] Return string value of optname */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
			    UNUSED const char *optionname)
{
	return NULL;		/* For now, all string options unrecognized */
}

/* Read a boolean "0"/"1" file into *flagp.  Return 0 on success, -1 on error. */
static int read_flag(const char *filepath, char *flagp)
{
	char buf[SMALL_BUFSZ];	/* buffer a "0" or "1" flag line */
	int fd = -1;

	if ((fd = open(filepath,
O_RDONLY)) < 0) 931 goto err; 932 if (read(fd, buf, sizeof(buf)) < 1) 933 goto err; 934 if (atoi(buf)) 935 *flagp = 1; 936 else 937 *flagp = 0; 938 close(fd); 939 return 0; 940err: 941 if (fd >= 0) 942 close(fd); 943 return -1; 944} 945 946static int load_flag(const char *path, char *flagp, const char *flag) 947{ 948 char buf[PATH_MAX]; 949 950 pathcat2(buf, sizeof(buf), path, flag); 951 return read_flag(buf, flagp); 952} 953 954static int read_number(const char *filepath, int *numberp) 955{ 956 char buf[SMALL_BUFSZ]; 957 int fd = -1; 958 959 if ((fd = open(filepath, O_RDONLY)) < 0) 960 goto err; 961 if (read(fd, buf, sizeof(buf)) < 1) 962 goto err; 963 *numberp = atoi(buf); 964 close(fd); 965 return 0; 966err: 967 if (fd >= 0) 968 close(fd); 969 return -1; 970} 971 972static int load_number(const char *path, int *numberp, const char *file) 973{ 974 char buf[PATH_MAX]; 975 976 pathcat2(buf, sizeof(buf), path, file); 977 return read_number(buf, numberp); 978} 979 980static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits) 981{ 982 FILE *fp = NULL; 983 char *buf = NULL; 984 int buflen; 985 struct bitmask *bmp = NULL; 986 987 if ((fp = fopen(filepath, "r")) == NULL) 988 goto err; 989 buflen = filesize(fp) + 1; /* + 1 for nul term */ 990 if ((buf = malloc(buflen)) == NULL) 991 goto err; 992 if (flgets(buf, buflen, fp) == NULL) 993 goto err; 994 fclose(fp); 995 fp = NULL; 996 997 if ((bmp = bitmask_alloc(nbits)) == NULL) 998 goto err; 999 if (*buf && bitmask_parselist(buf, bmp) < 0) 1000 goto err; 1001 if (*bmpp) 1002 bitmask_free(*bmpp); 1003 *bmpp = bmp; 1004 free(buf); 1005 buf = NULL; 1006 return 0; 1007err: 1008 if (buf != NULL) 1009 free(buf); 1010 if (fp != NULL) 1011 fclose(fp); 1012 if (bmp != NULL) 1013 bitmask_free(bmp); 1014 return -1; 1015} 1016 1017static int load_mask(const char *path, struct bitmask **bmpp, 1018 int nbits, const char *mask) 1019{ 1020 char buf[PATH_MAX]; 1021 1022 pathcat2(buf, sizeof(buf), path, mask); 1023 return 
read_mask(buf, bmpp, nbits); 1024} 1025 1026/* Write string to file at given filepath. Create or truncate file. */ 1027static int write_string_file(const char *filepath, const char *str) 1028{ 1029 int fd = -1; 1030 1031 if ((fd = open(filepath, O_WRONLY | O_CREAT, 0644)) < 0) 1032 goto err; 1033 if (write(fd, str, strlen(str)) < 0) 1034 goto err; 1035 close(fd); 1036 return 0; 1037err: 1038 if (fd >= 0) 1039 close(fd); 1040 return -1; 1041} 1042 1043/* Size and allocate buffer. Write bitmask into it. Caller must free */ 1044static char *sprint_mask_buf(const struct bitmask *bmp) 1045{ 1046 char *buf = NULL; 1047 int buflen; 1048 char c; 1049 1050 /* First bitmask_displaylist() call just to get the length */ 1051 buflen = bitmask_displaylist(&c, 1, bmp) + 1; /* "+ 1" for nul */ 1052 if ((buf = malloc(buflen)) == NULL) 1053 return NULL; 1054 bitmask_displaylist(buf, buflen, bmp); 1055 return buf; 1056} 1057 1058static int exists_flag(const char *path, const char *flag) 1059{ 1060 char buf[PATH_MAX]; 1061 struct stat statbuf; 1062 int rc; 1063 1064 pathcat2(buf, sizeof(buf), path, flag); 1065 rc = (stat(buf, &statbuf) == 0); 1066 errno = 0; 1067 return rc; 1068} 1069 1070static int store_flag(const char *path, const char *flag, int val) 1071{ 1072 char buf[PATH_MAX]; 1073 1074 pathcat2(buf, sizeof(buf), path, flag); 1075 return write_string_file(buf, val ? 
"1" : "0"); 1076} 1077 1078static int store_number(const char *path, const char *file, int val) 1079{ 1080 char buf[PATH_MAX]; 1081 char data[SMALL_BUFSZ]; 1082 1083 memset(data, 0, sizeof(data)); 1084 pathcat2(buf, sizeof(buf), path, file); 1085 snprintf(data, sizeof(data), "%d", val); 1086 return write_string_file(buf, data); 1087} 1088 1089static int store_mask(const char *path, const char *mask, 1090 const struct bitmask *bmp) 1091{ 1092 char maskpath[PATH_MAX]; 1093 char *bp = NULL; 1094 int rc; 1095 1096 if (bmp == NULL) 1097 return 0; 1098 pathcat2(maskpath, sizeof(maskpath), path, mask); 1099 if ((bp = sprint_mask_buf(bmp)) == NULL) 1100 return -1; 1101 rc = write_string_file(maskpath, bp); 1102 free(bp); 1103 return rc; 1104} 1105 1106/* 1107 * Return 1 if 'cpu' is online, else 0 if offline. Tests the file 1108 * /sys/devices/system/cpu/cpuN/online file for 0 or 1 contents 1109 * were N == cpu number. 1110 */ 1111 1112char cpu_online(unsigned int cpu) 1113{ 1114 char online; 1115 char cpupath[PATH_MAX]; 1116 1117 (void)snprintf(cpupath, sizeof(cpupath), 1118 "/sys/devices/system/cpu/cpu%d/online", cpu); 1119 if (read_flag(cpupath, &online) < 0) 1120 return 0; /* oops - guess that cpu's not there */ 1121 return online; 1122} 1123 1124/* 1125 * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()), 1126 * to the node on which that cpu resides or cpuset_mems_nbits(). 1127 * 1128 * To avoid every user having to recalculate this relation 1129 * from various clues in the sysfs file system (below the 1130 * path /sys/devices/system) a copy of this map is kept at 1131 * /var/run/cpunodemap. 1132 * 1133 * The system automatically cleans out files below 1134 * /var/run on each system reboot (see the init script 1135 * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry 1136 * about stale data in this file across reboots. 
If the file 1137 * is missing, let the first process that needs it, and has 1138 * permission to write in the /var/run directory, rebuild it. 1139 * 1140 * If using this cached data, remember the mtime of the mapfile 1141 * the last time we read it in case something like a hotplug 1142 * event results in the file being removed and rebuilt, so we 1143 * can detect if we're using a stale cache, and need to reload. 1144 * 1145 * The mtime of this file is set to the time when we did 1146 * the recalculation of the map, from the clues beneath 1147 * /sys/devices/system. This is done so that a program 1148 * won't see the mapfile it just wrote as being newer than what 1149 * it just wrote out (store_map) and read the same map back in 1150 * (load_file). 1151 */ 1152 1153/* 1154 * Hold flockfile(stdin) while using cpunodemap for posix thread safety. 1155 * 1156 * Note on locking and flockfile(FILE *): 1157 * 1158 * We use flockfile() and funlockfile() instead of directly 1159 * calling pthread_mutex_lock and pthread_mutex_unlock on 1160 * a pthread_mutex_t, because this avoids forcing the app 1161 * to link with libpthread. The glibc implementation of 1162 * flockfile/funlockfile will fall back to no-ops if libpthread 1163 * doesn't happen to be linked. 1164 * 1165 * Since flockfile already has the moderately convoluted 1166 * combination of weak and strong symbols required to accomplish 1167 * this, it is easier to use flockfile() on some handy FILE * 1168 * stream as a surrogate for pthread locking than it is to so 1169 * re-invent that wheel. 1170 * 1171 * Forcing all apps that use cpusets to link with libpthread 1172 * would force non-transparent initialization on apps that 1173 * might not be prepared to handle it. 1174 * 1175 * The application using libcpuset should never notice this 1176 * odd use of flockfile(), because we never return to the 1177 * application from any libcpuset call with any such lock held. 
 * We just use this locking for guarding some non-atomic cached
 * data updates and accesses, internal to some libcpuset calls.
 * Also, flockfile() allows recursive nesting, so if the app
 * calls libcpuset holding such a file lock, we won't deadlock
 * if we go to acquire the same lock.  We'll just get the lock
 * and increment its counter while we hold it.
 */

/* Cached <cpu, node> map, shared by get_map()/load_map()/store_map(). */
static struct cpunodemap {
	int *map;		/* map[cpumask_sz]: maps cpu to its node */
	time_t mtime;		/* modtime of mapfile when last read */
} cpunodemap;

/*
 * rebuild_map() - Rebuild cpunodemap[] from scratch.
 *
 * Situation:
 *	Neither our in-memory cpunodemap[] array nor the
 *	cache of it in mapfile is current.
 * Action:
 *	Rebuild it from first principles and the information
 *	available below /sys/devices/system.
 */

static void rebuild_map()
{
	char buf[PATH_MAX];
	DIR *dir1, *dir2;
	struct dirent *dent1, *dent2;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	/*
	 * -1 == "no known node" here.  NOTE(review): load_map() uses
	 * nmems as its "no known node" sentinel instead; both values
	 * are filtered out by the (unsigned) range checks in the users
	 * visible in this file, but confirm no caller tests for -1
	 * explicitly before unifying the two sentinels.
	 */
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
		cpunodemap.map[cpu] = -1;
	pathcat2(buf, sizeof(buf), sysdevices, "node");
	if ((dir1 = opendir(buf)) == NULL)
		return;
	/* walk /sys/devices/system/node/nodeN/cpuM entries */
	while ((dent1 = readdir(dir1)) != NULL) {
		if (sscanf(dent1->d_name, "node%u", &mem) < 1)
			continue;
		pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
		if ((dir2 = opendir(buf)) == NULL)
			continue;
		while ((dent2 = readdir(dir2)) != NULL) {
			/* non-cpu entries fail the sscanf and are skipped */
			if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
				continue;
			if (cpu >= (unsigned int)ncpus
			    || mem >= (unsigned int)nmems)
				continue;
			cpunodemap.map[cpu] = mem;
		}
		closedir(dir2);
	}
	closedir(dir1);
	/* stamp the rebuild time; see the mapfile mtime discussion above */
	cpunodemap.mtime = time(0);
}

/*
 * load_map() - Load cpunodemap[] from mapfile.
1238 * 1239 * Situation: 1240 * The cpunodemap in mapfile is more recent than 1241 * what we have in the cpunodemap[] array. 1242 * Action: 1243 * Reload the cpunodemap[] array from the file. 1244 */ 1245 1246static void load_map() 1247{ 1248 char buf[SMALL_BUFSZ]; /* buffer 1 line of mapfile */ 1249 FILE *mapfp; /* File stream on mapfile */ 1250 int ncpus = cpuset_cpus_nbits(); 1251 int nmems = cpuset_mems_nbits(); 1252 unsigned int cpu, mem; 1253 1254 if ((cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL) 1255 return; 1256 cpunodemap.mtime = get_mtime(mapfile); 1257 if ((mapfp = fopen(mapfile, "r")) == NULL) 1258 return; 1259 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) 1260 cpunodemap.map[cpu] = nmems; 1261 while (flgets(buf, sizeof(buf), mapfp) != NULL) { 1262 if (sscanf(buf, "%u %u", &cpu, &mem) < 2) 1263 continue; 1264 if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems) 1265 continue; 1266 cpunodemap.map[cpu] = mem; 1267 } 1268 fclose(mapfp); 1269} 1270 1271/* 1272 * store_map() - Write cpunodemap[] out to mapfile. 1273 * 1274 * Situation: 1275 * The cpunodemap in the cpunodemap[] array is 1276 * more recent than the one in mapfile. 1277 * Action: 1278 * Write cpunodemap[] out to mapfile. 
1279 */ 1280 1281static void store_map() 1282{ 1283 char buf[PATH_MAX]; 1284 int fd = -1; 1285 FILE *mapfp = NULL; 1286 int ncpus = cpuset_cpus_nbits(); 1287 int nmems = cpuset_mems_nbits(); 1288 unsigned int cpu, mem; 1289 1290 snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX"); 1291 if ((fd = mkstemp(buf)) < 0) 1292 goto err; 1293 if ((mapfp = fdopen(fd, "w")) == NULL) 1294 goto err; 1295 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1296 mem = cpunodemap.map[cpu]; 1297 if (mem < (unsigned int)nmems) 1298 fprintf(mapfp, "%u %u\n", cpu, mem); 1299 } 1300 fclose(mapfp); 1301 set_mtime(buf, cpunodemap.mtime); 1302 if (rename(buf, mapfile) < 0) 1303 goto err; 1304 /* mkstemp() creates mode 0600 - change to world readable */ 1305 (void)chmod(mapfile, 0444); 1306 return; 1307err: 1308 if (mapfp != NULL) { 1309 fclose(mapfp); 1310 fd = -1; 1311 } 1312 if (fd >= 0) 1313 close(fd); 1314 (void)unlink(buf); 1315} 1316 1317/* 1318 * Load and gain thread safe access to the <cpu, node> map. 1319 * 1320 * Return 0 on success with flockfile(stdin) held. 1321 * Each successful get_map() call must be matched with a 1322 * following put_map() call to release the lock. 1323 * 1324 * On error, return -1 with errno set and no lock held. 
1325 */ 1326 1327static int get_map() 1328{ 1329 time_t file_mtime; 1330 1331 flockfile(stdin); 1332 1333 if (cpunodemap.map == NULL) { 1334 cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int)); 1335 if (cpunodemap.map == NULL) 1336 goto err; 1337 } 1338 1339 /* If no one has a good cpunodemap, rebuild from scratch */ 1340 file_mtime = get_mtime(mapfile); 1341 if (cpunodemap.mtime == 0 && file_mtime == 0) 1342 rebuild_map(); 1343 1344 /* If either cpunodemap[] or mapfile newer, update other with it */ 1345 file_mtime = get_mtime(mapfile); 1346 if (cpunodemap.mtime < file_mtime) 1347 load_map(); 1348 else if (cpunodemap.mtime > file_mtime) 1349 store_map(); 1350 return 0; 1351err: 1352 funlockfile(stdin); 1353 return -1; 1354} 1355 1356static void put_map() 1357{ 1358 funlockfile(stdin); 1359} 1360 1361/* Set cpus to those local to Memory Nodes mems */ 1362int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus) 1363{ 1364 int ncpus = cpuset_cpus_nbits(); 1365 unsigned int cpu; 1366 1367 if (check() < 0) 1368 return -1; 1369 1370 get_map(); 1371 bitmask_clearall(cpus); 1372 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1373 if (bitmask_isbitset(mems, cpunodemap.map[cpu])) 1374 bitmask_setbit(cpus, cpu); 1375 } 1376 put_map(); 1377 return 0; 1378} 1379 1380/* Set mems to those local to CPUs cpus */ 1381int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems) 1382{ 1383 int ncpus = cpuset_cpus_nbits(); 1384 unsigned int cpu; 1385 1386 if (check() < 0) 1387 return -1; 1388 1389 get_map(); 1390 bitmask_clearall(mems); 1391 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1392 if (bitmask_isbitset(cpus, cpu)) 1393 bitmask_setbit(mems, cpunodemap.map[cpu]); 1394 } 1395 put_map(); 1396 return 0; 1397} 1398 1399/* 1400 * distmap[] 1401 * 1402 * Array of ints of size cpumask_sz by nodemask_sz. 1403 * 1404 * Element distmap[cpu][mem] is the distance between CPU cpu 1405 * and Memory Node mem. 
Distances are weighted to roughly 1406 * approximate the cost of memory references, and scaled so that 1407 * the distance from a CPU to its local Memory Node is ten (10). 1408 * 1409 * The first call to cpuset_cpumemdist() builds this map, from 1410 * whatever means the kernel provides to obtain these distances. 1411 * 1412 * These distances derive from ACPI SLIT table entries, which are 1413 * eight bits in size. 1414 * 1415 * Hold flockfile(stdout) while using distmap for posix thread safety. 1416 */ 1417 1418typedef unsigned char distmap_entry_t; /* type of distmap[] entries */ 1419 1420static distmap_entry_t *distmap; /* maps <cpu, mem> to distance */ 1421 1422#define DISTMAP_MAX UCHAR_MAX /* maximum value in distmap[] */ 1423 1424#define I(i,j) ((i) * nmems + (j)) /* 2-D array index simulation */ 1425 1426/* 1427 * Parse arch neutral lines from 'distance' files of form: 1428 * 1429 * 46 66 10 20 1430 * 1431 * The lines contain a space separated list of distances, which is parsed 1432 * into array dists[] of each nodes distance from the specified node. 
1433 * 1434 * Result is placed in distmap[ncpus][nmems]: 1435 * 1436 * For each cpu c on node: 1437 * For each node position n in list of distances: 1438 * distmap[c][n] = dists[n] 1439 */ 1440 1441static int parse_distmap_line(unsigned int node, char *buf) 1442{ 1443 char *p, *q; 1444 int ncpus = cpuset_cpus_nbits(); 1445 int nmems = cpuset_mems_nbits(); 1446 unsigned int c, n; 1447 distmap_entry_t *dists = NULL; 1448 struct bitmask *cpus = NULL, *mems = NULL; 1449 int ret = -1; 1450 1451 p = buf; 1452 if ((dists = calloc(nmems, sizeof(*dists))) == NULL) 1453 goto err; 1454 for (n = 0; n < (unsigned int)nmems; n++) 1455 dists[n] = DISTMAP_MAX; 1456 1457 for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) { 1458 unsigned int d; 1459 1460 if ((p = strpbrk(p, "0123456789")) == NULL) 1461 break; 1462 d = strtoul(p, &q, 10); 1463 if (p == q) 1464 break; 1465 if (d < DISTMAP_MAX) 1466 dists[n] = (distmap_entry_t) d; 1467 } 1468 1469 if ((mems = bitmask_alloc(nmems)) == NULL) 1470 goto err; 1471 bitmask_setbit(mems, node); 1472 1473 if ((cpus = bitmask_alloc(ncpus)) == NULL) 1474 goto err; 1475 cpuset_localcpus(mems, cpus); 1476 1477 for (c = bitmask_first(cpus); c < (unsigned int)ncpus; 1478 c = bitmask_next(cpus, c + 1)) 1479 for (n = 0; n < (unsigned int)nmems; n++) 1480 distmap[I(c, n)] = dists[n]; 1481 ret = 0; 1482 /* fall into ... 
*/ 1483err: 1484 bitmask_free(mems); 1485 bitmask_free(cpus); 1486 free(dists); 1487 return ret; 1488} 1489 1490static int parse_distance_file(unsigned int node, const char *path) 1491{ 1492 FILE *fp; 1493 char *buf = NULL; 1494 int buflen; 1495 1496 if ((fp = fopen(path, "r")) == NULL) 1497 goto err; 1498 1499 buflen = filesize(fp); 1500 1501 if ((buf = malloc(buflen)) == NULL) 1502 goto err; 1503 1504 if (flgets(buf, buflen, fp) == NULL) 1505 goto err; 1506 1507 if (parse_distmap_line(node, buf) < 0) 1508 goto err; 1509 1510 free(buf); 1511 fclose(fp); 1512 return 0; 1513err: 1514 free(buf); 1515 if (fp) 1516 fclose(fp); 1517 return -1; 1518} 1519 1520static void build_distmap() 1521{ 1522 static int tried_before = 0; 1523 int ncpus = cpuset_cpus_nbits(); 1524 int nmems = cpuset_mems_nbits(); 1525 int c, m; 1526 DIR *dir = NULL; 1527 struct dirent *dent; 1528 1529 if (tried_before) 1530 goto err; 1531 tried_before = 1; 1532 1533 if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL) 1534 goto err; 1535 1536 for (c = 0; c < ncpus; c++) 1537 for (m = 0; m < nmems; m++) 1538 distmap[I(c, m)] = DISTMAP_MAX; 1539 1540 if ((dir = opendir(distance_directory)) == NULL) 1541 goto err; 1542 while ((dent = readdir(dir)) != NULL) { 1543 char buf[PATH_MAX]; 1544 unsigned int node; 1545 1546 if (sscanf(dent->d_name, "node%u", &node) < 1) 1547 continue; 1548 pathcat3(buf, sizeof(buf), distance_directory, dent->d_name, 1549 "distance"); 1550 if (parse_distance_file(node, buf) < 0) 1551 goto err; 1552 } 1553 closedir(dir); 1554 return; 1555err: 1556 if (dir) 1557 closedir(dir); 1558 free(distmap); 1559 distmap = NULL; 1560} 1561 1562#ifdef ALTERNATE_SN_DISTMAP 1563 1564/* 1565 * Parse SN architecture specific line of form: 1566 * 1567 * node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10 1568 * 1569 * Second field is node number. 
The "dist" field is the colon separated list
 * of distances, which is parsed into array dists[] of each nodes distance
 * from that node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 * For each cpu c on that node:
 *	For each node position n in list of distances:
 *		distmap[c][n] = dists[n]
 */

static void parse_distmap_line_sn(char *buf)
{
	char *p, *pend, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned long c, n, node;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;

	/* second whitespace-separated field is the node number */
	if ((p = strchr(buf, ' ')) == NULL)
		goto err;
	if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
		goto err;
	if ((p = strstr(q, " dist ")) == NULL)
		goto err;
	p += strlen(" dist ");
	/* terminate the dist list at the next space, if any */
	if ((pend = strchr(p, ' ')) != NULL)
		*pend = '\0';
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	/* parse the colon separated distance list */
	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned long d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	/* copy dists[] to every cpu on this node */
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	/* fall into ...
 */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
}

/*
 * build_distmap_sn() - alternate distmap[] builder using the SGI SN
 * sn_topology file.  Tries once per process; on failure distmap is
 * freed and left NULL.
 */
static void build_distmap_sn()
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	static int tried_before = 0;
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((fp = fopen(sn_topology, "r")) == NULL)
		goto err;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	buflen = filesize(fp);
	if ((buf = malloc(buflen)) == NULL)
		goto err;

	/* only "node ..." topology lines carry distance information */
	while (flgets(buf, buflen, fp) != NULL)
		if (strprefix(buf, sn_top_node_prefix))
			parse_distmap_line_sn(buf);

	free(buf);
	fclose(fp);
	return;
err:
	free(buf);
	free(distmap);
	distmap = NULL;
	if (fp)
		fclose(fp);
}

#endif

/* [optional] Hardware distance from CPU to Memory Node */
unsigned int cpuset_cpumemdist(int cpu, int mem)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	distmap_entry_t r = DISTMAP_MAX;

	/* flockfile(stdout) guards lazy construction of distmap[];
	 * see the locking discussion above get_map(). */
	flockfile(stdout);

	if (check() < 0)
		goto err;

	if (distmap == NULL)
		build_distmap();

#ifdef ALTERNATE_SN_DISTMAP
	if (distmap == NULL)
		build_distmap_sn();
#endif

	if (distmap == NULL)
		goto err;

	if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
		goto err;

	r = distmap[I(cpu, mem)];
	/* fall into ...
 */
err:
	/* on any error, r is still DISTMAP_MAX ("unknown/far") */
	funlockfile(stdout);
	return r;
}

/* [optional] Return Memory Node closest to cpu, or -1 on error */
int cpuset_cpu2node(int cpu)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	bitmask_setbit(cpus, cpu);

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	cpuset_localmems(cpus, mems);
	r = bitmask_first(mems);
	/* fall into ... */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;
}

/*
 * apply_cpuset_settings() - write each valid+dirty field of 'cp' to
 * the corresponding per-cpuset file under 'path'.
 *
 * Optional kernel files (memory_migrate, memory_pressure_enabled,
 * spread/sched flags) are only written if they exist.  Returns 0 on
 * success, -1 (with errno from the failing store) on first failure.
 */
static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
{
	if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
		if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
			goto err;
	}

	if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
		if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
			goto err;
	}

	if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
		if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
			goto err;
	}

	if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
		if (store_flag(path, "notify_on_release", cp->notify_on_release)
		    < 0)
			goto err;
	}

	if (cp->memory_migrate_valid &&
	    cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) {
		if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
			goto err;
	}

	if (cp->memory_pressure_enabled_valid &&
	    cp->memory_pressure_enabled_dirty &&
	    exists_flag(path, "memory_pressure_enabled")) {
		if (store_flag
		    (path, "memory_pressure_enabled",
		     cp->memory_pressure_enabled) < 0)
			goto err;
	}

	if (cp->memory_spread_page_valid &&
	    cp->memory_spread_page_dirty &&
	    exists_flag(path, "memory_spread_page")) {
		if (store_flag
		    (path, "memory_spread_page", cp->memory_spread_page) < 0)
			goto err;
	}

	if (cp->memory_spread_slab_valid &&
	    cp->memory_spread_slab_dirty &&
	    exists_flag(path, "memory_spread_slab")) {
		if (store_flag
		    (path, "memory_spread_slab", cp->memory_spread_slab) < 0)
			goto err;
	}

	if (cp->sched_load_balance_valid &&
	    cp->sched_load_balance_dirty &&
	    exists_flag(path, "sched_load_balance")) {
		if (store_flag
		    (path, "sched_load_balance", cp->sched_load_balance) < 0)
			goto err;
	}

	if (cp->sched_relax_domain_level_valid &&
	    cp->sched_relax_domain_level_dirty &&
	    exists_flag(path, "sched_relax_domain_level")) {
		if (store_number
		    (path, "sched_relax_domain_level",
		     cp->sched_relax_domain_level) < 0)
			goto err;
	}

	if (cp->cpus_valid && cp->cpus_dirty) {
		if (store_mask(path, "cpus", cp->cpus) < 0)
			goto err;
	}

	if (cp->mems_valid && cp->mems_dirty) {
		if (store_mask(path, "mems", cp->mems) < 0)
			goto err;
	}
	return 0;
err:
	return -1;
}

/*
 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
 *
 * Extract max value of any 'siblings' field in /proc/cpuinfo.
 * Cache the result - only need to extract once in lifetime of task.
 *
 * The siblings field is the number of logical CPUs in a physical
 * processor package.  It is equal to the product of the number of
 * cores in that package, times the number of hyper-threads per core.
 * The bug that cpuset_would_crash_kernel() is detecting arises
 * when a cpu_exclusive cpuset tries to include just some, not all,
 * of the sibling logical CPUs available in a processor package.
1835 * 1836 * In the improbable case that a system has mixed values of siblings 1837 * (some processor packages have more than others, perhaps due to 1838 * partially enabling Hyper-Threading), we take the worse case value, 1839 * the largest siblings value. This might be overkill. I don't know 1840 * if this kernel bug considers each processor package's siblings 1841 * separately or not. But it sure is easier this way ... 1842 * 1843 * This routine takes about 0.7 msecs on a 4 CPU 2.8 MHz Xeon, from 1844 * open to close, the first time called. 1845 */ 1846 1847static int get_siblings() 1848{ 1849 static int siblings; 1850 char buf[32]; /* big enough for one 'siblings' line */ 1851 FILE *fp; 1852 1853 if (siblings) 1854 return siblings; 1855 1856 if ((fp = fopen("/proc/cpuinfo", "r")) == NULL) 1857 return 4; /* wing it - /proc not mounted ? */ 1858 while (flgets(buf, sizeof(buf), fp) != NULL) { 1859 int s; 1860 1861 if (sscanf(buf, "siblings : %d", &s) < 1) 1862 continue; 1863 if (s > siblings) 1864 siblings = s; 1865 } 1866 fclose(fp); 1867 if (siblings == 0) 1868 siblings = 1; /* old kernel, no siblings, default to 1 */ 1869 return siblings; 1870} 1871 1872/* 1873 * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic 1874 * scheduler domain code invoked for cpu_exclusive cpusets that causes 1875 * the kernel to freeze, requiring a hardware reset. 1876 * 1877 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive' 1878 * cpuset is defined where that cpusets 'cpus' are not on package 1879 * boundaries then the kernel will freeze, usually as soon as this 1880 * cpuset is created, requiring a hardware reset. 1881 * 1882 * A cpusets 'cpus' are not on package boundaries if the cpuset 1883 * includes a proper non-empty subset (some, but not all) of the 1884 * logical cpus on a processor package. This requires multiple 1885 * logical CPUs per package, available with either Hyper-Thread or 1886 * Multi-Core support. 
Without one of these features, there is only 1887 * one logical CPU per physical package, and it's not possible to 1888 * have a proper, non-empty subset of a set of cardinality one. 1889 * 1890 * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC 1891 * on i386 and x86_64 arch's. 1892 * 1893 * The objective of this routine cpuset_would_crash_kernel() is to 1894 * determine if a proposed cpuset setting would crash the kernel due 1895 * to this bug, so that the caller can avoid the crash. 1896 * 1897 * Ideally we'd check for exactly these conditions here, but computing 1898 * the package (identified by the 'physical id' field of /proc/cpuinfo) 1899 * of each cpu in a cpuset is more effort than it's worth here. 1900 * 1901 * Also there is no obvious way to identify exactly whether the kernel 1902 * one is executing on has this bug, short of trying it, and seeing 1903 * if the kernel just crashed. 1904 * 1905 * So for now, we look for a simpler set of conditions, that meets 1906 * our immediate need - avoid this crash on SUSE SLES10 systems that 1907 * are susceptible to it. We look for the kernel version 2.6.16.*, 1908 * which is the base kernel of SUSE SLES10, and for i386 or x86_64 1909 * processors, which had CONFIG_SCHED_MC enabled. 1910 * 1911 * If these simpler conditions are met, we further simplify the check, 1912 * by presuming that the logical CPUs are numbered on processor 1913 * package boundaries. If each package has S siblings, we assume 1914 * that CPUs numbered N through N + S -1 are on the same package, 1915 * for any CPU N such that N mod S == 0. 1916 * 1917 * Yes, this is a hack, focused on avoiding kernel freezes on 1918 * susceptible SUSE SLES10 systems. 
1919 */ 1920 1921static int cpuset_would_crash_kernel(const struct cpuset *cp) 1922{ 1923 static int susceptible_system = -1; 1924 1925 if (!cp->cpu_exclusive) 1926 goto ok; 1927 1928 if (susceptible_system == -1) { 1929 struct utsname u; 1930 int rel_2_6_16, arch_i386, arch_x86_64; 1931 1932 if (uname(&u) < 0) 1933 goto fail; 1934 rel_2_6_16 = strprefix(u.release, "2.6.16."); 1935 arch_i386 = streq(u.machine, "i386"); 1936 arch_x86_64 = streq(u.machine, "x86_64"); 1937 susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64); 1938 } 1939 1940 if (susceptible_system) { 1941 int ncpus = cpuset_cpus_nbits(); 1942 int siblings = get_siblings(); 1943 unsigned int cpu; 1944 1945 for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) { 1946 int s, num_set = 0; 1947 1948 for (s = 0; s < siblings; s++) { 1949 if (bitmask_isbitset(cp->cpus, cpu + s)) 1950 num_set++; 1951 } 1952 1953 /* If none or all siblings set, we're still ok */ 1954 if (num_set == 0 || num_set == siblings) 1955 continue; 1956 1957 /* Found one that would crash kernel. Fail. */ 1958 errno = ENXIO; 1959 goto fail; 1960 } 1961 } 1962 /* If not susceptible, or if all ok, fall into "ok" ... 
*/ 1963ok: 1964 return 0; /* would not crash */ 1965fail: 1966 return 1; /* would crash */ 1967} 1968 1969/* compare two cpuset and mark the dirty variable */ 1970static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2) 1971{ 1972 if (cp1->cpu_exclusive_valid && 1973 cp1->cpu_exclusive != cp2->cpu_exclusive) 1974 cp1->cpu_exclusive_dirty = 1; 1975 1976 if (cp1->mem_exclusive_valid && 1977 cp1->mem_exclusive != cp2->mem_exclusive) 1978 cp1->mem_exclusive_dirty = 1; 1979 1980 if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall) 1981 cp1->mem_hardwall_dirty = 1; 1982 1983 if (cp1->notify_on_release_valid && 1984 cp1->notify_on_release != cp2->notify_on_release) 1985 cp1->notify_on_release_dirty = 1; 1986 1987 if (cp1->memory_migrate_valid && 1988 cp1->memory_migrate != cp2->memory_migrate) 1989 cp1->memory_migrate_dirty = 1; 1990 1991 if (cp1->memory_pressure_enabled_valid && 1992 cp1->memory_pressure_enabled != cp2->memory_pressure_enabled) 1993 cp1->memory_pressure_enabled_dirty = 1; 1994 1995 if (cp1->memory_spread_page_valid && 1996 cp1->memory_spread_page != cp2->memory_spread_page) 1997 cp1->memory_spread_page_dirty = 1; 1998 1999 if (cp1->memory_spread_slab_valid && 2000 cp1->memory_spread_slab != cp2->memory_spread_slab) 2001 cp1->memory_spread_slab_dirty = 1; 2002 2003 if (cp1->sched_load_balance_valid && 2004 cp1->sched_load_balance != cp2->sched_load_balance) 2005 cp1->sched_load_balance_dirty = 1; 2006 2007 if (cp1->sched_relax_domain_level_valid && 2008 cp1->sched_relax_domain_level != cp2->sched_relax_domain_level) 2009 cp1->sched_relax_domain_level_dirty = 1; 2010 2011 if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus)) 2012 cp1->cpus_dirty = 1; 2013 if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems)) 2014 cp1->mems_dirty = 1; 2015} 2016 2017/* Create (if new set) or modify cpuset 'cp' at location 'relpath' */ 2018static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new) 2019{ 
2020 char buf[PATH_MAX]; 2021 int do_rmdir_on_err = 0; 2022 int do_restore_cp_sav_on_err = 0; 2023 struct cpuset *cp_sav = NULL; 2024 int sav_errno; 2025 2026 if (check() < 0) 2027 goto err; 2028 2029 if (cpuset_would_crash_kernel(cp)) 2030 goto err; 2031 2032 fullpath(buf, sizeof(buf), relpath); 2033 2034 if (new) { 2035 if (mkdir(buf, 0755) < 0) 2036 goto err; 2037 /* we made it, so we should remove it on error */ 2038 do_rmdir_on_err = 1; 2039 } 2040 2041 if ((cp_sav = cpuset_alloc()) == NULL) 2042 goto err; 2043 if (cpuset_query(cp_sav, relpath) < 0) 2044 goto err; 2045 /* we have old settings to restore on error */ 2046 do_restore_cp_sav_on_err = 1; 2047 2048 /* check which variable need to restore on error */ 2049 mark_dirty_variable(cp_sav, cp); 2050 2051 if (apply_cpuset_settings(buf, cp) < 0) 2052 goto err; 2053 2054 cpuset_free(cp_sav); 2055 return 0; 2056err: 2057 sav_errno = errno; 2058 if (do_restore_cp_sav_on_err) 2059 (void)apply_cpuset_settings(buf, cp_sav); 2060 if (cp_sav) 2061 cpuset_free(cp_sav); 2062 if (do_rmdir_on_err) 2063 (void)rmdir(buf); 2064 errno = sav_errno; 2065 return -1; 2066} 2067 2068/* Create cpuset 'cp' at location 'relpath' */ 2069int cpuset_create(const char *relpath, const struct cpuset *cp) 2070{ 2071 return cr_or_mod(relpath, cp, 1); 2072} 2073 2074/* Delete cpuset at location 'path' (if empty) */ 2075int cpuset_delete(const char *relpath) 2076{ 2077 char buf[PATH_MAX]; 2078 2079 if (check() < 0) 2080 goto err; 2081 2082 fullpath(buf, sizeof(buf), relpath); 2083 if (rmdir(buf) < 0) 2084 goto err; 2085 2086 return 0; 2087err: 2088 return -1; 2089} 2090 2091/* Set cpuset cp to the cpuset at location 'path' */ 2092int cpuset_query(struct cpuset *cp, const char *relpath) 2093{ 2094 char buf[PATH_MAX]; 2095 2096 if (check() < 0) 2097 goto err; 2098 2099 fullpath(buf, sizeof(buf), relpath); 2100 2101 if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0) 2102 goto err; 2103 cp->cpu_exclusive_valid = 1; 2104 2105 if 
(load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0) 2106 goto err; 2107 cp->mem_exclusive_valid = 1; 2108 2109 if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0) 2110 goto err; 2111 cp->notify_on_release_valid = 1; 2112 2113 if (exists_flag(buf, "memory_migrate")) { 2114 if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0) 2115 goto err; 2116 cp->memory_migrate_valid = 1; 2117 } 2118 2119 if (exists_flag(buf, "mem_hardwall")) { 2120 if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0) 2121 goto err; 2122 cp->mem_hardwall_valid = 1; 2123 } 2124 2125 if (exists_flag(buf, "memory_pressure_enabled")) { 2126 if (load_flag 2127 (buf, &cp->memory_pressure_enabled, 2128 "memory_pressure_enabled") < 0) 2129 goto err; 2130 cp->memory_pressure_enabled_valid = 1; 2131 } 2132 2133 if (exists_flag(buf, "memory_spread_page")) { 2134 if (load_flag 2135 (buf, &cp->memory_spread_page, "memory_spread_page") < 0) 2136 goto err; 2137 cp->memory_spread_page_valid = 1; 2138 } 2139 2140 if (exists_flag(buf, "memory_spread_slab")) { 2141 if (load_flag 2142 (buf, &cp->memory_spread_slab, "memory_spread_slab") < 0) 2143 goto err; 2144 cp->memory_spread_slab_valid = 1; 2145 } 2146 2147 if (exists_flag(buf, "sched_load_balance")) { 2148 if (load_flag 2149 (buf, &cp->sched_load_balance, "sched_load_balance") < 0) 2150 goto err; 2151 cp->sched_load_balance_valid = 1; 2152 } 2153 2154 if (exists_flag(buf, "sched_relax_domain_level")) { 2155 if (load_number 2156 (buf, &cp->sched_relax_domain_level, 2157 "sched_relax_domain_level") < 0) 2158 goto err; 2159 cp->sched_relax_domain_level_valid = 1; 2160 } 2161 2162 if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0) 2163 goto err; 2164 cp->cpus_valid = 1; 2165 2166 if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0) 2167 goto err; 2168 cp->mems_valid = 1; 2169 2170 return 0; 2171err: 2172 return -1; 2173} 2174 2175/* Modify cpuset at location 'relpath' to values of 'cp' */ 2176int 
cpuset_modify(const char *relpath, const struct cpuset *cp) 2177{ 2178 return cr_or_mod(relpath, cp, 0); 2179} 2180 2181/* Get cpuset path of pid into buf */ 2182char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size) 2183{ 2184 int fd; /* dual use: cpuset file for pid and self */ 2185 int rc; /* dual use: snprintf and read return codes */ 2186 2187 if (check() < 0) 2188 return NULL; 2189 2190 /* borrow result buf[] to build cpuset file path */ 2191 if (pid == 0) 2192 rc = snprintf(buf, size, "/proc/self/cpuset"); 2193 else 2194 rc = snprintf(buf, size, "/proc/%d/cpuset", pid); 2195 if (rc >= (int)size) { 2196 errno = E2BIG; 2197 return NULL; 2198 } 2199 if ((fd = open(buf, O_RDONLY)) < 0) { 2200 int e = errno; 2201 if (e == ENOENT) 2202 e = ESRCH; 2203 if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0) 2204 e = ENOSYS; 2205 else 2206 close(fd); 2207 errno = e; 2208 return NULL; 2209 } 2210 rc = read(fd, buf, size); 2211 close(fd); 2212 if (rc < 0) 2213 return NULL; 2214 if (rc >= (int)size) { 2215 errno = E2BIG; 2216 return NULL; 2217 } 2218 buf[rc] = 0; 2219 chomp(buf); 2220 return buf; 2221 2222} 2223 2224/* Get cpuset 'cp' of pid */ 2225int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid) 2226{ 2227 char buf[PATH_MAX]; 2228 2229 if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL) 2230 return -1; 2231 if (cpuset_query(cp, buf) < 0) 2232 return -1; 2233 return 0; 2234} 2235 2236/* [optional] Return mountpoint of cpuset filesystem */ 2237const char *cpuset_mountpoint() 2238{ 2239 if (check() < 0) { 2240 switch (errno) { 2241 case ENODEV: 2242 return "[cpuset filesystem not mounted]"; 2243 default: 2244 return "[cpuset filesystem not supported]"; 2245 } 2246 } 2247 return cpusetmnt; 2248} 2249 2250/* Return true if path is a directory. 
*/ 2251static int isdir(const char *path) 2252{ 2253 struct stat statbuf; 2254 2255 if (stat(path, &statbuf) < 0) 2256 return 0; 2257 return S_ISDIR(statbuf.st_mode); 2258} 2259 2260/* 2261 * [optional] cpuset_collides_exclusive() - True if would collide exclusive. 2262 * 2263 * Return true iff the specified cpuset would overlap with any 2264 * sibling cpusets in either cpus or mems, where either this 2265 * cpuset or the sibling is cpu_exclusive or mem_exclusive. 2266 * 2267 * cpuset_create() fails with errno == EINVAL if the requested cpuset 2268 * would overlap with any sibling, where either one is cpu_exclusive or 2269 * mem_exclusive. This is a common, and not obvious error. The 2270 * following routine checks for this particular case, so that code 2271 * creating cpusets can better identify the situation, perhaps to issue 2272 * a more informative error message. 2273 * 2274 * Can also be used to diagnose cpuset_modify failures. This 2275 * routine ignores any existing cpuset with the same path as the 2276 * given 'cpusetpath', and only looks for exclusive collisions with 2277 * sibling cpusets of that path. 2278 * 2279 * In case of any error, returns (0) -- does not collide. Presumably 2280 * any actual attempt to create or modify a cpuset will encounter the 2281 * same error, and report it usefully. 2282 * 2283 * This routine is not particularly efficient; most likely code creating or 2284 * modifying a cpuset will want to try the operation first, and then if that 2285 * fails with errno EINVAL, perhaps call this routine to determine if an 2286 * exclusive cpuset collision caused the error. 
2287 */ 2288 2289int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1) 2290{ 2291 char parent[PATH_MAX]; 2292 char *p; 2293 char *pathcopy = NULL; 2294 char *base; 2295 DIR *dir = NULL; 2296 struct dirent *dent; 2297 struct cpuset *cp2 = NULL; 2298 struct bitmask *cpus1 = NULL, *cpus2 = NULL; 2299 struct bitmask *mems1 = NULL, *mems2 = NULL; 2300 int ret; 2301 2302 if (check() < 0) 2303 goto err; 2304 2305 fullpath(parent, sizeof(parent), cpusetpath); 2306 if (streq(parent, cpusetmnt)) 2307 goto err; /* only one cpuset root - can't collide */ 2308 pathcopy = strdup(parent); 2309 p = strrchr(parent, '/'); 2310 if (!p) 2311 goto err; /* huh? - impossible - run and hide */ 2312 *p = 0; /* now parent is dirname of fullpath */ 2313 2314 p = strrchr(pathcopy, '/'); 2315 base = p + 1; /* now base is basename of fullpath */ 2316 if (!*base) 2317 goto err; /* this is also impossible - run away */ 2318 2319 if ((dir = opendir(parent)) == NULL) 2320 goto err; 2321 if ((cp2 = cpuset_alloc()) == NULL) 2322 goto err; 2323 if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 2324 goto err; 2325 if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 2326 goto err; 2327 if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL) 2328 goto err; 2329 if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL) 2330 goto err; 2331 2332 while ((dent = readdir(dir)) != NULL) { 2333 char child[PATH_MAX]; 2334 2335 if (streq(dent->d_name, ".") || streq(dent->d_name, "..")) 2336 continue; 2337 if (streq(dent->d_name, base)) 2338 continue; 2339 pathcat2(child, sizeof(child), parent, dent->d_name); 2340 if (!isdir(child)) 2341 continue; 2342 if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0) 2343 goto err; 2344 if (cp1->cpu_exclusive || cp2->cpu_exclusive) { 2345 cpuset_getcpus(cp1, cpus1); 2346 cpuset_getcpus(cp2, cpus2); 2347 if (bitmask_intersects(cpus1, cpus2)) 2348 goto collides; 2349 } 2350 if (cp1->mem_exclusive || cp2->mem_exclusive) { 2351 
cpuset_getmems(cp1, mems1); 2352 cpuset_getmems(cp2, mems2); 2353 if (bitmask_intersects(mems1, mems2)) 2354 goto collides; 2355 } 2356 } 2357err: 2358 /* error, or did not collide */ 2359 ret = 0; 2360 goto done; 2361collides: 2362 /* collides */ 2363 ret = 1; 2364 /* fall into ... */ 2365done: 2366 if (dir) 2367 closedir(dir); 2368 cpuset_free(cp2); 2369 free(pathcopy); 2370 bitmask_free(cpus1); 2371 bitmask_free(cpus2); 2372 bitmask_free(mems1); 2373 bitmask_free(mems2); 2374 return ret; 2375} 2376 2377/* 2378 * [optional] cpuset_nuke() - Remove cpuset anyway possible 2379 * 2380 * Remove a cpuset, including killing tasks in it, and 2381 * removing any descendent cpusets and killing their tasks. 2382 * 2383 * Tasks can take a long time (minutes on some configurations) 2384 * to exit. Loop up to 'seconds' seconds, trying to kill them. 2385 * 2386 * How we do it: 2387 * 1) First, kill all the pids, looping until there are 2388 * no more pids in this cpuset or below, or until the 2389 * 'seconds' timeout limit is exceeded. 2390 * 2) Then depth first recursively rmdir the cpuset directories. 2391 * 3) If by this point the original cpuset is gone, we succeeded. 2392 * 2393 * If the timeout is exceeded, and tasks still exist, fail with 2394 * errno == ETIME. 2395 * 2396 * We sleep a variable amount of time. After the first attempt to 2397 * kill all the tasks in the cpuset or its descendents, we sleep 1 2398 * second, the next time 2 seconds, increasing 1 second each loop 2399 * up to a max of 10 seconds. If more loops past 10 are required 2400 * to kill all the tasks, we sleep 10 seconds each subsequent loop. 2401 * In any case, before the last loop, we sleep however many seconds 2402 * remain of the original timeout 'seconds' requested. The total 2403 * time of all sleeps will be no more than the requested 'seconds'. 
2404 * 2405 * If the cpuset started out empty of any tasks, or if the passed in 2406 * 'seconds' was zero, then this routine will return quickly, having 2407 * not slept at all. Otherwise, this routine will at a minimum send 2408 * a SIGKILL to all the tasks in this cpuset subtree, then sleep one 2409 * second, before looking to see if any tasks remain. If tasks remain 2410 * in the cpuset subtree, and a longer 'seconds' timeout was requested 2411 * (more than one), it will continue to kill remaining tasks and sleep, 2412 * in a loop, for as long as time and tasks remain. 2413 * 2414 * The signal sent for the kill is hardcoded to SIGKILL (9). If some 2415 * other signal should be sent first, use a separate code loop, 2416 * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to 2417 * scan the task pids in a cpuset. If SIGKILL should -not- be sent, 2418 * this cpuset_nuke() routine can still be called to recursively 2419 * remove a cpuset subtree, by specifying a timeout of zero 'seconds'. 2420 * 2421 * On success, returns 0 with errno == 0. 
2422 * 2423 * On failure, returns -1, with errno possibly one of: 2424 * EACCES - search permission denied on intervening directory 2425 * ETIME - timed out - tasks remain after 'seconds' timeout 2426 * EMFILE - too many open files 2427 * ENODEV - /dev/cpuset not mounted 2428 * ENOENT - component of cpuset path doesn't exist 2429 * ENOMEM - out of memory 2430 * ENOSYS - kernel doesn't support cpusets 2431 * ENOTDIR - component of cpuset path is not a directory 2432 * EPERM - lacked permission to kill a task 2433 * EPERM - lacked permission to read cpusets or files therein 2434 */ 2435 2436void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree); 2437 2438int cpuset_nuke(const char *relpath, unsigned int seconds) 2439{ 2440 unsigned int secs_left = seconds; /* total sleep seconds left */ 2441 unsigned int secs_loop = 1; /* how much sleep next loop */ 2442 unsigned int secs_slept; /* seconds slept in sleep() */ 2443 struct cpuset_pidlist *pl = NULL; /* pids in cpuset subtree */ 2444 struct cpuset_fts_tree *cs_tree; 2445 const struct cpuset_fts_entry *cs_entry; 2446 int ret, sav_errno = 0; 2447 2448 if (check() < 0) 2449 return -1; 2450 2451 if (seconds == 0) 2452 goto rmdir_cpusets; 2453 2454 while (1) { 2455 int plen, j; 2456 2457 if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) { 2458 /* missing cpuset is as good as if already nuked */ 2459 if (errno == ENOENT) { 2460 ret = 0; 2461 goto no_more_cpuset; 2462 } 2463 2464 /* other problems reading cpuset are bad news */ 2465 sav_errno = errno; 2466 goto failed; 2467 } 2468 2469 if ((plen = cpuset_pidlist_length(pl)) == 0) 2470 goto rmdir_cpusets; 2471 2472 for (j = 0; j < plen; j++) { 2473 pid_t pid; 2474 2475 if ((pid = cpuset_get_pidlist(pl, j)) > 1) { 2476 if (kill(pid, SIGKILL) < 0 && errno != ESRCH) { 2477 sav_errno = errno; 2478 goto failed; 2479 } 2480 } 2481 } 2482 2483 if (secs_left == 0) 2484 goto took_too_long; 2485 2486 cpuset_freepidlist(pl); 2487 pl = NULL; 2488 2489 secs_slept = secs_loop - 
sleep(secs_loop); 2490 2491 /* Ensure forward progress */ 2492 if (secs_slept == 0) 2493 secs_slept = 1; 2494 2495 /* Ensure sane sleep() return (unnecessary?) */ 2496 if (secs_slept > secs_loop) 2497 secs_slept = secs_loop; 2498 2499 secs_left -= secs_slept; 2500 2501 if (secs_loop < 10) 2502 secs_loop++; 2503 2504 secs_loop = min(secs_left, secs_loop); 2505 } 2506 2507took_too_long: 2508 sav_errno = ETIME; 2509 /* fall into ... */ 2510failed: 2511 cpuset_freepidlist(pl); 2512 errno = sav_errno; 2513 return -1; 2514 2515rmdir_cpusets: 2516 /* Let's try removing cpuset(s) now. */ 2517 cpuset_freepidlist(pl); 2518 2519 if ((cs_tree = cpuset_fts_open(relpath)) == NULL && errno != ENOENT) 2520 return -1; 2521 ret = 0; 2522 cpuset_fts_reverse(cs_tree); /* rmdir's must be done bottom up */ 2523 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) { 2524 char buf[PATH_MAX]; 2525 2526 fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry)); 2527 if (rmdir(buf) < 0 && errno != ENOENT) { 2528 sav_errno = errno; 2529 ret = -1; 2530 } 2531 } 2532 cpuset_fts_close(cs_tree); 2533 /* fall into ... */ 2534no_more_cpuset: 2535 if (ret == 0) 2536 errno = 0; 2537 else 2538 errno = sav_errno; 2539 return ret; 2540} 2541 2542/* 2543 * When recursively reading all the tasks files from a subtree, 2544 * chain together the read results, one pidblock per tasks file, 2545 * containing the raw unprocessed ascii as read(2) in. After 2546 * we gather up this raw data, we then go back to count how 2547 * many pid's there are in total, allocate an array of pid_t 2548 * of that size, and transform the raw ascii data into this 2549 * array of pid_t's. 2550 */ 2551 2552struct pidblock { 2553 char *buf; 2554 int buflen; 2555 struct pidblock *next; 2556}; 2557 2558/* 2559 * Chain the raw contents of a file onto the pbhead list. 
2560 * 2561 * We malloc "+ 1" extra byte for a nul-terminator, so that 2562 * the strtoul() loop in pid_transform() won't scan past 2563 * the end of pb->buf[] and accidentally find more pids. 2564 */ 2565static void add_pidblock(const char *file, struct pidblock **ppbhead) 2566{ 2567 FILE *fp = NULL; 2568 struct pidblock *pb = NULL; 2569 int fsz; 2570 2571 if ((fp = fopen(file, "r")) == NULL) 2572 goto err; 2573 fsz = filesize(fp); 2574 if (fsz == 0) 2575 goto err; 2576 if ((pb = calloc(1, sizeof(*pb))) == NULL) 2577 goto err; 2578 pb->buflen = fsz; 2579 if ((pb->buf = malloc(pb->buflen + 1)) == NULL) 2580 goto err; 2581 if (fread(pb->buf, 1, pb->buflen, fp) > 0) { 2582 pb->buf[pb->buflen] = '\0'; 2583 pb->next = *ppbhead; 2584 *ppbhead = pb; 2585 } 2586 fclose(fp); 2587 return; 2588err: 2589 if (fp) 2590 fclose(fp); 2591 if (pb) 2592 free(pb); 2593} 2594 2595static void read_task_file(const char *relpath, struct pidblock **ppbhead) 2596{ 2597 char buf[PATH_MAX]; 2598 2599 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2600 add_pidblock(buf, ppbhead); 2601} 2602 2603struct cpuset_pidlist { 2604 pid_t *pids; 2605 int npids; 2606}; 2607 2608/* Count how many pids in buf (one per line - just count newlines) */ 2609static int pidcount(const char *buf, int buflen) 2610{ 2611 int n = 0; 2612 const char *cp; 2613 2614 for (cp = buf; cp < buf + buflen; cp++) { 2615 if (*cp == '\n') 2616 n++; 2617 } 2618 return n; 2619} 2620 2621/* Transform one-per-line ascii pids in pb to pid_t entries in pl */ 2622static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n) 2623{ 2624 char *a, *b; 2625 2626 for (a = pb->buf; a < pb->buf + pb->buflen; a = b) { 2627 pid_t p = strtoul(a, &b, 10); 2628 if (a == b) 2629 break; 2630 pl->pids[n++] = p; 2631 } 2632 return n; 2633} 2634 2635static void free_pidblocks(struct pidblock *pbhead) 2636{ 2637 struct pidblock *pb, *nextpb; 2638 2639 for (pb = pbhead; pb; pb = nextpb) { 2640 nextpb = pb->next; 2641 free(pb->buf); 2642 
free(pb); 2643 } 2644} 2645 2646/* numeric comparison routine for qsort */ 2647static int numericsort(const void *m1, const void *m2) 2648{ 2649 pid_t p1 = *(pid_t *) m1; 2650 pid_t p2 = *(pid_t *) m2; 2651 2652 return p1 - p2; 2653} 2654 2655/* Return list pids in cpuset 'path' */ 2656struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath, 2657 int recursiveflag) 2658{ 2659 struct pidblock *pb = NULL; 2660 struct cpuset_pidlist *pl = NULL; 2661 struct pidblock *pbhead = NULL; 2662 int n; 2663 2664 if (check() < 0) 2665 goto err; 2666 2667 if (recursiveflag) { 2668 struct cpuset_fts_tree *cs_tree; 2669 const struct cpuset_fts_entry *cs_entry; 2670 2671 if ((cs_tree = cpuset_fts_open(relpath)) == NULL) 2672 goto err; 2673 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) { 2674 if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET) 2675 continue; 2676 read_task_file(cpuset_fts_get_path(cs_entry), &pbhead); 2677 } 2678 cpuset_fts_close(cs_tree); 2679 } else { 2680 read_task_file(relpath, &pbhead); 2681 } 2682 2683 if ((pl = calloc(1, sizeof(*pl))) == NULL) 2684 goto err; 2685 pl->npids = 0; 2686 for (pb = pbhead; pb; pb = pb->next) 2687 pl->npids += pidcount(pb->buf, pb->buflen); 2688 if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL) 2689 goto err; 2690 n = 0; 2691 for (pb = pbhead; pb; pb = pb->next) 2692 n = pid_transform(pb, pl, n); 2693 free_pidblocks(pbhead); 2694 qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort); 2695 return pl; 2696err: 2697 cpuset_freepidlist(pl); 2698 free_pidblocks(pbhead); 2699 return NULL; 2700} 2701 2702/* Return number of elements in pidlist */ 2703int cpuset_pidlist_length(const struct cpuset_pidlist *pl) 2704{ 2705 if (pl) 2706 return pl->npids; 2707 else 2708 return 0; 2709} 2710 2711/* Return i'th element of pidlist */ 2712pid_t cpuset_get_pidlist(const struct cpuset_pidlist * pl, int i) 2713{ 2714 if (pl && i >= 0 && i < pl->npids) 2715 return pl->pids[i]; 2716 else 2717 return (pid_t) - 1; 2718} 2719 
2720/* Free pidlist */ 2721void cpuset_freepidlist(struct cpuset_pidlist *pl) 2722{ 2723 if (pl && pl->pids) 2724 free(pl->pids); 2725 if (pl) 2726 free(pl); 2727} 2728 2729static int __cpuset_move(pid_t pid, const char *path) 2730{ 2731 char buf[SMALL_BUFSZ]; 2732 2733 snprintf(buf, sizeof(buf), "%u", pid); 2734 return write_string_file(path, buf); 2735} 2736 2737/* Move task (pid == 0 for current) to a cpuset */ 2738int cpuset_move(pid_t pid, const char *relpath) 2739{ 2740 char buf[PATH_MAX]; 2741 2742 if (check() < 0) 2743 return -1; 2744 2745 if (pid == 0) 2746 pid = getpid(); 2747 2748 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2749 return __cpuset_move(pid, buf); 2750} 2751 2752/* Move all tasks in pidlist to a cpuset */ 2753int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath) 2754{ 2755 int i; 2756 char buf[PATH_MAX]; 2757 int ret; 2758 2759 if (check() < 0) 2760 return -1; 2761 2762 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2763 2764 ret = 0; 2765 for (i = 0; i < pl->npids; i++) 2766 if (__cpuset_move(pl->pids[i], buf) < 0) 2767 ret = -1; 2768 return ret; 2769} 2770 2771/* 2772 * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a 2773 * cpuset to another cpuset 2774 * 2775 * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may 2776 * race with tasks being added to or forking into fromrelpath. Loop 2777 * repeatedly, reading the tasks file of cpuset fromrelpath and writing 2778 * any task pid's found there to the tasks file of cpuset torelpath, 2779 * up to ten attempts, or until the tasks file of cpuset fromrelpath 2780 * is empty, or until fromrelpath is no longer present. 2781 * 2782 * Returns 0 with errno == 0 if able to empty the tasks file of cpuset 2783 * fromrelpath. 
Of course it is still possible that some independent 2784 * task could add another task to cpuset fromrelpath at the same time 2785 * that such a successful result is being returned, so there can be 2786 * no guarantee that a successful return means that fromrelpath is 2787 * still empty of tasks. 2788 * 2789 * We are careful to allow for the possibility that the cpuset 2790 * fromrelpath might disappear out from under us, perhaps because it 2791 * has notify_on_release set and gets automatically removed as soon 2792 * as we detach its last task from it. Consider a missing fromrelpath 2793 * to be a successful move. 2794 * 2795 * If called with fromrelpath and torelpath pathnames that evaluate to 2796 * the same cpuset, then treat that as if cpuset_reattach() was called, 2797 * rebinding each task in this cpuset one time, and return success or 2798 * failure depending on the return of that cpuset_reattach() call. 2799 * 2800 * On failure, returns -1, with errno possibly one of: 2801 * EACCES - search permission denied on intervening directory 2802 * ENOTEMPTY - tasks remain after multiple attempts to move them 2803 * EMFILE - too many open files 2804 * ENODEV - /dev/cpuset not mounted 2805 * ENOENT - component of cpuset path doesn't exist 2806 * ENOMEM - out of memory 2807 * ENOSYS - kernel doesn't support cpusets 2808 * ENOTDIR - component of cpuset path is not a directory 2809 * EPERM - lacked permission to kill a task 2810 * EPERM - lacked permission to read cpusets or files therein 2811 * 2812 * This is an [optional] function. Use cpuset_function to invoke it. 
2813 */ 2814 2815#define NUMBER_MOVE_TASK_ATTEMPTS 10 2816 2817int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath) 2818{ 2819 char fromfullpath[PATH_MAX]; 2820 char tofullpath[PATH_MAX]; 2821 int i; 2822 struct cpuset_pidlist *pl = NULL; 2823 int sav_errno; 2824 2825 fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath); 2826 fullpath(tofullpath, sizeof(tofullpath), torelpath); 2827 2828 if (samefile(fromfullpath, tofullpath)) 2829 return cpuset_reattach(fromrelpath); 2830 2831 for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) { 2832 int plen, j; 2833 2834 if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) { 2835 /* missing cpuset is as good as if all moved */ 2836 if (errno == ENOENT) 2837 goto no_more_cpuset; 2838 2839 /* other problems reading cpuset are bad news */ 2840 sav_errno = errno; 2841 goto failed; 2842 } 2843 2844 if ((plen = cpuset_pidlist_length(pl)) == 0) 2845 goto no_more_pids; 2846 2847 for (j = 0; j < plen; j++) { 2848 pid_t pid; 2849 2850 pid = cpuset_get_pidlist(pl, j); 2851 if (cpuset_move(pid, torelpath) < 0) { 2852 /* missing task is as good as if moved */ 2853 if (errno == ESRCH) 2854 continue; 2855 2856 /* other per-task errors are bad news */ 2857 sav_errno = errno; 2858 goto failed; 2859 } 2860 } 2861 2862 cpuset_freepidlist(pl); 2863 pl = NULL; 2864 } 2865 2866 sav_errno = ENOTEMPTY; 2867 /* fall into ... */ 2868failed: 2869 cpuset_freepidlist(pl); 2870 errno = sav_errno; 2871 return -1; 2872 2873no_more_pids: 2874no_more_cpuset: 2875 /* Success - all tasks (or entire cpuset ;) gone. 
*/ 2876 cpuset_freepidlist(pl); 2877 errno = 0; 2878 return 0; 2879} 2880 2881/* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */ 2882int cpuset_migrate(pid_t pid, const char *relpath) 2883{ 2884 char buf[PATH_MAX]; 2885 char buf2[PATH_MAX]; 2886 char memory_migrate_flag; 2887 int r; 2888 2889 if (check() < 0) 2890 return -1; 2891 2892 if (pid == 0) 2893 pid = getpid(); 2894 2895 fullpath(buf2, sizeof(buf2), relpath); 2896 2897 if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0) 2898 return -1; 2899 if (store_flag(buf2, "memory_migrate", 1) < 0) 2900 return -1; 2901 2902 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2903 2904 r = __cpuset_move(pid, buf); 2905 2906 store_flag(buf2, "memory_migrate", memory_migrate_flag); 2907 return r; 2908} 2909 2910/* Migrate all tasks in pidlist to a cpuset (moves task and memory) */ 2911int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath) 2912{ 2913 int i; 2914 char buf[PATH_MAX]; 2915 char buf2[PATH_MAX]; 2916 char memory_migrate_flag; 2917 int ret; 2918 2919 if (check() < 0) 2920 return -1; 2921 2922 fullpath(buf2, sizeof(buf2), relpath); 2923 2924 if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0) 2925 return -1; 2926 if (store_flag(buf2, "memory_migrate", 1) < 0) 2927 return -1; 2928 2929 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2930 2931 ret = 0; 2932 for (i = 0; i < pl->npids; i++) 2933 if (__cpuset_move(pl->pids[i], buf) < 0) 2934 ret = -1; 2935 2936 if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0) 2937 ret = -1; 2938 return ret; 2939} 2940 2941/* Rebind cpus_allowed of each task in cpuset 'path' */ 2942int cpuset_reattach(const char *relpath) 2943{ 2944 struct cpuset_pidlist *pl; 2945 int rc; 2946 2947 if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL) 2948 return -1; 2949 rc = cpuset_move_all(pl, relpath); 2950 cpuset_freepidlist(pl); 2951 return rc; 2952} 2953 2954/* Map cpuset relative cpu number to system wide cpu number */ 
int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int pos = -1;

	if (!cp1)
		goto err;
	pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return pos;
}

/* Map system wide cpu number to cpuset relative cpu number */
int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int pos = -1;

	if (!cp1)
		goto err;
	pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return pos;
}

/* Map cpuset relative mem number to system wide mem number */
int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int pos = -1;

	if (!cp1)
		goto err;
	pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return pos;
}

/* Map system wide mem number to cpuset relative mem number */
int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int pos = -1;

	if (!cp1)
		goto err;
	pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
	/* fall into ...
	 */
err:
	cpuset_free(cp_tofree);
	return pos;
}

/* Map pid's cpuset relative cpu number to system wide cpu number */
int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
{
	struct cpuset *cp;
	int rc = -1;

	if ((cp = cpuset_alloc()) == NULL)
		goto done;
	if (cpuset_cpusetofpid(cp, pid) < 0)
		goto done;
	rc = cpuset_c_rel_to_sys_cpu(cp, cpu);
done:
	cpuset_free(cp);
	return rc;
}

/* Map system wide cpu number to pid's cpuset relative cpu number */
int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
{
	struct cpuset *cp;
	int rc = -1;

	if ((cp = cpuset_alloc()) == NULL)
		goto done;
	if (cpuset_cpusetofpid(cp, pid) < 0)
		goto done;
	rc = cpuset_c_sys_to_rel_cpu(cp, cpu);
done:
	cpuset_free(cp);
	return rc;
}

/* Map pid's cpuset relative mem number to system wide mem number */
int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
{
	struct cpuset *cp;
	int rc = -1;

	if ((cp = cpuset_alloc()) == NULL)
		goto done;
	if (cpuset_cpusetofpid(cp, pid) < 0)
		goto done;
	rc = cpuset_c_rel_to_sys_mem(cp, mem);
done:
	cpuset_free(cp);
	return rc;
}

/* Map system wide mem number to pid's cpuset relative mem number */
int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
{
	struct cpuset *cp;
	int rc = -1;

	if ((cp = cpuset_alloc()) == NULL)
		goto done;
	if (cpuset_cpusetofpid(cp, pid) < 0)
		goto done;
	rc = cpuset_c_sys_to_rel_mem(cp, mem);
done:
	cpuset_free(cp);
	return rc;
}

/*
 * Override glibc's calls for get/set affinity - they have
 * something using cpu_set_t that will die when NR_CPUS > 1024.
 * Go directly to the 'real' system calls.  Also override calls
 * for get_mempolicy and set_mempolicy.  None of these
 * calls are yet (July 2004) guaranteed to be in all glibc versions
 * that we care about.
 */

/* Raw sched_setaffinity(2) - bitmask length given in bytes */
static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
{
	return ltp_syscall(__NR_sched_setaffinity, pid, len, mask);
}

#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
/* Raw get_mempolicy(2) */
static int get_mempolicy(int *policy, unsigned long *nmask,
			 unsigned long maxnode, void *addr, int flags)
{
	return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode,
		addr, flags);
}
#endif

#if HAVE_DECL_MPOL_BIND || HAVE_DECL_MPOL_DEFAULT
/* Raw set_mempolicy(2) */
static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
{
	return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode);
}
#endif

/* Snapshot of a task's cpuset placement: cpus, mems and cpuset path */
struct cpuset_placement {
	struct bitmask *cpus;
	struct bitmask *mems;
	char *path;
};

/* Allocate and fill in a placement struct - captures current placement */
struct cpuset_placement *cpuset_get_placement(pid_t pid)
{
	struct cpuset_placement *plc;
	struct cpuset *cp = NULL;
	char buf[PATH_MAX];
	int nbits;

	if ((plc = calloc(1, sizeof(*plc))) == NULL)
		goto err;

	nbits = cpuset_cpus_nbits();
	if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
		goto err;

	nbits = cpuset_mems_nbits();
	if ((plc->mems = bitmask_alloc(nbits)) == NULL)
		goto err;

	if ((cp = cpuset_alloc()) == NULL)
		goto err;
	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
		goto err;
	if (cpuset_query(cp, buf) < 0)
		goto err;

	bitmask_copy(plc->cpus, cp->cpus);
	bitmask_copy(plc->mems, cp->mems);
	/* NOTE(review): strdup() result unchecked - on OOM plc->path is
	 * NULL and cpuset_equal_placement() below would pass NULL to
	 * streq(); confirm and guard. */
	plc->path = strdup(buf);

	cpuset_free(cp);
	return plc;
err:
	cpuset_free(cp);
	cpuset_free_placement(plc);
	return NULL;
}

/* Compare two placement structs - use to detect changes in placement */
int
cpuset_equal_placement(const struct cpuset_placement *plc1,
		       const struct cpuset_placement *plc2)
{
	return bitmask_equal(plc1->cpus, plc2->cpus) &&
	    bitmask_equal(plc1->mems, plc2->mems) &&
	    streq(plc1->path, plc2->path);
}

/* Free a placement struct (NULL-safe) */
void cpuset_free_placement(struct cpuset_placement *plc)
{
	if (!plc)
		return;
	bitmask_free(plc->cpus);
	bitmask_free(plc->mems);
	free(plc->path);
	free(plc);
}

/*
 * A cpuset_fts_open() call constructs a linked list of entries
 * called a "cpuset_fts_tree", with one entry per cpuset below
 * the specified path.  The cpuset_fts_read() routine returns the
 * next entry on this list.  The various cpuset_fts_get_*() calls
 * return attributes of the specified entry.  The cpuset_fts_close()
 * call frees the linked list and all associated data.  All cpuset
 * entries and attributes for the cpuset_fts_tree returned from a
 * given cpuset_fts_open() call remain allocated and unchanged until
 * that cpuset_fts_tree is closed by a cpuset_fts_close() call.  Any
 * subsequent changes to the cpuset filesystem will go unnoticed
 * (not affect open cpuset_fts_tree's.)
 */

struct cpuset_fts_entry;
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);

struct cpuset_fts_tree {
	struct cpuset_fts_entry *head;	/* head of linked entry list */
	struct cpuset_fts_entry *next;	/* cpuset_fts_read() offset */
};

struct cpuset_fts_entry {
	struct cpuset_fts_entry *next;	/* linked entry list chain */
	struct cpuset *cpuset;	/* parsed cpuset (NULL on error entries) */
	struct stat *stat;	/* stat(2) of the cpuset directory */
	char *path;		/* path relative to cpuset mountpoint */
	int info;		/* CPUSET_FTS_CPUSET or CPUSET_FTS_ERR_* */
	int err;		/* errno captured for error entries */
};

/* Open a handle on a cpuset hierarchy.  All the real work is done here.
 */
struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
{
	FTS *fts = NULL;
	FTSENT *ftsent;
	char *path_argv[2];
	char buf[PATH_MAX];
	struct cpuset_fts_tree *cs_tree = NULL;
	struct cpuset_fts_entry *ep;	/* the latest new list entry */
	struct cpuset_fts_entry **pnlep;	/* ptr to next list entry ptr */
	char *relpath;
	int fts_flags;

	fullpath(buf, sizeof(buf), cpusetpath);
	path_argv[0] = buf;
	path_argv[1] = NULL;

	fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
	fts = fts_open(path_argv, fts_flags, NULL);
	if (fts == NULL)
		goto err;

	cs_tree = malloc(sizeof(*cs_tree));
	if (cs_tree == NULL)
		goto err;
	pnlep = &cs_tree->head;
	*pnlep = NULL;

	while ((ftsent = fts_read(fts)) != NULL) {
		/* only pre-order directory visits; skip files/post-order */
		if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
			continue;

		/* ftsent is a directory (perhaps unreadable) ==> cpuset */
		ep = calloc(1, sizeof(*ep));
		if (ep == NULL)
			goto err;
		*pnlep = ep;
		pnlep = &ep->next;

		/* Set entry's path, and if DNR, error */
		relpath = ftsent->fts_path + strlen(cpusetmnt);
		if (strlen(relpath) == 0)
			relpath = "/";
		ep->path = strdup(relpath);
		if (ep->path == NULL)
			goto err;
		if (ftsent->fts_info == FTS_DNR) {
			ep->info = CPUSET_FTS_ERR_DNR;
			ep->err = ftsent->fts_errno;
			continue;
		}

		/* ftsent is a -readable- cpuset: set entry's stat, etc */
		ep->stat = calloc(1, sizeof(struct stat));
		if (ep->stat == NULL)
			goto err;
		if (stat(ftsent->fts_path, ep->stat) < 0) {
			ep->info = CPUSET_FTS_ERR_STAT;
			ep->err = ftsent->fts_errno;
			continue;
		}

		ep->cpuset = calloc(1, sizeof(struct cpuset));
		if (ep->cpuset == NULL)
			goto err;
		if (cpuset_query(ep->cpuset, relpath) < 0) {
			ep->info = CPUSET_FTS_ERR_CPUSET;
			ep->err = errno;
			continue;
		}
		ep->info = CPUSET_FTS_CPUSET;
	}

	(void)fts_close(fts);
	cpuset_fts_rewind(cs_tree);
	return cs_tree;

err:
	/* cpuset_fts_close() tolerates a partially built last entry */
	if (cs_tree)
		cpuset_fts_close(cs_tree);
	if (fts)
		(void)fts_close(fts);
	return NULL;
}

/* Return pointer to next cpuset entry in hierarchy */
const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
{
	const struct cpuset_fts_entry *cs_entry = cs_tree->next;
	if (cs_tree->next != NULL)	/* seek to next entry */
		cs_tree->next = cs_tree->next->next;
	return cs_entry;
}

/* Reverse list of cpusets, in place.  Simulates pre-order/post-order flip. */
void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
{
	struct cpuset_fts_entry *cs1, *cs2, *cs3;

	/*
	 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
	 * is redirected from cs3 to cs1.
	 */

	cs1 = cs2 = NULL;
	cs3 = cs_tree->head;
	while (cs3) {
		cs1 = cs2;
		cs2 = cs3;
		cs3 = cs3->next;
		cs2->next = cs1;
	}
	cs_tree->head = cs2;
	cpuset_fts_rewind(cs_tree);
}

/* Rewind cpuset list to beginning */
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
{
	cs_tree->next = cs_tree->head;
}

/* Return pointer to nul-terminated cpuset path of entry in hierarchy */
const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->path;
}

/* Return pointer to stat(2) structure of a cpuset entry's directory */
const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->stat;
}

/* Return pointer to cpuset structure of a cpuset entry */
const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
					   *cs_entry)
{
	return cs_entry->cpuset;
}

/* Return value of errno (0 if no error) on attempted cpuset operations */
3349int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry) 3350{ 3351 return cs_entry->err; 3352} 3353 3354/* Return operation identity causing error */ 3355int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry) 3356{ 3357 return cs_entry->info; 3358} 3359 3360/* Close a cpuset hierarchy handle (free's all associated memory) */ 3361void cpuset_fts_close(struct cpuset_fts_tree *cs_tree) 3362{ 3363 struct cpuset_fts_entry *cs_entry = cs_tree->head; 3364 3365 while (cs_entry) { 3366 struct cpuset_fts_entry *ep = cs_entry; 3367 3368 cs_entry = cs_entry->next; 3369 free(ep->path); 3370 free(ep->stat); 3371 cpuset_free(ep->cpuset); 3372 free(ep); 3373 } 3374 free(cs_tree); 3375} 3376 3377/* Bind current task to cpu (uses sched_setaffinity(2)) */ 3378int cpuset_cpubind(int cpu) 3379{ 3380 struct bitmask *bmp; 3381 int r; 3382 3383 if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 3384 return -1; 3385 bitmask_setbit(bmp, cpu); 3386 r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp)); 3387 bitmask_free(bmp); 3388 return r; 3389} 3390 3391/* 3392 * int cpuset_latestcpu(pid_t pid) 3393 * 3394 * Return most recent CPU on which task pid executed. If pid == 0, 3395 * examine current task. 3396 * 3397 * The last used CPU is visible for a given pid as field #39 (starting 3398 * with #1) in the file /proc/pid/stat. Currently this file has 41 3399 * fields, in which case this is the 3rd to the last field. 3400 * 3401 * Unfortunately field #2 is a command name and might have embedded 3402 * whitespace. So we can't just count white space separated fields. 3403 * Fortunately, this command name is surrounded by parentheses, as 3404 * for example "(sh)", and that closing parenthesis is the last ')' 3405 * character in the line. No remaining fields can have embedded 3406 * whitespace or parentheses. 
So instead of looking for the 39th 3407 * white space separated field, we can look for the 37th white space 3408 * separated field past the last ')' character on the line. 3409 */ 3410 3411/* Return most recent CPU on which task pid executed */ 3412int cpuset_latestcpu(pid_t pid) 3413{ 3414 char buf[PATH_MAX]; 3415 char *bp; 3416 int fd = -1; 3417 int cpu = -1; 3418 3419 if (pid == 0) 3420 snprintf(buf, sizeof(buf), "/proc/self/stat"); 3421 else 3422 snprintf(buf, sizeof(buf), "/proc/%d/stat", pid); 3423 3424 if ((fd = open(buf, O_RDONLY)) < 0) 3425 goto err; 3426 if (read(fd, buf, sizeof(buf)) < 1) 3427 goto err; 3428 close(fd); 3429 3430 bp = strrchr(buf, ')'); 3431 if (bp) 3432 sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %u", /* 37th field past ')' */ 3433 &cpu); 3434 if (cpu < 0) 3435 errno = EINVAL; 3436 return cpu; 3437err: 3438 if (fd >= 0) 3439 close(fd); 3440 return -1; 3441} 3442 3443/* Bind current task to memory (uses set_mempolicy(2)) */ 3444int cpuset_membind(int mem) 3445{ 3446 struct bitmask *bmp; 3447 int r; 3448 3449 if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL) 3450 return -1; 3451 bitmask_setbit(bmp, mem); 3452#if HAVE_DECL_MPOL_BIND 3453 r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1); 3454#else 3455 r = -1; 3456 errno = ENOSYS; 3457#endif 3458 bitmask_free(bmp); 3459 return r; 3460} 3461 3462/* [optional] Return Memory Node holding page at specified addr */ 3463int cpuset_addr2node(void *addr) 3464{ 3465 int node = -1; 3466 3467#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE 3468 if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) { 3469 /* I realize this seems redundant, but I _want_ to make sure 3470 * that this value is -1. 
*/ 3471 node = -1; 3472 } 3473#endif 3474 return node; 3475} 3476 3477/* 3478 * Transform cpuset into Text Format Representation in buffer 'buf', 3479 * of length 'buflen', nul-terminated if space allows. Return number 3480 * of characters that would have been written, if enough space had 3481 * been available, in the same way that snprintf() does. 3482 */ 3483 3484/* Export cpuset settings to a regular file */ 3485int cpuset_export(const struct cpuset *cp, char *buf, int buflen) 3486{ 3487 char *tmp = NULL; 3488 int n = 0; 3489 3490 if (cp->cpu_exclusive) 3491 n += snprintf(buf + n, max(buflen - n, 0), "cpu_exclusive\n"); 3492 3493 if (cp->mem_exclusive) 3494 n += snprintf(buf + n, max(buflen - n, 0), "mem_exclusive\n"); 3495 3496 if (cp->notify_on_release) 3497 n += snprintf(buf + n, max(buflen - n, 0), 3498 "notify_on_release\n"); 3499 3500 if (cp->memory_pressure_enabled) 3501 n += snprintf(buf + n, max(buflen - n, 0), 3502 "memory_pressure_enabled\n"); 3503 3504 if (cp->memory_migrate) 3505 n += snprintf(buf + n, max(buflen - n, 0), "memory_migrate\n"); 3506 3507 if (cp->memory_spread_page) 3508 n += snprintf(buf + n, max(buflen - n, 0), 3509 "memory_spread_page\n"); 3510 3511 if (cp->memory_spread_slab) 3512 n += snprintf(buf + n, max(buflen - n, 0), 3513 "memory_spread_slab\n"); 3514 3515 if ((tmp = sprint_mask_buf(cp->cpus)) == NULL) 3516 return -1; 3517 n += snprintf(buf + n, max(buflen - n, 0), "cpus %s\n", tmp); 3518 free(tmp); 3519 tmp = NULL; 3520 3521 if ((tmp = sprint_mask_buf(cp->mems)) == NULL) 3522 return -1; 3523 n += snprintf(buf + n, max(buflen - n, 0), "mems %s\n", tmp); 3524 free(tmp); 3525 tmp = NULL; 3526 3527 return n; 3528} 3529 3530static int import_list(UNUSED const char *tok, const char *arg, 3531 struct bitmask *bmp, char *emsg, int elen) 3532{ 3533 if (bitmask_parselist(arg, bmp) < 0) { 3534 if (emsg) 3535 snprintf(emsg, elen, "Invalid list format: %s", arg); 3536 return -1; 3537 } 3538 return 0; 3539} 3540 3541static void 
stolower(char *s)
{
	/* Lower-case string s in place.  The unsigned char cast keeps
	 * tolower() within defined behavior for 8-bit characters. */
	while (*s) {
		unsigned char c = *s;
		*s = tolower(c);
		s++;
	}
}

/* Import cpuset settings from a regular file */
int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
		  char *emsg, int elen)
{
	char *linebuf = NULL;
	int linebuflen;
	int linenum = 0;	/* reported via *elinenum on error */
	int offset = 0;		/* read cursor into buf, advanced by slgets() */

	/* No single line can be longer than the entire input buffer. */
	linebuflen = strlen(buf) + 1;
	if ((linebuf = malloc(linebuflen)) == NULL) {
		if (emsg)
			snprintf(emsg, elen, "Insufficient memory");
		goto err;
	}

	while (slgets(linebuf, linebuflen, buf, &offset)) {
		char *tok, *arg;
		char *ptr;	/* for strtok_r */

		linenum++;
		/* Strip '#' comments, then pick off the leading keyword;
		 * blank / comment-only lines are skipped. */
		if ((tok = strchr(linebuf, '#')) != NULL)
			*tok = 0;
		if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
			continue;
		stolower(tok);	/* keywords are case-insensitive */

		/* Optional argument (may be NULL for boolean keywords). */
		arg = strtok_r(0, " \t", &ptr);

		/* Boolean attributes: presence of keyword sets the flag. */
		if (streq(tok, "cpu_exclusive")) {
			cp->cpu_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "mem_exclusive")) {
			cp->mem_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "notify_on_release")) {
			cp->notify_on_release = 1;
			goto eol;
		}
		if (streq(tok, "memory_pressure_enabled")) {
			cp->memory_pressure_enabled = 1;
			goto eol;
		}
		if (streq(tok, "memory_migrate")) {
			cp->memory_migrate = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_page")) {
			cp->memory_spread_page = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_slab")) {
			cp->memory_spread_slab = 1;
			goto eol;
		}
		/* List attributes: "cpus 0-3,7" / "mems 0,1" (singular
		 * spellings accepted as well). */
		if (streq(tok, "cpu") || streq(tok, "cpus")) {
			if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (streq(tok, "mem") || streq(tok, "mems")) {
			if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (emsg)
			snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
		goto err;
eol:
		/* Anything left on the line after the keyword (and its
		 * one optional argument) is an error. */
		if ((tok = strtok_r(0, " \t",
				    &ptr)) != NULL) {
			if (emsg)
				snprintf(emsg, elen, "Surplus token: '%s'",
					 tok);
			goto err;
		}
		continue;
	}

	free(linebuf);

	/* If exactly one of cpus/mems was specified, derive the other
	 * from the nodes local to the given CPUs, or vice versa. */
	if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
		cpuset_localcpus(cp->mems, cp->cpus);
	else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
		cpuset_localmems(cp->cpus, cp->mems);

	/*
	 * All cpuset attributes are determined in an import.
	 * Those that aren't explicitly specified are presumed
	 * to be unchanged (zero, if it's a freshly allocated
	 * struct cpuset.)
	 */

	cp->cpus_valid = 1;
	cp->mems_valid = 1;
	cp->cpu_exclusive_valid = 1;
	cp->mem_exclusive_valid = 1;
	cp->notify_on_release_valid = 1;
	cp->memory_migrate_valid = 1;
	cp->memory_pressure_enabled_valid = 1;
	cp->memory_spread_page_valid = 1;
	cp->memory_spread_slab_valid = 1;

	return 0;
err:
	/* Report the 1-based line number of the failure, if requested. */
	if (elinenum)
		*elinenum = linenum;
	if (linebuf)
		free(linebuf);
	return -1;
}

/* Pin current task CPU (and memory) */
int cpuset_pin(int relcpu)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int cpu, r;

	if (check() < 0)
		return -1;

	/*
	 * Retry until the task's cpuset placement is identical before
	 * and after the bind, so a concurrent cpuset migration cannot
	 * leave us bound to a stale system CPU number.
	 */
	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);

		r = 0;
		if (cpuset_unpin() < 0)
			r = -1;
		/* Translate cpuset-relative relcpu to a system CPU. */
		cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
		if (cpuset_cpubind(cpu) < 0)
			r = -1;

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}

/* Return number CPUs in current tasks cpuset */
int cpuset_size()
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int r;

	if (check() < 0)
		return -1;

	/* Same placement-stability retry loop as cpuset_pin() above. */
	do {
		cpuset_free_placement(plc1);
		plc1 =
		    cpuset_get_placement(0);

		r = cpuset_cpus_weight(0);	/* CPUs in current cpuset */

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}

/* Return relative CPU number, within current cpuset, last executed on */
int cpuset_where()
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int r;

	if (check() < 0)
		return -1;

	/* Retry until the task's cpuset placement is unchanged across
	 * the query, guarding against concurrent cpuset migration. */
	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);

		r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}

/* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */
int cpuset_unpin()
{
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	/*
	 * Don't need cpuset_*_placement() guard against concurrent
	 * cpuset migration, because none of the following depends
	 * on the tasks cpuset placement.
	 */

	/* Widen CPU affinity to every possible CPU; the kernel clips
	 * the mask to the task's cpuset. */
	if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	bitmask_setall(cpus);
	if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
		goto err;

	/* Reset the memory policy to the system default. */
	if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;
#if HAVE_DECL_MPOL_DEFAULT
	if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
			  bitmask_nbits(mems) + 1) < 0)
		goto err;
	r = 0;
#endif
	/* fall into ...
	 */
err:
	bitmask_free(cpus);	/* bitmask_free(NULL) is a no-op */
	bitmask_free(mems);
	return r;

}

/*
 * Table mapping libcpuset entry point names to their addresses,
 * searched by cpuset_function() below.  NOTE(review): storing
 * function pointers in a void * is a common but non-standard
 * extension; kept as-is for ABI compatibility.
 */
struct cpuset_function_list {
	const char *fname;
	void *func;
} flist[] = {
	{
	"cpuset_version", cpuset_version}, {
	"cpuset_alloc", cpuset_alloc}, {
	"cpuset_free", cpuset_free}, {
	"cpuset_cpus_nbits", cpuset_cpus_nbits}, {
	"cpuset_mems_nbits", cpuset_mems_nbits}, {
	"cpuset_setcpus", cpuset_setcpus}, {
	"cpuset_setmems", cpuset_setmems}, {
	"cpuset_set_iopt", cpuset_set_iopt}, {
	"cpuset_set_sopt", cpuset_set_sopt}, {
	"cpuset_getcpus", cpuset_getcpus}, {
	"cpuset_getmems", cpuset_getmems}, {
	"cpuset_cpus_weight", cpuset_cpus_weight}, {
	"cpuset_mems_weight", cpuset_mems_weight}, {
	"cpuset_get_iopt", cpuset_get_iopt}, {
	"cpuset_get_sopt", cpuset_get_sopt}, {
	"cpuset_localcpus", cpuset_localcpus}, {
	"cpuset_localmems", cpuset_localmems}, {
	"cpuset_cpumemdist", cpuset_cpumemdist}, {
	"cpuset_cpu2node", cpuset_cpu2node}, {
	"cpuset_addr2node", cpuset_addr2node}, {
	"cpuset_create", cpuset_create}, {
	"cpuset_delete", cpuset_delete}, {
	"cpuset_query", cpuset_query}, {
	"cpuset_modify", cpuset_modify}, {
	"cpuset_getcpusetpath", cpuset_getcpusetpath}, {
	"cpuset_cpusetofpid", cpuset_cpusetofpid}, {
	"cpuset_mountpoint", cpuset_mountpoint}, {
	"cpuset_collides_exclusive", cpuset_collides_exclusive}, {
	"cpuset_nuke", cpuset_nuke}, {
	"cpuset_init_pidlist", cpuset_init_pidlist}, {
	"cpuset_pidlist_length", cpuset_pidlist_length}, {
	"cpuset_get_pidlist", cpuset_get_pidlist}, {
	"cpuset_freepidlist", cpuset_freepidlist}, {
	"cpuset_move", cpuset_move}, {
	"cpuset_move_all", cpuset_move_all}, {
	"cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks}, {
	"cpuset_migrate", cpuset_migrate}, {
	"cpuset_migrate_all", cpuset_migrate_all}, {
	"cpuset_reattach", cpuset_reattach}, {
	"cpuset_open_memory_pressure", cpuset_open_memory_pressure}, {
	"cpuset_read_memory_pressure", cpuset_read_memory_pressure}, {
	"cpuset_close_memory_pressure", cpuset_close_memory_pressure}, {
	"cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu}, {
	"cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu}, {
	"cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem}, {
	"cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem}, {
	"cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu}, {
	"cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu}, {
	"cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem}, {
	"cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem}, {
	"cpuset_get_placement", cpuset_get_placement}, {
	"cpuset_equal_placement", cpuset_equal_placement}, {
	"cpuset_free_placement", cpuset_free_placement}, {
	"cpuset_fts_open", cpuset_fts_open}, {
	"cpuset_fts_read", cpuset_fts_read}, {
	"cpuset_fts_reverse", cpuset_fts_reverse}, {
	"cpuset_fts_rewind", cpuset_fts_rewind}, {
	"cpuset_fts_get_path", cpuset_fts_get_path}, {
	"cpuset_fts_get_stat", cpuset_fts_get_stat}, {
	"cpuset_fts_get_cpuset", cpuset_fts_get_cpuset}, {
	"cpuset_fts_get_errno", cpuset_fts_get_errno}, {
	"cpuset_fts_get_info", cpuset_fts_get_info}, {
	"cpuset_fts_close", cpuset_fts_close}, {
	"cpuset_cpubind", cpuset_cpubind}, {
	"cpuset_latestcpu", cpuset_latestcpu}, {
	"cpuset_membind", cpuset_membind}, {
	"cpuset_export", cpuset_export}, {
	"cpuset_import", cpuset_import}, {
	"cpuset_function", cpuset_function}, {
	"cpuset_pin", cpuset_pin}, {
	"cpuset_size", cpuset_size}, {
	"cpuset_where", cpuset_where}, {
"cpuset_unpin", cpuset_unpin},};

/* Return pointer to a libcpuset.so function, or NULL */
void *cpuset_function(const char *function_name)
{
	unsigned int i;

	/* Linear search of the name/address table above; the table is
	 * small enough that no faster lookup is warranted. */
	for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++)
		if (streq(function_name,
			  flist[i].fname))
			return flist[i].func;
	return NULL;	/* no entry point by that name */
}

/* Fortran interface to basic cpuset routines */
int cpuset_pin_(int *ptr_relcpu)
{
	/* Fortran passes arguments by reference: dereference and forward. */
	return cpuset_pin(*ptr_relcpu);
}

int cpuset_size_(void)
{
	return cpuset_size();
}

int cpuset_where_(void)
{
	return cpuset_where();
}

int cpuset_unpin_(void)
{
	return cpuset_unpin();
}

#endif /* HAVE_LINUX_MEMPOLICY_H */