1/* 2 * linux/kernel/sys.c 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 */ 6 7#include <linux/export.h> 8#include <linux/mm.h> 9#include <linux/utsname.h> 10#include <linux/mman.h> 11#include <linux/reboot.h> 12#include <linux/prctl.h> 13#include <linux/highuid.h> 14#include <linux/fs.h> 15#include <linux/kmod.h> 16#include <linux/perf_event.h> 17#include <linux/resource.h> 18#include <linux/kernel.h> 19#include <linux/workqueue.h> 20#include <linux/capability.h> 21#include <linux/device.h> 22#include <linux/key.h> 23#include <linux/times.h> 24#include <linux/posix-timers.h> 25#include <linux/security.h> 26#include <linux/dcookies.h> 27#include <linux/suspend.h> 28#include <linux/tty.h> 29#include <linux/signal.h> 30#include <linux/cn_proc.h> 31#include <linux/getcpu.h> 32#include <linux/task_io_accounting_ops.h> 33#include <linux/seccomp.h> 34#include <linux/cpu.h> 35#include <linux/personality.h> 36#include <linux/ptrace.h> 37#include <linux/fs_struct.h> 38#include <linux/file.h> 39#include <linux/mount.h> 40#include <linux/gfp.h> 41#include <linux/syscore_ops.h> 42#include <linux/version.h> 43#include <linux/ctype.h> 44#include <linux/mm.h> 45#include <linux/mempolicy.h> 46#include <linux/sched.h> 47 48#include <linux/compat.h> 49#include <linux/syscalls.h> 50#include <linux/kprobes.h> 51#include <linux/user_namespace.h> 52#include <linux/binfmts.h> 53 54#include <linux/sched.h> 55#include <linux/rcupdate.h> 56#include <linux/uidgid.h> 57#include <linux/cred.h> 58 59#include <linux/kmsg_dump.h> 60/* Move somewhere else to avoid recompiling? */ 61#include <generated/utsrelease.h> 62 63#include <asm/uaccess.h> 64#include <asm/io.h> 65#include <asm/unistd.h> 66 67#ifndef SET_UNALIGN_CTL 68# define SET_UNALIGN_CTL(a, b) (-EINVAL) 69#endif 70#ifndef GET_UNALIGN_CTL 71# define GET_UNALIGN_CTL(a, b) (-EINVAL) 72#endif 73#ifndef SET_FPEMU_CTL 74# define SET_FPEMU_CTL(a, b) (-EINVAL) 75#endif 76#ifndef GET_FPEMU_CTL 77# define GET_FPEMU_CTL(a, b) (-EINVAL) 78#endif 79#ifndef SET_FPEXC_CTL 80# define SET_FPEXC_CTL(a, b) (-EINVAL) 81#endif 82#ifndef GET_FPEXC_CTL 83# define GET_FPEXC_CTL(a, b) (-EINVAL) 84#endif 85#ifndef GET_ENDIAN 86# define GET_ENDIAN(a, b) (-EINVAL) 87#endif 88#ifndef SET_ENDIAN 89# define SET_ENDIAN(a, b) (-EINVAL) 90#endif 91#ifndef GET_TSC_CTL 92# define GET_TSC_CTL(a) (-EINVAL) 93#endif 94#ifndef SET_TSC_CTL 95# define SET_TSC_CTL(a) (-EINVAL) 96#endif 97 98/* 99 * this is where the system-wide overflow UID and GID are defined, for 100 * architectures that now have 32-bit UID/GID but didn't in the past 101 */ 102 103int overflowuid = DEFAULT_OVERFLOWUID; 104int overflowgid = DEFAULT_OVERFLOWGID; 105 106EXPORT_SYMBOL(overflowuid); 107EXPORT_SYMBOL(overflowgid); 108 109/* 110 * the same as above, but for filesystems which can only store a 16-bit 111 * UID and GID. as such, this is needed on all architectures 112 */ 113 114int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; 115int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; 116 117EXPORT_SYMBOL(fs_overflowuid); 118EXPORT_SYMBOL(fs_overflowgid); 119 120/* 121 * Returns true if current's euid is same as p's uid or euid, 122 * or has CAP_SYS_NICE to p's user_ns. 123 * 124 * Called with rcu_read_lock, creds are safe 125 */ 126static bool set_one_prio_perm(struct task_struct *p) 127{ 128 const struct cred *cred = current_cred(), *pcred = __task_cred(p); 129 130 if (uid_eq(pcred->uid, cred->euid) || 131 uid_eq(pcred->euid, cred->euid)) 132 return true; 133 if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) 134 return true; 135 return false; 136} 137 138/* 139 * set the priority of a task 140 * - the caller must hold the RCU read lock 141 */ 142static int set_one_prio(struct task_struct *p, int niceval, int error) 143{ 144 int no_nice; 145 146 if (!set_one_prio_perm(p)) { 147 error = -EPERM; 148 goto out; 149 } 150 if (niceval < task_nice(p) && !can_nice(p, niceval)) { 151 error = -EACCES; 152 goto out; 153 } 154 no_nice = security_task_setnice(p, niceval); 155 if (no_nice) { 156 error = no_nice; 157 goto out; 158 } 159 if (error == -ESRCH) 160 error = 0; 161 set_user_nice(p, niceval); 162out: 163 return error; 164} 165 166SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) 167{ 168 struct task_struct *g, *p; 169 struct user_struct *user; 170 const struct cred *cred = current_cred(); 171 int error = -EINVAL; 172 struct pid *pgrp; 173 kuid_t uid; 174 175 if (which > PRIO_USER || which < PRIO_PROCESS) 176 goto out; 177 178 /* normalize: avoid signed division (rounding problems) */ 179 error = -ESRCH; 180 if (niceval < MIN_NICE) 181 niceval = MIN_NICE; 182 if (niceval > MAX_NICE) 183 niceval = MAX_NICE; 184 185 rcu_read_lock(); 186 read_lock(&tasklist_lock); 187 switch (which) { 188 case PRIO_PROCESS: 189 if (who) 190 p = find_task_by_vpid(who); 191 else 192 p = current; 193 if (p) 194 error = set_one_prio(p, niceval, error); 195 break; 196 case PRIO_PGRP: 197 if (who) 198 pgrp = find_vpid(who); 199 else 200 pgrp = task_pgrp(current); 201 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 202 error = set_one_prio(p, niceval, error); 203 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 204 break; 205 case PRIO_USER: 206 uid = make_kuid(cred->user_ns, who); 207 user = cred->user; 208 if (!who) 209 uid = cred->uid; 210 else if (!uid_eq(uid, cred->uid)) { 211 user = find_user(uid); 212 if (!user) 213 goto out_unlock; /* No processes for this user */ 214 } 215 do_each_thread(g, p) { 216 if (uid_eq(task_uid(p), uid)) 217 error = set_one_prio(p, niceval, error); 218 } while_each_thread(g, p); 219 if (!uid_eq(uid, cred->uid)) 220 free_uid(user); /* For find_user() */ 221 break; 222 } 223out_unlock: 224 read_unlock(&tasklist_lock); 225 rcu_read_unlock(); 226out: 227 return error; 228} 229 230/* 231 * Ugh. To avoid negative return values, "getpriority()" will 232 * not return the normal nice-value, but a negated value that 233 * has been offset by 20 (ie it returns 40..1 instead of -20..19) 234 * to stay compatible. 235 */ 236SYSCALL_DEFINE2(getpriority, int, which, int, who) 237{ 238 struct task_struct *g, *p; 239 struct user_struct *user; 240 const struct cred *cred = current_cred(); 241 long niceval, retval = -ESRCH; 242 struct pid *pgrp; 243 kuid_t uid; 244 245 if (which > PRIO_USER || which < PRIO_PROCESS) 246 return -EINVAL; 247 248 rcu_read_lock(); 249 read_lock(&tasklist_lock); 250 switch (which) { 251 case PRIO_PROCESS: 252 if (who) 253 p = find_task_by_vpid(who); 254 else 255 p = current; 256 if (p) { 257 niceval = nice_to_rlimit(task_nice(p)); 258 if (niceval > retval) 259 retval = niceval; 260 } 261 break; 262 case PRIO_PGRP: 263 if (who) 264 pgrp = find_vpid(who); 265 else 266 pgrp = task_pgrp(current); 267 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 268 niceval = nice_to_rlimit(task_nice(p)); 269 if (niceval > retval) 270 retval = niceval; 271 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 272 break; 273 case PRIO_USER: 274 uid = make_kuid(cred->user_ns, who); 275 user = cred->user; 276 if (!who) 277 uid = cred->uid; 278 else if (!uid_eq(uid, cred->uid)) { 279 user = find_user(uid); 280 if (!user) 281 goto out_unlock; /* No processes for this user */ 282 } 283 do_each_thread(g, p) { 284 if (uid_eq(task_uid(p), uid)) { 285 niceval = nice_to_rlimit(task_nice(p)); 286 if (niceval > retval) 287 retval = niceval; 288 } 289 } while_each_thread(g, p); 290 if (!uid_eq(uid, cred->uid)) 291 free_uid(user); /* for find_user() */ 292 break; 293 } 294out_unlock: 295 read_unlock(&tasklist_lock); 296 rcu_read_unlock(); 297 298 return retval; 299} 300 301/* 302 * Unprivileged users may change the real gid to the effective gid 303 * or vice versa. (BSD-style) 304 * 305 * If you set the real gid at all, or set the effective gid to a value not 306 * equal to the real gid, then the saved gid is set to the new effective gid. 307 * 308 * This makes it possible for a setgid program to completely drop its 309 * privileges, which is often a useful assertion to make when you are doing 310 * a security audit over a program. 311 * 312 * The general idea is that a program which uses just setregid() will be 313 * 100% compatible with BSD. A program which uses just setgid() will be 314 * 100% compatible with POSIX with saved IDs. 315 * 316 * SMP: There are not races, the GIDs are checked only by filesystem 317 * operations (as far as semantic preservation is concerned). 318 */ 319SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) 320{ 321 struct user_namespace *ns = current_user_ns(); 322 const struct cred *old; 323 struct cred *new; 324 int retval; 325 kgid_t krgid, kegid; 326 327 krgid = make_kgid(ns, rgid); 328 kegid = make_kgid(ns, egid); 329 330 if ((rgid != (gid_t) -1) && !gid_valid(krgid)) 331 return -EINVAL; 332 if ((egid != (gid_t) -1) && !gid_valid(kegid)) 333 return -EINVAL; 334 335 new = prepare_creds(); 336 if (!new) 337 return -ENOMEM; 338 old = current_cred(); 339 340 retval = -EPERM; 341 if (rgid != (gid_t) -1) { 342 if (gid_eq(old->gid, krgid) || 343 gid_eq(old->egid, krgid) || 344 ns_capable(old->user_ns, CAP_SETGID)) 345 new->gid = krgid; 346 else 347 goto error; 348 } 349 if (egid != (gid_t) -1) { 350 if (gid_eq(old->gid, kegid) || 351 gid_eq(old->egid, kegid) || 352 gid_eq(old->sgid, kegid) || 353 ns_capable(old->user_ns, CAP_SETGID)) 354 new->egid = kegid; 355 else 356 goto error; 357 } 358 359 if (rgid != (gid_t) -1 || 360 (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) 361 new->sgid = new->egid; 362 new->fsgid = new->egid; 363 364 return commit_creds(new); 365 366error: 367 abort_creds(new); 368 return retval; 369} 370 371/* 372 * setgid() is implemented like SysV w/ SAVED_IDS 373 * 374 * SMP: Same implicit races as above. 375 */ 376SYSCALL_DEFINE1(setgid, gid_t, gid) 377{ 378 struct user_namespace *ns = current_user_ns(); 379 const struct cred *old; 380 struct cred *new; 381 int retval; 382 kgid_t kgid; 383 384 kgid = make_kgid(ns, gid); 385 if (!gid_valid(kgid)) 386 return -EINVAL; 387 388 new = prepare_creds(); 389 if (!new) 390 return -ENOMEM; 391 old = current_cred(); 392 393 retval = -EPERM; 394 if (ns_capable(old->user_ns, CAP_SETGID)) 395 new->gid = new->egid = new->sgid = new->fsgid = kgid; 396 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) 397 new->egid = new->fsgid = kgid; 398 else 399 goto error; 400 401 return commit_creds(new); 402 403error: 404 abort_creds(new); 405 return retval; 406} 407 408/* 409 * change the user struct in a credentials set to match the new UID 410 */ 411static int set_user(struct cred *new) 412{ 413 struct user_struct *new_user; 414 415 new_user = alloc_uid(new->uid); 416 if (!new_user) 417 return -EAGAIN; 418 419 /* 420 * We don't fail in case of NPROC limit excess here because too many 421 * poorly written programs don't check set*uid() return code, assuming 422 * it never fails if called by root. We may still enforce NPROC limit 423 * for programs doing set*uid()+execve() by harmlessly deferring the 424 * failure to the execve() stage. 425 */ 426 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && 427 new_user != INIT_USER) 428 current->flags |= PF_NPROC_EXCEEDED; 429 else 430 current->flags &= ~PF_NPROC_EXCEEDED; 431 432 free_uid(new->user); 433 new->user = new_user; 434 return 0; 435} 436 437/* 438 * Unprivileged users may change the real uid to the effective uid 439 * or vice versa. (BSD-style) 440 * 441 * If you set the real uid at all, or set the effective uid to a value not 442 * equal to the real uid, then the saved uid is set to the new effective uid. 443 * 444 * This makes it possible for a setuid program to completely drop its 445 * privileges, which is often a useful assertion to make when you are doing 446 * a security audit over a program. 447 * 448 * The general idea is that a program which uses just setreuid() will be 449 * 100% compatible with BSD. A program which uses just setuid() will be 450 * 100% compatible with POSIX with saved IDs. 451 */ 452SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 453{ 454 struct user_namespace *ns = current_user_ns(); 455 const struct cred *old; 456 struct cred *new; 457 int retval; 458 kuid_t kruid, keuid; 459 460 kruid = make_kuid(ns, ruid); 461 keuid = make_kuid(ns, euid); 462 463 if ((ruid != (uid_t) -1) && !uid_valid(kruid)) 464 return -EINVAL; 465 if ((euid != (uid_t) -1) && !uid_valid(keuid)) 466 return -EINVAL; 467 468 new = prepare_creds(); 469 if (!new) 470 return -ENOMEM; 471 old = current_cred(); 472 473 retval = -EPERM; 474 if (ruid != (uid_t) -1) { 475 new->uid = kruid; 476 if (!uid_eq(old->uid, kruid) && 477 !uid_eq(old->euid, kruid) && 478 !ns_capable(old->user_ns, CAP_SETUID)) 479 goto error; 480 } 481 482 if (euid != (uid_t) -1) { 483 new->euid = keuid; 484 if (!uid_eq(old->uid, keuid) && 485 !uid_eq(old->euid, keuid) && 486 !uid_eq(old->suid, keuid) && 487 !ns_capable(old->user_ns, CAP_SETUID)) 488 goto error; 489 } 490 491 if (!uid_eq(new->uid, old->uid)) { 492 retval = set_user(new); 493 if (retval < 0) 494 goto error; 495 } 496 if (ruid != (uid_t) -1 || 497 (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) 498 new->suid = new->euid; 499 new->fsuid = new->euid; 500 501 retval = security_task_fix_setuid(new, old, LSM_SETID_RE); 502 if (retval < 0) 503 goto error; 504 505 return commit_creds(new); 506 507error: 508 abort_creds(new); 509 return retval; 510} 511 512/* 513 * setuid() is implemented like SysV with SAVED_IDS 514 * 515 * Note that SAVED_ID's is deficient in that a setuid root program 516 * like sendmail, for example, cannot set its uid to be a normal 517 * user and then switch back, because if you're root, setuid() sets 518 * the saved uid too. If you don't like this, blame the bright people 519 * in the POSIX committee and/or USG. Note that the BSD-style setreuid() 520 * will allow a root program to temporarily drop privileges and be able to 521 * regain them by swapping the real and effective uid. 522 */ 523SYSCALL_DEFINE1(setuid, uid_t, uid) 524{ 525 struct user_namespace *ns = current_user_ns(); 526 const struct cred *old; 527 struct cred *new; 528 int retval; 529 kuid_t kuid; 530 531 kuid = make_kuid(ns, uid); 532 if (!uid_valid(kuid)) 533 return -EINVAL; 534 535 new = prepare_creds(); 536 if (!new) 537 return -ENOMEM; 538 old = current_cred(); 539 540 retval = -EPERM; 541 if (ns_capable(old->user_ns, CAP_SETUID)) { 542 new->suid = new->uid = kuid; 543 if (!uid_eq(kuid, old->uid)) { 544 retval = set_user(new); 545 if (retval < 0) 546 goto error; 547 } 548 } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { 549 goto error; 550 } 551 552 new->fsuid = new->euid = kuid; 553 554 retval = security_task_fix_setuid(new, old, LSM_SETID_ID); 555 if (retval < 0) 556 goto error; 557 558 return commit_creds(new); 559 560error: 561 abort_creds(new); 562 return retval; 563} 564 565 566/* 567 * This function implements a generic ability to update ruid, euid, 568 * and suid. This allows you to implement the 4.4 compatible seteuid(). 569 */ 570SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) 571{ 572 struct user_namespace *ns = current_user_ns(); 573 const struct cred *old; 574 struct cred *new; 575 int retval; 576 kuid_t kruid, keuid, ksuid; 577 578 kruid = make_kuid(ns, ruid); 579 keuid = make_kuid(ns, euid); 580 ksuid = make_kuid(ns, suid); 581 582 if ((ruid != (uid_t) -1) && !uid_valid(kruid)) 583 return -EINVAL; 584 585 if ((euid != (uid_t) -1) && !uid_valid(keuid)) 586 return -EINVAL; 587 588 if ((suid != (uid_t) -1) && !uid_valid(ksuid)) 589 return -EINVAL; 590 591 new = prepare_creds(); 592 if (!new) 593 return -ENOMEM; 594 595 old = current_cred(); 596 597 retval = -EPERM; 598 if (!ns_capable(old->user_ns, CAP_SETUID)) { 599 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 600 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 601 goto error; 602 if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && 603 !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) 604 goto error; 605 if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && 606 !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) 607 goto error; 608 } 609 610 if (ruid != (uid_t) -1) { 611 new->uid = kruid; 612 if (!uid_eq(kruid, old->uid)) { 613 retval = set_user(new); 614 if (retval < 0) 615 goto error; 616 } 617 } 618 if (euid != (uid_t) -1) 619 new->euid = keuid; 620 if (suid != (uid_t) -1) 621 new->suid = ksuid; 622 new->fsuid = new->euid; 623 624 retval = security_task_fix_setuid(new, old, LSM_SETID_RES); 625 if (retval < 0) 626 goto error; 627 628 return commit_creds(new); 629 630error: 631 abort_creds(new); 632 return retval; 633} 634 635SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) 636{ 637 const struct cred *cred = current_cred(); 638 int retval; 639 uid_t ruid, euid, suid; 640 641 ruid = from_kuid_munged(cred->user_ns, cred->uid); 642 euid = from_kuid_munged(cred->user_ns, cred->euid); 643 suid = from_kuid_munged(cred->user_ns, cred->suid); 644 645 retval = put_user(ruid, ruidp); 646 if (!retval) { 647 retval = put_user(euid, euidp); 648 if (!retval) 649 return put_user(suid, suidp); 650 } 651 return retval; 652} 653 654/* 655 * Same as above, but for rgid, egid, sgid. 656 */ 657SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) 658{ 659 struct user_namespace *ns = current_user_ns(); 660 const struct cred *old; 661 struct cred *new; 662 int retval; 663 kgid_t krgid, kegid, ksgid; 664 665 krgid = make_kgid(ns, rgid); 666 kegid = make_kgid(ns, egid); 667 ksgid = make_kgid(ns, sgid); 668 669 if ((rgid != (gid_t) -1) && !gid_valid(krgid)) 670 return -EINVAL; 671 if ((egid != (gid_t) -1) && !gid_valid(kegid)) 672 return -EINVAL; 673 if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) 674 return -EINVAL; 675 676 new = prepare_creds(); 677 if (!new) 678 return -ENOMEM; 679 old = current_cred(); 680 681 retval = -EPERM; 682 if (!ns_capable(old->user_ns, CAP_SETGID)) { 683 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && 684 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) 685 goto error; 686 if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && 687 !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) 688 goto error; 689 if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && 690 !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) 691 goto error; 692 } 693 694 if (rgid != (gid_t) -1) 695 new->gid = krgid; 696 if (egid != (gid_t) -1) 697 new->egid = kegid; 698 if (sgid != (gid_t) -1) 699 new->sgid = ksgid; 700 new->fsgid = new->egid; 701 702 return commit_creds(new); 703 704error: 705 abort_creds(new); 706 return retval; 707} 708 709SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) 710{ 711 const struct cred *cred = current_cred(); 712 int retval; 713 gid_t rgid, egid, sgid; 714 715 rgid = from_kgid_munged(cred->user_ns, cred->gid); 716 egid = from_kgid_munged(cred->user_ns, cred->egid); 717 sgid = from_kgid_munged(cred->user_ns, cred->sgid); 718 719 retval = put_user(rgid, rgidp); 720 if (!retval) { 721 retval = put_user(egid, egidp); 722 if (!retval) 723 retval = put_user(sgid, sgidp); 724 } 725 726 return retval; 727} 728 729 730/* 731 * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This 732 * is used for "access()" and for the NFS daemon (letting nfsd stay at 733 * whatever uid it wants to). It normally shadows "euid", except when 734 * explicitly set by setfsuid() or for access.. 735 */ 736SYSCALL_DEFINE1(setfsuid, uid_t, uid) 737{ 738 const struct cred *old; 739 struct cred *new; 740 uid_t old_fsuid; 741 kuid_t kuid; 742 743 old = current_cred(); 744 old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); 745 746 kuid = make_kuid(old->user_ns, uid); 747 if (!uid_valid(kuid)) 748 return old_fsuid; 749 750 new = prepare_creds(); 751 if (!new) 752 return old_fsuid; 753 754 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 755 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 756 ns_capable(old->user_ns, CAP_SETUID)) { 757 if (!uid_eq(kuid, old->fsuid)) { 758 new->fsuid = kuid; 759 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 760 goto change_okay; 761 } 762 } 763 764 abort_creds(new); 765 return old_fsuid; 766 767change_okay: 768 commit_creds(new); 769 return old_fsuid; 770} 771 772/* 773 * Samma på svenska.. 774 */ 775SYSCALL_DEFINE1(setfsgid, gid_t, gid) 776{ 777 const struct cred *old; 778 struct cred *new; 779 gid_t old_fsgid; 780 kgid_t kgid; 781 782 old = current_cred(); 783 old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); 784 785 kgid = make_kgid(old->user_ns, gid); 786 if (!gid_valid(kgid)) 787 return old_fsgid; 788 789 new = prepare_creds(); 790 if (!new) 791 return old_fsgid; 792 793 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || 794 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || 795 ns_capable(old->user_ns, CAP_SETGID)) { 796 if (!gid_eq(kgid, old->fsgid)) { 797 new->fsgid = kgid; 798 goto change_okay; 799 } 800 } 801 802 abort_creds(new); 803 return old_fsgid; 804 805change_okay: 806 commit_creds(new); 807 return old_fsgid; 808} 809 810/** 811 * sys_getpid - return the thread group id of the current process 812 * 813 * Note, despite the name, this returns the tgid not the pid. The tgid and 814 * the pid are identical unless CLONE_THREAD was specified on clone() in 815 * which case the tgid is the same in all threads of the same group. 816 * 817 * This is SMP safe as current->tgid does not change. 818 */ 819SYSCALL_DEFINE0(getpid) 820{ 821 return task_tgid_vnr(current); 822} 823 824/* Thread ID - the internal kernel "pid" */ 825SYSCALL_DEFINE0(gettid) 826{ 827 return task_pid_vnr(current); 828} 829 830/* 831 * Accessing ->real_parent is not SMP-safe, it could 832 * change from under us. However, we can use a stale 833 * value of ->real_parent under rcu_read_lock(), see 834 * release_task()->call_rcu(delayed_put_task_struct). 835 */ 836SYSCALL_DEFINE0(getppid) 837{ 838 int pid; 839 840 rcu_read_lock(); 841 pid = task_tgid_vnr(rcu_dereference(current->real_parent)); 842 rcu_read_unlock(); 843 844 return pid; 845} 846 847SYSCALL_DEFINE0(getuid) 848{ 849 /* Only we change this so SMP safe */ 850 return from_kuid_munged(current_user_ns(), current_uid()); 851} 852 853SYSCALL_DEFINE0(geteuid) 854{ 855 /* Only we change this so SMP safe */ 856 return from_kuid_munged(current_user_ns(), current_euid()); 857} 858 859SYSCALL_DEFINE0(getgid) 860{ 861 /* Only we change this so SMP safe */ 862 return from_kgid_munged(current_user_ns(), current_gid()); 863} 864 865SYSCALL_DEFINE0(getegid) 866{ 867 /* Only we change this so SMP safe */ 868 return from_kgid_munged(current_user_ns(), current_egid()); 869} 870 871void do_sys_times(struct tms *tms) 872{ 873 cputime_t tgutime, tgstime, cutime, cstime; 874 875 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 876 cutime = current->signal->cutime; 877 cstime = current->signal->cstime; 878 tms->tms_utime = cputime_to_clock_t(tgutime); 879 tms->tms_stime = cputime_to_clock_t(tgstime); 880 tms->tms_cutime = cputime_to_clock_t(cutime); 881 tms->tms_cstime = cputime_to_clock_t(cstime); 882} 883 884SYSCALL_DEFINE1(times, struct tms __user *, tbuf) 885{ 886 if (tbuf) { 887 struct tms tmp; 888 889 do_sys_times(&tmp); 890 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 891 return -EFAULT; 892 } 893 force_successful_syscall_return(); 894 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 895} 896 897/* 898 * This needs some heavy checking ... 899 * I just haven't the stomach for it. I also don't fully 900 * understand sessions/pgrp etc. Let somebody who does explain it. 901 * 902 * OK, I think I have the protection semantics right.... this is really 903 * only important on a multi-user system anyway, to make sure one user 904 * can't send a signal to a process owned by another. -TYT, 12/12/91 905 * 906 * !PF_FORKNOEXEC check to conform completely to POSIX. 907 */ 908SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) 909{ 910 struct task_struct *p; 911 struct task_struct *group_leader = current->group_leader; 912 struct pid *pgrp; 913 int err; 914 915 if (!pid) 916 pid = task_pid_vnr(group_leader); 917 if (!pgid) 918 pgid = pid; 919 if (pgid < 0) 920 return -EINVAL; 921 rcu_read_lock(); 922 923 /* From this point forward we keep holding onto the tasklist lock 924 * so that our parent does not change from under us. -DaveM 925 */ 926 write_lock_irq(&tasklist_lock); 927 928 err = -ESRCH; 929 p = find_task_by_vpid(pid); 930 if (!p) 931 goto out; 932 933 err = -EINVAL; 934 if (!thread_group_leader(p)) 935 goto out; 936 937 if (same_thread_group(p->real_parent, group_leader)) { 938 err = -EPERM; 939 if (task_session(p) != task_session(group_leader)) 940 goto out; 941 err = -EACCES; 942 if (!(p->flags & PF_FORKNOEXEC)) 943 goto out; 944 } else { 945 err = -ESRCH; 946 if (p != group_leader) 947 goto out; 948 } 949 950 err = -EPERM; 951 if (p->signal->leader) 952 goto out; 953 954 pgrp = task_pid(p); 955 if (pgid != pid) { 956 struct task_struct *g; 957 958 pgrp = find_vpid(pgid); 959 g = pid_task(pgrp, PIDTYPE_PGID); 960 if (!g || task_session(g) != task_session(group_leader)) 961 goto out; 962 } 963 964 err = security_task_setpgid(p, pgid); 965 if (err) 966 goto out; 967 968 if (task_pgrp(p) != pgrp) 969 change_pid(p, PIDTYPE_PGID, pgrp); 970 971 err = 0; 972out: 973 /* All paths lead to here, thus we are safe. -DaveM */ 974 write_unlock_irq(&tasklist_lock); 975 rcu_read_unlock(); 976 return err; 977} 978 979SYSCALL_DEFINE1(getpgid, pid_t, pid) 980{ 981 struct task_struct *p; 982 struct pid *grp; 983 int retval; 984 985 rcu_read_lock(); 986 if (!pid) 987 grp = task_pgrp(current); 988 else { 989 retval = -ESRCH; 990 p = find_task_by_vpid(pid); 991 if (!p) 992 goto out; 993 grp = task_pgrp(p); 994 if (!grp) 995 goto out; 996 997 retval = security_task_getpgid(p); 998 if (retval) 999 goto out; 1000 } 1001 retval = pid_vnr(grp); 1002out: 1003 rcu_read_unlock(); 1004 return retval; 1005} 1006 1007#ifdef __ARCH_WANT_SYS_GETPGRP 1008 1009SYSCALL_DEFINE0(getpgrp) 1010{ 1011 return sys_getpgid(0); 1012} 1013 1014#endif 1015 1016SYSCALL_DEFINE1(getsid, pid_t, pid) 1017{ 1018 struct task_struct *p; 1019 struct pid *sid; 1020 int retval; 1021 1022 rcu_read_lock(); 1023 if (!pid) 1024 sid = task_session(current); 1025 else { 1026 retval = -ESRCH; 1027 p = find_task_by_vpid(pid); 1028 if (!p) 1029 goto out; 1030 sid = task_session(p); 1031 if (!sid) 1032 goto out; 1033 1034 retval = security_task_getsid(p); 1035 if (retval) 1036 goto out; 1037 } 1038 retval = pid_vnr(sid); 1039out: 1040 rcu_read_unlock(); 1041 return retval; 1042} 1043 1044static void set_special_pids(struct pid *pid) 1045{ 1046 struct task_struct *curr = current->group_leader; 1047 1048 if (task_session(curr) != pid) 1049 change_pid(curr, PIDTYPE_SID, pid); 1050 1051 if (task_pgrp(curr) != pid) 1052 change_pid(curr, PIDTYPE_PGID, pid); 1053} 1054 1055SYSCALL_DEFINE0(setsid) 1056{ 1057 struct task_struct *group_leader = current->group_leader; 1058 struct pid *sid = task_pid(group_leader); 1059 pid_t session = pid_vnr(sid); 1060 int err = -EPERM; 1061 1062 write_lock_irq(&tasklist_lock); 1063 /* Fail if I am already a session leader */ 1064 if (group_leader->signal->leader) 1065 goto out; 1066 1067 /* Fail if a process group id already exists that equals the 1068 * proposed session id. 1069 */ 1070 if (pid_task(sid, PIDTYPE_PGID)) 1071 goto out; 1072 1073 group_leader->signal->leader = 1; 1074 set_special_pids(sid); 1075 1076 proc_clear_tty(group_leader); 1077 1078 err = session; 1079out: 1080 write_unlock_irq(&tasklist_lock); 1081 if (err > 0) { 1082 proc_sid_connector(group_leader); 1083 sched_autogroup_create_attach(group_leader); 1084 } 1085 return err; 1086} 1087 1088DECLARE_RWSEM(uts_sem); 1089 1090#ifdef COMPAT_UTS_MACHINE 1091#define override_architecture(name) \ 1092 (personality(current->personality) == PER_LINUX32 && \ 1093 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ 1094 sizeof(COMPAT_UTS_MACHINE))) 1095#else 1096#define override_architecture(name) 0 1097#endif 1098 1099/* 1100 * Work around broken programs that cannot handle "Linux 3.0". 1101 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 1102 */ 1103static int override_release(char __user *release, size_t len) 1104{ 1105 int ret = 0; 1106 1107 if (current->personality & UNAME26) { 1108 const char *rest = UTS_RELEASE; 1109 char buf[65] = { 0 }; 1110 int ndots = 0; 1111 unsigned v; 1112 size_t copy; 1113 1114 while (*rest) { 1115 if (*rest == '.' && ++ndots >= 3) 1116 break; 1117 if (!isdigit(*rest) && *rest != '.') 1118 break; 1119 rest++; 1120 } 1121 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; 1122 copy = clamp_t(size_t, len, 1, sizeof(buf)); 1123 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); 1124 ret = copy_to_user(release, buf, copy + 1); 1125 } 1126 return ret; 1127} 1128 1129SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1130{ 1131 int errno = 0; 1132 1133 down_read(&uts_sem); 1134 if (copy_to_user(name, utsname(), sizeof *name)) 1135 errno = -EFAULT; 1136 up_read(&uts_sem); 1137 1138 if (!errno && override_release(name->release, sizeof(name->release))) 1139 errno = -EFAULT; 1140 if (!errno && override_architecture(name)) 1141 errno = -EFAULT; 1142 return errno; 1143} 1144 1145#ifdef __ARCH_WANT_SYS_OLD_UNAME 1146/* 1147 * Old cruft 1148 */ 1149SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) 1150{ 1151 int error = 0; 1152 1153 if (!name) 1154 return -EFAULT; 1155 1156 down_read(&uts_sem); 1157 if (copy_to_user(name, utsname(), sizeof(*name))) 1158 error = -EFAULT; 1159 up_read(&uts_sem); 1160 1161 if (!error && override_release(name->release, sizeof(name->release))) 1162 error = -EFAULT; 1163 if (!error && override_architecture(name)) 1164 error = -EFAULT; 1165 return error; 1166} 1167 1168SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) 1169{ 1170 int error; 1171 1172 if (!name) 1173 return -EFAULT; 1174 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) 1175 return -EFAULT; 1176 1177 down_read(&uts_sem); 1178 error = __copy_to_user(&name->sysname, &utsname()->sysname, 1179 __OLD_UTS_LEN); 1180 error |= __put_user(0, name->sysname + __OLD_UTS_LEN); 1181 error |= __copy_to_user(&name->nodename, &utsname()->nodename, 1182 __OLD_UTS_LEN); 1183 error |= __put_user(0, name->nodename + __OLD_UTS_LEN); 1184 error |= __copy_to_user(&name->release, &utsname()->release, 1185 __OLD_UTS_LEN); 1186 error |= __put_user(0, name->release + __OLD_UTS_LEN); 1187 error |= __copy_to_user(&name->version, &utsname()->version, 1188 __OLD_UTS_LEN); 1189 error |= __put_user(0, name->version + __OLD_UTS_LEN); 1190 error |= __copy_to_user(&name->machine, &utsname()->machine, 1191 __OLD_UTS_LEN); 1192 error |= __put_user(0, name->machine + __OLD_UTS_LEN); 1193 up_read(&uts_sem); 1194 1195 if (!error && override_architecture(name)) 1196 error = -EFAULT; 1197 if (!error && override_release(name->release, sizeof(name->release))) 1198 error = -EFAULT; 1199 return error ? -EFAULT : 0; 1200} 1201#endif 1202 1203SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1204{ 1205 int errno; 1206 char tmp[__NEW_UTS_LEN]; 1207 1208 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) 1209 return -EPERM; 1210 1211 if (len < 0 || len > __NEW_UTS_LEN) 1212 return -EINVAL; 1213 down_write(&uts_sem); 1214 errno = -EFAULT; 1215 if (!copy_from_user(tmp, name, len)) { 1216 struct new_utsname *u = utsname(); 1217 1218 memcpy(u->nodename, tmp, len); 1219 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1220 errno = 0; 1221 uts_proc_notify(UTS_PROC_HOSTNAME); 1222 } 1223 up_write(&uts_sem); 1224 return errno; 1225} 1226 1227#ifdef __ARCH_WANT_SYS_GETHOSTNAME 1228 1229SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) 1230{ 1231 int i, errno; 1232 struct new_utsname *u; 1233 1234 if (len < 0) 1235 return -EINVAL; 1236 down_read(&uts_sem); 1237 u = utsname(); 1238 i = 1 + strlen(u->nodename); 1239 if (i > len) 1240 i = len; 1241 errno = 0; 1242 if (copy_to_user(name, u->nodename, i)) 1243 errno = -EFAULT; 1244 up_read(&uts_sem); 1245 return errno; 1246} 1247 1248#endif 1249 1250/* 1251 * Only setdomainname; getdomainname can be implemented by calling 1252 * uname() 1253 */ 1254SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) 1255{ 1256 int errno; 1257 char tmp[__NEW_UTS_LEN]; 1258 1259 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) 1260 return -EPERM; 1261 if (len < 0 || len > __NEW_UTS_LEN) 1262 return -EINVAL; 1263 1264 down_write(&uts_sem); 1265 errno = -EFAULT; 1266 if (!copy_from_user(tmp, name, len)) { 1267 struct new_utsname *u = utsname(); 1268 1269 memcpy(u->domainname, tmp, len); 1270 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1271 errno = 0; 1272 uts_proc_notify(UTS_PROC_DOMAINNAME); 1273 } 1274 up_write(&uts_sem); 1275 return errno; 1276} 1277 1278SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1279{ 1280 struct rlimit value; 1281 int ret; 1282 1283 ret = do_prlimit(current, resource, NULL, &value); 1284 if (!ret) 1285 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1286 1287 return ret; 1288} 1289 1290#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1291 1292/* 1293 * Back compatibility for getrlimit. Needed for some apps. 1294 */ 1295SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, 1296 struct rlimit __user *, rlim) 1297{ 1298 struct rlimit x; 1299 if (resource >= RLIM_NLIMITS) 1300 return -EINVAL; 1301 1302 task_lock(current->group_leader); 1303 x = current->signal->rlim[resource]; 1304 task_unlock(current->group_leader); 1305 if (x.rlim_cur > 0x7FFFFFFF) 1306 x.rlim_cur = 0x7FFFFFFF; 1307 if (x.rlim_max > 0x7FFFFFFF) 1308 x.rlim_max = 0x7FFFFFFF; 1309 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; 1310} 1311 1312#endif 1313 1314static inline bool rlim64_is_infinity(__u64 rlim64) 1315{ 1316#if BITS_PER_LONG < 64 1317 return rlim64 >= ULONG_MAX; 1318#else 1319 return rlim64 == RLIM64_INFINITY; 1320#endif 1321} 1322 1323static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) 1324{ 1325 if (rlim->rlim_cur == RLIM_INFINITY) 1326 rlim64->rlim_cur = RLIM64_INFINITY; 1327 else 1328 rlim64->rlim_cur = rlim->rlim_cur; 1329 if (rlim->rlim_max == RLIM_INFINITY) 1330 rlim64->rlim_max = RLIM64_INFINITY; 1331 else 1332 rlim64->rlim_max = rlim->rlim_max; 1333} 1334 1335static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) 1336{ 1337 if (rlim64_is_infinity(rlim64->rlim_cur)) 1338 rlim->rlim_cur = RLIM_INFINITY; 1339 else 1340 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; 1341 if (rlim64_is_infinity(rlim64->rlim_max)) 1342 rlim->rlim_max = RLIM_INFINITY; 1343 else 1344 rlim->rlim_max = (unsigned long)rlim64->rlim_max; 1345} 1346 1347/* make sure you are allowed to change @tsk limits before calling this */ 1348int do_prlimit(struct task_struct *tsk, unsigned int resource, 1349 struct rlimit *new_rlim, struct rlimit *old_rlim) 1350{ 1351 struct rlimit *rlim; 1352 int retval = 0; 1353 1354 if (resource >= RLIM_NLIMITS) 1355 return -EINVAL; 1356 if (new_rlim) { 1357 if (new_rlim->rlim_cur > new_rlim->rlim_max) 1358 return -EINVAL; 1359 if (resource == RLIMIT_NOFILE && 1360 new_rlim->rlim_max > sysctl_nr_open) 1361 return -EPERM; 1362 } 1363 1364 /* protect tsk->signal and tsk->sighand from disappearing */ 1365 read_lock(&tasklist_lock); 1366 if (!tsk->sighand) { 1367 retval = -ESRCH; 1368 goto out; 1369 } 1370 1371 rlim = tsk->signal->rlim + resource; 1372 task_lock(tsk->group_leader); 1373 if (new_rlim) { 1374 /* Keep the capable check against init_user_ns until 1375 cgroups can contain all limits */ 1376 if (new_rlim->rlim_max > rlim->rlim_max && 1377 !capable(CAP_SYS_RESOURCE)) 1378 retval = -EPERM; 1379 if (!retval) 1380 retval = security_task_setrlimit(tsk->group_leader, 1381 resource, new_rlim); 1382 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { 1383 /* 1384 * The caller is asking for an immediate RLIMIT_CPU 1385 * expiry. But we use the zero value to mean "it was 1386 * never set". So let's cheat and make it one second 1387 * instead 1388 */ 1389 new_rlim->rlim_cur = 1; 1390 } 1391 } 1392 if (!retval) { 1393 if (old_rlim) 1394 *old_rlim = *rlim; 1395 if (new_rlim) 1396 *rlim = *new_rlim; 1397 } 1398 task_unlock(tsk->group_leader); 1399 1400 /* 1401 * RLIMIT_CPU handling. Note that the kernel fails to return an error 1402 * code if it rejected the user's attempt to set RLIMIT_CPU. This is a 1403 * very long-standing error, and fixing it now risks breakage of 1404 * applications, so we live with it 1405 */ 1406 if (!retval && new_rlim && resource == RLIMIT_CPU && 1407 new_rlim->rlim_cur != RLIM_INFINITY) 1408 update_rlimit_cpu(tsk, new_rlim->rlim_cur); 1409out: 1410 read_unlock(&tasklist_lock); 1411 return retval; 1412} 1413 1414/* rcu lock must be held */ 1415static int check_prlimit_permission(struct task_struct *task) 1416{ 1417 const struct cred *cred = current_cred(), *tcred; 1418 1419 if (current == task) 1420 return 0; 1421 1422 tcred = __task_cred(task); 1423 if (uid_eq(cred->uid, tcred->euid) && 1424 uid_eq(cred->uid, tcred->suid) && 1425 uid_eq(cred->uid, tcred->uid) && 1426 gid_eq(cred->gid, tcred->egid) && 1427 gid_eq(cred->gid, tcred->sgid) && 1428 gid_eq(cred->gid, tcred->gid)) 1429 return 0; 1430 if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) 1431 return 0; 1432 1433 return -EPERM; 1434} 1435 1436SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1437 const struct rlimit64 __user *, new_rlim, 1438 struct rlimit64 __user *, old_rlim) 1439{ 1440 struct rlimit64 old64, new64; 1441 struct rlimit old, new; 1442 struct task_struct *tsk; 1443 int ret; 1444 1445 if (new_rlim) { 1446 if (copy_from_user(&new64, new_rlim, sizeof(new64))) 1447 return -EFAULT; 1448 rlim64_to_rlim(&new64, &new); 1449 } 1450 1451 rcu_read_lock(); 1452 tsk = pid ? find_task_by_vpid(pid) : current; 1453 if (!tsk) { 1454 rcu_read_unlock(); 1455 return -ESRCH; 1456 } 1457 ret = check_prlimit_permission(tsk); 1458 if (ret) { 1459 rcu_read_unlock(); 1460 return ret; 1461 } 1462 get_task_struct(tsk); 1463 rcu_read_unlock(); 1464 1465 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, 1466 old_rlim ? &old : NULL); 1467 1468 if (!ret && old_rlim) { 1469 rlim_to_rlim64(&old, &old64); 1470 if (copy_to_user(old_rlim, &old64, sizeof(old64))) 1471 ret = -EFAULT; 1472 } 1473 1474 put_task_struct(tsk); 1475 return ret; 1476} 1477 1478SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1479{ 1480 struct rlimit new_rlim; 1481 1482 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1483 return -EFAULT; 1484 return do_prlimit(current, resource, &new_rlim, NULL); 1485} 1486 1487/* 1488 * It would make sense to put struct rusage in the task_struct, 1489 * except that would make the task_struct be *really big*. After 1490 * task_struct gets moved into malloc'ed memory, it would 1491 * make sense to do this. It will make moving the rest of the information 1492 * a lot simpler! (Which we're not doing right now because we're not 1493 * measuring them yet). 1494 * 1495 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have 1496 * races with threads incrementing their own counters. But since word 1497 * reads are atomic, we either get new values or old values and we don't 1498 * care which for the sums. We always take the siglock to protect reading 1499 * the c* fields from p->signal from races with exit.c updating those 1500 * fields when reaping, so a sample either gets all the additions of a 1501 * given child after it's reaped, or none so this sample is before reaping. 1502 * 1503 * Locking: 1504 * We need to take the siglock for CHILDEREN, SELF and BOTH 1505 * for the cases current multithreaded, non-current single threaded 1506 * non-current multithreaded. Thread traversal is now safe with 1507 * the siglock held. 1508 * Strictly speaking, we donot need to take the siglock if we are current and 1509 * single threaded, as no one else can take our signal_struct away, no one 1510 * else can reap the children to update signal->c* counters, and no one else 1511 * can race with the signal-> fields. If we do not take any lock, the 1512 * signal-> fields could be read out of order while another thread was just 1513 * exiting. So we should place a read memory barrier when we avoid the lock. 1514 * On the writer side, write memory barrier is implied in __exit_signal 1515 * as __exit_signal releases the siglock spinlock after updating the signal-> 1516 * fields. But we don't do this yet to keep things simple. 1517 * 1518 */ 1519 1520static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) 1521{ 1522 r->ru_nvcsw += t->nvcsw; 1523 r->ru_nivcsw += t->nivcsw; 1524 r->ru_minflt += t->min_flt; 1525 r->ru_majflt += t->maj_flt; 1526 r->ru_inblock += task_io_get_inblock(t); 1527 r->ru_oublock += task_io_get_oublock(t); 1528} 1529 1530static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1531{ 1532 struct task_struct *t; 1533 unsigned long flags; 1534 cputime_t tgutime, tgstime, utime, stime; 1535 unsigned long maxrss = 0; 1536 1537 memset((char *)r, 0, sizeof (*r)); 1538 utime = stime = 0; 1539 1540 if (who == RUSAGE_THREAD) { 1541 task_cputime_adjusted(current, &utime, &stime); 1542 accumulate_thread_rusage(p, r); 1543 maxrss = p->signal->maxrss; 1544 goto out; 1545 } 1546 1547 if (!lock_task_sighand(p, &flags)) 1548 return; 1549 1550 switch (who) { 1551 case RUSAGE_BOTH: 1552 case RUSAGE_CHILDREN: 1553 utime = p->signal->cutime; 1554 stime = p->signal->cstime; 1555 r->ru_nvcsw = p->signal->cnvcsw; 1556 r->ru_nivcsw = p->signal->cnivcsw; 1557 r->ru_minflt = p->signal->cmin_flt; 1558 r->ru_majflt = p->signal->cmaj_flt; 1559 r->ru_inblock = p->signal->cinblock; 1560 r->ru_oublock = p->signal->coublock; 1561 maxrss = p->signal->cmaxrss; 1562 1563 if (who == RUSAGE_CHILDREN) 1564 break; 1565 1566 case RUSAGE_SELF: 1567 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1568 utime += tgutime; 1569 stime += tgstime; 1570 r->ru_nvcsw += p->signal->nvcsw; 1571 r->ru_nivcsw += p->signal->nivcsw; 1572 r->ru_minflt += p->signal->min_flt; 1573 r->ru_majflt += p->signal->maj_flt; 1574 r->ru_inblock += p->signal->inblock; 1575 r->ru_oublock += p->signal->oublock; 1576 if (maxrss < p->signal->maxrss) 1577 maxrss = p->signal->maxrss; 1578 t = p; 1579 do { 1580 accumulate_thread_rusage(t, r); 1581 } while_each_thread(p, t); 1582 break; 1583 1584 default: 1585 BUG(); 1586 } 1587 unlock_task_sighand(p, &flags); 1588 1589out: 1590 cputime_to_timeval(utime, &r->ru_utime); 1591 cputime_to_timeval(stime, &r->ru_stime); 1592 1593 if (who != RUSAGE_CHILDREN) { 1594 struct mm_struct *mm = get_task_mm(p); 1595 1596 if (mm) { 1597 setmax_mm_hiwater_rss(&maxrss, mm); 1598 mmput(mm); 1599 } 1600 } 1601 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ 1602} 1603 1604int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1605{ 1606 struct rusage r; 1607 1608 k_getrusage(p, who, &r); 1609 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1610} 1611 1612SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) 1613{ 1614 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1615 who != RUSAGE_THREAD) 1616 return -EINVAL; 1617 return getrusage(current, who, ru); 1618} 1619 1620#ifdef CONFIG_COMPAT 1621COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) 1622{ 1623 struct rusage r; 1624 1625 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1626 who != RUSAGE_THREAD) 1627 return -EINVAL; 1628 1629 k_getrusage(current, who, &r); 1630 return put_compat_rusage(&r, ru); 1631} 1632#endif 1633 1634SYSCALL_DEFINE1(umask, int, mask) 1635{ 1636 mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); 1637 return mask; 1638} 1639 1640static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) 1641{ 1642 struct fd exe; 1643 struct inode *inode; 1644 int err; 1645 1646 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); 1647 1648 exe = fdget(fd); 1649 if (!exe.file) 1650 return -EBADF; 1651 1652 inode = file_inode(exe.file); 1653 1654 /* 1655 * Because the original mm->exe_file points to executable file, make 1656 * sure that this one is executable as well, to avoid breaking an 1657 * overall picture. 1658 */ 1659 err = -EACCES; 1660 if (!S_ISREG(inode->i_mode) || 1661 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) 1662 goto exit; 1663 1664 err = inode_permission(inode, MAY_EXEC); 1665 if (err) 1666 goto exit; 1667 1668 /* 1669 * Forbid mm->exe_file change if old file still mapped. 1670 */ 1671 err = -EBUSY; 1672 if (mm->exe_file) { 1673 struct vm_area_struct *vma; 1674 1675 for (vma = mm->mmap; vma; vma = vma->vm_next) 1676 if (vma->vm_file && 1677 path_equal(&vma->vm_file->f_path, 1678 &mm->exe_file->f_path)) 1679 goto exit; 1680 } 1681 1682 /* 1683 * The symlink can be changed only once, just to disallow arbitrary 1684 * transitions malicious software might bring in. This means one 1685 * could make a snapshot over all processes running and monitor 1686 * /proc/pid/exe changes to notice unusual activity if needed. 1687 */ 1688 err = -EPERM; 1689 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1690 goto exit; 1691 1692 err = 0; 1693 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ 1694exit: 1695 fdput(exe); 1696 return err; 1697} 1698 1699#ifdef CONFIG_CHECKPOINT_RESTORE 1700/* 1701 * WARNING: we don't require any capability here so be very careful 1702 * in what is allowed for modification from userspace. 1703 */ 1704static int validate_prctl_map(struct prctl_mm_map *prctl_map) 1705{ 1706 unsigned long mmap_max_addr = TASK_SIZE; 1707 struct mm_struct *mm = current->mm; 1708 int error = -EINVAL, i; 1709 1710 static const unsigned char offsets[] = { 1711 offsetof(struct prctl_mm_map, start_code), 1712 offsetof(struct prctl_mm_map, end_code), 1713 offsetof(struct prctl_mm_map, start_data), 1714 offsetof(struct prctl_mm_map, end_data), 1715 offsetof(struct prctl_mm_map, start_brk), 1716 offsetof(struct prctl_mm_map, brk), 1717 offsetof(struct prctl_mm_map, start_stack), 1718 offsetof(struct prctl_mm_map, arg_start), 1719 offsetof(struct prctl_mm_map, arg_end), 1720 offsetof(struct prctl_mm_map, env_start), 1721 offsetof(struct prctl_mm_map, env_end), 1722 }; 1723 1724 /* 1725 * Make sure the members are not somewhere outside 1726 * of allowed address space. 1727 */ 1728 for (i = 0; i < ARRAY_SIZE(offsets); i++) { 1729 u64 val = *(u64 *)((char *)prctl_map + offsets[i]); 1730 1731 if ((unsigned long)val >= mmap_max_addr || 1732 (unsigned long)val < mmap_min_addr) 1733 goto out; 1734 } 1735 1736 /* 1737 * Make sure the pairs are ordered. 1738 */ 1739#define __prctl_check_order(__m1, __op, __m2) \ 1740 ((unsigned long)prctl_map->__m1 __op \ 1741 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL 1742 error = __prctl_check_order(start_code, <, end_code); 1743 error |= __prctl_check_order(start_data, <, end_data); 1744 error |= __prctl_check_order(start_brk, <=, brk); 1745 error |= __prctl_check_order(arg_start, <=, arg_end); 1746 error |= __prctl_check_order(env_start, <=, env_end); 1747 if (error) 1748 goto out; 1749#undef __prctl_check_order 1750 1751 error = -EINVAL; 1752 1753 /* 1754 * @brk should be after @end_data in traditional maps. 1755 */ 1756 if (prctl_map->start_brk <= prctl_map->end_data || 1757 prctl_map->brk <= prctl_map->end_data) 1758 goto out; 1759 1760 /* 1761 * Neither we should allow to override limits if they set. 1762 */ 1763 if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, 1764 prctl_map->start_brk, prctl_map->end_data, 1765 prctl_map->start_data)) 1766 goto out; 1767 1768 /* 1769 * Someone is trying to cheat the auxv vector. 1770 */ 1771 if (prctl_map->auxv_size) { 1772 if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) 1773 goto out; 1774 } 1775 1776 /* 1777 * Finally, make sure the caller has the rights to 1778 * change /proc/pid/exe link: only local root should 1779 * be allowed to. 1780 */ 1781 if (prctl_map->exe_fd != (u32)-1) { 1782 struct user_namespace *ns = current_user_ns(); 1783 const struct cred *cred = current_cred(); 1784 1785 if (!uid_eq(cred->uid, make_kuid(ns, 0)) || 1786 !gid_eq(cred->gid, make_kgid(ns, 0))) 1787 goto out; 1788 } 1789 1790 error = 0; 1791out: 1792 return error; 1793} 1794 1795static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) 1796{ 1797 struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; 1798 unsigned long user_auxv[AT_VECTOR_SIZE]; 1799 struct mm_struct *mm = current->mm; 1800 int error; 1801 1802 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); 1803 BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); 1804 1805 if (opt == PR_SET_MM_MAP_SIZE) 1806 return put_user((unsigned int)sizeof(prctl_map), 1807 (unsigned int __user *)addr); 1808 1809 if (data_size != sizeof(prctl_map)) 1810 return -EINVAL; 1811 1812 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) 1813 return -EFAULT; 1814 1815 error = validate_prctl_map(&prctl_map); 1816 if (error) 1817 return error; 1818 1819 if (prctl_map.auxv_size) { 1820 memset(user_auxv, 0, sizeof(user_auxv)); 1821 if (copy_from_user(user_auxv, 1822 (const void __user *)prctl_map.auxv, 1823 prctl_map.auxv_size)) 1824 return -EFAULT; 1825 1826 /* Last entry must be AT_NULL as specification requires */ 1827 user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; 1828 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; 1829 } 1830 1831 down_write(&mm->mmap_sem); 1832 if (prctl_map.exe_fd != (u32)-1) 1833 error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); 1834 downgrade_write(&mm->mmap_sem); 1835 if (error) 1836 goto out; 1837 1838 /* 1839 * We don't validate if these members are pointing to 1840 * real present VMAs because application may have correspond 1841 * VMAs already unmapped and kernel uses these members for statistics 1842 * output in procfs mostly, except 1843 * 1844 * - @start_brk/@brk which are used in do_brk but kernel lookups 1845 * for VMAs when updating these memvers so anything wrong written 1846 * here cause kernel to swear at userspace program but won't lead 1847 * to any problem in kernel itself 1848 */ 1849 1850 mm->start_code = prctl_map.start_code; 1851 mm->end_code = prctl_map.end_code; 1852 mm->start_data = prctl_map.start_data; 1853 mm->end_data = prctl_map.end_data; 1854 mm->start_brk = prctl_map.start_brk; 1855 mm->brk = prctl_map.brk; 1856 mm->start_stack = prctl_map.start_stack; 1857 mm->arg_start = prctl_map.arg_start; 1858 mm->arg_end = prctl_map.arg_end; 1859 mm->env_start = prctl_map.env_start; 1860 mm->env_end = prctl_map.env_end; 1861 1862 /* 1863 * Note this update of @saved_auxv is lockless thus 1864 * if someone reads this member in procfs while we're 1865 * updating -- it may get partly updated results. It's 1866 * known and acceptable trade off: we leave it as is to 1867 * not introduce additional locks here making the kernel 1868 * more complex. 1869 */ 1870 if (prctl_map.auxv_size) 1871 memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); 1872 1873 error = 0; 1874out: 1875 up_read(&mm->mmap_sem); 1876 return error; 1877} 1878#endif /* CONFIG_CHECKPOINT_RESTORE */ 1879 1880static int prctl_set_mm(int opt, unsigned long addr, 1881 unsigned long arg4, unsigned long arg5) 1882{ 1883 struct mm_struct *mm = current->mm; 1884 struct vm_area_struct *vma; 1885 int error; 1886 1887 if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && 1888 opt != PR_SET_MM_MAP && 1889 opt != PR_SET_MM_MAP_SIZE))) 1890 return -EINVAL; 1891 1892#ifdef CONFIG_CHECKPOINT_RESTORE 1893 if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) 1894 return prctl_set_mm_map(opt, (const void __user *)addr, arg4); 1895#endif 1896 1897 if (!capable(CAP_SYS_RESOURCE)) 1898 return -EPERM; 1899 1900 if (opt == PR_SET_MM_EXE_FILE) { 1901 down_write(&mm->mmap_sem); 1902 error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr); 1903 up_write(&mm->mmap_sem); 1904 return error; 1905 } 1906 1907 if (addr >= TASK_SIZE || addr < mmap_min_addr) 1908 return -EINVAL; 1909 1910 error = -EINVAL; 1911 1912 down_read(&mm->mmap_sem); 1913 vma = find_vma(mm, addr); 1914 1915 switch (opt) { 1916 case PR_SET_MM_START_CODE: 1917 mm->start_code = addr; 1918 break; 1919 case PR_SET_MM_END_CODE: 1920 mm->end_code = addr; 1921 break; 1922 case PR_SET_MM_START_DATA: 1923 mm->start_data = addr; 1924 break; 1925 case PR_SET_MM_END_DATA: 1926 mm->end_data = addr; 1927 break; 1928 1929 case PR_SET_MM_START_BRK: 1930 if (addr <= mm->end_data) 1931 goto out; 1932 1933 if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, 1934 mm->end_data, mm->start_data)) 1935 goto out; 1936 1937 mm->start_brk = addr; 1938 break; 1939 1940 case PR_SET_MM_BRK: 1941 if (addr <= mm->end_data) 1942 goto out; 1943 1944 if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, 1945 mm->end_data, mm->start_data)) 1946 goto out; 1947 1948 mm->brk = addr; 1949 break; 1950 1951 /* 1952 * If command line arguments and environment 1953 * are placed somewhere else on stack, we can 1954 * set them up here, ARG_START/END to setup 1955 * command line argumets and ENV_START/END 1956 * for environment. 1957 */ 1958 case PR_SET_MM_START_STACK: 1959 case PR_SET_MM_ARG_START: 1960 case PR_SET_MM_ARG_END: 1961 case PR_SET_MM_ENV_START: 1962 case PR_SET_MM_ENV_END: 1963 if (!vma) { 1964 error = -EFAULT; 1965 goto out; 1966 } 1967 if (opt == PR_SET_MM_START_STACK) 1968 mm->start_stack = addr; 1969 else if (opt == PR_SET_MM_ARG_START) 1970 mm->arg_start = addr; 1971 else if (opt == PR_SET_MM_ARG_END) 1972 mm->arg_end = addr; 1973 else if (opt == PR_SET_MM_ENV_START) 1974 mm->env_start = addr; 1975 else if (opt == PR_SET_MM_ENV_END) 1976 mm->env_end = addr; 1977 break; 1978 1979 /* 1980 * This doesn't move auxiliary vector itself 1981 * since it's pinned to mm_struct, but allow 1982 * to fill vector with new values. It's up 1983 * to a caller to provide sane values here 1984 * otherwise user space tools which use this 1985 * vector might be unhappy. 1986 */ 1987 case PR_SET_MM_AUXV: { 1988 unsigned long user_auxv[AT_VECTOR_SIZE]; 1989 1990 if (arg4 > sizeof(user_auxv)) 1991 goto out; 1992 up_read(&mm->mmap_sem); 1993 1994 if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) 1995 return -EFAULT; 1996 1997 /* Make sure the last entry is always AT_NULL */ 1998 user_auxv[AT_VECTOR_SIZE - 2] = 0; 1999 user_auxv[AT_VECTOR_SIZE - 1] = 0; 2000 2001 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); 2002 2003 task_lock(current); 2004 memcpy(mm->saved_auxv, user_auxv, arg4); 2005 task_unlock(current); 2006 2007 return 0; 2008 } 2009 default: 2010 goto out; 2011 } 2012 2013 error = 0; 2014out: 2015 up_read(&mm->mmap_sem); 2016 return error; 2017} 2018 2019#ifdef CONFIG_CHECKPOINT_RESTORE 2020static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 2021{ 2022 return put_user(me->clear_child_tid, tid_addr); 2023} 2024#else 2025static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 2026{ 2027 return -EINVAL; 2028} 2029#endif 2030 2031#ifdef CONFIG_MMU 2032static int prctl_update_vma_anon_name(struct vm_area_struct *vma, 2033 struct vm_area_struct **prev, 2034 unsigned long start, unsigned long end, 2035 const char __user *name_addr) 2036{ 2037 struct mm_struct * mm = vma->vm_mm; 2038 int error = 0; 2039 pgoff_t pgoff; 2040 2041 if (name_addr == vma_get_anon_name(vma)) { 2042 *prev = vma; 2043 goto out; 2044 } 2045 2046 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 2047 *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma, 2048 vma->vm_file, pgoff, vma_policy(vma), 2049 name_addr); 2050 if (*prev) { 2051 vma = *prev; 2052 goto success; 2053 } 2054 2055 *prev = vma; 2056 2057 if (start != vma->vm_start) { 2058 error = split_vma(mm, vma, start, 1); 2059 if (error) 2060 goto out; 2061 } 2062 2063 if (end != vma->vm_end) { 2064 error = split_vma(mm, vma, end, 0); 2065 if (error) 2066 goto out; 2067 } 2068 2069success: 2070 if (!vma->vm_file) 2071 vma->shared.anon_name = name_addr; 2072 2073out: 2074 if (error == -ENOMEM) 2075 error = -EAGAIN; 2076 return error; 2077} 2078 2079static int prctl_set_vma_anon_name(unsigned long start, unsigned long end, 2080 unsigned long arg) 2081{ 2082 unsigned long tmp; 2083 struct vm_area_struct * vma, *prev; 2084 int unmapped_error = 0; 2085 int error = -EINVAL; 2086 2087 /* 2088 * If the interval [start,end) covers some unmapped address 2089 * ranges, just ignore them, but return -ENOMEM at the end. 2090 * - this matches the handling in madvise. 2091 */ 2092 vma = find_vma_prev(current->mm, start, &prev); 2093 if (vma && start > vma->vm_start) 2094 prev = vma; 2095 2096 for (;;) { 2097 /* Still start < end. */ 2098 error = -ENOMEM; 2099 if (!vma) 2100 return error; 2101 2102 /* Here start < (end|vma->vm_end). */ 2103 if (start < vma->vm_start) { 2104 unmapped_error = -ENOMEM; 2105 start = vma->vm_start; 2106 if (start >= end) 2107 return error; 2108 } 2109 2110 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 2111 tmp = vma->vm_end; 2112 if (end < tmp) 2113 tmp = end; 2114 2115 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 2116 error = prctl_update_vma_anon_name(vma, &prev, start, tmp, 2117 (const char __user *)arg); 2118 if (error) 2119 return error; 2120 start = tmp; 2121 if (prev && start < prev->vm_end) 2122 start = prev->vm_end; 2123 error = unmapped_error; 2124 if (start >= end) 2125 return error; 2126 if (prev) 2127 vma = prev->vm_next; 2128 else /* madvise_remove dropped mmap_sem */ 2129 vma = find_vma(current->mm, start); 2130 } 2131} 2132 2133static int prctl_set_vma(unsigned long opt, unsigned long start, 2134 unsigned long len_in, unsigned long arg) 2135{ 2136 struct mm_struct *mm = current->mm; 2137 int error; 2138 unsigned long len; 2139 unsigned long end; 2140 2141 if (start & ~PAGE_MASK) 2142 return -EINVAL; 2143 len = (len_in + ~PAGE_MASK) & PAGE_MASK; 2144 2145 /* Check to see whether len was rounded up from small -ve to zero */ 2146 if (len_in && !len) 2147 return -EINVAL; 2148 2149 end = start + len; 2150 if (end < start) 2151 return -EINVAL; 2152 2153 if (end == start) 2154 return 0; 2155 2156 down_write(&mm->mmap_sem); 2157 2158 switch (opt) { 2159 case PR_SET_VMA_ANON_NAME: 2160 error = prctl_set_vma_anon_name(start, end, arg); 2161 break; 2162 default: 2163 error = -EINVAL; 2164 } 2165 2166 up_write(&mm->mmap_sem); 2167 2168 return error; 2169} 2170#else /* CONFIG_MMU */ 2171static int prctl_set_vma(unsigned long opt, unsigned long start, 2172 unsigned long len_in, unsigned long arg) 2173{ 2174 return -EINVAL; 2175} 2176#endif 2177 2178SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 2179 unsigned long, arg4, unsigned long, arg5) 2180{ 2181 struct task_struct *me = current; 2182 struct task_struct *tsk; 2183 unsigned char comm[sizeof(me->comm)]; 2184 long error; 2185 2186 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 2187 if (error != -ENOSYS) 2188 return error; 2189 2190 error = 0; 2191 switch (option) { 2192 case PR_SET_PDEATHSIG: 2193 if (!valid_signal(arg2)) { 2194 error = -EINVAL; 2195 break; 2196 } 2197 me->pdeath_signal = arg2; 2198 break; 2199 case PR_GET_PDEATHSIG: 2200 error = put_user(me->pdeath_signal, (int __user *)arg2); 2201 break; 2202 case PR_GET_DUMPABLE: 2203 error = get_dumpable(me->mm); 2204 break; 2205 case PR_SET_DUMPABLE: 2206 if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { 2207 error = -EINVAL; 2208 break; 2209 } 2210 set_dumpable(me->mm, arg2); 2211 break; 2212 2213 case PR_SET_UNALIGN: 2214 error = SET_UNALIGN_CTL(me, arg2); 2215 break; 2216 case PR_GET_UNALIGN: 2217 error = GET_UNALIGN_CTL(me, arg2); 2218 break; 2219 case PR_SET_FPEMU: 2220 error = SET_FPEMU_CTL(me, arg2); 2221 break; 2222 case PR_GET_FPEMU: 2223 error = GET_FPEMU_CTL(me, arg2); 2224 break; 2225 case PR_SET_FPEXC: 2226 error = SET_FPEXC_CTL(me, arg2); 2227 break; 2228 case PR_GET_FPEXC: 2229 error = GET_FPEXC_CTL(me, arg2); 2230 break; 2231 case PR_GET_TIMING: 2232 error = PR_TIMING_STATISTICAL; 2233 break; 2234 case PR_SET_TIMING: 2235 if (arg2 != PR_TIMING_STATISTICAL) 2236 error = -EINVAL; 2237 break; 2238 case PR_SET_NAME: 2239 comm[sizeof(me->comm) - 1] = 0; 2240 if (strncpy_from_user(comm, (char __user *)arg2, 2241 sizeof(me->comm) - 1) < 0) 2242 return -EFAULT; 2243 set_task_comm(me, comm); 2244 proc_comm_connector(me); 2245 break; 2246 case PR_GET_NAME: 2247 get_task_comm(comm, me); 2248 if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) 2249 return -EFAULT; 2250 break; 2251 case PR_GET_ENDIAN: 2252 error = GET_ENDIAN(me, arg2); 2253 break; 2254 case PR_SET_ENDIAN: 2255 error = SET_ENDIAN(me, arg2); 2256 break; 2257 case PR_GET_SECCOMP: 2258 error = prctl_get_seccomp(); 2259 break; 2260 case PR_SET_SECCOMP: 2261 error = prctl_set_seccomp(arg2, (char __user *)arg3); 2262 break; 2263 case PR_GET_TSC: 2264 error = GET_TSC_CTL(arg2); 2265 break; 2266 case PR_SET_TSC: 2267 error = SET_TSC_CTL(arg2); 2268 break; 2269 case PR_TASK_PERF_EVENTS_DISABLE: 2270 error = perf_event_task_disable(); 2271 break; 2272 case PR_TASK_PERF_EVENTS_ENABLE: 2273 error = perf_event_task_enable(); 2274 break; 2275 case PR_GET_TIMERSLACK: 2276 error = current->timer_slack_ns; 2277 break; 2278 case PR_SET_TIMERSLACK: 2279 if (arg2 <= 0) 2280 current->timer_slack_ns = 2281 current->default_timer_slack_ns; 2282 else 2283 current->timer_slack_ns = arg2; 2284 break; 2285 case PR_MCE_KILL: 2286 if (arg4 | arg5) 2287 return -EINVAL; 2288 switch (arg2) { 2289 case PR_MCE_KILL_CLEAR: 2290 if (arg3 != 0) 2291 return -EINVAL; 2292 current->flags &= ~PF_MCE_PROCESS; 2293 break; 2294 case PR_MCE_KILL_SET: 2295 current->flags |= PF_MCE_PROCESS; 2296 if (arg3 == PR_MCE_KILL_EARLY) 2297 current->flags |= PF_MCE_EARLY; 2298 else if (arg3 == PR_MCE_KILL_LATE) 2299 current->flags &= ~PF_MCE_EARLY; 2300 else if (arg3 == PR_MCE_KILL_DEFAULT) 2301 current->flags &= 2302 ~(PF_MCE_EARLY|PF_MCE_PROCESS); 2303 else 2304 return -EINVAL; 2305 break; 2306 default: 2307 return -EINVAL; 2308 } 2309 break; 2310 case PR_MCE_KILL_GET: 2311 if (arg2 | arg3 | arg4 | arg5) 2312 return -EINVAL; 2313 if (current->flags & PF_MCE_PROCESS) 2314 error = (current->flags & PF_MCE_EARLY) ? 2315 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; 2316 else 2317 error = PR_MCE_KILL_DEFAULT; 2318 break; 2319 case PR_SET_MM: 2320 error = prctl_set_mm(arg2, arg3, arg4, arg5); 2321 break; 2322 case PR_GET_TID_ADDRESS: 2323 error = prctl_get_tid_address(me, (int __user **)arg2); 2324 break; 2325 case PR_SET_TIMERSLACK_PID: 2326 if (task_pid_vnr(current) != (pid_t)arg3 && 2327 !capable(CAP_SYS_NICE)) 2328 return -EPERM; 2329 rcu_read_lock(); 2330 tsk = find_task_by_vpid((pid_t)arg3); 2331 if (tsk == NULL) { 2332 rcu_read_unlock(); 2333 return -EINVAL; 2334 } 2335 get_task_struct(tsk); 2336 rcu_read_unlock(); 2337 if (arg2 <= 0) 2338 tsk->timer_slack_ns = 2339 tsk->default_timer_slack_ns; 2340 else 2341 tsk->timer_slack_ns = arg2; 2342 put_task_struct(tsk); 2343 error = 0; 2344 break; 2345 case PR_SET_CHILD_SUBREAPER: 2346 me->signal->is_child_subreaper = !!arg2; 2347 break; 2348 case PR_GET_CHILD_SUBREAPER: 2349 error = put_user(me->signal->is_child_subreaper, 2350 (int __user *)arg2); 2351 break; 2352 case PR_SET_NO_NEW_PRIVS: 2353 if (arg2 != 1 || arg3 || arg4 || arg5) 2354 return -EINVAL; 2355 2356 task_set_no_new_privs(current); 2357 break; 2358 case PR_GET_NO_NEW_PRIVS: 2359 if (arg2 || arg3 || arg4 || arg5) 2360 return -EINVAL; 2361 return task_no_new_privs(current) ? 1 : 0; 2362 case PR_GET_THP_DISABLE: 2363 if (arg2 || arg3 || arg4 || arg5) 2364 return -EINVAL; 2365 error = !!(me->mm->def_flags & VM_NOHUGEPAGE); 2366 break; 2367 case PR_SET_THP_DISABLE: 2368 if (arg3 || arg4 || arg5) 2369 return -EINVAL; 2370 down_write(&me->mm->mmap_sem); 2371 if (arg2) 2372 me->mm->def_flags |= VM_NOHUGEPAGE; 2373 else 2374 me->mm->def_flags &= ~VM_NOHUGEPAGE; 2375 up_write(&me->mm->mmap_sem); 2376 break; 2377 case PR_SET_VMA: 2378 error = prctl_set_vma(arg2, arg3, arg4, arg5); 2379 break; 2380 default: 2381 error = -EINVAL; 2382 break; 2383 } 2384 return error; 2385} 2386 2387SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, 2388 struct getcpu_cache __user *, unused) 2389{ 2390 int err = 0; 2391 int cpu = raw_smp_processor_id(); 2392 2393 if (cpup) 2394 err |= put_user(cpu, cpup); 2395 if (nodep) 2396 err |= put_user(cpu_to_node(cpu), nodep); 2397 return err ? -EFAULT : 0; 2398} 2399 2400/** 2401 * do_sysinfo - fill in sysinfo struct 2402 * @info: pointer to buffer to fill 2403 */ 2404static int do_sysinfo(struct sysinfo *info) 2405{ 2406 unsigned long mem_total, sav_total; 2407 unsigned int mem_unit, bitcount; 2408 struct timespec tp; 2409 2410 memset(info, 0, sizeof(struct sysinfo)); 2411 2412 get_monotonic_boottime(&tp); 2413 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); 2414 2415 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); 2416 2417 info->procs = nr_threads; 2418 2419 si_meminfo(info); 2420 si_swapinfo(info); 2421 2422 /* 2423 * If the sum of all the available memory (i.e. ram + swap) 2424 * is less than can be stored in a 32 bit unsigned long then 2425 * we can be binary compatible with 2.2.x kernels. If not, 2426 * well, in that case 2.2.x was broken anyways... 2427 * 2428 * -Erik Andersen <andersee@debian.org> 2429 */ 2430 2431 mem_total = info->totalram + info->totalswap; 2432 if (mem_total < info->totalram || mem_total < info->totalswap) 2433 goto out; 2434 bitcount = 0; 2435 mem_unit = info->mem_unit; 2436 while (mem_unit > 1) { 2437 bitcount++; 2438 mem_unit >>= 1; 2439 sav_total = mem_total; 2440 mem_total <<= 1; 2441 if (mem_total < sav_total) 2442 goto out; 2443 } 2444 2445 /* 2446 * If mem_total did not overflow, multiply all memory values by 2447 * info->mem_unit and set it to 1. This leaves things compatible 2448 * with 2.2.x, and also retains compatibility with earlier 2.4.x 2449 * kernels... 2450 */ 2451 2452 info->mem_unit = 1; 2453 info->totalram <<= bitcount; 2454 info->freeram <<= bitcount; 2455 info->sharedram <<= bitcount; 2456 info->bufferram <<= bitcount; 2457 info->totalswap <<= bitcount; 2458 info->freeswap <<= bitcount; 2459 info->totalhigh <<= bitcount; 2460 info->freehigh <<= bitcount; 2461 2462out: 2463 return 0; 2464} 2465 2466SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) 2467{ 2468 struct sysinfo val; 2469 2470 do_sysinfo(&val); 2471 2472 if (copy_to_user(info, &val, sizeof(struct sysinfo))) 2473 return -EFAULT; 2474 2475 return 0; 2476} 2477 2478#ifdef CONFIG_COMPAT 2479struct compat_sysinfo { 2480 s32 uptime; 2481 u32 loads[3]; 2482 u32 totalram; 2483 u32 freeram; 2484 u32 sharedram; 2485 u32 bufferram; 2486 u32 totalswap; 2487 u32 freeswap; 2488 u16 procs; 2489 u16 pad; 2490 u32 totalhigh; 2491 u32 freehigh; 2492 u32 mem_unit; 2493 char _f[20-2*sizeof(u32)-sizeof(int)]; 2494}; 2495 2496COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) 2497{ 2498 struct sysinfo s; 2499 2500 do_sysinfo(&s); 2501 2502 /* Check to see if any memory value is too large for 32-bit and scale 2503 * down if needed 2504 */ 2505 if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { 2506 int bitcount = 0; 2507 2508 while (s.mem_unit < PAGE_SIZE) { 2509 s.mem_unit <<= 1; 2510 bitcount++; 2511 } 2512 2513 s.totalram >>= bitcount; 2514 s.freeram >>= bitcount; 2515 s.sharedram >>= bitcount; 2516 s.bufferram >>= bitcount; 2517 s.totalswap >>= bitcount; 2518 s.freeswap >>= bitcount; 2519 s.totalhigh >>= bitcount; 2520 s.freehigh >>= bitcount; 2521 } 2522 2523 if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || 2524 __put_user(s.uptime, &info->uptime) || 2525 __put_user(s.loads[0], &info->loads[0]) || 2526 __put_user(s.loads[1], &info->loads[1]) || 2527 __put_user(s.loads[2], &info->loads[2]) || 2528 __put_user(s.totalram, &info->totalram) || 2529 __put_user(s.freeram, &info->freeram) || 2530 __put_user(s.sharedram, &info->sharedram) || 2531 __put_user(s.bufferram, &info->bufferram) || 2532 __put_user(s.totalswap, &info->totalswap) || 2533 __put_user(s.freeswap, &info->freeswap) || 2534 __put_user(s.procs, &info->procs) || 2535 __put_user(s.totalhigh, &info->totalhigh) || 2536 __put_user(s.freehigh, &info->freehigh) || 2537 __put_user(s.mem_unit, &info->mem_unit)) 2538 return -EFAULT; 2539 2540 return 0; 2541} 2542#endif /* CONFIG_COMPAT */ 2543