xpc_main.c revision 261f3b4979db88d29fc86aad9f76fbc0c2c6d21a
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
 */

/*
 * Cross Partition Communication (XPC) support - standard version.
 *
 * XPC provides a message passing capability that crosses partition
 * boundaries. This module is made up of two parts:
 *
 *	partition	This part detects the presence/absence of other
 *			partitions. It provides a heartbeat and monitors
 *			the heartbeats of other partitions.
 *
 *	channel		This part manages the channels and sends/receives
 *			messages across them to/from other partitions.
 *
 * There are a couple of additional functions residing in XP, which
 * provide an interface to XPC for its users.
 *
 *
 * Caveats:
 *
 *   . Currently on sn2, we have no way to determine which nasid an IRQ
 *     came from. Thus, xpc_send_IRQ_sn2() does a remote amo write
 *     followed by an IPI. The amo indicates where data is to be pulled
 *     from, so after the IPI arrives, the remote partition checks the amo
 *     word. The IPI can actually arrive before the amo however, so other
 *     code must periodically check for this case. Also, remote amo
 *     operations do not reliably time out. Thus we do a remote PIO read
 *     solely to know whether the remote partition is down and whether we
 *     should stop sending IPIs to it. This remote PIO read operation is
 *     set up in a special nofault region so SAL knows to ignore (and
 *     clean up) any errors due to the remote amo write, PIO read, and/or
 *     PIO write operations.
 *
 *     If/when new hardware solves this IPI problem, we should abandon
 *     the current approach.
 *
 */
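
/*
 * Illustrative timeline of the sn2 caveat above (a sketch of the race,
 * not driver code): the remote amo write and the IPI are separate
 * operations, so the receiver may observe them in either order.
 *
 *	sender				receiver
 *	------				--------
 *	remote amo write
 *	send IPI		--->	IPI arrives first; the amo word
 *					may not yet reflect the write
 *	amo write lands		--->	picked up by a later periodic
 *					check (see the force_IRQ recheck
 *					in xpc_hb_checker() below)
 */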

#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/kdebug.h>
#include <linux/kthread.h>
#include "xpc.h"

/* define two XPC debug device structures to be used with dev_dbg() et al */

struct device_driver xpc_dbg_name = {
	.name = "xpc"
};

struct device xpc_part_dbg_subname = {
	.bus_id = {0},		/* set to "part" at xpc_init() time */
	.driver = &xpc_dbg_name
};

struct device xpc_chan_dbg_subname = {
	.bus_id = {0},		/* set to "chan" at xpc_init() time */
	.driver = &xpc_dbg_name
};

struct device *xpc_part = &xpc_part_dbg_subname;
struct device *xpc_chan = &xpc_chan_dbg_subname;

static int xpc_kdebug_ignore;

/* systune related variables for /proc/sys directories */

static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
static int xpc_hb_min_interval = 1;
static int xpc_hb_max_interval = 10;

static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
static int xpc_hb_check_min_interval = 10;
static int xpc_hb_check_max_interval = 120;

int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT;
static int xpc_disengage_min_timelimit;	/* = 0 */
static int xpc_disengage_max_timelimit = 120;

static ctl_table xpc_sys_xpc_hb_dir[] = {
	{
	 .ctl_name = CTL_UNNUMBERED,
	 .procname = "hb_interval",
	 .data = &xpc_hb_interval,
	 .maxlen = sizeof(int),
	 .mode = 0644,
	 .proc_handler = &proc_dointvec_minmax,
	 .strategy = &sysctl_intvec,
	 .extra1 = &xpc_hb_min_interval,
	 .extra2 = &xpc_hb_max_interval},
	{
	 .ctl_name = CTL_UNNUMBERED,
	 .procname = "hb_check_interval",
	 .data = &xpc_hb_check_interval,
	 .maxlen = sizeof(int),
	 .mode = 0644,
	 .proc_handler = &proc_dointvec_minmax,
	 .strategy = &sysctl_intvec,
	 .extra1 = &xpc_hb_check_min_interval,
	 .extra2 = &xpc_hb_check_max_interval},
	{}
};
static ctl_table xpc_sys_xpc_dir[] = {
	{
	 .ctl_name = CTL_UNNUMBERED,
	 .procname = "hb",
	 .mode = 0555,
	 .child = xpc_sys_xpc_hb_dir},
	{
	 .ctl_name = CTL_UNNUMBERED,
	 .procname = "disengage_timelimit",
	 .data = &xpc_disengage_timelimit,
	 .maxlen = sizeof(int),
	 .mode = 0644,
	 .proc_handler = &proc_dointvec_minmax,
	 .strategy = &sysctl_intvec,
	 .extra1 = &xpc_disengage_min_timelimit,
	 .extra2 = &xpc_disengage_max_timelimit},
	{}
};
static ctl_table xpc_sys_dir[] = {
	{
	 .ctl_name = CTL_UNNUMBERED,
	 .procname = "xpc",
	 .mode = 0555,
	 .child = xpc_sys_xpc_dir},
	{}
};
static struct ctl_table_header *xpc_sysctl;
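
/*
 * Once xpc_sys_dir is registered (see register_sysctl_table() in
 * xpc_init() below), the tables above should surface as the following
 * tunables (illustrative paths and shell usage; the ranges come from
 * the extra1/extra2 bounds above):
 *
 *	/proc/sys/xpc/hb/hb_interval		(1..10 seconds)
 *	/proc/sys/xpc/hb/hb_check_interval	(10..120 seconds)
 *	/proc/sys/xpc/disengage_timelimit	(0..120 seconds)
 *
 *	# e.g. check remote heartbeats every 30 seconds instead
 *	echo 30 > /proc/sys/xpc/hb/hb_check_interval
 */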

/* non-zero if any remote partition disengage was timed out */
int xpc_disengage_timedout;

/* #of activate IRQs received */
atomic_t xpc_activate_IRQ_rcvd = ATOMIC_INIT(0);

/* IRQ handler notifies this wait queue on receipt of an IRQ */
DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);

static unsigned long xpc_hb_check_timeout;
static struct timer_list xpc_hb_timer;
void *xpc_heartbeating_to_mask;

/* notification that the xpc_hb_checker thread has exited */
static DECLARE_COMPLETION(xpc_hb_checker_exited);

/* notification that the xpc_discovery thread has exited */
static DECLARE_COMPLETION(xpc_discovery_exited);

static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);

static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_reboot_notifier = {
	.notifier_call = xpc_system_reboot,
};

static int xpc_system_die(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_die_notifier = {
	.notifier_call = xpc_system_die,
};
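
/*
 * The function pointers below form XPC's interface to the hardware-
 * specific implementations; they are presumably filled in when
 * xpc_init() calls xpc_init_sn2() or xpc_init_uv() (see xpc_init()
 * below).
 */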

enum xp_retval (*xpc_get_partition_rsvd_page_pa) (u64 buf, u64 *cookie,
						  u64 *paddr, size_t *len);
enum xp_retval (*xpc_rsvd_page_init) (struct xpc_rsvd_page *rp);
void (*xpc_heartbeat_init) (void);
void (*xpc_heartbeat_exit) (void);
void (*xpc_increment_heartbeat) (void);
void (*xpc_offline_heartbeat) (void);
void (*xpc_online_heartbeat) (void);
void (*xpc_check_remote_hb) (void);

enum xp_retval (*xpc_make_first_contact) (struct xpc_partition *part);
void (*xpc_notify_senders_of_disconnect) (struct xpc_channel *ch);
u64 (*xpc_get_chctl_all_flags) (struct xpc_partition *part);
enum xp_retval (*xpc_allocate_msgqueues) (struct xpc_channel *ch);
void (*xpc_free_msgqueues) (struct xpc_channel *ch);
void (*xpc_process_msg_chctl_flags) (struct xpc_partition *part, int ch_number);
int (*xpc_n_of_deliverable_msgs) (struct xpc_channel *ch);
struct xpc_msg *(*xpc_get_deliverable_msg) (struct xpc_channel *ch);

void (*xpc_request_partition_activation) (struct xpc_rsvd_page *remote_rp,
					  u64 remote_rp_pa, int nasid);
void (*xpc_request_partition_reactivation) (struct xpc_partition *part);
void (*xpc_request_partition_deactivation) (struct xpc_partition *part);
void (*xpc_cancel_partition_deactivation_request) (struct xpc_partition *part);

void (*xpc_process_activate_IRQ_rcvd) (int n_IRQs_expected);
enum xp_retval (*xpc_setup_infrastructure) (struct xpc_partition *part);
void (*xpc_teardown_infrastructure) (struct xpc_partition *part);

void (*xpc_indicate_partition_engaged) (struct xpc_partition *part);
int (*xpc_partition_engaged) (short partid);
int (*xpc_any_partition_engaged) (void);
void (*xpc_indicate_partition_disengaged) (struct xpc_partition *part);
void (*xpc_assume_partition_disengaged) (short partid);

void (*xpc_send_chctl_closerequest) (struct xpc_channel *ch,
				     unsigned long *irq_flags);
void (*xpc_send_chctl_closereply) (struct xpc_channel *ch,
				   unsigned long *irq_flags);
void (*xpc_send_chctl_openrequest) (struct xpc_channel *ch,
				    unsigned long *irq_flags);
void (*xpc_send_chctl_openreply) (struct xpc_channel *ch,
				  unsigned long *irq_flags);

enum xp_retval (*xpc_send_msg) (struct xpc_channel *ch, u32 flags,
				void *payload, u16 payload_size,
				u8 notify_type, xpc_notify_func func,
				void *key);
void (*xpc_received_msg) (struct xpc_channel *ch, struct xpc_msg *msg);

/*
 * Timer function to enforce the timelimit on the partition disengage.
 */
static void
xpc_timeout_partition_disengage(unsigned long data)
{
	struct xpc_partition *part = (struct xpc_partition *)data;

	DBUG_ON(time_is_after_jiffies(part->disengage_timeout));

	(void)xpc_partition_disengaged(part);

	DBUG_ON(part->disengage_timeout != 0);
	DBUG_ON(xpc_partition_engaged(XPC_PARTID(part)));
}

/*
 * Timer to produce the heartbeat. The timer structure's function is
 * already set when this is initially called. A tunable is used to
 * specify when the next timeout should occur.
 */
static void
xpc_hb_beater(unsigned long dummy)
{
	xpc_increment_heartbeat();

	if (time_is_before_eq_jiffies(xpc_hb_check_timeout))
		wake_up_interruptible(&xpc_activate_IRQ_wq);

	xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
	add_timer(&xpc_hb_timer);
}

static void
xpc_start_hb_beater(void)
{
	xpc_heartbeat_init();
	init_timer(&xpc_hb_timer);
	xpc_hb_timer.function = xpc_hb_beater;
	xpc_hb_beater(0);
}

static void
xpc_stop_hb_beater(void)
{
	del_timer_sync(&xpc_hb_timer);
	xpc_heartbeat_exit();
}
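
/*
 * How the two heartbeat tunables interact (a worked example, assuming
 * the defaults are 5 seconds for xpc_hb_interval and 20 seconds for
 * xpc_hb_check_interval): xpc_hb_beater() re-arms itself every 5
 * seconds to increment our local heartbeat, and once the 20-second
 * xpc_hb_check_timeout has passed it wakes xpc_hb_checker() below to
 * inspect the heartbeats of the other partitions.
 */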

/*
 * This thread is responsible for nearly all of the partition
 * activation/deactivation.
 */
static int
xpc_hb_checker(void *ignore)
{
	int last_IRQ_count = 0;
	int new_IRQ_count;
	int force_IRQ = 0;

	/* this thread was marked active by xpc_hb_init() */

	set_cpus_allowed_ptr(current, &cpumask_of_cpu(XPC_HB_CHECK_CPU));

	/* set our heartbeating to other partitions into motion */
	xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
	xpc_start_hb_beater();

	while (!xpc_exiting) {

		dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
			"been received\n",
			(int)(xpc_hb_check_timeout - jiffies),
			atomic_read(&xpc_activate_IRQ_rcvd) - last_IRQ_count);

		/* checking of remote heartbeats is skewed by IRQ handling */
		if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) {
			dev_dbg(xpc_part, "checking remote heartbeats\n");
			xpc_check_remote_hb();

			/*
			 * We need to periodically recheck to ensure no
			 * IRQ/amo pairs have been missed. That check
			 * must always reset xpc_hb_check_timeout.
			 */
			force_IRQ = 1;
		}

		/* check for outstanding IRQs */
		new_IRQ_count = atomic_read(&xpc_activate_IRQ_rcvd);
		if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
			force_IRQ = 0;

			dev_dbg(xpc_part, "found an IRQ to process; will be "
				"resetting xpc_hb_check_timeout\n");

			xpc_process_activate_IRQ_rcvd(new_IRQ_count -
						      last_IRQ_count);
			last_IRQ_count = new_IRQ_count;

			xpc_hb_check_timeout = jiffies +
			    (xpc_hb_check_interval * HZ);
		}

		/* wait for IRQ or timeout */
		(void)wait_event_interruptible(xpc_activate_IRQ_wq,
					       (last_IRQ_count < atomic_read(
						&xpc_activate_IRQ_rcvd)
						|| time_is_before_eq_jiffies(
						xpc_hb_check_timeout) ||
						xpc_exiting));
	}

	xpc_stop_hb_beater();

	dev_dbg(xpc_part, "heartbeat checker is exiting\n");

	/* mark this thread as having exited */
	complete(&xpc_hb_checker_exited);
	return 0;
}

/*
 * This thread will attempt to discover other partitions to activate
 * based on info provided by SAL. This new thread is short lived and
 * will exit once discovery is complete.
 */
static int
xpc_initiate_discovery(void *ignore)
{
	xpc_discovery();

	dev_dbg(xpc_part, "discovery thread is exiting\n");

	/* mark this thread as having exited */
	complete(&xpc_discovery_exited);
	return 0;
}

/*
 * The first kthread assigned to a newly activated partition is the one
 * created by XPC HB with which it calls xpc_activating(). XPC hangs on to
 * that kthread until the partition is brought down, at which time that
 * kthread returns to XPC HB. (The return of that kthread will signify to
 * XPC HB that XPC has dismantled all communication infrastructure for the
 * associated partition.) This kthread becomes the channel manager for that
 * partition.
 *
 * Each active partition has a channel manager, who, besides connecting and
 * disconnecting channels, will ensure that each of the partition's connected
 * channels has the required number of assigned kthreads to get the work done.
 */
static void
xpc_channel_mgr(struct xpc_partition *part)
{
	while (part->act_state != XPC_P_DEACTIVATING ||
	       atomic_read(&part->nchannels_active) > 0 ||
	       !xpc_partition_disengaged(part)) {

		xpc_process_sent_chctl_flags(part);

		/*
		 * Wait until we've been requested to activate kthreads or
		 * all of the channel's message queues have been torn down or
		 * a signal is pending.
		 *
		 * The channel_mgr_requests is set to 1 after being awakened.
		 * This is done to prevent the channel mgr from making one pass
		 * through the loop for each request, since he will
		 * be servicing all the requests in one pass. The reason it's
		 * set to 1 instead of 0 is so that other kthreads will know
		 * that the channel mgr is running and won't bother trying to
		 * wake him up.
		 */
		atomic_dec(&part->channel_mgr_requests);
		(void)wait_event_interruptible(part->channel_mgr_wq,
			       (atomic_read(&part->channel_mgr_requests) > 0 ||
				part->chctl.all_flags != 0 ||
				(part->act_state == XPC_P_DEACTIVATING &&
				atomic_read(&part->nchannels_active) == 0 &&
				xpc_partition_disengaged(part))));
		atomic_set(&part->channel_mgr_requests, 1);
	}
}

/*
 * When XPC HB determines that a partition has come up, it will create a new
 * kthread and that kthread will call this function to attempt to set up the
 * basic infrastructure used for Cross Partition Communication with the newly
 * upped partition.
 *
 * The kthread that was created by XPC HB and which set up the XPC
 * infrastructure will remain assigned to the partition, becoming the channel
 * manager for that partition, until the partition is deactivating, at which
 * time the kthread will tear down the XPC infrastructure and then exit.
 */
static int
xpc_activating(void *__partid)
{
	short partid = (u64)__partid;
	struct xpc_partition *part = &xpc_partitions[partid];
	unsigned long irq_flags;

	DBUG_ON(partid < 0 || partid >= xp_max_npartitions);

	spin_lock_irqsave(&part->act_lock, irq_flags);

	if (part->act_state == XPC_P_DEACTIVATING) {
		part->act_state = XPC_P_INACTIVE;
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
		part->remote_rp_pa = 0;
		return 0;
	}

	/* indicate the thread is activating */
	DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ);
	part->act_state = XPC_P_ACTIVATING;

	XPC_SET_REASON(part, 0, 0);
	spin_unlock_irqrestore(&part->act_lock, irq_flags);

	dev_dbg(xpc_part, "activating partition %d\n", partid);

	xpc_allow_hb(partid);

	if (xpc_setup_infrastructure(part) == xpSuccess) {
		(void)xpc_part_ref(part);	/* this will always succeed */

		if (xpc_make_first_contact(part) == xpSuccess) {
			xpc_mark_partition_active(part);
			xpc_channel_mgr(part);
			/* won't return until partition is deactivating */
		}

		xpc_part_deref(part);
		xpc_teardown_infrastructure(part);
	}

	xpc_disallow_hb(partid);
	xpc_mark_partition_inactive(part);

	if (part->reason == xpReactivating) {
		/* interrupting ourselves results in activating partition */
		xpc_request_partition_reactivation(part);
	}

	return 0;
}

void
xpc_activate_partition(struct xpc_partition *part)
{
	short partid = XPC_PARTID(part);
	unsigned long irq_flags;
	struct task_struct *kthread;

	spin_lock_irqsave(&part->act_lock, irq_flags);

	DBUG_ON(part->act_state != XPC_P_INACTIVE);

	part->act_state = XPC_P_ACTIVATION_REQ;
	XPC_SET_REASON(part, xpCloneKThread, __LINE__);

	spin_unlock_irqrestore(&part->act_lock, irq_flags);

	kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
			      partid);
	if (IS_ERR(kthread)) {
		spin_lock_irqsave(&part->act_lock, irq_flags);
		part->act_state = XPC_P_INACTIVE;
		XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__);
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
	}
}

void
xpc_activate_kthreads(struct xpc_channel *ch, int needed)
{
	int idle = atomic_read(&ch->kthreads_idle);
	int assigned = atomic_read(&ch->kthreads_assigned);
	int wakeup;

	DBUG_ON(needed <= 0);

	if (idle > 0) {
		wakeup = (needed > idle) ? idle : needed;
		needed -= wakeup;

		dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
			"channel=%d\n", wakeup, ch->partid, ch->number);

		/* only wakeup the requested number of kthreads */
		wake_up_nr(&ch->idle_wq, wakeup);
	}

	if (needed <= 0)
		return;

	if (needed + assigned > ch->kthreads_assigned_limit) {
		needed = ch->kthreads_assigned_limit - assigned;
		if (needed <= 0)
			return;
	}

	dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
		needed, ch->partid, ch->number);

	xpc_create_kthreads(ch, needed, 0);
}

/*
 * This function is where XPC's kthreads wait for messages to deliver.
 */
static void
xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
{
	do {
		/* deliver messages to their intended recipients */

		while (xpc_n_of_deliverable_msgs(ch) > 0 &&
		       !(ch->flags & XPC_C_DISCONNECTING)) {
			xpc_deliver_msg(ch);
		}

		if (atomic_inc_return(&ch->kthreads_idle) >
		    ch->kthreads_idle_limit) {
			/* too many idle kthreads on this channel */
			atomic_dec(&ch->kthreads_idle);
			break;
		}

		dev_dbg(xpc_chan, "idle kthread calling "
			"wait_event_interruptible_exclusive()\n");

		(void)wait_event_interruptible_exclusive(ch->idle_wq,
				(xpc_n_of_deliverable_msgs(ch) > 0 ||
				 (ch->flags & XPC_C_DISCONNECTING)));

		atomic_dec(&ch->kthreads_idle);

	} while (!(ch->flags & XPC_C_DISCONNECTING));
}

static int
xpc_kthread_start(void *args)
{
	short partid = XPC_UNPACK_ARG1(args);
	u16 ch_number = XPC_UNPACK_ARG2(args);
	struct xpc_partition *part = &xpc_partitions[partid];
	struct xpc_channel *ch;
	int n_needed;
	unsigned long irq_flags;

	dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
		partid, ch_number);

	ch = &part->channels[ch_number];

	if (!(ch->flags & XPC_C_DISCONNECTING)) {

		/* let registerer know that connection has been established */

		spin_lock_irqsave(&ch->lock, irq_flags);
		if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
			ch->flags |= XPC_C_CONNECTEDCALLOUT;
			spin_unlock_irqrestore(&ch->lock, irq_flags);

			xpc_connected_callout(ch);

			spin_lock_irqsave(&ch->lock, irq_flags);
			ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
			spin_unlock_irqrestore(&ch->lock, irq_flags);

			/*
			 * It is possible that while the callout was being
			 * made, the remote partition sent some messages.
			 * If that is the case, we may need to activate
			 * additional kthreads to help deliver them. We only
			 * need one less than the total #of messages to
			 * deliver.
			 */
			n_needed = xpc_n_of_deliverable_msgs(ch) - 1;
			if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
				xpc_activate_kthreads(ch, n_needed);

		} else {
			spin_unlock_irqrestore(&ch->lock, irq_flags);
		}

		xpc_kthread_waitmsgs(part, ch);
	}

	/* let registerer know that connection is disconnecting */

	spin_lock_irqsave(&ch->lock, irq_flags);
	if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
	    !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
		ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
		spin_unlock_irqrestore(&ch->lock, irq_flags);

		xpc_disconnect_callout(ch, xpDisconnecting);

		spin_lock_irqsave(&ch->lock, irq_flags);
		ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
	}
	spin_unlock_irqrestore(&ch->lock, irq_flags);

	if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
	    atomic_dec_return(&part->nchannels_engaged) == 0) {
		xpc_indicate_partition_disengaged(part);
	}

	xpc_msgqueue_deref(ch);

	dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
		partid, ch_number);

	xpc_part_deref(part);
	return 0;
}

/*
 * For each partition that XPC has established communications with, there is
 * a minimum of one kernel thread assigned to perform any operation that
 * may potentially sleep or block (basically the callouts to the asynchronous
 * functions registered via xpc_connect()).
 *
 * Additional kthreads are created and destroyed by XPC as the workload
 * demands.
 *
 * A kthread is assigned to one of the active channels that exists for a given
 * partition.
 */
void
xpc_create_kthreads(struct xpc_channel *ch, int needed,
		    int ignore_disconnecting)
{
	unsigned long irq_flags;
	u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
	struct xpc_partition *part = &xpc_partitions[ch->partid];
	struct task_struct *kthread;

	while (needed-- > 0) {

		/*
		 * The following is done on behalf of the newly created
		 * kthread. That kthread is responsible for doing the
		 * counterpart to the following before it exits.
		 */
		if (ignore_disconnecting) {
			if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
				/* kthreads assigned had gone to zero */
				BUG_ON(!(ch->flags &
					 XPC_C_DISCONNECTINGCALLOUT_MADE));
				break;
			}

		} else if (ch->flags & XPC_C_DISCONNECTING) {
			break;

		} else if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
			   atomic_inc_return(&part->nchannels_engaged) == 1) {
			xpc_indicate_partition_engaged(part);
		}
		(void)xpc_part_ref(part);
		xpc_msgqueue_ref(ch);

		kthread = kthread_run(xpc_kthread_start, (void *)args,
				      "xpc%02dc%d", ch->partid, ch->number);
		if (IS_ERR(kthread)) {
			/* the fork failed */

			/*
			 * NOTE: if (ignore_disconnecting &&
			 * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
			 * then we'll deadlock if all other kthreads assigned
			 * to this channel are blocked in the channel's
			 * registerer, because the only thing that will unblock
			 * them is the xpDisconnecting callout that this
			 * failed kthread_run() would have made.
			 */

			if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
			    atomic_dec_return(&part->nchannels_engaged) == 0) {
				xpc_indicate_partition_disengaged(part);
			}
			xpc_msgqueue_deref(ch);
			xpc_part_deref(part);

			if (atomic_read(&ch->kthreads_assigned) <
			    ch->kthreads_idle_limit) {
				/*
				 * Flag this as an error only if we have an
				 * insufficient #of kthreads for the channel
				 * to function.
				 */
				spin_lock_irqsave(&ch->lock, irq_flags);
				XPC_DISCONNECT_CHANNEL(ch, xpLackOfResources,
						       &irq_flags);
				spin_unlock_irqrestore(&ch->lock, irq_flags);
			}
			break;
		}
	}
}

void
xpc_disconnect_wait(int ch_number)
{
	unsigned long irq_flags;
	short partid;
	struct xpc_partition *part;
	struct xpc_channel *ch;
	int wakeup_channel_mgr;

	/* now wait for all callouts to the caller's function to cease */
	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		if (!xpc_part_ref(part))
			continue;

		ch = &part->channels[ch_number];

		if (!(ch->flags & XPC_C_WDISCONNECT)) {
			xpc_part_deref(part);
			continue;
		}

		wait_for_completion(&ch->wdisconnect_wait);

		spin_lock_irqsave(&ch->lock, irq_flags);
		DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
		wakeup_channel_mgr = 0;

		if (ch->delayed_chctl_flags) {
			if (part->act_state != XPC_P_DEACTIVATING) {
				spin_lock(&part->chctl_lock);
				part->chctl.flags[ch->number] |=
				    ch->delayed_chctl_flags;
				spin_unlock(&part->chctl_lock);
				wakeup_channel_mgr = 1;
			}
			ch->delayed_chctl_flags = 0;
		}

		ch->flags &= ~XPC_C_WDISCONNECT;
		spin_unlock_irqrestore(&ch->lock, irq_flags);

		if (wakeup_channel_mgr)
			xpc_wakeup_channel_mgr(part);

		xpc_part_deref(part);
	}
}

static void
xpc_do_exit(enum xp_retval reason)
{
	short partid;
	int active_part_count, printed_waiting_msg = 0;
	struct xpc_partition *part;
	unsigned long printmsg_time, disengage_timeout = 0;

	/* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
	DBUG_ON(xpc_exiting == 1);

	/*
	 * Let the heartbeat checker thread and the discovery thread
	 * (if one is running) know that they should exit. Also wake up
	 * the heartbeat checker thread in case it's sleeping.
	 */
	xpc_exiting = 1;
	wake_up_interruptible(&xpc_activate_IRQ_wq);

	/* wait for the discovery thread to exit */
	wait_for_completion(&xpc_discovery_exited);

	/* wait for the heartbeat checker thread to exit */
	wait_for_completion(&xpc_hb_checker_exited);

	/* sleep for 1/3 of a second or so */
	(void)msleep_interruptible(300);

	/* wait for all partitions to become inactive */

	printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
	xpc_disengage_timedout = 0;

	do {
		active_part_count = 0;

		for (partid = 0; partid < xp_max_npartitions; partid++) {
			part = &xpc_partitions[partid];

			if (xpc_partition_disengaged(part) &&
			    part->act_state == XPC_P_INACTIVE) {
				continue;
			}

			active_part_count++;

			XPC_DEACTIVATE_PARTITION(part, reason);

			if (part->disengage_timeout > disengage_timeout)
				disengage_timeout = part->disengage_timeout;
		}

		if (xpc_any_partition_engaged()) {
			if (time_is_before_jiffies(printmsg_time)) {
				dev_info(xpc_part, "waiting for remote "
					 "partitions to deactivate, timeout in "
					 "%ld seconds\n", (disengage_timeout -
					 jiffies) / HZ);
				printmsg_time = jiffies +
				    (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
				printed_waiting_msg = 1;
			}

		} else if (active_part_count > 0) {
			if (printed_waiting_msg) {
				dev_info(xpc_part, "waiting for local partition"
					 " to deactivate\n");
				printed_waiting_msg = 0;
			}

		} else {
			if (!xpc_disengage_timedout) {
				dev_info(xpc_part, "all partitions have "
					 "deactivated\n");
			}
			break;
		}

		/* sleep for 1/3 of a second or so */
		(void)msleep_interruptible(300);

	} while (1);

	DBUG_ON(xpc_any_partition_engaged());
	DBUG_ON(xpc_any_hbs_allowed() != 0);

	/* a zero timestamp indicates our rsvd page is not initialized */
	xpc_rsvd_page->ts_jiffies = 0;

	if (reason == xpUnloading) {
		(void)unregister_die_notifier(&xpc_die_notifier);
		(void)unregister_reboot_notifier(&xpc_reboot_notifier);
	}

	/* clear the interface to XPC's functions */
	xpc_clear_interface();

	if (xpc_sysctl)
		unregister_sysctl_table(xpc_sysctl);

	kfree(xpc_partitions);

	if (is_shub())
		xpc_exit_sn2();
	else
		xpc_exit_uv();
}

/*
 * This function is called when the system is being rebooted.
 */
static int
xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
{
	enum xp_retval reason;

	switch (event) {
	case SYS_RESTART:
		reason = xpSystemReboot;
		break;
	case SYS_HALT:
		reason = xpSystemHalt;
		break;
	case SYS_POWER_OFF:
		reason = xpSystemPoweroff;
		break;
	default:
		reason = xpSystemGoingDown;
	}

	xpc_do_exit(reason);
	return NOTIFY_DONE;
}

/*
 * Notify other partitions to deactivate from us by first disengaging from all
 * references to our memory.
 */
static void
xpc_die_deactivate(void)
{
	struct xpc_partition *part;
	short partid;
	int any_engaged;
	long keep_waiting;
	long wait_to_print;

	/* keep xpc_hb_checker thread from doing anything (just in case) */
	xpc_exiting = 1;

	xpc_disallow_all_hbs();	/* indicate we're deactivated */

	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		if (xpc_partition_engaged(partid) ||
		    part->act_state != XPC_P_INACTIVE) {
			xpc_request_partition_deactivation(part);
			xpc_indicate_partition_disengaged(part);
		}
	}

	/*
	 * Though we requested that all other partitions deactivate from us,
	 * we only wait until they've all disengaged or we've reached the
	 * defined timelimit.
	 *
	 * Given that one iteration through the following while-loop takes
	 * approximately 200 microseconds, calculate the #of loops to take
	 * before bailing and the #of loops before printing a waiting message.
	 */
	keep_waiting = xpc_disengage_timelimit * 1000 * 5;
	wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5;
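
	/*
	 * Worked example (assuming the default xpc_disengage_timelimit of
	 * 90 seconds): keep_waiting starts at 90 * 1000 * 5 = 450,000
	 * iterations, which at roughly 200 microseconds per pass comes to
	 * about 90 seconds of waiting before we give up on any stragglers.
	 */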

	while (1) {
		any_engaged = xpc_any_partition_engaged();
		if (!any_engaged) {
			dev_info(xpc_part, "all partitions have deactivated\n");
			break;
		}

		if (!keep_waiting--) {
			for (partid = 0; partid < xp_max_npartitions;
			     partid++) {
				if (xpc_partition_engaged(partid)) {
					dev_info(xpc_part, "deactivate from "
						 "remote partition %d timed "
						 "out\n", partid);
				}
			}
			break;
		}

		if (!wait_to_print--) {
			dev_info(xpc_part, "waiting for remote partitions to "
				 "deactivate, timeout in %ld seconds\n",
				 keep_waiting / (1000 * 5));
			wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL *
			    1000 * 5;
		}

		udelay(200);
	}
}

/*
 * This function is called when the system is being restarted or halted due
 * to some sort of system failure. If this is the case, we need to notify the
 * other partitions to disengage from all references to our memory.
 * This function can also be called when our heartbeater is to be offlined
 * for a time. In this case we need to notify other partitions to not worry
 * about the lack of a heartbeat.
 */
static int
xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
{
#ifdef CONFIG_IA64		/* !!! temporary kludge */
	switch (event) {
	case DIE_MACHINE_RESTART:
	case DIE_MACHINE_HALT:
		xpc_die_deactivate();
		break;

	case DIE_KDEBUG_ENTER:
		/* Should lack of heartbeat be ignored by other partitions? */
		if (!xpc_kdebug_ignore)
			break;

		/* fall through */
	case DIE_MCA_MONARCH_ENTER:
	case DIE_INIT_MONARCH_ENTER:
		xpc_offline_heartbeat();
		break;

	case DIE_KDEBUG_LEAVE:
		/* Is lack of heartbeat being ignored by other partitions? */
		if (!xpc_kdebug_ignore)
			break;

		/* fall through */
	case DIE_MCA_MONARCH_LEAVE:
	case DIE_INIT_MONARCH_LEAVE:
		xpc_online_heartbeat();
		break;
	}
#else
	xpc_die_deactivate();
#endif

	return NOTIFY_DONE;
}

int __init
xpc_init(void)
{
	int ret;
	short partid;
	struct xpc_partition *part;
	struct task_struct *kthread;

	snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
	snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");

	if (is_shub()) {
		/*
		 * The ia64-sn2 architecture supports at most 64 partitions,
		 * and the inability to unregister remote amos restricts us
		 * further to supporting exactly 64 partitions on this
		 * architecture, no less.
		 */
		if (xp_max_npartitions != 64)
			return -EINVAL;

		ret = xpc_init_sn2();
		if (ret != 0)
			return ret;

	} else if (is_uv()) {
		xpc_init_uv();

	} else {
		return -ENODEV;
	}

	xpc_partitions = kzalloc(sizeof(struct xpc_partition) *
				 xp_max_npartitions, GFP_KERNEL);
	if (xpc_partitions == NULL) {
		dev_err(xpc_part, "can't get memory for partition structure\n");
		ret = -ENOMEM;
		goto out_1;
	}

	/*
	 * The first few fields of each entry of xpc_partitions[] need to
	 * be initialized now so that calls to xpc_connect() and
	 * xpc_disconnect() can be made prior to the activation of any remote
	 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
	 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
	 * PARTITION HAS BEEN ACTIVATED.
	 */
	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));

		part->activate_IRQ_rcvd = 0;
		spin_lock_init(&part->act_lock);
		part->act_state = XPC_P_INACTIVE;
		XPC_SET_REASON(part, 0, 0);

		init_timer(&part->disengage_timer);
		part->disengage_timer.function =
		    xpc_timeout_partition_disengage;
		part->disengage_timer.data = (unsigned long)part;

		part->setup_state = XPC_P_UNSET;
		init_waitqueue_head(&part->teardown_wq);
		atomic_set(&part->references, 0);
	}

	xpc_sysctl = register_sysctl_table(xpc_sys_dir);

	/*
	 * Fill the partition reserved page with the information needed by
	 * other partitions to discover we are alive and establish initial
	 * communications.
	 */
	xpc_rsvd_page = xpc_setup_rsvd_page();
	if (xpc_rsvd_page == NULL) {
		dev_err(xpc_part, "can't setup our reserved page\n");
		ret = -EBUSY;
		goto out_2;
	}

	/* add ourselves to the reboot_notifier_list */
	ret = register_reboot_notifier(&xpc_reboot_notifier);
	if (ret != 0)
		dev_warn(xpc_part, "can't register reboot notifier\n");

	/* add ourselves to the die_notifier list */
	ret = register_die_notifier(&xpc_die_notifier);
	if (ret != 0)
		dev_warn(xpc_part, "can't register die notifier\n");

	/*
	 * The real work-horse behind xpc. This processes incoming
	 * interrupts and monitors remote heartbeats.
	 */
	kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
	if (IS_ERR(kthread)) {
		dev_err(xpc_part, "failed while forking hb check thread\n");
		ret = -EBUSY;
		goto out_3;
	}

	/*
	 * Start a thread that will attempt to discover other partitions to
	 * activate based on info provided by SAL. This new thread is short
	 * lived and will exit once discovery is complete.
	 */
	kthread = kthread_run(xpc_initiate_discovery, NULL,
			      XPC_DISCOVERY_THREAD_NAME);
	if (IS_ERR(kthread)) {
		dev_err(xpc_part, "failed while forking discovery thread\n");

		/* mark this new thread as a non-starter */
		complete(&xpc_discovery_exited);

		xpc_do_exit(xpUnloading);
		return -EBUSY;
	}

	/* set the interface to point at XPC's functions */
	xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
			  xpc_initiate_send, xpc_initiate_send_notify,
			  xpc_initiate_received, xpc_initiate_partid_to_nasids);

	return 0;

	/* initialization was not successful */
out_3:
	/* a zero timestamp indicates our rsvd page is not initialized */
	xpc_rsvd_page->ts_jiffies = 0;

	(void)unregister_die_notifier(&xpc_die_notifier);
	(void)unregister_reboot_notifier(&xpc_reboot_notifier);
out_2:
	if (xpc_sysctl)
		unregister_sysctl_table(xpc_sysctl);
	kfree(xpc_partitions);
out_1:
	if (is_shub())
		xpc_exit_sn2();
	else
		xpc_exit_uv();
	return ret;
}

module_init(xpc_init);

void __exit
xpc_exit(void)
{
	xpc_do_exit(xpUnloading);
}

module_exit(xpc_exit);

MODULE_AUTHOR("Silicon Graphics, Inc.");
MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
MODULE_LICENSE("GPL");

module_param(xpc_hb_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
		 "heartbeat increments.");

module_param(xpc_hb_check_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
		 "heartbeat checks.");

module_param(xpc_disengage_timelimit, int, 0);
MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait "
		 "for disengage to complete.");

module_param(xpc_kdebug_ignore, int, 0);
MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
		 "other partitions when dropping into kdebug.");
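
/*
 * Illustrative module load using the parameters above (parameter names
 * are as given to module_param(); the values are examples only):
 *
 *	modprobe xpc xpc_hb_interval=5 xpc_hb_check_interval=20 \
 *		xpc_disengage_timelimit=90 xpc_kdebug_ignore=1
 */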