tlb_uv.c (SGI UltraViolet BAU TLB flush routines), revision 88ed9dd7f63c3ae71c1984d99ee2dced0b386dea
1/* 2 * SGI UltraViolet TLB flush routines. 3 * 4 * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI. 5 * 6 * This code is released under the GNU General Public License version 2 or 7 * later. 8 */ 9#include <linux/seq_file.h> 10#include <linux/proc_fs.h> 11#include <linux/debugfs.h> 12#include <linux/kernel.h> 13#include <linux/slab.h> 14#include <linux/delay.h> 15 16#include <asm/mmu_context.h> 17#include <asm/uv/uv.h> 18#include <asm/uv/uv_mmrs.h> 19#include <asm/uv/uv_hub.h> 20#include <asm/uv/uv_bau.h> 21#include <asm/apic.h> 22#include <asm/idle.h> 23#include <asm/tsc.h> 24#include <asm/irq_vectors.h> 25#include <asm/timer.h> 26 27/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ 28static int timeout_base_ns[] = { 29 20, 30 160, 31 1280, 32 10240, 33 81920, 34 655360, 35 5242880, 36 167772160 37}; 38 39static int timeout_us; 40static int nobau; 41static int baudisabled; 42static spinlock_t disable_lock; 43static cycles_t congested_cycles; 44 45/* tunables: */ 46static int max_concurr = MAX_BAU_CONCURRENT; 47static int max_concurr_const = MAX_BAU_CONCURRENT; 48static int plugged_delay = PLUGGED_DELAY; 49static int plugsb4reset = PLUGSB4RESET; 50static int timeoutsb4reset = TIMEOUTSB4RESET; 51static int ipi_reset_limit = IPI_RESET_LIMIT; 52static int complete_threshold = COMPLETE_THRESHOLD; 53static int congested_respns_us = CONGESTED_RESPONSE_US; 54static int congested_reps = CONGESTED_REPS; 55static int congested_period = CONGESTED_PERIOD; 56 57static struct tunables tunables[] = { 58 {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ 59 {&plugged_delay, PLUGGED_DELAY}, 60 {&plugsb4reset, PLUGSB4RESET}, 61 {&timeoutsb4reset, TIMEOUTSB4RESET}, 62 {&ipi_reset_limit, IPI_RESET_LIMIT}, 63 {&complete_threshold, COMPLETE_THRESHOLD}, 64 {&congested_respns_us, CONGESTED_RESPONSE_US}, 65 {&congested_reps, CONGESTED_REPS}, 66 {&congested_period, CONGESTED_PERIOD} 67}; 68 69static struct dentry *tunables_dir; 70static struct dentry *tunables_file; 71 72/* these correspond to the statistics printed by ptc_seq_show() */ 73static char *stat_description[] = { 74 "sent: number of shootdown messages sent", 75 "stime: time spent sending messages", 76 "numuvhubs: number of hubs targeted with shootdown", 77 "numuvhubs16: number times 16 or more hubs targeted", 78 "numuvhubs8: number times 8 or more hubs targeted", 79 "numuvhubs4: number times 4 or more hubs targeted", 80 "numuvhubs2: number times 2 or more hubs targeted", 81 "numuvhubs1: number times 1 hub targeted", 82 "numcpus: number of cpus targeted with shootdown", 83 "dto: number of destination timeouts", 84 "retries: destination timeout retries sent", 85 "rok: : destination timeouts successfully retried", 86 "resetp: ipi-style resource resets for plugs", 87 "resett: ipi-style resource resets for timeouts", 88 "giveup: fall-backs to ipi-style shootdowns", 89 "sto: number of source timeouts", 90 "bz: number of stay-busy's", 91 "throt: number times spun in throttle", 92 "swack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE", 93 "recv: shootdown messages received", 94 "rtime: time spent processing messages", 95 "all: shootdown all-tlb messages", 96 "one: shootdown one-tlb messages", 97 "mult: interrupts that found multiple messages", 98 "none: interrupts that found no messages", 99 "retry: number of retry messages processed", 100 "canc: number messages canceled by retries", 101 "nocan: number retries that found nothing to cancel", 102 "reset: number of ipi-style reset requests processed", 103 "rcan: number messages 
canceled by reset requests", 104 "disable: number times use of the BAU was disabled", 105 "enable: number times use of the BAU was re-enabled" 106}; 107 108static int __init 109setup_nobau(char *arg) 110{ 111 nobau = 1; 112 return 0; 113} 114early_param("nobau", setup_nobau); 115 116/* base pnode in this partition */ 117static int uv_base_pnode __read_mostly; 118 119static DEFINE_PER_CPU(struct ptc_stats, ptcstats); 120static DEFINE_PER_CPU(struct bau_control, bau_control); 121static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); 122 123/* 124 * Determine the first node on a uvhub. 'Nodes' are used for kernel 125 * memory allocation. 126 */ 127static int __init uvhub_to_first_node(int uvhub) 128{ 129 int node, b; 130 131 for_each_online_node(node) { 132 b = uv_node_to_blade_id(node); 133 if (uvhub == b) 134 return node; 135 } 136 return -1; 137} 138 139/* 140 * Determine the apicid of the first cpu on a uvhub. 141 */ 142static int __init uvhub_to_first_apicid(int uvhub) 143{ 144 int cpu; 145 146 for_each_present_cpu(cpu) 147 if (uvhub == uv_cpu_to_blade_id(cpu)) 148 return per_cpu(x86_cpu_to_apicid, cpu); 149 return -1; 150} 151 152/* 153 * Free a software acknowledge hardware resource by clearing its Pending 154 * bit. This will return a reply to the sender. 155 * If the message has timed out, a reply has already been sent by the 156 * hardware but the resource has not been released. In that case our 157 * clear of the Timeout bit (as well) will free the resource. No reply will 158 * be sent (the hardware will only do one reply per message). 159 */ 160static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp, 161 int do_acknowledge) 162{ 163 unsigned long dw; 164 struct bau_pq_entry *msg; 165 166 msg = mdp->msg; 167 if (!msg->canceled && do_acknowledge) { 168 dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; 169 write_mmr_sw_ack(dw); 170 } 171 msg->replied_to = 1; 172 msg->swack_vec = 0; 173} 174 175/* 176 * Process the receipt of a RETRY message 177 */ 178static void bau_process_retry_msg(struct msg_desc *mdp, 179 struct bau_control *bcp) 180{ 181 int i; 182 int cancel_count = 0; 183 unsigned long msg_res; 184 unsigned long mmr = 0; 185 struct bau_pq_entry *msg = mdp->msg; 186 struct bau_pq_entry *msg2; 187 struct ptc_stats *stat = bcp->statp; 188 189 stat->d_retries++; 190 /* 191 * cancel any message from msg+1 to the retry itself 192 */ 193 for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { 194 if (msg2 > mdp->queue_last) 195 msg2 = mdp->queue_first; 196 if (msg2 == msg) 197 break; 198 199 /* same conditions for cancellation as do_reset */ 200 if ((msg2->replied_to == 0) && (msg2->canceled == 0) && 201 (msg2->swack_vec) && ((msg2->swack_vec & 202 msg->swack_vec) == 0) && 203 (msg2->sending_cpu == msg->sending_cpu) && 204 (msg2->msg_type != MSG_NOOP)) { 205 mmr = read_mmr_sw_ack(); 206 msg_res = msg2->swack_vec; 207 /* 208 * This is a message retry; clear the resources held 209 * by the previous message only if they timed out. 210 * If it has not timed out we have an unexpected 211 * situation to report. 212 */ 213 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { 214 unsigned long mr; 215 /* 216 * Is the resource timed out? 217 * Make everyone ignore the cancelled message. 
218 */ 219 msg2->canceled = 1; 220 stat->d_canceled++; 221 cancel_count++; 222 mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; 223 write_mmr_sw_ack(mr); 224 } 225 } 226 } 227 if (!cancel_count) 228 stat->d_nocanceled++; 229} 230 231/* 232 * Do all the things a cpu should do for a TLB shootdown message. 233 * Other cpu's may come here at the same time for this message. 234 */ 235static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, 236 int do_acknowledge) 237{ 238 short socket_ack_count = 0; 239 short *sp; 240 struct atomic_short *asp; 241 struct ptc_stats *stat = bcp->statp; 242 struct bau_pq_entry *msg = mdp->msg; 243 struct bau_control *smaster = bcp->socket_master; 244 245 /* 246 * This must be a normal message, or retry of a normal message 247 */ 248 if (msg->address == TLB_FLUSH_ALL) { 249 local_flush_tlb(); 250 stat->d_alltlb++; 251 } else { 252 __flush_tlb_one(msg->address); 253 stat->d_onetlb++; 254 } 255 stat->d_requestee++; 256 257 /* 258 * One cpu on each uvhub has the additional job on a RETRY 259 * of releasing the resource held by the message that is 260 * being retried. That message is identified by sending 261 * cpu number. 262 */ 263 if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) 264 bau_process_retry_msg(mdp, bcp); 265 266 /* 267 * This is a swack message, so we have to reply to it. 268 * Count each responding cpu on the socket. This avoids 269 * pinging the count's cache line back and forth between 270 * the sockets. 271 */ 272 sp = &smaster->socket_acknowledge_count[mdp->msg_slot]; 273 asp = (struct atomic_short *)sp; 274 socket_ack_count = atom_asr(1, asp); 275 if (socket_ack_count == bcp->cpus_in_socket) { 276 int msg_ack_count; 277 /* 278 * Both sockets dump their completed count total into 279 * the message's count. 280 */ 281 smaster->socket_acknowledge_count[mdp->msg_slot] = 0; 282 asp = (struct atomic_short *)&msg->acknowledge_count; 283 msg_ack_count = atom_asr(socket_ack_count, asp); 284 285 if (msg_ack_count == bcp->cpus_in_uvhub) { 286 /* 287 * All cpus in uvhub saw it; reply 288 * (unless we are in the UV2 workaround) 289 */ 290 reply_to_message(mdp, bcp, do_acknowledge); 291 } 292 } 293 294 return; 295} 296 297/* 298 * Determine the first cpu on a pnode. 299 */ 300static int pnode_to_first_cpu(int pnode, struct bau_control *smaster) 301{ 302 int cpu; 303 struct hub_and_pnode *hpp; 304 305 for_each_present_cpu(cpu) { 306 hpp = &smaster->thp[cpu]; 307 if (pnode == hpp->pnode) 308 return cpu; 309 } 310 return -1; 311} 312 313/* 314 * Last resort when we get a large number of destination timeouts is 315 * to clear resources held by a given cpu. 316 * Do this with IPI so that all messages in the BAU message queue 317 * can be identified by their nonzero swack_vec field. 318 * 319 * This is entered for a single cpu on the uvhub. 320 * The sender want's this uvhub to free a specific message's 321 * swack resources. 322 */ 323static void do_reset(void *ptr) 324{ 325 int i; 326 struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id()); 327 struct reset_args *rap = (struct reset_args *)ptr; 328 struct bau_pq_entry *msg; 329 struct ptc_stats *stat = bcp->statp; 330 331 stat->d_resets++; 332 /* 333 * We're looking for the given sender, and 334 * will free its swack resource. 335 * If all cpu's finally responded after the timeout, its 336 * message 'replied_to' was set. 
337 */ 338 for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { 339 unsigned long msg_res; 340 /* do_reset: same conditions for cancellation as 341 bau_process_retry_msg() */ 342 if ((msg->replied_to == 0) && 343 (msg->canceled == 0) && 344 (msg->sending_cpu == rap->sender) && 345 (msg->swack_vec) && 346 (msg->msg_type != MSG_NOOP)) { 347 unsigned long mmr; 348 unsigned long mr; 349 /* 350 * make everyone else ignore this message 351 */ 352 msg->canceled = 1; 353 /* 354 * only reset the resource if it is still pending 355 */ 356 mmr = read_mmr_sw_ack(); 357 msg_res = msg->swack_vec; 358 mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; 359 if (mmr & msg_res) { 360 stat->d_rcanceled++; 361 write_mmr_sw_ack(mr); 362 } 363 } 364 } 365 return; 366} 367 368/* 369 * Use IPI to get all target uvhubs to release resources held by 370 * a given sending cpu number. 371 */ 372static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) 373{ 374 int pnode; 375 int apnode; 376 int maskbits; 377 int sender = bcp->cpu; 378 cpumask_t *mask = bcp->uvhub_master->cpumask; 379 struct bau_control *smaster = bcp->socket_master; 380 struct reset_args reset_args; 381 382 reset_args.sender = sender; 383 cpus_clear(*mask); 384 /* find a single cpu for each uvhub in this distribution mask */ 385 maskbits = sizeof(struct pnmask) * BITSPERBYTE; 386 /* each bit is a pnode relative to the partition base pnode */ 387 for (pnode = 0; pnode < maskbits; pnode++) { 388 int cpu; 389 if (!bau_uvhub_isset(pnode, distribution)) 390 continue; 391 apnode = pnode + bcp->partition_base_pnode; 392 cpu = pnode_to_first_cpu(apnode, smaster); 393 cpu_set(cpu, *mask); 394 } 395 396 /* IPI all cpus; preemption is already disabled */ 397 smp_call_function_many(mask, do_reset, (void *)&reset_args, 1); 398 return; 399} 400 401static inline unsigned long cycles_2_us(unsigned long long cyc) 402{ 403 unsigned long long ns; 404 unsigned long us; 405 int cpu = smp_processor_id(); 406 407 ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; 408 us = ns / 1000; 409 return us; 410} 411 412/* 413 * wait for all cpus on this hub to finish their sends and go quiet 414 * leaves uvhub_quiesce set so that no new broadcasts are started by 415 * bau_flush_send_and_wait() 416 */ 417static inline void quiesce_local_uvhub(struct bau_control *hmaster) 418{ 419 atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce); 420} 421 422/* 423 * mark this quiet-requestor as done 424 */ 425static inline void end_uvhub_quiesce(struct bau_control *hmaster) 426{ 427 atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce); 428} 429 430static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift) 431{ 432 unsigned long descriptor_status; 433 434 descriptor_status = uv_read_local_mmr(mmr_offset); 435 descriptor_status >>= right_shift; 436 descriptor_status &= UV_ACT_STATUS_MASK; 437 return descriptor_status; 438} 439 440/* 441 * Wait for completion of a broadcast software ack message 442 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP 443 */ 444static int uv1_wait_completion(struct bau_desc *bau_desc, 445 unsigned long mmr_offset, int right_shift, 446 struct bau_control *bcp, long try) 447{ 448 unsigned long descriptor_status; 449 cycles_t ttm; 450 struct ptc_stats *stat = bcp->statp; 451 452 descriptor_status = uv1_read_status(mmr_offset, right_shift); 453 /* spin on the status MMR, waiting for it to go idle */ 454 while ((descriptor_status != DS_IDLE)) { 455 /* 456 * Our software ack messages may be 
blocked because 457 * there are no swack resources available. As long 458 * as none of them has timed out hardware will NACK 459 * our message and its state will stay IDLE. 460 */ 461 if (descriptor_status == DS_SOURCE_TIMEOUT) { 462 stat->s_stimeout++; 463 return FLUSH_GIVEUP; 464 } else if (descriptor_status == DS_DESTINATION_TIMEOUT) { 465 stat->s_dtimeout++; 466 ttm = get_cycles(); 467 468 /* 469 * Our retries may be blocked by all destination 470 * swack resources being consumed, and a timeout 471 * pending. In that case hardware returns the 472 * ERROR that looks like a destination timeout. 473 */ 474 if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { 475 bcp->conseccompletes = 0; 476 return FLUSH_RETRY_PLUGGED; 477 } 478 479 bcp->conseccompletes = 0; 480 return FLUSH_RETRY_TIMEOUT; 481 } else { 482 /* 483 * descriptor_status is still BUSY 484 */ 485 cpu_relax(); 486 } 487 descriptor_status = uv1_read_status(mmr_offset, right_shift); 488 } 489 bcp->conseccompletes++; 490 return FLUSH_COMPLETE; 491} 492 493/* 494 * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. 495 */ 496static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) 497{ 498 unsigned long descriptor_status; 499 unsigned long descriptor_status2; 500 501 descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); 502 descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; 503 descriptor_status = (descriptor_status << 1) | descriptor_status2; 504 return descriptor_status; 505} 506 507/* 508 * Return whether the status of the descriptor that is normally used for this 509 * cpu (the one indexed by its hub-relative cpu number) is busy. 510 * The status of the original 32 descriptors is always reflected in the 64 511 * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. 512 * The bit provided by the activation_status_2 register is irrelevant to 513 * the status if it is only being tested for busy or not busy. 514 */ 515int normal_busy(struct bau_control *bcp) 516{ 517 int cpu = bcp->uvhub_cpu; 518 int mmr_offset; 519 int right_shift; 520 521 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; 522 right_shift = cpu * UV_ACT_STATUS_SIZE; 523 return (((((read_lmmr(mmr_offset) >> right_shift) & 524 UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); 525} 526 527/* 528 * Entered when a bau descriptor has gone into a permanent busy wait because 529 * of a hardware bug. 530 * Workaround the bug. 
531 */ 532int handle_uv2_busy(struct bau_control *bcp) 533{ 534 int busy_one = bcp->using_desc; 535 int normal = bcp->uvhub_cpu; 536 int selected = -1; 537 int i; 538 unsigned long descriptor_status; 539 unsigned long status; 540 int mmr_offset; 541 struct bau_desc *bau_desc_old; 542 struct bau_desc *bau_desc_new; 543 struct bau_control *hmaster = bcp->uvhub_master; 544 struct ptc_stats *stat = bcp->statp; 545 cycles_t ttm; 546 547 stat->s_uv2_wars++; 548 spin_lock(&hmaster->uvhub_lock); 549 /* try for the original first */ 550 if (busy_one != normal) { 551 if (!normal_busy(bcp)) 552 selected = normal; 553 } 554 if (selected < 0) { 555 /* can't use the normal, select an alternate */ 556 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; 557 descriptor_status = read_lmmr(mmr_offset); 558 559 /* scan available descriptors 32-63 */ 560 for (i = 0; i < UV_CPUS_PER_AS; i++) { 561 if ((hmaster->inuse_map & (1 << i)) == 0) { 562 status = ((descriptor_status >> 563 (i * UV_ACT_STATUS_SIZE)) & 564 UV_ACT_STATUS_MASK) << 1; 565 if (status != UV2H_DESC_BUSY) { 566 selected = i + UV_CPUS_PER_AS; 567 break; 568 } 569 } 570 } 571 } 572 573 if (busy_one != normal) 574 /* mark the busy alternate as not in-use */ 575 hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); 576 577 if (selected >= 0) { 578 /* switch to the selected descriptor */ 579 if (selected != normal) { 580 /* set the selected alternate as in-use */ 581 hmaster->inuse_map |= 582 (1 << (selected - UV_CPUS_PER_AS)); 583 if (selected > stat->s_uv2_wars_hw) 584 stat->s_uv2_wars_hw = selected; 585 } 586 bau_desc_old = bcp->descriptor_base; 587 bau_desc_old += (ITEMS_PER_DESC * busy_one); 588 bcp->using_desc = selected; 589 bau_desc_new = bcp->descriptor_base; 590 bau_desc_new += (ITEMS_PER_DESC * selected); 591 *bau_desc_new = *bau_desc_old; 592 } else { 593 /* 594 * All are busy. Wait for the normal one for this cpu to 595 * free up. 596 */ 597 stat->s_uv2_war_waits++; 598 spin_unlock(&hmaster->uvhub_lock); 599 ttm = get_cycles(); 600 do { 601 cpu_relax(); 602 } while (normal_busy(bcp)); 603 spin_lock(&hmaster->uvhub_lock); 604 /* switch to the original descriptor */ 605 bcp->using_desc = normal; 606 bau_desc_old = bcp->descriptor_base; 607 bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); 608 bcp->using_desc = (ITEMS_PER_DESC * normal); 609 bau_desc_new = bcp->descriptor_base; 610 bau_desc_new += (ITEMS_PER_DESC * normal); 611 *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ 612 } 613 spin_unlock(&hmaster->uvhub_lock); 614 return FLUSH_RETRY_BUSYBUG; 615} 616 617static int uv2_wait_completion(struct bau_desc *bau_desc, 618 unsigned long mmr_offset, int right_shift, 619 struct bau_control *bcp, long try) 620{ 621 unsigned long descriptor_stat; 622 cycles_t ttm; 623 int desc = bcp->using_desc; 624 long busy_reps = 0; 625 struct ptc_stats *stat = bcp->statp; 626 627 descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc); 628 629 /* spin on the status MMR, waiting for it to go idle */ 630 while (descriptor_stat != UV2H_DESC_IDLE) { 631 /* 632 * Our software ack messages may be blocked because 633 * there are no swack resources available. As long 634 * as none of them has timed out hardware will NACK 635 * our message and its state will stay IDLE. 
636 */ 637 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || 638 (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) || 639 (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { 640 stat->s_stimeout++; 641 return FLUSH_GIVEUP; 642 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { 643 stat->s_dtimeout++; 644 ttm = get_cycles(); 645 bcp->conseccompletes = 0; 646 return FLUSH_RETRY_TIMEOUT; 647 } else { 648 busy_reps++; 649 if (busy_reps > 1000000) { 650 /* not to hammer on the clock */ 651 busy_reps = 0; 652 ttm = get_cycles(); 653 if ((ttm - bcp->send_message) > 654 (bcp->clocks_per_100_usec)) { 655 return handle_uv2_busy(bcp); 656 } 657 } 658 /* 659 * descriptor_stat is still BUSY 660 */ 661 cpu_relax(); 662 } 663 descriptor_stat = uv2_read_status(mmr_offset, right_shift, 664 desc); 665 } 666 bcp->conseccompletes++; 667 return FLUSH_COMPLETE; 668} 669 670/* 671 * There are 2 status registers; each and array[32] of 2 bits. Set up for 672 * which register to read and position in that register based on cpu in 673 * current hub. 674 */ 675static int wait_completion(struct bau_desc *bau_desc, 676 struct bau_control *bcp, long try) 677{ 678 int right_shift; 679 unsigned long mmr_offset; 680 int desc = bcp->using_desc; 681 682 if (desc < UV_CPUS_PER_AS) { 683 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; 684 right_shift = desc * UV_ACT_STATUS_SIZE; 685 } else { 686 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; 687 right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); 688 } 689 690 if (bcp->uvhub_version == 1) 691 return uv1_wait_completion(bau_desc, mmr_offset, right_shift, 692 bcp, try); 693 else 694 return uv2_wait_completion(bau_desc, mmr_offset, right_shift, 695 bcp, try); 696} 697 698static inline cycles_t sec_2_cycles(unsigned long sec) 699{ 700 unsigned long ns; 701 cycles_t cyc; 702 703 ns = sec * 1000000000; 704 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); 705 return cyc; 706} 707 708/* 709 * Our retries are blocked by all destination sw ack resources being 710 * in use, and a timeout is pending. In that case hardware immediately 711 * returns the ERROR that looks like a destination timeout. 712 */ 713static void destination_plugged(struct bau_desc *bau_desc, 714 struct bau_control *bcp, 715 struct bau_control *hmaster, struct ptc_stats *stat) 716{ 717 udelay(bcp->plugged_delay); 718 bcp->plugged_tries++; 719 720 if (bcp->plugged_tries >= bcp->plugsb4reset) { 721 bcp->plugged_tries = 0; 722 723 quiesce_local_uvhub(hmaster); 724 725 spin_lock(&hmaster->queue_lock); 726 reset_with_ipi(&bau_desc->distribution, bcp); 727 spin_unlock(&hmaster->queue_lock); 728 729 end_uvhub_quiesce(hmaster); 730 731 bcp->ipi_attempts++; 732 stat->s_resets_plug++; 733 } 734} 735 736static void destination_timeout(struct bau_desc *bau_desc, 737 struct bau_control *bcp, struct bau_control *hmaster, 738 struct ptc_stats *stat) 739{ 740 hmaster->max_concurr = 1; 741 bcp->timeout_tries++; 742 if (bcp->timeout_tries >= bcp->timeoutsb4reset) { 743 bcp->timeout_tries = 0; 744 745 quiesce_local_uvhub(hmaster); 746 747 spin_lock(&hmaster->queue_lock); 748 reset_with_ipi(&bau_desc->distribution, bcp); 749 spin_unlock(&hmaster->queue_lock); 750 751 end_uvhub_quiesce(hmaster); 752 753 bcp->ipi_attempts++; 754 stat->s_resets_timeout++; 755 } 756} 757 758/* 759 * Completions are taking a very long time due to a congested numalink 760 * network. 
761 */ 762static void disable_for_congestion(struct bau_control *bcp, 763 struct ptc_stats *stat) 764{ 765 /* let only one cpu do this disabling */ 766 spin_lock(&disable_lock); 767 768 if (!baudisabled && bcp->period_requests && 769 ((bcp->period_time / bcp->period_requests) > congested_cycles)) { 770 int tcpu; 771 struct bau_control *tbcp; 772 /* it becomes this cpu's job to turn on the use of the 773 BAU again */ 774 baudisabled = 1; 775 bcp->set_bau_off = 1; 776 bcp->set_bau_on_time = get_cycles(); 777 bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period); 778 stat->s_bau_disabled++; 779 for_each_present_cpu(tcpu) { 780 tbcp = &per_cpu(bau_control, tcpu); 781 tbcp->baudisabled = 1; 782 } 783 } 784 785 spin_unlock(&disable_lock); 786} 787 788static void count_max_concurr(int stat, struct bau_control *bcp, 789 struct bau_control *hmaster) 790{ 791 bcp->plugged_tries = 0; 792 bcp->timeout_tries = 0; 793 if (stat != FLUSH_COMPLETE) 794 return; 795 if (bcp->conseccompletes <= bcp->complete_threshold) 796 return; 797 if (hmaster->max_concurr >= hmaster->max_concurr_const) 798 return; 799 hmaster->max_concurr++; 800} 801 802static void record_send_stats(cycles_t time1, cycles_t time2, 803 struct bau_control *bcp, struct ptc_stats *stat, 804 int completion_status, int try) 805{ 806 cycles_t elapsed; 807 808 if (time2 > time1) { 809 elapsed = time2 - time1; 810 stat->s_time += elapsed; 811 812 if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { 813 bcp->period_requests++; 814 bcp->period_time += elapsed; 815 if ((elapsed > congested_cycles) && 816 (bcp->period_requests > bcp->cong_reps)) 817 disable_for_congestion(bcp, stat); 818 } 819 } else 820 stat->s_requestor--; 821 822 if (completion_status == FLUSH_COMPLETE && try > 1) 823 stat->s_retriesok++; 824 else if (completion_status == FLUSH_GIVEUP) 825 stat->s_giveup++; 826} 827 828/* 829 * Because of a uv1 hardware bug only a limited number of concurrent 830 * requests can be made. 831 */ 832static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat) 833{ 834 spinlock_t *lock = &hmaster->uvhub_lock; 835 atomic_t *v; 836 837 v = &hmaster->active_descriptor_count; 838 if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) { 839 stat->s_throttles++; 840 do { 841 cpu_relax(); 842 } while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)); 843 } 844} 845 846/* 847 * Handle the completion status of a message send. 848 */ 849static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, 850 struct bau_control *bcp, struct bau_control *hmaster, 851 struct ptc_stats *stat) 852{ 853 if (completion_status == FLUSH_RETRY_PLUGGED) 854 destination_plugged(bau_desc, bcp, hmaster, stat); 855 else if (completion_status == FLUSH_RETRY_TIMEOUT) 856 destination_timeout(bau_desc, bcp, hmaster, stat); 857} 858 859/* 860 * Send a broadcast and wait for it to complete. 861 * 862 * The flush_mask contains the cpus the broadcast is to be sent to including 863 * cpus that are on the local uvhub. 864 * 865 * Returns 0 if all flushing represented in the mask was done. 866 * Returns 1 if it gives up entirely and the original cpu mask is to be 867 * returned to the kernel. 
868 */ 869int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) 870{ 871 int seq_number = 0; 872 int completion_stat = 0; 873 int uv1 = 0; 874 long try = 0; 875 unsigned long index; 876 cycles_t time1; 877 cycles_t time2; 878 struct ptc_stats *stat = bcp->statp; 879 struct bau_control *hmaster = bcp->uvhub_master; 880 struct uv1_bau_msg_header *uv1_hdr = NULL; 881 struct uv2_bau_msg_header *uv2_hdr = NULL; 882 struct bau_desc *bau_desc; 883 884 if (bcp->uvhub_version == 1) 885 uv1_throttle(hmaster, stat); 886 887 while (hmaster->uvhub_quiesce) 888 cpu_relax(); 889 890 time1 = get_cycles(); 891 do { 892 bau_desc = bcp->descriptor_base; 893 bau_desc += (ITEMS_PER_DESC * bcp->using_desc); 894 if (bcp->uvhub_version == 1) { 895 uv1 = 1; 896 uv1_hdr = &bau_desc->header.uv1_hdr; 897 } else 898 uv2_hdr = &bau_desc->header.uv2_hdr; 899 if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { 900 if (uv1) 901 uv1_hdr->msg_type = MSG_REGULAR; 902 else 903 uv2_hdr->msg_type = MSG_REGULAR; 904 seq_number = bcp->message_number++; 905 } else { 906 if (uv1) 907 uv1_hdr->msg_type = MSG_RETRY; 908 else 909 uv2_hdr->msg_type = MSG_RETRY; 910 stat->s_retry_messages++; 911 } 912 913 if (uv1) 914 uv1_hdr->sequence = seq_number; 915 else 916 uv2_hdr->sequence = seq_number; 917 index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; 918 bcp->send_message = get_cycles(); 919 920 write_mmr_activation(index); 921 922 try++; 923 completion_stat = wait_completion(bau_desc, bcp, try); 924 /* UV2: wait_completion() may change the bcp->using_desc */ 925 926 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); 927 928 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { 929 bcp->ipi_attempts = 0; 930 completion_stat = FLUSH_GIVEUP; 931 break; 932 } 933 cpu_relax(); 934 } while ((completion_stat == FLUSH_RETRY_PLUGGED) || 935 (completion_stat == FLUSH_RETRY_BUSYBUG) || 936 (completion_stat == FLUSH_RETRY_TIMEOUT)); 937 938 time2 = get_cycles(); 939 940 count_max_concurr(completion_stat, bcp, hmaster); 941 942 while (hmaster->uvhub_quiesce) 943 cpu_relax(); 944 945 atomic_dec(&hmaster->active_descriptor_count); 946 947 record_send_stats(time1, time2, bcp, stat, completion_stat, try); 948 949 if (completion_stat == FLUSH_GIVEUP) 950 /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */ 951 return 1; 952 return 0; 953} 954 955/* 956 * The BAU is disabled. When the disabled time period has expired, the cpu 957 * that disabled it must re-enable it. 958 * Return 0 if it is re-enabled for all cpus. 
959 */ 960static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) 961{ 962 int tcpu; 963 struct bau_control *tbcp; 964 965 if (bcp->set_bau_off) { 966 if (get_cycles() >= bcp->set_bau_on_time) { 967 stat->s_bau_reenabled++; 968 baudisabled = 0; 969 for_each_present_cpu(tcpu) { 970 tbcp = &per_cpu(bau_control, tcpu); 971 tbcp->baudisabled = 0; 972 tbcp->period_requests = 0; 973 tbcp->period_time = 0; 974 } 975 return 0; 976 } 977 } 978 return -1; 979} 980 981static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs, 982 int remotes, struct bau_desc *bau_desc) 983{ 984 stat->s_requestor++; 985 stat->s_ntargcpu += remotes + locals; 986 stat->s_ntargremotes += remotes; 987 stat->s_ntarglocals += locals; 988 989 /* uvhub statistics */ 990 hubs = bau_uvhub_weight(&bau_desc->distribution); 991 if (locals) { 992 stat->s_ntarglocaluvhub++; 993 stat->s_ntargremoteuvhub += (hubs - 1); 994 } else 995 stat->s_ntargremoteuvhub += hubs; 996 997 stat->s_ntarguvhub += hubs; 998 999 if (hubs >= 16) 1000 stat->s_ntarguvhub16++; 1001 else if (hubs >= 8) 1002 stat->s_ntarguvhub8++; 1003 else if (hubs >= 4) 1004 stat->s_ntarguvhub4++; 1005 else if (hubs >= 2) 1006 stat->s_ntarguvhub2++; 1007 else 1008 stat->s_ntarguvhub1++; 1009} 1010 1011/* 1012 * Translate a cpu mask to the uvhub distribution mask in the BAU 1013 * activation descriptor. 1014 */ 1015static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, 1016 struct bau_desc *bau_desc, int *localsp, int *remotesp) 1017{ 1018 int cpu; 1019 int pnode; 1020 int cnt = 0; 1021 struct hub_and_pnode *hpp; 1022 1023 for_each_cpu(cpu, flush_mask) { 1024 /* 1025 * The distribution vector is a bit map of pnodes, relative 1026 * to the partition base pnode (and the partition base nasid 1027 * in the header). 1028 * Translate cpu to pnode and hub using a local memory array. 1029 */ 1030 hpp = &bcp->socket_master->thp[cpu]; 1031 pnode = hpp->pnode - bcp->partition_base_pnode; 1032 bau_uvhub_set(pnode, &bau_desc->distribution); 1033 cnt++; 1034 if (hpp->uvhub == bcp->uvhub) 1035 (*localsp)++; 1036 else 1037 (*remotesp)++; 1038 } 1039 if (!cnt) 1040 return 1; 1041 return 0; 1042} 1043 1044/* 1045 * globally purge translation cache of a virtual address or all TLB's 1046 * @cpumask: mask of all cpu's in which the address is to be removed 1047 * @mm: mm_struct containing virtual address range 1048 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) 1049 * @cpu: the current cpu 1050 * 1051 * This is the entry point for initiating any UV global TLB shootdown. 1052 * 1053 * Purges the translation caches of all specified processors of the given 1054 * virtual address, or purges all TLB's on specified processors. 1055 * 1056 * The caller has derived the cpumask from the mm_struct. This function 1057 * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) 1058 * 1059 * The cpumask is converted into a uvhubmask of the uvhubs containing 1060 * those cpus. 1061 * 1062 * Note that this function should be called with preemption disabled. 1063 * 1064 * Returns NULL if all remote flushing was done. 1065 * Returns pointer to cpumask if some remote flushing remains to be 1066 * done. The returned pointer is valid till preemption is re-enabled. 
1067 */ 1068const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 1069 struct mm_struct *mm, unsigned long va, 1070 unsigned int cpu) 1071{ 1072 int locals = 0; 1073 int remotes = 0; 1074 int hubs = 0; 1075 struct bau_desc *bau_desc; 1076 struct cpumask *flush_mask; 1077 struct ptc_stats *stat; 1078 struct bau_control *bcp; 1079 1080 /* kernel was booted 'nobau' */ 1081 if (nobau) 1082 return cpumask; 1083 1084 bcp = &per_cpu(bau_control, cpu); 1085 stat = bcp->statp; 1086 1087 /* bau was disabled due to slow response */ 1088 if (bcp->baudisabled) { 1089 if (check_enable(bcp, stat)) 1090 return cpumask; 1091 } 1092 1093 /* 1094 * Each sending cpu has a per-cpu mask which it fills from the caller's 1095 * cpu mask. All cpus are converted to uvhubs and copied to the 1096 * activation descriptor. 1097 */ 1098 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); 1099 /* don't actually do a shootdown of the local cpu */ 1100 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 1101 1102 if (cpu_isset(cpu, *cpumask)) 1103 stat->s_ntargself++; 1104 1105 bau_desc = bcp->descriptor_base; 1106 bau_desc += (ITEMS_PER_DESC * bcp->using_desc); 1107 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 1108 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) 1109 return NULL; 1110 1111 record_send_statistics(stat, locals, hubs, remotes, bau_desc); 1112 1113 bau_desc->payload.address = va; 1114 bau_desc->payload.sending_cpu = cpu; 1115 /* 1116 * uv_flush_send_and_wait returns 0 if all cpu's were messaged, 1117 * or 1 if it gave up and the original cpumask should be returned. 1118 */ 1119 if (!uv_flush_send_and_wait(flush_mask, bcp)) 1120 return NULL; 1121 else 1122 return cpumask; 1123} 1124 1125/* 1126 * Search the message queue for any 'other' message with the same software 1127 * acknowledge resource bit vector. 1128 */ 1129struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, 1130 struct bau_control *bcp, unsigned char swack_vec) 1131{ 1132 struct bau_pq_entry *msg_next = msg + 1; 1133 1134 if (msg_next > bcp->queue_last) 1135 msg_next = bcp->queue_first; 1136 while ((msg_next->swack_vec != 0) && (msg_next != msg)) { 1137 if (msg_next->swack_vec == swack_vec) 1138 return msg_next; 1139 msg_next++; 1140 if (msg_next > bcp->queue_last) 1141 msg_next = bcp->queue_first; 1142 } 1143 return NULL; 1144} 1145 1146/* 1147 * UV2 needs to work around a bug in which an arriving message has not 1148 * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. 1149 * Such a message must be ignored. 1150 */ 1151void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) 1152{ 1153 unsigned long mmr_image; 1154 unsigned char swack_vec; 1155 struct bau_pq_entry *msg = mdp->msg; 1156 struct bau_pq_entry *other_msg; 1157 1158 mmr_image = read_mmr_sw_ack(); 1159 swack_vec = msg->swack_vec; 1160 1161 if ((swack_vec & mmr_image) == 0) { 1162 /* 1163 * This message was assigned a swack resource, but no 1164 * reserved acknowlegment is pending. 1165 * The bug has prevented this message from setting the MMR. 1166 * And no other message has used the same sw_ack resource. 1167 * Do the requested shootdown but do not reply to the msg. 1168 * (the 0 means make no acknowledge) 1169 */ 1170 bau_process_message(mdp, bcp, 0); 1171 return; 1172 } 1173 1174 /* 1175 * Some message has set the MMR 'pending' bit; it might have been 1176 * another message. Look for that message. 
1177 */ 1178 other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); 1179 if (other_msg) { 1180 /* There is another. Do not ack the current one. */ 1181 bau_process_message(mdp, bcp, 0); 1182 /* 1183 * Let the natural processing of that message acknowledge 1184 * it. Don't get the processing of sw_ack's out of order. 1185 */ 1186 return; 1187 } 1188 1189 /* 1190 * There is no other message using this sw_ack, so it is safe to 1191 * acknowledge it. 1192 */ 1193 bau_process_message(mdp, bcp, 1); 1194 1195 return; 1196} 1197 1198/* 1199 * The BAU message interrupt comes here. (registered by set_intr_gate) 1200 * See entry_64.S 1201 * 1202 * We received a broadcast assist message. 1203 * 1204 * Interrupts are disabled; this interrupt could represent 1205 * the receipt of several messages. 1206 * 1207 * All cores/threads on this hub get this interrupt. 1208 * The last one to see it does the software ack. 1209 * (the resource will not be freed until noninterruptable cpus see this 1210 * interrupt; hardware may timeout the s/w ack and reply ERROR) 1211 */ 1212void uv_bau_message_interrupt(struct pt_regs *regs) 1213{ 1214 int count = 0; 1215 cycles_t time_start; 1216 struct bau_pq_entry *msg; 1217 struct bau_control *bcp; 1218 struct ptc_stats *stat; 1219 struct msg_desc msgdesc; 1220 1221 ack_APIC_irq(); 1222 time_start = get_cycles(); 1223 1224 bcp = &per_cpu(bau_control, smp_processor_id()); 1225 stat = bcp->statp; 1226 1227 msgdesc.queue_first = bcp->queue_first; 1228 msgdesc.queue_last = bcp->queue_last; 1229 1230 msg = bcp->bau_msg_head; 1231 while (msg->swack_vec) { 1232 count++; 1233 1234 msgdesc.msg_slot = msg - msgdesc.queue_first; 1235 msgdesc.msg = msg; 1236 if (bcp->uvhub_version == 2) 1237 process_uv2_message(&msgdesc, bcp); 1238 else 1239 bau_process_message(&msgdesc, bcp, 1); 1240 1241 msg++; 1242 if (msg > msgdesc.queue_last) 1243 msg = msgdesc.queue_first; 1244 bcp->bau_msg_head = msg; 1245 } 1246 stat->d_time += (get_cycles() - time_start); 1247 if (!count) 1248 stat->d_nomsg++; 1249 else if (count > 1) 1250 stat->d_multmsg++; 1251} 1252 1253/* 1254 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have 1255 * shootdown message timeouts enabled. The timeout does not cause 1256 * an interrupt, but causes an error message to be returned to 1257 * the sender. 1258 */ 1259static void __init enable_timeouts(void) 1260{ 1261 int uvhub; 1262 int nuvhubs; 1263 int pnode; 1264 unsigned long mmr_image; 1265 1266 nuvhubs = uv_num_possible_blades(); 1267 1268 for (uvhub = 0; uvhub < nuvhubs; uvhub++) { 1269 if (!uv_blade_nr_possible_cpus(uvhub)) 1270 continue; 1271 1272 pnode = uv_blade_to_pnode(uvhub); 1273 mmr_image = read_mmr_misc_control(pnode); 1274 /* 1275 * Set the timeout period and then lock it in, in three 1276 * steps; captures and locks in the period. 1277 * 1278 * To program the period, the SOFT_ACK_MODE must be off. 1279 */ 1280 mmr_image &= ~(1L << SOFTACK_MSHIFT); 1281 write_mmr_misc_control(pnode, mmr_image); 1282 /* 1283 * Set the 4-bit period. 1284 */ 1285 mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT); 1286 mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT); 1287 write_mmr_misc_control(pnode, mmr_image); 1288 /* 1289 * UV1: 1290 * Subsequent reversals of the timebase bit (3) cause an 1291 * immediate timeout of one or all INTD resources as 1292 * indicated in bits 2:0 (7 causes all of them to timeout). 
1293 */ 1294 mmr_image |= (1L << SOFTACK_MSHIFT); 1295 if (is_uv2_hub()) { 1296 mmr_image &= ~(1L << UV2_LEG_SHFT); 1297 mmr_image |= (1L << UV2_EXT_SHFT); 1298 } 1299 write_mmr_misc_control(pnode, mmr_image); 1300 } 1301} 1302 1303static void *ptc_seq_start(struct seq_file *file, loff_t *offset) 1304{ 1305 if (*offset < num_possible_cpus()) 1306 return offset; 1307 return NULL; 1308} 1309 1310static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) 1311{ 1312 (*offset)++; 1313 if (*offset < num_possible_cpus()) 1314 return offset; 1315 return NULL; 1316} 1317 1318static void ptc_seq_stop(struct seq_file *file, void *data) 1319{ 1320} 1321 1322static inline unsigned long long usec_2_cycles(unsigned long microsec) 1323{ 1324 unsigned long ns; 1325 unsigned long long cyc; 1326 1327 ns = microsec * 1000; 1328 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); 1329 return cyc; 1330} 1331 1332/* 1333 * Display the statistics thru /proc/sgi_uv/ptc_statistics 1334 * 'data' points to the cpu number 1335 * Note: see the descriptions in stat_description[]. 1336 */ 1337static int ptc_seq_show(struct seq_file *file, void *data) 1338{ 1339 struct ptc_stats *stat; 1340 int cpu; 1341 1342 cpu = *(loff_t *)data; 1343 if (!cpu) { 1344 seq_printf(file, 1345 "# cpu sent stime self locals remotes ncpus localhub "); 1346 seq_printf(file, 1347 "remotehub numuvhubs numuvhubs16 numuvhubs8 "); 1348 seq_printf(file, 1349 "numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok "); 1350 seq_printf(file, 1351 "resetp resett giveup sto bz throt swack recv rtime "); 1352 seq_printf(file, 1353 "all one mult none retry canc nocan reset rcan "); 1354 seq_printf(file, 1355 "disable enable wars warshw warwaits\n"); 1356 } 1357 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 1358 stat = &per_cpu(ptcstats, cpu); 1359 /* source side statistics */ 1360 seq_printf(file, 1361 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 1362 cpu, stat->s_requestor, cycles_2_us(stat->s_time), 1363 stat->s_ntargself, stat->s_ntarglocals, 1364 stat->s_ntargremotes, stat->s_ntargcpu, 1365 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, 1366 stat->s_ntarguvhub, stat->s_ntarguvhub16); 1367 seq_printf(file, "%ld %ld %ld %ld %ld ", 1368 stat->s_ntarguvhub8, stat->s_ntarguvhub4, 1369 stat->s_ntarguvhub2, stat->s_ntarguvhub1, 1370 stat->s_dtimeout); 1371 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", 1372 stat->s_retry_messages, stat->s_retriesok, 1373 stat->s_resets_plug, stat->s_resets_timeout, 1374 stat->s_giveup, stat->s_stimeout, 1375 stat->s_busy, stat->s_throttles); 1376 1377 /* destination side statistics */ 1378 seq_printf(file, 1379 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 1380 read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), 1381 stat->d_requestee, cycles_2_us(stat->d_time), 1382 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, 1383 stat->d_nomsg, stat->d_retries, stat->d_canceled, 1384 stat->d_nocanceled, stat->d_resets, 1385 stat->d_rcanceled); 1386 seq_printf(file, "%ld %ld %ld %ld %ld\n", 1387 stat->s_bau_disabled, stat->s_bau_reenabled, 1388 stat->s_uv2_wars, stat->s_uv2_wars_hw, 1389 stat->s_uv2_war_waits); 1390 } 1391 return 0; 1392} 1393 1394/* 1395 * Display the tunables thru debugfs 1396 */ 1397static ssize_t tunables_read(struct file *file, char __user *userbuf, 1398 size_t count, loff_t *ppos) 1399{ 1400 char *buf; 1401 int ret; 1402 1403 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", 1404 "max_concur plugged_delay plugsb4reset", 1405 "timeoutsb4reset 
ipi_reset_limit complete_threshold", 1406 "congested_response_us congested_reps congested_period", 1407 max_concurr, plugged_delay, plugsb4reset, 1408 timeoutsb4reset, ipi_reset_limit, complete_threshold, 1409 congested_respns_us, congested_reps, congested_period); 1410 1411 if (!buf) 1412 return -ENOMEM; 1413 1414 ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); 1415 kfree(buf); 1416 return ret; 1417} 1418 1419/* 1420 * handle a write to /proc/sgi_uv/ptc_statistics 1421 * -1: reset the statistics 1422 * 0: display meaning of the statistics 1423 */ 1424static ssize_t ptc_proc_write(struct file *file, const char __user *user, 1425 size_t count, loff_t *data) 1426{ 1427 int cpu; 1428 int i; 1429 int elements; 1430 long input_arg; 1431 char optstr[64]; 1432 struct ptc_stats *stat; 1433 1434 if (count == 0 || count > sizeof(optstr)) 1435 return -EINVAL; 1436 if (copy_from_user(optstr, user, count)) 1437 return -EFAULT; 1438 optstr[count - 1] = '\0'; 1439 1440 if (strict_strtol(optstr, 10, &input_arg) < 0) { 1441 printk(KERN_DEBUG "%s is invalid\n", optstr); 1442 return -EINVAL; 1443 } 1444 1445 if (input_arg == 0) { 1446 elements = sizeof(stat_description)/sizeof(*stat_description); 1447 printk(KERN_DEBUG "# cpu: cpu number\n"); 1448 printk(KERN_DEBUG "Sender statistics:\n"); 1449 for (i = 0; i < elements; i++) 1450 printk(KERN_DEBUG "%s\n", stat_description[i]); 1451 } else if (input_arg == -1) { 1452 for_each_present_cpu(cpu) { 1453 stat = &per_cpu(ptcstats, cpu); 1454 memset(stat, 0, sizeof(struct ptc_stats)); 1455 } 1456 } 1457 1458 return count; 1459} 1460 1461static int local_atoi(const char *name) 1462{ 1463 int val = 0; 1464 1465 for (;; name++) { 1466 switch (*name) { 1467 case '0' ... '9': 1468 val = 10*val+(*name-'0'); 1469 break; 1470 default: 1471 return val; 1472 } 1473 } 1474} 1475 1476/* 1477 * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables. 1478 * Zero values reset them to defaults. 1479 */ 1480static int parse_tunables_write(struct bau_control *bcp, char *instr, 1481 int count) 1482{ 1483 char *p; 1484 char *q; 1485 int cnt = 0; 1486 int val; 1487 int e = sizeof(tunables) / sizeof(*tunables); 1488 1489 p = instr + strspn(instr, WHITESPACE); 1490 q = p; 1491 for (; *p; p = q + strspn(q, WHITESPACE)) { 1492 q = p + strcspn(p, WHITESPACE); 1493 cnt++; 1494 if (q == p) 1495 break; 1496 } 1497 if (cnt != e) { 1498 printk(KERN_INFO "bau tunable error: should be %d values\n", e); 1499 return -EINVAL; 1500 } 1501 1502 p = instr + strspn(instr, WHITESPACE); 1503 q = p; 1504 for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { 1505 q = p + strcspn(p, WHITESPACE); 1506 val = local_atoi(p); 1507 switch (cnt) { 1508 case 0: 1509 if (val == 0) { 1510 max_concurr = MAX_BAU_CONCURRENT; 1511 max_concurr_const = MAX_BAU_CONCURRENT; 1512 continue; 1513 } 1514 if (val < 1 || val > bcp->cpus_in_uvhub) { 1515 printk(KERN_DEBUG 1516 "Error: BAU max concurrent %d is invalid\n", 1517 val); 1518 return -EINVAL; 1519 } 1520 max_concurr = val; 1521 max_concurr_const = val; 1522 continue; 1523 default: 1524 if (val == 0) 1525 *tunables[cnt].tunp = tunables[cnt].deflt; 1526 else 1527 *tunables[cnt].tunp = val; 1528 continue; 1529 } 1530 if (q == p) 1531 break; 1532 } 1533 return 0; 1534} 1535 1536/* 1537 * Handle a write to debugfs. 
(/sys/kernel/debug/sgi_uv/bau_tunables) 1538 */ 1539static ssize_t tunables_write(struct file *file, const char __user *user, 1540 size_t count, loff_t *data) 1541{ 1542 int cpu; 1543 int ret; 1544 char instr[100]; 1545 struct bau_control *bcp; 1546 1547 if (count == 0 || count > sizeof(instr)-1) 1548 return -EINVAL; 1549 if (copy_from_user(instr, user, count)) 1550 return -EFAULT; 1551 1552 instr[count] = '\0'; 1553 1554 cpu = get_cpu(); 1555 bcp = &per_cpu(bau_control, cpu); 1556 ret = parse_tunables_write(bcp, instr, count); 1557 put_cpu(); 1558 if (ret) 1559 return ret; 1560 1561 for_each_present_cpu(cpu) { 1562 bcp = &per_cpu(bau_control, cpu); 1563 bcp->max_concurr = max_concurr; 1564 bcp->max_concurr_const = max_concurr; 1565 bcp->plugged_delay = plugged_delay; 1566 bcp->plugsb4reset = plugsb4reset; 1567 bcp->timeoutsb4reset = timeoutsb4reset; 1568 bcp->ipi_reset_limit = ipi_reset_limit; 1569 bcp->complete_threshold = complete_threshold; 1570 bcp->cong_response_us = congested_respns_us; 1571 bcp->cong_reps = congested_reps; 1572 bcp->cong_period = congested_period; 1573 } 1574 return count; 1575} 1576 1577static const struct seq_operations uv_ptc_seq_ops = { 1578 .start = ptc_seq_start, 1579 .next = ptc_seq_next, 1580 .stop = ptc_seq_stop, 1581 .show = ptc_seq_show 1582}; 1583 1584static int ptc_proc_open(struct inode *inode, struct file *file) 1585{ 1586 return seq_open(file, &uv_ptc_seq_ops); 1587} 1588 1589static int tunables_open(struct inode *inode, struct file *file) 1590{ 1591 return 0; 1592} 1593 1594static const struct file_operations proc_uv_ptc_operations = { 1595 .open = ptc_proc_open, 1596 .read = seq_read, 1597 .write = ptc_proc_write, 1598 .llseek = seq_lseek, 1599 .release = seq_release, 1600}; 1601 1602static const struct file_operations tunables_fops = { 1603 .open = tunables_open, 1604 .read = tunables_read, 1605 .write = tunables_write, 1606 .llseek = default_llseek, 1607}; 1608 1609static int __init uv_ptc_init(void) 1610{ 1611 struct proc_dir_entry *proc_uv_ptc; 1612 1613 if (!is_uv_system()) 1614 return 0; 1615 1616 proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, 1617 &proc_uv_ptc_operations); 1618 if (!proc_uv_ptc) { 1619 printk(KERN_ERR "unable to create %s proc entry\n", 1620 UV_PTC_BASENAME); 1621 return -EINVAL; 1622 } 1623 1624 tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); 1625 if (!tunables_dir) { 1626 printk(KERN_ERR "unable to create debugfs directory %s\n", 1627 UV_BAU_TUNABLES_DIR); 1628 return -EINVAL; 1629 } 1630 tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, 1631 tunables_dir, NULL, &tunables_fops); 1632 if (!tunables_file) { 1633 printk(KERN_ERR "unable to create debugfs file %s\n", 1634 UV_BAU_TUNABLES_FILE); 1635 return -EINVAL; 1636 } 1637 return 0; 1638} 1639 1640/* 1641 * Initialize the sending side's sending buffers. 
1642 */ 1643static void activation_descriptor_init(int node, int pnode, int base_pnode) 1644{ 1645 int i; 1646 int cpu; 1647 int uv1 = 0; 1648 unsigned long gpa; 1649 unsigned long m; 1650 unsigned long n; 1651 size_t dsize; 1652 struct bau_desc *bau_desc; 1653 struct bau_desc *bd2; 1654 struct uv1_bau_msg_header *uv1_hdr; 1655 struct uv2_bau_msg_header *uv2_hdr; 1656 struct bau_control *bcp; 1657 1658 /* 1659 * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC) 1660 * per cpu; and one per cpu on the uvhub (ADP_SZ) 1661 */ 1662 dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC; 1663 bau_desc = kmalloc_node(dsize, GFP_KERNEL, node); 1664 BUG_ON(!bau_desc); 1665 1666 gpa = uv_gpa(bau_desc); 1667 n = uv_gpa_to_gnode(gpa); 1668 m = uv_gpa_to_offset(gpa); 1669 if (is_uv1_hub()) 1670 uv1 = 1; 1671 1672 /* the 14-bit pnode */ 1673 write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); 1674 /* 1675 * Initializing all 8 (ITEMS_PER_DESC) descriptors for each 1676 * cpu even though we only use the first one; one descriptor can 1677 * describe a broadcast to 256 uv hubs. 1678 */ 1679 for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) { 1680 memset(bd2, 0, sizeof(struct bau_desc)); 1681 if (uv1) { 1682 uv1_hdr = &bd2->header.uv1_hdr; 1683 uv1_hdr->swack_flag = 1; 1684 /* 1685 * The base_dest_nasid set in the message header 1686 * is the nasid of the first uvhub in the partition. 1687 * The bit map will indicate destination pnode numbers 1688 * relative to that base. They may not be consecutive 1689 * if nasid striding is being used. 1690 */ 1691 uv1_hdr->base_dest_nasid = 1692 UV_PNODE_TO_NASID(base_pnode); 1693 uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID; 1694 uv1_hdr->command = UV_NET_ENDPOINT_INTD; 1695 uv1_hdr->int_both = 1; 1696 /* 1697 * all others need to be set to zero: 1698 * fairness chaining multilevel count replied_to 1699 */ 1700 } else { 1701 uv2_hdr = &bd2->header.uv2_hdr; 1702 uv2_hdr->swack_flag = 1; 1703 uv2_hdr->base_dest_nasid = 1704 UV_PNODE_TO_NASID(base_pnode); 1705 uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID; 1706 uv2_hdr->command = UV_NET_ENDPOINT_INTD; 1707 } 1708 } 1709 for_each_present_cpu(cpu) { 1710 if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) 1711 continue; 1712 bcp = &per_cpu(bau_control, cpu); 1713 bcp->descriptor_base = bau_desc; 1714 } 1715} 1716 1717/* 1718 * initialize the destination side's receiving buffers 1719 * entered for each uvhub in the partition 1720 * - node is first node (kernel memory notion) on the uvhub 1721 * - pnode is the uvhub's physical identifier 1722 */ 1723static void pq_init(int node, int pnode) 1724{ 1725 int cpu; 1726 size_t plsize; 1727 char *cp; 1728 void *vp; 1729 unsigned long pn; 1730 unsigned long first; 1731 unsigned long pn_first; 1732 unsigned long last; 1733 struct bau_pq_entry *pqp; 1734 struct bau_control *bcp; 1735 1736 plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry); 1737 vp = kmalloc_node(plsize, GFP_KERNEL, node); 1738 pqp = (struct bau_pq_entry *)vp; 1739 BUG_ON(!pqp); 1740 1741 cp = (char *)pqp + 31; 1742 pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5); 1743 1744 for_each_present_cpu(cpu) { 1745 if (pnode != uv_cpu_to_pnode(cpu)) 1746 continue; 1747 /* for every cpu on this pnode: */ 1748 bcp = &per_cpu(bau_control, cpu); 1749 bcp->queue_first = pqp; 1750 bcp->bau_msg_head = pqp; 1751 bcp->queue_last = pqp + (DEST_Q_SIZE - 1); 1752 } 1753 /* 1754 * need the gnode of where the memory was really allocated 1755 */ 1756 pn = uv_gpa_to_gnode(uv_gpa(pqp)); 1757 
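	/*
	 * The payload queue MMRs are programmed just below: FIRST gets the
	 * gnode plus the node-relative address of the first queue entry,
	 * TAIL is started at that same first entry (an empty queue), and
	 * LAST points at the final entry of the DEST_Q_SIZE ring.
	 */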
first = uv_physnodeaddr(pqp); 1758 pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first; 1759 last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)); 1760 write_mmr_payload_first(pnode, pn_first); 1761 write_mmr_payload_tail(pnode, first); 1762 write_mmr_payload_last(pnode, last); 1763 write_gmmr_sw_ack(pnode, 0xffffUL); 1764 1765 /* in effect, all msg_type's are set to MSG_NOOP */ 1766 memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); 1767} 1768 1769/* 1770 * Initialization of each UV hub's structures 1771 */ 1772static void __init init_uvhub(int uvhub, int vector, int base_pnode) 1773{ 1774 int node; 1775 int pnode; 1776 unsigned long apicid; 1777 1778 node = uvhub_to_first_node(uvhub); 1779 pnode = uv_blade_to_pnode(uvhub); 1780 1781 activation_descriptor_init(node, pnode, base_pnode); 1782 1783 pq_init(node, pnode); 1784 /* 1785 * The below initialization can't be in firmware because the 1786 * messaging IRQ will be determined by the OS. 1787 */ 1788 apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; 1789 write_mmr_data_config(pnode, ((apicid << 32) | vector)); 1790} 1791 1792/* 1793 * We will set BAU_MISC_CONTROL with a timeout period. 1794 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. 1795 * So the destination timeout period has to be calculated from them. 1796 */ 1797static int calculate_destination_timeout(void) 1798{ 1799 unsigned long mmr_image; 1800 int mult1; 1801 int mult2; 1802 int index; 1803 int base; 1804 int ret; 1805 unsigned long ts_ns; 1806 1807 if (is_uv1_hub()) { 1808 mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; 1809 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); 1810 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; 1811 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); 1812 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; 1813 base = timeout_base_ns[index]; 1814 ts_ns = base * mult1 * mult2; 1815 ret = ts_ns / 1000; 1816 } else { 1817 /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ 1818 mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); 1819 mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; 1820 if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) 1821 base = 80; 1822 else 1823 base = 10; 1824 mult1 = mmr_image & UV2_ACK_MASK; 1825 ret = mult1 * base; 1826 } 1827 return ret; 1828} 1829 1830static void __init init_per_cpu_tunables(void) 1831{ 1832 int cpu; 1833 struct bau_control *bcp; 1834 1835 for_each_present_cpu(cpu) { 1836 bcp = &per_cpu(bau_control, cpu); 1837 bcp->baudisabled = 0; 1838 bcp->statp = &per_cpu(ptcstats, cpu); 1839 /* time interval to catch a hardware stay-busy bug */ 1840 bcp->timeout_interval = usec_2_cycles(2*timeout_us); 1841 bcp->max_concurr = max_concurr; 1842 bcp->max_concurr_const = max_concurr; 1843 bcp->plugged_delay = plugged_delay; 1844 bcp->plugsb4reset = plugsb4reset; 1845 bcp->timeoutsb4reset = timeoutsb4reset; 1846 bcp->ipi_reset_limit = ipi_reset_limit; 1847 bcp->complete_threshold = complete_threshold; 1848 bcp->cong_response_us = congested_respns_us; 1849 bcp->cong_reps = congested_reps; 1850 bcp->cong_period = congested_period; 1851 bcp->clocks_per_100_usec = usec_2_cycles(100); 1852 } 1853} 1854 1855/* 1856 * Scan all cpus to collect blade and socket summaries. 
1857 */ 1858static int __init get_cpu_topology(int base_pnode, 1859 struct uvhub_desc *uvhub_descs, 1860 unsigned char *uvhub_mask) 1861{ 1862 int cpu; 1863 int pnode; 1864 int uvhub; 1865 int socket; 1866 struct bau_control *bcp; 1867 struct uvhub_desc *bdp; 1868 struct socket_desc *sdp; 1869 1870 for_each_present_cpu(cpu) { 1871 bcp = &per_cpu(bau_control, cpu); 1872 1873 memset(bcp, 0, sizeof(struct bau_control)); 1874 1875 pnode = uv_cpu_hub_info(cpu)->pnode; 1876 if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) { 1877 printk(KERN_EMERG 1878 "cpu %d pnode %d-%d beyond %d; BAU disabled\n", 1879 cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE); 1880 return 1; 1881 } 1882 1883 bcp->osnode = cpu_to_node(cpu); 1884 bcp->partition_base_pnode = base_pnode; 1885 1886 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; 1887 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); 1888 bdp = &uvhub_descs[uvhub]; 1889 1890 bdp->num_cpus++; 1891 bdp->uvhub = uvhub; 1892 bdp->pnode = pnode; 1893 1894 /* kludge: 'assuming' one node per socket, and assuming that 1895 disabling a socket just leaves a gap in node numbers */ 1896 socket = bcp->osnode & 1; 1897 bdp->socket_mask |= (1 << socket); 1898 sdp = &bdp->socket[socket]; 1899 sdp->cpu_number[sdp->num_cpus] = cpu; 1900 sdp->num_cpus++; 1901 if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { 1902 printk(KERN_EMERG "%d cpus per socket invalid\n", 1903 sdp->num_cpus); 1904 return 1; 1905 } 1906 } 1907 return 0; 1908} 1909 1910/* 1911 * Each socket is to get a local array of pnodes/hubs. 1912 */ 1913static void make_per_cpu_thp(struct bau_control *smaster) 1914{ 1915 int cpu; 1916 size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus(); 1917 1918 smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode); 1919 memset(smaster->thp, 0, hpsz); 1920 for_each_present_cpu(cpu) { 1921 smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode; 1922 smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; 1923 } 1924} 1925 1926/* 1927 * Each uvhub is to get a local cpumask. 1928 */ 1929static void make_per_hub_cpumask(struct bau_control *hmaster) 1930{ 1931 int sz = sizeof(cpumask_t); 1932 1933 hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode); 1934} 1935 1936/* 1937 * Initialize all the per_cpu information for the cpu's on a given socket, 1938 * given what has been gathered into the socket_desc struct. 1939 * And reports the chosen hub and socket masters back to the caller. 
1940 */ 1941static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, 1942 struct bau_control **smasterp, 1943 struct bau_control **hmasterp) 1944{ 1945 int i; 1946 int cpu; 1947 struct bau_control *bcp; 1948 1949 for (i = 0; i < sdp->num_cpus; i++) { 1950 cpu = sdp->cpu_number[i]; 1951 bcp = &per_cpu(bau_control, cpu); 1952 bcp->cpu = cpu; 1953 if (i == 0) { 1954 *smasterp = bcp; 1955 if (!(*hmasterp)) 1956 *hmasterp = bcp; 1957 } 1958 bcp->cpus_in_uvhub = bdp->num_cpus; 1959 bcp->cpus_in_socket = sdp->num_cpus; 1960 bcp->socket_master = *smasterp; 1961 bcp->uvhub = bdp->uvhub; 1962 if (is_uv1_hub()) 1963 bcp->uvhub_version = 1; 1964 else if (is_uv2_hub()) 1965 bcp->uvhub_version = 2; 1966 else { 1967 printk(KERN_EMERG "uvhub version not 1 or 2\n"); 1968 return 1; 1969 } 1970 bcp->uvhub_master = *hmasterp; 1971 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; 1972 bcp->using_desc = bcp->uvhub_cpu; 1973 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { 1974 printk(KERN_EMERG "%d cpus per uvhub invalid\n", 1975 bcp->uvhub_cpu); 1976 return 1; 1977 } 1978 } 1979 return 0; 1980} 1981 1982/* 1983 * Summarize the blade and socket topology into the per_cpu structures. 1984 */ 1985static int __init summarize_uvhub_sockets(int nuvhubs, 1986 struct uvhub_desc *uvhub_descs, 1987 unsigned char *uvhub_mask) 1988{ 1989 int socket; 1990 int uvhub; 1991 unsigned short socket_mask; 1992 1993 for (uvhub = 0; uvhub < nuvhubs; uvhub++) { 1994 struct uvhub_desc *bdp; 1995 struct bau_control *smaster = NULL; 1996 struct bau_control *hmaster = NULL; 1997 1998 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) 1999 continue; 2000 2001 bdp = &uvhub_descs[uvhub]; 2002 socket_mask = bdp->socket_mask; 2003 socket = 0; 2004 while (socket_mask) { 2005 struct socket_desc *sdp; 2006 if ((socket_mask & 1)) { 2007 sdp = &bdp->socket[socket]; 2008 if (scan_sock(sdp, bdp, &smaster, &hmaster)) 2009 return 1; 2010 make_per_cpu_thp(smaster); 2011 } 2012 socket++; 2013 socket_mask = (socket_mask >> 1); 2014 } 2015 make_per_hub_cpumask(hmaster); 2016 } 2017 return 0; 2018} 2019 2020/* 2021 * initialize the bau_control structure for each cpu 2022 */ 2023static int __init init_per_cpu(int nuvhubs, int base_part_pnode) 2024{ 2025 unsigned char *uvhub_mask; 2026 void *vp; 2027 struct uvhub_desc *uvhub_descs; 2028 2029 timeout_us = calculate_destination_timeout(); 2030 2031 vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); 2032 uvhub_descs = (struct uvhub_desc *)vp; 2033 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); 2034 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); 2035 2036 if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) 2037 goto fail; 2038 2039 if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask)) 2040 goto fail; 2041 2042 kfree(uvhub_descs); 2043 kfree(uvhub_mask); 2044 init_per_cpu_tunables(); 2045 return 0; 2046 2047fail: 2048 kfree(uvhub_descs); 2049 kfree(uvhub_mask); 2050 return 1; 2051} 2052 2053/* 2054 * Initialization of BAU-related structures 2055 */ 2056static int __init uv_bau_init(void) 2057{ 2058 int uvhub; 2059 int pnode; 2060 int nuvhubs; 2061 int cur_cpu; 2062 int cpus; 2063 int vector; 2064 cpumask_var_t *mask; 2065 2066 if (!is_uv_system()) 2067 return 0; 2068 2069 if (nobau) 2070 return 0; 2071 2072 for_each_possible_cpu(cur_cpu) { 2073 mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); 2074 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); 2075 } 2076 2077 nuvhubs = uv_num_possible_blades(); 2078 spin_lock_init(&disable_lock); 
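	/*
	 * Convert the congested_respns_us tunable to TSC cycles once at
	 * init; record_send_stats() compares each request's elapsed time
	 * against congested_cycles when deciding whether to call
	 * disable_for_congestion().
	 */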
2079 congested_cycles = usec_2_cycles(congested_respns_us); 2080 2081 uv_base_pnode = 0x7fffffff; 2082 for (uvhub = 0; uvhub < nuvhubs; uvhub++) { 2083 cpus = uv_blade_nr_possible_cpus(uvhub); 2084 if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode)) 2085 uv_base_pnode = uv_blade_to_pnode(uvhub); 2086 } 2087 2088 enable_timeouts(); 2089 2090 if (init_per_cpu(nuvhubs, uv_base_pnode)) { 2091 nobau = 1; 2092 return 0; 2093 } 2094 2095 vector = UV_BAU_MESSAGE; 2096 for_each_possible_blade(uvhub) 2097 if (uv_blade_nr_possible_cpus(uvhub)) 2098 init_uvhub(uvhub, vector, uv_base_pnode); 2099 2100 alloc_intr_gate(vector, uv_bau_message_intr1); 2101 2102 for_each_possible_blade(uvhub) { 2103 if (uv_blade_nr_possible_cpus(uvhub)) { 2104 unsigned long val; 2105 unsigned long mmr; 2106 pnode = uv_blade_to_pnode(uvhub); 2107 /* INIT the bau */ 2108 val = 1L << 63; 2109 write_gmmr_activation(pnode, val); 2110 mmr = 1; /* should be 1 to broadcast to both sockets */ 2111 if (!is_uv1_hub()) 2112 write_mmr_data_broadcast(pnode, mmr); 2113 } 2114 } 2115 2116 return 0; 2117} 2118core_initcall(uv_bau_init); 2119fs_initcall(uv_ptc_init); 2120
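
Not part of tlb_uv.c: a minimal userspace sketch of the two-level acknowledge counting done in bau_process_message(), where cpus bump a per-socket counter and only the last cpu on each socket folds the socket total into the per-message counter (keeping the hot count on a socket-local cache line). C11 atomics stand in for atom_asr(), and the cpu/socket topology is invented.

/* sketch only: two-level completion counting, not kernel code */
#include <stdatomic.h>
#include <stdio.h>

#define CPUS_PER_SOCKET	4
#define SOCKETS		2

static atomic_int socket_count[SOCKETS];	/* per-socket acknowledge count */
static atomic_int msg_count;			/* per-message acknowledge count */

static void cpu_sees_message(int socket)
{
	/* add-and-return, like atom_asr(1, ...) */
	int sc = atomic_fetch_add(&socket_count[socket], 1) + 1;

	if (sc == CPUS_PER_SOCKET) {		/* last cpu on this socket */
		atomic_store(&socket_count[socket], 0);
		int mc = atomic_fetch_add(&msg_count, sc) + sc;

		if (mc == CPUS_PER_SOCKET * SOCKETS)
			printf("all cpus saw the message; reply now\n");
	}
}

int main(void)
{
	int s, c;

	for (s = 0; s < SOCKETS; s++)
		for (c = 0; c < CPUS_PER_SOCKET; c++)
			cpu_sees_message(s);
	return 0;
}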
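cycles_2_us(), sec_2_cycles() and usec_2_cycles() all use the per-cpu cyc2ns fixed-point factor with the CYC2NS_SCALE_FACTOR shift. A standalone sketch of that conversion follows; the 3 GHz TSC and the derivation of cyc2ns are assumptions for illustration, not taken from this file.

/* sketch only: fixed-point cycles<->microseconds conversion */
#include <stdio.h>

#define SCALE_FACTOR	10	/* plays the role of CYC2NS_SCALE_FACTOR */

int main(void)
{
	/* assumed cyc2ns factor for a 3 GHz TSC: (10^6 << 10) / cpu_khz */
	unsigned long long cyc2ns = (1000000ULL << SCALE_FACTOR) / 3000000ULL;
	unsigned long long cycles = 600000ULL;		/* roughly 200 us of cycles */
	unsigned long long ns = (cycles * cyc2ns) >> SCALE_FACTOR;
	unsigned long long us = ns / 1000;
	unsigned long long back = ((us * 1000) << SCALE_FACTOR) / cyc2ns;

	printf("%llu cycles ~= %llu us; %llu us ~= %llu cycles\n",
	       cycles, us, us, back);
	return 0;
}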
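set_distrib_bits() translates the caller's cpumask into a bitmap of pnodes relative to the partition base pnode. A userspace sketch of that translation, with an invented cpu-to-pnode table and base pnode (the real code looks these up in the socket master's hub_and_pnode array):

/* sketch only: cpumask -> pnode-relative distribution bitmap */
#include <stdio.h>

int main(void)
{
	int cpu_to_pnode[8] = { 4, 4, 5, 5, 6, 6, 7, 7 };	/* invented */
	int base_pnode = 4;					/* invented */
	unsigned long target_cpus = 0x35;	/* cpus 0, 2, 4 and 5 */
	unsigned long distribution = 0;
	int cpu;

	for (cpu = 0; cpu < 8; cpu++) {
		if (!(target_cpus & (1UL << cpu)))
			continue;
		/* each bit is a pnode relative to the partition base pnode */
		distribution |= 1UL << (cpu_to_pnode[cpu] - base_pnode);
	}
	printf("distribution bitmap = 0x%lx\n", distribution);
	return 0;
}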
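calculate_destination_timeout() derives the UV1 destination timeout from timeout_base_ns[] (indexed by a field of UVH_AGING_PRESCALE_SEL), scaled by mult1 (derived from SOFTACK_TIMEOUT_PERIOD) and mult2 (a field of UVH_TRANSACTION_TIMEOUT). The sketch below shows only that arithmetic; the index and multiplier values are placeholders, not values read from hardware.

/* sketch only: UV1 destination timeout arithmetic */
#include <stdio.h>

static const unsigned long timeout_base_ns[] = {
	20, 160, 1280, 10240, 81920, 655360, 5242880, 167772160
};

int main(void)
{
	int index = 4;	/* placeholder for the urgency7 field of UVH_AGING_PRESCALE_SEL */
	int mult1 = 10;	/* placeholder for SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK */
	int mult2 = 3;	/* placeholder for the BAU_TRANS field of UVH_TRANSACTION_TIMEOUT */
	unsigned long ts_ns = timeout_base_ns[index] * mult1 * mult2;

	printf("UV1 destination timeout = %lu us\n", ts_ns / 1000);
	return 0;
}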