tlb_uv.c revision 8191c9f69202d4dbc66063cb92059b8a58640d34
1/*
2 *	SGI UltraViolet TLB flush routines.
3 *
4 *	(c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
5 *
6 *	This code is released under the GNU General Public License version 2 or
7 *	later.
8 */
9#include <linux/seq_file.h>
10#include <linux/proc_fs.h>
11#include <linux/debugfs.h>
12#include <linux/kernel.h>
13#include <linux/slab.h>
14
15#include <asm/mmu_context.h>
16#include <asm/uv/uv.h>
17#include <asm/uv/uv_mmrs.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
20#include <asm/apic.h>
21#include <asm/idle.h>
22#include <asm/tsc.h>
23#include <asm/irq_vectors.h>
24#include <asm/timer.h>
25
26/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
27static int timeout_base_ns[] = {
28		20,
29		160,
30		1280,
31		10240,
32		81920,
33		655360,
34		5242880,
35		167772160
36};
37static int timeout_us;
38static int nobau;
39static int baudisabled;
40static spinlock_t disable_lock;
41static cycles_t congested_cycles;
42
43/* tunables: */
44static int max_bau_concurrent = MAX_BAU_CONCURRENT;
45static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
46static int plugged_delay = PLUGGED_DELAY;
47static int plugsb4reset = PLUGSB4RESET;
48static int timeoutsb4reset = TIMEOUTSB4RESET;
49static int ipi_reset_limit = IPI_RESET_LIMIT;
50static int complete_threshold = COMPLETE_THRESHOLD;
51static int congested_response_us = CONGESTED_RESPONSE_US;
52static int congested_reps = CONGESTED_REPS;
53static int congested_period = CONGESTED_PERIOD;
54static struct dentry *tunables_dir;
55static struct dentry *tunables_file;
56
57static int __init setup_nobau(char *arg)
58{
59	nobau = 1;
60	return 0;
61}
62early_param("nobau", setup_nobau);
63
64/* base pnode in this partition */
65static int uv_partition_base_pnode __read_mostly;
66/* position of pnode (which is nasid>>1): */
67static int uv_nshift __read_mostly;
68static unsigned long uv_mmask __read_mostly;
69
70static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
71static DEFINE_PER_CPU(struct bau_control, bau_control);
72static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
73
74/*
75 * Determine the first node on a uvhub. 'Nodes' are used for kernel
76 * memory allocation.
77 */
78static int __init uvhub_to_first_node(int uvhub)
79{
80	int node, b;
81
82	for_each_online_node(node) {
83		b = uv_node_to_blade_id(node);
84		if (uvhub == b)
85			return node;
86	}
87	return -1;
88}
89
90/*
91 * Determine the apicid of the first cpu on a uvhub.
92 */
93static int __init uvhub_to_first_apicid(int uvhub)
94{
95	int cpu;
96
97	for_each_present_cpu(cpu)
98		if (uvhub == uv_cpu_to_blade_id(cpu))
99			return per_cpu(x86_cpu_to_apicid, cpu);
100	return -1;
101}
102
103/*
104 * Free a software acknowledge hardware resource by clearing its Pending
105 * bit. This will return a reply to the sender.
106 * If the message has timed out, a reply has already been sent by the
107 * hardware but the resource has not been released. In that case our
108 * clear of the Timeout bit (as well) will free the resource. No reply will
109 * be sent (the hardware will only do one reply per message).
110 */
111static inline void uv_reply_to_message(struct msg_desc *mdp,
112				       struct bau_control *bcp)
113{
114	unsigned long dw;
115	struct bau_payload_queue_entry *msg;
116
117	msg = mdp->msg;
118	if (!msg->canceled) {
119		dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
120						msg->sw_ack_vector;
121		uv_write_local_mmr(
122				UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
123	}
124	msg->replied_to = 1;
125	msg->sw_ack_vector = 0;
126}
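
/*
 * Worked example of the write above (values are illustrative, and
 * assume UV_SW_ACK_NPENDING, the width of the pending-bit field, is
 * 8): a message holding software-ack resources 0 and 2 has
 * sw_ack_vector == 0x05, so dw == (0x05 << 8) | 0x05 == 0x0505.
 * Writing that to the ...ACKNOWLEDGE_ALIAS register clears both the
 * timeout bits (upper byte) and the pending bits (lower byte) for
 * those two resources in a single MMR write.
 */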
127
128/*
129 * Process the receipt of a RETRY message
130 */
131static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
132					    struct bau_control *bcp)
133{
134	int i;
135	int cancel_count = 0;
136	int slot2;
137	unsigned long msg_res;
138	unsigned long mmr = 0;
139	struct bau_payload_queue_entry *msg;
140	struct bau_payload_queue_entry *msg2;
141	struct ptc_stats *stat;
142
143	msg = mdp->msg;
144	stat = bcp->statp;
145	stat->d_retries++;
146	/*
147	 * cancel any message from msg+1 to the retry itself
148	 */
149	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
150		if (msg2 > mdp->va_queue_last)
151			msg2 = mdp->va_queue_first;
152		if (msg2 == msg)
153			break;
154
155		/* same conditions for cancellation as uv_do_reset */
156		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
157		    (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
158			msg->sw_ack_vector) == 0) &&
159		    (msg2->sending_cpu == msg->sending_cpu) &&
160		    (msg2->msg_type != MSG_NOOP)) {
161			slot2 = msg2 - mdp->va_queue_first;
162			mmr = uv_read_local_mmr
163				(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
164			msg_res = msg2->sw_ack_vector;
165			/*
166			 * This is a message retry; clear the resources held
167			 * by the previous message only if they timed out.
168			 * If it has not timed out we have an unexpected
169			 * situation to report.
170			 */
171			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
172				/*
173				 * is the resource timed out?
174				 * make everyone ignore the cancelled message.
175				 */
176				msg2->canceled = 1;
177				stat->d_canceled++;
178				cancel_count++;
179				uv_write_local_mmr(
180				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
181					(msg_res << UV_SW_ACK_NPENDING) |
182					 msg_res);
183			}
184		}
185	}
186	if (!cancel_count)
187		stat->d_nocanceled++;
188}
189
190/*
191 * Do all the things a cpu should do for a TLB shootdown message.
192 * Other cpu's may come here at the same time for this message.
193 */
194static void uv_bau_process_message(struct msg_desc *mdp,
195				   struct bau_control *bcp)
196{
197	int msg_ack_count;
198	short socket_ack_count = 0;
199	struct ptc_stats *stat;
200	struct bau_payload_queue_entry *msg;
201	struct bau_control *smaster = bcp->socket_master;
202
203	/*
204	 * This must be a normal message, or retry of a normal message
205	 */
206	msg = mdp->msg;
207	stat = bcp->statp;
208	if (msg->address == TLB_FLUSH_ALL) {
209		local_flush_tlb();
210		stat->d_alltlb++;
211	} else {
212		__flush_tlb_one(msg->address);
213		stat->d_onetlb++;
214	}
215	stat->d_requestee++;
216
217	/*
218	 * One cpu on each uvhub has the additional job on a RETRY
219	 * of releasing the resource held by the message that is
220	 * being retried.  That message is identified by sending
221	 * cpu number.
222	 */
223	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
224		uv_bau_process_retry_msg(mdp, bcp);
225
226	/*
227	 * This is a sw_ack message, so we have to reply to it.
228	 * Count each responding cpu on the socket. This avoids
229	 * pinging the count's cache line back and forth between
230	 * the sockets.
231	 */
232	socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
233			&smaster->socket_acknowledge_count[mdp->msg_slot]);
234	if (socket_ack_count == bcp->cpus_in_socket) {
235		/*
236		 * Both sockets dump their completed count total into
237		 * the message's count.
238		 */
239		smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
240		msg_ack_count = atomic_add_short_return(socket_ack_count,
241				(struct atomic_short *)&msg->acknowledge_count);
242
243		if (msg_ack_count == bcp->cpus_in_uvhub) {
244			/*
245			 * All cpus in uvhub saw it; reply
246			 */
247			uv_reply_to_message(mdp, bcp);
248		}
249	}
250
251	return;
252}
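
/*
 * Worked example of the two-level counting above (cpu counts are
 * illustrative): on a uvhub with two sockets of 8 cpus each, every
 * socket accumulates replies in its socket_master's
 * socket_acknowledge_count[slot] until the count reaches
 * cpus_in_socket (8); only then is that subtotal folded into the
 * message's acknowledge_count.  The cpu that brings
 * acknowledge_count up to cpus_in_uvhub (16) sends the single reply
 * via uv_reply_to_message().
 */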
253
254/*
255 * Determine the first cpu on a uvhub.
256 */
257static int uvhub_to_first_cpu(int uvhub)
258{
259	int cpu;
260	for_each_present_cpu(cpu)
261		if (uvhub == uv_cpu_to_blade_id(cpu))
262			return cpu;
263	return -1;
264}
265
266/*
267 * Last resort when we get a large number of destination timeouts is
268 * to clear resources held by a given cpu.
269 * Do this with IPI so that all messages in the BAU message queue
270 * can be identified by their nonzero sw_ack_vector field.
271 *
272 * This is entered for a single cpu on the uvhub.
273 * The sender wants this uvhub to free a specific message's
274 * sw_ack resources.
275 */
276static void
277uv_do_reset(void *ptr)
278{
279	int i;
280	int slot;
281	int count = 0;
282	unsigned long mmr;
283	unsigned long msg_res;
284	struct bau_control *bcp;
285	struct reset_args *rap;
286	struct bau_payload_queue_entry *msg;
287	struct ptc_stats *stat;
288
289	bcp = &per_cpu(bau_control, smp_processor_id());
290	rap = (struct reset_args *)ptr;
291	stat = bcp->statp;
292	stat->d_resets++;
293
294	/*
295	 * We're looking for the given sender, and
296	 * will free its sw_ack resource.
297	 * If all cpu's finally responded after the timeout, its
298	 * message 'replied_to' was set.
299	 */
300	for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
301		/* uv_do_reset: same conditions for cancellation as
302		   uv_bau_process_retry_msg() */
303		if ((msg->replied_to == 0) &&
304		    (msg->canceled == 0) &&
305		    (msg->sending_cpu == rap->sender) &&
306		    (msg->sw_ack_vector) &&
307		    (msg->msg_type != MSG_NOOP)) {
308			/*
309			 * make everyone else ignore this message
310			 */
311			msg->canceled = 1;
312			slot = msg - bcp->va_queue_first;
313			count++;
314			/*
315			 * only reset the resource if it is still pending
316			 */
317			mmr = uv_read_local_mmr
318					(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
319			msg_res = msg->sw_ack_vector;
320			if (mmr & msg_res) {
321				stat->d_rcanceled++;
322				uv_write_local_mmr(
323				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
324					(msg_res << UV_SW_ACK_NPENDING) |
325					 msg_res);
326			}
327		}
328	}
329	return;
330}
331
332/*
333 * Use IPI to get all target uvhubs to release resources held by
334 * a given sending cpu number.
335 */
336static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
337			      int sender)
338{
339	int uvhub;
340	int cpu;
341	cpumask_t mask;
342	struct reset_args reset_args;
343
344	reset_args.sender = sender;
345
346	cpus_clear(mask);
347	/* find a single cpu for each uvhub in this distribution mask */
348	for (uvhub = 0;
349		    uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
350		    uvhub++) {
351		if (!bau_uvhub_isset(uvhub, distribution))
352			continue;
353		/* find a cpu for this uvhub */
354		cpu = uvhub_to_first_cpu(uvhub);
355		cpu_set(cpu, mask);
356	}
357	/* IPI all cpus; Preemption is already disabled */
358	smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
359	return;
360}
361
362static inline unsigned long
363cycles_2_us(unsigned long long cyc)
364{
365	unsigned long long ns;
366	unsigned long us;
367	ns =  (cyc * per_cpu(cyc2ns, smp_processor_id()))
368						>> CYC2NS_SCALE_FACTOR;
369	us = ns / 1000;
370	return us;
371}
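
/*
 * cycles_2_us() above and the sec_2_cycles()/microsec_2_cycles()
 * helpers further down are inverses built on the same per-cpu cyc2ns
 * scale:
 *
 *	ns  = cyc * cyc2ns >> CYC2NS_SCALE_FACTOR
 *	cyc = (ns << CYC2NS_SCALE_FACTOR) / cyc2ns
 *
 * For example (illustrative numbers only), with a 2 GHz tsc, 2000
 * cycles converts to roughly 1000 ns, i.e. 1 microsecond.
 */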
372
373/*
374 * wait for all cpus on this hub to finish their sends and go quiet;
375 * leaves uvhub_quiesce set so that no new broadcasts are started by
376 * uv_flush_send_and_wait()
377 */
378static inline void
379quiesce_local_uvhub(struct bau_control *hmaster)
380{
381	atomic_add_short_return(1, (struct atomic_short *)
382		 &hmaster->uvhub_quiesce);
383}
384
385/*
386 * mark this quiet-requestor as done
387 */
388static inline void
389end_uvhub_quiesce(struct bau_control *hmaster)
390{
391	atomic_add_short_return(-1, (struct atomic_short *)
392		&hmaster->uvhub_quiesce);
393}
394
395/*
396 * Wait for completion of a broadcast software ack message
397 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
398 */
399static int uv_wait_completion(struct bau_desc *bau_desc,
400	unsigned long mmr_offset, int right_shift, int this_cpu,
401	struct bau_control *bcp, struct bau_control *smaster, long try)
402{
403	unsigned long descriptor_status;
404	cycles_t ttime;
405	struct ptc_stats *stat = bcp->statp;
406	struct bau_control *hmaster;
407
408	hmaster = bcp->uvhub_master;
409
410	/* spin on the status MMR, waiting for it to go idle */
411	while ((descriptor_status = (((unsigned long)
412		uv_read_local_mmr(mmr_offset) >>
413			right_shift) & UV_ACT_STATUS_MASK)) !=
414			DESC_STATUS_IDLE) {
415		/*
416		 * Our software ack messages may be blocked because there are
417		 * no swack resources available.  As long as none of them
418		 * has timed out hardware will NACK our message and its
419		 * state will stay IDLE.
420		 */
421		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
422			stat->s_stimeout++;
423			return FLUSH_GIVEUP;
424		} else if (descriptor_status ==
425					DESC_STATUS_DESTINATION_TIMEOUT) {
426			stat->s_dtimeout++;
427			ttime = get_cycles();
428
429			/*
430			 * Our retries may be blocked by all destination
431			 * swack resources being consumed, and a timeout
432			 * pending.  In that case hardware returns the
433			 * ERROR that looks like a destination timeout.
434			 */
435			if (cycles_2_us(ttime - bcp->send_message) <
436							timeout_us) {
437				bcp->conseccompletes = 0;
438				return FLUSH_RETRY_PLUGGED;
439			}
440
441			bcp->conseccompletes = 0;
442			return FLUSH_RETRY_TIMEOUT;
443		} else {
444			/*
445			 * descriptor_status is still BUSY
446			 */
447			cpu_relax();
448		}
449	}
450	bcp->conseccompletes++;
451	return FLUSH_COMPLETE;
452}
453
454static inline cycles_t
455sec_2_cycles(unsigned long sec)
456{
457	unsigned long ns;
458	cycles_t cyc;
459
460	ns = sec * 1000000000;
461	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
462	return cyc;
463}
464
465/*
466 * conditionally add 1 to *v, unless *v is >= u
467 * return 0 if we cannot add 1 to *v because it is >= u
468 * return 1 if we can add 1 to *v because it is < u
469 * the add is atomic
470 *
471 * This is close to atomic_add_unless(), but this allows the 'u' value
472 * to be lowered below the current 'v'.  atomic_add_unless can only stop
473 * on equal.
474 */
475static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
476{
477	spin_lock(lock);
478	if (atomic_read(v) >= u) {
479		spin_unlock(lock);
480		return 0;
481	}
482	atomic_inc(v);
483	spin_unlock(lock);
484	return 1;
485}
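
/*
 * Simplified sketch of how uv_flush_send_and_wait() below uses this
 * as a concurrency throttle: *v is the hub master's
 * active_descriptor_count and 'u' is max_bau_concurrent, so a sender
 * spins until it can reserve one of the allowed concurrent broadcast
 * slots:
 *
 *	while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 *			&hmaster->active_descriptor_count,
 *			hmaster->max_bau_concurrent))
 *		cpu_relax();
 */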
486
487/*
488 * Our retries are blocked by all destination swack resources being
489 * in use, and a timeout is pending. In that case hardware immediately
490 * returns the ERROR that looks like a destination timeout.
491 */
492static void
493destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
494			struct bau_control *hmaster, struct ptc_stats *stat)
495{
496	udelay(bcp->plugged_delay);
497	bcp->plugged_tries++;
498	if (bcp->plugged_tries >= bcp->plugsb4reset) {
499		bcp->plugged_tries = 0;
500		quiesce_local_uvhub(hmaster);
501		spin_lock(&hmaster->queue_lock);
502		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
503		spin_unlock(&hmaster->queue_lock);
504		end_uvhub_quiesce(hmaster);
505		bcp->ipi_attempts++;
506		stat->s_resets_plug++;
507	}
508}
509
510static void
511destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
512			struct bau_control *hmaster, struct ptc_stats *stat)
513{
514	hmaster->max_bau_concurrent = 1;
515	bcp->timeout_tries++;
516	if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
517		bcp->timeout_tries = 0;
518		quiesce_local_uvhub(hmaster);
519		spin_lock(&hmaster->queue_lock);
520		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
521		spin_unlock(&hmaster->queue_lock);
522		end_uvhub_quiesce(hmaster);
523		bcp->ipi_attempts++;
524		stat->s_resets_timeout++;
525	}
526}
527
528/*
529 * Completions are taking a very long time due to a congested numalink
530 * network.
531 */
532static void
533disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
534{
535	int tcpu;
536	struct bau_control *tbcp;
537
538	/* let only one cpu do this disabling */
539	spin_lock(&disable_lock);
540	if (!baudisabled && bcp->period_requests &&
541	    ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
542		/* it becomes this cpu's job to turn on the use of the
543		   BAU again */
544		baudisabled = 1;
545		bcp->set_bau_off = 1;
546		bcp->set_bau_on_time = get_cycles() +
547			sec_2_cycles(bcp->congested_period);
548		stat->s_bau_disabled++;
549		for_each_present_cpu(tcpu) {
550			tbcp = &per_cpu(bau_control, tcpu);
551			tbcp->baudisabled = 1;
552		}
553	}
554	spin_unlock(&disable_lock);
555}
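
/*
 * Example of the test above (the default is illustrative): with
 * congested_response_us at 1000, congested_cycles is
 * microsec_2_cycles(1000), so the BAU is switched off once the
 * average cost of a broadcast, period_time / period_requests,
 * exceeds roughly a millisecond.  The cpu that set set_bau_off
 * re-enables the BAU after congested_period seconds, from
 * uv_flush_tlb_others().
 */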
556
557/**
558 * uv_flush_send_and_wait
559 *
560 * Send a broadcast and wait for it to complete.
561 *
562 * The flush_mask contains the cpus the broadcast is to be sent to including
563 * cpus that are on the local uvhub.
564 *
565 * Returns 0 if all flushing represented in the mask was done.
566 * Returns 1 if it gives up entirely and the original cpu mask is to be
567 * returned to the kernel.
568 */
569int uv_flush_send_and_wait(struct bau_desc *bau_desc,
570			   struct cpumask *flush_mask, struct bau_control *bcp)
571{
572	int right_shift;
573	int completion_status = 0;
574	int seq_number = 0;
575	long try = 0;
576	int cpu = bcp->uvhub_cpu;
577	int this_cpu = bcp->cpu;
578	unsigned long mmr_offset;
579	unsigned long index;
580	cycles_t time1;
581	cycles_t time2;
582	cycles_t elapsed;
583	struct ptc_stats *stat = bcp->statp;
584	struct bau_control *smaster = bcp->socket_master;
585	struct bau_control *hmaster = bcp->uvhub_master;
586
587	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
588			&hmaster->active_descriptor_count,
589			hmaster->max_bau_concurrent)) {
590		stat->s_throttles++;
591		do {
592			cpu_relax();
593		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
594			&hmaster->active_descriptor_count,
595			hmaster->max_bau_concurrent));
596	}
597	while (hmaster->uvhub_quiesce)
598		cpu_relax();
599
600	if (cpu < UV_CPUS_PER_ACT_STATUS) {
601		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
602		right_shift = cpu * UV_ACT_STATUS_SIZE;
603	} else {
604		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
605		right_shift =
606		    ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
607	}
608	time1 = get_cycles();
609	do {
610		if (try == 0) {
611			bau_desc->header.msg_type = MSG_REGULAR;
612			seq_number = bcp->message_number++;
613		} else {
614			bau_desc->header.msg_type = MSG_RETRY;
615			stat->s_retry_messages++;
616		}
617		bau_desc->header.sequence = seq_number;
618		index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
619			bcp->uvhub_cpu;
620		bcp->send_message = get_cycles();
621		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
622		try++;
623		completion_status = uv_wait_completion(bau_desc, mmr_offset,
624			right_shift, this_cpu, bcp, smaster, try);
625
626		if (completion_status == FLUSH_RETRY_PLUGGED) {
627			destination_plugged(bau_desc, bcp, hmaster, stat);
628		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
629			destination_timeout(bau_desc, bcp, hmaster, stat);
630		}
631		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
632			bcp->ipi_attempts = 0;
633			completion_status = FLUSH_GIVEUP;
634			break;
635		}
636		cpu_relax();
637	} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
638		 (completion_status == FLUSH_RETRY_TIMEOUT));
639	time2 = get_cycles();
640	bcp->plugged_tries = 0;
641	bcp->timeout_tries = 0;
642	if ((completion_status == FLUSH_COMPLETE) &&
643	    (bcp->conseccompletes > bcp->complete_threshold) &&
644	    (hmaster->max_bau_concurrent <
645					hmaster->max_bau_concurrent_constant))
646			hmaster->max_bau_concurrent++;
647	while (hmaster->uvhub_quiesce)
648		cpu_relax();
649	atomic_dec(&hmaster->active_descriptor_count);
650	if (time2 > time1) {
651		elapsed = time2 - time1;
652		stat->s_time += elapsed;
653		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
654			bcp->period_requests++;
655			bcp->period_time += elapsed;
656			if ((elapsed > congested_cycles) &&
657			    (bcp->period_requests > bcp->congested_reps)) {
658				disable_for_congestion(bcp, stat);
659			}
660		}
661	} else
662		stat->s_requestor--;
663	if (completion_status == FLUSH_COMPLETE && try > 1)
664		stat->s_retriesok++;
665	else if (completion_status == FLUSH_GIVEUP) {
666		stat->s_giveup++;
667		return 1;
668	}
669	return 0;
670}
671
672/**
673 * uv_flush_tlb_others - globally purge translation cache of a virtual
674 * address or all TLB's
675 * @cpumask: mask of all cpu's in which the address is to be removed
676 * @mm: mm_struct containing virtual address range
677 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
678 * @cpu: the current cpu
679 *
680 * This is the entry point for initiating any UV global TLB shootdown.
681 *
682 * Purges the translation caches of all specified processors of the given
683 * virtual address, or purges all TLB's on specified processors.
684 *
685 * The caller has derived the cpumask from the mm_struct.  This function
686 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
687 *
688 * The cpumask is converted into a uvhubmask of the uvhubs containing
689 * those cpus.
690 *
691 * Note that this function should be called with preemption disabled.
692 *
693 * Returns NULL if all remote flushing was done.
694 * Returns pointer to cpumask if some remote flushing remains to be
695 * done.  The returned pointer is valid till preemption is re-enabled.
696 */
697const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
698					  struct mm_struct *mm,
699					  unsigned long va, unsigned int cpu)
700{
701	int tcpu;
702	int uvhub;
703	int locals = 0;
704	int remotes = 0;
705	int hubs = 0;
706	struct bau_desc *bau_desc;
707	struct cpumask *flush_mask;
708	struct ptc_stats *stat;
709	struct bau_control *bcp;
710	struct bau_control *tbcp;
711
712	/* kernel was booted 'nobau' */
713	if (nobau)
714		return cpumask;
715
716	bcp = &per_cpu(bau_control, cpu);
717	stat = bcp->statp;
718
719	/* bau was disabled due to slow response */
720	if (bcp->baudisabled) {
721		/* the cpu that disabled it must re-enable it */
722		if (bcp->set_bau_off) {
723			if (get_cycles() >= bcp->set_bau_on_time) {
724				stat->s_bau_reenabled++;
725				baudisabled = 0;
726				for_each_present_cpu(tcpu) {
727					tbcp = &per_cpu(bau_control, tcpu);
728					tbcp->baudisabled = 0;
729					tbcp->period_requests = 0;
730					tbcp->period_time = 0;
731				}
732			}
733		}
734		return cpumask;
735	}
736
737	/*
738	 * Each sending cpu has a per-cpu mask which it fills from the caller's
739	 * cpu mask.  All cpus are converted to uvhubs and copied to the
740	 * activation descriptor.
741	 */
742	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
743	/* don't actually do a shootdown of the local cpu */
744	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
745	if (cpu_isset(cpu, *cpumask))
746		stat->s_ntargself++;
747
748	bau_desc = bcp->descriptor_base;
749	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
750	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
751
752	/* cpu statistics */
753	for_each_cpu(tcpu, flush_mask) {
754		uvhub = uv_cpu_to_blade_id(tcpu);
755		bau_uvhub_set(uvhub, &bau_desc->distribution);
756		if (uvhub == bcp->uvhub)
757			locals++;
758		else
759			remotes++;
760	}
761	if ((locals + remotes) == 0)
762		return NULL;
763	stat->s_requestor++;
764	stat->s_ntargcpu += remotes + locals;
765	stat->s_ntargremotes += remotes;
766	stat->s_ntarglocals += locals;
767	remotes = bau_uvhub_weight(&bau_desc->distribution);
768
769	/* uvhub statistics */
770	hubs = bau_uvhub_weight(&bau_desc->distribution);
771	if (locals) {
772		stat->s_ntarglocaluvhub++;
773		stat->s_ntargremoteuvhub += (hubs - 1);
774	} else
775		stat->s_ntargremoteuvhub += hubs;
776	stat->s_ntarguvhub += hubs;
777	if (hubs >= 16)
778		stat->s_ntarguvhub16++;
779	else if (hubs >= 8)
780		stat->s_ntarguvhub8++;
781	else if (hubs >= 4)
782		stat->s_ntarguvhub4++;
783	else if (hubs >= 2)
784		stat->s_ntarguvhub2++;
785	else
786		stat->s_ntarguvhub1++;
787
788	bau_desc->payload.address = va;
789	bau_desc->payload.sending_cpu = cpu;
790
791	/*
792	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
793	 * or 1 if it gave up and the original cpumask should be returned.
794	 */
795	if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
796		return NULL;
797	else
798		return cpumask;
799}
800
801/*
802 * The BAU message interrupt comes here. (registered by set_intr_gate)
803 * See entry_64.S
804 *
805 * We received a broadcast assist message.
806 *
807 * Interrupts are disabled; this interrupt could represent
808 * the receipt of several messages.
809 *
810 * All cores/threads on this hub get this interrupt.
811 * The last one to see it does the software ack.
812 * (the resource will not be freed until noninterruptable cpus see this
813 *  interrupt; hardware may timeout the s/w ack and reply ERROR)
814 */
815void uv_bau_message_interrupt(struct pt_regs *regs)
816{
817	int count = 0;
818	cycles_t time_start;
819	struct bau_payload_queue_entry *msg;
820	struct bau_control *bcp;
821	struct ptc_stats *stat;
822	struct msg_desc msgdesc;
823
824	time_start = get_cycles();
825	bcp = &per_cpu(bau_control, smp_processor_id());
826	stat = bcp->statp;
827	msgdesc.va_queue_first = bcp->va_queue_first;
828	msgdesc.va_queue_last = bcp->va_queue_last;
829	msg = bcp->bau_msg_head;
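	/*
	 * Drain every queued entry whose sw_ack_vector is still set.
	 * The payload queue is circular, so wrap from va_queue_last
	 * back to va_queue_first, and leave bau_msg_head pointing at
	 * the next slot to examine on the following interrupt.
	 */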
830	while (msg->sw_ack_vector) {
831		count++;
832		msgdesc.msg_slot = msg - msgdesc.va_queue_first;
833		msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
834		msgdesc.msg = msg;
835		uv_bau_process_message(&msgdesc, bcp);
836		msg++;
837		if (msg > msgdesc.va_queue_last)
838			msg = msgdesc.va_queue_first;
839		bcp->bau_msg_head = msg;
840	}
841	stat->d_time += (get_cycles() - time_start);
842	if (!count)
843		stat->d_nomsg++;
844	else if (count > 1)
845		stat->d_multmsg++;
846	ack_APIC_irq();
847}
848
849/*
850 * uv_enable_timeouts
851 *
852 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
853 * shootdown message timeouts enabled.  The timeout does not cause
854 * an interrupt, but causes an error message to be returned to
855 * the sender.
856 */
857static void uv_enable_timeouts(void)
858{
859	int uvhub;
860	int nuvhubs;
861	int pnode;
862	unsigned long mmr_image;
863
864	nuvhubs = uv_num_possible_blades();
865
866	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
867		if (!uv_blade_nr_possible_cpus(uvhub))
868			continue;
869
870		pnode = uv_blade_to_pnode(uvhub);
871		mmr_image =
872		    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
873		/*
874		 * Set the timeout period and then lock it in, in three
875		 * steps: turn SOFT_ACK_MODE off (the period can only be
876		 * programmed while it is off), set the 4-bit period, then
877		 * turn it back on, which captures and locks in the period.
878		 */
879		mmr_image &= ~((unsigned long)1 <<
880		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
881		uv_write_global_mmr64
882		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
883		/*
884		 * Set the 4-bit period.
885		 */
886		mmr_image &= ~((unsigned long)0xf <<
887		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
888		mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
889		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
890		uv_write_global_mmr64
891		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
892		/*
893		 * Subsequent reversals of the timebase bit (3) cause an
894		 * immediate timeout of one or all INTD resources as
895		 * indicated in bits 2:0 (7 causes all of them to timeout).
896		 */
897		mmr_image |= ((unsigned long)1 <<
898		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
899		uv_write_global_mmr64
900		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
901	}
902}
903
904static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
905{
906	if (*offset < num_possible_cpus())
907		return offset;
908	return NULL;
909}
910
911static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
912{
913	(*offset)++;
914	if (*offset < num_possible_cpus())
915		return offset;
916	return NULL;
917}
918
919static void uv_ptc_seq_stop(struct seq_file *file, void *data)
920{
921}
922
923static inline unsigned long long
924microsec_2_cycles(unsigned long microsec)
925{
926	unsigned long ns;
927	unsigned long long cyc;
928
929	ns = microsec * 1000;
930	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
931	return cyc;
932}
933
934/*
935 * Display the statistics thru /proc.
936 * 'data' points to the cpu number
937 */
938static int uv_ptc_seq_show(struct seq_file *file, void *data)
939{
940	struct ptc_stats *stat;
941	int cpu;
942
943	cpu = *(loff_t *)data;
944
945	if (!cpu) {
946		seq_printf(file,
947			"# cpu sent stime self locals remotes ncpus localhub ");
948		seq_printf(file,
949			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
950		seq_printf(file,
951			"numuvhubs4 numuvhubs2 numuvhubs1 dto ");
952		seq_printf(file,
953			"retries rok resetp resett giveup sto bz throt ");
954		seq_printf(file,
955			"sw_ack recv rtime all ");
956		seq_printf(file,
957			"one mult none retry canc nocan reset rcan ");
958		seq_printf(file,
959			"disable enable\n");
960	}
961	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
962		stat = &per_cpu(ptcstats, cpu);
963		/* source side statistics */
964		seq_printf(file,
965			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
966			   cpu, stat->s_requestor, cycles_2_us(stat->s_time),
967			   stat->s_ntargself, stat->s_ntarglocals,
968			   stat->s_ntargremotes, stat->s_ntargcpu,
969			   stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
970			   stat->s_ntarguvhub, stat->s_ntarguvhub16);
971		seq_printf(file, "%ld %ld %ld %ld %ld ",
972			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
973			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
974			   stat->s_dtimeout);
975		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
976			   stat->s_retry_messages, stat->s_retriesok,
977			   stat->s_resets_plug, stat->s_resets_timeout,
978			   stat->s_giveup, stat->s_stimeout,
979			   stat->s_busy, stat->s_throttles);
980
981		/* destination side statistics */
982		seq_printf(file,
983			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
984			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
985					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
986			   stat->d_requestee, cycles_2_us(stat->d_time),
987			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
988			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
989			   stat->d_nocanceled, stat->d_resets,
990			   stat->d_rcanceled);
991		seq_printf(file, "%ld %ld\n",
992			stat->s_bau_disabled, stat->s_bau_reenabled);
993	}
994
995	return 0;
996}
997
998/*
999 * Display the tunables thru debugfs
1000 */
1001static ssize_t tunables_read(struct file *file, char __user *userbuf,
1002						size_t count, loff_t *ppos)
1003{
1004	char *buf;
1005	int ret;
1006
1007	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1008		"max_bau_concurrent plugged_delay plugsb4reset",
1009		"timeoutsb4reset ipi_reset_limit complete_threshold",
1010		"congested_response_us congested_reps congested_period",
1011		max_bau_concurrent, plugged_delay, plugsb4reset,
1012		timeoutsb4reset, ipi_reset_limit, complete_threshold,
1013		congested_response_us, congested_reps, congested_period);
1014
1015	if (!buf)
1016		return -ENOMEM;
1017
1018	ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
1019	kfree(buf);
1020	return ret;
1021}
1022
1023/*
1024 * -1: reset the statistics
1025 *  0: display meaning of the statistics
1026 */
1027static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
1028				 size_t count, loff_t *data)
1029{
1030	int cpu;
1031	long input_arg;
1032	char optstr[64];
1033	struct ptc_stats *stat;
1034
1035	if (count == 0 || count > sizeof(optstr))
1036		return -EINVAL;
1037	if (copy_from_user(optstr, user, count))
1038		return -EFAULT;
1039	optstr[count - 1] = '\0';
1040	if (strict_strtol(optstr, 10, &input_arg) < 0) {
1041		printk(KERN_DEBUG "%s is invalid\n", optstr);
1042		return -EINVAL;
1043	}
1044
1045	if (input_arg == 0) {
1046		printk(KERN_DEBUG "# cpu:      cpu number\n");
1047		printk(KERN_DEBUG "Sender statistics:\n");
1048		printk(KERN_DEBUG
1049		"sent:     number of shootdown messages sent\n");
1050		printk(KERN_DEBUG
1051		"stime:    time spent sending messages\n");
1052		printk(KERN_DEBUG
1053		"numuvhubs: number of hubs targeted with shootdown\n");
1054		printk(KERN_DEBUG
1055		"numuvhubs16: number times 16 or more hubs targeted\n");
1056		printk(KERN_DEBUG
1057		"numuvhubs8: number times 8 or more hubs targeted\n");
1058		printk(KERN_DEBUG
1059		"numuvhubs4: number times 4 or more hubs targeted\n");
1060		printk(KERN_DEBUG
1061		"numuvhubs2: number times 2 or more hubs targeted\n");
1062		printk(KERN_DEBUG
1063		"numuvhubs1: number times 1 hub targeted\n");
1064		printk(KERN_DEBUG
1065		"numcpus:  number of cpus targeted with shootdown\n");
1066		printk(KERN_DEBUG
1067		"dto:      number of destination timeouts\n");
1068		printk(KERN_DEBUG
1069		"retries:  destination timeout retries sent\n");
1070		printk(KERN_DEBUG
1071		"rok:   :  destination timeouts successfully retried\n");
1072		printk(KERN_DEBUG
1073		"resetp:   ipi-style resource resets for plugs\n");
1074		printk(KERN_DEBUG
1075		"resett:   ipi-style resource resets for timeouts\n");
1076		printk(KERN_DEBUG
1077		"giveup:   fall-backs to ipi-style shootdowns\n");
1078		printk(KERN_DEBUG
1079		"sto:      number of source timeouts\n");
1080		printk(KERN_DEBUG
1081		"bz:       number of stay-busy's\n");
1082		printk(KERN_DEBUG
1083		"throt:    number times spun in throttle\n");
1084		printk(KERN_DEBUG "Destination side statistics:\n");
1085		printk(KERN_DEBUG
1086		"sw_ack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
1087		printk(KERN_DEBUG
1088		"recv:     shootdown messages received\n");
1089		printk(KERN_DEBUG
1090		"rtime:    time spent processing messages\n");
1091		printk(KERN_DEBUG
1092		"all:      shootdown all-tlb messages\n");
1093		printk(KERN_DEBUG
1094		"one:      shootdown one-tlb messages\n");
1095		printk(KERN_DEBUG
1096		"mult:     interrupts that found multiple messages\n");
1097		printk(KERN_DEBUG
1098		"none:     interrupts that found no messages\n");
1099		printk(KERN_DEBUG
1100		"retry:    number of retry messages processed\n");
1101		printk(KERN_DEBUG
1102		"canc:     number messages canceled by retries\n");
1103		printk(KERN_DEBUG
1104		"nocan:    number retries that found nothing to cancel\n");
1105		printk(KERN_DEBUG
1106		"reset:    number of ipi-style reset requests processed\n");
1107		printk(KERN_DEBUG
1108		"rcan:     number messages canceled by reset requests\n");
1109		printk(KERN_DEBUG
1110		"disable:  number times use of the BAU was disabled\n");
1111		printk(KERN_DEBUG
1112		"enable:   number times use of the BAU was re-enabled\n");
1113	} else if (input_arg == -1) {
1114		for_each_present_cpu(cpu) {
1115			stat = &per_cpu(ptcstats, cpu);
1116			memset(stat, 0, sizeof(struct ptc_stats));
1117		}
1118	}
1119
1120	return count;
1121}
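
/*
 * Example use of the control interface above (the proc path is
 * UV_PTC_BASENAME, normally sgi_uv/ptc_statistics):
 *
 *	echo  0 > /proc/sgi_uv/ptc_statistics	# print field meanings
 *	echo -1 > /proc/sgi_uv/ptc_statistics	# zero all the counters
 */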
1122
1123static int local_atoi(const char *name)
1124{
1125	int val = 0;
1126
1127	for (;; name++) {
1128		switch (*name) {
1129		case '0' ... '9':
1130			val = 10*val+(*name-'0');
1131			break;
1132		default:
1133			return val;
1134		}
1135	}
1136}
1137
1138/*
1139 * set the tunables
1140 * 0 values reset them to defaults
1141 */
1142static ssize_t tunables_write(struct file *file, const char __user *user,
1143				 size_t count, loff_t *data)
1144{
1145	int cpu;
1146	int cnt = 0;
1147	int val;
1148	char *p;
1149	char *q;
1150	char instr[64];
1151	struct bau_control *bcp;
1152
1153	if (count == 0 || count > sizeof(instr)-1)
1154		return -EINVAL;
1155	if (copy_from_user(instr, user, count))
1156		return -EFAULT;
1157
1158	instr[count] = '\0';
1159	/* count the fields */
1160	p = instr + strspn(instr, WHITESPACE);
1161	q = p;
1162	for (; *p; p = q + strspn(q, WHITESPACE)) {
1163		q = p + strcspn(p, WHITESPACE);
1164		cnt++;
1165		if (q == p)
1166			break;
1167	}
1168	if (cnt != 9) {
1169		printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
1170		return -EINVAL;
1171	}
1172
1173	p = instr + strspn(instr, WHITESPACE);
1174	q = p;
1175	for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
1176		q = p + strcspn(p, WHITESPACE);
1177		val = local_atoi(p);
1178		switch (cnt) {
1179		case 0:
1180			if (val == 0) {
1181				max_bau_concurrent = MAX_BAU_CONCURRENT;
1182				max_bau_concurrent_constant =
1183							MAX_BAU_CONCURRENT;
1184				continue;
1185			}
1186			bcp = &per_cpu(bau_control, smp_processor_id());
1187			if (val < 1 || val > bcp->cpus_in_uvhub) {
1188				printk(KERN_DEBUG
1189				"Error: BAU max concurrent %d is invalid\n",
1190				val);
1191				return -EINVAL;
1192			}
1193			max_bau_concurrent = val;
1194			max_bau_concurrent_constant = val;
1195			continue;
1196		case 1:
1197			if (val == 0)
1198				plugged_delay = PLUGGED_DELAY;
1199			else
1200				plugged_delay = val;
1201			continue;
1202		case 2:
1203			if (val == 0)
1204				plugsb4reset = PLUGSB4RESET;
1205			else
1206				plugsb4reset = val;
1207			continue;
1208		case 3:
1209			if (val == 0)
1210				timeoutsb4reset = TIMEOUTSB4RESET;
1211			else
1212				timeoutsb4reset = val;
1213			continue;
1214		case 4:
1215			if (val == 0)
1216				ipi_reset_limit = IPI_RESET_LIMIT;
1217			else
1218				ipi_reset_limit = val;
1219			continue;
1220		case 5:
1221			if (val == 0)
1222				complete_threshold = COMPLETE_THRESHOLD;
1223			else
1224				complete_threshold = val;
1225			continue;
1226		case 6:
1227			if (val == 0)
1228				congested_response_us = CONGESTED_RESPONSE_US;
1229			else
1230				congested_response_us = val;
1231			continue;
1232		case 7:
1233			if (val == 0)
1234				congested_reps = CONGESTED_REPS;
1235			else
1236				congested_reps = val;
1237			continue;
1238		case 8:
1239			if (val == 0)
1240				congested_period = CONGESTED_PERIOD;
1241			else
1242				congested_period = val;
1243			continue;
1244		}
1245		if (q == p)
1246			break;
1247	}
1248	for_each_present_cpu(cpu) {
1249		bcp = &per_cpu(bau_control, cpu);
1250		bcp->max_bau_concurrent = max_bau_concurrent;
1251		bcp->max_bau_concurrent_constant = max_bau_concurrent;
1252		bcp->plugged_delay = plugged_delay;
1253		bcp->plugsb4reset = plugsb4reset;
1254		bcp->timeoutsb4reset = timeoutsb4reset;
1255		bcp->ipi_reset_limit = ipi_reset_limit;
1256		bcp->complete_threshold = complete_threshold;
1257		bcp->congested_response_us = congested_response_us;
1258		bcp->congested_reps = congested_reps;
1259		bcp->congested_period = congested_period;
1260	}
1261	return count;
1262}
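
/*
 * Example use of the tunables file above (the path assumes the usual
 * debugfs mount point and the UV_BAU_TUNABLES_DIR/FILE names,
 * normally sgi_uv/bau_tunables).  Nine values, in the order printed
 * by tunables_read(); a 0 restores that tunable's default, so nine
 * zeros restore all of the defaults:
 *
 *	echo "0 0 0 0 0 0 0 0 0" > /sys/kernel/debug/sgi_uv/bau_tunables
 */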
1263
1264static const struct seq_operations uv_ptc_seq_ops = {
1265	.start		= uv_ptc_seq_start,
1266	.next		= uv_ptc_seq_next,
1267	.stop		= uv_ptc_seq_stop,
1268	.show		= uv_ptc_seq_show
1269};
1270
1271static int uv_ptc_proc_open(struct inode *inode, struct file *file)
1272{
1273	return seq_open(file, &uv_ptc_seq_ops);
1274}
1275
1276static int tunables_open(struct inode *inode, struct file *file)
1277{
1278	return 0;
1279}
1280
1281static const struct file_operations proc_uv_ptc_operations = {
1282	.open		= uv_ptc_proc_open,
1283	.read		= seq_read,
1284	.write		= uv_ptc_proc_write,
1285	.llseek		= seq_lseek,
1286	.release	= seq_release,
1287};
1288
1289static const struct file_operations tunables_fops = {
1290	.open		= tunables_open,
1291	.read		= tunables_read,
1292	.write		= tunables_write,
1293	.llseek		= default_llseek,
1294};
1295
1296static int __init uv_ptc_init(void)
1297{
1298	struct proc_dir_entry *proc_uv_ptc;
1299
1300	if (!is_uv_system())
1301		return 0;
1302
1303	proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
1304				  &proc_uv_ptc_operations);
1305	if (!proc_uv_ptc) {
1306		printk(KERN_ERR "unable to create %s proc entry\n",
1307		       UV_PTC_BASENAME);
1308		return -EINVAL;
1309	}
1310
1311	tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
1312	if (!tunables_dir) {
1313		printk(KERN_ERR "unable to create debugfs directory %s\n",
1314		       UV_BAU_TUNABLES_DIR);
1315		return -EINVAL;
1316	}
1317	tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1318			tunables_dir, NULL, &tunables_fops);
1319	if (!tunables_file) {
1320		printk(KERN_ERR "unable to create debugfs file %s\n",
1321		       UV_BAU_TUNABLES_FILE);
1322		return -EINVAL;
1323	}
1324	return 0;
1325}
1326
1327/*
1328 * initialize the sending side's sending buffers
1329 */
1330static void
1331uv_activation_descriptor_init(int node, int pnode)
1332{
1333	int i;
1334	int cpu;
1335	unsigned long pa;
1336	unsigned long m;
1337	unsigned long n;
1338	struct bau_desc *bau_desc;
1339	struct bau_desc *bd2;
1340	struct bau_control *bcp;
1341
1342	/*
1343	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
1344	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
1345	 */
1346	bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
1347				* UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
1348	BUG_ON(!bau_desc);
1349
1350	pa = uv_gpa(bau_desc); /* need the real nasid */
1351	n = pa >> uv_nshift;
1352	m = pa & uv_mmask;
1353
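	/*
	 * The descriptor base register takes the global address split
	 * into a node number (the bits above m_val, computed in n) and
	 * a node-local offset (m), recombined at the hardware-defined
	 * pnode shift.
	 */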
1354	uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
1355			      (n << UV_DESC_BASE_PNODE_SHIFT | m));
1356
1357	/*
1358	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
1359	 * cpu even though we only use the first one; one descriptor can
1360	 * describe a broadcast to 256 uv hubs.
1361	 */
1362	for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
1363		i++, bd2++) {
1364		memset(bd2, 0, sizeof(struct bau_desc));
1365		bd2->header.sw_ack_flag = 1;
1366		/*
1367		 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
1368		 * in the partition. The bit map will indicate uvhub numbers,
1369		 * which are 0-N in a partition. Pnodes are unique system-wide.
1370		 */
1371		bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
1372		bd2->header.dest_subnodeid = 0x10; /* the LB */
1373		bd2->header.command = UV_NET_ENDPOINT_INTD;
1374		bd2->header.int_both = 1;
1375		/*
1376		 * all others need to be set to zero:
1377		 *   fairness chaining multilevel count replied_to
1378		 */
1379	}
1380	for_each_present_cpu(cpu) {
1381		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
1382			continue;
1383		bcp = &per_cpu(bau_control, cpu);
1384		bcp->descriptor_base = bau_desc;
1385	}
1386}
1387
1388/*
1389 * initialize the destination side's receiving buffers
1390 * entered for each uvhub in the partition
1391 * - node is first node (kernel memory notion) on the uvhub
1392 * - pnode is the uvhub's physical identifier
1393 */
1394static void
1395uv_payload_queue_init(int node, int pnode)
1396{
1397	int pn;
1398	int cpu;
1399	char *cp;
1400	unsigned long pa;
1401	struct bau_payload_queue_entry *pqp;
1402	struct bau_payload_queue_entry *pqp_malloc;
1403	struct bau_control *bcp;
1404
1405	pqp = kmalloc_node((DEST_Q_SIZE + 1)
1406			   * sizeof(struct bau_payload_queue_entry),
1407			   GFP_KERNEL, node);
1408	BUG_ON(!pqp);
1409	pqp_malloc = pqp;
1410
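
	/*
	 * Round the queue start up to the next 32-byte boundary;
	 * pqp_malloc preserves the pointer actually returned by
	 * kmalloc_node(), and the extra entry allocated above provides
	 * the slack consumed by this round-up.
	 */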
1411	cp = (char *)pqp + 31;
1412	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
1413
1414	for_each_present_cpu(cpu) {
1415		if (pnode != uv_cpu_to_pnode(cpu))
1416			continue;
1417		/* for every cpu on this pnode: */
1418		bcp = &per_cpu(bau_control, cpu);
1419		bcp->va_queue_first = pqp;
1420		bcp->bau_msg_head = pqp;
1421		bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
1422	}
1423	/*
1424	 * need the pnode of where the memory was really allocated
1425	 */
1426	pa = uv_gpa(pqp);
1427	pn = pa >> uv_nshift;
1428	uv_write_global_mmr64(pnode,
1429			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
1430			      ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
1431			      uv_physnodeaddr(pqp));
1432	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
1433			      uv_physnodeaddr(pqp));
1434	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
1435			      (unsigned long)
1436			      uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
1437	/* in effect, all msg_type's are set to MSG_NOOP */
1438	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
1439}
1440
1441/*
1442 * Initialization of each UV hub's structures
1443 */
1444static void __init uv_init_uvhub(int uvhub, int vector)
1445{
1446	int node;
1447	int pnode;
1448	unsigned long apicid;
1449
1450	node = uvhub_to_first_node(uvhub);
1451	pnode = uv_blade_to_pnode(uvhub);
1452	uv_activation_descriptor_init(node, pnode);
1453	uv_payload_queue_init(node, pnode);
1454	/*
1455	 * the below initialization can't be in firmware because the
1456	 * messaging IRQ will be determined by the OS
1457	 */
1458	apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
1459	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
1460				      ((apicid << 32) | vector));
1461}
1462
1463/*
1464 * We will set BAU_MISC_CONTROL with a timeout period.
1465 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
1466 * So the destination timeout period has to be calculated from them.
1467 */
1468static int
1469calculate_destination_timeout(void)
1470{
1471	unsigned long mmr_image;
1472	int mult1;
1473	int mult2;
1474	int index;
1475	int base;
1476	int ret;
1477	unsigned long ts_ns;
1478
1479	mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1480	mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1481	index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1482	mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1483	mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1484	base = timeout_base_ns[index];
1485	ts_ns = base * mult1 * mult2;
1486	ret = ts_ns / 1000;
1487	return ret;
1488}
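
/*
 * Worked example of the calculation above (register contents are
 * illustrative): if the BIOS selected index 2 in
 * UVH_AGING_PRESCALE_SEL, base is timeout_base_ns[2] == 1280 ns;
 * with mult1 == 10 and mult2 == 80 the destination timeout is
 * 1280 * 10 * 80 ns, i.e. roughly 1024 us, which is the value
 * uv_wait_completion() compares elapsed send time against.
 */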
1489
1490/*
1491 * initialize the bau_control structure for each cpu
1492 */
1493static void __init uv_init_per_cpu(int nuvhubs)
1494{
1495	int i;
1496	int cpu;
1497	int pnode;
1498	int uvhub;
1499	int have_hmaster;
1500	short socket = 0;
1501	unsigned short socket_mask;
1502	unsigned char *uvhub_mask;
1503	struct bau_control *bcp;
1504	struct uvhub_desc *bdp;
1505	struct socket_desc *sdp;
1506	struct bau_control *hmaster = NULL;
1507	struct bau_control *smaster = NULL;
1508	struct socket_desc {
1509		short num_cpus;
1510		short cpu_number[16];
1511	};
1512	struct uvhub_desc {
1513		unsigned short socket_mask;
1514		short num_cpus;
1515		short uvhub;
1516		short pnode;
1517		struct socket_desc socket[2];
1518	};
1519	struct uvhub_desc *uvhub_descs;
1520
1521	timeout_us = calculate_destination_timeout();
1522
1523	uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1524	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1525	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1526	for_each_present_cpu(cpu) {
1527		bcp = &per_cpu(bau_control, cpu);
1528		memset(bcp, 0, sizeof(struct bau_control));
1529		pnode = uv_cpu_hub_info(cpu)->pnode;
1530		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1531		*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1532		bdp = &uvhub_descs[uvhub];
1533		bdp->num_cpus++;
1534		bdp->uvhub = uvhub;
1535		bdp->pnode = pnode;
1536		/* kludge: 'assuming' one node per socket, and assuming that
1537		   disabling a socket just leaves a gap in node numbers */
1538		socket = (cpu_to_node(cpu) & 1);
1539		bdp->socket_mask |= (1 << socket);
1540		sdp = &bdp->socket[socket];
1541		sdp->cpu_number[sdp->num_cpus] = cpu;
1542		sdp->num_cpus++;
1543	}
1544	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1545		if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1546			continue;
1547		have_hmaster = 0;
1548		bdp = &uvhub_descs[uvhub];
1549		socket_mask = bdp->socket_mask;
1550		socket = 0;
1551		while (socket_mask) {
1552			if (!(socket_mask & 1))
1553				goto nextsocket;
1554			sdp = &bdp->socket[socket];
1555			for (i = 0; i < sdp->num_cpus; i++) {
1556				cpu = sdp->cpu_number[i];
1557				bcp = &per_cpu(bau_control, cpu);
1558				bcp->cpu = cpu;
1559				if (i == 0) {
1560					smaster = bcp;
1561					if (!have_hmaster) {
1562						have_hmaster++;
1563						hmaster = bcp;
1564					}
1565				}
1566				bcp->cpus_in_uvhub = bdp->num_cpus;
1567				bcp->cpus_in_socket = sdp->num_cpus;
1568				bcp->socket_master = smaster;
1569				bcp->uvhub = bdp->uvhub;
1570				bcp->uvhub_master = hmaster;
1571				bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1572						blade_processor_id;
1573			}
1574nextsocket:
1575			socket++;
1576			socket_mask = (socket_mask >> 1);
1577		}
1578	}
1579	kfree(uvhub_descs);
1580	kfree(uvhub_mask);
1581	for_each_present_cpu(cpu) {
1582		bcp = &per_cpu(bau_control, cpu);
1583		bcp->baudisabled = 0;
1584		bcp->statp = &per_cpu(ptcstats, cpu);
1585		/* time interval to catch a hardware stay-busy bug */
1586		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
1587		bcp->max_bau_concurrent = max_bau_concurrent;
1588		bcp->max_bau_concurrent_constant = max_bau_concurrent;
1589		bcp->plugged_delay = plugged_delay;
1590		bcp->plugsb4reset = plugsb4reset;
1591		bcp->timeoutsb4reset = timeoutsb4reset;
1592		bcp->ipi_reset_limit = ipi_reset_limit;
1593		bcp->complete_threshold = complete_threshold;
1594		bcp->congested_response_us = congested_response_us;
1595		bcp->congested_reps = congested_reps;
1596		bcp->congested_period = congested_period;
1597	}
1598}
1599
1600/*
1601 * Initialization of BAU-related structures
1602 */
1603static int __init uv_bau_init(void)
1604{
1605	int uvhub;
1606	int pnode;
1607	int nuvhubs;
1608	int cur_cpu;
1609	int vector;
1610	unsigned long mmr;
1611
1612	if (!is_uv_system())
1613		return 0;
1614
1615	if (nobau)
1616		return 0;
1617
1618	for_each_possible_cpu(cur_cpu)
1619		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
1620				       GFP_KERNEL, cpu_to_node(cur_cpu));
1621
1622	uv_nshift = uv_hub_info->m_val;
1623	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1624	nuvhubs = uv_num_possible_blades();
1625	spin_lock_init(&disable_lock);
1626	congested_cycles = microsec_2_cycles(congested_response_us);
1627
1628	uv_init_per_cpu(nuvhubs);
1629
1630	uv_partition_base_pnode = 0x7fffffff;
1631	for (uvhub = 0; uvhub < nuvhubs; uvhub++)
1632		if (uv_blade_nr_possible_cpus(uvhub) &&
1633			(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
1634			uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
1635
1636	vector = UV_BAU_MESSAGE;
1637	for_each_possible_blade(uvhub)
1638		if (uv_blade_nr_possible_cpus(uvhub))
1639			uv_init_uvhub(uvhub, vector);
1640
1641	uv_enable_timeouts();
1642	alloc_intr_gate(vector, uv_bau_message_intr1);
1643
1644	for_each_possible_blade(uvhub) {
1645		if (uv_blade_nr_possible_cpus(uvhub)) {
1646			pnode = uv_blade_to_pnode(uvhub);
1647			/* INIT the bau */
1648			uv_write_global_mmr64(pnode,
1649					UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1650					((unsigned long)1 << 63));
1651			mmr = 1; /* should be 1 to broadcast to both sockets */
1652			uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
1653						mmr);
1654		}
1655	}
1656
1657	return 0;
1658}
1659core_initcall(uv_bau_init);
1660fs_initcall(uv_ptc_init);
1661