flow_dissector.c revision 8ed781668dd49b608f1e67a22e3b445fd0c2cd6f
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <net/flow_keys.h>

/* copy saddr & daddr, possibly using 64bit load/store
 * Equivalent to :	flow->src = iph->saddr;
 *			flow->dst = iph->daddr;
 */
static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
{
	BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
		     offsetof(typeof(*flow), src) + sizeof(flow->src));
	memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
}

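/* skb_flow_dissect - extract the minimal flow keys from a packet
 *
 * Walks the headers starting at the network offset, peeling VLAN, PPPoE
 * and GRE/IPIP encapsulation as it goes, and fills @flow with the
 * source/destination addresses, IP protocol, transport ports and the
 * transport header offset.  Returns false if the headers cannot be read
 * (truncated packet) or the protocol is not handled.
 */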
bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
{
	int poff, nhoff = skb_network_offset(skb);
	u8 ip_proto;
	__be16 proto = skb->protocol;

	memset(flow, 0, sizeof(*flow));

again:
	switch (proto) {
	case __constant_htons(ETH_P_IP): {
		const struct iphdr *iph;
		struct iphdr _iph;
ip:
		iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
		if (!iph)
			return false;

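		/* Fragments are hashed on addresses only: ip_proto is
		 * cleared so no transport ports are dissected, which keeps
		 * the hash identical for every fragment of a flow.
		 */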
		if (ip_is_fragment(iph))
			ip_proto = 0;
		else
			ip_proto = iph->protocol;
		iph_to_flow_copy_addrs(flow, iph);
		nhoff += iph->ihl * 4;
		break;
	}
	case __constant_htons(ETH_P_IPV6): {
		const struct ipv6hdr *iph;
		struct ipv6hdr _iph;
ipv6:
		iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
		if (!iph)
			return false;

		ip_proto = iph->nexthdr;
		flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
		flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
		nhoff += sizeof(struct ipv6hdr);
		break;
	}
	case __constant_htons(ETH_P_8021Q): {
		const struct vlan_hdr *vlan;
		struct vlan_hdr _vlan;

		vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan);
		if (!vlan)
			return false;

		proto = vlan->h_vlan_encapsulated_proto;
		nhoff += sizeof(*vlan);
		goto again;
	}
	case __constant_htons(ETH_P_PPP_SES): {
		struct {
			struct pppoe_hdr hdr;
			__be16 proto;
		} *hdr, _hdr;
		hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
		if (!hdr)
			return false;
		proto = hdr->proto;
		nhoff += PPPOE_SES_HLEN;
		switch (proto) {
		case __constant_htons(PPP_IP):
			goto ip;
		case __constant_htons(PPP_IPV6):
			goto ipv6;
		default:
			return false;
		}
	}
	default:
		return false;
	}

	switch (ip_proto) {
	case IPPROTO_GRE: {
		struct gre_hdr {
			__be16 flags;
			__be16 proto;
		} *hdr, _hdr;

		hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
		if (!hdr)
			return false;
		/*
		 * Only look inside GRE if version zero and no
		 * routing
		 */
		if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
			proto = hdr->proto;
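			/* Skip the 4-byte base header, plus 4 bytes for each
			 * optional field (checksum, key, sequence) indicated
			 * by the flag bits, then dissect the inner packet.
			 */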
			nhoff += 4;
			if (hdr->flags & GRE_CSUM)
				nhoff += 4;
			if (hdr->flags & GRE_KEY)
				nhoff += 4;
			if (hdr->flags & GRE_SEQ)
				nhoff += 4;
			goto again;
		}
		break;
	}
	case IPPROTO_IPIP:
		goto again;
	default:
		break;
	}

	flow->ip_proto = ip_proto;
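	/* Record where the transport header starts, then fetch the two
	 * 16-bit ports from their protocol-specific offset (if any) as a
	 * single 32-bit load into flow->ports.
	 */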
	flow->thoff = (u16) nhoff;

	poff = proto_ports_offset(ip_proto);
	if (poff >= 0) {
		__be32 *ports, _ports;

		nhoff += poff;
		ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports);
		if (ports)
			flow->ports = *ports;
	}

	return true;
}
EXPORT_SYMBOL(skb_flow_dissect);

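/* Random seed for the flow hashes below, filled in once at boot by
 * initialize_hashrnd() so hash values differ across reboots.
 */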
static u32 hashrnd __read_mostly;

/*
 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
 * and src/dst port numbers.  Sets rxhash in skb to a non-zero hash value
 * on success; zero indicates no valid hash.  Also sets l4_rxhash in skb
 * if the hash is a canonical 4-tuple hash over transport ports.
 */
void __skb_get_rxhash(struct sk_buff *skb)
{
	struct flow_keys keys;
	u32 hash;

	if (!skb_flow_dissect(skb, &keys))
		return;

	if (keys.ports)
		skb->l4_rxhash = 1;

	/* get a consistent hash (same value on both flow directions) */
	if (((__force u32)keys.dst < (__force u32)keys.src) ||
	    (((__force u32)keys.dst == (__force u32)keys.src) &&
	     ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
		swap(keys.dst, keys.src);
		swap(keys.port16[0], keys.port16[1]);
	}

	hash = jhash_3words((__force u32)keys.dst,
			    (__force u32)keys.src,
			    (__force u32)keys.ports, hashrnd);
	if (!hash)
		hash = 1;

	skb->rxhash = hash;
}
EXPORT_SYMBOL(__skb_get_rxhash);

/*
 * Returns a Tx hash based on the given packet descriptor and the number of
 * Tx queues to be used as a distribution range.
 */
u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
		  unsigned int num_tx_queues)
{
	u32 hash;
	u16 qoffset = 0;
	u16 qcount = num_tx_queues;

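	/* If a receive queue was recorded, reuse it, folded into the
	 * available Tx queue range.
	 */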
	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		while (unlikely(hash >= num_tx_queues))
			hash -= num_tx_queues;
		return hash;
	}

	if (dev->num_tc) {
		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
		qoffset = dev->tc_to_txq[tc].offset;
		qcount = dev->tc_to_txq[tc].count;
	}

	if (skb->sk && skb->sk->sk_hash)
		hash = skb->sk->sk_hash;
	else
		hash = (__force u16) skb->protocol;
	hash = jhash_1word(hash, hashrnd);

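	/* Map the 32-bit hash onto [qoffset, qoffset + qcount) using a
	 * multiply-and-shift instead of a modulo.
	 */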
	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);

static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
				     dev->name, queue_index,
				     dev->real_num_tx_queues);
		return 0;
	}
	return queue_index;
}

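/* Look up the XPS map for the current CPU and pick a queue from it,
 * hashing over the flow when the map holds more than one queue.
 * Returns -1 if XPS is not configured or yields no valid queue.
 */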
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps) {
		map = rcu_dereference(
		    dev_maps->cpu_map[raw_smp_processor_id()]);
		if (map) {
			if (map->len == 1)
				queue_index = map->queues[0];
			else {
				u32 hash;
				if (skb->sk && skb->sk->sk_hash)
					hash = skb->sk->sk_hash;
				else
					hash = (__force u16) skb->protocol ^
					    skb->rxhash;
				hash = jhash_1word(hash, hashrnd);
				queue_index = map->queues[
				    ((u64)hash * map->len) >> 32];
			}
			if (unlikely(queue_index >= dev->real_num_tx_queues))
				queue_index = -1;
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}

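/* Core queue selection: reuse the socket's cached queue when it is still
 * valid, otherwise pick a new one via XPS or the flow hash, and cache the
 * choice on the socket while its cached route matches the packet's.
 */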
u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	int queue_index = sk_tx_queue_get(sk);

	if (queue_index < 0 || skb->ooo_okay ||
	    queue_index >= dev->real_num_tx_queues) {
		int new_index = get_xps_queue(dev, skb);
		if (new_index < 0)
			new_index = skb_tx_hash(dev, skb);

		if (queue_index != new_index && sk) {
			struct dst_entry *dst =
				    rcu_dereference_check(sk->sk_dst_cache, 1);

			if (dst && skb_dst(skb) == dst)
				sk_tx_queue_set(sk, new_index);

		}

		queue_index = new_index;
	}

	return queue_index;
}
EXPORT_SYMBOL(__netdev_pick_tx);

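/* Select the Tx queue for @skb on @dev, preferring the driver's
 * ndo_select_queue() when one exists, record the choice in the skb's
 * queue mapping and return the corresponding netdev_queue.
 */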
struct netdev_queue *netdev_pick_tx(struct net_device *dev,
				    struct sk_buff *skb)
{
	int queue_index = 0;

	if (dev->real_num_tx_queues != 1) {
		const struct net_device_ops *ops = dev->netdev_ops;
		if (ops->ndo_select_queue)
			queue_index = ops->ndo_select_queue(dev, skb);
		else
			queue_index = __netdev_pick_tx(dev, skb);
		queue_index = dev_cap_txqueue(dev, queue_index);
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}

static int __init initialize_hashrnd(void)
{
	get_random_bytes(&hashrnd, sizeof(hashrnd));
	return 0;
}

late_initcall_sync(initialize_hashrnd);