1/* Evaluate MSG_ZEROCOPY
2 *
3 * Send traffic between two processes over one of the supported
4 * protocols and modes:
5 *
6 * PF_INET/PF_INET6
7 * - SOCK_STREAM
8 * - SOCK_DGRAM
9 * - SOCK_DGRAM with UDP_CORK
10 * - SOCK_RAW
11 * - SOCK_RAW with IP_HDRINCL
12 *
13 * PF_PACKET
14 * - SOCK_DGRAM
15 * - SOCK_RAW
16 *
17 * Start this program on two connected hosts, one in send mode and
18 * the other with option '-r' to put it in receiver mode.
19 *
20 * If zerocopy mode ('-z') is enabled, the sender will verify that
21 * the kernel queues completions on the error queue for all zerocopy
22 * transfers.
23 */
24
25#define _GNU_SOURCE
26
27#include <arpa/inet.h>
28#include <error.h>
29#include <errno.h>
30#include <limits.h>
31#include <linux/errqueue.h>
32#include <linux/if_packet.h>
33#include <linux/ipv6.h>
34#include <linux/socket.h>
35#include <linux/sockios.h>
36#include <net/ethernet.h>
37#include <net/if.h>
38#include <netinet/ip.h>
39#include <netinet/ip6.h>
40#include <netinet/tcp.h>
41#include <netinet/udp.h>
42#include <poll.h>
43#include <sched.h>
44#include <stdbool.h>
45#include <stdio.h>
46#include <stdint.h>
47#include <stdlib.h>
48#include <string.h>
49#include <sys/ioctl.h>
50#include <sys/socket.h>
51#include <sys/stat.h>
52#include <sys/time.h>
53#include <sys/types.h>
54#include <sys/wait.h>
55#include <unistd.h>
56
/*
 * Fallback values for the zerocopy socket API. Each one is only defined
 * here when the headers included above did not already provide the name.
 */
#if !defined(SO_EE_ORIGIN_ZEROCOPY)
# define SO_EE_ORIGIN_ZEROCOPY		5
#endif

#if !defined(SO_ZEROCOPY)
# define SO_ZEROCOPY			60
#endif

#if !defined(SO_EE_CODE_ZEROCOPY_COPIED)
# define SO_EE_CODE_ZEROCOPY_COPIED	1
#endif

#if !defined(MSG_ZEROCOPY)
# define MSG_ZEROCOPY			0x4000000
#endif
72
/* Test selection and addressing, filled in from the command line */
static int  cfg_family		= PF_UNSPEC;	/* -4 / -6 */
static int  cfg_port		= 8000;		/* -p */
static int  cfg_ifindex		= 1;		/* -i, stored as interface index */
static bool cfg_rx		= false;	/* -r: run as receiver */

/* Transmit behaviour */
static int  cfg_cork		= 0;		/* -c: fragments per corked datagram */
static bool cfg_cork_mixed	= false;	/* -m: alternate copy/zerocopy frags */
static int  cfg_payload_len	= 0;		/* -s: set in parse_opts() */
static bool cfg_zerocopy	= false;	/* -z */

/* Run control */
static int  cfg_cpu		= -1;		/* -C: cpu to pin to; -1 when not given */
static int  cfg_runtime_ms	= 4200;		/* -t */
static int  cfg_waittime_ms	= 500;		/* poll timeout, completion grace period */
static int  cfg_verbose		= 0;		/* -v, may be repeated */

/* Addresses parsed from -D / -S and the sockaddr length for cfg_family */
static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;
static struct sockaddr_storage cfg_src_addr;

/* Payload buffer shared by sender and receiver, plus run counters */
static char payload[IP_MAXPACKET];
static long packets;
static long bytes;
static long completions;
static long expected_completions;
static int  zerocopied		= -1;		/* -1 until the first completion is seen */
static uint32_t next_completion;		/* next expected notification id */
94
/* Return the current wall-clock time in milliseconds.
 *
 * The result is computed and returned as a 64-bit value: milliseconds
 * since the epoch do not fit in 32 bits, so multiplying tv_sec in a
 * 32-bit long (or returning through a 32-bit unsigned long) would wrap
 * and break the deadline comparisons made by the callers.
 */
static uint64_t gettimeofday_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return (tv.tv_sec * 1000ULL) + (tv.tv_usec / 1000);
}
102
/* Compute the 16-bit ones' complement checksum over @num_words 16-bit
 * words starting at @start.
 *
 * The words are summed as stored, carries above bit 15 are folded back
 * in until none remain, and the complement of the result is returned.
 */
static uint16_t get_ip_csum(const uint16_t *start, int num_words)
{
	const uint16_t *word = start;
	unsigned long acc = 0;

	while (num_words-- > 0)
		acc += *word++;

	/* end-around carry */
	for (; acc >> 16; )
		acc = (acc >> 16) + (acc & 0xFFFF);

	return (uint16_t) ~acc;
}
116
117static int do_setcpu(int cpu)
118{
119	cpu_set_t mask;
120
121	CPU_ZERO(&mask);
122	CPU_SET(cpu, &mask);
123	if (sched_setaffinity(0, sizeof(mask), &mask))
124		error(1, 0, "setaffinity %d", cpu);
125
126	if (cfg_verbose)
127		fprintf(stderr, "cpu: %u\n", cpu);
128
129	return 0;
130}
131
/* Set an integer-valued socket option; exit with the errno string on
 * failure.
 */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	const int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc != 0)
		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
137
138static int do_poll(int fd, int events)
139{
140	struct pollfd pfd;
141	int ret;
142
143	pfd.events = events;
144	pfd.revents = 0;
145	pfd.fd = fd;
146
147	ret = poll(&pfd, 1, cfg_waittime_ms);
148	if (ret == -1)
149		error(1, errno, "poll");
150
151	return ret && (pfd.revents & events);
152}
153
/* Accept one connection on listening socket @fd, then close the
 * listening socket. Returns the connected socket; exits on failure.
 */
static int do_accept(int fd)
{
	const int listen_fd = fd;
	int conn_fd;

	conn_fd = accept(listen_fd, NULL, NULL);
	if (conn_fd == -1)
		error(1, errno, "accept");

	if (close(listen_fd) != 0)
		error(1, errno, "close listen sock");

	return conn_fd;
}
166
167static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy)
168{
169	int ret, len, i, flags;
170
171	len = 0;
172	for (i = 0; i < msg->msg_iovlen; i++)
173		len += msg->msg_iov[i].iov_len;
174
175	flags = MSG_DONTWAIT;
176	if (do_zerocopy)
177		flags |= MSG_ZEROCOPY;
178
179	ret = sendmsg(fd, msg, flags);
180	if (ret == -1 && errno == EAGAIN)
181		return false;
182	if (ret == -1)
183		error(1, errno, "send");
184	if (cfg_verbose && ret != len)
185		fprintf(stderr, "send: ret=%u != %u\n", ret, len);
186
187	if (len) {
188		packets++;
189		bytes += ret;
190		if (do_zerocopy && ret)
191			expected_completions++;
192	}
193
194	return true;
195}
196
197static void do_sendmsg_corked(int fd, struct msghdr *msg)
198{
199	bool do_zerocopy = cfg_zerocopy;
200	int i, payload_len, extra_len;
201
202	/* split up the packet. for non-multiple, make first buffer longer */
203	payload_len = cfg_payload_len / cfg_cork;
204	extra_len = cfg_payload_len - (cfg_cork * payload_len);
205
206	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
207
208	for (i = 0; i < cfg_cork; i++) {
209
210		/* in mixed-frags mode, alternate zerocopy and copy frags
211		 * start with non-zerocopy, to ensure attach later works
212		 */
213		if (cfg_cork_mixed)
214			do_zerocopy = (i & 1);
215
216		msg->msg_iov[0].iov_len = payload_len + extra_len;
217		extra_len = 0;
218
219		do_sendmsg(fd, msg, do_zerocopy);
220	}
221
222	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
223}
224
225static int setup_iph(struct iphdr *iph, uint16_t payload_len)
226{
227	struct sockaddr_in *daddr = (void *) &cfg_dst_addr;
228	struct sockaddr_in *saddr = (void *) &cfg_src_addr;
229
230	memset(iph, 0, sizeof(*iph));
231
232	iph->version	= 4;
233	iph->tos	= 0;
234	iph->ihl	= 5;
235	iph->ttl	= 2;
236	iph->saddr	= saddr->sin_addr.s_addr;
237	iph->daddr	= daddr->sin_addr.s_addr;
238	iph->protocol	= IPPROTO_EGP;
239	iph->tot_len	= htons(sizeof(*iph) + payload_len);
240	iph->check	= get_ip_csum((void *) iph, iph->ihl << 1);
241
242	return sizeof(*iph);
243}
244
245static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len)
246{
247	struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr;
248	struct sockaddr_in6 *saddr = (void *) &cfg_src_addr;
249
250	memset(ip6h, 0, sizeof(*ip6h));
251
252	ip6h->version		= 6;
253	ip6h->payload_len	= htons(payload_len);
254	ip6h->nexthdr		= IPPROTO_EGP;
255	ip6h->hop_limit		= 2;
256	ip6h->saddr		= saddr->sin6_addr;
257	ip6h->daddr		= daddr->sin6_addr;
258
259	return sizeof(*ip6h);
260}
261
262static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr)
263{
264	struct sockaddr_in6 *addr6 = (void *) sockaddr;
265	struct sockaddr_in *addr4 = (void *) sockaddr;
266
267	switch (domain) {
268	case PF_INET:
269		addr4->sin_family = AF_INET;
270		addr4->sin_port = htons(cfg_port);
271		if (inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
272			error(1, 0, "ipv4 parse error: %s", str_addr);
273		break;
274	case PF_INET6:
275		addr6->sin6_family = AF_INET6;
276		addr6->sin6_port = htons(cfg_port);
277		if (inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
278			error(1, 0, "ipv6 parse error: %s", str_addr);
279		break;
280	default:
281		error(1, 0, "illegal domain");
282	}
283}
284
285static int do_setup_tx(int domain, int type, int protocol)
286{
287	int fd;
288
289	fd = socket(domain, type, protocol);
290	if (fd == -1)
291		error(1, errno, "socket t");
292
293	do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
294	if (cfg_zerocopy)
295		do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
296
297	if (domain != PF_PACKET)
298		if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
299			error(1, errno, "connect");
300
301	return fd;
302}
303
304static bool do_recv_completion(int fd)
305{
306	struct sock_extended_err *serr;
307	struct msghdr msg = {};
308	struct cmsghdr *cm;
309	uint32_t hi, lo, range;
310	int ret, zerocopy;
311	char control[100];
312
313	msg.msg_control = control;
314	msg.msg_controllen = sizeof(control);
315
316	ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
317	if (ret == -1 && errno == EAGAIN)
318		return false;
319	if (ret == -1)
320		error(1, errno, "recvmsg notification");
321	if (msg.msg_flags & MSG_CTRUNC)
322		error(1, errno, "recvmsg notification: truncated");
323
324	cm = CMSG_FIRSTHDR(&msg);
325	if (!cm)
326		error(1, 0, "cmsg: no cmsg");
327	if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
328	      (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) ||
329	      (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP)))
330		error(1, 0, "serr: wrong type: %d.%d",
331		      cm->cmsg_level, cm->cmsg_type);
332
333	serr = (void *) CMSG_DATA(cm);
334	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
335		error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
336	if (serr->ee_errno != 0)
337		error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
338
339	hi = serr->ee_data;
340	lo = serr->ee_info;
341	range = hi - lo + 1;
342
343	/* Detect notification gaps. These should not happen often, if at all.
344	 * Gaps can occur due to drops, reordering and retransmissions.
345	 */
346	if (lo != next_completion)
347		fprintf(stderr, "gap: %u..%u does not append to %u\n",
348			lo, hi, next_completion);
349	next_completion = hi + 1;
350
351	zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
352	if (zerocopied == -1)
353		zerocopied = zerocopy;
354	else if (zerocopied != zerocopy) {
355		fprintf(stderr, "serr: inconsistent\n");
356		zerocopied = zerocopy;
357	}
358
359	if (cfg_verbose >= 2)
360		fprintf(stderr, "completed: %u (h=%u l=%u)\n",
361			range, hi, lo);
362
363	completions += range;
364	return true;
365}
366
/* Drain the error queue: keep reading notifications until none is left */
static void do_recv_completions(int fd)
{
	bool more;

	do {
		more = do_recv_completion(fd);
	} while (more);
}
372
373/* Wait for all remaining completions on the errqueue */
374static void do_recv_remaining_completions(int fd)
375{
376	int64_t tstop = gettimeofday_ms() + cfg_waittime_ms;
377
378	while (completions < expected_completions &&
379	       gettimeofday_ms() < tstop) {
380		if (do_poll(fd, POLLERR))
381			do_recv_completions(fd);
382	}
383
384	if (completions < expected_completions)
385		fprintf(stderr, "missing notifications: %lu < %lu\n",
386			completions, expected_completions);
387}
388
389static void do_tx(int domain, int type, int protocol)
390{
391	struct iovec iov[3] = { {0} };
392	struct sockaddr_ll laddr;
393	struct msghdr msg = {0};
394	struct ethhdr eth;
395	union {
396		struct ipv6hdr ip6h;
397		struct iphdr iph;
398	} nh;
399	uint64_t tstop;
400	int fd;
401
402	fd = do_setup_tx(domain, type, protocol);
403
404	if (domain == PF_PACKET) {
405		uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6;
406
407		/* sock_raw passes ll header as data */
408		if (type == SOCK_RAW) {
409			memset(eth.h_dest, 0x06, ETH_ALEN);
410			memset(eth.h_source, 0x02, ETH_ALEN);
411			eth.h_proto = htons(proto);
412			iov[0].iov_base = &eth;
413			iov[0].iov_len = sizeof(eth);
414			msg.msg_iovlen++;
415		}
416
417		/* both sock_raw and sock_dgram expect name */
418		memset(&laddr, 0, sizeof(laddr));
419		laddr.sll_family	= AF_PACKET;
420		laddr.sll_ifindex	= cfg_ifindex;
421		laddr.sll_protocol	= htons(proto);
422		laddr.sll_halen		= ETH_ALEN;
423
424		memset(laddr.sll_addr, 0x06, ETH_ALEN);
425
426		msg.msg_name		= &laddr;
427		msg.msg_namelen		= sizeof(laddr);
428	}
429
430	/* packet and raw sockets with hdrincl must pass network header */
431	if (domain == PF_PACKET || protocol == IPPROTO_RAW) {
432		if (cfg_family == PF_INET)
433			iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len);
434		else
435			iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len);
436
437		iov[1].iov_base = (void *) &nh;
438		msg.msg_iovlen++;
439	}
440
441	iov[2].iov_base = payload;
442	iov[2].iov_len = cfg_payload_len;
443	msg.msg_iovlen++;
444	msg.msg_iov = &iov[3 - msg.msg_iovlen];
445
446	tstop = gettimeofday_ms() + cfg_runtime_ms;
447	do {
448		if (cfg_cork)
449			do_sendmsg_corked(fd, &msg);
450		else
451			do_sendmsg(fd, &msg, cfg_zerocopy);
452
453		while (!do_poll(fd, POLLOUT)) {
454			if (cfg_zerocopy)
455				do_recv_completions(fd);
456		}
457
458	} while (gettimeofday_ms() < tstop);
459
460	if (cfg_zerocopy)
461		do_recv_remaining_completions(fd);
462
463	if (close(fd))
464		error(1, errno, "close");
465
466	fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
467		packets, bytes >> 20, completions,
468		zerocopied == 1 ? 'y' : 'n');
469}
470
471static int do_setup_rx(int domain, int type, int protocol)
472{
473	int fd;
474
475	/* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
476	 * to recv the only copy of the packet, not a clone
477	 */
478	if (domain == PF_PACKET)
479		error(1, 0, "Use PF_INET/SOCK_RAW to read");
480
481	if (type == SOCK_RAW && protocol == IPPROTO_RAW)
482		error(1, 0, "IPPROTO_RAW: not supported on Rx");
483
484	fd = socket(domain, type, protocol);
485	if (fd == -1)
486		error(1, errno, "socket r");
487
488	do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21);
489	do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16);
490	do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
491
492	if (bind(fd, (void *) &cfg_dst_addr, cfg_alen))
493		error(1, errno, "bind");
494
495	if (type == SOCK_STREAM) {
496		if (listen(fd, 1))
497			error(1, errno, "listen");
498		fd = do_accept(fd);
499	}
500
501	return fd;
502}
503
504/* Flush all outstanding bytes for the tcp receive queue */
505static void do_flush_tcp(int fd)
506{
507	int ret;
508
509	/* MSG_TRUNC flushes up to len bytes */
510	ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
511	if (ret == -1 && errno == EAGAIN)
512		return;
513	if (ret == -1)
514		error(1, errno, "flush");
515	if (!ret)
516		return;
517
518	packets++;
519	bytes += ret;
520}
521
522/* Flush all outstanding datagrams. Verify first few bytes of each. */
523static void do_flush_datagram(int fd, int type)
524{
525	int ret, off = 0;
526	char buf[64];
527
528	/* MSG_TRUNC will return full datagram length */
529	ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC);
530	if (ret == -1 && errno == EAGAIN)
531		return;
532
533	/* raw ipv4 return with header, raw ipv6 without */
534	if (cfg_family == PF_INET && type == SOCK_RAW) {
535		off += sizeof(struct iphdr);
536		ret -= sizeof(struct iphdr);
537	}
538
539	if (ret == -1)
540		error(1, errno, "recv");
541	if (ret != cfg_payload_len)
542		error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
543	if (ret > sizeof(buf) - off)
544		ret = sizeof(buf) - off;
545	if (memcmp(buf + off, payload, ret))
546		error(1, 0, "recv: data mismatch");
547
548	packets++;
549	bytes += cfg_payload_len;
550}
551
552static void do_rx(int domain, int type, int protocol)
553{
554	uint64_t tstop;
555	int fd;
556
557	fd = do_setup_rx(domain, type, protocol);
558
559	tstop = gettimeofday_ms() + cfg_runtime_ms;
560	do {
561		if (type == SOCK_STREAM)
562			do_flush_tcp(fd);
563		else
564			do_flush_datagram(fd, type);
565
566		do_poll(fd, POLLIN);
567
568	} while (gettimeofday_ms() < tstop);
569
570	if (close(fd))
571		error(1, errno, "close");
572
573	fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20);
574}
575
576static void do_test(int domain, int type, int protocol)
577{
578	int i;
579
580	if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM))
581		error(1, 0, "can only cork udp sockets");
582
583	do_setcpu(cfg_cpu);
584
585	for (i = 0; i < IP_MAXPACKET; i++)
586		payload[i] = 'a' + (i % 26);
587
588	if (cfg_rx)
589		do_rx(domain, type, protocol);
590	else
591		do_tx(domain, type, protocol);
592}
593
/* Print the usage line for program @filepath and exit with status 1 */
static void usage(const char *filepath)
{
	error(EXIT_FAILURE, 0,
	      "Usage: %s [options] <"
	      "test>", filepath);
}
598
599static void parse_opts(int argc, char **argv)
600{
601	const int max_payload_len = sizeof(payload) -
602				    sizeof(struct ipv6hdr) -
603				    sizeof(struct tcphdr) -
604				    40 /* max tcp options */;
605	int c;
606
607	cfg_payload_len = max_payload_len;
608
609	while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
610		switch (c) {
611		case '4':
612			if (cfg_family != PF_UNSPEC)
613				error(1, 0, "Pass one of -4 or -6");
614			cfg_family = PF_INET;
615			cfg_alen = sizeof(struct sockaddr_in);
616			break;
617		case '6':
618			if (cfg_family != PF_UNSPEC)
619				error(1, 0, "Pass one of -4 or -6");
620			cfg_family = PF_INET6;
621			cfg_alen = sizeof(struct sockaddr_in6);
622			break;
623		case 'c':
624			cfg_cork = strtol(optarg, NULL, 0);
625			break;
626		case 'C':
627			cfg_cpu = strtol(optarg, NULL, 0);
628			break;
629		case 'D':
630			setup_sockaddr(cfg_family, optarg, &cfg_dst_addr);
631			break;
632		case 'i':
633			cfg_ifindex = if_nametoindex(optarg);
634			if (cfg_ifindex == 0)
635				error(1, errno, "invalid iface: %s", optarg);
636			break;
637		case 'm':
638			cfg_cork_mixed = true;
639			break;
640		case 'p':
641			cfg_port = htons(strtoul(optarg, NULL, 0));
642			break;
643		case 'r':
644			cfg_rx = true;
645			break;
646		case 's':
647			cfg_payload_len = strtoul(optarg, NULL, 0);
648			break;
649		case 'S':
650			setup_sockaddr(cfg_family, optarg, &cfg_src_addr);
651			break;
652		case 't':
653			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
654			break;
655		case 'v':
656			cfg_verbose++;
657			break;
658		case 'z':
659			cfg_zerocopy = true;
660			break;
661		}
662	}
663
664	if (cfg_payload_len > max_payload_len)
665		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
666	if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork))
667		error(1, 0, "-m: cork_mixed requires corking and zerocopy");
668
669	if (optind != argc - 1)
670		usage(argv[0]);
671}
672
673int main(int argc, char **argv)
674{
675	const char *cfg_test;
676
677	parse_opts(argc, argv);
678
679	cfg_test = argv[argc - 1];
680
681	if (!strcmp(cfg_test, "packet"))
682		do_test(PF_PACKET, SOCK_RAW, 0);
683	else if (!strcmp(cfg_test, "packet_dgram"))
684		do_test(PF_PACKET, SOCK_DGRAM, 0);
685	else if (!strcmp(cfg_test, "raw"))
686		do_test(cfg_family, SOCK_RAW, IPPROTO_EGP);
687	else if (!strcmp(cfg_test, "raw_hdrincl"))
688		do_test(cfg_family, SOCK_RAW, IPPROTO_RAW);
689	else if (!strcmp(cfg_test, "tcp"))
690		do_test(cfg_family, SOCK_STREAM, 0);
691	else if (!strcmp(cfg_test, "udp"))
692		do_test(cfg_family, SOCK_DGRAM, 0);
693	else
694		error(1, 0, "unknown cfg_test %s", cfg_test);
695
696	return 0;
697}
698