tc_bpf.c revision fd7f9c7fd11fa926bda2edc8bc492e7515753a32
1/*
2 * tc_bpf.c	BPF common code
3 *
4 *		This program is free software; you can distribute it and/or
5 *		modify it under the terms of the GNU General Public License
6 *		as published by the Free Software Foundation; either version
7 *		2 of the License, or (at your option) any later version.
8 *
9 * Authors:	Daniel Borkmann <dborkman@redhat.com>
10 *		Jiri Pirko <jiri@resnulli.us>
11 *		Alexei Starovoitov <ast@plumgrid.com>
12 */
13
14#include <stdio.h>
15#include <stdlib.h>
16#include <unistd.h>
17#include <string.h>
18#include <stdbool.h>
19#include <stdint.h>
20#include <errno.h>
21#include <fcntl.h>
22#include <stdarg.h>
23
24#ifdef HAVE_ELF
25#include <libelf.h>
26#include <gelf.h>
27#endif
28
29#include <sys/types.h>
30#include <sys/stat.h>
31#include <sys/un.h>
32#include <sys/vfs.h>
33#include <sys/mount.h>
34#include <sys/syscall.h>
35#include <sys/sendfile.h>
36#include <sys/resource.h>
37
38#include <linux/bpf.h>
39#include <linux/filter.h>
40#include <linux/if_alg.h>
41
42#include "utils.h"
43
44#include "bpf_elf.h"
45#include "bpf_scm.h"
46
47#include "tc_util.h"
48#include "tc_bpf.h"
49
50#ifdef HAVE_ELF
51static int bpf_obj_open(const char *path, enum bpf_prog_type type,
52			const char *sec, bool verbose);
53#else
54static int bpf_obj_open(const char *path, enum bpf_prog_type type,
55			const char *sec, bool verbose)
56{
57	fprintf(stderr, "No ELF library support compiled in.\n");
58	errno = ENOSYS;
59	return -1;
60}
61#endif
62
63static inline __u64 bpf_ptr_to_u64(const void *ptr)
64{
65	return (__u64)(unsigned long)ptr;
66}
67
68static int bpf(int cmd, union bpf_attr *attr, unsigned int size)
69{
70#ifdef __NR_bpf
71	return syscall(__NR_bpf, cmd, attr, size);
72#else
73	fprintf(stderr, "No bpf syscall, kernel headers too old?\n");
74	errno = ENOSYS;
75	return -1;
76#endif
77}
78
79static int bpf_map_update(int fd, const void *key, const void *value,
80			  uint64_t flags)
81{
82	union bpf_attr attr = {
83		.map_fd		= fd,
84		.key		= bpf_ptr_to_u64(key),
85		.value		= bpf_ptr_to_u64(value),
86		.flags		= flags,
87	};
88
89	return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
90}
91
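/* Split a classic BPF program string, given either inline or read from a
 * file, into the instruction count announced up front and the opcode list
 * following the first separator. The caller later checks that both match.
 */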
92static int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len,
93			    char **bpf_string, bool *need_release,
94			    const char separator)
95{
96	char sp;
97
98	if (from_file) {
99		size_t tmp_len, op_len = sizeof("65535 255 255 4294967295,");
100		char *tmp_string;
101		FILE *fp;
102
103		tmp_len = sizeof("4096,") + BPF_MAXINSNS * op_len;
104		tmp_string = malloc(tmp_len);
105		if (tmp_string == NULL)
106			return -ENOMEM;
107
108		memset(tmp_string, 0, tmp_len);
109
110		fp = fopen(arg, "r");
111		if (fp == NULL) {
112			perror("Cannot fopen");
113			free(tmp_string);
114			return -ENOENT;
115		}
116
117		if (!fgets(tmp_string, tmp_len, fp)) {
118			free(tmp_string);
119			fclose(fp);
120			return -EIO;
121		}
122
123		fclose(fp);
124
125		*need_release = true;
126		*bpf_string = tmp_string;
127	} else {
128		*need_release = false;
129		*bpf_string = arg;
130	}
131
132	if (sscanf(*bpf_string, "%hu%c", bpf_len, &sp) != 2 ||
133	    sp != separator) {
134		if (*need_release)
135			free(*bpf_string);
136		return -EINVAL;
137	}
138
139	return 0;
140}
141
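/* Parse classic BPF bytecode of the form "<count>,<code> <jt> <jf> <k>,..."
 * into bpf_ops and return the number of instructions. An illustrative
 * filter accepting only IPv4 could look like:
 *
 *   4,40 0 0 12,21 0 1 2048,6 0 0 65535,6 0 0 0
 */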
142static int bpf_ops_parse(int argc, char **argv, struct sock_filter *bpf_ops,
143			 bool from_file)
144{
145	char *bpf_string, *token, separator = ',';
146	int ret = 0, i = 0;
147	bool need_release;
148	__u16 bpf_len = 0;
149
150	if (argc < 1)
151		return -EINVAL;
152	if (bpf_parse_string(argv[0], from_file, &bpf_len, &bpf_string,
153			     &need_release, separator))
154		return -EINVAL;
155	if (bpf_len == 0 || bpf_len > BPF_MAXINSNS) {
156		ret = -EINVAL;
157		goto out;
158	}
159
160	token = bpf_string;
161	while ((token = strchr(token, separator)) && (++token)[0]) {
162		if (i >= bpf_len) {
163			fprintf(stderr, "Real program length exceeds encoded "
164				"length parameter!\n");
165			ret = -EINVAL;
166			goto out;
167		}
168
169		if (sscanf(token, "%hu %hhu %hhu %u,",
170			   &bpf_ops[i].code, &bpf_ops[i].jt,
171			   &bpf_ops[i].jf, &bpf_ops[i].k) != 4) {
172			fprintf(stderr, "Error at instruction %d!\n", i);
173			ret = -EINVAL;
174			goto out;
175		}
176
177		i++;
178	}
179
180	if (i != bpf_len) {
181		fprintf(stderr, "Parsed program length is less than encoded "
182			"length parameter!\n");
183		ret = -EINVAL;
184		goto out;
185	}
186	ret = bpf_len;
187out:
188	if (need_release)
189		free(bpf_string);
190
191	return ret;
192}
193
194void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len)
195{
196	struct sock_filter *ops = (struct sock_filter *) RTA_DATA(bpf_ops);
197	int i;
198
199	if (len == 0)
200		return;
201
202	fprintf(f, "bytecode \'%u,", len);
203
204	for (i = 0; i < len - 1; i++)
205		fprintf(f, "%hu %hhu %hhu %u,", ops[i].code, ops[i].jt,
206			ops[i].jf, ops[i].k);
207
208	fprintf(f, "%hu %hhu %hhu %u\'", ops[i].code, ops[i].jt,
209		ops[i].jf, ops[i].k);
210}
211
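/* Sanity-check a pinned map fd against the expected ELF map spec by parsing
 * /proc/<pid>/fdinfo/<fd>. 'length' limits how many leading bytes of struct
 * bpf_elf_map take part in the comparison. Kernels without eBPF-related
 * fdinfo are accepted as-is, see below.
 */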
212static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map,
213				    int length)
214{
215	char file[PATH_MAX], buff[4096];
216	struct bpf_elf_map tmp, zero;
217	unsigned int val;
218	FILE *fp;
219
220	snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd);
221
222	fp = fopen(file, "r");
223	if (!fp) {
224		fprintf(stderr, "No procfs support?!\n");
225		return -EIO;
226	}
227
228	memset(&tmp, 0, sizeof(tmp));
229	while (fgets(buff, sizeof(buff), fp)) {
230		if (sscanf(buff, "map_type:\t%u", &val) == 1)
231			tmp.type = val;
232		else if (sscanf(buff, "key_size:\t%u", &val) == 1)
233			tmp.size_key = val;
234		else if (sscanf(buff, "value_size:\t%u", &val) == 1)
235			tmp.size_value = val;
236		else if (sscanf(buff, "max_entries:\t%u", &val) == 1)
237			tmp.max_elem = val;
238	}
239
240	fclose(fp);
241
242	if (!memcmp(&tmp, map, length)) {
243		return 0;
244	} else {
245		memset(&zero, 0, sizeof(zero));
246		/* If the kernel doesn't have eBPF-related fdinfo, we cannot do
247		 * much, so just accept it. We know we do have an eBPF fd and in
248		 * this case everything is 0. It is guaranteed that no such map
249		 * exists, since a map type of 0 is unloadable BPF_MAP_TYPE_UNSPEC.
250		 */
251		if (!memcmp(&tmp, &zero, length))
252			return 0;
253
254		fprintf(stderr, "Map specs from pinned file differ!\n");
255		return -EINVAL;
256	}
257}
258
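/* Mount a bpf filesystem instance at 'target'. The mount point is first
 * switched to private propagation; if that fails with EINVAL, it is
 * bind-mounted onto itself once and the "mount --make-private" step is
 * retried.
 */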
259static int bpf_mnt_fs(const char *target)
260{
261	bool bind_done = false;
262
263	while (mount("", target, "none", MS_PRIVATE | MS_REC, NULL)) {
264		if (errno != EINVAL || bind_done) {
265			fprintf(stderr, "mount --make-private %s failed: %s\n",
266				target,	strerror(errno));
267			return -1;
268		}
269
270		if (mount(target, target, "none", MS_BIND, NULL)) {
271			fprintf(stderr, "mount --bind %s %s failed: %s\n",
272				target,	target, strerror(errno));
273			return -1;
274		}
275
276		bind_done = true;
277	}
278
279	if (mount("bpf", target, "bpf", 0, NULL)) {
280		fprintf(stderr, "mount -t bpf bpf %s failed: %s\n",
281			target,	strerror(errno));
282		return -1;
283	}
284
285	return 0;
286}
287
288static int bpf_valid_mntpt(const char *mnt, unsigned long magic)
289{
290	struct statfs st_fs;
291
292	if (statfs(mnt, &st_fs) < 0)
293		return -ENOENT;
294	if ((unsigned long)st_fs.f_type != magic)
295		return -ENOENT;
296
297	return 0;
298}
299
300static const char *bpf_find_mntpt(const char *fstype, unsigned long magic,
301				  char *mnt, int len,
302				  const char * const *known_mnts)
303{
304	const char * const *ptr;
305	char type[100];
306	FILE *fp;
307
308	if (known_mnts) {
309		ptr = known_mnts;
310		while (*ptr) {
311			if (bpf_valid_mntpt(*ptr, magic) == 0) {
312				strncpy(mnt, *ptr, len - 1);
313				mnt[len - 1] = 0;
314				return mnt;
315			}
316			ptr++;
317		}
318	}
319
320	fp = fopen("/proc/mounts", "r");
321	if (fp == NULL || len != PATH_MAX)
322		return NULL;
323
324	while (fscanf(fp, "%*s %" textify(PATH_MAX) "s %99s %*s %*d %*d\n",
325		      mnt, type) == 2) {
326		if (strcmp(type, fstype) == 0)
327			break;
328	}
329
330	fclose(fp);
331	if (strcmp(type, fstype) != 0)
332		return NULL;
333
334	return mnt;
335}
336
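/* Locate tracefs among a few well-known mount points and stream its
 * trace_pipe to stderr, e.g. to follow bpf_trace_printk() output from a
 * loaded program. Runs until interrupted.
 */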
337int bpf_trace_pipe(void)
338{
339	char tracefs_mnt[PATH_MAX] = TRACE_DIR_MNT;
340	static const char * const tracefs_known_mnts[] = {
341		TRACE_DIR_MNT,
342		"/sys/kernel/debug/tracing",
343		"/tracing",
344		"/trace",
345		0,
346	};
347	char tpipe[PATH_MAX];
348	const char *mnt;
349	int fd;
350
351	mnt = bpf_find_mntpt("tracefs", TRACEFS_MAGIC, tracefs_mnt,
352			     sizeof(tracefs_mnt), tracefs_known_mnts);
353	if (!mnt) {
354		fprintf(stderr, "tracefs not mounted?\n");
355		return -1;
356	}
357
358	snprintf(tpipe, sizeof(tpipe), "%s/trace_pipe", mnt);
359
360	fd = open(tpipe, O_RDONLY);
361	if (fd < 0)
362		return -1;
363
364	fprintf(stderr, "Running! Hang up with ^C!\n\n");
365	while (1) {
366		static char buff[4096];
367		ssize_t ret;
368
369		ret = read(fd, buff, sizeof(buff) - 1);
370		if (ret > 0) {
371			write(2, buff, ret);
372			fflush(stderr);
373		}
374	}
375
376	return 0;
377}
378
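/* Return (and cache) tc's working directory on the bpf filesystem. If no
 * bpf fs is found, one gets mounted at the location named by the BPF_ENV_MNT
 * environment variable, falling back to BPF_DIR_MNT, and the BPF_DIR_TC plus
 * BPF_DIR_GLOBALS subdirectories are created on first use.
 */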
379static const char *bpf_get_tc_dir(void)
380{
381	static bool bpf_mnt_cached = false;
382	static char bpf_tc_dir[PATH_MAX];
383	static const char *mnt;
384	static const char * const bpf_known_mnts[] = {
385		BPF_DIR_MNT,
386		0,
387	};
388	char bpf_mnt[PATH_MAX] = BPF_DIR_MNT;
389	char bpf_glo_dir[PATH_MAX];
390	int ret;
391
392	if (bpf_mnt_cached)
393		goto done;
394
395	mnt = bpf_find_mntpt("bpf", BPF_FS_MAGIC, bpf_mnt, sizeof(bpf_mnt),
396			     bpf_known_mnts);
397	if (!mnt) {
398		mnt = getenv(BPF_ENV_MNT);
399		if (!mnt)
400			mnt = BPF_DIR_MNT;
401		ret = bpf_mnt_fs(mnt);
402		if (ret) {
403			mnt = NULL;
404			goto out;
405		}
406	}
407
408	snprintf(bpf_tc_dir, sizeof(bpf_tc_dir), "%s/%s", mnt, BPF_DIR_TC);
409	ret = mkdir(bpf_tc_dir, S_IRWXU);
410	if (ret && errno != EEXIST) {
411		fprintf(stderr, "mkdir %s failed: %s\n", bpf_tc_dir,
412			strerror(errno));
413		mnt = NULL;
414		goto out;
415	}
416
417	snprintf(bpf_glo_dir, sizeof(bpf_glo_dir), "%s/%s",
418		 bpf_tc_dir, BPF_DIR_GLOBALS);
419	ret = mkdir(bpf_glo_dir, S_IRWXU);
420	if (ret && errno != EEXIST) {
421		fprintf(stderr, "mkdir %s failed: %s\n", bpf_glo_dir,
422			strerror(errno));
423		mnt = NULL;
424		goto out;
425	}
426
427	mnt = bpf_tc_dir;
428out:
429	bpf_mnt_cached = true;
430done:
431	return mnt;
432}
433
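/* Retrieve an fd for a pinned object via BPF_OBJ_GET. Paths of the form
 * "m:<name>" are taken relative to tc's directory on the bpf filesystem.
 */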
434static int bpf_obj_get(const char *pathname)
435{
436	union bpf_attr attr;
437	char tmp[PATH_MAX];
438
439	if (strlen(pathname) > 2 && pathname[0] == 'm' &&
440	    pathname[1] == ':' && bpf_get_tc_dir()) {
441		snprintf(tmp, sizeof(tmp), "%s/%s",
442			 bpf_get_tc_dir(), pathname + 2);
443		pathname = tmp;
444	}
445
446	memset(&attr, 0, sizeof(attr));
447	attr.pathname = bpf_ptr_to_u64(pathname);
448
449	return bpf(BPF_OBJ_GET, &attr, sizeof(attr));
450}
451
452const char *bpf_default_section(const enum bpf_prog_type type)
453{
454	switch (type) {
455	case BPF_PROG_TYPE_SCHED_CLS:
456		return ELF_SECTION_CLASSIFIER;
457	case BPF_PROG_TYPE_SCHED_ACT:
458		return ELF_SECTION_ACTION;
459	default:
460		return NULL;
461	}
462}
463
464enum bpf_mode {
465	CBPF_BYTECODE = 0,
466	CBPF_FILE,
467	EBPF_OBJECT,
468	EBPF_PINNED,
469	__BPF_MODE_MAX,
470#define BPF_MODE_MAX	__BPF_MODE_MAX
471};
472
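/* Common command line front end for classifier and action. The grammar
 * handled here is roughly:
 *
 *   { bytecode | bc } BPF_OPS |
 *   { bytecode-file | bcf } FILE |
 *   { object-file | obj } FILE [ type { cls | act } ] [ section NAME ]
 *                              [ export UDS_FILE ] [ verbose ] |
 *   { object-pinned | pinned | fd } PATH
 *
 * On success the return value is either the number of parsed classic BPF
 * instructions or an eBPF program fd, depending on the selected mode.
 */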
473static int bpf_parse(int *ptr_argc, char ***ptr_argv, const bool *opt_tbl,
474		     enum bpf_prog_type *type, enum bpf_mode *mode,
475		     const char **ptr_object, const char **ptr_section,
476		     const char **ptr_uds_name, struct sock_filter *opcodes)
477{
478	const char *file, *section, *uds_name;
479	bool verbose = false;
480	int ret, argc;
481	char **argv;
482
483	argv = *ptr_argv;
484	argc = *ptr_argc;
485
486	if (opt_tbl[CBPF_BYTECODE] &&
487	    (matches(*argv, "bytecode") == 0 ||
488	     strcmp(*argv, "bc") == 0)) {
489		*mode = CBPF_BYTECODE;
490	} else if (opt_tbl[CBPF_FILE] &&
491		   (matches(*argv, "bytecode-file") == 0 ||
492		    strcmp(*argv, "bcf") == 0)) {
493		*mode = CBPF_FILE;
494	} else if (opt_tbl[EBPF_OBJECT] &&
495		   (matches(*argv, "object-file") == 0 ||
496		    strcmp(*argv, "obj") == 0)) {
497		*mode = EBPF_OBJECT;
498	} else if (opt_tbl[EBPF_PINNED] &&
499		   (matches(*argv, "object-pinned") == 0 ||
500		    matches(*argv, "pinned") == 0 ||
501		    matches(*argv, "fd") == 0)) {
502		*mode = EBPF_PINNED;
503	} else {
504		fprintf(stderr, "What mode is \"%s\"?\n", *argv);
505		return -1;
506	}
507
508	NEXT_ARG();
509	file = section = uds_name = NULL;
510	if (*mode == EBPF_OBJECT || *mode == EBPF_PINNED) {
511		file = *argv;
512		NEXT_ARG_FWD();
513
514		if (*type == BPF_PROG_TYPE_UNSPEC) {
515			if (argc > 0 && matches(*argv, "type") == 0) {
516				NEXT_ARG();
517				if (matches(*argv, "cls") == 0) {
518					*type = BPF_PROG_TYPE_SCHED_CLS;
519				} else if (matches(*argv, "act") == 0) {
520					*type = BPF_PROG_TYPE_SCHED_ACT;
521				} else {
522					fprintf(stderr, "What type is \"%s\"?\n",
523						*argv);
524					return -1;
525				}
526				NEXT_ARG_FWD();
527			} else {
528				*type = BPF_PROG_TYPE_SCHED_CLS;
529			}
530		}
531
532		section = bpf_default_section(*type);
533		if (argc > 0 && matches(*argv, "section") == 0) {
534			NEXT_ARG();
535			section = *argv;
536			NEXT_ARG_FWD();
537		}
538
539		uds_name = getenv(BPF_ENV_UDS);
540		if (argc > 0 && !uds_name &&
541		    matches(*argv, "export") == 0) {
542			NEXT_ARG();
543			uds_name = *argv;
544			NEXT_ARG_FWD();
545		}
546
547		if (argc > 0 && matches(*argv, "verbose") == 0) {
548			verbose = true;
549			NEXT_ARG_FWD();
550		}
551
552		PREV_ARG();
553	}
554
555	if (*mode == CBPF_BYTECODE || *mode == CBPF_FILE)
556		ret = bpf_ops_parse(argc, argv, opcodes, *mode == CBPF_FILE);
557	else if (*mode == EBPF_OBJECT)
558		ret = bpf_obj_open(file, *type, section, verbose);
559	else if (*mode == EBPF_PINNED)
560		ret = bpf_obj_get(file);
561	else
562		return -1;
563
564	if (ptr_object)
565		*ptr_object = file;
566	if (ptr_section)
567		*ptr_section = section;
568	if (ptr_uds_name)
569		*ptr_uds_name = uds_name;
570
571	*ptr_argc = argc;
572	*ptr_argv = argv;
573
574	return ret;
575}
576
577int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl,
578		     enum bpf_prog_type type, const char **ptr_object,
579		     const char **ptr_uds_name, struct nlmsghdr *n)
580{
581	struct sock_filter opcodes[BPF_MAXINSNS];
582	const bool opt_tbl[BPF_MODE_MAX] = {
583		[CBPF_BYTECODE]	= true,
584		[CBPF_FILE]	= true,
585		[EBPF_OBJECT]	= true,
586		[EBPF_PINNED]	= true,
587	};
588	char annotation[256];
589	const char *section;
590	enum bpf_mode mode;
591	int ret;
592
593	ret = bpf_parse(ptr_argc, ptr_argv, opt_tbl, &type, &mode,
594			ptr_object, &section, ptr_uds_name, opcodes);
595	if (ret < 0)
596		return ret;
597
598	if (mode == CBPF_BYTECODE || mode == CBPF_FILE) {
599		addattr16(n, MAX_MSG, nla_tbl[BPF_NLA_OPS_LEN], ret);
600		addattr_l(n, MAX_MSG, nla_tbl[BPF_NLA_OPS], opcodes,
601			  ret * sizeof(struct sock_filter));
602	}
603
604	if (mode == EBPF_OBJECT || mode == EBPF_PINNED) {
605		snprintf(annotation, sizeof(annotation), "%s:[%s]",
606			 basename(*ptr_object), mode == EBPF_PINNED ?
607			 "*fsobj" : section);
608
609		addattr32(n, MAX_MSG, nla_tbl[BPF_NLA_FD], ret);
610		addattrstrz(n, MAX_MSG, nla_tbl[BPF_NLA_NAME], annotation);
611	}
612
613	return 0;
614}
615
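/* Load an eBPF program and graft its fd into a pinned BPF_MAP_TYPE_PROG_ARRAY
 * at the given key, e.g. for setting up tail calls via tc's "exec bpf graft"
 * command. If no key is passed, it is inferred from a section name of the
 * form "<map-id>/<key>".
 */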
616int bpf_graft_map(const char *map_path, uint32_t *key, int argc, char **argv)
617{
618	enum bpf_prog_type type = BPF_PROG_TYPE_UNSPEC;
619	const bool opt_tbl[BPF_MODE_MAX] = {
620		[CBPF_BYTECODE]	= false,
621		[CBPF_FILE]	= false,
622		[EBPF_OBJECT]	= true,
623		[EBPF_PINNED]	= true,
624	};
625	const struct bpf_elf_map test = {
626		.type		= BPF_MAP_TYPE_PROG_ARRAY,
627		.size_key	= sizeof(int),
628		.size_value	= sizeof(int),
629	};
630	int ret, prog_fd, map_fd;
631	const char *section;
632	enum bpf_mode mode;
633	uint32_t map_key;
634
635	prog_fd = bpf_parse(&argc, &argv, opt_tbl, &type, &mode,
636			    NULL, &section, NULL, NULL);
637	if (prog_fd < 0)
638		return prog_fd;
639	if (key) {
640		map_key = *key;
641	} else {
642		ret = sscanf(section, "%*i/%i", &map_key);
643		if (ret != 1) {
644			fprintf(stderr, "Couldn\'t infer map key from section "
645				"name! Please provide \'key\' argument!\n");
646			ret = -EINVAL;
647			goto out_prog;
648		}
649	}
650
651	map_fd = bpf_obj_get(map_path);
652	if (map_fd < 0) {
653		fprintf(stderr, "Couldn\'t retrieve pinned map \'%s\': %s\n",
654			map_path, strerror(errno));
655		ret = map_fd;
656		goto out_prog;
657	}
658
659	ret = bpf_map_selfcheck_pinned(map_fd, &test,
660				       offsetof(struct bpf_elf_map, max_elem));
661	if (ret < 0) {
662		fprintf(stderr, "Map \'%s\' self-check failed!\n", map_path);
663		goto out_map;
664	}
665
666	ret = bpf_map_update(map_fd, &map_key, &prog_fd, BPF_ANY);
667	if (ret < 0)
668		fprintf(stderr, "Map update failed: %s\n", strerror(errno));
669out_map:
670	close(map_fd);
671out_prog:
672	close(prog_fd);
673	return ret;
674}
675
676#ifdef HAVE_ELF
677struct bpf_elf_prog {
678	enum bpf_prog_type	type;
679	const struct bpf_insn	*insns;
680	size_t			size;
681	const char		*license;
682};
683
684struct bpf_hash_entry {
685	unsigned int		pinning;
686	const char		*subpath;
687	struct bpf_hash_entry	*next;
688};
689
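/* Loader state for a single ELF object: libelf handles, symbol and string
 * tables, the map specs from the maps section together with their installed
 * fds, per-section done markers, license, program type and the hash table
 * of custom pinning subpaths read from the pinning database.
 */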
690struct bpf_elf_ctx {
691	Elf			*elf_fd;
692	GElf_Ehdr		elf_hdr;
693	Elf_Data		*sym_tab;
694	Elf_Data		*str_tab;
695	int			obj_fd;
696	int			map_fds[ELF_MAX_MAPS];
697	struct bpf_elf_map	maps[ELF_MAX_MAPS];
698	int			sym_num;
699	int			map_num;
700	bool			*sec_done;
701	int			sec_maps;
702	char			license[ELF_MAX_LICENSE_LEN];
703	enum bpf_prog_type	type;
704	bool			verbose;
705	struct bpf_elf_st	stat;
706	struct bpf_hash_entry	*ht[256];
707};
708
709struct bpf_elf_sec_data {
710	GElf_Shdr		sec_hdr;
711	Elf_Data		*sec_data;
712	const char		*sec_name;
713};
714
715struct bpf_map_data {
716	int			*fds;
717	const char		*obj;
718	struct bpf_elf_st	*st;
719	struct bpf_elf_map	*ent;
720};
721
722/* If we provide a buffer that is too small while the log level is
723 * enabled, the kernel can fail the program load since no buffer space
724 * is left for the verifier log. In case something doesn't pass the
725 * verifier, we still want to hand something descriptive to the user.
726 */
727static char bpf_log_buf[65536];
728
729static __check_format_string(1, 2) void bpf_dump_error(const char *format, ...)
730{
731	va_list vl;
732
733	va_start(vl, format);
734	vfprintf(stderr, format, vl);
735	va_end(vl);
736
737	if (bpf_log_buf[0]) {
738		fprintf(stderr, "%s\n", bpf_log_buf);
739		memset(bpf_log_buf, 0, sizeof(bpf_log_buf));
740	}
741}
742
743static int bpf_map_create(enum bpf_map_type type, unsigned int size_key,
744			  unsigned int size_value, unsigned int max_elem)
745{
746	union bpf_attr attr = {
747		.map_type	= type,
748		.key_size	= size_key,
749		.value_size	= size_value,
750		.max_entries	= max_elem,
751	};
752
753	return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
754}
755
756static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
757			 size_t size, const char *license)
758{
759	union bpf_attr attr = {
760		.prog_type	= type,
761		.insns		= bpf_ptr_to_u64(insns),
762		.insn_cnt	= size / sizeof(struct bpf_insn),
763		.license	= bpf_ptr_to_u64(license),
764		.log_buf	= bpf_ptr_to_u64(bpf_log_buf),
765		.log_size	= sizeof(bpf_log_buf),
766		.log_level	= 1,
767	};
768
769	if (getenv(BPF_ENV_NOLOG)) {
770		attr.log_buf	= 0;
771		attr.log_size	= 0;
772		attr.log_level	= 0;
773	}
774
775	return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
776}
777
778static int bpf_obj_pin(int fd, const char *pathname)
779{
780	union bpf_attr attr = {
781		.pathname	= bpf_ptr_to_u64(pathname),
782		.bpf_fd		= fd,
783	};
784
785	return bpf(BPF_OBJ_PIN, &attr, sizeof(attr));
786}
787
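/* Hash the object file via the kernel crypto API (AF_ALG, sha1): the file is
 * pushed into the hash socket with sendfile() and the digest read back.
 * 'len' must be 20, the SHA-1 digest size. The result serves as a per-object
 * identifier for the PIN_OBJECT_NS pinning namespace.
 */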
788static int bpf_obj_hash(const char *object, uint8_t *out, size_t len)
789{
790	struct sockaddr_alg alg = {
791		.salg_family	= AF_ALG,
792		.salg_type	= "hash",
793		.salg_name	= "sha1",
794	};
795	int ret, cfd, ofd, ffd;
796	struct stat stbuff;
797	ssize_t size;
798
799	if (!object || len != 20)
800		return -EINVAL;
801
802	cfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
803	if (cfd < 0) {
804		fprintf(stderr, "Cannot get AF_ALG socket: %s\n",
805			strerror(errno));
806		return cfd;
807	}
808
809	ret = bind(cfd, (struct sockaddr *)&alg, sizeof(alg));
810	if (ret < 0) {
811		fprintf(stderr, "Error binding socket: %s\n", strerror(errno));
812		goto out_cfd;
813	}
814
815	ofd = accept(cfd, NULL, 0);
816	if (ofd < 0) {
817		fprintf(stderr, "Error accepting socket: %s\n",
818			strerror(errno));
819		ret = ofd;
820		goto out_cfd;
821	}
822
823	ffd = open(object, O_RDONLY);
824	if (ffd < 0) {
825		fprintf(stderr, "Error opening object %s: %s\n",
826			object, strerror(errno));
827		ret = ffd;
828		goto out_ofd;
829	}
830
831	ret = fstat(ffd, &stbuff);
832	if (ret < 0) {
833		fprintf(stderr, "Error doing fstat: %s\n",
834			strerror(errno));
835		goto out_ffd;
836	}
837
838	size = sendfile(ofd, ffd, NULL, stbuff.st_size);
839	if (size != stbuff.st_size) {
840		fprintf(stderr, "Error from sendfile (%zd vs %zu bytes): %s\n",
841			size, stbuff.st_size, strerror(errno));
842		ret = -1;
843		goto out_ffd;
844	}
845
846	size = read(ofd, out, len);
847	if (size != len) {
848		fprintf(stderr, "Error from read (%zd vs %zu bytes): %s\n",
849			size, len, strerror(errno));
850		ret = -1;
851	} else {
852		ret = 0;
853	}
854out_ffd:
855	close(ffd);
856out_ofd:
857	close(ofd);
858out_cfd:
859	close(cfd);
860	return ret;
861}
862
863static const char *bpf_get_obj_uid(const char *pathname)
864{
865	static bool bpf_uid_cached = false;
866	static char bpf_uid[64];
867	uint8_t tmp[20];
868	int ret;
869
870	if (bpf_uid_cached)
871		goto done;
872
873	ret = bpf_obj_hash(pathname, tmp, sizeof(tmp));
874	if (ret) {
875		fprintf(stderr, "Object hashing failed!\n");
876		return NULL;
877	}
878
879	hexstring_n2a(tmp, sizeof(tmp), bpf_uid, sizeof(bpf_uid));
880	bpf_uid_cached = true;
881done:
882	return bpf_uid;
883}
884
885static int bpf_init_env(const char *pathname)
886{
887	struct rlimit limit = {
888		.rlim_cur = RLIM_INFINITY,
889		.rlim_max = RLIM_INFINITY,
890	};
891
892	/* Don't bother in case we fail! */
893	setrlimit(RLIMIT_MEMLOCK, &limit);
894
895	if (!bpf_get_tc_dir()) {
896		fprintf(stderr, "Continuing without mounted eBPF fs. "
897			"Too old kernel?\n");
898		return 0;
899	}
900
901	if (!bpf_get_obj_uid(pathname))
902		return -1;
903
904	return 0;
905}
906
907static const char *bpf_custom_pinning(const struct bpf_elf_ctx *ctx,
908				      uint32_t pinning)
909{
910	struct bpf_hash_entry *entry;
911
912	entry = ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)];
913	while (entry && entry->pinning != pinning)
914		entry = entry->next;
915
916	return entry ? entry->subpath : NULL;
917}
918
919static bool bpf_no_pinning(const struct bpf_elf_ctx *ctx,
920			   uint32_t pinning)
921{
922	switch (pinning) {
923	case PIN_OBJECT_NS:
924	case PIN_GLOBAL_NS:
925		return false;
926	case PIN_NONE:
927		return true;
928	default:
929		return !bpf_custom_pinning(ctx, pinning);
930	}
931}
932
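/* Build the pin path for a map: PIN_OBJECT_NS places it under a directory
 * named after the object's SHA-1 hash, PIN_GLOBAL_NS under the shared
 * BPF_DIR_GLOBALS directory, and any other pinning id under the custom
 * subpath configured for it in the pinning database, relative to the bpf
 * fs root.
 */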
933static void bpf_make_pathname(char *pathname, size_t len, const char *name,
934			      const struct bpf_elf_ctx *ctx, uint32_t pinning)
935{
936	switch (pinning) {
937	case PIN_OBJECT_NS:
938		snprintf(pathname, len, "%s/%s/%s", bpf_get_tc_dir(),
939			 bpf_get_obj_uid(NULL), name);
940		break;
941	case PIN_GLOBAL_NS:
942		snprintf(pathname, len, "%s/%s/%s", bpf_get_tc_dir(),
943			 BPF_DIR_GLOBALS, name);
944		break;
945	default:
946		snprintf(pathname, len, "%s/../%s/%s", bpf_get_tc_dir(),
947			 bpf_custom_pinning(ctx, pinning), name);
948		break;
949	}
950}
951
952static int bpf_probe_pinned(const char *name, const struct bpf_elf_ctx *ctx,
953			    uint32_t pinning)
954{
955	char pathname[PATH_MAX];
956
957	if (bpf_no_pinning(ctx, pinning) || !bpf_get_tc_dir())
958		return 0;
959
960	bpf_make_pathname(pathname, sizeof(pathname), name, ctx, pinning);
961	return bpf_obj_get(pathname);
962}
963
964static int bpf_make_obj_path(void)
965{
966	char tmp[PATH_MAX];
967	int ret;
968
969	snprintf(tmp, sizeof(tmp), "%s/%s", bpf_get_tc_dir(),
970		 bpf_get_obj_uid(NULL));
971
972	ret = mkdir(tmp, S_IRWXU);
973	if (ret && errno != EEXIST) {
974		fprintf(stderr, "mkdir %s failed: %s\n", tmp, strerror(errno));
975		return ret;
976	}
977
978	return 0;
979}
980
981static int bpf_make_custom_path(const char *todo)
982{
983	char tmp[PATH_MAX], rem[PATH_MAX], *sub;
984	int ret;
985
986	snprintf(tmp, sizeof(tmp), "%s/../", bpf_get_tc_dir());
987	snprintf(rem, sizeof(rem), "%s/", todo);
988	sub = strtok(rem, "/");
989
990	while (sub) {
991		if (strlen(tmp) + strlen(sub) + 2 > PATH_MAX)
992			return -EINVAL;
993
994		strcat(tmp, sub);
995		strcat(tmp, "/");
996
997		ret = mkdir(tmp, S_IRWXU);
998		if (ret && errno != EEXIST) {
999			fprintf(stderr, "mkdir %s failed: %s\n", tmp,
1000				strerror(errno));
1001			return ret;
1002		}
1003
1004		sub = strtok(NULL, "/");
1005	}
1006
1007	return 0;
1008}
1009
1010static int bpf_place_pinned(int fd, const char *name,
1011			    const struct bpf_elf_ctx *ctx, uint32_t pinning)
1012{
1013	char pathname[PATH_MAX];
1014	const char *tmp;
1015	int ret = 0;
1016
1017	if (bpf_no_pinning(ctx, pinning) || !bpf_get_tc_dir())
1018		return 0;
1019
1020	if (pinning == PIN_OBJECT_NS)
1021		ret = bpf_make_obj_path();
1022	else if ((tmp = bpf_custom_pinning(ctx, pinning)))
1023		ret = bpf_make_custom_path(tmp);
1024	if (ret < 0)
1025		return ret;
1026
1027	bpf_make_pathname(pathname, sizeof(pathname), name, ctx, pinning);
1028	return bpf_obj_pin(fd, pathname);
1029}
1030
1031static int bpf_prog_attach(const char *section,
1032			   const struct bpf_elf_prog *prog, bool verbose)
1033{
1034	int fd;
1035
1036	/* We can add pinning here later as well, same as bpf_map_attach(). */
1037	errno = 0;
1038	fd = bpf_prog_load(prog->type, prog->insns, prog->size,
1039			   prog->license);
1040	if (fd < 0 || verbose) {
1041		bpf_dump_error("Prog section \'%s\' (type:%u insns:%zu "
1042			       "license:\'%s\') %s%s (%d)!\n\n",
1043			       section, prog->type,
1044			       prog->size / sizeof(struct bpf_insn),
1045			       prog->license, fd < 0 ? "rejected: " :
1046			       "loaded", fd < 0 ? strerror(errno) : "",
1047			       fd < 0 ? errno : fd);
1048	}
1049
1050	return fd;
1051}
1052
1053static int bpf_map_attach(const char *name, const struct bpf_elf_map *map,
1054			  const struct bpf_elf_ctx *ctx, bool verbose)
1055{
1056	int fd, ret;
1057
1058	fd = bpf_probe_pinned(name, ctx, map->pinning);
1059	if (fd > 0) {
1060		ret = bpf_map_selfcheck_pinned(fd, map,
1061					       offsetof(struct bpf_elf_map,
1062							id));
1063		if (ret < 0) {
1064			close(fd);
1065			fprintf(stderr, "Map \'%s\' self-check failed!\n",
1066				name);
1067			return ret;
1068		}
1069		if (verbose)
1070			fprintf(stderr, "Map \'%s\' loaded as pinned!\n",
1071				name);
1072		return fd;
1073	}
1074
1075	errno = 0;
1076	fd = bpf_map_create(map->type, map->size_key, map->size_value,
1077			    map->max_elem);
1078	if (fd < 0 || verbose) {
1079		bpf_dump_error("Map \'%s\' (type:%u id:%u pinning:%u "
1080			       "ksize:%u vsize:%u max-elems:%u) %s%s (%d)!\n",
1081			       name, map->type, map->id, map->pinning,
1082			       map->size_key, map->size_value, map->max_elem,
1083			       fd < 0 ? "rejected: " : "loaded", fd < 0 ?
1084			       strerror(errno) : "", fd < 0 ? errno : fd);
1085		if (fd < 0)
1086			return fd;
1087	}
1088
1089	ret = bpf_place_pinned(fd, name, ctx, map->pinning);
1090	if (ret < 0 && errno != EEXIST) {
1091		fprintf(stderr, "Could not pin %s map: %s\n", name,
1092			strerror(errno));
1093		close(fd);
1094		return ret;
1095	}
1096
1097	return fd;
1098}
1099
1100#define __ELF_ST_BIND(x)	((x) >> 4)
1101#define __ELF_ST_TYPE(x)	(((unsigned int) x) & 0xf)
1102
1103static const char *bpf_str_tab_name(const struct bpf_elf_ctx *ctx,
1104				    const GElf_Sym *sym)
1105{
1106	return ctx->str_tab->d_buf + sym->st_name;
1107}
1108
1109static const char *bpf_map_fetch_name(struct bpf_elf_ctx *ctx, int which)
1110{
1111	GElf_Sym sym;
1112	int i;
1113
1114	for (i = 0; i < ctx->sym_num; i++) {
1115		if (gelf_getsym(ctx->sym_tab, i, &sym) != &sym)
1116			continue;
1117
1118		if (__ELF_ST_BIND(sym.st_info) != STB_GLOBAL ||
1119		    __ELF_ST_TYPE(sym.st_info) != STT_NOTYPE ||
1120		    sym.st_shndx != ctx->sec_maps ||
1121		    sym.st_value / sizeof(struct bpf_elf_map) != which)
1122			continue;
1123
1124		return bpf_str_tab_name(ctx, &sym);
1125	}
1126
1127	return NULL;
1128}
1129
1130static int bpf_maps_attach_all(struct bpf_elf_ctx *ctx)
1131{
1132	const char *map_name;
1133	int i, fd;
1134
1135	for (i = 0; i < ctx->map_num; i++) {
1136		map_name = bpf_map_fetch_name(ctx, i);
1137		if (!map_name)
1138			return -EIO;
1139
1140		fd = bpf_map_attach(map_name, &ctx->maps[i], ctx,
1141				    ctx->verbose);
1142		if (fd < 0)
1143			return fd;
1144
1145		ctx->map_fds[i] = fd;
1146	}
1147
1148	return 0;
1149}
1150
1151static int bpf_fill_section_data(struct bpf_elf_ctx *ctx, int section,
1152				 struct bpf_elf_sec_data *data)
1153{
1154	Elf_Data *sec_edata;
1155	GElf_Shdr sec_hdr;
1156	Elf_Scn *sec_fd;
1157	char *sec_name;
1158
1159	memset(data, 0, sizeof(*data));
1160
1161	sec_fd = elf_getscn(ctx->elf_fd, section);
1162	if (!sec_fd)
1163		return -EINVAL;
1164	if (gelf_getshdr(sec_fd, &sec_hdr) != &sec_hdr)
1165		return -EIO;
1166
1167	sec_name = elf_strptr(ctx->elf_fd, ctx->elf_hdr.e_shstrndx,
1168			      sec_hdr.sh_name);
1169	if (!sec_name || !sec_hdr.sh_size)
1170		return -ENOENT;
1171
1172	sec_edata = elf_getdata(sec_fd, NULL);
1173	if (!sec_edata || elf_getdata(sec_fd, sec_edata))
1174		return -EIO;
1175
1176	memcpy(&data->sec_hdr, &sec_hdr, sizeof(sec_hdr));
1177
1178	data->sec_name = sec_name;
1179	data->sec_data = sec_edata;
1180	return 0;
1181}
1182
1183static int bpf_fetch_maps(struct bpf_elf_ctx *ctx, int section,
1184			  struct bpf_elf_sec_data *data)
1185{
1186	if (data->sec_data->d_size % sizeof(struct bpf_elf_map) != 0)
1187		return -EINVAL;
1188
1189	ctx->map_num = data->sec_data->d_size / sizeof(struct bpf_elf_map);
1190	ctx->sec_maps = section;
1191	ctx->sec_done[section] = true;
1192
1193	if (ctx->map_num > ARRAY_SIZE(ctx->map_fds)) {
1194		fprintf(stderr, "Too many BPF maps in ELF section!\n");
1195		return -ENOMEM;
1196	}
1197
1198	memcpy(ctx->maps, data->sec_data->d_buf, data->sec_data->d_size);
1199	return 0;
1200}
1201
1202static int bpf_fetch_license(struct bpf_elf_ctx *ctx, int section,
1203			     struct bpf_elf_sec_data *data)
1204{
1205	if (data->sec_data->d_size > sizeof(ctx->license))
1206		return -ENOMEM;
1207
1208	memcpy(ctx->license, data->sec_data->d_buf, data->sec_data->d_size);
1209	ctx->sec_done[section] = true;
1210	return 0;
1211}
1212
1213static int bpf_fetch_symtab(struct bpf_elf_ctx *ctx, int section,
1214			    struct bpf_elf_sec_data *data)
1215{
1216	ctx->sym_tab = data->sec_data;
1217	ctx->sym_num = data->sec_hdr.sh_size / data->sec_hdr.sh_entsize;
1218	ctx->sec_done[section] = true;
1219	return 0;
1220}
1221
1222static int bpf_fetch_strtab(struct bpf_elf_ctx *ctx, int section,
1223			    struct bpf_elf_sec_data *data)
1224{
1225	ctx->str_tab = data->sec_data;
1226	ctx->sec_done[section] = true;
1227	return 0;
1228}
1229
1230static int bpf_fetch_ancillary(struct bpf_elf_ctx *ctx)
1231{
1232	struct bpf_elf_sec_data data;
1233	int i, ret = -1;
1234
1235	for (i = 1; i < ctx->elf_hdr.e_shnum; i++) {
1236		ret = bpf_fill_section_data(ctx, i, &data);
1237		if (ret < 0)
1238			continue;
1239
1240		if (!strcmp(data.sec_name, ELF_SECTION_MAPS))
1241			ret = bpf_fetch_maps(ctx, i, &data);
1242		else if (!strcmp(data.sec_name, ELF_SECTION_LICENSE))
1243			ret = bpf_fetch_license(ctx, i, &data);
1244		else if (data.sec_hdr.sh_type == SHT_SYMTAB)
1245			ret = bpf_fetch_symtab(ctx, i, &data);
1246		else if (data.sec_hdr.sh_type == SHT_STRTAB &&
1247			 i != ctx->elf_hdr.e_shstrndx)
1248			ret = bpf_fetch_strtab(ctx, i, &data);
1249		if (ret < 0) {
1250			fprintf(stderr, "Error parsing section %d! Perhaps "
1251				"check with readelf -a?\n", i);
1252			break;
1253		}
1254	}
1255
1256	if (ctx->sym_tab && ctx->str_tab && ctx->sec_maps) {
1257		ret = bpf_maps_attach_all(ctx);
1258		if (ret < 0) {
1259			fprintf(stderr, "Error loading maps into kernel!\n");
1260			return ret;
1261		}
1262	}
1263
1264	return ret;
1265}
1266
1267static int bpf_fetch_prog(struct bpf_elf_ctx *ctx, const char *section)
1268{
1269	struct bpf_elf_sec_data data;
1270	struct bpf_elf_prog prog;
1271	int ret, i, fd = -1;
1272
1273	for (i = 1; i < ctx->elf_hdr.e_shnum; i++) {
1274		if (ctx->sec_done[i])
1275			continue;
1276
1277		ret = bpf_fill_section_data(ctx, i, &data);
1278		if (ret < 0 || strcmp(data.sec_name, section))
1279			continue;
1280
1281		memset(&prog, 0, sizeof(prog));
1282		prog.type    = ctx->type;
1283		prog.insns   = data.sec_data->d_buf;
1284		prog.size    = data.sec_data->d_size;
1285		prog.license = ctx->license;
1286
1287		fd = bpf_prog_attach(section, &prog, ctx->verbose);
1288		if (fd < 0)
1289			continue;
1290
1291		ctx->sec_done[i] = true;
1292		break;
1293	}
1294
1295	return fd;
1296}
1297
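/* Apply map relocations to a program section: every relocation entry is
 * expected to point at a BPF_LD | BPF_IMM | BPF_DW instruction whose symbol
 * resolves to an entry of the maps section. The instruction is rewritten to
 * carry BPF_PSEUDO_MAP_FD in src_reg and the map's fd in imm, which is how
 * the kernel expects map references at program load time.
 */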
1298static int bpf_apply_relo_data(struct bpf_elf_ctx *ctx,
1299			       struct bpf_elf_sec_data *data_relo,
1300			       struct bpf_elf_sec_data *data_insn)
1301{
1302	Elf_Data *idata = data_insn->sec_data;
1303	GElf_Shdr *rhdr = &data_relo->sec_hdr;
1304	int relo_ent, relo_num = rhdr->sh_size / rhdr->sh_entsize;
1305	struct bpf_insn *insns = idata->d_buf;
1306	unsigned int num_insns = idata->d_size / sizeof(*insns);
1307
1308	for (relo_ent = 0; relo_ent < relo_num; relo_ent++) {
1309		unsigned int ioff, rmap;
1310		GElf_Rel relo;
1311		GElf_Sym sym;
1312
1313		if (gelf_getrel(data_relo->sec_data, relo_ent, &relo) != &relo)
1314			return -EIO;
1315
1316		ioff = relo.r_offset / sizeof(struct bpf_insn);
1317		if (ioff >= num_insns ||
1318		    insns[ioff].code != (BPF_LD | BPF_IMM | BPF_DW))
1319			return -EINVAL;
1320
1321		if (gelf_getsym(ctx->sym_tab, GELF_R_SYM(relo.r_info), &sym) != &sym)
1322			return -EIO;
1323
1324		rmap = sym.st_value / sizeof(struct bpf_elf_map);
1325		if (rmap >= ARRAY_SIZE(ctx->map_fds))
1326			return -EINVAL;
1327		if (!ctx->map_fds[rmap])
1328			return -EINVAL;
1329
1330		if (ctx->verbose)
1331			fprintf(stderr, "Map \'%s\' (%d) injected into prog "
1332				"section \'%s\' at offset %u!\n",
1333				bpf_str_tab_name(ctx, &sym), ctx->map_fds[rmap],
1334				data_insn->sec_name, ioff);
1335
1336		insns[ioff].src_reg = BPF_PSEUDO_MAP_FD;
1337		insns[ioff].imm     = ctx->map_fds[rmap];
1338	}
1339
1340	return 0;
1341}
1342
1343static int bpf_fetch_prog_relo(struct bpf_elf_ctx *ctx, const char *section)
1344{
1345	struct bpf_elf_sec_data data_relo, data_insn;
1346	struct bpf_elf_prog prog;
1347	int ret, idx, i, fd = -1;
1348
1349	for (i = 1; i < ctx->elf_hdr.e_shnum; i++) {
1350		ret = bpf_fill_section_data(ctx, i, &data_relo);
1351		if (ret < 0 || data_relo.sec_hdr.sh_type != SHT_REL)
1352			continue;
1353
1354		idx = data_relo.sec_hdr.sh_info;
1355		ret = bpf_fill_section_data(ctx, idx, &data_insn);
1356		if (ret < 0 || strcmp(data_insn.sec_name, section))
1357			continue;
1358
1359		ret = bpf_apply_relo_data(ctx, &data_relo, &data_insn);
1360		if (ret < 0)
1361			continue;
1362
1363		memset(&prog, 0, sizeof(prog));
1364		prog.type    = ctx->type;
1365		prog.insns   = data_insn.sec_data->d_buf;
1366		prog.size    = data_insn.sec_data->d_size;
1367		prog.license = ctx->license;
1368
1369		fd = bpf_prog_attach(section, &prog, ctx->verbose);
1370		if (fd < 0)
1371			continue;
1372
1373		ctx->sec_done[i]   = true;
1374		ctx->sec_done[idx] = true;
1375		break;
1376	}
1377
1378	return fd;
1379}
1380
1381static int bpf_fetch_prog_sec(struct bpf_elf_ctx *ctx, const char *section)
1382{
1383	int ret = -1;
1384
1385	if (ctx->sym_tab)
1386		ret = bpf_fetch_prog_relo(ctx, section);
1387	if (ret < 0)
1388		ret = bpf_fetch_prog(ctx, section);
1389
1390	return ret;
1391}
1392
1393static int bpf_find_map_by_id(struct bpf_elf_ctx *ctx, uint32_t id)
1394{
1395	int i;
1396
1397	for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++)
1398		if (ctx->map_fds[i] && ctx->maps[i].id == id &&
1399		    ctx->maps[i].type == BPF_MAP_TYPE_PROG_ARRAY)
1400			return i;
1401	return -1;
1402}
1403
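/* Wire up tail calls: ELF sections named "<map-id>/<key>" hold programs that
 * are loaded separately and whose fds get written into the prog array map
 * carrying the matching id, at the given key.
 */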
1404static int bpf_fill_prog_arrays(struct bpf_elf_ctx *ctx)
1405{
1406	struct bpf_elf_sec_data data;
1407	uint32_t map_id, key_id;
1408	int fd, i, ret, idx;
1409
1410	for (i = 1; i < ctx->elf_hdr.e_shnum; i++) {
1411		if (ctx->sec_done[i])
1412			continue;
1413
1414		ret = bpf_fill_section_data(ctx, i, &data);
1415		if (ret < 0)
1416			continue;
1417
1418		ret = sscanf(data.sec_name, "%i/%i", &map_id, &key_id);
1419		if (ret != 2)
1420			continue;
1421
1422		idx = bpf_find_map_by_id(ctx, map_id);
1423		if (idx < 0)
1424			continue;
1425
1426		fd = bpf_fetch_prog_sec(ctx, data.sec_name);
1427		if (fd < 0)
1428			return -EIO;
1429
1430		ret = bpf_map_update(ctx->map_fds[idx], &key_id,
1431				     &fd, BPF_ANY);
1432		if (ret < 0)
1433			return -ENOENT;
1434
1435		ctx->sec_done[i] = true;
1436	}
1437
1438	return 0;
1439}
1440
1441static void bpf_save_finfo(struct bpf_elf_ctx *ctx)
1442{
1443	struct stat st;
1444	int ret;
1445
1446	memset(&ctx->stat, 0, sizeof(ctx->stat));
1447
1448	ret = fstat(ctx->obj_fd, &st);
1449	if (ret < 0) {
1450		fprintf(stderr, "Stat of elf file failed: %s\n",
1451			strerror(errno));
1452		return;
1453	}
1454
1455	ctx->stat.st_dev = st.st_dev;
1456	ctx->stat.st_ino = st.st_ino;
1457}
1458
1459static int bpf_read_pin_mapping(FILE *fp, uint32_t *id, char *path)
1460{
1461	char buff[PATH_MAX];
1462
1463	while (fgets(buff, sizeof(buff), fp)) {
1464		char *ptr = buff;
1465
1466		while (*ptr == ' ' || *ptr == '\t')
1467			ptr++;
1468
1469		if (*ptr == '#' || *ptr == '\n' || *ptr == 0)
1470			continue;
1471
1472		if (sscanf(ptr, "%i %s\n", id, path) != 2 &&
1473		    sscanf(ptr, "%i %s #", id, path) != 2) {
1474			strcpy(path, ptr);
1475			return -1;
1476		}
1477
1478		return 1;
1479	}
1480
1481	return 0;
1482}
1483
1484static bool bpf_pinning_reserved(uint32_t pinning)
1485{
1486	switch (pinning) {
1487	case PIN_NONE:
1488	case PIN_OBJECT_NS:
1489	case PIN_GLOBAL_NS:
1490		return true;
1491	default:
1492		return false;
1493	}
1494}
1495
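/* Populate the custom pinning hash table from the pinning database, by
 * default CONFDIR "/bpf_pinning". Each non-comment line maps a numeric
 * pinning id to a subpath, e.g. (illustrative, assuming CONFDIR is
 * /etc/iproute2):
 *
 *   # /etc/iproute2/bpf_pinning
 *   3 tracing/progs
 *   4 shared/foomap
 *
 * Ids reserved for PIN_NONE, PIN_OBJECT_NS and PIN_GLOBAL_NS are skipped.
 */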
1496static void bpf_hash_init(struct bpf_elf_ctx *ctx, const char *db_file)
1497{
1498	struct bpf_hash_entry *entry;
1499	char subpath[PATH_MAX];
1500	uint32_t pinning;
1501	FILE *fp;
1502	int ret;
1503
1504	fp = fopen(db_file, "r");
1505	if (!fp)
1506		return;
1507
1508	memset(subpath, 0, sizeof(subpath));
1509	while ((ret = bpf_read_pin_mapping(fp, &pinning, subpath))) {
1510		if (ret == -1) {
1511			fprintf(stderr, "Database %s is corrupted at: %s\n",
1512				db_file, subpath);
1513			fclose(fp);
1514			return;
1515		}
1516
1517		if (bpf_pinning_reserved(pinning)) {
1518			fprintf(stderr, "Database %s, id %u is reserved - "
1519				"ignoring!\n", db_file, pinning);
1520			continue;
1521		}
1522
1523		entry = malloc(sizeof(*entry));
1524		if (!entry) {
1525			fprintf(stderr, "No memory left for db entry!\n");
1526			continue;
1527		}
1528
1529		entry->pinning = pinning;
1530		entry->subpath = strdup(subpath);
1531		if (!entry->subpath) {
1532			fprintf(stderr, "No memory left for db entry!\n");
1533			free(entry);
1534			continue;
1535		}
1536
1537		entry->next = ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)];
1538		ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)] = entry;
1539	}
1540
1541	fclose(fp);
1542}
1543
1544static void bpf_hash_destroy(struct bpf_elf_ctx *ctx)
1545{
1546	struct bpf_hash_entry *entry;
1547	int i;
1548
1549	for (i = 0; i < ARRAY_SIZE(ctx->ht); i++) {
1550		while ((entry = ctx->ht[i]) != NULL) {
1551			ctx->ht[i] = entry->next;
1552			free((char *)entry->subpath);
1553			free(entry);
1554		}
1555	}
1556}
1557
1558static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname,
1559			    enum bpf_prog_type type, bool verbose)
1560{
1561	int ret = -EINVAL;
1562
1563	if (elf_version(EV_CURRENT) == EV_NONE ||
1564	    bpf_init_env(pathname))
1565		return ret;
1566
1567	memset(ctx, 0, sizeof(*ctx));
1568	ctx->verbose = verbose;
1569	ctx->type    = type;
1570
1571	ctx->obj_fd = open(pathname, O_RDONLY);
1572	if (ctx->obj_fd < 0)
1573		return ctx->obj_fd;
1574
1575	ctx->elf_fd = elf_begin(ctx->obj_fd, ELF_C_READ, NULL);
1576	if (!ctx->elf_fd) {
1577		ret = -EINVAL;
1578		goto out_fd;
1579	}
1580
1581	if (gelf_getehdr(ctx->elf_fd, &ctx->elf_hdr) !=
1582	    &ctx->elf_hdr) {
1583		ret = -EIO;
1584		goto out_elf;
1585	}
1586
1587	ctx->sec_done = calloc(ctx->elf_hdr.e_shnum,
1588			       sizeof(*(ctx->sec_done)));
1589	if (!ctx->sec_done) {
1590		ret = -ENOMEM;
1591		goto out_elf;
1592	}
1593
1594	bpf_save_finfo(ctx);
1595	bpf_hash_init(ctx, CONFDIR "/bpf_pinning");
1596
1597	return 0;
1598out_elf:
1599	elf_end(ctx->elf_fd);
1600out_fd:
1601	close(ctx->obj_fd);
1602	return ret;
1603}
1604
1605static int bpf_maps_count(struct bpf_elf_ctx *ctx)
1606{
1607	int i, count = 0;
1608
1609	for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++) {
1610		if (!ctx->map_fds[i])
1611			break;
1612		count++;
1613	}
1614
1615	return count;
1616}
1617
1618static void bpf_maps_teardown(struct bpf_elf_ctx *ctx)
1619{
1620	int i;
1621
1622	for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++) {
1623		if (ctx->map_fds[i])
1624			close(ctx->map_fds[i]);
1625	}
1626}
1627
1628static void bpf_elf_ctx_destroy(struct bpf_elf_ctx *ctx, bool failure)
1629{
1630	if (failure)
1631		bpf_maps_teardown(ctx);
1632
1633	bpf_hash_destroy(ctx);
1634	free(ctx->sec_done);
1635	elf_end(ctx->elf_fd);
1636	close(ctx->obj_fd);
1637}
1638
1639static struct bpf_elf_ctx __ctx;
1640
1641static int bpf_obj_open(const char *pathname, enum bpf_prog_type type,
1642			const char *section, bool verbose)
1643{
1644	struct bpf_elf_ctx *ctx = &__ctx;
1645	int fd = 0, ret;
1646
1647	ret = bpf_elf_ctx_init(ctx, pathname, type, verbose);
1648	if (ret < 0) {
1649		fprintf(stderr, "Cannot initialize ELF context!\n");
1650		return ret;
1651	}
1652
1653	ret = bpf_fetch_ancillary(ctx);
1654	if (ret < 0) {
1655		fprintf(stderr, "Error fetching ELF ancillary data!\n");
1656		goto out;
1657	}
1658
1659	fd = bpf_fetch_prog_sec(ctx, section);
1660	if (fd < 0) {
1661		fprintf(stderr, "Error fetching program/map!\n");
1662		ret = fd;
1663		goto out;
1664	}
1665
1666	ret = bpf_fill_prog_arrays(ctx);
1667	if (ret < 0)
1668		fprintf(stderr, "Error filling program arrays!\n");
1669out:
1670	bpf_elf_ctx_destroy(ctx, ret < 0);
1671	if (ret < 0) {
1672		if (fd)
1673			close(fd);
1674		return ret;
1675	}
1676
1677	return fd;
1678}
1679
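/* Push the map fds plus their bpf_elf_map specs and object metadata to an
 * eBPF agent over a unix socket, transferring at most BPF_SCM_MAX_FDS fds
 * per sendmsg() as SCM_RIGHTS ancillary data.
 */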
1680static int
1681bpf_map_set_send(int fd, struct sockaddr_un *addr, unsigned int addr_len,
1682		 const struct bpf_map_data *aux, unsigned int entries)
1683{
1684	struct bpf_map_set_msg msg;
1685	int *cmsg_buf, min_fd;
1686	char *amsg_buf;
1687	int i;
1688
1689	memset(&msg, 0, sizeof(msg));
1690
1691	msg.aux.uds_ver = BPF_SCM_AUX_VER;
1692	msg.aux.num_ent = entries;
1693
1694	strncpy(msg.aux.obj_name, aux->obj, sizeof(msg.aux.obj_name));
1695	memcpy(&msg.aux.obj_st, aux->st, sizeof(msg.aux.obj_st));
1696
1697	cmsg_buf = bpf_map_set_init(&msg, addr, addr_len);
1698	amsg_buf = (char *)msg.aux.ent;
1699
1700	for (i = 0; i < entries; i += min_fd) {
1701		int ret;
1702
1703		min_fd = min(BPF_SCM_MAX_FDS * 1U, entries - i);
1704		bpf_map_set_init_single(&msg, min_fd);
1705
1706		memcpy(cmsg_buf, &aux->fds[i], sizeof(aux->fds[0]) * min_fd);
1707		memcpy(amsg_buf, &aux->ent[i], sizeof(aux->ent[0]) * min_fd);
1708
1709		ret = sendmsg(fd, &msg.hdr, 0);
1710		if (ret <= 0)
1711			return ret ? : -1;
1712	}
1713
1714	return 0;
1715}
1716
1717static int
1718bpf_map_set_recv(int fd, int *fds,  struct bpf_map_aux *aux,
1719		 unsigned int entries)
1720{
1721	struct bpf_map_set_msg msg;
1722	int *cmsg_buf, min_fd;
1723	char *amsg_buf, *mmsg_buf;
1724	unsigned int needed = 1;
1725	int i;
1726
1727	cmsg_buf = bpf_map_set_init(&msg, NULL, 0);
1728	amsg_buf = (char *)msg.aux.ent;
1729	mmsg_buf = (char *)&msg.aux;
1730
1731	for (i = 0; i < min(entries, needed); i += min_fd) {
1732		struct cmsghdr *cmsg;
1733		int ret;
1734
1735		min_fd = min(entries, entries - i);
1736		bpf_map_set_init_single(&msg, min_fd);
1737
1738		ret = recvmsg(fd, &msg.hdr, 0);
1739		if (ret <= 0)
1740			return ret ? : -1;
1741
1742		cmsg = CMSG_FIRSTHDR(&msg.hdr);
1743		if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
1744			return -EINVAL;
1745		if (msg.hdr.msg_flags & MSG_CTRUNC)
1746			return -EIO;
1747		if (msg.aux.uds_ver != BPF_SCM_AUX_VER)
1748			return -ENOSYS;
1749
1750		min_fd = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof(fd);
1751		if (min_fd > entries || min_fd <= 0)
1752			return -EINVAL;
1753
1754		memcpy(&fds[i], cmsg_buf, sizeof(fds[0]) * min_fd);
1755		memcpy(&aux->ent[i], amsg_buf, sizeof(aux->ent[0]) * min_fd);
1756		memcpy(aux, mmsg_buf, offsetof(struct bpf_map_aux, ent));
1757
1758		needed = aux->num_ent;
1759	}
1760
1761	return 0;
1762}
1763
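/* Hand all map fds of the currently loaded object over to an eBPF agent
 * listening on the given unix socket path (see the "export" keyword resp.
 * the BPF_ENV_UDS environment variable), so that e.g. a long-running
 * process can keep the maps alive and share them with others.
 */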
1764int bpf_send_map_fds(const char *path, const char *obj)
1765{
1766	struct bpf_elf_ctx *ctx = &__ctx;
1767	struct sockaddr_un addr;
1768	struct bpf_map_data bpf_aux;
1769	int fd, ret;
1770
1771	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
1772	if (fd < 0) {
1773		fprintf(stderr, "Cannot open socket: %s\n",
1774			strerror(errno));
1775		return -1;
1776	}
1777
1778	memset(&addr, 0, sizeof(addr));
1779	addr.sun_family = AF_UNIX;
1780	strncpy(addr.sun_path, path, sizeof(addr.sun_path));
1781
1782	ret = connect(fd, (struct sockaddr *)&addr, sizeof(addr));
1783	if (ret < 0) {
1784		fprintf(stderr, "Cannot connect to %s: %s\n",
1785			path, strerror(errno));
1786		return -1;
1787	}
1788
1789	memset(&bpf_aux, 0, sizeof(bpf_aux));
1790
1791	bpf_aux.fds = ctx->map_fds;
1792	bpf_aux.ent = ctx->maps;
1793	bpf_aux.st  = &ctx->stat;
1794	bpf_aux.obj = obj;
1795
1796	ret = bpf_map_set_send(fd, &addr, sizeof(addr), &bpf_aux,
1797			       bpf_maps_count(ctx));
1798	if (ret < 0)
1799		fprintf(stderr, "Cannot send fds to %s: %s\n",
1800			path, strerror(errno));
1801
1802	bpf_maps_teardown(ctx);
1803	close(fd);
1804	return ret;
1805}
1806
1807int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux,
1808		     unsigned int entries)
1809{
1810	struct sockaddr_un addr;
1811	int fd, ret;
1812
1813	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
1814	if (fd < 0) {
1815		fprintf(stderr, "Cannot open socket: %s\n",
1816			strerror(errno));
1817		return -1;
1818	}
1819
1820	memset(&addr, 0, sizeof(addr));
1821	addr.sun_family = AF_UNIX;
1822	strncpy(addr.sun_path, path, sizeof(addr.sun_path));
1823
1824	ret = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
1825	if (ret < 0) {
1826		fprintf(stderr, "Cannot bind to socket: %s\n",
1827			strerror(errno));
1828		return -1;
1829	}
1830
1831	ret = bpf_map_set_recv(fd, fds, aux, entries);
1832	if (ret < 0)
1833		fprintf(stderr, "Cannot recv fds from %s: %s\n",
1834			path, strerror(errno));
1835
1836	unlink(addr.sun_path);
1837	close(fd);
1838	return ret;
1839}
1840#endif /* HAVE_ELF */
1841