tc_bpf.c revision 8187b012731cf2699c0abd5c88673bdaebca53b2
1/*
2 * tc_bpf.c	BPF common code
3 *
4 *		This program is free software; you can distribute it and/or
5 *		modify it under the terms of the GNU General Public License
6 *		as published by the Free Software Foundation; either version
7 *		2 of the License, or (at your option) any later version.
8 *
9 * Authors:	Daniel Borkmann <dborkman@redhat.com>
10 *		Jiri Pirko <jiri@resnulli.us>
11 *		Alexei Starovoitov <ast@plumgrid.com>
12 */
13
14#include <stdio.h>
15#include <stdlib.h>
16#include <unistd.h>
17#include <string.h>
18#include <stdbool.h>
19#include <stdint.h>
20#include <errno.h>
21#include <fcntl.h>
22#include <stdarg.h>
23
24#ifdef HAVE_ELF
25#include <libelf.h>
26#include <gelf.h>
27#endif
28
29#include <sys/types.h>
30#include <sys/stat.h>
31#include <sys/un.h>
32#include <sys/vfs.h>
33#include <sys/mount.h>
34#include <sys/syscall.h>
35#include <sys/sendfile.h>
36#include <sys/resource.h>
37
38#include <linux/bpf.h>
39#include <linux/filter.h>
40#include <linux/if_alg.h>
41
42#include <arpa/inet.h>
43
44#include "utils.h"
45
46#include "bpf_elf.h"
47#include "bpf_scm.h"
48
49#include "tc_util.h"
50#include "tc_bpf.h"
51
52#ifdef HAVE_ELF
53static int bpf_obj_open(const char *path, enum bpf_prog_type type,
54			const char *sec, bool verbose);
55#else
56static int bpf_obj_open(const char *path, enum bpf_prog_type type,
57			const char *sec, bool verbose)
58{
59	fprintf(stderr, "No ELF library support compiled in.\n");
60	errno = ENOSYS;
61	return -1;
62}
63#endif
64
65static inline __u64 bpf_ptr_to_u64(const void *ptr)
66{
67	return (__u64)(unsigned long)ptr;
68}
69
70static int bpf(int cmd, union bpf_attr *attr, unsigned int size)
71{
72#ifdef __NR_bpf
73	return syscall(__NR_bpf, cmd, attr, size);
74#else
75	fprintf(stderr, "No bpf syscall, kernel headers too old?\n");
76	errno = ENOSYS;
77	return -1;
78#endif
79}
80
81static int bpf_map_update(int fd, const void *key, const void *value,
82			  uint64_t flags)
83{
84	union bpf_attr attr = {
85		.map_fd		= fd,
86		.key		= bpf_ptr_to_u64(key),
87		.value		= bpf_ptr_to_u64(value),
88		.flags		= flags,
89	};
90
91	return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
92}
93
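/* Classic BPF is handed to us as a text string of the form
 * "<len>,<code> <jt> <jf> <k>,...", i.e. the instruction count followed by
 * comma separated opcodes, which is what bpf_ops_parse() below consumes.
 * For instance, the filter for "accept IPv4, drop everything else" would
 * be passed as '4,40 0 0 12,21 0 1 2048,6 0 0 65535,6 0 0 0' (illustrative,
 * the same program tcpdump -ddd emits for the expression "ip").
 */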
94static int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len,
95			    char **bpf_string, bool *need_release,
96			    const char separator)
97{
98	char sp;
99
100	if (from_file) {
101		size_t tmp_len, op_len = sizeof("65535 255 255 4294967295,");
102		char *tmp_string;
103		FILE *fp;
104
105		tmp_len = sizeof("4096,") + BPF_MAXINSNS * op_len;
106		tmp_string = malloc(tmp_len);
107		if (tmp_string == NULL)
108			return -ENOMEM;
109
110		memset(tmp_string, 0, tmp_len);
111
112		fp = fopen(arg, "r");
113		if (fp == NULL) {
114			perror("Cannot fopen");
115			free(tmp_string);
116			return -ENOENT;
117		}
118
119		if (!fgets(tmp_string, tmp_len, fp)) {
120			free(tmp_string);
121			fclose(fp);
122			return -EIO;
123		}
124
125		fclose(fp);
126
127		*need_release = true;
128		*bpf_string = tmp_string;
129	} else {
130		*need_release = false;
131		*bpf_string = arg;
132	}
133
134	if (sscanf(*bpf_string, "%hu%c", bpf_len, &sp) != 2 ||
135	    sp != separator) {
136		if (*need_release)
137			free(*bpf_string);
138		return -EINVAL;
139	}
140
141	return 0;
142}
143
144static int bpf_ops_parse(int argc, char **argv, struct sock_filter *bpf_ops,
145			 bool from_file)
146{
147	char *bpf_string, *token, separator = ',';
148	int ret = 0, i = 0;
149	bool need_release;
150	__u16 bpf_len = 0;
151
152	if (argc < 1)
153		return -EINVAL;
154	if (bpf_parse_string(argv[0], from_file, &bpf_len, &bpf_string,
155			     &need_release, separator))
156		return -EINVAL;
157	if (bpf_len == 0 || bpf_len > BPF_MAXINSNS) {
158		ret = -EINVAL;
159		goto out;
160	}
161
162	token = bpf_string;
163	while ((token = strchr(token, separator)) && (++token)[0]) {
164		if (i >= bpf_len) {
165			fprintf(stderr, "Real program length exceeds encoded "
166				"length parameter!\n");
167			ret = -EINVAL;
168			goto out;
169		}
170
171		if (sscanf(token, "%hu %hhu %hhu %u,",
172			   &bpf_ops[i].code, &bpf_ops[i].jt,
173			   &bpf_ops[i].jf, &bpf_ops[i].k) != 4) {
174			fprintf(stderr, "Error at instruction %d!\n", i);
175			ret = -EINVAL;
176			goto out;
177		}
178
179		i++;
180	}
181
182	if (i != bpf_len) {
183		fprintf(stderr, "Parsed program length is less than encoded "
184			"length parameter!\n");
185		ret = -EINVAL;
186		goto out;
187	}
188	ret = bpf_len;
189out:
190	if (need_release)
191		free(bpf_string);
192
193	return ret;
194}
195
196void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len)
197{
198	struct sock_filter *ops = (struct sock_filter *) RTA_DATA(bpf_ops);
199	int i;
200
201	if (len == 0)
202		return;
203
204	fprintf(f, "bytecode \'%u,", len);
205
206	for (i = 0; i < len - 1; i++)
207		fprintf(f, "%hu %hhu %hhu %u,", ops[i].code, ops[i].jt,
208			ops[i].jf, ops[i].k);
209
210	fprintf(f, "%hu %hhu %hhu %u\'", ops[i].code, ops[i].jt,
211		ops[i].jf, ops[i].k);
212}
213
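/* Sanity check a pinned map fd against the expected map specification by
 * parsing /proc/<pid>/fdinfo/<fd>. For eBPF map fds the kernel exposes
 * lines such as (illustrative values):
 *
 *   map_type:    1
 *   key_size:    4
 *   value_size:  4
 *   max_entries: 256
 *
 * which are matched below against the first 'length' bytes of the
 * struct bpf_elf_map description taken from the object file.
 */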
214static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map,
215				    int length)
216{
217	char file[PATH_MAX], buff[4096];
218	struct bpf_elf_map tmp, zero;
219	unsigned int val;
220	FILE *fp;
221
222	snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd);
223
224	fp = fopen(file, "r");
225	if (!fp) {
226		fprintf(stderr, "No procfs support?!\n");
227		return -EIO;
228	}
229
230	memset(&tmp, 0, sizeof(tmp));
231	while (fgets(buff, sizeof(buff), fp)) {
232		if (sscanf(buff, "map_type:\t%u", &val) == 1)
233			tmp.type = val;
234		else if (sscanf(buff, "key_size:\t%u", &val) == 1)
235			tmp.size_key = val;
236		else if (sscanf(buff, "value_size:\t%u", &val) == 1)
237			tmp.size_value = val;
238		else if (sscanf(buff, "max_entries:\t%u", &val) == 1)
239			tmp.max_elem = val;
240	}
241
242	fclose(fp);
243
244	if (!memcmp(&tmp, map, length)) {
245		return 0;
246	} else {
247		memset(&zero, 0, sizeof(zero));
248		/* If kernel doesn't have eBPF-related fdinfo, we cannot do much,
249		 * so just accept it. We know we do have an eBPF fd and in this
250		 * case, everything is 0. It is guaranteed that no such map exists
251		 * since map type of 0 is unloadable BPF_MAP_TYPE_UNSPEC.
252		 */
253		if (!memcmp(&tmp, &zero, length))
254			return 0;
255
256		fprintf(stderr, "Map specs from pinned file differ!\n");
257		return -EINVAL;
258	}
259}
260
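/* Mount a bpf pseudo file system on 'target'. Roughly the shell
 * equivalent of (sketch based on the error messages below):
 *
 *   mount --make-private <target>
 *   mount --bind <target> <target>   # only if make-private gave EINVAL,
 *                                    # make-private is then retried
 *   mount -t bpf bpf <target>
 *
 * Marking the mount point private keeps it from propagating into peer
 * mount namespaces.
 */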
261static int bpf_mnt_fs(const char *target)
262{
263	bool bind_done = false;
264
265	while (mount("", target, "none", MS_PRIVATE | MS_REC, NULL)) {
266		if (errno != EINVAL || bind_done) {
267			fprintf(stderr, "mount --make-private %s failed: %s\n",
268				target,	strerror(errno));
269			return -1;
270		}
271
272		if (mount(target, target, "none", MS_BIND, NULL)) {
273			fprintf(stderr, "mount --bind %s %s failed: %s\n",
274				target,	target, strerror(errno));
275			return -1;
276		}
277
278		bind_done = true;
279	}
280
281	if (mount("bpf", target, "bpf", 0, NULL)) {
282		fprintf(stderr, "mount -t bpf bpf %s failed: %s\n",
283			target,	strerror(errno));
284		return -1;
285	}
286
287	return 0;
288}
289
290static int bpf_valid_mntpt(const char *mnt, unsigned long magic)
291{
292	struct statfs st_fs;
293
294	if (statfs(mnt, &st_fs) < 0)
295		return -ENOENT;
296	if ((unsigned long)st_fs.f_type != magic)
297		return -ENOENT;
298
299	return 0;
300}
301
302static const char *bpf_find_mntpt(const char *fstype, unsigned long magic,
303				  char *mnt, int len,
304				  const char * const *known_mnts)
305{
306	const char * const *ptr;
307	char type[100];
308	FILE *fp;
309
310	if (known_mnts) {
311		ptr = known_mnts;
312		while (*ptr) {
313			if (bpf_valid_mntpt(*ptr, magic) == 0) {
314				strncpy(mnt, *ptr, len - 1);
315				mnt[len - 1] = 0;
316				return mnt;
317			}
318			ptr++;
319		}
320	}
321
322	fp = fopen("/proc/mounts", "r");
323	if (fp == NULL || len != PATH_MAX)
324		return NULL;
325
326	while (fscanf(fp, "%*s %" textify(PATH_MAX) "s %99s %*s %*d %*d\n",
327		      mnt, type) == 2) {
328		if (strcmp(type, fstype) == 0)
329			break;
330	}
331
332	fclose(fp);
333	if (strcmp(type, fstype) != 0)
334		return NULL;
335
336	return mnt;
337}
338
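/* Stream the tracefs trace_pipe (e.g. bpf_trace_printk() output) to stderr
 * until the user interrupts with ^C. The tracefs mount point is taken from
 * a list of well-known locations or auto-detected via /proc/mounts.
 */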
339int bpf_trace_pipe(void)
340{
341	char tracefs_mnt[PATH_MAX] = TRACE_DIR_MNT;
342	static const char * const tracefs_known_mnts[] = {
343		TRACE_DIR_MNT,
344		"/sys/kernel/debug/tracing",
345		"/tracing",
346		"/trace",
347		0,
348	};
349	char tpipe[PATH_MAX];
350	const char *mnt;
351	int fd;
352
353	mnt = bpf_find_mntpt("tracefs", TRACEFS_MAGIC, tracefs_mnt,
354			     sizeof(tracefs_mnt), tracefs_known_mnts);
355	if (!mnt) {
356		fprintf(stderr, "tracefs not mounted?\n");
357		return -1;
358	}
359
360	snprintf(tpipe, sizeof(tpipe), "%s/trace_pipe", mnt);
361
362	fd = open(tpipe, O_RDONLY);
363	if (fd < 0)
364		return -1;
365
366	fprintf(stderr, "Running! Hang up with ^C!\n\n");
367	while (1) {
368		static char buff[4096];
369		ssize_t ret;
370
371		ret = read(fd, buff, sizeof(buff) - 1);
372		if (ret > 0) {
373			write(2, buff, ret);
374			fflush(stderr);
375		}
376	}
377
378	return 0;
379}
380
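/* Lazily resolve (and mount, if necessary) the bpf fs and create the tc
 * working directories underneath it. With the default mount point this
 * typically yields a layout like (illustrative):
 *
 *   /sys/fs/bpf/              BPF_DIR_MNT
 *   /sys/fs/bpf/tc/           BPF_DIR_TC, per-object dirs are created here
 *   /sys/fs/bpf/tc/globals/   BPF_DIR_GLOBALS, used for PIN_GLOBAL_NS maps
 *
 * An alternative mount point can be given through the environment variable
 * named by BPF_ENV_MNT. The result is cached across calls.
 */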
381static const char *bpf_get_tc_dir(void)
382{
383	static bool bpf_mnt_cached = false;
384	static char bpf_tc_dir[PATH_MAX];
385	static const char *mnt;
386	static const char * const bpf_known_mnts[] = {
387		BPF_DIR_MNT,
388		0,
389	};
390	char bpf_mnt[PATH_MAX] = BPF_DIR_MNT;
391	char bpf_glo_dir[PATH_MAX];
392	int ret;
393
394	if (bpf_mnt_cached)
395		goto done;
396
397	mnt = bpf_find_mntpt("bpf", BPF_FS_MAGIC, bpf_mnt, sizeof(bpf_mnt),
398			     bpf_known_mnts);
399	if (!mnt) {
400		mnt = getenv(BPF_ENV_MNT);
401		if (!mnt)
402			mnt = BPF_DIR_MNT;
403		ret = bpf_mnt_fs(mnt);
404		if (ret) {
405			mnt = NULL;
406			goto out;
407		}
408	}
409
410	snprintf(bpf_tc_dir, sizeof(bpf_tc_dir), "%s/%s", mnt, BPF_DIR_TC);
411	ret = mkdir(bpf_tc_dir, S_IRWXU);
412	if (ret && errno != EEXIST) {
413		fprintf(stderr, "mkdir %s failed: %s\n", bpf_tc_dir,
414			strerror(errno));
415		mnt = NULL;
416		goto out;
417	}
418
419	snprintf(bpf_glo_dir, sizeof(bpf_glo_dir), "%s/%s",
420		 bpf_tc_dir, BPF_DIR_GLOBALS);
421	ret = mkdir(bpf_glo_dir, S_IRWXU);
422	if (ret && errno != EEXIST) {
423		fprintf(stderr, "mkdir %s failed: %s\n", bpf_glo_dir,
424			strerror(errno));
425		mnt = NULL;
426		goto out;
427	}
428
429	mnt = bpf_tc_dir;
430out:
431	bpf_mnt_cached = true;
432done:
433	return mnt;
434}
435
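/* Retrieve an fd from a pinned object. Paths prefixed with "m:" are taken
 * relative to the tc bpf directory, e.g. "m:globals/foo" would resolve to
 * "<tc dir>/globals/foo" (illustrative); everything else is used verbatim.
 */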
436static int bpf_obj_get(const char *pathname)
437{
438	union bpf_attr attr;
439	char tmp[PATH_MAX];
440
441	if (strlen(pathname) > 2 && pathname[0] == 'm' &&
442	    pathname[1] == ':' && bpf_get_tc_dir()) {
443		snprintf(tmp, sizeof(tmp), "%s/%s",
444			 bpf_get_tc_dir(), pathname + 2);
445		pathname = tmp;
446	}
447
448	memset(&attr, 0, sizeof(attr));
449	attr.pathname = bpf_ptr_to_u64(pathname);
450
451	return bpf(BPF_OBJ_GET, &attr, sizeof(attr));
452}
453
454const char *bpf_default_section(const enum bpf_prog_type type)
455{
456	switch (type) {
457	case BPF_PROG_TYPE_SCHED_CLS:
458		return ELF_SECTION_CLASSIFIER;
459	case BPF_PROG_TYPE_SCHED_ACT:
460		return ELF_SECTION_ACTION;
461	default:
462		return NULL;
463	}
464}
465
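/* Modes in which a program can be handed to us on the command line
 * (sketch, keywords as accepted by bpf_parse() below):
 *
 *   CBPF_BYTECODE  "bytecode" / "bc"              inline classic BPF opcodes
 *   CBPF_FILE      "bytecode-file" / "bcf"        classic BPF opcodes from a file
 *   EBPF_OBJECT    "object-file" / "obj"          eBPF ELF object, e.g. from clang
 *   EBPF_PINNED    "object-pinned" / "pinned" / "fd"  already pinned program
 *
 * For the eBPF cases, optional "type cls|act", "section <name>",
 * "export <uds>" and "verbose" arguments may follow.
 */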
466enum bpf_mode {
467	CBPF_BYTECODE = 0,
468	CBPF_FILE,
469	EBPF_OBJECT,
470	EBPF_PINNED,
471	__BPF_MODE_MAX,
472#define BPF_MODE_MAX	__BPF_MODE_MAX
473};
474
475static int bpf_parse(int *ptr_argc, char ***ptr_argv, const bool *opt_tbl,
476		     enum bpf_prog_type *type, enum bpf_mode *mode,
477		     const char **ptr_object, const char **ptr_section,
478		     const char **ptr_uds_name, struct sock_filter *opcodes)
479{
480	const char *file, *section, *uds_name;
481	bool verbose = false;
482	int ret, argc;
483	char **argv;
484
485	argv = *ptr_argv;
486	argc = *ptr_argc;
487
488	if (opt_tbl[CBPF_BYTECODE] &&
489	    (matches(*argv, "bytecode") == 0 ||
490	     strcmp(*argv, "bc") == 0)) {
491		*mode = CBPF_BYTECODE;
492	} else if (opt_tbl[CBPF_FILE] &&
493		   (matches(*argv, "bytecode-file") == 0 ||
494		    strcmp(*argv, "bcf") == 0)) {
495		*mode = CBPF_FILE;
496	} else if (opt_tbl[EBPF_OBJECT] &&
497		   (matches(*argv, "object-file") == 0 ||
498		    strcmp(*argv, "obj") == 0)) {
499		*mode = EBPF_OBJECT;
500	} else if (opt_tbl[EBPF_PINNED] &&
501		   (matches(*argv, "object-pinned") == 0 ||
502		    matches(*argv, "pinned") == 0 ||
503		    matches(*argv, "fd") == 0)) {
504		*mode = EBPF_PINNED;
505	} else {
506		fprintf(stderr, "What mode is \"%s\"?\n", *argv);
507		return -1;
508	}
509
510	NEXT_ARG();
511	file = section = uds_name = NULL;
512	if (*mode == EBPF_OBJECT || *mode == EBPF_PINNED) {
513		file = *argv;
514		NEXT_ARG_FWD();
515
516		if (*type == BPF_PROG_TYPE_UNSPEC) {
517			if (argc > 0 && matches(*argv, "type") == 0) {
518				NEXT_ARG();
519				if (matches(*argv, "cls") == 0) {
520					*type = BPF_PROG_TYPE_SCHED_CLS;
521				} else if (matches(*argv, "act") == 0) {
522					*type = BPF_PROG_TYPE_SCHED_ACT;
523				} else {
524					fprintf(stderr, "What type is \"%s\"?\n",
525						*argv);
526					return -1;
527				}
528				NEXT_ARG_FWD();
529			} else {
530				*type = BPF_PROG_TYPE_SCHED_CLS;
531			}
532		}
533
534		section = bpf_default_section(*type);
535		if (argc > 0 && matches(*argv, "section") == 0) {
536			NEXT_ARG();
537			section = *argv;
538			NEXT_ARG_FWD();
539		}
540
541		uds_name = getenv(BPF_ENV_UDS);
542		if (argc > 0 && !uds_name &&
543		    matches(*argv, "export") == 0) {
544			NEXT_ARG();
545			uds_name = *argv;
546			NEXT_ARG_FWD();
547		}
548
549		if (argc > 0 && matches(*argv, "verbose") == 0) {
550			verbose = true;
551			NEXT_ARG_FWD();
552		}
553
554		PREV_ARG();
555	}
556
557	if (*mode == CBPF_BYTECODE || *mode == CBPF_FILE)
558		ret = bpf_ops_parse(argc, argv, opcodes, *mode == CBPF_FILE);
559	else if (*mode == EBPF_OBJECT)
560		ret = bpf_obj_open(file, *type, section, verbose);
561	else if (*mode == EBPF_PINNED)
562		ret = bpf_obj_get(file);
563	else
564		return -1;
565
566	if (ptr_object)
567		*ptr_object = file;
568	if (ptr_section)
569		*ptr_section = section;
570	if (ptr_uds_name)
571		*ptr_uds_name = uds_name;
572
573	*ptr_argc = argc;
574	*ptr_argv = argv;
575
576	return ret;
577}
578
579int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl,
580		     enum bpf_prog_type type, const char **ptr_object,
581		     const char **ptr_uds_name, struct nlmsghdr *n)
582{
583	struct sock_filter opcodes[BPF_MAXINSNS];
584	const bool opt_tbl[BPF_MODE_MAX] = {
585		[CBPF_BYTECODE]	= true,
586		[CBPF_FILE]	= true,
587		[EBPF_OBJECT]	= true,
588		[EBPF_PINNED]	= true,
589	};
590	char annotation[256];
591	const char *section;
592	enum bpf_mode mode;
593	int ret;
594
595	ret = bpf_parse(ptr_argc, ptr_argv, opt_tbl, &type, &mode,
596			ptr_object, &section, ptr_uds_name, opcodes);
597	if (ret < 0)
598		return ret;
599
600	if (mode == CBPF_BYTECODE || mode == CBPF_FILE) {
601		addattr16(n, MAX_MSG, nla_tbl[BPF_NLA_OPS_LEN], ret);
602		addattr_l(n, MAX_MSG, nla_tbl[BPF_NLA_OPS], opcodes,
603			  ret * sizeof(struct sock_filter));
604	}
605
606	if (mode == EBPF_OBJECT || mode == EBPF_PINNED) {
607		snprintf(annotation, sizeof(annotation), "%s:[%s]",
608			 basename(*ptr_object), mode == EBPF_PINNED ?
609			 "*fsobj" : section);
610
611		addattr32(n, MAX_MSG, nla_tbl[BPF_NLA_FD], ret);
612		addattrstrz(n, MAX_MSG, nla_tbl[BPF_NLA_NAME], annotation);
613	}
614
615	return 0;
616}
617
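/* Load an eBPF program and graft it into an already pinned
 * BPF_MAP_TYPE_PROG_ARRAY at 'map_path'. The target slot is either passed
 * in via 'key' or inferred from the section name, which is then expected
 * to look like "<something>/<key>" (the part after the slash is the slot).
 */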
618int bpf_graft_map(const char *map_path, uint32_t *key, int argc, char **argv)
619{
620	enum bpf_prog_type type = BPF_PROG_TYPE_UNSPEC;
621	const bool opt_tbl[BPF_MODE_MAX] = {
622		[CBPF_BYTECODE]	= false,
623		[CBPF_FILE]	= false,
624		[EBPF_OBJECT]	= true,
625		[EBPF_PINNED]	= true,
626	};
627	const struct bpf_elf_map test = {
628		.type		= BPF_MAP_TYPE_PROG_ARRAY,
629		.size_key	= sizeof(int),
630		.size_value	= sizeof(int),
631	};
632	int ret, prog_fd, map_fd;
633	const char *section;
634	enum bpf_mode mode;
635	uint32_t map_key;
636
637	prog_fd = bpf_parse(&argc, &argv, opt_tbl, &type, &mode,
638			    NULL, &section, NULL, NULL);
639	if (prog_fd < 0)
640		return prog_fd;
641	if (key) {
642		map_key = *key;
643	} else {
644		ret = sscanf(section, "%*i/%i", &map_key);
645		if (ret != 1) {
646			fprintf(stderr, "Couldn\'t infer map key from section "
647				"name! Please provide \'key\' argument!\n");
648			ret = -EINVAL;
649			goto out_prog;
650		}
651	}
652
653	map_fd = bpf_obj_get(map_path);
654	if (map_fd < 0) {
655		fprintf(stderr, "Couldn\'t retrieve pinned map \'%s\': %s\n",
656			map_path, strerror(errno));
657		ret = map_fd;
658		goto out_prog;
659	}
660
661	ret = bpf_map_selfcheck_pinned(map_fd, &test,
662				       offsetof(struct bpf_elf_map, max_elem));
663	if (ret < 0) {
664		fprintf(stderr, "Map \'%s\' self-check failed!\n", map_path);
665		goto out_map;
666	}
667
668	ret = bpf_map_update(map_fd, &map_key, &prog_fd, BPF_ANY);
669	if (ret < 0)
670		fprintf(stderr, "Map update failed: %s\n", strerror(errno));
671out_map:
672	close(map_fd);
673out_prog:
674	close(prog_fd);
675	return ret;
676}
677
678#ifdef HAVE_ELF
679struct bpf_elf_prog {
680	enum bpf_prog_type	type;
681	const struct bpf_insn	*insns;
682	size_t			size;
683	const char		*license;
684};
685
686struct bpf_hash_entry {
687	unsigned int		pinning;
688	const char		*subpath;
689	struct bpf_hash_entry	*next;
690};
691
692struct bpf_elf_ctx {
693	Elf			*elf_fd;
694	GElf_Ehdr		elf_hdr;
695	Elf_Data		*sym_tab;
696	Elf_Data		*str_tab;
697	int			obj_fd;
698	int			map_fds[ELF_MAX_MAPS];
699	struct bpf_elf_map	maps[ELF_MAX_MAPS];
700	int			sym_num;
701	int			map_num;
702	bool			*sec_done;
703	int			sec_maps;
704	char			license[ELF_MAX_LICENSE_LEN];
705	enum bpf_prog_type	type;
706	bool			verbose;
707	struct bpf_elf_st	stat;
708	struct bpf_hash_entry	*ht[256];
709};
710
711struct bpf_elf_sec_data {
712	GElf_Shdr		sec_hdr;
713	Elf_Data		*sec_data;
714	const char		*sec_name;
715};
716
717struct bpf_map_data {
718	int			*fds;
719	const char		*obj;
720	struct bpf_elf_st	*st;
721	struct bpf_elf_map	*ent;
722};
723
724/* If we provide a small buffer with the log level enabled, the kernel
725 * could reject the program load because no buffer space is left for the
726 * verifier log. In case something doesn't pass the verifier, we still
727 * want to hand something descriptive to the user.
728 */
729static char bpf_log_buf[65536];
730
731static __check_format_string(1, 2) void bpf_dump_error(const char *format, ...)
732{
733	va_list vl;
734
735	va_start(vl, format);
736	vfprintf(stderr, format, vl);
737	va_end(vl);
738
739	if (bpf_log_buf[0]) {
740		fprintf(stderr, "%s\n", bpf_log_buf);
741		memset(bpf_log_buf, 0, sizeof(bpf_log_buf));
742	}
743}
744
745static int bpf_map_create(enum bpf_map_type type, unsigned int size_key,
746			  unsigned int size_value, unsigned int max_elem)
747{
748	union bpf_attr attr = {
749		.map_type	= type,
750		.key_size	= size_key,
751		.value_size	= size_value,
752		.max_entries	= max_elem,
753	};
754
755	return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
756}
757
758static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
759			 size_t size, const char *license)
760{
761	union bpf_attr attr = {
762		.prog_type	= type,
763		.insns		= bpf_ptr_to_u64(insns),
764		.insn_cnt	= size / sizeof(struct bpf_insn),
765		.license	= bpf_ptr_to_u64(license),
766		.log_buf	= bpf_ptr_to_u64(bpf_log_buf),
767		.log_size	= sizeof(bpf_log_buf),
768		.log_level	= 1,
769	};
770
771	if (getenv(BPF_ENV_NOLOG)) {
772		attr.log_buf	= 0;
773		attr.log_size	= 0;
774		attr.log_level	= 0;
775	}
776
777	return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
778}
779
780static int bpf_obj_pin(int fd, const char *pathname)
781{
782	union bpf_attr attr = {
783		.pathname	= bpf_ptr_to_u64(pathname),
784		.bpf_fd		= fd,
785	};
786
787	return bpf(BPF_OBJ_PIN, &attr, sizeof(attr));
788}
789
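/* Hash the object file via the kernel crypto API (AF_ALG, "sha1"): the file
 * is spliced into the algorithm socket with sendfile() and the resulting
 * 20 byte digest is read back. bpf_get_obj_uid() turns the digest into a
 * stable, per-object subdirectory name used for PIN_OBJECT_NS pinning.
 */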
790static int bpf_obj_hash(const char *object, uint8_t *out, size_t len)
791{
792	struct sockaddr_alg alg = {
793		.salg_family	= AF_ALG,
794		.salg_type	= "hash",
795		.salg_name	= "sha1",
796	};
797	int ret, cfd, ofd, ffd;
798	struct stat stbuff;
799	ssize_t size;
800
801	if (!object || len != 20)
802		return -EINVAL;
803
804	cfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
805	if (cfd < 0) {
806		fprintf(stderr, "Cannot get AF_ALG socket: %s\n",
807			strerror(errno));
808		return cfd;
809	}
810
811	ret = bind(cfd, (struct sockaddr *)&alg, sizeof(alg));
812	if (ret < 0) {
813		fprintf(stderr, "Error binding socket: %s\n", strerror(errno));
814		goto out_cfd;
815	}
816
817	ofd = accept(cfd, NULL, 0);
818	if (ofd < 0) {
819		fprintf(stderr, "Error accepting socket: %s\n",
820			strerror(errno));
821		ret = ofd;
822		goto out_cfd;
823	}
824
825	ffd = open(object, O_RDONLY);
826	if (ffd < 0) {
827		fprintf(stderr, "Error opening object %s: %s\n",
828			object, strerror(errno));
829		ret = ffd;
830		goto out_ofd;
831	}
832
833	ret = fstat(ffd, &stbuff);
834	if (ret < 0) {
835		fprintf(stderr, "Error doing fstat: %s\n",
836			strerror(errno));
837		goto out_ffd;
838	}
839
840	size = sendfile(ofd, ffd, NULL, stbuff.st_size);
841	if (size != stbuff.st_size) {
842		fprintf(stderr, "Error from sendfile (%zd vs %zu bytes): %s\n",
843			size, (size_t)stbuff.st_size, strerror(errno));
844		ret = -1;
845		goto out_ffd;
846	}
847
848	size = read(ofd, out, len);
849	if (size != len) {
850		fprintf(stderr, "Error from read (%zd vs %zu bytes): %s\n",
851			size, len, strerror(errno));
852		ret = -1;
853	} else {
854		ret = 0;
855	}
856out_ffd:
857	close(ffd);
858out_ofd:
859	close(ofd);
860out_cfd:
861	close(cfd);
862	return ret;
863}
864
865static const char *bpf_get_obj_uid(const char *pathname)
866{
867	static bool bpf_uid_cached = false;
868	static char bpf_uid[64];
869	uint8_t tmp[20];
870	int ret;
871
872	if (bpf_uid_cached)
873		goto done;
874
875	ret = bpf_obj_hash(pathname, tmp, sizeof(tmp));
876	if (ret) {
877		fprintf(stderr, "Object hashing failed!\n");
878		return NULL;
879	}
880
881	hexstring_n2a(tmp, sizeof(tmp), bpf_uid, sizeof(bpf_uid));
882	bpf_uid_cached = true;
883done:
884	return bpf_uid;
885}
886
887static int bpf_init_env(const char *pathname)
888{
889	struct rlimit limit = {
890		.rlim_cur = RLIM_INFINITY,
891		.rlim_max = RLIM_INFINITY,
892	};
893
894	/* Don't bother in case we fail! */
895	setrlimit(RLIMIT_MEMLOCK, &limit);
896
897	if (!bpf_get_tc_dir()) {
898		fprintf(stderr, "Continuing without mounted eBPF fs. "
899			"Too old kernel?\n");
900		return 0;
901	}
902
903	if (!bpf_get_obj_uid(pathname))
904		return -1;
905
906	return 0;
907}
908
909static const char *bpf_custom_pinning(const struct bpf_elf_ctx *ctx,
910				      uint32_t pinning)
911{
912	struct bpf_hash_entry *entry;
913
914	entry = ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)];
915	while (entry && entry->pinning != pinning)
916		entry = entry->next;
917
918	return entry ? entry->subpath : NULL;
919}
920
921static bool bpf_no_pinning(const struct bpf_elf_ctx *ctx,
922			   uint32_t pinning)
923{
924	switch (pinning) {
925	case PIN_OBJECT_NS:
926	case PIN_GLOBAL_NS:
927		return false;
928	case PIN_NONE:
929		return true;
930	default:
931		return !bpf_custom_pinning(ctx, pinning);
932	}
933}
934
935static void bpf_make_pathname(char *pathname, size_t len, const char *name,
936			      const struct bpf_elf_ctx *ctx, uint32_t pinning)
937{
938	switch (pinning) {
939	case PIN_OBJECT_NS:
940		snprintf(pathname, len, "%s/%s/%s", bpf_get_tc_dir(),
941			 bpf_get_obj_uid(NULL), name);
942		break;
943	case PIN_GLOBAL_NS:
944		snprintf(pathname, len, "%s/%s/%s", bpf_get_tc_dir(),
945			 BPF_DIR_GLOBALS, name);
946		break;
947	default:
948		snprintf(pathname, len, "%s/../%s/%s", bpf_get_tc_dir(),
949			 bpf_custom_pinning(ctx, pinning), name);
950		break;
951	}
952}
953
954static int bpf_probe_pinned(const char *name, const struct bpf_elf_ctx *ctx,
955			    uint32_t pinning)
956{
957	char pathname[PATH_MAX];
958
959	if (bpf_no_pinning(ctx, pinning) || !bpf_get_tc_dir())
960		return 0;
961
962	bpf_make_pathname(pathname, sizeof(pathname), name, ctx, pinning);
963	return bpf_obj_get(pathname);
964}
965
966static int bpf_make_obj_path(void)
967{
968	char tmp[PATH_MAX];
969	int ret;
970
971	snprintf(tmp, sizeof(tmp), "%s/%s", bpf_get_tc_dir(),
972		 bpf_get_obj_uid(NULL));
973
974	ret = mkdir(tmp, S_IRWXU);
975	if (ret && errno != EEXIST) {
976		fprintf(stderr, "mkdir %s failed: %s\n", tmp, strerror(errno));
977		return ret;
978	}
979
980	return 0;
981}
982
983static int bpf_make_custom_path(const char *todo)
984{
985	char tmp[PATH_MAX], rem[PATH_MAX], *sub;
986	int ret;
987
988	snprintf(tmp, sizeof(tmp), "%s/../", bpf_get_tc_dir());
989	snprintf(rem, sizeof(rem), "%s/", todo);
990	sub = strtok(rem, "/");
991
992	while (sub) {
993		if (strlen(tmp) + strlen(sub) + 2 > PATH_MAX)
994			return -EINVAL;
995
996		strcat(tmp, sub);
997		strcat(tmp, "/");
998
999		ret = mkdir(tmp, S_IRWXU);
1000		if (ret && errno != EEXIST) {
1001			fprintf(stderr, "mkdir %s failed: %s\n", tmp,
1002				strerror(errno));
1003			return ret;
1004		}
1005
1006		sub = strtok(NULL, "/");
1007	}
1008
1009	return 0;
1010}
1011
1012static int bpf_place_pinned(int fd, const char *name,
1013			    const struct bpf_elf_ctx *ctx, uint32_t pinning)
1014{
1015	char pathname[PATH_MAX];
1016	const char *tmp;
1017	int ret = 0;
1018
1019	if (bpf_no_pinning(ctx, pinning) || !bpf_get_tc_dir())
1020		return 0;
1021
1022	if (pinning == PIN_OBJECT_NS)
1023		ret = bpf_make_obj_path();
1024	else if ((tmp = bpf_custom_pinning(ctx, pinning)))
1025		ret = bpf_make_custom_path(tmp);
1026	if (ret < 0)
1027		return ret;
1028
1029	bpf_make_pathname(pathname, sizeof(pathname), name, ctx, pinning);
1030	return bpf_obj_pin(fd, pathname);
1031}
1032
1033static int bpf_prog_attach(const char *section,
1034			   const struct bpf_elf_prog *prog, bool verbose)
1035{
1036	int fd;
1037
1038	/* We can add pinning here later as well, same as bpf_map_attach(). */
1039	errno = 0;
1040	fd = bpf_prog_load(prog->type, prog->insns, prog->size,
1041			   prog->license);
1042	if (fd < 0 || verbose) {
1043		bpf_dump_error("Prog section \'%s\' (type:%u insns:%zu "
1044			       "license:\'%s\') %s%s (%d)!\n\n",
1045			       section, prog->type,
1046			       prog->size / sizeof(struct bpf_insn),
1047			       prog->license, fd < 0 ? "rejected: " :
1048			       "loaded", fd < 0 ? strerror(errno) : "",
1049			       fd < 0 ? errno : fd);
1050	}
1051
1052	return fd;
1053}
1054
1055static int bpf_map_attach(const char *name, const struct bpf_elf_map *map,
1056			  const struct bpf_elf_ctx *ctx, bool verbose)
1057{
1058	int fd, ret;
1059
1060	fd = bpf_probe_pinned(name, ctx, map->pinning);
1061	if (fd > 0) {
1062		ret = bpf_map_selfcheck_pinned(fd, map,
1063					       offsetof(struct bpf_elf_map,
1064							id));
1065		if (ret < 0) {
1066			close(fd);
1067			fprintf(stderr, "Map \'%s\' self-check failed!\n",
1068				name);
1069			return ret;
1070		}
1071		if (verbose)
1072			fprintf(stderr, "Map \'%s\' loaded as pinned!\n",
1073				name);
1074		return fd;
1075	}
1076
1077	errno = 0;
1078	fd = bpf_map_create(map->type, map->size_key, map->size_value,
1079			    map->max_elem);
1080	if (fd < 0 || verbose) {
1081		bpf_dump_error("Map \'%s\' (type:%u id:%u pinning:%u "
1082			       "ksize:%u vsize:%u max-elems:%u) %s%s (%d)!\n",
1083			       name, map->type, map->id, map->pinning,
1084			       map->size_key, map->size_value, map->max_elem,
1085			       fd < 0 ? "rejected: " : "loaded", fd < 0 ?
1086			       strerror(errno) : "", fd < 0 ? errno : fd);
1087		if (fd < 0)
1088			return fd;
1089	}
1090
1091	ret = bpf_place_pinned(fd, name, ctx, map->pinning);
1092	if (ret < 0 && errno != EEXIST) {
1093		fprintf(stderr, "Could not pin %s map: %s\n", name,
1094			strerror(errno));
1095		close(fd);
1096		return ret;
1097	}
1098
1099	return fd;
1100}
1101
1102#define __ELF_ST_BIND(x)	((x) >> 4)
1103#define __ELF_ST_TYPE(x)	(((unsigned int)(x)) & 0xf)
1104
1105static const char *bpf_str_tab_name(const struct bpf_elf_ctx *ctx,
1106				    const GElf_Sym *sym)
1107{
1108	return ctx->str_tab->d_buf + sym->st_name;
1109}
1110
1111static const char *bpf_map_fetch_name(struct bpf_elf_ctx *ctx, int which)
1112{
1113	GElf_Sym sym;
1114	int i;
1115
1116	for (i = 0; i < ctx->sym_num; i++) {
1117		if (gelf_getsym(ctx->sym_tab, i, &sym) != &sym)
1118			continue;
1119
1120		if (__ELF_ST_BIND(sym.st_info) != STB_GLOBAL ||
1121		    __ELF_ST_TYPE(sym.st_info) != STT_NOTYPE ||
1122		    sym.st_shndx != ctx->sec_maps ||
1123		    sym.st_value / sizeof(struct bpf_elf_map) != which)
1124			continue;
1125
1126		return bpf_str_tab_name(ctx, &sym);
1127	}
1128
1129	return NULL;
1130}
1131
1132static int bpf_maps_attach_all(struct bpf_elf_ctx *ctx)
1133{
1134	const char *map_name;
1135	int i, fd;
1136
1137	for (i = 0; i < ctx->map_num; i++) {
1138		map_name = bpf_map_fetch_name(ctx, i);
1139		if (!map_name)
1140			return -EIO;
1141
1142		fd = bpf_map_attach(map_name, &ctx->maps[i], ctx,
1143				    ctx->verbose);
1144		if (fd < 0)
1145			return fd;
1146
1147		ctx->map_fds[i] = fd;
1148	}
1149
1150	return 0;
1151}
1152
1153static int bpf_fill_section_data(struct bpf_elf_ctx *ctx, int section,
1154				 struct bpf_elf_sec_data *data)
1155{
1156	Elf_Data *sec_edata;
1157	GElf_Shdr sec_hdr;
1158	Elf_Scn *sec_fd;
1159	char *sec_name;
1160
1161	memset(data, 0, sizeof(*data));
1162
1163	sec_fd = elf_getscn(ctx->elf_fd, section);
1164	if (!sec_fd)
1165		return -EINVAL;
1166	if (gelf_getshdr(sec_fd, &sec_hdr) != &sec_hdr)
1167		return -EIO;
1168
1169	sec_name = elf_strptr(ctx->elf_fd, ctx->elf_hdr.e_shstrndx,
1170			      sec_hdr.sh_name);
1171	if (!sec_name || !sec_hdr.sh_size)
1172		return -ENOENT;
1173
1174	sec_edata = elf_getdata(sec_fd, NULL);
1175	if (!sec_edata || elf_getdata(sec_fd, sec_edata))
1176		return -EIO;
1177
1178	memcpy(&data->sec_hdr, &sec_hdr, sizeof(sec_hdr));
1179
1180	data->sec_name = sec_name;
1181	data->sec_data = sec_edata;
1182	return 0;
1183}
1184
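/* Maps are taken from the ELF_SECTION_MAPS section of the object file as a
 * plain array of struct bpf_elf_map. In the restricted C source this would
 * be declared roughly as (illustrative sketch, assuming the usual section
 * attribute helper):
 *
 *   struct bpf_elf_map __attribute__((section("maps"), used)) map_foo = {
 *           .type       = BPF_MAP_TYPE_HASH,
 *           .size_key   = sizeof(uint32_t),
 *           .size_value = sizeof(uint32_t),
 *           .max_elem   = 256,
 *           .pinning    = PIN_GLOBAL_NS,
 *   };
 *
 * Each map also needs a global symbol so that bpf_map_fetch_name() can
 * resolve its name from the symbol table.
 */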
1185static int bpf_fetch_maps(struct bpf_elf_ctx *ctx, int section,
1186			  struct bpf_elf_sec_data *data)
1187{
1188	if (data->sec_data->d_size % sizeof(struct bpf_elf_map) != 0)
1189		return -EINVAL;
1190
1191	ctx->map_num = data->sec_data->d_size / sizeof(struct bpf_elf_map);
1192	ctx->sec_maps = section;
1193	ctx->sec_done[section] = true;
1194
1195	if (ctx->map_num > ARRAY_SIZE(ctx->map_fds)) {
1196		fprintf(stderr, "Too many BPF maps in ELF section!\n");
1197		return -ENOMEM;
1198	}
1199
1200	memcpy(ctx->maps, data->sec_data->d_buf, data->sec_data->d_size);
1201	return 0;
1202}
1203
1204static int bpf_fetch_license(struct bpf_elf_ctx *ctx, int section,
1205			     struct bpf_elf_sec_data *data)
1206{
1207	if (data->sec_data->d_size > sizeof(ctx->license))
1208		return -ENOMEM;
1209
1210	memcpy(ctx->license, data->sec_data->d_buf, data->sec_data->d_size);
1211	ctx->sec_done[section] = true;
1212	return 0;
1213}
1214
1215static int bpf_fetch_symtab(struct bpf_elf_ctx *ctx, int section,
1216			    struct bpf_elf_sec_data *data)
1217{
1218	ctx->sym_tab = data->sec_data;
1219	ctx->sym_num = data->sec_hdr.sh_size / data->sec_hdr.sh_entsize;
1220	ctx->sec_done[section] = true;
1221	return 0;
1222}
1223
1224static int bpf_fetch_strtab(struct bpf_elf_ctx *ctx, int section,
1225			    struct bpf_elf_sec_data *data)
1226{
1227	ctx->str_tab = data->sec_data;
1228	ctx->sec_done[section] = true;
1229	return 0;
1230}
1231
1232static int bpf_fetch_ancillary(struct bpf_elf_ctx *ctx)
1233{
1234	struct bpf_elf_sec_data data;
1235	int i, ret = -1;
1236
1237	for (i = 1; i < ctx->elf_hdr.e_shnum; i++) {
1238		ret = bpf_fill_section_data(ctx, i, &data);
1239		if (ret < 0)
1240			continue;
1241
1242		if (data.sec_hdr.sh_type == SHT_PROGBITS &&
1243		    !strcmp(data.sec_name, ELF_SECTION_MAPS))
1244			ret = bpf_fetch_maps(ctx, i, &data);
1245		else if (data.sec_hdr.sh_type == SHT_PROGBITS &&
1246			 !strcmp(data.sec_name, ELF_SECTION_LICENSE))
1247			ret = bpf_fetch_license(ctx, i, &data);
1248		else if (data.sec_hdr.sh_type == SHT_SYMTAB &&
1249			 !strcmp(data.sec_name, ".symtab"))
1250			ret = bpf_fetch_symtab(ctx, i, &data);
1251		else if (data.sec_hdr.sh_type == SHT_STRTAB &&
1252			 !strcmp(data.sec_name, ".strtab"))
1253			ret = bpf_fetch_strtab(ctx, i, &data);
1254		if (ret < 0) {
1255			fprintf(stderr, "Error parsing section %d! Perhaps "
1256				"check with readelf -a?\n", i);
1257			break;
1258		}
1259	}
1260
1261	if (ctx->sym_tab && ctx->str_tab && ctx->sec_maps) {
1262		ret = bpf_maps_attach_all(ctx);
1263		if (ret < 0) {
1264			fprintf(stderr, "Error loading maps into kernel!\n");
1265			return ret;
1266		}
1267	}
1268
1269	return ret;
1270}
1271
1272static int bpf_fetch_prog(struct bpf_elf_ctx *ctx, const char *section)
1273{
1274	struct bpf_elf_sec_data data;
1275	struct bpf_elf_prog prog;
1276	int ret, i, fd = -1;
1277
1278	for (i = 1; i < ctx->elf_hdr.e_shnum; i++) {
1279		if (ctx->sec_done[i])
1280			continue;
1281
1282		ret = bpf_fill_section_data(ctx, i, &data);
1283		if (ret < 0 ||
1284		    !(data.sec_hdr.sh_type == SHT_PROGBITS &&
1285		      data.sec_hdr.sh_flags & SHF_EXECINSTR &&
1286		      !strcmp(data.sec_name, section)))
1287			continue;
1288
1289		memset(&prog, 0, sizeof(prog));
1290		prog.type    = ctx->type;
1291		prog.insns   = data.sec_data->d_buf;
1292		prog.size    = data.sec_data->d_size;
1293		prog.license = ctx->license;
1294
1295		fd = bpf_prog_attach(section, &prog, ctx->verbose);
1296		if (fd < 0)
1297			continue;
1298
1299		ctx->sec_done[i] = true;
1300		break;
1301	}
1302
1303	return fd;
1304}
1305
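/* Resolve map references in a program section: every SHT_REL entry is
 * expected to point at a BPF_LD | BPF_IMM | BPF_DW (ld_imm64) instruction
 * whose symbol indexes into the maps section. The instruction is rewritten
 * in place to src_reg = BPF_PSEUDO_MAP_FD and imm = <fd of that map>, which
 * is how the kernel identifies map references at program load time.
 */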
1306static int bpf_apply_relo_data(struct bpf_elf_ctx *ctx,
1307			       struct bpf_elf_sec_data *data_relo,
1308			       struct bpf_elf_sec_data *data_insn)
1309{
1310	Elf_Data *idata = data_insn->sec_data;
1311	GElf_Shdr *rhdr = &data_relo->sec_hdr;
1312	int relo_ent, relo_num = rhdr->sh_size / rhdr->sh_entsize;
1313	struct bpf_insn *insns = idata->d_buf;
1314	unsigned int num_insns = idata->d_size / sizeof(*insns);
1315
1316	for (relo_ent = 0; relo_ent < relo_num; relo_ent++) {
1317		unsigned int ioff, rmap;
1318		GElf_Rel relo;
1319		GElf_Sym sym;
1320
1321		if (gelf_getrel(data_relo->sec_data, relo_ent, &relo) != &relo)
1322			return -EIO;
1323
1324		ioff = relo.r_offset / sizeof(struct bpf_insn);
1325		if (ioff >= num_insns ||
1326		    insns[ioff].code != (BPF_LD | BPF_IMM | BPF_DW))
1327			return -EINVAL;
1328
1329		if (gelf_getsym(ctx->sym_tab, GELF_R_SYM(relo.r_info), &sym) != &sym)
1330			return -EIO;
1331
1332		rmap = sym.st_value / sizeof(struct bpf_elf_map);
1333		if (rmap >= ARRAY_SIZE(ctx->map_fds))
1334			return -EINVAL;
1335		if (!ctx->map_fds[rmap])
1336			return -EINVAL;
1337
1338		if (ctx->verbose)
1339			fprintf(stderr, "Map \'%s\' (%d) injected into prog "
1340				"section \'%s\' at offset %u!\n",
1341				bpf_str_tab_name(ctx, &sym), ctx->map_fds[rmap],
1342				data_insn->sec_name, ioff);
1343
1344		insns[ioff].src_reg = BPF_PSEUDO_MAP_FD;
1345		insns[ioff].imm     = ctx->map_fds[rmap];
1346	}
1347
1348	return 0;
1349}
1350
1351static int bpf_fetch_prog_relo(struct bpf_elf_ctx *ctx, const char *section)
1352{
1353	struct bpf_elf_sec_data data_relo, data_insn;
1354	struct bpf_elf_prog prog;
1355	int ret, idx, i, fd = -1;
1356
1357	for (i = 1; i < ctx->elf_hdr.e_shnum; i++) {
1358		ret = bpf_fill_section_data(ctx, i, &data_relo);
1359		if (ret < 0 || data_relo.sec_hdr.sh_type != SHT_REL)
1360			continue;
1361
1362		idx = data_relo.sec_hdr.sh_info;
1363		ret = bpf_fill_section_data(ctx, idx, &data_insn);
1364		if (ret < 0 ||
1365		    !(data_insn.sec_hdr.sh_type == SHT_PROGBITS &&
1366		      data_insn.sec_hdr.sh_flags & SHF_EXECINSTR &&
1367		      !strcmp(data_insn.sec_name, section)))
1368			continue;
1369
1370		ret = bpf_apply_relo_data(ctx, &data_relo, &data_insn);
1371		if (ret < 0)
1372			continue;
1373
1374		memset(&prog, 0, sizeof(prog));
1375		prog.type    = ctx->type;
1376		prog.insns   = data_insn.sec_data->d_buf;
1377		prog.size    = data_insn.sec_data->d_size;
1378		prog.license = ctx->license;
1379
1380		fd = bpf_prog_attach(section, &prog, ctx->verbose);
1381		if (fd < 0)
1382			continue;
1383
1384		ctx->sec_done[i]   = true;
1385		ctx->sec_done[idx] = true;
1386		break;
1387	}
1388
1389	return fd;
1390}
1391
1392static int bpf_fetch_prog_sec(struct bpf_elf_ctx *ctx, const char *section)
1393{
1394	int ret = -1;
1395
1396	if (ctx->sym_tab)
1397		ret = bpf_fetch_prog_relo(ctx, section);
1398	if (ret < 0)
1399		ret = bpf_fetch_prog(ctx, section);
1400
1401	return ret;
1402}
1403
1404static int bpf_find_map_by_id(struct bpf_elf_ctx *ctx, uint32_t id)
1405{
1406	int i;
1407
1408	for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++)
1409		if (ctx->map_fds[i] && ctx->maps[i].id == id &&
1410		    ctx->maps[i].type == BPF_MAP_TYPE_PROG_ARRAY)
1411			return i;
1412	return -1;
1413}
1414
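/* Wire up tail calls: any remaining section whose name parses as
 * "<map id>/<key>" (e.g. a section called "2/3", illustrative) is loaded as
 * its own program and its fd is stored at <key> in the prog array map that
 * was declared with id <map id> in the maps section.
 */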
1415static int bpf_fill_prog_arrays(struct bpf_elf_ctx *ctx)
1416{
1417	struct bpf_elf_sec_data data;
1418	uint32_t map_id, key_id;
1419	int fd, i, ret, idx;
1420
1421	for (i = 1; i < ctx->elf_hdr.e_shnum; i++) {
1422		if (ctx->sec_done[i])
1423			continue;
1424
1425		ret = bpf_fill_section_data(ctx, i, &data);
1426		if (ret < 0)
1427			continue;
1428
1429		ret = sscanf(data.sec_name, "%i/%i", &map_id, &key_id);
1430		if (ret != 2)
1431			continue;
1432
1433		idx = bpf_find_map_by_id(ctx, map_id);
1434		if (idx < 0)
1435			continue;
1436
1437		fd = bpf_fetch_prog_sec(ctx, data.sec_name);
1438		if (fd < 0)
1439			return -EIO;
1440
1441		ret = bpf_map_update(ctx->map_fds[idx], &key_id,
1442				     &fd, BPF_ANY);
1443		if (ret < 0)
1444			return -ENOENT;
1445
1446		ctx->sec_done[i] = true;
1447	}
1448
1449	return 0;
1450}
1451
1452static void bpf_save_finfo(struct bpf_elf_ctx *ctx)
1453{
1454	struct stat st;
1455	int ret;
1456
1457	memset(&ctx->stat, 0, sizeof(ctx->stat));
1458
1459	ret = fstat(ctx->obj_fd, &st);
1460	if (ret < 0) {
1461		fprintf(stderr, "Stat of elf file failed: %s\n",
1462			strerror(errno));
1463		return;
1464	}
1465
1466	ctx->stat.st_dev = st.st_dev;
1467	ctx->stat.st_ino = st.st_ino;
1468}
1469
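/* Custom pinning ids are read from a small text database (see
 * bpf_hash_init() below, which loads CONFDIR "/bpf_pinning"). Each line maps
 * a numeric id to a subpath relative to the bpf fs mount, '#' starts a
 * comment, e.g. (illustrative):
 *
 *   # id  subpath
 *   20    foo/bar
 *
 * The ids used by PIN_NONE, PIN_OBJECT_NS and PIN_GLOBAL_NS are reserved
 * and rejected by bpf_hash_init().
 */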
1470static int bpf_read_pin_mapping(FILE *fp, uint32_t *id, char *path)
1471{
1472	char buff[PATH_MAX];
1473
1474	while (fgets(buff, sizeof(buff), fp)) {
1475		char *ptr = buff;
1476
1477		while (*ptr == ' ' || *ptr == '\t')
1478			ptr++;
1479
1480		if (*ptr == '#' || *ptr == '\n' || *ptr == 0)
1481			continue;
1482
1483		if (sscanf(ptr, "%i %s\n", id, path) != 2 &&
1484		    sscanf(ptr, "%i %s #", id, path) != 2) {
1485			strcpy(path, ptr);
1486			return -1;
1487		}
1488
1489		return 1;
1490	}
1491
1492	return 0;
1493}
1494
1495static bool bpf_pinning_reserved(uint32_t pinning)
1496{
1497	switch (pinning) {
1498	case PIN_NONE:
1499	case PIN_OBJECT_NS:
1500	case PIN_GLOBAL_NS:
1501		return true;
1502	default:
1503		return false;
1504	}
1505}
1506
1507static void bpf_hash_init(struct bpf_elf_ctx *ctx, const char *db_file)
1508{
1509	struct bpf_hash_entry *entry;
1510	char subpath[PATH_MAX];
1511	uint32_t pinning;
1512	FILE *fp;
1513	int ret;
1514
1515	fp = fopen(db_file, "r");
1516	if (!fp)
1517		return;
1518
1519	memset(subpath, 0, sizeof(subpath));
1520	while ((ret = bpf_read_pin_mapping(fp, &pinning, subpath))) {
1521		if (ret == -1) {
1522			fprintf(stderr, "Database %s is corrupted at: %s\n",
1523				db_file, subpath);
1524			fclose(fp);
1525			return;
1526		}
1527
1528		if (bpf_pinning_reserved(pinning)) {
1529			fprintf(stderr, "Database %s, id %u is reserved - "
1530				"ignoring!\n", db_file, pinning);
1531			continue;
1532		}
1533
1534		entry = malloc(sizeof(*entry));
1535		if (!entry) {
1536			fprintf(stderr, "No memory left for db entry!\n");
1537			continue;
1538		}
1539
1540		entry->pinning = pinning;
1541		entry->subpath = strdup(subpath);
1542		if (!entry->subpath) {
1543			fprintf(stderr, "No memory left for db entry!\n");
1544			free(entry);
1545			continue;
1546		}
1547
1548		entry->next = ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)];
1549		ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)] = entry;
1550	}
1551
1552	fclose(fp);
1553}
1554
1555static void bpf_hash_destroy(struct bpf_elf_ctx *ctx)
1556{
1557	struct bpf_hash_entry *entry;
1558	int i;
1559
1560	for (i = 0; i < ARRAY_SIZE(ctx->ht); i++) {
1561		while ((entry = ctx->ht[i]) != NULL) {
1562			ctx->ht[i] = entry->next;
1563			free((char *)entry->subpath);
1564			free(entry);
1565		}
1566	}
1567}
1568
1569static int bpf_elf_check_ehdr(const struct bpf_elf_ctx *ctx)
1570{
1571	if (ctx->elf_hdr.e_type != ET_REL ||
1572	    ctx->elf_hdr.e_machine != 0 ||
1573	    ctx->elf_hdr.e_version != EV_CURRENT) {
1574		fprintf(stderr, "ELF format error, ELF file not for eBPF?\n");
1575		return -EINVAL;
1576	}
1577
1578	switch (ctx->elf_hdr.e_ident[EI_DATA]) {
1579	default:
1580		fprintf(stderr, "ELF format error, wrong endianness info?\n");
1581		return -EINVAL;
1582	case ELFDATA2LSB:
1583		if (htons(1) == 1) {
1584			fprintf(stderr,
1585				"We are big endian, eBPF object is little endian!\n");
1586			return -EIO;
1587		}
1588		break;
1589	case ELFDATA2MSB:
1590		if (htons(1) != 1) {
1591			fprintf(stderr,
1592				"We are little endian, eBPF object is big endian!\n");
1593			return -EIO;
1594		}
1595		break;
1596	}
1597
1598	return 0;
1599}
1600
1601static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname,
1602			    enum bpf_prog_type type, bool verbose)
1603{
1604	int ret = -EINVAL;
1605
1606	if (elf_version(EV_CURRENT) == EV_NONE ||
1607	    bpf_init_env(pathname))
1608		return ret;
1609
1610	memset(ctx, 0, sizeof(*ctx));
1611	ctx->verbose = verbose;
1612	ctx->type    = type;
1613
1614	ctx->obj_fd = open(pathname, O_RDONLY);
1615	if (ctx->obj_fd < 0)
1616		return ctx->obj_fd;
1617
1618	ctx->elf_fd = elf_begin(ctx->obj_fd, ELF_C_READ, NULL);
1619	if (!ctx->elf_fd) {
1620		ret = -EINVAL;
1621		goto out_fd;
1622	}
1623
1624	if (elf_kind(ctx->elf_fd) != ELF_K_ELF) {
1625		ret = -EINVAL;
1626		goto out_fd;
1627	}
1628
1629	if (gelf_getehdr(ctx->elf_fd, &ctx->elf_hdr) !=
1630	    &ctx->elf_hdr) {
1631		ret = -EIO;
1632		goto out_elf;
1633	}
1634
1635	ret = bpf_elf_check_ehdr(ctx);
1636	if (ret < 0)
1637		goto out_elf;
1638
1639	ctx->sec_done = calloc(ctx->elf_hdr.e_shnum,
1640			       sizeof(*(ctx->sec_done)));
1641	if (!ctx->sec_done) {
1642		ret = -ENOMEM;
1643		goto out_elf;
1644	}
1645
1646	bpf_save_finfo(ctx);
1647	bpf_hash_init(ctx, CONFDIR "/bpf_pinning");
1648
1649	return 0;
1650out_elf:
1651	elf_end(ctx->elf_fd);
1652out_fd:
1653	close(ctx->obj_fd);
1654	return ret;
1655}
1656
1657static int bpf_maps_count(struct bpf_elf_ctx *ctx)
1658{
1659	int i, count = 0;
1660
1661	for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++) {
1662		if (!ctx->map_fds[i])
1663			break;
1664		count++;
1665	}
1666
1667	return count;
1668}
1669
1670static void bpf_maps_teardown(struct bpf_elf_ctx *ctx)
1671{
1672	int i;
1673
1674	for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++) {
1675		if (ctx->map_fds[i])
1676			close(ctx->map_fds[i]);
1677	}
1678}
1679
1680static void bpf_elf_ctx_destroy(struct bpf_elf_ctx *ctx, bool failure)
1681{
1682	if (failure)
1683		bpf_maps_teardown(ctx);
1684
1685	bpf_hash_destroy(ctx);
1686	free(ctx->sec_done);
1687	elf_end(ctx->elf_fd);
1688	close(ctx->obj_fd);
1689}
1690
1691static struct bpf_elf_ctx __ctx;
1692
1693static int bpf_obj_open(const char *pathname, enum bpf_prog_type type,
1694			const char *section, bool verbose)
1695{
1696	struct bpf_elf_ctx *ctx = &__ctx;
1697	int fd = 0, ret;
1698
1699	ret = bpf_elf_ctx_init(ctx, pathname, type, verbose);
1700	if (ret < 0) {
1701		fprintf(stderr, "Cannot initialize ELF context!\n");
1702		return ret;
1703	}
1704
1705	ret = bpf_fetch_ancillary(ctx);
1706	if (ret < 0) {
1707		fprintf(stderr, "Error fetching ELF ancillary data!\n");
1708		goto out;
1709	}
1710
1711	fd = bpf_fetch_prog_sec(ctx, section);
1712	if (fd < 0) {
1713		fprintf(stderr, "Error fetching program/map!\n");
1714		ret = fd;
1715		goto out;
1716	}
1717
1718	ret = bpf_fill_prog_arrays(ctx);
1719	if (ret < 0)
1720		fprintf(stderr, "Error filling program arrays!\n");
1721out:
1722	bpf_elf_ctx_destroy(ctx, ret < 0);
1723	if (ret < 0) {
1724		if (fd)
1725			close(fd);
1726		return ret;
1727	}
1728
1729	return fd;
1730}
1731
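/* Hand the fds of all loaded maps to an external process over the unix
 * domain socket given via "export"/BPF_ENV_UDS, using SCM_RIGHTS ancillary
 * data. Transfers are chunked into at most BPF_SCM_MAX_FDS descriptors per
 * sendmsg(), with the object name, stat info and map descriptions carried
 * alongside so the receiver can match fds and maps up again.
 */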
1732static int
1733bpf_map_set_send(int fd, struct sockaddr_un *addr, unsigned int addr_len,
1734		 const struct bpf_map_data *aux, unsigned int entries)
1735{
1736	struct bpf_map_set_msg msg;
1737	int *cmsg_buf, min_fd;
1738	char *amsg_buf;
1739	int i;
1740
1741	memset(&msg, 0, sizeof(msg));
1742
1743	msg.aux.uds_ver = BPF_SCM_AUX_VER;
1744	msg.aux.num_ent = entries;
1745
1746	strncpy(msg.aux.obj_name, aux->obj, sizeof(msg.aux.obj_name));
1747	memcpy(&msg.aux.obj_st, aux->st, sizeof(msg.aux.obj_st));
1748
1749	cmsg_buf = bpf_map_set_init(&msg, addr, addr_len);
1750	amsg_buf = (char *)msg.aux.ent;
1751
1752	for (i = 0; i < entries; i += min_fd) {
1753		int ret;
1754
1755		min_fd = min(BPF_SCM_MAX_FDS * 1U, entries - i);
1756		bpf_map_set_init_single(&msg, min_fd);
1757
1758		memcpy(cmsg_buf, &aux->fds[i], sizeof(aux->fds[0]) * min_fd);
1759		memcpy(amsg_buf, &aux->ent[i], sizeof(aux->ent[0]) * min_fd);
1760
1761		ret = sendmsg(fd, &msg.hdr, 0);
1762		if (ret <= 0)
1763			return ret ? : -1;
1764	}
1765
1766	return 0;
1767}
1768
1769static int
1770bpf_map_set_recv(int fd, int *fds,  struct bpf_map_aux *aux,
1771		 unsigned int entries)
1772{
1773	struct bpf_map_set_msg msg;
1774	int *cmsg_buf, min_fd;
1775	char *amsg_buf, *mmsg_buf;
1776	unsigned int needed = 1;
1777	int i;
1778
1779	cmsg_buf = bpf_map_set_init(&msg, NULL, 0);
1780	amsg_buf = (char *)msg.aux.ent;
1781	mmsg_buf = (char *)&msg.aux;
1782
1783	for (i = 0; i < min(entries, needed); i += min_fd) {
1784		struct cmsghdr *cmsg;
1785		int ret;
1786
1787		min_fd = min(entries, entries - i);
1788		bpf_map_set_init_single(&msg, min_fd);
1789
1790		ret = recvmsg(fd, &msg.hdr, 0);
1791		if (ret <= 0)
1792			return ret ? : -1;
1793
1794		cmsg = CMSG_FIRSTHDR(&msg.hdr);
1795		if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
1796			return -EINVAL;
1797		if (msg.hdr.msg_flags & MSG_CTRUNC)
1798			return -EIO;
1799		if (msg.aux.uds_ver != BPF_SCM_AUX_VER)
1800			return -ENOSYS;
1801
1802		min_fd = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof(fd);
1803		if (min_fd > entries || min_fd <= 0)
1804			return -EINVAL;
1805
1806		memcpy(&fds[i], cmsg_buf, sizeof(fds[0]) * min_fd);
1807		memcpy(&aux->ent[i], amsg_buf, sizeof(aux->ent[0]) * min_fd);
1808		memcpy(aux, mmsg_buf, offsetof(struct bpf_map_aux, ent));
1809
1810		needed = aux->num_ent;
1811	}
1812
1813	return 0;
1814}
1815
1816int bpf_send_map_fds(const char *path, const char *obj)
1817{
1818	struct bpf_elf_ctx *ctx = &__ctx;
1819	struct sockaddr_un addr;
1820	struct bpf_map_data bpf_aux;
1821	int fd, ret;
1822
1823	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
1824	if (fd < 0) {
1825		fprintf(stderr, "Cannot open socket: %s\n",
1826			strerror(errno));
1827		return -1;
1828	}
1829
1830	memset(&addr, 0, sizeof(addr));
1831	addr.sun_family = AF_UNIX;
1832	strncpy(addr.sun_path, path, sizeof(addr.sun_path));
1833
1834	ret = connect(fd, (struct sockaddr *)&addr, sizeof(addr));
1835	if (ret < 0) {
1836		fprintf(stderr, "Cannot connect to %s: %s\n",
1837			path, strerror(errno));
1838		return -1;
1839	}
1840
1841	memset(&bpf_aux, 0, sizeof(bpf_aux));
1842
1843	bpf_aux.fds = ctx->map_fds;
1844	bpf_aux.ent = ctx->maps;
1845	bpf_aux.st  = &ctx->stat;
1846	bpf_aux.obj = obj;
1847
1848	ret = bpf_map_set_send(fd, &addr, sizeof(addr), &bpf_aux,
1849			       bpf_maps_count(ctx));
1850	if (ret < 0)
1851		fprintf(stderr, "Cannot send fds to %s: %s\n",
1852			path, strerror(errno));
1853
1854	bpf_maps_teardown(ctx);
1855	close(fd);
1856	return ret;
1857}
1858
1859int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux,
1860		     unsigned int entries)
1861{
1862	struct sockaddr_un addr;
1863	int fd, ret;
1864
1865	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
1866	if (fd < 0) {
1867		fprintf(stderr, "Cannot open socket: %s\n",
1868			strerror(errno));
1869		return -1;
1870	}
1871
1872	memset(&addr, 0, sizeof(addr));
1873	addr.sun_family = AF_UNIX;
1874	strncpy(addr.sun_path, path, sizeof(addr.sun_path));
1875
1876	ret = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
1877	if (ret < 0) {
1878		fprintf(stderr, "Cannot bind to socket: %s\n",
1879			strerror(errno));
1880		return -1;
1881	}
1882
1883	ret = bpf_map_set_recv(fd, fds, aux, entries);
1884	if (ret < 0)
1885		fprintf(stderr, "Cannot recv fds from %s: %s\n",
1886			path, strerror(errno));
1887
1888	unlink(addr.sun_path);
1889	close(fd);
1890	return ret;
1891}
1892#endif /* HAVE_ELF */
1893