/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02110-1301, USA.
 */

#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "backref.h"
#include "ulist.h"
#include "transaction.h"
#include "delayed-ref.h"
#include "locking.h"

/* An arbitrary sentinel value used to signal that a shared backref was found */
#define BACKREF_FOUND_SHARED 6

struct extent_inode_elem {
	u64 inum;
	u64 offset;
	struct extent_inode_elem *next;
};

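/*
 * helper for find_extent_in_eb() and add_all_parents(): checks whether the
 * file extent item @fi covers @extent_item_pos and, if so, prepends a new
 * extent_inode_elem to *@eie.
 *
 * returns 0 if an element was added, 1 if @extent_item_pos lies outside the
 * referenced range (nothing is added), <0 on allocation failure. For
 * compressed, encrypted or otherwise encoded extents, the whole extent is
 * treated as matching.
 */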
static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
				struct btrfs_file_extent_item *fi,
				u64 extent_item_pos,
				struct extent_inode_elem **eie)
{
	u64 offset = 0;
	struct extent_inode_elem *e;

	if (!btrfs_file_extent_compression(eb, fi) &&
	    !btrfs_file_extent_encryption(eb, fi) &&
	    !btrfs_file_extent_other_encoding(eb, fi)) {
		u64 data_offset;
		u64 data_len;

		data_offset = btrfs_file_extent_offset(eb, fi);
		data_len = btrfs_file_extent_num_bytes(eb, fi);

		if (extent_item_pos < data_offset ||
		    extent_item_pos >= data_offset + data_len)
			return 1;
		offset = extent_item_pos - data_offset;
	}

	e = kmalloc(sizeof(*e), GFP_NOFS);
	if (!e)
		return -ENOMEM;

	e->next = *eie;
	e->inum = key->objectid;
	e->offset = key->offset + offset;
	*eie = e;

	return 0;
}

static void free_inode_elem_list(struct extent_inode_elem *eie)
{
	struct extent_inode_elem *eie_next;

	for (; eie; eie = eie_next) {
		eie_next = eie->next;
		kfree(eie);
	}
}

static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
				u64 extent_item_pos,
				struct extent_inode_elem **eie)
{
	u64 disk_byte;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	int slot;
	int nritems;
	int extent_type;
	int ret;

	/*
	 * from the shared data ref, we only have the leaf but not the key.
	 * thus, we must look at all items and check whether we find one
	 * (or more) with a reference to our extent item.
	 */
	nritems = btrfs_header_nritems(eb);
	for (slot = 0; slot < nritems; ++slot) {
		btrfs_item_key_to_cpu(eb, &key, slot);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(eb, fi);
		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
		if (disk_byte != wanted_disk_byte)
			continue;

		ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * this structure records all encountered refs on the way up to the root
 */
struct __prelim_ref {
	struct list_head list;
	u64 root_id;
	struct btrfs_key key_for_search;
	int level;
	int count;
	struct extent_inode_elem *inode_list;
	u64 parent;
	u64 wanted_disk_byte;
};
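
/*
 * A ref with parent == 0 is an indirect reference that still needs resolving;
 * __resolve_indirect_refs() turns it into a direct one by filling in the
 * logical address of the parent block.
 */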

static struct kmem_cache *btrfs_prelim_ref_cache;

int __init btrfs_prelim_ref_init(void)
{
	btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
					sizeof(struct __prelim_ref),
					0,
					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_prelim_ref_cache)
		return -ENOMEM;
	return 0;
}

void btrfs_prelim_ref_exit(void)
{
	if (btrfs_prelim_ref_cache)
		kmem_cache_destroy(btrfs_prelim_ref_cache);
}

/*
 * the rules for all callers of this function are:
 * - obtaining the parent is the goal
 * - if you add a key, you must know that it is a correct key
 * - if you cannot add the parent or a correct key, then we will look into the
 *   block later to set a correct key
 *
 * delayed refs
 * ============
 *        backref type | shared | indirect | shared | indirect
 * information         |   tree |     tree |   data |     data
 * --------------------+--------+----------+--------+----------
 *      parent logical |    y   |     -    |    -   |     -
 *      key to resolve |    -   |     y    |    y   |     y
 *  tree block logical |    -   |     -    |    -   |     -
 *  root for resolving |    y   |     y    |    y   |     y
 *
 * - column 1:       we have the parent -> done
 * - column 2, 3, 4: we use the key to find the parent
 *
 * on disk refs (inline or keyed)
 * ==============================
 *        backref type | shared | indirect | shared | indirect
 * information         |   tree |     tree |   data |     data
 * --------------------+--------+----------+--------+----------
 *      parent logical |    y   |     -    |    y   |     -
 *      key to resolve |    -   |     -    |    -   |     y
 *  tree block logical |    y   |     y    |    y   |     y
 *  root for resolving |    -   |     y    |    y   |     y
 *
 * - column 1, 3: we have the parent -> done
 * - column 2:    we take the first key from the block to find the parent
 *                (see __add_missing_keys)
 * - column 4:    we use the key to find the parent
 *
 * additional information that's available but not required to find the parent
 * block might help in merging entries to gain some speed.
 */
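
/*
 * Illustrative example for the tables above: an inline
 * BTRFS_TREE_BLOCK_REF_KEY backref (indirect tree, column 2) carries the tree
 * block logical and the root to resolve in, but neither parent nor key;
 * __add_missing_keys() later reads the first key out of the block so the
 * parent can be looked up.
 */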

static int __add_prelim_ref(struct list_head *head, u64 root_id,
			    struct btrfs_key *key, int level,
			    u64 parent, u64 wanted_disk_byte, int count,
			    gfp_t gfp_mask)
{
	struct __prelim_ref *ref;

	if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
		return 0;

	ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
	if (!ref)
		return -ENOMEM;

	ref->root_id = root_id;
	if (key)
		ref->key_for_search = *key;
	else
		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));

	ref->inode_list = NULL;
	ref->level = level;
	ref->count = count;
	ref->parent = parent;
	ref->wanted_disk_byte = wanted_disk_byte;
	list_add_tail(&ref->list, head);

	return 0;
}

static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
			   struct ulist *parents, struct __prelim_ref *ref,
			   int level, u64 time_seq, const u64 *extent_item_pos,
			   u64 total_refs)
{
	int ret = 0;
	int slot;
	struct extent_buffer *eb;
	struct btrfs_key key;
	struct btrfs_key *key_for_search = &ref->key_for_search;
	struct btrfs_file_extent_item *fi;
	struct extent_inode_elem *eie = NULL, *old = NULL;
	u64 disk_byte;
	u64 wanted_disk_byte = ref->wanted_disk_byte;
	u64 count = 0;

	if (level != 0) {
		eb = path->nodes[level];
		ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
		if (ret < 0)
			return ret;
		return 0;
	}

	/*
	 * We normally enter this function with the path already pointing to
	 * the first item to check. But sometimes, we may enter it with
	 * slot==nritems. In that case, go to the next leaf before we continue.
	 */
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
		ret = btrfs_next_old_leaf(root, path, time_seq);

	while (!ret && count < total_refs) {
		eb = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(eb, &key, slot);

		if (key.objectid != key_for_search->objectid ||
		    key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);

		if (disk_byte == wanted_disk_byte) {
			eie = NULL;
			old = NULL;
			count++;
			if (extent_item_pos) {
				ret = check_extent_in_eb(&key, eb, fi,
						*extent_item_pos,
						&eie);
				if (ret < 0)
					break;
			}
			if (ret > 0)
				goto next;
			ret = ulist_add_merge_ptr(parents, eb->start,
						  eie, (void **)&old, GFP_NOFS);
			if (ret < 0)
				break;
			if (!ret && extent_item_pos) {
				while (old->next)
					old = old->next;
				old->next = eie;
			}
			eie = NULL;
		}
next:
		ret = btrfs_next_old_item(root, path, time_seq);
	}

	if (ret > 0)
		ret = 0;
	else if (ret < 0)
		free_inode_elem_list(eie);
	return ret;
}

/*
 * resolve an indirect backref in the form (root_id, key, level)
 * to a logical address
 */
static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path, u64 time_seq,
				  struct __prelim_ref *ref,
				  struct ulist *parents,
				  const u64 *extent_item_pos, u64 total_refs)
{
	struct btrfs_root *root;
	struct btrfs_key root_key;
	struct extent_buffer *eb;
	int ret = 0;
	int root_level;
	int level = ref->level;
	int index;

	root_key.objectid = ref->root_id;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;

	index = srcu_read_lock(&fs_info->subvol_srcu);

	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, index);
		ret = PTR_ERR(root);
		goto out;
	}

	if (path->search_commit_root)
		root_level = btrfs_header_level(root->commit_root);
	else
		root_level = btrfs_old_root_level(root, time_seq);

	if (root_level + 1 == level) {
		srcu_read_unlock(&fs_info->subvol_srcu, index);
		goto out;
	}

	path->lowest_level = level;
	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);

	/* root node has been locked, we can release @subvol_srcu safely here */
	srcu_read_unlock(&fs_info->subvol_srcu, index);

	pr_debug("search slot in root %llu (level %d, ref count %d) returned "
		 "%d for key (%llu %u %llu)\n",
		 ref->root_id, level, ref->count, ret,
		 ref->key_for_search.objectid, ref->key_for_search.type,
		 ref->key_for_search.offset);
	if (ret < 0)
		goto out;

	eb = path->nodes[level];
	while (!eb) {
		if (WARN_ON(!level)) {
			ret = 1;
			goto out;
		}
		level--;
		eb = path->nodes[level];
	}

	ret = add_all_parents(root, path, parents, ref, level, time_seq,
			      extent_item_pos, total_refs);
out:
	path->lowest_level = 0;
	btrfs_release_path(path);
	return ret;
}

/*
 * resolve all indirect backrefs from the list
 */
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
				   struct btrfs_path *path, u64 time_seq,
				   struct list_head *head,
				   const u64 *extent_item_pos, u64 total_refs,
				   u64 root_objectid)
{
	int err;
	int ret = 0;
	struct __prelim_ref *ref;
	struct __prelim_ref *ref_safe;
	struct __prelim_ref *new_ref;
	struct ulist *parents;
	struct ulist_node *node;
	struct ulist_iterator uiter;

	parents = ulist_alloc(GFP_NOFS);
	if (!parents)
		return -ENOMEM;

	/*
	 * _safe allows us to insert directly after the current item without
	 * iterating over the newly inserted items.
	 * we're also allowed to re-assign ref during iteration.
	 */
	list_for_each_entry_safe(ref, ref_safe, head, list) {
		if (ref->parent)	/* already direct */
			continue;
		if (ref->count == 0)
			continue;
		if (root_objectid && ref->root_id != root_objectid) {
			ret = BACKREF_FOUND_SHARED;
			goto out;
		}
		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
					     parents, extent_item_pos,
					     total_refs);
		/*
		 * we can only tolerate ENOENT; any other error must be
		 * caught and returned directly.
		 */
		if (err == -ENOENT) {
			continue;
		} else if (err) {
			ret = err;
			goto out;
		}

		/* we put the first parent into the ref at hand */
		ULIST_ITER_INIT(&uiter);
		node = ulist_next(parents, &uiter);
		ref->parent = node ? node->val : 0;
		ref->inode_list = node ?
			(struct extent_inode_elem *)(uintptr_t)node->aux : NULL;

		/* additional parents require new refs being added here */
		while ((node = ulist_next(parents, &uiter))) {
			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
						   GFP_NOFS);
			if (!new_ref) {
				ret = -ENOMEM;
				goto out;
			}
			memcpy(new_ref, ref, sizeof(*ref));
			new_ref->parent = node->val;
			new_ref->inode_list = (struct extent_inode_elem *)
							(uintptr_t)node->aux;
			list_add(&new_ref->list, &ref->list);
		}
		ulist_reinit(parents);
	}
out:
	ulist_free(parents);
	return ret;
}

static inline int ref_for_same_block(struct __prelim_ref *ref1,
				     struct __prelim_ref *ref2)
{
	if (ref1->level != ref2->level)
		return 0;
	if (ref1->root_id != ref2->root_id)
		return 0;
	if (ref1->key_for_search.type != ref2->key_for_search.type)
		return 0;
	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
		return 0;
	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
		return 0;
	if (ref1->parent != ref2->parent)
		return 0;

	return 1;
}

/*
 * read tree blocks and add keys where required.
 */
static int __add_missing_keys(struct btrfs_fs_info *fs_info,
			      struct list_head *head)
{
	struct list_head *pos;
	struct extent_buffer *eb;

	list_for_each(pos, head) {
		struct __prelim_ref *ref;
		ref = list_entry(pos, struct __prelim_ref, list);

		if (ref->parent)
			continue;
		if (ref->key_for_search.type)
			continue;
		BUG_ON(!ref->wanted_disk_byte);
		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
				     0);
		if (!eb || !extent_buffer_uptodate(eb)) {
			free_extent_buffer(eb);
			return -EIO;
		}
		btrfs_tree_read_lock(eb);
		if (btrfs_header_level(eb) == 0)
			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
		else
			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return 0;
}

/*
 * merge two lists of backrefs and adjust counts accordingly
 *
 * mode = 1: merge identical keys, if key is set
 *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
 *           additionally, we could even add a key range for the blocks we
 *           looked into to merge even more (-> replace unresolved refs by those
 *           having a parent).
 * mode = 2: merge identical parents
 */
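/*
 * Illustrative example for the two modes: two indirect refs to the same block
 * (identical root, key and level, parents still 0) collapse in mode 1 with
 * their counts added; after resolution, refs that ended up with the same
 * parent block collapse in mode 2 and their inode lists are concatenated.
 */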
static void __merge_refs(struct list_head *head, int mode)
{
	struct list_head *pos1;

	list_for_each(pos1, head) {
		struct list_head *n2;
		struct list_head *pos2;
		struct __prelim_ref *ref1;

		ref1 = list_entry(pos1, struct __prelim_ref, list);

		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
		     pos2 = n2, n2 = pos2->next) {
			struct __prelim_ref *ref2;
			struct __prelim_ref *xchg;
			struct extent_inode_elem *eie;

			ref2 = list_entry(pos2, struct __prelim_ref, list);

			if (mode == 1) {
				if (!ref_for_same_block(ref1, ref2))
					continue;
				if (!ref1->parent && ref2->parent) {
					xchg = ref1;
					ref1 = ref2;
					ref2 = xchg;
				}
			} else {
				if (ref1->parent != ref2->parent)
					continue;
			}

			eie = ref1->inode_list;
			while (eie && eie->next)
				eie = eie->next;
			if (eie)
				eie->next = ref2->inode_list;
			else
				ref1->inode_list = ref2->inode_list;
			ref1->count += ref2->count;

			list_del(&ref2->list);
			kmem_cache_free(btrfs_prelim_ref_cache, ref2);
		}

	}
}

/*
 * add all currently queued delayed refs from this head whose seq nr is
 * smaller than or equal to seq to the list
 */
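/*
 * Note: the loop below runs under head->lock (a spinlock), which is why all
 * __add_prelim_ref() calls in it must use GFP_ATOMIC rather than GFP_NOFS.
 */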
static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
			      struct list_head *prefs, u64 *total_refs,
			      u64 inum)
{
	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
	struct rb_node *n = &head->node.rb_node;
	struct btrfs_key key;
	struct btrfs_key op_key = {0};
	int sgn;
	int ret = 0;

	if (extent_op && extent_op->update_key)
		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);

	spin_lock(&head->lock);
	n = rb_first(&head->ref_root);
	while (n) {
		struct btrfs_delayed_ref_node *node;
		node = rb_entry(n, struct btrfs_delayed_ref_node,
				rb_node);
		n = rb_next(n);
		if (node->seq > seq)
			continue;

		switch (node->action) {
		case BTRFS_ADD_DELAYED_EXTENT:
		case BTRFS_UPDATE_DELAYED_HEAD:
			WARN_ON(1);
			continue;
		case BTRFS_ADD_DELAYED_REF:
			sgn = 1;
			break;
		case BTRFS_DROP_DELAYED_REF:
			sgn = -1;
			break;
		default:
			BUG_ON(1);
		}
		*total_refs += (node->ref_mod * sgn);
		switch (node->type) {
		case BTRFS_TREE_BLOCK_REF_KEY: {
			struct btrfs_delayed_tree_ref *ref;

			ref = btrfs_delayed_node_to_tree_ref(node);
			ret = __add_prelim_ref(prefs, ref->root, &op_key,
					       ref->level + 1, 0, node->bytenr,
					       node->ref_mod * sgn, GFP_ATOMIC);
			break;
		}
		case BTRFS_SHARED_BLOCK_REF_KEY: {
			struct btrfs_delayed_tree_ref *ref;

			ref = btrfs_delayed_node_to_tree_ref(node);
			ret = __add_prelim_ref(prefs, ref->root, NULL,
					       ref->level + 1, ref->parent,
					       node->bytenr,
					       node->ref_mod * sgn, GFP_ATOMIC);
			break;
		}
		case BTRFS_EXTENT_DATA_REF_KEY: {
			struct btrfs_delayed_data_ref *ref;
			ref = btrfs_delayed_node_to_data_ref(node);

			key.objectid = ref->objectid;
			key.type = BTRFS_EXTENT_DATA_KEY;
			key.offset = ref->offset;

			/*
			 * Found an inum that doesn't match our known inum, so
			 * we know the extent is shared.
			 */
			if (inum && ref->objectid != inum) {
				ret = BACKREF_FOUND_SHARED;
				break;
			}

			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
					       node->bytenr,
					       node->ref_mod * sgn, GFP_ATOMIC);
			break;
		}
		case BTRFS_SHARED_DATA_REF_KEY: {
			struct btrfs_delayed_data_ref *ref;

			ref = btrfs_delayed_node_to_data_ref(node);

			key.objectid = ref->objectid;
			key.type = BTRFS_EXTENT_DATA_KEY;
			key.offset = ref->offset;
			ret = __add_prelim_ref(prefs, ref->root, &key, 0,
					       ref->parent, node->bytenr,
					       node->ref_mod * sgn, GFP_ATOMIC);
			break;
		}
		default:
			WARN_ON(1);
		}
		if (ret)
			break;
	}
	spin_unlock(&head->lock);
	return ret;
}

/*
 * add all inline backrefs for bytenr to the list
 */
static int __add_inline_refs(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path, u64 bytenr,
			     int *info_level, struct list_head *prefs,
			     u64 *total_refs, u64 inum)
{
	int ret = 0;
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	unsigned long ptr;
	unsigned long end;
	struct btrfs_extent_item *ei;
	u64 flags;
	u64 item_size;

	/*
	 * enumerate all inline refs
	 */
	leaf = path->nodes[0];
	slot = path->slots[0];

	item_size = btrfs_item_size_nr(leaf, slot);
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);
	*total_refs += btrfs_extent_refs(leaf, ei);
	btrfs_item_key_to_cpu(leaf, &found_key, slot);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		struct btrfs_tree_block_info *info;

		info = (struct btrfs_tree_block_info *)ptr;
		*info_level = btrfs_tree_block_level(leaf, info);
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	} else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
		*info_level = found_key.offset;
	} else {
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
	}

	while (ptr < end) {
		struct btrfs_extent_inline_ref *iref;
		u64 offset;
		int type;

		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		offset = btrfs_extent_inline_ref_offset(leaf, iref);

		switch (type) {
		case BTRFS_SHARED_BLOCK_REF_KEY:
			ret = __add_prelim_ref(prefs, 0, NULL,
						*info_level + 1, offset,
						bytenr, 1, GFP_NOFS);
			break;
		case BTRFS_SHARED_DATA_REF_KEY: {
			struct btrfs_shared_data_ref *sdref;
			int count;

			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
			count = btrfs_shared_data_ref_count(leaf, sdref);
			ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
					       bytenr, count, GFP_NOFS);
			break;
		}
		case BTRFS_TREE_BLOCK_REF_KEY:
			ret = __add_prelim_ref(prefs, offset, NULL,
					       *info_level + 1, 0,
					       bytenr, 1, GFP_NOFS);
			break;
		case BTRFS_EXTENT_DATA_REF_KEY: {
			struct btrfs_extent_data_ref *dref;
			int count;
			u64 root;

			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			count = btrfs_extent_data_ref_count(leaf, dref);
			key.objectid = btrfs_extent_data_ref_objectid(leaf,
								      dref);
			key.type = BTRFS_EXTENT_DATA_KEY;
			key.offset = btrfs_extent_data_ref_offset(leaf, dref);

			if (inum && key.objectid != inum) {
				ret = BACKREF_FOUND_SHARED;
				break;
			}

			root = btrfs_extent_data_ref_root(leaf, dref);
			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
					       bytenr, count, GFP_NOFS);
			break;
		}
		default:
			WARN_ON(1);
		}
		if (ret)
			return ret;
		ptr += btrfs_extent_inline_ref_size(type);
	}

	return 0;
}

/*
 * add all non-inline backrefs for bytenr to the list
 */
static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
			    struct btrfs_path *path, u64 bytenr,
			    int info_level, struct list_head *prefs, u64 inum)
{
	struct btrfs_root *extent_root = fs_info->extent_root;
	int ret;
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	while (1) {
		ret = btrfs_next_item(extent_root, path);
		if (ret < 0)
			break;
		if (ret) {
			ret = 0;
			break;
		}

		slot = path->slots[0];
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, slot);

		if (key.objectid != bytenr)
			break;
		if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
			continue;
		if (key.type > BTRFS_SHARED_DATA_REF_KEY)
			break;

		switch (key.type) {
		case BTRFS_SHARED_BLOCK_REF_KEY:
			ret = __add_prelim_ref(prefs, 0, NULL,
						info_level + 1, key.offset,
						bytenr, 1, GFP_NOFS);
			break;
		case BTRFS_SHARED_DATA_REF_KEY: {
			struct btrfs_shared_data_ref *sdref;
			int count;

			sdref = btrfs_item_ptr(leaf, slot,
					      struct btrfs_shared_data_ref);
			count = btrfs_shared_data_ref_count(leaf, sdref);
			ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
						bytenr, count, GFP_NOFS);
			break;
		}
		case BTRFS_TREE_BLOCK_REF_KEY:
			ret = __add_prelim_ref(prefs, key.offset, NULL,
					       info_level + 1, 0,
					       bytenr, 1, GFP_NOFS);
			break;
		case BTRFS_EXTENT_DATA_REF_KEY: {
			struct btrfs_extent_data_ref *dref;
			int count;
			u64 root;

			dref = btrfs_item_ptr(leaf, slot,
					      struct btrfs_extent_data_ref);
			count = btrfs_extent_data_ref_count(leaf, dref);
			key.objectid = btrfs_extent_data_ref_objectid(leaf,
								      dref);
			key.type = BTRFS_EXTENT_DATA_KEY;
			key.offset = btrfs_extent_data_ref_offset(leaf, dref);

			if (inum && key.objectid != inum) {
				ret = BACKREF_FOUND_SHARED;
				break;
			}

			root = btrfs_extent_data_ref_root(leaf, dref);
			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
					       bytenr, count, GFP_NOFS);
			break;
		}
		default:
			WARN_ON(1);
		}
		if (ret)
			return ret;

	}

	return ret;
}

/*
 * this adds all existing backrefs (inline backrefs, keyed backrefs and
 * delayed refs) for the given bytenr to the refs list, merges duplicates
 * and resolves indirect refs to their parent bytenr.
 * When roots are found, they're added to the roots list
 *
 * FIXME some caching might speed things up
 */
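/*
 * In outline: collect delayed refs for bytenr (when a transaction is given),
 * then the on-disk inline and keyed backrefs; read missing keys for indirect
 * refs; merge identical refs (mode 1); resolve the remaining indirect refs to
 * parent blocks; merge by parent (mode 2); finally move roots and resolved
 * parents into the given ulists.
 */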
static int find_parent_nodes(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 time_seq, struct ulist *refs,
			     struct ulist *roots, const u64 *extent_item_pos,
			     u64 root_objectid, u64 inum)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_delayed_ref_root *delayed_refs = NULL;
	struct btrfs_delayed_ref_head *head;
	int info_level = 0;
	int ret;
	struct list_head prefs_delayed;
	struct list_head prefs;
	struct __prelim_ref *ref;
	struct extent_inode_elem *eie = NULL;
	u64 total_refs = 0;

	INIT_LIST_HEAD(&prefs);
	INIT_LIST_HEAD(&prefs_delayed);

	key.objectid = bytenr;
	key.offset = (u64)-1;
	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	if (!trans) {
		path->search_commit_root = 1;
		path->skip_locking = 1;
	}

	/*
	 * grab both a lock on the path and a lock on the delayed ref head.
	 * We need both to get a consistent picture of how the refs look
	 * at a specified point in time
	 */
again:
	head = NULL;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (trans && likely(trans->type != __TRANS_DUMMY)) {
#else
	if (trans) {
#endif
		/*
		 * look if there are updates for this ref queued and lock the
		 * head
		 */
		delayed_refs = &trans->transaction->delayed_refs;
		spin_lock(&delayed_refs->lock);
		head = btrfs_find_delayed_ref_head(trans, bytenr);
		if (head) {
			if (!mutex_trylock(&head->mutex)) {
				atomic_inc(&head->node.refs);
				spin_unlock(&delayed_refs->lock);

				btrfs_release_path(path);

				/*
				 * Mutex was contended, block until it's
				 * released and try again
				 */
				mutex_lock(&head->mutex);
				mutex_unlock(&head->mutex);
				btrfs_put_delayed_ref(&head->node);
				goto again;
			}
			spin_unlock(&delayed_refs->lock);
			ret = __add_delayed_refs(head, time_seq,
						 &prefs_delayed, &total_refs,
						 inum);
			mutex_unlock(&head->mutex);
			if (ret)
				goto out;
		} else {
			spin_unlock(&delayed_refs->lock);
		}
	}

	if (path->slots[0]) {
		struct extent_buffer *leaf;
		int slot;

		path->slots[0]--;
		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid == bytenr &&
		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
		     key.type == BTRFS_METADATA_ITEM_KEY)) {
			ret = __add_inline_refs(fs_info, path, bytenr,
						&info_level, &prefs,
						&total_refs, inum);
			if (ret)
				goto out;
			ret = __add_keyed_refs(fs_info, path, bytenr,
					       info_level, &prefs, inum);
			if (ret)
				goto out;
		}
	}
	btrfs_release_path(path);

	list_splice_init(&prefs_delayed, &prefs);

	ret = __add_missing_keys(fs_info, &prefs);
	if (ret)
		goto out;

	__merge_refs(&prefs, 1);

	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
				      extent_item_pos, total_refs,
				      root_objectid);
	if (ret)
		goto out;

	__merge_refs(&prefs, 2);

	while (!list_empty(&prefs)) {
		ref = list_first_entry(&prefs, struct __prelim_ref, list);
		WARN_ON(ref->count < 0);
		if (roots && ref->count && ref->root_id && ref->parent == 0) {
			if (root_objectid && ref->root_id != root_objectid) {
				ret = BACKREF_FOUND_SHARED;
				goto out;
			}

			/* no parent == root of tree */
			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
			if (ret < 0)
				goto out;
		}
		if (ref->count && ref->parent) {
			if (extent_item_pos && !ref->inode_list &&
			    ref->level == 0) {
				struct extent_buffer *eb;

				eb = read_tree_block(fs_info->extent_root,
							   ref->parent, 0);
				if (!eb || !extent_buffer_uptodate(eb)) {
					free_extent_buffer(eb);
					ret = -EIO;
					goto out;
				}
				btrfs_tree_read_lock(eb);
				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
				ret = find_extent_in_eb(eb, bytenr,
							*extent_item_pos, &eie);
				btrfs_tree_read_unlock_blocking(eb);
				free_extent_buffer(eb);
				if (ret < 0)
					goto out;
				ref->inode_list = eie;
			}
			ret = ulist_add_merge_ptr(refs, ref->parent,
						  ref->inode_list,
						  (void **)&eie, GFP_NOFS);
			if (ret < 0)
				goto out;
			if (!ret && extent_item_pos) {
				/*
				 * we've recorded that parent, so we must extend
				 * its inode list here
				 */
				BUG_ON(!eie);
				while (eie->next)
					eie = eie->next;
				eie->next = ref->inode_list;
			}
			eie = NULL;
		}
		list_del(&ref->list);
		kmem_cache_free(btrfs_prelim_ref_cache, ref);
	}

out:
	btrfs_free_path(path);
	while (!list_empty(&prefs)) {
		ref = list_first_entry(&prefs, struct __prelim_ref, list);
		list_del(&ref->list);
		kmem_cache_free(btrfs_prelim_ref_cache, ref);
	}
	while (!list_empty(&prefs_delayed)) {
		ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
				       list);
		list_del(&ref->list);
		kmem_cache_free(btrfs_prelim_ref_cache, ref);
	}
	if (ret < 0)
		free_inode_elem_list(eie);
	return ret;
}

static void free_leaf_list(struct ulist *blocks)
{
	struct ulist_node *node = NULL;
	struct extent_inode_elem *eie;
	struct ulist_iterator uiter;

	ULIST_ITER_INIT(&uiter);
	while ((node = ulist_next(blocks, &uiter))) {
		if (!node->aux)
			continue;
		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
		free_inode_elem_list(eie);
		node->aux = 0;
	}

	ulist_free(blocks);
}

/*
 * Finds all leafs with a reference to the specified combination of bytenr and
 * offset. When extent_item_pos is given, the aux field of each returned ulist
 * node carries a list of extent_inode_elem entries. The leafs are stored in
 * the leafs ulist, which must be freed with free_leaf_list.
 *
 * returns 0 on success, <0 on error
 */
static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info, u64 bytenr,
				u64 time_seq, struct ulist **leafs,
				const u64 *extent_item_pos)
{
	int ret;

	*leafs = ulist_alloc(GFP_NOFS);
	if (!*leafs)
		return -ENOMEM;

	ret = find_parent_nodes(trans, fs_info, bytenr,
				time_seq, *leafs, NULL, extent_item_pos, 0, 0);
	if (ret < 0 && ret != -ENOENT) {
		free_leaf_list(*leafs);
		return ret;
	}

	return 0;
}

/*
 * walk all backrefs for a given extent to find all roots that reference this
 * extent. Walking a backref means finding all extents that reference this
 * extent and in turn walk the backrefs of those, too. Naturally this is a
 * recursive process, but here it is implemented in an iterative fashion: We
 * find all referencing extents for the extent in question and put them on a
 * list. In turn, we find all referencing extents for those, further appending
 * to the list. The way we iterate the list allows adding more elements after
 * the current while iterating. The process stops when we reach the end of the
 * list. Found roots are added to the roots list.
 *
 * returns 0 on success, < 0 on error.
 */
static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
				  struct btrfs_fs_info *fs_info, u64 bytenr,
				  u64 time_seq, struct ulist **roots)
{
	struct ulist *tmp;
	struct ulist_node *node = NULL;
	struct ulist_iterator uiter;
	int ret;

	tmp = ulist_alloc(GFP_NOFS);
	if (!tmp)
		return -ENOMEM;
	*roots = ulist_alloc(GFP_NOFS);
	if (!*roots) {
		ulist_free(tmp);
		return -ENOMEM;
	}

	ULIST_ITER_INIT(&uiter);
	while (1) {
		ret = find_parent_nodes(trans, fs_info, bytenr,
					time_seq, tmp, *roots, NULL, 0, 0);
		if (ret < 0 && ret != -ENOENT) {
			ulist_free(tmp);
			ulist_free(*roots);
			return ret;
		}
		node = ulist_next(tmp, &uiter);
		if (!node)
			break;
		bytenr = node->val;
		cond_resched();
	}

	ulist_free(tmp);
	return 0;
}

int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
			 struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 time_seq, struct ulist **roots)
{
	int ret;

	if (!trans)
		down_read(&fs_info->commit_root_sem);
	ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
	if (!trans)
		up_read(&fs_info->commit_root_sem);
	return ret;
}
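
/*
 * Illustrative usage sketch (not copied from a real caller; the names below
 * are made up): resolve all roots referencing the extent at @bytenr against
 * the commit roots and walk the result.
 *
 *	struct ulist *roots = NULL;
 *	struct ulist_iterator uiter;
 *	struct ulist_node *node;
 *	int ret;
 *
 *	ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &roots);
 *	if (ret == 0) {
 *		ULIST_ITER_INIT(&uiter);
 *		while ((node = ulist_next(roots, &uiter)))
 *			pr_debug("root %llu\n", node->val);
 *		ulist_free(roots);
 *	}
 */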
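/*
 * Check whether the extent at @bytenr, known to belong to inode @inum in the
 * root @root_objectid, is referenced from anywhere else.
 *
 * returns 1 if the extent is shared, 0 if it is not, <0 on error. Internally
 * this walks the backrefs like __btrfs_find_all_roots does, but bails out
 * early with BACKREF_FOUND_SHARED as soon as a foreign root or inum is seen.
 */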
int btrfs_check_shared(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, u64 root_objectid,
		       u64 inum, u64 bytenr)
{
	struct ulist *tmp = NULL;
	struct ulist *roots = NULL;
	struct ulist_iterator uiter;
	struct ulist_node *node;
	struct seq_list elem = {};
	int ret = 0;

	tmp = ulist_alloc(GFP_NOFS);
	roots = ulist_alloc(GFP_NOFS);
	if (!tmp || !roots) {
		ulist_free(tmp);
		ulist_free(roots);
		return -ENOMEM;
	}

	if (trans)
		btrfs_get_tree_mod_seq(fs_info, &elem);
	else
		down_read(&fs_info->commit_root_sem);
	ULIST_ITER_INIT(&uiter);
	while (1) {
		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
					roots, NULL, root_objectid, inum);
		if (ret == BACKREF_FOUND_SHARED) {
			ret = 1;
			break;
		}
		if (ret < 0 && ret != -ENOENT)
			break;
		node = ulist_next(tmp, &uiter);
		if (!node)
			break;
		bytenr = node->val;
		cond_resched();
	}
	if (trans)
		btrfs_put_tree_mod_seq(fs_info, &elem);
	else
		up_read(&fs_info->commit_root_sem);
	ulist_free(tmp);
	ulist_free(roots);
	return ret;
}

/*
 * this makes the path point to (inum INODE_ITEM ioff)
 */
int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
			struct btrfs_path *path)
{
	struct btrfs_key key;
	return btrfs_find_item(fs_root, path, inum, ioff,
			BTRFS_INODE_ITEM_KEY, &key);
}

static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
				struct btrfs_path *path,
				struct btrfs_key *found_key)
{
	return btrfs_find_item(fs_root, path, inum, ioff,
			BTRFS_INODE_REF_KEY, found_key);
}

int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
			  u64 start_off, struct btrfs_path *path,
			  struct btrfs_inode_extref **ret_extref,
			  u64 *found_off)
{
	int ret, slot;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;
	unsigned long ptr;

	key.objectid = inode_objectid;
	key.type = BTRFS_INODE_EXTREF_KEY;
	key.offset = start_off;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			/*
			 * If the item at offset is not found,
			 * btrfs_search_slot will point us to the slot
			 * where it should be inserted. In our case
			 * that will be the slot directly before the
			 * next INODE_EXTREF item. In the case
			 * that we're pointing to the last slot in a
			 * leaf, we must move one leaf over.
			 */
			ret = btrfs_next_leaf(root, path);
			if (ret) {
				if (ret >= 1)
					ret = -ENOENT;
				break;
			}
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/*
		 * Check that we're still looking at an extended ref key for
		 * this particular objectid. If we have different
		 * objectid or type then there are no more to be found
		 * in the tree and we can exit.
		 */
		ret = -ENOENT;
		if (found_key.objectid != inode_objectid)
			break;
		if (found_key.type != BTRFS_INODE_EXTREF_KEY)
			break;

		ret = 0;
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		extref = (struct btrfs_inode_extref *)ptr;
		*ret_extref = extref;
		if (found_off)
			*found_off = found_key.offset;
		break;
	}

	return ret;
}

/*
 * this iterates to turn a name (from iref/extref) into a full filesystem
 * path. Elements of the path are separated by '/' and the path is guaranteed
 * to be 0-terminated. The path is only given within the current file system,
 * so it never starts with a '/'. The caller is responsible for providing
 * "size" bytes in "dest". The dest buffer is filled backwards and, finally,
 * the start of the resulting string is returned; normally, this pointer lies
 * within dest.
 * in case the path buffer would overflow, the pointer is decremented further
 * as if output had been written to the buffer, though no more output is
 * actually generated. That way, the caller can determine how much space
 * would be required for the path to fit into the buffer. In that case, the
 * returned value is smaller than dest; callers must check for this!
 */
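
/*
 * Illustrative overflow check for the contract described above (see
 * inode_to_path() below for a real caller):
 *
 *	char *p = btrfs_ref_to_path(fs_root, path, name_len, name_off,
 *				    eb, parent, dest, size);
 *	if (IS_ERR(p))
 *		return PTR_ERR(p);
 *	if (p < dest)
 *		; // buffer too small, (dest - p) more bytes were needed
 */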
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
			u32 name_len, unsigned long name_off,
			struct extent_buffer *eb_in, u64 parent,
			char *dest, u32 size)
{
	int slot;
	u64 next_inum;
	int ret;
	s64 bytes_left = ((s64)size) - 1;
	struct extent_buffer *eb = eb_in;
	struct btrfs_key found_key;
	int leave_spinning = path->leave_spinning;
	struct btrfs_inode_ref *iref;

	if (bytes_left >= 0)
		dest[bytes_left] = '\0';

	path->leave_spinning = 1;
	while (1) {
		bytes_left -= name_len;
		if (bytes_left >= 0)
			read_extent_buffer(eb, dest + bytes_left,
					   name_off, name_len);
		if (eb != eb_in) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
		}
		ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
		if (ret > 0)
			ret = -ENOENT;
		if (ret)
			break;

		next_inum = found_key.offset;

		/* regular exit ahead */
		if (parent == next_inum)
			break;

		slot = path->slots[0];
		eb = path->nodes[0];
		/* make sure we can use eb after releasing the path */
		if (eb != eb_in) {
			atomic_inc(&eb->refs);
			btrfs_tree_read_lock(eb);
			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
		}
		btrfs_release_path(path);
		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);

		name_len = btrfs_inode_ref_name_len(eb, iref);
		name_off = (unsigned long)(iref + 1);

		parent = next_inum;
		--bytes_left;
		if (bytes_left >= 0)
			dest[bytes_left] = '/';
	}

	btrfs_release_path(path);
	path->leave_spinning = leave_spinning;

	if (ret)
		return ERR_PTR(ret);

	return dest + bytes_left;
}

/*
 * this makes the path point to (logical EXTENT_ITEM *)
 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
 * tree blocks and <0 on error.
 */
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
			struct btrfs_path *path, struct btrfs_key *found_key,
			u64 *flags_ret)
{
	int ret;
	u64 flags;
	u64 size = 0;
	u32 item_size;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;

	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;
	key.objectid = logical;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		return ret;
	}
	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
		size = fs_info->extent_root->nodesize;
	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
		size = found_key->offset;

	if (found_key->objectid > logical ||
	    found_key->objectid + size <= logical) {
		pr_debug("logical %llu is not within any extent\n", logical);
		return -ENOENT;
	}

	eb = path->nodes[0];
	item_size = btrfs_item_size_nr(eb, path->slots[0]);
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(eb, ei);

	pr_debug("logical %llu is at position %llu within the extent (%llu "
		 "EXTENT_ITEM %llu) flags %#llx size %u\n",
		 logical, logical - found_key->objectid, found_key->objectid,
		 found_key->offset, flags, item_size);

	WARN_ON(!flags_ret);
	if (flags_ret) {
		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
		else if (flags & BTRFS_EXTENT_FLAG_DATA)
			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
		else
			BUG_ON(1);
		return 0;
	}

	return -EIO;
}

/*
 * helper function to iterate extent inline refs. ptr must point to a 0 value
 * for the first call and may be modified. it is used to track state.
 * if more refs exist, 0 is returned and the next call to
 * __get_extent_inline_ref must pass the modified ptr parameter to get the
 * next ref. after the last ref was processed, 1 is returned.
 * returns <0 on error
 */
static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
				   struct btrfs_key *key,
				   struct btrfs_extent_item *ei, u32 item_size,
				   struct btrfs_extent_inline_ref **out_eiref,
				   int *out_type)
{
	unsigned long end;
	u64 flags;
	struct btrfs_tree_block_info *info;

	if (!*ptr) {
		/* first call */
		flags = btrfs_extent_flags(eb, ei);
		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			if (key->type == BTRFS_METADATA_ITEM_KEY) {
				/* a skinny metadata extent */
				*out_eiref =
				     (struct btrfs_extent_inline_ref *)(ei + 1);
			} else {
				WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
				info = (struct btrfs_tree_block_info *)(ei + 1);
				*out_eiref =
				   (struct btrfs_extent_inline_ref *)(info + 1);
			}
		} else {
			*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
		}
		*ptr = (unsigned long)*out_eiref;
		if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
			return -ENOENT;
	}

	end = (unsigned long)ei + item_size;
	*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
	*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);

	*ptr += btrfs_extent_inline_ref_size(*out_type);
	WARN_ON(*ptr > end);
	if (*ptr == end)
		return 1; /* last */

	return 0;
}

/*
 * reads the tree block backref for an extent. tree level and root are returned
 * through out_level and out_root. ptr must point to a 0 value for the first
 * call and may be modified (see __get_extent_inline_ref comment).
 * returns 0 if data was provided, 1 if there was no more data to provide or
 * <0 on error.
 */
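
/*
 * Illustrative caller loop (a sketch, not copied from a real user):
 *
 *	unsigned long ptr = 0;
 *	u64 root;
 *	u8 level;
 *	int ret;
 *
 *	while (1) {
 *		ret = tree_backref_for_extent(&ptr, eb, &key, ei, item_size,
 *					      &root, &level);
 *		if (ret < 0)
 *			return ret;
 *		if (ret)
 *			break;	// no more tree backrefs
 *		// one backref resolved: tree @root references at @level
 *	}
 */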
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
			    struct btrfs_key *key, struct btrfs_extent_item *ei,
			    u32 item_size, u64 *out_root, u8 *out_level)
{
	int ret;
	int type;
	struct btrfs_tree_block_info *info;
	struct btrfs_extent_inline_ref *eiref;

	if (*ptr == (unsigned long)-1)
		return 1;

	while (1) {
		ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
					      &eiref, &type);
		if (ret < 0)
			return ret;

		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
		    type == BTRFS_SHARED_BLOCK_REF_KEY)
			break;

		if (ret == 1)
			return 1;
	}

	/* we can treat both ref types equally here */
	info = (struct btrfs_tree_block_info *)(ei + 1);
	*out_root = btrfs_extent_inline_ref_offset(eb, eiref);
	*out_level = btrfs_tree_block_level(eb, info);

	if (ret == 1)
		*ptr = (unsigned long)-1;

	return 0;
}

static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
				u64 root, u64 extent_item_objectid,
				iterate_extent_inodes_t *iterate, void *ctx)
{
	struct extent_inode_elem *eie;
	int ret = 0;

	for (eie = inode_list; eie; eie = eie->next) {
		pr_debug("ref for %llu resolved, key (%llu EXTENT_DATA %llu), "
			 "root %llu\n", extent_item_objectid,
			 eie->inum, eie->offset, root);
		ret = iterate(eie->inum, eie->offset, root, ctx);
		if (ret) {
			pr_debug("stopping iteration for %llu due to ret=%d\n",
				 extent_item_objectid, ret);
			break;
		}
	}

	return ret;
}

/*
 * calls iterate() for every inode that references the extent identified by
 * the given parameters.
 * when the iterator function returns a non-zero value, iteration stops.
 */
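/*
 * Note on search_commit_root: when set, no transaction is joined and the
 * commit roots are searched under commit_root_sem; when unset, a transaction
 * is joined and a tree mod log sequence number is held so the backref walk
 * sees a consistent view.
 */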
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
				u64 extent_item_objectid, u64 extent_item_pos,
				int search_commit_root,
				iterate_extent_inodes_t *iterate, void *ctx)
{
	int ret;
	struct btrfs_trans_handle *trans = NULL;
	struct ulist *refs = NULL;
	struct ulist *roots = NULL;
	struct ulist_node *ref_node = NULL;
	struct ulist_node *root_node = NULL;
	struct seq_list tree_mod_seq_elem = {};
	struct ulist_iterator ref_uiter;
	struct ulist_iterator root_uiter;

	pr_debug("resolving all inodes for extent %llu\n",
			extent_item_objectid);

	if (!search_commit_root) {
		trans = btrfs_join_transaction(fs_info->extent_root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);
		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
	} else {
		down_read(&fs_info->commit_root_sem);
	}

	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
				   tree_mod_seq_elem.seq, &refs,
				   &extent_item_pos);
	if (ret)
		goto out;

	ULIST_ITER_INIT(&ref_uiter);
	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
		ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
					     tree_mod_seq_elem.seq, &roots);
		if (ret)
			break;
		ULIST_ITER_INIT(&root_uiter);
		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
			pr_debug("root %llu references leaf %llu, data list "
				 "%#llx\n", root_node->val, ref_node->val,
				 ref_node->aux);
			ret = iterate_leaf_refs((struct extent_inode_elem *)
						(uintptr_t)ref_node->aux,
						root_node->val,
						extent_item_objectid,
						iterate, ctx);
		}
		ulist_free(roots);
	}

	free_leaf_list(refs);
out:
	if (!search_commit_root) {
		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
		btrfs_end_transaction(trans, fs_info->extent_root);
	} else {
		up_read(&fs_info->commit_root_sem);
	}

	return ret;
}

int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
				struct btrfs_path *path,
				iterate_extent_inodes_t *iterate, void *ctx)
{
	int ret;
	u64 extent_item_pos;
	u64 flags = 0;
	struct btrfs_key found_key;
	int search_commit_root = path->search_commit_root;

	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
	btrfs_release_path(path);
	if (ret < 0)
		return ret;
	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		return -EINVAL;

	extent_item_pos = logical - found_key.objectid;
	ret = iterate_extent_inodes(fs_info, found_key.objectid,
					extent_item_pos, search_commit_root,
					iterate, ctx);

	return ret;
}

typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
			      struct extent_buffer *eb, void *ctx);

static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
			      struct btrfs_path *path,
			      iterate_irefs_t *iterate, void *ctx)
{
	int ret = 0;
	int slot;
	u32 cur;
	u32 len;
	u32 name_len;
	u64 parent = 0;
	int found = 0;
	struct extent_buffer *eb;
	struct btrfs_item *item;
	struct btrfs_inode_ref *iref;
	struct btrfs_key found_key;

	while (!ret) {
		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
				     &found_key);
		if (ret < 0)
			break;
		if (ret) {
			ret = found ? 0 : -ENOENT;
			break;
		}
		++found;

		parent = found_key.offset;
		slot = path->slots[0];
		eb = btrfs_clone_extent_buffer(path->nodes[0]);
		if (!eb) {
			ret = -ENOMEM;
			break;
		}
		extent_buffer_get(eb);
		btrfs_tree_read_lock(eb);
		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
		btrfs_release_path(path);

		item = btrfs_item_nr(slot);
		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);

		for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
			name_len = btrfs_inode_ref_name_len(eb, iref);
			/* path must be released before calling iterate()! */
			pr_debug("following ref at offset %u for inode %llu in "
				 "tree %llu\n", cur, found_key.objectid,
				 fs_root->objectid);
			ret = iterate(parent, name_len,
				      (unsigned long)(iref + 1), eb, ctx);
			if (ret)
				break;
			len = sizeof(*iref) + name_len;
			iref = (struct btrfs_inode_ref *)((char *)iref + len);
		}
		btrfs_tree_read_unlock_blocking(eb);
		free_extent_buffer(eb);
	}

	btrfs_release_path(path);

	return ret;
}

static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
				 struct btrfs_path *path,
				 iterate_irefs_t *iterate, void *ctx)
{
	int ret;
	int slot;
	u64 offset = 0;
	u64 parent;
	int found = 0;
	struct extent_buffer *eb;
	struct btrfs_inode_extref *extref;
	u32 item_size;
	u32 cur_offset;
	unsigned long ptr;

	while (1) {
		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
					    &offset);
		if (ret < 0)
			break;
		if (ret) {
			ret = found ? 0 : -ENOENT;
			break;
		}
		++found;

		slot = path->slots[0];
		eb = btrfs_clone_extent_buffer(path->nodes[0]);
		if (!eb) {
			ret = -ENOMEM;
			break;
		}
		extent_buffer_get(eb);

		btrfs_tree_read_lock(eb);
		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
		btrfs_release_path(path);

		/* use the cloned eb; the path was released above */
		item_size = btrfs_item_size_nr(eb, slot);
		ptr = btrfs_item_ptr_offset(eb, slot);
		cur_offset = 0;

		while (cur_offset < item_size) {
			u32 name_len;

			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
			parent = btrfs_inode_extref_parent(eb, extref);
			name_len = btrfs_inode_extref_name_len(eb, extref);
			ret = iterate(parent, name_len,
				      (unsigned long)&extref->name, eb, ctx);
			if (ret)
				break;

			cur_offset += btrfs_inode_extref_name_len(eb, extref);
			cur_offset += sizeof(*extref);
		}
		btrfs_tree_read_unlock_blocking(eb);
		free_extent_buffer(eb);

		offset++;
	}

	btrfs_release_path(path);

	return ret;
}

static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
			 struct btrfs_path *path, iterate_irefs_t *iterate,
			 void *ctx)
{
	int ret;
	int found_refs = 0;

	ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
	if (!ret)
		++found_refs;
	else if (ret != -ENOENT)
		return ret;

	ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
	if (ret == -ENOENT && found_refs)
		return 0;

	return ret;
}

/*
 * returns 0 if the path could be dumped (possibly truncated)
 * returns <0 in case of an error
 */
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb, void *ctx)
{
	struct inode_fs_paths *ipath = ctx;
	char *fspath;
	char *fspath_min;
	int i = ipath->fspath->elem_cnt;
	const int s_ptr = sizeof(char *);
	u32 bytes_left;

	bytes_left = ipath->fspath->bytes_left > s_ptr ?
					ipath->fspath->bytes_left - s_ptr : 0;

	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
	fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
				   name_off, eb, inum, fspath_min, bytes_left);
	if (IS_ERR(fspath))
		return PTR_ERR(fspath);

	if (fspath > fspath_min) {
		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
		++ipath->fspath->elem_cnt;
		ipath->fspath->bytes_left = fspath - fspath_min;
	} else {
		++ipath->fspath->elem_missed;
		ipath->fspath->bytes_missing += fspath_min - fspath;
		ipath->fspath->bytes_left = 0;
	}

	return 0;
}

/*
 * this dumps all file system paths to the inode into the ipath struct,
 * provided it has been created large enough. each path is zero-terminated
 * and accessed from ipath->fspath->val[i].
 * when it returns, there are ipath->fspath->elem_cnt number of paths available
 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
 * number of missed paths is recorded in ipath->fspath->elem_missed; otherwise,
 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
 * have been needed to return all paths.
 */
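/*
 * Illustrative lifecycle (a sketch; the locals are made up):
 *
 *	struct inode_fs_paths *ipath;
 *
 *	ipath = init_ipath(total_bytes, fs_root, path);
 *	if (IS_ERR(ipath))
 *		return PTR_ERR(ipath);
 *	ret = paths_from_inode(inum, ipath);
 *	// on success, ipath->fspath->val[0 .. elem_cnt - 1] hold the paths
 *	free_ipath(ipath);
 */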
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
			     inode_to_path, ipath);
}

struct btrfs_data_container *init_data_container(u32 total_bytes)
{
	struct btrfs_data_container *data;
	size_t alloc_bytes;

	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
	data = vmalloc(alloc_bytes);
	if (!data)
		return ERR_PTR(-ENOMEM);

	if (total_bytes >= sizeof(*data)) {
		data->bytes_left = total_bytes - sizeof(*data);
		data->bytes_missing = 0;
	} else {
		data->bytes_missing = sizeof(*data) - total_bytes;
		data->bytes_left = 0;
	}

	data->elem_cnt = 0;
	data->elem_missed = 0;

	return data;
}

/*
 * allocates space to return multiple file system paths for an inode.
 * total_bytes to allocate are passed, note that space usable for actual path
 * information will be total_bytes - sizeof(struct inode_fs_paths).
 * the returned pointer must be freed with free_ipath() in the end.
 */
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
					struct btrfs_path *path)
{
	struct inode_fs_paths *ifp;
	struct btrfs_data_container *fspath;

	fspath = init_data_container(total_bytes);
	if (IS_ERR(fspath))
		return (void *)fspath;

	ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
	if (!ifp) {
		vfree(fspath);	/* allocated with vmalloc() above */
		return ERR_PTR(-ENOMEM);
	}

	ifp->btrfs_path = path;
	ifp->fspath = fspath;
	ifp->fs_root = fs_root;

	return ifp;
}

void free_ipath(struct inode_fs_paths *ipath)
{
	if (!ipath)
		return;
	vfree(ipath->fspath);
	kfree(ipath);
}
