dm-btree.c revision 3241b1d3e0aaafbfcd320f4d71ade629728cc4f4
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/module.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

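/*
 * Returns the index of the largest key that is less than or equal to the
 * requested key, or -1 if every key in the node is greater.
 */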
int lower_bound(struct node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

void inc_children(struct dm_transaction_manager *tm, struct node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context,
				value_ptr(n, i, vt->size));
}

static int insert_at(size_t value_size, struct node *node, unsigned index,
		      uint64_t key, void *value)
		      __dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}
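
/*
 * Worked example (illustrative figures, not taken from this file): with a
 * 4096 byte block, 8 byte values, and supposing the node header occupies
 * 32 bytes: elt_size = 8 + 8 = 16, total = (4096 - 32) / 16 = 254,
 * n = 84, so max_entries = 252.
 */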

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
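
/*
 * Illustrative usage sketch (not from this file); assumes the caller has
 * a struct dm_btree_info already wired up to a transaction manager:
 *
 *	dm_block_t root;
 *	int r = dm_btree_empty(&info, &root);
 *	if (r < 0)
 *		return r;
 *
 * On success, root refers to a tree consisting of a single empty leaf.
 */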

/*----------------------------------------------------------------*/

/*
 * Deletion is conceptually recursive.  Since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

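/*
 * A del_stack is empty when top == -1; otherwise spine[top] is the
 * deepest frame whose children have not all been visited yet.
 */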
struct del_stack {
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

/*
 * On error we must not leave read locks held on blocks still in the
 * spine, so unlock every remaining frame without touching reference
 * counts.
 */
static void unlock_all_frames(struct del_stack *s)
{
	while (unprocessed_frames(s)) {
		struct frame *f = s->spine + s->top--;

		dm_tm_unlock(s->tm, f->b);
	}
}

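/*
 * Decrements reference counts down the tree, visiting children only when
 * the parent is not shared: a node whose reference count is greater than
 * one just has its count decremented and its subtree left intact.
 */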
int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 1);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (f->level != (info->levels - 1)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i, info->value_type.size));
			}
			f->current_child = f->nr_children;
		}
	}

out:
	if (r)
		/* clean up any locked frames left on the del_stack */
		unlock_all_frames(s);

	kfree(s);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
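
/*
 * Illustrative sketch (not from this file): a lookup in a two-level tree,
 * one key per level; the names below are hypothetical:
 *
 *	uint64_t keys[2] = { first_key, second_key };
 *	__le64 value_le;
 *	int r = dm_btree_lookup(&info, root, keys, &value_le);
 *
 * r == 0 means value_le now holds the value (little-endian, as stored on
 * disk); -ENODATA means the key was not present.
 */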

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	     |
 *	     v
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 *
 * After:
 *		+--------+
 *		| Parent |
 *		+--------+
 *		  |	|
 *		  v	+------+
 *	    +---------+	       |
 *	    | A* +++  |	       v
 *	    +---------+	  +-------+
 *			  | B +++ |
 *			  +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
			       unsigned parent_index, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r)
		return r;

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *	  +----------+
 *	  | A ++++++ |
 *	  +----------+
 *
 *
 * After:
 *	+------------+
 *	| A (shadow) |
 *	+------------+
 *	    |	|
 *   +------+	+----+
 *   |		     |
 *   v		     v
 * +-------+	 +-------+
 * | B +++ |	 | C +++ |
 * +-------+	 +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		/* don't leak the block allocated for the left child */
		unlock_block(s->info, left);
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
	memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));

	/*
	 * Rejig the spine.  This is ugly, since it knows too much about
	 * the spine.
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary? */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, root, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

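	/*
	 * lower_bound either found the key (i indexes it) or returned the
	 * slot just before the insertion point (possibly -1), so step
	 * forward to the index the caller should insert at.
	 */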
	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct node *n;
	struct dm_btree_value_type le64_type;

	le64_type.context = NULL;
	le64_type.size = sizeof(__le64);
	le64_type.inc = NULL;
	le64_type.dec = NULL;
	le64_type.equal = NULL;

	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index, info->value_type.size),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index, info->value_type.size));
		}
		memcpy_disk(value_ptr(n, index, info->value_type.size),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
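
/*
 * Illustrative sketch (not from this file): inserting a little-endian
 * value.  The caller must bless the value for disk, matching the
 * __dm_written_to_disk() annotation above; v and the other names are
 * hypothetical:
 *
 *	__le64 value_le = cpu_to_le64(v);
 *	__dm_bless_for_disk(&value_le);
 *	r = dm_btree_insert(&info, root, keys, &value_le, &new_root);
 */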

/*----------------------------------------------------------------*/

static int find_highest_key(struct ro_spine *s, dm_block_t block,
			    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

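/*
 * Returns the number of levels for which a highest key was found (equal
 * to info->levels when every level is non-empty), or a negative errno on
 * error.
 */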
int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_highest_key(&spine, root, result_keys + level,
				     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
806