/*
 * In-kernel transcendent memory (generic implementation)
 *
 * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
 *
 * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
 * "handles" (triples containing a pool id, an object id, and an index) to
 * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
 * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
 * set of functions (pamops).  Each pampd contains some representation of
 * PAGE_SIZE bytes worth of data.  Tmem must support potentially millions of
 * pages and must be able to insert, find, and delete these pages at a
 * potential frequency of thousands per second concurrently across many CPUs
 * (and, if used with KVM, across many vcpus across many guests).
 * Tmem is tracked with a hierarchy of data structures, organized by
 * the elements in a handle-tuple: pool_id, object_id, and page index.
 * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
 * Each pool contains a hash table of rb_trees of tmem_objs.  Each
 * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
 * nodes called tmem_objnodes.  Each leaf pointer in this tree points to
 * a pampd, which is accessible only through a small set of callbacks
 * registered by the PAM implementation (see tmem_register_pamops).  Tmem
 * does all memory allocation via a set of callbacks registered by the tmem
 * host implementation (e.g. see tmem_register_hostops).
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>

#include "tmem.h"

/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09

/*
 * A tmem host implementation must use this function to register callbacks
 * for memory allocation.
 */
static struct tmem_hostops tmem_hostops;

static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
	tmem_objnode_tree_init();
	tmem_hostops = *m;
}

/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 */
static struct tmem_pamops tmem_pamops;

void tmem_register_pamops(struct tmem_pamops *m)
{
	tmem_pamops = *m;
}
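
/*
 * Illustrative sketch (not taken from any particular host): a host
 * implementation typically fills in both ops structures and registers
 * them at init time, before any pools are created, e.g.:
 *
 *	static struct tmem_hostops my_hostops = {
 *		.obj_alloc = my_obj_alloc,
 *		.obj_free = my_obj_free,
 *		.objnode_alloc = my_objnode_alloc,
 *		.objnode_free = my_objnode_free,
 *	};
 *	tmem_register_hostops(&my_hostops);
 *	tmem_register_pamops(&my_pamops);
 *
 * The my_* names above are hypothetical; see tmem.h for the exact
 * callback signatures expected in struct tmem_hostops and
 * struct tmem_pamops.
 */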

/*
 * Oids are potentially very sparse and tmem_objs may have an indeterminately
 * short life, being added and deleted at a relatively high frequency.
 * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
 * of the potentially huge number of tmem_objs, each pool manages a hashtable
 * of rb_trees to reduce search, insert, delete, and rebalancing time.
 * Each hashbucket also has a lock to manage concurrent access.
 *
 * The following routines manage tmem_objs.  When any tmem_obj is accessed,
 * the hashbucket lock must be held.
 */
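
/*
 * Typical access pattern (a sketch of how the routines below are used
 * by the tmem core operations later in this file): the oid selects a
 * hashbucket, the bucket lock is taken, and only then is the rb_tree
 * searched, e.g.:
 *
 *	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 *	spin_lock(&hb->lock);
 *	obj = tmem_obj_find(hb, oidp);
 *	... operate on obj and its pampds ...
 *	spin_unlock(&hb->lock);
 */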

/* searches for object==oid in pool, returns locked object if found */
static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;

	rbnode = hb->obj_rb_root.rb_node;
	while (rbnode) {
		BUG_ON(RB_EMPTY_NODE(rbnode));
		obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
		switch (tmem_oid_compare(oidp, &obj->oid)) {
		case 0: /* equal */
			goto out;
		case -1:
			rbnode = rbnode->rb_left;
			break;
		case 1:
			rbnode = rbnode->rb_right;
			break;
		}
	}
	obj = NULL;
out:
	return obj;
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);

/* free an object that has no more pampds in it */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
	struct tmem_pool *pool;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pampd_count > 0);
	pool = obj->pool;
	BUG_ON(pool == NULL);
	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
		tmem_pampd_destroy_all_in_obj(obj);
	BUG_ON(obj->objnode_tree_root != NULL);
	BUG_ON((long)obj->objnode_count != 0);
	atomic_dec(&pool->obj_count);
	BUG_ON(atomic_read(&pool->obj_count) < 0);
	INVERT_SENTINEL(obj, OBJ);
	obj->pool = NULL;
	tmem_oid_set_invalid(&obj->oid);
	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}

/*
 * initialize and insert a tmem_obj (called only if find failed)
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
					struct tmem_pool *pool,
					struct tmem_oid *oidp)
{
	struct rb_root *root = &hb->obj_rb_root;
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct tmem_obj *this;

	BUG_ON(pool == NULL);
	atomic_inc(&pool->obj_count);
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;
	obj->pool = pool;
	obj->oid = *oidp;
	obj->objnode_count = 0;
	obj->pampd_count = 0;
	(*tmem_pamops.new_obj)(obj);
	SET_SENTINEL(obj, OBJ);
	while (*new) {
		BUG_ON(RB_EMPTY_NODE(*new));
		this = rb_entry(*new, struct tmem_obj, rb_tree_node);
		parent = *new;
		switch (tmem_oid_compare(oidp, &this->oid)) {
		case 0:
			BUG(); /* already present; should never happen! */
			break;
		case -1:
			new = &(*new)->rb_left;
			break;
		case 1:
			new = &(*new)->rb_right;
			break;
		}
	}
	rb_link_node(&obj->rb_tree_node, parent, new);
	rb_insert_color(&obj->rb_tree_node, root);
}

/*
 * Tmem is managed as a set of tmem_pools with certain attributes, such as
 * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
 * and all pampds that belong to a tmem_pool.  A tmem_pool is created
 * or deleted relatively rarely (for example, when a filesystem is
 * mounted or unmounted).
 */

/* flush all data from a pool and, optionally, free it */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	BUG_ON(pool == NULL);
	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		spin_lock(&hb->lock);
		rbnode = rb_first(&hb->obj_rb_root);
		while (rbnode != NULL) {
			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
			rbnode = rb_next(rbnode);
			tmem_pampd_destroy_all_in_obj(obj);
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		spin_unlock(&hb->lock);
	}
	if (destroy)
		list_del(&pool->pool_list);
}

/*
 * A tmem_obj contains a radix-tree-like tree in which the intermediate
 * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
 * is very specialized and tuned for specific uses and is not particularly
 * suited for use from this code, though some code from the core algorithms has
 * been reused, thus the copyright notices below).  Each tmem_objnode contains
 * a set of pointers which point to either a set of intermediate tmem_objnodes
 * or a set of pampds.
 *
 * Portions Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 */

struct tmem_objnode_tree_path {
	struct tmem_objnode *objnode;
	int offset;
};

/* objnode height_to_maxindex translation */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];

static void tmem_objnode_tree_init(void)
{
	unsigned int ht, tmp;

	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
		if (tmp >= OBJNODE_TREE_INDEX_BITS)
			tmem_objnode_tree_h2max[ht] = ~0UL;
		else
			tmem_objnode_tree_h2max[ht] =
			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
	}
}
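
/*
 * For heights below the cutoff, the loop above works out to
 * h2max[ht] == 2^(ht * OBJNODE_TREE_MAP_SHIFT) - 1, i.e. the largest
 * index a tree of that height can hold.  As a worked example (purely
 * illustrative; the real constants live in tmem.h), with a map shift
 * of 6 this gives h2max[0] == 0, h2max[1] == 63, h2max[2] == 4095,
 * and a lookup of index 130 in a height-2 tree descends via root
 * slot (130 >> 6) & 63 == 2 and then leaf slot 130 & 63 == 2.
 */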

static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
	struct tmem_objnode *objnode;

	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
	if (unlikely(objnode == NULL))
		goto out;
	objnode->obj = obj;
	SET_SENTINEL(objnode, OBJNODE);
	memset(&objnode->slots, 0, sizeof(objnode->slots));
	objnode->slots_in_use = 0;
	obj->objnode_count++;
out:
	return objnode;
}

static void tmem_objnode_free(struct tmem_objnode *objnode)
{
	struct tmem_pool *pool;
	int i;

	BUG_ON(objnode == NULL);
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
		BUG_ON(objnode->slots[i] != NULL);
	ASSERT_SENTINEL(objnode, OBJNODE);
	INVERT_SENTINEL(objnode, OBJNODE);
	BUG_ON(objnode->obj == NULL);
	ASSERT_SENTINEL(objnode->obj, OBJ);
	pool = objnode->obj->pool;
	BUG_ON(pool == NULL);
	ASSERT_SENTINEL(pool, POOL);
	objnode->obj->objnode_count--;
	objnode->obj = NULL;
	(*tmem_hostops.objnode_free)(objnode, pool);
}

/*
 * lookup index in object and return associated pampd (or NULL if not found)
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	unsigned int height, shift;
	struct tmem_objnode **slot = NULL;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
		goto out;
	if (height == 0 && obj->objnode_tree_root) {
		slot = &obj->objnode_tree_root;
		goto out;
	}
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	slot = &obj->objnode_tree_root;
	while (height > 0) {
		if (*slot == NULL)
			goto out;
		slot = (struct tmem_objnode **)
			((*slot)->slots +
			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
out:
	return slot != NULL ? (void **)slot : NULL;
}

static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode **slot;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	return slot != NULL ? *slot : NULL;
}

static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
					void *new_pampd)
{
	struct tmem_objnode **slot;
	void *ret = NULL;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	if ((slot != NULL) && (*slot != NULL)) {
		void *old_pampd = *(void **)slot;
		*(void **)slot = new_pampd;
		(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);
		ret = new_pampd;
	}
	return ret;
}

static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
					void *pampd)
{
	int ret = 0;
	struct tmem_objnode *objnode = NULL, *newnode, *slot;
	unsigned int height, shift;
	int offset = 0;

	/* if necessary, extend the tree to be higher */
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
		height = obj->objnode_tree_height + 1;
		if (index > tmem_objnode_tree_h2max[height])
			while (index > tmem_objnode_tree_h2max[height])
				height++;
		if (obj->objnode_tree_root == NULL) {
			obj->objnode_tree_height = height;
			goto insert;
		}
		do {
			newnode = tmem_objnode_alloc(obj);
			if (!newnode) {
				ret = -ENOMEM;
				goto out;
			}
			newnode->slots[0] = obj->objnode_tree_root;
			newnode->slots_in_use = 1;
			obj->objnode_tree_root = newnode;
			obj->objnode_tree_height++;
		} while (height > obj->objnode_tree_height);
	}
insert:
	slot = obj->objnode_tree_root;
	height = obj->objnode_tree_height;
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	while (height > 0) {
		if (slot == NULL) {
			/* add a child objnode */
			slot = tmem_objnode_alloc(obj);
			if (!slot) {
				ret = -ENOMEM;
				goto out;
			}
			if (objnode) {
				objnode->slots[offset] = slot;
				objnode->slots_in_use++;
			} else
				obj->objnode_tree_root = slot;
		}
		/* go down a level */
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		objnode = slot;
		slot = objnode->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
	BUG_ON(slot != NULL);
	if (objnode) {
		objnode->slots_in_use++;
		objnode->slots[offset] = pampd;
	} else
		obj->objnode_tree_root = pampd;
	obj->pampd_count++;
out:
	return ret;
}

static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
	struct tmem_objnode_tree_path *pathp = path;
	struct tmem_objnode *slot = NULL;
	unsigned int height, shift;
	int offset;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[height])
		goto out;
	slot = obj->objnode_tree_root;
	if (height == 0 && obj->objnode_tree_root) {
		obj->objnode_tree_root = NULL;
		goto out;
	}
	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	pathp->objnode = NULL;
	do {
		if (slot == NULL)
			goto out;
		pathp++;
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		pathp->offset = offset;
		pathp->objnode = slot;
		slot = slot->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);
	if (slot == NULL)
		goto out;
	while (pathp->objnode) {
		pathp->objnode->slots[pathp->offset] = NULL;
		pathp->objnode->slots_in_use--;
		if (pathp->objnode->slots_in_use) {
			if (pathp->objnode == obj->objnode_tree_root) {
				while (obj->objnode_tree_height > 0 &&
				  obj->objnode_tree_root->slots_in_use == 1 &&
				  obj->objnode_tree_root->slots[0]) {
					struct tmem_objnode *to_free =
						obj->objnode_tree_root;

					obj->objnode_tree_root =
							to_free->slots[0];
					obj->objnode_tree_height--;
					to_free->slots[0] = NULL;
					to_free->slots_in_use = 0;
					tmem_objnode_free(to_free);
				}
			}
			goto out;
		}
		tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
		pathp--;
	}
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;

out:
	if (slot != NULL)
		obj->pampd_count--;
	BUG_ON(obj->pampd_count < 0);
	return slot;
}

/* recursively walk the objnode_tree destroying pampds and objnodes */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
					struct tmem_objnode *objnode,
					unsigned int ht)
{
	int i;

	if (ht == 0)
		return;
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
		if (objnode->slots[i]) {
			if (ht == 1) {
				obj->pampd_count--;
				(*tmem_pamops.free)(objnode->slots[i],
						obj->pool, NULL, 0);
				objnode->slots[i] = NULL;
				continue;
			}
			tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
			tmem_objnode_free(objnode->slots[i]);
			objnode->slots[i] = NULL;
		}
	}
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
{
	if (obj->objnode_tree_root == NULL)
		return;
	if (obj->objnode_tree_height == 0) {
		obj->pampd_count--;
		(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0);
	} else {
		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
					obj->objnode_tree_height);
		tmem_objnode_free(obj->objnode_tree_root);
		obj->objnode_tree_height = 0;
	}
	obj->objnode_tree_root = NULL;
	(*tmem_pamops.free_obj)(obj->pool, obj);
}

/*
 * Tmem is operated on by a set of well-defined actions:
 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
 * (The tmem ABI allows for subpages and exchanges but these operations
 * are not included in this implementation.)
 *
 * These "tmem core" operations are implemented in the following functions.
 */
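
/*
 * Usage sketch (illustrative only, not lifted from any particular
 * frontend): a client that wants to stash a page and later try to
 * retrieve it would typically do something like
 *
 *	ret = tmem_put(pool, &oid, index, page_address(page),
 *			PAGE_SIZE, false, is_ephemeral(pool));
 *	...
 *	ret = tmem_get(pool, &oid, index, page_address(page),
 *			&size, false, 0);
 *
 * where (pool, oid, index) is the handle described at the top of this
 * file and a zero return from tmem_get means the data was found and
 * copied back.  The page and size variables here are hypothetical.
 */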

/*
 * "Put" a page, e.g. copy a page from the kernel into newly allocated
 * PAM space (if such space is available).  Tmem_put is complicated by
 * a corner case: What if a page with matching handle already exists in
 * tmem?  To guarantee coherency, one of two actions is necessary: Either
 * the data for the page must be overwritten, or the page must be
 * "flushed" so that the data is not accessible to a subsequent "get".
 * Since these "duplicate puts" are relatively rare, this implementation
 * always flushes for simplicity.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t size, bool raw, bool ephemeral)
{
	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
	void *pampd = NULL, *pampd_del = NULL;
	int ret = -ENOMEM;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = objfound = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		pampd = tmem_pampd_lookup_in_obj(objfound, index);
		if (pampd != NULL) {
			/* if found, is a dup put, flush the old one */
			pampd_del = tmem_pampd_delete_from_obj(obj, index);
			BUG_ON(pampd_del != pampd);
			(*tmem_pamops.free)(pampd, pool, oidp, index);
			if (obj->pampd_count == 0) {
				objnew = obj;
				objfound = NULL;
			}
			pampd = NULL;
		}
	} else {
		obj = objnew = (*tmem_hostops.obj_alloc)(pool);
		if (unlikely(obj == NULL)) {
			ret = -ENOMEM;
			goto out;
		}
		tmem_obj_init(obj, hb, pool, oidp);
	}
	BUG_ON(obj == NULL);
	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
	pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
					obj->pool, &obj->oid, index);
	if (unlikely(pampd == NULL))
		goto free;
	ret = tmem_pampd_add_to_obj(obj, index, pampd);
	if (unlikely(ret == -ENOMEM))
		/* may have partially built objnode tree ("stump") */
		goto delete_and_free;
	goto out;

delete_and_free:
	(void)tmem_pampd_delete_from_obj(obj, index);
free:
	if (pampd)
		(*tmem_pamops.free)(pampd, pool, NULL, 0);
	if (objnew) {
		tmem_obj_free(objnew, hb);
		(*tmem_hostops.obj_free)(objnew, pool);
	}
out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Get" a page, e.g. if one can be found, copy the tmem page with the
 * matching handle from PAM space to the kernel.  By tmem definition,
 * when a "get" is successful on an ephemeral page, the page is "flushed",
 * and when a "get" is successful on a persistent page, the page is
 * retained in tmem.  Note that to preserve coherency, "get" can never
 * be skipped if tmem contains the data.  That is, if a get is done
 * with a certain handle and fails, any subsequent "get" must also fail
 * (unless of course there is a "put" done with the same handle).
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t *size, bool raw, int get_and_free)
{
	struct tmem_obj *obj;
	void *pampd;
	bool ephemeral = is_ephemeral(pool);
	int ret = -1;
	struct tmem_hashbucket *hb;
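	/*
	 * get_and_free == 1 forces an exclusive get (the pampd is
	 * deleted); get_and_free == 0 follows pool policy, i.e. the
	 * pampd is deleted only for ephemeral pools; any other value
	 * leaves the pampd in place.
	 */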
	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
	bool lock_held = false;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	lock_held = true;
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	if (free)
		pampd = tmem_pampd_delete_from_obj(obj, index);
	else
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	if (pampd == NULL)
		goto out;
	if (free) {
		if (obj->pampd_count == 0) {
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
			obj = NULL;
		}
	}
	if (tmem_pamops.is_remote(pampd)) {
		lock_held = false;
		spin_unlock(&hb->lock);
	}
	if (free)
		ret = (*tmem_pamops.get_data_and_free)(
				data, size, raw, pampd, pool, oidp, index);
	else
		ret = (*tmem_pamops.get_data)(
				data, size, raw, pampd, pool, oidp, index);
	if (ret < 0)
		goto out;
	ret = 0;
out:
	if (lock_held)
		spin_unlock(&hb->lock);
	return ret;
}

/*
 * If a page in tmem matches the handle, "flush" this page from tmem such
 * that any subsequent "get" does not succeed (unless, of course, there
 * was another "put" with the same handle).
 */
int tmem_flush_page(struct tmem_pool *pool,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_obj *obj;
	void *pampd;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	pampd = tmem_pampd_delete_from_obj(obj, index);
	if (pampd == NULL)
		goto out;
	(*tmem_pamops.free)(pampd, pool, oidp, index);
	if (obj->pampd_count == 0) {
		tmem_obj_free(obj, hb);
		(*tmem_hostops.obj_free)(obj, pool);
	}
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * If a page in tmem matches the handle, replace the page so that any
 * subsequent "get" gets the new page.  Returns 0 if there was a page
 * to replace, else returns -1.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
			uint32_t index, void *new_pampd)
{
	struct tmem_obj *obj;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);
	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages in tmem matching this oid.
 */
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
{
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb;
	int ret = -1;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	tmem_pampd_destroy_all_in_obj(obj);
	tmem_obj_free(obj, hb);
	(*tmem_hostops.obj_free)(obj, pool);
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
 * all subsequent access to this tmem_pool.
 */
int tmem_destroy_pool(struct tmem_pool *pool)
{
	int ret = -1;

	if (pool == NULL)
		goto out;
	tmem_pool_flush(pool, 1);
	ret = 0;
out:
	return ret;
}

static LIST_HEAD(tmem_global_pool_list);

/*
 * Initialize a new tmem_pool with the provided flags; the corresponding
 * pool id is assigned by the tmem host implementation.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
	int persistent = flags & TMEM_POOL_PERSIST;
	int shared = flags & TMEM_POOL_SHARED;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		hb->obj_rb_root = RB_ROOT;
		spin_lock_init(&hb->lock);
	}
	INIT_LIST_HEAD(&pool->pool_list);
	atomic_set(&pool->obj_count, 0);
	SET_SENTINEL(pool, POOL);
	list_add_tail(&pool->pool_list, &tmem_global_pool_list);
	pool->persistent = persistent;
	pool->shared = shared;
}
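
/*
 * Example (illustrative only): a host implementation allocates the
 * struct tmem_pool itself and then initializes it with, e.g.,
 *
 *	tmem_new_pool(pool, TMEM_POOL_PERSIST);
 *
 * for a persistent pool (or flags of 0 for an ephemeral one), and
 * later tears it down with tmem_destroy_pool(pool).
 */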