bitmap.c revision 6a3a16f2ef6f335286e2b2bf8284b0ab4ff38ec0
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4/* Reiserfs block (de)allocator, bitmap-based. */
5
6#include <linux/config.h>
7#include <linux/time.h>
8#include <linux/reiserfs_fs.h>
9#include <linux/errno.h>
10#include <linux/buffer_head.h>
11#include <linux/kernel.h>
12#include <linux/pagemap.h>
13#include <linux/reiserfs_fs_sb.h>
14#include <linux/reiserfs_fs_i.h>
15#include <linux/quotaops.h>
16
17#define PREALLOCATION_SIZE 9
18
19/* different reiserfs block allocator options */
20
21#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits)
22
23#define  _ALLOC_concentrating_formatted_nodes 0
24#define  _ALLOC_displacing_large_files 1
25#define  _ALLOC_displacing_new_packing_localities 2
26#define  _ALLOC_old_hashed_relocation 3
27#define  _ALLOC_new_hashed_relocation 4
28#define  _ALLOC_skip_busy 5
29#define  _ALLOC_displace_based_on_dirid 6
30#define  _ALLOC_hashed_formatted_nodes 7
31#define  _ALLOC_old_way 8
32#define  _ALLOC_hundredth_slices 9
33#define  _ALLOC_dirid_groups 10
34#define  _ALLOC_oid_groups 11
35#define  _ALLOC_packing_groups 12
36
37#define  concentrating_formatted_nodes(s)	test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
38#define  displacing_large_files(s)		test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
39#define  displacing_new_packing_localities(s)	test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s))
40
41#define SET_OPTION(optname) \
42   do { \
43        reiserfs_warning(s, "reiserfs: option \"%s\" is set", #optname); \
44        set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
45    } while(0)
46#define TEST_OPTION(optname, s) \
47    test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
48
49static inline void get_bit_address (struct super_block * s,
50				    b_blocknr_t block, int * bmap_nr, int * offset)
51{
52    /* It is in the bitmap block number equal to the block
53     * number divided by the number of bits in a block. */
54    *bmap_nr = block / (s->s_blocksize << 3);
55    /* Within that bitmap block it is located at bit offset *offset. */
56    *offset = block & ((s->s_blocksize << 3) - 1 );
57    return;
58}
59
60#ifdef CONFIG_REISERFS_CHECK
61int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value)
62{
63    int i, j;
64
65    if (block == 0 || block >= SB_BLOCK_COUNT (s)) {
66	reiserfs_warning (s, "vs-4010: is_reusable: block number is out of range %lu (%u)",
67			  block, SB_BLOCK_COUNT (s));
68	return 0;
69    }
70
71    /* it can't be one of the bitmap blocks */
72    for (i = 0; i < SB_BMAP_NR (s); i ++)
73	if (block == SB_AP_BITMAP (s)[i].bh->b_blocknr) {
74	    reiserfs_warning (s, "vs: 4020: is_reusable: "
75			      "bitmap block %lu(%u) can't be freed or reused",
76			      block, SB_BMAP_NR (s));
77	    return 0;
78	}
79
80    get_bit_address (s, block, &i, &j);
81
82    if (i >= SB_BMAP_NR (s)) {
83	reiserfs_warning (s, "vs-4030: is_reusable: there is no so many bitmap blocks: "
84			  "block=%lu, bitmap_nr=%d", block, i);
85	return 0;
86    }
87
88    if ((bit_value == 0 &&
89         reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data)) ||
90	(bit_value == 1 &&
91	 reiserfs_test_le_bit(j, SB_AP_BITMAP (s)[i].bh->b_data) == 0)) {
92	reiserfs_warning (s, "vs-4040: is_reusable: corresponding bit of block %lu does not "
93			  "match required value (i==%d, j==%d) test_bit==%d",
94		block, i, j, reiserfs_test_le_bit (j, SB_AP_BITMAP (s)[i].bh->b_data));
95
96	return 0;
97    }
98
99    if (bit_value == 0 && block == SB_ROOT_BLOCK (s)) {
100	reiserfs_warning (s, "vs-4050: is_reusable: this is root block (%u), "
101			  "it must be busy", SB_ROOT_BLOCK (s));
102	return 0;
103    }
104
105    return 1;
106}
107#endif /* CONFIG_REISERFS_CHECK */
108
109/* searches in journal structures for a given block number (bmap, off). If block
110   is found in reiserfs journal it suggests next free block candidate to test. */
111static inline  int is_block_in_journal (struct super_block * s, int bmap, int
112off, int *next)
113{
114    b_blocknr_t tmp;
115
116    if (reiserfs_in_journal (s, bmap, off, 1, &tmp)) {
117	if (tmp) {              /* hint supplied */
118	    *next = tmp;
119	    PROC_INFO_INC( s, scan_bitmap.in_journal_hint );
120	} else {
121	    (*next) = off + 1;          /* inc offset to avoid looping. */
122	    PROC_INFO_INC( s, scan_bitmap.in_journal_nohint );
123	}
124	PROC_INFO_INC( s, scan_bitmap.retry );
125	return 1;
126    }
127    return 0;
128}
129
130/* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap
131 * block; */
132static int scan_bitmap_block (struct reiserfs_transaction_handle *th,
133			      int bmap_n, int *beg, int boundary, int min, int max, int unfm)
134{
135    struct super_block *s = th->t_super;
136    struct reiserfs_bitmap_info *bi=&SB_AP_BITMAP(s)[bmap_n];
137    int end, next;
138    int org = *beg;
139
140    BUG_ON (!th->t_trans_id);
141
142    RFALSE(bmap_n >= SB_BMAP_NR (s), "Bitmap %d is out of range (0..%d)",bmap_n, SB_BMAP_NR (s) - 1);
143    PROC_INFO_INC( s, scan_bitmap.bmap );
144/* this is unclear and lacks comments, explain how journal bitmaps
145   work here for the reader.  Convey a sense of the design here. What
146   is a window? */
147/* - I mean `a window of zero bits' as in description of this function - Zam. */
148
149    if ( !bi ) {
150	reiserfs_warning (s, "NULL bitmap info pointer for bitmap %d", bmap_n);
151	return 0;
152    }
153    if (buffer_locked (bi->bh)) {
154       PROC_INFO_INC( s, scan_bitmap.wait );
155       __wait_on_buffer (bi->bh);
156    }
157
158    while (1) {
159	cont:
160	if (bi->free_count < min)
161		return 0; // No free blocks in this bitmap
162
163	/* search for a first zero bit -- beggining of a window */
164	*beg = reiserfs_find_next_zero_le_bit
165	        ((unsigned long*)(bi->bh->b_data), boundary, *beg);
166
167	if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block
168				      * cannot contain a zero window of minimum size */
169	    return 0;
170	}
171
172	if (unfm && is_block_in_journal(s,bmap_n, *beg, beg))
173	    continue;
174	/* first zero bit found; we check next bits */
175	for (end = *beg + 1;; end ++) {
176	    if (end >= *beg + max || end >= boundary || reiserfs_test_le_bit (end, bi->bh->b_data)) {
177		next = end;
178		break;
179	    }
180	    /* finding the other end of zero bit window requires looking into journal structures (in
181	     * case of searching for free blocks for unformatted nodes) */
182	    if (unfm && is_block_in_journal(s, bmap_n, end, &next))
183		break;
184	}
185
186	/* now (*beg) points to beginning of zero bits window,
187	 * (end) points to one bit after the window end */
188	if (end - *beg >= min) { /* it seems we have found window of proper size */
189	    int i;
190	    reiserfs_prepare_for_journal (s, bi->bh, 1);
191	    /* try to set all blocks used checking are they still free */
192	    for (i = *beg; i < end; i++) {
193		/* It seems that we should not check in journal again. */
194		if (reiserfs_test_and_set_le_bit (i, bi->bh->b_data)) {
195		    /* bit was set by another process
196		     * while we slept in prepare_for_journal() */
197		    PROC_INFO_INC( s, scan_bitmap.stolen );
198		    if (i >= *beg + min)	{ /* we can continue with smaller set of allocated blocks,
199					   * if length of this set is more or equal to `min' */
200			end = i;
201			break;
202		    }
203		    /* otherwise we clear all bit were set ... */
204		    while (--i >= *beg)
205			reiserfs_test_and_clear_le_bit (i, bi->bh->b_data);
206		    reiserfs_restore_prepared_buffer (s, bi->bh);
207		    *beg = org;
208		    /* ... and search again in current block from beginning */
209		    goto cont;
210		}
211	    }
212	    bi->free_count -= (end - *beg);
213	    journal_mark_dirty (th, s, bi->bh);
214
215	    /* free block count calculation */
216	    reiserfs_prepare_for_journal (s, SB_BUFFER_WITH_SB(s), 1);
217	    PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
218	    journal_mark_dirty (th, s, SB_BUFFER_WITH_SB(s));
219
220	    return end - (*beg);
221	} else {
222	    *beg = next;
223	}
224    }
225}
226
227static int bmap_hash_id(struct super_block *s, u32 id) {
228    char * hash_in = NULL;
229    unsigned long hash;
230    unsigned bm;
231
232    if (id <= 2) {
233	bm = 1;
234    } else {
235        hash_in = (char *)(&id);
236        hash = keyed_hash(hash_in, 4);
237	bm = hash % SB_BMAP_NR(s);
238	if (!bm)
239	    bm = 1;
240    }
241    /* this can only be true when SB_BMAP_NR = 1 */
242    if (bm >= SB_BMAP_NR(s))
243    	bm = 0;
244    return bm;
245}
246
247/*
248 * hashes the id and then returns > 0 if the block group for the
249 * corresponding hash is full
250 */
251static inline int block_group_used(struct super_block *s, u32 id) {
252    int bm;
253    bm = bmap_hash_id(s, id);
254    if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100) ) {
255        return 0;
256    }
257    return 1;
258}
259
260/*
261 * the packing is returned in disk byte order
262 */
263u32 reiserfs_choose_packing(struct inode *dir) {
264    u32 packing;
265    if (TEST_OPTION(packing_groups, dir->i_sb)) {
266	u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
267	/*
268	 * some versions of reiserfsck expect packing locality 1 to be
269	 * special
270	 */
271	if (parent_dir == 1 || block_group_used(dir->i_sb,parent_dir))
272            packing = INODE_PKEY(dir)->k_objectid;
273        else
274            packing = INODE_PKEY(dir)->k_dir_id;
275    } else
276        packing = INODE_PKEY(dir)->k_objectid;
277    return packing;
278}
279
280/* Tries to find contiguous zero bit window (given size) in given region of
281 * bitmap and place new blocks there. Returns number of allocated blocks. */
282static int scan_bitmap (struct reiserfs_transaction_handle *th,
283			b_blocknr_t *start, b_blocknr_t finish,
284			int min, int max, int unfm, unsigned long file_block)
285{
286    int nr_allocated=0;
287    struct super_block * s = th->t_super;
288    /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr
289     * - Hans, it is not a block number - Zam. */
290
291    int bm, off;
292    int end_bm, end_off;
293    int off_max = s->s_blocksize << 3;
294
295    BUG_ON (!th->t_trans_id);
296
297    PROC_INFO_INC( s, scan_bitmap.call );
298    if ( SB_FREE_BLOCKS(s) <= 0)
299	return 0; // No point in looking for more free blocks
300
301    get_bit_address (s, *start, &bm, &off);
302    get_bit_address (s, finish, &end_bm, &end_off);
303    if (bm > SB_BMAP_NR(s))
304        return 0;
305    if (end_bm > SB_BMAP_NR(s))
306        end_bm = SB_BMAP_NR(s);
307
308    /* When the bitmap is more than 10% free, anyone can allocate.
309     * When it's less than 10% free, only files that already use the
310     * bitmap are allowed. Once we pass 80% full, this restriction
311     * is lifted.
312     *
313     * We do this so that files that grow later still have space close to
314     * their original allocation. This improves locality, and presumably
315     * performance as a result.
316     *
317     * This is only an allocation policy and does not make up for getting a
318     * bad hint. Decent hinting must be implemented for this to work well.
319     */
320    if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) {
321	for (;bm < end_bm; bm++, off = 0) {
322	    if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 )
323		nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
324	    if (nr_allocated)
325		goto ret;
326        }
327	/* we know from above that start is a reasonable number */
328	get_bit_address (s, *start, &bm, &off);
329    }
330
331    for (;bm < end_bm; bm++, off = 0) {
332	nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
333	if (nr_allocated)
334	    goto ret;
335    }
336
337    nr_allocated = scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
338
339 ret:
340    *start = bm * off_max + off;
341    return nr_allocated;
342
343}
344
345static void _reiserfs_free_block (struct reiserfs_transaction_handle *th,
346				  struct inode *inode, b_blocknr_t block,
347				  int for_unformatted)
348{
349    struct super_block * s = th->t_super;
350    struct reiserfs_super_block * rs;
351    struct buffer_head * sbh;
352    struct reiserfs_bitmap_info *apbi;
353    int nr, offset;
354
355    BUG_ON (!th->t_trans_id);
356
357    PROC_INFO_INC( s, free_block );
358
359    rs = SB_DISK_SUPER_BLOCK (s);
360    sbh = SB_BUFFER_WITH_SB (s);
361    apbi = SB_AP_BITMAP(s);
362
363    get_bit_address (s, block, &nr, &offset);
364
365    if (nr >= sb_bmap_nr (rs)) {
366	reiserfs_warning (s, "vs-4075: reiserfs_free_block: "
367			  "block %lu is out of range on %s",
368			  block, reiserfs_bdevname (s));
369	return;
370    }
371
372    reiserfs_prepare_for_journal(s, apbi[nr].bh, 1 ) ;
373
374    /* clear bit for the given block in bit map */
375    if (!reiserfs_test_and_clear_le_bit (offset, apbi[nr].bh->b_data)) {
376	reiserfs_warning (s, "vs-4080: reiserfs_free_block: "
377			  "free_block (%s:%lu)[dev:blocknr]: bit already cleared",
378			  reiserfs_bdevname (s), block);
379    }
380    apbi[nr].free_count ++;
381    journal_mark_dirty (th, s, apbi[nr].bh);
382
383    reiserfs_prepare_for_journal(s, sbh, 1) ;
384    /* update super block */
385    set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 );
386
387    journal_mark_dirty (th, s, sbh);
388    if (for_unformatted)
389        DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
390}
391
392void reiserfs_free_block (struct reiserfs_transaction_handle *th,
393			  struct inode *inode, b_blocknr_t block,
394			  int for_unformatted)
395{
396    struct super_block * s = th->t_super;
397
398    BUG_ON (!th->t_trans_id);
399
400    RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
401    RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block");
402    /* mark it before we clear it, just in case */
403    journal_mark_freed(th, s, block) ;
404    _reiserfs_free_block(th, inode, block, for_unformatted) ;
405}
406
407/* preallocated blocks don't need to be run through journal_mark_freed */
408static void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th,
409			  struct inode *inode, b_blocknr_t block) {
410    RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device");
411    RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block");
412    BUG_ON (!th->t_trans_id);
413    _reiserfs_free_block(th, inode, block, 1) ;
414}
415
416static void __discard_prealloc (struct reiserfs_transaction_handle * th,
417				struct reiserfs_inode_info *ei)
418{
419    unsigned long save = ei->i_prealloc_block ;
420    int dirty = 0;
421    struct inode *inode = &ei->vfs_inode;
422    BUG_ON (!th->t_trans_id);
423#ifdef CONFIG_REISERFS_CHECK
424    if (ei->i_prealloc_count < 0)
425	reiserfs_warning (th->t_super, "zam-4001:%s: inode has negative prealloc blocks count.", __FUNCTION__ );
426#endif
427    while (ei->i_prealloc_count > 0) {
428	reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block);
429	ei->i_prealloc_block++;
430	ei->i_prealloc_count --;
431	dirty = 1;
432    }
433    if (dirty)
434    	reiserfs_update_sd(th, inode);
435    ei->i_prealloc_block = save;
436    list_del_init(&(ei->i_prealloc_list));
437}
438
439/* FIXME: It should be inline function */
440void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th,
441				struct inode *inode)
442{
443    struct reiserfs_inode_info *ei = REISERFS_I(inode);
444    BUG_ON (!th->t_trans_id);
445    if (ei->i_prealloc_count)
446	__discard_prealloc(th, ei);
447}
448
449void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th)
450{
451    struct list_head * plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
452
453    BUG_ON (!th->t_trans_id);
454
455    while (!list_empty(plist)) {
456	struct reiserfs_inode_info *ei;
457	ei = list_entry(plist->next, struct reiserfs_inode_info, i_prealloc_list);
458#ifdef CONFIG_REISERFS_CHECK
459	if (!ei->i_prealloc_count) {
460	    reiserfs_warning (th->t_super, "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.", __FUNCTION__);
461	}
462#endif
463	__discard_prealloc(th, ei);
464    }
465}
466
467void reiserfs_init_alloc_options (struct super_block *s)
468{
469    set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
470    set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
471    set_bit (_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
472}
473
474/* block allocator related options are parsed here */
475int reiserfs_parse_alloc_options(struct super_block * s, char * options)
476{
477    char * this_char, * value;
478
479    REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */
480
481    while ( (this_char = strsep (&options, ":")) != NULL ) {
482	if ((value = strchr (this_char, '=')) != NULL)
483	    *value++ = 0;
484
485	if (!strcmp(this_char, "concentrating_formatted_nodes")) {
486	    int temp;
487	    SET_OPTION(concentrating_formatted_nodes);
488	    temp = (value && *value) ? simple_strtoul (value, &value, 0) : 10;
489	    if (temp <= 0 || temp > 100) {
490		REISERFS_SB(s)->s_alloc_options.border = 10;
491	    } else {
492		REISERFS_SB(s)->s_alloc_options.border = 100 / temp;
493	   }
494	    continue;
495	}
496	if (!strcmp(this_char, "displacing_large_files")) {
497	    SET_OPTION(displacing_large_files);
498	    REISERFS_SB(s)->s_alloc_options.large_file_size =
499		(value && *value) ? simple_strtoul (value, &value, 0) : 16;
500	    continue;
501	}
502	if (!strcmp(this_char, "displacing_new_packing_localities")) {
503	    SET_OPTION(displacing_new_packing_localities);
504	    continue;
505	};
506
507	if (!strcmp(this_char, "old_hashed_relocation")) {
508	    SET_OPTION(old_hashed_relocation);
509	    continue;
510	}
511
512	if (!strcmp(this_char, "new_hashed_relocation")) {
513	    SET_OPTION(new_hashed_relocation);
514	    continue;
515	}
516
517        if (!strcmp(this_char, "dirid_groups")) {
518	    SET_OPTION(dirid_groups);
519	    continue;
520        }
521        if (!strcmp(this_char, "oid_groups")) {
522	    SET_OPTION(oid_groups);
523	    continue;
524        }
525        if (!strcmp(this_char, "packing_groups")) {
526	    SET_OPTION(packing_groups);
527	    continue;
528        }
529	if (!strcmp(this_char, "hashed_formatted_nodes")) {
530	    SET_OPTION(hashed_formatted_nodes);
531	    continue;
532	}
533
534	if (!strcmp(this_char, "skip_busy")) {
535	    SET_OPTION(skip_busy);
536	    continue;
537	}
538
539	if (!strcmp(this_char, "hundredth_slices")) {
540	    SET_OPTION(hundredth_slices);
541	    continue;
542	}
543
544	if (!strcmp(this_char, "old_way")) {
545	    SET_OPTION(old_way);
546	    continue;
547	}
548
549	if (!strcmp(this_char, "displace_based_on_dirid")) {
550	    SET_OPTION(displace_based_on_dirid);
551	    continue;
552	}
553
554	if (!strcmp(this_char, "preallocmin")) {
555	    REISERFS_SB(s)->s_alloc_options.preallocmin =
556		(value && *value) ? simple_strtoul (value, &value, 0) : 4;
557	    continue;
558	}
559
560	if (!strcmp(this_char, "preallocsize")) {
561	    REISERFS_SB(s)->s_alloc_options.preallocsize =
562		(value && *value) ? simple_strtoul (value, &value, 0) : PREALLOCATION_SIZE;
563	    continue;
564	}
565
566	reiserfs_warning (s, "zam-4001: %s : unknown option - %s",
567			  __FUNCTION__ , this_char);
568	return 1;
569      }
570
571    reiserfs_warning (s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
572    return 0;
573}
574
575static inline void new_hashed_relocation (reiserfs_blocknr_hint_t * hint)
576{
577    char * hash_in;
578    if (hint->formatted_node) {
579	    hash_in = (char*)&hint->key.k_dir_id;
580    } else {
581	if (!hint->inode) {
582	    //hint->search_start = hint->beg;
583	    hash_in = (char*)&hint->key.k_dir_id;
584	} else
585	    if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
586		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
587	    else
588		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
589      }
590
591    hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
592}
593
594/*
595 * Relocation based on dirid, hashing them into a given bitmap block
596 * files. Formatted nodes are unaffected, a seperate policy covers them
597 */
598static void
599dirid_groups (reiserfs_blocknr_hint_t *hint)
600{
601    unsigned long hash;
602    __u32 dirid = 0;
603    int bm = 0;
604    struct super_block *sb = hint->th->t_super;
605    if (hint->inode)
606	dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
607    else if (hint->formatted_node)
608        dirid = hint->key.k_dir_id;
609
610    if (dirid) {
611	bm = bmap_hash_id(sb, dirid);
612	hash = bm * (sb->s_blocksize << 3);
613	/* give a portion of the block group to metadata */
614	if (hint->inode)
615	    hash += sb->s_blocksize/2;
616	hint->search_start = hash;
617    }
618}
619
620/*
621 * Relocation based on oid, hashing them into a given bitmap block
622 * files. Formatted nodes are unaffected, a seperate policy covers them
623 */
624static void
625oid_groups (reiserfs_blocknr_hint_t *hint)
626{
627    if (hint->inode) {
628	unsigned long hash;
629	__u32 oid;
630	__u32 dirid;
631	int bm;
632
633	dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
634
635	/* keep the root dir and it's first set of subdirs close to
636	 * the start of the disk
637	 */
638	if (dirid <= 2)
639	    hash = (hint->inode->i_sb->s_blocksize << 3);
640	else {
641	    oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
642	    bm = bmap_hash_id(hint->inode->i_sb, oid);
643	    hash = bm * (hint->inode->i_sb->s_blocksize << 3);
644	}
645	hint->search_start = hash;
646    }
647}
648
649/* returns 1 if it finds an indirect item and gets valid hint info
650 * from it, otherwise 0
651 */
652static int get_left_neighbor(reiserfs_blocknr_hint_t *hint)
653{
654    struct path * path;
655    struct buffer_head * bh;
656    struct item_head * ih;
657    int pos_in_item;
658    __u32 * item;
659    int ret = 0;
660
661    if (!hint->path)		/* reiserfs code can call this function w/o pointer to path
662				 * structure supplied; then we rely on supplied search_start */
663	return 0;
664
665    path = hint->path;
666    bh = get_last_bh(path);
667    RFALSE( !bh, "green-4002: Illegal path specified to get_left_neighbor");
668    ih = get_ih(path);
669    pos_in_item = path->pos_in_item;
670    item = get_item (path);
671
672    hint->search_start = bh->b_blocknr;
673
674    if (!hint->formatted_node && is_indirect_le_ih (ih)) {
675	/* for indirect item: go to left and look for the first non-hole entry
676	   in the indirect item */
677	if (pos_in_item == I_UNFM_NUM (ih))
678	    pos_in_item--;
679//	    pos_in_item = I_UNFM_NUM (ih) - 1;
680	while (pos_in_item >= 0) {
681	    int t=get_block_num(item,pos_in_item);
682	    if (t) {
683		hint->search_start = t;
684		ret = 1;
685		break;
686	    }
687	    pos_in_item --;
688	}
689    }
690
691    /* does result value fit into specified region? */
692    return ret;
693}
694
695/* should be, if formatted node, then try to put on first part of the device
696   specified as number of percent with mount option device, else try to put
697   on last of device.  This is not to say it is good code to do so,
698   but the effect should be measured.  */
699static inline void set_border_in_hint(struct super_block *s, reiserfs_blocknr_hint_t *hint)
700{
701    b_blocknr_t border = SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;
702
703    if (hint->formatted_node)
704	hint->end = border - 1;
705    else
706	hint->beg = border;
707}
708
709static inline void displace_large_file(reiserfs_blocknr_hint_t *hint)
710{
711    if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
712	hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), 4) % (hint->end - hint->beg);
713    else
714	hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), 4) % (hint->end - hint->beg);
715}
716
717static inline void hash_formatted_node(reiserfs_blocknr_hint_t *hint)
718{
719   char * hash_in;
720
721   if (!hint->inode)
722	hash_in = (char*)&hint->key.k_dir_id;
723    else if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
724	hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
725    else
726	hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
727
728	hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
729}
730
731static inline int this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *hint)
732{
733    return hint->block == REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size;
734}
735
736#ifdef DISPLACE_NEW_PACKING_LOCALITIES
737static inline void displace_new_packing_locality (reiserfs_blocknr_hint_t *hint)
738{
739    struct in_core_key * key = &hint->key;
740
741    hint->th->displace_new_blocks = 0;
742    hint->search_start = hint->beg + keyed_hash((char*)(&key->k_objectid),4) % (hint->end - hint->beg);
743}
744  #endif
745
746static inline int old_hashed_relocation (reiserfs_blocknr_hint_t * hint)
747{
748    b_blocknr_t border;
749    u32 hash_in;
750
751    if (hint->formatted_node || hint->inode == NULL) {
752	return 0;
753      }
754
755    hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
756    border = hint->beg + (u32) keyed_hash(((char *) (&hash_in)), 4) % (hint->end - hint->beg - 1);
757    if (border > hint->search_start)
758	hint->search_start = border;
759
760    return 1;
761  }
762
763static inline int old_way (reiserfs_blocknr_hint_t * hint)
764{
765    b_blocknr_t border;
766
767    if (hint->formatted_node || hint->inode == NULL) {
768	return 0;
769    }
770
771      border = hint->beg + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end  - hint->beg);
772    if (border > hint->search_start)
773	hint->search_start = border;
774
775    return 1;
776}
777
778static inline void hundredth_slices (reiserfs_blocknr_hint_t * hint)
779{
780    struct in_core_key * key = &hint->key;
781    b_blocknr_t slice_start;
782
783    slice_start = (keyed_hash((char*)(&key->k_dir_id),4) % 100) * (hint->end / 100);
784    if ( slice_start > hint->search_start || slice_start + (hint->end / 100) <= hint->search_start) {
785	hint->search_start = slice_start;
786    }
787}
788
789static void determine_search_start(reiserfs_blocknr_hint_t *hint,
790					  int amount_needed)
791{
792    struct super_block *s = hint->th->t_super;
793    int unfm_hint;
794
795    hint->beg = 0;
796    hint->end = SB_BLOCK_COUNT(s) - 1;
797
798    /* This is former border algorithm. Now with tunable border offset */
799    if (concentrating_formatted_nodes(s))
800	set_border_in_hint(s, hint);
801
802#ifdef DISPLACE_NEW_PACKING_LOCALITIES
803    /* whenever we create a new directory, we displace it.  At first we will
804       hash for location, later we might look for a moderately empty place for
805       it */
806    if (displacing_new_packing_localities(s)
807	&& hint->th->displace_new_blocks) {
808	displace_new_packing_locality(hint);
809
810	/* we do not continue determine_search_start,
811	 * if new packing locality is being displaced */
812	return;
813    }
814#endif
815
816    /* all persons should feel encouraged to add more special cases here and
817     * test them */
818
819    if (displacing_large_files(s) && !hint->formatted_node
820	&& this_blocknr_allocation_would_make_it_a_large_file(hint)) {
821	displace_large_file(hint);
822	return;
823    }
824
825    /* if none of our special cases is relevant, use the left neighbor in the
826       tree order of the new node we are allocating for */
827    if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) {
828        hash_formatted_node(hint);
829	return;
830    }
831
832    unfm_hint = get_left_neighbor(hint);
833
834    /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
835       new blocks are displaced based on directory ID. Also, if suggested search_start
836       is less than last preallocated block, we start searching from it, assuming that
837       HDD dataflow is faster in forward direction */
838    if ( TEST_OPTION(old_way, s)) {
839	if (!hint->formatted_node) {
840	    if ( !reiserfs_hashed_relocation(s))
841		old_way(hint);
842	    else if (!reiserfs_no_unhashed_relocation(s))
843		old_hashed_relocation(hint);
844
845	    if ( hint->inode && hint->search_start < REISERFS_I(hint->inode)->i_prealloc_block)
846		hint->search_start = REISERFS_I(hint->inode)->i_prealloc_block;
847	}
848	return;
849    }
850
851    /* This is an approach proposed by Hans */
852    if ( TEST_OPTION(hundredth_slices, s) && ! (displacing_large_files(s) && !hint->formatted_node)) {
853	hundredth_slices(hint);
854	return;
855    }
856
857    /* old_hashed_relocation only works on unformatted */
858    if (!unfm_hint && !hint->formatted_node &&
859        TEST_OPTION(old_hashed_relocation, s))
860    {
861	old_hashed_relocation(hint);
862    }
863    /* new_hashed_relocation works with both formatted/unformatted nodes */
864    if ((!unfm_hint || hint->formatted_node) &&
865        TEST_OPTION(new_hashed_relocation, s))
866    {
867	new_hashed_relocation(hint);
868    }
869    /* dirid grouping works only on unformatted nodes */
870    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s))
871    {
872        dirid_groups(hint);
873    }
874
875#ifdef DISPLACE_NEW_PACKING_LOCALITIES
876    if (hint->formatted_node && TEST_OPTION(dirid_groups,s))
877    {
878        dirid_groups(hint);
879    }
880#endif
881
882    /* oid grouping works only on unformatted nodes */
883    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s))
884    {
885        oid_groups(hint);
886    }
887    return;
888}
889
890static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
891{
892    /* make minimum size a mount option and benchmark both ways */
893    /* we preallocate blocks only for regular files, specific size */
894    /* benchmark preallocating always and see what happens */
895
896    hint->prealloc_size = 0;
897
898    if (!hint->formatted_node && hint->preallocate) {
899	if (S_ISREG(hint->inode->i_mode)
900	    && hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocmin * hint->inode->i_sb->s_blocksize)
901	    hint->prealloc_size = REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocsize - 1;
902    }
903    return CARRY_ON;
904}
905
906/* XXX I know it could be merged with upper-level function;
907   but may be result function would be too complex. */
908static inline int allocate_without_wrapping_disk (reiserfs_blocknr_hint_t * hint,
909					 b_blocknr_t * new_blocknrs,
910					 b_blocknr_t start, b_blocknr_t finish,
911					 int min,
912					 int amount_needed, int prealloc_size)
913{
914    int rest = amount_needed;
915    int nr_allocated;
916
917    while (rest > 0 && start <= finish) {
918	nr_allocated = scan_bitmap (hint->th, &start, finish, min,
919				    rest + prealloc_size, !hint->formatted_node,
920				    hint->block);
921
922	if (nr_allocated == 0)	/* no new blocks allocated, return */
923	    break;
924
925	/* fill free_blocknrs array first */
926	while (rest > 0 && nr_allocated > 0) {
927	    * new_blocknrs ++ = start ++;
928	    rest --; nr_allocated --;
929	}
930
931	/* do we have something to fill prealloc. array also ? */
932	if (nr_allocated > 0) {
933	    /* it means prealloc_size was greater that 0 and we do preallocation */
934	    list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
935		     &SB_JOURNAL(hint->th->t_super)->j_prealloc_list);
936	    REISERFS_I(hint->inode)->i_prealloc_block = start;
937	    REISERFS_I(hint->inode)->i_prealloc_count = nr_allocated;
938	    break;
939	}
940    }
941
942    return (amount_needed - rest);
943}
944
945static inline int blocknrs_and_prealloc_arrays_from_search_start
946    (reiserfs_blocknr_hint_t *hint, b_blocknr_t *new_blocknrs, int amount_needed)
947{
948    struct super_block *s = hint->th->t_super;
949    b_blocknr_t start = hint->search_start;
950    b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
951    int passno = 0;
952    int nr_allocated = 0;
953    int bigalloc = 0;
954
955    determine_prealloc_size(hint);
956    if (!hint->formatted_node) {
957        int quota_ret;
958#ifdef REISERQUOTA_DEBUG
959	reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating %d blocks id=%u", amount_needed, hint->inode->i_uid);
960#endif
961	quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
962	if (quota_ret)    /* Quota exceeded? */
963	    return QUOTA_EXCEEDED;
964	if (hint->preallocate && hint->prealloc_size ) {
965#ifdef REISERQUOTA_DEBUG
966	    reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid);
967#endif
968	    quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size);
969	    if (quota_ret)
970		hint->preallocate=hint->prealloc_size=0;
971	}
972	/* for unformatted nodes, force large allocations */
973	bigalloc = amount_needed;
974    }
975
976    do {
977	/* in bigalloc mode, nr_allocated should stay zero until
978	 * the entire allocation is filled
979	 */
980	if (unlikely(bigalloc && nr_allocated)) {
981	    reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n",
982	    bigalloc, nr_allocated);
983	    /* reset things to a sane value */
984	    bigalloc = amount_needed - nr_allocated;
985	}
986	/*
987	 * try pass 0 and pass 1 looking for a nice big
988	 * contiguous allocation.  Then reset and look
989	 * for anything you can find.
990	 */
991	if (passno == 2 && bigalloc) {
992	    passno = 0;
993	    bigalloc = 0;
994	}
995	switch (passno++) {
996        case 0: /* Search from hint->search_start to end of disk */
997	    start = hint->search_start;
998	    finish = SB_BLOCK_COUNT(s) - 1;
999	    break;
1000        case 1: /* Search from hint->beg to hint->search_start */
1001	    start = hint->beg;
1002	    finish = hint->search_start;
1003	    break;
1004	case 2: /* Last chance: Search from 0 to hint->beg */
1005	    start = 0;
1006	    finish = hint->beg;
1007	    break;
1008	default: /* We've tried searching everywhere, not enough space */
1009	    /* Free the blocks */
1010	    if (!hint->formatted_node) {
1011#ifdef REISERQUOTA_DEBUG
1012		reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (nospace) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid);
1013#endif
1014		DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated);     /* Free not allocated blocks */
1015	    }
1016  	    while (nr_allocated --)
1017		reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node);
1018
1019	    return NO_DISK_SPACE;
1020	}
1021    } while ((nr_allocated += allocate_without_wrapping_disk (hint,
1022			    new_blocknrs + nr_allocated, start, finish,
1023			    bigalloc ? bigalloc : 1,
1024			    amount_needed - nr_allocated,
1025			    hint->prealloc_size))
1026			< amount_needed);
1027    if ( !hint->formatted_node &&
1028         amount_needed + hint->prealloc_size >
1029	 nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
1030    /* Some of preallocation blocks were not allocated */
1031#ifdef REISERQUOTA_DEBUG
1032	reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (failed prealloc) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid);
1033#endif
1034	DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed +
1035	                         hint->prealloc_size - nr_allocated -
1036				 REISERFS_I(hint->inode)->i_prealloc_count);
1037    }
1038
1039    return CARRY_ON;
1040}
1041
1042/* grab new blocknrs from preallocated list */
1043/* return amount still needed after using them */
1044static int use_preallocated_list_if_available (reiserfs_blocknr_hint_t *hint,
1045					       b_blocknr_t *new_blocknrs, int amount_needed)
1046{
1047    struct inode * inode = hint->inode;
1048
1049    if (REISERFS_I(inode)->i_prealloc_count > 0) {
1050	while (amount_needed) {
1051
1052	    *new_blocknrs ++ = REISERFS_I(inode)->i_prealloc_block ++;
1053	    REISERFS_I(inode)->i_prealloc_count --;
1054
1055	    amount_needed --;
1056
1057	    if (REISERFS_I(inode)->i_prealloc_count <= 0) {
1058		list_del(&REISERFS_I(inode)->i_prealloc_list);
1059		break;
1060	    }
1061	}
1062      }
1063    /* return amount still needed after using preallocated blocks */
1064    return amount_needed;
1065}
1066
1067int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
1068			       b_blocknr_t * new_blocknrs, int amount_needed,
1069			       int reserved_by_us /* Amount of blocks we have
1070						      already reserved */)
1071{
1072    int initial_amount_needed = amount_needed;
1073    int ret;
1074    struct super_block *s = hint->th->t_super;
1075
1076    /* Check if there is enough space, taking into account reserved space */
1077    if ( SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
1078	 amount_needed - reserved_by_us)
1079        return NO_DISK_SPACE;
1080    /* should this be if !hint->inode &&  hint->preallocate? */
1081    /* do you mean hint->formatted_node can be removed ? - Zam */
1082    /* hint->formatted_node cannot be removed because we try to access
1083       inode information here, and there is often no inode assotiated with
1084       metadata allocations - green */
1085
1086    if (!hint->formatted_node && hint->preallocate) {
1087	amount_needed = use_preallocated_list_if_available
1088	    (hint, new_blocknrs, amount_needed);
1089	if (amount_needed == 0)	/* all blocknrs we need we got from
1090                                   prealloc. list */
1091	    return CARRY_ON;
1092	new_blocknrs += (initial_amount_needed - amount_needed);
1093    }
1094
1095    /* find search start and save it in hint structure */
1096    determine_search_start(hint, amount_needed);
1097    if (hint->search_start >= SB_BLOCK_COUNT(s))
1098        hint->search_start = SB_BLOCK_COUNT(s) - 1;
1099
1100    /* allocation itself; fill new_blocknrs and preallocation arrays */
1101    ret = blocknrs_and_prealloc_arrays_from_search_start
1102	(hint, new_blocknrs, amount_needed);
1103
1104    /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we
1105     * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second
1106     * variant) */
1107
1108    if (ret != CARRY_ON) {
1109	while (amount_needed ++ < initial_amount_needed) {
1110	    reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1);
1111	}
1112    }
1113    return ret;
1114}
1115
1116/* These 2 functions are here to provide blocks reservation to the rest of kernel */
1117/* Reserve @blocks amount of blocks in fs pointed by @sb. Caller must make sure
1118   there are actually this much blocks on the FS available */
1119void reiserfs_claim_blocks_to_be_allocated(
1120				      struct super_block *sb, /* super block of
1121							        filesystem where
1122								blocks should be
1123								reserved */
1124				      int blocks /* How much to reserve */
1125					  )
1126{
1127
1128    /* Fast case, if reservation is zero - exit immediately. */
1129    if ( !blocks )
1130	return;
1131
1132    spin_lock(&REISERFS_SB(sb)->bitmap_lock);
1133    REISERFS_SB(sb)->reserved_blocks += blocks;
1134    spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
1135}
1136
1137/* Unreserve @blocks amount of blocks in fs pointed by @sb */
1138void reiserfs_release_claimed_blocks(
1139				struct super_block *sb, /* super block of
1140							  filesystem where
1141							  blocks should be
1142							  reserved */
1143				int blocks /* How much to unreserve */
1144					  )
1145{
1146
1147    /* Fast case, if unreservation is zero - exit immediately. */
1148    if ( !blocks )
1149	return;
1150
1151    spin_lock(&REISERFS_SB(sb)->bitmap_lock);
1152    REISERFS_SB(sb)->reserved_blocks -= blocks;
1153    spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
1154    RFALSE( REISERFS_SB(sb)->reserved_blocks < 0, "amount of blocks reserved became zero?");
1155}
1156
1157/* This function estimates how much pages we will be able to write to FS
1158   used for reiserfs_file_write() purposes for now. */
1159int reiserfs_can_fit_pages ( struct super_block *sb /* superblock of filesystem
1160						       to estimate space */ )
1161{
1162	int space;
1163
1164	spin_lock(&REISERFS_SB(sb)->bitmap_lock);
1165	space = (SB_FREE_BLOCKS(sb) - REISERFS_SB(sb)->reserved_blocks) >> ( PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
1166	spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
1167
1168	return space>0?space:0;
1169}
1170