stree.c revision 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2
1/*
2 *  Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5/*
6 *  Written by Anatoly P. Pinchuk pap@namesys.botik.ru
7 *  Programm System Institute
8 *  Pereslavl-Zalessky Russia
9 */
10
11/*
12 *  This file contains functions dealing with S+tree
13 *
14 * B_IS_IN_TREE
15 * copy_item_head
16 * comp_short_keys
17 * comp_keys
18 * comp_short_le_keys
19 * le_key2cpu_key
20 * comp_le_keys
21 * bin_search
22 * get_lkey
23 * get_rkey
24 * key_in_buffer
25 * decrement_bcount
26 * decrement_counters_in_path
27 * reiserfs_check_path
28 * pathrelse_and_restore
29 * pathrelse
30 * search_by_key_reada
31 * search_by_key
32 * search_for_position_by_key
33 * comp_items
34 * prepare_for_direct_item
35 * prepare_for_direntry_item
36 * prepare_for_delete_or_cut
37 * calc_deleted_bytes_number
38 * init_tb_struct
39 * padd_item
40 * reiserfs_delete_item
41 * reiserfs_delete_solid_item
42 * reiserfs_delete_object
43 * maybe_indirect_to_direct
44 * indirect_to_direct_roll_back
45 * reiserfs_cut_from_item
46 * truncate_directory
47 * reiserfs_do_truncate
48 * reiserfs_paste_into_item
49 * reiserfs_insert_item
50 */
51
52#include <linux/config.h>
53#include <linux/time.h>
54#include <linux/string.h>
55#include <linux/pagemap.h>
56#include <linux/reiserfs_fs.h>
57#include <linux/smp_lock.h>
58#include <linux/buffer_head.h>
59#include <linux/quotaops.h>
60
61/* Does the buffer contain a disk block which is in the tree. */
62inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh)
63{
64
65  RFALSE( B_LEVEL (p_s_bh) > MAX_HEIGHT,
66	  "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh);
67
68  return ( B_LEVEL (p_s_bh) != FREE_LEVEL );
69}
70
71//
72// to gets item head in le form
73//
74inline void copy_item_head(struct item_head * p_v_to,
75			   const struct item_head * p_v_from)
76{
77  memcpy (p_v_to, p_v_from, IH_SIZE);
78}
79
80
81/* k1 is pointer to on-disk structure which is stored in little-endian
82   form. k2 is pointer to cpu variable. For key of items of the same
83   object this returns 0.
84   Returns: -1 if key1 < key2
85   0 if key1 == key2
86   1 if key1 > key2 */
87inline int  comp_short_keys (const struct reiserfs_key * le_key,
88			     const struct cpu_key * cpu_key)
89{
90  __u32 * p_s_le_u32, * p_s_cpu_u32;
91  int n_key_length = REISERFS_SHORT_KEY_LEN;
92
93  p_s_le_u32 = (__u32 *)le_key;
94  p_s_cpu_u32 = (__u32 *)&cpu_key->on_disk_key;
95  for( ; n_key_length--; ++p_s_le_u32, ++p_s_cpu_u32 ) {
96    if ( le32_to_cpu (*p_s_le_u32) < *p_s_cpu_u32 )
97      return -1;
98    if ( le32_to_cpu (*p_s_le_u32) > *p_s_cpu_u32 )
99      return 1;
100  }
101
102  return 0;
103}
104
105
106/* k1 is pointer to on-disk structure which is stored in little-endian
107   form. k2 is pointer to cpu variable.
108   Compare keys using all 4 key fields.
109   Returns: -1 if key1 < key2 0
110   if key1 = key2 1 if key1 > key2 */
111static inline int  comp_keys (const struct reiserfs_key * le_key, const struct cpu_key * cpu_key)
112{
113  int retval;
114
115  retval = comp_short_keys (le_key, cpu_key);
116  if (retval)
117      return retval;
118  if (le_key_k_offset (le_key_version(le_key), le_key) < cpu_key_k_offset (cpu_key))
119      return -1;
120  if (le_key_k_offset (le_key_version(le_key), le_key) > cpu_key_k_offset (cpu_key))
121      return 1;
122
123  if (cpu_key->key_length == 3)
124      return 0;
125
126  /* this part is needed only when tail conversion is in progress */
127  if (le_key_k_type (le_key_version(le_key), le_key) < cpu_key_k_type (cpu_key))
128    return -1;
129
130  if (le_key_k_type (le_key_version(le_key), le_key) > cpu_key_k_type (cpu_key))
131    return 1;
132
133  return 0;
134}
135
136
137inline int comp_short_le_keys (const struct reiserfs_key * key1, const struct reiserfs_key * key2)
138{
139  __u32 * p_s_1_u32, * p_s_2_u32;
140  int n_key_length = REISERFS_SHORT_KEY_LEN;
141
142  p_s_1_u32 = (__u32 *)key1;
143  p_s_2_u32 = (__u32 *)key2;
144  for( ; n_key_length--; ++p_s_1_u32, ++p_s_2_u32 ) {
145    if ( le32_to_cpu (*p_s_1_u32) < le32_to_cpu (*p_s_2_u32) )
146      return -1;
147    if ( le32_to_cpu (*p_s_1_u32) > le32_to_cpu (*p_s_2_u32) )
148      return 1;
149  }
150  return 0;
151}
152
153inline void le_key2cpu_key (struct cpu_key * to, const struct reiserfs_key * from)
154{
155    to->on_disk_key.k_dir_id = le32_to_cpu (from->k_dir_id);
156    to->on_disk_key.k_objectid = le32_to_cpu (from->k_objectid);
157
158    // find out version of the key
159    to->version = le_key_version (from);
160    if (to->version == KEY_FORMAT_3_5) {
161	to->on_disk_key.u.k_offset_v1.k_offset = le32_to_cpu (from->u.k_offset_v1.k_offset);
162	to->on_disk_key.u.k_offset_v1.k_uniqueness = le32_to_cpu (from->u.k_offset_v1.k_uniqueness);
163    } else {
164	to->on_disk_key.u.k_offset_v2.k_offset = offset_v2_k_offset(&from->u.k_offset_v2);
165	to->on_disk_key.u.k_offset_v2.k_type = offset_v2_k_type(&from->u.k_offset_v2);
166    }
167}
168
169
170
171// this does not say which one is bigger, it only returns 1 if keys
172// are not equal, 0 otherwise
173inline int comp_le_keys (const struct reiserfs_key * k1, const struct reiserfs_key * k2)
174{
175    return memcmp (k1, k2, sizeof (struct reiserfs_key));
176}
177
178/**************************************************************************
179 *  Binary search toolkit function                                        *
180 *  Search for an item in the array by the item key                       *
181 *  Returns:    1 if found,  0 if not found;                              *
182 *        *p_n_pos = number of the searched element if found, else the    *
183 *        number of the first element that is larger than p_v_key.        *
184 **************************************************************************/
185/* For those not familiar with binary search: n_lbound is the leftmost item that it
186 could be, n_rbound the rightmost item that it could be.  We examine the item
187 halfway between n_lbound and n_rbound, and that tells us either that we can increase
188 n_lbound, or decrease n_rbound, or that we have found it, or if n_lbound <= n_rbound that
189 there are no possible items, and we have not found it. With each examination we
190 cut the number of possible items it could be by one more than half rounded down,
191 or we find it. */
192static inline	int bin_search (
193              const void * p_v_key, /* Key to search for.                   */
194	      const void * p_v_base,/* First item in the array.             */
195	      int       p_n_num,    /* Number of items in the array.        */
196	      int       p_n_width,  /* Item size in the array.
197				       searched. Lest the reader be
198				       confused, note that this is crafted
199				       as a general function, and when it
200				       is applied specifically to the array
201				       of item headers in a node, p_n_width
202				       is actually the item header size not
203				       the item size.                      */
204	      int     * p_n_pos     /* Number of the searched for element. */
205            ) {
206    int   n_rbound, n_lbound, n_j;
207
208   for ( n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0))/2; n_lbound <= n_rbound; n_j = (n_rbound + n_lbound)/2 )
209     switch( comp_keys((struct reiserfs_key *)((char * )p_v_base + n_j * p_n_width), (struct cpu_key *)p_v_key) )  {
210     case -1: n_lbound = n_j + 1; continue;
211     case  1: n_rbound = n_j - 1; continue;
212     case  0: *p_n_pos = n_j;     return ITEM_FOUND; /* Key found in the array.  */
213        }
214
215    /* bin_search did not find given key, it returns position of key,
216        that is minimal and greater than the given one. */
217    *p_n_pos = n_lbound;
218    return ITEM_NOT_FOUND;
219}
220
221#ifdef CONFIG_REISERFS_CHECK
222extern struct tree_balance * cur_tb;
223#endif
224
225
226
227/* Minimal possible key. It is never in the tree. */
228const struct reiserfs_key  MIN_KEY = {0, 0, {{0, 0},}};
229
230/* Maximal possible key. It is never in the tree. */
231const struct reiserfs_key  MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff, 0xffffffff},}};
232
233
234/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom
235   of the path, and going upwards.  We must check the path's validity at each step.  If the key is not in
236   the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this
237   case we return a special key, either MIN_KEY or MAX_KEY. */
238static inline	const struct  reiserfs_key * get_lkey  (
239	                const struct path         * p_s_chk_path,
240                        const struct super_block  * p_s_sb
241                      ) {
242  int                   n_position, n_path_offset = p_s_chk_path->path_length;
243  struct buffer_head  * p_s_parent;
244
245  RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET,
246	  "PAP-5010: invalid offset in the path");
247
248  /* While not higher in path than first element. */
249  while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) {
250
251    RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)),
252	    "PAP-5020: parent is not uptodate");
253
254    /* Parent at the path is not in the tree now. */
255    if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) )
256      return &MAX_KEY;
257    /* Check whether position in the parent is correct. */
258    if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) )
259       return &MAX_KEY;
260    /* Check whether parent at the path really points to the child. */
261    if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
262	 PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr )
263      return &MAX_KEY;
264    /* Return delimiting key if position in the parent is not equal to zero. */
265    if ( n_position )
266      return B_N_PDELIM_KEY(p_s_parent, n_position - 1);
267  }
268  /* Return MIN_KEY if we are in the root of the buffer tree. */
269  if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
270       SB_ROOT_BLOCK (p_s_sb) )
271    return &MIN_KEY;
272  return  &MAX_KEY;
273}
274
275
276/* Get delimiting key of the buffer at the path and its right neighbor. */
277inline	const struct  reiserfs_key * get_rkey  (
278	                const struct path         * p_s_chk_path,
279                        const struct super_block  * p_s_sb
280                      ) {
281  int                   n_position,
282    			n_path_offset = p_s_chk_path->path_length;
283  struct buffer_head  * p_s_parent;
284
285  RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET,
286	  "PAP-5030: invalid offset in the path");
287
288  while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) {
289
290    RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)),
291	    "PAP-5040: parent is not uptodate");
292
293    /* Parent at the path is not in the tree now. */
294    if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) )
295      return &MIN_KEY;
296    /* Check whether position in the parent is correct. */
297    if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) )
298      return &MIN_KEY;
299    /* Check whether parent at the path really points to the child. */
300    if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
301                                        PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr )
302      return &MIN_KEY;
303    /* Return delimiting key if position in the parent is not the last one. */
304    if ( n_position != B_NR_ITEMS(p_s_parent) )
305      return B_N_PDELIM_KEY(p_s_parent, n_position);
306  }
307  /* Return MAX_KEY if we are in the root of the buffer tree. */
308  if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
309       SB_ROOT_BLOCK (p_s_sb) )
310    return &MAX_KEY;
311  return  &MIN_KEY;
312}
313
314
315/* Check whether a key is contained in the tree rooted from a buffer at a path. */
316/* This works by looking at the left and right delimiting keys for the buffer in the last path_element in
317   the path.  These delimiting keys are stored at least one level above that buffer in the tree. If the
318   buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in
319   this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */
320static  inline  int key_in_buffer (
321                      struct path         * p_s_chk_path, /* Path which should be checked.  */
322                      const struct cpu_key      * p_s_key,      /* Key which should be checked.   */
323                      struct super_block  * p_s_sb        /* Super block pointer.           */
324		      ) {
325
326  RFALSE( ! p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET ||
327	  p_s_chk_path->path_length > MAX_HEIGHT,
328	  "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
329	  p_s_key, p_s_chk_path->path_length);
330  RFALSE( !PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev,
331	  "PAP-5060: device must not be NODEV");
332
333  if ( comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1 )
334    /* left delimiting key is bigger, that the key we look for */
335    return 0;
336  //  if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 )
337  if ( comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1 )
338    /* p_s_key must be less than right delimitiing key */
339    return 0;
340  return 1;
341}
342
343
344inline void decrement_bcount(
345              struct buffer_head  * p_s_bh
346            ) {
347  if ( p_s_bh ) {
348    if ( atomic_read (&(p_s_bh->b_count)) ) {
349      put_bh(p_s_bh) ;
350      return;
351    }
352    reiserfs_panic(NULL, "PAP-5070: decrement_bcount: trying to free free buffer %b", p_s_bh);
353  }
354}
355
356
357/* Decrement b_count field of the all buffers in the path. */
358void decrement_counters_in_path (
359              struct path * p_s_search_path
360            ) {
361  int n_path_offset = p_s_search_path->path_length;
362
363  RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET ||
364	  n_path_offset > EXTENDED_MAX_HEIGHT - 1,
365	  "PAP-5080: invalid path offset of %d", n_path_offset);
366
367  while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) {
368    struct buffer_head * bh;
369
370    bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--);
371    decrement_bcount (bh);
372  }
373  p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
374}
375
376
377int reiserfs_check_path(struct path *p) {
378  RFALSE( p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
379	  "path not properly relsed") ;
380  return 0 ;
381}
382
383
384/* Release all buffers in the path. Restore dirty bits clean
385** when preparing the buffer for the log
386**
387** only called from fix_nodes()
388*/
389void  pathrelse_and_restore (
390	struct super_block *s,
391        struct path * p_s_search_path
392      ) {
393  int n_path_offset = p_s_search_path->path_length;
394
395  RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
396	  "clm-4000: invalid path offset");
397
398  while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET )  {
399    reiserfs_restore_prepared_buffer(s, PATH_OFFSET_PBUFFER(p_s_search_path,
400                                     n_path_offset));
401    brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
402  }
403  p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
404}
405
406/* Release all buffers in the path. */
407void  pathrelse (
408        struct path * p_s_search_path
409      ) {
410  int n_path_offset = p_s_search_path->path_length;
411
412  RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
413	  "PAP-5090: invalid path offset");
414
415  while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET )
416    brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
417
418  p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
419}
420
421
422
423static int is_leaf (char * buf, int blocksize, struct buffer_head * bh)
424{
425    struct block_head * blkh;
426    struct item_head * ih;
427    int used_space;
428    int prev_location;
429    int i;
430    int nr;
431
432    blkh = (struct block_head *)buf;
433    if ( blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
434	reiserfs_warning (NULL, "is_leaf: this should be caught earlier");
435	return 0;
436    }
437
438    nr = blkh_nr_item(blkh);
439    if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
440	/* item number is too big or too small */
441	reiserfs_warning (NULL, "is_leaf: nr_item seems wrong: %z", bh);
442	return 0;
443    }
444    ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
445    used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location (ih));
446    if (used_space != blocksize - blkh_free_space(blkh)) {
447	/* free space does not match to calculated amount of use space */
448	reiserfs_warning (NULL, "is_leaf: free space seems wrong: %z", bh);
449	return 0;
450    }
451
452    // FIXME: it is_leaf will hit performance too much - we may have
453    // return 1 here
454
455    /* check tables of item heads */
456    ih = (struct item_head *)(buf + BLKH_SIZE);
457    prev_location = blocksize;
458    for (i = 0; i < nr; i ++, ih ++) {
459	if ( le_ih_k_type(ih) == TYPE_ANY) {
460	    reiserfs_warning (NULL, "is_leaf: wrong item type for item %h",ih);
461	    return 0;
462	}
463	if (ih_location (ih) >= blocksize || ih_location (ih) < IH_SIZE * nr) {
464	    reiserfs_warning (NULL, "is_leaf: item location seems wrong: %h", ih);
465	    return 0;
466	}
467	if (ih_item_len (ih) < 1 || ih_item_len (ih) > MAX_ITEM_LEN (blocksize)) {
468	    reiserfs_warning (NULL, "is_leaf: item length seems wrong: %h", ih);
469	    return 0;
470	}
471	if (prev_location - ih_location (ih) != ih_item_len (ih)) {
472	    reiserfs_warning (NULL, "is_leaf: item location seems wrong (second one): %h", ih);
473	    return 0;
474	}
475	prev_location = ih_location (ih);
476    }
477
478    // one may imagine much more checks
479    return 1;
480}
481
482
483/* returns 1 if buf looks like an internal node, 0 otherwise */
484static int is_internal (char * buf, int blocksize, struct buffer_head * bh)
485{
486    struct block_head * blkh;
487    int nr;
488    int used_space;
489
490    blkh = (struct block_head *)buf;
491    nr = blkh_level(blkh);
492    if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
493	/* this level is not possible for internal nodes */
494	reiserfs_warning (NULL, "is_internal: this should be caught earlier");
495	return 0;
496    }
497
498    nr = blkh_nr_item(blkh);
499    if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
500	/* for internal which is not root we might check min number of keys */
501	reiserfs_warning (NULL, "is_internal: number of key seems wrong: %z", bh);
502	return 0;
503    }
504
505    used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
506    if (used_space != blocksize - blkh_free_space(blkh)) {
507	reiserfs_warning (NULL, "is_internal: free space seems wrong: %z", bh);
508	return 0;
509    }
510
511    // one may imagine much more checks
512    return 1;
513}
514
515
516// make sure that bh contains formatted node of reiserfs tree of
517// 'level'-th level
518static int is_tree_node (struct buffer_head * bh, int level)
519{
520    if (B_LEVEL (bh) != level) {
521	reiserfs_warning (NULL, "is_tree_node: node level %d does not match to the expected one %d",
522		B_LEVEL (bh), level);
523	return 0;
524    }
525    if (level == DISK_LEAF_NODE_LEVEL)
526	return is_leaf (bh->b_data, bh->b_size, bh);
527
528    return is_internal (bh->b_data, bh->b_size, bh);
529}
530
531
532
533#define SEARCH_BY_KEY_READA 16
534
535/* The function is NOT SCHEDULE-SAFE! */
536static void search_by_key_reada (struct super_block * s,
537                                 struct buffer_head **bh,
538				 unsigned long *b, int num)
539{
540    int i,j;
541
542    for (i = 0 ; i < num ; i++) {
543	bh[i] = sb_getblk (s, b[i]);
544    }
545    for (j = 0 ; j < i ; j++) {
546	/*
547	 * note, this needs attention if we are getting rid of the BKL
548	 * you have to make sure the prepared bit isn't set on this buffer
549	 */
550	if (!buffer_uptodate(bh[j]))
551	    ll_rw_block(READA, 1, bh + j);
552    	brelse(bh[j]);
553    }
554}
555
556/**************************************************************************
557 * Algorithm   SearchByKey                                                *
558 *             look for item in the Disk S+Tree by its key                *
559 * Input:  p_s_sb   -  super block                                        *
560 *         p_s_key  - pointer to the key to search                        *
561 * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR                         *
562 *         p_s_search_path - path from the root to the needed leaf        *
563 **************************************************************************/
564
565/* This function fills up the path from the root to the leaf as it
566   descends the tree looking for the key.  It uses reiserfs_bread to
567   try to find buffers in the cache given their block number.  If it
568   does not find them in the cache it reads them from disk.  For each
569   node search_by_key finds using reiserfs_bread it then uses
570   bin_search to look through that node.  bin_search will find the
571   position of the block_number of the next node if it is looking
572   through an internal node.  If it is looking through a leaf node
573   bin_search will find the position of the item which has key either
574   equal to given key, or which is the maximal key less than the given
575   key.  search_by_key returns a path that must be checked for the
576   correctness of the top of the path but need not be checked for the
577   correctness of the bottom of the path */
578/* The function is NOT SCHEDULE-SAFE! */
579int search_by_key (struct super_block * p_s_sb,
580		   const struct cpu_key * p_s_key, /* Key to search. */
581		   struct path * p_s_search_path, /* This structure was
582						     allocated and initialized
583						     by the calling
584						     function. It is filled up
585						     by this function.  */
586		   int n_stop_level /* How far down the tree to search. To
587                                       stop at leaf level - set to
588                                       DISK_LEAF_NODE_LEVEL */
589    ) {
590    int  n_block_number;
591    int  expected_level;
592    struct buffer_head  *       p_s_bh;
593    struct path_element *       p_s_last_element;
594    int				n_node_level, n_retval;
595    int 			right_neighbor_of_leaf_node;
596    int				fs_gen;
597    struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
598    unsigned long      reada_blocks[SEARCH_BY_KEY_READA];
599    int reada_count = 0;
600
601#ifdef CONFIG_REISERFS_CHECK
602    int n_repeat_counter = 0;
603#endif
604
605    PROC_INFO_INC( p_s_sb, search_by_key );
606
607    /* As we add each node to a path we increase its count.  This means that
608       we must be careful to release all nodes in a path before we either
609       discard the path struct or re-use the path struct, as we do here. */
610
611    decrement_counters_in_path(p_s_search_path);
612
613    right_neighbor_of_leaf_node = 0;
614
615    /* With each iteration of this loop we search through the items in the
616       current node, and calculate the next current node(next path element)
617       for the next iteration of this loop.. */
618    n_block_number = SB_ROOT_BLOCK (p_s_sb);
619    expected_level = -1;
620    while ( 1 ) {
621
622#ifdef CONFIG_REISERFS_CHECK
623	if ( !(++n_repeat_counter % 50000) )
624	    reiserfs_warning (p_s_sb, "PAP-5100: search_by_key: %s:"
625			      "there were %d iterations of while loop "
626			      "looking for key %K",
627			      current->comm, n_repeat_counter, p_s_key);
628#endif
629
630	/* prep path to have another element added to it. */
631	p_s_last_element = PATH_OFFSET_PELEMENT(p_s_search_path, ++p_s_search_path->path_length);
632	fs_gen = get_generation (p_s_sb);
633
634	/* Read the next tree node, and set the last element in the path to
635           have a pointer to it. */
636	if ((p_s_bh = p_s_last_element->pe_buffer =
637	     sb_getblk(p_s_sb, n_block_number)) ) {
638	    if (!buffer_uptodate(p_s_bh) && reada_count > 1) {
639		search_by_key_reada (p_s_sb, reada_bh,
640		                     reada_blocks, reada_count);
641	    }
642	    ll_rw_block(READ, 1, &p_s_bh);
643	    wait_on_buffer(p_s_bh);
644	    if (!buffer_uptodate(p_s_bh))
645	        goto io_error;
646	} else {
647io_error:
648	    p_s_search_path->path_length --;
649	    pathrelse(p_s_search_path);
650	    return IO_ERROR;
651	}
652	reada_count = 0;
653	if (expected_level == -1)
654		expected_level = SB_TREE_HEIGHT (p_s_sb);
655	expected_level --;
656
657	/* It is possible that schedule occurred. We must check whether the key
658	   to search is still in the tree rooted from the current buffer. If
659	   not then repeat search from the root. */
660	if ( fs_changed (fs_gen, p_s_sb) &&
661	    (!B_IS_IN_TREE (p_s_bh) ||
662	     B_LEVEL(p_s_bh) != expected_level ||
663	     !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) {
664	    PROC_INFO_INC( p_s_sb, search_by_key_fs_changed );
665	    PROC_INFO_INC( p_s_sb, search_by_key_restarted );
666	    PROC_INFO_INC( p_s_sb, sbk_restarted[ expected_level - 1 ] );
667	    decrement_counters_in_path(p_s_search_path);
668
669	    /* Get the root block number so that we can repeat the search
670	       starting from the root. */
671	    n_block_number = SB_ROOT_BLOCK (p_s_sb);
672	    expected_level = -1;
673	    right_neighbor_of_leaf_node = 0;
674
675	    /* repeat search from the root */
676	    continue;
677	}
678
679        /* only check that the key is in the buffer if p_s_key is not
680           equal to the MAX_KEY. Latter case is only possible in
681           "finish_unfinished()" processing during mount. */
682        RFALSE( comp_keys( &MAX_KEY, p_s_key ) &&
683                ! key_in_buffer(p_s_search_path, p_s_key, p_s_sb),
684		"PAP-5130: key is not in the buffer");
685#ifdef CONFIG_REISERFS_CHECK
686	if ( cur_tb ) {
687	    print_cur_tb ("5140");
688	    reiserfs_panic(p_s_sb, "PAP-5140: search_by_key: schedule occurred in do_balance!");
689	}
690#endif
691
692	// make sure, that the node contents look like a node of
693	// certain level
694	if (!is_tree_node (p_s_bh, expected_level)) {
695	    reiserfs_warning (p_s_sb, "vs-5150: search_by_key: "
696			      "invalid format found in block %ld. Fsck?",
697			      p_s_bh->b_blocknr);
698	    pathrelse (p_s_search_path);
699	    return IO_ERROR;
700	}
701
702	/* ok, we have acquired next formatted node in the tree */
703	n_node_level = B_LEVEL (p_s_bh);
704
705	PROC_INFO_BH_STAT( p_s_sb, p_s_bh, n_node_level - 1 );
706
707	RFALSE( n_node_level < n_stop_level,
708		"vs-5152: tree level (%d) is less than stop level (%d)",
709		n_node_level, n_stop_level);
710
711	n_retval = bin_search( p_s_key, B_N_PITEM_HEAD(p_s_bh, 0),
712                B_NR_ITEMS(p_s_bh),
713                ( n_node_level == DISK_LEAF_NODE_LEVEL ) ? IH_SIZE : KEY_SIZE,
714                &(p_s_last_element->pe_position));
715	if (n_node_level == n_stop_level) {
716	    return n_retval;
717	}
718
719	/* we are not in the stop level */
720	if (n_retval == ITEM_FOUND)
721	    /* item has been found, so we choose the pointer which is to the right of the found one */
722	    p_s_last_element->pe_position++;
723
724	/* if item was not found we choose the position which is to
725	   the left of the found item. This requires no code,
726	   bin_search did it already.*/
727
728	/* So we have chosen a position in the current node which is
729	   an internal node.  Now we calculate child block number by
730	   position in the node. */
731	n_block_number = B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position);
732
733	/* if we are going to read leaf nodes, try for read ahead as well */
734	if ((p_s_search_path->reada & PATH_READA) &&
735	    n_node_level == DISK_LEAF_NODE_LEVEL + 1)
736	{
737	    int pos = p_s_last_element->pe_position;
738	    int limit = B_NR_ITEMS(p_s_bh);
739	    struct reiserfs_key *le_key;
740
741	    if (p_s_search_path->reada & PATH_READA_BACK)
742		limit = 0;
743	    while(reada_count < SEARCH_BY_KEY_READA) {
744		if (pos == limit)
745		    break;
746	        reada_blocks[reada_count++] = B_N_CHILD_NUM(p_s_bh, pos);
747		if (p_s_search_path->reada & PATH_READA_BACK)
748		    pos--;
749		else
750		    pos++;
751
752		/*
753		 * check to make sure we're in the same object
754		 */
755		le_key = B_N_PDELIM_KEY(p_s_bh, pos);
756		if (le32_to_cpu(le_key->k_objectid) !=
757		    p_s_key->on_disk_key.k_objectid)
758		{
759		    break;
760		}
761	    }
762        }
763    }
764}
765
766
767/* Form the path to an item and position in this item which contains
768   file byte defined by p_s_key. If there is no such item
769   corresponding to the key, we point the path to the item with
770   maximal key less than p_s_key, and *p_n_pos_in_item is set to one
771   past the last entry/byte in the item.  If searching for entry in a
772   directory item, and it is not found, *p_n_pos_in_item is set to one
773   entry more than the entry with maximal key which is less than the
774   sought key.
775
776   Note that if there is no entry in this same node which is one more,
777   then we point to an imaginary entry.  for direct items, the
778   position is in units of bytes, for indirect items the position is
779   in units of blocknr entries, for directory items the position is in
780   units of directory entries.  */
781
782/* The function is NOT SCHEDULE-SAFE! */
783int search_for_position_by_key (struct super_block  * p_s_sb,         /* Pointer to the super block.          */
784				const struct cpu_key  * p_cpu_key,      /* Key to search (cpu variable)         */
785				struct path         * p_s_search_path /* Filled up by this function.          */
786    ) {
787    struct item_head    * p_le_ih; /* pointer to on-disk structure */
788    int                   n_blk_size;
789    loff_t item_offset, offset;
790    struct reiserfs_dir_entry de;
791    int retval;
792
793    /* If searching for directory entry. */
794    if ( is_direntry_cpu_key (p_cpu_key) )
795	return  search_by_entry_key (p_s_sb, p_cpu_key, p_s_search_path, &de);
796
797    /* If not searching for directory entry. */
798
799    /* If item is found. */
800    retval = search_item (p_s_sb, p_cpu_key, p_s_search_path);
801    if (retval == IO_ERROR)
802	return retval;
803    if ( retval == ITEM_FOUND )  {
804
805	RFALSE( ! ih_item_len(
806                B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path),
807			       PATH_LAST_POSITION(p_s_search_path))),
808	        "PAP-5165: item length equals zero");
809
810	pos_in_item(p_s_search_path) = 0;
811	return POSITION_FOUND;
812    }
813
814    RFALSE( ! PATH_LAST_POSITION(p_s_search_path),
815	    "PAP-5170: position equals zero");
816
817    /* Item is not found. Set path to the previous item. */
818    p_le_ih = B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), --PATH_LAST_POSITION(p_s_search_path));
819    n_blk_size = p_s_sb->s_blocksize;
820
821    if (comp_short_keys (&(p_le_ih->ih_key), p_cpu_key)) {
822	return FILE_NOT_FOUND;
823    }
824
825    // FIXME: quite ugly this far
826
827    item_offset = le_ih_k_offset (p_le_ih);
828    offset = cpu_key_k_offset (p_cpu_key);
829
830    /* Needed byte is contained in the item pointed to by the path.*/
831    if (item_offset <= offset &&
832	item_offset + op_bytes_number (p_le_ih, n_blk_size) > offset) {
833	pos_in_item (p_s_search_path) = offset - item_offset;
834	if ( is_indirect_le_ih(p_le_ih) ) {
835	    pos_in_item (p_s_search_path) /= n_blk_size;
836	}
837	return POSITION_FOUND;
838    }
839
840    /* Needed byte is not contained in the item pointed to by the
841     path. Set pos_in_item out of the item. */
842    if ( is_indirect_le_ih (p_le_ih) )
843	pos_in_item (p_s_search_path) = ih_item_len(p_le_ih) / UNFM_P_SIZE;
844    else
845        pos_in_item (p_s_search_path) = ih_item_len( p_le_ih );
846
847    return POSITION_NOT_FOUND;
848}
849
850
851/* Compare given item and item pointed to by the path. */
852int comp_items (const struct item_head * stored_ih, const struct path * p_s_path)
853{
854    struct buffer_head  * p_s_bh;
855    struct item_head    * ih;
856
857    /* Last buffer at the path is not in the tree. */
858    if ( ! B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path)) )
859	return 1;
860
861    /* Last path position is invalid. */
862    if ( PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh) )
863	return 1;
864
865    /* we need only to know, whether it is the same item */
866    ih = get_ih (p_s_path);
867    return memcmp (stored_ih, ih, IH_SIZE);
868}
869
870
871/* unformatted nodes are not logged anymore, ever.  This is safe
872** now
873*/
874#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1)
875
876// block can not be forgotten as it is in I/O or held by someone
877#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh)))
878
879
880
881// prepare for delete or cut of direct item
882static inline int prepare_for_direct_item (struct path * path,
883					   struct item_head * le_ih,
884					   struct inode * inode,
885					   loff_t new_file_length,
886					   int * cut_size)
887{
888    loff_t round_len;
889
890
891    if ( new_file_length == max_reiserfs_offset (inode) ) {
892	/* item has to be deleted */
893	*cut_size = -(IH_SIZE + ih_item_len(le_ih));
894	return M_DELETE;
895    }
896
897    // new file gets truncated
898    if (get_inode_item_key_version (inode) == KEY_FORMAT_3_6) {
899	//
900	round_len = ROUND_UP (new_file_length);
901	/* this was n_new_file_length < le_ih ... */
902	if ( round_len < le_ih_k_offset (le_ih) )  {
903	    *cut_size = -(IH_SIZE + ih_item_len(le_ih));
904	    return M_DELETE; /* Delete this item. */
905	}
906	/* Calculate first position and size for cutting from item. */
907	pos_in_item (path) = round_len - (le_ih_k_offset (le_ih) - 1);
908	*cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
909
910	return M_CUT; /* Cut from this item. */
911    }
912
913
914    // old file: items may have any length
915
916    if ( new_file_length < le_ih_k_offset (le_ih) )  {
917	*cut_size = -(IH_SIZE + ih_item_len(le_ih));
918	return M_DELETE; /* Delete this item. */
919    }
920    /* Calculate first position and size for cutting from item. */
921    *cut_size = -(ih_item_len(le_ih) -
922		      (pos_in_item (path) = new_file_length + 1 - le_ih_k_offset (le_ih)));
923    return M_CUT; /* Cut from this item. */
924}
925
926
927static inline int prepare_for_direntry_item (struct path * path,
928					     struct item_head * le_ih,
929					     struct inode * inode,
930					     loff_t new_file_length,
931					     int * cut_size)
932{
933    if (le_ih_k_offset (le_ih) == DOT_OFFSET &&
934	new_file_length == max_reiserfs_offset (inode)) {
935	RFALSE( ih_entry_count (le_ih) != 2,
936	        "PAP-5220: incorrect empty directory item (%h)", le_ih);
937	*cut_size = -(IH_SIZE + ih_item_len(le_ih));
938	return M_DELETE; /* Delete the directory item containing "." and ".." entry. */
939    }
940
941    if ( ih_entry_count (le_ih) == 1 )  {
942	/* Delete the directory item such as there is one record only
943	   in this item*/
944	*cut_size = -(IH_SIZE + ih_item_len(le_ih));
945	return M_DELETE;
946    }
947
948    /* Cut one record from the directory item. */
949    *cut_size = -(DEH_SIZE + entry_length (get_last_bh (path), le_ih, pos_in_item (path)));
950    return M_CUT;
951}
952
953
954/*  If the path points to a directory or direct item, calculate mode and the size cut, for balance.
955    If the path points to an indirect item, remove some number of its unformatted nodes.
956    In case of file truncate calculate whether this item must be deleted/truncated or last
957    unformatted node of this item will be converted to a direct item.
958    This function returns a determination of what balance mode the calling function should employ. */
959static char  prepare_for_delete_or_cut(
960				       struct reiserfs_transaction_handle *th,
961				       struct inode * inode,
962				       struct path         * p_s_path,
963				       const struct cpu_key      * p_s_item_key,
964				       int                 * p_n_removed,      /* Number of unformatted nodes which were removed
965										  from end of the file. */
966				       int                 * p_n_cut_size,
967				       unsigned long long    n_new_file_length /* MAX_KEY_OFFSET in case of delete. */
968    ) {
969    struct super_block  * p_s_sb = inode->i_sb;
970    struct item_head    * p_le_ih = PATH_PITEM_HEAD(p_s_path);
971    struct buffer_head  * p_s_bh = PATH_PLAST_BUFFER(p_s_path);
972
973    BUG_ON (!th->t_trans_id);
974
975    /* Stat_data item. */
976    if ( is_statdata_le_ih (p_le_ih) ) {
977
978	RFALSE( n_new_file_length != max_reiserfs_offset (inode),
979		"PAP-5210: mode must be M_DELETE");
980
981	*p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
982	return M_DELETE;
983    }
984
985
986    /* Directory item. */
987    if ( is_direntry_le_ih (p_le_ih) )
988	return prepare_for_direntry_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size);
989
990    /* Direct item. */
991    if ( is_direct_le_ih (p_le_ih) )
992	return prepare_for_direct_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size);
993
994
995    /* Case of an indirect item. */
996    {
997	int                   n_unfm_number,    /* Number of the item unformatted nodes. */
998	    n_counter,
999	    n_blk_size;
1000	__u32               * p_n_unfm_pointer; /* Pointer to the unformatted node number. */
1001	__u32 tmp;
1002	struct item_head      s_ih;           /* Item header. */
1003	char                  c_mode;           /* Returned mode of the balance. */
1004	int need_research;
1005
1006
1007	n_blk_size = p_s_sb->s_blocksize;
1008
1009	/* Search for the needed object indirect item until there are no unformatted nodes to be removed. */
1010	do  {
1011	    need_research = 0;
1012            p_s_bh = PATH_PLAST_BUFFER(p_s_path);
1013	    /* Copy indirect item header to a temp variable. */
1014	    copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
1015	    /* Calculate number of unformatted nodes in this item. */
1016	    n_unfm_number = I_UNFM_NUM(&s_ih);
1017
1018	    RFALSE( ! is_indirect_le_ih(&s_ih) || ! n_unfm_number ||
1019		    pos_in_item (p_s_path) + 1 !=  n_unfm_number,
1020		    "PAP-5240: invalid item %h "
1021		    "n_unfm_number = %d *p_n_pos_in_item = %d",
1022		    &s_ih, n_unfm_number, pos_in_item (p_s_path));
1023
1024	    /* Calculate balance mode and position in the item to remove unformatted nodes. */
1025	    if ( n_new_file_length == max_reiserfs_offset (inode) ) {/* Case of delete. */
1026		pos_in_item (p_s_path) = 0;
1027		*p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih));
1028		c_mode = M_DELETE;
1029	    }
1030	    else  { /* Case of truncate. */
1031		if ( n_new_file_length < le_ih_k_offset (&s_ih) )  {
1032		    pos_in_item (p_s_path) = 0;
1033		    *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih));
1034		    c_mode = M_DELETE; /* Delete this item. */
1035		}
1036		else  {
1037		    /* indirect item must be truncated starting from *p_n_pos_in_item-th position */
1038		    pos_in_item (p_s_path) = (n_new_file_length + n_blk_size - le_ih_k_offset (&s_ih) ) >> p_s_sb->s_blocksize_bits;
1039
1040		    RFALSE( pos_in_item (p_s_path) > n_unfm_number,
1041			    "PAP-5250: invalid position in the item");
1042
1043		    /* Either convert last unformatted node of indirect item to direct item or increase
1044		       its free space.  */
1045		    if ( pos_in_item (p_s_path) == n_unfm_number )  {
1046			*p_n_cut_size = 0; /* Nothing to cut. */
1047			return M_CONVERT; /* Maybe convert last unformatted node to the direct item. */
1048		    }
1049		    /* Calculate size to cut. */
1050		    *p_n_cut_size = -(ih_item_len(&s_ih) - pos_in_item(p_s_path) * UNFM_P_SIZE);
1051
1052		    c_mode = M_CUT;     /* Cut from this indirect item. */
1053		}
1054	    }
1055
1056	    RFALSE( n_unfm_number <= pos_in_item (p_s_path),
1057		    "PAP-5260: invalid position in the indirect item");
1058
1059	    /* pointers to be cut */
1060	    n_unfm_number -= pos_in_item (p_s_path);
1061	    /* Set pointer to the last unformatted node pointer that is to be cut. */
1062	    p_n_unfm_pointer = (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1 - *p_n_removed;
1063
1064
1065	    /* We go through the unformatted nodes pointers of the indirect
1066	       item and look for the unformatted nodes in the cache. If we
1067	       found some of them we free it, zero corresponding indirect item
1068	       entry and log buffer containing that indirect item. For this we
1069	       need to prepare last path element for logging. If some
1070	       unformatted node has b_count > 1 we must not free this
1071	       unformatted node since it is in use. */
1072	    reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1);
1073	    // note: path could be changed, first line in for loop takes care
1074	    // of it
1075
1076	    for (n_counter = *p_n_removed;
1077		 n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) {
1078
1079		cond_resched();
1080		if (item_moved (&s_ih, p_s_path)) {
1081		    need_research = 1 ;
1082		    break;
1083		}
1084		RFALSE( p_n_unfm_pointer < (__u32 *)B_I_PITEM(p_s_bh, &s_ih) ||
1085			p_n_unfm_pointer > (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1,
1086			"vs-5265: pointer out of range");
1087
1088		/* Hole, nothing to remove. */
1089		if ( ! get_block_num(p_n_unfm_pointer,0) )  {
1090			(*p_n_removed)++;
1091			continue;
1092		}
1093
1094		(*p_n_removed)++;
1095
1096		tmp = get_block_num(p_n_unfm_pointer,0);
1097		put_block_num(p_n_unfm_pointer, 0, 0);
1098		journal_mark_dirty (th, p_s_sb, p_s_bh);
1099		reiserfs_free_block(th, inode, tmp, 1);
1100		if ( item_moved (&s_ih, p_s_path) )  {
1101			need_research = 1;
1102			break ;
1103		}
1104	    }
1105
1106	    /* a trick.  If the buffer has been logged, this
1107	    ** will do nothing.  If we've broken the loop without
1108	    ** logging it, it will restore the buffer
1109	    **
1110	    */
1111	    reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh);
1112
1113	    /* This loop can be optimized. */
1114	} while ( (*p_n_removed < n_unfm_number || need_research) &&
1115		  search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND );
1116
1117	RFALSE( *p_n_removed < n_unfm_number,
1118		"PAP-5310: indirect item is not found");
1119	RFALSE( item_moved (&s_ih, p_s_path),
1120		"after while, comp failed, retry") ;
1121
1122	if (c_mode == M_CUT)
1123	    pos_in_item (p_s_path) *= UNFM_P_SIZE;
1124	return c_mode;
1125    }
1126}
1127
1128/* Calculate number of bytes which will be deleted or cut during balance */
1129static int calc_deleted_bytes_number(
1130    struct  tree_balance  * p_s_tb,
1131    char                    c_mode
1132    ) {
1133    int                     n_del_size;
1134    struct  item_head     * p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path);
1135
1136    if ( is_statdata_le_ih (p_le_ih) )
1137	return 0;
1138
1139    n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
1140    if ( is_direntry_le_ih (p_le_ih) ) {
1141	// return EMPTY_DIR_SIZE; /* We delete emty directoris only. */
1142	// we can't use EMPTY_DIR_SIZE, as old format dirs have a different
1143	// empty size.  ick. FIXME, is this right?
1144	//
1145	return n_del_size ;
1146    }
1147
1148    if ( is_indirect_le_ih (p_le_ih) )
1149	n_del_size = (n_del_size/UNFM_P_SIZE)*
1150	  (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size);// - get_ih_free_space (p_le_ih);
1151    return n_del_size;
1152}
1153
1154static void init_tb_struct(
1155    struct reiserfs_transaction_handle *th,
1156    struct tree_balance * p_s_tb,
1157    struct super_block  * p_s_sb,
1158    struct path         * p_s_path,
1159    int                   n_size
1160    ) {
1161
1162    BUG_ON (!th->t_trans_id);
1163
1164    memset (p_s_tb,'\0',sizeof(struct tree_balance));
1165    p_s_tb->transaction_handle = th ;
1166    p_s_tb->tb_sb = p_s_sb;
1167    p_s_tb->tb_path = p_s_path;
1168    PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
1169    PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
1170    p_s_tb->insert_size[0] = n_size;
1171}
1172
1173
1174
1175void padd_item (char * item, int total_length, int length)
1176{
1177    int i;
1178
1179    for (i = total_length; i > length; )
1180	item [--i] = 0;
1181}
1182
1183#ifdef REISERQUOTA_DEBUG
1184char key2type(struct reiserfs_key *ih)
1185{
1186  if (is_direntry_le_key(2, ih))
1187    return 'd';
1188  if (is_direct_le_key(2, ih))
1189    return 'D';
1190  if (is_indirect_le_key(2, ih))
1191    return 'i';
1192  if (is_statdata_le_key(2, ih))
1193    return 's';
1194  return 'u';
1195}
1196
1197char head2type(struct item_head *ih)
1198{
1199  if (is_direntry_le_ih(ih))
1200    return 'd';
1201  if (is_direct_le_ih(ih))
1202    return 'D';
1203  if (is_indirect_le_ih(ih))
1204    return 'i';
1205  if (is_statdata_le_ih(ih))
1206    return 's';
1207  return 'u';
1208}
1209#endif
1210
1211/* Delete object item. */
1212int reiserfs_delete_item (struct reiserfs_transaction_handle *th,
1213			  struct path * p_s_path, /* Path to the deleted item. */
1214			  const struct cpu_key * p_s_item_key, /* Key to search for the deleted item.  */
1215			  struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */
1216			  struct buffer_head  * p_s_un_bh)    /* NULL or unformatted node pointer.    */
1217{
1218    struct super_block * p_s_sb = p_s_inode->i_sb;
1219    struct tree_balance   s_del_balance;
1220    struct item_head      s_ih;
1221    struct item_head      *q_ih;
1222    int			  quota_cut_bytes;
1223    int                   n_ret_value,
1224	n_del_size,
1225	n_removed;
1226
1227#ifdef CONFIG_REISERFS_CHECK
1228    char                  c_mode;
1229    int			n_iter = 0;
1230#endif
1231
1232    BUG_ON (!th->t_trans_id);
1233
1234    init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path, 0/*size is unknown*/);
1235
1236    while ( 1 ) {
1237	n_removed = 0;
1238
1239#ifdef CONFIG_REISERFS_CHECK
1240	n_iter++;
1241	c_mode =
1242#endif
1243	    prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, &n_del_size, max_reiserfs_offset (p_s_inode));
1244
1245	RFALSE( c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
1246
1247	copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
1248	s_del_balance.insert_size[0] = n_del_size;
1249
1250	n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
1251	if ( n_ret_value != REPEAT_SEARCH )
1252	    break;
1253
1254	PROC_INFO_INC( p_s_sb, delete_item_restarted );
1255
1256	// file system changed, repeat search
1257	n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
1258	if (n_ret_value == IO_ERROR)
1259	    break;
1260	if (n_ret_value == FILE_NOT_FOUND) {
1261	    reiserfs_warning (p_s_sb, "vs-5340: reiserfs_delete_item: "
1262			      "no items of the file %K found", p_s_item_key);
1263	    break;
1264	}
1265    } /* while (1) */
1266
1267    if ( n_ret_value != CARRY_ON ) {
1268	unfix_nodes(&s_del_balance);
1269	return 0;
1270    }
1271
1272    // reiserfs_delete_item returns item length when success
1273    n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
1274    q_ih = get_ih(p_s_path) ;
1275    quota_cut_bytes = ih_item_len(q_ih) ;
1276
1277    /* hack so the quota code doesn't have to guess if the file
1278    ** has a tail.  On tail insert, we allocate quota for 1 unformatted node.
1279    ** We test the offset because the tail might have been
1280    ** split into multiple items, and we only want to decrement for
1281    ** the unfm node once
1282    */
1283    if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) {
1284        if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) {
1285            quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
1286        } else {
1287	    quota_cut_bytes = 0 ;
1288	}
1289    }
1290
1291    if ( p_s_un_bh )  {
1292	int off;
1293        char *data ;
1294
1295	/* We are in direct2indirect conversion, so move tail contents
1296           to the unformatted node */
1297	/* note, we do the copy before preparing the buffer because we
1298	** don't care about the contents of the unformatted node yet.
1299	** the only thing we really care about is the direct item's data
1300	** is in the unformatted node.
1301	**
1302	** Otherwise, we would have to call reiserfs_prepare_for_journal on
1303	** the unformatted node, which might schedule, meaning we'd have to
1304	** loop all the way back up to the start of the while loop.
1305	**
1306	** The unformatted node must be dirtied later on.  We can't be
1307	** sure here if the entire tail has been deleted yet.
1308        **
1309        ** p_s_un_bh is from the page cache (all unformatted nodes are
1310        ** from the page cache) and might be a highmem page.  So, we
1311        ** can't use p_s_un_bh->b_data.
1312	** -clm
1313	*/
1314
1315        data = kmap_atomic(p_s_un_bh->b_page, KM_USER0);
1316	off = ((le_ih_k_offset (&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
1317	memcpy(data + off,
1318	       B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value);
1319	kunmap_atomic(data, KM_USER0);
1320    }
1321    /* Perform balancing after all resources have been collected at once. */
1322    do_balance(&s_del_balance, NULL, NULL, M_DELETE);
1323
1324#ifdef REISERQUOTA_DEBUG
1325    reiserfs_debug (p_s_sb, REISERFS_DEBUG_CODE, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih));
1326#endif
1327    DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
1328
1329    /* Return deleted body length */
1330    return n_ret_value;
1331}
1332
1333
1334/* Summary Of Mechanisms For Handling Collisions Between Processes:
1335
1336 deletion of the body of the object is performed by iput(), with the
1337 result that if multiple processes are operating on a file, the
1338 deletion of the body of the file is deferred until the last process
1339 that has an open inode performs its iput().
1340
1341 writes and truncates are protected from collisions by use of
1342 semaphores.
1343
1344 creates, linking, and mknod are protected from collisions with other
1345 processes by making the reiserfs_add_entry() the last step in the
1346 creation, and then rolling back all changes if there was a collision.
1347 - Hans
1348*/
1349
1350
1351/* this deletes item which never gets split */
1352void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
1353				 struct inode *inode,
1354				 struct reiserfs_key * key)
1355{
1356    struct tree_balance tb;
1357    INITIALIZE_PATH (path);
1358    int item_len = 0;
1359    int tb_init = 0 ;
1360    struct cpu_key cpu_key;
1361    int retval;
1362    int quota_cut_bytes = 0;
1363
1364    BUG_ON (!th->t_trans_id);
1365
1366    le_key2cpu_key (&cpu_key, key);
1367
1368    while (1) {
1369	retval = search_item (th->t_super, &cpu_key, &path);
1370	if (retval == IO_ERROR) {
1371	    reiserfs_warning (th->t_super,
1372			      "vs-5350: reiserfs_delete_solid_item: "
1373			      "i/o failure occurred trying to delete %K",
1374			      &cpu_key);
1375	    break;
1376	}
1377	if (retval != ITEM_FOUND) {
1378	    pathrelse (&path);
1379	    // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir
1380	    if ( !( (unsigned long long) GET_HASH_VALUE (le_key_k_offset (le_key_version (key), key)) == 0 && \
1381		 (unsigned long long) GET_GENERATION_NUMBER (le_key_k_offset (le_key_version (key), key)) == 1 ) )
1382		reiserfs_warning (th->t_super, "vs-5355: reiserfs_delete_solid_item: %k not found", key);
1383	    break;
1384	}
1385	if (!tb_init) {
1386	    tb_init = 1 ;
1387	    item_len = ih_item_len( PATH_PITEM_HEAD(&path) );
1388	    init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len));
1389	}
1390	quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ;
1391
1392	retval = fix_nodes (M_DELETE, &tb, NULL, NULL);
1393	if (retval == REPEAT_SEARCH) {
1394	    PROC_INFO_INC( th -> t_super, delete_solid_item_restarted );
1395	    continue;
1396	}
1397
1398	if (retval == CARRY_ON) {
1399	    do_balance (&tb, NULL, NULL, M_DELETE);
1400	    if (inode) {	/* Should we count quota for item? (we don't count quotas for save-links) */
1401#ifdef REISERQUOTA_DEBUG
1402		reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota delete_solid_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, key2type(key));
1403#endif
1404		DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes);
1405	    }
1406	    break;
1407	}
1408
1409	// IO_ERROR, NO_DISK_SPACE, etc
1410	reiserfs_warning (th->t_super, "vs-5360: reiserfs_delete_solid_item: "
1411			  "could not delete %K due to fix_nodes failure", &cpu_key);
1412	unfix_nodes (&tb);
1413	break;
1414    }
1415
1416    reiserfs_check_path(&path) ;
1417}
1418
1419
1420int reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * inode)
1421{
1422    int err;
1423    inode->i_size = 0;
1424    BUG_ON (!th->t_trans_id);
1425
1426    /* for directory this deletes item containing "." and ".." */
1427    err = reiserfs_do_truncate (th, inode, NULL, 0/*no timestamp updates*/);
1428    if (err)
1429        return err;
1430
1431#if defined( USE_INODE_GENERATION_COUNTER )
1432    if( !old_format_only ( th -> t_super ) )
1433      {
1434       __u32 *inode_generation;
1435
1436       inode_generation =
1437         &REISERFS_SB(th -> t_super) -> s_rs -> s_inode_generation;
1438       *inode_generation = cpu_to_le32( le32_to_cpu( *inode_generation ) + 1 );
1439      }
1440/* USE_INODE_GENERATION_COUNTER */
1441#endif
1442    reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
1443
1444    return err;
1445}
1446
1447static void
1448unmap_buffers(struct page *page, loff_t pos) {
1449    struct buffer_head *bh ;
1450    struct buffer_head *head ;
1451    struct buffer_head *next ;
1452    unsigned long tail_index ;
1453    unsigned long cur_index ;
1454
1455    if (page) {
1456	if (page_has_buffers(page)) {
1457	    tail_index = pos & (PAGE_CACHE_SIZE - 1) ;
1458	    cur_index = 0 ;
1459	    head = page_buffers(page) ;
1460	    bh = head ;
1461	    do {
1462		next = bh->b_this_page ;
1463
1464		/* we want to unmap the buffers that contain the tail, and
1465		** all the buffers after it (since the tail must be at the
1466		** end of the file).  We don't want to unmap file data
1467		** before the tail, since it might be dirty and waiting to
1468		** reach disk
1469		*/
1470		cur_index += bh->b_size ;
1471		if (cur_index > tail_index) {
1472		    reiserfs_unmap_buffer(bh) ;
1473		}
1474		bh = next ;
1475	    } while (bh != head) ;
1476	    if ( PAGE_SIZE == bh->b_size ) {
1477		clear_page_dirty(page);
1478	    }
1479	}
1480    }
1481}
1482
1483static int maybe_indirect_to_direct (struct reiserfs_transaction_handle *th,
1484			      struct inode * p_s_inode,
1485			      struct page *page,
1486			      struct path         * p_s_path,
1487			      const struct cpu_key      * p_s_item_key,
1488			      loff_t         n_new_file_size,
1489			      char                * p_c_mode
1490			      ) {
1491    struct super_block * p_s_sb = p_s_inode->i_sb;
1492    int n_block_size = p_s_sb->s_blocksize;
1493    int cut_bytes;
1494    BUG_ON (!th->t_trans_id);
1495
1496    if (n_new_file_size != p_s_inode->i_size)
1497	BUG ();
1498
1499    /* the page being sent in could be NULL if there was an i/o error
1500    ** reading in the last block.  The user will hit problems trying to
1501    ** read the file, but for now we just skip the indirect2direct
1502    */
1503    if (atomic_read(&p_s_inode->i_count) > 1 ||
1504        !tail_has_to_be_packed (p_s_inode) ||
1505	!page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) {
1506	// leave tail in an unformatted node
1507	*p_c_mode = M_SKIP_BALANCING;
1508	cut_bytes = n_block_size - (n_new_file_size & (n_block_size - 1));
1509	pathrelse(p_s_path);
1510	return cut_bytes;
1511    }
1512    /* Permorm the conversion to a direct_item. */
1513    /*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode);*/
1514    return indirect2direct (th, p_s_inode, page, p_s_path, p_s_item_key, n_new_file_size, p_c_mode);
1515}
1516
1517
1518/* we did indirect_to_direct conversion. And we have inserted direct
1519   item successesfully, but there were no disk space to cut unfm
1520   pointer being converted. Therefore we have to delete inserted
1521   direct item(s) */
1522static void indirect_to_direct_roll_back (struct reiserfs_transaction_handle *th, struct inode * inode, struct path * path)
1523{
1524    struct cpu_key tail_key;
1525    int tail_len;
1526    int removed;
1527    BUG_ON (!th->t_trans_id);
1528
1529    make_cpu_key (&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);// !!!!
1530    tail_key.key_length = 4;
1531
1532    tail_len = (cpu_key_k_offset (&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
1533    while (tail_len) {
1534	/* look for the last byte of the tail */
1535	if (search_for_position_by_key (inode->i_sb, &tail_key, path) == POSITION_NOT_FOUND)
1536	    reiserfs_panic (inode->i_sb, "vs-5615: indirect_to_direct_roll_back: found invalid item");
1537	RFALSE( path->pos_in_item != ih_item_len(PATH_PITEM_HEAD (path)) - 1,
1538	        "vs-5616: appended bytes found");
1539	PATH_LAST_POSITION (path) --;
1540
1541	removed = reiserfs_delete_item (th, path, &tail_key, inode, NULL/*unbh not needed*/);
1542	RFALSE( removed <= 0 || removed > tail_len,
1543	        "vs-5617: there was tail %d bytes, removed item length %d bytes",
1544                tail_len, removed);
1545	tail_len -= removed;
1546	set_cpu_key_k_offset (&tail_key, cpu_key_k_offset (&tail_key) - removed);
1547    }
1548    reiserfs_warning (inode->i_sb, "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space");
1549    //mark_file_without_tail (inode);
1550    mark_inode_dirty (inode);
1551}
1552
1553
1554/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
1555int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
1556			    struct path * p_s_path,
1557			    struct cpu_key * p_s_item_key,
1558			    struct inode * p_s_inode,
1559			    struct page *page,
1560			    loff_t n_new_file_size)
1561{
1562    struct super_block * p_s_sb = p_s_inode->i_sb;
1563    /* Every function which is going to call do_balance must first
1564       create a tree_balance structure.  Then it must fill up this
1565       structure by using the init_tb_struct and fix_nodes functions.
1566       After that we can make tree balancing. */
1567    struct tree_balance s_cut_balance;
1568    struct item_head *p_le_ih;
1569    int n_cut_size = 0,        /* Amount to be cut. */
1570	n_ret_value = CARRY_ON,
1571	n_removed = 0,     /* Number of the removed unformatted nodes. */
1572	n_is_inode_locked = 0;
1573    char                c_mode;            /* Mode of the balance. */
1574    int retval2 = -1;
1575    int quota_cut_bytes;
1576    loff_t tail_pos = 0;
1577
1578    BUG_ON (!th->t_trans_id);
1579
1580    init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size);
1581
1582
1583    /* Repeat this loop until we either cut the item without needing
1584       to balance, or we fix_nodes without schedule occurring */
1585    while ( 1 ) {
1586	/* Determine the balance mode, position of the first byte to
1587	   be cut, and size to be cut.  In case of the indirect item
1588	   free unformatted nodes which are pointed to by the cut
1589	   pointers. */
1590
1591	c_mode = prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed,
1592					   &n_cut_size, n_new_file_size);
1593	if ( c_mode == M_CONVERT )  {
1594	    /* convert last unformatted node to direct item or leave
1595               tail in the unformatted node */
1596	    RFALSE( n_ret_value != CARRY_ON, "PAP-5570: can not convert twice");
1597
1598	    n_ret_value = maybe_indirect_to_direct (th, p_s_inode, page, p_s_path, p_s_item_key,
1599						    n_new_file_size, &c_mode);
1600	    if ( c_mode == M_SKIP_BALANCING )
1601		/* tail has been left in the unformatted node */
1602		return n_ret_value;
1603
1604	    n_is_inode_locked = 1;
1605
1606	    /* removing of last unformatted node will change value we
1607               have to return to truncate. Save it */
1608	    retval2 = n_ret_value;
1609	    /*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1));*/
1610
1611	    /* So, we have performed the first part of the conversion:
1612	       inserting the new direct item.  Now we are removing the
1613	       last unformatted node pointer. Set key to search for
1614	       it. */
1615      	    set_cpu_key_k_type (p_s_item_key, TYPE_INDIRECT);
1616	    p_s_item_key->key_length = 4;
1617	    n_new_file_size -= (n_new_file_size & (p_s_sb->s_blocksize - 1));
1618	    tail_pos = n_new_file_size;
1619	    set_cpu_key_k_offset (p_s_item_key, n_new_file_size + 1);
1620	    if ( search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ){
1621		print_block (PATH_PLAST_BUFFER (p_s_path), 3, PATH_LAST_POSITION (p_s_path) - 1, PATH_LAST_POSITION (p_s_path) + 1);
1622		reiserfs_panic(p_s_sb, "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)", p_s_item_key);
1623	    }
1624	    continue;
1625	}
1626	if (n_cut_size == 0) {
1627	    pathrelse (p_s_path);
1628	    return 0;
1629	}
1630
1631	s_cut_balance.insert_size[0] = n_cut_size;
1632
1633	n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, NULL);
1634      	if ( n_ret_value != REPEAT_SEARCH )
1635	    break;
1636
1637	PROC_INFO_INC( p_s_sb, cut_from_item_restarted );
1638
1639	n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
1640	if (n_ret_value == POSITION_FOUND)
1641	    continue;
1642
1643	reiserfs_warning (p_s_sb, "PAP-5610: reiserfs_cut_from_item: item %K not found", p_s_item_key);
1644	unfix_nodes (&s_cut_balance);
1645	return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT;
1646    } /* while */
1647
1648    // check fix_nodes results (IO_ERROR or NO_DISK_SPACE)
1649    if ( n_ret_value != CARRY_ON ) {
1650	if ( n_is_inode_locked ) {
1651	    // FIXME: this seems to be not needed: we are always able
1652	    // to cut item
1653	    indirect_to_direct_roll_back (th, p_s_inode, p_s_path);
1654	}
1655	if (n_ret_value == NO_DISK_SPACE)
1656	    reiserfs_warning (p_s_sb, "NO_DISK_SPACE");
1657	unfix_nodes (&s_cut_balance);
1658	return -EIO;
1659    }
1660
1661    /* go ahead and perform balancing */
1662
1663    RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode");
1664
1665    /* Calculate number of bytes that need to be cut from the item. */
1666    quota_cut_bytes = ( c_mode == M_DELETE ) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0];
1667    if (retval2 == -1)
1668	n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode);
1669    else
1670	n_ret_value = retval2;
1671
1672
1673    /* For direct items, we only change the quota when deleting the last
1674    ** item.
1675    */
1676    p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
1677    if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) {
1678        if (c_mode == M_DELETE &&
1679	   (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) {
1680	    // FIXME: this is to keep 3.5 happy
1681	    REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX;
1682	    quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ;
1683        } else {
1684	    quota_cut_bytes = 0 ;
1685	}
1686    }
1687#ifdef CONFIG_REISERFS_CHECK
1688    if (n_is_inode_locked) {
1689	struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
1690	/* we are going to complete indirect2direct conversion. Make
1691           sure, that we exactly remove last unformatted node pointer
1692           of the item */
1693	if (!is_indirect_le_ih (le_ih))
1694	    reiserfs_panic (p_s_sb, "vs-5652: reiserfs_cut_from_item: "
1695			    "item must be indirect %h", le_ih);
1696
1697	if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
1698	    reiserfs_panic (p_s_sb, "vs-5653: reiserfs_cut_from_item: "
1699			    "completing indirect2direct conversion indirect item %h "
1700			    "being deleted must be of 4 byte long", le_ih);
1701
1702	if (c_mode == M_CUT && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
1703	    reiserfs_panic (p_s_sb, "vs-5654: reiserfs_cut_from_item: "
1704			    "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)",
1705			    le_ih, s_cut_balance.insert_size[0]);
1706	}
1707	/* it would be useful to make sure, that right neighboring
1708           item is direct item of this file */
1709    }
1710#endif
1711
1712    do_balance(&s_cut_balance, NULL, NULL, c_mode);
1713    if ( n_is_inode_locked ) {
1714	/* we've done an indirect->direct conversion.  when the data block
1715	** was freed, it was removed from the list of blocks that must
1716	** be flushed before the transaction commits, make sure to
1717	** unmap and invalidate it
1718	*/
1719	unmap_buffers(page, tail_pos);
1720	REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask ;
1721    }
1722#ifdef REISERQUOTA_DEBUG
1723    reiserfs_debug (p_s_inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, '?');
1724#endif
1725    DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
1726    return n_ret_value;
1727}
1728
1729static void truncate_directory (struct reiserfs_transaction_handle *th, struct inode * inode)
1730{
1731    BUG_ON (!th->t_trans_id);
1732    if (inode->i_nlink)
1733	reiserfs_warning (inode->i_sb,
1734			  "vs-5655: truncate_directory: link count != 0");
1735
1736    set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET);
1737    set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY);
1738    reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
1739    reiserfs_update_sd(th, inode) ;
1740    set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET);
1741    set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA);
1742}
1743
1744
1745
1746
1747/* Truncate file to the new size. Note, this must be called with a transaction
1748   already started */
1749int reiserfs_do_truncate (struct reiserfs_transaction_handle *th,
1750			   struct  inode * p_s_inode, /* ->i_size contains new
1751                                                         size */
1752			   struct page *page, /* up to date for last block */
1753			   int update_timestamps  /* when it is called by
1754						     file_release to convert
1755						     the tail - no timestamps
1756						     should be updated */
1757    ) {
1758    INITIALIZE_PATH (s_search_path);       /* Path to the current object item. */
1759    struct item_head    * p_le_ih;         /* Pointer to an item header. */
1760    struct cpu_key      s_item_key;     /* Key to search for a previous file item. */
1761    loff_t         n_file_size,    /* Old file size. */
1762	n_new_file_size;/* New file size. */
1763    int                   n_deleted;      /* Number of deleted or truncated bytes. */
1764    int retval;
1765    int err = 0;
1766
1767    BUG_ON (!th->t_trans_id);
1768    if ( ! (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) || S_ISLNK(p_s_inode->i_mode)) )
1769	return 0;
1770
1771    if (S_ISDIR(p_s_inode->i_mode)) {
1772	// deletion of directory - no need to update timestamps
1773	truncate_directory (th, p_s_inode);
1774	return 0;
1775    }
1776
1777    /* Get new file size. */
1778    n_new_file_size = p_s_inode->i_size;
1779
1780    // FIXME: note, that key type is unimportant here
1781    make_cpu_key (&s_item_key, p_s_inode, max_reiserfs_offset (p_s_inode), TYPE_DIRECT, 3);
1782
1783    retval = search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path);
1784    if (retval == IO_ERROR) {
1785	reiserfs_warning (p_s_inode->i_sb, "vs-5657: reiserfs_do_truncate: "
1786			  "i/o failure occurred trying to truncate %K", &s_item_key);
1787        err = -EIO;
1788        goto out;
1789    }
1790    if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
1791	reiserfs_warning (p_s_inode->i_sb, "PAP-5660: reiserfs_do_truncate: "
1792			  "wrong result %d of search for %K", retval, &s_item_key);
1793
1794        err = -EIO;
1795        goto out;
1796    }
1797
1798    s_search_path.pos_in_item --;
1799
1800    /* Get real file size (total length of all file items) */
1801    p_le_ih = PATH_PITEM_HEAD(&s_search_path);
1802    if ( is_statdata_le_ih (p_le_ih) )
1803	n_file_size = 0;
1804    else {
1805	loff_t offset = le_ih_k_offset (p_le_ih);
1806	int bytes = op_bytes_number (p_le_ih,p_s_inode->i_sb->s_blocksize);
1807
1808	/* this may mismatch with real file size: if last direct item
1809           had no padding zeros and last unformatted node had no free
1810           space, this file would have this file size */
1811	n_file_size = offset + bytes - 1;
1812    }
1813    /*
1814     * are we doing a full truncate or delete, if so
1815     * kick in the reada code
1816     */
1817    if (n_new_file_size == 0)
1818        s_search_path.reada = PATH_READA | PATH_READA_BACK;
1819
1820    if ( n_file_size == 0 || n_file_size < n_new_file_size ) {
1821	goto update_and_out ;
1822    }
1823
1824    /* Update key to search for the last file item. */
1825    set_cpu_key_k_offset (&s_item_key, n_file_size);
1826
1827    do  {
1828	/* Cut or delete file item. */
1829	n_deleted = reiserfs_cut_from_item(th, &s_search_path, &s_item_key, p_s_inode,  page, n_new_file_size);
1830	if (n_deleted < 0) {
1831	    reiserfs_warning (p_s_inode->i_sb, "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed");
1832	    reiserfs_check_path(&s_search_path) ;
1833	    return 0;
1834	}
1835
1836	RFALSE( n_deleted > n_file_size,
1837		"PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
1838		n_deleted, n_file_size, &s_item_key);
1839
1840	/* Change key to search the last file item. */
1841	n_file_size -= n_deleted;
1842
1843	set_cpu_key_k_offset (&s_item_key, n_file_size);
1844
1845	/* While there are bytes to truncate and previous file item is presented in the tree. */
1846
1847	/*
1848	** This loop could take a really long time, and could log
1849	** many more blocks than a transaction can hold.  So, we do a polite
1850	** journal end here, and if the transaction needs ending, we make
1851	** sure the file is consistent before ending the current trans
1852	** and starting a new one
1853	*/
1854        if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
1855	  int orig_len_alloc = th->t_blocks_allocated ;
1856	  decrement_counters_in_path(&s_search_path) ;
1857
1858	  if (update_timestamps) {
1859	      p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC;
1860	  }
1861	  reiserfs_update_sd(th, p_s_inode) ;
1862
1863	  err = journal_end(th, p_s_inode->i_sb, orig_len_alloc) ;
1864	  if (err)
1865	    goto out;
1866	  err = journal_begin (th, p_s_inode->i_sb,
1867                               JOURNAL_PER_BALANCE_CNT * 6);
1868	  if (err)
1869	    goto out;
1870	  reiserfs_update_inode_transaction(p_s_inode) ;
1871	}
1872    } while ( n_file_size > ROUND_UP (n_new_file_size) &&
1873	      search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path) == POSITION_FOUND )  ;
1874
1875    RFALSE( n_file_size > ROUND_UP (n_new_file_size),
1876	    "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d",
1877	    n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid);
1878
1879update_and_out:
1880    if (update_timestamps) {
1881	// this is truncate, not file closing
1882	    p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC;
1883    }
1884    reiserfs_update_sd (th, p_s_inode);
1885
1886out:
1887    pathrelse(&s_search_path) ;
1888    return err;
1889}
1890
1891
1892#ifdef CONFIG_REISERFS_CHECK
1893// this makes sure, that we __append__, not overwrite or add holes
1894static void check_research_for_paste (struct path * path,
1895				      const struct cpu_key * p_s_key)
1896{
1897    struct item_head * found_ih = get_ih (path);
1898
1899    if (is_direct_le_ih (found_ih)) {
1900	if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_last_bh (path)->b_size) !=
1901	    cpu_key_k_offset (p_s_key) ||
1902	    op_bytes_number (found_ih, get_last_bh (path)->b_size) != pos_in_item (path))
1903	    reiserfs_panic (NULL, "PAP-5720: check_research_for_paste: "
1904			    "found direct item %h or position (%d) does not match to key %K",
1905			    found_ih, pos_in_item (path), p_s_key);
1906    }
1907    if (is_indirect_le_ih (found_ih)) {
1908	if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_last_bh (path)->b_size) != cpu_key_k_offset (p_s_key) ||
1909	    I_UNFM_NUM (found_ih) != pos_in_item (path) ||
1910	    get_ih_free_space (found_ih) != 0)
1911	    reiserfs_panic (NULL, "PAP-5730: check_research_for_paste: "
1912			    "found indirect item (%h) or position (%d) does not match to key (%K)",
1913			    found_ih, pos_in_item (path), p_s_key);
1914    }
1915}
1916#endif /* config reiserfs check */
1917
1918
1919/* Paste bytes to the existing item. Returns bytes number pasted into the item. */
1920int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
1921			      struct path         * p_s_search_path,	/* Path to the pasted item.          */
1922			      const struct cpu_key      * p_s_key,        	/* Key to search for the needed item.*/
1923			      struct inode	  * inode,		/* Inode item belongs to */
1924			      const char          * p_c_body,       	/* Pointer to the bytes to paste.    */
1925			      int                   n_pasted_size)  	/* Size of pasted bytes.             */
1926{
1927    struct tree_balance s_paste_balance;
1928    int                 retval;
1929    int			fs_gen;
1930
1931    BUG_ON (!th->t_trans_id);
1932
1933    fs_gen = get_generation(inode->i_sb) ;
1934
1935#ifdef REISERQUOTA_DEBUG
1936    reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): allocating %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
1937#endif
1938
1939    if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) {
1940	pathrelse(p_s_search_path);
1941	return -EDQUOT;
1942    }
1943    init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size);
1944#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1945    s_paste_balance.key = p_s_key->on_disk_key;
1946#endif
1947
1948    /* DQUOT_* can schedule, must check before the fix_nodes */
1949    if (fs_changed(fs_gen, inode->i_sb)) {
1950	goto search_again;
1951    }
1952
1953    while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) ==
1954REPEAT_SEARCH ) {
1955search_again:
1956	/* file system changed while we were in the fix_nodes */
1957	PROC_INFO_INC( th -> t_super, paste_into_item_restarted );
1958	retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path);
1959	if (retval == IO_ERROR) {
1960	    retval = -EIO ;
1961	    goto error_out ;
1962	}
1963	if (retval == POSITION_FOUND) {
1964	    reiserfs_warning (inode->i_sb, "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists", p_s_key);
1965	    retval = -EEXIST ;
1966	    goto error_out ;
1967	}
1968
1969#ifdef CONFIG_REISERFS_CHECK
1970	check_research_for_paste (p_s_search_path, p_s_key);
1971#endif
1972    }
1973
1974    /* Perform balancing after all resources are collected by fix_nodes, and
1975       accessing them will not risk triggering schedule. */
1976    if ( retval == CARRY_ON ) {
1977	do_balance(&s_paste_balance, NULL/*ih*/, p_c_body, M_PASTE);
1978	return 0;
1979    }
1980    retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
1981error_out:
1982    /* this also releases the path */
1983    unfix_nodes(&s_paste_balance);
1984#ifdef REISERQUOTA_DEBUG
1985    reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): freeing %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
1986#endif
1987    DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size);
1988    return retval ;
1989}
1990
1991
1992/* Insert new item into the buffer at the path. */
1993int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
1994			 struct path         * 	p_s_path,         /* Path to the inserteded item.         */
1995			 const struct cpu_key      * key,
1996			 struct item_head    * 	p_s_ih,           /* Pointer to the item header to insert.*/
1997			 struct inode        * inode,
1998			 const char          * 	p_c_body)         /* Pointer to the bytes to insert.      */
1999{
2000    struct tree_balance s_ins_balance;
2001    int                 retval;
2002    int fs_gen = 0 ;
2003    int quota_bytes = 0 ;
2004
2005    BUG_ON (!th->t_trans_id);
2006
2007    if (inode) {      /* Do we count quotas for item? */
2008	fs_gen = get_generation(inode->i_sb);
2009	quota_bytes = ih_item_len(p_s_ih);
2010
2011	/* hack so the quota code doesn't have to guess if the file has
2012	 ** a tail, links are always tails, so there's no guessing needed
2013	 */
2014	if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) {
2015	    quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ;
2016	}
2017#ifdef REISERQUOTA_DEBUG
2018	reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih));
2019#endif
2020	/* We can't dirty inode here. It would be immediately written but
2021	 * appropriate stat item isn't inserted yet... */
2022	if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) {
2023	    pathrelse(p_s_path);
2024	    return -EDQUOT;
2025	}
2026    }
2027    init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih));
2028#ifdef DISPLACE_NEW_PACKING_LOCALITIES
2029    s_ins_balance.key = key->on_disk_key;
2030#endif
2031    /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */
2032    if (inode && fs_changed(fs_gen, inode->i_sb)) {
2033	goto search_again;
2034    }
2035
2036    while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) {
2037search_again:
2038	/* file system changed while we were in the fix_nodes */
2039	PROC_INFO_INC( th -> t_super, insert_item_restarted );
2040	retval = search_item (th->t_super, key, p_s_path);
2041	if (retval == IO_ERROR) {
2042	    retval = -EIO;
2043	    goto error_out ;
2044	}
2045	if (retval == ITEM_FOUND) {
2046	    reiserfs_warning (th->t_super, "PAP-5760: reiserfs_insert_item: "
2047			      "key %K already exists in the tree", key);
2048	    retval = -EEXIST ;
2049	    goto error_out;
2050	}
2051    }
2052
2053    /* make balancing after all resources will be collected at a time */
2054    if ( retval == CARRY_ON ) {
2055	do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT);
2056	return 0;
2057    }
2058
2059    retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
2060error_out:
2061    /* also releases the path */
2062    unfix_nodes(&s_ins_balance);
2063#ifdef REISERQUOTA_DEBUG
2064    reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih));
2065#endif
2066    if (inode)
2067	DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ;
2068    return retval;
2069}
2070
2071
2072
2073
2074