namei.c revision 734711abac46c8fee4d70cc9876ebc6d9edb4971
1/*
2 *  linux/fs/ext3/namei.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 *  from
10 *
11 *  linux/fs/minix/namei.c
12 *
13 *  Copyright (C) 1991, 1992  Linus Torvalds
14 *
15 *  Big-endian to little-endian byte-swapping/bitmaps by
16 *        David S. Miller (davem@caip.rutgers.edu), 1995
17 *  Directory entry file type support and forward compatibility hooks
18 *	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19 *  Hash Tree Directory indexing (c)
20 *	Daniel Phillips, 2001
21 *  Hash Tree Directory indexing porting
22 *	Christopher Li, 2002
23 *  Hash Tree Directory indexing cleanup
24 *	Theodore Ts'o, 2002
25 */
26
27#include <linux/fs.h>
28#include <linux/pagemap.h>
29#include <linux/jbd.h>
30#include <linux/time.h>
31#include <linux/ext3_fs.h>
32#include <linux/ext3_jbd.h>
33#include <linux/fcntl.h>
34#include <linux/stat.h>
35#include <linux/string.h>
36#include <linux/quotaops.h>
37#include <linux/buffer_head.h>
38#include <linux/bio.h>
39
40#include "namei.h"
41#include "xattr.h"
42#include "acl.h"
43
44/*
45 * define how far ahead to read directories while searching them.
46 */
47#define NAMEI_RA_CHUNKS  2
48#define NAMEI_RA_BLOCKS  4
49#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
50#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
51
52static struct buffer_head *ext3_append(handle_t *handle,
53					struct inode *inode,
54					u32 *block, int *err)
55{
56	struct buffer_head *bh;
57
58	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
59
60	bh = ext3_bread(handle, inode, *block, 1, err);
61	if (bh) {
62		inode->i_size += inode->i_sb->s_blocksize;
63		EXT3_I(inode)->i_disksize = inode->i_size;
64		*err = ext3_journal_get_write_access(handle, bh);
65		if (*err) {
66			brelse(bh);
67			bh = NULL;
68		}
69	}
70	return bh;
71}
72
73#ifndef assert
74#define assert(test) J_ASSERT(test)
75#endif
76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG
82#define dxtrace(command) command
83#else
84#define dxtrace(command)
85#endif
86
87struct fake_dirent
88{
89	__le32 inode;
90	__le16 rec_len;
91	u8 name_len;
92	u8 file_type;
93};
94
95struct dx_countlimit
96{
97	__le16 limit;
98	__le16 count;
99};
100
101struct dx_entry
102{
103	__le32 hash;
104	__le32 block;
105};
106
107/*
108 * dx_root_info is laid out so that if it should somehow get overlaid by a
109 * dirent the two low bits of the hash version will be zero.  Therefore, the
110 * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
111 */
112
113struct dx_root
114{
115	struct fake_dirent dot;
116	char dot_name[4];
117	struct fake_dirent dotdot;
118	char dotdot_name[4];
119	struct dx_root_info
120	{
121		__le32 reserved_zero;
122		u8 hash_version;
123		u8 info_length; /* 8 */
124		u8 indirect_levels;
125		u8 unused_flags;
126	}
127	info;
128	struct dx_entry	entries[0];
129};
130
131struct dx_node
132{
133	struct fake_dirent fake;
134	struct dx_entry	entries[0];
135};
136
137
138struct dx_frame
139{
140	struct buffer_head *bh;
141	struct dx_entry *entries;
142	struct dx_entry *at;
143};
144
145struct dx_map_entry
146{
147	u32 hash;
148	u16 offs;
149	u16 size;
150};
151
152static inline unsigned dx_get_block (struct dx_entry *entry);
153static void dx_set_block (struct dx_entry *entry, unsigned value);
154static inline unsigned dx_get_hash (struct dx_entry *entry);
155static void dx_set_hash (struct dx_entry *entry, unsigned value);
156static unsigned dx_get_count (struct dx_entry *entries);
157static unsigned dx_get_limit (struct dx_entry *entries);
158static void dx_set_count (struct dx_entry *entries, unsigned value);
159static void dx_set_limit (struct dx_entry *entries, unsigned value);
160static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
161static unsigned dx_node_limit (struct inode *dir);
162static struct dx_frame *dx_probe(struct qstr *entry,
163				 struct inode *dir,
164				 struct dx_hash_info *hinfo,
165				 struct dx_frame *frame,
166				 int *err);
167static void dx_release (struct dx_frame *frames);
168static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
169			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
170static void dx_sort_map(struct dx_map_entry *map, unsigned count);
171static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
172		struct dx_map_entry *offsets, int count);
173static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
174static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
175static int ext3_htree_next_block(struct inode *dir, __u32 hash,
176				 struct dx_frame *frame,
177				 struct dx_frame *frames,
178				 __u32 *start_hash);
179static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
180			struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
181			int *err);
182static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
183			     struct inode *inode);
184
185/*
186 * p is at least 6 bytes before the end of page
187 */
188static inline struct ext3_dir_entry_2 *
189ext3_next_entry(struct ext3_dir_entry_2 *p)
190{
191	return (struct ext3_dir_entry_2 *)((char *)p +
192		ext3_rec_len_from_disk(p->rec_len));
193}
194
195/*
196 * Future: use high four bits of block for coalesce-on-delete flags
197 * Mask them off for now.
198 */
199
200static inline unsigned dx_get_block (struct dx_entry *entry)
201{
202	return le32_to_cpu(entry->block) & 0x00ffffff;
203}
204
205static inline void dx_set_block (struct dx_entry *entry, unsigned value)
206{
207	entry->block = cpu_to_le32(value);
208}
209
210static inline unsigned dx_get_hash (struct dx_entry *entry)
211{
212	return le32_to_cpu(entry->hash);
213}
214
215static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
216{
217	entry->hash = cpu_to_le32(value);
218}
219
220static inline unsigned dx_get_count (struct dx_entry *entries)
221{
222	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
223}
224
225static inline unsigned dx_get_limit (struct dx_entry *entries)
226{
227	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
228}
229
230static inline void dx_set_count (struct dx_entry *entries, unsigned value)
231{
232	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
233}
234
235static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
236{
237	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
238}
239
240static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
241{
242	unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
243		EXT3_DIR_REC_LEN(2) - infosize;
244	return entry_space / sizeof(struct dx_entry);
245}
246
247static inline unsigned dx_node_limit (struct inode *dir)
248{
249	unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
250	return entry_space / sizeof(struct dx_entry);
251}
252
253/*
254 * Debug
255 */
256#ifdef DX_DEBUG
257static void dx_show_index (char * label, struct dx_entry *entries)
258{
259        int i, n = dx_get_count (entries);
260        printk("%s index ", label);
261        for (i = 0; i < n; i++)
262        {
263                printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i));
264        }
265        printk("\n");
266}
267
268struct stats
269{
270	unsigned names;
271	unsigned space;
272	unsigned bcount;
273};
274
275static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
276				 int size, int show_names)
277{
278	unsigned names = 0, space = 0;
279	char *base = (char *) de;
280	struct dx_hash_info h = *hinfo;
281
282	printk("names: ");
283	while ((char *) de < base + size)
284	{
285		if (de->inode)
286		{
287			if (show_names)
288			{
289				int len = de->name_len;
290				char *name = de->name;
291				while (len--) printk("%c", *name++);
292				ext3fs_dirhash(de->name, de->name_len, &h);
293				printk(":%x.%u ", h.hash,
294				       ((char *) de - base));
295			}
296			space += EXT3_DIR_REC_LEN(de->name_len);
297			names++;
298		}
299		de = ext3_next_entry(de);
300	}
301	printk("(%i)\n", names);
302	return (struct stats) { names, space, 1 };
303}
304
305struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
306			     struct dx_entry *entries, int levels)
307{
308	unsigned blocksize = dir->i_sb->s_blocksize;
309	unsigned count = dx_get_count (entries), names = 0, space = 0, i;
310	unsigned bcount = 0;
311	struct buffer_head *bh;
312	int err;
313	printk("%i indexed blocks...\n", count);
314	for (i = 0; i < count; i++, entries++)
315	{
316		u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
317		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
318		struct stats stats;
319		printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
320		if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
321		stats = levels?
322		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
323		   dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
324		names += stats.names;
325		space += stats.space;
326		bcount += stats.bcount;
327		brelse (bh);
328	}
329	if (bcount)
330		printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
331			names, space/bcount,(space/bcount)*100/blocksize);
332	return (struct stats) { names, space, bcount};
333}
334#endif /* DX_DEBUG */
335
336/*
337 * Probe for a directory leaf block to search.
338 *
339 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
340 * error in the directory index, and the caller should fall back to
341 * searching the directory normally.  The callers of dx_probe **MUST**
342 * check for this error code, and make sure it never gets reflected
343 * back to userspace.
344 */
345static struct dx_frame *
346dx_probe(struct qstr *entry, struct inode *dir,
347	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
348{
349	unsigned count, indirect;
350	struct dx_entry *at, *entries, *p, *q, *m;
351	struct dx_root *root;
352	struct buffer_head *bh;
353	struct dx_frame *frame = frame_in;
354	u32 hash;
355
356	frame->bh = NULL;
357	if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
358		goto fail;
359	root = (struct dx_root *) bh->b_data;
360	if (root->info.hash_version != DX_HASH_TEA &&
361	    root->info.hash_version != DX_HASH_HALF_MD4 &&
362	    root->info.hash_version != DX_HASH_LEGACY) {
363		ext3_warning(dir->i_sb, __func__,
364			     "Unrecognised inode hash code %d",
365			     root->info.hash_version);
366		brelse(bh);
367		*err = ERR_BAD_DX_DIR;
368		goto fail;
369	}
370	hinfo->hash_version = root->info.hash_version;
371	hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
372	if (entry)
373		ext3fs_dirhash(entry->name, entry->len, hinfo);
374	hash = hinfo->hash;
375
376	if (root->info.unused_flags & 1) {
377		ext3_warning(dir->i_sb, __func__,
378			     "Unimplemented inode hash flags: %#06x",
379			     root->info.unused_flags);
380		brelse(bh);
381		*err = ERR_BAD_DX_DIR;
382		goto fail;
383	}
384
385	if ((indirect = root->info.indirect_levels) > 1) {
386		ext3_warning(dir->i_sb, __func__,
387			     "Unimplemented inode hash depth: %#06x",
388			     root->info.indirect_levels);
389		brelse(bh);
390		*err = ERR_BAD_DX_DIR;
391		goto fail;
392	}
393
394	entries = (struct dx_entry *) (((char *)&root->info) +
395				       root->info.info_length);
396
397	if (dx_get_limit(entries) != dx_root_limit(dir,
398						   root->info.info_length)) {
399		ext3_warning(dir->i_sb, __func__,
400			     "dx entry: limit != root limit");
401		brelse(bh);
402		*err = ERR_BAD_DX_DIR;
403		goto fail;
404	}
405
406	dxtrace (printk("Look up %x", hash));
407	while (1)
408	{
409		count = dx_get_count(entries);
410		if (!count || count > dx_get_limit(entries)) {
411			ext3_warning(dir->i_sb, __func__,
412				     "dx entry: no count or count > limit");
413			brelse(bh);
414			*err = ERR_BAD_DX_DIR;
415			goto fail2;
416		}
417
418		p = entries + 1;
419		q = entries + count - 1;
420		while (p <= q)
421		{
422			m = p + (q - p)/2;
423			dxtrace(printk("."));
424			if (dx_get_hash(m) > hash)
425				q = m - 1;
426			else
427				p = m + 1;
428		}
429
430		if (0) // linear search cross check
431		{
432			unsigned n = count - 1;
433			at = entries;
434			while (n--)
435			{
436				dxtrace(printk(","));
437				if (dx_get_hash(++at) > hash)
438				{
439					at--;
440					break;
441				}
442			}
443			assert (at == p - 1);
444		}
445
446		at = p - 1;
447		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
448		frame->bh = bh;
449		frame->entries = entries;
450		frame->at = at;
451		if (!indirect--) return frame;
452		if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
453			goto fail2;
454		at = entries = ((struct dx_node *) bh->b_data)->entries;
455		if (dx_get_limit(entries) != dx_node_limit (dir)) {
456			ext3_warning(dir->i_sb, __func__,
457				     "dx entry: limit != node limit");
458			brelse(bh);
459			*err = ERR_BAD_DX_DIR;
460			goto fail2;
461		}
462		frame++;
463		frame->bh = NULL;
464	}
465fail2:
466	while (frame >= frame_in) {
467		brelse(frame->bh);
468		frame--;
469	}
470fail:
471	if (*err == ERR_BAD_DX_DIR)
472		ext3_warning(dir->i_sb, __func__,
473			     "Corrupt dir inode %ld, running e2fsck is "
474			     "recommended.", dir->i_ino);
475	return NULL;
476}
477
478static void dx_release (struct dx_frame *frames)
479{
480	if (frames[0].bh == NULL)
481		return;
482
483	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
484		brelse(frames[1].bh);
485	brelse(frames[0].bh);
486}
487
488/*
489 * This function increments the frame pointer to search the next leaf
490 * block, and reads in the necessary intervening nodes if the search
491 * should be necessary.  Whether or not the search is necessary is
492 * controlled by the hash parameter.  If the hash value is even, then
493 * the search is only continued if the next block starts with that
494 * hash value.  This is used if we are searching for a specific file.
495 *
496 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
497 *
498 * This function returns 1 if the caller should continue to search,
499 * or 0 if it should not.  If there is an error reading one of the
500 * index blocks, it will a negative error code.
501 *
502 * If start_hash is non-null, it will be filled in with the starting
503 * hash of the next page.
504 */
505static int ext3_htree_next_block(struct inode *dir, __u32 hash,
506				 struct dx_frame *frame,
507				 struct dx_frame *frames,
508				 __u32 *start_hash)
509{
510	struct dx_frame *p;
511	struct buffer_head *bh;
512	int err, num_frames = 0;
513	__u32 bhash;
514
515	p = frame;
516	/*
517	 * Find the next leaf page by incrementing the frame pointer.
518	 * If we run out of entries in the interior node, loop around and
519	 * increment pointer in the parent node.  When we break out of
520	 * this loop, num_frames indicates the number of interior
521	 * nodes need to be read.
522	 */
523	while (1) {
524		if (++(p->at) < p->entries + dx_get_count(p->entries))
525			break;
526		if (p == frames)
527			return 0;
528		num_frames++;
529		p--;
530	}
531
532	/*
533	 * If the hash is 1, then continue only if the next page has a
534	 * continuation hash of any value.  This is used for readdir
535	 * handling.  Otherwise, check to see if the hash matches the
536	 * desired contiuation hash.  If it doesn't, return since
537	 * there's no point to read in the successive index pages.
538	 */
539	bhash = dx_get_hash(p->at);
540	if (start_hash)
541		*start_hash = bhash;
542	if ((hash & 1) == 0) {
543		if ((bhash & ~1) != hash)
544			return 0;
545	}
546	/*
547	 * If the hash is HASH_NB_ALWAYS, we always go to the next
548	 * block so no check is necessary
549	 */
550	while (num_frames--) {
551		if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
552				      0, &err)))
553			return err; /* Failure */
554		p++;
555		brelse (p->bh);
556		p->bh = bh;
557		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
558	}
559	return 1;
560}
561
562
563/*
564 * This function fills a red-black tree with information from a
565 * directory block.  It returns the number directory entries loaded
566 * into the tree.  If there is an error it is returned in err.
567 */
568static int htree_dirblock_to_tree(struct file *dir_file,
569				  struct inode *dir, int block,
570				  struct dx_hash_info *hinfo,
571				  __u32 start_hash, __u32 start_minor_hash)
572{
573	struct buffer_head *bh;
574	struct ext3_dir_entry_2 *de, *top;
575	int err, count = 0;
576
577	dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
578	if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
579		return err;
580
581	de = (struct ext3_dir_entry_2 *) bh->b_data;
582	top = (struct ext3_dir_entry_2 *) ((char *) de +
583					   dir->i_sb->s_blocksize -
584					   EXT3_DIR_REC_LEN(0));
585	for (; de < top; de = ext3_next_entry(de)) {
586		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
587					(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
588						+((char *)de - bh->b_data))) {
589			/* On error, skip the f_pos to the next block. */
590			dir_file->f_pos = (dir_file->f_pos |
591					(dir->i_sb->s_blocksize - 1)) + 1;
592			brelse (bh);
593			return count;
594		}
595		ext3fs_dirhash(de->name, de->name_len, hinfo);
596		if ((hinfo->hash < start_hash) ||
597		    ((hinfo->hash == start_hash) &&
598		     (hinfo->minor_hash < start_minor_hash)))
599			continue;
600		if (de->inode == 0)
601			continue;
602		if ((err = ext3_htree_store_dirent(dir_file,
603				   hinfo->hash, hinfo->minor_hash, de)) != 0) {
604			brelse(bh);
605			return err;
606		}
607		count++;
608	}
609	brelse(bh);
610	return count;
611}
612
613
614/*
615 * This function fills a red-black tree with information from a
616 * directory.  We start scanning the directory in hash order, starting
617 * at start_hash and start_minor_hash.
618 *
619 * This function returns the number of entries inserted into the tree,
620 * or a negative error code.
621 */
622int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
623			 __u32 start_minor_hash, __u32 *next_hash)
624{
625	struct dx_hash_info hinfo;
626	struct ext3_dir_entry_2 *de;
627	struct dx_frame frames[2], *frame;
628	struct inode *dir;
629	int block, err;
630	int count = 0;
631	int ret;
632	__u32 hashval;
633
634	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
635		       start_minor_hash));
636	dir = dir_file->f_path.dentry->d_inode;
637	if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
638		hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
639		hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
640		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
641					       start_hash, start_minor_hash);
642		*next_hash = ~0;
643		return count;
644	}
645	hinfo.hash = start_hash;
646	hinfo.minor_hash = 0;
647	frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
648	if (!frame)
649		return err;
650
651	/* Add '.' and '..' from the htree header */
652	if (!start_hash && !start_minor_hash) {
653		de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
654		if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
655			goto errout;
656		count++;
657	}
658	if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
659		de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
660		de = ext3_next_entry(de);
661		if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0)
662			goto errout;
663		count++;
664	}
665
666	while (1) {
667		block = dx_get_block(frame->at);
668		ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
669					     start_hash, start_minor_hash);
670		if (ret < 0) {
671			err = ret;
672			goto errout;
673		}
674		count += ret;
675		hashval = ~0;
676		ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
677					    frame, frames, &hashval);
678		*next_hash = hashval;
679		if (ret < 0) {
680			err = ret;
681			goto errout;
682		}
683		/*
684		 * Stop if:  (a) there are no more entries, or
685		 * (b) we have inserted at least one entry and the
686		 * next hash value is not a continuation
687		 */
688		if ((ret == 0) ||
689		    (count && ((hashval & 1) == 0)))
690			break;
691	}
692	dx_release(frames);
693	dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
694		       count, *next_hash));
695	return count;
696errout:
697	dx_release(frames);
698	return (err);
699}
700
701
702/*
703 * Directory block splitting, compacting
704 */
705
706/*
707 * Create map of hash values, offsets, and sizes, stored at end of block.
708 * Returns number of entries mapped.
709 */
710static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
711			struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
712{
713	int count = 0;
714	char *base = (char *) de;
715	struct dx_hash_info h = *hinfo;
716
717	while ((char *) de < base + size)
718	{
719		if (de->name_len && de->inode) {
720			ext3fs_dirhash(de->name, de->name_len, &h);
721			map_tail--;
722			map_tail->hash = h.hash;
723			map_tail->offs = (u16) ((char *) de - base);
724			map_tail->size = le16_to_cpu(de->rec_len);
725			count++;
726			cond_resched();
727		}
728		/* XXX: do we need to check rec_len == 0 case? -Chris */
729		de = ext3_next_entry(de);
730	}
731	return count;
732}
733
734/* Sort map by hash value */
735static void dx_sort_map (struct dx_map_entry *map, unsigned count)
736{
737        struct dx_map_entry *p, *q, *top = map + count - 1;
738        int more;
739        /* Combsort until bubble sort doesn't suck */
740        while (count > 2)
741	{
742                count = count*10/13;
743                if (count - 9 < 2) /* 9, 10 -> 11 */
744                        count = 11;
745                for (p = top, q = p - count; q >= map; p--, q--)
746                        if (p->hash < q->hash)
747                                swap(*p, *q);
748        }
749        /* Garden variety bubble sort */
750        do {
751                more = 0;
752                q = top;
753                while (q-- > map)
754		{
755                        if (q[1].hash >= q[0].hash)
756				continue;
757                        swap(*(q+1), *q);
758                        more = 1;
759		}
760	} while(more);
761}
762
763static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
764{
765	struct dx_entry *entries = frame->entries;
766	struct dx_entry *old = frame->at, *new = old + 1;
767	int count = dx_get_count(entries);
768
769	assert(count < dx_get_limit(entries));
770	assert(old < entries + count);
771	memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
772	dx_set_hash(new, hash);
773	dx_set_block(new, block);
774	dx_set_count(entries, count + 1);
775}
776
777static void ext3_update_dx_flag(struct inode *inode)
778{
779	if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
780				     EXT3_FEATURE_COMPAT_DIR_INDEX))
781		EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
782}
783
784/*
785 * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
786 *
787 * `len <= EXT3_NAME_LEN' is guaranteed by caller.
788 * `de != NULL' is guaranteed by caller.
789 */
790static inline int ext3_match (int len, const char * const name,
791			      struct ext3_dir_entry_2 * de)
792{
793	if (len != de->name_len)
794		return 0;
795	if (!de->inode)
796		return 0;
797	return !memcmp(name, de->name, len);
798}
799
800/*
801 * Returns 0 if not found, -1 on failure, and 1 on success
802 */
803static inline int search_dirblock(struct buffer_head * bh,
804				  struct inode *dir,
805				  struct qstr *child,
806				  unsigned long offset,
807				  struct ext3_dir_entry_2 ** res_dir)
808{
809	struct ext3_dir_entry_2 * de;
810	char * dlimit;
811	int de_len;
812	const char *name = child->name;
813	int namelen = child->len;
814
815	de = (struct ext3_dir_entry_2 *) bh->b_data;
816	dlimit = bh->b_data + dir->i_sb->s_blocksize;
817	while ((char *) de < dlimit) {
818		/* this code is executed quadratically often */
819		/* do minimal checking `by hand' */
820
821		if ((char *) de + namelen <= dlimit &&
822		    ext3_match (namelen, name, de)) {
823			/* found a match - just to be sure, do a full check */
824			if (!ext3_check_dir_entry("ext3_find_entry",
825						  dir, de, bh, offset))
826				return -1;
827			*res_dir = de;
828			return 1;
829		}
830		/* prevent looping on a bad block */
831		de_len = ext3_rec_len_from_disk(de->rec_len);
832		if (de_len <= 0)
833			return -1;
834		offset += de_len;
835		de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
836	}
837	return 0;
838}
839
840
841/*
842 *	ext3_find_entry()
843 *
844 * finds an entry in the specified directory with the wanted name. It
845 * returns the cache buffer in which the entry was found, and the entry
846 * itself (as a parameter - res_dir). It does NOT read the inode of the
847 * entry - you'll have to do that yourself if you want to.
848 *
849 * The returned buffer_head has ->b_count elevated.  The caller is expected
850 * to brelse() it when appropriate.
851 */
852static struct buffer_head *ext3_find_entry(struct inode *dir,
853					struct qstr *entry,
854					struct ext3_dir_entry_2 **res_dir)
855{
856	struct super_block * sb;
857	struct buffer_head * bh_use[NAMEI_RA_SIZE];
858	struct buffer_head * bh, *ret = NULL;
859	unsigned long start, block, b;
860	int ra_max = 0;		/* Number of bh's in the readahead
861				   buffer, bh_use[] */
862	int ra_ptr = 0;		/* Current index into readahead
863				   buffer */
864	int num = 0;
865	int nblocks, i, err;
866	int namelen;
867
868	*res_dir = NULL;
869	sb = dir->i_sb;
870	namelen = entry->len;
871	if (namelen > EXT3_NAME_LEN)
872		return NULL;
873	if (is_dx(dir)) {
874		bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
875		/*
876		 * On success, or if the error was file not found,
877		 * return.  Otherwise, fall back to doing a search the
878		 * old fashioned way.
879		 */
880		if (bh || (err != ERR_BAD_DX_DIR))
881			return bh;
882		dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
883	}
884	nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
885	start = EXT3_I(dir)->i_dir_start_lookup;
886	if (start >= nblocks)
887		start = 0;
888	block = start;
889restart:
890	do {
891		/*
892		 * We deal with the read-ahead logic here.
893		 */
894		if (ra_ptr >= ra_max) {
895			/* Refill the readahead buffer */
896			ra_ptr = 0;
897			b = block;
898			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
899				/*
900				 * Terminate if we reach the end of the
901				 * directory and must wrap, or if our
902				 * search has finished at this block.
903				 */
904				if (b >= nblocks || (num && block == start)) {
905					bh_use[ra_max] = NULL;
906					break;
907				}
908				num++;
909				bh = ext3_getblk(NULL, dir, b++, 0, &err);
910				bh_use[ra_max] = bh;
911				if (bh)
912					ll_rw_block(READ_META, 1, &bh);
913			}
914		}
915		if ((bh = bh_use[ra_ptr++]) == NULL)
916			goto next;
917		wait_on_buffer(bh);
918		if (!buffer_uptodate(bh)) {
919			/* read error, skip block & hope for the best */
920			ext3_error(sb, __func__, "reading directory #%lu "
921				   "offset %lu", dir->i_ino, block);
922			brelse(bh);
923			goto next;
924		}
925		i = search_dirblock(bh, dir, entry,
926			    block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
927		if (i == 1) {
928			EXT3_I(dir)->i_dir_start_lookup = block;
929			ret = bh;
930			goto cleanup_and_exit;
931		} else {
932			brelse(bh);
933			if (i < 0)
934				goto cleanup_and_exit;
935		}
936	next:
937		if (++block >= nblocks)
938			block = 0;
939	} while (block != start);
940
941	/*
942	 * If the directory has grown while we were searching, then
943	 * search the last part of the directory before giving up.
944	 */
945	block = nblocks;
946	nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
947	if (block < nblocks) {
948		start = 0;
949		goto restart;
950	}
951
952cleanup_and_exit:
953	/* Clean up the read-ahead blocks */
954	for (; ra_ptr < ra_max; ra_ptr++)
955		brelse (bh_use[ra_ptr]);
956	return ret;
957}
958
959static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
960			struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
961			int *err)
962{
963	struct super_block * sb;
964	struct dx_hash_info	hinfo;
965	u32 hash;
966	struct dx_frame frames[2], *frame;
967	struct ext3_dir_entry_2 *de, *top;
968	struct buffer_head *bh;
969	unsigned long block;
970	int retval;
971	int namelen = entry->len;
972	const u8 *name = entry->name;
973
974	sb = dir->i_sb;
975	/* NFS may look up ".." - look at dx_root directory block */
976	if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
977		if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
978			return NULL;
979	} else {
980		frame = frames;
981		frame->bh = NULL;			/* for dx_release() */
982		frame->at = (struct dx_entry *)frames;	/* hack for zero entry*/
983		dx_set_block(frame->at, 0);		/* dx_root block is 0 */
984	}
985	hash = hinfo.hash;
986	do {
987		block = dx_get_block(frame->at);
988		if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
989			goto errout;
990		de = (struct ext3_dir_entry_2 *) bh->b_data;
991		top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
992				       EXT3_DIR_REC_LEN(0));
993		for (; de < top; de = ext3_next_entry(de)) {
994			int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
995				  + ((char *) de - bh->b_data);
996
997			if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
998				brelse(bh);
999				*err = ERR_BAD_DX_DIR;
1000				goto errout;
1001			}
1002
1003			if (ext3_match(namelen, name, de)) {
1004				*res_dir = de;
1005				dx_release(frames);
1006				return bh;
1007			}
1008		}
1009		brelse (bh);
1010		/* Check to see if we should continue to search */
1011		retval = ext3_htree_next_block(dir, hash, frame,
1012					       frames, NULL);
1013		if (retval < 0) {
1014			ext3_warning(sb, __func__,
1015			     "error reading index page in directory #%lu",
1016			     dir->i_ino);
1017			*err = retval;
1018			goto errout;
1019		}
1020	} while (retval == 1);
1021
1022	*err = -ENOENT;
1023errout:
1024	dxtrace(printk("%s not found\n", name));
1025	dx_release (frames);
1026	return NULL;
1027}
1028
1029static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
1030{
1031	struct inode * inode;
1032	struct ext3_dir_entry_2 * de;
1033	struct buffer_head * bh;
1034
1035	if (dentry->d_name.len > EXT3_NAME_LEN)
1036		return ERR_PTR(-ENAMETOOLONG);
1037
1038	bh = ext3_find_entry(dir, &dentry->d_name, &de);
1039	inode = NULL;
1040	if (bh) {
1041		unsigned long ino = le32_to_cpu(de->inode);
1042		brelse (bh);
1043		if (!ext3_valid_inum(dir->i_sb, ino)) {
1044			ext3_error(dir->i_sb, "ext3_lookup",
1045				   "bad inode number: %lu", ino);
1046			return ERR_PTR(-EIO);
1047		}
1048		inode = ext3_iget(dir->i_sb, ino);
1049		if (IS_ERR(inode))
1050			return ERR_CAST(inode);
1051	}
1052	return d_splice_alias(inode, dentry);
1053}
1054
1055
1056struct dentry *ext3_get_parent(struct dentry *child)
1057{
1058	unsigned long ino;
1059	struct qstr dotdot = {.name = "..", .len = 2};
1060	struct ext3_dir_entry_2 * de;
1061	struct buffer_head *bh;
1062
1063	bh = ext3_find_entry(child->d_inode, &dotdot, &de);
1064	if (!bh)
1065		return ERR_PTR(-ENOENT);
1066	ino = le32_to_cpu(de->inode);
1067	brelse(bh);
1068
1069	if (!ext3_valid_inum(child->d_inode->i_sb, ino)) {
1070		ext3_error(child->d_inode->i_sb, "ext3_get_parent",
1071			   "bad inode number: %lu", ino);
1072		return ERR_PTR(-EIO);
1073	}
1074
1075	return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino));
1076}
1077
1078#define S_SHIFT 12
1079static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
1080	[S_IFREG >> S_SHIFT]	= EXT3_FT_REG_FILE,
1081	[S_IFDIR >> S_SHIFT]	= EXT3_FT_DIR,
1082	[S_IFCHR >> S_SHIFT]	= EXT3_FT_CHRDEV,
1083	[S_IFBLK >> S_SHIFT]	= EXT3_FT_BLKDEV,
1084	[S_IFIFO >> S_SHIFT]	= EXT3_FT_FIFO,
1085	[S_IFSOCK >> S_SHIFT]	= EXT3_FT_SOCK,
1086	[S_IFLNK >> S_SHIFT]	= EXT3_FT_SYMLINK,
1087};
1088
1089static inline void ext3_set_de_type(struct super_block *sb,
1090				struct ext3_dir_entry_2 *de,
1091				umode_t mode) {
1092	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
1093		de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1094}
1095
1096/*
1097 * Move count entries from end of map between two memory locations.
1098 * Returns pointer to last entry moved.
1099 */
1100static struct ext3_dir_entry_2 *
1101dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1102{
1103	unsigned rec_len = 0;
1104
1105	while (count--) {
1106		struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
1107		rec_len = EXT3_DIR_REC_LEN(de->name_len);
1108		memcpy (to, de, rec_len);
1109		((struct ext3_dir_entry_2 *) to)->rec_len =
1110				ext3_rec_len_to_disk(rec_len);
1111		de->inode = 0;
1112		map++;
1113		to += rec_len;
1114	}
1115	return (struct ext3_dir_entry_2 *) (to - rec_len);
1116}
1117
1118/*
1119 * Compact each dir entry in the range to the minimal rec_len.
1120 * Returns pointer to last entry in range.
1121 */
1122static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
1123{
1124	struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
1125	unsigned rec_len = 0;
1126
1127	prev = to = de;
1128	while ((char*)de < base + size) {
1129		next = ext3_next_entry(de);
1130		if (de->inode && de->name_len) {
1131			rec_len = EXT3_DIR_REC_LEN(de->name_len);
1132			if (de > to)
1133				memmove(to, de, rec_len);
1134			to->rec_len = ext3_rec_len_to_disk(rec_len);
1135			prev = to;
1136			to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
1137		}
1138		de = next;
1139	}
1140	return prev;
1141}
1142
1143/*
1144 * Split a full leaf block to make room for a new dir entry.
1145 * Allocate a new block, and move entries so that they are approx. equally full.
1146 * Returns pointer to de in block into which the new entry will be inserted.
1147 */
1148static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1149			struct buffer_head **bh,struct dx_frame *frame,
1150			struct dx_hash_info *hinfo, int *error)
1151{
1152	unsigned blocksize = dir->i_sb->s_blocksize;
1153	unsigned count, continued;
1154	struct buffer_head *bh2;
1155	u32 newblock;
1156	u32 hash2;
1157	struct dx_map_entry *map;
1158	char *data1 = (*bh)->b_data, *data2;
1159	unsigned split, move, size, i;
1160	struct ext3_dir_entry_2 *de = NULL, *de2;
1161	int	err = 0;
1162
1163	bh2 = ext3_append (handle, dir, &newblock, &err);
1164	if (!(bh2)) {
1165		brelse(*bh);
1166		*bh = NULL;
1167		goto errout;
1168	}
1169
1170	BUFFER_TRACE(*bh, "get_write_access");
1171	err = ext3_journal_get_write_access(handle, *bh);
1172	if (err)
1173		goto journal_error;
1174
1175	BUFFER_TRACE(frame->bh, "get_write_access");
1176	err = ext3_journal_get_write_access(handle, frame->bh);
1177	if (err)
1178		goto journal_error;
1179
1180	data2 = bh2->b_data;
1181
1182	/* create map in the end of data2 block */
1183	map = (struct dx_map_entry *) (data2 + blocksize);
1184	count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
1185			     blocksize, hinfo, map);
1186	map -= count;
1187	dx_sort_map (map, count);
1188	/* Split the existing block in the middle, size-wise */
1189	size = 0;
1190	move = 0;
1191	for (i = count-1; i >= 0; i--) {
1192		/* is more than half of this entry in 2nd half of the block? */
1193		if (size + map[i].size/2 > blocksize/2)
1194			break;
1195		size += map[i].size;
1196		move++;
1197	}
1198	/* map index at which we will split */
1199	split = count - move;
1200	hash2 = map[split].hash;
1201	continued = hash2 == map[split - 1].hash;
1202	dxtrace(printk("Split block %i at %x, %i/%i\n",
1203		dx_get_block(frame->at), hash2, split, count-split));
1204
1205	/* Fancy dance to stay within two buffers */
1206	de2 = dx_move_dirents(data1, data2, map + split, count - split);
1207	de = dx_pack_dirents(data1,blocksize);
1208	de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
1209	de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2);
1210	dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
1211	dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
1212
1213	/* Which block gets the new entry? */
1214	if (hinfo->hash >= hash2)
1215	{
1216		swap(*bh, bh2);
1217		de = de2;
1218	}
1219	dx_insert_block (frame, hash2 + continued, newblock);
1220	err = ext3_journal_dirty_metadata (handle, bh2);
1221	if (err)
1222		goto journal_error;
1223	err = ext3_journal_dirty_metadata (handle, frame->bh);
1224	if (err)
1225		goto journal_error;
1226	brelse (bh2);
1227	dxtrace(dx_show_index ("frame", frame->entries));
1228	return de;
1229
1230journal_error:
1231	brelse(*bh);
1232	brelse(bh2);
1233	*bh = NULL;
1234	ext3_std_error(dir->i_sb, err);
1235errout:
1236	*error = err;
1237	return NULL;
1238}
1239
1240
1241/*
1242 * Add a new entry into a directory (leaf) block.  If de is non-NULL,
1243 * it points to a directory entry which is guaranteed to be large
1244 * enough for new directory entry.  If de is NULL, then
1245 * add_dirent_to_buf will attempt search the directory block for
1246 * space.  It will return -ENOSPC if no space is available, and -EIO
1247 * and -EEXIST if directory entry already exists.
1248 *
1249 * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
1250 * all other cases bh is released.
1251 */
1252static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1253			     struct inode *inode, struct ext3_dir_entry_2 *de,
1254			     struct buffer_head * bh)
1255{
1256	struct inode	*dir = dentry->d_parent->d_inode;
1257	const char	*name = dentry->d_name.name;
1258	int		namelen = dentry->d_name.len;
1259	unsigned long	offset = 0;
1260	unsigned short	reclen;
1261	int		nlen, rlen, err;
1262	char		*top;
1263
1264	reclen = EXT3_DIR_REC_LEN(namelen);
1265	if (!de) {
1266		de = (struct ext3_dir_entry_2 *)bh->b_data;
1267		top = bh->b_data + dir->i_sb->s_blocksize - reclen;
1268		while ((char *) de <= top) {
1269			if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
1270						  bh, offset)) {
1271				brelse (bh);
1272				return -EIO;
1273			}
1274			if (ext3_match (namelen, name, de)) {
1275				brelse (bh);
1276				return -EEXIST;
1277			}
1278			nlen = EXT3_DIR_REC_LEN(de->name_len);
1279			rlen = ext3_rec_len_from_disk(de->rec_len);
1280			if ((de->inode? rlen - nlen: rlen) >= reclen)
1281				break;
1282			de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
1283			offset += rlen;
1284		}
1285		if ((char *) de > top)
1286			return -ENOSPC;
1287	}
1288	BUFFER_TRACE(bh, "get_write_access");
1289	err = ext3_journal_get_write_access(handle, bh);
1290	if (err) {
1291		ext3_std_error(dir->i_sb, err);
1292		brelse(bh);
1293		return err;
1294	}
1295
1296	/* By now the buffer is marked for journaling */
1297	nlen = EXT3_DIR_REC_LEN(de->name_len);
1298	rlen = ext3_rec_len_from_disk(de->rec_len);
1299	if (de->inode) {
1300		struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
1301		de1->rec_len = ext3_rec_len_to_disk(rlen - nlen);
1302		de->rec_len = ext3_rec_len_to_disk(nlen);
1303		de = de1;
1304	}
1305	de->file_type = EXT3_FT_UNKNOWN;
1306	if (inode) {
1307		de->inode = cpu_to_le32(inode->i_ino);
1308		ext3_set_de_type(dir->i_sb, de, inode->i_mode);
1309	} else
1310		de->inode = 0;
1311	de->name_len = namelen;
1312	memcpy (de->name, name, namelen);
1313	/*
1314	 * XXX shouldn't update any times until successful
1315	 * completion of syscall, but too many callers depend
1316	 * on this.
1317	 *
1318	 * XXX similarly, too many callers depend on
1319	 * ext3_new_inode() setting the times, but error
1320	 * recovery deletes the inode, so the worst that can
1321	 * happen is that the times are slightly out of date
1322	 * and/or different from the directory change time.
1323	 */
1324	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
1325	ext3_update_dx_flag(dir);
1326	dir->i_version++;
1327	ext3_mark_inode_dirty(handle, dir);
1328	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1329	err = ext3_journal_dirty_metadata(handle, bh);
1330	if (err)
1331		ext3_std_error(dir->i_sb, err);
1332	brelse(bh);
1333	return 0;
1334}
1335
1336/*
1337 * This converts a one block unindexed directory to a 3 block indexed
1338 * directory, and adds the dentry to the indexed directory.
1339 */
1340static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1341			    struct inode *inode, struct buffer_head *bh)
1342{
1343	struct inode	*dir = dentry->d_parent->d_inode;
1344	const char	*name = dentry->d_name.name;
1345	int		namelen = dentry->d_name.len;
1346	struct buffer_head *bh2;
1347	struct dx_root	*root;
1348	struct dx_frame	frames[2], *frame;
1349	struct dx_entry *entries;
1350	struct ext3_dir_entry_2	*de, *de2;
1351	char		*data1, *top;
1352	unsigned	len;
1353	int		retval;
1354	unsigned	blocksize;
1355	struct dx_hash_info hinfo;
1356	u32		block;
1357	struct fake_dirent *fde;
1358
1359	blocksize =  dir->i_sb->s_blocksize;
1360	dxtrace(printk("Creating index\n"));
1361	retval = ext3_journal_get_write_access(handle, bh);
1362	if (retval) {
1363		ext3_std_error(dir->i_sb, retval);
1364		brelse(bh);
1365		return retval;
1366	}
1367	root = (struct dx_root *) bh->b_data;
1368
1369	bh2 = ext3_append (handle, dir, &block, &retval);
1370	if (!(bh2)) {
1371		brelse(bh);
1372		return retval;
1373	}
1374	EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
1375	data1 = bh2->b_data;
1376
1377	/* The 0th block becomes the root, move the dirents out */
1378	fde = &root->dotdot;
1379	de = (struct ext3_dir_entry_2 *)((char *)fde +
1380			ext3_rec_len_from_disk(fde->rec_len));
1381	len = ((char *) root) + blocksize - (char *) de;
1382	memcpy (data1, de, len);
1383	de = (struct ext3_dir_entry_2 *) data1;
1384	top = data1 + len;
1385	while ((char *)(de2 = ext3_next_entry(de)) < top)
1386		de = de2;
1387	de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
1388	/* Initialize the root; the dot dirents already exist */
1389	de = (struct ext3_dir_entry_2 *) (&root->dotdot);
1390	de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2));
1391	memset (&root->info, 0, sizeof(root->info));
1392	root->info.info_length = sizeof(root->info);
1393	root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
1394	entries = root->entries;
1395	dx_set_block (entries, 1);
1396	dx_set_count (entries, 1);
1397	dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
1398
1399	/* Initialize as for dx_probe */
1400	hinfo.hash_version = root->info.hash_version;
1401	hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1402	ext3fs_dirhash(name, namelen, &hinfo);
1403	frame = frames;
1404	frame->entries = entries;
1405	frame->at = entries;
1406	frame->bh = bh;
1407	bh = bh2;
1408	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1409	dx_release (frames);
1410	if (!(de))
1411		return retval;
1412
1413	return add_dirent_to_buf(handle, dentry, inode, de, bh);
1414}
1415
1416/*
1417 *	ext3_add_entry()
1418 *
1419 * adds a file entry to the specified directory, using the same
1420 * semantics as ext3_find_entry(). It returns NULL if it failed.
1421 *
1422 * NOTE!! The inode part of 'de' is left at 0 - which means you
1423 * may not sleep between calling this and putting something into
1424 * the entry, as someone else might have used it while you slept.
1425 */
1426static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1427	struct inode *inode)
1428{
1429	struct inode *dir = dentry->d_parent->d_inode;
1430	unsigned long offset;
1431	struct buffer_head * bh;
1432	struct ext3_dir_entry_2 *de;
1433	struct super_block * sb;
1434	int	retval;
1435	int	dx_fallback=0;
1436	unsigned blocksize;
1437	u32 block, blocks;
1438
1439	sb = dir->i_sb;
1440	blocksize = sb->s_blocksize;
1441	if (!dentry->d_name.len)
1442		return -EINVAL;
1443	if (is_dx(dir)) {
1444		retval = ext3_dx_add_entry(handle, dentry, inode);
1445		if (!retval || (retval != ERR_BAD_DX_DIR))
1446			return retval;
1447		EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
1448		dx_fallback++;
1449		ext3_mark_inode_dirty(handle, dir);
1450	}
1451	blocks = dir->i_size >> sb->s_blocksize_bits;
1452	for (block = 0, offset = 0; block < blocks; block++) {
1453		bh = ext3_bread(handle, dir, block, 0, &retval);
1454		if(!bh)
1455			return retval;
1456		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1457		if (retval != -ENOSPC)
1458			return retval;
1459
1460		if (blocks == 1 && !dx_fallback &&
1461		    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
1462			return make_indexed_dir(handle, dentry, inode, bh);
1463		brelse(bh);
1464	}
1465	bh = ext3_append(handle, dir, &block, &retval);
1466	if (!bh)
1467		return retval;
1468	de = (struct ext3_dir_entry_2 *) bh->b_data;
1469	de->inode = 0;
1470	de->rec_len = ext3_rec_len_to_disk(blocksize);
1471	return add_dirent_to_buf(handle, dentry, inode, de, bh);
1472}
1473
1474/*
1475 * Returns 0 for success, or a negative error value
1476 */
1477static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1478			     struct inode *inode)
1479{
1480	struct dx_frame frames[2], *frame;
1481	struct dx_entry *entries, *at;
1482	struct dx_hash_info hinfo;
1483	struct buffer_head * bh;
1484	struct inode *dir = dentry->d_parent->d_inode;
1485	struct super_block * sb = dir->i_sb;
1486	struct ext3_dir_entry_2 *de;
1487	int err;
1488
1489	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1490	if (!frame)
1491		return err;
1492	entries = frame->entries;
1493	at = frame->at;
1494
1495	if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
1496		goto cleanup;
1497
1498	BUFFER_TRACE(bh, "get_write_access");
1499	err = ext3_journal_get_write_access(handle, bh);
1500	if (err)
1501		goto journal_error;
1502
1503	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1504	if (err != -ENOSPC) {
1505		bh = NULL;
1506		goto cleanup;
1507	}
1508
1509	/* Block full, should compress but for now just split */
1510	dxtrace(printk("using %u of %u node entries\n",
1511		       dx_get_count(entries), dx_get_limit(entries)));
1512	/* Need to split index? */
1513	if (dx_get_count(entries) == dx_get_limit(entries)) {
1514		u32 newblock;
1515		unsigned icount = dx_get_count(entries);
1516		int levels = frame - frames;
1517		struct dx_entry *entries2;
1518		struct dx_node *node2;
1519		struct buffer_head *bh2;
1520
1521		if (levels && (dx_get_count(frames->entries) ==
1522			       dx_get_limit(frames->entries))) {
1523			ext3_warning(sb, __func__,
1524				     "Directory index full!");
1525			err = -ENOSPC;
1526			goto cleanup;
1527		}
1528		bh2 = ext3_append (handle, dir, &newblock, &err);
1529		if (!(bh2))
1530			goto cleanup;
1531		node2 = (struct dx_node *)(bh2->b_data);
1532		entries2 = node2->entries;
1533		node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
1534		node2->fake.inode = 0;
1535		BUFFER_TRACE(frame->bh, "get_write_access");
1536		err = ext3_journal_get_write_access(handle, frame->bh);
1537		if (err)
1538			goto journal_error;
1539		if (levels) {
1540			unsigned icount1 = icount/2, icount2 = icount - icount1;
1541			unsigned hash2 = dx_get_hash(entries + icount1);
1542			dxtrace(printk("Split index %i/%i\n", icount1, icount2));
1543
1544			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1545			err = ext3_journal_get_write_access(handle,
1546							     frames[0].bh);
1547			if (err)
1548				goto journal_error;
1549
1550			memcpy ((char *) entries2, (char *) (entries + icount1),
1551				icount2 * sizeof(struct dx_entry));
1552			dx_set_count (entries, icount1);
1553			dx_set_count (entries2, icount2);
1554			dx_set_limit (entries2, dx_node_limit(dir));
1555
1556			/* Which index block gets the new entry? */
1557			if (at - entries >= icount1) {
1558				frame->at = at = at - entries - icount1 + entries2;
1559				frame->entries = entries = entries2;
1560				swap(frame->bh, bh2);
1561			}
1562			dx_insert_block (frames + 0, hash2, newblock);
1563			dxtrace(dx_show_index ("node", frames[1].entries));
1564			dxtrace(dx_show_index ("node",
1565			       ((struct dx_node *) bh2->b_data)->entries));
1566			err = ext3_journal_dirty_metadata(handle, bh2);
1567			if (err)
1568				goto journal_error;
1569			brelse (bh2);
1570		} else {
1571			dxtrace(printk("Creating second level index...\n"));
1572			memcpy((char *) entries2, (char *) entries,
1573			       icount * sizeof(struct dx_entry));
1574			dx_set_limit(entries2, dx_node_limit(dir));
1575
1576			/* Set up root */
1577			dx_set_count(entries, 1);
1578			dx_set_block(entries + 0, newblock);
1579			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1580
1581			/* Add new access path frame */
1582			frame = frames + 1;
1583			frame->at = at = at - entries + entries2;
1584			frame->entries = entries = entries2;
1585			frame->bh = bh2;
1586			err = ext3_journal_get_write_access(handle,
1587							     frame->bh);
1588			if (err)
1589				goto journal_error;
1590		}
1591		ext3_journal_dirty_metadata(handle, frames[0].bh);
1592	}
1593	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1594	if (!de)
1595		goto cleanup;
1596	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1597	bh = NULL;
1598	goto cleanup;
1599
1600journal_error:
1601	ext3_std_error(dir->i_sb, err);
1602cleanup:
1603	if (bh)
1604		brelse(bh);
1605	dx_release(frames);
1606	return err;
1607}
1608
1609/*
1610 * ext3_delete_entry deletes a directory entry by merging it with the
1611 * previous entry
1612 */
1613static int ext3_delete_entry (handle_t *handle,
1614			      struct inode * dir,
1615			      struct ext3_dir_entry_2 * de_del,
1616			      struct buffer_head * bh)
1617{
1618	struct ext3_dir_entry_2 * de, * pde;
1619	int i;
1620
1621	i = 0;
1622	pde = NULL;
1623	de = (struct ext3_dir_entry_2 *) bh->b_data;
1624	while (i < bh->b_size) {
1625		if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1626			return -EIO;
1627		if (de == de_del)  {
1628			BUFFER_TRACE(bh, "get_write_access");
1629			ext3_journal_get_write_access(handle, bh);
1630			if (pde)
1631				pde->rec_len = ext3_rec_len_to_disk(
1632					ext3_rec_len_from_disk(pde->rec_len) +
1633					ext3_rec_len_from_disk(de->rec_len));
1634			else
1635				de->inode = 0;
1636			dir->i_version++;
1637			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1638			ext3_journal_dirty_metadata(handle, bh);
1639			return 0;
1640		}
1641		i += ext3_rec_len_from_disk(de->rec_len);
1642		pde = de;
1643		de = ext3_next_entry(de);
1644	}
1645	return -ENOENT;
1646}
1647
1648static int ext3_add_nondir(handle_t *handle,
1649		struct dentry *dentry, struct inode *inode)
1650{
1651	int err = ext3_add_entry(handle, dentry, inode);
1652	if (!err) {
1653		ext3_mark_inode_dirty(handle, inode);
1654		d_instantiate(dentry, inode);
1655		return 0;
1656	}
1657	drop_nlink(inode);
1658	iput(inode);
1659	return err;
1660}
1661
1662/*
1663 * By the time this is called, we already have created
1664 * the directory cache entry for the new file, but it
1665 * is so far negative - it has no inode.
1666 *
1667 * If the create succeeds, we fill in the inode information
1668 * with d_instantiate().
1669 */
1670static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
1671		struct nameidata *nd)
1672{
1673	handle_t *handle;
1674	struct inode * inode;
1675	int err, retries = 0;
1676
1677retry:
1678	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1679					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1680					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
1681	if (IS_ERR(handle))
1682		return PTR_ERR(handle);
1683
1684	if (IS_DIRSYNC(dir))
1685		handle->h_sync = 1;
1686
1687	inode = ext3_new_inode (handle, dir, mode);
1688	err = PTR_ERR(inode);
1689	if (!IS_ERR(inode)) {
1690		inode->i_op = &ext3_file_inode_operations;
1691		inode->i_fop = &ext3_file_operations;
1692		ext3_set_aops(inode);
1693		err = ext3_add_nondir(handle, dentry, inode);
1694	}
1695	ext3_journal_stop(handle);
1696	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1697		goto retry;
1698	return err;
1699}
1700
1701static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1702			int mode, dev_t rdev)
1703{
1704	handle_t *handle;
1705	struct inode *inode;
1706	int err, retries = 0;
1707
1708	if (!new_valid_dev(rdev))
1709		return -EINVAL;
1710
1711retry:
1712	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1713					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1714					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
1715	if (IS_ERR(handle))
1716		return PTR_ERR(handle);
1717
1718	if (IS_DIRSYNC(dir))
1719		handle->h_sync = 1;
1720
1721	inode = ext3_new_inode (handle, dir, mode);
1722	err = PTR_ERR(inode);
1723	if (!IS_ERR(inode)) {
1724		init_special_inode(inode, inode->i_mode, rdev);
1725#ifdef CONFIG_EXT3_FS_XATTR
1726		inode->i_op = &ext3_special_inode_operations;
1727#endif
1728		err = ext3_add_nondir(handle, dentry, inode);
1729	}
1730	ext3_journal_stop(handle);
1731	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1732		goto retry;
1733	return err;
1734}
1735
1736static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1737{
1738	handle_t *handle;
1739	struct inode * inode;
1740	struct buffer_head * dir_block;
1741	struct ext3_dir_entry_2 * de;
1742	int err, retries = 0;
1743
1744	if (dir->i_nlink >= EXT3_LINK_MAX)
1745		return -EMLINK;
1746
1747retry:
1748	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1749					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1750					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
1751	if (IS_ERR(handle))
1752		return PTR_ERR(handle);
1753
1754	if (IS_DIRSYNC(dir))
1755		handle->h_sync = 1;
1756
1757	inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
1758	err = PTR_ERR(inode);
1759	if (IS_ERR(inode))
1760		goto out_stop;
1761
1762	inode->i_op = &ext3_dir_inode_operations;
1763	inode->i_fop = &ext3_dir_operations;
1764	inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1765	dir_block = ext3_bread (handle, inode, 0, 1, &err);
1766	if (!dir_block) {
1767		drop_nlink(inode); /* is this nlink == 0? */
1768		ext3_mark_inode_dirty(handle, inode);
1769		iput (inode);
1770		goto out_stop;
1771	}
1772	BUFFER_TRACE(dir_block, "get_write_access");
1773	ext3_journal_get_write_access(handle, dir_block);
1774	de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1775	de->inode = cpu_to_le32(inode->i_ino);
1776	de->name_len = 1;
1777	de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len));
1778	strcpy (de->name, ".");
1779	ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1780	de = ext3_next_entry(de);
1781	de->inode = cpu_to_le32(dir->i_ino);
1782	de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize -
1783					EXT3_DIR_REC_LEN(1));
1784	de->name_len = 2;
1785	strcpy (de->name, "..");
1786	ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1787	inode->i_nlink = 2;
1788	BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1789	ext3_journal_dirty_metadata(handle, dir_block);
1790	brelse (dir_block);
1791	ext3_mark_inode_dirty(handle, inode);
1792	err = ext3_add_entry (handle, dentry, inode);
1793	if (err) {
1794		inode->i_nlink = 0;
1795		ext3_mark_inode_dirty(handle, inode);
1796		iput (inode);
1797		goto out_stop;
1798	}
1799	inc_nlink(dir);
1800	ext3_update_dx_flag(dir);
1801	ext3_mark_inode_dirty(handle, dir);
1802	d_instantiate(dentry, inode);
1803out_stop:
1804	ext3_journal_stop(handle);
1805	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1806		goto retry;
1807	return err;
1808}
1809
1810/*
1811 * routine to check that the specified directory is empty (for rmdir)
1812 */
1813static int empty_dir (struct inode * inode)
1814{
1815	unsigned long offset;
1816	struct buffer_head * bh;
1817	struct ext3_dir_entry_2 * de, * de1;
1818	struct super_block * sb;
1819	int err = 0;
1820
1821	sb = inode->i_sb;
1822	if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
1823	    !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
1824		if (err)
1825			ext3_error(inode->i_sb, __func__,
1826				   "error %d reading directory #%lu offset 0",
1827				   err, inode->i_ino);
1828		else
1829			ext3_warning(inode->i_sb, __func__,
1830				     "bad directory (dir #%lu) - no data block",
1831				     inode->i_ino);
1832		return 1;
1833	}
1834	de = (struct ext3_dir_entry_2 *) bh->b_data;
1835	de1 = ext3_next_entry(de);
1836	if (le32_to_cpu(de->inode) != inode->i_ino ||
1837			!le32_to_cpu(de1->inode) ||
1838			strcmp (".", de->name) ||
1839			strcmp ("..", de1->name)) {
1840		ext3_warning (inode->i_sb, "empty_dir",
1841			      "bad directory (dir #%lu) - no `.' or `..'",
1842			      inode->i_ino);
1843		brelse (bh);
1844		return 1;
1845	}
1846	offset = ext3_rec_len_from_disk(de->rec_len) +
1847			ext3_rec_len_from_disk(de1->rec_len);
1848	de = ext3_next_entry(de1);
1849	while (offset < inode->i_size ) {
1850		if (!bh ||
1851			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1852			err = 0;
1853			brelse (bh);
1854			bh = ext3_bread (NULL, inode,
1855				offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
1856			if (!bh) {
1857				if (err)
1858					ext3_error(sb, __func__,
1859						   "error %d reading directory"
1860						   " #%lu offset %lu",
1861						   err, inode->i_ino, offset);
1862				offset += sb->s_blocksize;
1863				continue;
1864			}
1865			de = (struct ext3_dir_entry_2 *) bh->b_data;
1866		}
1867		if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) {
1868			de = (struct ext3_dir_entry_2 *)(bh->b_data +
1869							 sb->s_blocksize);
1870			offset = (offset | (sb->s_blocksize - 1)) + 1;
1871			continue;
1872		}
1873		if (le32_to_cpu(de->inode)) {
1874			brelse (bh);
1875			return 0;
1876		}
1877		offset += ext3_rec_len_from_disk(de->rec_len);
1878		de = ext3_next_entry(de);
1879	}
1880	brelse (bh);
1881	return 1;
1882}
1883
1884/* ext3_orphan_add() links an unlinked or truncated inode into a list of
1885 * such inodes, starting at the superblock, in case we crash before the
1886 * file is closed/deleted, or in case the inode truncate spans multiple
1887 * transactions and the last transaction is not recovered after a crash.
1888 *
1889 * At filesystem recovery time, we walk this list deleting unlinked
1890 * inodes and truncating linked inodes in ext3_orphan_cleanup().
1891 */
1892int ext3_orphan_add(handle_t *handle, struct inode *inode)
1893{
1894	struct super_block *sb = inode->i_sb;
1895	struct ext3_iloc iloc;
1896	int err = 0, rc;
1897
1898	lock_super(sb);
1899	if (!list_empty(&EXT3_I(inode)->i_orphan))
1900		goto out_unlock;
1901
1902	/* Orphan handling is only valid for files with data blocks
1903	 * being truncated, or files being unlinked. */
1904
1905	/* @@@ FIXME: Observation from aviro:
1906	 * I think I can trigger J_ASSERT in ext3_orphan_add().  We block
1907	 * here (on lock_super()), so race with ext3_link() which might bump
1908	 * ->i_nlink. For, say it, character device. Not a regular file,
1909	 * not a directory, not a symlink and ->i_nlink > 0.
1910	 */
1911	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1912		S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1913
1914	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
1915	err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
1916	if (err)
1917		goto out_unlock;
1918
1919	err = ext3_reserve_inode_write(handle, inode, &iloc);
1920	if (err)
1921		goto out_unlock;
1922
1923	/* Insert this inode at the head of the on-disk orphan list... */
1924	NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
1925	EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1926	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1927	rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
1928	if (!err)
1929		err = rc;
1930
1931	/* Only add to the head of the in-memory list if all the
1932	 * previous operations succeeded.  If the orphan_add is going to
1933	 * fail (possibly taking the journal offline), we can't risk
1934	 * leaving the inode on the orphan list: stray orphan-list
1935	 * entries can cause panics at unmount time.
1936	 *
1937	 * This is safe: on error we're going to ignore the orphan list
1938	 * anyway on the next recovery. */
1939	if (!err)
1940		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1941
1942	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
1943	jbd_debug(4, "orphan inode %lu will point to %d\n",
1944			inode->i_ino, NEXT_ORPHAN(inode));
1945out_unlock:
1946	unlock_super(sb);
1947	ext3_std_error(inode->i_sb, err);
1948	return err;
1949}
1950
1951/*
1952 * ext3_orphan_del() removes an unlinked or truncated inode from the list
1953 * of such inodes stored on disk, because it is finally being cleaned up.
1954 */
1955int ext3_orphan_del(handle_t *handle, struct inode *inode)
1956{
1957	struct list_head *prev;
1958	struct ext3_inode_info *ei = EXT3_I(inode);
1959	struct ext3_sb_info *sbi;
1960	unsigned long ino_next;
1961	struct ext3_iloc iloc;
1962	int err = 0;
1963
1964	lock_super(inode->i_sb);
1965	if (list_empty(&ei->i_orphan)) {
1966		unlock_super(inode->i_sb);
1967		return 0;
1968	}
1969
1970	ino_next = NEXT_ORPHAN(inode);
1971	prev = ei->i_orphan.prev;
1972	sbi = EXT3_SB(inode->i_sb);
1973
1974	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
1975
1976	list_del_init(&ei->i_orphan);
1977
1978	/* If we're on an error path, we may not have a valid
1979	 * transaction handle with which to update the orphan list on
1980	 * disk, but we still need to remove the inode from the linked
1981	 * list in memory. */
1982	if (!handle)
1983		goto out;
1984
1985	err = ext3_reserve_inode_write(handle, inode, &iloc);
1986	if (err)
1987		goto out_err;
1988
1989	if (prev == &sbi->s_orphan) {
1990		jbd_debug(4, "superblock will point to %lu\n", ino_next);
1991		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
1992		err = ext3_journal_get_write_access(handle, sbi->s_sbh);
1993		if (err)
1994			goto out_brelse;
1995		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
1996		err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
1997	} else {
1998		struct ext3_iloc iloc2;
1999		struct inode *i_prev =
2000			&list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode;
2001
2002		jbd_debug(4, "orphan inode %lu will point to %lu\n",
2003			  i_prev->i_ino, ino_next);
2004		err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
2005		if (err)
2006			goto out_brelse;
2007		NEXT_ORPHAN(i_prev) = ino_next;
2008		err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
2009	}
2010	if (err)
2011		goto out_brelse;
2012	NEXT_ORPHAN(inode) = 0;
2013	err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2014
2015out_err:
2016	ext3_std_error(inode->i_sb, err);
2017out:
2018	unlock_super(inode->i_sb);
2019	return err;
2020
2021out_brelse:
2022	brelse(iloc.bh);
2023	goto out_err;
2024}
2025
2026static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
2027{
2028	int retval;
2029	struct inode * inode;
2030	struct buffer_head * bh;
2031	struct ext3_dir_entry_2 * de;
2032	handle_t *handle;
2033
2034	/* Initialize quotas before so that eventual writes go in
2035	 * separate transaction */
2036	DQUOT_INIT(dentry->d_inode);
2037	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2038	if (IS_ERR(handle))
2039		return PTR_ERR(handle);
2040
2041	retval = -ENOENT;
2042	bh = ext3_find_entry(dir, &dentry->d_name, &de);
2043	if (!bh)
2044		goto end_rmdir;
2045
2046	if (IS_DIRSYNC(dir))
2047		handle->h_sync = 1;
2048
2049	inode = dentry->d_inode;
2050
2051	retval = -EIO;
2052	if (le32_to_cpu(de->inode) != inode->i_ino)
2053		goto end_rmdir;
2054
2055	retval = -ENOTEMPTY;
2056	if (!empty_dir (inode))
2057		goto end_rmdir;
2058
2059	retval = ext3_delete_entry(handle, dir, de, bh);
2060	if (retval)
2061		goto end_rmdir;
2062	if (inode->i_nlink != 2)
2063		ext3_warning (inode->i_sb, "ext3_rmdir",
2064			      "empty directory has nlink!=2 (%d)",
2065			      inode->i_nlink);
2066	inode->i_version++;
2067	clear_nlink(inode);
2068	/* There's no need to set i_disksize: the fact that i_nlink is
2069	 * zero will ensure that the right thing happens during any
2070	 * recovery. */
2071	inode->i_size = 0;
2072	ext3_orphan_add(handle, inode);
2073	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2074	ext3_mark_inode_dirty(handle, inode);
2075	drop_nlink(dir);
2076	ext3_update_dx_flag(dir);
2077	ext3_mark_inode_dirty(handle, dir);
2078
2079end_rmdir:
2080	ext3_journal_stop(handle);
2081	brelse (bh);
2082	return retval;
2083}
2084
2085static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2086{
2087	int retval;
2088	struct inode * inode;
2089	struct buffer_head * bh;
2090	struct ext3_dir_entry_2 * de;
2091	handle_t *handle;
2092
2093	/* Initialize quotas before so that eventual writes go
2094	 * in separate transaction */
2095	DQUOT_INIT(dentry->d_inode);
2096	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2097	if (IS_ERR(handle))
2098		return PTR_ERR(handle);
2099
2100	if (IS_DIRSYNC(dir))
2101		handle->h_sync = 1;
2102
2103	retval = -ENOENT;
2104	bh = ext3_find_entry(dir, &dentry->d_name, &de);
2105	if (!bh)
2106		goto end_unlink;
2107
2108	inode = dentry->d_inode;
2109
2110	retval = -EIO;
2111	if (le32_to_cpu(de->inode) != inode->i_ino)
2112		goto end_unlink;
2113
2114	if (!inode->i_nlink) {
2115		ext3_warning (inode->i_sb, "ext3_unlink",
2116			      "Deleting nonexistent file (%lu), %d",
2117			      inode->i_ino, inode->i_nlink);
2118		inode->i_nlink = 1;
2119	}
2120	retval = ext3_delete_entry(handle, dir, de, bh);
2121	if (retval)
2122		goto end_unlink;
2123	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2124	ext3_update_dx_flag(dir);
2125	ext3_mark_inode_dirty(handle, dir);
2126	drop_nlink(inode);
2127	if (!inode->i_nlink)
2128		ext3_orphan_add(handle, inode);
2129	inode->i_ctime = dir->i_ctime;
2130	ext3_mark_inode_dirty(handle, inode);
2131	retval = 0;
2132
2133end_unlink:
2134	ext3_journal_stop(handle);
2135	brelse (bh);
2136	return retval;
2137}
2138
2139static int ext3_symlink (struct inode * dir,
2140		struct dentry *dentry, const char * symname)
2141{
2142	handle_t *handle;
2143	struct inode * inode;
2144	int l, err, retries = 0;
2145
2146	l = strlen(symname)+1;
2147	if (l > dir->i_sb->s_blocksize)
2148		return -ENAMETOOLONG;
2149
2150retry:
2151	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2152					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2153					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
2154	if (IS_ERR(handle))
2155		return PTR_ERR(handle);
2156
2157	if (IS_DIRSYNC(dir))
2158		handle->h_sync = 1;
2159
2160	inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
2161	err = PTR_ERR(inode);
2162	if (IS_ERR(inode))
2163		goto out_stop;
2164
2165	if (l > sizeof (EXT3_I(inode)->i_data)) {
2166		inode->i_op = &ext3_symlink_inode_operations;
2167		ext3_set_aops(inode);
2168		/*
2169		 * page_symlink() calls into ext3_prepare/commit_write.
2170		 * We have a transaction open.  All is sweetness.  It also sets
2171		 * i_size in generic_commit_write().
2172		 */
2173		err = __page_symlink(inode, symname, l,
2174				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2175		if (err) {
2176			drop_nlink(inode);
2177			ext3_mark_inode_dirty(handle, inode);
2178			iput (inode);
2179			goto out_stop;
2180		}
2181	} else {
2182		inode->i_op = &ext3_fast_symlink_inode_operations;
2183		memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
2184		inode->i_size = l-1;
2185	}
2186	EXT3_I(inode)->i_disksize = inode->i_size;
2187	err = ext3_add_nondir(handle, dentry, inode);
2188out_stop:
2189	ext3_journal_stop(handle);
2190	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2191		goto retry;
2192	return err;
2193}
2194
2195static int ext3_link (struct dentry * old_dentry,
2196		struct inode * dir, struct dentry *dentry)
2197{
2198	handle_t *handle;
2199	struct inode *inode = old_dentry->d_inode;
2200	int err, retries = 0;
2201
2202	if (inode->i_nlink >= EXT3_LINK_MAX)
2203		return -EMLINK;
2204	/*
2205	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
2206	 * otherwise has the potential to corrupt the orphan inode list.
2207	 */
2208	if (inode->i_nlink == 0)
2209		return -ENOENT;
2210
2211retry:
2212	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2213					EXT3_INDEX_EXTRA_TRANS_BLOCKS);
2214	if (IS_ERR(handle))
2215		return PTR_ERR(handle);
2216
2217	if (IS_DIRSYNC(dir))
2218		handle->h_sync = 1;
2219
2220	inode->i_ctime = CURRENT_TIME_SEC;
2221	inc_nlink(inode);
2222	atomic_inc(&inode->i_count);
2223
2224	err = ext3_add_nondir(handle, dentry, inode);
2225	ext3_journal_stop(handle);
2226	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2227		goto retry;
2228	return err;
2229}
2230
2231#define PARENT_INO(buffer) \
2232	(ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode)
2233
2234/*
2235 * Anybody can rename anything with this: the permission checks are left to the
2236 * higher-level routines.
2237 */
2238static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2239			   struct inode * new_dir,struct dentry *new_dentry)
2240{
2241	handle_t *handle;
2242	struct inode * old_inode, * new_inode;
2243	struct buffer_head * old_bh, * new_bh, * dir_bh;
2244	struct ext3_dir_entry_2 * old_de, * new_de;
2245	int retval;
2246
2247	old_bh = new_bh = dir_bh = NULL;
2248
2249	/* Initialize quotas before so that eventual writes go
2250	 * in separate transaction */
2251	if (new_dentry->d_inode)
2252		DQUOT_INIT(new_dentry->d_inode);
2253	handle = ext3_journal_start(old_dir, 2 *
2254					EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2255					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
2256	if (IS_ERR(handle))
2257		return PTR_ERR(handle);
2258
2259	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2260		handle->h_sync = 1;
2261
2262	old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de);
2263	/*
2264	 *  Check for inode number is _not_ due to possible IO errors.
2265	 *  We might rmdir the source, keep it as pwd of some process
2266	 *  and merrily kill the link to whatever was created under the
2267	 *  same name. Goodbye sticky bit ;-<
2268	 */
2269	old_inode = old_dentry->d_inode;
2270	retval = -ENOENT;
2271	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2272		goto end_rename;
2273
2274	new_inode = new_dentry->d_inode;
2275	new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
2276	if (new_bh) {
2277		if (!new_inode) {
2278			brelse (new_bh);
2279			new_bh = NULL;
2280		}
2281	}
2282	if (S_ISDIR(old_inode->i_mode)) {
2283		if (new_inode) {
2284			retval = -ENOTEMPTY;
2285			if (!empty_dir (new_inode))
2286				goto end_rename;
2287		}
2288		retval = -EIO;
2289		dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
2290		if (!dir_bh)
2291			goto end_rename;
2292		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2293			goto end_rename;
2294		retval = -EMLINK;
2295		if (!new_inode && new_dir!=old_dir &&
2296				new_dir->i_nlink >= EXT3_LINK_MAX)
2297			goto end_rename;
2298	}
2299	if (!new_bh) {
2300		retval = ext3_add_entry (handle, new_dentry, old_inode);
2301		if (retval)
2302			goto end_rename;
2303	} else {
2304		BUFFER_TRACE(new_bh, "get write access");
2305		ext3_journal_get_write_access(handle, new_bh);
2306		new_de->inode = cpu_to_le32(old_inode->i_ino);
2307		if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2308					      EXT3_FEATURE_INCOMPAT_FILETYPE))
2309			new_de->file_type = old_de->file_type;
2310		new_dir->i_version++;
2311		new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
2312		ext3_mark_inode_dirty(handle, new_dir);
2313		BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2314		ext3_journal_dirty_metadata(handle, new_bh);
2315		brelse(new_bh);
2316		new_bh = NULL;
2317	}
2318
2319	/*
2320	 * Like most other Unix systems, set the ctime for inodes on a
2321	 * rename.
2322	 */
2323	old_inode->i_ctime = CURRENT_TIME_SEC;
2324	ext3_mark_inode_dirty(handle, old_inode);
2325
2326	/*
2327	 * ok, that's it
2328	 */
2329	if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2330	    old_de->name_len != old_dentry->d_name.len ||
2331	    strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2332	    (retval = ext3_delete_entry(handle, old_dir,
2333					old_de, old_bh)) == -ENOENT) {
2334		/* old_de could have moved from under us during htree split, so
2335		 * make sure that we are deleting the right entry.  We might
2336		 * also be pointing to a stale entry in the unused part of
2337		 * old_bh so just checking inum and the name isn't enough. */
2338		struct buffer_head *old_bh2;
2339		struct ext3_dir_entry_2 *old_de2;
2340
2341		old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name,
2342					  &old_de2);
2343		if (old_bh2) {
2344			retval = ext3_delete_entry(handle, old_dir,
2345						   old_de2, old_bh2);
2346			brelse(old_bh2);
2347		}
2348	}
2349	if (retval) {
2350		ext3_warning(old_dir->i_sb, "ext3_rename",
2351				"Deleting old file (%lu), %d, error=%d",
2352				old_dir->i_ino, old_dir->i_nlink, retval);
2353	}
2354
2355	if (new_inode) {
2356		drop_nlink(new_inode);
2357		new_inode->i_ctime = CURRENT_TIME_SEC;
2358	}
2359	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
2360	ext3_update_dx_flag(old_dir);
2361	if (dir_bh) {
2362		BUFFER_TRACE(dir_bh, "get_write_access");
2363		ext3_journal_get_write_access(handle, dir_bh);
2364		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2365		BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2366		ext3_journal_dirty_metadata(handle, dir_bh);
2367		drop_nlink(old_dir);
2368		if (new_inode) {
2369			drop_nlink(new_inode);
2370		} else {
2371			inc_nlink(new_dir);
2372			ext3_update_dx_flag(new_dir);
2373			ext3_mark_inode_dirty(handle, new_dir);
2374		}
2375	}
2376	ext3_mark_inode_dirty(handle, old_dir);
2377	if (new_inode) {
2378		ext3_mark_inode_dirty(handle, new_inode);
2379		if (!new_inode->i_nlink)
2380			ext3_orphan_add(handle, new_inode);
2381	}
2382	retval = 0;
2383
2384end_rename:
2385	brelse (dir_bh);
2386	brelse (old_bh);
2387	brelse (new_bh);
2388	ext3_journal_stop(handle);
2389	return retval;
2390}
2391
2392/*
2393 * directories can handle most operations...
2394 */
2395const struct inode_operations ext3_dir_inode_operations = {
2396	.create		= ext3_create,
2397	.lookup		= ext3_lookup,
2398	.link		= ext3_link,
2399	.unlink		= ext3_unlink,
2400	.symlink	= ext3_symlink,
2401	.mkdir		= ext3_mkdir,
2402	.rmdir		= ext3_rmdir,
2403	.mknod		= ext3_mknod,
2404	.rename		= ext3_rename,
2405	.setattr	= ext3_setattr,
2406#ifdef CONFIG_EXT3_FS_XATTR
2407	.setxattr	= generic_setxattr,
2408	.getxattr	= generic_getxattr,
2409	.listxattr	= ext3_listxattr,
2410	.removexattr	= generic_removexattr,
2411#endif
2412	.permission	= ext3_permission,
2413};
2414
2415const struct inode_operations ext3_special_inode_operations = {
2416	.setattr	= ext3_setattr,
2417#ifdef CONFIG_EXT3_FS_XATTR
2418	.setxattr	= generic_setxattr,
2419	.getxattr	= generic_getxattr,
2420	.listxattr	= ext3_listxattr,
2421	.removexattr	= generic_removexattr,
2422#endif
2423	.permission	= ext3_permission,
2424};
2425