file.c revision add1f0995454374d90c9d6b2c420d2fba3d0a4e3
/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/time.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

int sysctl_nr_open __read_mostly = 1024*1024;
int sysctl_nr_open_min = BITS_PER_LONG;
int sysctl_nr_open_max = 1024 * 1024; /* raised later */

static void *alloc_fdmem(size_t size)
{
	/*
	 * Very large allocations can stress page reclaim, so fall back to
	 * vmalloc() if the allocation size will be considered "large" by the VM.
	 */
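	/*
	 * Illustration (assuming a common 64-bit configuration with 4KB pages
	 * and PAGE_ALLOC_COSTLY_ORDER == 3): the kmalloc() path below covers
	 * requests up to 32KB, i.e. fdarrays of up to 4096 file pointers.
	 */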
	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
		void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
		if (data != NULL)
			return data;
	}
	return vmalloc(size);
}

static void free_fdmem(void *ptr)
{
	is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
}

static void __free_fdtable(struct fdtable *fdt)
{
	free_fdmem(fdt->fd);
	free_fdmem(fdt->open_fds);
	kfree(fdt);
}

static void free_fdtable_rcu(struct rcu_head *rcu)
{
	__free_fdtable(container_of(rcu, struct fdtable, rcu));
}

/*
 * Expand the fdset in the files_struct.  Called with the files spinlock
 * held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
	unsigned int cpy, set;

	BUG_ON(nfdt->max_fds < ofdt->max_fds);

	cpy = ofdt->max_fds * sizeof(struct file *);
	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
	memcpy(nfdt->fd, ofdt->fd, cpy);
	memset((char *)(nfdt->fd) + cpy, 0, set);

	cpy = ofdt->max_fds / BITS_PER_BYTE;
	set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
	memset((char *)(nfdt->open_fds) + cpy, 0, set);
	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
	memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
}

static struct fdtable * alloc_fdtable(unsigned int nr)
{
	struct fdtable *fdt;
	void *data;

	/*
	 * Figure out how many fds we actually want to support in this fdtable.
	 * Allocation steps are keyed to the size of the fdarray, since it
	 * grows far faster than any of the other dynamic data. We try to fit
	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
	 * and growing in powers of two from there on.
	 */
	nr /= (1024 / sizeof(struct file *));
	nr = roundup_pow_of_two(nr + 1);
	nr *= (1024 / sizeof(struct file *));
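	/*
	 * Worked example (assuming a 64-bit build, so sizeof(struct file *) == 8
	 * and 1024B holds 128 pointers): nr == 200 becomes 200/128 == 1, is
	 * rounded up to the next power of two (2), and scales back to 256 fds,
	 * i.e. a 2KB fdarray.
	 */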
	/*
	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
	 * had been set lower between the check in expand_files() and here.  Deal
	 * with that in the caller; it's cheaper that way.
	 *
	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
	 * the bitmap handling below becomes unpleasant, to put it mildly...
	 */
	if (unlikely(nr > sysctl_nr_open))
		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
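	/*
	 * The expression above rounds sysctl_nr_open up to the next multiple of
	 * BITS_PER_LONG; e.g. with 64-bit longs, 1000 becomes 1024, while values
	 * that are already multiples are left unchanged.
	 */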

	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
	if (!fdt)
		goto out;
	fdt->max_fds = nr;
	data = alloc_fdmem(nr * sizeof(struct file *));
	if (!data)
		goto out_fdt;
	fdt->fd = data;

	data = alloc_fdmem(max_t(size_t,
				 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
	if (!data)
		goto out_arr;
	fdt->open_fds = data;
	data += nr / BITS_PER_BYTE;
	fdt->close_on_exec = data;

	return fdt;

out_arr:
	free_fdmem(fdt->fd);
out_fdt:
	kfree(fdt);
out:
	return NULL;
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both an fd array and an fdset
 * of the given size.
 * Return <0 error code on error; 1 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_fdtable(struct files_struct *files, int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *new_fdt, *cur_fdt;

	spin_unlock(&files->file_lock);
	new_fdt = alloc_fdtable(nr);
	spin_lock(&files->file_lock);
	if (!new_fdt)
		return -ENOMEM;
	/*
	 * extremely unlikely race - sysctl_nr_open decreased between the check in
	 * caller and alloc_fdtable().  Cheaper to catch it here...
	 */
	if (unlikely(new_fdt->max_fds <= nr)) {
		__free_fdtable(new_fdt);
		return -EMFILE;
	}
	/*
	 * Check again since another task may have expanded the fd table while
	 * we dropped the lock
	 */
	cur_fdt = files_fdtable(files);
	if (nr >= cur_fdt->max_fds) {
		/* Continue as planned */
		copy_fdtable(new_fdt, cur_fdt);
		rcu_assign_pointer(files->fdt, new_fdt);
		if (cur_fdt != &files->fdtab)
			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
	} else {
		/* Somebody else expanded, so undo our attempt */
		__free_fdtable(new_fdt);
	}
	return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error; 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, int nr)
{
	struct fdtable *fdt;

	fdt = files_fdtable(files);

	/* Do we need to expand? */
	if (nr < fdt->max_fds)
		return 0;

	/* Can we expand? */
	if (nr >= sysctl_nr_open)
		return -EMFILE;

	/* All good, so we try */
	return expand_fdtable(files, nr);
}

static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->close_on_exec);
}

static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
{
	__clear_bit(fd, fdt->close_on_exec);
}

static inline void __set_open_fd(int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->open_fds);
}

static inline void __clear_open_fd(int fd, struct fdtable *fdt)
{
	__clear_bit(fd, fdt->open_fds);
}

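/*
 * Scan the open-fd bitmap from the top and return the size, in fds, of the
 * region that still contains set bits, rounded up to a whole word.  For
 * instance (a sketch, assuming 64-bit longs), if the highest open fd is 70,
 * word 1 of open_fds is the last non-zero word and the result is 128.
 */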
static int count_open_files(struct fdtable *fdt)
{
	int size = fdt->max_fds;
	int i;

	/* Find the last open fd */
	for (i = size / BITS_PER_LONG; i > 0; ) {
		if (fdt->open_fds[--i])
			break;
	}
	i = (i + 1) * BITS_PER_LONG;
	return i;
}

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 * errorp will be valid only when the returned files_struct is NULL.
 */
struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
	struct files_struct *newf;
	struct file **old_fds, **new_fds;
	int open_files, size, i;
	struct fdtable *old_fdt, *new_fdt;

	*errorp = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
	if (!newf)
		goto out;

	atomic_set(&newf->count, 1);

	spin_lock_init(&newf->file_lock);
	newf->next_fd = 0;
	new_fdt = &newf->fdtab;
	new_fdt->max_fds = NR_OPEN_DEFAULT;
	new_fdt->close_on_exec = newf->close_on_exec_init;
	new_fdt->open_fds = newf->open_fds_init;
	new_fdt->fd = &newf->fd_array[0];

	spin_lock(&oldf->file_lock);
	old_fdt = files_fdtable(oldf);
	open_files = count_open_files(old_fdt);

	/*
	 * Check whether we need to allocate a larger fd array and fd set.
	 */
	while (unlikely(open_files > new_fdt->max_fds)) {
		spin_unlock(&oldf->file_lock);

		if (new_fdt != &newf->fdtab)
			__free_fdtable(new_fdt);

		new_fdt = alloc_fdtable(open_files - 1);
		if (!new_fdt) {
			*errorp = -ENOMEM;
			goto out_release;
		}

		/* beyond sysctl_nr_open; nothing to do */
		if (unlikely(new_fdt->max_fds < open_files)) {
			__free_fdtable(new_fdt);
			*errorp = -EMFILE;
			goto out_release;
		}

		/*
		 * Reacquire the oldf lock and a pointer to its fd table;
		 * it may have grown a new, bigger fd table while the lock
		 * was dropped, so we need the latest pointer.
		 */
		spin_lock(&oldf->file_lock);
		old_fdt = files_fdtable(oldf);
		open_files = count_open_files(old_fdt);
	}

	old_fds = old_fdt->fd;
	new_fds = new_fdt->fd;

	memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
	memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
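	/*
	 * Note: count_open_files() always returns a multiple of BITS_PER_LONG,
	 * so open_files / 8 copies a whole number of bitmap words and the
	 * word-indexed memset()s further down start on a clean boundary.
	 */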

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f) {
			get_file(f);
		} else {
			/*
			 * The fd may be claimed in the fd bitmap but not yet
			 * instantiated in the files array if a sibling thread
			 * is partway through open().  So make sure that this
			 * fd is available to the new process.
			 */
			__clear_open_fd(open_files - i, new_fdt);
		}
		rcu_assign_pointer(*new_fds++, f);
	}
	spin_unlock(&oldf->file_lock);

	/* compute the remainder to be cleared */
	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);

	/* This is long-word aligned, thus could use an optimized version */
	memset(new_fds, 0, size);

	if (new_fdt->max_fds > open_files) {
		int left = (new_fdt->max_fds - open_files) / 8;
		int start = open_files / BITS_PER_LONG;

		memset(&new_fdt->open_fds[start], 0, left);
		memset(&new_fdt->close_on_exec[start], 0, left);
	}

	rcu_assign_pointer(newf->fdt, new_fdt);

	return newf;

out_release:
	kmem_cache_free(files_cachep, newf);
out:
	return NULL;
}

static struct fdtable *close_files(struct files_struct * files)
{
	/*
	 * It is safe to dereference the fd table without RCU or
	 * ->file_lock because this is the last reference to the
	 * files structure.
	 */
	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
	int i, j = 0;

	for (;;) {
		unsigned long set;
		i = j * BITS_PER_LONG;
		if (i >= fdt->max_fds)
			break;
		set = fdt->open_fds[j++];
		while (set) {
			if (set & 1) {
				struct file * file = xchg(&fdt->fd[i], NULL);
				if (file) {
					filp_close(file, files);
					cond_resched();
				}
			}
			i++;
			set >>= 1;
		}
	}

	return fdt;
}

struct files_struct *get_files_struct(struct task_struct *task)
{
	struct files_struct *files;

	task_lock(task);
	files = task->files;
	if (files)
		atomic_inc(&files->count);
	task_unlock(task);

	return files;
}

void put_files_struct(struct files_struct *files)
{
	if (atomic_dec_and_test(&files->count)) {
		struct fdtable *fdt = close_files(files);

		/* free the arrays if they are not embedded */
		if (fdt != &files->fdtab)
			__free_fdtable(fdt);
		kmem_cache_free(files_cachep, files);
	}
}

void reset_files_struct(struct files_struct *files)
{
	struct task_struct *tsk = current;
	struct files_struct *old;

	old = tsk->files;
	task_lock(tsk);
	tsk->files = files;
	task_unlock(tsk);
	put_files_struct(old);
}

void exit_files(struct task_struct *tsk)
{
	struct files_struct * files = tsk->files;

	if (files) {
		task_lock(tsk);
		tsk->files = NULL;
		task_unlock(tsk);
		put_files_struct(files);
	}
}

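/*
 * Illustrative note: the cap computed below is the largest multiple of
 * BITS_PER_LONG that both fits in an int and keeps the fdarray size
 * representable in size_t.  On a 64-bit build (an assumption; the value is
 * config-dependent) it works out to INT_MAX & -64 == 2147483584.
 */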
void __init files_defer_init(void)
{
	sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
			     -BITS_PER_LONG;
}

struct files_struct init_files = {
	.count		= ATOMIC_INIT(1),
	.fdt		= &init_files.fdtab,
	.fdtab		= {
		.max_fds	= NR_OPEN_DEFAULT,
		.fd		= &init_files.fd_array[0],
		.close_on_exec	= init_files.close_on_exec_init,
		.open_fds	= init_files.open_fds_init,
	},
	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
};

/*
 * allocate a file descriptor, mark it busy.
 */
int __alloc_fd(struct files_struct *files,
	       unsigned start, unsigned end, unsigned flags)
{
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files);
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;

	if (fd < fdt->max_fds)
		fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);

	/*
	 * N.B. For clone tasks sharing a files structure, this test
	 * will limit the total number of files that can be opened.
	 */
	error = -EMFILE;
	if (fd >= end)
		goto out;

	error = expand_files(files, fd);
	if (error < 0)
		goto out;

	/*
	 * If we needed to expand the fd array we
	 * might have blocked - try again.
	 */
	if (error)
		goto repeat;

	if (start <= files->next_fd)
		files->next_fd = fd + 1;

	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	error = fd;
#if 1
	/* Sanity check */
	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
		rcu_assign_pointer(fdt->fd[fd], NULL);
	}
#endif

out:
	spin_unlock(&files->file_lock);
	return error;
}

static int alloc_fd(unsigned start, unsigned flags)
{
	return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
}

int get_unused_fd_flags(unsigned flags)
{
	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
}
EXPORT_SYMBOL(get_unused_fd_flags);

static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
	struct fdtable *fdt = files_fdtable(files);
	__clear_open_fd(fd, fdt);
	if (fd < files->next_fd)
		files->next_fd = fd;
}

void put_unused_fd(unsigned int fd)
{
	struct files_struct *files = current->files;
	spin_lock(&files->file_lock);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() to do it, _really_ bad things
 * will follow.
 *
 * NOTE: __fd_install() variant is really, really low-level; don't
 * use it unless you are forced to by truly lousy API shoved down
 * your throat.  'files' *MUST* be either current->files or obtained
 * by get_files_struct(current) done by whoever had given it to you,
 * or really bad things will happen.  Normally you want to use
 * fd_install() instead.
 */

void __fd_install(struct files_struct *files, unsigned int fd,
		struct file *file)
{
	struct fdtable *fdt;
	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	BUG_ON(fdt->fd[fd] != NULL);
	rcu_assign_pointer(fdt->fd[fd], file);
	spin_unlock(&files->file_lock);
}

void fd_install(unsigned int fd, struct file *file)
{
	__fd_install(current->files, fd, file);
}

EXPORT_SYMBOL(fd_install);

/*
 * The same warnings as for __alloc_fd()/__fd_install() apply here...
 */
int __close_fd(struct files_struct *files, unsigned fd)
{
	struct file *file;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (fd >= fdt->max_fds)
		goto out_unlock;
	file = fdt->fd[fd];
	if (!file)
		goto out_unlock;
	rcu_assign_pointer(fdt->fd[fd], NULL);
	__clear_close_on_exec(fd, fdt);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
	return filp_close(file, files);

out_unlock:
	spin_unlock(&files->file_lock);
	return -EBADF;
}

void do_close_on_exec(struct files_struct *files)
{
	unsigned i;
	struct fdtable *fdt;

	/* exec unshares first */
	spin_lock(&files->file_lock);
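	/*
	 * Note that fdt is re-fetched on every pass of the outer loop:
	 * file_lock is dropped around filp_close() below, so a cached
	 * table pointer cannot be trusted across that window.
	 */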
	for (i = 0; ; i++) {
		unsigned long set;
		unsigned fd = i * BITS_PER_LONG;
		fdt = files_fdtable(files);
		if (fd >= fdt->max_fds)
			break;
		set = fdt->close_on_exec[i];
		if (!set)
			continue;
		fdt->close_on_exec[i] = 0;
		for ( ; set ; fd++, set >>= 1) {
			struct file *file;
			if (!(set & 1))
				continue;
			file = fdt->fd[fd];
			if (!file)
				continue;
			rcu_assign_pointer(fdt->fd[fd], NULL);
			__put_unused_fd(files, fd);
			spin_unlock(&files->file_lock);
			filp_close(file, files);
			cond_resched();
			spin_lock(&files->file_lock);
		}

	}
	spin_unlock(&files->file_lock);
}

static struct file *__fget(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;
	struct file *file;

	rcu_read_lock();
	file = fcheck_files(files, fd);
	if (file) {
		/* File object ref couldn't be taken */
		if ((file->f_mode & mask) ||
		    !atomic_long_inc_not_zero(&file->f_count))
			file = NULL;
	}
	rcu_read_unlock();

	return file;
}

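/*
 * fget() rejects descriptors opened with O_PATH (their struct file has
 * FMODE_PATH set), whereas fget_raw() below hands those back as well.
 */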
struct file *fget(unsigned int fd)
{
	return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);

struct file *fget_raw(unsigned int fd)
{
	return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 */
struct file *__fget_light(unsigned int fd, fmode_t mask, int *fput_needed)
{
	struct files_struct *files = current->files;
	struct file *file;

	*fput_needed = 0;
	if (atomic_read(&files->count) == 1) {
		file = __fcheck_files(files, fd);
		if (file && (file->f_mode & mask))
			file = NULL;
	} else {
		file = __fget(fd, mask);
		if (file)
			*fput_needed = 1;
	}

	return file;
}
struct file *fget_light(unsigned int fd, int *fput_needed)
{
	return __fget_light(fd, FMODE_PATH, fput_needed);
}
EXPORT_SYMBOL(fget_light);

struct file *fget_raw_light(unsigned int fd, int *fput_needed)
{
	return __fget_light(fd, 0, fput_needed);
}

void set_close_on_exec(unsigned int fd, int flag)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (flag)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);
}

bool get_close_on_exec(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	bool res;
	rcu_read_lock();
	fdt = files_fdtable(files);
	res = close_on_exec(fd, fdt);
	rcu_read_unlock();
	return res;
}

static int do_dup2(struct files_struct *files,
	struct file *file, unsigned fd, unsigned flags)
{
	struct file *tofree;
	struct fdtable *fdt;

	/*
	 * We need to detect attempts to do dup2() over an allocated but still
	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
	 * extra work in their equivalent of fget() - they insert struct
	 * file immediately after grabbing descriptor, mark it larval if
	 * more work (e.g. actual opening) is needed and make sure that
	 * fget() treats larval files as absent.  Potentially interesting,
	 * but while extra work in fget() is trivial, locking implications
	 * and amount of surgery on open()-related paths in VFS are not.
	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
	 * scope of POSIX or SUS, since neither considers shared descriptor
	 * tables and this condition does not arise without those.
	 */
	fdt = files_fdtable(files);
	tofree = fdt->fd[fd];
	if (!tofree && fd_is_open(fd, fdt))
		goto Ebusy;
	get_file(file);
	rcu_assign_pointer(fdt->fd[fd], file);
	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);

	if (tofree)
		filp_close(tofree, files);

	return fd;

Ebusy:
	spin_unlock(&files->file_lock);
	return -EBUSY;
}

int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
	int err;
	struct files_struct *files = current->files;

	if (!file)
		return __close_fd(files, fd);

	if (fd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, fd);
	if (unlikely(err < 0))
		goto out_unlock;
	return do_dup2(files, file, fd, flags);

out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
	int err = -EBADF;
	struct file *file;
	struct files_struct *files = current->files;

	if ((flags & ~O_CLOEXEC) != 0)
		return -EINVAL;

	if (unlikely(oldfd == newfd))
		return -EINVAL;

	if (newfd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, newfd);
	file = fcheck(oldfd);
	if (unlikely(!file))
		goto Ebadf;
	if (unlikely(err < 0)) {
		if (err == -EMFILE)
			goto Ebadf;
		goto out_unlock;
	}
	return do_dup2(files, file, newfd, flags);

Ebadf:
	err = -EBADF;
out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
	if (unlikely(newfd == oldfd)) { /* corner case */
		struct files_struct *files = current->files;
		int retval = oldfd;

		rcu_read_lock();
		if (!fcheck_files(files, oldfd))
			retval = -EBADF;
		rcu_read_unlock();
		return retval;
	}
	return sys_dup3(oldfd, newfd, 0);
}

SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
	int ret = -EBADF;
	struct file *file = fget_raw(fildes);

	if (file) {
		ret = get_unused_fd();
		if (ret >= 0)
			fd_install(ret, file);
		else
			fput(file);
	}
	return ret;
}

int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
	int err;
	if (from >= rlimit(RLIMIT_NOFILE))
		return -EINVAL;
	err = alloc_fd(from, flags);
	if (err >= 0) {
		get_file(file);
		fd_install(err, file);
	}
	return err;
}

int iterate_fd(struct files_struct *files, unsigned n,
		int (*f)(const void *, struct file *, unsigned),
		const void *p)
{
	struct fdtable *fdt;
	int res = 0;
	if (!files)
		return 0;
	spin_lock(&files->file_lock);
	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
		struct file *file;
		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
		if (!file)
			continue;
		res = f(p, file, n);
		if (res)
			break;
	}
	spin_unlock(&files->file_lock);
	return res;
}
EXPORT_SYMBOL(iterate_fd);
