unix_io.c revision 2e8ca9a26b0bd7dae546a3f9a98df67b043fe3be
1/*
2 * unix_io.c --- This is the Unix (well, really POSIX) implementation
3 * 	of the I/O manager.
4 *
 * Implements a small block cache (CACHE_SIZE blocks).  Cached writes are
 * written back later unless the channel is in write-through mode.
6 *
 * Includes support for Windows NT under Cygwin.
8 *
9 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10 * 	2002 by Theodore Ts'o.
11 *
12 * %Begin-Header%
13 * This file may be redistributed under the terms of the GNU Public
14 * License.
15 * %End-Header%
16 */
17
18#define _LARGEFILE_SOURCE
19#define _LARGEFILE64_SOURCE
20
21#include <stdio.h>
22#include <string.h>
23#if HAVE_UNISTD_H
24#include <unistd.h>
25#endif
26#if HAVE_ERRNO_H
27#include <errno.h>
28#endif
29#include <fcntl.h>
30#include <time.h>
31#ifdef __linux__
32#include <sys/utsname.h>
33#endif
34#if HAVE_SYS_STAT_H
35#include <sys/stat.h>
36#endif
37#if HAVE_SYS_TYPES_H
38#include <sys/types.h>
39#endif
40#if HAVE_SYS_RESOURCE_H
41#include <sys/resource.h>
42#endif
43
44#include "ext2_fs.h"
45#include "ext2fs.h"
46
/*
 * For checking structure magic numbers...
 *
 * EXT2_CHECK_MAGIC(obj, code) makes the enclosing function return `code'
 * when obj->magic is not equal to it.  The expansion is wrapped in
 * do { } while (0) so that it is a single statement: used as the body of
 * an if, a following else still pairs with the outer if.
 *
 * NOTE(review): an included header may already define this macro; the
 * #undef makes this definition take precedence -- confirm that is wanted.
 */

#undef EXT2_CHECK_MAGIC
#define EXT2_CHECK_MAGIC(obj, code) \
	  do { \
		  if ((obj)->magic != (code)) \
			  return (code); \
	  } while (0)
53
/*
 * One slot of the block cache: a buffer for a single block together
 * with the bookkeeping used to look it up and to choose a slot to evict.
 */
struct unix_cache {
	char		*buf;		/* data buffer for the cached block */
	unsigned long	block;		/* block number held in buf */
	int		access_time;	/* stamp of last use; lower is older */
	unsigned int	dirty:1;	/* buf has not been written out yet */
	unsigned int	in_use:1;	/* slot currently holds a block */
};
61
/* Tunables for the block cache. */
enum {
	CACHE_SIZE = 8,
	WRITE_DIRECT_SIZE = 4,	/* Must be smaller than CACHE_SIZE */
	READ_DIRECT_SIZE = 4	/* Should be smaller than CACHE_SIZE */
};
65
66struct unix_private_data {
67	int	magic;
68	int	dev;
69	int	flags;
70	int	access_time;
71	ext2_loff_t offset;
72	struct unix_cache cache[CACHE_SIZE];
73};
74
75static errcode_t unix_open(const char *name, int flags, io_channel *channel);
76static errcode_t unix_close(io_channel channel);
77static errcode_t unix_set_blksize(io_channel channel, int blksize);
78static errcode_t unix_read_blk(io_channel channel, unsigned long block,
79			       int count, void *data);
80static errcode_t unix_write_blk(io_channel channel, unsigned long block,
81				int count, const void *data);
82static errcode_t unix_flush(io_channel channel);
83static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
84				int size, const void *data);
85static errcode_t unix_set_option(io_channel channel, const char *option,
86				 const char *arg);
87
88static void reuse_cache(io_channel channel, struct unix_private_data *data,
89		 struct unix_cache *cache, unsigned long block);
90
/*
 * Decide whether reads must go through a sector-sized bounce buffer.
 *
 * __FreeBSD_kernel__ is defined by GNU/kFreeBSD - the FreeBSD kernel
 * does not know buffered block devices - everything is raw.
 */
#undef NEED_BOUNCE_BUFFER
#if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
#define NEED_BOUNCE_BUFFER
#endif
98
99static struct struct_io_manager struct_unix_manager = {
100	EXT2_ET_MAGIC_IO_MANAGER,
101	"Unix I/O Manager",
102	unix_open,
103	unix_close,
104	unix_set_blksize,
105	unix_read_blk,
106	unix_write_blk,
107	unix_flush,
108#ifdef NEED_BOUNCE_BUFFER
109	0,
110#else
111	unix_write_byte,
112#endif
113	unix_set_option
114};
115
116io_manager unix_io_manager = &struct_unix_manager;
117
118/*
119 * Here are the raw I/O functions
120 */
121#ifndef NEED_BOUNCE_BUFFER
122static errcode_t raw_read_blk(io_channel channel,
123			      struct unix_private_data *data,
124			      unsigned long block,
125			      int count, void *buf)
126{
127	errcode_t	retval;
128	ssize_t		size;
129	ext2_loff_t	location;
130	int		actual = 0;
131
132	size = (count < 0) ? -count : count * channel->block_size;
133	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
134	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
135		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
136		goto error_out;
137	}
138	actual = read(data->dev, buf, size);
139	if (actual != size) {
140		if (actual < 0)
141			actual = 0;
142		retval = EXT2_ET_SHORT_READ;
143		goto error_out;
144	}
145	return 0;
146
147error_out:
148	memset((char *) buf+actual, 0, size-actual);
149	if (channel->read_error)
150		retval = (channel->read_error)(channel, block, count, buf,
151					       size, actual, retval);
152	return retval;
153}
154#else /* NEED_BOUNCE_BUFFER */
155/*
156 * Windows and FreeBSD block devices only allow sector alignment IO in offset and size
157 */
158static errcode_t raw_read_blk(io_channel channel,
159			      struct unix_private_data *data,
160			      unsigned long block,
161			      int count, void *buf)
162{
163	errcode_t	retval;
164	size_t		size, alignsize, fragment;
165	ext2_loff_t	location;
166	int		total = 0, actual;
167#define BLOCKALIGN 512
168	char		sector[BLOCKALIGN];
169
170	size = (count < 0) ? -count : count * channel->block_size;
171	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
172#ifdef DEBUG
173	printf("count=%d, size=%d, block=%d, blk_size=%d, location=%lx\n",
174	 		count, size, block, channel->block_size, location);
175#endif
176	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
177		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
178		goto error_out;
179	}
180	fragment = size % BLOCKALIGN;
181	alignsize = size - fragment;
182	if (alignsize) {
183		actual = read(data->dev, buf, alignsize);
184		if (actual != alignsize)
185			goto short_read;
186	}
187	if (fragment) {
188		actual = read(data->dev, sector, BLOCKALIGN);
189		if (actual != BLOCKALIGN)
190			goto short_read;
191		memcpy(buf+alignsize, sector, fragment);
192	}
193	return 0;
194
195short_read:
196	if (actual>0)
197		total += actual;
198	retval = EXT2_ET_SHORT_READ;
199
200error_out:
201	memset((char *) buf+total, 0, size-actual);
202	if (channel->read_error)
203		retval = (channel->read_error)(channel, block, count, buf,
204					       size, actual, retval);
205	return retval;
206}
207#endif
208
209static errcode_t raw_write_blk(io_channel channel,
210			       struct unix_private_data *data,
211			       unsigned long block,
212			       int count, const void *buf)
213{
214	ssize_t		size;
215	ext2_loff_t	location;
216	int		actual = 0;
217	errcode_t	retval;
218
219	if (count == 1)
220		size = channel->block_size;
221	else {
222		if (count < 0)
223			size = -count;
224		else
225			size = count * channel->block_size;
226	}
227
228	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
229	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
230		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
231		goto error_out;
232	}
233
234	actual = write(data->dev, buf, size);
235	if (actual != size) {
236		retval = EXT2_ET_SHORT_WRITE;
237		goto error_out;
238	}
239	return 0;
240
241error_out:
242	if (channel->write_error)
243		retval = (channel->write_error)(channel, block, count, buf,
244						size, actual, retval);
245	return retval;
246}
247
248
249/*
250 * Here we implement the cache functions
251 */
252
253/* Allocate the cache buffers */
254static errcode_t alloc_cache(io_channel channel,
255			     struct unix_private_data *data)
256{
257	errcode_t		retval;
258	struct unix_cache	*cache;
259	int			i;
260
261	data->access_time = 0;
262	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
263		cache->block = 0;
264		cache->access_time = 0;
265		cache->dirty = 0;
266		cache->in_use = 0;
267		if ((retval = ext2fs_get_mem(channel->block_size,
268					     &cache->buf)))
269			return retval;
270	}
271	return 0;
272}
273
274/* Free the cache buffers */
275static void free_cache(struct unix_private_data *data)
276{
277	struct unix_cache	*cache;
278	int			i;
279
280	data->access_time = 0;
281	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
282		cache->block = 0;
283		cache->access_time = 0;
284		cache->dirty = 0;
285		cache->in_use = 0;
286		if (cache->buf)
287			ext2fs_free_mem(&cache->buf);
288		cache->buf = 0;
289	}
290}
291
292#ifndef NO_IO_CACHE
293/*
294 * Try to find a block in the cache.  If the block is not found, and
295 * eldest is a non-zero pointer, then fill in eldest with the cache
296 * entry to that should be reused.
297 */
298static struct unix_cache *find_cached_block(struct unix_private_data *data,
299					    unsigned long block,
300					    struct unix_cache **eldest)
301{
302	struct unix_cache	*cache, *unused_cache, *oldest_cache;
303	int			i;
304
305	unused_cache = oldest_cache = 0;
306	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
307		if (!cache->in_use) {
308			if (!unused_cache)
309				unused_cache = cache;
310			continue;
311		}
312		if (cache->block == block) {
313			cache->access_time = ++data->access_time;
314			return cache;
315		}
316		if (!oldest_cache ||
317		    (cache->access_time < oldest_cache->access_time))
318			oldest_cache = cache;
319	}
320	if (eldest)
321		*eldest = (unused_cache) ? unused_cache : oldest_cache;
322	return 0;
323}
324
325/*
326 * Reuse a particular cache entry for another block.
327 */
328static void reuse_cache(io_channel channel, struct unix_private_data *data,
329		 struct unix_cache *cache, unsigned long block)
330{
331	if (cache->dirty && cache->in_use)
332		raw_write_blk(channel, data, cache->block, 1, cache->buf);
333
334	cache->in_use = 1;
335	cache->dirty = 0;
336	cache->block = block;
337	cache->access_time = ++data->access_time;
338}
339
340/*
341 * Flush all of the blocks in the cache
342 */
343static errcode_t flush_cached_blocks(io_channel channel,
344				     struct unix_private_data *data,
345				     int invalidate)
346
347{
348	struct unix_cache	*cache;
349	errcode_t		retval, retval2;
350	int			i;
351
352	retval2 = 0;
353	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
354		if (!cache->in_use)
355			continue;
356
357		if (invalidate)
358			cache->in_use = 0;
359
360		if (!cache->dirty)
361			continue;
362
363		retval = raw_write_blk(channel, data,
364				       cache->block, 1, cache->buf);
365		if (retval)
366			retval2 = retval;
367		else
368			cache->dirty = 0;
369	}
370	return retval2;
371}
372#endif /* NO_IO_CACHE */
373
374static errcode_t unix_open(const char *name, int flags, io_channel *channel)
375{
376	io_channel	io = NULL;
377	struct unix_private_data *data = NULL;
378	errcode_t	retval;
379	int		open_flags;
380	struct stat	st;
381#ifdef __linux__
382	struct 		utsname ut;
383#endif
384
385	if (name == 0)
386		return EXT2_ET_BAD_DEVICE_NAME;
387	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
388	if (retval)
389		return retval;
390	memset(io, 0, sizeof(struct struct_io_channel));
391	io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
392	retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
393	if (retval)
394		goto cleanup;
395
396	io->manager = unix_io_manager;
397	retval = ext2fs_get_mem(strlen(name)+1, &io->name);
398	if (retval)
399		goto cleanup;
400
401	strcpy(io->name, name);
402	io->private_data = data;
403	io->block_size = 1024;
404	io->read_error = 0;
405	io->write_error = 0;
406	io->refcount = 1;
407
408	memset(data, 0, sizeof(struct unix_private_data));
409	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
410
411	if ((retval = alloc_cache(io, data)))
412		goto cleanup;
413
414	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
415#ifdef HAVE_OPEN64
416	data->dev = open64(io->name, open_flags);
417#else
418	data->dev = open(io->name, open_flags);
419#endif
420	if (data->dev < 0) {
421		retval = errno;
422		goto cleanup;
423	}
424
425#ifdef __linux__
426#undef RLIM_INFINITY
427#if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
428#define RLIM_INFINITY	((unsigned long)(~0UL>>1))
429#else
430#define RLIM_INFINITY  (~0UL)
431#endif
432	/*
433	 * Work around a bug in 2.4.10-2.4.18 kernels where writes to
434	 * block devices are wrongly getting hit by the filesize
435	 * limit.  This workaround isn't perfect, since it won't work
436	 * if glibc wasn't built against 2.2 header files.  (Sigh.)
437	 *
438	 */
439	if ((flags & IO_FLAG_RW) &&
440	    (uname(&ut) == 0) &&
441	    ((ut.release[0] == '2') && (ut.release[1] == '.') &&
442	     (ut.release[2] == '4') && (ut.release[3] == '.') &&
443	     (ut.release[4] == '1') && (ut.release[5] >= '0') &&
444	     (ut.release[5] < '8')) &&
445	    (fstat(data->dev, &st) == 0) &&
446	    (S_ISBLK(st.st_mode))) {
447		struct rlimit	rlim;
448
449		rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
450		setrlimit(RLIMIT_FSIZE, &rlim);
451		getrlimit(RLIMIT_FSIZE, &rlim);
452		if (((unsigned long) rlim.rlim_cur) <
453		    ((unsigned long) rlim.rlim_max)) {
454			rlim.rlim_cur = rlim.rlim_max;
455			setrlimit(RLIMIT_FSIZE, &rlim);
456		}
457	}
458#endif
459	*channel = io;
460	return 0;
461
462cleanup:
463	if (data) {
464		free_cache(data);
465		ext2fs_free_mem(&data);
466	}
467	if (io)
468		ext2fs_free_mem(&io);
469	return retval;
470}
471
472static errcode_t unix_close(io_channel channel)
473{
474	struct unix_private_data *data;
475	errcode_t	retval = 0;
476
477	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
478	data = (struct unix_private_data *) channel->private_data;
479	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
480
481	if (--channel->refcount > 0)
482		return 0;
483
484#ifndef NO_IO_CACHE
485	retval = flush_cached_blocks(channel, data, 0);
486#endif
487
488	if (close(data->dev) < 0)
489		retval = errno;
490	free_cache(data);
491
492	ext2fs_free_mem(&channel->private_data);
493	if (channel->name)
494		ext2fs_free_mem(&channel->name);
495	ext2fs_free_mem(&channel);
496	return retval;
497}
498
499static errcode_t unix_set_blksize(io_channel channel, int blksize)
500{
501	struct unix_private_data *data;
502	errcode_t		retval;
503
504	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
505	data = (struct unix_private_data *) channel->private_data;
506	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
507
508	if (channel->block_size != blksize) {
509#ifndef NO_IO_CACHE
510		if ((retval = flush_cached_blocks(channel, data, 0)))
511			return retval;
512#endif
513
514		channel->block_size = blksize;
515		free_cache(data);
516		if ((retval = alloc_cache(channel, data)))
517			return retval;
518	}
519	return 0;
520}
521
522
523static errcode_t unix_read_blk(io_channel channel, unsigned long block,
524			       int count, void *buf)
525{
526	struct unix_private_data *data;
527	struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
528	errcode_t	retval;
529	char		*cp;
530	int		i, j;
531
532	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
533	data = (struct unix_private_data *) channel->private_data;
534	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
535
536#ifdef NO_IO_CACHE
537	return raw_read_blk(channel, data, block, count, buf);
538#else
539	/*
540	 * If we're doing an odd-sized read or a very large read,
541	 * flush out the cache and then do a direct read.
542	 */
543	if (count < 0 || count > WRITE_DIRECT_SIZE) {
544		if ((retval = flush_cached_blocks(channel, data, 0)))
545			return retval;
546		return raw_read_blk(channel, data, block, count, buf);
547	}
548
549	cp = buf;
550	while (count > 0) {
551		/* If it's in the cache, use it! */
552		if ((cache = find_cached_block(data, block, &reuse[0]))) {
553#ifdef DEBUG
554			printf("Using cached block %d\n", block);
555#endif
556			memcpy(cp, cache->buf, channel->block_size);
557			count--;
558			block++;
559			cp += channel->block_size;
560			continue;
561		}
562		/*
563		 * Find the number of uncached blocks so we can do a
564		 * single read request
565		 */
566		for (i=1; i < count; i++)
567			if (find_cached_block(data, block+i, &reuse[i]))
568				break;
569#ifdef DEBUG
570		printf("Reading %d blocks starting at %d\n", i, block);
571#endif
572		if ((retval = raw_read_blk(channel, data, block, i, cp)))
573			return retval;
574
575		/* Save the results in the cache */
576		for (j=0; j < i; j++) {
577			count--;
578			cache = reuse[j];
579			reuse_cache(channel, data, cache, block++);
580			memcpy(cache->buf, cp, channel->block_size);
581			cp += channel->block_size;
582		}
583	}
584	return 0;
585#endif /* NO_IO_CACHE */
586}
587
588static errcode_t unix_write_blk(io_channel channel, unsigned long block,
589				int count, const void *buf)
590{
591	struct unix_private_data *data;
592	struct unix_cache *cache, *reuse;
593	errcode_t	retval = 0;
594	const char	*cp;
595	int		writethrough;
596
597	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
598	data = (struct unix_private_data *) channel->private_data;
599	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
600
601#ifdef NO_IO_CACHE
602	return raw_write_blk(channel, data, block, count, buf);
603#else
604	/*
605	 * If we're doing an odd-sized write or a very large write,
606	 * flush out the cache completely and then do a direct write.
607	 */
608	if (count < 0 || count > WRITE_DIRECT_SIZE) {
609		if ((retval = flush_cached_blocks(channel, data, 1)))
610			return retval;
611		return raw_write_blk(channel, data, block, count, buf);
612	}
613
614	/*
615	 * For a moderate-sized multi-block write, first force a write
616	 * if we're in write-through cache mode, and then fill the
617	 * cache with the blocks.
618	 */
619	writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
620	if (writethrough)
621		retval = raw_write_blk(channel, data, block, count, buf);
622
623	cp = buf;
624	while (count > 0) {
625		cache = find_cached_block(data, block, &reuse);
626		if (!cache) {
627			cache = reuse;
628			reuse_cache(channel, data, cache, block);
629		}
630		memcpy(cache->buf, cp, channel->block_size);
631		cache->dirty = !writethrough;
632		count--;
633		block++;
634		cp += channel->block_size;
635	}
636	return retval;
637#endif /* NO_IO_CACHE */
638}
639
640static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
641				 int size, const void *buf)
642{
643	struct unix_private_data *data;
644	errcode_t	retval = 0;
645	ssize_t		actual;
646
647	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
648	data = (struct unix_private_data *) channel->private_data;
649	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
650
651#ifndef NO_IO_CACHE
652	/*
653	 * Flush out the cache completely
654	 */
655	if ((retval = flush_cached_blocks(channel, data, 1)))
656		return retval;
657#endif
658
659	if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
660		return errno;
661
662	actual = write(data->dev, buf, size);
663	if (actual != size)
664		return EXT2_ET_SHORT_WRITE;
665
666	return 0;
667}
668
669/*
670 * Flush data buffers to disk.
671 */
672static errcode_t unix_flush(io_channel channel)
673{
674	struct unix_private_data *data;
675	errcode_t retval = 0;
676
677	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
678	data = (struct unix_private_data *) channel->private_data;
679	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
680
681#ifndef NO_IO_CACHE
682	retval = flush_cached_blocks(channel, data, 0);
683#endif
684	fsync(data->dev);
685	return retval;
686}
687
688static errcode_t unix_set_option(io_channel channel, const char *option,
689				 const char *arg)
690{
691	struct unix_private_data *data;
692	unsigned long tmp;
693	char *end;
694
695	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
696	data = (struct unix_private_data *) channel->private_data;
697	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
698
699	if (!strcmp(option, "offset")) {
700		if (!arg)
701			return EXT2_ET_INVALID_ARGUMENT;
702
703		tmp = strtoul(arg, &end, 0);
704		if (*end)
705			return EXT2_ET_INVALID_ARGUMENT;
706		data->offset = tmp;
707		return 0;
708	}
709	return EXT2_ET_INVALID_ARGUMENT;
710}
711