unix_io.c revision b8a953157bce577bff6f9d8437e8d7f2c881fe63
1/*
2 * unix_io.c --- This is the Unix (well, really POSIX) implementation
3 * 	of the I/O manager.
4 *
 * Implements a small block cache (CACHE_SIZE entries); it is write-back
 * unless the channel has CHANNEL_FLAGS_WRITETHROUGH set.
6 *
 * Includes support for Windows NT under Cygwin.
8 *
9 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10 * 	2002 by Theodore Ts'o.
11 *
12 * %Begin-Header%
13 * This file may be redistributed under the terms of the GNU Public
14 * License.
15 * %End-Header%
16 */
17
18#define _LARGEFILE_SOURCE
19#define _LARGEFILE64_SOURCE
20
21#include <stdio.h>
22#include <string.h>
23#if HAVE_UNISTD_H
24#include <unistd.h>
25#endif
26#if HAVE_ERRNO_H
27#include <errno.h>
28#endif
29#include <fcntl.h>
30#include <time.h>
31#ifdef __linux__
32#include <sys/utsname.h>
33#endif
34#if HAVE_SYS_STAT_H
35#include <sys/stat.h>
36#endif
37#if HAVE_SYS_TYPES_H
38#include <sys/types.h>
39#endif
40#if HAVE_SYS_RESOURCE_H
41#include <sys/resource.h>
42#endif
43
44#include "ext2_fs.h"
45#include "ext2fs.h"
46
/*
 * For checking structure magic numbers...
 *
 * EXT2_CHECK_MAGIC(obj, code) makes the enclosing function return `code'
 * when obj->magic is not equal to `code'.  The expansion is wrapped in
 * do { } while (0) so that it is a single statement and can be used
 * safely as the body of an unbraced if/else.
 *
 * The #undef makes sure this file-local definition is the one in
 * effect even if an included header has already defined the macro.
 */

#undef EXT2_CHECK_MAGIC
#define EXT2_CHECK_MAGIC(obj, code) \
	do { \
		if ((obj)->magic != (code)) \
			return (code); \
	} while (0)
53
/*
 * One slot of the block cache.
 *
 * The two flags are unsigned bit-fields: a signed one-bit field cannot
 * portably hold the value 1, and the code assigns 1 to both of them.
 */
struct unix_cache {
	char		*buf;		/* block contents (see alloc_cache) */
	unsigned long	block;		/* number of the block held in buf */
	int		access_time;	/* channel access counter at last use */
	unsigned int	dirty:1;	/* buf has data not yet written out */
	unsigned int	in_use:1;	/* slot currently holds a block */
};
61
/*
 * CACHE_SIZE is the number of slots in unix_private_data.cache.
 * Writes of more than WRITE_DIRECT_SIZE blocks bypass the cache, and
 * READ_DIRECT_SIZE sizes the slot array used by unix_read_blk() for
 * multi-block reads.
 */
#define CACHE_SIZE 8
#define WRITE_DIRECT_SIZE 4	/* Must be smaller than CACHE_SIZE */
#define READ_DIRECT_SIZE 4	/* Should be smaller than CACHE_SIZE */

/* Enforce the documented constraint at compile time. */
#if WRITE_DIRECT_SIZE >= CACHE_SIZE
#error "WRITE_DIRECT_SIZE must be smaller than CACHE_SIZE"
#endif
65
/*
 * Per-channel state of the Unix I/O manager; the methods below reach it
 * through channel->private_data.
 */
struct unix_private_data {
	int	magic;		/* set to EXT2_ET_MAGIC_UNIX_IO_CHANNEL by unix_open */
	int	dev;		/* file descriptor returned by open() */
	int	flags;		/* not referenced by the code visible in this file */
	int	access_time;	/* counter bumped on each cache hit or slot reuse */
	struct unix_cache cache[CACHE_SIZE];	/* the block cache slots */
};
73
/* Forward declarations of the methods placed in the manager table below. */
static errcode_t unix_open(const char *name, int flags, io_channel *channel);
static errcode_t unix_close(io_channel channel);
static errcode_t unix_set_blksize(io_channel channel, int blksize);
static errcode_t unix_read_blk(io_channel channel, unsigned long block,
			       int count, void *data);
static errcode_t unix_write_blk(io_channel channel, unsigned long block,
				int count, const void *data);
static errcode_t unix_flush(io_channel channel);
static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
				int size, const void *data);

/*
 * Cache helper.  NOTE(review): this declaration is unconditional, but
 * the definition sits inside #ifndef NO_IO_CACHE.
 */
static void reuse_cache(io_channel channel, struct unix_private_data *data,
		 struct unix_cache *cache, unsigned long block);
87
/*
 * Method table for this I/O manager.  The initializer is positional, so
 * the entries must stay in the order of the members of
 * struct struct_io_manager (declared outside this file).  Under Cygwin
 * the last slot is 0 instead of unix_write_byte.
 */
static struct struct_io_manager struct_unix_manager = {
	EXT2_ET_MAGIC_IO_MANAGER,
	"Unix I/O Manager",
	unix_open,
	unix_close,
	unix_set_blksize,
	unix_read_blk,
	unix_write_blk,
	unix_flush,
#ifdef __CYGWIN__
	0
#else
	unix_write_byte
#endif
};
103
/* Externally visible handle to the method table above. */
io_manager unix_io_manager = &struct_unix_manager;
105
106/*
107 * Here are the raw I/O functions
108 */
109#ifndef __CYGWIN__
110static errcode_t raw_read_blk(io_channel channel,
111			      struct unix_private_data *data,
112			      unsigned long block,
113			      int count, void *buf)
114{
115	errcode_t	retval;
116	size_t		size;
117	ext2_loff_t	location;
118	int		actual = 0;
119
120	size = (count < 0) ? -count : count * channel->block_size;
121	location = (ext2_loff_t) block * channel->block_size;
122	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
123		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
124		goto error_out;
125	}
126	actual = read(data->dev, buf, size);
127	if (actual != size) {
128		if (actual < 0)
129			actual = 0;
130		retval = EXT2_ET_SHORT_READ;
131		goto error_out;
132	}
133	return 0;
134
135error_out:
136	memset((char *) buf+actual, 0, size-actual);
137	if (channel->read_error)
138		retval = (channel->read_error)(channel, block, count, buf,
139					       size, actual, retval);
140	return retval;
141}
142#else /* __CYGWIN__ */
143/*
144 * Windows block devices only allow sector alignment IO in offset and size
145 */
146static errcode_t raw_read_blk(io_channel channel,
147			      struct unix_private_data *data,
148			      unsigned long block,
149			      int count, void *buf)
150{
151	errcode_t	retval;
152	size_t		size, alignsize, fragment;
153	ext2_loff_t	location;
154	int		total = 0, actual;
155#define BLOCKALIGN 512
156	char		sector[BLOCKALIGN];
157
158	size = (count < 0) ? -count : count * channel->block_size;
159	location = (ext2_loff_t) block * channel->block_size;
160#ifdef DEBUG
161	printf("count=%d, size=%d, block=%d, blk_size=%d, location=%lx\n",
162	 		count, size, block, channel->block_size, location);
163#endif
164	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
165		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
166		goto error_out;
167	}
168	fragment = size % BLOCKALIGN;
169	alignsize = size - fragment;
170	if (alignsize) {
171		actual = read(data->dev, buf, alignsize);
172		if (actual != alignsize)
173			goto short_read;
174	}
175	if (fragment) {
176		actual = read(data->dev, sector, BLOCKALIGN);
177		if (actual != BLOCKALIGN)
178			goto short_read;
179		memcpy(buf+alignsize, sector, fragment);
180	}
181	return 0;
182
183short_read:
184	if (actual>0)
185		total += actual;
186	retval = EXT2_ET_SHORT_READ;
187
188error_out:
189	memset((char *) buf+total, 0, size-actual);
190	if (channel->read_error)
191		retval = (channel->read_error)(channel, block, count, buf,
192					       size, actual, retval);
193	return retval;
194}
195#endif
196
197static errcode_t raw_write_blk(io_channel channel,
198			       struct unix_private_data *data,
199			       unsigned long block,
200			       int count, const void *buf)
201{
202	size_t		size;
203	ext2_loff_t	location;
204	int		actual = 0;
205	errcode_t	retval;
206
207	if (count == 1)
208		size = channel->block_size;
209	else {
210		if (count < 0)
211			size = -count;
212		else
213			size = count * channel->block_size;
214	}
215
216	location = (ext2_loff_t) block * channel->block_size;
217	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
218		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
219		goto error_out;
220	}
221
222	actual = write(data->dev, buf, size);
223	if (actual != size) {
224		retval = EXT2_ET_SHORT_WRITE;
225		goto error_out;
226	}
227	return 0;
228
229error_out:
230	if (channel->write_error)
231		retval = (channel->write_error)(channel, block, count, buf,
232						size, actual, retval);
233	return retval;
234}
235
236
237/*
238 * Here we implement the cache functions
239 */
240
241/* Allocate the cache buffers */
242static errcode_t alloc_cache(io_channel channel,
243			     struct unix_private_data *data)
244{
245	errcode_t		retval;
246	struct unix_cache	*cache;
247	int			i;
248
249	data->access_time = 0;
250	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
251		cache->block = 0;
252		cache->access_time = 0;
253		cache->dirty = 0;
254		cache->in_use = 0;
255		if ((retval = ext2fs_get_mem(channel->block_size,
256					     (void **) &cache->buf)))
257			return retval;
258	}
259	return 0;
260}
261
262/* Free the cache buffers */
263static void free_cache(io_channel channel,
264		       struct unix_private_data *data)
265{
266	struct unix_cache	*cache;
267	int			i;
268
269	data->access_time = 0;
270	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
271		cache->block = 0;
272		cache->access_time = 0;
273		cache->dirty = 0;
274		cache->in_use = 0;
275		if (cache->buf)
276			ext2fs_free_mem((void **) &cache->buf);
277		cache->buf = 0;
278	}
279}
280
281#ifndef NO_IO_CACHE
282/*
283 * Try to find a block in the cache.  If the block is not found, and
284 * eldest is a non-zero pointer, then fill in eldest with the cache
285 * entry to that should be reused.
286 */
287static struct unix_cache *find_cached_block(io_channel channel,
288					    struct unix_private_data *data,
289					    unsigned long block,
290					    struct unix_cache **eldest)
291{
292	struct unix_cache	*cache, *unused_cache, *oldest_cache;
293	int			i;
294
295	unused_cache = oldest_cache = 0;
296	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
297		if (!cache->in_use) {
298			if (!unused_cache)
299				unused_cache = cache;
300			continue;
301		}
302		if (cache->block == block) {
303			cache->access_time = ++data->access_time;
304			return cache;
305		}
306		if (!oldest_cache ||
307		    (cache->access_time < oldest_cache->access_time))
308			oldest_cache = cache;
309	}
310	if (eldest)
311		*eldest = (unused_cache) ? unused_cache : oldest_cache;
312	return 0;
313}
314
315/*
316 * Reuse a particular cache entry for another block.
317 */
318static void reuse_cache(io_channel channel, struct unix_private_data *data,
319		 struct unix_cache *cache, unsigned long block)
320{
321	if (cache->dirty && cache->in_use)
322		raw_write_blk(channel, data, cache->block, 1, cache->buf);
323
324	cache->in_use = 1;
325	cache->dirty = 0;
326	cache->block = block;
327	cache->access_time = ++data->access_time;
328}
329
330/*
331 * Flush all of the blocks in the cache
332 */
333static errcode_t flush_cached_blocks(io_channel channel,
334				     struct unix_private_data *data,
335				     int invalidate)
336
337{
338	struct unix_cache	*cache;
339	errcode_t		retval, retval2;
340	int			i;
341
342	retval2 = 0;
343	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
344		if (!cache->in_use)
345			continue;
346
347		if (invalidate)
348			cache->in_use = 0;
349
350		if (!cache->dirty)
351			continue;
352
353		retval = raw_write_blk(channel, data,
354				       cache->block, 1, cache->buf);
355		if (retval)
356			retval2 = retval;
357		else
358			cache->dirty = 0;
359	}
360	return retval2;
361}
362#endif /* NO_IO_CACHE */
363
364static errcode_t unix_open(const char *name, int flags, io_channel *channel)
365{
366	io_channel	io = NULL;
367	struct unix_private_data *data = NULL;
368	errcode_t	retval;
369	int		open_flags;
370	struct stat	st;
371#ifdef __linux__
372	struct 		utsname ut;
373#endif
374
375	if (name == 0)
376		return EXT2_ET_BAD_DEVICE_NAME;
377	retval = ext2fs_get_mem(sizeof(struct struct_io_channel),
378				(void **) &io);
379	if (retval)
380		return retval;
381	memset(io, 0, sizeof(struct struct_io_channel));
382	io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
383	retval = ext2fs_get_mem(sizeof(struct unix_private_data),
384				(void **) &data);
385	if (retval)
386		goto cleanup;
387
388	io->manager = unix_io_manager;
389	retval = ext2fs_get_mem(strlen(name)+1, (void **) &io->name);
390	if (retval)
391		goto cleanup;
392
393	strcpy(io->name, name);
394	io->private_data = data;
395	io->block_size = 1024;
396	io->read_error = 0;
397	io->write_error = 0;
398	io->refcount = 1;
399
400	memset(data, 0, sizeof(struct unix_private_data));
401	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
402
403	if ((retval = alloc_cache(io, data)))
404		goto cleanup;
405
406	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
407#ifdef HAVE_OPEN64
408	data->dev = open64(name, open_flags);
409#else
410	data->dev = open(name, open_flags);
411#endif
412	if (data->dev < 0) {
413		retval = errno;
414		goto cleanup;
415	}
416
417#ifdef __linux__
418#undef RLIM_INFINITY
419#if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
420#define RLIM_INFINITY	((unsigned long)(~0UL>>1))
421#else
422#define RLIM_INFINITY  (~0UL)
423#endif
424	/*
425	 * Work around a bug in 2.4.10-2.4.18 kernels where writes to
426	 * block devices are wrongly getting hit by the filesize
427	 * limit.  This workaround isn't perfect, since it won't work
428	 * if glibc wasn't built against 2.2 header files.  (Sigh.)
429	 *
430	 */
431	if ((flags & IO_FLAG_RW) &&
432	    (uname(&ut) == 0) &&
433	    ((ut.release[0] == '2') && (ut.release[1] == '.') &&
434	     (ut.release[2] == '4') && (ut.release[3] == '.') &&
435	     (ut.release[4] == '1') && (ut.release[5] >= '0') &&
436	     (ut.release[5] < '8')) &&
437	    (fstat(data->dev, &st) == 0) &&
438	    (S_ISBLK(st.st_mode))) {
439		struct rlimit	rlim;
440
441		rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
442		setrlimit(RLIMIT_FSIZE, &rlim);
443		getrlimit(RLIMIT_FSIZE, &rlim);
444		if (((unsigned long) rlim.rlim_cur) <
445		    ((unsigned long) rlim.rlim_max)) {
446			rlim.rlim_cur = rlim.rlim_max;
447			setrlimit(RLIMIT_FSIZE, &rlim);
448		}
449	}
450#endif
451	*channel = io;
452	return 0;
453
454cleanup:
455	if (data) {
456		free_cache(io, data);
457		ext2fs_free_mem((void **) &data);
458	}
459	if (io)
460		ext2fs_free_mem((void **) &io);
461	return retval;
462}
463
464static errcode_t unix_close(io_channel channel)
465{
466	struct unix_private_data *data;
467	errcode_t	retval = 0;
468
469	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
470	data = (struct unix_private_data *) channel->private_data;
471	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
472
473	if (--channel->refcount > 0)
474		return 0;
475
476#ifndef NO_IO_CACHE
477	retval = flush_cached_blocks(channel, data, 0);
478#endif
479
480	if (close(data->dev) < 0)
481		retval = errno;
482	free_cache(channel, data);
483
484	ext2fs_free_mem((void **) &channel->private_data);
485	if (channel->name)
486		ext2fs_free_mem((void **) &channel->name);
487	ext2fs_free_mem((void **) &channel);
488	return retval;
489}
490
491static errcode_t unix_set_blksize(io_channel channel, int blksize)
492{
493	struct unix_private_data *data;
494	errcode_t		retval;
495
496	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
497	data = (struct unix_private_data *) channel->private_data;
498	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
499
500	if (channel->block_size != blksize) {
501#ifndef NO_IO_CACHE
502		if ((retval = flush_cached_blocks(channel, data, 0)))
503			return retval;
504#endif
505
506		channel->block_size = blksize;
507		free_cache(channel, data);
508		if ((retval = alloc_cache(channel, data)))
509			return retval;
510	}
511	return 0;
512}
513
514
515static errcode_t unix_read_blk(io_channel channel, unsigned long block,
516			       int count, void *buf)
517{
518	struct unix_private_data *data;
519	struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
520	errcode_t	retval;
521	char		*cp;
522	int		i, j;
523
524	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
525	data = (struct unix_private_data *) channel->private_data;
526	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
527
528#ifdef NO_IO_CACHE
529	return raw_read_blk(channel, data, block, count, buf);
530#else
531	/*
532	 * If we're doing an odd-sized read or a very large read,
533	 * flush out the cache and then do a direct read.
534	 */
535	if (count < 0 || count > WRITE_DIRECT_SIZE) {
536		if ((retval = flush_cached_blocks(channel, data, 0)))
537			return retval;
538		return raw_read_blk(channel, data, block, count, buf);
539	}
540
541	cp = buf;
542	while (count > 0) {
543		/* If it's in the cache, use it! */
544		if ((cache = find_cached_block(channel, data, block,
545					       &reuse[0]))) {
546#ifdef DEBUG
547			printf("Using cached block %d\n", block);
548#endif
549			memcpy(cp, cache->buf, channel->block_size);
550			count--;
551			block++;
552			cp += channel->block_size;
553			continue;
554		}
555		/*
556		 * Find the number of uncached blocks so we can do a
557		 * single read request
558		 */
559		for (i=1; i < count; i++)
560			if (find_cached_block(channel, data, block+i,
561					      &reuse[i]))
562				break;
563#ifdef DEBUG
564		printf("Reading %d blocks starting at %d\n", i, block);
565#endif
566		if ((retval = raw_read_blk(channel, data, block, i, cp)))
567			return retval;
568
569		/* Save the results in the cache */
570		for (j=0; j < i; j++) {
571			count--;
572			cache = reuse[j];
573			reuse_cache(channel, data, cache, block++);
574			memcpy(cache->buf, cp, channel->block_size);
575			cp += channel->block_size;
576		}
577	}
578	return 0;
579#endif /* NO_IO_CACHE */
580}
581
582static errcode_t unix_write_blk(io_channel channel, unsigned long block,
583				int count, const void *buf)
584{
585	struct unix_private_data *data;
586	struct unix_cache *cache, *reuse;
587	errcode_t	retval = 0;
588	const char	*cp;
589	int		writethrough;
590
591	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
592	data = (struct unix_private_data *) channel->private_data;
593	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
594
595#ifdef NO_IO_CACHE
596	return raw_write_blk(channel, data, block, count, buf);
597#else
598	/*
599	 * If we're doing an odd-sized write or a very large write,
600	 * flush out the cache completely and then do a direct write.
601	 */
602	if (count < 0 || count > WRITE_DIRECT_SIZE) {
603		if ((retval = flush_cached_blocks(channel, data, 1)))
604			return retval;
605		return raw_write_blk(channel, data, block, count, buf);
606	}
607
608	/*
609	 * For a moderate-sized multi-block write, first force a write
610	 * if we're in write-through cache mode, and then fill the
611	 * cache with the blocks.
612	 */
613	writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
614	if (writethrough)
615		retval = raw_write_blk(channel, data, block, count, buf);
616
617	cp = buf;
618	while (count > 0) {
619		cache = find_cached_block(channel, data, block, &reuse);
620		if (!cache) {
621			cache = reuse;
622			reuse_cache(channel, data, cache, block);
623		}
624		memcpy(cache->buf, cp, channel->block_size);
625		cache->dirty = !writethrough;
626		count--;
627		block++;
628		cp += channel->block_size;
629	}
630	return retval;
631#endif /* NO_IO_CACHE */
632}
633
634static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
635				 int size, const void *buf)
636{
637	struct unix_private_data *data;
638	errcode_t	retval = 0;
639	size_t		actual;
640
641	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
642	data = (struct unix_private_data *) channel->private_data;
643	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
644
645#ifndef NO_IO_CACHE
646	/*
647	 * Flush out the cache completely
648	 */
649	if ((retval = flush_cached_blocks(channel, data, 1)))
650		return retval;
651#endif
652
653	if (lseek(data->dev, offset, SEEK_SET) < 0)
654		return errno;
655
656	actual = write(data->dev, buf, size);
657	if (actual != size)
658		return EXT2_ET_SHORT_WRITE;
659
660	return 0;
661}
662
663/*
664 * Flush data buffers to disk.
665 */
666static errcode_t unix_flush(io_channel channel)
667{
668	struct unix_private_data *data;
669	errcode_t retval = 0;
670
671	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
672	data = (struct unix_private_data *) channel->private_data;
673	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
674
675#ifndef NO_IO_CACHE
676	retval = flush_cached_blocks(channel, data, 0);
677#endif
678	fsync(data->dev);
679	return retval;
680}
681
682