unix_io.c revision 544349270e4c74a6feb971123884a8cf5052a7ee
1/*
2 * unix_io.c --- This is the Unix (well, really POSIX) implementation
3 * 	of the I/O manager.
4 *
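 * Implements a small block cache (CACHE_SIZE entries); writes go
 * straight through to the device only when the channel has
 * CHANNEL_FLAGS_WRITETHROUGH set.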
6 *
 * Includes support for Windows NT under Cygwin.
8 *
9 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10 * 	2002 by Theodore Ts'o.
11 *
12 * %Begin-Header%
13 * This file may be redistributed under the terms of the GNU Public
14 * License.
15 * %End-Header%
16 */
17
18#define _LARGEFILE_SOURCE
19#define _LARGEFILE64_SOURCE
20
21#include <stdio.h>
22#include <string.h>
23#if HAVE_UNISTD_H
24#include <unistd.h>
25#endif
26#if HAVE_ERRNO_H
27#include <errno.h>
28#endif
29#include <fcntl.h>
30#include <time.h>
31#ifdef __linux__
32#include <sys/utsname.h>
33#endif
34#if HAVE_SYS_STAT_H
35#include <sys/stat.h>
36#endif
37#if HAVE_SYS_TYPES_H
38#include <sys/types.h>
39#endif
40#if HAVE_SYS_RESOURCE_H
41#include <sys/resource.h>
42#endif
43
44#include "ext2_fs.h"
45#include "ext2fs.h"
46
47/*
48 * For checking structure magic numbers...
49 */
50
51#define EXT2_CHECK_MAGIC(struct, code) \
52	  if ((struct)->magic != (code)) return (code)
53
/*
 * One entry of the block cache.
 *
 * The two flags are declared as unsigned bit-fields on purpose: a
 * signed one-bit bit-field has no room for the value 1 (it typically
 * holds only 0 and -1), so "flag = 1; ... flag == 1" would not be
 * reliable.
 */
struct unix_cache {
	char		*buf;		/* buffer of channel->block_size bytes */
	unsigned long	block;		/* number of the block held in buf */
	int		access_time;	/* stamp of the most recent use */
	unsigned int	dirty:1;	/* buf is newer than the device copy */
	unsigned int	in_use:1;	/* entry currently holds a block */
};
61
62#define CACHE_SIZE 8
63#define WRITE_DIRECT_SIZE 4	/* Must be smaller than CACHE_SIZE */
64#define READ_DIRECT_SIZE 4	/* Should be smaller than CACHE_SIZE */
65
66struct unix_private_data {
67	int	magic;
68	int	dev;
69	int	flags;
70	int	access_time;
71	struct unix_cache cache[CACHE_SIZE];
72};
73
74static errcode_t unix_open(const char *name, int flags, io_channel *channel);
75static errcode_t unix_close(io_channel channel);
76static errcode_t unix_set_blksize(io_channel channel, int blksize);
77static errcode_t unix_read_blk(io_channel channel, unsigned long block,
78			       int count, void *data);
79static errcode_t unix_write_blk(io_channel channel, unsigned long block,
80				int count, const void *data);
81static errcode_t unix_flush(io_channel channel);
82static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
83				int size, const void *data);
84
85static void reuse_cache(io_channel channel, struct unix_private_data *data,
86		 struct unix_cache *cache, unsigned long block);
87
88static struct struct_io_manager struct_unix_manager = {
89	EXT2_ET_MAGIC_IO_MANAGER,
90	"Unix I/O Manager",
91	unix_open,
92	unix_close,
93	unix_set_blksize,
94	unix_read_blk,
95	unix_write_blk,
96	unix_flush,
97#ifdef __CYGWIN__
98	0
99#else
100	unix_write_byte
101#endif
102};
103
104io_manager unix_io_manager = &struct_unix_manager;
105
106/*
107 * Here are the raw I/O functions
108 */
109#ifndef __CYGWIN__
110static errcode_t raw_read_blk(io_channel channel,
111			      struct unix_private_data *data,
112			      unsigned long block,
113			      int count, void *buf)
114{
115	errcode_t	retval;
116	ssize_t		size;
117	ext2_loff_t	location;
118	int		actual = 0;
119
120	size = (count < 0) ? -count : count * channel->block_size;
121	location = (ext2_loff_t) block * channel->block_size;
122	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
123		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
124		goto error_out;
125	}
126	actual = read(data->dev, buf, size);
127	if (actual != size) {
128		if (actual < 0)
129			actual = 0;
130		retval = EXT2_ET_SHORT_READ;
131		goto error_out;
132	}
133	return 0;
134
135error_out:
136	memset((char *) buf+actual, 0, size-actual);
137	if (channel->read_error)
138		retval = (channel->read_error)(channel, block, count, buf,
139					       size, actual, retval);
140	return retval;
141}
142#else /* __CYGWIN__ */
143/*
144 * Windows block devices only allow sector alignment IO in offset and size
145 */
146static errcode_t raw_read_blk(io_channel channel,
147			      struct unix_private_data *data,
148			      unsigned long block,
149			      int count, void *buf)
150{
151	errcode_t	retval;
152	size_t		size, alignsize, fragment;
153	ext2_loff_t	location;
154	int		total = 0, actual;
155#define BLOCKALIGN 512
156	char		sector[BLOCKALIGN];
157
158	size = (count < 0) ? -count : count * channel->block_size;
159	location = (ext2_loff_t) block * channel->block_size;
160#ifdef DEBUG
161	printf("count=%d, size=%d, block=%d, blk_size=%d, location=%lx\n",
162	 		count, size, block, channel->block_size, location);
163#endif
164	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
165		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
166		goto error_out;
167	}
168	fragment = size % BLOCKALIGN;
169	alignsize = size - fragment;
170	if (alignsize) {
171		actual = read(data->dev, buf, alignsize);
172		if (actual != alignsize)
173			goto short_read;
174	}
175	if (fragment) {
176		actual = read(data->dev, sector, BLOCKALIGN);
177		if (actual != BLOCKALIGN)
178			goto short_read;
179		memcpy(buf+alignsize, sector, fragment);
180	}
181	return 0;
182
183short_read:
184	if (actual>0)
185		total += actual;
186	retval = EXT2_ET_SHORT_READ;
187
188error_out:
189	memset((char *) buf+total, 0, size-actual);
190	if (channel->read_error)
191		retval = (channel->read_error)(channel, block, count, buf,
192					       size, actual, retval);
193	return retval;
194}
195#endif
196
197static errcode_t raw_write_blk(io_channel channel,
198			       struct unix_private_data *data,
199			       unsigned long block,
200			       int count, const void *buf)
201{
202	ssize_t		size;
203	ext2_loff_t	location;
204	int		actual = 0;
205	errcode_t	retval;
206
207	if (count == 1)
208		size = channel->block_size;
209	else {
210		if (count < 0)
211			size = -count;
212		else
213			size = count * channel->block_size;
214	}
215
216	location = (ext2_loff_t) block * channel->block_size;
217	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
218		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
219		goto error_out;
220	}
221
222	actual = write(data->dev, buf, size);
223	if (actual != size) {
224		retval = EXT2_ET_SHORT_WRITE;
225		goto error_out;
226	}
227	return 0;
228
229error_out:
230	if (channel->write_error)
231		retval = (channel->write_error)(channel, block, count, buf,
232						size, actual, retval);
233	return retval;
234}
235
236
237/*
238 * Here we implement the cache functions
239 */
240
241/* Allocate the cache buffers */
242static errcode_t alloc_cache(io_channel channel,
243			     struct unix_private_data *data)
244{
245	errcode_t		retval;
246	struct unix_cache	*cache;
247	int			i;
248
249	data->access_time = 0;
250	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
251		cache->block = 0;
252		cache->access_time = 0;
253		cache->dirty = 0;
254		cache->in_use = 0;
255		if ((retval = ext2fs_get_mem(channel->block_size,
256					     &cache->buf)))
257			return retval;
258	}
259	return 0;
260}
261
262/* Free the cache buffers */
263static void free_cache(struct unix_private_data *data)
264{
265	struct unix_cache	*cache;
266	int			i;
267
268	data->access_time = 0;
269	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
270		cache->block = 0;
271		cache->access_time = 0;
272		cache->dirty = 0;
273		cache->in_use = 0;
274		if (cache->buf)
275			ext2fs_free_mem(&cache->buf);
276		cache->buf = 0;
277	}
278}
279
280#ifndef NO_IO_CACHE
281/*
282 * Try to find a block in the cache.  If the block is not found, and
283 * eldest is a non-zero pointer, then fill in eldest with the cache
284 * entry to that should be reused.
285 */
286static struct unix_cache *find_cached_block(struct unix_private_data *data,
287					    unsigned long block,
288					    struct unix_cache **eldest)
289{
290	struct unix_cache	*cache, *unused_cache, *oldest_cache;
291	int			i;
292
293	unused_cache = oldest_cache = 0;
294	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
295		if (!cache->in_use) {
296			if (!unused_cache)
297				unused_cache = cache;
298			continue;
299		}
300		if (cache->block == block) {
301			cache->access_time = ++data->access_time;
302			return cache;
303		}
304		if (!oldest_cache ||
305		    (cache->access_time < oldest_cache->access_time))
306			oldest_cache = cache;
307	}
308	if (eldest)
309		*eldest = (unused_cache) ? unused_cache : oldest_cache;
310	return 0;
311}
312
313/*
314 * Reuse a particular cache entry for another block.
315 */
316static void reuse_cache(io_channel channel, struct unix_private_data *data,
317		 struct unix_cache *cache, unsigned long block)
318{
319	if (cache->dirty && cache->in_use)
320		raw_write_blk(channel, data, cache->block, 1, cache->buf);
321
322	cache->in_use = 1;
323	cache->dirty = 0;
324	cache->block = block;
325	cache->access_time = ++data->access_time;
326}
327
328/*
329 * Flush all of the blocks in the cache
330 */
331static errcode_t flush_cached_blocks(io_channel channel,
332				     struct unix_private_data *data,
333				     int invalidate)
334
335{
336	struct unix_cache	*cache;
337	errcode_t		retval, retval2;
338	int			i;
339
340	retval2 = 0;
341	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
342		if (!cache->in_use)
343			continue;
344
345		if (invalidate)
346			cache->in_use = 0;
347
348		if (!cache->dirty)
349			continue;
350
351		retval = raw_write_blk(channel, data,
352				       cache->block, 1, cache->buf);
353		if (retval)
354			retval2 = retval;
355		else
356			cache->dirty = 0;
357	}
358	return retval2;
359}
360#endif /* NO_IO_CACHE */
361
362static errcode_t unix_open(const char *name, int flags, io_channel *channel)
363{
364	io_channel	io = NULL;
365	struct unix_private_data *data = NULL;
366	errcode_t	retval;
367	int		open_flags;
368	struct stat	st;
369#ifdef __linux__
370	struct 		utsname ut;
371#endif
372
373	if (name == 0)
374		return EXT2_ET_BAD_DEVICE_NAME;
375	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
376	if (retval)
377		return retval;
378	memset(io, 0, sizeof(struct struct_io_channel));
379	io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
380	retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
381	if (retval)
382		goto cleanup;
383
384	io->manager = unix_io_manager;
385	retval = ext2fs_get_mem(strlen(name)+1, &io->name);
386	if (retval)
387		goto cleanup;
388
389	strcpy(io->name, name);
390	io->private_data = data;
391	io->block_size = 1024;
392	io->read_error = 0;
393	io->write_error = 0;
394	io->refcount = 1;
395
396	memset(data, 0, sizeof(struct unix_private_data));
397	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
398
399	if ((retval = alloc_cache(io, data)))
400		goto cleanup;
401
402	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
403#ifdef HAVE_OPEN64
404	data->dev = open64(name, open_flags);
405#else
406	data->dev = open(name, open_flags);
407#endif
408	if (data->dev < 0) {
409		retval = errno;
410		goto cleanup;
411	}
412
413#ifdef __linux__
414#undef RLIM_INFINITY
415#if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
416#define RLIM_INFINITY	((unsigned long)(~0UL>>1))
417#else
418#define RLIM_INFINITY  (~0UL)
419#endif
420	/*
421	 * Work around a bug in 2.4.10-2.4.18 kernels where writes to
422	 * block devices are wrongly getting hit by the filesize
423	 * limit.  This workaround isn't perfect, since it won't work
424	 * if glibc wasn't built against 2.2 header files.  (Sigh.)
425	 *
426	 */
427	if ((flags & IO_FLAG_RW) &&
428	    (uname(&ut) == 0) &&
429	    ((ut.release[0] == '2') && (ut.release[1] == '.') &&
430	     (ut.release[2] == '4') && (ut.release[3] == '.') &&
431	     (ut.release[4] == '1') && (ut.release[5] >= '0') &&
432	     (ut.release[5] < '8')) &&
433	    (fstat(data->dev, &st) == 0) &&
434	    (S_ISBLK(st.st_mode))) {
435		struct rlimit	rlim;
436
437		rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
438		setrlimit(RLIMIT_FSIZE, &rlim);
439		getrlimit(RLIMIT_FSIZE, &rlim);
440		if (((unsigned long) rlim.rlim_cur) <
441		    ((unsigned long) rlim.rlim_max)) {
442			rlim.rlim_cur = rlim.rlim_max;
443			setrlimit(RLIMIT_FSIZE, &rlim);
444		}
445	}
446#endif
447	*channel = io;
448	return 0;
449
450cleanup:
451	if (data) {
452		free_cache(data);
453		ext2fs_free_mem(&data);
454	}
455	if (io)
456		ext2fs_free_mem(&io);
457	return retval;
458}
459
460static errcode_t unix_close(io_channel channel)
461{
462	struct unix_private_data *data;
463	errcode_t	retval = 0;
464
465	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
466	data = (struct unix_private_data *) channel->private_data;
467	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
468
469	if (--channel->refcount > 0)
470		return 0;
471
472#ifndef NO_IO_CACHE
473	retval = flush_cached_blocks(channel, data, 0);
474#endif
475
476	if (close(data->dev) < 0)
477		retval = errno;
478	free_cache(data);
479
480	ext2fs_free_mem(&channel->private_data);
481	if (channel->name)
482		ext2fs_free_mem(&channel->name);
483	ext2fs_free_mem(&channel);
484	return retval;
485}
486
487static errcode_t unix_set_blksize(io_channel channel, int blksize)
488{
489	struct unix_private_data *data;
490	errcode_t		retval;
491
492	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
493	data = (struct unix_private_data *) channel->private_data;
494	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
495
496	if (channel->block_size != blksize) {
497#ifndef NO_IO_CACHE
498		if ((retval = flush_cached_blocks(channel, data, 0)))
499			return retval;
500#endif
501
502		channel->block_size = blksize;
503		free_cache(data);
504		if ((retval = alloc_cache(channel, data)))
505			return retval;
506	}
507	return 0;
508}
509
510
511static errcode_t unix_read_blk(io_channel channel, unsigned long block,
512			       int count, void *buf)
513{
514	struct unix_private_data *data;
515	struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
516	errcode_t	retval;
517	char		*cp;
518	int		i, j;
519
520	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
521	data = (struct unix_private_data *) channel->private_data;
522	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
523
524#ifdef NO_IO_CACHE
525	return raw_read_blk(channel, data, block, count, buf);
526#else
527	/*
528	 * If we're doing an odd-sized read or a very large read,
529	 * flush out the cache and then do a direct read.
530	 */
531	if (count < 0 || count > WRITE_DIRECT_SIZE) {
532		if ((retval = flush_cached_blocks(channel, data, 0)))
533			return retval;
534		return raw_read_blk(channel, data, block, count, buf);
535	}
536
537	cp = buf;
538	while (count > 0) {
539		/* If it's in the cache, use it! */
540		if ((cache = find_cached_block(data, block, &reuse[0]))) {
541#ifdef DEBUG
542			printf("Using cached block %d\n", block);
543#endif
544			memcpy(cp, cache->buf, channel->block_size);
545			count--;
546			block++;
547			cp += channel->block_size;
548			continue;
549		}
550		/*
551		 * Find the number of uncached blocks so we can do a
552		 * single read request
553		 */
554		for (i=1; i < count; i++)
555			if (find_cached_block(data, block+i, &reuse[i]))
556				break;
557#ifdef DEBUG
558		printf("Reading %d blocks starting at %d\n", i, block);
559#endif
560		if ((retval = raw_read_blk(channel, data, block, i, cp)))
561			return retval;
562
563		/* Save the results in the cache */
564		for (j=0; j < i; j++) {
565			count--;
566			cache = reuse[j];
567			reuse_cache(channel, data, cache, block++);
568			memcpy(cache->buf, cp, channel->block_size);
569			cp += channel->block_size;
570		}
571	}
572	return 0;
573#endif /* NO_IO_CACHE */
574}
575
576static errcode_t unix_write_blk(io_channel channel, unsigned long block,
577				int count, const void *buf)
578{
579	struct unix_private_data *data;
580	struct unix_cache *cache, *reuse;
581	errcode_t	retval = 0;
582	const char	*cp;
583	int		writethrough;
584
585	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
586	data = (struct unix_private_data *) channel->private_data;
587	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
588
589#ifdef NO_IO_CACHE
590	return raw_write_blk(channel, data, block, count, buf);
591#else
592	/*
593	 * If we're doing an odd-sized write or a very large write,
594	 * flush out the cache completely and then do a direct write.
595	 */
596	if (count < 0 || count > WRITE_DIRECT_SIZE) {
597		if ((retval = flush_cached_blocks(channel, data, 1)))
598			return retval;
599		return raw_write_blk(channel, data, block, count, buf);
600	}
601
602	/*
603	 * For a moderate-sized multi-block write, first force a write
604	 * if we're in write-through cache mode, and then fill the
605	 * cache with the blocks.
606	 */
607	writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
608	if (writethrough)
609		retval = raw_write_blk(channel, data, block, count, buf);
610
611	cp = buf;
612	while (count > 0) {
613		cache = find_cached_block(data, block, &reuse);
614		if (!cache) {
615			cache = reuse;
616			reuse_cache(channel, data, cache, block);
617		}
618		memcpy(cache->buf, cp, channel->block_size);
619		cache->dirty = !writethrough;
620		count--;
621		block++;
622		cp += channel->block_size;
623	}
624	return retval;
625#endif /* NO_IO_CACHE */
626}
627
628static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
629				 int size, const void *buf)
630{
631	struct unix_private_data *data;
632	errcode_t	retval = 0;
633	ssize_t		actual;
634
635	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
636	data = (struct unix_private_data *) channel->private_data;
637	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
638
639#ifndef NO_IO_CACHE
640	/*
641	 * Flush out the cache completely
642	 */
643	if ((retval = flush_cached_blocks(channel, data, 1)))
644		return retval;
645#endif
646
647	if (lseek(data->dev, offset, SEEK_SET) < 0)
648		return errno;
649
650	actual = write(data->dev, buf, size);
651	if (actual != size)
652		return EXT2_ET_SHORT_WRITE;
653
654	return 0;
655}
656
657/*
658 * Flush data buffers to disk.
659 */
660static errcode_t unix_flush(io_channel channel)
661{
662	struct unix_private_data *data;
663	errcode_t retval = 0;
664
665	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
666	data = (struct unix_private_data *) channel->private_data;
667	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
668
669#ifndef NO_IO_CACHE
670	retval = flush_cached_blocks(channel, data, 0);
671#endif
672	fsync(data->dev);
673	return retval;
674}
675
676