unix_io.c revision 0a05b90330d8b505469c72143964b954776be232
1/*
2 * unix_io.c --- This is the Unix (well, really POSIX) implementation
3 * 	of the I/O manager.
4 *
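 * Implements a small block cache (CACHE_SIZE entries).  Cached writes are
 * written back on flush or eviction, or immediately when the channel has
 * CHANNEL_FLAGS_WRITETHROUGH set.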
6 *
 * Includes support for Windows NT under Cygwin.
8 *
9 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10 * 	2002 by Theodore Ts'o.
11 *
12 * %Begin-Header%
13 * This file may be redistributed under the terms of the GNU Library
14 * General Public License, version 2.
15 * %End-Header%
16 */
17
18#define _LARGEFILE_SOURCE
19#define _LARGEFILE64_SOURCE
20#ifndef _GNU_SOURCE
21#define _GNU_SOURCE
22#endif
23
24#include "config.h"
25#include <stdio.h>
26#include <string.h>
27#if HAVE_UNISTD_H
28#include <unistd.h>
29#endif
30#if HAVE_ERRNO_H
31#include <errno.h>
32#endif
33#include <fcntl.h>
34#include <time.h>
35#ifdef __linux__
36#include <sys/utsname.h>
37#endif
38#ifdef HAVE_SYS_IOCTL_H
39#include <sys/ioctl.h>
40#endif
41#ifdef HAVE_SYS_MOUNT_H
42#include <sys/mount.h>
43#endif
44#if HAVE_SYS_STAT_H
45#include <sys/stat.h>
46#endif
47#if HAVE_SYS_TYPES_H
48#include <sys/types.h>
49#endif
50#if HAVE_SYS_RESOURCE_H
51#include <sys/resource.h>
52#endif
53#if HAVE_LINUX_FALLOC_H
54#include <linux/falloc.h>
55#endif
56
57#if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
58#define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
59#endif
60
61#if defined(__linux__) && defined(_IO) && !defined(BLKSSZGET)
62#define BLKSSZGET  _IO(0x12,104)/* get block device sector size */
63#endif
64
65#undef ALIGN_DEBUG
66
67#include "ext2_fs.h"
68#include "ext2fs.h"
69
/*
 * Magic-number guard: makes the *enclosing function* return `code' when
 * the `magic' field of the pointed-to structure does not equal `code'.
 *
 * The expansion is a bare `if' statement, so only use it as a complete
 * statement (never as the body of an unbraced if/else).
 *
 * NOTE(review): the token sequence is deliberately left unchanged in case
 * a project header defines the same macro -- confirm before altering it.
 */
#define EXT2_CHECK_MAGIC(struct, code) \
	if ((struct)->magic != (code)) \
		return (code)
76
/*
 * One slot of the block cache.
 *
 * `block' is 64 bits wide because every function that fills or looks up a
 * slot is handed an `unsigned long long' block number; a narrower field
 * would truncate large block numbers where `long' is 32 bits and make
 * distinct blocks alias each other in the cache.
 */
struct unix_cache {
	char			*buf;		/* one block of data */
	unsigned long long	block;		/* block number held in buf */
	int			access_time;	/* stamp of the last hit/reuse */
	unsigned		dirty:1;	/* buf not yet written to disk */
	unsigned		in_use:1;	/* slot currently holds a block */
};
84
/*
 * Cache geometry.
 *
 * CACHE_SIZE        - number of slots in unix_private_data.cache[].
 * WRITE_DIRECT_SIZE - requests of more than this many blocks bypass the
 *                     cache and go straight to the device.
 * READ_DIRECT_SIZE  - size of the per-read array of reusable slots.
 */
#define CACHE_SIZE 8
#define WRITE_DIRECT_SIZE 4	/* Must be smaller than CACHE_SIZE */
#define READ_DIRECT_SIZE 4	/* Should be smaller than CACHE_SIZE */

/* Turn the "must be smaller" requirement into a build-time check. */
#if WRITE_DIRECT_SIZE >= CACHE_SIZE
#error "WRITE_DIRECT_SIZE must be smaller than CACHE_SIZE"
#endif
88
/*
 * Per-channel private state of the Unix I/O manager; stored in the
 * channel's private_data pointer.
 */
struct unix_private_data {
	int	magic;		/* EXT2_ET_MAGIC_UNIX_IO_CHANNEL, checked on entry */
	int	dev;		/* file descriptor returned by ext2fs_open_file() */
	int	flags;		/* IO_FLAG_* value passed to unix_open() */
	int	align;		/* NOTE(review): not referenced in the code visible
				 * here (channel->align is used instead) -- confirm */
	int	access_time;	/* counter bumped on every cache hit or slot reuse */
	ext2_loff_t offset;	/* byte offset added to every block location;
				 * set through the "offset" option */
	struct unix_cache cache[CACHE_SIZE];	/* the block cache itself */
	void	*bounce;	/* aligned scratch block, allocated only when the
				 * channel has a non-zero alignment requirement */
	struct struct_io_stats io_stats;	/* bytes_read / bytes_written counters */
};
100
/*
 * Non-zero when `n' (an integer or a pointer) is a multiple of `align'.
 * The test masks with (align - 1), so `align' has to be a power of two.
 *
 * `n' is parenthesized before the cast so that expression arguments such
 * as `ptr + 1' are evaluated as a whole instead of casting only `ptr'.
 */
#define IS_ALIGNED(n, align) ((((unsigned long) (n)) & \
			       ((unsigned long) ((align)-1))) == 0)
103
104static errcode_t unix_open(const char *name, int flags, io_channel *channel);
105static errcode_t unix_close(io_channel channel);
106static errcode_t unix_set_blksize(io_channel channel, int blksize);
107static errcode_t unix_read_blk(io_channel channel, unsigned long block,
108			       int count, void *data);
109static errcode_t unix_write_blk(io_channel channel, unsigned long block,
110				int count, const void *data);
111static errcode_t unix_flush(io_channel channel);
112static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
113				int size, const void *data);
114static errcode_t unix_set_option(io_channel channel, const char *option,
115				 const char *arg);
116static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
117;
118static void reuse_cache(io_channel channel, struct unix_private_data *data,
119		 struct unix_cache *cache, unsigned long long block);
120static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
121			       int count, void *data);
122static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
123				int count, const void *data);
124static errcode_t unix_discard(io_channel channel, unsigned long long block,
125			      unsigned long long count);
126
/*
 * Dispatch table for the Unix I/O manager.
 *
 * The initializer is positional: the entries must stay in the order in
 * which struct struct_io_manager (declared outside this file) lists its
 * members.
 */
static struct struct_io_manager struct_unix_manager = {
	EXT2_ET_MAGIC_IO_MANAGER,
	"Unix I/O Manager",
	unix_open,
	unix_close,
	unix_set_blksize,
	unix_read_blk,
	unix_write_blk,
	unix_flush,
	unix_write_byte,
	unix_set_option,
	unix_get_stats,
	unix_read_blk64,
	unix_write_blk64,
	unix_discard,
};

/* The only non-static symbol here: the handle that refers to this manager. */
io_manager unix_io_manager = &struct_unix_manager;
145
146static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
147{
148	errcode_t 	retval = 0;
149
150	struct unix_private_data *data;
151
152	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
153	data = (struct unix_private_data *) channel->private_data;
154	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
155
156	if (stats)
157		*stats = &data->io_stats;
158
159	return retval;
160}
161
162/*
163 * Here are the raw I/O functions
164 */
165static errcode_t raw_read_blk(io_channel channel,
166			      struct unix_private_data *data,
167			      unsigned long long block,
168			      int count, void *bufv)
169{
170	errcode_t	retval;
171	ssize_t		size;
172	ext2_loff_t	location;
173	int		actual = 0;
174	unsigned char	*buf = bufv;
175
176	size = (count < 0) ? -count : count * channel->block_size;
177	data->io_stats.bytes_read += size;
178	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
179	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
180		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
181		goto error_out;
182	}
183	if ((channel->align == 0) ||
184	    (IS_ALIGNED(buf, channel->align) &&
185	     IS_ALIGNED(size, channel->align))) {
186		actual = read(data->dev, buf, size);
187		if (actual != size) {
188		short_read:
189			if (actual < 0)
190				actual = 0;
191			retval = EXT2_ET_SHORT_READ;
192			goto error_out;
193		}
194		return 0;
195	}
196
197#ifdef ALIGN_DEBUG
198	printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
199	       (unsigned long) size);
200#endif
201
202	/*
203	 * The buffer or size which we're trying to read isn't aligned
204	 * to the O_DIRECT rules, so we need to do this the hard way...
205	 */
206	while (size > 0) {
207		actual = read(data->dev, data->bounce, channel->block_size);
208		if (actual != channel->block_size)
209			goto short_read;
210		actual = size;
211		if (size > channel->block_size)
212			actual = channel->block_size;
213		memcpy(buf, data->bounce, actual);
214		size -= actual;
215		buf += actual;
216	}
217	return 0;
218
219error_out:
220	memset((char *) buf+actual, 0, size-actual);
221	if (channel->read_error)
222		retval = (channel->read_error)(channel, block, count, buf,
223					       size, actual, retval);
224	return retval;
225}
226
227static errcode_t raw_write_blk(io_channel channel,
228			       struct unix_private_data *data,
229			       unsigned long long block,
230			       int count, const void *bufv)
231{
232	ssize_t		size;
233	ext2_loff_t	location;
234	int		actual = 0;
235	errcode_t	retval;
236	const unsigned char *buf = bufv;
237
238	if (count == 1)
239		size = channel->block_size;
240	else {
241		if (count < 0)
242			size = -count;
243		else
244			size = count * channel->block_size;
245	}
246	data->io_stats.bytes_written += size;
247
248	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
249	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
250		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
251		goto error_out;
252	}
253
254	if ((channel->align == 0) ||
255	    (IS_ALIGNED(buf, channel->align) &&
256	     IS_ALIGNED(size, channel->align))) {
257		actual = write(data->dev, buf, size);
258		if (actual != size) {
259		short_write:
260			retval = EXT2_ET_SHORT_WRITE;
261			goto error_out;
262		}
263		return 0;
264	}
265
266#ifdef ALIGN_DEBUG
267	printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
268	       (unsigned long) size);
269#endif
270	/*
271	 * The buffer or size which we're trying to write isn't aligned
272	 * to the O_DIRECT rules, so we need to do this the hard way...
273	 */
274	while (size > 0) {
275		if (size < channel->block_size) {
276			actual = read(data->dev, data->bounce,
277				      channel->block_size);
278			if (actual != channel->block_size) {
279				retval = EXT2_ET_SHORT_READ;
280				goto error_out;
281			}
282		}
283		actual = size;
284		if (size > channel->block_size)
285			actual = channel->block_size;
286		memcpy(data->bounce, buf, actual);
287		actual = write(data->dev, data->bounce, channel->block_size);
288		if (actual != channel->block_size)
289			goto short_write;
290		size -= actual;
291		buf += actual;
292	}
293	return 0;
294
295error_out:
296	if (channel->write_error)
297		retval = (channel->write_error)(channel, block, count, buf,
298						size, actual, retval);
299	return retval;
300}
301
302
303/*
304 * Here we implement the cache functions
305 */
306
307/* Allocate the cache buffers */
308static errcode_t alloc_cache(io_channel channel,
309			     struct unix_private_data *data)
310{
311	errcode_t		retval;
312	struct unix_cache	*cache;
313	int			i;
314
315	data->access_time = 0;
316	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
317		cache->block = 0;
318		cache->access_time = 0;
319		cache->dirty = 0;
320		cache->in_use = 0;
321		if (cache->buf)
322			ext2fs_free_mem(&cache->buf);
323		retval = ext2fs_get_memalign(channel->block_size,
324					     channel->align, &cache->buf);
325		if (retval)
326			return retval;
327	}
328	if (channel->align) {
329		if (data->bounce)
330			ext2fs_free_mem(&data->bounce);
331		retval = ext2fs_get_memalign(channel->block_size,
332					     channel->align,
333					     &data->bounce);
334	}
335	return retval;
336}
337
338/* Free the cache buffers */
339static void free_cache(struct unix_private_data *data)
340{
341	struct unix_cache	*cache;
342	int			i;
343
344	data->access_time = 0;
345	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
346		cache->block = 0;
347		cache->access_time = 0;
348		cache->dirty = 0;
349		cache->in_use = 0;
350		if (cache->buf)
351			ext2fs_free_mem(&cache->buf);
352	}
353	if (data->bounce)
354		ext2fs_free_mem(&data->bounce);
355}
356
357#ifndef NO_IO_CACHE
358/*
359 * Try to find a block in the cache.  If the block is not found, and
360 * eldest is a non-zero pointer, then fill in eldest with the cache
361 * entry to that should be reused.
362 */
363static struct unix_cache *find_cached_block(struct unix_private_data *data,
364					    unsigned long long block,
365					    struct unix_cache **eldest)
366{
367	struct unix_cache	*cache, *unused_cache, *oldest_cache;
368	int			i;
369
370	unused_cache = oldest_cache = 0;
371	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
372		if (!cache->in_use) {
373			if (!unused_cache)
374				unused_cache = cache;
375			continue;
376		}
377		if (cache->block == block) {
378			cache->access_time = ++data->access_time;
379			return cache;
380		}
381		if (!oldest_cache ||
382		    (cache->access_time < oldest_cache->access_time))
383			oldest_cache = cache;
384	}
385	if (eldest)
386		*eldest = (unused_cache) ? unused_cache : oldest_cache;
387	return 0;
388}
389
390/*
391 * Reuse a particular cache entry for another block.
392 */
393static void reuse_cache(io_channel channel, struct unix_private_data *data,
394		 struct unix_cache *cache, unsigned long long block)
395{
396	if (cache->dirty && cache->in_use)
397		raw_write_blk(channel, data, cache->block, 1, cache->buf);
398
399	cache->in_use = 1;
400	cache->dirty = 0;
401	cache->block = block;
402	cache->access_time = ++data->access_time;
403}
404
405/*
406 * Flush all of the blocks in the cache
407 */
408static errcode_t flush_cached_blocks(io_channel channel,
409				     struct unix_private_data *data,
410				     int invalidate)
411
412{
413	struct unix_cache	*cache;
414	errcode_t		retval, retval2;
415	int			i;
416
417	retval2 = 0;
418	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
419		if (!cache->in_use)
420			continue;
421
422		if (invalidate)
423			cache->in_use = 0;
424
425		if (!cache->dirty)
426			continue;
427
428		retval = raw_write_blk(channel, data,
429				       cache->block, 1, cache->buf);
430		if (retval)
431			retval2 = retval;
432		else
433			cache->dirty = 0;
434	}
435	return retval2;
436}
437#endif /* NO_IO_CACHE */
438
439#ifdef __linux__
440#ifndef BLKDISCARDZEROES
441#define BLKDISCARDZEROES _IO(0x12,124)
442#endif
443#endif
444
445static errcode_t unix_open(const char *name, int flags, io_channel *channel)
446{
447	io_channel	io = NULL;
448	struct unix_private_data *data = NULL;
449	errcode_t	retval;
450	int		open_flags, zeroes = 0;
451	int		f_nocache = 0;
452	ext2fs_struct_stat st;
453#ifdef __linux__
454	struct 		utsname ut;
455#endif
456
457	if (name == 0)
458		return EXT2_ET_BAD_DEVICE_NAME;
459	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
460	if (retval)
461		goto cleanup;
462	memset(io, 0, sizeof(struct struct_io_channel));
463	io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
464	retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
465	if (retval)
466		goto cleanup;
467
468	io->manager = unix_io_manager;
469	retval = ext2fs_get_mem(strlen(name)+1, &io->name);
470	if (retval)
471		goto cleanup;
472
473	strcpy(io->name, name);
474	io->private_data = data;
475	io->block_size = 1024;
476	io->read_error = 0;
477	io->write_error = 0;
478	io->refcount = 1;
479
480	memset(data, 0, sizeof(struct unix_private_data));
481	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
482	data->io_stats.num_fields = 2;
483
484	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
485	if (flags & IO_FLAG_EXCLUSIVE)
486		open_flags |= O_EXCL;
487#if defined(O_DIRECT)
488	if (flags & IO_FLAG_DIRECT_IO)
489		open_flags |= O_DIRECT;
490#elif defined(F_NOCACHE)
491	if (flags & IO_FLAG_DIRECT_IO)
492		f_nocache = F_NOCACHE;
493#endif
494	data->flags = flags;
495
496	data->dev = ext2fs_open_file(io->name, open_flags, 0);
497	if (data->dev < 0) {
498		retval = errno;
499		goto cleanup;
500	}
501	if (f_nocache) {
502		if (fcntl(data->dev, f_nocache, 1) < 0) {
503			retval = errno;
504			goto cleanup;
505		}
506	}
507
508	/*
509	 * If the device is really a block device, then set the
510	 * appropriate flag, otherwise we can set DISCARD_ZEROES flag
511	 * because we are going to use punch hole instead of discard
512	 * and if it succeed, subsequent read from sparse area returns
513	 * zero.
514	 */
515	if (ext2fs_stat(io->name, &st) == 0) {
516		if (S_ISBLK(st.st_mode))
517			io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
518		else
519			io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
520	}
521
522#ifdef BLKSSZGET
523	if (flags & IO_FLAG_DIRECT_IO) {
524		if (ioctl(data->dev, BLKSSZGET, &io->align) != 0)
525			io->align = io->block_size;
526	}
527#endif
528
529#ifdef BLKDISCARDZEROES
530	ioctl(data->dev, BLKDISCARDZEROES, &zeroes);
531	if (zeroes)
532		io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
533#endif
534
535#if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
536	/*
537	 * Some operating systems require that the buffers be aligned,
538	 * regardless of O_DIRECT
539	 */
540	io->align = 512;
541#endif
542
543
544	if ((retval = alloc_cache(io, data)))
545		goto cleanup;
546
547#ifdef BLKROGET
548	if (flags & IO_FLAG_RW) {
549		int error;
550		int readonly = 0;
551
552		/* Is the block device actually writable? */
553		error = ioctl(data->dev, BLKROGET, &readonly);
554		if (!error && readonly) {
555			close(data->dev);
556			retval = EPERM;
557			goto cleanup;
558		}
559	}
560#endif
561
562#ifdef __linux__
563#undef RLIM_INFINITY
564#if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
565#define RLIM_INFINITY	((unsigned long)(~0UL>>1))
566#else
567#define RLIM_INFINITY  (~0UL)
568#endif
569	/*
570	 * Work around a bug in 2.4.10-2.4.18 kernels where writes to
571	 * block devices are wrongly getting hit by the filesize
572	 * limit.  This workaround isn't perfect, since it won't work
573	 * if glibc wasn't built against 2.2 header files.  (Sigh.)
574	 *
575	 */
576	if ((flags & IO_FLAG_RW) &&
577	    (uname(&ut) == 0) &&
578	    ((ut.release[0] == '2') && (ut.release[1] == '.') &&
579	     (ut.release[2] == '4') && (ut.release[3] == '.') &&
580	     (ut.release[4] == '1') && (ut.release[5] >= '0') &&
581	     (ut.release[5] < '8')) &&
582	    (ext2fs_stat(io->name, &st) == 0) &&
583	    (S_ISBLK(st.st_mode))) {
584		struct rlimit	rlim;
585
586		rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
587		setrlimit(RLIMIT_FSIZE, &rlim);
588		getrlimit(RLIMIT_FSIZE, &rlim);
589		if (((unsigned long) rlim.rlim_cur) <
590		    ((unsigned long) rlim.rlim_max)) {
591			rlim.rlim_cur = rlim.rlim_max;
592			setrlimit(RLIMIT_FSIZE, &rlim);
593		}
594	}
595#endif
596	*channel = io;
597	return 0;
598
599cleanup:
600	if (data) {
601		free_cache(data);
602		ext2fs_free_mem(&data);
603	}
604	if (io)
605		ext2fs_free_mem(&io);
606	return retval;
607}
608
609static errcode_t unix_close(io_channel channel)
610{
611	struct unix_private_data *data;
612	errcode_t	retval = 0;
613
614	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
615	data = (struct unix_private_data *) channel->private_data;
616	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
617
618	if (--channel->refcount > 0)
619		return 0;
620
621#ifndef NO_IO_CACHE
622	retval = flush_cached_blocks(channel, data, 0);
623#endif
624
625	if (close(data->dev) < 0)
626		retval = errno;
627	free_cache(data);
628
629	ext2fs_free_mem(&channel->private_data);
630	if (channel->name)
631		ext2fs_free_mem(&channel->name);
632	ext2fs_free_mem(&channel);
633	return retval;
634}
635
636static errcode_t unix_set_blksize(io_channel channel, int blksize)
637{
638	struct unix_private_data *data;
639	errcode_t		retval;
640
641	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
642	data = (struct unix_private_data *) channel->private_data;
643	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
644
645	if (channel->block_size != blksize) {
646#ifndef NO_IO_CACHE
647		if ((retval = flush_cached_blocks(channel, data, 0)))
648			return retval;
649#endif
650
651		channel->block_size = blksize;
652		free_cache(data);
653		if ((retval = alloc_cache(channel, data)))
654			return retval;
655	}
656	return 0;
657}
658
659
660static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
661			       int count, void *buf)
662{
663	struct unix_private_data *data;
664	struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
665	errcode_t	retval;
666	char		*cp;
667	int		i, j;
668
669	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
670	data = (struct unix_private_data *) channel->private_data;
671	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
672
673#ifdef NO_IO_CACHE
674	return raw_read_blk(channel, data, block, count, buf);
675#else
676	/*
677	 * If we're doing an odd-sized read or a very large read,
678	 * flush out the cache and then do a direct read.
679	 */
680	if (count < 0 || count > WRITE_DIRECT_SIZE) {
681		if ((retval = flush_cached_blocks(channel, data, 0)))
682			return retval;
683		return raw_read_blk(channel, data, block, count, buf);
684	}
685
686	cp = buf;
687	while (count > 0) {
688		/* If it's in the cache, use it! */
689		if ((cache = find_cached_block(data, block, &reuse[0]))) {
690#ifdef DEBUG
691			printf("Using cached block %lu\n", block);
692#endif
693			memcpy(cp, cache->buf, channel->block_size);
694			count--;
695			block++;
696			cp += channel->block_size;
697			continue;
698		}
699		if (count == 1) {
700			/*
701			 * Special case where we read directly into the
702			 * cache buffer; important in the O_DIRECT case
703			 */
704			cache = reuse[0];
705			reuse_cache(channel, data, cache, block);
706			if ((retval = raw_read_blk(channel, data, block, 1,
707						   cache->buf))) {
708				cache->in_use = 0;
709				return retval;
710			}
711			memcpy(cp, cache->buf, channel->block_size);
712			return 0;
713		}
714
715		/*
716		 * Find the number of uncached blocks so we can do a
717		 * single read request
718		 */
719		for (i=1; i < count; i++)
720			if (find_cached_block(data, block+i, &reuse[i]))
721				break;
722#ifdef DEBUG
723		printf("Reading %d blocks starting at %lu\n", i, block);
724#endif
725		if ((retval = raw_read_blk(channel, data, block, i, cp)))
726			return retval;
727
728		/* Save the results in the cache */
729		for (j=0; j < i; j++) {
730			count--;
731			cache = reuse[j];
732			reuse_cache(channel, data, cache, block++);
733			memcpy(cache->buf, cp, channel->block_size);
734			cp += channel->block_size;
735		}
736	}
737	return 0;
738#endif /* NO_IO_CACHE */
739}
740
741static errcode_t unix_read_blk(io_channel channel, unsigned long block,
742			       int count, void *buf)
743{
744	return unix_read_blk64(channel, block, count, buf);
745}
746
747static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
748				int count, const void *buf)
749{
750	struct unix_private_data *data;
751	struct unix_cache *cache, *reuse;
752	errcode_t	retval = 0;
753	const char	*cp;
754	int		writethrough;
755
756	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
757	data = (struct unix_private_data *) channel->private_data;
758	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
759
760#ifdef NO_IO_CACHE
761	return raw_write_blk(channel, data, block, count, buf);
762#else
763	/*
764	 * If we're doing an odd-sized write or a very large write,
765	 * flush out the cache completely and then do a direct write.
766	 */
767	if (count < 0 || count > WRITE_DIRECT_SIZE) {
768		if ((retval = flush_cached_blocks(channel, data, 1)))
769			return retval;
770		return raw_write_blk(channel, data, block, count, buf);
771	}
772
773	/*
774	 * For a moderate-sized multi-block write, first force a write
775	 * if we're in write-through cache mode, and then fill the
776	 * cache with the blocks.
777	 */
778	writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
779	if (writethrough)
780		retval = raw_write_blk(channel, data, block, count, buf);
781
782	cp = buf;
783	while (count > 0) {
784		cache = find_cached_block(data, block, &reuse);
785		if (!cache) {
786			cache = reuse;
787			reuse_cache(channel, data, cache, block);
788		}
789		memcpy(cache->buf, cp, channel->block_size);
790		cache->dirty = !writethrough;
791		count--;
792		block++;
793		cp += channel->block_size;
794	}
795	return retval;
796#endif /* NO_IO_CACHE */
797}
798
799static errcode_t unix_write_blk(io_channel channel, unsigned long block,
800				int count, const void *buf)
801{
802	return unix_write_blk64(channel, block, count, buf);
803}
804
805static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
806				 int size, const void *buf)
807{
808	struct unix_private_data *data;
809	errcode_t	retval = 0;
810	ssize_t		actual;
811
812	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
813	data = (struct unix_private_data *) channel->private_data;
814	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
815
816	if (channel->align != 0) {
817#ifdef ALIGN_DEBUG
818		printf("unix_write_byte: O_DIRECT fallback\n");
819#endif
820		return EXT2_ET_UNIMPLEMENTED;
821	}
822
823#ifndef NO_IO_CACHE
824	/*
825	 * Flush out the cache completely
826	 */
827	if ((retval = flush_cached_blocks(channel, data, 1)))
828		return retval;
829#endif
830
831	if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
832		return errno;
833
834	actual = write(data->dev, buf, size);
835	if (actual != size)
836		return EXT2_ET_SHORT_WRITE;
837
838	return 0;
839}
840
841/*
842 * Flush data buffers to disk.
843 */
844static errcode_t unix_flush(io_channel channel)
845{
846	struct unix_private_data *data;
847	errcode_t retval = 0;
848
849	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
850	data = (struct unix_private_data *) channel->private_data;
851	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
852
853#ifndef NO_IO_CACHE
854	retval = flush_cached_blocks(channel, data, 0);
855#endif
856	fsync(data->dev);
857	return retval;
858}
859
860static errcode_t unix_set_option(io_channel channel, const char *option,
861				 const char *arg)
862{
863	struct unix_private_data *data;
864	unsigned long long tmp;
865	char *end;
866
867	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
868	data = (struct unix_private_data *) channel->private_data;
869	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
870
871	if (!strcmp(option, "offset")) {
872		if (!arg)
873			return EXT2_ET_INVALID_ARGUMENT;
874
875		tmp = strtoull(arg, &end, 0);
876		if (*end)
877			return EXT2_ET_INVALID_ARGUMENT;
878		data->offset = tmp;
879		if (data->offset < 0)
880			return EXT2_ET_INVALID_ARGUMENT;
881		return 0;
882	}
883	return EXT2_ET_INVALID_ARGUMENT;
884}
885
886#if defined(__linux__) && !defined(BLKDISCARD)
887#define BLKDISCARD		_IO(0x12,119)
888#endif
889
890static errcode_t unix_discard(io_channel channel, unsigned long long block,
891			      unsigned long long count)
892{
893	struct unix_private_data *data;
894	__uint64_t	range[2];
895	int		ret;
896
897	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
898	data = (struct unix_private_data *) channel->private_data;
899	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
900
901	if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
902#ifdef BLKDISCARD
903		range[0] = (__uint64_t)(block) * channel->block_size;
904		range[1] = (__uint64_t)(count) * channel->block_size;
905
906		ret = ioctl(data->dev, BLKDISCARD, &range);
907#else
908		goto unimplemented;
909#endif
910	} else {
911#if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
912		/*
913		 * If we are not on block device, try to use punch hole
914		 * to reclaim free space.
915		 */
916		ret = fallocate(data->dev,
917				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
918				(off_t)(block) * channel->block_size,
919				(off_t)(count) * channel->block_size);
920#else
921		goto unimplemented;
922#endif
923	}
924	if (ret < 0) {
925		if (errno == EOPNOTSUPP)
926			goto unimplemented;
927		return errno;
928	}
929	return 0;
930unimplemented:
931	return EXT2_ET_UNIMPLEMENTED;
932}
933