unix_io.c revision 18a1444b4f1e6a0948fd38fa0de382d86cfe04de
1/*
2 * unix_io.c --- This is the Unix (well, really POSIX) implementation
3 * 	of the I/O manager.
4 *
5 * Implements a small multi-block cache (write-through on request).
6 *
7 * Includes support for Windows NT under Cygwin.
8 *
9 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10 * 	2002 by Theodore Ts'o.
11 *
12 * %Begin-Header%
13 * This file may be redistributed under the terms of the GNU Library
14 * General Public License, version 2.
15 * %End-Header%
16 */
17
18#define _LARGEFILE_SOURCE
19#define _LARGEFILE64_SOURCE
20#ifndef _GNU_SOURCE
21#define _GNU_SOURCE
22#endif
23
24#include <stdio.h>
25#include <string.h>
26#if HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#if HAVE_ERRNO_H
30#include <errno.h>
31#endif
32#include <fcntl.h>
33#include <time.h>
34#ifdef __linux__
35#include <sys/utsname.h>
36#endif
37#ifdef HAVE_SYS_IOCTL_H
38#include <sys/ioctl.h>
39#endif
40#ifdef HAVE_SYS_MOUNT_H
41#include <sys/mount.h>
42#endif
43#if HAVE_SYS_STAT_H
44#include <sys/stat.h>
45#endif
46#if HAVE_SYS_TYPES_H
47#include <sys/types.h>
48#endif
49#if HAVE_SYS_RESOURCE_H
50#include <sys/resource.h>
51#endif
52#if HAVE_LINUX_FALLOC_H
53#include <linux/falloc.h>
54#endif
55
/*
 * Fallback definition of the "get read-only status" ioctl, used only on
 * Linux when _IO() is available and the headers included above did not
 * already define BLKROGET.
 */
#if !defined(BLKROGET) && defined(__linux__) && defined(_IO)
#define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
#endif
59
60#undef ALIGN_DEBUG
61
62#include "ext2_fs.h"
63#include "ext2fs.h"
64
65/*
66 * For checking structure magic numbers...
67 */
68
/*
 * EXT2_CHECK_MAGIC(struct, code): makes the *enclosing function* return
 * `code` when (struct)->magic is not equal to `code`; otherwise falls
 * through.
 *
 * NOTE(review): the expansion is a bare `if` with no do { } while (0)
 * wrapper, so using it as the body of an if/else would capture the
 * `else`.  Every use visible in this file is a standalone statement.
 * ext2fs.h may carry an identical definition -- confirm before changing
 * the token sequence here.
 */
69#define EXT2_CHECK_MAGIC(struct, code) \
70	  if ((struct)->magic != (code)) return (code)
71
/*
 * One slot of the per-channel block cache.
 */
struct unix_cache {
	char			*buf;		/* data buffer, allocated by alloc_cache() */
	unsigned long long	block;		/* block number currently held in buf */
	int			access_time;	/* stamp taken from the channel counter;
						 * a smaller value means less recently used */
	unsigned		dirty:1;	/* buf holds data not yet written out */
	unsigned		in_use:1;	/* slot holds a valid block */
};
79
/* Number of slots in each channel's block cache. */
#define CACHE_SIZE 8

/*
 * Writes of more than this many blocks bypass the cache.
 * Must be smaller than CACHE_SIZE.
 */
#define WRITE_DIRECT_SIZE 4

/*
 * Sizes the victim-slot scratch array in unix_read_blk64().
 * Should be smaller than CACHE_SIZE.
 */
#define READ_DIRECT_SIZE 4
83
84struct unix_private_data {
85	int	magic;
86	int	dev;
87	int	flags;
88	int	align;
89	int	access_time;
90	ext2_loff_t offset;
91	struct unix_cache cache[CACHE_SIZE];
92	void	*bounce;
93	struct struct_io_stats io_stats;
94};
95
/*
 * True when `n` (a pointer or an integer) is a multiple of `align`.
 * `align` must be a power of two.  Both arguments are fully
 * parenthesized so that expression arguments such as `ptr + 1` are
 * evaluated before the cast is applied.
 */
#define IS_ALIGNED(n, align) ((((unsigned long) (n)) & \
			       ((unsigned long) ((align)-1))) == 0)
98
99static errcode_t unix_open(const char *name, int flags, io_channel *channel);
100static errcode_t unix_close(io_channel channel);
101static errcode_t unix_set_blksize(io_channel channel, int blksize);
102static errcode_t unix_read_blk(io_channel channel, unsigned long block,
103			       int count, void *data);
104static errcode_t unix_write_blk(io_channel channel, unsigned long block,
105				int count, const void *data);
106static errcode_t unix_flush(io_channel channel);
107static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
108				int size, const void *data);
109static errcode_t unix_set_option(io_channel channel, const char *option,
110				 const char *arg);
111static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
112;
113static void reuse_cache(io_channel channel, struct unix_private_data *data,
114		 struct unix_cache *cache, unsigned long long block);
115static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
116			       int count, void *data);
117static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
118				int count, const void *data);
119static errcode_t unix_discard(io_channel channel, unsigned long long block,
120			      unsigned long long count);
121
122static struct struct_io_manager struct_unix_manager = {
123	EXT2_ET_MAGIC_IO_MANAGER,
124	"Unix I/O Manager",
125	unix_open,
126	unix_close,
127	unix_set_blksize,
128	unix_read_blk,
129	unix_write_blk,
130	unix_flush,
131	unix_write_byte,
132	unix_set_option,
133	unix_get_stats,
134	unix_read_blk64,
135	unix_write_blk64,
136	unix_discard,
137};
138
139io_manager unix_io_manager = &struct_unix_manager;
140
141static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
142{
143	errcode_t 	retval = 0;
144
145	struct unix_private_data *data;
146
147	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
148	data = (struct unix_private_data *) channel->private_data;
149	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
150
151	if (stats)
152		*stats = &data->io_stats;
153
154	return retval;
155}
156
157/*
158 * Here are the raw I/O functions
159 */
160static errcode_t raw_read_blk(io_channel channel,
161			      struct unix_private_data *data,
162			      unsigned long long block,
163			      int count, void *bufv)
164{
165	errcode_t	retval;
166	ssize_t		size;
167	ext2_loff_t	location;
168	int		actual = 0;
169	unsigned char	*buf = bufv;
170
171	size = (count < 0) ? -count : count * channel->block_size;
172	data->io_stats.bytes_read += size;
173	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
174	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
175		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
176		goto error_out;
177	}
178	if ((channel->align == 0) ||
179	    (IS_ALIGNED(buf, channel->align) &&
180	     IS_ALIGNED(size, channel->align))) {
181		actual = read(data->dev, buf, size);
182		if (actual != size) {
183		short_read:
184			if (actual < 0)
185				actual = 0;
186			retval = EXT2_ET_SHORT_READ;
187			goto error_out;
188		}
189		return 0;
190	}
191
192#ifdef ALIGN_DEBUG
193	printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
194	       (unsigned long) size);
195#endif
196
197	/*
198	 * The buffer or size which we're trying to read isn't aligned
199	 * to the O_DIRECT rules, so we need to do this the hard way...
200	 */
201	while (size > 0) {
202		actual = read(data->dev, data->bounce, channel->block_size);
203		if (actual != channel->block_size)
204			goto short_read;
205		actual = size;
206		if (size > channel->block_size)
207			actual = channel->block_size;
208		memcpy(buf, data->bounce, actual);
209		size -= actual;
210		buf += actual;
211	}
212	return 0;
213
214error_out:
215	memset((char *) buf+actual, 0, size-actual);
216	if (channel->read_error)
217		retval = (channel->read_error)(channel, block, count, buf,
218					       size, actual, retval);
219	return retval;
220}
221
222static errcode_t raw_write_blk(io_channel channel,
223			       struct unix_private_data *data,
224			       unsigned long long block,
225			       int count, const void *bufv)
226{
227	ssize_t		size;
228	ext2_loff_t	location;
229	int		actual = 0;
230	errcode_t	retval;
231	const unsigned char *buf = bufv;
232
233	if (count == 1)
234		size = channel->block_size;
235	else {
236		if (count < 0)
237			size = -count;
238		else
239			size = count * channel->block_size;
240	}
241	data->io_stats.bytes_written += size;
242
243	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
244	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
245		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
246		goto error_out;
247	}
248
249	if ((channel->align == 0) ||
250	    (IS_ALIGNED(buf, channel->align) &&
251	     IS_ALIGNED(size, channel->align))) {
252		actual = write(data->dev, buf, size);
253		if (actual != size) {
254		short_write:
255			retval = EXT2_ET_SHORT_WRITE;
256			goto error_out;
257		}
258		return 0;
259	}
260
261#ifdef ALIGN_DEBUG
262	printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
263	       (unsigned long) size);
264#endif
265	/*
266	 * The buffer or size which we're trying to write isn't aligned
267	 * to the O_DIRECT rules, so we need to do this the hard way...
268	 */
269	while (size > 0) {
270		if (size < channel->block_size) {
271			actual = read(data->dev, data->bounce,
272				      channel->block_size);
273			if (actual != channel->block_size) {
274				retval = EXT2_ET_SHORT_READ;
275				goto error_out;
276			}
277		}
278		actual = size;
279		if (size > channel->block_size)
280			actual = channel->block_size;
281		memcpy(data->bounce, buf, actual);
282		actual = write(data->dev, data->bounce, channel->block_size);
283		if (actual != channel->block_size)
284			goto short_write;
285		size -= actual;
286		buf += actual;
287	}
288	return 0;
289
290error_out:
291	if (channel->write_error)
292		retval = (channel->write_error)(channel, block, count, buf,
293						size, actual, retval);
294	return retval;
295}
296
297
298/*
299 * Here we implement the cache functions
300 */
301
302/* Allocate the cache buffers */
303static errcode_t alloc_cache(io_channel channel,
304			     struct unix_private_data *data)
305{
306	errcode_t		retval;
307	struct unix_cache	*cache;
308	int			i;
309
310	data->access_time = 0;
311	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
312		cache->block = 0;
313		cache->access_time = 0;
314		cache->dirty = 0;
315		cache->in_use = 0;
316		if (cache->buf)
317			ext2fs_free_mem(&cache->buf);
318		retval = io_channel_alloc_buf(channel, 0, &cache->buf);
319		if (retval)
320			return retval;
321	}
322	if (channel->align) {
323		if (data->bounce)
324			ext2fs_free_mem(&data->bounce);
325		retval = io_channel_alloc_buf(channel, 0, &data->bounce);
326	}
327	return retval;
328}
329
330/* Free the cache buffers */
331static void free_cache(struct unix_private_data *data)
332{
333	struct unix_cache	*cache;
334	int			i;
335
336	data->access_time = 0;
337	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
338		cache->block = 0;
339		cache->access_time = 0;
340		cache->dirty = 0;
341		cache->in_use = 0;
342		if (cache->buf)
343			ext2fs_free_mem(&cache->buf);
344	}
345	if (data->bounce)
346		ext2fs_free_mem(&data->bounce);
347}
348
349#ifndef NO_IO_CACHE
350/*
351 * Try to find a block in the cache.  If the block is not found, and
352 * eldest is a non-zero pointer, then fill in eldest with the cache
353 * entry to that should be reused.
354 */
355static struct unix_cache *find_cached_block(struct unix_private_data *data,
356					    unsigned long long block,
357					    struct unix_cache **eldest)
358{
359	struct unix_cache	*cache, *unused_cache, *oldest_cache;
360	int			i;
361
362	unused_cache = oldest_cache = 0;
363	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
364		if (!cache->in_use) {
365			if (!unused_cache)
366				unused_cache = cache;
367			continue;
368		}
369		if (cache->block == block) {
370			cache->access_time = ++data->access_time;
371			return cache;
372		}
373		if (!oldest_cache ||
374		    (cache->access_time < oldest_cache->access_time))
375			oldest_cache = cache;
376	}
377	if (eldest)
378		*eldest = (unused_cache) ? unused_cache : oldest_cache;
379	return 0;
380}
381
382/*
383 * Reuse a particular cache entry for another block.
384 */
385static void reuse_cache(io_channel channel, struct unix_private_data *data,
386		 struct unix_cache *cache, unsigned long long block)
387{
388	if (cache->dirty && cache->in_use)
389		raw_write_blk(channel, data, cache->block, 1, cache->buf);
390
391	cache->in_use = 1;
392	cache->dirty = 0;
393	cache->block = block;
394	cache->access_time = ++data->access_time;
395}
396
397/*
398 * Flush all of the blocks in the cache
399 */
400static errcode_t flush_cached_blocks(io_channel channel,
401				     struct unix_private_data *data,
402				     int invalidate)
403
404{
405	struct unix_cache	*cache;
406	errcode_t		retval, retval2;
407	int			i;
408
409	retval2 = 0;
410	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
411		if (!cache->in_use)
412			continue;
413
414		if (invalidate)
415			cache->in_use = 0;
416
417		if (!cache->dirty)
418			continue;
419
420		retval = raw_write_blk(channel, data,
421				       cache->block, 1, cache->buf);
422		if (retval)
423			retval2 = retval;
424		else
425			cache->dirty = 0;
426	}
427	return retval2;
428}
429#endif /* NO_IO_CACHE */
430
/*
 * Fallback definition for Linux builds whose headers did not already
 * define BLKDISCARDZEROES.
 */
#if defined(__linux__) && !defined(BLKDISCARDZEROES)
#define BLKDISCARDZEROES _IO(0x12,124)
#endif
436
/*
 * Open a file, using open64() where the build has it (HAVE_OPEN64 set
 * and __OSX_AVAILABLE_BUT_DEPRECATED not defined) and open() otherwise.
 *
 * A zero `mode` selects the two-argument form of the call; any other
 * value is passed through as the third argument.  Returns whatever the
 * underlying open call returns.
 */
int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
{
#if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
	if (!mode)
		return open64(pathname, flags);
	return open64(pathname, flags, mode);
#else
	if (!mode)
		return open(pathname, flags);
	return open(pathname, flags, mode);
#endif
}
450
451int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
452{
453#if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
454	return stat64(path, buf);
455#else
456	return stat(path, buf);
457#endif
458}
459
460int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
461{
462#if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
463	return fstat64(fd, buf);
464#else
465	return fstat(fd, buf);
466#endif
467}
468
469static errcode_t unix_open(const char *name, int flags, io_channel *channel)
470{
471	io_channel	io = NULL;
472	struct unix_private_data *data = NULL;
473	errcode_t	retval;
474	int		open_flags;
475	int		f_nocache = 0;
476	ext2fs_struct_stat st;
477#ifdef __linux__
478	struct 		utsname ut;
479#endif
480
481	if (name == 0)
482		return EXT2_ET_BAD_DEVICE_NAME;
483	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
484	if (retval)
485		goto cleanup;
486	memset(io, 0, sizeof(struct struct_io_channel));
487	io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
488	retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
489	if (retval)
490		goto cleanup;
491
492	io->manager = unix_io_manager;
493	retval = ext2fs_get_mem(strlen(name)+1, &io->name);
494	if (retval)
495		goto cleanup;
496
497	strcpy(io->name, name);
498	io->private_data = data;
499	io->block_size = 1024;
500	io->read_error = 0;
501	io->write_error = 0;
502	io->refcount = 1;
503
504	memset(data, 0, sizeof(struct unix_private_data));
505	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
506	data->io_stats.num_fields = 2;
507	data->dev = -1;
508
509	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
510	if (flags & IO_FLAG_EXCLUSIVE)
511		open_flags |= O_EXCL;
512#if defined(O_DIRECT)
513	if (flags & IO_FLAG_DIRECT_IO) {
514		open_flags |= O_DIRECT;
515		io->align = ext2fs_get_dio_alignment(data->dev);
516	}
517#elif defined(F_NOCACHE)
518	if (flags & IO_FLAG_DIRECT_IO) {
519		f_nocache = F_NOCACHE;
520		io->align = 4096;
521	}
522#endif
523	data->flags = flags;
524
525	data->dev = ext2fs_open_file(io->name, open_flags, 0);
526	if (data->dev < 0) {
527		retval = errno;
528		goto cleanup;
529	}
530	if (f_nocache) {
531		if (fcntl(data->dev, f_nocache, 1) < 0) {
532			retval = errno;
533			goto cleanup;
534		}
535	}
536
537	/*
538	 * If the device is really a block device, then set the
539	 * appropriate flag, otherwise we can set DISCARD_ZEROES flag
540	 * because we are going to use punch hole instead of discard
541	 * and if it succeed, subsequent read from sparse area returns
542	 * zero.
543	 */
544	if (ext2fs_stat(io->name, &st) == 0) {
545		if (S_ISBLK(st.st_mode))
546			io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
547		else
548			io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
549	}
550
551#ifdef BLKDISCARDZEROES
552	{
553		int zeroes = 0;
554		if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
555		    zeroes)
556			io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
557	}
558#endif
559
560#if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
561	/*
562	 * Some operating systems require that the buffers be aligned,
563	 * regardless of O_DIRECT
564	 */
565	if (!io->align)
566		io->align = 512;
567#endif
568
569
570	if ((retval = alloc_cache(io, data)))
571		goto cleanup;
572
573#ifdef BLKROGET
574	if (flags & IO_FLAG_RW) {
575		int error;
576		int readonly = 0;
577
578		/* Is the block device actually writable? */
579		error = ioctl(data->dev, BLKROGET, &readonly);
580		if (!error && readonly) {
581			retval = EPERM;
582			goto cleanup;
583		}
584	}
585#endif
586
587#ifdef __linux__
588#undef RLIM_INFINITY
589#if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
590#define RLIM_INFINITY	((unsigned long)(~0UL>>1))
591#else
592#define RLIM_INFINITY  (~0UL)
593#endif
594	/*
595	 * Work around a bug in 2.4.10-2.4.18 kernels where writes to
596	 * block devices are wrongly getting hit by the filesize
597	 * limit.  This workaround isn't perfect, since it won't work
598	 * if glibc wasn't built against 2.2 header files.  (Sigh.)
599	 *
600	 */
601	if ((flags & IO_FLAG_RW) &&
602	    (uname(&ut) == 0) &&
603	    ((ut.release[0] == '2') && (ut.release[1] == '.') &&
604	     (ut.release[2] == '4') && (ut.release[3] == '.') &&
605	     (ut.release[4] == '1') && (ut.release[5] >= '0') &&
606	     (ut.release[5] < '8')) &&
607	    (ext2fs_stat(io->name, &st) == 0) &&
608	    (S_ISBLK(st.st_mode))) {
609		struct rlimit	rlim;
610
611		rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
612		setrlimit(RLIMIT_FSIZE, &rlim);
613		getrlimit(RLIMIT_FSIZE, &rlim);
614		if (((unsigned long) rlim.rlim_cur) <
615		    ((unsigned long) rlim.rlim_max)) {
616			rlim.rlim_cur = rlim.rlim_max;
617			setrlimit(RLIMIT_FSIZE, &rlim);
618		}
619	}
620#endif
621	*channel = io;
622	return 0;
623
624cleanup:
625	if (data) {
626		if (data->dev >= 0)
627			close(data->dev);
628		free_cache(data);
629		ext2fs_free_mem(&data);
630	}
631	if (io) {
632		if (io->name) {
633			ext2fs_free_mem(&io->name);
634		}
635		ext2fs_free_mem(&io);
636	}
637	return retval;
638}
639
640static errcode_t unix_close(io_channel channel)
641{
642	struct unix_private_data *data;
643	errcode_t	retval = 0;
644
645	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
646	data = (struct unix_private_data *) channel->private_data;
647	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
648
649	if (--channel->refcount > 0)
650		return 0;
651
652#ifndef NO_IO_CACHE
653	retval = flush_cached_blocks(channel, data, 0);
654#endif
655
656	if (close(data->dev) < 0)
657		retval = errno;
658	free_cache(data);
659
660	ext2fs_free_mem(&channel->private_data);
661	if (channel->name)
662		ext2fs_free_mem(&channel->name);
663	ext2fs_free_mem(&channel);
664	return retval;
665}
666
667static errcode_t unix_set_blksize(io_channel channel, int blksize)
668{
669	struct unix_private_data *data;
670	errcode_t		retval;
671
672	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
673	data = (struct unix_private_data *) channel->private_data;
674	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
675
676	if (channel->block_size != blksize) {
677#ifndef NO_IO_CACHE
678		if ((retval = flush_cached_blocks(channel, data, 0)))
679			return retval;
680#endif
681
682		channel->block_size = blksize;
683		free_cache(data);
684		if ((retval = alloc_cache(channel, data)))
685			return retval;
686	}
687	return 0;
688}
689
690
691static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
692			       int count, void *buf)
693{
694	struct unix_private_data *data;
695	struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
696	errcode_t	retval;
697	char		*cp;
698	int		i, j;
699
700	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
701	data = (struct unix_private_data *) channel->private_data;
702	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
703
704#ifdef NO_IO_CACHE
705	return raw_read_blk(channel, data, block, count, buf);
706#else
707	/*
708	 * If we're doing an odd-sized read or a very large read,
709	 * flush out the cache and then do a direct read.
710	 */
711	if (count < 0 || count > WRITE_DIRECT_SIZE) {
712		if ((retval = flush_cached_blocks(channel, data, 0)))
713			return retval;
714		return raw_read_blk(channel, data, block, count, buf);
715	}
716
717	cp = buf;
718	while (count > 0) {
719		/* If it's in the cache, use it! */
720		if ((cache = find_cached_block(data, block, &reuse[0]))) {
721#ifdef DEBUG
722			printf("Using cached block %lu\n", block);
723#endif
724			memcpy(cp, cache->buf, channel->block_size);
725			count--;
726			block++;
727			cp += channel->block_size;
728			continue;
729		}
730		if (count == 1) {
731			/*
732			 * Special case where we read directly into the
733			 * cache buffer; important in the O_DIRECT case
734			 */
735			cache = reuse[0];
736			reuse_cache(channel, data, cache, block);
737			if ((retval = raw_read_blk(channel, data, block, 1,
738						   cache->buf))) {
739				cache->in_use = 0;
740				return retval;
741			}
742			memcpy(cp, cache->buf, channel->block_size);
743			return 0;
744		}
745
746		/*
747		 * Find the number of uncached blocks so we can do a
748		 * single read request
749		 */
750		for (i=1; i < count; i++)
751			if (find_cached_block(data, block+i, &reuse[i]))
752				break;
753#ifdef DEBUG
754		printf("Reading %d blocks starting at %lu\n", i, block);
755#endif
756		if ((retval = raw_read_blk(channel, data, block, i, cp)))
757			return retval;
758
759		/* Save the results in the cache */
760		for (j=0; j < i; j++) {
761			count--;
762			cache = reuse[j];
763			reuse_cache(channel, data, cache, block++);
764			memcpy(cache->buf, cp, channel->block_size);
765			cp += channel->block_size;
766		}
767	}
768	return 0;
769#endif /* NO_IO_CACHE */
770}
771
772static errcode_t unix_read_blk(io_channel channel, unsigned long block,
773			       int count, void *buf)
774{
775	return unix_read_blk64(channel, block, count, buf);
776}
777
778static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
779				int count, const void *buf)
780{
781	struct unix_private_data *data;
782	struct unix_cache *cache, *reuse;
783	errcode_t	retval = 0;
784	const char	*cp;
785	int		writethrough;
786
787	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
788	data = (struct unix_private_data *) channel->private_data;
789	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
790
791#ifdef NO_IO_CACHE
792	return raw_write_blk(channel, data, block, count, buf);
793#else
794	/*
795	 * If we're doing an odd-sized write or a very large write,
796	 * flush out the cache completely and then do a direct write.
797	 */
798	if (count < 0 || count > WRITE_DIRECT_SIZE) {
799		if ((retval = flush_cached_blocks(channel, data, 1)))
800			return retval;
801		return raw_write_blk(channel, data, block, count, buf);
802	}
803
804	/*
805	 * For a moderate-sized multi-block write, first force a write
806	 * if we're in write-through cache mode, and then fill the
807	 * cache with the blocks.
808	 */
809	writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
810	if (writethrough)
811		retval = raw_write_blk(channel, data, block, count, buf);
812
813	cp = buf;
814	while (count > 0) {
815		cache = find_cached_block(data, block, &reuse);
816		if (!cache) {
817			cache = reuse;
818			reuse_cache(channel, data, cache, block);
819		}
820		memcpy(cache->buf, cp, channel->block_size);
821		cache->dirty = !writethrough;
822		count--;
823		block++;
824		cp += channel->block_size;
825	}
826	return retval;
827#endif /* NO_IO_CACHE */
828}
829
830static errcode_t unix_write_blk(io_channel channel, unsigned long block,
831				int count, const void *buf)
832{
833	return unix_write_blk64(channel, block, count, buf);
834}
835
836static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
837				 int size, const void *buf)
838{
839	struct unix_private_data *data;
840	errcode_t	retval = 0;
841	ssize_t		actual;
842
843	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
844	data = (struct unix_private_data *) channel->private_data;
845	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
846
847	if (channel->align != 0) {
848#ifdef ALIGN_DEBUG
849		printf("unix_write_byte: O_DIRECT fallback\n");
850#endif
851		return EXT2_ET_UNIMPLEMENTED;
852	}
853
854#ifndef NO_IO_CACHE
855	/*
856	 * Flush out the cache completely
857	 */
858	if ((retval = flush_cached_blocks(channel, data, 1)))
859		return retval;
860#endif
861
862	if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
863		return errno;
864
865	actual = write(data->dev, buf, size);
866	if (actual != size)
867		return EXT2_ET_SHORT_WRITE;
868
869	return 0;
870}
871
872/*
873 * Flush data buffers to disk.
874 */
875static errcode_t unix_flush(io_channel channel)
876{
877	struct unix_private_data *data;
878	errcode_t retval = 0;
879
880	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
881	data = (struct unix_private_data *) channel->private_data;
882	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
883
884#ifndef NO_IO_CACHE
885	retval = flush_cached_blocks(channel, data, 0);
886#endif
887	fsync(data->dev);
888	return retval;
889}
890
891static errcode_t unix_set_option(io_channel channel, const char *option,
892				 const char *arg)
893{
894	struct unix_private_data *data;
895	unsigned long long tmp;
896	char *end;
897
898	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
899	data = (struct unix_private_data *) channel->private_data;
900	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
901
902	if (!strcmp(option, "offset")) {
903		if (!arg)
904			return EXT2_ET_INVALID_ARGUMENT;
905
906		tmp = strtoull(arg, &end, 0);
907		if (*end)
908			return EXT2_ET_INVALID_ARGUMENT;
909		data->offset = tmp;
910		if (data->offset < 0)
911			return EXT2_ET_INVALID_ARGUMENT;
912		return 0;
913	}
914	return EXT2_ET_INVALID_ARGUMENT;
915}
916
917#if defined(__linux__) && !defined(BLKDISCARD)
918#define BLKDISCARD		_IO(0x12,119)
919#endif
920
921static errcode_t unix_discard(io_channel channel, unsigned long long block,
922			      unsigned long long count)
923{
924	struct unix_private_data *data;
925	int		ret;
926
927	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
928	data = (struct unix_private_data *) channel->private_data;
929	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
930
931	if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
932#ifdef BLKDISCARD
933		__uint64_t range[2];
934
935		range[0] = (__uint64_t)(block) * channel->block_size;
936		range[1] = (__uint64_t)(count) * channel->block_size;
937
938		ret = ioctl(data->dev, BLKDISCARD, &range);
939#else
940		goto unimplemented;
941#endif
942	} else {
943#if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
944		/*
945		 * If we are not on block device, try to use punch hole
946		 * to reclaim free space.
947		 */
948		ret = fallocate(data->dev,
949				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
950				(off_t)(block) * channel->block_size,
951				(off_t)(count) * channel->block_size);
952#else
953		goto unimplemented;
954#endif
955	}
956	if (ret < 0) {
957		if (errno == EOPNOTSUPP)
958			goto unimplemented;
959		return errno;
960	}
961	return 0;
962unimplemented:
963	return EXT2_ET_UNIMPLEMENTED;
964}
965