unix_io.c revision 800766ee4a2bcdc0a32442d093d20da6ea3815ab
1/*
2 * unix_io.c --- This is the Unix (well, really POSIX) implementation
3 * 	of the I/O manager.
4 *
 * Implements a small block cache (CACHE_SIZE entries); writes go straight
 * to the device when the channel has CHANNEL_FLAGS_WRITETHROUGH set.
 *
 * Includes support for Windows NT under Cygwin.
8 *
9 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10 * 	2002 by Theodore Ts'o.
11 *
12 * %Begin-Header%
13 * This file may be redistributed under the terms of the GNU Library
14 * General Public License, version 2.
15 * %End-Header%
16 */
17
18#define _LARGEFILE_SOURCE
19#define _LARGEFILE64_SOURCE
20#ifndef _GNU_SOURCE
21#define _GNU_SOURCE
22#endif
23
24#include "config.h"
25#include <stdio.h>
26#include <string.h>
27#if HAVE_UNISTD_H
28#include <unistd.h>
29#endif
30#if HAVE_ERRNO_H
31#include <errno.h>
32#endif
33#include <fcntl.h>
34#include <time.h>
35#ifdef __linux__
36#include <sys/utsname.h>
37#endif
38#ifdef HAVE_SYS_IOCTL_H
39#include <sys/ioctl.h>
40#endif
41#ifdef HAVE_SYS_MOUNT_H
42#include <sys/mount.h>
43#endif
44#if HAVE_SYS_STAT_H
45#include <sys/stat.h>
46#endif
47#if HAVE_SYS_TYPES_H
48#include <sys/types.h>
49#endif
50#if HAVE_SYS_RESOURCE_H
51#include <sys/resource.h>
52#endif
53#if HAVE_LINUX_FALLOC_H
54#include <linux/falloc.h>
55#endif
56
/*
 * Fallback ioctl numbers, used only when building on Linux with _IO()
 * available and the included headers did not already define them.
 */
#if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
#define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
#endif

#if defined(__linux__) && defined(_IO) && !defined(BLKSSZGET)
#define BLKSSZGET  _IO(0x12,104)/* get block device sector size */
#endif
64
65#undef ALIGN_DEBUG
66
67#include "ext2_fs.h"
68#include "ext2fs.h"
69
70/*
71 * For checking structure magic numbers...
72 */
73
/*
 * Return `code` from the enclosing function when the object's magic
 * field differs from it.  The expansion is a bare `if` statement, so
 * only use it as a complete statement of its own (not as the unbraced
 * body of another if/else).
 */
#define EXT2_CHECK_MAGIC(struct, code) \
	  if ((struct)->magic != (code)) return (code)
76
/*
 * One slot of the block cache.
 *
 * `block` is unsigned long long because every cache routine in this
 * file is handed 64-bit block numbers; a narrower field would truncate
 * them where long is 32 bits and make unrelated blocks compare equal.
 */
struct unix_cache {
	char			*buf;		/* one block of data */
	unsigned long long	block;		/* block number held in buf */
	int			access_time;	/* channel access counter at last use */
	unsigned		dirty:1;	/* buf still has to be written out */
	unsigned		in_use:1;	/* slot currently holds a block */
};
84
/* Number of slots in each channel's block cache. */
#define CACHE_SIZE 8
/* Writes of more than this many blocks bypass the cache. */
#define WRITE_DIRECT_SIZE 4	/* Must be smaller than CACHE_SIZE */
/* Capacity of the reuse[] array in unix_read_blk64(). */
#define READ_DIRECT_SIZE 4	/* Should be smaller than CACHE_SIZE */
88
/*
 * Per-channel state of the Unix I/O manager, stored in
 * io_channel->private_data by unix_open().
 */
struct unix_private_data {
	int	magic;		/* EXT2_ET_MAGIC_UNIX_IO_CHANNEL; checked by every entry point */
	int	dev;		/* file descriptor returned by ext2fs_open_file() */
	int	flags;		/* the flags argument given to unix_open() */
	int	align;		/* buffer/size alignment required for I/O; 0 = no constraint */
	int	access_time;	/* counter incremented on each cache hit or slot reuse */
	ext2_loff_t offset;	/* byte offset added to locations by the raw read/write
				   paths and unix_write_byte(); set via the "offset" option */
	struct unix_cache cache[CACHE_SIZE];
	void	*bounce;	/* one-block buffer for unaligned requests; allocated
				   by alloc_cache() only when align is non-zero */
	struct struct_io_stats io_stats;	/* bytes_read / bytes_written totals */
};
100
/*
 * True when n (an integer or a pointer) is a multiple of align.
 * align must be a non-zero power of two; the callers in this file test
 * for align == 0 before using the macro.
 *
 * n is parenthesized so that an expression argument (e.g. ptr + k) is
 * evaluated as a whole before being cast.
 */
#define IS_ALIGNED(n, align) ((((unsigned long) (n)) & \
			       ((unsigned long) ((align)-1))) == 0)
103
/*
 * Forward declarations.  The entry points are referenced by the
 * struct_unix_manager initializer before their definitions appear.
 */
static errcode_t unix_open(const char *name, int flags, io_channel *channel);
static errcode_t unix_close(io_channel channel);
static errcode_t unix_set_blksize(io_channel channel, int blksize);
static errcode_t unix_read_blk(io_channel channel, unsigned long block,
			       int count, void *data);
static errcode_t unix_write_blk(io_channel channel, unsigned long block,
				int count, const void *data);
static errcode_t unix_flush(io_channel channel);
static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
				int size, const void *data);
static errcode_t unix_set_option(io_channel channel, const char *option,
				 const char *arg);
static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
;
/* Cache helper; its definition is compiled only when NO_IO_CACHE is unset. */
static void reuse_cache(io_channel channel, struct unix_private_data *data,
		 struct unix_cache *cache, unsigned long long block);
/* 64-bit block number variants of the read/write entry points. */
static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
			       int count, void *data);
static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
				int count, const void *data);
static errcode_t unix_discard(io_channel channel, unsigned long long block,
			      unsigned long long count);
126
/*
 * Method table for this I/O manager.  The initializer is positional, so
 * the entries must stay in the member order of struct struct_io_manager
 * (declared outside this file -- check the header before reordering).
 */
static struct struct_io_manager struct_unix_manager = {
	EXT2_ET_MAGIC_IO_MANAGER,
	"Unix I/O Manager",
	unix_open,
	unix_close,
	unix_set_blksize,
	unix_read_blk,
	unix_write_blk,
	unix_flush,
	unix_write_byte,
	unix_set_option,
	unix_get_stats,
	unix_read_blk64,
	unix_write_blk64,
	unix_discard,
};

/* Exported handle to the table above; the only non-static symbol defined here. */
io_manager unix_io_manager = &struct_unix_manager;
145
146static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
147{
148	errcode_t 	retval = 0;
149
150	struct unix_private_data *data;
151
152	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
153	data = (struct unix_private_data *) channel->private_data;
154	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
155
156	if (stats)
157		*stats = &data->io_stats;
158
159	return retval;
160}
161
162/*
163 * Here are the raw I/O functions
164 */
165static errcode_t raw_read_blk(io_channel channel,
166			      struct unix_private_data *data,
167			      unsigned long long block,
168			      int count, void *bufv)
169{
170	errcode_t	retval;
171	ssize_t		size;
172	ext2_loff_t	location;
173	int		actual = 0;
174	unsigned char	*buf = bufv;
175
176	size = (count < 0) ? -count : count * channel->block_size;
177	data->io_stats.bytes_read += size;
178	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
179	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
180		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
181		goto error_out;
182	}
183	if ((data->align == 0) ||
184	    ((IS_ALIGNED(buf, data->align)) && IS_ALIGNED(size, data->align))) {
185		actual = read(data->dev, buf, size);
186		if (actual != size) {
187		short_read:
188			if (actual < 0)
189				actual = 0;
190			retval = EXT2_ET_SHORT_READ;
191			goto error_out;
192		}
193		return 0;
194	}
195
196#ifdef ALIGN_DEBUG
197	printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
198	       (unsigned long) size);
199#endif
200
201	/*
202	 * The buffer or size which we're trying to read isn't aligned
203	 * to the O_DIRECT rules, so we need to do this the hard way...
204	 */
205	while (size > 0) {
206		actual = read(data->dev, data->bounce, channel->block_size);
207		if (actual != channel->block_size)
208			goto short_read;
209		actual = size;
210		if (size > channel->block_size)
211			actual = channel->block_size;
212		memcpy(buf, data->bounce, actual);
213		size -= actual;
214		buf += actual;
215	}
216	return 0;
217
218error_out:
219	memset((char *) buf+actual, 0, size-actual);
220	if (channel->read_error)
221		retval = (channel->read_error)(channel, block, count, buf,
222					       size, actual, retval);
223	return retval;
224}
225
226static errcode_t raw_write_blk(io_channel channel,
227			       struct unix_private_data *data,
228			       unsigned long long block,
229			       int count, const void *bufv)
230{
231	ssize_t		size;
232	ext2_loff_t	location;
233	int		actual = 0;
234	errcode_t	retval;
235	const unsigned char *buf = bufv;
236
237	if (count == 1)
238		size = channel->block_size;
239	else {
240		if (count < 0)
241			size = -count;
242		else
243			size = count * channel->block_size;
244	}
245	data->io_stats.bytes_written += size;
246
247	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
248	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
249		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
250		goto error_out;
251	}
252
253	if ((data->align == 0) ||
254	    ((IS_ALIGNED(buf, data->align)) && IS_ALIGNED(size, data->align))) {
255		actual = write(data->dev, buf, size);
256		if (actual != size) {
257		short_write:
258			retval = EXT2_ET_SHORT_WRITE;
259			goto error_out;
260		}
261		return 0;
262	}
263
264#ifdef ALIGN_DEBUG
265	printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
266	       (unsigned long) size);
267#endif
268	/*
269	 * The buffer or size which we're trying to write isn't aligned
270	 * to the O_DIRECT rules, so we need to do this the hard way...
271	 */
272	while (size > 0) {
273		if (size < channel->block_size) {
274			actual = read(data->dev, data->bounce,
275				      channel->block_size);
276			if (actual != channel->block_size) {
277				retval = EXT2_ET_SHORT_READ;
278				goto error_out;
279			}
280		}
281		actual = size;
282		if (size > channel->block_size)
283			actual = channel->block_size;
284		memcpy(data->bounce, buf, actual);
285		actual = write(data->dev, data->bounce, channel->block_size);
286		if (actual != channel->block_size)
287			goto short_write;
288		size -= actual;
289		buf += actual;
290	}
291	return 0;
292
293error_out:
294	if (channel->write_error)
295		retval = (channel->write_error)(channel, block, count, buf,
296						size, actual, retval);
297	return retval;
298}
299
300
301/*
302 * Here we implement the cache functions
303 */
304
305/* Allocate the cache buffers */
306static errcode_t alloc_cache(io_channel channel,
307			     struct unix_private_data *data)
308{
309	errcode_t		retval;
310	struct unix_cache	*cache;
311	int			i;
312
313	data->access_time = 0;
314	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
315		cache->block = 0;
316		cache->access_time = 0;
317		cache->dirty = 0;
318		cache->in_use = 0;
319		if (cache->buf)
320			ext2fs_free_mem(&cache->buf);
321		retval = ext2fs_get_memalign(channel->block_size,
322					     data->align, &cache->buf);
323		if (retval)
324			return retval;
325	}
326	if (data->align) {
327		if (data->bounce)
328			ext2fs_free_mem(&data->bounce);
329		retval = ext2fs_get_memalign(channel->block_size, data->align,
330					     &data->bounce);
331	}
332	return retval;
333}
334
335/* Free the cache buffers */
336static void free_cache(struct unix_private_data *data)
337{
338	struct unix_cache	*cache;
339	int			i;
340
341	data->access_time = 0;
342	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
343		cache->block = 0;
344		cache->access_time = 0;
345		cache->dirty = 0;
346		cache->in_use = 0;
347		if (cache->buf)
348			ext2fs_free_mem(&cache->buf);
349	}
350	if (data->bounce)
351		ext2fs_free_mem(&data->bounce);
352}
353
354#ifndef NO_IO_CACHE
355/*
 * Try to find a block in the cache.  If the block is not found and
 * eldest is a non-NULL pointer, fill in *eldest with the cache entry
 * that should be reused.
359 */
360static struct unix_cache *find_cached_block(struct unix_private_data *data,
361					    unsigned long long block,
362					    struct unix_cache **eldest)
363{
364	struct unix_cache	*cache, *unused_cache, *oldest_cache;
365	int			i;
366
367	unused_cache = oldest_cache = 0;
368	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
369		if (!cache->in_use) {
370			if (!unused_cache)
371				unused_cache = cache;
372			continue;
373		}
374		if (cache->block == block) {
375			cache->access_time = ++data->access_time;
376			return cache;
377		}
378		if (!oldest_cache ||
379		    (cache->access_time < oldest_cache->access_time))
380			oldest_cache = cache;
381	}
382	if (eldest)
383		*eldest = (unused_cache) ? unused_cache : oldest_cache;
384	return 0;
385}
386
387/*
388 * Reuse a particular cache entry for another block.
389 */
390static void reuse_cache(io_channel channel, struct unix_private_data *data,
391		 struct unix_cache *cache, unsigned long long block)
392{
393	if (cache->dirty && cache->in_use)
394		raw_write_blk(channel, data, cache->block, 1, cache->buf);
395
396	cache->in_use = 1;
397	cache->dirty = 0;
398	cache->block = block;
399	cache->access_time = ++data->access_time;
400}
401
402/*
403 * Flush all of the blocks in the cache
404 */
405static errcode_t flush_cached_blocks(io_channel channel,
406				     struct unix_private_data *data,
407				     int invalidate)
408
409{
410	struct unix_cache	*cache;
411	errcode_t		retval, retval2;
412	int			i;
413
414	retval2 = 0;
415	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
416		if (!cache->in_use)
417			continue;
418
419		if (invalidate)
420			cache->in_use = 0;
421
422		if (!cache->dirty)
423			continue;
424
425		retval = raw_write_blk(channel, data,
426				       cache->block, 1, cache->buf);
427		if (retval)
428			retval2 = retval;
429		else
430			cache->dirty = 0;
431	}
432	return retval2;
433}
434#endif /* NO_IO_CACHE */
435
/*
 * Fallback ioctl number for Linux builds whose headers do not define
 * BLKDISCARDZEROES.
 */
#ifdef __linux__
#ifndef BLKDISCARDZEROES
#define BLKDISCARDZEROES _IO(0x12,124)
#endif
#endif
441
442static errcode_t unix_open(const char *name, int flags, io_channel *channel)
443{
444	io_channel	io = NULL;
445	struct unix_private_data *data = NULL;
446	errcode_t	retval;
447	int		open_flags, zeroes = 0;
448	int		f_nocache = 0;
449	ext2fs_struct_stat st;
450#ifdef __linux__
451	struct 		utsname ut;
452#endif
453
454	if (name == 0)
455		return EXT2_ET_BAD_DEVICE_NAME;
456	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
457	if (retval)
458		goto cleanup;
459	memset(io, 0, sizeof(struct struct_io_channel));
460	io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
461	retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
462	if (retval)
463		goto cleanup;
464
465	io->manager = unix_io_manager;
466	retval = ext2fs_get_mem(strlen(name)+1, &io->name);
467	if (retval)
468		goto cleanup;
469
470	strcpy(io->name, name);
471	io->private_data = data;
472	io->block_size = 1024;
473	io->read_error = 0;
474	io->write_error = 0;
475	io->refcount = 1;
476
477	memset(data, 0, sizeof(struct unix_private_data));
478	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
479	data->io_stats.num_fields = 2;
480
481	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
482	if (flags & IO_FLAG_EXCLUSIVE)
483		open_flags |= O_EXCL;
484#if defined(O_DIRECT)
485	if (flags & IO_FLAG_DIRECT_IO)
486		open_flags |= O_DIRECT;
487#elif defined(F_NOCACHE)
488	if (flags & IO_FLAG_DIRECT_IO)
489		f_nocache = F_NOCACHE;
490#endif
491	data->flags = flags;
492
493	data->dev = ext2fs_open_file(io->name, open_flags, 0);
494	if (data->dev < 0) {
495		retval = errno;
496		goto cleanup;
497	}
498	if (f_nocache) {
499		if (fcntl(data->dev, f_nocache, 1) < 0) {
500			retval = errno;
501			goto cleanup;
502		}
503	}
504
505	/*
506	 * If the device is really a block device, then set the
507	 * appropriate flag, otherwise we can set DISCARD_ZEROES flag
508	 * because we are going to use punch hole instead of discard
509	 * and if it succeed, subsequent read from sparse area returns
510	 * zero.
511	 */
512	if (ext2fs_stat(io->name, &st) == 0) {
513		if (S_ISBLK(st.st_mode))
514			io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
515		else
516			io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
517	}
518
519#ifdef BLKSSZGET
520	if (flags & IO_FLAG_DIRECT_IO) {
521		if (ioctl(data->dev, BLKSSZGET, &data->align) != 0)
522			data->align = io->block_size;
523	}
524#endif
525
526#ifdef BLKDISCARDZEROES
527	ioctl(data->dev, BLKDISCARDZEROES, &zeroes);
528	if (zeroes)
529		io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
530#endif
531
532#if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
533	/*
534	 * Some operating systems require that the buffers be aligned,
535	 * regardless of O_DIRECT
536	 */
537	data->align = 512;
538#endif
539
540
541	if ((retval = alloc_cache(io, data)))
542		goto cleanup;
543
544#ifdef BLKROGET
545	if (flags & IO_FLAG_RW) {
546		int error;
547		int readonly = 0;
548
549		/* Is the block device actually writable? */
550		error = ioctl(data->dev, BLKROGET, &readonly);
551		if (!error && readonly) {
552			close(data->dev);
553			retval = EPERM;
554			goto cleanup;
555		}
556	}
557#endif
558
559#ifdef __linux__
560#undef RLIM_INFINITY
561#if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
562#define RLIM_INFINITY	((unsigned long)(~0UL>>1))
563#else
564#define RLIM_INFINITY  (~0UL)
565#endif
566	/*
567	 * Work around a bug in 2.4.10-2.4.18 kernels where writes to
568	 * block devices are wrongly getting hit by the filesize
569	 * limit.  This workaround isn't perfect, since it won't work
570	 * if glibc wasn't built against 2.2 header files.  (Sigh.)
571	 *
572	 */
573	if ((flags & IO_FLAG_RW) &&
574	    (uname(&ut) == 0) &&
575	    ((ut.release[0] == '2') && (ut.release[1] == '.') &&
576	     (ut.release[2] == '4') && (ut.release[3] == '.') &&
577	     (ut.release[4] == '1') && (ut.release[5] >= '0') &&
578	     (ut.release[5] < '8')) &&
579	    (ext2fs_stat(io->name, &st) == 0) &&
580	    (S_ISBLK(st.st_mode))) {
581		struct rlimit	rlim;
582
583		rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
584		setrlimit(RLIMIT_FSIZE, &rlim);
585		getrlimit(RLIMIT_FSIZE, &rlim);
586		if (((unsigned long) rlim.rlim_cur) <
587		    ((unsigned long) rlim.rlim_max)) {
588			rlim.rlim_cur = rlim.rlim_max;
589			setrlimit(RLIMIT_FSIZE, &rlim);
590		}
591	}
592#endif
593	*channel = io;
594	return 0;
595
596cleanup:
597	if (data) {
598		free_cache(data);
599		ext2fs_free_mem(&data);
600	}
601	if (io)
602		ext2fs_free_mem(&io);
603	return retval;
604}
605
606static errcode_t unix_close(io_channel channel)
607{
608	struct unix_private_data *data;
609	errcode_t	retval = 0;
610
611	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
612	data = (struct unix_private_data *) channel->private_data;
613	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
614
615	if (--channel->refcount > 0)
616		return 0;
617
618#ifndef NO_IO_CACHE
619	retval = flush_cached_blocks(channel, data, 0);
620#endif
621
622	if (close(data->dev) < 0)
623		retval = errno;
624	free_cache(data);
625
626	ext2fs_free_mem(&channel->private_data);
627	if (channel->name)
628		ext2fs_free_mem(&channel->name);
629	ext2fs_free_mem(&channel);
630	return retval;
631}
632
633static errcode_t unix_set_blksize(io_channel channel, int blksize)
634{
635	struct unix_private_data *data;
636	errcode_t		retval;
637
638	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
639	data = (struct unix_private_data *) channel->private_data;
640	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
641
642	if (channel->block_size != blksize) {
643#ifndef NO_IO_CACHE
644		if ((retval = flush_cached_blocks(channel, data, 0)))
645			return retval;
646#endif
647
648		channel->block_size = blksize;
649		free_cache(data);
650		if ((retval = alloc_cache(channel, data)))
651			return retval;
652	}
653	return 0;
654}
655
656
657static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
658			       int count, void *buf)
659{
660	struct unix_private_data *data;
661	struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
662	errcode_t	retval;
663	char		*cp;
664	int		i, j;
665
666	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
667	data = (struct unix_private_data *) channel->private_data;
668	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
669
670#ifdef NO_IO_CACHE
671	return raw_read_blk(channel, data, block, count, buf);
672#else
673	/*
674	 * If we're doing an odd-sized read or a very large read,
675	 * flush out the cache and then do a direct read.
676	 */
677	if (count < 0 || count > WRITE_DIRECT_SIZE) {
678		if ((retval = flush_cached_blocks(channel, data, 0)))
679			return retval;
680		return raw_read_blk(channel, data, block, count, buf);
681	}
682
683	cp = buf;
684	while (count > 0) {
685		/* If it's in the cache, use it! */
686		if ((cache = find_cached_block(data, block, &reuse[0]))) {
687#ifdef DEBUG
688			printf("Using cached block %lu\n", block);
689#endif
690			memcpy(cp, cache->buf, channel->block_size);
691			count--;
692			block++;
693			cp += channel->block_size;
694			continue;
695		}
696		if (count == 1) {
697			/*
698			 * Special case where we read directly into the
699			 * cache buffer; important in the O_DIRECT case
700			 */
701			cache = reuse[0];
702			reuse_cache(channel, data, cache, block);
703			if ((retval = raw_read_blk(channel, data, block, 1,
704						   cache->buf))) {
705				cache->in_use = 0;
706				return retval;
707			}
708			memcpy(cp, cache->buf, channel->block_size);
709			return 0;
710		}
711
712		/*
713		 * Find the number of uncached blocks so we can do a
714		 * single read request
715		 */
716		for (i=1; i < count; i++)
717			if (find_cached_block(data, block+i, &reuse[i]))
718				break;
719#ifdef DEBUG
720		printf("Reading %d blocks starting at %lu\n", i, block);
721#endif
722		if ((retval = raw_read_blk(channel, data, block, i, cp)))
723			return retval;
724
725		/* Save the results in the cache */
726		for (j=0; j < i; j++) {
727			count--;
728			cache = reuse[j];
729			reuse_cache(channel, data, cache, block++);
730			memcpy(cache->buf, cp, channel->block_size);
731			cp += channel->block_size;
732		}
733	}
734	return 0;
735#endif /* NO_IO_CACHE */
736}
737
738static errcode_t unix_read_blk(io_channel channel, unsigned long block,
739			       int count, void *buf)
740{
741	return unix_read_blk64(channel, block, count, buf);
742}
743
744static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
745				int count, const void *buf)
746{
747	struct unix_private_data *data;
748	struct unix_cache *cache, *reuse;
749	errcode_t	retval = 0;
750	const char	*cp;
751	int		writethrough;
752
753	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
754	data = (struct unix_private_data *) channel->private_data;
755	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
756
757#ifdef NO_IO_CACHE
758	return raw_write_blk(channel, data, block, count, buf);
759#else
760	/*
761	 * If we're doing an odd-sized write or a very large write,
762	 * flush out the cache completely and then do a direct write.
763	 */
764	if (count < 0 || count > WRITE_DIRECT_SIZE) {
765		if ((retval = flush_cached_blocks(channel, data, 1)))
766			return retval;
767		return raw_write_blk(channel, data, block, count, buf);
768	}
769
770	/*
771	 * For a moderate-sized multi-block write, first force a write
772	 * if we're in write-through cache mode, and then fill the
773	 * cache with the blocks.
774	 */
775	writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
776	if (writethrough)
777		retval = raw_write_blk(channel, data, block, count, buf);
778
779	cp = buf;
780	while (count > 0) {
781		cache = find_cached_block(data, block, &reuse);
782		if (!cache) {
783			cache = reuse;
784			reuse_cache(channel, data, cache, block);
785		}
786		memcpy(cache->buf, cp, channel->block_size);
787		cache->dirty = !writethrough;
788		count--;
789		block++;
790		cp += channel->block_size;
791	}
792	return retval;
793#endif /* NO_IO_CACHE */
794}
795
796static errcode_t unix_write_blk(io_channel channel, unsigned long block,
797				int count, const void *buf)
798{
799	return unix_write_blk64(channel, block, count, buf);
800}
801
802static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
803				 int size, const void *buf)
804{
805	struct unix_private_data *data;
806	errcode_t	retval = 0;
807	ssize_t		actual;
808
809	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
810	data = (struct unix_private_data *) channel->private_data;
811	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
812
813	if (data->align != 0) {
814#ifdef ALIGN_DEBUG
815		printf("unix_write_byte: O_DIRECT fallback\n");
816#endif
817		return EXT2_ET_UNIMPLEMENTED;
818	}
819
820#ifndef NO_IO_CACHE
821	/*
822	 * Flush out the cache completely
823	 */
824	if ((retval = flush_cached_blocks(channel, data, 1)))
825		return retval;
826#endif
827
828	if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
829		return errno;
830
831	actual = write(data->dev, buf, size);
832	if (actual != size)
833		return EXT2_ET_SHORT_WRITE;
834
835	return 0;
836}
837
838/*
839 * Flush data buffers to disk.
840 */
841static errcode_t unix_flush(io_channel channel)
842{
843	struct unix_private_data *data;
844	errcode_t retval = 0;
845
846	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
847	data = (struct unix_private_data *) channel->private_data;
848	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
849
850#ifndef NO_IO_CACHE
851	retval = flush_cached_blocks(channel, data, 0);
852#endif
853	fsync(data->dev);
854	return retval;
855}
856
857static errcode_t unix_set_option(io_channel channel, const char *option,
858				 const char *arg)
859{
860	struct unix_private_data *data;
861	unsigned long long tmp;
862	char *end;
863
864	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
865	data = (struct unix_private_data *) channel->private_data;
866	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
867
868	if (!strcmp(option, "offset")) {
869		if (!arg)
870			return EXT2_ET_INVALID_ARGUMENT;
871
872		tmp = strtoull(arg, &end, 0);
873		if (*end)
874			return EXT2_ET_INVALID_ARGUMENT;
875		data->offset = tmp;
876		if (data->offset < 0)
877			return EXT2_ET_INVALID_ARGUMENT;
878		return 0;
879	}
880	return EXT2_ET_INVALID_ARGUMENT;
881}
882
/* Fallback ioctl number for Linux builds whose headers lack BLKDISCARD. */
#if defined(__linux__) && !defined(BLKDISCARD)
#define BLKDISCARD		_IO(0x12,119)
#endif
886
887static errcode_t unix_discard(io_channel channel, unsigned long long block,
888			      unsigned long long count)
889{
890	struct unix_private_data *data;
891	__uint64_t	range[2];
892	int		ret;
893
894	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
895	data = (struct unix_private_data *) channel->private_data;
896	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
897
898	if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
899#ifdef BLKDISCARD
900		range[0] = (__uint64_t)(block) * channel->block_size;
901		range[1] = (__uint64_t)(count) * channel->block_size;
902
903		ret = ioctl(data->dev, BLKDISCARD, &range);
904#else
905		goto unimplemented;
906#endif
907	} else {
908#if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
909		/*
910		 * If we are not on block device, try to use punch hole
911		 * to reclaim free space.
912		 */
913		ret = fallocate(data->dev,
914				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
915				(off_t)(block) * channel->block_size,
916				(off_t)(count) * channel->block_size);
917#else
918		goto unimplemented;
919#endif
920	}
921	if (ret < 0) {
922		if (errno == EOPNOTSUPP)
923			goto unimplemented;
924		return errno;
925	}
926	return 0;
927unimplemented:
928	return EXT2_ET_UNIMPLEMENTED;
929}
930