unix_io.c revision 6d96b00d57d236e2746f8245df6c8ea64abc64c1
1/* 2 * unix_io.c --- This is the Unix (well, really POSIX) implementation 3 * of the I/O manager. 4 * 5 * Implements a one-block write-through cache. 6 * 7 * Includes support for Windows NT support under Cygwin. 8 * 9 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 10 * 2002 by Theodore Ts'o. 11 * 12 * %Begin-Header% 13 * This file may be redistributed under the terms of the GNU Public 14 * License. 15 * %End-Header% 16 */ 17 18#define _LARGEFILE_SOURCE 19#define _LARGEFILE64_SOURCE 20 21#include <stdio.h> 22#include <string.h> 23#if HAVE_UNISTD_H 24#include <unistd.h> 25#endif 26#if HAVE_ERRNO_H 27#include <errno.h> 28#endif 29#include <fcntl.h> 30#include <time.h> 31#ifdef __linux__ 32#include <sys/utsname.h> 33#endif 34#if HAVE_SYS_STAT_H 35#include <sys/stat.h> 36#endif 37#if HAVE_SYS_TYPES_H 38#include <sys/types.h> 39#endif 40#if HAVE_SYS_RESOURCE_H 41#include <sys/resource.h> 42#endif 43 44#include "ext2_fs.h" 45#include "ext2fs.h" 46 47/* 48 * For checking structure magic numbers... 
*/

/*
 * Expands to a bare `if` that returns `code` from the calling function
 * when (struct)->magic differs from `code`.  It is not wrapped in
 * do { } while (0), so use it only as a complete statement of its own.
 */
#define EXT2_CHECK_MAGIC(struct, code) \
	  if ((struct)->magic != (code)) return (code)

/*
 * One slot of the block cache.
 */
struct unix_cache {
	char		*buf;		/* Buffer of channel->block_size bytes (see alloc_cache) */
	unsigned long	block;		/* Block number currently held in buf */
	int		access_time;	/* Value of the owner's access_time counter at last use */
	unsigned	dirty:1;	/* buf has contents not yet written to the device */
	unsigned	in_use:1;	/* Slot holds a valid block */
};

#define CACHE_SIZE 8
#define WRITE_DIRECT_SIZE 4	/* Must be smaller than CACHE_SIZE */
#define READ_DIRECT_SIZE 4	/* Should be smaller than CACHE_SIZE */

/*
 * Per-channel state, reached through channel->private_data.
 */
struct unix_private_data {
	int	magic;		/* EXT2_ET_MAGIC_UNIX_IO_CHANNEL once initialized */
	int	dev;		/* File descriptor returned by open()/open64() */
	int	flags;		/* NOTE(review): not referenced by the code in this file section */
	int	access_time;	/* Counter incremented on each cache hit or slot reuse */
	ext2_loff_t offset;	/* Byte offset added to every device location ("offset" option) */
	struct unix_cache cache[CACHE_SIZE];
	struct struct_io_stats io_stats;	/* Byte counters updated by the raw I/O helpers */
};

/*
 * Forward declarations for the io_manager entry points and the cache
 * helper that are defined further down in this file.
 */
static errcode_t unix_open(const char *name, int flags, io_channel *channel);
static errcode_t unix_close(io_channel channel);
static errcode_t unix_set_blksize(io_channel channel, int blksize);
static errcode_t unix_read_blk(io_channel channel, unsigned long block,
			       int count, void *data);
static errcode_t unix_write_blk(io_channel channel, unsigned long block,
				int count, const void *data);
static errcode_t unix_flush(io_channel channel);
static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
				int size, const void *data);
static errcode_t unix_set_option(io_channel channel, const char *option,
				 const char *arg);
static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
;
static void reuse_cache(io_channel channel, struct unix_private_data *data,
		 struct unix_cache *cache, unsigned long block);

/* __FreeBSD_kernel__ is defined by GNU/kFreeBSD - the FreeBSD kernel
 * does not know buffered block devices - everything is raw.
*/ 95#if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 96#define NEED_BOUNCE_BUFFER 97#else 98#undef NEED_BOUNCE_BUFFER 99#endif 100 101static struct struct_io_manager struct_unix_manager = { 102 EXT2_ET_MAGIC_IO_MANAGER, 103 "Unix I/O Manager", 104 unix_open, 105 unix_close, 106 unix_set_blksize, 107 unix_read_blk, 108 unix_write_blk, 109 unix_flush, 110#ifdef NEED_BOUNCE_BUFFER 111 0, 112#else 113 unix_write_byte, 114#endif 115 unix_set_option, 116 unix_get_stats, 117}; 118 119io_manager unix_io_manager = &struct_unix_manager; 120 121static errcode_t unix_get_stats(io_channel channel, io_stats *stats) 122{ 123 errcode_t retval = 0; 124 125 struct unix_private_data *data; 126 127 EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL); 128 data = (struct unix_private_data *) channel->private_data; 129 EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL); 130 131 if (stats) 132 *stats = &data->io_stats; 133 134 return retval; 135} 136 137/* 138 * Here are the raw I/O functions 139 */ 140#ifndef NEED_BOUNCE_BUFFER 141static errcode_t raw_read_blk(io_channel channel, 142 struct unix_private_data *data, 143 unsigned long block, 144 int count, void *buf) 145{ 146 errcode_t retval; 147 ssize_t size; 148 ext2_loff_t location; 149 int actual = 0; 150 151 size = (count < 0) ? -count : count * channel->block_size; 152 data->io_stats.bytes_read += size; 153 location = ((ext2_loff_t) block * channel->block_size) + data->offset; 154 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) { 155 retval = errno ? 
errno : EXT2_ET_LLSEEK_FAILED; 156 goto error_out; 157 } 158 actual = read(data->dev, buf, size); 159 if (actual != size) { 160 if (actual < 0) 161 actual = 0; 162 retval = EXT2_ET_SHORT_READ; 163 goto error_out; 164 } 165 return 0; 166 167error_out: 168 memset((char *) buf+actual, 0, size-actual); 169 if (channel->read_error) 170 retval = (channel->read_error)(channel, block, count, buf, 171 size, actual, retval); 172 return retval; 173} 174#else /* NEED_BOUNCE_BUFFER */ 175/* 176 * Windows and FreeBSD block devices only allow sector alignment IO in offset and size 177 */ 178static errcode_t raw_read_blk(io_channel channel, 179 struct unix_private_data *data, 180 unsigned long block, 181 int count, void *buf) 182{ 183 errcode_t retval; 184 size_t size, alignsize, fragment; 185 ext2_loff_t location; 186 int total = 0, actual; 187#define BLOCKALIGN 512 188 char sector[BLOCKALIGN]; 189 190 size = (count < 0) ? -count : count * channel->block_size; 191 data->io_stats.bytes_read += size; 192 location = ((ext2_loff_t) block * channel->block_size) + data->offset; 193#ifdef DEBUG 194 printf("count=%d, size=%d, block=%lu, blk_size=%d, location=%llx\n", 195 count, size, block, channel->block_size, (long long)location); 196#endif 197 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) { 198 retval = errno ? 
errno : EXT2_ET_LLSEEK_FAILED; 199 goto error_out; 200 } 201 fragment = size % BLOCKALIGN; 202 alignsize = size - fragment; 203 if (alignsize) { 204 actual = read(data->dev, buf, alignsize); 205 if (actual != alignsize) 206 goto short_read; 207 } 208 if (fragment) { 209 actual = read(data->dev, sector, BLOCKALIGN); 210 if (actual != BLOCKALIGN) 211 goto short_read; 212 memcpy(buf+alignsize, sector, fragment); 213 } 214 return 0; 215 216short_read: 217 if (actual>0) 218 total += actual; 219 retval = EXT2_ET_SHORT_READ; 220 221error_out: 222 memset((char *) buf+total, 0, size-actual); 223 if (channel->read_error) 224 retval = (channel->read_error)(channel, block, count, buf, 225 size, actual, retval); 226 return retval; 227} 228#endif 229 230static errcode_t raw_write_blk(io_channel channel, 231 struct unix_private_data *data, 232 unsigned long block, 233 int count, const void *buf) 234{ 235 ssize_t size; 236 ext2_loff_t location; 237 int actual = 0; 238 errcode_t retval; 239 240 if (count == 1) 241 size = channel->block_size; 242 else { 243 if (count < 0) 244 size = -count; 245 else 246 size = count * channel->block_size; 247 } 248 data->io_stats.bytes_written += size; 249 250 location = ((ext2_loff_t) block * channel->block_size) + data->offset; 251 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) { 252 retval = errno ? 
errno : EXT2_ET_LLSEEK_FAILED; 253 goto error_out; 254 } 255 256 actual = write(data->dev, buf, size); 257 if (actual != size) { 258 retval = EXT2_ET_SHORT_WRITE; 259 goto error_out; 260 } 261 return 0; 262 263error_out: 264 if (channel->write_error) 265 retval = (channel->write_error)(channel, block, count, buf, 266 size, actual, retval); 267 return retval; 268} 269 270 271/* 272 * Here we implement the cache functions 273 */ 274 275/* Allocate the cache buffers */ 276static errcode_t alloc_cache(io_channel channel, 277 struct unix_private_data *data) 278{ 279 errcode_t retval; 280 struct unix_cache *cache; 281 int i; 282 283 data->access_time = 0; 284 for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) { 285 cache->block = 0; 286 cache->access_time = 0; 287 cache->dirty = 0; 288 cache->in_use = 0; 289 if ((retval = ext2fs_get_mem(channel->block_size, 290 &cache->buf))) 291 return retval; 292 } 293 return 0; 294} 295 296/* Free the cache buffers */ 297static void free_cache(struct unix_private_data *data) 298{ 299 struct unix_cache *cache; 300 int i; 301 302 data->access_time = 0; 303 for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) { 304 cache->block = 0; 305 cache->access_time = 0; 306 cache->dirty = 0; 307 cache->in_use = 0; 308 if (cache->buf) 309 ext2fs_free_mem(&cache->buf); 310 cache->buf = 0; 311 } 312} 313 314#ifndef NO_IO_CACHE 315/* 316 * Try to find a block in the cache. If the block is not found, and 317 * eldest is a non-zero pointer, then fill in eldest with the cache 318 * entry to that should be reused. 
319 */ 320static struct unix_cache *find_cached_block(struct unix_private_data *data, 321 unsigned long block, 322 struct unix_cache **eldest) 323{ 324 struct unix_cache *cache, *unused_cache, *oldest_cache; 325 int i; 326 327 unused_cache = oldest_cache = 0; 328 for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) { 329 if (!cache->in_use) { 330 if (!unused_cache) 331 unused_cache = cache; 332 continue; 333 } 334 if (cache->block == block) { 335 cache->access_time = ++data->access_time; 336 return cache; 337 } 338 if (!oldest_cache || 339 (cache->access_time < oldest_cache->access_time)) 340 oldest_cache = cache; 341 } 342 if (eldest) 343 *eldest = (unused_cache) ? unused_cache : oldest_cache; 344 return 0; 345} 346 347/* 348 * Reuse a particular cache entry for another block. 349 */ 350static void reuse_cache(io_channel channel, struct unix_private_data *data, 351 struct unix_cache *cache, unsigned long block) 352{ 353 if (cache->dirty && cache->in_use) 354 raw_write_blk(channel, data, cache->block, 1, cache->buf); 355 356 cache->in_use = 1; 357 cache->dirty = 0; 358 cache->block = block; 359 cache->access_time = ++data->access_time; 360} 361 362/* 363 * Flush all of the blocks in the cache 364 */ 365static errcode_t flush_cached_blocks(io_channel channel, 366 struct unix_private_data *data, 367 int invalidate) 368 369{ 370 struct unix_cache *cache; 371 errcode_t retval, retval2; 372 int i; 373 374 retval2 = 0; 375 for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) { 376 if (!cache->in_use) 377 continue; 378 379 if (invalidate) 380 cache->in_use = 0; 381 382 if (!cache->dirty) 383 continue; 384 385 retval = raw_write_blk(channel, data, 386 cache->block, 1, cache->buf); 387 if (retval) 388 retval2 = retval; 389 else 390 cache->dirty = 0; 391 } 392 return retval2; 393} 394#endif /* NO_IO_CACHE */ 395 396static errcode_t unix_open(const char *name, int flags, io_channel *channel) 397{ 398 io_channel io = NULL; 399 struct unix_private_data *data = 
NULL; 400 errcode_t retval; 401 int open_flags; 402 struct stat st; 403#ifdef __linux__ 404 struct utsname ut; 405#endif 406 407 if (name == 0) 408 return EXT2_ET_BAD_DEVICE_NAME; 409 retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io); 410 if (retval) 411 return retval; 412 memset(io, 0, sizeof(struct struct_io_channel)); 413 io->magic = EXT2_ET_MAGIC_IO_CHANNEL; 414 retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data); 415 if (retval) 416 goto cleanup; 417 418 io->manager = unix_io_manager; 419 retval = ext2fs_get_mem(strlen(name)+1, &io->name); 420 if (retval) 421 goto cleanup; 422 423 strcpy(io->name, name); 424 io->private_data = data; 425 io->block_size = 1024; 426 io->read_error = 0; 427 io->write_error = 0; 428 io->refcount = 1; 429 430 memset(data, 0, sizeof(struct unix_private_data)); 431 data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL; 432 data->io_stats.num_fields = 2; 433 434 if ((retval = alloc_cache(io, data))) 435 goto cleanup; 436 437 open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY; 438 if (flags & IO_FLAG_EXCLUSIVE) 439 open_flags |= O_EXCL; 440#ifdef HAVE_OPEN64 441 data->dev = open64(io->name, open_flags); 442#else 443 data->dev = open(io->name, open_flags); 444#endif 445 if (data->dev < 0) { 446 retval = errno; 447 goto cleanup; 448 } 449 450#ifdef __linux__ 451#undef RLIM_INFINITY 452#if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4))) 453#define RLIM_INFINITY ((unsigned long)(~0UL>>1)) 454#else 455#define RLIM_INFINITY (~0UL) 456#endif 457 /* 458 * Work around a bug in 2.4.10-2.4.18 kernels where writes to 459 * block devices are wrongly getting hit by the filesize 460 * limit. This workaround isn't perfect, since it won't work 461 * if glibc wasn't built against 2.2 header files. (Sigh.) 
462 * 463 */ 464 if ((flags & IO_FLAG_RW) && 465 (uname(&ut) == 0) && 466 ((ut.release[0] == '2') && (ut.release[1] == '.') && 467 (ut.release[2] == '4') && (ut.release[3] == '.') && 468 (ut.release[4] == '1') && (ut.release[5] >= '0') && 469 (ut.release[5] < '8')) && 470 (fstat(data->dev, &st) == 0) && 471 (S_ISBLK(st.st_mode))) { 472 struct rlimit rlim; 473 474 rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY; 475 setrlimit(RLIMIT_FSIZE, &rlim); 476 getrlimit(RLIMIT_FSIZE, &rlim); 477 if (((unsigned long) rlim.rlim_cur) < 478 ((unsigned long) rlim.rlim_max)) { 479 rlim.rlim_cur = rlim.rlim_max; 480 setrlimit(RLIMIT_FSIZE, &rlim); 481 } 482 } 483#endif 484 *channel = io; 485 return 0; 486 487cleanup: 488 if (data) { 489 free_cache(data); 490 ext2fs_free_mem(&data); 491 } 492 if (io) 493 ext2fs_free_mem(&io); 494 return retval; 495} 496 497static errcode_t unix_close(io_channel channel) 498{ 499 struct unix_private_data *data; 500 errcode_t retval = 0; 501 502 EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL); 503 data = (struct unix_private_data *) channel->private_data; 504 EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL); 505 506 if (--channel->refcount > 0) 507 return 0; 508 509#ifndef NO_IO_CACHE 510 retval = flush_cached_blocks(channel, data, 0); 511#endif 512 513 if (close(data->dev) < 0) 514 retval = errno; 515 free_cache(data); 516 517 ext2fs_free_mem(&channel->private_data); 518 if (channel->name) 519 ext2fs_free_mem(&channel->name); 520 ext2fs_free_mem(&channel); 521 return retval; 522} 523 524static errcode_t unix_set_blksize(io_channel channel, int blksize) 525{ 526 struct unix_private_data *data; 527 errcode_t retval; 528 529 EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL); 530 data = (struct unix_private_data *) channel->private_data; 531 EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL); 532 533 if (channel->block_size != blksize) { 534#ifndef NO_IO_CACHE 535 if ((retval = flush_cached_blocks(channel, data, 0))) 536 
return retval; 537#endif 538 539 channel->block_size = blksize; 540 free_cache(data); 541 if ((retval = alloc_cache(channel, data))) 542 return retval; 543 } 544 return 0; 545} 546 547 548static errcode_t unix_read_blk(io_channel channel, unsigned long block, 549 int count, void *buf) 550{ 551 struct unix_private_data *data; 552 struct unix_cache *cache, *reuse[READ_DIRECT_SIZE]; 553 errcode_t retval; 554 char *cp; 555 int i, j; 556 557 EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL); 558 data = (struct unix_private_data *) channel->private_data; 559 EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL); 560 561#ifdef NO_IO_CACHE 562 return raw_read_blk(channel, data, block, count, buf); 563#else 564 /* 565 * If we're doing an odd-sized read or a very large read, 566 * flush out the cache and then do a direct read. 567 */ 568 if (count < 0 || count > WRITE_DIRECT_SIZE) { 569 if ((retval = flush_cached_blocks(channel, data, 0))) 570 return retval; 571 return raw_read_blk(channel, data, block, count, buf); 572 } 573 574 cp = buf; 575 while (count > 0) { 576 /* If it's in the cache, use it! 
*/ 577 if ((cache = find_cached_block(data, block, &reuse[0]))) { 578#ifdef DEBUG 579 printf("Using cached block %lu\n", block); 580#endif 581 memcpy(cp, cache->buf, channel->block_size); 582 count--; 583 block++; 584 cp += channel->block_size; 585 continue; 586 } 587 /* 588 * Find the number of uncached blocks so we can do a 589 * single read request 590 */ 591 for (i=1; i < count; i++) 592 if (find_cached_block(data, block+i, &reuse[i])) 593 break; 594#ifdef DEBUG 595 printf("Reading %d blocks starting at %lu\n", i, block); 596#endif 597 if ((retval = raw_read_blk(channel, data, block, i, cp))) 598 return retval; 599 600 /* Save the results in the cache */ 601 for (j=0; j < i; j++) { 602 count--; 603 cache = reuse[j]; 604 reuse_cache(channel, data, cache, block++); 605 memcpy(cache->buf, cp, channel->block_size); 606 cp += channel->block_size; 607 } 608 } 609 return 0; 610#endif /* NO_IO_CACHE */ 611} 612 613static errcode_t unix_write_blk(io_channel channel, unsigned long block, 614 int count, const void *buf) 615{ 616 struct unix_private_data *data; 617 struct unix_cache *cache, *reuse; 618 errcode_t retval = 0; 619 const char *cp; 620 int writethrough; 621 622 EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL); 623 data = (struct unix_private_data *) channel->private_data; 624 EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL); 625 626#ifdef NO_IO_CACHE 627 return raw_write_blk(channel, data, block, count, buf); 628#else 629 /* 630 * If we're doing an odd-sized write or a very large write, 631 * flush out the cache completely and then do a direct write. 632 */ 633 if (count < 0 || count > WRITE_DIRECT_SIZE) { 634 if ((retval = flush_cached_blocks(channel, data, 1))) 635 return retval; 636 return raw_write_blk(channel, data, block, count, buf); 637 } 638 639 /* 640 * For a moderate-sized multi-block write, first force a write 641 * if we're in write-through cache mode, and then fill the 642 * cache with the blocks. 
643 */ 644 writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH; 645 if (writethrough) 646 retval = raw_write_blk(channel, data, block, count, buf); 647 648 cp = buf; 649 while (count > 0) { 650 cache = find_cached_block(data, block, &reuse); 651 if (!cache) { 652 cache = reuse; 653 reuse_cache(channel, data, cache, block); 654 } 655 memcpy(cache->buf, cp, channel->block_size); 656 cache->dirty = !writethrough; 657 count--; 658 block++; 659 cp += channel->block_size; 660 } 661 return retval; 662#endif /* NO_IO_CACHE */ 663} 664 665static errcode_t unix_write_byte(io_channel channel, unsigned long offset, 666 int size, const void *buf) 667{ 668 struct unix_private_data *data; 669 errcode_t retval = 0; 670 ssize_t actual; 671 672 EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL); 673 data = (struct unix_private_data *) channel->private_data; 674 EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL); 675 676#ifndef NO_IO_CACHE 677 /* 678 * Flush out the cache completely 679 */ 680 if ((retval = flush_cached_blocks(channel, data, 1))) 681 return retval; 682#endif 683 684 if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0) 685 return errno; 686 687 actual = write(data->dev, buf, size); 688 if (actual != size) 689 return EXT2_ET_SHORT_WRITE; 690 691 return 0; 692} 693 694/* 695 * Flush data buffers to disk. 
696 */ 697static errcode_t unix_flush(io_channel channel) 698{ 699 struct unix_private_data *data; 700 errcode_t retval = 0; 701 702 EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL); 703 data = (struct unix_private_data *) channel->private_data; 704 EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL); 705 706#ifndef NO_IO_CACHE 707 retval = flush_cached_blocks(channel, data, 0); 708#endif 709 fsync(data->dev); 710 return retval; 711} 712 713static errcode_t unix_set_option(io_channel channel, const char *option, 714 const char *arg) 715{ 716 struct unix_private_data *data; 717 unsigned long long tmp; 718 char *end; 719 720 EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL); 721 data = (struct unix_private_data *) channel->private_data; 722 EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL); 723 724 if (!strcmp(option, "offset")) { 725 if (!arg) 726 return EXT2_ET_INVALID_ARGUMENT; 727 728 tmp = strtoull(arg, &end, 0); 729 if (*end) 730 return EXT2_ET_INVALID_ARGUMENT; 731 data->offset = tmp; 732 if (data->offset < 0) 733 return EXT2_ET_INVALID_ARGUMENT; 734 return 0; 735 } 736 return EXT2_ET_INVALID_ARGUMENT; 737} 738