readahead.c revision c743d96b6d2ff55a94df7b5ac7c74987bb9c343b
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	akpm@zip.com.au
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>

void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
EXPORT_SYMBOL(default_unplug_io_fn);

/*
 * Convenient macros for min/max read-ahead pages.
 * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up.
 * The latter is necessary for systems with large page size (i.e. 64k).
 */
#define MAX_RA_PAGES	(VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE)
#define MIN_RA_PAGES	DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE)
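
/*
 * Illustrative note (editorial, not part of the original source): assuming
 * the usual VM_MAX_READAHEAD of 128kB and VM_MIN_READAHEAD of 16kB, a 64k
 * PAGE_CACHE_SIZE gives MAX_RA_PAGES = 128*1024/65536 = 2 pages, while
 * MIN_RA_PAGES = DIV_ROUND_UP(16*1024, 65536) = 1 page; rounding the minimum
 * down instead would yield 0 pages, i.e. no read-ahead at all on such systems.
 */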

struct backing_dev_info default_backing_dev_info = {
	.ra_pages	= MAX_RA_PAGES,
	.state		= 0,
	.capabilities	= BDI_CAP_MAP_COPY,
	.unplug_io_fn	= default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = mapping->backing_dev_info->ra_pages;
	ra->prev_index = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

#define list_to_page(head) (list_entry((head)->prev, struct page, lru))

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *	pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
			int (*filler)(void *, struct page *), void *data)
{
	struct page *page;
	struct pagevec lru_pvec;
	int ret = 0;

	pagevec_init(&lru_pvec, 0);

	while (!list_empty(pages)) {
		page = list_to_page(pages);
		list_del(&page->lru);
		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
			page_cache_release(page);
			continue;
		}
		ret = filler(data, page);
		if (!pagevec_add(&lru_pvec, page))
			__pagevec_lru_add(&lru_pvec);
		if (ret) {
			put_pages_list(pages);
			break;
		}
		task_io_account_read(PAGE_CACHE_SIZE);
	}
	pagevec_lru_add(&lru_pvec);
	return ret;
}

EXPORT_SYMBOL(read_cache_pages);

static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	unsigned page_idx;
	struct pagevec lru_pvec;
	int ret;

	if (mapping->a_ops->readpages) {
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		/* Clean up the remaining pages */
		put_pages_list(pages);
		goto out;
	}

	pagevec_init(&lru_pvec, 0);
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_to_page(pages);
		list_del(&page->lru);
		if (!add_to_page_cache(page, mapping,
					page->index, GFP_KERNEL)) {
			mapping->a_ops->readpage(filp, page);
			if (!pagevec_add(&lru_pvec, page))
				__pagevec_lru_add(&lru_pvec);
		} else
			page_cache_release(page);
	}
	pagevec_lru_add(&lru_pvec);
	ret = 0;
out:
	return ret;
}

/*
 * do_page_cache_readahead actually reads a chunk of disk.  It allocates all
 * the pages first, then submits them all for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 *
 * do_page_cache_readahead() returns -1 if it encountered request queue
 * congestion.
 */
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read,
			unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	if (isize == 0)
		goto out;

	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * Preallocate as many pages as we will need.
	 */
	read_lock_irq(&mapping->tree_lock);
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;

		if (page_offset > end_index)
			break;

		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		if (page)
			continue;

		read_unlock_irq(&mapping->tree_lock);
		page = page_cache_alloc_cold(mapping);
		read_lock_irq(&mapping->tree_lock);
		if (!page)
			break;
		page->index = page_offset;
		list_add(&page->lru, &page_pool);
		if (page_idx == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ret++;
	}
	read_unlock_irq(&mapping->tree_lock);

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
		pgoff_t offset, unsigned long nr_to_read)
{
	int ret = 0;

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
		return -EINVAL;

	while (nr_to_read) {
		int err;

		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		err = __do_page_cache_readahead(mapping, filp,
						offset, this_chunk, 0);
		if (err < 0) {
			ret = err;
			break;
		}
		ret += err;
		offset += this_chunk;
		nr_to_read -= this_chunk;
	}
	return ret;
}

/*
 * This version skips the IO if the queue is read-congested, and will tell the
 * block layer to abandon the readahead if request allocation would block.
 *
 * force_page_cache_readahead() will ignore queue congestion and will block on
 * request queues.
 */
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read)
{
	if (bdi_read_congested(mapping->backing_dev_info))
		return -1;

	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
}

/*
 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
 * sensible upper limit.
 */
unsigned long max_sane_readahead(unsigned long nr)
{
	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
}

/*
 * Submit IO for the read-ahead request in file_ra_state.
 */
unsigned long ra_submit(struct file_ra_state *ra,
			struct address_space *mapping, struct file *filp)
{
	unsigned long ra_size;
	unsigned long la_size;
	int actual;

	ra_size = ra_readahead_size(ra);
	la_size = ra_lookahead_size(ra);
	actual = __do_page_cache_readahead(mapping, filp,
					ra->ra_index, ra_size, la_size);

	return actual;
}
EXPORT_SYMBOL_GPL(ra_submit);

/*
 * Set the initial window size, round to next power of 2 and square
 * for small size, x 4 for medium, and x 2 for large
 * for 128k (32 page) max ra
 * 1-8 page = 32k initial, > 8 page = 128k initial
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}

/*
 * Get the previous window size, ramp it up, and
 * return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
					unsigned long max)
{
	unsigned long cur = ra->readahead_index - ra->ra_index;
	unsigned long newsize;

	if (cur < max / 16)
		newsize = 4 * cur;
	else
		newsize = 2 * cur;

	return min(newsize, max);
}
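
/*
 * Illustrative note (editorial, not part of the original source): with a 4k
 * PAGE_CACHE_SIZE and the default 128k (32 page) maximum, the two helpers
 * above behave as follows.  get_init_ra_size() turns a 4-page first read into
 * an 8-page (32k) initial window, and a 16-page read straight into the
 * 32-page (128k) maximum.  get_next_ra_size() then ramps a 4-page window to
 * 8, 16 and finally 32 pages on successive hits, after which min(newsize, max)
 * pins it at the 128k ceiling.
 */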

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                    |-------- last readahead window -------->|
 *       |-- application walking here -->|
 * ======#============|==================#=====================|
 *       ^la_index    ^ra_index          ^lookahead_index      ^readahead_index
 *
 * [ra_index, readahead_index) represents the last readahead window.
 *
 * [la_index, lookahead_index] is where the application would be walking (in
 * the common case of cache-cold sequential reads): the last window was
 * established when the application was at la_index, and the next window will
 * be brought in when the application reaches lookahead_index.
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application has consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as early as the application
 * reads on the page at lookahead_index. Normally lookahead_index will be
 * equal to ra_index, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * invalidate each other's readahead state. So we flag the new readahead
 * page at lookahead_index with PG_readahead, and use it as a readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_index tracks the last visited page in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial
 * size based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
		   struct file_ra_state *ra, struct file *filp,
		   struct page *page, pgoff_t offset,
		   unsigned long req_size)
{
	unsigned long max;	/* max readahead pages */
	pgoff_t ra_index;	/* readahead index */
	unsigned long ra_size;	/* readahead size */
	unsigned long la_size;	/* lookahead size */
	int sequential;

	max = ra->ra_pages;
	sequential = (offset - ra->prev_index <= 1UL) || (req_size > max);

	/*
	 * Lookahead/readahead hit, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if (offset && (offset == ra->lookahead_index ||
			offset == ra->readahead_index)) {
		ra_index = ra->readahead_index;
		ra_size = get_next_ra_size(ra, max);
		la_size = ra_size;
		goto fill_ra;
	}

	/*
	 * Standalone, small read.
	 * Read as is, and do not pollute the readahead state.
	 */
	if (!page && !sequential) {
		return __do_page_cache_readahead(mapping, filp,
						offset, req_size, 0);
	}

	/*
	 * It may be one of
	 *	- first read on start of file
	 *	- sequential cache miss
	 *	- oversize random read
	 * Start readahead for it.
	 */
	ra_index = offset;
	ra_size = get_init_ra_size(req_size, max);
	la_size = ra_size > req_size ? ra_size - req_size : ra_size;

	/*
	 * Hit on a lookahead page without valid readahead state.
	 * E.g. interleaved reads.
	 * Not knowing its readahead pos/size, bet on the minimal possible one.
	 */
	if (page) {
		ra_index++;
		ra_size = min(4 * ra_size, max);
	}

fill_ra:
	ra_set_index(ra, offset, ra_index);
	ra_set_size(ra, ra_size, la_size);

	return ra_submit(ra, mapping, filp);
}

/**
 * page_cache_readahead_ondemand - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @page: the page at @offset, or NULL if non-present
 * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
 * @req_size: hint: total size of the read which the caller is performing in
 *	      PAGE_CACHE_SIZE units
 *
 * page_cache_readahead_ondemand() is the entry point of readahead logic.
 * This function should be called when it is time to perform readahead:
 * 1) @page == NULL
 *    A cache miss happened, time for synchronous readahead.
 * 2) @page != NULL && PageReadahead(@page)
 *    A look-ahead hit occurred, time for asynchronous readahead.
 */
unsigned long
page_cache_readahead_ondemand(struct address_space *mapping,
				struct file_ra_state *ra, struct file *filp,
				struct page *page, pgoff_t offset,
				unsigned long req_size)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return 0;

	if (page) {
		ClearPageReadahead(page);

		/*
		 * Defer asynchronous read-ahead on IO congestion.
		 */
		if (bdi_read_congested(mapping->backing_dev_info))
			return 0;
	}

	/* do read-ahead */
	return ondemand_readahead(mapping, ra, filp, page,
					offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_readahead_ondemand);
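
/*
 * Illustrative sketch (editorial, not part of the original source): a caller
 * in the generic read path is expected to drive the two cases from the
 * kerneldoc above roughly as below -- a cache miss triggers synchronous
 * readahead followed by a retried lookup, while a PG_readahead hit pipelines
 * the next window asynchronously.  This is a simplified assumption, not the
 * actual fs code:
 *
 *	page = find_get_page(mapping, index);
 *	if (!page) {
 *		page_cache_readahead_ondemand(mapping, ra, filp,
 *					      NULL, index, last_index - index);
 *		page = find_get_page(mapping, index);
 *	} else if (PageReadahead(page)) {
 *		page_cache_readahead_ondemand(mapping, ra, filp,
 *					      page, index, last_index - index);
 *	}
 */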