aops.c revision c0420ad2ca514551ca086510b0e7d17a05c70492
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#include <linux/fs.h>
23#include <linux/slab.h>
24#include <linux/highmem.h>
25#include <linux/pagemap.h>
26#include <asm/byteorder.h>
27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h>
30
31#define MLOG_MASK_PREFIX ML_FILE_IO
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "alloc.h"
37#include "aops.h"
38#include "dlmglue.h"
39#include "extent_map.h"
40#include "file.h"
41#include "inode.h"
42#include "journal.h"
43#include "suballoc.h"
44#include "super.h"
45#include "symlink.h"
46
47#include "buffer_head_io.h"
48
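/*
 * get_block callback for symlink inodes whose target lives in an
 * allocated extent rather than in the inode itself (fast symlinks are
 * rejected by the BUG_ON below). It validates the dinode, bounds-checks
 * the requested block against PATH_MAX and the allocated clusters, and
 * maps bh_result to the block holding the link data. For a freshly
 * created symlink the data may still only exist in the buffer cache,
 * so it is copied into the page before the mapping is returned.
 */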
49static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
50				   struct buffer_head *bh_result, int create)
51{
52	int err = -EIO;
53	int status;
54	struct ocfs2_dinode *fe = NULL;
55	struct buffer_head *bh = NULL;
56	struct buffer_head *buffer_cache_bh = NULL;
57	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
58	void *kaddr;
59
60	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
61		   (unsigned long long)iblock, bh_result, create);
62
63	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
64
65	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
66		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
67		     (unsigned long long)iblock);
68		goto bail;
69	}
70
71	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
72				  OCFS2_I(inode)->ip_blkno,
73				  &bh, OCFS2_BH_CACHED, inode);
74	if (status < 0) {
75		mlog_errno(status);
76		goto bail;
77	}
78	fe = (struct ocfs2_dinode *) bh->b_data;
79
80	if (!OCFS2_IS_VALID_DINODE(fe)) {
81		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
82		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
83		     fe->i_signature);
84		goto bail;
85	}
86
87	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
88						    le32_to_cpu(fe->i_clusters))) {
89		mlog(ML_ERROR, "block offset is outside the allocated size: "
90		     "%llu\n", (unsigned long long)iblock);
91		goto bail;
92	}
93
94	/* We don't use the page cache to create symlink data, so if
95	 * need be, copy it over from the buffer cache. */
96	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
97		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
98			    iblock;
99		buffer_cache_bh = sb_getblk(osb->sb, blkno);
100		if (!buffer_cache_bh) {
101			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
102			goto bail;
103		}
104
105		/* we haven't locked out transactions, so a commit
106		 * could've happened. Since we've got a reference on
107		 * the bh, even if it commits while we're doing the
108		 * copy, the data is still good. */
109		if (buffer_jbd(buffer_cache_bh)
110		    && ocfs2_inode_is_new(inode)) {
111			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
112			if (!kaddr) {
113				mlog(ML_ERROR, "couldn't kmap!\n");
114				goto bail;
115			}
116			memcpy(kaddr + (bh_result->b_size * iblock),
117			       buffer_cache_bh->b_data,
118			       bh_result->b_size);
119			kunmap_atomic(kaddr, KM_USER0);
120			set_buffer_uptodate(bh_result);
121		}
122		brelse(buffer_cache_bh);
123	}
124
125	map_bh(bh_result, inode->i_sb,
126	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
127
128	err = 0;
129
130bail:
131	if (bh)
132		brelse(bh);
133
134	mlog_exit(err);
135	return err;
136}
137
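/*
 * get_block callback used by the readpage, readpages, writepage and
 * prepare_write paths in this file. It never allocates: it translates
 * a logical block to a physical one through the extent map, trims the
 * mapping to the contiguous run that was found, and handles the sparse
 * vs. non-sparse special cases described in the comments below.
 */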
138static int ocfs2_get_block(struct inode *inode, sector_t iblock,
139			   struct buffer_head *bh_result, int create)
140{
141	int err = 0;
142	unsigned int ext_flags;
143	u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
144	u64 p_blkno, count, past_eof;
145	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
146
147	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
148		   (unsigned long long)iblock, bh_result, create);
149
150	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
151		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
152		     inode, inode->i_ino);
153
154	if (S_ISLNK(inode->i_mode)) {
155		/* this always does I/O for some reason. */
156		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
157		goto bail;
158	}
159
160	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
161					  &ext_flags);
162	if (err) {
163		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
164		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
165		     (unsigned long long)p_blkno);
166		goto bail;
167	}
168
169	if (max_blocks < count)
170		count = max_blocks;
171
172	/*
173	 * ocfs2 never allocates in this function - the only time we
174	 * need to use BH_New is when we're extending i_size on a file
175	 * system which doesn't support holes, in which case BH_New
176	 * allows block_prepare_write() to zero.
177	 *
178	 * If we see this on a sparse file system, then a truncate has
179	 * raced us and removed the cluster. In this case, we clear
180	 * the buffers dirty and uptodate bits and let the buffer code
181	 * the buffer's dirty and uptodate bits and let the buffer code
182	 */
183	if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
184		clear_buffer_dirty(bh_result);
185		clear_buffer_uptodate(bh_result);
186		goto bail;
187	}
188
189	/* Treat the unwritten extent as a hole for zeroing purposes. */
190	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
191		map_bh(bh_result, inode->i_sb, p_blkno);
192
193	bh_result->b_size = count << inode->i_blkbits;
194
195	if (!ocfs2_sparse_alloc(osb)) {
196		if (p_blkno == 0) {
197			err = -EIO;
198			mlog(ML_ERROR,
199			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
200			     (unsigned long long)iblock,
201			     (unsigned long long)p_blkno,
202			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
203			mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
204			dump_stack();
205		}
206
207		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
208		mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
209		     (unsigned long long)past_eof);
210
211		if (create && (iblock >= past_eof))
212			set_buffer_new(bh_result);
213	}
214
215bail:
216	if (err < 0)
217		err = -EIO;
218
219	mlog_exit(err);
220	return err;
221}
222
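/*
 * Copy the data stored inline in the dinode into @page and zero the
 * rest of the page. The inline-data flag and the inode size are sanity
 * checked first; inconsistencies are reported through ocfs2_error()
 * and -EROFS is returned.
 */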
223int ocfs2_read_inline_data(struct inode *inode, struct page *page,
224			   struct buffer_head *di_bh)
225{
226	void *kaddr;
227	loff_t size;
228	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
229
230	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
231		ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
232			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
233		return -EROFS;
234	}
235
236	size = i_size_read(inode);
237
238	if (size > PAGE_CACHE_SIZE ||
239	    size > ocfs2_max_inline_data(inode->i_sb)) {
240		ocfs2_error(inode->i_sb,
241			    "Inode %llu has inline data with a bad size: %Lu",
242			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
243			    (unsigned long long)size);
244		return -EROFS;
245	}
246
247	kaddr = kmap_atomic(page, KM_USER0);
248	if (size)
249		memcpy(kaddr, di->id2.i_data.id_data, size);
250	/* Clear the remaining part of the page */
251	memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
252	flush_dcache_page(page);
253	kunmap_atomic(kaddr, KM_USER0);
254
255	SetPageUptodate(page);
256
257	return 0;
258}
259
260static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
261{
262	int ret;
263	struct buffer_head *di_bh = NULL;
264	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
265
266	BUG_ON(!PageLocked(page));
267	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
268
269	ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
270			       OCFS2_BH_CACHED, inode);
271	if (ret) {
272		mlog_errno(ret);
273		goto out;
274	}
275
276	ret = ocfs2_read_inline_data(inode, page, di_bh);
277out:
278	unlock_page(page);
279
280	brelse(di_bh);
281	return ret;
282}
283
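/*
 * ->readpage callback. Takes the inode cluster lock through the
 * page-aware helper and then ip_alloc_sem, re-checks i_size in case a
 * remote node truncated the file, and fills the page either from
 * inline data or via block_read_full_page().
 */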
284static int ocfs2_readpage(struct file *file, struct page *page)
285{
286	struct inode *inode = page->mapping->host;
287	struct ocfs2_inode_info *oi = OCFS2_I(inode);
288	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
289	int ret, unlock = 1;
290
291	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
292
293	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
294	if (ret != 0) {
295		if (ret == AOP_TRUNCATED_PAGE)
296			unlock = 0;
297		mlog_errno(ret);
298		goto out;
299	}
300
301	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
302		ret = AOP_TRUNCATED_PAGE;
303		goto out_inode_unlock;
304	}
305
306	/*
307	 * i_size might have just been updated as we grabbed the meta lock.  We
308	 * might now be discovering a truncate that hit on another node.
309	 * block_read_full_page->get_block freaks out if it is asked to read
310	 * beyond the end of a file, so we check here.  Callers
311	 * (generic_file_read, vm_ops->fault) are clever enough to check i_size
312	 * and notice that the page they just read isn't needed.
313	 *
314	 * XXX sys_readahead() seems to get that wrong?
315	 */
316	if (start >= i_size_read(inode)) {
317		zero_user(page, 0, PAGE_SIZE);
318		SetPageUptodate(page);
319		ret = 0;
320		goto out_alloc;
321	}
322
323	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
324		ret = ocfs2_readpage_inline(inode, page);
325	else
326		ret = block_read_full_page(page, ocfs2_get_block);
327	unlock = 0;
328
329out_alloc:
330	up_read(&OCFS2_I(inode)->ip_alloc_sem);
331out_inode_unlock:
332	ocfs2_inode_unlock(inode, 0);
333out:
334	if (unlock)
335		unlock_page(page);
336	mlog_exit(ret);
337	return ret;
338}
339
340/*
341 * This is used only for read-ahead. Failures or difficult to handle
342 * situations are safe to ignore.
343 *
344 * Right now, we don't bother with BH_Boundary - in-inode extent lists
345 * are quite large (243 extents on 4k blocks), so most inodes don't
346 * grow out to a tree. If need be, detecting boundary extents could
347 * trivially be added in a future version of ocfs2_get_block().
348 */
349static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
350			   struct list_head *pages, unsigned nr_pages)
351{
352	int ret, err = -EIO;
353	struct inode *inode = mapping->host;
354	struct ocfs2_inode_info *oi = OCFS2_I(inode);
355	loff_t start;
356	struct page *last;
357
358	/*
359	 * Use the nonblocking flag for the dlm code to avoid page
360	 * lock inversion, but don't bother with retrying.
361	 */
362	ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
363	if (ret)
364		return err;
365
366	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
367		ocfs2_inode_unlock(inode, 0);
368		return err;
369	}
370
371	/*
372	 * Don't bother with inline-data. There isn't anything
373	 * to read-ahead in that case anyway...
374	 */
375	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
376		goto out_unlock;
377
378	/*
379	 * Check whether a remote node truncated this file - we just
380	 * drop out in that case as it's not worth handling here.
381	 */
382	last = list_entry(pages->prev, struct page, lru);
383	start = (loff_t)last->index << PAGE_CACHE_SHIFT;
384	if (start >= i_size_read(inode))
385		goto out_unlock;
386
387	err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
388
389out_unlock:
390	up_read(&oi->ip_alloc_sem);
391	ocfs2_inode_unlock(inode, 0);
392
393	return err;
394}
395
396/* Note: Because we don't support holes, our allocation has
397 * already happened (allocation writes zeros to the file data)
398 * so we don't have to worry about ordered writes in
399 * ocfs2_writepage.
400 *
401 * ->writepage is called during the process of invalidating the page cache
402 * during blocked lock processing.  It can't block on any cluster locks
403 * during block mapping.  It's relying on the fact that the block
404 * mapping can't have disappeared under the dirty pages that it is
405 * being asked to write back.
406 */
407static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
408{
409	int ret;
410
411	mlog_entry("(0x%p)\n", page);
412
413	ret = block_write_full_page(page, ocfs2_get_block, wbc);
414
415	mlog_exit(ret);
416
417	return ret;
418}
419
420/*
421 * This is called from ocfs2_write_zero_page() which has handled its
422 * own cluster locking and has ensured allocation exists for those
423 * blocks to be written.
424 */
425int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
426			       unsigned from, unsigned to)
427{
428	int ret;
429
430	ret = block_prepare_write(page, from, to, ocfs2_get_block);
431
432	return ret;
433}
434
435/* Taken from ext3. We don't necessarily need the full blown
436 * functionality yet, but IMHO it's better to cut and paste the whole
437 * thing so we can avoid introducing our own bugs (and easily pick up
438 * their fixes when they happen) --Mark */
439int walk_page_buffers(	handle_t *handle,
440			struct buffer_head *head,
441			unsigned from,
442			unsigned to,
443			int *partial,
444			int (*fn)(	handle_t *handle,
445					struct buffer_head *bh))
446{
447	struct buffer_head *bh;
448	unsigned block_start, block_end;
449	unsigned blocksize = head->b_size;
450	int err, ret = 0;
451	struct buffer_head *next;
452
453	for (	bh = head, block_start = 0;
454		ret == 0 && (bh != head || !block_start);
455	    	block_start = block_end, bh = next)
456	{
457		next = bh->b_this_page;
458		block_end = block_start + blocksize;
459		if (block_end <= from || block_start >= to) {
460			if (partial && !buffer_uptodate(bh))
461				*partial = 1;
462			continue;
463		}
464		err = (*fn)(handle, bh);
465		if (!ret)
466			ret = err;
467	}
468	return ret;
469}
470
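/*
 * Start a transaction covering a simple inode update and, on
 * ordered-data mounts, add the page's buffers in [from, to) to the
 * journal's ordered data list. Returns an ERR_PTR on failure so
 * callers can test it with IS_ERR().
 */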
471handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
472							 struct page *page,
473							 unsigned from,
474							 unsigned to)
475{
476	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477	handle_t *handle;
478	int ret = 0;
479
480	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
481	if (IS_ERR(handle)) {
482		ret = PTR_ERR(handle);
483		mlog_errno(ret);
484		goto out;
485	}
486
487	if (ocfs2_should_order_data(inode)) {
488		ret = walk_page_buffers(handle,
489					page_buffers(page),
490					from, to, NULL,
491					ocfs2_journal_dirty_data);
492		if (ret < 0)
493			mlog_errno(ret);
494	}
495out:
496	if (ret) {
497		if (!IS_ERR(handle))
498			ocfs2_commit_trans(osb, handle);
499		handle = ERR_PTR(ret);
500	}
501	return handle;
502}
503
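/*
 * ->bmap callback (used by the FIBMAP ioctl, among others). Takes the
 * cluster lock and ip_alloc_sem for anything other than journal system
 * files, resolves the logical block through the extent map, and
 * returns 0 for holes, errors and inline-data inodes, which have no
 * block to report.
 */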
504static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
505{
506	sector_t status;
507	u64 p_blkno = 0;
508	int err = 0;
509	struct inode *inode = mapping->host;
510
511	mlog_entry("(block = %llu)\n", (unsigned long long)block);
512
513	/* We don't need to lock journal system files, since they aren't
514	 * accessed concurrently from multiple nodes.
515	 */
516	if (!INODE_JOURNAL(inode)) {
517		err = ocfs2_inode_lock(inode, NULL, 0);
518		if (err) {
519			if (err != -ENOENT)
520				mlog_errno(err);
521			goto bail;
522		}
523		down_read(&OCFS2_I(inode)->ip_alloc_sem);
524	}
525
526	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
527		err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
528						  NULL);
529
530	if (!INODE_JOURNAL(inode)) {
531		up_read(&OCFS2_I(inode)->ip_alloc_sem);
532		ocfs2_inode_unlock(inode, 0);
533	}
534
535	if (err) {
536		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
537		     (unsigned long long)block);
538		mlog_errno(err);
539		goto bail;
540	}
541
542bail:
543	status = err ? 0 : p_blkno;
544
545	mlog_exit((int)status);
546
547	return status;
548}
549
550/*
551 * TODO: Make this into a generic get_blocks function.
552 *
553 * From do_direct_io in direct-io.c:
554 *  "So what we do is to permit the ->get_blocks function to populate
555 *   bh.b_size with the size of IO which is permitted at this offset and
556 *   this i_blkbits."
557 *
558 * This function is called directly from get_more_blocks in direct-io.c.
559 *
560 * called like this: dio->get_blocks(dio->inode, fs_startblk,
561 * 					fs_count, map_bh, dio->rw == WRITE);
562 */
563static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
564				     struct buffer_head *bh_result, int create)
565{
566	int ret;
567	u64 p_blkno, inode_blocks, contig_blocks;
568	unsigned int ext_flags;
569	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
570	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
571
572	/* This function won't even be called if the request isn't all
573	 * nicely aligned and of the right size, so there's no need
574	 * for us to check any of that. */
575
576	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
577
578	/*
579	 * Any write past EOF is not allowed because we'd be extending.
580	 */
581	if (create && (iblock + max_blocks) > inode_blocks) {
582		ret = -EIO;
583		goto bail;
584	}
585
586	/* This figures out the size of the next contiguous block, and
587	 * our logical offset */
588	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
589					  &contig_blocks, &ext_flags);
590	if (ret) {
591		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
592		     (unsigned long long)iblock);
593		ret = -EIO;
594		goto bail;
595	}
596
597	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
598		ocfs2_error(inode->i_sb,
599			    "Inode %llu has a hole at block %llu\n",
600			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
601			    (unsigned long long)iblock);
602		ret = -EROFS;
603		goto bail;
604	}
605
606	/*
607	 * get_more_blocks() expects us to describe a hole by clearing
608	 * the mapped bit on bh_result().
609	 *
610	 * Consider an unwritten extent as a hole.
611	 */
612	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
613		map_bh(bh_result, inode->i_sb, p_blkno);
614	else {
615		/*
616		 * ocfs2_prepare_inode_for_write() should have caught
617		 * the case where we'd be filling a hole and triggered
618		 * a buffered write instead.
619		 */
620		if (create) {
621			ret = -EIO;
622			mlog_errno(ret);
623			goto bail;
624		}
625
626		clear_buffer_mapped(bh_result);
627	}
628
629	/* make sure we don't map more than max_blocks blocks here as
630	   that's all the kernel will handle at this point. */
631	if (max_blocks < contig_blocks)
632		contig_blocks = max_blocks;
633	bh_result->b_size = contig_blocks << blocksize_bits;
634bail:
635	return ret;
636}
637
638/*
639 * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
640 * particularly interested in the aio/dio case.  Like the core uses
641 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
642 * truncation on another.
643 */
644static void ocfs2_dio_end_io(struct kiocb *iocb,
645			     loff_t offset,
646			     ssize_t bytes,
647			     void *private)
648{
649	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
650	int level;
651
652	/* this io's submitter should not have unlocked this before we could */
653	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
654
655	ocfs2_iocb_clear_rw_locked(iocb);
656
657	level = ocfs2_iocb_rw_locked_level(iocb);
658	if (!level)
659		up_read(&inode->i_alloc_sem);
660	ocfs2_rw_unlock(inode, level);
661}
662
663/*
664 * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
665 * from ext3.  PageChecked() bits have been removed as OCFS2 does not
666 * do journalled data.
667 */
668static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
669{
670	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
671
672	journal_invalidatepage(journal, page, offset);
673}
674
675static int ocfs2_releasepage(struct page *page, gfp_t wait)
676{
677	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
678
679	if (!page_has_buffers(page))
680		return 0;
681	return journal_try_to_free_buffers(journal, page, wait);
682}
683
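/*
 * ->direct_IO entry point. Inline-data inodes return 0 so the caller
 * falls back to buffered I/O; everything else goes through the
 * "no locking" blockdev helper, since the rw_lock taken by the
 * submitter (and dropped in ocfs2_dio_end_io() above) already
 * serializes direct I/O against truncation on other nodes.
 */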
684static ssize_t ocfs2_direct_IO(int rw,
685			       struct kiocb *iocb,
686			       const struct iovec *iov,
687			       loff_t offset,
688			       unsigned long nr_segs)
689{
690	struct file *file = iocb->ki_filp;
691	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
692	int ret;
693
694	mlog_entry_void();
695
696	/*
697	 * Fall back to buffered I/O if we see an inode with inline
698	 * data; it has no extents to map directly.
699	 */
700	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
701		return 0;
702
703	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
704					    inode->i_sb->s_bdev, iov, offset,
705					    nr_segs,
706					    ocfs2_direct_IO_get_blocks,
707					    ocfs2_dio_end_io);
708
709	mlog_exit(ret);
710	return ret;
711}
712
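/*
 * Compute the byte range that cluster @cpos occupies within its page.
 * When the cluster size is at least the page size, the whole page
 * belongs to the cluster and [0, PAGE_CACHE_SIZE) is returned.
 * Otherwise the offset follows from the cluster's position modulo the
 * clusters-per-page count: with 4K clusters on a 64K-page machine, for
 * example, cpos 3 maps to bytes [12288, 16384) of its page.
 */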
713static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
714					    u32 cpos,
715					    unsigned int *start,
716					    unsigned int *end)
717{
718	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
719
720	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
721		unsigned int cpp;
722
723		cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
724
725		cluster_start = cpos % cpp;
726		cluster_start = cluster_start << osb->s_clustersize_bits;
727
728		cluster_end = cluster_start + osb->s_clustersize;
729	}
730
731	BUG_ON(cluster_start > PAGE_SIZE);
732	BUG_ON(cluster_end > PAGE_SIZE);
733
734	if (start)
735		*start = cluster_start;
736	if (end)
737		*end = cluster_end;
738}
739
740/*
741 * 'from' and 'to' are the region in the page to avoid zeroing.
742 *
743 * If pagesize > clustersize, this function will avoid zeroing outside
744 * of the cluster boundary.
745 *
746 * from == to == 0 is code for "zero the entire cluster region"
747 */
748static void ocfs2_clear_page_regions(struct page *page,
749				     struct ocfs2_super *osb, u32 cpos,
750				     unsigned from, unsigned to)
751{
752	void *kaddr;
753	unsigned int cluster_start, cluster_end;
754
755	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
756
757	kaddr = kmap_atomic(page, KM_USER0);
758
759	if (from || to) {
760		if (from > cluster_start)
761			memset(kaddr + cluster_start, 0, from - cluster_start);
762		if (to < cluster_end)
763			memset(kaddr + to, 0, cluster_end - to);
764	} else {
765		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
766	}
767
768	kunmap_atomic(kaddr, KM_USER0);
769}
770
771/*
772 * Nonsparse file systems fully allocate before we get to the write
773 * code. This prevents ocfs2_write() from tagging the write as an
774 * allocating one, which means ocfs2_map_page_blocks() might try to
775 * read-in the blocks at the tail of our file. Avoid reading them by
776 * testing i_size against each block offset.
777 */
778static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
779				 unsigned int block_start)
780{
781	u64 offset = page_offset(page) + block_start;
782
783	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
784		return 1;
785
786	if (i_size_read(inode) > offset)
787		return 1;
788
789	return 0;
790}
791
792/*
793 * Some of this taken from block_prepare_write(). We already have our
794 * mapping by now though, and the entire write will be allocating or
795 * it won't, so not much need to use BH_New.
796 *
797 * This will also skip zeroing, which is handled externally.
798 */
799int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
800			  struct inode *inode, unsigned int from,
801			  unsigned int to, int new)
802{
803	int ret = 0;
804	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
805	unsigned int block_end, block_start;
806	unsigned int bsize = 1 << inode->i_blkbits;
807
808	if (!page_has_buffers(page))
809		create_empty_buffers(page, bsize, 0);
810
811	head = page_buffers(page);
812	for (bh = head, block_start = 0; bh != head || !block_start;
813	     bh = bh->b_this_page, block_start += bsize) {
814		block_end = block_start + bsize;
815
816		clear_buffer_new(bh);
817
818		/*
819		 * Ignore blocks outside of our i/o range -
820		 * they may belong to unallocated clusters.
821		 */
822		if (block_start >= to || block_end <= from) {
823			if (PageUptodate(page))
824				set_buffer_uptodate(bh);
825			continue;
826		}
827
828		/*
829		 * For an allocating write with cluster size >= page
830		 * size, we always write the entire page.
831		 */
832		if (new)
833			set_buffer_new(bh);
834
835		if (!buffer_mapped(bh)) {
836			map_bh(bh, inode->i_sb, *p_blkno);
837			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
838		}
839
840		if (PageUptodate(page)) {
841			if (!buffer_uptodate(bh))
842				set_buffer_uptodate(bh);
843		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
844			   !buffer_new(bh) &&
845			   ocfs2_should_read_blk(inode, page, block_start) &&
846			   (block_start < from || block_end > to)) {
847			ll_rw_block(READ, 1, &bh);
848			*wait_bh++ = bh;
849		}
850
851		*p_blkno = *p_blkno + 1;
852	}
853
854	/*
855	 * If we issued read requests - let them complete.
856	 */
857	while(wait_bh > wait) {
858		wait_on_buffer(*--wait_bh);
859		if (!buffer_uptodate(*wait_bh))
860			ret = -EIO;
861	}
862
863	if (ret == 0 || !new)
864		return ret;
865
866	/*
867	 * If we get -EIO above, zero out any newly allocated blocks
868	 * to avoid exposing stale data.
869	 */
870	bh = head;
871	block_start = 0;
872	do {
873		block_end = block_start + bsize;
874		if (block_end <= from)
875			goto next_bh;
876		if (block_start >= to)
877			break;
878
879		zero_user(page, block_start, bh->b_size);
880		set_buffer_uptodate(bh);
881		mark_buffer_dirty(bh);
882
883next_bh:
884		block_start = block_end;
885		bh = bh->b_this_page;
886	} while (bh != head);
887
888	return ret;
889}
890
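/*
 * Bounds for the bookkeeping arrays in struct ocfs2_write_ctxt below:
 * one cluster spans at most OCFS2_MAX_CTXT_PAGES pages, and one page
 * covers at most OCFS2_MAX_CLUSTERS_PER_PAGE clusters. With the common
 * 4K page size and ocfs2's 1MB maximum (4K minimum) cluster size these
 * come out to 256 pages and 1 cluster respectively.
 */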
891#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
892#define OCFS2_MAX_CTXT_PAGES	1
893#else
894#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
895#endif
896
897#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
898
899/*
900 * Describe the state of a single cluster to be written to.
901 */
902struct ocfs2_write_cluster_desc {
903	u32		c_cpos;
904	u32		c_phys;
905	/*
906	 * c_new gets its own field because c_phys is eventually
907	 * filled in and can no longer indicate a newly allocated cluster.
908	 */
909	unsigned	c_new;
910	unsigned	c_unwritten;
911};
912
913static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
914{
915	return d->c_new || d->c_unwritten;
916}
917
918struct ocfs2_write_ctxt {
919	/* Logical cluster position / len of write */
920	u32				w_cpos;
921	u32				w_clen;
922
923	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
924
925	/*
926	 * This is true if page_size > cluster_size.
927	 *
928	 * It triggers a set of special cases during write which might
929	 * have to deal with allocating writes to partial pages.
930	 */
931	unsigned int			w_large_pages;
932
933	/*
934	 * Pages involved in this write.
935	 *
936	 * w_target_page is the page being written to by the user.
937	 *
938	 * w_pages is an array of pages which always contains
939	 * w_target_page, and in the case of an allocating write with
940	 * page_size < cluster size, it will contain zero'd and mapped
941	 * pages adjacent to w_target_page which need to be written
942	 * out so that future reads from that region will get
943	 * zeros.
944	 */
945	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
946	unsigned int			w_num_pages;
947	struct page			*w_target_page;
948
949	/*
950	 * ocfs2_write_end() uses this to know what the real range to
951	 * write in the target should be.
952	 */
953	unsigned int			w_target_from;
954	unsigned int			w_target_to;
955
956	/*
957	 * We could use journal_current_handle() but this is cleaner,
958	 * IMHO -Mark
959	 */
960	handle_t			*w_handle;
961
962	struct buffer_head		*w_di_bh;
963
964	struct ocfs2_cached_dealloc_ctxt w_dealloc;
965};
966
967void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
968{
969	int i;
970
971	for(i = 0; i < num_pages; i++) {
972		if (pages[i]) {
973			unlock_page(pages[i]);
974			mark_page_accessed(pages[i]);
975			page_cache_release(pages[i]);
976		}
977	}
978}
979
980static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
981{
982	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
983
984	brelse(wc->w_di_bh);
985	kfree(wc);
986}
987
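/*
 * Allocate and initialize a write context for a write of @len bytes at
 * @pos: record the cluster-aligned start and length of the write, take
 * a reference on the dinode buffer, and note whether pages are larger
 * than clusters so that later code knows it must handle partial-page
 * zeroing.
 */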
988static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
989				  struct ocfs2_super *osb, loff_t pos,
990				  unsigned len, struct buffer_head *di_bh)
991{
992	u32 cend;
993	struct ocfs2_write_ctxt *wc;
994
995	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
996	if (!wc)
997		return -ENOMEM;
998
999	wc->w_cpos = pos >> osb->s_clustersize_bits;
1000	cend = (pos + len - 1) >> osb->s_clustersize_bits;
1001	wc->w_clen = cend - wc->w_cpos + 1;
1002	get_bh(di_bh);
1003	wc->w_di_bh = di_bh;
1004
1005	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1006		wc->w_large_pages = 1;
1007	else
1008		wc->w_large_pages = 0;
1009
1010	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
1011
1012	*wcp = wc;
1013
1014	return 0;
1015}
1016
1017/*
1018 * If a page has any new buffers, zero them out here, and mark them uptodate
1019 * and dirty so they'll be written out (in order to prevent uninitialised
1020 * block data from leaking). And clear the new bit.
1021 */
1022static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1023{
1024	unsigned int block_start, block_end;
1025	struct buffer_head *head, *bh;
1026
1027	BUG_ON(!PageLocked(page));
1028	if (!page_has_buffers(page))
1029		return;
1030
1031	bh = head = page_buffers(page);
1032	block_start = 0;
1033	do {
1034		block_end = block_start + bh->b_size;
1035
1036		if (buffer_new(bh)) {
1037			if (block_end > from && block_start < to) {
1038				if (!PageUptodate(page)) {
1039					unsigned start, end;
1040
1041					start = max(from, block_start);
1042					end = min(to, block_end);
1043
1044					zero_user_segment(page, start, end);
1045					set_buffer_uptodate(bh);
1046				}
1047
1048				clear_buffer_new(bh);
1049				mark_buffer_dirty(bh);
1050			}
1051		}
1052
1053		block_start = block_end;
1054		bh = bh->b_this_page;
1055	} while (bh != head);
1056}
1057
1058/*
1059 * Only called when we have a failure during an allocating write, to write
1060 * zeros to the newly allocated region.
1061 */
1062static void ocfs2_write_failure(struct inode *inode,
1063				struct ocfs2_write_ctxt *wc,
1064				loff_t user_pos, unsigned user_len)
1065{
1066	int i;
1067	unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
1068		to = from + user_len;
1069	struct page *tmppage;
1070
1071	ocfs2_zero_new_buffers(wc->w_target_page, from, to);
1072
1073	for(i = 0; i < wc->w_num_pages; i++) {
1074		tmppage = wc->w_pages[i];
1075
1076		if (ocfs2_should_order_data(inode))
1077			walk_page_buffers(wc->w_handle, page_buffers(tmppage),
1078					  from, to, NULL,
1079					  ocfs2_journal_dirty_data);
1080
1081		block_commit_write(tmppage, from, to);
1082	}
1083}
1084
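/*
 * Map (and, for newly allocated clusters, zero) the buffers of a single
 * page involved in the write. The target page is only mapped over the
 * range the user will copy into unless this is an allocating write, in
 * which case the entire cluster range within the page is prepared.
 * Pages other than the target may only show up for an allocating write
 * (see the BUG_ON below).
 */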
1085static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
1086					struct ocfs2_write_ctxt *wc,
1087					struct page *page, u32 cpos,
1088					loff_t user_pos, unsigned user_len,
1089					int new)
1090{
1091	int ret;
1092	unsigned int map_from = 0, map_to = 0;
1093	unsigned int cluster_start, cluster_end;
1094	unsigned int user_data_from = 0, user_data_to = 0;
1095
1096	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
1097					&cluster_start, &cluster_end);
1098
1099	if (page == wc->w_target_page) {
1100		map_from = user_pos & (PAGE_CACHE_SIZE - 1);
1101		map_to = map_from + user_len;
1102
1103		if (new)
1104			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
1105						    cluster_start, cluster_end,
1106						    new);
1107		else
1108			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
1109						    map_from, map_to, new);
1110		if (ret) {
1111			mlog_errno(ret);
1112			goto out;
1113		}
1114
1115		user_data_from = map_from;
1116		user_data_to = map_to;
1117		if (new) {
1118			map_from = cluster_start;
1119			map_to = cluster_end;
1120		}
1121	} else {
1122		/*
1123		 * If we haven't allocated the new page yet, we
1124		 * shouldn't be writing it out without copying user
1125		 * data. This is likely a math error from the caller.
1126		 */
1127		BUG_ON(!new);
1128
1129		map_from = cluster_start;
1130		map_to = cluster_end;
1131
1132		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
1133					    cluster_start, cluster_end, new);
1134		if (ret) {
1135			mlog_errno(ret);
1136			goto out;
1137		}
1138	}
1139
1140	/*
1141	 * Parts of newly allocated pages need to be zero'd.
1142	 *
1143	 * Above, we have also rewritten 'to' and 'from' - as far as
1144	 * the rest of the function is concerned, the entire cluster
1145	 * range inside of a page needs to be written.
1146	 *
1147	 * We can skip this if the page is up to date - it's already
1148	 * been zero'd from being read in as a hole.
1149	 */
1150	if (new && !PageUptodate(page))
1151		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1152					 cpos, user_data_from, user_data_to);
1153
1154	flush_dcache_page(page);
1155
1156out:
1157	return ret;
1158}
1159
1160/*
1161 * This function will only grab one cluster's worth of pages.
1162 */
1163static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1164				      struct ocfs2_write_ctxt *wc,
1165				      u32 cpos, loff_t user_pos, int new,
1166				      struct page *mmap_page)
1167{
1168	int ret = 0, i;
1169	unsigned long start, target_index, index;
1170	struct inode *inode = mapping->host;
1171
1172	target_index = user_pos >> PAGE_CACHE_SHIFT;
1173
1174	/*
1175	 * Figure out how many pages we'll be manipulating here. For
1176	 * non-allocating write, we just change the one
1177	 * page. Otherwise, we'll need a whole cluster's worth.
1178	 */
1179	if (new) {
1180		wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1181		start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1182	} else {
1183		wc->w_num_pages = 1;
1184		start = target_index;
1185	}
1186
1187	for(i = 0; i < wc->w_num_pages; i++) {
1188		index = start + i;
1189
1190		if (index == target_index && mmap_page) {
1191			/*
1192			 * ocfs2_pagemkwrite() is a little different
1193			 * and wants us to directly use the page
1194			 * passed in.
1195			 */
1196			lock_page(mmap_page);
1197
1198			if (mmap_page->mapping != mapping) {
1199				unlock_page(mmap_page);
1200				/*
1201				 * Sanity check - the locking in
1202				 * ocfs2_pagemkwrite() should ensure
1203				 * that this code doesn't trigger.
1204				 */
1205				ret = -EINVAL;
1206				mlog_errno(ret);
1207				goto out;
1208			}
1209
1210			page_cache_get(mmap_page);
1211			wc->w_pages[i] = mmap_page;
1212		} else {
1213			wc->w_pages[i] = find_or_create_page(mapping, index,
1214							     GFP_NOFS);
1215			if (!wc->w_pages[i]) {
1216				ret = -ENOMEM;
1217				mlog_errno(ret);
1218				goto out;
1219			}
1220		}
1221
1222		if (index == target_index)
1223			wc->w_target_page = wc->w_pages[i];
1224	}
1225out:
1226	return ret;
1227}
1228
1229/*
1230 * Prepare a single cluster of the file for writing.
1231 */
1232static int ocfs2_write_cluster(struct address_space *mapping,
1233			       u32 phys, unsigned int unwritten,
1234			       struct ocfs2_alloc_context *data_ac,
1235			       struct ocfs2_alloc_context *meta_ac,
1236			       struct ocfs2_write_ctxt *wc, u32 cpos,
1237			       loff_t user_pos, unsigned user_len)
1238{
1239	int ret, i, new, should_zero = 0;
1240	u64 v_blkno, p_blkno;
1241	struct inode *inode = mapping->host;
1242
1243	new = phys == 0 ? 1 : 0;
1244	if (new || unwritten)
1245		should_zero = 1;
1246
1247	if (new) {
1248		u32 tmp_pos;
1249
1250		/*
1251		 * This is safe to call with the page locks held - it won't take
1252		 * any additional semaphores or cluster locks.
1253		 */
1254		tmp_pos = cpos;
1255		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1256						 &tmp_pos, 1, 0, wc->w_di_bh,
1257						 wc->w_handle, data_ac,
1258						 meta_ac, NULL);
1259		/*
1260		 * This shouldn't happen because we must have already
1261		 * calculated the correct meta data allocation required. The
1262		 * internal tree allocation code should know how to increase
1263		 * transaction credits itself.
1264		 *
1265		 * If need be, we could handle -EAGAIN for a
1266		 * RESTART_TRANS here.
1267		 */
1268		mlog_bug_on_msg(ret == -EAGAIN,
1269				"Inode %llu: EAGAIN return during allocation.\n",
1270				(unsigned long long)OCFS2_I(inode)->ip_blkno);
1271		if (ret < 0) {
1272			mlog_errno(ret);
1273			goto out;
1274		}
1275	} else if (unwritten) {
1276		ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
1277						wc->w_handle, cpos, 1, phys,
1278						meta_ac, &wc->w_dealloc);
1279		if (ret < 0) {
1280			mlog_errno(ret);
1281			goto out;
1282		}
1283	}
1284
1285	if (should_zero)
1286		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1287	else
1288		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1289
1290	/*
1291	 * The only reason this should fail is due to an inability to
1292	 * find the extent added.
1293	 */
1294	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1295					  NULL);
1296	if (ret < 0) {
1297		ocfs2_error(inode->i_sb, "Corrupt extent map for inode %llu, "
1298			    "at logical block %llu",
1299			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
1300			    (unsigned long long)v_blkno);
1301		goto out;
1302	}
1303
1304	BUG_ON(p_blkno == 0);
1305
1306	for(i = 0; i < wc->w_num_pages; i++) {
1307		int tmpret;
1308
1309		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1310						      wc->w_pages[i], cpos,
1311						      user_pos, user_len,
1312						      should_zero);
1313		if (tmpret) {
1314			mlog_errno(tmpret);
1315			if (ret == 0)
1316				ret = tmpret;
1317		}
1318	}
1319
1320	/*
1321	 * We only have cleanup to do in case of allocating write.
1322	 */
1323	if (ret && new)
1324		ocfs2_write_failure(inode, wc, user_pos, user_len);
1325
1326out:
1327
1328	return ret;
1329}
1330
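/*
 * Walk the per-cluster descriptors built by ocfs2_populate_write_desc()
 * and call ocfs2_write_cluster() on each one, clamping the length
 * passed down so that a single call never crosses a cluster boundary.
 */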
1331static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1332				       struct ocfs2_alloc_context *data_ac,
1333				       struct ocfs2_alloc_context *meta_ac,
1334				       struct ocfs2_write_ctxt *wc,
1335				       loff_t pos, unsigned len)
1336{
1337	int ret, i;
1338	loff_t cluster_off;
1339	unsigned int local_len = len;
1340	struct ocfs2_write_cluster_desc *desc;
1341	struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
1342
1343	for (i = 0; i < wc->w_clen; i++) {
1344		desc = &wc->w_desc[i];
1345
1346		/*
1347		 * We have to make sure that the total write passed in
1348		 * doesn't extend past a single cluster.
1349		 */
1350		local_len = len;
1351		cluster_off = pos & (osb->s_clustersize - 1);
1352		if ((cluster_off + local_len) > osb->s_clustersize)
1353			local_len = osb->s_clustersize - cluster_off;
1354
1355		ret = ocfs2_write_cluster(mapping, desc->c_phys,
1356					  desc->c_unwritten, data_ac, meta_ac,
1357					  wc, desc->c_cpos, pos, local_len);
1358		if (ret) {
1359			mlog_errno(ret);
1360			goto out;
1361		}
1362
1363		len -= local_len;
1364		pos += local_len;
1365	}
1366
1367	ret = 0;
1368out:
1369	return ret;
1370}
1371
1372/*
1373 * ocfs2_write_end() wants to know which parts of the target page it
1374 * should complete the write on. It's easiest to compute them ahead of
1375 * time when a more complete view of the write is available.
1376 */
1377static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1378					struct ocfs2_write_ctxt *wc,
1379					loff_t pos, unsigned len, int alloc)
1380{
1381	struct ocfs2_write_cluster_desc *desc;
1382
1383	wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
1384	wc->w_target_to = wc->w_target_from + len;
1385
1386	if (alloc == 0)
1387		return;
1388
1389	/*
1390	 * Allocating write - we may have different boundaries based
1391	 * on page size and cluster size.
1392	 *
1393	 * NOTE: We can no longer compute one value from the other as
1394	 * the actual write length and user provided length may be
1395	 * different.
1396	 */
1397
1398	if (wc->w_large_pages) {
1399		/*
1400		 * We only care about the 1st and last cluster within
1401		 * our range and whether they should be zero'd or not. Either
1402		 * value may be extended out to the start/end of a
1403		 * newly allocated cluster.
1404		 */
1405		desc = &wc->w_desc[0];
1406		if (ocfs2_should_zero_cluster(desc))
1407			ocfs2_figure_cluster_boundaries(osb,
1408							desc->c_cpos,
1409							&wc->w_target_from,
1410							NULL);
1411
1412		desc = &wc->w_desc[wc->w_clen - 1];
1413		if (ocfs2_should_zero_cluster(desc))
1414			ocfs2_figure_cluster_boundaries(osb,
1415							desc->c_cpos,
1416							NULL,
1417							&wc->w_target_to);
1418	} else {
1419		wc->w_target_from = 0;
1420		wc->w_target_to = PAGE_CACHE_SIZE;
1421	}
1422}
1423
1424/*
1425 * Populate each single-cluster write descriptor in the write context
1426 * with information about the i/o to be done.
1427 *
1428 * Returns the number of clusters that will have to be allocated, as
1429 * well as a worst case estimate of the number of extent records that
1430 * would have to be created during a write to an unwritten region.
1431 */
1432static int ocfs2_populate_write_desc(struct inode *inode,
1433				     struct ocfs2_write_ctxt *wc,
1434				     unsigned int *clusters_to_alloc,
1435				     unsigned int *extents_to_split)
1436{
1437	int ret;
1438	struct ocfs2_write_cluster_desc *desc;
1439	unsigned int num_clusters = 0;
1440	unsigned int ext_flags = 0;
1441	u32 phys = 0;
1442	int i;
1443
1444	*clusters_to_alloc = 0;
1445	*extents_to_split = 0;
1446
1447	for (i = 0; i < wc->w_clen; i++) {
1448		desc = &wc->w_desc[i];
1449		desc->c_cpos = wc->w_cpos + i;
1450
1451		if (num_clusters == 0) {
1452			/*
1453			 * Need to look up the next extent record.
1454			 */
1455			ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
1456						 &num_clusters, &ext_flags);
1457			if (ret) {
1458				mlog_errno(ret);
1459				goto out;
1460			}
1461
1462			/*
1463			 * Assume worst case - that we're writing in
1464			 * the middle of the extent.
1465			 *
1466			 * We can assume that the write proceeds from
1467			 * left to right, in which case the extent
1468			 * insert code is smart enough to coalesce the
1469			 * next splits into the previous records created.
1470			 */
1471			if (ext_flags & OCFS2_EXT_UNWRITTEN)
1472				*extents_to_split = *extents_to_split + 2;
1473		} else if (phys) {
1474			/*
1475			 * Only increment phys if it doesn't describe
1476			 * a hole.
1477			 */
1478			phys++;
1479		}
1480
1481		desc->c_phys = phys;
1482		if (phys == 0) {
1483			desc->c_new = 1;
1484			*clusters_to_alloc = *clusters_to_alloc + 1;
1485		}
1486		if (ext_flags & OCFS2_EXT_UNWRITTEN)
1487			desc->c_unwritten = 1;
1488
1489		num_clusters--;
1490	}
1491
1492	ret = 0;
1493out:
1494	return ret;
1495}
1496
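/*
 * Begin a write that will be stored as inline data: pin page 0 of the
 * mapping as the target page, start a transaction, get journal access
 * on the dinode, and make sure the page reflects any existing inline
 * contents before the caller copies the new data in.
 */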
1497static int ocfs2_write_begin_inline(struct address_space *mapping,
1498				    struct inode *inode,
1499				    struct ocfs2_write_ctxt *wc)
1500{
1501	int ret;
1502	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1503	struct page *page;
1504	handle_t *handle;
1505	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1506
1507	page = find_or_create_page(mapping, 0, GFP_NOFS);
1508	if (!page) {
1509		ret = -ENOMEM;
1510		mlog_errno(ret);
1511		goto out;
1512	}
1513	/*
1514	 * If we don't set w_num_pages then this page won't get unlocked
1515	 * and freed on cleanup of the write context.
1516	 */
1517	wc->w_pages[0] = wc->w_target_page = page;
1518	wc->w_num_pages = 1;
1519
1520	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1521	if (IS_ERR(handle)) {
1522		ret = PTR_ERR(handle);
1523		mlog_errno(ret);
1524		goto out;
1525	}
1526
1527	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1528				   OCFS2_JOURNAL_ACCESS_WRITE);
1529	if (ret) {
1530		ocfs2_commit_trans(osb, handle);
1531
1532		mlog_errno(ret);
1533		goto out;
1534	}
1535
1536	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1537		ocfs2_set_inode_data_inline(inode, di);
1538
1539	if (!PageUptodate(page)) {
1540		ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
1541		if (ret) {
1542			ocfs2_commit_trans(osb, handle);
1543
1544			goto out;
1545		}
1546	}
1547
1548	wc->w_handle = handle;
1549out:
1550	return ret;
1551}
1552
1553int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
1554{
1555	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1556
1557	if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
1558		return 1;
1559	return 0;
1560}
1561
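/*
 * Decide whether this write can be served with inline data. Inodes
 * that already carry inline data either stay inline (if the write fits
 * and isn't a mmap fault) or get converted to an extent list. Empty
 * inodes may be switched to inline storage when the write fits.
 * Returns 1 if the caller should take the inline path, 0 for the
 * normal extent-based path, or a negative error.
 */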
1562static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1563					  struct inode *inode, loff_t pos,
1564					  unsigned len, struct page *mmap_page,
1565					  struct ocfs2_write_ctxt *wc)
1566{
1567	int ret, written = 0;
1568	loff_t end = pos + len;
1569	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1570
1571	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
1572	     (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
1573	     oi->ip_dyn_features);
1574
1575	/*
1576	 * Handle inodes which already have inline data 1st.
1577	 */
1578	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1579		if (mmap_page == NULL &&
1580		    ocfs2_size_fits_inline_data(wc->w_di_bh, end))
1581			goto do_inline_write;
1582
1583		/*
1584		 * The write won't fit - we have to give this inode an
1585		 * inline extent list now.
1586		 */
1587		ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
1588		if (ret)
1589			mlog_errno(ret);
1590		goto out;
1591	}
1592
1593	/*
1594	 * Check whether the inode can accept inline data.
1595	 */
1596	if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
1597		return 0;
1598
1599	/*
1600	 * Check whether the write can fit.
1601	 */
1602	if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb))
1603		return 0;
1604
1605do_inline_write:
1606	ret = ocfs2_write_begin_inline(mapping, inode, wc);
1607	if (ret) {
1608		mlog_errno(ret);
1609		goto out;
1610	}
1611
1612	/*
1613	 * This signals to the caller that the data can be written
1614	 * inline.
1615	 */
1616	written = 1;
1617out:
1618	return written ? written : ret;
1619}
1620
1621/*
1622 * This function only does anything for file systems which can't
1623 * handle sparse files.
1624 *
1625 * What we want to do here is fill in any hole between the current end
1626 * of allocation and the end of our write. That way the rest of the
1627 * write path can treat it as a non-allocating write, which has no
1628 * special case code for sparse/nonsparse files.
1629 */
1630static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
1631					unsigned len,
1632					struct ocfs2_write_ctxt *wc)
1633{
1634	int ret;
1635	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1636	loff_t newsize = pos + len;
1637
1638	if (ocfs2_sparse_alloc(osb))
1639		return 0;
1640
1641	if (newsize <= i_size_read(inode))
1642		return 0;
1643
1644	ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
1645	if (ret)
1646		mlog_errno(ret);
1647
1648	return ret;
1649}
1650
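/*
 * Core of ->write_begin, shared with the mmap fault path, which passes
 * the faulting page in @mmap_page. The caller must already hold the
 * inode cluster lock and ip_alloc_sem. This builds the write context,
 * reserves any data and metadata allocation that will be needed,
 * starts the transaction, locks the pages involved and maps or
 * allocates every cluster the write touches, so that little is left
 * for ocfs2_write_end() that can fail.
 */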
1651int ocfs2_write_begin_nolock(struct address_space *mapping,
1652			     loff_t pos, unsigned len, unsigned flags,
1653			     struct page **pagep, void **fsdata,
1654			     struct buffer_head *di_bh, struct page *mmap_page)
1655{
1656	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1657	unsigned int clusters_to_alloc, extents_to_split;
1658	struct ocfs2_write_ctxt *wc;
1659	struct inode *inode = mapping->host;
1660	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1661	struct ocfs2_dinode *di;
1662	struct ocfs2_alloc_context *data_ac = NULL;
1663	struct ocfs2_alloc_context *meta_ac = NULL;
1664	handle_t *handle;
1665
1666	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1667	if (ret) {
1668		mlog_errno(ret);
1669		return ret;
1670	}
1671
1672	if (ocfs2_supports_inline_data(osb)) {
1673		ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
1674						     mmap_page, wc);
1675		if (ret == 1) {
1676			ret = 0;
1677			goto success;
1678		}
1679		if (ret < 0) {
1680			mlog_errno(ret);
1681			goto out;
1682		}
1683	}
1684
1685	ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
1686	if (ret) {
1687		mlog_errno(ret);
1688		goto out;
1689	}
1690
1691	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1692					&extents_to_split);
1693	if (ret) {
1694		mlog_errno(ret);
1695		goto out;
1696	}
1697
1698	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1699
1700	/*
1701	 * We set w_target_from, w_target_to here so that
1702	 * ocfs2_write_end() knows which range in the target page to
1703	 * write out. An allocation requires that we write the entire
1704	 * cluster range.
1705	 */
1706	if (clusters_to_alloc || extents_to_split) {
1707		/*
1708		 * XXX: We are stretching the limits of
1709		 * ocfs2_lock_allocators(). It greatly over-estimates
1710		 * the work to be done.
1711		 */
1712		ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
1713					    extents_to_split, &data_ac, &meta_ac);
1714		if (ret) {
1715			mlog_errno(ret);
1716			goto out;
1717		}
1718
1719		credits = ocfs2_calc_extend_credits(inode->i_sb, di,
1720						    clusters_to_alloc);
1721
1722	}
1723
1724	ocfs2_set_target_boundaries(osb, wc, pos, len,
1725				    clusters_to_alloc + extents_to_split);
1726
1727	handle = ocfs2_start_trans(osb, credits);
1728	if (IS_ERR(handle)) {
1729		ret = PTR_ERR(handle);
1730		mlog_errno(ret);
1731		goto out;
1732	}
1733
1734	wc->w_handle = handle;
1735
1736	/*
1737	 * We don't want this to fail in ocfs2_write_end(), so do it
1738	 * here.
1739	 */
1740	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1741				   OCFS2_JOURNAL_ACCESS_WRITE);
1742	if (ret) {
1743		mlog_errno(ret);
1744		goto out_commit;
1745	}
1746
1747	/*
1748	 * Fill our page array first. That way we've grabbed enough so
1749	 * that we can zero and flush if we error after adding the
1750	 * extent.
1751	 */
1752	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
1753					 clusters_to_alloc + extents_to_split,
1754					 mmap_page);
1755	if (ret) {
1756		mlog_errno(ret);
1757		goto out_commit;
1758	}
1759
1760	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1761					  len);
1762	if (ret) {
1763		mlog_errno(ret);
1764		goto out_commit;
1765	}
1766
1767	if (data_ac)
1768		ocfs2_free_alloc_context(data_ac);
1769	if (meta_ac)
1770		ocfs2_free_alloc_context(meta_ac);
1771
1772success:
1773	*pagep = wc->w_target_page;
1774	*fsdata = wc;
1775	return 0;
1776out_commit:
1777	ocfs2_commit_trans(osb, handle);
1778
1779out:
1780	ocfs2_free_write_ctxt(wc);
1781
1782	if (data_ac)
1783		ocfs2_free_alloc_context(data_ac);
1784	if (meta_ac)
1785		ocfs2_free_alloc_context(meta_ac);
1786	return ret;
1787}
1788
1789static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1790			     loff_t pos, unsigned len, unsigned flags,
1791			     struct page **pagep, void **fsdata)
1792{
1793	int ret;
1794	struct buffer_head *di_bh = NULL;
1795	struct inode *inode = mapping->host;
1796
1797	ret = ocfs2_inode_lock(inode, &di_bh, 1);
1798	if (ret) {
1799		mlog_errno(ret);
1800		return ret;
1801	}
1802
1803	/*
1804	 * Take alloc sem here to prevent concurrent lookups. That way
1805	 * the mapping, zeroing and tree manipulation within
1806	 * ocfs2_write() will be safe against ->readpage(). This
1807	 * should also serve to lock out allocation from a shared
1808	 * writeable region.
1809	 */
1810	down_write(&OCFS2_I(inode)->ip_alloc_sem);
1811
1812	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1813				       fsdata, di_bh, NULL);
1814	if (ret) {
1815		mlog_errno(ret);
1816		goto out_fail;
1817	}
1818
1819	brelse(di_bh);
1820
1821	return 0;
1822
1823out_fail:
1824	up_write(&OCFS2_I(inode)->ip_alloc_sem);
1825
1826	brelse(di_bh);
1827	ocfs2_inode_unlock(inode, 1);
1828
1829	return ret;
1830}
1831
1832static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
1833				   unsigned len, unsigned *copied,
1834				   struct ocfs2_dinode *di,
1835				   struct ocfs2_write_ctxt *wc)
1836{
1837	void *kaddr;
1838
1839	if (unlikely(*copied < len)) {
1840		if (!PageUptodate(wc->w_target_page)) {
1841			*copied = 0;
1842			return;
1843		}
1844	}
1845
1846	kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
1847	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
1848	kunmap_atomic(kaddr, KM_USER0);
1849
1850	mlog(0, "Data written to inode at offset %llu. "
1851	     "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
1852	     (unsigned long long)pos, *copied,
1853	     le16_to_cpu(di->id2.i_data.id_count),
1854	     le16_to_cpu(di->i_dyn_features));
1855}
1856
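/*
 * Core of ->write_end, shared with the mmap fault path. Zeros any new
 * buffers the copy did not reach, commits the written range of every
 * page in the context, updates i_size and the timestamps in both the
 * in-memory inode and the dinode, dirties the dinode in the journal,
 * commits the transaction and tears down the write context. Returns
 * the number of bytes copied.
 */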
1857int ocfs2_write_end_nolock(struct address_space *mapping,
1858			   loff_t pos, unsigned len, unsigned copied,
1859			   struct page *page, void *fsdata)
1860{
1861	int i;
1862	unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
1863	struct inode *inode = mapping->host;
1864	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1865	struct ocfs2_write_ctxt *wc = fsdata;
1866	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1867	handle_t *handle = wc->w_handle;
1868	struct page *tmppage;
1869
1870	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1871		ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
1872		goto out_write_size;
1873	}
1874
1875	if (unlikely(copied < len)) {
1876		if (!PageUptodate(wc->w_target_page))
1877			copied = 0;
1878
1879		ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
1880				       start+len);
1881	}
1882	flush_dcache_page(wc->w_target_page);
1883
1884	for(i = 0; i < wc->w_num_pages; i++) {
1885		tmppage = wc->w_pages[i];
1886
1887		if (tmppage == wc->w_target_page) {
1888			from = wc->w_target_from;
1889			to = wc->w_target_to;
1890
1891			BUG_ON(from > PAGE_CACHE_SIZE ||
1892			       to > PAGE_CACHE_SIZE ||
1893			       to < from);
1894		} else {
1895			/*
1896			 * Pages adjacent to the target (if any) imply
1897			 * a hole-filling write in which case we want
1898			 * to flush their entire range.
1899			 */
1900			from = 0;
1901			to = PAGE_CACHE_SIZE;
1902		}
1903
1904		if (ocfs2_should_order_data(inode))
1905			walk_page_buffers(wc->w_handle, page_buffers(tmppage),
1906					  from, to, NULL,
1907					  ocfs2_journal_dirty_data);
1908
1909		block_commit_write(tmppage, from, to);
1910	}
1911
1912out_write_size:
1913	pos += copied;
1914	if (pos > inode->i_size) {
1915		i_size_write(inode, pos);
1916		mark_inode_dirty(inode);
1917	}
1918	inode->i_blocks = ocfs2_inode_sector_count(inode);
1919	di->i_size = cpu_to_le64((u64)i_size_read(inode));
1920	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1921	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1922	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1923	ocfs2_journal_dirty(handle, wc->w_di_bh);
1924
1925	ocfs2_commit_trans(osb, handle);
1926
1927	ocfs2_run_deallocs(osb, &wc->w_dealloc);
1928
1929	ocfs2_free_write_ctxt(wc);
1930
1931	return copied;
1932}
1933
1934static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1935			   loff_t pos, unsigned len, unsigned copied,
1936			   struct page *page, void *fsdata)
1937{
1938	int ret;
1939	struct inode *inode = mapping->host;
1940
1941	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1942
1943	up_write(&OCFS2_I(inode)->ip_alloc_sem);
1944	ocfs2_inode_unlock(inode, 1);
1945
1946	return ret;
1947}
1948
1949const struct address_space_operations ocfs2_aops = {
1950	.readpage	= ocfs2_readpage,
1951	.readpages	= ocfs2_readpages,
1952	.writepage	= ocfs2_writepage,
1953	.write_begin	= ocfs2_write_begin,
1954	.write_end	= ocfs2_write_end,
1955	.bmap		= ocfs2_bmap,
1956	.sync_page	= block_sync_page,
1957	.direct_IO	= ocfs2_direct_IO,
1958	.invalidatepage	= ocfs2_invalidatepage,
1959	.releasepage	= ocfs2_releasepage,
1960	.migratepage	= buffer_migrate_page,
1961};
1962