aops.c revision a81cb88b64a479b78c6dd5666678d50171865db8
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include <linux/fs.h>
23#include <linux/slab.h>
24#include <linux/highmem.h>
25#include <linux/pagemap.h>
26#include <asm/byteorder.h>
27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h>
30
31#define MLOG_MASK_PREFIX ML_FILE_IO
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "alloc.h"
37#include "aops.h"
38#include "dlmglue.h"
39#include "extent_map.h"
40#include "file.h"
41#include "inode.h"
42#include "journal.h"
43#include "suballoc.h"
44#include "super.h"
45#include "symlink.h"
46
47#include "buffer_head_io.h"
48
49static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
50				   struct buffer_head *bh_result, int create)
51{
52	int err = -EIO;
53	int status;
54	struct ocfs2_dinode *fe = NULL;
55	struct buffer_head *bh = NULL;
56	struct buffer_head *buffer_cache_bh = NULL;
57	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
58	void *kaddr;
59
60	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
61		   (unsigned long long)iblock, bh_result, create);
62
63	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
64
65	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
66		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
67		     (unsigned long long)iblock);
68		goto bail;
69	}
70
71	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
72				  OCFS2_I(inode)->ip_blkno,
73				  &bh, OCFS2_BH_CACHED, inode);
74	if (status < 0) {
75		mlog_errno(status);
76		goto bail;
77	}
78	fe = (struct ocfs2_dinode *) bh->b_data;
79
80	if (!OCFS2_IS_VALID_DINODE(fe)) {
81		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
82		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
83		     fe->i_signature);
84		goto bail;
85	}
86
87	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
88						    le32_to_cpu(fe->i_clusters))) {
89		mlog(ML_ERROR, "block offset is outside the allocated size: "
90		     "%llu\n", (unsigned long long)iblock);
91		goto bail;
92	}
93
94	/* We don't use the page cache to create symlink data, so if
95	 * need be, copy it over from the buffer cache. */
96	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
97		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
98			    iblock;
99		buffer_cache_bh = sb_getblk(osb->sb, blkno);
100		if (!buffer_cache_bh) {
101			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
102			goto bail;
103		}
104
105		/* we haven't locked out transactions, so a commit
106		 * could've happened. Since we've got a reference on
107		 * the bh, even if it commits while we're doing the
108		 * copy, the data is still good. */
109		if (buffer_jbd(buffer_cache_bh)
110		    && ocfs2_inode_is_new(inode)) {
111			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
112			if (!kaddr) {
113				mlog(ML_ERROR, "couldn't kmap!\n");
114				goto bail;
115			}
116			memcpy(kaddr + (bh_result->b_size * iblock),
117			       buffer_cache_bh->b_data,
118			       bh_result->b_size);
119			kunmap_atomic(kaddr, KM_USER0);
120			set_buffer_uptodate(bh_result);
121		}
122		brelse(buffer_cache_bh);
123	}
124
125	map_bh(bh_result, inode->i_sb,
126	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
127
128	err = 0;
129
130bail:
131	brelse(bh);
132
133	mlog_exit(err);
134	return err;
135}
136
137static int ocfs2_get_block(struct inode *inode, sector_t iblock,
138			   struct buffer_head *bh_result, int create)
139{
140	int err = 0;
141	unsigned int ext_flags;
142	u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
143	u64 p_blkno, count, past_eof;
144	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
145
146	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
147		   (unsigned long long)iblock, bh_result, create);
148
149	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
150		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
151		     inode, inode->i_ino);
152
153	if (S_ISLNK(inode->i_mode)) {
154		/* this always does I/O for some reason. */
155		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
156		goto bail;
157	}
158
159	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
160					  &ext_flags);
161	if (err) {
162		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
163		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
164		     (unsigned long long)p_blkno);
165		goto bail;
166	}
167
168	if (max_blocks < count)
169		count = max_blocks;
170
171	/*
172	 * ocfs2 never allocates in this function - the only time we
173	 * need to use BH_New is when we're extending i_size on a file
174	 * system which doesn't support holes, in which case BH_New
175	 * allows block_prepare_write() to zero.
176	 *
177	 * If we see this on a sparse file system, then a truncate has
178	 * raced us and removed the cluster. In this case, we clear
179	 * the buffers dirty and uptodate bits and let the buffer code
180	 * ignore it as a hole.
181	 */
182	if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
183		clear_buffer_dirty(bh_result);
184		clear_buffer_uptodate(bh_result);
185		goto bail;
186	}
187
188	/* Treat the unwritten extent as a hole for zeroing purposes. */
189	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
190		map_bh(bh_result, inode->i_sb, p_blkno);
191
192	bh_result->b_size = count << inode->i_blkbits;
193
194	if (!ocfs2_sparse_alloc(osb)) {
195		if (p_blkno == 0) {
196			err = -EIO;
197			mlog(ML_ERROR,
198			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
199			     (unsigned long long)iblock,
200			     (unsigned long long)p_blkno,
201			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
202			mlog(ML_ERROR, "Size %llu, clusters %u\n",
			     (unsigned long long)i_size_read(inode),
			     OCFS2_I(inode)->ip_clusters);
203			dump_stack();
204		}
205
206		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
207		mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
208		     (unsigned long long)past_eof);
209
210		if (create && (iblock >= past_eof))
211			set_buffer_new(bh_result);
212	}
213
214bail:
215	if (err < 0)
216		err = -EIO;
217
218	mlog_exit(err);
219	return err;
220}
221
222int ocfs2_read_inline_data(struct inode *inode, struct page *page,
223			   struct buffer_head *di_bh)
224{
225	void *kaddr;
226	loff_t size;
227	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
228
229	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
230		ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
231			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
232		return -EROFS;
233	}
234
235	size = i_size_read(inode);
236
237	if (size > PAGE_CACHE_SIZE ||
238	    size > ocfs2_max_inline_data(inode->i_sb)) {
239		ocfs2_error(inode->i_sb,
240			    "Inode %llu has inline data with a bad size: %Lu",
241			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
242			    (unsigned long long)size);
243		return -EROFS;
244	}
245
246	kaddr = kmap_atomic(page, KM_USER0);
247	if (size)
248		memcpy(kaddr, di->id2.i_data.id_data, size);
249	/* Clear the remaining part of the page */
250	memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
251	flush_dcache_page(page);
252	kunmap_atomic(kaddr, KM_USER0);
253
254	SetPageUptodate(page);
255
256	return 0;
257}
258
259static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
260{
261	int ret;
262	struct buffer_head *di_bh = NULL;
263	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
264
265	BUG_ON(!PageLocked(page));
266	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
267
268	ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
269			       OCFS2_BH_CACHED, inode);
270	if (ret) {
271		mlog_errno(ret);
272		goto out;
273	}
274
275	ret = ocfs2_read_inline_data(inode, page, di_bh);
276out:
277	unlock_page(page);
278
279	brelse(di_bh);
280	return ret;
281}
282
283static int ocfs2_readpage(struct file *file, struct page *page)
284{
285	struct inode *inode = page->mapping->host;
286	struct ocfs2_inode_info *oi = OCFS2_I(inode);
287	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
288	int ret, unlock = 1;
289
290	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
291
292	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
293	if (ret != 0) {
294		if (ret == AOP_TRUNCATED_PAGE)
295			unlock = 0;
296		mlog_errno(ret);
297		goto out;
298	}
299
300	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
301		ret = AOP_TRUNCATED_PAGE;
302		goto out_inode_unlock;
303	}
304
305	/*
306	 * i_size might have just been updated as we grabbed the meta lock.  We
307	 * might now be discovering a truncate that hit on another node.
308	 * block_read_full_page->get_block freaks out if it is asked to read
309	 * beyond the end of a file, so we check here.  Callers
310	 * (generic_file_read, vm_ops->fault) are clever enough to check i_size
311	 * and notice that the page they just read isn't needed.
312	 *
313	 * XXX sys_readahead() seems to get that wrong?
314	 */
315	if (start >= i_size_read(inode)) {
316		zero_user(page, 0, PAGE_SIZE);
317		SetPageUptodate(page);
318		ret = 0;
319		goto out_alloc;
320	}
321
322	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
323		ret = ocfs2_readpage_inline(inode, page);
324	else
325		ret = block_read_full_page(page, ocfs2_get_block);
326	unlock = 0;
327
328out_alloc:
329	up_read(&OCFS2_I(inode)->ip_alloc_sem);
330out_inode_unlock:
331	ocfs2_inode_unlock(inode, 0);
332out:
333	if (unlock)
334		unlock_page(page);
335	mlog_exit(ret);
336	return ret;
337}
338
339/*
340 * This is used only for read-ahead. Failures or difficult to handle
341 * situations are safe to ignore.
342 *
343 * Right now, we don't bother with BH_Boundary - in-inode extent lists
344 * are quite large (243 extents on 4k blocks), so most inodes don't
345 * grow out to a tree. If need be, detecting boundary extents could
346 * trivially be added in a future version of ocfs2_get_block().
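 *
 * (Rough arithmetic behind that figure, for orientation only: each
 * on-disk extent record is 16 bytes, so a 4k inode block minus the
 * dinode header leaves room for a couple hundred records in the
 * in-inode list.)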
347 */
348static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
349			   struct list_head *pages, unsigned nr_pages)
350{
351	int ret, err = -EIO;
352	struct inode *inode = mapping->host;
353	struct ocfs2_inode_info *oi = OCFS2_I(inode);
354	loff_t start;
355	struct page *last;
356
357	/*
358	 * Use the nonblocking flag for the dlm code to avoid page
359	 * lock inversion, but don't bother with retrying.
360	 */
361	ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
362	if (ret)
363		return err;
364
365	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
366		ocfs2_inode_unlock(inode, 0);
367		return err;
368	}
369
370	/*
371	 * Don't bother with inline-data. There isn't anything
372	 * to read-ahead in that case anyway...
373	 */
374	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
375		goto out_unlock;
376
377	/*
378	 * Check whether a remote node truncated this file - we just
379	 * drop out in that case as it's not worth handling here.
380	 */
381	last = list_entry(pages->prev, struct page, lru);
382	start = (loff_t)last->index << PAGE_CACHE_SHIFT;
383	if (start >= i_size_read(inode))
384		goto out_unlock;
385
386	err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
387
388out_unlock:
389	up_read(&oi->ip_alloc_sem);
390	ocfs2_inode_unlock(inode, 0);
391
392	return err;
393}
394
395/* Note: Because we don't support holes, our allocation has
396 * already happened (allocation writes zeros to the file data)
397 * so we don't have to worry about ordered writes in
398 * ocfs2_writepage.
399 *
400 * ->writepage is called during the process of invalidating the page cache
401 * during blocked lock processing.  It can't block on any cluster locks
402 * during block mapping.  It's relying on the fact that the block
403 * mapping can't have disappeared under the dirty pages that it is
404 * being asked to write back.
405 */
406static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
407{
408	int ret;
409
410	mlog_entry("(0x%p)\n", page);
411
412	ret = block_write_full_page(page, ocfs2_get_block, wbc);
413
414	mlog_exit(ret);
415
416	return ret;
417}
418
419/*
420 * This is called from ocfs2_write_zero_page() which has handled its
421 * own cluster locking and has ensured allocation exists for those
422 * blocks to be written.
423 */
424int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
425			       unsigned from, unsigned to)
426{
427	int ret;
428
429	ret = block_prepare_write(page, from, to, ocfs2_get_block);
430
431	return ret;
432}
433
434/* Taken from ext3. We don't necessarily need the full blown
435 * functionality yet, but IMHO it's better to cut and paste the whole
436 * thing so we can avoid introducing our own bugs (and easily pick up
437 * their fixes when they happen) --Mark */
438int walk_page_buffers(	handle_t *handle,
439			struct buffer_head *head,
440			unsigned from,
441			unsigned to,
442			int *partial,
443			int (*fn)(	handle_t *handle,
444					struct buffer_head *bh))
445{
446	struct buffer_head *bh;
447	unsigned block_start, block_end;
448	unsigned blocksize = head->b_size;
449	int err, ret = 0;
450	struct buffer_head *next;
451
452	for (	bh = head, block_start = 0;
453		ret == 0 && (bh != head || !block_start);
454	    	block_start = block_end, bh = next)
455	{
456		next = bh->b_this_page;
457		block_end = block_start + blocksize;
458		if (block_end <= from || block_start >= to) {
459			if (partial && !buffer_uptodate(bh))
460				*partial = 1;
461			continue;
462		}
463		err = (*fn)(handle, bh);
464		if (!ret)
465			ret = err;
466	}
467	return ret;
468}
469
470handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
471							 struct page *page,
472							 unsigned from,
473							 unsigned to)
474{
475	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
476	handle_t *handle;
477	int ret = 0;
478
479	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
480	if (IS_ERR(handle)) {
481		ret = PTR_ERR(handle);
482		mlog_errno(ret);
483		goto out;
484	}
485
486	if (ocfs2_should_order_data(inode)) {
487		ret = ocfs2_jbd2_file_inode(handle, inode);
488#ifdef CONFIG_OCFS2_COMPAT_JBD
489		ret = walk_page_buffers(handle,
490					page_buffers(page),
491					from, to, NULL,
492					ocfs2_journal_dirty_data);
493#endif
494		if (ret < 0)
495			mlog_errno(ret);
496	}
497out:
498	if (ret) {
499		if (!IS_ERR(handle))
500			ocfs2_commit_trans(osb, handle);
501		handle = ERR_PTR(ret);
502	}
503	return handle;
504}
505
506static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
507{
508	sector_t status;
509	u64 p_blkno = 0;
510	int err = 0;
511	struct inode *inode = mapping->host;
512
513	mlog_entry("(block = %llu)\n", (unsigned long long)block);
514
515	/* We don't need to lock journal system files, since they aren't
516	 * accessed concurrently from multiple nodes.
517	 */
518	if (!INODE_JOURNAL(inode)) {
519		err = ocfs2_inode_lock(inode, NULL, 0);
520		if (err) {
521			if (err != -ENOENT)
522				mlog_errno(err);
523			goto bail;
524		}
525		down_read(&OCFS2_I(inode)->ip_alloc_sem);
526	}
527
528	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
529		err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
530						  NULL);
531
532	if (!INODE_JOURNAL(inode)) {
533		up_read(&OCFS2_I(inode)->ip_alloc_sem);
534		ocfs2_inode_unlock(inode, 0);
535	}
536
537	if (err) {
538		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
539		     (unsigned long long)block);
540		mlog_errno(err);
541		goto bail;
542	}
543
544bail:
545	status = err ? 0 : p_blkno;
546
547	mlog_exit((int)status);
548
549	return status;
550}
551
552/*
553 * TODO: Make this into a generic get_blocks function.
554 *
555 * From do_direct_io in direct-io.c:
556 *  "So what we do is to permit the ->get_blocks function to populate
557 *   bh.b_size with the size of IO which is permitted at this offset and
558 *   this i_blkbits."
559 *
560 * This function is called directly from get_more_blocks in direct-io.c.
561 *
562 * called like this: dio->get_blocks(dio->inode, fs_startblk,
563 * 					fs_count, map_bh, dio->rw == WRITE);
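 *
 * On return, bh_result->b_size tells the dio core how much contiguous
 * i/o is possible at this offset.  Illustrative numbers only: if the
 * caller asks about 256 blocks but only 16 contiguous blocks remain in
 * the current extent, we shrink b_size to 16 << blocksize_bits and the
 * core comes back for the rest with further calls.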
564 */
565static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
566				     struct buffer_head *bh_result, int create)
567{
568	int ret;
569	u64 p_blkno, inode_blocks, contig_blocks;
570	unsigned int ext_flags;
571	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
572	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
573
574	/* This function won't even be called if the request isn't all
575	 * nicely aligned and of the right size, so there's no need
576	 * for us to check any of that. */
577
578	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
579
580	/*
581	 * Any write past EOF is not allowed because we'd be extending.
582	 */
583	if (create && (iblock + max_blocks) > inode_blocks) {
584		ret = -EIO;
585		goto bail;
586	}
587
588	/* This figures out where our logical offset lives on disk, and
589	 * how many blocks are contiguous from there. */
590	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
591					  &contig_blocks, &ext_flags);
592	if (ret) {
593		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
594		     (unsigned long long)iblock);
595		ret = -EIO;
596		goto bail;
597	}
598
599	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
600		ocfs2_error(inode->i_sb,
601			    "Inode %llu has a hole at block %llu\n",
602			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
603			    (unsigned long long)iblock);
604		ret = -EROFS;
605		goto bail;
606	}
607
608	/*
609	 * get_more_blocks() expects us to describe a hole by clearing
610	 * the mapped bit on bh_result.
611	 *
612	 * Consider an unwritten extent as a hole.
613	 */
614	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
615		map_bh(bh_result, inode->i_sb, p_blkno);
616	else {
617		/*
618		 * ocfs2_prepare_inode_for_write() should have caught
619		 * the case where we'd be filling a hole and triggered
620		 * a buffered write instead.
621		 */
622		if (create) {
623			ret = -EIO;
624			mlog_errno(ret);
625			goto bail;
626		}
627
628		clear_buffer_mapped(bh_result);
629	}
630
631	/* make sure we don't map more than max_blocks blocks here as
632	   that's all the kernel will handle at this point. */
633	if (max_blocks < contig_blocks)
634		contig_blocks = max_blocks;
635	bh_result->b_size = contig_blocks << blocksize_bits;
636bail:
637	return ret;
638}
639
640/*
641 * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
642 * particularly interested in the aio/dio case.  Just as the core uses
643 * i_alloc_sem, we use the rw_lock DLM lock to protect I/O on one node from
644 * truncation on another.
645 */
646static void ocfs2_dio_end_io(struct kiocb *iocb,
647			     loff_t offset,
648			     ssize_t bytes,
649			     void *private)
650{
651	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
652	int level;
653
654	/* this io's submitter should not have unlocked this before we could */
655	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
656
657	ocfs2_iocb_clear_rw_locked(iocb);
658
659	level = ocfs2_iocb_rw_locked_level(iocb);
660	if (!level)
661		up_read(&inode->i_alloc_sem);
662	ocfs2_rw_unlock(inode, level);
663}
664
665/*
666 * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
667 * from ext3.  PageChecked() bits have been removed as OCFS2 does not
668 * do journalled data.
669 */
670static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
671{
672	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
673
674	jbd2_journal_invalidatepage(journal, page, offset);
675}
676
677static int ocfs2_releasepage(struct page *page, gfp_t wait)
678{
679	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
680
681	if (!page_has_buffers(page))
682		return 0;
683	return jbd2_journal_try_to_free_buffers(journal, page, wait);
684}
685
686static ssize_t ocfs2_direct_IO(int rw,
687			       struct kiocb *iocb,
688			       const struct iovec *iov,
689			       loff_t offset,
690			       unsigned long nr_segs)
691{
692	struct file *file = iocb->ki_filp;
693	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
694	int ret;
695
696	mlog_entry_void();
697
698	/*
699	 * Fall back to buffered I/O if we see an inline-data inode -
700	 * there are no extents to map in that case.
701	 */
702	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
703		return 0;
704
705	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
706					    inode->i_sb->s_bdev, iov, offset,
707					    nr_segs,
708					    ocfs2_direct_IO_get_blocks,
709					    ocfs2_dio_end_io);
710
711	mlog_exit(ret);
712	return ret;
713}
714
715static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
716					    u32 cpos,
717					    unsigned int *start,
718					    unsigned int *end)
719{
720	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
721
722	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
723		unsigned int cpp;
724
725		cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
726
727		cluster_start = cpos % cpp;
728		cluster_start = cluster_start << osb->s_clustersize_bits;
729
730		cluster_end = cluster_start + osb->s_clustersize;
731	}
732
733	BUG_ON(cluster_start > PAGE_SIZE);
734	BUG_ON(cluster_end > PAGE_SIZE);
735
736	if (start)
737		*start = cluster_start;
738	if (end)
739		*end = cluster_end;
740}
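
/*
 * Worked example, illustrative values only: with 4k clusters and 64k
 * pages, cpp = 16.  For cpos = 37, cluster_start = (37 % 16) << 12 =
 * 20480 and cluster_end = 24576, i.e. the sixth 4k slot of the page.
 */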
741
742/*
743 * 'from' and 'to' are the region in the page to avoid zeroing.
744 *
745 * If pagesize > clustersize, this function will avoid zeroing outside
746 * of the cluster boundary.
747 *
748 * from == to == 0 is code for "zero the entire cluster region"
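 *
 * Illustrative offsets, assuming a 4k cluster occupying page bytes
 * 20480-24575: a caller passing from = 21000, to = 22000 gets bytes
 * [20480, 21000) and [22000, 24576) zeroed while the user's data in
 * between is left untouched.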
749 */
750static void ocfs2_clear_page_regions(struct page *page,
751				     struct ocfs2_super *osb, u32 cpos,
752				     unsigned from, unsigned to)
753{
754	void *kaddr;
755	unsigned int cluster_start, cluster_end;
756
757	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
758
759	kaddr = kmap_atomic(page, KM_USER0);
760
761	if (from || to) {
762		if (from > cluster_start)
763			memset(kaddr + cluster_start, 0, from - cluster_start);
764		if (to < cluster_end)
765			memset(kaddr + to, 0, cluster_end - to);
766	} else {
767		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
768	}
769
770	kunmap_atomic(kaddr, KM_USER0);
771}
772
773/*
774 * Nonsparse file systems fully allocate before we get to the write
775 * code. This prevents ocfs2_write() from tagging the write as an
776 * allocating one, which means ocfs2_map_page_blocks() might try to
777 * read in the blocks at the tail of our file. Avoid reading them by
778 * testing i_size against each block offset.
779 */
780static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
781				 unsigned int block_start)
782{
783	u64 offset = page_offset(page) + block_start;
784
785	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
786		return 1;
787
788	if (i_size_read(inode) > offset)
789		return 1;
790
791	return 0;
792}
793
794/*
795 * Some of this taken from block_prepare_write(). We already have our
796 * mapping by now though, and the entire write will be allocating or
797 * it won't, so not much need to use BH_New.
798 *
799 * This will also skip zeroing, which is handled externally.
800 */
801int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
802			  struct inode *inode, unsigned int from,
803			  unsigned int to, int new)
804{
805	int ret = 0;
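	/*
	 * At most two buffers can need a read below - the ones that
	 * straddle 'from' and 'to' - hence the two-entry wait array.
	 */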
806	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
807	unsigned int block_end, block_start;
808	unsigned int bsize = 1 << inode->i_blkbits;
809
810	if (!page_has_buffers(page))
811		create_empty_buffers(page, bsize, 0);
812
813	head = page_buffers(page);
814	for (bh = head, block_start = 0; bh != head || !block_start;
815	     bh = bh->b_this_page, block_start += bsize) {
816		block_end = block_start + bsize;
817
818		clear_buffer_new(bh);
819
820		/*
821		 * Ignore blocks outside of our i/o range -
822		 * they may belong to unallocated clusters.
823		 */
824		if (block_start >= to || block_end <= from) {
825			if (PageUptodate(page))
826				set_buffer_uptodate(bh);
827			continue;
828		}
829
830		/*
831		 * For an allocating write with cluster size >= page
832		 * size, we always write the entire page.
833		 */
834		if (new)
835			set_buffer_new(bh);
836
837		if (!buffer_mapped(bh)) {
838			map_bh(bh, inode->i_sb, *p_blkno);
839			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
840		}
841
842		if (PageUptodate(page)) {
843			if (!buffer_uptodate(bh))
844				set_buffer_uptodate(bh);
845		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
846			   !buffer_new(bh) &&
847			   ocfs2_should_read_blk(inode, page, block_start) &&
848			   (block_start < from || block_end > to)) {
849			ll_rw_block(READ, 1, &bh);
850			*wait_bh++=bh;
851		}
852
853		*p_blkno = *p_blkno + 1;
854	}
855
856	/*
857	 * If we issued read requests - let them complete.
858	 */
859	while(wait_bh > wait) {
860		wait_on_buffer(*--wait_bh);
861		if (!buffer_uptodate(*wait_bh))
862			ret = -EIO;
863	}
864
865	if (ret == 0 || !new)
866		return ret;
867
868	/*
869	 * If we get -EIO above, zero out any newly allocated blocks
870	 * to avoid exposing stale data.
871	 */
872	bh = head;
873	block_start = 0;
874	do {
875		block_end = block_start + bsize;
876		if (block_end <= from)
877			goto next_bh;
878		if (block_start >= to)
879			break;
880
881		zero_user(page, block_start, bh->b_size);
882		set_buffer_uptodate(bh);
883		mark_buffer_dirty(bh);
884
885next_bh:
886		block_start = block_end;
887		bh = bh->b_this_page;
888	} while (bh != head);
889
890	return ret;
891}
892
893#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
894#define OCFS2_MAX_CTXT_PAGES	1
895#else
896#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
897#endif
898
899#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
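
/*
 * For orientation, assuming the usual 4k..1M cluster size limits: with
 * 4k pages, OCFS2_MAX_CTXT_PAGES is 256 and OCFS2_MAX_CLUSTERS_PER_PAGE
 * is 1; with 64k pages and 4k clusters a single page covers 16 clusters,
 * while a 1M cluster still spans 16 pages.
 */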
900
901/*
902 * Describe the state of a single cluster to be written to.
903 */
904struct ocfs2_write_cluster_desc {
905	u32		c_cpos;
906	u32		c_phys;
907	/*
908	 * c_new gets its own flag because c_phys is eventually filled
909	 * in, so a zero c_phys alone can't mark a newly allocated cluster.
910	 */
911	unsigned	c_new;
912	unsigned	c_unwritten;
913};
914
915static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
916{
917	return d->c_new || d->c_unwritten;
918}
919
920struct ocfs2_write_ctxt {
921	/* Logical cluster position / len of write */
922	u32				w_cpos;
923	u32				w_clen;
924
925	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
926
927	/*
928	 * This is true if page_size > cluster_size.
929	 *
930	 * It triggers a set of special cases during write which might
931	 * have to deal with allocating writes to partial pages.
932	 */
933	unsigned int			w_large_pages;
934
935	/*
936	 * Pages involved in this write.
937	 *
938	 * w_target_page is the page being written to by the user.
939	 *
940	 * w_pages is an array of pages which always contains
941	 * w_target_page, and in the case of an allocating write with
942	 * page_size < cluster size, it will contain zero'd and mapped
943	 * pages adjacent to w_target_page which need to be written
944	 * out so that future reads from that region will get
945	 * zeros.
946	 */
947	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
948	unsigned int			w_num_pages;
949	struct page			*w_target_page;
950
951	/*
952	 * ocfs2_write_end() uses this to know what the real range to
953	 * write in the target should be.
954	 */
955	unsigned int			w_target_from;
956	unsigned int			w_target_to;
957
958	/*
959	 * We could use journal_current_handle() but this is cleaner,
960	 * IMHO -Mark
961	 */
962	handle_t			*w_handle;
963
964	struct buffer_head		*w_di_bh;
965
966	struct ocfs2_cached_dealloc_ctxt w_dealloc;
967};
968
969void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
970{
971	int i;
972
973	for(i = 0; i < num_pages; i++) {
974		if (pages[i]) {
975			unlock_page(pages[i]);
976			mark_page_accessed(pages[i]);
977			page_cache_release(pages[i]);
978		}
979	}
980}
981
982static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
983{
984	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
985
986	brelse(wc->w_di_bh);
987	kfree(wc);
988}
989
990static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
991				  struct ocfs2_super *osb, loff_t pos,
992				  unsigned len, struct buffer_head *di_bh)
993{
994	u32 cend;
995	struct ocfs2_write_ctxt *wc;
996
997	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
998	if (!wc)
999		return -ENOMEM;
1000
1001	wc->w_cpos = pos >> osb->s_clustersize_bits;
1002	cend = (pos + len - 1) >> osb->s_clustersize_bits;
1003	wc->w_clen = cend - wc->w_cpos + 1;
1004	get_bh(di_bh);
1005	wc->w_di_bh = di_bh;
1006
1007	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1008		wc->w_large_pages = 1;
1009	else
1010		wc->w_large_pages = 0;
1011
1012	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
1013
1014	*wcp = wc;
1015
1016	return 0;
1017}
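
/*
 * Example of the cluster math above, illustrative values only (4k
 * clusters): a 100 byte write straddling the boundary between clusters
 * 256 and 257 yields w_cpos = 256 and w_clen = 2 - two clusters are
 * involved even though the write is far shorter than one.
 */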
1018
1019/*
1020 * If a page has any new buffers, zero them out here, and mark them uptodate
1021 * and dirty so they'll be written out (in order to prevent uninitialised
1022 * block data from leaking). And clear the new bit.
1023 */
1024static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1025{
1026	unsigned int block_start, block_end;
1027	struct buffer_head *head, *bh;
1028
1029	BUG_ON(!PageLocked(page));
1030	if (!page_has_buffers(page))
1031		return;
1032
1033	bh = head = page_buffers(page);
1034	block_start = 0;
1035	do {
1036		block_end = block_start + bh->b_size;
1037
1038		if (buffer_new(bh)) {
1039			if (block_end > from && block_start < to) {
1040				if (!PageUptodate(page)) {
1041					unsigned start, end;
1042
1043					start = max(from, block_start);
1044					end = min(to, block_end);
1045
1046					zero_user_segment(page, start, end);
1047					set_buffer_uptodate(bh);
1048				}
1049
1050				clear_buffer_new(bh);
1051				mark_buffer_dirty(bh);
1052			}
1053		}
1054
1055		block_start = block_end;
1056		bh = bh->b_this_page;
1057	} while (bh != head);
1058}
1059
1060/*
1061 * Only called when we have a failure during allocating write to write
1062 * zeros to the newly allocated region.
1063 */
1064static void ocfs2_write_failure(struct inode *inode,
1065				struct ocfs2_write_ctxt *wc,
1066				loff_t user_pos, unsigned user_len)
1067{
1068	int i;
1069	unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
1070		to = user_pos + user_len;
1071	struct page *tmppage;
1072
1073	ocfs2_zero_new_buffers(wc->w_target_page, from, to);
1074
1075	for(i = 0; i < wc->w_num_pages; i++) {
1076		tmppage = wc->w_pages[i];
1077
1078		if (page_has_buffers(tmppage)) {
1079			if (ocfs2_should_order_data(inode)) {
1080				ocfs2_jbd2_file_inode(wc->w_handle, inode);
1081#ifdef CONFIG_OCFS2_COMPAT_JBD
1082				walk_page_buffers(wc->w_handle,
1083						  page_buffers(tmppage),
1084						  from, to, NULL,
1085						  ocfs2_journal_dirty_data);
1086#endif
1087			}
1088
1089			block_commit_write(tmppage, from, to);
1090		}
1091	}
1092}
1093
1094static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
1095					struct ocfs2_write_ctxt *wc,
1096					struct page *page, u32 cpos,
1097					loff_t user_pos, unsigned user_len,
1098					int new)
1099{
1100	int ret;
1101	unsigned int map_from = 0, map_to = 0;
1102	unsigned int cluster_start, cluster_end;
1103	unsigned int user_data_from = 0, user_data_to = 0;
1104
1105	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
1106					&cluster_start, &cluster_end);
1107
1108	if (page == wc->w_target_page) {
1109		map_from = user_pos & (PAGE_CACHE_SIZE - 1);
1110		map_to = map_from + user_len;
1111
1112		if (new)
1113			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
1114						    cluster_start, cluster_end,
1115						    new);
1116		else
1117			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
1118						    map_from, map_to, new);
1119		if (ret) {
1120			mlog_errno(ret);
1121			goto out;
1122		}
1123
1124		user_data_from = map_from;
1125		user_data_to = map_to;
1126		if (new) {
1127			map_from = cluster_start;
1128			map_to = cluster_end;
1129		}
1130	} else {
1131		/*
1132		 * If we haven't allocated the new page yet, we
1133		 * shouldn't be writing it out without copying user
1134		 * data. This is likely a math error from the caller.
1135		 */
1136		BUG_ON(!new);
1137
1138		map_from = cluster_start;
1139		map_to = cluster_end;
1140
1141		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
1142					    cluster_start, cluster_end, new);
1143		if (ret) {
1144			mlog_errno(ret);
1145			goto out;
1146		}
1147	}
1148
1149	/*
1150	 * Parts of newly allocated pages need to be zero'd.
1151	 *
1152	 * Above, we have also rewritten 'to' and 'from' - as far as
1153	 * the rest of the function is concerned, the entire cluster
1154	 * range inside of a page needs to be written.
1155	 *
1156	 * We can skip this if the page is up to date - it's already
1157	 * been zero'd from being read in as a hole.
1158	 */
1159	if (new && !PageUptodate(page))
1160		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1161					 cpos, user_data_from, user_data_to);
1162
1163	flush_dcache_page(page);
1164
1165out:
1166	return ret;
1167}
1168
1169/*
1170 * This function will only grab one cluster's worth of pages.
1171 */
1172static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1173				      struct ocfs2_write_ctxt *wc,
1174				      u32 cpos, loff_t user_pos, int new,
1175				      struct page *mmap_page)
1176{
1177	int ret = 0, i;
1178	unsigned long start, target_index, index;
1179	struct inode *inode = mapping->host;
1180
1181	target_index = user_pos >> PAGE_CACHE_SHIFT;
1182
1183	/*
1184	 * Figure out how many pages we'll be manipulating here. For
1185	 * a non-allocating write, we just change the one
1186	 * page. Otherwise, we'll need a whole cluster's worth.
1187	 */
1188	if (new) {
1189		wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1190		start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1191	} else {
1192		wc->w_num_pages = 1;
1193		start = target_index;
1194	}
1195
1196	for(i = 0; i < wc->w_num_pages; i++) {
1197		index = start + i;
1198
1199		if (index == target_index && mmap_page) {
1200			/*
1201			 * ocfs2_page_mkwrite() is a little different
1202			 * and wants us to directly use the page
1203			 * passed in.
1204			 */
1205			lock_page(mmap_page);
1206
1207			if (mmap_page->mapping != mapping) {
1208				unlock_page(mmap_page);
1209				/*
1210				 * Sanity check - the locking in
1211			 * ocfs2_page_mkwrite() should ensure
1212				 * that this code doesn't trigger.
1213				 */
1214				ret = -EINVAL;
1215				mlog_errno(ret);
1216				goto out;
1217			}
1218
1219			page_cache_get(mmap_page);
1220			wc->w_pages[i] = mmap_page;
1221		} else {
1222			wc->w_pages[i] = find_or_create_page(mapping, index,
1223							     GFP_NOFS);
1224			if (!wc->w_pages[i]) {
1225				ret = -ENOMEM;
1226				mlog_errno(ret);
1227				goto out;
1228			}
1229		}
1230
1231		if (index == target_index)
1232			wc->w_target_page = wc->w_pages[i];
1233	}
1234out:
1235	return ret;
1236}
1237
1238/*
1239 * Prepare a single cluster of the file for writing.
1240 */
1241static int ocfs2_write_cluster(struct address_space *mapping,
1242			       u32 phys, unsigned int unwritten,
1243			       struct ocfs2_alloc_context *data_ac,
1244			       struct ocfs2_alloc_context *meta_ac,
1245			       struct ocfs2_write_ctxt *wc, u32 cpos,
1246			       loff_t user_pos, unsigned user_len)
1247{
1248	int ret, i, new, should_zero = 0;
1249	u64 v_blkno, p_blkno;
1250	struct inode *inode = mapping->host;
1251	struct ocfs2_extent_tree et;
1252
1253	new = phys == 0 ? 1 : 0;
1254	if (new || unwritten)
1255		should_zero = 1;
1256
1257	if (new) {
1258		u32 tmp_pos;
1259
1260		/*
1261		 * This is safe to call with the page locks - it won't take
1262		 * any additional semaphores or cluster locks.
1263		 */
1264		tmp_pos = cpos;
1265		ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1266					   &tmp_pos, 1, 0, wc->w_di_bh,
1267					   wc->w_handle, data_ac,
1268					   meta_ac, NULL);
1269		/*
1270		 * This shouldn't happen because we must have already
1271		 * calculated the correct meta data allocation required. The
1272		 * internal tree allocation code should know how to increase
1273		 * transaction credits itself.
1274		 *
1275		 * If need be, we could handle -EAGAIN for a
1276		 * RESTART_TRANS here.
1277		 */
1278		mlog_bug_on_msg(ret == -EAGAIN,
1279				"Inode %llu: EAGAIN return during allocation.\n",
1280				(unsigned long long)OCFS2_I(inode)->ip_blkno);
1281		if (ret < 0) {
1282			mlog_errno(ret);
1283			goto out;
1284		}
1285	} else if (unwritten) {
1286		ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
1287		ret = ocfs2_mark_extent_written(inode, &et,
1288						wc->w_handle, cpos, 1, phys,
1289						meta_ac, &wc->w_dealloc);
1290		if (ret < 0) {
1291			mlog_errno(ret);
1292			goto out;
1293		}
1294	}
1295
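	/*
	 * When zeroing (a new or unwritten cluster), start the block
	 * lookup at the first block of the cluster so every page in it
	 * can be mapped; otherwise start at the block under the user's
	 * write, which is the first buffer ocfs2_map_page_blocks() will
	 * actually map.
	 */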
1296	if (should_zero)
1297		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1298	else
1299		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1300
1301	/*
1302	 * The only reason this should fail is due to an inability to
1303	 * find the extent added.
1304	 */
1305	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1306					  NULL);
1307	if (ret < 0) {
1308		ocfs2_error(inode->i_sb, "Corrupt extent map for inode %llu, "
1309			    "at logical block %llu",
1310			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
1311			    (unsigned long long)v_blkno);
1312		goto out;
1313	}
1314
1315	BUG_ON(p_blkno == 0);
1316
1317	for(i = 0; i < wc->w_num_pages; i++) {
1318		int tmpret;
1319
1320		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1321						      wc->w_pages[i], cpos,
1322						      user_pos, user_len,
1323						      should_zero);
1324		if (tmpret) {
1325			mlog_errno(tmpret);
1326			if (ret == 0)
1327				ret = tmpret;
1328		}
1329	}
1330
1331	/*
1332	 * We only have cleanup to do in case of allocating write.
1333	 */
1334	if (ret && new)
1335		ocfs2_write_failure(inode, wc, user_pos, user_len);
1336
1337out:
1338
1339	return ret;
1340}
1341
1342static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1343				       struct ocfs2_alloc_context *data_ac,
1344				       struct ocfs2_alloc_context *meta_ac,
1345				       struct ocfs2_write_ctxt *wc,
1346				       loff_t pos, unsigned len)
1347{
1348	int ret, i;
1349	loff_t cluster_off;
1350	unsigned int local_len = len;
1351	struct ocfs2_write_cluster_desc *desc;
1352	struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
1353
1354	for (i = 0; i < wc->w_clen; i++) {
1355		desc = &wc->w_desc[i];
1356
1357		/*
1358		 * We have to make sure that the total write passed in
1359		 * doesn't extend past a single cluster.
1360		 */
1361		local_len = len;
1362		cluster_off = pos & (osb->s_clustersize - 1);
1363		if ((cluster_off + local_len) > osb->s_clustersize)
1364			local_len = osb->s_clustersize - cluster_off;
1365
1366		ret = ocfs2_write_cluster(mapping, desc->c_phys,
1367					  desc->c_unwritten, data_ac, meta_ac,
1368					  wc, desc->c_cpos, pos, local_len);
1369		if (ret) {
1370			mlog_errno(ret);
1371			goto out;
1372		}
1373
1374		len -= local_len;
1375		pos += local_len;
1376	}
1377
1378	ret = 0;
1379out:
1380	return ret;
1381}
1382
1383/*
1384 * ocfs2_write_end() wants to know which parts of the target page it
1385 * should complete the write on. It's easiest to compute them ahead of
1386 * time when a more complete view of the write is available.
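 *
 * Illustrative values only, assuming 64k pages over 4k clusters: a 10
 * byte write at file offset 5000 into a freshly allocated cluster
 * starts out with w_target_from/to = 5000/5010, but the logic below
 * widens that to 4096/8192 so the entire new cluster is written out.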
1387 */
1388static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1389					struct ocfs2_write_ctxt *wc,
1390					loff_t pos, unsigned len, int alloc)
1391{
1392	struct ocfs2_write_cluster_desc *desc;
1393
1394	wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
1395	wc->w_target_to = wc->w_target_from + len;
1396
1397	if (alloc == 0)
1398		return;
1399
1400	/*
1401	 * Allocating write - we may have different boundaries based
1402	 * on page size and cluster size.
1403	 *
1404	 * NOTE: We can no longer compute one value from the other as
1405	 * the actual write length and user provided length may be
1406	 * different.
1407	 */
1408
1409	if (wc->w_large_pages) {
1410		/*
1411		 * We only care about the 1st and last cluster within
1412		 * our range and whether they should be zero'd or not. Either
1413		 * value may be extended out to the start/end of a
1414		 * newly allocated cluster.
1415		 */
1416		desc = &wc->w_desc[0];
1417		if (ocfs2_should_zero_cluster(desc))
1418			ocfs2_figure_cluster_boundaries(osb,
1419							desc->c_cpos,
1420							&wc->w_target_from,
1421							NULL);
1422
1423		desc = &wc->w_desc[wc->w_clen - 1];
1424		if (ocfs2_should_zero_cluster(desc))
1425			ocfs2_figure_cluster_boundaries(osb,
1426							desc->c_cpos,
1427							NULL,
1428							&wc->w_target_to);
1429	} else {
1430		wc->w_target_from = 0;
1431		wc->w_target_to = PAGE_CACHE_SIZE;
1432	}
1433}
1434
1435/*
1436 * Populate each single-cluster write descriptor in the write context
1437 * with information about the i/o to be done.
1438 *
1439 * Returns the number of clusters that will have to be allocated, as
1440 * well as a worst case estimate of the number of extent records that
1441 * would have to be created during a write to an unwritten region.
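 *
 * The "two extra records per unwritten extent" worst case below comes
 * from a write landing in the middle of one unwritten extent: marking
 * the middle written can split the record into as many as three
 * (unwritten / written / unwritten).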
1442 */
1443static int ocfs2_populate_write_desc(struct inode *inode,
1444				     struct ocfs2_write_ctxt *wc,
1445				     unsigned int *clusters_to_alloc,
1446				     unsigned int *extents_to_split)
1447{
1448	int ret;
1449	struct ocfs2_write_cluster_desc *desc;
1450	unsigned int num_clusters = 0;
1451	unsigned int ext_flags = 0;
1452	u32 phys = 0;
1453	int i;
1454
1455	*clusters_to_alloc = 0;
1456	*extents_to_split = 0;
1457
1458	for (i = 0; i < wc->w_clen; i++) {
1459		desc = &wc->w_desc[i];
1460		desc->c_cpos = wc->w_cpos + i;
1461
1462		if (num_clusters == 0) {
1463			/*
1464			 * Need to look up the next extent record.
1465			 */
1466			ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
1467						 &num_clusters, &ext_flags);
1468			if (ret) {
1469				mlog_errno(ret);
1470				goto out;
1471			}
1472
1473			/*
1474			 * Assume worst case - that we're writing in
1475			 * the middle of the extent.
1476			 *
1477			 * We can assume that the write proceeds from
1478			 * left to right, in which case the extent
1479			 * insert code is smart enough to coalesce the
1480			 * next splits into the previous records created.
1481			 */
1482			if (ext_flags & OCFS2_EXT_UNWRITTEN)
1483				*extents_to_split = *extents_to_split + 2;
1484		} else if (phys) {
1485			/*
1486			 * Only increment phys if it doesn't describe
1487			 * a hole.
1488			 */
1489			phys++;
1490		}
1491
1492		desc->c_phys = phys;
1493		if (phys == 0) {
1494			desc->c_new = 1;
1495			*clusters_to_alloc = *clusters_to_alloc + 1;
1496		}
1497		if (ext_flags & OCFS2_EXT_UNWRITTEN)
1498			desc->c_unwritten = 1;
1499
1500		num_clusters--;
1501	}
1502
1503	ret = 0;
1504out:
1505	return ret;
1506}
1507
1508static int ocfs2_write_begin_inline(struct address_space *mapping,
1509				    struct inode *inode,
1510				    struct ocfs2_write_ctxt *wc)
1511{
1512	int ret;
1513	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1514	struct page *page;
1515	handle_t *handle;
1516	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1517
1518	page = find_or_create_page(mapping, 0, GFP_NOFS);
1519	if (!page) {
1520		ret = -ENOMEM;
1521		mlog_errno(ret);
1522		goto out;
1523	}
1524	/*
1525	 * If we don't set w_num_pages then this page won't get unlocked
1526	 * and freed on cleanup of the write context.
1527	 */
1528	wc->w_pages[0] = wc->w_target_page = page;
1529	wc->w_num_pages = 1;
1530
1531	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1532	if (IS_ERR(handle)) {
1533		ret = PTR_ERR(handle);
1534		mlog_errno(ret);
1535		goto out;
1536	}
1537
1538	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1539				   OCFS2_JOURNAL_ACCESS_WRITE);
1540	if (ret) {
1541		ocfs2_commit_trans(osb, handle);
1542
1543		mlog_errno(ret);
1544		goto out;
1545	}
1546
1547	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1548		ocfs2_set_inode_data_inline(inode, di);
1549
1550	if (!PageUptodate(page)) {
1551		ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
1552		if (ret) {
1553			ocfs2_commit_trans(osb, handle);
1554
1555			goto out;
1556		}
1557	}
1558
1559	wc->w_handle = handle;
1560out:
1561	return ret;
1562}
1563
1564int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
1565{
1566	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1567
1568	if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
1569		return 1;
1570	return 0;
1571}
1572
1573static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1574					  struct inode *inode, loff_t pos,
1575					  unsigned len, struct page *mmap_page,
1576					  struct ocfs2_write_ctxt *wc)
1577{
1578	int ret, written = 0;
1579	loff_t end = pos + len;
1580	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1581
1582	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
1583	     (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
1584	     oi->ip_dyn_features);
1585
1586	/*
1587	 * Handle inodes which already have inline data 1st.
1588	 */
1589	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1590		if (mmap_page == NULL &&
1591		    ocfs2_size_fits_inline_data(wc->w_di_bh, end))
1592			goto do_inline_write;
1593
1594		/*
1595		 * The write won't fit - we have to give this inode an
1596		 * inline extent list now.
1597		 */
1598		ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
1599		if (ret)
1600			mlog_errno(ret);
1601		goto out;
1602	}
1603
1604	/*
1605	 * Check whether the inode can accept inline data.
1606	 */
1607	if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
1608		return 0;
1609
1610	/*
1611	 * Check whether the write can fit.
1612	 */
1613	if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb))
1614		return 0;
1615
1616do_inline_write:
1617	ret = ocfs2_write_begin_inline(mapping, inode, wc);
1618	if (ret) {
1619		mlog_errno(ret);
1620		goto out;
1621	}
1622
1623	/*
1624	 * This signals to the caller that the data can be written
1625	 * inline.
1626	 */
1627	written = 1;
1628out:
1629	return written ? written : ret;
1630}
1631
1632/*
1633 * This function only does anything for file systems which can't
1634 * handle sparse files.
1635 *
1636 * What we want to do here is fill in any hole between the current end
1637 * of allocation and the end of our write. That way the rest of the
1638 * write path can treat it as a non-allocating write, which has no
1639 * special case code for sparse/nonsparse files.
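 *
 * Illustrative sizes: with i_size at 10000 and a 100 byte write at
 * offset 20000, newsize is 20100 and newsize - len is 20000 - the two
 * values handed to ocfs2_extend_no_holes() below so the hole beneath
 * the write gets allocated and zeroed before the ordinary write path
 * runs.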
1640 */
1641static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
1642					unsigned len,
1643					struct ocfs2_write_ctxt *wc)
1644{
1645	int ret;
1646	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1647	loff_t newsize = pos + len;
1648
1649	if (ocfs2_sparse_alloc(osb))
1650		return 0;
1651
1652	if (newsize <= i_size_read(inode))
1653		return 0;
1654
1655	ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
1656	if (ret)
1657		mlog_errno(ret);
1658
1659	return ret;
1660}
1661
1662int ocfs2_write_begin_nolock(struct address_space *mapping,
1663			     loff_t pos, unsigned len, unsigned flags,
1664			     struct page **pagep, void **fsdata,
1665			     struct buffer_head *di_bh, struct page *mmap_page)
1666{
1667	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1668	unsigned int clusters_to_alloc, extents_to_split;
1669	struct ocfs2_write_ctxt *wc;
1670	struct inode *inode = mapping->host;
1671	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1672	struct ocfs2_dinode *di;
1673	struct ocfs2_alloc_context *data_ac = NULL;
1674	struct ocfs2_alloc_context *meta_ac = NULL;
1675	handle_t *handle;
1676	struct ocfs2_extent_tree et;
1677
1678	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1679	if (ret) {
1680		mlog_errno(ret);
1681		return ret;
1682	}
1683
1684	if (ocfs2_supports_inline_data(osb)) {
1685		ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
1686						     mmap_page, wc);
1687		if (ret == 1) {
1688			ret = 0;
1689			goto success;
1690		}
1691		if (ret < 0) {
1692			mlog_errno(ret);
1693			goto out;
1694		}
1695	}
1696
1697	ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
1698	if (ret) {
1699		mlog_errno(ret);
1700		goto out;
1701	}
1702
1703	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1704					&extents_to_split);
1705	if (ret) {
1706		mlog_errno(ret);
1707		goto out;
1708	}
1709
1710	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1711
1712	/*
1713	 * We set w_target_from, w_target_to here so that
1714	 * ocfs2_write_end() knows which range in the target page to
1715	 * write out. An allocation requires that we write the entire
1716	 * cluster range.
1717	 */
1718	if (clusters_to_alloc || extents_to_split) {
1719		/*
1720		 * XXX: We are stretching the limits of
1721		 * ocfs2_lock_allocators(). It greatly over-estimates
1722		 * the work to be done.
1723		 */
1724		mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
1725		     " clusters_to_add = %u, extents_to_split = %u\n",
1726		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1727		     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
1728		     clusters_to_alloc, extents_to_split);
1729
1730		ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
1731		ret = ocfs2_lock_allocators(inode, &et,
1732					    clusters_to_alloc, extents_to_split,
1733					    &data_ac, &meta_ac);
1734		if (ret) {
1735			mlog_errno(ret);
1736			goto out;
1737		}
1738
1739		credits = ocfs2_calc_extend_credits(inode->i_sb,
1740						    &di->id2.i_list,
1741						    clusters_to_alloc);
1742
1743	}
1744
1745	ocfs2_set_target_boundaries(osb, wc, pos, len,
1746				    clusters_to_alloc + extents_to_split);
1747
1748	handle = ocfs2_start_trans(osb, credits);
1749	if (IS_ERR(handle)) {
1750		ret = PTR_ERR(handle);
1751		mlog_errno(ret);
1752		goto out;
1753	}
1754
1755	wc->w_handle = handle;
1756
1757	/*
1758	 * We don't want this to fail in ocfs2_write_end(), so do it
1759	 * here.
1760	 */
1761	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1762				   OCFS2_JOURNAL_ACCESS_WRITE);
1763	if (ret) {
1764		mlog_errno(ret);
1765		goto out_commit;
1766	}
1767
1768	/*
1769	 * Fill our page array first. That way we've grabbed enough so
1770	 * that we can zero and flush if we error after adding the
1771	 * extent.
1772	 */
1773	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
1774					 clusters_to_alloc + extents_to_split,
1775					 mmap_page);
1776	if (ret) {
1777		mlog_errno(ret);
1778		goto out_commit;
1779	}
1780
1781	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1782					  len);
1783	if (ret) {
1784		mlog_errno(ret);
1785		goto out_commit;
1786	}
1787
1788	if (data_ac)
1789		ocfs2_free_alloc_context(data_ac);
1790	if (meta_ac)
1791		ocfs2_free_alloc_context(meta_ac);
1792
1793success:
1794	*pagep = wc->w_target_page;
1795	*fsdata = wc;
1796	return 0;
1797out_commit:
1798	ocfs2_commit_trans(osb, handle);
1799
1800out:
1801	ocfs2_free_write_ctxt(wc);
1802
1803	if (data_ac)
1804		ocfs2_free_alloc_context(data_ac);
1805	if (meta_ac)
1806		ocfs2_free_alloc_context(meta_ac);
1807	return ret;
1808}
1809
1810static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1811			     loff_t pos, unsigned len, unsigned flags,
1812			     struct page **pagep, void **fsdata)
1813{
1814	int ret;
1815	struct buffer_head *di_bh = NULL;
1816	struct inode *inode = mapping->host;
1817
1818	ret = ocfs2_inode_lock(inode, &di_bh, 1);
1819	if (ret) {
1820		mlog_errno(ret);
1821		return ret;
1822	}
1823
1824	/*
1825	 * Take alloc sem here to prevent concurrent lookups. That way
1826	 * the mapping, zeroing and tree manipulation within
1827	 * ocfs2_write() will be safe against ->readpage(). This
1828	 * should also serve to lock out allocation from a shared
1829	 * writeable region.
1830	 */
1831	down_write(&OCFS2_I(inode)->ip_alloc_sem);
1832
1833	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1834				       fsdata, di_bh, NULL);
1835	if (ret) {
1836		mlog_errno(ret);
1837		goto out_fail;
1838	}
1839
1840	brelse(di_bh);
1841
1842	return 0;
1843
1844out_fail:
1845	up_write(&OCFS2_I(inode)->ip_alloc_sem);
1846
1847	brelse(di_bh);
1848	ocfs2_inode_unlock(inode, 1);
1849
1850	return ret;
1851}
1852
1853static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
1854				   unsigned len, unsigned *copied,
1855				   struct ocfs2_dinode *di,
1856				   struct ocfs2_write_ctxt *wc)
1857{
1858	void *kaddr;
1859
1860	if (unlikely(*copied < len)) {
1861		if (!PageUptodate(wc->w_target_page)) {
1862			*copied = 0;
1863			return;
1864		}
1865	}
1866
1867	kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
1868	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
1869	kunmap_atomic(kaddr, KM_USER0);
1870
1871	mlog(0, "Data written to inode at offset %llu. "
1872	     "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
1873	     (unsigned long long)pos, *copied,
1874	     le16_to_cpu(di->id2.i_data.id_count),
1875	     le16_to_cpu(di->i_dyn_features));
1876}
1877
1878int ocfs2_write_end_nolock(struct address_space *mapping,
1879			   loff_t pos, unsigned len, unsigned copied,
1880			   struct page *page, void *fsdata)
1881{
1882	int i;
1883	unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
1884	struct inode *inode = mapping->host;
1885	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1886	struct ocfs2_write_ctxt *wc = fsdata;
1887	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1888	handle_t *handle = wc->w_handle;
1889	struct page *tmppage;
1890
1891	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1892		ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
1893		goto out_write_size;
1894	}
1895
1896	if (unlikely(copied < len)) {
1897		if (!PageUptodate(wc->w_target_page))
1898			copied = 0;
1899
1900		ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
1901				       start+len);
1902	}
1903	flush_dcache_page(wc->w_target_page);
1904
1905	for(i = 0; i < wc->w_num_pages; i++) {
1906		tmppage = wc->w_pages[i];
1907
1908		if (tmppage == wc->w_target_page) {
1909			from = wc->w_target_from;
1910			to = wc->w_target_to;
1911
1912			BUG_ON(from > PAGE_CACHE_SIZE ||
1913			       to > PAGE_CACHE_SIZE ||
1914			       to < from);
1915		} else {
1916			/*
1917			 * Pages adjacent to the target (if any) imply
1918			 * a hole-filling write in which case we want
1919			 * to flush their entire range.
1920			 */
1921			from = 0;
1922			to = PAGE_CACHE_SIZE;
1923		}
1924
1925		if (page_has_buffers(tmppage)) {
1926			if (ocfs2_should_order_data(inode)) {
1927				ocfs2_jbd2_file_inode(wc->w_handle, inode);
1928#ifdef CONFIG_OCFS2_COMPAT_JBD
1929				walk_page_buffers(wc->w_handle,
1930						  page_buffers(tmppage),
1931						  from, to, NULL,
1932						  ocfs2_journal_dirty_data);
1933#endif
1934			}
1935			block_commit_write(tmppage, from, to);
1936		}
1937	}
1938
1939out_write_size:
1940	pos += copied;
1941	if (pos > inode->i_size) {
1942		i_size_write(inode, pos);
1943		mark_inode_dirty(inode);
1944	}
1945	inode->i_blocks = ocfs2_inode_sector_count(inode);
1946	di->i_size = cpu_to_le64((u64)i_size_read(inode));
1947	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1948	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1949	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1950	ocfs2_journal_dirty(handle, wc->w_di_bh);
1951
1952	ocfs2_commit_trans(osb, handle);
1953
1954	ocfs2_run_deallocs(osb, &wc->w_dealloc);
1955
1956	ocfs2_free_write_ctxt(wc);
1957
1958	return copied;
1959}
1960
1961static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1962			   loff_t pos, unsigned len, unsigned copied,
1963			   struct page *page, void *fsdata)
1964{
1965	int ret;
1966	struct inode *inode = mapping->host;
1967
1968	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1969
1970	up_write(&OCFS2_I(inode)->ip_alloc_sem);
1971	ocfs2_inode_unlock(inode, 1);
1972
1973	return ret;
1974}
1975
1976const struct address_space_operations ocfs2_aops = {
1977	.readpage	= ocfs2_readpage,
1978	.readpages	= ocfs2_readpages,
1979	.writepage	= ocfs2_writepage,
1980	.write_begin	= ocfs2_write_begin,
1981	.write_end	= ocfs2_write_end,
1982	.bmap		= ocfs2_bmap,
1983	.sync_page	= block_sync_page,
1984	.direct_IO	= ocfs2_direct_IO,
1985	.invalidatepage	= ocfs2_invalidatepage,
1986	.releasepage	= ocfs2_releasepage,
1987	.migratepage	= buffer_migrate_page,
1988};
1989