From 7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Thu, 27 May 2010 01:05:33 +1000 Subject: fs: introduce new truncate sequence Introduce a new truncate calling sequence into fs/mm subsystems. Rather than setattr > vmtruncate > truncate, have filesystems call their truncate sequence from ->setattr if filesystem specific operations are required. vmtruncate is deprecated, and truncate_pagecache and inode_newsize_ok helpers introduced previously should be used. simple_setattr is introduced for simple in-ram filesystems to implement the new truncate sequence. Eventually all filesystems should be converted to implement a setattr, and the default code in notify_change should go away. simple_setsize is also introduced to perform just the ATTR_SIZE portion of simple_setattr (ie. changing i_size and trimming pagecache). To implement the new truncate sequence: - filesystem specific manipulations (eg freeing blocks) must be done in the setattr method rather than ->truncate. - vmtruncate can not be used by core code to trim blocks past i_size in the event of write failure after allocation, so this must be performed in the fs code. - convert usage of helpers block_write_begin, nobh_write_begin, cont_write_begin, and *blockdev_direct_IO* to use _newtrunc postfixed variants. These avoid calling vmtruncate to trim blocks (see previous). - inode_setattr should not be used. generic_setattr is a new function to be used to copy simple attributes into the generic inode. - make use of the better opportunity to handle errors with the new sequence. Big problem with the previous calling sequence: the filesystem is not called until i_size has already changed. This means it is not allowed to fail the call, and also it does not know what the previous i_size was. Also, generic code calling vmtruncate to truncate allocated blocks in case of error had no good way to return a meaningful error (or, for example, atomically handle block deallocation). Cc: Christoph Hellwig Acked-by: Jan Kara Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/buffer.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 98 insertions(+), 25 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index e8aa708..d54812b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, } /* - * block_write_begin takes care of the basic task of block allocation and - * bringing partial write blocks uptodate first. - * - * If *pagep is not NULL, then block_write_begin uses the locked page - * at *pagep rather than allocating its own. In this case, the page will - * not be unlocked or deallocated on failure. + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int block_write_begin(struct file *file, struct address_space *mapping, +int block_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); } } out: return status; } +EXPORT_SYMBOL(block_write_begin_newtrunc); + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * If *pagep is not NULL, then block_write_begin uses the locked page + * at *pagep rather than allocating its own. In this case, the page will + * not be unlocked or deallocated on failure. + */ +int block_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Filesystems which pass down their own page also cannot + * call into vmtruncate here because it would lead to lock + * inversion problems (*pagep is locked). This is a further + * example of where the old truncate sequence is inadequate. + */ + if (unlikely(ret) && *pagep == NULL) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(block_write_begin); int block_write_end(struct file *file, struct address_space *mapping, @@ -2324,7 +2351,7 @@ out: * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ -int cont_write_begin(struct file *file, struct address_space *mapping, +int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block, loff_t *bytes) @@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } *pagep = NULL; - err = block_write_begin(file, mapping, pos, len, + err = block_write_begin_newtrunc(file, mapping, pos, len, flags, pagep, fsdata, get_block); out: return err; } +EXPORT_SYMBOL(cont_write_begin_newtrunc); + +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + int ret; + + ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block, bytes); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(cont_write_begin); int block_prepare_write(struct page *page, unsigned from, unsigned to, @@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write); * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because - * vmtruncate() writes the inode size before removing pages, once we have the + * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. @@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) } /* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int nobh_write_begin(struct file *file, struct address_space *mapping, +int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, - fsdata, get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, + flags, pagep, fsdata, get_block); } if (PageMappedToDisk(page)) @@ -2605,8 +2652,34 @@ out_release: page_cache_release(page); *pagep = NULL; - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + return ret; +} +EXPORT_SYMBOL(nobh_write_begin_newtrunc); + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } return ret; } -- cgit v1.1