From a94477da38e0b261a7ecea71f4c95a3bcd5be69c Mon Sep 17 00:00:00 2001 From: David Chinner Date: Wed, 30 Apr 2008 18:15:34 +1000 Subject: [XFS] Include linux/random.h in all builds, not just debug builds. SGI-PV: 979416 SGI-Modid: xfs-linux-melb:xfs-kern:31008a Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_linux.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 1bc9f60..4edc469 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -75,6 +75,7 @@ #include #include #include +#include #include #include -- cgit v1.1 From 978b7237123d007b9fa983af6e0e2fa8f97f9934 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 19 May 2008 16:29:46 +1000 Subject: [XFS] Fix fsync() b0rkage. xfs_fsync() fails to wait for data I/O completion before checking if the inode is dirty or clean to decide whether to log the inode or not. This misses inode size updates when the data flushed by the fsync() is extending the file. Hence, like fdatasync(), we need to wait for I/o completion first, then check the inode for cleanliness. Doing so makes the behaviour of xfs_fsync() identical for fsync and fdatasync and we *always* use synchronous semantics if the inode is dirty. Therefore also kill the differences and remove the unused flags from the xfs_fsync function and callers. SGI-PV: 981296 SGI-Modid: xfs-linux-melb:xfs-kern:31033a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_file.c | 17 +++++++++++------ fs/xfs/linux-2.6/xfs_vnode.h | 8 -------- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 65e78c1..5f60363 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -184,19 +184,24 @@ xfs_file_release( return -xfs_release(XFS_I(inode)); } +/* + * We ignore the datasync flag here because a datasync is effectively + * identical to an fsync. That is, datasync implies that we need to write + * only the metadata needed to be able to access the data that is written + * if we crash after the call completes. Hence if we are writing beyond + * EOF we have to log the inode size change as well, which makes it a + * full fsync. If we don't write beyond EOF, the inode core will be + * clean in memory and so we don't need to log the inode, just like + * fsync. + */ STATIC int xfs_file_fsync( struct file *filp, struct dentry *dentry, int datasync) { - int flags = FSYNC_WAIT; - - if (datasync) - flags |= FSYNC_DATA; xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); - return -xfs_fsync(XFS_I(dentry->d_inode), flags, - (xfs_off_t)0, (xfs_off_t)-1); + return -xfs_fsync(XFS_I(dentry->d_inode)); } /* diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 9d73cb5..25eb2a9 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -230,14 +230,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) #define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */ /* - * Flags to vop_fsync/reclaim. - */ -#define FSYNC_NOWAIT 0 /* asynchronous flush */ -#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */ -#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */ -#define FSYNC_DATA 0x4 /* synchronous fsync of data only */ - -/* * Tracking vnode activity. */ #if defined(XFS_INODE_TRACE) -- cgit v1.1 From 6ab455eeaff6893cd06da33843e840d888cdc04a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 May 2008 16:34:42 +1000 Subject: [XFS] Fix memory corruption with small buffer reads When we have multiple buffers in a single page for a blocksize == pagesize filesystem we might overwrite the page contents if two callers hit it shortly after each other. To prevent that we need to keep the page locked until I/O is completed and the page marked uptodate. Thanks to Eric Sandeen for triaging this bug and finding a reproducible testcase and Dave Chinner for additional advice. This should fix kernel.org bz #10421. Tested-by: Eric Sandeen SGI-PV: 981813 SGI-Modid: xfs-linux-melb:xfs-kern:31173a Signed-off-by: Christoph Hellwig Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_buf.c | 24 ++++++++++++++++++++---- fs/xfs/linux-2.6/xfs_buf.h | 19 +++++++++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 5105015..98e0e86 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -387,6 +387,8 @@ _xfs_buf_lookup_pages( if (unlikely(page == NULL)) { if (flags & XBF_READ_AHEAD) { bp->b_page_count = i; + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); return -ENOMEM; } @@ -416,17 +418,24 @@ _xfs_buf_lookup_pages( ASSERT(!PagePrivate(page)); if (!PageUptodate(page)) { page_count--; - if (blocksize < PAGE_CACHE_SIZE && !PagePrivate(page)) { + if (blocksize >= PAGE_CACHE_SIZE) { + if (flags & XBF_READ) + bp->b_flags |= _XBF_PAGE_LOCKED; + } else if (!PagePrivate(page)) { if (test_page_region(page, offset, nbytes)) page_count++; } } - unlock_page(page); bp->b_pages[i] = page; offset = 0; } + if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); + } + if (page_count == bp->b_page_count) bp->b_flags |= XBF_DONE; @@ -746,6 +755,7 @@ xfs_buf_associate_memory( bp->b_count_desired = len; bp->b_buffer_length = buflen; bp->b_flags |= XBF_MAPPED; + bp->b_flags &= ~_XBF_PAGE_LOCKED; return 0; } @@ -1093,8 +1103,10 @@ _xfs_buf_ioend( xfs_buf_t *bp, int schedule) { - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { + bp->b_flags &= ~_XBF_PAGE_LOCKED; xfs_buf_ioend(bp, schedule); + } } STATIC void @@ -1125,6 +1137,9 @@ xfs_buf_bio_end_io( if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); + + if (bp->b_flags & _XBF_PAGE_LOCKED) + unlock_page(page); } while (bvec >= bio->bi_io_vec); _xfs_buf_ioend(bp, 1); @@ -1163,7 +1178,8 @@ _xfs_buf_ioapply( * filesystem block size is not smaller than the page size. */ if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && - (bp->b_flags & XBF_READ) && + ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) == + (XBF_READ|_XBF_PAGE_LOCKED)) && (blocksize >= PAGE_CACHE_SIZE)) { bio = bio_alloc(GFP_NOIO, 1); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 841d788..f948ec7 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -66,6 +66,25 @@ typedef enum { _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ + + /* + * Special flag for supporting metadata blocks smaller than a FSB. + * + * In this case we can have multiple xfs_buf_t on a single page and + * need to lock out concurrent xfs_buf_t readers as they only + * serialise access to the buffer. + * + * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation + * between reads of the page. Hence we can have one thread read the + * page and modify it, but then race with another thread that thinks + * the page is not up-to-date and hence reads it again. + * + * The result is that the first modifcation to the page is lost. + * This sort of AGF/AGI reading race can happen when unlinking inodes + * that require truncation and results in the AGI unlinked list + * modifications being lost. + */ + _XBF_PAGE_LOCKED = (1 << 22), } xfs_buf_flags_t; typedef enum { -- cgit v1.1