From fe2bbc4832659b7ffc867cac03e0a92ae81e11e4 Mon Sep 17 00:00:00 2001
From: Henrik Kretzschmar <henne@nachtwindheim.de>
Date: Wed, 6 Sep 2006 00:03:41 -0700
Subject: [PATCH] add missing desctiption in super.c

Adds kernel-doc for alloc_super() type in fs/super.c.

Signed-off-by: Henrik Kretzschmar <henne@nachtwindheim.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 6d4e817..5c4c94d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -49,6 +49,7 @@ DEFINE_SPINLOCK(sb_lock);
 
 /**
  *	alloc_super	-	create new superblock
+ *	@type:	filesystem type superblock should belong to
  *
  *	Allocates and initializes a new &struct super_block.  alloc_super()
  *	returns a pointer new superblock or %NULL if allocation had failed.
-- 
cgit v1.1


From b835bebe95608c81270636a78b70333afb011925 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 6 Sep 2006 22:02:22 +0000
Subject: [CIFS] Fix CIFS readdir access denied when SE Linux enabled

CIFS had one path in which dentry was instantiated before the corresponding
inode metadata was filled in.

Fixes Redhat bugzilla bug #163493

Signed-off-by: Steve French <sfrench@us.ibm.com>
Acked-by: Eric Paris <eparis@redhat.com>
Acked-by: Dave Kleikamp <shaggy@austin.ibm.com>
---
 fs/cifs/readdir.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 105761e..9aeb58a 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -82,7 +82,6 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 			if(*ptmp_inode == NULL)
 				return rc;
 			rc = 1;
-			d_instantiate(tmp_dentry, *ptmp_inode);
 		}
 	} else {
 		tmp_dentry = d_alloc(file->f_dentry, qstring);
@@ -99,9 +98,7 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 			tmp_dentry->d_op = &cifs_dentry_ops;
 		if(*ptmp_inode == NULL)
 			return rc;
-		rc = 1;
-		d_instantiate(tmp_dentry, *ptmp_inode);
-		d_rehash(tmp_dentry);
+		rc = 2;
 	}
 
 	tmp_dentry->d_time = jiffies;
@@ -870,6 +867,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 				pfindEntry, &obj_type, rc);
 	else
 		fill_in_inode(tmp_inode, 1 /* NT */, pfindEntry, &obj_type, rc);
+
+	if(rc) /* new inode - needs to be tied to dentry */ {
+		d_instantiate(tmp_dentry, tmp_inode);
+		if(rc == 2)
+			d_rehash(tmp_dentry);
+	}
 	
 	
 	rc = filldir(direntry,qstring.name,qstring.len,file->f_pos,
-- 
cgit v1.1


From 4be536debe3f7b0c62283e77fd6bd8bdb9f83c6f Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 7 Sep 2006 14:26:50 +1000
Subject: [XFS] Prevent free space oversubscription and xfssyncd looping.

The fix for recent ENOSPC deadlocks introduced certain limitations on
allocations. The fix could cause xfssyncd to loop endlessly if we did not
leave some space free for the allocator to work correctly. Basically, we
needed to ensure that we had at least 4 blocks free for an AG free list
and a block for the inode bmap btree at all times.

However, this did not take into account the fact that each AG has a free
list that needs 4 blocks. Hence any filesystem with more than one AG could
cause oversubscription of free space and make xfssyncd spin forever trying
to allocate space needed for AG freelists that was not available in the
AG.

The following patch reserves space for the free lists in all AGs plus the
inode bmap btree which prevents oversubscription. It also prevents those
blocks from being reported as free space (as they can never be used) and
makes the SMP in-core superblock accounting code and the reserved block
ioctl respect this requirement.

SGI-PV: 955674
SGI-Modid: xfs-linux-melb:xfs-kern:26894a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: David Chatterton <chatz@sgi.com>
---
 fs/xfs/xfs_alloc.h  | 20 ++++++++++++++++++++
 fs/xfs/xfs_fsops.c  | 16 ++++++++++------
 fs/xfs/xfs_mount.c  | 32 ++++++++------------------------
 fs/xfs/xfs_vfsops.c |  3 ++-
 4 files changed, 40 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 650591f..5a42561 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -44,6 +44,26 @@ typedef enum xfs_alloctype
 #define	XFS_ALLOC_FLAG_FREEING	0x00000002  /* indicate caller is freeing extents*/
 
 /*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering the minimum number of
+ * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
+ * btree requires 1 fsb, so we set the number of set-aside blocks
+ * to 4 + 4*agcount.
+ */
+#define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
+
+/*
  * Argument structure for xfs_alloc routines.
  * This is turned into a structure to avoid having 20 arguments passed
  * down several levels of the stack.
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 077629b..c064e72 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -462,7 +462,7 @@ xfs_fs_counts(
 
 	xfs_icsb_sync_counters_lazy(mp);
 	s = XFS_SB_LOCK(mp);
-	cnt->freedata = mp->m_sb.sb_fdblocks;
+	cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
 	cnt->freertx = mp->m_sb.sb_frextents;
 	cnt->freeino = mp->m_sb.sb_ifree;
 	cnt->allocino = mp->m_sb.sb_icount;
@@ -519,15 +519,19 @@ xfs_reserve_blocks(
 		}
 		mp->m_resblks = request;
 	} else {
+		__int64_t	free;
+
+		free =  mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
 		delta = request - mp->m_resblks;
-		lcounter = mp->m_sb.sb_fdblocks - delta;
+		lcounter = free - delta;
 		if (lcounter < 0) {
 			/* We can't satisfy the request, just get what we can */
-			mp->m_resblks += mp->m_sb.sb_fdblocks;
-			mp->m_resblks_avail += mp->m_sb.sb_fdblocks;
-			mp->m_sb.sb_fdblocks = 0;
+			mp->m_resblks += free;
+			mp->m_resblks_avail += free;
+			mp->m_sb.sb_fdblocks = XFS_ALLOC_SET_ASIDE(mp);
 		} else {
-			mp->m_sb.sb_fdblocks = lcounter;
+			mp->m_sb.sb_fdblocks =
+				lcounter + XFS_ALLOC_SET_ASIDE(mp);
 			mp->m_resblks = request;
 			mp->m_resblks_avail += delta;
 		}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 4be5c0b..9dfae18 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1243,24 +1243,6 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
 	xfs_trans_log_buf(tp, bp, first, last);
 }
 
-/*
- * In order to avoid ENOSPC-related deadlock caused by
- * out-of-order locking of AGF buffer (PV 947395), we place
- * constraints on the relationship among actual allocations for
- * data blocks, freelist blocks, and potential file data bmap
- * btree blocks. However, these restrictions may result in no
- * actual space allocated for a delayed extent, for example, a data
- * block in a certain AG is allocated but there is no additional
- * block for the additional bmap btree block due to a split of the
- * bmap btree of the file. The result of this may lead to an
- * infinite loop in xfssyncd when the file gets flushed to disk and
- * all delayed extents need to be actually allocated. To get around
- * this, we explicitly set aside a few blocks which will not be
- * reserved in delayed allocation. Considering the minimum number of
- * needed freelist blocks is 4 fsbs, a potential split of file's bmap
- * btree requires 1 fsb, so we set the number of set-aside blocks to 8.
-*/
-#define SET_ASIDE_BLOCKS 8
 
 /*
  * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
@@ -1306,7 +1288,8 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
 		return 0;
 	case XFS_SBS_FDBLOCKS:
 
-		lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS;
+		lcounter = (long long)
+			mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
 		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
 
 		if (delta > 0) {		/* Putting blocks back */
@@ -1340,7 +1323,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
 			}
 		}
 
-		mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS;
+		mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
 		return 0;
 	case XFS_SBS_FREXTENTS:
 		lcounter = (long long)mp->m_sb.sb_frextents;
@@ -2021,7 +2004,8 @@ xfs_icsb_sync_counters_lazy(
  * when we get near ENOSPC.
  */
 #define XFS_ICSB_INO_CNTR_REENABLE	64
-#define XFS_ICSB_FDBLK_CNTR_REENABLE	512
+#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
+		(512 + XFS_ALLOC_SET_ASIDE(mp))
 STATIC void
 xfs_icsb_balance_counter(
 	xfs_mount_t	*mp,
@@ -2055,7 +2039,7 @@ xfs_icsb_balance_counter(
 	case XFS_SBS_FDBLOCKS:
 		count = mp->m_sb.sb_fdblocks;
 		resid = do_div(count, weight);
-		if (count < XFS_ICSB_FDBLK_CNTR_REENABLE)
+		if (count < XFS_ICSB_FDBLK_CNTR_REENABLE(mp))
 			goto out;
 		break;
 	default:
@@ -2110,11 +2094,11 @@ again:
 	case XFS_SBS_FDBLOCKS:
 		BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
 
-		lcounter = icsbp->icsb_fdblocks;
+		lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
 		lcounter += delta;
 		if (unlikely(lcounter < 0))
 			goto slow_path;
-		icsbp->icsb_fdblocks = lcounter;
+		icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
 		break;
 	default:
 		BUG();
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index b427d22..a34796e 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -811,7 +811,8 @@ xfs_statvfs(
 	statp->f_bsize = sbp->sb_blocksize;
 	lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
 	statp->f_blocks = sbp->sb_dblocks - lsize;
-	statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks;
+	statp->f_bfree = statp->f_bavail =
+				sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
 	fakeinos = statp->f_bfree << sbp->sb_inopblog;
 #if XFS_BIG_INUMS
 	fakeinos += mp->m_inoadd;
-- 
cgit v1.1


From 721259bce2851893155c6cb88a3f8ecb106b348c Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 7 Sep 2006 14:27:05 +1000
Subject: [XFS] Fix ABBA deadlock between i_mutex and iolock. Avoid calling
 __blockdev_direct_IO for the DIO_OWN_LOCKING case for direct I/O reads since
 it drops and reacquires the i_mutex while holding the iolock and this
 violates the locking order.

SGI-PV: 955696
SGI-Modid: xfs-linux-melb:xfs-kern:26898a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chatterton <chatz@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 18 +++++++++++++-----
 fs/xfs/linux-2.6/xfs_lrw.c  | 11 ++++++-----
 2 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c40f81b..34dcb43 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1390,11 +1390,19 @@ xfs_vm_direct_IO(
 
 	iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
 
-	ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
-		iomap.iomap_target->bt_bdev,
-		iov, offset, nr_segs,
-		xfs_get_blocks_direct,
-		xfs_end_io_direct);
+	if (rw == WRITE) {
+		ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
+			iomap.iomap_target->bt_bdev,
+			iov, offset, nr_segs,
+			xfs_get_blocks_direct,
+			xfs_end_io_direct);
+	} else {
+		ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+			iomap.iomap_target->bt_bdev,
+			iov, offset, nr_segs,
+			xfs_get_blocks_direct,
+			xfs_end_io_direct);
+	}
 
 	if (unlikely(ret <= 0 && iocb->private))
 		xfs_destroy_ioend(iocb->private);
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 5d9cfd9..110c038 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -264,7 +264,9 @@ xfs_read(
 					dmflags, &locktype);
 		if (ret) {
 			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			goto unlock_mutex;
+			if (unlikely(ioflags & IO_ISDIRECT))
+				mutex_unlock(&inode->i_mutex);
+			return ret;
 		}
 	}
 
@@ -272,6 +274,9 @@ xfs_read(
 		bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
 						-1, FI_REMAPF_LOCKED);
 
+	if (unlikely(ioflags & IO_ISDIRECT))
+		mutex_unlock(&inode->i_mutex);
+
 	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
 				(void *)iovp, segs, *offset, ioflags);
 	ret = __generic_file_aio_read(iocb, iovp, segs, offset);
@@ -281,10 +286,6 @@ xfs_read(
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
-unlock_mutex:
-	if (unlikely(ioflags & IO_ISDIRECT))
-		mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 
-- 
cgit v1.1


From 0a8d17d090a4939643a52194b7d4a4001b9b2d93 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 7 Sep 2006 14:27:15 +1000
Subject: [XFS] Fix xfs_splice_write() so appended data gets to disk.

xfs_splice_write() failed to update the on disk inode size when extending
the so when the file was closed the range extended by splice was truncated
off. Hence any region of a file written to by splice would end up as a
hole full of zeros.

SGI-PV: 955939
SGI-Modid: xfs-linux-melb:xfs-kern:26920a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: David Chatterton <chatz@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 110c038..ee788b1 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -391,6 +391,8 @@ xfs_splice_write(
 	xfs_inode_t		*ip = XFS_BHVTOI(bdp);
 	xfs_mount_t		*mp = ip->i_mount;
 	ssize_t			ret;
+	struct inode		*inode = outfilp->f_mapping->host;
+	xfs_fsize_t		isize;
 
 	XFS_STATS_INC(xs_write_calls);
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -417,6 +419,20 @@ xfs_splice_write(
 	if (ret > 0)
 		XFS_STATS_ADD(xs_write_bytes, ret);
 
+	isize = i_size_read(inode);
+	if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
+		*ppos = isize;
+
+	if (*ppos > ip->i_d.di_size) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (*ppos > ip->i_d.di_size) {
+			ip->i_d.di_size = *ppos;
+			i_size_write(inode, *ppos);
+			ip->i_update_core = 1;
+			ip->i_update_size = 1;
+		}
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	return ret;
 }
-- 
cgit v1.1


From 0edc7d0f3709e8c3bb7e69c4df614218a753361e Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Thu, 7 Sep 2006 14:27:23 +1000
Subject: [XFS] Fix a bad pointer dereference in the quota statvfs handling.

SGI-PV: 955993
SGI-Modid: xfs-linux-melb:xfs-kern:26934a

Signed-off-by: Nathan Scott <nathans@sgi.com>
Signed-off-by: David Chatterton <chatz@sgi.com>
---
 fs/xfs/quota/xfs_qm_bhv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index f137856..db8872b 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -203,7 +203,7 @@ xfs_qm_statvfs(
 	if (error || !vnode)
 		return error;
 
-	mp = XFS_BHVTOM(bhv);
+	mp = xfs_vfstom(bhvtovfs(bhv));
 	ip = xfs_vtoi(vnode);
 
 	if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
-- 
cgit v1.1


From 3665d0e58fa44f50c744f85c7e8ad21d5b10e206 Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Fri, 8 Sep 2006 09:48:21 -0700
Subject: [PATCH] ext3_getblk() should handle HOLE correctly

It has been reported that ext3_getblk() is not doing the right thing and
triggering following WARN():

BUG: warning at fs/ext3/inode.c:1016/ext3_getblk()
 <c01c5140> ext3_getblk+0x98/0x2a6  <c03b2806> md_wakeup_thread+0x26/0x2a
 <c01c536d> ext3_bread+0x1f/0x88  <c01cedf9> ext3_quota_read+0x136/0x1ae
 <c018b683> v1_read_dqblk+0x61/0xac  <c0188f32> dquot_acquire+0xf6/0x107
 <c01ceaba> ext3_acquire_dquot+0x46/0x68  <c01897d4> dqget+0x155/0x1e7
 <c018a97b> dquot_transfer+0x3e0/0x3e9  <c016fe52> dput+0x23/0x13e
 <c01c7986> ext3_setattr+0xc3/0x240  <c0120f66> current_fs_time+0x52/0x6a
 <c017320e> notify_change+0x2bd/0x30d  <c0159246> chown_common+0x9c/0xc5
 <c02a222c> strncpy_from_user+0x3b/0x68  <c0167fe6> do_path_lookup+0xdf/0x266
 <c016841b> __user_walk_fd+0x44/0x5a  <c01592b9> sys_chown+0x4a/0x55
 <c015a43c> vfs_write+0xe7/0x13c  <c01695d4> sys_mkdir+0x1f/0x23
 <c0102a97> syscall_call+0x7/0xb

Looking at the code, it looks like it's not handle HOLE correctly.  It ends
up returning -EIO.  Here is the patch to fix it.

If we really want to be paranoid, we can allow return values 0 (HOLE), 1
(we asked for one block) and return -EIO for more than 1 block.  But I
really don't see a reason for doing it - all we need is the block# here.
(doesn't matter how many blocks are mapped).

ext3_get_blocks_handle() returns number of blocks it mapped.  It returns 0
in case of HOLE.  ext3_getblk() should handle HOLE properly (currently its
dumping warning stack and returning -EIO).

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Acked-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/inode.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index c5ee9f0..0f0b1ea 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1009,11 +1009,14 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
 	buffer_trace_init(&dummy.b_history);
 	err = ext3_get_blocks_handle(handle, inode, block, 1,
 					&dummy, create, 1);
-	if (err == 1) {
+	/*
+	 * ext3_get_blocks_handle() returns number of blocks
+	 * mapped. 0 in case of a HOLE.
+	 */
+	if (err > 0) {
+		if (err > 1)
+			WARN_ON(1);
 		err = 0;
-	} else if (err >= 0) {
-		WARN_ON(1);
-		err = -EIO;
 	}
 	*errp = err;
 	if (!err && buffer_mapped(&dummy)) {
-- 
cgit v1.1


From e9f7bee1df223dcf83743b46cb06c08d95497ec0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 8 Sep 2006 09:48:54 -0700
Subject: [PATCH] NFS: large non-page-aligned direct I/O clobbers memory

The logic in nfs_direct_read_schedule and nfs_direct_write_schedule can
allow data->npages to be one larger than rpages.  This causes a page
pointer to be written beyond the end of the pagevec in nfs_read_data (or
nfs_write_data).

Fix this by making nfs_(read|write)_alloc() calculate the size of the
pagevec array, and initialise data->npages.

Also get rid of the redundant argument to nfs_commit_alloc().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/direct.c | 50 ++++++++++++++------------------------------------
 fs/nfs/read.c   | 24 +++++++++++++-----------
 fs/nfs/write.c  | 37 +++++++++++++++----------------------
 3 files changed, 42 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index fecd3b0..76ca1cb 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -100,25 +100,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
 	return atomic_dec_and_test(&dreq->io_count);
 }
 
-/*
- * "size" is never larger than rsize or wsize.
- */
-static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size)
-{
-	int page_count;
-
-	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	page_count -= user_addr >> PAGE_SHIFT;
-	BUG_ON(page_count < 0);
-
-	return page_count;
-}
-
-static inline unsigned int nfs_max_pages(unsigned int size)
-{
-	return (size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
-
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * @rw: direction (read or write)
@@ -276,28 +257,24 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	size_t rsize = NFS_SERVER(inode)->rsize;
-	unsigned int rpages = nfs_max_pages(rsize);
 	unsigned int pgbase;
 	int result;
 	ssize_t started = 0;
 
 	get_dreq(dreq);
 
-	pgbase = user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_read_data *data;
 		size_t bytes;
 
+		pgbase = user_addr & ~PAGE_MASK;
+		bytes = min(rsize,count);
+
 		result = -ENOMEM;
-		data = nfs_readdata_alloc(rpages);
+		data = nfs_readdata_alloc(pgbase + bytes);
 		if (unlikely(!data))
 			break;
 
-		bytes = rsize;
-		if (count < rsize)
-			bytes = count;
-
-		data->npages = nfs_direct_count_pages(user_addr, bytes);
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
 					data->npages, 1, 0, data->pagevec, NULL);
@@ -344,8 +321,10 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 		started += bytes;
 		user_addr += bytes;
 		pos += bytes;
+		/* FIXME: Remove this unnecessary math from final patch */
 		pgbase += bytes;
 		pgbase &= ~PAGE_MASK;
+		BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
 
 		count -= bytes;
 	} while (count != 0);
@@ -524,7 +503,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 
 static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
 {
-	dreq->commit_data = nfs_commit_alloc(0);
+	dreq->commit_data = nfs_commit_alloc();
 	if (dreq->commit_data != NULL)
 		dreq->commit_data->req = (struct nfs_page *) dreq;
 }
@@ -605,28 +584,24 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	size_t wsize = NFS_SERVER(inode)->wsize;
-	unsigned int wpages = nfs_max_pages(wsize);
 	unsigned int pgbase;
 	int result;
 	ssize_t started = 0;
 
 	get_dreq(dreq);
 
-	pgbase = user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_write_data *data;
 		size_t bytes;
 
+		pgbase = user_addr & ~PAGE_MASK;
+		bytes = min(wsize,count);
+
 		result = -ENOMEM;
-		data = nfs_writedata_alloc(wpages);
+		data = nfs_writedata_alloc(pgbase + bytes);
 		if (unlikely(!data))
 			break;
 
-		bytes = wsize;
-		if (count < wsize)
-			bytes = count;
-
-		data->npages = nfs_direct_count_pages(user_addr, bytes);
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
 					data->npages, 0, 0, data->pagevec, NULL);
@@ -676,8 +651,11 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 		started += bytes;
 		user_addr += bytes;
 		pos += bytes;
+
+		/* FIXME: Remove this useless math from the final patch */
 		pgbase += bytes;
 		pgbase &= ~PAGE_MASK;
+		BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
 
 		count -= bytes;
 	} while (count != 0);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index da9cf11..7a9ee00 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -43,13 +43,15 @@ static mempool_t *nfs_rdata_mempool;
 
 #define MIN_POOL_READ	(32)
 
-struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
+struct nfs_read_data *nfs_readdata_alloc(size_t len)
 {
+	unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
+		p->npages = pagecount;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
@@ -140,7 +142,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
 	int		result;
 	struct nfs_read_data *rdata;
 
-	rdata = nfs_readdata_alloc(1);
+	rdata = nfs_readdata_alloc(count);
 	if (!rdata)
 		return -ENOMEM;
 
@@ -336,25 +338,25 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
 	struct nfs_page *req = nfs_list_entry(head->next);
 	struct page *page = req->wb_page;
 	struct nfs_read_data *data;
-	unsigned int rsize = NFS_SERVER(inode)->rsize;
-	unsigned int nbytes, offset;
+	size_t rsize = NFS_SERVER(inode)->rsize, nbytes;
+	unsigned int offset;
 	int requests = 0;
 	LIST_HEAD(list);
 
 	nfs_list_remove_request(req);
 
 	nbytes = req->wb_bytes;
-	for(;;) {
-		data = nfs_readdata_alloc(1);
+	do {
+		size_t len = min(nbytes,rsize);
+
+		data = nfs_readdata_alloc(len);
 		if (!data)
 			goto out_bad;
 		INIT_LIST_HEAD(&data->pages);
 		list_add(&data->pages, &list);
 		requests++;
-		if (nbytes <= rsize)
-			break;
-		nbytes -= rsize;
-	}
+		nbytes -= len;
+	} while(nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
 
 	ClearPageError(page);
@@ -402,7 +404,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode)
 	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
 		return nfs_pagein_multi(head, inode);
 
-	data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages);
+	data = nfs_readdata_alloc(NFS_SERVER(inode)->rsize);
 	if (!data)
 		goto out_bad;
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5077499..8ab3cf1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -90,22 +90,13 @@ static mempool_t *nfs_commit_mempool;
 
 static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
 
-struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
+struct nfs_write_data *nfs_commit_alloc(void)
 {
 	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount <= ARRAY_SIZE(p->page_array))
-			p->pagevec = p->page_array;
-		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
-			if (!p->pagevec) {
-				mempool_free(p, nfs_commit_mempool);
-				p = NULL;
-			}
-		}
 	}
 	return p;
 }
@@ -117,13 +108,15 @@ void nfs_commit_free(struct nfs_write_data *p)
 	mempool_free(p, nfs_commit_mempool);
 }
 
-struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
+struct nfs_write_data *nfs_writedata_alloc(size_t len)
 {
+	unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
+		p->npages = pagecount;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
@@ -208,7 +201,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
 	int		result, written = 0;
 	struct nfs_write_data *wdata;
 
-	wdata = nfs_writedata_alloc(1);
+	wdata = nfs_writedata_alloc(wsize);
 	if (!wdata)
 		return -ENOMEM;
 
@@ -999,24 +992,24 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
 	struct nfs_page *req = nfs_list_entry(head->next);
 	struct page *page = req->wb_page;
 	struct nfs_write_data *data;
-	unsigned int wsize = NFS_SERVER(inode)->wsize;
-	unsigned int nbytes, offset;
+	size_t wsize = NFS_SERVER(inode)->wsize, nbytes;
+	unsigned int offset;
 	int requests = 0;
 	LIST_HEAD(list);
 
 	nfs_list_remove_request(req);
 
 	nbytes = req->wb_bytes;
-	for (;;) {
-		data = nfs_writedata_alloc(1);
+	do {
+		size_t len = min(nbytes, wsize);
+
+		data = nfs_writedata_alloc(len);
 		if (!data)
 			goto out_bad;
 		list_add(&data->pages, &list);
 		requests++;
-		if (nbytes <= wsize)
-			break;
-		nbytes -= wsize;
-	}
+		nbytes -= len;
+	} while (nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
 
 	ClearPageError(page);
@@ -1070,7 +1063,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
 	struct nfs_write_data	*data;
 	unsigned int		count;
 
-	data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
+	data = nfs_writedata_alloc(NFS_SERVER(inode)->wsize);
 	if (!data)
 		goto out_bad;
 
@@ -1378,7 +1371,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 	struct nfs_write_data	*data;
 	struct nfs_page         *req;
 
-	data = nfs_commit_alloc(NFS_SERVER(inode)->wpages);
+	data = nfs_commit_alloc();
 
 	if (!data)
 		goto out_bad;
-- 
cgit v1.1