From 1c1266bb916e6a6b362d3be95f2cc7f3c41277a6 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 12 Jan 2011 16:53:27 -0800 Subject: ceph: fix getattr on directory when using norbytes The norbytes mount option was broken, and when doing getattr on a directory it return the rbytes instead of the number of entities. This commit fixes it. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e791fa3..50001de 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -701,10 +701,6 @@ static int fill_inode(struct inode *inode, ci->i_ceph_flags |= CEPH_I_COMPLETE; ci->i_max_offset = 2; } - - /* it may be better to set st_size in getattr instead? */ - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) - inode->i_size = ci->i_rbytes; break; default: pr_err("fill_inode %llx.%llx BAD mode 0%o\n", @@ -1805,7 +1801,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, else stat->dev = 0; if (S_ISDIR(inode->i_mode)) { - stat->size = ci->i_rbytes; + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), + RBYTES)) + stat->size = ci->i_rbytes; + else + stat->size = ci->i_files + ci->i_subdirs; stat->blocks = 0; stat->blksize = 65536; } -- cgit v1.1 From 17db143fc091238c43ab9f373974ca2224a4c3f8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Jan 2011 15:27:29 -0800 Subject: ceph: fix xattr rbtree search Fix xattr name comparison in rbtree search for strings that share a prefix. The *name argument is null terminated, but the xattr name is not, so we need to use strncmp, but that means adjusting for the case where name is a prefix of xattr->name. The corresponding case in __set_xattr() already handles this properly (although in that case *name is also not null terminated). Reported-by: Sergiy Kibrik Signed-off-by: Sage Weil --- fs/ceph/xattr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ceph') diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 6e12a6b..8c9eba6 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -219,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, struct rb_node **p; struct rb_node *parent = NULL; struct ceph_inode_xattr *xattr = NULL; + int name_len = strlen(name); int c; p = &ci->i_xattrs.index.rb_node; @@ -226,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, parent = *p; xattr = rb_entry(parent, struct ceph_inode_xattr, node); c = strncmp(name, xattr->name, xattr->name_len); + if (c == 0 && name_len > xattr->name_len) + c = 1; if (c < 0) p = &(*p)->rb_left; else if (c > 0) -- cgit v1.1 From 50aac4fec503960380ab594a93a6fbfdf3f8915f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Jan 2011 07:59:40 -0800 Subject: ceph: fix cap_wanted_delay_{min,max} mount option initialization These were initialized to 0 instead of the default, fallout from the RBD refactor in 3d14c5d2b6e15c21d8e5467dc62d33127c23a644. Signed-off-by: Sage Weil --- fs/ceph/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ceph') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bf6f0f3..9c50854 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -290,6 +290,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; -- cgit v1.1 From 24be0c481067560b11441e794e27f166a3568863 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Jan 2011 08:48:06 -0800 Subject: ceph: fix erroneous cap flush to non-auth mds The int flushing is global and not clear on each iteration of the loop, which can cause a second flush of caps to any MDSs with ids greater than the auth. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 60d27bc..f654c7e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1658,6 +1658,8 @@ ack: if (cap == ci->i_auth_cap && ci->i_dirty_caps) flushing = __mark_caps_flushing(inode, session); + else + flushing = 0; mds = cap->mds; /* remember mds, so we don't repeat */ sent++; -- cgit v1.1 From 088b3f5e9ee2649f5cfc2f08d8ce654e3eeba310 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Jan 2011 08:56:01 -0800 Subject: ceph: fix flushing of caps vs cap import If we are mid-flush and a cap is migrated to another node, we need to resend the cap flush message to the new MDS, and do so with the original flush_seq to avoid leaking across a sync boundary. Previously we didn't redo the flush (we only flushed newly dirty data), which would cause a later sync to hang forever. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f654c7e..7def3f5 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1560,9 +1560,10 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ revoking = cap->implemented & ~cap->issued; - if (revoking) - dout(" mds%d revoking %s\n", cap->mds, - ceph_cap_string(revoking)); + dout(" mds%d cap %p issued %s implemented %s revoking %s\n", + cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented), + ceph_cap_string(revoking)); if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { @@ -1942,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, } } +static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&inode->i_lock); + cap = ci->i_auth_cap; + dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, + ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + __ceph_flush_snaps(ci, &session, 1); + if (ci->i_flushing_caps) { + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&inode->i_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&inode->i_lock); + } + } else { + spin_unlock(&inode->i_lock); + } +} + /* * Take references to capabilities we hold, so that we don't release @@ -2689,7 +2719,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, ceph_add_cap(inode, session, cap_id, -1, issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, NULL /* no caps context */); - try_flush_caps(inode, session, NULL); + kick_flushing_inode_caps(mdsc, session, inode); up_read(&mdsc->snap_rwsem); /* make sure we re-request max_size, if necessary */ -- cgit v1.1 From 7e57b81c7688c762bc9e775bc83f9fc17946f527 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Jan 2011 09:00:01 -0800 Subject: ceph: avoid immediate cap check after import The NODELAY flag avoids the heuristics that delay cap (issued/wanted) release. There's no reason for that after we import a cap, and it kills whatever benefit we get from those delays. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7def3f5..6b61ded 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2817,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_IMPORT: handle_cap_import(mdsc, inode, h, session, snaptrace, snaptrace_len); - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, - session); + ceph_check_caps(ceph_inode(inode), 0, session); goto done_unlocked; } -- cgit v1.1 From d66bbd441c08fe00ed2add1cf70cb243ebc2b27e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 21 Jan 2011 21:16:46 -0800 Subject: ceph: avoid picking MDS that is not active Ignore replication or auth frag data if it indicates an MDS that is not active. This can happen if the MDS shuts down and the client has stale data about the namespace distribution across the MDS cluster. If that's the case, fall back to directing the request based on the auth cap (which should always be accurate). Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 509339c..a6949cc 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -693,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc, dout("choose_mds %p %llx.%llx " "frag %u mds%d (%d/%d)\n", inode, ceph_vinop(inode), - frag.frag, frag.mds, + frag.frag, mds, (int)r, frag.ndist); - return mds; + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; } /* since this file/dir wasn't known to be @@ -708,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, dout("choose_mds %p %llx.%llx " "frag %u mds%d (auth)\n", inode, ceph_vinop(inode), frag.frag, mds); - return mds; + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; } } } -- cgit v1.1