aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/Makefile7
-rw-r--r--fs/btrfs/acl.c104
-rw-r--r--fs/btrfs/async-thread.c120
-rw-r--r--fs/btrfs/async-thread.h4
-rw-r--r--fs/btrfs/btrfs_inode.h43
-rw-r--r--fs/btrfs/compression.c17
-rw-r--r--fs/btrfs/ctree.c484
-rw-r--r--fs/btrfs/ctree.h272
-rw-r--r--fs/btrfs/delayed-inode.c112
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/dir-item.c39
-rw-r--r--fs/btrfs/disk-io.c803
-rw-r--r--fs/btrfs/disk-io.h14
-rw-r--r--fs/btrfs/extent-tree.c1494
-rw-r--r--fs/btrfs/extent_io.c1044
-rw-r--r--fs/btrfs/extent_io.h80
-rw-r--r--fs/btrfs/extent_map.c155
-rw-r--r--fs/btrfs/file-item.c69
-rw-r--r--fs/btrfs/file.c293
-rw-r--r--fs/btrfs/free-space-cache.c1083
-rw-r--r--fs/btrfs/inode-map.c69
-rw-r--r--fs/btrfs/inode.c1058
-rw-r--r--fs/btrfs/ioctl.c350
-rw-r--r--fs/btrfs/ioctl.h29
-rw-r--r--fs/btrfs/locking.c280
-rw-r--r--fs/btrfs/locking.h36
-rw-r--r--fs/btrfs/print-tree.c8
-rw-r--r--fs/btrfs/ref-cache.c68
-rw-r--r--fs/btrfs/ref-cache.h52
-rw-r--r--fs/btrfs/relocation.c43
-rw-r--r--fs/btrfs/root-tree.c5
-rw-r--r--fs/btrfs/scrub.c668
-rw-r--r--fs/btrfs/struct-funcs.c100
-rw-r--r--fs/btrfs/super.c347
-rw-r--r--fs/btrfs/transaction.c262
-rw-r--r--fs/btrfs/tree-log.c39
-rw-r--r--fs/btrfs/volumes.c270
-rw-r--r--fs/btrfs/volumes.h24
-rw-r--r--fs/btrfs/xattr.c184
-rw-r--r--fs/hfs/bnode.c9
-rw-r--r--fs/hfs/brec.c20
-rw-r--r--fs/hfs/dir.c4
-rw-r--r--fs/hfs/inode.c19
-rw-r--r--fs/hfs/super.c4
-rw-r--r--fs/hfsplus/bnode.c3
-rw-r--r--fs/hfsplus/brec.c24
-rw-r--r--fs/hfsplus/catalog.c14
-rw-r--r--fs/hfsplus/dir.c12
-rw-r--r--fs/hfsplus/extents.c50
-rw-r--r--fs/hfsplus/hfsplus_fs.h5
-rw-r--r--fs/hfsplus/inode.c38
-rw-r--r--fs/hfsplus/options.c4
-rw-r--r--fs/hfsplus/super.c36
-rw-r--r--fs/hfsplus/unicode.c35
-rw-r--r--fs/hfsplus/wrapper.c9
-rw-r--r--fs/hostfs/hostfs_kern.c25
-rw-r--r--fs/hostfs/hostfs_user.c1
-rw-r--r--fs/hpfs/alloc.c66
-rw-r--r--fs/hpfs/dir.c16
-rw-r--r--fs/hpfs/file.c12
-rw-r--r--fs/hpfs/hpfs_fn.h8
-rw-r--r--fs/hpfs/inode.c10
-rw-r--r--fs/hpfs/namei.c10
-rw-r--r--fs/hpfs/super.c32
-rw-r--r--fs/hppfs/hppfs.c7
-rw-r--r--fs/hugetlbfs/inode.c60
-rw-r--r--fs/isofs/dir.c3
-rw-r--r--fs/isofs/inode.c30
-rw-r--r--fs/isofs/isofs.h24
-rw-r--r--fs/isofs/namei.c13
-rw-r--r--fs/isofs/rock.c55
-rw-r--r--fs/jbd/checkpoint.c64
-rw-r--r--fs/jbd/commit.c57
-rw-r--r--fs/jbd/journal.c99
-rw-r--r--fs/jbd/recovery.c4
-rw-r--r--fs/jbd/transaction.c83
-rw-r--r--fs/jfs/acl.c73
-rw-r--r--fs/jfs/file.c13
-rw-r--r--fs/jfs/inode.c4
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/jfs/jfs_dmap.c5
-rw-r--r--fs/jfs/jfs_dtree.c10
-rw-r--r--fs/jfs/jfs_imap.c6
-rw-r--r--fs/jfs/jfs_inode.c5
-rw-r--r--fs/jfs/jfs_inode.h2
-rw-r--r--fs/jfs/jfs_logmgr.c1
-rw-r--r--fs/jfs/jfs_txnmgr.c6
-rw-r--r--fs/jfs/jfs_umount.c4
-rw-r--r--fs/jfs/namei.c66
-rw-r--r--fs/jfs/super.c1
-rw-r--r--fs/jfs/xattr.c61
-rw-r--r--fs/nfsd/Kconfig12
-rw-r--r--fs/nfsd/cache.h2
-rw-r--r--fs/nfsd/export.c434
-rw-r--r--fs/nfsd/lockd.c2
-rw-r--r--fs/nfsd/nfs4acl.c18
-rw-r--r--fs/nfsd/nfs4callback.c32
-rw-r--r--fs/nfsd/nfs4proc.c438
-rw-r--r--fs/nfsd/nfs4recover.c103
-rw-r--r--fs/nfsd/nfs4state.c1810
-rw-r--r--fs/nfsd/nfs4xdr.c555
-rw-r--r--fs/nfsd/nfscache.c3
-rw-r--r--fs/nfsd/nfsctl.c345
-rw-r--r--fs/nfsd/nfsd.h33
-rw-r--r--fs/nfsd/nfsfh.c39
-rw-r--r--fs/nfsd/nfssvc.c38
-rw-r--r--fs/nfsd/state.h167
-rw-r--r--fs/nfsd/vfs.c259
-rw-r--r--fs/nfsd/vfs.h42
-rw-r--r--fs/nfsd/xdr4.h56
-rw-r--r--fs/nilfs2/btree.c47
-rw-r--r--fs/nilfs2/file.c12
-rw-r--r--fs/nilfs2/inode.c21
-rw-r--r--fs/nilfs2/ioctl.c3
-rw-r--r--fs/nilfs2/namei.c9
-rw-r--r--fs/nilfs2/nilfs.h14
-rw-r--r--fs/nilfs2/page.c2
-rw-r--r--fs/nilfs2/segment.c66
-rw-r--r--fs/nilfs2/segment.h5
-rw-r--r--fs/ocfs2/acl.c81
-rw-r--r--fs/ocfs2/acl.h2
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c93
-rw-r--r--fs/ocfs2/aops.h14
-rw-r--r--fs/ocfs2/buffer_head_io.c2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c194
-rw-r--r--fs/ocfs2/cluster/netdebug.c102
-rw-r--r--fs/ocfs2/cluster/tcp.c139
-rw-r--r--fs/ocfs2/cluster/tcp.h2
-rw-r--r--fs/ocfs2/dcache.c2
-rw-r--r--fs/ocfs2/dir.c7
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h56
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c1
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c44
-rw-r--r--fs/ocfs2/dlm/dlmlock.c54
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c197
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c199
-rw-r--r--fs/ocfs2/dlm/dlmthread.c16
-rw-r--r--fs/ocfs2/dlmglue.c33
-rw-r--r--fs/ocfs2/extent_map.c96
-rw-r--r--fs/ocfs2/extent_map.h2
-rw-r--r--fs/ocfs2/file.c161
-rw-r--r--fs/ocfs2/file.h2
-rw-r--r--fs/ocfs2/inode.c6
-rw-r--r--fs/ocfs2/inode.h3
-rw-r--r--fs/ocfs2/ioctl.c11
-rw-r--r--fs/ocfs2/journal.c23
-rw-r--r--fs/ocfs2/journal.h5
-rw-r--r--fs/ocfs2/mmap.c53
-rw-r--r--fs/ocfs2/move_extents.c3
-rw-r--r--fs/ocfs2/namei.c19
-rw-r--r--fs/ocfs2/ocfs2.h51
-rw-r--r--fs/ocfs2/quota_global.c27
-rw-r--r--fs/ocfs2/quota_local.c27
-rw-r--r--fs/ocfs2/refcounttree.c49
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/stack_o2cb.c71
-rw-r--r--fs/ocfs2/super.c29
-rw-r--r--fs/ocfs2/super.h14
-rw-r--r--fs/ocfs2/xattr.c50
-rw-r--r--fs/reiserfs/bitmap.c8
-rw-r--r--fs/reiserfs/dir.c19
-rw-r--r--fs/reiserfs/file.c13
-rw-r--r--fs/reiserfs/inode.c44
-rw-r--r--fs/reiserfs/journal.c43
-rw-r--r--fs/reiserfs/namei.c20
-rw-r--r--fs/reiserfs/resize.c10
-rw-r--r--fs/reiserfs/super.c1
-rw-r--r--fs/reiserfs/xattr.c36
-rw-r--r--fs/reiserfs/xattr_acl.c77
-rw-r--r--fs/reiserfs/xattr_security.c4
-rw-r--r--fs/ubifs/budget.c2
-rw-r--r--fs/ubifs/commit.c18
-rw-r--r--fs/ubifs/debug.c778
-rw-r--r--fs/ubifs/debug.h252
-rw-r--r--fs/ubifs/dir.c16
-rw-r--r--fs/ubifs/file.c26
-rw-r--r--fs/ubifs/io.c168
-rw-r--r--fs/ubifs/log.c25
-rw-r--r--fs/ubifs/lprops.c8
-rw-r--r--fs/ubifs/lpt.c37
-rw-r--r--fs/ubifs/lpt_commit.c40
-rw-r--r--fs/ubifs/master.c7
-rw-r--r--fs/ubifs/misc.h103
-rw-r--r--fs/ubifs/orphan.c7
-rw-r--r--fs/ubifs/recovery.c45
-rw-r--r--fs/ubifs/replay.c3
-rw-r--r--fs/ubifs/sb.c8
-rw-r--r--fs/ubifs/scan.c4
-rw-r--r--fs/ubifs/shrinker.c1
-rw-r--r--fs/ubifs/super.c27
-rw-r--r--fs/ubifs/tnc.c26
-rw-r--r--fs/ubifs/tnc_commit.c145
-rw-r--r--fs/ubifs/ubifs.h27
-rw-r--r--fs/ubifs/xattr.c4
-rw-r--r--fs/udf/balloc.c14
-rw-r--r--fs/udf/dir.c3
-rw-r--r--fs/udf/directory.c8
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/inode.c137
-rw-r--r--fs/udf/lowlevel.c2
-rw-r--r--fs/udf/misc.c19
-rw-r--r--fs/udf/namei.c23
-rw-r--r--fs/udf/partition.c19
-rw-r--r--fs/udf/super.c288
-rw-r--r--fs/udf/symlink.c67
-rw-r--r--fs/udf/truncate.c22
-rw-r--r--fs/udf/udf_sb.h5
-rw-r--r--fs/udf/udfdecl.h38
-rw-r--r--fs/udf/udftime.c3
-rw-r--r--fs/udf/unicode.c34
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/inode.c4
-rw-r--r--fs/ufs/namei.c2
-rw-r--r--fs/ufs/ufs.h9
215 files changed, 12242 insertions, 8379 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9b72dcf..c0ddfd2 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
- export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
- compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+ export.o tree-log.o free-space-cache.o zlib.o lzo.o \
+ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+ reada.o backref.o
+
+btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f66fc99..89b156d 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -28,9 +28,7 @@
#include "btrfs_inode.h"
#include "xattr.h"
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-
-static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
{
int size;
const char *name;
@@ -61,22 +59,19 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
if (!value)
return ERR_PTR(-ENOMEM);
size = __btrfs_getxattr(inode, name, value, size);
- if (size > 0) {
- acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl)) {
- kfree(value);
- return acl;
- }
- set_cached_acl(inode, type, acl);
- }
- kfree(value);
+ }
+ if (size > 0) {
+ acl = posix_acl_from_xattr(value, size);
} else if (size == -ENOENT || size == -ENODATA || size == 0) {
/* FIXME, who returns -ENOENT? I think nobody */
acl = NULL;
- set_cached_acl(inode, type, acl);
} else {
acl = ERR_PTR(-EIO);
}
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
return acl;
}
@@ -111,7 +106,6 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
int ret, size = 0;
const char *name;
char *value = NULL;
- mode_t mode;
if (acl) {
ret = posix_acl_valid(acl);
@@ -122,13 +116,11 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
switch (type) {
case ACL_TYPE_ACCESS:
- mode = inode->i_mode;
name = POSIX_ACL_XATTR_ACCESS;
if (acl) {
- ret = posix_acl_equiv_mode(acl, &mode);
+ ret = posix_acl_equiv_mode(acl, &inode->i_mode);
if (ret < 0)
return ret;
- inode->i_mode = mode;
}
ret = 0;
break;
@@ -195,28 +187,6 @@ out:
return ret;
}
-int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
-{
- int error = -EAGAIN;
-
- if (flags & IPERM_FLAG_RCU) {
- if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
- error = -ECHILD;
-
- } else {
- struct posix_acl *acl;
- acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- }
- }
-
- return error;
-}
-
/*
* btrfs_init_acl is already generally called under fs_mutex, so the locking
* stuff has been fixed to work with that. If the locking stuff changes, we
@@ -244,31 +214,20 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
}
if (IS_POSIXACL(dir) && acl) {
- struct posix_acl *clone;
- mode_t mode;
-
if (S_ISDIR(inode->i_mode)) {
ret = btrfs_set_acl(trans, inode, acl,
ACL_TYPE_DEFAULT);
if (ret)
goto failed;
}
- clone = posix_acl_clone(acl, GFP_NOFS);
- ret = -ENOMEM;
- if (!clone)
- goto failed;
-
- mode = inode->i_mode;
- ret = posix_acl_create_masq(clone, &mode);
- if (ret >= 0) {
- inode->i_mode = mode;
- if (ret > 0) {
- /* we need an acl */
- ret = btrfs_set_acl(trans, inode, clone,
- ACL_TYPE_ACCESS);
- }
+ ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0) {
+ /* we need an acl */
+ ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
}
- posix_acl_release(clone);
}
failed:
posix_acl_release(acl);
@@ -278,7 +237,7 @@ failed:
int btrfs_acl_chmod(struct inode *inode)
{
- struct posix_acl *acl, *clone;
+ struct posix_acl *acl;
int ret = 0;
if (S_ISLNK(inode->i_mode))
@@ -291,17 +250,11 @@ int btrfs_acl_chmod(struct inode *inode)
if (IS_ERR_OR_NULL(acl))
return PTR_ERR(acl);
- clone = posix_acl_clone(acl, GFP_KERNEL);
+ ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ if (ret)
+ return ret;
+ ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS);
posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
-
- ret = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!ret)
- ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS);
-
- posix_acl_release(clone);
-
return ret;
}
@@ -318,18 +271,3 @@ const struct xattr_handler btrfs_xattr_acl_access_handler = {
.get = btrfs_xattr_acl_get,
.set = btrfs_xattr_acl_set,
};
-
-#else /* CONFIG_BTRFS_FS_POSIX_ACL */
-
-int btrfs_acl_chmod(struct inode *inode)
-{
- return 0;
-}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
-{
- return 0;
-}
-
-#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8006a28..03321e5 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,8 @@ struct btrfs_worker_thread {
int idle;
};
+static int __btrfs_start_workers(struct btrfs_workers *workers);
+
/*
* btrfs_start_workers uses kthread_run, which can block waiting for memory
* for a very long time. It will actually throttle on page writeback,
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
{
struct worker_start *start;
start = container_of(work, struct worker_start, work);
- btrfs_start_workers(start->queue, 1);
+ __btrfs_start_workers(start->queue);
kfree(start);
}
-static int start_new_worker(struct btrfs_workers *queue)
-{
- struct worker_start *start;
- int ret;
-
- start = kzalloc(sizeof(*start), GFP_NOFS);
- if (!start)
- return -ENOMEM;
-
- start->work.func = start_new_worker_func;
- start->queue = queue;
- ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
- if (ret)
- kfree(start);
- return ret;
-}
-
/*
* helper function to move a thread onto the idle list after it
* has finished some requests.
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
{
struct btrfs_workers *workers = worker->workers;
+ struct worker_start *start;
unsigned long flags;
rmb();
if (!workers->atomic_start_pending)
return;
+ start = kzalloc(sizeof(*start), GFP_NOFS);
+ if (!start)
+ return;
+
+ start->work.func = start_new_worker_func;
+ start->queue = workers;
+
spin_lock_irqsave(&workers->lock, flags);
if (!workers->atomic_start_pending)
goto out;
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
workers->num_workers_starting += 1;
spin_unlock_irqrestore(&workers->lock, flags);
- start_new_worker(workers);
+ btrfs_queue_worker(workers->atomic_worker_start, &start->work);
return;
out:
+ kfree(start);
spin_unlock_irqrestore(&workers->lock, flags);
}
@@ -338,7 +332,7 @@ again:
run_ordered_completions(worker->workers, work);
check_pending_worker_creates(worker);
-
+ cond_resched();
}
spin_lock_irq(&worker->lock);
@@ -469,56 +463,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
* starts new worker threads. This does not enforce the max worker
* count in case you need to temporarily go past it.
*/
-static int __btrfs_start_workers(struct btrfs_workers *workers,
- int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
int ret = 0;
- int i;
- for (i = 0; i < num_workers; i++) {
- worker = kzalloc(sizeof(*worker), GFP_NOFS);
- if (!worker) {
- ret = -ENOMEM;
- goto fail;
- }
+ worker = kzalloc(sizeof(*worker), GFP_NOFS);
+ if (!worker) {
+ ret = -ENOMEM;
+ goto fail;
+ }
- INIT_LIST_HEAD(&worker->pending);
- INIT_LIST_HEAD(&worker->prio_pending);
- INIT_LIST_HEAD(&worker->worker_list);
- spin_lock_init(&worker->lock);
-
- atomic_set(&worker->num_pending, 0);
- atomic_set(&worker->refs, 1);
- worker->workers = workers;
- worker->task = kthread_run(worker_loop, worker,
- "btrfs-%s-%d", workers->name,
- workers->num_workers + i);
- if (IS_ERR(worker->task)) {
- ret = PTR_ERR(worker->task);
- kfree(worker);
- goto fail;
- }
- spin_lock_irq(&workers->lock);
- list_add_tail(&worker->worker_list, &workers->idle_list);
- worker->idle = 1;
- workers->num_workers++;
- workers->num_workers_starting--;
- WARN_ON(workers->num_workers_starting < 0);
- spin_unlock_irq(&workers->lock);
+ INIT_LIST_HEAD(&worker->pending);
+ INIT_LIST_HEAD(&worker->prio_pending);
+ INIT_LIST_HEAD(&worker->worker_list);
+ spin_lock_init(&worker->lock);
+
+ atomic_set(&worker->num_pending, 0);
+ atomic_set(&worker->refs, 1);
+ worker->workers = workers;
+ worker->task = kthread_run(worker_loop, worker,
+ "btrfs-%s-%d", workers->name,
+ workers->num_workers + 1);
+ if (IS_ERR(worker->task)) {
+ ret = PTR_ERR(worker->task);
+ kfree(worker);
+ goto fail;
}
+ spin_lock_irq(&workers->lock);
+ list_add_tail(&worker->worker_list, &workers->idle_list);
+ worker->idle = 1;
+ workers->num_workers++;
+ workers->num_workers_starting--;
+ WARN_ON(workers->num_workers_starting < 0);
+ spin_unlock_irq(&workers->lock);
+
return 0;
fail:
- btrfs_stop_workers(workers);
+ spin_lock_irq(&workers->lock);
+ workers->num_workers_starting--;
+ spin_unlock_irq(&workers->lock);
return ret;
}
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+int btrfs_start_workers(struct btrfs_workers *workers)
{
spin_lock_irq(&workers->lock);
- workers->num_workers_starting += num_workers;
+ workers->num_workers_starting++;
spin_unlock_irq(&workers->lock);
- return __btrfs_start_workers(workers, num_workers);
+ return __btrfs_start_workers(workers);
}
/*
@@ -575,9 +568,10 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
struct btrfs_worker_thread *worker;
unsigned long flags;
struct list_head *fallback;
+ int ret;
-again:
spin_lock_irqsave(&workers->lock, flags);
+again:
worker = next_worker(workers);
if (!worker) {
@@ -591,7 +585,10 @@ again:
workers->num_workers_starting++;
spin_unlock_irqrestore(&workers->lock, flags);
/* we're below the limit, start another worker */
- __btrfs_start_workers(workers, 1);
+ ret = __btrfs_start_workers(workers);
+ spin_lock_irqsave(&workers->lock, flags);
+ if (ret)
+ goto fallback;
goto again;
}
}
@@ -672,7 +669,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
/*
* places a struct btrfs_work into the pending queue of one of the kthreads
*/
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
{
struct btrfs_worker_thread *worker;
unsigned long flags;
@@ -680,7 +677,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
/* don't requeue something already on a list */
if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- goto out;
+ return;
worker = find_worker(workers);
if (workers->ordered) {
@@ -719,7 +716,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
if (wake)
wake_up_process(worker->task);
spin_unlock_irqrestore(&worker->lock, flags);
-
-out:
- return 0;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746..f34cc31 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -109,8 +109,8 @@ struct btrfs_workers {
char *name;
};
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers);
int btrfs_stop_workers(struct btrfs_workers *workers);
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
struct btrfs_workers *async_starter);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 52d7eca..634608d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -34,6 +34,9 @@ struct btrfs_inode {
*/
struct btrfs_key location;
+ /* Lock for counters */
+ spinlock_t lock;
+
/* the extent_tree has caches of all the extent mappings to disk */
struct extent_map_tree extent_tree;
@@ -100,11 +103,6 @@ struct btrfs_inode {
*/
u64 delalloc_bytes;
- /* total number of bytes that may be used for this inode for
- * delalloc
- */
- u64 reserved_bytes;
-
/*
* the size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
@@ -112,9 +110,6 @@ struct btrfs_inode {
*/
u64 disk_i_size;
- /* flags field from the on disk inode */
- u32 flags;
-
/*
* if this is a directory then index_cnt is the counter for the index
* number for new files that are created
@@ -129,13 +124,22 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * Number of bytes outstanding that are going to need csums. This is
+ * used in ENOSPC accounting.
+ */
+ u64 csum_bytes;
+
+ /* flags field from the on disk inode */
+ u32 flags;
+
+ /*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
* items we think we'll end up using, and reserved_extents is the number
* of extent items we've reserved metadata for.
*/
- atomic_t outstanding_extents;
- atomic_t reserved_extents;
+ unsigned outstanding_extents;
+ unsigned reserved_extents;
/*
* ordered_data_close is set by truncate when a file that used
@@ -143,14 +147,12 @@ struct btrfs_inode {
* the btrfs file release call will add this inode to the
* ordered operations list so that we make sure to flush out any
* new data the application may have written before commit.
- *
- * yes, its silly to have a single bitflag, but we might grow more
- * of these.
*/
unsigned ordered_data_close:1;
unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1;
unsigned in_defrag:1;
+ unsigned delalloc_meta_reserved:1;
/*
* always compress this one file
@@ -173,7 +175,11 @@ static inline u64 btrfs_ino(struct inode *inode)
{
u64 ino = BTRFS_I(inode)->location.objectid;
- if (ino <= BTRFS_FIRST_FREE_OBJECTID)
+ /*
+ * !ino: btree_inode
+ * type == BTRFS_ROOT_ITEM_KEY: subvol dir
+ */
+ if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
ino = inode->i_ino;
return ino;
}
@@ -184,4 +190,13 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
BTRFS_I(inode)->disk_i_size = size;
}
+static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
+ struct inode *inode)
+{
+ if (root == root->fs_info->tree_root ||
+ BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+ return true;
+ return false;
+}
+
#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index bfe42b0..14f1c5a 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
static inline int compressed_bio_size(struct btrfs_root *root,
unsigned long disk_size)
{
- u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
return sizeof(struct compressed_bio) +
((disk_size + root->sectorsize - 1) / root->sectorsize) *
csum_size;
@@ -338,6 +339,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
u64 first_byte = disk_start;
struct block_device *bdev;
int ret;
+ int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
@@ -392,8 +394,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
BUG_ON(ret);
- ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
- BUG_ON(ret);
+ if (!skip_sum) {
+ ret = btrfs_csum_one_bio(root, inode, bio,
+ start, 1);
+ BUG_ON(ret);
+ }
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
BUG_ON(ret);
@@ -418,8 +423,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
BUG_ON(ret);
- ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
- BUG_ON(ret);
+ if (!skip_sum) {
+ ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+ BUG_ON(ret);
+ }
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
BUG_ON(ret);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2e66786..dede441 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
int i;
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- if (p->nodes[i] && p->locks[i])
- btrfs_set_lock_blocking(p->nodes[i]);
+ if (!p->nodes[i] || !p->locks[i])
+ continue;
+ btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
+ if (p->locks[i] == BTRFS_READ_LOCK)
+ p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+ else if (p->locks[i] == BTRFS_WRITE_LOCK)
+ p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
}
}
@@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
* for held
*/
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held)
+ struct extent_buffer *held, int held_rw)
{
int i;
@@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
* really sure by forcing the path to blocking before we clear
* the path blocking.
*/
- if (held)
- btrfs_set_lock_blocking(held);
+ if (held) {
+ btrfs_set_lock_blocking_rw(held, held_rw);
+ if (held_rw == BTRFS_WRITE_LOCK)
+ held_rw = BTRFS_WRITE_LOCK_BLOCKING;
+ else if (held_rw == BTRFS_READ_LOCK)
+ held_rw = BTRFS_READ_LOCK_BLOCKING;
+ }
btrfs_set_path_blocking(p);
#endif
for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
- if (p->nodes[i] && p->locks[i])
- btrfs_clear_lock_blocking(p->nodes[i]);
+ if (p->nodes[i] && p->locks[i]) {
+ btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
+ if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
+ p->locks[i] = BTRFS_WRITE_LOCK;
+ else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
+ p->locks[i] = BTRFS_READ_LOCK;
+ }
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (held)
- btrfs_clear_lock_blocking(held);
+ btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
@@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p)
if (!p->nodes[i])
continue;
if (p->locks[i]) {
- btrfs_tree_unlock(p->nodes[i]);
+ btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
p->locks[i] = 0;
}
free_extent_buffer(p->nodes[i]);
@@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
return eb;
}
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root. A locked buffer
+ * is returned, with a reference held.
+ */
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ btrfs_tree_read_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
/* cowonly root (everything not a reference counted cow subvolume), just get
* put onto a simple dirty list. transaction.c walks this to make sure they
* get properly updated on disk.
@@ -480,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
+ /* ensure we can see the force_cow */
+ smp_rmb();
+
+ /*
+ * We do not need to cow a block if
+ * 1) this block is not created or changed in this transaction;
+ * 2) this block does not belong to TREE_RELOC tree;
+ * 3) the root is not forced COW.
+ *
+ * What is forced COW:
+ * when we create snapshot during commiting the transaction,
+ * after we've finished coping src root, we must COW the shared
+ * block to ensure the metadata consistency.
+ */
if (btrfs_header_generation(buf) == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
!(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+ !root->force_cow)
return 0;
return 1;
}
@@ -626,14 +675,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
for (i = start_slot; i < end_slot; i++) {
int close = 1;
- if (!parent->map_token) {
- map_extent_buffer(parent,
- btrfs_node_key_ptr_offset(i),
- sizeof(struct btrfs_key_ptr),
- &parent->map_token, &parent->kaddr,
- &parent->map_start, &parent->map_len,
- KM_USER1);
- }
btrfs_node_key(parent, &disk_key, i);
if (!progress_passed && comp_keys(&disk_key, progress) < 0)
continue;
@@ -656,11 +697,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
last_block = blocknr;
continue;
}
- if (parent->map_token) {
- unmap_extent_buffer(parent, parent->map_token,
- KM_USER1);
- parent->map_token = NULL;
- }
cur = btrfs_find_tree_block(root, blocknr, blocksize);
if (cur)
@@ -701,11 +737,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
}
- if (parent->map_token) {
- unmap_extent_buffer(parent, parent->map_token,
- KM_USER1);
- parent->map_token = NULL;
- }
return err;
}
@@ -746,7 +777,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
struct btrfs_disk_key *tmp = NULL;
struct btrfs_disk_key unaligned;
unsigned long offset;
- char *map_token = NULL;
char *kaddr = NULL;
unsigned long map_start = 0;
unsigned long map_len = 0;
@@ -756,18 +786,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
mid = (low + high) / 2;
offset = p + mid * item_size;
- if (!map_token || offset < map_start ||
+ if (!kaddr || offset < map_start ||
(offset + sizeof(struct btrfs_disk_key)) >
map_start + map_len) {
- if (map_token) {
- unmap_extent_buffer(eb, map_token, KM_USER0);
- map_token = NULL;
- }
err = map_private_extent_buffer(eb, offset,
sizeof(struct btrfs_disk_key),
- &map_token, &kaddr,
- &map_start, &map_len, KM_USER0);
+ &kaddr, &map_start, &map_len);
if (!err) {
tmp = (struct btrfs_disk_key *)(kaddr + offset -
@@ -790,14 +815,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
high = mid;
else {
*slot = mid;
- if (map_token)
- unmap_extent_buffer(eb, map_token, KM_USER0);
return 0;
}
}
*slot = low;
- if (map_token)
- unmap_extent_buffer(eb, map_token, KM_USER0);
return 1;
}
@@ -890,14 +911,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
mid = path->nodes[level];
- WARN_ON(!path->locks[level]);
+ WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
+ path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
WARN_ON(btrfs_header_generation(mid) != trans->transid);
orig_ptr = btrfs_node_blockptr(mid, orig_slot);
- if (level < BTRFS_MAX_LEVEL - 1)
+ if (level < BTRFS_MAX_LEVEL - 1) {
parent = path->nodes[level + 1];
- pslot = path->slots[level + 1];
+ pslot = path->slots[level + 1];
+ }
/*
* deal with the case where there is only one pointer in the root
@@ -1100,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
mid = path->nodes[level];
WARN_ON(btrfs_header_generation(mid) != trans->transid);
- if (level < BTRFS_MAX_LEVEL - 1)
+ if (level < BTRFS_MAX_LEVEL - 1) {
parent = path->nodes[level + 1];
- pslot = path->slots[level + 1];
+ pslot = path->slots[level + 1];
+ }
if (!parent)
return 1;
@@ -1228,7 +1252,6 @@ static void reada_for_search(struct btrfs_root *root,
u32 nr;
u32 blocksize;
u32 nscan = 0;
- bool map = true;
if (level != 1)
return;
@@ -1250,19 +1273,8 @@ static void reada_for_search(struct btrfs_root *root,
nritems = btrfs_header_nritems(node);
nr = slot;
- if (node->map_token || path->skip_locking)
- map = false;
while (1) {
- if (map && !node->map_token) {
- unsigned long offset = btrfs_node_key_ptr_offset(nr);
- map_private_extent_buffer(node, offset,
- sizeof(struct btrfs_key_ptr),
- &node->map_token,
- &node->kaddr,
- &node->map_start,
- &node->map_len, KM_USER1);
- }
if (direction < 0) {
if (nr == 0)
break;
@@ -1281,11 +1293,6 @@ static void reada_for_search(struct btrfs_root *root,
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
gen = btrfs_node_ptr_generation(node, nr);
- if (map && node->map_token) {
- unmap_extent_buffer(node, node->map_token,
- KM_USER1);
- node->map_token = NULL;
- }
readahead_tree_block(root, search, blocksize, gen);
nread += blocksize;
}
@@ -1293,10 +1300,6 @@ static void reada_for_search(struct btrfs_root *root,
if ((nread > 65536 || nscan > 32))
break;
}
- if (map && node->map_token) {
- unmap_extent_buffer(node, node->map_token, KM_USER1);
- node->map_token = NULL;
- }
}
/*
@@ -1409,7 +1412,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
t = path->nodes[i];
if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
- btrfs_tree_unlock(t);
+ btrfs_tree_unlock_rw(t, path->locks[i]);
path->locks[i] = 0;
}
}
@@ -1436,7 +1439,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
continue;
if (!path->locks[i])
continue;
- btrfs_tree_unlock(path->nodes[i]);
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
path->locks[i] = 0;
}
}
@@ -1485,6 +1488,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
* we can trust our generation number
*/
free_extent_buffer(tmp);
+ btrfs_set_path_blocking(p);
+
tmp = read_tree_block(root, blocknr, blocksize, gen);
if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
*eb_ret = tmp;
@@ -1540,20 +1545,27 @@ read_block_for_search(struct btrfs_trans_handle *trans,
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *p,
- struct extent_buffer *b, int level, int ins_len)
+ struct extent_buffer *b, int level, int ins_len,
+ int *write_lock_level)
{
int ret;
if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
int sret;
+ if (*write_lock_level < level + 1) {
+ *write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
sret = reada_for_balance(root, p, level);
if (sret)
goto again;
btrfs_set_path_blocking(p);
sret = split_node(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL);
+ btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(sret > 0);
if (sret) {
@@ -1565,13 +1577,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
int sret;
+ if (*write_lock_level < level + 1) {
+ *write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
sret = reada_for_balance(root, p, level);
if (sret)
goto again;
btrfs_set_path_blocking(p);
sret = balance_level(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL);
+ btrfs_clear_path_blocking(p, NULL, 0);
if (sret) {
ret = sret;
@@ -1615,27 +1633,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
int err;
int level;
int lowest_unlock = 1;
+ int root_lock;
+ /* everything at write_lock_level or lower must be write locked */
+ int write_lock_level = 0;
u8 lowest_level = 0;
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
WARN_ON(p->nodes[0] != NULL);
- if (ins_len < 0)
+ if (ins_len < 0) {
lowest_unlock = 2;
+ /* when we are removing items, we might have to go up to level
+ * two as we update tree pointers Make sure we keep write
+ * for those levels as well
+ */
+ write_lock_level = 2;
+ } else if (ins_len > 0) {
+ /*
+ * for inserting items, make sure we have a write lock on
+ * level 1 so we can update keys
+ */
+ write_lock_level = 1;
+ }
+
+ if (!cow)
+ write_lock_level = -1;
+
+ if (cow && (p->keep_locks || p->lowest_level))
+ write_lock_level = BTRFS_MAX_LEVEL;
+
again:
+ /*
+ * we try very hard to do read locks on the root
+ */
+ root_lock = BTRFS_READ_LOCK;
+ level = 0;
if (p->search_commit_root) {
+ /*
+ * the commit roots are read only
+ * so we always do read locks
+ */
b = root->commit_root;
extent_buffer_get(b);
+ level = btrfs_header_level(b);
if (!p->skip_locking)
- btrfs_tree_lock(b);
+ btrfs_tree_read_lock(b);
} else {
- if (p->skip_locking)
+ if (p->skip_locking) {
b = btrfs_root_node(root);
- else
- b = btrfs_lock_root_node(root);
+ level = btrfs_header_level(b);
+ } else {
+ /* we don't know the level of the root node
+ * until we actually have it read locked
+ */
+ b = btrfs_read_lock_root_node(root);
+ level = btrfs_header_level(b);
+ if (level <= write_lock_level) {
+ /* whoops, must trade for write lock */
+ btrfs_tree_read_unlock(b);
+ free_extent_buffer(b);
+ b = btrfs_lock_root_node(root);
+ root_lock = BTRFS_WRITE_LOCK;
+
+ /* the level might have changed, check again */
+ level = btrfs_header_level(b);
+ }
+ }
}
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = root_lock;
while (b) {
level = btrfs_header_level(b);
@@ -1644,10 +1713,6 @@ again:
* setup the path here so we can release it under lock
* contention with the cow code
*/
- p->nodes[level] = b;
- if (!p->skip_locking)
- p->locks[level] = 1;
-
if (cow) {
/*
* if we don't really need to cow this block
@@ -1659,6 +1724,16 @@ again:
btrfs_set_path_blocking(p);
+ /*
+ * must have write locks on this node and the
+ * parent
+ */
+ if (level + 1 > write_lock_level) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
p->slots[level + 1], &b);
@@ -1671,10 +1746,7 @@ cow_done:
BUG_ON(!cow && ins_len);
p->nodes[level] = b;
- if (!p->skip_locking)
- p->locks[level] = 1;
-
- btrfs_clear_path_blocking(p, NULL);
+ btrfs_clear_path_blocking(p, NULL, 0);
/*
* we have a lock on b and as long as we aren't changing
@@ -1700,7 +1772,7 @@ cow_done:
}
p->slots[level] = slot;
err = setup_nodes_for_search(trans, root, p, b, level,
- ins_len);
+ ins_len, &write_lock_level);
if (err == -EAGAIN)
goto again;
if (err) {
@@ -1710,6 +1782,19 @@ cow_done:
b = p->nodes[level];
slot = p->slots[level];
+ /*
+ * slot 0 is special, if we change the key
+ * we have to update the parent pointer
+ * which means we must have a write lock
+ * on the parent
+ */
+ if (slot == 0 && cow &&
+ write_lock_level < level + 1) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
unlock_up(p, level, lowest_unlock);
if (level == lowest_level) {
@@ -1728,23 +1813,42 @@ cow_done:
}
if (!p->skip_locking) {
- btrfs_clear_path_blocking(p, NULL);
- err = btrfs_try_spin_lock(b);
-
- if (!err) {
- btrfs_set_path_blocking(p);
- btrfs_tree_lock(b);
- btrfs_clear_path_blocking(p, b);
+ level = btrfs_header_level(b);
+ if (level <= write_lock_level) {
+ err = btrfs_try_tree_write_lock(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_WRITE_LOCK);
+ }
+ p->locks[level] = BTRFS_WRITE_LOCK;
+ } else {
+ err = btrfs_try_tree_read_lock(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_READ_LOCK);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
}
+ p->nodes[level] = b;
}
} else {
p->slots[level] = slot;
if (ins_len > 0 &&
btrfs_leaf_free_space(root, b) < ins_len) {
+ if (write_lock_level < 1) {
+ write_lock_level = 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
btrfs_set_path_blocking(p);
err = split_leaf(trans, root, key,
p, ins_len, ret == 0);
- btrfs_clear_path_blocking(p, NULL);
+ btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(err > 0);
if (err) {
@@ -2025,7 +2129,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
extent_buffer_get(c);
path->nodes[level] = c;
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
return 0;
}
@@ -2253,14 +2357,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (path->slots[0] == i)
push_space += data_size;
- if (!left->map_token) {
- map_extent_buffer(left, (unsigned long)item,
- sizeof(struct btrfs_item),
- &left->map_token, &left->kaddr,
- &left->map_start, &left->map_len,
- KM_USER1);
- }
-
this_item_size = btrfs_item_size(left, item);
if (this_item_size + sizeof(*item) + push_space > free_space)
break;
@@ -2271,10 +2367,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
break;
i--;
}
- if (left->map_token) {
- unmap_extent_buffer(left, left->map_token, KM_USER1);
- left->map_token = NULL;
- }
if (push_items == 0)
goto out_unlock;
@@ -2316,21 +2408,10 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
push_space = BTRFS_LEAF_DATA_SIZE(root);
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(right, i);
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
push_space -= btrfs_item_size(right, item);
btrfs_set_item_offset(right, item, push_space);
}
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
- }
left_nritems -= push_items;
btrfs_set_header_nritems(left, left_nritems);
@@ -2467,13 +2548,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
for (i = 0; i < nr; i++) {
item = btrfs_item_nr(right, i);
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
if (!empty && push_items > 0) {
if (path->slots[0] < i)
@@ -2496,11 +2570,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
push_space += this_item_size + sizeof(*item);
}
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
- }
-
if (push_items == 0) {
ret = 1;
goto out;
@@ -2530,23 +2599,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(left, i);
- if (!left->map_token) {
- map_extent_buffer(left, (unsigned long)item,
- sizeof(struct btrfs_item),
- &left->map_token, &left->kaddr,
- &left->map_start, &left->map_len,
- KM_USER1);
- }
ioff = btrfs_item_offset(left, item);
btrfs_set_item_offset(left, item,
ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
- if (left->map_token) {
- unmap_extent_buffer(left, left->map_token, KM_USER1);
- left->map_token = NULL;
- }
/* fixup right node */
if (push_items > right_nritems) {
@@ -2574,21 +2632,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(right, i);
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
-
push_space = push_space - btrfs_item_size(right, item);
btrfs_set_item_offset(right, item, push_space);
}
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
- }
btrfs_mark_buffer_dirty(left);
if (right_nritems)
@@ -2729,23 +2775,10 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
struct btrfs_item *item = btrfs_item_nr(right, i);
u32 ioff;
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
-
ioff = btrfs_item_offset(right, item);
btrfs_set_item_offset(right, item, ioff + rt_data_off);
}
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
- }
-
btrfs_set_header_nritems(l, mid);
ret = 0;
btrfs_item_key(right, &disk_key, 0);
@@ -3264,23 +3297,10 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- if (!leaf->map_token) {
- map_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
- }
-
ioff = btrfs_item_offset(leaf, item);
btrfs_set_item_offset(leaf, item, ioff + size_diff);
}
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
- }
-
/* shift the data */
if (from_end) {
memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
@@ -3377,22 +3397,10 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- if (!leaf->map_token) {
- map_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
- }
ioff = btrfs_item_offset(leaf, item);
btrfs_set_item_offset(leaf, item, ioff - data_size);
}
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
- }
-
/* shift the data */
memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
data_end - data_size, btrfs_leaf_data(leaf) +
@@ -3494,27 +3502,13 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- WARN_ON(leaf->map_token);
for (i = slot; i < nritems; i++) {
u32 ioff;
item = btrfs_item_nr(leaf, i);
- if (!leaf->map_token) {
- map_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
- }
-
ioff = btrfs_item_offset(leaf, item);
btrfs_set_item_offset(leaf, item, ioff - total_data);
}
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
- }
-
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
btrfs_item_nr_offset(slot),
@@ -3608,27 +3602,13 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- WARN_ON(leaf->map_token);
for (i = slot; i < nritems; i++) {
u32 ioff;
item = btrfs_item_nr(leaf, i);
- if (!leaf->map_token) {
- map_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
- }
-
ioff = btrfs_item_offset(leaf, item);
btrfs_set_item_offset(leaf, item, ioff - total_data);
}
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
- }
-
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
btrfs_item_nr_offset(slot),
@@ -3840,22 +3820,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- if (!leaf->map_token) {
- map_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
- }
ioff = btrfs_item_offset(leaf, item);
btrfs_set_item_offset(leaf, item, ioff + dsize);
}
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
- }
-
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
btrfs_item_nr_offset(slot + nr),
sizeof(struct btrfs_item) *
@@ -4004,11 +3972,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
WARN_ON(!path->keep_locks);
again:
- cur = btrfs_lock_root_node(root);
+ cur = btrfs_read_lock_root_node(root);
level = btrfs_header_level(cur);
WARN_ON(path->nodes[level]);
path->nodes[level] = cur;
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_READ_LOCK;
if (btrfs_header_generation(cur) < min_trans) {
ret = 1;
@@ -4098,12 +4066,12 @@ find_next_key:
cur = read_node_slot(root, cur, slot);
BUG_ON(!cur);
- btrfs_tree_lock(cur);
+ btrfs_tree_read_lock(cur);
- path->locks[level - 1] = 1;
+ path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
unlock_up(path, level, 1);
- btrfs_clear_path_blocking(path, NULL);
+ btrfs_clear_path_blocking(path, NULL, 0);
}
out:
if (ret == 0)
@@ -4218,30 +4186,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
u32 nritems;
int ret;
int old_spinning = path->leave_spinning;
- int force_blocking = 0;
+ int next_rw_lock = 0;
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
return 1;
- /*
- * we take the blocks in an order that upsets lockdep. Using
- * blocking mode is the only way around it.
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- force_blocking = 1;
-#endif
-
btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
level = 1;
next = NULL;
+ next_rw_lock = 0;
btrfs_release_path(path);
path->keep_locks = 1;
-
- if (!force_blocking)
- path->leave_spinning = 1;
+ path->leave_spinning = 1;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->keep_locks = 0;
@@ -4281,11 +4240,12 @@ again:
}
if (next) {
- btrfs_tree_unlock(next);
+ btrfs_tree_unlock_rw(next, next_rw_lock);
free_extent_buffer(next);
}
next = c;
+ next_rw_lock = path->locks[level];
ret = read_block_for_search(NULL, root, path, &next, level,
slot, &key);
if (ret == -EAGAIN)
@@ -4297,15 +4257,14 @@ again:
}
if (!path->skip_locking) {
- ret = btrfs_try_spin_lock(next);
+ ret = btrfs_try_tree_read_lock(next);
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_lock(next);
- if (!force_blocking)
- btrfs_clear_path_blocking(path, next);
+ btrfs_tree_read_lock(next);
+ btrfs_clear_path_blocking(path, next,
+ BTRFS_READ_LOCK);
}
- if (force_blocking)
- btrfs_set_lock_blocking(next);
+ next_rw_lock = BTRFS_READ_LOCK;
}
break;
}
@@ -4314,14 +4273,13 @@ again:
level--;
c = path->nodes[level];
if (path->locks[level])
- btrfs_tree_unlock(c);
+ btrfs_tree_unlock_rw(c, path->locks[level]);
free_extent_buffer(c);
path->nodes[level] = next;
path->slots[level] = 0;
if (!path->skip_locking)
- path->locks[level] = 1;
-
+ path->locks[level] = next_rw_lock;
if (!level)
break;
@@ -4336,16 +4294,14 @@ again:
}
if (!path->skip_locking) {
- btrfs_assert_tree_locked(path->nodes[level]);
- ret = btrfs_try_spin_lock(next);
+ ret = btrfs_try_tree_read_lock(next);
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_lock(next);
- if (!force_blocking)
- btrfs_clear_path_blocking(path, next);
+ btrfs_tree_read_lock(next);
+ btrfs_clear_path_blocking(path, next,
+ BTRFS_READ_LOCK);
}
- if (force_blocking)
- btrfs_set_lock_blocking(next);
+ next_rw_lock = BTRFS_READ_LOCK;
}
}
ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 66179bc..83a871f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
#include <linux/kobject.h>
#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
+#include <linux/pagemap.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
#define BTRFS_LABEL_SIZE 256
/*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+ __le64 tree_root;
+ __le64 tree_root_gen;
+
+ __le64 chunk_root;
+ __le64 chunk_root_gen;
+
+ __le64 extent_root;
+ __le64 extent_root_gen;
+
+ __le64 fs_root;
+ __le64 fs_root_gen;
+
+ __le64 dev_root;
+ __le64 dev_root_gen;
+
+ __le64 csum_root;
+ __le64 csum_root_gen;
+
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 num_devices;
+ /* future */
+ __le64 unsed_64[4];
+
+ u8 tree_root_level;
+ u8 chunk_root_level;
+ u8 extent_root_level;
+ u8 fs_root_level;
+ u8 dev_root_level;
+ u8 csum_root_level;
+ /* future and to align */
+ u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
*/
@@ -405,6 +447,7 @@ struct btrfs_super_block {
/* future expansion */
__le64 reserved[31];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+ struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
} __attribute__ ((__packed__));
/*
@@ -755,6 +798,8 @@ struct btrfs_space_info {
chunks for this space */
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
+ unsigned int flush:1; /* set if we are trying to make space */
+
unsigned int force_alloc; /* set if we need to force a chunk
alloc for this space */
@@ -764,20 +809,14 @@ struct btrfs_space_info {
struct list_head block_groups[BTRFS_NR_RAID_TYPES];
spinlock_t lock;
struct rw_semaphore groups_sem;
- atomic_t caching_threads;
+ wait_queue_head_t wait;
};
struct btrfs_block_rsv {
u64 size;
u64 reserved;
- u64 freed[2];
struct btrfs_space_info *space_info;
- struct list_head list;
spinlock_t lock;
- atomic_t usage;
- unsigned int priority:8;
- unsigned int durable:1;
- unsigned int refill_used:1;
unsigned int full:1;
};
@@ -809,7 +848,8 @@ struct btrfs_free_cluster {
enum btrfs_caching_type {
BTRFS_CACHE_NO = 0,
BTRFS_CACHE_STARTED = 1,
- BTRFS_CACHE_FINISHED = 2,
+ BTRFS_CACHE_FAST = 2,
+ BTRFS_CACHE_FINISHED = 3,
};
enum btrfs_disk_cache_state {
@@ -824,6 +864,7 @@ struct btrfs_caching_control {
struct list_head list;
struct mutex mutex;
wait_queue_head_t wait;
+ struct btrfs_work work;
struct btrfs_block_group_cache *block_group;
u64 progress;
atomic_t count;
@@ -837,10 +878,10 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
- u64 reserved_pinned;
u64 bytes_super;
u64 flags;
u64 sectorsize;
+ u64 cache_generation;
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
@@ -896,6 +937,10 @@ struct btrfs_fs_info {
spinlock_t block_group_cache_lock;
struct rb_root block_group_cache_tree;
+ /* keep track of unallocated space */
+ spinlock_t free_chunk_lock;
+ u64 free_chunk_space;
+
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
@@ -913,14 +958,11 @@ struct btrfs_fs_info {
struct btrfs_block_rsv trans_block_rsv;
/* block reservation for chunk tree */
struct btrfs_block_rsv chunk_block_rsv;
+ /* block reservation for delayed operations */
+ struct btrfs_block_rsv delayed_block_rsv;
struct btrfs_block_rsv empty_block_rsv;
- /* list of block reservations that cross multiple transactions */
- struct list_head durable_block_rsv_list;
-
- struct mutex durable_block_rsv_mutex;
-
u64 generation;
u64 last_trans_committed;
@@ -939,8 +981,8 @@ struct btrfs_fs_info {
wait_queue_head_t transaction_blocked_wait;
wait_queue_head_t async_submit_wait;
- struct btrfs_super_block super_copy;
- struct btrfs_super_block super_for_commit;
+ struct btrfs_super_block *super_copy;
+ struct btrfs_super_block *super_for_commit;
struct block_device *__bdev;
struct super_block *sb;
struct inode *btree_inode;
@@ -1032,6 +1074,9 @@ struct btrfs_fs_info {
struct btrfs_workers endio_write_workers;
struct btrfs_workers endio_freespace_worker;
struct btrfs_workers submit_workers;
+ struct btrfs_workers caching_workers;
+ struct btrfs_workers readahead_workers;
+
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
@@ -1114,6 +1159,13 @@ struct btrfs_fs_info {
u64 fs_state;
struct btrfs_delayed_root *delayed_root;
+
+ /* readahead tree */
+ spinlock_t reada_lock;
+ struct radix_tree_root reada_tree;
+
+ /* next backup root to be overwritten */
+ int backup_root_index;
};
/*
@@ -1219,7 +1271,9 @@ struct btrfs_root {
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
*/
- struct super_block anon_super;
+ dev_t anon_dev;
+
+ int force_cow;
};
struct btrfs_ioctl_defrag_range_args {
@@ -1358,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
+#define BTRFS_MOUNT_RECOVERY (1 << 18)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1410,17 +1465,15 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(struct extent_buffer *eb) \
{ \
- type *p = kmap_atomic(eb->first_page, KM_USER0); \
+ type *p = page_address(eb->first_page); \
u##bits res = le##bits##_to_cpu(p->member); \
- kunmap_atomic(p, KM_USER0); \
return res; \
} \
static inline void btrfs_set_##name(struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = kmap_atomic(eb->first_page, KM_USER0); \
+ type *p = page_address(eb->first_page); \
p->member = cpu_to_le##bits(val); \
- kunmap_atomic(p, KM_USER0); \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
@@ -1975,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+ tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+ tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+ tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+ chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+ chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+ extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+ fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+ fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+ fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+ dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+ dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+ dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+ csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+ csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+ csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+ num_devices, 64);
+
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2126,14 +2228,30 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+ return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
/* extent-tree.c */
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
- int num_items)
+ unsigned num_items)
{
return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3 * num_items;
}
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+ unsigned num_items)
+{
+ return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ num_items;
+}
+
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
@@ -2143,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr);
@@ -2193,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int sinfo);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len);
int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2222,9 +2343,6 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
-int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- int num_items);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
+int btrfs_block_rsv_check(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int min_factor);
+ u64 min_reserved);
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *rsv);
int btrfs_set_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2330,7 +2449,7 @@ struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
void btrfs_set_path_blocking(struct btrfs_path *p);
void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held);
+ struct extent_buffer *held, int held_rw);
void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2365,8 +2484,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref);
+void btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
smp_mb();
return fs_info->closing;
}
+static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+{
+ kfree(fs_info->delayed_root);
+ kfree(fs_info->extent_root);
+ kfree(fs_info->tree_root);
+ kfree(fs_info->chunk_root);
+ kfree(fs_info->dev_root);
+ kfree(fs_info->csum_root);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kfree(fs_info);
+}
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2404,8 +2535,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
btrfs_root_item *item, struct btrfs_key *key);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
-int btrfs_set_root_node(struct btrfs_root_item *item,
- struct extent_buffer *node);
+void btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node);
void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
/* dir-item.c */
@@ -2510,6 +2641,9 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
/* inode.c */
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create);
/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -2518,6 +2652,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
#define PageChecked PageFsMisc
#endif
+/* This forces readahead on a given range of bytes in an inode */
+static inline void btrfs_force_ra(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *file,
+ pgoff_t offset, unsigned long req_size)
+{
+ page_cache_sync_readahead(mapping, ra, file, offset, req_size);
+}
+
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -2546,14 +2688,12 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio, unsigned long bio_flags);
-unsigned long btrfs_force_ra(struct address_space *mapping,
- struct file_ra_state *ra, struct file *file,
- pgoff_t offset, pgoff_t last_index);
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode, int flags);
+int btrfs_dirty_inode(struct inode *inode);
+int btrfs_update_time(struct file *file);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
@@ -2571,11 +2711,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
- u64 *bytes_to_reserve);
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2602,7 +2737,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct inode *inode);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
-int btrfs_sync_file(struct file *file, int datasync);
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
extern const struct file_operations btrfs_file_operations;
@@ -2642,13 +2777,22 @@ do { \
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
-#else
-#define btrfs_check_acl NULL
-#endif
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir);
int btrfs_acl_chmod(struct inode *inode);
+#else
+#define btrfs_get_acl NULL
+static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
+{
+ return 0;
+}
+static inline int btrfs_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+#endif
/* relocation.c */
int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
@@ -2680,4 +2824,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
+/* reada.c */
+struct reada_control {
+ struct btrfs_root *root; /* tree to prefetch */
+ struct btrfs_key key_start;
+ struct btrfs_key key_end; /* exclusive */
+ atomic_t elems;
+ struct kref refcnt;
+ wait_queue_head_t wait;
+};
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+ struct btrfs_key *start, struct btrfs_key *end);
+int btrfs_reada_wait(void *handle);
+void btrfs_reada_detach(void *handle);
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err);
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 98c68e6..9c1eccc 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
return 0;
src_rsv = trans->block_rsv;
- dst_rsv = &root->fs_info->global_block_rsv;
+ dst_rsv = &root->fs_info->delayed_block_rsv;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
if (!item->bytes_reserved)
return;
- rsv = &root->fs_info->global_block_rsv;
+ rsv = &root->fs_info->delayed_block_rsv;
btrfs_block_rsv_release(root, rsv,
item->bytes_reserved);
}
@@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct inode *inode,
struct btrfs_delayed_node *node)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
u64 num_bytes;
int ret;
-
- if (!trans->bytes_reserved)
- return 0;
+ int release = false;
src_rsv = trans->block_rsv;
- dst_rsv = &root->fs_info->global_block_rsv;
+ dst_rsv = &root->fs_info->delayed_block_rsv;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+ /*
+ * btrfs_dirty_inode will update the inode under btrfs_join_transaction
+ * which doesn't reserve space for speed. This is a problem since we
+ * still need to reserve space for this update, so try to reserve the
+ * space.
+ *
+ * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
+ * we're accounted for.
+ */
+ if (!src_rsv || (!trans->bytes_reserved &&
+ src_rsv != &root->fs_info->delalloc_block_rsv)) {
+ ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ /*
+ * Since we're under a transaction reserve_metadata_bytes could
+ * try to commit the transaction which will make it return
+ * EAGAIN to make us stop the transaction we have, so return
+ * ENOSPC instead so that btrfs_dirty_inode knows what to do.
+ */
+ if (ret == -EAGAIN)
+ ret = -ENOSPC;
+ if (!ret)
+ node->bytes_reserved = num_bytes;
+ return ret;
+ } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (BTRFS_I(inode)->delalloc_meta_reserved) {
+ BTRFS_I(inode)->delalloc_meta_reserved = 0;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ release = true;
+ goto migrate;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ /* Ok we didn't have space pre-reserved. This shouldn't happen
+ * too often but it can happen if we do delalloc to an existing
+ * inode which gets dirtied because of the time update, and then
+ * isn't touched again until after the transaction commits and
+ * then we try to write out the data. First try to be nice and
+ * reserve something strictly for us. If not be a pain and try
+ * to steal from the delalloc block rsv.
+ */
+ ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ if (!ret)
+ goto out;
+
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ if (!ret)
+ goto out;
+
+ /*
+ * Ok this is a problem, let's just steal from the global rsv
+ * since this really shouldn't happen that often.
+ */
+ WARN_ON(1);
+ ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
+ dst_rsv, num_bytes);
+ goto out;
+ }
+
+migrate:
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+
+out:
+ /*
+ * Migrate only takes a reservation, it doesn't touch the size of the
+ * block_rsv. This is to simplify people who don't normally have things
+ * migrated from their block rsv. If they go to release their
+ * reservation, that will decrease the size as well, so if migrate
+ * reduced size we'd end up with a negative size. But for the
+ * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
+ * but we could in fact do this reserve/migrate dance several times
+ * between the time we did the original reservation and we'd clean it
+ * up. So to take care of this, release the space for the meta
+ * reservation here. I think it may be time for a documentation page on
+ * how block rsvs. work.
+ */
if (!ret)
node->bytes_reserved = num_bytes;
+ if (release)
+ btrfs_block_rsv_release(root, src_rsv, num_bytes);
+
return ret;
}
@@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
if (!node->bytes_reserved)
return;
- rsv = &root->fs_info->global_block_rsv;
+ rsv = &root->fs_info->delayed_block_rsv;
btrfs_block_rsv_release(root, rsv,
node->bytes_reserved);
node->bytes_reserved = 0;
@@ -735,7 +813,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
}
/* reset all the locked nodes in the patch to spinning locks. */
- btrfs_clear_path_blocking(path, NULL);
+ btrfs_clear_path_blocking(path, NULL, 0);
/* insert the keys of the items */
ret = setup_items_for_insert(trans, root, path, keys, data_size,
@@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
- trans->block_rsv = &root->fs_info->global_block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
delayed_root = btrfs_get_delayed_root(root);
@@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
- trans->block_rsv = &node->root->fs_info->global_block_rsv;
+ trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
ret = btrfs_insert_delayed_items(trans, path, node->root, node);
if (!ret)
@@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
goto free_path;
block_rsv = trans->block_rsv;
- trans->block_rsv = &root->fs_info->global_block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
if (!ret)
@@ -1641,7 +1719,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode->i_gid = btrfs_stack_inode_gid(inode_item);
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
inode->i_mode = btrfs_stack_inode_mode(inode_item);
- inode->i_nlink = btrfs_stack_inode_nlink(inode_item);
+ set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
@@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
goto release_node;
}
- ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
- /*
- * we must reserve enough space when we start a new transaction,
- * so reserving metadata failure is impossible
- */
- BUG_ON(ret);
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
+ delayed_node);
+ if (ret)
+ goto release_node;
fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
delayed_node->inode_dirty = 1;
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 8d27af4..7083d08 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -25,7 +25,7 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/wait.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include "ctree.h"
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 685f259..31d84e7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -89,13 +89,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
data_size = sizeof(*dir_item) + name_len + data_len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
name, name_len);
- /*
- * FIXME: at some point we should handle xattr's that are larger than
- * what we can fit in our leaf. We set location to NULL b/c we arent
- * pointing at anything else, that will change if we store the xattr
- * data in a separate inode.
- */
- BUG_ON(IS_ERR(dir_item));
+ if (IS_ERR(dir_item))
+ return PTR_ERR(dir_item);
memset(&location, 0, sizeof(location));
leaf = path->nodes[0];
@@ -203,8 +198,6 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
key.objectid = dir;
btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
@@ -214,18 +207,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
return ERR_PTR(ret);
- if (ret > 0) {
- if (path->slots[0] == 0)
- return NULL;
- path->slots[0]--;
- }
-
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- if (found_key.objectid != dir ||
- btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
- found_key.offset != key.offset)
+ if (ret > 0)
return NULL;
return btrfs_match_dir_item_name(root, path, name, name_len);
@@ -320,8 +302,6 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
key.objectid = dir;
btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
@@ -329,18 +309,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
return ERR_PTR(ret);
- if (ret > 0) {
- if (path->slots[0] == 0)
- return NULL;
- path->slots[0]--;
- }
-
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- if (found_key.objectid != dir ||
- btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
- found_key.offset != key.offset)
+ if (ret > 0)
return NULL;
return btrfs_match_dir_item_name(root, path, name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 57106a9..b8fc473 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -100,38 +100,83 @@ struct async_submit_bio {
struct btrfs_work work;
};
-/* These are used to set the lockdep class on the extent buffer locks.
- * The class is set by the readpage_end_io_hook after the buffer has
- * passed csum validation but before the pages are unlocked.
+/*
+ * Lockdep class keys for extent_buffer->lock's in this root. For a given
+ * eb, the lockdep key is determined by the btrfs_root it belongs to and
+ * the level the eb occupies in the tree.
+ *
+ * Different roots are used for different purposes and may nest inside each
+ * other and they require separate keysets. As lockdep keys should be
+ * static, assign keysets according to the purpose of the root as indicated
+ * by btrfs_root->objectid. This ensures that all special purpose roots
+ * have separate keysets.
*
- * The lockdep class is also set by btrfs_init_new_buffer on freshly
- * allocated blocks.
+ * Lock-nesting across peer nodes is always done with the immediate parent
+ * node locked thus preventing deadlock. As lockdep doesn't know this, use
+ * subclass to avoid triggering lockdep warning in such cases.
*
- * The class is based on the level in the tree block, which allows lockdep
- * to know that lower nodes nest inside the locks of higher nodes.
+ * The key is set by the readpage_end_io_hook after the buffer has passed
+ * csum validation but before the pages are unlocked. It is also set by
+ * btrfs_init_new_buffer on freshly allocated blocks.
*
- * We also add a check to make sure the highest level of the tree is
- * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
- * code needs update as well.
+ * We also add a check to make sure the highest level of the tree is the
+ * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
+ * needs update as well.
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
# error
# endif
-static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
-static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
- /* leaf */
- "btrfs-extent-00",
- "btrfs-extent-01",
- "btrfs-extent-02",
- "btrfs-extent-03",
- "btrfs-extent-04",
- "btrfs-extent-05",
- "btrfs-extent-06",
- "btrfs-extent-07",
- /* highest possible level */
- "btrfs-extent-08",
+
+static struct btrfs_lockdep_keyset {
+ u64 id; /* root objectid */
+ const char *name_stem; /* lock name stem */
+ char names[BTRFS_MAX_LEVEL + 1][20];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
+} btrfs_lockdep_keysets[] = {
+ { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
+ { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
+ { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
+ { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" },
+ { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
+ { .id = 0, .name_stem = "tree" },
};
+
+void __init btrfs_init_lockdep(void)
+{
+ int i, j;
+
+ /* initialize lockdep class names */
+ for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
+ struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
+
+ for (j = 0; j < ARRAY_SIZE(ks->names); j++)
+ snprintf(ks->names[j], sizeof(ks->names[j]),
+ "btrfs-%s-%02d", ks->name_stem, j);
+ }
+}
+
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
+ int level)
+{
+ struct btrfs_lockdep_keyset *ks;
+
+ BUG_ON(level >= ARRAY_SIZE(ks->keys));
+
+ /* find the matching keyset, id 0 is the default entry */
+ for (ks = btrfs_lockdep_keysets; ks->id; ks++)
+ if (ks->id == objectid)
+ break;
+
+ lockdep_set_class_and_name(&eb->lock,
+ &ks->keys[level], ks->names[level]);
+}
+
#endif
/*
@@ -211,13 +256,11 @@ void btrfs_csum_final(u32 crc, char *result)
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
int verify)
{
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
char *result = NULL;
unsigned long len;
unsigned long cur_len;
unsigned long offset = BTRFS_CSUM_SIZE;
- char *map_token = NULL;
char *kaddr;
unsigned long map_start;
unsigned long map_len;
@@ -228,8 +271,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
len = buf->len - offset;
while (len > 0) {
err = map_private_extent_buffer(buf, offset, 32,
- &map_token, &kaddr,
- &map_start, &map_len, KM_USER0);
+ &kaddr, &map_start, &map_len);
if (err)
return 1;
cur_len = min(len, map_len - (offset - map_start));
@@ -237,7 +279,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
crc, cur_len);
len -= cur_len;
offset += cur_len;
- unmap_extent_buffer(buf, map_token, KM_USER0);
}
if (csum_size > sizeof(inline_result)) {
result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
@@ -325,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
- ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+ ret = read_extent_buffer_pages(io_tree, eb, start,
+ WAIT_COMPLETE,
btree_get_extent, mirror_num);
if (!ret &&
!verify_parent_transid(io_tree, eb, parent_transid))
@@ -494,15 +536,6 @@ static noinline int check_leaf(struct btrfs_root *root,
return 0;
}
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
-{
- lockdep_set_class_and_name(&eb->lock,
- &btrfs_eb_class[level],
- btrfs_eb_name[level]);
-}
-#endif
-
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state)
{
@@ -553,7 +586,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
}
found_level = btrfs_header_level(eb);
- btrfs_set_buffer_lockdep_class(eb, found_level);
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
+ eb, found_level);
ret = csum_tree_block(root, eb, 1);
if (ret) {
@@ -574,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
err:
+ if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+ clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ btree_readahead_hook(root, eb, eb->start, ret);
+ }
+
free_extent_buffer(eb);
out:
return ret;
}
+static int btree_io_failed_hook(struct bio *failed_bio,
+ struct page *page, u64 start, u64 end,
+ int mirror_num, struct extent_state *state)
+{
+ struct extent_io_tree *tree;
+ unsigned long len;
+ struct extent_buffer *eb;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+ if (!page->private)
+ goto out;
+
+ len = page->private >> 2;
+ WARN_ON(len == 0);
+
+ eb = alloc_extent_buffer(tree, start, len, page);
+ if (eb == NULL)
+ goto out;
+
+ if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+ clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ btree_readahead_hook(root, eb, eb->start, -EIO);
+ }
+ free_extent_buffer(eb);
+
+out:
+ return -EIO; /* we fixed nothing */
+}
+
static void end_workqueue_bio(struct bio *bio, int err)
{
struct end_io_wq *end_io_wq = bio->bi_private;
@@ -875,7 +946,7 @@ static int btree_readpage(struct file *file, struct page *page)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btree_get_extent);
+ return extent_read_full_page(tree, page, btree_get_extent, 0);
}
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -941,11 +1012,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
if (!buf)
return 0;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
- buf, 0, 0, btree_get_extent, 0);
+ buf, 0, WAIT_NONE, btree_get_extent, 0);
free_extent_buffer(buf);
return ret;
}
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ int mirror_num, struct extent_buffer **eb)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+ int ret;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return 0;
+
+ set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+ ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+ btree_get_extent, mirror_num);
+ if (ret) {
+ free_extent_buffer(buf);
+ return ret;
+ }
+
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+ free_extent_buffer(buf);
+ return -EIO;
+ } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+ *eb = buf;
+ } else {
+ free_extent_buffer(buf);
+ }
+ return 0;
+}
+
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
@@ -1078,12 +1181,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
init_completion(&root->kobj_unregister);
root->defrag_running = 0;
root->root_key.objectid = objectid;
- root->anon_super.s_root = NULL;
- root->anon_super.s_dev = 0;
- INIT_LIST_HEAD(&root->anon_super.s_list);
- INIT_LIST_HEAD(&root->anon_super.s_instances);
- init_rwsem(&root->anon_super.s_umount);
-
+ root->anon_dev = 0;
return 0;
}
@@ -1107,10 +1205,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->commit_root = NULL;
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
free_extent_buffer(root->node);
+ root->node = NULL;
return -EIO;
}
root->commit_root = btrfs_root_node(root);
@@ -1312,7 +1412,7 @@ again:
spin_lock_init(&root->cache_lock);
init_waitqueue_head(&root->cache_wait);
- ret = set_anon_super(&root->anon_super, NULL);
+ ret = get_anon_bdev(&root->anon_dev);
if (ret)
goto fail;
@@ -1549,6 +1649,235 @@ sleep:
return 0;
}
+/*
+ * this will find the highest generation in the array of
+ * root backups. The index of the highest array is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest root in the array with the generation
+ * in the super block. If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+ u64 cur;
+ int newest_index = -1;
+ struct btrfs_root_backup *root_backup;
+ int i;
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ root_backup = info->super_copy->super_roots + i;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = i;
+ }
+
+ /* check to see if we actually wrapped around */
+ if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+ root_backup = info->super_copy->super_roots;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = 0;
+ }
+ return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array. This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+ u64 newest_gen)
+{
+ int newest_index = -1;
+
+ newest_index = find_newest_super_backup(info, newest_gen);
+ /* if there was garbage in there, just move along */
+ if (newest_index == -1) {
+ info->backup_root_index = 0;
+ } else {
+ info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+ }
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+ int next_backup;
+ struct btrfs_root_backup *root_backup;
+ int last_backup;
+
+ next_backup = info->backup_root_index;
+ last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+
+ /*
+ * just overwrite the last backup if we're at the same generation
+ * this happens only at umount
+ */
+ root_backup = info->super_for_commit->super_roots + last_backup;
+ if (btrfs_backup_tree_root_gen(root_backup) ==
+ btrfs_header_generation(info->tree_root->node))
+ next_backup = last_backup;
+
+ root_backup = info->super_for_commit->super_roots + next_backup;
+
+ /*
+ * make sure all of our padding and empty slots get zero filled
+ * regardless of which ones we use today
+ */
+ memset(root_backup, 0, sizeof(*root_backup));
+
+ info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+ btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+ btrfs_set_backup_tree_root_gen(root_backup,
+ btrfs_header_generation(info->tree_root->node));
+
+ btrfs_set_backup_tree_root_level(root_backup,
+ btrfs_header_level(info->tree_root->node));
+
+ btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+ btrfs_set_backup_chunk_root_gen(root_backup,
+ btrfs_header_generation(info->chunk_root->node));
+ btrfs_set_backup_chunk_root_level(root_backup,
+ btrfs_header_level(info->chunk_root->node));
+
+ btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(info->extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(info->extent_root->node));
+
+ /*
+ * we might commit during log recovery, which happens before we set
+ * the fs_root. Make sure it is valid before we fill it in.
+ */
+ if (info->fs_root && info->fs_root->node) {
+ btrfs_set_backup_fs_root(root_backup,
+ info->fs_root->node->start);
+ btrfs_set_backup_fs_root_gen(root_backup,
+ btrfs_header_generation(info->fs_root->node));
+ btrfs_set_backup_fs_root_level(root_backup,
+ btrfs_header_level(info->fs_root->node));
+ }
+
+ btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+ btrfs_set_backup_dev_root_gen(root_backup,
+ btrfs_header_generation(info->dev_root->node));
+ btrfs_set_backup_dev_root_level(root_backup,
+ btrfs_header_level(info->dev_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(info->csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(info->csum_root->node));
+
+ btrfs_set_backup_total_bytes(root_backup,
+ btrfs_super_total_bytes(info->super_copy));
+ btrfs_set_backup_bytes_used(root_backup,
+ btrfs_super_bytes_used(info->super_copy));
+ btrfs_set_backup_num_devices(root_backup,
+ btrfs_super_num_devices(info->super_copy));
+
+ /*
+ * if we don't copy this out to the super_copy, it won't get remembered
+ * for the next commit
+ */
+ memcpy(&info->super_copy->super_roots,
+ &info->super_for_commit->super_roots,
+ sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block. It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+ struct btrfs_super_block *super,
+ int *num_backups_tried, int *backup_index)
+{
+ struct btrfs_root_backup *root_backup;
+ int newest = *backup_index;
+
+ if (*num_backups_tried == 0) {
+ u64 gen = btrfs_super_generation(super);
+
+ newest = find_newest_super_backup(info, gen);
+ if (newest == -1)
+ return -1;
+
+ *backup_index = newest;
+ *num_backups_tried = 1;
+ } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+ /* we've tried all the backups, all done */
+ return -1;
+ } else {
+ /* jump to the next oldest backup */
+ newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+ *backup_index = newest;
+ *num_backups_tried += 1;
+ }
+ root_backup = super->super_roots + newest;
+
+ btrfs_set_super_generation(super,
+ btrfs_backup_tree_root_gen(root_backup));
+ btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+ btrfs_set_super_root_level(super,
+ btrfs_backup_tree_root_level(root_backup));
+ btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+ /*
+ * fixme: the total bytes and num_devices need to match or we should
+ * need a fsck
+ */
+ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+ btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+ return 0;
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+ free_extent_buffer(info->tree_root->node);
+ free_extent_buffer(info->tree_root->commit_root);
+ free_extent_buffer(info->dev_root->node);
+ free_extent_buffer(info->dev_root->commit_root);
+ free_extent_buffer(info->extent_root->node);
+ free_extent_buffer(info->extent_root->commit_root);
+ free_extent_buffer(info->csum_root->node);
+ free_extent_buffer(info->csum_root->commit_root);
+
+ info->tree_root->node = NULL;
+ info->tree_root->commit_root = NULL;
+ info->dev_root->node = NULL;
+ info->dev_root->commit_root = NULL;
+ info->extent_root->node = NULL;
+ info->extent_root->commit_root = NULL;
+ info->csum_root->node = NULL;
+ info->csum_root->commit_root = NULL;
+
+ if (chunk_root) {
+ free_extent_buffer(info->chunk_root->node);
+ free_extent_buffer(info->chunk_root->commit_root);
+ info->chunk_root->node = NULL;
+ info->chunk_root->commit_root = NULL;
+ }
+}
+
+
struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
@@ -1562,29 +1891,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
u64 features;
struct btrfs_key location;
struct buffer_head *bh;
- struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
- struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
+ struct btrfs_super_block *disk_super;
struct btrfs_root *tree_root = btrfs_sb(sb);
- struct btrfs_fs_info *fs_info = NULL;
- struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
- struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_root *extent_root;
+ struct btrfs_root *csum_root;
+ struct btrfs_root *chunk_root;
+ struct btrfs_root *dev_root;
struct btrfs_root *log_tree_root;
-
int ret;
int err = -EINVAL;
-
- struct btrfs_super_block *disk_super;
-
- if (!extent_root || !tree_root || !tree_root->fs_info ||
- !chunk_root || !dev_root || !csum_root) {
+ int num_backups_tried = 0;
+ int backup_index = 0;
+
+ extent_root = fs_info->extent_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ csum_root = fs_info->csum_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ chunk_root = fs_info->chunk_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ dev_root = fs_info->dev_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+
+ if (!extent_root || !csum_root || !chunk_root || !dev_root) {
err = -ENOMEM;
goto fail;
}
- fs_info = tree_root->fs_info;
ret = init_srcu_struct(&fs_info->subvol_srcu);
if (ret) {
@@ -1604,7 +1936,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_bdi;
}
- fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
+ mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
@@ -1620,15 +1952,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
+ spin_lock_init(&fs_info->free_chunk_lock);
mutex_init(&fs_info->reloc_mutex);
init_completion(&fs_info->kobj_unregister);
- fs_info->tree_root = tree_root;
- fs_info->extent_root = extent_root;
- fs_info->csum_root = csum_root;
- fs_info->chunk_root = chunk_root;
- fs_info->dev_root = dev_root;
- fs_info->fs_devices = fs_devices;
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
btrfs_mapping_init(&fs_info->mapping_tree);
@@ -1637,8 +1964,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_block_rsv(&fs_info->trans_block_rsv);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
btrfs_init_block_rsv(&fs_info->empty_block_rsv);
- INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
- mutex_init(&fs_info->durable_block_rsv_mutex);
+ btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
@@ -1649,6 +1975,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->trans_no_join = 0;
+ fs_info->free_chunk_space = 0;
+
+ /* readahead state */
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@@ -1677,7 +2008,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_bdi = &fs_info->bdi;
fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
- fs_info->btree_inode->i_nlink = 1;
+ set_nlink(fs_info->btree_inode, 1);
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
@@ -1738,14 +2069,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_alloc;
}
- memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
- memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
- sizeof(fs_info->super_for_commit));
+ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_for_commit));
brelse(bh);
- memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
- disk_super = &fs_info->super_copy;
+ disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
goto fail_alloc;
@@ -1755,6 +2086,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
/*
+ * run through our array of backup supers and setup
+ * our ring pointer to the oldest one
+ */
+ generation = btrfs_super_generation(disk_super);
+ find_oldest_super_backup(fs_info, generation);
+
+ /*
* In the long term, we'll store the compression type in the super
* block, and it'll be used for per file compression control.
*/
@@ -1808,6 +2146,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->thread_pool_size),
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->caching_workers, "cache",
+ 2, &fs_info->generic_worker);
+
/* a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
* devices
@@ -1839,6 +2180,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->readahead_workers, "readahead",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@@ -1849,18 +2193,29 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->endio_write_workers.idle_thresh = 2;
fs_info->endio_meta_write_workers.idle_thresh = 2;
+ fs_info->readahead_workers.idle_thresh = 2;
- btrfs_start_workers(&fs_info->workers, 1);
- btrfs_start_workers(&fs_info->generic_worker, 1);
- btrfs_start_workers(&fs_info->submit_workers, 1);
- btrfs_start_workers(&fs_info->delalloc_workers, 1);
- btrfs_start_workers(&fs_info->fixup_workers, 1);
- btrfs_start_workers(&fs_info->endio_workers, 1);
- btrfs_start_workers(&fs_info->endio_meta_workers, 1);
- btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
- btrfs_start_workers(&fs_info->endio_write_workers, 1);
- btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
- btrfs_start_workers(&fs_info->delayed_workers, 1);
+ /*
+ * btrfs_start_workers can really only fail because of ENOMEM so just
+ * return -ENOMEM if any of these fail.
+ */
+ ret = btrfs_start_workers(&fs_info->workers);
+ ret |= btrfs_start_workers(&fs_info->generic_worker);
+ ret |= btrfs_start_workers(&fs_info->submit_workers);
+ ret |= btrfs_start_workers(&fs_info->delalloc_workers);
+ ret |= btrfs_start_workers(&fs_info->fixup_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_write_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
+ ret |= btrfs_start_workers(&fs_info->delayed_workers);
+ ret |= btrfs_start_workers(&fs_info->caching_workers);
+ ret |= btrfs_start_workers(&fs_info->readahead_workers);
+ if (ret) {
+ ret = -ENOMEM;
+ goto fail_sb_buffer;
+ }
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1907,7 +2262,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
sb->s_id);
- goto fail_chunk_root;
+ goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1922,11 +2277,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (ret) {
printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
sb->s_id);
- goto fail_chunk_root;
+ goto fail_tree_roots;
}
btrfs_close_extra_devices(fs_devices);
+retry_root_backup:
blocksize = btrfs_level_size(tree_root,
btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
@@ -1934,32 +2290,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
blocksize, generation);
- if (!tree_root->node)
- goto fail_chunk_root;
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ if (!tree_root->node ||
+ !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
sb->s_id);
- goto fail_tree_root;
+
+ goto recovery_tree_root;
}
+
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_EXTENT_TREE_OBJECTID, extent_root);
if (ret)
- goto fail_tree_root;
+ goto recovery_tree_root;
extent_root->track_dirty = 1;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_DEV_TREE_OBJECTID, dev_root);
if (ret)
- goto fail_extent_root;
+ goto recovery_tree_root;
dev_root->track_dirty = 1;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_CSUM_TREE_OBJECTID, csum_root);
if (ret)
- goto fail_dev_root;
+ goto recovery_tree_root;
csum_root->track_dirty = 1;
@@ -2092,22 +2449,13 @@ fail_cleaner:
fail_block_groups:
btrfs_free_block_groups(fs_info);
- free_extent_buffer(csum_root->node);
- free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
- free_extent_buffer(dev_root->node);
- free_extent_buffer(dev_root->commit_root);
-fail_extent_root:
- free_extent_buffer(extent_root->node);
- free_extent_buffer(extent_root->commit_root);
-fail_tree_root:
- free_extent_buffer(tree_root->node);
- free_extent_buffer(tree_root->commit_root);
-fail_chunk_root:
- free_extent_buffer(chunk_root->node);
- free_extent_buffer(chunk_root->commit_root);
+
+fail_tree_roots:
+ free_root_pointers(fs_info, 1);
+
fail_sb_buffer:
btrfs_stop_workers(&fs_info->generic_worker);
+ btrfs_stop_workers(&fs_info->readahead_workers);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
@@ -2118,26 +2466,39 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_freespace_worker);
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
+ btrfs_stop_workers(&fs_info->caching_workers);
fail_alloc:
- kfree(fs_info->delayed_root);
fail_iput:
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
-
- btrfs_close_devices(fs_info->fs_devices);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
fail_bdi:
bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
- kfree(extent_root);
- kfree(tree_root);
- kfree(fs_info);
- kfree(chunk_root);
- kfree(dev_root);
- kfree(csum_root);
+ btrfs_close_devices(fs_info->fs_devices);
+ free_fs_info(fs_info);
return ERR_PTR(err);
+
+recovery_tree_root:
+ if (!btrfs_test_opt(tree_root, RECOVERY))
+ goto fail_tree_roots;
+
+ free_root_pointers(fs_info, 0);
+
+ /* don't use the log in recovery mode, it won't be valid */
+ btrfs_set_super_log_root(disk_super, 0);
+
+ /* we can't trust the free space cache either */
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+ ret = next_root_backup(fs_info, fs_info->super_copy,
+ &num_backups_tried, &backup_index);
+ if (ret == -1)
+ goto fail_block_groups;
+ goto retry_root_backup;
}
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2221,22 +2582,10 @@ static int write_dev_supers(struct btrfs_device *device,
int errors = 0;
u32 crc;
u64 bytenr;
- int last_barrier = 0;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
- /* make sure only the last submit_bh does a barrier */
- if (do_barriers) {
- for (i = 0; i < max_mirrors; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- device->total_bytes)
- break;
- last_barrier = i;
- }
- }
-
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2282,17 +2631,140 @@ static int write_dev_supers(struct btrfs_device *device,
bh->b_end_io = btrfs_end_buffer_write_sync;
}
- if (i == last_barrier && do_barriers)
- ret = submit_bh(WRITE_FLUSH_FUA, bh);
- else
- ret = submit_bh(WRITE_SYNC, bh);
-
+ /*
+ * we fua the first super. The others we allow
+ * to go down lazy.
+ */
+ ret = submit_bh(WRITE_FUA, bh);
if (ret)
errors++;
}
return errors < i ? 0 : -1;
}
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ }
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * trigger flushes for one the devices. If you pass wait == 0, the flushes are
+ * sent down. With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp are flagged as not-barrier
+ * capable
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+ struct bio *bio;
+ int ret = 0;
+
+ if (device->nobarriers)
+ return 0;
+
+ if (wait) {
+ bio = device->flush_bio;
+ if (!bio)
+ return 0;
+
+ wait_for_completion(&device->flush_wait);
+
+ if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+ printk("btrfs: disabling barriers on dev %s\n",
+ device->name);
+ device->nobarriers = 1;
+ }
+ if (!bio_flagged(bio, BIO_UPTODATE)) {
+ ret = -EIO;
+ }
+
+ /* drop the reference from the wait == 0 run */
+ bio_put(bio);
+ device->flush_bio = NULL;
+
+ return ret;
+ }
+
+ /*
+ * one reference for us, and we leave it for the
+ * caller
+ */
+ device->flush_bio = NULL;;
+ bio = bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_end_io = btrfs_end_empty_barrier;
+ bio->bi_bdev = device->bdev;
+ init_completion(&device->flush_wait);
+ bio->bi_private = &device->flush_wait;
+ device->flush_bio = bio;
+
+ bio_get(bio);
+ submit_bio(WRITE_FLUSH, bio);
+
+ return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ int errors = 0;
+ int ret;
+
+ /* send down all the barriers */
+ head = &info->fs_devices->devices;
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
+ if (!dev->bdev) {
+ errors++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_flush(dev, 0);
+ if (ret)
+ errors++;
+ }
+
+ /* wait for all the barriers */
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
+ if (!dev->bdev) {
+ errors++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_flush(dev, 1);
+ if (ret)
+ errors++;
+ }
+ if (errors)
+ return -EIO;
+ return 0;
+}
+
int write_all_supers(struct btrfs_root *root, int max_mirrors)
{
struct list_head *head;
@@ -2305,14 +2777,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
int total_errors = 0;
u64 flags;
- max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+ max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
do_barriers = !btrfs_test_opt(root, NOBARRIER);
+ backup_super_roots(root->fs_info);
- sb = &root->fs_info->super_for_commit;
+ sb = root->fs_info->super_for_commit;
dev_item = &sb->dev_item;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
+
+ if (do_barriers)
+ barrier_all_devices(root->fs_info);
+
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
total_errors++;
@@ -2394,10 +2871,8 @@ static void free_fs_root(struct btrfs_root *root)
{
iput(root->cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
- if (root->anon_super.s_dev) {
- down_write(&root->anon_super.s_umount);
- kill_anon_super(&root->anon_super);
- }
+ if (root->anon_dev)
+ free_anon_bdev(root->anon_dev);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
kfree(root->free_ino_ctl);
@@ -2514,8 +2989,6 @@ int close_ctree(struct btrfs_root *root)
/* clear out the rbtree of defraggable inodes */
btrfs_run_defrag_inodes(root->fs_info);
- btrfs_put_block_group_cache(fs_info);
-
/*
* Here come 2 situations when btrfs is broken to flip readonly:
*
@@ -2541,6 +3014,8 @@ int close_ctree(struct btrfs_root *root)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
+ btrfs_put_block_group_cache(fs_info);
+
kthread_stop(root->fs_info->transaction_kthread);
kthread_stop(root->fs_info->cleaner_kthread);
@@ -2572,7 +3047,6 @@ int close_ctree(struct btrfs_root *root)
del_fs_roots(fs_info);
iput(fs_info->btree_inode);
- kfree(fs_info->delayed_root);
btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2585,6 +3059,8 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_freespace_worker);
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
+ btrfs_stop_workers(&fs_info->caching_workers);
+ btrfs_stop_workers(&fs_info->readahead_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2592,12 +3068,7 @@ int close_ctree(struct btrfs_root *root)
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
- kfree(fs_info->extent_root);
- kfree(fs_info->tree_root);
- kfree(fs_info->chunk_root);
- kfree(fs_info->dev_root);
- kfree(fs_info->csum_root);
- kfree(fs_info);
+ free_fs_info(fs_info);
return 0;
}
@@ -2703,7 +3174,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
return ret;
}
-int btree_lock_page_hook(struct page *page)
+static int btree_lock_page_hook(struct page *page, void *data,
+ void (*flush_fn)(void *))
{
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2720,7 +3192,10 @@ int btree_lock_page_hook(struct page *page)
if (!eb)
goto out;
- btrfs_tree_lock(eb);
+ if (!btrfs_try_tree_write_lock(eb)) {
+ flush_fn(data);
+ btrfs_tree_lock(eb);
+ }
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2735,7 +3210,10 @@ int btree_lock_page_hook(struct page *page)
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
out:
- lock_page(page);
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
+ }
return 0;
}
@@ -3003,12 +3481,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
if (ret)
break;
- /* opt_discard */
- if (btrfs_test_opt(root, DISCARD))
- ret = btrfs_error_discard_extent(root, start,
- end + 1 - start,
- NULL);
-
clear_extent_dirty(unpin, start, end, GFP_NOFS);
btrfs_error_unpin_extent_range(root, start, end);
cond_resched();
@@ -3091,6 +3563,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
+ .readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a0b610a..c99d0a8f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u32 blocksize, u64 parent_transid);
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
u64 parent_transid);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,14 +85,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btree_lock_page_hook(struct page *page);
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+void btrfs_init_lockdep(void);
+void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level);
#else
-static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
- int level)
+static inline void btrfs_init_lockdep(void)
+{ }
+static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level)
{
}
#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 01220b7..da528f8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
CHUNK_ALLOC_LIMITED = 2,
};
+/*
+ * Control how reservations are dealt with.
+ *
+ * RESERVE_FREE - freeing a reservation.
+ * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
+ * ENOSPC accounting
+ * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
+ * bytes_may_use as the ENOSPC accounting is done elsewhere
+ */
+enum {
+ RESERVE_FREE = 0,
+ RESERVE_ALLOC = 1,
+ RESERVE_ALLOC_NO_ACCOUNT = 2,
+};
+
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
if (atomic_dec_and_test(&cache->count)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
- WARN_ON(cache->reserved_pinned > 0);
kfree(cache->free_space_ctl);
kfree(cache);
}
@@ -320,12 +337,12 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
return total_added;
}
-static int caching_kthread(void *data)
+static noinline void caching_thread(struct btrfs_work *work)
{
- struct btrfs_block_group_cache *block_group = data;
- struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -334,9 +351,14 @@ static int caching_kthread(void *data)
u32 nritems;
int ret = 0;
+ caching_ctl = container_of(work, struct btrfs_caching_control, work);
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ extent_root = fs_info->extent_root;
+
path = btrfs_alloc_path();
if (!path)
- return -ENOMEM;
+ goto out;
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -433,13 +455,11 @@ err:
free_excluded_extents(extent_root, block_group);
mutex_unlock(&caching_ctl->mutex);
+out:
wake_up(&caching_ctl->wait);
put_caching_control(caching_ctl);
- atomic_dec(&block_group->space_info->caching_threads);
btrfs_put_block_group(block_group);
-
- return 0;
}
static int cache_block_group(struct btrfs_block_group_cache *cache,
@@ -447,14 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
struct btrfs_root *root,
int load_cache_only)
{
+ DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl;
- struct task_struct *tsk;
int ret = 0;
- smp_mb();
- if (cache->cached != BTRFS_CACHE_NO)
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+ BUG_ON(!caching_ctl);
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->key.objectid;
+ atomic_set(&caching_ctl->count, 1);
+ caching_ctl->work.func = caching_thread;
+
+ spin_lock(&cache->lock);
+ /*
+ * This should be a rare occasion, but this could happen I think in the
+ * case where one thread starts to load the space cache info, and then
+ * some other thread starts a transaction commit which tries to do an
+ * allocation while the other thread is still loading the space cache
+ * info. The previous loop should have kept us from choosing this block
+ * group, but if we've moved to the state where we will wait on caching
+ * block groups we need to first check if we're doing a fast load here,
+ * so we can wait for it to finish, otherwise we could end up allocating
+ * from a block group who's cache gets evicted for one reason or
+ * another.
+ */
+ while (cache->cached == BTRFS_CACHE_FAST) {
+ struct btrfs_caching_control *ctl;
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&cache->lock);
+
+ schedule();
+
+ finish_wait(&ctl->wait, &wait);
+ put_caching_control(ctl);
+ spin_lock(&cache->lock);
+ }
+
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+ kfree(caching_ctl);
return 0;
+ }
+ WARN_ON(cache->caching_ctl);
+ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_FAST;
+ spin_unlock(&cache->lock);
/*
* We can't do the read from on-disk cache during a commit since we need
@@ -463,69 +528,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
* we likely hold important locks.
*/
if (trans && (!trans->transaction->in_commit) &&
- (root && root != root->fs_info->tree_root)) {
- spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- return 0;
- }
- cache->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&cache->lock);
-
+ (root && root != root->fs_info->tree_root) &&
+ btrfs_test_opt(root, SPACE_CACHE)) {
ret = load_free_space_cache(fs_info, cache);
spin_lock(&cache->lock);
if (ret == 1) {
+ cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_FINISHED;
cache->last_byte_to_unpin = (u64)-1;
} else {
- cache->cached = BTRFS_CACHE_NO;
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ }
}
spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
if (ret == 1) {
+ put_caching_control(caching_ctl);
free_excluded_extents(fs_info->extent_root, cache);
return 0;
}
+ } else {
+ /*
+ * We are not going to do the fast caching, set cached to the
+ * appropriate value and wakeup any waiters.
+ */
+ spin_lock(&cache->lock);
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ }
+ spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
}
- if (load_cache_only)
- return 0;
-
- caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
- BUG_ON(!caching_ctl);
-
- INIT_LIST_HEAD(&caching_ctl->list);
- mutex_init(&caching_ctl->mutex);
- init_waitqueue_head(&caching_ctl->wait);
- caching_ctl->block_group = cache;
- caching_ctl->progress = cache->key.objectid;
- /* one for caching kthread, one for caching block group list */
- atomic_set(&caching_ctl->count, 2);
-
- spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- kfree(caching_ctl);
+ if (load_cache_only) {
+ put_caching_control(caching_ctl);
return 0;
}
- cache->caching_ctl = caching_ctl;
- cache->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&cache->lock);
down_write(&fs_info->extent_commit_sem);
+ atomic_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
up_write(&fs_info->extent_commit_sem);
- atomic_inc(&cache->space_info->caching_threads);
btrfs_get_block_group(cache);
- tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
- cache->key.objectid);
- if (IS_ERR(tsk)) {
- ret = PTR_ERR(tsk);
- printk(KERN_ERR "error running thread %d\n", ret);
- BUG();
- }
+ btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
return ret;
}
@@ -667,7 +722,9 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
struct btrfs_path *path;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
key.objectid = start;
key.offset = len;
btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -1772,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
{
int ret;
u64 discarded_bytes = 0;
- struct btrfs_multi_bio *multi = NULL;
+ struct btrfs_bio *bbio = NULL;
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
- bytenr, &num_bytes, &multi, 0);
+ bytenr, &num_bytes, &bbio, 0);
if (!ret) {
- struct btrfs_bio_stripe *stripe = multi->stripes;
+ struct btrfs_bio_stripe *stripe = bbio->stripes;
int i;
- for (i = 0; i < multi->num_stripes; i++, stripe++) {
+ for (i = 0; i < bbio->num_stripes; i++, stripe++) {
if (!stripe->dev->can_discard)
continue;
@@ -1802,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
*/
ret = 0;
}
- kfree(multi);
+ kfree(bbio);
}
if (actual_bytes)
@@ -2702,6 +2759,13 @@ again:
goto again;
}
+ /* We've already setup this transaction, go ahead and exit */
+ if (block_group->cache_generation == trans->transid &&
+ i_size_read(inode)) {
+ dcs = BTRFS_DC_SETUP;
+ goto out_put;
+ }
+
/*
* We want to set the generation to 0, that way if anything goes wrong
* from here on out we know not to trust this cache when we load up next
@@ -2751,12 +2815,15 @@ again:
if (!ret)
dcs = BTRFS_DC_SETUP;
btrfs_free_reserved_data_space(inode, num_pages);
+
out_put:
iput(inode);
out_free:
btrfs_release_path(path);
out:
spin_lock(&block_group->lock);
+ if (!ret && dcs == BTRFS_DC_SETUP)
+ block_group->cache_generation = trans->transid;
block_group->disk_cache_state = dcs;
spin_unlock(&block_group->lock);
@@ -2940,9 +3007,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->full = 0;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
+ found->flush = 0;
+ init_waitqueue_head(&found->wait);
*space_info = found;
list_add_rcu(&found->list, &info->space_info);
- atomic_set(&found->caching_threads, 0);
return 0;
}
@@ -3123,16 +3191,13 @@ commit_trans:
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
- BTRFS_I(inode)->reserved_bytes += bytes;
spin_unlock(&data_sinfo->lock);
return 0;
}
/*
- * called when we are clearing an delalloc extent from the
- * inode's io_tree or there was an error for whatever reason
- * after calling btrfs_check_data_free_space
+ * Called if we need to clear a data reservation for this inode.
*/
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
{
@@ -3145,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
data_sinfo = BTRFS_I(inode)->space_info;
spin_lock(&data_sinfo->lock);
data_sinfo->bytes_may_use -= bytes;
- BTRFS_I(inode)->reserved_bytes -= bytes;
spin_unlock(&data_sinfo->lock);
}
@@ -3166,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
struct btrfs_space_info *sinfo, u64 alloc_bytes,
int force)
{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
u64 thresh;
@@ -3174,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
return 1;
/*
+ * We need to take into account the global rsv because for all intents
+ * and purposes it's used space. Don't worry about locking the
+ * global_rsv, it doesn't change except when the transaction commits.
+ */
+ num_allocated += global_rsv->size;
+
+ /*
* in limited mode, we want to have some free space up to
* about 1% of the FS size.
*/
if (force == CHUNK_ALLOC_LIMITED) {
- thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+ thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
thresh = max_t(u64, 64 * 1024 * 1024,
div_factor_fine(thresh, 1));
@@ -3200,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
return 0;
- thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+ thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
/* 256MB or 5% of the FS */
thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
@@ -3283,6 +3355,9 @@ again:
}
ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out;
+
spin_lock(&space_info->lock);
if (ret)
space_info->full = 1;
@@ -3292,6 +3367,7 @@ again:
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
+out:
mutex_unlock(&extent_root->fs_info->chunk_mutex);
return ret;
}
@@ -3299,42 +3375,54 @@ again:
/*
* shrink metadata reservation for delalloc
*/
-static int shrink_delalloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 to_reclaim, int sync)
+static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
+ bool wait_ordered)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
+ struct btrfs_trans_handle *trans;
u64 reserved;
u64 max_reclaim;
u64 reclaimed = 0;
long time_left;
- int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
int loops = 0;
unsigned long progress;
+ trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &root->fs_info->delalloc_block_rsv;
space_info = block_rsv->space_info;
smp_mb();
- reserved = space_info->bytes_reserved;
+ reserved = space_info->bytes_may_use;
progress = space_info->reservation_progress;
if (reserved == 0)
return 0;
- max_reclaim = min(reserved, to_reclaim);
+ smp_mb();
+ if (root->fs_info->delalloc_bytes == 0) {
+ if (trans)
+ return 0;
+ btrfs_wait_ordered_extents(root, 0, 0);
+ return 0;
+ }
+ max_reclaim = min(reserved, to_reclaim);
+ nr_pages = max_t(unsigned long, nr_pages,
+ max_reclaim >> PAGE_CACHE_SHIFT);
while (loops < 1024) {
/* have the flusher threads jump in and do some IO */
smp_mb();
nr_pages = min_t(unsigned long, nr_pages,
root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
- writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
+ writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
+ WB_REASON_FS_FREE_SPACE);
spin_lock(&space_info->lock);
- if (reserved > space_info->bytes_reserved)
- reclaimed += reserved - space_info->bytes_reserved;
- reserved = space_info->bytes_reserved;
+ if (reserved > space_info->bytes_may_use)
+ reclaimed += reserved - space_info->bytes_may_use;
+ reserved = space_info->bytes_may_use;
spin_unlock(&space_info->lock);
loops++;
@@ -3345,11 +3433,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
if (trans && trans->transaction->blocked)
return -EAGAIN;
- time_left = schedule_timeout_interruptible(1);
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_extents(root, 0, 0);
+ } else {
+ time_left = schedule_timeout_interruptible(1);
- /* We were interrupted, exit */
- if (time_left)
- break;
+ /* We were interrupted, exit */
+ if (time_left)
+ break;
+ }
/* we've kicked the IO a few times, if anything has been freed,
* exit. There is no sense in looping here for a long time
@@ -3364,42 +3456,121 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
}
}
+
return reclaimed >= to_reclaim;
}
-/*
- * Retries tells us how many times we've called reserve_metadata_bytes. The
- * idea is if this is the first call (retries == 0) then we will add to our
- * reserved count if we can't make the allocation in order to hold our place
- * while we go and try and free up space. That way for retries > 1 we don't try
- * and add space, we just check to see if the amount of unused space is >= the
- * total space, meaning that our reservation is valid.
+/**
+ * maybe_commit_transaction - possibly commit the transaction if its ok to
+ * @root - the root we're allocating for
+ * @bytes - the number of bytes we want to reserve
+ * @force - force the commit
*
- * However if we don't intend to retry this reservation, pass -1 as retries so
- * that it short circuits this logic.
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does. Otherwise it
+ * will return -ENOSPC.
*/
-static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int may_commit_transaction(struct btrfs_root *root,
+ struct btrfs_space_info *space_info,
+ u64 bytes, int force)
+{
+ struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+ struct btrfs_trans_handle *trans;
+
+ trans = (struct btrfs_trans_handle *)current->journal_info;
+ if (trans)
+ return -EAGAIN;
+
+ if (force)
+ goto commit;
+
+ /* See if there is enough pinned space to make this reservation */
+ spin_lock(&space_info->lock);
+ if (space_info->bytes_pinned >= bytes) {
+ spin_unlock(&space_info->lock);
+ goto commit;
+ }
+ spin_unlock(&space_info->lock);
+
+ /*
+ * See if there is some space in the delayed insertion reservation for
+ * this reservation.
+ */
+ if (space_info != delayed_rsv->space_info)
+ return -ENOSPC;
+
+ spin_lock(&delayed_rsv->lock);
+ if (delayed_rsv->size < bytes) {
+ spin_unlock(&delayed_rsv->lock);
+ return -ENOSPC;
+ }
+ spin_unlock(&delayed_rsv->lock);
+
+commit:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return -ENOSPC;
+
+ return btrfs_commit_transaction(trans, root);
+}
+
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - wether or not we can flush to make our reservation
+ *
+ * This will reserve orgi_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes, int flush)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
- u64 unused;
+ u64 used;
u64 num_bytes = orig_bytes;
int retries = 0;
int ret = 0;
- bool reserved = false;
bool committed = false;
+ bool flushing = false;
+ bool wait_ordered = false;
again:
- ret = -ENOSPC;
- if (reserved)
- num_bytes = 0;
-
+ ret = 0;
spin_lock(&space_info->lock);
- unused = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
+ /*
+ * We only want to wait if somebody other than us is flushing and we are
+ * actually alloed to flush.
+ */
+ while (flush && !flushing && space_info->flush) {
+ spin_unlock(&space_info->lock);
+ /*
+ * If we have a trans handle we can't wait because the flusher
+ * may have to commit the transaction, which would mean we would
+ * deadlock since we are waiting for the flusher to finish, but
+ * hold the current transaction open.
+ */
+ if (current->journal_info)
+ return -EAGAIN;
+ ret = wait_event_interruptible(space_info->wait,
+ !space_info->flush);
+ /* Must have been interrupted, return */
+ if (ret)
+ return -EINTR;
+
+ spin_lock(&space_info->lock);
+ }
+
+ ret = -ENOSPC;
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
/*
* The idea here is that we've not already over-reserved the block group
@@ -3408,11 +3579,9 @@ again:
* lets start flushing stuff first and then come back and try to make
* our reservation.
*/
- if (unused <= space_info->total_bytes) {
- unused = space_info->total_bytes - unused;
- if (unused >= num_bytes) {
- if (!reserved)
- space_info->bytes_reserved += orig_bytes;
+ if (used <= space_info->total_bytes) {
+ if (used + orig_bytes <= space_info->total_bytes) {
+ space_info->bytes_may_use += orig_bytes;
ret = 0;
} else {
/*
@@ -3428,91 +3597,129 @@ again:
* amount plus the amount of bytes that we need for this
* reservation.
*/
- num_bytes = unused - space_info->total_bytes +
+ wait_ordered = true;
+ num_bytes = used - space_info->total_bytes +
(orig_bytes * (retries + 1));
}
+ if (ret) {
+ u64 profile = btrfs_get_alloc_profile(root, 0);
+ u64 avail;
+
+ /*
+ * If we have a lot of space that's pinned, don't bother doing
+ * the overcommit dance yet and just commit the transaction.
+ */
+ avail = (space_info->total_bytes - space_info->bytes_used) * 8;
+ do_div(avail, 10);
+ if (space_info->bytes_pinned >= avail && flush && !committed) {
+ space_info->flush = 1;
+ flushing = true;
+ spin_unlock(&space_info->lock);
+ ret = may_commit_transaction(root, space_info,
+ orig_bytes, 1);
+ if (ret)
+ goto out;
+ committed = true;
+ goto again;
+ }
+
+ spin_lock(&root->fs_info->free_chunk_lock);
+ avail = root->fs_info->free_chunk_space;
+
+ /*
+ * If we have dup, raid1 or raid10 then only half of the free
+ * space is actually useable.
+ */
+ if (profile & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ avail >>= 1;
+
+ /*
+ * If we aren't flushing don't let us overcommit too much, say
+ * 1/8th of the space. If we can flush, let it overcommit up to
+ * 1/2 of the space.
+ */
+ if (flush)
+ avail >>= 3;
+ else
+ avail >>= 1;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
+ if (used + num_bytes < space_info->total_bytes + avail) {
+ space_info->bytes_may_use += orig_bytes;
+ ret = 0;
+ } else {
+ wait_ordered = true;
+ }
+ }
+
/*
* Couldn't make our reservation, save our place so while we're trying
* to reclaim space we can actually use it instead of somebody else
* stealing it from us.
*/
- if (ret && !reserved) {
- space_info->bytes_reserved += orig_bytes;
- reserved = true;
+ if (ret && flush) {
+ flushing = true;
+ space_info->flush = 1;
}
spin_unlock(&space_info->lock);
- if (!ret)
- return 0;
-
- if (!flush)
+ if (!ret || !flush)
goto out;
/*
* We do synchronous shrinking since we don't actually unreserve
* metadata until after the IO is completed.
*/
- ret = shrink_delalloc(trans, root, num_bytes, 1);
- if (ret > 0)
- return 0;
- else if (ret < 0)
+ ret = shrink_delalloc(root, num_bytes, wait_ordered);
+ if (ret < 0)
goto out;
+ ret = 0;
+
/*
* So if we were overcommitted it's possible that somebody else flushed
* out enough space and we simply didn't have enough space to reclaim,
* so go back around and try again.
*/
if (retries < 2) {
+ wait_ordered = true;
retries++;
goto again;
}
- spin_lock(&space_info->lock);
- /*
- * Not enough space to be reclaimed, don't bother committing the
- * transaction.
- */
- if (space_info->bytes_pinned < orig_bytes)
- ret = -ENOSPC;
- spin_unlock(&space_info->lock);
- if (ret)
- goto out;
-
- ret = -EAGAIN;
- if (trans || committed)
- goto out;
-
ret = -ENOSPC;
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
+ if (committed)
goto out;
- ret = btrfs_commit_transaction(trans, root);
+
+ ret = may_commit_transaction(root, space_info, orig_bytes, 0);
if (!ret) {
- trans = NULL;
committed = true;
goto again;
}
out:
- if (reserved) {
+ if (flushing) {
spin_lock(&space_info->lock);
- space_info->bytes_reserved -= orig_bytes;
+ space_info->flush = 0;
+ wake_up_all(&space_info->wait);
spin_unlock(&space_info->lock);
}
-
return ret;
}
static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_block_rsv *block_rsv;
- if (root->ref_cows)
+ struct btrfs_block_rsv *block_rsv = NULL;
+
+ if (root->ref_cows || root == root->fs_info->csum_root)
block_rsv = trans->block_rsv;
- else
+
+ if (!block_rsv)
block_rsv = root->block_rsv;
if (!block_rsv)
@@ -3583,7 +3790,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
}
if (num_bytes) {
spin_lock(&space_info->lock);
- space_info->bytes_reserved -= num_bytes;
+ space_info->bytes_may_use -= num_bytes;
space_info->reservation_progress++;
spin_unlock(&space_info->lock);
}
@@ -3607,9 +3814,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
- atomic_set(&rsv->usage, 1);
- rsv->priority = 6;
- INIT_LIST_HEAD(&rsv->list);
}
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3630,38 +3834,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
- if (rsv && atomic_dec_and_test(&rsv->usage)) {
- btrfs_block_rsv_release(root, rsv, (u64)-1);
- if (!rsv->durable)
- kfree(rsv);
- }
-}
-
-/*
- * make the block_rsv struct be able to capture freed space.
- * the captured space will re-add to the the block_rsv struct
- * after transaction commit
- */
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv)
-{
- block_rsv->durable = 1;
- mutex_lock(&fs_info->durable_block_rsv_mutex);
- list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
- mutex_unlock(&fs_info->durable_block_rsv_mutex);
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ kfree(rsv);
}
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
+static inline int __block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int flush)
{
int ret;
if (num_bytes == 0)
return 0;
- ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 1);
return 0;
@@ -3670,56 +3856,80 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int min_factor)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ return __block_rsv_add(root, block_rsv, num_bytes, 1);
+}
+
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ return __block_rsv_add(root, block_rsv, num_bytes, 0);
+}
+
+int btrfs_block_rsv_check(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int min_factor)
{
u64 num_bytes = 0;
- int commit_trans = 0;
int ret = -ENOSPC;
if (!block_rsv)
return 0;
spin_lock(&block_rsv->lock);
- if (min_factor > 0)
- num_bytes = div_factor(block_rsv->size, min_factor);
- if (min_reserved > num_bytes)
- num_bytes = min_reserved;
+ num_bytes = div_factor(block_rsv->size, min_factor);
+ if (block_rsv->reserved >= num_bytes)
+ ret = 0;
+ spin_unlock(&block_rsv->lock);
- if (block_rsv->reserved >= num_bytes) {
+ return ret;
+}
+
+static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved, int flush)
+{
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ num_bytes = min_reserved;
+ if (block_rsv->reserved >= num_bytes)
ret = 0;
- } else {
+ else
num_bytes -= block_rsv->reserved;
- if (block_rsv->durable &&
- block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
- commit_trans = 1;
- }
spin_unlock(&block_rsv->lock);
+
if (!ret)
return 0;
- if (block_rsv->refill_used) {
- ret = reserve_metadata_bytes(trans, root, block_rsv,
- num_bytes, 0);
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
- return 0;
- }
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ return 0;
}
- if (commit_trans) {
- if (trans)
- return -EAGAIN;
+ return ret;
+}
- trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
- ret = btrfs_commit_transaction(trans, root);
- return 0;
- }
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved)
+{
+ return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
+}
- return -ENOSPC;
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved)
+{
+ return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
}
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3751,7 +3961,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
u64 num_bytes;
u64 meta_used;
u64 data_used;
- int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+ int csum_size = btrfs_super_csum_size(fs_info->super_copy);
sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
spin_lock(&sinfo->lock);
@@ -3795,12 +4005,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
if (sinfo->total_bytes > num_bytes) {
num_bytes = sinfo->total_bytes - num_bytes;
block_rsv->reserved += num_bytes;
- sinfo->bytes_reserved += num_bytes;
+ sinfo->bytes_may_use += num_bytes;
}
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
- sinfo->bytes_reserved -= num_bytes;
+ sinfo->bytes_may_use -= num_bytes;
sinfo->reservation_progress++;
block_rsv->reserved = block_rsv->size;
block_rsv->full = 1;
@@ -3816,16 +4026,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
fs_info->chunk_block_rsv.space_info = space_info;
- fs_info->chunk_block_rsv.priority = 10;
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
fs_info->global_block_rsv.space_info = space_info;
- fs_info->global_block_rsv.priority = 10;
- fs_info->global_block_rsv.refill_used = 1;
fs_info->delalloc_block_rsv.space_info = space_info;
fs_info->trans_block_rsv.space_info = space_info;
fs_info->empty_block_rsv.space_info = space_info;
- fs_info->empty_block_rsv.priority = 10;
+ fs_info->delayed_block_rsv.space_info = space_info;
fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3833,10 +4040,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
- btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
-
- btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
-
update_global_block_rsv(fs_info);
}
@@ -3849,57 +4052,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
-}
-
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *rsv)
-{
- struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
- u64 num_bytes;
- int ret;
-
- /*
- * Truncate should be freeing data, but give us 2 items just in case it
- * needs to use some space. We may want to be smarter about this in the
- * future.
- */
- num_bytes = btrfs_calc_trans_metadata_size(root, 2);
-
- /* We already have enough bytes, just return */
- if (rsv->reserved >= num_bytes)
- return 0;
-
- num_bytes -= rsv->reserved;
-
- /*
- * You should have reserved enough space before hand to do this, so this
- * should not fail.
- */
- ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
- BUG_ON(ret);
-
- return 0;
-}
-
-int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- int num_items)
-{
- u64 num_bytes;
- int ret;
-
- if (num_items == 0 || root->fs_info->chunk_root == root)
- return 0;
-
- num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
- ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
- num_bytes);
- if (!ret) {
- trans->bytes_reserved += num_bytes;
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- }
- return ret;
+ WARN_ON(fs_info->delayed_block_rsv.size > 0);
+ WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
}
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3908,9 +4062,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
if (!trans->bytes_reserved)
return;
- BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
- btrfs_block_rsv_release(root, trans->block_rsv,
- trans->bytes_reserved);
+ btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
trans->bytes_reserved = 0;
}
@@ -3952,90 +4104,229 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
-static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+/**
+ * drop_outstanding_extent - drop an outstanding extent
+ * @inode: the inode we're dropping the extent for
+ *
+ * This is called when we are freeing up an outstanding extent, either called
+ * after an error or after an extent is written. This will return the number of
+ * reserved extents that need to be freed. This must be called with
+ * BTRFS_I(inode)->lock held.
+ */
+static unsigned drop_outstanding_extent(struct inode *inode)
{
- return num_bytes >>= 3;
+ unsigned drop_inode_space = 0;
+ unsigned dropped_extents = 0;
+
+ BUG_ON(!BTRFS_I(inode)->outstanding_extents);
+ BTRFS_I(inode)->outstanding_extents--;
+
+ if (BTRFS_I(inode)->outstanding_extents == 0 &&
+ BTRFS_I(inode)->delalloc_meta_reserved) {
+ drop_inode_space = 1;
+ BTRFS_I(inode)->delalloc_meta_reserved = 0;
+ }
+
+ /*
+ * If we have more or the same amount of outsanding extents than we have
+ * reserved then we need to leave the reserved extents count alone.
+ */
+ if (BTRFS_I(inode)->outstanding_extents >=
+ BTRFS_I(inode)->reserved_extents)
+ return drop_inode_space;
+
+ dropped_extents = BTRFS_I(inode)->reserved_extents -
+ BTRFS_I(inode)->outstanding_extents;
+ BTRFS_I(inode)->reserved_extents -= dropped_extents;
+ return dropped_extents + drop_inode_space;
+}
+
+/**
+ * calc_csum_metadata_size - return the amount of metada space that must be
+ * reserved/free'd for the given bytes.
+ * @inode: the inode we're manipulating
+ * @num_bytes: the number of bytes in question
+ * @reserve: 1 if we are reserving space, 0 if we are freeing space
+ *
+ * This adjusts the number of csum_bytes in the inode and then returns the
+ * correct amount of metadata that must either be reserved or freed. We
+ * calculate how many checksums we can fit into one leaf and then divide the
+ * number of bytes that will need to be checksumed by this value to figure out
+ * how many checksums will be required. If we are adding bytes then the number
+ * may go up and we will return the number of additional bytes that must be
+ * reserved. If it is going down we will return the number of bytes that must
+ * be freed.
+ *
+ * This must be called with BTRFS_I(inode)->lock held.
+ */
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
+ int reserve)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 csum_size;
+ int num_csums_per_leaf;
+ int num_csums;
+ int old_csums;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
+ BTRFS_I(inode)->csum_bytes == 0)
+ return 0;
+
+ old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+ if (reserve)
+ BTRFS_I(inode)->csum_bytes += num_bytes;
+ else
+ BTRFS_I(inode)->csum_bytes -= num_bytes;
+ csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+ num_csums_per_leaf = (int)div64_u64(csum_size,
+ sizeof(struct btrfs_csum_item) +
+ sizeof(struct btrfs_disk_key));
+ num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+ num_csums = num_csums + num_csums_per_leaf - 1;
+ num_csums = num_csums / num_csums_per_leaf;
+
+ old_csums = old_csums + num_csums_per_leaf - 1;
+ old_csums = old_csums / num_csums_per_leaf;
+
+ /* No change, no need to reserve more */
+ if (old_csums == num_csums)
+ return 0;
+
+ if (reserve)
+ return btrfs_calc_trans_metadata_size(root,
+ num_csums - old_csums);
+
+ return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
}
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
- u64 to_reserve;
- int nr_extents;
- int reserved_extents;
+ u64 to_reserve = 0;
+ u64 csum_bytes;
+ unsigned nr_extents = 0;
+ int extra_reserve = 0;
+ int flush = 1;
int ret;
- if (btrfs_transaction_in_commit(root->fs_info))
+ /* Need to be holding the i_mutex here if we aren't free space cache */
+ if (btrfs_is_free_space_inode(root, inode))
+ flush = 0;
+ else
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
+
+ if (flush && btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
num_bytes = ALIGN(num_bytes, root->sectorsize);
- nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
- reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
- if (nr_extents > reserved_extents) {
- nr_extents -= reserved_extents;
- to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
- } else {
- nr_extents = 0;
- to_reserve = 0;
+ if (BTRFS_I(inode)->outstanding_extents >
+ BTRFS_I(inode)->reserved_extents)
+ nr_extents = BTRFS_I(inode)->outstanding_extents -
+ BTRFS_I(inode)->reserved_extents;
+
+ /*
+ * Add an item to reserve for updating the inode when we complete the
+ * delalloc io.
+ */
+ if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+ nr_extents++;
+ extra_reserve = 1;
}
- to_reserve += calc_csum_metadata_size(inode, num_bytes);
- ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
- if (ret)
+ to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+ to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
+ csum_bytes = BTRFS_I(inode)->csum_bytes;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+ if (ret) {
+ u64 to_free = 0;
+ unsigned dropped;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ dropped = drop_outstanding_extent(inode);
+ /*
+ * If the inodes csum_bytes is the same as the original
+ * csum_bytes then we know we haven't raced with any free()ers
+ * so we can just reduce our inodes csum bytes and carry on.
+ * Otherwise we have to do the normal free thing to account for
+ * the case that the free side didn't free up its reserve
+ * because of this outstanding reservation.
+ */
+ if (BTRFS_I(inode)->csum_bytes == csum_bytes)
+ calc_csum_metadata_size(inode, num_bytes, 0);
+ else
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (dropped)
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+ if (to_free)
+ btrfs_block_rsv_release(root, block_rsv, to_free);
return ret;
+ }
- atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
- atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (extra_reserve) {
+ BTRFS_I(inode)->delalloc_meta_reserved = 1;
+ nr_extents--;
+ }
+ BTRFS_I(inode)->reserved_extents += nr_extents;
+ spin_unlock(&BTRFS_I(inode)->lock);
block_rsv_add_bytes(block_rsv, to_reserve, 1);
- if (block_rsv->size > 512 * 1024 * 1024)
- shrink_delalloc(NULL, root, to_reserve, 0);
-
return 0;
}
+/**
+ * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * @inode: the inode to release the reservation for
+ * @num_bytes: the number of bytes we're releasing
+ *
+ * This will release the metadata reservation for an inode. This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations.
+ */
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 to_free;
- int nr_extents;
- int reserved_extents;
+ u64 to_free = 0;
+ unsigned dropped;
num_bytes = ALIGN(num_bytes, root->sectorsize);
- atomic_dec(&BTRFS_I(inode)->outstanding_extents);
- WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
-
- reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
- do {
- int old, new;
-
- nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
- if (nr_extents >= reserved_extents) {
- nr_extents = 0;
- break;
- }
- old = reserved_extents;
- nr_extents = reserved_extents - nr_extents;
- new = reserved_extents - nr_extents;
- old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
- reserved_extents, new);
- if (likely(old == reserved_extents))
- break;
- reserved_extents = old;
- } while (1);
+ spin_lock(&BTRFS_I(inode)->lock);
+ dropped = drop_outstanding_extent(inode);
- to_free = calc_csum_metadata_size(inode, num_bytes);
- if (nr_extents > 0)
- to_free += btrfs_calc_trans_metadata_size(root, nr_extents);
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (dropped > 0)
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
to_free);
}
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * @inode: inode we're writing to
+ * @num_bytes: the number of bytes we want to allocate
+ *
+ * This will do the following things
+ *
+ * o reserve space in the data space info for num_bytes
+ * o reserve space in the metadata space info based on number of outstanding
+ * extents and how much csums will be needed
+ * o add to the inodes ->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *
+ * This will return 0 for success and -ENOSPC if there is no space left.
+ */
int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
{
int ret;
@@ -4053,6 +4344,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
return 0;
}
+/**
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @num_bytes: the number of bytes we want to free up
+ *
+ * This must be matched with a call to btrfs_delalloc_reserve_space. This is
+ * called in the case that we don't need the metadata AND data reservations
+ * anymore. So if there is an error or we insert an inline extent.
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ */
void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
{
btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4072,12 +4376,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
/* block accounting for super block */
spin_lock(&info->delalloc_lock);
- old_val = btrfs_super_bytes_used(&info->super_copy);
+ old_val = btrfs_super_bytes_used(info->super_copy);
if (alloc)
old_val += num_bytes;
else
old_val -= num_bytes;
- btrfs_set_super_bytes_used(&info->super_copy, old_val);
+ btrfs_set_super_bytes_used(info->super_copy, old_val);
spin_unlock(&info->delalloc_lock);
while (total) {
@@ -4105,7 +4409,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
- if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+ if (btrfs_test_opt(root, SPACE_CACHE) &&
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
@@ -4117,7 +4421,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
btrfs_set_block_group_used(&cache->item, old_val);
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
- cache->space_info->reservation_progress++;
cache->space_info->bytes_used += num_bytes;
cache->space_info->disk_used += num_bytes * factor;
spin_unlock(&cache->lock);
@@ -4169,7 +4472,6 @@ static int pin_down_extent(struct btrfs_root *root,
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
- cache->space_info->reservation_progress++;
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -4197,45 +4499,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
}
/*
- * update size of reserved extents. this function may return -EAGAIN
- * if 'reserve' is true or 'sinfo' is false.
+ * this function must be called within transaction
*/
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int sinfo)
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes)
{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ BUG_ON(!cache);
+
+ /*
+ * pull in the free space cache (if any) so that our pin
+ * removes the free space from the cache. We have load_only set
+ * to one because the slow code to read in the free extents does check
+ * the pinned extents.
+ */
+ cache_block_group(cache, trans, root, 1);
+
+ pin_down_extent(root, cache, bytenr, num_bytes, 0);
+
+ /* remove us from the free space cache (if we're there at all) */
+ btrfs_remove_free_space(cache, bytenr, num_bytes);
+ btrfs_put_block_group(cache);
+ return 0;
+}
+
+/**
+ * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @num_bytes: The number of bytes in question
+ * @reserve: One of the reservation enums
+ *
+ * This is called by the allocator when it reserves space, or by somebody who is
+ * freeing space that was never actually used on disk. For example if you
+ * reserve some space for a new leaf in transaction A and before transaction A
+ * commits you free that leaf, you call this with reserve set to 0 in order to
+ * clear the reservation.
+ *
+ * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
+ * ENOSPC accounting. For data we handle the reservation through clearing the
+ * delalloc bits in the io_tree. We have to do this since we could end up
+ * allocating less disk space for the amount of data we have reserved in the
+ * case of compression.
+ *
+ * If this is a reservation and the block group has become read only we cannot
+ * make the reservation and return -EAGAIN, otherwise this function always
+ * succeeds.
+ */
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve)
+{
+ struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
- if (sinfo) {
- struct btrfs_space_info *space_info = cache->space_info;
- spin_lock(&space_info->lock);
- spin_lock(&cache->lock);
- if (reserve) {
- if (cache->ro) {
- ret = -EAGAIN;
- } else {
- cache->reserved += num_bytes;
- space_info->bytes_reserved += num_bytes;
- }
- } else {
- if (cache->ro)
- space_info->bytes_readonly += num_bytes;
- cache->reserved -= num_bytes;
- space_info->bytes_reserved -= num_bytes;
- space_info->reservation_progress++;
- }
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
- } else {
- spin_lock(&cache->lock);
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve != RESERVE_FREE) {
if (cache->ro) {
ret = -EAGAIN;
} else {
- if (reserve)
- cache->reserved += num_bytes;
- else
- cache->reserved -= num_bytes;
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ if (reserve == RESERVE_ALLOC) {
+ BUG_ON(space_info->bytes_may_use < num_bytes);
+ space_info->bytes_may_use -= num_bytes;
+ }
}
- spin_unlock(&cache->lock);
+ } else {
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->reservation_progress++;
}
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
return ret;
}
@@ -4272,7 +4611,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
return 0;
}
-static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
+ const bool return_free_space)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
@@ -4292,7 +4632,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
if (start < cache->last_byte_to_unpin) {
len = min(len, cache->last_byte_to_unpin - start);
- btrfs_add_free_space(cache, start, len);
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
}
start += len;
@@ -4301,13 +4642,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
spin_lock(&cache->lock);
cache->pinned -= len;
cache->space_info->bytes_pinned -= len;
- if (cache->ro) {
+ if (cache->ro)
cache->space_info->bytes_readonly += len;
- } else if (cache->reserved_pinned > 0) {
- len = min(len, cache->reserved_pinned);
- cache->reserved_pinned -= len;
- cache->space_info->bytes_reserved += len;
- }
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
}
@@ -4322,11 +4658,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *unpin;
- struct btrfs_block_rsv *block_rsv;
- struct btrfs_block_rsv *next_rsv;
u64 start;
u64 end;
- int idx;
int ret;
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4345,34 +4678,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
- unpin_extent_range(root, start, end);
+ unpin_extent_range(root, start, end, true);
cond_resched();
}
- mutex_lock(&fs_info->durable_block_rsv_mutex);
- list_for_each_entry_safe(block_rsv, next_rsv,
- &fs_info->durable_block_rsv_list, list) {
-
- idx = trans->transid & 0x1;
- if (block_rsv->freed[idx] > 0) {
- block_rsv_add_bytes(block_rsv,
- block_rsv->freed[idx], 0);
- block_rsv->freed[idx] = 0;
- }
- if (atomic_read(&block_rsv->usage) == 0) {
- btrfs_block_rsv_release(root, block_rsv, (u64)-1);
-
- if (block_rsv->freed[0] == 0 &&
- block_rsv->freed[1] == 0) {
- list_del_init(&block_rsv->list);
- kfree(block_rsv);
- }
- } else {
- btrfs_block_rsv_release(root, block_rsv, 0);
- }
- }
- mutex_unlock(&fs_info->durable_block_rsv_mutex);
-
return 0;
}
@@ -4452,7 +4761,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
printk(KERN_ERR "umm, got %d back from search"
", was looking for %llu\n", ret,
(unsigned long long)bytenr);
- btrfs_print_leaf(extent_root, path->nodes[0]);
+ if (ret > 0)
+ btrfs_print_leaf(extent_root,
+ path->nodes[0]);
}
BUG_ON(ret);
extent_slot = path->slots[0];
@@ -4648,7 +4959,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
- struct btrfs_block_rsv *block_rsv;
struct btrfs_block_group_cache *cache = NULL;
int ret;
@@ -4663,64 +4973,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (!last_ref)
return;
- block_rsv = get_block_rsv(trans, root);
cache = btrfs_lookup_block_group(root->fs_info, buf->start);
- if (block_rsv->space_info != cache->space_info)
- goto out;
if (btrfs_header_generation(buf) == trans->transid) {
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, root, buf->start);
if (!ret)
- goto pin;
+ goto out;
}
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
pin_down_extent(root, cache, buf->start, buf->len, 1);
- goto pin;
+ goto out;
}
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
- if (ret == -EAGAIN) {
- /* block group became read-only */
- btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
- goto out;
- }
-
- ret = 1;
- spin_lock(&block_rsv->lock);
- if (block_rsv->reserved < block_rsv->size) {
- block_rsv->reserved += buf->len;
- ret = 0;
- }
- spin_unlock(&block_rsv->lock);
-
- if (ret) {
- spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_reserved -= buf->len;
- cache->space_info->reservation_progress++;
- spin_unlock(&cache->space_info->lock);
- }
- goto out;
- }
-pin:
- if (block_rsv->durable && !cache->ro) {
- ret = 0;
- spin_lock(&cache->lock);
- if (!cache->ro) {
- cache->reserved_pinned += buf->len;
- ret = 1;
- }
- spin_unlock(&cache->lock);
-
- if (ret) {
- spin_lock(&block_rsv->lock);
- block_rsv->freed[trans->transid & 0x1] += buf->len;
- spin_unlock(&block_rsv->lock);
- }
+ btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
}
out:
/*
@@ -4856,17 +5126,20 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root = orig_root->fs_info->extent_root;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
+ struct btrfs_block_group_cache *used_block_group;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
int done_chunk_alloc = 0;
struct btrfs_space_info *space_info;
- int last_ptr_loop = 0;
int loop = 0;
int index = 0;
+ int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
+ RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool found_uncached_bg = false;
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
+ bool have_caching_bg = false;
u64 ideal_cache_percent = 0;
u64 ideal_cache_offset = 0;
@@ -4919,6 +5192,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
ideal_cache:
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
+ used_block_group = block_group;
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -4949,12 +5223,14 @@ ideal_cache:
}
}
search:
+ have_caching_bg = false;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
list) {
u64 offset;
int cached;
+ used_block_group = block_group;
btrfs_get_block_group(block_group);
search_start = block_group->key.objectid;
@@ -4978,13 +5254,15 @@ search:
}
have_block_group:
- if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+ cached = block_group_cache_done(block_group);
+ if (unlikely(!cached)) {
u64 free_percent;
+ found_uncached_bg = true;
ret = cache_block_group(block_group, trans,
orig_root, 1);
if (block_group->cached == BTRFS_CACHE_FINISHED)
- goto have_block_group;
+ goto alloc;
free_percent = btrfs_block_group_used(&block_group->item);
free_percent *= 100;
@@ -4998,19 +5276,14 @@ have_block_group:
}
/*
- * We only want to start kthread caching if we are at
- * the point where we will wait for caching to make
- * progress, or if our ideal search is over and we've
- * found somebody to start caching.
+ * The caching workers are limited to 2 threads, so we
+ * can queue as much work as we care to.
*/
- if (loop > LOOP_CACHING_NOWAIT ||
- (loop > LOOP_FIND_IDEAL &&
- atomic_read(&space_info->caching_threads) < 2)) {
+ if (loop > LOOP_FIND_IDEAL) {
ret = cache_block_group(block_group, trans,
orig_root, 0);
BUG_ON(ret);
}
- found_uncached_bg = true;
/*
* If loop is set for cached only, try the next block
@@ -5020,92 +5293,80 @@ have_block_group:
goto loop;
}
- cached = block_group_cache_done(block_group);
- if (unlikely(!cached))
- found_uncached_bg = true;
-
+alloc:
if (unlikely(block_group->ro))
goto loop;
spin_lock(&block_group->free_space_ctl->tree_lock);
if (cached &&
block_group->free_space_ctl->free_space <
- num_bytes + empty_size) {
+ num_bytes + empty_cluster + empty_size) {
spin_unlock(&block_group->free_space_ctl->tree_lock);
goto loop;
}
spin_unlock(&block_group->free_space_ctl->tree_lock);
/*
- * Ok we want to try and use the cluster allocator, so lets look
- * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
- * have tried the cluster allocator plenty of times at this
- * point and not have found anything, so we are likely way too
- * fragmented for the clustering stuff to find anything, so lets
- * just skip it and let the allocator find whatever block it can
- * find
+ * Ok we want to try and use the cluster allocator, so
+ * lets look there
*/
- if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+ if (last_ptr) {
/*
* the refill lock keeps out other
* people trying to start a new cluster
*/
spin_lock(&last_ptr->refill_lock);
- if (last_ptr->block_group &&
- (last_ptr->block_group->ro ||
- !block_group_bits(last_ptr->block_group, data))) {
- offset = 0;
+ used_block_group = last_ptr->block_group;
+ if (used_block_group != block_group &&
+ (!used_block_group ||
+ used_block_group->ro ||
+ !block_group_bits(used_block_group, data))) {
+ used_block_group = block_group;
goto refill_cluster;
}
- offset = btrfs_alloc_from_cluster(block_group, last_ptr,
- num_bytes, search_start);
+ if (used_block_group != block_group)
+ btrfs_get_block_group(used_block_group);
+
+ offset = btrfs_alloc_from_cluster(used_block_group,
+ last_ptr, num_bytes, used_block_group->key.objectid);
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
goto checks;
}
- spin_lock(&last_ptr->lock);
- /*
- * whoops, this cluster doesn't actually point to
- * this block group. Get a ref on the block
- * group is does point to and try again
- */
- if (!last_ptr_loop && last_ptr->block_group &&
- last_ptr->block_group != block_group) {
-
- btrfs_put_block_group(block_group);
- block_group = last_ptr->block_group;
- btrfs_get_block_group(block_group);
- spin_unlock(&last_ptr->lock);
- spin_unlock(&last_ptr->refill_lock);
-
- last_ptr_loop = 1;
- search_start = block_group->key.objectid;
- /*
- * we know this block group is properly
- * in the list because
- * btrfs_remove_block_group, drops the
- * cluster before it removes the block
- * group from the list
- */
- goto have_block_group;
+ WARN_ON(last_ptr->block_group != used_block_group);
+ if (used_block_group != block_group) {
+ btrfs_put_block_group(used_block_group);
+ used_block_group = block_group;
}
- spin_unlock(&last_ptr->lock);
refill_cluster:
+ BUG_ON(used_block_group != block_group);
+ /* If we are on LOOP_NO_EMPTY_SIZE, we can't
+ * set up a new clusters, so lets just skip it
+ * and let the allocator find whatever block
+ * it can find. If we reach this point, we
+ * will have tried the cluster allocator
+ * plenty of times and not have found
+ * anything, so we are likely way too
+ * fragmented for the clustering stuff to find
+ * anything. */
+ if (loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ goto unclustered_alloc;
+ }
+
/*
* this cluster didn't work out, free it and
* start over
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
- last_ptr_loop = 0;
-
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
- offset, num_bytes,
+ search_start, num_bytes,
empty_cluster + empty_size);
if (ret == 0) {
/*
@@ -5141,6 +5402,7 @@ refill_cluster:
goto loop;
}
+unclustered_alloc:
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
/*
@@ -5159,20 +5421,22 @@ refill_cluster:
failed_alloc = true;
goto have_block_group;
} else if (!offset) {
+ if (!cached)
+ have_caching_bg = true;
goto loop;
}
checks:
search_start = stripe_align(root, offset);
/* move on to the next group */
if (search_start + num_bytes >= search_end) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ btrfs_add_free_space(used_block_group, offset, num_bytes);
goto loop;
}
/* move on to the next group */
if (search_start + num_bytes >
- block_group->key.objectid + block_group->key.offset) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ used_block_group->key.objectid + used_block_group->key.offset) {
+ btrfs_add_free_space(used_block_group, offset, num_bytes);
goto loop;
}
@@ -5180,14 +5444,14 @@ checks:
ins->offset = num_bytes;
if (offset < search_start)
- btrfs_add_free_space(block_group, offset,
+ btrfs_add_free_space(used_block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
- ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
- (data & BTRFS_BLOCK_GROUP_DATA));
+ ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
+ alloc_type);
if (ret == -EAGAIN) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ btrfs_add_free_space(used_block_group, offset, num_bytes);
goto loop;
}
@@ -5196,19 +5460,26 @@ checks:
ins->offset = num_bytes;
if (offset < search_start)
- btrfs_add_free_space(block_group, offset,
+ btrfs_add_free_space(used_block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
+ if (used_block_group != block_group)
+ btrfs_put_block_group(used_block_group);
btrfs_put_block_group(block_group);
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
+ if (used_block_group != block_group)
+ btrfs_put_block_group(used_block_group);
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
+ if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
+ goto search;
+
if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
goto search;
@@ -5227,8 +5498,7 @@ loop:
if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
found_uncached_bg = false;
loop++;
- if (!ideal_cache_percent &&
- atomic_read(&space_info->caching_threads))
+ if (!ideal_cache_percent)
goto search;
/*
@@ -5308,7 +5578,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int index = 0;
spin_lock(&info->lock);
- printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+ printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
+ (unsigned long long)info->flags,
(unsigned long long)(info->total_bytes - info->bytes_used -
info->bytes_pinned - info->bytes_reserved -
info->bytes_readonly),
@@ -5394,7 +5665,8 @@ again:
return ret;
}
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+static int __btrfs_free_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len, int pin)
{
struct btrfs_block_group_cache *cache;
int ret = 0;
@@ -5406,11 +5678,14 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
return -ENOSPC;
}
- if (btrfs_test_opt(root, DISCARD))
- ret = btrfs_discard_extent(root, start, len, NULL);
-
- btrfs_add_free_space(cache, start, len);
- btrfs_update_reserved_bytes(cache, len, 0, 1);
+ if (pin)
+ pin_down_extent(root, cache, start, len, 1);
+ else {
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_discard_extent(root, start, len, NULL);
+ btrfs_add_free_space(cache, start, len);
+ btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+ }
btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(root, start, len);
@@ -5418,6 +5693,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
return ret;
}
+int btrfs_free_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ return __btrfs_free_reserved_extent(root, start, len, 0);
+}
+
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ return __btrfs_free_reserved_extent(root, start, len, 1);
+}
+
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -5502,7 +5789,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5612,7 +5900,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
put_caching_control(caching_ctl);
}
- ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
+ ret = btrfs_update_reserved_bytes(block_group, ins->offset,
+ RESERVE_ALLOC_NO_ACCOUNT);
BUG_ON(ret);
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5631,7 +5920,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
if (!buf)
return ERR_PTR(-ENOMEM);
btrfs_set_header_generation(buf, trans->transid);
- btrfs_set_buffer_lockdep_class(buf, level);
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
clean_tree_block(trans, root, buf);
@@ -5669,8 +5958,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
if (block_rsv->size == 0) {
- ret = reserve_metadata_bytes(trans, root, block_rsv,
- blocksize, 0);
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
/*
* If we couldn't reserve metadata bytes try and use some from
* the global reserve.
@@ -5690,13 +5978,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
if (!ret)
return block_rsv;
if (ret) {
- WARN_ON(1);
- ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
- 0);
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ /*DEFAULT_RATELIMIT_BURST*/ 2);
+ if (__ratelimit(&_rs)) {
+ printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
+ WARN_ON(1);
+ }
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
if (!ret) {
- spin_lock(&block_rsv->lock);
- block_rsv->size += blocksize;
- spin_unlock(&block_rsv->lock);
return block_rsv;
} else if (ret && block_rsv != global_rsv) {
ret = block_rsv_use_bytes(global_rsv, blocksize);
@@ -5918,7 +6208,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
return 1;
if (path->locks[level] && !wc->keep_locks) {
- btrfs_tree_unlock(eb);
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
}
return 0;
@@ -5942,7 +6232,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
* keep the tree lock
*/
if (path->locks[level] && level > 0) {
- btrfs_tree_unlock(eb);
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
}
return 0;
@@ -6055,7 +6345,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
BUG_ON(level != btrfs_header_level(next));
path->nodes[level] = next;
path->slots[level] = 0;
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
wc->level = level;
if (wc->level == 1)
wc->reada_slot = 0;
@@ -6126,7 +6416,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
BUG_ON(level == 0);
btrfs_tree_lock(eb);
btrfs_set_lock_blocking(eb);
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, root,
eb->start, eb->len,
@@ -6135,8 +6425,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret);
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
- btrfs_tree_unlock(eb);
- path->locks[level] = 0;
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
return 1;
}
}
@@ -6158,7 +6447,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
btrfs_set_lock_blocking(eb);
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
clean_tree_block(trans, root, eb);
}
@@ -6237,7 +6526,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
return 0;
if (path->locks[level]) {
- btrfs_tree_unlock(path->nodes[level]);
+ btrfs_tree_unlock_rw(path->nodes[level],
+ path->locks[level]);
path->locks[level] = 0;
}
free_extent_buffer(path->nodes[level]);
@@ -6259,8 +6549,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*/
-int btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref)
+void btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref)
{
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -6271,12 +6561,20 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int err = 0;
int ret;
int level;
+ bool root_dropped = false;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
wc = kzalloc(sizeof(*wc), GFP_NOFS);
- BUG_ON(!wc);
+ if (!wc) {
+ btrfs_free_path(path);
+ err = -ENOMEM;
+ goto out;
+ }
trans = btrfs_start_transaction(tree_root, 0);
BUG_ON(IS_ERR(trans));
@@ -6289,7 +6587,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
path->nodes[level] = btrfs_lock_root_node(root);
btrfs_set_lock_blocking(path->nodes[level]);
path->slots[level] = 0;
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
memset(&wc->update_progress, 0,
sizeof(wc->update_progress));
} else {
@@ -6304,7 +6602,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
path->lowest_level = 0;
if (ret < 0) {
err = ret;
- goto out;
+ goto out_free;
}
WARN_ON(ret > 0);
@@ -6318,6 +6616,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
while (1) {
btrfs_tree_lock(path->nodes[level]);
btrfs_set_lock_blocking(path->nodes[level]);
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, root,
path->nodes[level]->start,
@@ -6331,6 +6630,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
break;
btrfs_tree_unlock(path->nodes[level]);
+ path->locks[level] = 0;
WARN_ON(wc->refs[level] != 1);
level--;
}
@@ -6411,11 +6711,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
free_extent_buffer(root->commit_root);
kfree(root);
}
-out:
+ root_dropped = true;
+out_free:
btrfs_end_transaction_throttle(trans, tree_root);
kfree(wc);
btrfs_free_path(path);
- return err;
+out:
+ /*
+ * So if we need to stop dropping the snapshot for whatever reason we
+ * need to make sure to add it back to the dead root list so that we
+ * keep trying to do the work later. This also cleans up roots if we
+ * don't have it in the radix (like when we recover after a power fail
+ * or unmount) so we don't leak memory.
+ */
+ if (root_dropped == false)
+ btrfs_add_dead_root(root);
+ if (err && err != -EAGAIN)
+ btrfs_std_error(root->fs_info, err);
+ return;
}
/*
@@ -6457,7 +6770,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
wc->refs[parent_level] = 1;
wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -6532,30 +6845,45 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
return flags;
}
-static int set_block_group_ro(struct btrfs_block_group_cache *cache)
+static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
+ u64 min_allocable_bytes;
int ret = -ENOSPC;
- if (cache->ro)
- return 0;
+
+ /*
+ * We need some metadata space and system metadata space for
+ * allocating chunks in some corner cases until we force to set
+ * it to be readonly.
+ */
+ if ((sinfo->flags &
+ (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
+ !force)
+ min_allocable_bytes = 1 * 1024 * 1024;
+ else
+ min_allocable_bytes = 0;
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
+
+ if (cache->ro) {
+ ret = 0;
+ goto out;
+ }
+
num_bytes = cache->key.offset - cache->reserved - cache->pinned -
cache->bytes_super - btrfs_block_group_used(&cache->item);
if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
- sinfo->bytes_may_use + sinfo->bytes_readonly +
- cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
+ sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
+ min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
- sinfo->bytes_reserved += cache->reserved_pinned;
- cache->reserved_pinned = 0;
cache->ro = 1;
ret = 0;
}
-
+out:
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
return ret;
@@ -6579,7 +6907,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
CHUNK_ALLOC_FORCE);
- ret = set_block_group_ro(cache);
+ ret = set_block_group_ro(cache, 0);
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -6587,7 +6915,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
- ret = set_block_group_ro(cache);
+ ret = set_block_group_ro(cache, 0);
out:
btrfs_end_transaction(trans, root);
return ret;
@@ -6688,6 +7016,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
struct btrfs_space_info *space_info;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
+ u64 min_free;
+ u64 dev_min = 1;
+ u64 dev_nr = 0;
+ int index;
int full = 0;
int ret = 0;
@@ -6697,8 +7029,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
if (!block_group)
return -1;
+ min_free = btrfs_block_group_used(&block_group->item);
+
/* no bytes used, we're good */
- if (!btrfs_block_group_used(&block_group->item))
+ if (!min_free)
goto out;
space_info = block_group->space_info;
@@ -6714,10 +7048,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* all of the extents from this block group. If we can, we're good
*/
if ((space_info->total_bytes != block_group->key.offset) &&
- (space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- btrfs_block_group_used(&block_group->item) <
- space_info->total_bytes)) {
+ (space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ min_free < space_info->total_bytes)) {
spin_unlock(&space_info->lock);
goto out;
}
@@ -6734,9 +7067,31 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
if (full)
goto out;
+ /*
+ * index:
+ * 0: raid10
+ * 1: raid1
+ * 2: dup
+ * 3: raid0
+ * 4: single
+ */
+ index = get_block_group_index(block_group);
+ if (index == 0) {
+ dev_min = 4;
+ /* Divide by 2 */
+ min_free >>= 1;
+ } else if (index == 1) {
+ dev_min = 2;
+ } else if (index == 2) {
+ /* Multiply by 2 */
+ min_free <<= 1;
+ } else if (index == 3) {
+ dev_min = fs_devices->rw_devices;
+ do_div(min_free, dev_min);
+ }
+
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
- u64 min_free = btrfs_block_group_used(&block_group->item);
u64 dev_offset;
/*
@@ -6747,7 +7102,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
ret = find_free_dev_extent(NULL, device, min_free,
&dev_offset, NULL);
if (!ret)
+ dev_nr++;
+
+ if (dev_nr >= dev_min)
break;
+
ret = -1;
}
}
@@ -6887,7 +7246,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_space_info,
list);
if (space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0) {
+ space_info->bytes_reserved > 0 ||
+ space_info->bytes_may_use > 0) {
WARN_ON(1);
dump_space_info(space_info, 0, 0);
}
@@ -6929,14 +7289,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
return -ENOMEM;
path->reada = 1;
- cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
- if (cache_gen != 0 &&
- btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+ cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+ if (btrfs_test_opt(root, SPACE_CACHE) &&
+ btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
need_clear = 1;
if (btrfs_test_opt(root, CLEAR_CACHE))
need_clear = 1;
- if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
- printk(KERN_INFO "btrfs: disk space caching is enabled\n");
while (1) {
ret = find_first_block_group(root, path, &key);
@@ -7024,7 +7382,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid))
- set_block_group_ro(cache);
+ set_block_group_ro(cache, 1);
}
list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -7038,9 +7396,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* mirrored block groups.
*/
list_for_each_entry(cache, &space_info->block_groups[3], list)
- set_block_group_ro(cache);
+ set_block_group_ro(cache, 1);
list_for_each_entry(cache, &space_info->block_groups[4], list)
- set_block_group_ro(cache);
+ set_block_group_ro(cache, 1);
}
init_global_block_rsv(info);
@@ -7170,11 +7528,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cluster->refill_lock);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
- inode = lookup_free_space_inode(root, block_group, path);
+ inode = lookup_free_space_inode(tree_root, block_group, path);
if (!IS_ERR(inode)) {
- btrfs_orphan_add(trans, inode);
+ ret = btrfs_orphan_add(trans, inode);
+ BUG_ON(ret);
clear_nlink(inode);
/* One for the block groups ref */
spin_lock(&block_group->lock);
@@ -7187,7 +7549,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->lock);
}
/* One for our lookup ref */
- iput(inode);
+ btrfs_add_delayed_iput(inode);
}
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7258,7 +7620,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
int mixed = 0;
int ret;
- disk_super = &fs_info->super_copy;
+ disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
return 1;
@@ -7289,7 +7651,7 @@ out:
int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
- return unpin_extent_range(root, start, end);
+ return unpin_extent_range(root, start, end, false);
}
int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7055d11..9a837a8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"
+#include "volumes.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -254,14 +255,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
*
* This should be called with the tree lock held.
*/
-static int merge_state(struct extent_io_tree *tree,
- struct extent_state *state)
+static void merge_state(struct extent_io_tree *tree,
+ struct extent_state *state)
{
struct extent_state *other;
struct rb_node *other_node;
if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
- return 0;
+ return;
other_node = rb_prev(&state->rb_node);
if (other_node) {
@@ -281,26 +282,19 @@ static int merge_state(struct extent_io_tree *tree,
if (other->start == state->end + 1 &&
other->state == state->state) {
merge_cb(tree, state, other);
- other->start = state->start;
- state->tree = NULL;
- rb_erase(&state->rb_node, &tree->state);
- free_extent_state(state);
- state = NULL;
+ state->end = other->end;
+ other->tree = NULL;
+ rb_erase(&other->rb_node, &tree->state);
+ free_extent_state(other);
}
}
-
- return 0;
}
-static int set_state_cb(struct extent_io_tree *tree,
+static void set_state_cb(struct extent_io_tree *tree,
struct extent_state *state, int *bits)
{
- if (tree->ops && tree->ops->set_bit_hook) {
- return tree->ops->set_bit_hook(tree->mapping->host,
- state, bits);
- }
-
- return 0;
+ if (tree->ops && tree->ops->set_bit_hook)
+ tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}
static void clear_state_cb(struct extent_io_tree *tree,
@@ -310,6 +304,9 @@ static void clear_state_cb(struct extent_io_tree *tree,
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state, int *bits);
+
/*
* insert an extent_state struct into the tree. 'bits' are set on the
* struct before it is inserted.
@@ -325,8 +322,6 @@ static int insert_state(struct extent_io_tree *tree,
int *bits)
{
struct rb_node *node;
- int bits_to_set = *bits & ~EXTENT_CTLBITS;
- int ret;
if (end < start) {
printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -336,13 +331,9 @@ static int insert_state(struct extent_io_tree *tree,
}
state->start = start;
state->end = end;
- ret = set_state_cb(tree, state, bits);
- if (ret)
- return ret;
- if (bits_to_set & EXTENT_DIRTY)
- tree->dirty_bytes += end - start + 1;
- state->state |= bits_to_set;
+ set_state_bits(tree, state, bits);
+
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
struct extent_state *found;
@@ -351,7 +342,6 @@ static int insert_state(struct extent_io_tree *tree,
"%llu %llu\n", (unsigned long long)found->start,
(unsigned long long)found->end,
(unsigned long long)start, (unsigned long long)end);
- free_extent_state(state);
return -EEXIST;
}
state->tree = tree;
@@ -359,13 +349,11 @@ static int insert_state(struct extent_io_tree *tree,
return 0;
}
-static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
u64 split)
{
if (tree->ops && tree->ops->split_extent_hook)
- return tree->ops->split_extent_hook(tree->mapping->host,
- orig, split);
- return 0;
+ tree->ops->split_extent_hook(tree->mapping->host, orig, split);
}
/*
@@ -500,7 +488,8 @@ again:
cached_state = NULL;
}
- if (cached && cached->tree && cached->start == start) {
+ if (cached && cached->tree && cached->start <= start &&
+ cached->end > start) {
if (clear)
atomic_dec(&cached->refs);
state = cached;
@@ -660,34 +649,25 @@ again:
if (start > end)
break;
- if (need_resched()) {
- spin_unlock(&tree->lock);
- cond_resched();
- spin_lock(&tree->lock);
- }
+ cond_resched_lock(&tree->lock);
}
out:
spin_unlock(&tree->lock);
return 0;
}
-static int set_state_bits(struct extent_io_tree *tree,
+static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
int *bits)
{
- int ret;
int bits_to_set = *bits & ~EXTENT_CTLBITS;
- ret = set_state_cb(tree, state, bits);
- if (ret)
- return ret;
+ set_state_cb(tree, state, bits);
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
state->state |= bits_to_set;
-
- return 0;
}
static void cache_state(struct extent_state *state,
@@ -742,7 +722,8 @@ again:
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
- if (state->start == start && state->tree) {
+ if (state->start <= start && state->end > start &&
+ state->tree) {
node = &state->rb_node;
goto hit_next;
}
@@ -779,17 +760,15 @@ hit_next:
goto out;
}
- err = set_state_bits(tree, state, &bits);
- if (err)
- goto out;
+ set_state_bits(tree, state, &bits);
- next_node = rb_next(node);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ next_node = rb_next(&state->rb_node);
if (next_node && start < end && prealloc && !need_resched()) {
state = rb_entry(next_node, struct extent_state,
rb_node);
@@ -830,9 +809,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- err = set_state_bits(tree, state, &bits);
- if (err)
- goto out;
+ set_state_bits(tree, state, &bits);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -862,7 +839,6 @@ hit_next:
* Avoid to free 'prealloc' if it can be merged with
* the later extent.
*/
- atomic_inc(&prealloc->refs);
err = insert_state(tree, prealloc, start, this_end,
&bits);
BUG_ON(err == -EEXIST);
@@ -872,7 +848,6 @@ hit_next:
goto out;
}
cache_state(prealloc, cached_state);
- free_extent_state(prealloc);
prealloc = NULL;
start = this_end + 1;
goto search_again;
@@ -895,12 +870,204 @@ hit_next:
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
- err = set_state_bits(tree, prealloc, &bits);
+ set_state_bits(tree, prealloc, &bits);
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
+/**
+ * convert_extent - convert all bits in a given range from one bit to another
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @mask: the allocation mask
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, ie
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int clear_bits, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+ if (!prealloc)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = insert_state(tree, prealloc, start, end, &bits);
+ prealloc = NULL;
+ BUG_ON(err == -EEXIST);
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ struct rb_node *next_node;
+
+ set_state_bits(tree, state, &bits);
+ clear_state_bit(tree, state, &clear_bits, 0);
+
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+
+ start = last_end + 1;
+ next_node = rb_next(&state->rb_node);
+ if (next_node && start < end && prealloc && !need_resched()) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on
+ * second half.
+ *
+ * If the extent we found extends past our
+ * range, we just split and search again. It'll get split
+ * again the next time though.
+ *
+ * If the extent we found is inside our range, we set the
+ * desired bit on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ BUG_ON(err == -EEXIST);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, &bits);
+ clear_state_bit(tree, state, &clear_bits, 0);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and
+ * ignore the extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with
+ * the later extent.
+ */
+ err = insert_state(tree, prealloc, start, this_end,
+ &bits);
+ BUG_ON(err == -EEXIST);
if (err) {
+ free_extent_state(prealloc);
prealloc = NULL;
goto out;
}
- cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and set the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = split_state(tree, state, prealloc, end + 1);
+ BUG_ON(err == -EEXIST);
+
+ set_state_bits(tree, prealloc, &bits);
+ clear_state_bit(tree, prealloc, &clear_bits, 0);
+
merge_state(tree, prealloc);
prealloc = NULL;
goto out;
@@ -949,7 +1116,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+ EXTENT_DELALLOC | EXTENT_UPTODATE,
0, NULL, cached_state, mask);
}
@@ -1042,19 +1209,33 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
mask);
}
-/*
- * helper function to set both pages and extents in the tree writeback
- */
-static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
struct page *page;
while (index <= end_index) {
- page = find_get_page(tree->mapping, index);
- BUG_ON(!page);
- set_page_writeback(page);
+ page = find_get_page(inode->i_mapping, index);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
+ clear_page_dirty_for_io(page);
+ page_cache_release(page);
+ index++;
+ }
+ return 0;
+}
+
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(inode->i_mapping, index);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
+ account_page_redirty(page);
+ __set_page_dirty_nobuffers(page);
page_cache_release(page);
index++;
}
@@ -1062,43 +1243,22 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
}
/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
- *
- * If nothing was found, 1 is returned, < 0 on error
+ * helper function to set both pages and extents in the tree writeback
*/
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits)
+static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
- struct rb_node *node;
- struct extent_state *state;
- int ret = 1;
-
- spin_lock(&tree->lock);
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && (state->state & bits)) {
- *start_ret = state->start;
- *end_ret = state->end;
- ret = 0;
- break;
- }
- node = rb_next(node);
- if (!node)
- break;
+ while (index <= end_index) {
+ page = find_get_page(tree->mapping, index);
+ BUG_ON(!page);
+ set_page_writeback(page);
+ page_cache_release(page);
+ index++;
}
-out:
- spin_unlock(&tree->lock);
- return ret;
+ return 0;
}
/* find the first state struct with 'bits' set after 'start', and
@@ -1133,6 +1293,30 @@ out:
}
/*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, int bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
* find a contiguous range of bytes in the file marked as delalloc, not
* more than 'max_bytes'. start and end are used to return the range,
*
@@ -1339,6 +1523,7 @@ again:
* shortening the size of the delalloc range we're searching
*/
free_extent_state(cached_state);
+ cached_state = NULL;
if (!loops) {
unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
max_bytes = PAGE_CACHE_SIZE - offset;
@@ -1564,7 +1749,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bitset = 0;
spin_lock(&tree->lock);
- if (cached && cached->tree && cached->start == start)
+ if (cached && cached->tree && cached->start <= start &&
+ cached->end > start)
node = &cached->rb_node;
else
node = tree_search(tree, start);
@@ -1644,6 +1830,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
return 0;
}
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+ struct page *page;
+ u64 start;
+ u64 len;
+ u64 logical;
+ unsigned long bio_flags;
+ int this_mirror;
+ int failed_mirror;
+ int in_validation;
+};
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+ int did_repair)
+{
+ int ret;
+ int err = 0;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+ set_state_private(failure_tree, rec->start, 0);
+ ret = clear_extent_bits(failure_tree, rec->start,
+ rec->start + rec->len - 1,
+ EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ if (ret)
+ err = ret;
+
+ if (did_repair) {
+ ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+ rec->start + rec->len - 1,
+ EXTENT_DAMAGED, GFP_NOFS);
+ if (ret && !err)
+ err = ret;
+ }
+
+ kfree(rec);
+ return err;
+}
+
+static void repair_io_failure_callback(struct bio *bio, int err)
+{
+ complete(bio->bi_private);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchonization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+ u64 length, u64 logical, struct page *page,
+ int mirror_num)
+{
+ struct bio *bio;
+ struct btrfs_device *dev;
+ DECLARE_COMPLETION_ONSTACK(compl);
+ u64 map_length = 0;
+ u64 sector;
+ struct btrfs_bio *bbio = NULL;
+ int ret;
+
+ BUG_ON(!mirror_num);
+
+ bio = bio_alloc(GFP_NOFS, 1);
+ if (!bio)
+ return -EIO;
+ bio->bi_private = &compl;
+ bio->bi_end_io = repair_io_failure_callback;
+ bio->bi_size = 0;
+ map_length = length;
+
+ ret = btrfs_map_block(map_tree, WRITE, logical,
+ &map_length, &bbio, mirror_num);
+ if (ret) {
+ bio_put(bio);
+ return -EIO;
+ }
+ BUG_ON(mirror_num != bbio->mirror_num);
+ sector = bbio->stripes[mirror_num-1].physical >> 9;
+ bio->bi_sector = sector;
+ dev = bbio->stripes[mirror_num-1].dev;
+ kfree(bbio);
+ if (!dev || !dev->bdev || !dev->writeable) {
+ bio_put(bio);
+ return -EIO;
+ }
+ bio->bi_bdev = dev->bdev;
+ bio_add_page(bio, page, length, start-page_offset(page));
+ submit_bio(WRITE_SYNC, bio);
+ wait_for_completion(&compl);
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ /* try to remap that extent elsewhere? */
+ bio_put(bio);
+ return -EIO;
+ }
+
+ printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+ "sector %llu)\n", page->mapping->host->i_ino, start,
+ dev->name, sector);
+
+ bio_put(bio);
+ return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+ u64 private;
+ u64 private_failure;
+ struct io_failure_record *failrec;
+ struct btrfs_mapping_tree *map_tree;
+ struct extent_state *state;
+ int num_copies;
+ int did_repair = 0;
+ int ret;
+ struct inode *inode = page->mapping->host;
+
+ private = 0;
+ ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+ (u64)-1, 1, EXTENT_DIRTY, 0);
+ if (!ret)
+ return 0;
+
+ ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+ &private_failure);
+ if (ret)
+ return 0;
+
+ failrec = (struct io_failure_record *)(unsigned long) private_failure;
+ BUG_ON(!failrec->this_mirror);
+
+ if (failrec->in_validation) {
+ /* there was no real error, just free the record */
+ pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+ failrec->start);
+ did_repair = 1;
+ goto out;
+ }
+
+ spin_lock(&BTRFS_I(inode)->io_tree.lock);
+ state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+ failrec->start,
+ EXTENT_LOCKED);
+ spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+ if (state && state->start == failrec->start) {
+ map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+ num_copies = btrfs_num_copies(map_tree, failrec->logical,
+ failrec->len);
+ if (num_copies > 1) {
+ ret = repair_io_failure(map_tree, start, failrec->len,
+ failrec->logical, page,
+ failrec->failed_mirror);
+ did_repair = !ret;
+ }
+ }
+
+out:
+ if (!ret)
+ ret = free_io_failure(inode, failrec, did_repair);
+
+ return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+ u64 start, u64 end, int failed_mirror,
+ struct extent_state *state)
+{
+ struct io_failure_record *failrec = NULL;
+ u64 private;
+ struct extent_map *em;
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct bio *bio;
+ int num_copies;
+ int ret;
+ int read_mode;
+ u64 logical;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = get_state_private(failure_tree, start, &private);
+ if (ret) {
+ failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return -ENOMEM;
+ failrec->start = start;
+ failrec->len = end - start + 1;
+ failrec->this_mirror = 0;
+ failrec->bio_flags = 0;
+ failrec->in_validation = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ kfree(failrec);
+ return -EIO;
+ }
+
+ if (em->start > start || em->start + em->len < start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ read_unlock(&em_tree->lock);
+
+ if (!em || IS_ERR(em)) {
+ kfree(failrec);
+ return -EIO;
+ }
+ logical = start - em->start;
+ logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags,
+ em->compress_type);
+ }
+ pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+ "len=%llu\n", logical, start, failrec->len);
+ failrec->logical = logical;
+ free_extent_map(em);
+
+ /* set the bits in the private failure tree */
+ ret = set_extent_bits(failure_tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ if (ret >= 0)
+ ret = set_state_private(failure_tree, start,
+ (u64)(unsigned long)failrec);
+ /* set the bits in the inode's tree */
+ if (ret >= 0)
+ ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+ GFP_NOFS);
+ if (ret < 0) {
+ kfree(failrec);
+ return ret;
+ }
+ } else {
+ failrec = (struct io_failure_record *)(unsigned long)private;
+ pr_debug("bio_readpage_error: (found) logical=%llu, "
+ "start=%llu, len=%llu, validation=%d\n",
+ failrec->logical, failrec->start, failrec->len,
+ failrec->in_validation);
+ /*
+ * when data can be on disk more than twice, add to failrec here
+ * (e.g. with a list for failed_mirror) to make
+ * clean_io_failure() clean all those errors at once.
+ */
+ }
+ num_copies = btrfs_num_copies(
+ &BTRFS_I(inode)->root->fs_info->mapping_tree,
+ failrec->logical, failrec->len);
+ if (num_copies == 1) {
+ /*
+ * we only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. no
+ * matter what the error is, it is very likely to persist.
+ */
+ pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+ "state=%p, num_copies=%d, next_mirror %d, "
+ "failed_mirror %d\n", state, num_copies,
+ failrec->this_mirror, failed_mirror);
+ free_io_failure(inode, failrec, 0);
+ return -EIO;
+ }
+
+ if (!state) {
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, failrec->start,
+ EXTENT_LOCKED);
+ if (state && state->start != failrec->start)
+ state = NULL;
+ spin_unlock(&tree->lock);
+ }
+
+ /*
+ * there are two premises:
+ * a) deliver good data to the caller
+ * b) correct the bad sectors on disk
+ */
+ if (failed_bio->bi_vcnt > 1) {
+ /*
+ * to fulfill b), we need to know the exact failing sectors, as
+ * we don't want to rewrite any more than the failed ones. thus,
+ * we need separate read requests for the failed bio
+ *
+ * if the following BUG_ON triggers, our validation request got
+ * merged. we need separate requests for our algorithm to work.
+ */
+ BUG_ON(failrec->in_validation);
+ failrec->in_validation = 1;
+ failrec->this_mirror = failed_mirror;
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ } else {
+ /*
+ * we're ready to fulfill a) and b) alongside. get a good copy
+ * of the failed sector and if we succeed, we have setup
+ * everything for repair_io_failure to do the rest for us.
+ */
+ if (failrec->in_validation) {
+ BUG_ON(failrec->this_mirror != failed_mirror);
+ failrec->in_validation = 0;
+ failrec->this_mirror = 0;
+ }
+ failrec->failed_mirror = failed_mirror;
+ failrec->this_mirror++;
+ if (failrec->this_mirror == failed_mirror)
+ failrec->this_mirror++;
+ read_mode = READ_SYNC;
+ }
+
+ if (!state || failrec->this_mirror > num_copies) {
+ pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+ "next_mirror %d, failed_mirror %d\n", state,
+ num_copies, failrec->this_mirror, failed_mirror);
+ free_io_failure(inode, failrec, 0);
+ return -EIO;
+ }
+
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio->bi_private = state;
+ bio->bi_end_io = failed_bio->bi_end_io;
+ bio->bi_sector = failrec->logical >> 9;
+ bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ bio->bi_size = 0;
+
+ bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+ pr_debug("bio_readpage_error: submitting new read[%#x] to "
+ "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+ failrec->this_mirror, num_copies, failrec->in_validation);
+
+ tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+ failrec->bio_flags, 0);
+ return 0;
+}
+
/* lots and lots of room for performance fixes in the end_bio funcs */
/*
@@ -1742,6 +2290,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct extent_state *cached = NULL;
struct extent_state *state;
+ pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+ "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+ (long int)bio->bi_bdev);
tree = &BTRFS_I(page->mapping->host)->io_tree;
start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1772,12 +2323,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
state);
if (ret)
uptodate = 0;
+ else
+ clean_io_failure(start, page);
}
- if (!uptodate && tree->ops &&
- tree->ops->readpage_io_failed_hook) {
- ret = tree->ops->readpage_io_failed_hook(bio, page,
- start, end, NULL);
+ if (!uptodate) {
+ int failed_mirror;
+ failed_mirror = (int)(unsigned long)bio->bi_bdev;
+ /*
+ * The generic bio_readpage_error handles errors the
+ * following way: If possible, new read requests are
+ * created and submitted and will end up in
+ * end_bio_extent_readpage as well (if we're lucky, not
+ * in the !uptodate case). In that case it returns 0 and
+ * we just go on with the next page in our bio. If it
+ * can't handle the error it will return -EIO and we
+ * remain responsible for that page.
+ */
+ ret = bio_readpage_error(bio, page, start, end,
+ failed_mirror, NULL);
if (ret == 0) {
+error_handled:
uptodate =
test_bit(BIO_UPTODATE, &bio->bi_flags);
if (err)
@@ -1785,6 +2350,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
uncache_state(&cached);
continue;
}
+ if (tree->ops && tree->ops->readpage_io_failed_hook) {
+ ret = tree->ops->readpage_io_failed_hook(
+ bio, page, start, end,
+ failed_mirror, state);
+ if (ret == 0)
+ goto error_handled;
+ }
}
if (uptodate) {
@@ -1856,6 +2428,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
mirror_num, bio_flags, start);
else
submit_bio(rw, bio);
+
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
bio_put(bio);
@@ -1871,7 +2444,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
bio_end_io_t end_io_func,
int mirror_num,
unsigned long prev_bio_flags,
- unsigned long bio_flags)
+ unsigned long bio_flags,
+ bool force_bio_submit)
{
int ret = 0;
struct bio *bio;
@@ -1890,6 +2464,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
sector;
if (prev_bio_flags != bio_flags || !contig ||
+ force_bio_submit ||
(tree->ops && tree->ops->merge_bio_hook &&
tree->ops->merge_bio_hook(page, offset, page_size, bio,
bio_flags)) ||
@@ -1946,7 +2521,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page,
get_extent_t *get_extent,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags)
+ unsigned long *bio_flags,
+ u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -2002,6 +2578,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
}
}
while (cur <= end) {
+ bool force_bio_submit = false;
+
if (cur >= last_byte) {
char *userpage;
struct extent_state *cached = NULL;
@@ -2048,6 +2626,49 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
+
+ /*
+ * If we have a file range that points to a compressed extent
+ * and it's followed by a consecutive file range that points to
+ * to the same compressed extent (possibly with a different
+ * offset and/or length, so it either points to the whole extent
+ * or only part of it), we must make sure we do not submit a
+ * single bio to populate the pages for the 2 ranges because
+ * this makes the compressed extent read zero out the pages
+ * belonging to the 2nd range. Imagine the following scenario:
+ *
+ * File layout
+ * [0 - 8K] [8K - 24K]
+ * | |
+ * | |
+ * points to extent X, points to extent X,
+ * offset 4K, length of 8K offset 0, length 16K
+ *
+ * [extent X, compressed length = 4K uncompressed length = 16K]
+ *
+ * If the bio to read the compressed extent covers both ranges,
+ * it will decompress extent X into the pages belonging to the
+ * first range and then it will stop, zeroing out the remaining
+ * pages that belong to the other range that points to extent X.
+ * So here we make sure we submit 2 bios, one for the first
+ * range and another one for the third range. Both will target
+ * the same physical extent from disk, but we can't currently
+ * make the compressed bio endio callback populate the pages
+ * for both ranges because each compressed bio is tightly
+ * coupled with a single extent map, and each range can have
+ * an extent map with a different offset value relative to the
+ * uncompressed data of our extent and different lengths. This
+ * is a corner case so we prioritize correctness over
+ * non-optimal behavior (submitting 2 bios for the same extent).
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+ prev_em_start && *prev_em_start != (u64)-1 &&
+ *prev_em_start != em->orig_start)
+ force_bio_submit = true;
+
+ if (prev_em_start)
+ *prev_em_start = em->orig_start;
+
free_extent_map(em);
em = NULL;
@@ -2102,7 +2723,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
*bio_flags,
- this_bio_flag);
+ this_bio_flag,
+ force_bio_submit);
nr++;
*bio_flags = this_bio_flag;
}
@@ -2121,16 +2743,16 @@ out:
}
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent)
+ get_extent_t *get_extent, int mirror_num)
{
struct bio *bio = NULL;
unsigned long bio_flags = 0;
int ret;
- ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
- &bio_flags);
+ ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
+ &bio_flags, NULL);
if (bio)
- ret = submit_one_bio(READ, bio, 0, bio_flags);
+ ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
return ret;
}
@@ -2181,6 +2803,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
int compressed;
int write_flags;
unsigned long nr_written = 0;
+ bool fill_delalloc = true;
if (wbc->sync_mode == WB_SYNC_ALL)
write_flags = WRITE_SYNC;
@@ -2190,6 +2813,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
+
+ ClearPageError(page);
+
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
@@ -2211,10 +2837,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
set_page_extent_mapped(page);
+ if (!tree->ops || !tree->ops->fill_delalloc)
+ fill_delalloc = false;
+
delalloc_start = start;
delalloc_end = 0;
page_started = 0;
- if (!epd->extent_locked) {
+ if (!epd->extent_locked && fill_delalloc) {
u64 delalloc_to_write = 0;
/*
* make sure the wbc mapping index is at least updated
@@ -2380,7 +3009,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
sector, iosize, pg_offset,
bdev, &epd->bio, max_nr,
end_bio_extent_writepage,
- 0, 0, 0);
+ 0, 0, 0, false);
if (ret)
SetPageError(page);
}
@@ -2432,6 +3061,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
+ int tag;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
@@ -2442,11 +3072,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
end = wbc->range_end >> PAGE_CACHE_SHIFT;
scanned = 1;
}
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
while (!done && !nr_to_write_done && (index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY, min(end - index,
- (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
unsigned i;
scanned = 1;
@@ -2460,10 +3095,16 @@ retry:
* swizzled back from swapper_space to tmpfs file
* mapping
*/
- if (tree->ops && tree->ops->write_cache_pages_lock_hook)
- tree->ops->write_cache_pages_lock_hook(page);
- else
- lock_page(page);
+ if (tree->ops &&
+ tree->ops->write_cache_pages_lock_hook) {
+ tree->ops->write_cache_pages_lock_hook(page,
+ data, flush_fn);
+ } else {
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
+ }
+ }
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
@@ -2541,7 +3182,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
struct writeback_control *wbc)
{
int ret;
- struct address_space *mapping = page->mapping;
struct extent_page_data epd = {
.bio = NULL,
.tree = tree,
@@ -2549,18 +3189,9 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
- struct writeback_control wbc_writepages = {
- .sync_mode = wbc->sync_mode,
- .older_than_this = NULL,
- .nr_to_write = 64,
- .range_start = page_offset(page) + PAGE_CACHE_SIZE,
- .range_end = (loff_t)-1,
- };
ret = __extent_writepage(page, wbc, &epd);
- extent_write_cache_pages(tree, mapping, &wbc_writepages,
- __extent_writepage, &epd, flush_write_bio);
flush_epd_write_bio(&epd);
return ret;
}
@@ -2584,7 +3215,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
};
struct writeback_control wbc_writepages = {
.sync_mode = mode,
- .older_than_this = NULL,
.nr_to_write = nr_pages * 2,
.range_start = start,
.range_end = end + 1,
@@ -2638,6 +3268,7 @@ int extent_readpages(struct extent_io_tree *tree,
struct bio *bio = NULL;
unsigned page_idx;
unsigned long bio_flags = 0;
+ u64 prev_em_start = (u64)-1;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = list_entry(pages->prev, struct page, lru);
@@ -2647,7 +3278,8 @@ int extent_readpages(struct extent_io_tree *tree,
if (!add_to_page_cache_lru(page, mapping,
page->index, GFP_NOFS)) {
__extent_read_full_page(tree, page, get_extent,
- &bio, 0, &bio_flags);
+ &bio, 0, &bio_flags,
+ &prev_em_start);
}
page_cache_release(page);
}
@@ -2840,6 +3472,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -ENOMEM;
path->leave_spinning = 1;
+ start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
+ len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
+
/*
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
@@ -2887,7 +3522,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
&cached_state, GFP_NOFS);
- em = get_extent_skip_holes(inode, off, last_for_get_extent,
+ em = get_extent_skip_holes(inode, start, last_for_get_extent,
get_extent);
if (!em)
goto out;
@@ -2976,7 +3611,7 @@ out:
return ret;
}
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+inline struct page *extent_buffer_page(struct extent_buffer *eb,
unsigned long i)
{
struct page *p;
@@ -3001,7 +3636,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
return p;
}
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+inline unsigned long num_extent_pages(u64 start, u64 len)
{
return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
(start >> PAGE_CACHE_SHIFT);
@@ -3022,8 +3657,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
return NULL;
eb->start = start;
eb->len = len;
- spin_lock_init(&eb->lock);
- init_waitqueue_head(&eb->lock_wq);
+ rwlock_init(&eb->lock);
+ atomic_set(&eb->write_locks, 0);
+ atomic_set(&eb->read_locks, 0);
+ atomic_set(&eb->blocking_readers, 0);
+ atomic_set(&eb->blocking_writers, 0);
+ atomic_set(&eb->spinning_readers, 0);
+ atomic_set(&eb->spinning_writers, 0);
+ init_waitqueue_head(&eb->write_lock_wq);
+ init_waitqueue_head(&eb->read_lock_wq);
#if LEAK_DEBUG
spin_lock_irqsave(&leak_lock, flags);
@@ -3119,7 +3761,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
i = 0;
}
for (; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
+ p = find_or_create_page(mapping, index, GFP_NOFS);
if (!p) {
WARN_ON(1);
goto free_eb;
@@ -3247,6 +3889,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
PAGECACHE_TAG_DIRTY);
}
spin_unlock_irq(&page->mapping->tree_lock);
+ ClearPageError(page);
unlock_page(page);
}
return 0;
@@ -3266,6 +3909,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
return was_dirty;
}
+static int __eb_straddles_pages(u64 start, u64 len)
+{
+ if (len < PAGE_CACHE_SIZE)
+ return 1;
+ if (start & (PAGE_CACHE_SIZE - 1))
+ return 1;
+ if ((start + len) & (PAGE_CACHE_SIZE - 1))
+ return 1;
+ return 0;
+}
+
+static int eb_straddles_pages(struct extent_buffer *eb)
+{
+ return __eb_straddles_pages(eb->start, eb->len);
+}
+
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
struct extent_buffer *eb,
struct extent_state **cached_state)
@@ -3277,8 +3936,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb->start, eb->len);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- cached_state, GFP_NOFS);
+ if (eb_straddles_pages(eb)) {
+ clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+ cached_state, GFP_NOFS);
+ }
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if (page)
@@ -3296,8 +3957,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb->start, eb->len);
- set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- NULL, GFP_NOFS);
+ if (eb_straddles_pages(eb)) {
+ set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+ NULL, GFP_NOFS);
+ }
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3320,9 +3983,12 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int uptodate;
unsigned long index;
- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
- if (ret)
- return 1;
+ if (__eb_straddles_pages(start, end - start + 1)) {
+ ret = test_range_bit(tree, start, end,
+ EXTENT_UPTODATE, 1, NULL);
+ if (ret)
+ return 1;
+ }
while (start <= end) {
index = start >> PAGE_CACHE_SHIFT;
page = find_get_page(tree->mapping, index);
@@ -3350,10 +4016,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 1;
- ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, cached_state);
- if (ret)
- return ret;
+ if (eb_straddles_pages(eb)) {
+ ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+ EXTENT_UPTODATE, 1, cached_state);
+ if (ret)
+ return ret;
+ }
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
@@ -3367,8 +4035,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- u64 start, int wait,
+ struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num)
{
unsigned long i;
@@ -3382,13 +4049,16 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
unsigned long num_pages;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
+ u64 prev_em_start = (u64)-1;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
- if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, NULL)) {
- return 0;
+ if (eb_straddles_pages(eb)) {
+ if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+ EXTENT_UPTODATE, 1, NULL)) {
+ return 0;
+ }
}
if (start) {
@@ -3402,7 +4072,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if (!wait) {
+ if (wait == WAIT_NONE) {
if (!trylock_page(page))
goto unlock_exit;
} else {
@@ -3435,7 +4105,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
ClearPageError(page);
err = __extent_read_full_page(tree, page,
get_extent, &bio,
- mirror_num, &bio_flags);
+ mirror_num, &bio_flags,
+ &prev_em_start);
if (err)
ret = err;
} else {
@@ -3446,7 +4117,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
if (bio)
submit_one_bio(READ, bio, mirror_num, bio_flags);
- if (ret || !wait)
+ if (ret || wait != WAIT_COMPLETE)
return ret;
for (i = start_i; i < num_pages; i++) {
@@ -3492,9 +4163,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
page = extent_buffer_page(eb, i);
cur = min(len, (PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER1);
+ kaddr = page_address(page);
memcpy(dst, kaddr + offset, cur);
- kunmap_atomic(kaddr, KM_USER1);
dst += cur;
len -= cur;
@@ -3504,9 +4174,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
}
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
- unsigned long min_len, char **token, char **map,
+ unsigned long min_len, char **map,
unsigned long *map_start,
- unsigned long *map_len, int km)
+ unsigned long *map_len)
{
size_t offset = start & (PAGE_CACHE_SIZE - 1);
char *kaddr;
@@ -3536,42 +4206,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
}
p = extent_buffer_page(eb, i);
- kaddr = kmap_atomic(p, km);
- *token = kaddr;
+ kaddr = page_address(p);
*map = kaddr + offset;
*map_len = PAGE_CACHE_SIZE - offset;
return 0;
}
-int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
- unsigned long min_len,
- char **token, char **map,
- unsigned long *map_start,
- unsigned long *map_len, int km)
-{
- int err;
- int save = 0;
- if (eb->map_token) {
- unmap_extent_buffer(eb, eb->map_token, km);
- eb->map_token = NULL;
- save = 1;
- }
- err = map_private_extent_buffer(eb, start, min_len, token, map,
- map_start, map_len, km);
- if (!err && save) {
- eb->map_token = *token;
- eb->kaddr = *map;
- eb->map_start = *map_start;
- eb->map_len = *map_len;
- }
- return err;
-}
-
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
-{
- kunmap_atomic(token, km);
-}
-
int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
unsigned long start,
unsigned long len)
@@ -3595,9 +4235,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
cur = min(len, (PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = page_address(page);
ret = memcmp(ptr, kaddr + offset, cur);
- kunmap_atomic(kaddr, KM_USER0);
if (ret)
break;
@@ -3630,9 +4269,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
- kaddr = kmap_atomic(page, KM_USER1);
+ kaddr = page_address(page);
memcpy(kaddr + offset, src, cur);
- kunmap_atomic(kaddr, KM_USER1);
src += cur;
len -= cur;
@@ -3661,9 +4299,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = page_address(page);
memset(kaddr + offset, c, cur);
- kunmap_atomic(kaddr, KM_USER0);
len -= cur;
offset = 0;
@@ -3694,9 +4331,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = page_address(page);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
- kunmap_atomic(kaddr, KM_USER0);
src_offset += cur;
len -= cur;
@@ -3709,20 +4345,17 @@ static void move_pages(struct page *dst_page, struct page *src_page,
unsigned long dst_off, unsigned long src_off,
unsigned long len)
{
- char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+ char *dst_kaddr = page_address(dst_page);
if (dst_page == src_page) {
memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
} else {
- char *src_kaddr = kmap_atomic(src_page, KM_USER1);
+ char *src_kaddr = page_address(src_page);
char *p = dst_kaddr + dst_off + len;
char *s = src_kaddr + src_off + len;
while (len--)
*--p = *--s;
-
- kunmap_atomic(src_kaddr, KM_USER1);
}
- kunmap_atomic(dst_kaddr, KM_USER0);
}
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
@@ -3735,20 +4368,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
unsigned long dst_off, unsigned long src_off,
unsigned long len)
{
- char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+ char *dst_kaddr = page_address(dst_page);
char *src_kaddr;
if (dst_page != src_page) {
- src_kaddr = kmap_atomic(src_page, KM_USER1);
+ src_kaddr = page_address(src_page);
} else {
src_kaddr = dst_kaddr;
BUG_ON(areas_overlap(src_off, dst_off, len));
}
memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
- kunmap_atomic(dst_kaddr, KM_USER0);
- if (dst_page != src_page)
- kunmap_atomic(src_kaddr, KM_USER1);
}
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a11a92e..2e32510 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,8 @@
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
#define EXTENT_FIRST_DELALLOC (1 << 12)
+#define EXTENT_NEED_WAIT (1 << 13)
+#define EXTENT_DAMAGED (1 << 14)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -32,6 +34,7 @@
#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
#define EXTENT_BUFFER_CORRUPT 3
+#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
/* these are flags for extent_clear_unlock_delalloc */
#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -67,7 +70,7 @@ struct extent_io_ops {
unsigned long bio_flags);
int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
- u64 start, u64 end,
+ u64 start, u64 end, int failed_mirror,
struct extent_state *state);
int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
u64 start, u64 end,
@@ -76,16 +79,17 @@ struct extent_io_ops {
struct extent_state *state);
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
- int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
- int *bits);
- int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
- int *bits);
- int (*merge_extent_hook)(struct inode *inode,
- struct extent_state *new,
- struct extent_state *other);
- int (*split_extent_hook)(struct inode *inode,
- struct extent_state *orig, u64 split);
- int (*write_cache_pages_lock_hook)(struct page *page);
+ void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+ int *bits);
+ void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+ int *bits);
+ void (*merge_extent_hook)(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other);
+ void (*split_extent_hook)(struct inode *inode,
+ struct extent_state *orig, u64 split);
+ int (*write_cache_pages_lock_hook)(struct page *page, void *data,
+ void (*flush_fn)(void *));
};
struct extent_io_tree {
@@ -108,8 +112,6 @@ struct extent_state {
wait_queue_head_t wq;
atomic_t refs;
unsigned long state;
- u64 split_start;
- u64 split_end;
/* for use by the FS */
u64 private;
@@ -120,8 +122,6 @@ struct extent_state {
struct extent_buffer {
u64 start;
unsigned long len;
- char *map_token;
- char *kaddr;
unsigned long map_start;
unsigned long map_len;
struct page *first_page;
@@ -130,14 +130,26 @@ struct extent_buffer {
struct rcu_head rcu_head;
atomic_t refs;
- /* the spinlock is used to protect most operations */
- spinlock_t lock;
+ /* count of read lock holders on the extent buffer */
+ atomic_t write_locks;
+ atomic_t read_locks;
+ atomic_t blocking_writers;
+ atomic_t blocking_readers;
+ atomic_t spinning_readers;
+ atomic_t spinning_writers;
+
+ /* protects write locks */
+ rwlock_t lock;
+
+ /* readers use lock_wq while they wait for the write
+ * lock holders to unlock
+ */
+ wait_queue_head_t write_lock_wq;
- /*
- * when we keep the lock held while blocking, waiters go onto
- * the wq
+ /* writers use read_lock_wq while they wait for readers
+ * to unlock
*/
- wait_queue_head_t lock_wq;
+ wait_queue_head_t read_lock_wq;
};
static inline void extent_set_compress_type(unsigned long *bio_flags,
@@ -177,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent);
+ get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
void extent_io_exit(void);
@@ -206,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int clear_bits, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -240,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb);
+#define WAIT_NONE 0
+#define WAIT_COMPLETE 1
+#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num);
+unsigned long num_extent_pages(u64 start, u64 len);
+struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
static inline void extent_buffer_get(struct extent_buffer *eb)
{
@@ -279,17 +298,14 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
int extent_buffer_uptodate(struct extent_io_tree *tree,
struct extent_buffer *eb,
struct extent_state *cached_state);
-int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
- unsigned long min_len, char **token, char **map,
- unsigned long *map_start,
- unsigned long *map_len, int km);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
- unsigned long min_len, char **token, char **map,
+ unsigned long min_len, char **map,
unsigned long *map_start,
- unsigned long *map_len, int km);
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
+ unsigned long *map_len);
int extent_range_uptodate(struct extent_io_tree *tree,
u64 start, u64 end);
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_clear_unlock_delalloc(struct inode *inode,
struct extent_io_tree *tree,
u64 start, u64 end, struct page *locked_page,
@@ -297,4 +313,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags);
+
+struct btrfs_mapping_tree;
+
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+ u64 length, u64 logical, struct page *page,
+ int mirror_num);
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2d04103..7c97b33 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -183,22 +183,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
return 0;
}
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
{
- int ret = 0;
struct extent_map *merge = NULL;
struct rb_node *rb;
- struct extent_map *em;
-
- write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, start, len);
-
- WARN_ON(!em || em->start != start);
-
- if (!em)
- goto out;
-
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
@@ -225,6 +213,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
merge->in_tree = 0;
free_extent_map(merge);
}
+}
+
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+ int ret = 0;
+ struct extent_map *em;
+
+ write_lock(&tree->lock);
+ em = lookup_extent_mapping(tree, start, len);
+
+ WARN_ON(!em || em->start != start);
+
+ if (!em)
+ goto out;
+
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ try_merge_map(tree, em);
free_extent_map(em);
out:
@@ -247,7 +253,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em)
{
int ret = 0;
- struct extent_map *merge = NULL;
struct rb_node *rb;
struct extent_map *exist;
@@ -263,30 +268,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
goto out;
}
atomic_inc(&em->refs);
- if (em->start != 0) {
- rb = rb_prev(&em->rb_node);
- if (rb)
- merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(merge, em)) {
- em->start = merge->start;
- em->len += merge->len;
- em->block_len += merge->block_len;
- em->block_start = merge->block_start;
- merge->in_tree = 0;
- rb_erase(&merge->rb_node, &tree->map);
- free_extent_map(merge);
- }
- }
- rb = rb_next(&em->rb_node);
- if (rb)
- merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(em, merge)) {
- em->len += merge->len;
- em->block_len += merge->len;
- rb_erase(&merge->rb_node, &tree->map);
- merge->in_tree = 0;
- free_extent_map(merge);
- }
+
+ try_merge_map(tree, em);
out:
return ret;
}
@@ -299,19 +282,8 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
-/**
- * lookup_extent_mapping - lookup extent_map
- * @tree: tree to lookup in
- * @start: byte offset to start the search
- * @len: length of the lookup range
- *
- * Find and return the first extent_map struct in @tree that intersects the
- * [start, len] range. There may be additional objects in the tree that
- * intersect, so check the object returned carefully to make sure that no
- * additional lookups are needed.
- */
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len)
+struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len, int strict)
{
struct extent_map *em;
struct rb_node *rb_node;
@@ -320,38 +292,42 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 end = range_end(start, len);
rb_node = __tree_search(&tree->map, start, &prev, &next);
- if (!rb_node && prev) {
- em = rb_entry(prev, struct extent_map, rb_node);
- if (end > em->start && start < extent_map_end(em))
- goto found;
- }
- if (!rb_node && next) {
- em = rb_entry(next, struct extent_map, rb_node);
- if (end > em->start && start < extent_map_end(em))
- goto found;
- }
if (!rb_node) {
- em = NULL;
- goto out;
- }
- if (IS_ERR(rb_node)) {
- em = ERR_CAST(rb_node);
- goto out;
+ if (prev)
+ rb_node = prev;
+ else if (next)
+ rb_node = next;
+ else
+ return NULL;
}
+
em = rb_entry(rb_node, struct extent_map, rb_node);
- if (end > em->start && start < extent_map_end(em))
- goto found;
- em = NULL;
- goto out;
+ if (strict && !(end > em->start && start < extent_map_end(em)))
+ return NULL;
-found:
atomic_inc(&em->refs);
-out:
return em;
}
/**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range. There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ return __lookup_extent_mapping(tree, start, len, 1);
+}
+
+/**
* search_extent_mapping - find a nearby extent map
* @tree: tree to lookup in
* @start: byte offset to start the search
@@ -365,38 +341,7 @@ out:
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len)
{
- struct extent_map *em;
- struct rb_node *rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *next = NULL;
-
- rb_node = __tree_search(&tree->map, start, &prev, &next);
- if (!rb_node && prev) {
- em = rb_entry(prev, struct extent_map, rb_node);
- goto found;
- }
- if (!rb_node && next) {
- em = rb_entry(next, struct extent_map, rb_node);
- goto found;
- }
- if (!rb_node) {
- em = NULL;
- goto out;
- }
- if (IS_ERR(rb_node)) {
- em = ERR_CAST(rb_node);
- goto out;
- }
- em = rb_entry(rb_node, struct extent_map, rb_node);
- goto found;
-
- em = NULL;
- goto out;
-
-found:
- atomic_inc(&em->refs);
-out:
- return em;
+ return __lookup_extent_mapping(tree, start, len, 0);
}
/**
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 90d4ee5..d7c9b68 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_csum_item *item;
struct extent_buffer *leaf;
u64 csum_offset = 0;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
u64 item_last_offset = 0;
u64 disk_bytenr;
u32 diff;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int ret;
struct btrfs_path *path;
struct btrfs_csum_item *item = NULL;
@@ -177,6 +175,17 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
WARN_ON(bio->bi_vcnt <= 0);
+ /*
+ * the free space stuff is only read when it hasn't been
+ * updated in the current transaction. So, we can safely
+ * read from the commit root and sidestep a nasty deadlock
+ * between reading the free space cache and updating the csum tree.
+ */
+ if (btrfs_is_free_space_inode(root, inode)) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
+
disk_bytenr = (u64)bio->bi_sector << 9;
if (dio)
offset = logical_offset;
@@ -279,10 +288,11 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
int ret;
size_t size;
u64 csum_end;
- u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
if (search_commit) {
path->skip_locking = 1;
@@ -480,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
u64 csum_end;
u64 end_byte = bytenr + len;
u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -537,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 csum_end;
struct extent_buffer *leaf;
int ret;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
root = root->fs_info->csum_root;
@@ -664,15 +672,12 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_sector_sum *sector_sum;
u32 nritems;
u32 ins_size;
- char *eb_map;
- char *eb_token;
- unsigned long map_len;
- unsigned long map_start;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
sector_sum = sums->sums;
again:
next_offset = (u64)-1;
@@ -712,7 +717,7 @@ again:
found_next = 1;
if (ret != 0)
goto insert;
- slot = 0;
+ slot = path->slots[0];
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
@@ -814,30 +819,9 @@ found:
item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
btrfs_item_size_nr(leaf, path->slots[0]));
- eb_token = NULL;
next_sector:
- if (!eb_token ||
- (unsigned long)item + csum_size >= map_start + map_len) {
- int err;
-
- if (eb_token)
- unmap_extent_buffer(leaf, eb_token, KM_USER1);
- eb_token = NULL;
- err = map_private_extent_buffer(leaf, (unsigned long)item,
- csum_size,
- &eb_token, &eb_map,
- &map_start, &map_len, KM_USER1);
- if (err)
- eb_token = NULL;
- }
- if (eb_token) {
- memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
- &sector_sum->sum, csum_size);
- } else {
- write_extent_buffer(leaf, &sector_sum->sum,
- (unsigned long)item, csum_size);
- }
+ write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
total_bytes += root->sectorsize;
sector_sum++;
@@ -850,10 +834,7 @@ next_sector:
goto next_sector;
}
}
- if (eb_token) {
- unmap_extent_buffer(leaf, eb_token, KM_USER1);
- eb_token = NULL;
- }
+
btrfs_mark_buffer_dirty(path->nodes[0]);
if (total_bytes < sums->len) {
btrfs_release_path(path);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fa4ef18..97fbe93 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -74,7 +74,7 @@ struct inode_defrag {
* If an existing record is found the defrag item you
* pass in is freed
*/
-static int __btrfs_add_inode_defrag(struct inode *inode,
+static void __btrfs_add_inode_defrag(struct inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -106,11 +106,11 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
BTRFS_I(inode)->in_defrag = 1;
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
- return 0;
+ return;
exists:
kfree(defrag);
- return 0;
+ return;
}
@@ -123,7 +123,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct inode_defrag *defrag;
- int ret = 0;
u64 transid;
if (!btrfs_test_opt(root, AUTO_DEFRAG))
@@ -150,9 +149,11 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->defrag_inodes_lock);
if (!BTRFS_I(inode)->in_defrag)
- ret = __btrfs_add_inode_defrag(inode, defrag);
+ __btrfs_add_inode_defrag(inode, defrag);
+ else
+ kfree(defrag);
spin_unlock(&root->fs_info->defrag_inodes_lock);
- return ret;
+ return 0;
}
/*
@@ -855,7 +856,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
btrfs_drop_extent_cache(inode, start, end - 1, 0);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
again:
recow = 0;
split = start;
@@ -1034,11 +1036,13 @@ out:
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
-static int prepare_uptodate_page(struct page *page, u64 pos)
+static int prepare_uptodate_page(struct page *page, u64 pos,
+ bool force_uptodate)
{
int ret = 0;
- if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
+ if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
+ !PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
if (ret)
return ret;
@@ -1059,12 +1063,13 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
struct page **pages, size_t num_pages,
loff_t pos, unsigned long first_index,
- unsigned long last_index, size_t write_bytes)
+ size_t write_bytes, bool force_uptodate)
{
struct extent_state *cached_state = NULL;
int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT;
struct inode *inode = fdentry(file)->d_inode;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0;
int faili = 0;
u64 start_pos;
@@ -1073,15 +1078,10 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
start_pos = pos & ~((u64)root->sectorsize - 1);
last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
- if (start_pos > inode->i_size) {
- err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
- if (err)
- return err;
- }
-
again:
for (i = 0; i < num_pages; i++) {
- pages[i] = grab_cache_page(inode->i_mapping, index + i);
+ pages[i] = find_or_create_page(inode->i_mapping, index + i,
+ mask);
if (!pages[i]) {
faili = i - 1;
err = -ENOMEM;
@@ -1089,10 +1089,11 @@ again:
}
if (i == 0)
- err = prepare_uptodate_page(pages[i], pos);
+ err = prepare_uptodate_page(pages[i], pos,
+ force_uptodate);
if (i == num_pages - 1)
err = prepare_uptodate_page(pages[i],
- pos + write_bytes);
+ pos + write_bytes, false);
if (err) {
page_cache_release(pages[i]);
faili = i - 1;
@@ -1158,20 +1159,21 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
unsigned long first_index;
- unsigned long last_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
+ bool force_page_uptodate = false;
nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
(sizeof(struct page *)));
+ nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
+ nrptrs = max(nrptrs, 8);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
if (!pages)
return -ENOMEM;
first_index = pos >> PAGE_CACHE_SHIFT;
- last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
@@ -1205,8 +1207,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
* contents of pages from loop to loop
*/
ret = prepare_pages(root, file, pages, num_pages,
- pos, first_index, last_index,
- write_bytes);
+ pos, first_index, write_bytes,
+ force_page_uptodate);
if (ret) {
btrfs_delalloc_release_space(inode,
num_pages << PAGE_CACHE_SHIFT);
@@ -1223,12 +1225,15 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (copied < write_bytes)
nrptrs = 1;
- if (copied == 0)
+ if (copied == 0) {
+ force_page_uptodate = true;
dirty_pages = 0;
- else
+ } else {
+ force_page_uptodate = false;
dirty_pages = (copied + offset +
PAGE_CACHE_SIZE - 1) >>
PAGE_CACHE_SHIFT;
+ }
/*
* If we had a short copy we need to release the excess delaloc
@@ -1238,9 +1243,11 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
* managed to copy.
*/
if (num_pages > dirty_pages) {
- if (copied > 0)
- atomic_inc(
- &BTRFS_I(inode)->outstanding_extents);
+ if (copied > 0) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
btrfs_delalloc_release_space(inode,
(num_pages - dirty_pages) <<
PAGE_CACHE_SHIFT);
@@ -1336,6 +1343,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
loff_t *ppos = &iocb->ki_pos;
+ u64 start_pos;
ssize_t num_written = 0;
ssize_t err = 0;
size_t count, ocount;
@@ -1381,9 +1389,22 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
- file_update_time(file);
+ err = btrfs_update_time(file);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
BTRFS_I(inode)->sequence++;
+ start_pos = round_down(pos, root->sectorsize);
+ if (start_pos > i_size_read(inode)) {
+ err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ }
+
if (unlikely(file->f_flags & O_DIRECT)) {
num_written = __btrfs_direct_write(iocb, iov, nr_segs,
pos, ppos, count, ocount);
@@ -1452,7 +1473,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
* important optimization for directories because holding the mutex prevents
* new operations on the dir while we write to disk.
*/
-int btrfs_sync_file(struct file *file, int datasync)
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
@@ -1462,9 +1483,13 @@ int btrfs_sync_file(struct file *file, int datasync)
trace_btrfs_sync_file(file, datasync);
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+ mutex_lock(&inode->i_mutex);
+
/* we wait first, since the writeback may change the inode */
root->log_batch++;
- /* the VFS called filemap_fdatawrite for us */
btrfs_wait_ordered_range(inode, 0, (u64)-1);
root->log_batch++;
@@ -1472,8 +1497,10 @@ int btrfs_sync_file(struct file *file, int datasync)
* check the transaction that last modified this inode
* and see if its already been committed
*/
- if (!BTRFS_I(inode)->last_trans)
+ if (!BTRFS_I(inode)->last_trans) {
+ mutex_unlock(&inode->i_mutex);
goto out;
+ }
/*
* if the last transaction that changed this file was before
@@ -1484,6 +1511,7 @@ int btrfs_sync_file(struct file *file, int datasync)
if (BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
BTRFS_I(inode)->last_trans = 0;
+ mutex_unlock(&inode->i_mutex);
goto out;
}
@@ -1496,12 +1524,15 @@ int btrfs_sync_file(struct file *file, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ mutex_unlock(&inode->i_mutex);
goto out;
}
ret = btrfs_log_dentry_safe(trans, root, dentry);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&inode->i_mutex);
goto out;
+ }
/* we've logged all the items and now have a consistent
* version of the file in the log. It is possible that
@@ -1513,7 +1544,7 @@ int btrfs_sync_file(struct file *file, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&inode->i_mutex);
if (ret != BTRFS_NO_LOG_SYNC) {
if (ret > 0) {
@@ -1528,7 +1559,6 @@ int btrfs_sync_file(struct file *file, int datasync)
} else {
ret = btrfs_end_transaction(trans, root);
}
- mutex_lock(&dentry->d_inode->i_mutex);
out:
return ret > 0 ? -EIO : ret;
}
@@ -1592,10 +1622,6 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out;
}
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
- if (ret)
- goto out;
-
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -1629,23 +1655,53 @@ static long btrfs_fallocate(struct file *file, int mode,
cur_offset = alloc_start;
while (1) {
+ u64 actual_end;
+
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
BUG_ON(IS_ERR_OR_NULL(em));
last_byte = min(extent_map_end(em), alloc_end);
+ actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = (last_byte + mask) & ~mask;
+
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+
+ /*
+ * Make sure we have enough space before we do the
+ * allocation.
+ */
+ ret = btrfs_check_data_free_space(inode, last_byte -
+ cur_offset);
+ if (ret) {
+ free_extent_map(em);
+ break;
+ }
+
ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
last_byte - cur_offset,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
+
+ /* Let go of our reservation. */
+ btrfs_free_reserved_data_space(inode, last_byte -
+ cur_offset);
if (ret < 0) {
free_extent_map(em);
break;
}
+ } else if (actual_end > inode->i_size &&
+ !(mode & FALLOC_FL_KEEP_SIZE)) {
+ /*
+ * We didn't need to allocate any more space, but we
+ * still extended the size of the file so we need to
+ * update i_size.
+ */
+ inode->i_ctime = CURRENT_TIME;
+ i_size_write(inode, actual_end);
+ btrfs_ordered_update_i_size(inode, actual_end, NULL);
}
free_extent_map(em);
@@ -1657,15 +1713,168 @@ static long btrfs_fallocate(struct file *file, int mode,
}
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
-
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
}
+static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map *em;
+ struct extent_state *cached_state = NULL;
+ u64 lockstart = *offset;
+ u64 lockend = i_size_read(inode);
+ u64 start = *offset;
+ u64 orig_start = *offset;
+ u64 len = i_size_read(inode);
+ u64 last_end = 0;
+ int ret = 0;
+
+ lockend = max_t(u64, root->sectorsize, lockend);
+ if (lockend <= lockstart)
+ lockend = lockstart + root->sectorsize;
+
+ len = lockend - lockstart + 1;
+
+ len = max_t(u64, len, root->sectorsize);
+ if (inode->i_size == 0)
+ return -ENXIO;
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+ &cached_state, GFP_NOFS);
+
+ /*
+ * Delalloc is such a pain. If we have a hole and we have pending
+ * delalloc for a portion of the hole we will get back a hole that
+ * exists for the entire range since it hasn't been actually written
+ * yet. So to take care of this case we need to look for an extent just
+ * before the position we want in case there is outstanding delalloc
+ * going on here.
+ */
+ if (origin == SEEK_HOLE && start != 0) {
+ if (start <= root->sectorsize)
+ em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
+ root->sectorsize, 0);
+ else
+ em = btrfs_get_extent_fiemap(inode, NULL, 0,
+ start - root->sectorsize,
+ root->sectorsize, 0);
+ if (IS_ERR(em)) {
+ ret = -ENXIO;
+ goto out;
+ }
+ last_end = em->start + em->len;
+ if (em->block_start == EXTENT_MAP_DELALLOC)
+ last_end = min_t(u64, last_end, inode->i_size);
+ free_extent_map(em);
+ }
+
+ while (1) {
+ em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = -ENXIO;
+ break;
+ }
+
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+ if (last_end <= orig_start) {
+ free_extent_map(em);
+ ret = -ENXIO;
+ break;
+ }
+ }
+
+ if (origin == SEEK_HOLE) {
+ *offset = start;
+ free_extent_map(em);
+ break;
+ }
+ } else {
+ if (origin == SEEK_DATA) {
+ if (em->block_start == EXTENT_MAP_DELALLOC) {
+ if (start >= inode->i_size) {
+ free_extent_map(em);
+ ret = -ENXIO;
+ break;
+ }
+ }
+
+ *offset = start;
+ free_extent_map(em);
+ break;
+ }
+ }
+
+ start = em->start + em->len;
+ last_end = em->start + em->len;
+
+ if (em->block_start == EXTENT_MAP_DELALLOC)
+ last_end = min_t(u64, last_end, inode->i_size);
+
+ if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+ free_extent_map(em);
+ ret = -ENXIO;
+ break;
+ }
+ free_extent_map(em);
+ cond_resched();
+ }
+ if (!ret)
+ *offset = min(*offset, inode->i_size);
+out:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+ return ret;
+}
+
+static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+
+ mutex_lock(&inode->i_mutex);
+ switch (origin) {
+ case SEEK_END:
+ case SEEK_CUR:
+ offset = generic_file_llseek(file, offset, origin);
+ goto out;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ if (offset >= i_size_read(inode)) {
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
+ }
+
+ ret = find_desired_extent(inode, &offset, origin);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
+ offset = -EINVAL;
+ goto out;
+ }
+ if (offset > inode->i_sb->s_maxbytes) {
+ offset = -EINVAL;
+ goto out;
+ }
+
+ /* Special lock needed here? */
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+out:
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
+
const struct file_operations btrfs_file_operations = {
- .llseek = generic_file_llseek,
+ .llseek = btrfs_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index bf0d615..ec23d43 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/math64.h>
+#include <linux/ratelimit.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
*block_group, struct btrfs_path *path)
{
struct inode *inode = NULL;
+ u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
spin_lock(&block_group->lock);
if (block_group->inode)
@@ -98,7 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
return inode;
spin_lock(&block_group->lock);
- if (!btrfs_fs_closing(root->fs_info)) {
+ if (!((BTRFS_I(inode)->flags & flags) == flags)) {
+ printk(KERN_INFO "Old style space inode found, converting.\n");
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
+ BTRFS_INODE_NODATACOW;
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ }
+
+ if (!block_group->iref) {
block_group->inode = igrab(inode);
block_group->iref = 1;
}
@@ -116,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header *header;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
+ u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
int ret;
ret = btrfs_insert_empty_inode(trans, root, path, ino);
if (ret)
return ret;
+ /* We inline crc's for the free disk space cache */
+ if (ino != BTRFS_FREE_INO_OBJECTID)
+ flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
@@ -134,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
btrfs_set_inode_uid(leaf, inode_item, 0);
btrfs_set_inode_gid(leaf, inode_item, 0);
btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
- BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
+ btrfs_set_inode_flags(leaf, inode_item, flags);
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans->transid);
btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -184,15 +197,25 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_path *path,
struct inode *inode)
{
+ struct btrfs_block_rsv *rsv;
+ u64 needed_bytes;
loff_t oldsize;
int ret = 0;
- trans->block_rsv = root->orphan_block_rsv;
- ret = btrfs_block_rsv_check(trans, root,
- root->orphan_block_rsv,
- 0, 5);
- if (ret)
- return ret;
+ rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->global_block_rsv;
+
+ /* 1 for slack space, 1 for updating the inode */
+ needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
+ btrfs_calc_trans_metadata_size(root, 1);
+
+ spin_lock(&trans->block_rsv->lock);
+ if (trans->block_rsv->reserved < needed_bytes) {
+ spin_unlock(&trans->block_rsv->lock);
+ trans->block_rsv = rsv;
+ return -ENOSPC;
+ }
+ spin_unlock(&trans->block_rsv->lock);
oldsize = i_size_read(inode);
btrfs_i_size_write(inode, 0);
@@ -204,12 +227,16 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
+
if (ret) {
+ trans->block_rsv = rsv;
WARN_ON(1);
return ret;
}
ret = btrfs_update_inode(trans, root, inode);
+ trans->block_rsv = rsv;
+
return ret;
}
@@ -232,31 +259,348 @@ static int readahead_cache(struct inode *inode)
return 0;
}
+struct io_ctl {
+ void *cur, *orig;
+ struct page *page;
+ struct page **pages;
+ struct btrfs_root *root;
+ unsigned long size;
+ int index;
+ int num_pages;
+ unsigned check_crcs:1;
+};
+
+static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
+ struct btrfs_root *root)
+{
+ memset(io_ctl, 0, sizeof(struct io_ctl));
+ io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
+ GFP_NOFS);
+ if (!io_ctl->pages)
+ return -ENOMEM;
+ io_ctl->root = root;
+ if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+ io_ctl->check_crcs = 1;
+ return 0;
+}
+
+static void io_ctl_free(struct io_ctl *io_ctl)
+{
+ kfree(io_ctl->pages);
+}
+
+static void io_ctl_unmap_page(struct io_ctl *io_ctl)
+{
+ if (io_ctl->cur) {
+ kunmap(io_ctl->page);
+ io_ctl->cur = NULL;
+ io_ctl->orig = NULL;
+ }
+}
+
+static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
+{
+ WARN_ON(io_ctl->cur);
+ BUG_ON(io_ctl->index >= io_ctl->num_pages);
+ io_ctl->page = io_ctl->pages[io_ctl->index++];
+ io_ctl->cur = kmap(io_ctl->page);
+ io_ctl->orig = io_ctl->cur;
+ io_ctl->size = PAGE_CACHE_SIZE;
+ if (clear)
+ memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
+}
+
+static void io_ctl_drop_pages(struct io_ctl *io_ctl)
+{
+ int i;
+
+ io_ctl_unmap_page(io_ctl);
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ ClearPageChecked(io_ctl->pages[i]);
+ unlock_page(io_ctl->pages[i]);
+ page_cache_release(io_ctl->pages[i]);
+ }
+}
+
+static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
+ int uptodate)
+{
+ struct page *page;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ int i;
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ page = find_or_create_page(inode->i_mapping, i, mask);
+ if (!page) {
+ io_ctl_drop_pages(io_ctl);
+ return -ENOMEM;
+ }
+ io_ctl->pages[i] = page;
+ if (uptodate && !PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ printk(KERN_ERR "btrfs: error reading free "
+ "space cache\n");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
+ }
+ }
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ clear_page_dirty_for_io(io_ctl->pages[i]);
+ set_page_extent_mapped(io_ctl->pages[i]);
+ }
+
+ return 0;
+}
+
+static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
+{
+ u64 *val;
+
+ io_ctl_map_page(io_ctl, 1);
+
+ /*
+ * Skip the csum areas. If we don't check crcs then we just have a
+ * 64bit chunk at the front of the first page.
+ */
+ if (io_ctl->check_crcs) {
+ io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
+ } else {
+ io_ctl->cur += sizeof(u64);
+ io_ctl->size -= sizeof(u64) * 2;
+ }
+
+ val = io_ctl->cur;
+ *val = cpu_to_le64(generation);
+ io_ctl->cur += sizeof(u64);
+}
+
+static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
+{
+ u64 *gen;
+
+ /*
+ * Skip the crc area. If we don't check crcs then we just have a 64bit
+ * chunk at the front of the first page.
+ */
+ if (io_ctl->check_crcs) {
+ io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
+ io_ctl->size -= sizeof(u64) +
+ (sizeof(u32) * io_ctl->num_pages);
+ } else {
+ io_ctl->cur += sizeof(u64);
+ io_ctl->size -= sizeof(u64) * 2;
+ }
+
+ gen = io_ctl->cur;
+ if (le64_to_cpu(*gen) != generation) {
+ printk_ratelimited(KERN_ERR "btrfs: space cache generation "
+ "(%Lu) does not match inode (%Lu)\n", *gen,
+ generation);
+ io_ctl_unmap_page(io_ctl);
+ return -EIO;
+ }
+ io_ctl->cur += sizeof(u64);
+ return 0;
+}
+
+static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
+{
+ u32 *tmp;
+ u32 crc = ~(u32)0;
+ unsigned offset = 0;
+
+ if (!io_ctl->check_crcs) {
+ io_ctl_unmap_page(io_ctl);
+ return;
+ }
+
+ if (index == 0)
+ offset = sizeof(u32) * io_ctl->num_pages;;
+
+ crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+ PAGE_CACHE_SIZE - offset);
+ btrfs_csum_final(crc, (char *)&crc);
+ io_ctl_unmap_page(io_ctl);
+ tmp = kmap(io_ctl->pages[0]);
+ tmp += index;
+ *tmp = crc;
+ kunmap(io_ctl->pages[0]);
+}
+
+static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
+{
+ u32 *tmp, val;
+ u32 crc = ~(u32)0;
+ unsigned offset = 0;
+
+ if (!io_ctl->check_crcs) {
+ io_ctl_map_page(io_ctl, 0);
+ return 0;
+ }
+
+ if (index == 0)
+ offset = sizeof(u32) * io_ctl->num_pages;
+
+ tmp = kmap(io_ctl->pages[0]);
+ tmp += index;
+ val = *tmp;
+ kunmap(io_ctl->pages[0]);
+
+ io_ctl_map_page(io_ctl, 0);
+ crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+ PAGE_CACHE_SIZE - offset);
+ btrfs_csum_final(crc, (char *)&crc);
+ if (val != crc) {
+ printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
+ "space cache\n");
+ io_ctl_unmap_page(io_ctl);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
+ void *bitmap)
+{
+ struct btrfs_free_space_entry *entry;
+
+ if (!io_ctl->cur)
+ return -ENOSPC;
+
+ entry = io_ctl->cur;
+ entry->offset = cpu_to_le64(offset);
+ entry->bytes = cpu_to_le64(bytes);
+ entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
+ BTRFS_FREE_SPACE_EXTENT;
+ io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+ io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+ if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+ return 0;
+
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+
+ /* No more pages to map */
+ if (io_ctl->index >= io_ctl->num_pages)
+ return 0;
+
+ /* map the next page */
+ io_ctl_map_page(io_ctl, 1);
+ return 0;
+}
+
+static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
+{
+ if (!io_ctl->cur)
+ return -ENOSPC;
+
+ /*
+ * If we aren't at the start of the current page, unmap this one and
+ * map the next one if there is any left.
+ */
+ if (io_ctl->cur != io_ctl->orig) {
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ if (io_ctl->index >= io_ctl->num_pages)
+ return -ENOSPC;
+ io_ctl_map_page(io_ctl, 0);
+ }
+
+ memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ if (io_ctl->index < io_ctl->num_pages)
+ io_ctl_map_page(io_ctl, 0);
+ return 0;
+}
+
+static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
+{
+ /*
+ * If we're not on the boundary we know we've modified the page and we
+ * need to crc the page.
+ */
+ if (io_ctl->cur != io_ctl->orig)
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ else
+ io_ctl_unmap_page(io_ctl);
+
+ while (io_ctl->index < io_ctl->num_pages) {
+ io_ctl_map_page(io_ctl, 1);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ }
+}
+
+static int io_ctl_read_entry(struct io_ctl *io_ctl,
+ struct btrfs_free_space *entry, u8 *type)
+{
+ struct btrfs_free_space_entry *e;
+ int ret;
+
+ if (!io_ctl->cur) {
+ ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+ if (ret)
+ return ret;
+ }
+
+ e = io_ctl->cur;
+ entry->offset = le64_to_cpu(e->offset);
+ entry->bytes = le64_to_cpu(e->bytes);
+ *type = e->type;
+ io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+ io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+ if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+ return 0;
+
+ io_ctl_unmap_page(io_ctl);
+
+ return 0;
+}
+
+static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
+ struct btrfs_free_space *entry)
+{
+ int ret;
+
+ ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+ if (ret)
+ return ret;
+
+ memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
+ io_ctl_unmap_page(io_ctl);
+
+ return 0;
+}
+
int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_path *path, u64 offset)
{
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
- struct page *page;
- u32 *checksums = NULL, *crc;
- char *disk_crcs = NULL;
+ struct io_ctl io_ctl;
struct btrfs_key key;
+ struct btrfs_free_space *e, *n;
struct list_head bitmaps;
u64 num_entries;
u64 num_bitmaps;
u64 generation;
- u32 cur_crc = ~(u32)0;
- pgoff_t index = 0;
- unsigned long first_page_offset;
- int num_checksums;
+ u8 type;
int ret = 0;
INIT_LIST_HEAD(&bitmaps);
/* Nothing in the space cache, goodbye */
if (!i_size_read(inode))
- goto out;
+ return 0;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
@@ -264,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return 0;
else if (ret > 0) {
btrfs_release_path(path);
- ret = 0;
- goto out;
+ return 0;
}
ret = -1;
@@ -286,194 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
" not match free space cache generation (%llu)\n",
(unsigned long long)BTRFS_I(inode)->generation,
(unsigned long long)generation);
- goto out;
+ return 0;
}
if (!num_entries)
- goto out;
-
- /* Setup everything for doing checksumming */
- num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
- checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
- if (!checksums)
- goto out;
- first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
- disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
- if (!disk_crcs)
- goto out;
+ return 0;
+ io_ctl_init(&io_ctl, inode, root);
ret = readahead_cache(inode);
if (ret)
goto out;
- while (1) {
- struct btrfs_free_space_entry *entry;
- struct btrfs_free_space *e;
- void *addr;
- unsigned long offset = 0;
- unsigned long start_offset = 0;
- int need_loop = 0;
+ ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
+ if (ret)
+ goto out;
- if (!num_entries && !num_bitmaps)
- break;
+ ret = io_ctl_check_crc(&io_ctl, 0);
+ if (ret)
+ goto free_cache;
- if (index == 0) {
- start_offset = first_page_offset;
- offset = start_offset;
- }
+ ret = io_ctl_check_generation(&io_ctl, generation);
+ if (ret)
+ goto free_cache;
- page = grab_cache_page(inode->i_mapping, index);
- if (!page)
+ while (num_entries) {
+ e = kmem_cache_zalloc(btrfs_free_space_cachep,
+ GFP_NOFS);
+ if (!e)
goto free_cache;
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- page_cache_release(page);
- printk(KERN_ERR "btrfs: error reading free "
- "space cache\n");
- goto free_cache;
- }
- }
- addr = kmap(page);
-
- if (index == 0) {
- u64 *gen;
-
- memcpy(disk_crcs, addr, first_page_offset);
- gen = addr + (sizeof(u32) * num_checksums);
- if (*gen != BTRFS_I(inode)->generation) {
- printk(KERN_ERR "btrfs: space cache generation"
- " (%llu) does not match inode (%llu)\n",
- (unsigned long long)*gen,
- (unsigned long long)
- BTRFS_I(inode)->generation);
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
- goto free_cache;
- }
- crc = (u32 *)disk_crcs;
- }
- entry = addr + start_offset;
-
- /* First lets check our crc before we do anything fun */
- cur_crc = ~(u32)0;
- cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
- PAGE_CACHE_SIZE - start_offset);
- btrfs_csum_final(cur_crc, (char *)&cur_crc);
- if (cur_crc != *crc) {
- printk(KERN_ERR "btrfs: crc mismatch for page %lu\n",
- index);
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
+ ret = io_ctl_read_entry(&io_ctl, e, &type);
+ if (ret) {
+ kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
- crc++;
- while (1) {
- if (!num_entries)
- break;
+ if (!e->bytes) {
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
- need_loop = 1;
- e = kmem_cache_zalloc(btrfs_free_space_cachep,
- GFP_NOFS);
- if (!e) {
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
+ if (type == BTRFS_FREE_SPACE_EXTENT) {
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ spin_unlock(&ctl->tree_lock);
+ if (ret) {
+ printk(KERN_ERR "Duplicate entries in "
+ "free space cache, dumping\n");
+ kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
-
- e->offset = le64_to_cpu(entry->offset);
- e->bytes = le64_to_cpu(entry->bytes);
- if (!e->bytes) {
- kunmap(page);
- kmem_cache_free(btrfs_free_space_cachep, e);
- unlock_page(page);
- page_cache_release(page);
+ } else {
+ BUG_ON(!num_bitmaps);
+ num_bitmaps--;
+ e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!e->bitmap) {
+ kmem_cache_free(
+ btrfs_free_space_cachep, e);
goto free_cache;
}
-
- if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
- spin_lock(&ctl->tree_lock);
- ret = link_free_space(ctl, e);
- spin_unlock(&ctl->tree_lock);
- if (ret) {
- printk(KERN_ERR "Duplicate entries in "
- "free space cache, dumping\n");
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
- goto free_cache;
- }
- } else {
- e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
- if (!e->bitmap) {
- kunmap(page);
- kmem_cache_free(
- btrfs_free_space_cachep, e);
- unlock_page(page);
- page_cache_release(page);
- goto free_cache;
- }
- spin_lock(&ctl->tree_lock);
- ret = link_free_space(ctl, e);
- ctl->total_bitmaps++;
- ctl->op->recalc_thresholds(ctl);
- spin_unlock(&ctl->tree_lock);
- if (ret) {
- printk(KERN_ERR "Duplicate entries in "
- "free space cache, dumping\n");
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
- goto free_cache;
- }
- list_add_tail(&e->list, &bitmaps);
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ ctl->total_bitmaps++;
+ ctl->op->recalc_thresholds(ctl);
+ spin_unlock(&ctl->tree_lock);
+ if (ret) {
+ printk(KERN_ERR "Duplicate entries in "
+ "free space cache, dumping\n");
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
}
-
- num_entries--;
- offset += sizeof(struct btrfs_free_space_entry);
- if (offset + sizeof(struct btrfs_free_space_entry) >=
- PAGE_CACHE_SIZE)
- break;
- entry++;
+ list_add_tail(&e->list, &bitmaps);
}
- /*
- * We read an entry out of this page, we need to move on to the
- * next page.
- */
- if (need_loop) {
- kunmap(page);
- goto next;
- }
+ num_entries--;
+ }
- /*
- * We add the bitmaps at the end of the entries in order that
- * the bitmap entries are added to the cache.
- */
- e = list_entry(bitmaps.next, struct btrfs_free_space, list);
+ io_ctl_unmap_page(&io_ctl);
+
+ /*
+ * We add the bitmaps at the end of the entries in order that
+ * the bitmap entries are added to the cache.
+ */
+ list_for_each_entry_safe(e, n, &bitmaps, list) {
list_del_init(&e->list);
- memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
- kunmap(page);
- num_bitmaps--;
-next:
- unlock_page(page);
- page_cache_release(page);
- index++;
+ ret = io_ctl_read_bitmap(&io_ctl, e);
+ if (ret)
+ goto free_cache;
}
+ io_ctl_drop_pages(&io_ctl);
ret = 1;
out:
- kfree(checksums);
- kfree(disk_crcs);
+ io_ctl_free(&io_ctl);
return ret;
free_cache:
+ io_ctl_drop_pages(&io_ctl);
__btrfs_remove_free_space_cache(ctl);
goto out;
}
@@ -485,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
struct btrfs_root *root = fs_info->tree_root;
struct inode *inode;
struct btrfs_path *path;
- int ret;
+ int ret = 0;
bool matched;
u64 used = btrfs_block_group_used(&block_group->item);
@@ -517,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
return 0;
}
+ /* We may have converted the inode and made the cache invalid. */
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+ spin_unlock(&block_group->lock);
+ goto out;
+ }
+ spin_unlock(&block_group->lock);
+
ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
path, block_group->key.objectid);
btrfs_free_path(path);
@@ -550,6 +809,19 @@ out:
return ret;
}
+/**
+ * __btrfs_write_out_cache - write out cached info to an inode
+ * @root - the root the inode belongs to
+ * @ctl - the free space cache we are going to write out
+ * @block_group - the block_group for this cache if it belongs to a block_group
+ * @trans - the trans handle
+ * @path - the path to use
+ * @offset - the offset for the key we'll insert
+ *
+ * This function writes out a free space cache struct to disk for quick recovery
+ * on mount. This will return 0 if it was successfull in writing the cache out,
+ * and -1 if it was not.
+ */
int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group_cache *block_group,
@@ -560,64 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct extent_buffer *leaf;
struct rb_node *node;
struct list_head *pos, *n;
- struct page **pages;
- struct page *page;
struct extent_state *cached_state = NULL;
struct btrfs_free_cluster *cluster = NULL;
struct extent_io_tree *unpin = NULL;
+ struct io_ctl io_ctl;
struct list_head bitmap_list;
struct btrfs_key key;
u64 start, end, len;
- u64 bytes = 0;
- u32 *crc, *checksums;
- unsigned long first_page_offset;
- int index = 0, num_pages = 0;
int entries = 0;
int bitmaps = 0;
- int ret = -1;
- bool next_page = false;
- bool out_of_space = false;
+ int ret;
+ int err = -1;
INIT_LIST_HEAD(&bitmap_list);
- node = rb_first(&ctl->free_space_offset);
- if (!node)
- return 0;
-
if (!i_size_read(inode))
return -1;
- num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
-
- /* Since the first page has all of our checksums and our generation we
- * need to calculate the offset into the page that we can start writing
- * our entries.
- */
- first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
-
- filemap_write_and_wait(inode->i_mapping);
- btrfs_wait_ordered_range(inode, inode->i_size &
- ~(root->sectorsize - 1), (u64)-1);
-
- /* make sure we don't overflow that first page */
- if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
- /* this is really the same as running out of space, where we also return 0 */
- printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
- ret = 0;
- goto out_update;
- }
-
- /* We need a checksum per page. */
- crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
- if (!crc)
- return -1;
-
- pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
- if (!pages) {
- kfree(crc);
- return -1;
- }
+ io_ctl_init(&io_ctl, inode, root);
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list))
@@ -631,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
*/
unpin = root->fs_info->pinned_extents;
- /*
- * Lock all pages first so we can lock the extent safely.
- *
- * NOTE: Because we hold the ref the entire time we're going to write to
- * the page find_get_page should never fail, so we don't do a check
- * after find_get_page at this point. Just putting this here so people
- * know and don't freak out.
- */
- while (index < num_pages) {
- page = grab_cache_page(inode->i_mapping, index);
- if (!page) {
- int i;
-
- for (i = 0; i < num_pages; i++) {
- unlock_page(pages[i]);
- page_cache_release(pages[i]);
- }
- goto out_free;
- }
- pages[index] = page;
- index++;
- }
+ /* Lock all pages first so we can lock the extent safely. */
+ io_ctl_prepare_pages(&io_ctl, inode, 0);
- index = 0;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state, GFP_NOFS);
@@ -665,192 +876,112 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (block_group)
start = block_group->key.objectid;
- /* Write out the extent entries */
- do {
- struct btrfs_free_space_entry *entry;
- void *addr;
- unsigned long offset = 0;
- unsigned long start_offset = 0;
-
- next_page = false;
-
- if (index == 0) {
- start_offset = first_page_offset;
- offset = start_offset;
- }
+ node = rb_first(&ctl->free_space_offset);
+ if (!node && cluster) {
+ node = rb_first(&cluster->root);
+ cluster = NULL;
+ }
- if (index >= num_pages) {
- out_of_space = true;
- break;
- }
+ /* Make sure we can fit our crcs into the first page */
+ if (io_ctl.check_crcs &&
+ (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
+ WARN_ON(1);
+ goto out_nospc;
+ }
- page = pages[index];
+ io_ctl_set_generation(&io_ctl, trans->transid);
- addr = kmap(page);
- entry = addr + start_offset;
+ /* Write out the extent entries */
+ while (node) {
+ struct btrfs_free_space *e;
- memset(addr, 0, PAGE_CACHE_SIZE);
- while (node && !next_page) {
- struct btrfs_free_space *e;
+ e = rb_entry(node, struct btrfs_free_space, offset_index);
+ entries++;
- e = rb_entry(node, struct btrfs_free_space, offset_index);
- entries++;
+ ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
+ e->bitmap);
+ if (ret)
+ goto out_nospc;
- entry->offset = cpu_to_le64(e->offset);
- entry->bytes = cpu_to_le64(e->bytes);
- if (e->bitmap) {
- entry->type = BTRFS_FREE_SPACE_BITMAP;
- list_add_tail(&e->list, &bitmap_list);
- bitmaps++;
- } else {
- entry->type = BTRFS_FREE_SPACE_EXTENT;
- }
- node = rb_next(node);
- if (!node && cluster) {
- node = rb_first(&cluster->root);
- cluster = NULL;
- }
- offset += sizeof(struct btrfs_free_space_entry);
- if (offset + sizeof(struct btrfs_free_space_entry) >=
- PAGE_CACHE_SIZE)
- next_page = true;
- entry++;
+ if (e->bitmap) {
+ list_add_tail(&e->list, &bitmap_list);
+ bitmaps++;
}
+ node = rb_next(node);
+ if (!node && cluster) {
+ node = rb_first(&cluster->root);
+ cluster = NULL;
+ }
+ }
- /*
- * We want to add any pinned extents to our free space cache
- * so we don't leak the space
- */
- while (block_group && !next_page &&
- (start < block_group->key.objectid +
- block_group->key.offset)) {
- ret = find_first_extent_bit(unpin, start, &start, &end,
- EXTENT_DIRTY);
- if (ret) {
- ret = 0;
- break;
- }
-
- /* This pinned extent is out of our range */
- if (start >= block_group->key.objectid +
- block_group->key.offset)
- break;
-
- len = block_group->key.objectid +
- block_group->key.offset - start;
- len = min(len, end + 1 - start);
-
- entries++;
- entry->offset = cpu_to_le64(start);
- entry->bytes = cpu_to_le64(len);
- entry->type = BTRFS_FREE_SPACE_EXTENT;
-
- start = end + 1;
- offset += sizeof(struct btrfs_free_space_entry);
- if (offset + sizeof(struct btrfs_free_space_entry) >=
- PAGE_CACHE_SIZE)
- next_page = true;
- entry++;
+ /*
+ * We want to add any pinned extents to our free space cache
+ * so we don't leak the space
+ */
+ while (block_group && (start < block_group->key.objectid +
+ block_group->key.offset)) {
+ ret = find_first_extent_bit(unpin, start, &start, &end,
+ EXTENT_DIRTY);
+ if (ret) {
+ ret = 0;
+ break;
}
- *crc = ~(u32)0;
- *crc = btrfs_csum_data(root, addr + start_offset, *crc,
- PAGE_CACHE_SIZE - start_offset);
- kunmap(page);
- btrfs_csum_final(*crc, (char *)crc);
- crc++;
+ /* This pinned extent is out of our range */
+ if (start >= block_group->key.objectid +
+ block_group->key.offset)
+ break;
+
+ len = block_group->key.objectid +
+ block_group->key.offset - start;
+ len = min(len, end + 1 - start);
- bytes += PAGE_CACHE_SIZE;
+ entries++;
+ ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+ if (ret)
+ goto out_nospc;
- index++;
- } while (node || next_page);
+ start = end + 1;
+ }
/* Write out the bitmaps */
list_for_each_safe(pos, n, &bitmap_list) {
- void *addr;
struct btrfs_free_space *entry =
list_entry(pos, struct btrfs_free_space, list);
- if (index >= num_pages) {
- out_of_space = true;
- break;
- }
- page = pages[index];
-
- addr = kmap(page);
- memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
- *crc = ~(u32)0;
- *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
- kunmap(page);
- btrfs_csum_final(*crc, (char *)crc);
- crc++;
- bytes += PAGE_CACHE_SIZE;
-
+ ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
+ if (ret)
+ goto out_nospc;
list_del_init(&entry->list);
- index++;
- }
-
- if (out_of_space) {
- btrfs_drop_pages(pages, num_pages);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state,
- GFP_NOFS);
- ret = 0;
- goto out_free;
}
/* Zero out the rest of the pages just to make sure */
- while (index < num_pages) {
- void *addr;
-
- page = pages[index];
- addr = kmap(page);
- memset(addr, 0, PAGE_CACHE_SIZE);
- kunmap(page);
- bytes += PAGE_CACHE_SIZE;
- index++;
- }
-
- /* Write the checksums and trans id to the first page */
- {
- void *addr;
- u64 *gen;
-
- page = pages[0];
+ io_ctl_zero_remaining_pages(&io_ctl);
- addr = kmap(page);
- memcpy(addr, checksums, sizeof(u32) * num_pages);
- gen = addr + (sizeof(u32) * num_pages);
- *gen = trans->transid;
- kunmap(page);
- }
-
- ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
- bytes, &cached_state);
- btrfs_drop_pages(pages, num_pages);
+ ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ 0, i_size_read(inode), &cached_state);
+ io_ctl_drop_pages(&io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state, GFP_NOFS);
- if (ret) {
- ret = 0;
- goto out_free;
- }
+ if (ret)
+ goto out;
- BTRFS_I(inode)->generation = trans->transid;
- filemap_write_and_wait(inode->i_mapping);
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ goto out;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
key.type = 0;
- ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- ret = -1;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
- goto out_free;
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+ GFP_NOFS);
+ goto out;
}
leaf = path->nodes[0];
if (ret > 0) {
@@ -860,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
found_key.offset != offset) {
- ret = -1;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL,
- GFP_NOFS);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+ NULL, GFP_NOFS);
btrfs_release_path(path);
- goto out_free;
+ goto out;
}
}
+
+ BTRFS_I(inode)->generation = trans->transid;
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
btrfs_set_free_space_entries(leaf, header, entries);
@@ -877,19 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- ret = 1;
-
-out_free:
- kfree(checksums);
- kfree(pages);
-
-out_update:
- if (ret != 1) {
- invalidate_inode_pages2_range(inode->i_mapping, 0, index);
+ err = 0;
+out:
+ io_ctl_free(&io_ctl);
+ if (err) {
+ invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
- return ret;
+ return err;
+
+out_nospc:
+ list_for_each_safe(pos, n, &bitmap_list) {
+ struct btrfs_free_space *entry =
+ list_entry(pos, struct btrfs_free_space, list);
+ list_del_init(&entry->list);
+ }
+ io_ctl_drop_pages(&io_ctl);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ goto out;
}
int btrfs_write_out_cache(struct btrfs_root *root,
@@ -916,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
path, block_group->key.objectid);
- if (ret < 0) {
+ if (ret) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_ERROR;
spin_unlock(&block_group->lock);
ret = 0;
-
+#ifdef DEBUG
printk(KERN_ERR "btrfs: failed to write free space cace "
"for block group %llu\n", block_group->key.objectid);
+#endif
}
iput(inode);
@@ -1219,9 +1359,9 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
}
-static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info, u64 offset,
- u64 bytes)
+static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ u64 offset, u64 bytes)
{
unsigned long start, count;
@@ -1232,6 +1372,13 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
+}
+
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ __bitmap_clear_bits(ctl, info, offset, bytes);
ctl->free_space -= bytes;
}
@@ -1323,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
{
info->offset = offset_to_bitmap(ctl, offset);
info->bytes = 0;
+ INIT_LIST_HEAD(&info->list);
link_free_space(ctl, info);
ctl->total_bitmaps++;
@@ -1702,7 +1850,13 @@ again:
info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!info) {
- WARN_ON(1);
+ /* the tree logging code might be calling us before we
+ * have fully loaded the free space rbtree for this
+ * block group. So it is possible the entry won't
+ * be in the rbtree yet at all. The caching code
+ * will make sure not to put it in the rbtree if
+ * the logging code has pinned it.
+ */
goto out_lock;
}
}
@@ -1741,6 +1895,7 @@ again:
ctl->total_bitmaps--;
}
kmem_cache_free(btrfs_free_space_cachep, info);
+ ret = 0;
goto out_lock;
}
@@ -1748,7 +1903,8 @@ again:
unlink_free_space(ctl, info);
info->offset += bytes;
info->bytes -= bytes;
- link_free_space(ctl, info);
+ ret = link_free_space(ctl, info);
+ WARN_ON(ret);
goto out_lock;
}
@@ -2035,7 +2191,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
return 0;
ret = search_start;
- bitmap_clear_bits(ctl, entry, ret, bytes);
+ __bitmap_clear_bits(ctl, entry, ret, bytes);
return ret;
}
@@ -2090,7 +2246,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
continue;
}
} else {
-
ret = entry->offset;
entry->offset += bytes;
@@ -2165,6 +2320,7 @@ again:
if (!found) {
start = i;
+ cluster->max_size = 0;
found = true;
}
@@ -2308,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
- struct rb_node *node;
int ret = -ENOSPC;
+ u64 bitmap_offset = offset_to_bitmap(ctl, offset);
if (ctl->total_bitmaps == 0)
return -ENOSPC;
/*
- * First check our cached list of bitmaps and see if there is an entry
- * here that will work.
+ * The bitmap that covers offset won't be in the list unless offset
+ * is just its start offset.
*/
+ entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+ if (entry->offset != bitmap_offset) {
+ entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (entry && list_empty(&entry->list))
+ list_add(&entry->list, bitmaps);
+ }
+
list_for_each_entry(entry, bitmaps, list) {
if (entry->bytes < min_bytes)
continue;
@@ -2328,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
}
/*
- * If we do have entries on our list and we are here then we didn't find
- * anything, so go ahead and get the next entry after the last entry in
- * this list and start the search from there.
+ * The bitmaps list has all the bitmaps that record free space
+ * starting after offset, so no more search is required.
*/
- if (!list_empty(bitmaps)) {
- entry = list_entry(bitmaps->prev, struct btrfs_free_space,
- list);
- node = rb_next(&entry->offset_index);
- if (!node)
- return -ENOSPC;
- entry = rb_entry(node, struct btrfs_free_space, offset_index);
- goto search;
- }
-
- entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
- if (!entry)
- return -ENOSPC;
-
-search:
- node = &entry->offset_index;
- do {
- entry = rb_entry(node, struct btrfs_free_space, offset_index);
- node = rb_next(&entry->offset_index);
- if (!entry->bitmap)
- continue;
- if (entry->bytes < min_bytes)
- continue;
- ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
- bytes, min_bytes);
- } while (ret && node);
-
- return ret;
+ return -ENOSPC;
}
/*
@@ -2377,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
u64 offset, u64 bytes, u64 empty_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct list_head bitmaps;
struct btrfs_free_space *entry, *tmp;
+ LIST_HEAD(bitmaps);
u64 min_bytes;
int ret;
@@ -2417,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
goto out;
}
- INIT_LIST_HEAD(&bitmaps);
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
bytes, min_bytes);
if (ret)
@@ -2513,9 +2647,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
spin_unlock(&ctl->tree_lock);
if (bytes >= minlen) {
- int update_ret;
- update_ret = btrfs_update_reserved_bytes(block_group,
- bytes, 1, 1);
+ struct btrfs_space_info *space_info;
+ int update = 0;
+
+ space_info = block_group->space_info;
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (!block_group->ro) {
+ block_group->reserved += bytes;
+ space_info->bytes_reserved += bytes;
+ update = 1;
+ }
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
ret = btrfs_error_discard_extent(fs_info->extent_root,
start,
@@ -2523,9 +2667,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
&actually_trimmed);
btrfs_add_free_space(block_group, start, bytes);
- if (!update_ret)
- btrfs_update_reserved_bytes(block_group,
- bytes, 0, 1);
+ if (update) {
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (block_group->ro)
+ space_info->bytes_readonly += bytes;
+ block_group->reserved -= bytes;
+ space_info->bytes_reserved -= bytes;
+ spin_unlock(&space_info->lock);
+ spin_unlock(&block_group->lock);
+ }
if (ret)
break;
@@ -2684,9 +2835,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
return 0;
ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
- if (ret < 0)
+ if (ret) {
+ btrfs_delalloc_release_metadata(inode, inode->i_size);
+#ifdef DEBUG
printk(KERN_ERR "btrfs: failed to write free ino cache "
"for root %llu\n", root->root_key.objectid);
+#endif
+ }
iput(inode);
return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0..b3d1efe 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -207,24 +207,14 @@ again:
void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
-
again:
if (root->cached == BTRFS_CACHE_FINISHED) {
- __btrfs_add_free_space(ctl, objectid, 1);
+ __btrfs_add_free_space(pinned, objectid, 1);
} else {
- /*
- * If we are in the process of caching free ino chunks,
- * to avoid adding the same inode number to the free_ino
- * tree twice due to cross transaction, we'll leave it
- * in the pinned tree until a transaction is committed
- * or the caching work is done.
- */
-
mutex_lock(&root->fs_commit_mutex);
spin_lock(&root->cache_lock);
if (root->cached == BTRFS_CACHE_FINISHED) {
@@ -236,11 +226,7 @@ again:
start_caching(root);
- if (objectid <= root->cache_progress ||
- objectid > root->highest_objectid)
- __btrfs_add_free_space(ctl, objectid, 1);
- else
- __btrfs_add_free_space(pinned, objectid, 1);
+ __btrfs_add_free_space(pinned, objectid, 1);
mutex_unlock(&root->fs_commit_mutex);
}
@@ -258,6 +244,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
+ spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
struct btrfs_free_space *info;
struct rb_node *n;
u64 count;
@@ -266,24 +253,30 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
return;
while (1) {
+ bool add_to_ctl = true;
+
+ spin_lock(rbroot_lock);
n = rb_first(rbroot);
- if (!n)
+ if (!n) {
+ spin_unlock(rbroot_lock);
break;
+ }
info = rb_entry(n, struct btrfs_free_space, offset_index);
BUG_ON(info->bitmap);
if (info->offset > root->cache_progress)
- goto free;
+ add_to_ctl = false;
else if (info->offset + info->bytes > root->cache_progress)
count = root->cache_progress - info->offset + 1;
else
count = info->bytes;
- __btrfs_add_free_space(ctl, info->offset, count);
-free:
rb_erase(&info->offset_index, rbroot);
- kfree(info);
+ spin_unlock(rbroot_lock);
+ if (add_to_ctl)
+ __btrfs_add_free_space(ctl, info->offset, count);
+ kmem_cache_free(btrfs_free_space_cachep, info);
}
}
@@ -398,6 +391,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_path *path;
struct inode *inode;
+ struct btrfs_block_rsv *rsv;
+ u64 num_bytes;
u64 alloc_hint = 0;
int ret;
int prealloc;
@@ -421,11 +416,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
if (!path)
return -ENOMEM;
+ rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+ num_bytes = trans->bytes_reserved;
+ /*
+ * 1 item for inode item insertion if need
+ * 3 items for inode item update (in the worst case)
+ * 1 item for free space object
+ * 3 items for pre-allocation
+ */
+ trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
+ ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
+ trans->bytes_reserved);
+ if (ret)
+ goto out;
again:
inode = lookup_free_ino_inode(root, path);
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
ret = PTR_ERR(inode);
- goto out;
+ goto out_release;
}
if (IS_ERR(inode)) {
@@ -434,7 +444,7 @@ again:
ret = create_free_ino_inode(root, trans, path);
if (ret)
- goto out;
+ goto out_release;
goto again;
}
@@ -465,21 +475,26 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_CACHE_SIZE;
- ret = btrfs_check_data_free_space(inode, prealloc);
+ ret = btrfs_delalloc_reserve_space(inode, prealloc);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
- if (ret)
+ if (ret) {
+ btrfs_delalloc_release_space(inode, prealloc);
goto out_put;
+ }
btrfs_free_reserved_data_space(inode, prealloc);
+ ret = btrfs_write_out_ino_cache(root, trans, path);
out_put:
iput(inode);
+out_release:
+ btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
out:
- if (ret == 0)
- ret = btrfs_write_out_ino_cache(root, trans, path);
+ trans->block_rsv = rsv;
+ trans->bytes_reserved = num_bytes;
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d42e6bf..cb10cb9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <linux/mount.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -45,17 +46,17 @@
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
-#include "volumes.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
+#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
struct btrfs_iget_args {
- u64 ino;
+ struct btrfs_key *location;
struct btrfs_root *root;
};
@@ -93,6 +94,8 @@ static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock);
+static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
@@ -340,6 +343,7 @@ static noinline int compress_file_range(struct inode *inode,
int i;
int will_compress;
int compress_type = root->fs_info->compress_type;
+ int redirty = 0;
/* if this is a small write inside eof, kick off a defragbot */
if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
@@ -393,11 +397,25 @@ again:
(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
- BUG_ON(!pages);
+ if (!pages) {
+ /* just bail out to the uncompressed code */
+ goto cont;
+ }
if (BTRFS_I(inode)->force_compress)
compress_type = BTRFS_I(inode)->force_compress;
+ /*
+ * we need to call clear_page_dirty_for_io on each
+ * page in the range. Otherwise applications with the file
+ * mmap'd can wander in and change the page contents while
+ * we are compressing them.
+ *
+ * If the compression fails for any reason, we set the pages
+ * dirty again later on.
+ */
+ extent_range_clear_dirty_for_io(inode, start, end);
+ redirty = 1;
ret = btrfs_compress_pages(compress_type,
inode->i_mapping, start,
total_compressed, pages,
@@ -424,6 +442,7 @@ again:
will_compress = 1;
}
}
+cont:
if (start == 0) {
trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
@@ -534,6 +553,8 @@ cleanup_and_bail_uncompressed:
__set_page_dirty_nobuffers(locked_page);
/* unlocked later on in the async handlers */
}
+ if (redirty)
+ extent_range_redirty_for_io(inode, start, end);
add_async_extent(async_cow, start, end - start + 1,
0, NULL, 0, BTRFS_COMPRESS_NONE);
*num_added += 1;
@@ -750,15 +771,6 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
return alloc_hint;
}
-static inline bool is_free_space_inode(struct btrfs_root *root,
- struct inode *inode)
-{
- if (root == root->fs_info->tree_root ||
- BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
- return true;
- return false;
-}
-
/*
* when extent_io.c finds a delayed allocation range in the file,
* the call backs end up in this code. The basic idea is to
@@ -791,7 +803,7 @@ static noinline int cow_file_range(struct inode *inode,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
int ret = 0;
- BUG_ON(is_free_space_inode(root, inode));
+ BUG_ON(btrfs_is_free_space_inode(root, inode));
trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -829,7 +841,7 @@ static noinline int cow_file_range(struct inode *inode,
}
BUG_ON(disk_num_bytes >
- btrfs_super_total_bytes(&root->fs_info->super_copy));
+ btrfs_super_total_bytes(root->fs_info->super_copy));
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1070,9 +1082,10 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
- nolock = is_free_space_inode(root, inode);
+ nolock = btrfs_is_free_space_inode(root, inode);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
@@ -1291,15 +1304,16 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
return ret;
}
-static int btrfs_split_extent_hook(struct inode *inode,
- struct extent_state *orig, u64 split)
+static void btrfs_split_extent_hook(struct inode *inode,
+ struct extent_state *orig, u64 split)
{
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
- return 0;
+ return;
- atomic_inc(&BTRFS_I(inode)->outstanding_extents);
- return 0;
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
}
/*
@@ -1308,16 +1322,17 @@ static int btrfs_split_extent_hook(struct inode *inode,
* extents, such as when we are doing sequential writes, so we can properly
* account for the metadata space we'll need.
*/
-static int btrfs_merge_extent_hook(struct inode *inode,
- struct extent_state *new,
- struct extent_state *other)
+static void btrfs_merge_extent_hook(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other)
{
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
- return 0;
+ return;
- atomic_dec(&BTRFS_I(inode)->outstanding_extents);
- return 0;
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->lock);
}
/*
@@ -1325,8 +1340,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
* bytes in this file, and to maintain the list of inodes that
* have pending delalloc work to be done.
*/
-static int btrfs_set_bit_hook(struct inode *inode,
- struct extent_state *state, int *bits)
+static void btrfs_set_bit_hook(struct inode *inode,
+ struct extent_state *state, int *bits)
{
/*
@@ -1337,12 +1352,15 @@ static int btrfs_set_bit_hook(struct inode *inode,
if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- bool do_list = !is_free_space_inode(root, inode);
+ bool do_list = !btrfs_is_free_space_inode(root, inode);
- if (*bits & EXTENT_FIRST_DELALLOC)
+ if (*bits & EXTENT_FIRST_DELALLOC) {
*bits &= ~EXTENT_FIRST_DELALLOC;
- else
- atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+ } else {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
spin_lock(&root->fs_info->delalloc_lock);
BTRFS_I(inode)->delalloc_bytes += len;
@@ -1353,14 +1371,13 @@ static int btrfs_set_bit_hook(struct inode *inode,
}
spin_unlock(&root->fs_info->delalloc_lock);
}
- return 0;
}
/*
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
-static int btrfs_clear_bit_hook(struct inode *inode,
- struct extent_state *state, int *bits)
+static void btrfs_clear_bit_hook(struct inode *inode,
+ struct extent_state *state, int *bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
@@ -1370,12 +1387,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- bool do_list = !is_free_space_inode(root, inode);
+ bool do_list = !btrfs_is_free_space_inode(root, inode);
- if (*bits & EXTENT_FIRST_DELALLOC)
+ if (*bits & EXTENT_FIRST_DELALLOC) {
*bits &= ~EXTENT_FIRST_DELALLOC;
- else if (!(*bits & EXTENT_DO_ACCOUNTING))
- atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+ } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
if (*bits & EXTENT_DO_ACCOUNTING)
btrfs_delalloc_release_metadata(inode, len);
@@ -1394,7 +1414,6 @@ static int btrfs_clear_bit_hook(struct inode *inode,
}
spin_unlock(&root->fs_info->delalloc_lock);
}
- return 0;
}
/*
@@ -1477,7 +1496,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- if (is_free_space_inode(root, inode))
+ if (btrfs_is_free_space_inode(root, inode))
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
else
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
@@ -1644,7 +1663,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
int ret;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
path->leave_spinning = 1;
@@ -1726,7 +1746,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
return 0;
BUG_ON(!ordered_extent);
- nolock = is_free_space_inode(root, inode);
+ nolock = btrfs_is_free_space_inode(root, inode);
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list));
@@ -1738,7 +1758,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, root, inode);
BUG_ON(ret);
}
goto out;
@@ -1787,18 +1807,18 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
&ordered_extent->list);
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- if (!ret) {
- ret = btrfs_update_inode(trans, root, inode);
+ if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+ ret = btrfs_update_inode_fallback(trans, root, inode);
BUG_ON(ret);
}
ret = 0;
out:
- if (nolock) {
- if (trans)
- btrfs_end_transaction_nolock(trans, root);
- } else {
+ if (root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
- if (trans)
+ if (trans) {
+ if (nolock)
+ btrfs_end_transaction_nolock(trans, root);
+ else
btrfs_end_transaction(trans, root);
}
@@ -1820,153 +1840,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
}
/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
- struct page *page;
- u64 start;
- u64 len;
- u64 logical;
- unsigned long bio_flags;
- int last_mirror;
-};
-
-static int btrfs_io_failed_hook(struct bio *failed_bio,
- struct page *page, u64 start, u64 end,
- struct extent_state *state)
-{
- struct io_failure_record *failrec = NULL;
- u64 private;
- struct extent_map *em;
- struct inode *inode = page->mapping->host;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct bio *bio;
- int num_copies;
- int ret;
- int rw;
- u64 logical;
-
- ret = get_state_private(failure_tree, start, &private);
- if (ret) {
- failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
- if (!failrec)
- return -ENOMEM;
- failrec->start = start;
- failrec->len = end - start + 1;
- failrec->last_mirror = 0;
- failrec->bio_flags = 0;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, failrec->len);
- if (em->start > start || em->start + em->len < start) {
- free_extent_map(em);
- em = NULL;
- }
- read_unlock(&em_tree->lock);
-
- if (IS_ERR_OR_NULL(em)) {
- kfree(failrec);
- return -EIO;
- }
- logical = start - em->start;
- logical = em->block_start + logical;
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- logical = em->block_start;
- failrec->bio_flags = EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&failrec->bio_flags,
- em->compress_type);
- }
- failrec->logical = logical;
- free_extent_map(em);
- set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
- EXTENT_DIRTY, GFP_NOFS);
- set_state_private(failure_tree, start,
- (u64)(unsigned long)failrec);
- } else {
- failrec = (struct io_failure_record *)(unsigned long)private;
- }
- num_copies = btrfs_num_copies(
- &BTRFS_I(inode)->root->fs_info->mapping_tree,
- failrec->logical, failrec->len);
- failrec->last_mirror++;
- if (!state) {
- spin_lock(&BTRFS_I(inode)->io_tree.lock);
- state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
- failrec->start,
- EXTENT_LOCKED);
- if (state && state->start != failrec->start)
- state = NULL;
- spin_unlock(&BTRFS_I(inode)->io_tree.lock);
- }
- if (!state || failrec->last_mirror > num_copies) {
- set_state_private(failure_tree, failrec->start, 0);
- clear_extent_bits(failure_tree, failrec->start,
- failrec->start + failrec->len - 1,
- EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
- kfree(failrec);
- return -EIO;
- }
- bio = bio_alloc(GFP_NOFS, 1);
- bio->bi_private = state;
- bio->bi_end_io = failed_bio->bi_end_io;
- bio->bi_sector = failrec->logical >> 9;
- bio->bi_bdev = failed_bio->bi_bdev;
- bio->bi_size = 0;
-
- bio_add_page(bio, page, failrec->len, start - page_offset(page));
- if (failed_bio->bi_rw & REQ_WRITE)
- rw = WRITE;
- else
- rw = READ;
-
- ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
- failrec->last_mirror,
- failrec->bio_flags, 0);
- return ret;
-}
-
-/*
- * each time an IO finishes, we do a fast check in the IO failure tree
- * to see if we need to process or clean up an io_failure_record
- */
-static int btrfs_clean_io_failures(struct inode *inode, u64 start)
-{
- u64 private;
- u64 private_failure;
- struct io_failure_record *failure;
- int ret;
-
- private = 0;
- if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
- (u64)-1, 1, EXTENT_DIRTY, 0)) {
- ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
- start, &private_failure);
- if (ret == 0) {
- failure = (struct io_failure_record *)(unsigned long)
- private_failure;
- set_state_private(&BTRFS_I(inode)->io_failure_tree,
- failure->start, 0);
- clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
- failure->start,
- failure->start + failure->len - 1,
- EXTENT_DIRTY | EXTENT_LOCKED,
- GFP_NOFS);
- kfree(failure);
- }
- }
- return 0;
-}
-
-/*
* when reads are done, we need to check csums to verify the data is correct
- * if there's a match, we allow the bio to finish. If not, we go through
- * the io_failure_record routines to find good copies
+ * if there's a match, we allow the bio to finish. If not, the code in
+ * extent_io.c will try to find good copies for us.
*/
static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state)
@@ -2012,10 +1888,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
kunmap_atomic(kaddr, KM_USER0);
good:
- /* if the io failure tree for this inode is non-empty,
- * check to see if we've recovered from a failed IO
- */
- btrfs_clean_io_failures(inode, start);
return 0;
zeroit:
@@ -2080,89 +1952,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
up_read(&root->fs_info->cleanup_work_sem);
}
-/*
- * calculate extra metadata reservation when snapshotting a subvolume
- * contains orphan files.
- */
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
- u64 *bytes_to_reserve)
-{
- struct btrfs_root *root;
- struct btrfs_block_rsv *block_rsv;
- u64 num_bytes;
- int index;
-
- root = pending->root;
- if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
- return;
-
- block_rsv = root->orphan_block_rsv;
-
- /* orphan block reservation for the snapshot */
- num_bytes = block_rsv->size;
-
- /*
- * after the snapshot is created, COWing tree blocks may use more
- * space than it frees. So we should make sure there is enough
- * reserved space.
- */
- index = trans->transid & 0x1;
- if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
- num_bytes += block_rsv->size -
- (block_rsv->reserved + block_rsv->freed[index]);
- }
-
- *bytes_to_reserve += num_bytes;
-}
-
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending)
-{
- struct btrfs_root *root = pending->root;
- struct btrfs_root *snap = pending->snap;
- struct btrfs_block_rsv *block_rsv;
- u64 num_bytes;
- int index;
- int ret;
-
- if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
- return;
-
- /* refill source subvolume's orphan block reservation */
- block_rsv = root->orphan_block_rsv;
- index = trans->transid & 0x1;
- if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
- num_bytes = block_rsv->size -
- (block_rsv->reserved + block_rsv->freed[index]);
- ret = btrfs_block_rsv_migrate(&pending->block_rsv,
- root->orphan_block_rsv,
- num_bytes);
- BUG_ON(ret);
- }
-
- /* setup orphan block reservation for the snapshot */
- block_rsv = btrfs_alloc_block_rsv(snap);
- BUG_ON(!block_rsv);
-
- btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
- snap->orphan_block_rsv = block_rsv;
-
- num_bytes = root->orphan_block_rsv->size;
- ret = btrfs_block_rsv_migrate(&pending->block_rsv,
- block_rsv, num_bytes);
- BUG_ON(ret);
-
-#if 0
- /* insert orphan item for the snapshot */
- WARN_ON(!root->orphan_item_inserted);
- ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
- snap->root_key.objectid);
- BUG_ON(ret);
- snap->orphan_item_inserted = 1;
-#endif
-}
-
enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_STARTED = 1,
ORPHAN_CLEANUP_DONE = 2,
@@ -2214,7 +2003,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
if (!root->orphan_block_rsv) {
block_rsv = btrfs_alloc_block_rsv(root);
- BUG_ON(!block_rsv);
+ if (!block_rsv)
+ return -ENOMEM;
}
spin_lock(&root->orphan_lock);
@@ -2247,9 +2037,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
}
spin_unlock(&root->orphan_lock);
- if (block_rsv)
- btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
-
/* grab metadata reservation from transaction handle */
if (reserve) {
ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2259,7 +2046,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
/* insert an orphan item to track this unlinked/truncated file */
if (insert >= 1) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
- BUG_ON(ret);
+ BUG_ON(ret && ret != -EEXIST);
}
/* insert an orphan item to track subvolume contains orphan files */
@@ -2316,6 +2103,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
struct inode *inode;
+ u64 last_objectid = 0;
int ret = 0, nr_unlink = 0, nr_truncate = 0;
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2155,81 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* crossing root thing. we store the inode number in the
* offset of the orphan item.
*/
+
+ if (found_key.offset == last_objectid) {
+ printk(KERN_ERR "btrfs: Error removing orphan entry, "
+ "stopping orphan cleanup\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ last_objectid = found_key.offset;
+
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
+ ret = PTR_RET(inode);
+ if (ret && ret != -ESTALE)
goto out;
- }
- /*
- * add this inode to the orphan list so btrfs_orphan_del does
- * the proper thing when we hit it
- */
- spin_lock(&root->orphan_lock);
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
- spin_unlock(&root->orphan_lock);
+ if (ret == -ESTALE && root == root->fs_info->tree_root) {
+ struct btrfs_root *dead_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int is_dead_root = 0;
+ /*
+ * this is an orphan in the tree root. Currently these
+ * could come from 2 sources:
+ * a) a snapshot deletion in progress
+ * b) a free space cache inode
+ * We need to distinguish those two, as the snapshot
+ * orphan must not get deleted.
+ * find_dead_roots already ran before us, so if this
+ * is a snapshot deletion, we should find the root
+ * in the dead_roots list
+ */
+ spin_lock(&fs_info->trans_lock);
+ list_for_each_entry(dead_root, &fs_info->dead_roots,
+ root_list) {
+ if (dead_root->root_key.objectid ==
+ found_key.objectid) {
+ is_dead_root = 1;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (is_dead_root) {
+ /* prevent this orphan from being found again */
+ key.offset = found_key.objectid - 1;
+ continue;
+ }
+ }
/*
- * if this is a bad inode, means we actually succeeded in
- * removing the inode, but not the orphan record, which means
- * we need to manually delete the orphan since iput will just
- * do a destroy_inode
+ * Inode is already gone but the orphan item is still there,
+ * kill the orphan item.
*/
- if (is_bad_inode(inode)) {
- trans = btrfs_start_transaction(root, 0);
+ if (ret == -ESTALE) {
+ trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
- btrfs_orphan_del(trans, inode);
+ ret = btrfs_del_orphan_item(trans, root,
+ found_key.objectid);
+ BUG_ON(ret);
btrfs_end_transaction(trans, root);
- iput(inode);
continue;
}
+ /*
+ * add this inode to the orphan list so btrfs_orphan_del does
+ * the proper thing when we hit it
+ */
+ spin_lock(&root->orphan_lock);
+ list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ spin_unlock(&root->orphan_lock);
+
/* if we have links, this was a truncate, lets do that */
if (inode->i_nlink) {
if (!S_ISREG(inode->i_mode)) {
@@ -2410,7 +2238,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
continue;
}
nr_truncate++;
+ /*
+ * Need to hold the imutex for reservation purposes, not
+ * a huge deal here but I have a WARN_ON in
+ * btrfs_delalloc_reserve_space to catch offenders.
+ */
+ mutex_lock(&inode->i_mutex);
ret = btrfs_truncate(inode);
+ mutex_unlock(&inode->i_mutex);
} else {
nr_unlink++;
}
@@ -2420,6 +2255,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
if (ret)
goto out;
}
+ /* release the path since we're done with it */
+ btrfs_release_path(path);
+
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
if (root->orphan_block_rsv)
@@ -2516,7 +2354,9 @@ static void btrfs_read_locked_inode(struct inode *inode)
filled = true;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ goto make_bad;
+
path->leave_spinning = 1;
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
@@ -2531,15 +2371,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
- if (!leaf->map_token)
- map_private_extent_buffer(leaf, (unsigned long)inode_item,
- sizeof(struct btrfs_inode_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
-
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
- inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
+ set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
inode->i_uid = btrfs_inode_uid(leaf, inode_item);
inode->i_gid = btrfs_inode_gid(leaf, inode_item);
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
@@ -2575,11 +2408,6 @@ cache_acl:
if (!maybe_acls)
cache_no_acl(inode);
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
- }
-
btrfs_free_path(path);
switch (inode->i_mode & S_IFMT) {
@@ -2624,13 +2452,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode)
{
- if (!leaf->map_token)
- map_private_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_inode_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
-
btrfs_set_inode_uid(leaf, item, inode->i_uid);
btrfs_set_inode_gid(leaf, item, inode->i_gid);
btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2659,17 +2480,12 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
btrfs_set_inode_block_group(leaf, item, 0);
-
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
- }
}
/*
* copy everything in the in-memory inode into the btree.
*/
-noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode)
{
struct btrfs_inode_item *inode_item;
@@ -2677,21 +2493,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int ret;
- /*
- * If the inode is a free space inode, we can deadlock during commit
- * if we put it into the delayed code.
- *
- * The data relocation inode should also be directly updated
- * without delay
- */
- if (!is_free_space_inode(root, inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
- ret = btrfs_delayed_update_inode(trans, root, inode);
- if (!ret)
- btrfs_set_inode_last_trans(trans, inode);
- return ret;
- }
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2720,6 +2521,43 @@ failed:
}
/*
+ * copy everything in the in-memory inode into the btree.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ int ret;
+
+ /*
+ * If the inode is a free space inode, we can deadlock during commit
+ * if we put it into the delayed code.
+ *
+ * The data relocation inode should also be directly updated
+ * without delay
+ */
+ if (!btrfs_is_free_space_inode(root, inode)
+ && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ ret = btrfs_delayed_update_inode(trans, root, inode);
+ if (!ret)
+ btrfs_set_inode_last_trans(trans, inode);
+ return ret;
+ }
+
+ return btrfs_update_inode_item(trans, root, inode);
+}
+
+static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ int ret;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret == -ENOSPC)
+ return btrfs_update_inode_item(trans, root, inode);
+ return ret;
+}
+
+/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
* also drops the back refs in the inode to the directory
@@ -2857,7 +2695,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
u64 ino = btrfs_ino(inode);
u64 dir_ino = btrfs_ino(dir);
- trans = btrfs_start_transaction(root, 10);
+ /*
+ * 1 for the possible orphan item
+ * 1 for the dir item
+ * 1 for the dir index
+ * 1 for the inode ref
+ * 1 for the inode ref in the tree log
+ * 2 for the dir entries in the log
+ * 1 for the inode
+ */
+ trans = btrfs_start_transaction(root, 8);
if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
return trans;
@@ -2880,7 +2727,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
return ERR_PTR(-ENOMEM);
}
- trans = btrfs_start_transaction(root, 0);
+ /* 1 for the orphan item */
+ trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_free_path(path);
root->fs_info->enospc_unlink = 0;
@@ -2985,6 +2833,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
err = 0;
out:
btrfs_free_path(path);
+ /* Migrate the orphan reservation over */
+ if (!err)
+ err = btrfs_block_rsv_migrate(trans->block_rsv,
+ &root->fs_info->global_block_rsv,
+ trans->bytes_reserved);
+
if (err) {
btrfs_end_transaction(trans, root);
root->fs_info->enospc_unlink = 0;
@@ -2999,6 +2853,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+ btrfs_block_rsv_release(root, trans->block_rsv,
+ trans->bytes_reserved);
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
BUG_ON(!root->fs_info->enospc_unlink);
root->fs_info->enospc_unlink = 0;
}
@@ -3021,13 +2878,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
- BUG_ON(ret);
+ if (ret)
+ goto out;
if (inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, inode);
- BUG_ON(ret);
+ if (ret)
+ goto out;
}
+out:
nr = trans->blocks_used;
__unlink_end_trans(trans, root);
btrfs_btree_balance_dirty(root, nr);
@@ -3170,6 +3030,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = -1;
+
if (root->ref_cows || root == root->fs_info->tree_root)
btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
@@ -3182,10 +3047,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
if (min_type == 0 && root == BTRFS_I(inode)->root)
btrfs_kill_delayed_inode_items(inode);
- path = btrfs_alloc_path();
- BUG_ON(!path);
- path->reada = -1;
-
key.objectid = ino;
key.offset = (u64)-1;
key.type = (u8)-1;
@@ -3386,6 +3247,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
pgoff_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
struct page *page;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
int ret = 0;
u64 page_start;
u64 page_end;
@@ -3398,7 +3260,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
ret = -ENOMEM;
again:
- page = grab_cache_page(mapping, index);
+ page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
goto out;
@@ -3519,7 +3381,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
u64 hint_byte = 0;
hole_size = last_byte - cur_offset;
- trans = btrfs_start_transaction(root, 2);
+ trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
break;
@@ -3528,19 +3390,26 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
err = btrfs_drop_extents(trans, inode, cur_offset,
cur_offset + hole_size,
&hint_byte, 1);
- if (err)
+ if (err) {
+ btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
break;
+ }
err = btrfs_insert_file_extent(trans, root,
btrfs_ino(inode), cur_offset, 0,
0, hole_size, 0, hole_size,
0, 0, 0);
- if (err)
+ if (err) {
+ btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
break;
+ }
btrfs_drop_extent_cache(inode, hole_start,
last_byte - 1, 0);
+ btrfs_update_inode(trans, root, inode);
btrfs_end_transaction(trans, root);
}
free_extent_map(em);
@@ -3558,6 +3427,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
static int btrfs_setsize(struct inode *inode, loff_t newsize)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
loff_t oldsize = i_size_read(inode);
int ret;
@@ -3565,16 +3436,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
return 0;
if (newsize > oldsize) {
- i_size_write(inode, newsize);
- btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
truncate_pagecache(inode, oldsize, newsize);
ret = btrfs_cont_expand(inode, oldsize, newsize);
- if (ret) {
- btrfs_setsize(inode, oldsize);
+ if (ret)
return ret;
- }
- mark_inode_dirty(inode);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ i_size_write(inode, newsize);
+ btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction_throttle(trans, root);
} else {
/*
@@ -3614,9 +3488,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid) {
setattr_copy(inode, attr);
- mark_inode_dirty(inode);
+ err = btrfs_dirty_inode(inode);
- if (attr->ia_valid & ATTR_MODE)
+ if (!err && attr->ia_valid & ATTR_MODE)
err = btrfs_acl_chmod(inode);
}
@@ -3627,6 +3501,8 @@ void btrfs_evict_inode(struct inode *inode)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *rsv, *global_rsv;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
unsigned long nr;
int ret;
@@ -3634,7 +3510,7 @@ void btrfs_evict_inode(struct inode *inode)
truncate_inode_pages(&inode->i_data, 0);
if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
- is_free_space_inode(root, inode)))
+ btrfs_is_free_space_inode(root, inode)))
goto no_delete;
if (is_bad_inode(inode)) {
@@ -3642,7 +3518,8 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (!special_file(inode->i_mode))
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (root->fs_info->log_root_recovering) {
BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
@@ -3654,22 +3531,55 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
+ rsv = btrfs_alloc_block_rsv(root);
+ if (!rsv) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+ rsv->size = min_size;
+ global_rsv = &root->fs_info->global_block_rsv;
+
btrfs_i_size_write(inode, 0);
+ /*
+ * This is a bit simpler than btrfs_truncate since
+ *
+ * 1) We've already reserved our space for our orphan item in the
+ * unlink.
+ * 2) We're going to delete the inode item, so we don't need to update
+ * it at all.
+ *
+ * So we just need to reserve some slack space in case we add bytes when
+ * doing the truncate.
+ */
while (1) {
- trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
- trans->block_rsv = root->orphan_block_rsv;
+ ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
+
+ /*
+ * Try and steal from the global reserve since we will
+ * likely not use this space anyway, we want to try as
+ * hard as possible to get this to work.
+ */
+ if (ret)
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
- ret = btrfs_block_rsv_check(trans, root,
- root->orphan_block_rsv, 0, 5);
if (ret) {
- BUG_ON(ret != -EAGAIN);
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
- continue;
+ printk(KERN_WARNING "Could not get space for a "
+ "delete, will truncate on mount %d\n", ret);
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
+ }
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
}
+ trans->block_rsv = rsv;
+
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
if (ret != -EAGAIN)
break;
@@ -3678,14 +3588,17 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans, root);
trans = NULL;
btrfs_btree_balance_dirty(root, nr);
-
}
+ btrfs_free_block_rsv(root, rsv);
+
if (ret == 0) {
+ trans->block_rsv = root->orphan_block_rsv;
ret = btrfs_orphan_del(trans, inode);
BUG_ON(ret);
}
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
if (!(root == root->fs_info->tree_root ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
btrfs_return_ino(root, btrfs_ino(inode));
@@ -3713,7 +3626,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
int ret = 0;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
namelen, 0);
@@ -3934,7 +3848,9 @@ again:
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
- inode->i_ino = args->ino;
+ inode->i_ino = args->location->objectid;
+ memcpy(&BTRFS_I(inode)->location, args->location,
+ sizeof(*args->location));
BTRFS_I(inode)->root = args->root;
btrfs_set_inode_space_info(args->root, inode);
return 0;
@@ -3943,20 +3859,20 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->ino == btrfs_ino(inode) &&
+ return args->location->objectid == BTRFS_I(inode)->location.objectid &&
args->root == BTRFS_I(inode)->root;
}
static struct inode *btrfs_iget_locked(struct super_block *s,
- u64 objectid,
+ struct btrfs_key *location,
struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
- args.ino = objectid;
+ args.location = location;
args.root = root;
- inode = iget5_locked(s, objectid, btrfs_find_actor,
+ inode = iget5_locked(s, location->objectid, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
@@ -3970,18 +3886,22 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
{
struct inode *inode;
- inode = btrfs_iget_locked(s, location->objectid, root);
+ inode = btrfs_iget_locked(s, location, root);
if (!inode)
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
- BTRFS_I(inode)->root = root;
- memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
btrfs_read_locked_inode(inode);
- inode_tree_add(inode);
- unlock_new_inode(inode);
- if (new)
- *new = 1;
+ if (!is_bad_inode(inode)) {
+ inode_tree_add(inode);
+ unlock_new_inode(inode);
+ if (new)
+ *new = 1;
+ } else {
+ unlock_new_inode(inode);
+ iput(inode);
+ inode = ERR_PTR(-ESTALE);
+ }
}
return inode;
@@ -4016,12 +3936,20 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
struct btrfs_root *sub_root = root;
struct btrfs_key location;
int index;
- int ret;
+ int ret = 0;
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- ret = btrfs_inode_by_name(dir, dentry, &location);
+ if (unlikely(d_need_lookup(dentry))) {
+ memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
+ kfree(dentry->d_fsdata);
+ dentry->d_fsdata = NULL;
+ /* This thing is hashed, drop it for now */
+ d_drop(dentry);
+ } else {
+ ret = btrfs_inode_by_name(dir, dentry, &location);
+ }
if (ret < 0)
return ERR_PTR(ret);
@@ -4076,16 +4004,24 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
return 0;
}
+static void btrfs_dentry_release(struct dentry *dentry)
+{
+ if (dentry->d_fsdata)
+ kfree(dentry->d_fsdata);
+}
+
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
- struct inode *inode;
-
- inode = btrfs_lookup_dentry(dir, dentry);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ struct dentry *ret;
- return d_splice_alias(inode, dentry);
+ ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
+ if (unlikely(d_need_lookup(dentry))) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
+ spin_unlock(&dentry->d_lock);
+ }
+ return ret;
}
unsigned char btrfs_filetype_table[] = {
@@ -4104,6 +4040,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
struct btrfs_path *path;
struct list_head ins_list;
struct list_head del_list;
+ struct qstr q;
int ret;
struct extent_buffer *leaf;
int slot;
@@ -4194,6 +4131,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
while (di_cur < di_total) {
struct btrfs_key location;
+ struct dentry *tmp;
if (verify_dir_item(root, leaf, di))
break;
@@ -4214,6 +4152,33 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
btrfs_dir_item_key_to_cpu(leaf, di, &location);
+ q.name = name_ptr;
+ q.len = name_len;
+ q.hash = full_name_hash(q.name, q.len);
+ tmp = d_lookup(filp->f_dentry, &q);
+ if (!tmp) {
+ struct btrfs_key *newkey;
+
+ newkey = kzalloc(sizeof(struct btrfs_key),
+ GFP_NOFS);
+ if (!newkey)
+ goto no_dentry;
+ tmp = d_alloc(filp->f_dentry, &q);
+ if (!tmp) {
+ kfree(newkey);
+ dput(tmp);
+ goto no_dentry;
+ }
+ memcpy(newkey, &location,
+ sizeof(struct btrfs_key));
+ tmp->d_fsdata = newkey;
+ tmp->d_flags |= DCACHE_NEED_LOOKUP;
+ d_rehash(tmp);
+ dput(tmp);
+ } else {
+ dput(tmp);
+ }
+no_dentry:
/* is this a reference to our own snapshot? If so
* skip it
*/
@@ -4278,7 +4243,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (BTRFS_I(inode)->dummy_inode)
return 0;
- if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode))
+ if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
nolock = true;
if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4302,42 +4267,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
* FIXME, needs more benchmarking...there are no reasons other than performance
* to keep or drop this code.
*/
-void btrfs_dirty_inode(struct inode *inode, int flags)
+int btrfs_dirty_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
int ret;
if (BTRFS_I(inode)->dummy_inode)
- return;
+ return 0;
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, inode);
if (ret && ret == -ENOSPC) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans, root);
trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- printk_ratelimited(KERN_ERR "btrfs: fail to "
- "dirty inode %llu error %ld\n",
- (unsigned long long)btrfs_ino(inode),
- PTR_ERR(trans));
- return;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- printk_ratelimited(KERN_ERR "btrfs: fail to "
- "dirty inode %llu error %d\n",
- (unsigned long long)btrfs_ino(inode),
- ret);
- }
}
btrfs_end_transaction(trans, root);
if (BTRFS_I(inode)->delayed_node)
btrfs_balance_delayed_items(root);
+
+ return ret;
+}
+
+/*
+ * This is a copy of file_update_time. We need this so we can return error on
+ * ENOSPC for updating the inode in the case of file write and mmap writes.
+ */
+int btrfs_update_time(struct file *file)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct timespec now;
+ int ret;
+ enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
+
+ /* First try to exhaust all avenues to not sync */
+ if (IS_NOCMTIME(inode))
+ return 0;
+
+ now = current_fs_time(inode->i_sb);
+ if (!timespec_equal(&inode->i_mtime, &now))
+ sync_it = S_MTIME;
+
+ if (!timespec_equal(&inode->i_ctime, &now))
+ sync_it |= S_CTIME;
+
+ if (IS_I_VERSION(inode))
+ sync_it |= S_VERSION;
+
+ if (!sync_it)
+ return 0;
+
+ /* Finally allowed to write? Takes lock. */
+ if (mnt_want_write_file(file))
+ return 0;
+
+ /* Only change inode inside the lock region */
+ if (sync_it & S_VERSION)
+ inode_inc_iversion(inode);
+ if (sync_it & S_CTIME)
+ inode->i_ctime = now;
+ if (sync_it & S_MTIME)
+ inode->i_mtime = now;
+ ret = btrfs_dirty_inode(inode);
+ if (!ret)
+ mark_inode_dirty_sync(inode);
+ mnt_drop_write(file->f_path.mnt);
+ return ret;
}
/*
@@ -4439,7 +4442,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
int owner;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return ERR_PTR(-ENOMEM);
inode = new_inode(root->fs_info->sb);
if (!inode) {
@@ -4474,7 +4478,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
inode->i_generation = BTRFS_I(inode)->generation;
btrfs_set_inode_space_info(root, inode);
- if (mode & S_IFDIR)
+ if (S_ISDIR(mode))
owner = 0;
else
owner = 1;
@@ -4519,7 +4523,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_inherit_iflags(inode, dir);
- if ((mode & S_IFREG)) {
+ if (S_ISREG(mode)) {
if (btrfs_test_opt(root, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
if (btrfs_test_opt(root, NODATACOW) ||
@@ -4601,10 +4605,6 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
int err = btrfs_add_link(trans, dir, inode,
dentry->d_name.name, dentry->d_name.len,
backref, index);
- if (!err) {
- d_instantiate(dentry, inode);
- return 0;
- }
if (err > 0)
err = -EEXIST;
return err;
@@ -4652,13 +4652,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
+ /*
+ * If the active LSM wants to access the inode during
+ * d_instantiate it needs these. Smack checks to see
+ * if the filesystem supports xattrs by looking at the
+ * ops vector.
+ */
+
+ inode->i_op = &btrfs_special_inode_operations;
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
else {
- inode->i_op = &btrfs_special_inode_operations;
init_special_inode(inode, inode->i_mode, rdev);
btrfs_update_inode(trans, root, inode);
+ d_instantiate(dentry, inode);
}
out_unlock:
nr = trans->blocks_used;
@@ -4710,15 +4718,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
+ /*
+ * If the active LSM wants to access the inode during
+ * d_instantiate it needs these. Smack checks to see
+ * if the filesystem supports xattrs by looking at the
+ * ops vector.
+ */
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
else {
inode->i_mapping->a_ops = &btrfs_aops;
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ d_instantiate(dentry, inode);
}
out_unlock:
nr = trans->blocks_used;
@@ -4773,11 +4789,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (err) {
drop_inode = 1;
} else {
- struct dentry *parent = dget_parent(dentry);
+ struct dentry *parent = dentry->d_parent;
err = btrfs_update_inode(trans, root, inode);
BUG_ON(err);
+ d_instantiate(dentry, inode);
btrfs_log_new_name(trans, inode, NULL, parent);
- dput(parent);
}
nr = trans->blocks_used;
@@ -5757,8 +5773,7 @@ again:
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
ret = btrfs_ordered_update_i_size(inode, 0, ordered);
if (!ret)
- ret = btrfs_update_inode(trans, root, inode);
- err = ret;
+ err = btrfs_update_inode_fallback(trans, root, inode);
goto out;
}
@@ -5795,8 +5810,8 @@ again:
add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
ret = btrfs_ordered_update_i_size(inode, 0, ordered);
- if (!ret)
- btrfs_update_inode(trans, root, inode);
+ if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
+ btrfs_update_inode_fallback(trans, root, inode);
ret = 0;
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6251,7 +6266,7 @@ int btrfs_readpage(struct file *file, struct page *page)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btrfs_get_extent);
+ return extent_read_full_page(tree, page, btrfs_get_extent, 0);
}
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6402,7 +6417,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_start;
u64 page_end;
+ /* Need this to keep space reservations serialized */
+ mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ mutex_unlock(&inode->i_mutex);
+ if (!ret)
+ ret = btrfs_update_time(vma->vm_file);
if (ret) {
if (ret == -ENOMEM)
ret = VM_FAULT_OOM;
@@ -6503,6 +6523,7 @@ static int btrfs_truncate(struct inode *inode)
struct btrfs_trans_handle *trans;
unsigned long nr;
u64 mask = root->sectorsize - 1;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
if (ret)
@@ -6550,19 +6571,23 @@ static int btrfs_truncate(struct inode *inode)
rsv = btrfs_alloc_block_rsv(root);
if (!rsv)
return -ENOMEM;
- btrfs_add_durable_block_rsv(root->fs_info, rsv);
+ rsv->size = min_size;
+ /*
+ * 1 for the truncate slack space
+ * 1 for the orphan item we're going to add
+ * 1 for the orphan item deletion
+ * 1 for updating the inode.
+ */
trans = btrfs_start_transaction(root, 4);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out;
}
- /*
- * Reserve space for the truncate process. Truncate should be adding
- * space, but if there are snapshots it may end up using space.
- */
- ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
+ /* Migrate the slack space for the truncate to our reserve */
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ min_size);
BUG_ON(ret);
ret = btrfs_orphan_add(trans, inode);
@@ -6571,21 +6596,6 @@ static int btrfs_truncate(struct inode *inode)
goto out;
}
- nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
-
- /*
- * Ok so we've already migrated our bytes over for the truncate, so here
- * just reserve the one slot we need for updating the inode.
- */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
- goto out;
- }
- trans->block_rsv = rsv;
-
/*
* setattr is responsible for setting the ordered_data_close flag,
* but that is only tested during the last file release. That
@@ -6607,20 +6617,31 @@ static int btrfs_truncate(struct inode *inode)
btrfs_add_ordered_operation(trans, root, inode);
while (1) {
+ ret = btrfs_block_rsv_refill(root, rsv, min_size);
+ if (ret) {
+ /*
+ * This can only happen with the original transaction we
+ * started above, every other time we shouldn't have a
+ * transaction started yet.
+ */
+ if (ret == -EAGAIN)
+ goto end_trans;
+ err = ret;
+ break;
+ }
+
if (!trans) {
- trans = btrfs_start_transaction(root, 3);
+ /* Just need the 1 for updating the inode */
+ trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
- goto out;
+ ret = err = PTR_ERR(trans);
+ trans = NULL;
+ break;
}
-
- ret = btrfs_truncate_reserve_metadata(trans, root,
- rsv);
- BUG_ON(ret);
-
- trans->block_rsv = rsv;
}
+ trans->block_rsv = rsv;
+
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
@@ -6635,7 +6656,7 @@ static int btrfs_truncate(struct inode *inode)
err = ret;
break;
}
-
+end_trans:
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
trans = NULL;
@@ -6655,14 +6676,16 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_orphan_del(NULL, inode);
}
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = btrfs_update_inode(trans, root, inode);
- if (ret && !err)
- err = ret;
+ if (trans) {
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && !err)
+ err = ret;
- nr = trans->blocks_used;
- ret = btrfs_end_transaction_throttle(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction_throttle(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+ }
out:
btrfs_free_block_rsv(root, rsv);
@@ -6690,7 +6713,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
btrfs_i_size_write(inode, 0);
err = btrfs_update_inode(trans, new_root, inode);
@@ -6700,19 +6723,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
return 0;
}
-/* helper function for file defrag and space balancing. This
- * forces readahead on a given range of bytes in an inode
- */
-unsigned long btrfs_force_ra(struct address_space *mapping,
- struct file_ra_state *ra, struct file *file,
- pgoff_t offset, pgoff_t last_index)
-{
- pgoff_t req_size = last_index - offset + 1;
-
- page_cache_sync_readahead(mapping, ra, file, offset, req_size);
- return offset + req_size;
-}
-
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
struct btrfs_inode *ei;
@@ -6730,19 +6740,21 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
- ei->reserved_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
+ ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->last_unlink_trans = 0;
- atomic_set(&ei->outstanding_extents, 0);
- atomic_set(&ei->reserved_extents, 0);
+ spin_lock_init(&ei->lock);
+ ei->outstanding_extents = 0;
+ ei->reserved_extents = 0;
ei->ordered_data_close = 0;
ei->orphan_meta_reserved = 0;
ei->dummy_inode = 0;
ei->in_defrag = 0;
+ ei->delalloc_meta_reserved = 0;
ei->force_compress = BTRFS_COMPRESS_NONE;
ei->delayed_node = NULL;
@@ -6775,8 +6787,10 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(!list_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
- WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
- WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents));
+ WARN_ON(BTRFS_I(inode)->outstanding_extents);
+ WARN_ON(BTRFS_I(inode)->reserved_extents);
+ WARN_ON(BTRFS_I(inode)->delalloc_bytes);
+ WARN_ON(BTRFS_I(inode)->csum_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -6831,7 +6845,7 @@ int btrfs_drop_inode(struct inode *inode)
struct btrfs_root *root = BTRFS_I(inode)->root;
if (btrfs_root_refs(&root->root_item) == 0 &&
- !is_free_space_inode(root, inode))
+ !btrfs_is_free_space_inode(root, inode))
return 1;
else
return generic_drop_inode(inode);
@@ -6900,11 +6914,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
+ u32 blocksize = inode->i_sb->s_blocksize;
+
generic_fillattr(inode, stat);
- stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
+ stat->dev = BTRFS_I(inode)->root->anon_dev;
stat->blksize = PAGE_CACHE_SIZE;
- stat->blocks = (inode_get_bytes(inode) +
- BTRFS_I(inode)->delalloc_bytes) >> 9;
+ stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
+ ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
return 0;
}
@@ -7069,9 +7085,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BUG_ON(ret);
if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
- struct dentry *parent = dget_parent(new_dentry);
+ struct dentry *parent = new_dentry->d_parent;
btrfs_log_new_name(trans, old_inode, old_dir, parent);
- dput(parent);
btrfs_end_log_trans(root);
}
out_fail:
@@ -7181,21 +7196,32 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
+ /*
+ * If the active LSM wants to access the inode during
+ * d_instantiate it needs these. Smack checks to see
+ * if the filesystem supports xattrs by looking at the
+ * ops vector.
+ */
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
else {
inode->i_mapping->a_ops = &btrfs_aops;
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
if (drop_inode)
goto out_unlock;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ err = -ENOMEM;
+ drop_inode = 1;
+ goto out_unlock;
+ }
key.objectid = btrfs_ino(inode);
key.offset = 0;
btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
@@ -7233,6 +7259,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
drop_inode = 1;
out_unlock:
+ if (!err)
+ d_instantiate(dentry, inode);
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
if (drop_inode) {
@@ -7332,15 +7360,19 @@ static int btrfs_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
-static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
+static int btrfs_permission(struct inode *inode, int mask)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ umode_t mode = inode->i_mode;
- if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
- return -EROFS;
- if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
- return -EACCES;
- return generic_permission(inode, mask, flags, btrfs_check_acl);
+ if (mask & MAY_WRITE &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
+ return -EACCES;
+ }
+ return generic_permission(inode, mask);
}
static const struct inode_operations btrfs_dir_inode_operations = {
@@ -7360,10 +7392,12 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
+ .get_acl = btrfs_get_acl,
};
static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
+ .get_acl = btrfs_get_acl,
};
static const struct file_operations btrfs_dir_file_operations = {
@@ -7385,7 +7419,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
.writepage_end_io_hook = btrfs_writepage_end_io_hook,
.writepage_start_hook = btrfs_writepage_start_hook,
- .readpage_io_failed_hook = btrfs_io_failed_hook,
.set_bit_hook = btrfs_set_bit_hook,
.clear_bit_hook = btrfs_clear_bit_hook,
.merge_extent_hook = btrfs_merge_extent_hook,
@@ -7432,6 +7465,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
+ .get_acl = btrfs_get_acl,
};
static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
@@ -7441,19 +7475,23 @@ static const struct inode_operations btrfs_special_inode_operations = {
.getxattr = btrfs_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
+ .get_acl = btrfs_get_acl,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
.getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
.getxattr = btrfs_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
+ .get_acl = btrfs_get_acl,
};
const struct dentry_operations btrfs_dentry_operations = {
.d_delete = btrfs_dentry_delete,
+ .d_release = btrfs_dentry_release,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a205027..ba26540 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
#include "volumes.h"
#include "locking.h"
#include "inode-map.h"
+#include "backref.h"
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
/*
* Inherit flags from the parent inode.
*
- * Unlike extN we don't have any flags we don't want to inherit currently.
+ * Currently only the compression flags and the cow flags are inherited.
*/
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
flags = BTRFS_I(dir)->flags;
- if (S_ISREG(inode->i_mode))
- flags &= ~BTRFS_INODE_DIRSYNC;
- else if (!S_ISDIR(inode->i_mode))
- flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
+ if (flags & BTRFS_INODE_NOCOMPRESS) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ } else if (flags & BTRFS_INODE_COMPRESS) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ }
+
+ if (flags & BTRFS_INODE_NODATACOW)
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
- BTRFS_I(inode)->flags = flags;
btrfs_update_iflags(inode);
}
@@ -246,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
+ btrfs_update_iflags(inode);
+ inode->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
- btrfs_update_iflags(inode);
- inode->i_ctime = CURRENT_TIME;
btrfs_end_transaction(trans, root);
mnt_drop_write(file->f_path.mnt);
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
+ u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
}
}
rcu_read_unlock();
+
if (!num_devices)
return -EOPNOTSUPP;
-
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
+ if (range.start > total_bytes)
+ return -EINVAL;
+ range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(root, &range);
if (ret < 0)
@@ -323,7 +333,7 @@ static noinline int create_subvol(struct btrfs_root *root,
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
struct btrfs_root *new_root;
- struct dentry *parent = dget_parent(dentry);
+ struct dentry *parent = dentry->d_parent;
struct inode *dir;
int ret;
int err;
@@ -332,10 +342,8 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 index = 0;
ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
- if (ret) {
- dput(parent);
+ if (ret)
return ret;
- }
dir = parent->d_inode;
@@ -346,10 +354,8 @@ static noinline int create_subvol(struct btrfs_root *root,
* 2 - dir items
*/
trans = btrfs_start_transaction(root, 6);
- if (IS_ERR(trans)) {
- dput(parent);
+ if (IS_ERR(trans))
return PTR_ERR(trans);
- }
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
0, objectid, NULL, 0, 0, 0);
@@ -439,7 +445,6 @@ static noinline int create_subvol(struct btrfs_root *root,
d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
- dput(parent);
if (async_transid) {
*async_transid = trans->transid;
err = btrfs_commit_transaction_async(trans, root, 1);
@@ -456,7 +461,6 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
bool readonly)
{
struct inode *inode;
- struct dentry *parent;
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
int ret;
@@ -504,9 +508,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (ret)
goto fail;
- parent = dget_parent(dentry);
- inode = btrfs_lookup_dentry(parent->d_inode, dentry);
- dput(parent);
+ inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto fail;
@@ -768,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
int ret = 1;
/*
- * make sure that once we start defragging and extent, we keep on
+ * make sure that once we start defragging an extent, we keep on
* defragging it
*/
if (start < *defrag_end)
@@ -813,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
* extent will force at least part of that big extent to be defragged.
*/
if (ret) {
- *last_len += len;
*defrag_end = extent_map_end(em);
} else {
*last_len = 0;
@@ -851,13 +852,16 @@ static int cluster_pages_for_defrag(struct inode *inode,
int i_done;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
if (isize == 0)
return 0;
file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+ mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_space(inode,
num_pages << PAGE_CACHE_SHIFT);
+ mutex_unlock(&inode->i_mutex);
if (ret)
return ret;
again:
@@ -867,8 +871,8 @@ again:
/* step one, lock all the pages */
for (i = 0; i < num_pages; i++) {
struct page *page;
- page = grab_cache_page(inode->i_mapping,
- start_index + i);
+ page = find_or_create_page(inode->i_mapping,
+ start_index + i, mask);
if (!page)
break;
@@ -938,7 +942,9 @@ again:
GFP_NOFS);
if (i_done != num_pages) {
- atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode,
(num_pages - i_done) << PAGE_CACHE_SHIFT);
}
@@ -978,18 +984,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_super_block *disk_super;
struct file_ra_state *ra = NULL;
unsigned long last_index;
+ u64 isize = i_size_read(inode);
u64 features;
u64 last_len = 0;
u64 skip = 0;
u64 defrag_end = 0;
u64 newer_off = range->start;
- int newer_left = 0;
unsigned long i;
+ unsigned long ra_index = 0;
int ret;
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
int extent_thresh = range->extent_thresh;
- int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ int cluster = max_cluster;
u64 new_align = ~((u64)128 * 1024 - 1);
struct page **pages = NULL;
@@ -1003,7 +1011,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
compress_type = range->compress_type;
}
- if (inode->i_size == 0)
+ if (isize == 0)
return 0;
/*
@@ -1019,7 +1027,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra = &file->f_ra;
}
- pages = kmalloc(sizeof(struct page *) * newer_cluster,
+ pages = kmalloc(sizeof(struct page *) * max_cluster,
GFP_NOFS);
if (!pages) {
ret = -ENOMEM;
@@ -1028,10 +1036,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
/* find the last page to defrag */
if (range->start + range->len > range->start) {
- last_index = min_t(u64, inode->i_size - 1,
+ last_index = min_t(u64, isize - 1,
range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
} else {
- last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
}
if (newer_than) {
@@ -1044,16 +1052,24 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* the extents in the file evenly spaced
*/
i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
- newer_left = newer_cluster;
} else
goto out_ra;
} else {
i = range->start >> PAGE_CACHE_SHIFT;
}
if (!max_to_defrag)
- max_to_defrag = last_index - 1;
+ max_to_defrag = last_index;
- while (i <= last_index && defrag_count < max_to_defrag) {
+ /*
+ * make writeback starts from i, so the defrag range can be
+ * written sequentially.
+ */
+ if (i < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = i;
+
+ while (i <= last_index && defrag_count < max_to_defrag &&
+ (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT)) {
/*
* make sure we stop running if someone unmounts
* the FS
@@ -1076,18 +1092,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i = max(i + 1, next);
continue;
}
+
+ if (!newer_than) {
+ cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
+ PAGE_CACHE_SHIFT) - i;
+ cluster = min(cluster, max_cluster);
+ } else {
+ cluster = max_cluster;
+ }
+
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = compress_type;
- btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
+ if (i + cluster > ra_index) {
+ ra_index = max(i, ra_index);
+ btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
+ cluster);
+ ra_index += max_cluster;
+ }
- ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
+ ret = cluster_pages_for_defrag(inode, pages, i, cluster);
if (ret < 0)
goto out_ra;
defrag_count += ret;
balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
- i += ret;
if (newer_than) {
if (newer_off == (u64)-1)
@@ -1102,12 +1131,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (!ret) {
range->start = newer_off;
i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
- newer_left = newer_cluster;
} else {
break;
}
} else {
- i++;
+ if (ret > 0) {
+ i += ret;
+ last_len += ret << PAGE_CACHE_SHIFT;
+ } else {
+ i++;
+ last_len = 0;
+ }
}
}
@@ -1133,16 +1167,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
mutex_unlock(&inode->i_mutex);
}
- disk_super = &root->fs_info->super_copy;
+ disk_super = root->fs_info->super_copy;
features = btrfs_super_incompat_flags(disk_super);
if (range->compress_type == BTRFS_COMPRESS_LZO) {
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
btrfs_set_super_incompat_flags(disk_super, features);
}
- if (!file)
- kfree(ra);
- return defrag_count;
+ ret = defrag_count;
out_ra:
if (!file)
@@ -1186,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
*devstr = '\0';
devstr = vol_args->name;
devid = simple_strtoull(devstr, &end, 10);
- printk(KERN_INFO "resizing devid %llu\n",
+ printk(KERN_INFO "btrfs: resizing devid %llu\n",
(unsigned long long)devid);
}
device = btrfs_find_device(root, devid, NULL, NULL);
if (!device) {
- printk(KERN_INFO "resizer unable to find device %llu\n",
+ printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
goto out_unlock;
@@ -1237,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
do_div(new_size, root->sectorsize);
new_size *= root->sectorsize;
- printk(KERN_INFO "new size for %s is %llu\n",
+ printk(KERN_INFO "btrfs: new size for %s is %llu\n",
device->name, (unsigned long long)new_size);
if (new_size > old_size) {
@@ -1248,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
- } else {
+ } else if (new_size < old_size) {
ret = btrfs_shrink_device(device, new_size);
}
@@ -1295,12 +1327,17 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
printk(KERN_INFO "btrfs: Snapshot src from "
"another FS\n");
ret = -EINVAL;
- fput(src_file);
- goto out;
+ } else if (!inode_owner_or_capable(src_inode)) {
+ /*
+ * Subvolume creation is not restricted, but snapshots
+ * are limited to own subvolumes only
+ */
+ ret = -EPERM;
+ } else {
+ ret = btrfs_mksubvol(&file->f_path, name, namelen,
+ BTRFS_I(src_inode)->root,
+ transid, readonly);
}
- ret = btrfs_mksubvol(&file->f_path, name, namelen,
- BTRFS_I(src_inode)->root,
- transid, readonly);
fput(src_file);
}
out:
@@ -1755,11 +1792,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.objectid = key.offset;
key.offset = (u64)-1;
dirid = key.objectid;
-
}
if (ptr < name)
goto out;
- memcpy(name, ptr, total_len);
+ memmove(name, ptr, total_len);
name[total_len]='\0';
ret = 0;
out:
@@ -2184,6 +2220,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (!(src_file->f_mode & FMODE_READ))
goto out_fput;
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+ goto out_fput;
+
ret = -EISDIR;
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
goto out_fput;
@@ -2222,11 +2263,26 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (off + len == src->i_size)
len = ALIGN(src->i_size, bs) - off;
+ if (len == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
/* verify the end result is block aligned */
if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
!IS_ALIGNED(destoff, bs))
goto out_unlock;
+ if (destoff > inode->i_size) {
+ ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+ if (ret)
+ goto out_unlock;
+ }
+
+ /* truncate page cache pages from target inode range */
+ truncate_inode_pages_range(&inode->i_data, destoff,
+ PAGE_CACHE_ALIGN(destoff + len) - 1);
+
/* do any pending delalloc/csum calc on src, one way or
another, and lock file content */
while (1) {
@@ -2320,7 +2376,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
else
new_key.offset = destoff;
- trans = btrfs_start_transaction(root, 1);
+ /*
+ * 1 - adjusting old extent (we may have to split it)
+ * 1 - add new extent
+ * 1 - inode update
+ */
+ trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
@@ -2328,14 +2389,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
+ /*
+ * a | --- range to clone ---| b
+ * | ------------- extent ------------- |
+ */
+
+ /* substract range b */
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ /* substract range a */
if (off > key.offset) {
datao += off - key.offset;
datal -= off - key.offset;
}
- if (key.offset + datal > off + len)
- datal = off + len - key.offset;
-
ret = btrfs_drop_extents(trans, inode,
new_key.offset,
new_key.offset + datal,
@@ -2380,6 +2448,20 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.offset += skip;
}
+ /*
+ * Don't copy an inline extent into an offset
+ * greater than zero. Having an inline extent
+ * at such an offset results in chaos as btrfs
+ * isn't prepared for such cases. Just skip
+ * this case for the same reasons as commented
+ * at btrfs_ioctl_clone().
+ */
+ if (new_key.offset > 0) {
+ ret = -EOPNOTSUPP;
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+
if (key.offset + datal > off+len)
trim = key.offset + datal - (off+len);
@@ -2432,7 +2514,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (endoff > inode->i_size)
btrfs_i_size_write(inode, endoff);
- BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
btrfs_end_transaction(trans, root);
@@ -2559,7 +2640,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
return PTR_ERR(trans);
}
- dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+ dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
dir_id, "default", 7, 1);
if (IS_ERR_OR_NULL(di)) {
@@ -2575,7 +2656,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- disk_super = &root->fs_info->super_copy;
+ disk_super = root->fs_info->super_copy;
features = btrfs_super_incompat_flags(disk_super);
if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2836,6 +2917,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
return ret;
}
+static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
+{
+ int ret = 0;
+ int i;
+ u64 rel_ptr;
+ int size;
+ struct btrfs_ioctl_ino_path_args *ipa = NULL;
+ struct inode_fs_paths *ipath = NULL;
+ struct btrfs_path *path;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ipa = memdup_user(arg, sizeof(*ipa));
+ if (IS_ERR(ipa)) {
+ ret = PTR_ERR(ipa);
+ ipa = NULL;
+ goto out;
+ }
+
+ size = min_t(u32, ipa->size, 4096);
+ ipath = init_ipath(size, root, path);
+ if (IS_ERR(ipath)) {
+ ret = PTR_ERR(ipath);
+ ipath = NULL;
+ goto out;
+ }
+
+ ret = paths_from_inode(ipa->inum, ipath);
+ if (ret < 0)
+ goto out;
+
+ for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
+ rel_ptr = ipath->fspath->val[i] -
+ (u64)(unsigned long)ipath->fspath->val;
+ ipath->fspath->val[i] = rel_ptr;
+ }
+
+ ret = copy_to_user((void *)(unsigned long)ipa->fspath,
+ (void *)(unsigned long)ipath->fspath, size);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+out:
+ btrfs_free_path(path);
+ free_ipath(ipath);
+ kfree(ipa);
+
+ return ret;
+}
+
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct btrfs_data_container *inodes = ctx;
+ const size_t c = 3 * sizeof(u64);
+
+ if (inodes->bytes_left >= c) {
+ inodes->bytes_left -= c;
+ inodes->val[inodes->elem_cnt] = inum;
+ inodes->val[inodes->elem_cnt + 1] = offset;
+ inodes->val[inodes->elem_cnt + 2] = root;
+ inodes->elem_cnt += 3;
+ } else {
+ inodes->bytes_missing += c - inodes->bytes_left;
+ inodes->bytes_left = 0;
+ inodes->elem_missed += 3;
+ }
+
+ return 0;
+}
+
+static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
+ void __user *arg)
+{
+ int ret = 0;
+ int size;
+ u64 extent_offset;
+ struct btrfs_ioctl_logical_ino_args *loi;
+ struct btrfs_data_container *inodes = NULL;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ loi = memdup_user(arg, sizeof(*loi));
+ if (IS_ERR(loi)) {
+ ret = PTR_ERR(loi);
+ loi = NULL;
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ size = min_t(u32, loi->size, 4096);
+ inodes = init_data_container(size);
+ if (IS_ERR(inodes)) {
+ ret = PTR_ERR(inodes);
+ inodes = NULL;
+ goto out;
+ }
+
+ ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
+
+ if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ extent_offset = loi->logical - key.objectid;
+ ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
+ extent_offset, build_ino_list, inodes);
+
+ if (ret < 0)
+ goto out;
+
+ ret = copy_to_user((void *)(unsigned long)loi->inodes,
+ (void *)(unsigned long)inodes, size);
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ btrfs_free_path(path);
+ kfree(inodes);
+ kfree(loi);
+
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -2893,6 +3115,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_tree_search(file, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(file, argp);
+ case BTRFS_IOC_INO_PATHS:
+ return btrfs_ioctl_ino_to_path(root, argp);
+ case BTRFS_IOC_LOGICAL_INO:
+ return btrfs_ioctl_logical_to_ino(root, argp);
case BTRFS_IOC_SPACE_INFO:
return btrfs_ioctl_space_info(root, argp);
case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea78..252ae99 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
struct btrfs_ioctl_space_info spaces[0];
};
+struct btrfs_data_container {
+ __u32 bytes_left; /* out -- bytes not needed to deliver output */
+ __u32 bytes_missing; /* out -- additional bytes needed for result */
+ __u32 elem_cnt; /* out */
+ __u32 elem_missed; /* out */
+ __u64 val[0]; /* out */
+};
+
+struct btrfs_ioctl_ino_path_args {
+ __u64 inum; /* in */
+ __u32 size; /* in */
+ __u64 reserved[4];
+ /* struct btrfs_data_container *fspath; out */
+ __u64 fspath; /* out */
+};
+
+struct btrfs_ioctl_logical_ino_args {
+ __u64 logical; /* in */
+ __u32 size; /* in */
+ __u64 reserved[4];
+ /* struct btrfs_data_container *inodes; out */
+ __u64 inodes;
+};
+
#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
struct btrfs_ioctl_dev_info_args)
#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
struct btrfs_ioctl_fs_info_args)
+#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
+ struct btrfs_ioctl_ino_path_args)
+#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
+ struct btrfs_ioctl_ino_path_args)
+
#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66fa43d..d77b67c 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,185 +24,197 @@
#include "extent_io.h"
#include "locking.h"
-static inline void spin_nested(struct extent_buffer *eb)
-{
- spin_lock(&eb->lock);
-}
+void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
/*
- * Setting a lock to blocking will drop the spinlock and set the
- * flag that forces other procs who want the lock to wait. After
- * this you can safely schedule with the lock held.
+ * if we currently have a spinning reader or writer lock
+ * (indicated by the rw flag) this will bump the count
+ * of blocking holders and drop the spinlock.
*/
-void btrfs_set_lock_blocking(struct extent_buffer *eb)
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
- set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
- spin_unlock(&eb->lock);
+ if (rw == BTRFS_WRITE_LOCK) {
+ if (atomic_read(&eb->blocking_writers) == 0) {
+ WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+ atomic_dec(&eb->spinning_writers);
+ btrfs_assert_tree_locked(eb);
+ atomic_inc(&eb->blocking_writers);
+ write_unlock(&eb->lock);
+ }
+ } else if (rw == BTRFS_READ_LOCK) {
+ btrfs_assert_tree_read_locked(eb);
+ atomic_inc(&eb->blocking_readers);
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+ read_unlock(&eb->lock);
}
- /* exit with the spin lock released and the bit set */
+ return;
}
/*
- * clearing the blocking flag will take the spinlock again.
- * After this you can't safely schedule
+ * if we currently have a blocking lock, take the spinlock
+ * and drop our blocking count
*/
-void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
- spin_nested(eb);
- clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
- smp_mb__after_clear_bit();
+ if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
+ BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+ write_lock(&eb->lock);
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_inc(&eb->spinning_writers);
+ if (atomic_dec_and_test(&eb->blocking_writers))
+ wake_up(&eb->write_lock_wq);
+ } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
+ BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+ read_lock(&eb->lock);
+ atomic_inc(&eb->spinning_readers);
+ if (atomic_dec_and_test(&eb->blocking_readers))
+ wake_up(&eb->read_lock_wq);
}
- /* exit with the spin lock held */
+ return;
}
/*
- * unfortunately, many of the places that currently set a lock to blocking
- * don't end up blocking for very long, and often they don't block
- * at all. For a dbench 50 run, if we don't spin on the blocking bit
- * at all, the context switch rate can jump up to 400,000/sec or more.
- *
- * So, we're still stuck with this crummy spin on the blocking bit,
- * at least until the most common causes of the short blocks
- * can be dealt with.
+ * take a spinning read lock. This will wait for any blocking
+ * writers
*/
-static int btrfs_spin_on_block(struct extent_buffer *eb)
+void btrfs_tree_read_lock(struct extent_buffer *eb)
{
- int i;
-
- for (i = 0; i < 512; i++) {
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 1;
- if (need_resched())
- break;
- cpu_relax();
+again:
+ wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+ read_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_writers)) {
+ read_unlock(&eb->lock);
+ wait_event(eb->write_lock_wq,
+ atomic_read(&eb->blocking_writers) == 0);
+ goto again;
}
- return 0;
+ atomic_inc(&eb->read_locks);
+ atomic_inc(&eb->spinning_readers);
}
/*
- * This is somewhat different from trylock. It will take the
- * spinlock but if it finds the lock is set to blocking, it will
- * return without the lock held.
- *
- * returns 1 if it was able to take the lock and zero otherwise
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
*/
-int btrfs_try_spin_lock(struct extent_buffer *eb)
+int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
- int i;
+ if (atomic_read(&eb->blocking_writers))
+ return 0;
- if (btrfs_spin_on_block(eb)) {
- spin_nested(eb);
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 1;
- spin_unlock(&eb->lock);
+ read_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_writers)) {
+ read_unlock(&eb->lock);
+ return 0;
}
- /* spin for a bit on the BLOCKING flag */
- for (i = 0; i < 2; i++) {
- cpu_relax();
- if (!btrfs_spin_on_block(eb))
- break;
-
- spin_nested(eb);
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 1;
- spin_unlock(&eb->lock);
- }
- return 0;
+ atomic_inc(&eb->read_locks);
+ atomic_inc(&eb->spinning_readers);
+ return 1;
}
/*
- * the autoremove wake function will return 0 if it tried to wake up
- * a process that was already awake, which means that process won't
- * count as an exclusive wakeup. The waitq code will continue waking
- * procs until it finds one that was actually sleeping.
- *
- * For btrfs, this isn't quite what we want. We want a single proc
- * to be notified that the lock is ready for taking. If that proc
- * already happen to be awake, great, it will loop around and try for
- * the lock.
- *
- * So, btrfs_wake_function always returns 1, even when the proc that we
- * tried to wake up was already awake.
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers or readers
*/
-static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
- int sync, void *key)
+int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
- autoremove_wake_function(wait, mode, sync, key);
+ if (atomic_read(&eb->blocking_writers) ||
+ atomic_read(&eb->blocking_readers))
+ return 0;
+ write_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_writers) ||
+ atomic_read(&eb->blocking_readers)) {
+ write_unlock(&eb->lock);
+ return 0;
+ }
+ atomic_inc(&eb->write_locks);
+ atomic_inc(&eb->spinning_writers);
return 1;
}
/*
- * returns with the extent buffer spinlocked.
- *
- * This will spin and/or wait as required to take the lock, and then
- * return with the spinlock held.
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
+ * drop a spinning read lock
+ */
+void btrfs_tree_read_unlock(struct extent_buffer *eb)
+{
+ btrfs_assert_tree_read_locked(eb);
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+ atomic_dec(&eb->read_locks);
+ read_unlock(&eb->lock);
+}
+
+/*
+ * drop a blocking read lock
+ */
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
+{
+ btrfs_assert_tree_read_locked(eb);
+ WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+ if (atomic_dec_and_test(&eb->blocking_readers))
+ wake_up(&eb->read_lock_wq);
+ atomic_dec(&eb->read_locks);
+}
+
+/*
+ * take a spinning write lock. This will wait for both
+ * blocking readers or writers
*/
int btrfs_tree_lock(struct extent_buffer *eb)
{
- DEFINE_WAIT(wait);
- wait.func = btrfs_wake_function;
-
- if (!btrfs_spin_on_block(eb))
- goto sleep;
-
- while(1) {
- spin_nested(eb);
-
- /* nobody is blocking, exit with the spinlock held */
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 0;
-
- /*
- * we have the spinlock, but the real owner is blocking.
- * wait for them
- */
- spin_unlock(&eb->lock);
-
- /*
- * spin for a bit, and if the blocking flag goes away,
- * loop around
- */
- cpu_relax();
- if (btrfs_spin_on_block(eb))
- continue;
-sleep:
- prepare_to_wait_exclusive(&eb->lock_wq, &wait,
- TASK_UNINTERRUPTIBLE);
-
- if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- schedule();
-
- finish_wait(&eb->lock_wq, &wait);
+again:
+ wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
+ wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+ write_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_readers)) {
+ write_unlock(&eb->lock);
+ wait_event(eb->read_lock_wq,
+ atomic_read(&eb->blocking_readers) == 0);
+ goto again;
}
+ if (atomic_read(&eb->blocking_writers)) {
+ write_unlock(&eb->lock);
+ wait_event(eb->write_lock_wq,
+ atomic_read(&eb->blocking_writers) == 0);
+ goto again;
+ }
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_inc(&eb->spinning_writers);
+ atomic_inc(&eb->write_locks);
return 0;
}
+/*
+ * drop a spinning or a blocking write lock.
+ */
int btrfs_tree_unlock(struct extent_buffer *eb)
{
- /*
- * if we were a blocking owner, we don't have the spinlock held
- * just clear the bit and look for waiters
- */
- if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- smp_mb__after_clear_bit();
- else
- spin_unlock(&eb->lock);
-
- if (waitqueue_active(&eb->lock_wq))
- wake_up(&eb->lock_wq);
+ int blockers = atomic_read(&eb->blocking_writers);
+
+ BUG_ON(blockers > 1);
+
+ btrfs_assert_tree_locked(eb);
+ atomic_dec(&eb->write_locks);
+
+ if (blockers) {
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_dec(&eb->blocking_writers);
+ smp_wmb();
+ wake_up(&eb->write_lock_wq);
+ } else {
+ WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+ atomic_dec(&eb->spinning_writers);
+ write_unlock(&eb->lock);
+ }
return 0;
}
void btrfs_assert_tree_locked(struct extent_buffer *eb)
{
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- assert_spin_locked(&eb->lock);
+ BUG_ON(!atomic_read(&eb->write_locks));
+}
+
+void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+ BUG_ON(!atomic_read(&eb->read_locks));
}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 5c33a56..17247dd 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -19,11 +19,43 @@
#ifndef __BTRFS_LOCKING_
#define __BTRFS_LOCKING_
+#define BTRFS_WRITE_LOCK 1
+#define BTRFS_READ_LOCK 2
+#define BTRFS_WRITE_LOCK_BLOCKING 3
+#define BTRFS_READ_LOCK_BLOCKING 4
+
int btrfs_tree_lock(struct extent_buffer *eb);
int btrfs_tree_unlock(struct extent_buffer *eb);
int btrfs_try_spin_lock(struct extent_buffer *eb);
-void btrfs_set_lock_blocking(struct extent_buffer *eb);
-void btrfs_clear_lock_blocking(struct extent_buffer *eb);
+void btrfs_tree_read_lock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
void btrfs_assert_tree_locked(struct extent_buffer *eb);
+int btrfs_try_tree_read_lock(struct extent_buffer *eb);
+int btrfs_try_tree_write_lock(struct extent_buffer *eb);
+
+static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
+{
+ if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
+ btrfs_tree_unlock(eb);
+ else if (rw == BTRFS_READ_LOCK_BLOCKING)
+ btrfs_tree_read_unlock_blocking(eb);
+ else if (rw == BTRFS_READ_LOCK)
+ btrfs_tree_read_unlock(eb);
+ else
+ BUG();
+}
+
+static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
+{
+ btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
+}
+
+static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+ btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
+}
#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d..f38e452 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
{
int i;
- u32 type;
- u32 nr = btrfs_header_nritems(l);
+ u32 type, nr;
struct btrfs_item *item;
struct btrfs_root_item *ri;
struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
struct btrfs_key key;
struct btrfs_key found_key;
+ if (!l)
+ return;
+
+ nr = btrfs_header_nritems(l);
+
printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
(unsigned long long)btrfs_header_bytenr(l), nr,
btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
deleted file mode 100644
index 82d569c..0000000
--- a/fs/btrfs/ref-cache.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/sort.h>
-#include "ctree.h"
-#include "ref-cache.h"
-#include "transaction.h"
-
-static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct btrfs_leaf_ref *entry;
-
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
-
- if (bytenr < entry->bytenr)
- p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
- p = &(*p)->rb_right;
- else
- return parent;
- }
-
- entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
-static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
-{
- struct rb_node *n = root->rb_node;
- struct btrfs_leaf_ref *entry;
-
- while (n) {
- entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
- WARN_ON(!entry->in_tree);
-
- if (bytenr < entry->bytenr)
- n = n->rb_left;
- else if (bytenr > entry->bytenr)
- n = n->rb_right;
- else
- return n;
- }
- return NULL;
-}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
deleted file mode 100644
index 24f7001..0000000
--- a/fs/btrfs/ref-cache.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-#ifndef __REFCACHE__
-#define __REFCACHE__
-
-struct btrfs_extent_info {
- /* bytenr and num_bytes find the extent in the extent allocation tree */
- u64 bytenr;
- u64 num_bytes;
-
- /* objectid and offset find the back reference for the file */
- u64 objectid;
- u64 offset;
-};
-
-struct btrfs_leaf_ref {
- struct rb_node rb_node;
- struct btrfs_leaf_ref_tree *tree;
- int in_tree;
- atomic_t usage;
-
- u64 root_gen;
- u64 bytenr;
- u64 owner;
- u64 generation;
- int nritems;
-
- struct list_head list;
- struct btrfs_extent_info extents[];
-};
-
-static inline size_t btrfs_leaf_ref_size(int nr_extents)
-{
- return sizeof(struct btrfs_leaf_ref) +
- sizeof(struct btrfs_extent_info) * nr_extents;
-}
-#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 2ab5837..cfb5543 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -670,7 +670,6 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
int cowonly;
int ret;
int err = 0;
- bool need_check = true;
path1 = btrfs_alloc_path();
path2 = btrfs_alloc_path();
@@ -893,7 +892,6 @@ again:
cur->bytenr);
lower = cur;
- need_check = true;
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
BUG_ON(btrfs_root_bytenr(&root->root_item) !=
@@ -937,12 +935,14 @@ again:
/*
* add the block to pending list if we
- * need check its backrefs, we only do this once
- * while walking up a tree as we will catch
- * anything else later on.
+ * need check its backrefs. only block
+ * at 'cur->level + 1' is added to the
+ * tail of pending list. this guarantees
+ * we check backrefs from lower level
+ * blocks to upper level blocks.
*/
- if (!upper->checked && need_check) {
- need_check = false;
+ if (!upper->checked &&
+ level == cur->level + 1) {
list_add_tail(&edge->list[UPPER],
&list);
} else
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
list_add_tail(&new_edge->list[UPPER],
&new_node->lower);
}
+ } else {
+ list_add_tail(&new_node->lower, &cache->leaves);
}
rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
BUG_ON(IS_ERR(trans));
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
- min_reserved, 0);
+ ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
if (ret) {
BUG_ON(ret != -EAGAIN);
ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
- ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
- num_bytes);
+ ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
if (ret)
err = ret;
}
@@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
num_bytes = calcu_metadata_size(rc, node, 1) * 2;
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
+ ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
if (ret) {
if (ret == -EAGAIN)
rc->commit_transaction = 1;
@@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
unsigned long last_index;
struct page *page;
struct file_ra_state *ra;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int nr = 0;
int ret = 0;
@@ -2946,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
while (index <= last_index) {
+ mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+ mutex_unlock(&inode->i_mutex);
if (ret)
goto out;
@@ -2955,7 +2958,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
page_cache_sync_readahead(inode->i_mapping,
ra, NULL, index,
last_index + 1 - index);
- page = grab_cache_page(inode->i_mapping, index);
+ page = find_or_create_page(inode->i_mapping, index,
+ mask);
if (!page) {
btrfs_delalloc_release_metadata(inode,
PAGE_CACHE_SIZE);
@@ -3322,8 +3326,11 @@ static int find_data_references(struct reloc_control *rc,
}
key.objectid = ref_objectid;
- key.offset = ref_offset;
key.type = BTRFS_EXTENT_DATA_KEY;
+ if (ref_offset > ((u64)-1 << 32))
+ key.offset = 0;
+ else
+ key.offset = ref_offset;
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -3644,14 +3651,11 @@ int prepare_to_relocate(struct reloc_control *rc)
* btrfs_init_reloc_root will use them when there
* is no reservation in transaction handle.
*/
- ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
+ ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
rc->extent_root->nodesize * 256);
if (ret)
return ret;
- rc->block_rsv->refill_used = 1;
- btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
-
memset(&rc->cluster, 0, sizeof(rc->cluster));
rc->search_start = rc->block_group->key.objectid;
rc->extents_found = 0;
@@ -3776,8 +3780,7 @@ restart:
}
}
- ret = btrfs_block_rsv_check(trans, rc->extent_root,
- rc->block_rsv, 0, 5);
+ ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
if (ret < 0) {
if (ret != -EAGAIN) {
err = ret;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ebe4544..f409990 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -71,13 +71,12 @@ out:
return ret;
}
-int btrfs_set_root_node(struct btrfs_root_item *item,
- struct extent_buffer *node)
+void btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node)
{
btrfs_set_root_bytenr(item, node->start);
btrfs_set_root_level(item, btrfs_header_level(node));
btrfs_set_root_generation(item, btrfs_header_generation(node));
- return 0;
}
/*
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5..ddf2c90 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
*/
#include <linux/blkdev.h>
+#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
+#include "transaction.h"
+#include "backref.h"
+#include "extent_io.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
* any can be found.
*
* Future enhancements:
- * - To enhance the performance, better read-ahead strategies for the
- * extent-tree can be employed.
* - In case an unrepairable extent is encountered, track which files are
* affected and report them
* - In case of a read error on files with nodatasum, map the file and read
* the extent to trigger a writeback of the good copy
* - track and record media errors, throw out bad devices
* - add a mode to also read unallocated space
- * - make the prefetch cancellable
*/
struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
struct scrub_page {
u64 flags; /* extent flags */
u64 generation;
- u64 mirror_num;
+ int mirror_num;
int have_csum;
u8 csum[BTRFS_CSUM_SIZE];
};
@@ -87,6 +88,7 @@ struct scrub_dev {
int first_free;
int curr;
atomic_t in_flight;
+ atomic_t fixup_cnt;
spinlock_t list_lock;
wait_queue_head_t list_wait;
u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
spinlock_t stat_lock;
};
+struct scrub_fixup_nodatasum {
+ struct scrub_dev *sdev;
+ u64 logical;
+ struct btrfs_root *root;
+ struct btrfs_work work;
+ int mirror_num;
+};
+
+struct scrub_warning {
+ struct btrfs_path *path;
+ u64 extent_item_size;
+ char *scratch_buf;
+ char *msg_buf;
+ const char *errstr;
+ sector_t sector;
+ u64 logical;
+ struct btrfs_device *dev;
+ int msg_bufsize;
+ int scratch_bufsize;
+};
+
static void scrub_free_csums(struct scrub_dev *sdev)
{
while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
if (i != SCRUB_BIOS_PER_DEV-1)
sdev->bios[i]->next_free = i + 1;
- else
+ else
sdev->bios[i]->next_free = -1;
}
sdev->first_free = 0;
sdev->curr = -1;
atomic_set(&sdev->in_flight, 0);
+ atomic_set(&sdev->fixup_cnt, 0);
atomic_set(&sdev->cancel_req, 0);
- sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+ sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
INIT_LIST_HEAD(&sdev->csum_list);
spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,366 @@ nomem:
return ERR_PTR(-ENOMEM);
}
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ u64 isize;
+ u32 nlink;
+ int ret;
+ int i;
+ struct extent_buffer *eb;
+ struct btrfs_inode_item *inode_item;
+ struct scrub_warning *swarn = ctx;
+ struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
+ struct inode_fs_paths *ipath = NULL;
+ struct btrfs_root *local_root;
+ struct btrfs_key root_key;
+
+ root_key.objectid = root;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+ local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ if (IS_ERR(local_root)) {
+ ret = PTR_ERR(local_root);
+ goto err;
+ }
+
+ ret = inode_item_info(inum, 0, local_root, swarn->path);
+ if (ret) {
+ btrfs_release_path(swarn->path);
+ goto err;
+ }
+
+ eb = swarn->path->nodes[0];
+ inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
+ struct btrfs_inode_item);
+ isize = btrfs_inode_size(eb, inode_item);
+ nlink = btrfs_inode_nlink(eb, inode_item);
+ btrfs_release_path(swarn->path);
+
+ ipath = init_ipath(4096, local_root, swarn->path);
+ if (IS_ERR(ipath)) {
+ ret = PTR_ERR(ipath);
+ ipath = NULL;
+ goto err;
+ }
+ ret = paths_from_inode(inum, ipath);
+
+ if (ret < 0)
+ goto err;
+
+ /*
+ * we deliberately ignore the bit ipath might have been too small to
+ * hold all of the paths here
+ */
+ for (i = 0; i < ipath->fspath->elem_cnt; ++i)
+ printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+ "%s, sector %llu, root %llu, inode %llu, offset %llu, "
+ "length %llu, links %u (path: %s)\n", swarn->errstr,
+ swarn->logical, swarn->dev->name,
+ (unsigned long long)swarn->sector, root, inum, offset,
+ min(isize - offset, (u64)PAGE_SIZE), nlink,
+ (char *)(unsigned long)ipath->fspath->val[i]);
+
+ free_ipath(ipath);
+ return 0;
+
+err:
+ printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+ "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
+ "resolving failed with ret=%d\n", swarn->errstr,
+ swarn->logical, swarn->dev->name,
+ (unsigned long long)swarn->sector, root, inum, offset, ret);
+
+ free_ipath(ipath);
+ return 0;
+}
+
+static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
+ int ix)
+{
+ struct btrfs_device *dev = sbio->sdev->dev;
+ struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_key found_key;
+ struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ struct scrub_warning swarn;
+ u32 item_size;
+ int ret;
+ u64 ref_root;
+ u8 ref_level;
+ unsigned long ptr = 0;
+ const int bufsize = 4096;
+ u64 extent_offset;
+
+ path = btrfs_alloc_path();
+
+ swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
+ swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
+ swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+ swarn.logical = sbio->logical + ix * PAGE_SIZE;
+ swarn.errstr = errstr;
+ swarn.dev = dev;
+ swarn.msg_bufsize = bufsize;
+ swarn.scratch_bufsize = bufsize;
+
+ if (!path || !swarn.scratch_buf || !swarn.msg_buf)
+ goto out;
+
+ ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
+ if (ret < 0)
+ goto out;
+
+ extent_offset = swarn.logical - found_key.objectid;
+ swarn.extent_item_size = found_key.offset;
+
+ eb = path->nodes[0];
+ ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+ item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+ if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ do {
+ ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
+ &ref_root, &ref_level);
+ printk(KERN_WARNING "%s at logical %llu on dev %s, "
+ "sector %llu: metadata %s (level %d) in tree "
+ "%llu\n", errstr, swarn.logical, dev->name,
+ (unsigned long long)swarn.sector,
+ ref_level ? "node" : "leaf",
+ ret < 0 ? -1 : ref_level,
+ ret < 0 ? -1 : ref_root);
+ } while (ret != 1);
+ } else {
+ swarn.path = path;
+ iterate_extent_inodes(fs_info, path, found_key.objectid,
+ extent_offset,
+ scrub_print_warning_inode, &swarn);
+ }
+
+out:
+ btrfs_free_path(path);
+ kfree(swarn.scratch_buf);
+ kfree(swarn.msg_buf);
+}
+
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct page *page = NULL;
+ unsigned long index;
+ struct scrub_fixup_nodatasum *fixup = ctx;
+ int ret;
+ int corrected = 0;
+ struct btrfs_key key;
+ struct inode *inode = NULL;
+ u64 end = offset + PAGE_SIZE - 1;
+ struct btrfs_root *local_root;
+
+ key.objectid = root;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
+ if (IS_ERR(local_root))
+ return PTR_ERR(local_root);
+
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.objectid = inum;
+ key.offset = 0;
+ inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ index = offset >> PAGE_CACHE_SHIFT;
+
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (PageUptodate(page)) {
+ struct btrfs_mapping_tree *map_tree;
+ if (PageDirty(page)) {
+ /*
+ * we need to write the data to the defect sector. the
+ * data that was in that sector is not in memory,
+ * because the page was modified. we must not write the
+ * modified page to that sector.
+ *
+ * TODO: what could be done here: wait for the delalloc
+ * runner to write out that page (might involve
+ * COW) and see whether the sector is still
+ * referenced afterwards.
+ *
+ * For the meantime, we'll treat this error
+ * incorrectable, although there is a chance that a
+ * later scrub will find the bad sector again and that
+ * there's no dirty page in memory, then.
+ */
+ ret = -EIO;
+ goto out;
+ }
+ map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+ ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+ fixup->logical, page,
+ fixup->mirror_num);
+ unlock_page(page);
+ corrected = !ret;
+ } else {
+ /*
+ * we need to get good data first. the general readpage path
+ * will call repair_io_failure for us, we just have to make
+ * sure we read the bad mirror.
+ */
+ ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+ EXTENT_DAMAGED, GFP_NOFS);
+ if (ret) {
+ /* set_extent_bits should give proper error */
+ WARN_ON(ret > 0);
+ if (ret > 0)
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
+ btrfs_get_extent,
+ fixup->mirror_num);
+ wait_on_page_locked(page);
+
+ corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
+ end, EXTENT_DAMAGED, 0, NULL);
+ if (!corrected)
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+ EXTENT_DAMAGED, GFP_NOFS);
+ }
+
+out:
+ if (page)
+ put_page(page);
+ if (inode)
+ iput(inode);
+
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0 && corrected) {
+ /*
+ * we only need to call readpage for one of the inodes belonging
+ * to this extent. so make iterate_extent_inodes stop
+ */
+ return 1;
+ }
+
+ return -EIO;
+}
+
+static void scrub_fixup_nodatasum(struct btrfs_work *work)
+{
+ int ret;
+ struct scrub_fixup_nodatasum *fixup;
+ struct scrub_dev *sdev;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_path *path;
+ int uncorrectable = 0;
+
+ fixup = container_of(work, struct scrub_fixup_nodatasum, work);
+ sdev = fixup->sdev;
+ fs_info = fixup->root->fs_info;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.malloc_errors;
+ spin_unlock(&sdev->stat_lock);
+ uncorrectable = 1;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(fixup->root);
+ if (IS_ERR(trans)) {
+ uncorrectable = 1;
+ goto out;
+ }
+
+ /*
+ * the idea is to trigger a regular read through the standard path. we
+ * read a page from the (failed) logical address by specifying the
+ * corresponding copynum of the failed sector. thus, that readpage is
+ * expected to fail.
+ * that is the point where on-the-fly error correction will kick in
+ * (once it's finished) and rewrite the failed sector if a good copy
+ * can be found.
+ */
+ ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
+ path, scrub_fixup_readpage,
+ fixup);
+ if (ret < 0) {
+ uncorrectable = 1;
+ goto out;
+ }
+ WARN_ON(ret != 1);
+
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.corrected_errors;
+ spin_unlock(&sdev->stat_lock);
+
+out:
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans, fixup->root);
+ if (uncorrectable) {
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.uncorrectable_errors;
+ spin_unlock(&sdev->stat_lock);
+ printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
+ "(nodatasum) error at logical %llu\n",
+ fixup->logical);
+ }
+
+ btrfs_free_path(path);
+ kfree(fixup);
+
+ /* see caller why we're pretending to be paused in the scrub counters */
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_dec(&fs_info->scrubs_running);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ atomic_dec(&sdev->fixup_cnt);
+ wake_up(&fs_info->scrub_pause_wait);
+ wake_up(&sdev->list_wait);
+}
+
/*
* scrub_recheck_error gets called when either verification of the page
* failed or the bio failed to read, e.g. with EIO. In the latter case,
* recheck_error gets called for every page in the bio, even though only
* one may be bad
*/
-static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
+static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
{
+ struct scrub_dev *sdev = sbio->sdev;
+ u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
if (sbio->err) {
- if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
- (sbio->physical + ix * PAGE_SIZE) >> 9,
+ if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
sbio->bio->bi_io_vec[ix].bv_page) == 0) {
if (scrub_fixup_check(sbio, ix) == 0)
- return;
+ return 0;
}
+ if (__ratelimit(&_rs))
+ scrub_print_warning("i/o error", sbio, ix);
+ } else {
+ if (__ratelimit(&_rs))
+ scrub_print_warning("checksum error", sbio, ix);
}
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.read_errors;
+ spin_unlock(&sdev->stat_lock);
+
scrub_fixup(sbio, ix);
+ return 1;
}
static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
struct scrub_dev *sdev = sbio->sdev;
struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct btrfs_multi_bio *multi = NULL;
+ struct btrfs_bio *bbio = NULL;
+ struct scrub_fixup_nodatasum *fixup;
u64 logical = sbio->logical + ix * PAGE_SIZE;
u64 length;
int i;
@@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
(sbio->spag[ix].have_csum == 0)) {
+ fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+ if (!fixup)
+ goto uncorrectable;
+ fixup->sdev = sdev;
+ fixup->logical = logical;
+ fixup->root = fs_info->extent_root;
+ fixup->mirror_num = sbio->spag[ix].mirror_num;
/*
- * nodatasum, don't try to fix anything
- * FIXME: we can do better, open the inode and trigger a
- * writeback
+ * increment scrubs_running to prevent cancel requests from
+ * completing as long as a fixup worker is running. we must also
+ * increment scrubs_paused to prevent deadlocking on pause
+ * requests used for transactions commits (as the worker uses a
+ * transaction context). it is safe to regard the fixup worker
+ * as paused for all matters practical. effectively, we only
+ * avoid cancellation requests from completing.
*/
- goto uncorrectable;
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_inc(&fs_info->scrubs_running);
+ atomic_inc(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ atomic_inc(&sdev->fixup_cnt);
+ fixup->work.func = scrub_fixup_nodatasum;
+ btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
+ return;
}
length = PAGE_SIZE;
ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
- &multi, 0);
- if (ret || !multi || length < PAGE_SIZE) {
+ &bbio, 0);
+ if (ret || !bbio || length < PAGE_SIZE) {
printk(KERN_ERR
"scrub_fixup: btrfs_map_block failed us for %llu\n",
(unsigned long long)logical);
WARN_ON(1);
+ kfree(bbio);
return;
}
- if (multi->num_stripes == 1)
+ if (bbio->num_stripes == 1)
/* there aren't any replicas */
goto uncorrectable;
/*
* first find a good copy
*/
- for (i = 0; i < multi->num_stripes; ++i) {
- if (i == sbio->spag[ix].mirror_num)
+ for (i = 0; i < bbio->num_stripes; ++i) {
+ if (i + 1 == sbio->spag[ix].mirror_num)
continue;
- if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
- multi->stripes[i].physical >> 9,
+ if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
+ bbio->stripes[i].physical >> 9,
sbio->bio->bi_io_vec[ix].bv_page)) {
/* I/O-error, this is not a good copy */
continue;
@@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
if (scrub_fixup_check(sbio, ix) == 0)
break;
}
- if (i == multi->num_stripes)
+ if (i == bbio->num_stripes)
goto uncorrectable;
if (!sdev->readonly) {
@@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
}
}
- kfree(multi);
+ kfree(bbio);
spin_lock(&sdev->stat_lock);
++sdev->stat.corrected_errors;
spin_unlock(&sdev->stat_lock);
- if (printk_ratelimit())
- printk(KERN_ERR "btrfs: fixed up at %llu\n",
- (unsigned long long)logical);
+ printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
+ (unsigned long long)logical);
return;
uncorrectable:
- kfree(multi);
+ kfree(bbio);
spin_lock(&sdev->stat_lock);
++sdev->stat.uncorrectable_errors;
spin_unlock(&sdev->stat_lock);
- if (printk_ratelimit())
- printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
- (unsigned long long)logical);
+ printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
+ "logical %llu\n", (unsigned long long)logical);
}
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work)
int ret;
if (sbio->err) {
+ ret = 0;
for (i = 0; i < sbio->count; ++i)
- scrub_recheck_error(sbio, i);
+ ret |= scrub_recheck_error(sbio, i);
+ if (!ret) {
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.unverified_errors;
+ spin_unlock(&sdev->stat_lock);
+ }
sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work)
bi->bv_offset = 0;
bi->bv_len = PAGE_SIZE;
}
-
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.read_errors;
- spin_unlock(&sdev->stat_lock);
goto out;
}
for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work)
WARN_ON(1);
}
kunmap_atomic(buffer, KM_USER0);
- if (ret)
- scrub_recheck_error(sbio, i);
+ if (ret) {
+ ret = scrub_recheck_error(sbio, i);
+ if (!ret) {
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.unverified_errors;
+ spin_unlock(&sdev->stat_lock);
+ }
+ }
}
out:
@@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
static int scrub_submit(struct scrub_dev *sdev)
{
struct scrub_bio *sbio;
- struct bio *bio;
- int i;
if (sdev->curr == -1)
return 0;
sbio = sdev->bios[sdev->curr];
-
- bio = bio_alloc(GFP_NOFS, sbio->count);
- if (!bio)
- goto nomem;
-
- bio->bi_private = sbio;
- bio->bi_end_io = scrub_bio_end_io;
- bio->bi_bdev = sdev->dev->bdev;
- bio->bi_sector = sbio->physical >> 9;
-
- for (i = 0; i < sbio->count; ++i) {
- struct page *page;
- int ret;
-
- page = alloc_page(GFP_NOFS);
- if (!page)
- goto nomem;
-
- ret = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (!ret) {
- __free_page(page);
- goto nomem;
- }
- }
-
sbio->err = 0;
sdev->curr = -1;
atomic_inc(&sdev->in_flight);
- submit_bio(READ, bio);
+ submit_bio(READ, sbio->bio);
return 0;
-
-nomem:
- scrub_free_bio(bio);
-
- return -ENOMEM;
}
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
- u64 physical, u64 flags, u64 gen, u64 mirror_num,
+ u64 physical, u64 flags, u64 gen, int mirror_num,
u8 *csum, int force)
{
struct scrub_bio *sbio;
+ struct page *page;
+ int ret;
again:
/*
@@ -628,12 +990,22 @@ again:
}
sbio = sdev->bios[sdev->curr];
if (sbio->count == 0) {
+ struct bio *bio;
+
sbio->physical = physical;
sbio->logical = logical;
+ bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_private = sbio;
+ bio->bi_end_io = scrub_bio_end_io;
+ bio->bi_bdev = sdev->dev->bdev;
+ bio->bi_sector = sbio->physical >> 9;
+ sbio->err = 0;
+ sbio->bio = bio;
} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
sbio->logical + sbio->count * PAGE_SIZE != logical) {
- int ret;
-
ret = scrub_submit(sdev);
if (ret)
return ret;
@@ -643,6 +1015,20 @@ again:
sbio->spag[sbio->count].generation = gen;
sbio->spag[sbio->count].have_csum = 0;
sbio->spag[sbio->count].mirror_num = mirror_num;
+
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+
+ ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
+ if (!ret) {
+ __free_page(page);
+ ret = scrub_submit(sdev);
+ if (ret)
+ return ret;
+ goto again;
+ }
+
if (csum) {
sbio->spag[sbio->count].have_csum = 1;
memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
@@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
- u64 physical, u64 flags, u64 gen, u64 mirror_num)
+ u64 physical, u64 flags, u64 gen, int mirror_num)
{
int ret;
u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
int slot;
int i;
u64 nstripes;
- int start_stripe;
struct extent_buffer *l;
struct btrfs_key key;
u64 physical;
u64 logical;
u64 generation;
- u64 mirror_num;
+ int mirror_num;
+ struct reada_control *reada1;
+ struct reada_control *reada2;
+ struct btrfs_key key_start;
+ struct btrfs_key key_end;
u64 increment = map->stripe_len;
u64 offset;
@@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
- mirror_num = 0;
+ mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
offset = map->stripe_len * (num / map->sub_stripes);
increment = map->stripe_len * factor;
- mirror_num = num % map->sub_stripes;
+ mirror_num = num % map->sub_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
increment = map->stripe_len;
- mirror_num = num % map->num_stripes;
+ mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
increment = map->stripe_len;
- mirror_num = num % map->num_stripes;
+ mirror_num = num % map->num_stripes + 1;
} else {
increment = map->stripe_len;
- mirror_num = 0;
+ mirror_num = 1;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
path->search_commit_root = 1;
path->skip_locking = 1;
/*
- * find all extents for each stripe and just read them to get
- * them into the page cache
- * FIXME: we can do better. build a more intelligent prefetching
+ * trigger the readahead for extent tree csum tree and wait for
+ * completion. During readahead, the scrub is officially paused
+ * to not hold off transaction commits
*/
logical = base + offset;
- physical = map->stripes[num].physical;
- ret = 0;
- for (i = 0; i < nstripes; ++i) {
- key.objectid = logical;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = (u64)0;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out_noplug;
- /*
- * we might miss half an extent here, but that doesn't matter,
- * as it's only the prefetch
- */
- while (1) {
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out_noplug;
-
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
+ wait_event(sdev->list_wait,
+ atomic_read(&sdev->in_flight) == 0);
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
- if (key.objectid >= logical + map->stripe_len)
- break;
+ /* FIXME it might be better to start readahead at commit root */
+ key_start.objectid = logical;
+ key_start.type = BTRFS_EXTENT_ITEM_KEY;
+ key_start.offset = (u64)0;
+ key_end.objectid = base + offset + nstripes * increment;
+ key_end.type = BTRFS_EXTENT_ITEM_KEY;
+ key_end.offset = (u64)0;
+ reada1 = btrfs_reada_add(root, &key_start, &key_end);
+
+ key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_start.type = BTRFS_EXTENT_CSUM_KEY;
+ key_start.offset = logical;
+ key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_end.type = BTRFS_EXTENT_CSUM_KEY;
+ key_end.offset = base + offset + nstripes * increment;
+ reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+
+ if (!IS_ERR(reada1))
+ btrfs_reada_wait(reada1);
+ if (!IS_ERR(reada2))
+ btrfs_reada_wait(reada2);
- path->slots[0]++;
- }
- btrfs_release_path(path);
- logical += increment;
- physical += map->stripe_len;
- cond_resched();
+ mutex_lock(&fs_info->scrub_lock);
+ while (atomic_read(&fs_info->scrub_pause_req)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrub_pause_req) == 0);
+ mutex_lock(&fs_info->scrub_lock);
}
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
/*
* collect all data csums for the stripe to avoid seeking during
* the scrub. This might currently (crc32) end up to be about 1MB
*/
- start_stripe = 0;
blk_start_plug(&plug);
-again:
- logical = base + offset + start_stripe * increment;
- for (i = start_stripe; i < nstripes; ++i) {
- ret = btrfs_lookup_csums_range(csum_root, logical,
- logical + map->stripe_len - 1,
- &sdev->csum_list, 1);
- if (ret)
- goto out;
- logical += increment;
- cond_resched();
- }
/*
* now find all extents for each stripe and scrub them
*/
- logical = base + offset + start_stripe * increment;
- physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+ logical = base + offset;
+ physical = map->stripes[num].physical;
ret = 0;
- for (i = start_stripe; i < nstripes; ++i) {
+ for (i = 0; i < nstripes; ++i) {
/*
* canceled?
*/
@@ -882,11 +1257,14 @@ again:
atomic_dec(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
wake_up(&fs_info->scrub_pause_wait);
- scrub_free_csums(sdev);
- start_stripe = i;
- goto again;
}
+ ret = btrfs_lookup_csums_range(csum_root, logical,
+ logical + map->stripe_len - 1,
+ &sdev->csum_list, 1);
+ if (ret)
+ goto out;
+
key.objectid = logical;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)0;
@@ -982,7 +1360,6 @@ next:
out:
blk_finish_plug(&plug);
-out_noplug:
btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
@@ -1158,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
mutex_lock(&fs_info->scrub_lock);
if (fs_info->scrub_workers_refcnt == 0) {
btrfs_init_workers(&fs_info->scrub_workers, "scrub",
fs_info->thread_pool_size, &fs_info->generic_worker);
fs_info->scrub_workers.idle_thresh = 4;
- btrfs_start_workers(&fs_info->scrub_workers, 1);
+ ret = btrfs_start_workers(&fs_info->scrub_workers);
+ if (ret)
+ goto out;
}
++fs_info->scrub_workers_refcnt;
+out:
mutex_unlock(&fs_info->scrub_lock);
- return 0;
+ return ret;
}
static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
@@ -1253,10 +1634,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
ret = scrub_enumerate_chunks(sdev, start, end);
wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
-
atomic_dec(&fs_info->scrubs_running);
wake_up(&fs_info->scrub_pause_wait);
+ wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
+
if (progress)
memcpy(progress, &sdev->stat, sizeof(*progress));
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c0f7eca..bc1f6ad 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -50,36 +50,22 @@ u##bits btrfs_##name(struct extent_buffer *eb, \
unsigned long part_offset = (unsigned long)s; \
unsigned long offset = part_offset + offsetof(type, member); \
type *p; \
- /* ugly, but we want the fast path here */ \
- if (eb->map_token && offset >= eb->map_start && \
- offset + sizeof(((type *)0)->member) <= eb->map_start + \
- eb->map_len) { \
- p = (type *)(eb->kaddr + part_offset - eb->map_start); \
- return le##bits##_to_cpu(p->member); \
- } \
- { \
- int err; \
- char *map_token; \
- char *kaddr; \
- int unmap_on_exit = (eb->map_token == NULL); \
- unsigned long map_start; \
- unsigned long map_len; \
- u##bits res; \
- err = map_extent_buffer(eb, offset, \
- sizeof(((type *)0)->member), \
- &map_token, &kaddr, \
- &map_start, &map_len, KM_USER1); \
- if (err) { \
- __le##bits leres; \
- read_eb_member(eb, s, type, member, &leres); \
- return le##bits##_to_cpu(leres); \
- } \
- p = (type *)(kaddr + part_offset - map_start); \
- res = le##bits##_to_cpu(p->member); \
- if (unmap_on_exit) \
- unmap_extent_buffer(eb, map_token, KM_USER1); \
- return res; \
- } \
+ int err; \
+ char *kaddr; \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ u##bits res; \
+ err = map_private_extent_buffer(eb, offset, \
+ sizeof(((type *)0)->member), \
+ &kaddr, &map_start, &map_len); \
+ if (err) { \
+ __le##bits leres; \
+ read_eb_member(eb, s, type, member, &leres); \
+ return le##bits##_to_cpu(leres); \
+ } \
+ p = (type *)(kaddr + part_offset - map_start); \
+ res = le##bits##_to_cpu(p->member); \
+ return res; \
} \
void btrfs_set_##name(struct extent_buffer *eb, \
type *s, u##bits val) \
@@ -87,36 +73,21 @@ void btrfs_set_##name(struct extent_buffer *eb, \
unsigned long part_offset = (unsigned long)s; \
unsigned long offset = part_offset + offsetof(type, member); \
type *p; \
- /* ugly, but we want the fast path here */ \
- if (eb->map_token && offset >= eb->map_start && \
- offset + sizeof(((type *)0)->member) <= eb->map_start + \
- eb->map_len) { \
- p = (type *)(eb->kaddr + part_offset - eb->map_start); \
- p->member = cpu_to_le##bits(val); \
- return; \
- } \
- { \
- int err; \
- char *map_token; \
- char *kaddr; \
- int unmap_on_exit = (eb->map_token == NULL); \
- unsigned long map_start; \
- unsigned long map_len; \
- err = map_extent_buffer(eb, offset, \
- sizeof(((type *)0)->member), \
- &map_token, &kaddr, \
- &map_start, &map_len, KM_USER1); \
- if (err) { \
- __le##bits val2; \
- val2 = cpu_to_le##bits(val); \
- write_eb_member(eb, s, type, member, &val2); \
- return; \
- } \
- p = (type *)(kaddr + part_offset - map_start); \
- p->member = cpu_to_le##bits(val); \
- if (unmap_on_exit) \
- unmap_extent_buffer(eb, map_token, KM_USER1); \
- } \
+ int err; \
+ char *kaddr; \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ err = map_private_extent_buffer(eb, offset, \
+ sizeof(((type *)0)->member), \
+ &kaddr, &map_start, &map_len); \
+ if (err) { \
+ __le##bits val2; \
+ val2 = cpu_to_le##bits(val); \
+ write_eb_member(eb, s, type, member, &val2); \
+ return; \
+ } \
+ p = (type *)(kaddr + part_offset - map_start); \
+ p->member = cpu_to_le##bits(val); \
}
#include "ctree.h"
@@ -125,15 +96,6 @@ void btrfs_node_key(struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
unsigned long ptr = btrfs_node_key_ptr_offset(nr);
- if (eb->map_token && ptr >= eb->map_start &&
- ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
- memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
- sizeof(*disk_key));
- return;
- } else if (eb->map_token) {
- unmap_extent_buffer(eb, eb->map_token, KM_USER1);
- eb->map_token = NULL;
- }
read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
struct btrfs_key_ptr, key, disk_key);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4..200f63b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,8 @@
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/cleancache.h>
+#include <linux/mnt_namespace.h>
+#include <linux/ratelimit.h>
#include "compat.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -58,6 +60,7 @@
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
+static struct file_system_type btrfs_fs_type;
static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
char nbuf[16])
@@ -162,7 +165,7 @@ enum {
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
- Opt_inode_cache, Opt_err,
+ Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
};
static match_table_t tokens = {
@@ -195,6 +198,8 @@ static match_table_t tokens = {
{Opt_subvolrootid, "subvolrootid=%d"},
{Opt_defrag, "autodefrag"},
{Opt_inode_cache, "inode_cache"},
+ {Opt_no_space_cache, "nospace_cache"},
+ {Opt_recovery, "recovery"},
{Opt_err, NULL},
};
@@ -206,14 +211,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
{
struct btrfs_fs_info *info = root->fs_info;
substring_t args[MAX_OPT_ARGS];
- char *p, *num, *orig;
+ char *p, *num, *orig = NULL;
+ u64 cache_gen;
int intarg;
int ret = 0;
char *compress_type;
bool compress_force = false;
+ cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+ if (cache_gen)
+ btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+
if (!options)
- return 0;
+ goto out;
/*
* strsep changes the string, duplicate it because parse_options
@@ -360,9 +370,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, DISCARD);
break;
case Opt_space_cache:
- printk(KERN_INFO "btrfs: enabling disk space caching\n");
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
break;
+ case Opt_no_space_cache:
+ printk(KERN_INFO "btrfs: disabling disk space caching\n");
+ btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
+ break;
case Opt_inode_cache:
printk(KERN_INFO "btrfs: enabling inode map caching\n");
btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +394,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
printk(KERN_INFO "btrfs: enabling auto defrag");
btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
break;
+ case Opt_recovery:
+ printk(KERN_INFO "btrfs: enabling auto recovery");
+ btrfs_set_opt(info->mount_opt, RECOVERY);
+ break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -391,6 +408,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
}
out:
+ if (!ret && btrfs_test_opt(root, SPACE_CACHE))
+ printk(KERN_INFO "btrfs: disk space caching is enabled\n");
kfree(orig);
return ret;
}
@@ -406,12 +425,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
{
substring_t args[MAX_OPT_ARGS];
- char *opts, *orig, *p;
+ char *device_name, *opts, *orig, *p;
int error = 0;
int intarg;
if (!options)
- goto out;
+ return 0;
/*
* strsep changes the string, duplicate it because parse_options
@@ -430,6 +449,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
token = match_token(p, tokens, args);
switch (token) {
case Opt_subvol:
+ kfree(*subvol_name);
*subvol_name = match_strdup(&args[0]);
break;
case Opt_subvolid:
@@ -457,29 +477,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
}
break;
case Opt_device:
- error = btrfs_scan_one_device(match_strdup(&args[0]),
+ device_name = match_strdup(&args[0]);
+ if (!device_name) {
+ error = -ENOMEM;
+ goto out;
+ }
+ error = btrfs_scan_one_device(device_name,
flags, holder, fs_devices);
+ kfree(device_name);
if (error)
- goto out_free_opts;
+ goto out;
break;
default:
break;
}
}
- out_free_opts:
+out:
kfree(orig);
- out:
- /*
- * If no subvolume name is specified we use the default one. Allocate
- * a copy of the string "." here so that code later in the
- * mount path doesn't care if it's the default volume or another one.
- */
- if (!*subvol_name) {
- *subvol_name = kstrdup(".", GFP_KERNEL);
- if (!*subvol_name)
- return -ENOMEM;
- }
return error;
}
@@ -492,7 +507,6 @@ static struct dentry *get_default_root(struct super_block *sb,
struct btrfs_path *path;
struct btrfs_key location;
struct inode *inode;
- struct dentry *dentry;
u64 dir_id;
int new = 0;
@@ -517,7 +531,7 @@ static struct dentry *get_default_root(struct super_block *sb,
* will mount by default if we haven't been given a specific subvolume
* to mount.
*/
- dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+ dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
@@ -566,29 +580,7 @@ setup_root:
return dget(sb->s_root);
}
- if (new) {
- const struct qstr name = { .name = "/", .len = 1 };
-
- /*
- * New inode, we need to make the dentry a sibling of s_root so
- * everything gets cleaned up properly on unmount.
- */
- dentry = d_alloc(sb->s_root, &name);
- if (!dentry) {
- iput(inode);
- return ERR_PTR(-ENOMEM);
- }
- d_splice_alias(inode, dentry);
- } else {
- /*
- * We found the inode in cache, just find a dentry for it and
- * put the reference to the inode we just got.
- */
- dentry = d_find_alias(inode);
- iput(inode);
- }
-
- return dentry;
+ return d_obtain_alias(inode);
}
static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +711,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",noacl");
if (btrfs_test_opt(root, SPACE_CACHE))
seq_puts(seq, ",space_cache");
+ else
+ seq_puts(seq, ",nospace_cache");
if (btrfs_test_opt(root, CLEAR_CACHE))
seq_puts(seq, ",clear_cache");
if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +747,111 @@ static int btrfs_set_super(struct super_block *s, void *data)
return set_anon_super(s, data);
}
+/*
+ * subvolumes are identified by ino 256
+ */
+static inline int is_subvolume_inode(struct inode *inode)
+{
+ if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+ return 0;
+}
+
+/*
+ * This will strip out the subvol=%s argument for an argument string and add
+ * subvolid=0 to make sure we get the actual tree root for path walking to the
+ * subvol we want.
+ */
+static char *setup_root_args(char *args)
+{
+ unsigned copied = 0;
+ unsigned len = strlen(args) + 2;
+ char *pos;
+ char *ret;
+
+ /*
+ * We need the same args as before, but minus
+ *
+ * subvol=a
+ *
+ * and add
+ *
+ * subvolid=0
+ *
+ * which is a difference of 2 characters, so we allocate strlen(args) +
+ * 2 characters.
+ */
+ ret = kzalloc(len * sizeof(char), GFP_NOFS);
+ if (!ret)
+ return NULL;
+ pos = strstr(args, "subvol=");
+
+ /* This shouldn't happen, but just in case.. */
+ if (!pos) {
+ kfree(ret);
+ return NULL;
+ }
+
+ /*
+ * The subvol=<> arg is not at the front of the string, copy everybody
+ * up to that into ret.
+ */
+ if (pos != args) {
+ *pos = '\0';
+ strcpy(ret, args);
+ copied += strlen(args);
+ pos++;
+ }
+
+ strncpy(ret + copied, "subvolid=0", len - copied);
+
+ /* Length of subvolid=0 */
+ copied += 10;
+
+ /*
+ * If there is no , after the subvol= option then we know there's no
+ * other options and we can just return.
+ */
+ pos = strchr(pos, ',');
+ if (!pos)
+ return ret;
+
+ /* Copy the rest of the arguments into our buffer */
+ strncpy(ret + copied, pos, len - copied);
+ copied += strlen(pos);
+
+ return ret;
+}
+
+static struct dentry *mount_subvol(const char *subvol_name, int flags,
+ const char *device_name, char *data)
+{
+ struct dentry *root;
+ struct vfsmount *mnt;
+ char *newargs;
+
+ newargs = setup_root_args(data);
+ if (!newargs)
+ return ERR_PTR(-ENOMEM);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
+ newargs);
+ kfree(newargs);
+ if (IS_ERR(mnt))
+ return ERR_CAST(mnt);
+
+ root = mount_subtree(mnt, subvol_name);
+
+ if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+ struct super_block *s = root->d_sb;
+ dput(root);
+ root = ERR_PTR(-EINVAL);
+ deactivate_locked_super(s);
+ printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
+ subvol_name);
+ }
+
+ return root;
+}
/*
* Find a superblock for the given device / mount point.
@@ -767,7 +866,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
struct super_block *s;
struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
- struct btrfs_root *tree_root = NULL;
struct btrfs_fs_info *fs_info = NULL;
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
@@ -781,21 +879,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error = btrfs_parse_early_options(data, mode, fs_type,
&subvol_name, &subvol_objectid,
&subvol_rootid, &fs_devices);
- if (error)
+ if (error) {
+ kfree(subvol_name);
return ERR_PTR(error);
+ }
- error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
- if (error)
- goto error_free_subvol_name;
+ if (subvol_name) {
+ root = mount_subvol(subvol_name, flags, device_name, data);
+ kfree(subvol_name);
+ return root;
+ }
- error = btrfs_open_devices(fs_devices, mode, fs_type);
+ error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
if (error)
- goto error_free_subvol_name;
-
- if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
- error = -EACCES;
- goto error_close_devices;
- }
+ return ERR_PTR(error);
/*
* Setup a dummy root and fs_info for test/set super. This is because
@@ -804,19 +901,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
* then open_ctree will properly initialize everything later.
*/
fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
- tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- if (!fs_info || !tree_root) {
+ if (!fs_info)
+ return ERR_PTR(-ENOMEM);
+
+ fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ if (!fs_info->tree_root) {
error = -ENOMEM;
- goto error_close_devices;
+ goto error_fs_info;
}
- fs_info->tree_root = tree_root;
+ fs_info->tree_root->fs_info = fs_info;
fs_info->fs_devices = fs_devices;
- tree_root->fs_info = fs_info;
+
+ fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ if (!fs_info->super_copy || !fs_info->super_for_commit) {
+ error = -ENOMEM;
+ goto error_fs_info;
+ }
+
+ error = btrfs_open_devices(fs_devices, mode, fs_type);
+ if (error)
+ goto error_fs_info;
+
+ if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+ error = -EACCES;
+ goto error_close_devices;
+ }
bdev = fs_devices->latest_bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
- if (IS_ERR(s))
- goto error_s;
+ s = sget(fs_type, btrfs_test_super, btrfs_set_super,
+ fs_info->tree_root);
+ if (IS_ERR(s)) {
+ error = PTR_ERR(s);
+ goto error_close_devices;
+ }
if (s->s_root) {
if ((flags ^ s->s_flags) & MS_RDONLY) {
@@ -826,75 +944,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
}
btrfs_close_devices(fs_devices);
- kfree(fs_info);
- kfree(tree_root);
+ free_fs_info(fs_info);
} else {
char b[BDEVNAME_SIZE];
s->s_flags = flags | MS_NOSEC;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ btrfs_sb(s)->fs_info->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
if (error) {
deactivate_locked_super(s);
- goto error_free_subvol_name;
+ return ERR_PTR(error);
}
- btrfs_sb(s)->fs_info->bdev_holder = fs_type;
s->s_flags |= MS_ACTIVE;
}
- /* if they gave us a subvolume name bind mount into that */
- if (strcmp(subvol_name, ".")) {
- struct dentry *new_root;
-
- root = get_default_root(s, subvol_rootid);
- if (IS_ERR(root)) {
- error = PTR_ERR(root);
- deactivate_locked_super(s);
- goto error_free_subvol_name;
- }
-
- mutex_lock(&root->d_inode->i_mutex);
- new_root = lookup_one_len(subvol_name, root,
- strlen(subvol_name));
- mutex_unlock(&root->d_inode->i_mutex);
-
- if (IS_ERR(new_root)) {
- dput(root);
- deactivate_locked_super(s);
- error = PTR_ERR(new_root);
- goto error_free_subvol_name;
- }
- if (!new_root->d_inode) {
- dput(root);
- dput(new_root);
- deactivate_locked_super(s);
- error = -ENXIO;
- goto error_free_subvol_name;
- }
- dput(root);
- root = new_root;
- } else {
- root = get_default_root(s, subvol_objectid);
- if (IS_ERR(root)) {
- error = PTR_ERR(root);
- deactivate_locked_super(s);
- goto error_free_subvol_name;
- }
+ root = get_default_root(s, subvol_objectid);
+ if (IS_ERR(root)) {
+ deactivate_locked_super(s);
+ return root;
}
- kfree(subvol_name);
return root;
-error_s:
- error = PTR_ERR(s);
error_close_devices:
btrfs_close_devices(fs_devices);
- kfree(fs_info);
- kfree(tree_root);
-error_free_subvol_name:
- kfree(subvol_name);
+error_fs_info:
+ free_fs_info(fs_info);
return ERR_PTR(error);
}
@@ -919,7 +997,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (root->fs_info->fs_devices->rw_devices == 0)
return -EACCES;
- if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
+ if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
return -EINVAL;
ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -976,11 +1054,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
u64 avail_space;
u64 used_space;
u64 min_stripe_size;
- int min_stripes = 1;
+ int min_stripes = 1, num_stripes = 1;
int i = 0, nr_devices;
int ret;
- nr_devices = fs_info->fs_devices->rw_devices;
+ nr_devices = fs_info->fs_devices->open_devices;
BUG_ON(!nr_devices);
devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -990,20 +1068,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
/* calc min stripe number for data space alloction */
type = btrfs_get_alloc_profile(root, 1);
- if (type & BTRFS_BLOCK_GROUP_RAID0)
+ if (type & BTRFS_BLOCK_GROUP_RAID0) {
min_stripes = 2;
- else if (type & BTRFS_BLOCK_GROUP_RAID1)
+ num_stripes = nr_devices;
+ } else if (type & BTRFS_BLOCK_GROUP_RAID1) {
min_stripes = 2;
- else if (type & BTRFS_BLOCK_GROUP_RAID10)
+ num_stripes = 2;
+ } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
min_stripes = 4;
+ num_stripes = 4;
+ }
if (type & BTRFS_BLOCK_GROUP_DUP)
min_stripe_size = 2 * BTRFS_STRIPE_LEN;
else
min_stripe_size = BTRFS_STRIPE_LEN;
- list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
- if (!device->in_fs_metadata)
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->in_fs_metadata || !device->bdev)
continue;
avail_space = device->total_bytes - device->bytes_used;
@@ -1064,13 +1146,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
i = nr_devices - 1;
avail_space = 0;
while (nr_devices >= min_stripes) {
+ if (num_stripes > nr_devices)
+ num_stripes = nr_devices;
+
if (devices_info[i].max_avail >= min_stripe_size) {
int j;
u64 alloc_size;
- avail_space += devices_info[i].max_avail * min_stripes;
+ avail_space += devices_info[i].max_avail * num_stripes;
alloc_size = devices_info[i].max_avail;
- for (j = i + 1 - min_stripes; j <= i; j++)
+ for (j = i + 1 - num_stripes; j <= i; j++)
devices_info[j].max_avail -= alloc_size;
}
i--;
@@ -1085,7 +1170,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_root *root = btrfs_sb(dentry->d_sb);
- struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
@@ -1187,6 +1272,16 @@ static int btrfs_unfreeze(struct super_block *sb)
return 0;
}
+static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
+{
+ int ret;
+
+ ret = btrfs_dirty_inode(inode);
+ if (ret)
+ printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
+ "error %d\n", btrfs_ino(inode), ret);
+}
+
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
@@ -1194,7 +1289,7 @@ static const struct super_operations btrfs_super_ops = {
.sync_fs = btrfs_sync_fs,
.show_options = btrfs_show_options,
.write_inode = btrfs_write_inode,
- .dirty_inode = btrfs_dirty_inode,
+ .dirty_inode = btrfs_fs_dirty_inode,
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_destroy_inode,
.statfs = btrfs_statfs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 51dcec8..292e847 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
struct btrfs_transaction *cur_trans;
spin_lock(&root->fs_info->trans_lock);
+loop:
if (root->fs_info->trans_no_join) {
if (!nofail) {
spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
+
spin_lock(&root->fs_info->trans_lock);
if (root->fs_info->running_transaction) {
+ /*
+ * someone started a transaction after we unlocked. Make sure
+ * to redo the trans_no_join checks above
+ */
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
cur_trans = root->fs_info->running_transaction;
- atomic_inc(&cur_trans->use_count);
- atomic_inc(&cur_trans->num_writers);
- cur_trans->num_joined++;
- spin_unlock(&root->fs_info->trans_lock);
- return 0;
+ goto loop;
}
+
atomic_set(&cur_trans->num_writers, 1);
cur_trans->num_joined = 0;
init_waitqueue_head(&cur_trans->writer_wait);
@@ -216,17 +219,11 @@ static void wait_current_trans(struct btrfs_root *root)
spin_lock(&root->fs_info->trans_lock);
cur_trans = root->fs_info->running_transaction;
if (cur_trans && cur_trans->blocked) {
- DEFINE_WAIT(wait);
atomic_inc(&cur_trans->use_count);
spin_unlock(&root->fs_info->trans_lock);
- while (1) {
- prepare_to_wait(&root->fs_info->transaction_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (!cur_trans->blocked)
- break;
- schedule();
- }
- finish_wait(&root->fs_info->transaction_wait, &wait);
+
+ wait_event(root->fs_info->transaction_wait,
+ !cur_trans->blocked);
put_transaction(cur_trans);
} else {
spin_unlock(&root->fs_info->trans_lock);
@@ -260,7 +257,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
{
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
- int retries = 0;
+ u64 num_bytes = 0;
int ret;
if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -274,6 +271,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
h->block_rsv = NULL;
goto got_it;
}
+
+ /*
+ * Do the reservation before we join the transaction so we can do all
+ * the appropriate flushing if need be.
+ */
+ if (num_items > 0 && root != root->fs_info->chunk_root) {
+ num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+ ret = btrfs_block_rsv_add(root,
+ &root->fs_info->trans_block_rsv,
+ num_bytes);
+ if (ret)
+ return ERR_PTR(ret);
+ }
again:
h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
if (!h)
@@ -310,24 +320,9 @@ again:
goto again;
}
- if (num_items > 0) {
- ret = btrfs_trans_reserve_metadata(h, root, num_items);
- if (ret == -EAGAIN && !retries) {
- retries++;
- btrfs_commit_transaction(h, root);
- goto again;
- } else if (ret == -EAGAIN) {
- /*
- * We have already retried and got EAGAIN, so really we
- * don't have space, so set ret to -ENOSPC.
- */
- ret = -ENOSPC;
- }
-
- if (ret < 0) {
- btrfs_end_transaction(h, root);
- return ERR_PTR(ret);
- }
+ if (num_bytes) {
+ h->block_rsv = &root->fs_info->trans_block_rsv;
+ h->bytes_reserved = num_bytes;
}
got_it:
@@ -359,19 +354,10 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
}
/* wait for a transaction commit to be fully complete */
-static noinline int wait_for_commit(struct btrfs_root *root,
+static noinline void wait_for_commit(struct btrfs_root *root,
struct btrfs_transaction *commit)
{
- DEFINE_WAIT(wait);
- while (!commit->commit_done) {
- prepare_to_wait(&commit->commit_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (commit->commit_done)
- break;
- schedule();
- }
- finish_wait(&commit->commit_wait, &wait);
- return 0;
+ wait_event(commit->commit_wait, commit->commit_done);
}
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -435,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
int ret;
- ret = btrfs_block_rsv_check(trans, root,
- &root->fs_info->global_block_rsv, 0, 5);
+
+ ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
return ret ? 1 : 0;
}
@@ -444,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_block_rsv *rsv = trans->block_rsv;
int updates;
smp_mb();
if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
return 1;
+ /*
+ * We need to do this in case we're deleting csums so the global block
+ * rsv get's used instead of the csum block rsv.
+ */
+ trans->block_rsv = NULL;
+
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (updates)
btrfs_run_delayed_refs(trans, root, updates);
+ trans->block_rsv = rsv;
+
return should_end_transaction(trans, root);
}
@@ -465,11 +460,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *info = root->fs_info;
int count = 0;
- if (--trans->use_count) {
+ if (trans->use_count > 1) {
+ trans->use_count--;
trans->block_rsv = trans->orig_rsv;
return 0;
}
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
while (count < 4) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
@@ -490,8 +488,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
count++;
}
- btrfs_trans_release_metadata(trans, root);
-
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root)) {
trans->transaction->blocked = 1;
@@ -572,50 +568,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark)
{
- int ret;
int err = 0;
int werr = 0;
- struct page *page;
- struct inode *btree_inode = root->fs_info->btree_inode;
+ struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
u64 start = 0;
u64 end;
- unsigned long index;
- while (1) {
- ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark);
- if (ret)
- break;
- while (start <= end) {
- cond_resched();
-
- index = start >> PAGE_CACHE_SHIFT;
- start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
- page = find_get_page(btree_inode->i_mapping, index);
- if (!page)
- continue;
-
- btree_lock_page_hook(page);
- if (!page->mapping) {
- unlock_page(page);
- page_cache_release(page);
- continue;
- }
-
- if (PageWriteback(page)) {
- if (PageDirty(page))
- wait_on_page_writeback(page);
- else {
- unlock_page(page);
- page_cache_release(page);
- continue;
- }
- }
- err = write_one_page(page, 0);
- if (err)
- werr = err;
- page_cache_release(page);
- }
+ while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark)) {
+ convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
+ GFP_NOFS);
+ err = filemap_fdatawrite_range(mapping, start, end);
+ if (err)
+ werr = err;
+ cond_resched();
+ start = end + 1;
}
if (err)
werr = err;
@@ -631,39 +598,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark)
{
- int ret;
int err = 0;
int werr = 0;
- struct page *page;
- struct inode *btree_inode = root->fs_info->btree_inode;
+ struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
u64 start = 0;
u64 end;
- unsigned long index;
- while (1) {
- ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark);
- if (ret)
- break;
-
- clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
- while (start <= end) {
- index = start >> PAGE_CACHE_SHIFT;
- start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
- page = find_get_page(btree_inode->i_mapping, index);
- if (!page)
- continue;
- if (PageDirty(page)) {
- btree_lock_page_hook(page);
- wait_on_page_writeback(page);
- err = write_one_page(page, 0);
- if (err)
- werr = err;
- }
- wait_on_page_writeback(page);
- page_cache_release(page);
- cond_resched();
- }
+ while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_NEED_WAIT)) {
+ clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
+ err = filemap_fdatawait_range(mapping, start, end);
+ if (err)
+ werr = err;
+ cond_resched();
+ start = end + 1;
}
if (err)
werr = err;
@@ -683,7 +631,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
ret = btrfs_write_marked_extents(root, dirty_pages, mark);
ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
- return ret || ret2;
+
+ if (ret)
+ return ret;
+ if (ret2)
+ return ret2;
+ return 0;
}
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -826,6 +779,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_save_ino_cache(root, trans);
+ /* see comments in should_cow_block() */
+ root->force_cow = 0;
+ smp_wmb();
+
if (root->commit_root != root->node) {
mutex_lock(&root->fs_commit_mutex);
switch_commit_root(root);
@@ -894,6 +851,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
+ struct btrfs_block_rsv *rsv;
struct inode *parent_inode;
struct dentry *parent;
struct dentry *dentry;
@@ -905,6 +863,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 objectid;
u64 root_flags;
+ rsv = trans->block_rsv;
+
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
if (!new_root_item) {
pending->error = -ENOMEM;
@@ -918,11 +878,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
- btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
if (to_reserve > 0) {
- ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
- to_reserve);
+ ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
+ to_reserve);
if (ret) {
pending->error = ret;
goto fail;
@@ -986,6 +945,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(old);
free_extent_buffer(old);
+ /* see comments in should_cow_block() */
+ root->force_cow = 1;
+ smp_wmb();
+
btrfs_set_root_node(new_root_item, tmp);
/* record when the snapshot was created in key.offset */
key.offset = trans->transid;
@@ -1009,9 +972,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
BUG_ON(IS_ERR(pending->snap));
btrfs_reloc_post_snapshot(trans, pending);
- btrfs_orphan_post_snapshot(trans, pending);
fail:
kfree(new_root_item);
+ trans->block_rsv = rsv;
btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
return 0;
}
@@ -1038,7 +1001,7 @@ static void update_super_roots(struct btrfs_root *root)
struct btrfs_root_item *root_item;
struct btrfs_super_block *super;
- super = &root->fs_info->super_copy;
+ super = root->fs_info->super_copy;
root_item = &root->fs_info->chunk_root->root_item;
super->chunk_root = root_item->bytenr;
@@ -1049,7 +1012,7 @@ static void update_super_roots(struct btrfs_root *root)
super->root = root_item->bytenr;
super->generation = root_item->generation;
super->root_level = root_item->level;
- if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
+ if (btrfs_test_opt(root, SPACE_CACHE))
super->cache_generation = root_item->generation;
}
@@ -1080,22 +1043,7 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
static void wait_current_trans_commit_start(struct btrfs_root *root,
struct btrfs_transaction *trans)
{
- DEFINE_WAIT(wait);
-
- if (trans->in_commit)
- return;
-
- while (1) {
- prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (trans->in_commit) {
- finish_wait(&root->fs_info->transaction_blocked_wait,
- &wait);
- break;
- }
- schedule();
- finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
- }
+ wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
}
/*
@@ -1105,24 +1053,8 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
struct btrfs_transaction *trans)
{
- DEFINE_WAIT(wait);
-
- if (trans->commit_done || (trans->in_commit && !trans->blocked))
- return;
-
- while (1) {
- prepare_to_wait(&root->fs_info->transaction_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (trans->commit_done ||
- (trans->in_commit && !trans->blocked)) {
- finish_wait(&root->fs_info->transaction_wait,
- &wait);
- break;
- }
- schedule();
- finish_wait(&root->fs_info->transaction_wait,
- &wait);
- }
+ wait_event(root->fs_info->transaction_wait,
+ trans->commit_done || (trans->in_commit && !trans->blocked));
}
/*
@@ -1205,14 +1137,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_run_ordered_operations(root, 0);
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
+
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
ret = btrfs_run_delayed_refs(trans, root, 0);
BUG_ON(ret);
- btrfs_trans_release_metadata(trans, root);
-
cur_trans = trans->transaction;
/*
* set the flushing flag so procs in this transaction have to
@@ -1229,8 +1162,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
atomic_inc(&cur_trans->use_count);
btrfs_end_transaction(trans, root);
- ret = wait_for_commit(root, cur_trans);
- BUG_ON(ret);
+ wait_for_commit(root, cur_trans);
put_transaction(cur_trans);
@@ -1379,12 +1311,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
update_super_roots(root);
if (!root->fs_info->log_root_recovering) {
- btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
- btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
+ btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+ btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
}
- memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
- sizeof(root->fs_info->super_copy));
+ memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
+ sizeof(*root->fs_info->super_copy));
trans->transaction->blocked = 0;
spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 88dec16..21faa12 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
struct walk_control *wc, u64 gen)
{
if (wc->pin)
- btrfs_pin_extent(log->fs_info->extent_root,
- eb->start, eb->len, 0);
+ btrfs_pin_extent_for_log_replay(wc->trans,
+ log->fs_info->extent_root,
+ eb->start, eb->len);
if (btrfs_buffer_uptodate(eb, gen)) {
if (wc->write)
@@ -1069,7 +1070,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
if (nlink != inode->i_nlink) {
- inode->i_nlink = nlink;
+ set_nlink(inode, nlink);
btrfs_update_inode(trans, root, inode);
}
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1679,7 +1680,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
return 0;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
nritems = btrfs_header_nritems(eb);
for (i = 0; i < nritems; i++) {
@@ -1785,21 +1787,23 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return -ENOMEM;
if (*level == 1) {
- wc->process_func(root, next, wc, ptr_gen);
+ ret = wc->process_func(root, next, wc, ptr_gen);
+ if (ret)
+ return ret;
path->slots[*level]++;
if (wc->free) {
btrfs_read_buffer(next, ptr_gen);
btrfs_tree_lock(next);
- clean_tree_block(trans, root, next);
btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, root, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
WARN_ON(root_owner !=
BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(root,
+ ret = btrfs_free_and_pin_reserved_extent(root,
bytenr, blocksize);
BUG_ON(ret);
}
@@ -1850,21 +1854,24 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
parent = path->nodes[*level + 1];
root_owner = btrfs_header_owner(parent);
- wc->process_func(root, path->nodes[*level], wc,
+ ret = wc->process_func(root, path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]));
+ if (ret)
+ return ret;
+
if (wc->free) {
struct extent_buffer *next;
next = path->nodes[*level];
btrfs_tree_lock(next);
- clean_tree_block(trans, root, next);
btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, root, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(root,
+ ret = btrfs_free_and_pin_reserved_extent(root,
path->nodes[*level]->start,
path->nodes[*level]->len);
BUG_ON(ret);
@@ -1926,14 +1933,14 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
next = path->nodes[orig_level];
btrfs_tree_lock(next);
- clean_tree_block(trans, log, next);
btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, log, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
WARN_ON(log->root_key.objectid !=
BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(log, next->start,
+ ret = btrfs_free_and_pin_reserved_extent(log, next->start,
next->len);
BUG_ON(ret);
}
@@ -2049,10 +2056,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
wait_log_commit(trans, root, root->log_transid - 1);
-
while (1) {
unsigned long batch = root->log_batch;
- if (root->log_multiple_pids) {
+ /* when we're on an ssd, just kick the log commit out */
+ if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
@@ -2153,9 +2160,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
BUG_ON(ret);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+ btrfs_set_super_log_root(root->fs_info->super_for_commit,
log_root_tree->node->start);
- btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+ btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
btrfs_header_level(log_root_tree->node));
log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7745ad5..9899205 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -142,6 +142,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
unsigned long limit;
unsigned long last_waited = 0;
int force_reg = 0;
+ int sync_pending = 0;
struct blk_plug plug;
/*
@@ -229,6 +230,22 @@ loop_lock:
BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+ /*
+ * if we're doing the sync list, record that our
+ * plug has some sync requests on it
+ *
+ * If we're doing the regular list and there are
+ * sync requests sitting around, unplug before
+ * we add more
+ */
+ if (pending_bios == &device->pending_sync_bios) {
+ sync_pending = 1;
+ } else if (sync_pending) {
+ blk_finish_plug(&plug);
+ blk_start_plug(&plug);
+ sync_pending = 0;
+ }
+
submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
@@ -278,6 +295,12 @@ loop_lock:
btrfs_requeue_work(&device->work);
goto done;
}
+ /* unplug every 64 requests just for good measure */
+ if (batch_run % 64 == 0) {
+ blk_finish_plug(&plug);
+ blk_start_plug(&plug);
+ sync_pending = 0;
+ }
}
cond_resched();
@@ -349,6 +372,14 @@ static noinline int device_list_add(const char *path,
}
INIT_LIST_HEAD(&device->dev_alloc_list);
+ /* init readahead state */
+ spin_lock_init(&device->reada_lock);
+ device->reada_curr_zone = NULL;
+ atomic_set(&device->reada_in_flight, 0);
+ device->reada_next = 0;
+ INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
mutex_lock(&fs_devices->device_list_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
mutex_unlock(&fs_devices->device_list_mutex);
@@ -587,10 +618,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
- if (!bh) {
- ret = -EINVAL;
+ if (!bh)
goto error_close;
- }
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -645,7 +674,7 @@ error:
continue;
}
if (fs_devices->open_devices == 0) {
- ret = -EIO;
+ ret = -EINVAL;
goto out;
}
fs_devices->seeding = seeding;
@@ -853,6 +882,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
max_hole_start = search_start;
max_hole_size = 0;
+ hole_size = 0;
if (search_start >= search_end) {
ret = -ENOSPC;
@@ -935,7 +965,14 @@ next:
cond_resched();
}
- hole_size = search_end- search_start;
+ /*
+ * At this point, search_start should be the end of
+ * allocated dev extents, and when shrinking the device,
+ * search_end may be smaller than search_start.
+ */
+ if (search_end > search_start)
+ hole_size = search_end - search_start;
+
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
@@ -975,7 +1012,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
-
+again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
@@ -988,6 +1025,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_dev_extent);
BUG_ON(found_key.offset > start || found_key.offset +
btrfs_dev_extent_length(leaf, extent) < start);
+ key = found_key;
+ btrfs_release_path(path);
+ goto again;
} else if (ret == 0) {
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -995,8 +1035,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
}
BUG_ON(ret);
- if (device->bytes_used > 0)
- device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+ if (device->bytes_used > 0) {
+ u64 len = btrfs_dev_extent_length(leaf, extent);
+ device->bytes_used -= len;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += len;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
ret = btrfs_del_item(trans, root, path);
out:
@@ -1055,7 +1100,8 @@ static noinline int find_next_chunk(struct btrfs_root *root,
struct btrfs_key found_key;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
key.objectid = objectid;
key.offset = (u64)-1;
@@ -1337,6 +1383,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (ret)
goto error_undo;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space = device->total_bytes -
+ device->bytes_used;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root, device);
@@ -1368,8 +1419,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
call_rcu(&device->rcu, free_device);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
- btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+ num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+ btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
@@ -1431,7 +1482,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
- struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
struct btrfs_device *device;
u64 super_flags;
@@ -1573,7 +1624,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
return -EINVAL;
- bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
@@ -1672,15 +1723,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
root->fs_info->fs_devices->num_can_discard++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += device->total_bytes;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;
- total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
- btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+ total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+ btrfs_set_super_total_bytes(root->fs_info->super_copy,
total_bytes + device->total_bytes);
- total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
- btrfs_set_super_num_devices(&root->fs_info->super_copy,
+ total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
+ btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
@@ -1771,7 +1826,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
- &device->dev_root->fs_info->super_copy;
+ device->dev_root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 diff = new_size - device->total_bytes;
@@ -1830,7 +1885,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
chunk_offset)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *ptr;
@@ -2085,8 +2140,10 @@ int btrfs_balance(struct btrfs_root *dev_root)
/* step two, relocate all the chunks */
path = btrfs_alloc_path();
- BUG_ON(!path);
-
+ if (!path) {
+ ret = -ENOMEM;
+ goto error;
+ }
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -2154,7 +2211,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = device->total_bytes;
u64 diff = device->total_bytes - new_size;
@@ -2171,8 +2228,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
lock_chunks(root);
device->total_bytes = new_size;
- if (device->writeable)
+ if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space -= diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
unlock_chunks(root);
again:
@@ -2236,6 +2297,9 @@ again:
device->total_bytes = old_size;
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
unlock_chunks(root);
goto done;
}
@@ -2271,7 +2335,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key disk_key;
u32 array_size;
u8 *ptr;
@@ -2434,9 +2498,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
total_avail = device->total_bytes - device->bytes_used;
else
total_avail = 0;
- /* avail is off by max(alloc_start, 1MB), but that is the same
- * for all devices, so it doesn't hurt the sorting later on
- */
+
+ /* If there is no space on this device, skip it. */
+ if (total_avail == 0)
+ continue;
ret = find_free_dev_extent(trans, device,
max_stripe_size * dev_stripes,
@@ -2593,6 +2658,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
index++;
}
+ spin_lock(&extent_root->fs_info->free_chunk_lock);
+ extent_root->fs_info->free_chunk_space -= (stripe_size *
+ map->num_stripes);
+ spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
index = 0;
stripe = &chunk->stripe;
while (index < map->num_stripes) {
@@ -2685,7 +2755,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
ret = find_next_chunk(fs_info->chunk_root,
BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
- BUG_ON(ret);
+ if (ret)
+ return ret;
alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
(fs_info->metadata_alloc_profile &
@@ -2825,7 +2896,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret,
+ struct btrfs_bio **bbio_ret,
int mirror_num)
{
struct extent_map *em;
@@ -2843,18 +2914,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
int i;
int num_stripes;
int max_errors = 0;
- struct btrfs_multi_bio *multi = NULL;
+ struct btrfs_bio *bbio = NULL;
- if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
+ if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
stripes_allocated = 1;
again:
- if (multi_ret) {
- multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
+ if (bbio_ret) {
+ bbio = kzalloc(btrfs_bio_size(stripes_allocated),
GFP_NOFS);
- if (!multi)
+ if (!bbio)
return -ENOMEM;
- atomic_set(&multi->error, 0);
+ atomic_set(&bbio->error, 0);
}
read_lock(&em_tree->lock);
@@ -2875,7 +2946,7 @@ again:
if (mirror_num > map->num_stripes)
mirror_num = 0;
- /* if our multi bio struct is too small, back off and try again */
+ /* if our btrfs_bio struct is too small, back off and try again */
if (rw & REQ_WRITE) {
if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_DUP)) {
@@ -2894,11 +2965,11 @@ again:
stripes_required = map->num_stripes;
}
}
- if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+ if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
stripes_allocated < stripes_required) {
stripes_allocated = map->num_stripes;
free_extent_map(em);
- kfree(multi);
+ kfree(bbio);
goto again;
}
stripe_nr = offset;
@@ -2927,7 +2998,7 @@ again:
*length = em->len - offset;
}
- if (!multi_ret)
+ if (!bbio_ret)
goto out;
num_stripes = 1;
@@ -2952,13 +3023,17 @@ again:
stripe_index = find_live_mirror(map, 0,
map->num_stripes,
current->pid % map->num_stripes);
+ mirror_num = stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (rw & (REQ_WRITE | REQ_DISCARD))
+ if (rw & (REQ_WRITE | REQ_DISCARD)) {
num_stripes = map->num_stripes;
- else if (mirror_num)
+ } else if (mirror_num) {
stripe_index = mirror_num - 1;
+ } else {
+ mirror_num = 1;
+ }
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
@@ -2978,6 +3053,7 @@ again:
stripe_index = find_live_mirror(map, stripe_index,
map->sub_stripes, stripe_index +
current->pid % map->sub_stripes);
+ mirror_num = stripe_index + 1;
}
} else {
/*
@@ -2986,15 +3062,16 @@ again:
* stripe_index is the number of our device in the stripe array
*/
stripe_index = do_div(stripe_nr, map->num_stripes);
+ mirror_num = stripe_index + 1;
}
BUG_ON(stripe_index >= map->num_stripes);
if (rw & REQ_DISCARD) {
for (i = 0; i < num_stripes; i++) {
- multi->stripes[i].physical =
+ bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- multi->stripes[i].dev = map->stripes[stripe_index].dev;
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
u64 stripes;
@@ -3015,16 +3092,16 @@ again:
}
stripes = stripe_nr_end - 1 - j;
do_div(stripes, map->num_stripes);
- multi->stripes[i].length = map->stripe_len *
+ bbio->stripes[i].length = map->stripe_len *
(stripes - stripe_nr + 1);
if (i == 0) {
- multi->stripes[i].length -=
+ bbio->stripes[i].length -=
stripe_offset;
stripe_offset = 0;
}
if (stripe_index == last_stripe)
- multi->stripes[i].length -=
+ bbio->stripes[i].length -=
stripe_end_offset;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
u64 stripes;
@@ -3049,11 +3126,11 @@ again:
}
stripes = stripe_nr_end - 1 - j;
do_div(stripes, factor);
- multi->stripes[i].length = map->stripe_len *
+ bbio->stripes[i].length = map->stripe_len *
(stripes - stripe_nr + 1);
if (i < map->sub_stripes) {
- multi->stripes[i].length -=
+ bbio->stripes[i].length -=
stripe_offset;
if (i == map->sub_stripes - 1)
stripe_offset = 0;
@@ -3061,11 +3138,11 @@ again:
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
map->sub_stripes - 1)) {
- multi->stripes[i].length -=
+ bbio->stripes[i].length -=
stripe_end_offset;
}
} else
- multi->stripes[i].length = *length;
+ bbio->stripes[i].length = *length;
stripe_index++;
if (stripe_index == map->num_stripes) {
@@ -3076,19 +3153,20 @@ again:
}
} else {
for (i = 0; i < num_stripes; i++) {
- multi->stripes[i].physical =
+ bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset +
stripe_nr * map->stripe_len;
- multi->stripes[i].dev =
+ bbio->stripes[i].dev =
map->stripes[stripe_index].dev;
stripe_index++;
}
}
- if (multi_ret) {
- *multi_ret = multi;
- multi->num_stripes = num_stripes;
- multi->max_errors = max_errors;
+ if (bbio_ret) {
+ *bbio_ret = bbio;
+ bbio->num_stripes = num_stripes;
+ bbio->max_errors = max_errors;
+ bbio->mirror_num = mirror_num;
}
out:
free_extent_map(em);
@@ -3097,9 +3175,9 @@ out:
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret, int mirror_num)
+ struct btrfs_bio **bbio_ret, int mirror_num)
{
- return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
+ return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
mirror_num);
}
@@ -3168,30 +3246,32 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
-static void end_bio_multi_stripe(struct bio *bio, int err)
+static void btrfs_end_bio(struct bio *bio, int err)
{
- struct btrfs_multi_bio *multi = bio->bi_private;
+ struct btrfs_bio *bbio = bio->bi_private;
int is_orig_bio = 0;
if (err)
- atomic_inc(&multi->error);
+ atomic_inc(&bbio->error);
- if (bio == multi->orig_bio)
+ if (bio == bbio->orig_bio)
is_orig_bio = 1;
- if (atomic_dec_and_test(&multi->stripes_pending)) {
+ if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
- bio = multi->orig_bio;
+ bio = bbio->orig_bio;
}
- bio->bi_private = multi->private;
- bio->bi_end_io = multi->end_io;
+ bio->bi_private = bbio->private;
+ bio->bi_end_io = bbio->end_io;
+ bio->bi_bdev = (struct block_device *)
+ (unsigned long)bbio->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the multi-bio
*/
- if (atomic_read(&multi->error) > multi->max_errors) {
+ if (atomic_read(&bbio->error) > bbio->max_errors) {
err = -EIO;
- } else if (err) {
+ } else {
/*
* this bio is actually up to date, we didn't
* go over the max number of errors
@@ -3199,7 +3279,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
set_bit(BIO_UPTODATE, &bio->bi_flags);
err = 0;
}
- kfree(multi);
+ kfree(bbio);
bio_endio(bio, err);
} else if (!is_orig_bio) {
@@ -3279,20 +3359,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
u64 logical = (u64)bio->bi_sector << 9;
u64 length = 0;
u64 map_length;
- struct btrfs_multi_bio *multi = NULL;
int ret;
int dev_nr = 0;
int total_devs = 1;
+ struct btrfs_bio *bbio = NULL;
length = bio->bi_size;
map_tree = &root->fs_info->mapping_tree;
map_length = length;
- ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+ ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
mirror_num);
BUG_ON(ret);
- total_devs = multi->num_stripes;
+ total_devs = bbio->num_stripes;
if (map_length < length) {
printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
"len %llu\n", (unsigned long long)logical,
@@ -3300,25 +3380,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
(unsigned long long)map_length);
BUG();
}
- multi->end_io = first_bio->bi_end_io;
- multi->private = first_bio->bi_private;
- multi->orig_bio = first_bio;
- atomic_set(&multi->stripes_pending, multi->num_stripes);
+
+ bbio->orig_bio = first_bio;
+ bbio->private = first_bio->bi_private;
+ bbio->end_io = first_bio->bi_end_io;
+ atomic_set(&bbio->stripes_pending, bbio->num_stripes);
while (dev_nr < total_devs) {
- if (total_devs > 1) {
- if (dev_nr < total_devs - 1) {
- bio = bio_clone(first_bio, GFP_NOFS);
- BUG_ON(!bio);
- } else {
- bio = first_bio;
- }
- bio->bi_private = multi;
- bio->bi_end_io = end_bio_multi_stripe;
+ if (dev_nr < total_devs - 1) {
+ bio = bio_clone(first_bio, GFP_NOFS);
+ BUG_ON(!bio);
+ } else {
+ bio = first_bio;
}
- bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
- dev = multi->stripes[dev_nr].dev;
+ bio->bi_private = bbio;
+ bio->bi_end_io = btrfs_end_bio;
+ bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
+ dev = bbio->stripes[dev_nr].dev;
if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
+ pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
+ "(%s id %llu), size=%u\n", rw,
+ (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+ dev->name, dev->devid, bio->bi_size);
bio->bi_bdev = dev->bdev;
if (async_submit)
schedule_bio(root, dev, rw, bio);
@@ -3331,8 +3414,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
}
dev_nr++;
}
- if (total_devs == 1)
- kfree(multi);
return 0;
}
@@ -3593,15 +3674,20 @@ static int read_one_dev(struct btrfs_root *root,
fill_device_from_item(leaf, dev_item, device);
device->dev_root = root->fs_info->dev_root;
device->in_fs_metadata = 1;
- if (device->writeable)
+ if (device->writeable) {
device->fs_devices->total_rw_bytes += device->total_bytes;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += device->total_bytes -
+ device->bytes_used;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
ret = 0;
return ret;
}
int btrfs_read_sys_array(struct btrfs_root *root)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
@@ -3619,7 +3705,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
if (!sb)
return -ENOMEM;
btrfs_set_buffer_uptodate(sb);
- btrfs_set_buffer_lockdep_class(sb, 0);
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db..78f2d4d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,20 @@ struct btrfs_device {
struct btrfs_work work;
struct rcu_head rcu;
struct work_struct rcu_work;
+
+ /* readahead state */
+ spinlock_t reada_lock;
+ atomic_t reada_in_flight;
+ u64 reada_next;
+ struct reada_zone *reada_curr_zone;
+ struct radix_tree_root reada_zones;
+ struct radix_tree_root reada_extents;
+
+ /* for sending down flush barriers */
+ struct bio *flush_bio;
+ struct completion flush_wait;
+ int nobarriers;
+
};
struct btrfs_fs_devices {
@@ -136,7 +150,10 @@ struct btrfs_bio_stripe {
u64 length; /* only used for discard mappings */
};
-struct btrfs_multi_bio {
+struct btrfs_bio;
+typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+
+struct btrfs_bio {
atomic_t stripes_pending;
bio_end_io_t *end_io;
struct bio *orig_bio;
@@ -144,6 +161,7 @@ struct btrfs_multi_bio {
atomic_t error;
int max_errors;
int num_stripes;
+ int mirror_num;
struct btrfs_bio_stripe stripes[];
};
@@ -171,7 +189,7 @@ struct map_lookup {
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
-#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
+#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 start, u64 num_bytes);
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret, int mirror_num);
+ struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 5366fe4..60ff45e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -102,48 +102,82 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- /* first lets see if we already have this xattr */
- di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
- strlen(name), -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
-
- /* ok we already have this xattr, lets remove it */
- if (di) {
- /* if we want create only exit */
- if (flags & XATTR_CREATE) {
- ret = -EEXIST;
+ if (flags & XATTR_REPLACE) {
+ di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
+ name_len, -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (!di) {
+ ret = -ENODATA;
goto out;
}
-
ret = btrfs_delete_one_dir_name(trans, root, path, di);
- BUG_ON(ret);
+ if (ret)
+ goto out;
btrfs_release_path(path);
- /* if we don't have a value then we are removing the xattr */
+ /*
+ * remove the attribute
+ */
if (!value)
goto out;
- } else {
+ }
+
+again:
+ ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+ name, name_len, value, size);
+ /*
+ * If we're setting an xattr to a new value but the new value is say
+ * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
+ * back from split_leaf. This is because it thinks we'll be extending
+ * the existing item size, but we're asking for enough space to add the
+ * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
+ * the rest of the function figure it out.
+ */
+ if (ret == -EOVERFLOW)
+ ret = -EEXIST;
+
+ if (ret == -EEXIST) {
+ if (flags & XATTR_CREATE)
+ goto out;
+ /*
+ * We can't use the path we already have since we won't have the
+ * proper locking for a delete, so release the path and
+ * re-lookup to delete the thing.
+ */
btrfs_release_path(path);
+ di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+ name, name_len, -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (!di) {
+ /* Shouldn't happen but just in case... */
+ btrfs_release_path(path);
+ goto again;
+ }
- if (flags & XATTR_REPLACE) {
- /* we couldn't find the attr to replace */
- ret = -ENODATA;
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
goto out;
+
+ /*
+ * We have a value to set, so go back and try to insert it now.
+ */
+ if (value) {
+ btrfs_release_path(path);
+ goto again;
}
}
-
- /* ok we have to create a completely new xattr */
- ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
- name, name_len, value, size);
- BUG_ON(ret);
out:
btrfs_free_path(path);
return ret;
}
+/*
+ * @value: "" makes the attribute to empty, NULL removes it
+ */
int __btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags)
@@ -276,21 +310,40 @@ const struct xattr_handler *btrfs_xattr_handlers[] = {
/*
* Check if the attribute is in a supported namespace.
*
- * This applied after the check for the synthetic attributes in the system
+ * This is applied after the check for the synthetic attributes in the system
* namespace.
*/
-static bool btrfs_is_valid_xattr(const char *name)
+static int btrfs_is_valid_xattr(const char *name)
{
- return !strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN) ||
- !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
- !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
- !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+ int len = strlen(name);
+ int prefixlen = 0;
+
+ if (!strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN))
+ prefixlen = XATTR_SECURITY_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ prefixlen = XATTR_SYSTEM_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+ prefixlen = XATTR_TRUSTED_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ prefixlen = XATTR_USER_PREFIX_LEN;
+ else
+ return -EOPNOTSUPP;
+
+ /*
+ * The name cannot consist of just prefix
+ */
+ if (len <= prefixlen)
+ return -EINVAL;
+
+ return 0;
}
ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
+ int ret;
+
/*
* If this is a request for a synthetic attribute in the system.*
* namespace use the generic infrastructure to resolve a handler
@@ -299,8 +352,9 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_getxattr(dentry, name, buffer, size);
- if (!btrfs_is_valid_xattr(name))
- return -EOPNOTSUPP;
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
}
@@ -308,6 +362,7 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+ int ret;
/*
* The permission on security.* and system.* is not checked
@@ -324,8 +379,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_setxattr(dentry, name, value, size, flags);
- if (!btrfs_is_valid_xattr(name))
- return -EOPNOTSUPP;
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
if (size == 0)
value = ""; /* empty EA, do not remove */
@@ -337,6 +393,7 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
int btrfs_removexattr(struct dentry *dentry, const char *name)
{
struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+ int ret;
/*
* The permission on security.* and system.* is not checked
@@ -353,43 +410,44 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_removexattr(dentry, name);
- if (!btrfs_is_valid_xattr(name))
- return -EOPNOTSUPP;
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
XATTR_REPLACE);
}
-int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *suffix;
+ const struct xattr *xattr;
+ struct btrfs_trans_handle *trans = fs_info;
char *name;
+ int err = 0;
- err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
- &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
- }
-
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
- GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- } else {
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ break;
+ }
strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
- err = __btrfs_setxattr(trans, inode, name, value, len, 0);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ err = __btrfs_setxattr(trans, inode, name,
+ xattr->value, xattr->value_len, 0);
kfree(name);
+ if (err < 0)
+ break;
}
-
- kfree(suffix);
- kfree(value);
return err;
}
+
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &btrfs_initxattrs, trans);
+}
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index cdb41a1..8daea16 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -287,7 +287,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -397,11 +396,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
- //int i;
+ int i;
- //for (i = 0; i < node->tree->pages_per_bnode; i++)
- // if (node->page[i])
- // page_cache_release(node->page[i]);
+ for (i = 0; i < node->tree->pages_per_bnode; i++)
+ if (node->page[i])
+ page_cache_release(node->page[i]);
kfree(node);
}
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 92fb358..db240c5 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -132,13 +132,16 @@ skip:
hfs_bnode_write(node, entry, data_off + key_len, entry_len);
hfs_bnode_dump(node);
- if (new_node) {
- /* update parent key if we inserted a key
- * at the start of the first node
- */
- if (!rec && new_node != node)
- hfs_brec_update_parent(fd);
+ /*
+ * update parent key if we inserted a key
+ * at the start of the node and it is not the new node
+ */
+ if (!rec && new_node != node) {
+ hfs_bnode_read_key(node, fd->search_key, data_off + size);
+ hfs_brec_update_parent(fd);
+ }
+ if (new_node) {
hfs_bnode_put(fd->bnode);
if (!new_node->parent) {
hfs_btree_inc_height(tree);
@@ -167,9 +170,6 @@ skip:
goto again;
}
- if (!rec)
- hfs_brec_update_parent(fd);
-
return 0;
}
@@ -366,6 +366,8 @@ again:
if (IS_ERR(parent))
return PTR_ERR(parent);
__hfs_brec_find(parent, fd);
+ if (fd->record < 0)
+ return -ENOENT;
hfs_bnode_dump(parent);
rec = fd->record;
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b1..bce4eef 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -198,7 +198,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode,
res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
hfs_delete_inode(inode);
iput(inode);
return res;
@@ -227,7 +227,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
hfs_delete_inode(inode);
iput(inode);
return res;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index fff16c9..a1a9fdc 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -123,8 +123,8 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, hfs_get_block, NULL);
+ ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ hfs_get_block);
/*
* In case of error extending write may have instantiated a few
@@ -183,7 +183,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
@@ -313,7 +313,7 @@ static int hfs_read_inode(struct inode *inode, void *data)
/* Initialize the inode */
inode->i_uid = hsb->s_uid;
inode->i_gid = hsb->s_gid;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
if (idata->key)
HFS_I(inode)->cat_key = *idata->key;
@@ -615,6 +615,8 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
+ inode_dio_wait(inode);
+
error = vmtruncate(inode, attr->ia_size);
if (error)
return error;
@@ -625,12 +627,18 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
return 0;
}
-static int hfs_file_fsync(struct file *filp, int datasync)
+static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
+ int datasync)
{
struct inode *inode = filp->f_mapping->host;
struct super_block * sb;
int ret, err;
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+ mutex_lock(&inode->i_mutex);
+
/* sync the inode to buffers */
ret = write_inode_now(inode, 0);
@@ -647,6 +655,7 @@ static int hfs_file_fsync(struct file *filp, int datasync)
err = sync_blockdev(sb->s_bdev);
if (!ret)
ret = err;
+ mutex_unlock(&inode->i_mutex);
return ret;
}
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 1b55f70..cac813d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -138,9 +138,9 @@ static int hfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
struct hfs_sb_info *sbi = HFS_SB(mnt->mnt_sb);
if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
- seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
+ seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
- seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
+ seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
seq_printf(seq, ",uid=%u,gid=%u", sbi->s_uid, sbi->s_gid);
if (sbi->s_file_umask != 0133)
seq_printf(seq, ",file_umask=%o", sbi->s_file_umask);
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 1c42cc5..a1e9109 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -566,13 +565,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
-#if 0
int i;
for (i = 0; i < node->tree->pages_per_bnode; i++)
if (node->page[i])
page_cache_release(node->page[i]);
-#endif
kfree(node);
}
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 2312de3..7429c40 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -43,6 +43,10 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
node->tree->node_size - (rec + 1) * 2);
if (!recoff)
return 0;
+ if (recoff > node->tree->node_size - 2) {
+ printk(KERN_ERR "hfs: recoff %d too large\n", recoff);
+ return 0;
+ }
retval = hfs_bnode_read_u16(node, recoff) + 2;
if (retval > node->tree->max_key_len + 2) {
@@ -126,13 +130,16 @@ skip:
hfs_bnode_write(node, entry, data_off + key_len, entry_len);
hfs_bnode_dump(node);
- if (new_node) {
- /* update parent key if we inserted a key
- * at the start of the first node
- */
- if (!rec && new_node != node)
- hfs_brec_update_parent(fd);
+ /*
+ * update parent key if we inserted a key
+ * at the start of the node and it is not the new node
+ */
+ if (!rec && new_node != node) {
+ hfs_bnode_read_key(node, fd->search_key, data_off + size);
+ hfs_brec_update_parent(fd);
+ }
+ if (new_node) {
hfs_bnode_put(fd->bnode);
if (!new_node->parent) {
hfs_btree_inc_height(tree);
@@ -162,9 +169,6 @@ skip:
goto again;
}
- if (!rec)
- hfs_brec_update_parent(fd);
-
return 0;
}
@@ -364,6 +368,8 @@ again:
if (IS_ERR(parent))
return PTR_ERR(parent);
__hfs_brec_find(parent, fd);
+ if (fd->record < 0)
+ return -ENOENT;
hfs_bnode_dump(parent);
rec = fd->record;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 408073a..ec2a9c2 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -212,7 +212,9 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n",
str->name, cnid, inode->i_nlink);
- hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+ err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+ if (err)
+ return err;
hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
entry_size = hfsplus_fill_cat_thread(sb, &entry,
@@ -269,7 +271,9 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n",
str ? str->name : NULL, cnid);
- hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+ err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+ if (err)
+ return err;
if (!str) {
int len;
@@ -347,12 +351,14 @@ int hfsplus_rename_cat(u32 cnid,
struct hfs_find_data src_fd, dst_fd;
hfsplus_cat_entry entry;
int entry_size, type;
- int err = 0;
+ int err;
dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
cnid, src_dir->i_ino, src_name->name,
dst_dir->i_ino, dst_name->name);
- hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
+ err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
+ if (err)
+ return err;
dst_fd = src_fd;
/* find the old dir entry and read the data */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 159f5eb..5adb740 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -38,7 +38,9 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
sb = dir->i_sb;
dentry->d_fsdata = NULL;
- hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+ err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+ if (err)
+ return ERR_PTR(err);
hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
again:
err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -132,7 +134,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (filp->f_pos >= inode->i_size)
return 0;
- hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+ err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+ if (err)
+ return err;
hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
err = hfs_brec_find(&fd);
if (err)
@@ -422,7 +426,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
goto out;
out_err:
- inode->i_nlink = 0;
+ clear_nlink(inode);
hfsplus_delete_inode(inode);
iput(inode);
out:
@@ -447,7 +451,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
hfsplus_delete_inode(inode);
iput(inode);
goto out;
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 9d8c087..32b12e5 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -119,22 +119,31 @@ static void __hfsplus_ext_write_extent(struct inode *inode,
set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags);
}
-static void hfsplus_ext_write_extent_locked(struct inode *inode)
+static int hfsplus_ext_write_extent_locked(struct inode *inode)
{
+ int res;
+
if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) {
struct hfs_find_data fd;
- hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
+ res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
+ if (res)
+ return res;
__hfsplus_ext_write_extent(inode, &fd);
hfs_find_exit(&fd);
}
+ return 0;
}
-void hfsplus_ext_write_extent(struct inode *inode)
+int hfsplus_ext_write_extent(struct inode *inode)
{
+ int res;
+
mutex_lock(&HFSPLUS_I(inode)->extents_lock);
- hfsplus_ext_write_extent_locked(inode);
+ res = hfsplus_ext_write_extent_locked(inode);
mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
+
+ return res;
}
static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
@@ -194,9 +203,11 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
block < hip->cached_start + hip->cached_blocks)
return 0;
- hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
- res = __hfsplus_ext_cache_extent(&fd, inode, block);
- hfs_find_exit(&fd);
+ res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
+ if (!res) {
+ res = __hfsplus_ext_cache_extent(&fd, inode, block);
+ hfs_find_exit(&fd);
+ }
return res;
}
@@ -209,6 +220,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
int res = -EIO;
u32 ablock, dblock, mask;
+ sector_t sector;
int was_dirty = 0;
int shift;
@@ -255,10 +267,12 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
done:
dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n",
inode->i_ino, (long long)iblock, dblock);
+
mask = (1 << sbi->fs_shift) - 1;
- map_bh(bh_result, sb,
- (dblock << sbi->fs_shift) + sbi->blockoffset +
- (iblock & mask));
+ sector = ((sector_t)dblock << sbi->fs_shift) +
+ sbi->blockoffset + (iblock & mask);
+ map_bh(bh_result, sb, sector);
+
if (create) {
set_buffer_new(bh_result);
hip->phys_size += sb->s_blocksize;
@@ -371,7 +385,9 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid,
if (total_blocks == blocks)
return 0;
- hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
+ res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
+ if (res)
+ return res;
do {
res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
total_blocks, type);
@@ -469,7 +485,9 @@ out:
insert_extent:
dprint(DBG_EXTENT, "insert new extent\n");
- hfsplus_ext_write_extent_locked(inode);
+ res = hfsplus_ext_write_extent_locked(inode);
+ if (res)
+ goto out;
memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
hip->cached_extents[0].start_block = cpu_to_be32(start);
@@ -500,7 +518,6 @@ void hfsplus_file_truncate(struct inode *inode)
struct page *page;
void *fsdata;
loff_t size = inode->i_size;
- int res;
res = pagecache_write_begin(NULL, mapping, size, 0,
AOP_FLAG_UNINTERRUPTIBLE,
@@ -523,7 +540,12 @@ void hfsplus_file_truncate(struct inode *inode)
goto out;
mutex_lock(&hip->extents_lock);
- hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
+ res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
+ if (res) {
+ mutex_unlock(&hip->extents_lock);
+ /* XXX: We lack error handling of hfsplus_file_truncate() */
+ return;
+ }
while (1) {
if (alloc_cnt == hip->first_blocks) {
hfsplus_free_extents(sb, hip->first_extents,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 4e7f64b..d7674d0 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -386,7 +386,7 @@ extern const struct file_operations hfsplus_dir_operations;
/* extents.c */
int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
-void hfsplus_ext_write_extent(struct inode *);
+int hfsplus_ext_write_extent(struct inode *);
int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int);
int hfsplus_free_fork(struct super_block *, u32,
struct hfsplus_fork_raw *, int);
@@ -404,7 +404,8 @@ int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *);
int hfsplus_cat_write_inode(struct inode *);
struct inode *hfsplus_new_inode(struct super_block *, int);
void hfsplus_delete_inode(struct inode *);
-int hfsplus_file_fsync(struct file *file, int datasync);
+int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync);
/* ioctl.c */
long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index b248a6cf..40e1413 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -119,8 +119,8 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, hfsplus_get_block, NULL);
+ ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ hfsplus_get_block);
/*
* In case of error extending write may have instantiated a few
@@ -195,11 +195,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir,
hip->flags = 0;
set_bit(HFSPLUS_I_RSRC, &hip->flags);
- hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
- err = hfsplus_find_cat(sb, dir->i_ino, &fd);
- if (!err)
- err = hfsplus_cat_read_inode(inode, &fd);
- hfs_find_exit(&fd);
+ err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
+ if (!err) {
+ err = hfsplus_find_cat(sb, dir->i_ino, &fd);
+ if (!err)
+ err = hfsplus_cat_read_inode(inode, &fd);
+ hfs_find_exit(&fd);
+ }
if (err) {
iput(inode);
return ERR_PTR(err);
@@ -296,6 +298,8 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
+ inode_dio_wait(inode);
+
error = vmtruncate(inode, attr->ia_size);
if (error)
return error;
@@ -306,13 +310,19 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
return 0;
}
-int hfsplus_file_fsync(struct file *file, int datasync)
+int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
{
struct inode *inode = file->f_mapping->host;
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
int error = 0, error2;
+ error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (error)
+ return error;
+ mutex_lock(&inode->i_mutex);
+
/*
* Sync inode metadata into the catalog and extent trees.
*/
@@ -340,6 +350,8 @@ int hfsplus_file_fsync(struct file *file, int datasync)
if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ mutex_unlock(&inode->i_mutex);
+
return error;
}
@@ -379,7 +391,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
hip = HFSPLUS_I(inode);
@@ -500,7 +512,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_folder));
hfsplus_get_perms(inode, &folder->permissions, 1);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_size = 2 + be32_to_cpu(folder->valence);
inode->i_atime = hfsp_mt2ut(folder->access_date);
inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
@@ -520,11 +532,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
&file->rsrc_fork : &file->data_fork);
hfsplus_get_perms(inode, &file->permissions, 0);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
if (S_ISREG(inode->i_mode)) {
if (file->permissions.dev)
- inode->i_nlink =
- be32_to_cpu(file->permissions.dev);
+ set_nlink(inode,
+ be32_to_cpu(file->permissions.dev));
inode->i_op = &hfsplus_file_inode_operations;
inode->i_fop = &hfsplus_file_operations;
inode->i_mapping->a_ops = &hfsplus_aops;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bb62a5882..c8d6b4f 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -211,9 +211,9 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
- seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
+ seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4);
if (sbi->type != HFSPLUS_DEF_CR_TYPE)
- seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
+ seq_show_option_n(seq, "type", (char *)&sbi->type, 4);
seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
sbi->uid, sbi->gid);
if (sbi->part >= 0)
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index c3a76fd..d24a9b6 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -73,11 +73,13 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
inode->i_ino == HFSPLUS_ROOT_CNID) {
- hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
- err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
- if (!err)
- err = hfsplus_cat_read_inode(inode, &fd);
- hfs_find_exit(&fd);
+ err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+ if (!err) {
+ err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+ if (!err)
+ err = hfsplus_cat_read_inode(inode, &fd);
+ hfs_find_exit(&fd);
+ }
} else {
err = hfsplus_system_read_inode(inode);
}
@@ -133,9 +135,13 @@ static int hfsplus_system_write_inode(struct inode *inode)
static int hfsplus_write_inode(struct inode *inode,
struct writeback_control *wbc)
{
+ int err;
+
dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
- hfsplus_ext_write_extent(inode);
+ err = hfsplus_ext_write_extent(inode);
+ if (err)
+ return err;
if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
inode->i_ino == HFSPLUS_ROOT_CNID)
@@ -338,6 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
struct inode *root, *inode;
struct qstr str;
struct nls_table *nls = NULL;
+ u64 last_fs_block, last_fs_page;
int err;
err = -EINVAL;
@@ -393,6 +400,17 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi->rsrc_clump_blocks)
sbi->rsrc_clump_blocks = 1;
+ err = -EFBIG;
+ last_fs_block = sbi->total_blocks - 1;
+ last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >>
+ PAGE_CACHE_SHIFT;
+
+ if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
+ (last_fs_page > (pgoff_t)(~0ULL))) {
+ printk(KERN_ERR "hfs: filesystem size too large.\n");
+ goto out_free_vhdr;
+ }
+
/* Set up operations so we can load metadata */
sb->s_op = &hfsplus_sops;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -417,6 +435,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags |= MS_RDONLY;
}
+ err = -EINVAL;
+
/* Load metadata objects (B*Trees) */
sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
if (!sbi->ext_tree) {
@@ -447,7 +467,9 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
str.name = HFSP_HIDDENDIR_NAME;
- hfs_find_init(sbi->cat_tree, &fd);
+ err = hfs_find_init(sbi->cat_tree, &fd);
+ if (err)
+ goto out_put_root;
hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
hfs_find_exit(&fd);
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index a3f0bfc..a32998f 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -142,7 +142,11 @@ int hfsplus_uni2asc(struct super_block *sb,
/* search for single decomposed char */
if (likely(compose))
ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0);
- if (ce1 && (cc = ce1[0])) {
+ if (ce1)
+ cc = ce1[0];
+ else
+ cc = 0;
+ if (cc) {
/* start of a possibly decomposed Hangul char */
if (cc != 0xffff)
goto done;
@@ -209,7 +213,8 @@ int hfsplus_uni2asc(struct super_block *sb,
i++;
ce2 = ce1;
}
- if ((cc = ce2[0])) {
+ cc = ce2[0];
+ if (cc) {
ip += i;
ustrlen -= i;
goto done;
@@ -301,7 +306,11 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
size = asc2unichar(sb, astr, len, &c);
- if (decompose && (dstr = decompose_unichar(c, &dsize))) {
+ if (decompose)
+ dstr = decompose_unichar(c, &dsize);
+ else
+ dstr = NULL;
+ if (dstr) {
if (outlen + dsize > HFSPLUS_MAX_STRLEN)
break;
do {
@@ -346,15 +355,23 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
astr += size;
len -= size;
- if (decompose && (dstr = decompose_unichar(c, &dsize))) {
+ if (decompose)
+ dstr = decompose_unichar(c, &dsize);
+ else
+ dstr = NULL;
+ if (dstr) {
do {
c2 = *dstr++;
- if (!casefold || (c2 = case_fold(c2)))
+ if (casefold)
+ c2 = case_fold(c2);
+ if (!casefold || c2)
hash = partial_name_hash(c2, hash);
} while (--dsize > 0);
} else {
c2 = c;
- if (!casefold || (c2 = case_fold(c2)))
+ if (casefold)
+ c2 = case_fold(c2);
+ if (!casefold || c2)
hash = partial_name_hash(c2, hash);
}
}
@@ -422,12 +439,14 @@ int hfsplus_compare_dentry(const struct dentry *parent,
c1 = *dstr1;
c2 = *dstr2;
if (casefold) {
- if (!(c1 = case_fold(c1))) {
+ c1 = case_fold(c1);
+ if (!c1) {
dstr1++;
dsize1--;
continue;
}
- if (!(c2 = case_fold(c2))) {
+ c2 = case_fold(c2);
+ if (!c2) {
dstr2++;
dsize2--;
continue;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index aac1563..90effcc 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -184,10 +184,6 @@ int hfsplus_read_wrapper(struct super_block *sb)
if (hfsplus_get_last_session(sb, &part_start, &part_size))
goto out;
- if ((u64)part_start + part_size > 0x100000000ULL) {
- pr_err("hfs: volumes larger than 2TB are not supported yet\n");
- goto out;
- }
error = -ENOMEM;
sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL);
@@ -215,8 +211,9 @@ reread:
if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
goto out_free_backup_vhdr;
wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
- part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
- part_size = wd.embed_count * wd.ablk_size;
+ part_start += (sector_t)wd.ablk_start +
+ (sector_t)wd.embed_start * wd.ablk_size;
+ part_size = (sector_t)wd.embed_count * wd.ablk_size;
goto reread;
default:
/*
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2638c834e..104e4d9 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -265,7 +265,7 @@ static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
size_t offset = strlen(root_ino) + 1;
if (strlen(root_path) > offset)
- seq_printf(seq, ",%s", root_path + offset);
+ seq_show_option(seq, root_path + offset, NULL);
return 0;
}
@@ -362,9 +362,20 @@ retry:
return 0;
}
-int hostfs_fsync(struct file *file, int datasync)
+int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+ ret = fsync_file(HOSTFS_I(inode)->fd, datasync);
+ mutex_unlock(&inode->i_mutex);
+
+ return ret;
}
static const struct file_operations hostfs_file_fops = {
@@ -530,7 +541,7 @@ static int read_name(struct inode *ino, char *name)
ino->i_ino = st.ino;
ino->i_mode = st.mode;
- ino->i_nlink = st.nlink;
+ set_nlink(ino, st.nlink);
ino->i_uid = st.uid;
ino->i_gid = st.gid;
ino->i_atime = st.atime;
@@ -748,12 +759,12 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
return err;
}
-int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
+int hostfs_permission(struct inode *ino, int desired)
{
char *name;
int r = 0, w = 0, x = 0, err;
- if (flags & IPERM_FLAG_RCU)
+ if (desired & MAY_NOT_BLOCK)
return -ECHILD;
if (desired & MAY_READ) r = 1;
@@ -770,7 +781,7 @@ int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
err = access_file(name, r, w, x);
__putname(name);
if (!err)
- err = generic_permission(ino, desired, flags, NULL);
+ err = generic_permission(ino, desired);
return err;
}
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index d51a983..dd7bc38 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -16,7 +16,6 @@
#include <sys/vfs.h>
#include "hostfs.h"
#include "os.h"
-#include "user.h"
#include <utime.h>
static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c
index 7a5eb2c..e0d57c0 100644
--- a/fs/hpfs/alloc.c
+++ b/fs/hpfs/alloc.c
@@ -8,6 +8,58 @@
#include "hpfs_fn.h"
+static void hpfs_claim_alloc(struct super_block *s, secno sec)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ if (sbi->sb_n_free != (unsigned)-1) {
+ if (unlikely(!sbi->sb_n_free)) {
+ hpfs_error(s, "free count underflow, allocating sector %08x", sec);
+ sbi->sb_n_free = -1;
+ return;
+ }
+ sbi->sb_n_free--;
+ }
+}
+
+static void hpfs_claim_free(struct super_block *s, secno sec)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ if (sbi->sb_n_free != (unsigned)-1) {
+ if (unlikely(sbi->sb_n_free >= sbi->sb_fs_size)) {
+ hpfs_error(s, "free count overflow, freeing sector %08x", sec);
+ sbi->sb_n_free = -1;
+ return;
+ }
+ sbi->sb_n_free++;
+ }
+}
+
+static void hpfs_claim_dirband_alloc(struct super_block *s, secno sec)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ if (sbi->sb_n_free_dnodes != (unsigned)-1) {
+ if (unlikely(!sbi->sb_n_free_dnodes)) {
+ hpfs_error(s, "dirband free count underflow, allocating sector %08x", sec);
+ sbi->sb_n_free_dnodes = -1;
+ return;
+ }
+ sbi->sb_n_free_dnodes--;
+ }
+}
+
+static void hpfs_claim_dirband_free(struct super_block *s, secno sec)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ if (sbi->sb_n_free_dnodes != (unsigned)-1) {
+ if (unlikely(sbi->sb_n_free_dnodes >= sbi->sb_dirband_size / 4)) {
+ hpfs_error(s, "dirband free count overflow, freeing sector %08x", sec);
+ sbi->sb_n_free_dnodes = -1;
+ return;
+ }
+ sbi->sb_n_free_dnodes++;
+ }
+}
+
/*
* Check if a sector is allocated in bitmap
* This is really slow. Turned on only if chk==2
@@ -203,9 +255,15 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa
}
sec = 0;
ret:
+ if (sec) {
+ i = 0;
+ do
+ hpfs_claim_alloc(s, sec + i);
+ while (unlikely(++i < n));
+ }
if (sec && f_p) {
for (i = 0; i < forward; i++) {
- if (!hpfs_alloc_if_possible(s, sec + i + 1)) {
+ if (!hpfs_alloc_if_possible(s, sec + n + i)) {
hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i);
sec = 0;
break;
@@ -228,6 +286,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near)
nr >>= 2;
sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0);
if (!sec) return 0;
+ hpfs_claim_dirband_alloc(s, sec);
return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start;
}
@@ -242,6 +301,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec)
bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f)));
hpfs_mark_4buffers_dirty(&qbh);
hpfs_brelse4(&qbh);
+ hpfs_claim_alloc(s, sec);
return 1;
}
hpfs_brelse4(&qbh);
@@ -275,6 +335,7 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n)
return;
}
bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f));
+ hpfs_claim_free(s, sec);
if (!--n) {
hpfs_mark_4buffers_dirty(&qbh);
hpfs_brelse4(&qbh);
@@ -359,6 +420,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno)
bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f));
hpfs_mark_4buffers_dirty(&qbh);
hpfs_brelse4(&qbh);
+ hpfs_claim_dirband_free(s, dno);
}
}
@@ -366,7 +428,7 @@ struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near,
dnode_secno *dno, struct quad_buffer_head *qbh)
{
struct dnode *d;
- if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) {
+ if (hpfs_get_free_dnodes(s) > FREE_DNODES_ADD) {
if (!(*dno = alloc_in_dirband(s, near)))
if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL;
} else {
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index f46ae02..46549c7 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -29,25 +29,31 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
struct super_block *s = i->i_sb;
+ /* Somebody else will have to figure out what to do here */
+ if (whence == SEEK_DATA || whence == SEEK_HOLE)
+ return -EINVAL;
+
+ mutex_lock(&i->i_mutex);
hpfs_lock(s);
/*printk("dir lseek\n");*/
if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
- mutex_lock(&i->i_mutex);
pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1;
while (pos != new_off) {
if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh);
else goto fail;
if (pos == 12) goto fail;
}
- mutex_unlock(&i->i_mutex);
+ hpfs_add_pos(i, &filp->f_pos);
ok:
+ filp->f_pos = new_off;
hpfs_unlock(s);
- return filp->f_pos = new_off;
-fail:
mutex_unlock(&i->i_mutex);
+ return new_off;
+fail:
/*printk("illegal lseek: %016llx\n", new_off);*/
hpfs_unlock(s);
+ mutex_unlock(&i->i_mutex);
return -ESPIPE;
}
@@ -243,7 +249,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
result->i_mode &= ~0111;
result->i_op = &hpfs_file_iops;
result->i_fop = &hpfs_file_ops;
- result->i_nlink = 1;
+ set_nlink(result, 1);
}
unlock_new_inode(result);
}
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 89c500e..5ecfffe 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -18,9 +18,14 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
return 0;
}
-int hpfs_file_fsync(struct file *file, int datasync)
+int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
+ int ret;
+
+ ret = filemap_write_and_wait_range(file->f_mapping, start, end);
+ if (ret)
+ return ret;
return sync_blockdev(inode->i_sb->s_bdev);
}
@@ -111,9 +116,12 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
hpfs_get_block,
&hpfs_i(mapping->host)->mmu_private);
if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
+ loff_t isize;
+ hpfs_lock(mapping->host->i_sb);
+ isize = mapping->host->i_size;
if (pos + len > isize)
vmtruncate(mapping->host, isize);
+ hpfs_unlock(mapping->host->i_sb);
}
return ret;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index dd552f8..f1f2ca7 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -258,7 +258,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
/* file.c */
-int hpfs_file_fsync(struct file *, int);
+int hpfs_file_fsync(struct file *, loff_t, loff_t, int);
extern const struct file_operations hpfs_file_ops;
extern const struct inode_operations hpfs_file_iops;
extern const struct address_space_operations hpfs_aops;
@@ -311,10 +311,10 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
/* super.c */
-void hpfs_error(struct super_block *, const char *, ...)
- __attribute__((format (printf, 2, 3)));
+__printf(2, 3)
+void hpfs_error(struct super_block *, const char *, ...);
int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
-unsigned hpfs_count_one_bitmap(struct super_block *, secno);
+unsigned hpfs_get_free_dnodes(struct super_block *);
/*
* local time (HPFS) to GMT (Unix)
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 338cd83..3b2cec2 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -53,7 +53,7 @@ void hpfs_read_inode(struct inode *i)
i->i_mode &= ~0111;
i->i_op = &hpfs_file_iops;
i->i_fop = &hpfs_file_ops;
- i->i_nlink = 0;*/
+ clear_nlink(i);*/
make_bad_inode(i);
return;
}
@@ -77,7 +77,7 @@ void hpfs_read_inode(struct inode *i)
i->i_mode = S_IFLNK | 0777;
i->i_op = &page_symlink_inode_operations;
i->i_data.a_ops = &hpfs_symlink_aops;
- i->i_nlink = 1;
+ set_nlink(i, 1);
i->i_size = ea_size;
i->i_blocks = 1;
brelse(bh);
@@ -101,7 +101,7 @@ void hpfs_read_inode(struct inode *i)
}
if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) {
brelse(bh);
- i->i_nlink = 1;
+ set_nlink(i, 1);
i->i_size = 0;
i->i_blocks = 1;
init_special_inode(i, mode,
@@ -125,13 +125,13 @@ void hpfs_read_inode(struct inode *i)
hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL);
i->i_blocks = 4 * n_dnodes;
i->i_size = 2048 * n_dnodes;
- i->i_nlink = 2 + n_subdirs;
+ set_nlink(i, 2 + n_subdirs);
} else {
i->i_mode |= S_IFREG;
if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111;
i->i_op = &hpfs_file_iops;
i->i_fop = &hpfs_file_ops;
- i->i_nlink = 1;
+ set_nlink(i, 1);
i->i_size = le32_to_cpu(fnode->file_size);
i->i_blocks = ((i->i_size + 511) >> 9) + 1;
i->i_data.a_ops = &hpfs_aops;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index acf95da..ea91fcb 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -56,7 +56,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
result->i_fop = &hpfs_dir_ops;
result->i_blocks = 4;
result->i_size = 2048;
- result->i_nlink = 2;
+ set_nlink(result, 2);
if (dee.read_only)
result->i_mode &= ~0222;
@@ -150,7 +150,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
result->i_mode &= ~0111;
result->i_op = &hpfs_file_iops;
result->i_fop = &hpfs_file_ops;
- result->i_nlink = 1;
+ set_nlink(result, 1);
hpfs_i(result)->i_parent_dir = dir->i_ino;
result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
result->i_ctime.tv_nsec = 0;
@@ -242,7 +242,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
hpfs_i(result)->i_ea_size = 0;
result->i_uid = current_fsuid();
result->i_gid = current_fsgid();
- result->i_nlink = 1;
+ set_nlink(result, 1);
result->i_size = 0;
result->i_blocks = 1;
init_special_inode(result, mode, rdev);
@@ -318,7 +318,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
result->i_uid = current_fsuid();
result->i_gid = current_fsgid();
result->i_blocks = 1;
- result->i_nlink = 1;
+ set_nlink(result, 1);
result->i_size = strlen(symlink);
result->i_op = &page_symlink_inode_operations;
result->i_data.a_ops = &hpfs_symlink_aops;
@@ -398,7 +398,7 @@ again:
hpfs_unlock(dir->i_sb);
return -ENOSPC;
}
- if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
+ if (generic_permission(inode, MAY_WRITE) ||
!S_ISREG(inode->i_mode) ||
get_write_access(inode)) {
d_rehash(dentry);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f760c15..b03e766 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -115,7 +115,7 @@ static void hpfs_put_super(struct super_block *s)
kfree(sbi);
}
-unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
+static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
{
struct quad_buffer_head qbh;
unsigned long *bits;
@@ -123,7 +123,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
bits = hpfs_map_4sectors(s, secno, &qbh, 4);
if (!bits)
- return 0;
+ return (unsigned)-1;
count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
hpfs_brelse4(&qbh);
return count;
@@ -134,29 +134,45 @@ static unsigned count_bitmaps(struct super_block *s)
unsigned n, count, n_bands;
n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
count = 0;
- for (n = 0; n < n_bands; n++)
- count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
+ for (n = 0; n < n_bands; n++) {
+ unsigned c;
+ c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
+ if (c != (unsigned)-1)
+ count += c;
+ }
return count;
}
+unsigned hpfs_get_free_dnodes(struct super_block *s)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ if (sbi->sb_n_free_dnodes == (unsigned)-1) {
+ unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap);
+ if (c == (unsigned)-1)
+ return 0;
+ sbi->sb_n_free_dnodes = c;
+ }
+ return sbi->sb_n_free_dnodes;
+}
+
static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *s = dentry->d_sb;
struct hpfs_sb_info *sbi = hpfs_sb(s);
u64 id = huge_encode_dev(s->s_bdev->bd_dev);
+
hpfs_lock(s);
- /*if (sbi->sb_n_free == -1) {*/
+ if (sbi->sb_n_free == (unsigned)-1)
sbi->sb_n_free = count_bitmaps(s);
- sbi->sb_n_free_dnodes = hpfs_count_one_bitmap(s, sbi->sb_dmap);
- /*}*/
+
buf->f_type = s->s_magic;
buf->f_bsize = 512;
buf->f_blocks = sbi->sb_fs_size;
buf->f_bfree = sbi->sb_n_free;
buf->f_bavail = sbi->sb_n_free;
buf->f_files = sbi->sb_dirband_size / 4;
- buf->f_ffree = sbi->sb_n_free_dnodes;
+ buf->f_ffree = hpfs_get_free_dnodes(s);
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = 254;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 9d71c95..f590b11 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -574,9 +574,10 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
return err;
}
-static int hppfs_fsync(struct file *file, int datasync)
+static int hppfs_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
{
- return 0;
+ return filemap_write_and_wait_range(file->f_mapping, start, end);
}
static const struct file_operations hppfs_dir_fops = {
@@ -701,7 +702,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
inode->i_ctime = proc_ino->i_ctime;
inode->i_ino = proc_ino->i_ino;
inode->i_mode = proc_ino->i_mode;
- inode->i_nlink = proc_ino->i_nlink;
+ set_nlink(inode, proc_ino->i_nlink);
inode->i_size = proc_ino->i_size;
inode->i_blocks = proc_ino->i_blocks;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6327a06..0aa424a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -94,7 +94,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
vma->vm_ops = &hugetlb_vm_ops;
- if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT))
+ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
return -EINVAL;
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
@@ -484,6 +484,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
inode->i_op = &page_symlink_inode_operations;
break;
}
+ lockdep_annotate_inode_mutex_key(inode);
}
return inode;
}
@@ -592,9 +593,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
spin_lock(&sbinfo->stat_lock);
/* If no limits set, just report 0 for max/free/used
* blocks, like simple_statfs() */
- if (sbinfo->max_blocks >= 0) {
- buf->f_blocks = sbinfo->max_blocks;
- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ if (sbinfo->spool) {
+ long free_pages;
+
+ spin_lock(&sbinfo->spool->lock);
+ buf->f_blocks = sbinfo->spool->max_hpages;
+ free_pages = sbinfo->spool->max_hpages
+ - sbinfo->spool->used_hpages;
+ buf->f_bavail = buf->f_bfree = free_pages;
+ spin_unlock(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
@@ -610,6 +617,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
if (sbi) {
sb->s_fs_info = NULL;
+
+ if (sbi->spool)
+ hugepage_put_subpool(sbi->spool);
+
kfree(sbi);
}
}
@@ -841,10 +852,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = sbinfo;
sbinfo->hstate = config.hstate;
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_blocks = config.nr_blocks;
- sbinfo->free_blocks = config.nr_blocks;
sbinfo->max_inodes = config.nr_inodes;
sbinfo->free_inodes = config.nr_inodes;
+ sbinfo->spool = NULL;
+ if (config.nr_blocks != -1) {
+ sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+ if (!sbinfo->spool)
+ goto out_free;
+ }
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = huge_page_size(config.hstate);
sb->s_blocksize_bits = huge_page_shift(config.hstate);
@@ -864,38 +879,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_root = root;
return 0;
out_free:
+ if (sbinfo->spool)
+ kfree(sbinfo->spool);
kfree(sbinfo);
return -ENOMEM;
}
-int hugetlb_get_quota(struct address_space *mapping, long delta)
-{
- int ret = 0;
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
- if (sbinfo->free_blocks > -1) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks - delta >= 0)
- sbinfo->free_blocks -= delta;
- else
- ret = -ENOMEM;
- spin_unlock(&sbinfo->stat_lock);
- }
-
- return ret;
-}
-
-void hugetlb_put_quota(struct address_space *mapping, long delta)
-{
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
- if (sbinfo->free_blocks > -1) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_blocks += delta;
- spin_unlock(&sbinfo->stat_lock);
- }
-}
-
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
@@ -963,7 +952,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
d_instantiate(path.dentry, inode);
inode->i_size = size;
- inode->i_nlink = 0;
+ clear_nlink(inode);
error = -ENFILE;
file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
@@ -1024,6 +1013,7 @@ static int __init init_hugetlbfs_fs(void)
static void __exit exit_hugetlbfs_fs(void)
{
kmem_cache_destroy(hugetlbfs_inode_cachep);
+ kern_unmount(hugetlbfs_vfsmount);
unregister_filesystem(&hugetlbfs_fs_type);
bdi_destroy(&hugetlbfs_backing_dev_info);
}
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 0542b6e..f20437c 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -254,19 +254,16 @@ static int isofs_readdir(struct file *filp,
char *tmpname;
struct iso_directory_record *tmpde;
struct inode *inode = filp->f_path.dentry->d_inode;
- struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
tmpname = (char *)__get_free_page(GFP_KERNEL);
if (tmpname == NULL)
return -ENOMEM;
- mutex_lock(&sbi->s_mutex);
tmpde = (struct iso_directory_record *) (tmpname+1024);
result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
free_page((unsigned long) tmpname);
- mutex_unlock(&sbi->s_mutex);
return result;
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 26f6364..2f9197f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -20,6 +20,7 @@
#include <linux/statfs.h>
#include <linux/cdrom.h>
#include <linux/parser.h>
+#include <linux/mpage.h>
#include "isofs.h"
#include "zisofs.h"
@@ -67,7 +68,7 @@ static void isofs_put_super(struct super_block *sb)
return;
}
-static int isofs_read_inode(struct inode *);
+static int isofs_read_inode(struct inode *, int relocated);
static int isofs_statfs (struct dentry *, struct kstatfs *);
static struct kmem_cache *isofs_inode_cachep;
@@ -854,7 +855,6 @@ root_found:
sbi->s_utf8 = opt.utf8;
sbi->s_nocompress = opt.nocompress;
sbi->s_overriderockperm = opt.overriderockperm;
- mutex_init(&sbi->s_mutex);
/*
* It would be incredibly stupid to allow people to mark every file
* on the disk as suid, so we merely allow them to set the default
@@ -1140,7 +1140,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block)
static int isofs_readpage(struct file *file, struct page *page)
{
- return block_read_full_page(page,isofs_get_block);
+ return mpage_readpage(page, isofs_get_block);
+}
+
+static int isofs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, isofs_get_block);
}
static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
@@ -1150,6 +1156,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
static const struct address_space_operations isofs_aops = {
.readpage = isofs_readpage,
+ .readpages = isofs_readpages,
.bmap = _isofs_bmap
};
@@ -1256,7 +1263,7 @@ out_toomany:
goto out;
}
-static int isofs_read_inode(struct inode *inode)
+static int isofs_read_inode(struct inode *inode, int relocated)
{
struct super_block *sb = inode->i_sb;
struct isofs_sb_info *sbi = ISOFS_SB(sb);
@@ -1311,7 +1318,7 @@ static int isofs_read_inode(struct inode *inode)
inode->i_mode = S_IFDIR | sbi->s_dmode;
else
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
- inode->i_nlink = 1; /*
+ set_nlink(inode, 1); /*
* Set to 1. We know there are 2, but
* the find utility tries to optimize
* if it is 2, and it screws up. It is
@@ -1329,7 +1336,7 @@ static int isofs_read_inode(struct inode *inode)
*/
inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
}
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
inode->i_uid = sbi->s_uid;
inode->i_gid = sbi->s_gid;
@@ -1401,7 +1408,7 @@ static int isofs_read_inode(struct inode *inode)
*/
if (!high_sierra) {
- parse_rock_ridge_inode(de, inode);
+ parse_rock_ridge_inode(de, inode, relocated);
/* if we want uid/gid set, override the rock ridge setting */
if (sbi->s_uid_set)
inode->i_uid = sbi->s_uid;
@@ -1480,9 +1487,10 @@ static int isofs_iget5_set(struct inode *ino, void *data)
* offset that point to the underlying meta-data for the inode. The
* code below is otherwise similar to the iget() code in
* include/linux/fs.h */
-struct inode *isofs_iget(struct super_block *sb,
- unsigned long block,
- unsigned long offset)
+struct inode *__isofs_iget(struct super_block *sb,
+ unsigned long block,
+ unsigned long offset,
+ int relocated)
{
unsigned long hashval;
struct inode *inode;
@@ -1504,7 +1512,7 @@ struct inode *isofs_iget(struct super_block *sb,
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
- ret = isofs_read_inode(inode);
+ ret = isofs_read_inode(inode, relocated);
if (ret < 0) {
iget_failed(inode);
inode = ERR_PTR(ret);
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 2882dc0..f9c9793 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -55,7 +55,6 @@ struct isofs_sb_info {
gid_t s_gid;
uid_t s_uid;
struct nls_table *s_nls_iocharset; /* Native language support table */
- struct mutex s_mutex; /* replaces BKL, please remove if possible */
};
#define ISOFS_INVALID_MODE ((mode_t) -1)
@@ -108,7 +107,7 @@ extern int iso_date(char *, int);
struct inode; /* To make gcc happy */
-extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *);
+extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated);
extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *);
extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *);
@@ -119,9 +118,24 @@ extern struct dentry *isofs_lookup(struct inode *, struct dentry *, struct namei
extern struct buffer_head *isofs_bread(struct inode *, sector_t);
extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long);
-extern struct inode *isofs_iget(struct super_block *sb,
- unsigned long block,
- unsigned long offset);
+struct inode *__isofs_iget(struct super_block *sb,
+ unsigned long block,
+ unsigned long offset,
+ int relocated);
+
+static inline struct inode *isofs_iget(struct super_block *sb,
+ unsigned long block,
+ unsigned long offset)
+{
+ return __isofs_iget(sb, block, offset, 0);
+}
+
+static inline struct inode *isofs_iget_reloc(struct super_block *sb,
+ unsigned long block,
+ unsigned long offset)
+{
+ return __isofs_iget(sb, block, offset, 1);
+}
/* Because the inode number is no longer relevant to finding the
* underlying meta-data for an inode, we are free to choose a more
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 4fb3e80..1e2946f 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -168,7 +168,6 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
int found;
unsigned long uninitialized_var(block);
unsigned long uninitialized_var(offset);
- struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
struct inode *inode;
struct page *page;
@@ -176,21 +175,13 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
if (!page)
return ERR_PTR(-ENOMEM);
- mutex_lock(&sbi->s_mutex);
found = isofs_find_entry(dir, dentry,
&block, &offset,
page_address(page),
1024 + page_address(page));
__free_page(page);
- inode = NULL;
- if (found) {
- inode = isofs_iget(dir->i_sb, block, offset);
- if (IS_ERR(inode)) {
- mutex_unlock(&sbi->s_mutex);
- return ERR_CAST(inode);
- }
- }
- mutex_unlock(&sbi->s_mutex);
+ inode = found ? isofs_iget(dir->i_sb, block, offset) : NULL;
+
return d_splice_alias(inode, dentry);
}
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index f9cd04d..1780949 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -30,6 +30,7 @@ struct rock_state {
int cont_size;
int cont_extent;
int cont_offset;
+ int cont_loops;
struct inode *inode;
};
@@ -73,6 +74,9 @@ static void init_rock_state(struct rock_state *rs, struct inode *inode)
rs->inode = inode;
}
+/* Maximum number of Rock Ridge continuation entries */
+#define RR_MAX_CE_ENTRIES 32
+
/*
* Returns 0 if the caller should continue scanning, 1 if the scan must end
* and -ve on error.
@@ -105,6 +109,8 @@ static int rock_continue(struct rock_state *rs)
goto out;
}
ret = -EIO;
+ if (++rs->cont_loops >= RR_MAX_CE_ENTRIES)
+ goto out;
bh = sb_bread(rs->inode->i_sb, rs->cont_extent);
if (bh) {
memcpy(rs->buffer, bh->b_data + rs->cont_offset,
@@ -288,12 +294,16 @@ eio:
goto out;
}
+#define RR_REGARD_XA 1
+#define RR_RELOC_DE 2
+
static int
parse_rock_ridge_inode_internal(struct iso_directory_record *de,
- struct inode *inode, int regard_xa)
+ struct inode *inode, int flags)
{
int symlink_len = 0;
int cnt, sig;
+ unsigned int reloc_block;
struct inode *reloc;
struct rock_ridge *rr;
int rootflag;
@@ -305,7 +315,7 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de,
init_rock_state(&rs, inode);
setup_rock_ridge(de, inode, &rs);
- if (regard_xa) {
+ if (flags & RR_REGARD_XA) {
rs.chr += 14;
rs.len -= 14;
if (rs.len < 0)
@@ -352,6 +362,9 @@ repeat:
rs.cont_size = isonum_733(rr->u.CE.size);
break;
case SIG('E', 'R'):
+ /* Invalid length of ER tag id? */
+ if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len)
+ goto out;
ISOFS_SB(inode->i_sb)->s_rock = 1;
printk(KERN_DEBUG "ISO 9660 Extensions: ");
{
@@ -363,7 +376,7 @@ repeat:
break;
case SIG('P', 'X'):
inode->i_mode = isonum_733(rr->u.PX.mode);
- inode->i_nlink = isonum_733(rr->u.PX.n_links);
+ set_nlink(inode, isonum_733(rr->u.PX.n_links));
inode->i_uid = isonum_733(rr->u.PX.uid);
inode->i_gid = isonum_733(rr->u.PX.gid);
break;
@@ -485,18 +498,28 @@ repeat:
"relocated directory\n");
goto out;
case SIG('C', 'L'):
- ISOFS_I(inode)->i_first_extent =
- isonum_733(rr->u.CL.location);
- reloc =
- isofs_iget(inode->i_sb,
- ISOFS_I(inode)->i_first_extent,
- 0);
+ if (flags & RR_RELOC_DE) {
+ printk(KERN_ERR
+ "ISOFS: Recursive directory relocation "
+ "is not supported\n");
+ goto eio;
+ }
+ reloc_block = isonum_733(rr->u.CL.location);
+ if (reloc_block == ISOFS_I(inode)->i_iget5_block &&
+ ISOFS_I(inode)->i_iget5_offset == 0) {
+ printk(KERN_ERR
+ "ISOFS: Directory relocation points to "
+ "itself\n");
+ goto eio;
+ }
+ ISOFS_I(inode)->i_first_extent = reloc_block;
+ reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0);
if (IS_ERR(reloc)) {
ret = PTR_ERR(reloc);
goto out;
}
inode->i_mode = reloc->i_mode;
- inode->i_nlink = reloc->i_nlink;
+ set_nlink(inode, reloc->i_nlink);
inode->i_uid = reloc->i_uid;
inode->i_gid = reloc->i_gid;
inode->i_rdev = reloc->i_rdev;
@@ -637,9 +660,11 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit)
return rpnt;
}
-int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode)
+int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
+ int relocated)
{
- int result = parse_rock_ridge_inode_internal(de, inode, 0);
+ int flags = relocated ? RR_RELOC_DE : 0;
+ int result = parse_rock_ridge_inode_internal(de, inode, flags);
/*
* if rockridge flag was reset and we didn't look for attributes
@@ -647,7 +672,8 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode)
*/
if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1)
&& (ISOFS_SB(inode->i_sb)->s_rock == 2)) {
- result = parse_rock_ridge_inode_internal(de, inode, 14);
+ result = parse_rock_ridge_inode_internal(de, inode,
+ flags | RR_REGARD_XA);
}
return result;
}
@@ -678,7 +704,6 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
init_rock_state(&rs, inode);
block = ei->i_iget5_block;
- mutex_lock(&sbi->s_mutex);
bh = sb_bread(inode->i_sb, block);
if (!bh)
goto out_noread;
@@ -748,7 +773,6 @@ repeat:
goto fail;
brelse(bh);
*rpnt = '\0';
- mutex_unlock(&sbi->s_mutex);
SetPageUptodate(page);
kunmap(page);
unlock_page(page);
@@ -765,7 +789,6 @@ out_bad_span:
printk("symlink spans iso9660 blocks\n");
fail:
brelse(bh);
- mutex_unlock(&sbi->s_mutex);
error:
SetPageError(page);
kunmap(page);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index e4b87bc..5c93ffc 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -22,6 +22,8 @@
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <trace/events/jbd.h>
/*
* Unlink a buffer from a transaction checkpoint list.
@@ -95,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
!buffer_dirty(bh) && !buffer_write_io_error(bh)) {
+ /*
+ * Get our reference so that bh cannot be freed before
+ * we unlock it
+ */
+ get_bh(bh);
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __journal_remove_checkpoint(jh) + 1;
jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
BUFFER_TRACE(bh, "release");
__brelse(bh);
} else {
@@ -220,8 +226,8 @@ restart:
spin_lock(&journal->j_list_lock);
goto restart;
}
+ get_bh(bh);
if (buffer_locked(bh)) {
- get_bh(bh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
@@ -240,7 +246,6 @@ restart:
*/
released = __journal_remove_checkpoint(jh);
jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
__brelse(bh);
}
@@ -253,9 +258,12 @@ static void
__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
{
int i;
+ struct blk_plug plug;
+ blk_start_plug(&plug);
for (i = 0; i < *batch_count; i++)
- write_dirty_buffer(bhs[i], WRITE);
+ write_dirty_buffer(bhs[i], WRITE_SYNC);
+ blk_finish_plug(&plug);
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = bhs[i];
@@ -304,12 +312,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
ret = 1;
if (unlikely(buffer_write_io_error(bh)))
ret = -EIO;
+ get_bh(bh);
J_ASSERT_JH(jh, !buffer_jbddirty(bh));
BUFFER_TRACE(bh, "remove from checkpoint");
__journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
__brelse(bh);
} else {
/*
@@ -358,6 +366,7 @@ int log_do_checkpoint(journal_t *journal)
* journal straight away.
*/
result = cleanup_journal_tail(journal);
+ trace_jbd_checkpoint(journal, result);
jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
@@ -444,8 +453,6 @@ out:
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
*
- * Called with the journal lock held.
- *
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
@@ -463,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal)
if (is_journal_aborted(journal))
return 1;
- /* OK, work out the oldest transaction remaining in the log, and
+ /*
+ * OK, work out the oldest transaction remaining in the log, and
* the log block it starts at.
*
* If the log is now empty, we need to work out which is the
* next transaction ID we will write, and where it will
- * start. */
-
+ * start.
+ */
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
transaction = journal->j_checkpoint_transactions;
@@ -495,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal)
spin_unlock(&journal->j_state_lock);
return 1;
}
+ spin_unlock(&journal->j_state_lock);
+ /*
+ * We need to make sure that any blocks that were recently written out
+ * --- perhaps by log_do_checkpoint() --- are flushed out before we
+ * drop the transactions from the journal. It's unlikely this will be
+ * necessary, especially with an appropriately sized journal, but we
+ * need this to guarantee correctness. Fortunately
+ * cleanup_journal_tail() doesn't get called all that often.
+ */
+ if (journal->j_flags & JFS_BARRIER)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+
+ spin_lock(&journal->j_state_lock);
+ if (!tid_gt(first_tid, journal->j_tail_sequence)) {
+ spin_unlock(&journal->j_state_lock);
+ /* Someone else cleaned up journal so return 0 */
+ return 0;
+ }
/* OK, update the superblock to recover the freed space.
* Physical blocks come first: have we wrapped beyond the end of
* the log? */
@@ -503,6 +529,7 @@ int cleanup_journal_tail(journal_t *journal)
if (blocknr < journal->j_tail)
freed = freed + journal->j_last - journal->j_first;
+ trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed);
jbd_debug(1,
"Cleaning journal tail from %d to %d (offset %u), "
"freeing %u\n",
@@ -523,9 +550,9 @@ int cleanup_journal_tail(journal_t *journal)
/*
* journal_clean_one_cp_list
*
- * Find all the written-back checkpoint buffers in the given list and release them.
+ * Find all the written-back checkpoint buffers in the given list and release
+ * them.
*
- * Called with the journal locked.
* Called with j_list_lock held.
* Returns number of bufers reaped (for debug)
*/
@@ -632,8 +659,8 @@ out:
* checkpoint lists.
*
* The function returns 1 if it frees the transaction, 0 otherwise.
+ * The function can free jh and bh.
*
- * This function is called with the journal locked.
* This function is called with j_list_lock held.
* This function is called with jbd_lock_bh_state(jh2bh(jh))
*/
@@ -652,13 +679,14 @@ int __journal_remove_checkpoint(struct journal_head *jh)
}
journal = transaction->t_journal;
+ JBUFFER_TRACE(jh, "removing from transaction");
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
+ journal_put_journal_head(jh);
if (transaction->t_checkpoint_list != NULL ||
transaction->t_checkpoint_io_list != NULL)
goto out;
- JBUFFER_TRACE(jh, "transaction has no more buffers");
/*
* There is one special case to worry about: if we have just pulled the
@@ -669,10 +697,8 @@ int __journal_remove_checkpoint(struct journal_head *jh)
* The locking here around t_state is a bit sleazy.
* See the comment at the end of journal_commit_transaction().
*/
- if (transaction->t_state != T_FINISHED) {
- JBUFFER_TRACE(jh, "belongs to running/committing transaction");
+ if (transaction->t_state != T_FINISHED)
goto out;
- }
/* OK, that was the last buffer for the transaction: we can now
safely remove this transaction from the log */
@@ -684,7 +710,6 @@ int __journal_remove_checkpoint(struct journal_head *jh)
wake_up(&journal->j_wait_logspace);
ret = 1;
out:
- JBUFFER_TRACE(jh, "exit");
return ret;
}
@@ -703,6 +728,8 @@ void __journal_insert_checkpoint(struct journal_head *jh,
J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+ /* Get reference for checkpointing transaction */
+ journal_grab_journal_head(jh2bh(jh));
jh->b_cp_transaction = transaction;
if (!transaction->t_checkpoint_list) {
@@ -752,6 +779,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
+ trace_jbd_drop_transaction(journal, transaction);
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
kfree(transaction);
}
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index dcd23f8..931bf95 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -21,6 +21,7 @@
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <trace/events/jbd.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -209,6 +210,8 @@ write_out_data:
if (!trylock_buffer(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
spin_unlock(&journal->j_list_lock);
+ trace_jbd_do_submit_data(journal,
+ commit_transaction);
/* Write out all data to prevent deadlocks */
journal_do_submit_data(wbuf, bufs, write_op);
bufs = 0;
@@ -241,6 +244,8 @@ write_out_data:
jbd_unlock_bh_state(bh);
if (bufs == journal->j_wbufsize) {
spin_unlock(&journal->j_list_lock);
+ trace_jbd_do_submit_data(journal,
+ commit_transaction);
journal_do_submit_data(wbuf, bufs, write_op);
bufs = 0;
goto write_out_data;
@@ -258,10 +263,6 @@ write_out_data:
jbd_unlock_bh_state(bh);
if (locked)
unlock_buffer(bh);
- journal_remove_journal_head(bh);
- /* One for our safety reference, other for
- * journal_remove_journal_head() */
- put_bh(bh);
release_data_buffer(bh);
}
@@ -271,6 +272,7 @@ write_out_data:
}
}
spin_unlock(&journal->j_list_lock);
+ trace_jbd_do_submit_data(journal, commit_transaction);
journal_do_submit_data(wbuf, bufs, write_op);
return err;
@@ -321,12 +323,14 @@ void journal_commit_transaction(journal_t *journal)
commit_transaction = journal->j_running_transaction;
J_ASSERT(commit_transaction->t_state == T_RUNNING);
+ trace_jbd_start_commit(journal, commit_transaction);
jbd_debug(1, "JBD: starting commit of transaction %d\n",
commit_transaction->t_tid);
spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_LOCKED;
+ trace_jbd_commit_locking(journal, commit_transaction);
spin_lock(&commit_transaction->t_handle_lock);
while (commit_transaction->t_updates) {
DEFINE_WAIT(wait);
@@ -397,6 +401,7 @@ void journal_commit_transaction(journal_t *journal)
*/
journal_switch_revoke_table(journal);
+ trace_jbd_commit_flushing(journal, commit_transaction);
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
journal->j_running_transaction = NULL;
@@ -451,14 +456,9 @@ void journal_commit_transaction(journal_t *journal)
}
if (buffer_jbd(bh) && bh2jh(bh) == jh &&
jh->b_transaction == commit_transaction &&
- jh->b_jlist == BJ_Locked) {
+ jh->b_jlist == BJ_Locked)
__journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
+ jbd_unlock_bh_state(bh);
release_data_buffer(bh);
cond_resched_lock(&journal->j_list_lock);
}
@@ -498,6 +498,7 @@ void journal_commit_transaction(journal_t *journal)
commit_transaction->t_state = T_COMMIT;
spin_unlock(&journal->j_state_lock);
+ trace_jbd_commit_logging(journal, commit_transaction);
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
@@ -802,10 +803,16 @@ restart_loop:
while (commit_transaction->t_forget) {
transaction_t *cp_transaction;
struct buffer_head *bh;
+ int try_to_free = 0;
jh = commit_transaction->t_forget;
spin_unlock(&journal->j_list_lock);
bh = jh2bh(jh);
+ /*
+ * Get a reference so that bh cannot be freed before we are
+ * done with it.
+ */
+ get_bh(bh);
jbd_lock_bh_state(bh);
J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
jh->b_transaction == journal->j_running_transaction);
@@ -881,28 +888,27 @@ restart_loop:
__journal_insert_checkpoint(jh, commit_transaction);
if (is_journal_aborted(journal))
clear_buffer_jbddirty(bh);
- JBUFFER_TRACE(jh, "refile for checkpoint writeback");
- __journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
} else {
J_ASSERT_BH(bh, !buffer_dirty(bh));
- /* The buffer on BJ_Forget list and not jbddirty means
+ /*
+ * The buffer on BJ_Forget list and not jbddirty means
* it has been freed by this transaction and hence it
* could not have been reallocated until this
* transaction has committed. *BUT* it could be
* reallocated once we have written all the data to
* disk and before we process the buffer on BJ_Forget
- * list. */
- JBUFFER_TRACE(jh, "refile or unfile freed buffer");
- __journal_refile_buffer(jh);
- if (!jh->b_transaction) {
- jbd_unlock_bh_state(bh);
- /* needs a brelse */
- journal_remove_journal_head(bh);
- release_buffer_page(bh);
- } else
- jbd_unlock_bh_state(bh);
+ * list.
+ */
+ if (!jh->b_next_transaction)
+ try_to_free = 1;
}
+ JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+ __journal_refile_buffer(jh);
+ jbd_unlock_bh_state(bh);
+ if (try_to_free)
+ release_buffer_page(bh);
+ else
+ __brelse(bh);
cond_resched_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
@@ -969,6 +975,7 @@ restart_loop:
}
spin_unlock(&journal->j_list_lock);
+ trace_jbd_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9f36384..fea8dd6 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -38,6 +38,9 @@
#include <linux/debugfs.h>
#include <linux/ratelimit.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/jbd.h>
+
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -1065,6 +1068,7 @@ void journal_update_superblock(journal_t *journal, int wait)
} else
write_dirty_buffer(bh, WRITE);
+ trace_jbd_update_superblock_end(journal, wait);
out:
/* If we have just flushed the log (by marking s_start==0), then
* any future commit will have to be careful to update the
@@ -1807,10 +1811,9 @@ static void journal_free_journal_head(struct journal_head *jh)
* When a buffer has its BH_JBD bit set it is immune from being released by
* core kernel code, mainly via ->b_count.
*
- * A journal_head may be detached from its buffer_head when the journal_head's
- * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
- * Various places in JBD call journal_remove_journal_head() to indicate that the
- * journal_head can be dropped if needed.
+ * A journal_head is detached from its buffer_head when the journal_head's
+ * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
+ * transaction (b_cp_transaction) hold their references to b_jcount.
*
* Various places in the kernel want to attach a journal_head to a buffer_head
* _before_ attaching the journal_head to a transaction. To protect the
@@ -1823,17 +1826,16 @@ static void journal_free_journal_head(struct journal_head *jh)
* (Attach a journal_head if needed. Increments b_jcount)
* struct journal_head *jh = journal_add_journal_head(bh);
* ...
- * jh->b_transaction = xxx;
- * journal_put_journal_head(jh);
- *
- * Now, the journal_head's b_jcount is zero, but it is safe from being released
- * because it has a non-zero b_transaction.
+ * (Get another reference for transaction)
+ * journal_grab_journal_head(bh);
+ * jh->b_transaction = xxx;
+ * (Put original reference)
+ * journal_put_journal_head(jh);
*/
/*
* Give a buffer_head a journal_head.
*
- * Doesn't need the journal lock.
* May sleep.
*/
struct journal_head *journal_add_journal_head(struct buffer_head *bh)
@@ -1897,61 +1899,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
struct journal_head *jh = bh2jh(bh);
J_ASSERT_JH(jh, jh->b_jcount >= 0);
-
- get_bh(bh);
- if (jh->b_jcount == 0) {
- if (jh->b_transaction == NULL &&
- jh->b_next_transaction == NULL &&
- jh->b_cp_transaction == NULL) {
- J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
- J_ASSERT_BH(bh, buffer_jbd(bh));
- J_ASSERT_BH(bh, jh2bh(jh) == bh);
- BUFFER_TRACE(bh, "remove journal_head");
- if (jh->b_frozen_data) {
- printk(KERN_WARNING "%s: freeing "
- "b_frozen_data\n",
- __func__);
- jbd_free(jh->b_frozen_data, bh->b_size);
- }
- if (jh->b_committed_data) {
- printk(KERN_WARNING "%s: freeing "
- "b_committed_data\n",
- __func__);
- jbd_free(jh->b_committed_data, bh->b_size);
- }
- bh->b_private = NULL;
- jh->b_bh = NULL; /* debug, really */
- clear_buffer_jbd(bh);
- __brelse(bh);
- journal_free_journal_head(jh);
- } else {
- BUFFER_TRACE(bh, "journal_head was locked");
- }
+ J_ASSERT_JH(jh, jh->b_transaction == NULL);
+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+ J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
+ J_ASSERT_BH(bh, buffer_jbd(bh));
+ J_ASSERT_BH(bh, jh2bh(jh) == bh);
+ BUFFER_TRACE(bh, "remove journal_head");
+ if (jh->b_frozen_data) {
+ printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
+ jbd_free(jh->b_frozen_data, bh->b_size);
}
+ if (jh->b_committed_data) {
+ printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
+ jbd_free(jh->b_committed_data, bh->b_size);
+ }
+ bh->b_private = NULL;
+ jh->b_bh = NULL; /* debug, really */
+ clear_buffer_jbd(bh);
+ journal_free_journal_head(jh);
}
/*
- * journal_remove_journal_head(): if the buffer isn't attached to a transaction
- * and has a zero b_jcount then remove and release its journal_head. If we did
- * see that the buffer is not used by any transaction we also "logically"
- * decrement ->b_count.
- *
- * We in fact take an additional increment on ->b_count as a convenience,
- * because the caller usually wants to do additional things with the bh
- * after calling here.
- * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
- * time. Once the caller has run __brelse(), the buffer is eligible for
- * reaping by try_to_free_buffers().
- */
-void journal_remove_journal_head(struct buffer_head *bh)
-{
- jbd_lock_bh_journal_head(bh);
- __journal_remove_journal_head(bh);
- jbd_unlock_bh_journal_head(bh);
-}
-
-/*
- * Drop a reference on the passed journal_head. If it fell to zero then try to
+ * Drop a reference on the passed journal_head. If it fell to zero then
* release the journal_head from the buffer_head.
*/
void journal_put_journal_head(struct journal_head *jh)
@@ -1961,11 +1931,12 @@ void journal_put_journal_head(struct journal_head *jh)
jbd_lock_bh_journal_head(bh);
J_ASSERT_JH(jh, jh->b_jcount > 0);
--jh->b_jcount;
- if (!jh->b_jcount && !jh->b_transaction) {
+ if (!jh->b_jcount) {
__journal_remove_journal_head(bh);
+ jbd_unlock_bh_journal_head(bh);
__brelse(bh);
- }
- jbd_unlock_bh_journal_head(bh);
+ } else
+ jbd_unlock_bh_journal_head(bh);
}
/*
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 5b43e96..008bf06 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,6 +20,7 @@
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
+#include <linux/blkdev.h>
#endif
/*
@@ -263,6 +264,9 @@ int journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
+ /* Flush disk caches to get replayed data on the permanent storage */
+ if (journal->j_flags & JFS_BARRIER)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
return err;
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index d7ab092..7c86b37 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -26,6 +26,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
+#include <linux/backing-dev.h>
static void __journal_temp_unlink_buffer(struct journal_head *jh);
@@ -99,11 +100,10 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
alloc_transaction:
if (!journal->j_running_transaction) {
- new_transaction = kzalloc(sizeof(*new_transaction),
- GFP_NOFS|__GFP_NOFAIL);
+ new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS);
if (!new_transaction) {
- ret = -ENOMEM;
- goto out;
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto alloc_transaction;
}
}
@@ -696,7 +696,6 @@ repeat:
if (!jh->b_transaction) {
JBUFFER_TRACE(jh, "no transaction");
J_ASSERT_JH(jh, !jh->b_next_transaction);
- jh->b_transaction = transaction;
JBUFFER_TRACE(jh, "file as BJ_Reserved");
spin_lock(&journal->j_list_lock);
__journal_file_buffer(jh, transaction, BJ_Reserved);
@@ -818,7 +817,6 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
* committed and so it's safe to clear the dirty bit.
*/
clear_buffer_dirty(jh2bh(jh));
- jh->b_transaction = transaction;
/* first access by this transaction */
jh->b_modified = 0;
@@ -844,8 +842,8 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
*/
JBUFFER_TRACE(jh, "cancelling revoke");
journal_cancel_revoke(handle, jh);
- journal_put_journal_head(jh);
out:
+ journal_put_journal_head(jh);
return err;
}
@@ -1069,8 +1067,9 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
ret = -EIO;
goto no_journal;
}
-
- if (jh->b_transaction != NULL) {
+ /* We might have slept so buffer could be refiled now */
+ if (jh->b_transaction != NULL &&
+ jh->b_transaction != handle->h_transaction) {
JBUFFER_TRACE(jh, "unfile from commit");
__journal_temp_unlink_buffer(jh);
/* It still points to the committing
@@ -1091,8 +1090,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
JBUFFER_TRACE(jh, "not on correct data list: unfile");
J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __journal_temp_unlink_buffer(jh);
- jh->b_transaction = handle->h_transaction;
JBUFFER_TRACE(jh, "file as data");
__journal_file_buffer(jh, handle->h_transaction,
BJ_SyncData);
@@ -1300,8 +1297,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
__journal_file_buffer(jh, transaction, BJ_Forget);
} else {
__journal_unfile_buffer(jh);
- journal_remove_journal_head(bh);
- __brelse(bh);
if (!buffer_jbd(bh)) {
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
@@ -1622,19 +1617,32 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh)
mark_buffer_dirty(bh); /* Expose it to the VM */
}
+/*
+ * Remove buffer from all transactions.
+ *
+ * Called with bh_state lock and j_list_lock
+ *
+ * jh and bh may be already freed when this function returns.
+ */
void __journal_unfile_buffer(struct journal_head *jh)
{
__journal_temp_unlink_buffer(jh);
jh->b_transaction = NULL;
+ journal_put_journal_head(jh);
}
void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
{
- jbd_lock_bh_state(jh2bh(jh));
+ struct buffer_head *bh = jh2bh(jh);
+
+ /* Get reference so that buffer cannot be freed before we unlock it */
+ get_bh(bh);
+ jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
__journal_unfile_buffer(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(jh2bh(jh));
+ jbd_unlock_bh_state(bh);
+ __brelse(bh);
}
/*
@@ -1661,16 +1669,12 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
/* A written-back ordered data buffer */
JBUFFER_TRACE(jh, "release data");
__journal_unfile_buffer(jh);
- journal_remove_journal_head(bh);
- __brelse(bh);
}
} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
__journal_remove_checkpoint(jh);
- journal_remove_journal_head(bh);
- __brelse(bh);
}
}
spin_unlock(&journal->j_list_lock);
@@ -1733,7 +1737,7 @@ int journal_try_to_free_buffers(journal_t *journal,
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
- * journal_remove_journal_head() and journal_put_journal_head().
+ * journal_put_journal_head().
*/
jh = journal_grab_journal_head(bh);
if (!jh)
@@ -1770,10 +1774,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
int may_free = 1;
struct buffer_head *bh = jh2bh(jh);
- __journal_unfile_buffer(jh);
-
if (jh->b_cp_transaction) {
JBUFFER_TRACE(jh, "on running+cp transaction");
+ __journal_temp_unlink_buffer(jh);
/*
* We don't want to write the buffer anymore, clear the
* bit so that we don't confuse checks in
@@ -1784,8 +1787,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
may_free = 0;
} else {
JBUFFER_TRACE(jh, "on running transaction");
- journal_remove_journal_head(bh);
- __brelse(bh);
+ __journal_unfile_buffer(jh);
}
return may_free;
}
@@ -2096,6 +2098,8 @@ void __journal_file_buffer(struct journal_head *jh,
if (jh->b_transaction)
__journal_temp_unlink_buffer(jh);
+ else
+ journal_grab_journal_head(bh);
jh->b_transaction = transaction;
switch (jlist) {
@@ -2153,9 +2157,10 @@ void journal_file_buffer(struct journal_head *jh,
* already started to be used by a subsequent transaction, refile the
* buffer on that transaction's metadata list.
*
- * Called under journal->j_list_lock
- *
+ * Called under j_list_lock
* Called under jbd_lock_bh_state(jh2bh(jh))
+ *
+ * jh and bh may be already free when this function returns
*/
void __journal_refile_buffer(struct journal_head *jh)
{
@@ -2179,6 +2184,11 @@ void __journal_refile_buffer(struct journal_head *jh)
was_dirty = test_clear_buffer_jbddirty(bh);
__journal_temp_unlink_buffer(jh);
+ /*
+ * We set b_transaction here because b_next_transaction will inherit
+ * our jh reference and thus __journal_file_buffer() must not take a
+ * new one.
+ */
jh->b_transaction = jh->b_next_transaction;
jh->b_next_transaction = NULL;
if (buffer_freed(bh))
@@ -2195,30 +2205,21 @@ void __journal_refile_buffer(struct journal_head *jh)
}
/*
- * For the unlocked version of this call, also make sure that any
- * hanging journal_head is cleaned up if necessary.
- *
- * __journal_refile_buffer is usually called as part of a single locked
- * operation on a buffer_head, in which the caller is probably going to
- * be hooking the journal_head onto other lists. In that case it is up
- * to the caller to remove the journal_head if necessary. For the
- * unlocked journal_refile_buffer call, the caller isn't going to be
- * doing anything else to the buffer so we need to do the cleanup
- * ourselves to avoid a jh leak.
- *
- * *** The journal_head may be freed by this call! ***
+ * __journal_refile_buffer() with necessary locking added. We take our bh
+ * reference so that we can safely unlock bh.
+ *
+ * The jh and bh may be freed by this call.
*/
void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
struct buffer_head *bh = jh2bh(jh);
+ /* Get reference so that buffer cannot be freed before we unlock it */
+ get_bh(bh);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
-
__journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
-
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index e5de942..45559dc 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -27,7 +27,7 @@
#include "jfs_xattr.h"
#include "jfs_acl.h"
-static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
+struct posix_acl *jfs_get_acl(struct inode *inode, int type)
{
struct posix_acl *acl;
char *ea_name;
@@ -114,30 +114,9 @@ out:
return rc;
}
-int jfs_check_acl(struct inode *inode, int mask, unsigned int flags)
-{
- struct posix_acl *acl;
-
- if (flags & IPERM_FLAG_RCU)
- return -ECHILD;
-
- acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- int error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- return error;
- }
-
- return -EAGAIN;
-}
-
int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
{
struct posix_acl *acl = NULL;
- struct posix_acl *clone;
- mode_t mode;
int rc = 0;
if (S_ISLNK(inode->i_mode))
@@ -153,20 +132,11 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
if (rc)
goto cleanup;
}
- clone = posix_acl_clone(acl, GFP_KERNEL);
- if (!clone) {
- rc = -ENOMEM;
- goto cleanup;
- }
- mode = inode->i_mode;
- rc = posix_acl_create_masq(clone, &mode);
- if (rc >= 0) {
- inode->i_mode = mode;
- if (rc > 0)
- rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS,
- clone);
- }
- posix_acl_release(clone);
+ rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
+ if (rc < 0)
+ goto cleanup; /* posix_acl_release(NULL) is no-op */
+ if (rc > 0)
+ rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
cleanup:
posix_acl_release(acl);
} else
@@ -180,8 +150,9 @@ cleanup:
int jfs_acl_chmod(struct inode *inode)
{
- struct posix_acl *acl, *clone;
+ struct posix_acl *acl;
int rc;
+ tid_t tid;
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
@@ -190,22 +161,18 @@ int jfs_acl_chmod(struct inode *inode)
if (IS_ERR(acl) || !acl)
return PTR_ERR(acl);
- clone = posix_acl_clone(acl, GFP_KERNEL);
- posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
-
- rc = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!rc) {
- tid_t tid = txBegin(inode->i_sb, 0);
- mutex_lock(&JFS_IP(inode)->commit_mutex);
- rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, clone);
- if (!rc)
- rc = txCommit(tid, 1, &inode, 0);
- txEnd(tid);
- mutex_unlock(&JFS_IP(inode)->commit_mutex);
- }
+ rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ if (rc)
+ return rc;
- posix_acl_release(clone);
+ tid = txBegin(inode->i_sb, 0);
+ mutex_lock(&JFS_IP(inode)->commit_mutex);
+ rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
+ if (!rc)
+ rc = txCommit(tid, 1, &inode, 0);
+ txEnd(tid);
+ mutex_unlock(&JFS_IP(inode)->commit_mutex);
+
+ posix_acl_release(acl);
return rc;
}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 2f3f531..844f946 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -28,19 +28,26 @@
#include "jfs_acl.h"
#include "jfs_debug.h"
-int jfs_fsync(struct file *file, int datasync)
+int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
int rc = 0;
+ rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (rc)
+ return rc;
+
+ mutex_lock(&inode->i_mutex);
if (!(inode->i_state & I_DIRTY) ||
(datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
/* Make sure committed changes hit the disk */
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
+ mutex_unlock(&inode->i_mutex);
return rc;
}
rc |= jfs_commit_inode(inode, 1);
+ mutex_unlock(&inode->i_mutex);
return rc ? -EIO : 0;
}
@@ -110,6 +117,8 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
+ inode_dio_wait(inode);
+
rc = vmtruncate(inode, iattr->ia_size);
if (rc)
return rc;
@@ -131,7 +140,7 @@ const struct inode_operations jfs_file_inode_operations = {
.removexattr = jfs_removexattr,
.setattr = jfs_setattr,
#ifdef CONFIG_JFS_POSIX_ACL
- .check_acl = jfs_check_acl,
+ .get_acl = jfs_get_acl,
#endif
};
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 09100b4..13fc885 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -329,8 +329,8 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode = file->f_mapping->host;
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, jfs_get_block, NULL);
+ ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ jfs_get_block);
/*
* In case of error extending write may have instantiated a few
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index f9285c4..ad84fe5 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
#ifdef CONFIG_JFS_POSIX_ACL
-int jfs_check_acl(struct inode *, int, unsigned int flags);
+struct posix_acl *jfs_get_acl(struct inode *inode, int type);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
int jfs_acl_chmod(struct inode *inode);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 4496872..9cbd11a 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -3161,7 +3161,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
{
int rc;
int dbitno, word, rembits, nb, nwords, wbitno, agno;
- s8 oldroot, *leaf;
+ s8 oldroot;
struct dmaptree *tp = (struct dmaptree *) & dp->tree;
/* save the current value of the root (i.e. maximum free string)
@@ -3169,9 +3169,6 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
*/
oldroot = tp->stree[ROOT];
- /* pick up a pointer to the leaves of the dmap tree */
- leaf = tp->stree + LEAFIND;
-
/* determine the bit number and word within the dmap of the
* starting block.
*/
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index b6f17c0..f6f32fa 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -3048,9 +3048,9 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
dir_index = (u32) filp->f_pos;
/*
- * NFSv4 reserves cookies 1 and 2 for . and .. so we add
- * the value we return to the vfs is one greater than the
- * one we use internally.
+ * NFSv4 reserves cookies 1 and 2 for . and .. so the value
+ * we return to the vfs is one greater than the one we use
+ * internally.
*/
if (dir_index)
dir_index--;
@@ -3103,7 +3103,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
* self "."
*/
filp->f_pos = 1;
- if (filldir(dirent, ".", 1, 0, ip->i_ino,
+ if (filldir(dirent, ".", 1, 1, ip->i_ino,
DT_DIR))
return 0;
}
@@ -3111,7 +3111,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
* parent ".."
*/
filp->f_pos = 2;
- if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR))
+ if (filldir(dirent, "..", 2, 2, PARENT(ip), DT_DIR))
return 0;
/*
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index b78b2f9..1b6f15f 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -457,7 +457,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
/* read the page of fixed disk inode (AIT) in raw mode */
mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
if (mp == NULL) {
- ip->i_nlink = 1; /* Don't want iput() deleting it */
+ set_nlink(ip, 1); /* Don't want iput() deleting it */
iput(ip);
return (NULL);
}
@@ -469,7 +469,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
/* copy on-disk inode to in-memory inode */
if ((copy_from_dinode(dp, ip)) != 0) {
/* handle bad return by returning NULL for ip */
- ip->i_nlink = 1; /* Don't want iput() deleting it */
+ set_nlink(ip, 1); /* Don't want iput() deleting it */
iput(ip);
/* release the page */
release_metapage(mp);
@@ -3076,7 +3076,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
ip->i_mode |= 0001;
}
}
- ip->i_nlink = le32_to_cpu(dip->di_nlink);
+ set_nlink(ip, le32_to_cpu(dip->di_nlink));
jfs_ip->saved_uid = le32_to_cpu(dip->di_uid);
if (sbi->uid == -1)
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 2686531..7f464c5 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -95,7 +95,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
if (insert_inode_locked(inode) < 0) {
rc = -EINVAL;
- goto fail_unlock;
+ goto fail_put;
}
inode_init_owner(inode, parent, mode);
@@ -156,8 +156,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
fail_drop:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
-fail_unlock:
- inode->i_nlink = 0;
+ clear_nlink(inode);
unlock_new_inode(inode);
fail_put:
iput(inode);
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index ec2fb8b..9271cfe 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@
struct fid;
extern struct inode *ialloc(struct inode *, umode_t);
-extern int jfs_fsync(struct file *, int);
+extern int jfs_fsync(struct file *, loff_t, loff_t, int);
extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
extern struct inode *jfs_iget(struct super_block *, unsigned long);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index ee55e45..bfb2a91 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -67,6 +67,7 @@
#include <linux/buffer_head.h> /* for sync_blockdev() */
#include <linux/bio.h>
#include <linux/freezer.h>
+#include <linux/export.h>
#include <linux/delay.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index f6cc0c0..af96060 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1143,7 +1143,6 @@ int txCommit(tid_t tid, /* transaction identifier */
struct jfs_log *log;
struct tblock *tblk;
struct lrd *lrd;
- int lsn;
struct inode *ip;
struct jfs_inode_info *jfs_ip;
int k, n;
@@ -1310,7 +1309,7 @@ int txCommit(tid_t tid, /* transaction identifier */
*/
lrd->type = cpu_to_le16(LOG_COMMIT);
lrd->length = 0;
- lsn = lmLog(log, tblk, lrd, NULL);
+ lmLog(log, tblk, lrd, NULL);
lmGroupCommit(log, tblk);
@@ -2935,7 +2934,6 @@ int jfs_sync(void *arg)
{
struct inode *ip;
struct jfs_inode_info *jfs_ip;
- int rc;
tid_t tid;
do {
@@ -2961,7 +2959,7 @@ int jfs_sync(void *arg)
*/
TXN_UNLOCK();
tid = txBegin(ip->i_sb, COMMIT_INODE);
- rc = txCommit(tid, 1, &ip, 0);
+ txCommit(tid, 1, &ip, 0);
txEnd(tid);
mutex_unlock(&jfs_ip->commit_mutex);
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index adcf92d..7971f37 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb)
/*
* Wait for outstanding transactions to be written to log:
*/
- jfs_flush_journal(log, 1);
+ jfs_flush_journal(log, 2);
/*
* close fileset inode allocation map (aka fileset inode)
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb)
*
* remove file system from log active file system list.
*/
- jfs_flush_journal(log, 1);
+ jfs_flush_journal(log, 2);
/*
* Make sure all metadata makes it to disk
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index eaaf2b5..a112ad9 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -172,7 +172,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
mutex_unlock(&JFS_IP(dip)->commit_mutex);
if (rc) {
free_ea_wmap(ip);
- ip->i_nlink = 0;
+ clear_nlink(ip);
unlock_new_inode(ip);
iput(ip);
} else {
@@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
goto out3;
}
- ip->i_nlink = 2; /* for '.' */
+ set_nlink(ip, 2); /* for '.' */
ip->i_op = &jfs_dir_inode_operations;
ip->i_fop = &jfs_dir_operations;
@@ -311,7 +311,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
mutex_unlock(&JFS_IP(dip)->commit_mutex);
if (rc) {
free_ea_wmap(ip);
- ip->i_nlink = 0;
+ clear_nlink(ip);
unlock_new_inode(ip);
iput(ip);
} else {
@@ -844,7 +844,7 @@ static int jfs_link(struct dentry *old_dentry,
rc = txCommit(tid, 2, &iplist[0], 0);
if (rc) {
- ip->i_nlink--; /* never instantiated */
+ drop_nlink(ip); /* never instantiated */
iput(ip);
} else
d_instantiate(dentry, ip);
@@ -893,7 +893,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
unchar *i_fastsymlink;
s64 xlen = 0;
int bmask = 0, xsize;
- s64 extent = 0, xaddr;
+ s64 xaddr;
struct metapage *mp;
struct super_block *sb;
struct tblock *tblk;
@@ -993,7 +993,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
txAbort(tid, 0);
goto out3;
}
- extent = xaddr;
ip->i_size = ssize - 1;
while (ssize) {
/* This is kind of silly since PATH_MAX == 4K */
@@ -1049,7 +1048,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
mutex_unlock(&JFS_IP(dip)->commit_mutex);
if (rc) {
free_ea_wmap(ip);
- ip->i_nlink = 0;
+ clear_nlink(ip);
unlock_new_inode(ip);
iput(ip);
} else {
@@ -1434,7 +1433,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
mutex_unlock(&JFS_IP(dir)->commit_mutex);
if (rc) {
free_ea_wmap(ip);
- ip->i_nlink = 0;
+ clear_nlink(ip);
unlock_new_inode(ip);
iput(ip);
} else {
@@ -1456,34 +1455,23 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
ino_t inum;
struct inode *ip;
struct component_name key;
- const char *name = dentry->d_name.name;
- int len = dentry->d_name.len;
int rc;
- jfs_info("jfs_lookup: name = %s", name);
-
- if ((name[0] == '.') && (len == 1))
- inum = dip->i_ino;
- else if (strcmp(name, "..") == 0)
- inum = PARENT(dip);
- else {
- if ((rc = get_UCSname(&key, dentry)))
- return ERR_PTR(rc);
- rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP);
- free_UCSname(&key);
- if (rc == -ENOENT) {
- d_add(dentry, NULL);
- return NULL;
- } else if (rc) {
- jfs_err("jfs_lookup: dtSearch returned %d", rc);
- return ERR_PTR(rc);
- }
- }
-
- ip = jfs_iget(dip->i_sb, inum);
- if (IS_ERR(ip)) {
- jfs_err("jfs_lookup: iget failed on inum %d", (uint) inum);
- return ERR_CAST(ip);
+ jfs_info("jfs_lookup: name = %s", dentry->d_name.name);
+
+ if ((rc = get_UCSname(&key, dentry)))
+ return ERR_PTR(rc);
+ rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP);
+ free_UCSname(&key);
+ if (rc == -ENOENT) {
+ ip = NULL;
+ } else if (rc) {
+ jfs_err("jfs_lookup: dtSearch returned %d", rc);
+ ip = ERR_PTR(rc);
+ } else {
+ ip = jfs_iget(dip->i_sb, inum);
+ if (IS_ERR(ip))
+ jfs_err("jfs_lookup: iget failed on inum %d", (uint)inum);
}
return d_splice_alias(ip, dentry);
@@ -1548,7 +1536,7 @@ const struct inode_operations jfs_dir_inode_operations = {
.removexattr = jfs_removexattr,
.setattr = jfs_setattr,
#ifdef CONFIG_JFS_POSIX_ACL
- .check_acl = jfs_check_acl,
+ .get_acl = jfs_get_acl,
#endif
};
@@ -1597,8 +1585,6 @@ out:
static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
{
- if (nd && nd->flags & LOOKUP_RCU)
- return -ECHILD;
/*
* This is not negative dentry. Always valid.
*
@@ -1624,10 +1610,8 @@ static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
* case sensitive name which is specified by user if this is
* for creation.
*/
- if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
- if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
- return 0;
- }
+ if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+ return 0;
return 1;
}
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 06c8a67..a44eff0 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -485,7 +485,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
goto out_unload;
}
inode->i_ino = 0;
- inode->i_nlink = 1;
inode->i_size = sb->s_bdev->bd_inode->i_size;
inode->i_mapping->a_ops = &jfs_metapage_aops;
insert_inode_hash(inode);
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 24838f1..26683e1 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -693,8 +693,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
return rc;
}
if (acl) {
- mode_t mode = inode->i_mode;
- rc = posix_acl_equiv_mode(acl, &mode);
+ rc = posix_acl_equiv_mode(acl, &inode->i_mode);
posix_acl_release(acl);
if (rc < 0) {
printk(KERN_ERR
@@ -702,7 +701,6 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
rc);
return rc;
}
- inode->i_mode = mode;
mark_inode_dirty(inode);
}
/*
@@ -1091,38 +1089,37 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
}
#ifdef CONFIG_JFS_SECURITY
-int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int rc;
- size_t len;
- void *value;
- char *suffix;
+ const struct xattr *xattr;
+ tid_t *tid = fs_info;
char *name;
-
- rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
- &len);
- if (rc) {
- if (rc == -EOPNOTSUPP)
- return 0;
- return rc;
- }
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix),
- GFP_NOFS);
- if (!name) {
- rc = -ENOMEM;
- goto kmalloc_failed;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ break;
+ }
+ strcpy(name, XATTR_SECURITY_PREFIX);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+ err = __jfs_setxattr(*tid, inode, name,
+ xattr->value, xattr->value_len, 0);
+ kfree(name);
+ if (err < 0)
+ break;
}
- strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-
- rc = __jfs_setxattr(tid, inode, name, value, len, 0);
-
- kfree(name);
-kmalloc_failed:
- kfree(suffix);
- kfree(value);
+ return err;
+}
- return rc;
+int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &jfs_initxattrs, &tid);
}
#endif
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index fbb2a5e..10e6366 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -28,18 +28,6 @@ config NFSD
If unsure, say N.
-config NFSD_DEPRECATED
- bool "Include support for deprecated syscall interface to NFSD"
- depends on NFSD
- default y
- help
- The syscall interface to nfsd was obsoleted in 2.6.0 by a new
- filesystem based interface. The old interface is due for removal
- in 2.6.40. If you wish to remove the interface before then
- say N.
-
- In unsure, say Y.
-
config NFSD_V2_ACL
bool
depends on NFSD
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index d892be6..93cc9d3 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -69,7 +69,7 @@ enum {
int nfsd_reply_cache_init(void);
void nfsd_reply_cache_shutdown(void);
-int nfsd_cache_lookup(struct svc_rqst *, int);
+int nfsd_cache_lookup(struct svc_rqst *);
void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
#ifdef CONFIG_NFSD_V4
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index d7c4f02..a0205fc 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -16,7 +16,6 @@
#include <linux/module.h>
#include <linux/exportfs.h>
-#include <linux/nfsd/syscall.h>
#include <net/ipv6.h>
#include "nfsd.h"
@@ -318,7 +317,6 @@ static void svc_export_put(struct kref *ref)
struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
path_put(&exp->ex_path);
auth_domain_put(exp->ex_client);
- kfree(exp->ex_pathname);
nfsd4_fslocs_free(&exp->ex_fslocs);
kfree(exp);
}
@@ -528,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
exp.ex_client = dom;
- err = -ENOMEM;
- exp.ex_pathname = kstrdup(buf, GFP_KERNEL);
- if (!exp.ex_pathname)
- goto out2;
-
/* expiry */
err = -EINVAL;
exp.h.expiry_time = get_expiry(&mesg);
@@ -613,8 +606,6 @@ out4:
nfsd4_fslocs_free(&exp.ex_fslocs);
kfree(exp.ex_uuid);
out3:
- kfree(exp.ex_pathname);
-out2:
path_put(&exp.ex_path);
out1:
auth_domain_put(dom);
@@ -678,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_client = item->ex_client;
new->ex_path.dentry = dget(item->ex_path.dentry);
new->ex_path.mnt = mntget(item->ex_path.mnt);
- new->ex_pathname = NULL;
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
@@ -696,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
new->ex_fsid = item->ex_fsid;
new->ex_uuid = item->ex_uuid;
item->ex_uuid = NULL;
- new->ex_pathname = item->ex_pathname;
- item->ex_pathname = NULL;
new->ex_fslocs.locations = item->ex_fslocs.locations;
item->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
@@ -797,58 +785,6 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
return ek;
}
-#ifdef CONFIG_NFSD_DEPRECATED
-static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
- struct svc_export *exp)
-{
- struct svc_expkey key, *ek;
-
- key.ek_client = clp;
- key.ek_fsidtype = fsid_type;
- memcpy(key.ek_fsid, fsidv, key_len(fsid_type));
- key.ek_path = exp->ex_path;
- key.h.expiry_time = NEVER;
- key.h.flags = 0;
-
- ek = svc_expkey_lookup(&key);
- if (ek)
- ek = svc_expkey_update(&key,ek);
- if (ek) {
- cache_put(&ek->h, &svc_expkey_cache);
- return 0;
- }
- return -ENOMEM;
-}
-
-/*
- * Find the client's export entry matching xdev/xino.
- */
-static inline struct svc_expkey *
-exp_get_key(svc_client *clp, dev_t dev, ino_t ino)
-{
- u32 fsidv[3];
-
- if (old_valid_dev(dev)) {
- mk_fsid(FSID_DEV, fsidv, dev, ino, 0, NULL);
- return exp_find_key(clp, FSID_DEV, fsidv, NULL);
- }
- mk_fsid(FSID_ENCODE_DEV, fsidv, dev, ino, 0, NULL);
- return exp_find_key(clp, FSID_ENCODE_DEV, fsidv, NULL);
-}
-
-/*
- * Find the client's export entry matching fsid
- */
-static inline struct svc_expkey *
-exp_get_fsid_key(svc_client *clp, int fsid)
-{
- u32 fsidv[2];
-
- mk_fsid(FSID_NUM, fsidv, 0, 0, fsid, NULL);
-
- return exp_find_key(clp, FSID_NUM, fsidv, NULL);
-}
-#endif
static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
struct cache_req *reqp)
@@ -890,275 +826,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path)
return exp;
}
-#ifdef CONFIG_NFSD_DEPRECATED
-/*
- * Hashtable locking. Write locks are placed only by user processes
- * wanting to modify export information.
- * Write locking only done in this file. Read locking
- * needed externally.
- */
-static DECLARE_RWSEM(hash_sem);
-
-void
-exp_readlock(void)
-{
- down_read(&hash_sem);
-}
-
-static inline void
-exp_writelock(void)
-{
- down_write(&hash_sem);
-}
-
-void
-exp_readunlock(void)
-{
- up_read(&hash_sem);
-}
-
-static inline void
-exp_writeunlock(void)
-{
- up_write(&hash_sem);
-}
-#else
-
-/* hash_sem not needed once deprecated interface is removed */
-void exp_readlock(void) {}
-static inline void exp_writelock(void){}
-void exp_readunlock(void) {}
-static inline void exp_writeunlock(void){}
-
-#endif
-
-#ifdef CONFIG_NFSD_DEPRECATED
-static void exp_do_unexport(svc_export *unexp);
-static int exp_verify_string(char *cp, int max);
-
-static void exp_fsid_unhash(struct svc_export *exp)
-{
- struct svc_expkey *ek;
-
- if ((exp->ex_flags & NFSEXP_FSID) == 0)
- return;
-
- ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
- if (!IS_ERR(ek)) {
- sunrpc_invalidate(&ek->h, &svc_expkey_cache);
- cache_put(&ek->h, &svc_expkey_cache);
- }
-}
-
-static int exp_fsid_hash(svc_client *clp, struct svc_export *exp)
-{
- u32 fsid[2];
-
- if ((exp->ex_flags & NFSEXP_FSID) == 0)
- return 0;
-
- mk_fsid(FSID_NUM, fsid, 0, 0, exp->ex_fsid, NULL);
- return exp_set_key(clp, FSID_NUM, fsid, exp);
-}
-
-static int exp_hash(struct auth_domain *clp, struct svc_export *exp)
-{
- u32 fsid[2];
- struct inode *inode = exp->ex_path.dentry->d_inode;
- dev_t dev = inode->i_sb->s_dev;
-
- if (old_valid_dev(dev)) {
- mk_fsid(FSID_DEV, fsid, dev, inode->i_ino, 0, NULL);
- return exp_set_key(clp, FSID_DEV, fsid, exp);
- }
- mk_fsid(FSID_ENCODE_DEV, fsid, dev, inode->i_ino, 0, NULL);
- return exp_set_key(clp, FSID_ENCODE_DEV, fsid, exp);
-}
-
-static void exp_unhash(struct svc_export *exp)
-{
- struct svc_expkey *ek;
- struct inode *inode = exp->ex_path.dentry->d_inode;
-
- ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
- if (!IS_ERR(ek)) {
- sunrpc_invalidate(&ek->h, &svc_expkey_cache);
- cache_put(&ek->h, &svc_expkey_cache);
- }
-}
-
-/*
- * Export a file system.
- */
-int
-exp_export(struct nfsctl_export *nxp)
-{
- svc_client *clp;
- struct svc_export *exp = NULL;
- struct svc_export new;
- struct svc_expkey *fsid_key = NULL;
- struct path path;
- int err;
-
- /* Consistency check */
- err = -EINVAL;
- if (!exp_verify_string(nxp->ex_path, NFS_MAXPATHLEN) ||
- !exp_verify_string(nxp->ex_client, NFSCLNT_IDMAX))
- goto out;
-
- dprintk("exp_export called for %s:%s (%x/%ld fl %x).\n",
- nxp->ex_client, nxp->ex_path,
- (unsigned)nxp->ex_dev, (long)nxp->ex_ino,
- nxp->ex_flags);
-
- /* Try to lock the export table for update */
- exp_writelock();
-
- /* Look up client info */
- if (!(clp = auth_domain_find(nxp->ex_client)))
- goto out_unlock;
-
-
- /* Look up the dentry */
- err = kern_path(nxp->ex_path, 0, &path);
- if (err)
- goto out_put_clp;
- err = -EINVAL;
-
- exp = exp_get_by_name(clp, &path, NULL);
-
- memset(&new, 0, sizeof(new));
-
- /* must make sure there won't be an ex_fsid clash */
- if ((nxp->ex_flags & NFSEXP_FSID) &&
- (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) &&
- fsid_key->ek_path.mnt &&
- (fsid_key->ek_path.mnt != path.mnt ||
- fsid_key->ek_path.dentry != path.dentry))
- goto finish;
-
- if (!IS_ERR(exp)) {
- /* just a flags/id/fsid update */
-
- exp_fsid_unhash(exp);
- exp->ex_flags = nxp->ex_flags;
- exp->ex_anon_uid = nxp->ex_anon_uid;
- exp->ex_anon_gid = nxp->ex_anon_gid;
- exp->ex_fsid = nxp->ex_dev;
-
- err = exp_fsid_hash(clp, exp);
- goto finish;
- }
-
- err = check_export(path.dentry->d_inode, &nxp->ex_flags, NULL);
- if (err) goto finish;
-
- err = -ENOMEM;
-
- dprintk("nfsd: creating export entry %p for client %p\n", exp, clp);
-
- new.h.expiry_time = NEVER;
- new.h.flags = 0;
- new.ex_pathname = kstrdup(nxp->ex_path, GFP_KERNEL);
- if (!new.ex_pathname)
- goto finish;
- new.ex_client = clp;
- new.ex_path = path;
- new.ex_flags = nxp->ex_flags;
- new.ex_anon_uid = nxp->ex_anon_uid;
- new.ex_anon_gid = nxp->ex_anon_gid;
- new.ex_fsid = nxp->ex_dev;
-
- exp = svc_export_lookup(&new);
- if (exp)
- exp = svc_export_update(&new, exp);
-
- if (!exp)
- goto finish;
-
- if (exp_hash(clp, exp) ||
- exp_fsid_hash(clp, exp)) {
- /* failed to create at least one index */
- exp_do_unexport(exp);
- cache_flush();
- } else
- err = 0;
-finish:
- kfree(new.ex_pathname);
- if (!IS_ERR_OR_NULL(exp))
- exp_put(exp);
- if (!IS_ERR_OR_NULL(fsid_key))
- cache_put(&fsid_key->h, &svc_expkey_cache);
- path_put(&path);
-out_put_clp:
- auth_domain_put(clp);
-out_unlock:
- exp_writeunlock();
-out:
- return err;
-}
-
-/*
- * Unexport a file system. The export entry has already
- * been removed from the client's list of exported fs's.
- */
-static void
-exp_do_unexport(svc_export *unexp)
-{
- sunrpc_invalidate(&unexp->h, &svc_export_cache);
- exp_unhash(unexp);
- exp_fsid_unhash(unexp);
-}
-
-
-/*
- * unexport syscall.
- */
-int
-exp_unexport(struct nfsctl_export *nxp)
-{
- struct auth_domain *dom;
- svc_export *exp;
- struct path path;
- int err;
-
- /* Consistency check */
- if (!exp_verify_string(nxp->ex_path, NFS_MAXPATHLEN) ||
- !exp_verify_string(nxp->ex_client, NFSCLNT_IDMAX))
- return -EINVAL;
-
- exp_writelock();
-
- err = -EINVAL;
- dom = auth_domain_find(nxp->ex_client);
- if (!dom) {
- dprintk("nfsd: unexport couldn't find %s\n", nxp->ex_client);
- goto out_unlock;
- }
-
- err = kern_path(nxp->ex_path, 0, &path);
- if (err)
- goto out_domain;
-
- err = -EINVAL;
- exp = exp_get_by_name(dom, &path, NULL);
- path_put(&path);
- if (IS_ERR(exp))
- goto out_domain;
-
- exp_do_unexport(exp);
- exp_put(exp);
- err = 0;
-
-out_domain:
- auth_domain_put(dom);
- cache_flush();
-out_unlock:
- exp_writeunlock();
- return err;
-}
-#endif /* CONFIG_NFSD_DEPRECATED */
/*
* Obtain the root fh on behalf of a client.
@@ -1330,7 +998,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
return exp;
}
-static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp)
+struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp)
{
u32 fsidv[2];
@@ -1350,7 +1018,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
struct svc_export *exp;
__be32 rv;
- exp = find_fsidzero_export(rqstp);
+ exp = rqst_find_fsidzero_export(rqstp);
if (IS_ERR(exp))
return nfserrno(PTR_ERR(exp));
rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
@@ -1367,7 +1035,6 @@ static void *e_start(struct seq_file *m, loff_t *pos)
unsigned hash, export;
struct cache_head *ch;
- exp_readlock();
read_lock(&svc_export_cache.hash_lock);
if (!n--)
return SEQ_START_TOKEN;
@@ -1418,7 +1085,6 @@ static void e_stop(struct seq_file *m, void *p)
__releases(svc_export_cache.hash_lock)
{
read_unlock(&svc_export_cache.hash_lock);
- exp_readunlock();
}
static struct flags {
@@ -1550,97 +1216,6 @@ const struct seq_operations nfs_exports_op = {
.show = e_show,
};
-#ifdef CONFIG_NFSD_DEPRECATED
-/*
- * Add or modify a client.
- * Change requests may involve the list of host addresses. The list of
- * exports and possibly existing uid maps are left untouched.
- */
-int
-exp_addclient(struct nfsctl_client *ncp)
-{
- struct auth_domain *dom;
- int i, err;
- struct in6_addr addr6;
-
- /* First, consistency check. */
- err = -EINVAL;
- if (! exp_verify_string(ncp->cl_ident, NFSCLNT_IDMAX))
- goto out;
- if (ncp->cl_naddr > NFSCLNT_ADDRMAX)
- goto out;
-
- /* Lock the hashtable */
- exp_writelock();
-
- dom = unix_domain_find(ncp->cl_ident);
-
- err = -ENOMEM;
- if (!dom)
- goto out_unlock;
-
- /* Insert client into hashtable. */
- for (i = 0; i < ncp->cl_naddr; i++) {
- ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6);
- auth_unix_add_addr(&init_net, &addr6, dom);
- }
- auth_unix_forget_old(dom);
- auth_domain_put(dom);
-
- err = 0;
-
-out_unlock:
- exp_writeunlock();
-out:
- return err;
-}
-
-/*
- * Delete a client given an identifier.
- */
-int
-exp_delclient(struct nfsctl_client *ncp)
-{
- int err;
- struct auth_domain *dom;
-
- err = -EINVAL;
- if (!exp_verify_string(ncp->cl_ident, NFSCLNT_IDMAX))
- goto out;
-
- /* Lock the hashtable */
- exp_writelock();
-
- dom = auth_domain_find(ncp->cl_ident);
- /* just make sure that no addresses work
- * and that it will expire soon
- */
- if (dom) {
- err = auth_unix_forget_old(dom);
- auth_domain_put(dom);
- }
-
- exp_writeunlock();
-out:
- return err;
-}
-
-/*
- * Verify that string is non-empty and does not exceed max length.
- */
-static int
-exp_verify_string(char *cp, int max)
-{
- int i;
-
- for (i = 0; i < max; i++)
- if (!cp[i])
- return i;
- cp[i] = 0;
- printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp);
- return 0;
-}
-#endif /* CONFIG_NFSD_DEPRECATED */
/*
* Initialize the exports module.
@@ -1667,10 +1242,8 @@ nfsd_export_init(void)
void
nfsd_export_flush(void)
{
- exp_writelock();
cache_purge(&svc_expkey_cache);
cache_purge(&svc_export_cache);
- exp_writeunlock();
}
/*
@@ -1682,12 +1255,9 @@ nfsd_export_shutdown(void)
dprintk("nfsd: shutting down export module.\n");
- exp_writelock();
-
cache_unregister(&svc_expkey_cache);
cache_unregister(&svc_export_cache);
svcauth_unix_purge();
- exp_writeunlock();
dprintk("nfsd: export shutdown complete.\n");
}
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 7c831a2..77e7a5c 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -35,10 +35,8 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
fh.fh_export = NULL;
- exp_readlock();
nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
fh_put(&fh);
- exp_readunlock();
/* We return nlm error codes as nlm doesn't know
* about nfsd, but nfsd does know about nlm..
*/
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index ad88f1c..435a9be 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/nfs_fs.h>
+#include <linux/export.h>
#include "acl.h"
@@ -372,8 +373,10 @@ sort_pacl(struct posix_acl *pacl)
* by uid/gid. */
int i, j;
- if (pacl->a_count <= 4)
- return; /* no users or groups */
+ /* no users or groups */
+ if (!pacl || pacl->a_count <= 4)
+ return;
+
i = 1;
while (pacl->a_entries[i].e_tag == ACL_USER)
i++;
@@ -497,13 +500,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
/*
* ACLs with no ACEs are treated differently in the inheritable
- * and effective cases: when there are no inheritable ACEs, we
- * set a zero-length default posix acl:
+ * and effective cases: when there are no inheritable ACEs,
+ * calls ->set_acl with a NULL ACL structure.
*/
- if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) {
- pacl = posix_acl_alloc(0, GFP_KERNEL);
- return pacl ? pacl : ERR_PTR(-ENOMEM);
- }
+ if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT))
+ return NULL;
+
/*
* When there are no effective ACEs, the following will end
* up setting a 3-element effective posix ACL with all
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 02eb4ed..51b4f43 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -39,6 +39,8 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
+static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+
#define NFSPROC4_CB_NULL 0
#define NFSPROC4_CB_COMPOUND 1
@@ -351,7 +353,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
__be32 *p;
encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
- encode_stateid4(xdr, &dp->dl_stateid);
+ encode_stateid4(xdr, &dp->dl_stid.sc_stateid);
p = xdr_reserve_space(xdr, 4);
*p++ = xdr_zero; /* truncate */
@@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
*/
status = 0;
out:
+ if (status)
+ nfsd4_mark_cb_fault(cb->cb_clp, status);
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -629,9 +633,11 @@ static int max_cb_time(void)
static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
{
+ int maxtime = max_cb_time();
struct rpc_timeout timeparms = {
- .to_initval = max_cb_time(),
+ .to_initval = maxtime,
.to_retries = 0,
+ .to_maxval = maxtime,
};
struct rpc_create_args args = {
.net = &init_net,
@@ -686,6 +692,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
warn_no_callback_path(clp, reason);
}
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+{
+ clp->cl_cb_state = NFSD4_CB_FAULT;
+ warn_no_callback_path(clp, reason);
+}
+
static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
{
struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
@@ -773,8 +785,12 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
{
if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
- dprintk("%s slot is busy\n", __func__);
- return false;
+ /* Race breaker */
+ if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
+ dprintk("%s slot is busy\n", __func__);
+ return false;
+ }
+ rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
}
return true;
}
@@ -787,7 +803,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
u32 minorversion = clp->cl_minorversion;
cb->cb_minorversion = minorversion;
@@ -809,7 +825,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
dprintk("%s: minorversion=%d\n", __func__,
clp->cl_minorversion);
@@ -832,7 +848,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
nfsd4_cb_done(task, calldata);
@@ -1006,7 +1022,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
void nfsd4_cb_recall(struct nfs4_delegation *dp)
{
struct nfsd4_callback *cb = &dp->dl_recall;
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
dp->dl_retries = 1;
cb->cb_op = dp;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index d06a02c..9a959de 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -35,6 +35,7 @@
#include <linux/file.h>
#include <linux/slab.h>
+#include "idmap.h"
#include "cache.h"
#include "xdr4.h"
#include "vfs.h"
@@ -170,12 +171,30 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
return status;
}
+static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
+{
+ umode_t mode = fh->fh_dentry->d_inode->i_mode;
+
+ if (S_ISREG(mode))
+ return nfs_ok;
+ if (S_ISDIR(mode))
+ return nfserr_isdir;
+ /*
+ * Using err_symlink as our catch-all case may look odd; but
+ * there's no other obvious error for this case in 4.0, and we
+ * happen to know that it will cause the linux v4 client to do
+ * the right thing on attempts to open something other than a
+ * regular file.
+ */
+ return nfserr_symlink;
+}
+
static __be32
do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
struct svc_fh resfh;
+ int accmode;
__be32 status;
- int created = 0;
fh_init(&resfh, NFS4_FHSIZE);
open->op_truncate = 0;
@@ -204,7 +223,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
open->op_fname.len, &open->op_iattr,
&resfh, open->op_createmode,
(u32 *)open->op_verf.data,
- &open->op_truncate, &created);
+ &open->op_truncate, &open->op_created);
/*
* Following rfc 3530 14.2.16, use the returned bitmask
@@ -213,7 +232,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
*/
if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
- FATTR4_WORD1_TIME_MODIFY);
+ FATTR4_WORD1_TIME_MODIFY);
} else {
status = nfsd_lookup(rqstp, current_fh,
open->op_fname.data, open->op_fname.len, &resfh);
@@ -221,6 +240,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
}
if (status)
goto out;
+ status = nfsd_check_obj_isreg(&resfh);
+ if (status)
+ goto out;
if (is_create_with_attrs(open) && open->op_acl != NULL)
do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval);
@@ -229,11 +251,12 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
fh_dup2(current_fh, &resfh);
/* set reply cache */
- fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
&resfh.fh_handle);
- if (!created)
- status = do_open_permission(rqstp, current_fh, open,
- NFSD_MAY_NOP);
+ accmode = NFSD_MAY_NOP;
+ if (open->op_created)
+ accmode |= NFSD_MAY_OWNER_OVERRIDE;
+ status = do_open_permission(rqstp, current_fh, open, accmode);
out:
fh_put(&resfh);
@@ -244,6 +267,7 @@ static __be32
do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
__be32 status;
+ int accmode = 0;
/* Only reclaims from previously confirmed clients are valid */
if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
@@ -256,14 +280,24 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
/* set replay cache */
- fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
&current_fh->fh_handle);
open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
(open->op_iattr.ia_size == 0);
+ /*
+ * In the delegation case, the client is telling us about an
+ * open that it *already* performed locally, some time ago. We
+ * should let it succeed now if possible.
+ *
+ * In the case of a CLAIM_FH open, on the other hand, the client
+ * may be counting on us to enforce permissions (the Linux 4.1
+ * client uses this for normal opens, for example).
+ */
+ if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH)
+ accmode = NFSD_MAY_OWNER_OVERRIDE;
- status = do_open_permission(rqstp, current_fh, open,
- NFSD_MAY_OWNER_OVERRIDE);
+ status = do_open_permission(rqstp, current_fh, open, accmode);
return status;
}
@@ -285,14 +319,27 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd4_compoundres *resp;
- dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
+ dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
(int)open->op_fname.len, open->op_fname.data,
- open->op_stateowner);
+ open->op_openowner);
/* This check required by spec. */
if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
return nfserr_inval;
+ /* We don't yet support WANT bits: */
+ open->op_share_access &= NFS4_SHARE_ACCESS_MASK;
+
+ open->op_created = 0;
+ /*
+ * RFC5661 18.51.3
+ * Before RECLAIM_COMPLETE done, server should deny new lock
+ */
+ if (nfsd4_has_session(cstate) &&
+ !cstate->session->se_client->cl_firststate &&
+ open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ return nfserr_grace;
+
if (nfsd4_has_session(cstate))
copy_clientid(&open->op_clientid, cstate->session);
@@ -302,7 +349,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
resp = rqstp->rq_resp;
status = nfsd4_process_open1(&resp->cstate, open);
if (status == nfserr_replay_me) {
- struct nfs4_replay *rp = &open->op_stateowner->so_replay;
+ struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
fh_put(&cstate->current_fh);
fh_copy_shallow(&cstate->current_fh.fh_handle,
&rp->rp_openfh);
@@ -332,32 +379,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
case NFS4_OPEN_CLAIM_NULL:
- /*
- * (1) set CURRENT_FH to the file being opened,
- * creating it if necessary, (2) set open->op_cinfo,
- * (3) set open->op_truncate if the file is to be
- * truncated after opening, (4) do permission checking.
- */
status = do_open_lookup(rqstp, &cstate->current_fh,
open);
if (status)
goto out;
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
- open->op_stateowner->so_confirmed = 1;
- /*
- * The CURRENT_FH is already set to the file being
- * opened. (1) set open->op_cinfo, (2) set
- * open->op_truncate if the file is to be truncated
- * after opening, (3) do permission checking.
- */
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
status = do_open_fhandle(rqstp, &cstate->current_fh,
open);
if (status)
goto out;
break;
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
case NFS4_OPEN_CLAIM_DELEGATE_PREV:
- open->op_stateowner->so_confirmed = 1;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
dprintk("NFSD: unsupported OPEN claim type %d\n",
open->op_claim_type);
status = nfserr_notsupp;
@@ -374,12 +412,13 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* set, (2) sets open->op_stateid, (3) sets open->op_delegation.
*/
status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
+ WARN_ON(status && open->op_created);
out:
- if (open->op_stateowner) {
- nfs4_get_stateowner(open->op_stateowner);
- cstate->replay_owner = open->op_stateowner;
- }
- nfs4_unlock_state();
+ nfsd4_cleanup_open_state(open, status);
+ if (open->op_openowner)
+ cstate->replay_owner = &open->op_openowner->oo_owner;
+ else
+ nfs4_unlock_state();
return status;
}
@@ -460,17 +499,12 @@ static __be32
nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_commit *commit)
{
- __be32 status;
-
u32 *p = (u32 *)commit->co_verf.data;
*p++ = nfssvc_boot.tv_sec;
*p++ = nfssvc_boot.tv_usec;
- status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+ return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
commit->co_count);
- if (status == nfserr_symlink)
- status = nfserr_inval;
- return status;
}
static __be32
@@ -485,8 +519,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
NFSD_MAY_CREATE);
- if (status == nfserr_symlink)
- status = nfserr_notdir;
if (status)
return status;
@@ -497,15 +529,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (create->cr_type) {
case NF4LNK:
- /* ugh! we have to null-terminate the linktext, or
- * vfs_symlink() will choke. it is always safe to
- * null-terminate by brute force, since at worst we
- * will overwrite the first byte of the create namelen
- * in the XDR buffer, which has already been extracted
- * during XDR decode.
- */
- create->cr_linkname[create->cr_linklen] = 0;
-
status = nfsd_symlink(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
create->cr_linkname, create->cr_linklen,
@@ -712,8 +735,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
- if (status == nfserr_symlink)
- return nfserr_notdir;
if (!status) {
fh_unlock(&cstate->current_fh);
set_change_info(&remove->rm_cinfo, &cstate->current_fh);
@@ -744,8 +765,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
(S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) &&
S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode)))
status = nfserr_exist;
- else if (status == nfserr_symlink)
- status = nfserr_notdir;
if (!status) {
set_change_info(&rename->rn_sinfo, &cstate->current_fh);
@@ -863,14 +882,14 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
- if (filp)
- get_file(filp);
- nfs4_unlock_state();
-
if (status) {
+ nfs4_unlock_state();
dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
return status;
}
+ if (filp)
+ get_file(filp);
+ nfs4_unlock_state();
cnt = write->wr_buflen;
write->wr_how_written = write->wr_stable_how;
@@ -886,8 +905,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
write->wr_bytes_written = cnt;
- if (status == nfserr_symlink)
- status = nfserr_inval;
return status;
}
@@ -988,6 +1005,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
void *);
+typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op);
+
enum nfsd4_op_flags {
ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
@@ -995,12 +1014,31 @@ enum nfsd4_op_flags {
/* For rfc 5661 section 2.6.3.1.1: */
OP_HANDLES_WRONGSEC = 1 << 3,
OP_IS_PUTFH_LIKE = 1 << 4,
+ /*
+ * These are the ops whose result size we estimate before
+ * encoding, to avoid performing an op then not being able to
+ * respond or cache a response. This includes writes and setattrs
+ * as well as the operations usually called "nonidempotent":
+ */
+ OP_MODIFIES_SOMETHING = 1 << 5,
+ /*
+ * Cache compounds containing these ops in the xid-based drc:
+ * We use the DRC for compounds containing non-idempotent
+ * operations, *except* those that are 4.1-specific (since
+ * sessions provide their own EOS), and except for stateful
+ * operations other than setclientid and setclientid_confirm
+ * (since sequence numbers provide EOS for open, lock, etc in
+ * the v4.0 case).
+ */
+ OP_CACHEME = 1 << 6,
};
struct nfsd4_operation {
nfsd4op_func op_func;
u32 op_flags;
char *op_name;
+ /* Try to get response size before operation */
+ nfsd4op_rsize op_rsize_bop;
};
static struct nfsd4_operation nfsd4_ops[];
@@ -1045,6 +1083,11 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
return &nfsd4_ops[op->opnum];
}
+bool nfsd4_cache_this_op(struct nfsd4_op *op)
+{
+ return OPDESC(op)->op_flags & OP_CACHEME;
+}
+
static bool need_wrongsec_check(struct svc_rqst *rqstp)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
@@ -1068,7 +1111,8 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
*/
if (argp->opcnt == resp->opcnt)
return false;
-
+ if (next->opnum == OP_ILLEGAL)
+ return false;
nextd = OPDESC(next);
/*
* Rest of 2.6.3.1.1: certain operations will return WRONGSEC
@@ -1090,6 +1134,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
struct nfsd4_operation *opdesc;
struct nfsd4_compound_state *cstate = &resp->cstate;
int slack_bytes;
+ u32 plen = 0;
__be32 status;
resp->xbuf = &rqstp->rq_res;
@@ -1168,6 +1213,21 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
goto encode_op;
}
+ /* If op is non-idempotent */
+ if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
+ plen = opdesc->op_rsize_bop(rqstp, op);
+ /*
+ * If there's still another operation, make sure
+ * we'll have space to at least encode an error:
+ */
+ if (resp->opcnt < args->opcnt)
+ plen += COMPOUND_ERR_SLACK_SPACE;
+ op->status = nfsd4_check_resp_size(resp, plen);
+ }
+
+ if (op->status)
+ goto encode_op;
+
if (opdesc->op_func)
op->status = opdesc->op_func(rqstp, cstate, &op->u);
else
@@ -1197,7 +1257,7 @@ encode_op:
be32_to_cpu(status));
if (cstate->replay_owner) {
- nfs4_put_stateowner(cstate->replay_owner);
+ nfs4_unlock_state();
cstate->replay_owner = NULL;
}
/* XXX Ugh, we need to get rid of this kind of special case: */
@@ -1212,13 +1272,151 @@ encode_op:
fh_put(&resp->cstate.save_fh);
BUG_ON(resp->cstate.replay_owner);
out:
- nfsd4_release_compoundargs(args);
/* Reset deferral mechanism for RPC deferrals */
rqstp->rq_usedeferral = 1;
dprintk("nfsv4 compound returned %d\n", ntohl(status));
return status;
}
+#define op_encode_hdr_size (2)
+#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define op_encode_change_info_maxsz (5)
+#define nfs4_fattr_bitmap_maxsz (4)
+
+#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz)
+
+#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+
+#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz)
+#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \
+ op_encode_ace_maxsz)
+
+#define op_encode_channel_attrs_maxsz (6 + 1 + 1)
+
+static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
+}
+
+static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz
+ + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ * sizeof(__be32);
+}
+
+static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
+ * sizeof(__be32);
+}
+
+static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_stateid_maxsz
+ + op_encode_change_info_maxsz + 1
+ + nfs4_fattr_bitmap_maxsz
+ + op_encode_delegation_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 maxcount = 0, rlen = 0;
+
+ maxcount = svc_max_payload(rqstp);
+ rlen = op->u.read.rd_length;
+
+ if (rlen > maxcount)
+ rlen = maxcount;
+
+ return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen;
+}
+
+static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 rlen = op->u.readdir.rd_maxcount;
+
+ if (rlen > PAGE_SIZE)
+ rlen = PAGE_SIZE;
+
+ return (op_encode_hdr_size + op_encode_verifier_maxsz)
+ * sizeof(__be32) + rlen;
+}
+
+static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ * sizeof(__be32);
+}
+
+static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz
+ + op_encode_change_info_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
+ sizeof(__be32);
+}
+
+static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
+ 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\
+ 2 + /*eir_server_owner.so_minor_id */\
+ /* eir_server_owner.so_major_id<> */\
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
+ /* eir_server_scope<> */\
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
+ 1 + /* eir_server_impl_id array length */\
+ 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\
+ 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\
+ 2 + /* csr_sequence, csr_flags */\
+ op_encode_channel_attrs_maxsz + \
+ op_encode_channel_attrs_maxsz) * sizeof(__be32);
+}
+
static struct nfsd4_operation nfsd4_ops[] = {
[OP_ACCESS] = {
.op_func = (nfsd4op_func)nfsd4_access,
@@ -1226,19 +1424,27 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_CLOSE] = {
.op_func = (nfsd4op_func)nfsd4_close,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_CLOSE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_COMMIT] = {
.op_func = (nfsd4op_func)nfsd4_commit,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_COMMIT",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize,
},
[OP_CREATE] = {
.op_func = (nfsd4op_func)nfsd4_create,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_CREATE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize,
},
[OP_DELEGRETURN] = {
.op_func = (nfsd4op_func)nfsd4_delegreturn,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_DELEGRETURN",
+ .op_rsize_bop = nfsd4_only_status_rsize,
},
[OP_GETATTR] = {
.op_func = (nfsd4op_func)nfsd4_getattr,
@@ -1251,11 +1457,16 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LINK] = {
.op_func = (nfsd4op_func)nfsd4_link,
+ .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING
+ | OP_CACHEME,
.op_name = "OP_LINK",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize,
},
[OP_LOCK] = {
.op_func = (nfsd4op_func)nfsd4_lock,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_LOCK",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
},
[OP_LOCKT] = {
.op_func = (nfsd4op_func)nfsd4_lockt,
@@ -1263,7 +1474,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOCKU] = {
.op_func = (nfsd4op_func)nfsd4_locku,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_LOCKU",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_LOOKUP] = {
.op_func = (nfsd4op_func)nfsd4_lookup,
@@ -1281,42 +1494,54 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_OPEN] = {
.op_func = (nfsd4op_func)nfsd4_open,
- .op_flags = OP_HANDLES_WRONGSEC,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize,
},
[OP_OPEN_CONFIRM] = {
.op_func = (nfsd4op_func)nfsd4_open_confirm,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN_CONFIRM",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_OPEN_DOWNGRADE] = {
.op_func = (nfsd4op_func)nfsd4_open_downgrade,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN_DOWNGRADE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_PUTFH] = {
.op_func = (nfsd4op_func)nfsd4_putfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_PUTFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTPUBFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_PUTPUBFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTROOTFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_PUTROOTFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_READ] = {
.op_func = (nfsd4op_func)nfsd4_read,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READ",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize,
},
[OP_READDIR] = {
.op_func = (nfsd4op_func)nfsd4_readdir,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READDIR",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize,
},
[OP_READLINK] = {
.op_func = (nfsd4op_func)nfsd4_readlink,
@@ -1324,27 +1549,36 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_REMOVE] = {
.op_func = (nfsd4op_func)nfsd4_remove,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_REMOVE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize,
},
[OP_RENAME] = {
- .op_name = "OP_RENAME",
.op_func = (nfsd4op_func)nfsd4_rename,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_RENAME",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize,
},
[OP_RENEW] = {
.op_func = (nfsd4op_func)nfsd4_renew,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_RENEW",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+
},
[OP_RESTOREFH] = {
.op_func = (nfsd4op_func)nfsd4_restorefh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_RESTOREFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SAVEFH] = {
.op_func = (nfsd4op_func)nfsd4_savefh,
- .op_flags = OP_HANDLES_WRONGSEC,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
.op_name = "OP_SAVEFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SECINFO] = {
.op_func = (nfsd4op_func)nfsd4_secinfo,
@@ -1354,16 +1588,22 @@ static struct nfsd4_operation nfsd4_ops[] = {
[OP_SETATTR] = {
.op_func = (nfsd4op_func)nfsd4_setattr,
.op_name = "OP_SETATTR",
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize,
},
[OP_SETCLIENTID] = {
.op_func = (nfsd4op_func)nfsd4_setclientid,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_SETCLIENTID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize,
},
[OP_SETCLIENTID_CONFIRM] = {
.op_func = (nfsd4op_func)nfsd4_setclientid_confirm,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_SETCLIENTID_CONFIRM",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_VERIFY] = {
.op_func = (nfsd4op_func)nfsd4_verify,
@@ -1371,50 +1611,81 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_WRITE] = {
.op_func = (nfsd4op_func)nfsd4_write,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_WRITE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
},
[OP_RELEASE_LOCKOWNER] = {
.op_func = (nfsd4op_func)nfsd4_release_lockowner,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_RELEASE_LOCKOWNER",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
/* NFSv4.1 operations */
[OP_EXCHANGE_ID] = {
.op_func = (nfsd4op_func)nfsd4_exchange_id,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_EXCHANGE_ID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
},
[OP_BIND_CONN_TO_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_BIND_CONN_TO_SESSION",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize,
},
[OP_CREATE_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_create_session,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_CREATE_SESSION",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize,
},
[OP_DESTROY_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_destroy_session,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_DESTROY_SESSION",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SEQUENCE] = {
.op_func = (nfsd4op_func)nfsd4_sequence,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
.op_name = "OP_SEQUENCE",
},
+ [OP_DESTROY_CLIENTID] = {
+ .op_func = (nfsd4op_func)nfsd4_destroy_clientid,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_DESTROY_CLIENTID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+ },
[OP_RECLAIM_COMPLETE] = {
.op_func = (nfsd4op_func)nfsd4_reclaim_complete,
- .op_flags = ALLOWED_WITHOUT_FH,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
.op_name = "OP_RECLAIM_COMPLETE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SECINFO_NO_NAME] = {
.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
.op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_SECINFO_NO_NAME",
},
+ [OP_TEST_STATEID] = {
+ .op_func = (nfsd4op_func)nfsd4_test_stateid,
+ .op_flags = ALLOWED_WITHOUT_FH,
+ .op_name = "OP_TEST_STATEID",
+ },
+ [OP_FREE_STATEID] = {
+ .op_func = (nfsd4op_func)nfsd4_free_stateid,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_FREE_STATEID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+ },
};
static const char *nfsd4_op_name(unsigned opnum)
@@ -1427,16 +1698,6 @@ static const char *nfsd4_op_name(unsigned opnum)
#define nfsd4_voidres nfsd4_voidargs
struct nfsd4_voidargs { int dummy; };
-/*
- * TODO: At the present time, the NFSv4 server does not do XID caching
- * of requests. Implementing XID caching would not be a serious problem,
- * although it would require a mild change in interfaces since one
- * doesn't know whether an NFSv4 request is idempotent until after the
- * XDR decode. However, XID caching totally confuses pynfs (Peter
- * Astrand's regression testsuite for NFSv4 servers), which reuses
- * XID's liberally, so I've left it unimplemented until pynfs generates
- * better XID's.
- */
static struct svc_procedure nfsd_procedures4[2] = {
[NFSPROC4_NULL] = {
.pc_func = (svc_procfunc) nfsd4_proc_null,
@@ -1452,6 +1713,7 @@ static struct svc_procedure nfsd_procedures4[2] = {
.pc_encode = (kxdrproc_t) nfs4svc_encode_compoundres,
.pc_argsize = sizeof(struct nfsd4_compoundargs),
.pc_ressize = sizeof(struct nfsd4_compoundres),
+ .pc_release = nfsd4_release_compoundargs,
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = NFSD_BUFSIZE/4,
},
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index be26814..ed083b9 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -45,6 +45,7 @@
/* Globals */
static struct file *rec_file;
+static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
static int
nfs4_save_creds(const struct cred **original_creds)
@@ -129,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
if (!rec_file || clp->cl_firststate)
return 0;
+ clp->cl_firststate = 1;
status = nfs4_save_creds(&original_cred);
if (status < 0)
return status;
@@ -143,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
goto out_unlock;
}
status = -EEXIST;
- if (dentry->d_inode) {
- dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
+ if (dentry->d_inode)
goto out_put;
- }
status = mnt_want_write(rec_file->f_path.mnt);
if (status)
goto out_put;
@@ -156,12 +156,14 @@ out_put:
dput(dentry);
out_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
- if (status == 0) {
- clp->cl_firststate = 1;
+ if (status == 0)
vfs_fsync(rec_file, 0);
- }
+ else
+ printk(KERN_ERR "NFSD: failed to write recovery record"
+ " (err %d); please check that %s exists"
+ " and is writeable", status,
+ user_recovery_dirname);
nfs4_reset_creds(original_cred);
- dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
return status;
}
@@ -191,52 +193,42 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
}
static int
-nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
+nfsd4_list_rec_dir(recdir_func *f)
{
const struct cred *original_cred;
- struct file *filp;
+ struct dentry *dir = rec_file->f_path.dentry;
LIST_HEAD(names);
- struct name_list *entry;
- struct dentry *dentry;
int status;
- if (!rec_file)
- return 0;
-
status = nfs4_save_creds(&original_cred);
if (status < 0)
return status;
- filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
- current_cred());
- status = PTR_ERR(filp);
- if (IS_ERR(filp))
- goto out;
- status = vfs_readdir(filp, nfsd4_build_namelist, &names);
- fput(filp);
+ status = vfs_llseek(rec_file, 0, SEEK_SET);
+ if (status < 0) {
+ nfs4_reset_creds(original_cred);
+ return status;
+ }
+
+ status = vfs_readdir(rec_file, nfsd4_build_namelist, &names);
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
while (!list_empty(&names)) {
+ struct name_list *entry;
entry = list_entry(names.next, struct name_list, list);
-
- dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
- if (IS_ERR(dentry)) {
- status = PTR_ERR(dentry);
- break;
+ if (!status) {
+ struct dentry *dentry;
+ dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
+ if (IS_ERR(dentry)) {
+ status = PTR_ERR(dentry);
+ break;
+ }
+ status = f(dir, dentry);
+ dput(dentry);
}
- status = f(dir, dentry);
- dput(dentry);
- if (status)
- break;
list_del(&entry->list);
kfree(entry);
}
mutex_unlock(&dir->d_inode->i_mutex);
-out:
- while (!list_empty(&names)) {
- entry = list_entry(names.next, struct name_list, list);
- list_del(&entry->list);
- kfree(entry);
- }
nfs4_reset_creds(original_cred);
return status;
}
@@ -322,7 +314,7 @@ nfsd4_recdir_purge_old(void) {
status = mnt_want_write(rec_file->f_path.mnt);
if (status)
goto out;
- status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
+ status = nfsd4_list_rec_dir(purge_old);
if (status == 0)
vfs_fsync(rec_file, 0);
mnt_drop_write(rec_file->f_path.mnt);
@@ -352,7 +344,7 @@ nfsd4_recdir_load(void) {
if (!rec_file)
return 0;
- status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
+ status = nfsd4_list_rec_dir(load_recdir);
if (status)
printk("nfsd4: failed loading clients from recovery"
" directory %s\n", rec_file->f_path.dentry->d_name.name);
@@ -364,13 +356,13 @@ nfsd4_recdir_load(void) {
*/
void
-nfsd4_init_recdir(char *rec_dirname)
+nfsd4_init_recdir()
{
const struct cred *original_cred;
int status;
printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
- rec_dirname);
+ user_recovery_dirname);
BUG_ON(rec_file);
@@ -382,10 +374,10 @@ nfsd4_init_recdir(char *rec_dirname)
return;
}
- rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+ rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
if (IS_ERR(rec_file)) {
printk("NFSD: unable to find recovery directory %s\n",
- rec_dirname);
+ user_recovery_dirname);
rec_file = NULL;
}
@@ -400,3 +392,30 @@ nfsd4_shutdown_recdir(void)
fput(rec_file);
rec_file = NULL;
}
+
+/*
+ * Change the NFSv4 recovery directory to recdir.
+ */
+int
+nfs4_reset_recoverydir(char *recdir)
+{
+ int status;
+ struct path path;
+
+ status = kern_path(recdir, LOOKUP_FOLLOW, &path);
+ if (status)
+ return status;
+ status = -ENOTDIR;
+ if (S_ISDIR(path.dentry->d_inode->i_mode)) {
+ strcpy(user_recovery_dirname, recdir);
+ status = 0;
+ }
+ path_put(&path);
+ return status;
+}
+
+char *
+nfs4_recoverydir(void)
+{
+ return user_recovery_dirname;
+}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4ec38df..d455ea0 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -37,6 +37,7 @@
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/swap.h>
+#include <linux/pagemap.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/clnt.h>
#include "xdr4.h"
@@ -48,9 +49,6 @@
time_t nfsd4_lease = 90; /* default lease time */
time_t nfsd4_grace = 90;
static time_t boot_time;
-static u32 current_ownerid = 1;
-static u32 current_fileid = 1;
-static u32 current_delegid = 1;
static stateid_t zerostateid; /* bits all 0 */
static stateid_t onestateid; /* bits all 1 */
static u64 current_sessionid = 1;
@@ -59,10 +57,7 @@ static u64 current_sessionid = 1;
#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
/* forward declarations */
-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
-static void nfs4_set_recdir(char *recdir);
+static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
/* Locking: */
@@ -76,7 +71,8 @@ static DEFINE_MUTEX(client_mutex);
*/
static DEFINE_SPINLOCK(recall_lock);
-static struct kmem_cache *stateowner_slab = NULL;
+static struct kmem_cache *openowner_slab = NULL;
+static struct kmem_cache *lockowner_slab = NULL;
static struct kmem_cache *file_slab = NULL;
static struct kmem_cache *stateid_slab = NULL;
static struct kmem_cache *deleg_slab = NULL;
@@ -108,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes)
static struct list_head del_recall_lru;
+static void nfsd4_free_file(struct nfs4_file *f)
+{
+ kmem_cache_free(file_slab, f);
+}
+
static inline void
put_nfs4_file(struct nfs4_file *fi)
{
@@ -115,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi)
list_del(&fi->fi_hash);
spin_unlock(&recall_lock);
iput(fi->fi_inode);
- kmem_cache_free(file_slab, fi);
+ nfsd4_free_file(fi);
}
}
@@ -132,35 +133,33 @@ unsigned int max_delegations;
* Open owner state (share locks)
*/
-/* hash tables for nfs4_stateowner */
-#define OWNER_HASH_BITS 8
-#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
-#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
+/* hash tables for open owners */
+#define OPEN_OWNER_HASH_BITS 8
+#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS)
+#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1)
-#define ownerid_hashval(id) \
- ((id) & OWNER_HASH_MASK)
-#define ownerstr_hashval(clientid, ownername) \
- (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK)
+static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
+{
+ unsigned int ret;
-static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE];
-static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
+ ret = opaque_hashval(ownername->data, ownername->len);
+ ret += clientid;
+ return ret & OPEN_OWNER_HASH_MASK;
+}
+
+static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE];
/* hash table for nfs4_file */
#define FILE_HASH_BITS 8
#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
-/* hash table for (open)nfs4_stateid */
-#define STATEID_HASH_BITS 10
-#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
-#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1)
-
-#define file_hashval(x) \
- hash_ptr(x, FILE_HASH_BITS)
-#define stateid_hashval(owner_id, file_id) \
- (((owner_id) + (file_id)) & STATEID_HASH_MASK)
+static unsigned int file_hashval(struct inode *ino)
+{
+ /* XXX: why are we hashing on inode pointer, anyway? */
+ return hash_ptr(ino, FILE_HASH_BITS);
+}
static struct list_head file_hashtbl[FILE_HASH_SIZE];
-static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
{
@@ -203,8 +202,73 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
__nfs4_file_put_access(fp, oflag);
}
+static inline int get_new_stid(struct nfs4_stid *stid)
+{
+ static int min_stateid = 0;
+ struct idr *stateids = &stid->sc_client->cl_stateids;
+ int new_stid;
+ int error;
+
+ error = idr_get_new_above(stateids, stid, min_stateid, &new_stid);
+ /*
+ * Note: the necessary preallocation was done in
+ * nfs4_alloc_stateid(). The idr code caps the number of
+ * preallocations that can exist at a time, but the state lock
+ * prevents anyone from using ours before we get here:
+ */
+ BUG_ON(error);
+ /*
+ * It shouldn't be a problem to reuse an opaque stateid value.
+ * I don't think it is for 4.1. But with 4.0 I worry that, for
+ * example, a stray write retransmission could be accepted by
+ * the server when it should have been rejected. Therefore,
+ * adopt a trick from the sctp code to attempt to maximize the
+ * amount of time until an id is reused, by ensuring they always
+ * "increase" (mod INT_MAX):
+ */
+
+ min_stateid = new_stid+1;
+ if (min_stateid == INT_MAX)
+ min_stateid = 0;
+ return new_stid;
+}
+
+static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type)
+{
+ stateid_t *s = &stid->sc_stateid;
+ int new_id;
+
+ stid->sc_type = type;
+ stid->sc_client = cl;
+ s->si_opaque.so_clid = cl->cl_clientid;
+ new_id = get_new_stid(stid);
+ s->si_opaque.so_id = (u32)new_id;
+ /* Will be incremented before return to client: */
+ s->si_generation = 0;
+}
+
+static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab)
+{
+ struct idr *stateids = &cl->cl_stateids;
+
+ if (!idr_pre_get(stateids, GFP_KERNEL))
+ return NULL;
+ /*
+ * Note: if we fail here (or any time between now and the time
+ * we actually get the new idr), we won't need to undo the idr
+ * preallocation, since the idr code caps the number of
+ * preallocated entries.
+ */
+ return kmem_cache_alloc(slab, GFP_KERNEL);
+}
+
+static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
+{
+ return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
+}
+
static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type)
{
struct nfs4_delegation *dp;
struct nfs4_file *fp = stp->st_file;
@@ -221,21 +285,23 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
return NULL;
if (num_delegations > max_delegations)
return NULL;
- dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
+ dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
if (dp == NULL)
return dp;
+ init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID);
+ /*
+ * delegation seqid's are never incremented. The 4.1 special
+ * meaning of seqid 0 isn't meaningful, really, but let's avoid
+ * 0 anyway just for consistency and use 1:
+ */
+ dp->dl_stid.sc_stateid.si_generation = 1;
num_delegations++;
INIT_LIST_HEAD(&dp->dl_perfile);
INIT_LIST_HEAD(&dp->dl_perclnt);
INIT_LIST_HEAD(&dp->dl_recall_lru);
- dp->dl_client = clp;
get_nfs4_file(fp);
dp->dl_file = fp;
dp->dl_type = type;
- dp->dl_stateid.si_boot = boot_time;
- dp->dl_stateid.si_stateownerid = current_delegid++;
- dp->dl_stateid.si_fileid = 0;
- dp->dl_stateid.si_generation = 0;
fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
dp->dl_time = 0;
atomic_set(&dp->dl_count, 1);
@@ -264,12 +330,29 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
}
}
+static void unhash_stid(struct nfs4_stid *s)
+{
+ struct idr *stateids = &s->sc_client->cl_stateids;
+
+ idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+}
+
+static void
+hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
+{
+ lockdep_assert_held(&recall_lock);
+
+ list_add(&dp->dl_perfile, &fp->fi_delegations);
+ list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+}
+
/* Called under the state lock. */
static void
unhash_delegation(struct nfs4_delegation *dp)
{
- list_del_init(&dp->dl_perclnt);
+ unhash_stid(&dp->dl_stid);
spin_lock(&recall_lock);
+ list_del_init(&dp->dl_perclnt);
list_del_init(&dp->dl_perfile);
list_del_init(&dp->dl_recall_lru);
spin_unlock(&recall_lock);
@@ -289,10 +372,16 @@ static DEFINE_SPINLOCK(client_lock);
#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
-#define clientid_hashval(id) \
- ((id) & CLIENT_HASH_MASK)
-#define clientstr_hashval(name) \
- (opaque_hashval((name), 8) & CLIENT_HASH_MASK)
+static unsigned int clientid_hashval(u32 id)
+{
+ return id & CLIENT_HASH_MASK;
+}
+
+static unsigned int clientstr_hashval(const char *name)
+{
+ return opaque_hashval(name, 8) & CLIENT_HASH_MASK;
+}
+
/*
* reclaim_str_hashtbl[] holds known client info from previous reset/reboot
* used in reboot/reset lease grace period processing
@@ -359,7 +448,7 @@ set_deny(unsigned int *deny, unsigned long bmap) {
}
static int
-test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
+test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
unsigned int access, deny;
set_access(&access, stp->st_access_bmap);
@@ -382,14 +471,13 @@ static int nfs4_access_to_omode(u32 access)
BUG();
}
-static void unhash_generic_stateid(struct nfs4_stateid *stp)
+static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
{
- list_del(&stp->st_hash);
list_del(&stp->st_perfile);
list_del(&stp->st_perstateowner);
}
-static void free_generic_stateid(struct nfs4_stateid *stp)
+static void close_generic_stateid(struct nfs4_ol_stateid *stp)
{
int i;
@@ -398,84 +486,106 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
if (test_bit(i, &stp->st_access_bmap))
nfs4_file_put_access(stp->st_file,
nfs4_access_to_omode(i));
+ __clear_bit(i, &stp->st_access_bmap);
}
}
put_nfs4_file(stp->st_file);
+ stp->st_file = NULL;
+}
+
+static void free_generic_stateid(struct nfs4_ol_stateid *stp)
+{
kmem_cache_free(stateid_slab, stp);
}
-static void release_lock_stateid(struct nfs4_stateid *stp)
+static void release_lock_stateid(struct nfs4_ol_stateid *stp)
{
struct file *file;
unhash_generic_stateid(stp);
+ unhash_stid(&stp->st_stid);
file = find_any_file(stp->st_file);
if (file)
- locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
+ locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner));
+ close_generic_stateid(stp);
free_generic_stateid(stp);
}
-static void unhash_lockowner(struct nfs4_stateowner *sop)
+static void unhash_lockowner(struct nfs4_lockowner *lo)
{
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
- list_del(&sop->so_idhash);
- list_del(&sop->so_strhash);
- list_del(&sop->so_perstateid);
- while (!list_empty(&sop->so_stateids)) {
- stp = list_first_entry(&sop->so_stateids,
- struct nfs4_stateid, st_perstateowner);
+ list_del(&lo->lo_owner.so_strhash);
+ list_del(&lo->lo_perstateid);
+ while (!list_empty(&lo->lo_owner.so_stateids)) {
+ stp = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
release_lock_stateid(stp);
}
}
-static void release_lockowner(struct nfs4_stateowner *sop)
+static void release_lockowner(struct nfs4_lockowner *lo)
{
- unhash_lockowner(sop);
- nfs4_put_stateowner(sop);
+ unhash_lockowner(lo);
+ nfs4_free_lockowner(lo);
}
static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
+release_stateid_lockowners(struct nfs4_ol_stateid *open_stp)
{
- struct nfs4_stateowner *lock_sop;
+ struct nfs4_lockowner *lo;
while (!list_empty(&open_stp->st_lockowners)) {
- lock_sop = list_entry(open_stp->st_lockowners.next,
- struct nfs4_stateowner, so_perstateid);
- /* list_del(&open_stp->st_lockowners); */
- BUG_ON(lock_sop->so_is_open_owner);
- release_lockowner(lock_sop);
+ lo = list_entry(open_stp->st_lockowners.next,
+ struct nfs4_lockowner, lo_perstateid);
+ release_lockowner(lo);
}
}
-static void release_open_stateid(struct nfs4_stateid *stp)
+static void unhash_open_stateid(struct nfs4_ol_stateid *stp)
{
unhash_generic_stateid(stp);
release_stateid_lockowners(stp);
+ close_generic_stateid(stp);
+}
+
+static void release_open_stateid(struct nfs4_ol_stateid *stp)
+{
+ unhash_open_stateid(stp);
+ unhash_stid(&stp->st_stid);
free_generic_stateid(stp);
}
-static void unhash_openowner(struct nfs4_stateowner *sop)
+static void unhash_openowner(struct nfs4_openowner *oo)
{
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
- list_del(&sop->so_idhash);
- list_del(&sop->so_strhash);
- list_del(&sop->so_perclient);
- list_del(&sop->so_perstateid); /* XXX: necessary? */
- while (!list_empty(&sop->so_stateids)) {
- stp = list_first_entry(&sop->so_stateids,
- struct nfs4_stateid, st_perstateowner);
+ list_del(&oo->oo_owner.so_strhash);
+ list_del(&oo->oo_perclient);
+ while (!list_empty(&oo->oo_owner.so_stateids)) {
+ stp = list_first_entry(&oo->oo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
release_open_stateid(stp);
}
}
-static void release_openowner(struct nfs4_stateowner *sop)
+static void release_last_closed_stateid(struct nfs4_openowner *oo)
{
- unhash_openowner(sop);
- list_del(&sop->so_close_lru);
- nfs4_put_stateowner(sop);
+ struct nfs4_ol_stateid *s = oo->oo_last_closed_stid;
+
+ if (s) {
+ unhash_stid(&s->st_stid);
+ free_generic_stateid(s);
+ oo->oo_last_closed_stid = NULL;
+ }
+}
+
+static void release_openowner(struct nfs4_openowner *oo)
+{
+ unhash_openowner(oo);
+ list_del(&oo->oo_close_lru);
+ release_last_closed_stateid(oo);
+ nfs4_free_openowner(oo);
}
#define SESSION_HASH_SIZE 512
@@ -840,9 +950,6 @@ renew_client_locked(struct nfs4_client *clp)
return;
}
- /*
- * Move client to the end to the LRU list.
- */
dprintk("renewing client (clientid %08x/%08x)\n",
clp->cl_clientid.cl_boot,
clp->cl_clientid.cl_id);
@@ -888,6 +995,18 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
}
memcpy(clp->cl_name.data, name.data, name.len);
clp->cl_name.len = name.len;
+ INIT_LIST_HEAD(&clp->cl_sessions);
+ idr_init(&clp->cl_stateids);
+ atomic_set(&clp->cl_refcount, 0);
+ clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+ INIT_LIST_HEAD(&clp->cl_idhash);
+ INIT_LIST_HEAD(&clp->cl_strhash);
+ INIT_LIST_HEAD(&clp->cl_openowners);
+ INIT_LIST_HEAD(&clp->cl_delegations);
+ INIT_LIST_HEAD(&clp->cl_lru);
+ INIT_LIST_HEAD(&clp->cl_callbacks);
+ spin_lock_init(&clp->cl_lock);
+ rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
return clp;
}
@@ -901,10 +1020,13 @@ free_client(struct nfs4_client *clp)
list_del(&ses->se_perclnt);
nfsd4_put_session(ses);
}
+ rpc_destroy_wait_queue(&clp->cl_cb_waitq);
if (clp->cl_cred.cr_group_info)
put_group_info(clp->cl_cred.cr_group_info);
kfree(clp->cl_principal);
kfree(clp->cl_name.data);
+ idr_remove_all(&clp->cl_stateids);
+ idr_destroy(&clp->cl_stateids);
kfree(clp);
}
@@ -940,7 +1062,7 @@ unhash_client_locked(struct nfs4_client *clp)
static void
expire_client(struct nfs4_client *clp)
{
- struct nfs4_stateowner *sop;
+ struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct list_head reaplist;
@@ -958,8 +1080,8 @@ expire_client(struct nfs4_client *clp)
unhash_delegation(dp);
}
while (!list_empty(&clp->cl_openowners)) {
- sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
- release_openowner(sop);
+ oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
+ release_openowner(oo);
}
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
@@ -1035,6 +1157,23 @@ static void gen_confirm(struct nfs4_client *clp)
*p++ = i++;
}
+static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
+{
+ return idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+}
+
+static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+{
+ struct nfs4_stid *s;
+
+ s = find_stateid(cl, t);
+ if (!s)
+ return NULL;
+ if (typemask & s->sc_type)
+ return s;
+ return NULL;
+}
+
static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
struct svc_rqst *rqstp, nfs4_verifier *verf)
{
@@ -1046,7 +1185,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
if (clp == NULL)
return NULL;
- INIT_LIST_HEAD(&clp->cl_sessions);
princ = svc_gss_principal(rqstp);
if (princ) {
@@ -1058,19 +1196,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
}
memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
- atomic_set(&clp->cl_refcount, 0);
- clp->cl_cb_state = NFSD4_CB_UNKNOWN;
- INIT_LIST_HEAD(&clp->cl_idhash);
- INIT_LIST_HEAD(&clp->cl_strhash);
- INIT_LIST_HEAD(&clp->cl_openowners);
- INIT_LIST_HEAD(&clp->cl_delegations);
- INIT_LIST_HEAD(&clp->cl_lru);
- INIT_LIST_HEAD(&clp->cl_callbacks);
- spin_lock_init(&clp->cl_lock);
INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
clp->cl_time = get_seconds();
clear_bit(0, &clp->cl_cb_slot_busy);
- rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
copy_verf(clp, verf);
rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
clp->cl_flavor = rqstp->rq_flavor;
@@ -1080,17 +1208,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
return clp;
}
-static int check_name(struct xdr_netobj name)
-{
- if (name.len == 0)
- return 0;
- if (name.len > NFS4_OPAQUE_LIMIT) {
- dprintk("NFSD: check_name: name too long(%d)!\n", name.len);
- return 0;
- }
- return 1;
-}
-
static void
add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
{
@@ -1122,8 +1239,10 @@ find_confirmed_client(clientid_t *clid)
unsigned int idhashval = clientid_hashval(clid->cl_id);
list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
- if (same_clid(&clp->cl_clientid, clid))
+ if (same_clid(&clp->cl_clientid, clid)) {
+ renew_client(clp);
return clp;
+ }
}
return NULL;
}
@@ -1170,20 +1289,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
return NULL;
}
-static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
-{
- switch (family) {
- case AF_INET:
- ((struct sockaddr_in *)sa)->sin_family = AF_INET;
- ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
- return;
- case AF_INET6:
- ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
- ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
- return;
- }
-}
-
static void
gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
{
@@ -1215,7 +1320,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
conn->cb_prog = se->se_callback_prog;
conn->cb_ident = se->se_callback_ident;
- rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
+ memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen);
return;
out_err:
conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1347,7 +1452,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
__func__, rqstp, exid, exid->clname.len, exid->clname.data,
addr_str, exid->flags, exid->spa_how);
- if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
+ if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
return nfserr_inval;
/* Currently only support SP4_NONE */
@@ -1503,6 +1608,29 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
return slot->sl_status;
}
+#define NFSD_MIN_REQ_HDR_SEQ_SZ ((\
+ 2 * 2 + /* credential,verifier: AUTH_NULL, length 0 */ \
+ 1 + /* MIN tag is length with zero, only length */ \
+ 3 + /* version, opcount, opcode */ \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+ /* seqid, slotID, slotID, cache */ \
+ 4 ) * sizeof(__be32))
+
+#define NFSD_MIN_RESP_HDR_SEQ_SZ ((\
+ 2 + /* verifier: AUTH_NULL, length 0 */\
+ 1 + /* status */ \
+ 1 + /* MIN tag is length with zero, only length */ \
+ 3 + /* opcount, opcode, opstatus*/ \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+ /* seqid, slotID, slotID, slotID, status */ \
+ 5 ) * sizeof(__be32))
+
+static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
+{
+ return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ
+ || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ;
+}
+
__be32
nfsd4_create_session(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
@@ -1571,6 +1699,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
cr_ses->flags &= ~SESSION4_PERSIST;
cr_ses->flags &= ~SESSION4_RDMA;
+ status = nfserr_toosmall;
+ if (check_forechannel_attrs(cr_ses->fore_channel))
+ goto out;
+
status = nfserr_jukebox;
new = alloc_init_session(rqstp, conf, cr_ses);
if (!new)
@@ -1732,6 +1864,14 @@ static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_sess
return args->opcnt > session->se_fchannel.maxops;
}
+static bool nfsd4_request_too_big(struct svc_rqst *rqstp,
+ struct nfsd4_session *session)
+{
+ struct xdr_buf *xb = &rqstp->rq_arg;
+
+ return xb->len > session->se_fchannel.maxreq_sz;
+}
+
__be32
nfsd4_sequence(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
@@ -1764,6 +1904,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (nfsd4_session_too_many_ops(rqstp, session))
goto out;
+ status = nfserr_req_too_big;
+ if (nfsd4_request_too_big(rqstp, session))
+ goto out;
+
status = nfserr_badslot;
if (seq->slotid >= session->se_fchannel.maxreqs)
goto out;
@@ -1807,8 +1951,16 @@ out:
nfsd4_get_session(cstate->session);
atomic_inc(&clp->cl_refcount);
- if (clp->cl_cb_state == NFSD4_CB_DOWN)
- seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
+ switch (clp->cl_cb_state) {
+ case NFSD4_CB_DOWN:
+ seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+ break;
+ case NFSD4_CB_FAULT:
+ seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+ break;
+ default:
+ seq->status_flags = 0;
+ }
}
kfree(conn);
spin_unlock(&client_lock);
@@ -1816,6 +1968,50 @@ out:
return status;
}
+static inline bool has_resources(struct nfs4_client *clp)
+{
+ return !list_empty(&clp->cl_openowners)
+ || !list_empty(&clp->cl_delegations)
+ || !list_empty(&clp->cl_sessions);
+}
+
+__be32
+nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
+{
+ struct nfs4_client *conf, *unconf, *clp;
+ int status = 0;
+
+ nfs4_lock_state();
+ unconf = find_unconfirmed_client(&dc->clientid);
+ conf = find_confirmed_client(&dc->clientid);
+
+ if (conf) {
+ clp = conf;
+
+ if (!is_client_expired(conf) && has_resources(conf)) {
+ status = nfserr_clientid_busy;
+ goto out;
+ }
+
+ /* rfc5661 18.50.3 */
+ if (cstate->session && conf == cstate->session->se_client) {
+ status = nfserr_clientid_busy;
+ goto out;
+ }
+ } else if (unconf)
+ clp = unconf;
+ else {
+ status = nfserr_stale_clientid;
+ goto out;
+ }
+
+ expire_client(clp);
+out:
+ nfs4_unlock_state();
+ dprintk("%s return %d\n", __func__, ntohl(status));
+ return status;
+}
+
__be32
nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
{
@@ -1858,19 +2054,13 @@ __be32
nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid *setclid)
{
- struct xdr_netobj clname = {
- .len = setclid->se_namelen,
- .data = setclid->se_name,
- };
+ struct xdr_netobj clname = setclid->se_name;
nfs4_verifier clverifier = setclid->se_verf;
unsigned int strhashval;
struct nfs4_client *conf, *unconf, *new;
__be32 status;
char dname[HEXDIR_LEN];
- if (!check_name(clname))
- return nfserr_inval;
-
status = nfs4_make_rec_clidname(dname, &clname);
if (status)
return status;
@@ -2074,31 +2264,28 @@ out:
return status;
}
+static struct nfs4_file *nfsd4_alloc_file(void)
+{
+ return kmem_cache_alloc(file_slab, GFP_KERNEL);
+}
+
/* OPEN Share state helper functions */
-static inline struct nfs4_file *
-alloc_init_file(struct inode *ino)
+static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
{
- struct nfs4_file *fp;
unsigned int hashval = file_hashval(ino);
- fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
- if (fp) {
- atomic_set(&fp->fi_ref, 1);
- INIT_LIST_HEAD(&fp->fi_hash);
- INIT_LIST_HEAD(&fp->fi_stateids);
- INIT_LIST_HEAD(&fp->fi_delegations);
- fp->fi_inode = igrab(ino);
- fp->fi_id = current_fileid++;
- fp->fi_had_conflict = false;
- fp->fi_lease = NULL;
- memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
- memset(fp->fi_access, 0, sizeof(fp->fi_access));
- spin_lock(&recall_lock);
- list_add(&fp->fi_hash, &file_hashtbl[hashval]);
- spin_unlock(&recall_lock);
- return fp;
- }
- return NULL;
+ atomic_set(&fp->fi_ref, 1);
+ INIT_LIST_HEAD(&fp->fi_hash);
+ INIT_LIST_HEAD(&fp->fi_stateids);
+ INIT_LIST_HEAD(&fp->fi_delegations);
+ fp->fi_inode = igrab(ino);
+ fp->fi_had_conflict = false;
+ fp->fi_lease = NULL;
+ memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+ memset(fp->fi_access, 0, sizeof(fp->fi_access));
+ spin_lock(&recall_lock);
+ list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+ spin_unlock(&recall_lock);
}
static void
@@ -2113,7 +2300,8 @@ nfsd4_free_slab(struct kmem_cache **slab)
void
nfsd4_free_slabs(void)
{
- nfsd4_free_slab(&stateowner_slab);
+ nfsd4_free_slab(&openowner_slab);
+ nfsd4_free_slab(&lockowner_slab);
nfsd4_free_slab(&file_slab);
nfsd4_free_slab(&stateid_slab);
nfsd4_free_slab(&deleg_slab);
@@ -2122,16 +2310,20 @@ nfsd4_free_slabs(void)
static int
nfsd4_init_slabs(void)
{
- stateowner_slab = kmem_cache_create("nfsd4_stateowners",
- sizeof(struct nfs4_stateowner), 0, 0, NULL);
- if (stateowner_slab == NULL)
+ openowner_slab = kmem_cache_create("nfsd4_openowners",
+ sizeof(struct nfs4_openowner), 0, 0, NULL);
+ if (openowner_slab == NULL)
+ goto out_nomem;
+ lockowner_slab = kmem_cache_create("nfsd4_lockowners",
+ sizeof(struct nfs4_lockowner), 0, 0, NULL);
+ if (lockowner_slab == NULL)
goto out_nomem;
file_slab = kmem_cache_create("nfsd4_files",
sizeof(struct nfs4_file), 0, 0, NULL);
if (file_slab == NULL)
goto out_nomem;
stateid_slab = kmem_cache_create("nfsd4_stateids",
- sizeof(struct nfs4_stateid), 0, 0, NULL);
+ sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
if (stateid_slab == NULL)
goto out_nomem;
deleg_slab = kmem_cache_create("nfsd4_delegations",
@@ -2145,97 +2337,94 @@ out_nomem:
return -ENOMEM;
}
-void
-nfs4_free_stateowner(struct kref *kref)
+void nfs4_free_openowner(struct nfs4_openowner *oo)
{
- struct nfs4_stateowner *sop =
- container_of(kref, struct nfs4_stateowner, so_ref);
- kfree(sop->so_owner.data);
- kmem_cache_free(stateowner_slab, sop);
+ kfree(oo->oo_owner.so_owner.data);
+ kmem_cache_free(openowner_slab, oo);
}
-static inline struct nfs4_stateowner *
-alloc_stateowner(struct xdr_netobj *owner)
+void nfs4_free_lockowner(struct nfs4_lockowner *lo)
{
- struct nfs4_stateowner *sop;
+ kfree(lo->lo_owner.so_owner.data);
+ kmem_cache_free(lockowner_slab, lo);
+}
- if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) {
- if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) {
- memcpy(sop->so_owner.data, owner->data, owner->len);
- sop->so_owner.len = owner->len;
- kref_init(&sop->so_ref);
- return sop;
- }
- kmem_cache_free(stateowner_slab, sop);
- }
- return NULL;
+static void init_nfs4_replay(struct nfs4_replay *rp)
+{
+ rp->rp_status = nfserr_serverfault;
+ rp->rp_buflen = 0;
+ rp->rp_buf = rp->rp_ibuf;
}
-static struct nfs4_stateowner *
-alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
+static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
+{
struct nfs4_stateowner *sop;
- struct nfs4_replay *rp;
- unsigned int idhashval;
- if (!(sop = alloc_stateowner(&open->op_owner)))
+ sop = kmem_cache_alloc(slab, GFP_KERNEL);
+ if (!sop)
+ return NULL;
+
+ sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL);
+ if (!sop->so_owner.data) {
+ kmem_cache_free(slab, sop);
return NULL;
- idhashval = ownerid_hashval(current_ownerid);
- INIT_LIST_HEAD(&sop->so_idhash);
- INIT_LIST_HEAD(&sop->so_strhash);
- INIT_LIST_HEAD(&sop->so_perclient);
+ }
+ sop->so_owner.len = owner->len;
+
INIT_LIST_HEAD(&sop->so_stateids);
- INIT_LIST_HEAD(&sop->so_perstateid); /* not used */
- INIT_LIST_HEAD(&sop->so_close_lru);
- sop->so_time = 0;
- list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]);
- list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]);
- list_add(&sop->so_perclient, &clp->cl_openowners);
- sop->so_is_open_owner = 1;
- sop->so_id = current_ownerid++;
sop->so_client = clp;
- sop->so_seqid = open->op_seqid;
- sop->so_confirmed = 0;
- rp = &sop->so_replay;
- rp->rp_status = nfserr_serverfault;
- rp->rp_buflen = 0;
- rp->rp_buf = rp->rp_ibuf;
+ init_nfs4_replay(&sop->so_replay);
return sop;
}
-static inline void
-init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
- struct nfs4_stateowner *sop = open->op_stateowner;
- unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id);
+static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
+{
+ list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]);
+ list_add(&oo->oo_perclient, &clp->cl_openowners);
+}
- INIT_LIST_HEAD(&stp->st_hash);
- INIT_LIST_HEAD(&stp->st_perstateowner);
+static struct nfs4_openowner *
+alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
+ struct nfs4_openowner *oo;
+
+ oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
+ if (!oo)
+ return NULL;
+ oo->oo_owner.so_is_open_owner = 1;
+ oo->oo_owner.so_seqid = open->op_seqid;
+ oo->oo_flags = NFS4_OO_NEW;
+ oo->oo_time = 0;
+ oo->oo_last_closed_stid = NULL;
+ INIT_LIST_HEAD(&oo->oo_close_lru);
+ hash_openowner(oo, clp, strhashval);
+ return oo;
+}
+
+static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+ struct nfs4_openowner *oo = open->op_openowner;
+ struct nfs4_client *clp = oo->oo_owner.so_client;
+
+ init_stid(&stp->st_stid, clp, NFS4_OPEN_STID);
INIT_LIST_HEAD(&stp->st_lockowners);
- INIT_LIST_HEAD(&stp->st_perfile);
- list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
- list_add(&stp->st_perstateowner, &sop->so_stateids);
+ list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
list_add(&stp->st_perfile, &fp->fi_stateids);
- stp->st_stateowner = sop;
+ stp->st_stateowner = &oo->oo_owner;
get_nfs4_file(fp);
stp->st_file = fp;
- stp->st_stateid.si_boot = boot_time;
- stp->st_stateid.si_stateownerid = sop->so_id;
- stp->st_stateid.si_fileid = fp->fi_id;
- stp->st_stateid.si_generation = 0;
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
- __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
- &stp->st_access_bmap);
+ __set_bit(open->op_share_access, &stp->st_access_bmap);
__set_bit(open->op_share_deny, &stp->st_deny_bmap);
stp->st_openstp = NULL;
}
static void
-move_to_close_lru(struct nfs4_stateowner *sop)
+move_to_close_lru(struct nfs4_openowner *oo)
{
- dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
+ dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
- list_move_tail(&sop->so_close_lru, &close_lru);
- sop->so_time = get_seconds();
+ list_move_tail(&oo->oo_close_lru, &close_lru);
+ oo->oo_time = get_seconds();
}
static int
@@ -2247,14 +2436,18 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
(sop->so_client->cl_clientid.cl_id == clid->cl_id);
}
-static struct nfs4_stateowner *
+static struct nfs4_openowner *
find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
{
- struct nfs4_stateowner *so = NULL;
+ struct nfs4_stateowner *so;
+ struct nfs4_openowner *oo;
- list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
- if (same_owner_str(so, &open->op_owner, &open->op_clientid))
- return so;
+ list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) {
+ if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
+ oo = openowner(so);
+ renew_client(oo->oo_owner.so_client);
+ return oo;
+ }
}
return NULL;
}
@@ -2278,31 +2471,6 @@ find_file(struct inode *ino)
return NULL;
}
-static inline int access_valid(u32 x, u32 minorversion)
-{
- if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
- return 0;
- if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
- return 0;
- x &= ~NFS4_SHARE_ACCESS_MASK;
- if (minorversion && x) {
- if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
- return 0;
- if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
- return 0;
- x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
- }
- if (x)
- return 0;
- return 1;
-}
-
-static inline int deny_valid(u32 x)
-{
- /* Note: unlike access bits, deny bits may be zero. */
- return x <= NFS4_SHARE_DENY_BOTH;
-}
-
/*
* Called to check deny when READ with all zero stateid or
* WRITE with all zero or all one stateid
@@ -2312,7 +2480,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
{
struct inode *ino = current_fh->fh_dentry->d_inode;
struct nfs4_file *fp;
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
__be32 ret;
dprintk("NFSD: nfs4_share_conflict\n");
@@ -2383,10 +2551,20 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
}
static const struct lock_manager_operations nfsd_lease_mng_ops = {
- .fl_break = nfsd_break_deleg_cb,
- .fl_change = nfsd_change_deleg_cb,
+ .lm_break = nfsd_break_deleg_cb,
+ .lm_change = nfsd_change_deleg_cb,
};
+static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid)
+{
+ if (nfsd4_has_session(cstate))
+ return nfs_ok;
+ if (seqid == so->so_seqid - 1)
+ return nfserr_replay_me;
+ if (seqid == so->so_seqid)
+ return nfs_ok;
+ return nfserr_bad_seqid;
+}
__be32
nfsd4_process_open1(struct nfsd4_compound_state *cstate,
@@ -2395,57 +2573,49 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
clientid_t *clientid = &open->op_clientid;
struct nfs4_client *clp = NULL;
unsigned int strhashval;
- struct nfs4_stateowner *sop = NULL;
-
- if (!check_name(open->op_owner))
- return nfserr_inval;
+ struct nfs4_openowner *oo = NULL;
+ __be32 status;
if (STALE_CLIENTID(&open->op_clientid))
return nfserr_stale_clientid;
+ /*
+ * In case we need it later, after we've already created the
+ * file and don't want to risk a further failure:
+ */
+ open->op_file = nfsd4_alloc_file();
+ if (open->op_file == NULL)
+ return nfserr_jukebox;
- strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner);
- sop = find_openstateowner_str(strhashval, open);
- open->op_stateowner = sop;
- if (!sop) {
- /* Make sure the client's lease hasn't expired. */
+ strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner);
+ oo = find_openstateowner_str(strhashval, open);
+ open->op_openowner = oo;
+ if (!oo) {
clp = find_confirmed_client(clientid);
if (clp == NULL)
return nfserr_expired;
- goto renew;
+ goto new_owner;
}
- /* When sessions are used, skip open sequenceid processing */
- if (nfsd4_has_session(cstate))
- goto renew;
- if (!sop->so_confirmed) {
+ if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
/* Replace unconfirmed owners without checking for replay. */
- clp = sop->so_client;
- release_openowner(sop);
- open->op_stateowner = NULL;
- goto renew;
- }
- if (open->op_seqid == sop->so_seqid - 1) {
- if (sop->so_replay.rp_buflen)
- return nfserr_replay_me;
- /* The original OPEN failed so spectacularly
- * that we don't even have replay data saved!
- * Therefore, we have no choice but to continue
- * processing this OPEN; presumably, we'll
- * fail again for the same reason.
- */
- dprintk("nfsd4_process_open1: replay with no replay cache\n");
- goto renew;
- }
- if (open->op_seqid != sop->so_seqid)
- return nfserr_bad_seqid;
-renew:
- if (open->op_stateowner == NULL) {
- sop = alloc_init_open_stateowner(strhashval, clp, open);
- if (sop == NULL)
- return nfserr_jukebox;
- open->op_stateowner = sop;
+ clp = oo->oo_owner.so_client;
+ release_openowner(oo);
+ open->op_openowner = NULL;
+ goto new_owner;
}
- list_del_init(&sop->so_close_lru);
- renew_client(sop->so_client);
+ status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
+ if (status)
+ return status;
+ clp = oo->oo_owner.so_client;
+ goto alloc_stateid;
+new_owner:
+ oo = alloc_init_open_stateowner(strhashval, clp, open);
+ if (oo == NULL)
+ return nfserr_jukebox;
+ open->op_openowner = oo;
+alloc_stateid:
+ open->op_stp = nfs4_alloc_stateid(clp);
+ if (!open->op_stp)
+ return nfserr_jukebox;
return nfs_ok;
}
@@ -2458,36 +2628,37 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
return nfs_ok;
}
-static struct nfs4_delegation *
-find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
+static int share_access_to_flags(u32 share_access)
{
- struct nfs4_delegation *dp;
+ share_access &= ~NFS4_SHARE_WANT_MASK;
- spin_lock(&recall_lock);
- list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
- if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
- spin_unlock(&recall_lock);
- return dp;
- }
- spin_unlock(&recall_lock);
- return NULL;
+ return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
}
-static int share_access_to_flags(u32 share_access)
+static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s)
{
- share_access &= ~NFS4_SHARE_WANT_MASK;
+ struct nfs4_stid *ret;
- return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
+ ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID);
+ if (!ret)
+ return NULL;
+ return delegstateid(ret);
+}
+
+static bool nfsd4_is_deleg_cur(struct nfsd4_open *open)
+{
+ return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
+ open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH;
}
static __be32
-nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
+nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open,
struct nfs4_delegation **dp)
{
int flags;
__be32 status = nfserr_bad_stateid;
- *dp = find_delegation_file(fp, &open->op_delegate_stateid);
+ *dp = find_deleg_stateid(cl, &open->op_delegate_stateid);
if (*dp == NULL)
goto out;
flags = share_access_to_flags(open->op_share_access);
@@ -2495,41 +2666,37 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
if (status)
*dp = NULL;
out:
- if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR)
+ if (!nfsd4_is_deleg_cur(open))
return nfs_ok;
if (status)
return status;
- open->op_stateowner->so_confirmed = 1;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
return nfs_ok;
}
static __be32
-nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp)
+nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp)
{
- struct nfs4_stateid *local;
- __be32 status = nfserr_share_denied;
- struct nfs4_stateowner *sop = open->op_stateowner;
+ struct nfs4_ol_stateid *local;
+ struct nfs4_openowner *oo = open->op_openowner;
list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
/* ignore lock owners */
if (local->st_stateowner->so_is_open_owner == 0)
continue;
/* remember if we have seen this open owner */
- if (local->st_stateowner == sop)
+ if (local->st_stateowner == &oo->oo_owner)
*stpp = local;
/* check for conflicting share reservations */
if (!test_share(local, open))
- goto out;
+ return nfserr_share_denied;
}
- status = 0;
-out:
- return status;
+ return nfs_ok;
}
-static inline struct nfs4_stateid *
-nfs4_alloc_stateid(void)
+static void nfs4_free_stateid(struct nfs4_ol_stateid *s)
{
- return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
+ kmem_cache_free(stateid_slab, s);
}
static inline int nfs4_access_to_access(u32 nfs4_access)
@@ -2550,12 +2717,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
int oflag = nfs4_access_to_omode(open->op_share_access);
int access = nfs4_access_to_access(open->op_share_access);
- /* CLAIM_DELEGATE_CUR is used in response to a broken lease;
- * allowing it to break the lease and return EAGAIN leaves the
- * client unable to make progress in returning the delegation */
- if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
- access |= NFSD_MAY_NOT_BREAK_LEASE;
-
if (!fp->fi_fds[oflag]) {
status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
&fp->fi_fds[oflag]);
@@ -2567,27 +2728,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
return nfs_ok;
}
-static __be32
-nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
- struct nfs4_file *fp, struct svc_fh *cur_fh,
- struct nfsd4_open *open)
-{
- struct nfs4_stateid *stp;
- __be32 status;
-
- stp = nfs4_alloc_stateid();
- if (stp == NULL)
- return nfserr_jukebox;
-
- status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open);
- if (status) {
- kmem_cache_free(stateid_slab, stp);
- return status;
- }
- *stpp = stp;
- return 0;
-}
-
static inline __be32
nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
struct nfsd4_open *open)
@@ -2604,9 +2744,9 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
}
static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
{
- u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+ u32 op_share_access = open->op_share_access;
bool new_access;
__be32 status;
@@ -2635,8 +2775,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
static void
nfs4_set_claim_prev(struct nfsd4_open *open)
{
- open->op_stateowner->so_confirmed = 1;
- open->op_stateowner->so_client->cl_firststate = 1;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ open->op_openowner->oo_owner.so_client->cl_firststate = 1;
}
/* Should we give out recallable state?: */
@@ -2679,10 +2819,8 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
if (!fl)
return -ENOMEM;
fl->fl_file = find_readable_file(fp);
- list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
if (status) {
- list_del_init(&dp->dl_perclnt);
locks_free_lock(fl);
return -ENOMEM;
}
@@ -2690,7 +2828,9 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
fp->fi_deleg_file = fl->fl_file;
get_file(fp->fi_deleg_file);
atomic_set(&fp->fi_delegees, 1);
- list_add(&dp->dl_perfile, &fp->fi_delegations);
+ spin_lock(&recall_lock);
+ hash_delegation_locked(dp, fp);
+ spin_unlock(&recall_lock);
return 0;
}
@@ -2706,9 +2846,8 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
return -EAGAIN;
}
atomic_inc(&fp->fi_delegees);
- list_add(&dp->dl_perfile, &fp->fi_delegations);
+ hash_delegation_locked(dp, fp);
spin_unlock(&recall_lock);
- list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
return 0;
}
@@ -2716,14 +2855,14 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
* Attempt to hand out a delegation.
*/
static void
-nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp)
+nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp)
{
struct nfs4_delegation *dp;
- struct nfs4_stateowner *sop = stp->st_stateowner;
+ struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
int cb_up;
int status, flag = 0;
- cb_up = nfsd4_cb_channel_good(sop->so_client);
+ cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
flag = NFS4_OPEN_DELEGATE_NONE;
open->op_recall = 0;
switch (open->op_claim_type) {
@@ -2739,7 +2878,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
* had the chance to reclaim theirs.... */
if (locks_in_grace())
goto out;
- if (!cb_up || !sop->so_confirmed)
+ if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
goto out;
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2750,17 +2889,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
goto out;
}
- dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
+ dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
if (dp == NULL)
goto out_no_deleg;
status = nfs4_set_delegation(dp, flag);
if (status)
goto out_free;
- memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
+ memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
- STATEID_VAL(&dp->dl_stateid));
+ STATEID_VAL(&dp->dl_stid.sc_stateid));
out:
if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
&& flag == NFS4_OPEN_DELEGATE_NONE
@@ -2782,16 +2921,13 @@ __be32
nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
struct nfs4_file *fp = NULL;
struct inode *ino = current_fh->fh_dentry->d_inode;
- struct nfs4_stateid *stp = NULL;
+ struct nfs4_ol_stateid *stp = NULL;
struct nfs4_delegation *dp = NULL;
__be32 status;
- status = nfserr_inval;
- if (!access_valid(open->op_share_access, resp->cstate.minorversion)
- || !deny_valid(open->op_share_deny))
- goto out;
/*
* Lookup file; if found, lookup stateid and check open request,
* and check for delegations in the process of being recalled.
@@ -2801,17 +2937,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
if (fp) {
if ((status = nfs4_check_open(fp, open, &stp)))
goto out;
- status = nfs4_check_deleg(fp, open, &dp);
+ status = nfs4_check_deleg(cl, fp, open, &dp);
if (status)
goto out;
} else {
status = nfserr_bad_stateid;
- if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
+ if (nfsd4_is_deleg_cur(open))
goto out;
status = nfserr_jukebox;
- fp = alloc_init_file(ino);
- if (fp == NULL)
- goto out;
+ fp = open->op_file;
+ open->op_file = NULL;
+ nfsd4_init_file(fp, ino);
}
/*
@@ -2823,24 +2959,24 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
if (status)
goto out;
- update_stateid(&stp->st_stateid);
} else {
- status = nfs4_new_open(rqstp, &stp, fp, current_fh, open);
+ status = nfs4_get_vfs_file(rqstp, fp, current_fh, open);
if (status)
goto out;
- init_stateid(stp, fp, open);
+ stp = open->op_stp;
+ open->op_stp = NULL;
+ init_open_stateid(stp, fp, open);
status = nfsd4_truncate(rqstp, current_fh, open);
if (status) {
release_open_stateid(stp);
goto out;
}
- if (nfsd4_has_session(&resp->cstate))
- update_stateid(&stp->st_stateid);
}
- memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
if (nfsd4_has_session(&resp->cstate))
- open->op_stateowner->so_confirmed = 1;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
/*
* Attempt to hand out a delegation. No error return, because the
@@ -2851,7 +2987,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs_ok;
dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
- STATEID_VAL(&stp->st_stateid));
+ STATEID_VAL(&stp->st_stid.sc_stateid));
out:
if (fp)
put_nfs4_file(fp);
@@ -2861,13 +2997,34 @@ out:
* To finish the open response, we just need to set the rflags.
*/
open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
- if (!open->op_stateowner->so_confirmed &&
+ if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
!nfsd4_has_session(&resp->cstate))
open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
return status;
}
+void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
+{
+ if (open->op_openowner) {
+ struct nfs4_openowner *oo = open->op_openowner;
+
+ if (!list_empty(&oo->oo_owner.so_stateids))
+ list_del_init(&oo->oo_close_lru);
+ if (oo->oo_flags & NFS4_OO_NEW) {
+ if (status) {
+ release_openowner(oo);
+ open->op_openowner = NULL;
+ } else
+ oo->oo_flags &= ~NFS4_OO_NEW;
+ }
+ }
+ if (open->op_file)
+ nfsd4_free_file(open->op_file);
+ if (open->op_stp)
+ nfs4_free_stateid(open->op_stp);
+}
+
__be32
nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
clientid_t *clid)
@@ -2888,7 +3045,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("nfsd4_renew: clientid not found!\n");
goto out;
}
- renew_client(clp);
status = nfserr_cb_path_down;
if (!list_empty(&clp->cl_delegations)
&& clp->cl_cb_state != NFSD4_CB_UP)
@@ -2920,7 +3076,7 @@ static time_t
nfs4_laundromat(void)
{
struct nfs4_client *clp;
- struct nfs4_stateowner *sop;
+ struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct list_head *pos, *next, reaplist;
time_t cutoff = get_seconds() - nfsd4_lease;
@@ -2977,16 +3133,14 @@ nfs4_laundromat(void)
}
test_val = nfsd4_lease;
list_for_each_safe(pos, next, &close_lru) {
- sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
- if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
- u = sop->so_time - cutoff;
+ oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
+ if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
+ u = oo->oo_time - cutoff;
if (test_val > u)
test_val = u;
break;
}
- dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
- sop->so_id);
- release_openowner(sop);
+ release_openowner(oo);
}
if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -3008,30 +3162,17 @@ laundromat_main(struct work_struct *not_used)
queue_delayed_work(laundry_wq, &laundromat_work, t*HZ);
}
-static struct nfs4_stateowner *
-search_close_lru(u32 st_id, int flags)
-{
- struct nfs4_stateowner *local = NULL;
-
- if (flags & CLOSE_STATE) {
- list_for_each_entry(local, &close_lru, so_close_lru) {
- if (local->so_id == st_id)
- return local;
- }
- }
- return NULL;
-}
-
-static inline int
-nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
+static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
{
- return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
+ if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode)
+ return nfserr_bad_stateid;
+ return nfs_ok;
}
static int
STALE_STATEID(stateid_t *stateid)
{
- if (stateid->si_boot == boot_time)
+ if (stateid->si_opaque.so_clid.cl_boot == boot_time)
return 0;
dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
STATEID_VAL(stateid));
@@ -3054,7 +3195,7 @@ access_permit_write(unsigned long access_bmap)
}
static
-__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
+__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
{
__be32 status = nfserr_openmode;
@@ -3097,37 +3238,83 @@ grace_disallows_io(struct inode *inode)
return locks_in_grace() && mandatory_lock(inode);
}
-static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
+/* Returns true iff a is later than b: */
+static bool stateid_generation_after(stateid_t *a, stateid_t *b)
+{
+ return (s32)a->si_generation - (s32)b->si_generation > 0;
+}
+
+static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
{
/*
* When sessions are used the stateid generation number is ignored
* when it is zero.
*/
- if ((flags & HAS_SESSION) && in->si_generation == 0)
- goto out;
+ if (has_session && in->si_generation == 0)
+ return nfs_ok;
+
+ if (in->si_generation == ref->si_generation)
+ return nfs_ok;
/* If the client sends us a stateid from the future, it's buggy: */
- if (in->si_generation > ref->si_generation)
+ if (stateid_generation_after(in, ref))
return nfserr_bad_stateid;
/*
- * The following, however, can happen. For example, if the
- * client sends an open and some IO at the same time, the open
- * may bump si_generation while the IO is still in flight.
- * Thanks to hard links and renames, the client never knows what
- * file an open will affect. So it could avoid that situation
- * only by serializing all opens and IO from the same open
- * owner. To recover from the old_stateid error, the client
- * will just have to retry the IO:
+ * However, we could see a stateid from the past, even from a
+ * non-buggy client. For example, if the client sends a lock
+ * while some IO is outstanding, the lock may bump si_generation
+ * while the IO is still in flight. The client could avoid that
+ * situation by waiting for responses on all the IO requests,
+ * but better performance may result in retrying IO that
+ * receives an old_stateid error if requests are rarely
+ * reordered in flight:
*/
- if (in->si_generation < ref->si_generation)
- return nfserr_old_stateid;
-out:
+ return nfserr_old_stateid;
+}
+
+static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols)
+{
+ if (ols->st_stateowner->so_is_open_owner &&
+ !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+ return nfserr_bad_stateid;
return nfs_ok;
}
-static int is_delegation_stateid(stateid_t *stateid)
+__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
{
- return stateid->si_fileid == 0;
+ struct nfs4_stid *s;
+ __be32 status;
+
+ if (STALE_STATEID(stateid))
+ return nfserr_stale_stateid;
+
+ s = find_stateid(cl, stateid);
+ if (!s)
+ return nfserr_stale_stateid;
+ status = check_stateid_generation(stateid, &s->sc_stateid, 1);
+ if (status)
+ return status;
+ if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID)))
+ return nfs_ok;
+ return nfsd4_check_openowner_confirmed(openlockstateid(s));
+}
+
+static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s)
+{
+ struct nfs4_client *cl;
+
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+ return nfserr_bad_stateid;
+ if (STALE_STATEID(stateid))
+ return nfserr_stale_stateid;
+ cl = find_confirmed_client(&stateid->si_opaque.so_clid);
+ if (!cl)
+ return nfserr_expired;
+ *s = find_stateid_by_type(cl, stateid, typemask);
+ if (!*s)
+ return nfserr_bad_stateid;
+ return nfs_ok;
+
}
/*
@@ -3137,7 +3324,8 @@ __be32
nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
stateid_t *stateid, int flags, struct file **filpp)
{
- struct nfs4_stateid *stp = NULL;
+ struct nfs4_stid *s;
+ struct nfs4_ol_stateid *stp = NULL;
struct nfs4_delegation *dp = NULL;
struct svc_fh *current_fh = &cstate->current_fh;
struct inode *ino = current_fh->fh_dentry->d_inode;
@@ -3149,66 +3337,115 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
if (grace_disallows_io(ino))
return nfserr_grace;
- if (nfsd4_has_session(cstate))
- flags |= HAS_SESSION;
-
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return check_special_stateids(current_fh, stateid, flags);
- status = nfserr_stale_stateid;
- if (STALE_STATEID(stateid))
+ status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s);
+ if (status)
+ return status;
+ status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
+ if (status)
goto out;
-
- /*
- * We assume that any stateid that has the current boot time,
- * but that we can't find, is expired:
- */
- status = nfserr_expired;
- if (is_delegation_stateid(stateid)) {
- dp = find_delegation_stateid(ino, stateid);
- if (!dp)
- goto out;
- status = check_stateid_generation(stateid, &dp->dl_stateid,
- flags);
- if (status)
- goto out;
+ switch (s->sc_type) {
+ case NFS4_DELEG_STID:
+ dp = delegstateid(s);
status = nfs4_check_delegmode(dp, flags);
if (status)
goto out;
- renew_client(dp->dl_client);
if (filpp) {
*filpp = dp->dl_file->fi_deleg_file;
BUG_ON(!*filpp);
}
- } else { /* open or lock stateid */
- stp = find_stateid(stateid, flags);
- if (!stp)
- goto out;
- status = nfserr_bad_stateid;
- if (nfs4_check_fh(current_fh, stp))
- goto out;
- if (!stp->st_stateowner->so_confirmed)
+ break;
+ case NFS4_OPEN_STID:
+ case NFS4_LOCK_STID:
+ stp = openlockstateid(s);
+ status = nfs4_check_fh(current_fh, stp);
+ if (status)
goto out;
- status = check_stateid_generation(stateid, &stp->st_stateid,
- flags);
+ status = nfsd4_check_openowner_confirmed(stp);
if (status)
goto out;
status = nfs4_check_openmode(stp, flags);
if (status)
goto out;
- renew_client(stp->st_stateowner->so_client);
if (filpp) {
if (flags & RD_STATE)
*filpp = find_readable_file(stp->st_file);
else
*filpp = find_writeable_file(stp->st_file);
}
+ break;
+ default:
+ return nfserr_bad_stateid;
}
status = nfs_ok;
out:
return status;
}
+static __be32
+nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
+{
+ struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
+
+ if (check_for_locks(stp->st_file, lo))
+ return nfserr_locks_held;
+ /*
+ * Currently there's a 1-1 lock stateid<->lockowner
+ * correspondance, and we have to delete the lockowner when we
+ * delete the lock stateid:
+ */
+ release_lockowner(lo);
+ return nfs_ok;
+}
+
+/*
+ * Test if the stateid is valid
+ */
+__be32
+nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_test_stateid *test_stateid)
+{
+ /* real work is done during encoding */
+ return nfs_ok;
+}
+
+__be32
+nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_free_stateid *free_stateid)
+{
+ stateid_t *stateid = &free_stateid->fr_stateid;
+ struct nfs4_stid *s;
+ struct nfs4_client *cl = cstate->session->se_client;
+ __be32 ret = nfserr_bad_stateid;
+
+ nfs4_lock_state();
+ s = find_stateid(cl, stateid);
+ if (!s)
+ goto out;
+ switch (s->sc_type) {
+ case NFS4_DELEG_STID:
+ ret = nfserr_locks_held;
+ goto out;
+ case NFS4_OPEN_STID:
+ case NFS4_LOCK_STID:
+ ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+ if (ret)
+ goto out;
+ if (s->sc_type == NFS4_LOCK_STID)
+ ret = nfsd4_free_lock_stateid(openlockstateid(s));
+ else
+ ret = nfserr_locks_held;
+ break;
+ default:
+ ret = nfserr_bad_stateid;
+ }
+out:
+ nfs4_unlock_state();
+ return ret;
+}
+
static inline int
setlkflg (int type)
{
@@ -3216,124 +3453,64 @@ setlkflg (int type)
RD_STATE : WR_STATE;
}
+static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp)
+{
+ struct svc_fh *current_fh = &cstate->current_fh;
+ struct nfs4_stateowner *sop = stp->st_stateowner;
+ __be32 status;
+
+ status = nfsd4_check_seqid(cstate, sop, seqid);
+ if (status)
+ return status;
+ if (stp->st_stid.sc_type == NFS4_CLOSED_STID)
+ /*
+ * "Closed" stateid's exist *only* to return
+ * nfserr_replay_me from the previous step.
+ */
+ return nfserr_bad_stateid;
+ status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
+ if (status)
+ return status;
+ return nfs4_check_fh(current_fh, stp);
+}
+
/*
* Checks for sequence id mutating operations.
*/
static __be32
nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
- stateid_t *stateid, int flags,
- struct nfs4_stateowner **sopp,
- struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
+ stateid_t *stateid, char typemask,
+ struct nfs4_ol_stateid **stpp)
{
- struct nfs4_stateid *stp;
- struct nfs4_stateowner *sop;
- struct svc_fh *current_fh = &cstate->current_fh;
__be32 status;
+ struct nfs4_stid *s;
dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
seqid, STATEID_VAL(stateid));
*stpp = NULL;
- *sopp = NULL;
-
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n");
- return nfserr_bad_stateid;
- }
-
- if (STALE_STATEID(stateid))
- return nfserr_stale_stateid;
-
- if (nfsd4_has_session(cstate))
- flags |= HAS_SESSION;
-
- /*
- * We return BAD_STATEID if filehandle doesn't match stateid,
- * the confirmed flag is incorrecly set, or the generation
- * number is incorrect.
- */
- stp = find_stateid(stateid, flags);
- if (stp == NULL) {
- /*
- * Also, we should make sure this isn't just the result of
- * a replayed close:
- */
- sop = search_close_lru(stateid->si_stateownerid, flags);
- /* It's not stale; let's assume it's expired: */
- if (sop == NULL)
- return nfserr_expired;
- *sopp = sop;
- goto check_replay;
- }
-
- *stpp = stp;
- *sopp = sop = stp->st_stateowner;
-
- if (lock) {
- clientid_t *lockclid = &lock->v.new.clientid;
- struct nfs4_client *clp = sop->so_client;
- int lkflg = 0;
- __be32 status;
-
- lkflg = setlkflg(lock->lk_type);
-
- if (lock->lk_is_new) {
- if (!sop->so_is_open_owner)
- return nfserr_bad_stateid;
- if (!(flags & HAS_SESSION) &&
- !same_clid(&clp->cl_clientid, lockclid))
- return nfserr_bad_stateid;
- /* stp is the open stateid */
- status = nfs4_check_openmode(stp, lkflg);
- if (status)
- return status;
- } else {
- /* stp is the lock stateid */
- status = nfs4_check_openmode(stp->st_openstp, lkflg);
- if (status)
- return status;
- }
- }
+ status = nfsd4_lookup_stateid(stateid, typemask, &s);
+ if (status)
+ return status;
+ *stpp = openlockstateid(s);
+ cstate->replay_owner = (*stpp)->st_stateowner;
- if (nfs4_check_fh(current_fh, stp)) {
- dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n");
- return nfserr_bad_stateid;
- }
+ return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
+}
- /*
- * We now validate the seqid and stateid generation numbers.
- * For the moment, we ignore the possibility of
- * generation number wraparound.
- */
- if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
- goto check_replay;
+static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp)
+{
+ __be32 status;
+ struct nfs4_openowner *oo;
- if (sop->so_confirmed && flags & CONFIRM) {
- dprintk("NFSD: preprocess_seqid_op: expected"
- " unconfirmed stateowner!\n");
- return nfserr_bad_stateid;
- }
- if (!sop->so_confirmed && !(flags & CONFIRM)) {
- dprintk("NFSD: preprocess_seqid_op: stateowner not"
- " confirmed yet!\n");
- return nfserr_bad_stateid;
- }
- status = check_stateid_generation(stateid, &stp->st_stateid, flags);
+ status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
+ NFS4_OPEN_STID, stpp);
if (status)
return status;
- renew_client(sop->so_client);
+ oo = openowner((*stpp)->st_stateowner);
+ if (!(oo->oo_flags & NFS4_OO_CONFIRMED))
+ return nfserr_bad_stateid;
return nfs_ok;
-
-check_replay:
- if (seqid == sop->so_seqid - 1) {
- dprintk("NFSD: preprocess_seqid_op: retransmission?\n");
- /* indicate replay to calling function */
- return nfserr_replay_me;
- }
- dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n",
- sop->so_seqid, seqid);
- *sopp = NULL;
- return nfserr_bad_seqid;
}
__be32
@@ -3341,8 +3518,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_open_confirm *oc)
{
__be32 status;
- struct nfs4_stateowner *sop;
- struct nfs4_stateid *stp;
+ struct nfs4_openowner *oo;
+ struct nfs4_ol_stateid *stp;
dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3354,39 +3531,52 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
- if ((status = nfs4_preprocess_seqid_op(cstate,
+ status = nfs4_preprocess_seqid_op(cstate,
oc->oc_seqid, &oc->oc_req_stateid,
- CONFIRM | OPEN_STATE,
- &oc->oc_stateowner, &stp, NULL)))
- goto out;
-
- sop = oc->oc_stateowner;
- sop->so_confirmed = 1;
- update_stateid(&stp->st_stateid);
- memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t));
+ NFS4_OPEN_STID, &stp);
+ if (status)
+ goto out;
+ oo = openowner(stp->st_stateowner);
+ status = nfserr_bad_stateid;
+ if (oo->oo_flags & NFS4_OO_CONFIRMED)
+ goto out;
+ oo->oo_flags |= NFS4_OO_CONFIRMED;
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
- __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid));
+ __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
- nfsd4_create_clid_dir(sop->so_client);
+ nfsd4_create_clid_dir(oo->oo_owner.so_client);
+ status = nfs_ok;
out:
- if (oc->oc_stateowner) {
- nfs4_get_stateowner(oc->oc_stateowner);
- cstate->replay_owner = oc->oc_stateowner;
- }
- nfs4_unlock_state();
+ if (!cstate->replay_owner)
+ nfs4_unlock_state();
return status;
}
-static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access)
+static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
{
- int i;
+ if (!test_bit(access, &stp->st_access_bmap))
+ return;
+ nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
+ __clear_bit(access, &stp->st_access_bmap);
+}
- for (i = 1; i < 4; i++) {
- if (test_bit(i, &stp->st_access_bmap)
- && ((i & to_access) != i)) {
- nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i));
- __clear_bit(i, &stp->st_access_bmap);
- }
+static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
+{
+ switch (to_access) {
+ case NFS4_SHARE_ACCESS_READ:
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE);
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+ break;
+ case NFS4_SHARE_ACCESS_WRITE:
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ);
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+ break;
+ case NFS4_SHARE_ACCESS_BOTH:
+ break;
+ default:
+ BUG();
}
}
@@ -3406,26 +3596,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
struct nfsd4_open_downgrade *od)
{
__be32 status;
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
cstate->current_fh.fh_dentry->d_name.name);
- if (!access_valid(od->od_share_access, cstate->minorversion)
- || !deny_valid(od->od_share_deny))
- return nfserr_inval;
/* We don't yet support WANT bits: */
od->od_share_access &= NFS4_SHARE_ACCESS_MASK;
nfs4_lock_state();
- if ((status = nfs4_preprocess_seqid_op(cstate,
- od->od_seqid,
- &od->od_stateid,
- OPEN_STATE,
- &od->od_stateowner, &stp, NULL)))
+ status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
+ &od->od_stateid, &stp);
+ if (status)
goto out;
-
status = nfserr_inval;
if (!test_bit(od->od_share_access, &stp->st_access_bmap)) {
dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n",
@@ -3437,22 +3621,45 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
stp->st_deny_bmap, od->od_share_deny);
goto out;
}
- nfs4_file_downgrade(stp, od->od_share_access);
+ nfs4_stateid_downgrade(stp, od->od_share_access);
reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
- update_stateid(&stp->st_stateid);
- memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t));
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
status = nfs_ok;
out:
- if (od->od_stateowner) {
- nfs4_get_stateowner(od->od_stateowner);
- cstate->replay_owner = od->od_stateowner;
- }
- nfs4_unlock_state();
+ if (!cstate->replay_owner)
+ nfs4_unlock_state();
return status;
}
+void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so)
+{
+ struct nfs4_openowner *oo;
+ struct nfs4_ol_stateid *s;
+
+ if (!so->so_is_open_owner)
+ return;
+ oo = openowner(so);
+ s = oo->oo_last_closed_stid;
+ if (!s)
+ return;
+ if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) {
+ /* Release the last_closed_stid on the next seqid bump: */
+ oo->oo_flags |= NFS4_OO_PURGE_CLOSE;
+ return;
+ }
+ oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE;
+ release_last_closed_stateid(oo);
+}
+
+static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+{
+ unhash_open_stateid(s);
+ s->st_stid.sc_type = NFS4_CLOSED_STID;
+}
+
/*
* nfs4_unlock_state() called after encode
*/
@@ -3461,39 +3668,38 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_close *close)
{
__be32 status;
- struct nfs4_stateid *stp;
+ struct nfs4_openowner *oo;
+ struct nfs4_ol_stateid *stp;
dprintk("NFSD: nfsd4_close on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
cstate->current_fh.fh_dentry->d_name.name);
nfs4_lock_state();
- /* check close_lru for replay */
- if ((status = nfs4_preprocess_seqid_op(cstate,
- close->cl_seqid,
- &close->cl_stateid,
- OPEN_STATE | CLOSE_STATE,
- &close->cl_stateowner, &stp, NULL)))
+ status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
+ &close->cl_stateid,
+ NFS4_OPEN_STID|NFS4_CLOSED_STID,
+ &stp);
+ if (status)
goto out;
+ oo = openowner(stp->st_stateowner);
status = nfs_ok;
- update_stateid(&stp->st_stateid);
- memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
- /* release_stateid() calls nfsd_close() if needed */
- release_open_stateid(stp);
+ nfsd4_close_open_stateid(stp);
+ release_last_closed_stateid(oo);
+ oo->oo_last_closed_stid = stp;
/* place unused nfs4_stateowners on so_close_lru list to be
* released by the laundromat service after the lease period
* to enable us to handle CLOSE replay
*/
- if (list_empty(&close->cl_stateowner->so_stateids))
- move_to_close_lru(close->cl_stateowner);
+ if (list_empty(&oo->oo_owner.so_stateids))
+ move_to_close_lru(oo);
out:
- if (close->cl_stateowner) {
- nfs4_get_stateowner(close->cl_stateowner);
- cstate->replay_owner = close->cl_stateowner;
- }
- nfs4_unlock_state();
+ if (!cstate->replay_owner)
+ nfs4_unlock_state();
return status;
}
@@ -3503,34 +3709,22 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct nfs4_delegation *dp;
stateid_t *stateid = &dr->dr_stateid;
+ struct nfs4_stid *s;
struct inode *inode;
__be32 status;
- int flags = 0;
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
inode = cstate->current_fh.fh_dentry->d_inode;
- if (nfsd4_has_session(cstate))
- flags |= HAS_SESSION;
nfs4_lock_state();
- status = nfserr_bad_stateid;
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
- goto out;
- status = nfserr_stale_stateid;
- if (STALE_STATEID(stateid))
- goto out;
- status = nfserr_bad_stateid;
- if (!is_delegation_stateid(stateid))
- goto out;
- status = nfserr_expired;
- dp = find_delegation_stateid(inode, stateid);
- if (!dp)
+ status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s);
+ if (status)
goto out;
- status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
+ dp = delegstateid(s);
+ status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate));
if (status)
goto out;
- renew_client(dp->dl_client);
unhash_delegation(dp);
out:
@@ -3568,9 +3762,6 @@ last_byte_offset(u64 start, u64 len)
return end > start ? end - 1: NFS4_MAX_UINT64;
}
-#define lockownerid_hashval(id) \
- ((id) & LOCK_HASH_MASK)
-
static inline unsigned int
lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
struct xdr_netobj *ownername)
@@ -3580,55 +3771,7 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
& LOCK_HASH_MASK;
}
-static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE];
static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
-static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
-
-static struct nfs4_stateid *
-find_stateid(stateid_t *stid, int flags)
-{
- struct nfs4_stateid *local;
- u32 st_id = stid->si_stateownerid;
- u32 f_id = stid->si_fileid;
- unsigned int hashval;
-
- dprintk("NFSD: find_stateid flags 0x%x\n",flags);
- if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
- hashval = stateid_hashval(st_id, f_id);
- list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
- if ((local->st_stateid.si_stateownerid == st_id) &&
- (local->st_stateid.si_fileid == f_id))
- return local;
- }
- }
-
- if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
- hashval = stateid_hashval(st_id, f_id);
- list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
- if ((local->st_stateid.si_stateownerid == st_id) &&
- (local->st_stateid.si_fileid == f_id))
- return local;
- }
- }
- return NULL;
-}
-
-static struct nfs4_delegation *
-find_delegation_stateid(struct inode *ino, stateid_t *stid)
-{
- struct nfs4_file *fp;
- struct nfs4_delegation *dl;
-
- dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__,
- STATEID_VAL(stid));
-
- fp = find_file(ino);
- if (!fp)
- return NULL;
- dl = find_delegation_file(fp, stid);
- put_nfs4_file(fp);
- return dl;
-}
/*
* TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
@@ -3655,15 +3798,21 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = {
static inline void
nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
{
- struct nfs4_stateowner *sop;
+ struct nfs4_lockowner *lo;
if (fl->fl_lmops == &nfsd_posix_mng_ops) {
- sop = (struct nfs4_stateowner *) fl->fl_owner;
- kref_get(&sop->so_ref);
- deny->ld_sop = sop;
- deny->ld_clientid = sop->so_client->cl_clientid;
+ lo = (struct nfs4_lockowner *) fl->fl_owner;
+ deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data,
+ lo->lo_owner.so_owner.len, GFP_KERNEL);
+ if (!deny->ld_owner.data)
+ /* We just don't care that much */
+ goto nevermind;
+ deny->ld_owner.len = lo->lo_owner.so_owner.len;
+ deny->ld_clientid = lo->lo_owner.so_client->cl_clientid;
} else {
- deny->ld_sop = NULL;
+nevermind:
+ deny->ld_owner.len = 0;
+ deny->ld_owner.data = NULL;
deny->ld_clientid.cl_boot = 0;
deny->ld_clientid.cl_id = 0;
}
@@ -3676,20 +3825,43 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
deny->ld_type = NFS4_WRITE_LT;
}
-static struct nfs4_stateowner *
-find_lockstateowner_str(struct inode *inode, clientid_t *clid,
+static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner)
+{
+ struct nfs4_ol_stateid *lst;
+
+ if (!same_owner_str(&lo->lo_owner, owner, clid))
+ return false;
+ if (list_empty(&lo->lo_owner.so_stateids)) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ lst = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
+ return lst->st_file->fi_inode == inode;
+}
+
+static struct nfs4_lockowner *
+find_lockowner_str(struct inode *inode, clientid_t *clid,
struct xdr_netobj *owner)
{
unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner);
+ struct nfs4_lockowner *lo;
struct nfs4_stateowner *op;
list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) {
- if (same_owner_str(op, owner, clid))
- return op;
+ lo = lockowner(op);
+ if (same_lockowner_ino(lo, inode, clid, owner))
+ return lo;
}
return NULL;
}
+static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
+{
+ list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]);
+ list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
+}
+
/*
* Alloc a lock owner structure.
* Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
@@ -3698,67 +3870,40 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
* strhashval = lock_ownerstr_hashval
*/
-static struct nfs4_stateowner *
-alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) {
- struct nfs4_stateowner *sop;
- struct nfs4_replay *rp;
- unsigned int idhashval;
+static struct nfs4_lockowner *
+alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) {
+ struct nfs4_lockowner *lo;
- if (!(sop = alloc_stateowner(&lock->lk_new_owner)))
+ lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
+ if (!lo)
return NULL;
- idhashval = lockownerid_hashval(current_ownerid);
- INIT_LIST_HEAD(&sop->so_idhash);
- INIT_LIST_HEAD(&sop->so_strhash);
- INIT_LIST_HEAD(&sop->so_perclient);
- INIT_LIST_HEAD(&sop->so_stateids);
- INIT_LIST_HEAD(&sop->so_perstateid);
- INIT_LIST_HEAD(&sop->so_close_lru); /* not used */
- sop->so_time = 0;
- list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]);
- list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]);
- list_add(&sop->so_perstateid, &open_stp->st_lockowners);
- sop->so_is_open_owner = 0;
- sop->so_id = current_ownerid++;
- sop->so_client = clp;
+ INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
+ lo->lo_owner.so_is_open_owner = 0;
/* It is the openowner seqid that will be incremented in encode in the
* case of new lockowners; so increment the lock seqid manually: */
- sop->so_seqid = lock->lk_new_lock_seqid + 1;
- sop->so_confirmed = 1;
- rp = &sop->so_replay;
- rp->rp_status = nfserr_serverfault;
- rp->rp_buflen = 0;
- rp->rp_buf = rp->rp_ibuf;
- return sop;
+ lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1;
+ hash_lockowner(lo, strhashval, clp, open_stp);
+ return lo;
}
-static struct nfs4_stateid *
-alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp)
+static struct nfs4_ol_stateid *
+alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp)
{
- struct nfs4_stateid *stp;
- unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id);
+ struct nfs4_ol_stateid *stp;
+ struct nfs4_client *clp = lo->lo_owner.so_client;
- stp = nfs4_alloc_stateid();
+ stp = nfs4_alloc_stateid(clp);
if (stp == NULL)
- goto out;
- INIT_LIST_HEAD(&stp->st_hash);
- INIT_LIST_HEAD(&stp->st_perfile);
- INIT_LIST_HEAD(&stp->st_perstateowner);
- INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
- list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
+ return NULL;
+ init_stid(&stp->st_stid, clp, NFS4_LOCK_STID);
list_add(&stp->st_perfile, &fp->fi_stateids);
- list_add(&stp->st_perstateowner, &sop->so_stateids);
- stp->st_stateowner = sop;
+ list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
+ stp->st_stateowner = &lo->lo_owner;
get_nfs4_file(fp);
stp->st_file = fp;
- stp->st_stateid.si_boot = boot_time;
- stp->st_stateid.si_stateownerid = sop->so_id;
- stp->st_stateid.si_fileid = fp->fi_id;
- stp->st_stateid.si_generation = 0;
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
-
-out:
return stp;
}
@@ -3769,7 +3914,7 @@ check_lock_length(u64 offset, u64 length)
LOFF_OVERFLOW(offset, length)));
}
-static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access)
+static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
{
struct nfs4_file *fp = lock_stp->st_file;
int oflag = nfs4_access_to_omode(access);
@@ -3787,15 +3932,16 @@ __be32
nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_lock *lock)
{
- struct nfs4_stateowner *open_sop = NULL;
- struct nfs4_stateowner *lock_sop = NULL;
- struct nfs4_stateid *lock_stp;
+ struct nfs4_openowner *open_sop = NULL;
+ struct nfs4_lockowner *lock_sop = NULL;
+ struct nfs4_ol_stateid *lock_stp;
struct nfs4_file *fp;
struct file *filp = NULL;
struct file_lock file_lock;
struct file_lock conflock;
__be32 status = 0;
unsigned int strhashval;
+ int lkflg;
int err;
dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -3819,7 +3965,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* Use open owner and open stateid to create lock owner and
* lock stateid.
*/
- struct nfs4_stateid *open_stp = NULL;
+ struct nfs4_ol_stateid *open_stp = NULL;
status = nfserr_stale_clientid;
if (!nfsd4_has_session(cstate) &&
@@ -3827,26 +3973,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
/* validate and update open stateid and open seqid */
- status = nfs4_preprocess_seqid_op(cstate,
+ status = nfs4_preprocess_confirmed_seqid_op(cstate,
lock->lk_new_open_seqid,
&lock->lk_new_open_stateid,
- OPEN_STATE,
- &lock->lk_replay_owner, &open_stp,
- lock);
+ &open_stp);
if (status)
goto out;
- open_sop = lock->lk_replay_owner;
+ open_sop = openowner(open_stp->st_stateowner);
+ status = nfserr_bad_stateid;
+ if (!nfsd4_has_session(cstate) &&
+ !same_clid(&open_sop->oo_owner.so_client->cl_clientid,
+ &lock->v.new.clientid))
+ goto out;
/* create lockowner and lock stateid */
fp = open_stp->st_file;
- strhashval = lock_ownerstr_hashval(fp->fi_inode,
- open_sop->so_client->cl_clientid.cl_id,
+ strhashval = lock_ownerstr_hashval(fp->fi_inode,
+ open_sop->oo_owner.so_client->cl_clientid.cl_id,
&lock->v.new.owner);
/* XXX: Do we need to check for duplicate stateowners on
* the same file, or should they just be allowed (and
* create new stateids)? */
status = nfserr_jukebox;
lock_sop = alloc_init_lock_stateowner(strhashval,
- open_sop->so_client, open_stp, lock);
+ open_sop->oo_owner.so_client, open_stp, lock);
if (lock_sop == NULL)
goto out;
lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
@@ -3855,16 +4004,20 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
} else {
/* lock (lock owner + lock stateid) already exists */
status = nfs4_preprocess_seqid_op(cstate,
- lock->lk_old_lock_seqid,
- &lock->lk_old_lock_stateid,
- LOCK_STATE,
- &lock->lk_replay_owner, &lock_stp, lock);
+ lock->lk_old_lock_seqid,
+ &lock->lk_old_lock_stateid,
+ NFS4_LOCK_STID, &lock_stp);
if (status)
goto out;
- lock_sop = lock->lk_replay_owner;
+ lock_sop = lockowner(lock_stp->st_stateowner);
fp = lock_stp->st_file;
}
- /* lock->lk_replay_owner and lock_stp have been created or found */
+ /* lock_sop and lock_stp have been created or found */
+
+ lkflg = setlkflg(lock->lk_type);
+ status = nfs4_check_openmode(lock_stp, lkflg);
+ if (status)
+ goto out;
status = nfserr_grace;
if (locks_in_grace() && !lock->lk_reclaim)
@@ -3915,8 +4068,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock);
switch (-err) {
case 0: /* success! */
- update_stateid(&lock_stp->st_stateid);
- memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid,
+ update_stateid(&lock_stp->st_stid.sc_stateid);
+ memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid,
sizeof(stateid_t));
status = 0;
break;
@@ -3936,11 +4089,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
out:
if (status && lock->lk_is_new && lock_sop)
release_lockowner(lock_sop);
- if (lock->lk_replay_owner) {
- nfs4_get_stateowner(lock->lk_replay_owner);
- cstate->replay_owner = lock->lk_replay_owner;
- }
- nfs4_unlock_state();
+ if (!cstate->replay_owner)
+ nfs4_unlock_state();
return status;
}
@@ -3970,6 +4120,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct inode *inode;
struct file_lock file_lock;
+ struct nfs4_lockowner *lo;
__be32 status;
if (locks_in_grace())
@@ -3978,19 +4129,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (check_lock_length(lockt->lt_offset, lockt->lt_length))
return nfserr_inval;
- lockt->lt_stateowner = NULL;
nfs4_lock_state();
status = nfserr_stale_clientid;
if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
goto out;
- if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
- dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n");
- if (status == nfserr_symlink)
- status = nfserr_inval;
+ if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
goto out;
- }
inode = cstate->current_fh.fh_dentry->d_inode;
locks_init_lock(&file_lock);
@@ -4009,10 +4155,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- lockt->lt_stateowner = find_lockstateowner_str(inode,
- &lockt->lt_clientid, &lockt->lt_owner);
- if (lockt->lt_stateowner)
- file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
+ lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
+ if (lo)
+ file_lock.fl_owner = (fl_owner_t)lo;
file_lock.fl_pid = current->tgid;
file_lock.fl_flags = FL_POSIX;
@@ -4038,7 +4183,7 @@ __be32
nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_locku *locku)
{
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
struct file *filp = NULL;
struct file_lock file_lock;
__be32 status;
@@ -4053,13 +4198,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
- if ((status = nfs4_preprocess_seqid_op(cstate,
- locku->lu_seqid,
- &locku->lu_stateid,
- LOCK_STATE,
- &locku->lu_stateowner, &stp, NULL)))
+ status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
+ &locku->lu_stateid, NFS4_LOCK_STID, &stp);
+ if (status)
goto out;
-
filp = find_any_file(stp->st_file);
if (!filp) {
status = nfserr_lock_range;
@@ -4068,7 +4210,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
BUG_ON(!filp);
locks_init_lock(&file_lock);
file_lock.fl_type = F_UNLCK;
- file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner;
+ file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
file_lock.fl_pid = current->tgid;
file_lock.fl_file = filp;
file_lock.fl_flags = FL_POSIX;
@@ -4089,15 +4231,12 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/*
* OK, unlock succeeded; the only thing left to do is update the stateid.
*/
- update_stateid(&stp->st_stateid);
- memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t));
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
out:
- if (locku->lu_stateowner) {
- nfs4_get_stateowner(locku->lu_stateowner);
- cstate->replay_owner = locku->lu_stateowner;
- }
- nfs4_unlock_state();
+ if (!cstate->replay_owner)
+ nfs4_unlock_state();
return status;
out_nfserr:
@@ -4111,7 +4250,7 @@ out_nfserr:
* 0: no locks held by lockowner
*/
static int
-check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
+check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
{
struct file_lock **flpp;
struct inode *inode = filp->fi_inode;
@@ -4136,7 +4275,8 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
{
clientid_t *clid = &rlockowner->rl_clientid;
struct nfs4_stateowner *sop;
- struct nfs4_stateid *stp;
+ struct nfs4_lockowner *lo;
+ struct nfs4_ol_stateid *stp;
struct xdr_netobj *owner = &rlockowner->rl_owner;
struct list_head matches;
int i;
@@ -4160,16 +4300,15 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
* data structures. */
INIT_LIST_HEAD(&matches);
for (i = 0; i < LOCK_HASH_SIZE; i++) {
- list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) {
+ list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) {
if (!same_owner_str(sop, owner, clid))
continue;
list_for_each_entry(stp, &sop->so_stateids,
st_perstateowner) {
- if (check_for_locks(stp->st_file, sop))
+ lo = lockowner(sop);
+ if (check_for_locks(stp->st_file, lo))
goto out;
- /* Note: so_perclient unused for lockowners,
- * so it's OK to fool with here. */
- list_add(&sop->so_perclient, &matches);
+ list_add(&lo->lo_list, &matches);
}
}
}
@@ -4178,12 +4317,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
* have been checked. */
status = nfs_ok;
while (!list_empty(&matches)) {
- sop = list_entry(matches.next, struct nfs4_stateowner,
- so_perclient);
+ lo = list_entry(matches.next, struct nfs4_lockowner,
+ lo_list);
/* unhash_stateowner deletes so_perclient only
* for openowners. */
- list_del(&sop->so_perclient);
- release_lockowner(sop);
+ list_del(&lo->lo_list);
+ release_lockowner(lo);
}
out:
nfs4_unlock_state();
@@ -4305,16 +4444,10 @@ nfs4_state_init(void)
for (i = 0; i < FILE_HASH_SIZE; i++) {
INIT_LIST_HEAD(&file_hashtbl[i]);
}
- for (i = 0; i < OWNER_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
- INIT_LIST_HEAD(&ownerid_hashtbl[i]);
- }
- for (i = 0; i < STATEID_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&stateid_hashtbl[i]);
- INIT_LIST_HEAD(&lockstateid_hashtbl[i]);
+ for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) {
+ INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]);
}
for (i = 0; i < LOCK_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]);
INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
}
memset(&onestateid, ~0, sizeof(stateid_t));
@@ -4331,7 +4464,7 @@ nfsd4_load_reboot_recovery_data(void)
int status;
nfs4_lock_state();
- nfsd4_init_recdir(user_recovery_dirname);
+ nfsd4_init_recdir();
status = nfsd4_recdir_load();
nfs4_unlock_state();
if (status)
@@ -4440,40 +4573,3 @@ nfs4_state_shutdown(void)
nfs4_unlock_state();
nfsd4_destroy_callback_queue();
}
-
-/*
- * user_recovery_dirname is protected by the nfsd_mutex since it's only
- * accessed when nfsd is starting.
- */
-static void
-nfs4_set_recdir(char *recdir)
-{
- strcpy(user_recovery_dirname, recdir);
-}
-
-/*
- * Change the NFSv4 recovery directory to recdir.
- */
-int
-nfs4_reset_recoverydir(char *recdir)
-{
- int status;
- struct path path;
-
- status = kern_path(recdir, LOOKUP_FOLLOW, &path);
- if (status)
- return status;
- status = -ENOTDIR;
- if (S_ISDIR(path.dentry->d_inode->i_mode)) {
- nfs4_set_recdir(recdir);
- status = 0;
- }
- path_put(&path);
- return status;
-}
-
-char *
-nfs4_recoverydir(void)
-{
- return user_recovery_dirname;
-}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 45f53ae..9d2c52b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,13 +44,15 @@
#include <linux/namei.h>
#include <linux/statfs.h>
#include <linux/utsname.h>
+#include <linux/pagemap.h>
#include <linux/sunrpc/svcauth_gss.h>
#include "idmap.h"
#include "acl.h"
#include "xdr4.h"
#include "vfs.h"
-
+#include "state.h"
+#include "cache.h"
#define NFSDDBG_FACILITY NFSDDBG_XDR
@@ -131,6 +133,22 @@ xdr_error: \
} \
} while (0)
+static void save_buf(struct nfsd4_compoundargs *argp, struct nfsd4_saved_compoundargs *savep)
+{
+ savep->p = argp->p;
+ savep->end = argp->end;
+ savep->pagelen = argp->pagelen;
+ savep->pagelist = argp->pagelist;
+}
+
+static void restore_buf(struct nfsd4_compoundargs *argp, struct nfsd4_saved_compoundargs *savep)
+{
+ argp->p = savep->p;
+ argp->end = savep->end;
+ argp->pagelen = savep->pagelen;
+ argp->pagelist = savep->pagelist;
+}
+
static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
{
/* We want more bytes than seem to be available.
@@ -159,8 +177,8 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
*/
memcpy(p, argp->p, avail);
/* step to next page */
- argp->pagelist++;
argp->p = page_address(argp->pagelist[0]);
+ argp->pagelist++;
if (argp->pagelen < PAGE_SIZE) {
argp->end = argp->p + (argp->pagelen>>2);
argp->pagelen = 0;
@@ -432,7 +450,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
{
DECODE_HEAD;
- close->cl_stateowner = NULL;
READ_BUF(4);
READ32(close->cl_seqid);
return nfsd4_decode_stateid(argp, &close->cl_stateid);
@@ -465,7 +482,18 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
READ_BUF(4);
READ32(create->cr_linklen);
READ_BUF(create->cr_linklen);
- SAVEMEM(create->cr_linkname, create->cr_linklen);
+ /*
+ * The VFS will want a null-terminated string, and
+ * null-terminating in place isn't safe since this might
+ * end on a page boundary:
+ */
+ create->cr_linkname =
+ kmalloc(create->cr_linklen + 1, GFP_KERNEL);
+ if (!create->cr_linkname)
+ return nfserr_jukebox;
+ memcpy(create->cr_linkname, p, create->cr_linklen);
+ create->cr_linkname[create->cr_linklen] = '\0';
+ defer_free(argp, kfree, create->cr_linkname);
break;
case NF4BLK:
case NF4CHR:
@@ -527,7 +555,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
{
DECODE_HEAD;
- lock->lk_replay_owner = NULL;
/*
* type, reclaim(boolean), offset, length, new_lock_owner(boolean)
*/
@@ -587,7 +614,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
{
DECODE_HEAD;
- locku->lu_stateowner = NULL;
READ_BUF(8);
READ32(locku->lu_type);
if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
@@ -618,6 +644,83 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
DECODE_TAIL;
}
+static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x)
+{
+ __be32 *p;
+ u32 w;
+
+ READ_BUF(4);
+ READ32(w);
+ *x = w;
+ switch (w & NFS4_SHARE_ACCESS_MASK) {
+ case NFS4_SHARE_ACCESS_READ:
+ case NFS4_SHARE_ACCESS_WRITE:
+ case NFS4_SHARE_ACCESS_BOTH:
+ break;
+ default:
+ return nfserr_bad_xdr;
+ }
+ w &= ~NFS4_SHARE_ACCESS_MASK;
+ if (!w)
+ return nfs_ok;
+ if (!argp->minorversion)
+ return nfserr_bad_xdr;
+ switch (w & NFS4_SHARE_WANT_MASK) {
+ case NFS4_SHARE_WANT_NO_PREFERENCE:
+ case NFS4_SHARE_WANT_READ_DELEG:
+ case NFS4_SHARE_WANT_WRITE_DELEG:
+ case NFS4_SHARE_WANT_ANY_DELEG:
+ case NFS4_SHARE_WANT_NO_DELEG:
+ case NFS4_SHARE_WANT_CANCEL:
+ break;
+ default:
+ return nfserr_bad_xdr;
+ }
+ w &= ~NFS4_SHARE_WANT_MASK;
+ if (!w)
+ return nfs_ok;
+ switch (w) {
+ case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL:
+ case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED:
+ case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL |
+ NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED):
+ return nfs_ok;
+ }
+xdr_error:
+ return nfserr_bad_xdr;
+}
+
+static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
+{
+ __be32 *p;
+
+ READ_BUF(4);
+ READ32(*x);
+ /* Note: unlinke access bits, deny bits may be zero. */
+ if (*x & ~NFS4_SHARE_DENY_BOTH)
+ return nfserr_bad_xdr;
+ return nfs_ok;
+xdr_error:
+ return nfserr_bad_xdr;
+}
+
+static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
+{
+ __be32 *p;
+
+ READ_BUF(4);
+ READ32(o->len);
+
+ if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT)
+ return nfserr_bad_xdr;
+
+ READ_BUF(o->len);
+ SAVEMEM(o->data, o->len);
+ return nfs_ok;
+xdr_error:
+ return nfserr_bad_xdr;
+}
+
static __be32
nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
{
@@ -625,19 +728,23 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
memset(open->op_bmval, 0, sizeof(open->op_bmval));
open->op_iattr.ia_valid = 0;
- open->op_stateowner = NULL;
+ open->op_openowner = NULL;
/* seqid, share_access, share_deny, clientid, ownerlen */
- READ_BUF(16 + sizeof(clientid_t));
+ READ_BUF(4);
READ32(open->op_seqid);
- READ32(open->op_share_access);
- READ32(open->op_share_deny);
+ status = nfsd4_decode_share_access(argp, &open->op_share_access);
+ if (status)
+ goto xdr_error;
+ status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
+ if (status)
+ goto xdr_error;
+ READ_BUF(sizeof(clientid_t));
COPYMEM(&open->op_clientid, sizeof(clientid_t));
- READ32(open->op_owner.len);
-
- /* owner, open_flag */
- READ_BUF(open->op_owner.len + 4);
- SAVEMEM(open->op_owner.data, open->op_owner.len);
+ status = nfsd4_decode_opaque(argp, &open->op_owner);
+ if (status)
+ goto xdr_error;
+ READ_BUF(4);
READ32(open->op_create);
switch (open->op_create) {
case NFS4_OPEN_NOCREATE:
@@ -703,6 +810,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
return status;
break;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ if (argp->minorversion < 1)
+ goto xdr_error;
+ /* void */
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ if (argp->minorversion < 1)
+ goto xdr_error;
+ status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+ if (status)
+ return status;
+ break;
default:
goto xdr_error;
}
@@ -715,7 +835,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
{
DECODE_HEAD;
- open_conf->oc_stateowner = NULL;
status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
if (status)
return status;
@@ -730,15 +849,17 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
{
DECODE_HEAD;
- open_down->od_stateowner = NULL;
status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
if (status)
return status;
- READ_BUF(12);
+ READ_BUF(4);
READ32(open_down->od_seqid);
- READ32(open_down->od_share_access);
- READ32(open_down->od_share_deny);
-
+ status = nfsd4_decode_share_access(argp, &open_down->od_share_access);
+ if (status)
+ return status;
+ status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
+ if (status)
+ return status;
DECODE_TAIL;
}
@@ -879,12 +1000,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
{
DECODE_HEAD;
- READ_BUF(12);
+ READ_BUF(8);
COPYMEM(setclientid->se_verf.data, 8);
- READ32(setclientid->se_namelen);
- READ_BUF(setclientid->se_namelen + 8);
- SAVEMEM(setclientid->se_name, setclientid->se_namelen);
+ status = nfsd4_decode_opaque(argp, &setclientid->se_name);
+ if (status)
+ return nfserr_bad_xdr;
+ READ_BUF(8);
READ32(setclientid->se_callback_prog);
READ32(setclientid->se_callback_netid_len);
@@ -1027,11 +1149,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
READ_BUF(NFS4_VERIFIER_SIZE);
COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
- READ_BUF(4);
- READ32(exid->clname.len);
-
- READ_BUF(exid->clname.len);
- SAVEMEM(exid->clname.data, exid->clname.len);
+ status = nfsd4_decode_opaque(argp, &exid->clname);
+ if (status)
+ return nfserr_bad_xdr;
READ_BUF(4);
READ32(exid->flags);
@@ -1240,6 +1360,19 @@ nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
}
static __be32
+nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
+ struct nfsd4_free_stateid *free_stateid)
+{
+ DECODE_HEAD;
+
+ READ_BUF(sizeof(stateid_t));
+ READ32(free_stateid->fr_stateid.si_generation);
+ COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t));
+
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
struct nfsd4_sequence *seq)
{
@@ -1255,6 +1388,50 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
DECODE_TAIL;
}
+static __be32
+nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid)
+{
+ unsigned int nbytes;
+ stateid_t si;
+ int i;
+ __be32 *p;
+ __be32 status;
+
+ READ_BUF(4);
+ test_stateid->ts_num_ids = ntohl(*p++);
+
+ nbytes = test_stateid->ts_num_ids * sizeof(stateid_t);
+ if (nbytes > (u32)((char *)argp->end - (char *)argp->p))
+ goto xdr_error;
+
+ test_stateid->ts_saved_args = argp;
+ save_buf(argp, &test_stateid->ts_savedp);
+
+ for (i = 0; i < test_stateid->ts_num_ids; i++) {
+ status = nfsd4_decode_stateid(argp, &si);
+ if (status)
+ return status;
+ }
+
+ status = 0;
+out:
+ return status;
+xdr_error:
+ dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__);
+ status = nfserr_bad_xdr;
+ goto out;
+}
+
+static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc)
+{
+ DECODE_HEAD;
+
+ READ_BUF(8);
+ COPYMEM(&dc->clientid, 8);
+
+ DECODE_TAIL;
+}
+
static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
{
DECODE_HEAD;
@@ -1364,7 +1541,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
[OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
[OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
[OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
- [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
[OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -1374,9 +1551,9 @@ static nfsd4_dec nfsd41_dec_ops[] = {
[OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
[OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
[OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid,
[OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
[OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
};
@@ -1396,6 +1573,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
DECODE_HEAD;
struct nfsd4_op *op;
struct nfsd4_minorversion_ops *ops;
+ bool cachethis = false;
int i;
/*
@@ -1477,7 +1655,16 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
argp->opcnt = i+1;
break;
}
+ /*
+ * We'll try to cache the result in the DRC if any one
+ * op in the compound wants to be cached:
+ */
+ cachethis |= nfsd4_cache_this_op(op);
}
+ /* Sessions make the DRC unnecessary: */
+ if (argp->minorversion)
+ cachethis = false;
+ argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
DECODE_TAIL;
}
@@ -1542,18 +1729,6 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
\
save = resp->p;
-static bool seqid_mutating_err(__be32 err)
-{
- /* rfc 3530 section 8.1.5: */
- return err != nfserr_stale_clientid &&
- err != nfserr_stale_stateid &&
- err != nfserr_bad_stateid &&
- err != nfserr_bad_seqid &&
- err != nfserr_bad_xdr &&
- err != nfserr_resource &&
- err != nfserr_nofilehandle;
-}
-
/*
* Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This
* is where sequence id's are incremented, and the replay cache is filled.
@@ -1561,15 +1736,20 @@ static bool seqid_mutating_err(__be32 err)
* we know whether the error to be returned is a sequence id mutating error.
*/
-#define ENCODE_SEQID_OP_TAIL(stateowner) do { \
- if (seqid_mutating_err(nfserr) && stateowner) { \
- stateowner->so_seqid++; \
- stateowner->so_replay.rp_status = nfserr; \
- stateowner->so_replay.rp_buflen = \
- (((char *)(resp)->p - (char *)save)); \
- memcpy(stateowner->so_replay.rp_buf, save, \
- stateowner->so_replay.rp_buflen); \
- } } while (0);
+static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr)
+{
+ struct nfs4_stateowner *stateowner = resp->cstate.replay_owner;
+
+ if (seqid_mutating_err(ntohl(nfserr)) && stateowner) {
+ stateowner->so_seqid++;
+ stateowner->so_replay.rp_status = nfserr;
+ stateowner->so_replay.rp_buflen =
+ (char *)resp->p - (char *)save;
+ memcpy(stateowner->so_replay.rp_buf, save,
+ stateowner->so_replay.rp_buflen);
+ nfsd4_purge_closed_stateid(stateowner);
+ }
+}
/* Encode as an array of strings the string given with components
* separated @sep.
@@ -1628,36 +1808,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
}
/*
- * Return the path to an export point in the pseudo filesystem namespace
- * Returned string is safe to use as long as the caller holds a reference
- * to @exp.
+ * Encode a path in RFC3530 'pathname4' format
*/
-static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
+static __be32 nfsd4_encode_path(const struct path *root,
+ const struct path *path, __be32 **pp, int *buflen)
{
- struct svc_fh tmp_fh;
- char *path = NULL, *rootpath;
- size_t rootlen;
+ struct path cur = {
+ .mnt = path->mnt,
+ .dentry = path->dentry,
+ };
+ __be32 *p = *pp;
+ struct dentry **components = NULL;
+ unsigned int ncomponents = 0;
+ __be32 err = nfserr_jukebox;
- fh_init(&tmp_fh, NFS4_FHSIZE);
- *stat = exp_pseudoroot(rqstp, &tmp_fh);
- if (*stat)
- return NULL;
- rootpath = tmp_fh.fh_export->ex_pathname;
+ dprintk("nfsd4_encode_components(");
+
+ path_get(&cur);
+ /* First walk the path up to the nfsd root, and store the
+ * dentries/path components in an array.
+ */
+ for (;;) {
+ if (cur.dentry == root->dentry && cur.mnt == root->mnt)
+ break;
+ if (cur.dentry == cur.mnt->mnt_root) {
+ if (follow_up(&cur))
+ continue;
+ goto out_free;
+ }
+ if ((ncomponents & 15) == 0) {
+ struct dentry **new;
+ new = krealloc(components,
+ sizeof(*new) * (ncomponents + 16),
+ GFP_KERNEL);
+ if (!new)
+ goto out_free;
+ components = new;
+ }
+ components[ncomponents++] = cur.dentry;
+ cur.dentry = dget_parent(cur.dentry);
+ }
- path = exp->ex_pathname;
+ *buflen -= 4;
+ if (*buflen < 0)
+ goto out_free;
+ WRITE32(ncomponents);
- rootlen = strlen(rootpath);
- if (strncmp(path, rootpath, rootlen)) {
- dprintk("nfsd: fs_locations failed;"
- "%s is not contained in %s\n", path, rootpath);
- *stat = nfserr_notsupp;
- path = NULL;
- goto out;
+ while (ncomponents) {
+ struct dentry *dentry = components[ncomponents - 1];
+ unsigned int len = dentry->d_name.len;
+
+ *buflen -= 4 + (XDR_QUADLEN(len) << 2);
+ if (*buflen < 0)
+ goto out_free;
+ WRITE32(len);
+ WRITEMEM(dentry->d_name.name, len);
+ dprintk("/%s", dentry->d_name.name);
+ dput(dentry);
+ ncomponents--;
}
- path += rootlen;
-out:
- fh_put(&tmp_fh);
- return path;
+
+ *pp = p;
+ err = 0;
+out_free:
+ dprintk(")\n");
+ while (ncomponents)
+ dput(components[--ncomponents]);
+ kfree(components);
+ path_put(&cur);
+ return err;
+}
+
+static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
+ const struct path *path, __be32 **pp, int *buflen)
+{
+ struct svc_export *exp_ps;
+ __be32 res;
+
+ exp_ps = rqst_find_fsidzero_export(rqstp);
+ if (IS_ERR(exp_ps))
+ return nfserrno(PTR_ERR(exp_ps));
+ res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen);
+ exp_put(exp_ps);
+ return res;
}
/*
@@ -1671,11 +1904,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
int i;
__be32 *p = *pp;
struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
- char *root = nfsd4_path(rqstp, exp, &status);
- if (status)
- return status;
- status = nfsd4_encode_components('/', root, &p, buflen);
+ status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen);
if (status)
return status;
if ((*buflen -= 4) < 0)
@@ -1691,12 +1921,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
return 0;
}
-static u32 nfs4_ftypes[16] = {
- NF4BAD, NF4FIFO, NF4CHR, NF4BAD,
- NF4DIR, NF4BAD, NF4BLK, NF4BAD,
- NF4REG, NF4BAD, NF4LNK, NF4BAD,
- NF4SOCK, NF4BAD, NF4LNK, NF4BAD,
-};
+static u32 nfs4_file_type(umode_t mode)
+{
+ switch (mode & S_IFMT) {
+ case S_IFIFO: return NF4FIFO;
+ case S_IFCHR: return NF4CHR;
+ case S_IFDIR: return NF4DIR;
+ case S_IFBLK: return NF4BLK;
+ case S_IFLNK: return NF4LNK;
+ case S_IFREG: return NF4REG;
+ case S_IFSOCK: return NF4SOCK;
+ default: return NF4BAD;
+ };
+}
static __be32
nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
@@ -1809,8 +2046,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
err = vfs_getattr(exp->ex_path.mnt, dentry, &stat);
if (err)
goto out_nfserr;
- if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL |
- FATTR4_WORD0_MAXNAME)) ||
+ if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+ FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
(bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
FATTR4_WORD1_SPACE_TOTAL))) {
err = vfs_statfs(&path, &statfs);
@@ -1885,7 +2122,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (bmval0 & FATTR4_WORD0_TYPE) {
if ((buflen -= 4) < 0)
goto out_resource;
- dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12];
+ dummy = nfs4_file_type(stat.mode);
if (dummy == NF4BAD)
goto out_serverfault;
WRITE32(dummy);
@@ -2187,6 +2424,8 @@ out_acl:
WRITE64(stat.ino);
}
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+ if ((buflen -= 16) < 0)
+ goto out_resource;
WRITE32(3);
WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
@@ -2416,7 +2655,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
if (!nfserr)
nfsd4_encode_stateid(resp, &close->cl_stateid);
- ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2492,17 +2731,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
static void
nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld)
{
+ struct xdr_netobj *conf = &ld->ld_owner;
__be32 *p;
- RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0));
+ RESERVE_SPACE(32 + XDR_LEN(conf->len));
WRITE64(ld->ld_start);
WRITE64(ld->ld_length);
WRITE32(ld->ld_type);
- if (ld->ld_sop) {
+ if (conf->len) {
WRITEMEM(&ld->ld_clientid, 8);
- WRITE32(ld->ld_sop->so_owner.len);
- WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len);
- kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner);
+ WRITE32(conf->len);
+ WRITEMEM(conf->data, conf->len);
+ kfree(conf->data);
} else { /* non - nfsv4 lock in conflict, no clientid nor owner */
WRITE64((u64)0); /* clientid */
WRITE32(0); /* length of owner name */
@@ -2520,7 +2760,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
else if (nfserr == nfserr_denied)
nfsd4_encode_lock_denied(resp, &lock->lk_denied);
- ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2540,7 +2780,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
if (!nfserr)
nfsd4_encode_stateid(resp, &locku->lu_stateid);
- ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2621,7 +2861,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
}
/* XXX save filehandle here */
out:
- ENCODE_SEQID_OP_TAIL(open->op_stateowner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2633,7 +2873,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
if (!nfserr)
nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
- ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2645,7 +2885,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc
if (!nfserr)
nfsd4_encode_stateid(resp, &od->od_stateid);
- ENCODE_SEQID_OP_TAIL(od->od_stateowner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2692,8 +2932,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
&maxcount);
- if (nfserr == nfserr_symlink)
- nfserr = nfserr_inval;
if (nfserr)
return nfserr;
eof = (read->rd_offset + maxcount >=
@@ -2823,8 +3061,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
readdir->common.err == nfserr_toosmall &&
readdir->buffer == page)
nfserr = nfserr_toosmall;
- if (nfserr == nfserr_symlink)
- nfserr = nfserr_notdir;
if (nfserr)
goto err_no_verf;
@@ -3128,6 +3364,21 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
}
static __be32
+nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr,
+ struct nfsd4_free_stateid *free_stateid)
+{
+ __be32 *p;
+
+ if (nfserr)
+ return nfserr;
+
+ RESERVE_SPACE(4);
+ WRITE32(nfserr);
+ ADJUST_ARGS();
+ return nfserr;
+}
+
+static __be32
nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
struct nfsd4_sequence *seq)
{
@@ -3140,9 +3391,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
WRITE32(seq->seqid);
WRITE32(seq->slotid);
- WRITE32(seq->maxslots);
- /* For now: target_maxslots = maxslots */
- WRITE32(seq->maxslots);
+ /* Note slotid's are numbered from zero: */
+ WRITE32(seq->maxslots - 1); /* sr_highest_slotid */
+ WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */
WRITE32(seq->status_flags);
ADJUST_ARGS();
@@ -3150,6 +3401,37 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
return 0;
}
+__be32
+nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
+ struct nfsd4_test_stateid *test_stateid)
+{
+ struct nfsd4_compoundargs *argp;
+ struct nfs4_client *cl = resp->cstate.session->se_client;
+ stateid_t si;
+ __be32 *p;
+ int i;
+ int valid;
+
+ restore_buf(test_stateid->ts_saved_args, &test_stateid->ts_savedp);
+ argp = test_stateid->ts_saved_args;
+
+ RESERVE_SPACE(4);
+ *p++ = htonl(test_stateid->ts_num_ids);
+ resp->p = p;
+
+ nfs4_lock_state();
+ for (i = 0; i < test_stateid->ts_num_ids; i++) {
+ nfsd4_decode_stateid(argp, &si);
+ valid = nfs4_validate_stateid(cl, &si);
+ RESERVE_SPACE(4);
+ *p++ = valid;
+ resp->p = p;
+ }
+ nfs4_unlock_state();
+
+ return nfserr;
+}
+
static __be32
nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
{
@@ -3208,7 +3490,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
[OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
[OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
- [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_free_stateid,
[OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
@@ -3218,7 +3500,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
[OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
[OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid,
[OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
[OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
[OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
@@ -3226,34 +3508,29 @@ static nfsd4_enc nfsd4_enc_ops[] = {
/*
* Calculate the total amount of memory that the compound response has taken
- * after encoding the current operation.
+ * after encoding the current operation with pad.
*
- * pad: add on 8 bytes for the next operation's op_code and status so that
- * there is room to cache a failure on the next operation.
+ * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop()
+ * which was specified at nfsd4_operation, else pad is zero.
*
- * Compare this length to the session se_fmaxresp_cached.
+ * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached.
*
* Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
* will be at least a page and will therefore hold the xdr_buf head.
*/
-static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
+int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
{
- int status = 0;
struct xdr_buf *xb = &resp->rqstp->rq_res;
- struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
struct nfsd4_session *session = NULL;
struct nfsd4_slot *slot = resp->cstate.slot;
- u32 length, tlen = 0, pad = 8;
+ u32 length, tlen = 0;
if (!nfsd4_has_session(&resp->cstate))
- return status;
+ return 0;
session = resp->cstate.session;
- if (session == NULL || slot->sl_cachethis == 0)
- return status;
-
- if (resp->opcnt >= args->opcnt)
- pad = 0; /* this is the last operation */
+ if (session == NULL)
+ return 0;
if (xb->page_len == 0) {
length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
@@ -3266,10 +3543,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
length, xb->page_len, tlen, pad);
- if (length <= session->se_fchannel.maxresp_cached)
- return status;
- else
+ if (length > session->se_fchannel.maxresp_sz)
+ return nfserr_rep_too_big;
+
+ if (slot->sl_cachethis == 1 &&
+ length > session->se_fchannel.maxresp_cached)
return nfserr_rep_too_big_to_cache;
+
+ return 0;
}
void
@@ -3289,8 +3570,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
!nfsd4_enc_ops[op->opnum]);
op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
/* nfsd4_check_drc_limit guarantees enough room for error status */
- if (!op->status && nfsd4_check_drc_limit(resp))
- op->status = nfserr_rep_too_big_to_cache;
+ if (!op->status)
+ op->status = nfsd4_check_resp_size(resp, 0);
status:
/*
* Note: We write the status directly, instead of using WRITE32(),
@@ -3331,8 +3612,11 @@ nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
return xdr_ressize_check(rqstp, p);
}
-void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args)
+int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp)
{
+ struct svc_rqst *rqstp = rq;
+ struct nfsd4_compoundargs *args = rqstp->rq_argp;
+
if (args->ops != args->iops) {
kfree(args->ops);
args->ops = args->iops;
@@ -3345,13 +3629,12 @@ void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args)
tb->release(tb->buf);
kfree(tb);
}
+ return 1;
}
int
nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args)
{
- __be32 status;
-
args->p = p;
args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
args->pagelist = rqstp->rq_arg.pages;
@@ -3361,11 +3644,7 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_comp
args->ops = args->iops;
args->rqstp = rqstp;
- status = nfsd4_decode_compound(args);
- if (status) {
- nfsd4_release_compoundargs(args);
- }
- return !status;
+ return !nfsd4_decode_compound(args);
}
int
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 4666a20..2cbac34 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -118,7 +118,7 @@ hash_refile(struct svc_cacherep *rp)
* Note that no operation within the loop may sleep.
*/
int
-nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
+nfsd_cache_lookup(struct svc_rqst *rqstp)
{
struct hlist_node *hn;
struct hlist_head *rh;
@@ -128,6 +128,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
vers = rqstp->rq_vers,
proc = rqstp->rq_proc;
unsigned long age;
+ int type = rqstp->rq_cachetype;
int rtn;
rqstp->rq_cacherep = NULL;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 2b1449d..c45a2ea 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -9,11 +9,11 @@
#include <linux/ctype.h>
#include <linux/sunrpc/svcsock.h>
-#include <linux/nfsd/syscall.h>
#include <linux/lockd/lockd.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/gss_api.h>
#include <linux/sunrpc/gss_krb5_enctypes.h>
+#include <linux/module.h>
#include "idmap.h"
#include "nfsd.h"
@@ -24,15 +24,6 @@
*/
enum {
NFSD_Root = 1,
-#ifdef CONFIG_NFSD_DEPRECATED
- NFSD_Svc,
- NFSD_Add,
- NFSD_Del,
- NFSD_Export,
- NFSD_Unexport,
- NFSD_Getfd,
- NFSD_Getfs,
-#endif
NFSD_List,
NFSD_Export_features,
NFSD_Fh,
@@ -59,15 +50,6 @@ enum {
/*
* write() for these nodes.
*/
-#ifdef CONFIG_NFSD_DEPRECATED
-static ssize_t write_svc(struct file *file, char *buf, size_t size);
-static ssize_t write_add(struct file *file, char *buf, size_t size);
-static ssize_t write_del(struct file *file, char *buf, size_t size);
-static ssize_t write_export(struct file *file, char *buf, size_t size);
-static ssize_t write_unexport(struct file *file, char *buf, size_t size);
-static ssize_t write_getfd(struct file *file, char *buf, size_t size);
-static ssize_t write_getfs(struct file *file, char *buf, size_t size);
-#endif
static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
@@ -83,15 +65,6 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
#endif
static ssize_t (*write_op[])(struct file *, char *, size_t) = {
-#ifdef CONFIG_NFSD_DEPRECATED
- [NFSD_Svc] = write_svc,
- [NFSD_Add] = write_add,
- [NFSD_Del] = write_del,
- [NFSD_Export] = write_export,
- [NFSD_Unexport] = write_unexport,
- [NFSD_Getfd] = write_getfd,
- [NFSD_Getfs] = write_getfs,
-#endif
[NFSD_Fh] = write_filehandle,
[NFSD_FO_UnlockIP] = write_unlock_ip,
[NFSD_FO_UnlockFS] = write_unlock_fs,
@@ -130,16 +103,6 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
{
-#ifdef CONFIG_NFSD_DEPRECATED
- static int warned;
- if (file->f_dentry->d_name.name[0] == '.' && !warned) {
- printk(KERN_INFO
- "Warning: \"%s\" uses deprecated NFSD interface: %s."
- " This will be removed in 2.6.40\n",
- current->comm, file->f_dentry->d_name.name);
- warned = 1;
- }
-#endif
if (! file->private_data) {
/* An attempt to read a transaction file without writing
* causes a 0-byte write so that the file can return
@@ -226,303 +189,6 @@ static const struct file_operations pool_stats_operations = {
* payload - write methods
*/
-#ifdef CONFIG_NFSD_DEPRECATED
-/**
- * write_svc - Start kernel's NFSD server
- *
- * Deprecated. /proc/fs/nfsd/threads is preferred.
- * Function remains to support old versions of nfs-utils.
- *
- * Input:
- * buf: struct nfsctl_svc
- * svc_port: port number of this
- * server's listener
- * svc_nthreads: number of threads to start
- * size: size in bytes of passed in nfsctl_svc
- * Output:
- * On success: returns zero
- * On error: return code is negative errno value
- */
-static ssize_t write_svc(struct file *file, char *buf, size_t size)
-{
- struct nfsctl_svc *data;
- int err;
- if (size < sizeof(*data))
- return -EINVAL;
- data = (struct nfsctl_svc*) buf;
- err = nfsd_svc(data->svc_port, data->svc_nthreads);
- if (err < 0)
- return err;
- return 0;
-}
-
-/**
- * write_add - Add or modify client entry in auth unix cache
- *
- * Deprecated. /proc/net/rpc/auth.unix.ip is preferred.
- * Function remains to support old versions of nfs-utils.
- *
- * Input:
- * buf: struct nfsctl_client
- * cl_ident: '\0'-terminated C string
- * containing domain name
- * of client
- * cl_naddr: no. of items in cl_addrlist
- * cl_addrlist: array of client addresses
- * cl_fhkeytype: ignored
- * cl_fhkeylen: ignored
- * cl_fhkey: ignored
- * size: size in bytes of passed in nfsctl_client
- * Output:
- * On success: returns zero
- * On error: return code is negative errno value
- *
- * Note: Only AF_INET client addresses are passed in, since
- * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
- */
-static ssize_t write_add(struct file *file, char *buf, size_t size)
-{
- struct nfsctl_client *data;
- if (size < sizeof(*data))
- return -EINVAL;
- data = (struct nfsctl_client *)buf;
- return exp_addclient(data);
-}
-
-/**
- * write_del - Remove client from auth unix cache
- *
- * Deprecated. /proc/net/rpc/auth.unix.ip is preferred.
- * Function remains to support old versions of nfs-utils.
- *
- * Input:
- * buf: struct nfsctl_client
- * cl_ident: '\0'-terminated C string
- * containing domain name
- * of client
- * cl_naddr: ignored
- * cl_addrlist: ignored
- * cl_fhkeytype: ignored
- * cl_fhkeylen: ignored
- * cl_fhkey: ignored
- * size: size in bytes of passed in nfsctl_client
- * Output:
- * On success: returns zero
- * On error: return code is negative errno value
- *
- * Note: Only AF_INET client addresses are passed in, since
- * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
- */
-static ssize_t write_del(struct file *file, char *buf, size_t size)
-{
- struct nfsctl_client *data;
- if (size < sizeof(*data))
- return -EINVAL;
- data = (struct nfsctl_client *)buf;
- return exp_delclient(data);
-}
-
-/**
- * write_export - Export part or all of a local file system
- *
- * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
- * Function remains to support old versions of nfs-utils.
- *
- * Input:
- * buf: struct nfsctl_export
- * ex_client: '\0'-terminated C string
- * containing domain name
- * of client allowed to access
- * this export
- * ex_path: '\0'-terminated C string
- * containing pathname of
- * directory in local file system
- * ex_dev: fsid to use for this export
- * ex_ino: ignored
- * ex_flags: export flags for this export
- * ex_anon_uid: UID to use for anonymous
- * requests
- * ex_anon_gid: GID to use for anonymous
- * requests
- * size: size in bytes of passed in nfsctl_export
- * Output:
- * On success: returns zero
- * On error: return code is negative errno value
- */
-static ssize_t write_export(struct file *file, char *buf, size_t size)
-{
- struct nfsctl_export *data;
- if (size < sizeof(*data))
- return -EINVAL;
- data = (struct nfsctl_export*)buf;
- return exp_export(data);
-}
-
-/**
- * write_unexport - Unexport a previously exported file system
- *
- * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
- * Function remains to support old versions of nfs-utils.
- *
- * Input:
- * buf: struct nfsctl_export
- * ex_client: '\0'-terminated C string
- * containing domain name
- * of client no longer allowed
- * to access this export
- * ex_path: '\0'-terminated C string
- * containing pathname of
- * directory in local file system
- * ex_dev: ignored
- * ex_ino: ignored
- * ex_flags: ignored
- * ex_anon_uid: ignored
- * ex_anon_gid: ignored
- * size: size in bytes of passed in nfsctl_export
- * Output:
- * On success: returns zero
- * On error: return code is negative errno value
- */
-static ssize_t write_unexport(struct file *file, char *buf, size_t size)
-{
- struct nfsctl_export *data;
-
- if (size < sizeof(*data))
- return -EINVAL;
- data = (struct nfsctl_export*)buf;
- return exp_unexport(data);
-}
-
-/**
- * write_getfs - Get a variable-length NFS file handle by path
- *
- * Deprecated. /proc/fs/nfsd/filehandle is preferred.
- * Function remains to support old versions of nfs-utils.
- *
- * Input:
- * buf: struct nfsctl_fsparm
- * gd_addr: socket address of client
- * gd_path: '\0'-terminated C string
- * containing pathname of
- * directory in local file system
- * gd_maxlen: maximum size of returned file
- * handle
- * size: size in bytes of passed in nfsctl_fsparm
- * Output:
- * On success: passed-in buffer filled with a knfsd_fh structure
- * (a variable-length raw NFS file handle);
- * return code is the size in bytes of the file handle
- * On error: return code is negative errno value
- *
- * Note: Only AF_INET client addresses are passed in, since gd_addr
- * is the same size as a struct sockaddr_in.
- */
-static ssize_t write_getfs(struct file *file, char *buf, size_t size)
-{
- struct nfsctl_fsparm *data;
- struct sockaddr_in *sin;
- struct auth_domain *clp;
- int err = 0;
- struct knfsd_fh *res;
- struct in6_addr in6;
-
- if (size < sizeof(*data))
- return -EINVAL;
- data = (struct nfsctl_fsparm*)buf;
- err = -EPROTONOSUPPORT;
- if (data->gd_addr.sa_family != AF_INET)
- goto out;
- sin = (struct sockaddr_in *)&data->gd_addr;
- if (data->gd_maxlen > NFS3_FHSIZE)
- data->gd_maxlen = NFS3_FHSIZE;
-
- res = (struct knfsd_fh*)buf;
-
- exp_readlock();
-
- ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
-
- clp = auth_unix_lookup(&init_net, &in6);
- if (!clp)
- err = -EPERM;
- else {
- err = exp_rootfh(clp, data->gd_path, res, data->gd_maxlen);
- auth_domain_put(clp);
- }
- exp_readunlock();
- if (err == 0)
- err = res->fh_size + offsetof(struct knfsd_fh, fh_base);
- out:
- return err;
-}
-
-/**
- * write_getfd - Get a fixed-length NFS file handle by path (used by mountd)
- *
- * Deprecated. /proc/fs/nfsd/filehandle is preferred.
- * Function remains to support old versions of nfs-utils.
- *
- * Input:
- * buf: struct nfsctl_fdparm
- * gd_addr: socket address of client
- * gd_path: '\0'-terminated C string
- * containing pathname of
- * directory in local file system
- * gd_version: fdparm structure version
- * size: size in bytes of passed in nfsctl_fdparm
- * Output:
- * On success: passed-in buffer filled with nfsctl_res
- * (a fixed-length raw NFS file handle);
- * return code is the size in bytes of the file handle
- * On error: return code is negative errno value
- *
- * Note: Only AF_INET client addresses are passed in, since gd_addr
- * is the same size as a struct sockaddr_in.
- */
-static ssize_t write_getfd(struct file *file, char *buf, size_t size)
-{
- struct nfsctl_fdparm *data;
- struct sockaddr_in *sin;
- struct auth_domain *clp;
- int err = 0;
- struct knfsd_fh fh;
- char *res;
- struct in6_addr in6;
-
- if (size < sizeof(*data))
- return -EINVAL;
- data = (struct nfsctl_fdparm*)buf;
- err = -EPROTONOSUPPORT;
- if (data->gd_addr.sa_family != AF_INET)
- goto out;
- err = -EINVAL;
- if (data->gd_version < 2 || data->gd_version > NFSSVC_MAXVERS)
- goto out;
-
- res = buf;
- sin = (struct sockaddr_in *)&data->gd_addr;
- exp_readlock();
-
- ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
-
- clp = auth_unix_lookup(&init_net, &in6);
- if (!clp)
- err = -EPERM;
- else {
- err = exp_rootfh(clp, data->gd_path, &fh, NFS_FHSIZE);
- auth_domain_put(clp);
- }
- exp_readunlock();
-
- if (err == 0) {
- memset(res,0, NFS_FHSIZE);
- memcpy(res, &fh.fh_base, fh.fh_size);
- err = NFS_FHSIZE;
- }
- out:
- return err;
-}
-#endif /* CONFIG_NFSD_DEPRECATED */
/**
* write_unlock_ip - Release all locks used by a client
@@ -1397,15 +1063,6 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
{
static struct tree_descr nfsd_files[] = {
-#ifdef CONFIG_NFSD_DEPRECATED
- [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
- [NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
- [NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
- [NFSD_Export] = {".export", &transaction_ops, S_IWUSR},
- [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
- [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
- [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
-#endif
[NFSD_List] = {"exports", &exports_operations, S_IRUGO},
[NFSD_Export_features] = {"export_features",
&export_features_operations, S_IRUGO},
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 7ecfa24..58134a2 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -11,13 +11,39 @@
#include <linux/types.h>
#include <linux/mount.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/sunrpc/msg_prot.h>
+
#include <linux/nfsd/debug.h>
#include <linux/nfsd/export.h>
#include <linux/nfsd/stats.h>
+
/*
* nfsd version
*/
#define NFSD_SUPPORTED_MINOR_VERSION 1
+/*
+ * Maximum blocksizes supported by daemon under various circumstances.
+ */
+#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD
+/* NFSv2 is limited by the protocol specification, see RFC 1094 */
+#define NFSSVC_MAXBLKSIZE_V2 (8*1024)
+
+
+/*
+ * Largest number of bytes we need to allocate for an NFS
+ * call or reply. Used to control buffer sizes. We use
+ * the length of v3 WRITE, READDIR and READDIR replies
+ * which are an RPC header, up to 26 XDR units of reply
+ * data, and some page data.
+ *
+ * Note that accuracy here doesn't matter too much as the
+ * size is rounded up to a page size when allocating space.
+ */
+#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE)
struct readdir_cd {
__be32 err; /* 0, nfserr, or nfserr_eof */
@@ -335,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
NFSD_WRITEABLE_ATTRS_WORD2
+extern int nfsd4_is_junction(struct dentry *dentry);
+#else
+static inline int nfsd4_is_junction(struct dentry *dentry)
+{
+ return 0;
+}
+
#endif /* CONFIG_NFSD_V4 */
#endif /* LINUX_NFSD_NFSD_H */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 90c6aa6..c763de5 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -59,28 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
* the write call).
*/
static inline __be32
-nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
+nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested)
{
- /* Type can be negative when creating hardlinks - not to a dir */
- if (type > 0 && (mode & S_IFMT) != type) {
- if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK)
- return nfserr_symlink;
- else if (type == S_IFDIR)
- return nfserr_notdir;
- else if ((mode & S_IFMT) == S_IFDIR)
- return nfserr_isdir;
- else
- return nfserr_inval;
- }
- if (type < 0 && (mode & S_IFMT) == -type) {
- if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK)
- return nfserr_symlink;
- else if (type == -S_IFDIR)
- return nfserr_isdir;
- else
- return nfserr_notdir;
- }
- return 0;
+ mode &= S_IFMT;
+
+ if (requested == 0) /* the caller doesn't care */
+ return nfs_ok;
+ if (mode == requested)
+ return nfs_ok;
+ /*
+ * v4 has an error more specific than err_notdir which we should
+ * return in preference to err_notdir:
+ */
+ if (rqstp->rq_vers == 4 && mode == S_IFLNK)
+ return nfserr_symlink;
+ if (requested == S_IFDIR)
+ return nfserr_notdir;
+ if (mode == S_IFDIR)
+ return nfserr_isdir;
+ return nfserr_inval;
}
static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 18743c4..7595582 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/freezer.h>
+#include <linux/module.h>
#include <linux/fs_struct.h>
#include <linux/swap.h>
@@ -256,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv)
nfsd_serv = NULL;
nfsd_shutdown();
+ svc_rpcb_cleanup(serv);
+
printk(KERN_WARNING "nfsd: last server has exited, flushing export "
"cache\n");
nfsd_export_flush();
@@ -528,16 +531,9 @@ nfsd(void *vrqstp)
continue;
}
-
- /* Lock the export hash tables for reading. */
- exp_readlock();
-
validate_process_creds();
svc_process(rqstp);
validate_process_creds();
-
- /* Unlock export hash tables */
- exp_readunlock();
}
/* Clear signals before calling svc_exit_thread() */
@@ -577,8 +573,22 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
rqstp->rq_vers, rqstp->rq_proc);
proc = rqstp->rq_procinfo;
+ /*
+ * Give the xdr decoder a chance to change this if it wants
+ * (necessary in the NFSv4.0 compound case)
+ */
+ rqstp->rq_cachetype = proc->pc_cachetype;
+ /* Decode arguments */
+ xdr = proc->pc_decode;
+ if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base,
+ rqstp->rq_argp)) {
+ dprintk("nfsd: failed to decode arguments!\n");
+ *statp = rpc_garbage_args;
+ return 1;
+ }
+
/* Check whether we have this call in the cache. */
- switch (nfsd_cache_lookup(rqstp, proc->pc_cachetype)) {
+ switch (nfsd_cache_lookup(rqstp)) {
case RC_INTR:
case RC_DROPIT:
return 0;
@@ -588,16 +598,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
/* do it */
}
- /* Decode arguments */
- xdr = proc->pc_decode;
- if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base,
- rqstp->rq_argp)) {
- dprintk("nfsd: failed to decode arguments!\n");
- nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
- *statp = rpc_garbage_args;
- return 1;
- }
-
/* need to grab the location to store the status, as
* nfsv4 does some encoding while processing
*/
@@ -633,7 +633,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
}
/* Store reply in cache. */
- nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
+ nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
return 1;
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 858c7ba..a3cf384 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
#ifndef _NFSD4_STATE_H
#define _NFSD4_STATE_H
+#include <linux/idr.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/nfsd/nfsfh.h>
#include "nfsfh.h"
@@ -45,24 +46,20 @@ typedef struct {
} clientid_t;
typedef struct {
- u32 so_boot;
- u32 so_stateownerid;
- u32 so_fileid;
+ clientid_t so_clid;
+ u32 so_id;
} stateid_opaque_t;
typedef struct {
u32 si_generation;
stateid_opaque_t si_opaque;
} stateid_t;
-#define si_boot si_opaque.so_boot
-#define si_stateownerid si_opaque.so_stateownerid
-#define si_fileid si_opaque.so_fileid
#define STATEID_FMT "(%08x/%08x/%08x/%08x)"
#define STATEID_VAL(s) \
- (s)->si_boot, \
- (s)->si_stateownerid, \
- (s)->si_fileid, \
+ (s)->si_opaque.so_clid.cl_boot, \
+ (s)->si_opaque.so_clid.cl_id, \
+ (s)->si_opaque.so_id, \
(s)->si_generation
struct nfsd4_callback {
@@ -76,17 +73,27 @@ struct nfsd4_callback {
bool cb_done;
};
+struct nfs4_stid {
+#define NFS4_OPEN_STID 1
+#define NFS4_LOCK_STID 2
+#define NFS4_DELEG_STID 4
+/* For an open stateid kept around *only* to process close replays: */
+#define NFS4_CLOSED_STID 8
+ unsigned char sc_type;
+ stateid_t sc_stateid;
+ struct nfs4_client *sc_client;
+};
+
struct nfs4_delegation {
+ struct nfs4_stid dl_stid; /* must be first field */
struct list_head dl_perfile;
struct list_head dl_perclnt;
struct list_head dl_recall_lru; /* delegation recalled */
atomic_t dl_count; /* ref count */
- struct nfs4_client *dl_client;
struct nfs4_file *dl_file;
u32 dl_type;
time_t dl_time;
/* For recall: */
- stateid_t dl_stateid;
struct knfsd_fh dl_fh;
int dl_retries;
struct nfsd4_callback dl_recall;
@@ -104,6 +111,11 @@ struct nfs4_cb_conn {
struct svc_xprt *cb_xprt; /* minorversion 1 only */
};
+static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
+{
+ return container_of(s, struct nfs4_delegation, dl_stid);
+}
+
/* Maximum number of slots per session. 160 is useful for long haul TCP */
#define NFSD_MAX_SLOTS_PER_SESSION 160
/* Maximum number of operations per session compound */
@@ -220,6 +232,7 @@ struct nfs4_client {
struct list_head cl_idhash; /* hash by cl_clientid.id */
struct list_head cl_strhash; /* hash by cl_name */
struct list_head cl_openowners;
+ struct idr cl_stateids; /* stateid lookup */
struct list_head cl_delegations;
struct list_head cl_lru; /* tail queue */
struct xdr_netobj cl_name; /* id generated by client */
@@ -245,6 +258,7 @@ struct nfs4_client {
#define NFSD4_CB_UP 0
#define NFSD4_CB_UNKNOWN 1
#define NFSD4_CB_DOWN 2
+#define NFSD4_CB_FAULT 3
int cl_cb_state;
struct nfsd4_callback cl_cb_null;
struct nfsd4_session *cl_cb_session;
@@ -293,6 +307,9 @@ static inline void
update_stateid(stateid_t *stateid)
{
stateid->si_generation++;
+ /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
+ if (stateid->si_generation == 0)
+ stateid->si_generation = 1;
}
/* A reasonable value for REPLAY_ISIZE was estimated as follows:
@@ -312,49 +329,57 @@ struct nfs4_replay {
__be32 rp_status;
unsigned int rp_buflen;
char *rp_buf;
- unsigned intrp_allocated;
struct knfsd_fh rp_openfh;
char rp_ibuf[NFSD4_REPLAY_ISIZE];
};
-/*
-* nfs4_stateowner can either be an open_owner, or a lock_owner
-*
-* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[]
-* for lock_owner
-* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[]
-* for lock_owner
-* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client
-* struct is reaped.
-* so_perfilestate: heads the list of nfs4_stateid (either open or lock)
-* and is used to ensure no dangling nfs4_stateid references when we
-* release a stateowner.
-* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when
-* close is called to reap associated byte-range locks
-* so_close_lru: (open) stateowner is placed on this list instead of being
-* reaped (when so_perfilestate is empty) to hold the last close replay.
-* reaped by laundramat thread after lease period.
-*/
struct nfs4_stateowner {
- struct kref so_ref;
- struct list_head so_idhash; /* hash by so_id */
struct list_head so_strhash; /* hash by op_name */
- struct list_head so_perclient;
struct list_head so_stateids;
- struct list_head so_perstateid; /* for lockowners only */
- struct list_head so_close_lru; /* tail queue */
- time_t so_time; /* time of placement on so_close_lru */
- int so_is_open_owner; /* 1=openowner,0=lockowner */
- u32 so_id;
struct nfs4_client * so_client;
/* after increment in ENCODE_SEQID_OP_TAIL, represents the next
* sequence id expected from the client: */
u32 so_seqid;
struct xdr_netobj so_owner; /* open owner name */
- int so_confirmed; /* successful OPEN_CONFIRM? */
struct nfs4_replay so_replay;
+ bool so_is_open_owner;
};
+struct nfs4_openowner {
+ struct nfs4_stateowner oo_owner; /* must be first field */
+ struct list_head oo_perclient;
+ /*
+ * We keep around openowners a little while after last close,
+ * which saves clients from having to confirm, and allows us to
+ * handle close replays if they come soon enough. The close_lru
+ * is a list of such openowners, to be reaped by the laundromat
+ * thread eventually if they remain unused:
+ */
+ struct list_head oo_close_lru;
+ struct nfs4_ol_stateid *oo_last_closed_stid;
+ time_t oo_time; /* time of placement on so_close_lru */
+#define NFS4_OO_CONFIRMED 1
+#define NFS4_OO_PURGE_CLOSE 2
+#define NFS4_OO_NEW 4
+ unsigned char oo_flags;
+};
+
+struct nfs4_lockowner {
+ struct nfs4_stateowner lo_owner; /* must be first element */
+ struct list_head lo_perstateid; /* for lockowners only */
+ struct list_head lo_list; /* for temporary uses */
+};
+
+static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
+{
+ return container_of(so, struct nfs4_openowner, oo_owner);
+}
+
+static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
+{
+ return container_of(so, struct nfs4_lockowner, lo_owner);
+}
+
/*
* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
* o fi_perfile list is used to search for conflicting
@@ -368,17 +393,17 @@ struct nfs4_file {
/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
struct file * fi_fds[3];
/*
- * Each open or lock stateid contributes 1 to either
- * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending
- * on open or lock mode:
+ * Each open or lock stateid contributes 0-4 to the counts
+ * below depending on which bits are set in st_access_bitmap:
+ * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set
+ * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set
+ * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set.
*/
atomic_t fi_access[2];
struct file *fi_deleg_file;
struct file_lock *fi_lease;
atomic_t fi_delegees;
struct inode *fi_inode;
- u32 fi_id; /* used with stateowner->so_id
- * for stateid_hashtbl hash */
bool fi_had_conflict;
};
@@ -408,44 +433,27 @@ static inline struct file *find_any_file(struct nfs4_file *f)
return f->fi_fds[O_RDONLY];
}
-/*
-* nfs4_stateid can either be an open stateid or (eventually) a lock stateid
-*
-* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file
-*
-* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry
-* st_perfile: file_hashtbl[] entry.
-* st_perfile_state: nfs4_stateowner->so_perfilestate
-* st_perlockowner: (open stateid) list of lock nfs4_stateowners
-* st_access_bmap: used only for open stateid
-* st_deny_bmap: used only for open stateid
-* st_openstp: open stateid lock stateid was derived from
-*
-* XXX: open stateids and lock stateids have diverged sufficiently that
-* we should consider defining separate structs for the two cases.
-*/
-
-struct nfs4_stateid {
- struct list_head st_hash;
+/* "ol" stands for "Open or Lock". Better suggestions welcome. */
+struct nfs4_ol_stateid {
+ struct nfs4_stid st_stid; /* must be first field */
struct list_head st_perfile;
struct list_head st_perstateowner;
struct list_head st_lockowners;
struct nfs4_stateowner * st_stateowner;
struct nfs4_file * st_file;
- stateid_t st_stateid;
unsigned long st_access_bmap;
unsigned long st_deny_bmap;
- struct nfs4_stateid * st_openstp;
+ struct nfs4_ol_stateid * st_openstp;
};
+static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
+{
+ return container_of(s, struct nfs4_ol_stateid, st_stid);
+}
+
/* flags for preprocess_seqid_op() */
-#define HAS_SESSION 0x00000001
-#define CONFIRM 0x00000002
-#define OPEN_STATE 0x00000004
-#define LOCK_STATE 0x00000008
#define RD_STATE 0x00000010
#define WR_STATE 0x00000020
-#define CLOSE_STATE 0x00000040
struct nfsd4_compound_state;
@@ -455,7 +463,8 @@ extern void nfs4_lock_state(void);
extern void nfs4_unlock_state(void);
extern int nfs4_in_grace(void);
extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
-extern void nfs4_free_stateowner(struct kref *kref);
+extern void nfs4_free_openowner(struct nfs4_openowner *);
+extern void nfs4_free_lockowner(struct nfs4_lockowner *);
extern int set_callback_cred(void);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
@@ -467,7 +476,7 @@ extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfs4_put_delegation(struct nfs4_delegation *dp);
extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
-extern void nfsd4_init_recdir(char *recdir_name);
+extern void nfsd4_init_recdir(void);
extern int nfsd4_recdir_load(void);
extern void nfsd4_shutdown_recdir(void);
extern int nfs4_client_to_reclaim(const char *name);
@@ -476,17 +485,7 @@ extern void nfsd4_recdir_purge_old(void);
extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
extern void release_session_client(struct nfsd4_session *);
-
-static inline void
-nfs4_put_stateowner(struct nfs4_stateowner *so)
-{
- kref_put(&so->so_ref, nfs4_free_stateowner);
-}
-
-static inline void
-nfs4_get_stateowner(struct nfs4_stateowner *so)
-{
- kref_get(&so->so_ref);
-}
+extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *);
+extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index acf88ae..e2e7914 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
{
if (d_mountpoint(dentry))
return 1;
+ if (nfsd4_is_junction(dentry))
+ return 1;
if (!(exp->ex_flags & NFSEXP_V4ROOT))
return 0;
return dentry->d_inode != NULL;
@@ -295,41 +297,12 @@ commit_metadata(struct svc_fh *fhp)
}
/*
- * Set various file attributes.
- * N.B. After this call fhp needs an fh_put
+ * Go over the attributes and take care of the small differences between
+ * NFS semantics and what Linux expects.
*/
-__be32
-nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
- int check_guard, time_t guardtime)
+static void
+nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
{
- struct dentry *dentry;
- struct inode *inode;
- int accmode = NFSD_MAY_SATTR;
- int ftype = 0;
- __be32 err;
- int host_err;
- int size_change = 0;
-
- if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
- accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
- if (iap->ia_valid & ATTR_SIZE)
- ftype = S_IFREG;
-
- /* Get inode */
- err = fh_verify(rqstp, fhp, ftype, accmode);
- if (err)
- goto out;
-
- dentry = fhp->fh_dentry;
- inode = dentry->d_inode;
-
- /* Ignore any mode updates on symlinks */
- if (S_ISLNK(inode->i_mode))
- iap->ia_valid &= ~ATTR_MODE;
-
- if (!iap->ia_valid)
- goto out;
-
/*
* NFSv2 does not differentiate between "set-[ac]time-to-now"
* which only requires access, and "set-[ac]time-to-X" which
@@ -339,8 +312,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
* convert to "set to now" instead of "set to explicit time"
*
* We only call inode_change_ok as the last test as technically
- * it is not an interface that we should be using. It is only
- * valid if the filesystem does not define it's own i_op->setattr.
+ * it is not an interface that we should be using.
*/
#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
#define MAX_TOUCH_TIME_ERROR (30*60)
@@ -366,30 +338,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
iap->ia_valid &= ~BOTH_TIME_SET;
}
}
-
- /*
- * The size case is special.
- * It changes the file as well as the attributes.
- */
- if (iap->ia_valid & ATTR_SIZE) {
- if (iap->ia_size < inode->i_size) {
- err = nfsd_permission(rqstp, fhp->fh_export, dentry,
- NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
- if (err)
- goto out;
- }
-
- host_err = get_write_access(inode);
- if (host_err)
- goto out_nfserr;
-
- size_change = 1;
- host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
- if (host_err) {
- put_write_access(inode);
- goto out_nfserr;
- }
- }
/* sanitize the mode change */
if (iap->ia_valid & ATTR_MODE) {
@@ -412,32 +360,120 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
}
}
+}
- /* Change the attributes. */
+static __be32
+nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct iattr *iap)
+{
+ struct inode *inode = fhp->fh_dentry->d_inode;
+ int host_err;
- iap->ia_valid |= ATTR_CTIME;
+ if (iap->ia_size < inode->i_size) {
+ __be32 err;
+
+ err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+ NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
+ if (err)
+ return err;
+ }
+
+ host_err = get_write_access(inode);
+ if (host_err)
+ goto out_nfserrno;
+
+ host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+ if (host_err)
+ goto out_put_write_access;
+ return 0;
+
+out_put_write_access:
+ put_write_access(inode);
+out_nfserrno:
+ return nfserrno(host_err);
+}
+
+/*
+ * Set various file attributes. After this call fhp needs an fh_put.
+ */
+__be32
+nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+ int check_guard, time_t guardtime)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+ int accmode = NFSD_MAY_SATTR;
+ int ftype = 0;
+ __be32 err;
+ int host_err;
+ bool get_write_count;
+ int size_change = 0;
- err = nfserr_notsync;
- if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
- host_err = nfsd_break_lease(inode);
+ if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
+ accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
+ if (iap->ia_valid & ATTR_SIZE)
+ ftype = S_IFREG;
+
+ /* Callers that do fh_verify should do the fh_want_write: */
+ get_write_count = !fhp->fh_dentry;
+
+ /* Get inode */
+ err = fh_verify(rqstp, fhp, ftype, accmode);
+ if (err)
+ goto out;
+ if (get_write_count) {
+ host_err = fh_want_write(fhp);
if (host_err)
- goto out_nfserr;
- fh_lock(fhp);
+ return nfserrno(host_err);
+ }
- host_err = notify_change(dentry, iap);
- err = nfserrno(host_err);
- fh_unlock(fhp);
+ dentry = fhp->fh_dentry;
+ inode = dentry->d_inode;
+
+ /* Ignore any mode updates on symlinks */
+ if (S_ISLNK(inode->i_mode))
+ iap->ia_valid &= ~ATTR_MODE;
+
+ if (!iap->ia_valid)
+ goto out;
+
+ nfsd_sanitize_attrs(inode, iap);
+
+ /*
+ * The size case is special, it changes the file in addition to the
+ * attributes.
+ */
+ if (iap->ia_valid & ATTR_SIZE) {
+ err = nfsd_get_write_access(rqstp, fhp, iap);
+ if (err)
+ goto out;
+ size_change = 1;
}
+
+ iap->ia_valid |= ATTR_CTIME;
+
+ if (check_guard && guardtime != inode->i_ctime.tv_sec) {
+ err = nfserr_notsync;
+ goto out_put_write_access;
+ }
+
+ host_err = nfsd_break_lease(inode);
+ if (host_err)
+ goto out_put_write_access_nfserror;
+
+ fh_lock(fhp);
+ host_err = notify_change(dentry, iap);
+ fh_unlock(fhp);
+
+out_put_write_access_nfserror:
+ err = nfserrno(host_err);
+out_put_write_access:
if (size_change)
put_write_access(inode);
if (!err)
commit_metadata(fhp);
out:
return err;
-
-out_nfserr:
- err = nfserrno(host_err);
- goto out;
}
#if defined(CONFIG_NFSD_V2_ACL) || \
@@ -472,6 +508,9 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
char *buf = NULL;
int error = 0;
+ if (!pacl)
+ return vfs_setxattr(dentry, key, NULL, 0, 0);
+
buflen = posix_acl_xattr_size(pacl->a_count);
buf = kmalloc(buflen, GFP_KERNEL);
error = -ENOMEM;
@@ -502,7 +541,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int flags = 0;
/* Get inode */
- error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
+ error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
if (error)
return error;
@@ -592,6 +631,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
return error;
}
+#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction."
+#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type"
+int nfsd4_is_junction(struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ if (inode == NULL)
+ return 0;
+ if (inode->i_mode & S_IXUGO)
+ return 0;
+ if (!(inode->i_mode & S_ISVTX))
+ return 0;
+ if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0)
+ return 0;
+ return 1;
+}
#endif /* defined(CONFIG_NFSD_V4) */
#ifdef CONFIG_NFSD_V3
@@ -708,12 +763,13 @@ static int nfsd_open_break_lease(struct inode *inode, int access)
/*
* Open an existing file or directory.
- * The access argument indicates the type of open (read/write/lock)
+ * The may_flags argument indicates the type of open (read/write/lock)
+ * and additional flags.
* N.B. After this call fhp needs an fh_put
*/
__be32
nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
- int access, struct file **filp)
+ int may_flags, struct file **filp)
{
struct dentry *dentry;
struct inode *inode;
@@ -728,7 +784,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
* and (hopefully) checked permission - so allow OWNER_OVERRIDE
* in case a chmod has now revoked permission.
*/
- err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE);
+ err = fh_verify(rqstp, fhp, type, may_flags | NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
@@ -739,7 +795,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
* or any access when mandatory locking enabled
*/
err = nfserr_perm;
- if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE))
+ if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
goto out;
/*
* We must ignore files (but only files) which might have mandatory
@@ -752,22 +808,30 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (!inode->i_fop)
goto out;
- host_err = nfsd_open_break_lease(inode, access);
+ host_err = nfsd_open_break_lease(inode, may_flags);
if (host_err) /* NOMEM or WOULDBLOCK */
goto out_nfserr;
- if (access & NFSD_MAY_WRITE) {
- if (access & NFSD_MAY_READ)
+ if (may_flags & NFSD_MAY_WRITE) {
+ if (may_flags & NFSD_MAY_READ)
flags = O_RDWR|O_LARGEFILE;
else
flags = O_WRONLY|O_LARGEFILE;
}
*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
flags, current_cred());
- if (IS_ERR(*filp))
+ if (IS_ERR(*filp)) {
host_err = PTR_ERR(*filp);
- else
- host_err = ima_file_check(*filp, access);
+ *filp = NULL;
+ } else {
+ host_err = ima_file_check(*filp, may_flags);
+
+ if (may_flags & NFSD_MAY_64BIT_COOKIE)
+ (*filp)->f_mode |= FMODE_64BITHASH;
+ else
+ (*filp)->f_mode |= FMODE_32BITHASH;
+ }
+
out_nfserr:
err = nfserrno(host_err);
out:
@@ -1352,7 +1416,7 @@ __be32
do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen, struct iattr *iap,
struct svc_fh *resfhp, int createmode, u32 *verifier,
- int *truncp, int *created)
+ bool *truncp, bool *created)
{
struct dentry *dentry, *dchild = NULL;
struct inode *dirp;
@@ -1421,7 +1485,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
switch (createmode) {
case NFS3_CREATE_UNCHECKED:
if (! S_ISREG(dchild->d_inode->i_mode))
- err = nfserr_exist;
+ goto out;
else if (truncp) {
/* in nfsv4, we need to treat this case a little
* differently. we don't want to truncate the
@@ -1440,13 +1504,19 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
case NFS3_CREATE_EXCLUSIVE:
if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
&& dchild->d_inode->i_atime.tv_sec == v_atime
- && dchild->d_inode->i_size == 0 )
+ && dchild->d_inode->i_size == 0 ) {
+ if (created)
+ *created = 1;
break;
+ }
case NFS4_CREATE_EXCLUSIVE4_1:
if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
&& dchild->d_inode->i_atime.tv_sec == v_atime
- && dchild->d_inode->i_size == 0 )
+ && dchild->d_inode->i_size == 0 ) {
+ if (created)
+ *created = 1;
goto set_attr;
+ }
/* fallthru */
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
@@ -1632,10 +1702,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
- err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP);
+ err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP);
if (err)
goto out;
-
+ err = nfserr_isdir;
+ if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode))
+ goto out;
err = nfserr_perm;
if (!len)
goto out;
@@ -1989,8 +2061,13 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
__be32 err;
struct file *file;
loff_t offset = *offsetp;
+ int may_flags = NFSD_MAY_READ;
+
+ /* NFSv2 only supports 32 bit cookies */
+ if (rqstp->rq_vers > 2)
+ may_flags |= NFSD_MAY_64BIT_COOKIE;
- err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file);
+ err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file);
if (err)
goto out;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a22e40e..c7fe962 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -10,22 +10,24 @@
/*
* Flags for nfsd_permission
*/
-#define NFSD_MAY_NOP 0
-#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */
-#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */
-#define NFSD_MAY_READ 4 /* == MAY_READ */
-#define NFSD_MAY_SATTR 8
-#define NFSD_MAY_TRUNC 16
-#define NFSD_MAY_LOCK 32
-#define NFSD_MAY_MASK 63
+#define NFSD_MAY_NOP 0
+#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */
+#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */
+#define NFSD_MAY_READ 0x004 /* == MAY_READ */
+#define NFSD_MAY_SATTR 0x008
+#define NFSD_MAY_TRUNC 0x010
+#define NFSD_MAY_LOCK 0x020
+#define NFSD_MAY_MASK 0x03f
/* extra hints to permission and open routines: */
-#define NFSD_MAY_OWNER_OVERRIDE 64
-#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
-#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
-#define NFSD_MAY_NOT_BREAK_LEASE 512
-#define NFSD_MAY_BYPASS_GSS 1024
-#define NFSD_MAY_READ_IF_EXEC 2048
+#define NFSD_MAY_OWNER_OVERRIDE 0x040
+#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */
+#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100
+#define NFSD_MAY_NOT_BREAK_LEASE 0x200
+#define NFSD_MAY_BYPASS_GSS 0x400
+#define NFSD_MAY_READ_IF_EXEC 0x800
+
+#define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */
#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -62,7 +64,7 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
struct svc_fh *res, int createmode,
- u32 *verifier, int *truncp, int *created);
+ u32 *verifier, bool *truncp, bool *created);
__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
loff_t, unsigned long);
#endif /* CONFIG_NFSD_V3 */
@@ -106,4 +108,14 @@ struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
#endif
+static inline int fh_want_write(struct svc_fh *fh)
+{
+ return mnt_want_write(fh->fh_export->ex_path.mnt);
+}
+
+static inline void fh_drop_write(struct svc_fh *fh)
+{
+ mnt_drop_write(fh->fh_export->ex_path.mnt);
+}
+
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 366401e..2364747 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -81,7 +81,6 @@ struct nfsd4_access {
struct nfsd4_close {
u32 cl_seqid; /* request */
stateid_t cl_stateid; /* request+response */
- struct nfs4_stateowner * cl_stateowner; /* response */
};
struct nfsd4_commit {
@@ -131,7 +130,7 @@ struct nfsd4_link {
struct nfsd4_lock_denied {
clientid_t ld_clientid;
- struct nfs4_stateowner *ld_sop;
+ struct xdr_netobj ld_owner;
u64 ld_start;
u64 ld_length;
u32 ld_type;
@@ -165,9 +164,6 @@ struct nfsd4_lock {
} ok;
struct nfsd4_lock_denied denied;
} u;
- /* The lk_replay_owner is the open owner in the open_to_lock_owner
- * case and the lock owner otherwise: */
- struct nfs4_stateowner *lk_replay_owner;
};
#define lk_new_open_seqid v.new.open_seqid
#define lk_new_open_stateid v.new.open_stateid
@@ -188,7 +184,6 @@ struct nfsd4_lockt {
struct xdr_netobj lt_owner;
u64 lt_offset;
u64 lt_length;
- struct nfs4_stateowner * lt_stateowner;
struct nfsd4_lock_denied lt_denied;
};
@@ -199,7 +194,6 @@ struct nfsd4_locku {
stateid_t lu_stateid;
u64 lu_offset;
u64 lu_length;
- struct nfs4_stateowner *lu_stateowner;
};
@@ -232,8 +226,11 @@ struct nfsd4_open {
u32 op_recall; /* recall */
struct nfsd4_change_info op_cinfo; /* response */
u32 op_rflags; /* response */
- int op_truncate; /* used during processing */
- struct nfs4_stateowner *op_stateowner; /* used during processing */
+ bool op_truncate; /* used during processing */
+ bool op_created; /* used during processing */
+ struct nfs4_openowner *op_openowner; /* used during processing */
+ struct nfs4_file *op_file; /* used during processing */
+ struct nfs4_ol_stateid *op_stp; /* used during processing */
struct nfs4_acl *op_acl;
};
#define op_iattr iattr
@@ -243,7 +240,6 @@ struct nfsd4_open_confirm {
stateid_t oc_req_stateid /* request */;
u32 oc_seqid /* request */;
stateid_t oc_resp_stateid /* response */;
- struct nfs4_stateowner * oc_stateowner; /* response */
};
struct nfsd4_open_downgrade {
@@ -251,7 +247,6 @@ struct nfsd4_open_downgrade {
u32 od_seqid;
u32 od_share_access;
u32 od_share_deny;
- struct nfs4_stateowner *od_stateowner;
};
@@ -325,8 +320,7 @@ struct nfsd4_setattr {
struct nfsd4_setclientid {
nfs4_verifier se_verf; /* request */
- u32 se_namelen; /* request */
- char * se_name; /* request */
+ struct xdr_netobj se_name;
u32 se_callback_prog; /* request */
u32 se_callback_netid_len; /* request */
char * se_callback_netid_val; /* request */
@@ -342,6 +336,24 @@ struct nfsd4_setclientid_confirm {
nfs4_verifier sc_confirm;
};
+struct nfsd4_saved_compoundargs {
+ __be32 *p;
+ __be32 *end;
+ int pagelen;
+ struct page **pagelist;
+};
+
+struct nfsd4_test_stateid {
+ __be32 ts_num_ids;
+ struct nfsd4_compoundargs *ts_saved_args;
+ struct nfsd4_saved_compoundargs ts_savedp;
+};
+
+struct nfsd4_free_stateid {
+ stateid_t fr_stateid; /* request */
+ __be32 fr_status; /* response */
+};
+
/* also used for NVERIFY */
struct nfsd4_verify {
u32 ve_bmval[3]; /* request */
@@ -386,6 +398,10 @@ struct nfsd4_destroy_session {
struct nfs4_sessionid sessionid;
};
+struct nfsd4_destroy_clientid {
+ clientid_t clientid;
+};
+
struct nfsd4_reclaim_complete {
u32 rca_one_fs;
};
@@ -432,10 +448,14 @@ struct nfsd4_op {
struct nfsd4_destroy_session destroy_session;
struct nfsd4_sequence sequence;
struct nfsd4_reclaim_complete reclaim_complete;
+ struct nfsd4_test_stateid test_stateid;
+ struct nfsd4_free_stateid free_stateid;
} u;
struct nfs4_replay * replay;
};
+bool nfsd4_cache_this_op(struct nfsd4_op *);
+
struct nfsd4_compoundargs {
/* scratch variables for XDR decode */
__be32 * p;
@@ -458,6 +478,7 @@ struct nfsd4_compoundargs {
u32 opcnt;
struct nfsd4_op *ops;
struct nfsd4_op iops[8];
+ int cachetype;
};
struct nfsd4_compoundres {
@@ -508,6 +529,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
struct nfsd4_compoundargs *);
int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
struct nfsd4_compoundres *);
+int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
@@ -534,11 +556,13 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
extern __be32 nfsd4_destroy_session(struct svc_rqst *,
struct nfsd4_compound_state *,
struct nfsd4_destroy_session *);
+extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
struct nfsd4_open *open);
extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
struct svc_fh *current_fh, struct nfsd4_open *open);
+extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
extern __be32 nfsd4_close(struct svc_rqst *rqstp,
@@ -559,11 +583,15 @@ extern __be32
nfsd4_release_lockowner(struct svc_rqst *rqstp,
struct nfsd4_compound_state *,
struct nfsd4_release_lockowner *rlockowner);
-extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *);
+extern int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp);
extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr);
extern __be32 nfsd4_renew(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, clientid_t *clid);
+extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, struct nfsd4_test_stateid *test_stateid);
+extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid);
#endif
/*
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b2e3ff3..090d8ce 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,6 +31,8 @@
#include "alloc.h"
#include "dat.h"
+static void __nilfs_btree_init(struct nilfs_bmap *bmap);
+
static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
{
struct nilfs_btree_path *path;
@@ -368,6 +370,34 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
return ret;
}
+/**
+ * nilfs_btree_root_broken - verify consistency of btree root node
+ * @node: btree root node to be examined
+ * @ino: inode number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
+ unsigned long ino)
+{
+ int level, flags, nchildren;
+ int ret = 0;
+
+ level = nilfs_btree_node_get_level(node);
+ flags = nilfs_btree_node_get_flags(node);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+
+ if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+ level >= NILFS_BTREE_LEVEL_MAX ||
+ nchildren < 0 ||
+ nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
+ pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
+ ino, level, flags, nchildren);
+ ret = 1;
+ }
+ return ret;
+}
+
int nilfs_btree_broken_node_block(struct buffer_head *bh)
{
int ret;
@@ -1713,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
/* convert and insert */
dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
- nilfs_btree_init(btree);
+ __nilfs_btree_init(btree);
if (nreq != NULL) {
nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
@@ -2294,12 +2324,23 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
.bop_gather_data = NULL,
};
-int nilfs_btree_init(struct nilfs_bmap *bmap)
+static void __nilfs_btree_init(struct nilfs_bmap *bmap)
{
bmap->b_ops = &nilfs_btree_ops;
bmap->b_nchildren_per_block =
NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
- return 0;
+}
+
+int nilfs_btree_init(struct nilfs_bmap *bmap)
+{
+ int ret = 0;
+
+ __nilfs_btree_init(bmap);
+
+ if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
+ bmap->b_inode->i_ino))
+ ret = -EIO;
+ return ret;
}
void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index d7eeca6..2660152 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
#include "nilfs.h"
#include "segment.h"
-int nilfs_sync_file(struct file *file, int datasync)
+int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
/*
* Called from fsync() system call
@@ -40,8 +40,15 @@ int nilfs_sync_file(struct file *file, int datasync)
struct inode *inode = file->f_mapping->host;
int err;
- if (!nilfs_inode_dirty(inode))
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ return err;
+ mutex_lock(&inode->i_mutex);
+
+ if (!nilfs_inode_dirty(inode)) {
+ mutex_unlock(&inode->i_mutex);
return 0;
+ }
if (datasync)
err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0,
@@ -49,6 +56,7 @@ int nilfs_sync_file(struct file *file, int datasync)
else
err = nilfs_construct_segment(inode->i_sb);
+ mutex_unlock(&inode->i_mutex);
return err;
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 373cd7b..b2d8a96 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -24,6 +24,7 @@
#include <linux/buffer_head.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
+#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/uio.h>
#include "nilfs.h"
@@ -195,10 +196,10 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
static int nilfs_set_page_dirty(struct page *page)
{
+ struct inode *inode = page->mapping->host;
int ret = __set_page_dirty_nobuffers(page);
if (page_has_buffers(page)) {
- struct inode *inode = page->mapping->host;
unsigned nr_dirty = 0;
struct buffer_head *bh, *head;
@@ -221,6 +222,10 @@ static int nilfs_set_page_dirty(struct page *page)
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
+ } else if (ret) {
+ unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ nilfs_set_file_dirty(inode, nr_dirty);
}
return ret;
}
@@ -278,8 +283,8 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
return 0;
/* Needs synchronization with the cleaner */
- size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, nilfs_get_block, NULL);
+ size = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ nilfs_get_block);
/*
* In case of error extending write may have instantiated a few
@@ -373,7 +378,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
failed_acl:
failed_bmap:
- inode->i_nlink = 0;
+ clear_nlink(inode);
iput(inode); /* raw_inode will be deleted through
generic_delete_inode() */
goto failed;
@@ -415,7 +420,7 @@ int nilfs_read_inode_common(struct inode *inode,
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le64_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
@@ -797,6 +802,8 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
+ inode_dio_wait(inode);
+
err = vmtruncate(inode, iattr->ia_size);
if (unlikely(err))
goto out_err;
@@ -818,14 +825,14 @@ out_err:
return err;
}
-int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
+int nilfs_permission(struct inode *inode, int mask)
{
struct nilfs_root *root = NILFS_I(inode)->i_root;
if ((mask & MAY_WRITE) && root &&
root->cno != NILFS_CPTREE_CURRENT_CNO)
return -EROFS; /* snapshot is not writable */
- return generic_permission(inode, mask, flags, NULL);
+ return generic_permission(inode, mask);
}
int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index cee648e..2b5e695 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -625,6 +625,9 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment)
goto out_free;
+ if (argv[n].v_nmembs >= UINT_MAX / argv[n].v_size)
+ goto out_free;
+
len = argv[n].v_size * argv[n].v_nmembs;
base = (void __user *)(unsigned long)argv[n].v_base;
if (len == 0) {
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 546849b..768982d 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -72,12 +72,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
return ERR_PTR(-ENAMETOOLONG);
ino = nilfs_inode_by_name(dir, &dentry->d_name);
- inode = NULL;
- if (ino) {
- inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- }
+ inode = ino ? nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL;
return d_splice_alias(inode, dentry);
}
@@ -294,7 +289,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
nilfs_warning(inode->i_sb, __func__,
"deleting nonexistent file (%lu), %d\n",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
err = nilfs_delete_entry(de, page);
if (err)
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f02b9ad..7a2e7b8 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -128,7 +128,6 @@ enum {
* @ti_save: Backup of journal_info field of task_struct
* @ti_flags: Flags
* @ti_count: Nest level
- * @ti_garbage: List of inode to be put when releasing semaphore
*/
struct nilfs_transaction_info {
u32 ti_magic;
@@ -137,7 +136,6 @@ struct nilfs_transaction_info {
one of other filesystems has a bug. */
unsigned short ti_flags;
unsigned short ti_count;
- struct list_head ti_garbage;
};
/* ti_magic */
@@ -235,7 +233,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
struct page *, struct inode *);
/* file.c */
-extern int nilfs_sync_file(struct file *, int);
+extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
/* ioctl.c */
long nilfs_ioctl(struct file *, unsigned int, unsigned long);
@@ -264,7 +262,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
extern void nilfs_truncate(struct inode *);
extern void nilfs_evict_inode(struct inode *);
extern int nilfs_setattr(struct dentry *, struct iattr *);
-int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
+int nilfs_permission(struct inode *inode, int mask);
int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
extern int nilfs_inode_dirty(struct inode *);
int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
@@ -276,10 +274,10 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
/* super.c */
extern struct inode *nilfs_alloc_inode(struct super_block *);
extern void nilfs_destroy_inode(struct inode *);
-extern void nilfs_error(struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
-extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
+extern __printf(3, 4)
+void nilfs_error(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void nilfs_warning(struct super_block *, const char *, const char *, ...);
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
extern int nilfs_store_magic_and_option(struct super_block *,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 65221a0..16eacec 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -94,6 +94,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
clear_buffer_nilfs_volatile(bh);
clear_buffer_nilfs_checked(bh);
clear_buffer_nilfs_redirected(bh);
+ clear_buffer_async_write(bh);
clear_buffer_dirty(bh);
if (nilfs_page_buffers_clean(page))
__nilfs_clear_page_dirty(page);
@@ -390,6 +391,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
bh = head = page_buffers(page);
do {
lock_buffer(bh);
+ clear_buffer_async_write(bh);
clear_buffer_dirty(bh);
clear_buffer_nilfs_volatile(bh);
clear_buffer_nilfs_checked(bh);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6f24e67..6bba106 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -302,7 +302,6 @@ static void nilfs_transaction_lock(struct super_block *sb,
ti->ti_count = 0;
ti->ti_save = cur_ti;
ti->ti_magic = NILFS_TI_MAGIC;
- INIT_LIST_HEAD(&ti->ti_garbage);
current->journal_info = ti;
for (;;) {
@@ -329,8 +328,6 @@ static void nilfs_transaction_unlock(struct super_block *sb)
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
- if (!list_empty(&ti->ti_garbage))
- nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
}
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -662,7 +659,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
bh = head = page_buffers(page);
do {
- if (!buffer_dirty(bh))
+ if (!buffer_dirty(bh) || buffer_async_write(bh))
continue;
get_bh(bh);
list_add_tail(&bh->b_assoc_buffers, listp);
@@ -696,7 +693,8 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
for (i = 0; i < pagevec_count(&pvec); i++) {
bh = head = page_buffers(pvec.pages[i]);
do {
- if (buffer_dirty(bh)) {
+ if (buffer_dirty(bh) &&
+ !buffer_async_write(bh)) {
get_bh(bh);
list_add_tail(&bh->b_assoc_buffers,
listp);
@@ -742,6 +740,15 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
}
}
+static void nilfs_iput_work_func(struct work_struct *work)
+{
+ struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
+ sc_iput_work);
+ struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+
+ nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
+}
+
static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
struct nilfs_root *root)
{
@@ -1436,17 +1443,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
nilfs_clear_logs(&sci->sc_segbufs);
- err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
- if (unlikely(err))
- return err;
-
if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
sci->sc_freesegs,
sci->sc_nfreesegs,
NULL);
WARN_ON(err); /* do not happen */
+ sci->sc_stage.flags &= ~NILFS_CF_SUFREED;
}
+
+ err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+ if (unlikely(err))
+ return err;
+
nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
sci->sc_stage = prev_stage;
}
@@ -1576,6 +1585,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
+ set_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page) {
lock_page(bd_page);
@@ -1589,6 +1599,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
+ set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
lock_page(bd_page);
@@ -1674,6 +1685,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(segbuf, logs, sb_list) {
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
+ clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1683,6 +1695,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
+ clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
@@ -1752,6 +1765,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
+ clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1773,6 +1787,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
+ clear_buffer_async_write(bh);
clear_buffer_delay(bh);
clear_buffer_nilfs_volatile(bh);
clear_buffer_nilfs_redirected(bh);
@@ -1887,8 +1902,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
- struct nilfs_transaction_info *ti = current->journal_info;
struct nilfs_inode_info *ii, *n;
+ int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
+ int defer_iput = false;
spin_lock(&nilfs->ns_inode_lock);
list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
@@ -1899,9 +1915,24 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
clear_bit(NILFS_I_BUSY, &ii->i_state);
brelse(ii->i_bh);
ii->i_bh = NULL;
- list_move_tail(&ii->i_dirty, &ti->ti_garbage);
+ list_del_init(&ii->i_dirty);
+ if (!ii->vfs_inode.i_nlink || during_mount) {
+ /*
+ * Defer calling iput() to avoid deadlocks if
+ * i_nlink == 0 or mount is not yet finished.
+ */
+ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
+ defer_iput = true;
+ } else {
+ spin_unlock(&nilfs->ns_inode_lock);
+ iput(&ii->vfs_inode);
+ spin_lock(&nilfs->ns_inode_lock);
+ }
}
spin_unlock(&nilfs->ns_inode_lock);
+
+ if (defer_iput)
+ schedule_work(&sci->sc_iput_work);
}
/*
@@ -2568,6 +2599,8 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
INIT_LIST_HEAD(&sci->sc_segbufs);
INIT_LIST_HEAD(&sci->sc_write_logs);
INIT_LIST_HEAD(&sci->sc_gc_inodes);
+ INIT_LIST_HEAD(&sci->sc_iput_queue);
+ INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
init_timer(&sci->sc_timer);
sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2594,6 +2627,8 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
nilfs_transaction_unlock(sci->sc_super);
+ flush_work(&sci->sc_iput_work);
+
} while (ret && retrycount-- > 0);
}
@@ -2618,6 +2653,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
|| sci->sc_seq_request != sci->sc_seq_done);
spin_unlock(&sci->sc_state_lock);
+ if (flush_work(&sci->sc_iput_work))
+ flag = true;
+
if (flag || !nilfs_segctor_confirm(sci))
nilfs_segctor_write_out(sci);
@@ -2627,6 +2665,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
+ if (!list_empty(&sci->sc_iput_queue)) {
+ nilfs_warning(sci->sc_super, __func__,
+ "iput queue is not empty\n");
+ nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
+ }
+
WARN_ON(!list_empty(&sci->sc_segbufs));
WARN_ON(!list_empty(&sci->sc_write_logs));
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 38a1d00..a48d6de 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -26,6 +26,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
#include <linux/nilfs2_fs.h>
#include "nilfs.h"
@@ -92,6 +93,8 @@ struct nilfs_segsum_pointer {
* @sc_nblk_inc: Block count of current generation
* @sc_dirty_files: List of files to be written
* @sc_gc_inodes: List of GC inodes having blocks to be written
+ * @sc_iput_queue: list of inodes for which iput should be done
+ * @sc_iput_work: work struct to defer iput call
* @sc_freesegs: array of segment numbers to be freed
* @sc_nfreesegs: number of segments on @sc_freesegs
* @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -135,6 +138,8 @@ struct nilfs_sc_info {
struct list_head sc_dirty_files;
struct list_head sc_gc_inodes;
+ struct list_head sc_iput_queue;
+ struct work_struct sc_iput_work;
__u64 *sc_freesegs;
size_t sc_nfreesegs;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e913ad1..a721907 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -247,7 +247,7 @@ static int ocfs2_set_acl(handle_t *handle,
case ACL_TYPE_ACCESS:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
ret = posix_acl_equiv_mode(acl, &mode);
if (ret < 0)
return ret;
@@ -290,47 +290,32 @@ static int ocfs2_set_acl(handle_t *handle,
return ret;
}
-int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
{
struct ocfs2_super *osb;
struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
int ret = -EAGAIN;
- if (flags & IPERM_FLAG_RCU)
- return -ECHILD;
-
osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return ret;
+ return NULL;
ret = ocfs2_read_inode_block(inode, &di_bh);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
+ if (ret < 0)
+ return ERR_PTR(ret);
- acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
+ acl = ocfs2_get_acl_nolock(inode, type, di_bh);
brelse(di_bh);
- if (IS_ERR(acl)) {
- mlog_errno(PTR_ERR(acl));
- return PTR_ERR(acl);
- }
- if (acl) {
- ret = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- return ret;
- }
-
- return -EAGAIN;
+ return acl;
}
int ocfs2_acl_chmod(struct inode *inode)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl, *clone;
+ struct posix_acl *acl;
int ret;
if (S_ISLNK(inode->i_mode))
@@ -342,15 +327,12 @@ int ocfs2_acl_chmod(struct inode *inode)
acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl) || !acl)
return PTR_ERR(acl);
- clone = posix_acl_clone(acl, GFP_KERNEL);
+ ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ if (ret)
+ return ret;
+ ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+ acl, NULL, NULL);
posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
- ret = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!ret)
- ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
- clone, NULL, NULL);
- posix_acl_release(clone);
return ret;
}
@@ -369,7 +351,7 @@ int ocfs2_init_acl(handle_t *handle,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct posix_acl *acl = NULL;
int ret = 0, ret2;
- mode_t mode;
+ umode_t mode;
if (!S_ISLNK(inode->i_mode)) {
if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -388,8 +370,6 @@ int ocfs2_init_acl(handle_t *handle,
}
}
if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
- struct posix_acl *clone;
-
if (S_ISDIR(inode->i_mode)) {
ret = ocfs2_set_acl(handle, inode, di_bh,
ACL_TYPE_DEFAULT, acl,
@@ -397,27 +377,22 @@ int ocfs2_init_acl(handle_t *handle,
if (ret)
goto cleanup;
}
- clone = posix_acl_clone(acl, GFP_NOFS);
- ret = -ENOMEM;
- if (!clone)
- goto cleanup;
-
mode = inode->i_mode;
- ret = posix_acl_create_masq(clone, &mode);
- if (ret >= 0) {
- ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
- if (ret2) {
- mlog_errno(ret2);
- ret = ret2;
- goto cleanup;
- }
- if (ret > 0) {
- ret = ocfs2_set_acl(handle, inode,
- di_bh, ACL_TYPE_ACCESS,
- clone, meta_ac, data_ac);
- }
+ ret = posix_acl_create(&acl, GFP_NOFS, &mode);
+ if (ret < 0)
+ return ret;
+
+ ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret2) {
+ mlog_errno(ret2);
+ ret = ret2;
+ goto cleanup;
+ }
+ if (ret > 0) {
+ ret = ocfs2_set_acl(handle, inode,
+ di_bh, ACL_TYPE_ACCESS,
+ acl, meta_ac, data_ac);
}
- posix_acl_release(clone);
}
cleanup:
posix_acl_release(acl);
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 4fe7c9c..071fbd38 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ struct ocfs2_acl_entry {
__le32 e_id;
};
-extern int ocfs2_check_acl(struct inode *, int, unsigned int);
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
extern int ocfs2_acl_chmod(struct inode *);
extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 76c8165..31b9463 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_commit;
}
dquot_free_space_nodirty(inode,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ac97bca..16653b2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)
}
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ /*
+ * Unlock the page and cycle ip_alloc_sem so that we don't
+ * busyloop waiting for ip_alloc_sem to unlock
+ */
ret = AOP_TRUNCATED_PAGE;
+ unlock_page(page);
+ unlock = 0;
+ down_read(&oi->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
goto out_inode_unlock;
}
@@ -551,9 +559,8 @@ bail:
/*
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. Like the core uses
- * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
- * truncation on another.
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
*/
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
@@ -564,13 +571,21 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
int level;
+ wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
- if (ocfs2_iocb_is_sem_locked(iocb)) {
- up_read(&inode->i_alloc_sem);
+ if (ocfs2_iocb_is_sem_locked(iocb))
ocfs2_iocb_clear_sem_locked(iocb);
+
+ if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+
+ if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
+ waitqueue_active(wq)) {
+ wake_up_all(wq);
+ }
}
ocfs2_iocb_clear_rw_locked(iocb);
@@ -578,6 +593,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
level = ocfs2_iocb_rw_locked_level(iocb);
ocfs2_rw_unlock(inode, level);
+ inode_dio_done(inode);
if (is_async)
aio_complete(iocb, ret, 0);
}
@@ -865,6 +881,12 @@ struct ocfs2_write_ctxt {
struct page *w_target_page;
/*
+ * w_target_locked is used for page_mkwrite path indicating no unlocking
+ * against w_target_page in ocfs2_write_end_nolock.
+ */
+ unsigned int w_target_locked:1;
+
+ /*
* ocfs2_write_end() uses this to know what the real range to
* write in the target should be.
*/
@@ -895,10 +917,32 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
}
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
{
+ int i;
+
+ /*
+ * w_target_locked is only set to true in the page_mkwrite() case.
+ * The intent is to allow us to lock the target page from write_begin()
+ * to write_end(). The caller must hold a ref on w_target_page.
+ */
+ if (wc->w_target_locked) {
+ BUG_ON(!wc->w_target_page);
+ for (i = 0; i < wc->w_num_pages; i++) {
+ if (wc->w_target_page == wc->w_pages[i]) {
+ wc->w_pages[i] = NULL;
+ break;
+ }
+ }
+ mark_page_accessed(wc->w_target_page);
+ page_cache_release(wc->w_target_page);
+ }
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
+}
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+ ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh);
kfree(wc);
}
@@ -1134,20 +1178,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
*/
lock_page(mmap_page);
+ /* Exit and let the caller retry */
if (mmap_page->mapping != mapping) {
+ WARN_ON(mmap_page->mapping);
unlock_page(mmap_page);
- /*
- * Sanity check - the locking in
- * ocfs2_pagemkwrite() should ensure
- * that this code doesn't trigger.
- */
- ret = -EINVAL;
- mlog_errno(ret);
+ ret = -EAGAIN;
goto out;
}
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
+ wc->w_target_locked = true;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1162,6 +1203,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
wc->w_target_page = wc->w_pages[i];
}
out:
+ if (ret)
+ wc->w_target_locked = false;
return ret;
}
@@ -1819,11 +1862,23 @@ try_again:
*/
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
- if (ret) {
+ if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out_quota;
}
+ /*
+ * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+ * the target page. In this case, we exit with no error and no target
+ * page. This will trigger the caller, page_mkwrite(), to re-try
+ * the operation.
+ */
+ if (ret == -EAGAIN) {
+ BUG_ON(wc->w_target_page);
+ ret = 0;
+ goto out_quota;
+ }
+
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
len);
if (ret) {
@@ -2008,11 +2063,19 @@ out_write_size:
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
ocfs2_journal_dirty(handle, wc->w_di_bh);
+ /* unlock pages before dealloc since it needs acquiring j_trans_barrier
+ * lock, or it will cause a deadlock since journal commit threads holds
+ * this lock and will ask for the page lock when flushing the data.
+ * put it here to preserve the unlock order.
+ */
+ ocfs2_unlock_pages(wc);
+
ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
- ocfs2_free_write_ctxt(wc);
+ brelse(wc->w_di_bh);
+ kfree(wc);
return copied;
}
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 75cf3ad..ffb2da3 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL,
OCFS2_IOCB_SEM,
+ OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS
};
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
#define ocfs2_iocb_is_sem_locked(iocb) \
test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+
+#define ocfs2_iocb_set_unaligned_aio(iocb) \
+ set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_unaligned_aio(iocb) \
+ clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_unaligned_aio(iocb) \
+ test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+
+#define OCFS2_IOEND_WQ_HASH_SZ 37
+#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
+ OCFS2_IOEND_WQ_HASH_SZ])
+extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 5d18ad1..4f66e00 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
- put_bh(bh);
mlog_errno(ret);
}
@@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
if (!buffer_uptodate(bh)) {
ret = -EIO;
- put_bh(bh);
mlog_errno(ret);
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9a3e6bb..a4e855e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,6 +216,7 @@ struct o2hb_region {
struct list_head hr_all_item;
unsigned hr_unclean_stop:1,
+ hr_aborted_start:1,
hr_item_pinned:1,
hr_item_dropped:1;
@@ -254,6 +255,10 @@ struct o2hb_region {
* a more complete api that doesn't lead to this sort of fragility. */
atomic_t hr_steady_iterations;
+ /* terminate o2hb thread if it does not reach steady state
+ * (hr_steady_iterations == 0) within hr_unsteady_iterations */
+ atomic_t hr_unsteady_iterations;
+
char hr_dev_name[BDEVNAME_SIZE];
unsigned int hr_timeout_ms;
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work)
static void o2hb_arm_write_timeout(struct o2hb_region *reg)
{
+ /* Arm writeout only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
+ return;
+
mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
O2HB_MAX_WRITE_TIMEOUT_MS);
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
return read == computed;
}
-/* We want to make sure that nobody is heartbeating on top of us --
- * this will help detect an invalid configuration. */
-static void o2hb_check_last_timestamp(struct o2hb_region *reg)
+/*
+ * Compare the slot data with what we wrote in the last iteration.
+ * If the match fails, print an appropriate error message. This is to
+ * detect errors like... another node hearting on the same slot,
+ * flaky device that is losing writes, etc.
+ * Returns 1 if check succeeds, 0 otherwise.
+ */
+static int o2hb_check_own_slot(struct o2hb_region *reg)
{
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
slot = &reg->hr_slots[o2nm_this_node()];
/* Don't check on our 1st timestamp */
if (!slot->ds_last_time)
- return;
+ return 0;
hb_block = slot->ds_raw_block;
if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
hb_block->hb_node == slot->ds_node_num)
- return;
+ return 1;
#define ERRSTR1 "Another node is heartbeating on device"
#define ERRSTR2 "Heartbeat generation mismatch on device"
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
(unsigned long long)slot->ds_last_time, hb_block->hb_node,
(unsigned long long)le64_to_cpu(hb_block->hb_generation),
(unsigned long long)le64_to_cpu(hb_block->hb_seq));
+
+ return 0;
}
static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
o2nm_node_put(node);
}
-static void o2hb_set_quorum_device(struct o2hb_region *reg,
- struct o2hb_disk_slot *slot)
+static void o2hb_set_quorum_device(struct o2hb_region *reg)
{
- assert_spin_locked(&o2hb_live_lock);
-
if (!o2hb_global_heartbeat_active())
return;
- if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ /* Prevent race with o2hb_heartbeat_group_drop_item() */
+ if (kthread_should_stop())
+ return;
+
+ /* Tag region as quorum only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
return;
+ spin_lock(&o2hb_live_lock);
+
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ goto unlock;
+
/*
* A region can be added to the quorum only when it sees all
* live nodes heartbeat on it. In other words, the region has been
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
*/
if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
sizeof(o2hb_live_node_bitmap)))
- return;
-
- if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
- return;
+ goto unlock;
- printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
- config_item_name(&reg->hr_item));
+ printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
o2hb_region_unpin(NULL);
+unlock:
+ spin_unlock(&o2hb_live_lock);
}
static int o2hb_check_slot(struct o2hb_region *reg,
@@ -925,8 +947,6 @@ fire_callbacks:
slot->ds_equal_samples = 0;
}
out:
- o2hb_set_quorum_device(reg, slot);
-
spin_unlock(&o2hb_live_lock);
o2hb_run_event_list(&event);
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes,
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
- int i, ret, highest_node, change = 0;
+ int i, ret, highest_node;
+ int membership_change = 0, own_slot_ok = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct o2hb_bio_wait_ctxt write_wc;
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
sizeof(configured_nodes));
if (ret) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
/*
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
if (highest_node >= O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
- return -EINVAL;
+ mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
+ ret = -EINVAL;
+ goto bail;
}
/* No sense in reading the slots of nodes that don't exist
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
ret = o2hb_read_slots(reg, highest_node + 1);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
/* With an up to date view of the slots, we can check that no
* other node has been improperly configured to heartbeat in
* our slot. */
- o2hb_check_last_timestamp(reg);
+ own_slot_ok = o2hb_check_own_slot(reg);
/* fill in the proper info for our next heartbeat */
o2hb_prepare_block(reg, reg->hr_generation);
- /* And fire off the write. Note that we don't wait on this I/O
- * until later. */
ret = o2hb_issue_node_write(reg, &write_wc);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
i = -1;
while((i = find_next_bit(configured_nodes,
O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
- change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+ membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
}
/*
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* disk */
mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
write_wc.wc_error, reg->hr_dev_name);
- return write_wc.wc_error;
+ ret = write_wc.wc_error;
+ goto bail;
}
- o2hb_arm_write_timeout(reg);
+ /* Skip disarming the timeout if own slot has stale/bad data */
+ if (own_slot_ok) {
+ o2hb_set_quorum_device(reg);
+ o2hb_arm_write_timeout(reg);
+ }
+bail:
/* let the person who launched us know when things are steady */
- if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
- if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (!ret && own_slot_ok && !membership_change) {
+ if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ wake_up(&o2hb_steady_queue);
+ }
+ }
+
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
+ printk(KERN_NOTICE "o2hb: Unable to stabilize "
+ "heartbeart on region %s (%s)\n",
+ config_item_name(&reg->hr_item),
+ reg->hr_dev_name);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
+ ret = -EIO;
+ }
}
- return 0;
+ return ret;
}
/* Subtract b from a, storing the result in a. a *must* have a larger
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data)
/* Pin node */
o2nm_depend_this_node();
- while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+ while (!kthread_should_stop() &&
+ !reg->hr_unclean_stop && !reg->hr_aborted_start) {
/* We track the time spent inside
* o2hb_do_disk_heartbeat so that we avoid more than
* hr_timeout_ms between disk writes. On busy systems
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data)
* likely to time itself out. */
do_gettimeofday(&before_hb);
- i = 0;
- do {
- ret = o2hb_do_disk_heartbeat(reg);
- } while (ret && ++i < 2);
+ ret = o2hb_do_disk_heartbeat(reg);
do_gettimeofday(&after_hb);
elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data)
after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
elapsed_msec);
- if (elapsed_msec < reg->hr_timeout_ms) {
+ if (!kthread_should_stop() &&
+ elapsed_msec < reg->hr_timeout_ms) {
/* the kthread api has blocked signals for us so no
* need to record the return value. */
msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data)
* to timeout on this region when we could just as easily
* write a clear generation - thus indicating to them that
* this node has left this region.
- *
- * XXX: Should we skip this on unclean_stop? */
- o2hb_prepare_block(reg, 0);
- ret = o2hb_issue_node_write(reg, &write_wc);
- if (ret == 0) {
- o2hb_wait_on_io(reg, &write_wc);
- } else {
- mlog_errno(ret);
+ */
+ if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
+ o2hb_prepare_block(reg, 0);
+ ret = o2hb_issue_node_write(reg, &write_wc);
+ if (ret == 0)
+ o2hb_wait_on_io(reg, &write_wc);
+ else
+ mlog_errno(ret);
}
/* Unpin node */
o2nm_undepend_this_node();
- mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+ mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
return 0;
}
@@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
struct o2hb_debug_buf *db = inode->i_private;
struct o2hb_region *reg;
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long lts;
char *buf = NULL;
int i = -1;
int out = 0;
@@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
reg = (struct o2hb_region *)db->db_data;
- out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
- jiffies_to_msecs(jiffies -
- reg->hr_last_timeout_start));
+ lts = reg->hr_last_timeout_start;
+ /* If 0, it has never been set before */
+ if (lts)
+ lts = jiffies_to_msecs(jiffies - lts);
+ out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
goto done;
case O2HB_DB_TYPE_REGION_PINNED:
@@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
+ mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
+
if (reg->hr_tmp_block)
kfree(reg->hr_tmp_block);
@@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
live_threshold <<= 1;
spin_unlock(&o2hb_live_lock);
}
- atomic_set(&reg->hr_steady_iterations, live_threshold + 1);
+ ++live_threshold;
+ atomic_set(&reg->hr_steady_iterations, live_threshold);
+ /* unsteady_iterations is double the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
@@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = wait_event_interruptible(o2hb_steady_queue,
atomic_read(&reg->hr_steady_iterations) == 0);
if (ret) {
- /* We got interrupted (hello ptrace!). Clean up */
- spin_lock(&o2hb_live_lock);
- hb_task = reg->hr_task;
- reg->hr_task = NULL;
- spin_unlock(&o2hb_live_lock);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
+ }
- if (hb_task)
- kthread_stop(hb_task);
+ if (reg->hr_aborted_start) {
+ ret = -EIO;
goto out;
}
@@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = -EIO;
if (hb_task && o2hb_global_heartbeat_active())
- printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
- config_item_name(&reg->hr_item));
+ printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
out:
if (filp)
@@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
/* stop the thread when the user removes the region dir */
spin_lock(&o2hb_live_lock);
- if (o2hb_global_heartbeat_active()) {
- clear_bit(reg->hr_region_num, o2hb_region_bitmap);
- clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
- if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
- quorum_region = 1;
- clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
- }
hb_task = reg->hr_task;
reg->hr_task = NULL;
reg->hr_item_dropped = 1;
@@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
if (hb_task)
kthread_stop(hb_task);
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ quorum_region = 1;
+ clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
+ ((atomic_read(&reg->hr_steady_iterations) == 0) ?
+ "stopped" : "start aborted"), config_item_name(item),
+ reg->hr_dev_name);
+ }
+
/*
* If we're racing a dev_write(), we need to wake them. They will
* check reg->hr_task
*/
if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ reg->hr_aborted_start = 1;
atomic_set(&reg->hr_steady_iterations, 0);
wake_up(&o2hb_steady_queue);
}
- if (o2hb_global_heartbeat_active())
- printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
- config_item_name(&reg->hr_item));
-
config_item_put(item);
if (!o2hb_global_heartbeat_active() || !quorum_region)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 3a58359..dc45deb 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -47,6 +47,7 @@
#define SC_DEBUG_NAME "sock_containers"
#define NST_DEBUG_NAME "send_tracking"
#define STATS_DEBUG_NAME "stats"
+#define NODES_DEBUG_NAME "connected_nodes"
#define SHOW_SOCK_CONTAINERS 0
#define SHOW_SOCK_STATS 1
@@ -55,6 +56,7 @@ static struct dentry *o2net_dentry;
static struct dentry *sc_dentry;
static struct dentry *nst_dentry;
static struct dentry *stats_dentry;
+static struct dentry *nodes_dentry;
static DEFINE_SPINLOCK(o2net_debug_lock);
@@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = {
.release = sc_fop_release,
};
-int o2net_debugfs_init(void)
+static int o2net_fill_bitmap(char *buf, int len)
{
- o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
- if (!o2net_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int i = -1, out = 0;
- nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &nst_seq_fops);
- if (!nst_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ o2net_fill_node_map(map, sizeof(map));
- sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &sc_seq_fops);
- if (!sc_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += snprintf(buf + out, PAGE_SIZE - out, "\n");
- stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &stats_seq_fops);
- if (!stats_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ return out;
+}
+
+static int nodes_fop_open(struct inode *inode, struct file *file)
+{
+ char *buf;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
+
+ file->private_data = buf;
return 0;
-bail:
- debugfs_remove(stats_dentry);
- debugfs_remove(sc_dentry);
- debugfs_remove(nst_dentry);
- debugfs_remove(o2net_dentry);
- return -ENOMEM;
}
+static int o2net_debug_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t o2net_debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
+}
+
+static const struct file_operations nodes_fops = {
+ .open = nodes_fop_open,
+ .release = o2net_debug_release,
+ .read = o2net_debug_read,
+ .llseek = generic_file_llseek,
+};
+
void o2net_debugfs_exit(void)
{
+ debugfs_remove(nodes_dentry);
debugfs_remove(stats_dentry);
debugfs_remove(sc_dentry);
debugfs_remove(nst_dentry);
debugfs_remove(o2net_dentry);
}
+int o2net_debugfs_init(void)
+{
+ mode_t mode = S_IFREG|S_IRUSR;
+
+ o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
+ if (o2net_dentry)
+ nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nst_seq_fops);
+ if (nst_dentry)
+ sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &sc_seq_fops);
+ if (sc_dentry)
+ stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &stats_seq_fops);
+ if (stats_dentry)
+ nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nodes_fops);
+ if (nodes_dentry)
+ return 0;
+
+ o2net_debugfs_exit();
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+}
+
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index db5ee4b..044e7b5 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -59,6 +59,7 @@
#include <linux/idr.h>
#include <linux/kref.h>
#include <linux/net.h>
+#include <linux/export.h>
#include <net/tcp.h>
#include <asm/uaccess.h>
@@ -545,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
if (was_valid && !valid) {
- printk(KERN_NOTICE "o2net: no longer connected to "
+ printk(KERN_NOTICE "o2net: No longer connected to "
SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
o2net_complete_nodes_nsw(nn);
}
@@ -555,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
cancel_delayed_work(&nn->nn_connect_expired);
printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
o2nm_this_node() > sc->sc_node->nd_num ?
- "connected to" : "accepted connection from",
+ "Connected to" : "Accepted connection from",
SC_NODEF_ARGS(sc));
}
@@ -643,7 +644,7 @@ static void o2net_state_change(struct sock *sk)
o2net_sc_queue_work(sc, &sc->sc_connect_work);
break;
default:
- printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
+ printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
" shutdown, state %d\n",
SC_NODEF_ARGS(sc), sk->sk_state);
o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
@@ -1034,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
return ret;
}
+/* Get a map of all nodes to which this node is currently connected to */
+void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+{
+ struct o2net_sock_container *sc;
+ int node, ret;
+
+ BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+ memset(map, 0, bytes);
+ for (node = 0; node < O2NM_MAX_NODES; ++node) {
+ o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+ if (!ret) {
+ set_bit(node, map);
+ sc_put(sc);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(o2net_fill_node_map);
+
int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
size_t caller_veclen, u8 target_node, int *status)
{
@@ -1284,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
- mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
- "version %llu but %llu is required, disconnecting\n",
- SC_NODEF_ARGS(sc),
- (unsigned long long)be64_to_cpu(hand->protocol_version),
- O2NET_PROTOCOL_VERSION);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
+ "protocol version %llu but %llu is required. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ (unsigned long long)be64_to_cpu(hand->protocol_version),
+ O2NET_PROTOCOL_VERSION);
/* don't bother reconnecting if its the wrong version. */
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
@@ -1302,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
*/
if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
o2net_idle_timeout()) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2net_idle_timeout_ms),
- o2net_idle_timeout());
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
+ "idle timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_idle_timeout_ms),
+ o2net_idle_timeout());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
o2net_keepalive_delay()) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2net_keepalive_delay_ms),
- o2net_keepalive_delay());
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
+ "delay of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_keepalive_delay_ms),
+ o2net_keepalive_delay());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
O2HB_MAX_WRITE_TIMEOUT_MS) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
- O2HB_MAX_WRITE_TIMEOUT_MS);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
+ "timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
+ O2HB_MAX_WRITE_TIMEOUT_MS);
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
@@ -1539,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data)
{
struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
-
#ifdef CONFIG_DEBUG_FS
- ktime_t now = ktime_get();
+ unsigned long msecs = ktime_to_ms(ktime_get()) -
+ ktime_to_ms(sc->sc_tv_timer);
+#else
+ unsigned long msecs = o2net_idle_timeout();
#endif
- printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
- "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
- o2net_idle_timeout() / 1000,
- o2net_idle_timeout() % 1000);
-
-#ifdef CONFIG_DEBUG_FS
- mlog(ML_NOTICE, "Here are some times that might help debug the "
- "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
- "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
- (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
- (long long)ktime_to_us(sc->sc_tv_data_ready),
- (long long)ktime_to_us(sc->sc_tv_advance_start),
- (long long)ktime_to_us(sc->sc_tv_advance_stop),
- sc->sc_msg_key, sc->sc_msg_type,
- (long long)ktime_to_us(sc->sc_tv_func_start),
- (long long)ktime_to_us(sc->sc_tv_func_stop));
-#endif
+ printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
+ "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
+ msecs / 1000, msecs % 1000);
/*
* Initialize the nn_timeout so that the next connection attempt
@@ -1693,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work)
out:
if (ret) {
- mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
- "with errno %d\n", SC_NODEF_ARGS(sc), ret);
+ printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
+ " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
/* 0 err so that another will be queued and attempted
* from set_nn_state */
if (sc)
@@ -1717,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work)
spin_lock(&nn->nn_lock);
if (!nn->nn_sc_valid) {
- mlog(ML_ERROR, "no connection established with node %u after "
- "%u.%u seconds, giving up and returning errors.\n",
+ printk(KERN_NOTICE "o2net: No connection established with "
+ "node %u after %u.%u seconds, giving up.\n",
o2net_num_from_nn(nn),
o2net_idle_timeout() / 1000,
o2net_idle_timeout() % 1000);
@@ -1861,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock)
node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
if (node == NULL) {
- mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
+ "node at %pI4:%d\n", &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
if (o2nm_this_node() >= node->nd_num) {
local_node = o2nm_get_node_by_num(o2nm_this_node());
- mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
- "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
- local_node->nd_name, local_node->nd_num,
- &(local_node->nd_ipv4_address),
- ntohs(local_node->nd_ipv4_port),
- node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Unexpected connect attempt seen "
+ "at node '%s' (%u, %pI4:%d) from node '%s' (%u, "
+ "%pI4:%d)\n", local_node->nd_name, local_node->nd_num,
+ &(local_node->nd_ipv4_address),
+ ntohs(local_node->nd_ipv4_port), node->nd_name,
+ node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
@@ -1900,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock)
ret = 0;
spin_unlock(&nn->nn_lock);
if (ret) {
- mlog(ML_NOTICE, "attempt to connect from node '%s' at "
- "%pI4:%d but it already has an open connection\n",
- node->nd_name, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
+ "at %pI4:%d but it already has an open connection\n",
+ node->nd_name, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
goto out;
}
@@ -1983,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0) {
- mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
+ printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
goto out;
}
@@ -2000,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
sock->sk->sk_reuse = 1;
ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
if (ret < 0) {
- mlog(ML_ERROR, "unable to bind socket at %pI4:%u, "
- "ret=%d\n", &addr, ntohs(port), ret);
+ printk(KERN_ERR "o2net: Error %d while binding socket at "
+ "%pI4:%u\n", ret, &addr, ntohs(port));
goto out;
}
ret = sock->ops->listen(sock, 64);
- if (ret < 0) {
- mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n",
- &addr, ntohs(port), ret);
- }
+ if (ret < 0)
+ printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
+ ret, &addr, ntohs(port));
out:
if (ret) {
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index fd6179e..5bada2a 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
struct list_head *unreg_list);
void o2net_unregister_handler_list(struct list_head *list);
+void o2net_fill_node_map(unsigned long *map, unsigned bytes);
+
struct o2nm_node;
int o2net_register_hb_callbacks(void);
void o2net_unregister_hb_callbacks(void);
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index e5ba348..26977cc 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -175,7 +175,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
spin_lock(&inode->i_lock);
list_for_each(p, &inode->i_dentry) {
- dentry = list_entry(p, struct dentry, d_alias);
+ dentry = list_entry(p, struct dentry, d_u.d_alias);
spin_lock(&dentry->d_lock);
if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8582e3f..8fe4e28 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1184,8 +1184,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
if (pde)
le16_add_cpu(&pde->rec_len,
le16_to_cpu(de->rec_len));
- else
- de->inode = 0;
+ de->inode = 0;
dir->i_version++;
ocfs2_journal_dirty(handle, bh);
goto bail;
@@ -2292,7 +2291,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, di_bh);
i_size_write(inode, size);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_blocks = ocfs2_inode_sector_count(inode);
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
@@ -2354,7 +2353,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, new_bh);
i_size_write(inode, inode->i_sb->s_blocksize);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_blocks = ocfs2_inode_sector_count(inode);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d602abb..a5952ce 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
void dlm_put(struct dlm_ctxt *dlm);
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
kref_get(&res->refs);
}
void dlm_lockres_put(struct dlm_lock_resource *res);
-void __dlm_unhash_lockres(struct dlm_lock_resource *res);
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
const char *name,
unsigned int len,
@@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int namelen);
-#define dlm_lockres_set_refmap_bit(bit,res) \
- __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__)
-#define dlm_lockres_clear_refmap_bit(bit,res) \
- __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
-static inline void __dlm_lockres_set_refmap_bit(int bit,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
-{
- //printk("%s:%d:%.*s: setting bit %d\n", file, line,
- // res->lockname.len, res->lockname.name, bit);
- set_bit(bit, res->refmap);
-}
-
-static inline void __dlm_lockres_clear_refmap_bit(int bit,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
-{
- //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
- // res->lockname.len, res->lockname.name, bit);
- clear_bit(bit, res->refmap);
-}
-
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- const char *file,
- int line);
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int new_lockres,
- const char *file,
- int line);
-#define dlm_lockres_drop_inflight_ref(d,r) \
- __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref(d,r) \
- __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref_new(d,r) \
- __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 56f82cb..0e28e24 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -30,6 +30,7 @@
#include <linux/sysctl.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
+#include <linux/export.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6ed6b95..92f2ead 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing,
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
-void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- if (!hlist_unhashed(&lockres->hash_node)) {
- hlist_del_init(&lockres->hash_node);
- dlm_lockres_put(lockres);
- }
+ if (hlist_unhashed(&res->hash_node))
+ return;
+
+ mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
+ hlist_del_init(&res->hash_node);
+ dlm_lockres_put(res);
}
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res)
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
struct hlist_head *bucket;
struct qstr *q;
@@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
dlm_lockres_get(res);
hlist_add_head(&res->hash_node, bucket);
+
+ mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
}
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
@@ -539,17 +544,17 @@ again:
static void __dlm_print_nodes(struct dlm_ctxt *dlm)
{
- int node = -1;
+ int node = -1, num = 0;
assert_spin_locked(&dlm->spinlock);
- printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
-
+ printk("( ");
while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
printk("%d ", node);
+ ++num;
}
- printk("\n");
+ printk(") %u nodes\n", num);
}
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
node = exit_msg->node_idx;
- printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
-
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
clear_bit(node, dlm->exit_domain_map);
+ printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
__dlm_print_nodes(dlm);
/* notify anything attached to the heartbeat events */
@@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
dlm_mark_domain_leaving(dlm);
dlm_leave_domain(dlm);
+ printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
dlm_force_free_mles(dlm);
dlm_complete_dlm_shutdown(dlm);
}
@@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
+ printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
assert->node_idx, dlm->name);
__dlm_print_nodes(dlm);
@@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
bail:
spin_lock(&dlm->spinlock);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- if (!status)
+ if (!status) {
+ printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
__dlm_print_nodes(dlm);
+ }
spin_unlock(&dlm->spinlock);
if (ctxt) {
@@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
goto leave;
}
- if (!o2hb_check_local_node_heartbeating()) {
- mlog(ML_ERROR, "the local node has not been configured, or is "
- "not heartbeating\n");
- ret = -EPROTO;
- goto leave;
- }
-
mlog(0, "register called for domain \"%s\"\n", domain);
retry:
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 8d39e0f..975810b 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
kick_thread = 1;
}
}
- /* reduce the inflight count, this may result in the lockres
- * being purged below during calc_usage */
- if (lock->ml.node == dlm->node_num)
- dlm_lockres_drop_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
lock->ml.type, res->lockname.len,
res->lockname.name, flags);
+ /*
+ * Wait if resource is getting recovered, remastered, etc.
+ * If the resource was remastered and new owner is self, then exit.
+ */
spin_lock(&res->spinlock);
-
- /* will exit this call with spinlock held */
__dlm_wait_on_lockres(res);
+ if (res->owner == dlm->node_num) {
+ spin_unlock(&res->spinlock);
+ return DLM_RECOVERING;
+ }
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* add lock to local (secondary) queue */
@@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
sizeof(create), res->owner, &status);
if (tmpret >= 0) {
- // successfully sent and received
- ret = status; // this is already a dlm_status
+ ret = status;
if (ret == DLM_REJECTED) {
- mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres "
- "no longer owned by %u. that node is coming back "
- "up currently.\n", dlm->name, create.namelen,
+ mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
+ "owned by node %u. That node is coming back up "
+ "currently.\n", dlm->name, create.namelen,
create.name, res->owner);
dlm_print_one_lock_resource(res);
BUG();
}
} else {
- mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
- "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
- res->owner);
- if (dlm_is_host_down(tmpret)) {
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
+ "node %u\n", dlm->name, create.namelen, create.name,
+ tmpret, res->owner);
+ if (dlm_is_host_down(tmpret))
ret = DLM_RECOVERING;
- mlog(0, "node %u died so returning DLM_RECOVERING "
- "from lock message!\n", res->owner);
- } else {
+ else
ret = dlm_err_to_dlm_status(tmpret);
- }
}
return ret;
@@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
/* zero memory only if kernel-allocated */
lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
if (!lksb) {
- kfree(lock);
+ kmem_cache_free(dlm_lock_cache, lock);
return NULL;
}
kernel_allocated = 1;
@@ -718,18 +716,10 @@ retry_lock:
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
status == DLM_FORWARD) {
- mlog(0, "retrying lock with migration/"
- "recovery/in progress\n");
msleep(100);
- /* no waiting for dlm_reco_thread */
if (recovery) {
if (status != DLM_RECOVERING)
goto retry_lock;
-
- mlog(0, "%s: got RECOVERING "
- "for $RECOVERY lock, master "
- "was %u\n", dlm->name,
- res->owner);
/* wait to see the node go down, then
* drop down and allow the lockres to
* get cleaned up. need to remaster. */
@@ -741,6 +731,14 @@ retry_lock:
}
}
+ /* Inflight taken in dlm_get_lock_resource() is dropped here */
+ spin_lock(&res->spinlock);
+ dlm_lockres_drop_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
+
+ dlm_lockres_calc_usage(dlm, res);
+ dlm_kick_thread(dlm, res);
+
if (status != DLM_NORMAL) {
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 11eefb8..8e48ba5 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -631,39 +631,58 @@ error:
return NULL;
}
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int new_lockres,
- const char *file,
- int line)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
{
- if (!new_lockres)
- assert_spin_locked(&res->spinlock);
+ assert_spin_locked(&res->spinlock);
- if (!test_bit(dlm->node_num, res->refmap)) {
- BUG_ON(res->inflight_locks != 0);
- dlm_lockres_set_refmap_bit(dlm->node_num, res);
- }
+ mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ set_bit(bit, res->refmap);
+}
+
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
+{
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ clear_bit(bit, res->refmap);
+}
+
+static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
res->inflight_locks++;
- mlog(0, "%s:%.*s: inflight++: now %u\n",
- dlm->name, res->lockname.len, res->lockname.name,
- res->inflight_locks);
+
+ mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
+}
+
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ __dlm_lockres_grab_inflight_ref(dlm, res);
}
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
BUG_ON(res->inflight_locks == 0);
+
res->inflight_locks--;
- mlog(0, "%s:%.*s: inflight--: now %u\n",
- dlm->name, res->lockname.len, res->lockname.name,
- res->inflight_locks);
- if (res->inflight_locks == 0)
- dlm_lockres_clear_refmap_bit(dlm->node_num, res);
+
+ mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
+
wake_up(&res->wq);
}
@@ -697,7 +716,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
unsigned int hash;
int tries = 0;
int bit, wait_on_recovery = 0;
- int drop_inflight_if_nonlocal = 0;
BUG_ON(!lockid);
@@ -709,36 +727,46 @@ lookup:
spin_lock(&dlm->spinlock);
tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
if (tmpres) {
- int dropping_ref = 0;
-
spin_unlock(&dlm->spinlock);
-
spin_lock(&tmpres->spinlock);
- /* We wait for the other thread that is mastering the resource */
+
+ /*
+ * Right after dlm spinlock was released, dlm_thread could have
+ * purged the lockres. Check if lockres got unhashed. If so
+ * start over.
+ */
+ if (hlist_unhashed(&tmpres->hash_node)) {
+ spin_unlock(&tmpres->spinlock);
+ dlm_lockres_put(tmpres);
+ tmpres = NULL;
+ goto lookup;
+ }
+
+ /* Wait on the thread that is mastering the resource */
if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
__dlm_wait_on_lockres(tmpres);
BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+ spin_unlock(&tmpres->spinlock);
+ dlm_lockres_put(tmpres);
+ tmpres = NULL;
+ goto lookup;
}
- if (tmpres->owner == dlm->node_num) {
- BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
- dlm_lockres_grab_inflight_ref(dlm, tmpres);
- } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
- dropping_ref = 1;
- spin_unlock(&tmpres->spinlock);
-
- /* wait until done messaging the master, drop our ref to allow
- * the lockres to be purged, start over. */
- if (dropping_ref) {
- spin_lock(&tmpres->spinlock);
- __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
+ /* Wait on the resource purge to complete before continuing */
+ if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
+ BUG_ON(tmpres->owner == dlm->node_num);
+ __dlm_wait_on_lockres_flags(tmpres,
+ DLM_LOCK_RES_DROPPING_REF);
spin_unlock(&tmpres->spinlock);
dlm_lockres_put(tmpres);
tmpres = NULL;
goto lookup;
}
- mlog(0, "found in hash!\n");
+ /* Grab inflight ref to pin the resource */
+ dlm_lockres_grab_inflight_ref(dlm, tmpres);
+
+ spin_unlock(&tmpres->spinlock);
if (res)
dlm_lockres_put(res);
res = tmpres;
@@ -829,8 +857,8 @@ lookup:
* but they might own this lockres. wait on them. */
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
}
@@ -843,12 +871,9 @@ lookup:
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
- /* since this lockres is new it doesn't not require the spinlock */
- dlm_lockres_grab_inflight_ref_new(dlm, res);
- /* if this node does not become the master make sure to drop
- * this inflight reference below */
- drop_inflight_if_nonlocal = 1;
+ /* since this lockres is new it doesn't not require the spinlock */
+ __dlm_lockres_grab_inflight_ref(dlm, res);
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
@@ -864,8 +889,8 @@ redo_request:
* dlm spinlock would be detectable be a change on the mle,
* so we only need to clear out the recovery map once. */
if (dlm_is_recovery_lock(lockid, namelen)) {
- mlog(ML_NOTICE, "%s: recovery map is not empty, but "
- "must master $RECOVERY lock now\n", dlm->name);
+ mlog(0, "%s: Recovery map is not empty, but must "
+ "master $RECOVERY lock now\n", dlm->name);
if (!dlm_pre_master_reco_lockres(dlm, res))
wait_on_recovery = 0;
else {
@@ -883,8 +908,8 @@ redo_request:
spin_lock(&dlm->spinlock);
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
} else
@@ -913,8 +938,8 @@ redo_request:
* yet, keep going until it does. this is how the
* master will know that asserts are needed back to
* the lower nodes. */
- mlog(0, "%s:%.*s: requests only up to %u but master "
- "is %u, keep going\n", dlm->name, namelen,
+ mlog(0, "%s: res %.*s, Requests only up to %u but "
+ "master is %u, keep going\n", dlm->name, namelen,
lockid, nodenum, mle->master);
}
}
@@ -924,13 +949,12 @@ wait:
ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
if (ret < 0) {
wait_on_recovery = 1;
- mlog(0, "%s:%.*s: node map changed, redo the "
- "master request now, blocked=%d\n",
- dlm->name, res->lockname.len,
+ mlog(0, "%s: res %.*s, Node map changed, redo the master "
+ "request now, blocked=%d\n", dlm->name, res->lockname.len,
res->lockname.name, blocked);
if (++tries > 20) {
- mlog(ML_ERROR, "%s:%.*s: spinning on "
- "dlm_wait_for_lock_mastery, blocked=%d\n",
+ mlog(ML_ERROR, "%s: res %.*s, Spinning on "
+ "dlm_wait_for_lock_mastery, blocked = %d\n",
dlm->name, res->lockname.len,
res->lockname.name, blocked);
dlm_print_one_lock_resource(res);
@@ -940,7 +964,8 @@ wait:
goto redo_request;
}
- mlog(0, "lockres mastered by %u\n", res->owner);
+ mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
+ res->lockname.name, res->owner);
/* make sure we never continue without this */
BUG_ON(res->owner == O2NM_MAX_NODES);
@@ -952,8 +977,6 @@ wait:
wake_waiters:
spin_lock(&res->spinlock);
- if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
- dlm_lockres_drop_inflight_ref(dlm, res);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -1388,6 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
int found, ret;
int set_maybe;
int dispatch_assert = 0;
+ int dispatched = 0;
if (!dlm_grab(dlm))
return DLM_MASTER_RESP_NO;
@@ -1426,9 +1450,7 @@ way_up_top:
}
if (res->owner == dlm->node_num) {
- mlog(0, "%s:%.*s: setting bit %u in refmap\n",
- dlm->name, namelen, name, request->node_idx);
- dlm_lockres_set_refmap_bit(request->node_idx, res);
+ dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
spin_unlock(&res->spinlock);
response = DLM_MASTER_RESP_YES;
if (mle)
@@ -1493,10 +1515,8 @@ way_up_top:
* go back and clean the mles on any
* other nodes */
dispatch_assert = 1;
- dlm_lockres_set_refmap_bit(request->node_idx, res);
- mlog(0, "%s:%.*s: setting bit %u in refmap\n",
- dlm->name, namelen, name,
- request->node_idx);
+ dlm_lockres_set_refmap_bit(dlm, res,
+ request->node_idx);
} else
response = DLM_MASTER_RESP_NO;
} else {
@@ -1598,13 +1618,16 @@ send_response:
mlog(ML_ERROR, "failed to dispatch assert master work\n");
response = DLM_MASTER_RESP_ERROR;
dlm_lockres_put(res);
+ } else {
+ dispatched = 1;
}
} else {
if (res)
dlm_lockres_put(res);
}
- dlm_put(dlm);
+ if (!dispatched)
+ dlm_put(dlm);
return response;
}
@@ -1702,7 +1725,7 @@ again:
"lockres, set the bit in the refmap\n",
namelen, lockname, to);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(to, res);
+ dlm_lockres_set_refmap_bit(dlm, res, to);
spin_unlock(&res->spinlock);
}
}
@@ -2022,7 +2045,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
/* queue up work for dlm_assert_master_worker */
- dlm_grab(dlm); /* get an extra ref for the work item */
dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
item->u.am.lockres = res; /* already have a ref */
/* can optionally ignore node numbers higher than this node */
@@ -2187,8 +2209,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
namelen = res->lockname.len;
BUG_ON(namelen > O2NM_MAX_NAME_LEN);
- mlog(0, "%s:%.*s: sending deref to %d\n",
- dlm->name, namelen, lockname, res->owner);
memset(&deref, 0, sizeof(deref));
deref.node_idx = dlm->node_num;
deref.namelen = namelen;
@@ -2197,14 +2217,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
&deref, sizeof(deref), res->owner, &r);
if (ret < 0)
- mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
- "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
- res->owner);
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
+ dlm->name, namelen, lockname, ret, res->owner);
else if (r < 0) {
/* BAD. other node says I did not have a ref. */
- mlog(ML_ERROR,"while dropping ref on %s:%.*s "
- "(master=%u) got %d.\n", dlm->name, namelen,
- lockname, res->owner, r);
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, res->owner, r);
dlm_print_one_lock_resource(res);
BUG();
}
@@ -2260,7 +2278,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
else {
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
- dlm_lockres_clear_refmap_bit(node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
}
@@ -2320,7 +2338,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
- dlm_lockres_clear_refmap_bit(node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
spin_unlock(&res->spinlock);
@@ -2802,7 +2820,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
BUG_ON(!list_empty(&lock->bast_list));
BUG_ON(lock->ast_pending);
BUG_ON(lock->bast_pending);
- dlm_lockres_clear_refmap_bit(lock->ml.node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ lock->ml.node);
list_del_init(&lock->list);
dlm_lock_put(lock);
/* In a normal unlock, we would have added a
@@ -2823,7 +2842,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: node %u had a ref to this "
"migrating lockres, clearing\n", dlm->name,
res->lockname.len, res->lockname.name, bit);
- dlm_lockres_clear_refmap_bit(bit, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, bit);
}
bit++;
}
@@ -2916,9 +2935,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
&migrate, sizeof(migrate), nodenum,
&status);
if (ret < 0) {
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
- dlm->key, nodenum);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send "
+ "MIGRATE_REQUEST to node %u\n", dlm->name,
+ migrate.namelen, migrate.name, ret, nodenum);
if (!dlm_is_host_down(ret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", ret);
BUG();
@@ -2937,7 +2956,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
dlm->name, res->lockname.len, res->lockname.name,
nodenum);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(nodenum, res);
+ dlm_lockres_set_refmap_bit(dlm, res, nodenum);
spin_unlock(&res->spinlock);
}
}
@@ -3271,7 +3290,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
* mastery reference here since old_master will briefly have
* a reference after the migration completes */
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(old_master, res);
+ dlm_lockres_set_refmap_bit(dlm, res, old_master);
spin_unlock(&res->spinlock);
mlog(0, "now time to do a migrate request to other nodes\n");
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7efab6d..0e5013e 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
}
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(ML_NOTICE, "%s: waiting %dms for notification of "
- "death of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_dead(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_dead(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
- "of death of node %u\n", dlm->name, node);
+ dlm_is_node_dead(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_dead(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(0, "%s: waiting %dms for notification of "
- "recovery of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_recovered(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_recovered(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(0, "%s: waiting indefinitely for notification "
- "of recovery of node %u\n", dlm->name, node);
+ dlm_is_node_recovered(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_recovered(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
/* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
spin_lock(&dlm->spinlock);
BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+ printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
+ dlm->name, dlm->reco.dead_node);
dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
}
@@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
+ printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
wake_up(&dlm->reco.event);
}
+static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
+{
+ printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
+ "dead node %u in domain %s\n", dlm->reco.new_master,
+ (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
+ dlm->reco.dead_node, dlm->name);
+}
+
static int dlm_do_recovery(struct dlm_ctxt *dlm)
{
int status = 0;
@@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
}
mlog(0, "another node will master this recovery session.\n");
}
- mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
- dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master,
- dlm->node_num, dlm->reco.dead_node);
+
+ dlm_print_recovery_master(dlm);
/* it is safe to start everything back up here
* because all of the dead node's lock resources
@@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
return 0;
master_here:
- mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node "
- "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
- dlm->node_num, dlm->reco.dead_node, dlm->name);
+ dlm_print_recovery_master(dlm);
status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
if (status < 0) {
/* we should never hit this anymore */
- mlog(ML_ERROR, "error %d remastering locks for node %u, "
- "retrying.\n", status, dlm->reco.dead_node);
+ mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
+ "retrying.\n", dlm->name, status, dlm->reco.dead_node);
/* yield a bit to allow any final network messages
* to get handled on remaining nodes */
msleep(100);
@@ -534,7 +540,10 @@ master_here:
/* success! see if any other nodes need recovery */
mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
dlm->name, dlm->reco.dead_node, dlm->node_num);
- dlm_reset_recovery(dlm);
+ spin_lock(&dlm->spinlock);
+ __dlm_reset_recovery(dlm);
+ dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
}
dlm_end_recovery(dlm);
@@ -567,7 +576,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
- mlog(0, "requesting lock info from node %u\n",
+ mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
ndata->node_num);
if (ndata->node_num == dlm->node_num) {
@@ -640,7 +649,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
spin_unlock(&dlm_reco_state_lock);
}
- mlog(0, "done requesting all lock info\n");
+ mlog(0, "%s: Done requesting all lock info\n", dlm->name);
/* nodes should be sending reco data now
* just need to wait */
@@ -692,6 +701,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
if (all_nodes_done) {
int ret;
+ /* Set this flag on recovery master to avoid
+ * a new recovery for another dead node start
+ * before the recovery is not done. That may
+ * cause recovery hung.*/
+ spin_lock(&dlm->spinlock);
+ dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
+
/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
* just send a finalize message to everyone and
* clean up */
@@ -802,10 +819,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
/* negative status is handled by caller */
if (ret < 0)
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
- dlm->key, request_from);
-
+ mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
+ "to recover dead node %u\n", dlm->name, ret,
+ request_from, dead_node);
// return from here, then
// sleep until all received or error
return ret;
@@ -956,9 +972,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
sizeof(done_msg), send_to, &tmpret);
if (ret < 0) {
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
- dlm->key, send_to);
+ mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
+ "to recover dead node %u\n", dlm->name, ret, send_to,
+ dead_node);
if (!dlm_is_host_down(ret)) {
BUG();
}
@@ -1127,9 +1143,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
if (ret < 0) {
/* XXX: negative status is not handled.
* this will end up killing this node. */
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
- dlm->key, send_to);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
+ "node %u (%s)\n", dlm->name, mres->lockname_len,
+ mres->lockname, ret, send_to,
+ (orig_flags & DLM_MRES_MIGRATION ?
+ "migration" : "recovery"));
} else {
/* might get an -ENOMEM back here */
ret = status;
@@ -1671,6 +1689,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
unsigned int hash;
int master = DLM_LOCK_RES_OWNER_UNKNOWN;
u32 flags = DLM_ASSERT_MASTER_REQUERY;
+ int dispatched = 0;
if (!dlm_grab(dlm)) {
/* since the domain has gone away on this
@@ -1692,6 +1711,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
mlog_errno(-ENOMEM);
/* retry!? */
BUG();
+ } else {
+ dispatched = 1;
}
} else /* put.. incase we are not the master */
dlm_lockres_put(res);
@@ -1699,7 +1720,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
}
spin_unlock(&dlm->spinlock);
- dlm_put(dlm);
+ if (!dispatched)
+ dlm_put(dlm);
return master;
}
@@ -1745,13 +1767,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
struct dlm_migratable_lockres *mres)
{
struct dlm_migratable_lock *ml;
- struct list_head *queue;
+ struct list_head *queue, *iter;
struct list_head *tmpq = NULL;
struct dlm_lock *newlock = NULL;
struct dlm_lockstatus *lksb = NULL;
int ret = 0;
int i, j, bad;
- struct dlm_lock *lock = NULL;
+ struct dlm_lock *lock;
u8 from = O2NM_MAX_NODES;
unsigned int added = 0;
__be64 c;
@@ -1767,7 +1789,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
dlm->name, mres->lockname_len, mres->lockname,
from);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(from, res);
+ dlm_lockres_set_refmap_bit(dlm, res, from);
spin_unlock(&res->spinlock);
added++;
break;
@@ -1786,14 +1808,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* MIGRATION ONLY! */
BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
+ lock = NULL;
spin_lock(&res->spinlock);
for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
tmpq = dlm_list_idx_to_ptr(res, j);
- list_for_each_entry(lock, tmpq, list) {
- if (lock->ml.cookie != ml->cookie)
- lock = NULL;
- else
+ list_for_each(iter, tmpq) {
+ lock = list_entry(iter,
+ struct dlm_lock, list);
+ if (lock->ml.cookie == ml->cookie)
break;
+ lock = NULL;
}
if (lock)
break;
@@ -1965,7 +1989,7 @@ skip_lvb:
mlog(0, "%s:%.*s: added lock for node %u, "
"setting refmap bit\n", dlm->name,
res->lockname.len, res->lockname.name, ml->node);
- dlm_lockres_set_refmap_bit(ml->node, res);
+ dlm_lockres_set_refmap_bit(dlm, res, ml->node);
added++;
}
spin_unlock(&res->spinlock);
@@ -2084,6 +2108,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
if (res->owner == dead_node) {
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
list_del_init(&res->recovering);
spin_lock(&res->spinlock);
/* new_master has our reference from
@@ -2105,40 +2132,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
- if (res->state & DLM_LOCK_RES_RECOVERING) {
- if (res->owner == dead_node) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, but "
- "clearing state anyway\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else if (res->owner == dlm->node_num) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, "
- "owner is THIS node, clearing\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else
- continue;
+ if (!(res->state & DLM_LOCK_RES_RECOVERING))
+ continue;
- if (!list_empty(&res->recovering)) {
- mlog(0, "%s:%.*s: lockres was "
- "marked RECOVERING, owner=%u\n",
- dlm->name, res->lockname.len,
- res->lockname.name, res->owner);
- list_del_init(&res->recovering);
- dlm_lockres_put(res);
- }
- spin_lock(&res->spinlock);
- /* new_master has our reference from
- * the lock state sent during recovery */
- dlm_change_lockres_owner(dlm, res, new_master);
- res->state &= ~DLM_LOCK_RES_RECOVERING;
- if (__dlm_lockres_has_locks(res))
- __dlm_dirty_lockres(dlm, res);
- spin_unlock(&res->spinlock);
- wake_up(&res->wq);
+ if (res->owner != dead_node &&
+ res->owner != dlm->node_num)
+ continue;
+
+ if (!list_empty(&res->recovering)) {
+ list_del_init(&res->recovering);
+ dlm_lockres_put(res);
}
+
+ /* new_master has our reference from
+ * the lock state sent during recovery */
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
+ spin_lock(&res->spinlock);
+ dlm_change_lockres_owner(dlm, res, new_master);
+ res->state &= ~DLM_LOCK_RES_RECOVERING;
+ if (__dlm_lockres_has_locks(res))
+ __dlm_dirty_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
}
}
}
@@ -2252,12 +2269,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, freed, dead_node);
__dlm_print_one_lock_resource(res);
}
- dlm_lockres_clear_refmap_bit(dead_node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
} else if (test_bit(dead_node, res->refmap)) {
mlog(0, "%s:%.*s: dead node %u had a ref, but had "
"no locks and had not purged before dying\n", dlm->name,
res->lockname.len, res->lockname.name, dead_node);
- dlm_lockres_clear_refmap_bit(dead_node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
}
/* do not kick thread yet */
@@ -2324,9 +2341,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
- mlog(ML_NOTICE, "Ignore %.*s for "
+ mlog(ML_NOTICE, "%s: res %.*s, Skip "
"recovery as it is being freed\n",
- res->lockname.len,
+ dlm->name, res->lockname.len,
res->lockname.name);
} else
dlm_move_lockres_to_recovery_list(dlm,
@@ -2870,8 +2887,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
BUG();
}
dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ __dlm_reset_recovery(dlm);
spin_unlock(&dlm->spinlock);
- dlm_reset_recovery(dlm);
dlm_kick_recovery_thread(dlm);
break;
default:
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 1d6d1d2..e73c833 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
int bit;
+ assert_spin_locked(&res->spinlock);
+
if (__dlm_lockres_has_locks(res))
return 0;
+ /* Locks are in the process of being created */
+ if (res->inflight_locks)
+ return 0;
+
if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
return 0;
if (res->state & DLM_LOCK_RES_RECOVERING)
return 0;
+ /* Another node has this resource with this node as the master */
bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES)
return 0;
- /*
- * since the bit for dlm->node_num is not set, inflight_locks better
- * be zero
- */
- BUG_ON(res->inflight_locks != 0);
return 1;
}
@@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
/* clear our bit from the master's refmap, ignore errors */
ret = dlm_drop_lockres_ref(dlm, res);
if (ret < 0) {
- mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
- res->lockname.len, res->lockname.name, ret);
if (!dlm_is_host_down(ret))
BUG();
}
@@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
BUG();
}
- __dlm_unhash_lockres(res);
+ __dlm_unhash_lockres(dlm, res);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index ab4046f..b5e457c 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode)
mlog(0, "inode %llu take PRMODE open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- if (ocfs2_mount_local(osb))
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
goto out;
lockres = &OCFS2_I(inode)->ip_open_lockres;
@@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (write)
+ status = -EROFS;
+ goto out;
+ }
+
if (ocfs2_mount_local(osb))
goto out;
@@ -2092,7 +2098,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
inode->i_gid = be32_to_cpu(lvb->lvb_igid);
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
- inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
+ set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
ocfs2_unpack_timespec(&inode->i_atime,
be64_to_cpu(lvb->lvb_iatime_packed));
ocfs2_unpack_timespec(&inode->i_mtime,
@@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
if (ocfs2_is_hard_readonly(osb)) {
if (ex)
status = -EROFS;
- goto bail;
+ goto getbh;
}
if (ocfs2_mount_local(osb))
@@ -2356,7 +2362,7 @@ local:
mlog_errno(status);
goto bail;
}
-
+getbh:
if (ret_bh) {
status = ocfs2_assign_bh(inode, ret_bh, local_bh);
if (status < 0) {
@@ -2631,8 +2637,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
BUG_ON(!dl);
- if (ocfs2_is_hard_readonly(osb))
- return -EROFS;
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (ex)
+ return -EROFS;
+ return 0;
+ }
if (ocfs2_mount_local(osb))
return 0;
@@ -2650,7 +2659,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (!ocfs2_mount_local(osb))
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
}
@@ -3959,9 +3968,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
osb->dc_work_sequence = osb->dc_wake_sequence;
processed = osb->blocked_lock_count;
- while (processed) {
- BUG_ON(list_empty(&osb->blocked_lock_list));
-
+ /*
+ * blocked lock processing in this loop might call iput which can
+ * remove items off osb->blocked_lock_list. Downconvert up to
+ * 'processed' number of locks, but stop short if we had some
+ * removed in ocfs2_mark_lockres_freeing when downconverting.
+ */
+ while (processed && !list_empty(&osb->blocked_lock_list)) {
lockres = list_entry(osb->blocked_lock_list.next,
struct ocfs2_lock_res, l_blocked_list);
list_del_init(&lockres->l_blocked_list);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 774a032..cf22847 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -831,6 +831,102 @@ out:
return ret;
}
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+ unsigned int is_last = 0, is_data = 0;
+ u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 cpos, cend, clen, hole_size;
+ u64 extoff, extlen;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_extent_rec rec;
+
+ BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE);
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (*offset >= inode->i_size) {
+ ret = -ENXIO;
+ goto out_unlock;
+ }
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if (origin == SEEK_HOLE)
+ *offset = inode->i_size;
+ goto out_unlock;
+ }
+
+ clen = 0;
+ cpos = *offset >> cs_bits;
+ cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
+
+ while (cpos < cend && !is_last) {
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
+ &rec, &is_last);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ extoff = cpos;
+ extoff <<= cs_bits;
+
+ if (rec.e_blkno == 0ULL) {
+ clen = hole_size;
+ is_data = 0;
+ } else {
+ clen = le16_to_cpu(rec.e_leaf_clusters) -
+ (cpos - le32_to_cpu(rec.e_cpos));
+ is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
+ }
+
+ if ((!is_data && origin == SEEK_HOLE) ||
+ (is_data && origin == SEEK_DATA)) {
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
+
+ if (!is_last)
+ cpos += clen;
+ }
+
+ if (origin == SEEK_HOLE) {
+ extoff = cpos;
+ extoff <<= cs_bits;
+ extlen = clen;
+ extlen <<= cs_bits;
+
+ if ((extoff + extlen) > inode->i_size)
+ extlen = inode->i_size - extoff;
+ extoff += extlen;
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
+
+ ret = -ENXIO;
+
+out_unlock:
+
+ brelse(di_bh);
+
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ocfs2_inode_unlock(inode, 0);
+out:
+ if (ret && ret != -ENXIO)
+ ret = -ENXIO;
+ return ret;
+}
+
int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index e79d41c..67ea57d 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len);
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
+
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b1e35a3..6a7a3d9 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -171,7 +171,8 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
return 0;
}
-static int ocfs2_sync_file(struct file *file, int datasync)
+static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
+ int datasync)
{
int err = 0;
journal_t *journal;
@@ -184,6 +185,16 @@ static int ocfs2_sync_file(struct file *file, int datasync)
file->f_path.dentry->d_name.name,
(unsigned long long)datasync);
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ return err;
+
+ /*
+ * Probably don't need the i_mutex at all in here, just putting it here
+ * to be consistent with how fsync used to be called, someone more
+ * familiar with the fs could possibly remove it.
+ */
+ mutex_lock(&inode->i_mutex);
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
/*
* We still have to flush drive's caches to get data to the
@@ -200,6 +211,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
bail:
if (err)
mlog_errno(err);
+ mutex_unlock(&inode->i_mutex);
return (err < 0) ? -EIO : 0;
}
@@ -1142,6 +1154,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
if (status)
goto bail_unlock;
+ inode_dio_wait(inode);
+
if (i_size_read(inode) > attr->ia_size) {
if (ocfs2_should_order_data(inode)) {
status = ocfs2_begin_ordered_truncate(inode,
@@ -1279,11 +1293,11 @@ bail:
return err;
}
-int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
+int ocfs2_permission(struct inode *inode, int mask)
{
int ret;
- if (flags & IPERM_FLAG_RCU)
+ if (mask & MAY_NOT_BLOCK)
return -ECHILD;
ret = ocfs2_inode_lock(inode, NULL, 0);
@@ -1293,7 +1307,7 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
goto out;
}
- ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
+ ret = generic_permission(inode, mask);
ocfs2_inode_unlock(inode, 0);
out:
@@ -1936,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (ret < 0)
mlog_errno(ret);
+ if (file && (file->f_flags & O_SYNC))
+ handle->h_sync = 1;
+
ocfs2_commit_trans(osb, handle);
out_inode_unlock:
@@ -2038,6 +2055,23 @@ out:
return ret;
}
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+ wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+
+ wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+ int blockmask = inode->i_sb->s_blocksize - 1;
+ loff_t final_size = pos + count;
+
+ if ((pos & blockmask) || (final_size & blockmask))
+ return 1;
+ return 0;
+}
+
static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
struct file *file,
loff_t pos, size_t count,
@@ -2216,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
+ int unaligned_dio = 0;
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2236,9 +2271,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
ocfs2_iocb_clear_sem_locked(iocb);
relock:
- /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
+ /* to match setattr's i_mutex -> rw_lock ordering */
if (direct_io) {
- down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_sem_locked(iocb);
@@ -2284,13 +2318,16 @@ relock:
goto out;
}
+ if (direct_io && !is_sync_kiocb(iocb))
+ unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+ *ppos);
+
/*
* We can't complete the direct I/O as requested, fall back to
* buffered I/O.
*/
if (direct_io && !can_do_direct) {
ocfs2_rw_unlock(inode, rw_level);
- up_read(&inode->i_alloc_sem);
have_alloc_sem = 0;
rw_level = -1;
@@ -2299,6 +2336,18 @@ relock:
goto relock;
}
+ if (unaligned_dio) {
+ /*
+ * Wait on previous unaligned aio to complete before
+ * proceeding.
+ */
+ ocfs2_aiodio_wait(inode);
+
+ /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+ atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+ ocfs2_iocb_set_unaligned_aio(iocb);
+ }
+
/*
* To later detect whether a journal commit for sync writes is
* necessary, we sample i_size, and cluster count here.
@@ -2338,10 +2387,14 @@ out_dio:
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
+ if (unlikely(written <= 0))
+ goto no_sync;
+
if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
((file->f_flags & O_DIRECT) && !direct_io)) {
- ret = filemap_fdatawrite_range(file->f_mapping, pos,
- pos + count - 1);
+ ret = filemap_fdatawrite_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
if (ret < 0)
written = ret;
@@ -2354,15 +2407,16 @@ out_dio:
}
if (!ret)
- ret = filemap_fdatawait_range(file->f_mapping, pos,
- pos + count - 1);
+ ret = filemap_fdatawait_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
}
+no_sync:
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that
- * it can unlock our rw lock. (it's the clustered equivalent of
- * i_alloc_sem; protects truncate from racing with pending ios).
+ * it can unlock our rw lock.
* Unfortunately there are error cases which call end_io and others
* that don't. so we don't have to unlock the rw_lock if either an
* async dio is going to do it in the future or an end_io after an
@@ -2371,6 +2425,12 @@ out_dio:
if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
have_alloc_sem = 0;
+ unaligned_dio = 0;
+ }
+
+ if (unaligned_dio) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+ atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
}
out:
@@ -2378,10 +2438,8 @@ out:
ocfs2_rw_unlock(inode, rw_level);
out_sems:
- if (have_alloc_sem) {
- up_read(&inode->i_alloc_sem);
+ if (have_alloc_sem)
ocfs2_iocb_clear_sem_locked(iocb);
- }
mutex_unlock(&inode->i_mutex);
@@ -2416,9 +2474,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
struct address_space *mapping = out->f_mapping;
struct inode *inode = mapping->host;
struct splice_desc sd = {
- .total_len = len,
.flags = flags,
- .pos = *ppos,
.u.file = out,
};
@@ -2428,6 +2484,12 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
out->f_path.dentry->d_name.len,
out->f_path.dentry->d_name.name, len);
+ ret = generic_write_checks(out, ppos, &len, 0);
+ if (ret)
+ return ret;
+ sd.total_len = len;
+ sd.pos = *ppos;
+
if (pipe->inode)
mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
@@ -2531,7 +2593,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
* need locks to protect pending reads from racing with truncate.
*/
if (filp->f_flags & O_DIRECT) {
- down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
ocfs2_iocb_set_sem_locked(iocb);
@@ -2574,16 +2635,66 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
}
bail:
- if (have_alloc_sem) {
- up_read(&inode->i_alloc_sem);
+ if (have_alloc_sem)
ocfs2_iocb_clear_sem_locked(iocb);
- }
+
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
return ret;
}
+/* Refer generic_file_llseek_unlocked() */
+static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ switch (origin) {
+ case SEEK_SET:
+ break;
+ case SEEK_END:
+ offset += inode->i_size;
+ break;
+ case SEEK_CUR:
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out;
+ }
+ offset += file->f_pos;
+ break;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
+ if (ret)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+ ret = -EINVAL;
+ if (!ret && offset > inode->i_sb->s_maxbytes)
+ ret = -EINVAL;
+ if (ret)
+ goto out;
+
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;
+ return offset;
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -2593,12 +2704,14 @@ const struct inode_operations ocfs2_file_iops = {
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
+ .get_acl = ocfs2_iop_get_acl,
};
const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
+ .get_acl = ocfs2_iop_get_acl,
};
/*
@@ -2606,7 +2719,7 @@ const struct inode_operations ocfs2_special_file_iops = {
* ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
*/
const struct file_operations ocfs2_fops = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
@@ -2654,7 +2767,7 @@ const struct file_operations ocfs2_dops = {
* the cluster.
*/
const struct file_operations ocfs2_fops_no_plocks = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index f5afbbe..97bf761 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
-int ocfs2_permission(struct inode *inode, int mask, unsigned int flags);
+int ocfs2_permission(struct inode *inode, int mask);
int ocfs2_should_update_atime(struct inode *inode,
struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b4c8bb6..17454a9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -291,7 +291,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)le64_to_cpu(fe->i_blkno));
- inode->i_nlink = ocfs2_read_links_count(fe);
+ set_nlink(inode, ocfs2_read_links_count(fe));
trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
le32_to_cpu(fe->i_flags));
@@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
trace_ocfs2_cleanup_delete_inode(
(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
if (sync_data)
- write_inode_now(inode, 1);
+ filemap_write_and_wait(inode->i_mapping);
truncate_inode_pages(&inode->i_data, 0);
}
@@ -1290,7 +1290,7 @@ void ocfs2_refresh_inode(struct inode *inode,
OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
ocfs2_set_inode_flags(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
- inode->i_nlink = ocfs2_read_links_count(fe);
+ set_nlink(inode, ocfs2_read_links_count(fe));
inode->i_uid = le32_to_cpu(fe->i_uid);
inode->i_gid = le32_to_cpu(fe->i_gid);
inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1c508b1..88924a3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,6 +43,9 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
+ /* Number of outstanding AIO's which are not page aligned */
+ atomic_t ip_unaligned_aio;
+
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index bc91072..726ff26 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
(OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
if (!capable(CAP_LINUX_IMMUTABLE))
- goto bail_unlock;
+ goto bail_commit;
}
ocfs2_inode->ip_attr = flags;
@@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if (status < 0)
mlog_errno(status);
+bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
ocfs2_inode_unlock(inode, 1);
@@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
if (!oifi) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_err;
}
if (o2info_from_user(*oifi, req))
@@ -431,7 +432,7 @@ bail:
o2info_set_request_error(&oifi->ifi_req, req);
kfree(oifi);
-
+out_err:
return status;
}
@@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
if (!oiff) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_err;
}
if (o2info_from_user(*oiff, req))
@@ -716,7 +717,7 @@ bail:
o2info_set_request_error(&oiff->iff_req, req);
kfree(oiff);
-
+out_err:
return status;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 295d564..0a42ae9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* we need to run complete recovery for offline orphan slots */
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
- mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
- node_num, slot_num,
- MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
jbd2_journal_destroy(journal);
+ printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
done:
/* drop the lock on this nodes journal */
if (got_lock)
@@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
* every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
* is done to catch any orphans that are left over in orphan directories.
*
+ * It scans all slots, even ones that are in use. It does so to handle the
+ * case described below:
+ *
+ * Node 1 has an inode it was using. The dentry went away due to memory
+ * pressure. Node 1 closes the inode, but it's on the free list. The node
+ * has the open lock.
+ * Node 2 unlinks the inode. It grabs the dentry lock to notify others,
+ * but node 1 has no dentry and doesn't get the message. It trylocks the
+ * open lock, sees that another node has a PR, and does nothing.
+ * Later node 2 runs its orphan dir. It igets the inode, trylocks the
+ * open lock, sees the PR still, and does nothing.
+ * Basically, we have to trigger an orphan iput on node 1. The only way
+ * for this to happen is if node 1 runs node 2's orphan dir.
+ *
* ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
* seconds. It gets an EX lock on os_lockres and checks sequence number
* stored in LVB. If the sequence number has changed, it means some other
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 68cf2f6..a3385b6 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir + index leaf + dx root update for free list */
+ * update on dir + index leaf + dx root update for free list +
+ * previous dirblock update in the free list */
static inline int ocfs2_link_credits(struct super_block *sb)
{
- return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
+ return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
ocfs2_quota_trans_credits(sb);
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3e9393c..9cd4108 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct page *page)
{
- int ret;
+ int ret = VM_FAULT_NOPAGE;
struct inode *inode = file->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
@@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
void *fsdata;
loff_t size = i_size_read(inode);
- /*
- * Another node might have truncated while we were waiting on
- * cluster locks.
- * We don't check size == 0 before the shift. This is borrowed
- * from do_generic_file_read.
- */
last_index = (size - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(!size || page->index > last_index)) {
- ret = -EINVAL;
- goto out;
- }
/*
- * The i_size check above doesn't catch the case where nodes
- * truncated and then re-extended the file. We'll re-check the
- * page mapping after taking the page lock inside of
- * ocfs2_write_begin_nolock().
+ * There are cases that lead to the page no longer bebongs to the
+ * mapping.
+ * 1) pagecache truncates locally due to memory pressure.
+ * 2) pagecache truncates when another is taking EX lock against
+ * inode lock. see ocfs2_data_convert_worker.
+ *
+ * The i_size check doesn't catch the case where nodes truncated and
+ * then re-extended the file. We'll re-check the page mapping after
+ * taking the page lock inside of ocfs2_write_begin_nolock().
+ *
+ * Let VM retry with these cases.
*/
- if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
- /*
- * the page has been umapped in ocfs2_data_downconvert_worker.
- * So return 0 here and let VFS retry.
- */
- ret = 0;
+ if ((page->mapping != inode->i_mapping) ||
+ (!PageUptodate(page)) ||
+ (page_offset(page) >= size))
goto out;
- }
/*
* Call ocfs2_write_begin() and ocfs2_write_end() to take
@@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
- fsdata);
- if (ret < 0) {
- mlog_errno(ret);
+ if (!locked_page) {
+ ret = VM_FAULT_NOPAGE;
goto out;
}
+ ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+ fsdata);
BUG_ON(ret != len);
- ret = 0;
+ ret = VM_FAULT_LOCKED;
out:
return ret;
}
@@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
out:
ocfs2_unblock_signals(&oldset);
- if (ret)
- ret = VM_FAULT_SIGBUS;
return ret;
}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index cd94270..184c76b 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -36,7 +36,6 @@
#include "dir.h"
#include "buffer_head_io.h"
#include "sysfile.h"
-#include "suballoc.h"
#include "refcounttree.h"
#include "move_extents.h"
@@ -746,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
*/
ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
new_phys_cpos);
- if (!new_phys_cpos) {
+ if (!*new_phys_cpos) {
ret = -ENOSPC;
goto out_commit;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e5d738c..a8b2bfe 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -199,9 +199,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
* these are used by the support functions here and in
* callers. */
if (S_ISDIR(mode))
- inode->i_nlink = 2;
- else
- inode->i_nlink = 1;
+ set_nlink(inode, 2);
inode_init_owner(inode, dir, mode);
dquot_initialize(inode);
return inode;
@@ -1379,7 +1377,7 @@ static int ocfs2_rename(struct inode *old_dir,
}
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
new_inode->i_ctime = CURRENT_TIME;
}
old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
@@ -1387,9 +1385,9 @@ static int ocfs2_rename(struct inode *old_dir,
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
&old_inode_dot_dot_res, new_dir);
- old_dir->i_nlink--;
+ drop_nlink(old_dir);
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
} else {
inc_nlink(new_dir);
mark_inode_dirty(new_dir);
@@ -2018,7 +2016,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
ocfs2_add_links_count(orphan_fe, 1);
- orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
ocfs2_journal_dirty(handle, orphan_dir_bh);
status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
@@ -2116,7 +2114,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
ocfs2_add_links_count(orphan_fe, -1);
- orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
ocfs2_journal_dirty(handle, orphan_dir_bh);
leave:
@@ -2282,7 +2280,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
goto leave;
}
- inode->i_nlink = 0;
+ clear_nlink(inode);
/* do the real work now. */
status = __ocfs2_mknod_locked(dir, inode,
0, &new_di_bh, parent_di_bh, handle,
@@ -2437,7 +2435,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
di = (struct ocfs2_dinode *)di_bh->b_data;
le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
di->i_orphaned_slot = 0;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
ocfs2_set_links_count(di, inode->i_nlink);
ocfs2_journal_dirty(handle, di_bh);
@@ -2498,4 +2496,5 @@ const struct inode_operations ocfs2_dir_iops = {
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
+ .get_acl = ocfs2_iop_get_acl,
};
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4092858..d355e6e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
{
- __test_and_set_bit_le(bit, bitmap);
+ __set_bit_le(bit, bitmap);
}
#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
{
- __test_and_clear_bit_le(bit, bitmap);
+ __clear_bit_le(bit, bitmap);
}
#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
#define ocfs2_test_bit test_bit_le
#define ocfs2_find_next_zero_bit find_next_zero_bit_le
#define ocfs2_find_next_bit find_next_bit_le
+
+static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
+{
+#if BITS_PER_LONG == 64
+ *bit += ((unsigned long) addr & 7UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~7UL);
+#elif BITS_PER_LONG == 32
+ *bit += ((unsigned long) addr & 3UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~3UL);
+#else
+#error "how many bits you are?!"
+#endif
+ return addr;
+}
+
+static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_set_bit(bit, bitmap);
+}
+
+static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_clear_bit(bit, bitmap);
+}
+
+static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ return ocfs2_test_bit(bit, bitmap);
+}
+
+static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
+ int start)
+{
+ int fix = 0, ret, tmpmax;
+ bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
+ tmpmax = max + fix;
+ start += fix;
+
+ ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
+}
+
#endif /* OCFS2_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 92fcd57..a40e5ce 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -712,6 +712,12 @@ static int ocfs2_release_dquot(struct dquot *dquot)
*/
if (status < 0)
mlog_errno(status);
+ /*
+ * Clear dq_off so that we search for the structure in quota file next
+ * time we acquire it. The structure might be deleted and reallocated
+ * elsewhere by another node while our dquot structure is on freelist.
+ */
+ dquot->dq_off = 0;
clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_trans:
ocfs2_commit_trans(osb, handle);
@@ -750,16 +756,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
status = ocfs2_lock_global_qf(info, 1);
if (status < 0)
goto out;
- if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
- status = ocfs2_qinfo_lock(info, 0);
- if (status < 0)
- goto out_dq;
- status = qtree_read_dquot(&info->dqi_gi, dquot);
- ocfs2_qinfo_unlock(info, 0);
- if (status < 0)
- goto out_dq;
- }
- set_bit(DQ_READ_B, &dquot->dq_flags);
+ status = ocfs2_qinfo_lock(info, 0);
+ if (status < 0)
+ goto out_dq;
+ /*
+ * We always want to read dquot structure from disk because we don't
+ * know what happened with it while it was on freelist.
+ */
+ status = qtree_read_dquot(&info->dqi_gi, dquot);
+ ocfs2_qinfo_unlock(info, 0);
+ if (status < 0)
+ goto out_dq;
OCFS2_DQUOT(dquot)->dq_use_count++;
OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc8007f..b6cfcf2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
int status = 0;
struct ocfs2_quota_recovery *rec;
- mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
rec = ocfs2_alloc_quota_recovery();
if (!rec)
return ERR_PTR(-ENOMEM);
@@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
goto out_commit;
}
lock_buffer(qbh);
- WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
- ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+ WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
+ ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(qbh);
ocfs2_journal_dirty(handle, qbh);
@@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
struct inode *lqinode;
unsigned int flags;
- mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
for (type = 0; type < MAXQUOTAS; type++) {
if (list_empty(&(rec->r_list[type])))
@@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
/* Someone else is holding the lock? Then he must be
* doing the recovery. Just skip the file... */
if (status == -EAGAIN) {
- mlog(ML_NOTICE, "skipping quota recovery for slot %d "
- "because quota file is locked.\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
+ "device (%s) for slot %d because quota file is "
+ "locked.\n", osb->dev_str, slot_num);
status = 0;
goto out_put;
} else if (status < 0) {
@@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
* ol_quota_entries_per_block(sb);
}
- found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
+ found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
/* We failed? */
if (found == len) {
mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
@@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
struct ocfs2_local_disk_chunk *dchunk;
dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
- ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
+ ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, -1);
}
@@ -1289,16 +1294,12 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
(od->dq_chunk->qc_headerbh->b_data);
/* Mark structure as freed */
lock_buffer(od->dq_chunk->qc_headerbh);
- ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
+ ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(od->dq_chunk->qc_headerbh);
ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
out:
- /* Clear the read bit so that next time someone uses this
- * dquot he reads fresh info from disk and allocates local
- * dquot structure */
- clear_bit(DQ_READ_B, &dquot->dq_flags);
return status;
}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 15d29cc..9f32d7c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4368,25 +4368,6 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}
-/* copied from user_path_parent. */
-static int ocfs2_user_path_parent(const char __user *path,
- struct nameidata *nd, char **name)
-{
- char *s = getname(path);
- int error;
-
- if (IS_ERR(s))
- return PTR_ERR(s);
-
- error = kern_path_parent(s, nd);
- if (error)
- putname(s);
- else
- *name = s;
-
- return error;
-}
-
/**
* ocfs2_vfs_reflink - Create a reference-counted link
*
@@ -4460,10 +4441,8 @@ int ocfs2_reflink_ioctl(struct inode *inode,
bool preserve)
{
struct dentry *new_dentry;
- struct nameidata nd;
- struct path old_path;
+ struct path old_path, new_path;
int error;
- char *to = NULL;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
@@ -4474,39 +4453,33 @@ int ocfs2_reflink_ioctl(struct inode *inode,
return error;
}
- error = ocfs2_user_path_parent(newname, &nd, &to);
- if (error) {
+ new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
+ error = PTR_ERR(new_dentry);
+ if (IS_ERR(new_dentry)) {
mlog_errno(error);
goto out;
}
error = -EXDEV;
- if (old_path.mnt != nd.path.mnt)
- goto out_release;
- new_dentry = lookup_create(&nd, 0);
- error = PTR_ERR(new_dentry);
- if (IS_ERR(new_dentry)) {
+ if (old_path.mnt != new_path.mnt) {
mlog_errno(error);
- goto out_unlock;
+ goto out_dput;
}
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(new_path.mnt);
if (error) {
mlog_errno(error);
goto out_dput;
}
error = ocfs2_vfs_reflink(old_path.dentry,
- nd.path.dentry->d_inode,
+ new_path.dentry->d_inode,
new_dentry, preserve);
- mnt_drop_write(nd.path.mnt);
+ mnt_drop_write(new_path.mnt);
out_dput:
dput(new_dentry);
-out_unlock:
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-out_release:
- path_put(&nd.path);
- putname(to);
+ mutex_unlock(&new_path.dentry->d_inode->i_mutex);
+ path_put(&new_path);
out:
path_put(&old_path);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 26fc001..1424c15 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
goto bail;
}
} else
- mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
- slot);
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 19965b0..9436801 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -28,6 +28,7 @@
#include "cluster/masklog.h"
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
#include "stackglue.h"
@@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
}
/*
+ * Check if this node is heartbeating and is connected to all other
+ * heartbeating nodes.
+ */
+static int o2cb_cluster_check(void)
+{
+ u8 node_num;
+ int i;
+ unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+ node_num = o2nm_this_node();
+ if (node_num == O2NM_MAX_NODES) {
+ printk(KERN_ERR "o2cb: This node has not been configured.\n");
+ return -EINVAL;
+ }
+
+ /*
+ * o2dlm expects o2net sockets to be created. If not, then
+ * dlm_join_domain() fails with a stack of errors which are both cryptic
+ * and incomplete. The idea here is to detect upfront whether we have
+ * managed to connect to all nodes or not. If not, then list the nodes
+ * to allow the user to check the configuration (incorrect IP, firewall,
+ * etc.) Yes, this is racy. But its not the end of the world.
+ */
+#define O2CB_MAP_STABILIZE_COUNT 60
+ for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
+ o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ if (!test_bit(node_num, hbmap)) {
+ printk(KERN_ERR "o2cb: %s heartbeat has not been "
+ "started.\n", (o2hb_global_heartbeat_active() ?
+ "Global" : "Local"));
+ return -EINVAL;
+ }
+ o2net_fill_node_map(netmap, sizeof(netmap));
+ /* Force set the current node to allow easy compare */
+ set_bit(node_num, netmap);
+ if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ return 0;
+ if (i < O2CB_MAP_STABILIZE_COUNT)
+ msleep(1000);
+ }
+
+ printk(KERN_ERR "o2cb: This node could not connect to nodes:");
+ i = -1;
+ while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (!test_bit(i, netmap))
+ printk(" %u", i);
+ }
+ printk(".\n");
+
+ return -ENOTCONN;
+}
+
+/*
* Called from the dlm when it's about to evict a node. This is how the
* classic stack signals node death.
*/
@@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)
{
struct ocfs2_cluster_connection *conn = data;
- mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
- node_num, conn->cc_namelen, conn->cc_name);
+ printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
+ node_num, conn->cc_namelen, conn->cc_name);
conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
}
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
BUG_ON(conn == NULL);
BUG_ON(conn->cc_proto == NULL);
- /* for now we only have one cluster/node, make sure we see it
- * in the heartbeat universe */
- if (!o2hb_check_local_node_heartbeating()) {
- if (o2hb_global_heartbeat_active())
- mlog(ML_ERROR, "Global heartbeat not started\n");
- rc = -EINVAL;
+ /* Ensure cluster stack is up and all nodes are connected */
+ rc = o2cb_cluster_check();
+ if (rc) {
+ printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
+ "before retrying.\n");
goto out;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 56f6102..5fe6b1e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -54,6 +54,7 @@
#include "ocfs1_fs_compat.h"
#include "alloc.h"
+#include "aops.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "export.h"
@@ -1107,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
ocfs2_set_ro_flag(osb, 1);
- printk(KERN_NOTICE "Readonly device detected. No cluster "
- "services will be utilized for this mount. Recovery "
- "will be skipped.\n");
+ printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+ "Cluster services will not be used for this mount. "
+ "Recovery will be skipped.\n", osb->dev_str);
}
if (!ocfs2_is_hard_readonly(osb)) {
@@ -1582,8 +1583,8 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
seq_printf(s, ",localflocks,");
if (osb->osb_cluster_stack[0])
- seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
- osb->osb_cluster_stack);
+ seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack,
+ OCFS2_STACK_LABEL_LEN);
if (opts & OCFS2_MOUNT_USRQUOTA)
seq_printf(s, ",usrquota");
if (opts & OCFS2_MOUNT_GRPQUOTA)
@@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
return 0;
}
+wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
static int __init ocfs2_init(void)
{
- int status;
+ int status, i;
ocfs2_print_version();
+ for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
+ init_waitqueue_head(&ocfs2__ioend_wq[i]);
+
status = init_ocfs2_uptodate_cache();
if (status < 0) {
mlog_errno(status);
@@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
oi->ip_dir_start_lookup = 0;
-
+ atomic_set(&oi->ip_unaligned_aio, 0);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
@@ -1974,7 +1980,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
* If we failed before we got a uuid_str yet, we can't stop
* heartbeat. Otherwise, do it.
*/
- if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+ if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
+ !ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
if (osb->cconn)
@@ -2353,7 +2360,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog_errno(status);
goto bail;
}
- cleancache_init_shared_fs((char *)&uuid_net_key, sb);
+ cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
bail:
return status;
@@ -2462,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
} else {
- mlog(ML_NOTICE, "File system was not unmounted cleanly, "
- "recovering volume.\n");
+ printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+ "unmounted cleanly, recovering it.\n", osb->dev_str);
}
local = ocfs2_mount_local(osb);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 40c7de0..74ff74c 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -31,17 +31,15 @@ extern struct workqueue_struct *ocfs2_wq;
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
-void __ocfs2_error(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
+__printf(3, 4)
+void __ocfs2_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
-void __ocfs2_abort(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
+__printf(3, 4)
+void __ocfs2_abort(struct super_block *sb, const char *function,
+ const char *fmt, ...);
#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 61a84cf..bef187b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode,
}
ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
- if (ret < 0) {
- mlog_errno(ret);
- break;
- }
ocfs2_commit_trans(osb, ctxt.handle);
if (ctxt.meta_ac) {
ocfs2_free_alloc_context(ctxt.meta_ac);
ctxt.meta_ac = NULL;
}
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
}
if (ctxt.meta_ac)
@@ -7195,20 +7197,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
{
int ret = 0;
struct buffer_head *dir_bh = NULL;
- struct ocfs2_security_xattr_info si = {
- .enable = 1,
- };
- ret = ocfs2_init_security_get(inode, dir, qstr, &si);
- if (!ret) {
- ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
- si.name, si.value, si.value_len,
- XATTR_CREATE);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
- } else if (ret != -EOPNOTSUPP) {
+ ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
+ if (ret) {
mlog_errno(ret);
goto leave;
}
@@ -7265,6 +7256,22 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
+int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
+{
+ const struct xattr *xattr;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, XATTR_CREATE);
+ if (err)
+ break;
+ }
+ return err;
+}
+
int ocfs2_init_security_get(struct inode *inode,
struct inode *dir,
const struct qstr *qstr,
@@ -7273,8 +7280,13 @@ int ocfs2_init_security_get(struct inode *inode,
/* check whether ocfs2 support feature xattr */
if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
return -EOPNOTSUPP;
- return security_inode_init_security(inode, dir, qstr, &si->name,
- &si->value, &si->value_len);
+ if (si)
+ return security_old_inode_init_security(inode, dir, qstr,
+ &si->name, &si->value,
+ &si->value_len);
+
+ return security_inode_init_security(inode, dir, qstr,
+ &ocfs2_initxattrs, NULL);
}
int ocfs2_init_security_set(handle_t *handle,
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 483442e..d1aca1d 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -214,7 +214,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
}
/* otherwise we clear all bit were set ... */
while (--i >= *beg)
- reiserfs_test_and_clear_le_bit
+ reiserfs_clear_le_bit
(i, bh->b_data);
reiserfs_restore_prepared_buffer(s, bh);
*beg = org;
@@ -1222,15 +1222,11 @@ void reiserfs_cache_bitmap_metadata(struct super_block *sb,
info->free_count = 0;
while (--cur >= (unsigned long *)bh->b_data) {
- int i;
-
/* 0 and ~0 are special, we can optimize for them */
if (*cur == 0)
info->free_count += BITS_PER_LONG;
else if (*cur != ~0L) /* A mix, investigate */
- for (i = BITS_PER_LONG - 1; i >= 0; i--)
- if (!reiserfs_test_le_bit(i, cur))
- info->free_count++;
+ info->free_count += BITS_PER_LONG - hweight_long(*cur);
}
}
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 198dabf..8048eea 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -14,7 +14,8 @@
extern const struct reiserfs_key MIN_KEY;
static int reiserfs_readdir(struct file *, void *, filldir_t);
-static int reiserfs_dir_fsync(struct file *filp, int datasync);
+static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
+ int datasync);
const struct file_operations reiserfs_dir_operations = {
.llseek = generic_file_llseek,
@@ -27,13 +28,21 @@ const struct file_operations reiserfs_dir_operations = {
#endif
};
-static int reiserfs_dir_fsync(struct file *filp, int datasync)
+static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
+ int datasync)
{
struct inode *inode = filp->f_mapping->host;
int err;
+
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ return err;
+
+ mutex_lock(&inode->i_mutex);
reiserfs_write_lock(inode->i_sb);
err = reiserfs_commit_for_inode(inode);
reiserfs_write_unlock(inode->i_sb);
+ mutex_unlock(&inode->i_mutex);
if (err < 0)
return err;
return 0;
@@ -119,6 +128,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
char *d_name;
off_t d_off;
ino_t d_ino;
+ loff_t cur_pos = deh_offset(deh);
if (!de_visible(deh))
/* it is hidden entry */
@@ -191,8 +201,9 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
if (local_buf != small_buf) {
kfree(local_buf);
}
- // next entry should be looked for with such offset
- next_pos = deh_offset(deh) + 1;
+
+ /* deh_offset(deh) may be invalid now. */
+ next_pos = cur_pos + 1;
if (item_moved(&tmp_ih, &path_to_entry)) {
goto research;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 91f080c..f011185 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -126,7 +126,7 @@ static int reiserfs_file_open(struct inode *inode, struct file *file)
return err;
}
-static void reiserfs_vfs_truncate_file(struct inode *inode)
+void reiserfs_vfs_truncate_file(struct inode *inode)
{
mutex_lock(&(REISERFS_I(inode)->tailpack));
reiserfs_truncate_file(inode, 1);
@@ -140,12 +140,18 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
* be removed...
*/
-static int reiserfs_sync_file(struct file *filp, int datasync)
+static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
+ int datasync)
{
struct inode *inode = filp->f_mapping->host;
int err;
int barrier_done;
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ return err;
+
+ mutex_lock(&inode->i_mutex);
BUG_ON(!S_ISREG(inode->i_mode));
err = sync_mapping_buffers(inode->i_mapping);
reiserfs_write_lock(inode->i_sb);
@@ -153,6 +159,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
reiserfs_write_unlock(inode->i_sb);
if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ mutex_unlock(&inode->i_mutex);
if (barrier_done < 0)
return barrier_done;
return (err < 0) ? -EIO : 0;
@@ -305,11 +312,11 @@ const struct file_operations reiserfs_file_operations = {
};
const struct inode_operations reiserfs_file_inode_operations = {
- .truncate = reiserfs_vfs_truncate_file,
.setattr = reiserfs_setattr,
.setxattr = reiserfs_setxattr,
.getxattr = reiserfs_getxattr,
.listxattr = reiserfs_listxattr,
.removexattr = reiserfs_removexattr,
.permission = reiserfs_permission,
+ .get_acl = reiserfs_get_acl,
};
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ebe8db4..fcb07e5 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1154,7 +1154,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
set_inode_item_key_version(inode, KEY_FORMAT_3_5);
set_inode_sd_version(inode, STAT_DATA_V1);
inode->i_mode = sd_v1_mode(sd);
- inode->i_nlink = sd_v1_nlink(sd);
+ set_nlink(inode, sd_v1_nlink(sd));
inode->i_uid = sd_v1_uid(sd);
inode->i_gid = sd_v1_gid(sd);
inode->i_size = sd_v1_size(sd);
@@ -1199,7 +1199,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
inode->i_mode = sd_v2_mode(sd);
- inode->i_nlink = sd_v2_nlink(sd);
+ set_nlink(inode, sd_v2_nlink(sd));
inode->i_uid = sd_v2_uid(sd);
inode->i_size = sd_v2_size(sd);
inode->i_gid = sd_v2_gid(sd);
@@ -1444,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
/* a stale NFS handle can trigger this without it being an error */
pathrelse(&path_to_sd);
reiserfs_make_bad_inode(inode);
- inode->i_nlink = 0;
+ clear_nlink(inode);
return;
}
@@ -1475,6 +1475,11 @@ void reiserfs_read_locked_inode(struct inode *inode,
reiserfs_check_path(&path_to_sd); /* init inode should be relsing */
+ /*
+ * Stat data v1 doesn't support ACLs.
+ */
+ if (get_inode_sd_version(inode) == STAT_DATA_V1)
+ cache_no_acl(inode);
}
/**
@@ -1832,7 +1837,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
#endif
/* fill stat data */
- inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);
+ set_nlink(inode, (S_ISDIR(mode) ? 2 : 1));
/* uid and gid must already be set by the caller for quota init */
@@ -1989,7 +1994,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
make_bad_inode(inode);
out_inserted_sd:
- inode->i_nlink = 0;
+ clear_nlink(inode);
th->t_trans_id = 0; /* so the caller can't use this handle later */
unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
iput(inode);
@@ -3075,9 +3080,8 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode = file->f_mapping->host;
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- reiserfs_get_blocks_direct_io, NULL);
+ ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ reiserfs_get_blocks_direct_io);
/*
* In case of error extending write may have instantiated a few
@@ -3087,8 +3091,10 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
loff_t isize = i_size_read(inode);
loff_t end = offset + iov_length(iov, nr_segs);
- if (end > isize)
- vmtruncate(inode, isize);
+ if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
+ truncate_setsize(inode, isize);
+ reiserfs_vfs_truncate_file(inode);
+ }
}
return ret;
@@ -3120,6 +3126,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
error = -EFBIG;
goto out;
}
+
+ inode_dio_wait(inode);
+
/* fill in hole pointers in the expanding truncate case. */
if (attr->ia_size > inode->i_size) {
error = generic_cont_expand_simple(inode, attr->ia_size);
@@ -3199,8 +3208,19 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
*/
reiserfs_write_unlock_once(inode->i_sb, depth);
if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode))
- error = vmtruncate(inode, attr->ia_size);
+ attr->ia_size != i_size_read(inode)) {
+ error = inode_newsize_ok(inode, attr->ia_size);
+ if (!error) {
+ /*
+ * Could race against reiserfs_file_release
+ * if called from NFS, so take tailpack mutex.
+ */
+ mutex_lock(&REISERFS_I(inode)->tailpack);
+ truncate_setsize(inode, attr->ia_size);
+ reiserfs_truncate_file(inode, 1);
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
+ }
+ }
if (!error) {
setattr_copy(inode, attr);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index c5e82ec..938a77f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -291,14 +291,13 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb,
for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
jb = jb_array + i;
jb->journal_list = NULL;
- jb->bitmaps = vmalloc(mem);
+ jb->bitmaps = vzalloc(mem);
if (!jb->bitmaps) {
reiserfs_warning(sb, "clm-2000", "unable to "
"allocate bitmaps for journal lists");
failed = 1;
break;
}
- memset(jb->bitmaps, 0, mem);
}
if (failed) {
free_list_bitmaps(sb, jb_array);
@@ -353,11 +352,10 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
if (num_cnodes <= 0) {
return NULL;
}
- head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
+ head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
if (!head) {
return NULL;
}
- memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode));
head[0].prev = NULL;
head[0].next = head + 1;
for (i = 1; i < num_cnodes; i++) {
@@ -678,23 +676,19 @@ struct buffer_chunk {
static void write_chunk(struct buffer_chunk *chunk)
{
int i;
- get_fs_excl();
for (i = 0; i < chunk->nr; i++) {
submit_logged_buffer(chunk->bh[i]);
}
chunk->nr = 0;
- put_fs_excl();
}
static void write_ordered_chunk(struct buffer_chunk *chunk)
{
int i;
- get_fs_excl();
for (i = 0; i < chunk->nr; i++) {
submit_ordered_buffer(chunk->bh[i]);
}
chunk->nr = 0;
- put_fs_excl();
}
static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
@@ -986,8 +980,6 @@ static int flush_commit_list(struct super_block *s,
return 0;
}
- get_fs_excl();
-
/* before we can put our commit blocks on disk, we have to make sure everyone older than
** us is on disk too
*/
@@ -1145,7 +1137,6 @@ static int flush_commit_list(struct super_block *s,
if (retval)
reiserfs_abort(s, retval, "Journal write error in %s",
__func__);
- put_fs_excl();
return retval;
}
@@ -1374,8 +1365,6 @@ static int flush_journal_list(struct super_block *s,
return 0;
}
- get_fs_excl();
-
/* if all the work is already done, get out of here */
if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
atomic_read(&(jl->j_commit_left)) <= 0) {
@@ -1597,7 +1586,6 @@ static int flush_journal_list(struct super_block *s,
put_journal_list(s, jl);
if (flushall)
mutex_unlock(&journal->j_flush_mutex);
- put_fs_excl();
return err;
}
@@ -1928,15 +1916,19 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
}
reiserfs_mounted_fs_count--;
- /* wait for all commits to finish */
- cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
/*
* We must release the write lock here because
* the workqueue job (flush_async_commit) needs this lock
*/
reiserfs_write_unlock(sb);
- flush_workqueue(commit_wq);
+ /*
+ * Cancel flushing of old commits. Note that this work will not
+ * be requeued because superblock is being shutdown and doesn't
+ * have MS_ACTIVE set.
+ */
+ /* wait for all commits to finish */
+ cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
if (!reiserfs_mounted_fs_count) {
destroy_workqueue(commit_wq);
@@ -2695,14 +2687,13 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
* dependency inversion warnings.
*/
reiserfs_write_unlock(sb);
- journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal));
+ journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
if (!journal) {
reiserfs_warning(sb, "journal-1256",
"unable to get memory for journal structure");
reiserfs_write_lock(sb);
return 1;
}
- memset(journal, 0, sizeof(struct reiserfs_journal));
INIT_LIST_HEAD(&journal->j_bitmap_nodes);
INIT_LIST_HEAD(&journal->j_prealloc_list);
INIT_LIST_HEAD(&journal->j_working_list);
@@ -3108,7 +3099,6 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
th->t_trans_id = journal->j_trans_id;
unlock_journal(sb);
INIT_LIST_HEAD(&th->t_list);
- get_fs_excl();
return 0;
out_fail:
@@ -3964,7 +3954,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
flush = flags & FLUSH_ALL;
wait_on_commit = flags & WAIT;
- put_fs_excl();
current->journal_info = th->t_handle_save;
reiserfs_check_lock_depth(sb, "journal end");
if (journal->j_len == 0) {
@@ -4226,8 +4215,15 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
if (flush) {
flush_commit_list(sb, jl, 1);
flush_journal_list(sb, jl, 1);
- } else if (!(jl->j_state & LIST_COMMIT_PENDING))
- queue_delayed_work(commit_wq, &journal->j_work, HZ / 10);
+ } else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
+ /*
+ * Avoid queueing work when sb is being shut down. Transaction
+ * will be flushed on journal shutdown.
+ */
+ if (sb->s_flags & MS_ACTIVE)
+ queue_delayed_work(commit_wq,
+ &journal->j_work, HZ / 10);
+ }
/* if the next transaction has any chance of wrapping, flush
** transactions that might get overwritten. If any journal lists are very
@@ -4316,4 +4312,3 @@ void reiserfs_abort_journal(struct super_block *sb, int errno)
dump_stack();
#endif
}
-
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 1186626..80058e8 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -19,7 +19,7 @@
#include <linux/reiserfs_xattr.h>
#include <linux/quotaops.h>
-#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
+#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
// directory item contains array of entry headers. This performs
@@ -622,7 +622,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
dentry->d_name.len, inode, 1 /*visible */ );
if (retval) {
int err;
- inode->i_nlink--;
+ drop_nlink(inode);
reiserfs_update_sd(&th, inode);
err = journal_end(&th, dir->i_sb, jbegin_count);
if (err)
@@ -702,7 +702,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
dentry->d_name.len, inode, 1 /*visible */ );
if (retval) {
int err;
- inode->i_nlink--;
+ drop_nlink(inode);
reiserfs_update_sd(&th, inode);
err = journal_end(&th, dir->i_sb, jbegin_count);
if (err)
@@ -787,7 +787,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dentry->d_name.len, inode, 1 /*visible */ );
if (retval) {
int err;
- inode->i_nlink = 0;
+ clear_nlink(inode);
DEC_DIR_INODE_NLINK(dir);
reiserfs_update_sd(&th, inode);
err = journal_end(&th, dir->i_sb, jbegin_count);
@@ -964,7 +964,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
reiserfs_warning(inode->i_sb, "reiserfs-7042",
"deleting nonexistent file (%lu), %d",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
drop_nlink(inode);
@@ -1086,7 +1086,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
dentry->d_name.len, inode, 1 /*visible */ );
if (retval) {
int err;
- inode->i_nlink--;
+ drop_nlink(inode);
reiserfs_update_sd(&th, inode);
err = journal_end(&th, parent_dir->i_sb, jbegin_count);
if (err)
@@ -1129,7 +1129,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
retval = journal_begin(&th, dir->i_sb, jbegin_count);
if (retval) {
- inode->i_nlink--;
+ drop_nlink(inode);
reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -1144,7 +1144,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
if (retval) {
int err;
- inode->i_nlink--;
+ drop_nlink(inode);
err = journal_end(&th, dir->i_sb, jbegin_count);
reiserfs_write_unlock(dir->i_sb);
return err ? err : retval;
@@ -1529,6 +1529,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
.listxattr = reiserfs_listxattr,
.removexattr = reiserfs_removexattr,
.permission = reiserfs_permission,
+ .get_acl = reiserfs_get_acl,
};
/*
@@ -1545,6 +1546,7 @@ const struct inode_operations reiserfs_symlink_inode_operations = {
.listxattr = reiserfs_listxattr,
.removexattr = reiserfs_removexattr,
.permission = reiserfs_permission,
+ .get_acl = reiserfs_get_acl,
};
@@ -1558,5 +1560,5 @@ const struct inode_operations reiserfs_special_inode_operations = {
.listxattr = reiserfs_listxattr,
.removexattr = reiserfs_removexattr,
.permission = reiserfs_permission,
-
+ .get_acl = reiserfs_get_acl,
};
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index b3a94d2..7483279 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -111,15 +111,13 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
/* allocate additional bitmap blocks, reallocate array of bitmap
* block pointers */
bitmap =
- vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
+ vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
if (!bitmap) {
/* Journal bitmaps are still supersized, but the memory isn't
* leaked, so I guess it's ok */
printk("reiserfs_resize: unable to allocate memory.\n");
return -ENOMEM;
}
- memset(bitmap, 0,
- sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
for (i = 0; i < bmap_nr; i++)
bitmap[i] = old_bitmap[i];
@@ -136,7 +134,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
return -EIO;
}
memset(bh->b_data, 0, sb_blocksize(sb));
- reiserfs_test_and_set_le_bit(0, bh->b_data);
+ reiserfs_set_le_bit(0, bh->b_data);
reiserfs_cache_bitmap_metadata(s, bh, bitmap + i);
set_buffer_uptodate(bh);
@@ -172,7 +170,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
reiserfs_prepare_for_journal(s, bh, 1);
for (i = block_r; i < s->s_blocksize * 8; i++)
- reiserfs_test_and_clear_le_bit(i, bh->b_data);
+ reiserfs_clear_le_bit(i, bh->b_data);
info->free_count += s->s_blocksize * 8 - block_r;
journal_mark_dirty(&th, s, bh);
@@ -190,7 +188,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
reiserfs_prepare_for_journal(s, bh, 1);
for (i = block_r_new; i < s->s_blocksize * 8; i++)
- reiserfs_test_and_set_le_bit(i, bh->b_data);
+ reiserfs_set_le_bit(i, bh->b_data);
journal_mark_dirty(&th, s, bh);
brelse(bh);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7527623..569498a 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1663,6 +1663,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
/* Set default values for options: non-aggressive tails, RO on errors */
REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO);
+ REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
/* no preallocation minimum, be smart in
reiserfs_file_write instead */
REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6e3ca4e..04eecc4 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -555,11 +555,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
reiserfs_write_unlock(inode->i_sb);
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
- down_write(&dentry->d_inode->i_alloc_sem);
+ inode_dio_wait(dentry->d_inode);
reiserfs_write_lock(inode->i_sb);
err = reiserfs_setattr(dentry, &newattrs);
- up_write(&dentry->d_inode->i_alloc_sem);
mutex_unlock(&dentry->d_inode->i_mutex);
} else
update_ctime(inode);
@@ -868,27 +867,6 @@ out:
return err;
}
-static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
-{
- struct posix_acl *acl;
- int error = -EAGAIN; /* do regular unix permission checks by default */
-
- if (flags & IPERM_FLAG_RCU)
- return -ECHILD;
-
- acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
-
- if (acl) {
- if (!IS_ERR(acl)) {
- error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- } else if (PTR_ERR(acl) != -ENODATA)
- error = PTR_ERR(acl);
- }
-
- return error;
-}
-
static int create_privroot(struct dentry *dentry)
{
int err;
@@ -952,7 +930,7 @@ static int xattr_mount_check(struct super_block *s)
return 0;
}
-int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
+int reiserfs_permission(struct inode *inode, int mask)
{
/*
* We don't do permission checks on the internal objects.
@@ -961,15 +939,7 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
if (IS_PRIVATE(inode))
return 0;
-#ifdef CONFIG_REISERFS_FS_XATTR
- /*
- * Stat data v1 doesn't support ACLs.
- */
- if (get_inode_sd_version(inode) != STAT_DATA_V1)
- return generic_permission(inode, mask, flags,
- reiserfs_check_acl);
-#endif
- return generic_permission(inode, mask, flags, NULL);
+ return generic_permission(inode, mask);
}
static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 3dc38f1..6da0396 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -272,12 +272,10 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
if (acl) {
- mode_t mode = inode->i_mode;
- error = posix_acl_equiv_mode(acl, &mode);
+ error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
return error;
else {
- inode->i_mode = mode;
if (error == 0)
acl = NULL;
}
@@ -354,10 +352,6 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
return PTR_ERR(acl);
if (acl) {
- struct posix_acl *acl_copy;
- mode_t mode = inode->i_mode;
- int need_acl;
-
/* Copy the default ACL to the default ACL of a new directory */
if (S_ISDIR(inode->i_mode)) {
err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
@@ -368,29 +362,13 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
/* Now we reconcile the new ACL and the mode,
potentially modifying both */
- acl_copy = posix_acl_clone(acl, GFP_NOFS);
- if (!acl_copy) {
- err = -ENOMEM;
- goto cleanup;
- }
-
- need_acl = posix_acl_create_masq(acl_copy, &mode);
- if (need_acl >= 0) {
- if (mode != inode->i_mode) {
- inode->i_mode = mode;
- }
+ err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
+ if (err < 0)
+ return err;
- /* If we need an ACL.. */
- if (need_acl > 0) {
- err = reiserfs_set_acl(th, inode,
- ACL_TYPE_ACCESS,
- acl_copy);
- if (err)
- goto cleanup_copy;
- }
- }
- cleanup_copy:
- posix_acl_release(acl_copy);
+ /* If we need an ACL.. */
+ if (err > 0)
+ err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl);
cleanup:
posix_acl_release(acl);
} else {
@@ -445,7 +423,10 @@ int reiserfs_cache_default_acl(struct inode *inode)
int reiserfs_acl_chmod(struct inode *inode)
{
- struct posix_acl *acl, *clone;
+ struct reiserfs_transaction_handle th;
+ struct posix_acl *acl;
+ size_t size;
+ int depth;
int error;
if (S_ISLNK(inode->i_mode))
@@ -463,30 +444,22 @@ int reiserfs_acl_chmod(struct inode *inode)
return 0;
if (IS_ERR(acl))
return PTR_ERR(acl);
- clone = posix_acl_clone(acl, GFP_NOFS);
- posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
- error = posix_acl_chmod_masq(clone, inode->i_mode);
+ error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode);
+ if (error)
+ return error;
+
+ size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count));
+ depth = reiserfs_write_lock_once(inode->i_sb);
+ error = journal_begin(&th, inode->i_sb, size * 2);
if (!error) {
- struct reiserfs_transaction_handle th;
- size_t size = reiserfs_xattr_nblocks(inode,
- reiserfs_acl_size(clone->a_count));
- int depth;
-
- depth = reiserfs_write_lock_once(inode->i_sb);
- error = journal_begin(&th, inode->i_sb, size * 2);
- if (!error) {
- int error2;
- error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS,
- clone);
- error2 = journal_end(&th, inode->i_sb, size * 2);
- if (error2)
- error = error2;
- }
- reiserfs_write_unlock_once(inode->i_sb, depth);
+ int error2;
+ error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl);
+ error2 = journal_end(&th, inode->i_sb, size * 2);
+ if (error2)
+ error = error2;
}
- posix_acl_release(clone);
+ reiserfs_write_unlock_once(inode->i_sb, depth);
+ posix_acl_release(acl);
return error;
}
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index ef66c18..534668f 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -66,8 +66,8 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
if (IS_PRIVATE(dir))
return 0;
- error = security_inode_init_security(inode, dir, qstr, &sec->name,
- &sec->value, &sec->length);
+ error = security_old_inode_init_security(inode, dir, qstr, &sec->name,
+ &sec->value, &sec->length);
if (error) {
if (error == -EOPNOTSUPP)
error = 0;
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 315de66..bc4f94b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -63,7 +63,7 @@
static void shrink_liability(struct ubifs_info *c, int nr_to_write)
{
down_read(&c->vfs_sb->s_umount);
- writeback_inodes_sb(c->vfs_sb);
+ writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
up_read(&c->vfs_sb->s_umount);
}
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 87cd0ea..b2ca12f 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -78,7 +78,7 @@ static int nothing_to_commit(struct ubifs_info *c)
* If the root TNC node is dirty, we definitely have something to
* commit.
*/
- if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags))
+ if (c->zroot.znode && ubifs_zn_dirty(c->zroot.znode))
return 0;
/*
@@ -166,15 +166,10 @@ static int do_commit(struct ubifs_info *c)
err = ubifs_orphan_end_commit(c);
if (err)
goto out;
- old_ltail_lnum = c->ltail_lnum;
- err = ubifs_log_end_commit(c, new_ltail_lnum);
- if (err)
- goto out;
err = dbg_check_old_index(c, &zroot);
if (err)
goto out;
- mutex_lock(&c->mst_mutex);
c->mst_node->cmt_no = cpu_to_le64(c->cmt_no);
c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
@@ -203,8 +198,9 @@ static int do_commit(struct ubifs_info *c)
c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
else
c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
- err = ubifs_write_master(c);
- mutex_unlock(&c->mst_mutex);
+
+ old_ltail_lnum = c->ltail_lnum;
+ err = ubifs_log_end_commit(c, new_ltail_lnum);
if (err)
goto out;
@@ -418,7 +414,7 @@ int ubifs_run_commit(struct ubifs_info *c)
spin_lock(&c->cs_lock);
if (c->cmt_state == COMMIT_BROKEN) {
- err = -EINVAL;
+ err = -EROFS;
goto out;
}
@@ -444,7 +440,7 @@ int ubifs_run_commit(struct ubifs_info *c)
* re-check it.
*/
if (c->cmt_state == COMMIT_BROKEN) {
- err = -EINVAL;
+ err = -EROFS;
goto out_cmt_unlock;
}
@@ -576,7 +572,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
struct idx_node *i;
size_t sz;
- if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX))
+ if (!dbg_is_chk_index(c))
return 0;
INIT_LIST_HEAD(&list);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 0bb2bce..b09ba2d 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -27,13 +27,12 @@
* various local functions of those subsystems.
*/
-#define UBIFS_DBG_PRESERVE_UBI
-
-#include "ubifs.h"
#include <linux/module.h>
-#include <linux/moduleparam.h>
#include <linux/debugfs.h>
#include <linux/math64.h>
+#include <linux/uaccess.h>
+#include <linux/random.h>
+#include "ubifs.h"
#ifdef CONFIG_UBIFS_FS_DEBUG
@@ -42,15 +41,6 @@ DEFINE_SPINLOCK(dbg_lock);
static char dbg_key_buf0[128];
static char dbg_key_buf1[128];
-unsigned int ubifs_chk_flags;
-unsigned int ubifs_tst_flags;
-
-module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
-module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
-
-MODULE_PARM_DESC(debug_chks, "Debug check flags");
-MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
-
static const char *get_key_fmt(int fmt)
{
switch (fmt) {
@@ -91,6 +81,28 @@ static const char *get_key_type(int type)
}
}
+static const char *get_dent_type(int type)
+{
+ switch (type) {
+ case UBIFS_ITYPE_REG:
+ return "file";
+ case UBIFS_ITYPE_DIR:
+ return "dir";
+ case UBIFS_ITYPE_LNK:
+ return "symlink";
+ case UBIFS_ITYPE_BLK:
+ return "blkdev";
+ case UBIFS_ITYPE_CHR:
+ return "char dev";
+ case UBIFS_ITYPE_FIFO:
+ return "fifo";
+ case UBIFS_ITYPE_SOCK:
+ return "socket";
+ default:
+ return "unknown/invalid type";
+ }
+}
+
static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
char *buffer)
{
@@ -234,9 +246,13 @@ static void dump_ch(const struct ubifs_ch *ch)
printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len));
}
-void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode)
+void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
{
const struct ubifs_inode *ui = ubifs_inode(inode);
+ struct qstr nm = { .name = NULL };
+ union ubifs_key key;
+ struct ubifs_dent_node *dent, *pdent = NULL;
+ int count = 2;
printk(KERN_DEBUG "Dump in-memory inode:");
printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino);
@@ -270,6 +286,32 @@ void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode)
printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read);
printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row);
printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len);
+
+ if (!S_ISDIR(inode->i_mode))
+ return;
+
+ printk(KERN_DEBUG "List of directory entries:\n");
+ ubifs_assert(!mutex_is_locked(&c->tnc_mutex));
+
+ lowest_dent_key(c, &key, inode->i_ino);
+ while (1) {
+ dent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (IS_ERR(dent)) {
+ if (PTR_ERR(dent) != -ENOENT)
+ printk(KERN_DEBUG "error %ld\n", PTR_ERR(dent));
+ break;
+ }
+
+ printk(KERN_DEBUG "\t%d: %s (%s)\n",
+ count++, dent->name, get_dent_type(dent->type));
+
+ nm.name = dent->name;
+ nm.len = le16_to_cpu(dent->nlen);
+ kfree(pdent);
+ pdent = dent;
+ key_read(c, &dent->key, &key);
+ }
+ kfree(pdent);
}
void dbg_dump_node(const struct ubifs_info *c, const void *node)
@@ -278,7 +320,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
union ubifs_key key;
const struct ubifs_ch *ch = node;
- if (dbg_failure_mode)
+ if (dbg_is_tst_rcvry(c))
return;
/* If the magic is incorrect, just hexdump the first bytes */
@@ -828,13 +870,29 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
spin_unlock(&dbg_lock);
}
+void dbg_dump_sleb(const struct ubifs_info *c,
+ const struct ubifs_scan_leb *sleb, int offs)
+{
+ struct ubifs_scan_node *snod;
+
+ printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n",
+ current->pid, sleb->lnum, offs);
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ cond_resched();
+ printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
+ snod->offs, snod->len);
+ dbg_dump_node(c, snod->node);
+ }
+}
+
void dbg_dump_leb(const struct ubifs_info *c, int lnum)
{
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
void *buf;
- if (dbg_failure_mode)
+ if (dbg_is_tst_rcvry(c))
return;
printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
@@ -1080,6 +1138,7 @@ out:
/**
* dbg_check_synced_i_size - check synchronized inode size.
+ * @c: UBIFS file-system description object
* @inode: inode to check
*
* If inode is clean, synchronized inode size has to be equivalent to current
@@ -1087,12 +1146,12 @@ out:
* has to be locked). Returns %0 if synchronized inode size if correct, and
* %-EINVAL if not.
*/
-int dbg_check_synced_i_size(struct inode *inode)
+int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode)
{
int err = 0;
struct ubifs_inode *ui = ubifs_inode(inode);
- if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ if (!dbg_is_chk_gen(c))
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
@@ -1125,7 +1184,7 @@ int dbg_check_synced_i_size(struct inode *inode)
* Note, it is good idea to make sure the @dir->i_mutex is locked before
* calling this function.
*/
-int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir)
+int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
{
unsigned int nlink = 2;
union ubifs_key key;
@@ -1133,7 +1192,7 @@ int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir)
struct qstr nm = { .name = NULL };
loff_t size = UBIFS_INO_NODE_SZ;
- if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ if (!dbg_is_chk_gen(c))
return 0;
if (!S_ISDIR(dir->i_mode))
@@ -1167,12 +1226,14 @@ int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir)
"but calculated size is %llu", dir->i_ino,
(unsigned long long)i_size_read(dir),
(unsigned long long)size);
+ dbg_dump_inode(c, dir);
dump_stack();
return -EINVAL;
}
if (dir->i_nlink != nlink) {
ubifs_err("directory inode %lu has nlink %u, but calculated "
"nlink is %u", dir->i_ino, dir->i_nlink, nlink);
+ dbg_dump_inode(c, dir);
dump_stack();
return -EINVAL;
}
@@ -1489,7 +1550,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
long clean_cnt = 0, dirty_cnt = 0;
int err, last;
- if (!(ubifs_chk_flags & UBIFS_CHK_TNC))
+ if (!dbg_is_chk_index(c))
return 0;
ubifs_assert(mutex_is_locked(&c->tnc_mutex));
@@ -1736,7 +1797,7 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
int err;
long long calc = 0;
- if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ))
+ if (!dbg_is_chk_index(c))
return 0;
err = dbg_walk_index(c, NULL, add_size, &calc);
@@ -2312,7 +2373,7 @@ int dbg_check_filesystem(struct ubifs_info *c)
int err;
struct fsck_data fsckd;
- if (!(ubifs_chk_flags & UBIFS_CHK_FS))
+ if (!dbg_is_chk_fs(c))
return 0;
fsckd.inodes = RB_ROOT;
@@ -2347,7 +2408,7 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
struct list_head *cur;
struct ubifs_scan_node *sa, *sb;
- if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ if (!dbg_is_chk_gen(c))
return 0;
for (cur = head->next; cur->next != head; cur = cur->next) {
@@ -2414,7 +2475,7 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
struct list_head *cur;
struct ubifs_scan_node *sa, *sb;
- if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ if (!dbg_is_chk_gen(c))
return 0;
for (cur = head->next; cur->next != head; cur = cur->next) {
@@ -2491,214 +2552,141 @@ error_dump:
return 0;
}
-int dbg_force_in_the_gaps(void)
+static inline int chance(unsigned int n, unsigned int out_of)
{
- if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
- return 0;
+ return !!((random32() % out_of) + 1 <= n);
- return !(random32() & 7);
}
-/* Failure mode for recovery testing */
-
-#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d))
-
-struct failure_mode_info {
- struct list_head list;
- struct ubifs_info *c;
-};
-
-static LIST_HEAD(fmi_list);
-static DEFINE_SPINLOCK(fmi_lock);
-
-static unsigned int next;
-
-static int simple_rand(void)
+static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
{
- if (next == 0)
- next = current->pid;
- next = next * 1103515245 + 12345;
- return (next >> 16) & 32767;
-}
-
-static void failure_mode_init(struct ubifs_info *c)
-{
- struct failure_mode_info *fmi;
-
- fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
- if (!fmi) {
- ubifs_err("Failed to register failure mode - no memory");
- return;
- }
- fmi->c = c;
- spin_lock(&fmi_lock);
- list_add_tail(&fmi->list, &fmi_list);
- spin_unlock(&fmi_lock);
-}
-
-static void failure_mode_exit(struct ubifs_info *c)
-{
- struct failure_mode_info *fmi, *tmp;
-
- spin_lock(&fmi_lock);
- list_for_each_entry_safe(fmi, tmp, &fmi_list, list)
- if (fmi->c == c) {
- list_del(&fmi->list);
- kfree(fmi);
- }
- spin_unlock(&fmi_lock);
-}
-
-static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc)
-{
- struct failure_mode_info *fmi;
-
- spin_lock(&fmi_lock);
- list_for_each_entry(fmi, &fmi_list, list)
- if (fmi->c->ubi == desc) {
- struct ubifs_info *c = fmi->c;
-
- spin_unlock(&fmi_lock);
- return c;
- }
- spin_unlock(&fmi_lock);
- return NULL;
-}
-
-static int in_failure_mode(struct ubi_volume_desc *desc)
-{
- struct ubifs_info *c = dbg_find_info(desc);
-
- if (c && dbg_failure_mode)
- return c->dbg->failure_mode;
- return 0;
-}
+ struct ubifs_debug_info *d = c->dbg;
-static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
-{
- struct ubifs_info *c = dbg_find_info(desc);
- struct ubifs_debug_info *d;
+ ubifs_assert(dbg_is_tst_rcvry(c));
- if (!c || !dbg_failure_mode)
- return 0;
- d = c->dbg;
- if (d->failure_mode)
- return 1;
- if (!d->fail_cnt) {
- /* First call - decide delay to failure */
+ if (!d->pc_cnt) {
+ /* First call - decide delay to the power cut */
if (chance(1, 2)) {
- unsigned int delay = 1 << (simple_rand() >> 11);
+ unsigned long delay;
if (chance(1, 2)) {
- d->fail_delay = 1;
- d->fail_timeout = jiffies +
- msecs_to_jiffies(delay);
- dbg_rcvry("failing after %ums", delay);
+ d->pc_delay = 1;
+ /* Fail withing 1 minute */
+ delay = random32() % 60000;
+ d->pc_timeout = jiffies;
+ d->pc_timeout += msecs_to_jiffies(delay);
+ ubifs_warn("failing after %lums", delay);
} else {
- d->fail_delay = 2;
- d->fail_cnt_max = delay;
- dbg_rcvry("failing after %u calls", delay);
+ d->pc_delay = 2;
+ delay = random32() % 10000;
+ /* Fail within 10000 operations */
+ d->pc_cnt_max = delay;
+ ubifs_warn("failing after %lu calls", delay);
}
}
- d->fail_cnt += 1;
+
+ d->pc_cnt += 1;
}
+
/* Determine if failure delay has expired */
- if (d->fail_delay == 1) {
- if (time_before(jiffies, d->fail_timeout))
+ if (d->pc_delay == 1 && time_before(jiffies, d->pc_timeout))
return 0;
- } else if (d->fail_delay == 2)
- if (d->fail_cnt++ < d->fail_cnt_max)
+ if (d->pc_delay == 2 && d->pc_cnt++ < d->pc_cnt_max)
return 0;
+
if (lnum == UBIFS_SB_LNUM) {
- if (write) {
- if (chance(1, 2))
- return 0;
- } else if (chance(19, 20))
+ if (write && chance(1, 2))
return 0;
- dbg_rcvry("failing in super block LEB %d", lnum);
+ if (chance(19, 20))
+ return 0;
+ ubifs_warn("failing in super block LEB %d", lnum);
} else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) {
if (chance(19, 20))
return 0;
- dbg_rcvry("failing in master LEB %d", lnum);
+ ubifs_warn("failing in master LEB %d", lnum);
} else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) {
- if (write) {
- if (chance(99, 100))
- return 0;
- } else if (chance(399, 400))
+ if (write && chance(99, 100))
+ return 0;
+ if (chance(399, 400))
return 0;
- dbg_rcvry("failing in log LEB %d", lnum);
+ ubifs_warn("failing in log LEB %d", lnum);
} else if (lnum >= c->lpt_first && lnum <= c->lpt_last) {
- if (write) {
- if (chance(7, 8))
- return 0;
- } else if (chance(19, 20))
+ if (write && chance(7, 8))
return 0;
- dbg_rcvry("failing in LPT LEB %d", lnum);
+ if (chance(19, 20))
+ return 0;
+ ubifs_warn("failing in LPT LEB %d", lnum);
} else if (lnum >= c->orph_first && lnum <= c->orph_last) {
- if (write) {
- if (chance(1, 2))
- return 0;
- } else if (chance(9, 10))
+ if (write && chance(1, 2))
return 0;
- dbg_rcvry("failing in orphan LEB %d", lnum);
+ if (chance(9, 10))
+ return 0;
+ ubifs_warn("failing in orphan LEB %d", lnum);
} else if (lnum == c->ihead_lnum) {
if (chance(99, 100))
return 0;
- dbg_rcvry("failing in index head LEB %d", lnum);
+ ubifs_warn("failing in index head LEB %d", lnum);
} else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) {
if (chance(9, 10))
return 0;
- dbg_rcvry("failing in GC head LEB %d", lnum);
+ ubifs_warn("failing in GC head LEB %d", lnum);
} else if (write && !RB_EMPTY_ROOT(&c->buds) &&
!ubifs_search_bud(c, lnum)) {
if (chance(19, 20))
return 0;
- dbg_rcvry("failing in non-bud LEB %d", lnum);
+ ubifs_warn("failing in non-bud LEB %d", lnum);
} else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND ||
c->cmt_state == COMMIT_RUNNING_REQUIRED) {
if (chance(999, 1000))
return 0;
- dbg_rcvry("failing in bud LEB %d commit running", lnum);
+ ubifs_warn("failing in bud LEB %d commit running", lnum);
} else {
if (chance(9999, 10000))
return 0;
- dbg_rcvry("failing in bud LEB %d commit not running", lnum);
+ ubifs_warn("failing in bud LEB %d commit not running", lnum);
}
- ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
- d->failure_mode = 1;
+
+ d->pc_happened = 1;
+ ubifs_warn("========== Power cut emulated ==========");
dump_stack();
return 1;
}
-static void cut_data(const void *buf, int len)
+static void cut_data(const void *buf, unsigned int len)
{
- int flen, i;
+ unsigned int from, to, i, ffs = chance(1, 2);
unsigned char *p = (void *)buf;
- flen = (len * (long long)simple_rand()) >> 15;
- for (i = flen; i < len; i++)
- p[i] = 0xff;
-}
+ from = random32() % (len + 1);
+ if (chance(1, 2))
+ to = random32() % (len - from + 1);
+ else
+ to = len;
-int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
- int len, int check)
-{
- if (in_failure_mode(desc))
- return -EROFS;
- return ubi_leb_read(desc, lnum, buf, offset, len, check);
+ if (from < to)
+ ubifs_warn("filled bytes %u-%u with %s", from, to - 1,
+ ffs ? "0xFFs" : "random data");
+
+ if (ffs)
+ for (i = from; i < to; i++)
+ p[i] = 0xFF;
+ else
+ for (i = from; i < to; i++)
+ p[i] = random32() % 0x100;
}
-int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
- int offset, int len, int dtype)
+int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
+ int offs, int len, int dtype)
{
int err, failing;
- if (in_failure_mode(desc))
+ if (c->dbg->pc_happened)
return -EROFS;
- failing = do_fail(desc, lnum, 1);
+
+ failing = power_cut_emulated(c, lnum, 1);
if (failing)
cut_data(buf, len);
- err = ubi_leb_write(desc, lnum, buf, offset, len, dtype);
+ err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
if (err)
return err;
if (failing)
@@ -2706,162 +2694,207 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
return 0;
}
-int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
+int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf,
int len, int dtype)
{
int err;
- if (do_fail(desc, lnum, 1))
+ if (c->dbg->pc_happened)
return -EROFS;
- err = ubi_leb_change(desc, lnum, buf, len, dtype);
+ if (power_cut_emulated(c, lnum, 1))
+ return -EROFS;
+ err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
if (err)
return err;
- if (do_fail(desc, lnum, 1))
+ if (power_cut_emulated(c, lnum, 1))
return -EROFS;
return 0;
}
-int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
+int dbg_leb_unmap(struct ubifs_info *c, int lnum)
{
int err;
- if (do_fail(desc, lnum, 0))
+ if (c->dbg->pc_happened)
+ return -EROFS;
+ if (power_cut_emulated(c, lnum, 0))
return -EROFS;
- err = ubi_leb_erase(desc, lnum);
+ err = ubi_leb_unmap(c->ubi, lnum);
if (err)
return err;
- if (do_fail(desc, lnum, 0))
+ if (power_cut_emulated(c, lnum, 0))
return -EROFS;
return 0;
}
-int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
+int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype)
{
int err;
- if (do_fail(desc, lnum, 0))
+ if (c->dbg->pc_happened)
return -EROFS;
- err = ubi_leb_unmap(desc, lnum);
+ if (power_cut_emulated(c, lnum, 0))
+ return -EROFS;
+ err = ubi_leb_map(c->ubi, lnum, dtype);
if (err)
return err;
- if (do_fail(desc, lnum, 0))
+ if (power_cut_emulated(c, lnum, 0))
return -EROFS;
return 0;
}
-int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
-{
- if (in_failure_mode(desc))
- return -EROFS;
- return ubi_is_mapped(desc, lnum);
-}
+/*
+ * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
+ * contain the stuff specific to particular file-system mounts.
+ */
+static struct dentry *dfs_rootdir;
-int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
+static int dfs_file_open(struct inode *inode, struct file *file)
{
- int err;
-
- if (do_fail(desc, lnum, 0))
- return -EROFS;
- err = ubi_leb_map(desc, lnum, dtype);
- if (err)
- return err;
- if (do_fail(desc, lnum, 0))
- return -EROFS;
- return 0;
+ file->private_data = inode->i_private;
+ return nonseekable_open(inode, file);
}
/**
- * ubifs_debugging_init - initialize UBIFS debugging.
- * @c: UBIFS file-system description object
+ * provide_user_output - provide output to the user reading a debugfs file.
+ * @val: boolean value for the answer
+ * @u: the buffer to store the answer at
+ * @count: size of the buffer
+ * @ppos: position in the @u output buffer
*
- * This function initializes debugging-related data for the file system.
- * Returns zero in case of success and a negative error code in case of
+ * This is a simple helper function which stores @val boolean value in the user
+ * buffer when the user reads one of UBIFS debugfs files. Returns amount of
+ * bytes written to @u in case of success and a negative error code in case of
* failure.
*/
-int ubifs_debugging_init(struct ubifs_info *c)
+static int provide_user_output(int val, char __user *u, size_t count,
+ loff_t *ppos)
{
- c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
- if (!c->dbg)
- return -ENOMEM;
+ char buf[3];
- failure_mode_init(c);
- return 0;
+ if (val)
+ buf[0] = '1';
+ else
+ buf[0] = '0';
+ buf[1] = '\n';
+ buf[2] = 0x00;
+
+ return simple_read_from_buffer(u, count, ppos, buf, 2);
}
-/**
- * ubifs_debugging_exit - free debugging data.
- * @c: UBIFS file-system description object
- */
-void ubifs_debugging_exit(struct ubifs_info *c)
+static ssize_t dfs_file_read(struct file *file, char __user *u, size_t count,
+ loff_t *ppos)
{
- failure_mode_exit(c);
- kfree(c->dbg);
-}
+ struct dentry *dent = file->f_path.dentry;
+ struct ubifs_info *c = file->private_data;
+ struct ubifs_debug_info *d = c->dbg;
+ int val;
+
+ if (dent == d->dfs_chk_gen)
+ val = d->chk_gen;
+ else if (dent == d->dfs_chk_index)
+ val = d->chk_index;
+ else if (dent == d->dfs_chk_orph)
+ val = d->chk_orph;
+ else if (dent == d->dfs_chk_lprops)
+ val = d->chk_lprops;
+ else if (dent == d->dfs_chk_fs)
+ val = d->chk_fs;
+ else if (dent == d->dfs_tst_rcvry)
+ val = d->tst_rcvry;
+ else
+ return -EINVAL;
-/*
- * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
- * contain the stuff specific to particular file-system mounts.
- */
-static struct dentry *dfs_rootdir;
+ return provide_user_output(val, u, count, ppos);
+}
/**
- * dbg_debugfs_init - initialize debugfs file-system.
+ * interpret_user_input - interpret user debugfs file input.
+ * @u: user-provided buffer with the input
+ * @count: buffer size
*
- * UBIFS uses debugfs file-system to expose various debugging knobs to
- * user-space. This function creates "ubifs" directory in the debugfs
- * file-system. Returns zero in case of success and a negative error code in
- * case of failure.
+ * This is a helper function which interpret user input to a boolean UBIFS
+ * debugfs file. Returns %0 or %1 in case of success and a negative error code
+ * in case of failure.
*/
-int dbg_debugfs_init(void)
+static int interpret_user_input(const char __user *u, size_t count)
{
- dfs_rootdir = debugfs_create_dir("ubifs", NULL);
- if (IS_ERR(dfs_rootdir)) {
- int err = PTR_ERR(dfs_rootdir);
- ubifs_err("cannot create \"ubifs\" debugfs directory, "
- "error %d\n", err);
- return err;
- }
+ size_t buf_size;
+ char buf[8];
- return 0;
-}
+ buf_size = min_t(size_t, count, (sizeof(buf) - 1));
+ if (copy_from_user(buf, u, buf_size))
+ return -EFAULT;
-/**
- * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
- */
-void dbg_debugfs_exit(void)
-{
- debugfs_remove(dfs_rootdir);
-}
+ if (buf[0] == '1')
+ return 1;
+ else if (buf[0] == '0')
+ return 0;
-static int open_debugfs_file(struct inode *inode, struct file *file)
-{
- file->private_data = inode->i_private;
- return nonseekable_open(inode, file);
+ return -EINVAL;
}
-static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t dfs_file_write(struct file *file, const char __user *u,
+ size_t count, loff_t *ppos)
{
struct ubifs_info *c = file->private_data;
struct ubifs_debug_info *d = c->dbg;
+ struct dentry *dent = file->f_path.dentry;
+ int val;
- if (file->f_path.dentry == d->dfs_dump_lprops)
+ /*
+ * TODO: this is racy - the file-system might have already been
+ * unmounted and we'd oops in this case. The plan is to fix it with
+ * help of 'iterate_supers_type()' which we should have in v3.0: when
+ * a debugfs opened, we rember FS's UUID in file->private_data. Then
+ * whenever we access the FS via a debugfs file, we iterate all UBIFS
+ * superblocks and fine the one with the same UUID, and take the
+ * locking right.
+ *
+ * The other way to go suggested by Al Viro is to create a separate
+ * 'ubifs-debug' file-system instead.
+ */
+ if (file->f_path.dentry == d->dfs_dump_lprops) {
dbg_dump_lprops(c);
- else if (file->f_path.dentry == d->dfs_dump_budg)
+ return count;
+ }
+ if (file->f_path.dentry == d->dfs_dump_budg) {
dbg_dump_budg(c, &c->bi);
- else if (file->f_path.dentry == d->dfs_dump_tnc) {
+ return count;
+ }
+ if (file->f_path.dentry == d->dfs_dump_tnc) {
mutex_lock(&c->tnc_mutex);
dbg_dump_tnc(c);
mutex_unlock(&c->tnc_mutex);
- } else
+ return count;
+ }
+
+ val = interpret_user_input(u, count);
+ if (val < 0)
+ return val;
+
+ if (dent == d->dfs_chk_gen)
+ d->chk_gen = val;
+ else if (dent == d->dfs_chk_index)
+ d->chk_index = val;
+ else if (dent == d->dfs_chk_orph)
+ d->chk_orph = val;
+ else if (dent == d->dfs_chk_lprops)
+ d->chk_lprops = val;
+ else if (dent == d->dfs_chk_fs)
+ d->chk_fs = val;
+ else if (dent == d->dfs_tst_rcvry)
+ d->tst_rcvry = val;
+ else
return -EINVAL;
return count;
}
static const struct file_operations dfs_fops = {
- .open = open_debugfs_file,
- .write = write_debugfs_file,
+ .open = dfs_file_open,
+ .read = dfs_file_read,
+ .write = dfs_file_write,
.owner = THIS_MODULE,
.llseek = no_llseek,
};
@@ -2880,12 +2913,20 @@ static const struct file_operations dfs_fops = {
*/
int dbg_debugfs_init_fs(struct ubifs_info *c)
{
- int err;
+ int err, n;
const char *fname;
struct dentry *dent;
struct ubifs_debug_info *d = c->dbg;
- sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
+ n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME,
+ c->vi.ubi_num, c->vi.vol_id);
+ if (n == UBIFS_DFS_DIR_LEN) {
+ /* The array size is too small */
+ fname = UBIFS_DFS_DIR_NAME;
+ dent = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
fname = d->dfs_dir_name;
dent = debugfs_create_dir(fname, dfs_rootdir);
if (IS_ERR_OR_NULL(dent))
@@ -2910,13 +2951,55 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
goto out_remove;
d->dfs_dump_tnc = dent;
+ fname = "chk_general";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+ &dfs_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ d->dfs_chk_gen = dent;
+
+ fname = "chk_index";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+ &dfs_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ d->dfs_chk_index = dent;
+
+ fname = "chk_orphans";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+ &dfs_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ d->dfs_chk_orph = dent;
+
+ fname = "chk_lprops";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+ &dfs_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ d->dfs_chk_lprops = dent;
+
+ fname = "chk_fs";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+ &dfs_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ d->dfs_chk_fs = dent;
+
+ fname = "tst_recovery";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+ &dfs_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ d->dfs_tst_rcvry = dent;
+
return 0;
out_remove:
debugfs_remove_recursive(d->dfs_dir);
out:
err = dent ? PTR_ERR(dent) : -ENODEV;
- ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
+ ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n",
fname, err);
return err;
}
@@ -2930,4 +3013,179 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c)
debugfs_remove_recursive(c->dbg->dfs_dir);
}
+struct ubifs_global_debug_info ubifs_dbg;
+
+static struct dentry *dfs_chk_gen;
+static struct dentry *dfs_chk_index;
+static struct dentry *dfs_chk_orph;
+static struct dentry *dfs_chk_lprops;
+static struct dentry *dfs_chk_fs;
+static struct dentry *dfs_tst_rcvry;
+
+static ssize_t dfs_global_file_read(struct file *file, char __user *u,
+ size_t count, loff_t *ppos)
+{
+ struct dentry *dent = file->f_path.dentry;
+ int val;
+
+ if (dent == dfs_chk_gen)
+ val = ubifs_dbg.chk_gen;
+ else if (dent == dfs_chk_index)
+ val = ubifs_dbg.chk_index;
+ else if (dent == dfs_chk_orph)
+ val = ubifs_dbg.chk_orph;
+ else if (dent == dfs_chk_lprops)
+ val = ubifs_dbg.chk_lprops;
+ else if (dent == dfs_chk_fs)
+ val = ubifs_dbg.chk_fs;
+ else if (dent == dfs_tst_rcvry)
+ val = ubifs_dbg.tst_rcvry;
+ else
+ return -EINVAL;
+
+ return provide_user_output(val, u, count, ppos);
+}
+
+static ssize_t dfs_global_file_write(struct file *file, const char __user *u,
+ size_t count, loff_t *ppos)
+{
+ struct dentry *dent = file->f_path.dentry;
+ int val;
+
+ val = interpret_user_input(u, count);
+ if (val < 0)
+ return val;
+
+ if (dent == dfs_chk_gen)
+ ubifs_dbg.chk_gen = val;
+ else if (dent == dfs_chk_index)
+ ubifs_dbg.chk_index = val;
+ else if (dent == dfs_chk_orph)
+ ubifs_dbg.chk_orph = val;
+ else if (dent == dfs_chk_lprops)
+ ubifs_dbg.chk_lprops = val;
+ else if (dent == dfs_chk_fs)
+ ubifs_dbg.chk_fs = val;
+ else if (dent == dfs_tst_rcvry)
+ ubifs_dbg.tst_rcvry = val;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static const struct file_operations dfs_global_fops = {
+ .read = dfs_global_file_read,
+ .write = dfs_global_file_write,
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+};
+
+/**
+ * dbg_debugfs_init - initialize debugfs file-system.
+ *
+ * UBIFS uses debugfs file-system to expose various debugging knobs to
+ * user-space. This function creates "ubifs" directory in the debugfs
+ * file-system. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int dbg_debugfs_init(void)
+{
+ int err;
+ const char *fname;
+ struct dentry *dent;
+
+ fname = "ubifs";
+ dent = debugfs_create_dir(fname, NULL);
+ if (IS_ERR_OR_NULL(dent))
+ goto out;
+ dfs_rootdir = dent;
+
+ fname = "chk_general";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+ &dfs_global_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ dfs_chk_gen = dent;
+
+ fname = "chk_index";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+ &dfs_global_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ dfs_chk_index = dent;
+
+ fname = "chk_orphans";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+ &dfs_global_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ dfs_chk_orph = dent;
+
+ fname = "chk_lprops";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+ &dfs_global_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ dfs_chk_lprops = dent;
+
+ fname = "chk_fs";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+ &dfs_global_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ dfs_chk_fs = dent;
+
+ fname = "tst_recovery";
+ dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+ &dfs_global_fops);
+ if (IS_ERR_OR_NULL(dent))
+ goto out_remove;
+ dfs_tst_rcvry = dent;
+
+ return 0;
+
+out_remove:
+ debugfs_remove_recursive(dfs_rootdir);
+out:
+ err = dent ? PTR_ERR(dent) : -ENODEV;
+ ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n",
+ fname, err);
+ return err;
+}
+
+/**
+ * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
+ */
+void dbg_debugfs_exit(void)
+{
+ debugfs_remove_recursive(dfs_rootdir);
+}
+
+/**
+ * ubifs_debugging_init - initialize UBIFS debugging.
+ * @c: UBIFS file-system description object
+ *
+ * This function initializes debugging-related data for the file system.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_debugging_init(struct ubifs_info *c)
+{
+ c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
+ if (!c->dbg)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * ubifs_debugging_exit - free debugging data.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_debugging_exit(struct ubifs_info *c)
+{
+ kfree(c->dbg);
+}
+
#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index fd75b63..c9d2941 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -31,18 +31,25 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
#ifdef CONFIG_UBIFS_FS_DEBUG
-#include <linux/random.h>
+/*
+ * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi"
+ * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte.
+ */
+#define UBIFS_DFS_DIR_NAME "ubi%d_%d"
+#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1)
/**
* ubifs_debug_info - per-FS debugging information.
* @old_zroot: old index root - used by 'dbg_check_old_index()'
* @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
* @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
- * @failure_mode: failure mode for recovery testing
- * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
- * @fail_timeout: time in jiffies when delay of failure mode expires
- * @fail_cnt: current number of calls to failure mode I/O functions
- * @fail_cnt_max: number of calls by which to delay failure mode
+ *
+ * @pc_happened: non-zero if an emulated power cut happened
+ * @pc_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
+ * @pc_timeout: time in jiffies when delay of failure mode expires
+ * @pc_cnt: current number of calls to failure mode I/O functions
+ * @pc_cnt_max: number of calls by which to delay failure mode
+ *
* @chk_lpt_sz: used by LPT tree size checker
* @chk_lpt_sz2: used by LPT tree size checker
* @chk_lpt_wastage: used by LPT tree size checker
@@ -56,21 +63,36 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
* @saved_free: saved amount of free space
* @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt
*
+ * @chk_gen: if general extra checks are enabled
+ * @chk_index: if index xtra checks are enabled
+ * @chk_orph: if orphans extra checks are enabled
+ * @chk_lprops: if lprops extra checks are enabled
+ * @chk_fs: if UBIFS contents extra checks are enabled
+ * @tst_rcvry: if UBIFS recovery testing mode enabled
+ *
* @dfs_dir_name: name of debugfs directory containing this file-system's files
* @dfs_dir: direntry object of the file-system debugfs directory
* @dfs_dump_lprops: "dump lprops" debugfs knob
* @dfs_dump_budg: "dump budgeting information" debugfs knob
* @dfs_dump_tnc: "dump TNC" debugfs knob
+ * @dfs_chk_gen: debugfs knob to enable UBIFS general extra checks
+ * @dfs_chk_index: debugfs knob to enable UBIFS index extra checks
+ * @dfs_chk_orph: debugfs knob to enable UBIFS orphans extra checks
+ * @dfs_chk_lprops: debugfs knob to enable UBIFS LEP properties extra checks
+ * @dfs_chk_fs: debugfs knob to enable UBIFS contents extra checks
+ * @dfs_tst_rcvry: debugfs knob to enable UBIFS recovery testing
*/
struct ubifs_debug_info {
struct ubifs_zbranch old_zroot;
int old_zroot_level;
unsigned long long old_zroot_sqnum;
- int failure_mode;
- int fail_delay;
- unsigned long fail_timeout;
- unsigned int fail_cnt;
- unsigned int fail_cnt_max;
+
+ int pc_happened;
+ int pc_delay;
+ unsigned long pc_timeout;
+ unsigned int pc_cnt;
+ unsigned int pc_cnt_max;
+
long long chk_lpt_sz;
long long chk_lpt_sz2;
long long chk_lpt_wastage;
@@ -84,11 +106,43 @@ struct ubifs_debug_info {
long long saved_free;
int saved_idx_gc_cnt;
- char dfs_dir_name[100];
+ unsigned int chk_gen:1;
+ unsigned int chk_index:1;
+ unsigned int chk_orph:1;
+ unsigned int chk_lprops:1;
+ unsigned int chk_fs:1;
+ unsigned int tst_rcvry:1;
+
+ char dfs_dir_name[UBIFS_DFS_DIR_LEN + 1];
struct dentry *dfs_dir;
struct dentry *dfs_dump_lprops;
struct dentry *dfs_dump_budg;
struct dentry *dfs_dump_tnc;
+ struct dentry *dfs_chk_gen;
+ struct dentry *dfs_chk_index;
+ struct dentry *dfs_chk_orph;
+ struct dentry *dfs_chk_lprops;
+ struct dentry *dfs_chk_fs;
+ struct dentry *dfs_tst_rcvry;
+};
+
+/**
+ * ubifs_global_debug_info - global (not per-FS) UBIFS debugging information.
+ *
+ * @chk_gen: if general extra checks are enabled
+ * @chk_index: if index xtra checks are enabled
+ * @chk_orph: if orphans extra checks are enabled
+ * @chk_lprops: if lprops extra checks are enabled
+ * @chk_fs: if UBIFS contents extra checks are enabled
+ * @tst_rcvry: if UBIFS recovery testing mode enabled
+ */
+struct ubifs_global_debug_info {
+ unsigned int chk_gen:1;
+ unsigned int chk_index:1;
+ unsigned int chk_orph:1;
+ unsigned int chk_lprops:1;
+ unsigned int chk_fs:1;
+ unsigned int tst_rcvry:1;
};
#define ubifs_assert(expr) do { \
@@ -128,6 +182,8 @@ const char *dbg_key_str1(const struct ubifs_info *c,
#define DBGKEY(key) dbg_key_str0(c, (key))
#define DBGKEY1(key) dbg_key_str1(c, (key))
+extern spinlock_t dbg_lock;
+
#define ubifs_dbg_msg(type, fmt, ...) \
pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
@@ -163,41 +219,36 @@ const char *dbg_key_str1(const struct ubifs_info *c,
/* Additional recovery messages */
#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
-/*
- * Debugging check flags.
- *
- * UBIFS_CHK_GEN: general checks
- * UBIFS_CHK_TNC: check TNC
- * UBIFS_CHK_IDX_SZ: check index size
- * UBIFS_CHK_ORPH: check orphans
- * UBIFS_CHK_OLD_IDX: check the old index
- * UBIFS_CHK_LPROPS: check lprops
- * UBIFS_CHK_FS: check the file-system
- */
-enum {
- UBIFS_CHK_GEN = 0x1,
- UBIFS_CHK_TNC = 0x2,
- UBIFS_CHK_IDX_SZ = 0x4,
- UBIFS_CHK_ORPH = 0x8,
- UBIFS_CHK_OLD_IDX = 0x10,
- UBIFS_CHK_LPROPS = 0x20,
- UBIFS_CHK_FS = 0x40,
-};
+extern struct ubifs_global_debug_info ubifs_dbg;
-/*
- * Special testing flags.
- *
- * UBIFS_TST_RCVRY: failure mode for recovery testing
- */
-enum {
- UBIFS_TST_RCVRY = 0x4,
-};
-
-extern spinlock_t dbg_lock;
-
-extern unsigned int ubifs_msg_flags;
-extern unsigned int ubifs_chk_flags;
-extern unsigned int ubifs_tst_flags;
+static inline int dbg_is_chk_gen(const struct ubifs_info *c)
+{
+ return !!(ubifs_dbg.chk_gen || c->dbg->chk_gen);
+}
+static inline int dbg_is_chk_index(const struct ubifs_info *c)
+{
+ return !!(ubifs_dbg.chk_index || c->dbg->chk_index);
+}
+static inline int dbg_is_chk_orph(const struct ubifs_info *c)
+{
+ return !!(ubifs_dbg.chk_orph || c->dbg->chk_orph);
+}
+static inline int dbg_is_chk_lprops(const struct ubifs_info *c)
+{
+ return !!(ubifs_dbg.chk_lprops || c->dbg->chk_lprops);
+}
+static inline int dbg_is_chk_fs(const struct ubifs_info *c)
+{
+ return !!(ubifs_dbg.chk_fs || c->dbg->chk_fs);
+}
+static inline int dbg_is_tst_rcvry(const struct ubifs_info *c)
+{
+ return !!(ubifs_dbg.tst_rcvry || c->dbg->tst_rcvry);
+}
+static inline int dbg_is_power_cut(const struct ubifs_info *c)
+{
+ return !!c->dbg->pc_happened;
+}
int ubifs_debugging_init(struct ubifs_info *c);
void ubifs_debugging_exit(struct ubifs_info *c);
@@ -208,7 +259,7 @@ const char *dbg_cstate(int cmt_state);
const char *dbg_jhead(int jhead);
const char *dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key);
-void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
+void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode);
void dbg_dump_node(const struct ubifs_info *c, const void *node);
void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
int offs);
@@ -219,6 +270,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
void dbg_dump_lprops(struct ubifs_info *c);
void dbg_dump_lpt_info(struct ubifs_info *c);
void dbg_dump_leb(const struct ubifs_info *c, int lnum);
+void dbg_dump_sleb(const struct ubifs_info *c,
+ const struct ubifs_scan_leb *sleb, int offs);
void dbg_dump_znode(const struct ubifs_info *c,
const struct ubifs_znode *znode);
void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
@@ -241,8 +294,8 @@ int dbg_check_cats(struct ubifs_info *c);
int dbg_check_ltab(struct ubifs_info *c);
int dbg_chk_lpt_free_spc(struct ubifs_info *c);
int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len);
-int dbg_check_synced_i_size(struct inode *inode);
-int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
+int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode);
+int dbg_check_dir(struct ubifs_info *c, const struct inode *dir);
int dbg_check_tnc(struct ubifs_info *c, int extra);
int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
int dbg_check_filesystem(struct ubifs_info *c);
@@ -255,54 +308,12 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
-/* Force the use of in-the-gaps method for testing */
-static inline int dbg_force_in_the_gaps_enabled(void)
-{
- return ubifs_chk_flags & UBIFS_CHK_GEN;
-}
-int dbg_force_in_the_gaps(void);
-
-/* Failure mode for recovery testing */
-#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
-
-#ifndef UBIFS_DBG_PRESERVE_UBI
-#define ubi_leb_read dbg_leb_read
-#define ubi_leb_write dbg_leb_write
-#define ubi_leb_change dbg_leb_change
-#define ubi_leb_erase dbg_leb_erase
-#define ubi_leb_unmap dbg_leb_unmap
-#define ubi_is_mapped dbg_is_mapped
-#define ubi_leb_map dbg_leb_map
-#endif
-
-int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
- int len, int check);
-int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
- int offset, int len, int dtype);
-int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
- int len, int dtype);
-int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum);
-int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum);
-int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum);
-int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype);
-
-static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf,
- int offset, int len)
-{
- return dbg_leb_read(desc, lnum, buf, offset, len, 0);
-}
-
-static inline int dbg_write(struct ubi_volume_desc *desc, int lnum,
- const void *buf, int offset, int len)
-{
- return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN);
-}
-
-static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
- const void *buf, int len)
-{
- return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
-}
+int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
+ int len, int dtype);
+int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
+ int dtype);
+int dbg_leb_unmap(struct ubifs_info *c, int lnum);
+int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype);
/* Debugfs-related stuff */
int dbg_debugfs_init(void);
@@ -314,7 +325,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
/* Use "if (0)" to make compiler check arguments even if debugging is off */
#define ubifs_assert(expr) do { \
- if (0 && (expr)) \
+ if (0) \
printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
__func__, __LINE__, current->pid); \
} while (0)
@@ -324,9 +335,12 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
ubifs_err(fmt, ##__VA_ARGS__); \
} while (0)
-#define ubifs_dbg_msg(fmt, ...) do { \
- if (0) \
- pr_debug(fmt "\n", ##__VA_ARGS__); \
+#define DBGKEY(key) ((char *)(key))
+#define DBGKEY1(key) ((char *)(key))
+
+#define ubifs_dbg_msg(fmt, ...) do { \
+ if (0) \
+ printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \
} while (0)
#define dbg_dump_stack()
@@ -347,9 +361,6 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define DBGKEY(key) ((char *)(key))
-#define DBGKEY1(key) ((char *)(key))
-
static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; }
static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; }
static inline const char *dbg_ntype(int type) { return ""; }
@@ -358,7 +369,7 @@ static inline const char *dbg_jhead(int jhead) { return ""; }
static inline const char *
dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key) { return ""; }
-static inline void dbg_dump_inode(const struct ubifs_info *c,
+static inline void dbg_dump_inode(struct ubifs_info *c,
const struct inode *inode) { return; }
static inline void dbg_dump_node(const struct ubifs_info *c,
const void *node) { return; }
@@ -379,6 +390,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; }
static inline void dbg_dump_leb(const struct ubifs_info *c,
int lnum) { return; }
static inline void
+dbg_dump_sleb(const struct ubifs_info *c,
+ const struct ubifs_scan_leb *sleb, int offs) { return; }
+static inline void
dbg_dump_znode(const struct ubifs_info *c,
const struct ubifs_znode *znode) { return; }
static inline void dbg_dump_heap(struct ubifs_info *c,
@@ -410,9 +424,11 @@ static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; }
static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; }
static inline int dbg_chk_lpt_sz(struct ubifs_info *c,
int action, int len) { return 0; }
-static inline int dbg_check_synced_i_size(struct inode *inode) { return 0; }
-static inline int dbg_check_dir_size(struct ubifs_info *c,
- const struct inode *dir) { return 0; }
+static inline int
+dbg_check_synced_i_size(const struct ubifs_info *c,
+ struct inode *inode) { return 0; }
+static inline int dbg_check_dir(struct ubifs_info *c,
+ const struct inode *dir) { return 0; }
static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; }
static inline int dbg_check_idx_size(struct ubifs_info *c,
long long idx_size) { return 0; }
@@ -432,9 +448,23 @@ static inline int
dbg_check_nondata_nodes_order(struct ubifs_info *c,
struct list_head *head) { return 0; }
-static inline int dbg_force_in_the_gaps(void) { return 0; }
-#define dbg_force_in_the_gaps_enabled() 0
-#define dbg_failure_mode 0
+static inline int dbg_leb_write(struct ubifs_info *c, int lnum,
+ const void *buf, int offset,
+ int len, int dtype) { return 0; }
+static inline int dbg_leb_change(struct ubifs_info *c, int lnum,
+ const void *buf, int len,
+ int dtype) { return 0; }
+static inline int dbg_leb_unmap(struct ubifs_info *c, int lnum) { return 0; }
+static inline int dbg_leb_map(struct ubifs_info *c, int lnum,
+ int dtype) { return 0; }
+
+static inline int dbg_is_chk_gen(const struct ubifs_info *c) { return 0; }
+static inline int dbg_is_chk_index(const struct ubifs_info *c) { return 0; }
+static inline int dbg_is_chk_orph(const struct ubifs_info *c) { return 0; }
+static inline int dbg_is_chk_lprops(const struct ubifs_info *c) { return 0; }
+static inline int dbg_is_chk_fs(const struct ubifs_info *c) { return 0; }
+static inline int dbg_is_tst_rcvry(const struct ubifs_info *c) { return 0; }
+static inline int dbg_is_power_cut(const struct ubifs_info *c) { return 0; }
static inline int dbg_debugfs_init(void) { return 0; }
static inline void dbg_debugfs_exit(void) { return; }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 936a038..aaebf0f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -102,7 +102,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
* UBIFS has to fully control "clean <-> dirty" transitions of inodes
* to make budgeting work.
*/
- inode->i_flags |= (S_NOCMTIME);
+ inode->i_flags |= S_NOCMTIME;
inode_init_owner(inode, dir, mode);
inode->i_mtime = inode->i_atime = inode->i_ctime =
@@ -172,9 +172,11 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
#ifdef CONFIG_UBIFS_FS_DEBUG
-static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm)
+static int dbg_check_name(const struct ubifs_info *c,
+ const struct ubifs_dent_node *dent,
+ const struct qstr *nm)
{
- if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ if (!dbg_is_chk_gen(c))
return 0;
if (le16_to_cpu(dent->nlen) != nm->len)
return -EINVAL;
@@ -185,7 +187,7 @@ static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm)
#else
-#define dbg_check_name(dent, nm) 0
+#define dbg_check_name(c, dent, nm) 0
#endif
@@ -219,7 +221,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- if (dbg_check_name(dent, &dentry->d_name)) {
+ if (dbg_check_name(c, dent, &dentry->d_name)) {
err = -EINVAL;
goto out;
}
@@ -546,7 +548,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
ubifs_assert(mutex_is_locked(&dir->i_mutex));
ubifs_assert(mutex_is_locked(&inode->i_mutex));
- err = dbg_check_synced_i_size(inode);
+ err = dbg_check_synced_i_size(c, inode);
if (err)
return err;
@@ -601,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
inode->i_nlink, dir->i_ino);
ubifs_assert(mutex_is_locked(&dir->i_mutex));
ubifs_assert(mutex_is_locked(&inode->i_mutex));
- err = dbg_check_synced_i_size(inode);
+ err = dbg_check_synced_i_size(c, inode);
if (err)
return err;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5e7fccf..6589006 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1263,7 +1263,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- err = dbg_check_synced_i_size(inode);
+ err = dbg_check_synced_i_size(c, inode);
if (err)
return err;
@@ -1304,7 +1304,7 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
return NULL;
}
-int ubifs_fsync(struct file *file, int datasync)
+int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1319,14 +1319,16 @@ int ubifs_fsync(struct file *file, int datasync)
*/
return 0;
- /*
- * VFS has already synchronized dirty pages for this inode. Synchronize
- * the inode unless this is a 'datasync()' call.
- */
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ return err;
+ mutex_lock(&inode->i_mutex);
+
+ /* Synchronize the inode unless this is a 'datasync()' call. */
if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
- return err;
+ goto out;
}
/*
@@ -1334,10 +1336,9 @@ int ubifs_fsync(struct file *file, int datasync)
* them.
*/
err = ubifs_sync_wbufs_by_inode(c, inode);
- if (err)
- return err;
-
- return 0;
+out:
+ mutex_unlock(&inode->i_mutex);
+ return err;
}
/**
@@ -1521,8 +1522,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
ubifs_release_dirty_inode_budget(c, ui);
}
- unlock_page(page);
- return 0;
+ return VM_FAULT_LOCKED;
out_unlock:
unlock_page(page);
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 3be645e..9228950 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -86,8 +86,125 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
c->no_chk_data_crc = 0;
c->vfs_sb->s_flags |= MS_RDONLY;
ubifs_warn("switched to read-only mode, error %d", err);
+ dump_stack();
+ }
+}
+
+/*
+ * Below are simple wrappers over UBI I/O functions which include some
+ * additional checks and UBIFS debugging stuff. See corresponding UBI function
+ * for more information.
+ */
+
+int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
+ int len, int even_ebadmsg)
+{
+ int err;
+
+ err = ubi_read(c->ubi, lnum, buf, offs, len);
+ /*
+ * In case of %-EBADMSG print the error message only if the
+ * @even_ebadmsg is true.
+ */
+ if (err && (err != -EBADMSG || even_ebadmsg)) {
+ ubifs_err("reading %d bytes from LEB %d:%d failed, error %d",
+ len, lnum, offs, err);
+ dbg_dump_stack();
+ }
+ return err;
+}
+
+int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
+ int len, int dtype)
+{
+ int err;
+
+ ubifs_assert(!c->ro_media && !c->ro_mount);
+ if (c->ro_error)
+ return -EROFS;
+ if (!dbg_is_tst_rcvry(c))
+ err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
+ else
+ err = dbg_leb_write(c, lnum, buf, offs, len, dtype);
+ if (err) {
+ ubifs_err("writing %d bytes to LEB %d:%d failed, error %d",
+ len, lnum, offs, err);
+ ubifs_ro_mode(c, err);
+ dbg_dump_stack();
+ }
+ return err;
+}
+
+int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
+ int dtype)
+{
+ int err;
+
+ ubifs_assert(!c->ro_media && !c->ro_mount);
+ if (c->ro_error)
+ return -EROFS;
+ if (!dbg_is_tst_rcvry(c))
+ err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
+ else
+ err = dbg_leb_change(c, lnum, buf, len, dtype);
+ if (err) {
+ ubifs_err("changing %d bytes in LEB %d failed, error %d",
+ len, lnum, err);
+ ubifs_ro_mode(c, err);
+ dbg_dump_stack();
+ }
+ return err;
+}
+
+int ubifs_leb_unmap(struct ubifs_info *c, int lnum)
+{
+ int err;
+
+ ubifs_assert(!c->ro_media && !c->ro_mount);
+ if (c->ro_error)
+ return -EROFS;
+ if (!dbg_is_tst_rcvry(c))
+ err = ubi_leb_unmap(c->ubi, lnum);
+ else
+ err = dbg_leb_unmap(c, lnum);
+ if (err) {
+ ubifs_err("unmap LEB %d failed, error %d", lnum, err);
+ ubifs_ro_mode(c, err);
+ dbg_dump_stack();
+ }
+ return err;
+}
+
+int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype)
+{
+ int err;
+
+ ubifs_assert(!c->ro_media && !c->ro_mount);
+ if (c->ro_error)
+ return -EROFS;
+ if (!dbg_is_tst_rcvry(c))
+ err = ubi_leb_map(c->ubi, lnum, dtype);
+ else
+ err = dbg_leb_map(c, lnum, dtype);
+ if (err) {
+ ubifs_err("mapping LEB %d failed, error %d", lnum, err);
+ ubifs_ro_mode(c, err);
+ dbg_dump_stack();
+ }
+ return err;
+}
+
+int ubifs_is_mapped(const struct ubifs_info *c, int lnum)
+{
+ int err;
+
+ err = ubi_is_mapped(c->ubi, lnum);
+ if (err < 0) {
+ ubifs_err("ubi_is_mapped failed for LEB %d, error %d",
+ lnum, err);
dbg_dump_stack();
}
+ return err;
}
/**
@@ -406,14 +523,10 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
dirt = sync_len - wbuf->used;
if (dirt)
ubifs_pad(c, wbuf->buf + wbuf->used, dirt);
- err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
- sync_len, wbuf->dtype);
- if (err) {
- ubifs_err("cannot write %d bytes to LEB %d:%d",
- sync_len, wbuf->lnum, wbuf->offs);
- dbg_dump_stack();
+ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len,
+ wbuf->dtype);
+ if (err)
return err;
- }
spin_lock(&wbuf->lock);
wbuf->offs += sync_len;
@@ -605,9 +718,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
if (aligned_len == wbuf->avail) {
dbg_io("flush jhead %s wbuf to LEB %d:%d",
dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
- err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
- wbuf->offs, wbuf->size,
- wbuf->dtype);
+ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf,
+ wbuf->offs, wbuf->size,
+ wbuf->dtype);
if (err)
goto out;
@@ -642,8 +755,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
dbg_io("flush jhead %s wbuf to LEB %d:%d",
dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
- err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
- wbuf->size, wbuf->dtype);
+ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs,
+ wbuf->size, wbuf->dtype);
if (err)
goto out;
@@ -661,8 +774,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
*/
dbg_io("write %d bytes to LEB %d:%d",
wbuf->size, wbuf->lnum, wbuf->offs);
- err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs,
- wbuf->size, wbuf->dtype);
+ err = ubifs_leb_write(c, wbuf->lnum, buf, wbuf->offs,
+ wbuf->size, wbuf->dtype);
if (err)
goto out;
@@ -683,8 +796,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
n <<= c->max_write_shift;
dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
wbuf->offs);
- err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written,
- wbuf->offs, n, wbuf->dtype);
+ err = ubifs_leb_write(c, wbuf->lnum, buf + written,
+ wbuf->offs, n, wbuf->dtype);
if (err)
goto out;
wbuf->offs += n;
@@ -766,13 +879,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
return -EROFS;
ubifs_prepare_node(c, buf, len, 1);
- err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype);
- if (err) {
- ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
- buf_len, lnum, offs, err);
+ err = ubifs_leb_write(c, lnum, buf, offs, buf_len, dtype);
+ if (err)
dbg_dump_node(c, buf);
- dbg_dump_stack();
- }
return err;
}
@@ -824,13 +933,9 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
if (rlen > 0) {
/* Read everything that goes before write-buffer */
- err = ubi_read(c->ubi, lnum, buf, offs, rlen);
- if (err && err != -EBADMSG) {
- ubifs_err("failed to read node %d from LEB %d:%d, "
- "error %d", type, lnum, offs, err);
- dbg_dump_stack();
+ err = ubifs_leb_read(c, lnum, buf, offs, rlen, 0);
+ if (err && err != -EBADMSG)
return err;
- }
}
if (type != ch->node_type) {
@@ -885,12 +990,9 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
ubifs_assert(!(offs & 7) && offs < c->leb_size);
ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
- err = ubi_read(c->ubi, lnum, buf, offs, len);
- if (err && err != -EBADMSG) {
- ubifs_err("cannot read node %d from LEB %d:%d, error %d",
- type, lnum, offs, err);
+ err = ubifs_leb_read(c, lnum, buf, offs, len, 0);
+ if (err && err != -EBADMSG)
return err;
- }
if (type != ch->node_type) {
ubifs_err("bad node type (%d but expected %d)",
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index affea94..843beda 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -110,10 +110,14 @@ static inline long long empty_log_bytes(const struct ubifs_info *c)
h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs;
t = (long long)c->ltail_lnum * c->leb_size;
- if (h >= t)
+ if (h > t)
return c->log_bytes - h + t;
- else
+ else if (h != t)
return t - h;
+ else if (c->lhead_lnum != c->ltail_lnum)
+ return 0;
+ else
+ return c->log_bytes;
}
/**
@@ -262,7 +266,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
* an unclean reboot, because the target LEB might have been
* unmapped, but not yet physically erased.
*/
- err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM);
+ err = ubifs_leb_map(c, bud->lnum, UBI_SHORTTERM);
if (err)
goto out_unlock;
}
@@ -283,8 +287,6 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
return 0;
out_unlock:
- if (err != -EAGAIN)
- ubifs_ro_mode(c, err);
mutex_unlock(&c->log_mutex);
kfree(ref);
kfree(bud);
@@ -455,9 +457,9 @@ out:
* @ltail_lnum: new log tail LEB number
*
* This function is called on when the commit operation was finished. It
- * moves log tail to new position and unmaps LEBs which contain obsolete data.
- * Returns zero in case of success and a negative error code in case of
- * failure.
+ * moves log tail to new position and updates the master node so that it stores
+ * the new log tail LEB number. Returns zero in case of success and a negative
+ * error code in case of failure.
*/
int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
{
@@ -485,7 +487,12 @@ int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
spin_unlock(&c->buds_lock);
err = dbg_check_bud_bytes(c);
+ if (err)
+ goto out;
+ err = ubifs_write_master(c);
+
+out:
mutex_unlock(&c->log_mutex);
return err;
}
@@ -752,7 +759,7 @@ static int dbg_check_bud_bytes(struct ubifs_info *c)
struct ubifs_bud *bud;
long long bud_bytes = 0;
- if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ if (!dbg_is_chk_gen(c))
return 0;
spin_lock(&c->buds_lock);
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index ae6f74a..ea9d491 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -510,7 +510,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
pnode = (struct ubifs_pnode *)container_of(lprops - pos,
struct ubifs_pnode,
lprops[0]);
- return !test_bit(COW_ZNODE, &pnode->flags) &&
+ return !test_bit(COW_CNODE, &pnode->flags) &&
test_bit(DIRTY_CNODE, &pnode->flags);
}
@@ -866,7 +866,7 @@ int dbg_check_cats(struct ubifs_info *c)
struct list_head *pos;
int i, cat;
- if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
+ if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c))
return 0;
list_for_each_entry(lprops, &c->empty_list, list) {
@@ -964,7 +964,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
{
int i = 0, j, err = 0;
- if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
+ if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c))
return;
for (i = 0; i < heap->cnt; i++) {
@@ -1268,7 +1268,7 @@ int dbg_check_lprops(struct ubifs_info *c)
int i, err;
struct ubifs_lp_stats lst;
- if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ if (!dbg_is_chk_lprops(c))
return 0;
/*
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ef5155e..6189c74 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -701,8 +701,8 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
alen = ALIGN(len, c->min_io_size);
set_ltab(c, lnum, c->leb_size - alen, alen - len);
memset(p, 0xff, alen - len);
- err = ubi_leb_change(c->ubi, lnum++, buf, alen,
- UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum++, buf, alen,
+ UBI_SHORTTERM);
if (err)
goto out;
p = buf;
@@ -732,8 +732,8 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
set_ltab(c, lnum, c->leb_size - alen,
alen - len);
memset(p, 0xff, alen - len);
- err = ubi_leb_change(c->ubi, lnum++, buf, alen,
- UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum++, buf, alen,
+ UBI_SHORTTERM);
if (err)
goto out;
p = buf;
@@ -780,8 +780,8 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
alen = ALIGN(len, c->min_io_size);
set_ltab(c, lnum, c->leb_size - alen, alen - len);
memset(p, 0xff, alen - len);
- err = ubi_leb_change(c->ubi, lnum++, buf, alen,
- UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum++, buf, alen,
+ UBI_SHORTTERM);
if (err)
goto out;
p = buf;
@@ -806,7 +806,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
alen = ALIGN(len, c->min_io_size);
set_ltab(c, lnum, c->leb_size - alen, alen - len);
memset(p, 0xff, alen - len);
- err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum++, buf, alen, UBI_SHORTTERM);
if (err)
goto out;
p = buf;
@@ -826,7 +826,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
/* Write remaining buffer */
memset(p, 0xff, alen - len);
- err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum, buf, alen, UBI_SHORTTERM);
if (err)
goto out;
@@ -1222,7 +1222,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
if (c->big_lpt)
nnode->num = calc_nnode_num_from_parent(c, parent, iip);
} else {
- err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
+ err = ubifs_leb_read(c, lnum, buf, offs, c->nnode_sz, 1);
if (err)
goto out;
err = ubifs_unpack_nnode(c, buf, nnode);
@@ -1247,6 +1247,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
out:
ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs);
+ dbg_dump_stack();
kfree(nnode);
return err;
}
@@ -1290,7 +1291,7 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
lprops->flags = ubifs_categorize_lprops(c, lprops);
}
} else {
- err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz);
+ err = ubifs_leb_read(c, lnum, buf, offs, c->pnode_sz, 1);
if (err)
goto out;
err = unpack_pnode(c, buf, pnode);
@@ -1312,6 +1313,7 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
out:
ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
dbg_dump_pnode(c, pnode, parent, iip);
+ dbg_dump_stack();
dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
kfree(pnode);
return err;
@@ -1331,7 +1333,7 @@ static int read_ltab(struct ubifs_info *c)
buf = vmalloc(c->ltab_sz);
if (!buf)
return -ENOMEM;
- err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz);
+ err = ubifs_leb_read(c, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz, 1);
if (err)
goto out;
err = unpack_ltab(c, buf);
@@ -1354,7 +1356,8 @@ static int read_lsave(struct ubifs_info *c)
buf = vmalloc(c->lsave_sz);
if (!buf)
return -ENOMEM;
- err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz);
+ err = ubifs_leb_read(c, c->lsave_lnum, buf, c->lsave_offs,
+ c->lsave_sz, 1);
if (err)
goto out;
err = unpack_lsave(c, buf);
@@ -1814,8 +1817,8 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
if (c->big_lpt)
nnode->num = calc_nnode_num_from_parent(c, parent, iip);
} else {
- err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
- c->nnode_sz);
+ err = ubifs_leb_read(c, branch->lnum, buf, branch->offs,
+ c->nnode_sz, 1);
if (err)
return ERR_PTR(err);
err = ubifs_unpack_nnode(c, buf, nnode);
@@ -1883,8 +1886,8 @@ static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c,
ubifs_assert(branch->lnum >= c->lpt_first &&
branch->lnum <= c->lpt_last);
ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size);
- err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
- c->pnode_sz);
+ err = ubifs_leb_read(c, branch->lnum, buf, branch->offs,
+ c->pnode_sz, 1);
if (err)
return ERR_PTR(err);
err = unpack_pnode(c, buf, pnode);
@@ -2224,7 +2227,7 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
struct ubifs_cnode *cn;
int num, iip = 0, err;
- if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ if (!dbg_is_chk_lprops(c))
return 0;
while (cnode) {
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index dfcb574..cddd6bd 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -27,6 +27,7 @@
#include <linux/crc16.h>
#include <linux/slab.h>
+#include <linux/random.h>
#include "ubifs.h"
#ifdef CONFIG_UBIFS_FS_DEBUG
@@ -116,8 +117,8 @@ static int get_cnodes_to_commit(struct ubifs_info *c)
return 0;
cnt += 1;
while (1) {
- ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags));
- __set_bit(COW_ZNODE, &cnode->flags);
+ ubifs_assert(!test_bit(COW_CNODE, &cnode->flags));
+ __set_bit(COW_CNODE, &cnode->flags);
cnext = next_dirty_cnode(cnode);
if (!cnext) {
cnode->cnext = c->lpt_cnext;
@@ -465,7 +466,7 @@ static int write_cnodes(struct ubifs_info *c)
*/
clear_bit(DIRTY_CNODE, &cnode->flags);
smp_mb__before_clear_bit();
- clear_bit(COW_ZNODE, &cnode->flags);
+ clear_bit(COW_CNODE, &cnode->flags);
smp_mb__after_clear_bit();
offs += len;
dbg_chk_lpt_sz(c, 1, len);
@@ -1160,11 +1161,11 @@ static int lpt_gc_lnum(struct ubifs_info *c, int lnum)
void *buf = c->lpt_buf;
dbg_lp("LEB %d", lnum);
- err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
- if (err) {
- ubifs_err("cannot read LEB %d, error %d", lnum, err);
+
+ err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1);
+ if (err)
return err;
- }
+
while (1) {
if (!is_a_node(c, buf, len)) {
int pad_len;
@@ -1640,7 +1641,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
int ret;
void *buf, *p;
- if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ if (!dbg_is_chk_lprops(c))
return 0;
buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
@@ -1650,11 +1651,11 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
}
dbg_lp("LEB %d", lnum);
- err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
- if (err) {
- dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
+
+ err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1);
+ if (err)
goto out;
- }
+
while (1) {
if (!is_a_node(c, p, len)) {
int i, pad_len;
@@ -1711,7 +1712,7 @@ int dbg_check_ltab(struct ubifs_info *c)
{
int lnum, err, i, cnt;
- if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ if (!dbg_is_chk_lprops(c))
return 0;
/* Bring the entire tree into memory */
@@ -1754,7 +1755,7 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
long long free = 0;
int i;
- if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ if (!dbg_is_chk_lprops(c))
return 0;
for (i = 0; i < c->lpt_lebs; i++) {
@@ -1796,7 +1797,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
long long chk_lpt_sz, lpt_sz;
int err = 0;
- if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ if (!dbg_is_chk_lprops(c))
return 0;
switch (action) {
@@ -1901,11 +1902,10 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
return;
}
- err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
- if (err) {
- ubifs_err("cannot read LEB %d, error %d", lnum, err);
+ err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1);
+ if (err)
goto out;
- }
+
while (1) {
offs = c->leb_size - len;
if (!is_a_node(c, p, len)) {
@@ -2019,7 +2019,7 @@ static int dbg_populate_lsave(struct ubifs_info *c)
struct ubifs_lpt_heap *heap;
int i;
- if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ if (!dbg_is_chk_gen(c))
return 0;
if (random32() & 3)
return 0;
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 278c238..bb9f481 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -352,10 +352,9 @@ int ubifs_read_master(struct ubifs_info *c)
* ubifs_write_master - write master node.
* @c: UBIFS file-system description object
*
- * This function writes the master node. The caller has to take the
- * @c->mst_mutex lock before calling this function. Returns zero in case of
- * success and a negative error code in case of failure. The master node is
- * written twice to enable recovery.
+ * This function writes the master node. Returns zero in case of success and a
+ * negative error code in case of failure. The master node is written twice to
+ * enable recovery.
*/
int ubifs_write_master(struct ubifs_info *c)
{
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 0b5296a..ee7cb5e 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -39,6 +39,29 @@ static inline int ubifs_zn_dirty(const struct ubifs_znode *znode)
}
/**
+ * ubifs_zn_obsolete - check if znode is obsolete.
+ * @znode: znode to check
+ *
+ * This helper function returns %1 if @znode is obsolete and %0 otherwise.
+ */
+static inline int ubifs_zn_obsolete(const struct ubifs_znode *znode)
+{
+ return !!test_bit(OBSOLETE_ZNODE, &znode->flags);
+}
+
+/**
+ * ubifs_zn_cow - check if znode has to be copied on write.
+ * @znode: znode to check
+ *
+ * This helper function returns %1 if @znode is has COW flag set and %0
+ * otherwise.
+ */
+static inline int ubifs_zn_cow(const struct ubifs_znode *znode)
+{
+ return !!test_bit(COW_ZNODE, &znode->flags);
+}
+
+/**
* ubifs_wake_up_bgt - wake up background thread.
* @c: UBIFS file-system description object
*/
@@ -122,86 +145,6 @@ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
}
/**
- * ubifs_leb_unmap - unmap an LEB.
- * @c: UBIFS file-system description object
- * @lnum: LEB number to unmap
- *
- * This function returns %0 on success and a negative error code on failure.
- */
-static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
-{
- int err;
-
- ubifs_assert(!c->ro_media && !c->ro_mount);
- if (c->ro_error)
- return -EROFS;
- err = ubi_leb_unmap(c->ubi, lnum);
- if (err) {
- ubifs_err("unmap LEB %d failed, error %d", lnum, err);
- return err;
- }
-
- return 0;
-}
-
-/**
- * ubifs_leb_write - write to a LEB.
- * @c: UBIFS file-system description object
- * @lnum: LEB number to write
- * @buf: buffer to write from
- * @offs: offset within LEB to write to
- * @len: length to write
- * @dtype: data type
- *
- * This function returns %0 on success and a negative error code on failure.
- */
-static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
- const void *buf, int offs, int len, int dtype)
-{
- int err;
-
- ubifs_assert(!c->ro_media && !c->ro_mount);
- if (c->ro_error)
- return -EROFS;
- err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
- if (err) {
- ubifs_err("writing %d bytes at %d:%d, error %d",
- len, lnum, offs, err);
- return err;
- }
-
- return 0;
-}
-
-/**
- * ubifs_leb_change - atomic LEB change.
- * @c: UBIFS file-system description object
- * @lnum: LEB number to write
- * @buf: buffer to write from
- * @len: length to write
- * @dtype: data type
- *
- * This function returns %0 on success and a negative error code on failure.
- */
-static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
- const void *buf, int len, int dtype)
-{
- int err;
-
- ubifs_assert(!c->ro_media && !c->ro_mount);
- if (c->ro_error)
- return -EROFS;
- err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
- if (err) {
- ubifs_err("changing %d bytes in LEB %d, error %d",
- len, lnum, err);
- return err;
- }
-
- return 0;
-}
-
-/**
* ubifs_encode_dev - encode device node IDs.
* @dev: UBIFS device node information
* @rdev: device IDs to encode
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index a5422ff..f9c90b5 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -130,13 +130,14 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
else if (inum > o->inum)
p = p->rb_right;
else {
- if (o->dnext) {
+ if (o->del) {
spin_unlock(&c->orphan_lock);
dbg_gen("deleted twice ino %lu",
(unsigned long)inum);
return;
}
if (o->cnext) {
+ o->del = 1;
o->dnext = c->orph_dnext;
c->orph_dnext = o;
spin_unlock(&c->orphan_lock);
@@ -447,6 +448,7 @@ static void erase_deleted(struct ubifs_info *c)
orphan = dnext;
dnext = orphan->dnext;
ubifs_assert(!orphan->new);
+ ubifs_assert(orphan->del);
rb_erase(&orphan->rb, &c->orph_tree);
list_del(&orphan->list);
c->tot_orphans -= 1;
@@ -536,6 +538,7 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
rb_link_node(&orphan->rb, parent, p);
rb_insert_color(&orphan->rb, &c->orph_tree);
list_add_tail(&orphan->list, &c->orph_list);
+ orphan->del = 1;
orphan->dnext = c->orph_dnext;
c->orph_dnext = orphan;
dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum,
@@ -929,7 +932,7 @@ static int dbg_check_orphans(struct ubifs_info *c)
struct check_info ci;
int err;
- if (!(ubifs_chk_flags & UBIFS_CHK_ORPH))
+ if (!dbg_is_chk_orph(c))
return 0;
ci.last_ino = 0;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 783d8e0..ee4f43f 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -117,7 +117,7 @@ static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf,
if (!sbuf)
return -ENOMEM;
- err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size);
+ err = ubifs_leb_read(c, lnum, sbuf, 0, c->leb_size, 0);
if (err && err != -EBADMSG)
goto out_free;
@@ -213,10 +213,10 @@ static int write_rcvrd_mst_node(struct ubifs_info *c,
mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY);
ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
- err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum, mst, sz, UBI_SHORTTERM);
if (err)
goto out;
- err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum + 1, mst, sz, UBI_SHORTTERM);
if (err)
goto out;
out:
@@ -274,7 +274,8 @@ int ubifs_recover_master_node(struct ubifs_info *c)
if (cor1)
goto out_err;
mst = mst1;
- } else if (offs1 == 0 && offs2 + sz >= c->leb_size) {
+ } else if (offs1 == 0 &&
+ c->leb_size - offs2 - sz < sz) {
/* 1st LEB was unmapped and written, 2nd not */
if (cor1)
goto out_err;
@@ -539,8 +540,8 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
int len = ALIGN(endpt, c->min_io_size);
if (start) {
- err = ubi_read(c->ubi, lnum, sleb->buf, 0,
- start);
+ err = ubifs_leb_read(c, lnum, sleb->buf, 0,
+ start, 1);
if (err)
return err;
}
@@ -554,8 +555,8 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
ubifs_pad(c, buf, pad_len);
}
}
- err = ubi_leb_change(c->ubi, lnum, sleb->buf, len,
- UBI_UNKNOWN);
+ err = ubifs_leb_change(c, lnum, sleb->buf, len,
+ UBI_UNKNOWN);
if (err)
return err;
}
@@ -819,7 +820,8 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs,
return -ENOMEM;
if (c->leb_size - offs < UBIFS_CS_NODE_SZ)
goto out_err;
- err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ);
+ err = ubifs_leb_read(c, lnum, (void *)cs_node, offs,
+ UBIFS_CS_NODE_SZ, 0);
if (err && err != -EBADMSG)
goto out_free;
ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0);
@@ -919,8 +921,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
*
* This function returns %0 on success and a negative error code on failure.
*/
-static int recover_head(const struct ubifs_info *c, int lnum, int offs,
- void *sbuf)
+static int recover_head(struct ubifs_info *c, int lnum, int offs, void *sbuf)
{
int len = c->max_write_size, err;
@@ -931,15 +932,15 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs,
return 0;
/* Read at the head location and check it is empty flash */
- err = ubi_read(c->ubi, lnum, sbuf, offs, len);
+ err = ubifs_leb_read(c, lnum, sbuf, offs, len, 1);
if (err || !is_empty(sbuf, len)) {
dbg_rcvry("cleaning head at %d:%d", lnum, offs);
if (offs == 0)
return ubifs_leb_unmap(c, lnum);
- err = ubi_read(c->ubi, lnum, sbuf, 0, offs);
+ err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1);
if (err)
return err;
- return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN);
+ return ubifs_leb_change(c, lnum, sbuf, offs, UBI_UNKNOWN);
}
return 0;
@@ -962,7 +963,7 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs,
*
* This function returns %0 on success and a negative error code on failure.
*/
-int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
+int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf)
{
int err;
@@ -982,7 +983,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
}
/**
- * clean_an_unclean_leb - read and write a LEB to remove corruption.
+ * clean_an_unclean_leb - read and write a LEB to remove corruption.
* @c: UBIFS file-system description object
* @ucleb: unclean LEB information
* @sbuf: LEB-sized buffer to use
@@ -993,7 +994,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
*
* This function returns %0 on success and a negative error code on failure.
*/
-static int clean_an_unclean_leb(const struct ubifs_info *c,
+static int clean_an_unclean_leb(struct ubifs_info *c,
struct ubifs_unclean_leb *ucleb, void *sbuf)
{
int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1;
@@ -1009,7 +1010,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c,
return 0;
}
- err = ubi_read(c->ubi, lnum, buf, offs, len);
+ err = ubifs_leb_read(c, lnum, buf, offs, len, 0);
if (err && err != -EBADMSG)
return err;
@@ -1069,7 +1070,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c,
}
/* Write back the LEB atomically */
- err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN);
+ err = ubifs_leb_change(c, lnum, sbuf, len, UBI_UNKNOWN);
if (err)
return err;
@@ -1089,7 +1090,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c,
*
* This function returns %0 on success and a negative error code on failure.
*/
-int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
+int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf)
{
dbg_rcvry("recovery");
while (!list_empty(&c->unclean_leb_list)) {
@@ -1454,7 +1455,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
if (i_size >= e->d_size)
return 0;
/* Read the LEB */
- err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size);
+ err = ubifs_leb_read(c, lnum, c->sbuf, 0, c->leb_size, 1);
if (err)
goto out;
/* Change the size field and recalculate the CRC */
@@ -1470,7 +1471,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
len -= 1;
len = ALIGN(len + 1, c->min_io_size);
/* Atomically write the fixed LEB back again */
- err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
+ err = ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN);
if (err)
goto out;
dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 5e97161..ccabaf1 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -523,8 +523,7 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
if (!list_is_last(&next->list, &jh->buds_list))
return 0;
- err = ubi_read(c->ubi, next->lnum, (char *)&data,
- next->start, 4);
+ err = ubifs_leb_read(c, next->lnum, (char *)&data, next->start, 4, 1);
if (err)
return 0;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 1250016..b73ecd8 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c)
mst->total_dirty = cpu_to_le64(tmp64);
/* The indexing LEB does not contribute to dark space */
- tmp64 = (c->main_lebs - 1) * c->dark_wm;
+ tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm);
mst->total_dark = cpu_to_le64(tmp64);
mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
@@ -674,15 +674,15 @@ static int fixup_leb(struct ubifs_info *c, int lnum, int len)
if (len == 0) {
dbg_mnt("unmap empty LEB %d", lnum);
- return ubi_leb_unmap(c->ubi, lnum);
+ return ubifs_leb_unmap(c, lnum);
}
dbg_mnt("fixup LEB %d, data len %d", lnum, len);
- err = ubi_read(c->ubi, lnum, c->sbuf, 0, len);
+ err = ubifs_leb_read(c, lnum, c->sbuf, 0, len, 1);
if (err)
return err;
- return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
+ return ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN);
}
/**
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 36216b4..37383e8 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -148,7 +148,7 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
INIT_LIST_HEAD(&sleb->nodes);
sleb->buf = sbuf;
- err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs);
+ err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0);
if (err && err != -EBADMSG) {
ubifs_err("cannot read %d bytes from LEB %d:%d,"
" error %d", c->leb_size - offs, lnum, offs, err);
@@ -240,7 +240,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
int len;
ubifs_err("corruption at LEB %d:%d", lnum, offs);
- if (dbg_failure_mode)
+ if (dbg_is_tst_rcvry(c))
return;
len = c->leb_size - offs;
if (len > 8192)
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 9e1d056..e0a7a76 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -128,7 +128,6 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
freed = ubifs_destroy_tnc_subtree(znode);
atomic_long_sub(freed, &ubifs_clean_zn_cnt);
atomic_long_sub(freed, &c->clean_zn_cnt);
- ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0);
total_freed += freed;
znode = zprev;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index db04976..201bcfc 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -85,7 +85,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode)
if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA)
return 4;
- if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG)
+ if (ui->xattr && !S_ISREG(inode->i_mode))
return 5;
if (!ubifs_compr_present(ui->compr_type)) {
@@ -94,7 +94,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode)
ubifs_compr_name(ui->compr_type));
}
- err = dbg_check_dir_size(c, inode);
+ err = dbg_check_dir(c, inode);
return err;
}
@@ -129,7 +129,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
goto out_ino;
inode->i_flags |= (S_NOCMTIME | S_NOATIME);
- inode->i_nlink = le32_to_cpu(ino->nlink);
+ set_nlink(inode, le32_to_cpu(ino->nlink));
inode->i_uid = le32_to_cpu(ino->uid);
inode->i_gid = le32_to_cpu(ino->gid);
inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
@@ -914,7 +914,7 @@ static int check_volume_empty(struct ubifs_info *c)
c->empty = 1;
for (lnum = 0; lnum < c->leb_cnt; lnum++) {
- err = ubi_is_mapped(c->ubi, lnum);
+ err = ubifs_is_mapped(c, lnum);
if (unlikely(err < 0))
return err;
if (err == 1) {
@@ -1985,7 +1985,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
mutex_init(&c->lp_mutex);
mutex_init(&c->tnc_mutex);
mutex_init(&c->log_mutex);
- mutex_init(&c->mst_mutex);
mutex_init(&c->umount_mutex);
mutex_init(&c->bu_mutex);
mutex_init(&c->write_reserve_mutex);
@@ -2264,19 +2263,12 @@ static int __init ubifs_init(void)
return -EINVAL;
}
- err = register_filesystem(&ubifs_fs_type);
- if (err) {
- ubifs_err("cannot register file system, error %d", err);
- return err;
- }
-
- err = -ENOMEM;
ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
sizeof(struct ubifs_inode), 0,
SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
&inode_slab_ctor);
if (!ubifs_inode_slab)
- goto out_reg;
+ return -ENOMEM;
register_shrinker(&ubifs_shrinker_info);
@@ -2288,15 +2280,20 @@ static int __init ubifs_init(void)
if (err)
goto out_compr;
+ err = register_filesystem(&ubifs_fs_type);
+ if (err) {
+ ubifs_err("cannot register file system, error %d", err);
+ goto out_dbg;
+ }
return 0;
+out_dbg:
+ dbg_debugfs_exit();
out_compr:
ubifs_compressors_exit();
out_shrinker:
unregister_shrinker(&ubifs_shrinker_info);
kmem_cache_destroy(ubifs_inode_slab);
-out_reg:
- unregister_filesystem(&ubifs_fs_type);
return err;
}
/* late_initcall to let compressors initialize first */
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 91b4213..0667386 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -223,7 +223,7 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c,
__set_bit(DIRTY_ZNODE, &zn->flags);
__clear_bit(COW_ZNODE, &zn->flags);
- ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));
+ ubifs_assert(!ubifs_zn_obsolete(znode));
__set_bit(OBSOLETE_ZNODE, &znode->flags);
if (znode->level != 0) {
@@ -271,7 +271,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
struct ubifs_znode *zn;
int err;
- if (!test_bit(COW_ZNODE, &znode->flags)) {
+ if (!ubifs_zn_cow(znode)) {
/* znode is not being committed */
if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) {
atomic_long_inc(&c->dirty_zn_cnt);
@@ -462,7 +462,7 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
- err = ubi_read(c->ubi, lnum, buf, offs, len);
+ err = ubifs_leb_read(c, lnum, buf, offs, len, 1);
if (err) {
ubifs_err("cannot read node type %d from LEB %d:%d, error %d",
type, lnum, offs, err);
@@ -1666,7 +1666,7 @@ static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum,
if (!overlap) {
/* We may safely unlock the write-buffer and read the data */
spin_unlock(&wbuf->lock);
- return ubi_read(c->ubi, lnum, buf, offs, len);
+ return ubifs_leb_read(c, lnum, buf, offs, len, 0);
}
/* Don't read under wbuf */
@@ -1680,7 +1680,7 @@ static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum,
if (rlen > 0)
/* Read everything that goes before write-buffer */
- return ubi_read(c->ubi, lnum, buf, offs, rlen);
+ return ubifs_leb_read(c, lnum, buf, offs, rlen, 0);
return 0;
}
@@ -1767,7 +1767,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
if (wbuf)
err = read_wbuf(wbuf, bu->buf, len, lnum, offs);
else
- err = ubi_read(c->ubi, lnum, bu->buf, offs, len);
+ err = ubifs_leb_read(c, lnum, bu->buf, offs, len, 0);
/* Check for a race with GC */
if (maybe_leb_gced(c, lnum, bu->gc_seq))
@@ -2423,7 +2423,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
*/
do {
- ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));
+ ubifs_assert(!ubifs_zn_obsolete(znode));
ubifs_assert(ubifs_zn_dirty(znode));
zp = znode->parent;
@@ -2479,9 +2479,8 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
c->zroot.offs = zbr->offs;
c->zroot.len = zbr->len;
c->zroot.znode = znode;
- ubifs_assert(!test_bit(OBSOLETE_ZNODE,
- &zp->flags));
- ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags));
+ ubifs_assert(!ubifs_zn_obsolete(zp));
+ ubifs_assert(ubifs_zn_dirty(zp));
atomic_long_dec(&c->dirty_zn_cnt);
if (zp->cnext) {
@@ -2865,7 +2864,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
struct ubifs_znode *znode = cnext;
cnext = cnext->cnext;
- if (test_bit(OBSOLETE_ZNODE, &znode->flags))
+ if (ubifs_zn_obsolete(znode))
kfree(znode);
} while (cnext && cnext != c->cnext);
}
@@ -3301,7 +3300,7 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
if (!S_ISREG(inode->i_mode))
return 0;
- if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ if (!dbg_is_chk_gen(c))
return 0;
block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
@@ -3337,9 +3336,10 @@ out_dump:
ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
"(data key %s)", (unsigned long)inode->i_ino, size,
((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
+ mutex_unlock(&c->tnc_mutex);
dbg_dump_inode(c, inode);
dbg_dump_stack();
- err = -EINVAL;
+ return -EINVAL;
out_unlock:
mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 41920f3..4c15f07 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -22,6 +22,7 @@
/* This file implements TNC functions for committing */
+#include <linux/random.h>
#include "ubifs.h"
/**
@@ -87,8 +88,12 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
atomic_long_dec(&c->dirty_zn_cnt);
ubifs_assert(ubifs_zn_dirty(znode));
- ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
+ ubifs_assert(ubifs_zn_cow(znode));
+ /*
+ * Note, unlike 'write_index()' we do not add memory barriers here
+ * because this function is called with @c->tnc_mutex locked.
+ */
__clear_bit(DIRTY_ZNODE, &znode->flags);
__clear_bit(COW_ZNODE, &znode->flags);
@@ -377,7 +382,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
c->gap_lebs = NULL;
return err;
}
- if (dbg_force_in_the_gaps_enabled()) {
+ if (!dbg_is_chk_index(c)) {
/*
* Do not print scary warnings if the debugging
* option which forces in-the-gaps is enabled.
@@ -491,25 +496,6 @@ static int layout_in_empty_space(struct ubifs_info *c)
else
next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
- if (c->min_io_size == 1) {
- buf_offs += ALIGN(len, 8);
- if (next_len) {
- if (buf_offs + next_len <= c->leb_size)
- continue;
- err = ubifs_update_one_lp(c, lnum, 0,
- c->leb_size - buf_offs, 0, 0);
- if (err)
- return err;
- lnum = -1;
- continue;
- }
- err = ubifs_update_one_lp(c, lnum,
- c->leb_size - buf_offs, 0, 0, 0);
- if (err)
- return err;
- break;
- }
-
/* Update buffer positions */
wlen = used + len;
used += ALIGN(len, 8);
@@ -658,7 +644,7 @@ static int get_znodes_to_commit(struct ubifs_info *c)
}
cnt += 1;
while (1) {
- ubifs_assert(!test_bit(COW_ZNODE, &znode->flags));
+ ubifs_assert(!ubifs_zn_cow(znode));
__set_bit(COW_ZNODE, &znode->flags);
znode->alt = 0;
cnext = find_next_dirty(znode);
@@ -704,7 +690,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
c->ilebs[c->ileb_cnt++] = lnum;
dbg_cmt("LEB %d", lnum);
}
- if (dbg_force_in_the_gaps())
+ if (dbg_is_chk_index(c) && !(random32() & 7))
return -ENOSPC;
return 0;
}
@@ -830,7 +816,7 @@ static int write_index(struct ubifs_info *c)
struct ubifs_idx_node *idx;
struct ubifs_znode *znode, *cnext;
int i, lnum, offs, len, next_len, buf_len, buf_offs, used;
- int avail, wlen, err, lnum_pos = 0;
+ int avail, wlen, err, lnum_pos = 0, blen, nxt_offs;
cnext = c->enext;
if (!cnext)
@@ -907,7 +893,7 @@ static int write_index(struct ubifs_info *c)
cnext = znode->cnext;
ubifs_assert(ubifs_zn_dirty(znode));
- ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
+ ubifs_assert(ubifs_zn_cow(znode));
/*
* It is important that other threads should see %DIRTY_ZNODE
@@ -922,6 +908,28 @@ static int write_index(struct ubifs_info *c)
clear_bit(COW_ZNODE, &znode->flags);
smp_mb__after_clear_bit();
+ /*
+ * We have marked the znode as clean but have not updated the
+ * @c->clean_zn_cnt counter. If this znode becomes dirty again
+ * before 'free_obsolete_znodes()' is called, then
+ * @c->clean_zn_cnt will be decremented before it gets
+ * incremented (resulting in 2 decrements for the same znode).
+ * This means that @c->clean_zn_cnt may become negative for a
+ * while.
+ *
+ * Q: why we cannot increment @c->clean_zn_cnt?
+ * A: because we do not have the @c->tnc_mutex locked, and the
+ * following code would be racy and buggy:
+ *
+ * if (!ubifs_zn_obsolete(znode)) {
+ * atomic_long_inc(&c->clean_zn_cnt);
+ * atomic_long_inc(&ubifs_clean_zn_cnt);
+ * }
+ *
+ * Thus, we just delay the @c->clean_zn_cnt update until we
+ * have the mutex locked.
+ */
+
/* Do not access znode from this point on */
/* Update buffer positions */
@@ -938,65 +946,38 @@ static int write_index(struct ubifs_info *c)
else
next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
- if (c->min_io_size == 1) {
- /*
- * Write the prepared index node immediately if there is
- * no minimum IO size
- */
- err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs,
- wlen, UBI_SHORTTERM);
- if (err)
- return err;
- buf_offs += ALIGN(wlen, 8);
- if (next_len) {
- used = 0;
- avail = buf_len;
- if (buf_offs + next_len > c->leb_size) {
- err = ubifs_update_one_lp(c, lnum,
- LPROPS_NC, 0, 0, LPROPS_TAKEN);
- if (err)
- return err;
- lnum = -1;
- }
+ nxt_offs = buf_offs + used + next_len;
+ if (next_len && nxt_offs <= c->leb_size) {
+ if (avail > 0)
continue;
- }
+ else
+ blen = buf_len;
} else {
- int blen, nxt_offs = buf_offs + used + next_len;
-
- if (next_len && nxt_offs <= c->leb_size) {
- if (avail > 0)
- continue;
- else
- blen = buf_len;
- } else {
- wlen = ALIGN(wlen, 8);
- blen = ALIGN(wlen, c->min_io_size);
- ubifs_pad(c, c->cbuf + wlen, blen - wlen);
- }
- /*
- * The buffer is full or there are no more znodes
- * to do
- */
- err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs,
- blen, UBI_SHORTTERM);
- if (err)
- return err;
- buf_offs += blen;
- if (next_len) {
- if (nxt_offs > c->leb_size) {
- err = ubifs_update_one_lp(c, lnum,
- LPROPS_NC, 0, 0, LPROPS_TAKEN);
- if (err)
- return err;
- lnum = -1;
- }
- used -= blen;
- if (used < 0)
- used = 0;
- avail = buf_len - used;
- memmove(c->cbuf, c->cbuf + blen, used);
- continue;
+ wlen = ALIGN(wlen, 8);
+ blen = ALIGN(wlen, c->min_io_size);
+ ubifs_pad(c, c->cbuf + wlen, blen - wlen);
+ }
+
+ /* The buffer is full or there are no more znodes to do */
+ err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen,
+ UBI_SHORTTERM);
+ if (err)
+ return err;
+ buf_offs += blen;
+ if (next_len) {
+ if (nxt_offs > c->leb_size) {
+ err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0,
+ 0, LPROPS_TAKEN);
+ if (err)
+ return err;
+ lnum = -1;
}
+ used -= blen;
+ if (used < 0)
+ used = 0;
+ avail = buf_len - used;
+ memmove(c->cbuf, c->cbuf + blen, used);
+ continue;
}
break;
}
@@ -1029,7 +1010,7 @@ static void free_obsolete_znodes(struct ubifs_info *c)
do {
znode = cnext;
cnext = znode->cnext;
- if (test_bit(OBSOLETE_ZNODE, &znode->flags))
+ if (ubifs_zn_obsolete(znode))
kfree(znode);
else {
znode->cnext = NULL;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 62d50a9..223dd42 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -230,14 +230,14 @@ enum {
* LPT cnode flag bits.
*
* DIRTY_CNODE: cnode is dirty
- * COW_CNODE: cnode is being committed and must be copied before writing
* OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted),
- * so it can (and must) be freed when the commit is finished
+ * so it can (and must) be freed when the commit is finished
+ * COW_CNODE: cnode is being committed and must be copied before writing
*/
enum {
DIRTY_CNODE = 0,
- COW_CNODE = 1,
- OBSOLETE_CNODE = 2,
+ OBSOLETE_CNODE = 1,
+ COW_CNODE = 2,
};
/*
@@ -908,6 +908,7 @@ struct ubifs_budget_req {
* @dnext: next orphan to delete
* @inum: inode number
* @new: %1 => added since the last commit, otherwise %0
+ * @del: %1 => delete pending, otherwise %0
*/
struct ubifs_orphan {
struct rb_node rb;
@@ -917,6 +918,7 @@ struct ubifs_orphan {
struct ubifs_orphan *dnext;
ino_t inum;
int new;
+ unsigned del:1;
};
/**
@@ -1042,7 +1044,6 @@ struct ubifs_debug_info;
*
* @mst_node: master node
* @mst_offs: offset of valid master node
- * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
*
* @max_bu_buf_len: maximum bulk-read buffer length
* @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
@@ -1282,7 +1283,6 @@ struct ubifs_info {
struct ubifs_mst_node *mst_node;
int mst_offs;
- struct mutex mst_mutex;
int max_bu_buf_len;
struct mutex bu_mutex;
@@ -1471,6 +1471,15 @@ extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
/* io.c */
void ubifs_ro_mode(struct ubifs_info *c, int err);
+int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
+ int len, int even_ebadmsg);
+int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
+ int len, int dtype);
+int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
+ int dtype);
+int ubifs_leb_unmap(struct ubifs_info *c, int lnum);
+int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype);
+int ubifs_is_mapped(const struct ubifs_info *c, int lnum);
int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
int dtype);
@@ -1723,7 +1732,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
int ubifs_calc_dark(const struct ubifs_info *c, int spc);
/* file.c */
-int ubifs_fsync(struct file *file, int datasync);
+int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
/* dir.c */
@@ -1750,8 +1759,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
int offs, void *sbuf, int jhead);
struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
int offs, void *sbuf);
-int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
-int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf);
+int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf);
+int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf);
int ubifs_rcvry_gc_commit(struct ubifs_info *c);
int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
int deletion, loff_t new_size);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 16f19f5..bf18f7a 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -558,10 +558,10 @@ int ubifs_removexattr(struct dentry *dentry, const char *name)
}
ubifs_assert(inode->i_nlink == 1);
- inode->i_nlink = 0;
+ clear_nlink(inode);
err = remove_xattr(c, host, inode, &nm);
if (err)
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
/* If @i_nlink is 0, 'iput()' will delete the inode */
iput(inode);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 95518a9..987585b 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -59,8 +59,8 @@ static int __load_block_bitmap(struct super_block *sb,
int nr_groups = bitmap->s_nr_groups;
if (block_group >= nr_groups) {
- udf_debug("block_group (%d) > nr_groups (%d)\n", block_group,
- nr_groups);
+ udf_debug("block_group (%d) > nr_groups (%d)\n",
+ block_group, nr_groups);
}
if (bitmap->s_block_bitmap[block_group]) {
@@ -126,8 +126,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
if (bloc->logicalBlockNum + count < count ||
(bloc->logicalBlockNum + count) > partmap->s_partition_len) {
udf_debug("%d < %d || %d + %d > %d\n",
- bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
- count, partmap->s_partition_len);
+ bloc->logicalBlockNum, 0,
+ bloc->logicalBlockNum, count,
+ partmap->s_partition_len);
goto error_return;
}
@@ -155,7 +156,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
if (udf_set_bit(bit + i, bh->b_data)) {
udf_debug("bit %ld already set\n", bit + i);
udf_debug("byte=%2x\n",
- ((char *)bh->b_data)[(bit + i) >> 3]);
+ ((char *)bh->b_data)[(bit + i) >> 3]);
}
}
udf_add_free_space(sb, sbi->s_partition, count);
@@ -369,7 +370,8 @@ static void udf_table_free_blocks(struct super_block *sb,
if (bloc->logicalBlockNum + count < count ||
(bloc->logicalBlockNum + count) > partmap->s_partition_len) {
udf_debug("%d < %d || %d + %d > %d\n",
- bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
+ bloc->logicalBlockNum, 0,
+ bloc->logicalBlockNum, count,
partmap->s_partition_len);
goto error_return;
}
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index eb8bfe2..56341af 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -163,7 +163,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
- flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
+ flen = udf_get_filename(dir->i_sb, nameptr, lfi, fname,
+ UDF_NAME_LEN);
dt_type = DT_UNKNOWN;
}
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2ffdb67..3e44f57 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -162,8 +162,8 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
int padlen;
if ((!buffer) || (!offset)) {
- udf_debug("invalidparms\n, buffer=%p, offset=%p\n", buffer,
- offset);
+ udf_debug("invalidparms, buffer=%p, offset=%p\n",
+ buffer, offset);
return NULL;
}
@@ -201,7 +201,7 @@ struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offs
struct short_ad *sa;
if ((!ptr) || (!offset)) {
- printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n");
+ pr_err("%s: invalidparms\n", __func__);
return NULL;
}
@@ -223,7 +223,7 @@ struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset
struct long_ad *la;
if ((!ptr) || (!offset)) {
- printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n");
+ pr_err("%s: invalidparms\n", __func__);
return NULL;
}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 8eb9628..874c9e3 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -173,7 +173,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
long old_block, new_block;
int result = -EINVAL;
- if (file_permission(filp, MAY_READ) != 0) {
+ if (inode_permission(inode, MAY_READ) != 0) {
udf_debug("no permission to access inode %lu\n", inode->i_ino);
result = -EPERM;
goto out;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 957c974..e081440 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -37,6 +37,7 @@
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
+#include <linux/mpage.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -83,12 +84,10 @@ void udf_evict_inode(struct inode *inode)
end_writeback(inode);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
inode->i_size != iinfo->i_lenExtents) {
- printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
- "inode size %llu different from extent length %llu. "
- "Filesystem need not be standards compliant.\n",
- inode->i_sb->s_id, inode->i_ino, inode->i_mode,
- (unsigned long long)inode->i_size,
- (unsigned long long)iinfo->i_lenExtents);
+ udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n",
+ inode->i_ino, inode->i_mode,
+ (unsigned long long)inode->i_size,
+ (unsigned long long)iinfo->i_lenExtents);
}
kfree(iinfo->i_ext.i_data);
iinfo->i_ext.i_data = NULL;
@@ -104,7 +103,13 @@ static int udf_writepage(struct page *page, struct writeback_control *wbc)
static int udf_readpage(struct file *file, struct page *page)
{
- return block_read_full_page(page, udf_get_block);
+ return mpage_readpage(page, udf_get_block);
+}
+
+static int udf_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, udf_get_block);
}
static int udf_write_begin(struct file *file, struct address_space *mapping,
@@ -139,6 +144,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations udf_aops = {
.readpage = udf_readpage,
+ .readpages = udf_readpages,
.writepage = udf_writepage,
.write_begin = udf_write_begin,
.write_end = generic_write_end,
@@ -1170,13 +1176,22 @@ update_time:
return 0;
}
+/*
+ * Maximum length of linked list formed by ICB hierarchy. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_ICB_NESTING 1024
+
static void __udf_read_inode(struct inode *inode)
{
struct buffer_head *bh = NULL;
struct fileEntry *fe;
uint16_t ident;
struct udf_inode_info *iinfo = UDF_I(inode);
+ unsigned int indirections = 0;
+reread:
/*
* Set defaults, but the inode is still incomplete!
* Note: get_new_inode() sets the following on a new inode:
@@ -1191,16 +1206,15 @@ static void __udf_read_inode(struct inode *inode)
*/
bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
if (!bh) {
- printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
- inode->i_ino);
+ udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
make_bad_inode(inode);
return;
}
if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
ident != TAG_IDENT_USE) {
- printk(KERN_ERR "udf: udf_read_inode(ino %ld) "
- "failed ident=%d\n", inode->i_ino, ident);
+ udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
+ inode->i_ino, ident);
brelse(bh);
make_bad_inode(inode);
return;
@@ -1214,34 +1228,32 @@ static void __udf_read_inode(struct inode *inode)
ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
&ident);
if (ident == TAG_IDENT_IE && ibh) {
- struct buffer_head *nbh = NULL;
struct kernel_lb_addr loc;
struct indirectEntry *ie;
ie = (struct indirectEntry *)ibh->b_data;
loc = lelb_to_cpu(ie->indirectICB.extLocation);
- if (ie->indirectICB.extLength &&
- (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
- &ident))) {
- if (ident == TAG_IDENT_FE ||
- ident == TAG_IDENT_EFE) {
- memcpy(&iinfo->i_location,
- &loc,
- sizeof(struct kernel_lb_addr));
- brelse(bh);
- brelse(ibh);
- brelse(nbh);
- __udf_read_inode(inode);
+ if (ie->indirectICB.extLength) {
+ brelse(bh);
+ brelse(ibh);
+ memcpy(&iinfo->i_location, &loc,
+ sizeof(struct kernel_lb_addr));
+ if (++indirections > UDF_MAX_ICB_NESTING) {
+ udf_err(inode->i_sb,
+ "too many ICBs in ICB hierarchy"
+ " (max %d supported)\n",
+ UDF_MAX_ICB_NESTING);
+ make_bad_inode(inode);
return;
}
- brelse(nbh);
+ goto reread;
}
}
brelse(ibh);
} else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
- printk(KERN_ERR "udf: unsupported strategy type: %d\n",
- le16_to_cpu(fe->icbTag.strategyType));
+ udf_err(inode->i_sb, "unsupported strategy type: %d\n",
+ le16_to_cpu(fe->icbTag.strategyType));
brelse(bh);
make_bad_inode(inode);
return;
@@ -1258,6 +1270,8 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
int offset;
struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
struct udf_inode_info *iinfo = UDF_I(inode);
+ unsigned int link_count;
+ int bs = inode->i_sb->s_blocksize;
fe = (struct fileEntry *)bh->b_data;
efe = (struct extendedFileEntry *)bh->b_data;
@@ -1278,41 +1292,38 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
iinfo->i_efe = 1;
iinfo->i_use = 0;
- if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+ if (udf_alloc_i_data(inode, bs -
sizeof(struct extendedFileEntry))) {
make_bad_inode(inode);
return;
}
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct extendedFileEntry),
- inode->i_sb->s_blocksize -
- sizeof(struct extendedFileEntry));
+ bs - sizeof(struct extendedFileEntry));
} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
iinfo->i_efe = 0;
iinfo->i_use = 0;
- if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
- sizeof(struct fileEntry))) {
+ if (udf_alloc_i_data(inode, bs - sizeof(struct fileEntry))) {
make_bad_inode(inode);
return;
}
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct fileEntry),
- inode->i_sb->s_blocksize - sizeof(struct fileEntry));
+ bs - sizeof(struct fileEntry));
} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
iinfo->i_efe = 0;
iinfo->i_use = 1;
iinfo->i_lenAlloc = le32_to_cpu(
((struct unallocSpaceEntry *)bh->b_data)->
lengthAllocDescs);
- if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+ if (udf_alloc_i_data(inode, bs -
sizeof(struct unallocSpaceEntry))) {
make_bad_inode(inode);
return;
}
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct unallocSpaceEntry),
- inode->i_sb->s_blocksize -
- sizeof(struct unallocSpaceEntry));
+ bs - sizeof(struct unallocSpaceEntry));
return;
}
@@ -1340,9 +1351,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
inode->i_mode &= ~sbi->s_umask;
read_unlock(&sbi->s_cred_lock);
- inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
- if (!inode->i_nlink)
- inode->i_nlink = 1;
+ link_count = le16_to_cpu(fe->fileLinkCount);
+ if (!link_count)
+ link_count = 1;
+ set_nlink(inode, link_count);
inode->i_size = le64_to_cpu(fe->informationLength);
iinfo->i_lenExtents = inode->i_size;
@@ -1389,6 +1401,36 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
iinfo->i_lenEAttr;
}
+ /*
+ * Sanity check length of allocation descriptors and extended attrs to
+ * avoid integer overflows
+ */
+ if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs) {
+ make_bad_inode(inode);
+ return;
+ }
+ /* Now do exact checks */
+ if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs) {
+ make_bad_inode(inode);
+ return;
+ }
+ /* Sanity checks for files in ICB so that we don't get confused later */
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+ /*
+ * For file in ICB data is stored in allocation descriptor
+ * so sizes should match
+ */
+ if (iinfo->i_lenAlloc != inode->i_size) {
+ make_bad_inode(inode);
+ return;
+ }
+ /* File in ICB has to fit in there... */
+ if (inode->i_size > bs - udf_file_entry_alloc_offset(inode)) {
+ make_bad_inode(inode);
+ return;
+ }
+ }
+
switch (fe->icbTag.fileType) {
case ICBTAG_FILE_TYPE_DIRECTORY:
inode->i_op = &udf_dir_inode_operations;
@@ -1435,9 +1477,8 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
udf_debug("METADATA BITMAP FILE-----\n");
break;
default:
- printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown "
- "file type=%d\n", inode->i_ino,
- fe->icbTag.fileType);
+ udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
+ inode->i_ino, fe->icbTag.fileType);
make_bad_inode(inode);
return;
}
@@ -1460,8 +1501,8 @@ static int udf_alloc_i_data(struct inode *inode, size_t size)
iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL);
if (!iinfo->i_ext.i_data) {
- printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) "
- "no free memory\n", inode->i_ino);
+ udf_err(inode->i_sb, "(ino %ld) no free memory\n",
+ inode->i_ino);
return -ENOMEM;
}
@@ -1711,9 +1752,8 @@ out:
if (do_sync) {
sync_dirty_buffer(bh);
if (buffer_write_io_error(bh)) {
- printk(KERN_WARNING "IO error syncing udf inode "
- "[%s:%08lx]\n", inode->i_sb->s_id,
- inode->i_ino);
+ udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n",
+ inode->i_ino);
err = -EIO;
}
}
@@ -2004,8 +2044,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
*elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK;
break;
default:
- udf_debug("alloc_type = %d unsupported\n",
- iinfo->i_alloc_type);
+ udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type);
return -1;
}
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 43e24a3..6583fe9 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -38,7 +38,7 @@ unsigned int udf_get_last_session(struct super_block *sb)
if (i == 0) {
udf_debug("XA disk: %s, vol_desc_start=%d\n",
- (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba);
+ ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba);
if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
vol_desc_start = ms_info.addr.lba;
} else {
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 9215700..c175b4d 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -204,6 +204,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
{
struct tag *tag_p;
struct buffer_head *bh = NULL;
+ u8 checksum;
/* Read the block */
if (block == 0xFFFFFFFF)
@@ -211,8 +212,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
bh = udf_tread(sb, block);
if (!bh) {
- udf_debug("block=%d, location=%d: read failed\n",
- block, location);
+ udf_err(sb, "read failed, block=%u, location=%d\n",
+ block, location);
return NULL;
}
@@ -227,16 +228,18 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
}
/* Verify the tag checksum */
- if (udf_tag_checksum(tag_p) != tag_p->tagChecksum) {
- printk(KERN_ERR "udf: tag checksum failed block %d\n", block);
+ checksum = udf_tag_checksum(tag_p);
+ if (checksum != tag_p->tagChecksum) {
+ udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n",
+ block, checksum, tag_p->tagChecksum);
goto error_out;
}
/* Verify the tag version */
if (tag_p->descVersion != cpu_to_le16(0x0002U) &&
tag_p->descVersion != cpu_to_le16(0x0003U)) {
- udf_debug("tag version 0x%04x != 0x0002 || 0x0003 block %d\n",
- le16_to_cpu(tag_p->descVersion), block);
+ udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n",
+ le16_to_cpu(tag_p->descVersion), block);
goto error_out;
}
@@ -248,8 +251,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
return bh;
udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block,
- le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength));
-
+ le16_to_cpu(tag_p->descCRC),
+ le16_to_cpu(tag_p->descCRCLength));
error_out:
brelse(bh);
return NULL;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index d8c1bb5..483d662 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -235,7 +235,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
if (!lfi)
continue;
- flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
+ flen = udf_get_filename(dir->i_sb, nameptr, lfi, fname,
+ UDF_NAME_LEN);
if (flen && udf_match(flen, fname, child->len, child->name))
goto out_ok;
}
@@ -577,8 +578,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
if (!fi) {
- inode->i_nlink--;
- mark_inode_dirty(inode);
+ inode_dec_link_count(inode);
iput(inode);
return err;
}
@@ -618,8 +618,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
init_special_inode(inode, mode, rdev);
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
if (!fi) {
- inode->i_nlink--;
- mark_inode_dirty(inode);
+ inode_dec_link_count(inode);
iput(inode);
return err;
}
@@ -665,12 +664,11 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
inode->i_fop = &udf_dir_operations;
fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
if (!fi) {
- inode->i_nlink--;
- mark_inode_dirty(inode);
+ inode_dec_link_count(inode);
iput(inode);
goto out;
}
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location);
*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
@@ -683,7 +681,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
if (!fi) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
mark_inode_dirty(inode);
iput(inode);
goto out;
@@ -799,9 +797,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
if (retval)
goto end_rmdir;
if (inode->i_nlink != 2)
- udf_warning(inode->i_sb, "udf_rmdir",
- "empty directory has nlink != 2 (%d)",
- inode->i_nlink);
+ udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n",
+ inode->i_nlink);
clear_nlink(inode);
inode->i_size = 0;
inode_dec_link_count(dir);
@@ -840,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
if (!inode->i_nlink) {
udf_debug("Deleting nonexistent file (%lu), %d\n",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
retval = udf_delete_entry(dir, fi, &fibh, &cfi);
if (retval)
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index a71090e..d6caf01 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -33,8 +33,8 @@ uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map;
if (partition >= sbi->s_partitions) {
- udf_debug("block=%d, partition=%d, offset=%d: "
- "invalid partition\n", block, partition, offset);
+ udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n",
+ block, partition, offset);
return 0xFFFFFFFF;
}
map = &sbi->s_partmaps[partition];
@@ -60,8 +60,8 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
vdata = &map->s_type_specific.s_virtual;
if (block > vdata->s_num_entries) {
- udf_debug("Trying to access block beyond end of VAT "
- "(%d max %d)\n", block, vdata->s_num_entries);
+ udf_debug("Trying to access block beyond end of VAT (%d max %d)\n",
+ block, vdata->s_num_entries);
return 0xFFFFFFFF;
}
@@ -321,9 +321,14 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block,
/* We shouldn't mount such media... */
BUG_ON(!inode);
retblk = udf_try_read_meta(inode, block, partition, offset);
- if (retblk == 0xFFFFFFFF) {
- udf_warning(sb, __func__, "error reading from METADATA, "
- "trying to read from MIRROR");
+ if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) {
+ udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n");
+ if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) {
+ mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
+ mdata->s_mirror_file_loc, map->s_partition_num);
+ mdata->s_flags |= MF_MIRROR_FE_LOADED;
+ }
+
inode = mdata->s_mirror_fe;
if (!inode)
return 0xFFFFFFFF;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index b0c7b53..f66439e 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -76,8 +76,6 @@
#define UDF_DEFAULT_BLOCKSIZE 2048
-static char error_buf[1024];
-
/* These are the "meat" - everything else is stuffing */
static int udf_fill_super(struct super_block *, void *, int);
static void udf_put_super(struct super_block *);
@@ -93,8 +91,6 @@ static void udf_close_lvid(struct super_block *);
static unsigned int udf_count_free(struct super_block *);
static int udf_statfs(struct dentry *, struct kstatfs *);
static int udf_show_options(struct seq_file *, struct vfsmount *);
-static void udf_error(struct super_block *sb, const char *function,
- const char *fmt, ...);
struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
{
@@ -245,9 +241,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count)
sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map),
GFP_KERNEL);
if (!sbi->s_partmaps) {
- udf_error(sb, __func__,
- "Unable to allocate space for %d partition maps",
- count);
+ udf_err(sb, "Unable to allocate space for %d partition maps\n",
+ count);
sbi->s_partitions = 0;
return -ENOMEM;
}
@@ -551,8 +546,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
uopt->dmode = option & 0777;
break;
default:
- printk(KERN_ERR "udf: bad mount option \"%s\" "
- "or missing value\n", p);
+ pr_err("bad mount option \"%s\" or missing value\n", p);
return 0;
}
}
@@ -646,20 +640,16 @@ static loff_t udf_check_vsd(struct super_block *sb)
udf_debug("ISO9660 Boot Record found\n");
break;
case 1:
- udf_debug("ISO9660 Primary Volume Descriptor "
- "found\n");
+ udf_debug("ISO9660 Primary Volume Descriptor found\n");
break;
case 2:
- udf_debug("ISO9660 Supplementary Volume "
- "Descriptor found\n");
+ udf_debug("ISO9660 Supplementary Volume Descriptor found\n");
break;
case 3:
- udf_debug("ISO9660 Volume Partition Descriptor "
- "found\n");
+ udf_debug("ISO9660 Volume Partition Descriptor found\n");
break;
case 255:
- udf_debug("ISO9660 Volume Descriptor Set "
- "Terminator found\n");
+ udf_debug("ISO9660 Volume Descriptor Set Terminator found\n");
break;
default:
udf_debug("ISO9660 VRS (%u) found\n",
@@ -810,8 +800,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
pvoldesc->recordingDateAndTime)) {
#ifdef UDFFS_DEBUG
struct timestamp *ts = &pvoldesc->recordingDateAndTime;
- udf_debug("recording time %04u/%02u/%02u"
- " %02u:%02u (%x)\n",
+ udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n",
le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
ts->minute, le16_to_cpu(ts->typeAndTimezone));
#endif
@@ -822,7 +811,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
outstr->u_len > 31 ? 31 : outstr->u_len);
udf_debug("volIdent[] = '%s'\n",
- UDF_SB(sb)->s_volume_ident);
+ UDF_SB(sb)->s_volume_ident);
}
if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
@@ -838,64 +827,57 @@ out1:
return ret;
}
+struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
+ u32 meta_file_loc, u32 partition_num)
+{
+ struct kernel_lb_addr addr;
+ struct inode *metadata_fe;
+
+ addr.logicalBlockNum = meta_file_loc;
+ addr.partitionReferenceNum = partition_num;
+
+ metadata_fe = udf_iget(sb, &addr);
+
+ if (metadata_fe == NULL)
+ udf_warn(sb, "metadata inode efe not found\n");
+ else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
+ udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
+ iput(metadata_fe);
+ metadata_fe = NULL;
+ }
+
+ return metadata_fe;
+}
+
static int udf_load_metadata_files(struct super_block *sb, int partition)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map;
struct udf_meta_data *mdata;
struct kernel_lb_addr addr;
- int fe_error = 0;
map = &sbi->s_partmaps[partition];
mdata = &map->s_type_specific.s_metadata;
/* metadata address */
- addr.logicalBlockNum = mdata->s_meta_file_loc;
- addr.partitionReferenceNum = map->s_partition_num;
-
udf_debug("Metadata file location: block = %d part = %d\n",
- addr.logicalBlockNum, addr.partitionReferenceNum);
+ mdata->s_meta_file_loc, map->s_partition_num);
- mdata->s_metadata_fe = udf_iget(sb, &addr);
+ mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb,
+ mdata->s_meta_file_loc, map->s_partition_num);
if (mdata->s_metadata_fe == NULL) {
- udf_warning(sb, __func__, "metadata inode efe not found, "
- "will try mirror inode.");
- fe_error = 1;
- } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type !=
- ICBTAG_FLAG_AD_SHORT) {
- udf_warning(sb, __func__, "metadata inode efe does not have "
- "short allocation descriptors!");
- fe_error = 1;
- iput(mdata->s_metadata_fe);
- mdata->s_metadata_fe = NULL;
- }
+ /* mirror file entry */
+ udf_debug("Mirror metadata file location: block = %d part = %d\n",
+ mdata->s_mirror_file_loc, map->s_partition_num);
- /* mirror file entry */
- addr.logicalBlockNum = mdata->s_mirror_file_loc;
- addr.partitionReferenceNum = map->s_partition_num;
-
- udf_debug("Mirror metadata file location: block = %d part = %d\n",
- addr.logicalBlockNum, addr.partitionReferenceNum);
+ mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
+ mdata->s_mirror_file_loc, map->s_partition_num);
- mdata->s_mirror_fe = udf_iget(sb, &addr);
-
- if (mdata->s_mirror_fe == NULL) {
- if (fe_error) {
- udf_error(sb, __func__, "mirror inode efe not found "
- "and metadata inode is missing too, exiting...");
- goto error_exit;
- } else
- udf_warning(sb, __func__, "mirror inode efe not found,"
- " but metadata inode is OK");
- } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type !=
- ICBTAG_FLAG_AD_SHORT) {
- udf_warning(sb, __func__, "mirror inode efe does not have "
- "short allocation descriptors!");
- iput(mdata->s_mirror_fe);
- mdata->s_mirror_fe = NULL;
- if (fe_error)
+ if (mdata->s_mirror_fe == NULL) {
+ udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
goto error_exit;
+ }
}
/*
@@ -908,18 +890,15 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
addr.partitionReferenceNum = map->s_partition_num;
udf_debug("Bitmap file location: block = %d part = %d\n",
- addr.logicalBlockNum, addr.partitionReferenceNum);
+ addr.logicalBlockNum, addr.partitionReferenceNum);
mdata->s_bitmap_fe = udf_iget(sb, &addr);
if (mdata->s_bitmap_fe == NULL) {
if (sb->s_flags & MS_RDONLY)
- udf_warning(sb, __func__, "bitmap inode efe "
- "not found but it's ok since the disc"
- " is mounted read-only");
+ udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
else {
- udf_error(sb, __func__, "bitmap inode efe not "
- "found and attempted read-write mount");
+ udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
goto error_exit;
}
}
@@ -972,9 +951,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
if (bitmap == NULL) {
- udf_error(sb, __func__,
- "Unable to allocate space for bitmap "
- "and %d buffer_head pointers", nr_groups);
+ udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n",
+ nr_groups);
return NULL;
}
@@ -1004,10 +982,9 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;
- udf_debug("Partition (%d type %x) starts at physical %d, "
- "block length %d\n", p_index,
- map->s_partition_type, map->s_partition_root,
- map->s_partition_len);
+ udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n",
+ p_index, map->s_partition_type,
+ map->s_partition_root, map->s_partition_len);
if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
@@ -1024,12 +1001,12 @@ static int udf_fill_partdesc_info(struct super_block *sb,
map->s_uspace.s_table = udf_iget(sb, &loc);
if (!map->s_uspace.s_table) {
udf_debug("cannot load unallocSpaceTable (part %d)\n",
- p_index);
+ p_index);
return 1;
}
map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
udf_debug("unallocSpaceTable (part %d) @ %ld\n",
- p_index, map->s_uspace.s_table->i_ino);
+ p_index, map->s_uspace.s_table->i_ino);
}
if (phd->unallocSpaceBitmap.extLength) {
@@ -1042,8 +1019,8 @@ static int udf_fill_partdesc_info(struct super_block *sb,
bitmap->s_extPosition = le32_to_cpu(
phd->unallocSpaceBitmap.extPosition);
map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
- udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index,
- bitmap->s_extPosition);
+ udf_debug("unallocSpaceBitmap (part %d) @ %d\n",
+ p_index, bitmap->s_extPosition);
}
if (phd->partitionIntegrityTable.extLength)
@@ -1059,13 +1036,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
map->s_fspace.s_table = udf_iget(sb, &loc);
if (!map->s_fspace.s_table) {
udf_debug("cannot load freedSpaceTable (part %d)\n",
- p_index);
+ p_index);
return 1;
}
map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
udf_debug("freedSpaceTable (part %d) @ %ld\n",
- p_index, map->s_fspace.s_table->i_ino);
+ p_index, map->s_fspace.s_table->i_ino);
}
if (phd->freedSpaceBitmap.extLength) {
@@ -1078,8 +1055,8 @@ static int udf_fill_partdesc_info(struct super_block *sb,
bitmap->s_extPosition = le32_to_cpu(
phd->freedSpaceBitmap.extPosition);
map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
- udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index,
- bitmap->s_extPosition);
+ udf_debug("freedSpaceBitmap (part %d) @ %d\n",
+ p_index, bitmap->s_extPosition);
}
return 0;
}
@@ -1119,11 +1096,9 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
if (!sbi->s_vat_inode &&
sbi->s_last_block != blocks - 1) {
- printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
- " last recorded block (%lu), retrying with the last "
- "block of the device (%lu).\n",
- (unsigned long)sbi->s_last_block,
- (unsigned long)blocks - 1);
+ pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n",
+ (unsigned long)sbi->s_last_block,
+ (unsigned long)blocks - 1);
udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
}
if (!sbi->s_vat_inode)
@@ -1221,8 +1196,8 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
if (map->s_partition_type == UDF_METADATA_MAP25) {
ret = udf_load_metadata_files(sb, i);
if (ret) {
- printk(KERN_ERR "UDF-fs: error loading MetaData "
- "partition map %d\n", i);
+ udf_err(sb, "error loading MetaData partition map %d\n",
+ i);
goto out_bh;
}
} else {
@@ -1235,9 +1210,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
* overwrite blocks instead of relocating them).
*/
sb->s_flags |= MS_RDONLY;
- printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only "
- "because writing to pseudooverwrite partition is "
- "not implemented.\n");
+ pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n");
}
out_bh:
/* In case loading failed, we handle cleanup in udf_fill_super */
@@ -1259,13 +1232,13 @@ static int udf_load_sparable_map(struct super_block *sb,
map->s_partition_type = UDF_SPARABLE_MAP15;
sdata->s_packet_len = le16_to_cpu(spm->packetLength);
if (!is_power_of_2(sdata->s_packet_len)) {
- udf_error(sb, __func__, "error loading logical volume descriptor: "
+ udf_err(sb, "error loading logical volume descriptor: "
"Invalid packet length %u\n",
(unsigned)sdata->s_packet_len);
return -EIO;
}
if (spm->numSparingTables > 4) {
- udf_error(sb, __func__, "error loading logical volume descriptor: "
+ udf_err(sb, "error loading logical volume descriptor: "
"Too many sparing tables (%d)\n",
(int)spm->numSparingTables);
return -EIO;
@@ -1312,8 +1285,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
BUG_ON(ident != TAG_IDENT_LVD);
lvd = (struct logicalVolDesc *)bh->b_data;
table_len = le32_to_cpu(lvd->mapTableLength);
- if (sizeof(*lvd) + table_len > sb->s_blocksize) {
- udf_error(sb, __func__, "error loading logical volume descriptor: "
+ if (table_len > sb->s_blocksize - sizeof(*lvd)) {
+ udf_err(sb, "error loading logical volume descriptor: "
"Partition table too long (%u > %lu)\n", table_len,
sb->s_blocksize - sizeof(*lvd));
ret = 1;
@@ -1373,9 +1346,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
struct metadataPartitionMap *mdm =
(struct metadataPartitionMap *)
&(lvd->partitionMaps[offset]);
- udf_debug("Parsing Logical vol part %d "
- "type %d id=%s\n", i, type,
- UDF_ID_METADATA);
+ udf_debug("Parsing Logical vol part %d type %d id=%s\n",
+ i, type, UDF_ID_METADATA);
map->s_partition_type = UDF_METADATA_MAP25;
map->s_partition_func = udf_get_pblock_meta25;
@@ -1390,25 +1362,24 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
le32_to_cpu(mdm->allocUnitSize);
mdata->s_align_unit_size =
le16_to_cpu(mdm->alignUnitSize);
- mdata->s_dup_md_flag =
- mdm->flags & 0x01;
+ if (mdm->flags & 0x01)
+ mdata->s_flags |= MF_DUPLICATE_MD;
udf_debug("Metadata Ident suffix=0x%x\n",
- (le16_to_cpu(
- ((__le16 *)
- mdm->partIdent.identSuffix)[0])));
+ le16_to_cpu(*(__le16 *)
+ mdm->partIdent.identSuffix));
udf_debug("Metadata part num=%d\n",
- le16_to_cpu(mdm->partitionNum));
+ le16_to_cpu(mdm->partitionNum));
udf_debug("Metadata part alloc unit size=%d\n",
- le32_to_cpu(mdm->allocUnitSize));
+ le32_to_cpu(mdm->allocUnitSize));
udf_debug("Metadata file loc=%d\n",
- le32_to_cpu(mdm->metadataFileLoc));
+ le32_to_cpu(mdm->metadataFileLoc));
udf_debug("Mirror file loc=%d\n",
- le32_to_cpu(mdm->metadataMirrorFileLoc));
+ le32_to_cpu(mdm->metadataMirrorFileLoc));
udf_debug("Bitmap file loc=%d\n",
- le32_to_cpu(mdm->metadataBitmapFileLoc));
- udf_debug("Duplicate Flag: %d %d\n",
- mdata->s_dup_md_flag, mdm->flags);
+ le32_to_cpu(mdm->metadataBitmapFileLoc));
+ udf_debug("Flags: %d %d\n",
+ mdata->s_flags, mdm->flags);
} else {
udf_debug("Unknown ident: %s\n",
upm2->partIdent.ident);
@@ -1418,16 +1389,15 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
map->s_partition_num = le16_to_cpu(upm2->partitionNum);
}
udf_debug("Partition (%d:%d) type %d on volume %d\n",
- i, map->s_partition_num, type,
- map->s_volumeseqnum);
+ i, map->s_partition_num, type, map->s_volumeseqnum);
}
if (fileset) {
struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
*fileset = lelb_to_cpu(la->extLocation);
- udf_debug("FileSet found in LogicalVolDesc at block=%d, "
- "partition=%d\n", fileset->logicalBlockNum,
+ udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n",
+ fileset->logicalBlockNum,
fileset->partitionReferenceNum);
}
if (lvd->integritySeqExt.extLength)
@@ -1507,9 +1477,9 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh) {
- printk(KERN_ERR "udf: Block %Lu of volume descriptor "
- "sequence is corrupted or we could not read "
- "it.\n", (unsigned long long)block);
+ udf_err(sb,
+ "Block %llu of volume descriptor sequence is corrupted or we could not read it\n",
+ (unsigned long long)block);
return 1;
}
@@ -1582,7 +1552,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
* in a suitable order
*/
if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
- printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n");
+ udf_err(sb, "Primary Volume Descriptor not found!\n");
return 1;
}
if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block))
@@ -1769,7 +1739,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
if (!sb_set_blocksize(sb, uopt->blocksize)) {
if (!silent)
- printk(KERN_WARNING "UDF-fs: Bad block size\n");
+ udf_warn(sb, "Bad block size\n");
return 0;
}
sbi->s_last_block = uopt->lastblock;
@@ -1778,12 +1748,11 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
nsr_off = udf_check_vsd(sb);
if (!nsr_off) {
if (!silent)
- printk(KERN_WARNING "UDF-fs: No VRS found\n");
+ udf_warn(sb, "No VRS found\n");
return 0;
}
if (nsr_off == -1)
- udf_debug("Failed to read byte 32768. Assuming open "
- "disc. Skipping validity check\n");
+ udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n");
if (!sbi->s_last_block)
sbi->s_last_block = udf_get_last_block(sb);
} else {
@@ -1794,7 +1763,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
sbi->s_anchor = uopt->anchor;
if (!udf_find_anchor(sb, fileset)) {
if (!silent)
- printk(KERN_WARNING "UDF-fs: No anchor found\n");
+ udf_warn(sb, "No anchor found\n");
return 0;
}
return 1;
@@ -1972,8 +1941,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
if (uopt.flags & (1 << UDF_FLAG_UTF8) &&
uopt.flags & (1 << UDF_FLAG_NLS_MAP)) {
- udf_error(sb, "udf_read_super",
- "utf8 cannot be combined with iocharset\n");
+ udf_err(sb, "utf8 cannot be combined with iocharset\n");
goto error_out;
}
#ifdef CONFIG_UDF_NLS
@@ -2022,15 +1990,14 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
ret = udf_load_vrs(sb, &uopt, silent, &fileset);
if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
if (!silent)
- printk(KERN_NOTICE
- "UDF-fs: Rescanning with blocksize "
- "%d\n", UDF_DEFAULT_BLOCKSIZE);
+ pr_notice("Rescanning with blocksize %d\n",
+ UDF_DEFAULT_BLOCKSIZE);
uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
ret = udf_load_vrs(sb, &uopt, silent, &fileset);
}
}
if (!ret) {
- printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
+ udf_warn(sb, "No partition found (1)\n");
goto error_out;
}
@@ -2045,10 +2012,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
le16_to_cpu(lvidiu->maxUDFWriteRev); */
if (minUDFReadRev > UDF_MAX_READ_VERSION) {
- printk(KERN_ERR "UDF-fs: minUDFReadRev=%x "
- "(max is %x)\n",
- le16_to_cpu(lvidiu->minUDFReadRev),
- UDF_MAX_READ_VERSION);
+ udf_err(sb, "minUDFReadRev=%x (max is %x)\n",
+ le16_to_cpu(lvidiu->minUDFReadRev),
+ UDF_MAX_READ_VERSION);
goto error_out;
} else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION)
sb->s_flags |= MS_RDONLY;
@@ -2062,28 +2028,27 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
}
if (!sbi->s_partitions) {
- printk(KERN_WARNING "UDF-fs: No partition found (2)\n");
+ udf_warn(sb, "No partition found (2)\n");
goto error_out;
}
if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
UDF_PART_FLAG_READ_ONLY) {
- printk(KERN_NOTICE "UDF-fs: Partition marked readonly; "
- "forcing readonly mount\n");
+ pr_notice("Partition marked readonly; forcing readonly mount\n");
sb->s_flags |= MS_RDONLY;
}
if (udf_find_fileset(sb, &fileset, &rootdir)) {
- printk(KERN_WARNING "UDF-fs: No fileset found\n");
+ udf_warn(sb, "No fileset found\n");
goto error_out;
}
if (!silent) {
struct timestamp ts;
udf_time_to_disk_stamp(&ts, sbi->s_record_time);
- udf_info("UDF: Mounting volume '%s', "
- "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
- sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day,
+ udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
+ sbi->s_volume_ident,
+ le16_to_cpu(ts.year), ts.month, ts.day,
ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
}
if (!(sb->s_flags & MS_RDONLY))
@@ -2094,8 +2059,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
/* perhaps it's not extensible enough, but for now ... */
inode = udf_iget(sb, &rootdir);
if (!inode) {
- printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
- "partition=%d\n",
+ udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
goto error_out;
}
@@ -2103,7 +2067,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
/* Allocate a dentry for the root inode */
sb->s_root = d_alloc_root(inode);
if (!sb->s_root) {
- printk(KERN_ERR "UDF-fs: Couldn't allocate root dentry\n");
+ udf_err(sb, "Couldn't allocate root dentry\n");
iput(inode);
goto error_out;
}
@@ -2131,32 +2095,40 @@ error_out:
return -EINVAL;
}
-static void udf_error(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void _udf_err(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- if (!(sb->s_flags & MS_RDONLY)) {
- /* mark sb error */
+ /* mark sb error */
+ if (!(sb->s_flags & MS_RDONLY))
sb->s_dirt = 1;
- }
+
va_start(args, fmt);
- vsnprintf(error_buf, sizeof(error_buf), fmt, args);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf);
+
va_end(args);
- printk(KERN_CRIT "UDF-fs error (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
}
-void udf_warning(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void _udf_warn(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- vsnprintf(error_buf, sizeof(error_buf), fmt, args);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf);
+
va_end(args);
- printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
}
static void udf_put_super(struct super_block *sb)
@@ -2248,11 +2220,11 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
bh = udf_read_ptagged(sb, &loc, 0, &ident);
if (!bh) {
- printk(KERN_ERR "udf: udf_count_free failed\n");
+ udf_err(sb, "udf_count_free failed\n");
goto out;
} else if (ident != TAG_IDENT_SBD) {
brelse(bh);
- printk(KERN_ERR "udf: udf_count_free failed\n");
+ udf_err(sb, "udf_count_free failed\n");
goto out;
}
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index b1d4488..0422b7b 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -30,43 +30,73 @@
#include <linux/buffer_head.h>
#include "udf_i.h"
-static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
- int fromlen, unsigned char *to)
+static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
+ int fromlen, unsigned char *to, int tolen)
{
struct pathComponent *pc;
int elen = 0;
+ int comp_len;
unsigned char *p = to;
+ /* Reserve one byte for terminating \0 */
+ tolen--;
while (elen < fromlen) {
pc = (struct pathComponent *)(from + elen);
+ elen += sizeof(struct pathComponent);
switch (pc->componentType) {
case 1:
- if (pc->lengthComponentIdent == 0) {
- p = to;
- *p++ = '/';
+ /*
+ * Symlink points to some place which should be agreed
+ * upon between originator and receiver of the media. Ignore.
+ */
+ if (pc->lengthComponentIdent > 0) {
+ elen += pc->lengthComponentIdent;
+ break;
}
+ /* Fall through */
+ case 2:
+ if (tolen == 0)
+ return -ENAMETOOLONG;
+ p = to;
+ *p++ = '/';
+ tolen--;
break;
case 3:
+ if (tolen < 3)
+ return -ENAMETOOLONG;
memcpy(p, "../", 3);
p += 3;
+ tolen -= 3;
break;
case 4:
+ if (tolen < 2)
+ return -ENAMETOOLONG;
memcpy(p, "./", 2);
p += 2;
+ tolen -= 2;
/* that would be . - just ignore */
break;
case 5:
- p += udf_get_filename(sb, pc->componentIdent, p,
- pc->lengthComponentIdent);
+ elen += pc->lengthComponentIdent;
+ if (elen > fromlen)
+ return -EIO;
+ comp_len = udf_get_filename(sb, pc->componentIdent,
+ pc->lengthComponentIdent,
+ p, tolen);
+ p += comp_len;
+ tolen -= comp_len;
+ if (tolen == 0)
+ return -ENAMETOOLONG;
*p++ = '/';
+ tolen--;
break;
}
- elen += sizeof(struct pathComponent) + pc->lengthComponentIdent;
}
if (p > to + 1)
p[-1] = '\0';
else
p[0] = '\0';
+ return 0;
}
static int udf_symlink_filler(struct file *file, struct page *page)
@@ -74,11 +104,17 @@ static int udf_symlink_filler(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
struct buffer_head *bh = NULL;
unsigned char *symlink;
- int err = -EIO;
+ int err;
unsigned char *p = kmap(page);
struct udf_inode_info *iinfo;
uint32_t pos;
+ /* We don't support symlinks longer than one block */
+ if (inode->i_size > inode->i_sb->s_blocksize) {
+ err = -ENAMETOOLONG;
+ goto out_unmap;
+ }
+
iinfo = UDF_I(inode);
pos = udf_block_map(inode, 0);
@@ -88,14 +124,18 @@ static int udf_symlink_filler(struct file *file, struct page *page)
} else {
bh = sb_bread(inode->i_sb, pos);
- if (!bh)
- goto out;
+ if (!bh) {
+ err = -EIO;
+ goto out_unlock_inode;
+ }
symlink = bh->b_data;
}
- udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
+ err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE);
brelse(bh);
+ if (err)
+ goto out_unlock_inode;
up_read(&iinfo->i_data_sem);
SetPageUptodate(page);
@@ -103,9 +143,10 @@ static int udf_symlink_filler(struct file *file, struct page *page)
unlock_page(page);
return 0;
-out:
+out_unlock_inode:
up_read(&iinfo->i_data_sem);
SetPageError(page);
+out_unmap:
kunmap(page);
unlock_page(page);
return err;
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 8424308..4b98fee 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -95,23 +95,21 @@ void udf_truncate_tail_extent(struct inode *inode)
lbcount += elen;
if (lbcount > inode->i_size) {
if (lbcount - inode->i_size >= inode->i_sb->s_blocksize)
- printk(KERN_WARNING
- "udf_truncate_tail_extent(): Too long "
- "extent after EOF in inode %u: i_size: "
- "%Ld lbcount: %Ld extent %u+%u\n",
- (unsigned)inode->i_ino,
- (long long)inode->i_size,
- (long long)lbcount,
- (unsigned)eloc.logicalBlockNum,
- (unsigned)elen);
+ udf_warn(inode->i_sb,
+ "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n",
+ (unsigned)inode->i_ino,
+ (long long)inode->i_size,
+ (long long)lbcount,
+ (unsigned)eloc.logicalBlockNum,
+ (unsigned)elen);
nelen = elen - (lbcount - inode->i_size);
epos.offset -= adsize;
extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
epos.offset += adsize;
if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
- printk(KERN_ERR "udf_truncate_tail_extent(): "
- "Extent after EOF in inode %u.\n",
- (unsigned)inode->i_ino);
+ udf_err(inode->i_sb,
+ "Extent after EOF in inode %u\n",
+ (unsigned)inode->i_ino);
break;
}
}
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 54706dc..604f5fc 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -54,13 +54,16 @@
#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
+#define MF_DUPLICATE_MD 0x01
+#define MF_MIRROR_FE_LOADED 0x02
+
struct udf_meta_data {
__u32 s_meta_file_loc;
__u32 s_mirror_file_loc;
__u32 s_bitmap_file_loc;
__u32 s_alloc_unit_size;
__u16 s_align_unit_size;
- __u8 s_dup_md_flag;
+ int s_flags;
struct inode *s_metadata_fe;
struct inode *s_mirror_fe;
struct inode *s_bitmap_fe;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index dbd52d4b..8775ab2 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -1,6 +1,8 @@
#ifndef __UDF_DECL_H
#define __UDF_DECL_H
+#define pr_fmt(fmt) "UDF-fs: " fmt
+
#include "ecma_167.h"
#include "osta_udf.h"
@@ -16,23 +18,30 @@
#define UDF_PREALLOCATE
#define UDF_DEFAULT_PREALLOC_BLOCKS 8
+extern __printf(3, 4) void _udf_err(struct super_block *sb,
+ const char *function, const char *fmt, ...);
+#define udf_err(sb, fmt, ...) \
+ _udf_err(sb, __func__, fmt, ##__VA_ARGS__)
+
+extern __printf(3, 4) void _udf_warn(struct super_block *sb,
+ const char *function, const char *fmt, ...);
+#define udf_warn(sb, fmt, ...) \
+ _udf_warn(sb, __func__, fmt, ##__VA_ARGS__)
+
+#define udf_info(fmt, ...) \
+ pr_info("INFO " fmt, ##__VA_ARGS__)
+
#undef UDFFS_DEBUG
#ifdef UDFFS_DEBUG
-#define udf_debug(f, a...) \
-do { \
- printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \
- __FILE__, __LINE__, __func__); \
- printk(f, ##a); \
-} while (0)
+#define udf_debug(fmt, ...) \
+ printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt), \
+ __FILE__, __LINE__, __func__, ##__VA_ARGS__)
#else
-#define udf_debug(f, a...) /**/
+#define udf_debug(fmt, ...) \
+ no_printk(fmt, ##__VA_ARGS__)
#endif
-#define udf_info(f, a...) \
- printk(KERN_INFO "UDF-fs INFO " f, ##a);
-
-
#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) )
#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) )
@@ -112,8 +121,6 @@ struct extent_position {
/* super.c */
-__attribute__((format(printf, 3, 4)))
-extern void udf_warning(struct super_block *, const char *, const char *, ...);
static inline void udf_updated_lvid(struct super_block *sb)
{
struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
@@ -126,6 +133,8 @@ static inline void udf_updated_lvid(struct super_block *sb)
UDF_SB(sb)->s_lvid_dirty = 1;
}
extern u64 lvid_get_unique_id(struct super_block *sb);
+struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
+ u32 meta_file_loc, u32 partition_num);
/* namei.c */
extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -198,7 +207,8 @@ udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
}
/* unicode.c */
-extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
+extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *,
+ int);
extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
int);
extern int udf_build_ustr(struct ustr *, dstring *, int);
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index b8c828c..1f11483 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -34,9 +34,10 @@
* http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm
*/
+#include "udfdecl.h"
+
#include <linux/types.h>
#include <linux/kernel.h>
-#include "udfdecl.h"
#define EPOCH_YEAR 1970
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index d03a90b..d29c06f 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,7 +28,8 @@
#include "udf_sb.h"
-static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);
+static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *,
+ int);
static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
{
@@ -114,7 +115,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
cmp_id = ocu_i->u_cmpID;
if (cmp_id != 8 && cmp_id != 16) {
memset(utf_o, 0, sizeof(struct ustr));
- printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
+ pr_err("unknown compression code (%d) stri=%s\n",
cmp_id, ocu_i->u_name);
return 0;
}
@@ -242,7 +243,7 @@ try_again:
if (utf_cnt) {
error_out:
ocu[++u_len] = '?';
- printk(KERN_DEBUG "udf: bad UTF-8 character\n");
+ printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
}
ocu[length - 1] = (uint8_t)u_len + 1;
@@ -267,7 +268,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
cmp_id = ocu_i->u_cmpID;
if (cmp_id != 8 && cmp_id != 16) {
memset(utf_o, 0, sizeof(struct ustr));
- printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
+ pr_err("unknown compression code (%d) stri=%s\n",
cmp_id, ocu_i->u_name);
return 0;
}
@@ -333,8 +334,8 @@ try_again:
return u_len + 1;
}
-int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
- int flen)
+int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
+ uint8_t *dname, int dlen)
{
struct ustr *filename, *unifilename;
int len = 0;
@@ -347,7 +348,7 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
if (!unifilename)
goto out1;
- if (udf_build_ustr_exact(unifilename, sname, flen))
+ if (udf_build_ustr_exact(unifilename, sname, slen))
goto out2;
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
@@ -366,7 +367,8 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
} else
goto out2;
- len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
+ len = udf_translate_to_linux(dname, dlen,
+ filename->u_name, filename->u_len,
unifilename->u_name, unifilename->u_len);
out2:
kfree(unifilename);
@@ -403,10 +405,12 @@ int udf_put_filename(struct super_block *sb, const uint8_t *sname,
#define EXT_MARK '.'
#define CRC_MARK '#'
#define EXT_SIZE 5
+/* Number of chars we need to store generated CRC to make filename unique */
+#define CRC_LEN 5
-static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
- int udfLen, uint8_t *fidName,
- int fidNameLen)
+static int udf_translate_to_linux(uint8_t *newName, int newLen,
+ uint8_t *udfName, int udfLen,
+ uint8_t *fidName, int fidNameLen)
{
int index, newIndex = 0, needsCRC = 0;
int extIndex = 0, newExtIndex = 0, hasExt = 0;
@@ -440,7 +444,7 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
newExtIndex = newIndex;
}
}
- if (newIndex < 256)
+ if (newIndex < newLen)
newName[newIndex++] = curr;
else
needsCRC = 1;
@@ -468,13 +472,13 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
}
ext[localExtIndex++] = curr;
}
- maxFilenameLen = 250 - localExtIndex;
+ maxFilenameLen = newLen - CRC_LEN - localExtIndex;
if (newIndex > maxFilenameLen)
newIndex = maxFilenameLen;
else
newIndex = newExtIndex;
- } else if (newIndex > 250)
- newIndex = 250;
+ } else if (newIndex > newLen - CRC_LEN)
+ newIndex = newLen - CRC_LEN;
newName[newIndex++] = CRC_MARK;
valueCRC = crc_itu_t(0, fidName, fidNameLen);
newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 2eabf04..78a4c70 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -341,7 +341,7 @@ cg_found:
fail_remove_inode:
unlock_super(sb);
- inode->i_nlink = 0;
+ clear_nlink(inode);
iput(inode);
UFSD("EXIT (FAILED): err %d\n", err);
return ERR_PTR(err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index b4d791a..879b134 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -589,7 +589,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
* Copy data to the in-core inode.
*/
inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode);
- inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink);
+ set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink));
if (inode->i_nlink == 0) {
ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
return -1;
@@ -637,7 +637,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
* Copy data to the in-core inode.
*/
inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode);
- inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink);
+ set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink));
if (inode->i_nlink == 0) {
ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
return -1;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b57aab9..639d491 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -59,8 +59,6 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
if (ino)
inode = ufs_iget(dir->i_sb, ino);
unlock_ufs(dir->i_sb);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
return d_splice_alias(inode, dentry);
}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 5be2755..c26f2bc 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -117,9 +117,12 @@ extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buf
extern const struct file_operations ufs_dir_operations;
/* super.c */
-extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4)));
-extern void ufs_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4)));
-extern void ufs_panic (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4)));
+extern __printf(3, 4)
+void ufs_warning(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ufs_error(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ufs_panic(struct super_block *, const char *, const char *, ...);
/* symlink.c */
extern const struct inode_operations ufs_fast_symlink_inode_operations;