From 04bedd79a7037ee7af816b06c60c738144475c4a Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 18 Sep 2009 14:31:47 -0500 Subject: dlm: fix lowcomms_connect_node for sctp The recently added dlm_lowcomms_connect_node() from 391fbdc5d527149578490db2f1619951d91f3561 does not work when using SCTP instead of TCP. The sctp connection code has nothing to do without data to send. Check for no data in the sctp connection code and do nothing instead of triggering a BUG. Also have connect_node() do nothing when the protocol is sctp. Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 240cef1..a3350e4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -316,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; + /* with sctp there's no connecting without sending */ + if (dlm_config.ci_protocol != 0) + return 0; + if (nodeid == dlm_our_nodeid()) return 0; @@ -855,11 +859,14 @@ static void sctp_init_assoc(struct connection *con) outmessage.msg_flags = MSG_EOR; spin_lock(&con->writequeue_lock); - e = list_entry(con->writequeue.next, struct writequeue_entry, - list); - BUG_ON((struct list_head *) e == &con->writequeue); + if (list_empty(&con->writequeue)) { + spin_unlock(&con->writequeue_lock); + log_print("writequeue empty for nodeid %d", con->nodeid); + return; + } + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; spin_unlock(&con->writequeue_lock); -- cgit v1.1 From 6861f350785bf476c2d4e3b9cb69ee36b78df2fc Mon Sep 17 00:00:00 2001 From: David Teigland Date: Thu, 24 Sep 2009 15:58:23 -0500 Subject: dlm: fix socket fd translation The code to set up sctp sockets was not using the sockfd_lookup() and sockfd_put() routines to translate an fd to a socket. The direct fget and fput calls were resulting in error messages from alloc_fd(). Also clean up two log messages and remove a third, related to setting up sctp associations. Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a3350e4..70736eb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -459,9 +459,9 @@ static void process_sctp_notification(struct connection *con, int prim_len, ret; int addr_len; struct connection *new_con; - struct file *file; sctp_peeloff_arg_t parg; int parglen = sizeof(parg); + int err; /* * We get this before any data for an association. @@ -516,19 +516,22 @@ static void process_sctp_notification(struct connection *con, ret = kernel_getsockopt(con->sock, IPPROTO_SCTP, SCTP_SOCKOPT_PEELOFF, (void *)&parg, &parglen); - if (ret) { + if (ret < 0) { log_print("Can't peel off a socket for " - "connection %d to node %d: err=%d\n", + "connection %d to node %d: err=%d", parg.associd, nodeid, ret); + return; + } + new_con->sock = sockfd_lookup(parg.sd, &err); + if (!new_con->sock) { + log_print("sockfd_lookup error %d", err); + return; } - file = fget(parg.sd); - new_con->sock = SOCKET_I(file->f_dentry->d_inode); add_sock(new_con->sock, new_con); - fput(file); - put_unused_fd(parg.sd); + sockfd_put(new_con->sock); - log_print("got new/restarted association %d nodeid %d", - (int)sn->sn_assoc_change.sac_assoc_id, nodeid); + log_print("connecting to %d sctp association %d", + nodeid, (int)sn->sn_assoc_change.sac_assoc_id); /* Send any pending writes */ clear_bit(CF_CONNECT_PENDING, &new_con->flags); @@ -841,8 +844,6 @@ static void sctp_init_assoc(struct connection *con) if (con->retries++ > MAX_CONNECT_RETRIES) return; - log_print("Initiating association with node %d", con->nodeid); - if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) { log_print("no address for nodeid %d", con->nodeid); return; -- cgit v1.1 From 61d92c328c16419fc96dc50dd16f8b8c695409ec Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 2 Oct 2009 19:11:56 -0400 Subject: Btrfs: fix deadlock on async thread startup The btrfs async worker threads are used for a wide variety of things, including processing bio end_io functions. This means that when the endio threads aren't running, the rest of the FS isn't able to do the final processing required to clear PageWriteback. The endio threads also try to exit as they become idle and start more as the work piles up. The problem is that starting more threads means kthreadd may need to allocate ram, and that allocation may wait until the global number of writeback pages on the system is below a certain limit. The result of that throttling is that end IO threads wait on kthreadd, who is waiting on IO to end, which will never happen. This commit fixes the deadlock by handing off thread startup to a dedicated thread. It also fixes a bug where the on-demand thread creation was creating far too many threads because it didn't take into account threads being started by other procs. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 81 ++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/async-thread.h | 10 ++++-- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 42 +++++++++++++------------ fs/btrfs/relocation.c | 4 +-- 5 files changed, 106 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 282ca08..c0861e7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -64,6 +64,51 @@ struct btrfs_worker_thread { }; /* + * btrfs_start_workers uses kthread_run, which can block waiting for memory + * for a very long time. It will actually throttle on page writeback, + * and so it may not make progress until after our btrfs worker threads + * process all of the pending work structs in their queue + * + * This means we can't use btrfs_start_workers from inside a btrfs worker + * thread that is used as part of cleaning dirty memory, which pretty much + * involves all of the worker threads. + * + * Instead we have a helper queue who never has more than one thread + * where we scheduler thread start operations. This worker_start struct + * is used to contain the work and hold a pointer to the queue that needs + * another worker. + */ +struct worker_start { + struct btrfs_work work; + struct btrfs_workers *queue; +}; + +static void start_new_worker_func(struct btrfs_work *work) +{ + struct worker_start *start; + start = container_of(work, struct worker_start, work); + btrfs_start_workers(start->queue, 1); + kfree(start); +} + +static int start_new_worker(struct btrfs_workers *queue) +{ + struct worker_start *start; + int ret; + + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return -ENOMEM; + + start->work.func = start_new_worker_func; + start->queue = queue; + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); + if (ret) + kfree(start); + return ret; +} + +/* * helper function to move a thread onto the idle list after it * has finished some requests. */ @@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker) goto out; workers->atomic_start_pending = 0; - if (workers->num_workers >= workers->max_workers) + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) goto out; + workers->num_workers_starting += 1; spin_unlock_irqrestore(&workers->lock, flags); - btrfs_start_workers(workers, 1); + start_new_worker(workers); return; out: @@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers) /* * simple init on struct btrfs_workers */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_helper) { workers->num_workers = 0; + workers->num_workers_starting = 0; INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); @@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) workers->name = name; workers->ordered = 0; workers->atomic_start_pending = 0; - workers->atomic_worker_start = 0; + workers->atomic_worker_start = async_helper; } /* * starts new worker threads. This does not enforce the max worker * count in case you need to temporarily go past it. */ -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +static int __btrfs_start_workers(struct btrfs_workers *workers, + int num_workers) { struct btrfs_worker_thread *worker; int ret = 0; @@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) list_add_tail(&worker->worker_list, &workers->idle_list); worker->idle = 1; workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); spin_unlock_irq(&workers->lock); } return 0; @@ -452,6 +504,14 @@ fail: return ret; } +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + spin_lock_irq(&workers->lock); + workers->num_workers_starting += num_workers; + spin_unlock_irq(&workers->lock); + return __btrfs_start_workers(workers, num_workers); +} + /* * run through the list and find a worker thread that doesn't have a lot * to do right now. This can return null if we aren't yet at the thread @@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; struct list_head *next; - int enforce_min = workers->num_workers < workers->max_workers; + int enforce_min; + + enforce_min = (workers->num_workers + workers->num_workers_starting) < + workers->max_workers; /* * if we find an idle thread, don't move it to the end of the @@ -509,15 +572,17 @@ again: worker = next_worker(workers); if (!worker) { - if (workers->num_workers >= workers->max_workers) { + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) { goto fallback; } else if (workers->atomic_worker_start) { workers->atomic_start_pending = 1; goto fallback; } else { + workers->num_workers_starting++; spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ - btrfs_start_workers(workers, 1); + __btrfs_start_workers(workers, 1); goto again; } } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index fc089b9..5077746 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ struct btrfs_workers { /* current number of running workers */ int num_workers; + int num_workers_starting; + /* max number of workers allowed. changed by btrfs_start_workers */ int max_workers; @@ -78,9 +80,10 @@ struct btrfs_workers { /* * are we allowed to sleep while starting workers or are we required - * to start them at a later time? + * to start them at a later time? If we can't sleep, this indicates + * which queue we need to use to schedule thread creation. */ - int atomic_worker_start; + struct btrfs_workers *atomic_worker_start; /* list with all the work threads. The workers on the idle thread * may be actively servicing jobs, but they haven't yet hit the @@ -109,7 +112,8 @@ struct btrfs_workers { int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_starter); int btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8184f2f..1b920ff 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -907,6 +907,7 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ + struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; struct btrfs_workers endio_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 69dce50..9903f04 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1746,21 +1746,22 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -EINVAL; goto fail_iput; } -printk("thread pool is %d\n", fs_info->thread_pool_size); - /* - * we need to start all the end_io workers up front because the - * queue work function gets called at interrupt time, and so it - * cannot dynamically grow. - */ + + btrfs_init_workers(&fs_info->generic_worker, + "genwork", 1, NULL); + btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size)); + fs_info->thread_pool_size), + &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1774,15 +1775,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); fs_info->delalloc_workers.idle_thresh = 2; fs_info->delalloc_workers.ordered = 1; - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size); + "endio-meta-write", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1794,12 +1800,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; - fs_info->endio_workers.atomic_worker_start = 1; - fs_info->endio_meta_workers.atomic_worker_start = 1; - fs_info->endio_write_workers.atomic_worker_start = 1; - fs_info->endio_meta_write_workers.atomic_worker_start = 1; - btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->generic_worker, 1); btrfs_start_workers(&fs_info->submit_workers, 1); btrfs_start_workers(&fs_info->delalloc_workers, 1); btrfs_start_workers(&fs_info->fixup_workers, 1); @@ -2012,6 +2014,7 @@ fail_chunk_root: free_extent_buffer(chunk_root->node); free_extent_buffer(chunk_root->commit_root); fail_sb_buffer: + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2437,6 +2440,7 @@ int close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 361ad32..cfcc93c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) BUG_ON(!rc->block_group); btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size); + fs_info->thread_pool_size, NULL); rc->extent_root = extent_root; btrfs_prepare_block_group_relocation(extent_root, rc->block_group); @@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) mapping_tree_init(&rc->reloc_root_tree); INIT_LIST_HEAD(&rc->reloc_roots); btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size); + root->fs_info->thread_pool_size, NULL); rc->extent_root = root->fs_info->extent_root; set_reloc_control(rc); -- cgit v1.1 From 1cdda9b81ac0e6ee986f034fa02f221679e1c11a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 6 Oct 2009 10:04:28 -0400 Subject: Btrfs: fix possible softlockup in the allocator Like the cluster allocating stuff, we can lockup the box with the normal allocation path. This happens when we 1) Start to cache a block group that is severely fragmented, but has a decent amount of free space. 2) Start to commit a transaction 3) Have the commit try and empty out some of the delalloc inodes with extents that are relatively large. The inodes will not be able to make the allocations because they will ask for allocations larger than a contiguous area in the free space cache. So we will wait for more progress to be made on the block group, but since we're in a commit the caching kthread won't make any more progress and it already has enough free space that wait_block_group_cache_progress will just return. So, if we wait and fail to make the allocation the next time around, just loop and go to the next block group. This keeps us from getting stuck in a softlockup. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d119c03..2f82fab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4028,6 +4028,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int loop = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; + bool failed_alloc = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -4232,14 +4233,23 @@ refill_cluster: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset && (cached || (!cached && - loop == LOOP_CACHING_NOWAIT))) { - goto loop; - } else if (!offset && (!cached && - loop > LOOP_CACHING_NOWAIT)) { + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { wait_block_group_cache_progress(block_group, - num_bytes + empty_size); + num_bytes + empty_size); + failed_alloc = true; goto have_block_group; + } else if (!offset) { + goto loop; } checks: search_start = stripe_align(root, offset); @@ -4287,6 +4297,7 @@ checks: break; loop: failed_cluster_refill = false; + failed_alloc = false; btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); -- cgit v1.1 From 316d315bffa4026f28085f6b24ebcebede370ac7 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Tue, 6 Oct 2009 20:16:55 +0200 Subject: block: Seperate read and write statistics of in_flight requests v2 Commit a9327cac440be4d8333bba975cbbf76045096275 added seperate read and write statistics of in_flight requests. And exported the number of read and write requests in progress seperately through sysfs. But Corrado Zoccolo reported getting strange output from "iostat -kx 2". Global values for service time and utilization were garbage. For interval values, utilization was always 100%, and service time is higher than normal. So this was reverted by commit 0f78ab9899e9d6acb09d5465def618704255963b The problem was in part_round_stats_single(), I missed the following: if (now == part->stamp) return; - if (part->in_flight) { + if (part_in_flight(part)) { __part_stat_add(cpu, part, time_in_queue, part_in_flight(part) * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); With this chunk included, the reported regression gets fixed. Signed-off-by: Nikanth Karthikesan -- Signed-off-by: Jens Axboe --- fs/partitions/check.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f38fee0..7b685e1 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - p->in_flight, + part_in_flight(p), jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } +ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); +} + #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = { &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif -- cgit v1.1 From 8347a5cdd1422eea0470ed586274c7f29e274b47 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 6 Oct 2009 18:31:29 +0000 Subject: [CIFS] Fixing to avoid invalid kfree() in cifs_get_tcp_session() trivial bug in fs/cifs/connect.c . The bug is caused by fail of extract_hostname() when mounting cifs file system. This is the situation when I noticed this bug. % sudo mount -t cifs //192.168.10.208 mountpoint -o options... Then my kernel says, [ 1461.807776] ------------[ cut here ]------------ [ 1461.807781] kernel BUG at mm/slab.c:521! [ 1461.807784] invalid opcode: 0000 [#2] PREEMPT SMP [ 1461.807790] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:09:02.0/resource [ 1461.807793] CPU 0 [ 1461.807796] Modules linked in: nls_iso8859_1 usbhid sbp2 uhci_hcd ehci_hcd i2c_i801 ohci1394 ieee1394 psmouse serio_raw pcspkr sky2 usbcore evdev [ 1461.807816] Pid: 3446, comm: mount Tainted: G D 2.6.32-rc2-vanilla [ 1461.807820] RIP: 0010:[] [] kfree+0x63/0x156 [ 1461.807829] RSP: 0018:ffff8800b4f7fbb8 EFLAGS: 00010046 [ 1461.807832] RAX: ffffea00033fff98 RBX: ffff8800afbae7e2 RCX: 0000000000000000 [ 1461.807836] RDX: ffffea0000000000 RSI: 000000000000005c RDI: ffffffffffffffea [ 1461.807839] RBP: ffff8800b4f7fbf8 R08: 0000000000000001 R09: 0000000000000000 [ 1461.807842] R10: 0000000000000000 R11: ffff8800b4f7fbf8 R12: 00000000ffffffea [ 1461.807845] R13: ffff8800afb23000 R14: ffff8800b4f87bc0 R15: ffffffffffffffea [ 1461.807849] FS: 00007f52b6f187c0(0000) GS:ffff880007600000(0000) knlGS:0000000000000000 [ 1461.807852] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1461.807855] CR2: 0000000000613000 CR3: 00000000af8f9000 CR4: 00000000000006f0 [ 1461.807858] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1461.807861] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1461.807865] Process mount (pid: 3446, threadinfo ffff8800b4f7e000, task ffff8800950e4380) [ 1461.807867] Stack: [ 1461.807869] 0000000000000202 0000000000000282 ffff8800b4f7fbf8 ffff8800afbae7e2 [ 1461.807876] <0> 00000000ffffffea ffff8800afb23000 ffff8800b4f87bc0 ffff8800b4f7fc28 [ 1461.807884] <0> ffff8800b4f7fcd8 ffffffff81159f6d ffffffff81147bc2 ffffffff816bfb48 [ 1461.807892] Call Trace: [ 1461.807899] [] cifs_get_tcp_session+0x440/0x44b [ 1461.807904] [] ? find_nls+0x1c/0xe9 [ 1461.807909] [] cifs_mount+0x16bc/0x2167 [ 1461.807917] [] ? _spin_unlock+0x30/0x4b [ 1461.807923] [] cifs_get_sb+0xa5/0x1a8 [ 1461.807928] [] vfs_kern_mount+0x56/0xc9 [ 1461.807933] [] do_kern_mount+0x47/0xe7 [ 1461.807938] [] do_mount+0x712/0x775 [ 1461.807943] [] ? copy_mount_options+0xcf/0x132 [ 1461.807948] [] sys_mount+0x7f/0xbf [ 1461.807953] [] ? lockdep_sys_exit_thunk+0x35/0x67 [ 1461.807960] [] system_call_fastpath+0x16/0x1b [ 1461.807963] Code: 00 00 00 00 ea ff ff 48 c1 e8 0c 48 6b c0 68 48 01 d0 66 83 38 00 79 04 48 8b 40 10 66 83 38 00 79 04 48 8b 40 10 80 38 00 78 04 <0f> 0b eb fe 4c 8b 70 58 4c 89 ff 41 8b 76 4c e8 b8 49 fb ff e8 [ 1461.808022] RIP [] kfree+0x63/0x156 [ 1461.808027] RSP [ 1461.808031] ---[ end trace ffe26fcdc72c0ce4 ]--- The reason of this bug is that the error handling code of cifs_get_tcp_session() calls kfree() when corresponding kmalloc() failed. (The kmalloc() is called by extract_hostname().) Signed-off-by: Hitoshi Mitake CC: Stable Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 43003e0..b090980 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1577,7 +1577,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info) out_err: if (tcp_ses) { - kfree(tcp_ses->hostname); + if (!IS_ERR(tcp_ses->hostname)) + kfree(tcp_ses->hostname); if (tcp_ses->ssocket) sock_release(tcp_ses->ssocket); kfree(tcp_ses); -- cgit v1.1 From c5811dbdd26284d63c19fca618bd740dd10ad53d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:40:15 -0400 Subject: NFS: Fix a default mount regression... With the recent spate of changes, the nfs protocol version will now default to 2 instead of 3, while the mount protocol version defaults to 3. The following patch should ensure the defaults are consistent with the previous defaults of vers=3,proto=tcp,mountvers=3,mountproto=tcp. This fixes the bug http://bugzilla.kernel.org/show_bug.cgi?id=14259 Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29786d3..0343ebc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -728,22 +728,24 @@ static void nfs_umount_begin(struct super_block *sb) unlock_kernel(); } -static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags) +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) { struct nfs_parsed_mount_data *data; data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { - data->flags = flags; data->rsize = NFS_MAX_FILE_IO_SIZE; data->wsize = NFS_MAX_FILE_IO_SIZE; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; data->acdirmax = NFS_DEF_ACDIRMAX; + data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; + data->nfs_server.protocol = XPRT_TRANSPORT_TCP; data->auth_flavors[0] = RPC_AUTH_UNIX; data->auth_flavor_len = 1; + data->version = version; data->minorversion = 0; } return data; @@ -1711,8 +1713,6 @@ static int nfs_validate_mount_data(void *options, if (!(data->flags & NFS_MOUNT_TCP)) args->nfs_server.protocol = XPRT_TRANSPORT_UDP; - else - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; /* N.B. caller will free nfs_server.hostname in all cases */ args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); args->namlen = data->namlen; @@ -2106,7 +2106,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP); + data = nfs_alloc_parsed_mount_data(3); mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); if (data == NULL || mntfh == NULL) goto out_free_fh; @@ -2376,7 +2376,6 @@ static int nfs4_validate_mount_data(void *options, if (data == NULL) goto out_no_data; - args->version = 4; switch (data->version) { case 1: if (data->host_addrlen > sizeof(args->nfs_server.address)) @@ -2660,7 +2659,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, struct nfs_parsed_mount_data *data; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(0); + data = nfs_alloc_parsed_mount_data(4); if (data == NULL) goto out_free_data; -- cgit v1.1 From f5855fecda65c1965c894915ace1e086d4925154 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:40:37 -0400 Subject: NFS: Fix port and mountport display in /proc/self/mountinfo Currently, the port and mount port will both display as 65535 if you do not specify a port number. That would be wrong... Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0343ebc..0d14704 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -778,15 +778,13 @@ static int nfs_verify_server_address(struct sockaddr *addr) * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. */ -static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, +static void nfs_set_port(struct sockaddr *sap, int *port, const unsigned short default_port) { - unsigned short port = default_port; + if (*port == NFS_UNSPEC_PORT) + *port = default_port; - if (parsed_port != NFS_UNSPEC_PORT) - port = parsed_port; - - rpc_set_port(sap, port); + rpc_set_port(sap, *port); } /* @@ -1477,7 +1475,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, args->mount_server.addrlen = args->nfs_server.addrlen; } request.salen = args->mount_server.addrlen; - nfs_set_default_port(request.sap, args->mount_server.port, 0); + nfs_set_port(request.sap, &args->mount_server.port, 0); /* * Now ask the mount server to map our export path @@ -1767,7 +1765,7 @@ static int nfs_validate_mount_data(void *options, goto out_v4_not_compiled; #endif - nfs_set_default_port(sap, args->nfs_server.port, 0); + nfs_set_port(sap, &args->nfs_server.port, 0); nfs_set_mount_transport_protocol(args); @@ -2331,7 +2329,7 @@ static int nfs4_validate_text_mount_data(void *options, { struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; - nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); + nfs_set_port(sap, &args->nfs_server.port, NFS_PORT); nfs_validate_transport_protocol(args); -- cgit v1.1 From bcd2ea17da6a329a7276cde7286d802f009af332 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:41:22 -0400 Subject: NFS: Fix port initialisation in nfs_remount() The recent changeset 53a0b9c4c99ab0085a06421f71592722e5b3fd5f (NFS: Replace nfs_parse_ip_address() with rpc_pton()) broke nfs_remount, since the call to rpc_pton() will zero out the port number in data->nfs_server.address. This is actually due to a bug in nfs_remount: it should be looking at the port number in nfs_server.port instead... This fixes bug http://bugzilla.kernel.org/show_bug.cgi?id=14276 Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0d14704..fb3b280 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1846,9 +1846,10 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->acdirmin != nfss->acdirmin / HZ || data->acdirmax != nfss->acdirmax / HZ || data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.port != nfss->port || data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || - memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, - data->nfs_server.addrlen) != 0) + !rpc_cmp_addr(&data->nfs_server.address, + &nfss->nfs_client->cl_addr)) return -EINVAL; return 0; @@ -1891,6 +1892,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->acdirmin = nfss->acdirmin / HZ; data->acdirmax = nfss->acdirmax / HZ; data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.port = nfss->port; data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); -- cgit v1.1 From f4373bf9e67e4a653c8854acd7b02dac9714c98a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 6 Oct 2009 15:42:18 -0400 Subject: nfs: Avoid overrun when copying client IP address string As seen in , nfs4_init_client() can overrun the source string when copying the client IP address from nfs_parsed_mount_data::client_address to nfs_client::cl_ipaddr. Since these are both treated as null-terminated strings elsewhere, the copy should be done with strlcpy() not memcpy(). Signed-off-by: Ben Hutchings Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 63976c0..99ea196 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1180,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp, 1, flags & NFS_MOUNT_NORESVPORT); if (error < 0) goto error; - memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); error = nfs_idmap_new(clp); if (error < 0) { -- cgit v1.1 From 517be09def6cd7bc231222ee756fde8ea245a6fe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:42:20 -0400 Subject: NFSv4: Fix the referral mount code Fix a typo which causes try_location() to use the wrong length argument when calling nfs_parse_server_name(). This again, causes the initialisation of the mount's sockaddr structure to fail. Also ensure that if nfs4_pathname_string() returns an error, then we pass that error back up the stack instead of ENOENT. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2636c26..fa3408f 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) - return mnt; + return ERR_CAST(mnt_path); mountdata->mnt_path = mnt_path; maxbuflen = mnt_path - 1 - page2; @@ -132,15 +132,15 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (buf->len <= 0 || buf->len >= maxbuflen) continue; - mountdata->addr = (struct sockaddr *)&addr; - if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) continue; - mountdata->addrlen = nfs_parse_server_name(buf->data, - buf->len, - mountdata->addr, mountdata->addrlen); + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, + (struct sockaddr *)&addr, sizeof(addr)); if (mountdata->addrlen == 0) continue; + + mountdata->addr = (struct sockaddr *)&addr; rpc_set_port(mountdata->addr, NFS_PORT); memcpy(page2, buf->data, buf->len); -- cgit v1.1 From 4055e97318809638a57fbe1746b93bc7a90ef0d3 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 7 Oct 2009 16:32:24 -0700 Subject: fs: includecheck fix: proc, kcore.c fix the following 'make includecheck' warning: fs/proc/kcore.c: linux/mm.h is included more than once. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 5601337..a44a789 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include -- cgit v1.1 From 253fb02d62571e5455eedc9e39b9d660e86a40f0 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:27 -0700 Subject: pagemap: export KPF_HWPOISON This flag indicates a hardware detected memory corruption on the page. Any future access of the page data may bring down the machine. Signed-off-by: Wu Fengguang Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 2281c2c..5033ce0 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -94,6 +94,7 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_COMPOUND_TAIL 16 #define KPF_HUGE 17 #define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 #define KPF_NOPAGE 20 #define KPF_KSM 21 @@ -180,6 +181,10 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif -- cgit v1.1 From 3050141bae57984dd660e6861632ccf9b8bca77e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 8 Oct 2009 11:50:55 -0400 Subject: NFSv4: Kill nfs4_renewd_prepare_shutdown() The NFSv4 renew daemon is shared between all active super blocks that refer to a particular NFS server, so it is wrong to be shutting it down in nfs4_kill_super every time a super block is destroyed. This patch therefore kills nfs4_renewd_prepare_shutdown altogether, and leaves it up to nfs4_shutdown_client() to also shut down the renew daemon by means of the existing call to nfs4_kill_renewd(). Signed-off-by: Trond Myklebust --- fs/nfs/nfs4renewd.c | 6 ------ fs/nfs/super.c | 1 - 2 files changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index e27c6ce..0156c01 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -127,12 +127,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) } void -nfs4_renewd_prepare_shutdown(struct nfs_server *server) -{ - cancel_delayed_work(&server->nfs_client->cl_renewd); -} - -void nfs4_kill_renewd(struct nfs_client *clp) { cancel_delayed_work_sync(&clp->cl_renewd); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index fb3b280..6dabf6f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2689,7 +2689,6 @@ static void nfs4_kill_super(struct super_block *sb) dprintk("--> %s\n", __func__); nfs_super_return_all_delegations(sb); kill_anon_super(sb); - nfs4_renewd_prepare_shutdown(server); nfs_fscache_release_super_cookie(sb); nfs_free_server(server); dprintk("<-- %s\n", __func__); -- cgit v1.1 From 664fc5a4e7d0d7f3487e5c856b79f7dac79567fd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Sep 2009 13:34:20 -0700 Subject: ecryptfs: depends on CRYPTO ecryptfs uses crypto APIs so it should depend on CRYPTO. Otherwise many build errors occur. [63 lines not pasted] Signed-off-by: Randy Dunlap Cc: Andrew Morton Cc: ecryptfs-devel@lists.launchpad.net Signed-off-by: Tyler Hicks --- fs/ecryptfs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 8aadb99..89ebed3 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,6 +1,6 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && NET + depends on EXPERIMENTAL && KEYS && NET && CRYPTO select CRYPTO_ECB select CRYPTO_CBC help -- cgit v1.1 From ed1f21857e76a92a006e0f890a3d7f72953b1469 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 29 Sep 2009 02:33:59 -0500 Subject: eCryptfs: Remove Kconfig NET dependency and select MD5 eCryptfs no longer uses a netlink interface to communicate with ecryptfsd, so NET is not a valid dependency anymore. MD5 is required and must be built for eCryptfs to be of any use. Signed-off-by: Tyler Hicks --- fs/ecryptfs/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 89ebed3..1cd6d9d 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,8 +1,9 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && NET && CRYPTO + depends on EXPERIMENTAL && KEYS && CRYPTO select CRYPTO_ECB select CRYPTO_CBC + select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See to learn more about -- cgit v1.1 From 36520be8e32b49bd85a63b7b8b40cd07c3da59a5 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Mon, 5 Oct 2009 14:25:44 -0400 Subject: ima: ecryptfs fix imbalance message The unencrypted files are being measured. Update the counters to get rid of the ecryptfs imbalance message. (http://bugzilla.redhat.com/519737) Reported-by: Sachin Garg Cc: Eric Paris Cc: Dustin Kirkland Cc: James Morris Cc: David Safford Cc: stable@kernel.org Signed-off-by: Mimi Zohar Signed-off-by: Tyler Hicks --- fs/ecryptfs/main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 101fe4c..c6ac85d 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" /** @@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) const struct cred *cred = current_cred(); struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); + int opened_lower_file = 0; int rc = 0; mutex_lock(&inode_info->lower_file_mutex); @@ -134,9 +136,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", lower_dentry, lower_mnt, rc); inode_info->lower_file = NULL; - } + } else + opened_lower_file = 1; } mutex_unlock(&inode_info->lower_file_mutex); + if (opened_lower_file) + ima_counts_get(inode_info->lower_file); return rc; } -- cgit v1.1 From f9581b1443abac50c90168301d40a7734b13a5dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Oct 2009 20:29:26 +0000 Subject: xfs: implement ->dirty_inode to fix timestamp handling This is picking up on Felix's repost of Dave's patch to implement a .dirty_inode method. We really need this notification because the VFS keeps writing directly into the inode structure instead of going through methods to update this state. In addition to the long-known atime issue we now also have a caller in VM code that updates c/mtime that way for shared writeable mmaps. And I found another one that no one has noticed in practice in the FIFO code. So implement ->dirty_inode to set i_update_core whenever the inode gets externally dirtied, and switch the c/mtime handling to the same scheme we already use for atime (always picking up the value from the Linux inode). Note that this patch also removes the xfs_synchronize_atime call in xfs_reclaim it was superflous as we already synchronize the time when writing the inode via the log (xfs_inode_item_format) or the normal buffers (xfs_iflush_int). In addition also remove the I_CLEAR check before copying the Linux timestamps - now that we always have the Linux inode available we can always use the timestamps in it. Also switch to just using file_update_time for regular reads/writes - that will get us all optimization done to it for free and make sure we notice early when it breaks. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_aops.c | 1 - fs/xfs/linux-2.6/xfs_iops.c | 41 +++++++++++++++-------------------------- fs/xfs/linux-2.6/xfs_lrw.c | 2 +- fs/xfs/linux-2.6/xfs_super.c | 23 +++++++++++++++++++++++ fs/xfs/xfs_dfrag.c | 8 ++++---- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_inode_item.c | 18 +++++++++++------- fs/xfs/xfs_itable.c | 21 +++++++++++++-------- fs/xfs/xfs_vnodeops.c | 6 ------ 10 files changed, 70 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index d5e5559..40d2226 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -215,7 +215,6 @@ xfs_setfilesize( if (ip->i_d.di_size < isize) { ip->i_d.di_size = isize; - ip->i_update_core = 1; xfs_mark_inode_dirty_sync(ip); } diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 626b474..cdf7114 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -57,19 +57,22 @@ #include /* - * Bring the atime in the XFS inode uptodate. - * Used before logging the inode to disk or when the Linux inode goes away. + * Bring the timestamps in the XFS inode uptodate. + * + * Used before writing the inode to disk. */ void -xfs_synchronize_atime( +xfs_synchronize_times( xfs_inode_t *ip) { struct inode *inode = VFS_I(ip); - if (!(inode->i_state & I_CLEAR)) { - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - } + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; + ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; } /* @@ -106,32 +109,20 @@ xfs_ichgtime( if ((flags & XFS_ICHGTIME_MOD) && !timespec_equal(&inode->i_mtime, &tv)) { inode->i_mtime = tv; - ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } if ((flags & XFS_ICHGTIME_CHG) && !timespec_equal(&inode->i_ctime, &tv)) { inode->i_ctime = tv; - ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } /* - * We update the i_update_core field _after_ changing - * the timestamps in order to coordinate properly with - * xfs_iflush() so that we don't lose timestamp updates. - * This keeps us from having to hold the inode lock - * while doing this. We use the SYNCHRONIZE macro to - * ensure that the compiler does not reorder the update - * of i_update_core above the timestamp updates above. + * Update complete - now make sure everyone knows that the inode + * is dirty. */ - if (sync_it) { - SYNCHRONIZE(); - ip->i_update_core = 1; + if (sync_it) xfs_mark_inode_dirty_sync(ip); - } } /* @@ -514,10 +505,8 @@ xfs_vn_getattr( stat->gid = ip->i_d.di_gid; stat->ino = ip->i_ino; stat->atime = inode->i_atime; - stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; - stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec; - stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 49e4a6a..072050f 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -667,7 +667,7 @@ start: xip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) - xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + file_update_time(file); /* * If the offset is beyond the size of the file, we have a couple diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 5d7c60a..1ea65b6 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -977,6 +977,28 @@ xfs_fs_inode_init_once( } /* + * Dirty the XFS inode when mark_inode_dirty_sync() is called so that + * we catch unlogged VFS level updates to the inode. Care must be taken + * here - the transaction code calls mark_inode_dirty_sync() to mark the + * VFS inode dirty in a transaction and clears the i_update_core field; + * it must clear the field after calling mark_inode_dirty_sync() to + * correctly indicate that the dirty state has been propagated into the + * inode log item. + * + * We need the barrier() to maintain correct ordering between unlogged + * updates and the transaction commit code that clears the i_update_core + * field. This requires all updates to be completed before marking the + * inode dirty. + */ +STATIC void +xfs_fs_dirty_inode( + struct inode *inode) +{ + barrier(); + XFS_I(inode)->i_update_core = 1; +} + +/* * Attempt to flush the inode, this will actually fail * if the inode is pinned, but we dirty the inode again * at the point when it is unpinned after a log write, @@ -1539,6 +1561,7 @@ xfs_fs_get_sb( static struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 7465f9e..ab89a7e 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -206,10 +206,10 @@ xfs_swap_extents( * process that the file was not changed out from * under it. */ - if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) || - (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) || - (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || - (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = XFS_ERROR(EBUSY); goto out_unlock; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c1dc7ef..b92a4fa 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3068,9 +3068,9 @@ xfs_iflush_int( SYNCHRONIZE(); /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); + xfs_synchronize_times(ip); if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0b38b9a..41555de 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -504,7 +504,7 @@ void xfs_ichgtime(xfs_inode_t *, int); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); -void xfs_synchronize_atime(xfs_inode_t *); +void xfs_synchronize_times(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); #if defined(XFS_INODE_TRACE) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 47d5b66..9794b87 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -232,6 +232,15 @@ xfs_inode_item_format( nvecs = 1; /* + * Make sure the linux inode is dirty. We do this before + * clearing i_update_core as the VFS will call back into + * XFS here and set i_update_core, so we need to dirty the + * inode first so that the ordering of i_update_core and + * unlogged modifications still works as described below. + */ + xfs_mark_inode_dirty_sync(ip); + + /* * Clear i_update_core if the timestamps (or any other * non-transactional modification) need flushing/logging * and we're about to log them with the rest of the core. @@ -263,14 +272,9 @@ xfs_inode_item_format( } /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); - - /* - * make sure the linux inode is dirty - */ - xfs_mark_inode_dirty_sync(ip); + xfs_synchronize_times(ip); vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index b68f910..62efab2 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -59,6 +59,7 @@ xfs_bulkstat_one_iget( { xfs_icdinode_t *dic; /* dinode core info pointer */ xfs_inode_t *ip; /* incore inode pointer */ + struct inode *inode; int error; error = xfs_iget(mp, NULL, ino, @@ -72,6 +73,7 @@ xfs_bulkstat_one_iget( ASSERT(ip->i_imap.im_blkno != 0); dic = &ip->i_d; + inode = VFS_I(ip); /* xfs_iget returns the following without needing * further change. @@ -83,16 +85,19 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; + /* - * We are reading the atime from the Linux inode because the - * dinode might not be uptodate. + * We need to read the timestamps from the Linux inode because + * the VFS keeps writing directly into the inode structure instead + * of telling us about the updates. */ - buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec; - buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec; - buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; - buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; - buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; - buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + buf->bs_atime.tv_sec = inode->i_atime.tv_sec; + buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; + buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; + buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index a434f28..b572f7e 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2476,12 +2476,6 @@ xfs_reclaim( ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); /* - * Make sure the atime in the XFS inode is correct before freeing the - * Linux inode. - */ - xfs_synchronize_atime(ip); - - /* * If we have nothing to flush with this inode then complete the * teardown now, otherwise break the link between the xfs inode and the * linux inode and clean up the xfs inode later. This avoids flushing -- cgit v1.1 From c90b07e8dd9f263d2e2af1d9648ba269f4d0d8fd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:27 +0000 Subject: xfs: fix xfs_quiesce_data We need to do a synchronous xfs_sync_fsdata to make sure the superblock actually is on disk when we return. Also remove SYNC_BDFLUSH flag to xfs_sync_inodes because that particular flag is never checked. Move xfs_filestream_flush call later to only release inodes after they have been written out. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 320be6a..7647b8d 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -419,14 +419,16 @@ xfs_quiesce_data( /* push non-blocking */ xfs_sync_data(mp, 0); xfs_qm_sync(mp, SYNC_TRYLOCK); - xfs_filestream_flush(mp); - /* push and block */ + /* push and block till complete */ xfs_sync_data(mp, SYNC_WAIT); xfs_qm_sync(mp, SYNC_WAIT); + /* drop inode references pinned by filestreams */ + xfs_filestream_flush(mp); + /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp, 0); + error = xfs_sync_fsdata(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) -- cgit v1.1 From 69961a26b82931601e07c9e3656bd90c598f2395 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Oct 2009 20:29:28 +0000 Subject: xfs: cleanup ->sync_fs Sort out ->sync_fs to not perform a superblock writeback for the wait = 0 case as that is just an optional first pass and the superblock will be written back properly in the next call with wait = 1. Instead perform an opportunistic quota writeback to have less work later. Also remove the freeze special case as we do a proper wait = 1 call in the freeze code anyway. Also rename the function to xfs_fs_sync_fs to match the normal naming convention, update comments and avoid calling into the laptop_mode logic on an error. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_super.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1ea65b6..eefea0d 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1148,7 +1148,7 @@ xfs_fs_put_super( } STATIC int -xfs_fs_sync_super( +xfs_fs_sync_fs( struct super_block *sb, int wait) { @@ -1156,23 +1156,23 @@ xfs_fs_sync_super( int error; /* - * Treat a sync operation like a freeze. This is to work - * around a race in sync_inodes() which works in two phases - * - an asynchronous flush, which can write out an inode - * without waiting for file size updates to complete, and a - * synchronous flush, which wont do anything because the - * async flush removed the inode's dirty flag. Also - * sync_inodes() will not see any files that just have - * outstanding transactions to be flushed because we don't - * dirty the Linux inode until after the transaction I/O - * completes. + * Not much we can do for the first async pass. Writing out the + * superblock would be counter-productive as we are going to redirty + * when writing out other data and metadata (and writing out a single + * block is quite fast anyway). + * + * Try to asynchronously kick off quota syncing at least. */ - if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) - error = xfs_quiesce_data(mp); - else - error = xfs_sync_fsdata(mp, 0); + if (!wait) { + xfs_qm_sync(mp, SYNC_TRYLOCK); + return 0; + } + + error = xfs_quiesce_data(mp); + if (error) + return -error; - if (unlikely(laptop_mode)) { + if (laptop_mode) { int prev_sync_seq = mp->m_sync_seq; /* @@ -1191,7 +1191,7 @@ xfs_fs_sync_super( mp->m_sync_seq != prev_sync_seq); } - return -error; + return 0; } STATIC int @@ -1565,7 +1565,7 @@ static struct super_operations xfs_super_operations = { .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, - .sync_fs = xfs_fs_sync_super, + .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, -- cgit v1.1 From 932640e8adaff3d0c28ee32d4863c7a7a74df055 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:29 +0000 Subject: xfs: mark inodes dirty before issuing I/O To make sure they get properly waited on in sync when I/O is in flight and we latter need to update the inode size. Requires a new helper to check if an ioend structure is beyond the current EOF. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_aops.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 40d2226..6a2910e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -186,19 +186,37 @@ xfs_destroy_ioend( } /* + * If the end of the current ioend is beyond the current EOF, + * return the new EOF value, otherwise zero. + */ +STATIC xfs_fsize_t +xfs_ioend_new_eof( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + xfs_fsize_t bsize; + + bsize = ioend->io_offset + ioend->io_size; + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); + return isize > ip->i_d.di_size ? isize : 0; +} + +/* * Update on-disk file size now that data has been written to disk. * The current in-memory file size is i_size. If a write is beyond * eof i_new_size will be the intended file size until i_size is * updated. If this write does not extend all the way to the valid * file size then restrict this update to the end of the write. */ + STATIC void xfs_setfilesize( xfs_ioend_t *ioend) { xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - xfs_fsize_t bsize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); ASSERT(ioend->io_type != IOMAP_READ); @@ -206,14 +224,9 @@ xfs_setfilesize( if (unlikely(ioend->io_error)) return; - bsize = ioend->io_offset + ioend->io_size; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - isize = MAX(ip->i_size, ip->i_new_size); - isize = MIN(isize, bsize); - - if (ip->i_d.di_size < isize) { + isize = xfs_ioend_new_eof(ioend); + if (isize) { ip->i_d.di_size = isize; xfs_mark_inode_dirty_sync(ip); } @@ -403,10 +416,16 @@ xfs_submit_ioend_bio( struct bio *bio) { atomic_inc(&ioend->io_remaining); - bio->bi_private = ioend; bio->bi_end_io = xfs_end_bio; + /* + * If the I/O is beyond EOF we mark the inode dirty immediately + * but don't update the inode size until I/O completion. + */ + if (xfs_ioend_new_eof(ioend)) + xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); + submit_bio(WRITE, bio); ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); bio_put(bio); -- cgit v1.1 From dce5065a57c158e0ca5db8e63c50118b2ecf4ac5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:30 +0000 Subject: xfs: make sure xfs_sync_fsdata covers the log We want to always cover the log after writing out the superblock, and in case of a synchronous writeout make sure we actually wait for the log to be covered. That way a filesystem that has been sync()ed can be considered clean by log recovery. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 7647b8d..961df0a 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -309,11 +309,15 @@ xfs_sync_attr( STATIC int xfs_commit_dummy_trans( struct xfs_mount *mp, - uint log_flags) + uint flags) { struct xfs_inode *ip = mp->m_rootip; struct xfs_trans *tp; int error; + int log_flags = XFS_LOG_FORCE; + + if (flags & SYNC_WAIT) + log_flags |= XFS_LOG_SYNC; /* * Put a dummy transaction in the log to tell recovery @@ -331,13 +335,12 @@ xfs_commit_dummy_trans( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* XXX(hch): ignoring the error here.. */ error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* the log force ensures this transaction is pushed to disk */ xfs_log_force(mp, 0, log_flags); - return 0; + return error; } int @@ -385,7 +388,20 @@ xfs_sync_fsdata( else XFS_BUF_ASYNC(bp); - return xfs_bwrite(mp, bp); + error = xfs_bwrite(mp, bp); + if (error) + return error; + + /* + * If this is a data integrity sync make sure all pending buffers + * are flushed out for the log coverage check below. + */ + if (flags & SYNC_WAIT) + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + if (xfs_log_need_covered(mp)) + error = xfs_commit_dummy_trans(mp, flags); + return error; out_brelse: xfs_buf_relse(bp); @@ -572,8 +588,6 @@ xfs_sync_worker( /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); -- cgit v1.1 From 8e69ce147127a0235e8d1f2b75ea214be78c61b3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 25 Sep 2009 19:42:26 +0000 Subject: fix readahead calculations in xfs_dir2_leaf_getdents() This is for bug #850, http://oss.sgi.com/bugzilla/show_bug.cgi?id=850 XFS file system segfaults , repeatedly and 100% reproducable in 2.6.30 , 2.6.31 The above only showed up on a CONFIG_XFS_DEBUG=y kernel, because xfs_bmapi() ASSERTs that it has been asked for at least one map, and it was getting 0. The root cause is that our guesstimated "bufsize" from xfs_file_readdir was fairly small, and the bufsize -= length; in the loop was going negative - except bufsize is a size_t, so it was wrapping to a very large number. Then when we did ra_want = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize) - 1; with that very large number, the (int) ra_want was coming out negative, and a subsequent compare: if (1 + ra_want > map_blocks ... was coming out -true- (negative int compare w/ uint) and we went back to xfs_bmapi() for more, even though we did not need more, and asked for 0 maps, and hit the ASSERT. We have kind of a type mess here, but just keeping bufsize from going negative is probably sufficient to avoid the problem. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/xfs_dir2_leaf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index fa913e45..41ad537 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents( */ ra_want = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize) - 1; + ASSERT(ra_want >= 0); /* * If we don't have as many as we want, and we haven't @@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents( */ ptr += length; curoff += length; - bufsize -= length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; } /* -- cgit v1.1 From d0800703febc04827b8fa91921aa4e254d01e8d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 26 Sep 2009 19:55:04 +0000 Subject: xfs: stop calling filemap_fdatawait inside ->fsync Now that the VFS actually waits for the data I/O to complete before calling into ->fsync we can stop doing it ourselves. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_file.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 988d8f8..6d65baf 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -176,14 +176,7 @@ xfs_file_fsync( struct dentry *dentry, int datasync) { - struct inode *inode = dentry->d_inode; - struct xfs_inode *ip = XFS_I(inode); - int error; - - /* capture size updates in I/O completion before writing the inode. */ - error = filemap_fdatawait(inode->i_mapping); - if (error) - return error; + struct xfs_inode *ip = XFS_I(dentry->d_inode); xfs_iflags_clear(ip, XFS_ITRUNCATED); return -xfs_fsync(ip); -- cgit v1.1 From a791e35e12ff672e8a0e140abeeaf900c3b2ea77 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 8 Oct 2009 11:27:10 -0400 Subject: Btrfs: cleanup extent_clear_unlock_delalloc flags extent_clear_unlock_delalloc has a growing set of ugly parameters that is very difficult to read and maintain. This switches to a flag field and well named flag defines. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 27 +++++++++++---------------- fs/btrfs/extent_io.h | 16 ++++++++++------ fs/btrfs/inode.c | 45 ++++++++++++++++++++++++++++++--------------- 3 files changed, 51 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index de1793b..f9708bd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1401,12 +1401,7 @@ out_failed: int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int unlock_pages, - int clear_unlock, - int clear_delalloc, int clear_dirty, - int set_writeback, - int end_writeback, - int set_private2) + unsigned long op) { int ret; struct page *pages[16]; @@ -1416,17 +1411,17 @@ int extent_clear_unlock_delalloc(struct inode *inode, int i; int clear_bits = 0; - if (clear_unlock) + if (op & EXTENT_CLEAR_UNLOCK) clear_bits |= EXTENT_LOCKED; - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_bits |= EXTENT_DIRTY; - if (clear_delalloc) + if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(unlock_pages || clear_dirty || set_writeback || end_writeback || - set_private2)) + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK | EXTENT_SET_PRIVATE2))) return 0; while (nr_pages > 0) { @@ -1435,20 +1430,20 @@ int extent_clear_unlock_delalloc(struct inode *inode, nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { - if (set_private2) + if (op & EXTENT_SET_PRIVATE2) SetPagePrivate2(pages[i]); if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; } - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_page_dirty_for_io(pages[i]); - if (set_writeback) + if (op & EXTENT_SET_WRITEBACK) set_page_writeback(pages[i]); - if (end_writeback) + if (op & EXTENT_END_WRITEBACK) end_page_writeback(pages[i]); - if (unlock_pages) + if (op & EXTENT_CLEAR_UNLOCK_PAGE) unlock_page(pages[i]); page_cache_release(pages[i]); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4794ec8..41d2a47 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -25,6 +25,15 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 +/* these are flags for extent_clear_unlock_delalloc */ +#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 +#define EXTENT_CLEAR_UNLOCK 0x2 +#define EXTENT_CLEAR_DELALLOC 0x4 +#define EXTENT_CLEAR_DIRTY 0x8 +#define EXTENT_SET_WRITEBACK 0x10 +#define EXTENT_END_WRITEBACK 0x20 +#define EXTENT_SET_PRIVATE2 0x40 + /* * page->private values. Every page that is controlled by the extent * map has page->private set to one. @@ -288,10 +297,5 @@ int extent_range_uptodate(struct extent_io_tree *tree, int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int unlock_page, - int clear_unlock, - int clear_delalloc, int clear_dirty, - int set_writeback, - int end_writeback, - int set_private2); + unsigned long op); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3a6f953..a7058fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -424,9 +424,10 @@ again: * and free up our temp pages. */ extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, 1, 0, - 0, 1, 1, 1, 0); + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); ret = 0; goto free_pages_out; } @@ -637,11 +638,13 @@ static noinline int submit_compressed_extents(struct inode *inode, * clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - NULL, 1, 1, 0, 1, 1, 0, 0); + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); ret = btrfs_submit_compressed_write(inode, async_extent->start, @@ -712,9 +715,14 @@ static noinline int cow_file_range(struct inode *inode, start, end, 0, NULL); if (ret == 0) { extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, 1, 1, - 1, 1, 1, 1, 0); + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; @@ -738,6 +746,8 @@ static noinline int cow_file_range(struct inode *inode, btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { + unsigned long op; + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, @@ -789,10 +799,13 @@ static noinline int cow_file_range(struct inode *inode, * Do set the Private2 bit so we know this page was properly * setup for writepage */ + op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; + op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2; + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, start + ram_size - 1, - locked_page, unlock, 1, - 1, 0, 0, 0, 1); + locked_page, op); disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; @@ -1112,8 +1125,10 @@ out_check: BUG_ON(ret); extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - cur_offset, cur_offset + num_bytes - 1, - locked_page, 1, 1, 1, 0, 0, 0, 1); + cur_offset, cur_offset + num_bytes - 1, + locked_page, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2); cur_offset = extent_end; if (cur_offset > end) break; -- cgit v1.1 From a3429ab70b04363c6190964e82f04f44f3e34cf0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 8 Oct 2009 12:30:20 -0400 Subject: Btrfs: delay clearing EXTENT_DELALLOC for compressed extents When compression is on, the cow_file_range code is farmed off to worker threads. This allows us to do significant CPU work in parallel on SMP machines. But it is a delicate balance around when we clear flags and how. In the past we cleared the delalloc flag immediately, which was safe because the pages stayed locked. But this is causing problems with the newest ENOSPC code, and with the recent extent state cleanups we can now clear the delalloc bit at the same time the uncompressed code does. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7058fc..401dfb2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -427,6 +427,7 @@ again: &BTRFS_I(inode)->io_tree, start, end, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); ret = 0; goto free_pages_out; @@ -644,6 +645,7 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->ram_size - 1, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); ret = btrfs_submit_compressed_write(inode, @@ -877,8 +879,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | - EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, + 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); async_cow->inode = inode; -- cgit v1.1 From 32c00aff718bb54a214b39146bdd9ac01511cd25 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Oct 2009 13:34:05 -0400 Subject: Btrfs: release delalloc reservations on extent item insertion This patch fixes an issue with the delalloc metadata space reservation code. The problem is we used to free the reservation as soon as we allocated the delalloc region. The problem with this is if we are not inserting an inline extent, we don't actually insert the extent item until after the ordered extent is written out. This patch does 3 things, 1) It moves the reservation clearing stuff into the ordered code, so when we remove the ordered extent we remove the reservation. 2) It adds a EXTENT_DO_ACCOUNTING flag that gets passed when we clear delalloc bits in the cases where we want to clear the metadata reservation when we clear the delalloc extent, in the case that we do an inline extent or we invalidate the page. 3) It adds another waitqueue to the space info so that when we start a fs wide delalloc flush, anybody else who also hits that area will simply wait for the flush to finish and then try to make their allocation. This has been tested thoroughly to make sure we did not regress on performance. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 12 ++++++----- fs/btrfs/ctree.h | 3 +++ fs/btrfs/extent-tree.c | 54 +++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/extent_io.c | 19 +++++++++++------ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 45 ++++++++++++++++++++++++++++------------- fs/btrfs/ordered-data.c | 6 ++++++ 8 files changed, 107 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a54d354..c71abec 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -128,12 +128,14 @@ struct btrfs_inode { u64 last_unlink_trans; /* - * These two counters are for delalloc metadata reservations. We keep - * track of how many extents we've accounted for vs how many extents we - * have. + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. */ - int delalloc_reserved_extents; - int delalloc_extents; + spinlock_t accounting_lock; + int reserved_extents; + int outstanding_extents; /* * ordered_data_close is set by truncate when a file that used diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1b920ff..dbdada5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -699,6 +699,9 @@ struct btrfs_space_info { int allocating_chunk; wait_queue_head_t wait; + + int flushing; + wait_queue_head_t flush_wait; }; /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2f82fab..3d1be0b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2823,14 +2823,17 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, num_items); spin_lock(&meta_sinfo->lock); - if (BTRFS_I(inode)->delalloc_reserved_extents <= - BTRFS_I(inode)->delalloc_extents) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + if (BTRFS_I(inode)->reserved_extents <= + BTRFS_I(inode)->outstanding_extents) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); spin_unlock(&meta_sinfo->lock); return 0; } + spin_unlock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->delalloc_reserved_extents--; - BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0); + BTRFS_I(inode)->reserved_extents--; + BUG_ON(BTRFS_I(inode)->reserved_extents < 0); if (meta_sinfo->bytes_delalloc < num_bytes) { bug = true; @@ -2863,6 +2866,37 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) meta_sinfo->force_delalloc = 0; } +static void flush_delalloc(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + bool wait = false; + + spin_lock(&info->lock); + + if (!info->flushing) { + info->flushing = 1; + init_waitqueue_head(&info->flush_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_event(info->flush_wait, + !info->flushing); + return; + } + + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); +} + static int maybe_allocate_chunk(struct btrfs_root *root, struct btrfs_space_info *info) { @@ -2980,21 +3014,20 @@ again: filemap_flush(inode->i_mapping); goto again; } else if (flushed == 3) { - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + flush_delalloc(root, meta_sinfo); goto again; } spin_lock(&meta_sinfo->lock); meta_sinfo->bytes_delalloc -= num_bytes; spin_unlock(&meta_sinfo->lock); printk(KERN_ERR "enospc, has %d, reserved %d\n", - BTRFS_I(inode)->delalloc_extents, - BTRFS_I(inode)->delalloc_reserved_extents); + BTRFS_I(inode)->outstanding_extents, + BTRFS_I(inode)->reserved_extents); dump_space_info(meta_sinfo, 0, 0); return -ENOSPC; } - BTRFS_I(inode)->delalloc_reserved_extents++; + BTRFS_I(inode)->reserved_extents++; check_force_delalloc(meta_sinfo); spin_unlock(&meta_sinfo->lock); @@ -3093,8 +3126,7 @@ again: } if (retries == 2) { - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + flush_delalloc(root, meta_sinfo); goto again; } spin_lock(&meta_sinfo->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f9708bd..96577e8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -460,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, int bits, int wake, int delete) { - int ret = state->state & bits; + int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int ret = state->state & bits_to_clear; if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -468,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); - state->state &= ~bits; + state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); if (delete || state->state == 0) { @@ -956,7 +957,8 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); } @@ -1419,9 +1421,13 @@ int extent_clear_unlock_delalloc(struct inode *inode, if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; + if (op & EXTENT_CLEAR_ACCOUNTING) + clear_bits |= EXTENT_DO_ACCOUNTING; + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK | EXTENT_SET_PRIVATE2))) + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | + EXTENT_SET_PRIVATE2))) return 0; while (nr_pages > 0) { @@ -2709,7 +2715,8 @@ int extent_invalidatepage(struct extent_io_tree *tree, lock_extent(tree, start, end, GFP_NOFS); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); return 0; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 41d2a47..36de250 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -15,6 +15,7 @@ #define EXTENT_BUFFER_FILLED (1 << 8) #define EXTENT_BOUNDARY (1 << 9) #define EXTENT_NODATASUM (1 << 10) +#define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) /* flags for bio submission */ @@ -33,6 +34,7 @@ #define EXTENT_SET_WRITEBACK 0x10 #define EXTENT_END_WRITEBACK 0x20 #define EXTENT_SET_PRIVATE2 0x40 +#define EXTENT_CLEAR_ACCOUNTING 0x80 /* * page->private values. Every page that is controlled by the extent diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f155179..53fb1c9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -878,7 +878,8 @@ again: btrfs_put_ordered_extent(ordered); clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, GFP_NOFS); unlock_extent(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, GFP_NOFS); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 401dfb2..ccc4f11 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -428,6 +428,7 @@ again: start, end, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); ret = 0; goto free_pages_out; @@ -722,6 +723,7 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); @@ -1195,15 +1197,17 @@ static int btrfs_split_extent_hook(struct inode *inode, root->fs_info->max_extent); /* - * if we break a large extent up then leave delalloc_extents be, - * since we've already accounted for the large extent. + * if we break a large extent up then leave oustanding_extents + * be, since we've already accounted for the large extent. */ if (div64_u64(new_size + root->fs_info->max_extent - 1, root->fs_info->max_extent) < num_extents) return 0; } - BTRFS_I(inode)->delalloc_extents++; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); return 0; } @@ -1234,7 +1238,9 @@ static int btrfs_merge_extent_hook(struct inode *inode, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= root->fs_info->max_extent) { - BTRFS_I(inode)->delalloc_extents--; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); return 0; } @@ -1248,7 +1254,9 @@ static int btrfs_merge_extent_hook(struct inode *inode, root->fs_info->max_extent) > num_extents) return 0; - BTRFS_I(inode)->delalloc_extents--; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); return 0; } @@ -1270,7 +1278,9 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; - BTRFS_I(inode)->delalloc_extents++; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; @@ -1298,8 +1308,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; - BTRFS_I(inode)->delalloc_extents--; - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + if (bits & EXTENT_DO_ACCOUNTING) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + } spin_lock(&root->fs_info->delalloc_lock); if (state->end - state->start + 1 > @@ -4825,7 +4839,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, + NULL, GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -4838,8 +4853,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) lock_extent(tree, page_start, page_end, GFP_NOFS); } clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, - 1, 1, NULL, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); @@ -4934,7 +4949,8 @@ again: * prepare_pages in the normal write path. */ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); ret = btrfs_set_extent_delalloc(inode, page_start, page_end); if (ret) { @@ -5082,8 +5098,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) return NULL; ei->last_trans = 0; ei->logged_trans = 0; - ei->delalloc_extents = 0; - ei->delalloc_reserved_extents = 0; + ei->outstanding_extents = 0; + ei->reserved_extents = 0; + spin_lock_init(&ei->accounting_lock); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->ordered_operations); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4a9c8c4..ab21c29 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -306,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode, tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, + inode, 1); + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); -- cgit v1.1 From e3ccfa989752c083ceb23c823a84f7ce3a081e61 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 7 Oct 2009 20:44:34 -0400 Subject: Btrfs: async delalloc flushing under space pressure This patch moves the delalloc flushing that occurs when we are under space pressure off to a async thread pool. This helps since we only free up metadata space when we actually insert the extent item, which means it takes quite a while for space to be free'ed up if we wait on all ordered extents. However, if space is freed up due to inline extents being inserted, we can wake people who are waiting up early, and they can finish their work. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 13 ++++---- fs/btrfs/disk-io.c | 6 ++++ fs/btrfs/extent-tree.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 88 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dbdada5..a362dd6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -691,17 +691,17 @@ struct btrfs_space_info { struct list_head list; + /* for controlling how we free up space for allocations */ + wait_queue_head_t allocate_wait; + wait_queue_head_t flush_wait; + int allocating_chunk; + int flushing; + /* for block groups in our same type */ struct list_head block_groups; spinlock_t lock; struct rw_semaphore groups_sem; atomic_t caching_threads; - - int allocating_chunk; - wait_queue_head_t wait; - - int flushing; - wait_queue_head_t flush_wait; }; /* @@ -918,6 +918,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers submit_workers; + struct btrfs_workers enospc_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9903f04..ac8927b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1762,6 +1762,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size), &fs_info->generic_worker); + btrfs_init_workers(&fs_info->enospc_workers, "enospc", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1809,6 +1812,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_meta_workers, 1); btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->enospc_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -2023,6 +2027,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2449,6 +2454,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d1be0b..5302680 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2866,9 +2866,66 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) meta_sinfo->force_delalloc = 0; } +struct async_flush { + struct btrfs_root *root; + struct btrfs_space_info *info; + struct btrfs_work work; +}; + +static noinline void flush_delalloc_async(struct btrfs_work *work) +{ + struct async_flush *async; + struct btrfs_root *root; + struct btrfs_space_info *info; + + async = container_of(work, struct async_flush, work); + root = async->root; + info = async->info; + + btrfs_start_delalloc_inodes(root); + wake_up(&info->flush_wait); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); + + kfree(async); +} + +static void wait_on_flush(struct btrfs_space_info *info) +{ + DEFINE_WAIT(wait); + u64 used; + + while (1) { + prepare_to_wait(&info->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_lock(&info->lock); + if (!info->flushing) { + spin_unlock(&info->lock); + break; + } + + used = info->bytes_used + info->bytes_reserved + + info->bytes_pinned + info->bytes_readonly + + info->bytes_super + info->bytes_root + + info->bytes_may_use + info->bytes_delalloc; + if (used < info->total_bytes) { + spin_unlock(&info->lock); + break; + } + spin_unlock(&info->lock); + schedule(); + } + finish_wait(&info->flush_wait, &wait); +} + static void flush_delalloc(struct btrfs_root *root, struct btrfs_space_info *info) { + struct async_flush *async; bool wait = false; spin_lock(&info->lock); @@ -2883,11 +2940,24 @@ static void flush_delalloc(struct btrfs_root *root, spin_unlock(&info->lock); if (wait) { - wait_event(info->flush_wait, - !info->flushing); + wait_on_flush(info); return; } + async = kzalloc(sizeof(*async), GFP_NOFS); + if (!async) + goto flush; + + async->root = root; + async->info = info; + async->work.func = flush_delalloc_async; + + btrfs_queue_worker(&root->fs_info->enospc_workers, + &async->work); + wait_on_flush(info); + return; + +flush: btrfs_start_delalloc_inodes(root); btrfs_wait_ordered_extents(root, 0); @@ -2927,7 +2997,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root, if (!info->allocating_chunk) { info->force_alloc = 1; info->allocating_chunk = 1; - init_waitqueue_head(&info->wait); + init_waitqueue_head(&info->allocate_wait); } else { wait = true; } @@ -2935,7 +3005,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root, spin_unlock(&info->lock); if (wait) { - wait_event(info->wait, + wait_event(info->allocate_wait, !info->allocating_chunk); return 1; } @@ -2956,7 +3026,7 @@ out: spin_lock(&info->lock); info->allocating_chunk = 0; spin_unlock(&info->lock); - wake_up(&info->wait); + wake_up(&info->allocate_wait); if (ret) return 0; -- cgit v1.1 From ff782e0a131c7f669445c07fe5c7ba91e043b7ed Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Oct 2009 15:30:04 -0400 Subject: Btrfs: optimize fsync for the single writer case This patch optimizes the tree logging stuff so it doesn't always wait 1 jiffie for new people to join the logging transaction if there is only ever 1 writer. This helps a little bit with latency where we have something like RPM where it will fdatasync every file it writes, and so waiting the 1 jiffie for every fdatasync really starts to add up. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/tree-log.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a362dd6..16ddb19 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1010,6 +1010,8 @@ struct btrfs_root { atomic_t log_commit[2]; unsigned long log_transid; unsigned long log_batch; + pid_t log_start_pid; + bool log_multiple_pids; u64 objectid; u64 last_trans; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4d7d9ab..78f6254 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { + if (!root->log_start_pid) { + root->log_start_pid = current->pid; + root->log_multiple_pids = false; + } else if (root->log_start_pid != current->pid) { + root->log_multiple_pids = true; + } + root->log_batch++; atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return 0; } + root->log_multiple_pids = false; + root->log_start_pid = current->pid; mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); @@ -1985,7 +1994,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); - while (1) { + while (root->log_multiple_pids) { unsigned long batch = root->log_batch; mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); @@ -2011,6 +2020,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root->log_batch = 0; root->log_transid++; log->log_transid = root->log_transid; + root->log_start_pid = 0; smp_mb(); /* * log tree has been flushed to disk, new modifications of -- cgit v1.1 From efefb1438be269897585934fc6c05deb4dfa01ce Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 9 Oct 2009 09:25:16 -0400 Subject: Btrfs: remove negative dentry when deleting subvolumne The use of btrfs_dentry_delete is removing dentries from the dcache when deleting subvolumne. btrfs_dentry_delete ignores negative dentries. This is incorrect since if we don't remove the negative dentry, its parent dentry can't be removed. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 12 +++++++----- fs/btrfs/ioctl.c | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ccc4f11..d3585cb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3629,12 +3629,14 @@ static int btrfs_dentry_delete(struct dentry *dentry) { struct btrfs_root *root; - if (!dentry->d_inode) - return 0; + if (!dentry->d_inode && !IS_ROOT(dentry)) + dentry = dentry->d_parent; - root = BTRFS_I(dentry->d_inode)->root; - if (btrfs_root_refs(&root->root_item) == 0) - return 1; + if (dentry->d_inode) { + root = BTRFS_I(dentry->d_inode)->root; + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + } return 0; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9a780c8..e8795be 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -830,6 +830,7 @@ out_up_write: out_unlock: mutex_unlock(&inode->i_mutex); if (!err) { + shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); } -- cgit v1.1 From 94fcca9f8999e7828d5f4dc181daa39cad2af38a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 9 Oct 2009 09:25:16 -0400 Subject: Btrfs: optimize back reference update during btrfs_drop_snapshot This patch reading level 0 tree blocks that already use full backrefs. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 82 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5302680..4aedbff 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4911,6 +4911,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 refs; + u64 flags; u64 last = 0; u32 nritems; u32 blocksize; @@ -4948,15 +4949,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, generation <= root->root_key.offset) continue; + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + if (wc->stage == DROP_REFERENCE) { - ret = btrfs_lookup_extent_info(trans, root, - bytenr, blocksize, - &refs, NULL); - BUG_ON(ret); - BUG_ON(refs == 0); if (refs == 1) goto reada; + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; if (!wc->update_ref || generation <= root->root_key.offset) continue; @@ -4965,6 +4970,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, &wc->update_progress); if (ret < 0) continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; } reada: ret = readahead_tree_block(root, bytenr, blocksize, @@ -4988,7 +4997,7 @@ reada: static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc) + struct walk_control *wc, int lookup_info) { int level = wc->level; struct extent_buffer *eb = path->nodes[level]; @@ -5003,8 +5012,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ - if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || - (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); ret = btrfs_lookup_extent_info(trans, root, eb->start, eb->len, @@ -5065,7 +5075,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc) + struct walk_control *wc, int *lookup_info) { u64 bytenr; u64 generation; @@ -5085,8 +5095,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= root->root_key.offset) { + *lookup_info = 1; return 1; + } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); blocksize = btrfs_level_size(root, level - 1); @@ -5099,14 +5111,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - if (wc->stage == DROP_REFERENCE) { - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &wc->refs[level - 1], - &wc->flags[level - 1]); - BUG_ON(ret); - BUG_ON(wc->refs[level - 1] == 0); + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &wc->refs[level - 1], + &wc->flags[level - 1]); + BUG_ON(ret); + BUG_ON(wc->refs[level - 1] == 0); + *lookup_info = 0; + if (wc->stage == DROP_REFERENCE) { if (wc->refs[level - 1] > 1) { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + if (!wc->update_ref || generation <= root->root_key.offset) goto skip; @@ -5120,12 +5137,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, wc->stage = UPDATE_BACKREF; wc->shared_level = level - 1; } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; } if (!btrfs_buffer_uptodate(next, generation)) { btrfs_tree_unlock(next); free_extent_buffer(next); next = NULL; + *lookup_info = 1; } if (!next) { @@ -5148,21 +5170,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, skip: wc->refs[level - 1] = 0; wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - parent = path->nodes[level]->start; - } else { - BUG_ON(root->root_key.objectid != - btrfs_header_owner(path->nodes[level])); - parent = 0; + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0); + BUG_ON(ret); } - - ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0); - BUG_ON(ret); - btrfs_tree_unlock(next); free_extent_buffer(next); + *lookup_info = 1; return 1; } @@ -5276,6 +5299,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { int level = wc->level; + int lookup_info = 1; int ret; while (level >= 0) { @@ -5283,14 +5307,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, btrfs_header_nritems(path->nodes[level])) break; - ret = walk_down_proc(trans, root, path, wc); + ret = walk_down_proc(trans, root, path, wc, lookup_info); if (ret > 0) break; if (level == 0) break; - ret = do_walk_down(trans, root, path, wc); + ret = do_walk_down(trans, root, path, wc, &lookup_info); if (ret > 0) { path->slots[level]++; continue; -- cgit v1.1 From 82d339d9b3a6395f17d3253887653250b693b74b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 9 Oct 2009 09:54:36 -0400 Subject: Btrfs: constify dentry_operations Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 16ddb19..36a19cd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2330,7 +2330,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); -extern struct dentry_operations btrfs_dentry_operations; +extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3585cb..d960492 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5840,6 +5840,6 @@ static struct inode_operations btrfs_symlink_inode_operations = { .removexattr = btrfs_removexattr, }; -struct dentry_operations btrfs_dentry_operations = { +const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, }; -- cgit v1.1 From e9061e214810c9534381a705a1b46533e09f2676 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 9 Oct 2009 09:57:45 -0400 Subject: Btrfs: fix uninit compiler warning in cow_file_range_nocow The extent_type variable was exposed uninit via a goto. It should be impossible to trigger because it is protected by a check on another variable, but this makes sure. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d960492..ef399a7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1023,6 +1023,7 @@ next_slot: if (found_key.offset > cur_offset) { extent_end = found_key.offset; + extent_type = 0; goto out_check; } -- cgit v1.1 From ac6889cbb254be1ffea376bea4a96ce9be0e0ed0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 9 Oct 2009 11:29:53 -0400 Subject: Btrfs: fix file clone ioctl for bookend extents The file clone ioctl was incorrectly taking the offset into the extent on disk into account when calculating the length of the cloned extent. The length never changes based on the offset into the physical extent. Test case: fallocate -l 1g image mke2fs image bcp image image2 e2fsck -f image2 (errors on image2) The math bug ends up wrapping the length of the extent, and things go wrong from there. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e8795be..cdbb054 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1123,8 +1123,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, datao += off - key.offset; datal -= off - key.offset; } - if (key.offset + datao + datal > off + len) - datal = off + len - key.offset - datao; + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + /* disko == 0 means it's a hole */ if (!disko) datao = 0; -- cgit v1.1 From d43c36dc6b357fa1806800f18aa30123c747a6d1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 7 Oct 2009 17:09:06 +0400 Subject: headers: remove sched.h from interrupt.h After m68k's task_thread_info() doesn't refer to current, it's possible to remove sched.h from interrupt.h and not break m68k! Many thanks to Heiko Carstens for allowing this. Signed-off-by: Alexey Dobriyan --- fs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index f313314..87e1290 100644 --- a/fs/file.c +++ b/fs/file.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.1 From ef1f7a7e878e4ae37b3a78ebdeef9f911bae59df Mon Sep 17 00:00:00 2001 From: Bernd Schmidt Date: Tue, 6 Oct 2009 09:55:26 +0100 Subject: ROMFS: fix length used with romfs_dev_strnlen() function An interestingly corrupted romfs file system exposed a problem with the romfs_dev_strnlen function: it's passing the wrong value to its helpers. Rather than limit the string to the length passed in by the callers, it uses the size of the device as the limit. Signed-off-by: Bernd Schmidt Signed-off-by: Mike Frysinger Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/romfs/storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index b3208ad..71e2b4d 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -253,11 +253,11 @@ ssize_t romfs_dev_strnlen(struct super_block *sb, #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) - return romfs_mtd_strnlen(sb, pos, limit); + return romfs_mtd_strnlen(sb, pos, maxlen); #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) - return romfs_blk_strnlen(sb, pos, limit); + return romfs_blk_strnlen(sb, pos, maxlen); #endif return -EIO; } -- cgit v1.1 From a1be9eee2996fd9969625e7b5e2f2bc2032fd3cb Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Mon, 12 Oct 2009 11:26:12 -0400 Subject: NFS: suppress a build warning struct sockaddr_storage * can safely be used as struct sockaddr *. Suppress an "incompatible pointer type" warning. Signed-off-by: Stefan Richter Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- fs/nfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6dabf6f..a2c18ac 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1848,8 +1848,8 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || data->nfs_server.port != nfss->port || data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || - !rpc_cmp_addr(&data->nfs_server.address, - &nfss->nfs_client->cl_addr)) + !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address, + (struct sockaddr *)&nfss->nfs_client->cl_addr)) return -EINVAL; return 0; -- cgit v1.1 From 96ec2e0a719fd61791dd2b0dd01325c5d20e1233 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 16 Sep 2009 11:21:13 -0400 Subject: ext3: Don't update superblock write time when filesystem is read-only This avoids updating the superblock write time when we are mounting the root file system read/only but we need to replay the journal; at that point, for people who are east of GMT and who make their clock tick in localtime for Windows bug-for-bug compatibility, and this will cause e2fsck to complain and force a full file system check. Signed-off-by: "Theodore Ts'o" Signed-off-by: Jan Kara --- fs/ext3/super.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 72743d3..7a520a8 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2321,7 +2321,18 @@ static int ext3_commit_super(struct super_block *sb, if (!sbh) return error; - es->s_wtime = cpu_to_le32(get_seconds()); + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); -- cgit v1.1 From 4722607db6a78bd7748c51fa4c8d7371da797254 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 12:55:09 -0400 Subject: Btrfs: only write one super copy during fsync During a tree-log commit for fsync, we've been writing at least two copies of the super block and forcing them to disk. The other filesystems write only one, and this change brings us on par with them. A full transaction commit will write all the super copies, so we still have redundant info written on a regular basis. Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 78f6254..6d9ec28 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2092,7 +2092,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - write_ctree_super(trans, root->fs_info->tree_root, 2); + write_ctree_super(trans, root->fs_info->tree_root, 1); ret = 0; out_wake_log_root: -- cgit v1.1 From 257c62e1bce03e5b9f3f069fd52ad73a56de71fd Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 13:21:08 -0400 Subject: Btrfs: avoid tree log commit when there are no changes rpm has a habit of running fdatasync when the file hasn't changed. We already detect if a file hasn't been changed in the current transaction but it might have been sent to the tree-log in this transaction and not changed since the last call to fsync. In this case, we want to avoid a tree log sync, which includes a number of synchronous writes and barriers. This commit extends the existing tracking of the last transaction to change a file to also track the last sub-transaction. The end result is that rpm -ivh and -Uvh are roughly twice as fast, and on par with ext3. Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 6 ++++++ fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 ++ fs/btrfs/file.c | 41 ++++++++++++++++++++++++++--------------- fs/btrfs/inode.c | 6 +++++- fs/btrfs/transaction.h | 1 + fs/btrfs/tree-log.c | 27 +++++++++++++++++++++++++++ fs/btrfs/tree-log.h | 3 +++ 8 files changed, 71 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c71abec..f6783a4 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -86,6 +86,12 @@ struct btrfs_inode { * transid of the trans_handle that last modified this inode */ u64 last_trans; + + /* + * log transid when this inode was last modified + */ + u64 last_sub_trans; + /* * transid that last logged this inode */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 36a19cd..d0cede5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1009,6 +1009,7 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; unsigned long log_transid; + unsigned long last_log_commit; unsigned long log_batch; pid_t log_start_pid; bool log_multiple_pids; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ac8927b..d4132aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -919,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_writers, 0); root->log_batch = 0; root->log_transid = 0; + root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -1089,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->last_log_commit = 0; return 0; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 53fb1c9..4599113 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1087,8 +1087,10 @@ out_nolock: btrfs_end_transaction(trans, root); else btrfs_commit_transaction(trans, root); - } else { + } else if (ret != BTRFS_NO_LOG_SYNC) { btrfs_commit_transaction(trans, root); + } else { + btrfs_end_transaction(trans, root); } } if (file->f_flags & O_DIRECT) { @@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) int ret = 0; struct btrfs_trans_handle *trans; + + /* we wait first, since the writeback may change the inode */ + root->log_batch++; + /* the VFS called filemap_fdatawrite for us */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->log_batch++; + /* * check the transaction that last modified this inode * and see if its already been committed @@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (!BTRFS_I(inode)->last_trans) goto out; + /* + * if the last transaction that changed this file was before + * the current transaction, we can bail out now without any + * syncing + */ mutex_lock(&root->fs_info->trans_mutex); if (BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { @@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) } mutex_unlock(&root->fs_info->trans_mutex); - root->log_batch++; - filemap_fdatawrite(inode->i_mapping); - btrfs_wait_ordered_range(inode, 0, (u64)-1); - root->log_batch++; - - if (datasync && !(inode->i_state & I_DIRTY_PAGES)) - goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ @@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) */ mutex_unlock(&dentry->d_inode->i_mutex); - if (ret > 0) { - ret = btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - ret = btrfs_end_transaction(trans, root); - else + if (ret != BTRFS_NO_LOG_SYNC) { + if (ret > 0) { ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + } else { + ret = btrfs_end_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ef399a7..5b9567c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3480,6 +3480,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->generation = 0; bi->sequence = 0; bi->last_trans = 0; + bi->last_sub_trans = 0; bi->logged_trans = 0; bi->delalloc_bytes = 0; bi->reserved_bytes = 0; @@ -4980,7 +4981,9 @@ again: set_page_dirty(page); SetPageUptodate(page); - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: @@ -5100,6 +5103,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) if (!ei) return NULL; ei->last_trans = 0; + ei->last_sub_trans = 0; ei->logged_trans = 0; ei->outstanding_extents = 0; ei->reserved_extents = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 663c674..f68cbbe 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { BTRFS_I(inode)->last_trans = trans->transaction->transid; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6d9ec28..0a1bde2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1980,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + u64 log_transid = 0; mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; @@ -2018,6 +2019,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; + log_transid = root->log_transid; root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; @@ -2095,6 +2097,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, write_ctree_super(trans, root->fs_info->tree_root, 1); ret = 0; + mutex_lock(&root->log_mutex); + if (root->last_log_commit < log_transid) + root->last_log_commit = log_transid; + mutex_unlock(&root->log_mutex); + out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); @@ -2862,6 +2869,21 @@ out: return ret; } +static int inode_in_log(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == trans->transid && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -2901,6 +2923,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; + if (inode_in_log(trans, inode)) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + start_log_trans(trans, root); ret = btrfs_log_inode(trans, root, inode, inode_only); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index d09c760..0776eac 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,9 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); -- cgit v1.1 From 690587d109ffe19d6743e4cc80c18b0906b7f9ff Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 13:29:19 -0400 Subject: Btrfs: streamline tree-log btree block writeout Syncing the tree log is a 3 phase operation. 1) write and wait for all the tree log blocks for a given root. 2) write and wait for all the tree log blocks for the tree of tree log roots. 3) write and wait for the super blocks (barriers here) This isn't as efficient as it could be because there is no requirement to wait for the blocks from step one to hit the disk before we start writing the blocks from step two. This commit changes the sequence so that we don't start waiting until all the tree blocks from both steps one and two have been sent to disk. We do this by breaking up btrfs_write_wait_marked_extents into two functions, which is trivial because it was already broken up into two parts. Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/transaction.h | 4 ++++ fs/btrfs/tree-log.c | 8 +++++++- 3 files changed, 53 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0b8f36d..bca82a4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -344,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of - * those extents are on disk for transaction or log commit + * those extents are sent to disk but does not wait on them */ -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) { int ret; int err = 0; @@ -394,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, page_cache_release(page); } } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + while (1) { ret = find_first_extent_bit(dirty_pages, 0, &start, &end, EXTENT_DIRTY); @@ -424,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return werr; } +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int ret2; + + ret = btrfs_write_marked_extents(root, dirty_pages); + ret2 = btrfs_wait_marked_extents(root, dirty_pages); + return ret || ret2; +} + int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index f68cbbe..d4e3e7a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -108,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0a1bde2..4aff766 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2013,7 +2013,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); btrfs_set_root_node(&log->root_item, log->node); @@ -2048,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2067,6 +2071,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -2075,6 +2080,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); BUG_ON(ret); + btrfs_wait_marked_extents(log, &log->dirty_log_pages); btrfs_set_super_log_root(&root->fs_info->super_for_commit, log_root_tree->node->start); -- cgit v1.1 From 0eda294dfc980c1cbe4f8a0564bf543f86a01ddb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 13:50:18 -0400 Subject: Btrfs: fix btrfs acl #ifdef checks The btrfs acl code was #ifdefing for a define that didn't exist. This correctly matches it to the values used by the Kconfig file. Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 6 +++--- fs/btrfs/ctree.h | 2 +- fs/btrfs/super.c | 2 +- fs/btrfs/xattr.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 69b355a..3616042 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -27,7 +27,7 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { @@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { .set = btrfs_xattr_acl_access_set, }; -#else /* CONFIG_BTRFS_POSIX_ACL */ +#else /* CONFIG_BTRFS_FS_POSIX_ACL */ int btrfs_acl_chmod(struct inode *inode) { @@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -#endif /* CONFIG_BTRFS_POSIX_ACL */ +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d0cede5..dfd7e6f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2374,7 +2374,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e0a6432..3fbbf07 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -344,7 +344,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b0fc93f..b6dd596 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -260,7 +260,7 @@ err: * attributes are handled directly. */ struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, #endif -- cgit v1.1 From 05277c75f6dea8ecf59138cd1b6781fb54ae08bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Oct 2009 23:42:10 +0000 Subject: xfs: fix double IRELE in xfs_dqrele_inode xfs_dqrele_inode calls xfs_iput to release the ilock and a reference and then also calls IRELE which does a second decrement of the reference count. This leads to a premature freeing of inodes when quotas were turned off while the filesystem was mounted. Thanks to Utako Kusaka for reporting the bug and provinding a good testcase. Signed-off-by: Christoph Hellwig Reported-by: Utako Kusaka Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/quota/xfs_qm_syscalls.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 4e4276b..5d1a3b9 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -876,7 +876,6 @@ xfs_dqrele_inode( ip->i_gdquot = NULL; } xfs_iput(ip, XFS_ILOCK_EXCL); - IRELE(ip); return 0; } -- cgit v1.1 From 5d5e103a70f74ae98e3965a4add1ab951d0651d1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 13 Oct 2009 16:46:49 -0400 Subject: Btrfs: fix possible ENOSPC problems with truncate There's a problem where we don't do any space reservation for truncates, which can cause you to OOPs because you will be allowed to go off in the weeds a bit since we don't account for the delalloc bytes that are created as a result of the truncate. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b9567c..78139ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3032,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) + goto out; ret = -ENOMEM; again: page = grab_cache_page(mapping, index); - if (!page) + if (!page) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); goto out; + } page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; @@ -3070,6 +3080,10 @@ again: goto again; } + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); if (ret) { unlock_extent(io_tree, page_start, page_end, GFP_NOFS); @@ -3088,6 +3102,9 @@ again: unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + if (ret) + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); unlock_page(page); page_cache_release(page); out: @@ -3111,7 +3128,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - btrfs_truncate_page(inode->i_mapping, inode->i_size); + err = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (err) + return err; while (1) { struct btrfs_ordered_extent *ordered; @@ -5008,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - btrfs_truncate_page(inode->i_mapping, inode->i_size); + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (ret) + return; btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); -- cgit v1.1 From 86df7eb921a009515285e7171363fa57dd2d7d31 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 14 Oct 2009 09:24:59 -0400 Subject: Btrfs: properly wait log writers during log sync A recently fsync optimization make btrfs_sync_log skip calling wait_for_writer in the single log writer case. This is incorrect since the writer count can also be increased by btrfs_pin_log. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4aff766..f51bf13 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1995,12 +1995,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); - while (root->log_multiple_pids) { + while (1) { unsigned long batch = root->log_batch; - mutex_unlock(&root->log_mutex); - schedule_timeout_uninterruptible(1); - mutex_lock(&root->log_mutex); - + if (root->log_multiple_pids) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } wait_for_writer(trans, root); if (batch == root->log_batch) break; -- cgit v1.1 From e244a0aeb6a599c19a7c802cda6e2d67c847b154 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Oct 2009 09:24:59 -0400 Subject: Btrfs: add -o discard option Enable discard by default is not a good idea given the the trim speed of SSD prototypes we've seen, and the carecteristics for many high-end arrays. Turn of discards by default and require the -o discard option to enable them on. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 3 +++ fs/btrfs/super.c | 7 ++++++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dfd7e6f..e5dd628 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1153,6 +1153,7 @@ struct btrfs_root { #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4aedbff..bf7782f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1584,6 +1584,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; + if (!btrfs_test_opt(root, DISCARD)) + return 0; + /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, bytenr, &map_length, &multi, 0); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3fbbf07..939b68f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,7 +66,8 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, + Opt_discard, Opt_err, }; static match_table_t tokens = { @@ -88,6 +89,7 @@ static match_table_t tokens = { {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, + {Opt_discard, "discard"}, {Opt_err, NULL}, }; @@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->metadata_ratio); } break; + case Opt_discard: + btrfs_set_opt(info->mount_opt, DISCARD); + break; default: break; } -- cgit v1.1 From 0634857488ec6e28fa22920cd0bee3c2ac07ccfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Oct 2009 09:24:59 -0400 Subject: Btrfs: enable discard support The discard support code in btrfs currently is guarded by ifdefs for BIO_RW_DISCARD, which is never defines as it's the name of an enum memeber. Just remove the useless ifdefs to actually enable the code. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bf7782f..d4a3528 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1568,18 +1568,15 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#ifdef BIO_RW_DISCARD static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); } -#endif static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { -#ifdef BIO_RW_DISCARD int ret; u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; @@ -1606,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } return ret; -#else - return 0; -#endif } int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, -- cgit v1.1 From 444528b3e614f7f2391488d9bca8e0b872db909b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 14 Oct 2009 09:38:28 -0400 Subject: Btrfs: always pin metadata in discard mode We have an optimization in btrfs to allow blocks to be immediately freed if they were allocated in this transaction and never written. Otherwise they are pinned and freed when the transaction commits. This isn't optimal for discard mode because immediately freeing them means immediately discarding them. It is better to give the block to the pinning code and letting the (slow) discard happen later. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d4a3528..c56f916 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3686,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, if (is_data) goto pinit; + /* + * discard is sloooow, and so triggering discards on + * individual btree blocks isn't a good plan. Just + * pin everything in discard mode. + */ + if (btrfs_test_opt(root, DISCARD)) + goto pinit; + buf = btrfs_find_tree_block(root, bytenr, num_bytes); if (!buf) goto pinit; -- cgit v1.1 From a6a8357788d6a37f8ad0f7eb46b0a386b613abb9 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 6 Oct 2009 15:33:35 +0200 Subject: sysfs: Allow sysfs_move_dir(..., NULL) again. As device_move() and kobject_move() both handle a NULL destination, sysfs_move_dir() should do this as well (again) and fall back to sysfs_root in that case. Signed-off-by: Cornelia Huck Cc: Phil Carmody Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0050fc4..5fad489 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -894,7 +894,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) mutex_lock(&sysfs_rename_mutex); BUG_ON(!sd->s_parent); - new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; + new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ? + new_parent_kobj->sd : &sysfs_root; error = 0; if (sd->s_parent == new_parent_sd) -- cgit v1.1 From 83db93f4de2d9ae441a491d1dc61c2204f0195de Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 15 Sep 2009 16:05:51 -0700 Subject: sysfs: Allow sysfs_notify_dirent to be called from interrupt context. sysfs_notify_dirent is a simple atomic operation that can be used to alert user-space that new data can be read from a sysfs attribute. Unfortunately it cannot currently be called from non-process context because of its use of spin_lock which is sometimes taken with interrupts enabled. So change all lockers of sysfs_open_dirent_lock to disable interrupts, thus making sysfs_notify_dirent safe to be called from non-process context (as drivers/md does in md_safemode_timeout). sysfs_get_open_dirent is (documented as being) only called from process context, so it uses spin_lock_irq. Other places use spin_lock_irqsave. The usage for sysfs_notify_dirent in md_safemode_timeout was introduced in 2.6.28, so this patch is suitable for that and more recent kernels. Reported-by: Joel Andres Granados Signed-off-by: NeilBrown Signed-off-by: Dan Williams Cc: stable Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 561a9c0..f5ea468 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -268,7 +268,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, struct sysfs_open_dirent *od, *new_od = NULL; retry: - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irq(&sysfs_open_dirent_lock); if (!sd->s_attr.open && new_od) { sd->s_attr.open = new_od; @@ -281,7 +281,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, list_add_tail(&buffer->list, &od->buffers); } - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irq(&sysfs_open_dirent_lock); if (od) { kfree(new_od); @@ -315,8 +315,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, struct sysfs_buffer *buffer) { struct sysfs_open_dirent *od = sd->s_attr.open; + unsigned long flags; - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); list_del(&buffer->list); if (atomic_dec_and_test(&od->refcnt)) @@ -324,7 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, else od = NULL; - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); kfree(od); } @@ -456,8 +457,9 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) void sysfs_notify_dirent(struct sysfs_dirent *sd) { struct sysfs_open_dirent *od; + unsigned long flags; - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); od = sd->s_attr.open; if (od) { @@ -465,7 +467,7 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd) wake_up_interruptible(&od->poll); } - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); } EXPORT_SYMBOL_GPL(sysfs_notify_dirent); -- cgit v1.1 From 9f0d793b52eb2266359661369ef6303838904855 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 11 Sep 2009 13:03:19 -0400 Subject: fsnotify: do not set group for a mark before it is on the i_list fsnotify_add_mark is supposed to add a mark to the g_list and i_list and to set the group and inode for the mark. fsnotify_destroy_mark_by_entry uses the fact that ->group != NULL to know if this group should be destroyed or if it's already been done. But fsnotify_add_mark sets the group and inode before it actually adds the mark to the i_list and g_list. This can result in a race in inotify, it requires 3 threads. sys_inotify_add_watch("file") sys_inotify_add_watch("file") sys_inotify_rm_watch([a]) inotify_update_watch() inotify_new_watch() inotify_add_to_idr() ^--- returns wd = [a] inotfiy_update_watch() inotify_new_watch() inotify_add_to_idr() fsnotify_add_mark() ^--- returns wd = [b] returns to userspace; inotify_idr_find([a]) ^--- gives us the pointer from task 1 fsnotify_add_mark() ^--- this is going to set the mark->group and mark->inode fields, but will return -EEXIST because of the race with [b]. fsnotify_destroy_mark() ^--- since ->group != NULL we call back into inotify_freeing_mark() which calls inotify_remove_from_idr([a]) since fsnotify_add_mark() failed we call: inotify_remove_from_idr([a]) <------WHOOPS it's not in the idr, this could have been any entry added later! The fix is to make sure we don't set mark->group until we are sure the mark is on the inode and fsnotify_add_mark will return success. Signed-off-by: Eric Paris --- fs/notify/inode_mark.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index c8a07c6..3165d85 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -324,11 +324,11 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry, spin_lock(&group->mark_lock); spin_lock(&inode->i_lock); - entry->group = group; - entry->inode = inode; - lentry = fsnotify_find_mark_entry(group, inode); if (!lentry) { + entry->group = group; + entry->inode = inode; + hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); list_add(&entry->g_list, &group->mark_entries); -- cgit v1.1 From 3de0ef4f2067da58fa5126d821a56dcb98cdb565 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 14 Oct 2009 20:54:03 +0800 Subject: inotify: fix coalesce duplicate events into a single event in special case If we do rename a dir entry, like this: rename("/tmp/ino7UrgoJ.rename1", "/tmp/ino7UrgoJ.rename2") rename("/tmp/ino7UrgoJ.rename2", "/tmp/ino7UrgoJ") The duplicate events should be coalesced into a single event. But those two events do not be coalesced into a single event, due to some bad check in event_compare(). It can not match the two NULL inodes as the same event. Signed-off-by: Wei Yongjun Signed-off-by: Eric Paris --- fs/notify/notification.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 3816d57..b8bf53b 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -143,7 +143,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new /* remember, after old was put on the wait_q we aren't * allowed to look at the inode any more, only thing * left to check was if the file_name is the same */ - if (old->name_len && + if (!old->name_len || !strcmp(old->file_name, new->file_name)) return true; break; -- cgit v1.1 From 945526846a84c00adac1efd1c6befdaa77039623 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 15 Oct 2009 00:13:23 +0200 Subject: dnotify: ignore FS_EVENT_ON_CHILD Mask off FS_EVENT_ON_CHILD in dnotify_handle_event(). Otherwise, when there is more than one watch on a directory and dnotify_should_send_event() succeeds, events with FS_EVENT_ON_CHILD set will trigger all watches and cause spurious events. This case was overlooked in commit e42e2773. #define _GNU_SOURCE #include #include #include #include #include #include #include #include static void create_event(int s, siginfo_t* si, void* p) { printf("create\n"); } static void delete_event(int s, siginfo_t* si, void* p) { printf("delete\n"); } int main (void) { struct sigaction action; char *tmpdir, *file; int fd1, fd2; sigemptyset (&action.sa_mask); action.sa_flags = SA_SIGINFO; action.sa_sigaction = create_event; sigaction (SIGRTMIN + 0, &action, NULL); action.sa_sigaction = delete_event; sigaction (SIGRTMIN + 1, &action, NULL); # define TMPDIR "/tmp/test.XXXXXX" tmpdir = malloc(strlen(TMPDIR) + 1); strcpy(tmpdir, TMPDIR); mkdtemp(tmpdir); # define TMPFILE "/file" file = malloc(strlen(tmpdir) + strlen(TMPFILE) + 1); sprintf(file, "%s/%s", tmpdir, TMPFILE); fd1 = open (tmpdir, O_RDONLY); fcntl(fd1, F_SETSIG, SIGRTMIN); fcntl(fd1, F_NOTIFY, DN_MULTISHOT | DN_CREATE); fd2 = open (tmpdir, O_RDONLY); fcntl(fd2, F_SETSIG, SIGRTMIN + 1); fcntl(fd2, F_NOTIFY, DN_MULTISHOT | DN_DELETE); if (fork()) { /* This triggers a create event */ creat(file, 0600); /* This triggers a create and delete event (!) */ unlink(file); } else { sleep(1); rmdir(tmpdir); } return 0; } Signed-off-by: Andreas Gruenbacher Signed-off-by: Eric Paris --- fs/notify/dnotify/dnotify.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 828a889..7e54e52 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -91,6 +91,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; + __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; to_tell = event->to_tell; @@ -106,7 +107,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, spin_lock(&entry->lock); prev = &dnentry->dn; while ((dn = *prev) != NULL) { - if ((dn->dn_mask & event->mask) == 0) { + if ((dn->dn_mask & test_mask) == 0) { prev = &dn->dn_next; continue; } -- cgit v1.1 From ad3960243e55320d74195fb85c975e0a8cc4466c Mon Sep 17 00:00:00 2001 From: Earl Chew Date: Mon, 19 Oct 2009 15:55:41 -0700 Subject: fs: pipe.c null pointer dereference This patch fixes a null pointer exception in pipe_rdwr_open() which generates the stack trace: > Unable to handle kernel NULL pointer dereference at 0000000000000028 RIP: > [] pipe_rdwr_open+0x35/0x70 > [] __dentry_open+0x13c/0x230 > [] do_filp_open+0x2d/0x40 > [] do_sys_open+0x5a/0x100 > [] sysenter_do_call+0x1b/0x67 The failure mode is triggered by an attempt to open an anonymous pipe via /proc/pid/fd/* as exemplified by this script: ============================================================= while : ; do { echo y ; sleep 1 ; } | { while read ; do echo z$REPLY; done ; } & PID=$! OUT=$(ps -efl | grep 'sleep 1' | grep -v grep | { read PID REST ; echo $PID; } ) OUT="${OUT%% *}" DELAY=$((RANDOM * 1000 / 32768)) usleep $((DELAY * 1000 + RANDOM % 1000 )) echo n > /proc/$OUT/fd/1 # Trigger defect done ============================================================= Note that the failure window is quite small and I could only reliably reproduce the defect by inserting a small delay in pipe_rdwr_open(). For example: static int pipe_rdwr_open(struct inode *inode, struct file *filp) { msleep(100); mutex_lock(&inode->i_mutex); Although the defect was observed in pipe_rdwr_open(), I think it makes sense to replicate the change through all the pipe_*_open() functions. The core of the change is to verify that inode->i_pipe has not been released before attempting to manipulate it. If inode->i_pipe is no longer present, return ENOENT to indicate so. The comment about potentially using atomic_t for i_pipe->readers and i_pipe->writers has also been removed because it is no longer relevant in this context. The inode->i_mutex lock must be used so that inode->i_pipe can be dealt with correctly. Signed-off-by: Earl Chew Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/pipe.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 52c4151..ae17d02 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -777,36 +777,55 @@ pipe_rdwr_release(struct inode *inode, struct file *filp) static int pipe_read_open(struct inode *inode, struct file *filp) { - /* We could have perhaps used atomic_t, but this and friends - below are the only places. So it doesn't seem worthwhile. */ + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - inode->i_pipe->readers++; + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->readers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } static int pipe_write_open(struct inode *inode, struct file *filp) { + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - inode->i_pipe->writers++; + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->writers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } static int pipe_rdwr_open(struct inode *inode, struct file *filp) { + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - if (filp->f_mode & FMODE_READ) - inode->i_pipe->readers++; - if (filp->f_mode & FMODE_WRITE) - inode->i_pipe->writers++; + + if (inode->i_pipe) { + ret = 0; + if (filp->f_mode & FMODE_READ) + inode->i_pipe->readers++; + if (filp->f_mode & FMODE_WRITE) + inode->i_pipe->writers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } /* -- cgit v1.1 From 4223a4a155f245d41c350ed9eba4fc32e965c4da Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 20 Oct 2009 14:13:46 +0900 Subject: nfs: Fix nfs_parse_mount_options() kfree() leak Fix a (small) memory leak in one of the error paths of the NFS mount options parsing code. Regression introduced in 2.6.30 by commit a67d18f (NFS: load the rpc/rdma transport module automatically). Reported-by: Yinghai Lu Reported-by: Pekka Enberg Signed-off-by: Ingo Molnar Signed-off-by: Trond Myklebust Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/nfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a2c18ac..90be551 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1253,6 +1253,7 @@ static int nfs_parse_mount_options(char *raw, default: dfprintk(MOUNT, "NFS: unrecognized " "transport protocol\n"); + kfree(string); return 0; } break; -- cgit v1.1 From a8b40bc7e635831b61c43acc71a86d3a68b2dff0 Mon Sep 17 00:00:00 2001 From: Terry Loftin Date: Thu, 22 Oct 2009 21:36:01 -0400 Subject: nfs: Panic when commit fails Actually pass the NFS_FILE_SYNC option to the server to avoid a Panic in nfs_direct_write_complete() when a commit fails. At the end of an nfs write, if the nfs commit fails, all the writes will be rescheduled. They are supposed to be rescheduled as NFS_FILE_SYNC writes, but the rpc_task structure is not completely intialized and so the option is not passed. When the rescheduled writes complete, the return indicates that they are NFS_UNSTABLE and we try to do another commit. This leads to a Panic because the commit data structure pointer was set to null in the initial (failed) commit attempt. Signed-off-by: Terry Loftin Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6c32100..e1d415e 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -457,6 +457,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) }; struct rpc_task_setup task_setup_data = { .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, .callback_ops = &nfs_write_direct_ops, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, -- cgit v1.1 From 52567b03ca38b6e556ced450d64dba8d66e23b0e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Oct 2009 14:46:42 -0400 Subject: NFSv4: Fix a bug when the server returns NFS4ERR_RESOURCE RFC 3530 states that when we recieve the error NFS4ERR_RESOURCE, we are not supposed to bump the sequence number on OPEN, LOCK, LOCKU, CLOSE, etc operations. The problem is that we map that error into EREMOTEIO in the XDR layer, and so the NFSv4 middle-layer routines like seqid_mutating_err(), and nfs_increment_seqid() don't recognise it. The fix is to defer the mapping until after the middle layers have processed the error. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 11 ++++++++--- fs/nfs/nfs4xdr.c | 1 - 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ed7c269..65c2527 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -72,12 +72,17 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, /* Prevent leaks of NFSv4 errors into userland */ static int nfs4_map_errors(int err) { - if (err < -1000) { + if (err >= -1000) + return err; + switch (err) { + case -NFS4ERR_RESOURCE: + return -EREMOTEIO; + default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); - return -EIO; + break; } - return err; + return -EIO; } /* diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 83ad47c..20b4e30 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5681,7 +5681,6 @@ static struct { { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, { NFS4ERR_BADTYPE, -EBADTYPE }, { NFS4ERR_LOCKED, -EAGAIN }, - { NFS4ERR_RESOURCE, -EREMOTEIO }, { NFS4ERR_SYMLINK, -ELOOP }, { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, { NFS4ERR_DEADLOCK, -EDEADLK }, -- cgit v1.1 From 141aeb9f26f9f12f1584c128ce8697cdffb046e7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Oct 2009 08:09:46 -0400 Subject: NFSv4: Fix two unbalanced put_rpccred() issues. Commits 29fba38b (nfs41: lease renewal) and fc01cea9 (nfs41: sequence operation) introduce a couple of put_rpccred() calls on credentials for which there is no corresponding get_rpccred(). See http://bugzilla.kernel.org/show_bug.cgi?id=14249 Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 65c2527..ff37454 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3065,9 +3065,6 @@ static void nfs4_renew_done(struct rpc_task *task, void *data) if (time_before(clp->cl_last_renewal,timestamp)) clp->cl_last_renewal = timestamp; spin_unlock(&clp->cl_lock); - dprintk("%s calling put_rpccred on rpc_cred %p\n", __func__, - task->tk_msg.rpc_cred); - put_rpccred(task->tk_msg.rpc_cred); } static const struct rpc_call_ops nfs4_renew_ops = { @@ -4882,7 +4879,6 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data) nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp); dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); - put_rpccred(task->tk_msg.rpc_cred); kfree(task->tk_msg.rpc_argp); kfree(task->tk_msg.rpc_resp); -- cgit v1.1 From 9a3936aac133037f65124fcb2d676a6c201a90a4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Oct 2009 08:09:46 -0400 Subject: NFSv4: The link() operation should return any delegation on the file Otherwise, we have to wait for the server to recall it. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32062c3..7cb2985 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1536,6 +1536,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry->d_parent->d_name.name, old_dentry->d_name.name, dentry->d_parent->d_name.name, dentry->d_name.name); + nfs_inode_return_delegation(inode); + d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { -- cgit v1.1 From 960cc0f4fef607baabc2232fbd7cce5368a9dcfd Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Mon, 26 Oct 2009 08:59:17 +0100 Subject: block: use after free bug in __blkdev_get commit 0762b8bde9729f10f8e6249809660ff2ec3ad735 (from 14 months ago) introduced a use-after-free bug which has just recently started manifesting in my md testing. I tried git bisect to find out what caused the bug to start manifesting, and it could have been the recent change to blk_unregister_queue (48c0d4d4c04) but the results were inconclusive. This patch certainly fixes my symptoms and looks correct as the two calls are now in the same order as elsewhere in that function. Signed-off-by: NeilBrown Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 9cf4b92..8bed055 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1248,8 +1248,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); } } else { - put_disk(disk); module_put(disk->fops->owner); + put_disk(disk); disk = NULL; if (bdev->bd_contains == bdev) { if (bdev->bd_disk->fops->open) { -- cgit v1.1 From ffb4a73d8906f71910e6c83ec2b499e70025ee8e Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 27 Oct 2009 07:22:37 +0900 Subject: sh: Fix hugetlbfs dependencies for SH-3 && MMU configurations. The hugetlb dependencies presently depend on SUPERH && MMU while the hugetlb page size definitions depend on CPU_SH4 or CPU_SH5. This unfortunately allows SH-3 + MMU configurations to enable hugetlbfs without a corresponding HPAGE_SHIFT definition, resulting in the build blowing up. As SH-3 doesn't support variable page sizes, we tighten up the dependenies a bit to prevent hugetlbfs from being enabled. These days we also have a shiny new SYS_SUPPORTS_HUGETLBFS, so switch to using that rather than adding to the list of corner cases in fs/Kconfig. Reported-by: Kristoffer Ericson Signed-off-by: Paul Mundt --- fs/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index d4bf8ca..48b81f9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -135,8 +135,8 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \ - (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN + depends on X86 || IA64 || PPC64 || SPARC64 || (S390 && 64BIT) || \ + SYS_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read -- cgit v1.1 From 0cd9ad73b8d181737005ff4e506b9b6bd043f4dd Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Fri, 16 Oct 2009 07:21:35 +0000 Subject: powerpc: Limit hugetlbfs support to PPC64 Book-3S machines Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index d4bf8ca..f93d0ed 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -135,7 +135,7 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \ + depends on X86 || IA64 || PPC_BOOK3S_64 || SPARC64 || (SUPERH && MMU) || \ (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on -- cgit v1.1 From 370c28def65b3a5765192753a164403619b41d01 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 26 Oct 2009 16:49:32 -0700 Subject: hwpoison: fix/proc/meminfo alignment Given such a long name, the kB count in /proc/meminfo's HardwareCorrupted line is being shown too far right (it does align with x86_64's VmallocChunk above, but I hope nobody will ever have that much corrupted!). Align it. Signed-off-by: Hugh Dickins Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c7bff4f..a65239c 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -99,7 +99,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "VmallocUsed: %8lu kB\n" "VmallocChunk: %8lu kB\n" #ifdef CONFIG_MEMORY_FAILURE - "HardwareCorrupted: %8lu kB\n" + "HardwareCorrupted: %5lu kB\n" #endif , K(i.totalram), -- cgit v1.1 From 5c36fe3d87b3f0c85894a49193c66096a3d6b26f Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 26 Oct 2009 16:49:51 -0700 Subject: hfsplus: refuse to mount volumes larger than 2TB As found in , hfsplus is using type u32 rather than sector_t for some sector number calculations. In particular, hfsplus_get_block() does: u32 ablock, dblock, mask; ... map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); I am not confident that I can find and fix all cases where a sector number may be truncated. For now, avoid data loss by refusing to mount HFS+ volumes with more than 2^32 sectors (2TB). [akpm@linux-foundation.org: fix 32 and 64-bit issues] Signed-off-by: Ben Hutchings Cc: Eric Sesterhenn Cc: Roman Zippel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/wrapper.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 175d08e..bed78ac8 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -99,6 +99,10 @@ int hfsplus_read_wrapper(struct super_block *sb) if (hfsplus_get_last_session(sb, &part_start, &part_size)) return -EINVAL; + if ((u64)part_start + part_size > 0x100000000ULL) { + pr_err("hfs: volumes larger than 2TB are not supported yet\n"); + return -EINVAL; + } while (1) { bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); if (!bh) -- cgit v1.1 From 47f365eb575735c6b2edf5d08e0d16d26a9c23bd Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 26 Oct 2009 16:49:56 -0700 Subject: hfs: fix oops on mount with corrupted btree extent records A particular fsfuzzer run caused an hfs file system to crash on mount. This is due to a corrupted MDB extent record causing a miscalculation of HFS_I(inode)->first_blocks for the extent tree. If the extent records are zereod out, it won't trigger the first_blocks special case. Instead it falls through to the extent code which we're still in the middle of initializing. This patch catches the 0 size extent records, reports the corruption, and fails the mount. Reported-by: Ramon de Carvalho Valle Signed-off-by: Jeff Mahoney Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/btree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 9b9d639..052f214 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -58,6 +58,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } unlock_new_inode(tree->inode); + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n"); + goto free_inode; + } + mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) -- cgit v1.1 From 5a1eb5c4453207ad9e7f6e8ca4f8db289743c993 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 30 Oct 2009 15:03:54 +1100 Subject: powerpc: Cleanup Kconfig selection of hugetlbfs support Signed-off-by: Benjamin Herrenschmidt --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 2126078..64d44ef 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -135,7 +135,7 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC_BOOK3S_64 || SPARC64 || (S390 && 64BIT) || \ + depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ SYS_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on -- cgit v1.1 From 3b826386d376e5545d2e92b2da5ebd965cafae97 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 30 Oct 2009 09:27:07 +0100 Subject: xfs: free temporary cursor in xfs_dialloc Commit bd169565993b39b9b4b102cdac8b13e0a259ce2f seems to have a slight regression where this code path: if (!--searchdistance) { /* * Not in range - save last search * location and allocate a new inode */ ... goto newino; } doesn't free the temporary cursor (tcur) that got dup'd in this function. This leaks an item in the xfs_btree_cur zone, and it's caught on module unload: =========================================================== BUG xfs_btree_cur: Objects remaining on kmem_cache_close() ----------------------------------------------------------- It seems like maybe a single free at the end of the function might be cleaner, but for now put a del_cursor right in this code block similar to the handling in the rest of the function. Signed-off-by: Eric Sandeen Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_ialloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index ab64f3e..0785797 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -880,6 +880,7 @@ nextag: * Not in range - save last search * location and allocate a new inode */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; -- cgit v1.1 From c7ff91d722e44c98504e6e2c357b47e1988dfbbd Mon Sep 17 00:00:00 2001 From: Ryota Yamauchi Date: Fri, 30 Oct 2009 09:27:44 +0100 Subject: xfs: fix xfs_quota remove error The xfs_quota returns ENOSYS when remove command is executed. Reproducable with following steps. # mount -t xfs -o uquota /dev/sda7 /mnt/mp1 # xfs_quota -x -c off -c remove XFS_QUOTARM: Function not implemented. The remove command is allowed during quotaoff, but xfs_fs_set_xstate() checks whether quota is running, and it leads to ENOSYS. To solve this problem, add a check for X_QUOTARM. Signed-off-by: Ryota Yamauchi Signed-off-by: Utako Kusaka Signed-off-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_quotaops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index cb6e2cc..13cc7b5 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -80,7 +80,7 @@ xfs_fs_set_xstate( if (sb->s_flags & MS_RDONLY) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) + if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; if (!capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.1 From ad0bf11070ebb3c95f8ce82e6219dbd79c8e8b69 Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Mon, 2 Nov 2009 11:39:22 +0100 Subject: bio_put(): add bio_clone() to the list of functions in the comment In bio_put()'s comment, add bio_clone() to the list of functions that can give you a bio reference. Signed-off-by: Alberto Bertogli Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 402cb84..bcab9e9 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -407,7 +407,7 @@ EXPORT_SYMBOL(zero_fill_bio); * * Description: * Put a reference to a &struct bio, either one you have gotten with - * bio_alloc or bio_get. The last put of a bio will free it. + * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. **/ void bio_put(struct bio *bio) { -- cgit v1.1 From 5f04eeb8a76521dec371ceb05e8263889a8af2bc Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Mon, 2 Nov 2009 11:39:42 +0100 Subject: Fix bio_alloc() and bio_kmalloc() documentation Commit 451a9ebf accidentally broke bio_alloc() and bio_kmalloc() comments by (almost) swapping them. This patch fixes that, by placing the comments in the right place. Signed-off-by: Alberto Bertogli Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/bio.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index bcab9e9..12da5db 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -325,8 +325,16 @@ static void bio_fs_destructor(struct bio *bio) * @gfp_mask: allocation mask to use * @nr_iovecs: number of iovecs * - * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask - * contains __GFP_WAIT, the allocation is guaranteed to succeed. + * bio_alloc will allocate a bio and associated bio_vec array that can hold + * at least @nr_iovecs entries. Allocations will be done from the + * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc. + * + * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate + * a bio. This is due to the mempool guarantees. To make this work, callers + * must never allocate more than 1 bio at a time from this pool. Callers + * that need to allocate more than 1 bio must always submit the previously + * allocated bio for IO before attempting to allocate a new one. Failure to + * do so can cause livelocks under memory pressure. * * RETURNS: * Pointer to new bio on success, NULL on failure. @@ -350,21 +358,13 @@ static void bio_kmalloc_destructor(struct bio *bio) } /** - * bio_alloc - allocate a bio for I/O + * bio_kmalloc - allocate a bio for I/O using kmalloc() * @gfp_mask: the GFP_ mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate * * Description: - * bio_alloc will allocate a bio and associated bio_vec array that can hold - * at least @nr_iovecs entries. Allocations will be done from the - * fs_bio_set. Also see @bio_alloc_bioset. - * - * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate - * a bio. This is due to the mempool guarantees. To make this work, callers - * must never allocate more than 1 bio at a time from this pool. Callers - * that need to allocate more than 1 bio must always submit the previously - * allocated bio for IO before attempting to allocate a new one. Failure to - * do so can cause livelocks under memory pressure. + * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains + * %__GFP_WAIT, the allocation is guaranteed to succeed. * **/ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) -- cgit v1.1 From f91b90993f0d286be89f06c2f547ced8cfe291c6 Mon Sep 17 00:00:00 2001 From: Martin Stava Date: Mon, 2 Nov 2009 08:39:35 -0600 Subject: 9p: fix a small bug in readdir for long directories Here is a proposed patch for bug in readdir. Listing of dirs with many files fails without this patch. Signed-off-by: Martin Stava Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 873cd31..cae53d4 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -90,6 +90,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) if (err <= 0) break; + i = 0; n = err; while (i < n) { err = p9stat_read(statbuf + i, buflen-i, &st, -- cgit v1.1 From 2511cd0b3b9e9b1c3e9360cc565c3745ac3f3f3f Mon Sep 17 00:00:00 2001 From: Martin Stava Date: Mon, 2 Nov 2009 08:39:34 -0600 Subject: 9p: fix readlink I do not know if you've looked on the patch, but unfortunately it is incorrect. A suggested better version is in this email (the old version didn't work in case the user provided buffer was not long enough - it incorrectly appended null byte on a position of last char, and thus broke the contract of the readlink method). However, I'm still not sure this is 100% correct thing to do, I think readlink is supposed to return buffer without last null byte in all cases, but we do return last null byte (even the old version).. on the other hand it is likely unspecified what is in the remaining part of the buffer, so null character may be fine there ;): Signed-off-by: Martin Stava Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 5947628..18f74ec 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -994,8 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); - retval = buflen; - + retval = strnlen(buffer, buflen); done: kfree(st); return retval; @@ -1062,7 +1061,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) __putname(link); link = ERR_PTR(len); } else - link[len] = 0; + link[min(len, PATH_MAX-1)] = 0; } nd_set_link(nd, link); -- cgit v1.1 From 3e2796a90cf349527e50b3bc4d0b2f4019b1ce7a Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Mon, 2 Nov 2009 08:39:28 -0600 Subject: 9p: fix readdir corner cases The patch below also addresses a couple of other corner cases in readdir seen with a large (e.g. 64k) msize. I'm not sure what people think of my co-opting of fid->aux here. I'd be happy to rework if there's a better way. When the size of the user supplied buffer passed to readdir is smaller than the data returned in one go by the 9P read request, v9fs_dir_readdir() currently discards extra data so that, on the next call, a 9P read request will be issued with offset < previous offset + bytes returned, which voilates the constraint described in paragraph 3 of read(5) description. This patch preseves the leftover data in fid->aux for use in the next call. Signed-off-by: Jim Garlick Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 94 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index cae53d4..15cce53 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -40,6 +40,24 @@ #include "fid.h" /** + * struct p9_rdir - readdir accounting + * @mutex: mutex protecting readdir + * @head: start offset of current dirread buffer + * @tail: end offset of current dirread buffer + * @buf: dirread buffer + * + * private structure for keeping track of readdir + * allocated on demand + */ + +struct p9_rdir { + struct mutex mutex; + int head; + int tail; + uint8_t *buf; +}; + +/** * dt_type - return file type * @mistat: mistat structure * @@ -70,57 +88,79 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) { int over; struct p9_wstat st; - int err; + int err = 0; struct p9_fid *fid; int buflen; - char *statbuf; - int n, i = 0; + int reclen = 0; + struct p9_rdir *rdir; P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; - statbuf = kmalloc(buflen, GFP_KERNEL); - if (!statbuf) - return -ENOMEM; - - while (1) { - err = v9fs_file_readn(filp, statbuf, NULL, buflen, - fid->rdir_fpos); - if (err <= 0) - break; - - i = 0; - n = err; - while (i < n) { - err = p9stat_read(statbuf + i, buflen-i, &st, - fid->clnt->dotu); + + /* allocate rdir on demand */ + if (!fid->rdir) { + rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); + + if (rdir == NULL) { + err = -ENOMEM; + goto exit; + } + spin_lock(&filp->f_dentry->d_lock); + if (!fid->rdir) { + rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir); + mutex_init(&rdir->mutex); + rdir->head = rdir->tail = 0; + fid->rdir = (void *) rdir; + rdir = NULL; + } + spin_unlock(&filp->f_dentry->d_lock); + kfree(rdir); + } + rdir = (struct p9_rdir *) fid->rdir; + + err = mutex_lock_interruptible(&rdir->mutex); + while (err == 0) { + if (rdir->tail == rdir->head) { + err = v9fs_file_readn(filp, rdir->buf, NULL, + buflen, filp->f_pos); + if (err <= 0) + goto unlock_and_exit; + + rdir->head = 0; + rdir->tail = err; + } + + while (rdir->head < rdir->tail) { + err = p9stat_read(rdir->buf + rdir->head, + buflen - rdir->head, &st, + fid->clnt->dotu); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; p9stat_free(&st); - goto free_and_exit; + goto unlock_and_exit; } - - i += st.size+2; - fid->rdir_fpos += st.size+2; + reclen = st.size+2; over = filldir(dirent, st.name, strlen(st.name), filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); - filp->f_pos += st.size+2; - p9stat_free(&st); if (over) { err = 0; - goto free_and_exit; + goto unlock_and_exit; } + rdir->head += reclen; + filp->f_pos += reclen; } } -free_and_exit: - kfree(statbuf); +unlock_and_exit: + mutex_unlock(&rdir->mutex); +exit: return err; } -- cgit v1.1 From d4da6c9ccf648f3f1cb5bf9d981a62c253d30e28 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 2 Nov 2009 10:15:27 -0800 Subject: Revert "ext4: Remove journal_checksum mount option and enable it by default" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit d0646f7b636d067d715fab52a2ba9c6f0f46b0d7, as requested by Eric Sandeen. It can basically cause an ext4 filesystem to miss recovery (and thus get mounted with errors) if the journal checksum does not match. Quoth Eric: "My hand-wavy hunch about what is happening is that we're finding a bad checksum on the last partially-written transaction, which is not surprising, but if we have a wrapped log and we're doing the initial scan for head/tail, and we abort scanning on that bad checksum, then we are essentially running an unrecovered filesystem. But that's hand-wavy and I need to go look at the code. We lived without journal checksums on by default until now, and at this point they're doing more harm than good, so we should revert the default-changing commit until we can fix it and do some good power-fail testing with the fixes in place." See http://bugzilla.kernel.org/show_bug.cgi?id=14354 for all the gory details. Requested-by: Eric Sandeen Cc: Theodore Tso Cc: Alexey Fisher Cc: Maxim Levitsky Cc: Aneesh Kumar K.V Cc: Mathias Burén Signed-off-by: Linus Torvalds --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 984ca0c..00d153f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -743,6 +743,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 312211e..d4ca92a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1300,9 +1300,11 @@ static int parse_options(char *options, struct super_block *sb, *journal_devnum = option; break; case Opt_journal_checksum: - break; /* Kept for backwards compatibility */ + set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + break; case Opt_journal_async_commit: set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); + set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); break; case Opt_noload: set_opt(sbi->s_mount_opt, NOLOAD); @@ -2759,14 +2761,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) - jbd2_journal_set_features(sbi->s_journal, 0, 0, + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - else + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ -- cgit v1.1 From fa5d11133b07053270e18fa9c18560e66e79217e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 2 Nov 2009 18:50:49 -0500 Subject: ext4: discard preallocation when restarting a transaction during truncate When restart a transaction during a truncate operation, we drop and reacquire i_data_sem. After reacquiring i_data_sem, we need to discard any inode-based preallocation that might have been grabbed while we released i_data_sem (for example, if pdflush is allocating blocks and racing against the truncate). Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5c5bc5d..d1ec698 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -193,7 +193,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ - int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, int nblocks) { int ret; @@ -209,6 +209,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) up_write(&EXT4_I(inode)->i_data_sem); ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); return ret; } -- cgit v1.1 From b1e19e5601277845b4f17ecd7c9ba04f73ee11aa Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 3 Nov 2009 00:25:53 +0900 Subject: nilfs2: fix dirty page accounting leak causing hang at write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bruno Prémont and Dunphy, Bill noticed me that NILFS will certainly hang on ARM-based targets. I found this was caused by an underflow of dirty pages counter. A b-tree cache routine was marking page dirty without adjusting page account information. This fixes the dirty page accounting leak and resolves the hang on arm-based targets. Reported-by: Bruno Prémont Reported-by: Dunphy, Bill Signed-off-by: Ryusuke Konishi Tested-by: Bruno Prémont Cc: stable --- fs/nilfs2/btnode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 5941958..435864c 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -276,8 +276,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); - if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage)) - BUG(); + nilfs_btnode_mark_dirty(obh); spin_lock_irq(&btnc->tree_lock); radix_tree_delete(&btnc->page_tree, oldkey); -- cgit v1.1 From aeda7f6343e6375a832e52ff5ed389c115023ca5 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 2 Nov 2009 15:08:13 +0900 Subject: nilfs2: fix irregular checkpoint creation due to data flush When nilfs flushes out dirty data to reduce memory pressure, creation of checkpoints is wrongly postponed. This bug causes irregular checkpoint creation especially in small footprint systems. To correct this issue, a timer for the checkpoint creation has to be continued if a log writer does not create a checkpoint. This will do the correction. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/segment.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 683df89..6eff66a 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2468,17 +2468,22 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, /* Clear requests (even when the construction failed) */ spin_lock(&sci->sc_state_lock); - sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; - if (req->mode == SC_LSEG_SR) { + sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; sci->sc_seq_done = req->seq_accepted; nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); sci->sc_flush_request = 0; - } else if (req->mode == SC_FLUSH_FILE) - sci->sc_flush_request &= ~FLUSH_FILE_BIT; - else if (req->mode == SC_FLUSH_DAT) - sci->sc_flush_request &= ~FLUSH_DAT_BIT; + } else { + if (req->mode == SC_FLUSH_FILE) + sci->sc_flush_request &= ~FLUSH_FILE_BIT; + else if (req->mode == SC_FLUSH_DAT) + sci->sc_flush_request &= ~FLUSH_DAT_BIT; + /* re-enable timer if checkpoint creation was not done */ + if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_before(jiffies, sci->sc_timer->expires)) + add_timer(sci->sc_timer); + } spin_unlock(&sci->sc_state_lock); } -- cgit v1.1 From 05b4358ad564d7a6a51b3717afe771d36711e9c4 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 14 Sep 2009 01:20:35 +0900 Subject: nilfs2: add zero-fill for new btree node buffers Adds missing initialization of newly allocated b-tree node buffers. This avoids garbage data to be mixed in b-tree node blocks. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/btnode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 435864c..84c2538 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -87,6 +87,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, brelse(bh); BUG(); } + memset(bh->b_data, 0, 1 << inode->i_blkbits); bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); -- cgit v1.1 From 109f55651954def97fa41ee71c464d268c512ab0 Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 10 Nov 2009 10:48:08 -0500 Subject: ext4: fix ext4_ext_direct_IO()'s return value after converting uninit extents After a direct I/O request covering an uninitalized extent (i.e., created using the fallocate system call) or a hole in a file, ext4 will convert the uninitialized extent so it is marked as initialized by calling ext4_convert_unwritten_extents(). This function returns zero on success. This return value was getting returned by ext4_direct_IO(); however the file system's direct_IO function is supposed to return the number of bytes read or written on a success. By returning zero, it confused the direct I/O code into falling back to buffered I/O unnecessarily. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 1 + fs/ext4/inode.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 10539e3..441716f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3519,6 +3519,7 @@ retry: * * This function is called from the direct IO end io call back * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. */ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, loff_t len) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d1ec698..12d727f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3772,13 +3772,17 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0) + } else if (ret > 0) { + int err; /* * for non AIO case, since the IO is already * completed, we could do the convertion right here */ - ret = ext4_convert_unwritten_extents(inode, - offset, ret); + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + } return ret; } -- cgit v1.1 From 5f5249507e4b5c4fc0f9c93f33d133d8c95f47e1 Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 10 Nov 2009 10:48:04 -0500 Subject: ext4: skip conversion of uninit extents after direct IO if there isn't any At the end of direct I/O operation, ext4_ext_direct_IO() always called ext4_convert_unwritten_extents(), regardless of whether there were any unwritten extents involved in the I/O or not. This commit adds a state flag so that ext4_ext_direct_IO() only calls ext4_convert_unwritten_extents() when necessary. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 22 +++++++++++++++++----- fs/ext4/inode.c | 4 +++- 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 00d153f..8825515 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -322,6 +322,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ #define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ +#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 441716f..e991ae2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3048,12 +3048,18 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_split_unwritten_extents(handle, inode, path, iblock, max_blocks, flags); - /* flag the io_end struct that we need convert when IO done */ + /* + * Flag the inode(non aio case) or end_io struct (aio case) + * that this IO needs to convertion to written when IO is + * completed + */ if (io) io->flag = DIO_AIO_UNWRITTEN; + else + EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; goto out; } - /* DIO end_io complete, convert the filled extent to written */ + /* async DIO end_io complete, convert the filled extent to written */ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { ret = ext4_convert_unwritten_extents_dio(handle, inode, path); @@ -3295,10 +3301,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * To avoid unecessary convertion for every aio dio rewrite * to the mid of file, here we flag the IO that is really * need the convertion. - * + * For non asycn direct IO case, flag the inode state + * that we need to perform convertion when IO is done. */ - if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) - io->flag = DIO_AIO_UNWRITTEN; + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + if (io) + io->flag = DIO_AIO_UNWRITTEN; + else + EXT4_I(inode)->i_state |= + EXT4_STATE_DIO_UNWRITTEN;; + } } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 12d727f..a9ed2bc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3772,7 +3772,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0) { + } else if (ret > 0 && (EXT4_I(inode)->i_state & + EXT4_STATE_DIO_UNWRITTEN)) { int err; /* * for non AIO case, since the IO is already @@ -3782,6 +3783,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, offset, ret); if (err < 0) ret = err; + EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; } return ret; } -- cgit v1.1 From 4b70df181611012a3556f017b57dfcef7e1d279f Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 3 Nov 2009 14:44:54 -0500 Subject: ext4: code clean up for dio fallocate handling The ext4_debug() call in ext4_end_io_dio() should be moved after the check to make sure that io_end is non-NULL. The comment above ext4_get_block_dio_write() ("Maximum number of blocks...") is a duplicate; the original and correct comment is above the #define DIO_MAX_BLOCKS up above. Based on review comments from Curt Wohlgemuth. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a9ed2bc..2c8caa5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3446,8 +3446,6 @@ out: return ret; } -/* Maximum number of blocks we map for direct IO at once. */ - static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -3655,13 +3653,14 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) + return; + ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - return; /* if not aio dio with unwritten extents, just free io and return */ if (io_end->flag != DIO_AIO_UNWRITTEN){ -- cgit v1.1 From f60311d5f7670d9539b424e4ed8b5c0872fc9e83 Mon Sep 17 00:00:00 2001 From: "Anand V. Avati" Date: Thu, 22 Oct 2009 06:24:52 -0700 Subject: fuse: prevent fuse_put_request on invalid pointer fuse_direct_io() has a loop where requests are allocated in each iteration. if allocation fails, the loop is broken out and follows into an unconditional fuse_put_request() on that invalid pointer. Signed-off-by: Anand V. Avati Signed-off-by: Miklos Szeredi Cc: stable@kernel.org --- fs/fuse/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a3492f7..5887a63 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1063,7 +1063,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, break; } } - fuse_put_request(fc, req); + if (!IS_ERR(req)) + fuse_put_request(fc, req); if (res > 0) *ppos = pos; -- cgit v1.1 From 0bd87182d3ab18a32a8e9175d3f68754c58e3432 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Nov 2009 11:40:44 +0100 Subject: fuse: fix kunmap in fuse_ioctl_copy_user Looks like another victim of the confusing kmap() vs kmap_atomic() API differences. Reported-by: Todor Gyumyushev Signed-off-by: Jens Axboe Signed-off-by: Miklos Szeredi Cc: Tejun Heo Cc: stable@kernel.org --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5887a63..c18913a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1600,7 +1600,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, kaddr += copy; } - kunmap(map); + kunmap(page); } return 0; -- cgit v1.1 From 5219f346b0ea2a2a8821f1e966b190788c285b0b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 4 Nov 2009 10:24:52 +0100 Subject: fuse: invalidate target of rename Invalidate the target's attributes, which may have changed (such as nlink, change time) so that they are refreshed on the next getattr(). Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 992f6c9..8ada78a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -712,8 +712,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) + if (newent->d_inode) { + fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation -- cgit v1.1 From 89240ba059ca468ae7a8346edf7f95082458c2fc Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Tue, 3 Nov 2009 10:22:40 +0100 Subject: x86, fs: Fix x86 procfs stack information for threads on 64-bit This patch fixes two issues in the procfs stack information on x86-64 linux. The 32 bit loader compat_do_execve did not store stack start. (this was figured out by Alexey Dobriyan). The stack information on a x64_64 kernel always shows 0 kbyte stack usage, because of a missing implementation of the KSTK_ESP macro which always returned -1. The new implementation now returns the right value. Signed-off-by: Stefani Seibold Cc: Americo Wang Cc: Alexey Dobriyan Cc: Al Viro Cc: Andrew Morton LKML-Reference: <1257240160.4889.24.camel@wall-e> Signed-off-by: Ingo Molnar --- fs/compat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index d576b55..6c19040 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1532,6 +1532,8 @@ int compat_do_execve(char * filename, if (retval < 0) goto out; + current->stack_start = current->mm->start_stack; + /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; -- cgit v1.1 From 4c3da2209b1261af9a948b7509a38904c8eee554 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Nov 2009 02:50:06 -0800 Subject: sysfs: Don't leak secdata when a sysfs_dirent is freed. While refreshing my sysfs patches I noticed a leak in the secdata implementation. We don't free the secdata when we free the sysfs dirent. This is a bug in 2.6.32-rc5 that we really should close. Signed-off-by: Eric W. Biederman Acked-by: Serge Hallyn Signed-off-by: James Morris --- fs/sysfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 5fad489..e020183 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "sysfs.h" DEFINE_MUTEX(sysfs_mutex); @@ -285,6 +286,9 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) sysfs_put(sd->s_symlink.target_sd); if (sysfs_type(sd) & SYSFS_COPY_NAME) kfree(sd->s_name); + if (sd->s_iattr && sd->s_iattr->ia_secdata) + security_release_secctx(sd->s_iattr->ia_secdata, + sd->s_iattr->ia_secdata_len); kfree(sd->s_iattr); sysfs_free_ino(sd->s_ino); kmem_cache_free(sysfs_dir_cachep, sd); -- cgit v1.1 From ba230c3f6dc88ec008806adb27b12088486d508e Mon Sep 17 00:00:00 2001 From: Mingming Date: Fri, 6 Nov 2009 04:01:23 -0500 Subject: ext4: Fix return value of ext4_split_unwritten_extents() to fix direct I/O To prepare for a direct I/O write, we need to split the unwritten extents before submitting the I/O. When no extents needed to be split, ext4_split_unwritten_extents() was incorrectly returning 0 instead of the size of uninitialized extents. This bug caused the wrong return value sent back to VFS code when it gets called from async IO path, leading to an unnecessary fall back to buffered IO. This bug also hid the fact that the check to see whether or not a split would be necessary was incorrect; we can only skip splitting the extent if the write completely covers the uninitialized extent. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e991ae2..715264b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2807,6 +2807,8 @@ fix_extent_len: * into three uninitialized extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). + * + * Returns the size of uninitialized extent to be written on success. */ static int ext4_split_unwritten_extents(handle_t *handle, struct inode *inode, @@ -2824,7 +2826,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; int err = 0; - int ret = 0; ext_debug("ext4_split_unwritten_extents: inode %lu," "iblock %llu, max_blocks %u\n", inode->i_ino, @@ -2842,12 +2843,12 @@ static int ext4_split_unwritten_extents(handle_t *handle, ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); /* - * if the entire unintialized extent length less than - * the size of extent to write, there is no need to split - * uninitialized extent + * If the uninitialized extent begins at the same logical + * block where the write begins, and the write completely + * covers the extent, then we don't need to split it. */ - if (allocated <= max_blocks) - return ret; + if ((iblock == ee_block) && (allocated <= max_blocks)) + return allocated; err = ext4_ext_get_access(handle, inode, path + depth); if (err) -- cgit v1.1 From ec06aedd44541129840ed52e6165afa3796a27bf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 6 Nov 2009 14:18:29 -0500 Subject: cifs: clean up handling when server doesn't consistently support inode numbers It's possible that a server will return a valid FileID when we query the FILE_INTERNAL_INFO for the root inode, but then zeroed out inode numbers when we do a FindFile with an infolevel of SMB_FIND_FILE_ID_FULL_DIR_INFO. In this situation turn off querying for server inode numbers, generate a warning for the user and just generate an inode number using iunique. Once we generate any inode number with iunique we can no longer use any server inode numbers or we risk collisions, so ensure that we don't do that in cifs_get_inode_info either. Cc: Stable Reported-by: Timothy Normand Miller Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 1 + fs/cifs/inode.c | 7 ++----- fs/cifs/misc.c | 14 ++++++++++++++ fs/cifs/readdir.c | 7 ++++--- 4 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 6928c24..5646727 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -388,4 +388,5 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon, const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); +extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 5e24925..cababd8 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -512,13 +512,10 @@ int cifs_get_inode_info(struct inode **pinode, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc1) { + if (rc1 || !fattr.cf_uniqueid) { cFYI(1, ("GetSrvInodeNum rc %d", rc1)); fattr.cf_uniqueid = iunique(sb, ROOT_I); - /* disable serverino if call not supported */ - if (rc1 == -EINVAL) - cifs_sb->mnt_cifs_flags &= - ~CIFS_MOUNT_SERVER_INUM; + cifs_autodisable_serverino(cifs_sb); } } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 0241b25..1e25efc 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -715,3 +715,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen, ctoUCS_out: return i; } + +void +cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cifs_sb->mnt_cifs_flags &= CIFS_MOUNT_SERVER_INUM; + cERROR(1, ("Autodisabling the use of server inode numbers on " + "%s. This server doesn't seem to support them " + "properly. Hardlinks will not be recognized on this " + "mount. Consider mounting with the \"noserverino\" " + "option to silence this message.", + cifs_sb->tcon->treeName)); + } +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 1f098ca..f84062f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -727,11 +727,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) pfindEntry, cifs_sb); - /* FIXME: make _to_fattr functions fill this out */ - if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO) + if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { fattr.cf_uniqueid = inum; - else + } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); -- cgit v1.1 From f475f6775465283494346663f201ad04810d2e8a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 6 Nov 2009 14:18:49 -0500 Subject: cifs: don't use CIFSGetSrvInodeNumber in is_path_accessible Because it's lighter weight, CIFS tries to use CIFSGetSrvInodeNumber to verify the accessibility of the root inode and then falls back to doing a full QPathInfo if that fails with -EOPNOTSUPP. I have at least a report of a server that returns NT_STATUS_INTERNAL_ERROR rather than something that translates to EOPNOTSUPP. Rather than trying to be clever with that call, just have is_path_accessible do a normal QPathInfo. That call is widely supported and it shouldn't increase the overhead significantly. Cc: Stable Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b090980..63ea83f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2220,16 +2220,8 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) { int rc; - __u64 inode_num; FILE_ALL_INFO *pfile_info; - rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != -EOPNOTSUPP) - return rc; - pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (pfile_info == NULL) return -ENOMEM; -- cgit v1.1 From 5399dd1fc8f5e812db931225ef5f67d89f3b1a56 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 7 Nov 2009 18:45:16 +0900 Subject: nilfs2: fix kernel oops in error case of nilfs_ioctl_move_blocks This fixes a kernel oops reported by Markus Trippelsdorf in the email titled "[NILFS users] kernel Oops while running nilfs_cleanerd". The oops was caused by a bug of error path in nilfs_ioctl_move_blocks() function, which was inlined in nilfs_ioctl_clean_segments(). nilfs_ioctl_move_blocks checks duplication of blocks which will be moved in garbage collection. But, the check should have be done within nilfs_ioctl_move_inode_block() to prevent list corruption among buffers storing the target blocks. To fix the kernel oops, this moves forward the duplication check before the list insertion. I also tested this for stable trees [2.6.30, 2.6.31]. Reported-by: Markus Trippelsdorf Signed-off-by: Ryusuke Konishi Cc: stable --- fs/nilfs2/ioctl.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 6572ea4..89dd73e 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -297,7 +297,18 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, (unsigned long long)vdesc->vd_vblocknr); return ret; } - bh->b_private = vdesc; + if (unlikely(!list_empty(&bh->b_assoc_buffers))) { + printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, " + "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + brelse(bh); + return -EEXIST; + } list_add_tail(&bh->b_assoc_buffers, buffers); return 0; } @@ -335,24 +346,10 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { ret = nilfs_gccache_wait_and_mark_dirty(bh); if (unlikely(ret < 0)) { - if (ret == -EEXIST) { - vdesc = bh->b_private; - printk(KERN_CRIT - "%s: conflicting %s buffer: " - "ino=%llu, cno=%llu, offset=%llu, " - "blocknr=%llu, vblocknr=%llu\n", - __func__, - vdesc->vd_flags ? "node" : "data", - (unsigned long long)vdesc->vd_ino, - (unsigned long long)vdesc->vd_cno, - (unsigned long long)vdesc->vd_offset, - (unsigned long long)vdesc->vd_blocknr, - (unsigned long long)vdesc->vd_vblocknr); - } + WARN_ON(ret == -EEXIST); goto failed; } list_del_init(&bh->b_assoc_buffers); - bh->b_private = NULL; brelse(bh); } return nmembs; @@ -360,7 +357,6 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, failed: list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { list_del_init(&bh->b_assoc_buffers); - bh->b_private = NULL; brelse(bh); } return ret; -- cgit v1.1 From c083234f1592ef3fad3d8083663c5e4a357ec77c Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 8 Nov 2009 12:09:24 +0900 Subject: nilfs2: fix missing cleanup of gc cache on error cases This fixes an -rc1 regression brought by the commit: 1cf58fa840472ec7df6bf2312885949ebb308853 ("nilfs2: shorten freeze period due to GC in write operation v3"). Although the patch moved out a function call of nilfs_ioctl_move_blocks() to nilfs_ioctl_clean_segments() from nilfs_ioctl_prepare_clean_segments(), it didn't move corresponding cleanup job needed for the error case. This will move the missing cleanup job to the destination function. Signed-off-by: Ryusuke Konishi Acked-by: Jiro SEKIBA --- fs/nilfs2/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 89dd73e..d24057d 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -467,7 +467,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, return 0; failed: - nilfs_remove_all_gcinode(nilfs); printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", msg, ret); return ret; @@ -556,6 +555,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, else ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + if (ret < 0) + nilfs_remove_all_gcinode(nilfs); clear_nilfs_gc_running(nilfs); out_free: -- cgit v1.1 From 1e424a348303694fabdf8b1efbfcb1a892dfa63a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 8 Nov 2009 15:45:44 -0500 Subject: ext4: partial revert to fix double brelse WARNING() This is a partial revert of commit 6487a9d (only the changes made to fs/ext4/namei.c), since it is causing the following brelse() double-free warning when running fsstress on a file system with 1k blocksize and we run into a block allocation failure while converting a single-block directory to a multi-block hash-tree indexed directory. WARNING: at fs/buffer.c:1197 __brelse+0x2e/0x33() Hardware name: VFS: brelse: Trying to free free buffer Modules linked in: Pid: 2226, comm: jbd2/sdd-8 Not tainted 2.6.32-rc6-00577-g0003f55 #101 Call Trace: [] warn_slowpath_common+0x65/0x95 [] warn_slowpath_fmt+0x29/0x2c [] __brelse+0x2e/0x33 [] jbd2_journal_refile_buffer+0x67/0x6c [] jbd2_journal_commit_transaction+0x319/0x14d8 [] ? try_to_del_timer_sync+0x58/0x60 [] ? sched_clock_cpu+0x12a/0x13e [] ? trace_hardirqs_off+0xb/0xd [] ? cpu_clock+0x3f/0x5b [] ? lock_release_holdtime+0x36/0x137 [] ? _spin_unlock_irqrestore+0x44/0x51 [] ? trace_hardirqs_on_caller+0x103/0x124 [] ? trace_hardirqs_on+0xb/0xd [] ? try_to_del_timer_sync+0x58/0x60 [] kjournald2+0x11a/0x310 [] ? autoremove_wake_function+0x0/0x38 [] ? kjournald2+0x0/0x310 [] kthread+0x66/0x6b [] ? kthread+0x0/0x6b [] kernel_thread_helper+0x7/0x10 ---[ end trace 5579351b86af61e3 ]--- Commit 6487a9d was an attempt some buffer head leaks in an ENOSPC error path, but in some cases it actually results in an excess ENOSPC, as shown above. Fixing this means cleaning up who is responsible for releasing the buffer heads from the callee to the caller of add_dirent_to_buf(). Since that's a relatively complex change, and we're late in the rcX development cycle, I'm reverting this now, and holding back a more complete fix until after 2.6.32 ships. We've lived with this buffer_head leak on ENOSPC in ext3 and ext4 for a very long time; a few more months won't kill us. Signed-off-by: "Theodore Ts'o" Cc: Curt Wohlgemuth --- fs/ext4/namei.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7c8fe80..6d2c1b8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1518,12 +1518,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { - retval = make_indexed_dir(handle, dentry, inode, bh); - if (retval == -ENOSPC) - brelse(bh); - return retval; - } + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); brelse(bh); } bh = ext4_append(handle, dir, &block, &retval); @@ -1532,10 +1528,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); - if (retval == -ENOSPC) - brelse(bh); - return retval; + return add_dirent_to_buf(handle, dentry, inode, de, bh); } /* @@ -1664,8 +1657,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - if (err != -ENOSPC) - bh = NULL; + bh = NULL; goto cleanup; journal_error: -- cgit v1.1 From ea0174a7137c8ca9f130ca681f3a99c872da6778 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 12 Oct 2009 21:34:27 -0500 Subject: ext3: retry failed direct IO allocations On a 256M 4k block filesystem, doing this in a loop: dd if=/dev/zero of=test oflag=direct bs=1M count=64 rm -f test eventually leads to spurious ENOSPC: dd: writing `test': No space left on device As with other block allocation callers, it looks like we need to potentially retry the allocations on the initial ENOSPC. A similar patch went into ext4 (commit fbbf69456619de5d251cb9f1df609069178c62d5) Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index acf1b14..069a163 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1735,6 +1735,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, ssize_t ret; int orphan = 0; size_t count = iov_length(iov, nr_segs); + int retries = 0; if (rw == WRITE) { loff_t final_size = offset + count; @@ -1757,9 +1758,12 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } } +retry: ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext3_get_block, NULL); + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (orphan) { int err; -- cgit v1.1 From fe8bc91c4c30122b357d197117705cfd4fabaf28 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Oct 2009 19:26:15 +0200 Subject: ext3: Wait for proper transaction commit on fsync We cannot rely on buffer dirty bits during fsync because pdflush can come before fsync is called and clear dirty bits without forcing a transaction commit. What we do is that we track which transaction has last changed the inode and which transaction last changed allocation and force it to disk on fsync. Signed-off-by: Jan Kara Reviewed-by: Aneesh Kumar K.V --- fs/ext3/fsync.c | 36 ++++++++++++++++-------------------- fs/ext3/inode.c | 32 +++++++++++++++++++++++++++++++- fs/ext3/super.c | 2 ++ 3 files changed, 49 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 451d166..8209f26 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -46,19 +46,21 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + struct ext3_inode_info *ei = EXT3_I(inode); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; int ret = 0; + tid_t commit_tid; + + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; J_ASSERT(ext3_journal_current_handle() == NULL); /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for a proper transaction + * to commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -73,22 +75,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) goto out; } - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto flush; + if (datasync) + commit_tid = atomic_read(&ei->i_datasync_tid); + else + commit_tid = atomic_read(&ei->i_sync_tid); - /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. - */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - ret = sync_inode(inode, &wbc); + if (log_start_commit(journal, commit_tid)) { + log_wait_commit(journal, commit_tid); goto out; } -flush: + /* * In case we didn't commit a transaction, we have to flush * disk caches manually so that data really is on persistent diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 069a163..354ed3b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -699,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, int err = 0; struct ext3_block_alloc_info *block_i; ext3_fsblk_t current_block; + struct ext3_inode_info *ei = EXT3_I(inode); - block_i = EXT3_I(inode)->i_block_alloc_info; + block_i = ei->i_block_alloc_info; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block @@ -741,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); + /* ext3_mark_inode_dirty already updated i_sync_tid */ + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); /* had we spliced it onto indirect block? */ if (where->bh) { @@ -2754,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) struct ext3_inode_info *ei; struct buffer_head *bh; struct inode *inode; + journal_t *journal = EXT3_SB(sb)->s_journal; + transaction_t *transaction; long ret; int block; @@ -2831,6 +2836,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + atomic_set(&ei->i_sync_tid, tid); + atomic_set(&ei->i_datasync_tid, tid); + } + if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { /* @@ -3015,6 +3044,7 @@ again: err = rc; ei->i_state &= ~EXT3_STATE_NEW; + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); out_brelse: brelse (bh); ext3_std_error(inode->i_sb, err); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7a520a8..427496c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -466,6 +466,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return NULL; ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + atomic_set(&ei->i_datasync_tid, 0); + atomic_set(&ei->i_sync_tid, 0); return &ei->vfs_inode; } -- cgit v1.1 From 7b02bec07efe1d6c7d48c786e0c1a38d28fe7245 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 10 Nov 2009 17:13:22 +0800 Subject: JBD/JBD2: free j_wbuf if journal init fails. If journal init fails, we need to free j_wbuf. Cc: Andrew Morton Cc: Jan Kara Signed-off-by: Tao Ma Signed-off-by: Jan Kara --- fs/jbd/journal.c | 2 ++ fs/jbd2/journal.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index bd3c073..49d5cd6 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -756,6 +756,7 @@ journal_t * journal_init_dev(struct block_device *bdev, return journal; out_err: + kfree(journal->j_wbuf); kfree(journal); return NULL; } @@ -820,6 +821,7 @@ journal_t * journal_init_inode (struct inode *inode) return journal; out_err: + kfree(journal->j_wbuf); kfree(journal); return NULL; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b0ab521..fed85388 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -913,6 +913,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, return journal; out_err: + kfree(journal->j_wbuf); jbd2_stats_proc_exit(journal); kfree(journal); return NULL; @@ -986,6 +987,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) return journal; out_err: + kfree(journal->j_wbuf); jbd2_stats_proc_exit(journal); kfree(journal); return NULL; -- cgit v1.1 From 6346c93988caa3048bf4d81f9ba3608a7a195aa2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:47 -0500 Subject: Btrfs: fix data allocation hint start Sometimes our start allocation hint when we cow a file can be either EXTENT_HOLE or some other such place holder, which is not optimal. So if we find that our em->block_start is one of these special values, check to see where the first block of the inode is stored, and use that as a hint. If that block is also a special value, just fallback on a hint of 0 and let the allocator figure out a good place to put the data. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78139ef..d8393dd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -743,8 +743,22 @@ static noinline int cow_file_range(struct inode *inode, em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, start, num_bytes); if (em) { - alloc_hint = em->block_start; - free_extent_map(em); + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } } read_unlock(&BTRFS_I(inode)->extent_tree.lock); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); -- cgit v1.1 From 249ac1e55c642c670f47aacdc57629bbbf10a8db Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: cleanup transaction starting and fix journal_info usage We use journal_info to tell if we're in a nested transaction to make sure we don't commit the transaction within a nested transaction. We use another method to see if there are any outstanding ioctl trans handles, so if we're starting one do not set current->journal_info, since it will screw with other filesystems. This patch also cleans up the starting stuff so there aren't any magic numbers. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bca82a4..c207e8c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -163,8 +163,14 @@ static void wait_current_trans(struct btrfs_root *root) } } +enum btrfs_trans_type { + TRANS_START, + TRANS_JOIN, + TRANS_USERSPACE, +}; + static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - int num_blocks, int wait) + int num_blocks, int type) { struct btrfs_trans_handle *h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); @@ -172,7 +178,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, mutex_lock(&root->fs_info->trans_mutex); if (!root->fs_info->log_root_recovering && - ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) + ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || + type == TRANS_USERSPACE)) wait_current_trans(root); ret = join_transaction(root); BUG_ON(ret); @@ -186,7 +193,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->alloc_exclude_start = 0; h->delayed_ref_updates = 0; - if (!current->journal_info) + if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; root->fs_info->running_transaction->use_count++; @@ -198,18 +205,18 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_blocks) { - return start_transaction(root, num_blocks, 1); + return start_transaction(root, num_blocks, TRANS_START); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, int num_blocks) { - return start_transaction(root, num_blocks, 0); + return start_transaction(root, num_blocks, TRANS_JOIN); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, int num_blocks) { - return start_transaction(r, num_blocks, 2); + return start_transaction(r, num_blocks, TRANS_USERSPACE); } /* wait for a transaction commit to be fully complete */ -- cgit v1.1 From 01dea1efc23b511d3b58bb94da07ddb6d6db9895 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: fix how we set max_size for free space clusters This patch fixes a problem where max_size can be set to 0 even though we filled the cluster properly. We set max_size to 0 if we restart the cluster window, but if the new start entry is big enough to be our new cluster then we could return with a max_size set to 0, which will mean the next time we try to allocate from this cluster it will fail. So set max_extent to the entry's size. Tested this on my box and now we actually allocate from the cluster after we fill it. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5c2caad..cb2849f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1296,7 +1296,7 @@ again: window_start = entry->offset; window_free = entry->bytes; last = entry; - max_extent = 0; + max_extent = entry->bytes; } else { last = next; window_free += next->bytes; -- cgit v1.1 From 5df6a9f606bf2ee25ab8031bff124ed883b823be Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: fix some metadata enospc issues We weren't reserving metadata space for rename, rmdir and unlink, which could cause problems. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d8393dd..bb7fd80 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2488,7 +2488,19 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) root = BTRFS_I(dir)->root; + /* + * 5 items for unlink inode + * 1 for orphan + */ + ret = btrfs_reserve_metadata_space(root, 6); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_unreserve_metadata_space(root, 6); + return PTR_ERR(trans); + } btrfs_set_trans_block_group(trans, dir); @@ -2503,6 +2515,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 6); btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2583,7 +2596,16 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -ENOTEMPTY; + ret = btrfs_reserve_metadata_space(root, 5); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_unreserve_metadata_space(root, 5); + return PTR_ERR(trans); + } + btrfs_set_trans_block_group(trans, dir); if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { @@ -2606,6 +2628,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) out: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 5); btrfs_btree_balance_dirty(root, nr); if (ret && !err) @@ -5297,11 +5320,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, return -ENOTEMPTY; /* - * 2 items for dir items - * 1 item for orphan entry - * 1 item for ref + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. */ - ret = btrfs_reserve_metadata_space(root, 4); + ret = btrfs_reserve_metadata_space(root, 11); if (ret) return ret; @@ -5417,7 +5443,7 @@ out_fail: if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); - btrfs_unreserve_metadata_space(root, 4); + btrfs_unreserve_metadata_space(root, 11); return ret; } -- cgit v1.1 From df66916e71231e9f2377cac9c5c1e2d190f9a427 Mon Sep 17 00:00:00 2001 From: Li Dongyang Date: Fri, 6 Nov 2009 14:33:01 +0000 Subject: Btrfs: skip btrfs_release_path in btrfs_update_root and btrfs_del_root We don't need to call btrfs_release_path because btrfs_free_path will do that for us. Signed-off-by: Li Dongyang Signed-off-by: Chris Mason --- fs/btrfs/root-tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 9351428..67fa2d2 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -159,7 +159,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root write_extent_buffer(l, item, ptr, sizeof(*item)); btrfs_mark_buffer_dirty(path->nodes[0]); out: - btrfs_release_path(root, path); btrfs_free_path(path); return ret; } @@ -332,7 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, BUG_ON(refs != 0); ret = btrfs_del_item(trans, root, path); out: - btrfs_release_path(root, path); btrfs_free_path(path); return ret; } -- cgit v1.1 From 4eb3991c5def39bcf553c14ebe2618fcb47b627f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 10 Nov 2009 09:01:43 +0000 Subject: Btrfs: avoid null deref in unpin_extent_cache() I re-orderred the checks to avoid dereferencing "em" if it was null. Found by smatch static checker. Signed-off-by: Dan Carpenter Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2c726b7..ccbdcb5 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -208,7 +208,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); - WARN_ON(em->start != start || !em); + WARN_ON(!em || em->start != start); if (!em) goto out; -- cgit v1.1 From ccf0e72537a9f68611ca575121afd08e2b4d0fb0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: find ideal block group for caching This patch changes a few things. Hopefully the comments are helpfull, but I'll try and be as verbose here. Problem: My fedora box was taking 1 minute and 21 seconds to boot with btrfs as root. Part of this problem was we pick the first block group we can find and start caching it, even if it may not have enough free space. The other problem is we only search for cached block groups the first time around, which we won't find any cached block groups because this is a newly mounted fs, so we end up caching several block groups during bootup, which with alot of fragmentation takes around 30-45 seconds to complete, which bogs down the system. So Solution: 1) Don't cache block groups willy-nilly at first. Instead try and figure out which block group has the most free, and therefore will take the least amount of time to cache. 2) Don't be so picky about cached block groups. The other problem is once we've filled up a cluster, if the block group isn't finished caching the next time we try and do the allocation we'll completely ignore the cluster and start searching from the beginning of the space, which makes us cache more block groups, which slows us down even more. So instead of skipping block groups that are not finished caching when we have a hint, only skip the block group if it hasn't started caching yet. There is one other tweak in here. Before if we allocated a chunk and still couldn't find new space, we'd end up switching the space info to force another chunk allocation. This could make us end up with way too many chunks, so keep track of this particular case. With this patch and my previous cluster fixes my fedora box now boots in 43 seconds, and according to the bootchart is not held up by our block group caching at all. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 109 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c56f916..2a4cdce 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4101,7 +4101,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) } enum btrfs_loop_type { - LOOP_CACHED_ONLY = 0, + LOOP_FIND_IDEAL = 0, LOOP_CACHING_NOWAIT = 1, LOOP_CACHING_WAIT = 2, LOOP_ALLOC_CHUNK = 3, @@ -4130,12 +4130,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group = NULL; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; + int done_chunk_alloc = 0; struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; + u64 ideal_cache_percent = 0; + u64 ideal_cache_offset = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -4171,14 +4174,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { +ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. + * + * However if we are re-searching with an ideal block group + * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, data) && - block_group_cache_done(block_group)) { + (block_group->cached != BTRFS_CACHE_NO || + search_start == ideal_cache_offset)) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -4190,13 +4198,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, */ btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); - } else + } else { goto have_block_group; + } } else if (block_group) { btrfs_put_block_group(block_group); } } - search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups, list) { @@ -4208,28 +4216,45 @@ search: have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + u64 free_percent; + + free_percent = btrfs_block_group_used(&block_group->item); + free_percent *= 100; + free_percent = div64_u64(free_percent, + block_group->key.offset); + free_percent = 100 - free_percent; + if (free_percent > ideal_cache_percent && + likely(!block_group->ro)) { + ideal_cache_offset = block_group->key.objectid; + ideal_cache_percent = free_percent; + } + /* - * we want to start caching kthreads, but not too many - * right off the bat so we don't overwhelm the system, - * so only start them if there are less than 2 and we're - * in the initial allocation phase. + * We only want to start kthread caching if we are at + * the point where we will wait for caching to make + * progress, or if our ideal search is over and we've + * found somebody to start caching. */ if (loop > LOOP_CACHING_NOWAIT || - atomic_read(&space_info->caching_threads) < 2) { + (loop > LOOP_FIND_IDEAL && + atomic_read(&space_info->caching_threads) < 2)) { ret = cache_block_group(block_group); BUG_ON(ret); } - } - - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) { found_uncached_bg = true; - /* if we only want cached bgs, loop */ - if (loop == LOOP_CACHED_ONLY) + /* + * If loop is set for cached only, try the next block + * group. + */ + if (loop == LOOP_FIND_IDEAL) goto loop; } + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) + found_uncached_bg = true; + if (unlikely(block_group->ro)) goto loop; @@ -4409,9 +4434,11 @@ loop: } up_read(&space_info->groups_sem); - /* LOOP_CACHED_ONLY, only search fully cached block groups - * LOOP_CACHING_NOWAIT, search partially cached block groups, but - * dont wait foR them to finish caching + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for + * for them to make caching progress. Also + * determine the best possible bg to cache + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try @@ -4420,12 +4447,47 @@ loop: if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && (found_uncached_bg || empty_size || empty_cluster || allowed_chunk_alloc)) { - if (found_uncached_bg) { + if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; - if (loop < LOOP_CACHING_WAIT) { - loop++; + loop++; + if (!ideal_cache_percent && + atomic_read(&space_info->caching_threads)) goto search; - } + + /* + * 1 of the following 2 things have happened so far + * + * 1) We found an ideal block group for caching that + * is mostly full and will cache quickly, so we might + * as well wait for it. + * + * 2) We searched for cached only and we didn't find + * anything, and we didn't start any caching kthreads + * either, so chances are we will loop through and + * start a couple caching kthreads, and then come back + * around and just wait for them. This will be slower + * because we will have 2 caching kthreads reading at + * the same time when we could have just started one + * and waited for it to get far enough to give us an + * allocation, so go ahead and go to the wait caching + * loop. + */ + loop = LOOP_CACHING_WAIT; + search_start = ideal_cache_offset; + ideal_cache_percent = 0; + goto ideal_cache; + } else if (loop == LOOP_FIND_IDEAL) { + /* + * Didn't find a uncached bg, wait on anything we find + * next. + */ + loop = LOOP_CACHING_WAIT; + goto search; + } + + if (loop < LOOP_CACHING_WAIT) { + loop++; + goto search; } if (loop == LOOP_ALLOC_CHUNK) { @@ -4437,7 +4499,8 @@ loop: ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, data, 1); allowed_chunk_alloc = 0; - } else { + done_chunk_alloc = 1; + } else if (!done_chunk_alloc) { space_info->force_alloc = 1; } -- cgit v1.1 From f5a84ee3cdd88d96b7bcede10af58598ad8d52a7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: fallback on uncompressed io if compressed io fails Currently compressed IO does not deal with not having its entire extent able to be allocated. So if we have enough free space to allocate for the extent, but its not contiguous, it will fail spectacularly. This patch fixes this by falling back on uncompressed IO which lets us spread the delalloc extent across multiple extents. I tested this by making us randomly think the reservation had failed to make it fallback on the uncompressed io way and it seemed to work fine. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bb7fd80..d3d7d46 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -538,7 +538,7 @@ static noinline int submit_compressed_extents(struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree; - int ret; + int ret = 0; if (list_empty(&async_cow->extents)) return 0; @@ -552,6 +552,7 @@ static noinline int submit_compressed_extents(struct inode *inode, io_tree = &BTRFS_I(inode)->io_tree; +retry: /* did the compression code fall back to uncompressed IO? */ if (!async_extent->pages) { int page_started = 0; @@ -562,11 +563,11 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->ram_size - 1, GFP_NOFS); /* allocate blocks */ - cow_file_range(inode, async_cow->locked_page, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0); + ret = cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); /* * if page_started, cow_file_range inserted an @@ -574,7 +575,7 @@ static noinline int submit_compressed_extents(struct inode *inode, * and IO for us. Otherwise, we need to submit * all those pages down to the drive. */ - if (!page_started) + if (!page_started && !ret) extent_write_locked_range(io_tree, inode, async_extent->start, async_extent->start + @@ -602,7 +603,21 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->compressed_size, 0, alloc_hint, (u64)-1, &ins, 1); - BUG_ON(ret); + if (ret) { + int i; + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + page_cache_release(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + goto retry; + } + em = alloc_extent_map(GFP_NOFS); em->start = async_extent->start; em->len = async_extent->ram_size; -- cgit v1.1 From 33b258086441dd07e00133c79fcd8cbc6a76d737 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 11 Nov 2009 10:16:57 -0500 Subject: Btrfs: allow more metadata chunk preallocation On an FS where all of the space has not been allocated into chunks yet, the enospc can return enospc just because the existing metadata chunks are full. We get around this by allowing more metadata chunks to be allocated up to a certain limit, and finding the right limit is a little fuzzy. The problem is the reservations for delalloc would preallocate way too much of the FS as metadata. We need to start saying no and just force some IO to happen. But we also need to let a reasonable amount of the FS become metadata. This bumps the hard limit up, later releases will have a better system. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2a4cdce..8d1fd6d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2976,10 +2976,10 @@ static int maybe_allocate_chunk(struct btrfs_root *root, free_space = btrfs_super_total_bytes(disk_super); /* - * we allow the metadata to grow to a max of either 5gb or 5% of the + * we allow the metadata to grow to a max of either 10gb or 5% of the * space in the volume. */ - min_metadata = min((u64)5 * 1024 * 1024 * 1024, + min_metadata = min((u64)10 * 1024 * 1024 * 1024, div64_u64(free_space * 5, 100)); if (info->total_bytes >= min_metadata) { spin_unlock(&info->lock); -- cgit v1.1 From a6dbd429d8dd3382bbd9594b8d2ec74843a260d9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 Nov 2009 15:53:34 -0500 Subject: Btrfs: fix panic when trying to destroy a newly allocated There is a problem where iget5_locked will look for an inode, not find it, and then subsequently try to allocate it. Another CPU will have raced in and allocated the inode instead, so when iget5_locked gets the inode spin lock again and does a search, it finds the new inode. So it goes ahead and calls destroy_inode on the inode it just allocated. The problem is we don't set BTRFS_I(inode)->root until the new inode is completely initialized. This patch makes us set root to NULL when alloc'ing a new inode, so when we get to btrfs_destroy_inode and we see that root is NULL we can just free up the memory and continue on. This fixes the panic http://www.kerneloops.org/submitresult.php?number=812690 Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3d7d46..ee92801 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5180,6 +5180,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->logged_trans = 0; ei->outstanding_extents = 0; ei->reserved_extents = 0; + ei->root = NULL; spin_lock_init(&ei->accounting_lock); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); @@ -5196,6 +5197,14 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(inode->i_data.nrpages); /* + * This can happen where we create an inode, but somebody else also + * created the same inode and we need to destroy the one we already + * created. + */ + if (!root) + goto free; + + /* * Make sure we're properly removed from the ordered operation * lists. */ @@ -5230,6 +5239,7 @@ void btrfs_destroy_inode(struct inode *inode) } inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); +free: kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -- cgit v1.1 From ff5e4b51a397568c6a2901903c35a4bfaaf752a4 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Thu, 12 Nov 2009 09:53:50 +0100 Subject: fs/jbd: Export log_start_commit to fix ext3 build. This fixes: ERROR: "log_start_commit" [fs/ext3/ext3.ko] undefined! Signed-off-by: Stefan Schmidt --- fs/jbd/journal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 49d5cd6..4160afa 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -73,6 +73,7 @@ EXPORT_SYMBOL(journal_errno); EXPORT_SYMBOL(journal_ack_err); EXPORT_SYMBOL(journal_clear_err); EXPORT_SYMBOL(log_wait_commit); +EXPORT_SYMBOL(log_start_commit); EXPORT_SYMBOL(journal_start_commit); EXPORT_SYMBOL(journal_force_commit_nested); EXPORT_SYMBOL(journal_wipe); -- cgit v1.1 From 29f12ca32122db98481150be09d35bd72b68045e Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 11 Nov 2009 14:26:32 -0800 Subject: pidns: fix a leak in /proc dentries and inodes with pid namespaces. Daniel Lezcano reported a leak in 'struct pid' and 'struct pid_namespace' that is discussed in: http://lkml.org/lkml/2009/10/2/159. To summarize the thread, when container-init is terminated, it sets the PF_EXITING flag, zaps other processes in the container and waits to reap them. As a part of reaping, the container-init should flush any /proc dentries associated with the processes. But because the container-init is itself exiting and the following PF_EXITING check, the dentries are not flushed, resulting in leak in /proc inodes and dentries. This fix reverts the commit 7766755a2f249e7e0 ("Fix /proc dcache deadlock in do_exit") which introduced the check for PF_EXITING. At the time of the commit, shrink_dcache_parent() flushed dentries from other filesystems also and could have caused a deadlock which the commit fixed. But as pointed out by Eric Biederman, after commit 0feae5c47aabdde59, shrink_dcache_parent() no longer affects other filesystems. So reverting the commit is now safe. As pointed out by Jan Kara, the leak is not as critical since the unclaimed space will be reclaimed under memory pressure or by: echo 3 > /proc/sys/vm/drop_caches But since this check is no longer required, its best to remove it. Signed-off-by: Sukadev Bhattiprolu Reported-by: Daniel Lezcano Acked-by: Eric W. Biederman Acked-by: Jan Kara Cc: Andrea Arcangeli Cc: Serge Hallyn Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 837469a..af643b5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2597,8 +2597,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { - if (!(current->flags & PF_EXITING)) - shrink_dcache_parent(dentry); + shrink_dcache_parent(dentry); d_drop(dentry); dput(dentry); } -- cgit v1.1 From 7779d7bed950a7fb1af4f540c2f82a6b81b65901 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Nov 2009 14:26:34 -0800 Subject: fs: add missing compat_ptr handling for FS_IOC_RESVSP ioctl For FS_IOC_RESVSP and FS_IOC_RESVSP64 compat_sys_ioctl() uses its arg argument as a pointer to userspace. However it is missing a a call to compat_ptr() which will do a proper pointer conversion. This was introduced with 3e63cbb1 "fs: Add new pre-allocation ioctls to vfs for compatibility with legacy xfs ioctls". Signed-off-by: Heiko Carstens Cc: Ankit Jain Acked-by: Christoph Hellwig Cc: Al Viro Acked-by: Arnd Bergmann Acked-by: David S. Miller Cc: [2.6.31.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f91fd51..d84e705 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1800,7 +1800,7 @@ struct space_resv_32 { /* just account for different alignment */ static int compat_ioctl_preallocate(struct file *file, unsigned long arg) { - struct space_resv_32 __user *p32 = (void __user *)arg; + struct space_resv_32 __user *p32 = compat_ptr(arg); struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || @@ -2802,7 +2802,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, #else case FS_IOC_RESVSP: case FS_IOC_RESVSP64: - error = ioctl_preallocate(filp, (void __user *)arg); + error = ioctl_preallocate(filp, compat_ptr(arg)); goto out_fput; #endif -- cgit v1.1 From fc63cf237078c86214abcb2ee9926d8ad289da9b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 11 Nov 2009 14:26:48 -0800 Subject: exec: setup_arg_pages() fails to return errors In setup_arg_pages we work hard to assign a value to ret, but on exit we always return 0. Also remove a now duplicated exit path and branch to out_unlock instead. Signed-off-by: Anton Blanchard Acked-by: Serge Hallyn Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index d49be6b..ba112bd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -624,10 +624,8 @@ int setup_arg_pages(struct linux_binprm *bprm, /* Move stack pages down in memory. */ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); - if (ret) { - up_write(&mm->mmap_sem); - return ret; - } + if (ret) + goto out_unlock; } #ifdef CONFIG_STACK_GROWSUP @@ -641,7 +639,7 @@ int setup_arg_pages(struct linux_binprm *bprm, out_unlock: up_write(&mm->mmap_sem); - return 0; + return ret; } EXPORT_SYMBOL(setup_arg_pages); -- cgit v1.1 From e04b5ef8b49db87d01a9b3a47fe41a918a0c0ff5 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 11 Nov 2009 14:26:55 -0800 Subject: __generic_block_fiemap(): fix for files bigger than 4GB Because of an integer overflow on start_blk, various kind of wrong results would be returned by the generic_block_fiemap() handler, such as no extents when there is a 4GB+ hole at the beginning of the file, or wrong fe_logical when an extent starts after the first 4GB. Signed-off-by: Mike Hommey Cc: Alexander Viro Cc: Steven Whitehouse Cc: Theodore Ts'o Cc: Eric Sandeen Cc: Josef Bacik Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 7b17a14..6c75110 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -254,7 +254,7 @@ int __generic_block_fiemap(struct inode *inode, u64 len, get_block_t *get_block) { struct buffer_head tmp; - unsigned int start_blk; + unsigned long long start_blk; long long length = 0, map_len = 0; u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; -- cgit v1.1 From c1ea985c710f41e97f1c72c29bbf367375370f0b Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 12 Nov 2009 00:13:32 +0900 Subject: nilfs2: fix lock order reversal in chcp operation Will fix the following lock order reversal lockdep detected: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.32-rc6 #7 ------------------------------------------------------- chcp/30157 is trying to acquire lock: (&nilfs->ns_mount_mutex){+.+.+.}, at: [] nilfs_cpfile_change_cpmode+0x46/0x752 [nilfs2] but task is already holding lock: (&nilfs->ns_segctor_sem){++++.+}, at: [] nilfs_transaction_begin+0xba/0x110 [nilfs2] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&nilfs->ns_segctor_sem){++++.+}: [] __lock_acquire+0x109c/0x139d [] lock_acquire+0x89/0xa0 [] down_read+0x31/0x45 [] nilfs_attach_checkpoint+0x8f/0x16b [nilfs2] [] nilfs_get_sb+0x3e7/0x653 [nilfs2] [] vfs_kern_mount+0x8b/0x124 [] do_kern_mount+0x37/0xc3 [] do_mount+0x64d/0x69d [] sys_mount+0x66/0x95 [] sysenter_do_call+0x12/0x32 -> #1 (&type->s_umount_key#31/1){+.+.+.}: [] __lock_acquire+0x109c/0x139d [] lock_acquire+0x89/0xa0 [] down_write_nested+0x34/0x52 [] sget+0x22e/0x389 [] nilfs_get_sb+0x187/0x653 [nilfs2] [] vfs_kern_mount+0x8b/0x124 [] do_kern_mount+0x37/0xc3 [] do_mount+0x64d/0x69d [] sys_mount+0x66/0x95 [] sysenter_do_call+0x12/0x32 -> #0 (&nilfs->ns_mount_mutex){+.+.+.}: [] __lock_acquire+0xe27/0x139d [] lock_acquire+0x89/0xa0 [] mutex_lock_nested+0x41/0x23e [] nilfs_cpfile_change_cpmode+0x46/0x752 [nilfs2] [] nilfs_ioctl+0x11a/0x7da [nilfs2] [] vfs_ioctl+0x27/0x6e [] do_vfs_ioctl+0x491/0x4db [] sys_ioctl+0x45/0x5f [] sysenter_do_call+0x12/0x32 Signed-off-by: Ryusuke Konishi --- fs/nilfs2/cpfile.c | 2 -- fs/nilfs2/ioctl.c | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 1c6cfb5..3f5d5d0 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -871,7 +871,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) * exclusive with a new mount job. Though it doesn't cover * umount, it's enough for the purpose. */ - mutex_lock(&nilfs->ns_mount_mutex); if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { /* Current implementation does not have to protect plain read-only mounts since they are exclusive @@ -880,7 +879,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) ret = -EBUSY; } else ret = nilfs_cpfile_clear_snapshot(cpfile, cno); - mutex_unlock(&nilfs->ns_mount_mutex); return ret; case NILFS_SNAPSHOT: return nilfs_cpfile_set_snapshot(cpfile, cno); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index d24057d..f6af760 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -99,7 +99,8 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { - struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct inode *cpfile = nilfs->ns_cpfile; struct nilfs_transaction_info ti; struct nilfs_cpmode cpmode; int ret; @@ -109,14 +110,17 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, if (copy_from_user(&cpmode, argp, sizeof(cpmode))) return -EFAULT; + mutex_lock(&nilfs->ns_mount_mutex); nilfs_transaction_begin(inode->i_sb, &ti, 0); ret = nilfs_cpfile_change_cpmode( cpfile, cpmode.cm_cno, cpmode.cm_mode); if (unlikely(ret < 0)) { nilfs_transaction_abort(inode->i_sb); + mutex_unlock(&nilfs->ns_mount_mutex); return ret; } nilfs_transaction_commit(inode->i_sb); /* never fails */ + mutex_unlock(&nilfs->ns_mount_mutex); return ret; } -- cgit v1.1 From 479c2553af9a176a0613894b9f1ec73425fd56a3 Mon Sep 17 00:00:00 2001 From: Petr Vandrovec Date: Sat, 14 Nov 2009 10:47:07 +0100 Subject: Fix memory corruption caused by nfsd readdir+ Commit 8177e6d6dfb9cd03d9bdeb647c32161f8f58f686 ("nfsd: clean up readdirplus encoding") introduced single character typo in nfs3 readdir+ implementation. Unfortunately that typo has quite bad side effects: random memory corruption, followed (on my box) with immediate spontaneous box reboot. Using 'p1' instead of 'p' fixes my Linux box rebooting whenever VMware ESXi box tries to list contents of my home directory. Signed-off-by: Petr Vandrovec Cc: "J. Bruce Fields" Cc: Neil Brown Signed-off-by: Linus Torvalds --- fs/nfsd/nfs3xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index edf926e..d0a2ce1 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -958,7 +958,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p1 = encode_entry_baggage(cd, p1, name, namlen, ino); if (plus) - p = encode_entryplus_baggage(cd, p1, name, namlen); + p1 = encode_entryplus_baggage(cd, p1, name, namlen); /* determine entry word length and lengths to go in pages */ num_entry_words = p1 - tmp; -- cgit v1.1 From 18dafac1a4c6c88867a50f9a82492976f20383d6 Mon Sep 17 00:00:00 2001 From: Jiro SEKIBA Date: Sun, 15 Nov 2009 13:49:45 +0900 Subject: nilfs2: deleted inconsistent comment in nilfs_load_inode_block() The comment says, "Caller of this function MUST lock s_inode_lock", however just above the comment, it locks s_inode_lock in the function. Signed-off-by: Jiro SEKIBA Signed-off-by: Ryusuke Konishi --- fs/nilfs2/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 5040220..2a0a5a3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -664,7 +664,6 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, int err; spin_lock(&sbi->s_inode_lock); - /* Caller of this function MUST lock s_inode_lock */ if (ii->i_bh == NULL) { spin_unlock(&sbi->s_inode_lock); err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, -- cgit v1.1 From f534dc994397560343be4a3223b9bbaa8e739e1f Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Mon, 16 Nov 2009 12:03:16 +0530 Subject: cifs: clear server inode number flag while autodisabling Fix the commit ec06aedd44 that intended to turn off querying for server inode numbers when server doesn't consistently support inode numbers. Presumably the commit didn't actually clear the CIFS_MOUNT_SERVER_INUM flag, perhaps a typo. Signed-off-by: Suresh Jayaraman Acked-by: Jeff Layton Cc: Stable Signed-off-by: Steve French --- fs/cifs/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1e25efc..d27d4ec 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -720,7 +720,7 @@ void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - cifs_sb->mnt_cifs_flags &= CIFS_MOUNT_SERVER_INUM; + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; cERROR(1, ("Autodisabling the use of server inode numbers on " "%s. This server doesn't seem to support them " "properly. Hardlinks will not be recognized on this " -- cgit v1.1 From 8ec6dba2581754e375be66f7bedd708d856d8b30 Mon Sep 17 00:00:00 2001 From: Jan Rekorajski Date: Mon, 16 Nov 2009 11:57:02 +0000 Subject: XFS bug in log recover with quota (bugzilla id 855) Hi, I was hit by a bug in linux 2.6.31 when XFS is not able to recover the log after a crash if fs was mounted with quotas. Gory details in XFS bugzilla: http://oss.sgi.com/bugzilla/show_bug.cgi?id=855. It looks like wrong struct is used in buffer length check, and the following patch should fix the problem. xfs_dqblk_t has a size of 104+32 bytes, while xfs_disk_dquot_t is 104 bytes long, and this is exactly what I see in system logs - "XFS: dquot too small (104) in xlog_recover_do_dquot_trans." Signed-off-by: Jan Rekorajski Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_log_recover.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1099395..fb17f82 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1980,7 +1980,7 @@ xlog_recover_do_reg_buffer( "XFS: NULL dquot in %s.", __func__); goto next; } - if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) { + if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { cmn_err(CE_ALERT, "XFS: dquot too small (%d) in %s.", item->ri_buf[i].i_len, __func__); @@ -2635,7 +2635,7 @@ xlog_recover_do_dquot_trans( "XFS: NULL dquot in %s.", __func__); return XFS_ERROR(EIO); } - if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) { + if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { cmn_err(CE_ALERT, "XFS: dquot too small (%d) in %s.", item->ri_buf[1].i_len, __func__); -- cgit v1.1 From 6c06f072c2d797ddbb2270363de97c53ebbe0385 Mon Sep 17 00:00:00 2001 From: "Nathaniel W. Turner" Date: Mon, 16 Nov 2009 19:51:48 +0000 Subject: xfs: copy li_lsn before dropping AIL lock Access to log items on the AIL is generally protected by m_ail_lock; this is particularly needed when we're getting or setting the 64-bit li_lsn on a 32-bit platform. This patch fixes a couple places where we were accessing the log item after dropping the AIL lock on 32-bit machines. This can result in a partially-zeroed log->l_tail_lsn if xfs_trans_ail_delete is racing with xfs_trans_ail_update, and in at least some cases, this can leave the l_tail_lsn with a zero cycle number, which means xlog_space_left will think the log is full (unless CONFIG_XFS_DEBUG is set, in which case we'll trip an ASSERT), leading to processes stuck forever in xlog_grant_log_space. Thanks to Adrian VanderSpek for first spotting the race potential and to Dave Chinner for debug assistance. Signed-off-by: Nathaniel W. Turner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_trans_ail.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index f31271c..2ffc570 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -467,6 +467,7 @@ xfs_trans_ail_update( { xfs_log_item_t *dlip = NULL; xfs_log_item_t *mlip; /* ptr to minimum lip */ + xfs_lsn_t tail_lsn; mlip = xfs_ail_min(ailp); @@ -483,8 +484,16 @@ xfs_trans_ail_update( if (mlip == dlip) { mlip = xfs_ail_min(ailp); + /* + * It is not safe to access mlip after the AIL lock is + * dropped, so we must get a copy of li_lsn before we do + * so. This is especially important on 32-bit platforms + * where accessing and updating 64-bit values like li_lsn + * is not atomic. + */ + tail_lsn = mlip->li_lsn; spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); } else { spin_unlock(&ailp->xa_lock); } @@ -514,6 +523,7 @@ xfs_trans_ail_delete( { xfs_log_item_t *dlip; xfs_log_item_t *mlip; + xfs_lsn_t tail_lsn; if (lip->li_flags & XFS_LI_IN_AIL) { mlip = xfs_ail_min(ailp); @@ -527,9 +537,16 @@ xfs_trans_ail_delete( if (mlip == dlip) { mlip = xfs_ail_min(ailp); + /* + * It is not safe to access mlip after the AIL lock + * is dropped, so we must get a copy of li_lsn + * before we do so. This is especially important + * on 32-bit platforms where accessing and updating + * 64-bit values like li_lsn is not atomic. + */ + tail_lsn = mlip ? mlip->li_lsn : 0; spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, - (mlip ? mlip->li_lsn : 0)); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); } else { spin_unlock(&ailp->xa_lock); } -- cgit v1.1 From 9ebd4eba761b624a6a6c9189335adeddcb1fa0e0 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Tue, 17 Nov 2009 14:06:23 -0800 Subject: procfs: fix /proc//stat stack pointer for kernel threads Fix a small issue for the stack pointer in /proc//stat. In case of a kernel thread the value of the printed stack pointer should be 0. Signed-off-by: Stefani Seibold Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 07f77a7..822c2d5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -571,7 +571,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, - (permitted) ? task->stack_start : 0, + (permitted && mm) ? task->stack_start : 0, esp, eip, /* The signal information here is obsolete. -- cgit v1.1 From 978b4053aefd422713f289f2a315ce2acba62018 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2009 14:06:24 -0800 Subject: fcntl: rename F_OWNER_GID to F_OWNER_PGRP This is for consistency with various ioctl() operations that include the suffix "PGRP" in their names, and also for consistency with PRIO_PGRP, used with setpriority() and getpriority(). Also, using PGRP instead of GID avoids confusion with the common abbreviation of "group ID". I'm fine with anything that makes it more consistent, and if PGRP is what is the predominant abbreviation then I see no need to further confuse matters by adding a third one. Signed-off-by: Peter Zijlstra Acked-by: Michael Kerrisk Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index fc089f2..2cf93ec 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -284,7 +284,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg) type = PIDTYPE_PID; break; - case F_OWNER_GID: + case F_OWNER_PGRP: type = PIDTYPE_PGID; break; @@ -321,7 +321,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg) break; case PIDTYPE_PGID: - owner.type = F_OWNER_GID; + owner.type = F_OWNER_PGRP; break; default: -- cgit v1.1