From 9655d2982b53fdb38a9e0f2f11315b99b92d66e2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 15:22:30 -0400 Subject: Btrfs: use a cached state for extent state operations during delalloc This changes the btrfs code to find delalloc ranges in the extent state tree to use the new state caching code from set/test bit. It reduces one of the biggest causes of rbtree searches in the writeback path. test_range_bit is also modified to take the cached state as a starting point while searching. Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/ordered-data.c') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d6f0806..7f751e4 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -262,7 +262,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, ret = test_range_bit(io_tree, entry->file_offset, entry->file_offset + entry->len - 1, - EXTENT_ORDERED, 0); + EXTENT_ORDERED, 0, NULL); if (ret == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); out: @@ -522,7 +522,7 @@ again: end--; } if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { + EXTENT_ORDERED | EXTENT_DELALLOC, 0, NULL)) { schedule_timeout(1); goto again; } @@ -613,7 +613,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, */ if (test_range_bit(io_tree, disk_i_size, ordered->file_offset + ordered->len - 1, - EXTENT_DELALLOC, 0)) { + EXTENT_DELALLOC, 0, NULL)) { goto out; } /* @@ -664,7 +664,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, */ if (i_size_test > entry_end(ordered) && !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, - EXTENT_DELALLOC, 0)) { + EXTENT_DELALLOC, 0, NULL)) { new_i_size = min_t(u64, i_size_test, i_size_read(inode)); } BTRFS_I(inode)->disk_i_size = new_i_size; -- cgit v1.1 From 8b62b72b26bcd72082c4a69d179dd906bcc22200 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 16:53:46 -0400 Subject: Btrfs: Use PagePrivate2 to track pages in the data=ordered code. Btrfs writes go through delalloc to the data=ordered code. This makes sure that all of the data is on disk before the metadata that references it. The tracking means that we have to make sure each page in an extent is fully written before we add that extent into the on-disk btree. This was done in the past by setting the EXTENT_ORDERED bit for the range of an extent when it was added to the data=ordered code, and then clearing the EXTENT_ORDERED bit in the extent state tree as each page finished IO. One of the reasons we had to do this was because sometimes pages are magically dirtied without page_mkwrite being called. The EXTENT_ORDERED bit is checked at writepage time, and if it isn't there, our page become dirty without going through the proper path. These bit operations make for a number of rbtree searches for each page, and can cause considerable lock contention. This commit switches from the EXTENT_ORDERED bit to use PagePrivate2. As pages go into the ordered code, PagePrivate2 is set on each one. This is a cheap operation because we already have all the pages locked and ready to go. As IO finishes, the PagePrivate2 bit is cleared and the ordered accoutning is updated for each page. At writepage time, if the PagePrivate2 bit is missing, we go into the writepage fixup code to handle improperly dirtied pages. Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/ordered-data.c') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7f751e4..4a9c8c4 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * * len is the length of the extent * - * This also sets the EXTENT_ORDERED bit on the range in the inode. - * * The tree is given a single reference on the ordered extent that was * inserted. */ @@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->start = start; entry->len = len; entry->disk_len = disk_len; + entry->bytes_left = len; entry->inode = inode; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, &entry->rb_node); BUG_ON(node); - set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, - entry_end(entry) - 1, GFP_NOFS); - spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_add_tail(&entry->root_extent_list, &BTRFS_I(inode)->root->fs_info->ordered_extents); @@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; int ret; tree = &BTRFS_I(inode)->ordered_tree; mutex_lock(&tree->mutex); - clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, - GFP_NOFS); node = tree_search(tree, file_offset); if (!node) { ret = 1; @@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, goto out; } - ret = test_range_bit(io_tree, entry->file_offset, - entry->file_offset + entry->len - 1, - EXTENT_ORDERED, 0, NULL); - if (ret == 0) + if (io_size > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)io_size); + } + entry->bytes_left -= io_size; + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; out: mutex_unlock(&tree->mutex); return ret == 0; @@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) u64 orig_end; u64 wait_end; struct btrfs_ordered_extent *ordered; + int found; if (start + len < start) { orig_end = INT_LIMIT(loff_t); @@ -502,6 +501,7 @@ again: orig_end >> PAGE_CACHE_SHIFT); end = orig_end; + found = 0; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) @@ -514,6 +514,7 @@ again: btrfs_put_ordered_extent(ordered); break; } + found++; btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; btrfs_put_ordered_extent(ordered); @@ -521,8 +522,8 @@ again: break; end--; } - if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_ORDERED | EXTENT_DELALLOC, 0, NULL)) { + if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_DELALLOC, 0, NULL)) { schedule_timeout(1); goto again; } -- cgit v1.1