diff options
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 1044 |
1 files changed, 837 insertions, 207 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7055d11..9a837a8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,6 +17,7 @@ #include "compat.h" #include "ctree.h" #include "btrfs_inode.h" +#include "volumes.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -254,14 +255,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, * * This should be called with the tree lock held. */ -static int merge_state(struct extent_io_tree *tree, - struct extent_state *state) +static void merge_state(struct extent_io_tree *tree, + struct extent_state *state) { struct extent_state *other; struct rb_node *other_node; if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) - return 0; + return; other_node = rb_prev(&state->rb_node); if (other_node) { @@ -281,26 +282,19 @@ static int merge_state(struct extent_io_tree *tree, if (other->start == state->end + 1 && other->state == state->state) { merge_cb(tree, state, other); - other->start = state->start; - state->tree = NULL; - rb_erase(&state->rb_node, &tree->state); - free_extent_state(state); - state = NULL; + state->end = other->end; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); } } - - return 0; } -static int set_state_cb(struct extent_io_tree *tree, +static void set_state_cb(struct extent_io_tree *tree, struct extent_state *state, int *bits) { - if (tree->ops && tree->ops->set_bit_hook) { - return tree->ops->set_bit_hook(tree->mapping->host, - state, bits); - } - - return 0; + if (tree->ops && tree->ops->set_bit_hook) + tree->ops->set_bit_hook(tree->mapping->host, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, @@ -310,6 +304,9 @@ static void clear_state_cb(struct extent_io_tree *tree, tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, int *bits); + /* * 
insert an extent_state struct into the tree. 'bits' are set on the * struct before it is inserted. @@ -325,8 +322,6 @@ static int insert_state(struct extent_io_tree *tree, int *bits) { struct rb_node *node; - int bits_to_set = *bits & ~EXTENT_CTLBITS; - int ret; if (end < start) { printk(KERN_ERR "btrfs end < start %llu %llu\n", @@ -336,13 +331,9 @@ static int insert_state(struct extent_io_tree *tree, } state->start = start; state->end = end; - ret = set_state_cb(tree, state, bits); - if (ret) - return ret; - if (bits_to_set & EXTENT_DIRTY) - tree->dirty_bytes += end - start + 1; - state->state |= bits_to_set; + set_state_bits(tree, state, bits); + node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -351,7 +342,6 @@ static int insert_state(struct extent_io_tree *tree, "%llu %llu\n", (unsigned long long)found->start, (unsigned long long)found->end, (unsigned long long)start, (unsigned long long)end); - free_extent_state(state); return -EEXIST; } state->tree = tree; @@ -359,13 +349,11 @@ static int insert_state(struct extent_io_tree *tree, return 0; } -static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, +static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, u64 split) { if (tree->ops && tree->ops->split_extent_hook) - return tree->ops->split_extent_hook(tree->mapping->host, - orig, split); - return 0; + tree->ops->split_extent_hook(tree->mapping->host, orig, split); } /* @@ -500,7 +488,8 @@ again: cached_state = NULL; } - if (cached && cached->tree && cached->start == start) { + if (cached && cached->tree && cached->start <= start && + cached->end > start) { if (clear) atomic_dec(&cached->refs); state = cached; @@ -660,34 +649,25 @@ again: if (start > end) break; - if (need_resched()) { - spin_unlock(&tree->lock); - cond_resched(); - spin_lock(&tree->lock); - } + cond_resched_lock(&tree->lock); } out: spin_unlock(&tree->lock); return 0; } -static int set_state_bits(struct 
extent_io_tree *tree, +static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, int *bits) { - int ret; int bits_to_set = *bits & ~EXTENT_CTLBITS; - ret = set_state_cb(tree, state, bits); - if (ret) - return ret; + set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } state->state |= bits_to_set; - - return 0; } static void cache_state(struct extent_state *state, @@ -742,7 +722,8 @@ again: spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; - if (state->start == start && state->tree) { + if (state->start <= start && state->end > start && + state->tree) { node = &state->rb_node; goto hit_next; } @@ -779,17 +760,15 @@ hit_next: goto out; } - err = set_state_bits(tree, state, &bits); - if (err) - goto out; + set_state_bits(tree, state, &bits); - next_node = rb_next(node); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; + next_node = rb_next(&state->rb_node); if (next_node && start < end && prealloc && !need_resched()) { state = rb_entry(next_node, struct extent_state, rb_node); @@ -830,9 +809,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - err = set_state_bits(tree, state, &bits); - if (err) - goto out; + set_state_bits(tree, state, &bits); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -862,7 +839,6 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. 
*/ - atomic_inc(&prealloc->refs); err = insert_state(tree, prealloc, start, this_end, &bits); BUG_ON(err == -EEXIST); @@ -872,7 +848,6 @@ hit_next: goto out; } cache_state(prealloc, cached_state); - free_extent_state(prealloc); prealloc = NULL; start = this_end + 1; goto search_again; @@ -895,12 +870,204 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - err = set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits); + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +/** + * convert_extent_bit - convert all bits in a given range from one bit to another + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @mask: the allocation mask + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. 
+ */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + u64 last_start; + u64 last_end; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = insert_state(tree, prealloc, start, end, &bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + struct rb_node *next_node; + + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + + start = last_end + 1; + next_node = rb_next(&state->rb_node); + if (next_node && start < end && prealloc && !need_resched()) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. 
+ */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. + */ + err = insert_state(tree, prealloc, start, this_end, + &bits); + BUG_ON(err == -EEXIST); if (err) { + free_extent_state(prealloc); prealloc = NULL; goto out; } - cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, &bits); + clear_state_bit(tree, prealloc, &clear_bits, 0); + merge_state(tree, prealloc); prealloc = NULL; goto out; @@ -949,7 +1116,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + 
EXTENT_DELALLOC | EXTENT_UPTODATE, 0, NULL, cached_state, mask); } @@ -1042,19 +1209,33 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) mask); } -/* - * helper function to set both pages and extents in the tree writeback - */ -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; struct page *page; while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); - set_page_writeback(page); + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + account_page_redirty(page); + __set_page_dirty_nobuffers(page); page_cache_release(page); index++; } @@ -1062,43 +1243,22 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) } /* - * find the first offset in the io tree with 'bits' set. zero is - * returned if we find something, and *start_ret and *end_ret are - * set to reflect the state struct that was found. 
- * - * If nothing was found, 1 is returned, < 0 on error + * helper function to set both pages and extents in the tree writeback */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) +static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { - struct rb_node *node; - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) - goto out; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) { - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - break; - } - node = rb_next(node); - if (!node) - break; + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + set_page_writeback(page); + page_cache_release(page); + index++; } -out: - spin_unlock(&tree->lock); - return ret; + return 0; } /* find the first state struct with 'bits' set after 'start', and @@ -1133,6 +1293,30 @@ out: } /* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, bits); + if (state) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + } + spin_unlock(&tree->lock); + return ret; +} + +/* * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. 
start and end are used to return the range, * @@ -1339,6 +1523,7 @@ again: * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); + cached_state = NULL; if (!loops) { unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); max_bytes = PAGE_CACHE_SIZE - offset; @@ -1564,7 +1749,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bitset = 0; spin_lock(&tree->lock); - if (cached && cached->tree && cached->start == start) + if (cached && cached->tree && cached->start <= start && + cached->end > start) node = &cached->rb_node; else node = tree_search(tree, start); @@ -1644,6 +1830,368 @@ static int check_page_writeback(struct extent_io_tree *tree, return 0; } +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. 
+ */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int this_mirror; + int failed_mirror; + int in_validation; +}; + +static int free_io_failure(struct inode *inode, struct io_failure_record *rec, + int did_repair) +{ + int ret; + int err = 0; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + + set_state_private(failure_tree, rec->start, 0); + ret = clear_extent_bits(failure_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret) + err = ret; + + if (did_repair) { + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; + } + + kfree(rec); + return err; +} + +static void repair_io_failure_callback(struct bio *bio, int err) +{ + complete(bio->bi_private); +} + +/* + * this bypasses the standard btrfs submit functions deliberately, as + * the standard behavior is to write all copies in a raid setup. here we only + * want to write the one bad copy. so we do the mapping for ourselves and issue + * submit_bio directly. + * to avoid any synchronization issues, wait for the data after writing, which + * actually prevents the read that triggered the error from finishing. + * currently, there can be no more than two copies of every data bit. thus, + * exactly one rewrite is required. 
+ */ +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num) +{ + struct bio *bio; + struct btrfs_device *dev; + DECLARE_COMPLETION_ONSTACK(compl); + u64 map_length = 0; + u64 sector; + struct btrfs_bio *bbio = NULL; + int ret; + + BUG_ON(!mirror_num); + + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_private = &compl; + bio->bi_end_io = repair_io_failure_callback; + bio->bi_size = 0; + map_length = length; + + ret = btrfs_map_block(map_tree, WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); + sector = bbio->stripes[mirror_num-1].physical >> 9; + bio->bi_sector = sector; + dev = bbio->stripes[mirror_num-1].dev; + kfree(bbio); + if (!dev || !dev->bdev || !dev->writeable) { + bio_put(bio); + return -EIO; + } + bio->bi_bdev = dev->bdev; + bio_add_page(bio, page, length, start-page_offset(page)); + submit_bio(WRITE_SYNC, bio); + wait_for_completion(&compl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + /* try to remap that extent elsewhere? 
*/ + bio_put(bio); + return -EIO; + } + + printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " + "sector %llu)\n", page->mapping->host->i_ino, start, + dev->name, sector); + + bio_put(bio); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int clean_io_failure(u64 start, struct page *page) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failrec; + struct btrfs_mapping_tree *map_tree; + struct extent_state *state; + int num_copies; + int did_repair = 0; + int ret; + struct inode *inode = page->mapping->host; + + private = 0; + ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY, 0); + if (!ret) + return 0; + + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, + &private_failure); + if (ret) + return 0; + + failrec = (struct io_failure_record *)(unsigned long) private_failure; + BUG_ON(!failrec->this_mirror); + + if (failrec->in_validation) { + /* there was no real error, just free the record */ + pr_debug("clean_io_failure: freeing dummy error at %llu\n", + failrec->start); + did_repair = 1; + goto out; + } + + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + + if (state && state->start == failrec->start) { + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + num_copies = btrfs_num_copies(map_tree, failrec->logical, + failrec->len); + if (num_copies > 1) { + ret = repair_io_failure(map_tree, start, failrec->len, + failrec->logical, page, + failrec->failed_mirror); + did_repair = !ret; + } + } + +out: + if (!ret) + ret = free_io_failure(inode, failrec, did_repair); + + return ret; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook). 
if other copies exist, read those and write back + * good data to the failed position. does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, struct page *page, + u64 start, u64 end, int failed_mirror, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int read_mode; + u64 logical; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return -EIO; + } + + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); + } + pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " + "len=%llu\n", logical, start, failrec->len); + failrec->logical = logical; + free_extent_map(em); + + /* set the bits in the private failure tree */ + ret = 
set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret >= 0) + ret = set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + /* set the bits in the inode's tree */ + if (ret >= 0) + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, + GFP_NOFS); + if (ret < 0) { + kfree(failrec); + return ret; + } + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + pr_debug("bio_readpage_error: (found) logical=%llu, " + "start=%llu, len=%llu, validation=%d\n", + failrec->logical, failrec->start, failrec->len, + failrec->in_validation); + /* + * when data can be on disk more than twice, add to failrec here + * (e.g. with a list for failed_mirror) to make + * clean_io_failure() clean all those errors at once. + */ + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. + */ + pr_debug("bio_readpage_error: cannot repair, num_copies == 1. " + "state=%p, num_copies=%d, next_mirror %d, " + "failed_mirror %d\n", state, num_copies, + failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + if (!state) { + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&tree->lock); + } + + /* + * there are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + */ + if (failed_bio->bi_vcnt > 1) { + /* + * to fulfill b), we need to know the exact failing sectors, as + * we don't want to rewrite any more than the failed ones. 
thus, + * we need separate read requests for the failed bio + * + * if the following BUG_ON triggers, our validation request got + * merged. we need separate requests for our algorithm to work. + */ + BUG_ON(failrec->in_validation); + failrec->in_validation = 1; + failrec->this_mirror = failed_mirror; + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + } else { + /* + * we're ready to fulfill a) and b) alongside. get a good copy + * of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + if (failrec->in_validation) { + BUG_ON(failrec->this_mirror != failed_mirror); + failrec->in_validation = 0; + failrec->this_mirror = 0; + } + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + read_mode = READ_SYNC; + } + + if (!state || failrec->this_mirror > num_copies) { + pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " + "next_mirror %d, failed_mirror %d\n", state, + num_copies, failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + + pr_debug("bio_readpage_error: submitting new read[%#x] to " + "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, + failrec->this_mirror, num_copies, failrec->in_validation); + + tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, + failrec->bio_flags, 0); + return 0; +} + /* lots and lots of room for performance fixes in the end_bio funcs */ /* @@ -1742,6 +2290,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_state *cached = NULL; struct extent_state *state; + 
pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " + "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, + (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1772,12 +2323,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err) state); if (ret) uptodate = 0; + else + clean_io_failure(start, page); } - if (!uptodate && tree->ops && - tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(bio, page, - start, end, NULL); + if (!uptodate) { + int failed_mirror; + failed_mirror = (int)(unsigned long)bio->bi_bdev; + /* + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, not + * in the !uptodate case). In that case it returns 0 and + * we just go on with the next page in our bio. If it + * can't handle the error it will return -EIO and we + * remain responsible for that page. 
+ */ + ret = bio_readpage_error(bio, page, start, end, + failed_mirror, NULL); if (ret == 0) { +error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -1785,6 +2350,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) uncache_state(&cached); continue; } + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook( + bio, page, start, end, + failed_mirror, state); + if (ret == 0) + goto error_handled; + } } if (uptodate) { @@ -1856,6 +2428,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, mirror_num, bio_flags, start); else submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; bio_put(bio); @@ -1871,7 +2444,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_end_io_t end_io_func, int mirror_num, unsigned long prev_bio_flags, - unsigned long bio_flags) + unsigned long bio_flags, + bool force_bio_submit) { int ret = 0; struct bio *bio; @@ -1890,6 +2464,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, sector; if (prev_bio_flags != bio_flags || !contig || + force_bio_submit || (tree->ops && tree->ops->merge_bio_hook && tree->ops->merge_bio_hook(page, offset, page_size, bio, bio_flags)) || @@ -1946,7 +2521,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, - unsigned long *bio_flags) + unsigned long *bio_flags, + u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; @@ -2002,6 +2578,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } } while (cur <= end) { + bool force_bio_submit = false; + if (cur >= last_byte) { char *userpage; struct extent_state *cached = NULL; @@ -2048,6 +2626,49 @@ static int __extent_read_full_page(struct extent_io_tree *tree, block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 
block_start = EXTENT_MAP_HOLE; + + /* + * If we have a file range that points to a compressed extent + * and it's followed by a consecutive file range that points + * to the same compressed extent (possibly with a different + * offset and/or length, so it either points to the whole extent + * or only part of it), we must make sure we do not submit a + * single bio to populate the pages for the 2 ranges because + * this makes the compressed extent read zero out the pages + * belonging to the 2nd range. Imagine the following scenario: + * + * File layout + * [0 - 8K] [8K - 24K] + * | | + * | | + * points to extent X, points to extent X, + * offset 4K, length of 8K offset 0, length 16K + * + * [extent X, compressed length = 4K uncompressed length = 16K] + * + * If the bio to read the compressed extent covers both ranges, + * it will decompress extent X into the pages belonging to the + * first range and then it will stop, zeroing out the remaining + * pages that belong to the other range that points to extent X. + * So here we make sure we submit 2 bios, one for the first + * range and another one for the second range. Both will target + * the same physical extent from disk, but we can't currently + * make the compressed bio endio callback populate the pages + * for both ranges because each compressed bio is tightly + * coupled with a single extent map, and each range can have + * an extent map with a different offset value relative to the + * uncompressed data of our extent and different lengths. This + * is a corner case so we prioritize correctness over + * non-optimal behavior (submitting 2 bios for the same extent). 
+ */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && + prev_em_start && *prev_em_start != (u64)-1 && + *prev_em_start != em->orig_start) + force_bio_submit = true; + + if (prev_em_start) + *prev_em_start = em->orig_start; + free_extent_map(em); em = NULL; @@ -2102,7 +2723,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, - this_bio_flag); + this_bio_flag, + force_bio_submit); nr++; *bio_flags = this_bio_flag; } @@ -2121,16 +2743,16 @@ out: } int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent) + get_extent_t *get_extent, int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, - &bio_flags); + ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, + &bio_flags, NULL); if (bio) - ret = submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; } @@ -2181,6 +2803,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, int compressed; int write_flags; unsigned long nr_written = 0; + bool fill_delalloc = true; if (wbc->sync_mode == WB_SYNC_ALL) write_flags = WRITE_SYNC; @@ -2190,6 +2813,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, trace___extent_writepage(page, inode, wbc); WARN_ON(!PageLocked(page)); + + ClearPageError(page); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { @@ -2211,10 +2837,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); + if (!tree->ops || !tree->ops->fill_delalloc) + fill_delalloc = false; + delalloc_start = start; delalloc_end = 0; page_started = 0; - if (!epd->extent_locked) { + if (!epd->extent_locked && fill_delalloc) { u64 delalloc_to_write = 0; /* 
* make sure the wbc mapping index is at least updated @@ -2380,7 +3009,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, sector, iosize, pg_offset, bdev, &epd->bio, max_nr, end_bio_extent_writepage, - 0, 0, 0); + 0, 0, 0, false); if (ret) SetPageError(page); } @@ -2432,6 +3061,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -2442,11 +3072,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, end = wbc->range_end >> PAGE_CACHE_SHIFT; scanned = 1; } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, min(end - index, - (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { unsigned i; scanned = 1; @@ -2460,10 +3095,16 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && tree->ops->write_cache_pages_lock_hook) - tree->ops->write_cache_pages_lock_hook(page); - else - lock_page(page); + if (tree->ops && + tree->ops->write_cache_pages_lock_hook) { + tree->ops->write_cache_pages_lock_hook(page, + data, flush_fn); + } else { + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } + } if (unlikely(page->mapping != mapping)) { unlock_page(page); @@ -2541,7 +3182,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, struct writeback_control *wbc) { int ret; - struct address_space *mapping = page->mapping; struct extent_page_data epd = { .bio = NULL, .tree = tree, @@ -2549,18 +3189,9 @@ int extent_write_full_page(struct extent_io_tree 
*tree, struct page *page, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; - struct writeback_control wbc_writepages = { - .sync_mode = wbc->sync_mode, - .older_than_this = NULL, - .nr_to_write = 64, - .range_start = page_offset(page) + PAGE_CACHE_SIZE, - .range_end = (loff_t)-1, - }; ret = __extent_writepage(page, wbc, &epd); - extent_write_cache_pages(tree, mapping, &wbc_writepages, - __extent_writepage, &epd, flush_write_bio); flush_epd_write_bio(&epd); return ret; } @@ -2584,7 +3215,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, }; struct writeback_control wbc_writepages = { .sync_mode = mode, - .older_than_this = NULL, .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, @@ -2638,6 +3268,7 @@ int extent_readpages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned page_idx; unsigned long bio_flags = 0; + u64 prev_em_start = (u64)-1; for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_entry(pages->prev, struct page, lru); @@ -2647,7 +3278,8 @@ int extent_readpages(struct extent_io_tree *tree, if (!add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { __extent_read_full_page(tree, page, get_extent, - &bio, 0, &bio_flags); + &bio, 0, &bio_flags, + &prev_em_start); } page_cache_release(page); } @@ -2840,6 +3472,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); + len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + /* * lookup the last file extent. 
We're not using i_size here * because there might be preallocation past i_size @@ -2887,7 +3522,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent_skip_holes(inode, off, last_for_get_extent, + em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); if (!em) goto out; @@ -2976,7 +3611,7 @@ out: return ret; } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, +inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { struct page *p; @@ -3001,7 +3636,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, return p; } -static inline unsigned long num_extent_pages(u64 start, u64 len) +inline unsigned long num_extent_pages(u64 start, u64 len) { return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT); @@ -3022,8 +3657,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; - spin_lock_init(&eb->lock); - init_waitqueue_head(&eb->lock_wq); + rwlock_init(&eb->lock); + atomic_set(&eb->write_locks, 0); + atomic_set(&eb->read_locks, 0); + atomic_set(&eb->blocking_readers, 0); + atomic_set(&eb->blocking_writers, 0); + atomic_set(&eb->spinning_readers, 0); + atomic_set(&eb->spinning_writers, 0); + init_waitqueue_head(&eb->write_lock_wq); + init_waitqueue_head(&eb->read_lock_wq); #if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); @@ -3119,7 +3761,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, i = 0; } for (; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); + p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; @@ -3247,6 +3889,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, PAGECACHE_TAG_DIRTY); } 
spin_unlock_irq(&page->mapping->tree_lock); + ClearPageError(page); unlock_page(page); } return 0; @@ -3266,6 +3909,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, return was_dirty; } +static int __eb_straddles_pages(u64 start, u64 len) +{ + if (len < PAGE_CACHE_SIZE) + return 1; + if (start & (PAGE_CACHE_SIZE - 1)) + return 1; + if ((start + len) & (PAGE_CACHE_SIZE - 1)) + return 1; + return 0; +} + +static int eb_straddles_pages(struct extent_buffer *eb) +{ + return __eb_straddles_pages(eb->start, eb->len); +} + int clear_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb, struct extent_state **cached_state) @@ -3277,8 +3936,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); + if (eb_straddles_pages(eb)) { + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3296,8 +3957,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); + if (eb_straddles_pages(eb)) { + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + NULL, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || @@ -3320,9 +3983,12 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); - if (ret) - return 1; + if (__eb_straddles_pages(start, end - start + 1)) { + ret = test_range_bit(tree, start, end, + EXTENT_UPTODATE, 1, NULL); + if (ret) + return 1; + } while (start <= end) { index = start >> 
PAGE_CACHE_SHIFT; page = find_get_page(tree->mapping, index); @@ -3350,10 +4016,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 1; - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; + if (eb_straddles_pages(eb)) { + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, cached_state); + if (ret) + return ret; + } num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { @@ -3367,8 +4035,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, - u64 start, int wait, + struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num) { unsigned long i; @@ -3382,13 +4049,16 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, unsigned long num_pages; struct bio *bio = NULL; unsigned long bio_flags = 0; + u64 prev_em_start = (u64)-1; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; + if (eb_straddles_pages(eb)) { + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, NULL)) { + return 0; + } } if (start) { @@ -3402,7 +4072,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!wait) { + if (wait == WAIT_NONE) { if (!trylock_page(page)) goto unlock_exit; } else { @@ -3435,7 +4105,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, - mirror_num, &bio_flags); + mirror_num, &bio_flags, + &prev_em_start); if (err) ret = err; } else { @@ -3446,7 +4117,7 @@ int 
read_extent_buffer_pages(struct extent_io_tree *tree, if (bio) submit_one_bio(READ, bio, mirror_num, bio_flags); - if (ret || !wait) + if (ret || wait != WAIT_COMPLETE) return ret; for (i = start_i; i < num_pages; i++) { @@ -3492,9 +4163,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, page = extent_buffer_page(eb, i); cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(dst, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER1); dst += cur; len -= cur; @@ -3504,9 +4174,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **token, char **map, + unsigned long min_len, char **map, unsigned long *map_start, - unsigned long *map_len, int km) + unsigned long *map_len) { size_t offset = start & (PAGE_CACHE_SIZE - 1); char *kaddr; @@ -3536,42 +4206,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, } p = extent_buffer_page(eb, i); - kaddr = kmap_atomic(p, km); - *token = kaddr; + kaddr = page_address(p); *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; return 0; } -int map_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, - char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) -{ - int err; - int save = 0; - if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, km); - eb->map_token = NULL; - save = 1; - } - err = map_private_extent_buffer(eb, start, min_len, token, map, - map_start, map_len, km); - if (!err && save) { - eb->map_token = *token; - eb->kaddr = *map; - eb->map_start = *map_start; - eb->map_len = *map_len; - } - return err; -} - -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) -{ - kunmap_atomic(token, km); -} - int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned 
long len) @@ -3595,9 +4235,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); ret = memcmp(ptr, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER0); if (ret) break; @@ -3630,9 +4269,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(kaddr + offset, src, cur); - kunmap_atomic(kaddr, KM_USER1); src += cur; len -= cur; @@ -3661,9 +4299,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); memset(kaddr + offset, c, cur); - kunmap_atomic(kaddr, KM_USER0); len -= cur; offset = 0; @@ -3694,9 +4331,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); read_extent_buffer(src, kaddr + offset, src_offset, cur); - kunmap_atomic(kaddr, KM_USER0); src_offset += cur; len -= cur; @@ -3709,20 +4345,17 @@ static void move_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); if (dst_page == src_page) { memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); } else { - char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *src_kaddr = page_address(src_page); char *p = dst_kaddr + dst_off + len; char *s = src_kaddr + src_off + len; while (len--) *--p = *--s; - - kunmap_atomic(src_kaddr, KM_USER1); } - kunmap_atomic(dst_kaddr, KM_USER0); } static inline bool areas_overlap(unsigned long src, unsigned long dst, 
unsigned long len) @@ -3735,20 +4368,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); char *src_kaddr; if (dst_page != src_page) { - src_kaddr = kmap_atomic(src_page, KM_USER1); + src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; BUG_ON(areas_overlap(src_off, dst_off, len)); } memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); - kunmap_atomic(dst_kaddr, KM_USER0); - if (dst_page != src_page) - kunmap_atomic(src_kaddr, KM_USER1); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, |