diff options
Diffstat (limited to 'fs')
76 files changed, 1013 insertions, 462 deletions
diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 5a9b684..1f3624d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -109,7 +109,7 @@ struct afs_call { unsigned reply_size; /* current size of reply */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ - unsigned short offset; /* offset into received data store */ + unsigned offset; /* offset into received data store */ unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index e45a323..8ad8c2a 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -314,6 +314,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; int ret; + struct sk_buff *skb; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -380,6 +381,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, error_do_abort: rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); rxrpc_kernel_end_call(rxcall); call->rxcall = NULL; error_kill_call: @@ -228,12 +228,6 @@ static void __put_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, ctx_rcu_free); } -static inline void get_ioctx(struct kioctx *kioctx) -{ - BUG_ON(atomic_read(&kioctx->users) <= 0); - atomic_inc(&kioctx->users); -} - static inline int try_get_ioctx(struct kioctx *kioctx) { return atomic_inc_not_zero(&kioctx->users); @@ -273,7 +267,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) mm = ctx->mm = current->mm; atomic_inc(&mm->mm_count); - atomic_set(&ctx->users, 1); + atomic_set(&ctx->users, 2); spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->ring_info.ring_lock); init_waitqueue_head(&ctx->wait); @@ -527,11 +521,16 @@ static void aio_fput_routine(struct work_struct *data) fput(req->ki_filp); /* Link the iocb into the context's free list */ + rcu_read_lock(); spin_lock_irq(&ctx->ctx_lock); really_put_req(ctx, req); + /* + * at that point ctx might've been killed, but actual + * freeing is RCU'd + */ spin_unlock_irq(&ctx->ctx_lock); + rcu_read_unlock(); - put_ioctx(ctx); spin_lock_irq(&fput_lock); } spin_unlock_irq(&fput_lock); @@ -562,7 +561,6 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) * this function will be executed w/out any aio kthread wakeup. */ if (unlikely(!fput_atomic(req->ki_filp))) { - get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); spin_unlock(&fput_lock); @@ -1256,10 +1254,10 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); - if (!ret) + if (!ret) { + put_ioctx(ioctx); return 0; - - get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ + } io_destroy(ioctx); } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 475f9c5..756d328 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -278,6 +278,17 @@ int autofs4_fill_super(struct super_block *, void *, int); struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); void autofs4_clean_ino(struct autofs_info *); +static inline int autofs_prepare_pipe(struct file *pipe) +{ + if (!pipe->f_op || !pipe->f_op->write) + return -EINVAL; + if (!S_ISFIFO(pipe->f_dentry->d_inode->i_mode)) + return -EINVAL; + /* We want a packet pipe */ + pipe->f_flags |= O_DIRECT; + return 0; +} + /* Queue management functions */ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 509fe1e..de54271 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -376,7 +376,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, err = -EBADF; goto out; } - if (!pipe->f_op || !pipe->f_op->write) { + if (autofs_prepare_pipe(pipe) < 0) { err = -EPIPE; fput(pipe); goto out; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 180fa24..7c26678 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -292,7 +292,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) printk("autofs: could not open pipe file descriptor\n"); goto fail_dput; } - if (!pipe->f_op || !pipe->f_op->write) + if (autofs_prepare_pipe(pipe) < 0) goto fail_fput; sbi->pipe = pipe; sbi->pipefd = pipefd; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 2543598..813ea10 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -90,7 +90,7 @@ static int autofs4_write(struct file *file, const void *addr, int bytes) return (bytes > 0); } - + static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9ba2ac7..618493e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1422,7 +1422,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, for (i = 1; i < view->n; ++i) { const struct user_regset *regset = &view->regsets[i]; do_thread_regset_writeback(t->task, regset); - if (regset->core_note_type && + if (regset->core_note_type && regset->get && (!regset->active || regset->active(t->task, regset))) { int ret; size_t size = regset->n * regset->size; diff --git a/fs/block_dev.c b/fs/block_dev.c index 34503ba..74fc5ed 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1149,8 +1149,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) * The latter is necessary to prevent ghost * partitions on a removed medium. */ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(disk, bdev); + } if (ret) goto out_clear; } else { @@ -1180,8 +1184,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(bdev->bd_disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(bdev->bd_disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(bdev->bd_disk, bdev); + } if (ret) goto out_unlock_bdev; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3b859a3..66179bc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1972,7 +1972,7 @@ BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, static inline bool btrfs_root_readonly(struct btrfs_root *root) { - return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } /* struct btrfs_super_block */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index cb85825..b775809 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3004,7 +3004,7 @@ cifs_get_volume_info(char *mount_data, const char *devname) int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { - int rc = 0; + int rc; int xid; struct cifs_ses *pSesInfo; struct cifs_tcon *tcon; @@ -3033,6 +3033,7 @@ try_mount_again: FreeXid(xid); } #endif + rc = 0; tcon = NULL; pSesInfo = NULL; srvTcp = NULL; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 16cdd6d..ed5c07b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -583,10 +583,26 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, * If either that or op not supported returned, follow * the normal lookup. */ - if ((rc == 0) || (rc == -ENOENT)) + switch (rc) { + case 0: + /* + * The server may allow us to open things like + * FIFOs, but the client isn't set up to deal + * with that. If it's not a regular file, just + * close it and proceed as if it were a normal + * lookup. + */ + if (newInode && !S_ISREG(newInode->i_mode)) { + CIFSSMBClose(xid, pTcon, fileHandle); + break; + } + case -ENOENT: posix_open = true; - else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP)) + case -EOPNOTSUPP: + break; + default: pTcon->broken_posix_open = true; + } } if (!posix_open) rc = cifs_get_inode_info_unix(&newInode, full_path, diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d3e6196..0cfae19 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -244,16 +244,15 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* copy user */ /* BB what about null user mounts - check that we do this BB */ /* copy user */ - if (ses->user_name != NULL) + if (ses->user_name != NULL) { strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); + bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); + } /* else null user mount */ - - bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); *bcc_ptr = 0; bcc_ptr++; /* account for null termination */ /* copy domain */ - if (ses->domainName != NULL) { strncpy(bcc_ptr, ses->domainName, 256); bcc_ptr += strnlen(ses->domainName, 256); diff --git a/fs/dcache.c b/fs/dcache.c index d2f8feb..0b51cfc 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -241,6 +241,7 @@ static void dentry_lru_add(struct dentry *dentry) static void __dentry_lru_del(struct dentry *dentry) { list_del_init(&dentry->d_lru); + dentry->d_flags &= ~DCACHE_SHRINK_LIST; dentry->d_sb->s_nr_dentry_unused--; dentry_stat.nr_unused--; } @@ -753,6 +754,7 @@ relock: spin_unlock(&dentry->d_lock); } else { list_move_tail(&dentry->d_lru, &tmp); + dentry->d_flags |= DCACHE_SHRINK_LIST; spin_unlock(&dentry->d_lock); if (!--cnt) break; @@ -1144,14 +1146,18 @@ resume: /* * move only zero ref count dentries to the end * of the unused list for prune_dcache + * + * Those which are presently on the shrink list, being processed + * by shrink_dentry_list(), shouldn't be moved. Otherwise the + * loop in shrink_dcache_parent() might not make any progress + * and loop forever. */ - if (!dentry->d_count) { + if (dentry->d_count) { + dentry_lru_del(dentry); + } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { dentry_lru_move_tail(dentry); found++; - } else { - dentry_lru_del(dentry); } - /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find @@ -2427,6 +2433,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) if (d_ancestor(alias, dentry)) { /* Check for loops */ actual = ERR_PTR(-ELOOP); + spin_unlock(&inode->i_lock); } else if (IS_ROOT(alias)) { /* Is this an anonymous mountpoint that we * could splice into our tree? */ @@ -2436,7 +2443,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) goto found; } else { /* Nope, but we must(!) avoid directory - * aliasing */ + * aliasing. This drops inode->i_lock */ actual = __d_unalias(inode, dentry, alias); } write_sequnlock(&rename_lock); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 7cf5c3e..c6602d2 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -417,17 +417,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, (unsigned long long)(extent_base + extent_offset), rc); goto out; } - if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Encrypting extent " - "with iv:\n"); - ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes); - ecryptfs_printk(KERN_DEBUG, "First 8 bytes before " - "encryption:\n"); - ecryptfs_dump_hex((char *) - (page_address(page) - + (extent_offset * crypt_stat->extent_size)), - 8); - } rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, page, (extent_offset * crypt_stat->extent_size), @@ -440,14 +429,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, goto out; } rc = 0; - if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; " - "rc = [%d]\n", - (unsigned long long)(extent_base + extent_offset), rc); - ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " - "encryption:\n"); - ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8); - } out: return rc; } @@ -543,17 +524,6 @@ static int ecryptfs_decrypt_extent(struct page *page, (unsigned long long)(extent_base + extent_offset), rc); goto out; } - if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Decrypting extent " - "with iv:\n"); - ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes); - ecryptfs_printk(KERN_DEBUG, "First 8 bytes before " - "decryption:\n"); - ecryptfs_dump_hex((char *) - (page_address(enc_extent_page) - + (extent_offset * crypt_stat->extent_size)), - 8); - } rc = ecryptfs_decrypt_page_offset(crypt_stat, page, (extent_offset * crypt_stat->extent_size), @@ -567,16 +537,6 @@ static int ecryptfs_decrypt_extent(struct page *page, goto out; } rc = 0; - if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; " - "rc = [%d]\n", - (unsigned long long)(extent_base + extent_offset), rc); - ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " - "decryption:\n"); - ecryptfs_dump_hex((char *)(page_address(page) - + (extent_offset - * crypt_stat->extent_size)), 8); - } out: return rc; } @@ -1618,7 +1578,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " - "file header region or xattr region\n"); + "file header region or xattr region, inode %lu\n", + ecryptfs_inode->i_ino); rc = -EINVAL; goto out; } @@ -1627,7 +1588,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) ECRYPTFS_DONT_VALIDATE_HEADER_SIZE); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " - "file xattr region either\n"); + "file xattr region either, inode %lu\n", + ecryptfs_inode->i_ino); rc = -EINVAL; } if (crypt_stat->mount_crypt_stat->flags @@ -1638,7 +1600,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) "crypto metadata only in the extended attribute " "region, but eCryptfs was mounted without " "xattr support enabled. eCryptfs will not treat " - "this like an encrypted file.\n"); + "this like an encrypted file, inode %lu\n", + ecryptfs_inode->i_ino); rc = -EINVAL; } } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 876b25c..28f09c7 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -887,18 +887,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, size_t num_zeros = (PAGE_CACHE_SIZE - (ia->ia_size & ~PAGE_CACHE_MASK)); - - /* - * XXX(truncate) this should really happen at the begginning - * of ->setattr. But the code is too messy to that as part - * of a larger patch. ecryptfs is also totally missing out - * on the inode_change_ok check at the beginning of - * ->setattr while would include this. - */ - rc = inode_newsize_ok(inode, ia->ia_size); - if (rc) - goto out; - if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { truncate_setsize(inode, ia->ia_size); lower_ia->ia_size = ia->ia_size; @@ -948,6 +936,28 @@ out: return rc; } +static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset) +{ + struct ecryptfs_crypt_stat *crypt_stat; + loff_t lower_oldsize, lower_newsize; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + lower_oldsize = upper_size_to_lower_size(crypt_stat, + i_size_read(inode)); + lower_newsize = upper_size_to_lower_size(crypt_stat, offset); + if (lower_newsize > lower_oldsize) { + /* + * The eCryptfs inode and the new *lower* size are mixed here + * because we may not have the lower i_mutex held and/or it may + * not be appropriate to call inode_newsize_ok() with inodes + * from other filesystems. + */ + return inode_newsize_ok(inode, lower_newsize); + } + + return 0; +} + /** * ecryptfs_truncate * @dentry: The ecryptfs layer dentry @@ -964,6 +974,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct iattr lower_ia = { .ia_valid = 0 }; int rc; + rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length); + if (rc) + return rc; + rc = truncate_upper(dentry, &ia, &lower_ia); if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); @@ -1045,6 +1059,16 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } } mutex_unlock(&crypt_stat->cs_mutex); + + rc = inode_change_ok(inode, ia); + if (rc) + goto out; + if (ia->ia_valid & ATTR_SIZE) { + rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size); + if (rc) + goto out; + } + if (S_ISREG(inode->i_mode)) { rc = filemap_write_and_wait(inode->i_mapping); if (rc) @@ -1128,6 +1152,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, } rc = vfs_setxattr(lower_dentry, name, value, size, flags); + if (!rc) + fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); out: return rc; } diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 940a82e..0dc5a3d 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -409,11 +409,47 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, ssize_t sz = 0; char *data; uid_t euid = current_euid(); + unsigned char packet_size_peek[3]; int rc; - if (count == 0) + if (count == 0) { goto out; + } else if (count == (1 + 4)) { + /* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */ + goto memdup; + } else if (count < (1 + 4 + 1) + || count > (1 + 4 + 2 + sizeof(struct ecryptfs_message) + 4 + + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)) { + printk(KERN_WARNING "%s: Acceptable packet size range is " + "[%d-%lu], but amount of data written is [%zu].", + __func__, (1 + 4 + 1), + (1 + 4 + 2 + sizeof(struct ecryptfs_message) + 4 + + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES), count); + return -EINVAL; + } + + if (copy_from_user(packet_size_peek, (buf + 1 + 4), + sizeof(packet_size_peek))) { + printk(KERN_WARNING "%s: Error while inspecting packet size\n", + __func__); + return -EFAULT; + } + + rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size, + &packet_size_length); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + return rc; + } + + if ((1 + 4 + packet_size_length + packet_size) != count) { + printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__, + packet_size); + return -EINVAL; + } +memdup: data = memdup_user(buf, count); if (IS_ERR(data)) { printk(KERN_ERR "%s: memdup_user returned error [%ld]\n", @@ -435,23 +471,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, } memcpy(&counter_nbo, &data[i], 4); seq = be32_to_cpu(counter_nbo); - i += 4; - rc = ecryptfs_parse_packet_length(&data[i], &packet_size, - &packet_size_length); - if (rc) { - printk(KERN_WARNING "%s: Error parsing packet length; " - "rc = [%d]\n", __func__, rc); - goto out_free; - } - i += packet_size_length; - if ((1 + 4 + packet_size_length + packet_size) != count) { - printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])" - " + packet_size([%zd]))([%zd]) != " - "count([%zd]). Invalid packet format.\n", - __func__, packet_size_length, packet_size, - (1 + packet_size_length + packet_size), count); - goto out_free; - } + i += 4 + packet_size_length; rc = ecryptfs_miscdev_response(&data[i], packet_size, euid, current_user_ns(), task_pid(current), seq); diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 3745f7c..608c1c3 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -130,13 +130,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); - size_t total_remaining_bytes = ((offset + size) - pos); + loff_t total_remaining_bytes = ((offset + size) - pos); + + if (fatal_signal_pending(current)) { + rc = -EINTR; + break; + } if (num_bytes > total_remaining_bytes) num_bytes = total_remaining_bytes; if (pos < offset) { /* remaining zeros to write, up to destination offset */ - size_t total_remaining_zeros = (offset - pos); + loff_t total_remaining_zeros = (offset - pos); if (num_bytes > total_remaining_zeros) num_bytes = total_remaining_zeros; @@ -193,15 +198,19 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, } pos += num_bytes; } - if ((offset + size) > ecryptfs_file_size) { - i_size_write(ecryptfs_inode, (offset + size)); + if (pos > ecryptfs_file_size) { + i_size_write(ecryptfs_inode, pos); if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { - rc = ecryptfs_write_inode_size_to_metadata( + int rc2; + + rc2 = ecryptfs_write_inode_size_to_metadata( ecryptfs_inode); - if (rc) { + if (rc2) { printk(KERN_ERR "Problem with " "ecryptfs_write_inode_size_to_metadata; " - "rc = [%d]\n", rc); + "rc = [%d]\n", rc2); + if (!rc) + rc = rc2; goto out; } } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2acaf60..35a852a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -197,6 +197,12 @@ struct eventpoll { /* The user that created the eventpoll descriptor */ struct user_struct *user; + + struct file *file; + + /* used to optimize loop detection check */ + int visited; + struct list_head visited_list_link; }; /* Wait structure used by the poll hooks */ @@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ +static LIST_HEAD(visited_list); + +/* + * List of files with newly added links, where we may need to limit the number + * of emanating paths. Protected by the epmutex. + */ +static LIST_HEAD(tfile_check_list); + #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -276,6 +291,12 @@ ctl_table epoll_table[] = { }; #endif /* CONFIG_SYSCTL */ +static const struct file_operations eventpoll_fops; + +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -299,6 +320,11 @@ static inline int ep_is_linked(struct list_head *p) return !list_empty(p); } +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait); +} + /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_t *p) { @@ -446,6 +472,18 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +static void ep_remove_wait_queue(struct eppoll_entry *pwq) +{ + wait_queue_head_t *whead; + + rcu_read_lock(); + /* If it is cleared by POLLFREE, it should be rcu-safe */ + whead = rcu_dereference(pwq->whead); + if (whead) + remove_wait_queue(whead, &pwq->wait); + rcu_read_unlock(); +} + /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held (or "epmutex" if called from @@ -460,7 +498,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) pwq = list_first_entry(lsthead, struct eppoll_entry, llink); list_del(&pwq->llink); - remove_wait_queue(pwq->whead, &pwq->wait); + ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } @@ -711,12 +749,6 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an evenpoll file */ -static inline int is_file_epoll(struct file *f) -{ - return f->f_op == &eventpoll_fops; -} - /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are @@ -827,6 +859,17 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + if ((unsigned long)key & POLLFREE) { + ep_pwq_from_wait(wait)->whead = NULL; + /* + * whead = NULL above can race with ep_remove_wait_queue() + * which can do another remove_wait_queue() after us, so we + * can't use __remove_wait_queue(). whead->lock is held by + * the caller. + */ + list_del_init(&wait->task_list); + } + spin_lock_irqsave(&ep->lock, flags); /* @@ -926,6 +969,103 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + +#define PATH_ARR_SIZE 5 +/* + * These are the number paths of length 1 to 5, that we are allowing to emanate + * from a single file of interest. For example, we allow 1000 paths of length + * 1, to emanate from each file of interest. This essentially represents the + * potential wakeup paths, which need to be limited in order to avoid massive + * uncontrolled wakeup storms. The common use case should be a single ep which + * is connected to n file sources. In this case each file source has 1 path + * of length 1. Thus, the numbers below should be more than sufficient. These + * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify + * and delete can't add additional paths. Protected by the epmutex. + */ +static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; +static int path_count[PATH_ARR_SIZE]; + +static int path_count_inc(int nests) +{ + /* Allow an arbitrary number of depth 1 paths */ + if (nests == 0) + return 0; + + if (++path_count[nests] > path_limits[nests]) + return -1; + return 0; +} + +static void path_count_init(void) +{ + int i; + + for (i = 0; i < PATH_ARR_SIZE; i++) + path_count[i] = 0; +} + +static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct file *child_file; + struct epitem *epi; + + list_for_each_entry(epi, &file->f_ep_links, fllink) { + child_file = epi->ep->file; + if (is_file_epoll(child_file)) { + if (list_empty(&child_file->f_ep_links)) { + if (path_count_inc(call_nests)) { + error = -1; + break; + } + } else { + error = ep_call_nested(&poll_loop_ncalls, + EP_MAX_NESTS, + reverse_path_check_proc, + child_file, child_file, + current); + } + if (error != 0) + break; + } else { + printk(KERN_ERR "reverse_path_check_proc: " + "file is not an ep!\n"); + } + } + return error; +} + +/** + * reverse_path_check - The tfile_check_list is list of file *, which have + * links that are proposed to be newly added. We need to + * make sure that those added links don't add too many + * paths such that we will spend all our time waking up + * eventpoll objects. + * + * Returns: Returns zero if the proposed links don't create too many paths, + * -1 otherwise. + */ +static int reverse_path_check(void) +{ + int length = 0; + int error = 0; + struct file *current_file; + + /* let's call this for all tfiles */ + list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + length++; + path_count_init(); + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + reverse_path_check_proc, current_file, + current_file, current); + if (error) + break; + } + return error; +} + /* * Must be called with "mtx" held. */ @@ -987,6 +1127,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, */ ep_rbtree_insert(ep, epi); + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (reverse_path_check()) + goto error_remove_epi; + /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); @@ -1011,6 +1156,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, return 0; +error_remove_epi: + spin_lock(&tfile->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&tfile->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + error_unregister: ep_unregister_pollwait(ep, epi); @@ -1275,18 +1428,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) int error = 0; struct file *file = priv; struct eventpoll *ep = file->private_data; + struct eventpoll *ep_tovisit; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, call_nests + 1); + ep->visited = 1; + list_add(&ep->visited_list_link, &visited_list); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { + ep_tovisit = epi->ffd.file->private_data; + if (ep_tovisit->visited) + continue; error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, - ep_loop_check_proc, epi->ffd.file, - epi->ffd.file->private_data, current); + ep_loop_check_proc, epi->ffd.file, + ep_tovisit, current); if (error != 0) break; + } else { + /* + * If we've reached a file that is not associated with + * an ep, then we need to check if the newly added + * links are going to add too many wakeup paths. We do + * this by adding it to the tfile_check_list, if it's + * not already there, and calling reverse_path_check() + * during ep_insert(). + */ + if (list_empty(&epi->ffd.file->f_tfile_llink)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); } } mutex_unlock(&ep->mtx); @@ -1307,8 +1478,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + int ret; + struct eventpoll *ep_cur, *ep_next; + + ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ep_loop_check_proc, file, ep, current); + /* clear visited list */ + list_for_each_entry_safe(ep_cur, ep_next, &visited_list, + visited_list_link) { + ep_cur->visited = 0; + list_del(&ep_cur->visited_list_link); + } + return ret; +} + +static void clear_tfile_check_list(void) +{ + struct file *file; + + /* first clear the tfile_check_list */ + while (!list_empty(&tfile_check_list)) { + file = list_first_entry(&tfile_check_list, struct file, + f_tfile_llink); + list_del_init(&file->f_tfile_llink); + } + INIT_LIST_HEAD(&tfile_check_list); } /* @@ -1316,8 +1510,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error; + int error, fd; struct eventpoll *ep = NULL; + struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); @@ -1334,11 +1529,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); + if (fd < 0) { + error = fd; + goto out_free_ep; + } + file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); - if (error < 0) - ep_free(ep); - + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_fd; + } + fd_install(fd, file); + ep->file = file; + return fd; + +out_free_fd: + put_unused_fd(fd); +out_free_ep: + ep_free(ep); return error; } @@ -1404,21 +1613,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* * When we insert an epoll file descriptor, inside another epoll file * descriptor, there is the change of creating closed loops, which are - * better be handled here, than in more critical paths. + * better be handled here, than in more critical paths. While we are + * checking for loops we also determine the list of files reachable + * and hang them on the tfile_check_list, so we can check that we + * haven't created too many possible wakeup paths. * - * We hold epmutex across the loop check and the insert in this case, in - * order to prevent two separate inserts from racing and each doing the - * insert "at the same time" such that ep_loop_check passes on both - * before either one does the insert, thereby creating a cycle. + * We need to hold the epmutex across both ep_insert and ep_remove + * b/c we want to make sure we are looking at a coherent view of + * epoll network. */ - if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { mutex_lock(&epmutex); did_lock_epmutex = 1; - error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) - goto error_tgt_fput; } - + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } mutex_lock_nested(&ep->mtx, 0); @@ -1437,6 +1652,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; + clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -1455,7 +1671,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: - if (unlikely(did_lock_epmutex)) + if (did_lock_epmutex) mutex_unlock(&epmutex); fput(tfile); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3451d23..db9ba1a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1568,7 +1568,13 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); /* * We give up here if we're reentered, because it might be for a @@ -1642,7 +1648,13 @@ static int ext3_writeback_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto out_fail; @@ -1684,7 +1696,13 @@ static int ext3_journalled_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto no_write; diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5802fa1..95af6f8 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,43 +261,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, /* super.c */ int ext4_force_commit(struct super_block *sb); -static inline int ext4_should_journal_data(struct inode *inode) +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 1; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return 1; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 1; - return 0; + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 1; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 14299de..4f31deb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -341,6 +341,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + if (len == 0) + return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -2844,7 +2846,7 @@ static int ext4_split_extent_at(handle_t *handle, if (err) goto fix_extent_len; /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le32(ee_len); + ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(inode, path, ex); err = ext4_ext_dirty(handle, inode, path + depth); goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7743b1f..55d3e51 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3216,13 +3216,14 @@ static int ext4_da_write_end(struct file *file, int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) { - if (ext4_should_order_data(inode)) { + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: return ext4_ordered_write_end(file, mapping, pos, len, copied, page, fsdata); - } else if (ext4_should_writeback_data(inode)) { + case EXT4_INODE_WRITEBACK_DATA_MODE: return ext4_writeback_write_end(file, mapping, pos, len, copied, page, fsdata); - } else { + default: BUG(); } } @@ -3514,12 +3515,17 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, } retry: - if (rw == READ && ext4_should_dioread_nolock(inode)) + if (rw == READ && ext4_should_dioread_nolock(inode)) { + if (unlikely(!list_empty(&ei->i_completed_io_list))) { + mutex_lock(&inode->i_mutex); + ext4_flush_completed_IO(inode); + mutex_unlock(&inode->i_mutex); + } ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL, NULL, 0); - else { + } else { ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, @@ -3917,18 +3923,25 @@ static const struct address_space_operations ext4_da_aops = { void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_order_data(inode)) - inode->i_mapping->a_ops = &ext4_ordered_aops; - else if (ext4_should_writeback_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext4_writeback_aops; - else + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_ordered_aops; + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_writeback_aops; + break; + case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; + break; + default: + BUG(); + } } /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7aa77f0..df121b2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1957,17 +1957,16 @@ static int ext4_fill_flex_info(struct super_block *sb) struct ext4_group_desc *gdp = NULL; ext4_group_t flex_group_count; ext4_group_t flex_group; - int groups_per_flex = 0; + unsigned int groups_per_flex = 0; size_t size; int i; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - - if (groups_per_flex < 2) { + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index b4ba1b3..408073a 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -360,6 +360,10 @@ int hfsplus_rename_cat(u32 cnid, err = hfs_brec_find(&src_fd); if (err) goto out; + if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { + err = -EIO; + goto out; + } hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, src_fd.entrylength); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 4df5059..159f5eb 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -146,6 +146,11 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) filp->f_pos++; /* fall through */ case 1: + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { @@ -177,6 +182,12 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } + + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = be16_to_cpu(entry.type); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7aafeb8..8b0c875 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -238,17 +238,10 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, loff_t isize; ssize_t retval = 0; - mutex_lock(&inode->i_mutex); - /* validate length */ if (len == 0) goto out; - isize = i_size_read(inode); - if (!isize) - goto out; - - end_index = (isize - 1) >> huge_page_shift(h); for (;;) { struct page *page; unsigned long nr, ret; @@ -256,18 +249,21 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); + isize = i_size_read(inode); + if (!isize) + goto out; + end_index = (isize - 1) >> huge_page_shift(h); if (index >= end_index) { if (index > end_index) goto out; nr = ((isize - 1) & ~huge_page_mask(h)) + 1; - if (nr <= offset) { + if (nr <= offset) goto out; - } } nr = nr - offset; /* Find the page */ - page = find_get_page(mapping, index); + page = find_lock_page(mapping, index); if (unlikely(page == NULL)) { /* * We have a HOLE, zero out the user-buffer for the @@ -279,17 +275,18 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, else ra = 0; } else { + unlock_page(page); + /* * We have the page, copy it to user space buffer. */ ra = hugetlbfs_read_actor(page, offset, buf, len, nr); ret = ra; + page_cache_release(page); } if (ra < 0) { if (retval == 0) retval = ra; - if (page) - page_cache_release(page); goto out; } @@ -299,16 +296,12 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, index += offset >> huge_page_shift(h); offset &= ~huge_page_mask(h); - if (page) - page_cache_release(page); - /* short read or no more work */ if ((ret != nr) || (len == 0)) break; } out: *ppos = ((loff_t)index << huge_page_shift(h)) + offset; - mutex_unlock(&inode->i_mutex); return retval; } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index eef6979..36c2e80 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -683,7 +683,7 @@ start_journal_io: if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, @@ -819,7 +819,7 @@ wait_for_iobuf: if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && journal->j_flags & JBD2_BARRIER) { - blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL); + blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); } if (err) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 0615bfd..8088e00 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1911,6 +1911,8 @@ zap_buffer_unlocked: clear_buffer_mapped(bh); clear_buffer_req(bh); clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); bh->b_bdev = NULL; return may_free; } diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index f848b52..046bb77 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -241,7 +241,7 @@ static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat) p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) goto out_overflow; - if (unlikely(*p > nlm4_failed)) + if (unlikely(ntohl(*p) > ntohl(nlm4_failed))) goto out_bad_xdr; *stat = *p; return 0; diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 180ac34..36057ce 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -236,7 +236,7 @@ static int decode_nlm_stat(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) goto out_overflow; - if (unlikely(*p > nlm_lck_denied_grace_period)) + if (unlikely(ntohl(*p) > ntohl(nlm_lck_denied_grace_period))) goto out_enum; *stat = *p; return 0; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index abfff9d..1743064 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -440,7 +440,7 @@ static int param_set_##name(const char *val, struct kernel_param *kp) \ __typeof__(type) num = which_strtol(val, &endp, 0); \ if (endp == val || *endp || num < (min) || num > (max)) \ return -EINVAL; \ - *((int *) kp->arg) = num; \ + *((type *) kp->arg) = num; \ return 0; \ } @@ -2109,7 +2109,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, /* sayonara */ error = complete_walk(nd); if (error) - return ERR_PTR(-ECHILD); + return ERR_PTR(error); error = -ENOTDIR; if (nd->flags & LOOKUP_DIRECTORY) { @@ -2208,7 +2208,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); if (error) - goto exit; + return ERR_PTR(error); error = -EISDIR; if (S_ISDIR(nd->inode->i_mode)) goto exit; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index aaa09e9..b5c826e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -324,7 +324,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) dprintk("%s enter. slotid %d seqid %d\n", __func__, args->csa_slotid, args->csa_sequenceid); - if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS) + if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) return htonl(NFS4ERR_BADSLOT); slot = tbl->slots + args->csa_slotid; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 321a66b..ecabbd8 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -466,6 +466,17 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } +void nfs_remove_bad_delegation(struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); + if (delegation) { + nfs_inode_find_state_and_recover(inode, &delegation->stateid); + nfs_free_delegation(delegation); + } +} + /** * nfs_expire_all_delegation_types * @clp: client to process diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index d9322e4..691a796 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -45,6 +45,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp); void nfs_handle_cb_pathdown(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); +void nfs_remove_bad_delegation(struct inode *inode); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c4a6983..e1c1365 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -209,6 +209,7 @@ struct nfs4_exception { long timeout; int retry; struct nfs4_state *state; + struct inode *inode; }; struct nfs4_state_recovery_ops { @@ -344,6 +345,8 @@ extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); +extern void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid); extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a2a7d0a..3d67302 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -254,15 +254,28 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; + struct inode *inode = exception->inode; int ret = errorcode; exception->retry = 0; switch(errorcode) { case 0: return 0; + case -NFS4ERR_OPENMODE: + if (nfs_have_delegation(inode, FMODE_READ)) { + nfs_inode_return_delegation(inode); + exception->retry = 1; + return 0; + } + if (state == NULL) + break; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OPENMODE: + if (state != NULL) + nfs_remove_bad_delegation(state->inode); if (state == NULL) break; nfs4_schedule_stateid_recovery(server, state); @@ -1305,8 +1318,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state * The show must go on: exit, but mark the * stateid as needing recovery. */ + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: + nfs_inode_find_state_and_recover(state->inode, + stateid); nfs4_schedule_stateid_recovery(server, state); case -EKEYEXPIRED: /* @@ -1862,7 +1878,10 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .state = state, + .inode = inode, + }; int err; do { err = nfs4_handle_exception(server, @@ -3678,8 +3697,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, if (task->tk_status >= 0) return 0; switch(task->tk_status) { + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: + if (state != NULL) + nfs_remove_bad_delegation(state->inode); case -NFS4ERR_OPENMODE: if (state == NULL) break; @@ -4402,7 +4424,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) { struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .inode = state->inode, + }; int err; do { @@ -4420,7 +4444,9 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request) { struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .inode = state->inode, + }; int err; err = nfs4_set_lock_state(state, request); @@ -4484,7 +4510,10 @@ out: static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .state = state, + .inode = state->inode, + }; int err; do { @@ -4529,6 +4558,20 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (state == NULL) return -ENOLCK; + /* + * Don't rely on the VFS having checked the file open mode, + * since it won't do this for flock() locks. + */ + switch (request->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + do { status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) @@ -4577,6 +4620,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) * The show must go on: exit, but mark the * stateid as needing recovery. */ + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OPENMODE: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e97dd21..c6e2769 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1069,6 +1069,33 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4 nfs4_schedule_state_manager(clp); } +void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state *state; + bool found = false; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; + if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + found = true; + } + spin_unlock(&inode->i_lock); + if (found) + nfs4_schedule_state_manager(clp); +} + + static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) { struct inode *inode = state->inode; @@ -1519,16 +1546,16 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) { if (!flags) return; - else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); - else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED | SEQ4_STATUS_LEASE_MOVED)) nfs41_handle_state_revoked(clp); - else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) + if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); - else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + if (flags & (SEQ4_STATUS_CB_PATH_DOWN | SEQ4_STATUS_BACKCHANNEL_FAULT | SEQ4_STATUS_CB_PATH_DOWN_SESSION)) nfs41_handle_cb_path_down(clp); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 1d1dc1e..75fe694 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -1006,7 +1006,8 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", - .flags = PNFS_LAYOUTRET_ON_SETATTR, + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1d06f8e..fefa122 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -294,9 +294,11 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); rdata = state->rpcdata; rdata->task.tk_status = status; - if (status >= 0) { + if (likely(status >= 0)) { rdata->res.count = status; rdata->res.eof = eof; + } else { + rdata->pnfs_error = status; } objlayout_iodone(state); /* must not use state after this point */ @@ -380,15 +382,17 @@ objlayout_write_done(struct objlayout_io_state *state, ssize_t status, wdata = state->rpcdata; state->status = status; wdata->task.tk_status = status; - if (status >= 0) { + if (likely(status >= 0)) { wdata->res.count = status; wdata->verf.committed = state->committed; dprintk("%s: Return status %d committed %d\n", __func__, wdata->task.tk_status, wdata->verf.committed); - } else + } else { + wdata->pnfs_error = status; dprintk("%s: Return status %d\n", __func__, wdata->task.tk_status); + } objlayout_iodone(state); /* must not use state after this point */ diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 36d2a29..9951887 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1119,6 +1119,14 @@ pnfs_ld_write_done(struct nfs_write_data *data) data->mds_ops->rpc_release(data); return 0; } + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + /* Don't lo_commit on error, Server will needs to + * preform a file recovery. + */ + clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(data->inode)->flags); + pnfs_return_layout(data->inode); + } dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, data->pnfs_error); @@ -1167,6 +1175,10 @@ pnfs_ld_read_done(struct nfs_read_data *data) return 0; } + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) + pnfs_return_layout(data->inode); + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, data->pnfs_error); status = nfs_initiate_read(data, NFS_CLIENT(data->inode), diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 9d147d9..bb8b324 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -68,6 +68,7 @@ enum { enum layoutdriver_policy_flags { /* Should the pNFS client commit and return the layout upon a setattr */ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, + PNFS_LAYOUTRET_ON_ERROR = 1 << 1, }; struct nfs4_deviceid_node; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 858d31b..8e7b61d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -904,10 +904,24 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve data->auth_flavor_len = 1; data->version = version; data->minorversion = 0; + security_init_mnt_opts(&data->lsm_opts); } return data; } +static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data) +{ + if (data) { + kfree(data->client_address); + kfree(data->mount_server.hostname); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + kfree(data); + } +} + /* * Sanity-check a server address provided by the mount command. * @@ -2218,9 +2232,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) - goto out_free_fh; - - security_init_mnt_opts(&data->lsm_opts); + goto out; /* Validate the mount data */ error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); @@ -2232,8 +2244,6 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, #ifdef CONFIG_NFS_V4 if (data->version == 4) { mntroot = nfs4_try_mount(flags, dev_name, data); - kfree(data->client_address); - kfree(data->nfs_server.export_path); goto out; } #endif /* CONFIG_NFS_V4 */ @@ -2284,13 +2294,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; out: - kfree(data->nfs_server.hostname); - kfree(data->mount_server.hostname); - kfree(data->fscache_uniq); - security_free_mnt_opts(&data->lsm_opts); -out_free_fh: + nfs_free_parsed_mount_data(data); nfs_free_fhandle(mntfh); - kfree(data); return mntroot; out_err_nosb: @@ -2613,9 +2618,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) - goto out_free_fh; - - security_init_mnt_opts(&data->lsm_opts); + goto out; /* Get a volume representation */ server = nfs4_create_server(data, mntfh); @@ -2663,13 +2666,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; - security_free_mnt_opts(&data->lsm_opts); nfs_free_fhandle(mntfh); return mntroot; out: - security_free_mnt_opts(&data->lsm_opts); -out_free_fh: nfs_free_fhandle(mntfh); return ERR_PTR(error); @@ -2694,11 +2694,15 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, char *root_devname; size_t len; - len = strlen(hostname) + 3; + len = strlen(hostname) + 5; root_devname = kmalloc(len, GFP_KERNEL); if (root_devname == NULL) return ERR_PTR(-ENOMEM); - snprintf(root_devname, len, "%s:/", hostname); + /* Does hostname needs to be enclosed in brackets? */ + if (strchr(hostname, ':')) + snprintf(root_devname, len, "[%s]:/", hostname); + else + snprintf(root_devname, len, "%s:/", hostname); root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); kfree(root_devname); return root_mnt; @@ -2855,7 +2859,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type, data = nfs_alloc_parsed_mount_data(4); if (data == NULL) - goto out_free_data; + goto out; /* Validate the mount data */ error = nfs4_validate_mount_data(raw_data, data, dev_name); @@ -2869,12 +2873,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type, error = PTR_ERR(res); out: - kfree(data->client_address); - kfree(data->nfs_server.export_path); - kfree(data->nfs_server.hostname); - kfree(data->fscache_uniq); -out_free_data: - kfree(data); + nfs_free_parsed_mount_data(data); dprintk("<-- nfs4_mount() = %d%s\n", error, error != 0 ? " [error]" : ""); return res; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b9566e4..4b470f6 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -88,7 +88,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) struct svc_expkey key; struct svc_expkey *ek = NULL; - if (mesg[mlen-1] != '\n') + if (mlen < 1 || mesg[mlen-1] != '\n') return -EINVAL; mesg[mlen-1] = 0; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 08c6e36..43f46cd 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -803,13 +803,13 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, return p; } -static int +static __be32 compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, const char *name, int namlen) { struct svc_export *exp; struct dentry *dparent, *dchild; - int rv = 0; + __be32 rv = nfserr_noent; dparent = cd->fh.fh_dentry; exp = cd->fh.fh_export; @@ -817,26 +817,20 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, if (isdotent(name, namlen)) { if (namlen == 2) { dchild = dget_parent(dparent); - if (dchild == dparent) { - /* filesystem root - cannot return filehandle for ".." */ - dput(dchild); - return -ENOENT; - } + /* filesystem root - cannot return filehandle for ".." */ + if (dchild == dparent) + goto out; } else dchild = dget(dparent); } else dchild = lookup_one_len(name, dparent, namlen); if (IS_ERR(dchild)) - return -ENOENT; - rv = -ENOENT; + return rv; if (d_mountpoint(dchild)) goto out; - rv = fh_compose(fhp, exp, dchild, &cd->fh); - if (rv) - goto out; if (!dchild->d_inode) goto out; - rv = 0; + rv = fh_compose(fhp, exp, dchild, &cd->fh); out: dput(dchild); return rv; @@ -845,7 +839,7 @@ out: static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) { struct svc_fh fh; - int err; + __be32 err; fh_init(&fh, NFS3_FHSIZE); err = compose_entry_fh(cd, &fh, name, namlen); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0b8830c..d06a02c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -812,6 +812,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr) { __be32 status = nfs_ok; + int err; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); @@ -823,9 +824,9 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } } - status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt); - if (status) - return status; + err = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt); + if (err) + return nfserrno(err); status = nfs_ok; status = check_attr_support(rqstp, cstate, setattr->sa_bmval, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ecd8152..92f7eb7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3956,16 +3956,14 @@ out: * vfs_test_lock. (Arguably perhaps test_lock should be done with an * inode operation.) */ -static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) +static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { struct file *file; - int err; - - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); - if (err) - return err; - err = vfs_test_lock(file, lock); - nfsd_close(file); + __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (!err) { + err = nfserrno(vfs_test_lock(file, lock)); + nfsd_close(file); + } return err; } @@ -3978,7 +3976,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct inode *inode; struct file_lock file_lock; - int error; __be32 status; if (locks_in_grace()) @@ -4030,12 +4027,10 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_transform_lock_offset(&file_lock); - status = nfs_ok; - error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); - if (error) { - status = nfserrno(error); + status = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); + if (status) goto out; - } + if (file_lock.fl_type != F_UNLCK) { status = nfserr_denied; nfs4_set_lock_denied(&file_lock, &lockt->lt_denied); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 41d6743..3e65427 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -842,6 +842,19 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; + case NILFS_IOCTL_CHANGE_CPMODE: + case NILFS_IOCTL_DELETE_CHECKPOINT: + case NILFS_IOCTL_GET_CPINFO: + case NILFS_IOCTL_GET_CPSTAT: + case NILFS_IOCTL_GET_SUINFO: + case NILFS_IOCTL_GET_SUSTAT: + case NILFS_IOCTL_GET_VINFO: + case NILFS_IOCTL_GET_BDESCS: + case NILFS_IOCTL_CLEAN_SEGMENTS: + case NILFS_IOCTL_SYNC: + case NILFS_IOCTL_RESIZE: + case NILFS_IOCTL_SET_ALLOC_RANGE: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d327140..35a8970 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -515,6 +515,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, brelse(sbh[1]); sbh[1] = NULL; sbp[1] = NULL; + valid[1] = 0; swp = 0; } if (!valid[swp]) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 252ab1f..42ed195 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -135,9 +135,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; - /* 1 from caller and 1 for being on i_list/g_list */ - BUG_ON(atomic_read(&mark->refcnt) < 2); - spin_lock(&group->mark_lock); if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { @@ -182,6 +179,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) iput(inode); /* + * We don't necessarily have a ref on mark from caller so the above iput + * may have already destroyed it. Don't touch from now on. + */ + + /* * it's possible that this group tried to destroy itself, but this * this mark was simultaneously being freed by inode. If that's the * case, we finish freeing the group here. diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ed553c6..76c8165 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1134,7 +1134,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle, } el = path_leaf_el(path); - rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1]; + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1]; ocfs2_adjust_rightmost_records(handle, et, path, rec); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ebfd382..15d29cc 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1036,14 +1036,14 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, tmp_el = left_path->p_node[subtree_root].el; blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; - for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) { + for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) { if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); break; } } - BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec)); + BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec)); out: ocfs2_free_path(left_path); @@ -1468,7 +1468,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, trace_ocfs2_divide_leaf_refcount_block( (unsigned long long)ref_leaf_bh->b_blocknr, - le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); + le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used)); /* * XXX: Improvement later. @@ -2411,7 +2411,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, rb = (struct ocfs2_refcount_block *) prev_bh->b_data; - if (le64_to_cpu(rb->rf_records.rl_used) + + if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; @@ -2476,7 +2476,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, if (prev_bh) { rb = (struct ocfs2_refcount_block *)prev_bh->b_data; - if (le64_to_cpu(rb->rf_records.rl_used) + recs_add > + if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; @@ -3629,7 +3629,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode, * one will split a refcount rec, so totally we need * clusters * 2 new refcount rec. */ - if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 > + if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index ba5d97e..f169da4 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -600,7 +600,7 @@ static void ocfs2_bg_alloc_cleanup(handle_t *handle, ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode, cluster_ac->ac_bh, le64_to_cpu(rec->e_blkno), - le32_to_cpu(rec->e_leaf_clusters)); + le16_to_cpu(rec->e_leaf_clusters)); if (ret) mlog_errno(ret); /* Try all the clusters to free */ @@ -1628,7 +1628,7 @@ static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, { unsigned int bpc = le16_to_cpu(cl->cl_bpc); unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc; - unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc; + unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc; if (res->sr_bit_offset < bitoff) return 0; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index a0a041d..811960a 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -550,17 +550,11 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +static int drop_partitions(struct gendisk *disk, struct block_device *bdev) { - struct parsed_partitions *state = NULL; struct disk_part_iter piter; struct hd_struct *part; - int p, highest, res; -rescan: - if (state && !IS_ERR(state)) { - kfree(state); - state = NULL; - } + int res; if (bdev->bd_part_count) return -EBUSY; @@ -573,6 +567,24 @@ rescan: delete_partition(disk, part->partno); disk_part_iter_exit(&piter); + return 0; +} + +int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct parsed_partitions *state = NULL; + struct hd_struct *part; + int p, highest, res; +rescan: + if (state && !IS_ERR(state)) { + kfree(state); + state = NULL; + } + + res = drop_partitions(disk, bdev); + if (res) + return res; + if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); check_disk_size_change(disk, bdev); @@ -676,6 +688,26 @@ rescan: return 0; } +int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) +{ + int res; + + if (!bdev->bd_invalidated) + return 0; + + res = drop_partitions(disk, bdev); + if (res) + return res; + + set_capacity(disk, 0); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + return 0; +} + unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) { struct address_space *mapping = bdev->bd_inode->i_mapping; @@ -345,6 +345,16 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static const struct pipe_buf_operations packet_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + static ssize_t pipe_read(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t pos) @@ -406,6 +416,13 @@ redo: ret += chars; buf->offset += chars; buf->len -= chars; + + /* Was it a packet buffer? Clean up and exit */ + if (buf->flags & PIPE_BUF_FLAG_PACKET) { + total_len = chars; + buf->len = 0; + } + if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); @@ -458,6 +475,11 @@ redo: return ret; } +static inline int is_packetized(struct file *file) +{ + return (file->f_flags & O_DIRECT) != 0; +} + static ssize_t pipe_write(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t ppos) @@ -592,6 +614,11 @@ redo2: buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = chars; + buf->flags = 0; + if (is_packetized(filp)) { + buf->ops = &packet_pipe_buf_ops; + buf->flags = PIPE_BUF_FLAG_PACKET; + } pipe->nrbufs = ++bufs; pipe->tmp_page = NULL; @@ -1012,7 +1039,7 @@ struct file *create_write_pipe(int flags) goto err_dentry; f->f_mapping = inode->i_mapping; - f->f_flags = O_WRONLY | (flags & O_NONBLOCK); + f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); f->f_version = 0; return f; @@ -1056,7 +1083,7 @@ int do_pipe_flags(int *fd, int flags) int error; int fdw, fdr; - if (flags & ~(O_CLOEXEC | O_NONBLOCK)) + if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) return -EINVAL; fw = create_write_pipe(flags); diff --git a/fs/proc/base.c b/fs/proc/base.c index ca54a1b..cd2df5a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -211,8 +211,7 @@ static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, mode) && - !capable(CAP_SYS_RESOURCE)) { + !ptrace_may_access(task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } @@ -782,6 +781,13 @@ static int mem_open(struct inode* inode, struct file* file) if (IS_ERR(mm)) return PTR_ERR(mm); + if (mm) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); + } + /* OK to pass negative loff_t, we can catch out-of-range */ file->f_mode |= FMODE_UNSIGNED_OFFSET; file->private_data = mm; @@ -789,57 +795,13 @@ static int mem_open(struct inode* inode, struct file* file) return 0; } -static ssize_t mem_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) +static ssize_t mem_rw(struct file *file, char __user *buf, + size_t count, loff_t *ppos, int write) { - int ret; - char *page; - unsigned long src = *ppos; struct mm_struct *mm = file->private_data; - - if (!mm) - return 0; - - page = (char *)__get_free_page(GFP_TEMPORARY); - if (!page) - return -ENOMEM; - - ret = 0; - - while (count > 0) { - int this_len, retval; - - this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - retval = access_remote_vm(mm, src, page, this_len, 0); - if (!retval) { - if (!ret) - ret = -EIO; - break; - } - - if (copy_to_user(buf, page, retval)) { - ret = -EFAULT; - break; - } - - ret += retval; - src += retval; - buf += retval; - count -= retval; - } - *ppos = src; - - free_page((unsigned long) page); - return ret; -} - -static ssize_t mem_write(struct file * file, const char __user *buf, - size_t count, loff_t *ppos) -{ - int copied; + unsigned long addr = *ppos; + ssize_t copied; char *page; - unsigned long dst = *ppos; - struct mm_struct *mm = file->private_data; if (!mm) return 0; @@ -849,31 +811,54 @@ static ssize_t mem_write(struct file * file, const char __user *buf, return -ENOMEM; copied = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; + while (count > 0) { - int this_len, retval; + int this_len = min_t(int, count, PAGE_SIZE); - this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - if (copy_from_user(page, buf, this_len)) { + if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; break; } - retval = access_remote_vm(mm, dst, page, this_len, 1); - if (!retval) { + + this_len = access_remote_vm(mm, addr, page, this_len, write); + if (!this_len) { if (!copied) copied = -EIO; break; } - copied += retval; - buf += retval; - dst += retval; - count -= retval; + + if (!write && copy_to_user(buf, page, this_len)) { + copied = -EFAULT; + break; + } + + buf += this_len; + addr += this_len; + copied += this_len; + count -= this_len; } - *ppos = dst; + *ppos = addr; + mmput(mm); +free: free_page((unsigned long) page); return copied; } +static ssize_t mem_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, buf, count, ppos, 0); +} + +static ssize_t mem_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, (char __user*)buf, count, ppos, 1); +} + loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { @@ -893,8 +878,9 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) static int mem_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; + if (mm) + mmdrop(mm); - mmput(mm); return 0; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index be177f7..d6c078e 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -54,7 +54,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, ei->ns_ops = ns_ops; ei->ns = ns; - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, NULL)) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c7d4ee6..55a1f49 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -407,6 +407,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } else { spin_unlock(&walk->mm->page_table_lock); } + + if (pmd_trans_unstable(pmd)) + return 0; /* * The mmap_sem held all the way back in m_start() is what * keeps khugepaged out of here and from collapsing things @@ -505,6 +508,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -516,6 +521,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!page) continue; + if (PageReserved(page)) + continue; + /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); ClearPageReferenced(page); @@ -665,6 +673,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); @@ -956,6 +966,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, spin_unlock(&walk->mm->page_table_lock); } + if (pmd_trans_unstable(pmd)) + return 0; orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, md->vma, addr); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 766b1d4..29166ec 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -11,15 +11,20 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; struct timespec idle; + cputime64_t idletime; + u64 nsec; + u32 rem; int i; - cputime_t idletime = cputime_zero; + idletime = 0; for_each_possible_cpu(i) idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); - cputime_to_timespec(idletime, &idle); + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, (uptime.tv_nsec / (NSEC_PER_SEC / 100)), diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index aa91089..f19dfbf 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -453,16 +453,20 @@ int remove_save_link(struct inode *inode, int truncate) static void reiserfs_kill_sb(struct super_block *s) { if (REISERFS_SB(s)) { - if (REISERFS_SB(s)->xattr_root) { - d_invalidate(REISERFS_SB(s)->xattr_root); - dput(REISERFS_SB(s)->xattr_root); - REISERFS_SB(s)->xattr_root = NULL; - } - if (REISERFS_SB(s)->priv_root) { - d_invalidate(REISERFS_SB(s)->priv_root); - dput(REISERFS_SB(s)->priv_root); - REISERFS_SB(s)->priv_root = NULL; - } + /* + * Force any pending inode evictions to occur now. Any + * inodes to be removed that have extended attributes + * associated with them need to clean them up before + * we can release the extended attribute root dentries. + * shrink_dcache_for_umount will BUG if we don't release + * those before it's called so ->put_super is too late. + */ + shrink_dcache_sb(s); + + dput(REISERFS_SB(s)->xattr_root); + REISERFS_SB(s)->xattr_root = NULL; + dput(REISERFS_SB(s)->priv_root); + REISERFS_SB(s)->priv_root = NULL; } kill_block_super(s); @@ -1164,7 +1168,8 @@ static void handle_quota_files(struct super_block *s, char **qf_names, kfree(REISERFS_SB(s)->s_qf_names[i]); REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; } - REISERFS_SB(s)->s_jquota_fmt = *qfmt; + if (*qfmt) + REISERFS_SB(s)->s_jquota_fmt = *qfmt; } #endif diff --git a/fs/signalfd.c b/fs/signalfd.c index 492465b..7ae2a57 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,21 @@ #include <linux/signalfd.h> #include <linux/syscalls.h> +void signalfd_cleanup(struct sighand_struct *sighand) +{ + wait_queue_head_t *wqh = &sighand->signalfd_wqh; + /* + * The lockless check can race with remove_wait_queue() in progress, + * but in this case its caller should run under rcu_read_lock() and + * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + */ + if (likely(!waitqueue_active(wqh))) + return; + + /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + wake_up_poll(wqh, POLLHUP | POLLFREE); +} + struct signalfd_ctx { sigset_t sigmask; }; diff --git a/fs/splice.c b/fs/splice.c index aa866d3..9d89008 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -31,6 +31,7 @@ #include <linux/uio.h> #include <linux/security.h> #include <linux/gfp.h> +#include <linux/socket.h> /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -691,7 +692,9 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, if (!likely(file->f_op && file->f_op->sendpage)) return -EINVAL; - more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; + more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; + if (sd->len < sd->total_len) + more |= MSG_SENDPAGE_NOTLAST; return file->f_op->sendpage(file, buf->page, buf->offset, sd->len, &pos, more); } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 0a12eb8..a494413 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -136,12 +136,13 @@ static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *sec void *old_secdata; size_t old_secdata_len; - iattrs = sd->s_iattr; - if (!iattrs) - iattrs = sysfs_init_inode_attrs(sd); - if (!iattrs) - return -ENOMEM; + if (!sd->s_iattr) { + sd->s_iattr = sysfs_init_inode_attrs(sd); + if (!sd->s_iattr) + return -ENOMEM; + } + iattrs = sd->s_iattr; old_secdata = iattrs->ia_secdata; old_secdata_len = iattrs->ia_secdata_len; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index a811ac4..fd75b63 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -121,20 +121,21 @@ const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key); /* - * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message - * macros. + * TODO: these macros are now broken because there is no locking around them + * and we use a global buffer for the key string. This means that in case of + * concurrent execution we will end up with incorrect and messy key strings. */ #define DBGKEY(key) dbg_key_str0(c, (key)) #define DBGKEY1(key) dbg_key_str1(c, (key)) -#define ubifs_dbg_msg(type, fmt, ...) do { \ - spin_lock(&dbg_lock); \ - pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \ - spin_unlock(&dbg_lock); \ -} while (0) +#define ubifs_dbg_msg(type, fmt, ...) \ + pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) /* Just a debugging messages not related to any specific UBIFS subsystem */ -#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) +#define dbg_msg(fmt, ...) \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) + /* General messages */ #define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) /* Additional journal messages */ diff --git a/fs/udf/file.c b/fs/udf/file.c index 2a346bb..3438b00 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -125,7 +125,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, err = udf_expand_file_adinicb(inode); if (err) { udf_debug("udf_expand_adinicb: err=%d\n", err); - up_write(&iinfo->i_data_sem); return err; } } else { @@ -133,9 +132,10 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, iinfo->i_lenAlloc = pos + count; else iinfo->i_lenAlloc = inode->i_size; + up_write(&iinfo->i_data_sem); } - } - up_write(&iinfo->i_data_sem); + } else + up_write(&iinfo->i_data_sem); retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); if (retval > 0) @@ -201,12 +201,10 @@ out: static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE) { - mutex_lock(&inode->i_mutex); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d1358e..262050f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -145,6 +145,12 @@ const struct address_space_operations udf_aops = { .bmap = udf_bmap, }; +/* + * Expand file stored in ICB to a normal one-block-file + * + * This function requires i_data_sem for writing and releases it. + * This function requires i_mutex held + */ int udf_expand_file_adinicb(struct inode *inode) { struct page *page; @@ -163,9 +169,15 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); mark_inode_dirty(inode); return 0; } + /* + * Release i_data_sem so that we can lock a page - page lock ranks + * above i_data_sem. i_mutex still protects us against file changes. + */ + up_write(&iinfo->i_data_sem); page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); if (!page) @@ -181,6 +193,7 @@ int udf_expand_file_adinicb(struct inode *inode) SetPageUptodate(page); kunmap(page); } + down_write(&iinfo->i_data_sem); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; @@ -190,17 +203,20 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); err = inode->i_data.a_ops->writepage(page, &udf_wbc); if (err) { /* Restore everything back so that we don't lose data... */ lock_page(page); kaddr = kmap(page); + down_write(&iinfo->i_data_sem); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); kunmap(page); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; + up_write(&iinfo->i_data_sem); } page_cache_release(page); mark_inode_dirty(inode); @@ -1105,10 +1121,9 @@ int udf_setsize(struct inode *inode, loff_t newsize) if (bsize < (udf_file_entry_alloc_offset(inode) + newsize)) { err = udf_expand_file_adinicb(inode); - if (err) { - up_write(&iinfo->i_data_sem); + if (err) return err; - } + down_write(&iinfo->i_data_sem); } else iinfo->i_lenAlloc = newsize; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 7b27b06..7f0e18a 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1830,6 +1830,12 @@ static void udf_close_lvid(struct super_block *sb) le16_to_cpu(lvid->descTag.descCRCLength))); lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + /* + * We set buffer uptodate unconditionally here to avoid spurious + * warnings from mark_buffer_dirty() when previous EIO has marked + * the buffer as !uptodate + */ + set_buffer_uptodate(bh); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 4b9fb91..f86e034 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -39,7 +39,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp) struct posix_acl_entry *acl_e; struct posix_acl *acl; struct xfs_acl_entry *ace; - int count, i; + unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); if (count > XFS_ACL_MAX_ENTRIES) diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c index 244e797..572494f 100644 --- a/fs/xfs/linux-2.6/xfs_discard.c +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -68,7 +68,7 @@ xfs_trim_extents( * Look up the longest btree in the AGF and start with it. */ error = xfs_alloc_lookup_le(cur, 0, - XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); if (error) goto out_del_cursor; @@ -84,7 +84,7 @@ xfs_trim_extents( if (error) goto out_del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); - ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* * Too small? Give up. diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 28de70b..e6ac98c 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -871,27 +871,6 @@ xfs_fs_dirty_inode( } STATIC int -xfs_log_inode( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); -} - -STATIC int xfs_fs_write_inode( struct inode *inode, struct writeback_control *wbc) @@ -904,10 +883,8 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - if (!ip->i_update_core) - return 0; - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { /* * Make sure the inode has made it it into the log. Instead * of forcing it all the way to stable storage using a @@ -916,11 +893,14 @@ xfs_fs_write_inode( * of synchronous log foces dramatically. */ xfs_ioend_wait(ip); - error = xfs_log_inode(ip); + error = xfs_log_dirty_inode(ip, NULL, 0); if (error) goto out; return 0; } else { + if (!ip->i_update_core) + return 0; + /* * We make this non-blocking if the inode is contended, return * EAGAIN to indicate to the caller that they did not succeed. diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index b69688d..2f277a0 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -336,6 +336,32 @@ xfs_sync_fsdata( return xfs_bwrite(mp, bp); } +int +xfs_log_dirty_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (!ip->i_update_core) + return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp, 0); +} + /* * When remounting a filesystem read-only or freezing the filesystem, we have * two phases to execute. This first phase is syncing the data before we @@ -365,6 +391,17 @@ xfs_quiesce_data( /* push and block till complete */ xfs_sync_data(mp, SYNC_WAIT); + + /* + * Log all pending size and timestamp updates. The vfs writeback + * code is supposed to do this, but due to its overagressive + * livelock detection it will skip inodes where appending writes + * were written out in the first non-blocking sync phase if their + * completion took long enough that it happened after taking the + * timestamp for the cut-off in the blocking phase. + */ + xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); + xfs_qm_sync(mp, SYNC_WAIT); /* write superblock and hoover up shutdown errors */ diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index e3a6ad2..ef5b2ce 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -42,6 +42,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); +int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); + int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 3631783..ca752f0 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -356,9 +356,20 @@ xfs_iget_cache_miss( BUG(); } - spin_lock(&pag->pag_ici_lock); + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, XFS_INEW); /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); error = radix_tree_insert(&pag->pag_ici_root, agino, ip); if (unlikely(error)) { WARN_ON(error != -EEXIST); @@ -366,11 +377,6 @@ xfs_iget_cache_miss( error = EAGAIN; goto out_preload_end; } - - /* These values _must_ be set before releasing the radix tree lock! */ - ip->i_udquot = ip->i_gdquot = NULL; - xfs_iflags_set(ip, XFS_INEW); - spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 04142ca..b75fd67 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3159,37 +3159,26 @@ xlog_recover_process_iunlinks( */ continue; } + /* + * Unlock the buffer so that it can be acquired in the normal + * course of the transaction to truncate and free each inode. + * Because we are not racing with anyone else here for the AGI + * buffer, we don't even need to hold it locked to read the + * initial unlinked bucket entries out of the buffer. We keep + * buffer reference though, so that it stays pinned in memory + * while we need the buffer. + */ agi = XFS_BUF_TO_AGI(agibp); + xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - /* - * Release the agi buffer so that it can - * be acquired in the normal course of the - * transaction to truncate and free the inode. - */ - xfs_buf_relse(agibp); - agino = xlog_recover_process_one_iunlink(mp, agno, agino, bucket); - - /* - * Reacquire the agibuffer and continue around - * the loop. This should never fail as we know - * the buffer was good earlier on. - */ - error = xfs_read_agi(mp, NULL, agno, &agibp); - ASSERT(error == 0); - agi = XFS_BUF_TO_AGI(agibp); } } - - /* - * Release the buffer for the current agi so we can - * go on to the next one. - */ - xfs_buf_relse(agibp); + xfs_buf_rele(agibp); } mp->m_dmevmask = mp_dmevmask; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 6cc4d41..59509ae 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -554,7 +554,8 @@ xfs_readlink( __func__, (unsigned long long) ip->i_ino, (long long) pathlen); ASSERT(0); - return XFS_ERROR(EFSCORRUPTED); + error = XFS_ERROR(EFSCORRUPTED); + goto out; } |