Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c        | 300
-rw-r--r--  kernel/exit.c               |   1
-rw-r--r--  kernel/fork.c               |   7
-rw-r--r--  kernel/futex.c              | 213
-rw-r--r--  kernel/power/earlysuspend.c |  13
-rw-r--r--  kernel/sys.c                | 146
-rw-r--r--  kernel/sysctl.c             |  14
7 files changed, 478 insertions(+), 216 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c index acdc087..466213f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -604,6 +604,77 @@ static void put_ctx(struct perf_event_context *ctx) } } +/* + * Because of perf_event::ctx migration in sys_perf_event_open::move_group and + * perf_pmu_migrate_context() we need some magic. + * + * Those places that change perf_event::ctx will hold both + * perf_event_ctx::mutex of the 'old' and 'new' ctx value. + * + * Lock ordering is by mutex address. There is one other site where + * perf_event_context::mutex nests and that is put_event(). But remember that + * that is a parent<->child context relation, and migration does not affect + * children, therefore these two orderings should not interact. + * + * The change in perf_event::ctx does not affect children (as claimed above) + * because the sys_perf_event_open() case will install a new event and break + * the ctx parent<->child relation, and perf_pmu_migrate_context() is only + * concerned with cpuctx and that doesn't have children. + * + * The places that change perf_event::ctx will issue: + * + * perf_remove_from_context(); + * synchronize_rcu(); + * perf_install_in_context(); + * + * to affect the change. The remove_from_context() + synchronize_rcu() should + * quiesce the event, after which we can install it in the new location. This + * means that only external vectors (perf_fops, prctl) can perturb the event + * while in transit. Therefore all such accessors should also acquire + * perf_event_context::mutex to serialize against this. + * + * However; because event->ctx can change while we're waiting to acquire + * ctx->mutex we must be careful and use the below perf_event_ctx_lock() + * function. + * + * Lock order: + * task_struct::perf_event_mutex + * perf_event_context::mutex + * perf_event_context::lock + * perf_event::child_mutex; + * perf_event::mmap_mutex + * mmap_sem + */ +static struct perf_event_context *perf_event_ctx_lock(struct perf_event *event) +{ + struct perf_event_context *ctx; + +again: + rcu_read_lock(); + ctx = ACCESS_ONCE(event->ctx); + if (!atomic_inc_not_zero(&ctx->refcount)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + mutex_lock(&ctx->mutex); + if (event->ctx != ctx) { + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + + return ctx; +} + +static void perf_event_ctx_unlock(struct perf_event *event, + struct perf_event_context *ctx) +{ + mutex_unlock(&ctx->mutex); + put_ctx(ctx); +} + static void unclone_ctx(struct perf_event_context *ctx) { if (ctx->parent_ctx) { @@ -1245,7 +1316,7 @@ static int __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ -void perf_event_disable(struct perf_event *event) +static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -1287,6 +1358,19 @@ retry: raw_spin_unlock_irq(&ctx->lock); } +/* + * Strictly speaking kernel users cannot create groups and therefore this + * interface does not need the perf_event_ctx_lock() magic. 
+ */ +void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + _perf_event_disable(event); + perf_event_ctx_unlock(event, ctx); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -1580,6 +1664,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); event->ctx = ctx; + if (event->cpu != -1) + event->cpu = cpu; if (!task) { /* @@ -1720,7 +1806,7 @@ unlock: * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ -void perf_event_enable(struct perf_event *event) +static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -1777,7 +1863,19 @@ out: raw_spin_unlock_irq(&ctx->lock); } -static int perf_event_refresh(struct perf_event *event, int refresh) +/* + * See perf_event_disable(); + */ +void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + _perf_event_enable(event); + perf_event_ctx_unlock(event, ctx); +} + +static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events @@ -1786,11 +1884,26 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return -EINVAL; atomic_add(refresh, &event->event_limit); - perf_event_enable(event); + _perf_event_enable(event); return 0; } +/* + * See perf_event_disable() + */ +int perf_event_refresh(struct perf_event *event, int refresh) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_refresh(event, refresh); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) @@ -3034,7 +3147,16 @@ static void put_event(struct perf_event *event) rcu_read_unlock(); if (owner) { - mutex_lock(&owner->perf_event_mutex); + /* + * If we're here through perf_event_exit_task() we're already + * holding ctx->mutex which would be an inversion wrt. the + * normal lock order. + * + * However we can safely take this lock because its the child + * ctx->mutex. 
+ */ + mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); + /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex @@ -3086,12 +3208,13 @@ static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, ret = -EFAULT; struct perf_event_context *ctx = leader->ctx; - u64 values[5]; + int n = 0, size = 0, ret; u64 count, enabled, running; + u64 values[5]; + + lockdep_assert_held(&ctx->mutex); - mutex_lock(&ctx->mutex); count = perf_event_read_value(leader, &enabled, &running); values[n++] = 1 + leader->nr_siblings; @@ -3106,7 +3229,7 @@ static int perf_event_read_group(struct perf_event *event, size = n * sizeof(u64); if (copy_to_user(buf, values, size)) - goto unlock; + return -EFAULT; ret = size; @@ -3120,14 +3243,12 @@ static int perf_event_read_group(struct perf_event *event, size = n * sizeof(u64); if (copy_to_user(buf + ret, values, size)) { - ret = -EFAULT; - goto unlock; + return -EFAULT; } ret += size; } -unlock: - mutex_unlock(&ctx->mutex); + return ret; } @@ -3186,8 +3307,14 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = perf_read_hw(event, buf, count); + perf_event_ctx_unlock(event, ctx); - return perf_read_hw(event, buf, count); + return ret; } static unsigned int perf_poll(struct file *file, poll_table *wait) @@ -3207,7 +3334,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } -static void perf_event_reset(struct perf_event *event) +static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); local64_set(&event->count, 0); @@ -3306,25 +3433,24 @@ static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { - struct perf_event *event = file->private_data; void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { case PERF_EVENT_IOC_ENABLE: - func = perf_event_enable; + func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: - func = perf_event_disable; + func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: - func = perf_event_reset; + func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: - return perf_event_refresh(event, arg); + return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: return perf_event_period(event, (u64 __user *)arg); @@ -3365,13 +3491,30 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + struct perf_event_context *ctx; + long ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_ioctl(event, cmd, arg); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + int perf_event_task_enable(void) { + struct perf_event_context *ctx; struct perf_event *event; mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_enable); + 
list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { + ctx = perf_event_ctx_lock(event); + perf_event_for_each_child(event, _perf_event_enable); + perf_event_ctx_unlock(event, ctx); + } mutex_unlock(¤t->perf_event_mutex); return 0; @@ -3379,11 +3522,15 @@ int perf_event_task_enable(void) int perf_event_task_disable(void) { + struct perf_event_context *ctx; struct perf_event *event; mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_disable); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { + ctx = perf_event_ctx_lock(event); + perf_event_for_each_child(event, _perf_event_disable); + perf_event_ctx_unlock(event, ctx); + } mutex_unlock(¤t->perf_event_mutex); return 0; @@ -5416,7 +5563,6 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) int err = 0; mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { struct swevent_hlist *hlist; @@ -6216,6 +6362,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!group_leader) group_leader = event; + mutex_init(&event->group_leader_mutex); mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); @@ -6444,6 +6591,15 @@ out: return ret; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -6459,7 +6615,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx; + struct perf_event_context *ctx, *uninitialized_var(gctx); struct file *event_file = NULL; struct file *group_file = NULL; struct task_struct *task = NULL; @@ -6513,6 +6669,16 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } + /* + * Take the group_leader's group_leader_mutex before observing + * anything in the group leader that leads to changes in ctx, + * many of which may be changing on another thread. + * In particular, we want to take this lock before deciding + * whether we need to move_group. + */ + if (group_leader) + mutex_lock(&group_leader->group_leader_mutex); + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { @@ -6572,7 +6738,7 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, cpu); + ctx = find_get_context(pmu, task, event->cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; @@ -6627,9 +6793,14 @@ SYSCALL_DEFINE5(perf_event_open, } if (move_group) { - struct perf_event_context *gctx = group_leader->ctx; + gctx = group_leader->ctx; + + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&gctx->mutex, &ctx->mutex); - mutex_lock(&gctx->mutex); perf_remove_from_context(group_leader); /* @@ -6644,28 +6815,41 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(sibling); put_ctx(gctx); } - mutex_unlock(&gctx->mutex); - put_ctx(gctx); + } else { + mutex_lock(&ctx->mutex); } WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); if (move_group) { - perf_install_in_context(ctx, group_leader, cpu); + /* + * Wait for everybody to stop referencing the events through + * the old lists, before installing it on new lists. 
+ */ + synchronize_rcu(); + + perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, cpu); + perf_install_in_context(ctx, sibling, event->cpu); get_ctx(ctx); } } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); ++ctx->generation; perf_unpin_context(ctx); + + if (move_group) { + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + } mutex_unlock(&ctx->mutex); + if (group_leader) + mutex_unlock(&group_leader->group_leader_mutex); + event->owner = current; mutex_lock(¤t->perf_event_mutex); @@ -6697,6 +6881,8 @@ err_task: if (task) put_task_struct(task); err_group_fd: + if (group_leader) + mutex_unlock(&group_leader->group_leader_mutex); fput_light(group_file, fput_needed); err_fd: put_unused_fd(event_fd); @@ -6751,6 +6937,42 @@ err: } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx; + struct perf_event_context *dst_ctx; + struct perf_event *event, *tmp; + LIST_HEAD(events); + + src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); + list_for_each_entry_safe(event, tmp, &src_ctx->event_list, + event_entry) { + perf_remove_from_context(event); + put_ctx(src_ctx); + list_add(&event->event_entry, &events); + } + + synchronize_rcu(); + + list_for_each_entry_safe(event, tmp, &events, event_entry) { + list_del(&event->event_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + mutex_unlock(&dst_ctx->mutex); + mutex_unlock(&src_ctx->mutex); +} +EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); + static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { @@ -7309,12 +7531,6 @@ static void perf_event_exit_cpu_context(int cpu) static void perf_event_exit_cpu(int cpu) { - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - mutex_lock(&swhash->hlist_mutex); - swevent_hlist_release(swhash); - mutex_unlock(&swhash->hlist_mutex); - perf_event_exit_cpu_context(cpu); } #else diff --git a/kernel/exit.c b/kernel/exit.c index 97dd317..6b8a7af 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -68,6 +68,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); + delete_from_adj_tree(p); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } diff --git a/kernel/fork.c b/kernel/fork.c index 158ca4f..ba7e2bc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -317,6 +317,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) account_kernel_stack(ti, 1); + RB_CLEAR_NODE(&tsk->adj_node); + return tsk; out: @@ -1009,6 +1011,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; +#ifdef CONFIG_ANDROID_LMK_ADJ_RBTREE + RB_CLEAR_NODE(&sig->adj_node); +#endif + mutex_init(&sig->cred_guard_mutex); return 0; @@ -1375,6 +1381,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, attach_pid(p, PIDTYPE_SID, 
task_session(current)); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); + add_2_adj_tree(p); __this_cpu_inc(process_counts); } attach_pid(p, PIDTYPE_PID, pid); diff --git a/kernel/futex.c b/kernel/futex.c index 7517c78..c7c19cb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -588,55 +588,6 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); } -/* - * We need to check the following states: - * - * Waiter | pi_state | pi->owner | uTID | uODIED | ? - * - * [1] NULL | --- | --- | 0 | 0/1 | Valid - * [2] NULL | --- | --- | >0 | 0/1 | Valid - * - * [3] Found | NULL | -- | Any | 0/1 | Invalid - * - * [4] Found | Found | NULL | 0 | 1 | Valid - * [5] Found | Found | NULL | >0 | 1 | Invalid - * - * [6] Found | Found | task | 0 | 1 | Valid - * - * [7] Found | Found | NULL | Any | 0 | Invalid - * - * [8] Found | Found | task | ==taskTID | 0/1 | Valid - * [9] Found | Found | task | 0 | 0 | Invalid - * [10] Found | Found | task | !=taskTID | 0/1 | Invalid - * - * [1] Indicates that the kernel can acquire the futex atomically. We - * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. - * - * [2] Valid, if TID does not belong to a kernel thread. If no matching - * thread is found then it indicates that the owner TID has died. - * - * [3] Invalid. The waiter is queued on a non PI futex - * - * [4] Valid state after exit_robust_list(), which sets the user space - * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. - * - * [5] The user space value got manipulated between exit_robust_list() - * and exit_pi_state_list() - * - * [6] Valid state after exit_pi_state_list() which sets the new owner in - * the pi_state but cannot access the user space value. - * - * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. - * - * [8] Owner and user space value match - * - * [9] There is no transient state which sets the user space TID to 0 - * except exit_robust_list(), but this is indicated by the - * FUTEX_OWNER_DIED bit. See [4] - * - * [10] There is no transient state which leaves owner and user space - * TID out of sync. - */ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps) @@ -652,13 +603,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, plist_for_each_entry_safe(this, next, head, list) { if (match_futex(&this->key, key)) { /* - * Sanity check the waiter before increasing - * the refcount and attaching to it. + * Another waiter already exists - bump up + * the refcount and return its pi_state: */ pi_state = this->pi_state; /* - * Userspace might have messed up non-PI and - * PI futexes [3] + * Userspace might have messed up non-PI and PI futexes */ if (unlikely(!pi_state)) return -EINVAL; @@ -666,70 +616,34 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, WARN_ON(!atomic_read(&pi_state->refcount)); /* - * Handle the owner died case: + * When pi_state->owner is NULL then the owner died + * and another waiter is on the fly. pi_state->owner + * is fixed up by the task which acquires + * pi_state->rt_mutex. + * + * We do not check for pid == 0 which can happen when + * the owner died and robust_list_exit() cleared the + * TID. */ - if (uval & FUTEX_OWNER_DIED) { + if (pid && pi_state->owner) { /* - * exit_pi_state_list sets owner to NULL and - * wakes the topmost waiter. The task which - * acquires the pi_state->rt_mutex will fixup - * owner. + * Bail out if user space manipulated the + * futex value. 
*/ - if (!pi_state->owner) { - /* - * No pi state owner, but the user - * space TID is not 0. Inconsistent - * state. [5] - */ - if (pid) - return -EINVAL; - /* - * Take a ref on the state and - * return. [4] - */ - goto out_state; - } - - /* - * If TID is 0, then either the dying owner - * has not yet executed exit_pi_state_list() - * or some waiter acquired the rtmutex in the - * pi state, but did not yet fixup the TID in - * user space. - * - * Take a ref on the state and return. [6] - */ - if (!pid) - goto out_state; - } else { - /* - * If the owner died bit is not set, - * then the pi_state must have an - * owner. [7] - */ - if (!pi_state->owner) + if (pid != task_pid_vnr(pi_state->owner)) return -EINVAL; } - /* - * Bail out if user space manipulated the - * futex value. If pi state exists then the - * owner TID must be the same as the user - * space TID. [9/10] - */ - if (pid != task_pid_vnr(pi_state->owner)) - return -EINVAL; - - out_state: atomic_inc(&pi_state->refcount); *ps = pi_state; + return 0; } } /* * We are the first waiter - try to look up the real owner and attach - * the new pi_state to it, but bail out when TID = 0 [1] + * the new pi_state to it, but bail out when TID = 0 */ if (!pid) return -ESRCH; @@ -737,11 +651,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!p) return -ESRCH; - if (!p->mm) { - put_task_struct(p); - return -EPERM; - } - /* * We need to look at the task state flags to figure out, * whether the task is exiting. To protect against the do_exit @@ -762,9 +671,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return ret; } - /* - * No existing pi state. First waiter. [2] - */ pi_state = alloc_pi_state(); /* @@ -836,18 +742,10 @@ retry: return -EDEADLK; /* - * Surprise - we got the lock, but we do not trust user space at all. + * Surprise - we got the lock. Just return to userspace: */ - if (unlikely(!curval)) { - /* - * We verify whether there is kernel state for this - * futex. If not, we can safely assume, that the 0 -> - * TID transition is correct. If state exists, we do - * not bother to fixup the user space state as it was - * corrupted already. - */ - return futex_top_waiter(hb, key) ? -EINVAL : 1; - } + if (unlikely(!curval)) + return 1; uval = curval; @@ -977,7 +875,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 curval, newval; - int ret = 0; if (!pi_state) return -EINVAL; @@ -1001,19 +898,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) new_owner = this->task; /* - * We pass it to the next owner. The WAITERS bit is always - * kept enabled while there is PI state around. We cleanup the - * owner died bit, because we are the owner. + * We pass it to the next owner. (The WAITERS bit is always + * kept enabled while there is PI state around. We must also + * preserve the owner died bit.) 
*/ - newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + if (!(uval & FUTEX_OWNER_DIED)) { + int ret = 0; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; - if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - return ret; + newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) + ret = -EFAULT; + else if (curval != uval) + ret = -EINVAL; + if (ret) { + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + return ret; + } } raw_spin_lock_irq(&pi_state->owner->pi_lock); @@ -1292,7 +1193,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * * Returns: * 0 - failed to acquire the lock atomicly - * >0 - acquired the lock, return value is vpid of the top_waiter + * 1 - acquired the lock * <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1303,7 +1204,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, { struct futex_q *top_waiter = NULL; u32 curval; - int ret, vpid; + int ret; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; @@ -1331,14 +1232,11 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * the contended case or if set_waiters is 1. The pi_state is returned * in ps in contended cases. */ - vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); - if (ret == 1) { + if (ret == 1) requeue_pi_wake_futex(top_waiter, key2, hb2); - return vpid; - } return ret; } @@ -1370,6 +1268,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; + u32 curval2; if (requeue_pi) { /* @@ -1471,25 +1370,16 @@ retry_private: * At this point the top_waiter has either taken uaddr2 or is * waiting on it. If the former, then the pi_state will not * exist yet, look it up one more time to ensure we have a - * reference to it. If the lock was taken, ret contains the - * vpid of the top waiter task. + * reference to it. */ - if (ret > 0) { + if (ret == 1) { WARN_ON(pi_state); drop_count++; task_count++; - /* - * If we acquired the lock, then the user - * space value of uaddr2 should be vpid. It - * cannot be changed by the top waiter as it - * is blocked on hb2 lock if it tries to do - * so. If something fiddled with it behind our - * back the pi state lookup might unearth - * it. So we rather use the known value than - * rereading and handing potential crap to - * lookup_pi_state. - */ - ret = lookup_pi_state(ret, hb2, &key2, &pi_state); + ret = get_futex_value_locked(&curval2, uaddr2); + if (!ret) + ret = lookup_pi_state(curval2, hb2, &key2, + &pi_state); } switch (ret) { @@ -2259,10 +2149,9 @@ retry: /* * To avoid races, try to do the TID -> 0 atomic transition * again. If it succeeds then we can return without waking - * anyone else up. We only try this if neither the waiters nor - * the owner died bit are set. 
+ * anyone else up: */ - if (!(uval & ~FUTEX_TID_MASK) && + if (!(uval & FUTEX_OWNER_DIED) && cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* @@ -2294,9 +2183,11 @@ retry: /* * No waiters - kernel unlocks the futex: */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; + if (!(uval & FUTEX_OWNER_DIED)) { + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + } out_unlock: spin_unlock(&hb->lock); diff --git a/kernel/power/earlysuspend.c b/kernel/power/earlysuspend.c index e6303fd..f0fb629 100644 --- a/kernel/power/earlysuspend.c +++ b/kernel/power/earlysuspend.c @@ -20,9 +20,6 @@ #include <linux/syscalls.h> /* sys_sync */ #include <linux/wakelock.h> #include <linux/workqueue.h> -#ifdef CONFIG_ZRAM_FOR_ANDROID -#include <asm/atomic.h> -#endif /* CONFIG_ZRAM_FOR_ANDROID */ #include "power.h" @@ -32,10 +29,6 @@ enum { DEBUG_VERBOSE = 1U << 3, }; static int debug_mask = DEBUG_USER_STATE; -#ifdef CONFIG_ZRAM_FOR_ANDROID -atomic_t optimize_comp_on = ATOMIC_INIT(0); -EXPORT_SYMBOL(optimize_comp_on); -#endif /* CONFIG_ZRAM_FOR_ANDROID */ module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP); @@ -102,9 +95,6 @@ static void early_suspend(struct work_struct *work) mutex_lock(&early_suspend_lock); spin_lock_irqsave(&state_lock, irqflags); -#ifdef CONFIG_ZRAM_FOR_ANDROID - atomic_set(&optimize_comp_on, 1); -#endif /* CONFIG_ZRAM_FOR_ANDROID */ if (state == SUSPEND_REQUESTED) state |= SUSPENDED; else @@ -156,9 +146,6 @@ static void late_resume(struct work_struct *work) mutex_lock(&early_suspend_lock); spin_lock_irqsave(&state_lock, irqflags); -#ifdef CONFIG_ZRAM_FOR_ANDROID - atomic_set(&optimize_comp_on, 0); -#endif /* CONFIG_ZRAM_FOR_ANDROID */ if (state == SUSPENDED) state &= ~SUSPENDED; else diff --git a/kernel/sys.c b/kernel/sys.c index be53817..79b66c0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -40,6 +40,9 @@ #include <linux/syscore_ops.h> #include <linux/version.h> #include <linux/ctype.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/mempolicy.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -1660,6 +1663,146 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } + +static int prctl_update_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + const char __user *name_addr) +{ + struct mm_struct * mm = vma->vm_mm; + int error = 0; + pgoff_t pgoff; + + if (name_addr == vma_get_anon_name(vma)) { + *prev = vma; + goto out; + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma), + name_addr); + if (*prev) { + vma = *prev; + goto success; + } + + *prev = vma; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto out; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto out; + } + +success: + if (!vma->vm_file) + vma->shared.anon_name = name_addr; + +out: + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +static int prctl_set_vma_anon_name(unsigned long start, unsigned long end, + unsigned long arg) +{ + unsigned long tmp; + struct vm_area_struct * vma, *prev; + int unmapped_error = 0; + int error = -EINVAL; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - this matches the handling in madvise. 
+ */ + vma = find_vma_prev(current->mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + return error; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + return error; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = prctl_update_vma_anon_name(vma, &prev, start, end, + (const char __user *)arg); + if (error) + return error; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + error = unmapped_error; + if (start >= end) + return error; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_sem */ + vma = find_vma(current->mm, start); + } +} + +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long len_in, unsigned long arg) +{ + struct mm_struct *mm = current->mm; + int error; + unsigned long len; + unsigned long end; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + down_write(&mm->mmap_sem); + + switch (opt) { + case PR_SET_VMA_ANON_NAME: + error = prctl_set_vma_anon_name(start, end, arg); + break; + default: + error = -EINVAL; + } + + up_write(&mm->mmap_sem); + + return error; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -1808,6 +1951,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else error = PR_MCE_KILL_DEFAULT; break; + case PR_SET_VMA: + error = prctl_set_vma(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5b6afb2..3cd04f1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1515,6 +1515,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; |
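
For reference, a minimal userspace sketch (not part of this patch) of the PR_SET_VMA prctl interface that kernel/sys.c gains above; the call maps to prctl_set_vma(arg2, arg3, arg4, arg5) in the diff. The PR_SET_VMA and PR_SET_VMA_ANON_NAME constants are assumptions based on the Android prctl.h definitions and should be taken from this tree's headers.

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41	/* assumed value; take from this tree's prctl.h */
#define PR_SET_VMA_ANON_NAME	0
#endif

int main(void)
{
	/*
	 * prctl_update_vma_anon_name() stores the user pointer itself,
	 * so the string must stay valid for the mapping's lifetime.
	 */
	static const char name[] = "demo buffer";
	size_t len = 16 * 4096;
	void *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Page-aligned start and length, as required by prctl_set_vma() above. */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, (unsigned long)name))
		perror("prctl(PR_SET_VMA)");

	munmap(p, len);
	return 0;
}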