diff options
Diffstat (limited to 'kernel')
106 files changed, 10283 insertions, 9495 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index b833bd5..b8d4cd8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o -obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o @@ -87,17 +86,15 @@ obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o -obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is @@ -117,7 +114,7 @@ $(obj)/config_data.gz: .config FORCE $(call if_changed,gzip) quiet_cmd_ikconfiggz = IKCFG $@ - cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ + cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call if_changed,ikconfiggz) diff --git a/kernel/audit.c b/kernel/audit.c index defc2e6..5feed23 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_SIGNAL_INFO: - err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); - if (err) - return err; + len = 0; + if (audit_sig_sid) { + err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); + if (err) + return err; + } sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); if (!sig_data) { - security_release_secctx(ctx, len); + if (audit_sig_sid) + security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = audit_sig_uid; sig_data->pid = audit_sig_pid; - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); + if (audit_sig_sid) { + memcpy(sig_data->ctx, ctx, len); + security_release_secctx(ctx, len); + } audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0e96dbc..cc7e879 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -45,8 +45,8 @@ struct audit_watch { atomic_t count; /* reference count */ - char *path; /* insertion path */ dev_t dev; /* associated superblock device */ + char *path; /* insertion path */ unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 68d3c6a..267e484 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -168,12 +168,12 @@ struct audit_context { int in_syscall; /* 1 if task is in a syscall */ enum audit_state state, current_state; unsigned int serial; /* serial number for record */ - struct timespec ctime; /* time of syscall entry */ int major; /* syscall number */ + struct timespec ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ - int return_valid; /* return code is valid */ long return_code;/* syscall return code */ u64 prio; + int return_valid; /* return code is valid */ int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ @@ -198,8 +198,8 @@ struct audit_context { char target_comm[TASK_COMM_LEN]; struct audit_tree_refs *trees, *first_trees; - int tree_count; struct list_head killed_trees; + int tree_count; int type; union { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c7ece8f..7ccba4b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -23,6 +23,7 @@ */ #include <linux/cgroup.h> +#include <linux/ctype.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -48,6 +49,8 @@ #include <linux/namei.h> #include <linux/smp_lock.h> #include <linux/pid_namespace.h> +#include <linux/idr.h> +#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <asm/atomic.h> @@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = { #include <linux/cgroup_subsys.h> }; +#define MAX_CGROUP_ROOT_NAMELEN 64 + /* * A cgroupfs_root represents the root of a cgroup hierarchy, * and may be associated with a superblock to form an active @@ -74,6 +79,9 @@ struct cgroupfs_root { */ unsigned long subsys_bits; + /* Unique id for this hierarchy. */ + int hierarchy_id; + /* The bitmask of subsystems currently attached to this hierarchy */ unsigned long actual_subsys_bits; @@ -94,6 +102,9 @@ struct cgroupfs_root { /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* @@ -141,6 +152,10 @@ struct css_id { static LIST_HEAD(roots); static int root_count; +static DEFINE_IDA(hierarchy_ida); +static int next_hierarchy_id; +static DEFINE_SPINLOCK(hierarchy_id_lock); + /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -201,6 +216,7 @@ struct cg_cgroup_link { * cgroup, anchored on cgroup->css_sets */ struct list_head cgrp_link_list; + struct cgroup *cgrp; /* * List running through cg_cgroup_links pointing at a * single css_set object, anchored on css_set->cg_links @@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); static DEFINE_RWLOCK(css_set_lock); static int css_set_count; -/* hash table for cgroup groups. This improves the performance to - * find an existing css_set */ +/* + * hash table for cgroup groups. This improves the performance to find + * an existing css_set. This hash doesn't (currently) take into + * account cgroups in empty hierarchies. + */ #define CSS_SET_HASH_BITS 7 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; @@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } +static void free_css_set_rcu(struct rcu_head *obj) +{ + struct css_set *cg = container_of(obj, struct css_set, rcu_head); + kfree(cg); +} + /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups * compiled into their kernel but not actually in use */ static int use_task_css_set_links __read_mostly; -/* When we create or destroy a css_set, the operation simply - * takes/releases a reference count on all the cgroups referenced - * by subsystems in this css_set. This can end up multiple-counting - * some cgroups, but that's OK - the ref-count is just a - * busy/not-busy indicator; ensuring that we only count each cgroup - * once would require taking a global lock to ensure that no - * subsystems moved between hierarchies while we were doing so. - * - * Possible TODO: decide at boot time based on the number of - * registered subsystems and the number of CPUs or NUMA nodes whether - * it's better for performance to ref-count every subsystem, or to - * take a global lock and only add one ref count to each hierarchy. - */ - -/* - * unlink a css_set from the list and free it - */ -static void unlink_css_set(struct css_set *cg) +static void __put_css_set(struct css_set *cg, int taskexit) { struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - - hlist_del(&cg->hlist); - css_set_count--; - - list_for_each_entry_safe(link, saved_link, &cg->cg_links, - cg_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - kfree(link); - } -} - -static void __put_css_set(struct css_set *cg, int taskexit) -{ - int i; /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit) write_unlock(&css_set_lock); return; } - unlink_css_set(cg); - write_unlock(&css_set_lock); - rcu_read_lock(); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); + /* This css_set is dead. unlink it and release cgroup refcounts */ + hlist_del(&cg->hlist); + css_set_count--; + + list_for_each_entry_safe(link, saved_link, &cg->cg_links, + cg_link_list) { + struct cgroup *cgrp = link->cgrp; + list_del(&link->cg_link_list); + list_del(&link->cgrp_link_list); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + + kfree(link); } - rcu_read_unlock(); - kfree(cg); + + write_unlock(&css_set_lock); + call_rcu(&cg->rcu_head, free_css_set_rcu); } /* @@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg) } /* + * compare_css_sets - helper function for find_existing_css_set(). + * @cg: candidate css_set being tested + * @old_cg: existing css_set for a task + * @new_cgrp: cgroup that's being entered by the task + * @template: desired set of css pointers in css_set (pre-calculated) + * + * Returns true if "cg" matches "old_cg" except for the hierarchy + * which "new_cgrp" belongs to, for which it should match "new_cgrp". + */ +static bool compare_css_sets(struct css_set *cg, + struct css_set *old_cg, + struct cgroup *new_cgrp, + struct cgroup_subsys_state *template[]) +{ + struct list_head *l1, *l2; + + if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { + /* Not all subsystems matched */ + return false; + } + + /* + * Compare cgroup pointers in order to distinguish between + * different cgroups in heirarchies with no subsystems. We + * could get by with just this check alone (and skip the + * memcmp above) but on most setups the memcmp check will + * avoid the need for this more expensive check on almost all + * candidates. + */ + + l1 = &cg->cg_links; + l2 = &old_cg->cg_links; + while (1) { + struct cg_cgroup_link *cgl1, *cgl2; + struct cgroup *cg1, *cg2; + + l1 = l1->next; + l2 = l2->next; + /* See if we reached the end - both lists are equal length. */ + if (l1 == &cg->cg_links) { + BUG_ON(l2 != &old_cg->cg_links); + break; + } else { + BUG_ON(l2 == &old_cg->cg_links); + } + /* Locate the cgroups associated with these links. */ + cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); + cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); + cg1 = cgl1->cgrp; + cg2 = cgl2->cgrp; + /* Hierarchies should be linked in the same order. */ + BUG_ON(cg1->root != cg2->root); + + /* + * If this hierarchy is the hierarchy of the cgroup + * that's changing, then we need to check that this + * css_set points to the new cgroup; if it's any other + * hierarchy, then this css_set should point to the + * same cgroup as the old css_set. + */ + if (cg1->root == new_cgrp->root) { + if (cg1 != new_cgrp) + return false; + } else { + if (cg1 != cg2) + return false; + } + } + return true; +} + +/* * find_existing_css_set() is a helper for * find_css_set(), and checks to see whether an existing * css_set is suitable. @@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set( hhead = css_set_hash(template); hlist_for_each_entry(cg, node, hhead, hlist) { - if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { - /* All subsystems matched */ - return cg; - } + if (!compare_css_sets(cg, oldcg, cgrp, template)) + continue; + + /* This css_set matches what we need */ + return cg; } /* No existing cgroup group matched */ @@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links, link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, cgrp_link_list); link->cg = cg; + link->cgrp = cgrp; + atomic_inc(&cgrp->count); list_move(&link->cgrp_link_list, &cgrp->css_sets); - list_add(&link->cg_link_list, &cg->cg_links); + /* + * Always add links to the tail of the list so that the list + * is sorted by order of hierarchy creation + */ + list_add_tail(&link->cg_link_list, &cg->cg_links); } /* @@ -451,11 +530,11 @@ static struct css_set *find_css_set( { struct css_set *res; struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - int i; struct list_head tmp_cg_links; struct hlist_head *hhead; + struct cg_cgroup_link *link; /* First see if we already have a cgroup group that matches * the desired set */ @@ -489,20 +568,12 @@ static struct css_set *find_css_set( write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = res->subsys[i]->cgroup; - struct cgroup_subsys *ss = subsys[i]; - atomic_inc(&cgrp->count); - /* - * We want to add a link once per cgroup, so we - * only do it for the first subsystem in each - * hierarchy - */ - if (ss->root->subsys_list.next == &ss->sibling) - link_css_set(&tmp_cg_links, res, cgrp); + list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == cgrp->root) + c = cgrp; + link_css_set(&tmp_cg_links, res, c); } - if (list_empty(&rootnode.subsys_list)) - link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -518,6 +589,41 @@ static struct css_set *find_css_set( } /* + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroupfs_root *root) +{ + struct css_set *css; + struct cgroup *res = NULL; + + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + read_lock(&css_set_lock); + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + css = task->cgroups; + if (css == &init_css_set) { + res = &root->top_cgroup; + } else { + struct cg_cgroup_link *link; + list_for_each_entry(link, &css->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == root) { + res = c; + break; + } + } + } + read_unlock(&css_set_lock); + BUG_ON(!res); + return res; +} + +/* * There is one global cgroup mutex. We also require taking * task_lock() when dereferencing a task's cgroup subsys pointers. * See "The task_lock() exception", at the end of this comment. @@ -596,7 +702,7 @@ void cgroup_unlock(void) static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); -static struct inode_operations cgroup_dir_inode_operations; +static const struct inode_operations cgroup_dir_inode_operations; static struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { @@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ deactivate_super(cgrp->root->sb); + /* + * if we're getting rid of the cgroup, refcount should ensure + * that there are no pidlists left. + */ + BUG_ON(!list_empty(&cgrp->pidlists)); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); @@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + if (strlen(root->name)) + seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); return 0; } @@ -849,6 +963,12 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; + char *name; + /* User explicitly requested empty subsystem */ + bool none; + + struct cgroupfs_root *new_root; + }; /* Convert a hierarchy specifier into a bitmask of subsystems and @@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data, mask = ~(1UL << cpuset_subsys_id); #endif - opts->subsys_bits = 0; - opts->flags = 0; - opts->release_agent = NULL; + memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { if (!*token) @@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data, if (!ss->disabled) opts->subsys_bits |= 1ul << i; } + } else if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; - opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); + opts->release_agent = + kstrndup(token + 14, PATH_MAX, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - strncpy(opts->release_agent, token + 14, PATH_MAX - 1); - opts->release_agent[PATH_MAX - 1] = 0; + } else if (!strncmp(token, "name=", 5)) { + int i; + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; } else { struct cgroup_subsys *ss; int i; @@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data, } } + /* Consistency checks */ + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just @@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data, (opts->subsys_bits & mask)) return -EINVAL; - /* We can't have an empty hierarchy */ - if (!opts->subsys_bits) + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_bits && opts->none) + return -EINVAL; + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_bits && !opts->name) return -EINVAL; return 0; @@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* Don't allow name to change at remount */ + if (opts.name && strcmp(opts.name, root->name)) { + ret = -EINVAL; + goto out_unlock; + } + ret = rebind_subsystems(root, opts.subsys_bits); if (ret) goto out_unlock; @@ -955,13 +1114,14 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); out_unlock: kfree(opts.release_agent); + kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); unlock_kernel(); return ret; } -static struct super_operations cgroup_ops = { +static const struct super_operations cgroup_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = cgroup_show_options, @@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); - INIT_LIST_HEAD(&cgrp->pids_list); - init_rwsem(&cgrp->pids_mutex); + INIT_LIST_HEAD(&cgrp->pidlists); + mutex_init(&cgrp->pidlist_mutex); } + static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root) init_cgroup_housekeeping(cgrp); } +static bool init_root_id(struct cgroupfs_root *root) +{ + int ret = 0; + + do { + if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) + return false; + spin_lock(&hierarchy_id_lock); + /* Try to allocate the next unused ID */ + ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, + &root->hierarchy_id); + if (ret == -ENOSPC) + /* Try again starting from 0 */ + ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); + if (!ret) { + next_hierarchy_id = root->hierarchy_id + 1; + } else if (ret != -EAGAIN) { + /* Can only get here if the 31-bit IDR is full ... */ + BUG_ON(ret); + } + spin_unlock(&hierarchy_id_lock); + } while (ret); + return true; +} + static int cgroup_test_super(struct super_block *sb, void *data) { - struct cgroupfs_root *new = data; + struct cgroup_sb_opts *opts = data; struct cgroupfs_root *root = sb->s_fs_info; - /* First check subsystems */ - if (new->subsys_bits != root->subsys_bits) - return 0; + /* If we asked for a name then it must match */ + if (opts->name && strcmp(opts->name, root->name)) + return 0; - /* Next check flags */ - if (new->flags != root->flags) + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match + */ + if ((opts->subsys_bits || opts->none) + && (opts->subsys_bits != root->subsys_bits)) return 0; return 1; } +static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) +{ + struct cgroupfs_root *root; + + if (!opts->subsys_bits && !opts->none) + return NULL; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + if (!init_root_id(root)) { + kfree(root); + return ERR_PTR(-ENOMEM); + } + init_cgroup_root(root); + + root->subsys_bits = opts->subsys_bits; + root->flags = opts->flags; + if (opts->release_agent) + strcpy(root->release_agent_path, opts->release_agent); + if (opts->name) + strcpy(root->name, opts->name); + return root; +} + +static void cgroup_drop_root(struct cgroupfs_root *root) +{ + if (!root) + return; + + BUG_ON(!root->hierarchy_id); + spin_lock(&hierarchy_id_lock); + ida_remove(&hierarchy_ida, root->hierarchy_id); + spin_unlock(&hierarchy_id_lock); + kfree(root); +} + static int cgroup_set_super(struct super_block *sb, void *data) { int ret; - struct cgroupfs_root *root = data; + struct cgroup_sb_opts *opts = data; + + /* If we don't have a new root, we can't set up a new sb */ + if (!opts->new_root) + return -EINVAL; + + BUG_ON(!opts->subsys_bits && !opts->none); ret = set_anon_super(sb, NULL); if (ret) return ret; - sb->s_fs_info = root; - root->sb = sb; + sb->s_fs_info = opts->new_root; + opts->new_root->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; @@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type, void *data, struct vfsmount *mnt) { struct cgroup_sb_opts opts; + struct cgroupfs_root *root; int ret = 0; struct super_block *sb; - struct cgroupfs_root *root; - struct list_head tmp_cg_links; + struct cgroupfs_root *new_root; /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - if (ret) { - kfree(opts.release_agent); - return ret; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - kfree(opts.release_agent); - return -ENOMEM; - } + if (ret) + goto out_err; - init_cgroup_root(root); - root->subsys_bits = opts.subsys_bits; - root->flags = opts.flags; - if (opts.release_agent) { - strcpy(root->release_agent_path, opts.release_agent); - kfree(opts.release_agent); + /* + * Allocate a new cgroup root. We may not need it if we're + * reusing an existing hierarchy. + */ + new_root = cgroup_root_from_opts(&opts); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out_err; } + opts.new_root = new_root; - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); - + /* Locate an existing or new sb for this hierarchy */ + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); if (IS_ERR(sb)) { - kfree(root); - return PTR_ERR(sb); + ret = PTR_ERR(sb); + cgroup_drop_root(opts.new_root); + goto out_err; } - if (sb->s_fs_info != root) { - /* Reusing an existing superblock */ - BUG_ON(sb->s_root == NULL); - kfree(root); - root = NULL; - } else { - /* New superblock */ + root = sb->s_fs_info; + BUG_ON(!root); + if (root == opts.new_root) { + /* We used the new root structure, so this is a new hierarchy */ + struct list_head tmp_cg_links; struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; + struct cgroupfs_root *existing_root; int i; BUG_ON(sb->s_root != NULL); @@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); + if (strlen(root->name)) { + /* Check for name clashes with existing mounts */ + for_each_active_root(existing_root) { + if (!strcmp(existing_root->name, root->name)) { + ret = -EBUSY; + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + goto drop_new_super; + } + } + } + /* * We're accessing css_set_count without locking * css_set_lock here, but that's OK - it can only be @@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (ret == -EBUSY) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - goto free_cg_links; + free_cg_links(&tmp_cg_links); + goto drop_new_super; } /* EBUSY should be the only error here */ @@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cgroup_populate_dir(root_cgrp); - mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + } else { + /* + * We re-used an existing hierarchy - the new root (if + * any) is not needed + */ + cgroup_drop_root(opts.new_root); } simple_set_mnt(mnt, sb); + kfree(opts.release_agent); + kfree(opts.name); return 0; - free_cg_links: - free_cg_links(&tmp_cg_links); drop_new_super: deactivate_locked_super(sb); + out_err: + kfree(opts.release_agent); + kfree(opts.name); + return ret; } @@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_mutex); kill_litter_super(sb); - kfree(root); + cgroup_drop_root(root); } static struct file_system_type cgroup_fs_type = { @@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } -/* - * Return the first subsystem attached to a cgroup's hierarchy, and - * its subsystem id. - */ - -static void get_first_subsys(const struct cgroup *cgrp, - struct cgroup_subsys_state **css, int *subsys_id) -{ - const struct cgroupfs_root *root = cgrp->root; - const struct cgroup_subsys *test_ss; - BUG_ON(list_empty(&root->subsys_list)); - test_ss = list_entry(root->subsys_list.next, - struct cgroup_subsys, sibling); - if (css) { - *css = cgrp->subsys[test_ss->subsys_id]; - BUG_ON(!*css); - } - if (subsys_id) - *subsys_id = test_ss->subsys_id; -} - /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; - int subsys_id; - - get_first_subsys(cgrp, NULL, &subsys_id); /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup(tsk, subsys_id); + oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) return 0; for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk); + retval = ss->can_attach(ss, cgrp, tsk, false); if (retval) return retval; } @@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk); + ss->attach(ss, cgrp, oldcgrp, tsk, false); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) return ret; } -/* The various types of files and directories in a cgroup file system */ -enum cgroup_filetype { - FILE_ROOT, - FILE_DIR, - FILE_TASKLIST, - FILE_NOTIFY_ON_RELEASE, - FILE_RELEASE_AGENT, -}; - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -1711,7 +1930,7 @@ static struct file_operations cgroup_file_operations = { .release = cgroup_file_release, }; -static struct inode_operations cgroup_dir_inode_operations = { +static const struct inode_operations cgroup_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, @@ -1876,7 +2095,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * the start of a css_set */ static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) + struct cgroup_iter *it) { struct list_head *l = it->cg_link; struct cg_cgroup_link *link; @@ -2129,7 +2348,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) } /* - * Stuff for reading the 'tasks' file. + * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), @@ -2139,27 +2358,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ /* - * Load into 'pidarray' up to 'npids' of the tasks using cgroup - * 'cgrp'. Return actual number of pids loaded. No need to - * task_lock(p) when reading out p->cgroup, since we're in an RCU - * read section, so the css_set can't go away, and is - * immutable after creation. + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} +static void pidlist_free(void *p) +{ + if (is_vmalloc_addr(p)) + vfree(p); + else + kfree(p); +} +static void *pidlist_resize(void *p, int newcount) +{ + void *newlist; + /* note: if new alloc fails, old p will still be valid either way */ + if (is_vmalloc_addr(p)) { + newlist = vmalloc(newcount * sizeof(pid_t)); + if (!newlist) + return NULL; + memcpy(newlist, p, newcount * sizeof(pid_t)); + vfree(p); + } else { + newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); + } + return newlist; +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * If the new stripped list is sufficiently smaller and there's enough memory + * to allocate a new buffer, will let go of the unneeded memory. Returns the + * number of unique elements. + */ +/* is the size difference enough that we should re-allocate the array? */ +#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) +static int pidlist_uniq(pid_t **p, int length) +{ + int src, dest = 1; + pid_t *list = *p; + pid_t *newlist; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + /* + * if the length difference is large enough, we want to allocate a + * smaller buffer to save memory. if this fails due to out of memory, + * we'll just stay with what we've got. + */ + if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { + newlist = pidlist_resize(list, dest); + if (newlist) + *p = newlist; + } + return dest; +} + +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. */ -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) { - int n = 0, pid; + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); + /* + * We can't drop the pidlist_mutex before taking the l->mutex in case + * the last ref-holder is trying to remove l from the list at the same + * time. Holding the pidlist_mutex precludes somebody taking whichever + * list we find out from under us - compare release_pid_array(). + */ + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry(l, &cgrp->pidlists, links) { + if (l->key.type == type && l->key.ns == ns) { + /* found a matching list - drop the extra refcount */ + put_pid_ns(ns); + /* make sure l doesn't vanish out from under us */ + down_write(&l->mutex); + mutex_unlock(&cgrp->pidlist_mutex); + l->use_count++; + return l; + } + } + /* entry not found; create a new one */ + l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); + put_pid_ns(ns); + return l; + } + init_rwsem(&l->mutex); + down_write(&l->mutex); + l->key.type = type; + l->key.ns = ns; + l->use_count = 0; /* don't increment here */ + l->list = NULL; + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + mutex_unlock(&cgrp->pidlist_mutex); + return l; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ struct cgroup_iter it; struct task_struct *tsk; + struct cgroup_pidlist *l; + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { - if (unlikely(n == npids)) + if (unlikely(n == length)) break; - pid = task_pid_vnr(tsk); - if (pid > 0) - pidarray[n++] = pid; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; } cgroup_iter_end(cgrp, &it); - return n; + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(&array, length); + l = cgroup_pidlist_find(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + /* store array, freeing old if necessary - lock already held */ + pidlist_free(l->list); + l->list = array; + l->length = length; + l->use_count++; + up_write(&l->mutex); + *lp = l; + return 0; } /** @@ -2216,37 +2604,14 @@ err: return ret; } -/* - * Cache pids for all threads in the same pid namespace that are - * opening the same "tasks" file. - */ -struct cgroup_pids { - /* The node in cgrp->pids_list */ - struct list_head list; - /* The cgroup those pids belong to */ - struct cgroup *cgrp; - /* The namepsace those pids belong to */ - struct pid_namespace *ns; - /* Array of process ids in the cgroup */ - pid_t *tasks_pids; - /* How many files are using the this tasks_pids array */ - int use_count; - /* Length of the current tasks_pids array */ - int length; -}; - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} /* - * seq_file methods for the "tasks" file. The seq_file position is the + * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->tasks_pids array. + * in the cgroup->l->list array. */ -static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to @@ -2254,48 +2619,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; + struct cgroup_pidlist *l = s->private; int index = 0, pid = *pos; int *iter; - down_read(&cgrp->pids_mutex); + down_read(&l->mutex); if (pid) { - int end = cp->length; + int end = l->length; while (index < end) { int mid = (index + end) / 2; - if (cp->tasks_pids[mid] == pid) { + if (l->list[mid] == pid) { index = mid; break; - } else if (cp->tasks_pids[mid] <= pid) + } else if (l->list[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cp->length) + if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cp->tasks_pids + index; + iter = l->list + index; *pos = *iter; return iter; } -static void cgroup_tasks_stop(struct seq_file *s, void *v) +static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; - up_read(&cgrp->pids_mutex); + struct cgroup_pidlist *l = s->private; + up_read(&l->mutex); } -static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pids *cp = s->private; - int *p = v; - int *end = cp->tasks_pids + cp->length; - + struct cgroup_pidlist *l = s->private; + pid_t *p = v; + pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. If this goes off the * end, we're done @@ -2309,124 +2671,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) } } -static int cgroup_tasks_show(struct seq_file *s, void *v) +static int cgroup_pidlist_show(struct seq_file *s, void *v) { return seq_printf(s, "%d\n", *(int *)v); } -static struct seq_operations cgroup_tasks_seq_operations = { - .start = cgroup_tasks_start, - .stop = cgroup_tasks_stop, - .next = cgroup_tasks_next, - .show = cgroup_tasks_show, +/* + * seq_operations functions for iterating on pidlists through seq_file - + * independent of whether it's tasks or procs + */ +static const struct seq_operations cgroup_pidlist_seq_operations = { + .start = cgroup_pidlist_start, + .stop = cgroup_pidlist_stop, + .next = cgroup_pidlist_next, + .show = cgroup_pidlist_show, }; -static void release_cgroup_pid_array(struct cgroup_pids *cp) +static void cgroup_release_pid_array(struct cgroup_pidlist *l) { - struct cgroup *cgrp = cp->cgrp; - - down_write(&cgrp->pids_mutex); - BUG_ON(!cp->use_count); - if (!--cp->use_count) { - list_del(&cp->list); - put_pid_ns(cp->ns); - kfree(cp->tasks_pids); - kfree(cp); + /* + * the case where we're the last user of this particular pidlist will + * have us remove it from the cgroup's list, which entails taking the + * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> + * pidlist_mutex, we have to take pidlist_mutex first. + */ + mutex_lock(&l->owner->pidlist_mutex); + down_write(&l->mutex); + BUG_ON(!l->use_count); + if (!--l->use_count) { + /* we're the last user if refcount is 0; remove and free */ + list_del(&l->links); + mutex_unlock(&l->owner->pidlist_mutex); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + up_write(&l->mutex); + kfree(l); + return; } - up_write(&cgrp->pids_mutex); + mutex_unlock(&l->owner->pidlist_mutex); + up_write(&l->mutex); } -static int cgroup_tasks_release(struct inode *inode, struct file *file) +static int cgroup_pidlist_release(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct cgroup_pids *cp; - + struct cgroup_pidlist *l; if (!(file->f_mode & FMODE_READ)) return 0; - - seq = file->private_data; - cp = seq->private; - - release_cgroup_pid_array(cp); + /* + * the seq_file will only be initialized if the file was opened for + * reading; hence we check if it's not null only in that case. + */ + l = ((struct seq_file *)file->private_data)->private; + cgroup_release_pid_array(l); return seq_release(inode, file); } -static struct file_operations cgroup_tasks_operations = { +static const struct file_operations cgroup_pidlist_operations = { .read = seq_read, .llseek = seq_lseek, .write = cgroup_file_write, - .release = cgroup_tasks_release, + .release = cgroup_pidlist_release, }; /* - * Handle an open on 'tasks' file. Prepare an array containing the - * process id's of tasks currently attached to the cgroup being opened. + * The following functions handle opens on a file that displays a pidlist + * (tasks or procs). Prepare an array of the process/thread IDs of whoever's + * in the cgroup. */ - -static int cgroup_tasks_open(struct inode *unused, struct file *file) +/* helper function for the two below it */ +static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct pid_namespace *ns = current->nsproxy->pid_ns; - struct cgroup_pids *cp; - pid_t *pidarray; - int npids; + struct cgroup_pidlist *l; int retval; /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. - */ - npids = cgroup_task_count(cgrp); - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - return -ENOMEM; - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* - * Store the array in the cgroup, freeing the old - * array if necessary - */ - down_write(&cgrp->pids_mutex); - - list_for_each_entry(cp, &cgrp->pids_list, list) { - if (ns == cp->ns) - goto found; - } - - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) { - up_write(&cgrp->pids_mutex); - kfree(pidarray); - return -ENOMEM; - } - cp->cgrp = cgrp; - cp->ns = ns; - get_pid_ns(ns); - list_add(&cp->list, &cgrp->pids_list); -found: - kfree(cp->tasks_pids); - cp->tasks_pids = pidarray; - cp->length = npids; - cp->use_count++; - up_write(&cgrp->pids_mutex); - - file->f_op = &cgroup_tasks_operations; + /* have the array populated */ + retval = pidlist_array_load(cgrp, type, &l); + if (retval) + return retval; + /* configure file information */ + file->f_op = &cgroup_pidlist_operations; - retval = seq_open(file, &cgroup_tasks_seq_operations); + retval = seq_open(file, &cgroup_pidlist_seq_operations); if (retval) { - release_cgroup_pid_array(cp); + cgroup_release_pid_array(l); return retval; } - ((struct seq_file *)file->private_data)->private = cp; + ((struct seq_file *)file->private_data)->private = l; return 0; } +static int cgroup_tasks_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); +} +static int cgroup_procs_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); +} static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, struct cftype *cft) @@ -2449,21 +2794,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, /* * for the common functions, 'private' gives the type of file */ +/* for hysterical raisins, we can't put this on the older files */ +#define CGROUP_FILE_GENERIC_PREFIX "cgroup." static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, - .release = cgroup_tasks_release, - .private = FILE_TASKLIST, + .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, - + { + .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .open = cgroup_procs_open, + /* .write_u64 = cgroup_procs_write, TODO */ + .release = cgroup_pidlist_release, + .mode = S_IRUGO, + }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, - .private = FILE_NOTIFY_ON_RELEASE, }, }; @@ -2472,7 +2823,6 @@ static struct cftype cft_release_agent = { .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, - .private = FILE_RELEASE_AGENT, }; static int cgroup_populate_dir(struct cgroup *cgrp) @@ -2879,6 +3229,7 @@ int __init cgroup_init_early(void) init_task.cgroups = &init_css_set; init_css_set_link.cg = &init_css_set; + init_css_set_link.cgrp = dummytop; list_add(&init_css_set_link.cgrp_link_list, &rootnode.top_cgroup.css_sets); list_add(&init_css_set_link.cg_link_list, @@ -2933,7 +3284,7 @@ int __init cgroup_init(void) /* Add init_css_set to the hash table */ hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); - + BUG_ON(!init_root_id(&rootnode)); err = register_filesystem(&cgroup_fs_type); if (err < 0) goto out; @@ -2986,15 +3337,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int subsys_id; int count = 0; - seq_printf(m, "%lu:", root->subsys_bits); + seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + if (strlen(root->name)) + seq_printf(m, "%sname=%s", count ? "," : "", + root->name); seq_putc(m, ':'); - get_first_subsys(&root->top_cgroup, NULL, &subsys_id); - cgrp = task_cgroup(tsk, subsys_id); + cgrp = task_cgroup_from_root(tsk, root); retval = cgroup_path(cgrp, buf, PAGE_SIZE); if (retval < 0) goto out_unlock; @@ -3033,8 +3385,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%lu\t%d\t%d\n", - ss->name, ss->root->subsys_bits, + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); @@ -3320,13 +3672,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) { int ret; struct cgroup *target; - int subsys_id; if (cgrp == dummytop) return 1; - get_first_subsys(cgrp, NULL, &subsys_id); - target = task_cgroup(task, subsys_id); + target = task_cgroup_from_root(task, cgrp->root); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); @@ -3693,3 +4043,154 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) +{ + return cgroup_task_count(cont); +} + +static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup *cont, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(¤t->cgroups->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + struct css_set *cg; + + read_lock(&css_set_lock); + rcu_read_lock(); + cg = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + const char *name; + + if (c->dentry) + name = c->dentry->d_name.name; + else + name = "?"; + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name); + } + rcu_read_unlock(); + read_unlock(&css_set_lock); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + + read_lock(&css_set_lock); + list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + struct task_struct *task; + int count = 0; + seq_printf(seq, "css_set %p\n", cg); + list_for_each_entry(task, &cg->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) { + seq_puts(seq, " ...\n"); + break; + } else { + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); + } + } + } + read_unlock(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + +static struct cftype debug_files[] = { + { + .name = "cgroup_refcount", + .read_u64 = cgroup_refcount_read, + }, + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .read_seq_string = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .read_seq_string = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, +}; + +static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, debug_files, + ARRAY_SIZE(debug_files)); +} + +struct cgroup_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c deleted file mode 100644 index 0c92d79..0000000 --- a/kernel/cgroup_debug.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * kernel/cgroup_debug.c - Example cgroup subsystem that - * exposes debug info - * - * Copyright (C) Google Inc, 2007 - * - * Developed by Paul Menage (menage@google.com) - * - */ - -#include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> - -#include <asm/atomic.h> - -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - kfree(cont->subsys[debug_subsys_id]); -} - -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - -static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) -{ - u64 count; - - count = cgroup_task_count(cont); - return count; -} - -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) -{ - return (u64)(long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(¤t->cgroups->refcount); - rcu_read_unlock(); - return count; -} - -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - -static struct cftype files[] = { - { - .name = "cgroup_refcount", - .read_u64 = cgroup_refcount_read, - }, - { - .name = "taskcount", - .read_u64 = taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, -}; - -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); -} - -struct cgroup_subsys debug_subsys = { - .name = "debug", - .create = debug_create, - .destroy = debug_destroy, - .populate = debug_populate, - .subsys_id = debug_subsys_id, -}; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fb249e2..59e9ef6 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task) */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task) + struct task_struct *task, bool threadgroup) { struct freezer *freezer; @@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state == CGROUP_FROZEN) return -EBUSY; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (is_task_frozen_enough(c)) { + rcu_read_unlock(); + return -EBUSY; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce1004..6ba0f1e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -401,6 +401,7 @@ int disable_nonboot_cpus(void) break; } } + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ @@ -413,6 +414,14 @@ int disable_nonboot_cpus(void) return error; } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} + void __ref enable_nonboot_cpus(void) { int cpu, error; @@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); + + arch_enable_nonboot_cpus_begin(); + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { @@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } + + arch_enable_nonboot_cpus_end(); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7e75a41..b5cb469 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp) static cpumask_var_t cpus_attach; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct task_struct *tsk) +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct task_struct *tsk, bool threadgroup) { + int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - return security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk, 0, NULL); + if (ret) + return ret; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + ret = security_task_setscheduler(c, 0, NULL); + if (ret) { + rcu_read_unlock(); + return ret; + } + } + rcu_read_unlock(); + } + return 0; +} + +static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, + struct cpuset *cs) +{ + int err; + /* + * can_attach beforehand should guarantee that this doesn't fail. + * TODO: have a better way to handle failure here + */ + err = set_cpus_allowed_ptr(tsk, cpus_attach); + WARN_ON_ONCE(err); + + task_lock(tsk); + cpuset_change_task_nodemask(tsk, to); + task_unlock(tsk); + cpuset_update_task_spread_flag(cs, tsk); + } -static void cpuset_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct cgroup *oldcont, - struct task_struct *tsk) +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *oldcont, struct task_struct *tsk, + bool threadgroup) { nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - int err; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss, guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &to); } - err = set_cpus_allowed_ptr(tsk, cpus_attach); - if (err) - return; - task_lock(tsk); - cpuset_change_task_nodemask(tsk, &to); - task_unlock(tsk); - cpuset_update_task_spread_flag(cs, tsk); + /* do per-task migration stuff possibly for each in the threadgroup */ + cpuset_attach_task(tsk, &to, cs); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + cpuset_attach_task(c, &to, cs); + } + rcu_read_unlock(); + } + /* change mm; only needs to be done once even if threadgroup */ from = oldcs->mems_allowed; to = cs->mems_allowed; mm = get_task_mm(tsk); diff --git a/kernel/cred.c b/kernel/cred.c index 006fcab..dd76cfe 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -147,7 +147,8 @@ static void put_cred_rcu(struct rcu_head *rcu) key_put(cred->thread_keyring); key_put(cred->request_key_auth); release_tgcred(cred); - put_group_info(cred->group_info); + if (cred->group_info) + put_group_info(cred->group_info); free_uid(cred->user); kmem_cache_free(cred_jar, cred); } @@ -781,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as); #ifdef CONFIG_DEBUG_CREDENTIALS +bool creds_are_invalid(const struct cred *cred) +{ + if (cred->magic != CRED_MAGIC) + return true; + if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) + return true; +#ifdef CONFIG_SECURITY_SELINUX + if (selinux_is_enabled()) { + if ((unsigned long) cred->security < PAGE_SIZE) + return true; + if ((*(u32 *)cred->security & 0xffffff00) == + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) + return true; + } +#endif + return false; +} +EXPORT_SYMBOL(creds_are_invalid); + /* * dump invalid credentials */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index abb6e17..ead9b61 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/taskstats.h> #include <linux/time.h> #include <linux/sysctl.h> #include <linux/delayacct.h> diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c deleted file mode 100644 index 962a3b5..0000000 --- a/kernel/dma-coherent.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Coherent per-device memory handling. - * Borrowed from i386 - */ -#include <linux/kernel.h> -#include <linux/dma-mapping.h> - -struct dma_coherent_mem { - void *virt_base; - u32 device_base; - int size; - int flags; - unsigned long *bitmap; -}; - -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pos, err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -/** - * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area - * - * @dev: device from which we allocate memory - * @size: size of requested memory area - * @dma_handle: This will be filled with the correct dma handle - * @ret: This pointer will be filled with the virtual address - * to allocated area. - * - * This function should be only called from per-arch dma_alloc_coherent() - * to support allocation from per-device coherent memory pools. - * - * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return @ret. - */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem; - int order = get_order(size); - int pageno; - - if (!dev) - return 0; - mem = dev->dma_mem; - if (!mem) - return 0; - - *ret = NULL; - - if (unlikely(size > (mem->size << PAGE_SHIFT))) - goto err; - - pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); - if (unlikely(pageno < 0)) - goto err; - - /* - * Memory was found in the per-device area. - */ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); - *ret = mem->virt_base + (pageno << PAGE_SHIFT); - memset(*ret, 0, size); - - return 1; - -err: - /* - * In the case where the allocation can not be satisfied from the - * per-device area, try to fall back to generic memory if the - * constraints allow it. - */ - return mem->flags & DMA_MEMORY_EXCLUSIVE; -} -EXPORT_SYMBOL(dma_alloc_from_coherent); - -/** - * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool - * @dev: device from which the memory was allocated - * @order: the order of pages allocated - * @vaddr: virtual address of allocated pages - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, releases that memory. - * - * Returns 1 if we correctly released the memory, or 0 if - * dma_release_coherent() should proceed with releasing memory from - * generic pools. - */ -int dma_release_from_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - return 1; - } - return 0; -} -EXPORT_SYMBOL(dma_release_from_coherent); diff --git a/kernel/exit.c b/kernel/exit.c index ae5d866..5859f59 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,7 @@ #include <linux/tracehook.h> #include <linux/fs_struct.h> #include <linux/init_task.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <trace/events/sched.h> #include <asm/uaccess.h> @@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(tsk->perf_counter_ctxp); +#ifdef CONFIG_PERF_EVENTS + WARN_ON_ONCE(tsk->perf_event_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); @@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) + if (task_session(curr) != pid) { change_pid(curr, PIDTYPE_SID, pid); + proc_sid_connector(curr); + } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); @@ -945,6 +947,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); + if (tsk->mm) + setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) @@ -972,8 +976,6 @@ NORET_TYPE void do_exit(long code) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); - if (tsk->binfmt) - module_put(tsk->binfmt->module); proc_exit_connector(tsk); @@ -981,7 +983,7 @@ NORET_TYPE void do_exit(long code) * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ - perf_counter_exit_task(tsk); + perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA @@ -1093,28 +1095,28 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; + wait_queue_t child_wait; int notask_error; }; -static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - struct pid *pid = NULL; - if (type == PIDTYPE_PID) - pid = task->pids[type].pid; - else if (type < PIDTYPE_MAX) - pid = task->group_leader->pids[type].pid; - return pid; + if (type != PIDTYPE_PID) + task = task->group_leader; + return task->pids[type].pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { - int err; - - if (wo->wo_type < PIDTYPE_MAX) { - if (task_pid_type(p, wo->wo_type) != wo->wo_pid) - return 0; - } + return wo->wo_type == PIDTYPE_MAX || + task_pid_type(p, wo->wo_type) == wo->wo_pid; +} +static int eligible_child(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return 0; /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1124,10 +1126,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p) && !(wo->wo_flags & __WALL)) return 0; - err = security_task_wait(p); - if (err) - return err; - return 1; } @@ -1140,18 +1138,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + if (infop) { + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + } if (!retval) retval = pid; return retval; @@ -1208,6 +1208,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (likely(!traced) && likely(!task_detached(p))) { struct signal_struct *psig; struct signal_struct *sig; + unsigned long maxrss; /* * The resource counters for the group leader are in its @@ -1256,6 +1257,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; + maxrss = max(sig->maxrss, sig->cmaxrss); + if (psig->cmaxrss < maxrss) + psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->real_parent->sighand->siglock); @@ -1477,13 +1481,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * then ->notask_error is 0 if @p is an eligible child, * or another error from security_task_wait(), or still -ECHILD. */ -static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, - int ptrace, struct task_struct *p) +static int wait_consider_task(struct wait_opts *wo, int ptrace, + struct task_struct *p) { int ret = eligible_child(wo, p); if (!ret) return ret; + ret = security_task_wait(p); if (unlikely(ret < 0)) { /* * If we have not yet seen any eligible child, @@ -1545,7 +1550,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) * Do not consider detached threads. */ if (!task_detached(p)) { - int ret = wait_consider_task(wo, tsk, 0, p); + int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } @@ -1559,7 +1564,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { - int ret = wait_consider_task(wo, tsk, 1, p); + int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } @@ -1567,15 +1572,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +static int child_wait_callback(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct wait_opts *wo = container_of(wait, struct wait_opts, + child_wait); + struct task_struct *p = key; + + if (!eligible_pid(wo, p)) + return 0; + + if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) + return 0; + + return default_wake_function(wait, mode, sync, key); +} + +void __wake_up_parent(struct task_struct *p, struct task_struct *parent) +{ + __wake_up_sync_key(&parent->signal->wait_chldexit, + TASK_INTERRUPTIBLE, 1, p); +} + static long do_wait(struct wait_opts *wo) { - DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); - add_wait_queue(¤t->signal->wait_chldexit,&wait); + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); repeat: /* * If there is nothing that can match our critiera just get out. @@ -1616,32 +1644,7 @@ notask: } end: __set_current_state(TASK_RUNNING); - remove_wait_queue(¤t->signal->wait_chldexit,&wait); - if (wo->wo_info) { - struct siginfo __user *infop = wo->wo_info; - - if (retval > 0) - retval = 0; - else { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!retval) - retval = put_user(0, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user(0, &infop->si_code); - if (!retval) - retval = put_user(0, &infop->si_pid); - if (!retval) - retval = put_user(0, &infop->si_uid); - if (!retval) - retval = put_user(0, &infop->si_status); - } - } + remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } @@ -1686,6 +1689,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); + + if (ret > 0) { + ret = 0; + } else if (infop) { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. + */ + if (!ret) + ret = put_user(0, &infop->si_signo); + if (!ret) + ret = put_user(0, &infop->si_errno); + if (!ret) + ret = put_user(0, &infop->si_code); + if (!ret) + ret = put_user(0, &infop->si_pid); + if (!ret) + ret = put_user(0, &infop->si_uid); + if (!ret) + ret = put_user(0, &infop->si_status); + } + put_pid(pid); /* avoid REGPARM breakage on x86: */ diff --git a/kernel/fork.c b/kernel/fork.c index bfee931..266c6af 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -49,6 +49,7 @@ #include <linux/ftrace.h> #include <linux/profile.h> #include <linux/rmap.h> +#include <linux/ksm.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> @@ -61,7 +62,8 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> +#include <linux/posix-timers.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +static void account_kernel_stack(struct thread_info *ti, int account) +{ + struct zone *zone = page_zone(virt_to_page(ti)); + + mod_zone_page_state(zone, NR_KERNEL_STACK, account); +} + void free_task(struct task_struct *tsk) { prop_local_destroy_single(&tsk->dirties); + account_kernel_stack(tsk->stack, -1); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -253,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + + account_kernel_stack(ti, 1); + return tsk; out: @@ -288,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -418,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> +static void mm_init_aio(struct mm_struct *mm) +{ +#ifdef CONFIG_AIO + spin_lock_init(&mm->ioctx_lock); + INIT_HLIST_HEAD(&mm->ioctx_list); +#endif +} + static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; + mm->flags = (current->mm) ? + (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); - spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + mm_init_aio(mm); mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { @@ -485,6 +509,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); + ksm_exit(mm); exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -493,6 +518,8 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + if (mm->binfmt) + module_put(mm->binfmt->module); mmdrop(mm); } } @@ -618,9 +645,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; + if (mm->binfmt && !try_module_get(mm->binfmt->module)) + goto free_pt; + return mm; free_pt: + /* don't put binfmt in mmput, we haven't got module yet */ + mm->binfmt = NULL; mmput(mm); fail_nomem: @@ -788,10 +820,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) thread_group_cputime_init(sig); /* Expiration times and increments. */ - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; + sig->it[CPUCLOCK_PROF].expires = cputime_zero; + sig->it[CPUCLOCK_PROF].incr = cputime_zero; + sig->it[CPUCLOCK_VIRT].expires = cputime_zero; + sig->it[CPUCLOCK_VIRT].incr = cputime_zero; /* Cached expiration times. */ sig->cputime_expires.prof_exp = cputime_zero; @@ -849,6 +881,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; + sig->maxrss = sig->cmaxrss = 0; task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); @@ -863,6 +896,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); + sig->oom_adj = current->signal->oom_adj; + return 0; } @@ -958,6 +993,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); + /* + * Siblings of global init remain as zombies on exit since they are + * not reaped by their parent (swapper). To solve this and to avoid + * multi-rooted process trees, prevent global and container-inits + * from creating siblings. + */ + if ((clone_flags & CLONE_PARENT) && + current->signal->flags & SIGNAL_UNKILLABLE) + return ERR_PTR(-EINVAL); + retval = security_task_create(clone_flags); if (retval) goto fork_out; @@ -999,9 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - if (p->binfmt && !try_module_get(p->binfmt->module)) - goto bad_fork_cleanup_put_domain; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); @@ -1075,10 +1117,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->bts = NULL; + p->stack_start = stack_start; + /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - retval = perf_counter_init_task(p); + retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; @@ -1253,7 +1297,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); - perf_counter_fork(p); + perf_event_fork(p); return p; bad_fork_free_pid: @@ -1280,16 +1324,13 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_free_task(p); + perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); - if (p->binfmt) - module_put(p->binfmt->module); -bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 22e9dcf..70a298d 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 + depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 05071bf..e5d98ce 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,36 +48,7 @@ #include <asm/uaccess.h> -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); -} - -EXPORT_SYMBOL_GPL(ktime_get_real); +#include <trace/events/timer.h> /* * The timer bases: @@ -106,31 +77,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. @@ -498,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer) +{ + debug_hrtimer_activate(timer); + trace_hrtimer_start(timer); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -854,7 +820,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer *entry; int leftmost = 1; - debug_hrtimer_activate(timer); + debug_activate(timer); /* * Find the right place in the rbtree: @@ -940,7 +906,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, @@ -1155,7 +1121,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &cpu_base->clock_base[clock_id]; - INIT_LIST_HEAD(&timer->cb_entry); hrtimer_init_timer_hres(timer); #ifdef CONFIG_TIMER_STATS @@ -1174,7 +1139,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - debug_hrtimer_init(timer); + debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -1198,7 +1163,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) } EXPORT_SYMBOL_GPL(hrtimer_get_res); -static void __run_hrtimer(struct hrtimer *timer) +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) { struct hrtimer_clock_base *base = timer->base; struct hrtimer_cpu_base *cpu_base = base->cpu_base; @@ -1207,7 +1172,7 @@ static void __run_hrtimer(struct hrtimer *timer) WARN_ON(!irqs_disabled()); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1218,7 +1183,9 @@ static void __run_hrtimer(struct hrtimer *timer) * the timer base. */ spin_unlock(&cpu_base->lock); + trace_hrtimer_expire_entry(timer, now); restart = fn(timer); + trace_hrtimer_expire_exit(timer); spin_lock(&cpu_base->lock); /* @@ -1329,7 +1296,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) break; } - __run_hrtimer(timer); + __run_hrtimer(timer, &basenow); } base++; } @@ -1451,7 +1418,7 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer); + __run_hrtimer(timer, &base->softirq_time); } spin_unlock(&cpu_base->lock); } @@ -1628,7 +1595,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 022a492..d4e8417 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout) * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; diff --git a/kernel/itimer.c b/kernel/itimer.c index 58762f7..b03451e 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -12,6 +12,7 @@ #include <linux/time.h> #include <linux/posix-timers.h> #include <linux/hrtimer.h> +#include <trace/events/timer.h> #include <asm/uaccess.h> @@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) return ktime_to_timeval(rem); } +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *const value) +{ + cputime_t cval, cinterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_cputime cputime; + cputime_t t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime_add(cputime.utime, cputime.stime); + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (cputime_le(cval, t)) + /* about to fire */ + cval = cputime_one_jiffy; + else + cval = cputime_sub(cval, t); + } + + spin_unlock_irq(&tsk->sighand->siglock); + + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); +} + int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; - cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: @@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime cputime; - cputime_t utime; - - thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - if (cputime_le(cval, utime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, utime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); break; case ITIMER_PROF: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime times; - cputime_t ptime; - - thread_group_cputimer(tsk, ×); - ptime = cputime_add(times.utime, times.stime); - if (cputime_le(cval, ptime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, ptime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); break; default: return(-EINVAL); @@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); + trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); return HRTIMER_NORESTART; } +static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) +{ + struct timespec ts; + s64 cpu_ns; + + cputime_to_timespec(ct, &ts); + cpu_ns = timespec_to_ns(&ts); + + return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; +} + +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + const struct itimerval *const value, + struct itimerval *const ovalue) +{ + cputime_t cval, nval, cinterval, ninterval; + s64 ns_ninterval, ns_nval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timeval_to_cputime(&value->it_value); + ns_nval = timeval_to_ns(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + ns_ninterval = timeval_to_ns(&value->it_interval); + + it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); + it->error = cputime_sub_ns(nval, ns_nval); + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, cputime_one_jiffy); + set_process_cpu_timer(tsk, clock_id, &nval, &cval); + } + it->expires = nval; + it->incr = ninterval; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? + ITIMER_VIRTUAL : ITIMER_PROF, value, nval); + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } +} + /* * Returns true if the timeval is in canonical form */ @@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - cputime_t cval, cinterval, nval, ninterval; /* * Validate the timevals in value. @@ -171,51 +221,14 @@ again: } else tsk->signal->it_real_incr.tv64 = 0; + trace_itimer_state(ITIMER_REAL, value, 0); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_VIRT, - &nval, &cval); - } - tsk->signal->it_virt_expires = nval; - tsk->signal->it_virt_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); break; case ITIMER_PROF: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_PROF, - &nval, &cval); - } - tsk->signal->it_prof_expires = nval; - tsk->signal->it_prof_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); break; default: return -EINVAL; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3a29dbe..8b6b8b6 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr) static inline int is_kernel_text(unsigned long addr) { - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) + if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || + arch_is_kernel_text(addr)) return 1; return in_gate_area_no_task(addr); } diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 26539e3..3765ff3 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free); * writer, you don't need extra locking to use these functions. */ unsigned int __kfifo_put(struct kfifo *fifo, - unsigned char *buffer, unsigned int len) + const unsigned char *buffer, unsigned int len) { unsigned int l; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ef177d6..cfadc12 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1321,7 +1321,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_seq_ops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f74d2d7..3815ac1d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -578,6 +578,9 @@ static int static_obj(void *obj) if ((addr >= start) && (addr < end)) return 1; + if (arch_is_kernel_data(addr)) + return 1; + #ifdef CONFIG_SMP /* * percpu var? diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d4b3dbc..d4aba4f 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -594,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations lockstat_ops = { +static const struct seq_operations lockstat_ops = { .start = ls_start, .next = ls_next, .stop = ls_stop, diff --git a/kernel/marker.c b/kernel/marker.c deleted file mode 100644 index ea54f26..0000000 --- a/kernel/marker.c +++ /dev/null @@ -1,930 +0,0 @@ -/* - * Copyright (C) 2007 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/types.h> -#include <linux/jhash.h> -#include <linux/list.h> -#include <linux/rcupdate.h> -#include <linux/marker.h> -#include <linux/err.h> -#include <linux/slab.h> - -extern struct marker __start___markers[]; -extern struct marker __stop___markers[]; - -/* Set to 1 to enable marker debug output */ -static const int marker_debug; - -/* - * markers_mutex nests inside module_mutex. Markers mutex protects the builtin - * and module markers and the hash table. - */ -static DEFINE_MUTEX(markers_mutex); - -/* - * Marker hash table, containing the active markers. - * Protected by module_mutex. - */ -#define MARKER_HASH_BITS 6 -#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) -static struct hlist_head marker_table[MARKER_TABLE_SIZE]; - -/* - * Note about RCU : - * It is used to make sure every handler has finished using its private data - * between two consecutive operation (add or remove) on a given marker. It is - * also used to delay the free of multiple probes array until a quiescent state - * is reached. - * marker entries modifications are protected by the markers_mutex. - */ -struct marker_entry { - struct hlist_node hlist; - char *format; - /* Probe wrapper */ - void (*call)(const struct marker *mdata, void *call_private, ...); - struct marker_probe_closure single; - struct marker_probe_closure *multi; - int refcount; /* Number of times armed. 0 if disarmed. */ - struct rcu_head rcu; - void *oldptr; - int rcu_pending; - unsigned char ptype:1; - unsigned char format_allocated:1; - char name[0]; /* Contains name'\0'format'\0' */ -}; - -/** - * __mark_empty_function - Empty probe callback - * @probe_private: probe private data - * @call_private: call site private data - * @fmt: format string - * @...: variable argument list - * - * Empty callback provided as a probe to the markers. By providing this to a - * disabled marker, we make sure the execution flow is always valid even - * though the function pointer change and the marker enabling are two distinct - * operations that modifies the execution flow of preemptible code. - */ -notrace void __mark_empty_function(void *probe_private, void *call_private, - const char *fmt, va_list *args) -{ -} -EXPORT_SYMBOL_GPL(__mark_empty_function); - -/* - * marker_probe_cb Callback that prepares the variable argument list for probes. - * @mdata: pointer of type struct marker - * @call_private: caller site private data - * @...: Variable argument list. - * - * Since we do not use "typical" pointer based RCU in the 1 argument case, we - * need to put a full smp_rmb() in this branch. This is why we do not use - * rcu_dereference() for the pointer read. - */ -notrace void marker_probe_cb(const struct marker *mdata, - void *call_private, ...) -{ - va_list args; - char ptype; - - /* - * rcu_read_lock_sched does two things : disabling preemption to make - * sure the teardown of the callbacks can be done correctly when they - * are in modules and they insure RCU read coherency. - */ - rcu_read_lock_sched_notrace(); - ptype = mdata->ptype; - if (likely(!ptype)) { - marker_probe_func *func; - /* Must read the ptype before ptr. They are not data dependant, - * so we put an explicit smp_rmb() here. */ - smp_rmb(); - func = mdata->single.func; - /* Must read the ptr before private data. They are not data - * dependant, so we put an explicit smp_rmb() here. */ - smp_rmb(); - va_start(args, call_private); - func(mdata->single.probe_private, call_private, mdata->format, - &args); - va_end(args); - } else { - struct marker_probe_closure *multi; - int i; - /* - * Read mdata->ptype before mdata->multi. - */ - smp_rmb(); - multi = mdata->multi; - /* - * multi points to an array, therefore accessing the array - * depends on reading multi. However, even in this case, - * we must insure that the pointer is read _before_ the array - * data. Same as rcu_dereference, but we need a full smp_rmb() - * in the fast path, so put the explicit barrier here. - */ - smp_read_barrier_depends(); - for (i = 0; multi[i].func; i++) { - va_start(args, call_private); - multi[i].func(multi[i].probe_private, call_private, - mdata->format, &args); - va_end(args); - } - } - rcu_read_unlock_sched_notrace(); -} -EXPORT_SYMBOL_GPL(marker_probe_cb); - -/* - * marker_probe_cb Callback that does not prepare the variable argument list. - * @mdata: pointer of type struct marker - * @call_private: caller site private data - * @...: Variable argument list. - * - * Should be connected to markers "MARK_NOARGS". - */ -static notrace void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, ...) -{ - va_list args; /* not initialized */ - char ptype; - - rcu_read_lock_sched_notrace(); - ptype = mdata->ptype; - if (likely(!ptype)) { - marker_probe_func *func; - /* Must read the ptype before ptr. They are not data dependant, - * so we put an explicit smp_rmb() here. */ - smp_rmb(); - func = mdata->single.func; - /* Must read the ptr before private data. They are not data - * dependant, so we put an explicit smp_rmb() here. */ - smp_rmb(); - func(mdata->single.probe_private, call_private, mdata->format, - &args); - } else { - struct marker_probe_closure *multi; - int i; - /* - * Read mdata->ptype before mdata->multi. - */ - smp_rmb(); - multi = mdata->multi; - /* - * multi points to an array, therefore accessing the array - * depends on reading multi. However, even in this case, - * we must insure that the pointer is read _before_ the array - * data. Same as rcu_dereference, but we need a full smp_rmb() - * in the fast path, so put the explicit barrier here. - */ - smp_read_barrier_depends(); - for (i = 0; multi[i].func; i++) - multi[i].func(multi[i].probe_private, call_private, - mdata->format, &args); - } - rcu_read_unlock_sched_notrace(); -} - -static void free_old_closure(struct rcu_head *head) -{ - struct marker_entry *entry = container_of(head, - struct marker_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; -} - -static void debug_print_probes(struct marker_entry *entry) -{ - int i; - - if (!marker_debug) - return; - - if (!entry->ptype) { - printk(KERN_DEBUG "Single probe : %p %p\n", - entry->single.func, - entry->single.probe_private); - } else { - for (i = 0; entry->multi[i].func; i++) - printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, - entry->multi[i].func, - entry->multi[i].probe_private); - } -} - -static struct marker_probe_closure * -marker_entry_add_probe(struct marker_entry *entry, - marker_probe_func *probe, void *probe_private) -{ - int nr_probes = 0; - struct marker_probe_closure *old, *new; - - WARN_ON(!probe); - - debug_print_probes(entry); - old = entry->multi; - if (!entry->ptype) { - if (entry->single.func == probe && - entry->single.probe_private == probe_private) - return ERR_PTR(-EBUSY); - if (entry->single.func == __mark_empty_function) { - /* 0 -> 1 probes */ - entry->single.func = probe; - entry->single.probe_private = probe_private; - entry->refcount = 1; - entry->ptype = 0; - debug_print_probes(entry); - return NULL; - } else { - /* 1 -> 2 probes */ - nr_probes = 1; - old = NULL; - } - } else { - /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) - if (old[nr_probes].func == probe - && old[nr_probes].probe_private - == probe_private) - return ERR_PTR(-EBUSY); - } - /* + 2 : one for new probe, one for NULL func */ - new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), - GFP_KERNEL); - if (new == NULL) - return ERR_PTR(-ENOMEM); - if (!old) - new[0] = entry->single; - else - memcpy(new, old, - nr_probes * sizeof(struct marker_probe_closure)); - new[nr_probes].func = probe; - new[nr_probes].probe_private = probe_private; - entry->refcount = nr_probes + 1; - entry->multi = new; - entry->ptype = 1; - debug_print_probes(entry); - return old; -} - -static struct marker_probe_closure * -marker_entry_remove_probe(struct marker_entry *entry, - marker_probe_func *probe, void *probe_private) -{ - int nr_probes = 0, nr_del = 0, i; - struct marker_probe_closure *old, *new; - - old = entry->multi; - - debug_print_probes(entry); - if (!entry->ptype) { - /* 0 -> N is an error */ - WARN_ON(entry->single.func == __mark_empty_function); - /* 1 -> 0 probes */ - WARN_ON(probe && entry->single.func != probe); - WARN_ON(entry->single.probe_private != probe_private); - entry->single.func = __mark_empty_function; - entry->refcount = 0; - entry->ptype = 0; - debug_print_probes(entry); - return NULL; - } else { - /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if ((!probe || old[nr_probes].func == probe) - && old[nr_probes].probe_private - == probe_private) - nr_del++; - } - } - - if (nr_probes - nr_del == 0) { - /* N -> 0, (N > 1) */ - entry->single.func = __mark_empty_function; - entry->refcount = 0; - entry->ptype = 0; - } else if (nr_probes - nr_del == 1) { - /* N -> 1, (N > 1) */ - for (i = 0; old[i].func; i++) - if ((probe && old[i].func != probe) || - old[i].probe_private != probe_private) - entry->single = old[i]; - entry->refcount = 1; - entry->ptype = 0; - } else { - int j = 0; - /* N -> M, (N > 1, M > 1) */ - /* + 1 for NULL */ - new = kzalloc((nr_probes - nr_del + 1) - * sizeof(struct marker_probe_closure), GFP_KERNEL); - if (new == NULL) - return ERR_PTR(-ENOMEM); - for (i = 0; old[i].func; i++) - if ((probe && old[i].func != probe) || - old[i].probe_private != probe_private) - new[j++] = old[i]; - entry->refcount = nr_probes - nr_del; - entry->ptype = 1; - entry->multi = new; - } - debug_print_probes(entry); - return old; -} - -/* - * Get marker if the marker is present in the marker hash table. - * Must be called with markers_mutex held. - * Returns NULL if not present. - */ -static struct marker_entry *get_marker(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - u32 hash = jhash(name, strlen(name), 0); - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) - return e; - } - return NULL; -} - -/* - * Add the marker to the marker hash table. Must be called with markers_mutex - * held. - */ -static struct marker_entry *add_marker(const char *name, const char *format) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - size_t name_len = strlen(name) + 1; - size_t format_len = 0; - u32 hash = jhash(name, name_len-1, 0); - - if (format) - format_len = strlen(format) + 1; - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - printk(KERN_NOTICE - "Marker %s busy\n", name); - return ERR_PTR(-EBUSY); /* Already there */ - } - } - /* - * Using kmalloc here to allocate a variable length element. Could - * cause some memory fragmentation if overused. - */ - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - memcpy(&e->name[0], name, name_len); - if (format) { - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); - } else { - e->format = NULL; - e->call = marker_probe_cb; - } - e->single.func = __mark_empty_function; - e->single.probe_private = NULL; - e->multi = NULL; - e->ptype = 0; - e->format_allocated = 0; - e->refcount = 0; - e->rcu_pending = 0; - hlist_add_head(&e->hlist, head); - return e; -} - -/* - * Remove the marker from the marker hash table. Must be called with mutex_lock - * held. - */ -static int remove_marker(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - int found = 0; - size_t len = strlen(name) + 1; - u32 hash = jhash(name, len-1, 0); - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - found = 1; - break; - } - } - if (!found) - return -ENOENT; - if (e->single.func != __mark_empty_function) - return -EBUSY; - hlist_del(&e->hlist); - if (e->format_allocated) - kfree(e->format); - /* Make sure the call_rcu has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); - kfree(e); - return 0; -} - -/* - * Set the mark_entry format to the format found in the element. - */ -static int marker_set_format(struct marker_entry *entry, const char *format) -{ - entry->format = kstrdup(format, GFP_KERNEL); - if (!entry->format) - return -ENOMEM; - entry->format_allocated = 1; - - trace_mark(core_marker_format, "name %s format %s", - entry->name, entry->format); - return 0; -} - -/* - * Sets the probe callback corresponding to one marker. - */ -static int set_marker(struct marker_entry *entry, struct marker *elem, - int active) -{ - int ret = 0; - WARN_ON(strcmp(entry->name, elem->name) != 0); - - if (entry->format) { - if (strcmp(entry->format, elem->format) != 0) { - printk(KERN_NOTICE - "Format mismatch for probe %s " - "(%s), marker (%s)\n", - entry->name, - entry->format, - elem->format); - return -EPERM; - } - } else { - ret = marker_set_format(entry, elem->format); - if (ret) - return ret; - } - - /* - * probe_cb setup (statically known) is done here. It is - * asynchronous with the rest of execution, therefore we only - * pass from a "safe" callback (with argument) to an "unsafe" - * callback (does not set arguments). - */ - elem->call = entry->call; - /* - * Sanity check : - * We only update the single probe private data when the ptr is - * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) - */ - WARN_ON(elem->single.func != __mark_empty_function - && elem->single.probe_private != entry->single.probe_private - && !elem->ptype); - elem->single.probe_private = entry->single.probe_private; - /* - * Make sure the private data is valid when we update the - * single probe ptr. - */ - smp_wmb(); - elem->single.func = entry->single.func; - /* - * We also make sure that the new probe callbacks array is consistent - * before setting a pointer to it. - */ - rcu_assign_pointer(elem->multi, entry->multi); - /* - * Update the function or multi probe array pointer before setting the - * ptype. - */ - smp_wmb(); - elem->ptype = entry->ptype; - - if (elem->tp_name && (active ^ elem->state)) { - WARN_ON(!elem->tp_cb); - /* - * It is ok to directly call the probe registration because type - * checking has been done in the __trace_mark_tp() macro. - */ - - if (active) { - /* - * try_module_get should always succeed because we hold - * lock_module() to get the tp_cb address. - */ - ret = try_module_get(__module_text_address( - (unsigned long)elem->tp_cb)); - BUG_ON(!ret); - ret = tracepoint_probe_register_noupdate( - elem->tp_name, - elem->tp_cb); - } else { - ret = tracepoint_probe_unregister_noupdate( - elem->tp_name, - elem->tp_cb); - /* - * tracepoint_probe_update_all() must be called - * before the module containing tp_cb is unloaded. - */ - module_put(__module_text_address( - (unsigned long)elem->tp_cb)); - } - } - elem->state = active; - - return ret; -} - -/* - * Disable a marker and its probe callback. - * Note: only waiting an RCU period after setting elem->call to the empty - * function insures that the original callback is not used anymore. This insured - * by rcu_read_lock_sched around the call site. - */ -static void disable_marker(struct marker *elem) -{ - int ret; - - /* leave "call" as is. It is known statically. */ - if (elem->tp_name && elem->state) { - WARN_ON(!elem->tp_cb); - /* - * It is ok to directly call the probe registration because type - * checking has been done in the __trace_mark_tp() macro. - */ - ret = tracepoint_probe_unregister_noupdate(elem->tp_name, - elem->tp_cb); - WARN_ON(ret); - /* - * tracepoint_probe_update_all() must be called - * before the module containing tp_cb is unloaded. - */ - module_put(__module_text_address((unsigned long)elem->tp_cb)); - } - elem->state = 0; - elem->single.func = __mark_empty_function; - /* Update the function before setting the ptype */ - smp_wmb(); - elem->ptype = 0; /* single probe */ - /* - * Leave the private data and id there, because removal is racy and - * should be done only after an RCU period. These are never used until - * the next initialization anyway. - */ -} - -/** - * marker_update_probe_range - Update a probe range - * @begin: beginning of the range - * @end: end of the range - * - * Updates the probe callback corresponding to a range of markers. - */ -void marker_update_probe_range(struct marker *begin, - struct marker *end) -{ - struct marker *iter; - struct marker_entry *mark_entry; - - mutex_lock(&markers_mutex); - for (iter = begin; iter < end; iter++) { - mark_entry = get_marker(iter->name); - if (mark_entry) { - set_marker(mark_entry, iter, !!mark_entry->refcount); - /* - * ignore error, continue - */ - } else { - disable_marker(iter); - } - } - mutex_unlock(&markers_mutex); -} - -/* - * Update probes, removing the faulty probes. - * - * Internal callback only changed before the first probe is connected to it. - * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 - * transitions. All other transitions will leave the old private data valid. - * This makes the non-atomicity of the callback/private data updates valid. - * - * "special case" updates : - * 0 -> 1 callback - * 1 -> 0 callback - * 1 -> 2 callbacks - * 2 -> 1 callbacks - * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. - * Site effect : marker_set_format may delete the marker entry (creating a - * replacement). - */ -static void marker_update_probes(void) -{ - /* Core kernel markers */ - marker_update_probe_range(__start___markers, __stop___markers); - /* Markers in modules. */ - module_update_markers(); - tracepoint_probe_update_all(); -} - -/** - * marker_probe_register - Connect a probe to a marker - * @name: marker name - * @format: format string - * @probe: probe handler - * @probe_private: probe private data - * - * private data must be a valid allocated memory address, or NULL. - * Returns 0 if ok, error value on error. - * The probe address must at least be aligned on the architecture pointer size. - */ -int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - int ret = 0; - struct marker_probe_closure *old; - - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) { - entry = add_marker(name, format); - if (IS_ERR(entry)) - ret = PTR_ERR(entry); - } else if (format) { - if (!entry->format) - ret = marker_set_format(entry, format); - else if (strcmp(entry->format, format)) - ret = -EPERM; - } - if (ret) - goto end; - - /* - * If we detect that a call_rcu is pending for this marker, - * make sure it's executed now. - */ - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_add_probe(entry, probe, probe_private); - if (IS_ERR(old)) { - ret = PTR_ERR(old); - goto end; - } - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_register); - -/** - * marker_probe_unregister - Disconnect a probe from a marker - * @name: marker name - * @probe: probe function pointer - * @probe_private: probe private data - * - * Returns the private data given to marker_probe_register, or an ERR_PTR(). - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. - */ -int marker_probe_unregister(const char *name, - marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - struct marker_probe_closure *old; - int ret = -ENOENT; - - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_remove_probe(entry, probe, probe_private); - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); - remove_marker(name); /* Ignore busy error message */ - ret = 0; -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_unregister); - -static struct marker_entry * -get_marker_from_private_data(marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - unsigned int i; - struct hlist_head *head; - struct hlist_node *node; - - for (i = 0; i < MARKER_TABLE_SIZE; i++) { - head = &marker_table[i]; - hlist_for_each_entry(entry, node, head, hlist) { - if (!entry->ptype) { - if (entry->single.func == probe - && entry->single.probe_private - == probe_private) - return entry; - } else { - struct marker_probe_closure *closure; - closure = entry->multi; - for (i = 0; closure[i].func; i++) { - if (closure[i].func == probe && - closure[i].probe_private - == probe_private) - return entry; - } - } - } - } - return NULL; -} - -/** - * marker_probe_unregister_private_data - Disconnect a probe from a marker - * @probe: probe function - * @probe_private: probe private data - * - * Unregister a probe by providing the registered private data. - * Only removes the first marker found in hash table. - * Return 0 on success or error value. - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. - */ -int marker_probe_unregister_private_data(marker_probe_func *probe, - void *probe_private) -{ - struct marker_entry *entry; - int ret = 0; - struct marker_probe_closure *old; - - mutex_lock(&markers_mutex); - entry = get_marker_from_private_data(probe, probe_private); - if (!entry) { - ret = -ENOENT; - goto end; - } - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_remove_probe(entry, NULL, probe_private); - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker_from_private_data(probe, probe_private); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); - remove_marker(entry->name); /* Ignore busy error message */ -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); - -/** - * marker_get_private_data - Get a marker's probe private data - * @name: marker name - * @probe: probe to match - * @num: get the nth matching probe's private data - * - * Returns the nth private data pointer (starting from 0) matching, or an - * ERR_PTR. - * Returns the private data pointer, or an ERR_PTR. - * The private data pointer should _only_ be dereferenced if the caller is the - * owner of the data, or its content could vanish. This is mostly used to - * confirm that a caller is the owner of a registered probe. - */ -void *marker_get_private_data(const char *name, marker_probe_func *probe, - int num) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - size_t name_len = strlen(name) + 1; - u32 hash = jhash(name, name_len-1, 0); - int i; - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - if (!e->ptype) { - if (num == 0 && e->single.func == probe) - return e->single.probe_private; - } else { - struct marker_probe_closure *closure; - int match = 0; - closure = e->multi; - for (i = 0; closure[i].func; i++) { - if (closure[i].func != probe) - continue; - if (match++ == num) - return closure[i].probe_private; - } - } - break; - } - } - return ERR_PTR(-ENOENT); -} -EXPORT_SYMBOL_GPL(marker_get_private_data); - -#ifdef CONFIG_MODULES - -int marker_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - break; - case MODULE_STATE_GOING: - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - break; - } - return 0; -} - -struct notifier_block marker_module_nb = { - .notifier_call = marker_module_notify, - .priority = 0, -}; - -static int init_markers(void) -{ - return register_module_notifier(&marker_module_nb); -} -__initcall(init_markers); - -#endif /* CONFIG_MODULES */ diff --git a/kernel/module.c b/kernel/module.c index 46580ed..5a29397 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -47,6 +47,7 @@ #include <linux/rculist.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> +#include <asm/mmu_context.h> #include <linux/license.h> #include <asm/sections.h> #include <linux/tracepoint.h> @@ -369,7 +370,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -394,7 +395,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -540,7 +541,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -1535,6 +1536,10 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ module_free(mod, mod->module_core); + +#ifdef CONFIG_MPU + update_protections(current->mm); +#endif } void *__symbol_get(const char *symbol) @@ -1792,6 +1797,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } } +static void free_modinfo(struct module *mod) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->free) + attr->free(mod); + } +} + #ifdef CONFIG_KALLSYMS /* lookup symbol in given range of kernel_symbols */ @@ -1857,13 +1873,93 @@ static char elf_type(const Elf_Sym *sym, return '?'; } +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF + || src->st_shndx >= shnum + || !src->st_name) + return false; + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return true; +} + +static unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Ehdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ + unsigned long symoffs; + Elf_Shdr *symsect = sechdrs + symindex; + Elf_Shdr *strsect = sechdrs + strindex; + const Elf_Sym *src; + const char *strtab; + unsigned int i, nsrc, ndst; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + symsect->sh_name); + + src = (void *)hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + strtab = (void *)hdr + strsect->sh_offset; + for (ndst = i = 1; i < nsrc; ++i, ++src) + if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + unsigned int j = src->st_name; + + while(!__test_and_set_bit(j, strmap) && strtab[j]) + ++j; + ++ndst; + } + + /* Append room for core symbols at end of core part. */ + symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + strsect->sh_name); + + /* Append room for core symbols' strings at end of core part. */ + *pstroffs = mod->core_size; + __set_bit(0, strmap); + mod->core_size += bitmap_weight(strmap, strsect->sh_size); + + return symoffs; +} + static void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + unsigned long *strmap) { - unsigned int i; + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; mod->symtab = (void *)sechdrs[symindex].sh_addr; mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); @@ -1873,13 +1969,44 @@ static void add_kallsyms(struct module *mod, for (i = 0; i < mod->num_symtab; i++) mod->symtab[i].st_info = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + + mod->core_symtab = dst = mod->module_core + symoffs; + src = mod->symtab; + *dst = *src; + for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { + if (!is_core_symbol(src, sechdrs, shnum)) + continue; + dst[ndst] = *src; + dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + ++ndst; + } + mod->core_num_syms = ndst; + + mod->core_strtab = s = mod->module_core + stroffs; + for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) + if (test_bit(i, strmap)) + *++s = mod->strtab[i]; } #else +static inline unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Hdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ +} static inline void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + const unsigned long *strmap) { } #endif /* CONFIG_KALLSYMS */ @@ -1954,6 +2081,9 @@ static noinline struct module *load_module(void __user *umod, struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ +#ifdef CONFIG_KALLSYMS + unsigned long symoffs, stroffs, *strmap; +#endif mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2035,11 +2165,6 @@ static noinline struct module *load_module(void __user *umod, /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; -#ifdef CONFIG_KALLSYMS - /* Keep symbol and string tables for decoding later. */ - sechdrs[symindex].sh_flags |= SHF_ALLOC; - sechdrs[strindex].sh_flags |= SHF_ALLOC; -#endif /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2075,6 +2200,13 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) + * sizeof(long), GFP_KERNEL); + if (!strmap) { + err = -ENOMEM; + goto free_mod; + } + if (find_module(mod->name)) { err = -EEXIST; goto free_mod; @@ -2104,6 +2236,8 @@ static noinline struct module *load_module(void __user *umod, this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, hdr, sechdrs, secstrings); + symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, + secstrings, &stroffs, strmap); /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2237,10 +2371,6 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->ctors), &mod->num_ctors); #endif -#ifdef CONFIG_MARKERS - mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", - sizeof(*mod->markers), &mod->num_markers); -#endif #ifdef CONFIG_TRACEPOINTS mod->tracepoints = section_objs(hdr, sechdrs, secstrings, "__tracepoints", @@ -2312,7 +2442,10 @@ static noinline struct module *load_module(void __user *umod, percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, sechdrs[pcpuindex].sh_size); - add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, + symoffs, stroffs, secstrings, strmap); + kfree(strmap); + strmap = NULL; if (!mod->taints) { struct _ddebug *debug; @@ -2384,13 +2517,14 @@ static noinline struct module *load_module(void __user *umod, synchronize_sched(); module_arch_cleanup(mod); cleanup: + free_modinfo(mod); kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - free_init: percpu_modfree(mod->refptr); + free_init: #endif module_free(mod, mod->module_init); free_core: @@ -2401,6 +2535,7 @@ static noinline struct module *load_module(void __user *umod, percpu_modfree(percpu); free_mod: kfree(args); + kfree(strmap); free_hdr: vfree(hdr); return ERR_PTR(err); @@ -2490,6 +2625,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, /* Drop initial reference. */ module_put(mod); trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; @@ -2958,20 +3098,6 @@ void module_layout(struct module *mod, EXPORT_SYMBOL(module_layout); #endif -#ifdef CONFIG_MARKERS -void module_update_markers(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) - if (!mod->taints) - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - mutex_unlock(&module_mutex); -} -#endif - #ifdef CONFIG_TRACEPOINTS void module_update_tracepoints(void) { diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 5aa854f..2a5dfec 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) * (hence either you are in the same cgroup as task, or in an * ancestor cgroup thereof) */ -static int ns_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, struct task_struct *task) +static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, + struct task_struct *task, bool threadgroup) { if (current != task) { if (!capable(CAP_SYS_ADMIN)) @@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss, if (!cgroup_is_descendant(new_cgroup, task)) return -EPERM; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (!cgroup_is_descendant(new_cgroup, c)) { + rcu_read_unlock(); + return -EPERM; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/panic.c b/kernel/panic.c index 512ab73..bcdef26 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -177,7 +177,7 @@ static const struct tnt tnts[] = { * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. * - * The string is overwritten by the next call to print_taint(). + * The string is overwritten by the next call to print_tainted(). */ const char *print_tainted(void) { diff --git a/kernel/params.c b/kernel/params.c index 7f6912c..9da58ea 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,7 @@ #include <linux/device.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/ctype.h> #if 0 #define DEBUGP printk @@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val) } for (i = 0; args[i]; i++) { - if (args[i] == ' ' && !in_quote) + if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') @@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val) next = args + i; /* Chew up trailing spaces. */ - while (*next == ' ') + while (isspace(*next)) next++; return next; } @@ -138,7 +139,7 @@ int parse_args(const char *name, DEBUGP("Parsing ARGS: %s\n", args); /* Chew leading spaces */ - while (*args == ' ') + while (isspace(*args)) args++; while (*args) { diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c deleted file mode 100644 index e0d91fd..0000000 --- a/kernel/perf_counter.c +++ /dev/null @@ -1,4962 +0,0 @@ -/* - * Performance counter core code - * - * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING - */ - -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/file.h> -#include <linux/poll.h> -#include <linux/sysfs.h> -#include <linux/dcache.h> -#include <linux/percpu.h> -#include <linux/ptrace.h> -#include <linux/vmstat.h> -#include <linux/hardirq.h> -#include <linux/rculist.h> -#include <linux/uaccess.h> -#include <linux/syscalls.h> -#include <linux/anon_inodes.h> -#include <linux/kernel_stat.h> -#include <linux/perf_counter.h> - -#include <asm/irq_regs.h> - -/* - * Each CPU has a list of per CPU counters: - */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); - -int perf_max_counters __read_mostly = 1; -static int perf_reserved_percpu __read_mostly; -static int perf_overcommit __read_mostly = 1; - -static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_comm_counters __read_mostly; -static atomic_t nr_task_counters __read_mostly; - -/* - * perf counter paranoia level: - * -1 - not paranoid at all - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu counters for unpriv - * 2 - disallow kernel profiling for unpriv - */ -int sysctl_perf_counter_paranoid __read_mostly = 1; - -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_counter_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_counter_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_counter_paranoid > 1; -} - -int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ - -/* - * max perf counter sample rate - */ -int sysctl_perf_counter_sample_rate __read_mostly = 100000; - -static atomic64_t perf_counter_id; - -/* - * Lock for (sysadmin-configurable) counter reservations: - */ -static DEFINE_SPINLOCK(perf_resource_lock); - -/* - * Architecture provided APIs - weak aliases: - */ -extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} - -void __weak hw_perf_disable(void) { barrier(); } -void __weak hw_perf_enable(void) { barrier(); } - -void __weak hw_perf_counter_setup(int cpu) { barrier(); } -void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } - -int __weak -hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) -{ - return 0; -} - -void __weak perf_counter_print_debug(void) { } - -static DEFINE_PER_CPU(int, disable_count); - -void __perf_disable(void) -{ - __get_cpu_var(disable_count)++; -} - -bool __perf_enable(void) -{ - return !--__get_cpu_var(disable_count); -} - -void perf_disable(void) -{ - __perf_disable(); - hw_perf_disable(); -} - -void perf_enable(void) -{ - if (__perf_enable()) - hw_perf_enable(); -} - -static void get_ctx(struct perf_counter_context *ctx) -{ - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); -} - -static void free_ctx(struct rcu_head *head) -{ - struct perf_counter_context *ctx; - - ctx = container_of(head, struct perf_counter_context, rcu_head); - kfree(ctx); -} - -static void put_ctx(struct perf_counter_context *ctx) -{ - if (atomic_dec_and_test(&ctx->refcount)) { - if (ctx->parent_ctx) - put_ctx(ctx->parent_ctx); - if (ctx->task) - put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); - } -} - -static void unclone_ctx(struct perf_counter_context *ctx) -{ - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } -} - -/* - * If we inherit counters we want to return the parent counter id - * to userspace. - */ -static u64 primary_counter_id(struct perf_counter *counter) -{ - u64 id = counter->id; - - if (counter->parent) - id = counter->parent->id; - - return id; -} - -/* - * Get the perf_counter_context for a task and lock it. - * This has to cope with with the fact that until it is locked, - * the context could get moved to another task. - */ -static struct perf_counter_context * -perf_lock_task_context(struct task_struct *task, unsigned long *flags) -{ - struct perf_counter_context *ctx; - - rcu_read_lock(); - retry: - ctx = rcu_dereference(task->perf_counter_ctxp); - if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_counter_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more. - */ - spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - goto retry; - } - - if (!atomic_inc_not_zero(&ctx->refcount)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - ctx = NULL; - } - } - rcu_read_unlock(); - return ctx; -} - -/* - * Get the context for a task and increment its pin_count so it - * can't get swapped to another task. This also increments its - * reference count so that the context can't get freed. - */ -static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) -{ - struct perf_counter_context *ctx; - unsigned long flags; - - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - ++ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - } - return ctx; -} - -static void perf_unpin_context(struct perf_counter_context *ctx) -{ - unsigned long flags; - - spin_lock_irqsave(&ctx->lock, flags); - --ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); -} - -/* - * Add a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *group_leader = counter->group_leader; - - /* - * Depending on whether it is a standalone or sibling counter, - * add it straight to the context's counter list, or to the group - * leader's sibling list: - */ - if (group_leader == counter) - list_add_tail(&counter->list_entry, &ctx->counter_list); - else { - list_add_tail(&counter->list_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; - } - - list_add_rcu(&counter->event_entry, &ctx->event_list); - ctx->nr_counters++; - if (counter->attr.inherit_stat) - ctx->nr_stat++; -} - -/* - * Remove a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *sibling, *tmp; - - if (list_empty(&counter->list_entry)) - return; - ctx->nr_counters--; - if (counter->attr.inherit_stat) - ctx->nr_stat--; - - list_del_init(&counter->list_entry); - list_del_rcu(&counter->event_entry); - - if (counter->group_leader != counter) - counter->group_leader->nr_siblings--; - - /* - * If this was a group counter with sibling counters then - * upgrade the siblings to singleton counters by adding them - * to the context list directly: - */ - list_for_each_entry_safe(sibling, tmp, - &counter->sibling_list, list_entry) { - - list_move_tail(&sibling->list_entry, &ctx->counter_list); - sibling->group_leader = sibling; - } -} - -static void -counter_sched_out(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - if (counter->pending_disable) { - counter->pending_disable = 0; - counter->state = PERF_COUNTER_STATE_OFF; - } - counter->tstamp_stopped = ctx->time; - counter->pmu->disable(counter); - counter->oncpu = -1; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu--; - ctx->nr_active--; - if (counter->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; -} - -static void -group_sched_out(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter_sched_out(group_counter, cpuctx, ctx); - - /* - * Schedule out siblings (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) - counter_sched_out(counter, cpuctx, ctx); - - if (group_counter->attr.exclusive) - cpuctx->exclusive = 0; -} - -/* - * Cross CPU call to remove a performance counter - * - * We disable the counter on the hardware level first. After that we - * remove it from the context list. - */ -static void __perf_counter_remove_from_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. - */ - perf_disable(); - - counter_sched_out(counter, cpuctx, ctx); - - list_del_counter(counter, ctx); - - if (!ctx->task) { - /* - * Allow more per task counters with respect to the - * reservation: - */ - cpuctx->max_pertask = - min(perf_max_counters - ctx->nr_counters, - perf_max_counters - perf_reserved_percpu); - } - - perf_enable(); - spin_unlock(&ctx->lock); -} - - -/* - * Remove the counter from a task's (or a CPU's) list of counters. - * - * Must be called with ctx->mutex held. - * - * CPU counters are removed with a smp call. For task counters we only - * call when the task is on a CPU. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This is OK when called from perf_release since - * that only calls us on the top-level context, which can't be a clone. - * When called from perf_counter_exit_task, it's OK because the - * context has been detached from its task. - */ -static void perf_counter_remove_from_context(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are removed via an smp call and - * the removal is always sucessful. - */ - smp_call_function_single(counter->cpu, - __perf_counter_remove_from_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_counter_remove_from_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * If the context is active we need to retry the smp call. - */ - if (ctx->nr_active && !list_empty(&counter->list_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if the call above did not - * succeed. - */ - if (!list_empty(&counter->list_entry)) { - list_del_counter(counter, ctx); - } - spin_unlock_irq(&ctx->lock); -} - -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_counter_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a counter. - */ -static void update_counter_times(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - u64 run_end; - - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE) - return; - - counter->total_time_enabled = ctx->time - counter->tstamp_enabled; - - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; - else - run_end = ctx->time; - - counter->total_time_running = run_end - counter->tstamp_running; -} - -/* - * Update total_time_enabled and total_time_running for all counters in a group. - */ -static void update_group_times(struct perf_counter *leader) -{ - struct perf_counter *counter; - - update_counter_times(leader); - list_for_each_entry(counter, &leader->sibling_list, list_entry) - update_counter_times(counter); -} - -/* - * Cross CPU call to disable a performance counter - */ -static void __perf_counter_disable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - - /* - * If the counter is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - update_context_time(ctx); - update_group_times(counter); - if (counter == counter->group_leader) - group_sched_out(counter, cpuctx, ctx); - else - counter_sched_out(counter, cpuctx, ctx); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock(&ctx->lock); -} - -/* - * Disable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through - * perf_counter_for_each_child or perf_counter_for_each because they - * hold the top-level counter's child_mutex, so any descendant that - * goes to exit will block in sync_child_counter. - * When called from perf_pending_counter it's OK because counter->ctx - * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_counter_task_sched_out for this context. - */ -static void perf_counter_disable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_disable, - counter, 1); - return; - } - - retry: - task_oncpu_function_call(task, __perf_counter_disable, counter); - - spin_lock_irq(&ctx->lock); - /* - * If the counter is still active, we need to retry the cross-call. - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock_irq(&ctx->lock); -} - -static int -counter_sched_in(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - if (counter->state <= PERF_COUNTER_STATE_OFF) - return 0; - - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - - if (counter->pmu->enable(counter)) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->oncpu = -1; - return -EAGAIN; - } - - counter->tstamp_running += ctx->time - counter->tstamp_stopped; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu++; - ctx->nr_active++; - - if (counter->attr.exclusive) - cpuctx->exclusive = 1; - - return 0; -} - -static int -group_sched_in(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - struct perf_counter *counter, *partial_group; - int ret; - - if (group_counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); - if (ret) - return ret < 0 ? ret : 0; - - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) - return -EAGAIN; - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; - goto group_error; - } - } - - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter == partial_group) - break; - counter_sched_out(counter, cpuctx, ctx); - } - counter_sched_out(group_counter, cpuctx, ctx); - - return -EAGAIN; -} - -/* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. - */ -static int is_software_only_group(struct perf_counter *leader) -{ - struct perf_counter *counter; - - if (!is_software_counter(leader)) - return 0; - - list_for_each_entry(counter, &leader->sibling_list, list_entry) - if (!is_software_counter(counter)) - return 0; - - return 1; -} - -/* - * Work out whether we can put this counter group on the CPU now. - */ -static int group_can_go_on(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - int can_add_hw) -{ - /* - * Groups consisting entirely of software counters can always go on. - */ - if (is_software_only_group(counter)) - return 1; - /* - * If an exclusive group is already on, no other hardware - * counters can go on. - */ - if (cpuctx->exclusive) - return 0; - /* - * If this group is exclusive and there are already - * counters on the CPU, it can't go on. - */ - if (counter->attr.exclusive && cpuctx->active_oncpu) - return 0; - /* - * Otherwise, try to add it if all previous groups were able - * to go on. - */ - return can_add_hw; -} - -static void add_counter_to_ctx(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - list_add_counter(counter, ctx); - counter->tstamp_enabled = ctx->time; - counter->tstamp_running = ctx->time; - counter->tstamp_stopped = ctx->time; -} - -/* - * Cross CPU call to install and enable a performance counter - * - * Must be called with ctx->mutex held - */ -static void __perf_install_in_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int cpu = smp_processor_id(); - int err; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no counters. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. - */ - perf_disable(); - - add_counter_to_ctx(counter, ctx); - - /* - * Don't put the counter on if it is disabled or if - * it is in a group and the group isn't on. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE || - (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) - goto unlock; - - /* - * An exclusive counter can't go on if there are already active - * hardware counters, and no hardware counter can go on if there - * is already an exclusive counter on. - */ - if (!group_can_go_on(counter, cpuctx, 1)) - err = -EEXIST; - else - err = counter_sched_in(counter, cpuctx, ctx, cpu); - - if (err) { - /* - * This counter couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the counter group is pinned then put it in error state. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - if (!err && !ctx->task && cpuctx->max_pertask) - cpuctx->max_pertask--; - - unlock: - perf_enable(); - - spin_unlock(&ctx->lock); -} - -/* - * Attach a performance counter to a context - * - * First we add the counter to the list with the hardware enable bit - * in counter->hw_config cleared. - * - * If the counter is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. - */ -static void -perf_install_in_context(struct perf_counter_context *ctx, - struct perf_counter *counter, - int cpu) -{ - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are installed via an smp call and - * the install is always sucessful. - */ - smp_call_function_single(cpu, __perf_install_in_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_install_in_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * we need to retry the smp call. - */ - if (ctx->is_active && list_empty(&counter->list_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can add the counter safely, if it the call above did not - * succeed. - */ - if (list_empty(&counter->list_entry)) - add_counter_to_ctx(counter, ctx); - spin_unlock_irq(&ctx->lock); -} - -/* - * Put a counter into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_counter_mark_enabled(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - struct perf_counter *sub; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - list_for_each_entry(sub, &counter->sibling_list, list_entry) - if (sub->state >= PERF_COUNTER_STATE_INACTIVE) - sub->tstamp_enabled = - ctx->time - sub->total_time_enabled; -} - -/* - * Cross CPU call to enable a performance counter - */ -static void __perf_counter_enable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int err; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto unlock; - __perf_counter_mark_enabled(counter, ctx); - - /* - * If the counter is in a group and isn't the group leader, - * then don't put it on unless the group is on. - */ - if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(counter, cpuctx, 1)) { - err = -EEXIST; - } else { - perf_disable(); - if (counter == leader) - err = group_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - else - err = counter_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - perf_enable(); - } - - if (err) { - /* - * If this counter can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - unlock: - spin_unlock(&ctx->lock); -} - -/* - * Enable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisfied when called through - * perf_counter_for_each_child or perf_counter_for_each as described - * for perf_counter_disable. - */ -static void perf_counter_enable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Enable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_enable, - counter, 1); - return; - } - - spin_lock_irq(&ctx->lock); - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto out; - - /* - * If the counter is in error state, clear that first. - * That way, if we see the counter in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - counter->state = PERF_COUNTER_STATE_OFF; - - retry: - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_counter_enable, counter); - - spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the counter is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) - goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_OFF) - __perf_counter_mark_enabled(counter, ctx); - - out: - spin_unlock_irq(&ctx->lock); -} - -static int perf_counter_refresh(struct perf_counter *counter, int refresh) -{ - /* - * not supported on inherited counters - */ - if (counter->attr.inherit) - return -EINVAL; - - atomic_add(refresh, &counter->event_limit); - perf_counter_enable(counter); - - return 0; -} - -void __perf_counter_sched_out(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_counter *counter; - - spin_lock(&ctx->lock); - ctx->is_active = 0; - if (likely(!ctx->nr_counters)) - goto out; - update_context_time(ctx); - - perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter != counter->group_leader) - counter_sched_out(counter, cpuctx, ctx); - else - group_sched_out(counter, cpuctx, ctx); - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled counters. - * If the number of enabled counters is the same, then the set - * of enabled counters should be the same, because these are both - * inherited contexts, therefore we can't access individual counters - * in them directly with an fd; we can only enable/disable all - * counters via prctl, or enable/disable all counters in a family - * via ioctl, which will have the same effect on both contexts. - */ -static int context_equiv(struct perf_counter_context *ctx1, - struct perf_counter_context *ctx2) -{ - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; -} - -static void __perf_counter_read(void *counter); - -static void __perf_counter_sync_stat(struct perf_counter *counter, - struct perf_counter *next_counter) -{ - u64 value; - - if (!counter->attr.inherit_stat) - return; - - /* - * Update the counter value, we cannot use perf_counter_read() - * because we're in the middle of a context switch and have IRQs - * disabled, which upsets smp_call_function_single(), however - * we know the counter must be on the current CPU, therefore we - * don't need to use it. - */ - switch (counter->state) { - case PERF_COUNTER_STATE_ACTIVE: - __perf_counter_read(counter); - break; - - case PERF_COUNTER_STATE_INACTIVE: - update_counter_times(counter); - break; - - default: - break; - } - - /* - * In order to keep per-task stats reliable we need to flip the counter - * values when we flip the contexts. - */ - value = atomic64_read(&next_counter->count); - value = atomic64_xchg(&counter->count, value); - atomic64_set(&next_counter->count, value); - - swap(counter->total_time_enabled, next_counter->total_time_enabled); - swap(counter->total_time_running, next_counter->total_time_running); - - /* - * Since we swizzled the values, update the user visible data too. - */ - perf_counter_update_userpage(counter); - perf_counter_update_userpage(next_counter); -} - -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - -static void perf_counter_sync_stat(struct perf_counter_context *ctx, - struct perf_counter_context *next_ctx) -{ - struct perf_counter *counter, *next_counter; - - if (!ctx->nr_stat) - return; - - counter = list_first_entry(&ctx->event_list, - struct perf_counter, event_entry); - - next_counter = list_first_entry(&next_ctx->event_list, - struct perf_counter, event_entry); - - while (&counter->event_entry != &ctx->event_list && - &next_counter->event_entry != &next_ctx->event_list) { - - __perf_counter_sync_stat(counter, next_counter); - - counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(next_counter, event_entry); - } -} - -/* - * Called from scheduler to remove the counters of the current task, - * with interrupts disabled. - * - * We stop each counter and update the counter value in counter->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * not restart the counter. - */ -void perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter_context *next_ctx; - struct perf_counter_context *parent; - struct pt_regs *regs; - int do_switch = 1; - - regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); - - if (likely(!ctx || !cpuctx->task_ctx)) - return; - - update_context_time(ctx); - - rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_counter_ctxp; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { - /* - * Looks like the two contexts are clones, so we might be - * able to optimize the context switch. We lock both - * contexts and check that they are clones under the - * lock (including re-checking that neither has been - * uncloned in the meantime). It doesn't matter which - * order we take the locks because no other cpu could - * be trying to lock both of these tasks. - */ - spin_lock(&ctx->lock); - spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_counter_ctxp - */ - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - do_switch = 0; - - perf_counter_sync_stat(ctx, next_ctx); - } - spin_unlock(&next_ctx->lock); - spin_unlock(&ctx->lock); - } - rcu_read_unlock(); - - if (do_switch) { - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; - } -} - -/* - * Called with IRQs disabled - */ -static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; -} - -/* - * Called with IRQs disabled - */ -static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) -{ - __perf_counter_sched_out(&cpuctx->ctx, cpuctx); -} - -static void -__perf_counter_sched_in(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter *counter; - int can_add_hw = 1; - - spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_counters)) - goto out; - - ctx->timestamp = perf_clock(); - - perf_disable(); - - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->attr.pinned) - continue; - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) - counter_sched_in(counter, cpuctx, ctx, cpu); - else { - if (group_can_go_on(counter, cpuctx, 1)) - group_sched_in(counter, cpuctx, ctx, cpu); - } - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_ERROR; - } - } - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - /* - * Ignore counters in OFF or ERROR state, and - * ignore pinned counters since we did them already. - */ - if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->attr.pinned) - continue; - - /* - * Listen to the 'cpu' scheduling filter constraint - * of counters: - */ - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } else { - if (group_can_go_on(counter, cpuctx, can_add_hw)) { - if (group_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Called from scheduler to add the counters of the current task - * with interrupts disabled. - * - * We restore the counter value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * keep the counter running. - */ -void perf_counter_task_sched_in(struct task_struct *task, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - - if (likely(!ctx)) - return; - if (cpuctx->task_ctx == ctx) - return; - __perf_counter_sched_in(ctx, cpuctx, cpu); - cpuctx->task_ctx = ctx; -} - -static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter_context *ctx = &cpuctx->ctx; - - __perf_counter_sched_in(ctx, cpuctx, cpu); -} - -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_counter *counter, int enable); - -static void perf_adjust_period(struct perf_counter *counter, u64 events) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period, sample_period; - s64 delta; - - events *= hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); - - delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; - - hwc->sample_period = sample_period; -} - -static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - struct hw_perf_counter *hwc; - u64 interrupts, freq; - - spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - continue; - - hwc = &counter->hw; - - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle counters on the tick - */ - if (interrupts == MAX_INTERRUPTS) { - perf_log_throttle(counter, 1); - counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_sample_rate/HZ; - } - - if (!counter->attr.freq || !counter->attr.sample_freq) - continue; - - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (counter->attr.sample_freq < HZ) { - freq = counter->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(counter, freq * interrupts); - - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - counter->pmu->disable(counter); - atomic64_set(&hwc->period_left, 0); - counter->pmu->enable(counter); - perf_enable(); - } - } - spin_unlock(&ctx->lock); -} - -/* - * Round-robin a context's counters: - */ -static void rotate_ctx(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (!ctx->nr_counters) - return; - - spin_lock(&ctx->lock); - /* - * Rotate the first entry last (works just fine for group counters too): - */ - perf_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - list_move_tail(&counter->list_entry, &ctx->counter_list); - break; - } - perf_enable(); - - spin_unlock(&ctx->lock); -} - -void perf_counter_task_tick(struct task_struct *curr, int cpu) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - - if (!atomic_read(&nr_counters)) - return; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = curr->perf_counter_ctxp; - - perf_ctx_adjust_freq(&cpuctx->ctx); - if (ctx) - perf_ctx_adjust_freq(ctx); - - perf_counter_cpu_sched_out(cpuctx); - if (ctx) - __perf_counter_task_sched_out(ctx); - - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); - - perf_counter_cpu_sched_in(cpuctx, cpu); - if (ctx) - perf_counter_task_sched_in(curr, cpu); -} - -/* - * Enable all of a task's counters that have been marked enable-on-exec. - * This expects task == current. - */ -static void perf_counter_enable_on_exec(struct task_struct *task) -{ - struct perf_counter_context *ctx; - struct perf_counter *counter; - unsigned long flags; - int enabled = 0; - - local_irq_save(flags); - ctx = task->perf_counter_ctxp; - if (!ctx || !ctx->nr_counters) - goto out; - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (!counter->attr.enable_on_exec) - continue; - counter->attr.enable_on_exec = 0; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - continue; - __perf_counter_mark_enabled(counter, ctx); - enabled = 1; - } - - /* - * Unclone this context if we enabled any counter. - */ - if (enabled) - unclone_ctx(ctx); - - spin_unlock(&ctx->lock); - - perf_counter_task_sched_in(task, smp_processor_id()); - out: - local_irq_restore(flags); -} - -/* - * Cross CPU call to read the hardware counter - */ -static void __perf_counter_read(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - unsigned long flags; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. In that case - * counter->count would have been updated to a recent sample - * when the counter was scheduled out. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - counter->pmu->read(counter); - update_counter_times(counter); - local_irq_restore(flags); -} - -static u64 perf_counter_read(struct perf_counter *counter) -{ - /* - * If counter is enabled and currently active on a CPU, update the - * value in the counter structure: - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - smp_call_function_single(counter->oncpu, - __perf_counter_read, counter, 1); - } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); - } - - return atomic64_read(&counter->count); -} - -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); - INIT_LIST_HEAD(&ctx->event_list); - atomic_set(&ctx->refcount, 1); - ctx->task = task; -} - -static struct perf_counter_context *find_get_context(pid_t pid, int cpu) -{ - struct perf_counter_context *ctx; - struct perf_cpu_context *cpuctx; - struct task_struct *task; - unsigned long flags; - int err; - - /* - * If cpu is not a wildcard then this is a percpu counter: - */ - if (cpu != -1) { - /* Must be root to operate on a CPU counter: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - - if (cpu < 0 || cpu > num_possible_cpus()) - return ERR_PTR(-EINVAL); - - /* - * We could be clever and allow to attach a counter to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_isset(cpu, cpu_online_map)) - return ERR_PTR(-ENODEV); - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); - - return ctx; - } - - rcu_read_lock(); - if (!pid) - task = current; - else - task = find_task_by_vpid(pid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - if (!task) - return ERR_PTR(-ESRCH); - - /* - * Can't attach counters to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto errout; - - retry: - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - unclone_ctx(ctx); - spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { - ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - err = -ENOMEM; - if (!ctx) - goto errout; - __perf_counter_init_context(ctx, task); - get_ctx(ctx); - if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { - /* - * We raced with some other task; use - * the context they set. - */ - kfree(ctx); - goto retry; - } - get_task_struct(task); - } - - put_task_struct(task); - return ctx; - - errout: - put_task_struct(task); - return ERR_PTR(err); -} - -static void free_counter_rcu(struct rcu_head *head) -{ - struct perf_counter *counter; - - counter = container_of(head, struct perf_counter, rcu_head); - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); -} - -static void perf_pending_sync(struct perf_counter *counter); - -static void free_counter(struct perf_counter *counter) -{ - perf_pending_sync(counter); - - if (!counter->parent) { - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); - if (counter->attr.task) - atomic_dec(&nr_task_counters); - } - - if (counter->output) { - fput(counter->output->filp); - counter->output = NULL; - } - - if (counter->destroy) - counter->destroy(counter); - - put_ctx(counter->ctx); - call_rcu(&counter->rcu_head, free_counter_rcu); -} - -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) -{ - struct perf_counter *counter = file->private_data; - struct perf_counter_context *ctx = counter->ctx; - - file->private_data = NULL; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_counter_remove_from_context(counter); - mutex_unlock(&ctx->mutex); - - mutex_lock(&counter->owner->perf_counter_mutex); - list_del_init(&counter->owner_entry); - mutex_unlock(&counter->owner->perf_counter_mutex); - put_task_struct(counter->owner); - - free_counter(counter); - - return 0; -} - -static int perf_counter_read_size(struct perf_counter *counter) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_GROUP) { - nr += counter->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - - return size; -} - -static u64 perf_counter_read_value(struct perf_counter *counter) -{ - struct perf_counter *child; - u64 total = 0; - - total += perf_counter_read(counter); - list_for_each_entry(child, &counter->child_list, child_list) - total += perf_counter_read(child); - - return total; -} - -static int perf_counter_read_entry(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} - -static int perf_counter_read_group(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - struct perf_counter *leader = counter->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; - - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } - - size = n * sizeof(u64); - - if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_counter_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; - - size += err; - - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - err = perf_counter_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; - - size += err; - } - - return size; -} - -static int perf_counter_read_one(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - u64 values[4]; - int n = 0; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - if (copy_to_user(buf, values, n * sizeof(u64))) - return -EFAULT; - - return n * sizeof(u64); -} - -/* - * Read the performance counter - simple non blocking version for now - */ -static ssize_t -perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) -{ - u64 read_format = counter->attr.read_format; - int ret; - - /* - * Return end-of-file for a read on a counter that is in - * error state (i.e. because it was pinned but it couldn't be - * scheduled on to the CPU at some point). - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - return 0; - - if (count < perf_counter_read_size(counter)) - return -ENOSPC; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - if (read_format & PERF_FORMAT_GROUP) - ret = perf_counter_read_group(counter, read_format, buf); - else - ret = perf_counter_read_one(counter, read_format, buf); - mutex_unlock(&counter->child_mutex); - - return ret; -} - -static ssize_t -perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct perf_counter *counter = file->private_data; - - return perf_read_hw(counter, buf, count); -} - -static unsigned int perf_poll(struct file *file, poll_table *wait) -{ - struct perf_counter *counter = file->private_data; - struct perf_mmap_data *data; - unsigned int events = POLL_HUP; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - events = atomic_xchg(&data->poll, 0); - rcu_read_unlock(); - - poll_wait(file, &counter->waitq, wait); - - return events; -} - -static void perf_counter_reset(struct perf_counter *counter) -{ - (void)perf_counter_read(counter); - atomic64_set(&counter->count, 0); - perf_counter_update_userpage(counter); -} - -/* - * Holding the top-level counter's child_mutex means that any - * descendant process that has inherited this counter will block - * in sync_child_counter if it goes to exit, thus satisfying the - * task existence requirements of perf_counter_enable/disable. - */ -static void perf_counter_for_each_child(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter *child; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - func(counter); - list_for_each_entry(child, &counter->child_list, child_list) - func(child); - mutex_unlock(&counter->child_mutex); -} - -static void perf_counter_for_each(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - counter = counter->group_leader; - - perf_counter_for_each_child(counter, func); - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, list_entry) - perf_counter_for_each_child(counter, func); - mutex_unlock(&ctx->mutex); -} - -static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) -{ - struct perf_counter_context *ctx = counter->ctx; - unsigned long size; - int ret = 0; - u64 value; - - if (!counter->attr.sample_period) - return -EINVAL; - - size = copy_from_user(&value, arg, sizeof(value)); - if (size != sizeof(value)) - return -EFAULT; - - if (!value) - return -EINVAL; - - spin_lock_irq(&ctx->lock); - if (counter->attr.freq) { - if (value > sysctl_perf_counter_sample_rate) { - ret = -EINVAL; - goto unlock; - } - - counter->attr.sample_freq = value; - } else { - counter->attr.sample_period = value; - counter->hw.sample_period = value; - } -unlock: - spin_unlock_irq(&ctx->lock); - - return ret; -} - -int perf_counter_set_output(struct perf_counter *counter, int output_fd); - -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct perf_counter *counter = file->private_data; - void (*func)(struct perf_counter *); - u32 flags = arg; - - switch (cmd) { - case PERF_COUNTER_IOC_ENABLE: - func = perf_counter_enable; - break; - case PERF_COUNTER_IOC_DISABLE: - func = perf_counter_disable; - break; - case PERF_COUNTER_IOC_RESET: - func = perf_counter_reset; - break; - - case PERF_COUNTER_IOC_REFRESH: - return perf_counter_refresh(counter, arg); - - case PERF_COUNTER_IOC_PERIOD: - return perf_counter_period(counter, (u64 __user *)arg); - - case PERF_COUNTER_IOC_SET_OUTPUT: - return perf_counter_set_output(counter, arg); - - default: - return -ENOTTY; - } - - if (flags & PERF_IOC_FLAG_GROUP) - perf_counter_for_each(counter, func); - else - perf_counter_for_each_child(counter, func); - - return 0; -} - -int perf_counter_task_enable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_enable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -int perf_counter_task_disable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_disable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -#ifndef PERF_COUNTER_INDEX_OFFSET -# define PERF_COUNTER_INDEX_OFFSET 0 -#endif - -static int perf_counter_index(struct perf_counter *counter) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return 0; - - return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; -} - -/* - * Callers need to ensure there can be no nesting of this function, otherwise - * the seqlock logic goes bad. We can not serialize this because the arch - * code calls this from NMI context. - */ -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_counter_mmap_page *userpg; - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - userpg = data->user_page; - - /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. - */ - preempt_disable(); - ++userpg->lock; - barrier(); - userpg->index = perf_counter_index(counter); - userpg->offset = atomic64_read(&counter->count); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - userpg->offset -= atomic64_read(&counter->hw.prev_count); - - userpg->time_enabled = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - - userpg->time_running = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - - barrier(); - ++userpg->lock; - preempt_enable(); -unlock: - rcu_read_unlock(); -} - -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct perf_counter *counter = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; - - if ((unsigned)nr > data->nr_pages) - goto unlock; - - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; - - vmf->page = virt_to_page(data->data_pages[nr]); - } - - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - -static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) -{ - struct perf_mmap_data *data; - unsigned long size; - int i; - - WARN_ON(atomic_read(&counter->mmap_count)); - - size = sizeof(struct perf_mmap_data); - size += nr_pages * sizeof(void *); - - data = kzalloc(size, GFP_KERNEL); - if (!data) - goto fail; - - data->user_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->data_pages[i]) - goto fail_data_pages; - } - - data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - - rcu_assign_pointer(counter->data, data); - - return 0; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); - - free_page((unsigned long)data->user_page); - -fail_user_page: - kfree(data); - -fail: - return -ENOMEM; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void __perf_mmap_data_free(struct rcu_head *rcu_head) -{ - struct perf_mmap_data *data; - int i; - - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - - kfree(data); -} - -static void perf_mmap_data_free(struct perf_counter *counter) -{ - struct perf_mmap_data *data = counter->data; - - WARN_ON(atomic_read(&counter->mmap_count)); - - rcu_assign_pointer(counter->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); -} - -static void perf_mmap_open(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - atomic_inc(&counter->mmap_count); -} - -static void perf_mmap_close(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { - struct user_struct *user = current_user(); - - atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= counter->data->nr_locked; - perf_mmap_data_free(counter); - mutex_unlock(&counter->mmap_mutex); - } -} - -static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, -}; - -static int perf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct perf_counter *counter = file->private_data; - unsigned long user_locked, user_lock_limit; - struct user_struct *user = current_user(); - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra, extra; - int ret = 0; - - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; - - vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; - - /* - * If we have data pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - if (vma->vm_pgoff != 0) - return -EINVAL; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->mmap_mutex); - if (counter->output) { - ret = -EINVAL; - goto unlock; - } - - if (atomic_inc_not_zero(&counter->mmap_count)) { - if (nr_pages != counter->data->nr_pages) - ret = -EINVAL; - goto unlock; - } - - user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm) + user_extra; - - extra = 0; - if (user_locked > user_lock_limit) - extra = user_locked - user_lock_limit; - - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; - - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } - - WARN_ON(counter->data); - ret = perf_mmap_data_alloc(counter, nr_pages); - if (ret) - goto unlock; - - atomic_set(&counter->mmap_count, 1); - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - counter->data->nr_locked = extra; - if (vma->vm_flags & VM_WRITE) - counter->data->writable = 1; - -unlock: - mutex_unlock(&counter->mmap_mutex); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &perf_mmap_vmops; - - return ret; -} - -static int perf_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_counter *counter = filp->private_data; - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &counter->fasync); - mutex_unlock(&inode->i_mutex); - - if (retval < 0) - return retval; - - return 0; -} - -static const struct file_operations perf_fops = { - .release = perf_release, - .read = perf_read, - .poll = perf_poll, - .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, - .mmap = perf_mmap, - .fasync = perf_fasync, -}; - -/* - * Perf counter wakeup - * - * If there's data, ensure we set the poll() state and publish everything - * to user-space before waking everybody up. - */ - -void perf_counter_wakeup(struct perf_counter *counter) -{ - wake_up_all(&counter->waitq); - - if (counter->pending_kill) { - kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); - counter->pending_kill = 0; - } -} - -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. - */ - -static void perf_pending_counter(struct perf_pending_entry *entry) -{ - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); - - if (counter->pending_disable) { - counter->pending_disable = 0; - __perf_counter_disable(counter); - } - - if (counter->pending_wakeup) { - counter->pending_wakeup = 0; - perf_counter_wakeup(counter); - } -} - -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_counter_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_counter *counter) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return counter->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_counter *counter) -{ - wait_event(counter->waitq, perf_not_pending(counter)); -} - -void perf_counter_do_pending(void) -{ - __perf_pending_run(); -} - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - return NULL; -} - -/* - * Output - */ - -struct perf_output_handle { - struct perf_counter *counter; - struct perf_mmap_data *data; - unsigned long head; - unsigned long offset; - int nmi; - int sample; - int locked; - unsigned long flags; -}; - -static bool perf_output_space(struct perf_mmap_data *data, - unsigned int offset, unsigned int head) -{ - unsigned long tail; - unsigned long mask; - - if (!data->writable) - return true; - - mask = (data->nr_pages << PAGE_SHIFT) - 1; - /* - * Userspace could choose to issue a mb() before updating the tail - * pointer. So that all reads will be completed before the write is - * issued. - */ - tail = ACCESS_ONCE(data->user_page->data_tail); - smp_rmb(); - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->data->poll, POLL_IN); - - if (handle->nmi) { - handle->counter->pending_wakeup = 1; - perf_pending_queue(&handle->counter->pending, - perf_pending_counter); - } else - perf_counter_wakeup(handle->counter); -} - -/* - * Curious locking construct. - * - * We need to ensure a later event doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_lock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - int cpu; - - handle->locked = 0; - - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; - - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - handle->locked = 1; -} - -static void perf_output_unlock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - unsigned long head; - int cpu; - - data->done_head = data->head; - - if (!handle->locked) - goto out; - -again: - /* - * The xchg implies a full barrier that ensures all writes are done - * before we publish the new head, matched by a rmb() in userspace when - * reading this position. - */ - while ((head = atomic_long_xchg(&data->done_head, 0))) - data->user_page->data_head = head; - - /* - * NMI can happen here, which means we can miss a done_head update. - */ - - cpu = atomic_xchg(&data->lock, -1); - WARN_ON_ONCE(cpu != smp_processor_id()); - - /* - * Therefore we have to validate we did not indeed do so. - */ - if (unlikely(atomic_long_read(&data->done_head))) { - /* - * Since we had it locked, we can lock it again. - */ - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - goto again; - } - - if (atomic_xchg(&data->wakeup, 0)) - perf_output_wakeup(handle); -out: - local_irq_restore(handle->flags); -} - -static void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - unsigned int pages_mask; - unsigned int offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - - do { - unsigned int page_offset; - int nr; - - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); - - memcpy(pages[nr] + page_offset, buf, size); - - len -= size; - buf += size; - offset += size; - } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); -} - -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - -static int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample) -{ - struct perf_counter *output_counter; - struct perf_mmap_data *data; - unsigned int offset, head; - int have_lost; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited counters we send all the output towards the parent. - */ - if (counter->parent) - counter = counter->parent; - - output_counter = rcu_dereference(counter->output); - if (output_counter) - counter = output_counter; - - data = rcu_dereference(counter->data); - if (!data) - goto out; - - handle->data = data; - handle->counter = counter; - handle->nmi = nmi; - handle->sample = sample; - - if (!data->nr_pages) - goto fail; - - have_lost = atomic_read(&data->lost); - if (have_lost) - size += sizeof(lost_event); - - perf_output_lock(handle); - - do { - offset = head = atomic_long_read(&data->head); - head += size; - if (unlikely(!perf_output_space(data, offset, head))) - goto fail; - } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); - - handle->offset = offset; - handle->head = head; - - if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) - atomic_set(&data->wakeup, 1); - - if (have_lost) { - lost_event.header.type = PERF_EVENT_LOST; - lost_event.header.misc = 0; - lost_event.header.size = sizeof(lost_event); - lost_event.id = counter->id; - lost_event.lost = atomic_xchg(&data->lost, 0); - - perf_output_put(handle, lost_event); - } - - return 0; - -fail: - atomic_inc(&data->lost); - perf_output_unlock(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -static void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_counter *counter = handle->counter; - struct perf_mmap_data *data = handle->data; - - int wakeup_events = counter->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = atomic_inc_return(&data->events); - if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); - } - } - - perf_output_unlock(handle); - rcu_read_unlock(); -} - -static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_tgid_nr_ns(p, counter->ns); -} - -static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_pid_nr_ns(p, counter->ns); -} - -static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - u64 read_format = counter->attr.read_format; - u64 values[4]; - int n = 0; - - values[n++] = atomic64_read(&counter->count); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - perf_output_copy(handle, values, n * sizeof(u64)); -} - -/* - * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. - */ -static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - struct perf_counter *leader = counter->group_leader, *sub; - u64 read_format = counter->attr.read_format; - u64 values[5]; - int n = 0; - - values[n++] = 1 + leader->nr_siblings; - - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; - - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; - - if (leader != counter) - leader->pmu->read(leader); - - values[n++] = atomic64_read(&leader->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(leader); - - perf_output_copy(handle, values, n * sizeof(u64)); - - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - n = 0; - - if (sub != counter) - sub->pmu->read(sub); - - values[n++] = atomic64_read(&sub->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(sub); - - perf_output_copy(handle, values, n * sizeof(u64)); - } -} - -static void perf_output_read(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - if (counter->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, counter); - else - perf_output_read_one(handle, counter); -} - -void perf_counter_output(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) -{ - int ret; - u64 sample_type = counter->attr.sample_type; - struct perf_output_handle handle; - struct perf_event_header header; - u64 ip; - struct { - u32 pid, tid; - } tid_entry; - struct perf_callchain_entry *callchain = NULL; - int callchain_size = 0; - u64 time; - struct { - u32 cpu, reserved; - } cpu_entry; - - header.type = PERF_EVENT_SAMPLE; - header.size = sizeof(header); - - header.misc = 0; - header.misc |= perf_misc_flags(data->regs); - - if (sample_type & PERF_SAMPLE_IP) { - ip = perf_instruction_pointer(data->regs); - header.size += sizeof(ip); - } - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - tid_entry.pid = perf_counter_pid(counter, current); - tid_entry.tid = perf_counter_tid(counter, current); - - header.size += sizeof(tid_entry); - } - - if (sample_type & PERF_SAMPLE_TIME) { - /* - * Maybe do better on x86 and provide cpu_clock_nmi() - */ - time = sched_clock(); - - header.size += sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_ADDR) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_ID) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_CPU) { - header.size += sizeof(cpu_entry); - - cpu_entry.cpu = raw_smp_processor_id(); - cpu_entry.reserved = 0; - } - - if (sample_type & PERF_SAMPLE_PERIOD) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_READ) - header.size += perf_counter_read_size(counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - callchain = perf_callchain(data->regs); - - if (callchain) { - callchain_size = (1 + callchain->nr) * sizeof(u64); - header.size += callchain_size; - } else - header.size += sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); - - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header.size += size; - } - - ret = perf_output_begin(&handle, counter, header.size, nmi, 1); - if (ret) - return; - - perf_output_put(&handle, header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(&handle, ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(&handle, tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(&handle, time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(&handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) { - u64 id = primary_counter_id(counter); - - perf_output_put(&handle, id); - } - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(&handle, counter->id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(&handle, cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(&handle, data->period); - - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(&handle, counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); - else { - u64 nr = 0; - perf_output_put(&handle, nr); - } - } - - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(&handle, data->raw->size); - perf_output_copy(&handle, data->raw->data, data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(&handle, raw); - } - } - - perf_output_end(&handle); -} - -/* - * read event - */ - -struct perf_read_event { - struct perf_event_header header; - - u32 pid; - u32 tid; -}; - -static void -perf_counter_read_event(struct perf_counter *counter, - struct task_struct *task) -{ - struct perf_output_handle handle; - struct perf_read_event event = { - .header = { - .type = PERF_EVENT_READ, - .misc = 0, - .size = sizeof(event) + perf_counter_read_size(counter), - }, - .pid = perf_counter_pid(counter, task), - .tid = perf_counter_tid(counter, task), - }; - int ret; - - ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); - if (ret) - return; - - perf_output_put(&handle, event); - perf_output_read(&handle, counter); - - perf_output_end(&handle); -} - -/* - * task tracking -- fork/exit - * - * enabled by: attr.comm | attr.mmap | attr.task - */ - -struct perf_task_event { - struct task_struct *task; - struct perf_counter_context *task_ctx; - - struct { - struct perf_event_header header; - - u32 pid; - u32 ppid; - u32 tid; - u32 ptid; - } event; -}; - -static void perf_counter_task_output(struct perf_counter *counter, - struct perf_task_event *task_event) -{ - struct perf_output_handle handle; - int size = task_event->event.header.size; - struct task_struct *task = task_event->task; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, current); - - task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, current); - - perf_output_put(&handle, task_event->event); - perf_output_end(&handle); -} - -static int perf_counter_task_match(struct perf_counter *counter) -{ - if (counter->attr.comm || counter->attr.mmap || counter->attr.task) - return 1; - - return 0; -} - -static void perf_counter_task_ctx(struct perf_counter_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_task_match(counter)) - perf_counter_task_output(counter, task_event); - } - rcu_read_unlock(); -} - -static void perf_counter_task_event(struct perf_task_event *task_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx = task_event->task_ctx; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_task_ctx(&cpuctx->ctx, task_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - if (!ctx) - ctx = rcu_dereference(task_event->task->perf_counter_ctxp); - if (ctx) - perf_counter_task_ctx(ctx, task_event); - rcu_read_unlock(); -} - -static void perf_counter_task(struct task_struct *task, - struct perf_counter_context *task_ctx, - int new) -{ - struct perf_task_event task_event; - - if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_task_counters)) - return; - - task_event = (struct perf_task_event){ - .task = task, - .task_ctx = task_ctx, - .event = { - .header = { - .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, - .misc = 0, - .size = sizeof(task_event.event), - }, - /* .pid */ - /* .ppid */ - /* .tid */ - /* .ptid */ - }, - }; - - perf_counter_task_event(&task_event); -} - -void perf_counter_fork(struct task_struct *task) -{ - perf_counter_task(task, NULL, 1); -} - -/* - * comm tracking - */ - -struct perf_comm_event { - struct task_struct *task; - char *comm; - int comm_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - } event; -}; - -static void perf_counter_comm_output(struct perf_counter *counter, - struct perf_comm_event *comm_event) -{ - struct perf_output_handle handle; - int size = comm_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - comm_event->event.pid = perf_counter_pid(counter, comm_event->task); - comm_event->event.tid = perf_counter_tid(counter, comm_event->task); - - perf_output_put(&handle, comm_event->event); - perf_output_copy(&handle, comm_event->comm, - comm_event->comm_size); - perf_output_end(&handle); -} - -static int perf_counter_comm_match(struct perf_counter *counter) -{ - if (counter->attr.comm) - return 1; - - return 0; -} - -static void perf_counter_comm_ctx(struct perf_counter_context *ctx, - struct perf_comm_event *comm_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter)) - perf_counter_comm_output(counter, comm_event); - } - rcu_read_unlock(); -} - -static void perf_counter_comm_event(struct perf_comm_event *comm_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - unsigned int size; - char comm[TASK_COMM_LEN]; - - memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); - size = ALIGN(strlen(comm)+1, sizeof(u64)); - - comm_event->comm = comm; - comm_event->comm_size = size; - - comm_event->event.header.size = sizeof(comm_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(&cpuctx->ctx, comm_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_comm_ctx(ctx, comm_event); - rcu_read_unlock(); -} - -void perf_counter_comm(struct task_struct *task) -{ - struct perf_comm_event comm_event; - - if (task->perf_counter_ctxp) - perf_counter_enable_on_exec(task); - - if (!atomic_read(&nr_comm_counters)) - return; - - comm_event = (struct perf_comm_event){ - .task = task, - /* .comm */ - /* .comm_size */ - .event = { - .header = { - .type = PERF_EVENT_COMM, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - }, - }; - - perf_counter_comm_event(&comm_event); -} - -/* - * mmap tracking - */ - -struct perf_mmap_event { - struct vm_area_struct *vma; - - const char *file_name; - int file_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - u64 start; - u64 len; - u64 pgoff; - } event; -}; - -static void perf_counter_mmap_output(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - struct perf_output_handle handle; - int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - mmap_event->event.pid = perf_counter_pid(counter, current); - mmap_event->event.tid = perf_counter_tid(counter, current); - - perf_output_put(&handle, mmap_event->event); - perf_output_copy(&handle, mmap_event->file_name, - mmap_event->file_size); - perf_output_end(&handle); -} - -static int perf_counter_mmap_match(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - if (counter->attr.mmap) - return 1; - - return 0; -} - -static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, - struct perf_mmap_event *mmap_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_mmap_match(counter, mmap_event)) - perf_counter_mmap_output(counter, mmap_event); - } - rcu_read_unlock(); -} - -static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - struct vm_area_struct *vma = mmap_event->vma; - struct file *file = vma->vm_file; - unsigned int size; - char tmp[16]; - char *buf = NULL; - const char *name; - - memset(tmp, 0, sizeof(tmp)); - - if (file) { - /* - * d_path works from the end of the buffer backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. - */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); - if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; - } - } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); - goto got_name; - } - - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } - - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; - } - -got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); - - mmap_event->file_name = name; - mmap_event->file_size = size; - - mmap_event->event.header.size = sizeof(mmap_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_mmap_ctx(ctx, mmap_event); - rcu_read_unlock(); - - kfree(buf); -} - -void __perf_counter_mmap(struct vm_area_struct *vma) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_mmap_counters)) - return; - - mmap_event = (struct perf_mmap_event){ - .vma = vma, - /* .file_name */ - /* .file_size */ - .event = { - .header = { - .type = PERF_EVENT_MMAP, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - .start = vma->vm_start, - .len = vma->vm_end - vma->vm_start, - .pgoff = vma->vm_pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - -/* - * IRQ throttle logging - */ - -static void perf_log_throttle(struct perf_counter *counter, int enable) -{ - struct perf_output_handle handle; - int ret; - - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 stream_id; - } throttle_event = { - .header = { - .type = PERF_EVENT_THROTTLE, - .misc = 0, - .size = sizeof(throttle_event), - }, - .time = sched_clock(), - .id = primary_counter_id(counter), - .stream_id = counter->id, - }; - - if (enable) - throttle_event.header.type = PERF_EVENT_UNTHROTTLE; - - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, throttle_event); - perf_output_end(&handle); -} - -/* - * Generic counter overflow handling, sampling. - */ - -int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) -{ - int events = atomic_read(&counter->event_limit); - int throttle = counter->pmu->unthrottle != NULL; - struct hw_perf_counter *hwc = &counter->hw; - int ret = 0; - - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_counter_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling counters even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the counter got enabled again: - */ - ret = 1; - } - } - - if (counter->attr.freq) { - u64 now = sched_clock(); - s64 delta = now - hwc->freq_stamp; - - hwc->freq_stamp = now; - - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); - } - - /* - * XXX event_limit might not quite work as expected on inherited - * counters - */ - - counter->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&counter->event_limit)) { - ret = 1; - counter->pending_kill = POLL_HUP; - if (nmi) { - counter->pending_disable = 1; - perf_pending_queue(&counter->pending, - perf_pending_counter); - } else - perf_counter_disable(counter); - } - - perf_counter_output(counter, nmi, data); - return ret; -} - -/* - * Generic software counter infrastructure - */ - -/* - * We directly increment counter->count and keep a second value in - * counter->hw.period_left to count intervals. This period counter - * is kept in the range [-sample_period, 0] so that we can use the - * sign as trigger. - */ - -static u64 perf_swcounter_set_period(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period = hwc->last_period; - u64 nr, offset; - s64 old, val; - - hwc->last_period = hwc->sample_period; - -again: - old = val = atomic64_read(&hwc->period_left); - if (val < 0) - return 0; - - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; - - return nr; -} - -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 overflow; - - data->period = counter->hw.last_period; - overflow = perf_swcounter_set_period(counter); - - if (hwc->interrupts == MAX_INTERRUPTS) - return; - - for (; overflow; overflow--) { - if (perf_counter_overflow(counter, nmi, data)) { - /* - * We inhibit the overflow from happening when - * hwc->interrupts == MAX_INTERRUPTS. - */ - break; - } - } -} - -static void perf_swcounter_unthrottle(struct perf_counter *counter) -{ - /* - * Nothing to do, we already reset hwc->interrupts. - */ -} - -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) -{ - struct hw_perf_counter *hwc = &counter->hw; - - atomic64_add(nr, &counter->count); - - if (!hwc->sample_period) - return; - - if (!data->regs) - return; - - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swcounter_overflow(counter, nmi, data); -} - -static int perf_swcounter_is_counting(struct perf_counter *counter) -{ - /* - * The counter is active, we're good! - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - return 1; - - /* - * The counter is off/error, not counting. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE) - return 0; - - /* - * The counter is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (counter->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - -static int perf_swcounter_match(struct perf_counter *counter, - enum perf_type_id type, - u32 event, struct pt_regs *regs) -{ - if (!perf_swcounter_is_counting(counter)) - return 0; - - if (counter->attr.type != type) - return 0; - if (counter->attr.config != event) - return 0; - - if (regs) { - if (counter->attr.exclude_user && user_mode(regs)) - return 0; - - if (counter->attr.exclude_kernel && !user_mode(regs)) - return 0; - } - - return 1; -} - -static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_type_id type, - u32 event, u64 nr, int nmi, - struct perf_sample_data *data) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, data->regs)) - perf_swcounter_add(counter, nr, nmi, data); - } - rcu_read_unlock(); -} - -static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) -{ - if (in_nmi()) - return &cpuctx->recursion[3]; - - if (in_irq()) - return &cpuctx->recursion[2]; - - if (in_softirq()) - return &cpuctx->recursion[1]; - - return &cpuctx->recursion[0]; -} - -static void do_perf_swcounter_event(enum perf_type_id type, u32 event, - u64 nr, int nmi, - struct perf_sample_data *data) -{ - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swcounter_recursion_context(cpuctx); - struct perf_counter_context *ctx; - - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, data); - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data); - rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); -} - -void __perf_swcounter_event(u32 event, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) -{ - struct perf_sample_data data = { - .regs = regs, - .addr = addr, - }; - - do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data); -} - -static void perf_swcounter_read(struct perf_counter *counter) -{ -} - -static int perf_swcounter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - - if (hwc->sample_period) { - hwc->last_period = hwc->sample_period; - perf_swcounter_set_period(counter); - } - return 0; -} - -static void perf_swcounter_disable(struct perf_counter *counter) -{ -} - -static const struct pmu perf_ops_generic = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, - .unthrottle = perf_swcounter_unthrottle, -}; - -/* - * hrtimer based swcounter callback - */ - -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - data.regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((counter->attr.exclude_kernel || !data.regs) && - !counter->attr.exclude_user) - data.regs = task_pt_regs(current); - - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -/* - * Software counter: cpu wall time clock - */ - -static void cpu_clock_perf_counter_update(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - s64 prev; - u64 now; - - now = cpu_clock(cpu); - prev = atomic64_read(&counter->hw.prev_count); - atomic64_set(&counter->hw.prev_count, now); - atomic64_add(now - prev, &counter->count); -} - -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - int cpu = raw_smp_processor_id(); - - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void cpu_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - cpu_clock_perf_counter_update(counter); -} - -static void cpu_clock_perf_counter_read(struct perf_counter *counter) -{ - cpu_clock_perf_counter_update(counter); -} - -static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_counter_enable, - .disable = cpu_clock_perf_counter_disable, - .read = cpu_clock_perf_counter_read, -}; - -/* - * Software counter: task time clock - */ - -static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) -{ - u64 prev; - s64 delta; - - prev = atomic64_xchg(&counter->hw.prev_count, now); - delta = now - prev; - atomic64_add(delta, &counter->count); -} - -static int task_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 now; - - now = counter->ctx->time; - - atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void task_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, counter->ctx->time); - -} - -static void task_clock_perf_counter_read(struct perf_counter *counter) -{ - u64 time; - - if (!in_nmi()) { - update_context_time(counter->ctx); - time = counter->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - counter->ctx->timestamp; - time = counter->ctx->time + delta; - } - - task_clock_perf_counter_update(counter, time); -} - -static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_counter_enable, - .disable = task_clock_perf_counter_disable, - .read = task_clock_perf_counter_read, -}; - -#ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, - int entry_size) -{ - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - struct perf_sample_data data = { - .regs = get_irq_regs(), - .addr = addr, - .raw = &raw, - }; - - if (!data.regs) - data.regs = task_pt_regs(current); - - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); -} -EXPORT_SYMBOL_GPL(perf_tpcounter_event); - -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); - -static void tp_perf_counter_destroy(struct perf_counter *counter) -{ - ftrace_profile_disable(counter->attr.config); -} - -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - /* - * Raw tracepoint data is a severe data leak, only allow root to - * have these. - */ - if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && - perf_paranoid_tracepoint_raw() && - !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - if (ftrace_profile_enable(counter->attr.config)) - return NULL; - - counter->destroy = tp_perf_counter_destroy; - - return &perf_ops_generic; -} -#else -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} -#endif - -atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; - -static void sw_perf_counter_destroy(struct perf_counter *counter) -{ - u64 event = counter->attr.config; - - WARN_ON(counter->parent); - - atomic_dec(&perf_swcounter_enabled[event]); -} - -static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) -{ - const struct pmu *pmu = NULL; - u64 event = counter->attr.config; - - /* - * Software counters (currently) can't in general distinguish - * between user, kernel and hypervisor events. - * However, context switches and cpu migrations are considered - * to be kernel events, and page faults are never hypervisor - * events. - */ - switch (event) { - case PERF_COUNT_SW_CPU_CLOCK: - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_TASK_CLOCK: - /* - * If the user instantiates this as a per-cpu counter, - * use the cpu_clock counter instead. - */ - if (counter->ctx->task) - pmu = &perf_ops_task_clock; - else - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_PAGE_FAULTS: - case PERF_COUNT_SW_PAGE_FAULTS_MIN: - case PERF_COUNT_SW_PAGE_FAULTS_MAJ: - case PERF_COUNT_SW_CONTEXT_SWITCHES: - case PERF_COUNT_SW_CPU_MIGRATIONS: - if (!counter->parent) { - atomic_inc(&perf_swcounter_enabled[event]); - counter->destroy = sw_perf_counter_destroy; - } - pmu = &perf_ops_generic; - break; - } - - return pmu; -} - -/* - * Allocate and initialize a counter structure - */ -static struct perf_counter * -perf_counter_alloc(struct perf_counter_attr *attr, - int cpu, - struct perf_counter_context *ctx, - struct perf_counter *group_leader, - struct perf_counter *parent_counter, - gfp_t gfpflags) -{ - const struct pmu *pmu; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - long err; - - counter = kzalloc(sizeof(*counter), gfpflags); - if (!counter) - return ERR_PTR(-ENOMEM); - - /* - * Single counters are their own group leaders, with an - * empty sibling list: - */ - if (!group_leader) - group_leader = counter; - - mutex_init(&counter->child_mutex); - INIT_LIST_HEAD(&counter->child_list); - - INIT_LIST_HEAD(&counter->list_entry); - INIT_LIST_HEAD(&counter->event_entry); - INIT_LIST_HEAD(&counter->sibling_list); - init_waitqueue_head(&counter->waitq); - - mutex_init(&counter->mmap_mutex); - - counter->cpu = cpu; - counter->attr = *attr; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; - - counter->parent = parent_counter; - - counter->ns = get_pid_ns(current->nsproxy->pid_ns); - counter->id = atomic64_inc_return(&perf_counter_id); - - counter->state = PERF_COUNTER_STATE_INACTIVE; - - if (attr->disabled) - counter->state = PERF_COUNTER_STATE_OFF; - - pmu = NULL; - - hwc = &counter->hw; - hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) - hwc->sample_period = 1; - hwc->last_period = hwc->sample_period; - - atomic64_set(&hwc->period_left, hwc->sample_period); - - /* - * we currently do not support PERF_FORMAT_GROUP on inherited counters - */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; - - switch (attr->type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - pmu = hw_perf_counter_init(counter); - break; - - case PERF_TYPE_SOFTWARE: - pmu = sw_perf_counter_init(counter); - break; - - case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_counter_init(counter); - break; - - default: - break; - } -done: - err = 0; - if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) - err = PTR_ERR(pmu); - - if (err) { - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); - return ERR_PTR(err); - } - - counter->pmu = pmu; - - if (!counter->parent) { - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); - if (counter->attr.task) - atomic_inc(&nr_task_counters); - } - - return counter; -} - -static int perf_copy_attr(struct perf_counter_attr __user *uattr, - struct perf_counter_attr *attr) -{ - int ret; - u32 size; - - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0. - */ - if (size > sizeof(*attr)) { - unsigned long val; - unsigned long __user *addr; - unsigned long __user *end; - - addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), - sizeof(unsigned long)); - end = PTR_ALIGN((void __user *)uattr + size, - sizeof(unsigned long)); - - for (; addr < end; addr += sizeof(unsigned long)) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - - if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) - return -EINVAL; - - if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) - return -EINVAL; - - if (attr->read_format & ~(PERF_FORMAT_MAX-1)) - return -EINVAL; - -out: - return ret; - -err_size: - put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; -} - -int perf_counter_set_output(struct perf_counter *counter, int output_fd) -{ - struct perf_counter *output_counter = NULL; - struct file *output_file = NULL; - struct perf_counter *old_output; - int fput_needed = 0; - int ret = -EINVAL; - - if (!output_fd) - goto set; - - output_file = fget_light(output_fd, &fput_needed); - if (!output_file) - return -EBADF; - - if (output_file->f_op != &perf_fops) - goto out; - - output_counter = output_file->private_data; - - /* Don't chain output fds */ - if (output_counter->output) - goto out; - - /* Don't set an output fd when we already have an output channel */ - if (counter->data) - goto out; - - atomic_long_inc(&output_file->f_count); - -set: - mutex_lock(&counter->mmap_mutex); - old_output = counter->output; - rcu_assign_pointer(counter->output, output_counter); - mutex_unlock(&counter->mmap_mutex); - - if (old_output) { - /* - * we need to make sure no existing perf_output_*() - * is still referencing this counter. - */ - synchronize_rcu(); - fput(old_output->filp); - } - - ret = 0; -out: - fput_light(output_file, fput_needed); - return ret; -} - -/** - * sys_perf_counter_open - open a performance counter, associate it to a task/cpu - * - * @attr_uptr: event type attributes for monitoring/sampling - * @pid: target pid - * @cpu: target cpu - * @group_fd: group leader counter fd - */ -SYSCALL_DEFINE5(perf_counter_open, - struct perf_counter_attr __user *, attr_uptr, - pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) -{ - struct perf_counter *counter, *group_leader; - struct perf_counter_attr attr; - struct perf_counter_context *ctx; - struct file *counter_file = NULL; - struct file *group_file = NULL; - int fput_needed = 0; - int fput_needed2 = 0; - int err; - - /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) - return -EINVAL; - - err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; - - if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - if (attr.freq) { - if (attr.sample_freq > sysctl_perf_counter_sample_rate) - return -EINVAL; - } - - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - /* - * Look up the group leader (we will attach this counter to it): - */ - group_leader = NULL; - if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { - err = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - - group_leader = group_file->private_data; - /* - * Do not allow a recursive hierarchy (this new sibling - * becoming part of another group-sibling): - */ - if (group_leader->group_leader != group_leader) - goto err_put_context; - /* - * Do not allow to attach to a group in a different - * task or CPU context: - */ - if (group_leader->ctx != ctx) - goto err_put_context; - /* - * Only a group leader can be exclusive or pinned - */ - if (attr.exclusive || attr.pinned) - goto err_put_context; - } - - counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); - err = PTR_ERR(counter); - if (IS_ERR(counter)) - goto err_put_context; - - err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); - if (err < 0) - goto err_free_put_context; - - counter_file = fget_light(err, &fput_needed2); - if (!counter_file) - goto err_free_put_context; - - if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_counter_set_output(counter, group_fd); - if (err) - goto err_fput_free_put_context; - } - - counter->filp = counter_file; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, counter, cpu); - ++ctx->generation; - mutex_unlock(&ctx->mutex); - - counter->owner = current; - get_task_struct(current); - mutex_lock(¤t->perf_counter_mutex); - list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); - mutex_unlock(¤t->perf_counter_mutex); - -err_fput_free_put_context: - fput_light(counter_file, fput_needed2); - -err_free_put_context: - if (err < 0) - kfree(counter); - -err_put_context: - if (err < 0) - put_ctx(ctx); - - fput_light(group_file, fput_needed); - - return err; -} - -/* - * inherit a counter from parent task to child task: - */ -static struct perf_counter * -inherit_counter(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter *group_leader, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *child_counter; - - /* - * Instead of creating recursive hierarchies of counters, - * we link inherited counters back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_counter->parent) - parent_counter = parent_counter->parent; - - child_counter = perf_counter_alloc(&parent_counter->attr, - parent_counter->cpu, child_ctx, - group_leader, parent_counter, - GFP_KERNEL); - if (IS_ERR(child_counter)) - return child_counter; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent counter, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en, dis}able_family. - */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - else - child_counter->state = PERF_COUNTER_STATE_OFF; - - if (parent_counter->attr.freq) - child_counter->hw.sample_period = parent_counter->hw.sample_period; - - /* - * Link it up in the child's context: - */ - add_counter_to_ctx(child_counter, child_ctx); - - /* - * Get a reference to the parent filp - we will fput it - * when the child counter exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_counter->filp->f_count); - - /* - * Link this into the parent counter's child list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_add_tail(&child_counter->child_list, &parent_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - return child_counter; -} - -static int inherit_group(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *leader; - struct perf_counter *sub; - struct perf_counter *child_ctr; - - leader = inherit_counter(parent_counter, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { - child_ctr = inherit_counter(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - -static void sync_child_counter(struct perf_counter *child_counter, - struct task_struct *child) -{ - struct perf_counter *parent_counter = child_counter->parent; - u64 child_val; - - if (child_counter->attr.inherit_stat) - perf_counter_read_event(child_counter, child); - - child_val = atomic64_read(&child_counter->count); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_counter->count); - atomic64_add(child_counter->total_time_enabled, - &parent_counter->child_total_time_enabled); - atomic64_add(child_counter->total_time_running, - &parent_counter->child_total_time_running); - - /* - * Remove this counter from the parent's list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - /* - * Release the parent counter, if this was the last - * reference to it. - */ - fput(parent_counter->filp); -} - -static void -__perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx, - struct task_struct *child) -{ - struct perf_counter *parent_counter; - - update_counter_times(child_counter); - perf_counter_remove_from_context(child_counter); - - parent_counter = child_counter->parent; - /* - * It can happen that parent exits first, and has counters - * that are still around due to the child reference. These - * counters need to be zapped - but otherwise linger. - */ - if (parent_counter) { - sync_child_counter(child_counter, child); - free_counter(child_counter); - } -} - -/* - * When a child task exits, feed back counter values to parent counters. - */ -void perf_counter_exit_task(struct task_struct *child) -{ - struct perf_counter *child_counter, *tmp; - struct perf_counter_context *child_ctx; - unsigned long flags; - - if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, NULL, 0); - return; - } - - local_irq_save(flags); - /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. - */ - child_ctx = child->perf_counter_ctxp; - __perf_counter_task_sched_out(child_ctx); - - /* - * Take the context lock here so that if find_get_context is - * reading child->perf_counter_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. - */ - spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the counters from it. - */ - unclone_ctx(child_ctx); - spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Report the task dead after unscheduling the counters so that we - * won't get any samples after PERF_EVENT_EXIT. We can however still - * get a few PERF_EVENT_READ events. - */ - perf_counter_task(child, child_ctx, 0); - - /* - * We can recurse on the same lock type through: - * - * __perf_counter_exit_task() - * sync_child_counter() - * fput(parent_counter->filp) - * perf_release() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); - -again: - list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, - list_entry) - __perf_counter_exit_task(child_counter, child_ctx, child); - - /* - * If the last counter was a group counter, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->counter_list)) - goto again; - - mutex_unlock(&child_ctx->mutex); - - put_ctx(child_ctx); -} - -/* - * free an unexposed, unused context as created by inheritance by - * init_task below, used by fork() in case of fail. - */ -void perf_counter_free_task(struct task_struct *task) -{ - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter *counter, *tmp; - - if (!ctx) - return; - - mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { - struct perf_counter *parent = counter->parent; - - if (WARN_ON_ONCE(!parent)) - continue; - - mutex_lock(&parent->child_mutex); - list_del_init(&counter->child_list); - mutex_unlock(&parent->child_mutex); - - fput(parent->filp); - - list_del_counter(counter, ctx); - free_counter(counter); - } - - if (!list_empty(&ctx->counter_list)) - goto again; - - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - -/* - * Initialize the perf_counter context in task_struct - */ -int perf_counter_init_task(struct task_struct *child) -{ - struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter_context *cloned_ctx; - struct perf_counter *counter; - struct task_struct *parent = current; - int inherited_all = 1; - int ret = 0; - - child->perf_counter_ctxp = NULL; - - mutex_init(&child->perf_counter_mutex); - INIT_LIST_HEAD(&child->perf_counter_list); - - if (likely(!parent->perf_counter_ctxp)) - return 0; - - /* - * This is executed from the parent task context, so inherit - * counters that have been marked for cloning. - * First allocate and initialize a context for the child. - */ - - child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_counter_init_context(child_ctx, child); - child->perf_counter_ctxp = child_ctx; - get_task_struct(child); - - /* - * If the parent's context is a clone, pin it so it won't get - * swapped under us. - */ - parent_ctx = perf_pin_task_context(parent); - - /* - * No need to check if parent_ctx != NULL here; since we saw - * it non-NULL earlier, the only reason for it to become NULL - * is if we exit, and since we're currently in the middle of - * a fork we can't be exiting at the same time. - */ - - /* - * Lock the parent list. No need to lock the child - not PID - * hashed yet and not running, so nobody can access it. - */ - mutex_lock(&parent_ctx->mutex); - - /* - * We dont have to disable NMIs - we are only looking at - * the list, not manipulating it: - */ - list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { - if (counter != counter->group_leader) - continue; - - if (!counter->attr.inherit) { - inherited_all = 0; - continue; - } - - ret = inherit_group(counter, parent, parent_ctx, - child, child_ctx); - if (ret) { - inherited_all = 0; - break; - } - } - - if (inherited_all) { - /* - * Mark the child context as a clone of the parent - * context, or of whatever the parent is a clone of. - * Note that if the parent is a clone, it could get - * uncloned at any point, but that doesn't matter - * because the list of counters and the generation - * count can't have changed since we took the mutex. - */ - cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); - if (cloned_ctx) { - child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = parent_ctx->parent_gen; - } else { - child_ctx->parent_ctx = parent_ctx; - child_ctx->parent_gen = parent_ctx->generation; - } - get_ctx(child_ctx->parent_ctx); - } - - mutex_unlock(&parent_ctx->mutex); - - perf_unpin_context(parent_ctx); - - return ret; -} - -static void __cpuinit perf_counter_init_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_counter_init_context(&cpuctx->ctx, NULL); - - spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; - spin_unlock(&perf_resource_lock); - - hw_perf_counter_setup(cpu); -} - -#ifdef CONFIG_HOTPLUG_CPU -static void __perf_counter_exit_cpu(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = &cpuctx->ctx; - struct perf_counter *counter, *tmp; - - list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) - __perf_counter_remove_from_context(counter); -} -static void perf_counter_exit_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &cpuctx->ctx; - - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); - mutex_unlock(&ctx->mutex); -} -#else -static inline void perf_counter_exit_cpu(int cpu) { } -#endif - -static int __cpuinit -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - perf_counter_init_cpu(cpu); - break; - - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hw_perf_counter_setup_online(cpu); - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - perf_counter_exit_cpu(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -/* - * This has to have a higher priority than migration_notifier in sched.c. - */ -static struct notifier_block __cpuinitdata perf_cpu_nb = { - .notifier_call = perf_cpu_notify, - .priority = 20, -}; - -void __init perf_counter_init(void) -{ - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&perf_cpu_nb); -} - -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_reserved_percpu); -} - -static ssize_t -perf_set_reserve_percpu(struct sysdev_class *class, - const char *buf, - size_t count) -{ - struct perf_cpu_context *cpuctx; - unsigned long val; - int err, cpu, mpt; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > perf_max_counters) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_reserved_percpu = val; - for_each_online_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, - perf_max_counters - perf_reserved_percpu); - cpuctx->max_pertask = mpt; - spin_unlock_irq(&cpuctx->ctx.lock); - } - spin_unlock(&perf_resource_lock); - - return count; -} - -static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_overcommit); -} - -static ssize_t -perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) -{ - unsigned long val; - int err; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > 1) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_overcommit = val; - spin_unlock(&perf_resource_lock); - - return count; -} - -static SYSDEV_CLASS_ATTR( - reserve_percpu, - 0644, - perf_show_reserve_percpu, - perf_set_reserve_percpu - ); - -static SYSDEV_CLASS_ATTR( - overcommit, - 0644, - perf_show_overcommit, - perf_set_overcommit - ); - -static struct attribute *perfclass_attrs[] = { - &attr_reserve_percpu.attr, - &attr_overcommit.attr, - NULL -}; - -static struct attribute_group perfclass_attr_group = { - .attrs = perfclass_attrs, - .name = "perf_counters", -}; - -static int __init perf_counter_sysfs_init(void) -{ - return sysfs_create_group(&cpu_sysdev_class.kset.kobj, - &perfclass_attr_group); -} -device_initcall(perf_counter_sysfs_init); diff --git a/kernel/perf_event.c b/kernel/perf_event.c new file mode 100644 index 0000000..76ac4db --- /dev/null +++ b/kernel/perf_event.c @@ -0,0 +1,5000 @@ +/* + * Performance events core code: + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * For licensing details see kernel-base/COPYING + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/sysfs.h> +#include <linux/dcache.h> +#include <linux/percpu.h> +#include <linux/ptrace.h> +#include <linux/vmstat.h> +#include <linux/hardirq.h> +#include <linux/rculist.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/anon_inodes.h> +#include <linux/kernel_stat.h> +#include <linux/perf_event.h> + +#include <asm/irq_regs.h> + +/* + * Each CPU has a list of per CPU events: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_events __read_mostly = 1; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +static atomic_t nr_events __read_mostly; +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; + +/* + * perf event paranoia level: + * -1 - not paranoid at all + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv + */ +int sysctl_perf_event_paranoid __read_mostly = 1; + +static inline bool perf_paranoid_tracepoint_raw(void) +{ + return sysctl_perf_event_paranoid > -1; +} + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_event_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_event_paranoid > 1; +} + +int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ + +/* + * max perf event sample rate + */ +int sysctl_perf_event_sample_rate __read_mostly = 100000; + +static atomic64_t perf_event_id; + +/* + * Lock for (sysadmin-configurable) event reservations: + */ +static DEFINE_SPINLOCK(perf_resource_lock); + +/* + * Architecture provided APIs - weak aliases: + */ +extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + return NULL; +} + +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + +void __weak hw_perf_event_setup(int cpu) { barrier(); } +void __weak hw_perf_event_setup_online(int cpu) { barrier(); } + +int __weak +hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + return 0; +} + +void __weak perf_event_print_debug(void) { } + +static DEFINE_PER_CPU(int, perf_disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(perf_disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(perf_disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} + +static void get_ctx(struct perf_event_context *ctx) +{ + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx); +} + +static void put_ctx(struct perf_event_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +static void unclone_ctx(struct perf_event_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +/* + * If we inherit events we want to return the parent event id + * to userspace. + */ +static u64 primary_event_id(struct perf_event *event) +{ + u64 id = event->id; + + if (event->parent) + id = event->parent->id; + + return id; +} + +/* + * Get the perf_event_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_event_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) +{ + struct perf_event_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_event_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_event_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_event_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_event_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_event_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_event_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); +} + +/* + * Add a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *group_leader = event->group_leader; + + /* + * Depending on whether it is a standalone or sibling event, + * add it straight to the context's event list, or to the group + * leader's sibling list: + */ + if (group_leader == event) + list_add_tail(&event->group_entry, &ctx->group_list); + else { + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } + + list_add_rcu(&event->event_entry, &ctx->event_list); + ctx->nr_events++; + if (event->attr.inherit_stat) + ctx->nr_stat++; +} + +/* + * Remove a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; + + if (list_empty(&event->group_entry)) + return; + ctx->nr_events--; + if (event->attr.inherit_stat) + ctx->nr_stat--; + + list_del_init(&event->group_entry); + list_del_rcu(&event->event_entry); + + if (event->group_leader != event) + event->group_leader->nr_siblings--; + + /* + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + + list_move_tail(&sibling->group_entry, &ctx->group_list); + sibling->group_leader = sibling; + } +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; + } + event->tstamp_stopped = ctx->time; + event->pmu->disable(event); + event->oncpu = -1; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event_sched_out(group_event, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); + + if (group_event->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance event + * + * We disable the event on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_event_remove_from_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + /* + * Protect the list operation against NMI by disabling the + * events on a global level. + */ + perf_disable(); + + event_sched_out(event, cpuctx, ctx); + + list_del_event(event, ctx); + + if (!ctx->task) { + /* + * Allow more per task events with respect to the + * reservation: + */ + cpuctx->max_pertask = + min(perf_max_events - ctx->nr_events, + perf_max_events - perf_reserved_percpu); + } + + perf_enable(); + spin_unlock(&ctx->lock); +} + + +/* + * Remove the event from a task's (or a CPU's) list of events. + * + * Must be called with ctx->mutex held. + * + * CPU events are removed with a smp call. For task events we only + * call when the task is on a CPU. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_event_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_event_remove_from_context(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are removed via an smp call and + * the removal is always sucessful. + */ + smp_call_function_single(event->cpu, + __perf_event_remove_from_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_event_remove_from_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active we need to retry the smp call. + */ + if (ctx->nr_active && !list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can remove the event safely, if the call above did not + * succeed. + */ + if (!list_empty(&event->group_entry)) { + list_del_event(event, ctx); + } + spin_unlock_irq(&ctx->lock); +} + +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + event->total_time_enabled = ctx->time - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +/* + * Cross CPU call to disable a performance event + */ +static void __perf_event_disable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + /* + * If the event is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (event->state >= PERF_EVENT_STATE_INACTIVE) { + update_context_time(ctx); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock(&ctx->lock); +} + +/* + * Disable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_event_task_sched_out for this context. + */ +static void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_disable, + event, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_event_disable, event); + + spin_lock_irq(&ctx->lock); + /* + * If the event is still active, we need to retry the cross-call. + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock_irq(&ctx->lock); +} + +static int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (event->pmu->enable(event)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + return -EAGAIN; + } + + event->tstamp_running += ctx->time - event->tstamp_stopped; + + if (!is_software_event(event)) + cpuctx->active_oncpu++; + ctx->nr_active++; + + if (event->attr.exclusive) + cpuctx->exclusive = 1; + + return 0; +} + +static int +group_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + struct perf_event *event, *partial_group; + int ret; + + if (group_event->state == PERF_EVENT_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; + + if (event_sched_in(group_event, cpuctx, ctx, cpu)) + return -EAGAIN; + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx, cpu)) { + partial_group = event; + goto group_error; + } + } + + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) + break; + event_sched_out(event, cpuctx, ctx); + } + event_sched_out(group_event, cpuctx, ctx); + + return -EAGAIN; +} + +/* + * Return 1 for a group consisting entirely of software events, + * 0 if the group contains any hardware events. + */ +static int is_software_only_group(struct perf_event *leader) +{ + struct perf_event *event; + + if (!is_software_event(leader)) + return 0; + + list_for_each_entry(event, &leader->sibling_list, group_entry) + if (!is_software_event(event)) + return 0; + + return 1; +} + +/* + * Work out whether we can put this event group on the CPU now. + */ +static int group_can_go_on(struct perf_event *event, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software events can always go on. + */ + if (is_software_only_group(event)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * events can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * events on the CPU, it can't go on. + */ + if (event->attr.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) +{ + list_add_event(event, ctx); + event->tstamp_enabled = ctx->time; + event->tstamp_running = ctx->time; + event->tstamp_stopped = ctx->time; +} + +/* + * Cross CPU call to install and enable a performance event + * + * Must be called with ctx->mutex held + */ +static void __perf_install_in_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int cpu = smp_processor_id(); + int err; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no events. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + /* + * Protect the list operation against NMI by disabling the + * events on a global level. NOP for non NMI based events. + */ + perf_disable(); + + add_event_to_ctx(event, ctx); + + /* + * Don't put the event on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) + goto unlock; + + /* + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. + */ + if (!group_can_go_on(event, cpuctx, 1)) + err = -EEXIST; + else + err = event_sched_in(event, cpuctx, ctx, cpu); + + if (err) { + /* + * This event couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the event group is pinned then put it in error state. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + if (!err && !ctx->task && cpuctx->max_pertask) + cpuctx->max_pertask--; + + unlock: + perf_enable(); + + spin_unlock(&ctx->lock); +} + +/* + * Attach a performance event to a context + * + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. + * + * If the event is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. + */ +static void +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, + int cpu) +{ + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are installed via an smp call and + * the install is always sucessful. + */ + smp_call_function_single(cpu, __perf_install_in_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_install_in_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * we need to retry the smp call. + */ + if (ctx->is_active && list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can add the event safely, if it the call above did not + * succeed. + */ + if (list_empty(&event->group_entry)) + add_event_to_ctx(event, ctx); + spin_unlock_irq(&ctx->lock); +} + +/* + * Put a event into inactive state and update time fields. + * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = ctx->time - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = + ctx->time - sub->total_time_enabled; +} + +/* + * Cross CPU call to enable a performance event + */ +static void __perf_event_enable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int err; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto unlock; + __perf_event_mark_enabled(event, ctx); + + /* + * If the event is in a group and isn't the group leader, + * then don't put it on unless the group is on. + */ + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(event, cpuctx, 1)) { + err = -EEXIST; + } else { + perf_disable(); + if (event == leader) + err = group_sched_in(event, cpuctx, ctx, + smp_processor_id()); + else + err = event_sched_in(event, cpuctx, ctx, + smp_processor_id()); + perf_enable(); + } + + if (err) { + /* + * If this event can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + unlock: + spin_unlock(&ctx->lock); +} + +/* + * Enable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. + */ +static void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_enable, + event, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto out; + + /* + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_event_enable, event); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the event is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_OFF) + __perf_event_mark_enabled(event, ctx); + + out: + spin_unlock_irq(&ctx->lock); +} + +static int perf_event_refresh(struct perf_event *event, int refresh) +{ + /* + * not supported on inherited events + */ + if (event->attr.inherit) + return -EINVAL; + + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); + + return 0; +} + +void __perf_event_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + + spin_lock(&ctx->lock); + ctx->is_active = 0; + if (likely(!ctx->nr_events)) + goto out; + update_context_time(ctx); + + perf_disable(); + if (ctx->nr_active) { + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event != event->group_leader) + event_sched_out(event, cpuctx, ctx); + else + group_sched_out(event, cpuctx, ctx); + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled events. + * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events + * in them directly with an fd; we can only enable/disable all + * events via prctl, or enable/disable all events in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_event_context *ctx1, + struct perf_event_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; +} + +static void __perf_event_read(void *event); + +static void __perf_event_sync_stat(struct perf_event *event, + struct perf_event *next_event) +{ + u64 value; + + if (!event->attr.inherit_stat) + return; + + /* + * Update the event value, we cannot use perf_event_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the event must be on the current CPU, therefore we + * don't need to use it. + */ + switch (event->state) { + case PERF_EVENT_STATE_ACTIVE: + __perf_event_read(event); + break; + + case PERF_EVENT_STATE_INACTIVE: + update_event_times(event); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the event + * values when we flip the contexts. + */ + value = atomic64_read(&next_event->count); + value = atomic64_xchg(&event->count, value); + atomic64_set(&next_event->count, value); + + swap(event->total_time_enabled, next_event->total_time_enabled); + swap(event->total_time_running, next_event->total_time_running); + + /* + * Since we swizzled the values, update the user visible data too. + */ + perf_event_update_userpage(event); + perf_event_update_userpage(next_event); +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_event_sync_stat(struct perf_event_context *ctx, + struct perf_event_context *next_ctx) +{ + struct perf_event *event, *next_event; + + if (!ctx->nr_stat) + return; + + event = list_first_entry(&ctx->event_list, + struct perf_event, event_entry); + + next_event = list_first_entry(&next_ctx->event_list, + struct perf_event, event_entry); + + while (&event->event_entry != &ctx->event_list && + &next_event->event_entry != &next_ctx->event_list) { + + __perf_event_sync_stat(event, next_event); + + event = list_next_entry(event, event_entry); + next_event = list_next_entry(next_event, event_entry); + } +} + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; + struct pt_regs *regs; + int do_switch = 1; + + regs = task_pt_regs(task); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + + if (likely(!ctx || !cpuctx->task_ctx)) + return; + + update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); + next_ctx = next->perf_event_ctxp; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_event_ctxp + */ + task->perf_event_ctxp = next_ctx; + next->perf_event_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + + perf_event_sync_stat(ctx, next_ctx); + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); + } + rcu_read_unlock(); + + if (do_switch) { + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } +} + +/* + * Called with IRQs disabled + */ +static void __perf_event_task_sched_out(struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) +{ + __perf_event_sched_out(&cpuctx->ctx, cpuctx); +} + +static void +__perf_event_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event *event; + int can_add_hw = 1; + + spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF || + !event->attr.pinned) + continue; + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (event != event->group_leader) + event_sched_in(event, cpuctx, ctx, cpu); + else { + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); + } + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; + } + } + + list_for_each_entry(event, &ctx->group_list, group_entry) { + /* + * Ignore events in OFF or ERROR state, and + * ignore pinned events since we did them already. + */ + if (event->state <= PERF_EVENT_STATE_OFF || + event->attr.pinned) + continue; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of events: + */ + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (event != event->group_leader) { + if (event_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } else { + if (group_can_go_on(event, cpuctx, can_add_hw)) { + if (group_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void perf_event_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + __perf_event_sched_in(ctx, cpuctx, cpu); + cpuctx->task_ctx = ctx; +} + +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + __perf_event_sched_in(ctx, cpuctx, cpu); +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static void perf_adjust_period(struct perf_event *event, u64 events) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, event->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_event_context *ctx) +{ + struct perf_event *event; + struct hw_perf_event *hwc; + u64 interrupts, freq; + + spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + continue; + + hwc = &event->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + + /* + * unthrottle events on the tick + */ + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(event, 1); + event->pmu->unthrottle(event); + interrupts = 2*sysctl_perf_event_sample_rate/HZ; + } + + if (!event->attr.freq || !event->attr.sample_freq) + continue; + + /* + * if the specified freq < HZ then we need to skip ticks + */ + if (event->attr.sample_freq < HZ) { + freq = event->attr.sample_freq; + + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; + + if (hwc->freq_count < HZ) + continue; + + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; + + perf_adjust_period(event, freq * interrupts); + + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + event->pmu->disable(event); + atomic64_set(&hwc->period_left, 0); + event->pmu->enable(event); + perf_enable(); + } + } + spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's events: + */ +static void rotate_ctx(struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (!ctx->nr_events) + return; + + spin_lock(&ctx->lock); + /* + * Rotate the first entry last (works just fine for group events too): + */ + perf_disable(); + list_for_each_entry(event, &ctx->group_list, group_entry) { + list_move_tail(&event->group_entry, &ctx->group_list); + break; + } + perf_enable(); + + spin_unlock(&ctx->lock); +} + +void perf_event_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + + if (!atomic_read(&nr_events)) + return; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = curr->perf_event_ctxp; + + perf_ctx_adjust_freq(&cpuctx->ctx); + if (ctx) + perf_ctx_adjust_freq(ctx); + + perf_event_cpu_sched_out(cpuctx); + if (ctx) + __perf_event_task_sched_out(ctx); + + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + perf_event_cpu_sched_in(cpuctx, cpu); + if (ctx) + perf_event_task_sched_in(curr, cpu); +} + +/* + * Enable all of a task's events that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_event_enable_on_exec(struct task_struct *task) +{ + struct perf_event_context *ctx; + struct perf_event *event; + unsigned long flags; + int enabled = 0; + + local_irq_save(flags); + ctx = task->perf_event_ctxp; + if (!ctx || !ctx->nr_events) + goto out; + + __perf_event_task_sched_out(ctx); + + spin_lock(&ctx->lock); + + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (!event->attr.enable_on_exec) + continue; + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + continue; + __perf_event_mark_enabled(event, ctx); + enabled = 1; + } + + /* + * Unclone this context if we enabled any event. + */ + if (enabled) + unclone_ctx(ctx); + + spin_unlock(&ctx->lock); + + perf_event_task_sched_in(task, smp_processor_id()); + out: + local_irq_restore(flags); +} + +/* + * Cross CPU call to read the hardware event + */ +static void __perf_event_read(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * event->count would have been updated to a recent sample + * when the event was scheduled out. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + local_irq_save(flags); + if (ctx->is_active) + update_context_time(ctx); + event->pmu->read(event); + update_event_times(event); + local_irq_restore(flags); +} + +static u64 perf_event_read(struct perf_event *event) +{ + /* + * If event is enabled and currently active on a CPU, update the + * value in the event structure: + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + smp_call_function_single(event->oncpu, + __perf_event_read, event, 1); + } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_event_times(event); + } + + return atomic64_read(&event->count); +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void +__perf_event_init_context(struct perf_event_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + +static struct perf_event_context *find_get_context(pid_t pid, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + struct task_struct *task; + unsigned long flags; + int err; + + /* + * If cpu is not a wildcard then this is a percpu event: + */ + if (cpu != -1) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu > num_possible_cpus()) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_isset(cpu, cpu_online_map)) + return ERR_PTR(-ENODEV); + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + + return ctx; + } + + rcu_read_lock(); + if (!pid) + task = current; + else + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + /* + * Can't attach events to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + + /* Reuse ptrace permission checks for now. */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry: + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + unclone_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + err = -ENOMEM; + if (!ctx) + goto errout; + __perf_event_init_context(ctx, task); + get_ctx(ctx); + if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + goto retry; + } + get_task_struct(task); + } + + put_task_struct(task); + return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); +} + +static void free_event_rcu(struct rcu_head *head) +{ + struct perf_event *event; + + event = container_of(head, struct perf_event, rcu_head); + if (event->ns) + put_pid_ns(event->ns); + kfree(event); +} + +static void perf_pending_sync(struct perf_event *event); + +static void free_event(struct perf_event *event) +{ + perf_pending_sync(event); + + if (!event->parent) { + atomic_dec(&nr_events); + if (event->attr.mmap) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); + } + + if (event->output) { + fput(event->output->filp); + event->output = NULL; + } + + if (event->destroy) + event->destroy(event); + + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + struct perf_event_context *ctx = event->ctx; + + file->private_data = NULL; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_event_remove_from_context(event); + mutex_unlock(&ctx->mutex); + + mutex_lock(&event->owner->perf_event_mutex); + list_del_init(&event->owner_entry); + mutex_unlock(&event->owner->perf_event_mutex); + put_task_struct(event->owner); + + free_event(event); + + return 0; +} + +static int perf_event_read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_event_read_value(struct perf_event *event) +{ + struct perf_event *child; + u64 total = 0; + + total += perf_event_read(event); + list_for_each_entry(child, &event->child_list, child_list) + total += perf_event_read(child); + + return total; +} + +static int perf_event_read_entry(struct perf_event *event, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_event_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_event_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + err = perf_event_read_entry(sub, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_event_read_one(struct perf_event *event, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + +/* + * Read the performance event - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +{ + u64 read_format = event->attr.read_format; + int ret; + + /* + * Return end-of-file for a read on a event that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (event->state == PERF_EVENT_STATE_ERROR) + return 0; + + if (count < perf_event_read_size(event)) + return -ENOSPC; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_event_read_group(event, read_format, buf); + else + ret = perf_event_read_one(event, read_format, buf); + mutex_unlock(&event->child_mutex); + + return ret; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_event *event = file->private_data; + + return perf_read_hw(event, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_event *event = file->private_data; + struct perf_mmap_data *data; + unsigned int events = POLL_HUP; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (data) + events = atomic_xchg(&data->poll, 0); + rcu_read_unlock(); + + poll_wait(file, &event->waitq, wait); + + return events; +} + +static void perf_event_reset(struct perf_event *event) +{ + (void)perf_event_read(event); + atomic64_set(&event->count, 0); + perf_event_update_userpage(event); +} + +/* + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. + */ +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event *child; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) + func(child); + mutex_unlock(&event->child_mutex); +} + +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + event = event->group_leader; + + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); + mutex_unlock(&ctx->mutex); +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct perf_event_context *ctx = event->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!event->attr.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + +int perf_event_set_output(struct perf_event *event, int output_fd); + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); + u32 flags = arg; + + switch (cmd) { + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; + break; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; + break; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; + break; + + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); + + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); + + case PERF_EVENT_IOC_SET_OUTPUT: + return perf_event_set_output(event, arg); + + default: + return -ENOTTY; + } + + if (flags & PERF_IOC_FLAG_GROUP) + perf_event_for_each(event, func); + else + perf_event_for_each_child(event, func); + + return 0; +} + +int perf_event_task_enable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +int perf_event_task_disable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 +#endif + +static int perf_event_index(struct perf_event *event) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_event_update_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + userpg = data->user_page; + + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = perf_event_index(event); + userpg->offset = atomic64_read(&event->count); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= atomic64_read(&event->hw.prev_count); + + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; + + if ((unsigned)nr > data->nr_pages) + goto unlock; + + if (vmf->flags & FAULT_FLAG_WRITE) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); + } + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->nr_pages = nr_pages; + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, PAGE_SIZE * nr_pages, + event->attr.wakeup_watermark); + } + if (!data->watermark) + data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); + + rcu_assign_pointer(event->data, data); + + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + int i; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + + perf_mmap_free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + perf_mmap_free_page((unsigned long)data->data_pages[i]); + + kfree(data); +} + +static void perf_mmap_data_free(struct perf_event *event) +{ + struct perf_mmap_data *data = event->data; + + WARN_ON(atomic_read(&event->mmap_count)); + + rcu_assign_pointer(event->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + atomic_inc(&event->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + WARN_ON_ONCE(event->ctx->parent_ctx); + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->data->nr_locked; + perf_mmap_data_free(event); + mutex_unlock(&event->mmap_mutex); + } +} + +static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long user_locked, user_lock_limit; + struct user_struct *user = current_user(); + unsigned long locked, lock_limit; + unsigned long vma_size; + unsigned long nr_pages; + long user_extra, extra; + int ret = 0; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->output) { + ret = -EINVAL; + goto unlock; + } + + if (atomic_inc_not_zero(&event->mmap_count)) { + if (nr_pages != event->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; + + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(event->data); + ret = perf_mmap_data_alloc(event, nr_pages); + if (ret) + goto unlock; + + atomic_set(&event->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->locked_vm += extra; + event->data->nr_locked = extra; + if (vma->vm_flags & VM_WRITE) + event->data->writable = 1; + +unlock: + mutex_unlock(&event->mmap_mutex); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_event *event = filp->private_data; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &event->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf event wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_event_wakeup(struct perf_event *event) +{ + wake_up_all(&event->waitq); + + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; + } +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +static void perf_pending_event(struct perf_pending_entry *entry) +{ + struct perf_event *event = container_of(entry, + struct perf_event, pending); + + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); + } + + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); + } +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct perf_pending_entry **head; + + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) + return; + + entry->func = func; + + head = &get_cpu_var(perf_pending_head); + + do { + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); + + set_perf_event_pending(); + + put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ + struct perf_pending_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; + + list = list->next; + + func = entry->func; + entry->next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + func(entry); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_event *event) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return event->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_event *event) +{ + wait_event(event->waitq, perf_not_pending(event)); +} + +void perf_event_do_pending(void) +{ + __perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + +/* + * Output + */ +static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!data->writable) + return true; + + mask = (data->nr_pages << PAGE_SHIFT) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->data->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + perf_pending_queue(&handle->event->pending, + perf_pending_event); + } else + perf_event_wakeup(handle->event); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event_id doesn't publish a head when a former + * event_id isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event_id completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + unsigned long head; + int cpu; + + data->done_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_long_xchg(&data->done_head, 0))) + data->user_page->data_head = head; + + /* + * NMI can happen here, which means we can miss a done_head update. + */ + + cpu = atomic_xchg(&data->lock, -1); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_long_read(&data->done_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + goto again; + } + + if (atomic_xchg(&data->wakeup, 0)) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + +void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct perf_event *output_event; + struct perf_mmap_data *data; + unsigned long tail, offset, head; + int have_lost; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + output_event = rcu_dereference(event->output); + if (output_event) + event = output_event; + + data = rcu_dereference(event->data); + if (!data) + goto out; + + handle->data = data; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!data->nr_pages) + goto fail; + + have_lost = atomic_read(&data->lost); + if (have_lost) + size += sizeof(lost_event); + + perf_output_lock(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); + offset = head = atomic_long_read(&data->head); + head += size; + if (unlikely(!perf_output_space(data, tail, offset, head))) + goto fail; + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + + handle->offset = offset; + handle->head = head; + + if (head - tail > data->watermark) + atomic_set(&data->wakeup, 1); + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.header.size = sizeof(lost_event); + lost_event.id = event->id; + lost_event.lost = atomic_xchg(&data->lost, 0); + + perf_output_put(handle, lost_event); + } + + return 0; + +fail: + atomic_inc(&data->lost); + perf_output_unlock(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = atomic_inc_return(&data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &data->events); + atomic_set(&data->wakeup, 1); + } + } + + perf_output_unlock(handle); + rcu_read_unlock(); +} + +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_event *event) +{ + u64 read_format = event->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&event->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != event) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + if (sub != event) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_event *event) +{ + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event); + else + perf_output_read_one(handle, event); +} + +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header); + + header->misc = 0; + header->misc |= perf_misc_flags(regs); + + if (sample_type & PERF_SAMPLE_IP) { + data->ip = perf_instruction_pointer(regs); + + header->size += sizeof(data->ip); + } + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + + header->size += sizeof(data->tid_entry); + } + + if (sample_type & PERF_SAMPLE_TIME) { + data->time = perf_clock(); + + header->size += sizeof(data->time); + } + + if (sample_type & PERF_SAMPLE_ADDR) + header->size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_ID) { + data->id = primary_event_id(event); + + header->size += sizeof(data->id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = event->id; + + header->size += sizeof(data->stream_id); + } + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + + header->size += sizeof(data->cpu_entry); + } + + if (sample_type & PERF_SAMPLE_PERIOD) + header->size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + header->size += perf_event_read_size(event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + int size = 1; + + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_RAW) { + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header->size += size; + } +} + +static void perf_event_output(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; + + perf_prepare_sample(&header, data, event, regs); + + if (perf_output_begin(&handle, event, header.size, nmi, 1)) + return; + + perf_output_sample(&handle, &header, data, event); + + perf_output_end(&handle); +} + +/* + * read event_id + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; +}; + +static void +perf_event_read_event(struct perf_event *event, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_read_event read_event = { + .header = { + .type = PERF_RECORD_READ, + .misc = 0, + .size = sizeof(read_event) + perf_event_read_size(event), + }, + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), + }; + int ret; + + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + if (ret) + return; + + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); + + perf_output_end(&handle); +} + +/* + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task + */ + +struct perf_task_event { + struct task_struct *task; + struct perf_event_context *task_ctx; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + u32 tid; + u32 ptid; + u64 time; + } event_id; +}; + +static void perf_event_task_output(struct perf_event *event, + struct perf_task_event *task_event) +{ + struct perf_output_handle handle; + int size; + struct task_struct *task = task_event->task; + int ret; + + size = task_event->event_id.header.size; + ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); + + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); + + task_event->event_id.time = perf_clock(); + + perf_output_put(&handle, task_event->event_id); + + perf_output_end(&handle); +} + +static int perf_event_task_match(struct perf_event *event) +{ + if (event->attr.comm || event->attr.mmap || event->attr.task) + return 1; + + return 0; +} + +static void perf_event_task_ctx(struct perf_event_context *ctx, + struct perf_task_event *task_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); + } + rcu_read_unlock(); +} + +static void perf_event_task_event(struct perf_task_event *task_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx = task_event->task_ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_task_ctx(&cpuctx->ctx, task_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_event_ctxp); + if (ctx) + perf_event_task_ctx(ctx, task_event); + rcu_read_unlock(); +} + +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, + int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .task_ctx = task_ctx, + .event_id = { + .header = { + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, + .misc = 0, + .size = sizeof(task_event.event_id), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + }, + }; + + perf_event_task_event(&task_event); +} + +void perf_event_fork(struct task_struct *task) +{ + perf_event_task(task, NULL, 1); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event_id; +}; + +static void perf_event_comm_output(struct perf_event *event, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); + + perf_output_put(&handle, comm_event->event_id); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_event_comm_match(struct perf_event *event) +{ + if (event->attr.comm) + return 1; + + return 0; +} + +static void perf_event_comm_ctx(struct perf_event_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); + } + rcu_read_unlock(); +} + +static void perf_event_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + unsigned int size; + char comm[TASK_COMM_LEN]; + + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); + rcu_read_unlock(); +} + +void perf_event_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + + if (task->perf_event_ctxp) + perf_event_enable_on_exec(task); + + if (!atomic_read(&nr_comm_events)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + /* .comm */ + /* .comm_size */ + .event_id = { + .header = { + .type = PERF_RECORD_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + }, + }; + + perf_event_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct vm_area_struct *vma; + + const char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event_id; +}; + +static void perf_event_mmap_output(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); + + perf_output_put(&handle, mmap_event->event_id); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle); +} + +static int perf_event_mmap_match(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + if (event->attr.mmap) + return 1; + + return 0; +} + +static void perf_event_mmap_ctx(struct perf_event_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event)) + perf_event_mmap_output(event, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + const char *name; + + memset(tmp, 0, sizeof(tmp)); + + if (file) { + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); + goto got_name; + } + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_mmap_ctx(ctx, mmap_event); + rcu_read_unlock(); + + kfree(buf); +} + +void __perf_event_mmap(struct vm_area_struct *vma) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_events)) + return; + + mmap_event = (struct perf_mmap_event){ + .vma = vma, + /* .file_name */ + /* .file_size */ + .event_id = { + .header = { + .type = PERF_RECORD_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, + }, + }; + + perf_event_mmap_event(&mmap_event); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_event *event, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; + } throttle_event = { + .header = { + .type = PERF_RECORD_THROTTLE, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = perf_clock(), + .id = primary_event_id(event), + .stream_id = event->id, + }; + + if (enable) + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; + + ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, int nmi, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; + int ret = 0; + + throttle = (throttle && event->pmu->unthrottle != NULL); + + if (!throttle) { + hwc->interrupts++; + } else { + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > + (u64)sysctl_perf_event_sample_rate) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling events even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the event got enabled again: + */ + ret = 1; + } + } + + if (event->attr.freq) { + u64 now = perf_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + } + + /* + * XXX event_limit might not quite work as expected on inherited + * events + */ + + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { + ret = 1; + event->pending_kill = POLL_HUP; + if (nmi) { + event->pending_disable = 1; + perf_pending_queue(&event->pending, + perf_pending_event); + } else + perf_event_disable(event); + } + + perf_event_output(event, nmi, data, regs); + return ret; +} + +int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return __perf_event_overflow(event, nmi, 1, data, regs); +} + +/* + * Generic software event infrastructure + */ + +/* + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swevent_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; + +again: + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; + + return nr; +} + +static void perf_swevent_overflow(struct perf_event *event, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + int throttle = 0; + u64 overflow; + + data->period = event->hw.last_period; + overflow = perf_swevent_set_period(event); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (__perf_event_overflow(event, nmi, throttle, + data, regs)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + throttle = 1; + } +} + +static void perf_swevent_unthrottle(struct perf_event *event) +{ + /* + * Nothing to do, we already reset hwc->interrupts. + */ +} + +static void perf_swevent_add(struct perf_event *event, u64 nr, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + + atomic64_add(nr, &event->count); + + if (!hwc->sample_period) + return; + + if (!regs) + return; + + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swevent_overflow(event, nmi, data, regs); +} + +static int perf_swevent_is_counting(struct perf_event *event) +{ + /* + * The event is active, we're good! + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) + return 1; + + /* + * The event is off/error, not counting. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE) + return 0; + + /* + * The event is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. + */ + if (event->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; +} + +static int perf_swevent_match(struct perf_event *event, + enum perf_type_id type, + u32 event_id, struct pt_regs *regs) +{ + if (!perf_swevent_is_counting(event)) + return 0; + + if (event->attr.type != type) + return 0; + if (event->attr.config != event_id) + return 0; + + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 0; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 0; + } + + return 1; +} + +static void perf_swevent_ctx_event(struct perf_event_context *ctx, + enum perf_type_id type, + u32 event_id, u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_swevent_match(event, type, event_id, regs)) + perf_swevent_add(event, nr, nmi, data, regs); + } + rcu_read_unlock(); +} + +static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_event_context *ctx; + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); + + perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, + nr, nmi, data, regs); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); + rcu_read_unlock(); + + barrier(); + (*recursion)--; + +out: + put_cpu_var(perf_cpu_context); +} + +void __perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) +{ + struct perf_sample_data data = { + .addr = addr, + }; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, + &data, regs); +} + +static void perf_swevent_read(struct perf_event *event) +{ +} + +static int perf_swevent_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(event); + } + return 0; +} + +static void perf_swevent_disable(struct perf_event *event) +{ +} + +static const struct pmu perf_ops_generic = { + .enable = perf_swevent_enable, + .disable = perf_swevent_disable, + .read = perf_swevent_read, + .unthrottle = perf_swevent_unthrottle, +}; + +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); + + data.addr = 0; + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((event->attr.exclude_kernel || !regs) && + !event->attr.exclude_user) + regs = task_pt_regs(current); + + if (regs) { + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_perf_event_update(struct perf_event *event) +{ + int cpu = raw_smp_processor_id(); + s64 prev; + u64 now; + + now = cpu_clock(cpu); + prev = atomic64_read(&event->hw.prev_count); + atomic64_set(&event->hw.prev_count, now); + atomic64_add(now - prev, &event->count); +} + +static int cpu_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void cpu_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + cpu_clock_perf_event_update(event); +} + +static void cpu_clock_perf_event_read(struct perf_event *event) +{ + cpu_clock_perf_event_update(event); +} + +static const struct pmu perf_ops_cpu_clock = { + .enable = cpu_clock_perf_event_enable, + .disable = cpu_clock_perf_event_disable, + .read = cpu_clock_perf_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_perf_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = atomic64_xchg(&event->hw.prev_count, now); + delta = now - prev; + atomic64_add(delta, &event->count); +} + +static int task_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 now; + + now = event->ctx->time; + + atomic64_set(&hwc->prev_count, now); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void task_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + task_clock_perf_event_update(event, event->ctx->time); + +} + +static void task_clock_perf_event_read(struct perf_event *event) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(event->ctx); + time = event->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; + } + + task_clock_perf_event_update(event, time); +} + +static const struct pmu perf_ops_task_clock = { + .enable = task_clock_perf_event_enable, + .disable = task_clock_perf_event_disable, + .read = task_clock_perf_event_read, +}; + +#ifdef CONFIG_EVENT_PROFILE +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) +{ + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + struct perf_sample_data data = { + .addr = addr, + .raw = &raw, + }; + + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_event_destroy(struct perf_event *event) +{ + ftrace_profile_disable(event->attr.config); +} + +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && + perf_paranoid_tracepoint_raw() && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (ftrace_profile_enable(event->attr.config)) + return NULL; + + event->destroy = tp_perf_event_destroy; + + return &perf_ops_generic; +} +#else +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + return NULL; +} +#endif + +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + atomic_dec(&perf_swevent_enabled[event_id]); +} + +static const struct pmu *sw_perf_event_init(struct perf_event *event) +{ + const struct pmu *pmu = NULL; + u64 event_id = event->attr.config; + + /* + * Software events (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_TASK_CLOCK: + /* + * If the user instantiates this as a per-cpu event, + * use the cpu_clock event instead. + */ + if (event->ctx->task) + pmu = &perf_ops_task_clock; + else + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: + if (!event->parent) { + atomic_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + pmu = &perf_ops_generic; + break; + } + + return pmu; +} + +/* + * Allocate and initialize a event structure + */ +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, + int cpu, + struct perf_event_context *ctx, + struct perf_event *group_leader, + struct perf_event *parent_event, + gfp_t gfpflags) +{ + const struct pmu *pmu; + struct perf_event *event; + struct hw_perf_event *hwc; + long err; + + event = kzalloc(sizeof(*event), gfpflags); + if (!event) + return ERR_PTR(-ENOMEM); + + /* + * Single events are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = event; + + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); + + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); + + mutex_init(&event->mmap_mutex); + + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->ctx = ctx; + event->oncpu = -1; + + event->parent = parent_event; + + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); + + event->state = PERF_EVENT_STATE_INACTIVE; + + if (attr->disabled) + event->state = PERF_EVENT_STATE_OFF; + + pmu = NULL; + + hwc = &event->hw; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; + + atomic64_set(&hwc->period_left, hwc->sample_period); + + /* + * we currently do not support PERF_FORMAT_GROUP on inherited events + */ + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + goto done; + + switch (attr->type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + pmu = hw_perf_event_init(event); + break; + + case PERF_TYPE_SOFTWARE: + pmu = sw_perf_event_init(event); + break; + + case PERF_TYPE_TRACEPOINT: + pmu = tp_perf_event_init(event); + break; + + default: + break; + } +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); + + if (err) { + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + return ERR_PTR(err); + } + + event->pmu = pmu; + + if (!event->parent) { + atomic_inc(&nr_events); + if (event->attr.mmap) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + } + + return event; +} + +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + +int perf_event_set_output(struct perf_event *event, int output_fd) +{ + struct perf_event *output_event = NULL; + struct file *output_file = NULL; + struct perf_event *old_output; + int fput_needed = 0; + int ret = -EINVAL; + + if (!output_fd) + goto set; + + output_file = fget_light(output_fd, &fput_needed); + if (!output_file) + return -EBADF; + + if (output_file->f_op != &perf_fops) + goto out; + + output_event = output_file->private_data; + + /* Don't chain output fds */ + if (output_event->output) + goto out; + + /* Don't set an output fd when we already have an output channel */ + if (event->data) + goto out; + + atomic_long_inc(&output_file->f_count); + +set: + mutex_lock(&event->mmap_mutex); + old_output = event->output; + rcu_assign_pointer(event->output, output_event); + mutex_unlock(&event->mmap_mutex); + + if (old_output) { + /* + * we need to make sure no existing perf_output_*() + * is still referencing this event. + */ + synchronize_rcu(); + fput(old_output->filp); + } + + ret = 0; +out: + fput_light(output_file, fput_needed); + return ret; +} + +/** + * sys_perf_event_open - open a performance event, associate it to a task/cpu + * + * @attr_uptr: event_id type attributes for monitoring/sampling + * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader event fd + */ +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ + struct perf_event *event, *group_leader; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; + struct file *group_file = NULL; + int fput_needed = 0; + int fput_needed2 = 0; + int err; + + /* for future expandability... */ + if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + return -EINVAL; + + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_event_sample_rate) + return -EINVAL; + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + /* + * Look up the group leader (we will attach this event to it): + */ + group_leader = NULL; + if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + err = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto err_put_context; + if (group_file->f_op != &perf_fops) + goto err_put_context; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_put_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (group_leader->ctx != ctx) + goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (attr.exclusive || attr.pinned) + goto err_put_context; + } + + event = perf_event_alloc(&attr, cpu, ctx, group_leader, + NULL, GFP_KERNEL); + err = PTR_ERR(event); + if (IS_ERR(event)) + goto err_put_context; + + err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); + if (err < 0) + goto err_free_put_context; + + event_file = fget_light(err, &fput_needed2); + if (!event_file) + goto err_free_put_context; + + if (flags & PERF_FLAG_FD_OUTPUT) { + err = perf_event_set_output(event, group_fd); + if (err) + goto err_fput_free_put_context; + } + + event->filp = event_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + +err_fput_free_put_context: + fput_light(event_file, fput_needed2); + +err_free_put_context: + if (err < 0) + kfree(event); + +err_put_context: + if (err < 0) + put_ctx(ctx); + + fput_light(group_file, fput_needed); + + return err; +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, child_ctx, + group_leader, parent_event, + GFP_KERNEL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) + child_event->hw.sample_period = parent_event->hw.sample_period; + + /* + * Link it up in the child's context: + */ + add_event_to_ctx(child_event, child_ctx); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + +static void sync_child_event(struct perf_event *child_event, + struct task_struct *child) +{ + struct perf_event *parent_event = child_event->parent; + u64 child_val; + + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); + + child_val = atomic64_read(&child_event->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_event->count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Release the parent event, if this was the last + * reference to it. + */ + fput(parent_event->filp); +} + +static void +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) +{ + struct perf_event *parent_event; + + update_event_times(child_event); + perf_event_remove_from_context(child_event); + + parent_event = child_event->parent; + /* + * It can happen that parent exits first, and has events + * that are still around due to the child reference. These + * events need to be zapped - but otherwise linger. + */ + if (parent_event) { + sync_child_event(child_event, child); + free_event(child_event); + } +} + +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; + unsigned long flags; + + if (likely(!child->perf_event_ctxp)) { + perf_event_task(child, NULL, 0); + return; + } + + local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = child->perf_event_ctxp; + __perf_event_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_event_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); + child->perf_event_ctxp = NULL; + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the events from it. + */ + unclone_ctx(child_ctx); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. + */ + perf_event_task(child, child_ctx, 0); + + /* + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + +again: + list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + /* + * If the last event was a group event, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->group_list)) + goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); +} + +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_event_free_task(struct task_struct *task) +{ + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event *event, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_event(event, ctx); + free_event(event); + } + + if (!list_empty(&ctx->group_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; + struct task_struct *parent = current; + int inherited_all = 1; + int ret = 0; + + child->perf_event_ctxp = NULL; + + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + + if (likely(!parent->perf_event_ctxp)) + return 0; + + /* + * This is executed from the parent task context, so inherit + * events that have been marked for cloning. + * First allocate and initialize a context for the child. + */ + + child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + mutex_lock(&parent_ctx->mutex); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { + if (event != event->group_leader) + continue; + + if (!event->attr.inherit) { + inherited_all = 0; + continue; + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + if (ret) { + inherited_all = 0; + break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + * Note that if the parent is a clone, it could get + * uncloned at any point, but that doesn't matter + * because the list of events and the generation + * count can't have changed since we took the mutex. + */ + cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); + } + + mutex_unlock(&parent_ctx->mutex); + + perf_unpin_context(parent_ctx); + + return ret; +} + +static void __cpuinit perf_event_init_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx, NULL); + + spin_lock(&perf_resource_lock); + cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; + spin_unlock(&perf_resource_lock); + + hw_perf_event_setup(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_event_exit_cpu(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_event *event, *tmp; + + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + __perf_event_remove_from_context(event); +} +static void perf_event_exit_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); +} +#else +static inline void perf_event_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + perf_event_init_cpu(cpu); + break; + + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_event_setup_online(cpu); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + perf_event_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ +static struct notifier_block __cpuinitdata perf_cpu_nb = { + .notifier_call = perf_cpu_notify, + .priority = 20, +}; + +void __init perf_event_init(void) +{ + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&perf_cpu_nb); +} + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, + const char *buf, + size_t count) +{ + struct perf_cpu_context *cpuctx; + unsigned long val; + int err, cpu, mpt; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > perf_max_events) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_reserved_percpu = val; + for_each_online_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_irq(&cpuctx->ctx.lock); + mpt = min(perf_max_events - cpuctx->ctx.nr_events, + perf_max_events - perf_reserved_percpu); + cpuctx->max_pertask = mpt; + spin_unlock_irq(&cpuctx->ctx.lock); + } + spin_unlock(&perf_resource_lock); + + return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ + unsigned long val; + int err; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > 1) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_overcommit = val; + spin_unlock(&perf_resource_lock); + + return count; +} + +static SYSDEV_CLASS_ATTR( + reserve_percpu, + 0644, + perf_show_reserve_percpu, + perf_set_reserve_percpu + ); + +static SYSDEV_CLASS_ATTR( + overcommit, + 0644, + perf_show_overcommit, + perf_set_overcommit + ); + +static struct attribute *perfclass_attrs[] = { + &attr_reserve_percpu.attr, + &attr_overcommit.attr, + NULL +}; + +static struct attribute_group perfclass_attr_group = { + .attrs = perfclass_attrs, + .name = "perf_events", +}; + +static int __init perf_event_sysfs_init(void) +{ + return sysfs_create_group(&cpu_sysdev_class.kset.kobj, + &perfclass_attr_group); +} +device_initcall(perf_event_sysfs_init); diff --git a/kernel/pid.c b/kernel/pid.c index 31310b5..d3f722d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -40,7 +40,7 @@ #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) static struct hlist_head *pid_hash; -static int pidhash_shift; +static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) void __init pidhash_init(void) { int i, pidhash_size; - unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); - pidhash_shift = max(4, fls(megabytes * 4)); - pidhash_shift = min(12, pidhash_shift); + pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, + HASH_EARLY | HASH_SMALL, + &pidhash_shift, NULL, 4096); pidhash_size = 1 << pidhash_shift; - printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", - pidhash_size, pidhash_shift, - pidhash_size * sizeof(struct hlist_head)); - - pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); - if (!pid_hash) - panic("Could not alloc pidhash!\n"); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 821722a..86b3796 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & CLONE_THREAD) + if (flags & (CLONE_THREAD|CLONE_PARENT)) return ERR_PTR(-EINVAL); return create_pid_namespace(old_ns); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e33a21c..5c9dc22 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -8,17 +8,18 @@ #include <linux/math64.h> #include <asm/uaccess.h> #include <linux/kernel_stat.h> +#include <trace/events/timer.h> /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. */ void update_rlimit_cpu(unsigned long rlim_new) { - cputime_t cputime; + cputime_t cputime = secs_to_cputime(rlim_new); + struct signal_struct *const sig = current->signal; - cputime = secs_to_cputime(rlim_new); - if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - cputime_gt(current->signal->it_prof_expires, cputime)) { + if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || + cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { spin_lock_irq(¤t->sighand->siglock); set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); @@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } +static inline int expires_gt(cputime_t expires, cputime_t new_exp) +{ + return cputime_eq(expires, cputime_zero) || + cputime_gt(expires, new_exp); +} + +static inline int expires_le(cputime_t expires, cputime_t new_exp) +{ + return !cputime_eq(expires, cputime_zero) && + cputime_le(expires, new_exp); +} /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + union cpu_time_count *exp = &nt->expires; + switch (CPUCLOCK_WHICH(timer->it_clock)) { default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->cputime_expires.prof_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.prof_exp, - nt->expires.cpu)) - p->cputime_expires.prof_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.prof_exp, + exp->cpu)) + p->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->cputime_expires.virt_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.virt_exp, - nt->expires.cpu)) - p->cputime_expires.virt_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.virt_exp, + exp->cpu)) + p->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_SCHED: if (p->cputime_expires.sched_exp == 0 || - p->cputime_expires.sched_exp > - nt->expires.sched) + p->cputime_expires.sched_exp > exp->sched) p->cputime_expires.sched_exp = - nt->expires.sched; + exp->sched; break; } } else { + struct signal_struct *const sig = p->signal; + union cpu_time_count *exp = &timer->it.cpu.expires; + /* * For a process timer, set the cached expiration time. */ @@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_VIRT: - if (!cputime_eq(p->signal->it_virt_expires, - cputime_zero) && - cputime_lt(p->signal->it_virt_expires, - timer->it.cpu.expires.cpu)) + if (expires_le(sig->it[CPUCLOCK_VIRT].expires, + exp->cpu)) break; - p->signal->cputime_expires.virt_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_PROF: - if (!cputime_eq(p->signal->it_prof_expires, - cputime_zero) && - cputime_lt(p->signal->it_prof_expires, - timer->it.cpu.expires.cpu)) + if (expires_le(sig->it[CPUCLOCK_PROF].expires, + exp->cpu)) break; - i = p->signal->rlim[RLIMIT_CPU].rlim_cur; + i = sig->rlim[RLIMIT_CPU].rlim_cur; if (i != RLIM_INFINITY && - i <= cputime_to_secs(timer->it.cpu.expires.cpu)) + i <= cputime_to_secs(exp->cpu)) break; - p->signal->cputime_expires.prof_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_SCHED: - p->signal->cputime_expires.sched_exp = - timer->it.cpu.expires.sched; + sig->cputime_expires.sched_exp = exp->sched; break; } } @@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk) spin_unlock_irqrestore(&cputimer->lock, flags); } +static u32 onecputick; + +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + cputime_t *expires, cputime_t cur_time, int signo) +{ + if (cputime_eq(it->expires, cputime_zero)) + return; + + if (cputime_ge(cur_time, it->expires)) { + if (!cputime_eq(it->incr, cputime_zero)) { + it->expires = cputime_add(it->expires, it->incr); + it->error += it->incr_error; + if (it->error >= onecputick) { + it->expires = cputime_sub(it->expires, + cputime_one_jiffy); + it->error -= onecputick; + } + } else { + it->expires = cputime_zero; + } + + trace_itimer_expire(signo == SIGPROF ? + ITIMER_PROF : ITIMER_VIRTUAL, + tsk->signal->leader_pid, cur_time); + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (!cputime_eq(it->expires, cputime_zero) && + (cputime_eq(*expires, cputime_zero) || + cputime_lt(it->expires, *expires))) { + *expires = it->expires; + } +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk, * Don't sample the current process CPU clocks if there are no timers. */ if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it_prof_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it_virt_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) { stop_process_timers(tsk); return; @@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk, /* * Check for the special case process timers. */ - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - if (cputime_ge(ptime, sig->it_prof_expires)) { - /* ITIMER_PROF fires and reloads. */ - sig->it_prof_expires = sig->it_prof_incr; - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - sig->it_prof_expires = cputime_add( - sig->it_prof_expires, ptime); - } - __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_prof_expires, cputime_zero) && - (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(sig->it_prof_expires, prof_expires))) { - prof_expires = sig->it_prof_expires; - } - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - if (cputime_ge(utime, sig->it_virt_expires)) { - /* ITIMER_VIRTUAL fires and reloads. */ - sig->it_virt_expires = sig->it_virt_incr; - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - sig->it_virt_expires = cputime_add( - sig->it_virt_expires, utime); - } - __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero) && - (cputime_eq(virt_expires, cputime_zero) || - cputime_lt(sig->it_virt_expires, virt_expires))) { - virt_expires = sig->it_virt_expires; - } - } + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); cputime_t x; @@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. */ - *oldval = jiffies_to_cputime(1); + *oldval = cputime_one_jiffy; } else { *oldval = cputime_sub(*oldval, now.cpu); } @@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void) .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + struct timespec ts; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + cputime_to_timespec(cputime_one_jiffy, &ts); + onecputick = ts.tv_nsec; + WARN_ON(ts.tv_sec != 0); + return 0; } __initcall(init_posix_cpu_timers); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d089d05..4954407 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) return 0; } + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = ktime_to_timespec(KTIME_LOW_RES); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -262,10 +281,26 @@ static __init int init_posix_timers(void) .timer_create = no_timer_create, .nsleep = no_nsleep, }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 72067cb..91e09d3 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -208,3 +208,17 @@ config APM_EMULATION random kernel OOPSes or reboots that don't seem to be related to anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on PM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. diff --git a/kernel/power/console.c b/kernel/power/console.c index a3961b20..5187136 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -14,56 +14,13 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; -static int disable_vt_switch; - -/* - * Normally during a suspend, we allocate a new console and switch to it. - * When we resume, we switch back to the original console. This switch - * can be slow, so on systems where the framebuffer can handle restoration - * of video registers anyways, there's little point in doing the console - * switch. This function allows you to disable it by passing it '0'. - */ -void pm_set_vt_switch(int do_switch) -{ - acquire_console_sem(); - disable_vt_switch = !do_switch; - release_console_sem(); -} -EXPORT_SYMBOL(pm_set_vt_switch); int pm_prepare_console(void) { - acquire_console_sem(); - - if (disable_vt_switch) { - release_console_sem(); - return 0; - } - - orig_fgconsole = fg_console; - - if (vc_allocate(SUSPEND_CONSOLE)) { - /* we can't have a free VC for now. Too bad, - * we don't want to mess the screen for now. */ - release_console_sem(); + orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); + if (orig_fgconsole < 0) return 1; - } - if (set_console(SUSPEND_CONSOLE)) { - /* - * We're unable to switch to the SUSPEND_CONSOLE. - * Let the calling function know so it can decide - * what to do. - */ - release_console_sem(); - return 1; - } - release_console_sem(); - - if (vt_waitactive(SUSPEND_CONSOLE)) { - pr_debug("Suspend: Can't switch VCs."); - return 1; - } orig_kmsg = kmsg_redirect; kmsg_redirect = SUSPEND_CONSOLE; return 0; @@ -71,19 +28,9 @@ int pm_prepare_console(void) void pm_restore_console(void) { - acquire_console_sem(); - if (disable_vt_switch) { - release_console_sem(); - return; - } - set_console(orig_fgconsole); - release_console_sem(); - - if (vt_waitactive(orig_fgconsole)) { - pr_debug("Resume: Can't switch VCs."); - return; + if (orig_fgconsole >= 0) { + vt_move_to_console(orig_fgconsole, 0); + kmsg_redirect = orig_kmsg; } - - kmsg_redirect = orig_kmsg; } #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 81d2e74..04b3a83 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode) if (error) return error; - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); + /* Preallocate image memory before shutting down devices. */ + error = hibernate_preallocate_memory(); if (error) goto Close; @@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode) /* Control returns here after successful restore */ Resume_devices: + /* We may need to release the preallocated image pages here. */ + if (error || !in_suspend) + swsusp_free(); + dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); resume_console(); @@ -460,11 +464,11 @@ int hibernation_platform_enter(void) error = hibernation_ops->prepare(); if (error) - goto Platofrm_finish; + goto Platform_finish; error = disable_nonboot_cpus(); if (error) - goto Platofrm_finish; + goto Platform_finish; local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); @@ -476,7 +480,7 @@ int hibernation_platform_enter(void) * We don't need to reenable the nonboot CPUs or resume consoles, since * the system is going to be halted anyway. */ - Platofrm_finish: + Platform_finish: hibernation_ops->finish(); dpm_suspend_noirq(PMSG_RESTORE); @@ -578,7 +582,10 @@ int hibernate(void) goto Thaw; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (in_suspend && !error) { + if (error) + goto Thaw; + + if (in_suspend) { unsigned int flags = 0; if (hibernation_mode == HIBERNATION_PLATFORM) @@ -590,8 +597,8 @@ int hibernate(void) power_down(); } else { pr_debug("PM: Image restored successfully.\n"); - swsusp_free(); } + Thaw: thaw_processes(); Finish: diff --git a/kernel/power/main.c b/kernel/power/main.c index f710e36..347d2cc 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -11,6 +11,7 @@ #include <linux/kobject.h> #include <linux/string.h> #include <linux/resume-trace.h> +#include <linux/workqueue.h> #include "power.h" @@ -217,8 +218,24 @@ static struct attribute_group attr_group = { .attrs = g, }; +#ifdef CONFIG_PM_RUNTIME +struct workqueue_struct *pm_wq; + +static int __init pm_start_workqueue(void) +{ + pm_wq = create_freezeable_workqueue("pm"); + + return pm_wq ? 0 : -ENOMEM; +} +#else +static inline int pm_start_workqueue(void) { return 0; } +#endif + static int __init pm_init(void) { + int error = pm_start_workqueue(); + if (error) + return error; power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; diff --git a/kernel/power/power.h b/kernel/power/power.h index 26d5a26..46c5a26 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void); extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); -extern int swsusp_shrink_memory(void); +extern int hibernate_preallocate_memory(void); /** * Auxiliary structure used for reading the snapshot image data and diff --git a/kernel/power/process.c b/kernel/power/process.c index da2072d..cc2e553 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -9,6 +9,7 @@ #undef DEBUG #include <linux/interrupt.h> +#include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/syscalls.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 523a451..36cb168 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) #define BM_END_OF_MAP (~0UL) -#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) +#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) struct bm_block { struct list_head hook; /* hook into a list of bitmap blocks */ @@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects - * @nr_blocks - number of blocks to allocate + * @pages - number of pages to track * @list - list to put the allocated blocks into * @ca - chain allocator to be used for allocating memory */ @@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); + region = alloc_bootmem(sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); @@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void) struct zone *zone; unsigned int n = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long pfn, max_zone_pfn; if (!is_highmem(zone)) @@ -916,7 +916,7 @@ static unsigned int count_data_pages(void) unsigned long pfn, max_zone_pfn; unsigned int n = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { if (is_highmem(zone)) continue; @@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) struct zone *zone; unsigned long pfn; - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long max_zone_pfn; mark_free_pages(zone); @@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) static unsigned int nr_copy_pages; /* Number of pages needed for saving the original pfns of the image pages */ static unsigned int nr_meta_pages; +/* + * Numbers of normal and highmem page frames allocated for hibernation image + * before suspending devices. + */ +unsigned int alloc_normal, alloc_highmem; +/* + * Memory bitmap used for marking saveable pages (during hibernation) or + * hibernation image pages (during restore) + */ +static struct memory_bitmap orig_bm; +/* + * Memory bitmap used during hibernation for marking allocated page frames that + * will contain copies of saveable pages. During restore it is initially used + * for marking hibernation image pages, but then the set bits from it are + * duplicated in @orig_bm and it is released. On highmem systems it is next + * used for marking "safe" highmem pages, but it has to be reinitialized for + * this purpose. + */ +static struct memory_bitmap copy_bm; /** * swsusp_free - free pages allocated for the suspend. @@ -1046,7 +1065,7 @@ void swsusp_free(void) struct zone *zone; unsigned long pfn, max_zone_pfn; - for_each_zone(zone) { + for_each_populated_zone(zone) { max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { @@ -1064,74 +1083,286 @@ void swsusp_free(void) nr_meta_pages = 0; restore_pblist = NULL; buffer = NULL; + alloc_normal = 0; + alloc_highmem = 0; } +/* Helper functions used for the shrinking of memory. */ + +#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) + /** - * swsusp_shrink_memory - Try to free as much memory as needed - * - * ... but do not OOM-kill anyone + * preallocate_image_pages - Allocate a number of pages for hibernation image + * @nr_pages: Number of page frames to allocate. + * @mask: GFP flags to use for the allocation. * - * Notice: all userland should be stopped before it is called, or - * livelock is possible. + * Return value: Number of page frames actually allocated + */ +static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) +{ + unsigned long nr_alloc = 0; + + while (nr_pages > 0) { + struct page *page; + + page = alloc_image_page(mask); + if (!page) + break; + memory_bm_set_bit(©_bm, page_to_pfn(page)); + if (PageHighMem(page)) + alloc_highmem++; + else + alloc_normal++; + nr_pages--; + nr_alloc++; + } + + return nr_alloc; +} + +static unsigned long preallocate_image_memory(unsigned long nr_pages) +{ + return preallocate_image_pages(nr_pages, GFP_IMAGE); +} + +#ifdef CONFIG_HIGHMEM +static unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM); +} + +/** + * __fraction - Compute (an approximation of) x * (multiplier / base) */ +static unsigned long __fraction(u64 x, u64 multiplier, u64 base) +{ + x *= multiplier; + do_div(x, base); + return (unsigned long)x; +} + +static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + unsigned long alloc = __fraction(nr_pages, highmem, total); -#define SHRINK_BITE 10000 -static inline unsigned long __shrink_memory(long tmp) + return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM); +} +#else /* CONFIG_HIGHMEM */ +static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) { - if (tmp > SHRINK_BITE) - tmp = SHRINK_BITE; - return shrink_all_memory(tmp); + return 0; } -int swsusp_shrink_memory(void) +static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + return 0; +} +#endif /* CONFIG_HIGHMEM */ + +/** + * free_unnecessary_pages - Release preallocated pages not needed for the image + */ +static void free_unnecessary_pages(void) +{ + unsigned long save_highmem, to_free_normal, to_free_highmem; + + to_free_normal = alloc_normal - count_data_pages(); + save_highmem = count_highmem_pages(); + if (alloc_highmem > save_highmem) { + to_free_highmem = alloc_highmem - save_highmem; + } else { + to_free_highmem = 0; + to_free_normal -= save_highmem - alloc_highmem; + } + + memory_bm_position_reset(©_bm); + + while (to_free_normal > 0 && to_free_highmem > 0) { + unsigned long pfn = memory_bm_next_pfn(©_bm); + struct page *page = pfn_to_page(pfn); + + if (PageHighMem(page)) { + if (!to_free_highmem) + continue; + to_free_highmem--; + alloc_highmem--; + } else { + if (!to_free_normal) + continue; + to_free_normal--; + alloc_normal--; + } + memory_bm_clear_bit(©_bm, pfn); + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); + } +} + +/** + * minimum_image_size - Estimate the minimum acceptable size of an image + * @saveable: Number of saveable pages in the system. + * + * We want to avoid attempting to free too much memory too hard, so estimate the + * minimum acceptable size of a hibernation image to use as the lower limit for + * preallocating memory. + * + * We assume that the minimum image size should be proportional to + * + * [number of saveable pages] - [number of pages that can be freed in theory] + * + * where the second term is the sum of (1) reclaimable slab pages, (2) active + * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, + * minus mapped file pages. + */ +static unsigned long minimum_image_size(unsigned long saveable) +{ + unsigned long size; + + size = global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE) + - global_page_state(NR_FILE_MAPPED); + + return saveable <= size ? 0 : saveable - size; +} + +/** + * hibernate_preallocate_memory - Preallocate memory for hibernation image + * + * To create a hibernation image it is necessary to make a copy of every page + * frame in use. We also need a number of page frames to be free during + * hibernation for allocations made while saving the image and for device + * drivers, in case they need to allocate memory from their hibernation + * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, + * respectively, both of which are rough estimates). To make this happen, we + * compute the total number of available page frames and allocate at least + * + * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES + * + * of them, which corresponds to the maximum size of a hibernation image. + * + * If image_size is set below the number following from the above formula, + * the preallocation of memory is continued until the total number of saveable + * pages in the system is below the requested image size or the minimum + * acceptable image size returned by minimum_image_size(), whichever is greater. + */ +int hibernate_preallocate_memory(void) { - long tmp; struct zone *zone; - unsigned long pages = 0; - unsigned int i = 0; - char *p = "-\\|/"; + unsigned long saveable, size, max_size, count, highmem, pages = 0; + unsigned long alloc, save_highmem, pages_highmem; struct timeval start, stop; + int error; - printk(KERN_INFO "PM: Shrinking memory... "); + printk(KERN_INFO "PM: Preallocating image memory... "); do_gettimeofday(&start); - do { - long size, highmem_size; - - highmem_size = count_highmem_pages(); - size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; - tmp = size; - size += highmem_size; - for_each_populated_zone(zone) { - tmp += snapshot_additional_pages(zone); - if (is_highmem(zone)) { - highmem_size -= - zone_page_state(zone, NR_FREE_PAGES); - } else { - tmp -= zone_page_state(zone, NR_FREE_PAGES); - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - } - } - if (highmem_size < 0) - highmem_size = 0; + error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); + if (error) + goto err_out; - tmp += highmem_size; - if (tmp > 0) { - tmp = __shrink_memory(tmp); - if (!tmp) - return -ENOMEM; - pages += tmp; - } else if (size > image_size / PAGE_SIZE) { - tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); - pages += tmp; - } - printk("\b%c", p[i++%4]); - } while (tmp > 0); + error = memory_bm_create(©_bm, GFP_IMAGE, PG_ANY); + if (error) + goto err_out; + + alloc_normal = 0; + alloc_highmem = 0; + + /* Count the number of saveable data pages. */ + save_highmem = count_highmem_pages(); + saveable = count_data_pages(); + + /* + * Compute the total number of page frames we can use (count) and the + * number of pages needed for image metadata (size). + */ + count = saveable; + saveable += save_highmem; + highmem = save_highmem; + size = 0; + for_each_populated_zone(zone) { + size += snapshot_additional_pages(zone); + if (is_highmem(zone)) + highmem += zone_page_state(zone, NR_FREE_PAGES); + else + count += zone_page_state(zone, NR_FREE_PAGES); + } + count += highmem; + count -= totalreserve_pages; + + /* Compute the maximum number of saveable pages to leave in memory. */ + max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; + size = DIV_ROUND_UP(image_size, PAGE_SIZE); + if (size > max_size) + size = max_size; + /* + * If the maximum is not less than the current number of saveable pages + * in memory, allocate page frames for the image and we're done. + */ + if (size >= saveable) { + pages = preallocate_image_highmem(save_highmem); + pages += preallocate_image_memory(saveable - pages); + goto out; + } + + /* Estimate the minimum size of the image. */ + pages = minimum_image_size(saveable); + if (size < pages) + size = min_t(unsigned long, pages, max_size); + + /* + * Let the memory management subsystem know that we're going to need a + * large number of page frames to allocate and make it free some memory. + * NOTE: If this is not done, performance will be hurt badly in some + * test cases. + */ + shrink_all_memory(saveable - size); + + /* + * The number of saveable pages in memory was too high, so apply some + * pressure to decrease it. First, make room for the largest possible + * image and fail if that doesn't work. Next, try to decrease the size + * of the image as much as indicated by 'size' using allocations from + * highmem and non-highmem zones separately. + */ + pages_highmem = preallocate_image_highmem(highmem / 2); + alloc = (count - max_size) - pages_highmem; + pages = preallocate_image_memory(alloc); + if (pages < alloc) + goto err_out; + size = max_size - size; + alloc = size; + size = preallocate_highmem_fraction(size, highmem, count); + pages_highmem += size; + alloc -= size; + pages += preallocate_image_memory(alloc); + pages += pages_highmem; + + /* + * We only need as many page frames for the image as there are saveable + * pages in memory, but we have allocated more. Release the excessive + * ones now. + */ + free_unnecessary_pages(); + + out: do_gettimeofday(&stop); - printk("\bdone (%lu pages freed)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Freed"); + printk(KERN_CONT "done (allocated %lu pages)\n", pages); + swsusp_show_speed(&start, &stop, pages, "Allocated"); return 0; + + err_out: + printk(KERN_CONT "\n"); + swsusp_free(); + return -ENOMEM; } #ifdef CONFIG_HIGHMEM @@ -1142,7 +1373,7 @@ int swsusp_shrink_memory(void) static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { - unsigned int free_highmem = count_free_highmem_pages(); + unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; if (free_highmem >= nr_highmem) nr_highmem = 0; @@ -1164,19 +1395,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; } static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) { struct zone *zone; - unsigned int free = 0, meta = 0; + unsigned int free = alloc_normal; - for_each_zone(zone) { - meta += snapshot_additional_pages(zone); + for_each_populated_zone(zone) if (!is_highmem(zone)) free += zone_page_state(zone, NR_FREE_PAGES); - } nr_pages += count_pages_for_highmem(nr_highmem); - pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", - nr_pages, PAGES_FOR_IO, meta, free); + pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n", + nr_pages, PAGES_FOR_IO, free); - return free > nr_pages + PAGES_FOR_IO + meta; + return free > nr_pages + PAGES_FOR_IO; } #ifdef CONFIG_HIGHMEM @@ -1198,7 +1427,7 @@ static inline int get_highmem_buffer(int safe_needed) */ static inline unsigned int -alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) { unsigned int to_alloc = count_free_highmem_pages(); @@ -1218,7 +1447,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) static inline int get_highmem_buffer(int safe_needed) { return 0; } static inline unsigned int -alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } #endif /* CONFIG_HIGHMEM */ /** @@ -1237,51 +1466,36 @@ static int swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) { - int error; - - error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (error) - goto Free; - - error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (error) - goto Free; + int error = 0; if (nr_highmem > 0) { error = get_highmem_buffer(PG_ANY); if (error) - goto Free; - - nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); + goto err_out; + if (nr_highmem > alloc_highmem) { + nr_highmem -= alloc_highmem; + nr_pages += alloc_highmem_pages(copy_bm, nr_highmem); + } } - while (nr_pages-- > 0) { - struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); - - if (!page) - goto Free; + if (nr_pages > alloc_normal) { + nr_pages -= alloc_normal; + while (nr_pages-- > 0) { + struct page *page; - memory_bm_set_bit(copy_bm, page_to_pfn(page)); + page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + if (!page) + goto err_out; + memory_bm_set_bit(copy_bm, page_to_pfn(page)); + } } + return 0; - Free: + err_out: swsusp_free(); - return -ENOMEM; + return error; } -/* Memory bitmap used for marking saveable pages (during suspend) or the - * suspend image pages (during resume) - */ -static struct memory_bitmap orig_bm; -/* Memory bitmap used on suspend for marking allocated pages that will contain - * the copies of saveable pages. During resume it is initially used for - * marking the suspend image pages, but then its set bits are duplicated in - * @orig_bm and it is released. Next, on systems with high memory, it may be - * used for marking "safe" highmem pages, but it has to be reinitialized for - * this purpose. - */ -static struct memory_bitmap copy_bm; - asmlinkage int swsusp_save(void) { unsigned int nr_pages, nr_highmem; @@ -1474,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) unsigned long pfn, max_zone_pfn; /* Clear page flags */ - for_each_zone(zone) { + for_each_populated_zone(zone) { max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8ba052c..b101cdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -13,7 +13,6 @@ #include <linux/module.h> #include <linux/file.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/genhd.h> diff --git a/kernel/printk.c b/kernel/printk.c index e10d193..f38b07f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -206,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup); #ifdef CONFIG_BOOT_PRINTK_DELAY static unsigned int boot_delay; /* msecs delay after each printk during bootup */ -static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */ +static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; - unsigned long long loops_per_msec; lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; @@ -220,10 +219,9 @@ static int __init boot_delay_setup(char *str) if (boot_delay > 10 * 1000) boot_delay = 0; - printk_delay_msec = loops_per_msec; - printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, " - "HZ: %d, printk_delay_msec: %llu\n", - boot_delay, preset_lpj, lpj, HZ, printk_delay_msec); + pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " + "HZ: %d, loops_per_msec: %llu\n", + boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 1; } __setup("boot_delay=", boot_delay_setup); @@ -236,7 +234,7 @@ static void boot_delay_msec(void) if (boot_delay == 0 || system_state != SYSTEM_BOOTING) return; - k = (unsigned long long)printk_delay_msec * boot_delay; + k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { @@ -655,6 +653,20 @@ static int recursion_bug; static int new_text_line = 1; static char printk_buf[1024]; +int printk_delay_msec __read_mostly; + +static inline void printk_delay(void) +{ + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; + + while (m--) { + mdelay(1); + touch_nmi_watchdog(); + } + } +} + asmlinkage int vprintk(const char *fmt, va_list args) { int printed_len = 0; @@ -664,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) char *p; boot_delay_msec(); + printk_delay(); preempt_disable(); /* This stops the holder of console_sem just where we want him */ @@ -1075,12 +1088,6 @@ void __sched console_conditional_schedule(void) } EXPORT_SYMBOL(console_conditional_schedule); -void console_print(const char *s) -{ - printk(KERN_EMERG "%s", s); -} -EXPORT_SYMBOL(console_print); - void console_unblank(void) { struct console *c; diff --git a/kernel/profile.c b/kernel/profile.c index 419250e..a55d3a3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -442,48 +442,51 @@ void profile_tick(int type) #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <asm/uaccess.h> -static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) { - int len = cpumask_scnprintf(page, count, data); - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; + seq_cpumask(m, prof_cpu_mask); + seq_putc(m, '\n'); + return 0; } -static int prof_cpu_mask_write_proc(struct file *file, - const char __user *buffer, unsigned long count, void *data) +static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, prof_cpu_mask_proc_show, NULL); +} + +static ssize_t prof_cpu_mask_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) { - struct cpumask *mask = data; - unsigned long full_count = count, err; cpumask_var_t new_value; + int err; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); if (!err) { - cpumask_copy(mask, new_value); - err = full_count; + cpumask_copy(prof_cpu_mask, new_value); + err = count; } free_cpumask_var(new_value); return err; } +static const struct file_operations prof_cpu_mask_proc_fops = { + .open = prof_cpu_mask_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = prof_cpu_mask_proc_write, +}; + void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) { - struct proc_dir_entry *entry; - /* create /proc/irq/prof_cpu_mask */ - entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); - if (!entry) - return; - entry->data = prof_cpu_mask; - entry->read_proc = prof_cpu_mask_read_proc; - entry->write_proc = prof_cpu_mask_write_proc; + proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 307c285..23bd09c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh) * or self-reaping. Do notification now if it would have happened earlier. * If it should reap itself, return true. * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. + * If it's our own child, there is no notification to do. But if our normal + * children self-reap, then this child was prevented by ptrace and we must + * reap it now, in that case we must also wake up sub-threads sleeping in + * do_wait(). */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { @@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); p->exit_signal = -1; + } } if (task_detached(p)) { /* Mark it as in the process of being reaped. */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index bd5d5c8..37ac454 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -19,7 +19,7 @@ * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * + * * Based on the original work by Paul McKenney <paulmck@us.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: @@ -27,7 +27,7 @@ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - - * http://lse.sourceforge.net/locking/rcupdate.html + * http://lse.sourceforge.net/locking/rcupdate.html * */ #include <linux/types.h> @@ -74,6 +74,8 @@ void wakeme_after_rcu(struct rcu_head *head) complete(&rcu->completion); } +#ifdef CONFIG_TREE_PREEMPT_RCU + /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -87,7 +89,7 @@ void synchronize_rcu(void) { struct rcu_synchronize rcu; - if (rcu_blocking_is_gp()) + if (!rcu_scheduler_active) return; init_completion(&rcu.completion); @@ -98,6 +100,46 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +/** + * synchronize_sched - wait until an rcu-sched grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-sched + * grace period has elapsed, in other words after all currently executing + * rcu-sched read-side critical sections have completed. These read-side + * critical sections are delimited by rcu_read_lock_sched() and + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), + * local_irq_disable(), and so on may be used in place of + * rcu_read_lock_sched(). + * + * This means that all preempt_disable code sequences, including NMI and + * hardware-interrupt handlers, in progress on entry will have completed + * before this primitive returns. However, this does not guarantee that + * softirq handlers will have completed, since in some kernels, these + * handlers can run in process context, and can block. + * + * This primitive provides the guarantees made by the (now removed) + * synchronize_kernel() API. In contrast, synchronize_rcu() only + * guarantees that rcu_read_lock() sections will have completed. + * In "classic RCU", these two guarantees happen to be one and + * the same, but can differ in realtime RCU implementations. + */ +void synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + /** * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. * diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b33db53..233768f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -18,7 +18,7 @@ * Copyright (C) IBM Corporation, 2005, 2006 * * Authors: Paul E. McKenney <paulmck@us.ibm.com> - * Josh Triplett <josh@freedesktop.org> + * Josh Triplett <josh@freedesktop.org> * * See also: Documentation/RCU/torture.txt */ @@ -50,7 +50,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " - "Josh Triplett <josh@freedesktop.org>"); + "Josh Triplett <josh@freedesktop.org>"); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ @@ -110,8 +110,8 @@ struct rcu_torture { }; static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current = NULL; -static long rcu_torture_current_version = 0; +static struct rcu_torture *rcu_torture_current; +static long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = @@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; -static long n_rcu_torture_timers = 0; +static long n_rcu_torture_timers; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; -static int stutter_pause_test = 0; +static int stutter_pause_test; #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -267,7 +267,8 @@ struct rcu_torture_ops { int irq_capable; char *name; }; -static struct rcu_torture_ops *cur_ops = NULL; + +static struct rcu_torture_ops *cur_ops; /* * Definitions for rcu torture testing. @@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU) static void rcu_read_delay(struct rcu_random_state *rrsp) { - long delay; - const long longdelay = 200; + const unsigned long shortdelay_us = 200; + const unsigned long longdelay_ms = 50; - /* We want there to be long-running readers, but not all the time. */ + /* We want a short delay sometimes to make a reader delay the grace + * period, and we want a long delay occasionally to trigger + * force_quiescent_state. */ - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); - if (!delay) - udelay(longdelay); + if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + udelay(shortdelay_us); } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -339,8 +343,8 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .cb_barrier = rcu_barrier, .stats = NULL, - .irq_capable = 1, - .name = "rcu" + .irq_capable = 1, + .name = "rcu" }; static void rcu_sync_torture_deferred_free(struct rcu_torture *p) @@ -638,7 +642,8 @@ rcu_torture_writer(void *arg) do { schedule_timeout_uninterruptible(1); - if ((rp = rcu_torture_alloc()) == NULL) + rp = rcu_torture_alloc(); + if (rp == NULL) continue; rp->rtort_pipe_count = 0; udelay(rcu_random(&rand) & 0x3ff); @@ -1110,7 +1115,7 @@ rcu_torture_init(void) printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", torture_type); mutex_unlock(&fullstop_mutex); - return (-EINVAL); + return -EINVAL; } if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ @@ -1161,7 +1166,7 @@ rcu_torture_init(void) goto unwind; } fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); + GFP_KERNEL); if (fakewriter_tasks == NULL) { VERBOSE_PRINTK_ERRSTRING("out of memory"); firsterr = -ENOMEM; @@ -1170,7 +1175,7 @@ rcu_torture_init(void) for (i = 0; i < nfakewriters; i++) { VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); + "rcu_torture_fakewriter"); if (IS_ERR(fakewriter_tasks[i])) { firsterr = PTR_ERR(fakewriter_tasks[i]); VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b11b07..52b06f6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -25,7 +25,7 @@ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU + * Documentation/RCU */ #include <linux/types.h> #include <linux/kernel.h> @@ -107,27 +107,23 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, */ void rcu_sched_qs(int cpu) { - unsigned long flags; struct rcu_data *rdp; - local_irq_save(flags); rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; - rcu_preempt_qs(cpu); - local_irq_restore(flags); + barrier(); + rdp->passed_quiesc = 1; + rcu_preempt_note_context_switch(cpu); } void rcu_bh_qs(int cpu) { - unsigned long flags; struct rcu_data *rdp; - local_irq_save(flags); rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; - local_irq_restore(flags); + barrier(); + rdp->passed_quiesc = 1; } #ifdef CONFIG_NO_HZ @@ -605,8 +601,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) { struct rcu_data *rdp = rsp->rda[smp_processor_id()]; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_node *rnp_cur; - struct rcu_node *rnp_end; if (!cpu_needs_another_gp(rsp, rdp)) { spin_unlock_irqrestore(&rnp->lock, flags); @@ -615,6 +609,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Advance to a new grace period and initialize state. */ rsp->gpnum++; + WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); @@ -631,7 +626,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { + rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -644,42 +641,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) spin_lock(&rsp->onofflock); /* irqs already disabled. */ /* - * Set the quiescent-state-needed bits in all the non-leaf RCU - * nodes for all currently online CPUs. This operation relies - * on the layout of the hierarchy within the rsp->node[] array. - * Note that other CPUs will access only the leaves of the - * hierarchy, which still indicate that no grace period is in - * progress. In addition, we have excluded CPU-hotplug operations. - * - * We therefore do not need to hold any locks. Any required - * memory barriers will be supplied by the locks guarding the - * leaf rcu_nodes in the hierarchy. - */ - - rnp_end = rsp->level[NUM_RCU_LVLS - 1]; - for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++) - rnp_cur->qsmask = rnp_cur->qsmaskinit; - - /* - * Now set up the leaf nodes. Here we must be careful. First, - * we need to hold the lock in order to exclude other CPUs, which - * might be contending for the leaf nodes' locks. Second, as - * soon as we initialize a given leaf node, its CPUs might run - * up the rest of the hierarchy. We must therefore acquire locks - * for each node that we touch during this stage. (But we still - * are excluding CPU-hotplug operations.) + * Set the quiescent-state-needed bits in all the rcu_node + * structures for all currently online CPUs in breadth-first + * order, starting from the root rcu_node structure. This + * operation relies on the layout of the hierarchy within the + * rsp->node[] array. Note that other CPUs will access only + * the leaves of the hierarchy, which still indicate that no + * grace period is in progress, at least until the corresponding + * leaf node has been initialized. In addition, we have excluded + * CPU-hotplug operations. * * Note that the grace period cannot complete until we finish * the initialization process, as there will be at least one * qsmask bit set in the root node until that time, namely the - * one corresponding to this CPU. + * one corresponding to this CPU, due to the fact that we have + * irqs disabled. */ - rnp_end = &rsp->node[NUM_RCU_NODES]; - rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - for (; rnp_cur < rnp_end; rnp_cur++) { - spin_lock(&rnp_cur->lock); /* irqs already disabled. */ - rnp_cur->qsmask = rnp_cur->qsmaskinit; - spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ + for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; + spin_unlock(&rnp->lock); /* irqs already disabled. */ } rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ @@ -722,6 +705,7 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) __releases(rnp->lock) { + WARN_ON_ONCE(rsp->completed == rsp->gpnum); rsp->completed = rsp->gpnum; rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ @@ -739,6 +723,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { + struct rcu_node *rnp_c; + /* Walk up the rcu_node hierarchy. */ for (;;) { if (!(rnp->qsmask & mask)) { @@ -762,8 +748,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, break; } spin_unlock_irqrestore(&rnp->lock, flags); + rnp_c = rnp; rnp = rnp->parent; spin_lock_irqsave(&rnp->lock, flags); + WARN_ON_ONCE(rnp_c->qsmask); } /* @@ -776,10 +764,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, /* * Record a quiescent state for the specified CPU, which must either be - * the current CPU or an offline CPU. The lastcomp argument is used to - * make sure we are still in the grace period of interest. We don't want - * to end the current grace period based on quiescent states detected in - * an earlier grace period! + * the current CPU. The lastcomp argument is used to make sure we are + * still in the grace period of interest. We don't want to end the current + * grace period based on quiescent states detected in an earlier grace + * period! */ static void cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) @@ -814,7 +802,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rdp = rsp->rda[smp_processor_id()]; rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ @@ -872,7 +859,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_lock_irqsave(&rsp->onofflock, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - rnp = rdp->mynode; + rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { spin_lock(&rnp->lock); /* irqs already disabled. */ @@ -881,7 +868,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp); + rcu_preempt_offline_tasks(rsp, rnp, rdp); mask = rnp->grpmask; spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; @@ -890,9 +877,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - /* Being offline is a quiescent state, so go record it. */ - cpu_quiet(cpu, rsp, rdp, lastcomp); - /* * Move callbacks from the outgoing CPU to the running CPU. * Note that the outgoing CPU is now quiscent, so it is now @@ -1457,20 +1441,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - - /* - * A new grace period might start here. If so, we will be part of - * it, and its gpnum will be greater than ours, so we will - * participate. It is also possible for the gpnum to have been - * incremented before this function was called, and the bitmasks - * to not be filled out until now, in which case we will also - * participate due to our gpnum being behind. - */ - - /* Since it is coming online, the CPU is in a quiescent state. */ - cpu_quiet(cpu, rsp, rdp, lastcomp); - local_irq_restore(flags); + spin_unlock_irqrestore(&rsp->onofflock, flags); } static void __cpuinit rcu_online_cpu(int cpu) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index bf8a6f9..8e8287a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -142,7 +142,7 @@ struct rcu_data { */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; - long qlen; /* # of queued callbacks */ + long qlen; /* # of queued callbacks */ long blimit; /* Upper limit on a processed batch */ #ifdef CONFIG_NO_HZ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4778936..1cee04f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -64,22 +64,31 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. */ -static void rcu_preempt_qs_record(int cpu) +static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; + barrier(); + rdp->passed_quiesc = 1; } /* - * We have entered the scheduler or are between softirqs in ksoftirqd. - * If we are in an RCU read-side critical section, we need to reflect - * that in the state of the rcu_node structure corresponding to this CPU. - * Caller must disable hardirqs. + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the appropriate entry + * of the blocked_tasks[] array. The task will dequeue itself when + * it exits the outermost enclosing RCU read-side critical section. + * Therefore, the current grace period cannot be permitted to complete + * until the blocked_tasks[] entry indexed by the low-order bit of + * rnp->gpnum empties. + * + * Caller must disable preemption. */ -static void rcu_preempt_qs(int cpu) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; + unsigned long flags; int phase; struct rcu_data *rdp; struct rcu_node *rnp; @@ -90,7 +99,7 @@ static void rcu_preempt_qs(int cpu) /* Possibly blocking in an RCU read-side critical section. */ rdp = rcu_preempt_state.rda[cpu]; rnp = rdp->mynode; - spin_lock(&rnp->lock); + spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; t->rcu_blocked_node = rnp; @@ -103,11 +112,15 @@ static void rcu_preempt_qs(int cpu) * state for the current grace period), then as long * as that task remains queued, the current grace period * cannot end. + * + * But first, note that the current CPU must still be + * on line! */ - phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1); + WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); + WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); + phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); - smp_mb(); /* Ensure later ctxt swtch seen after above. */ - spin_unlock(&rnp->lock); + spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -119,9 +132,10 @@ static void rcu_preempt_qs(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. */ - rcu_preempt_qs_record(cpu); - t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS | - RCU_READ_UNLOCK_GOT_QS); + rcu_preempt_qs(cpu); + local_irq_save(flags); + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + local_irq_restore(flags); } /* @@ -157,7 +171,7 @@ static void rcu_read_unlock_special(struct task_struct *t) special = t->rcu_read_unlock_special; if (special & RCU_READ_UNLOCK_NEED_QS) { t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS; + rcu_preempt_qs(smp_processor_id()); } /* Hardware IRQ handlers cannot block. */ @@ -177,10 +191,10 @@ static void rcu_read_unlock_special(struct task_struct *t) */ for (;;) { rnp = t->rcu_blocked_node; - spin_lock(&rnp->lock); + spin_lock(&rnp->lock); /* irqs already disabled. */ if (rnp == t->rcu_blocked_node) break; - spin_unlock(&rnp->lock); + spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); list_del_init(&t->rcu_node_entry); @@ -194,9 +208,8 @@ static void rcu_read_unlock_special(struct task_struct *t) */ if (!empty && rnp->qsmask == 0 && list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { - t->rcu_read_unlock_special &= - ~(RCU_READ_UNLOCK_NEED_QS | - RCU_READ_UNLOCK_GOT_QS); + struct rcu_node *rnp_p; + if (rnp->parent == NULL) { /* Only one rcu_node in the tree. */ cpu_quiet_msk_finish(&rcu_preempt_state, flags); @@ -205,9 +218,10 @@ static void rcu_read_unlock_special(struct task_struct *t) /* Report up the rest of the hierarchy. */ mask = rnp->grpmask; spin_unlock_irqrestore(&rnp->lock, flags); - rnp = rnp->parent; - spin_lock_irqsave(&rnp->lock, flags); - cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags); + rnp_p = rnp->parent; + spin_lock_irqsave(&rnp_p->lock, flags); + WARN_ON_ONCE(rnp->qsmask); + cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); return; } spin_unlock(&rnp->lock); @@ -259,6 +273,19 @@ static void rcu_print_task_stall(struct rcu_node *rnp) #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* + * Check that the list of blocked tasks for the newly completed grace + * period is in fact empty. It is a serious bug to complete a grace + * period that still has RCU readers blocked! This function must be + * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock + * must be held by the caller. + */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +{ + WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])); + WARN_ON_ONCE(rnp->qsmask); +} + +/* * Check for preempted RCU readers for the specified rcu_node structure. * If the caller needs a reliable answer, it must hold the rcu_node's * >lock. @@ -280,7 +307,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp) * The caller must hold rnp->lock with irqs disabled. */ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp) + struct rcu_node *rnp, + struct rcu_data *rdp) { int i; struct list_head *lp; @@ -292,6 +320,9 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, WARN_ONCE(1, "Last CPU thought to be offlined?"); return; /* Shouldn't happen: at least one CPU online. */ } + WARN_ON_ONCE(rnp != rdp->mynode && + (!list_empty(&rnp->blocked_tasks[0]) || + !list_empty(&rnp->blocked_tasks[1]))); /* * Move tasks up to root rcu_node. Rely on the fact that the @@ -335,20 +366,12 @@ static void rcu_preempt_check_callbacks(int cpu) struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { - t->rcu_read_unlock_special &= - ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS); - rcu_preempt_qs_record(cpu); + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + rcu_preempt_qs(cpu); return; } - if (per_cpu(rcu_preempt_data, cpu).qs_pending) { - if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) { - rcu_preempt_qs_record(cpu); - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS; - } else if (!(t->rcu_read_unlock_special & - RCU_READ_UNLOCK_NEED_QS)) { - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; - } - } + if (per_cpu(rcu_preempt_data, cpu).qs_pending) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } /* @@ -434,7 +457,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * Because preemptable RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_qs(int cpu) +static void rcu_preempt_note_context_switch(int cpu) { } @@ -451,6 +474,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* + * Because there is no preemptable RCU, there can be no readers blocked, + * so there is no need to check for blocked tasks. So check only for + * bogus qsmask values. + */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +{ + WARN_ON_ONCE(rnp->qsmask); +} + +/* * Because preemptable RCU does not exist, there are never any preempted * RCU readers. */ @@ -466,7 +499,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp) * tasks that were blocked within RCU read-side critical sections. */ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp) + struct rcu_node *rnp, + struct rcu_data *rdp) { } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0ea1bff..c89f5e9 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -20,7 +20,7 @@ * Papers: http://www.rdrop.com/users/paulmck/RCU * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU + * Documentation/RCU * */ #include <linux/types.h> diff --git a/kernel/res_counter.c b/kernel/res_counter.c index e1338f0..88faec2 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = RESOURCE_MAX; + counter->soft_limit = RESOURCE_MAX; counter->parent = parent; } @@ -36,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) } int res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) + struct res_counter **limit_fail_at, + struct res_counter **soft_limit_fail_at) { int ret; unsigned long flags; struct res_counter *c, *u; *limit_fail_at = NULL; + if (soft_limit_fail_at) + *soft_limit_fail_at = NULL; local_irq_save(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); ret = res_counter_charge_locked(c, val); + /* + * With soft limits, we return the highest ancestor + * that exceeds its soft limit + */ + if (soft_limit_fail_at && + !res_counter_soft_limit_check_locked(c)) + *soft_limit_fail_at = c; spin_unlock(&c->lock); if (ret < 0) { *limit_fail_at = c; @@ -74,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) counter->usage -= val; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +void res_counter_uncharge(struct res_counter *counter, unsigned long val, + bool *was_soft_limit_excess) { unsigned long flags; struct res_counter *c; @@ -82,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) local_irq_save(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); + if (was_soft_limit_excess) + *was_soft_limit_excess = + !res_counter_soft_limit_check_locked(c); res_counter_uncharge_locked(c, val); spin_unlock(&c->lock); } @@ -101,6 +116,8 @@ res_counter_member(struct res_counter *counter, int member) return &counter->limit; case RES_FAILCNT: return &counter->failcnt; + case RES_SOFT_LIMIT: + return &counter->soft_limit; }; BUG(); diff --git a/kernel/resource.c b/kernel/resource.c index 78b0872..fb11a58 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -223,13 +223,13 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* * Finds the lowest memory reosurce exists within [res->start.res->end) - * the caller must specify res->start, res->end, res->flags. + * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. */ -static int find_next_system_ram(struct resource *res) +static int find_next_system_ram(struct resource *res, char *name) { resource_size_t start, end; struct resource *p; @@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res) /* system ram is just marked as IORESOURCE_MEM */ if (p->flags != res->flags) continue; + if (name && strcmp(p->name, name)) + continue; if (p->start > end) { p = NULL; break; @@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res) res->end = p->end; return 0; } -int -walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, - int (*func)(unsigned long, unsigned long, void *)) + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". + */ +int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, + void *arg, int (*func)(unsigned long, unsigned long, void *)) { struct resource res; unsigned long pfn, len; u64 orig_end; int ret = -1; + res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; - while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { + while ((res.start < res.end) && + (find_next_system_ram(&res, "System RAM") >= 0)) { pfn = (unsigned long)(res.start >> PAGE_SHIFT); len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); ret = (*func)(pfn, len, arg); diff --git a/kernel/sched.c b/kernel/sched.c index e27a536..ee61f45 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,7 +39,7 @@ #include <linux/completion.h> #include <linux/kernel_stat.h> #include <linux/debug_locks.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/profile.h> @@ -119,8 +119,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -295,12 +293,12 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group @@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return 1; -} -#endif - static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline struct task_group *task_group(struct task_struct *p) { @@ -514,14 +505,6 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Preferred wake up cpu nominated by sched_mc balance that will be - * used when most cpus are idle in the system indicating overall very - * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) - */ - unsigned int sched_mc_preferred_wakeup_cpu; -#endif }; /* @@ -646,9 +629,10 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +static inline +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - rq->curr->sched_class->check_preempt_curr(rq, p, sync); + rq->curr->sched_class->check_preempt_curr(rq, p, flags); } static inline int cpu_of(struct rq *rq) @@ -697,15 +681,9 @@ inline void update_rq_clock(struct rq *rq) * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. */ -int runqueue_is_locked(void) +int runqueue_is_locked(int cpu) { - int cpu = get_cpu(); - struct rq *rq = cpu_rq(cpu); - int ret; - - ret = spin_is_locked(&rq->lock); - put_cpu(); - return ret; + return spin_is_locked(&cpu_rq(cpu)->lock); } /* @@ -1509,8 +1487,65 @@ static int tg_nop(struct task_group *tg, void *data) #endif #ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + +static struct sched_group *group_of(int cpu) +{ + struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + + if (!sd) + return NULL; + + return sd->groups; +} + +static unsigned long power_of(int cpu) +{ + struct sched_group *group = group_of(cpu); + + if (!group) + return SCHED_LOAD_SCALE; + + return group->cpu_power; +} + static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) @@ -1695,6 +1730,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #ifdef CONFIG_PREEMPT +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + /* * fair double_lock_balance: Safely acquires both rq->locks in a fair * way at the expense of forcing extra atomic operations in all @@ -1959,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SMP - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - /* * Is this task likely cache-hot: */ @@ -2023,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - @@ -2239,185 +2269,6 @@ void kick_process(struct task_struct *p) preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) -{ - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; - - do { - unsigned long load, avg_load; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), - &p->cpus_allowed)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - - for_each_cpu(i, sched_group_cpus(group)) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - avg_load += load; - } - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - this = group; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -/* - * find_idlest_cpu - find the idlest cpu among the cpus in group. - */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) -{ - unsigned long load, min_load = ULONG_MAX; - int idlest = -1; - int i; - - /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; - } - } - - return idlest; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. - */ -static int sched_balance_self(int cpu, int flag) -{ - struct task_struct *t = current; - struct sched_domain *tmp, *sd = NULL; - - for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - break; - if (tmp->flags & flag) - sd = tmp; - } - - if (sd) - update_shares(sd); - - while (sd) { - struct sched_group *group; - int new_cpu, weight; - - if (!(sd->flags & flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, t, cpu); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) - break; - if (tmp->flags & flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - return cpu; -} - #endif /* CONFIG_SMP */ /** @@ -2455,37 +2306,22 @@ void task_oncpu_function_call(struct task_struct *p, * * returns failure only if the task is already active. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int try_to_wake_up(struct task_struct *p, unsigned int state, + int wake_flags) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - long old_state; struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) - sync = 0; - -#ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { - struct sched_domain *sd; + wake_flags &= ~WF_SYNC; - this_cpu = raw_smp_processor_id(); - cpu = task_cpu(p); - - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - update_shares(sd); - break; - } - } - } -#endif + this_cpu = get_cpu(); smp_wmb(); rq = task_rq_lock(p, &flags); update_rq_clock(rq); - old_state = p->state; - if (!(old_state & state)) + if (!(p->state & state)) goto out; if (p->se.on_rq) @@ -2493,27 +2329,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); orig_cpu = cpu; - this_cpu = smp_processor_id(); #ifdef CONFIG_SMP if (unlikely(task_running(rq, p))) goto out_activate; - cpu = p->sched_class->select_task_rq(p, sync); - if (cpu != orig_cpu) { + /* + * In order to handle concurrent wakeups and release the rq->lock + * we put the task in TASK_WAKING state. + * + * First fix up the nr_uninterruptible count: + */ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + p->state = TASK_WAKING; + task_rq_unlock(rq, &flags); + + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + if (cpu != orig_cpu) set_task_cpu(p, cpu); - task_rq_unlock(rq, &flags); - /* might preempt at this point */ - rq = task_rq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out; - if (p->se.on_rq) - goto out_running; - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - } + rq = task_rq_lock(p, &flags); + WARN_ON(p->state != TASK_WAKING); + cpu = task_cpu(p); #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2533,7 +2371,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); - if (sync) + if (wake_flags & WF_SYNC) schedstat_inc(p, se.nr_wakeups_sync); if (orig_cpu != cpu) schedstat_inc(p, se.nr_wakeups_migrate); @@ -2562,7 +2400,7 @@ out_activate: out_running: trace_sched_wakeup(rq, p, success); - check_preempt_curr(rq, p, sync); + check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2571,6 +2409,7 @@ out_running: #endif out: task_rq_unlock(rq, &flags); + put_cpu(); return success; } @@ -2613,6 +2452,7 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; + p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -2674,11 +2514,6 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - set_task_cpu(p, cpu); - /* * Make sure we do not leak PI boosting priority to the child. */ @@ -2709,6 +2544,11 @@ void sched_fork(struct task_struct *p, int clone_flags) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; +#ifdef CONFIG_SMP + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); +#endif + set_task_cpu(p, cpu); + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -2754,7 +2594,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) inc_nr_running(rq); } trace_sched_wakeup_new(rq, p, 1); - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2878,7 +2718,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_counter_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -3064,6 +2904,19 @@ unsigned long nr_iowait(void) return sum; } +unsigned long nr_iowait_cpu(void) +{ + struct rq *this = this_rq(); + return atomic_read(&this->nr_iowait); +} + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; @@ -3263,7 +3116,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); + new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); @@ -3683,11 +3536,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, *imbalance = sds->min_load_per_task; sds->busiest = sds->group_min; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds->group_leader); - } - return 1; } @@ -3711,7 +3559,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_LOAD_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return default_scale_freq_power(sd, cpu); +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = cpumask_weight(sched_domain_span(sd)); unsigned long smt_gain = sd->smt_gain; @@ -3721,6 +3580,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) return smt_gain; } +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + return default_scale_smt_power(sd, cpu); +} + unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -3745,10 +3609,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - /* here we could scale based on cpufreq */ + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= arch_scale_smt_power(sd, cpu); + if (sched_feat(ARCH_POWER)) + power *= arch_scale_smt_power(sd, cpu); + else + power *= default_scale_smt_power(sd, cpu); + power >>= SCHED_LOAD_SHIFT; } @@ -4161,26 +4034,6 @@ ret: return NULL; } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - -static unsigned long power_of(int cpu) -{ - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; -} - /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ @@ -5239,17 +5092,16 @@ void account_idle_time(cputime_t cputime) */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy = jiffies_to_cputime(1); - cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); if (user_tick) - account_user_time(p, one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, one_jiffy_scaled); else - account_idle_time(one_jiffy); + account_idle_time(cputime_one_jiffy); } /* @@ -5353,7 +5205,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); - perf_counter_task_tick(curr, cpu); + perf_event_task_tick(curr, cpu); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5465,14 +5317,13 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *prev) +static void put_prev_task(struct rq *rq, struct task_struct *p) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; + u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_running, runtime); + if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5482,9 +5333,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) * correlates to the amount of cache footprint a task can * build up. */ - update_avg(&prev->se.avg_overlap, runtime); + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_overlap, runtime); + } else { + update_avg(&p->se.avg_running, 0); } - prev->sched_class->put_prev_task(rq, prev); + p->sched_class->put_prev_task(rq, p); } /* @@ -5567,7 +5421,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -5716,10 +5570,10 @@ asmlinkage void __sched preempt_schedule_irq(void) #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -5733,14 +5587,14 @@ EXPORT_SYMBOL(default_wake_function); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) + int nr_exclusive, int wake_flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; - if (curr->func(curr, mode, sync, key) && + if (curr->func(curr, mode, wake_flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } @@ -5801,16 +5655,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; - int sync = 1; + int wake_flags = WF_SYNC; if (unlikely(!q)) return; if (unlikely(!nr_exclusive)) - sync = 0; + wake_flags = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, key); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); @@ -6977,23 +6831,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - /* - * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER - * tasks that are on an otherwise idle runqueue: - */ - time_slice = 0; - if (p->policy == SCHED_RR) { - time_slice = DEF_TIMESLICE; - } else if (p->policy != SCHED_FIFO) { - struct sched_entity *se = &p->se; - unsigned long flags; - struct rq *rq; + time_slice = p->sched_class->get_rr_interval(p); - rq = task_rq_lock(p, &flags); - if (rq->cfs.load.weight) - time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); - } read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; @@ -7844,7 +7683,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* * Register at high priority so that task migration (migrate_all_tasks) * happens before everything else. This has to be lower priority than - * the notifier in the perf_counter subsystem, though. + * the notifier in the perf_event subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -8000,9 +7839,7 @@ static int sd_degenerate(struct sched_domain *sd) } /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) + if (sd->flags & (SD_WAKE_AFFINE)) return 0; return 1; @@ -8019,10 +7856,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; - /* Does parent contain flags not in child? */ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { pflags &= ~(SD_LOAD_BALANCE | @@ -8708,10 +8541,10 @@ static void set_domain_attribute(struct sched_domain *sd, request = attr->relax_domain_level; if (request < sd->level) { /* turn off idle balance on this domain */ - sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } else { /* turn on idle balance on this domain */ - sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } } @@ -9329,6 +9162,7 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -9360,7 +9194,6 @@ void __init sched_init_smp(void) sched_init_granularity(); free_cpumask_var(non_isolated_cpus); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); init_sched_rt_class(); } #else @@ -9707,7 +9540,7 @@ void __init sched_init(void) alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_counter_init(); + perf_event_init(); scheduler_running = 1; } @@ -10479,7 +10312,7 @@ static int sched_rt_global_constraints(void) #endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -10490,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_constraints(); @@ -10544,8 +10377,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) } static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk) +cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) @@ -10555,15 +10387,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, if (tsk->sched_class != &fair_sched_class) return -EINVAL; #endif + return 0; +} +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk, bool threadgroup) +{ + int retval = cpu_cgroup_can_attach_task(cgrp, tsk); + if (retval) + return retval; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + retval = cpu_cgroup_can_attach_task(cgrp, c); + if (retval) { + rcu_read_unlock(); + return retval; + } + } + rcu_read_unlock(); + } return 0; } static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk) + struct cgroup *old_cont, struct task_struct *tsk, + bool threadgroup) { sched_move_task(tsk); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + sched_move_task(c); + } + rcu_read_unlock(); + } } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e1d16c9..ac2e1dc 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running; __read_mostly int sched_clock_stable; struct sched_clock_data { - /* - * Raw spinlock - this is a special case: this might be called - * from within instrumentation code so we dont want to do any - * instrumentation ourselves. - */ - raw_spinlock_t lock; - u64 tick_raw; u64 tick_gtod; u64 clock; @@ -80,7 +73,6 @@ void sched_clock_init(void) for_each_possible_cpu(cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); - scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; @@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) +static u64 sched_clock_local(struct sched_clock_data *scd) { - s64 delta = now - scd->tick_raw; - u64 clock, min_clock, max_clock; + u64 now, clock, old_clock, min_clock, max_clock; + s64 delta; +again: + now = sched_clock(); + delta = now - scd->tick_raw; if (unlikely(delta < 0)) delta = 0; + old_clock = scd->clock; + /* * scd->clock = clamp(scd->tick_gtod + delta, * max(scd->tick_gtod, scd->clock), @@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) */ clock = scd->tick_gtod + delta; - min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); + min_clock = wrap_max(scd->tick_gtod, old_clock); + max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - scd->clock = clock; + if (cmpxchg(&scd->clock, old_clock, clock) != old_clock) + goto again; - return scd->clock; + return clock; } -static void lock_double_clock(struct sched_clock_data *data1, - struct sched_clock_data *data2) +static u64 sched_clock_remote(struct sched_clock_data *scd) { - if (data1 < data2) { - __raw_spin_lock(&data1->lock); - __raw_spin_lock(&data2->lock); + struct sched_clock_data *my_scd = this_scd(); + u64 this_clock, remote_clock; + u64 *ptr, old_val, val; + + sched_clock_local(my_scd); +again: + this_clock = my_scd->clock; + remote_clock = scd->clock; + + /* + * Use the opportunity that we have both locks + * taken to couple the two clocks: we take the + * larger time as the latest time for both + * runqueues. (this creates monotonic movement) + */ + if (likely((s64)(remote_clock - this_clock) < 0)) { + ptr = &scd->clock; + old_val = remote_clock; + val = this_clock; } else { - __raw_spin_lock(&data2->lock); - __raw_spin_lock(&data1->lock); + /* + * Should be rare, but possible: + */ + ptr = &my_scd->clock; + old_val = this_clock; + val = remote_clock; } + + if (cmpxchg(ptr, old_val, val) != old_val) + goto again; + + return val; } u64 sched_clock_cpu(int cpu) { - u64 now, clock, this_clock, remote_clock; struct sched_clock_data *scd; + u64 clock; + + WARN_ON_ONCE(!irqs_disabled()); if (sched_clock_stable) return sched_clock(); - scd = cpu_sdc(cpu); - - /* - * Normally this is not called in NMI context - but if it is, - * trying to do any locking here is totally lethal. - */ - if (unlikely(in_nmi())) - return scd->clock; - if (unlikely(!sched_clock_running)) return 0ull; - WARN_ON_ONCE(!irqs_disabled()); - now = sched_clock(); - - if (cpu != raw_smp_processor_id()) { - struct sched_clock_data *my_scd = this_scd(); - - lock_double_clock(scd, my_scd); - - this_clock = __update_sched_clock(my_scd, now); - remote_clock = scd->clock; - - /* - * Use the opportunity that we have both locks - * taken to couple the two clocks: we take the - * larger time as the latest time for both - * runqueues. (this creates monotonic movement) - */ - if (likely((s64)(remote_clock - this_clock) < 0)) { - clock = this_clock; - scd->clock = clock; - } else { - /* - * Should be rare, but possible: - */ - clock = remote_clock; - my_scd->clock = remote_clock; - } - - __raw_spin_unlock(&my_scd->lock); - } else { - __raw_spin_lock(&scd->lock); - clock = __update_sched_clock(scd, now); - } + scd = cpu_sdc(cpu); - __raw_spin_unlock(&scd->lock); + if (cpu != smp_processor_id()) + clock = sched_clock_remote(scd); + else + clock = sched_clock_local(scd); return clock; } @@ -223,11 +209,9 @@ void sched_clock_tick(void) now_gtod = ktime_to_ns(ktime_get()); now = sched_clock(); - __raw_spin_lock(&scd->lock); scd->tick_raw = now; scd->tick_gtod = now_gtod; - __update_sched_clock(scd, now); - __raw_spin_unlock(&scd->lock); + sched_clock_local(scd); } /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ddbd08..efb8440 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.sum_exec_runtime); PN(se.avg_overlap); PN(se.avg_wakeup); + PN(se.avg_running); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index aa7f841..4e777b4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) #ifdef CONFIG_SCHED_DEBUG int sched_nr_latency_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } @@ -709,24 +710,28 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (initial && sched_feat(START_DEBIT)) vruntime += sched_vslice(cfs_rq, se); - if (!initial) { - /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) { - unsigned long thresh = sysctl_sched_latency; + /* sleeps up to a single latency don't count. */ + if (!initial && sched_feat(FAIR_SLEEPERS)) { + unsigned long thresh = sysctl_sched_latency; - /* - * Convert the sleeper threshold into virtual time. - * SCHED_IDLE is a special sub-class. We care about - * fairness only relative to other SCHED_IDLE tasks, - * all of which have the same weight. - */ - if (sched_feat(NORMALIZED_SLEEPER) && - (!entity_is_task(se) || - task_of(se)->policy != SCHED_IDLE)) - thresh = calc_delta_fair(thresh, se); + /* + * Convert the sleeper threshold into virtual time. + * SCHED_IDLE is a special sub-class. We care about + * fairness only relative to other SCHED_IDLE tasks, + * all of which have the same weight. + */ + if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || + task_of(se)->policy != SCHED_IDLE)) + thresh = calc_delta_fair(thresh, se); - vruntime -= thresh; - } + /* + * Halve their sleep time's effect, to allow + * for a gentler effect of sleepers: + */ + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; + + vruntime -= thresh; } /* ensure we never gain time by being placed backwards. */ @@ -757,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->last == se) + if (!se || cfs_rq->last == se) cfs_rq->last = NULL; - if (cfs_rq->next == se) + if (!se || cfs_rq->next == se) cfs_rq->next = NULL; } @@ -1062,83 +1067,6 @@ static void yield_task_fair(struct rq *rq) se->vruntime = rightmost->vruntime + 1; } -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available. The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. - * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (rq->rd->online) - * - * Returns the CPU we should wake onto. - */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) - -#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) - -static int wake_idle(int cpu, struct task_struct *p) -{ - struct sched_domain *sd; - int i; - unsigned int chosen_wakeup_cpu; - int this_cpu; - struct rq *task_rq = task_rq(p); - - /* - * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu - * are idle and this is not a kernel thread and this task's affinity - * allows it to be moved to preferred cpu, then just move! - */ - - this_cpu = smp_processor_id(); - chosen_wakeup_cpu = - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; - - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && - idle_cpu(cpu) && idle_cpu(this_cpu) && - p->mm && !(p->flags & PF_KTHREAD) && - cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) - return chosen_wakeup_cpu; - - /* - * If it is idle, then it is the best cpu to run this task. - * - * This cpu is also the best, if it has more than one task already. - * Siblings must be also busy(in most cases) as they didn't already - * pickup the extra load from this cpu and hence we need not check - * sibling runqueue info. This will avoid the checks and cache miss - * penalities associated with that. - */ - if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) - return cpu; - - for_each_domain(cpu, sd) { - if ((sd->flags & SD_WAKE_IDLE) - || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq->clock, sd))) { - for_each_cpu_and(i, sched_domain_span(sd), - &p->cpus_allowed) { - if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { - if (i != task_cpu(p)) { - schedstat_inc(p, - se.nr_wakeups_idle); - } - return i; - } - } - } else { - break; - } - } - return cpu; -} -#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ -static inline int wake_idle(int cpu, struct task_struct *p) -{ - return cpu; -} -#endif - #ifdef CONFIG_SMP #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1225,25 +1153,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif -static int -wake_affine(struct sched_domain *this_sd, struct rq *this_rq, - struct task_struct *p, int prev_cpu, int this_cpu, int sync, - int idx, unsigned long load, unsigned long this_load, - unsigned int imbalance) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = this_rq->curr; - struct task_group *tg; - unsigned long tl = this_load; + struct task_struct *curr = current; + unsigned long this_load, load; + int idx, this_cpu, prev_cpu; unsigned long tl_per_task; + unsigned int imbalance; + struct task_group *tg; unsigned long weight; int balanced; - if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) - return 0; + idx = sd->wake_idx; + this_cpu = smp_processor_id(); + prev_cpu = task_cpu(p); + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); - if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; + if (sync) { + if (sched_feat(SYNC_LESS) && + (curr->se.avg_overlap > sysctl_sched_migration_cost || + p->se.avg_overlap > sysctl_sched_migration_cost)) + sync = 0; + } else { + if (sched_feat(SYNC_MORE) && + (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost)) + sync = 1; + } /* * If sync wakeup then subtract the (maximum possible) @@ -1254,24 +1191,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, tg = task_group(current); weight = current->se.load.weight; - tl += effective_load(tg, this_cpu, -weight, -weight); + this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); weight = p->se.load.weight; + imbalance = 100 + (sd->imbalance_pct - 100) / 2; + /* * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped tl to 0, we'll always have - * an imbalance, but there's really nothing you can do about that, so - * that's good too. + * due to the sync cause above having dropped this_load to 0, we'll + * always have an imbalance, but there's really nothing you can do + * about that, so that's good too. * * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !tl || - 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + balanced = !this_load || + 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* @@ -1285,14 +1224,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= - tl_per_task)) { + if (balanced || + (this_load <= load && + this_load + target_load(prev_cpu, idx) <= tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and * there is no bad imbalance. */ - schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(sd, ttwu_move_affine); schedstat_inc(p, se.nr_wakeups_affine); return 1; @@ -1300,65 +1240,216 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, return 0; } -static int select_task_rq_fair(struct task_struct *p, int sync) +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int load_idx) { - struct sched_domain *sd, *this_sd = NULL; - int prev_cpu, this_cpu, new_cpu; - unsigned long load, this_load; - struct rq *this_rq; - unsigned int imbalance; - int idx; + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int imbalance = 100 + (sd->imbalance_pct-100)/2; - prev_cpu = task_cpu(p); - this_cpu = smp_processor_id(); - this_rq = cpu_rq(this_cpu); - new_cpu = prev_cpu; + do { + unsigned long load, avg_load; + int local_group; + int i; - /* - * 'this_sd' is the first domain that both - * this_cpu and prev_cpu are present in: - */ - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { - this_sd = sd; - break; + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu(i, sched_group_cpus(group)) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_cpu - find the idlest cpu among the cpus in group. + */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + /* Traverse only the allowed CPUs */ + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + load = weighted_cpuload(i); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; } } - if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) - goto out; + return idlest; +} - /* - * Check for affine wakeup and passive balancing possibilities. - */ - if (!this_sd) +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +{ + struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int new_cpu = cpu; + int want_affine = 0; + int want_sd = 1; + int sync = wake_flags & WF_SYNC; + + if (sd_flag & SD_BALANCE_WAKE) { + if (sched_feat(AFFINE_WAKEUPS) && + cpumask_test_cpu(cpu, &p->cpus_allowed)) + want_affine = 1; + new_cpu = prev_cpu; + } + + rcu_read_lock(); + for_each_domain(cpu, tmp) { + /* + * If power savings logic is enabled for a domain, see if we + * are not overloaded, if so, don't balance wider. + */ + if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { + unsigned long power = 0; + unsigned long nr_running = 0; + unsigned long capacity; + int i; + + for_each_cpu(i, sched_domain_span(tmp)) { + power += power_of(i); + nr_running += cpu_rq(i)->cfs.nr_running; + } + + capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + nr_running /= 2; + + if (nr_running < capacity) + want_sd = 0; + } + + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + + affine_sd = tmp; + want_affine = 0; + } + + if (!want_sd && !want_affine) + break; + + if (!(tmp->flags & sd_flag)) + continue; + + if (want_sd) + sd = tmp; + } + + if (sched_feat(LB_SHARES_UPDATE)) { + /* + * Pick the largest domain to update shares over + */ + tmp = sd; + if (affine_sd && (!tmp || + cpumask_weight(sched_domain_span(affine_sd)) > + cpumask_weight(sched_domain_span(sd)))) + tmp = affine_sd; + + if (tmp) + update_shares(tmp); + } + + if (affine_sd && wake_affine(affine_sd, p, sync)) { + new_cpu = cpu; goto out; + } - idx = this_sd->wake_idx; + while (sd) { + int load_idx = sd->forkexec_idx; + struct sched_group *group; + int weight; - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; - if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, - load, this_load, imbalance)) - return this_cpu; + group = find_idlest_group(sd, p, cpu, load_idx); + if (!group) { + sd = sd->child; + continue; + } - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - return this_cpu; + new_cpu = find_idlest_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= cpumask_weight(sched_domain_span(tmp))) + break; + if (tmp->flags & sd_flag) + sd = tmp; } + /* while loop will break here if sd == NULL */ } out: - return wake_idle(new_cpu, p); + rcu_read_unlock(); + return new_cpu; } #endif /* CONFIG_SMP */ @@ -1471,11 +1562,12 @@ static void set_next_buddy(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); + int sync = wake_flags & WF_SYNC; update_curr(cfs_rq); @@ -1501,7 +1593,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) */ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) set_last_buddy(se); - set_next_buddy(pse); + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) + set_next_buddy(pse); /* * We can come here with TIF_NEED_RESCHED already set from new task @@ -1523,16 +1616,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; } - if (!sched_feat(WAKEUP_PREEMPT)) - return; - - if (sched_feat(WAKEUP_OVERLAP) && (sync || - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { + if ((sched_feat(WAKEUP_SYNC) && sync) || + (sched_feat(WAKEUP_OVERLAP) && + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { resched_task(curr); return; } + if (sched_feat(WAKEUP_RUNNING)) { + if (pse->avg_running < se->avg_running) { + set_next_buddy(pse); + resched_task(curr); + return; + } + } + + if (!sched_feat(WAKEUP_PREEMPT)) + return; + find_matching_se(&se, &pse); BUG_ON(!pse); @@ -1555,8 +1657,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) /* * If se was a buddy, clear it so that it will have to earn * the favour again. + * + * If se was not a buddy, clear the buddies because neither + * was elegible to run, let them earn it again. + * + * IOW. unconditionally clear buddies. */ - __clear_buddies(cfs_rq, se); + __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -1832,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p) } #endif +unsigned int get_rr_interval_fair(struct task_struct *task) +{ + struct sched_entity *se = &task->se; + unsigned long flags; + struct rq *rq; + unsigned int rr_interval = 0; + + /* + * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise + * idle runqueue: + */ + rq = task_rq_lock(task, &flags); + if (rq->cfs.load.weight) + rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + task_rq_unlock(rq, &flags); + + return rr_interval; +} + /* * All the scheduling class methods: */ @@ -1860,6 +1986,8 @@ static const struct sched_class fair_sched_class = { .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, + .get_rr_interval = get_rr_interval_fair, + #ifdef CONFIG_FAIR_GROUP_SCHED .moved_group = moved_group_fair, #endif diff --git a/kernel/sched_features.h b/kernel/sched_features.h index e2dc63a..0d94083 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,17 +1,123 @@ -SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) +/* + * Disregards a certain amount of sleep time (sched_latency_ns) and + * considers the task to be running during that period. This gives it + * a service deficit on wakeup, allowing it to run sooner. + */ +SCHED_FEAT(FAIR_SLEEPERS, 1) + +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) + +/* + * By not normalizing the sleep time, heavy tasks get an effective + * longer period, and lighter task an effective shorter period they + * are considered running. + */ SCHED_FEAT(NORMALIZED_SLEEPER, 0) -SCHED_FEAT(ADAPTIVE_GRAN, 1) -SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ SCHED_FEAT(START_DEBIT, 1) + +/* + * Should wakeups try to preempt running tasks. + */ +SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Compute wakeup_gran based on task behaviour, clipped to + * [0, sched_wakeup_gran_ns] + */ +SCHED_FEAT(ADAPTIVE_GRAN, 1) + +/* + * When converting the wakeup granularity to virtual time, do it such + * that heavier tasks preempting a lighter task have an edge. + */ +SCHED_FEAT(ASYM_GRAN, 1) + +/* + * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS. + */ +SCHED_FEAT(WAKEUP_SYNC, 0) + +/* + * Wakeup preempt based on task behaviour. Tasks that do not overlap + * don't get preempted. + */ +SCHED_FEAT(WAKEUP_OVERLAP, 0) + +/* + * Wakeup preemption towards tasks that run short + */ +SCHED_FEAT(WAKEUP_RUNNING, 0) + +/* + * Use the SYNC wakeup hint, pipes and the likes use this to indicate + * the remote end is likely to consume the data we just wrote, and + * therefore has cache benefit from being placed on the same cpu, see + * also AFFINE_WAKEUPS. + */ +SCHED_FEAT(SYNC_WAKEUPS, 1) + +/* + * Based on load and program behaviour, see if it makes sense to place + * a newly woken task on the same cpu as the task that woke it -- + * improve cache locality. Typically used with SYNC wakeups as + * generated by pipes and the like, see also SYNC_WAKEUPS. + */ SCHED_FEAT(AFFINE_WAKEUPS, 1) + +/* + * Weaken SYNC hint based on overlap + */ +SCHED_FEAT(SYNC_LESS, 1) + +/* + * Add SYNC hint based on overlap + */ +SCHED_FEAT(SYNC_MORE, 0) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, 0) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ SCHED_FEAT(CACHE_HOT_BUDDY, 1) -SCHED_FEAT(SYNC_WAKEUPS, 1) + +/* + * Use arch dependent cpu power functions + */ +SCHED_FEAT(ARCH_POWER, 0) + SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) -SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_WAKEUP_UPDATE, 1) +SCHED_FEAT(LB_SHARES_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) -SCHED_FEAT(WAKEUP_OVERLAP, 0) -SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Spin-wait on mutex acquisition when the mutex owner is running on + * another cpu -- assumes that when the owner is running, it will soon + * release the lock. Decreases scheduling overhead. + */ SCHED_FEAT(OWNER_SPIN, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 499672c..b133a28 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,7 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sync) +static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) { resched_task(rq->idle); } @@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, check_preempt_curr(rq, p, 0); } +unsigned int get_rr_interval_idle(struct task_struct *task) +{ + return 0; +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = { .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, + .get_rr_interval = get_rr_interval_idle, + .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2eb4bd6..a4d790c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int sync) +static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { struct rq *rq = task_rq(p); + if (sd_flag != SD_BALANCE_WAKE) + return smp_processor_id(); + /* * If the current task is an RT task, then * try to see if we can wake this RT task up on another @@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); @@ -1731,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq) dequeue_pushable_task(rq, p); } +unsigned int get_rr_interval_rt(struct task_struct *task) +{ + /* + * Time slice is 0 for SCHED_FIFO tasks + */ + if (task->policy == SCHED_RR) + return DEF_TIMESLICE; + else + return 0; +} + static const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, @@ -1759,6 +1773,8 @@ static const struct sched_class rt_sched_class = { .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, + .get_rr_interval = get_rr_interval_rt, + .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, }; diff --git a/kernel/signal.c b/kernel/signal.c index 64c5dee..6705320 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) if (why) { /* - * The first thread which returns from finish_stop() + * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ @@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) return send_signal(sig, info, t, 0); } +int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, + bool group) +{ + unsigned long flags; + int ret = -ESRCH; + + if (lock_task_sighand(p, &flags)) { + ret = send_signal(sig, info, p, group); + unlock_task_sighand(p, &flags); + } + + return ret; +} + /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. @@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p) } } -int __fatal_signal_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL); -} -EXPORT_SYMBOL(__fatal_signal_pending); - struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; @@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - unsigned long flags; - int ret; + int ret = check_kill_permission(sig, info, p); - ret = check_kill_permission(sig, info, p); - - if (!ret && sig) { - ret = -ESRCH; - if (lock_task_sighand(p, &flags)) { - ret = __group_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); - } - } + if (!ret && sig) + ret = do_send_sig_info(sig, info, p, true); return ret; } @@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -/* - * The caller must ensure the task can't exit. - */ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret; - unsigned long flags; - /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). @@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = specific_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; + return do_send_sig_info(sig, info, p, false); } #define __si_special(priv) \ @@ -1383,15 +1374,6 @@ ret: } /* - * Wake up any threads in the parent blocked in wait* syscalls. - */ -static inline void __wake_up_parent(struct task_struct *p, - struct task_struct *parent) -{ - wake_up_interruptible_sync(&parent->signal->wait_chldexit); -} - -/* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * @@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code) spin_unlock_irq(¤t->sighand->siglock); } -static void -finish_stop(int stop_count) -{ - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, - * report to the parent. When ptraced, every thread reports itself. - */ - if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, CLD_STOPPED); - read_unlock(&tasklist_lock); - } - - do { - schedule(); - } while (try_to_freeze()); - /* - * Now we don't run again until continued. - */ - current->exit_code = 0; -} - /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. @@ -1705,15 +1664,9 @@ finish_stop(int stop_count) static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int stop_count; + int notify; - if (sig->group_stop_count > 0) { - /* - * There is a group stop in progress. We don't need to - * start another one. - */ - stop_count = --sig->group_stop_count; - } else { + if (!sig->group_stop_count) { struct task_struct *t; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || @@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr) */ sig->group_exit_code = signr; - stop_count = 0; + sig->group_stop_count = 1; for (t = next_thread(current); t != current; t = next_thread(t)) /* * Setting state to TASK_STOPPED for a group @@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr) */ if (!(t->flags & PF_EXITING) && !task_is_stopped_or_traced(t)) { - stop_count++; + sig->group_stop_count++; signal_wake_up(t, 0); } - sig->group_stop_count = stop_count; } + /* + * If there are no other threads in the group, or if there is + * a group stop in progress and we are the last to stop, report + * to the parent. When ptraced, every thread reports itself. + */ + notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; + notify = tracehook_notify_jctl(notify, CLD_STOPPED); + /* + * tracehook_notify_jctl() can drop and reacquire siglock, so + * we keep ->group_stop_count != 0 before the call. If SIGCONT + * or SIGKILL comes in between ->group_stop_count == 0. + */ + if (sig->group_stop_count) { + if (!--sig->group_stop_count) + sig->flags = SIGNAL_STOP_STOPPED; + current->exit_code = sig->group_exit_code; + __set_current_state(TASK_STOPPED); + } + spin_unlock_irq(¤t->sighand->siglock); - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; - __set_current_state(TASK_STOPPED); + if (notify) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, notify); + read_unlock(&tasklist_lock); + } + + /* Now we don't run again until woken by SIGCONT or SIGKILL */ + do { + schedule(); + } while (try_to_freeze()); + + tracehook_finish_jctl(); + current->exit_code = 0; - spin_unlock_irq(¤t->sighand->siglock); - finish_stop(stop_count); return 1; } @@ -1815,14 +1793,15 @@ relock: int why = (signal->flags & SIGNAL_STOP_CONTINUED) ? CLD_CONTINUED : CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; - spin_unlock_irq(&sighand->siglock); - if (unlikely(!tracehook_notify_jctl(1, why))) - goto relock; + why = tracehook_notify_jctl(why, CLD_CONTINUED); + spin_unlock_irq(&sighand->siglock); - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); + if (why) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, why); + read_unlock(&tasklist_lock); + } goto relock; } @@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk) if (unlikely(tsk->signal->group_stop_count) && !--tsk->signal->group_stop_count) { tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = 1; + group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); } out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { + if (unlikely(group_stop)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, CLD_STOPPED); + do_notify_parent_cldstop(tsk, group_stop); read_unlock(&tasklist_lock); } } @@ -2290,7 +2269,6 @@ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { struct task_struct *p; - unsigned long flags; int error = -ESRCH; rcu_read_lock(); @@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. - * - * If lock_task_sighand() fails we pretend the task dies - * after receiving the signal. The window is tiny, and the - * signal is private anyway. */ - if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); + if (!error && sig) { + error = do_send_sig_info(sig, info, p, false); + /* + * If lock_task_sighand() failed we pretend the task + * dies after receiving the signal. The window is tiny, + * and the signal is private anyway. + */ + if (unlikely(error == -ESRCH)) + error = 0; } } rcu_read_unlock(); diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 09d7519..0d31135 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); #ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, +static int slow_work_min_threads_sysctl(struct ctl_table *, int, void __user *, size_t *, loff_t *); -static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, +static int slow_work_max_threads_sysctl(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif @@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data) * Handle adjustment of the minimum number of threads */ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { @@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, * Handle adjustment of the maximum number of threads */ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { diff --git a/kernel/smp.c b/kernel/smp.c index 94188b8..c9d1c78 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -29,8 +29,7 @@ enum { struct call_function_data { struct call_single_data csd; - spinlock_t lock; - unsigned int refs; + atomic_t refs; cpumask_var_t cpumask; }; @@ -39,9 +38,7 @@ struct call_single_queue { spinlock_t lock; }; -static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { - .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), -}; +static DEFINE_PER_CPU(struct call_function_data, cfd_data); static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -177,6 +174,11 @@ void generic_smp_call_function_interrupt(void) int cpu = get_cpu(); /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(cpu)); + + /* * Ensure entry is visible on call_function_queue after we have * entered the IPI. See comment in smp_call_function_many. * If we don't have this, then we may miss an entry on the list @@ -191,25 +193,18 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - spin_lock(&data->lock); - if (!cpumask_test_cpu(cpu, data->cpumask)) { - spin_unlock(&data->lock); + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; - } - cpumask_clear_cpu(cpu, data->cpumask); - spin_unlock(&data->lock); data->csd.func(data->csd.info); - spin_lock(&data->lock); - WARN_ON(data->refs == 0); - refs = --data->refs; + refs = atomic_dec_return(&data->refs); + WARN_ON(refs < 0); if (!refs) { spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); spin_unlock(&call_function.lock); } - spin_unlock(&data->lock); if (refs) continue; @@ -230,6 +225,11 @@ void generic_smp_call_function_single_interrupt(void) unsigned int data_flags; LIST_HEAD(list); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(smp_processor_id())); + spin_lock(&q->lock); list_replace_init(&q->list, &list); spin_unlock(&q->lock); @@ -285,8 +285,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, */ this_cpu = get_cpu(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -329,19 +335,18 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, { csd_lock(data); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() + && !oops_in_progress); generic_exec_single(cpu, data, wait); } -/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ - -#ifndef arch_send_call_function_ipi_mask -# define arch_send_call_function_ipi_mask(maskp) \ - arch_send_call_function_ipi(*(maskp)) -#endif - /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). @@ -365,8 +370,14 @@ void smp_call_function_many(const struct cpumask *mask, unsigned long flags; int cpu, next_cpu, this_cpu = smp_processor_id(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); @@ -391,23 +402,20 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); - spin_lock_irqsave(&data->lock, flags); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); - data->refs = cpumask_weight(data->cpumask); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); - spin_lock(&call_function.lock); + spin_lock_irqsave(&call_function.lock, flags); /* * Place entry at the _HEAD_ of the list, so that any cpu still * observing the entry in generic_smp_call_function_interrupt() * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); - spin_unlock(&call_function.lock); - - spin_unlock_irqrestore(&data->lock, flags); + spin_unlock_irqrestore(&call_function.lock, flags); /* * Make the list addition visible before sending the ipi. diff --git a/kernel/softirq.c b/kernel/softirq.c index 7db2506..f8749e5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 88796c3..81324d1 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void) EXPORT_SYMBOL(touch_all_softlockup_watchdogs); int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } /* diff --git a/kernel/sys.c b/kernel/sys.c index b3f1097..255475d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,7 +14,7 @@ #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> #include <linux/kexec.h> @@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long flags; cputime_t utime, stime; struct task_cputime cputime; + unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; @@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = task_utime(current); stime = task_stime(current); accumulate_thread_rusage(p, r); + maxrss = p->signal->maxrss; goto out; } @@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt = p->signal->cmaj_flt; r->ru_inblock = p->signal->cinblock; r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; if (who == RUSAGE_CHILDREN) break; @@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt += p->signal->maj_flt; r->ru_inblock += p->signal->inblock; r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; t = p; do { accumulate_thread_rusage(t, r); @@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) out: cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); + + if (who != RUSAGE_CHILDREN) { + struct mm_struct *mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); + } + } + r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) @@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; - case PR_TASK_PERF_COUNTERS_DISABLE: - error = perf_counter_task_disable(); + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); break; - case PR_TASK_PERF_COUNTERS_ENABLE: - error = perf_counter_task_enable(); + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; @@ -1528,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, current->timer_slack_ns = arg2; error = 0; break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case 0: + if (arg3 != 0) + return -EINVAL; + current->flags &= ~PF_MCE_PROCESS; + break; + case 1: + current->flags |= PF_MCE_PROCESS; + if (arg3 != 0) + current->flags |= PF_MCE_EARLY; + else + current->flags &= ~PF_MCE_EARLY; + break; + default: + return -EINVAL; + } + error = 0; + break; + default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 44e5936..e06d0b8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -178,4 +178,4 @@ cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); /* performance counters: */ -cond_syscall(sys_perf_counter_open); +cond_syscall(sys_perf_event_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3125cff..0d949c5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -26,7 +26,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/utsname.h> #include <linux/kmemcheck.h> #include <linux/smp_lock.h> #include <linux/fs.h> @@ -50,7 +49,7 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/slow-work.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -77,6 +76,7 @@ extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; +extern unsigned int core_pipe_limit; extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -91,6 +91,9 @@ extern int sysctl_nr_trim_pages; #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#ifdef CONFIG_BLOCK +extern int blk_iopoll_enabled; +#endif /* Constants used for minimum and maximum */ #ifdef CONFIG_DETECT_SOFTLOCKUP @@ -103,6 +106,9 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; +#ifdef CONFIG_PRINTK +static int ten_thousand = 10000; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -157,9 +163,9 @@ extern int max_lock_depth; #endif #ifdef CONFIG_PROC_SYSCTL -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -418,6 +424,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -719,6 +733,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &ten_thousand, + }, #endif { .ctl_name = KERN_NGROUPS_MAX, @@ -961,28 +986,28 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_paranoid", - .data = &sysctl_perf_counter_paranoid, - .maxlen = sizeof(sysctl_perf_counter_paranoid), + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_mlock_kb", - .data = &sysctl_perf_counter_mlock, - .maxlen = sizeof(sysctl_perf_counter_mlock), + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_max_sample_rate", - .data = &sysctl_perf_counter_sample_rate, - .maxlen = sizeof(sysctl_perf_counter_sample_rate), + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, @@ -997,7 +1022,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif - +#ifdef CONFIG_BLOCK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "blk_iopoll", + .data = &blk_iopoll_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1364,6 +1398,31 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, +#ifdef CONFIG_MEMORY_FAILURE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -2192,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head) #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(void* data, int maxlen, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -2253,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write, * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2267,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write, * * Returns 0 on success. */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, filp, + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, ppos); } @@ -2295,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, struct file *filp, void __user *buffer, + int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -2402,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, #undef TMPBUFLEN } -static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, +static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { - return __do_proc_dointvec(table->data, table, write, filp, + return __do_proc_dointvec(table->data, table, write, buffer, lenp, ppos, conv, data); } @@ -2416,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2426,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, NULL,NULL); } @@ -2437,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, * Taint values can only be increased * This means we can safely use a temporary. */ -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -2449,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp, t = *table; t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; @@ -2501,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2514,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_minmax_conv, ¶m); } static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, @@ -2631,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, - filp, buffer, lenp, ppos, convmul, convdiv); + buffer, lenp, ppos, convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2658,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } /** * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2683,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); } @@ -2763,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2775,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); } @@ -2786,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: pointer to the file position @@ -2798,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * * Returns 0 on success. */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); } @@ -2809,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2822,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct pid *new_pid; @@ -2838,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp tmp = pid_vnr(cad_pid); - r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + r = __do_proc_dointvec(&tmp, table, write, buffer, lenp, ppos, NULL, NULL); if (r || !write) return r; @@ -2853,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp #else /* CONFIG_PROC_FS */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/time.c b/kernel/time.c index 2951194..2e2e469 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime); * 0 <= tv_nsec < NSEC_PER_SEC * For negative values only the tv_sec field is negative ! */ -void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) +void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) { while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } while (nsec < 0) { + asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 0b0a636..ee26662 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7466cb8..0911334 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -21,7 +21,6 @@ * * TODO WishList: * o Allow clocksource drivers to be unregistered - * o get rid of clocksource_jiffies extern */ #include <linux/clocksource.h> @@ -30,6 +29,7 @@ #include <linux/module.h> #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ #include <linux/tick.h> +#include <linux/kthread.h> void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, @@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc, } EXPORT_SYMBOL(timecounter_cyc2time); -/* XXX - Would like a better way for initializing curr_clocksource */ -extern struct clocksource clocksource_jiffies; - /*[Clocksource internal variables]--------- * curr_clocksource: - * currently selected clocksource. Initialized to clocksource_jiffies. - * next_clocksource: - * pending next selected clocksource. + * currently selected clocksource. * clocksource_list: * linked list with the registered clocksources - * clocksource_lock: - * protects manipulations to curr_clocksource and next_clocksource - * and the clocksource_list + * clocksource_mutex: + * protects manipulations to curr_clocksource and the clocksource_list * override_name: * Name of the user-specified clocksource. */ -static struct clocksource *curr_clocksource = &clocksource_jiffies; -static struct clocksource *next_clocksource; -static struct clocksource *clocksource_override; +static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); -static DEFINE_SPINLOCK(clocksource_lock); +static DEFINE_MUTEX(clocksource_mutex); static char override_name[32]; static int finished_booting; -/* clocksource_done_booting - Called near the end of core bootup - * - * Hack to avoid lots of clocksource churn at boot time. - * We use fs_initcall because we want this to start before - * device_initcall but after subsys_initcall. - */ -static int __init clocksource_done_booting(void) -{ - finished_booting = 1; - return 0; -} -fs_initcall(clocksource_done_booting); - #ifdef CONFIG_CLOCKSOURCE_WATCHDOG +static void clocksource_watchdog_work(struct work_struct *work); + static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; static struct timer_list watchdog_timer; +static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; -static unsigned long watchdog_resumed; +static int watchdog_running; + +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); /* * Interval: 0.5sec Threshold: 0.0625s @@ -158,135 +143,249 @@ static unsigned long watchdog_resumed; #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) -static void clocksource_ratewd(struct clocksource *cs, int64_t delta) +static void clocksource_watchdog_work(struct work_struct *work) { - if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) - return; + /* + * If kthread_run fails the next watchdog scan over the + * watchdog_list will find the unstable clock again. + */ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} +static void __clocksource_unstable(struct clocksource *cs) +{ + cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); + cs->flags |= CLOCK_SOURCE_UNSTABLE; + if (finished_booting) + schedule_work(&watchdog_work); +} + +static void clocksource_unstable(struct clocksource *cs, int64_t delta) +{ printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", cs->name, delta); - cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); - clocksource_change_rating(cs, 0); - list_del(&cs->wd_list); + __clocksource_unstable(cs); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called instead of clocksource_change_rating from + * cpu hotplug code to avoid a deadlock between the clocksource mutex + * and the cpu hotplug mutex. It defers the update of the clocksource + * to the watchdog thread. + */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); } static void clocksource_watchdog(unsigned long data) { - struct clocksource *cs, *tmp; + struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; - int resumed; + int next_cpu; spin_lock(&watchdog_lock); - - resumed = test_and_clear_bit(0, &watchdog_resumed); + if (!watchdog_running) + goto out; wdnow = watchdog->read(watchdog); - wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); + wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, + watchdog->mult, watchdog->shift); watchdog_last = wdnow; - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { - csnow = cs->read(cs); + list_for_each_entry(cs, &watchdog_list, wd_list) { - if (unlikely(resumed)) { - cs->wd_last = csnow; + /* Clocksource already marked unstable? */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + if (finished_booting) + schedule_work(&watchdog_work); continue; } - /* Initialized ? */ + csnow = cs->read(cs); + + /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { - if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - /* - * We just marked the clocksource as - * highres-capable, notify the rest of the - * system as well so that we transition - * into high-res mode: - */ - tick_clock_notify(); - } cs->flags |= CLOCK_SOURCE_WATCHDOG; cs->wd_last = csnow; - } else { - cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); - cs->wd_last = csnow; - /* Check the delta. Might remove from the list ! */ - clocksource_ratewd(cs, cs_nsec - wd_nsec); + continue; } - } - if (!list_empty(&watchdog_list)) { - /* - * Cycle through CPUs to check if the CPUs stay - * synchronized to each other. - */ - int next_cpu = cpumask_next(raw_smp_processor_id(), - cpu_online_mask); + /* Check the deviation from the watchdog clocksource. */ + cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & + cs->mask, cs->mult, cs->shift); + cs->wd_last = csnow; + if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + clocksource_unstable(cs, cs_nsec - wd_nsec); + continue; + } - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + /* + * We just marked the clocksource as highres-capable, + * notify the rest of the system as well so that we + * transition into high-res mode: + */ + tick_clock_notify(); + } } + + /* + * Cycle through CPUs to check if the CPUs stay synchronized + * to each other. + */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); +out: spin_unlock(&watchdog_lock); } + +static inline void clocksource_start_watchdog(void) +{ + if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + return; + init_timer(&watchdog_timer); + watchdog_timer.function = clocksource_watchdog; + watchdog_last = watchdog->read(watchdog); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + watchdog_running = 1; +} + +static inline void clocksource_stop_watchdog(void) +{ + if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + return; + del_timer(&watchdog_timer); + watchdog_running = 0; +} + +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + static void clocksource_resume_watchdog(void) { - set_bit(0, &watchdog_resumed); + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + clocksource_reset_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); } -static void clocksource_check_watchdog(struct clocksource *cs) +static void clocksource_enqueue_watchdog(struct clocksource *cs) { - struct clocksource *cse; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - int started = !list_empty(&watchdog_list); - + /* cs is a clocksource to be watched. */ list_add(&cs->wd_list, &watchdog_list); - if (!started && watchdog) { - watchdog_last = watchdog->read(watchdog); - watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - cpumask_first(cpu_online_mask)); - } + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; } else { + /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - + /* Pick the best watchdog. */ if (!watchdog || cs->rating > watchdog->rating) { - if (watchdog) - del_timer(&watchdog_timer); watchdog = cs; - init_timer(&watchdog_timer); - watchdog_timer.function = clocksource_watchdog; - /* Reset watchdog cycles */ - list_for_each_entry(cse, &watchdog_list, wd_list) - cse->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Start if list is not empty */ - if (!list_empty(&watchdog_list)) { - watchdog_last = watchdog->read(watchdog); - watchdog_timer.expires = - jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - cpumask_first(cpu_online_mask)); - } + clocksource_reset_watchdog(); + } + } + /* Check if the watchdog timer needs to be started. */ + clocksource_start_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_dequeue_watchdog(struct clocksource *cs) +{ + struct clocksource *tmp; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + } else if (cs == watchdog) { + /* Reset watchdog cycles */ + clocksource_reset_watchdog(); + /* Current watchdog is removed. Find an alternative. */ + watchdog = NULL; + list_for_each_entry(tmp, &clocksource_list, list) { + if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + if (!watchdog || tmp->rating > watchdog->rating) + watchdog = tmp; } } + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } -#else -static void clocksource_check_watchdog(struct clocksource *cs) + +static int clocksource_watchdog_kthread(void *data) +{ + struct clocksource *cs, *tmp; + unsigned long flags; + LIST_HEAD(unstable); + + mutex_lock(&clocksource_mutex); + spin_lock_irqsave(&watchdog_lock, flags); + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + list_del_init(&cs->wd_list); + list_add(&cs->wd_list, &unstable); + } + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); + + /* Needs to be done outside of watchdog lock */ + list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { + list_del_init(&cs->wd_list); + __clocksource_change_rating(cs, 0); + } + mutex_unlock(&clocksource_mutex); + return 0; +} + +#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +static void clocksource_enqueue_watchdog(struct clocksource *cs) { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -#endif +static inline int clocksource_watchdog_kthread(void *data) { return 0; } + +#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ /** * clocksource_resume - resume the clocksource(s) @@ -294,18 +393,16 @@ static inline void clocksource_resume_watchdog(void) { } void clocksource_resume(void) { struct clocksource *cs; - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); - list_for_each_entry(cs, &clocksource_list, list) { + list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) cs->resume(); - } clocksource_resume_watchdog(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); } /** @@ -320,75 +417,94 @@ void clocksource_touch_watchdog(void) clocksource_resume_watchdog(); } +#ifdef CONFIG_GENERIC_TIME + /** - * clocksource_get_next - Returns the selected clocksource + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. */ -struct clocksource *clocksource_get_next(void) +static void clocksource_select(void) { - unsigned long flags; + struct clocksource *best, *cs; - spin_lock_irqsave(&clocksource_lock, flags); - if (next_clocksource && finished_booting) { - curr_clocksource = next_clocksource; - next_clocksource = NULL; + if (!finished_booting || list_empty(&clocksource_list)) + return; + /* First clocksource on the list has the best rating. */ + best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, override_name) != 0) + continue; + /* + * Check to make sure we don't switch to a non-highres + * capable clocksource if the tick code is in oneshot + * mode (highres or nohz) + */ + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + tick_oneshot_mode_active()) { + /* Override clocksource cannot be used. */ + printk(KERN_WARNING "Override clocksource %s is not " + "HRT compatible. Cannot switch while in " + "HRT/NOHZ mode\n", cs->name); + override_name[0] = 0; + } else + /* Override clocksource can be used. */ + best = cs; + break; + } + if (curr_clocksource != best) { + printk(KERN_INFO "Switching to clocksource %s\n", best->name); + curr_clocksource = best; + timekeeping_notify(curr_clocksource); } - spin_unlock_irqrestore(&clocksource_lock, flags); - - return curr_clocksource; } -/** - * select_clocksource - Selects the best registered clocksource. - * - * Private function. Must hold clocksource_lock when called. +#else /* CONFIG_GENERIC_TIME */ + +static inline void clocksource_select(void) { } + +#endif + +/* + * clocksource_done_booting - Called near the end of core bootup * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. + * Hack to avoid lots of clocksource churn at boot time. + * We use fs_initcall because we want this to start before + * device_initcall but after subsys_initcall. */ -static struct clocksource *select_clocksource(void) +static int __init clocksource_done_booting(void) { - struct clocksource *next; - - if (list_empty(&clocksource_list)) - return NULL; - - if (clocksource_override) - next = clocksource_override; - else - next = list_entry(clocksource_list.next, struct clocksource, - list); + finished_booting = 1; - if (next == curr_clocksource) - return NULL; + /* + * Run the watchdog first to eliminate unstable clock sources + */ + clocksource_watchdog_kthread(NULL); - return next; + mutex_lock(&clocksource_mutex); + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; } +fs_initcall(clocksource_done_booting); /* * Enqueue the clocksource sorted by rating */ -static int clocksource_enqueue(struct clocksource *c) +static void clocksource_enqueue(struct clocksource *cs) { - struct list_head *tmp, *entry = &clocksource_list; + struct list_head *entry = &clocksource_list; + struct clocksource *tmp; - list_for_each(tmp, &clocksource_list) { - struct clocksource *cs; - - cs = list_entry(tmp, struct clocksource, list); - if (cs == c) - return -EBUSY; + list_for_each_entry(tmp, &clocksource_list, list) /* Keep track of the place, where to insert */ - if (cs->rating >= c->rating) - entry = tmp; - } - list_add(&c->list, entry); - - if (strlen(c->name) == strlen(override_name) && - !strcmp(c->name, override_name)) - clocksource_override = c; - - return 0; + if (tmp->rating >= cs->rating) + entry = &tmp->list; + list_add(&cs->list, entry); } /** @@ -397,52 +513,48 @@ static int clocksource_enqueue(struct clocksource *c) * * Returns -EBUSY if registration fails, zero otherwise. */ -int clocksource_register(struct clocksource *c) +int clocksource_register(struct clocksource *cs) { - unsigned long flags; - int ret; - - spin_lock_irqsave(&clocksource_lock, flags); - ret = clocksource_enqueue(c); - if (!ret) - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); - if (!ret) - clocksource_check_watchdog(c); - return ret; + mutex_lock(&clocksource_mutex); + clocksource_enqueue(cs); + clocksource_select(); + clocksource_enqueue_watchdog(cs); + mutex_unlock(&clocksource_mutex); + return 0; } EXPORT_SYMBOL(clocksource_register); +static void __clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); + clocksource_select(); +} + /** * clocksource_change_rating - Change the rating of a registered clocksource - * */ void clocksource_change_rating(struct clocksource *cs, int rating) { - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); - list_del(&cs->list); - cs->rating = rating; - clocksource_enqueue(cs); - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); + __clocksource_change_rating(cs, rating); + mutex_unlock(&clocksource_mutex); } +EXPORT_SYMBOL(clocksource_change_rating); /** * clocksource_unregister - remove a registered clocksource */ void clocksource_unregister(struct clocksource *cs) { - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); + clocksource_dequeue_watchdog(cs); list_del(&cs->list); - if (clocksource_override == cs) - clocksource_override = NULL; - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); + clocksource_select(); + mutex_unlock(&clocksource_mutex); } +EXPORT_SYMBOL(clocksource_unregister); #ifdef CONFIG_SYSFS /** @@ -458,9 +570,9 @@ sysfs_show_current_clocksources(struct sys_device *dev, { ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); return count; } @@ -478,9 +590,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t count) { - struct clocksource *ovr = NULL; size_t ret = count; - int len; /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(override_name)) @@ -490,44 +600,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, if (buf[count-1] == '\n') count--; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); if (count > 0) memcpy(override_name, buf, count); override_name[count] = 0; + clocksource_select(); - len = strlen(override_name); - if (len) { - struct clocksource *cs; - - ovr = clocksource_override; - /* try to select it: */ - list_for_each_entry(cs, &clocksource_list, list) { - if (strlen(cs->name) == len && - !strcmp(cs->name, override_name)) - ovr = cs; - } - } - - /* - * Check to make sure we don't switch to a non-highres capable - * clocksource if the tick code is in oneshot mode (highres or nohz) - */ - if (tick_oneshot_mode_active() && ovr && - !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { - printk(KERN_WARNING "%s clocksource is not HRT compatible. " - "Cannot switch while in HRT/NOHZ mode\n", ovr->name); - ovr = NULL; - override_name[0] = 0; - } - - /* Reselect, when the override name has changed */ - if (ovr != clocksource_override) { - clocksource_override = ovr; - next_clocksource = select_clocksource(); - } - - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); return ret; } @@ -547,7 +627,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, struct clocksource *src; ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); list_for_each_entry(src, &clocksource_list, list) { /* * Don't show non-HRES clocksource if the tick code is @@ -559,7 +639,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "%s ", src->name); } - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); count += snprintf(buf + count, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); @@ -614,11 +694,10 @@ device_initcall(init_clocksource_sysfs); */ static int __init boot_override_clocksource(char* str) { - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); if (str) strlcpy(override_name, str, sizeof(override_name)); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index c3f6c30..5404a84 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ - .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; @@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void) } core_initcall(init_jiffies_clocksource); + +struct clocksource * __init __weak clocksource_default_clock(void) +{ + return &clocksource_jiffies; +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7fc6437..4800f93 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) case TIME_OK: break; case TIME_INS: - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; + timekeeping_leap_insert(-1); time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); @@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) res = HRTIMER_RESTART; break; case TIME_DEL: - xtime.tv_sec++; + timekeeping_leap_insert(1); time_tai--; - wall_to_monotonic.tv_sec--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_state = TIME_OK; break; } - update_vsyscall(&xtime, clock); write_sequnlock(&xtime_lock); diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c new file mode 100644 index 0000000..86628e7 --- /dev/null +++ b/kernel/time/timeconv.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Paul Eggert (eggert@twinsun.com). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU C Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Converts the calendar time to broken-down time representation + * Based on code from glibc-2.6 + * + * 2009-7-14: + * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> + */ + +#include <linux/time.h> +#include <linux/module.h> + +/* + * Nonzero if YEAR is a leap year (every 4 years, + * except every 100th isn't, and every 400th is). + */ +static int __isleap(long year) +{ + return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); +} + +/* do a mathdiv for long type */ +static long math_div(long a, long b) +{ + return a / b - (a % b < 0); +} + +/* How many leap years between y1 and y2, y1 must less or equal to y2 */ +static long leaps_between(long y1, long y2) +{ + long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) + + math_div(y1 - 1, 400); + long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) + + math_div(y2 - 1, 400); + return leaps2 - leaps1; +} + +/* How many days come before each month (0-12). */ +static const unsigned short __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. */ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + long days, rem, y; + const unsigned short *ip; + + days = totalsecs / SECS_PER_DAY; + rem = totalsecs % SECS_PER_DAY; + rem += offset; + while (rem < 0) { + rem += SECS_PER_DAY; + --days; + } + while (rem >= SECS_PER_DAY) { + rem -= SECS_PER_DAY; + ++days; + } + + result->tm_hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + result->tm_min = rem / 60; + result->tm_sec = rem % 60; + + /* January 1, 1970 was a Thursday. */ + result->tm_wday = (4 + days) % 7; + if (result->tm_wday < 0) + result->tm_wday += 7; + + y = 1970; + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + /* Guess a corrected year, assuming 365 days per year. */ + long yg = y + math_div(days, 365); + + /* Adjust DAYS and Y to match the guessed year. */ + days -= (yg - y) * 365 + leaps_between(y, yg); + y = yg; + } + + result->tm_year = y - 1900; + + result->tm_yday = days; + + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < ip[y]; y--) + continue; + days -= ip[y]; + + result->tm_mon = y; + result->tm_mday = days + 1; +} +EXPORT_SYMBOL(time_to_tm); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e8c77d9..fb0f46f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -18,7 +18,117 @@ #include <linux/jiffies.h> #include <linux/time.h> #include <linux/tick.h> +#include <linux/stop_machine.h> + +/* Structure holding internal timekeeping values. */ +struct timekeeper { + /* Current clocksource used for timekeeping. */ + struct clocksource *clock; + /* The shift value of the current clocksource. */ + int shift; + + /* Number of clock cycles in one NTP interval. */ + cycle_t cycle_interval; + /* Number of clock shifted nano seconds in one NTP interval. */ + u64 xtime_interval; + /* Raw nano seconds accumulated per NTP interval. */ + u32 raw_interval; + + /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ + u64 xtime_nsec; + /* Difference between accumulated time and NTP time in ntp + * shifted nano seconds. */ + s64 ntp_error; + /* Shift conversion between clock shifted nano seconds and + * ntp shifted nano seconds. */ + int ntp_error_shift; + /* NTP adjusted clock multiplier */ + u32 mult; +}; + +struct timekeeper timekeeper; + +/** + * timekeeper_setup_internals - Set up internals to use clocksource clock. + * + * @clock: Pointer to clocksource. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static void timekeeper_setup_internals(struct clocksource *clock) +{ + cycle_t interval; + u64 tmp; + + timekeeper.clock = clock; + clock->cycle_last = clock->read(clock); + /* Do the ns -> cycle conversion first, using original mult */ + tmp = NTP_INTERVAL_LENGTH; + tmp <<= clock->shift; + tmp += clock->mult/2; + do_div(tmp, clock->mult); + if (tmp == 0) + tmp = 1; + + interval = (cycle_t) tmp; + timekeeper.cycle_interval = interval; + + /* Go back from cycles -> shifted ns */ + timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.raw_interval = + ((u64) interval * clock->mult) >> clock->shift; + + timekeeper.xtime_nsec = 0; + timekeeper.shift = clock->shift; + + timekeeper.ntp_error = 0; + timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + + /* + * The timekeeper keeps its own mult values for the currently + * active clocksource. These value will be adjusted via NTP + * to counteract clock drifting. + */ + timekeeper.mult = clock->mult; +} + +/* Timekeeper helper functions. */ +static inline s64 timekeeping_get_ns(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + + /* read clocksource: */ + clock = timekeeper.clock; + cycle_now = clock->read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* return delta convert to nanoseconds using ntp adjusted mult. */ + return clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); +} + +static inline s64 timekeeping_get_ns_raw(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + + /* read clocksource: */ + clock = timekeeper.clock; + cycle_now = clock->read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* return delta convert to nanoseconds using ntp adjusted mult. */ + return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); +} /* * This read-write spinlock protects us from races in SMP while @@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); */ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static unsigned long total_sleep_time; /* seconds */ +static struct timespec total_sleep_time; + +/* + * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. + */ +struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec) timespec_add_ns(&xtime_cache, nsec); } -struct clocksource *clock; - +/* must hold xtime_lock */ +void timekeeping_leap_insert(int leapsecond) +{ + xtime.tv_sec += leapsecond; + wall_to_monotonic.tv_sec -= leapsecond; + update_vsyscall(&xtime, timekeeper.clock); +} #ifdef CONFIG_GENERIC_TIME + /** - * clocksource_forward_now - update clock to the current time + * timekeeping_forward_now - update clock to the current time * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ -static void clocksource_forward_now(void) +static void timekeeping_forward_now(void) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; s64 nsec; - cycle_now = clocksource_read(clock); + clock = timekeeper.clock; + cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = cyc2ns(clock, cycle_delta); + nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); /* If arch requires, add in gettimeoffset() */ nsec += arch_gettimeoffset(); timespec_add_ns(&xtime, nsec); - nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; - clock->raw_time.tv_nsec += nsec; + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + timespec_add_ns(&raw_time, nsec); } /** @@ -95,7 +219,6 @@ static void clocksource_forward_now(void) */ void getnstimeofday(struct timespec *ts) { - cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs = cyc2ns(clock, cycle_delta); + nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts) EXPORT_SYMBOL(getnstimeofday); +ktime_t ktime_get(void) +{ + unsigned int seq; + s64 secs, nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + secs = xtime.tv_sec + wall_to_monotonic.tv_sec; + nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; + nsecs += timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + /* + * Use ktime_set/ktime_add_ns to create a proper ktime on + * 32-bit architectures without CONFIG_KTIME_SCALAR. + */ + return ktime_add_ns(ktime_set(secs, 0), nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + nsecs = timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set @@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; @@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv) update_xtime_cache(0); - clock->error = 0; + timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday); * * Accumulates current time interval and initializes new clocksource */ -static void change_clocksource(void) +static int change_clocksource(void *data) { struct clocksource *new, *old; - new = clocksource_get_next(); + new = (struct clocksource *) data; + + timekeeping_forward_now(); + if (!new->enable || new->enable(new) == 0) { + old = timekeeper.clock; + timekeeper_setup_internals(new); + if (old->disable) + old->disable(old); + } + return 0; +} - if (clock == new) +/** + * timekeeping_notify - Install a new clock source + * @clock: pointer to the clock source + * + * This function is called from clocksource.c after a new, better clock + * source has been registered. The caller holds the clocksource_mutex. + */ +void timekeeping_notify(struct clocksource *clock) +{ + if (timekeeper.clock == clock) return; + stop_machine(change_clocksource, clock, NULL); + tick_clock_notify(); +} - clocksource_forward_now(); +#else /* GENERIC_TIME */ - if (clocksource_enable(new)) - return; +static inline void timekeeping_forward_now(void) { } - new->raw_time = clock->raw_time; - old = clock; - clock = new; - clocksource_disable(old); +/** + * ktime_get - get the monotonic time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get(void) +{ + struct timespec now; - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + ktime_get_ts(&now); - tick_clock_notify(); + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get); - /* - * We're holding xtime lock and waking up klogd would deadlock - * us on enqueue. So no printing! - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - */ +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + tomono = wall_to_monotonic; + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec); } -#else -static inline void clocksource_forward_now(void) { } -static inline void change_clocksource(void) { } -#endif +EXPORT_SYMBOL_GPL(ktime_get_ts); + +#endif /* !GENERIC_TIME */ + +/** + * ktime_get_real - get the real (wall-) time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get_real(void) +{ + struct timespec now; + + getnstimeofday(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get_real); /** * getrawmonotonic - Returns the raw monotonic time in a timespec @@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts) { unsigned long seq; s64 nsecs; - cycle_t cycle_now, cycle_delta; do { seq = read_seqbegin(&xtime_lock); - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; - - *ts = clock->raw_time; + nsecs = timekeeping_get_ns_raw(); + *ts = raw_time; } while (read_seqretry(&xtime_lock, seq)); @@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void) do { seq = read_seqbegin(&xtime_lock); - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqretry(&xtime_lock, seq)); @@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void) } /** - * read_persistent_clock - Return time in seconds from the persistent clock. + * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -unsigned long __attribute__((weak)) read_persistent_clock(void) +void __attribute__((weak)) read_persistent_clock(struct timespec *ts) { - return 0; + ts->tv_sec = 0; + ts->tv_nsec = 0; +} + +/** + * read_boot_clock - Return time of the system start. + * + * Weak dummy function for arches that do not yet support it. + * Function to read the exact time the system has been started. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +void __attribute__((weak)) read_boot_clock(struct timespec *ts) +{ + ts->tv_sec = 0; + ts->tv_nsec = 0; } /* @@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void) */ void __init timekeeping_init(void) { + struct clocksource *clock; unsigned long flags; - unsigned long sec = read_persistent_clock(); + struct timespec now, boot; + + read_persistent_clock(&now); + read_boot_clock(&boot); write_seqlock_irqsave(&xtime_lock, flags); ntp_init(); - clock = clocksource_get_next(); - clocksource_enable(clock); - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); - - xtime.tv_sec = sec; - xtime.tv_nsec = 0; + clock = clocksource_default_clock(); + if (clock->enable) + clock->enable(clock); + timekeeper_setup_internals(clock); + + xtime.tv_sec = now.tv_sec; + xtime.tv_nsec = now.tv_nsec; + raw_time.tv_sec = 0; + raw_time.tv_nsec = 0; + if (boot.tv_sec == 0 && boot.tv_nsec == 0) { + boot.tv_sec = xtime.tv_sec; + boot.tv_nsec = xtime.tv_nsec; + } set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + -boot.tv_sec, -boot.tv_nsec); update_xtime_cache(0); - total_sleep_time = 0; + total_sleep_time.tv_sec = 0; + total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } /* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; +static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time; static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; - unsigned long now = read_persistent_clock(); + struct timespec ts; + + read_persistent_clock(&ts); clocksource_resume(); write_seqlock_irqsave(&xtime_lock, flags); - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - total_sleep_time += sleep_length; + if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { + ts = timespec_sub(ts, timekeeping_suspend_time); + xtime = timespec_add_safe(xtime, ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); + total_sleep_time = timespec_add_safe(total_sleep_time, ts); } update_xtime_cache(0); /* re-base the last cycle value */ - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); - clock->error = 0; + timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); + timekeeper.ntp_error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; - timekeeping_suspend_time = read_persistent_clock(); + read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; + tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); + tick_error -= timekeeper.xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. */ @@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void clocksource_adjust(s64 offset) +static void timekeeping_adjust(s64 offset) { - s64 error, interval = clock->cycle_interval; + s64 error, interval = timekeeper.cycle_interval; int adj; - error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); + error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); if (error > interval) { error >>= 2; if (likely(error <= interval)) adj = 1; else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else if (error < -interval) { error >>= 2; if (likely(error >= -interval)) { @@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset) interval = -interval; offset = -offset; } else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else return; - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (NTP_SCALE_SHIFT - clock->shift); + timekeeper.mult += adj; + timekeeper.xtime_interval += interval; + timekeeper.xtime_nsec -= offset; + timekeeper.ntp_error -= (interval - offset) << + timekeeper.ntp_error_shift; } /** @@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset) */ void update_wall_time(void) { + struct clocksource *clock; cycle_t offset; + u64 nsecs; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return; + clock = timekeeper.clock; #ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #else - offset = clock->cycle_interval; + offset = timekeeper.cycle_interval; #endif - clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. */ - while (offset >= clock->cycle_interval) { + while (offset >= timekeeper.cycle_interval) { + u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + /* accumulate one interval */ - offset -= clock->cycle_interval; - clock->cycle_last += clock->cycle_interval; + offset -= timekeeper.cycle_interval; + clock->cycle_last += timekeeper.cycle_interval; - clock->xtime_nsec += clock->xtime_interval; - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; + timekeeper.xtime_nsec += timekeeper.xtime_interval; + if (timekeeper.xtime_nsec >= nsecps) { + timekeeper.xtime_nsec -= nsecps; xtime.tv_sec++; second_overflow(); } - clock->raw_time.tv_nsec += clock->raw_interval; - if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { - clock->raw_time.tv_nsec -= NSEC_PER_SEC; - clock->raw_time.tv_sec++; + raw_time.tv_nsec += timekeeper.raw_interval; + if (raw_time.tv_nsec >= NSEC_PER_SEC) { + raw_time.tv_nsec -= NSEC_PER_SEC; + raw_time.tv_sec++; } /* accumulate error between NTP and clock interval */ - clock->error += tick_length; - clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); + timekeeper.ntp_error += tick_length; + timekeeper.ntp_error -= timekeeper.xtime_interval << + timekeeper.ntp_error_shift; } /* correct the clock when NTP error is too big */ - clocksource_adjust(offset); + timekeeping_adjust(offset); /* * Since in the loop above, we accumulate any amount of time * in xtime_nsec over a second into xtime.tv_sec, its possible for * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in clocksource_adjust(), + * slightly speeding the clocksource up in timekeeping_adjust(), * its possible the required corrective factor to xtime_nsec could * cause it to underflow. * @@ -550,24 +792,25 @@ void update_wall_time(void) * We'll correct this error next time through this function, when * xtime_nsec is not as small. */ - if (unlikely((s64)clock->xtime_nsec < 0)) { - s64 neg = -(s64)clock->xtime_nsec; - clock->xtime_nsec = 0; - clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); + if (unlikely((s64)timekeeper.xtime_nsec < 0)) { + s64 neg = -(s64)timekeeper.xtime_nsec; + timekeeper.xtime_nsec = 0; + timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; } /* store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ - xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); + xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; + timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; + timekeeper.ntp_error += timekeeper.xtime_nsec << + timekeeper.ntp_error_shift; - update_xtime_cache(cyc2ns(clock, offset)); + nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); + update_xtime_cache(nsecs); /* check to see if there is a new clocksource to use */ - change_clocksource(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); } /** @@ -583,9 +826,12 @@ void update_wall_time(void) */ void getboottime(struct timespec *ts) { - set_normalized_timespec(ts, - - (wall_to_monotonic.tv_sec + total_sleep_time), - - wall_to_monotonic.tv_nsec); + struct timespec boottime = { + .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, + .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec + }; + + set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } /** @@ -594,7 +840,7 @@ void getboottime(struct timespec *ts) */ void monotonic_to_bootbased(struct timespec *ts) { - ts->tv_sec += total_sleep_time; + *ts = timespec_add_safe(*ts, total_sleep_time); } unsigned long get_seconds(void) @@ -603,6 +849,10 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); +struct timespec __current_kernel_time(void) +{ + return xtime_cache; +} struct timespec current_kernel_time(void) { @@ -618,3 +868,20 @@ struct timespec current_kernel_time(void) return now; } EXPORT_SYMBOL(current_kernel_time); + +struct timespec get_monotonic_coarse(void) +{ + struct timespec now, mono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime_cache; + mono = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); + return now; +} diff --git a/kernel/timer.c b/kernel/timer.c index a3d25f415..5db5a8d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/sched.h> #include <asm/uaccess.h> @@ -46,6 +46,9 @@ #include <asm/timex.h> #include <asm/io.h> +#define CREATE_TRACE_POINTS +#include <trace/events/timer.h> + u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -72,6 +75,7 @@ struct tvec_base { spinlock_t lock; struct timer_list *running_timer; unsigned long timer_jiffies; + unsigned long next_timer; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -520,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } #endif +static inline void debug_init(struct timer_list *timer) +{ + debug_timer_init(timer); + trace_timer_init(timer); +} + +static inline void +debug_activate(struct timer_list *timer, unsigned long expires) +{ + debug_timer_activate(timer); + trace_timer_start(timer, expires); +} + +static inline void debug_deactivate(struct timer_list *timer) +{ + debug_timer_deactivate(timer); + trace_timer_cancel(timer); +} + static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key) @@ -548,7 +571,7 @@ void init_timer_key(struct timer_list *timer, const char *name, struct lock_class_key *key) { - debug_timer_init(timer); + debug_init(timer); __init_timer(timer, name, key); } EXPORT_SYMBOL(init_timer_key); @@ -567,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer, { struct list_head *entry = &timer->entry; - debug_timer_deactivate(timer); + debug_deactivate(timer); __list_del(entry->prev, entry->next); if (clear_pending) @@ -622,13 +645,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, if (timer_pending(timer)) { detach_timer(timer, 0); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } else { if (pending_only) goto out_unlock; } - debug_timer_activate(timer); + debug_activate(timer, expires); new_base = __get_cpu_var(tvec_bases); @@ -663,6 +689,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -780,7 +809,10 @@ void add_timer_on(struct timer_list *timer, int cpu) BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); - debug_timer_activate(timer); + debug_activate(timer, timer->expires); + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -817,6 +849,9 @@ int del_timer(struct timer_list *timer) base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } spin_unlock_irqrestore(&base->lock, flags); @@ -850,6 +885,9 @@ int try_to_del_timer_sync(struct timer_list *timer) ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } out: @@ -984,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base) */ lock_map_acquire(&lockdep_map); + trace_timer_expire_entry(timer); fn(data); + trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1007,8 +1047,8 @@ static inline void __run_timers(struct tvec_base *base) #ifdef CONFIG_NO_HZ /* * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a cpus is idle. - * This functions needs to be called disabled. + * is used on S/390 to stop all activity when a CPU is idle. + * This function needs to be called with interrupts disabled. */ static unsigned long __next_timer_interrupt(struct tvec_base *base) { @@ -1134,7 +1174,9 @@ unsigned long get_next_timer_interrupt(unsigned long now) unsigned long expires; spin_lock(&base->lock); - expires = __next_timer_interrupt(base); + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; spin_unlock(&base->lock); if (time_before_eq(expires, now)) @@ -1169,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_counter_do_pending(); + perf_event_do_pending(); hrtimer_run_pending(); @@ -1522,6 +1564,7 @@ static int __cpuinit init_timers_cpu(int cpu) INIT_LIST_HEAD(base->tv1.vec + j); base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; return 0; } @@ -1534,6 +1577,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); timer_set_base(timer, new_base); + if (time_before(timer->expires, new_base->next_timer) && + !tbase_get_deferrable(timer->base)) + new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1ea0d12..b416512 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -11,12 +11,18 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_TRACER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_GRAPH_TRACER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_GRAPH_FP_TEST bool @@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help - This gets selected when the arch tests the function_trace_stop - variable at the mcount call site. Otherwise, this variable - is tested by the called function. + See Documentation/trace/ftrace-implementation.txt config HAVE_DYNAMIC_FTRACE bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FTRACE_MCOUNT_RECORD bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_HW_BRANCH_TRACER bool config HAVE_SYSCALL_TRACEPOINTS bool + help + See Documentation/trace/ftrace-implementation.txt config TRACER_MAX_TRACE bool @@ -73,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options options. +# hidding of the automatic options. config TRACING bool @@ -469,6 +479,18 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on FTRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. + + TBD - enable a way to actually call the syscalls as we test their + events + config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 844164d..26f03ac 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o -obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o @@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8c804e2..a142579 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1323,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) enum { FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_CONT = (1 << 1), - FTRACE_ITER_NOTRACE = (1 << 2), - FTRACE_ITER_FAILURES = (1 << 3), - FTRACE_ITER_PRINTALL = (1 << 4), - FTRACE_ITER_HASH = (1 << 5), + FTRACE_ITER_NOTRACE = (1 << 1), + FTRACE_ITER_FAILURES = (1 << 2), + FTRACE_ITER_PRINTALL = (1 << 3), + FTRACE_ITER_HASH = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1337,8 +1336,7 @@ struct ftrace_iterator { int hidx; int idx; unsigned flags; - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned buffer_idx; + struct trace_parser parser; }; static void * @@ -1407,7 +1405,7 @@ static int t_hash_show(struct seq_file *m, void *v) if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); - seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); + seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); if (rec->data) seq_printf(m, ":%p", rec->data); @@ -1517,12 +1515,12 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%pf\n", (void *)rec->ip); + seq_printf(m, "%ps\n", (void *)rec->ip); return 0; } -static struct seq_operations show_ftrace_seq_ops = { +static const struct seq_operations show_ftrace_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -1604,6 +1602,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!iter) return -ENOMEM; + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { + kfree(iter); + return -ENOMEM; + } + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -2059,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int i, len = 0; char *search; - if (glob && (strcmp(glob, "*") || !strlen(glob))) + if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) glob = NULL; - else { + else if (glob) { int not; type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); @@ -2196,9 +2199,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos, int enable) { struct ftrace_iterator *iter; - char ch; - size_t read = 0; - ssize_t ret; + struct trace_parser *parser; + ssize_t ret, read; if (!cnt || cnt < 0) return 0; @@ -2211,72 +2213,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, } else iter = file->private_data; - if (!*ppos) { - iter->flags &= ~FTRACE_ITER_CONT; - iter->buffer_idx = 0; - } + parser = &iter->parser; + read = trace_get_user(parser, ubuf, cnt, ppos); - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - - /* - * If the parser haven't finished with the last write, - * continue reading the user input without skipping spaces. - */ - if (!(iter->flags & FTRACE_ITER_CONT)) { - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - } - - /* only spaces were written */ - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } - - iter->buffer_idx = 0; - } - - while (cnt && !isspace(ch)) { - if (iter->buffer_idx < FTRACE_BUFF_MAX) - iter->buffer[iter->buffer_idx++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); + if (trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { + ret = ftrace_process_regex(parser->buffer, + parser->idx, enable); if (ret) goto out; - read++; - cnt--; - } - if (isspace(ch)) { - iter->buffer[iter->buffer_idx] = 0; - ret = ftrace_process_regex(iter->buffer, - iter->buffer_idx, enable); - if (ret) - goto out; - iter->buffer_idx = 0; - } else { - iter->flags |= FTRACE_ITER_CONT; - iter->buffer[iter->buffer_idx++] = ch; + trace_parser_clear(parser); } - *ppos += read; ret = read; - out: - mutex_unlock(&ftrace_regex_lock); + mutex_unlock(&ftrace_regex_lock); +out: return ret; } @@ -2381,6 +2334,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; + struct trace_parser *parser; mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { @@ -2390,9 +2344,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) } else iter = file->private_data; - if (iter->buffer_idx) { - iter->buffer[iter->buffer_idx] = 0; - ftrace_match_records(iter->buffer, iter->buffer_idx, enable); + parser = &iter->parser; + if (trace_parser_loaded(parser)) { + parser->buffer[parser->idx] = 0; + ftrace_match_records(parser->buffer, parser->idx, enable); } mutex_lock(&ftrace_lock); @@ -2400,7 +2355,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_lock); + trace_parser_put(parser); kfree(iter); + mutex_unlock(&ftrace_regex_lock); return 0; } @@ -2457,11 +2414,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * __g_next(struct seq_file *m, loff_t *pos) { - unsigned long *array = m->private; - if (*pos >= ftrace_graph_count) return NULL; - return &array[*pos]; + return &ftrace_graph_funcs[*pos]; } static void * @@ -2499,12 +2454,12 @@ static int g_show(struct seq_file *m, void *v) return 0; } - seq_printf(m, "%pf\n", v); + seq_printf(m, "%ps\n", (void *)*ptr); return 0; } -static struct seq_operations ftrace_graph_seq_ops = { +static const struct seq_operations ftrace_graph_seq_ops = { .start = g_start, .next = g_next, .stop = g_stop, @@ -2525,16 +2480,10 @@ ftrace_graph_open(struct inode *inode, struct file *file) ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } + mutex_unlock(&graph_lock); - if (file->f_mode & FMODE_READ) { + if (file->f_mode & FMODE_READ) ret = seq_open(file, &ftrace_graph_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = ftrace_graph_funcs; - } - } else - file->private_data = ftrace_graph_funcs; - mutex_unlock(&graph_lock); return ret; } @@ -2602,12 +2551,9 @@ static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned long *array; + struct trace_parser parser; size_t read = 0; ssize_t ret; - int index = 0; - char ch; if (!cnt || cnt < 0) return 0; @@ -2619,57 +2565,26 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, goto out; } - if (file->f_mode & FMODE_READ) { - struct seq_file *m = file->private_data; - array = m->private; - } else - array = file->private_data; - - ret = get_user(ch, ubuf++); - if (ret) + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { + ret = -ENOMEM; goto out; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; } - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } + read = trace_get_user(&parser, ubuf, cnt, ppos); - while (cnt && !isspace(ch)) { - if (index < FTRACE_BUFF_MAX) - buffer[index++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); + if (trace_parser_loaded((&parser))) { + parser.buffer[parser.idx] = 0; + + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + parser.buffer); if (ret) goto out; - read++; - cnt--; } - buffer[index] = 0; - - /* we allow only one expression at a time */ - ret = ftrace_set_func(array, &ftrace_graph_count, buffer); - if (ret) - goto out; - - file->f_pos += read; ret = read; out: + trace_parser_put(&parser); mutex_unlock(&graph_lock); return ret; @@ -3100,7 +3015,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3110,7 +3025,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c new file mode 100644 index 0000000..e06c6e3 --- /dev/null +++ b/kernel/trace/power-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/slab.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/power.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); + diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 454e74e..d4ff019 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -201,8 +201,6 @@ int tracing_is_on(void) } EXPORT_SYMBOL_GPL(tracing_is_on); -#include "trace.h" - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) @@ -701,8 +699,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, val &= ~RB_FLAG_MASK; - ret = (unsigned long)cmpxchg(&list->next, - val | old_flag, val | new_flag); + ret = cmpxchg((unsigned long *)&list->next, + val | old_flag, val | new_flag); /* check if the reader took the page */ if ((ret & ~RB_FLAG_MASK) != val) @@ -794,7 +792,7 @@ static int rb_head_page_replace(struct buffer_page *old, val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; - ret = cmpxchg(ptr, val, &new->list); + ret = cmpxchg(ptr, val, (unsigned long)&new->list); return ret == val; } @@ -2997,15 +2995,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) } static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; struct buffer_page *reader; int nr_loops = 0; - cpu_buffer = buffer->buffers[cpu]; - again: /* * We repeat when a timestamp is encountered. It is possible @@ -3049,7 +3044,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(buffer, + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } return event; @@ -3168,7 +3163,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) local_irq_save(flags); if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + event = rb_buffer_peek(cpu_buffer, ts); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) @@ -3237,7 +3232,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + event = rb_buffer_peek(cpu_buffer, ts); if (event) rb_advance_reader(cpu_buffer); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c75dee..411af37 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -125,13 +125,13 @@ int ftrace_dump_on_oops; static int tracing_set_tracer(const char *buf); -#define BOOTUP_TRACER_SIZE 100 -static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; +#define MAX_TRACER_SIZE 100 +static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; static int __init set_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); + strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = 1; @@ -242,13 +242,6 @@ static struct tracer *trace_types __read_mostly; static struct tracer *current_trace __read_mostly; /* - * max_tracer_type_len is used to simplify the allocating of - * buffers to read userspace tracer names. We keep track of - * the longest tracer name registered. - */ -static int max_tracer_type_len; - -/* * trace_types_lock is used to protect the trace_types list. * This lock is also used to keep user access serialized. * Accesses from userspace will grab this lock while userspace @@ -275,12 +268,18 @@ static DEFINE_SPINLOCK(tracing_start_lock); */ void trace_wake_up(void) { + int cpu; + + if (trace_flags & TRACE_ITER_BLOCK) + return; /* * The runqueue_is_locked() can fail, but this is the best we * have for now: */ - if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) + cpu = get_cpu(); + if (!runqueue_is_locked(cpu)) wake_up(&trace_wait); + put_cpu(); } static int __init set_buf_size(char *str) @@ -339,6 +338,112 @@ static struct { int trace_clock_id; +/* + * trace_parser_get_init - gets the buffer for trace parser + */ +int trace_parser_get_init(struct trace_parser *parser, int size) +{ + memset(parser, 0, sizeof(*parser)); + + parser->buffer = kmalloc(size, GFP_KERNEL); + if (!parser->buffer) + return 1; + + parser->size = size; + return 0; +} + +/* + * trace_parser_put - frees the buffer for trace parser + */ +void trace_parser_put(struct trace_parser *parser) +{ + kfree(parser->buffer); +} + +/* + * trace_get_user - reads the user input string separated by space + * (matched by isspace(ch)) + * + * For each string found the 'struct trace_parser' is updated, + * and the function returns. + * + * Returns number of bytes read. + * + * See kernel/trace/trace.h for 'struct trace_parser' details. + */ +int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char ch; + size_t read = 0; + ssize_t ret; + + if (!*ppos) + trace_parser_clear(parser); + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + + read++; + cnt--; + + /* + * The parser is not finished with the last write, + * continue reading the user input without skipping spaces. + */ + if (!parser->cont) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* only spaces were written */ + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + parser->idx = 0; + } + + /* read the non-space input */ + while (cnt && !isspace(ch)) { + if (parser->idx < parser->size) + parser->buffer[parser->idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* We either got finished input or we have to wait for another call. */ + if (isspace(ch)) { + parser->buffer[parser->idx] = 0; + parser->cont = false; + } else { + parser->cont = true; + parser->buffer[parser->idx++] = ch; + } + + *ppos += read; + ret = read; + +out: + return ret; +} + ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) { int len; @@ -513,7 +618,6 @@ __releases(kernel_lock) __acquires(kernel_lock) { struct tracer *t; - int len; int ret = 0; if (!type->name) { @@ -521,6 +625,11 @@ __acquires(kernel_lock) return -1; } + if (strlen(type->name) > MAX_TRACER_SIZE) { + pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); + return -1; + } + /* * When this gets called we hold the BKL which means that * preemption is disabled. Various trace selftests however @@ -535,7 +644,7 @@ __acquires(kernel_lock) for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ - pr_info("Trace %s already registered\n", + pr_info("Tracer %s already registered\n", type->name); ret = -1; goto out; @@ -586,9 +695,6 @@ __acquires(kernel_lock) type->next = trace_types; trace_types = type; - len = strlen(type->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; out: tracing_selftest_running = false; @@ -597,7 +703,7 @@ __acquires(kernel_lock) if (ret || !default_bootup_tracer) goto out_unlock; - if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) + if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) goto out_unlock; printk(KERN_INFO "Starting tracer '%s'\n", type->name); @@ -619,14 +725,13 @@ __acquires(kernel_lock) void unregister_tracer(struct tracer *type) { struct tracer **t; - int len; mutex_lock(&trace_types_lock); for (t = &trace_types; *t; t = &(*t)->next) { if (*t == type) goto found; } - pr_info("Trace %s not registered\n", type->name); + pr_info("Tracer %s not registered\n", type->name); goto out; found: @@ -639,17 +744,7 @@ void unregister_tracer(struct tracer *type) current_trace->stop(&global_trace); current_trace = &nop_trace; } - - if (strlen(type->name) != max_tracer_type_len) - goto out; - - max_tracer_type_len = 0; - for (t = &trace_types; *t; t = &(*t)->next) { - len = strlen((*t)->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; - } - out: +out: mutex_unlock(&trace_types_lock); } @@ -719,6 +814,11 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } +int is_tracing_stopped(void) +{ + return trace_stop_count; +} + /** * ftrace_off_permanent - disable all ftrace code permanently * @@ -886,7 +986,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->tgid = (tsk) ? tsk->tgid : 0; + entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1068,6 +1168,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) return; entry = ring_buffer_event_data(event); + entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); trace.nr_entries = 0; @@ -1094,6 +1195,7 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { + struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct ring_buffer *buffer = tr->buffer; @@ -1107,7 +1209,9 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void @@ -1530,10 +1634,10 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / \n"); - seq_puts(m, "# ||||| delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| /_--=> lock-depth \n"); + seq_puts(m, "# |||||/ delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / |||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) @@ -1845,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations tracer_seq_ops = { +static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, .stop = s_stop, @@ -1880,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file) if (current_trace) *iter->trace = *current_trace; - if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - cpumask_clear(iter->started); - if (current_trace && current_trace->print_max) iter->tr = &max_tr; else @@ -2059,7 +2161,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations show_traces_seq_ops = { +static const struct seq_operations show_traces_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -2489,7 +2591,7 @@ static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[max_tracer_type_len+2]; + char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); @@ -2639,15 +2741,15 @@ static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[max_tracer_type_len+1]; + char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; int err; ret = cnt; - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; + if (cnt > MAX_TRACER_SIZE) + cnt = MAX_TRACER_SIZE; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -4285,7 +4387,7 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) goto out_free_tracing_cpumask; /* To save memory, keep the ring buffer size to its minimum */ @@ -4296,7 +4398,6 @@ __init static int tracer_alloc_buffers(void) cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); - cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(ring_buf_size, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fa1dccb..405cb85 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -7,10 +7,10 @@ #include <linux/clocksource.h> #include <linux/ring_buffer.h> #include <linux/mmiotrace.h> +#include <linux/tracepoint.h> #include <linux/ftrace.h> #include <trace/boot.h> #include <linux/kmemtrace.h> -#include <trace/power.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -36,163 +36,59 @@ enum trace_type { TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, - TRACE_POWER, TRACE_BLK, __TRACE_LAST_TYPE, }; -/* - * Function trace entry - function address and parent function addres: - */ -struct ftrace_entry { - struct trace_entry ent; - unsigned long ip; - unsigned long parent_ip; -}; - -/* Function call entry */ -struct ftrace_graph_ent_entry { - struct trace_entry ent; - struct ftrace_graph_ent graph_ent; +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ }; -/* Function return entry */ -struct ftrace_graph_ret_entry { - struct trace_entry ent; - struct ftrace_graph_ret ret; -}; extern struct tracer boot_tracer; -/* - * Context switch trace entry - which task (and prio) we switched from/to: - */ -struct ctx_switch_entry { - struct trace_entry ent; - unsigned int prev_pid; - unsigned char prev_prio; - unsigned char prev_state; - unsigned int next_pid; - unsigned char next_prio; - unsigned char next_state; - unsigned int next_cpu; -}; - -/* - * Special (free-form) trace entry: - */ -struct special_entry { - struct trace_entry ent; - unsigned long arg1; - unsigned long arg2; - unsigned long arg3; -}; - -/* - * Stack-trace entry: - */ - -#define FTRACE_STACK_ENTRIES 8 - -struct stack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; - -struct userstack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; - -/* - * trace_printk entry: - */ -struct bprint_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; -}; +#undef __field +#define __field(type, item) type item; -struct print_entry { - struct trace_entry ent; - unsigned long ip; - char buf[]; -}; +#undef __field_struct +#define __field_struct(type, item) __field(type, item) -#define TRACE_OLD_SIZE 88 +#undef __field_desc +#define __field_desc(type, container, item) -struct trace_field_cont { - unsigned char type; - /* Temporary till we get rid of this completely */ - char buf[TRACE_OLD_SIZE - 1]; -}; +#undef __array +#define __array(type, item, size) type item[size]; -struct trace_mmiotrace_rw { - struct trace_entry ent; - struct mmiotrace_rw rw; -}; +#undef __array_desc +#define __array_desc(type, container, item, size) -struct trace_mmiotrace_map { - struct trace_entry ent; - struct mmiotrace_map map; -}; +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; -struct trace_boot_call { - struct trace_entry ent; - struct boot_trace_call boot_call; -}; +#undef F_STRUCT +#define F_STRUCT(args...) args -struct trace_boot_ret { - struct trace_entry ent; - struct boot_trace_ret boot_ret; -}; - -#define TRACE_FUNC_SIZE 30 -#define TRACE_FILE_SIZE 20 -struct trace_branch { - struct trace_entry ent; - unsigned line; - char func[TRACE_FUNC_SIZE+1]; - char file[TRACE_FILE_SIZE+1]; - char correct; -}; - -struct hw_branch_entry { - struct trace_entry ent; - u64 from; - u64 to; -}; - -struct trace_power { - struct trace_entry ent; - struct power_trace state_data; -}; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ + } -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; +#undef TP_ARGS +#define TP_ARGS(args...) args -struct kmemtrace_alloc_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; - size_t bytes_req; - size_t bytes_alloc; - gfp_t gfp_flags; - int node; -}; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) -struct kmemtrace_free_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; -}; +#include "trace_entries.h" +/* + * syscalls are special, and need special handling, this is why + * they are not included in trace_entries.h + */ struct syscall_trace_enter { struct trace_entry ent; int nr; @@ -205,13 +101,12 @@ struct syscall_trace_exit { unsigned long ret; }; - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: * IRQS_OFF - interrupts were disabled * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags - * NEED_RESCED - reschedule is requested + * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler */ @@ -310,7 +205,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ - IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ @@ -390,7 +284,6 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; - struct tracer_stat *stats; }; @@ -469,6 +362,7 @@ void tracing_stop_sched_switch_record(void); void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); +int is_tracing_stopped(void); extern unsigned long nsecs_to_usecs(unsigned long nsecs); @@ -509,20 +403,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -typedef void -(*tracer_switch_func_t)(void *private, - void *__rq, - struct task_struct *prev, - struct task_struct *next); - -struct tracer_switch_ops { - tracer_switch_func_t func; - void *private; - struct tracer_switch_ops *next; -}; -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE @@ -638,6 +518,41 @@ static inline int ftrace_trace_task(struct task_struct *task) #endif /* + * struct trace_parser - servers for reading the user input separated by spaces + * @cont: set if the input is not complete - no final space char was found + * @buffer: holds the parsed user input + * @idx: user input lenght + * @size: buffer size + */ +struct trace_parser { + bool cont; + char *buffer; + unsigned idx; + unsigned size; +}; + +static inline bool trace_parser_loaded(struct trace_parser *parser) +{ + return (parser->idx != 0); +} + +static inline bool trace_parser_cont(struct trace_parser *parser) +{ + return parser->cont; +} + +static inline void trace_parser_clear(struct trace_parser *parser) +{ + parser->cont = false; + parser->idx = 0; +} + +extern int trace_parser_get_init(struct trace_parser *parser, int size); +extern void trace_parser_put(struct trace_parser *parser); +extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos); + +/* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. * @@ -823,58 +738,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - default: \ - break; \ - } \ - \ - return match; \ -} - -#define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - u##size *addr = (u##size *)(event + pred->offset); \ - u##size val = (u##size)pred->val; \ - int match; \ - \ - match = (val == *addr) ^ pred->not; \ - \ - return match; \ -} - extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct ftrace_event_call event_##call; -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) -#include "trace_event_types.h" +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#include "trace_entries.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 19bfc75..c21d5f3 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly = void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { + struct ftrace_event_call *call = &event_boot_call; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_boot_call *entry; @@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) goto out; entry = ring_buffer_event_data(event); entry->boot_call = *bt; - trace_buffer_unlock_commit(buffer, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { + struct ftrace_event_call *call = &event_boot_ret; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_boot_ret *entry; @@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) goto out; entry = ring_buffer_event_data(event); entry->boot_ret = *bt; - trace_buffer_unlock_commit(buffer, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index b588fd8..20c5f92 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -66,10 +66,14 @@ u64 notrace trace_clock(void) * Used by plugins that need globally coherent timestamps. */ -static u64 prev_trace_clock_time; - -static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +/* keep prev_time and lock in the same cacheline. */ +static struct { + u64 prev_time; + raw_spinlock_t lock; +} trace_clock_struct ____cacheline_aligned_in_smp = + { + .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, + }; u64 notrace trace_clock_global(void) { @@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void) if (unlikely(in_nmi())) goto out; - __raw_spin_lock(&trace_clock_lock); + __raw_spin_lock(&trace_clock_struct.lock); /* * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_trace_clock_time+1, to make sure + * my_scd->clock to prev_time+1, to make sure * we start ticking with the local clock from now on? */ - if ((s64)(now - prev_trace_clock_time) < 0) - now = prev_trace_clock_time + 1; + if ((s64)(now - trace_clock_struct.prev_time) < 0) + now = trace_clock_struct.prev_time + 1; - prev_trace_clock_time = now; + trace_clock_struct.prev_time = now; - __raw_spin_unlock(&trace_clock_lock); + __raw_spin_unlock(&trace_clock_struct.lock); out: raw_local_irq_restore(flags); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h new file mode 100644 index 0000000..ead3d72 --- /dev/null +++ b/kernel/trace/trace_entries.h @@ -0,0 +1,366 @@ +/* + * This file defines the trace event structures that go into the ring + * buffer directly. They are created via macros so that changes for them + * appear in the format file. Using macros will automate this process. + * + * The macro used to create a ftrace data structure is: + * + * FTRACE_ENTRY( name, struct_name, id, structure, print ) + * + * @name: the name used the event name, as well as the name of + * the directory that holds the format file. + * + * @struct_name: the name of the structure that is created. + * + * @id: The event identifier that is used to detect what event + * this is from the ring buffer. + * + * @structure: the structure layout + * + * - __field( type, item ) + * This is equivalent to declaring + * type item; + * in the structure. + * - __array( type, item, size ) + * This is equivalent to declaring + * type item[size]; + * in the structure. + * + * * for structures within structures, the format of the internal + * structure is layed out. This allows the internal structure + * to be deciphered for the format file. Although these macros + * may become out of sync with the internal structure, they + * will create a compile error if it happens. Since the + * internel structures are just tracing helpers, this is not + * an issue. + * + * When an internal structure is used, it should use: + * + * __field_struct( type, item ) + * + * instead of __field. This will prevent it from being shown in + * the output file. The fields in the structure should use. + * + * __field_desc( type, container, item ) + * __array_desc( type, container, item, len ) + * + * type, item and len are the same as __field and __array, but + * container is added. This is the name of the item in + * __field_struct that this is describing. + * + * + * @print: the print format shown to users in the format file. + */ + +/* + * Function trace entry - function address and parent function addres: + */ +FTRACE_ENTRY(function, ftrace_entry, + + TRACE_FN, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + ), + + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) +); + +/* Function call entry */ +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, + + TRACE_GRAPH_ENT, + + F_STRUCT( + __field_struct( struct ftrace_graph_ent, graph_ent ) + __field_desc( unsigned long, graph_ent, func ) + __field_desc( int, graph_ent, depth ) + ), + + F_printk("--> %lx (%d)", __entry->func, __entry->depth) +); + +/* Function return entry */ +FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long long, ret, calltime) + __field_desc( unsigned long long, ret, rettime ) + __field_desc( unsigned long, ret, overrun ) + __field_desc( int, ret, depth ) + ), + + F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", + __entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth) +); + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + * + * This is used for both wakeup and context switches. We only want + * to create one structure, but we need two outputs for it. + */ +#define FTRACE_CTX_FIELDS \ + __field( unsigned int, prev_pid ) \ + __field( unsigned char, prev_prio ) \ + __field( unsigned char, prev_state ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned char, next_prio ) \ + __field( unsigned char, next_state ) \ + __field( unsigned int, next_cpu ) + +FTRACE_ENTRY(context_switch, ctx_switch_entry, + + TRACE_CTX, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * FTRACE_ENTRY_DUP only creates the format file, it will not + * create another structure. + */ +FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, + + TRACE_WAKE, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * Special (free-form) trace entry: + */ +FTRACE_ENTRY(special, special_entry, + + TRACE_SPECIAL, + + F_STRUCT( + __field( unsigned long, arg1 ) + __field( unsigned long, arg2 ) + __field( unsigned long, arg3 ) + ), + + F_printk("(%08lx) (%08lx) (%08lx)", + __entry->arg1, __entry->arg2, __entry->arg3) +); + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +FTRACE_ENTRY(kernel_stack, stack_entry, + + TRACE_STACK, + + F_STRUCT( + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +FTRACE_ENTRY(user_stack, userstack_entry, + + TRACE_USER_STACK, + + F_STRUCT( + __field( unsigned int, tgid ) + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +/* + * trace_printk entry: + */ +FTRACE_ENTRY(bprint, bprint_entry, + + TRACE_BPRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, fmt ) + __dynamic_array( u32, buf ) + ), + + F_printk("%08lx fmt:%p", + __entry->ip, __entry->fmt) +); + +FTRACE_ENTRY(print, print_entry, + + TRACE_PRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __dynamic_array( char, buf ) + ), + + F_printk("%08lx %s", + __entry->ip, __entry->buf) +); + +FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, + + TRACE_MMIO_RW, + + F_STRUCT( + __field_struct( struct mmiotrace_rw, rw ) + __field_desc( resource_size_t, rw, phys ) + __field_desc( unsigned long, rw, value ) + __field_desc( unsigned long, rw, pc ) + __field_desc( int, rw, map_id ) + __field_desc( unsigned char, rw, opcode ) + __field_desc( unsigned char, rw, width ) + ), + + F_printk("%lx %lx %lx %d %x %x", + (unsigned long)__entry->phys, __entry->value, __entry->pc, + __entry->map_id, __entry->opcode, __entry->width) +); + +FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, + + TRACE_MMIO_MAP, + + F_STRUCT( + __field_struct( struct mmiotrace_map, map ) + __field_desc( resource_size_t, map, phys ) + __field_desc( unsigned long, map, virt ) + __field_desc( unsigned long, map, len ) + __field_desc( int, map, map_id ) + __field_desc( unsigned char, map, opcode ) + ), + + F_printk("%lx %lx %lx %d %x", + (unsigned long)__entry->phys, __entry->virt, __entry->len, + __entry->map_id, __entry->opcode) +); + +FTRACE_ENTRY(boot_call, trace_boot_call, + + TRACE_BOOT_CALL, + + F_STRUCT( + __field_struct( struct boot_trace_call, boot_call ) + __field_desc( pid_t, boot_call, caller ) + __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) + ), + + F_printk("%d %s", __entry->caller, __entry->func) +); + +FTRACE_ENTRY(boot_ret, trace_boot_ret, + + TRACE_BOOT_RET, + + F_STRUCT( + __field_struct( struct boot_trace_ret, boot_ret ) + __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) + __field_desc( int, boot_ret, result ) + __field_desc( unsigned long, boot_ret, duration ) + ), + + F_printk("%s %d %lx", + __entry->func, __entry->result, __entry->duration) +); + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 + +FTRACE_ENTRY(branch, trace_branch, + + TRACE_BRANCH, + + F_STRUCT( + __field( unsigned int, line ) + __array( char, func, TRACE_FUNC_SIZE+1 ) + __array( char, file, TRACE_FILE_SIZE+1 ) + __field( char, correct ) + ), + + F_printk("%u:%s:%s (%u)", + __entry->line, + __entry->func, __entry->file, __entry->correct) +); + +FTRACE_ENTRY(hw_branch, hw_branch_entry, + + TRACE_HW_BRANCHES, + + F_STRUCT( + __field( u64, from ) + __field( u64, to ) + ), + + F_printk("from: %llx to: %llx", __entry->from, __entry->to) +); + +FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, + + TRACE_KMEM_ALLOC, + + F_STRUCT( + __field( enum kmemtrace_type_id, type_id ) + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" + " flags:%x node:%d", + __entry->type_id, __entry->call_site, __entry->ptr, + __entry->bytes_req, __entry->bytes_alloc, + __entry->gfp_flags, __entry->node) +); + +FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, + + TRACE_KMEM_FREE, + + F_STRUCT( + __field( enum kmemtrace_type_id, type_id ) + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + F_printk("type:%u call_site:%lx ptr:%p", + __entry->type_id, __entry->call_site, __entry->ptr) +); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 11ba5bb..dd44b87 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -5,8 +5,60 @@ * */ +#include <linux/module.h> #include "trace.h" +/* + * We can't use a size but a type in alloc_percpu() + * So let's create a dummy type that matches the desired size + */ +typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; + +char *trace_profile_buf; +EXPORT_SYMBOL_GPL(trace_profile_buf); + +char *trace_profile_buf_nmi; +EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); + +/* Count the events in use (per event id, not per instance) */ +static int total_profile_count; + +static int ftrace_profile_enable_event(struct ftrace_event_call *event) +{ + char *buf; + int ret = -ENOMEM; + + if (atomic_inc_return(&event->profile_count)) + return 0; + + if (!total_profile_count++) { + buf = (char *)alloc_percpu(profile_buf_t); + if (!buf) + goto fail_buf; + + rcu_assign_pointer(trace_profile_buf, buf); + + buf = (char *)alloc_percpu(profile_buf_t); + if (!buf) + goto fail_buf_nmi; + + rcu_assign_pointer(trace_profile_buf_nmi, buf); + } + + ret = event->profile_enable(); + if (!ret) + return 0; + + kfree(trace_profile_buf_nmi); +fail_buf_nmi: + kfree(trace_profile_buf); +fail_buf: + total_profile_count--; + atomic_dec(&event->profile_count); + + return ret; +} + int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; @@ -14,8 +66,9 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->profile_enable) { - ret = event->profile_enable(event); + if (event->id == event_id && event->profile_enable && + try_module_get(event->mod)) { + ret = ftrace_profile_enable_event(event); break; } } @@ -24,6 +77,33 @@ int ftrace_profile_enable(int event_id) return ret; } +static void ftrace_profile_disable_event(struct ftrace_event_call *event) +{ + char *buf, *nmi_buf; + + if (!atomic_add_negative(-1, &event->profile_count)) + return; + + event->profile_disable(); + + if (!--total_profile_count) { + buf = trace_profile_buf; + rcu_assign_pointer(trace_profile_buf, NULL); + + nmi_buf = trace_profile_buf_nmi; + rcu_assign_pointer(trace_profile_buf_nmi, NULL); + + /* + * Ensure every events in profiling have finished before + * releasing the buffers + */ + synchronize_sched(); + + free_percpu(buf); + free_percpu(nmi_buf); + } +} + void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; @@ -31,7 +111,8 @@ void ftrace_profile_disable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) { - event->profile_disable(event); + ftrace_profile_disable_event(event); + module_put(event->mod); break; } } diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h deleted file mode 100644 index 6db005e..0000000 --- a/kernel/trace/trace_event_types.h +++ /dev/null @@ -1,178 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM ftrace - -/* - * We cheat and use the proto type field as the ID - * and args as the entry type (minus 'struct') - */ -TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned long, parent_ip, parent_ip) - ), - TP_RAW_FMT(" %lx <-- %lx") -); - -TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, - ftrace_graph_ent_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, graph_ent.func, func) - TRACE_FIELD(int, graph_ent.depth, depth) - ), - TP_RAW_FMT("--> %lx (%d)") -); - -TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, - ftrace_graph_ret_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ret.func, func) - TRACE_FIELD(unsigned long long, ret.calltime, calltime) - TRACE_FIELD(unsigned long long, ret.rettime, rettime) - TRACE_FIELD(unsigned long, ret.overrun, overrun) - TRACE_FIELD(int, ret.depth, depth) - ), - TP_RAW_FMT("<-- %lx (%d)") -); - -TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, arg1, arg1) - TRACE_FIELD(unsigned long, arg2, arg2) - TRACE_FIELD(unsigned long, arg3, arg3) - ), - TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") -); - -/* - * Stack-trace entry: - */ - -/* #define FTRACE_STACK_ENTRIES 8 */ - -TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(char *, fmt, fmt) - TRACE_FIELD_ZERO_CHAR(buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD_ZERO_CHAR(buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, - TRACE_FUNC_SIZE+1, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, - TRACE_FUNC_SIZE+1, file) - TRACE_FIELD(char, correct, correct) - ), - TP_RAW_FMT("%u:%s:%s (%u)") -); - -TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(u64, from, from) - TRACE_FIELD(u64, to, to) - ), - TP_RAW_FMT("from: %llx to: %llx") -); - -TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, - TRACE_STRUCT( - TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) - TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) - TRACE_FIELD(int, state_data.type, type) - TRACE_FIELD(int, state_data.state, state) - ), - TP_RAW_FMT("%llx->%llx type:%u state:%u") -); - -TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - TRACE_FIELD(size_t, bytes_req, bytes_req) - TRACE_FIELD(size_t, bytes_alloc, bytes_alloc) - TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) - TRACE_FIELD(int, node, node) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" - " flags:%x node:%d") -); - -TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p") -); - -#undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 78b1ed2..6f03c8a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -21,6 +21,7 @@ #include "trace_output.h" +#undef TRACE_SYSTEM #define TRACE_SYSTEM "TRACE_SYSTEM" DEFINE_MUTEX(event_mutex); @@ -86,7 +87,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, tgid); + __common_field(int, lock_depth); return ret; } @@ -230,11 +231,9 @@ static ssize_t ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_parser parser; size_t read = 0; - int i, set = 1; ssize_t ret; - char *buf; - char ch; if (!cnt || cnt < 0) return 0; @@ -243,60 +242,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (ret < 0) return ret; - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - } - - /* Only white space found? */ - if (isspace(ch)) { - file->f_pos += read; - ret = read; - return ret; - } - - buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); - if (!buf) + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) return -ENOMEM; - if (cnt > EVENT_BUF_SIZE) - cnt = EVENT_BUF_SIZE; + read = trace_get_user(&parser, ubuf, cnt, ppos); + + if (trace_parser_loaded((&parser))) { + int set = 1; - i = 0; - while (cnt && !isspace(ch)) { - if (!i && ch == '!') + if (*parser.buffer == '!') set = 0; - else - buf[i++] = ch; - ret = get_user(ch, ubuf++); + parser.buffer[parser.idx] = 0; + + ret = ftrace_set_clr_event(parser.buffer + !set, set); if (ret) - goto out_free; - read++; - cnt--; + goto out_put; } - buf[i] = 0; - - file->f_pos += read; - - ret = ftrace_set_clr_event(buf, set); - if (ret) - goto out_free; ret = read; - out_free: - kfree(buf); + out_put: + trace_parser_put(&parser); return ret; } @@ -304,42 +271,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *list = m->private; - struct ftrace_event_call *call; + struct ftrace_event_call *call = v; (*pos)++; - for (;;) { - if (list == &ftrace_events) - return NULL; - - call = list_entry(list, struct ftrace_event_call, list); - + list_for_each_entry_continue(call, &ftrace_events, list) { /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->regfunc) - break; - - list = list->next; + return call; } - m->private = list->next; - - return call; + return NULL; } static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; loff_t l; mutex_lock(&event_mutex); - m->private = ftrace_events.next; + call = list_entry(&ftrace_events, struct ftrace_event_call, list); for (l = 0; l <= *pos; ) { - call = t_next(m, NULL, &l); + call = t_next(m, call, &l); if (!call) break; } @@ -349,37 +306,28 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *list = m->private; - struct ftrace_event_call *call; + struct ftrace_event_call *call = v; (*pos)++; - retry: - if (list == &ftrace_events) - return NULL; - - call = list_entry(list, struct ftrace_event_call, list); - - if (!call->enabled) { - list = list->next; - goto retry; + list_for_each_entry_continue(call, &ftrace_events, list) { + if (call->enabled) + return call; } - m->private = list->next; - - return call; + return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; loff_t l; mutex_lock(&event_mutex); - m->private = ftrace_events.next; + call = list_entry(&ftrace_events, struct ftrace_event_call, list); for (l = 0; l <= *pos; ) { - call = s_next(m, NULL, &l); + call = s_next(m, call, &l); if (!call) break; } @@ -578,7 +526,7 @@ static int trace_write_header(struct trace_seq *s) FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), - FIELD(int, tgid)); + FIELD(int, lock_depth)); } static ssize_t @@ -1187,7 +1135,7 @@ static int trace_module_notify(struct notifier_block *self, } #endif /* CONFIG_MODULES */ -struct notifier_block trace_module_nb = { +static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; @@ -1359,6 +1307,18 @@ static __init void event_trace_self_tests(void) if (!call->regfunc) continue; +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. + */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS + if (call->system && + strcmp(call->system, "syscalls") == 0) + continue; +#endif + pr_info("Testing event %s: ", call->name); /* @@ -1432,7 +1392,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1449,7 +1409,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1468,7 +1428,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); ftrace_preempt_enable(resched); } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 93660fb..2324578 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -121,6 +121,47 @@ struct filter_parse_state { } operand; }; +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + DEFINE_COMPARISON_PRED(s64); DEFINE_COMPARISON_PRED(u64); DEFINE_COMPARISON_PRED(s32); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index df1bf6e..9753fcc 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,146 +15,125 @@ #include "trace_output.h" +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +/* not needed for this file */ +#undef __field_struct +#define __field_struct(type, item) -extern void __bad_type_size(void); +#undef __field +#define __field(type, item) type item; -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - if (sizeof(type) != sizeof(field.item)) \ - __bad_type_size(); \ +#undef __field_desc +#define __field_desc(type, container, item) type item; + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size) type item[size]; + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef F_printk +#define F_printk(fmt, args...) fmt, args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __used ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force cmpile-time check on F_printk() */ \ + printk(print); \ +} + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + + +#undef __field +#define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ if (!ret) \ return 0; +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), container.item), \ + sizeof(field.container.item)); \ + if (!ret) \ + return 0; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ if (!ret) \ return 0; -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) \ - ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ - "offset:%u;\tsize:0;\n", \ - (unsigned int)offsetof(typeof(field), item)); \ +#undef __array_desc +#define __array_desc(type, container, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), container.item), \ + sizeof(field.container.item)); \ if (!ret) \ return 0; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) +#undef __dynamic_array +#define __dynamic_array(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%zu;\tsize:0;\n", \ + offsetof(typeof(field), item)); \ + if (!ret) \ + return 0; -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) args +#undef F_printk +#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct args field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} +#undef __entry +#define __entry REC -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ +ftrace_format_##name(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ - struct args field; \ - int ret; \ + struct struct_name field __attribute__((unused)); \ + int ret = 0; \ \ tstruct; \ \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + trace_seq_printf(s, "\nprint fmt: " print); \ \ return ret; \ } -#include "trace_event_types.h" - -#undef TRACE_ZERO_CHAR -#define TRACE_ZERO_CHAR(arg) - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) - -#undef TP_CMD -#define TP_CMD(cmd...) cmd - -#undef TRACE_ENTRY -#define TRACE_ENTRY entry - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - cmd; - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \ -static int ftrace_raw_init_event_##call(void); \ - \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .id = proto, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ - .show_format = ftrace_format_##call, \ - .define_fields = ftrace_define_fields_##call, \ -}; \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - INIT_LIST_HEAD(&event_##call.fields); \ - return 0; \ -} \ - -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ - \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .id = proto, \ - .system = __stringify(TRACE_SYSTEM), \ - .show_format = ftrace_format_##call, \ -}; +#include "trace_entries.h" -#include "trace_event_types.h" -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ +#undef __field +#define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ @@ -162,32 +141,45 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ if (ret) \ return ret; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ sizeof(field.item), 0, FILTER_OTHER); \ if (ret) \ return ret; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed, \ +#undef __array_desc +#define __array_desc(type, container, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), 0, \ FILTER_OTHER); \ if (ret) \ return ret; -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) +#undef __dynamic_array +#define __dynamic_array(type, item) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ int \ -ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ +ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ - struct args field; \ + struct struct_name field; \ int ret; \ \ ret = trace_define_common_fields(event_call); \ @@ -199,8 +191,42 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ return ret; \ } -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) +#include "trace_entries.h" + + +#undef __field +#define __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __array +#define __array(type, item, len) + +#undef __array_desc +#define __array_desc(type, container, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item) + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ +static int ftrace_raw_init_event_##call(void); \ + \ +struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = type, \ + .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_raw_init_event_##call, \ + .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ +}; \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + INIT_LIST_HEAD(&event_##call.fields); \ + return 0; \ +} \ -#include "trace_event_types.h" +#include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5b01b94..b3f3776 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, { long count = (long)data; - seq_printf(m, "%pf:", (void *)ip); + seq_printf(m, "%ps:", (void *)ip); if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b3749a2..45e6c01 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, if (unlikely(current->ret_stack[index].fp != frame_pointer)) { ftrace_graph_stop(); WARN(1, "Bad frame pointer: expected %lx, received %lx\n" - " from func %pF return to %lx\n", + " from func %ps return to %lx\n", current->ret_stack[index].fp, frame_pointer, (void *)current->ret_stack[index].func, @@ -364,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid) } +static enum print_line_t +print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + if (!trace_seq_putc(s, ' ')) + return 0; + + return trace_print_lat_fmt(s, entry); +} + /* If the pid changed since the last trace, output this event */ static enum print_line_t verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) @@ -521,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } + /* Proc */ if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, pid); @@ -659,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); + ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -702,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); + ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -758,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + ret = print_graph_lat_fmt(s, ent); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + return 0; } @@ -952,28 +969,59 @@ print_graph_function(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static void print_lat_header(struct seq_file *s) +{ + static const char spaces[] = " " /* 16 spaces */ + " " /* 4 spaces */ + " "; /* 17 spaces */ + int size = 0; + + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + size += 16; + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + size += 4; + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + size += 17; + + seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); + seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); + seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); + seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); + seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); + seq_printf(s, "#%.*s|||| / \n", size, spaces); +} + static void print_graph_headers(struct seq_file *s) { + int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + + if (lat) + print_lat_header(s); + /* 1st line */ - seq_printf(s, "# "); + seq_printf(s, "#"); if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " TIME "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "CPU"); + seq_printf(s, " CPU"); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " TASK/PID "); + seq_printf(s, " TASK/PID "); + if (lat) + seq_printf(s, "|||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ - seq_printf(s, "# "); + seq_printf(s, "#"); if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "| "); + seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " | | "); + seq_printf(s, " | | "); + if (lat) + seq_printf(s, "|||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index ca7d7c4..23b6385 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) seq_print_ip_sym(seq, it->from, symflags) && trace_seq_printf(seq, "\n")) return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE;; + return TRACE_TYPE_PARTIAL_LINE; } return TRACE_TYPE_UNHANDLED; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5555b75..3aa7eaa 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr, unsigned long parent_ip, int cpu) { - unsigned long latency, t0, t1; cycle_t T0, T1, delta; unsigned long flags; int pc; - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr, trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); - latency = nsecs_to_usecs(delta); - if (data->critical_sequence != max_sequence) goto out_unlock; - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - data->critical_end = parent_ip; - update_max_tr_single(tr, current, cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr_single(tr, current, cpu); + } max_sequence++; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index c4c9bbd..0acd834 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { + struct ftrace_event_call *call = &event_mmiotrace_rw; struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; @@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->rw = *rw; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { + struct ftrace_event_call *call = &event_mmiotrace_map; struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; @@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->map = *map; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e0c2545..f572f44 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, * since individual threads might have already quit! */ rcu_read_lock(); - task = find_task_by_vpid(entry->ent.tgid); + task = find_task_by_vpid(entry->tgid); if (task) mm = get_task_mm(task); rcu_read_unlock(); @@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } -static int -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +/** + * trace_print_lat_fmt - print the irq, preempt and lockdep fields + * @s: trace seq struct to write to + * @entry: The trace entry field from the ring buffer + * + * Prints the generic fields of irqs off, in hard or softirq, preempt + * count and lock depth. + */ +int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { int hardirq, softirq; - char comm[TASK_COMM_LEN]; + int ret; - trace_find_cmdline(entry->pid, comm); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", - comm, entry->pid, cpu, + if (!trace_seq_printf(s, "%c%c%c", (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', @@ -481,9 +486,30 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) hardirq ? 'h' : softirq ? 's' : '.')) return 0; + if (entry->lock_depth < 0) + ret = trace_seq_putc(s, '.'); + else + ret = trace_seq_printf(s, "%d", entry->lock_depth); + if (!ret) + return 0; + if (entry->preempt_count) return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_puts(s, "."); + return trace_seq_putc(s, '.'); +} + +static int +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + if (!trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu)) + return 0; + + return trace_print_lat_fmt(s, entry); } static unsigned long preempt_mark_thresh = 100; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index d38bec4..9d91c72 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +extern int +trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ extern int __unregister_ftrace_event(struct trace_event *event); diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c deleted file mode 100644 index fe1a00f..0000000 --- a/kernel/trace/trace_power.c +++ /dev/null @@ -1,218 +0,0 @@ -/* - * ring buffer based C-state tracer - * - * Arjan van de Ven <arjan@linux.intel.com> - * Copyright (C) 2008 Intel Corporation - * - * Much is borrowed from trace_boot.c which is - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - -#include <linux/init.h> -#include <linux/debugfs.h> -#include <trace/power.h> -#include <linux/kallsyms.h> -#include <linux/module.h> - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *power_trace; -static int __read_mostly trace_power_enabled; - -static void probe_power_start(struct power_trace *it, unsigned int type, - unsigned int level) -{ - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); -} - - -static void probe_power_end(struct power_trace *it) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - preempt_disable(); - it->end = ktime_get(); - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static void probe_power_mark(struct power_trace *it, unsigned int type, - unsigned int level) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); - preempt_disable(); - it->end = it->stamp; - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static int tracing_power_register(void) -{ - int ret; - - ret = register_trace_power_start(probe_power_start); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_start\n"); - return ret; - } - ret = register_trace_power_end(probe_power_end); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_end\n"); - goto fail_start; - } - ret = register_trace_power_mark(probe_power_mark); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_mark\n"); - goto fail_end; - } - return ret; -fail_end: - unregister_trace_power_end(probe_power_end); -fail_start: - unregister_trace_power_start(probe_power_start); - return ret; -} - -static void start_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 1; -} - -static void stop_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 0; -} - -static void power_trace_reset(struct trace_array *tr) -{ - trace_power_enabled = 0; - unregister_trace_power_start(probe_power_start); - unregister_trace_power_end(probe_power_end); - unregister_trace_power_mark(probe_power_mark); -} - - -static int power_trace_init(struct trace_array *tr) -{ - power_trace = tr; - - trace_power_enabled = 1; - tracing_power_register(); - - tracing_reset_online_cpus(tr); - return 0; -} - -static enum print_line_t power_print_line(struct trace_iterator *iter) -{ - int ret = 0; - struct trace_entry *entry = iter->ent; - struct trace_power *field ; - struct power_trace *it; - struct trace_seq *s = &iter->seq; - struct timespec stamp; - struct timespec duration; - - trace_assign_type(field, entry); - it = &field->state_data; - stamp = ktime_to_timespec(it->stamp); - duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); - - if (entry->type == TRACE_POWER) { - if (it->type == POWER_CSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu, - duration.tv_sec, - duration.tv_nsec); - if (it->type == POWER_PSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -static void power_print_header(struct seq_file *s) -{ - seq_puts(s, "# TIMESTAMP STATE EVENT\n"); - seq_puts(s, "# | | |\n"); -} - -static struct tracer power_tracer __read_mostly = -{ - .name = "power", - .init = power_trace_init, - .start = start_power_trace, - .stop = stop_power_trace, - .reset = power_trace_reset, - .print_line = power_print_line, - .print_header = power_print_header, -}; - -static int init_power_trace(void) -{ - return register_tracer(&power_tracer); -} -device_initcall(init_power_trace); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 687699d..2547d88 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -11,7 +11,6 @@ #include <linux/ftrace.h> #include <linux/string.h> #include <linux/module.h> -#include <linux/marker.h> #include <linux/mutex.h> #include <linux/ctype.h> #include <linux/list.h> diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ad69f10..26185d7 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled; static struct task_struct *wakeup_task; static int wakeup_cpu; +static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; @@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (unlikely(disabled != 1)) goto out; local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); - - if (unlikely(!wakeup_task)) - goto unlock; - - /* - * The task can't disappear because it needs to - * wake up first, and we have the wakeup_lock. - */ - if (task_cpu(wakeup_task) != cpu) - goto unlock; trace_function(tr, ip, parent_ip, flags, pc); - unlock: - __raw_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: atomic_dec(&data->disabled); - + out_enable: ftrace_preempt_enable(resched); } @@ -107,11 +98,18 @@ static int report_latency(cycle_t delta) return 1; } +static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) +{ + if (task != wakeup_task) + return; + + wakeup_current_cpu = cpu; +} + static void notrace probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - unsigned long latency = 0, t0 = 0, t1 = 0; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; @@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (!report_latency(delta)) goto out_unlock; - latency = nsecs_to_usecs(delta); - - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + } out_unlock: __wakeup_reset(wakeup_trace); @@ -244,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) __wakeup_reset(wakeup_trace); wakeup_cpu = task_cpu(p); + wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; wakeup_task = p; @@ -293,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr) goto fail_deprobe_wake_new; } + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_migrate_task\n"); + return; + } + wakeup_reset(tr); /* @@ -325,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); unregister_trace_sched_wakeup(probe_wakeup); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); } static int __wakeup_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0f6facb..8504ac7 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = { int stack_trace_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == !!stack_tracer_enabled)) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8712ce3..9fbce6c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -2,7 +2,7 @@ #include <trace/events/syscalls.h> #include <linux/kernel.h> #include <linux/ftrace.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/syscall.h> #include "trace_output.h" @@ -384,10 +384,13 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { - struct syscall_trace_enter *rec; struct syscall_metadata *sys_data; + struct syscall_trace_enter *rec; + unsigned long flags; + char *raw_data; int syscall_nr; int size; + int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) @@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - do { - char raw_data[size]; + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return; + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(flags); - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + cpu = smp_processor_id(); + + if (in_nmi()) + raw_data = rcu_dereference(trace_profile_buf_nmi); + else + raw_data = rcu_dereference(trace_profile_buf); + + if (!raw_data) + goto end; - rec = (struct syscall_trace_enter *) raw_data; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_id; - rec->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, - (unsigned long *)&rec->args); - perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); - } while(0); + raw_data = per_cpu_ptr(raw_data, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + rec = (struct syscall_trace_enter *) raw_data; + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->enter_id; + rec->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, + (unsigned long *)&rec->args); + perf_tp_event(sys_data->enter_id, 0, 1, rec, size); + +end: + local_irq_restore(flags); } int reg_prof_syscall_enter(char *name) @@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name) static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; - struct syscall_trace_exit rec; + struct syscall_trace_exit *rec; + unsigned long flags; int syscall_nr; + char *raw_data; + int size; + int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) @@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - tracing_generic_entry_update(&rec.ent, 0, 0); - rec.ent.type = sys_data->exit_id; - rec.nr = syscall_nr; - rec.ret = syscall_get_return_value(current, regs); + /* We can probably do that at build time */ + size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); - perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); + /* + * Impossible, but be paranoid with the future + * How to put this check outside runtime? + */ + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "exit event has grown above profile buffer size")) + return; + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(flags); + cpu = smp_processor_id(); + + if (in_nmi()) + raw_data = rcu_dereference(trace_profile_buf_nmi); + else + raw_data = rcu_dereference(trace_profile_buf); + + if (!raw_data) + goto end; + + raw_data = per_cpu_ptr(raw_data, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + rec = (struct syscall_trace_exit *)raw_data; + + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->exit_id; + rec->nr = syscall_nr; + rec->ret = syscall_get_return_value(current, regs); + + perf_tp_event(sys_data->exit_id, 0, 1, rec, size); + +end: + local_irq_restore(flags); } int reg_prof_syscall_exit(char *name) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9489a0a..cc89be5 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : - * It is used to to delay the free of multiple probes array until a quiescent + * It is used to delay the free of multiple probes array until a quiescent * state is reached. * Tracepoint entries modifications are protected by the tracepoints_mutex. */ diff --git a/kernel/uid16.c b/kernel/uid16.c index 0314501..4192098 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -4,7 +4,6 @@ */ #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/mman.h> #include <linux/notifier.h> #include <linux/reboot.h> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 92359cc..69eae35 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); + r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); return r; } |