author    | codeworkx <daniel.hillenbrand@codeworkx.de> | 2012-06-02 13:09:29 +0200
committer | codeworkx <daniel.hillenbrand@codeworkx.de> | 2012-06-02 13:09:29 +0200
commit    | c6da2cfeb05178a11c6d062a06f8078150ee492f (patch)
tree      | f3b4021d252c52d6463a9b3c1bb7245e399b009c /drivers/cpufreq
parent    | c6d7c4dbff353eac7919342ae6b3299a378160a6 (diff)
download  | kernel_samsung_smdk4412-c6da2cfeb05178a11c6d062a06f8078150ee492f.zip
          | kernel_samsung_smdk4412-c6da2cfeb05178a11c6d062a06f8078150ee492f.tar.gz
          | kernel_samsung_smdk4412-c6da2cfeb05178a11c6d062a06f8078150ee492f.tar.bz2
samsung update 1
Diffstat (limited to 'drivers/cpufreq')
-rw-r--r-- | drivers/cpufreq/Kconfig               |   95
-rw-r--r-- | drivers/cpufreq/Makefile              |    5
-rw-r--r-- | drivers/cpufreq/cpufreq.c             |    2
-rw-r--r-- | drivers/cpufreq/cpufreq_adaptive.c    |  952
-rw-r--r-- | drivers/cpufreq/cpufreq_interactive.c |  707
-rw-r--r-- | drivers/cpufreq/cpufreq_ondemand.c    |  462
-rw-r--r-- | drivers/cpufreq/cpufreq_pegasusq.c    | 1411
-rw-r--r-- | drivers/cpufreq/cpufreq_stats.c       |   25
-rw-r--r-- | drivers/cpufreq/dvfs_monitor.c        |  236
9 files changed, 3886 insertions, 9 deletions
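Both governors added by this patch (adaptive and interactive) derive CPU load the same way: each sampling window compares the growth of the per-CPU idle-time counter against elapsed wall time. As a reading aid before the diff itself, here is a minimal standalone sketch of that calculation; the helper name `sample_cpu_load` and the standalone form are illustrative, not part of the commit.

```c
/*
 * Illustrative sketch (not part of the commit): the per-sample load
 * estimate used by cpufreq_adaptive.c and cpufreq_interactive.c below.
 * Load is the busy fraction of wall time since the previous sample,
 * derived from the counters reported by get_cpu_idle_time_us().
 * All values are in microseconds; u64 is the kernel type from
 * <linux/types.h>. Returns -1 when the window is too short to trust.
 */
static int sample_cpu_load(u64 prev_idle, u64 prev_wall,
			   u64 cur_idle, u64 cur_wall)
{
	unsigned int delta_idle = (unsigned int)(cur_idle - prev_idle);
	unsigned int delta_time = (unsigned int)(cur_wall - prev_wall);

	/* Less than 1 ms elapsed: retry later, as both governors do. */
	if (delta_time < 1000)
		return -1;

	/* Idle can outrun wall time due to rounding; clamp to 0% load. */
	if (delta_idle > delta_time)
		return 0;

	return 100 * (delta_time - delta_idle) / delta_time;
}
```

The interactive governor then takes the greater of this short-term figure and the load measured since the last frequency change, so a brief dip in load does not immediately drop the clock.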
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 9fb8485..8e089bd 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -99,6 +99,30 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor. + +config CPU_FREQ_DEFAULT_GOV_INTERACTIVE + bool "interactive" + select CPU_FREQ_GOV_INTERACTIVE + help + Use the CPUFreq governor 'interactive' as default. This allows + you to get a full dynamic cpu frequency capable system by simply + loading your cpufreq low-level hardware driver, using the + 'interactive' governor for latency-sensitive workloads. + +config CPU_FREQ_DEFAULT_GOV_ADAPTIVE + bool "adaptive" + select CPU_FREQ_GOV_ADAPTIVE + help + Use the CPUFreq governor 'adaptive' as default. This allows + you to get a full dynamic cpu frequency capable system by simply + loading your cpufreq low-level hardware driver, using the + 'adaptive' governor for latency-sensitive workloads and demanding + performance. + +config CPU_FREQ_DEFAULT_GOV_PEGASUSQ + bool "pegasusq" + select CPU_FREQ_GOV_PEGASUSQ + endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -156,6 +180,45 @@ config CPU_FREQ_GOV_ONDEMAND If in doubt, say N. +config CPU_FREQ_GOV_ONDEMAND_FLEXRATE + bool "flexrate interface for 'ondemand' cpufreq policy governor" + depends on CPU_FREQ_GOV_ONDEMAND + help + Flexrate for 'ondemand' governor provides an interface to request + faster polling temporarily. This is to let it react quickly to + load changes when there is high probablity of load increase + in short time. For example, when a user event occurs, we have + use this interface. It does not increase the frequency + unconditionally; however, it allows ondemand to react fast + by temporarily decreasing sampling rate. Flexrate provides both + sysfs interface and in-kernel interface. + +config CPU_FREQ_GOV_ONDEMAND_FLEXRATE_MAX_DURATION + int "flexrate's maximum duration of sampling rate override" + range 5 500 + depends on CPU_FREQ_GOV_ONDEMAND_FLEXRATE + default "100" + help + The maximum number of ondemand sampling whose rate is + overriden by Flexrate for ondemand. + +config CPU_FREQ_GOV_INTERACTIVE + tristate "'interactive' cpufreq policy governor" + help + 'interactive' - This driver adds a dynamic cpufreq policy governor + designed for latency-sensitive workloads. + + This governor attempts to reduce the latency of clock + increases so that the system is more responsive to + interactive workloads. + + To compile this driver as a module, choose M here: the + module will be called cpufreq_interactive. + + For details, take a look at linux/Documentation/cpu-freq. + + If in doubt, say N. + config CPU_FREQ_GOV_CONSERVATIVE tristate "'conservative' cpufreq governor" depends on CPU_FREQ @@ -179,6 +242,38 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. +config CPU_FREQ_GOV_ADAPTIVE + tristate "'adaptive' cpufreq policy governor" + help + 'adaptive' - This driver adds a dynamic cpufreq policy governor + designed for latency-sensitive workloads and also for demanding + performance. + + This governor attempts to reduce the latency of clock + increases so that the system is more responsive to + interactive workloads in loweset steady-state but to + to reduce power consumption in middle operation level level up + will be done in step by step to prohibit system from going to + max operation level. 
+ + To compile this driver as a module, choose M here: the + module will be called cpufreq_adaptive. + + For details, take a look at linux/Documentation/cpu-freq. + + If in doubt, say N. + +config CPU_FREQ_GOV_PEGASUSQ + tristate "'pegasusq' cpufreq policy governor" + +config CPU_FREQ_DVFS_MONITOR + bool "dvfs monitor" + depends on CPU_FREQ + help + This option adds a proc node for dvfs monitoring. + /proc/dvfs_mon + + menu "x86 CPU frequency scaling drivers" depends on X86 source "drivers/cpufreq/Kconfig.x86" diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index e2fc2d2..ed91c0d 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -9,10 +9,15 @@ obj-$(CONFIG_CPU_FREQ_GOV_POWERSAVE) += cpufreq_powersave.o obj-$(CONFIG_CPU_FREQ_GOV_USERSPACE) += cpufreq_userspace.o obj-$(CONFIG_CPU_FREQ_GOV_ONDEMAND) += cpufreq_ondemand.o obj-$(CONFIG_CPU_FREQ_GOV_CONSERVATIVE) += cpufreq_conservative.o +obj-$(CONFIG_CPU_FREQ_GOV_INTERACTIVE) += cpufreq_interactive.o +obj-$(CONFIG_CPU_FREQ_GOV_ADAPTIVE) += cpufreq_adaptive.o +obj-$(CONFIG_CPU_FREQ_GOV_PEGASUSQ) += cpufreq_pegasusq.o # CPUfreq cross-arch helpers obj-$(CONFIG_CPU_FREQ_TABLE) += freq_table.o +obj-$(CONFIG_CPU_FREQ_DVFS_MONITOR) += dvfs_monitor.o + ##################################################################################d # x86 drivers. # Link order matters. K8 is preferred to ACPI because of firmware bugs in early diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0a5bea9..9785cf7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -189,7 +189,7 @@ EXPORT_SYMBOL_GPL(cpufreq_cpu_put); * systems as each CPU might be scaled differently. So, use the arch * per-CPU loops_per_jiffy value wherever possible. */ -#ifndef CONFIG_SMP +#if !defined CONFIG_SMP || defined(CONFIG_ARCH_EXYNOS4) || defined(CONFIG_ARCH_EXYNOS5) static unsigned long l_p_j_ref; static unsigned int l_p_j_ref_freq; diff --git a/drivers/cpufreq/cpufreq_adaptive.c b/drivers/cpufreq/cpufreq_adaptive.c new file mode 100644 index 0000000..ad7f7de --- /dev/null +++ b/drivers/cpufreq/cpufreq_adaptive.c @@ -0,0 +1,952 @@ +/* + * drivers/cpufreq/cpufreq_adaptive.c + * + * Copyright (C) 2001 Russell King + * (C) 2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>. + * Jun Nakajima <jun.nakajima@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/cpu.h> +#include <linux/jiffies.h> +#include <linux/kernel_stat.h> +#include <linux/mutex.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/ktime.h> +#include <linux/sched.h> +#include <linux/kthread.h> + +#include <mach/ppmu.h> + +/* + * dbs is used in this file as a shortform for demandbased switching + * It helps to keep variable names smaller, simpler + */ + +#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) +#define DEF_FREQUENCY_UP_THRESHOLD (80) +#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) +#define MICRO_FREQUENCY_UP_THRESHOLD (95) +#define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) +#define MIN_FREQUENCY_UP_THRESHOLD (11) +#define MAX_FREQUENCY_UP_THRESHOLD (100) +#define MIN_ONDEMAND_THRESHOLD (4) +/* + * The polling frequency of this governor depends on the capability of + * the processor. 
Default polling frequency is 1000 times the transition + * latency of the processor. The governor will work on any processor with + * transition latency <= 10mS, using appropriate sampling + * rate. + * For CPUs with transition latency > 10mS (mostly drivers with CPUFREQ_ETERNAL) + * this governor will not work. + * All times here are in uS. + */ +#define MIN_SAMPLING_RATE_RATIO (2) + +static unsigned int min_sampling_rate; + +#define LATENCY_MULTIPLIER (1000) +#define MIN_LATENCY_MULTIPLIER (100) +#define TRANSITION_LATENCY_LIMIT (10 * 1000 * 1000) + +static void (*pm_idle_old)(void); +static void do_dbs_timer(struct work_struct *work); +static int cpufreq_governor_dbs(struct cpufreq_policy *policy, + unsigned int event); + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ADAPTIVE +static +#endif +struct cpufreq_governor cpufreq_gov_adaptive = { + .name = "adaptive", + .governor = cpufreq_governor_dbs, + .max_transition_latency = TRANSITION_LATENCY_LIMIT, + .owner = THIS_MODULE, +}; + +/* Sampling types */ +enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE}; + +struct cpu_dbs_info_s { + cputime64_t prev_cpu_idle; + cputime64_t prev_cpu_iowait; + cputime64_t prev_cpu_wall; + cputime64_t prev_cpu_nice; + struct cpufreq_policy *cur_policy; + struct delayed_work work; + struct cpufreq_frequency_table *freq_table; + unsigned int freq_hi_jiffies; + int cpu; + unsigned int sample_type:1; + bool ondemand; + /* + * percpu mutex that serializes governor limit change with + * do_dbs_timer invocation. We do not want do_dbs_timer to run + * when user is changing the governor or limits. + */ + struct mutex timer_mutex; +}; +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); + +static unsigned int dbs_enable; /* number of CPUs using this policy */ + +/* + * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on + * different CPUs. It protects dbs_enable in governor start/stop. + */ +static DEFINE_MUTEX(dbs_mutex); +static struct task_struct *up_task; +static struct workqueue_struct *down_wq; +static struct work_struct freq_scale_down_work; +static cpumask_t up_cpumask; +static spinlock_t up_cpumask_lock; +static cpumask_t down_cpumask; +static spinlock_t down_cpumask_lock; + +static DEFINE_PER_CPU(cputime64_t, idle_in_idle); +static DEFINE_PER_CPU(cputime64_t, idle_exit_wall); + +static struct timer_list cpu_timer; +static unsigned int target_freq; +static DEFINE_MUTEX(short_timer_mutex); + +/* Go to max speed when CPU load at or above this value. 
*/ +#define DEFAULT_GO_MAXSPEED_LOAD 60 +static unsigned long go_maxspeed_load; + +#define DEFAULT_KEEP_MINSPEED_LOAD 30 +static unsigned long keep_minspeed_load; + +#define DEFAULT_STEPUP_LOAD 10 +static unsigned long step_up_load; + +static struct dbs_tuners { + unsigned int sampling_rate; + unsigned int up_threshold; + unsigned int down_differential; + unsigned int ignore_nice; + unsigned int io_is_busy; +} dbs_tuners_ins = { + .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, + .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL, + .ignore_nice = 0, +}; + +static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, cputime64_t *wall) +{ + u64 iowait_time = get_cpu_iowait_time_us(cpu, wall); + + if (iowait_time == -1ULL) + return 0; + + return iowait_time; +} + +static void adaptive_init_cpu(int cpu) +{ + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + dbs_info->freq_table = cpufreq_frequency_get_table(cpu); +} + +/************************** sysfs interface ************************/ + +static ssize_t show_sampling_rate_max(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + printk_once(KERN_INFO "CPUFREQ: adaptive sampling_rate_max " + "sysfs file is deprecated - used by: %s\n", current->comm); + return sprintf(buf, "%u\n", -1U); +} + +static ssize_t show_sampling_rate_min(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", min_sampling_rate); +} + +define_one_global_ro(sampling_rate_max); +define_one_global_ro(sampling_rate_min); + +/* cpufreq_adaptive Governor Tunables */ +#define show_one(file_name, object) \ +static ssize_t show_##file_name \ +(struct kobject *kobj, struct attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "%u\n", dbs_tuners_ins.object); \ +} +show_one(sampling_rate, sampling_rate); +show_one(io_is_busy, io_is_busy); +show_one(up_threshold, up_threshold); +show_one(ignore_nice_load, ignore_nice); + +/*** delete after deprecation time ***/ + +#define DEPRECATION_MSG(file_name) \ + printk_once(KERN_INFO "CPUFREQ: Per core adaptive sysfs " \ + "interface is deprecated - " #file_name "\n"); + +#define show_one_old(file_name) \ +static ssize_t show_##file_name##_old \ +(struct cpufreq_policy *unused, char *buf) \ +{ \ + printk_once(KERN_INFO "CPUFREQ: Per core adaptive sysfs " \ + "interface is deprecated - " #file_name "\n"); \ + return show_##file_name(NULL, NULL, buf); \ +} + +/*** delete after deprecation time ***/ + +static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + + mutex_lock(&dbs_mutex); + dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate); + mutex_unlock(&dbs_mutex); + + return count; +} + +static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + + mutex_lock(&dbs_mutex); + dbs_tuners_ins.io_is_busy = !!input; + mutex_unlock(&dbs_mutex); + + return count; +} + +static ssize_t store_up_threshold(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + + if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD || + input < MIN_FREQUENCY_UP_THRESHOLD) { + return -EINVAL; + } + + mutex_lock(&dbs_mutex); + dbs_tuners_ins.up_threshold = input; + mutex_unlock(&dbs_mutex); + + return 
count; +} + +static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + + unsigned int j; + + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + + if (input > 1) + input = 1; + + mutex_lock(&dbs_mutex); + if (input == dbs_tuners_ins.ignore_nice) { /* nothing to do */ + mutex_unlock(&dbs_mutex); + return count; + } + dbs_tuners_ins.ignore_nice = input; + + /* we need to re-evaluate prev_cpu_idle */ + for_each_online_cpu(j) { + struct cpu_dbs_info_s *dbs_info; + dbs_info = &per_cpu(od_cpu_dbs_info, j); + dbs_info->prev_cpu_idle = get_cpu_idle_time_us(j, + &dbs_info->prev_cpu_wall); + if (dbs_tuners_ins.ignore_nice) + dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; + + } + mutex_unlock(&dbs_mutex); + + return count; +} + +define_one_global_rw(sampling_rate); +define_one_global_rw(io_is_busy); +define_one_global_rw(up_threshold); +define_one_global_rw(ignore_nice_load); + +static struct attribute *dbs_attributes[] = { + &sampling_rate_max.attr, + &sampling_rate_min.attr, + &sampling_rate.attr, + &up_threshold.attr, + &ignore_nice_load.attr, + &io_is_busy.attr, + NULL +}; + +static struct attribute_group dbs_attr_group = { + .attrs = dbs_attributes, + .name = "adaptive", +}; + +/*** delete after deprecation time ***/ + +#define write_one_old(file_name) \ +static ssize_t store_##file_name##_old \ +(struct cpufreq_policy *unused, const char *buf, size_t count) \ +{ \ + printk_once(KERN_INFO "CPUFREQ: Per core adaptive sysfs " \ + "interface is deprecated - " #file_name "\n"); \ + return store_##file_name(NULL, NULL, buf, count); \ +} + +static void cpufreq_adaptive_timer(unsigned long data) +{ + cputime64_t cur_idle; + cputime64_t cur_wall; + unsigned int delta_idle; + unsigned int delta_time; + int short_load; + unsigned int new_freq; + unsigned long flags; + struct cpu_dbs_info_s *this_dbs_info; + struct cpufreq_policy *policy; + unsigned int j; + unsigned int index; + unsigned int max_load = 0; + + this_dbs_info = &per_cpu(od_cpu_dbs_info, 0); + + policy = this_dbs_info->cur_policy; + + for_each_online_cpu(j) { + cur_idle = get_cpu_idle_time_us(j, &cur_wall); + + delta_idle = (unsigned int) cputime64_sub(cur_idle, + per_cpu(idle_in_idle, j)); + delta_time = (unsigned int) cputime64_sub(cur_wall, + per_cpu(idle_exit_wall, j)); + + /* + * If timer ran less than 1ms after short-term sample started, retry. 
+ */ + if (delta_time < 1000) + goto do_nothing; + + if (delta_idle > delta_time) + short_load = 0; + else + short_load = 100 * (delta_time - delta_idle) / delta_time; + + if (short_load > max_load) + max_load = short_load; + } + + if (this_dbs_info->ondemand) + goto do_nothing; + + if (max_load >= go_maxspeed_load) + new_freq = policy->max; + else + new_freq = policy->max * max_load / 100; + + if ((max_load <= keep_minspeed_load) && + (policy->cur == policy->min)) + new_freq = policy->cur; + + if (cpufreq_frequency_table_target(policy, this_dbs_info->freq_table, + new_freq, CPUFREQ_RELATION_L, + &index)) { + goto do_nothing; + } + + new_freq = this_dbs_info->freq_table[index].frequency; + + target_freq = new_freq; + + if (new_freq < this_dbs_info->cur_policy->cur) { + spin_lock_irqsave(&down_cpumask_lock, flags); + cpumask_set_cpu(0, &down_cpumask); + spin_unlock_irqrestore(&down_cpumask_lock, flags); + queue_work(down_wq, &freq_scale_down_work); + } else { + spin_lock_irqsave(&up_cpumask_lock, flags); + cpumask_set_cpu(0, &up_cpumask); + spin_unlock_irqrestore(&up_cpumask_lock, flags); + wake_up_process(up_task); + } + + return; + +do_nothing: + for_each_online_cpu(j) { + per_cpu(idle_in_idle, j) = + get_cpu_idle_time_us(j, + &per_cpu(idle_exit_wall, j)); + } + mod_timer(&cpu_timer, jiffies + 2); + schedule_delayed_work_on(0, &this_dbs_info->work, 10); + + if (mutex_is_locked(&short_timer_mutex)) + mutex_unlock(&short_timer_mutex); + return; +} + +/*** delete after deprecation time ***/ + +/************************** sysfs end ************************/ + +static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq) +{ +#ifndef CONFIG_ARCH_EXYNOS4 + if (p->cur == p->max) + return; +#endif + __cpufreq_driver_target(p, freq, CPUFREQ_RELATION_H); +} + +static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) +{ + unsigned int max_load_freq; + + struct cpufreq_policy *policy; + unsigned int j; + + unsigned int index, new_freq; + unsigned int longterm_load = 0; + + policy = this_dbs_info->cur_policy; + + /* + * Every sampling_rate, we check, if current idle time is less + * than 20% (default), then we try to increase frequency + * Every sampling_rate, we look for a the lowest + * frequency which can sustain the load while keeping idle time over + * 30%. If such a frequency exist, we try to decrease to this frequency. + * + * Any frequency increase takes it to the maximum frequency. 
+ * Frequency reduction happens at minimum steps of + * 5% (default) of current frequency + */ + + /* Get Absolute Load - in terms of freq */ + max_load_freq = 0; + + for_each_cpu(j, policy->cpus) { + struct cpu_dbs_info_s *j_dbs_info; + cputime64_t cur_wall_time, cur_idle_time, cur_iowait_time; + unsigned int idle_time, wall_time, iowait_time; + unsigned int load, load_freq; + int freq_avg; + + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); + + cur_idle_time = get_cpu_idle_time_us(j, &cur_wall_time); + cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time); + + wall_time = (unsigned int) cputime64_sub(cur_wall_time, + j_dbs_info->prev_cpu_wall); + j_dbs_info->prev_cpu_wall = cur_wall_time; + + idle_time = (unsigned int) cputime64_sub(cur_idle_time, + j_dbs_info->prev_cpu_idle); + j_dbs_info->prev_cpu_idle = cur_idle_time; + + iowait_time = (unsigned int) cputime64_sub(cur_iowait_time, + j_dbs_info->prev_cpu_iowait); + j_dbs_info->prev_cpu_iowait = cur_iowait_time; + + if (dbs_tuners_ins.ignore_nice) { + cputime64_t cur_nice; + unsigned long cur_nice_jiffies; + + cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice, + j_dbs_info->prev_cpu_nice); + /* + * Assumption: nice time between sampling periods will + * be less than 2^32 jiffies for 32 bit sys + */ + cur_nice_jiffies = (unsigned long) + cputime64_to_jiffies64(cur_nice); + + j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; + idle_time += jiffies_to_usecs(cur_nice_jiffies); + } + + /* + * For the purpose of adaptive, waiting for disk IO is an + * indication that you're performance critical, and not that + * the system is actually idle. So subtract the iowait time + * from the cpu idle time. + */ + + if (dbs_tuners_ins.io_is_busy && idle_time >= iowait_time) + idle_time -= iowait_time; + + if (unlikely(!wall_time || wall_time < idle_time)) + continue; + + load = 100 * (wall_time - idle_time) / wall_time; + + if (load > longterm_load) + longterm_load = load; + + freq_avg = __cpufreq_driver_getavg(policy, j); + if (freq_avg <= 0) + freq_avg = policy->cur; + + load_freq = load * freq_avg; + + if (load_freq > max_load_freq) + max_load_freq = load_freq; + } + + if (longterm_load >= MIN_ONDEMAND_THRESHOLD) + this_dbs_info->ondemand = true; + else + this_dbs_info->ondemand = false; + + /* Check for frequency increase */ + if (max_load_freq > (dbs_tuners_ins.up_threshold * policy->cur)) { + cpufreq_frequency_table_target(policy, + this_dbs_info->freq_table, + (policy->cur + step_up_load), + CPUFREQ_RELATION_L, &index); + + new_freq = this_dbs_info->freq_table[index].frequency; + dbs_freq_increase(policy, new_freq); + return; + } + + /* Check for frequency decrease */ + /* if we cannot reduce the frequency anymore, break out early */ +#ifndef CONFIG_ARCH_EXYNOS4 + if (policy->cur == policy->min) + return; +#endif + /* + * The optimal frequency is the frequency that is the lowest that + * can support the current CPU usage without triggering the up + * policy. To be safe, we focus 10 points under the threshold. 
+ */ + if (max_load_freq < + (dbs_tuners_ins.up_threshold - dbs_tuners_ins.down_differential) * + policy->cur) { + unsigned int freq_next; + freq_next = max_load_freq / + (dbs_tuners_ins.up_threshold - + dbs_tuners_ins.down_differential); + + if (freq_next < policy->min) + freq_next = policy->min; + + __cpufreq_driver_target(policy, freq_next, + CPUFREQ_RELATION_L); + } +} + +static void do_dbs_timer(struct work_struct *work) +{ + struct cpu_dbs_info_s *dbs_info = + container_of(work, struct cpu_dbs_info_s, work.work); + unsigned int cpu = dbs_info->cpu; + + int delay; + + mutex_lock(&dbs_info->timer_mutex); + + /* Common NORMAL_SAMPLE setup */ + dbs_info->sample_type = DBS_NORMAL_SAMPLE; + dbs_check_cpu(dbs_info); + + /* We want all CPUs to do sampling nearly on + * same jiffy + */ + delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); + + schedule_delayed_work_on(cpu, &dbs_info->work, delay); + + mutex_unlock(&dbs_info->timer_mutex); +} + +static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) +{ + /* We want all CPUs to do sampling nearly on same jiffy */ + int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); + + dbs_info->sample_type = DBS_NORMAL_SAMPLE; + INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer); + schedule_delayed_work_on(dbs_info->cpu, &dbs_info->work, delay); +} + +static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) +{ + cancel_delayed_work_sync(&dbs_info->work); +} + +/* + * Not all CPUs want IO time to be accounted as busy; this dependson how + * efficient idling at a higher frequency/voltage is. + * Pavel Machek says this is not so for various generations of AMD and old + * Intel systems. + * Mike Chan (androidlcom) calis this is also not true for ARM. + * Because of this, whitelist specific known (series) of CPUs by default, and + * leave all others up to the user. + */ +static int should_io_be_busy(void) +{ +#if defined(CONFIG_X86) + /* + * For Intel, Core 2 (model 15) andl later have an efficient idle. 
+ */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model >= 15) + return 1; +#endif + return 0; +} + +static void cpufreq_adaptive_idle(void) +{ + int i; + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, 0); + struct cpufreq_policy *policy; + + policy = dbs_info->cur_policy; + + pm_idle_old(); + + if ((policy->cur == policy->min) || + (policy->cur == policy->max)) { + + if (timer_pending(&cpu_timer)) + return; + + if (mutex_trylock(&short_timer_mutex)) { + for_each_online_cpu(i) { + per_cpu(idle_in_idle, i) = + get_cpu_idle_time_us(i, + &per_cpu(idle_exit_wall, i)); + } + + mod_timer(&cpu_timer, jiffies + 2); + cancel_delayed_work(&dbs_info->work); + } + } else { + if (timer_pending(&cpu_timer)) + del_timer(&cpu_timer); + + } +} + +static int cpufreq_governor_dbs(struct cpufreq_policy *policy, + unsigned int event) +{ + unsigned int cpu = policy->cpu; + struct cpu_dbs_info_s *this_dbs_info; + unsigned int j; + int rc; + + this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + + switch (event) { + case CPUFREQ_GOV_START: + if ((!cpu_online(cpu)) || (!policy->cur)) + return -EINVAL; + + mutex_lock(&dbs_mutex); + + rc = sysfs_create_group(&policy->kobj, &dbs_attr_group); + if (rc) { + mutex_unlock(&dbs_mutex); + return rc; + } + + dbs_enable++; + for_each_cpu(j, policy->cpus) { + struct cpu_dbs_info_s *j_dbs_info; + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); + j_dbs_info->cur_policy = policy; + + j_dbs_info->prev_cpu_idle = get_cpu_idle_time_us(j, + &j_dbs_info->prev_cpu_wall); + if (dbs_tuners_ins.ignore_nice) { + j_dbs_info->prev_cpu_nice = + kstat_cpu(j).cpustat.nice; + } + } + this_dbs_info->cpu = cpu; + adaptive_init_cpu(cpu); + + /* + * Start the timerschedule work, when this governor + * is used for first time + */ + if (dbs_enable == 1) { + unsigned int latency; + + rc = sysfs_create_group(cpufreq_global_kobject, + &dbs_attr_group); + if (rc) { + mutex_unlock(&dbs_mutex); + return rc; + } + + /* policy latency is in nS. 
Convert it to uS first */ + latency = policy->cpuinfo.transition_latency / 1000; + if (latency == 0) + latency = 1; + /* Bring kernel and HW constraints together */ + min_sampling_rate = max(min_sampling_rate, + MIN_LATENCY_MULTIPLIER * latency); + dbs_tuners_ins.sampling_rate = + max(min_sampling_rate, + latency * LATENCY_MULTIPLIER); + dbs_tuners_ins.io_is_busy = should_io_be_busy(); + } + mutex_unlock(&dbs_mutex); + + mutex_init(&this_dbs_info->timer_mutex); + dbs_timer_init(this_dbs_info); + + pm_idle_old = pm_idle; + pm_idle = cpufreq_adaptive_idle; + break; + + case CPUFREQ_GOV_STOP: + dbs_timer_exit(this_dbs_info); + + mutex_lock(&dbs_mutex); + sysfs_remove_group(&policy->kobj, &dbs_attr_group); + mutex_destroy(&this_dbs_info->timer_mutex); + dbs_enable--; + mutex_unlock(&dbs_mutex); + if (!dbs_enable) + sysfs_remove_group(cpufreq_global_kobject, + &dbs_attr_group); + + pm_idle = pm_idle_old; + break; + + case CPUFREQ_GOV_LIMITS: + mutex_lock(&this_dbs_info->timer_mutex); + if (policy->max < this_dbs_info->cur_policy->cur) + __cpufreq_driver_target(this_dbs_info->cur_policy, + policy->max, CPUFREQ_RELATION_H); + else if (policy->min > this_dbs_info->cur_policy->cur) + __cpufreq_driver_target(this_dbs_info->cur_policy, + policy->min, CPUFREQ_RELATION_L); + mutex_unlock(&this_dbs_info->timer_mutex); + break; + } + return 0; +} + +static inline void cpufreq_adaptive_update_time(void) +{ + struct cpu_dbs_info_s *this_dbs_info; + struct cpufreq_policy *policy; + int j; + + this_dbs_info = &per_cpu(od_cpu_dbs_info, 0); + policy = this_dbs_info->cur_policy; + + for_each_cpu(j, policy->cpus) { + struct cpu_dbs_info_s *j_dbs_info; + cputime64_t cur_wall_time, cur_idle_time, cur_iowait_time; + + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); + + cur_idle_time = get_cpu_idle_time_us(j, &cur_wall_time); + cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time); + + j_dbs_info->prev_cpu_wall = cur_wall_time; + + j_dbs_info->prev_cpu_idle = cur_idle_time; + + j_dbs_info->prev_cpu_iowait = cur_iowait_time; + + if (dbs_tuners_ins.ignore_nice) + j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; + + } + +} + +static int cpufreq_adaptive_up_task(void *data) +{ + unsigned long flags; + struct cpu_dbs_info_s *this_dbs_info; + struct cpufreq_policy *policy; + int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); + + this_dbs_info = &per_cpu(od_cpu_dbs_info, 0); + policy = this_dbs_info->cur_policy; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&up_cpumask_lock, flags); + + if (cpumask_empty(&up_cpumask)) { + spin_unlock_irqrestore(&up_cpumask_lock, flags); + schedule(); + + if (kthread_should_stop()) + break; + + spin_lock_irqsave(&up_cpumask_lock, flags); + } + + set_current_state(TASK_RUNNING); + + cpumask_clear(&up_cpumask); + spin_unlock_irqrestore(&up_cpumask_lock, flags); + + __cpufreq_driver_target(this_dbs_info->cur_policy, + target_freq, + CPUFREQ_RELATION_H); + if (policy->cur != policy->max) { + mutex_lock(&this_dbs_info->timer_mutex); + + schedule_delayed_work_on(0, &this_dbs_info->work, delay); + mutex_unlock(&this_dbs_info->timer_mutex); + cpufreq_adaptive_update_time(); + } + if (mutex_is_locked(&short_timer_mutex)) + mutex_unlock(&short_timer_mutex); + } + + return 0; +} + +static void cpufreq_adaptive_freq_down(struct work_struct *work) +{ + unsigned long flags; + struct cpu_dbs_info_s *this_dbs_info; + struct cpufreq_policy *policy; + int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); + + spin_lock_irqsave(&down_cpumask_lock, flags); + 
cpumask_clear(&down_cpumask); + spin_unlock_irqrestore(&down_cpumask_lock, flags); + + this_dbs_info = &per_cpu(od_cpu_dbs_info, 0); + policy = this_dbs_info->cur_policy; + + __cpufreq_driver_target(this_dbs_info->cur_policy, + target_freq, + CPUFREQ_RELATION_H); + + if (policy->cur != policy->min) { + mutex_lock(&this_dbs_info->timer_mutex); + + schedule_delayed_work_on(0, &this_dbs_info->work, delay); + mutex_unlock(&this_dbs_info->timer_mutex); + cpufreq_adaptive_update_time(); + } + + if (mutex_is_locked(&short_timer_mutex)) + mutex_unlock(&short_timer_mutex); +} + +static int __init cpufreq_gov_dbs_init(void) +{ + cputime64_t wall; + u64 idle_time; + int cpu = get_cpu(); + + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + go_maxspeed_load = DEFAULT_GO_MAXSPEED_LOAD; + keep_minspeed_load = DEFAULT_KEEP_MINSPEED_LOAD; + step_up_load = DEFAULT_STEPUP_LOAD; + + idle_time = get_cpu_idle_time_us(cpu, &wall); + put_cpu(); + if (idle_time != -1ULL) { + /* Idle micro accounting is supported. Use finer thresholds */ + dbs_tuners_ins.up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; + dbs_tuners_ins.down_differential = + MICRO_FREQUENCY_DOWN_DIFFERENTIAL; + /* + * In no_hz/micro accounting case we set the minimum frequency + * not depending on HZ, but fixed (very low). The deferred + * timer might skip some samples if idle/sleeping as needed. + */ + min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; + } else { + /* For correct statistics, we need 10 ticks for each measure */ + min_sampling_rate = + MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10); + } + + init_timer(&cpu_timer); + cpu_timer.function = cpufreq_adaptive_timer; + + up_task = kthread_create(cpufreq_adaptive_up_task, NULL, + "kadaptiveup"); + + if (IS_ERR(up_task)) + return PTR_ERR(up_task); + + sched_setscheduler_nocheck(up_task, SCHED_FIFO, ¶m); + get_task_struct(up_task); + + /* No rescuer thread, bind to CPU queuing the work for possibly + warm cache (probably doesn't matter much). */ + down_wq = alloc_workqueue("kadaptive_down", 0, 1); + + if (!down_wq) + goto err_freeuptask; + + INIT_WORK(&freq_scale_down_work, cpufreq_adaptive_freq_down); + + + return cpufreq_register_governor(&cpufreq_gov_adaptive); +err_freeuptask: + put_task_struct(up_task); + return -ENOMEM; +} + +static void __exit cpufreq_gov_dbs_exit(void) +{ + cpufreq_unregister_governor(&cpufreq_gov_adaptive); +} + + +MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>"); +MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>"); +MODULE_DESCRIPTION("'cpufreq_adaptive' - A dynamic cpufreq governor for " + "Low Latency Frequency Transition capable processors"); +MODULE_LICENSE("GPL"); + +#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ADAPTIVE +fs_initcall(cpufreq_gov_dbs_init); +#else +module_init(cpufreq_gov_dbs_init); +#endif +module_exit(cpufreq_gov_dbs_exit); diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c new file mode 100644 index 0000000..45266d5 --- /dev/null +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -0,0 +1,707 @@ +/* + * drivers/cpufreq/cpufreq_interactive.c + * + * Copyright (C) 2010 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Author: Mike Chan (mike@android.com) + * + */ + +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/tick.h> +#include <linux/time.h> +#include <linux/timer.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/mutex.h> + +#include <asm/cputime.h> + +static atomic_t active_count = ATOMIC_INIT(0); + +struct cpufreq_interactive_cpuinfo { + struct timer_list cpu_timer; + int timer_idlecancel; + u64 time_in_idle; + u64 idle_exit_time; + u64 timer_run_time; + int idling; + u64 freq_change_time; + u64 freq_change_time_in_idle; + struct cpufreq_policy *policy; + struct cpufreq_frequency_table *freq_table; + unsigned int target_freq; + int governor_enabled; +}; + +static DEFINE_PER_CPU(struct cpufreq_interactive_cpuinfo, cpuinfo); + +/* Workqueues handle frequency scaling */ +static struct task_struct *up_task; +static struct workqueue_struct *down_wq; +static struct work_struct freq_scale_down_work; +static cpumask_t up_cpumask; +static spinlock_t up_cpumask_lock; +static cpumask_t down_cpumask; +static spinlock_t down_cpumask_lock; +static struct mutex set_speed_lock; + +/* Hi speed to bump to from lo speed when load burst (default max) */ +static u64 hispeed_freq; + +/* Go to hi speed when CPU load at or above this value. */ +#define DEFAULT_GO_HISPEED_LOAD 95 +static unsigned long go_hispeed_load; + +/* + * The minimum amount of time to spend at a frequency before we can ramp down. + */ +#define DEFAULT_MIN_SAMPLE_TIME 20 * USEC_PER_MSEC +static unsigned long min_sample_time; + +/* + * The sample rate of the timer used to increase frequency + */ +#define DEFAULT_TIMER_RATE 20 * USEC_PER_MSEC +static unsigned long timer_rate; + +static int cpufreq_governor_interactive(struct cpufreq_policy *policy, + unsigned int event); + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE +static +#endif +struct cpufreq_governor cpufreq_gov_interactive = { + .name = "interactive", + .governor = cpufreq_governor_interactive, + .max_transition_latency = 10000000, + .owner = THIS_MODULE, +}; + +static void cpufreq_interactive_timer(unsigned long data) +{ + unsigned int delta_idle; + unsigned int delta_time; + int cpu_load; + int load_since_change; + u64 time_in_idle; + u64 idle_exit_time; + struct cpufreq_interactive_cpuinfo *pcpu = + &per_cpu(cpuinfo, data); + u64 now_idle; + unsigned int new_freq; + unsigned int index; + unsigned long flags; + + smp_rmb(); + + if (!pcpu->governor_enabled) + goto exit; + + /* + * Once pcpu->timer_run_time is updated to >= pcpu->idle_exit_time, + * this lets idle exit know the current idle time sample has + * been processed, and idle exit can generate a new sample and + * re-arm the timer. This prevents a concurrent idle + * exit on that CPU from writing a new set of info at the same time + * the timer function runs (the timer function can't use that info + * until more time passes). + */ + time_in_idle = pcpu->time_in_idle; + idle_exit_time = pcpu->idle_exit_time; + now_idle = get_cpu_idle_time_us(data, &pcpu->timer_run_time); + smp_wmb(); + + /* If we raced with cancelling a timer, skip. 
*/ + if (!idle_exit_time) + goto exit; + + delta_idle = (unsigned int) cputime64_sub(now_idle, time_in_idle); + delta_time = (unsigned int) cputime64_sub(pcpu->timer_run_time, + idle_exit_time); + + /* + * If timer ran less than 1ms after short-term sample started, retry. + */ + if (delta_time < 1000) + goto rearm; + + if (delta_idle > delta_time) + cpu_load = 0; + else + cpu_load = 100 * (delta_time - delta_idle) / delta_time; + + delta_idle = (unsigned int) cputime64_sub(now_idle, + pcpu->freq_change_time_in_idle); + delta_time = (unsigned int) cputime64_sub(pcpu->timer_run_time, + pcpu->freq_change_time); + + if ((delta_time == 0) || (delta_idle > delta_time)) + load_since_change = 0; + else + load_since_change = + 100 * (delta_time - delta_idle) / delta_time; + + /* + * Choose greater of short-term load (since last idle timer + * started or timer function re-armed itself) or long-term load + * (since last frequency change). + */ + if (load_since_change > cpu_load) + cpu_load = load_since_change; + + if (cpu_load >= go_hispeed_load) { + if (pcpu->policy->cur == pcpu->policy->min) + new_freq = hispeed_freq; + else + new_freq = pcpu->policy->max * cpu_load / 100; + } else { + new_freq = pcpu->policy->cur * cpu_load / 100; + } + + if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table, + new_freq, CPUFREQ_RELATION_H, + &index)) { + pr_warn_once("timer %d: cpufreq_frequency_table_target error\n", + (int) data); + goto rearm; + } + + new_freq = pcpu->freq_table[index].frequency; + + if (pcpu->target_freq == new_freq) + goto rearm_if_notmax; + + /* + * Do not scale down unless we have been at this frequency for the + * minimum sample time. + */ + if (new_freq < pcpu->target_freq) { + if (cputime64_sub(pcpu->timer_run_time, pcpu->freq_change_time) + < min_sample_time) + goto rearm; + } + + if (new_freq < pcpu->target_freq) { + pcpu->target_freq = new_freq; + spin_lock_irqsave(&down_cpumask_lock, flags); + cpumask_set_cpu(data, &down_cpumask); + spin_unlock_irqrestore(&down_cpumask_lock, flags); + queue_work(down_wq, &freq_scale_down_work); + } else { + pcpu->target_freq = new_freq; + spin_lock_irqsave(&up_cpumask_lock, flags); + cpumask_set_cpu(data, &up_cpumask); + spin_unlock_irqrestore(&up_cpumask_lock, flags); + wake_up_process(up_task); + } + +rearm_if_notmax: + /* + * Already set max speed and don't see a need to change that, + * wait until next idle to re-evaluate, don't need timer. + */ + if (pcpu->target_freq == pcpu->policy->max) + goto exit; + +rearm: + if (!timer_pending(&pcpu->cpu_timer)) { + /* + * If already at min: if that CPU is idle, don't set timer. + * Else cancel the timer if that CPU goes idle. We don't + * need to re-evaluate speed until the next idle exit. + */ + if (pcpu->target_freq == pcpu->policy->min) { + smp_rmb(); + + if (pcpu->idling) + goto exit; + + pcpu->timer_idlecancel = 1; + } + + pcpu->time_in_idle = get_cpu_idle_time_us( + data, &pcpu->idle_exit_time); + mod_timer(&pcpu->cpu_timer, + jiffies + usecs_to_jiffies(timer_rate)); + } + +exit: + return; +} + +static void cpufreq_interactive_idle_start(void) +{ + struct cpufreq_interactive_cpuinfo *pcpu = + &per_cpu(cpuinfo, smp_processor_id()); + int pending; + + if (!pcpu->governor_enabled) + return; + + pcpu->idling = 1; + smp_wmb(); + pending = timer_pending(&pcpu->cpu_timer); + + if (pcpu->target_freq != pcpu->policy->min) { +#ifdef CONFIG_SMP + /* + * Entering idle while not at lowest speed. On some + * platforms this can hold the other CPU(s) at that speed + * even though the CPU is idle. 
Set a timer to re-evaluate + * speed so this idle CPU doesn't hold the other CPUs above + * min indefinitely. This should probably be a quirk of + * the CPUFreq driver. + */ + if (!pending) { + pcpu->time_in_idle = get_cpu_idle_time_us( + smp_processor_id(), &pcpu->idle_exit_time); + pcpu->timer_idlecancel = 0; + mod_timer(&pcpu->cpu_timer, + jiffies + usecs_to_jiffies(timer_rate)); + } +#endif + } else { + /* + * If at min speed and entering idle after load has + * already been evaluated, and a timer has been set just in + * case the CPU suddenly goes busy, cancel that timer. The + * CPU didn't go busy; we'll recheck things upon idle exit. + */ + if (pending && pcpu->timer_idlecancel) { + del_timer(&pcpu->cpu_timer); + /* + * Ensure last timer run time is after current idle + * sample start time, so next idle exit will always + * start a new idle sampling period. + */ + pcpu->idle_exit_time = 0; + pcpu->timer_idlecancel = 0; + } + } + +} + +static void cpufreq_interactive_idle_end(void) +{ + struct cpufreq_interactive_cpuinfo *pcpu = + &per_cpu(cpuinfo, smp_processor_id()); + + pcpu->idling = 0; + smp_wmb(); + + /* + * Arm the timer for 1-2 ticks later if not already, and if the timer + * function has already processed the previous load sampling + * interval. (If the timer is not pending but has not processed + * the previous interval, it is probably racing with us on another + * CPU. Let it compute load based on the previous sample and then + * re-arm the timer for another interval when it's done, rather + * than updating the interval start time to be "now", which doesn't + * give the timer function enough time to make a decision on this + * run.) + */ + if (timer_pending(&pcpu->cpu_timer) == 0 && + pcpu->timer_run_time >= pcpu->idle_exit_time && + pcpu->governor_enabled) { + pcpu->time_in_idle = + get_cpu_idle_time_us(smp_processor_id(), + &pcpu->idle_exit_time); + pcpu->timer_idlecancel = 0; + mod_timer(&pcpu->cpu_timer, + jiffies + usecs_to_jiffies(timer_rate)); + } + +} + +static int cpufreq_interactive_up_task(void *data) +{ + unsigned int cpu; + cpumask_t tmp_mask; + unsigned long flags; + struct cpufreq_interactive_cpuinfo *pcpu; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&up_cpumask_lock, flags); + + if (cpumask_empty(&up_cpumask)) { + spin_unlock_irqrestore(&up_cpumask_lock, flags); + schedule(); + + if (kthread_should_stop()) + break; + + spin_lock_irqsave(&up_cpumask_lock, flags); + } + + set_current_state(TASK_RUNNING); + tmp_mask = up_cpumask; + cpumask_clear(&up_cpumask); + spin_unlock_irqrestore(&up_cpumask_lock, flags); + + for_each_cpu(cpu, &tmp_mask) { + unsigned int j; + unsigned int max_freq = 0; + + pcpu = &per_cpu(cpuinfo, cpu); + smp_rmb(); + + if (!pcpu->governor_enabled) + continue; + + mutex_lock(&set_speed_lock); + + for_each_cpu(j, pcpu->policy->cpus) { + struct cpufreq_interactive_cpuinfo *pjcpu = + &per_cpu(cpuinfo, j); + + if (pjcpu->target_freq > max_freq) + max_freq = pjcpu->target_freq; + } + + if (max_freq != pcpu->policy->cur) + __cpufreq_driver_target(pcpu->policy, + max_freq, + CPUFREQ_RELATION_H); + mutex_unlock(&set_speed_lock); + + pcpu->freq_change_time_in_idle = + get_cpu_idle_time_us(cpu, + &pcpu->freq_change_time); + } + } + + return 0; +} + +static void cpufreq_interactive_freq_down(struct work_struct *work) +{ + unsigned int cpu; + cpumask_t tmp_mask; + unsigned long flags; + struct cpufreq_interactive_cpuinfo *pcpu; + + spin_lock_irqsave(&down_cpumask_lock, flags); + tmp_mask = down_cpumask; + 
cpumask_clear(&down_cpumask); + spin_unlock_irqrestore(&down_cpumask_lock, flags); + + for_each_cpu(cpu, &tmp_mask) { + unsigned int j; + unsigned int max_freq = 0; + + pcpu = &per_cpu(cpuinfo, cpu); + smp_rmb(); + + if (!pcpu->governor_enabled) + continue; + + mutex_lock(&set_speed_lock); + + for_each_cpu(j, pcpu->policy->cpus) { + struct cpufreq_interactive_cpuinfo *pjcpu = + &per_cpu(cpuinfo, j); + + if (pjcpu->target_freq > max_freq) + max_freq = pjcpu->target_freq; + } + + if (max_freq != pcpu->policy->cur) + __cpufreq_driver_target(pcpu->policy, max_freq, + CPUFREQ_RELATION_H); + + mutex_unlock(&set_speed_lock); + pcpu->freq_change_time_in_idle = + get_cpu_idle_time_us(cpu, + &pcpu->freq_change_time); + } +} + +static ssize_t show_hispeed_freq(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", hispeed_freq); +} + +static ssize_t store_hispeed_freq(struct kobject *kobj, + struct attribute *attr, const char *buf, + size_t count) +{ + int ret; + u64 val; + + ret = strict_strtoull(buf, 0, &val); + if (ret < 0) + return ret; + hispeed_freq = val; + return count; +} + +static struct global_attr hispeed_freq_attr = __ATTR(hispeed_freq, 0644, + show_hispeed_freq, store_hispeed_freq); + + +static ssize_t show_go_hispeed_load(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", go_hispeed_load); +} + +static ssize_t store_go_hispeed_load(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t count) +{ + int ret; + unsigned long val; + + ret = strict_strtoul(buf, 0, &val); + if (ret < 0) + return ret; + go_hispeed_load = val; + return count; +} + +static struct global_attr go_hispeed_load_attr = __ATTR(go_hispeed_load, 0644, + show_go_hispeed_load, store_go_hispeed_load); + +static ssize_t show_min_sample_time(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", min_sample_time); +} + +static ssize_t store_min_sample_time(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t count) +{ + int ret; + unsigned long val; + + ret = strict_strtoul(buf, 0, &val); + if (ret < 0) + return ret; + min_sample_time = val; + return count; +} + +static struct global_attr min_sample_time_attr = __ATTR(min_sample_time, 0644, + show_min_sample_time, store_min_sample_time); + +static ssize_t show_timer_rate(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", timer_rate); +} + +static ssize_t store_timer_rate(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t count) +{ + int ret; + unsigned long val; + + ret = strict_strtoul(buf, 0, &val); + if (ret < 0) + return ret; + timer_rate = val; + return count; +} + +static struct global_attr timer_rate_attr = __ATTR(timer_rate, 0644, + show_timer_rate, store_timer_rate); + +static struct attribute *interactive_attributes[] = { + &hispeed_freq_attr.attr, + &go_hispeed_load_attr.attr, + &min_sample_time_attr.attr, + &timer_rate_attr.attr, + NULL, +}; + +static struct attribute_group interactive_attr_group = { + .attrs = interactive_attributes, + .name = "interactive", +}; + +static int cpufreq_governor_interactive(struct cpufreq_policy *policy, + unsigned int event) +{ + int rc; + unsigned int j; + struct cpufreq_interactive_cpuinfo *pcpu; + struct cpufreq_frequency_table *freq_table; + + switch (event) { + case CPUFREQ_GOV_START: + if (!cpu_online(policy->cpu)) + return -EINVAL; + + freq_table = + cpufreq_frequency_get_table(policy->cpu); + + 
for_each_cpu(j, policy->cpus) { + pcpu = &per_cpu(cpuinfo, j); + pcpu->policy = policy; + pcpu->target_freq = policy->cur; + pcpu->freq_table = freq_table; + pcpu->freq_change_time_in_idle = + get_cpu_idle_time_us(j, + &pcpu->freq_change_time); + pcpu->governor_enabled = 1; + smp_wmb(); + } + + if (!hispeed_freq) + hispeed_freq = policy->max; + + /* + * Do not register the idle hook and create sysfs + * entries if we have already done so. + */ + if (atomic_inc_return(&active_count) > 1) + return 0; + + rc = sysfs_create_group(cpufreq_global_kobject, + &interactive_attr_group); + if (rc) + return rc; + + break; + + case CPUFREQ_GOV_STOP: + for_each_cpu(j, policy->cpus) { + pcpu = &per_cpu(cpuinfo, j); + pcpu->governor_enabled = 0; + smp_wmb(); + del_timer_sync(&pcpu->cpu_timer); + + /* + * Reset idle exit time since we may cancel the timer + * before it can run after the last idle exit time, + * to avoid tripping the check in idle exit for a timer + * that is trying to run. + */ + pcpu->idle_exit_time = 0; + } + + flush_work(&freq_scale_down_work); + if (atomic_dec_return(&active_count) > 0) + return 0; + + sysfs_remove_group(cpufreq_global_kobject, + &interactive_attr_group); + + break; + + case CPUFREQ_GOV_LIMITS: + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, + policy->max, CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, + policy->min, CPUFREQ_RELATION_L); + break; + } + return 0; +} + +static int cpufreq_interactive_idle_notifier(struct notifier_block *nb, + unsigned long val, + void *data) +{ + switch (val) { + case IDLE_START: + cpufreq_interactive_idle_start(); + break; + case IDLE_END: + cpufreq_interactive_idle_end(); + break; + } + + return 0; +} + +static struct notifier_block cpufreq_interactive_idle_nb = { + .notifier_call = cpufreq_interactive_idle_notifier, +}; + +static int __init cpufreq_interactive_init(void) +{ + unsigned int i; + struct cpufreq_interactive_cpuinfo *pcpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + go_hispeed_load = DEFAULT_GO_HISPEED_LOAD; + min_sample_time = DEFAULT_MIN_SAMPLE_TIME; + timer_rate = DEFAULT_TIMER_RATE; + + /* Initalize per-cpu timers */ + for_each_possible_cpu(i) { + pcpu = &per_cpu(cpuinfo, i); + init_timer(&pcpu->cpu_timer); + pcpu->cpu_timer.function = cpufreq_interactive_timer; + pcpu->cpu_timer.data = i; + } + + up_task = kthread_create(cpufreq_interactive_up_task, NULL, + "kinteractiveup"); + if (IS_ERR(up_task)) + return PTR_ERR(up_task); + + sched_setscheduler_nocheck(up_task, SCHED_FIFO, ¶m); + get_task_struct(up_task); + + /* No rescuer thread, bind to CPU queuing the work for possibly + warm cache (probably doesn't matter much). 
*/ + down_wq = alloc_workqueue("knteractive_down", 0, 1); + + if (!down_wq) + goto err_freeuptask; + + INIT_WORK(&freq_scale_down_work, + cpufreq_interactive_freq_down); + + spin_lock_init(&up_cpumask_lock); + spin_lock_init(&down_cpumask_lock); + mutex_init(&set_speed_lock); + + idle_notifier_register(&cpufreq_interactive_idle_nb); + + return cpufreq_register_governor(&cpufreq_gov_interactive); + +err_freeuptask: + put_task_struct(up_task); + return -ENOMEM; +} + +#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE +fs_initcall(cpufreq_interactive_init); +#else +module_init(cpufreq_interactive_init); +#endif + +static void __exit cpufreq_interactive_exit(void) +{ + cpufreq_unregister_governor(&cpufreq_gov_interactive); + kthread_stop(up_task); + put_task_struct(up_task); + destroy_workqueue(down_wq); +} + +module_exit(cpufreq_interactive_exit); + +MODULE_AUTHOR("Mike Chan <mike@android.com>"); +MODULE_DESCRIPTION("'cpufreq_interactive' - A cpufreq governor for " + "Latency sensitive workloads"); +MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 891360e..68a15b6 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -22,6 +22,7 @@ #include <linux/tick.h> #include <linux/ktime.h> #include <linux/sched.h> +#include <linux/pm_qos_params.h> /* * dbs is used in this file as a shortform for demandbased switching @@ -93,6 +94,10 @@ struct cpu_dbs_info_s { * when user is changing the governor or limits. */ struct mutex timer_mutex; + bool activated; /* dbs_timer_init is in effect */ +#ifdef CONFIG_CPU_FREQ_GOV_ONDEMAND_FLEXRATE + unsigned int flex_duration; +#endif }; static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); @@ -102,6 +107,9 @@ static unsigned int dbs_enable; /* number of CPUs using this policy */ * dbs_mutex protects dbs_enable in governor start/stop. */ static DEFINE_MUTEX(dbs_mutex); +#ifdef CONFIG_CPU_FREQ_GOV_ONDEMAND_FLEXRATE +static DEFINE_MUTEX(flex_mutex); +#endif static struct dbs_tuners { unsigned int sampling_rate; @@ -111,12 +119,20 @@ static struct dbs_tuners { unsigned int sampling_down_factor; unsigned int powersave_bias; unsigned int io_is_busy; + struct notifier_block dvfs_lat_qos_db; + unsigned int dvfs_lat_qos_wants; + unsigned int freq_step; +#ifdef CONFIG_CPU_FREQ_GOV_ONDEMAND_FLEXRATE + unsigned int flex_sampling_rate; + unsigned int flex_duration; +#endif } dbs_tuners_ins = { .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR, .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL, .ignore_nice = 0, .powersave_bias = 0, + .freq_step = 100, }; static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, @@ -163,6 +179,23 @@ static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, cputime64_t *wal } /* + * Find right sampling rate based on sampling_rate and + * QoS requests on dvfs latency. + */ +static unsigned int effective_sampling_rate(void) +{ + unsigned int effective; + + if (dbs_tuners_ins.dvfs_lat_qos_wants) + effective = min(dbs_tuners_ins.dvfs_lat_qos_wants, + dbs_tuners_ins.sampling_rate); + else + effective = dbs_tuners_ins.sampling_rate; + + return max(effective, min_sampling_rate); +} + +/* * Find right freq to be set now with powersave_bias on. * Returns the freq_hi to be used right now and will set freq_hi_jiffies, * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs. 
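The effective_sampling_rate() helper added in the hunk above reduces to a cap-and-floor selection: a nonzero DVFS-latency QoS request caps the configured sampling_rate, and min_sampling_rate is always the floor. A standalone restatement, with the hypothetical name `pick_effective_rate` and values in microseconds as in the governor:

```c
/*
 * Sketch only: mirrors the selection made by effective_sampling_rate()
 * in the hunk above. qos_wants == 0 means no QoS request is active.
 */
static unsigned int pick_effective_rate(unsigned int sampling_rate,
					unsigned int qos_wants,
					unsigned int floor)
{
	unsigned int effective = sampling_rate;

	/* A pending DVFS-latency QoS request can only lower the rate. */
	if (qos_wants && qos_wants < effective)
		effective = qos_wants;

	/* Never poll faster than the kernel/HW minimum allows. */
	return effective > floor ? effective : floor;
}
```

For example, with sampling_rate = 100000 and a QoS request of 10000, the governor polls every 10 ms instead of every 100 ms; the update_sampling_rate() change in the next hunk is what makes such a reduction take effect immediately rather than after the currently scheduled sample.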
@@ -206,7 +239,7 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, dbs_info->freq_lo_jiffies = 0; return freq_lo; } - jiffies_total = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); + jiffies_total = usecs_to_jiffies(effective_sampling_rate()); jiffies_hi = (freq_avg - freq_lo) * jiffies_total; jiffies_hi += ((freq_hi - freq_lo) / 2); jiffies_hi /= (freq_hi - freq_lo); @@ -255,6 +288,95 @@ show_one(up_threshold, up_threshold); show_one(sampling_down_factor, sampling_down_factor); show_one(ignore_nice_load, ignore_nice); show_one(powersave_bias, powersave_bias); +show_one(down_differential, down_differential); +show_one(freq_step, freq_step); + +/** + * update_sampling_rate - update sampling rate effective immediately if needed. + * @new_rate: new sampling rate. If it is 0, regard sampling rate is not + * changed and assume that qos request value is changed. + * + * If new rate is smaller than the old, simply updaing + * dbs_tuners_int.sampling_rate might not be appropriate. For example, + * if the original sampling_rate was 1 second and the requested new sampling + * rate is 10 ms because the user needs immediate reaction from ondemand + * governor, but not sure if higher frequency will be required or not, + * then, the governor may change the sampling rate too late; up to 1 second + * later. Thus, if we are reducing the sampling rate, we need to make the + * new value effective immediately. + */ +static void update_sampling_rate(unsigned int new_rate) +{ + int cpu; + unsigned int effective; + + if (new_rate) + dbs_tuners_ins.sampling_rate = max(new_rate, min_sampling_rate); + + effective = effective_sampling_rate(); + + for_each_online_cpu(cpu) { + struct cpufreq_policy *policy; + struct cpu_dbs_info_s *dbs_info; + unsigned long next_sampling, appointed_at; + + /* + * mutex_destory(&dbs_info->timer_mutex) should not happen + * in this context. dbs_mutex is locked/unlocked at GOV_START + * and GOV_STOP context only other than here. + */ + mutex_lock(&dbs_mutex); + + policy = cpufreq_cpu_get(cpu); + if (!policy) { + mutex_unlock(&dbs_mutex); + continue; + } + dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); + cpufreq_cpu_put(policy); + + /* timer_mutex is destroyed or will be destroyed soon */ + if (!dbs_info->activated) { + mutex_unlock(&dbs_mutex); + continue; + } + + mutex_lock(&dbs_info->timer_mutex); + + if (!delayed_work_pending(&dbs_info->work)) { + mutex_unlock(&dbs_info->timer_mutex); + mutex_unlock(&dbs_mutex); + continue; + } + + next_sampling = jiffies + usecs_to_jiffies(new_rate); + appointed_at = dbs_info->work.timer.expires; + + if (time_before(next_sampling, appointed_at)) { + mutex_unlock(&dbs_info->timer_mutex); + cancel_delayed_work_sync(&dbs_info->work); + mutex_lock(&dbs_info->timer_mutex); + + schedule_delayed_work_on(dbs_info->cpu, &dbs_info->work, + usecs_to_jiffies(effective)); + } + mutex_unlock(&dbs_info->timer_mutex); + + /* + * For the little possiblity that dbs_timer_exit() has been + * called after checking dbs_info->activated above. + * If cancel_delayed_work_syn() has been calld by + * dbs_timer_exit() before schedule_delayed_work_on() of this + * function, it should be revoked by calling cancel again + * before releasing dbs_mutex, which will trigger mutex_destroy + * to be called. 
+ */ + if (!dbs_info->activated) + cancel_delayed_work_sync(&dbs_info->work); + + mutex_unlock(&dbs_mutex); + } +} static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b, const char *buf, size_t count) @@ -264,7 +386,7 @@ static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b, ret = sscanf(buf, "%u", &input); if (ret != 1) return -EINVAL; - dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate); + update_sampling_rate(input); return count; } @@ -367,12 +489,46 @@ static ssize_t store_powersave_bias(struct kobject *a, struct attribute *b, return count; } +static ssize_t store_down_differential(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.down_differential = min(input, 100u); + return count; +} + +static ssize_t store_freq_step(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.freq_step = min(input, 100u); + return count; +} + + define_one_global_rw(sampling_rate); define_one_global_rw(io_is_busy); define_one_global_rw(up_threshold); define_one_global_rw(sampling_down_factor); define_one_global_rw(ignore_nice_load); define_one_global_rw(powersave_bias); +define_one_global_rw(down_differential); +define_one_global_rw(freq_step); +#ifdef CONFIG_CPU_FREQ_GOV_ONDEMAND_FLEXRATE +static struct global_attr flexrate_request; +static struct global_attr flexrate_duration; +static struct global_attr flexrate_enable; +static struct global_attr flexrate_forcerate; +static struct global_attr flexrate_num_effective_usage; +#endif static struct attribute *dbs_attributes[] = { &sampling_rate_min.attr, @@ -382,6 +538,15 @@ static struct attribute *dbs_attributes[] = { &ignore_nice_load.attr, &powersave_bias.attr, &io_is_busy.attr, + &down_differential.attr, + &freq_step.attr, +#ifdef CONFIG_CPU_FREQ_GOV_ONDEMAND_FLEXRATE + &flexrate_request.attr, + &flexrate_duration.attr, + &flexrate_enable.attr, + &flexrate_forcerate.attr, + &flexrate_num_effective_usage.attr, +#endif NULL }; @@ -396,8 +561,10 @@ static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq) { if (dbs_tuners_ins.powersave_bias) freq = powersave_bias_target(p, freq, CPUFREQ_RELATION_H); +#if !defined(CONFIG_ARCH_EXYNOS4) && !defined(CONFIG_ARCH_EXYNOS5) else if (p->cur == p->max) return; +#endif __cpufreq_driver_target(p, freq, dbs_tuners_ins.powersave_bias ? 
 			CPUFREQ_RELATION_L : CPUFREQ_RELATION_H);
@@ -495,18 +662,22 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 
 	/* Check for frequency increase */
 	if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) {
+		int inc = (policy->max * dbs_tuners_ins.freq_step) / 100;
+		int target = min(policy->max, policy->cur + inc);
 		/* If switching to max speed, apply sampling_down_factor */
-		if (policy->cur < policy->max)
+		if (policy->cur < policy->max && target == policy->max)
 			this_dbs_info->rate_mult =
 				dbs_tuners_ins.sampling_down_factor;
-		dbs_freq_increase(policy, policy->max);
+		dbs_freq_increase(policy, target);
 		return;
 	}
 
 	/* Check for frequency decrease */
+#if !defined(CONFIG_ARCH_EXYNOS4) && !defined(CONFIG_ARCH_EXYNOS5)
 	/* if we cannot reduce the frequency anymore, break out early */
 	if (policy->cur == policy->min)
 		return;
+#endif
 
 	/*
 	 * The optimal frequency is the frequency that is the lowest that
@@ -563,7 +734,7 @@ static void do_dbs_timer(struct work_struct *work)
 		/* We want all CPUs to do sampling nearly on
 		 * same jiffy
 		 */
-		delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate
+		delay = usecs_to_jiffies(effective_sampling_rate()
 			* dbs_info->rate_mult);
 
 		if (num_online_cpus() > 1)
@@ -574,6 +745,23 @@
 			dbs_info->freq_lo, CPUFREQ_RELATION_H);
 		delay = dbs_info->freq_lo_jiffies;
 	}
+#ifdef CONFIG_CPU_FREQ_GOV_ONDEMAND_FLEXRATE
+	if (dbs_info->flex_duration) {
+		struct cpufreq_policy *policy = dbs_info->cur_policy;
+
+		mutex_lock(&flex_mutex);
+		delay = usecs_to_jiffies(dbs_tuners_ins.flex_sampling_rate);
+
+		/* If it's already max, we don't need to iterate fast */
+		if (policy->cur >= policy->max)
+			dbs_info->flex_duration = 1;
+
+		if (--dbs_info->flex_duration < dbs_tuners_ins.flex_duration) {
+			dbs_tuners_ins.flex_duration = dbs_info->flex_duration;
+		}
+		mutex_unlock(&flex_mutex);
+	}
+#endif /* CONFIG_CPU_FREQ_GOV_ONDEMAND_FLEXRATE */
 	schedule_delayed_work_on(cpu, &dbs_info->work, delay);
 	mutex_unlock(&dbs_info->timer_mutex);
 }
@@ -581,18 +769,20 @@
 static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
 {
 	/* We want all CPUs to do sampling nearly on same jiffy */
-	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
+	int delay = usecs_to_jiffies(effective_sampling_rate());
 
 	if (num_online_cpus() > 1)
 		delay -= jiffies % delay;
 
 	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
 	INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
-	schedule_delayed_work_on(dbs_info->cpu, &dbs_info->work, delay);
+	schedule_delayed_work_on(dbs_info->cpu, &dbs_info->work, 10 * delay);
+	dbs_info->activated = true;
 }
 
 static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
 {
+	dbs_info->activated = false;
 	cancel_delayed_work_sync(&dbs_info->work);
 }
 
@@ -711,11 +901,40 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 	return 0;
 }
 
+/**
+ * qos_dvfs_lat_notify - PM QoS Notifier for DVFS_LATENCY QoS Request
+ * @nb:		notifier block struct
+ * @value:	QoS value
+ * @dummy:	unused
+ */
+static int qos_dvfs_lat_notify(struct notifier_block *nb, unsigned long value,
+			       void *dummy)
+{
+	/*
+	 * In the worst case, with the CPU load oscillating around
+	 * up_threshold (from up_threshold - e to up_threshold + e), the
+	 * ondemand governor may take up to sampling_rate * 2 to react.
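+	 * (Illustrative numbers: a requested DVFS response latency of
+	 * 100 ms can therefore only be guaranteed with a sampling rate
+	 * of 50 ms, even when the load edge lands just after a sample.)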
+ * + * Thus, based on the worst case scenario, we use value / 2; + */ + dbs_tuners_ins.dvfs_lat_qos_wants = value / 2; + + /* Update sampling rate */ + update_sampling_rate(0); + + return NOTIFY_OK; +} + +static struct notifier_block ondemand_qos_dvfs_lat_nb = { + .notifier_call = qos_dvfs_lat_notify, +}; + static int __init cpufreq_gov_dbs_init(void) { cputime64_t wall; u64 idle_time; int cpu = get_cpu(); + int err = 0; idle_time = get_cpu_idle_time_us(cpu, &wall); put_cpu(); @@ -736,14 +955,241 @@ static int __init cpufreq_gov_dbs_init(void) MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10); } - return cpufreq_register_governor(&cpufreq_gov_ondemand); + err = pm_qos_add_notifier(PM_QOS_DVFS_RESPONSE_LATENCY, + &ondemand_qos_dvfs_lat_nb); + if (err) + return err; + + err = cpufreq_register_governor(&cpufreq_gov_ondemand); + if (err) { + pm_qos_remove_notifier(PM_QOS_DVFS_RESPONSE_LATENCY, + &ondemand_qos_dvfs_lat_nb); + } + + return err; } static void __exit cpufreq_gov_dbs_exit(void) { + pm_qos_remove_notifier(PM_QOS_DVFS_RESPONSE_LATENCY, + &ondemand_qos_dvfs_lat_nb); + cpufreq_unregister_governor(&cpufreq_gov_ondemand); } +#ifdef CONFIG_CPU_FREQ_GOV_ONDEMAND_FLEXRATE +static unsigned int max_duration = + (CONFIG_CPU_FREQ_GOV_ONDEMAND_FLEXRATE_MAX_DURATION); +#define DEFAULT_DURATION (5) +static unsigned int sysfs_duration = DEFAULT_DURATION; +static bool flexrate_enabled = true; +static unsigned int forced_rate; +static unsigned int flexrate_num_effective; + +static int cpufreq_ondemand_flexrate_do(struct cpufreq_policy *policy, + bool now) +{ + unsigned int cpu = policy->cpu; + bool using_ondemand; + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + + WARN(!mutex_is_locked(&flex_mutex), "flex_mutex not locked\n"); + + dbs_info->flex_duration = dbs_tuners_ins.flex_duration; + + if (now) { + flexrate_num_effective++; + + mutex_lock(&dbs_mutex); + using_ondemand = dbs_enable && !strncmp(policy->governor->name, "ondemand", 8); + mutex_unlock(&dbs_mutex); + + if (!using_ondemand) + return 0; + + mutex_unlock(&flex_mutex); + mutex_lock(&dbs_info->timer_mutex); + + /* Do It! 
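+		 * (cancel the pending slow sample and re-queue it one
+		 * jiffy out, so the new flex_sampling_rate takes effect
+		 * immediately)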
+		 */
+		cancel_delayed_work_sync(&dbs_info->work);
+		schedule_delayed_work_on(cpu, &dbs_info->work, 1);
+
+		mutex_unlock(&dbs_info->timer_mutex);
+		mutex_lock(&flex_mutex);
+	}
+
+	return 0;
+}
+
+int cpufreq_ondemand_flexrate_request(unsigned int rate_us,
+				      unsigned int duration)
+{
+	int err = 0;
+
+	if (!flexrate_enabled)
+		return 0;
+
+	if (forced_rate)
+		rate_us = forced_rate;
+
+	mutex_lock(&flex_mutex);
+
+	/* Unnecessary requests are dropped */
+	if (rate_us >= dbs_tuners_ins.sampling_rate)
+		goto out;
+	if (rate_us >= dbs_tuners_ins.flex_sampling_rate &&
+	    duration <= dbs_tuners_ins.flex_duration)
+		goto out;
+
+	duration = min(max_duration, duration);
+	if (rate_us > 0 && rate_us < min_sampling_rate)
+		rate_us = min_sampling_rate;
+
+	err = 1; /* Need update */
+
+	/* Cancel the active flexrate requests */
+	if (rate_us == 0 || duration == 0) {
+		dbs_tuners_ins.flex_duration = 0;
+		dbs_tuners_ins.flex_sampling_rate = 0;
+		goto out;
+	}
+
+	if (dbs_tuners_ins.flex_sampling_rate == 0 ||
+	    dbs_tuners_ins.flex_sampling_rate > rate_us)
+		err = 2; /* Need to poll faster */
+
+	/* Set new flexrate per the request */
+	dbs_tuners_ins.flex_sampling_rate =
+		min(dbs_tuners_ins.flex_sampling_rate, rate_us);
+	dbs_tuners_ins.flex_duration =
+		max(dbs_tuners_ins.flex_duration, duration);
+out:
+	/* Apply new flexrate */
+	if (err > 0) {
+		bool now = (err == 2);
+		int cpu = 0;
+
+		/* TODO: For every CPU using ONDEMAND */
+		err = cpufreq_ondemand_flexrate_do(cpufreq_cpu_get(cpu), now);
+	}
+	mutex_unlock(&flex_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(cpufreq_ondemand_flexrate_request);
+
+static ssize_t store_flexrate_request(struct kobject *a, struct attribute *b,
+				      const char *buf, size_t count)
+{
+	unsigned int rate;
+	int ret;
+
+	ret = sscanf(buf, "%u", &rate);
+	if (ret != 1)
+		return -EINVAL;
+
+	ret = cpufreq_ondemand_flexrate_request(rate, sysfs_duration);
+	if (ret)
+		return ret;
+	return count;
+}
+
+static ssize_t show_flexrate_request(struct kobject *a, struct attribute *b,
+				     char *buf)
+{
+	return sprintf(buf, "Flexrate temporarily shortens the CPUFreq ondemand governor's polling interval.\n"
+		       "Usage Example:\n"
+		       "# echo 8 > flexrate_duration\n"
+		       "# echo 10000 > flexrate_request\n"
+		       "With the second statement, ondemand polls at a 10 ms (10000 us) interval 8 times.\n"
+		       "Run \"cat flexrate_duration\" to see the current duration setting.\n");
+}
+
+static ssize_t store_flexrate_duration(struct kobject *a, struct attribute *b,
+				       const char *buf, size_t count)
+{
+	unsigned int duration;
+	int ret;
+
+	/* mutex not needed for flexrate_sysfs_duration */
+	ret = sscanf(buf, "%u", &duration);
+	if (ret != 1)
+		return -EINVAL;
+
+	if (duration == 0)
+		duration = DEFAULT_DURATION;
+	if (duration > max_duration)
+		duration = max_duration;
+
+	sysfs_duration = duration;
+	return count;
+}
+
+static ssize_t show_flexrate_duration(struct kobject *a, struct attribute *b,
+				      char *buf)
+{
+	return sprintf(buf, "%d\n", sysfs_duration);
+}
+
+static ssize_t store_flexrate_enable(struct kobject *a, struct attribute *b,
+				     const char *buf, size_t count)
+{
+	unsigned int input;
+	int ret;
+
+	ret = sscanf(buf, "%u", &input);
+	if (ret != 1)
+		return -EINVAL;
+
+	if (input > 0)
+		flexrate_enabled = true;
+	else
+		flexrate_enabled = false;
+
+	return count;
+}
+
+static ssize_t show_flexrate_enable(struct kobject *a, struct attribute *b,
+				    char *buf)
+{
+	return sprintf(buf, "%d\n", !!flexrate_enabled);
+}
+
+static ssize_t store_flexrate_forcerate(struct kobject *a, struct attribute *b,
+					const char *buf,
size_t count) +{ + unsigned int rate; + int ret; + + ret = sscanf(buf, "%u", &rate); + if (ret != 1) + return -EINVAL; + + forced_rate = rate; + + pr_info("CAUTION: flexrate_forcerate is for debugging/benchmarking only.\n"); + return count; +} + +static ssize_t show_flexrate_forcerate(struct kobject *a, struct attribute *b, + char *buf) +{ + return sprintf(buf, "%u\n", forced_rate); +} + +static ssize_t show_flexrate_num_effective_usage(struct kobject *a, + struct attribute *b, + char *buf) +{ + return sprintf(buf, "%u\n", flexrate_num_effective); +} + +define_one_global_rw(flexrate_request); +define_one_global_rw(flexrate_duration); +define_one_global_rw(flexrate_enable); +define_one_global_rw(flexrate_forcerate); +define_one_global_ro(flexrate_num_effective_usage); +#endif + MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>"); MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>"); diff --git a/drivers/cpufreq/cpufreq_pegasusq.c b/drivers/cpufreq/cpufreq_pegasusq.c new file mode 100644 index 0000000..4a90a01 --- /dev/null +++ b/drivers/cpufreq/cpufreq_pegasusq.c @@ -0,0 +1,1411 @@ +/* + * drivers/cpufreq/cpufreq_pegasusq.c + * + * Copyright (C) 2011 Samsung Electronics co. ltd + * ByungChang Cha <bc.cha@samsung.com> + * + * Based on ondemand governor + * Copyright (C) 2001 Russell King + * (C) 2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>. + * Jun Nakajima <jun.nakajima@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/jiffies.h> +#include <linux/kernel_stat.h> +#include <linux/mutex.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/ktime.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/suspend.h> +#include <linux/reboot.h> + +#ifdef CONFIG_HAS_EARLYSUSPEND +#include <linux/earlysuspend.h> +#endif + +/* + * runqueue average + */ + +#define RQ_AVG_TIMER_RATE 10 + +struct runqueue_data { + unsigned int nr_run_avg; + unsigned int update_rate; + int64_t last_time; + int64_t total_time; + struct delayed_work work; + struct workqueue_struct *nr_run_wq; + spinlock_t lock; +}; + +static struct runqueue_data *rq_data; +static void rq_work_fn(struct work_struct *work); + +static void start_rq_work(void) +{ + rq_data->nr_run_avg = 0; + rq_data->last_time = 0; + rq_data->total_time = 0; + if (rq_data->nr_run_wq == NULL) + rq_data->nr_run_wq = + create_singlethread_workqueue("nr_run_avg"); + + queue_delayed_work(rq_data->nr_run_wq, &rq_data->work, + msecs_to_jiffies(rq_data->update_rate)); + return; +} + +static void stop_rq_work(void) +{ + if (rq_data->nr_run_wq) + cancel_delayed_work(&rq_data->work); + return; +} + +static int __init init_rq_avg(void) +{ + rq_data = kzalloc(sizeof(struct runqueue_data), GFP_KERNEL); + if (rq_data == NULL) { + pr_err("%s cannot allocate memory\n", __func__); + return -ENOMEM; + } + spin_lock_init(&rq_data->lock); + rq_data->update_rate = RQ_AVG_TIMER_RATE; + INIT_DELAYED_WORK_DEFERRABLE(&rq_data->work, rq_work_fn); + + return 0; +} + +static void rq_work_fn(struct work_struct *work) +{ + int64_t time_diff = 0; + int64_t nr_run = 0; + unsigned long flags = 0; + int64_t cur_time = ktime_to_ns(ktime_get()); + + spin_lock_irqsave(&rq_data->lock, 
flags); + + if (rq_data->last_time == 0) + rq_data->last_time = cur_time; + if (rq_data->nr_run_avg == 0) + rq_data->total_time = 0; + + nr_run = nr_running() * 100; + time_diff = cur_time - rq_data->last_time; + do_div(time_diff, 1000 * 1000); + + if (time_diff != 0 && rq_data->total_time != 0) { + nr_run = (nr_run * time_diff) + + (rq_data->nr_run_avg * rq_data->total_time); + do_div(nr_run, rq_data->total_time + time_diff); + } + rq_data->nr_run_avg = nr_run; + rq_data->total_time += time_diff; + rq_data->last_time = cur_time; + + if (rq_data->update_rate != 0) + queue_delayed_work(rq_data->nr_run_wq, &rq_data->work, + msecs_to_jiffies(rq_data->update_rate)); + + spin_unlock_irqrestore(&rq_data->lock, flags); +} + +static unsigned int get_nr_run_avg(void) +{ + unsigned int nr_run_avg; + unsigned long flags = 0; + + spin_lock_irqsave(&rq_data->lock, flags); + nr_run_avg = rq_data->nr_run_avg; + rq_data->nr_run_avg = 0; + spin_unlock_irqrestore(&rq_data->lock, flags); + + return nr_run_avg; +} + + +/* + * dbs is used in this file as a shortform for demandbased switching + * It helps to keep variable names smaller, simpler + */ + +#define DEF_SAMPLING_DOWN_FACTOR (2) +#define MAX_SAMPLING_DOWN_FACTOR (100000) +#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (5) +#define DEF_FREQUENCY_UP_THRESHOLD (85) +#define DEF_FREQUENCY_MIN_SAMPLE_RATE (10000) +#define MIN_FREQUENCY_UP_THRESHOLD (11) +#define MAX_FREQUENCY_UP_THRESHOLD (100) +#define DEF_SAMPLING_RATE (50000) +#define MIN_SAMPLING_RATE (10000) +#define MAX_HOTPLUG_RATE (40u) + +#define DEF_MAX_CPU_LOCK (0) +#define DEF_CPU_UP_FREQ (500000) +#define DEF_CPU_DOWN_FREQ (200000) +#define DEF_UP_NR_CPUS (1) +#define DEF_CPU_UP_RATE (10) +#define DEF_CPU_DOWN_RATE (20) +#define DEF_FREQ_STEP (40) +#define DEF_START_DELAY (0) + +#define UP_THRESHOLD_AT_MIN_FREQ (40) +#define FREQ_FOR_RESPONSIVENESS (500000) + +#define HOTPLUG_DOWN_INDEX (0) +#define HOTPLUG_UP_INDEX (1) + +#ifdef CONFIG_MACH_MIDAS +static int hotplug_rq[4][2] = { + {0, 100}, {100, 200}, {200, 300}, {300, 0} +}; + +static int hotplug_freq[4][2] = { + {0, 500000}, + {200000, 500000}, + {200000, 500000}, + {200000, 0} +}; +#else +static int hotplug_rq[4][2] = { + {0, 100}, {100, 200}, {200, 300}, {300, 0} +}; + +static int hotplug_freq[4][2] = { + {0, 500000}, + {200000, 500000}, + {200000, 500000}, + {200000, 0} +}; +#endif + +static unsigned int min_sampling_rate; + +static void do_dbs_timer(struct work_struct *work); +static int cpufreq_governor_dbs(struct cpufreq_policy *policy, + unsigned int event); + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_PEGASUSQ +static +#endif +struct cpufreq_governor cpufreq_gov_pegasusq = { + .name = "pegasusq", + .governor = cpufreq_governor_dbs, + .owner = THIS_MODULE, +}; + +/* Sampling types */ +enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE}; + +struct cpu_dbs_info_s { + cputime64_t prev_cpu_idle; + cputime64_t prev_cpu_iowait; + cputime64_t prev_cpu_wall; + cputime64_t prev_cpu_nice; + struct cpufreq_policy *cur_policy; + struct delayed_work work; + struct work_struct up_work; + struct work_struct down_work; + struct cpufreq_frequency_table *freq_table; + unsigned int rate_mult; + int cpu; + /* + * percpu mutex that serializes governor limit change with + * do_dbs_timer invocation. We do not want do_dbs_timer to run + * when user is changing the governor or limits. 
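+	 * (Concretely: the CPUFREQ_GOV_LIMITS handler below takes this
+	 * mutex before clamping cur_policy, and do_dbs_timer() holds it
+	 * across a whole sample, so the two can never interleave.)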
+ */ + struct mutex timer_mutex; +}; +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); + +struct workqueue_struct *dvfs_workqueue; + +static unsigned int dbs_enable; /* number of CPUs using this policy */ + +/* + * dbs_mutex protects dbs_enable in governor start/stop. + */ +static DEFINE_MUTEX(dbs_mutex); + +static struct dbs_tuners { + unsigned int sampling_rate; + unsigned int up_threshold; + unsigned int down_differential; + unsigned int ignore_nice; + unsigned int sampling_down_factor; + unsigned int io_is_busy; + /* pegasusq tuners */ + unsigned int freq_step; + unsigned int cpu_up_rate; + unsigned int cpu_down_rate; + unsigned int cpu_up_freq; + unsigned int cpu_down_freq; + unsigned int up_nr_cpus; + unsigned int max_cpu_lock; + atomic_t hotplug_lock; + unsigned int dvfs_debug; + unsigned int max_freq; + unsigned int min_freq; +#ifdef CONFIG_HAS_EARLYSUSPEND + int early_suspend; +#endif +} dbs_tuners_ins = { + .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, + .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR, + .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL, + .ignore_nice = 0, + .freq_step = DEF_FREQ_STEP, + .cpu_up_rate = DEF_CPU_UP_RATE, + .cpu_down_rate = DEF_CPU_DOWN_RATE, + .cpu_up_freq = DEF_CPU_UP_FREQ, + .cpu_down_freq = DEF_CPU_DOWN_FREQ, + .up_nr_cpus = DEF_UP_NR_CPUS, + .max_cpu_lock = DEF_MAX_CPU_LOCK, + .hotplug_lock = ATOMIC_INIT(0), + .dvfs_debug = 0, +#ifdef CONFIG_HAS_EARLYSUSPEND + .early_suspend = -1, +#endif +}; + + +/* + * CPU hotplug lock interface + */ + +static atomic_t g_hotplug_count = ATOMIC_INIT(0); +static atomic_t g_hotplug_lock = ATOMIC_INIT(0); + +static void apply_hotplug_lock(void) +{ + int online, possible, lock, flag; + struct work_struct *work; + struct cpu_dbs_info_s *dbs_info; + + /* do turn_on/off cpus */ + dbs_info = &per_cpu(od_cpu_dbs_info, 0); /* from CPU0 */ + online = num_online_cpus(); + possible = num_possible_cpus(); + lock = atomic_read(&g_hotplug_lock); + flag = lock - online; + + if (flag == 0) + return; + + work = flag > 0 ? 
&dbs_info->up_work : &dbs_info->down_work; + + pr_debug("%s online %d possible %d lock %d flag %d %d\n", + __func__, online, possible, lock, flag, (int)abs(flag)); + + queue_work_on(dbs_info->cpu, dvfs_workqueue, work); +} + +int cpufreq_pegasusq_cpu_lock(int num_core) +{ + int prev_lock; + + if (num_core < 1 || num_core > num_possible_cpus()) + return -EINVAL; + + prev_lock = atomic_read(&g_hotplug_lock); + + if (prev_lock != 0 && prev_lock < num_core) + return -EINVAL; + else if (prev_lock == num_core) + atomic_inc(&g_hotplug_count); + + atomic_set(&g_hotplug_lock, num_core); + atomic_set(&g_hotplug_count, 1); + apply_hotplug_lock(); + + return 0; +} + +int cpufreq_pegasusq_cpu_unlock(int num_core) +{ + int prev_lock = atomic_read(&g_hotplug_lock); + + if (prev_lock < num_core) + return 0; + else if (prev_lock == num_core) + atomic_dec(&g_hotplug_count); + + if (atomic_read(&g_hotplug_count) == 0) + atomic_set(&g_hotplug_lock, 0); + + return 0; +} + + +/* + * History of CPU usage + */ +struct cpu_usage { + unsigned int freq; + unsigned int load[NR_CPUS]; + unsigned int rq_avg; +}; + +struct cpu_usage_history { + struct cpu_usage usage[MAX_HOTPLUG_RATE]; + unsigned int num_hist; +}; + +struct cpu_usage_history *hotplug_history; + +static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, + cputime64_t *wall) +{ + cputime64_t idle_time; + cputime64_t cur_wall_time; + cputime64_t busy_time; + + cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); + busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user, + kstat_cpu(cpu).cpustat.system); + + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal); + busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice); + + idle_time = cputime64_sub(cur_wall_time, busy_time); + if (wall) + *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time); + + return (cputime64_t)jiffies_to_usecs(idle_time); +} + +static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) +{ + u64 idle_time = get_cpu_idle_time_us(cpu, wall); + + if (idle_time == -1ULL) + return get_cpu_idle_time_jiffy(cpu, wall); + + return idle_time; +} + +static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, + cputime64_t *wall) +{ + u64 iowait_time = get_cpu_iowait_time_us(cpu, wall); + + if (iowait_time == -1ULL) + return 0; + + return iowait_time; +} + +/************************** sysfs interface ************************/ + +static ssize_t show_sampling_rate_min(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", min_sampling_rate); +} + +define_one_global_ro(sampling_rate_min); + +/* cpufreq_pegasusq Governor Tunables */ +#define show_one(file_name, object) \ +static ssize_t show_##file_name \ +(struct kobject *kobj, struct attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "%u\n", dbs_tuners_ins.object); \ +} +show_one(sampling_rate, sampling_rate); +show_one(io_is_busy, io_is_busy); +show_one(up_threshold, up_threshold); +show_one(sampling_down_factor, sampling_down_factor); +show_one(ignore_nice_load, ignore_nice); +show_one(down_differential, down_differential); +show_one(freq_step, freq_step); +show_one(cpu_up_rate, cpu_up_rate); +show_one(cpu_down_rate, cpu_down_rate); +show_one(cpu_up_freq, cpu_up_freq); +show_one(cpu_down_freq, cpu_down_freq); +show_one(up_nr_cpus, up_nr_cpus); +show_one(max_cpu_lock, max_cpu_lock); +show_one(dvfs_debug, 
dvfs_debug); +static ssize_t show_hotplug_lock(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", atomic_read(&g_hotplug_lock)); +} + +#define show_hotplug_param(file_name, num_core, up_down) \ +static ssize_t show_##file_name##_##num_core##_##up_down \ +(struct kobject *kobj, struct attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "%u\n", file_name[num_core - 1][up_down]); \ +} + +#define store_hotplug_param(file_name, num_core, up_down) \ +static ssize_t store_##file_name##_##num_core##_##up_down \ +(struct kobject *kobj, struct attribute *attr, \ + const char *buf, size_t count) \ +{ \ + unsigned int input; \ + int ret; \ + ret = sscanf(buf, "%u", &input); \ + if (ret != 1) \ + return -EINVAL; \ + file_name[num_core - 1][up_down] = input; \ + return count; \ +} + +show_hotplug_param(hotplug_freq, 1, 1); +show_hotplug_param(hotplug_freq, 2, 0); +show_hotplug_param(hotplug_freq, 2, 1); +show_hotplug_param(hotplug_freq, 3, 0); +show_hotplug_param(hotplug_freq, 3, 1); +show_hotplug_param(hotplug_freq, 4, 0); + +show_hotplug_param(hotplug_rq, 1, 1); +show_hotplug_param(hotplug_rq, 2, 0); +show_hotplug_param(hotplug_rq, 2, 1); +show_hotplug_param(hotplug_rq, 3, 0); +show_hotplug_param(hotplug_rq, 3, 1); +show_hotplug_param(hotplug_rq, 4, 0); + +store_hotplug_param(hotplug_freq, 1, 1); +store_hotplug_param(hotplug_freq, 2, 0); +store_hotplug_param(hotplug_freq, 2, 1); +store_hotplug_param(hotplug_freq, 3, 0); +store_hotplug_param(hotplug_freq, 3, 1); +store_hotplug_param(hotplug_freq, 4, 0); + +store_hotplug_param(hotplug_rq, 1, 1); +store_hotplug_param(hotplug_rq, 2, 0); +store_hotplug_param(hotplug_rq, 2, 1); +store_hotplug_param(hotplug_rq, 3, 0); +store_hotplug_param(hotplug_rq, 3, 1); +store_hotplug_param(hotplug_rq, 4, 0); + +define_one_global_rw(hotplug_freq_1_1); +define_one_global_rw(hotplug_freq_2_0); +define_one_global_rw(hotplug_freq_2_1); +define_one_global_rw(hotplug_freq_3_0); +define_one_global_rw(hotplug_freq_3_1); +define_one_global_rw(hotplug_freq_4_0); + +define_one_global_rw(hotplug_rq_1_1); +define_one_global_rw(hotplug_rq_2_0); +define_one_global_rw(hotplug_rq_2_1); +define_one_global_rw(hotplug_rq_3_0); +define_one_global_rw(hotplug_rq_3_1); +define_one_global_rw(hotplug_rq_4_0); + +static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate); + return count; +} + +static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + + dbs_tuners_ins.io_is_busy = !!input; + return count; +} + +static ssize_t store_up_threshold(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + + if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD || + input < MIN_FREQUENCY_UP_THRESHOLD) { + return -EINVAL; + } + dbs_tuners_ins.up_threshold = input; + return count; +} + +static ssize_t store_sampling_down_factor(struct kobject *a, + struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input, j; + int ret; + ret = sscanf(buf, "%u", &input); + + if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) + return -EINVAL; + dbs_tuners_ins.sampling_down_factor = input; + + /* 
Reset down sampling multiplier in case it was active */ + for_each_online_cpu(j) { + struct cpu_dbs_info_s *dbs_info; + dbs_info = &per_cpu(od_cpu_dbs_info, j); + dbs_info->rate_mult = 1; + } + return count; +} + +static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + + unsigned int j; + + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + + if (input > 1) + input = 1; + + if (input == dbs_tuners_ins.ignore_nice) { /* nothing to do */ + return count; + } + dbs_tuners_ins.ignore_nice = input; + + /* we need to re-evaluate prev_cpu_idle */ + for_each_online_cpu(j) { + struct cpu_dbs_info_s *dbs_info; + dbs_info = &per_cpu(od_cpu_dbs_info, j); + dbs_info->prev_cpu_idle = + get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); + if (dbs_tuners_ins.ignore_nice) + dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; + } + return count; +} + +static ssize_t store_down_differential(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.down_differential = min(input, 100u); + return count; +} + +static ssize_t store_freq_step(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.freq_step = min(input, 100u); + return count; +} + +static ssize_t store_cpu_up_rate(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.cpu_up_rate = min(input, MAX_HOTPLUG_RATE); + return count; +} + +static ssize_t store_cpu_down_rate(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.cpu_down_rate = min(input, MAX_HOTPLUG_RATE); + return count; +} + +static ssize_t store_cpu_up_freq(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.cpu_up_freq = min(input, dbs_tuners_ins.max_freq); + return count; +} + +static ssize_t store_cpu_down_freq(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.cpu_down_freq = max(input, dbs_tuners_ins.min_freq); + return count; +} + +static ssize_t store_up_nr_cpus(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.up_nr_cpus = min(input, num_possible_cpus()); + return count; +} + +static ssize_t store_max_cpu_lock(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.max_cpu_lock = min(input, num_possible_cpus()); + return count; +} + +static ssize_t store_hotplug_lock(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + int prev_lock; + + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + 
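+	/*
+	 * An input of 0 releases any hotplug lock; other values are
+	 * clamped to the number of possible CPUs. Requesting a lock
+	 * above an existing smaller one fails with -EINVAL below.
+	 */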
input = min(input, num_possible_cpus()); + prev_lock = atomic_read(&dbs_tuners_ins.hotplug_lock); + + if (prev_lock) + cpufreq_pegasusq_cpu_unlock(prev_lock); + + if (input == 0) { + atomic_set(&dbs_tuners_ins.hotplug_lock, 0); + return count; + } + + ret = cpufreq_pegasusq_cpu_lock(input); + if (ret) { + printk(KERN_ERR "[HOTPLUG] already locked with smaller value %d < %d\n", + atomic_read(&g_hotplug_lock), input); + return ret; + } + + atomic_set(&dbs_tuners_ins.hotplug_lock, input); + + return count; +} + +static ssize_t store_dvfs_debug(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + dbs_tuners_ins.dvfs_debug = input > 0; + return count; +} + +define_one_global_rw(sampling_rate); +define_one_global_rw(io_is_busy); +define_one_global_rw(up_threshold); +define_one_global_rw(sampling_down_factor); +define_one_global_rw(ignore_nice_load); +define_one_global_rw(down_differential); +define_one_global_rw(freq_step); +define_one_global_rw(cpu_up_rate); +define_one_global_rw(cpu_down_rate); +define_one_global_rw(cpu_up_freq); +define_one_global_rw(cpu_down_freq); +define_one_global_rw(up_nr_cpus); +define_one_global_rw(max_cpu_lock); +define_one_global_rw(hotplug_lock); +define_one_global_rw(dvfs_debug); + +static struct attribute *dbs_attributes[] = { + &sampling_rate_min.attr, + &sampling_rate.attr, + &up_threshold.attr, + &sampling_down_factor.attr, + &ignore_nice_load.attr, + &io_is_busy.attr, + &down_differential.attr, + &freq_step.attr, + &cpu_up_rate.attr, + &cpu_down_rate.attr, + &cpu_up_freq.attr, + &cpu_down_freq.attr, + &up_nr_cpus.attr, + /* priority: hotplug_lock > max_cpu_lock */ + &max_cpu_lock.attr, + &hotplug_lock.attr, + &dvfs_debug.attr, + &hotplug_freq_1_1.attr, + &hotplug_freq_2_0.attr, + &hotplug_freq_2_1.attr, + &hotplug_freq_3_0.attr, + &hotplug_freq_3_1.attr, + &hotplug_freq_4_0.attr, + &hotplug_rq_1_1.attr, + &hotplug_rq_2_0.attr, + &hotplug_rq_2_1.attr, + &hotplug_rq_3_0.attr, + &hotplug_rq_3_1.attr, + &hotplug_rq_4_0.attr, + NULL +}; + +static struct attribute_group dbs_attr_group = { + .attrs = dbs_attributes, + .name = "pegasusq", +}; + +/************************** sysfs end ************************/ + +static void cpu_up_work(struct work_struct *work) +{ + int cpu; + int online = num_online_cpus(); + int nr_up = dbs_tuners_ins.up_nr_cpus; + int hotplug_lock = atomic_read(&g_hotplug_lock); + if (hotplug_lock) + nr_up = hotplug_lock - online; + + if (online == 1) { + printk(KERN_ERR "CPU_UP 3\n"); + cpu_up(num_possible_cpus() - 1); + nr_up -= 1; + } + + for_each_cpu_not(cpu, cpu_online_mask) { + if (nr_up-- == 0) + break; + if (cpu == 0) + continue; + printk(KERN_ERR "CPU_UP %d\n", cpu); + cpu_up(cpu); + } +} + +static void cpu_down_work(struct work_struct *work) +{ + int cpu; + int online = num_online_cpus(); + int nr_down = 1; + int hotplug_lock = atomic_read(&g_hotplug_lock); + + if (hotplug_lock) + nr_down = online - hotplug_lock; + + for_each_online_cpu(cpu) { + if (cpu == 0) + continue; + printk(KERN_ERR "CPU_DOWN %d\n", cpu); + cpu_down(cpu); + if (--nr_down == 0) + break; + } +} + +static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq) +{ +#ifndef CONFIG_ARCH_EXYNOS4 + if (p->cur == p->max) + return; +#endif + + __cpufreq_driver_target(p, freq, CPUFREQ_RELATION_L); +} + +/* + * print hotplug debugging info. 
+ * which 1 : UP, 0 : DOWN + */ +static void debug_hotplug_check(int which, int rq_avg, int freq, + struct cpu_usage *usage) +{ + int cpu; + printk(KERN_ERR "CHECK %s rq %d.%02d freq %d [", which ? "up" : "down", + rq_avg / 100, rq_avg % 100, freq); + for_each_online_cpu(cpu) { + printk(KERN_ERR "(%d, %d), ", cpu, usage->load[cpu]); + } + printk(KERN_ERR "]\n"); +} + +static int check_up(void) +{ + int num_hist = hotplug_history->num_hist; + struct cpu_usage *usage; + int freq, rq_avg; + int i; + int up_rate = dbs_tuners_ins.cpu_up_rate; + int up_freq, up_rq; + int min_freq = INT_MAX; + int min_rq_avg = INT_MAX; + int online; + int hotplug_lock = atomic_read(&g_hotplug_lock); + + if (hotplug_lock > 0) + return 0; + + online = num_online_cpus(); + up_freq = hotplug_freq[online - 1][HOTPLUG_UP_INDEX]; + up_rq = hotplug_rq[online - 1][HOTPLUG_UP_INDEX]; + + if (online == num_possible_cpus()) + return 0; + if (dbs_tuners_ins.max_cpu_lock != 0 + && online >= dbs_tuners_ins.max_cpu_lock) + return 0; + + if (num_hist == 0 || num_hist % up_rate) + return 0; + + for (i = num_hist - 1; i >= num_hist - up_rate; --i) { + usage = &hotplug_history->usage[i]; + + freq = usage->freq; + rq_avg = usage->rq_avg; + + min_freq = min(min_freq, freq); + min_rq_avg = min(min_rq_avg, rq_avg); + + if (dbs_tuners_ins.dvfs_debug) + debug_hotplug_check(1, rq_avg, freq, usage); + } + + if (min_freq >= up_freq && min_rq_avg > up_rq) { + printk(KERN_ERR "[HOTPLUG IN] %s %d>=%d && %d>%d\n", + __func__, min_freq, up_freq, min_rq_avg, up_rq); + hotplug_history->num_hist = 0; + return 1; + } + return 0; +} + +static int check_down(void) +{ + int num_hist = hotplug_history->num_hist; + struct cpu_usage *usage; + int freq, rq_avg; + int i; + int down_rate = dbs_tuners_ins.cpu_down_rate; + int down_freq, down_rq; + int max_freq = 0; + int max_rq_avg = 0; + int online; + int hotplug_lock = atomic_read(&g_hotplug_lock); + + if (hotplug_lock > 0) + return 0; + + online = num_online_cpus(); + down_freq = hotplug_freq[online - 1][HOTPLUG_DOWN_INDEX]; + down_rq = hotplug_rq[online - 1][HOTPLUG_DOWN_INDEX]; + + if (online == 1) + return 0; + + if (dbs_tuners_ins.max_cpu_lock != 0 + && online > dbs_tuners_ins.max_cpu_lock) + return 1; + + if (num_hist == 0 || num_hist % down_rate) + return 0; + + for (i = num_hist - 1; i >= num_hist - down_rate; --i) { + usage = &hotplug_history->usage[i]; + + freq = usage->freq; + rq_avg = usage->rq_avg; + + max_freq = max(max_freq, freq); + max_rq_avg = max(max_rq_avg, rq_avg); + + if (dbs_tuners_ins.dvfs_debug) + debug_hotplug_check(0, rq_avg, freq, usage); + } + + if (max_freq <= down_freq && max_rq_avg <= down_rq) { + printk(KERN_ERR "[HOTPLUG OUT] %s %d<=%d && %d<%d\n", + __func__, max_freq, down_freq, max_rq_avg, down_rq); + hotplug_history->num_hist = 0; + return 1; + } + + return 0; +} + +static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) +{ + unsigned int max_load_freq; + + struct cpufreq_policy *policy; + unsigned int j; + int num_hist = hotplug_history->num_hist; + int max_hotplug_rate = max(dbs_tuners_ins.cpu_up_rate, + dbs_tuners_ins.cpu_down_rate); + int up_threshold = dbs_tuners_ins.up_threshold; + + policy = this_dbs_info->cur_policy; + + hotplug_history->usage[num_hist].freq = policy->cur; + hotplug_history->usage[num_hist].rq_avg = get_nr_run_avg(); + ++hotplug_history->num_hist; + + /* Get Absolute Load - in terms of freq */ + max_load_freq = 0; + + for_each_cpu(j, policy->cpus) { + struct cpu_dbs_info_s *j_dbs_info; + cputime64_t cur_wall_time, cur_idle_time, 
cur_iowait_time; + cputime64_t prev_wall_time, prev_idle_time, prev_iowait_time; + unsigned int idle_time, wall_time, iowait_time; + unsigned int load, load_freq; + int freq_avg; + + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); + prev_wall_time = j_dbs_info->prev_cpu_wall; + prev_idle_time = j_dbs_info->prev_cpu_idle; + prev_iowait_time = j_dbs_info->prev_cpu_iowait; + + cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); + cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time); + + wall_time = (unsigned int) cputime64_sub(cur_wall_time, + prev_wall_time); + j_dbs_info->prev_cpu_wall = cur_wall_time; + + idle_time = (unsigned int) cputime64_sub(cur_idle_time, + prev_idle_time); + j_dbs_info->prev_cpu_idle = cur_idle_time; + + iowait_time = (unsigned int) cputime64_sub(cur_iowait_time, + prev_iowait_time); + j_dbs_info->prev_cpu_iowait = cur_iowait_time; + + if (dbs_tuners_ins.ignore_nice) { + cputime64_t cur_nice; + unsigned long cur_nice_jiffies; + + cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice, + j_dbs_info->prev_cpu_nice); + /* + * Assumption: nice time between sampling periods will + * be less than 2^32 jiffies for 32 bit sys + */ + cur_nice_jiffies = (unsigned long) + cputime64_to_jiffies64(cur_nice); + + j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; + idle_time += jiffies_to_usecs(cur_nice_jiffies); + } + + if (dbs_tuners_ins.io_is_busy && idle_time >= iowait_time) + idle_time -= iowait_time; + + if (unlikely(!wall_time || wall_time < idle_time)) + continue; + + load = 100 * (wall_time - idle_time) / wall_time; + hotplug_history->usage[num_hist].load[j] = load; + + freq_avg = __cpufreq_driver_getavg(policy, j); + if (freq_avg <= 0) + freq_avg = policy->cur; + + load_freq = load * freq_avg; + if (load_freq > max_load_freq) + max_load_freq = load_freq; + } + + /* Check for CPU hotplug */ + if (check_up()) { + queue_work_on(this_dbs_info->cpu, dvfs_workqueue, + &this_dbs_info->up_work); + } else if (check_down()) { + queue_work_on(this_dbs_info->cpu, dvfs_workqueue, + &this_dbs_info->down_work); + } + if (hotplug_history->num_hist == max_hotplug_rate) + hotplug_history->num_hist = 0; + + /* Check for frequency increase */ + if (policy->cur < FREQ_FOR_RESPONSIVENESS) { + up_threshold = UP_THRESHOLD_AT_MIN_FREQ; + } + + if (max_load_freq > up_threshold * policy->cur) { + int inc = (policy->max * dbs_tuners_ins.freq_step) / 100; + int target = min(policy->max, policy->cur + inc); + /* If switching to max speed, apply sampling_down_factor */ + if (policy->cur < policy->max && target == policy->max) + this_dbs_info->rate_mult = + dbs_tuners_ins.sampling_down_factor; + dbs_freq_increase(policy, target); + return; + } + + /* Check for frequency decrease */ +#ifndef CONFIG_ARCH_EXYNOS4 + /* if we cannot reduce the frequency anymore, break out early */ + if (policy->cur == policy->min) + return; +#endif + + /* + * The optimal frequency is the frequency that is the lowest that + * can support the current CPU usage without triggering the up + * policy. To be safe, we focus DOWN_DIFFERENTIAL points under + * the threshold. 
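+	 *
+	 * (Worked example with the defaults above: up_threshold 85 and
+	 * down_differential 5. A core at 800000 kHz with 50% load has
+	 * max_load_freq = 50 * 800000, so freq_next = (50 * 800000) / 80
+	 * = 500000 kHz, the lowest speed whose projected load stays
+	 * under the up trigger.)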
+ */ + if (max_load_freq < + (dbs_tuners_ins.up_threshold - dbs_tuners_ins.down_differential) * + policy->cur) { + unsigned int freq_next; + unsigned int down_thres; + + freq_next = max_load_freq / + (dbs_tuners_ins.up_threshold - + dbs_tuners_ins.down_differential); + + /* No longer fully busy, reset rate_mult */ + this_dbs_info->rate_mult = 1; + + if (freq_next < policy->min) + freq_next = policy->min; + + + down_thres = UP_THRESHOLD_AT_MIN_FREQ + - dbs_tuners_ins.down_differential; + + if (freq_next < FREQ_FOR_RESPONSIVENESS + && (max_load_freq / freq_next) > down_thres) + freq_next = FREQ_FOR_RESPONSIVENESS; + + if (policy->cur == freq_next) + return; + + __cpufreq_driver_target(policy, freq_next, + CPUFREQ_RELATION_L); + } +} + +static void do_dbs_timer(struct work_struct *work) +{ + struct cpu_dbs_info_s *dbs_info = + container_of(work, struct cpu_dbs_info_s, work.work); + unsigned int cpu = dbs_info->cpu; + int delay; + + mutex_lock(&dbs_info->timer_mutex); + + dbs_check_cpu(dbs_info); + /* We want all CPUs to do sampling nearly on + * same jiffy + */ + delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate + * dbs_info->rate_mult); + + if (num_online_cpus() > 1) + delay -= jiffies % delay; + + queue_delayed_work_on(cpu, dvfs_workqueue, &dbs_info->work, delay); + mutex_unlock(&dbs_info->timer_mutex); +} + +static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) +{ + /* We want all CPUs to do sampling nearly on same jiffy */ + int delay = usecs_to_jiffies(DEF_START_DELAY * 1000 * 1000 + + dbs_tuners_ins.sampling_rate); + if (num_online_cpus() > 1) + delay -= jiffies % delay; + + INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer); + INIT_WORK(&dbs_info->up_work, cpu_up_work); + INIT_WORK(&dbs_info->down_work, cpu_down_work); + + queue_delayed_work_on(dbs_info->cpu, dvfs_workqueue, + &dbs_info->work, delay + 2 * HZ); +} + +static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) +{ + cancel_delayed_work_sync(&dbs_info->work); + cancel_work_sync(&dbs_info->up_work); + cancel_work_sync(&dbs_info->down_work); +} + +static int pm_notifier_call(struct notifier_block *this, + unsigned long event, void *ptr) +{ + static unsigned int prev_hotplug_lock; + switch (event) { + case PM_SUSPEND_PREPARE: + prev_hotplug_lock = atomic_read(&g_hotplug_lock); + atomic_set(&g_hotplug_lock, 1); + apply_hotplug_lock(); + pr_debug("%s enter suspend\n", __func__); + return NOTIFY_OK; + case PM_POST_RESTORE: + case PM_POST_SUSPEND: + atomic_set(&g_hotplug_lock, prev_hotplug_lock); + if (prev_hotplug_lock) + apply_hotplug_lock(); + prev_hotplug_lock = 0; + pr_debug("%s exit suspend\n", __func__); + return NOTIFY_OK; + } + return NOTIFY_DONE; +} + +static struct notifier_block pm_notifier = { + .notifier_call = pm_notifier_call, +}; + +static int reboot_notifier_call(struct notifier_block *this, + unsigned long code, void *_cmd) +{ + atomic_set(&g_hotplug_lock, 1); + return NOTIFY_DONE; +} + +static struct notifier_block reboot_notifier = { + .notifier_call = reboot_notifier_call, +}; + +#ifdef CONFIG_HAS_EARLYSUSPEND +static struct early_suspend early_suspend; +unsigned int prev_freq_step; +unsigned int prev_sampling_rate; +static void cpufreq_pegasusq_early_suspend(struct early_suspend *h) +{ + dbs_tuners_ins.early_suspend = + atomic_read(&g_hotplug_lock); + prev_freq_step = dbs_tuners_ins.freq_step; + prev_sampling_rate = dbs_tuners_ins.sampling_rate; + dbs_tuners_ins.freq_step = 20; + dbs_tuners_ins.sampling_rate *= 4; + atomic_set(&g_hotplug_lock, 1); + apply_hotplug_lock(); + 
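+	/*
+	 * Screen off: hotplug is pinned to a single core above, so the
+	 * runqueue-average worker that feeds hotplug decisions can stop.
+	 */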
stop_rq_work(); +} +static void cpufreq_pegasusq_late_resume(struct early_suspend *h) +{ + atomic_set(&g_hotplug_lock, dbs_tuners_ins.early_suspend); + dbs_tuners_ins.early_suspend = -1; + dbs_tuners_ins.freq_step = prev_freq_step; + dbs_tuners_ins.sampling_rate = prev_sampling_rate; + apply_hotplug_lock(); + start_rq_work(); +} +#endif + +static int cpufreq_governor_dbs(struct cpufreq_policy *policy, + unsigned int event) +{ + unsigned int cpu = policy->cpu; + struct cpu_dbs_info_s *this_dbs_info; + unsigned int j; + int rc; + + this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + + switch (event) { + case CPUFREQ_GOV_START: + if ((!cpu_online(cpu)) || (!policy->cur)) + return -EINVAL; + + dbs_tuners_ins.max_freq = policy->max; + dbs_tuners_ins.min_freq = policy->min; + hotplug_history->num_hist = 0; + start_rq_work(); + + mutex_lock(&dbs_mutex); + + dbs_enable++; + for_each_cpu(j, policy->cpus) { + struct cpu_dbs_info_s *j_dbs_info; + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); + j_dbs_info->cur_policy = policy; + + j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, + &j_dbs_info->prev_cpu_wall); + if (dbs_tuners_ins.ignore_nice) { + j_dbs_info->prev_cpu_nice = + kstat_cpu(j).cpustat.nice; + } + } + this_dbs_info->cpu = cpu; + this_dbs_info->rate_mult = 1; + /* + * Start the timerschedule work, when this governor + * is used for first time + */ + if (dbs_enable == 1) { + rc = sysfs_create_group(cpufreq_global_kobject, + &dbs_attr_group); + if (rc) { + mutex_unlock(&dbs_mutex); + return rc; + } + + min_sampling_rate = MIN_SAMPLING_RATE; + dbs_tuners_ins.sampling_rate = DEF_SAMPLING_RATE; + dbs_tuners_ins.io_is_busy = 0; + } + mutex_unlock(&dbs_mutex); + + register_reboot_notifier(&reboot_notifier); + + mutex_init(&this_dbs_info->timer_mutex); + dbs_timer_init(this_dbs_info); + +#ifdef CONFIG_HAS_EARLYSUSPEND + register_early_suspend(&early_suspend); +#endif + break; + + case CPUFREQ_GOV_STOP: +#ifdef CONFIG_HAS_EARLYSUSPEND + unregister_early_suspend(&early_suspend); +#endif + + dbs_timer_exit(this_dbs_info); + + mutex_lock(&dbs_mutex); + mutex_destroy(&this_dbs_info->timer_mutex); + + unregister_reboot_notifier(&reboot_notifier); + + dbs_enable--; + mutex_unlock(&dbs_mutex); + + stop_rq_work(); + + if (!dbs_enable) + sysfs_remove_group(cpufreq_global_kobject, + &dbs_attr_group); + + break; + + case CPUFREQ_GOV_LIMITS: + mutex_lock(&this_dbs_info->timer_mutex); + + if (policy->max < this_dbs_info->cur_policy->cur) + __cpufreq_driver_target(this_dbs_info->cur_policy, + policy->max, + CPUFREQ_RELATION_H); + else if (policy->min > this_dbs_info->cur_policy->cur) + __cpufreq_driver_target(this_dbs_info->cur_policy, + policy->min, + CPUFREQ_RELATION_L); + + mutex_unlock(&this_dbs_info->timer_mutex); + break; + } + return 0; +} + +static int __init cpufreq_gov_dbs_init(void) +{ + int ret; + + ret = init_rq_avg(); + if (ret) + return ret; + + hotplug_history = kzalloc(sizeof(struct cpu_usage_history), GFP_KERNEL); + if (!hotplug_history) { + pr_err("%s cannot create hotplug history array\n", __func__); + ret = -ENOMEM; + goto err_hist; + } + + dvfs_workqueue = create_workqueue("kpegasusq"); + if (!dvfs_workqueue) { + pr_err("%s cannot create workqueue\n", __func__); + ret = -ENOMEM; + goto err_queue; + } + + ret = cpufreq_register_governor(&cpufreq_gov_pegasusq); + if (ret) + goto err_reg; + +#ifdef CONFIG_HAS_EARLYSUSPEND + early_suspend.level = EARLY_SUSPEND_LEVEL_DISABLE_FB; + early_suspend.suspend = cpufreq_pegasusq_early_suspend; + early_suspend.resume = cpufreq_pegasusq_late_resume; +#endif + + 
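+	/*
+	 * Note: the runqueue-average worker itself is started per policy
+	 * in the CPUFREQ_GOV_START path above, not here.
+	 */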
return ret; + +err_reg: + destroy_workqueue(dvfs_workqueue); +err_queue: + kfree(hotplug_history); +err_hist: + kfree(rq_data); + return ret; +} + +static void __exit cpufreq_gov_dbs_exit(void) +{ + cpufreq_unregister_governor(&cpufreq_gov_pegasusq); + destroy_workqueue(dvfs_workqueue); + kfree(hotplug_history); + kfree(rq_data); +} + +MODULE_AUTHOR("ByungChang Cha <bc.cha@samsung.com>"); +MODULE_DESCRIPTION("'cpufreq_pegasusq' - A dynamic cpufreq/cpuhotplug governor"); +MODULE_LICENSE("GPL"); + +#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PEGASUSQ +fs_initcall(cpufreq_gov_dbs_init); +#else +module_init(cpufreq_gov_dbs_init); +#endif +module_exit(cpufreq_gov_dbs_exit); diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index faf7c52..c315ec9 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -317,6 +317,27 @@ static int cpufreq_stat_notifier_trans(struct notifier_block *nb, return 0; } +static int cpufreq_stats_create_table_cpu(unsigned int cpu) +{ + struct cpufreq_policy *policy; + struct cpufreq_frequency_table *table; + int ret = -ENODEV; + + policy = cpufreq_cpu_get(cpu); + if (!policy) + return -ENODEV; + + table = cpufreq_frequency_get_table(cpu); + if (!table) + goto out; + + ret = cpufreq_stats_create_table(policy, table); + +out: + cpufreq_cpu_put(policy); + return ret; +} + static int __cpuinit cpufreq_stat_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -335,6 +356,10 @@ static int __cpuinit cpufreq_stat_cpu_callback(struct notifier_block *nfb, case CPU_DEAD_FROZEN: cpufreq_stats_free_table(cpu); break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + cpufreq_stats_create_table_cpu(cpu); + break; } return NOTIFY_OK; } diff --git a/drivers/cpufreq/dvfs_monitor.c b/drivers/cpufreq/dvfs_monitor.c new file mode 100644 index 0000000..e1e02b4 --- /dev/null +++ b/drivers/cpufreq/dvfs_monitor.c @@ -0,0 +1,236 @@ +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/slab.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/proc_fs.h> +#include <linux/atomic.h> +#include <linux/tick.h> + +struct cpufreq_load_data { + cputime64_t prev_idle; + cputime64_t prev_wall; + unsigned char load; +}; + +struct dvfs_data { + atomic_t opened; + atomic_t num_events; + unsigned char cpus[NR_CPUS]; + unsigned int prev_freq[NR_CPUS]; + unsigned int freq[NR_CPUS]; + struct cpufreq_load_data load_data[NR_CPUS]; + wait_queue_head_t wait_queue; + spinlock_t load_lock; +}; + +static struct dvfs_data *dvfs_info; + +static void init_dvfs_mon(void) +{ + int cpu; + int cur_freq = cpufreq_get(0); + + for_each_possible_cpu(cpu) { + dvfs_info->cpus[cpu] = cpu_online(cpu); + dvfs_info->freq[cpu] = cur_freq; + } + atomic_set(&dvfs_info->num_events, 1); +} + +static void calculate_load(void) +{ + int cpu; + cputime64_t cur_wall, cur_idle; + cputime64_t prev_wall, prev_idle; + unsigned int wall_time, idle_time; + unsigned long flags; + + spin_lock_irqsave(&dvfs_info->load_lock, flags); + for_each_online_cpu(cpu) { + cur_idle = get_cpu_idle_time_us(cpu, &cur_wall); + prev_idle = dvfs_info->load_data[cpu].prev_idle; + prev_wall = dvfs_info->load_data[cpu].prev_wall; + + dvfs_info->load_data[cpu].prev_idle = cur_idle; + dvfs_info->load_data[cpu].prev_wall = cur_wall; + + idle_time = (unsigned 
int)cputime64_sub(cur_idle, prev_idle); + wall_time = (unsigned int)cputime64_sub(cur_wall, prev_wall); + + if (wall_time < idle_time) { + pr_err("%s walltime < idletime\n", __func__); + dvfs_info->load_data[cpu].load = 0; + } + + dvfs_info->load_data[cpu].load = (wall_time - idle_time) * 100 + / wall_time; + } + spin_unlock_irqrestore(&dvfs_info->load_lock, flags); + return; +} + +static int dvfs_monitor_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + if (freq->new == freq->old) + return 0; + + dvfs_info->prev_freq[freq->cpu] = freq->old; + dvfs_info->freq[freq->cpu] = freq->new; + + calculate_load(); + + atomic_inc(&dvfs_info->num_events); + wake_up_interruptible(&dvfs_info->wait_queue); + + return 0; +} + +static int __cpuinit dvfs_monitor_hotplug(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + int cpu_status = 0; + + switch (action) { + case CPU_ONLINE: + cpu_status = 1; + break; + case CPU_DOWN_PREPARE: + cpu_status = 0; + break; + default: + return NOTIFY_OK; + } + + dvfs_info->cpus[cpu] = cpu_status; + atomic_inc(&dvfs_info->num_events); + calculate_load(); + wake_up_interruptible(&dvfs_info->wait_queue); + + return NOTIFY_OK; +} + +static struct notifier_block notifier_trans_block = { + .notifier_call = dvfs_monitor_trans, +}; + +static struct notifier_block notifier_hotplug_block __refdata = { + .notifier_call = dvfs_monitor_hotplug, + .priority = 1, +}; + +static int dvfs_mon_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if (atomic_xchg(&dvfs_info->opened, 1) != 0) + return -EBUSY; + + init_dvfs_mon(); + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + if (ret) + return ret; + + register_hotcpu_notifier(¬ifier_hotplug_block); + + return 0; +} + +static int dvfs_mon_release(struct inode *inode, struct file *file) +{ + int ret = 0; + + atomic_dec(&dvfs_info->opened); + ret = cpufreq_unregister_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + unregister_hotcpu_notifier(¬ifier_hotplug_block); + + return ret; +} + +static ssize_t dvfs_mon_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned long long t; + unsigned long nanosec_rem; + int freq, prev_freq; + char cpu_status[NR_CPUS * 8 + 1]; + char temp[3]; + int i; + + wait_event_interruptible(dvfs_info->wait_queue, + atomic_read(&dvfs_info->num_events)); + + atomic_set(&dvfs_info->num_events, 0); + + /* for now, assume that all cores run on same speed */ + freq = dvfs_info->freq[0]; + prev_freq = dvfs_info->prev_freq[0]; + dvfs_info->prev_freq[0] = freq; + + memset(cpu_status, 0, sizeof(cpu_status)); + for (i = 0; i != num_possible_cpus(); ++i) { + unsigned char load = dvfs_info->cpus[i] ? 
+ dvfs_info->load_data[i].load : 0; + sprintf(temp, "(%d,%3d),", dvfs_info->cpus[i], load); + strcat(cpu_status, temp); + } + + t = cpu_clock(0); + nanosec_rem = do_div(t, 1000000000); + + return sprintf(buf, "%lu.%06lu,%s%d,%d\n", + (unsigned long) t, nanosec_rem / 1000, + cpu_status, prev_freq, freq); +} + +static const struct file_operations dvfs_mon_operations = { + .read = dvfs_mon_read, + .open = dvfs_mon_open, + .release = dvfs_mon_release, +}; + +static int __init dvfs_monitor_init(void) +{ + dvfs_info = kzalloc(sizeof(struct dvfs_data), GFP_KERNEL); + if (dvfs_info == NULL) { + pr_err("[DVFS_MON] cannot allocate memory\n"); + return -ENOMEM; + } + + spin_lock_init(&dvfs_info->load_lock); + + init_waitqueue_head(&dvfs_info->wait_queue); + + proc_create("dvfs_mon", S_IRUSR, NULL, &dvfs_mon_operations); + + return 0; +} +late_initcall(dvfs_monitor_init); + +static void __exit dvfs_monitor_exit(void) +{ + kfree(dvfs_info); + return; +} +module_exit(dvfs_monitor_exit); + +MODULE_AUTHOR("ByungChang Cha <bc.cha@samsung.com>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("DVFS Monitoring proc file"); |
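
As a rough illustration of how the /proc/dvfs_mon interface added above can be consumed, a minimal userspace reader might look like the sketch below. This is illustrative only: it assumes a kernel carrying this patch, root access (the node is created with S_IRUSR), and the single-line "timestamp,(online,load),...,prev_freq,freq" records produced by dvfs_mon_read, whose read call blocks until the next DVFS transition or hotplug event.

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/dvfs_mon", "r");

	if (!f) {
		perror("/proc/dvfs_mon");
		return 1;
	}
	/* Each read blocks until the next frequency or hotplug event. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}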