1. DVFS Overview
The main purpose of DVFS is to dynamically adjust the CPU's voltage and frequency to balance performance against power consumption. When the CPU load is light, voltage and frequency can be lowered to reduce power consumption and extend battery life; when the load is heavy, they can be raised to improve performance and keep the system responsive.
2. Software Framework
The Linux cpufreq (DVFS) framework consists mainly of the cpufreq core, the governors and the drivers.
1) cpufreq core
The core module of the cpufreq framework. Like other kernel frameworks, it abstracts the common logic and interfaces of frequency/voltage scaling, built around three data structures (struct cpufreq_driver, struct cpufreq_policy and struct cpufreq_governor), and mainly provides three kinds of functionality:
interacting with user space (sysfs);
providing the driver framework for CPU frequency and voltage control, which simplifies the development of platform drivers;
providing the governor framework on which the different frequency-scaling policies are built.
2) cpufreq governor
Implements the various frequency/voltage scaling policies. Each governor computes the target frequency differently, deriving a suitable value from the allowed frequency range and its tuning parameters (thresholds and the like).
userspace: the frequency is controlled directly from user space by writing the desired value to the scaling_setspeed sysfs node (a minimal user-space sketch follows this list).
ondemand: dynamically adjusts the CPU frequency and voltage according to the current CPU utilization. The scheduler triggers the load estimation through callbacks registered by ondemand; the governor samples the system load at a fixed interval and scales on demand, and if the current utilization exceeds the configured threshold it jumps straight to the maximum frequency. Scaling is fast but not very precise.
conservative: similar to ondemand, but smooths the transitions to avoid bouncing back and forth between the minimum and maximum frequencies. The frequency is changed in fixed steps instead of jumping directly to the target value, and the system load is sampled periodically to decide which frequency to move to.
schedutil: scales the frequency based on the CPU utilization tracked by the scheduler. sugov registers callbacks (sugov_update_shared/sugov_update_single) with the scheduler's load-tracking code; whenever the CPU utilization changes, the callback checks whether the current frequency still matches the utilization and raises or lowers it if not.
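A minimal user-space sketch of driving the userspace governor through sysfs (the paths are the standard per-CPU cpufreq nodes; the 1.2 GHz value is only an example and must be a frequency the policy actually supports):
#include <stdio.h>

/* Switch cpu0's policy to the userspace governor, then request 1.2 GHz.
 * scaling_setspeed is only writable while the userspace governor is active,
 * and values are in kHz, clamped to scaling_min_freq/scaling_max_freq. */
int main(void)
{
	FILE *f;

	f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", "w");
	if (!f)
		return 1;
	fputs("userspace\n", f);
	fclose(f);

	f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed", "w");
	if (!f)
		return 1;
	fputs("1200000\n", f);
	fclose(f);

	return 0;
}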
3) cpufreq driver
Implements the platform-specific frequency/voltage scaling mechanics. Building on the cpu subsystem driver, OPP, clock driver, regulator driver and related modules, it provides the actual control of CPU frequency and voltage.
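To make the driver's role concrete, here is a hypothetical target_index() sketch (all names prefixed demo_ are made up; the clk, regulator and per-OPP voltage table are assumed to have been obtained in ->init()). It follows the usual rule of raising the voltage before increasing the clock and lowering it only after decreasing the clock:
#include <linux/clk.h>
#include <linux/cpufreq.h>
#include <linux/regulator/consumer.h>

/* Hypothetical per-policy private data, filled in by ->init() (not shown). */
struct demo_priv {
	struct clk *clk;
	struct regulator *reg;
	int volt_uv[8];			/* microvolts for each frequency-table entry */
};

static int demo_target_index(struct cpufreq_policy *policy, unsigned int index)
{
	struct demo_priv *priv = policy->driver_data;
	unsigned long new_rate = policy->freq_table[index].frequency * 1000UL; /* kHz -> Hz */
	int new_uv = priv->volt_uv[index];
	int ret;

	if (new_rate > clk_get_rate(priv->clk)) {
		/* Scaling up: raise the voltage first, then the clock. */
		ret = regulator_set_voltage(priv->reg, new_uv, new_uv);
		if (ret)
			return ret;
		return clk_set_rate(priv->clk, new_rate);
	}

	/* Scaling down: lower the clock first, then the voltage. */
	ret = clk_set_rate(priv->clk, new_rate);
	if (ret)
		return ret;
	return regulator_set_voltage(priv->reg, new_uv, new_uv);
}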
3. Initialization
3.1 Registering the cpufreq_driver
int cpufreq_register_driver(struct cpufreq_driver *driver_data)
{
unsigned long flags;
int ret;
// Check whether cpufreq is enabled
if (cpufreq_disabled())
return -ENODEV;
/*
* The cpufreq core depends heavily on the availability of device
* structure, make sure they are available before proceeding further.
*/
if (!get_cpu_device(0))
return -EPROBE_DEFER;
if (!driver_data || !driver_data->verify || !driver_data->init ||
!(driver_data->setpolicy || driver_data->target_index ||
driver_data->target) ||
(driver_data->setpolicy && (driver_data->target_index ||
driver_data->target)) ||
(!driver_data->get_intermediate != !driver_data->target_intermediate) ||
(!driver_data->online != !driver_data->offline) ||
(driver_data->adjust_perf && !driver_data->fast_switch))
return -EINVAL;
pr_debug("trying to register driver %s\n", driver_data->name);
/* Protect against concurrent CPU online/offline. */
cpus_read_lock();
write_lock_irqsave(&cpufreq_driver_lock, flags);
if (cpufreq_driver) {
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
ret = -EEXIST;
goto out;
}
cpufreq_driver = driver_data;
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
/*
* Mark support for the scheduler's frequency invariance engine for
* drivers that implement target(), target_index() or fast_switch().
*/
if (!cpufreq_driver->setpolicy) {
static_branch_enable_cpuslocked(&cpufreq_freq_invariance);
pr_debug("supports frequency invariance");
}
if (driver_data->setpolicy)
driver_data->flags |= CPUFREQ_CONST_LOOPS;
// If the platform supports CPU boost, create the sysfs node exposed to user space
if (cpufreq_boost_supported()) {
ret = create_boost_sysfs_file();
if (ret)
goto err_null_driver;
}
// Register the cpufreq interface with the cpu subsys
ret = subsys_interface_register(&cpufreq_interface);
if (ret)
goto err_boost_unreg;
if (unlikely(list_empty(&cpufreq_policy_list))) {
/* if all ->init() calls failed, unregister */
ret = -ENODEV;
pr_debug("%s: No CPU initialized for driver %s\n", __func__,
driver_data->name);
goto err_if_unreg;
}
ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN,
"cpufreq:online",
cpuhp_cpufreq_online,
cpuhp_cpufreq_offline);
if (ret < 0)
goto err_if_unreg;
hp_online = ret;
ret = 0;
pr_debug("driver %s up and running\n", driver_data->name);
goto out;
err_if_unreg:
subsys_interface_unregister(&cpufreq_interface);
err_boost_unreg:
remove_boost_sysfs_file();
err_null_driver:
write_lock_irqsave(&cpufreq_driver_lock, flags);
cpufreq_driver = NULL;
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
out:
cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(cpufreq_register_driver);
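On the platform side, registration is usually done from the platform driver's probe path. A simplified sketch (demo_cpufreq_probe is a hypothetical name; the real cpufreq-dt probe performs OPP and regulator setup before this point) of handing the dt_cpufreq_driver instance shown later in section 4.1 to the core:
#include <linux/cpufreq.h>
#include <linux/platform_device.h>

static int demo_cpufreq_probe(struct platform_device *pdev)
{
	int ret;

	/* Only one cpufreq driver can be registered at a time (-EEXIST otherwise). */
	ret = cpufreq_register_driver(&dt_cpufreq_driver);
	if (ret)
		dev_err(&pdev->dev, "failed to register cpufreq driver: %d\n", ret);

	return ret;
}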
3.2 Registering a cpufreq_governor
int cpufreq_register_governor(struct cpufreq_governor *governor)
{
int err;
if (!governor)
return -EINVAL;
// Check whether cpufreq is enabled
if (cpufreq_disabled())
return -ENODEV;
mutex_lock(&cpufreq_governor_mutex);
err = -EBUSY;
// Add the governor to the list only if no governor with the same name is already registered
if (!find_governor(governor->name)) {
err = 0;
list_add(&governor->governor_list, &cpufreq_governor_list);
}
mutex_unlock(&cpufreq_governor_mutex);
return err;
}
EXPORT_SYMBOL_GPL(cpufreq_register_governor);
4. Core Data Structures
4.1 cpufreq_driver
struct cpufreq_driver {
char name[CPUFREQ_NAME_LEN]; // name of the cpufreq driver
u16 flags;
void *driver_data;
/* needed by all drivers */
int (*init)(struct cpufreq_policy *policy);
int (*verify)(struct cpufreq_policy_data *policy);
/* define one out of two */
// For drivers that pick frequencies internally (e.g. intel_pstate): apply the policy limits instead of setting an explicit frequency
int (*setpolicy)(struct cpufreq_policy *policy);
// Set the CPU to the requested target frequency
int (*target)(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation); /* Deprecated */
int (*target_index)(struct cpufreq_policy *policy,
unsigned int index);
// Fast, non-sleeping frequency-switch method (used by schedutil's fast path)
unsigned int (*fast_switch)(struct cpufreq_policy *policy,
unsigned int target_freq);
/*
* ->fast_switch() replacement for drivers that use an internal
* representation of performance levels and can pass hints other than
* the target performance level to the hardware. This can only be set
* if ->fast_switch is set too, because in those cases (under specific
* conditions) scale invariance can be disabled, which causes the
* schedutil governor to fall back to the latter.
*/
void (*adjust_perf)(unsigned int cpu,
unsigned long min_perf,
unsigned long target_perf,
unsigned long capacity);
/*
* Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION
* unset.
*
* get_intermediate should return a stable intermediate frequency
* platform wants to switch to and target_intermediate() should set CPU
* to that frequency, before jumping to the frequency corresponding
* to 'index'. Core will take care of sending notifications and driver
* doesn't have to handle them in target_intermediate() or
* target_index().
*
* Drivers can return '0' from get_intermediate() in case they don't
* wish to switch to intermediate frequency for some target frequency.
* In that case core will directly call ->target_index().
*/
unsigned int (*get_intermediate)(struct cpufreq_policy *policy,
unsigned int index);
int (*target_intermediate)(struct cpufreq_policy *policy,
unsigned int index);
/* should be defined, if possible, return 0 on error */
// Return the current frequency of the given CPU
unsigned int (*get)(unsigned int cpu);
/* Called to update policy limits on firmware notifications. */
void (*update_limits)(unsigned int cpu);
/* optional */
// Return the frequency limit imposed by the BIOS/firmware
int (*bios_limit)(int cpu, unsigned int *limit);
int (*online)(struct cpufreq_policy *policy);
int (*offline)(struct cpufreq_policy *policy);
int (*exit)(struct cpufreq_policy *policy);
int (*suspend)(struct cpufreq_policy *policy);
int (*resume)(struct cpufreq_policy *policy);
/* Will be called after the driver is fully initialized */
// Called after the cpufreq driver has been fully initialized
void (*ready)(struct cpufreq_policy *policy);
struct freq_attr **attr;
/* platform specific boost support code */
bool boost_enabled;
int (*set_boost)(struct cpufreq_policy *policy, int state);
/*
* Set by drivers that want to register with the energy model after the
* policy is properly initialized, but before the governor is started.
*/
void (*register_em)(struct cpufreq_policy *policy);
};
As a concrete example, the generic cpufreq-dt driver populates these callbacks as follows:
static struct cpufreq_driver dt_cpufreq_driver = {
.flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK |
CPUFREQ_IS_COOLING_DEV,
.verify = cpufreq_generic_frequency_table_verify,
.target_index = set_target,
.get = cpufreq_generic_get,
.init = cpufreq_init,
.exit = cpufreq_exit,
.online = cpufreq_online,
.offline = cpufreq_offline,
.register_em = cpufreq_register_em_with_opp,
.name = "cpufreq-dt",
.attr = cpufreq_dt_attr,
.suspend = cpufreq_generic_suspend,
};
4.2 cpufreq_policy
struct cpufreq_policy {
/* CPUs sharing clock, require sw coordination */
cpumask_var_t cpus; /* Online CPUs only */
cpumask_var_t related_cpus; /* Online + Offline CPUs */
cpumask_var_t real_cpus; /* Related and present */
unsigned int shared_type; /* ACPI: ANY or ALL affected CPUs
should set cpufreq */
unsigned int cpu; /* cpu managing this policy, must be online */
struct clk *clk;
struct cpufreq_cpuinfo cpuinfo;/* see above */
// Frequency range currently allowed for scaling
unsigned int min; /* in kHz */
unsigned int max; /* in kHz */
unsigned int cur; /* in kHz, only needed if cpufreq
* governors are used */
unsigned int suspend_freq; /* freq to set during suspend */
unsigned int policy; /* see above */
unsigned int last_policy; /* policy before unplug */
// Governor attached to this policy, plus its private data
struct cpufreq_governor *governor; /* see below */
void *governor_data;
char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */
struct work_struct update; /* if update_policy() needs to be
* called, but you're in IRQ context */
// cpufreq QoS constraints (min/max frequency requests)
struct freq_constraints constraints;
struct freq_qos_request *min_freq_req;
struct freq_qos_request *max_freq_req;
// Frequency table currently available to this policy
struct cpufreq_frequency_table *freq_table;
enum cpufreq_table_sorting freq_table_sorted;
// Node in the global list of cpufreq policies
struct list_head policy_list;
struct kobject kobj;
struct completion kobj_unregister;
/*
* The rules for this semaphore:
* - Any routine that wants to read from the policy structure will
* do a down_read on this semaphore.
* - Any routine that will write to the policy structure and/or may take away
* the policy altogether (eg. CPU hotplug), will hold this lock in write
* mode before doing so.
*/
struct rw_semaphore rwsem;
/*
* Fast switch flags:
* - fast_switch_possible should be set by the driver if it can
* guarantee that frequency can be changed on any CPU sharing the
* policy and that the change will affect all of the policy CPUs then.
* - fast_switch_enabled is to be set by governors that support fast
* frequency switching with the help of cpufreq_enable_fast_switch().
*/
bool fast_switch_possible;
bool fast_switch_enabled;
/*
* Set if the CPUFREQ_GOV_STRICT_TARGET flag is set for the current
* governor.
*/
bool strict_target;
/*
* Set if inefficient frequencies were found in the frequency table.
* This indicates if the relation flag CPUFREQ_RELATION_E can be
* honored.
*/
bool efficiencies_available;
/*
* Preferred average time interval between consecutive invocations of
* the driver to set the frequency for this policy. To be set by the
* scaling driver (0, which is the default, means no preference).
*/
// Preferred interval between consecutive frequency updates, in us
unsigned int transition_delay_us;
/*
* Remote DVFS flag (Not added to the driver structure as we don't want
* to access another structure from scheduler hotpath).
*
* Should be set if CPUs can do DVFS on behalf of other CPUs from
* different cpufreq policies.
*/
bool dvfs_possible_from_any_cpu;
/* Per policy boost enabled flag. */
bool boost_enabled;
/* Cached frequency lookup from cpufreq_driver_resolve_freq. */
unsigned int cached_target_freq;
unsigned int cached_resolved_idx;
/* Synchronization for frequency transitions */
bool transition_ongoing; /* Tracks transition status */
spinlock_t transition_lock;
wait_queue_head_t transition_wait;
struct task_struct *transition_task; /* Task which is doing the transition */
/* cpufreq-stats */
struct cpufreq_stats *stats;
/* For cpufreq driver's internal use */
void *driver_data;
/* Pointer to the cooling device if used for thermal mitigation */
struct thermal_cooling_device *cdev;
struct notifier_block nb_min;
struct notifier_block nb_max;
};
struct cpufreq_cpuinfo {
unsigned int max_freq;
unsigned int min_freq;
/* in 10^(-9) s = nanoseconds */
unsigned int transition_latency;
};
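For illustration, a hypothetical driver ->init() callback (names prefixed demo_ are made up) shows how these fields typically get populated: cpufreq_generic_init() installs the frequency table, sets cpuinfo.transition_latency and marks all CPUs as sharing the policy, and driver_data can hold per-policy private state:
#include <linux/cpufreq.h>

/* Hypothetical frequency table, in kHz. */
static struct cpufreq_frequency_table demo_freq_table[] = {
	{ .frequency = 600000 },
	{ .frequency = 1200000 },
	{ .frequency = 1800000 },
	{ .frequency = CPUFREQ_TABLE_END },
};

/* Hypothetical ->init(): called once per policy when its first CPU comes online. */
static int demo_cpufreq_init(struct cpufreq_policy *policy)
{
	/* Installs demo_freq_table, sets cpuinfo.transition_latency and
	 * marks all CPUs as belonging to this policy. */
	cpufreq_generic_init(policy, demo_freq_table, 300 * 1000); /* 300 us latency, in ns */
	policy->driver_data = NULL;	/* per-policy private data would go here */
	return 0;
}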
4.3 cpufreq_governor
struct cpufreq_governor {
char name[CPUFREQ_NAME_LEN]; // governor name
int (*init)(struct cpufreq_policy *policy);
void (*exit)(struct cpufreq_policy *policy);
int (*start)(struct cpufreq_policy *policy);
void (*stop)(struct cpufreq_policy *policy);
void (*limits)(struct cpufreq_policy *policy);
// Callbacks backing the userspace governor's scaling_setspeed interface
ssize_t (*show_setspeed) (struct cpufreq_policy *policy,
char *buf);
int (*store_setspeed) (struct cpufreq_policy *policy,
unsigned int freq);
struct list_head governor_list;
struct module *owner;
u8 flags;
};
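Tying the pieces together, a minimal governor only needs to fill in a couple of these callbacks and register itself. The purely illustrative sketch below pins each policy to its maximum frequency on every limits() update, which is essentially what the in-tree performance governor does:
#include <linux/cpufreq.h>
#include <linux/module.h>

/* Illustrative governor: push the policy to its maximum allowed frequency. */
static void demo_gov_limits(struct cpufreq_policy *policy)
{
	__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
}

static struct cpufreq_governor demo_gov = {
	.name	= "demo_max",
	.limits	= demo_gov_limits,
	.owner	= THIS_MODULE,
};

static int __init demo_gov_register(void)
{
	return cpufreq_register_governor(&demo_gov);
}
module_init(demo_gov_register);

static void __exit demo_gov_unregister(void)
{
	cpufreq_unregister_governor(&demo_gov);
}
module_exit(demo_gov_unregister);

MODULE_LICENSE("GPL");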
5. How the governors work
5.1 schedutil
schedutil is the most commonly used governor. Its core idea is to compute a target frequency from the CPU utilization (load) tracked by the scheduler and then adjust frequency and voltage accordingly.
// kernel/kernel/sched/cpufreq_schedutil.c
static void sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned int next_f;
raw_spin_lock(&sg_policy->update_lock);
// 1. Update this CPU's IO boost state
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
// Bypass the rate limit if deadline tasks require an immediate frequency update
ignore_dl_rate_limit(sg_cpu, sg_policy);
// 2. Check whether a frequency update is due
if (sugov_should_update_freq(sg_policy, time)) {
// 3. Compute the target frequency
next_f = sugov_next_freq_shared(sg_cpu, time);
if (sg_policy->policy->fast_switch_enabled)
// 4. Fast path: switch the frequency directly in this context
sugov_fast_switch(sg_policy, time, next_f);
else
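// Slow path: defer the frequency change to the sugov worker thread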
sugov_deferred_update(sg_policy, time, next_f);
}
raw_spin_unlock(&sg_policy->update_lock);
}
5.1.1 Updating the CPU's IO boost state
While a CPU is waiting for an I/O operation to complete, its cycles sit idle, so the tracked utilization understates how much performance the IO-bound task actually needs; the IO boost compensates by temporarily inflating the utilization after IO wakeups.
/**
* sugov_iowait_boost() - Updates the IO boost status of a CPU.
* @sg_cpu: the sugov data for the CPU to boost
* @time: the update time from the caller
* @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
*
* Each time a task wakes up after an IO operation, the CPU utilization can be
* boosted to a certain utilization which doubles at each "frequent and
* successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
* of the maximum OPP.
*
* To keep doubling, an IO boost has to be requested at least once per tick,
* otherwise we restart from the utilization of the minimum OPP.
*/
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
/* Reset boost if the CPU appears to have been idle enough */
if (sg_cpu->iowait_boost &&
sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
return;
/* Boost only tasks waking up after IO */
if (!set_iowait_boost)
return;
/* Ensure boost doubles only one time at each request */
if (sg_cpu->iowait_boost_pending)
return;
sg_cpu->iowait_boost_pending = true;
/* Double the boost at each request */
if (sg_cpu->iowait_boost) {
sg_cpu->iowait_boost =
min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
return;
}
/* First wakeup after IO: start with minimum boost */
sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}
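As a concrete illustration (assuming IOWAIT_BOOST_MIN is SCHED_CAPACITY_SCALE/8, i.e. 128 on the 1024 capacity scale): the first wakeup after IO sets the boost to 128; if IO wakeups keep arriving at least once per tick, the boost doubles to 256, then 512, then saturates at 1024 (the utilization of the maximum OPP). Once the IO wakeups stop, sugov_iowait_apply() below halves the boost on every update until it drops under 128 and is cleared.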
5.1.2 Computing the target frequency
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util = 0, max = 1;
unsigned int j;
// Use the highest utilization among the CPUs in this policy (cluster) to derive its target frequency
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
// 1. Get this CPU's utilization
j_util = sugov_get_util(j_sg_cpu);
j_max = j_sg_cpu->max;
// 2. Apply any pending IO boost to the utilization
j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
}
}
return get_next_freq(sg_policy, util, max);
}
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
unsigned long util, unsigned long max)
{
unsigned long boost;
/* No boost currently required */
if (!sg_cpu->iowait_boost)
return util;
/* Reset boost if the CPU appears to have been idle enough */
// If the CPU has been idle for longer than a tick, reset the iowait boost
if (sugov_iowait_reset(sg_cpu, time, false))
return util;
if (!sg_cpu->iowait_boost_pending) {
/*
* No boost pending; reduce the boost value.
*/
sg_cpu->iowait_boost >>= 1;
if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
sg_cpu->iowait_boost = 0;
return util;
}
}
sg_cpu->iowait_boost_pending = false;
/*
* @util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
return max(boost, util);
}
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
* @sg_policy: schedutil policy object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
* If the utilization is frequency-invariant, choose the new frequency to be
* proportional to it, that is
*
* next_freq = C * max_freq * util / max
*
* Otherwise, approximate the would-be frequency-invariant utilization by
* util_raw * (curr_freq / max_freq) which leads to
*
* next_freq = C * curr_freq * util_raw / max
*
* Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
*
* The lowest driver-supported frequency which is equal or greater than the raw
* next_freq (as calculated above) is returned, subject to policy min/max and
* cpufreq driver limitations.
*/
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned long util, unsigned long max)
{
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
util = map_util_perf(util);
freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
After finding the largest CPU utilization in the cluster, sugov maps it to a concrete CPU frequency. The mapping currently used is next_freq = C * max_freq * util / max,
where C = 1.25, meaning the selected frequency provides 1.25x the capacity currently in use, so the CPU still has roughly 20% headroom while running the present load at next_freq. The computed next_freq is not necessarily what ends up being set: the hardware only supports a discrete set of frequency levels, so the low-level driver picks a supported frequency based on next_freq and programs that instead.
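A quick worked example with assumed numbers: with util = 512, max = 1024 and max_freq = 2000000 kHz, next_freq = 1.25 * 2000000 * 512 / 1024 = 1250000 kHz; cpufreq_driver_resolve_freq() then picks the lowest table frequency at or above this value, say 1300000 kHz if that is the closest supported OPP.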
5.1.3 Fast frequency switching
The cpufreq policy and the target frequency are passed down, and the switch is carried out on the fast (non-sleeping) path.
static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
if (sugov_update_next_freq(sg_policy, time, next_freq))
cpufreq_driver_fast_switch(sg_policy->policy, next_freq);
}
/**
* cpufreq_driver_fast_switch - Carry out a fast CPU frequency switch.
* @policy: cpufreq policy to switch the frequency for.
* @target_freq: New frequency to set (may be approximate).
*
* Carry out a fast frequency switch without sleeping.
*
* The driver's ->fast_switch() callback invoked by this function must be
* suitable for being called from within RCU-sched read-side critical sections
* and it is expected to select the minimum available frequency greater than or
* equal to @target_freq (CPUFREQ_RELATION_L).
*
* This function must not be called if policy->fast_switch_enabled is unset.
*
* Governors calling this function must guarantee that it will never be invoked
* twice in parallel for the same policy and that it will never be called in
* parallel with either ->target() or ->target_index() for the same policy.
*
* Returns the actual frequency set for the CPU.
*
* If 0 is returned by the driver's ->fast_switch() callback to indicate an
* error condition, the hardware configuration must be preserved.
*/
unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
unsigned int target_freq)
{
unsigned int freq;
unsigned int old_target_freq = target_freq;
int cpu;
target_freq = clamp_val(target_freq, policy->min, policy->max);
trace_android_vh_cpufreq_fast_switch(policy, target_freq, old_target_freq);
// Ask the driver's fast_switch() callback to switch to the (clamped) target frequency
freq = cpufreq_driver->fast_switch(policy, target_freq);
if (!freq)
return 0;
policy->cur = freq;
arch_set_freq_scale(policy->related_cpus, freq,
policy->cpuinfo.max_freq);
cpufreq_stats_record_transition(policy, freq);
cpufreq_times_record_transition(policy, freq);
trace_android_rvh_cpufreq_transition(policy);
if (trace_cpu_frequency_enabled()) {
for_each_cpu(cpu, policy->cpus)
trace_cpu_frequency(freq, cpu);
}
return freq;
}
EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
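On the driver side, fast_switch() runs in scheduler context and must not sleep, so it usually boils down to a table lookup plus a single register write or firmware doorbell. A hypothetical MMIO-based sketch (the demo_ names and the perf-level register are invented for illustration):
#include <linux/cpufreq.h>
#include <linux/io.h>

/* Hypothetical private data: one memory-mapped performance-level register. */
struct demo_fs_priv {
	void __iomem *perf_reg;
};

static unsigned int demo_fast_switch(struct cpufreq_policy *policy,
				     unsigned int target_freq)
{
	struct demo_fs_priv *priv = policy->driver_data;
	struct cpufreq_frequency_table *pos = policy->freq_table;
	unsigned int best = 0;
	int index = -1, i;

	/* Pick the lowest supported frequency >= target_freq (CPUFREQ_RELATION_L). */
	for (i = 0; pos[i].frequency != CPUFREQ_TABLE_END; i++) {
		if (pos[i].frequency == CPUFREQ_ENTRY_INVALID)
			continue;
		if (pos[i].frequency >= target_freq &&
		    (!best || pos[i].frequency < best)) {
			best = pos[i].frequency;
			index = i;
		}
	}
	if (index < 0)
		return 0;		/* 0 tells the core the switch failed */

	writel_relaxed(index, priv->perf_reg);	/* single non-sleeping MMIO write */
	return best;
}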
5.2 ondemand
TBD
6. When the schedutil governor is triggered
1) An rt or deadline task is enqueued or dequeued
2) The CFS utilization of a CPU changes
3) A task sleeping in iowait is woken up
Scheduling events are very frequent, especially under heavy load, where many tasks run for only a few microseconds before being switched out. If every event re-evaluated the CPU utilization to see whether the frequency should change, sugov itself would put a noticeable load on the system. Therefore not every trigger actually performs a frequency check: sugov enforces a minimum interval between frequency updates, and requests arriving within that interval are filtered out. This minimum interval is not absolute, however; in specific situations (for example when the cpufreq core changes the range within which sugov is allowed to scale), the interval check is skipped. A simplified sketch of this check follows.
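The check lives in sugov_should_update_freq(); the following is a simplified paraphrase rather than verbatim kernel code (field names follow sugov's internal sugov_policy structure in kernel/sched/cpufreq_schedutil.c):
/* Simplified paraphrase of sugov's rate limiting (not verbatim kernel code). */
static bool demo_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	/* Policy limits just changed (e.g. min/max updated): update immediately. */
	if (unlikely(sg_policy->limits_changed)) {
		sg_policy->limits_changed = false;
		sg_policy->need_freq_update = true;
		return true;
	}

	/* Otherwise honour the minimum interval between frequency updates. */
	delta_ns = time - sg_policy->last_freq_update_time;
	return delta_ns >= sg_policy->freq_update_delay_ns;
}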
7. Possible optimizations
Depending on requirements or the usage scenario: tune sugov's minimum frequency-update interval, refine the mapping from utilization to frequency, or feed knowledge of the actual user-space scenario into the frequency/voltage decisions.
8. Source files
kernel/kernel/sched/cpufreq_schedutil.c
kernel/drivers/cpufreq/cpufreq.c
kernel/drivers/cpufreq/cpufreq-dt.c
kernel/include/linux/cpufreq.h