
Commit 7249450

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar.

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Fix the relax_domain_level boot parameter
  sched: Validate assumptions in sched_init_numa()
  sched: Always initialize cpu-power
  sched: Fix domain iteration
  sched/rt: Fix lockdep annotation within find_lock_lowest_rq()
  sched/numa: Load balance between remote nodes
  sched/x86: Calculate booted cores after construction of sibling_mask
2 parents cd96891 + a841f8c

6 files changed: +179, -39 lines

arch/x86/kernel/smpboot.c

Lines changed: 9 additions & 0 deletions

@@ -382,6 +382,15 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 		if ((i == cpu) || (has_mc && match_llc(c, o)))
 			link_mask(llc_shared, cpu, i);
 
+	}
+
+	/*
+	 * This needs a separate iteration over the cpus because we rely on all
+	 * cpu_sibling_mask links to be set-up.
+	 */
+	for_each_cpu(i, cpu_sibling_setup_mask) {
+		o = &cpu_data(i);
+
 		if ((i == cpu) || (has_mc && match_mc(c, o))) {
 			link_mask(core, cpu, i);
 
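The comment added above is the heart of the "sched/x86: Calculate booted cores after construction of sibling_mask" fix: per-core bookkeeping is only meaningful once every cpu_sibling_mask link exists, so it moves into a second loop over cpu_sibling_setup_mask. Below is a minimal stand-alone C sketch of the same two-pass idea, using plain uint64_t bitmasks and an invented core_id[] topology; it is an illustration, not kernel code.

/*
 * Minimal userspace model of the two-pass link build-up: pass 1
 * establishes every thread-sibling link, pass 2 derives per-core
 * counts only after all links exist. Names and topology are invented.
 */
#include <stdint.h>
#include <stdio.h>

#define NCPUS 8

/* fake topology: cpu -> physical core id (two threads per core) */
static const int core_id[NCPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };

static uint64_t sibling_mask[NCPUS];

int main(void)
{
	int cpu, i;

	/* pass 1: set up all sibling links first */
	for (cpu = 0; cpu < NCPUS; cpu++)
		for (i = 0; i < NCPUS; i++)
			if (core_id[i] == core_id[cpu])
				sibling_mask[cpu] |= 1ull << i;

	/*
	 * pass 2: anything derived from the masks (such as per-core
	 * thread counts) is only correct once pass 1 has finished for
	 * every cpu; folding this into pass 1 reads partial masks.
	 */
	for (cpu = 0; cpu < NCPUS; cpu++)
		printf("cpu%d: %d thread(s) on its core\n",
		       cpu, __builtin_popcountll(sibling_mask[cpu]));

	return 0;
}
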
include/linux/sched.h

Lines changed: 11 additions & 0 deletions

@@ -877,6 +877,8 @@ struct sched_group_power {
 	 * Number of busy cpus in this group.
 	 */
 	atomic_t nr_busy_cpus;
+
+	unsigned long cpumask[0]; /* iteration mask */
 };
 
 struct sched_group {
@@ -901,6 +903,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
 	return to_cpumask(sg->cpumask);
 }
 
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+	return to_cpumask(sg->sgp->cpumask);
+}
+
 /**
  * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
  * @group: The group whose first cpu is to be returned.

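The new zero-length cpumask[0] member contributes nothing to sizeof(struct sched_group_power); the mask storage only exists because the allocation site (__sdt_alloc(), changed later in this commit) adds cpumask_size() on top of the struct. A small userspace sketch of that trailing-array idiom follows; the struct and field names are invented, not the kernel's.

/*
 * Hypothetical illustration of a trailing variable-size bitmap,
 * the same layout trick used for sched_group_power::cpumask[0].
 */
#include <stdio.h>
#include <stdlib.h>

struct group_power {
	unsigned int power;
	unsigned long mask[];		/* storage allocated separately */
};

static struct group_power *alloc_group_power(size_t mask_bytes)
{
	/*
	 * one allocation covers the struct plus the trailing mask,
	 * mirroring kzalloc_node(sizeof(...) + cpumask_size(), ...)
	 */
	return calloc(1, sizeof(struct group_power) + mask_bytes);
}

int main(void)
{
	size_t mask_bytes = 2 * sizeof(unsigned long);
	struct group_power *gp = alloc_group_power(mask_bytes);

	if (!gp)
		return 1;

	gp->mask[0] = 0x5;		/* e.g. cpus 0 and 2 may iterate */
	printf("sizeof(struct group_power) = %zu, mask[0] = %#lx\n",
	       sizeof(struct group_power), gp->mask[0]);

	free(gp);
	return 0;
}
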
kernel/sched/core.c

Lines changed: 152 additions & 35 deletions

@@ -5556,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
 
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
 {
-	sched_domain_debug_enabled = 1;
+	sched_debug_enabled = 1;
 
 	return 0;
 }
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+	return sched_debug_enabled;
+}
 
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
@@ -5604,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!group->sgp->power) {
+		/*
+		 * Even though we initialize ->power to something semi-sane,
+		 * we leave power_orig unset. This allows us to detect if
+		 * domain iteration is still funny without causing /0 traps.
+		 */
+		if (!group->sgp->power_orig) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -5652,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
 	int level = 0;
 
-	if (!sched_domain_debug_enabled)
+	if (!sched_debug_enabled)
 		return;
 
 	if (!sd) {
@@ -5673,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 }
 #else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+	return false;
+}
 #endif /* CONFIG_SCHED_DEBUG */
 
 static int sd_degenerate(struct sched_domain *sd)
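
Defining sched_debug() in both halves of the #ifdef lets the validation added to sched_init_numa() further down stay free of #ifdefs: with CONFIG_SCHED_DEBUG off the helper is a constant false and the guarded checks are compiled away. A small userspace sketch of that pattern, with a made-up MY_DEBUG symbol standing in for the config option:

/*
 * Hypothetical sketch: a predicate that is a real flag in debug builds
 * and a constant false otherwise, so callers need no #ifdefs.
 */
#include <stdbool.h>
#include <stdio.h>

#ifdef MY_DEBUG
static bool debug_enabled = true;	/* the kernel sets this via early_param() */

static inline bool my_debug(void)
{
	return debug_enabled;
}
#else
static inline bool my_debug(void)
{
	return false;			/* guarded checks become dead code */
}
#endif

int main(void)
{
	if (my_debug())
		fprintf(stderr, "expensive consistency checks would run here\n");

	printf("debug checks %s\n", my_debug() ? "enabled" : "compiled out");
	return 0;
}
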
@@ -5994,6 +6008,44 @@ struct sched_domain_topology_level {
 	struct sd_data data;
 };
 
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+	const struct cpumask *span = sched_domain_span(sd);
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *sibling;
+	int i;
+
+	for_each_cpu(i, span) {
+		sibling = *per_cpu_ptr(sdd->sd, i);
+		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+			continue;
+
+		cpumask_set_cpu(i, sched_group_mask(sg));
+	}
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
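
group_balance_cpu() deliberately picks the first CPU in the intersection of the group span and the new iteration mask, rather than cpumask_first() of the span alone; on asymmetric topologies the lowest-numbered CPU of a group may be excluded from iterating this level. A plain-bitmask sketch of that "first set bit of the AND" selection (the mask values are invented for illustration):

/*
 * Hypothetical model of group_balance_cpu(): the canonical balance cpu
 * is the lowest-numbered cpu present in both the group span and the
 * iteration mask, i.e. the first set bit of their intersection.
 */
#include <stdint.h>
#include <stdio.h>

static int first_cpu_and(uint64_t span, uint64_t iter_mask)
{
	uint64_t both = span & iter_mask;

	return both ? __builtin_ctzll(both) : -1;	/* -1 ~ nr_cpu_ids */
}

int main(void)
{
	uint64_t span      = 0x0f;	/* group covers cpus 0-3          */
	uint64_t iter_mask = 0x0c;	/* only cpus 2,3 may iterate here */

	/*
	 * cpumask_first(span) would say cpu 0, but cpu 0 never walks this
	 * domain level; the AND-based pick returns cpu 2 instead.
	 */
	printf("balance cpu = %d\n", first_cpu_and(span, iter_mask));
	return 0;
}
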
@@ -6012,15 +6064,19 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		if (cpumask_test_cpu(i, covered))
 			continue;
 
+		child = *per_cpu_ptr(sdd->sd, i);
+
+		/* See the comment near build_group_mask(). */
+		if (!cpumask_test_cpu(i, sched_domain_span(child)))
+			continue;
+
 		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 				GFP_KERNEL, cpu_to_node(cpu));
 
 		if (!sg)
 			goto fail;
 
 		sg_span = sched_group_cpus(sg);
-
-		child = *per_cpu_ptr(sdd->sd, i);
 		if (child->child) {
 			child = child->child;
 			cpumask_copy(sg_span, sched_domain_span(child));
@@ -6030,13 +6086,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		cpumask_or(covered, covered, sg_span);
 
 		sg->sgp = *per_cpu_ptr(sdd->sgp, i);
-		atomic_inc(&sg->sgp->ref);
+		if (atomic_inc_return(&sg->sgp->ref) == 1)
+			build_group_mask(sd, sg);
 
+		/*
+		 * Initialize sgp->power such that even if we mess up the
+		 * domains and no possible iteration will get us here, we won't
+		 * die on a /0 trap.
+		 */
+		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+
+		/*
+		 * Make sure the first group of this domain contains the
+		 * canonical balance cpu. Otherwise the sched_domain iteration
+		 * breaks. See update_sg_lb_stats().
+		 */
 		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
-		    cpumask_first(sg_span) == cpu) {
-			WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+		    group_balance_cpu(sg) == cpu)
 			groups = sg;
-		}
 
 		if (!first)
 			first = sg;
@@ -6109,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
 		cpumask_clear(sched_group_cpus(sg));
 		sg->sgp->power = 0;
+		cpumask_setall(sched_group_mask(sg));
 
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
@@ -6150,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 		sg = sg->next;
 	} while (sg != sd->groups);
 
-	if (cpu != group_first_cpu(sg))
+	if (cpu != group_balance_cpu(sg))
 		return;
 
 	update_group_power(sd, cpu);
@@ -6200,11 +6268,8 @@ int sched_domain_level_max;
 
 static int __init setup_relax_domain_level(char *str)
 {
-	unsigned long val;
-
-	val = simple_strtoul(str, NULL, 0);
-	if (val < sched_domain_level_max)
-		default_relax_domain_level = val;
+	if (kstrtoint(str, 0, &default_relax_domain_level))
+		pr_warn("Unable to set relax_domain_level\n");
 
 	return 1;
 }
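
The removed check compared the parsed value against sched_domain_level_max, which is still 0 when boot parameters are processed, so the comparison could never accept a value and relax_domain_level never took effect; the replacement stores whatever kstrtoint() parses and only warns on a parse error. A userspace approximation of the stricter parse follows, using strtol() since kstrtoint() is kernel-only (parse_int() and the sample inputs are invented):

/*
 * Hypothetical userspace stand-in for kstrtoint(): reject empty input,
 * trailing junk and out-of-range values instead of silently truncating.
 */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_int(const char *str, int base, int *res)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(str, &end, base);
	if (errno || end == str || *end != '\0' ||
	    val < INT_MIN || val > INT_MAX)
		return -1;			/* kstrtoint returns -errno */

	*res = (int)val;
	return 0;
}

int main(void)
{
	const char *inputs[] = { "3", "-1", "0x2", "12junk", "" };
	int i, level;

	for (i = 0; i < 5; i++) {
		if (parse_int(inputs[i], 0, &level))
			printf("'%s': unable to set relax_domain_level\n",
			       inputs[i]);
		else
			printf("'%s': relax_domain_level = %d\n",
			       inputs[i], level);
	}
	return 0;
}
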
@@ -6314,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
 #ifdef CONFIG_NUMA
 
 static int sched_domains_numa_levels;
-static int sched_domains_numa_scale;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 
 static inline int sd_local_flags(int level)
 {
-	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
 		return 0;
 
 	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6379,14 +6443,49 @@ static const struct cpumask *sd_numa_mask(int cpu)
 	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
 }
 
+static void sched_numa_warn(const char *str)
+{
+	static int done = false;
+	int i,j;
+
+	if (done)
+		return;
+
+	done = true;
+
+	printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+	for (i = 0; i < nr_node_ids; i++) {
+		printk(KERN_WARNING " ");
+		for (j = 0; j < nr_node_ids; j++)
+			printk(KERN_CONT "%02d ", node_distance(i,j));
+		printk(KERN_CONT "\n");
+	}
+	printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+	int i;
+
+	if (distance == node_distance(0, 0))
+		return true;
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		if (sched_domains_numa_distance[i] == distance)
+			return true;
+	}
+
+	return false;
+}
+
 static void sched_init_numa(void)
 {
 	int next_distance, curr_distance = node_distance(0, 0);
 	struct sched_domain_topology_level *tl;
 	int level = 0;
 	int i, j, k;
 
-	sched_domains_numa_scale = curr_distance;
 	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
 	if (!sched_domains_numa_distance)
 		return;
@@ -6397,23 +6496,41 @@ static void sched_init_numa(void)
 	 *
 	 * Assumes node_distance(0,j) includes all distances in
 	 * node_distance(i,j) in order to avoid cubic time.
-	 *
-	 * XXX: could be optimized to O(n log n) by using sort()
 	 */
 	next_distance = curr_distance;
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			int distance = node_distance(0, j);
-			if (distance > curr_distance &&
-					(distance < next_distance ||
-					 next_distance == curr_distance))
-				next_distance = distance;
+			for (k = 0; k < nr_node_ids; k++) {
+				int distance = node_distance(i, k);
+
+				if (distance > curr_distance &&
+				    (distance < next_distance ||
+				     next_distance == curr_distance))
+					next_distance = distance;
+
+				/*
+				 * While not a strong assumption it would be nice to know
+				 * about cases where if node A is connected to B, B is not
+				 * equally connected to A.
+				 */
+				if (sched_debug() && node_distance(k, i) != distance)
+					sched_numa_warn("Node-distance not symmetric");
+
+				if (sched_debug() && i && !find_numa_distance(distance))
+					sched_numa_warn("Node-0 not representative");
+			}
+			if (next_distance != curr_distance) {
+				sched_domains_numa_distance[level++] = next_distance;
+				sched_domains_numa_levels = level;
+				curr_distance = next_distance;
+			} else break;
 		}
-		if (next_distance != curr_distance) {
-			sched_domains_numa_distance[level++] = next_distance;
-			sched_domains_numa_levels = level;
-			curr_distance = next_distance;
-		} else break;
+
+		/*
+		 * In case of sched_debug() we verify the above assumption.
+		 */
+		if (!sched_debug())
+			break;
 	}
 	/*
 	 * 'level' contains the number of unique distances, excluding the
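
The rewritten loop scans the whole node_distance() matrix rather than row 0 only, still peeling off one unique distance per pass, and under sched_debug() it verifies the assumptions behind the warnings above (the matrix is symmetric, and node 0's row contains every distance that occurs). A self-contained sketch of the level extraction over an invented distance table; the "Node-0 not representative" check is left out for brevity:

/*
 * Hypothetical userspace model of the distance-level extraction in
 * sched_init_numa(): repeatedly find the smallest distance larger than
 * the current one, checking symmetry along the way.
 */
#include <stdio.h>

#define NNODES 4

static const int dist[NNODES][NNODES] = {	/* invented SLIT table */
	{ 10, 20, 20, 30 },
	{ 20, 10, 20, 20 },
	{ 20, 20, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[NNODES * NNODES];
	int nlevels = 0, curr = dist[0][0], next = curr;
	int i, k;

	for (;;) {
		/* scan the whole matrix, not just row 0, as the patch does */
		for (i = 0; i < NNODES; i++) {
			for (k = 0; k < NNODES; k++) {
				int d = dist[i][k];

				if (d > curr && (d < next || next == curr))
					next = d;
				if (dist[k][i] != d)
					printf("warn: node distance not symmetric\n");
			}
		}
		if (next == curr)
			break;			/* no larger distance left */
		levels[nlevels++] = next;
		curr = next;
	}

	printf("unique non-local distances:");
	for (i = 0; i < nlevels; i++)
		printf(" %d", levels[i]);
	printf("\n");				/* prints: 20 30 */
	return 0;
}
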
@@ -6525,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
 			*per_cpu_ptr(sdd->sg, j) = sg;
 
-			sgp = kzalloc_node(sizeof(struct sched_group_power),
+			sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sgp)
 				return -ENOMEM;
@@ -6578,14 +6695,14 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 	if (!sd)
 		return child;
 
-	set_domain_attribute(sd, attr);
 	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
 	if (child) {
 		sd->level = child->level + 1;
 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
 		child->parent = sd;
 	}
 	sd->child = child;
+	set_domain_attribute(sd, attr);
 
 	return sd;
 }
