@@ -5556,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
 
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
 {
-        sched_domain_debug_enabled = 1;
+        sched_debug_enabled = 1;
 
         return 0;
 }
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+        return sched_debug_enabled;
+}
 
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                   struct cpumask *groupmask)
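Aside: the hunk above renames the debug switch and adds a sched_debug() accessor that later hunks also stub out as false when CONFIG_SCHED_DEBUG is off. A minimal userspace sketch of that pattern, with MY_DEBUG, my_debug() and the argv check as invented stand-ins for CONFIG_SCHED_DEBUG, sched_debug() and the early_param hook (not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define MY_DEBUG 1                      /* stand-in for CONFIG_SCHED_DEBUG */

#if MY_DEBUG
static int my_debug_enabled;            /* set from a "boot parameter" */
static inline bool my_debug(void) { return my_debug_enabled; }
#else
static inline bool my_debug(void) { return false; }  /* folds away when disabled */
#endif

int main(int argc, char **argv)
{
#if MY_DEBUG
        my_debug_enabled = (argc > 1);  /* e.g. "./a.out debug" */
#endif
        (void)argv;
        if (my_debug())
                printf("debug checks enabled\n");
        return 0;
}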
@@ -5604,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                         break;
                 }
 
-                if (!group->sgp->power) {
+                /*
+                 * Even though we initialize ->power to something semi-sane,
+                 * we leave power_orig unset. This allows us to detect if
+                 * domain iteration is still funny without causing /0 traps.
+                 */
+                if (!group->sgp->power_orig) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: domain->cpu_power not "
                                         "set\n");
@@ -5652,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
         int level = 0;
 
-        if (!sched_domain_debug_enabled)
+        if (!sched_debug_enabled)
                 return;
 
         if (!sd) {
@@ -5673,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 }
 #else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+        return false;
+}
 #endif /* CONFIG_SCHED_DEBUG */
 
 static int sd_degenerate(struct sched_domain *sd)
@@ -5994,6 +6008,44 @@ struct sched_domain_topology_level {
         struct sd_data data;
 };
 
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+        const struct cpumask *span = sched_domain_span(sd);
+        struct sd_data *sdd = sd->private;
+        struct sched_domain *sibling;
+        int i;
+
+        for_each_cpu(i, span) {
+                sibling = *per_cpu_ptr(sdd->sd, i);
+                if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+                        continue;
+
+                cpumask_set_cpu(i, sched_group_mask(sg));
+        }
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+        return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
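To make the new helpers concrete, here is the same "first CPU in both the group span and the iteration mask" selection done in userspace, with 64-bit words standing in for struct cpumask; first_and() is a made-up stand-in for cpumask_first_and().

#include <stdint.h>
#include <stdio.h>

/* Lowest set bit common to both masks, or -1 if the intersection is empty. */
static int first_and(uint64_t a, uint64_t b)
{
        uint64_t both = a & b;
        return both ? __builtin_ctzll(both) : -1;
}

int main(void)
{
        uint64_t group_span = 0xF0;     /* group covers CPUs 4-7 */
        uint64_t group_mask = 0x30;     /* only CPUs 4 and 5 may balance from here */

        printf("balance cpu = %d\n", first_and(group_span, group_mask));  /* prints 4 */
        return 0;
}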
@@ -6012,15 +6064,19 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                 if (cpumask_test_cpu(i, covered))
                         continue;
 
+                child = *per_cpu_ptr(sdd->sd, i);
+
+                /* See the comment near build_group_mask(). */
+                if (!cpumask_test_cpu(i, sched_domain_span(child)))
+                        continue;
+
                 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
                                 GFP_KERNEL, cpu_to_node(cpu));
 
                 if (!sg)
                         goto fail;
 
                 sg_span = sched_group_cpus(sg);
-
-                child = *per_cpu_ptr(sdd->sd, i);
                 if (child->child) {
                         child = child->child;
                         cpumask_copy(sg_span, sched_domain_span(child));
@@ -6030,13 +6086,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                 cpumask_or(covered, covered, sg_span);
 
                 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
-                atomic_inc(&sg->sgp->ref);
+                if (atomic_inc_return(&sg->sgp->ref) == 1)
+                        build_group_mask(sd, sg);
 
+                /*
+                 * Initialize sgp->power such that even if we mess up the
+                 * domains and no possible iteration will get us here, we won't
+                 * die on a /0 trap.
+                 */
+                sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+
+                /*
+                 * Make sure the first group of this domain contains the
+                 * canonical balance cpu. Otherwise the sched_domain iteration
+                 * breaks. See update_sg_lb_stats().
+                 */
                 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
-                    cpumask_first(sg_span) == cpu) {
-                        WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+                    group_balance_cpu(sg) == cpu)
                         groups = sg;
-                }
 
                 if (!first)
                         first = sg;
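The defensive seeding above scales with the group's weight. A small sketch under the assumption that the scale unit is 1024 per CPU (the helper name is invented):

#include <stdio.h>

#define POWER_SCALE 1024UL              /* assumed stand-in for SCHED_POWER_SCALE */

static unsigned long seed_group_power(unsigned int nr_cpus_in_span)
{
        return POWER_SCALE * nr_cpus_in_span;
}

int main(void)
{
        unsigned long power = seed_group_power(4);

        /* Any ratio computed before the real update is merely inaccurate, never a /0. */
        printf("seeded power %lu, load ratio %lu\n", power, 8192UL / power);
        return 0;
}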
@@ -6109,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
                 cpumask_clear(sched_group_cpus(sg));
                 sg->sgp->power = 0;
+                cpumask_setall(sched_group_mask(sg));
 
                 for_each_cpu(j, span) {
                         if (get_group(j, sdd, NULL) != group)
@@ -6150,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
                 sg = sg->next;
         } while (sg != sd->groups);
 
-        if (cpu != group_first_cpu(sg))
+        if (cpu != group_balance_cpu(sg))
                 return;
 
         update_group_power(sd, cpu);
@@ -6200,11 +6268,8 @@ int sched_domain_level_max;
 
 static int __init setup_relax_domain_level(char *str)
 {
-        unsigned long val;
-
-        val = simple_strtoul(str, NULL, 0);
-        if (val < sched_domain_level_max)
-                default_relax_domain_level = val;
+        if (kstrtoint(str, 0, &default_relax_domain_level))
+                pr_warn("Unable to set relax_domain_level\n");
 
         return 1;
 }
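A userspace analogue of the parsing change, with strtol playing the role of kstrtoint: a failed conversion is reported instead of silently storing whatever an unchecked conversion produced; parse_int() is a hypothetical helper, not a kernel function.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Returns 0 on success, -1 if str is not a clean integer in range. */
static int parse_int(const char *str, int *out)
{
        char *end;
        long val;

        errno = 0;
        val = strtol(str, &end, 0);
        if (errno || end == str || *end != '\0' || val < INT_MIN || val > INT_MAX)
                return -1;
        *out = (int)val;
        return 0;
}

int main(void)
{
        int level;

        if (parse_int("2", &level))
                fprintf(stderr, "Unable to set relax_domain_level\n");
        else
                printf("relax_domain_level = %d\n", level);
        return 0;
}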
@@ -6314,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
 #ifdef CONFIG_NUMA
 
 static int sched_domains_numa_levels;
-static int sched_domains_numa_scale;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 
 static inline int sd_local_flags(int level)
 {
-        if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+        if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
                 return 0;
 
         return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6379,14 +6443,49 @@ static const struct cpumask *sd_numa_mask(int cpu)
         return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
 }
 
+static void sched_numa_warn(const char *str)
+{
+        static int done = false;
+        int i,j;
+
+        if (done)
+                return;
+
+        done = true;
+
+        printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+        for (i = 0; i < nr_node_ids; i++) {
+                printk(KERN_WARNING " ");
+                for (j = 0; j < nr_node_ids; j++)
+                        printk(KERN_CONT "%02d ", node_distance(i,j));
+                printk(KERN_CONT "\n");
+        }
+        printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+        int i;
+
+        if (distance == node_distance(0, 0))
+                return true;
+
+        for (i = 0; i < sched_domains_numa_levels; i++) {
+                if (sched_domains_numa_distance[i] == distance)
+                        return true;
+        }
+
+        return false;
+}
+
 static void sched_init_numa(void)
 {
         int next_distance, curr_distance = node_distance(0, 0);
         struct sched_domain_topology_level *tl;
         int level = 0;
         int i, j, k;
 
-        sched_domains_numa_scale = curr_distance;
         sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
         if (!sched_domains_numa_distance)
                 return;
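A self-contained sketch of the two new sanity checks on an invented 3-node distance table: symmetry of the matrix, and whether every distance also appears relative to node 0 (simplified here to a direct row-0 lookup rather than the accumulated level table the kernel consults).

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 3

static const int dist[NR_NODES][NR_NODES] = {   /* invented values */
        { 10, 20, 30 },
        { 20, 10, 20 },
        { 30, 20, 10 },
};

static bool in_row0(int d)
{
        for (int j = 0; j < NR_NODES; j++)
                if (dist[0][j] == d)
                        return true;
        return false;
}

int main(void)
{
        for (int i = 0; i < NR_NODES; i++) {
                for (int k = 0; k < NR_NODES; k++) {
                        if (dist[k][i] != dist[i][k])
                                fprintf(stderr, "Node-distance not symmetric\n");
                        if (i && !in_row0(dist[i][k]))
                                fprintf(stderr, "Node-0 not representative\n");
                }
        }
        printf("distance table checks done\n");
        return 0;
}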
@@ -6397,23 +6496,41 @@ static void sched_init_numa(void)
          *
          * Assumes node_distance(0,j) includes all distances in
          * node_distance(i,j) in order to avoid cubic time.
-         *
-         * XXX: could be optimized to O(n log n) by using sort()
          */
         next_distance = curr_distance;
         for (i = 0; i < nr_node_ids; i++) {
                 for (j = 0; j < nr_node_ids; j++) {
-                        int distance = node_distance(0, j);
-                        if (distance > curr_distance &&
-                            (distance < next_distance ||
-                             next_distance == curr_distance))
-                                next_distance = distance;
+                        for (k = 0; k < nr_node_ids; k++) {
+                                int distance = node_distance(i, k);
+
+                                if (distance > curr_distance &&
+                                    (distance < next_distance ||
+                                     next_distance == curr_distance))
+                                        next_distance = distance;
+
+                                /*
+                                 * While not a strong assumption it would be nice to know
+                                 * about cases where if node A is connected to B, B is not
+                                 * equally connected to A.
+                                 */
+                                if (sched_debug() && node_distance(k, i) != distance)
+                                        sched_numa_warn("Node-distance not symmetric");
+
+                                if (sched_debug() && i && !find_numa_distance(distance))
+                                        sched_numa_warn("Node-0 not representative");
+                        }
+                        if (next_distance != curr_distance) {
+                                sched_domains_numa_distance[level++] = next_distance;
+                                sched_domains_numa_levels = level;
+                                curr_distance = next_distance;
+                        } else break;
                 }
-                if (next_distance != curr_distance) {
-                        sched_domains_numa_distance[level++] = next_distance;
-                        sched_domains_numa_levels = level;
-                        curr_distance = next_distance;
-                } else break;
+
+                /*
+                 * In case of sched_debug() we verify the above assumption.
+                 */
+                if (!sched_debug())
+                        break;
         }
         /*
          * 'level' contains the number of unique distances, excluding the
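And the level extraction itself, sketched on the same invented table: repeatedly pick the smallest distance strictly greater than the current one until none is left, which yields the unique distances in increasing order. The loop shape is simplified relative to the kernel's i/j/k nesting and early-break logic.

#include <stdio.h>

#define NR_NODES 3

static const int dist[NR_NODES][NR_NODES] = {   /* invented values */
        { 10, 20, 30 },
        { 20, 10, 20 },
        { 30, 20, 10 },
};

int main(void)
{
        int curr = dist[0][0];                  /* the local distance */
        int levels[NR_NODES * NR_NODES];
        int nlevels = 0;

        for (;;) {
                int next = curr;

                for (int i = 0; i < NR_NODES; i++) {
                        for (int k = 0; k < NR_NODES; k++) {
                                int d = dist[i][k];

                                if (d > curr && (d < next || next == curr))
                                        next = d;
                        }
                }
                if (next == curr)
                        break;                  /* no larger distance left */
                levels[nlevels++] = next;
                curr = next;
        }

        for (int i = 0; i < nlevels; i++)
                printf("level %d: distance %d\n", i, levels[i]);  /* 20, then 30 */
        return 0;
}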
@@ -6525,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
                         *per_cpu_ptr(sdd->sg, j) = sg;
 
-                        sgp = kzalloc_node(sizeof(struct sched_group_power),
+                        sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
                                         GFP_KERNEL, cpu_to_node(j));
                         if (!sgp)
                                 return -ENOMEM;
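This one-line allocation change relies on the common "struct plus trailing bitmap in one allocation" pattern, presumably what sched_group_mask() resolves into. A sketch with a hypothetical struct:

#include <stdio.h>
#include <stdlib.h>

struct group_power {
        unsigned long power;
        unsigned long mask[];           /* trailing cpumask-like bitmap */
};

int main(void)
{
        size_t mask_bytes = sizeof(unsigned long);      /* enough for 64 CPUs */
        struct group_power *gp = calloc(1, sizeof(*gp) + mask_bytes);

        if (!gp)
                return 1;
        gp->mask[0] |= 1UL << 3;        /* mark CPU 3 in the trailing bitmap */
        printf("mask word: %#lx\n", gp->mask[0]);
        free(gp);
        return 0;
}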
@@ -6578,14 +6695,14 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
         if (!sd)
                 return child;
 
-        set_domain_attribute(sd, attr);
         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
         if (child) {
                 sd->level = child->level + 1;
                 sched_domain_level_max = max(sched_domain_level_max, sd->level);
                 child->parent = sd;
         }
         sd->child = child;
+        set_domain_attribute(sd, attr);
 
         return sd;
 }