From: Bijan Mottahedeh Date: Fri, 29 Jan 2016 02:25:03 +0000 (-0800) Subject: LDoms CPU Hotplug - fix interrupt redistribution. X-Git-Tag: v4.1.12-92~163^2~17 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=2004332bc65be614deae5586a7564434f467b12c;p=users%2Fjedix%2Flinux-maple.git LDoms CPU Hotplug - fix interrupt redistribution. Orabug: 22623753 - Disable cpu timer only for hot-remove and not for hot-add - Update interrupt affinities before interrupt redistribution - Default to simple round-robin interrupt redistribution for ldoms Signed-off-by: Bijan Mottahedeh (cherry picked from commit 40110bd3bf1d2188719cca6f7a32df7d722f42be) (cherry picked from commit 69910784aff4cad929ed7b15b744249da57ffc01) --- diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h index 1c163db85a3fd..c3c24f02a6ac8 100644 --- a/arch/sparc/include/asm/irq_64.h +++ b/arch/sparc/include/asm/irq_64.h @@ -61,7 +61,7 @@ unsigned int irq_alloc(unsigned int dev_handle, unsigned int dev_ino); void irq_free(unsigned int irq); void __init init_IRQ(void); -void fixup_irqs(void); +void fixup_irqs(cpumask_t *, bool); static inline void set_softint(unsigned long bits) { diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c index e69ec0e3f1552..bb413143afc65 100644 --- a/arch/sparc/kernel/cpumap.c +++ b/arch/sparc/kernel/cpumap.c @@ -55,6 +55,14 @@ struct cpuinfo_tree { static struct cpuinfo_tree *cpuinfo_tree; +int dbgcit; /* cpuinfo tree debug */ + +#define citdbg(fmt, args...) \ + do { \ + if (dbgcit) \ + pr_info("%s " fmt, __func__, ##args); \ + } while (0) + static u16 cpu_distribution_map[NR_CPUS]; static DEFINE_SPINLOCK(cpu_map_lock); @@ -92,6 +100,104 @@ static const int generic_iterate_method[] = { }; +/* + * The cpuinfo tree is rebuilt during a cpu hotplug operation, either + * directly through cpu_map_rebuild() or indirectly through map_to_cpu(), + * the latter case happening if the number of online cpus is different + * than the number of cpus in the tree. + * + * There were three paths to tree rebuild originally as depicted below, + * one during cpu hot-add, one during cpu hot-remove, and during irq enable. + * In addition, __cpu_up() now directly calls cpu_map_rebuild() during + * hot-add processing. + * + * The tree can be accessed however when enabling interrupts. This is not + * an issue for hot-remove since cpu_map_rebuild() is called with all cpus + * paused and interrupts disabled during a stop_machine() call. This may + * be an issue however for hot-add since __cpu_up() and fixup_irqs() are + * called with other cpus running and interrupts enabled. + * + * There is no issue however if simple_map_to_cpu() is used. + * + * +irq_enable() + * | + * | +dr_cpu_configure() + * | | + * | +->fixup_irqs() + * | | + * | +->irq_set_affinity() + * | | + * +---+->irq_choose_cpu() + * | + * +->map_to_cpu() + * | + * +------------>_map_to_cpu() + * | + * +--------------------+->_cpu_map_rebuild() + * | | + * | +->build_cpuinfo_tree() + * +--+->cpu_map_rebuild() + * | | + * | +__cpu_disable() + * | + * +__cpu_up() + * + * + * set_proc_ids() iteraters through all "exec-unit" nodes and calls + * mark_proc_ids() to assign the same proc_id to all cpus pointing to + * the "exec-unit" unit. This means that if a core has multiple + * pipelines shared by all strands in the core, each strand would be + * assigned a proc_id twice, the second overwriting the first and thus + * hiding one of the pipelines. 
On a T5, where each core has two pipelines,
+ * the number of reported pipelines is in fact half of what it should be.
+ * The increment_rover() algorithm consequently doesn't work on all platforms.
+ *
+ *
+ * iterate_cpu() and increment_rover() assume that all cpus between
+ * start_index and end_index of a CPUINFO_LVL_PROC node are always present.
+ * This means that if a cpu in the middle of that range has been offlined,
+ * iterate_cpu() can actually return an offline cpu as the target for
+ * interrupt redistribution, which leads to subsequent system hangs.
+ * To deal with this problem, iterate_cpu() was called multiple times until
+ * an online cpu was returned.
+ *
+ * The following code in map_to_cpu() can lead to an infinite loop when the
+ * cpuinfo tree is used: if _map_to_cpu() causes the tree to be rebuilt, it
+ * can return the same offline cpu as before, so the loop never terminates:
+ *
+ *	while (unlikely(!cpu_online(mapped_cpu)))
+ *		mapped_cpu = _map_to_cpu(index);
+ *
+ *
+ * enumerate_cpuinfo_nodes() assumes that node ids at each level of the tree
+ * are monotonically increasing, which is not necessarily the case for
+ * ldoms, e.g. lower cpu ids can have higher core ids.  If this assumption
+ * is broken, the number of calculated nodes can be less than the number
+ * of actual nodes required to represent the cpu topology.  This can lead to
+ * data corruption when the tree is iterated.  Testing showed illegal index
+ * values in iterate_cpu() and subsequent panics and hangs.
+ * Using bitmaps for nodes, cores, and procs fixed the illegal index problem
+ * and significantly reduced the number of panics.  However, one of those
+ * panics still happens, less frequently but consistently, sometime during
+ * or after the sched domain rebuild that is part of hotplug processing.
+ * No panic happens when the cpuinfo_tree method is bypassed and the default
+ * simple_map_to_cpu() method is used.
+ *
+ *
+ * Furthermore, no documentation exists to show actual measured benefits
+ * of the cpuinfo tree.  For all these reasons, ldoms defaults to
+ * simple_map_to_cpu().
+ */
+#ifdef CONFIG_SUN_LDOMS
+/*
+ * Default to simple_map_to_cpu() for LDoms.
+ */ +static inline struct cpuinfo_tree *build_cpuinfo_tree(void) +{ + return NULL; +} +#else static int cpuinfo_id(int cpu, int level) { int id; @@ -124,6 +230,14 @@ static int enumerate_cpuinfo_nodes(struct cpuinfo_level *tree_level) { int prev_id[CPUINFO_LVL_MAX]; int i, n, num_nodes; +#ifdef DBGCIT + int c, m; + cpumask_t node_mask, core_mask, proc_mask; + + cpumask_clear(&node_mask); + cpumask_clear(&core_mask); + cpumask_clear(&proc_mask); +#endif for (i = CPUINFO_LVL_ROOT; i < CPUINFO_LVL_MAX; i++) { struct cpuinfo_level *lv = &tree_level[i]; @@ -139,23 +253,41 @@ static int enumerate_cpuinfo_nodes(struct cpuinfo_level *tree_level) continue; n = cpuinfo_id(i, CPUINFO_LVL_NODE); +#ifdef DBGCIT + m = n; + if (!cpumask_test_cpu(n, &node_mask)) { + cpumask_set_cpu(n, &node_mask); +#else if (n > prev_id[CPUINFO_LVL_NODE]) { +#endif tree_level[CPUINFO_LVL_NODE].num_nodes++; prev_id[CPUINFO_LVL_NODE] = n; num_nodes++; } n = cpuinfo_id(i, CPUINFO_LVL_CORE); +#ifdef DBGCIT + c = n; + if (!cpumask_test_cpu(n, &core_mask)) { + cpumask_set_cpu(n, &core_mask); +#else if (n > prev_id[CPUINFO_LVL_CORE]) { +#endif tree_level[CPUINFO_LVL_CORE].num_nodes++; prev_id[CPUINFO_LVL_CORE] = n; num_nodes++; } n = cpuinfo_id(i, CPUINFO_LVL_PROC); +#ifdef DBGCIT + if (!cpumask_test_cpu(n, &proc_mask)) { + cpumask_set_cpu(n, &proc_mask); +#else if (n > prev_id[CPUINFO_LVL_PROC]) { +#endif tree_level[CPUINFO_LVL_PROC].num_nodes++; prev_id[CPUINFO_LVL_PROC] = n; num_nodes++; } + citdbg("cpu=%d pid=%d cid=%d nid=%d\n", i, n, c, m); } tree_level[CPUINFO_LVL_ROOT].num_nodes = 1; @@ -173,6 +305,11 @@ static int enumerate_cpuinfo_nodes(struct cpuinfo_level *tree_level) n += tree_level[CPUINFO_LVL_PROC].num_nodes; tree_level[CPUINFO_LVL_PROC].end_index = n - 1; + for (i = CPUINFO_LVL_ROOT; i < CPUINFO_LVL_MAX; i++) + citdbg("level=%d nodes=%d start=%d end=%d\n", + i, tree_level[i].num_nodes, tree_level[i].start_index, + tree_level[i].end_index); + return num_nodes; } @@ -195,6 +332,7 @@ static struct cpuinfo_tree *build_cpuinfo_tree(void) new_tree = kzalloc(sizeof(struct cpuinfo_tree) + (sizeof(struct cpuinfo_node) * n), GFP_ATOMIC); + citdbg("num_nodes=%d new_tree=%p\n", n, new_tree); if (!new_tree) return NULL; @@ -270,6 +408,10 @@ static struct cpuinfo_tree *build_cpuinfo_tree(void) node->child_end = level_rover[level + 1] - 1; } + citdbg("l=%d r=%d s=%d e=%d p=%d\n", + level, level_rover[level], + node->child_start, node->child_end, + node->parent_index); /* Initialize the next node in the same level */ n = ++level_rover[level]; @@ -292,6 +434,7 @@ static struct cpuinfo_tree *build_cpuinfo_tree(void) return new_tree; } +#endif static void increment_rover(struct cpuinfo_tree *t, int node_index, int root_index, const int *rover_inc_table) @@ -338,6 +481,10 @@ static int iterate_cpu(struct cpuinfo_tree *t, unsigned int root_index) for (level = t->nodes[root_index].level; level < CPUINFO_LVL_MAX; level++) { new_index = t->nodes[index].rover; + if (new_index < 0 || (new_index >= t->total_nodes && + level != CPUINFO_LVL_PROC)) + citdbg("index=%d new_index=%d total=%d level=%d\n", + index, new_index, t->total_nodes, level); if (rover_inc_table[level] & ROVER_INC_ON_VISIT) increment_rover(t, index, root_index, rover_inc_table); @@ -363,8 +510,25 @@ static void _cpu_map_rebuild(void) * to check if the CPU is online, as that is done when the cpuinfo * tree is being built. 
*/ - for (i = 0; i < cpuinfo_tree->nodes[0].num_cpus; i++) + for (i = 0; i < cpuinfo_tree->nodes[0].num_cpus; i++) { +#ifdef DBGCIT + int cpu; + int j = 0; + + do { + cpu = iterate_cpu(cpuinfo_tree, 0); + if (cpu_online(cpu)) + break; + } while (++j < num_possible_cpus()); + + if (j) + citdbg("offline=%d\n", j); + BUG_ON(!cpu_online(cpu)); + cpu_distribution_map[i] = cpu; +#else cpu_distribution_map[i] = iterate_cpu(cpuinfo_tree, 0); +#endif + } } /* Fallback if the cpuinfo tree could not be built. CPU mapping is linear @@ -402,9 +566,16 @@ static int _map_to_cpu(unsigned int index) root_node = &cpuinfo_tree->nodes[0]; #ifdef CONFIG_HOTPLUG_CPU if (unlikely(root_node->num_cpus != num_online_cpus())) { + citdbg("cpus=%d online=%d\n", + root_node->num_cpus, num_online_cpus()); _cpu_map_rebuild(); if (!cpuinfo_tree) return simple_map_to_cpu(index); + +#ifdef DBGCIT + /* update root_node if cpuinfo_tree has changed */ + root_node = &cpuinfo_tree->nodes[0]; +#endif } #endif return cpu_distribution_map[index % root_node->num_cpus]; @@ -419,8 +590,12 @@ int map_to_cpu(unsigned int index) mapped_cpu = _map_to_cpu(index); #ifdef CONFIG_HOTPLUG_CPU +#ifdef DBGCIT + BUG_ON(!cpu_online(cpu)); +#else while (unlikely(!cpu_online(mapped_cpu))) mapped_cpu = _map_to_cpu(index); +#endif #endif spin_unlock_irqrestore(&cpu_map_lock, flag); return mapped_cpu; diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c index 70844bbf7bc8f..151f598ff60ab 100644 --- a/arch/sparc/kernel/ds.c +++ b/arch/sparc/kernel/ds.c @@ -598,6 +598,9 @@ struct dr_cpu_resp_entry { u32 str_off; }; + +static DEFINE_MUTEX(ds_dr_cpu_mutex); + #endif /* CONFIG_HOTPLUG_CPU */ @@ -1830,16 +1833,17 @@ static int __cpuinit dr_cpu_configure(struct ds_dev *ds, pr_err("ds-%llu: CPU startup failed err=%d\n", ds->id, err); dr_cpu_mark(resp, cpu, ncpus, res, stat); + cpumask_clear_cpu(cpu, mask); } } + /* Redistribute IRQs, taking into account the new cpus. */ + fixup_irqs(mask, true); + ds_cap_send(handle, resp, resp_len); kfree(resp); - /* Redistribute IRQs, taking into account the new cpus. 
*/ - fixup_irqs(); - return 0; } @@ -1914,6 +1918,7 @@ static void __cpuinit ds_dr_cpu_data_cb(ds_cb_arg_t arg, cpumask_set_cpu(cpu_list[i], &mask); } + mutex_lock(&ds_dr_cpu_mutex); if (tag->type == DR_CPU_CONFIGURE) err = dr_cpu_configure(ds, handle, req_num, &mask); else @@ -1921,6 +1926,7 @@ static void __cpuinit ds_dr_cpu_data_cb(ds_cb_arg_t arg, if (err) __dr_cpu_send_error(ds, handle, tag); + mutex_unlock(&ds_dr_cpu_mutex); } #endif /* CONFIG_HOTPLUG_CPU */ @@ -2734,7 +2740,7 @@ void ldom_power_off(void) static int ds_handle_data_nack(struct ds_dev *ds, struct ds_msg_tag *pkt) { - int rv; + int rv = 0; struct ds_data_nack *data_nack; dprintk("entered.\n"); @@ -2772,7 +2778,6 @@ static int ds_handle_data_nack(struct ds_dev *ds, struct ds_msg_tag *pkt) */ pr_err("ds-%llu: received UNKNOWN data NACK for " "handle %llx\n", ds->id, data_nack->payload.handle); - rv = 0; break; }; @@ -3764,7 +3769,7 @@ static int ds_read_ldc_msg(struct ds_dev *ds, unsigned char *buf, unsigned int bytes_read; unsigned int read_size; unsigned int delay_cnt; - int rv; + int rv = 0; bytes_left = size; bytes_read = 0; diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 865d403120415..ebc291cc642d9 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -865,10 +865,27 @@ void do_softirq_own_stack(void) } #ifdef CONFIG_HOTPLUG_CPU -void fixup_irqs(void) +static void fixup_affinity(cpumask_t *affinity, cpumask_t *mask, bool enabled) +{ + if (enabled) + cpumask_or(affinity, affinity, mask); + else { + cpumask_and(affinity, affinity, mask); + if (cpumask_empty(affinity)) { + pr_warn("IRQ mapped to cpu not in affinity list\n"); + cpumask_set_cpu(cpumask_first(cpu_online_mask), + affinity); + } + } +} + +void fixup_irqs(cpumask_t *mask, bool enabled) { unsigned int irq; + if (!enabled) + cpumask_complement(mask, mask); + for (irq = 0; irq < NR_IRQS; irq++) { struct irq_desc *desc = irq_to_desc(irq); struct irq_data *data; @@ -879,6 +896,7 @@ void fixup_irqs(void) data = irq_desc_get_irq_data(desc); raw_spin_lock_irqsave(&desc->lock, flags); if (desc->action && !irqd_is_per_cpu(data)) { + fixup_affinity(data->affinity, mask, enabled); if (data->chip->irq_set_affinity) data->chip->irq_set_affinity(data, data->affinity, @@ -887,7 +905,8 @@ void fixup_irqs(void) raw_spin_unlock_irqrestore(&desc->lock, flags); } - tick_ops->disable_irq(); + if (!enabled) + tick_ops->disable_irq(); } #endif diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index 169ef9b48b7a7..33bdfcba694f4 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -1181,6 +1181,7 @@ static void *fill_in_one_cpu(struct mdesc_handle *hp, u64 mp, int cpuid, } } + c->sock_id = -1; c->core_id = 0; c->proc_id = -1; diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 39f96a89c3dfb..d2dfa92ce8898 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1231,11 +1231,9 @@ void __init smp_setup_processor_id(void) void smp_fill_in_sib_core_maps(void) { - unsigned int i; + unsigned int i, j; for_each_present_cpu(i) { - unsigned int j; - cpumask_clear(&cpu_core_map[i]); if (cpu_data(i).core_id == 0) { cpumask_set_cpu(i, &cpu_core_map[i]); @@ -1243,14 +1241,18 @@ void smp_fill_in_sib_core_maps(void) } for_each_present_cpu(j) { - if (cpu_data(i).core_id == - cpu_data(j).core_id) + if (cpu_data(i).core_id == cpu_data(j).core_id) cpumask_set_cpu(j, &cpu_core_map[i]); } } for_each_present_cpu(i) { - unsigned int j; + cpumask_clear(&cpu_core_sib_map[i]); + 
if (cpu_data(i).sock_id == -1) { + cpumask_set_cpu(i, &cpu_core_sib_map[i]); + continue; + } + for_each_present_cpu(j) { if (cpu_data(i).sock_id == cpu_data(j).sock_id) cpumask_set_cpu(j, &cpu_core_sib_map[i]); @@ -1258,8 +1260,6 @@ void smp_fill_in_sib_core_maps(void) } for_each_present_cpu(i) { - unsigned int j; - cpumask_clear(&per_cpu(cpu_sibling_map, i)); if (cpu_data(i).proc_id == -1) { cpumask_set_cpu(i, &per_cpu(cpu_sibling_map, i)); @@ -1267,9 +1267,9 @@ void smp_fill_in_sib_core_maps(void) } for_each_present_cpu(j) { - if (cpu_data(i).proc_id == - cpu_data(j).proc_id) - cpumask_set_cpu(j, &per_cpu(cpu_sibling_map, i)); + if (cpu_data(i).proc_id == cpu_data(j).proc_id) + cpumask_set_cpu(j, &per_cpu(cpu_sibling_map, + i)); } } } @@ -1290,6 +1290,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) */ if (tlb_type != hypervisor) smp_synchronize_one_tick(cpu); + cpu_map_rebuild(); } } return ret; @@ -1336,32 +1337,42 @@ int __cpu_disable(void) int cpu = smp_processor_id(); cpuinfo_sparc *c; int i; + cpumask_var_t mask; + + if (!zalloc_cpumask_var(&mask, GFP_ATOMIC)) + return -ENOMEM; for_each_cpu(i, &cpu_core_map[cpu]) cpumask_clear_cpu(cpu, &cpu_core_map[i]); cpumask_clear(&cpu_core_map[cpu]); + for_each_cpu(i, &cpu_core_sib_map[cpu]) + cpumask_clear_cpu(cpu, &cpu_core_sib_map[i]); + cpumask_clear(&cpu_core_sib_map[cpu]); + for_each_cpu(i, &per_cpu(cpu_sibling_map, cpu)) cpumask_clear_cpu(cpu, &per_cpu(cpu_sibling_map, i)); cpumask_clear(&per_cpu(cpu_sibling_map, cpu)); c = &cpu_data(cpu); + c->sock_id = -1; c->core_id = 0; c->proc_id = -1; + /* + * Offline before fixup. + * See irq_choose_cpu(), cpu_map_rebuild(). + */ + set_cpu_online(cpu, false); smp_wmb(); + local_irq_disable(); /* don't process further interrupts */ + cpu_map_rebuild(); /* Make sure no interrupts point to this cpu. */ - fixup_irqs(); - - local_irq_enable(); - mdelay(1); - local_irq_disable(); - - set_cpu_online(cpu, false); - - cpu_map_rebuild(); + cpumask_set_cpu(cpu, mask); + fixup_irqs(mask, false); /* cpu should alreay be offlined */ + free_cpumask_var(mask); return 0; }
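
The reworked fixup_irqs() takes the cpumask affected by the hotplug operation plus an "enabled" flag: for hot-add (dr_cpu_configure()) the mask of newly started cpus is OR-ed into each IRQ's affinity, while for hot-remove (__cpu_disable()) the mask is complemented first so the departing cpu is filtered out of every affinity, falling back to the first online cpu when an IRQ's affinity would otherwise become empty. Below is a minimal, self-contained userspace sketch of that affinity-fixup logic; it is illustrative only: plain unsigned long bitmasks stand in for cpumask_t, and the cpu numbers are made-up example values, not anything taken from this patch.

#include <stdio.h>

static unsigned long online_mask = 0x0f;	/* cpus 0-3 online in this example */

static void fixup_affinity(unsigned long *affinity, unsigned long mask, int enabled)
{
	if (enabled) {
		/* hot-add: allow the newly added cpus as interrupt targets */
		*affinity |= mask;
	} else {
		/*
		 * hot-remove: 'mask' is already the complement of the departing
		 * cpus, so the AND keeps only the surviving targets
		 */
		*affinity &= mask;
		if (*affinity == 0) {
			/* IRQ was bound only to removed cpus: retarget it to the first online cpu */
			printf("IRQ mapped to cpu not in affinity list\n");
			*affinity = online_mask & -online_mask;
		}
	}
}

int main(void)
{
	unsigned long irq_affinity = 0x4;	/* IRQ bound to cpu 2 only */
	unsigned long removed = 0x4;		/* cpu 2 is being hot-removed */

	/* hot-remove path: the caller complements the mask, as fixup_irqs() does */
	fixup_affinity(&irq_affinity, ~removed, 0);
	printf("affinity after remove: 0x%lx\n", irq_affinity);

	/* hot-add path: cpu 2 comes back online */
	fixup_affinity(&irq_affinity, 0x4, 1);
	printf("affinity after add:    0x%lx\n", irq_affinity);
	return 0;
}

Built with a stock C compiler, the sketch prints the fallback warning when cpu 2 (the only allowed target) is removed, then shows cpu 2 being OR-ed back into the affinity mask on hot-add.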