LDoms CPU Hotplug - fix interrupt redistribution.
author	Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Fri, 29 Jan 2016 02:25:03 +0000 (18:25 -0800)
committer	Allen Pais <allen.pais@oracle.com>
Tue, 19 Apr 2016 12:21:17 +0000 (17:51 +0530)
Orabug: 22623753

- Disable cpu timer only for hot-remove and not for hot-add
- Update interrupt affinities before interrupt redistribution
- Default to simple round-robin interrupt redistribution for ldoms

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
(cherry picked from commit 40110bd3bf1d2188719cca6f7a32df7d722f42be)
(cherry picked from commit 69910784aff4cad929ed7b15b744249da57ffc01)
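
The affinity handling referred to in the second bullet boils down to: OR the newly added cpus into each IRQ's affinity on hot-add, and AND the affinity with the complement of the removed cpu on hot-remove, falling back to the first online cpu if nothing is left (see fixup_affinity() in the irq_64.c hunk below). The following is only a standalone sketch of that logic, not the kernel code: plain 64-bit bitmasks stand in for cpumask_t, the cpu numbers are made up, and fixup_affinity()/online_mask here are local stand-ins for the kernel symbols of the same name.

/* Standalone sketch only: bitmask stand-ins for cpumask_t, toy cpu numbers. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t online_mask = 0xf;		/* pretend cpus 0-3 are online */

/* Models fixup_affinity(): widen affinity on hot-add, narrow it on hot-remove. */
static uint64_t fixup_affinity(uint64_t affinity, uint64_t mask, bool enabled)
{
	if (enabled) {
		affinity |= mask;		/* hot-add: new cpus may take the IRQ */
	} else {
		affinity &= mask;		/* hot-remove: mask is ~removed_cpus */
		if (!affinity)			/* nothing left: first online cpu */
			affinity = online_mask & -online_mask;
	}
	return affinity;
}

int main(void)
{
	uint64_t irq_affinity = 0x4;	/* IRQ currently bound to cpu 2 only */
	uint64_t removed = 0x4;		/* cpu 2 is being hot-removed */
	uint64_t added = 0x30;		/* cpus 4 and 5 are being hot-added */

	/* __cpu_disable() marks the cpu offline before fixup_irqs(mask, false) */
	online_mask &= ~removed;
	irq_affinity = fixup_affinity(irq_affinity, ~removed, false);
	printf("after remove: affinity=%#llx\n", (unsigned long long)irq_affinity);

	/* dr_cpu_configure() calls fixup_irqs(mask, true) with the new cpus in mask */
	online_mask |= added;
	irq_affinity = fixup_affinity(irq_affinity, added, true);
	printf("after add:    affinity=%#llx\n", (unsigned long long)irq_affinity);
	return 0;
}

Note how the remove path mirrors __cpu_disable(): the cpu is dropped from the online mask before the fixup, so the fallback never picks the cpu that is going away.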

arch/sparc/include/asm/irq_64.h
arch/sparc/kernel/cpumap.c
arch/sparc/kernel/ds.c
arch/sparc/kernel/irq_64.c
arch/sparc/kernel/mdesc.c
arch/sparc/kernel/smp_64.c

diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h
index 1c163db85a3fdf66174f389579f4f16c070e42a1..c3c24f02a6ac8374b7fad63dabf4cca8c9649680 100644 (file)
@@ -61,7 +61,7 @@ unsigned int irq_alloc(unsigned int dev_handle, unsigned int dev_ino);
 void irq_free(unsigned int irq);
 
 void __init init_IRQ(void);
-void fixup_irqs(void);
+void fixup_irqs(cpumask_t *, bool);
 
 static inline void set_softint(unsigned long bits)
 {
diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c
index e69ec0e3f15527705b3ce28fc89ff5c0341f03fb..bb413143afc65fca8ba1d2c2f15a720393ee0590 100644 (file)
@@ -55,6 +55,14 @@ struct cpuinfo_tree {
 
 static struct cpuinfo_tree *cpuinfo_tree;
 
+int dbgcit;    /* cpuinfo tree debug */
+
+#define        citdbg(fmt, args...)                                    \
+       do {                                                    \
+               if (dbgcit)                                     \
+                       pr_info("%s " fmt, __func__, ##args);   \
+       } while (0)
+
 static u16 cpu_distribution_map[NR_CPUS];
 static DEFINE_SPINLOCK(cpu_map_lock);
 
@@ -92,6 +100,104 @@ static const int generic_iterate_method[] = {
 };
 
 
+/*
+ * The cpuinfo tree is rebuilt during a cpu hotplug operation, either
+ * directly through cpu_map_rebuild() or indirectly through map_to_cpu(),
+ * the latter case happening if the number of online cpus is different
+ * than the number of cpus in the tree.
+ *
+ * There were three paths to tree rebuild originally as depicted below,
+ * one during cpu hot-add, one during cpu hot-remove, and one during irq enable.
+ * In addition, __cpu_up() now directly calls cpu_map_rebuild() during
+ * hot-add processing.
+ *
+ * The tree can, however, also be accessed when enabling interrupts.  This
+ * is not an issue for hot-remove, since cpu_map_rebuild() is called with
+ * all cpus paused and interrupts disabled during a stop_machine() call.
+ * It may be an issue for hot-add, however, since __cpu_up() and
+ * fixup_irqs() are called with other cpus running and interrupts enabled.
+ *
+ * There is no issue however if simple_map_to_cpu() is used.
+ *
+ *     +irq_enable()
+ *     |
+ *     | +dr_cpu_configure()
+ *     | |
+ *     | +->fixup_irqs()
+ *     |   |
+ *     |   +->irq_set_affinity()
+ *     |   |
+ *     +---+->irq_choose_cpu()
+ *            |
+ *            +->map_to_cpu()
+ *               |
+ *               +------------>_map_to_cpu()
+ *                             |
+ *        +--------------------+->_cpu_map_rebuild()
+ *        |                       |
+ *        |                       +->build_cpuinfo_tree()
+ *     +--+->cpu_map_rebuild()
+ *     |  |
+ *     |  +__cpu_disable()
+ *     |
+ *     +__cpu_up()
+ *
+ *
+ * set_proc_ids() iterates through all "exec-unit" nodes and calls
+ * mark_proc_ids() to assign the same proc_id to all cpus pointing to
+ * that "exec-unit" node.  This means that if a core has multiple
+ * pipelines shared by all strands in the core, each strand would be
+ * assigned a proc_id twice, the second overwriting the first and thus
+ * hiding one of the pipelines.  On a T5, where each core has two pipelines,
+ * the number of reported pipelines is in fact half of what it should be.
+ * As a result, the increment_rover() algorithm doesn't work on all platforms.
+ *
+ *
+ * iterate_cpu() and increment_rover() assume that all cpus between
+ * start_index and end_index of a CPUINFO_LVL_PROC are always present.
+ * This means that if a cpu in the middle of that range has been offlined,
+ * iterate_cpu() can actually return an offline cpu as the target for
+ * interrupt redistribution, which leads to subsequent system hangs.
+ * To deal with this problem, iterate_cpu() was called multiple times until
+ * an online cpu was returned.
+ *
+ * The following code in map_to_cpu() can lead to an infinite loop when
+ * the cpuinfo_tree is used, because even if _map_to_cpu() causes the
+ * tree to be rebuilt, it can return the same offline cpu as before,
+ * keeping the loop spinning forever:
+ *
+ *     while (unlikely(!cpu_online(mapped_cpu)))
+ *             mapped_cpu = _map_to_cpu(index);
+ *
+ *
+ * enumerate_cpuinfo_nodes() assumes that node ids at each level of the tree
+ * are monotonically increasing, which is not necessarily the case for
+ * ldoms, e.g. lower cpu ids can have higher core ids.  If this assumption
+ * is broken, the number of calculated nodes can be less than the number
+ * of actual nodes required to represent the cpu topology.  This can lead to
+ * data corruption when the tree is iterated.  Testing showed illegal index
+ * values in iterate_cpu() and subsequent panics and hangs.
+ * Using bitmaps for nodes, cores, and procs fixed the illegal index problem
+ * and significantly reduced the number of panics.  However, one of those
+ * panics still happens, less frequently but consistently, sometime during
+ * or after the sched domain rebuild that is part of hotplug processing.
+ * No panic happens when the cpuinfo_tree method is bypassed and the default
+ * simple_map_to_cpu() method is used.
+ *
+ *
+ * Furthermore, no documentation exists to show actual measured benefits
+ * of the cpuinfo tree.  For all those reasons, ldoms defaults to
+ * simple_map_to_cpu().
+ */
+#ifdef CONFIG_SUN_LDOMS
+/*
+ * Default to simple_map_to_cpu() for LDoms.
+ */
+static inline struct cpuinfo_tree *build_cpuinfo_tree(void)
+{
+       return NULL;
+}
+#else
 static int cpuinfo_id(int cpu, int level)
 {
        int id;
@@ -124,6 +230,14 @@ static int enumerate_cpuinfo_nodes(struct cpuinfo_level *tree_level)
 {
        int prev_id[CPUINFO_LVL_MAX];
        int i, n, num_nodes;
+#ifdef DBGCIT
+       int c, m;
+       cpumask_t node_mask, core_mask, proc_mask;
+
+       cpumask_clear(&node_mask);
+       cpumask_clear(&core_mask);
+       cpumask_clear(&proc_mask);
+#endif
 
        for (i = CPUINFO_LVL_ROOT; i < CPUINFO_LVL_MAX; i++) {
                struct cpuinfo_level *lv = &tree_level[i];
@@ -139,23 +253,41 @@ static int enumerate_cpuinfo_nodes(struct cpuinfo_level *tree_level)
                        continue;
 
                n = cpuinfo_id(i, CPUINFO_LVL_NODE);
+#ifdef DBGCIT
+               m = n;
+               if (!cpumask_test_cpu(n, &node_mask)) {
+                       cpumask_set_cpu(n, &node_mask);
+#else
                if (n > prev_id[CPUINFO_LVL_NODE]) {
+#endif
                        tree_level[CPUINFO_LVL_NODE].num_nodes++;
                        prev_id[CPUINFO_LVL_NODE] = n;
                        num_nodes++;
                }
                n = cpuinfo_id(i, CPUINFO_LVL_CORE);
+#ifdef DBGCIT
+               c = n;
+               if (!cpumask_test_cpu(n, &core_mask)) {
+                       cpumask_set_cpu(n, &core_mask);
+#else
                if (n > prev_id[CPUINFO_LVL_CORE]) {
+#endif
                        tree_level[CPUINFO_LVL_CORE].num_nodes++;
                        prev_id[CPUINFO_LVL_CORE] = n;
                        num_nodes++;
                }
                n = cpuinfo_id(i, CPUINFO_LVL_PROC);
+#ifdef DBGCIT
+               if (!cpumask_test_cpu(n, &proc_mask)) {
+                       cpumask_set_cpu(n, &proc_mask);
+#else
                if (n > prev_id[CPUINFO_LVL_PROC]) {
+#endif
                        tree_level[CPUINFO_LVL_PROC].num_nodes++;
                        prev_id[CPUINFO_LVL_PROC] = n;
                        num_nodes++;
                }
+               citdbg("cpu=%d pid=%d cid=%d nid=%d\n", i, n, c, m);
        }
 
        tree_level[CPUINFO_LVL_ROOT].num_nodes = 1;
@@ -173,6 +305,11 @@ static int enumerate_cpuinfo_nodes(struct cpuinfo_level *tree_level)
        n += tree_level[CPUINFO_LVL_PROC].num_nodes;
        tree_level[CPUINFO_LVL_PROC].end_index   = n - 1;
 
+       for (i = CPUINFO_LVL_ROOT; i < CPUINFO_LVL_MAX; i++)
+               citdbg("level=%d nodes=%d start=%d end=%d\n",
+                      i, tree_level[i].num_nodes, tree_level[i].start_index,
+                      tree_level[i].end_index);
+
        return num_nodes;
 }
 
@@ -195,6 +332,7 @@ static struct cpuinfo_tree *build_cpuinfo_tree(void)
 
        new_tree = kzalloc(sizeof(struct cpuinfo_tree) +
                           (sizeof(struct cpuinfo_node) * n), GFP_ATOMIC);
+       citdbg("num_nodes=%d new_tree=%p\n", n, new_tree);
        if (!new_tree)
                return NULL;
 
@@ -270,6 +408,10 @@ static struct cpuinfo_tree *build_cpuinfo_tree(void)
                                        node->child_end =
                                            level_rover[level + 1] - 1;
                                }
+                               citdbg("l=%d r=%d s=%d e=%d p=%d\n",
+                                       level, level_rover[level],
+                                       node->child_start, node->child_end,
+                                       node->parent_index);
 
                                /* Initialize the next node in the same level */
                                n = ++level_rover[level];
@@ -292,6 +434,7 @@ static struct cpuinfo_tree *build_cpuinfo_tree(void)
 
        return new_tree;
 }
+#endif
 
 static void increment_rover(struct cpuinfo_tree *t, int node_index,
                             int root_index, const int *rover_inc_table)
@@ -338,6 +481,10 @@ static int iterate_cpu(struct cpuinfo_tree *t, unsigned int root_index)
        for (level = t->nodes[root_index].level; level < CPUINFO_LVL_MAX;
             level++) {
                new_index = t->nodes[index].rover;
+               if (new_index < 0 || (new_index >= t->total_nodes &&
+                   level != CPUINFO_LVL_PROC))
+                       citdbg("index=%d new_index=%d total=%d level=%d\n",
+                                index, new_index, t->total_nodes, level);
                if (rover_inc_table[level] & ROVER_INC_ON_VISIT)
                        increment_rover(t, index, root_index, rover_inc_table);
 
@@ -363,8 +510,25 @@ static void _cpu_map_rebuild(void)
         * to check if the CPU is online, as that is done when the cpuinfo
         * tree is being built.
         */
-       for (i = 0; i < cpuinfo_tree->nodes[0].num_cpus; i++)
+       for (i = 0; i < cpuinfo_tree->nodes[0].num_cpus; i++) {
+#ifdef DBGCIT
+               int cpu;
+               int j = 0;
+
+               do {
+                       cpu = iterate_cpu(cpuinfo_tree, 0);
+                       if (cpu_online(cpu))
+                               break;
+               } while (++j < num_possible_cpus());
+
+               if (j)
+                       citdbg("offline=%d\n", j);
+               BUG_ON(!cpu_online(cpu));
+               cpu_distribution_map[i] = cpu;
+#else
                cpu_distribution_map[i] = iterate_cpu(cpuinfo_tree, 0);
+#endif
+       }
 }
 
 /* Fallback if the cpuinfo tree could not be built.  CPU mapping is linear
@@ -402,9 +566,16 @@ static int _map_to_cpu(unsigned int index)
        root_node = &cpuinfo_tree->nodes[0];
 #ifdef CONFIG_HOTPLUG_CPU
        if (unlikely(root_node->num_cpus != num_online_cpus())) {
+               citdbg("cpus=%d online=%d\n",
+                      root_node->num_cpus, num_online_cpus());
                _cpu_map_rebuild();
                if (!cpuinfo_tree)
                        return simple_map_to_cpu(index);
+
+#ifdef DBGCIT
+               /* update root_node if cpuinfo_tree has changed */
+               root_node = &cpuinfo_tree->nodes[0];
+#endif
        }
 #endif
        return cpu_distribution_map[index % root_node->num_cpus];
@@ -419,8 +590,12 @@ int map_to_cpu(unsigned int index)
        mapped_cpu = _map_to_cpu(index);
 
 #ifdef CONFIG_HOTPLUG_CPU
+#ifdef DBGCIT
+       BUG_ON(!cpu_online(mapped_cpu));
+#else
        while (unlikely(!cpu_online(mapped_cpu)))
                mapped_cpu = _map_to_cpu(index);
+#endif
 #endif
        spin_unlock_irqrestore(&cpu_map_lock, flag);
        return mapped_cpu;
diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c
index 70844bbf7bc8f56f3e893b8a593b2b2c27ba31ce..151f598ff60ab3f04d3a19ea82e055852823dd84 100644 (file)
@@ -598,6 +598,9 @@ struct dr_cpu_resp_entry {
 
        u32                             str_off;
 };
+
+static DEFINE_MUTEX(ds_dr_cpu_mutex);
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 
@@ -1830,16 +1833,17 @@ static int __cpuinit dr_cpu_configure(struct ds_dev *ds,
                        pr_err("ds-%llu: CPU startup failed err=%d\n", ds->id,
                                err);
                        dr_cpu_mark(resp, cpu, ncpus, res, stat);
+                       cpumask_clear_cpu(cpu, mask);
                }
        }
 
+       /* Redistribute IRQs, taking into account the new cpus.  */
+       fixup_irqs(mask, true);
+
        ds_cap_send(handle, resp, resp_len);
 
        kfree(resp);
 
-       /* Redistribute IRQs, taking into account the new cpus.  */
-       fixup_irqs();
-
        return 0;
 }
 
@@ -1914,6 +1918,7 @@ static void __cpuinit ds_dr_cpu_data_cb(ds_cb_arg_t arg,
                        cpumask_set_cpu(cpu_list[i], &mask);
        }
 
+       mutex_lock(&ds_dr_cpu_mutex);
        if (tag->type == DR_CPU_CONFIGURE)
                err = dr_cpu_configure(ds, handle, req_num, &mask);
        else
@@ -1921,6 +1926,7 @@ static void __cpuinit ds_dr_cpu_data_cb(ds_cb_arg_t arg,
 
        if (err)
                __dr_cpu_send_error(ds, handle, tag);
+       mutex_unlock(&ds_dr_cpu_mutex);
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
@@ -2734,7 +2740,7 @@ void ldom_power_off(void)
 
 static int ds_handle_data_nack(struct ds_dev *ds, struct ds_msg_tag *pkt)
 {
-       int rv;
+       int rv = 0;
        struct ds_data_nack *data_nack;
 
        dprintk("entered.\n");
@@ -2772,7 +2778,6 @@ static int ds_handle_data_nack(struct ds_dev *ds, struct ds_msg_tag *pkt)
                 */
                pr_err("ds-%llu: received UNKNOWN data NACK for "
                        "handle %llx\n", ds->id, data_nack->payload.handle);
-               rv = 0;
 
                break;
        };
@@ -3764,7 +3769,7 @@ static int ds_read_ldc_msg(struct ds_dev *ds, unsigned char *buf,
        unsigned int bytes_read;
        unsigned int read_size;
        unsigned int delay_cnt;
-       int rv;
+       int rv = 0;
 
        bytes_left = size;
        bytes_read = 0;
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 865d4031204151df20ff0e42a4c20418f649c624..ebc291cc642d9ca3c007779183f573f58378e930 100644 (file)
@@ -865,10 +865,27 @@ void do_softirq_own_stack(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-void fixup_irqs(void)
+static void fixup_affinity(cpumask_t *affinity, cpumask_t *mask, bool enabled)
+{
+       if (enabled)
+               cpumask_or(affinity, affinity, mask);
+       else {
+               cpumask_and(affinity, affinity, mask);
+               if (cpumask_empty(affinity)) {
+                       pr_warn("IRQ mapped to cpu not in affinity list\n");
+                       cpumask_set_cpu(cpumask_first(cpu_online_mask),
+                                       affinity);
+               }
+       }
+}
+
+void fixup_irqs(cpumask_t *mask, bool enabled)
 {
        unsigned int irq;
 
+       if (!enabled)
+               cpumask_complement(mask, mask);
+
        for (irq = 0; irq < NR_IRQS; irq++) {
                struct irq_desc *desc = irq_to_desc(irq);
                struct irq_data *data;
@@ -879,6 +896,7 @@ void fixup_irqs(void)
                data = irq_desc_get_irq_data(desc);
                raw_spin_lock_irqsave(&desc->lock, flags);
                if (desc->action && !irqd_is_per_cpu(data)) {
+                       fixup_affinity(data->affinity, mask, enabled);
                        if (data->chip->irq_set_affinity)
                                data->chip->irq_set_affinity(data,
                                                             data->affinity,
@@ -887,7 +905,8 @@ void fixup_irqs(void)
                raw_spin_unlock_irqrestore(&desc->lock, flags);
        }
 
-       tick_ops->disable_irq();
+       if (!enabled)
+               tick_ops->disable_irq();
 }
 #endif
 
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 169ef9b48b7a7253c68acf70d9fa7647a3ac1a70..33bdfcba694f47de99277bcd414609e6ddbfd4b8 100644 (file)
@@ -1181,6 +1181,7 @@ static void *fill_in_one_cpu(struct mdesc_handle *hp, u64 mp, int cpuid,
                }
        }
 
+       c->sock_id = -1;
        c->core_id = 0;
        c->proc_id = -1;
 
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 39f96a89c3dfb7c8cde5fb01ee264281981ed838..d2dfa92ce8898f7bbbdf893b6144d8acc5222ed1 100644 (file)
@@ -1231,11 +1231,9 @@ void __init smp_setup_processor_id(void)
 
 void smp_fill_in_sib_core_maps(void)
 {
-       unsigned int i;
+       unsigned int i, j;
 
        for_each_present_cpu(i) {
-               unsigned int j;
-
                cpumask_clear(&cpu_core_map[i]);
                if (cpu_data(i).core_id == 0) {
                        cpumask_set_cpu(i, &cpu_core_map[i]);
@@ -1243,14 +1241,18 @@ void smp_fill_in_sib_core_maps(void)
                }
 
                for_each_present_cpu(j) {
-                       if (cpu_data(i).core_id ==
-                           cpu_data(j).core_id)
+                       if (cpu_data(i).core_id == cpu_data(j).core_id)
                                cpumask_set_cpu(j, &cpu_core_map[i]);
                }
        }
 
        for_each_present_cpu(i)  {
-               unsigned int j;
+               cpumask_clear(&cpu_core_sib_map[i]);
+               if (cpu_data(i).sock_id == -1) {
+                       cpumask_set_cpu(i, &cpu_core_sib_map[i]);
+                       continue;
+               }
+
                for_each_present_cpu(j)  {
                        if (cpu_data(i).sock_id == cpu_data(j).sock_id)
                                cpumask_set_cpu(j, &cpu_core_sib_map[i]);
@@ -1258,8 +1260,6 @@ void smp_fill_in_sib_core_maps(void)
        }
 
        for_each_present_cpu(i) {
-               unsigned int j;
-
                cpumask_clear(&per_cpu(cpu_sibling_map, i));
                if (cpu_data(i).proc_id == -1) {
                        cpumask_set_cpu(i, &per_cpu(cpu_sibling_map, i));
@@ -1267,9 +1267,9 @@ void smp_fill_in_sib_core_maps(void)
                }
 
                for_each_present_cpu(j) {
-                       if (cpu_data(i).proc_id ==
-                           cpu_data(j).proc_id)
-                               cpumask_set_cpu(j, &per_cpu(cpu_sibling_map, i));
+                       if (cpu_data(i).proc_id == cpu_data(j).proc_id)
+                               cpumask_set_cpu(j, &per_cpu(cpu_sibling_map,
+                                               i));
                }
        }
 }
@@ -1290,6 +1290,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
                         */
                        if (tlb_type != hypervisor)
                                smp_synchronize_one_tick(cpu);
+                       cpu_map_rebuild();
                }
        }
        return ret;
@@ -1336,32 +1337,42 @@ int __cpu_disable(void)
        int cpu = smp_processor_id();
        cpuinfo_sparc *c;
        int i;
+       cpumask_var_t mask;
+
+       if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
+               return -ENOMEM;
 
        for_each_cpu(i, &cpu_core_map[cpu])
                cpumask_clear_cpu(cpu, &cpu_core_map[i]);
        cpumask_clear(&cpu_core_map[cpu]);
 
+       for_each_cpu(i, &cpu_core_sib_map[cpu])
+               cpumask_clear_cpu(cpu, &cpu_core_sib_map[i]);
+       cpumask_clear(&cpu_core_sib_map[cpu]);
+
        for_each_cpu(i, &per_cpu(cpu_sibling_map, cpu))
                cpumask_clear_cpu(cpu, &per_cpu(cpu_sibling_map, i));
        cpumask_clear(&per_cpu(cpu_sibling_map, cpu));
 
        c = &cpu_data(cpu);
 
+       c->sock_id = -1;
        c->core_id = 0;
        c->proc_id = -1;
 
+       /*
+        * Offline before fixup.
+        * See irq_choose_cpu(), cpu_map_rebuild().
+        */
+       set_cpu_online(cpu, false);
        smp_wmb();
+       local_irq_disable();            /* don't process further interrupts */
+       cpu_map_rebuild();
 
        /* Make sure no interrupts point to this cpu.  */
-       fixup_irqs();
-
-       local_irq_enable();
-       mdelay(1);
-       local_irq_disable();
-
-       set_cpu_online(cpu, false);
-
-       cpu_map_rebuild();
+       cpumask_set_cpu(cpu, mask);
+       fixup_irqs(mask, false);        /* cpu should already be offline */
+       free_cpumask_var(mask);
 
        return 0;
 }
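
The "simple round-robin" default that the commit message mentions for ldoms (and that the cpumap.c comment above argues for) just walks the online cpus linearly: the interrupt index modulo the number of online cpus selects the target, and offlined cpus drop out of the rotation. Below is only a rough userspace model of that behaviour, not the kernel's simple_map_to_cpu(): a plain bitmask stands in for cpu_online_mask and the cpu numbers are made up.

/* Standalone sketch only: models the round-robin idea, not the kernel code. */
#include <stdio.h>
#include <stdint.h>

/* Count set bits without compiler builtins, for portability. */
static unsigned int count_online(uint64_t online)
{
	unsigned int n = 0;

	while (online) {
		online &= online - 1;	/* clear lowest set bit */
		n++;
	}
	return n;
}

/* Return the n-th (0-based) set bit of 'online'; caller guarantees it exists. */
static int nth_online_cpu(uint64_t online, unsigned int n)
{
	for (int cpu = 0; cpu < 64; cpu++) {
		if (online & (1ULL << cpu)) {
			if (n == 0)
				return cpu;
			n--;
		}
	}
	return -1;	/* not reached when n < count_online(online) */
}

/* Round-robin: interrupt index modulo the online cpu count picks the target. */
static int simple_map(uint64_t online, unsigned int index)
{
	return nth_online_cpu(online, index % count_online(online));
}

int main(void)
{
	uint64_t online = 0x2d;		/* cpus 0, 2, 3, 5 online; 1 and 4 offline */

	for (unsigned int irq = 0; irq < 8; irq++)
		printf("irq %u -> cpu %d\n", irq, simple_map(online, irq));
	return 0;
}

By construction this never hands an interrupt to an offline cpu, which is the property the cpuinfo-tree path fails to guarantee once cpus in the middle of a range have been offlined.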