x86/irq: Install posted MSI notification handler

author Jacob Pan <jacob.jun.pan@linux.intel.com>

Tue, 23 Apr 2024 17:41:10 +0000 (10:41 -0700)

committer Thomas Gleixner <tglx@linutronix.de>

Mon, 29 Apr 2024 22:54:42 +0000 (00:54 +0200)
author Jacob Pan <jacob.jun.pan@linux.intel.com>
Tue, 23 Apr 2024 17:41:10 +0000 (10:41 -0700)
committer Thomas Gleixner <tglx@linutronix.de>
Mon, 29 Apr 2024 22:54:42 +0000 (00:54 +0200)
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c

index 89c1476fcdd9f95825c0dbbb320b86b66360ce9e..f004a4dc74c2dafa3c03e7d36e5182e2042e6124 100644 (file)
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -117,6 +117,8 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
         SYSVEC(POSTED_INTR_VECTOR,              kvm_posted_intr_ipi),
         SYSVEC(POSTED_INTR_WAKEUP_VECTOR,       kvm_posted_intr_wakeup_ipi),
         SYSVEC(POSTED_INTR_NESTED_VECTOR,       kvm_posted_intr_nested_ipi),
+
+       SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR,  posted_msi_notification),
  };
  
  static bool fred_setup_done __initdata;
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h

index e7ab594b3a7a09d66be7e60bdab61d6fe908510f..c67fa6ad098aaee490328a5fc66e83c3b428ac61 100644 (file)
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -44,6 +44,9 @@ typedef struct {
         unsigned int irq_hv_reenlightenment_count;
         unsigned int hyperv_stimer0_count;
  #endif
+#ifdef CONFIG_X86_POSTED_MSI
+       unsigned int posted_msi_notification_count;
+#endif
  } ____cacheline_aligned irq_cpustat_t;
  
  DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h

index 749c7411d2f1de33ba44564b37c900a4e6dd936b..d4f24499b256c827deefd0d76dcf14adc0f566a4 100644 (file)
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -751,6 +751,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested
  # define fred_sysvec_kvm_posted_intr_nested_ipi                NULL
  #endif
  
+/*
+ * Posted MSI notification vector. When CONFIG_X86_POSTED_MSI is off, the
+ * fred_sysvec_ name is defined as NULL instead of declaring the handler
+ * (presumably so a table entry referring to it still builds - confirm
+ * against the SYSVEC() users).
+ */
+#ifdef CONFIG_X86_POSTED_MSI
+DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR,        sysvec_posted_msi_notification);
+#else
+# define fred_sysvec_posted_msi_notification           NULL
+#endif
+
  #if IS_ENABLED(CONFIG_HYPERV)
  DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR,    sysvec_hyperv_callback);
  DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c

index fc37c8d83daf234f15094e96c87654f6fd989acd..f445bec516a0b897b72314bf806f61293ed97ad5 100644 (file)
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = {
  # endif
         INTG(SPURIOUS_APIC_VECTOR,              asm_sysvec_spurious_apic_interrupt),
         INTG(ERROR_APIC_VECTOR,                 asm_sysvec_error_interrupt),
+# ifdef CONFIG_X86_POSTED_MSI
+       INTG(POSTED_MSI_NOTIFICATION_VECTOR,    asm_sysvec_posted_msi_notification),
+# endif
  #endif
  };
  
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c

index d652b0481899605b29426d948ee19039d395d656..578e4f6a50804af8ddad0b315f092d45ed4f8a5f 100644 (file)
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -183,6 +183,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
                 seq_printf(p, "%10u ",
                            irq_stats(j)->kvm_posted_intr_wakeup_ipis);
         seq_puts(p, "  Posted-interrupt wakeup event\n");
+#endif
+#ifdef CONFIG_X86_POSTED_MSI
+       seq_printf(p, "%*s: ", prec, "PMN");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ",
+                          irq_stats(j)->posted_msi_notification_count);
+       seq_puts(p, "  Posted MSI notification event\n");
  #endif
         return 0;
  }
@@ -242,16 +249,16 @@ static __always_inline void handle_irq(struct irq_desc *desc,
                 __handle_irq(desc, regs);
  }
  
-static __always_inline void call_irq_handler(int vector, struct pt_regs *regs)
+static __always_inline int call_irq_handler(int vector, struct pt_regs *regs)
  {
         struct irq_desc *desc;
+       int ret = 0;
  
         desc = __this_cpu_read(vector_irq[vector]);
         if (likely(!IS_ERR_OR_NULL(desc))) {
                 handle_irq(desc, regs);
         } else {
-               apic_eoi();
-
+               ret = -EINVAL;
                 if (desc == VECTOR_UNUSED) {
                         pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
                                              __func__, smp_processor_id(),
@@ -260,6 +267,8 @@ static __always_inline void call_irq_handler(int vector, struct pt_regs *regs)
                         __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
                 }
         }
+
+       return ret;
  }
  
  /*
@@ -273,7 +282,9 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
         /* entry code tells RCU that we're not quiescent.  Check it. */
         RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
  
-       call_irq_handler(vector, regs);
+       if (unlikely(call_irq_handler(vector, regs)))
+               apic_eoi();
+
         set_irq_regs(old_regs);
  }
  
@@ -361,6 +372,112 @@ void intel_posted_msi_init(void)
         destination = x2apic_enabled() ? apic_id : apic_id << 8;
         this_cpu_write(posted_msi_pi_desc.ndst, destination);
  }
+
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ *   accessed by both CPU and IOMMU.
+ * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
+ *   for checking and clearing posted interrupt request (PIR), a 256 bit field
+ *   within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ *   line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line an order of magnitude faster than
+ *   the IOMMU.
+ * 5.Each time the IOMMU posts an interrupt to the PIR, the PID cache line
+ *   is evicted. The cache line states after each operation are as follows:
+ *   CPU               IOMMU                   PID Cache line state
+ *   ---------------------------------------------------------------
+ *...read64                                    exclusive
+ *...lock xchg64                               modified
+ *...                  post/atomic swap        invalid
+ *...-------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * to dispatch interrupt handlers.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ *             read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ *             read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
+{
+       int i, vec = FIRST_EXTERNAL_VECTOR;
+       unsigned long pir_copy[4];
+       bool handled = false;
+
+       /*
+        * Snapshot all four PIR words before touching any of them with an
+        * atomic op. READ_ONCE() guarantees each word is loaded exactly
+        * once, so the emptiness check below tests the snapshot and cannot
+        * be turned into a reload of the live PIR, which would defeat the
+        * purpose of working from a copy.
+        */
+       for (i = 0; i < 4; i++)
+               pir_copy[i] = READ_ONCE(pir[i]);
+
+       /*
+        * Fetch-and-clear only the words whose snapshot was non-zero; the
+        * value returned by the xchg replaces the snapshot so that bits
+        * posted in between are dispatched too. Words that were zero in
+        * the snapshot are left untouched.
+        */
+       for (i = 0; i < 4; i++) {
+               if (!pir_copy[i])
+                       continue;
+
+               pir_copy[i] = arch_xchg(&pir[i], 0);
+               handled = true;
+       }
+
+       /*
+        * Dispatch from the local copy, covering vectors in
+        * [FIRST_EXTERNAL_VECTOR, FIRST_SYSTEM_VECTOR). The return value of
+        * call_irq_handler() is deliberately ignored: no per-vector EOI is
+        * issued from here.
+        */
+       if (handled) {
+               for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+                       call_irq_handler(vec, regs);
+       }
+
+       /* True if at least one PIR word had pending bits in the snapshot. */
+       return handled;
+}
+
+/*
+ * Upper bound on the number of handle_pending_pir() rounds performed per
+ * posted MSI notification, counting the final round that runs after the
+ * outstanding notification bit has been cleared.
+ *
+ * Performance data shows that 3 is good enough to harvest 90+% of the benefit
+ * on high IRQ rate workloads.
+ */
+#define MAX_POSTED_MSI_COALESCING_LOOP 3
+
+/*
+ * For MSIs that are delivered as posted interrupts, the CPU notifications
+ * can be coalesced if the MSIs arrive in high frequency bursts.
+ *
+ * This handler drains this CPU's posted interrupt descriptor: it scans the
+ * PIR a bounded number of times, clears the outstanding notification (ON)
+ * bit, scans once more, and then issues a single EOI for the notification.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
+{
+       struct pt_regs *old_regs = set_irq_regs(regs);
+       struct pi_desc *pid;
+       int i = 0;
+
+       pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+       /* Account one notification, however many vectors it carries. */
+       inc_irq_stat(posted_msi_notification_count);
+       irq_enter();
+
+       /*
+        * Max coalescing count includes the extra round of handle_pending_pir
+        * after clearing the outstanding notification bit. Hence, at most
+        * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. The
+        * loop stops early as soon as a round finds nothing pending.
+        */
+       while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
+               if (!handle_pending_pir(pid->pir64, regs))
+                       break;
+       }
+
+       /*
+        * Clear outstanding notification bit to allow new IRQ notifications,
+        * do this last to maximize the window of interrupt coalescing.
+        */
+       pi_clear_on(pid);
+
+       /*
+        * A new PI notification can race with the clearing of the ON bit.
+        * Process the PIR bits one last time so that handling of any newly
+        * posted interrupts is not delayed until the next IRQ.
+        */
+       handle_pending_pir(pid->pir64, regs);
+
+       /* One EOI for the notification vector itself. */
+       apic_eoi();
+       irq_exit();
+       set_irq_regs(old_regs);
+}
  #endif /* X86_POSTED_MSI */
  
  #ifdef CONFIG_HOTPLUG_CPU
author	Jacob Pan <jacob.jun.pan@linux.intel.com>
	Tue, 23 Apr 2024 17:41:10 +0000 (10:41 -0700)
committer	Thomas Gleixner <tglx@linutronix.de>
	Mon, 29 Apr 2024 22:54:42 +0000 (00:54 +0200)
arch/x86/entry/entry_fred.c		patch \| blob \| history
arch/x86/include/asm/hardirq.h		patch \| blob \| history
arch/x86/include/asm/idtentry.h		patch \| blob \| history
arch/x86/kernel/idt.c		patch \| blob \| history
arch/x86/kernel/irq.c		patch \| blob \| history