#include <linux/jump_label.h>
 #include <linux/delay.h>
 #include <linux/stop_machine.h>
+#include <linux/spinlock.h>
+#include <linux/cpuhotplug.h>
+#include <linux/workqueue.h>
+#include <linux/proc_fs.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/page.h>
 EXPORT_SYMBOL(plpar_hcall9);
 EXPORT_SYMBOL(plpar_hcall_norets);
 
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+static u8 dtl_mask = DTL_LOG_PREEMPT;  /* value copied into each lppaca's dtl_enable_mask */
+#else
+static u8 dtl_mask;                    /* starts at 0; the registration path skips DTL setup while this is 0 */
+#endif
+
 void alloc_dtl_buffers(void)
 {
        int cpu;
 
        for_each_possible_cpu(cpu) {
                pp = paca_ptrs[cpu];
+               if (pp->dispatch_log)
+                       continue;
                dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
                if (!dtl) {
                        pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
                                cpu);
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
                        pr_warn("Stolen time statistics will be unreliable\n");
+#endif
                        break;
                }
 
 
        pp = paca_ptrs[cpu];
        dtl = pp->dispatch_log;
-       if (dtl) {
+       if (dtl && dtl_mask) {
                pp->dtl_ridx = 0;
                pp->dtl_curr = dtl;
                lppaca_of(cpu).dtl_idx = 0;
                        pr_err("WARNING: DTL registration of cpu %d (hw %d) failed with %ld\n",
                               cpu, hwcpu, ret);
 
-               lppaca_of(cpu).dtl_enable_mask = DTL_LOG_PREEMPT;
+               lppaca_of(cpu).dtl_enable_mask = dtl_mask;
        }
 }
 
 #ifdef CONFIG_PPC_SPLPAR
+struct dtl_worker {                    /* one instance per cpu, see dtl_workers */
+       struct delayed_work work;       /* runs process_dtl_buffer() */
+       int cpu;                        /* cpu the worker was set up for in dtl_worker_online() */
+};
+
+struct vcpu_dispatch_data {            /* per-cpu counters maintained by update_vcpu_disp_stat() */
+       int last_disp_cpu;              /* cpu seen in the previous dispatch, -1 until the first one */
+
+       int total_disp;                 /* dispatches counted (the very first one is not counted) */
+
+       int same_cpu_disp;              /* same cpu, or same first thread sibling, as last time */
+       int same_chip_disp;             /* relative dispatch distance 0 */
+       int diff_chip_disp;             /* relative dispatch distance 1 */
+       int far_chip_disp;              /* relative dispatch distance 2 */
+
+       int numa_home_disp;             /* home node dispatch distance 0 */
+       int numa_remote_disp;           /* home node dispatch distance 1 */
+       int numa_far_disp;              /* home node dispatch distance 2 */
+};
+
+/*
+ * This represents the number of cpus in the hypervisor. Since there is no
+ * architected way to discover the number of processors in the host, we
+ * size our tables for NR_CPUS of them. This is currently 2048 by default,
+ * and is sufficient for our purposes. This will need to be tweaked if
+ * CONFIG_NR_CPUS is changed.
+ */
+#define NR_CPUS_H      NR_CPUS
+
 DEFINE_RWLOCK(dtl_access_lock);
+static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data);      /* per-cpu dispatch counters */
+static DEFINE_PER_CPU(u64, dtl_entry_ridx);    /* read index into the DTL, advanced by process_dtl_buffer() */
+static DEFINE_PER_CPU(struct dtl_worker, dtl_workers); /* per-cpu DTL polling work */
+static enum cpuhp_state dtl_worker_state;      /* value returned by cpuhp_setup_state() in dtl_worker_enable() */
+static DEFINE_MUTEX(dtl_enable_mutex);         /* held across enable/disable in vcpudispatch_stats_write() */
+static int vcpudispatch_stats_on __read_mostly;        /* last command (0 or 1) accepted by vcpudispatch_stats_write() */
+static int vcpudispatch_stats_freq = 50;       /* workers are re-armed with a delay of HZ / this value */
+static __be32 *vcpu_associativity, *pcpu_associativity;        /* caches indexed by core, VPHN_ASSOC_BUFSIZE entries each */
+
+
+/*
+ * Release the dispatch trace log buffer of every possible cpu and reset the
+ * paca fields that refer to it.
+ *
+ * This is a no-op when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is set: in that
+ * configuration the buffers are left in place.
+ */
+static void free_dtl_buffers(void)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+       int cpu;
+       struct paca_struct *pp;
+
+       for_each_possible_cpu(cpu) {
+               pp = paca_ptrs[cpu];
+               if (!pp->dispatch_log)
+                       continue;
+               kmem_cache_free(dtl_cache, pp->dispatch_log);
+               pp->dtl_ridx = 0;
+               /* These three are pointers: clear them with NULL, not 0 */
+               pp->dispatch_log = NULL;
+               pp->dispatch_log_end = NULL;
+               pp->dtl_curr = NULL;
+       }
+#endif
+}
+
+/*
+ * Allocate the zeroed associativity caches: one slot of VPHN_ASSOC_BUFSIZE
+ * __be32 entries per core, for the guest's possible cpus
+ * (vcpu_associativity) and for up to NR_CPUS_H hypervisor cpus
+ * (pcpu_associativity).
+ *
+ * Returns 0 on success, or -ENOMEM if either allocation fails. On failure
+ * neither buffer is left allocated and both pointers are NULL, so the caller
+ * has nothing to clean up.
+ */
+static int init_cpu_associativity(void)
+{
+       vcpu_associativity = kcalloc(num_possible_cpus() / threads_per_core,
+                       VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
+       pcpu_associativity = kcalloc(NR_CPUS_H / threads_per_core,
+                       VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
+
+       if (!vcpu_associativity || !pcpu_associativity) {
+               pr_err("error allocating memory for associativity information\n");
+               /*
+                * One of the two allocations may have succeeded; release it
+                * here so that it is not leaked (kfree(NULL) is a no-op).
+                */
+               kfree(vcpu_associativity);
+               kfree(pcpu_associativity);
+               vcpu_associativity = NULL;
+               pcpu_associativity = NULL;
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/*
+ * Free both associativity caches and reset the pointers so that a later
+ * init_cpu_associativity() starts from a clean state.
+ */
+static void destroy_cpu_associativity(void)
+{
+       kfree(vcpu_associativity);
+       kfree(pcpu_associativity);
+       /* Pointers are cleared with NULL rather than a plain integer 0 */
+       vcpu_associativity = NULL;
+       pcpu_associativity = NULL;
+}
+
+/*
+ * Return the cached associativity slot for the core that @cpu belongs to
+ * within @cpu_assoc. A slot whose first word is still zero is filled in by
+ * calling hcall_vphn() with @flag first; NULL is returned if that call
+ * reports an error.
+ */
+static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag)
+{
+       int core = cpu / threads_per_core;
+       __be32 *slot = cpu_assoc + core * VPHN_ASSOC_BUFSIZE;
+
+       if (slot[0] == 0) {
+               int err = hcall_vphn(cpu, flag, slot);
+
+               if (err != 0)
+                       return NULL;
+       }
+
+       return slot;
+}
+
+/* Look up @cpu in the pcpu_associativity cache, using VPHN_FLAG_PCPU. */
+static __be32 *get_pcpu_associativity(int cpu)
+{
+       __be32 *assoc;
+
+       assoc = __get_cpu_associativity(cpu, pcpu_associativity,
+                                       VPHN_FLAG_PCPU);
+       return assoc;
+}
+
+/* Look up @cpu in the vcpu_associativity cache, using VPHN_FLAG_VCPU. */
+static __be32 *get_vcpu_associativity(int cpu)
+{
+       __be32 *assoc;
+
+       assoc = __get_cpu_associativity(cpu, vcpu_associativity,
+                                       VPHN_FLAG_VCPU);
+       return assoc;
+}
+
+/*
+ * Distance, as computed by cpu_distance(), between the cpu of the previous
+ * dispatch and the cpu of the current one.
+ *
+ * Returns -EINVAL if either cpu number is NR_CPUS_H or larger, and -EIO if
+ * the associativity of either cpu cannot be obtained.
+ */
+static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu)
+{
+       __be32 *prev_assoc;
+       __be32 *curr_assoc;
+
+       if (last_disp_cpu >= NR_CPUS_H)
+               return -EINVAL;
+       if (cur_disp_cpu >= NR_CPUS_H)
+               return -EINVAL;
+
+       /* Both lookups are always performed before the results are checked */
+       prev_assoc = get_pcpu_associativity(last_disp_cpu);
+       curr_assoc = get_pcpu_associativity(cur_disp_cpu);
+       if (prev_assoc == NULL || curr_assoc == NULL)
+               return -EIO;
+
+       return cpu_distance(prev_assoc, curr_assoc);
+}
+
+/*
+ * Distance, as computed by cpu_distance(), between the cpu this vcpu was
+ * dispatched on (@disp_cpu) and the associativity of the current vcpu.
+ *
+ * Returns -EINVAL if @disp_cpu is NR_CPUS_H or larger, and -EIO if either
+ * associativity cannot be obtained.
+ */
+static int cpu_home_node_dispatch_distance(int disp_cpu)
+{
+       int this_vcpu = smp_processor_id();
+       __be32 *pcpu_assoc;
+       __be32 *home_assoc;
+
+       if (disp_cpu >= NR_CPUS_H) {
+               pr_debug_ratelimited("vcpu dispatch cpu %d > %d\n",
+                                    disp_cpu, NR_CPUS_H);
+               return -EINVAL;
+       }
+
+       /* Both lookups are always performed before the results are checked */
+       pcpu_assoc = get_pcpu_associativity(disp_cpu);
+       home_assoc = get_vcpu_associativity(this_vcpu);
+       if (pcpu_assoc == NULL || home_assoc == NULL)
+               return -EIO;
+
+       return cpu_distance(pcpu_assoc, home_assoc);
+}
+
+/*
+ * Account one dispatch of the current vcpu on @disp_cpu in this cpu's
+ * vcpu_disp_data.
+ *
+ * The first dispatch seen (last_disp_cpu == -1) only records the cpu. Every
+ * later one bumps total_disp, then classifies the dispatch twice: relative
+ * to the previous dispatch cpu (same cpu/core, or distance 0/1/2) and
+ * relative to the vcpu's own associativity (distance 0/1/2). Lookup errors
+ * and unexpected distances are only reported through rate-limited debug
+ * messages.
+ */
+static void update_vcpu_disp_stat(int disp_cpu)
+{
+       struct vcpu_dispatch_data *stats = this_cpu_ptr(&vcpu_disp_data);
+       int prev_cpu = stats->last_disp_cpu;
+       int dist;
+
+       if (prev_cpu == -1) {
+               stats->last_disp_cpu = disp_cpu;
+               return;
+       }
+
+       stats->total_disp++;
+
+       /* Classification relative to the previous dispatch */
+       if (prev_cpu == disp_cpu ||
+           cpu_first_thread_sibling(prev_cpu) ==
+                               cpu_first_thread_sibling(disp_cpu)) {
+               stats->same_cpu_disp++;
+       } else {
+               dist = cpu_relative_dispatch_distance(prev_cpu, disp_cpu);
+               if (dist < 0) {
+                       pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
+                                            smp_processor_id());
+               } else if (dist == 0) {
+                       stats->same_chip_disp++;
+               } else if (dist == 1) {
+                       stats->diff_chip_disp++;
+               } else if (dist == 2) {
+                       stats->far_chip_disp++;
+               } else {
+                       pr_debug_ratelimited("vcpudispatch_stats: cpu %d (%d -> %d): unexpected relative dispatch distance %d\n",
+                                            smp_processor_id(), prev_cpu,
+                                            disp_cpu, dist);
+               }
+       }
+
+       /* Classification relative to the vcpu's own associativity */
+       dist = cpu_home_node_dispatch_distance(disp_cpu);
+       if (dist < 0) {
+               pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
+                                    smp_processor_id());
+       } else if (dist == 0) {
+               stats->numa_home_disp++;
+       } else if (dist == 1) {
+               stats->numa_remote_disp++;
+       } else if (dist == 2) {
+               stats->numa_far_disp++;
+       } else {
+               pr_debug_ratelimited("vcpudispatch_stats: cpu %d on %d: unexpected numa dispatch distance %d\n",
+                                    smp_processor_id(), disp_cpu, dist);
+       }
+
+       stats->last_disp_cpu = disp_cpu;
+}
+
+/*
+ * Delayed-work handler: consume the dispatch trace log entries that were
+ * added for this cpu since the last run, feed the processor_id of each one
+ * to update_vcpu_disp_stat(), and re-arm the work on the same cpu after
+ * HZ / vcpudispatch_stats_freq jiffies.
+ *
+ * The work is NOT re-armed when this cpu has no dispatch log or when the
+ * handler finds itself running on a cpu other than the one recorded in the
+ * worker.
+ */
+static void process_dtl_buffer(struct work_struct *work)
+{
+       struct dtl_entry dtle;
+       u64 i = __this_cpu_read(dtl_entry_ridx);
+       /*
+        * NOTE(review): this pointer is computed before dispatch_log is
+        * checked for NULL below; it is only dereferenced after that check.
+        */
+       struct dtl_entry *dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
+       struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
+       struct lppaca *vpa = local_paca->lppaca_ptr;
+       struct dtl_worker *d = container_of(work, struct dtl_worker, work.work);
+
+       if (!local_paca->dispatch_log)
+               return;
+
+       /* if we have been migrated away, we cancel ourself */
+       if (d->cpu != smp_processor_id()) {
+               pr_debug("vcpudispatch_stats: cpu %d worker migrated -- canceling worker\n",
+                                               smp_processor_id());
+               return;
+       }
+
+       /* Nothing new: skip straight to re-arming, the read index is unchanged */
+       if (i == be64_to_cpu(vpa->dtl_idx))
+               goto out;
+
+       /* vpa->dtl_idx is re-read on every use rather than cached */
+       while (i < be64_to_cpu(vpa->dtl_idx)) {
+               /*
+                * Take a private copy of the entry first and only then check
+                * whether the log has wrapped past it -- presumably so that
+                * an entry overwritten while being copied is never used.
+                */
+               dtle = *dtl;
+               barrier();
+               if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
+                       /* buffer has overflowed */
+                       pr_debug_ratelimited("vcpudispatch_stats: cpu %d lost %lld DTL samples\n",
+                               d->cpu,
+                               be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG - i);
+                       /* Skip ahead to N_DISPATCH_LOG entries behind dtl_idx */
+                       i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
+                       dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
+                       continue;
+               }
+               update_vcpu_disp_stat(be16_to_cpu(dtle.processor_id));
+               ++i;
+               ++dtl;
+               /* Wrap around at the end of the log buffer */
+               if (dtl == dtl_end)
+                       dtl = local_paca->dispatch_log;
+       }
+
+       __this_cpu_write(dtl_entry_ridx, i);
+
+out:
+       schedule_delayed_work_on(d->cpu, to_delayed_work(work),
+                                       HZ / vcpudispatch_stats_freq);
+}
+
+/*
+ * CPU hotplug "online" callback: (re)initialise the DTL worker of @cpu,
+ * set up its read index and schedule the first run of process_dtl_buffer()
+ * on that cpu. Always returns 0.
+ */
+static int dtl_worker_online(unsigned int cpu)
+{
+       struct dtl_worker *worker = &per_cpu(dtl_workers, cpu);
+
+       memset(worker, 0, sizeof(*worker));
+       worker->cpu = cpu;
+       INIT_DELAYED_WORK(&worker->work, process_dtl_buffer);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+       /* Start reading from the lppaca's current dtl_idx */
+       per_cpu(dtl_entry_ridx, cpu) = be64_to_cpu(lppaca_of(cpu).dtl_idx);
+#else
+       /* Start from index 0 and register the buffer for this cpu */
+       per_cpu(dtl_entry_ridx, cpu) = 0;
+       register_dtl_buffer(cpu);
+#endif
+
+       schedule_delayed_work_on(cpu, &worker->work,
+                                HZ / vcpudispatch_stats_freq);
+       return 0;
+}
+
+/*
+ * CPU hotplug "offline" callback: wait for the DTL worker of @cpu to be
+ * cancelled and, unless CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is set, unregister
+ * the DTL of that cpu. Always returns 0.
+ */
+static int dtl_worker_offline(unsigned int cpu)
+{
+       struct dtl_worker *worker = &per_cpu(dtl_workers, cpu);
+
+       cancel_delayed_work_sync(&worker->work);
+
+#if !defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE)
+       unregister_dtl(get_hard_smp_processor_id(cpu));
+#endif
+
+       return 0;
+}
+
+/*
+ * Make @mask the current DTL enable mask: store it in dtl_mask and in the
+ * lppaca of every present cpu.
+ */
+static void set_global_dtl_mask(u8 mask)
+{
+       int cpu;
+
+       dtl_mask = mask;
+
+       for_each_present_cpu(cpu) {
+               lppaca_of(cpu).dtl_enable_mask = dtl_mask;
+       }
+}
+
+/*
+ * Restore the default DTL enable mask (DTL_LOG_PREEMPT with
+ * CONFIG_VIRT_CPU_ACCOUNTING_NATIVE, 0 otherwise) in dtl_mask and in the
+ * lppaca of every present cpu.
+ */
+static void reset_global_dtl_mask(void)
+{
+       int cpu;
+       u8 default_mask;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+       default_mask = DTL_LOG_PREEMPT;
+#else
+       default_mask = 0;
+#endif
+       dtl_mask = default_mask;
+
+       for_each_present_cpu(cpu) {
+               lppaca_of(cpu).dtl_enable_mask = dtl_mask;
+       }
+}
+
+/*
+ * Start DTL processing on all cpus.
+ *
+ * Takes the write side of dtl_access_lock (without waiting), switches the
+ * DTL mask to DTL_LOG_ALL, allocates the DTL buffers and installs the
+ * online/offline hotplug callbacks.
+ *
+ * Returns 0 on success, in which case dtl_access_lock stays write-held until
+ * dtl_worker_disable() releases it. Returns -EBUSY if the lock could not be
+ * taken, and -EINVAL (after undoing the steps above and dropping the lock)
+ * if the hotplug state could not be set up.
+ */
+static int dtl_worker_enable(void)
+{
+       int hp_state;
+
+       if (!write_trylock(&dtl_access_lock))
+               return -EBUSY;
+
+       set_global_dtl_mask(DTL_LOG_ALL);
+
+       /* Setup dtl buffers and register those */
+       alloc_dtl_buffers();
+
+       hp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/dtl:online",
+                                    dtl_worker_online, dtl_worker_offline);
+       if (hp_state >= 0) {
+               dtl_worker_state = hp_state;
+               return 0;
+       }
+
+       pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n");
+       free_dtl_buffers();
+       reset_global_dtl_mask();
+       write_unlock(&dtl_access_lock);
+
+       return -EINVAL;
+}
+
+/*
+ * Undo dtl_worker_enable(): remove the hotplug state saved in
+ * dtl_worker_state, free the DTL buffers, restore the default DTL mask and
+ * release the write side of dtl_access_lock that dtl_worker_enable() left
+ * held on success.
+ */
+static void dtl_worker_disable(void)
+{
+       cpuhp_remove_state(dtl_worker_state);
+       free_dtl_buffers();
+       reset_global_dtl_mask();
+       write_unlock(&dtl_access_lock);
+}
+
+/*
+ * procfs write handler: "1" enables and "0" disables the collection of vcpu
+ * dispatch statistics.
+ *
+ * Returns @count on success (including when the requested state is already
+ * in effect), -EINVAL for input longer than 15 bytes or a value other than
+ * 0 or 1, -EFAULT if the user buffer cannot be copied, or the error reported
+ * by kstrtoint(), init_cpu_associativity() or dtl_worker_enable().
+ */
+static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p,
+               size_t count, loff_t *ppos)
+{
+       char kbuf[16];
+       int cpu, cmd, rc;
+
+       /* Leave room for the terminating NUL added below */
+       if (count > 15)
+               return -EINVAL;
+
+       if (copy_from_user(kbuf, p, count))
+               return -EFAULT;
+
+       kbuf[count] = 0;
+       rc = kstrtoint(kbuf, 0, &cmd);
+       if (rc || cmd < 0 || cmd > 1) {
+               pr_err("vcpudispatch_stats: please use 0 to disable or 1 to enable dispatch statistics\n");
+               return rc ? rc : -EINVAL;
+       }
+
+       mutex_lock(&dtl_enable_mutex);
+
+       /* Already in the requested state: nothing to do */
+       if (cmd == !!vcpudispatch_stats_on)
+               goto out;
+
+       if (!cmd) {
+               dtl_worker_disable();
+               destroy_cpu_associativity();
+       } else {
+               rc = init_cpu_associativity();
+               if (rc)
+                       goto out;
+
+               /* Start every cpu from zeroed counters and "no dispatch yet" */
+               for_each_possible_cpu(cpu) {
+                       struct vcpu_dispatch_data *stats;
+
+                       stats = per_cpu_ptr(&vcpu_disp_data, cpu);
+                       memset(stats, 0, sizeof(*stats));
+                       stats->last_disp_cpu = -1;
+               }
+
+               rc = dtl_worker_enable();
+               if (rc) {
+                       destroy_cpu_associativity();
+                       goto out;
+               }
+       }
+
+       vcpudispatch_stats_on = cmd;
+
+out:
+       mutex_unlock(&dtl_enable_mutex);
+       if (rc)
+               return rc;
+       return count;
+}
+
+/*
+ * seq_file show routine: print "off" while statistics are disabled,
+ * otherwise one line per online cpu of the form
+ * "cpu<N> total same_cpu same_chip diff_chip far_chip numa_home numa_remote
+ * numa_far". Always returns 0.
+ */
+static int vcpudispatch_stats_display(struct seq_file *p, void *v)
+{
+       struct vcpu_dispatch_data *stats;
+       int cpu;
+
+       if (!vcpudispatch_stats_on) {
+               seq_puts(p, "off\n");
+               return 0;
+       }
+
+       for_each_online_cpu(cpu) {
+               stats = per_cpu_ptr(&vcpu_disp_data, cpu);
+
+               seq_printf(p, "cpu%d", cpu);
+
+               /* dispatch totals */
+               seq_put_decimal_ull(p, " ", stats->total_disp);
+
+               /* relative to the previous dispatch */
+               seq_put_decimal_ull(p, " ", stats->same_cpu_disp);
+               seq_put_decimal_ull(p, " ", stats->same_chip_disp);
+               seq_put_decimal_ull(p, " ", stats->diff_chip_disp);
+               seq_put_decimal_ull(p, " ", stats->far_chip_disp);
+
+               /* relative to the vcpu's own associativity */
+               seq_put_decimal_ull(p, " ", stats->numa_home_disp);
+               seq_put_decimal_ull(p, " ", stats->numa_remote_disp);
+               seq_put_decimal_ull(p, " ", stats->numa_far_disp);
+
+               seq_puts(p, "\n");
+       }
+
+       return 0;
+}
+
+/* open handler: bind the file to vcpudispatch_stats_display() via single_open(). */
+static int vcpudispatch_stats_open(struct inode *inode, struct file *file)
+{
+       int ret;
+
+       ret = single_open(file, vcpudispatch_stats_display, NULL);
+       return ret;
+}
+
+static const struct file_operations vcpudispatch_stats_proc_ops = {    /* used for "powerpc/vcpudispatch_stats" */
+       .open           = vcpudispatch_stats_open,
+       .read           = seq_read,
+       .write          = vcpudispatch_stats_write,     /* accepts 0 (disable) or 1 (enable) */
+       .llseek         = seq_lseek,
+       .release        = single_release,       /* pairs with single_open() in the open handler */
+};
+
+/*
+ * procfs write handler: set vcpudispatch_stats_freq to a value between 1
+ * and HZ.
+ *
+ * Returns @count on success, -EINVAL for input longer than 15 bytes or a
+ * value outside 1..HZ, -EFAULT if the user buffer cannot be copied, or the
+ * error reported by kstrtoint().
+ */
+static ssize_t vcpudispatch_stats_freq_write(struct file *file,
+               const char __user *p, size_t count, loff_t *ppos)
+{
+       char kbuf[16];
+       int freq, rc;
+
+       /* Leave room for the terminating NUL added below */
+       if (count > 15)
+               return -EINVAL;
+
+       if (copy_from_user(kbuf, p, count))
+               return -EFAULT;
+
+       kbuf[count] = 0;
+       rc = kstrtoint(kbuf, 0, &freq);
+       if (rc == 0 && freq >= 1 && freq <= HZ) {
+               vcpudispatch_stats_freq = freq;
+               return count;
+       }
+
+       pr_err("vcpudispatch_stats_freq: please specify a frequency between 1 and %d\n",
+              HZ);
+       return rc ? rc : -EINVAL;
+}
+
+/* seq_file show routine: print the current vcpudispatch_stats_freq value. */
+static int vcpudispatch_stats_freq_display(struct seq_file *p, void *v)
+{
+       int freq = vcpudispatch_stats_freq;
+
+       seq_printf(p, "%d\n", freq);
+
+       return 0;
+}
+
+/* open handler: bind the file to vcpudispatch_stats_freq_display() via single_open(). */
+static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file)
+{
+       int ret;
+
+       ret = single_open(file, vcpudispatch_stats_freq_display, NULL);
+       return ret;
+}
+
+static const struct file_operations vcpudispatch_stats_freq_proc_ops = {       /* used for "powerpc/vcpudispatch_stats_freq" */
+       .open           = vcpudispatch_stats_freq_open,
+       .read           = seq_read,
+       .write          = vcpudispatch_stats_freq_write,        /* accepts a value between 1 and HZ */
+       .llseek         = seq_lseek,
+       .release        = single_release,       /* pairs with single_open() in the open handler */
+};
+
+/*
+ * Create the "powerpc/vcpudispatch_stats" and
+ * "powerpc/vcpudispatch_stats_freq" procfs files (mode 0600) when
+ * lppaca_shared_proc() reports a shared processor configuration. The second
+ * file is only attempted if the first one was created.
+ *
+ * Always returns 0; failures to create a file are only logged.
+ */
+static int __init vcpudispatch_stats_procfs_init(void)
+{
+       bool shared;
+
+       /*
+        * get_lppaca() looks at the current cpu's lppaca and this initcall
+        * can be preempted, so pin ourselves to the cpu while reading it.
+        * NOTE(review): assumes every cpu reports the same value for
+        * lppaca_shared_proc(), so which cpu we sample does not matter.
+        */
+       preempt_disable();
+       shared = lppaca_shared_proc(get_lppaca());
+       preempt_enable();
+
+       if (!shared)
+               return 0;
+
+       if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL,
+                                       &vcpudispatch_stats_proc_ops))
+               pr_err("vcpudispatch_stats: error creating procfs file\n");
+       else if (!proc_create("powerpc/vcpudispatch_stats_freq", 0600, NULL,
+                                       &vcpudispatch_stats_freq_proc_ops))
+               pr_err("vcpudispatch_stats_freq: error creating procfs file\n");
+
+       return 0;
+}
+
+machine_device_initcall(pseries, vcpudispatch_stats_procfs_init);
 #endif /* CONFIG_PPC_SPLPAR */
 
 void vpa_init(int cpu)