 #include <unistd.h>
 #include <time.h>
 #include <pthread.h>
+#include <semaphore.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <errno.h>
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
+#include <asm/barrier.h>
 
 #include "test_util.h"
 #include "kvm_util.h"
 # define test_and_clear_bit_le test_and_clear_bit
 #endif
 
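+/*
+ * Dirty ring size, in number of kvm_dirty_gfn entries.  Note: at the
+ * time of writing, KVM requires the ring size in bytes (this count
+ * multiplied by sizeof(struct kvm_dirty_gfn)) to be a power of two and
+ * at least one host page, so keep this count a power of two.
+ */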
+#define TEST_DIRTY_RING_COUNT          1024
+
 /*
  * Guest/Host shared variables. Ensure addr_gva2hva() and/or
  * sync_global_to/from_guest() are used when accessing from
 static uint64_t host_clear_count;
 static uint64_t host_track_next_count;
 
+/*
+ * The vcpu thread posts dirty_ring_vcpu_stop when it stops (e.g. on a
+ * dirty ring full exit); the main thread posts dirty_ring_vcpu_cont to
+ * let it continue after the ring has been reset.
+ */
+static sem_t dirty_ring_vcpu_stop;
+static sem_t dirty_ring_vcpu_cont;
+/*
+ * This is only used for verifying the dirty pages.  Dirty ring has a
+ * very tricky case when the ring just gets full: KVM will do a
+ * userspace exit due to ring full, and at that point the very last PFN
+ * is set in the ring while the data has not actually been changed (the
+ * guest WRITE has not really been applied yet), because we found the
+ * dirty ring full, refused to continue the vcpu, and recorded the
+ * dirty gfn while the page still held the old contents.
+ *
+ * For this specific case it's safe to skip checking this pfn for this
+ * bit, because it's a redundant bit: when the write happens later, the
+ * bit will be set again.  We use this variable to always keep track of
+ * the latest dirty gfn we've collected, so that if a data mismatch is
+ * found later in the verifying process, we let it pass.
+ */
+static uint64_t dirty_ring_last_page;
+
 enum log_mode_t {
        /* Only use KVM_GET_DIRTY_LOG for logging */
        LOG_MODE_DIRTY_LOG = 0,
        /* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
        LOG_MODE_CLEAR_LOG = 1,
 
+       /* Use dirty ring for logging */
+       LOG_MODE_DIRTY_RING = 2,
+
        LOG_MODE_NUM,
 
        /* Run all supported modes */
 static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
 /* Logging mode for current run */
 static enum log_mode_t host_log_mode;
+static pthread_t vcpu_thread;
+
+/*
+ * In our test we do signal tricks, so use a version of sem_wait() that
+ * retries when interrupted by a signal (EINTR).
+ */
+static void sem_wait_until(sem_t *sem)
+{
+       int ret;
+
+       do
+               ret = sem_wait(sem);
+       while (ret == -1 && errno == EINTR);
+}
 
 static bool clear_log_supported(void)
 {
        kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
 }
 
-static void default_after_vcpu_run(struct kvm_vm *vm)
+static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
 {
        struct kvm_run *run = vcpu_state(vm, VCPU_ID);
 
+       TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR),
+                   "vcpu run failed: errno=%d", err);
+
        TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
                    "Invalid guest sync status: exit_reason=%s\n",
                    exit_reason_str(run->exit_reason));
 }
 
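+/*
+ * Note: when supported, checking KVM_CAP_DIRTY_LOG_RING is expected to
+ * return the maximum supported ring size in bytes (nonzero), so the
+ * capability check doubles as the support check.
+ */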
+static bool dirty_ring_supported(void)
+{
+       return kvm_check_cap(KVM_CAP_DIRTY_LOG_RING);
+}
+
+static void dirty_ring_create_vm_done(struct kvm_vm *vm)
+{
+       /*
+        * Switch to dirty ring mode after VM creation but before any
+        * of the vcpus are created.
+        */
+       vm_enable_dirty_ring(vm, TEST_DIRTY_RING_COUNT *
+                            sizeof(struct kvm_dirty_gfn));
+}
+
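+/*
+ * For reference, each ring entry is a struct kvm_dirty_gfn (from
+ * include/uapi/linux/kvm.h at the time of writing):
+ *
+ *     struct kvm_dirty_gfn {
+ *             __u32 flags;
+ *             __u32 slot;
+ *             __u64 offset;
+ *     };
+ *
+ * KVM publishes entries with KVM_DIRTY_GFN_F_DIRTY set; after
+ * harvesting, userspace marks them KVM_DIRTY_GFN_F_RESET so that the
+ * next KVM_RESET_DIRTY_RINGS ioctl can recycle them.
+ */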
+static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
+{
+       /*
+        * Load-acquire so that slot/offset are only read after the
+        * entry is observed as dirty.
+        */
+       return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
+}
+
+static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
+{
+       /*
+        * Store-release so our reads of slot/offset complete before the
+        * entry is handed back to KVM.
+        */
+       smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
+}
+
+static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns,
+                                      int slot, void *bitmap,
+                                      uint32_t num_pages, uint32_t *fetch_index)
+{
+       struct kvm_dirty_gfn *cur;
+       uint32_t count = 0;
+
+       while (true) {
+               cur = &dirty_gfns[*fetch_index % TEST_DIRTY_RING_COUNT];
+               if (!dirty_gfn_is_dirtied(cur))
+                       break;
+               TEST_ASSERT(cur->slot == slot, "Slot number didn't match: "
+                           "%u != %u", cur->slot, slot);
+               TEST_ASSERT(cur->offset < num_pages, "Offset overflow: "
+                           "0x%llx >= 0x%x", cur->offset, num_pages);
+               set_bit_le(cur->offset, bitmap);
+               dirty_ring_last_page = cur->offset;
+               dirty_gfn_set_collected(cur);
+               (*fetch_index)++;
+               count++;
+       }
+
+       return count;
+}
+
+static void dirty_ring_wait_vcpu(void)
+{
+       sem_wait_until(&dirty_ring_vcpu_stop);
+}
+
+static void dirty_ring_continue_vcpu(void)
+{
+       pr_info("Notifying vcpu to continue\n");
+       sem_post(&dirty_ring_vcpu_cont);
+}
+
+static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot,
+                                          void *bitmap, uint32_t num_pages)
+{
+       /* We only have one vcpu, so a single static fetch index suffices */
+       static uint32_t fetch_index;
+       uint32_t count = 0, cleared;
+
+       dirty_ring_wait_vcpu();
+
+       count = dirty_ring_collect_one(vcpu_map_dirty_ring(vm, VCPU_ID),
+                                      slot, bitmap, num_pages, &fetch_index);
+
+       cleared = kvm_vm_reset_dirty_ring(vm);
+
+       /* Cleared pages should be the same as collected */
+       TEST_ASSERT(cleared == count, "Reset dirty pages (%u) do not match "
+                   "collected pages (%u)", cleared, count);
+
+       dirty_ring_continue_vcpu();
+
+       pr_info("Iteration %"PRIu64" collected %u pages\n", iteration, count);
+}
+
+static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
+{
+       struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+
+       /* A ucall-sync or ring-full event is allowed */
+       if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) {
+               /* We should allow this to continue */
+               ;
+       } else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL) {
+               /* Let the main thread know we have stopped before pausing */
+               sem_post(&dirty_ring_vcpu_stop);
+               pr_info("vcpu stops because dirty ring is full...\n");
+               sem_wait_until(&dirty_ring_vcpu_cont);
+               pr_info("vcpu continues now.\n");
+       } else {
+               TEST_ASSERT(false, "Invalid guest sync status: "
+                           "exit_reason=%s\n",
+                           exit_reason_str(run->exit_reason));
+       }
+}
+
+static void dirty_ring_before_vcpu_join(void)
+{
+       /*
+        * Post once more in case the vcpu is blocked waiting to
+        * continue, so that it can observe host_quit and exit.
+        */
+       sem_post(&dirty_ring_vcpu_cont);
+}
+
 struct log_mode {
        const char *name;
        /* Return true if this mode is supported, otherwise false */
        void (*collect_dirty_pages) (struct kvm_vm *vm, int slot,
                                     void *bitmap, uint32_t num_pages);
        /* Hook to call after each vcpu run */
-       void (*after_vcpu_run)(struct kvm_vm *vm);
+       void (*after_vcpu_run)(struct kvm_vm *vm, int ret, int err);
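+       /* Hook to call right before the vcpu thread is joined */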
+       void (*before_vcpu_join) (void);
 } log_modes[LOG_MODE_NUM] = {
        {
                .name = "dirty-log",
                .collect_dirty_pages = clear_log_collect_dirty_pages,
                .after_vcpu_run = default_after_vcpu_run,
        },
+       {
+               .name = "dirty-ring",
+               .supported = dirty_ring_supported,
+               .create_vm_done = dirty_ring_create_vm_done,
+               .collect_dirty_pages = dirty_ring_collect_dirty_pages,
+               .before_vcpu_join = dirty_ring_before_vcpu_join,
+               .after_vcpu_run = dirty_ring_after_vcpu_run,
+       },
 };
 
 /*
        mode->collect_dirty_pages(vm, slot, bitmap, num_pages);
 }
 
-static void log_mode_after_vcpu_run(struct kvm_vm *vm)
+static void log_mode_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
 {
        struct log_mode *mode = &log_modes[host_log_mode];
 
        if (mode->after_vcpu_run)
-               mode->after_vcpu_run(vm);
+               mode->after_vcpu_run(vm, ret, err);
+}
+
+static void log_mode_before_vcpu_join(void)
+{
+       struct log_mode *mode = &log_modes[host_log_mode];
+
+       if (mode->before_vcpu_join)
+               mode->before_vcpu_join();
 }
 
 static void generate_random_array(uint64_t *guest_array, uint64_t size)
 
 static void *vcpu_worker(void *data)
 {
-       int ret;
+       int ret, vcpu_fd;
        struct kvm_vm *vm = data;
        uint64_t *guest_array;
        uint64_t pages_count = 0;
 
+       vcpu_fd = vcpu_get_fd(vm, VCPU_ID);
+
        guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);
 
        while (!READ_ONCE(host_quit)) {
                generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
                pages_count += TEST_PAGES_PER_LOOP;
                /* Let the guest dirty the random pages */
-               ret = _vcpu_run(vm, VCPU_ID);
-               TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
-               log_mode_after_vcpu_run(vm);
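+               /*
+                * Invoke KVM_RUN directly instead of _vcpu_run() so that
+                * both the return value and errno can be passed to the
+                * per-mode hook below.
+                */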
+               ret = ioctl(vcpu_fd, KVM_RUN, NULL);
+               log_mode_after_vcpu_run(vm, ret, errno);
        }
 
        pr_info("Dirtied %"PRIu64" pages\n", pages_count);
        uint64_t step = vm_num_host_pages(mode, 1);
        uint64_t page;
        uint64_t *value_ptr;
+       uint64_t min_iter = 0;
 
        for (page = 0; page < host_num_pages; page += step) {
                value_ptr = host_test_mem + page * host_page_size;
                }
 
                if (test_and_clear_bit_le(page, bmap)) {
+                       bool matched;
+
                        host_dirty_count++;
+
                        /*
                         * If the bit is set, the value written onto
                         * the corresponding page should be either the
                         * previous iteration number or the current one.
                         */
-                       TEST_ASSERT(*value_ptr == iteration ||
-                                   *value_ptr == iteration - 1,
+                       matched = (*value_ptr == iteration ||
+                                  *value_ptr == iteration - 1);
+
+                       if (host_log_mode == LOG_MODE_DIRTY_RING && !matched) {
+                               if (*value_ptr == iteration - 2 && min_iter <= iteration - 2) {
+                                       /*
+                                        * Short answer: this case is special
+                                        * to the dirty ring test, where the
+                                        * page is the last page dirtied before
+                                        * a kvm dirty ring full event in
+                                        * iteration N-2.
+                                        *
+                                        * Long answer: assuming ring size R,
+                                        * one possible sequence is:
+                                        *
+                                        *      main thr       vcpu thr
+                                        *      --------       --------
+                                        *    iter=1
+                                        *                   write 1 to page 0~(R-1)
+                                        *                   full, vmexit
+                                        *    collect 0~(R-1)
+                                        *    kick vcpu
+                                        *                   write 1 to (R-1)~(2R-2)
+                                        *                   full, vmexit
+                                        *    iter=2
+                                        *    collect (R-1)~(2R-2)
+                                        *    kick vcpu
+                                        *                   write 1 to (2R-2)
+                                        *                   (NOTE!!! "1" cached in cpu reg)
+                                        *                   write 2 to (2R-1)~(3R-3)
+                                        *                   full, vmexit
+                                        *    iter=3
+                                        *    collect (2R-2)~(3R-3)
+                                        *    (here, reading page "2R-2"
+                                        *     gives 1, while iter=3!!!)
+                                        *
+                                        * This however can only happen once per iteration.
+                                        */
+                                       min_iter = iteration - 1;
+                                       continue;
+                               } else if (page == dirty_ring_last_page) {
+                                       /*
+                                        * Please refer to comments in
+                                        * dirty_ring_last_page.
+                                        */
+                                       continue;
+                               }
+                       }
+
+                       TEST_ASSERT(matched,
                                    "Set page %"PRIu64" value %"PRIu64
                                    " incorrect (iteration=%"PRIu64")",
                                    page, *value_ptr, iteration);
 static void run_test(enum vm_guest_mode mode, unsigned long iterations,
                     unsigned long interval, uint64_t phys_offset)
 {
-       pthread_t vcpu_thread;
        struct kvm_vm *vm;
        unsigned long *bmap;
 
 
        /* Tell the vcpu thread to quit */
        host_quit = true;
+       log_mode_before_vcpu_join();
        pthread_join(vcpu_thread, NULL);
 
        pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
        unsigned int mode;
        int opt, i, j;
 
+       sem_init(&dirty_ring_vcpu_stop, 0, 0);
+       sem_init(&dirty_ring_vcpu_cont, 0, 0);
+
 #ifdef __x86_64__
        guest_mode_init(VM_MODE_PXXV48_4K, true, true);
 #endif
 
        return r;
 }
 
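+/*
+ * Enable the dirty ring for the vm; ring_size is the per-vcpu ring
+ * size in bytes, and this must be called before any vcpu is created.
+ */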
+void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
+{
+       struct kvm_enable_cap cap = { 0 };
+
+       cap.cap = KVM_CAP_DIRTY_LOG_RING;
+       cap.args[0] = ring_size;
+       vm_enable_cap(vm, &cap);
+       vm->dirty_ring_size = ring_size;
+}
+
 static void vm_open(struct kvm_vm *vm, int perm)
 {
        vm->kvm_fd = open(KVM_DEV_PATH, perm);
                    __func__, strerror(-ret));
 }
 
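+/*
+ * Reset the dirty rings of all vcpus of the vm; per the documented
+ * behavior of KVM_RESET_DIRTY_RINGS, the return value is the number of
+ * entries that were reset.
+ */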
+uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm)
+{
+       return ioctl(vm->fd, KVM_RESET_DIRTY_RINGS);
+}
+
 /*
  * Userspace Memory Region Find
  *
  *
  * Removes a vCPU from a VM and frees its resources.
  */
-static void vm_vcpu_rm(struct vcpu *vcpu)
+static void vm_vcpu_rm(struct kvm_vm *vm, struct vcpu *vcpu)
 {
        int ret;
 
+       if (vcpu->dirty_gfns) {
+               ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size);
+               TEST_ASSERT(ret == 0, "munmap of VCPU dirty ring failed, "
+                           "rc: %i errno: %i", ret, errno);
+               vcpu->dirty_gfns = NULL;
+       }
+
        ret = munmap(vcpu->state, sizeof(*vcpu->state));
        TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i "
                "errno: %i", ret, errno);
        int ret;
 
        list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
-               vm_vcpu_rm(vcpu);
+               vm_vcpu_rm(vmp, vcpu);
 
        ret = close(vmp->fd);
        TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
        return rc;
 }
 
+int vcpu_get_fd(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+       TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+       return vcpu->fd;
+}
+
 void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid)
 {
        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        return ret;
 }
 
+void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       struct vcpu *vcpu;
+       uint32_t size = vm->dirty_ring_size;
+
+       TEST_ASSERT(size > 0, "Should enable dirty ring first");
+
+       vcpu = vcpu_find(vm, vcpuid);
+
+       TEST_ASSERT(vcpu, "Cannot find vcpu %u", vcpuid);
+
+       if (!vcpu->dirty_gfns) {
+               void *addr;
+
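+               /*
+                * KVM is expected to refuse private or executable
+                * mappings of the dirty ring, which the two mmap()
+                * calls below verify before establishing the real
+                * shared, read-write mapping.
+                */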
+               addr = mmap(NULL, size, PROT_READ,
+                           MAP_PRIVATE, vcpu->fd,
+                           vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
+               TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private");
+
+               addr = mmap(NULL, size, PROT_READ | PROT_EXEC,
+                           MAP_PRIVATE, vcpu->fd,
+                           vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
+               TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec");
+
+               addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+                           MAP_SHARED, vcpu->fd,
+                           vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
+               TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed");
+
+               vcpu->dirty_gfns = addr;
+               vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn);
+       }
+
+       return vcpu->dirty_gfns;
+}
+
 /*
  * VM Ioctl
  *
        {KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"},
        {KVM_EXIT_OSI, "OSI"},
        {KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"},
+       {KVM_EXIT_DIRTY_RING_FULL, "DIRTY_RING_FULL"},
 #ifdef KVM_EXIT_MEMORY_NOT_PRESENT
        {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
 #endif