static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
 {
-       set_bit(0, (volatile unsigned long *)&lock->tickets.tail);
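+       /* the flag now lives in head so the unlocking xadd() returns it atomically */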
+       set_bit(0, (volatile unsigned long *)&lock->tickets.head);
 }
 
 #else  /* !CONFIG_PARAVIRT_SPINLOCKS */
 }
 
 #endif /* CONFIG_PARAVIRT_SPINLOCKS */
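+/*
+ * Compare two tickets with TICKET_SLOWPATH_FLAG masked off.  The flag now
+ * lives in the low bit of head, so a plain equality test would spuriously
+ * fail once a waiter has set it.
+ */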
+static inline int __tickets_equal(__ticket_t one, __ticket_t two)
+{
+       return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
+}
+
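+/*
+ * Try to clear TICKET_SLOWPATH_FLAG from @head, but only while the lock
+ * looks uncontended: @old encodes "flag set, exactly one ticket out",
+ * @new the same state with the flag cleared.  If another waiter queued
+ * up in the meantime, the cmpxchg harmlessly fails and the flag stays.
+ */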
+static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock,
+                                                       __ticket_t head)
+{
+       if (head & TICKET_SLOWPATH_FLAG) {
+               arch_spinlock_t old, new;
+
+               old.tickets.head = head;
+               new.tickets.head = head & ~TICKET_SLOWPATH_FLAG;
+               old.tickets.tail = new.tickets.head + TICKET_LOCK_INC;
+               new.tickets.tail = old.tickets.tail;
+
+               /* try to clear slowpath flag when there are no contenders */
+               cmpxchg(&lock->head_tail, old.head_tail, new.head_tail);
+       }
+}
 
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
-       return lock.tickets.head == lock.tickets.tail;
+       return __tickets_equal(lock.tickets.head, lock.tickets.tail);
 }
 
 /*
        if (likely(inc.head == inc.tail))
                goto out;
 
-       inc.tail &= ~TICKET_SLOWPATH_FLAG;
        for (;;) {
                unsigned count = SPIN_THRESHOLD;
 
                do {
-                       if (READ_ONCE(lock->tickets.head) == inc.tail)
-                               goto out;
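+                       /* re-read head: it may carry TICKET_SLOWPATH_FLAG */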
+                       inc.head = READ_ONCE(lock->tickets.head);
+                       if (__tickets_equal(inc.head, inc.tail))
+                               goto clear_slowpath;
                        cpu_relax();
                } while (--count);
                __ticket_lock_spinning(lock, inc.tail);
        }
-out:   barrier();      /* make sure nothing creeps before the lock is taken */
+clear_slowpath:
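+       /* lock is ours; drop the slowpath flag unless someone else is waiting */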
+       __ticket_check_and_clear_slowpath(lock, inc.head);
+out:
+       barrier();      /* make sure nothing creeps before the lock is taken */
 }
 
 static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
        arch_spinlock_t old, new;
 
        old.tickets = READ_ONCE(lock->tickets);
-       if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
+       if (!__tickets_equal(old.tickets.head, old.tickets.tail))
                return 0;
 
        new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
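+       /* also clear any slowpath flag inherited from the previous holder */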
+       new.head_tail &= ~TICKET_SLOWPATH_FLAG;
 
        /* cmpxchg is a full barrier, so nothing can move before it */
        return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
 }
 
-static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock,
-                                           arch_spinlock_t old)
-{
-       arch_spinlock_t new;
-
-       BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
-
-       /* Perform the unlock on the "before" copy */
-       old.tickets.head += TICKET_LOCK_INC;
-
-       /* Clear the slowpath flag */
-       new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT);
-
-       /*
-        * If the lock is uncontended, clear the flag - use cmpxchg in
-        * case it changes behind our back though.
-        */
-       if (new.tickets.head != new.tickets.tail ||
-           cmpxchg(&lock->head_tail, old.head_tail,
-                                       new.head_tail) != old.head_tail) {
-               /*
-                * Lock still has someone queued for it, so wake up an
-                * appropriate waiter.
-                */
-               __ticket_unlock_kick(lock, old.tickets.head);
-       }
-}
-
 static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
        if (TICKET_SLOWPATH_FLAG &&
-           static_key_false(&paravirt_ticketlocks_enabled)) {
-               arch_spinlock_t prev;
+               static_key_false(&paravirt_ticketlocks_enabled)) {
+               __ticket_t head;
 
-               prev = *lock;
-               add_smp(&lock->tickets.head, TICKET_LOCK_INC);
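+               /* the ticket arithmetic below needs NR_CPUS to fit in __ticket_t */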
+               BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
 
-               /* add_smp() is a full mb() */
+               head = xadd(&lock->tickets.head, TICKET_LOCK_INC);
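+               /* xadd() is a full barrier and returns the pre-unlock head */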
 
-               if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
-                       __ticket_unlock_slowpath(lock, prev);
+               if (unlikely(head & TICKET_SLOWPATH_FLAG)) {
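+                       /* wake the CPU whose ticket just became the new head */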
+                       head &= ~TICKET_SLOWPATH_FLAG;
+                       __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC));
+               }
        } else
                __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
 }
 {
        struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-       return tmp.tail != tmp.head;
+       return !__tickets_equal(tmp.tail, tmp.head);
 }
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
        struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-       return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
+       tmp.head &= ~TICKET_SLOWPATH_FLAG;
+       return (tmp.tail - tmp.head) > TICKET_LOCK_INC;
 }
 #define arch_spin_is_contended arch_spin_is_contended
 
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-       __ticket_t head = ACCESS_ONCE(lock->tickets.head);
+       __ticket_t head = READ_ONCE(lock->tickets.head);
 
        for (;;) {
-               struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+               struct __raw_tickets tmp = READ_ONCE(lock->tickets);
                /*
                 * We need to check "unlocked" in a loop, tmp.head == head
                 * can be false positive because of overflow.
                 */
-               if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) ||
-                   tmp.head != head)
+               if (__tickets_equal(tmp.head, tmp.tail) ||
+                               !__tickets_equal(tmp.head, head))
                        break;
 
                cpu_relax();
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
 
        u8 ret;
        u8 old;
 
-       old = ACCESS_ONCE(zero_stats);
+       old = READ_ONCE(zero_stats);
        if (unlikely(old)) {
                ret = cmpxchg(&zero_stats, old, 0);
                /* This ensures only one fellow resets the stat */
        int cpu;
        u64 start;
        unsigned long flags;
+       __ticket_t head;
 
        if (in_nmi())
                return;
         */
        __ticket_enter_slowpath(lock);
 
+       /* make sure enter_slowpath, which is atomic, does not cross the read */
+       smp_mb__after_atomic();
+
        /*
         * check again make sure it didn't become free while
         * we weren't looking.
         */
-       if (ACCESS_ONCE(lock->tickets.head) == want) {
+       head = READ_ONCE(lock->tickets.head);
+       if (__tickets_equal(head, want)) {
                add_stats(TAKEN_SLOW_PICKUP, 1);
                goto out;
        }
        add_stats(RELEASED_SLOW, 1);
        for_each_cpu(cpu, &waiting_cpus) {
                const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
-               if (ACCESS_ONCE(w->lock) == lock &&
-                   ACCESS_ONCE(w->want) == ticket) {
+               if (READ_ONCE(w->lock) == lock &&
+                   READ_ONCE(w->want) == ticket) {
                        add_stats(RELEASED_SLOW_KICKED, 1);
                        kvm_kick_cpu(cpu);
                        break;
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
 
 static inline void check_zero(void)
 {
        u8 ret;
-       u8 old = ACCESS_ONCE(zero_stats);
+       u8 old = READ_ONCE(zero_stats);
        if (unlikely(old)) {
                ret = cmpxchg(&zero_stats, old, 0);
                /* This ensures only one fellow resets the stat */
        struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting);
        int cpu = smp_processor_id();
        u64 start;
+       __ticket_t head;
        unsigned long flags;
 
        /* If kicker interrupts not initialized yet, just spin */
         */
        __ticket_enter_slowpath(lock);
 
+       /* make sure enter_slowpath, which is atomic, does not cross the read */
+       smp_mb__after_atomic();
+
        /*
         * check again make sure it didn't become free while
         * we weren't looking
         */
-       if (ACCESS_ONCE(lock->tickets.head) == want) {
+       head = READ_ONCE(lock->tickets.head);
+       if (__tickets_equal(head, want)) {
                add_stats(TAKEN_SLOW_PICKUP, 1);
                goto out;
        }
                const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
 
                /* Make sure we read lock before want */
-               if (ACCESS_ONCE(w->lock) == lock &&
-                   ACCESS_ONCE(w->want) == next) {
+               if (READ_ONCE(w->lock) == lock &&
+                   READ_ONCE(w->want) == next) {
                        add_stats(RELEASED_SLOW_KICKED, 1);
                        xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
                        break;