#if (NR_CPUS < 256)
 static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
 {
-       unsigned short inc = 1 << TICKET_SHIFT;
-
-       asm volatile (
-               LOCK_PREFIX "xaddw %w0, %1\n"
-               "1:\t"
-               "cmpb %h0, %b0\n\t"
-               "je 2f\n\t"
-               "rep ; nop\n\t"
-               "movb %1, %b0\n\t"
-               /* don't need lfence here, because loads are in-order */
-               "jmp 1b\n"
-               "2:"
-               : "+Q" (inc), "+m" (lock->slock)
-               :
-               : "memory", "cc");
+       register union {
+               struct __raw_tickets tickets;   /* head/tail view, compared in the loop below */
+               unsigned short slock;           /* same storage viewed as a single word */
+       } inc = { .slock = 1 << TICKET_SHIFT };
+
+       asm volatile (LOCK_PREFIX "xaddw %w0, %1\n"     /* slock += inc; inc = previous slock */
+                     : "+Q" (inc), "+m" (lock->slock) : : "memory", "cc");
+
+       for (;;) {
+               if (inc.tickets.head == inc.tickets.tail)       /* head has reached the tail we fetched */
+                       break;
+               cpu_relax();    /* takes the place of the old inline "rep ; nop" */
+               inc.tickets.head = ACCESS_ONCE(lock->tickets.head);     /* poll the lock's current head */
+       }
+       barrier();              /* make sure nothing creeps before the lock is taken */
 }
 
 static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
 static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
 {
        unsigned inc = 1 << TICKET_SHIFT;
-       unsigned tmp;
+       __ticket_t tmp; /* head value being polled; refreshed while spinning */
 
-       asm volatile(LOCK_PREFIX "xaddl %0, %1\n"
-                    "movzwl %w0, %2\n\t"
-                    "shrl $16, %0\n\t"
-                    "1:\t"
-                    "cmpl %0, %2\n\t"
-                    "je 2f\n\t"
-                    "rep ; nop\n\t"
-                    "movzwl %1, %2\n\t"
-                    /* don't need lfence here, because loads are in-order */
-                    "jmp 1b\n"
-                    "2:"
-                    : "+r" (inc), "+m" (lock->slock), "=&r" (tmp)
-                    :
-                    : "memory", "cc");
+       asm volatile(LOCK_PREFIX "xaddl %0, %1\n\t"     /* slock += inc; inc = previous slock */
+                    : "+r" (inc), "+m" (lock->slock)
+                    : : "memory", "cc");
+
+       tmp = inc;      /* low part of the old slock, assuming __ticket_t is narrower than unsigned - TODO confirm */
+       inc >>= TICKET_SHIFT;   /* remaining high part; takes the place of the old "shrl $16" */
+
+       for (;;) {
+               if ((__ticket_t)inc == tmp)     /* polled head matches the shifted-down value: stop spinning */
+                       break;
+               cpu_relax();    /* takes the place of the old inline "rep ; nop" */
+               tmp = ACCESS_ONCE(lock->tickets.head);  /* poll the lock's current head */
+       }
+       barrier();              /* make sure nothing creeps before the lock is taken */
 }
 
 static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)