return per_cpu_ptr(&mcs_nodes[idx], cpu);
 }
 
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) /* locked and pending bits combined */
+
 /**
  * queued_spin_lock_slowpath - acquire the queued spinlock
  * @lock: Pointer to queued spinlock structure
  * @val: Current value of the queued spinlock 32-bit word
  *
- * (queue tail, lock value)
- *
- *              fast      :    slow                                  :    unlock
- *                        :                                          :
- * uncontended  (0,0)   --:--> (0,1) --------------------------------:--> (*,0)
- *                        :       | ^--------.                    /  :
- *                        :       v           \                   |  :
- * uncontended            :    (n,x) --+--> (n,0)                 |  :
- *   queue                :       | ^--'                          |  :
- *                        :       v                               |  :
- * contended              :    (*,x) --+--> (*,0) -----> (*,1) ---'  :
- *   queue                :         ^--'                             :
+ * (queue tail, pending bit, lock value)
  *
+ *              fast     :    slow                                  :    unlock
+ *                       :                                          :
+ * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ *                       :       | ^--------.------.             /  :
+ *                       :       v           \      \            |  :
+ * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
+ *                       :       | ^--'              |           |  :
+ *                       :       v                   |           |  :
+ * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
+ *   queue               :       | ^--'                          |  :
+ *                       :       v                               |  :
+ * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
+ *   queue               :         ^--'                             :
  */
 void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 {
 
        BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
 
+       /*
+        * wait for in-progress pending->locked hand-overs
+        *
+        * 0,1,0 -> 0,0,1
+        */
+       if (val == _Q_PENDING_VAL) {
+               while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+                       cpu_relax();
+       }
+
+       /*
+        * trylock || pending
+        *
+        * 0,0,0 -> 0,0,1 ; trylock
+        * 0,0,1 -> 0,1,1 ; pending
+        */
+       for (;;) {
+               /*
+                * If pending or tail is set, others are already waiting; queue.
+                */
+               if (val & ~_Q_LOCKED_MASK)
+                       goto queue;
+
+               new = _Q_LOCKED_VAL;
+               if (val == new)
+                       new |= _Q_PENDING_VAL;
+
+               old = atomic_cmpxchg(&lock->val, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       /*
+        * we won the trylock
+        */
+       if (new == _Q_LOCKED_VAL)
+               return;
+
+       /*
+        * we're pending, wait for the owner to go away.
+        *
+        * *,1,1 -> *,1,0
+        */
+       while ((val = atomic_read(&lock->val)) & _Q_LOCKED_MASK)
+               cpu_relax();
+
+       /*
+        * take ownership and clear the pending bit.
+        *
+        * *,1,0 -> *,0,1
+        */
+       for (;;) {
+               new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+
+               old = atomic_cmpxchg(&lock->val, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+       return;
+
+       /*
+        * End of pending bit optimistic spinning and beginning of MCS
+        * queuing.
+        */
+queue:
        node = this_cpu_ptr(&mcs_nodes[0]);
        idx = node->count++;
        tail = encode_tail(smp_processor_id(), idx);
        node->next = NULL;
 
        /*
+        * We have already touched the queueing cacheline; don't bother with
+        * pending stuff.
+        *
         * trylock || xchg(lock, node)
         *
-        * 0,0 -> 0,1 ; no tail, not locked -> no tail, locked.
-        * p,x -> n,x ; tail was p -> tail is n; preserving locked.
+        * 0,0,0 -> 0,0,1 ; no tail, not locked -> no tail, locked.
+        * p,y,x -> n,y,x ; tail was p -> tail is n; preserving pending, locked.
         */
        for (;;) {
                new = _Q_LOCKED_VAL;
                if (val)
-                       new = tail | (val & _Q_LOCKED_MASK);
+                       new = tail | (val & _Q_LOCKED_PENDING_MASK);
 
                old = atomic_cmpxchg(&lock->val, val, new);
                if (old == val)
         * if there was a previous node; link it and wait until reaching the
         * head of the waitqueue.
         */
-       if (old & ~_Q_LOCKED_MASK) {
+       if (old & ~_Q_LOCKED_PENDING_MASK) {
                prev = decode_tail(old);
                WRITE_ONCE(prev->next, node);
 
        }
 
        /*
-        * we're at the head of the waitqueue, wait for the owner to go away.
+        * we're at the head of the waitqueue, wait for the owner & pending to
+        * go away.
         *
-        * *,x -> *,0
+        * *,x,y -> *,0,0
         */
-       while ((val = atomic_read(&lock->val)) & _Q_LOCKED_MASK)
+       while ((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)
                cpu_relax();
 
        /*
         * claim the lock:
         *
-        * n,0 -> 0,1 : lock, uncontended
-        * *,0 -> *,1 : lock, contended
+        * n,0,0 -> 0,0,1 : lock, uncontended
+        * *,0,0 -> *,0,1 : lock, contended
         */
        for (;;) {
                new = _Q_LOCKED_VAL;