www.infradead.org Git - users/dwmw2/linux.git/commitdiff
sched/wait: Add add_wait_queue_priority()
author: David Woodhouse <dwmw@amazon.co.uk>
Mon, 26 Oct 2020 17:26:06 +0000 (17:26 +0000)
committer: David Woodhouse <dwmw@amazon.co.uk>
Mon, 26 Oct 2020 17:26:06 +0000 (17:26 +0000)
This allows an exclusive wait_queue_entry to be added at the head of the
queue, instead of the tail as normal. Thus, it gets to consume events
first.

The problem I'm trying to solve here is interrupt remapping invalidation
vs. MSI interrupts from VFIO. I'd really like KVM IRQFD to be able to
consume events before (and indeed instead of) userspace.

When the remapped MSI target in the KVM routing table is invalidated,
the VMM needs to *deassociate* the IRQFD and fall back to handling the
next IRQ in userspace, so it can be retranslated and a fault reported
if appropriate.

It's possible to do that by constantly registering and deregistering the
fd in the userspace poll loop, but it gets ugly especially because the
fallback handler isn't really local to the core MSI handling.

It's much nicer if the userspace handler can just remain registered all
the time, and it just doesn't get any events when KVM steals them first.
Which is precisely what happens with posted interrupts, and this makes
it consistent. (Unless I'm missing something that prevents posted
interrupts from working when there's another listener on the eventfd?)

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
include/linux/wait.h
kernel/sched/wait.c

index 27fb99cfeb026932b6e26d70334a4ed5389551e3..fe10e8570a522166903411cfd9ab02f4109813fe 100644 (file)
@@ -22,6 +22,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 #define WQ_FLAG_BOOKMARK       0x04
 #define WQ_FLAG_CUSTOM         0x08
 #define WQ_FLAG_DONE           0x10
+#define WQ_FLAG_PRIORITY       0x20    /* entry is kept ahead of all non-priority entries */
 
 /*
  * A single wait-queue entry structure:
@@ -164,11 +165,20 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
 
 extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 
 static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
-       list_add(&wq_entry->entry, &wq_head->head);
+       struct list_head *head = &wq_head->head; /* node to insert after; starts at the queue head */
+       struct wait_queue_entry *wq;             /* cursor over already-queued entries */
+
+       list_for_each_entry(wq, &wq_head->head, entry) { /* walk the leading priority entries */
+               if (!(wq->flags & WQ_FLAG_PRIORITY))     /* first non-priority entry ends the walk */
+                       break;
+               head = &wq->entry;                       /* insert after this priority entry */
+       }
+       list_add(&wq_entry->entry, head); /* links the new entry immediately after 'head' */
 }
 
 /*
index 01f5d3020589de0e85ed2fa706c30bc0be547687..d2a84c8e88bf39eaf0158ef0aa6ad43ade9cf7e8 100644 (file)
@@ -37,6 +37,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue
 }
 EXPORT_SYMBOL(add_wait_queue_exclusive);
 
+void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+       unsigned long flags; /* IRQ state saved/restored by the lock/unlock pair below */
+
+       wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; /* priority entries are also exclusive */
+       spin_lock_irqsave(&wq_head->lock, flags);      /* queue lock held across the insertion */
+       __add_wait_queue(wq_head, wq_entry);           /* queues the entry on wq_head */
+       spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+
 void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
        unsigned long flags;