From: Chris Mason Date: Tue, 5 Jul 2011 19:07:25 +0000 (-0500) Subject: ipc semaphores: order wakeups based on waiter CPU X-Git-Tag: v2.6.39-400.9.0~945 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=8102e1ff9d667661b581209323faaf7a84f0f528;p=users%2Fjedix%2Flinux-maple.git ipc semaphores: order wakeups based on waiter CPU When IPC semaphores are used in a bulk post and wait system, we can end up waking a very large number of processes per semtimedop call. At least one major database will use a single process to kick hundreds of other processes at a time. This patch tries to reduce the runqueue lock contention by ordering the wakeups based on the CPU the waiting process was on when it went to sleep. A later patch could add some code in the scheduler to help wake these up in bulk and take the various runqueue locks less often. Signed-off-by: Chris Mason --- diff --git a/include/linux/sem.h b/include/linux/sem.h index 5a97a370c71d..15da841d1161 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -106,6 +106,7 @@ struct sem_array { struct sem_queue { struct list_head list; /* queue of pending operations */ struct task_struct *sleeper; /* this process */ + unsigned long sleep_cpu; struct sem_undo *undo; /* undo structure */ int pid; /* process id of requesting process */ int status; /* completion status of operation */ diff --git a/ipc/sem.c b/ipc/sem.c index 4643652be3f5..d34316d8a924 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -572,6 +572,25 @@ static void wake_up_sem_queue_do(struct list_head *pt) preempt_enable(); } +/* + * sorting helper for struct sem_queues in a list. This is used to + * sort by the CPU they are likely to be on when waking them. 
+ */ +int list_comp(void *priv, struct list_head *a, struct list_head *b) +{ + struct sem_queue *qa; + struct sem_queue *qb; + + qa = list_entry(a, struct sem_queue, list); + qb = list_entry(b, struct sem_queue, list); + + if (qa->sleep_cpu < qb->sleep_cpu) + return -1; + if (qa->sleep_cpu > qb->sleep_cpu) + return 1; + return 0; +} + /** * update_queue(sma, semnum): Look for tasks that can be completed. * @sma: semaphore array. @@ -591,6 +610,7 @@ static int update_queue(struct sem_array *sma, struct list_head *pt, struct sem_queue *q; LIST_HEAD(new_pending); LIST_HEAD(work_list); + LIST_HEAD(wake_list); int semop_completed = 0; /* @@ -630,13 +650,24 @@ again: if (!error) semop_completed = 1; - wake_up_sem_queue_prepare(pt, q, error); + if (error) + wake_up_sem_queue_prepare(pt, q, error); + else + list_add_tail(&q->list, &wake_list); if (!list_empty(&new_pending)) { list_splice_init(&new_pending, &work_list); goto again; } } + + list_sort(NULL, &wake_list, list_comp); + while (!list_empty(&wake_list)) { + q = list_entry(wake_list.next, struct sem_queue, list); + list_del_init(&q->list); + wake_up_sem_queue_prepare(pt, q, 0); + } + return semop_completed; } @@ -1551,6 +1582,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, queue.alter = alter; queue.status = -EINTR; queue.sleeper = current; + + /* + * Record sleep_cpu before sleeping: update_queue() sorts the waiters + * it is about to wake by this value, ordering wakeups by waiter CPU. + */ + queue.sleep_cpu = my_cpu_offset; current->state = TASK_INTERRUPTIBLE; /*