Commit ca469f35a8e9ef ("deal with races between remove_proc_entry() and
proc_reg_release()") moved too much stuff under ->pde_unload_lock, making
the problem described in the series "[PATCH v5] procfs: Improve Scaling in
proc" worse.
While RCU is being figured out, move kfree() out of ->pde_unload_lock.
On my potato, the difference is only a 0.5% speedup with concurrent
open+read+close of /proc/cmdline, but the effect should be more
noticeable on more capable machines.
$ perf stat -r 16 -- ./proc-j 16
 Performance counter stats for './proc-j 16' (16 runs):
     130569.502377      task-clock (msec)         #   15.872 CPUs utilized            ( +-  0.05% )
            19,169      context-switches          #    0.147 K/sec                    ( +-  0.18% )
                15      cpu-migrations            #    0.000 K/sec                    ( +-  3.27% )
               437      page-faults               #    0.003 K/sec                    ( +-  1.25% )
   300,172,097,675      cycles                    #    2.299 GHz                      ( +-  0.05% )
    96,793,267,308      instructions              #    0.32  insn per cycle           ( +-  0.04% )
    22,798,342,298      branches                  #  174.607 M/sec                    ( +-  0.04% )
       111,764,687      branch-misses             #    0.49% of all branches          ( +-  0.47% )
       8.226574400 seconds time elapsed                                          ( +-  0.05% )
       ^^^^^^^^^^^
$ perf stat -r 16 -- ./proc-j 16
 Performance counter stats for './proc-j 16' (16 runs):
     129866.777392      task-clock (msec)         #   15.869 CPUs utilized            ( +-  0.04% )
            19,154      context-switches          #    0.147 K/sec                    ( +-  0.66% )
                14      cpu-migrations            #    0.000 K/sec                    ( +-  1.73% )
               431      page-faults               #    0.003 K/sec                    ( +-  1.09% )
   298,556,520,546      cycles                    #    2.299 GHz                      ( +-  0.04% )
    96,525,366,833      instructions              #    0.32  insn per cycle           ( +-  0.04% )
    22,730,194,043      branches                  #  175.027 M/sec                    ( +-  0.04% )
       111,506,074      branch-misses             #    0.49% of all branches          ( +-  0.18% )
       8.183629778 seconds time elapsed                                          ( +-  0.04% )
       ^^^^^^^^^^^
Link: http://lkml.kernel.org/r/20180213132911.GA24298@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 
                complete(pde->pde_unload_completion);
 }
 
-/* pde is locked */
+/* pde is locked on entry, unlocked on exit */
 static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 {
        /*
                pdeo->c = &c;
                spin_unlock(&pde->pde_unload_lock);
                wait_for_completion(&c);
-               spin_lock(&pde->pde_unload_lock);
        } else {
                struct file *file;
+               struct completion *c;
+
                pdeo->closing = true;
                spin_unlock(&pde->pde_unload_lock);
                file = pdeo->file;
                spin_lock(&pde->pde_unload_lock);
                /* After ->release. */
                list_del(&pdeo->lh);
-               if (unlikely(pdeo->c))
-                       complete(pdeo->c);
+               c = pdeo->c;
+               spin_unlock(&pde->pde_unload_lock);
+               if (unlikely(c))
+                       complete(c);
                kfree(pdeo);
        }
 }
                struct pde_opener *pdeo;
                pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
                close_pdeo(de, pdeo);
+               spin_lock(&de->pde_unload_lock);
        }
        spin_unlock(&de->pde_unload_lock);
 }
        list_for_each_entry(pdeo, &pde->pde_openers, lh) {
                if (pdeo->file == file) {
                        close_pdeo(pde, pdeo);
-                       break;
+                       return 0;
                }
        }
        spin_unlock(&pde->pde_unload_lock);