 #include <crypto/internal/acompress.h>
 #include <crypto/internal/scompress.h>
 #include <crypto/scatterwalk.h>
+#include <linux/cpumask.h>
 #include <linux/cryptouser.h>
 #include <linux/err.h>
 #include <linux/highmem.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
 #include <net/netlink.h>
 
 #include "compress.h"
 static int scomp_scratch_users;
 static DEFINE_MUTEX(scomp_lock);
 
+static cpumask_t scomp_scratch_want;
+static void scomp_scratch_workfn(struct work_struct *work);
+static DECLARE_WORK(scomp_scratch_work, scomp_scratch_workfn);
+
 static int __maybe_unused crypto_scomp_report(
        struct sk_buff *skb, struct crypto_alg *alg)
 {
                scratch = per_cpu_ptr(&scomp_scratch, i);
 
                free_page(scratch->saddr);
-               vfree(scratch->dst);
+               kvfree(scratch->dst);
                scratch->src = NULL;
                scratch->dst = NULL;
        }
 }
 
-static int crypto_scomp_alloc_scratches(void)
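+/*
+ * Allocate the source page and destination buffer for one CPU's scratch,
+ * preferring memory from that CPU's NUMA node.  The pointers are published
+ * under the scratch lock so users always see a consistent pair.
+ */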
+static int scomp_alloc_scratch(struct scomp_scratch *scratch, int cpu)
 {
-       struct scomp_scratch *scratch;
-       int i;
+       int node = cpu_to_node(cpu);
+       struct page *page;
+       void *mem;
 
-       for_each_possible_cpu(i) {
-               struct page *page;
-               void *mem;
+       mem = kvmalloc_node(SCOMP_SCRATCH_SIZE, GFP_KERNEL, node);
+       if (!mem)
+               return -ENOMEM;
+       page = alloc_pages_node(node, GFP_KERNEL, 0);
+       if (!page) {
+               kvfree(mem);
+               return -ENOMEM;
+       }
+       spin_lock_bh(&scratch->lock);
+       scratch->src = page_address(page);
+       scratch->dst = mem;
+       spin_unlock_bh(&scratch->lock);
+       return 0;
+}
 
-               scratch = per_cpu_ptr(&scomp_scratch, i);
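+/*
+ * Work item: allocate a scratch buffer for every CPU that has asked for one.
+ * On allocation failure the CPU is left in scomp_scratch_want so that the
+ * attempt is repeated the next time the work is scheduled.
+ */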
+static void scomp_scratch_workfn(struct work_struct *work)
+{
+       int cpu;
 
-               page = alloc_pages_node(cpu_to_node(i), GFP_KERNEL, 0);
-               if (!page)
-                       goto error;
-               scratch->src = page_address(page);
-               mem = vmalloc_node(SCOMP_SCRATCH_SIZE, cpu_to_node(i));
-               if (!mem)
-                       goto error;
-               scratch->dst = mem;
+       for_each_cpu(cpu, &scomp_scratch_want) {
+               struct scomp_scratch *scratch;
+
+               scratch = per_cpu_ptr(&scomp_scratch, cpu);
+               if (scratch->src)
+                       continue;
+               if (scomp_alloc_scratch(scratch, cpu))
+                       break;
+
+               cpumask_clear_cpu(cpu, &scomp_scratch_want);
        }
-       return 0;
-error:
-       crypto_scomp_free_scratches();
-       return -ENOMEM;
+}
+
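+/*
+ * Only the first possible CPU gets its scratch up front; it doubles as the
+ * fallback for CPUs whose buffers have not been allocated lazily yet.
+ */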
+static int crypto_scomp_alloc_scratches(void)
+{
+       unsigned int i = cpumask_first(cpu_possible_mask);
+       struct scomp_scratch *scratch;
+
+       scratch = per_cpu_ptr(&scomp_scratch, i);
+       return scomp_alloc_scratch(scratch, i);
 }
 
 static void scomp_free_streams(struct scomp_alg *alg)
                struct crypto_acomp_stream *ps = per_cpu_ptr(stream, i);
 
                if (!ps->ctx)
-                       break;
+                       continue;
 
                alg->free_ctx(ps->ctx);
        }
 static int scomp_alloc_streams(struct scomp_alg *alg)
 {
        struct crypto_acomp_stream __percpu *stream;
-       int i;
+       struct crypto_acomp_stream *ps;
+       unsigned int i;
+       void *ctx;
 
        stream = alloc_percpu(struct crypto_acomp_stream);
        if (!stream)
                return -ENOMEM;
 
-       for_each_possible_cpu(i) {
-               struct crypto_acomp_stream *ps = per_cpu_ptr(stream, i);
+       ctx = alg->alloc_ctx();
+       if (IS_ERR(ctx)) {
+               free_percpu(stream);
+               return PTR_ERR(ctx);
+       }
 
-               ps->ctx = alg->alloc_ctx();
-               if (IS_ERR(ps->ctx)) {
-                       scomp_free_streams(alg);
-                       return PTR_ERR(ps->ctx);
-               }
+       i = cpumask_first(cpu_possible_mask);
+       ps = per_cpu_ptr(stream, i);
+       ps->ctx = ctx;
 
+       for_each_possible_cpu(i) {
+               ps = per_cpu_ptr(stream, i);
                spin_lock_init(&ps->lock);
        }
 
        return 0;
 }
 
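+/*
+ * Work item: allocate a stream context for every CPU recorded in
+ * alg->stream_want, leaving failed CPUs in the mask for a later retry.
+ */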
+static void scomp_stream_workfn(struct work_struct *work)
+{
+       struct scomp_alg *alg = container_of(work, struct scomp_alg,
+                                            stream_work);
+       struct crypto_acomp_stream __percpu *stream = alg->stream;
+       int cpu;
+
+       for_each_cpu(cpu, &alg->stream_want) {
+               struct crypto_acomp_stream *ps;
+               void *ctx;
+
+               ps = per_cpu_ptr(stream, cpu);
+               if (ps->ctx)
+                       continue;
+
+               ctx = alg->alloc_ctx();
+               if (IS_ERR(ctx))
+                       break;
+
+               spin_lock_bh(&ps->lock);
+               ps->ctx = ctx;
+               spin_unlock_bh(&ps->lock);
+
+               cpumask_clear_cpu(cpu, &alg->stream_want);
+       }
+}
+
 static int crypto_scomp_init_tfm(struct crypto_tfm *tfm)
 {
        struct scomp_alg *alg = crypto_scomp_alg(__crypto_scomp_tfm(tfm));
        return ret;
 }
 
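+/*
+ * Lock this CPU's scratch with BHs disabled.  If it has not been allocated
+ * yet, ask the work item for one and fall back to the first possible CPU's
+ * scratch, which is allocated at init time.  BHs stay disabled across the
+ * switch; scomp_unlock_scratch_bh() re-enables them.
+ */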
+static struct scomp_scratch *scomp_lock_scratch_bh(void) __acquires(scratch)
+{
+       int cpu = raw_smp_processor_id();
+       struct scomp_scratch *scratch;
+
+       scratch = per_cpu_ptr(&scomp_scratch, cpu);
+       spin_lock_bh(&scratch->lock);
+       if (likely(scratch->src))
+               return scratch;
+       spin_unlock(&scratch->lock);
+
+       cpumask_set_cpu(cpu, &scomp_scratch_want);
+       schedule_work(&scomp_scratch_work);
+
+       scratch = per_cpu_ptr(&scomp_scratch, cpumask_first(cpu_possible_mask));
+       spin_lock(&scratch->lock);
+       return scratch;
+}
+
+static inline void scomp_unlock_scratch_bh(struct scomp_scratch *scratch)
+       __releases(scratch)
+{
+       spin_unlock_bh(&scratch->lock);
+}
+
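+/*
+ * Same scheme as scomp_lock_scratch_bh() for the per-CPU stream context.
+ * BHs are already disabled by the scratch lock, so a plain spin_lock()
+ * suffices here.
+ */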
+static struct crypto_acomp_stream *scomp_lock_stream(struct crypto_scomp *tfm)
+       __acquires(stream)
+{
+       struct scomp_alg *alg = crypto_scomp_alg(tfm);
+       struct crypto_acomp_stream __percpu *stream;
+       int cpu = raw_smp_processor_id();
+       struct crypto_acomp_stream *ps;
+
+       stream = alg->stream;
+       ps = per_cpu_ptr(stream, cpu);
+       spin_lock(&ps->lock);
+       if (likely(ps->ctx))
+               return ps;
+       spin_unlock(&ps->lock);
+
+       cpumask_set_cpu(cpu, &alg->stream_want);
+       schedule_work(&alg->stream_work);
+
+       ps = per_cpu_ptr(stream, cpumask_first(cpu_possible_mask));
+       spin_lock(&ps->lock);
+       return ps;
+}
+
+static inline void scomp_unlock_stream(struct crypto_acomp_stream *stream)
+       __releases(stream)
+{
+       spin_unlock(&stream->lock);
+}
+
 static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
 {
-       struct scomp_scratch *scratch = raw_cpu_ptr(&scomp_scratch);
        struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
        struct crypto_scomp **tfm_ctx = acomp_tfm_ctx(tfm);
        struct crypto_scomp *scomp = *tfm_ctx;
        struct crypto_acomp_stream *stream;
+       struct scomp_scratch *scratch;
        unsigned int slen = req->slen;
        unsigned int dlen = req->dlen;
        struct page *spage, *dpage;
        if (!req->dst || !dlen)
                return -EINVAL;
 
+       scratch = scomp_lock_scratch_bh();
+
        if (acomp_request_src_isvirt(req))
                src = req->svirt;
        else {
                                break;
                        src = kmap_local_page(spage) + soff;
                } while (0);
+
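+               /*
+                * src points at the scratch buffer when the source SG list
+                * could not be mapped directly; copy the data in under the
+                * scratch lock.
+                */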
+               if (src == scratch->src)
+                       memcpy_from_sglist(scratch->src, req->src, 0, slen);
        }
 
        if (acomp_request_dst_isvirt(req))
                dlen = min(dlen, max);
        }
 
-       spin_lock_bh(&scratch->lock);
-
-       if (src == scratch->src)
-               memcpy_from_sglist(scratch->src, req->src, 0, slen);
-
-       stream = raw_cpu_ptr(crypto_scomp_alg(scomp)->stream);
-       spin_lock(&stream->lock);
+       stream = scomp_lock_stream(scomp);
        if (dir)
                ret = crypto_scomp_compress(scomp, src, slen,
                                            dst, &dlen, stream->ctx);
        if (dst == scratch->dst)
                memcpy_to_sglist(req->dst, 0, dst, dlen);
 
-       spin_unlock(&stream->lock);
-       spin_unlock_bh(&scratch->lock);
+       scomp_unlock_stream(stream);
+       scomp_unlock_scratch_bh(scratch);
 
        req->dlen = dlen;
 
 
        crypto_free_scomp(*ctx);
 
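+       /* Make sure a deferred scratch allocation is not still in flight. */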
+       flush_work(&scomp_scratch_work);
        mutex_lock(&scomp_lock);
        if (!--scomp_scratch_users)
                crypto_scomp_free_scratches();
 
 static void crypto_scomp_destroy(struct crypto_alg *alg)
 {
-       scomp_free_streams(__crypto_scomp_alg(alg));
+       struct scomp_alg *scomp = __crypto_scomp_alg(alg);
+
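+       /* Stop any pending lazy stream allocation before freeing the contexts. */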
+       cancel_work_sync(&scomp->stream_work);
+       scomp_free_streams(scomp);
 }
 
 static const struct crypto_type crypto_scomp_type = {
        comp_prepare_alg(&alg->calg);
 
        base->cra_flags |= CRYPTO_ALG_REQ_CHAIN;
+
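+       /* Per-CPU stream contexts are allocated on demand from this work item. */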
+       INIT_WORK(&alg->stream_work, scomp_stream_workfn);
 }
 
 int crypto_register_scomp(struct scomp_alg *alg)