#endif
 }
 
-/* Deprecated. Always use nr_cpu_ids. */
-#define nr_cpumask_bits        nr_cpu_ids
+/*
+ * We have several different "preferred sizes" for the cpumask
+ * operations, depending on operation.
+ *
+ * For example, the bitmap scanning and operating operations have
+ * optimized routines that work for the single-word case, but only when
+ * the size is constant. So if NR_CPUS fits in one single word, we are
+ * better off using that small constant, in order to trigger the
+ * optimized bit finding. That is 'small_cpumask_size'.
+ *
+ * The clearing and copying operations will similarly perform better
+ * with a constant size, but we limit that size arbitrarily to four
+ * words. We call this 'large_cpumask_size'.
+ *
+ * Finally, some operations just want the exact limit, either because
+ * they set bits or just don't have any faster fixed-sized versions. We
+ * call this just 'nr_cpumask_size'.
+ *
+ * Note that these optional constants are always guaranteed to be at
+ * least as big as 'nr_cpu_ids' itself is, and all our cpumask
+ * allocations are at least that size (see cpumask_size()). The
+ * optimization comes from being able to potentially use a compile-time
+ * constant instead of a run-time generated exact number of CPUs.
+ */
+#if NR_CPUS <= BITS_PER_LONG
+  #define small_cpumask_bits ((unsigned int)NR_CPUS)
+  #define large_cpumask_bits ((unsigned int)NR_CPUS)
+#elif NR_CPUS <= 4*BITS_PER_LONG
+  #define small_cpumask_bits nr_cpu_ids
+  #define large_cpumask_bits ((unsigned int)NR_CPUS)
+#else
+  #define small_cpumask_bits nr_cpu_ids
+  #define large_cpumask_bits nr_cpu_ids
+#endif
+#define nr_cpumask_bits nr_cpu_ids
 
 /*
  * The following particular system cpumasks and operations manage
  */
 static inline unsigned int cpumask_first(const struct cpumask *srcp)
 {
-       return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
+       return find_first_bit(cpumask_bits(srcp), small_cpumask_bits);
 }
 
 /**
  */
 static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
 {
-       return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits);
+       return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits);
 }
 
 /**
 static inline
 unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
 {
-       return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
+       return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
 }
 
 /**
  */
 static inline unsigned int cpumask_last(const struct cpumask *srcp)
 {
-       return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
+       return find_last_bit(cpumask_bits(srcp), small_cpumask_bits);
 }
 
 /**
        /* -1 is a legal arg here. */
        if (n != -1)
                cpumask_check(n);
-       return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
+       return find_next_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1);
 }
 
 /**
        /* -1 is a legal arg here. */
        if (n != -1)
                cpumask_check(n);
-       return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
+       return find_next_zero_bit(cpumask_bits(srcp), small_cpumask_bits, n+1);
 }
 
 #if NR_CPUS == 1
        if (n != -1)
                cpumask_check(n);
        return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
-               nr_cpumask_bits, n + 1);
+               small_cpumask_bits, n + 1);
 }
 
 /**
  * After the loop, cpu is >= nr_cpu_ids.
  */
 #define for_each_cpu(cpu, mask)                                \
-       for_each_set_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
-
-/**
- * for_each_cpu_not - iterate over every cpu in a complemented mask
- * @cpu: the (optionally unsigned) integer iterator
- * @mask: the cpumask pointer
- *
- * After the loop, cpu is >= nr_cpu_ids.
- */
-#define for_each_cpu_not(cpu, mask)                            \
-       for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
+       for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
 
 #if NR_CPUS == 1
 static inline
  * After the loop, cpu is >= nr_cpu_ids.
  */
 #define for_each_cpu_wrap(cpu, mask, start)                            \
-       for_each_set_bit_wrap(cpu, cpumask_bits(mask), nr_cpumask_bits, start)
+       for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start)
 
 /**
  * for_each_cpu_and - iterate over every cpu in both masks
  * After the loop, cpu is >= nr_cpu_ids.
  */
 #define for_each_cpu_and(cpu, mask1, mask2)                            \
-       for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits)
+       for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
 
 /**
  * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
  * After the loop, cpu is >= nr_cpu_ids.
  */
 #define for_each_cpu_andnot(cpu, mask1, mask2)                         \
-       for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits)
+       for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
 
 /**
  * cpumask_any_but - return a "random" in a cpumask, but not this one.
  */
 static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
 {
-       return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu));
+       return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu));
 }
 
 /**
                                                        const struct cpumask *srcp2)
 {
        return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
-                               nr_cpumask_bits, cpumask_check(cpu));
+                               small_cpumask_bits, cpumask_check(cpu));
 }
 
 /**
                                                        const struct cpumask *srcp2)
 {
        return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
-                               nr_cpumask_bits, cpumask_check(cpu));
+                               small_cpumask_bits, cpumask_check(cpu));
 }
 
 /**
        return find_nth_and_andnot_bit(cpumask_bits(srcp1),
                                        cpumask_bits(srcp2),
                                        cpumask_bits(srcp3),
-                                       nr_cpumask_bits, cpumask_check(cpu));
+                                       small_cpumask_bits, cpumask_check(cpu));
 }
 
 #define CPU_BITS_NONE                                          \
 /**
  * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
  * @dstp: the cpumask pointer
+ *
+ * Note: since we set bits, we should use the tighter 'bitmap_set()' with
+ * the eact number of bits, not 'bitmap_fill()' that will fill past the
+ * end.
  */
 static inline void cpumask_setall(struct cpumask *dstp)
 {
-       bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
+       bitmap_set(cpumask_bits(dstp), 0, nr_cpumask_bits);
 }
 
 /**
  */
 static inline void cpumask_clear(struct cpumask *dstp)
 {
-       bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits);
+       bitmap_zero(cpumask_bits(dstp), large_cpumask_bits);
 }
 
 /**
                               const struct cpumask *src2p)
 {
        return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p),
-                                      cpumask_bits(src2p), nr_cpumask_bits);
+                                      cpumask_bits(src2p), small_cpumask_bits);
 }
 
 /**
                              const struct cpumask *src2p)
 {
        bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p),
-                                     cpumask_bits(src2p), nr_cpumask_bits);
+                                     cpumask_bits(src2p), small_cpumask_bits);
 }
 
 /**
                               const struct cpumask *src2p)
 {
        bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p),
-                                      cpumask_bits(src2p), nr_cpumask_bits);
+                                      cpumask_bits(src2p), small_cpumask_bits);
 }
 
 /**
                                  const struct cpumask *src2p)
 {
        return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p),
-                                         cpumask_bits(src2p), nr_cpumask_bits);
-}
-
-/**
- * cpumask_complement - *dstp = ~*srcp
- * @dstp: the cpumask result
- * @srcp: the input to invert
- */
-static inline void cpumask_complement(struct cpumask *dstp,
-                                     const struct cpumask *srcp)
-{
-       bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp),
-                                             nr_cpumask_bits);
+                                         cpumask_bits(src2p), small_cpumask_bits);
 }
 
 /**
                                const struct cpumask *src2p)
 {
        return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p),
-                                                nr_cpumask_bits);
+                                                small_cpumask_bits);
 }
 
 /**
                                    const struct cpumask *src3p)
 {
        return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p),
-                              cpumask_bits(src3p), nr_cpumask_bits);
+                              cpumask_bits(src3p), small_cpumask_bits);
 }
 
 /**
                                     const struct cpumask *src2p)
 {
        return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p),
-                                                     nr_cpumask_bits);
+                                                     small_cpumask_bits);
 }
 
 /**
                                 const struct cpumask *src2p)
 {
        return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p),
-                                                 nr_cpumask_bits);
+                                                 small_cpumask_bits);
 }
 
 /**
  */
 static inline bool cpumask_empty(const struct cpumask *srcp)
 {
-       return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits);
+       return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits);
 }
 
 /**
  */
 static inline unsigned int cpumask_weight(const struct cpumask *srcp)
 {
-       return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits);
+       return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits);
 }
 
 /**
 static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
                                                const struct cpumask *srcp2)
 {
-       return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
+       return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
 }
 
 /**
                                       const struct cpumask *srcp, int n)
 {
        bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n,
-                                              nr_cpumask_bits);
+                                              small_cpumask_bits);
 }
 
 /**
 static inline void cpumask_copy(struct cpumask *dstp,
                                const struct cpumask *srcp)
 {
-       bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits);
+       bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits);
 }
 
 /**
  */
 static inline unsigned int cpumask_size(void)
 {
-       return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long);
+       return BITS_TO_LONGS(large_cpumask_bits) * sizeof(long);
 }
 
 /*
 
                KUNIT_EXPECT_EQ_MSG((test), mask_weight, iter, MASK_MSG(mask)); \
        } while (0)
 
-#define EXPECT_FOR_EACH_CPU_NOT_EQ(test, mask)                                 \
-       do {                                                                    \
-               const cpumask_t *m = (mask);                                    \
-               int mask_weight = cpumask_weight(m);                            \
-               int cpu, iter = 0;                                              \
-               for_each_cpu_not(cpu, m)                                        \
-                       iter++;                                                 \
-               KUNIT_EXPECT_EQ_MSG((test), nr_cpu_ids - mask_weight, iter, MASK_MSG(mask));    \
-       } while (0)
-
 #define EXPECT_FOR_EACH_CPU_OP_EQ(test, op, mask1, mask2)                      \
        do {                                                                    \
                const cpumask_t *m1 = (mask1);                                  \
        KUNIT_EXPECT_EQ_MSG(test, 0, cpumask_weight(&mask_empty), MASK_MSG(&mask_empty));
        KUNIT_EXPECT_EQ_MSG(test, nr_cpu_ids, cpumask_weight(cpu_possible_mask),
                            MASK_MSG(cpu_possible_mask));
-       KUNIT_EXPECT_EQ_MSG(test, nr_cpumask_bits, cpumask_weight(&mask_all), MASK_MSG(&mask_all));
+       KUNIT_EXPECT_EQ_MSG(test, nr_cpu_ids, cpumask_weight(&mask_all), MASK_MSG(&mask_all));
 }
 
 static void test_cpumask_first(struct kunit *test)
 static void test_cpumask_iterators(struct kunit *test)
 {
        EXPECT_FOR_EACH_CPU_EQ(test, &mask_empty);
-       EXPECT_FOR_EACH_CPU_NOT_EQ(test, &mask_empty);
        EXPECT_FOR_EACH_CPU_WRAP_EQ(test, &mask_empty);
        EXPECT_FOR_EACH_CPU_OP_EQ(test, and, &mask_empty, &mask_empty);
        EXPECT_FOR_EACH_CPU_OP_EQ(test, and, cpu_possible_mask, &mask_empty);
        EXPECT_FOR_EACH_CPU_OP_EQ(test, andnot, &mask_empty, &mask_empty);
 
        EXPECT_FOR_EACH_CPU_EQ(test, cpu_possible_mask);
-       EXPECT_FOR_EACH_CPU_NOT_EQ(test, cpu_possible_mask);
        EXPECT_FOR_EACH_CPU_WRAP_EQ(test, cpu_possible_mask);
        EXPECT_FOR_EACH_CPU_OP_EQ(test, and, cpu_possible_mask, cpu_possible_mask);
        EXPECT_FOR_EACH_CPU_OP_EQ(test, andnot, cpu_possible_mask, &mask_empty);