if (prot_numa && pmd_protnone(*pmd))
                goto unlock;
 
-       entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
+       /*
+        * In the prot_numa case, we are under down_read(mmap_sem); it's
+        * critical not to clear the pmd intermittently, to avoid racing
+        * with MADV_DONTNEED, which also runs under down_read(mmap_sem):
+        *
+        *      CPU0:                           CPU1:
+        *                              change_huge_pmd(prot_numa=1)
+        *                               pmdp_huge_get_and_clear_notify()
+        * madvise_dontneed()
+        *  zap_pmd_range()
+        *   pmd_trans_huge(*pmd) == 0 (without ptl)
+        *   // skip the pmd
+        *                               set_pmd_at();
+        *                               // pmd is re-established
+        *
+        * The race makes MADV_DONTNEED miss the huge pmd and fail to clear
+        * it, which may break userspace.
+        *
+        * pmdp_invalidate() is required to make sure we don't miss
+        * dirty/young flags set by hardware.
+        */
+       entry = *pmd;
+       pmdp_invalidate(vma, addr, pmd);
+
+       /*
+        * Recover dirty/young flags from *pmd.  This relies on
+        * pmdp_invalidate() not corrupting those bits.
+        */
+       if (pmd_dirty(*pmd))
+               entry = pmd_mkdirty(entry);
+       if (pmd_young(*pmd))
+               entry = pmd_mkyoung(entry);
+
        entry = pmd_modify(entry, newprot);
        if (preserve_write)
                entry = pmd_mk_savedwrite(entry);
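To illustrate the userspace-visible breakage described in the comment above, here is a minimal, hypothetical reproducer sketch (not part of the patch). It assumes the mapping is backed by a 2M huge pmd and that automatic NUMA balancing is concurrently changing protection on that pmd; if zap_pmd_range() skips the transiently-cleared pmd, the zero-fill guarantee of MADV_DONTNEED is broken and the final check fails.

/* Hypothetical sketch only; sizes and the reliance on THP are assumptions. */
#include <assert.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE	(2UL << 20)	/* assume 2M transparent huge pages */

int main(void)
{
	char *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	assert(p != MAP_FAILED);
	madvise(p, HPAGE_SIZE, MADV_HUGEPAGE);	/* hint: back with a huge pmd */
	memset(p, 0xab, HPAGE_SIZE);		/* populate the mapping */

	/*
	 * If change_huge_pmd(prot_numa=1) has transiently cleared the pmd,
	 * the lockless pmd_trans_huge(*pmd) check in zap_pmd_range() can
	 * skip it and the huge page survives MADV_DONTNEED.
	 */
	madvise(p, HPAGE_SIZE, MADV_DONTNEED);

	for (size_t i = 0; i < HPAGE_SIZE; i++)
		assert(p[i] == 0);	/* must read back as zero-fill */

	return 0;
}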