www.infradead.org Git - users/jedix/linux-maple.git/blob

1 // SPDX-License-Identifier: GPL-2.0-only

2 /*

3 * linux/mm/oom_kill.c

4 *

6 * Thanks go out to Claus Fischer for some serious inspiration and

7 * for goading me into coding this file...

9 * Rewritten by David Rientjes

10 *

11 * The routines in this file are used to kill a process when

12 * we're seriously out of memory. This gets called from __alloc_pages()

13 * in mm/page_alloc.c when we really run out of memory.

14 *

15 * Since we won't call these routines often (on a well-configured

16 * machine) this file will double as a 'coding guide' and a signpost

17 * for newbie kernel hackers. It features several pointers to major

18 * kernel subsystems and hints as to where to find out what things do.

19 */

21 #include <linux/oom.h>

22 #include <linux/mm.h>

23 #include <linux/err.h>

24 #include <linux/gfp.h>

25 #include <linux/sched.h>

26 #include <linux/sched/mm.h>

27 #include <linux/sched/coredump.h>

28 #include <linux/sched/task.h>

29 #include <linux/sched/debug.h>

30 #include <linux/swap.h>

31 #include <linux/timex.h>

32 #include <linux/jiffies.h>

33 #include <linux/cpuset.h>

34 #include <linux/export.h>

35 #include <linux/notifier.h>

36 #include <linux/memcontrol.h>

37 #include <linux/mempolicy.h>

38 #include <linux/security.h>

39 #include <linux/ptrace.h>

40 #include <linux/freezer.h>

41 #include <linux/ftrace.h>

42 #include <linux/ratelimit.h>

43 #include <linux/kthread.h>

44 #include <linux/init.h>

45 #include <linux/mmu_notifier.h>

47 #include <asm/tlb.h>

48 #include "internal.h"

49 #include "slab.h"

51 #define CREATE_TRACE_POINTS

52 #include <trace/events/oom.h>

/* Consulted by check_panic_on_oom(): 0 never panics, 2 always panics. */
int sysctl_panic_on_oom;
/* When set, out_of_memory() tries to kill the allocating task first. */
int sysctl_oom_kill_allocating_task;
/* When set, dump_header() also prints the per-task table. */
int sysctl_oom_dump_tasks = 1;

/*
 * All oom killer invocations (out_of_memory()) are serialized by this lock,
 * regardless of context, so that concurrent invocations from different
 * domains do not kill more than necessary.
 *
 * oom_killer_disable() also depends on it to stabilize oom_killer_disabled
 * and mark_oom_victim.
 */
DEFINE_MUTEX(oom_lock);
/* Serializes oom_score_adj and oom_score_adj_min updates */
DEFINE_MUTEX(oom_adj_mutex);

70 static inline bool is_memcg_oom(struct oom_control *oc)

71 {

72 return oc->memcg != NULL;

73 }

75 #ifdef CONFIG_NUMA

76 /**

77 * oom_cpuset_eligible() - check task eligibility for kill

78 * @start: task struct of which task to consider

79 * @oc: pointer to struct oom_control

80 *

81 * Task eligibility is determined by whether or not a candidate task, @tsk,

82 * shares the same mempolicy nodes as current if it is bound by such a policy

83 * and whether or not it has the same set of allowed cpuset nodes.

84 *

85 * This function is assuming oom-killer context and 'current' has triggered

86 * the oom-killer.

87 */

88 static bool oom_cpuset_eligible(struct task_struct *start,

89 struct oom_control *oc)

90 {

91 struct task_struct *tsk;

92 bool ret = false;

93 const nodemask_t *mask = oc->nodemask;

95 if (is_memcg_oom(oc))

96 return true;

98 rcu_read_lock();

99 for_each_thread(start, tsk) {

100 if (mask) {

101 /*

102 * If this is a mempolicy constrained oom, tsk's

103 * cpuset is irrelevant. Only return true if its

104 * mempolicy intersects current, otherwise it may be

105 * needlessly killed.

106 */

107 ret = mempolicy_in_oom_domain(tsk, mask);

108 } else {

109 /*

110 * This is not a mempolicy constrained oom, so only

111 * check the mems of tsk's cpuset.

112 */

113 ret = cpuset_mems_allowed_intersects(current, tsk);

114 }

115 if (ret)

116 break;

117 }

118 rcu_read_unlock();

119

120 return ret;

121 }

122 #else

123 static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)

124 {

125 return true;

126 }

127 #endif /* CONFIG_NUMA */

128

129 /*

130 * The process p may have detached its own ->mm while exiting or through

131 * kthread_use_mm(), but one or more of its subthreads may still have a valid

132 * pointer. Return p, or any of its subthreads with a valid ->mm, with

133 * task_lock() held.

134 */

135 struct task_struct *find_lock_task_mm(struct task_struct *p)

136 {

137 struct task_struct *t;

138

139 rcu_read_lock();

140

141 for_each_thread(p, t) {

142 task_lock(t);

143 if (likely(t->mm))

144 goto found;

145 task_unlock(t);

146 }

147 t = NULL;

148 found:

149 rcu_read_unlock();

150

151 return t;

152 }

153

154 /*

155 * order == -1 means the oom kill is required by sysrq, otherwise only

156 * for display purposes.

157 */

158 static inline bool is_sysrq_oom(struct oom_control *oc)

159 {

160 return oc->order == -1;

161 }

162

163 /* return true if the task is not adequate as candidate victim task. */

164 static bool oom_unkillable_task(struct task_struct *p)

165 {

166 if (is_global_init(p))

167 return true;

168 if (p->flags & PF_KTHREAD)

169 return true;

170 return false;

171 }

172

173 /*

174 * Check whether unreclaimable slab amount is greater than

175 * all user memory(LRU pages).

176 * dump_unreclaimable_slab() could help in the case that

177 * oom due to too much unreclaimable slab used by kernel.

178 */

179 static bool should_dump_unreclaim_slab(void)

180 {

181 unsigned long nr_lru;

182

183 nr_lru = global_node_page_state(NR_ACTIVE_ANON) +

184 global_node_page_state(NR_INACTIVE_ANON) +

185 global_node_page_state(NR_ACTIVE_FILE) +

186 global_node_page_state(NR_INACTIVE_FILE) +

187 global_node_page_state(NR_ISOLATED_ANON) +

188 global_node_page_state(NR_ISOLATED_FILE) +

189 global_node_page_state(NR_UNEVICTABLE);

190

191 return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);

192 }

193

194 /**

195 * oom_badness - heuristic function to determine which candidate task to kill

196 * @p: task struct of which task we should calculate

197 * @totalpages: total present RAM allowed for page allocation

198 *

199 * The heuristic for determining which task to kill is made to be as simple and

200 * predictable as possible. The goal is to return the highest value for the

201 * task consuming the most memory to avoid subsequent oom failures.

202 */

203 long oom_badness(struct task_struct *p, unsigned long totalpages)

204 {

205 long points;

206 long adj;

207

208 if (oom_unkillable_task(p))

209 return LONG_MIN;

210

211 p = find_lock_task_mm(p);

212 if (!p)

213 return LONG_MIN;

214

215 /*

216 * Do not even consider tasks which are explicitly marked oom

217 * unkillable or have been already oom reaped or the are in

218 * the middle of vfork

219 */

220 adj = (long)p->signal->oom_score_adj;

221 if (adj == OOM_SCORE_ADJ_MIN ||

222 test_bit(MMF_OOM_SKIP, &p->mm->flags) ||

223 in_vfork(p)) {

224 task_unlock(p);

225 return LONG_MIN;

226 }

227

228 /*

229 * The baseline for the badness score is the proportion of RAM that each

230 * task's rss, pagetable and swap space use.

231 */

232 points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +

233 mm_pgtables_bytes(p->mm) / PAGE_SIZE;

234 task_unlock(p);

235

236 /* Normalize to oom_score_adj units */

237 adj *= totalpages / 1000;

238 points += adj;

239

240 return points;

241 }

242

243 static const char * const oom_constraint_text[] = {

244 [CONSTRAINT_NONE] = "CONSTRAINT_NONE",

245 [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",

246 [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",

247 [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",

248 };

249

250 /*

251 * Determine the type of allocation constraint.

252 */

253 static enum oom_constraint constrained_alloc(struct oom_control *oc)

254 {

255 struct zone *zone;

256 struct zoneref *z;

257 enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);

258 bool cpuset_limited = false;

259 int nid;

260

261 if (is_memcg_oom(oc)) {

262 oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;

263 return CONSTRAINT_MEMCG;

264 }

265

266 /* Default to all available memory */

267 oc->totalpages = totalram_pages() + total_swap_pages;

268

269 if (!IS_ENABLED(CONFIG_NUMA))

270 return CONSTRAINT_NONE;

271

272 if (!oc->zonelist)

273 return CONSTRAINT_NONE;

274 /*

275 * Reach here only when __GFP_NOFAIL is used. So, we should avoid

276 * to kill current.We have to random task kill in this case.

277 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.

278 */

279 if (oc->gfp_mask & __GFP_THISNODE)

280 return CONSTRAINT_NONE;

281

282 /*

283 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in

284 * the page allocator means a mempolicy is in effect. Cpuset policy

285 * is enforced in get_page_from_freelist().

286 */

287 if (oc->nodemask &&

288 !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {

289 oc->totalpages = total_swap_pages;

290 for_each_node_mask(nid, *oc->nodemask)

291 oc->totalpages += node_present_pages(nid);

292 return CONSTRAINT_MEMORY_POLICY;

293 }

294

295 /* Check this allocation failure is caused by cpuset's wall function */

296 for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,

297 highest_zoneidx, oc->nodemask)

298 if (!cpuset_zone_allowed(zone, oc->gfp_mask))

299 cpuset_limited = true;

300

301 if (cpuset_limited) {

302 oc->totalpages = total_swap_pages;

303 for_each_node_mask(nid, cpuset_current_mems_allowed)

304 oc->totalpages += node_present_pages(nid);

305 return CONSTRAINT_CPUSET;

306 }

307 return CONSTRAINT_NONE;

308 }

309

310 static int oom_evaluate_task(struct task_struct *task, void *arg)

311 {

312 struct oom_control *oc = arg;

313 long points;

314

315 if (oom_unkillable_task(task))

316 goto next;

317

318 /* p may not have freeable memory in nodemask */

319 if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))

320 goto next;

321

322 /*

323 * This task already has access to memory reserves and is being killed.

324 * Don't allow any other task to have access to the reserves unless

325 * the task has MMF_OOM_SKIP because chances that it would release

326 * any memory is quite low.

327 */

328 if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {

329 if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))

330 goto next;

331 goto abort;

332 }

333

334 /*

335 * If task is allocating a lot of memory and has been marked to be

336 * killed first if it triggers an oom, then select it.

337 */

338 if (oom_task_origin(task)) {

339 points = LONG_MAX;

340 goto select;

341 }

342

343 points = oom_badness(task, oc->totalpages);

344 if (points == LONG_MIN || points < oc->chosen_points)

345 goto next;

346

347 select:

348 if (oc->chosen)

349 put_task_struct(oc->chosen);

350 get_task_struct(task);

351 oc->chosen = task;

352 oc->chosen_points = points;

353 next:

354 return 0;

355 abort:

356 if (oc->chosen)

357 put_task_struct(oc->chosen);

358 oc->chosen = (void *)-1UL;

359 return 1;

360 }

361

362 /*

363 * Simple selection loop. We choose the process with the highest number of

364 * 'points'. In case scan was aborted, oc->chosen is set to -1.

365 */

366 static void select_bad_process(struct oom_control *oc)

367 {

368 oc->chosen_points = LONG_MIN;

369

370 if (is_memcg_oom(oc))

371 mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);

372 else {

373 struct task_struct *p;

374

375 rcu_read_lock();

376 for_each_process(p)

377 if (oom_evaluate_task(p, oc))

378 break;

379 rcu_read_unlock();

380 }

381 }

382

383 static int dump_task(struct task_struct *p, void *arg)

384 {

385 struct oom_control *oc = arg;

386 struct task_struct *task;

387

388 if (oom_unkillable_task(p))

389 return 0;

390

391 /* p may not have freeable memory in nodemask */

392 if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))

393 return 0;

394

395 task = find_lock_task_mm(p);

396 if (!task) {

397 /*

398 * All of p's threads have already detached their mm's. There's

399 * no need to report them; they can't be oom killed anyway.

400 */

401 return 0;

402 }

403

404 pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",

405 task->pid, from_kuid(&init_user_ns, task_uid(task)),

406 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),

407 mm_pgtables_bytes(task->mm),

408 get_mm_counter(task->mm, MM_SWAPENTS),

409 task->signal->oom_score_adj, task->comm);

410 task_unlock(task);

411

412 return 0;

413 }

414

415 /**

416 * dump_tasks - dump current memory state of all system tasks

417 * @oc: pointer to struct oom_control

418 *

419 * Dumps the current memory state of all eligible tasks. Tasks not in the same

420 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes

421 * are not shown.

422 * State information includes task's pid, uid, tgid, vm size, rss,

423 * pgtables_bytes, swapents, oom_score_adj value, and name.

424 */

425 static void dump_tasks(struct oom_control *oc)

426 {

427 pr_info("Tasks state (memory values in pages):\n");

428 pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");

429

430 if (is_memcg_oom(oc))

431 mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);

432 else {

433 struct task_struct *p;

434

435 rcu_read_lock();

436 for_each_process(p)

437 dump_task(p, oc);

438 rcu_read_unlock();

439 }

440 }

441

442 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)

443 {

444 /* one line summary of the oom killer context. */

445 pr_info("oom-kill:constraint=%s,nodemask=%*pbl",

446 oom_constraint_text[oc->constraint],

447 nodemask_pr_args(oc->nodemask));

448 cpuset_print_current_mems_allowed();

449 mem_cgroup_print_oom_context(oc->memcg, victim);

450 pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,

451 from_kuid(&init_user_ns, task_uid(victim)));

452 }

453

454 static void dump_header(struct oom_control *oc, struct task_struct *p)

455 {

456 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",

457 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,

458 current->signal->oom_score_adj);

459 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)

460 pr_warn("COMPACTION is disabled!!!\n");

461

462 dump_stack();

463 if (is_memcg_oom(oc))

464 mem_cgroup_print_oom_meminfo(oc->memcg);

465 else {

466 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);

467 if (should_dump_unreclaim_slab())

468 dump_unreclaimable_slab();

469 }

470 if (sysctl_oom_dump_tasks)

471 dump_tasks(oc);

472 if (p)

473 dump_oom_summary(oc, p);

474 }

475

476 /*

477 * Number of OOM victims in flight

478 */

479 static atomic_t oom_victims = ATOMIC_INIT(0);

480 static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);

481

482 static bool oom_killer_disabled __read_mostly;

483

484 #define K(x) ((x) << (PAGE_SHIFT-10))

485

486 /*

487 * task->mm can be NULL if the task is the exited group leader. So to

488 * determine whether the task is using a particular mm, we examine all the

489 * task's threads: if one of those is using this mm then this task was also

490 * using it.

491 */

492 bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)

493 {

494 struct task_struct *t;

495

496 for_each_thread(p, t) {

497 struct mm_struct *t_mm = READ_ONCE(t->mm);

498 if (t_mm)

499 return t_mm == mm;

500 }

501 return false;

502 }

503

504 #ifdef CONFIG_MMU

/*
 * OOM Reaper kernel thread: it tries to reap the memory used by the OOM
 * victim (if that is possible) so that the OOM killer can move on.
 */
static struct task_struct *oom_reaper_th;
/* The reaper thread sleeps here until oom_reaper_list is non-empty. */
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
/* Singly linked list of queued victims, chained via ->oom_reaper_list. */
static struct task_struct *oom_reaper_list;
/* Protects oom_reaper_list. */
static DEFINE_SPINLOCK(oom_reaper_lock);

513

514 bool __oom_reap_task_mm(struct mm_struct *mm)

515 {

516 struct vm_area_struct *vma;

517 bool ret = true;

518 MA_STATE(mas, &mm->mm_mt, 0, 0);

519

520 /*

521 * Tell all users of get_user/copy_from_user etc... that the content

522 * is no longer stable. No barriers really needed because unmapping

523 * should imply barriers already and the reader would hit a page fault

524 * if it stumbled over a reaped memory.

525 */

526 set_bit(MMF_UNSTABLE, &mm->flags);

527

528 rcu_read_lock();

529 mas_for_each(&mas, vma, ULONG_MAX) {

530 if (!can_madv_lru_vma(vma))

531 continue;

532

533 /*

534 * Only anonymous pages have a good chance to be dropped

535 * without additional steps which we cannot afford as we

536 * are OOM already.

537 *

538 * We do not even care about fs backed pages because all

539 * which are reclaimable have already been reclaimed and

540 * we do not want to block exit_mmap by keeping mm ref

541 * count elevated without a good reason.

542 */

543 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {

544 struct mmu_notifier_range range;

545 struct mmu_gather tlb;

546

547 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,

548 vma, mm, vma->vm_start,

549 vma->vm_end);

550 tlb_gather_mmu(&tlb, mm);

551 if (mmu_notifier_invalidate_range_start_nonblock(&range)) {

552 tlb_finish_mmu(&tlb);

553 ret = false;

554 continue;

555 }

556 unmap_page_range(&tlb, vma, range.start, range.end, NULL);

557 mmu_notifier_invalidate_range_end(&range);

558 tlb_finish_mmu(&tlb);

559 }

560 }

561 rcu_read_unlock();

562

563 return ret;

564 }

565

566 /*

567 * Reaps the address space of the give task.

568 *

569 * Returns true on success and false if none or part of the address space

570 * has been reclaimed and the caller should retry later.

571 */

572 static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)

573 {

574 bool ret = true;

575

576 if (!mmap_read_trylock(mm)) {

577 trace_skip_task_reaping(tsk->pid);

578 return false;

579 }

580

581 /*

582 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't

583 * work on the mm anymore. The check for MMF_OOM_SKIP must run

584 * under mmap_lock for reading because it serializes against the

585 * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().

586 */

587 if (test_bit(MMF_OOM_SKIP, &mm->flags)) {

588 trace_skip_task_reaping(tsk->pid);

589 goto out_unlock;

590 }

591

592 trace_start_task_reaping(tsk->pid);

593

594 /* failed to reap part of the address space. Try again later */

595 ret = __oom_reap_task_mm(mm);

596 if (!ret)

597 goto out_finish;

598

599 pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",

600 task_pid_nr(tsk), tsk->comm,

601 K(get_mm_counter(mm, MM_ANONPAGES)),

602 K(get_mm_counter(mm, MM_FILEPAGES)),

603 K(get_mm_counter(mm, MM_SHMEMPAGES)));

604 out_finish:

605 trace_finish_task_reaping(tsk->pid);

606 out_unlock:

607 mmap_read_unlock(mm);

608

609 return ret;

610 }

611

612 #define MAX_OOM_REAP_RETRIES 10

613 static void oom_reap_task(struct task_struct *tsk)

614 {

615 int attempts = 0;

616 struct mm_struct *mm = tsk->signal->oom_mm;

617

618 /* Retry the mmap_read_trylock(mm) a few times */

619 while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))

620 schedule_timeout_idle(HZ/10);

621

622 if (attempts <= MAX_OOM_REAP_RETRIES ||

623 test_bit(MMF_OOM_SKIP, &mm->flags))

624 goto done;

625

626 pr_info("oom_reaper: unable to reap pid:%d (%s)\n",

627 task_pid_nr(tsk), tsk->comm);

628 sched_show_task(tsk);

629 debug_show_all_locks();

630

631 done:

632 tsk->oom_reaper_list = NULL;

633

634 /*

635 * Hide this mm from OOM killer because it has been either reaped or

636 * somebody can't call mmap_write_unlock(mm).

637 */

638 set_bit(MMF_OOM_SKIP, &mm->flags);

639

640 /* Drop a reference taken by wake_oom_reaper */

641 put_task_struct(tsk);

642 }

643

644 static int oom_reaper(void *unused)

645 {

646 while (true) {

647 struct task_struct *tsk = NULL;

648

649 wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);

650 spin_lock(&oom_reaper_lock);

651 if (oom_reaper_list != NULL) {

652 tsk = oom_reaper_list;

653 oom_reaper_list = tsk->oom_reaper_list;

654 }

655 spin_unlock(&oom_reaper_lock);

656

657 if (tsk)

658 oom_reap_task(tsk);

659 }

660

661 return 0;

662 }

663

664 static void wake_oom_reaper(struct task_struct *tsk)

665 {

666 /* mm is already queued? */

667 if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))

668 return;

669

670 get_task_struct(tsk);

671

672 spin_lock(&oom_reaper_lock);

673 tsk->oom_reaper_list = oom_reaper_list;

674 oom_reaper_list = tsk;

675 spin_unlock(&oom_reaper_lock);

676 trace_wake_reaper(tsk->pid);

677 wake_up(&oom_reaper_wait);

678 }

679

680 static int __init oom_init(void)

681 {

682 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");

683 return 0;

684 }

685 subsys_initcall(oom_init)

686 #else

/* !CONFIG_MMU variant: there is no oom reaper, so this does nothing. */
static inline void wake_oom_reaper(struct task_struct *tsk)
{
}

690 #endif /* CONFIG_MMU */

691

692 /**

693 * mark_oom_victim - mark the given task as OOM victim

694 * @tsk: task to mark

695 *

696 * Has to be called with oom_lock held and never after

697 * oom has been disabled already.

698 *

699 * tsk->mm has to be non NULL and caller has to guarantee it is stable (either

700 * under task_lock or operate on the current).

701 */

702 static void mark_oom_victim(struct task_struct *tsk)

703 {

704 struct mm_struct *mm = tsk->mm;

705

706 WARN_ON(oom_killer_disabled);

707 /* OOM killer might race with memcg OOM */

708 if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))

709 return;

710

711 /* oom_mm is bound to the signal struct life time. */

712 if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {

713 mmgrab(tsk->signal->oom_mm);

714 set_bit(MMF_OOM_VICTIM, &mm->flags);

715 }

716

717 /*

718 * Make sure that the task is woken up from uninterruptible sleep

719 * if it is frozen because OOM killer wouldn't be able to free

720 * any memory and livelock. freezing_slow_path will tell the freezer

721 * that TIF_MEMDIE tasks should be ignored.

722 */

723 __thaw_task(tsk);

724 atomic_inc(&oom_victims);

725 trace_mark_victim(tsk->pid);

726 }

727

728 /**

729 * exit_oom_victim - note the exit of an OOM victim

730 */

731 void exit_oom_victim(void)

732 {

733 clear_thread_flag(TIF_MEMDIE);

734

735 if (!atomic_dec_return(&oom_victims))

736 wake_up_all(&oom_victims_wait);

737 }

738

739 /**

740 * oom_killer_enable - enable OOM killer

741 */

742 void oom_killer_enable(void)

743 {

744 oom_killer_disabled = false;

745 pr_info("OOM killer enabled.\n");

746 }

747

748 /**

749 * oom_killer_disable - disable OOM killer

750 * @timeout: maximum timeout to wait for oom victims in jiffies

751 *

752 * Forces all page allocations to fail rather than trigger OOM killer.

753 * Will block and wait until all OOM victims are killed or the given

754 * timeout expires.

755 *

756 * The function cannot be called when there are runnable user tasks because

757 * the userspace would see unexpected allocation failures as a result. Any

758 * new usage of this function should be consulted with MM people.

759 *

760 * Returns true if successful and false if the OOM killer cannot be

761 * disabled.

762 */

763 bool oom_killer_disable(signed long timeout)

764 {

765 signed long ret;

766

767 /*

768 * Make sure to not race with an ongoing OOM killer. Check that the

769 * current is not killed (possibly due to sharing the victim's memory).

770 */

771 if (mutex_lock_killable(&oom_lock))

772 return false;

773 oom_killer_disabled = true;

774 mutex_unlock(&oom_lock);

775

776 ret = wait_event_interruptible_timeout(oom_victims_wait,

777 !atomic_read(&oom_victims), timeout);

778 if (ret <= 0) {

779 oom_killer_enable();

780 return false;

781 }

782 pr_info("OOM killer disabled.\n");

783

784 return true;

785 }

786

787 static inline bool __task_will_free_mem(struct task_struct *task)

788 {

789 struct signal_struct *sig = task->signal;

790

791 /*

792 * A coredumping process may sleep for an extended period in exit_mm(),

793 * so the oom killer cannot assume that the process will promptly exit

794 * and release memory.

795 */

796 if (sig->flags & SIGNAL_GROUP_COREDUMP)

797 return false;

798

799 if (sig->flags & SIGNAL_GROUP_EXIT)

800 return true;

801

802 if (thread_group_empty(task) && (task->flags & PF_EXITING))

803 return true;

804

805 return false;

806 }

807

808 /*

809 * Checks whether the given task is dying or exiting and likely to

810 * release its address space. This means that all threads and processes

811 * sharing the same mm have to be killed or exiting.

812 * Caller has to make sure that task->mm is stable (hold task_lock or

813 * it operates on the current).

814 */

815 static bool task_will_free_mem(struct task_struct *task)

816 {

817 struct mm_struct *mm = task->mm;

818 struct task_struct *p;

819 bool ret = true;

820

821 /*

822 * Skip tasks without mm because it might have passed its exit_mm and

823 * exit_oom_victim. oom_reaper could have rescued that but do not rely

824 * on that for now. We can consider find_lock_task_mm in future.

825 */

826 if (!mm)

827 return false;

828

829 if (!__task_will_free_mem(task))

830 return false;

831

832 /*

833 * This task has already been drained by the oom reaper so there are

834 * only small chances it will free some more

835 */

836 if (test_bit(MMF_OOM_SKIP, &mm->flags))

837 return false;

838

839 if (atomic_read(&mm->mm_users) <= 1)

840 return true;

841

842 /*

843 * Make sure that all tasks which share the mm with the given tasks

844 * are dying as well to make sure that a) nobody pins its mm and

845 * b) the task is also reapable by the oom reaper.

846 */

847 rcu_read_lock();

848 for_each_process(p) {

849 if (!process_shares_mm(p, mm))

850 continue;

851 if (same_thread_group(task, p))

852 continue;

853 ret = __task_will_free_mem(p);

854 if (!ret)

855 break;

856 }

857 rcu_read_unlock();

858

859 return ret;

860 }

861

862 static void __oom_kill_process(struct task_struct *victim, const char *message)

863 {

864 struct task_struct *p;

865 struct mm_struct *mm;

866 bool can_oom_reap = true;

867

868 p = find_lock_task_mm(victim);

869 if (!p) {

870 pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",

871 message, task_pid_nr(victim), victim->comm);

872 put_task_struct(victim);

873 return;

874 } else if (victim != p) {

875 get_task_struct(p);

876 put_task_struct(victim);

877 victim = p;

878 }

879

880 /* Get a reference to safely compare mm after task_unlock(victim) */

881 mm = victim->mm;

882 mmgrab(mm);

883

884 /* Raise event before sending signal: task reaper must see this */

885 count_vm_event(OOM_KILL);

886 memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

887

888 /*

889 * We should send SIGKILL before granting access to memory reserves

890 * in order to prevent the OOM victim from depleting the memory

891 * reserves from the user space under its control.

892 */

893 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);

894 mark_oom_victim(victim);

895 pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",

896 message, task_pid_nr(victim), victim->comm, K(mm->total_vm),

897 K(get_mm_counter(mm, MM_ANONPAGES)),

898 K(get_mm_counter(mm, MM_FILEPAGES)),

899 K(get_mm_counter(mm, MM_SHMEMPAGES)),

900 from_kuid(&init_user_ns, task_uid(victim)),

901 mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);

902 task_unlock(victim);

903

904 /*

905 * Kill all user processes sharing victim->mm in other thread groups, if

906 * any. They don't get access to memory reserves, though, to avoid

907 * depletion of all memory. This prevents mm->mmap_lock livelock when an

908 * oom killed thread cannot exit because it requires the semaphore and

909 * its contended by another thread trying to allocate memory itself.

910 * That thread will now get access to memory reserves since it has a

911 * pending fatal signal.

912 */

913 rcu_read_lock();

914 for_each_process(p) {

915 if (!process_shares_mm(p, mm))

916 continue;

917 if (same_thread_group(p, victim))

918 continue;

919 if (is_global_init(p)) {

920 can_oom_reap = false;

921 set_bit(MMF_OOM_SKIP, &mm->flags);

922 pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",

923 task_pid_nr(victim), victim->comm,

924 task_pid_nr(p), p->comm);

925 continue;

926 }

927 /*

928 * No kthread_use_mm() user needs to read from the userspace so

929 * we are ok to reap it.

930 */

931 if (unlikely(p->flags & PF_KTHREAD))

932 continue;

933 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);

934 }

935 rcu_read_unlock();

936

937 if (can_oom_reap)

938 wake_oom_reaper(victim);

939

940 mmdrop(mm);

941 put_task_struct(victim);

942 }

943 #undef K

944

945 /*

946 * Kill provided task unless it's secured by setting

947 * oom_score_adj to OOM_SCORE_ADJ_MIN.

948 */

949 static int oom_kill_memcg_member(struct task_struct *task, void *message)

950 {

951 if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&

952 !is_global_init(task)) {

953 get_task_struct(task);

954 __oom_kill_process(task, message);

955 }

956 return 0;

957 }

958

959 static void oom_kill_process(struct oom_control *oc, const char *message)

960 {

961 struct task_struct *victim = oc->chosen;

962 struct mem_cgroup *oom_group;

963 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,

964 DEFAULT_RATELIMIT_BURST);

965

966 /*

967 * If the task is already exiting, don't alarm the sysadmin or kill

968 * its children or threads, just give it access to memory reserves

969 * so it can die quickly

970 */

971 task_lock(victim);

972 if (task_will_free_mem(victim)) {

973 mark_oom_victim(victim);

974 wake_oom_reaper(victim);

975 task_unlock(victim);

976 put_task_struct(victim);

977 return;

978 }

979 task_unlock(victim);

980

981 if (__ratelimit(&oom_rs))

982 dump_header(oc, victim);

983

984 /*

985 * Do we need to kill the entire memory cgroup?

986 * Or even one of the ancestor memory cgroups?

987 * Check this out before killing the victim task.

988 */

989 oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

990

991 __oom_kill_process(victim, message);

992

993 /*

994 * If necessary, kill all tasks in the selected memory cgroup.

995 */

996 if (oom_group) {

997 mem_cgroup_print_oom_group(oom_group);

998 mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,

999 (void *)message);

1000 mem_cgroup_put(oom_group);

1001 }

1002 }

1003

1004 /*

1005 * Determines whether the kernel must panic because of the panic_on_oom sysctl.

1006 */

1007 static void check_panic_on_oom(struct oom_control *oc)

1008 {

1009 if (likely(!sysctl_panic_on_oom))

1010 return;

1011 if (sysctl_panic_on_oom != 2) {

1012 /*

1013 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel

1014 * does not panic for cpuset, mempolicy, or memcg allocation

1015 * failures.

1016 */

1017 if (oc->constraint != CONSTRAINT_NONE)

1018 return;

1019 }

1020 /* Do not panic for oom kills triggered by sysrq */

1021 if (is_sysrq_oom(oc))

1022 return;

1023 dump_header(oc, NULL);

1024 panic("Out of memory: %s panic_on_oom is enabled\n",

1025 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");

1026 }

1027

1028 static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

1029

1030 int register_oom_notifier(struct notifier_block *nb)

1031 {

1032 return blocking_notifier_chain_register(&oom_notify_list, nb);

1033 }

1034 EXPORT_SYMBOL_GPL(register_oom_notifier);

1035

1036 int unregister_oom_notifier(struct notifier_block *nb)

1037 {

1038 return blocking_notifier_chain_unregister(&oom_notify_list, nb);

1039 }

1040 EXPORT_SYMBOL_GPL(unregister_oom_notifier);

1041

1042 /**

1043 * out_of_memory - kill the "best" process when we run out of memory

1044 * @oc: pointer to struct oom_control

1045 *

1046 * If we run out of memory, we have the choice between either

1047 * killing a random task (bad), letting the system crash (worse)

1048 * OR try to be smart about which process to kill. Note that we

1049 * don't have to be perfect here, we just have to be good.

1050 */

1051 bool out_of_memory(struct oom_control *oc)

1052 {

1053 unsigned long freed = 0;

1054

1055 if (oom_killer_disabled)

1056 return false;

1057

1058 if (!is_memcg_oom(oc)) {

1059 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);

1060 if (freed > 0)

1061 /* Got some memory back in the last second. */

1062 return true;

1063 }

1064

1065 /*

1066 * If current has a pending SIGKILL or is exiting, then automatically

1067 * select it. The goal is to allow it to allocate so that it may

1068 * quickly exit and free its memory.

1069 */

1070 if (task_will_free_mem(current)) {

1071 mark_oom_victim(current);

1072 wake_oom_reaper(current);

1073 return true;

1074 }

1075

1076 /*

1077 * The OOM killer does not compensate for IO-less reclaim.

1078 * pagefault_out_of_memory lost its gfp context so we have to

1079 * make sure exclude 0 mask - all other users should have at least

1080 * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to

1081 * invoke the OOM killer even if it is a GFP_NOFS allocation.

1082 */

1083 if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))

1084 return true;

1085

1086 /*

1087 * Check if there were limitations on the allocation (only relevant for

1088 * NUMA and memcg) that may require different handling.

1089 */

1090 oc->constraint = constrained_alloc(oc);

1091 if (oc->constraint != CONSTRAINT_MEMORY_POLICY)

1092 oc->nodemask = NULL;

1093 check_panic_on_oom(oc);

1094

1095 if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&

1096 current->mm && !oom_unkillable_task(current) &&

1097 oom_cpuset_eligible(current, oc) &&

1098 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {

1099 get_task_struct(current);

1100 oc->chosen = current;

1101 oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");

1102 return true;

1103 }

1104

1105 select_bad_process(oc);

1106 /* Found nothing?!?! */

1107 if (!oc->chosen) {

1108 dump_header(oc, NULL);

1109 pr_warn("Out of memory and no killable processes...\n");

1110 /*

1111 * If we got here due to an actual allocation at the

1112 * system level, we cannot survive this and will enter

1113 * an endless loop in the allocator. Bail out now.

1114 */

1115 if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))

1116 panic("System is deadlocked on memory\n");

1117 }

1118 if (oc->chosen && oc->chosen != (void *)-1UL)

1119 oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :

1120 "Memory cgroup out of memory");

1121 return !!oc->chosen;

1122 }

1123

1124 /*

1125 * The pagefault handler calls here because it is out of memory, so kill a

1126 * memory-hogging task. If oom_lock is held by somebody else, a parallel oom

1127 * killing is already in progress so do nothing.

1128 */

1129 void pagefault_out_of_memory(void)

1130 {

1131 struct oom_control oc = {

1132 .zonelist = NULL,

1133 .nodemask = NULL,

1134 .memcg = NULL,

1135 .gfp_mask = 0,

1136 .order = 0,

1137 };

1138

1139 if (mem_cgroup_oom_synchronize(true))

1140 return;

1141

1142 if (!mutex_trylock(&oom_lock))

1143 return;

1144 out_of_memory(&oc);

1145 mutex_unlock(&oom_lock);

1146 }