1 /*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "lowmemorykiller"
18
19 #include <dirent.h>
20 #include <errno.h>
21 #include <inttypes.h>
22 #include <pwd.h>
23 #include <sched.h>
24 #include <signal.h>
25 #include <stdbool.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <sys/cdefs.h>
29 #include <sys/epoll.h>
30 #include <sys/eventfd.h>
31 #include <sys/mman.h>
32 #include <sys/pidfd.h>
33 #include <sys/resource.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/time.h>
38 #include <sys/types.h>
39 #include <time.h>
40 #include <unistd.h>
41
42 #include <cutils/properties.h>
43 #include <cutils/sched_policy.h>
44 #include <cutils/sockets.h>
45 #include <liblmkd_utils.h>
46 #include <lmkd.h>
47 #include <log/log.h>
48 #include <log/log_event_list.h>
49 #include <log/log_time.h>
50 #include <private/android_filesystem_config.h>
51 #include <psi/psi.h>
52 #include <system/thread_defs.h>
53
54 #include "statslog.h"
55
56 #define BPF_FD_JUST_USE_INT
57 #include "BpfSyscallWrappers.h"
58
/*
 * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces
 * to profile and correlate with OOM kills
 *
 * Neither variant of TRACE_KILL_START()/TRACE_KILL_END() carries a trailing
 * semicolon; the call site supplies it. This keeps a call site expanding to
 * exactly one statement (safe inside an unbraced if/else) whether tracing is
 * compiled in or not.
 */
#ifdef LMKD_TRACE_KILLS

#define ATRACE_TAG ATRACE_TAG_ALWAYS
#include <cutils/trace.h>

#define TRACE_KILL_START(pid) ATRACE_INT(__FUNCTION__, pid)
#define TRACE_KILL_END() ATRACE_INT(__FUNCTION__, 0)

#else /* LMKD_TRACE_KILLS */

#define TRACE_KILL_START(pid) ((void)(pid))
#define TRACE_KILL_END() ((void)0)

#endif /* LMKD_TRACE_KILLS */
77
/* Fallback definition for toolchains whose headers do not provide __unused */
#ifndef __unused
#define __unused __attribute__((__unused__))
#endif

/* Paths of memcg and procfs files, and field tags found in /proc/<pid>/status */
#define MEMCG_SYSFS_PATH "/dev/memcg/"
#define MEMCG_MEMORY_USAGE "/dev/memcg/memory.usage_in_bytes"
#define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
#define ZONEINFO_PATH "/proc/zoneinfo"
#define MEMINFO_PATH "/proc/meminfo"
#define VMSTAT_PATH "/proc/vmstat"
#define PROC_STATUS_TGID_FIELD "Tgid:"
#define PROC_STATUS_RSS_FIELD "VmRSS:"
#define PROC_STATUS_SWAP_FIELD "VmSwap:"
/* Size of small line/path buffers used in this file */
#define LINE_MAX 128

#define PERCEPTIBLE_APP_ADJ 200

/* Android Logger event logtags (see event.logtags) */
#define KILLINFO_LOG_TAG 10195355

/* gid containing AID_SYSTEM required */
#define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
#define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"

/* Element count of a true array; gives a wrong result when x is a pointer */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
/* 8 * 1024 * 1024 */
#define EIGHT_MEGA (1 << 23)

#define TARGET_UPDATE_MIN_INTERVAL_MS 1000
#define THRASHING_RESET_INTERVAL_MS 1000

/*
 * Conversion factors derived from NS_PER_SEC, US_PER_SEC and MS_PER_SEC,
 * which are not defined in this part of the file.
 */
#define NS_PER_MS (NS_PER_SEC / MS_PER_SEC)
#define US_PER_MS (US_PER_SEC / MS_PER_SEC)

/* Defined as ProcessList.SYSTEM_ADJ in ProcessList.java */
#define SYSTEM_ADJ (-900)

/* Two-level expansion so that a macro argument is expanded before it is stringified */
#define STRINGIFY(x) STRINGIFY_INTERNAL(x)
#define STRINGIFY_INTERNAL(x) #x

/*
 * PSI monitor tracking window size.
 * PSI monitor generates events at most once per window,
 * therefore we poll memory state for the duration of
 * PSI_WINDOW_SIZE_MS after the event happens.
 */
#define PSI_WINDOW_SIZE_MS 1000
/* Polling period after PSI signal when pressure is high */
#define PSI_POLL_PERIOD_SHORT_MS 10
/* Polling period after PSI signal when pressure is low */
#define PSI_POLL_PERIOD_LONG_MS 100

/* NOTE: function-like macros; each argument may be evaluated more than once */
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))

#define FAIL_REPORT_RLIMIT_MS 1000

/*
 * System property defaults
 */
/* ro.lmk.swap_free_low_percentage property defaults */
#define DEF_LOW_SWAP 10
/* ro.lmk.thrashing_limit property defaults */
#define DEF_THRASHING_LOWRAM 30
#define DEF_THRASHING 100
/* ro.lmk.thrashing_limit_decay property defaults */
#define DEF_THRASHING_DECAY_LOWRAM 50
#define DEF_THRASHING_DECAY 10
/* ro.lmk.psi_partial_stall_ms property defaults */
#define DEF_PARTIAL_STALL_LOWRAM 200
#define DEF_PARTIAL_STALL 70
/* ro.lmk.psi_complete_stall_ms property defaults */
#define DEF_COMPLETE_STALL 700

/* Name of the system property (presumably used to request re-initialization; verify against its users) */
#define LMKD_REINIT_PROP "lmkd.reinit"
/* default to old in-kernel interface if no memory pressure events */
static bool use_inkernel_interface = true;
static bool has_inkernel_module;

/* memory pressure levels */
enum vmpressure_level {
    VMPRESS_LEVEL_LOW = 0,
    VMPRESS_LEVEL_MEDIUM,
    VMPRESS_LEVEL_CRITICAL,
    VMPRESS_LEVEL_COUNT
};

/* Printable names of the pressure levels, listed in enum vmpressure_level order */
static const char *level_name[] = {
    "low",
    "medium",
    "critical"
};

/* Both members start at -1 */
struct {
    int64_t min_nr_free_pages; /* recorded but not used yet */
    int64_t max_nr_free_pages;
} low_pressure_mem = { -1, -1 };

/* A PSI stall type paired with a threshold expressed in milliseconds */
struct psi_threshold {
    enum psi_stall_type stall_type;
    int threshold_ms;
};

/* Per-pressure-level state; the fds in mpevfd start out as -1 */
static int level_oomadj[VMPRESS_LEVEL_COUNT];
static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
static bool pidfd_supported;
/* -1 by default; holds either a pid or a pidfd, as the name says */
static int last_kill_pid_or_fd = -1;
static struct timespec last_kill_tm;

/* lmkd configurable parameters */
static bool debug_process_killing;
static bool enable_pressure_upgrade;
static int64_t upgrade_pressure;
static int64_t downgrade_pressure;
static bool low_ram_device;
static bool kill_heaviest_task;
static unsigned long kill_timeout_ms;
static bool use_minfree_levels;
static bool per_app_memcg;
static int swap_free_low_percentage;
static int psi_partial_stall_ms;
static int psi_complete_stall_ms;
static int thrashing_limit_pct;
static int thrashing_limit_decay_pct;
static int thrashing_critical_pct;
static int swap_util_max;
static int64_t filecache_min_kb;
static bool use_psi_monitors = false;
/* fd of the in-kernel lmk event file, set by init_poll_kernel() */
static int kpoll_fd;
/* Default PSI thresholds, one entry per pressure level */
static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
    { PSI_SOME, 70 },    /* 70ms out of 1sec for partial stall */
    { PSI_SOME, 100 },   /* 100ms out of 1sec for partial stall */
    { PSI_FULL, 70 },    /* 70ms out of 1sec for complete stall */
};

static android_log_context ctx;

/* Requested change of the polling state; stored in struct polling_params */
enum polling_update {
    POLLING_DO_NOT_CHANGE,
    POLLING_START,
    POLLING_PAUSE,
    POLLING_RESUME,
};
221
/*
 * Data used for periodic polling for the memory state of the device.
 * Note that when system is not polling poll_handler is set to NULL,
 * when polling starts poll_handler gets set and is reset back to
 * NULL when polling stops.
 */
struct polling_params {
    struct event_handler_info* poll_handler;
    struct event_handler_info* paused_handler;
    struct timespec poll_start_tm;
    struct timespec last_poll_tm;
    int polling_interval_ms;
    enum polling_update update;
};

/*
 * data required to handle events
 *
 * handler receives the stored data value, an event mask and the polling
 * parameters (NOTE(review): events is presumably the epoll event mask --
 * confirm against the dispatch loop).
 */
struct event_handler_info {
    int data;
    void (*handler)(int data, uint32_t events, struct polling_params *poll_params);
};

/* data required to handle socket events */
struct sock_event_handler_info {
    int sock;
    pid_t pid;                  /* PID of the peer, stored by ctrl_data_read() */
    uint32_t async_event_mask;  /* tested against 1 << LMK_ASYNC_EVENT_* bits */
    struct event_handler_info handler_info;
};
250
/* max supported number of data connections (AMS, init, tests) */
#define MAX_DATA_CONN 3

/* socket event handler data */
static struct sock_event_handler_info ctrl_sock;
static struct sock_event_handler_info data_sock[MAX_DATA_CONN];

/* vmpressure event handler data */
static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT];

/*
 * 1 ctrl listen socket, 3 ctrl data socket, 3 memory pressure levels,
 * 1 lmk events + 1 fd to wait for process death
 */
#define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1)
static int epollfd;
/* decremented by ctrl_data_close() when a data connection goes away */
static int maxevents;

/* OOM score values used by both kernel and framework */
#define OOM_SCORE_ADJ_MIN (-1000)
#define OOM_SCORE_ADJ_MAX 1000

/* Parallel tables sized by MAX_TARGETS, which is not defined in this part of the file */
static int lowmem_adj[MAX_TARGETS];
static int lowmem_minfree[MAX_TARGETS];
static int lowmem_targets_size;
276
/* Fields to parse in /proc/zoneinfo */
/* zoneinfo per-zone fields */
enum zoneinfo_zone_field {
    ZI_ZONE_NR_FREE_PAGES = 0,
    ZI_ZONE_MIN,
    ZI_ZONE_LOW,
    ZI_ZONE_HIGH,
    ZI_ZONE_PRESENT,
    ZI_ZONE_NR_FREE_CMA,
    ZI_ZONE_FIELD_COUNT
};

/* Field names, listed in enum zoneinfo_zone_field order */
static const char* const zoneinfo_zone_field_names[ZI_ZONE_FIELD_COUNT] = {
    "nr_free_pages",
    "min",
    "low",
    "high",
    "present",
    "nr_free_cma",
};

/* zoneinfo per-zone special fields */
enum zoneinfo_zone_spec_field {
    ZI_ZONE_SPEC_PROTECTION = 0,
    ZI_ZONE_SPEC_PAGESETS,
    ZI_ZONE_SPEC_FIELD_COUNT,
};

/* Field names, listed in enum zoneinfo_zone_spec_field order */
static const char* const zoneinfo_zone_spec_field_names[ZI_ZONE_SPEC_FIELD_COUNT] = {
    "protection:",
    "pagesets",
};

/* see __MAX_NR_ZONES definition in kernel mmzone.h */
#define MAX_NR_ZONES 6

/*
 * Per-zone values, accessible either by name (field) or by
 * enum zoneinfo_zone_field index (arr); the member order of field
 * follows the enum order.
 */
union zoneinfo_zone_fields {
    struct {
        int64_t nr_free_pages;
        int64_t min;
        int64_t low;
        int64_t high;
        int64_t present;
        int64_t nr_free_cma;
    } field;
    int64_t arr[ZI_ZONE_FIELD_COUNT];
};

struct zoneinfo_zone {
    union zoneinfo_zone_fields fields;
    int64_t protection[MAX_NR_ZONES];
    int64_t max_protection;
};

/* zoneinfo per-node fields */
enum zoneinfo_node_field {
    ZI_NODE_NR_INACTIVE_FILE = 0,
    ZI_NODE_NR_ACTIVE_FILE,
    ZI_NODE_FIELD_COUNT
};

/* Field names, listed in enum zoneinfo_node_field order */
static const char* const zoneinfo_node_field_names[ZI_NODE_FIELD_COUNT] = {
    "nr_inactive_file",
    "nr_active_file",
};

/* Per-node values, accessible by name (field) or by enum zoneinfo_node_field index (arr) */
union zoneinfo_node_fields {
    struct {
        int64_t nr_inactive_file;
        int64_t nr_active_file;
    } field;
    int64_t arr[ZI_NODE_FIELD_COUNT];
};

struct zoneinfo_node {
    int id;
    int zone_count;
    struct zoneinfo_zone zones[MAX_NR_ZONES];
    union zoneinfo_node_fields fields;
};

/* for now two memory nodes is more than enough */
#define MAX_NR_NODES 2

/* Parsed /proc/zoneinfo content: up to MAX_NR_NODES nodes plus totals */
struct zoneinfo {
    int node_count;
    struct zoneinfo_node nodes[MAX_NR_NODES];
    int64_t totalreserve_pages;
    int64_t total_inactive_file;
    int64_t total_active_file;
};
368
/* Fields to parse in /proc/meminfo */
enum meminfo_field {
    MI_NR_FREE_PAGES = 0,
    MI_CACHED,
    MI_SWAP_CACHED,
    MI_BUFFERS,
    MI_SHMEM,
    MI_UNEVICTABLE,
    MI_TOTAL_SWAP,
    MI_FREE_SWAP,
    MI_ACTIVE_ANON,
    MI_INACTIVE_ANON,
    MI_ACTIVE_FILE,
    MI_INACTIVE_FILE,
    MI_SRECLAIMABLE,
    MI_SUNRECLAIM,
    MI_KERNEL_STACK,
    MI_PAGE_TABLES,
    MI_ION_HELP,
    MI_ION_HELP_POOL,
    MI_CMA_FREE,
    MI_FIELD_COUNT
};

/* /proc/meminfo field tags, listed in enum meminfo_field order */
static const char* const meminfo_field_names[MI_FIELD_COUNT] = {
    "MemFree:",
    "Cached:",
    "SwapCached:",
    "Buffers:",
    "Shmem:",
    "Unevictable:",
    "SwapTotal:",
    "SwapFree:",
    "Active(anon):",
    "Inactive(anon):",
    "Active(file):",
    "Inactive(file):",
    "SReclaimable:",
    "SUnreclaim:",
    "KernelStack:",
    "PageTables:",
    "ION_heap:",
    "ION_heap_pool:",
    "CmaFree:",
};

/*
 * Values accessible by name (field) or by enum meminfo_field index (arr).
 * arr has MI_FIELD_COUNT entries, so it covers only the members that are
 * read from the file; the two calculated members at the end of field are
 * reachable by name only.
 */
union meminfo {
    struct {
        int64_t nr_free_pages;
        int64_t cached;
        int64_t swap_cached;
        int64_t buffers;
        int64_t shmem;
        int64_t unevictable;
        int64_t total_swap;
        int64_t free_swap;
        int64_t active_anon;
        int64_t inactive_anon;
        int64_t active_file;
        int64_t inactive_file;
        int64_t sreclaimable;
        int64_t sunreclaimable;
        int64_t kernel_stack;
        int64_t page_tables;
        int64_t ion_heap;
        int64_t ion_heap_pool;
        int64_t cma_free;
        /* fields below are calculated rather than read from the file */
        int64_t nr_file_pages;
        int64_t total_gpu_kb;
    } field;
    int64_t arr[MI_FIELD_COUNT];
};
442
/* Fields to parse in /proc/vmstat */
enum vmstat_field {
    VS_FREE_PAGES,
    VS_INACTIVE_FILE,
    VS_ACTIVE_FILE,
    VS_WORKINGSET_REFAULT,
    VS_WORKINGSET_REFAULT_FILE,
    VS_PGSCAN_KSWAPD,
    VS_PGSCAN_DIRECT,
    VS_PGSCAN_DIRECT_THROTTLE,
    VS_FIELD_COUNT
};

/*
 * /proc/vmstat field names, listed in enum vmstat_field order.
 * The array is sized by VS_FIELD_COUNT (the vmstat enum, not the meminfo
 * one), so it holds exactly one entry per vmstat field and no trailing
 * NULL entries that a string comparison could trip over.
 */
static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
    "nr_free_pages",
    "nr_inactive_file",
    "nr_active_file",
    "workingset_refault",
    "workingset_refault_file",
    "pgscan_kswapd",
    "pgscan_direct",
    "pgscan_direct_throttle",
};

/* Values accessible by name (field) or by enum vmstat_field index (arr) */
union vmstat {
    struct {
        int64_t nr_free_pages;
        int64_t nr_inactive_file;
        int64_t nr_active_file;
        int64_t workingset_refault;
        int64_t workingset_refault_file;
        int64_t pgscan_kswapd;
        int64_t pgscan_direct;
        int64_t pgscan_direct_throttle;
    } field;
    int64_t arr[VS_FIELD_COUNT];
};
480
/* Outcome of match_field(): name not recognized, value unparsable, or parsed */
enum field_match_result {
    NO_MATCH,
    PARSE_FAIL,
    PARSE_SUCCESS
};

/* Node of a circular doubly-linked list (see adjslot_insert/adjslot_remove) */
struct adjslot_list {
    struct adjslot_list *next;
    struct adjslot_list *prev;
};

/* Record of a registered process; linked into pidhash and into an oomadj slot list */
struct proc {
    struct adjslot_list asl;
    int pid;
    int pidfd;
    uid_t uid;
    int oomadj;
    pid_t reg_pid; /* PID of the process that registered this record */
    struct proc *pidhash_next;
};

/* File name plus cached fd used by reread_file(); fd is -1 while the file is not open */
struct reread_data {
    const char* const filename;
    int fd;
};

/* Hash table of struct proc records keyed by pid, chained via pidhash_next */
#define PIDHASH_SZ 1024
static struct proc *pidhash[PIDHASH_SZ];
#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))

/*
 * Maps an oom adj value to an index into procadjslot_list by subtracting
 * OOM_SCORE_ADJ_MIN, so OOM_SCORE_ADJ_MIN maps to slot 0 and
 * OOM_SCORE_ADJ_MAX to the last slot.
 */
#define ADJTOSLOT(adj) ((adj) + -OOM_SCORE_ADJ_MIN)
#define ADJTOSLOT_COUNT (ADJTOSLOT(OOM_SCORE_ADJ_MAX) + 1)
static struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT];

#define MAX_DISTINCT_OOM_ADJ 32
#define KILLCNT_INVALID_IDX 0xFF
/*
 * Because killcnt array is sparse a two-level indirection is used
 * to keep the size small. killcnt_idx stores index of the element in
 * killcnt array. Index KILLCNT_INVALID_IDX indicates an unused slot.
 */
static uint8_t killcnt_idx[ADJTOSLOT_COUNT];
static uint16_t killcnt[MAX_DISTINCT_OOM_ADJ];
static int killcnt_free_idx = 0;
static uint32_t killcnt_total = 0;

/* PAGE_SIZE / 1024 */
static long page_k;

/* Forward declarations of functions defined later in the file */
static void update_props();
static bool init_monitors();
static void destroy_monitors();
533
/*
 * Limits value to the range [low, high]. The upper bound is applied first,
 * then the lower one, so low is returned if the bounds are inverted.
 */
static int clamp(int low, int high, int value) {
    const int capped = (value < high) ? value : high;

    return (capped > low) ? capped : low;
}
537
/*
 * Parses a base-10 signed integer from the beginning of str.
 *
 * Leading whitespace and an optional sign are accepted, as by strtoll().
 * Text following the number is ignored, so "1234 kB" parses as 1234.
 *
 * Returns true and stores the value in *ret on success. Returns false and
 * leaves *ret untouched when str does not start with a number or when the
 * number does not fit into int64_t (errno is ERANGE in the latter case).
 */
static bool parse_int64(const char* str, int64_t* ret) {
    char* endptr;
    long long val;
    const int saved_errno = errno;

    /* strtoll() reports overflow only through errno, so clear it first */
    errno = 0;
    val = strtoll(str, &endptr, 10);
    if (errno == ERANGE) {
        return false;
    }
    /* do not disturb the caller's errno when no overflow happened */
    errno = saved_errno;

    if (str == endptr || val > INT64_MAX || val < INT64_MIN) {
        return false;
    }
    *ret = (int64_t)val;
    return true;
}
547
/*
 * Returns the index of the first of the field_count entries in field_names
 * that is equal to name, or -1 when none of them matches.
 */
static int find_field(const char* name, const char* const field_names[], int field_count) {
    int idx = 0;

    while (idx < field_count) {
        if (strcmp(name, field_names[idx]) == 0) {
            return idx;
        }
        idx++;
    }
    return -1;
}
556
match_field(const char * cp,const char * ap,const char * const field_names[],int field_count,int64_t * field,int * field_idx)557 static enum field_match_result match_field(const char* cp, const char* ap,
558 const char* const field_names[],
559 int field_count, int64_t* field,
560 int *field_idx) {
561 int i = find_field(cp, field_names, field_count);
562 if (i < 0) {
563 return NO_MATCH;
564 }
565 *field_idx = i;
566 return parse_int64(ap, field) ? PARSE_SUCCESS : PARSE_FAIL;
567 }
568
569 /*
570 * Read file content from the beginning up to max_len bytes or EOF
571 * whichever happens first.
572 */
read_all(int fd,char * buf,size_t max_len)573 static ssize_t read_all(int fd, char *buf, size_t max_len)
574 {
575 ssize_t ret = 0;
576 off_t offset = 0;
577
578 while (max_len > 0) {
579 ssize_t r = TEMP_FAILURE_RETRY(pread(fd, buf, max_len, offset));
580 if (r == 0) {
581 break;
582 }
583 if (r == -1) {
584 return -1;
585 }
586 ret += r;
587 buf += r;
588 offset += r;
589 max_len -= r;
590 }
591
592 return ret;
593 }
594
595 /*
596 * Read a new or already opened file from the beginning.
597 * If the file has not been opened yet data->fd should be set to -1.
598 * To be used with files which are read often and possibly during high
599 * memory pressure to minimize file opening which by itself requires kernel
600 * memory allocation and might result in a stall on memory stressed system.
601 */
reread_file(struct reread_data * data)602 static char *reread_file(struct reread_data *data) {
603 /* start with page-size buffer and increase if needed */
604 static ssize_t buf_size = PAGE_SIZE;
605 static char *new_buf, *buf = NULL;
606 ssize_t size;
607
608 if (data->fd == -1) {
609 /* First-time buffer initialization */
610 if (!buf && (buf = static_cast<char*>(malloc(buf_size))) == nullptr) {
611 return NULL;
612 }
613
614 data->fd = TEMP_FAILURE_RETRY(open(data->filename, O_RDONLY | O_CLOEXEC));
615 if (data->fd < 0) {
616 ALOGE("%s open: %s", data->filename, strerror(errno));
617 return NULL;
618 }
619 }
620
621 while (true) {
622 size = read_all(data->fd, buf, buf_size - 1);
623 if (size < 0) {
624 ALOGE("%s read: %s", data->filename, strerror(errno));
625 close(data->fd);
626 data->fd = -1;
627 return NULL;
628 }
629 if (size < buf_size - 1) {
630 break;
631 }
632 /*
633 * Since we are reading /proc files we can't use fstat to find out
634 * the real size of the file. Double the buffer size and keep retrying.
635 */
636 if ((new_buf = static_cast<char*>(realloc(buf, buf_size * 2))) == nullptr) {
637 errno = ENOMEM;
638 return NULL;
639 }
640 buf = new_buf;
641 buf_size *= 2;
642 }
643 buf[size] = 0;
644
645 return buf;
646 }
647
claim_record(struct proc * procp,pid_t pid)648 static bool claim_record(struct proc* procp, pid_t pid) {
649 if (procp->reg_pid == pid) {
650 /* Record already belongs to the registrant */
651 return true;
652 }
653 if (procp->reg_pid == 0) {
654 /* Old registrant is gone, claim the record */
655 procp->reg_pid = pid;
656 return true;
657 }
658 /* The record is owned by another registrant */
659 return false;
660 }
661
remove_claims(pid_t pid)662 static void remove_claims(pid_t pid) {
663 int i;
664
665 for (i = 0; i < PIDHASH_SZ; i++) {
666 struct proc* procp = pidhash[i];
667 while (procp) {
668 if (procp->reg_pid == pid) {
669 procp->reg_pid = 0;
670 }
671 procp = procp->pidhash_next;
672 }
673 }
674 }
675
ctrl_data_close(int dsock_idx)676 static void ctrl_data_close(int dsock_idx) {
677 struct epoll_event epev;
678
679 ALOGI("closing lmkd data connection");
680 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, data_sock[dsock_idx].sock, &epev) == -1) {
681 // Log a warning and keep going
682 ALOGW("epoll_ctl for data connection socket failed; errno=%d", errno);
683 }
684 maxevents--;
685
686 close(data_sock[dsock_idx].sock);
687 data_sock[dsock_idx].sock = -1;
688
689 /* Mark all records of the old registrant as unclaimed */
690 remove_claims(data_sock[dsock_idx].pid);
691 }
692
ctrl_data_read(int dsock_idx,char * buf,size_t bufsz,struct ucred * sender_cred)693 static ssize_t ctrl_data_read(int dsock_idx, char* buf, size_t bufsz, struct ucred* sender_cred) {
694 struct iovec iov = {buf, bufsz};
695 char control[CMSG_SPACE(sizeof(struct ucred))];
696 struct msghdr hdr = {
697 NULL, 0, &iov, 1, control, sizeof(control), 0,
698 };
699 ssize_t ret;
700 ret = TEMP_FAILURE_RETRY(recvmsg(data_sock[dsock_idx].sock, &hdr, 0));
701 if (ret == -1) {
702 ALOGE("control data socket read failed; %s", strerror(errno));
703 return -1;
704 }
705 if (ret == 0) {
706 ALOGE("Got EOF on control data socket");
707 return -1;
708 }
709
710 struct ucred* cred = NULL;
711 struct cmsghdr* cmsg = CMSG_FIRSTHDR(&hdr);
712 while (cmsg != NULL) {
713 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDENTIALS) {
714 cred = (struct ucred*)CMSG_DATA(cmsg);
715 break;
716 }
717 cmsg = CMSG_NXTHDR(&hdr, cmsg);
718 }
719
720 if (cred == NULL) {
721 ALOGE("Failed to retrieve sender credentials");
722 /* Close the connection */
723 ctrl_data_close(dsock_idx);
724 return -1;
725 }
726
727 memcpy(sender_cred, cred, sizeof(struct ucred));
728
729 /* Store PID of the peer */
730 data_sock[dsock_idx].pid = cred->pid;
731
732 return ret;
733 }
734
ctrl_data_write(int dsock_idx,char * buf,size_t bufsz)735 static int ctrl_data_write(int dsock_idx, char* buf, size_t bufsz) {
736 int ret = 0;
737
738 ret = TEMP_FAILURE_RETRY(write(data_sock[dsock_idx].sock, buf, bufsz));
739
740 if (ret == -1) {
741 ALOGE("control data socket write failed; errno=%d", errno);
742 } else if (ret == 0) {
743 ALOGE("Got EOF on control data socket");
744 ret = -1;
745 }
746
747 return ret;
748 }
749
750 /*
751 * Write the pid/uid pair over the data socket, note: all active clients
752 * will receive this unsolicited notification.
753 */
ctrl_data_write_lmk_kill_occurred(pid_t pid,uid_t uid)754 static void ctrl_data_write_lmk_kill_occurred(pid_t pid, uid_t uid) {
755 LMKD_CTRL_PACKET packet;
756 size_t len = lmkd_pack_set_prockills(packet, pid, uid);
757
758 for (int i = 0; i < MAX_DATA_CONN; i++) {
759 if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_KILL) {
760 ctrl_data_write(i, (char*)packet, len);
761 }
762 }
763 }
764
765 /*
766 * Write the kill_stat/memory_stat over the data socket to be propagated via AMS to statsd
767 */
stats_write_lmk_kill_occurred(struct kill_stat * kill_st,struct memory_stat * mem_st)768 static void stats_write_lmk_kill_occurred(struct kill_stat *kill_st,
769 struct memory_stat *mem_st) {
770 LMK_KILL_OCCURRED_PACKET packet;
771 const size_t len = lmkd_pack_set_kill_occurred(packet, kill_st, mem_st);
772 if (len == 0) {
773 return;
774 }
775
776 for (int i = 0; i < MAX_DATA_CONN; i++) {
777 if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) {
778 ctrl_data_write(i, packet, len);
779 }
780 }
781
782 }
783
stats_write_lmk_kill_occurred_pid(int pid,struct kill_stat * kill_st,struct memory_stat * mem_st)784 static void stats_write_lmk_kill_occurred_pid(int pid, struct kill_stat *kill_st,
785 struct memory_stat *mem_st) {
786 kill_st->taskname = stats_get_task_name(pid);
787 if (kill_st->taskname != NULL) {
788 stats_write_lmk_kill_occurred(kill_st, mem_st);
789 }
790 }
791
792 /*
793 * Write the state_changed over the data socket to be propagated via AMS to statsd
794 */
stats_write_lmk_state_changed(enum lmk_state state)795 static void stats_write_lmk_state_changed(enum lmk_state state) {
796 LMKD_CTRL_PACKET packet_state_changed;
797 const size_t len = lmkd_pack_set_state_changed(packet_state_changed, state);
798 if (len == 0) {
799 return;
800 }
801 for (int i = 0; i < MAX_DATA_CONN; i++) {
802 if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) {
803 ctrl_data_write(i, (char*)packet_state_changed, len);
804 }
805 }
806 }
807
poll_kernel(int poll_fd)808 static void poll_kernel(int poll_fd) {
809 if (poll_fd == -1) {
810 // not waiting
811 return;
812 }
813
814 while (1) {
815 char rd_buf[256];
816 int bytes_read = TEMP_FAILURE_RETRY(pread(poll_fd, (void*)rd_buf, sizeof(rd_buf), 0));
817 if (bytes_read <= 0) break;
818 rd_buf[bytes_read] = '\0';
819
820 int64_t pid;
821 int64_t uid;
822 int64_t group_leader_pid;
823 int64_t rss_in_pages;
824 struct memory_stat mem_st = {};
825 int16_t oom_score_adj;
826 int16_t min_score_adj;
827 int64_t starttime;
828 char* taskname = 0;
829
830 int fields_read =
831 sscanf(rd_buf,
832 "%" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64
833 " %" SCNd16 " %" SCNd16 " %" SCNd64 "\n%m[^\n]",
834 &pid, &uid, &group_leader_pid, &mem_st.pgfault, &mem_st.pgmajfault,
835 &rss_in_pages, &oom_score_adj, &min_score_adj, &starttime, &taskname);
836
837 /* only the death of the group leader process is logged */
838 if (fields_read == 10 && group_leader_pid == pid) {
839 ctrl_data_write_lmk_kill_occurred((pid_t)pid, (uid_t)uid);
840 mem_st.process_start_time_ns = starttime * (NS_PER_SEC / sysconf(_SC_CLK_TCK));
841 mem_st.rss_in_bytes = rss_in_pages * PAGE_SIZE;
842
843 struct kill_stat kill_st = {
844 .uid = static_cast<int32_t>(uid),
845 .kill_reason = NONE,
846 .oom_score = oom_score_adj,
847 .min_oom_score = min_score_adj,
848 .free_mem_kb = 0,
849 .free_swap_kb = 0,
850 };
851 stats_write_lmk_kill_occurred_pid(pid, &kill_st, &mem_st);
852 }
853
854 free(taskname);
855 }
856 }
857
init_poll_kernel()858 static bool init_poll_kernel() {
859 kpoll_fd = TEMP_FAILURE_RETRY(open("/proc/lowmemorykiller", O_RDONLY | O_NONBLOCK | O_CLOEXEC));
860
861 if (kpoll_fd < 0) {
862 ALOGE("kernel lmk event file could not be opened; errno=%d", errno);
863 return false;
864 }
865
866 return true;
867 }
868
pid_lookup(int pid)869 static struct proc *pid_lookup(int pid) {
870 struct proc *procp;
871
872 for (procp = pidhash[pid_hashfn(pid)]; procp && procp->pid != pid;
873 procp = procp->pidhash_next)
874 ;
875
876 return procp;
877 }
878
adjslot_insert(struct adjslot_list * head,struct adjslot_list * new_element)879 static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new_element)
880 {
881 struct adjslot_list *next = head->next;
882 new_element->prev = head;
883 new_element->next = next;
884 next->prev = new_element;
885 head->next = new_element;
886 }
887
adjslot_remove(struct adjslot_list * old)888 static void adjslot_remove(struct adjslot_list *old)
889 {
890 struct adjslot_list *prev = old->prev;
891 struct adjslot_list *next = old->next;
892 next->prev = prev;
893 prev->next = next;
894 }
895
adjslot_tail(struct adjslot_list * head)896 static struct adjslot_list *adjslot_tail(struct adjslot_list *head) {
897 struct adjslot_list *asl = head->prev;
898
899 return asl == head ? NULL : asl;
900 }
901
proc_slot(struct proc * procp)902 static void proc_slot(struct proc *procp) {
903 int adjslot = ADJTOSLOT(procp->oomadj);
904
905 adjslot_insert(&procadjslot_list[adjslot], &procp->asl);
906 }
907
proc_unslot(struct proc * procp)908 static void proc_unslot(struct proc *procp) {
909 adjslot_remove(&procp->asl);
910 }
911
proc_insert(struct proc * procp)912 static void proc_insert(struct proc *procp) {
913 int hval = pid_hashfn(procp->pid);
914
915 procp->pidhash_next = pidhash[hval];
916 pidhash[hval] = procp;
917 proc_slot(procp);
918 }
919
pid_remove(int pid)920 static int pid_remove(int pid) {
921 int hval = pid_hashfn(pid);
922 struct proc *procp;
923 struct proc *prevp;
924
925 for (procp = pidhash[hval], prevp = NULL; procp && procp->pid != pid;
926 procp = procp->pidhash_next)
927 prevp = procp;
928
929 if (!procp)
930 return -1;
931
932 if (!prevp)
933 pidhash[hval] = procp->pidhash_next;
934 else
935 prevp->pidhash_next = procp->pidhash_next;
936
937 proc_unslot(procp);
938 /*
939 * Close pidfd here if we are not waiting for corresponding process to die,
940 * in which case stop_wait_for_proc_kill() will close the pidfd later
941 */
942 if (procp->pidfd >= 0 && procp->pidfd != last_kill_pid_or_fd) {
943 close(procp->pidfd);
944 }
945 free(procp);
946 return 0;
947 }
948
949 /*
950 * Write a string to a file.
951 * Returns false if the file does not exist.
952 */
writefilestring(const char * path,const char * s,bool err_if_missing)953 static bool writefilestring(const char *path, const char *s,
954 bool err_if_missing) {
955 int fd = open(path, O_WRONLY | O_CLOEXEC);
956 ssize_t len = strlen(s);
957 ssize_t ret;
958
959 if (fd < 0) {
960 if (err_if_missing) {
961 ALOGE("Error opening %s; errno=%d", path, errno);
962 }
963 return false;
964 }
965
966 ret = TEMP_FAILURE_RETRY(write(fd, s, len));
967 if (ret < 0) {
968 ALOGE("Error writing %s; errno=%d", path, errno);
969 } else if (ret < len) {
970 ALOGE("Short write on %s; length=%zd", path, ret);
971 }
972
973 close(fd);
974 return true;
975 }
976
/*
 * Returns the time elapsed between *from and *to in milliseconds.
 * The nanosecond difference is divided with integer division, so any
 * sub-millisecond part is dropped. The result is negative when *to is
 * earlier than *from.
 */
static inline long get_time_diff_ms(struct timespec *from,
                                    struct timespec *to) {
    /* whole seconds converted to ms, plus the nanosecond remainder converted to ms */
    return (to->tv_sec - from->tv_sec) * (long)MS_PER_SEC +
           (to->tv_nsec - from->tv_nsec) / (long)NS_PER_MS;
}
982
983 /* Reads /proc/pid/status into buf. */
read_proc_status(int pid,char * buf,size_t buf_sz)984 static bool read_proc_status(int pid, char *buf, size_t buf_sz) {
985 char path[PATH_MAX];
986 int fd;
987 ssize_t size;
988
989 snprintf(path, PATH_MAX, "/proc/%d/status", pid);
990 fd = open(path, O_RDONLY | O_CLOEXEC);
991 if (fd < 0) {
992 return false;
993 }
994
995 size = read_all(fd, buf, buf_sz - 1);
996 close(fd);
997 if (size < 0) {
998 return false;
999 }
1000 buf[size] = 0;
1001 return true;
1002 }
1003
1004 /* Looks for tag in buf and parses the first integer */
parse_status_tag(char * buf,const char * tag,int64_t * out)1005 static bool parse_status_tag(char *buf, const char *tag, int64_t *out) {
1006 char *pos = buf;
1007 while (true) {
1008 pos = strstr(pos, tag);
1009 /* Stop if tag not found or found at the line beginning */
1010 if (pos == NULL || pos == buf || pos[-1] == '\n') {
1011 break;
1012 }
1013 pos++;
1014 }
1015
1016 if (pos == NULL) {
1017 return false;
1018 }
1019
1020 pos += strlen(tag);
1021 while (*pos == ' ') ++pos;
1022 return parse_int64(pos, out);
1023 }
1024
proc_get_size(int pid)1025 static int proc_get_size(int pid) {
1026 char path[PATH_MAX];
1027 char line[LINE_MAX];
1028 int fd;
1029 int rss = 0;
1030 int total;
1031 ssize_t ret;
1032
1033 /* gid containing AID_READPROC required */
1034 snprintf(path, PATH_MAX, "/proc/%d/statm", pid);
1035 fd = open(path, O_RDONLY | O_CLOEXEC);
1036 if (fd == -1)
1037 return -1;
1038
1039 ret = read_all(fd, line, sizeof(line) - 1);
1040 if (ret < 0) {
1041 close(fd);
1042 return -1;
1043 }
1044 line[ret] = '\0';
1045
1046 sscanf(line, "%d %d ", &total, &rss);
1047 close(fd);
1048 return rss;
1049 }
1050
proc_get_name(int pid,char * buf,size_t buf_size)1051 static char *proc_get_name(int pid, char *buf, size_t buf_size) {
1052 char path[PATH_MAX];
1053 int fd;
1054 char *cp;
1055 ssize_t ret;
1056
1057 /* gid containing AID_READPROC required */
1058 snprintf(path, PATH_MAX, "/proc/%d/cmdline", pid);
1059 fd = open(path, O_RDONLY | O_CLOEXEC);
1060 if (fd == -1) {
1061 return NULL;
1062 }
1063 ret = read_all(fd, buf, buf_size - 1);
1064 close(fd);
1065 if (ret < 0) {
1066 return NULL;
1067 }
1068 buf[ret] = '\0';
1069
1070 cp = strchr(buf, ' ');
1071 if (cp) {
1072 *cp = '\0';
1073 }
1074
1075 return buf;
1076 }
1077
cmd_procprio(LMKD_CTRL_PACKET packet,int field_count,struct ucred * cred)1078 static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred *cred) {
1079 struct proc *procp;
1080 char path[LINE_MAX];
1081 char val[20];
1082 int soft_limit_mult;
1083 struct lmk_procprio params;
1084 bool is_system_server;
1085 struct passwd *pwdrec;
1086 int64_t tgid;
1087 char buf[PAGE_SIZE];
1088
1089 lmkd_pack_get_procprio(packet, field_count, ¶ms);
1090
1091 if (params.oomadj < OOM_SCORE_ADJ_MIN ||
1092 params.oomadj > OOM_SCORE_ADJ_MAX) {
1093 ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
1094 return;
1095 }
1096
1097 if (params.ptype < PROC_TYPE_FIRST || params.ptype >= PROC_TYPE_COUNT) {
1098 ALOGE("Invalid PROCPRIO process type argument %d", params.ptype);
1099 return;
1100 }
1101
1102 /* Check if registered process is a thread group leader */
1103 if (read_proc_status(params.pid, buf, sizeof(buf))) {
1104 if (parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid) && tgid != params.pid) {
1105 ALOGE("Attempt to register a task that is not a thread group leader "
1106 "(tid %d, tgid %" PRId64 ")", params.pid, tgid);
1107 return;
1108 }
1109 }
1110
1111 /* gid containing AID_READPROC required */
1112 /* CAP_SYS_RESOURCE required */
1113 /* CAP_DAC_OVERRIDE required */
1114 snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
1115 snprintf(val, sizeof(val), "%d", params.oomadj);
1116 if (!writefilestring(path, val, false)) {
1117 ALOGW("Failed to open %s; errno=%d: process %d might have been killed",
1118 path, errno, params.pid);
1119 /* If this file does not exist the process is dead. */
1120 return;
1121 }
1122
1123 if (use_inkernel_interface) {
1124 stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)));
1125 return;
1126 }
1127
1128 /* lmkd should not change soft limits for services */
1129 if (params.ptype == PROC_TYPE_APP && per_app_memcg) {
1130 if (params.oomadj >= 900) {
1131 soft_limit_mult = 0;
1132 } else if (params.oomadj >= 800) {
1133 soft_limit_mult = 0;
1134 } else if (params.oomadj >= 700) {
1135 soft_limit_mult = 0;
1136 } else if (params.oomadj >= 600) {
1137 // Launcher should be perceptible, don't kill it.
1138 params.oomadj = 200;
1139 soft_limit_mult = 1;
1140 } else if (params.oomadj >= 500) {
1141 soft_limit_mult = 0;
1142 } else if (params.oomadj >= 400) {
1143 soft_limit_mult = 0;
1144 } else if (params.oomadj >= 300) {
1145 soft_limit_mult = 1;
1146 } else if (params.oomadj >= 200) {
1147 soft_limit_mult = 8;
1148 } else if (params.oomadj >= 100) {
1149 soft_limit_mult = 10;
1150 } else if (params.oomadj >= 0) {
1151 soft_limit_mult = 20;
1152 } else {
1153 // Persistent processes will have a large
1154 // soft limit 512MB.
1155 soft_limit_mult = 64;
1156 }
1157
1158 snprintf(path, sizeof(path), MEMCG_SYSFS_PATH
1159 "apps/uid_%d/pid_%d/memory.soft_limit_in_bytes",
1160 params.uid, params.pid);
1161 snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);
1162
1163 /*
1164 * system_server process has no memcg under /dev/memcg/apps but should be
1165 * registered with lmkd. This is the best way so far to identify it.
1166 */
1167 is_system_server = (params.oomadj == SYSTEM_ADJ &&
1168 (pwdrec = getpwnam("system")) != NULL &&
1169 params.uid == pwdrec->pw_uid);
1170 writefilestring(path, val, !is_system_server);
1171 }
1172
1173 procp = pid_lookup(params.pid);
1174 if (!procp) {
1175 int pidfd = -1;
1176
1177 if (pidfd_supported) {
1178 pidfd = TEMP_FAILURE_RETRY(pidfd_open(params.pid, 0));
1179 if (pidfd < 0) {
1180 ALOGE("pidfd_open for pid %d failed; errno=%d", params.pid, errno);
1181 return;
1182 }
1183 }
1184
1185 procp = static_cast<struct proc*>(calloc(1, sizeof(struct proc)));
1186 if (!procp) {
1187 // Oh, the irony. May need to rebuild our state.
1188 return;
1189 }
1190
1191 procp->pid = params.pid;
1192 procp->pidfd = pidfd;
1193 procp->uid = params.uid;
1194 procp->reg_pid = cred->pid;
1195 procp->oomadj = params.oomadj;
1196 proc_insert(procp);
1197 } else {
1198 if (!claim_record(procp, cred->pid)) {
1199 char buf[LINE_MAX];
1200 char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1201 /* Only registrant of the record can remove it */
1202 ALOGE("%s (%d, %d) attempts to modify a process registered by another client",
1203 taskname ? taskname : "A process ", cred->uid, cred->pid);
1204 return;
1205 }
1206 proc_unslot(procp);
1207 procp->oomadj = params.oomadj;
1208 proc_slot(procp);
1209 }
1210 }
1211
cmd_procremove(LMKD_CTRL_PACKET packet,struct ucred * cred)1212 static void cmd_procremove(LMKD_CTRL_PACKET packet, struct ucred *cred) {
1213 struct lmk_procremove params;
1214 struct proc *procp;
1215
1216 lmkd_pack_get_procremove(packet, ¶ms);
1217
1218 if (use_inkernel_interface) {
1219 /*
1220 * Perform an extra check before the pid is removed, after which it
1221 * will be impossible for poll_kernel to get the taskname. poll_kernel()
1222 * is potentially a long-running blocking function; however this method
1223 * handles AMS requests but does not block AMS.
1224 */
1225 poll_kernel(kpoll_fd);
1226
1227 stats_remove_taskname(params.pid);
1228 return;
1229 }
1230
1231 procp = pid_lookup(params.pid);
1232 if (!procp) {
1233 return;
1234 }
1235
1236 if (!claim_record(procp, cred->pid)) {
1237 char buf[LINE_MAX];
1238 char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1239 /* Only registrant of the record can remove it */
1240 ALOGE("%s (%d, %d) attempts to unregister a process registered by another client",
1241 taskname ? taskname : "A process ", cred->uid, cred->pid);
1242 return;
1243 }
1244
1245 /*
1246 * WARNING: After pid_remove() procp is freed and can't be used!
1247 * Therefore placed at the end of the function.
1248 */
1249 pid_remove(params.pid);
1250 }
1251
cmd_procpurge(struct ucred * cred)1252 static void cmd_procpurge(struct ucred *cred) {
1253 int i;
1254 struct proc *procp;
1255 struct proc *next;
1256
1257 if (use_inkernel_interface) {
1258 stats_purge_tasknames();
1259 return;
1260 }
1261
1262 for (i = 0; i < PIDHASH_SZ; i++) {
1263 procp = pidhash[i];
1264 while (procp) {
1265 next = procp->pidhash_next;
1266 /* Purge only records created by the requestor */
1267 if (claim_record(procp, cred->pid)) {
1268 pid_remove(procp->pid);
1269 }
1270 procp = next;
1271 }
1272 }
1273 }
1274
cmd_subscribe(int dsock_idx,LMKD_CTRL_PACKET packet)1275 static void cmd_subscribe(int dsock_idx, LMKD_CTRL_PACKET packet) {
1276 struct lmk_subscribe params;
1277
1278 lmkd_pack_get_subscribe(packet, ¶ms);
1279 data_sock[dsock_idx].async_event_mask |= 1 << params.evt_type;
1280 }
1281
inc_killcnt(int oomadj)1282 static void inc_killcnt(int oomadj) {
1283 int slot = ADJTOSLOT(oomadj);
1284 uint8_t idx = killcnt_idx[slot];
1285
1286 if (idx == KILLCNT_INVALID_IDX) {
1287 /* index is not assigned for this oomadj */
1288 if (killcnt_free_idx < MAX_DISTINCT_OOM_ADJ) {
1289 killcnt_idx[slot] = killcnt_free_idx;
1290 killcnt[killcnt_free_idx] = 1;
1291 killcnt_free_idx++;
1292 } else {
1293 ALOGW("Number of distinct oomadj levels exceeds %d",
1294 MAX_DISTINCT_OOM_ADJ);
1295 }
1296 } else {
1297 /*
1298 * wraparound is highly unlikely and is detectable using total
1299 * counter because it has to be equal to the sum of all counters
1300 */
1301 killcnt[idx]++;
1302 }
1303 /* increment total kill counter */
1304 killcnt_total++;
1305 }
1306
get_killcnt(int min_oomadj,int max_oomadj)1307 static int get_killcnt(int min_oomadj, int max_oomadj) {
1308 int slot;
1309 int count = 0;
1310
1311 if (min_oomadj > max_oomadj)
1312 return 0;
1313
1314 /* special case to get total kill count */
1315 if (min_oomadj > OOM_SCORE_ADJ_MAX)
1316 return killcnt_total;
1317
1318 while (min_oomadj <= max_oomadj &&
1319 (slot = ADJTOSLOT(min_oomadj)) < ADJTOSLOT_COUNT) {
1320 uint8_t idx = killcnt_idx[slot];
1321 if (idx != KILLCNT_INVALID_IDX) {
1322 count += killcnt[idx];
1323 }
1324 min_oomadj++;
1325 }
1326
1327 return count;
1328 }
1329
cmd_getkillcnt(LMKD_CTRL_PACKET packet)1330 static int cmd_getkillcnt(LMKD_CTRL_PACKET packet) {
1331 struct lmk_getkillcnt params;
1332
1333 if (use_inkernel_interface) {
1334 /* kernel driver does not expose this information */
1335 return 0;
1336 }
1337
1338 lmkd_pack_get_getkillcnt(packet, ¶ms);
1339
1340 return get_killcnt(params.min_oomadj, params.max_oomadj);
1341 }
1342
cmd_target(int ntargets,LMKD_CTRL_PACKET packet)1343 static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) {
1344 int i;
1345 struct lmk_target target;
1346 char minfree_str[PROPERTY_VALUE_MAX];
1347 char *pstr = minfree_str;
1348 char *pend = minfree_str + sizeof(minfree_str);
1349 static struct timespec last_req_tm;
1350 struct timespec curr_tm;
1351
1352 if (ntargets < 1 || ntargets > (int)ARRAY_SIZE(lowmem_adj))
1353 return;
1354
1355 /*
1356 * Ratelimit minfree updates to once per TARGET_UPDATE_MIN_INTERVAL_MS
1357 * to prevent DoS attacks
1358 */
1359 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1360 ALOGE("Failed to get current time");
1361 return;
1362 }
1363
1364 if (get_time_diff_ms(&last_req_tm, &curr_tm) <
1365 TARGET_UPDATE_MIN_INTERVAL_MS) {
1366 ALOGE("Ignoring frequent updated to lmkd limits");
1367 return;
1368 }
1369
1370 last_req_tm = curr_tm;
1371
1372 for (i = 0; i < ntargets; i++) {
1373 lmkd_pack_get_target(packet, i, &target);
1374 lowmem_minfree[i] = target.minfree;
1375 lowmem_adj[i] = target.oom_adj_score;
1376
1377 pstr += snprintf(pstr, pend - pstr, "%d:%d,", target.minfree,
1378 target.oom_adj_score);
1379 if (pstr >= pend) {
1380 /* if no more space in the buffer then terminate the loop */
1381 pstr = pend;
1382 break;
1383 }
1384 }
1385
1386 lowmem_targets_size = ntargets;
1387
1388 /* Override the last extra comma */
1389 pstr[-1] = '\0';
1390 property_set("sys.lmk.minfree_levels", minfree_str);
1391
1392 if (has_inkernel_module) {
1393 char minfreestr[128];
1394 char killpriostr[128];
1395
1396 minfreestr[0] = '\0';
1397 killpriostr[0] = '\0';
1398
1399 for (i = 0; i < lowmem_targets_size; i++) {
1400 char val[40];
1401
1402 if (i) {
1403 strlcat(minfreestr, ",", sizeof(minfreestr));
1404 strlcat(killpriostr, ",", sizeof(killpriostr));
1405 }
1406
1407 snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_minfree[i] : 0);
1408 strlcat(minfreestr, val, sizeof(minfreestr));
1409 snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_adj[i] : 0);
1410 strlcat(killpriostr, val, sizeof(killpriostr));
1411 }
1412
1413 writefilestring(INKERNEL_MINFREE_PATH, minfreestr, true);
1414 writefilestring(INKERNEL_ADJ_PATH, killpriostr, true);
1415 }
1416 }
1417
ctrl_command_handler(int dsock_idx)1418 static void ctrl_command_handler(int dsock_idx) {
1419 LMKD_CTRL_PACKET packet;
1420 struct ucred cred;
1421 int len;
1422 enum lmk_cmd cmd;
1423 int nargs;
1424 int targets;
1425 int kill_cnt;
1426 int result;
1427
1428 len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred);
1429 if (len <= 0)
1430 return;
1431
1432 if (len < (int)sizeof(int)) {
1433 ALOGE("Wrong control socket read length len=%d", len);
1434 return;
1435 }
1436
1437 cmd = lmkd_pack_get_cmd(packet);
1438 nargs = len / sizeof(int) - 1;
1439 if (nargs < 0)
1440 goto wronglen;
1441
1442 switch(cmd) {
1443 case LMK_TARGET:
1444 targets = nargs / 2;
1445 if (nargs & 0x1 || targets > (int)ARRAY_SIZE(lowmem_adj))
1446 goto wronglen;
1447 cmd_target(targets, packet);
1448 break;
1449 case LMK_PROCPRIO:
1450 /* process type field is optional for backward compatibility */
1451 if (nargs < 3 || nargs > 4)
1452 goto wronglen;
1453 cmd_procprio(packet, nargs, &cred);
1454 break;
1455 case LMK_PROCREMOVE:
1456 if (nargs != 1)
1457 goto wronglen;
1458 cmd_procremove(packet, &cred);
1459 break;
1460 case LMK_PROCPURGE:
1461 if (nargs != 0)
1462 goto wronglen;
1463 cmd_procpurge(&cred);
1464 break;
1465 case LMK_GETKILLCNT:
1466 if (nargs != 2)
1467 goto wronglen;
1468 kill_cnt = cmd_getkillcnt(packet);
1469 len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
1470 if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
1471 return;
1472 break;
1473 case LMK_SUBSCRIBE:
1474 if (nargs != 1)
1475 goto wronglen;
1476 cmd_subscribe(dsock_idx, packet);
1477 break;
1478 case LMK_PROCKILL:
1479 /* This command code is NOT expected at all */
1480 ALOGE("Received unexpected command code %d", cmd);
1481 break;
1482 case LMK_UPDATE_PROPS:
1483 if (nargs != 0)
1484 goto wronglen;
1485 update_props();
1486 if (!use_inkernel_interface) {
1487 /* Reinitialize monitors to apply new settings */
1488 destroy_monitors();
1489 result = init_monitors() ? 0 : -1;
1490 } else {
1491 result = 0;
1492 }
1493 len = lmkd_pack_set_update_props_repl(packet, result);
1494 if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) {
1495 ALOGE("Failed to report operation results");
1496 }
1497 if (!result) {
1498 ALOGI("Properties reinitilized");
1499 } else {
1500 /* New settings can't be supported, crash to be restarted */
1501 ALOGE("New configuration is not supported. Exiting...");
1502 exit(1);
1503 }
1504 break;
1505 default:
1506 ALOGE("Received unknown command code %d", cmd);
1507 return;
1508 }
1509
1510 return;
1511
1512 wronglen:
1513 ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
1514 }
1515
ctrl_data_handler(int data,uint32_t events,struct polling_params * poll_params __unused)1516 static void ctrl_data_handler(int data, uint32_t events,
1517 struct polling_params *poll_params __unused) {
1518 if (events & EPOLLIN) {
1519 ctrl_command_handler(data);
1520 }
1521 }
1522
get_free_dsock()1523 static int get_free_dsock() {
1524 for (int i = 0; i < MAX_DATA_CONN; i++) {
1525 if (data_sock[i].sock < 0) {
1526 return i;
1527 }
1528 }
1529 return -1;
1530 }
1531
ctrl_connect_handler(int data __unused,uint32_t events __unused,struct polling_params * poll_params __unused)1532 static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
1533 struct polling_params *poll_params __unused) {
1534 struct epoll_event epev;
1535 int free_dscock_idx = get_free_dsock();
1536
1537 if (free_dscock_idx < 0) {
1538 /*
1539 * Number of data connections exceeded max supported. This should not
1540 * happen but if it does we drop all existing connections and accept
1541 * the new one. This prevents inactive connections from monopolizing
1542 * data socket and if we drop ActivityManager connection it will
1543 * immediately reconnect.
1544 */
1545 for (int i = 0; i < MAX_DATA_CONN; i++) {
1546 ctrl_data_close(i);
1547 }
1548 free_dscock_idx = 0;
1549 }
1550
1551 data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
1552 if (data_sock[free_dscock_idx].sock < 0) {
1553 ALOGE("lmkd control socket accept failed; errno=%d", errno);
1554 return;
1555 }
1556
1557 ALOGI("lmkd data connection established");
1558 /* use data to store data connection idx */
1559 data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
1560 data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
1561 data_sock[free_dscock_idx].async_event_mask = 0;
1562 epev.events = EPOLLIN;
1563 epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
1564 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
1565 ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
1566 ctrl_data_close(free_dscock_idx);
1567 return;
1568 }
1569 maxevents++;
1570 }
1571
1572 /*
1573 * /proc/zoneinfo parsing routines
1574 * Expected file format is:
1575 *
1576 * Node <node_id>, zone <zone_name>
1577 * (
1578 * per-node stats
1579 * (<per-node field name> <value>)+
1580 * )?
1581 * (pages free <value>
1582 * (<per-zone field name> <value>)+
1583 * pagesets
1584 * (<unused fields>)*
1585 * )+
1586 * ...
1587 */
zoneinfo_parse_protection(char * buf,struct zoneinfo_zone * zone)1588 static void zoneinfo_parse_protection(char *buf, struct zoneinfo_zone *zone) {
1589 int zone_idx;
1590 int64_t max = 0;
1591 char *save_ptr;
1592
1593 for (buf = strtok_r(buf, "(), ", &save_ptr), zone_idx = 0;
1594 buf && zone_idx < MAX_NR_ZONES;
1595 buf = strtok_r(NULL, "), ", &save_ptr), zone_idx++) {
1596 long long zoneval = strtoll(buf, &buf, 0);
1597 if (zoneval > max) {
1598 max = (zoneval > INT64_MAX) ? INT64_MAX : zoneval;
1599 }
1600 zone->protection[zone_idx] = zoneval;
1601 }
1602 zone->max_protection = max;
1603 }
1604
zoneinfo_parse_zone(char ** buf,struct zoneinfo_zone * zone)1605 static int zoneinfo_parse_zone(char **buf, struct zoneinfo_zone *zone) {
1606 for (char *line = strtok_r(NULL, "\n", buf); line;
1607 line = strtok_r(NULL, "\n", buf)) {
1608 char *cp;
1609 char *ap;
1610 char *save_ptr;
1611 int64_t val;
1612 int field_idx;
1613 enum field_match_result match_res;
1614
1615 cp = strtok_r(line, " ", &save_ptr);
1616 if (!cp) {
1617 return false;
1618 }
1619
1620 field_idx = find_field(cp, zoneinfo_zone_spec_field_names, ZI_ZONE_SPEC_FIELD_COUNT);
1621 if (field_idx >= 0) {
1622 /* special field */
1623 if (field_idx == ZI_ZONE_SPEC_PAGESETS) {
1624 /* no mode fields we are interested in */
1625 return true;
1626 }
1627
1628 /* protection field */
1629 ap = strtok_r(NULL, ")", &save_ptr);
1630 if (ap) {
1631 zoneinfo_parse_protection(ap, zone);
1632 }
1633 continue;
1634 }
1635
1636 ap = strtok_r(NULL, " ", &save_ptr);
1637 if (!ap) {
1638 continue;
1639 }
1640
1641 match_res = match_field(cp, ap, zoneinfo_zone_field_names, ZI_ZONE_FIELD_COUNT,
1642 &val, &field_idx);
1643 if (match_res == PARSE_FAIL) {
1644 return false;
1645 }
1646 if (match_res == PARSE_SUCCESS) {
1647 zone->fields.arr[field_idx] = val;
1648 }
1649 if (field_idx == ZI_ZONE_PRESENT && val == 0) {
1650 /* zone is not populated, stop parsing it */
1651 return true;
1652 }
1653 }
1654 return false;
1655 }
1656
zoneinfo_parse_node(char ** buf,struct zoneinfo_node * node)1657 static int zoneinfo_parse_node(char **buf, struct zoneinfo_node *node) {
1658 int fields_to_match = ZI_NODE_FIELD_COUNT;
1659
1660 for (char *line = strtok_r(NULL, "\n", buf); line;
1661 line = strtok_r(NULL, "\n", buf)) {
1662 char *cp;
1663 char *ap;
1664 char *save_ptr;
1665 int64_t val;
1666 int field_idx;
1667 enum field_match_result match_res;
1668
1669 cp = strtok_r(line, " ", &save_ptr);
1670 if (!cp) {
1671 return false;
1672 }
1673
1674 ap = strtok_r(NULL, " ", &save_ptr);
1675 if (!ap) {
1676 return false;
1677 }
1678
1679 match_res = match_field(cp, ap, zoneinfo_node_field_names, ZI_NODE_FIELD_COUNT,
1680 &val, &field_idx);
1681 if (match_res == PARSE_FAIL) {
1682 return false;
1683 }
1684 if (match_res == PARSE_SUCCESS) {
1685 node->fields.arr[field_idx] = val;
1686 fields_to_match--;
1687 if (!fields_to_match) {
1688 return true;
1689 }
1690 }
1691 }
1692 return false;
1693 }
1694
zoneinfo_parse(struct zoneinfo * zi)1695 static int zoneinfo_parse(struct zoneinfo *zi) {
1696 static struct reread_data file_data = {
1697 .filename = ZONEINFO_PATH,
1698 .fd = -1,
1699 };
1700 char *buf;
1701 char *save_ptr;
1702 char *line;
1703 char zone_name[LINE_MAX + 1];
1704 struct zoneinfo_node *node = NULL;
1705 int node_idx = 0;
1706 int zone_idx = 0;
1707
1708 memset(zi, 0, sizeof(struct zoneinfo));
1709
1710 if ((buf = reread_file(&file_data)) == NULL) {
1711 return -1;
1712 }
1713
1714 for (line = strtok_r(buf, "\n", &save_ptr); line;
1715 line = strtok_r(NULL, "\n", &save_ptr)) {
1716 int node_id;
1717 if (sscanf(line, "Node %d, zone %" STRINGIFY(LINE_MAX) "s", &node_id, zone_name) == 2) {
1718 if (!node || node->id != node_id) {
1719 /* new node is found */
1720 if (node) {
1721 node->zone_count = zone_idx + 1;
1722 node_idx++;
1723 if (node_idx == MAX_NR_NODES) {
1724 /* max node count exceeded */
1725 ALOGE("%s parse error", file_data.filename);
1726 return -1;
1727 }
1728 }
1729 node = &zi->nodes[node_idx];
1730 node->id = node_id;
1731 zone_idx = 0;
1732 if (!zoneinfo_parse_node(&save_ptr, node)) {
1733 ALOGE("%s parse error", file_data.filename);
1734 return -1;
1735 }
1736 } else {
1737 /* new zone is found */
1738 zone_idx++;
1739 }
1740 if (!zoneinfo_parse_zone(&save_ptr, &node->zones[zone_idx])) {
1741 ALOGE("%s parse error", file_data.filename);
1742 return -1;
1743 }
1744 }
1745 }
1746 if (!node) {
1747 ALOGE("%s parse error", file_data.filename);
1748 return -1;
1749 }
1750 node->zone_count = zone_idx + 1;
1751 zi->node_count = node_idx + 1;
1752
1753 /* calculate totals fields */
1754 for (node_idx = 0; node_idx < zi->node_count; node_idx++) {
1755 node = &zi->nodes[node_idx];
1756 for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
1757 struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx];
1758 zi->totalreserve_pages += zone->max_protection + zone->fields.field.high;
1759 }
1760 zi->total_inactive_file += node->fields.field.nr_inactive_file;
1761 zi->total_active_file += node->fields.field.nr_active_file;
1762 }
1763 return 0;
1764 }
1765
1766 /* /proc/meminfo parsing routines */
meminfo_parse_line(char * line,union meminfo * mi)1767 static bool meminfo_parse_line(char *line, union meminfo *mi) {
1768 char *cp = line;
1769 char *ap;
1770 char *save_ptr;
1771 int64_t val;
1772 int field_idx;
1773 enum field_match_result match_res;
1774
1775 cp = strtok_r(line, " ", &save_ptr);
1776 if (!cp) {
1777 return false;
1778 }
1779
1780 ap = strtok_r(NULL, " ", &save_ptr);
1781 if (!ap) {
1782 return false;
1783 }
1784
1785 match_res = match_field(cp, ap, meminfo_field_names, MI_FIELD_COUNT,
1786 &val, &field_idx);
1787 if (match_res == PARSE_SUCCESS) {
1788 mi->arr[field_idx] = val / page_k;
1789 }
1790 return (match_res != PARSE_FAIL);
1791 }
1792
read_gpu_total_kb()1793 static int64_t read_gpu_total_kb() {
1794 static int fd = android::bpf::bpfFdGet(
1795 "/sys/fs/bpf/map_gpu_mem_gpu_mem_total_map", BPF_F_RDONLY);
1796 static constexpr uint64_t kBpfKeyGpuTotalUsage = 0;
1797 uint64_t value;
1798
1799 if (fd < 0) {
1800 return 0;
1801 }
1802
1803 return android::bpf::findMapEntry(fd, &kBpfKeyGpuTotalUsage, &value)
1804 ? 0
1805 : (int32_t)(value / 1024);
1806 }
1807
meminfo_parse(union meminfo * mi)1808 static int meminfo_parse(union meminfo *mi) {
1809 static struct reread_data file_data = {
1810 .filename = MEMINFO_PATH,
1811 .fd = -1,
1812 };
1813 char *buf;
1814 char *save_ptr;
1815 char *line;
1816
1817 memset(mi, 0, sizeof(union meminfo));
1818
1819 if ((buf = reread_file(&file_data)) == NULL) {
1820 return -1;
1821 }
1822
1823 for (line = strtok_r(buf, "\n", &save_ptr); line;
1824 line = strtok_r(NULL, "\n", &save_ptr)) {
1825 if (!meminfo_parse_line(line, mi)) {
1826 ALOGE("%s parse error", file_data.filename);
1827 return -1;
1828 }
1829 }
1830 mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached +
1831 mi->field.buffers;
1832 mi->field.total_gpu_kb = read_gpu_total_kb();
1833
1834 return 0;
1835 }
1836
1837 /* /proc/vmstat parsing routines */
vmstat_parse_line(char * line,union vmstat * vs)1838 static bool vmstat_parse_line(char *line, union vmstat *vs) {
1839 char *cp;
1840 char *ap;
1841 char *save_ptr;
1842 int64_t val;
1843 int field_idx;
1844 enum field_match_result match_res;
1845
1846 cp = strtok_r(line, " ", &save_ptr);
1847 if (!cp) {
1848 return false;
1849 }
1850
1851 ap = strtok_r(NULL, " ", &save_ptr);
1852 if (!ap) {
1853 return false;
1854 }
1855
1856 match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
1857 &val, &field_idx);
1858 if (match_res == PARSE_SUCCESS) {
1859 vs->arr[field_idx] = val;
1860 }
1861 return (match_res != PARSE_FAIL);
1862 }
1863
vmstat_parse(union vmstat * vs)1864 static int vmstat_parse(union vmstat *vs) {
1865 static struct reread_data file_data = {
1866 .filename = VMSTAT_PATH,
1867 .fd = -1,
1868 };
1869 char *buf;
1870 char *save_ptr;
1871 char *line;
1872
1873 memset(vs, 0, sizeof(union vmstat));
1874
1875 if ((buf = reread_file(&file_data)) == NULL) {
1876 return -1;
1877 }
1878
1879 for (line = strtok_r(buf, "\n", &save_ptr); line;
1880 line = strtok_r(NULL, "\n", &save_ptr)) {
1881 if (!vmstat_parse_line(line, vs)) {
1882 ALOGE("%s parse error", file_data.filename);
1883 return -1;
1884 }
1885 }
1886
1887 return 0;
1888 }
1889
/* Why lmkd woke up: a new event arrived, or a scheduled polling wakeup fired */
enum wakeup_reason {
    Event = 0,
    Polling = 1,
};
1894
/* Wakeup bookkeeping maintained by record_wakeup_time() */
struct wakeup_info {
    /* time of the most recent wakeup */
    struct timespec wakeup_tm;
    /* time of the wakeup before the most recent one */
    struct timespec prev_wakeup_tm;
    /* time of the most recent wakeup caused by an event */
    struct timespec last_event_tm;
    /* number of polling wakeups since the last event */
    int wakeups_since_event;
    /* how many of those wakeups were skipped; reset on every event */
    int skipped_wakeups;
};
1902
1903 /*
1904 * After the initial memory pressure event is received lmkd schedules periodic wakeups to check
1905 * the memory conditions and kill if needed (polling). This is done because pressure events are
1906 * rate-limited and memory conditions can change in between events. Therefore after the initial
1907 * event there might be multiple wakeups. This function records the wakeup information such as the
1908 * timestamps of the last event and the last wakeup, the number of wakeups since the last event
1909 * and how many of those wakeups were skipped (some wakeups are skipped if previously killed
1910 * process is still freeing its memory).
1911 */
record_wakeup_time(struct timespec * tm,enum wakeup_reason reason,struct wakeup_info * wi)1912 static void record_wakeup_time(struct timespec *tm, enum wakeup_reason reason,
1913 struct wakeup_info *wi) {
1914 wi->prev_wakeup_tm = wi->wakeup_tm;
1915 wi->wakeup_tm = *tm;
1916 if (reason == Event) {
1917 wi->last_event_tm = *tm;
1918 wi->wakeups_since_event = 0;
1919 wi->skipped_wakeups = 0;
1920 } else {
1921 wi->wakeups_since_event++;
1922 }
1923 }
1924
killinfo_log(struct proc * procp,int min_oom_score,int rss_kb,int swap_kb,int kill_reason,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm)1925 static void killinfo_log(struct proc* procp, int min_oom_score, int rss_kb,
1926 int swap_kb, int kill_reason, union meminfo *mi,
1927 struct wakeup_info *wi, struct timespec *tm) {
1928 /* log process information */
1929 android_log_write_int32(ctx, procp->pid);
1930 android_log_write_int32(ctx, procp->uid);
1931 android_log_write_int32(ctx, procp->oomadj);
1932 android_log_write_int32(ctx, min_oom_score);
1933 android_log_write_int32(ctx, (int32_t)min(rss_kb, INT32_MAX));
1934 android_log_write_int32(ctx, kill_reason);
1935
1936 /* log meminfo fields */
1937 for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
1938 android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
1939 }
1940
1941 /* log lmkd wakeup information */
1942 android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->last_event_tm, tm));
1943 android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->prev_wakeup_tm, tm));
1944 android_log_write_int32(ctx, wi->wakeups_since_event);
1945 android_log_write_int32(ctx, wi->skipped_wakeups);
1946 android_log_write_int32(ctx, (int32_t)min(swap_kb, INT32_MAX));
1947 android_log_write_int32(ctx, (int32_t)mi->field.total_gpu_kb);
1948
1949 android_log_write_list(ctx, LOG_ID_EVENTS);
1950 android_log_reset(ctx);
1951 }
1952
proc_adj_lru(int oomadj)1953 static struct proc *proc_adj_lru(int oomadj) {
1954 return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
1955 }
1956
proc_get_heaviest(int oomadj)1957 static struct proc *proc_get_heaviest(int oomadj) {
1958 struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
1959 struct adjslot_list *curr = head->next;
1960 struct proc *maxprocp = NULL;
1961 int maxsize = 0;
1962 while (curr != head) {
1963 int pid = ((struct proc *)curr)->pid;
1964 int tasksize = proc_get_size(pid);
1965 if (tasksize < 0) {
1966 struct adjslot_list *next = curr->next;
1967 pid_remove(pid);
1968 curr = next;
1969 } else {
1970 if (tasksize > maxsize) {
1971 maxsize = tasksize;
1972 maxprocp = (struct proc *)curr;
1973 }
1974 curr = curr->next;
1975 }
1976 }
1977 return maxprocp;
1978 }
1979
set_process_group_and_prio(int pid,SchedPolicy sp,int prio)1980 static void set_process_group_and_prio(int pid, SchedPolicy sp, int prio) {
1981 DIR* d;
1982 char proc_path[PATH_MAX];
1983 struct dirent* de;
1984
1985 snprintf(proc_path, sizeof(proc_path), "/proc/%d/task", pid);
1986 if (!(d = opendir(proc_path))) {
1987 ALOGW("Failed to open %s; errno=%d: process pid(%d) might have died", proc_path, errno,
1988 pid);
1989 return;
1990 }
1991
1992 while ((de = readdir(d))) {
1993 int t_pid;
1994
1995 if (de->d_name[0] == '.') continue;
1996 t_pid = atoi(de->d_name);
1997
1998 if (!t_pid) {
1999 ALOGW("Failed to get t_pid for '%s' of pid(%d)", de->d_name, pid);
2000 continue;
2001 }
2002
2003 if (setpriority(PRIO_PROCESS, t_pid, prio) && errno != ESRCH) {
2004 ALOGW("Unable to raise priority of killing t_pid (%d): errno=%d", t_pid, errno);
2005 }
2006
2007 if (set_cpuset_policy(t_pid, sp)) {
2008 ALOGW("Failed to set_cpuset_policy on pid(%d) t_pid(%d) to %d", pid, t_pid, (int)sp);
2009 continue;
2010 }
2011 }
2012 closedir(d);
2013 }
2014
is_kill_pending(void)2015 static bool is_kill_pending(void) {
2016 char buf[24];
2017
2018 if (last_kill_pid_or_fd < 0) {
2019 return false;
2020 }
2021
2022 if (pidfd_supported) {
2023 return true;
2024 }
2025
2026 /* when pidfd is not supported base the decision on /proc/<pid> existence */
2027 snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd);
2028 if (access(buf, F_OK) == 0) {
2029 return true;
2030 }
2031
2032 return false;
2033 }
2034
is_waiting_for_kill(void)2035 static bool is_waiting_for_kill(void) {
2036 return pidfd_supported && last_kill_pid_or_fd >= 0;
2037 }
2038
stop_wait_for_proc_kill(bool finished)2039 static void stop_wait_for_proc_kill(bool finished) {
2040 struct epoll_event epev;
2041
2042 if (last_kill_pid_or_fd < 0) {
2043 return;
2044 }
2045
2046 if (debug_process_killing) {
2047 struct timespec curr_tm;
2048
2049 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2050 /*
2051 * curr_tm is used here merely to report kill duration, so this failure is not fatal.
2052 * Log an error and continue.
2053 */
2054 ALOGE("Failed to get current time");
2055 }
2056
2057 if (finished) {
2058 ALOGI("Process got killed in %ldms",
2059 get_time_diff_ms(&last_kill_tm, &curr_tm));
2060 } else {
2061 ALOGI("Stop waiting for process kill after %ldms",
2062 get_time_diff_ms(&last_kill_tm, &curr_tm));
2063 }
2064 }
2065
2066 if (pidfd_supported) {
2067 /* unregister fd */
2068 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev)) {
2069 // Log an error and keep going
2070 ALOGE("epoll_ctl for last killed process failed; errno=%d", errno);
2071 }
2072 maxevents--;
2073 close(last_kill_pid_or_fd);
2074 }
2075
2076 last_kill_pid_or_fd = -1;
2077 }
2078
kill_done_handler(int data __unused,uint32_t events __unused,struct polling_params * poll_params)2079 static void kill_done_handler(int data __unused, uint32_t events __unused,
2080 struct polling_params *poll_params) {
2081 stop_wait_for_proc_kill(true);
2082 poll_params->update = POLLING_RESUME;
2083 }
2084
start_wait_for_proc_kill(int pid_or_fd)2085 static void start_wait_for_proc_kill(int pid_or_fd) {
2086 static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler };
2087 struct epoll_event epev;
2088
2089 if (last_kill_pid_or_fd >= 0) {
2090 /* Should not happen but if it does we should stop previous wait */
2091 ALOGE("Attempt to wait for a kill while another wait is in progress");
2092 stop_wait_for_proc_kill(false);
2093 }
2094
2095 last_kill_pid_or_fd = pid_or_fd;
2096
2097 if (!pidfd_supported) {
2098 /* If pidfd is not supported just store PID and exit */
2099 return;
2100 }
2101
2102 epev.events = EPOLLIN;
2103 epev.data.ptr = (void *)&kill_done_hinfo;
2104 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) {
2105 ALOGE("epoll_ctl for last kill failed; errno=%d", errno);
2106 close(last_kill_pid_or_fd);
2107 last_kill_pid_or_fd = -1;
2108 return;
2109 }
2110 maxevents++;
2111 }
2112
2113 struct kill_info {
2114 enum kill_reasons kill_reason;
2115 const char *kill_desc;
2116 int thrashing;
2117 int max_thrashing;
2118 };
2119
2120 /* Kill one process specified by procp. Returns the size (in pages) of the process killed */
kill_one_process(struct proc * procp,int min_oom_score,struct kill_info * ki,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm)2121 static int kill_one_process(struct proc* procp, int min_oom_score, struct kill_info *ki,
2122 union meminfo *mi, struct wakeup_info *wi, struct timespec *tm) {
2123 int pid = procp->pid;
2124 int pidfd = procp->pidfd;
2125 uid_t uid = procp->uid;
2126 char *taskname;
2127 int r;
2128 int result = -1;
2129 struct memory_stat *mem_st;
2130 struct kill_stat kill_st;
2131 int64_t tgid;
2132 int64_t rss_kb;
2133 int64_t swap_kb;
2134 char buf[PAGE_SIZE];
2135
2136 if (!read_proc_status(pid, buf, sizeof(buf))) {
2137 goto out;
2138 }
2139 if (!parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid)) {
2140 ALOGE("Unable to parse tgid from /proc/%d/status", pid);
2141 goto out;
2142 }
2143 if (tgid != pid) {
2144 ALOGE("Possible pid reuse detected (pid %d, tgid %" PRId64 ")!", pid, tgid);
2145 goto out;
2146 }
2147 // Zombie processes will not have RSS / Swap fields.
2148 if (!parse_status_tag(buf, PROC_STATUS_RSS_FIELD, &rss_kb)) {
2149 goto out;
2150 }
2151 if (!parse_status_tag(buf, PROC_STATUS_SWAP_FIELD, &swap_kb)) {
2152 goto out;
2153 }
2154
2155 taskname = proc_get_name(pid, buf, sizeof(buf));
2156 // taskname will point inside buf, do not reuse buf onwards.
2157 if (!taskname) {
2158 goto out;
2159 }
2160
2161 mem_st = stats_read_memory_stat(per_app_memcg, pid, uid, rss_kb * 1024, swap_kb * 1024);
2162
2163 TRACE_KILL_START(pid);
2164
2165 /* CAP_KILL required */
2166 if (pidfd < 0) {
2167 start_wait_for_proc_kill(pid);
2168 r = kill(pid, SIGKILL);
2169 } else {
2170 start_wait_for_proc_kill(pidfd);
2171 r = pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
2172 }
2173
2174 TRACE_KILL_END();
2175
2176 if (r) {
2177 stop_wait_for_proc_kill(false);
2178 ALOGE("kill(%d): errno=%d", pid, errno);
2179 /* Delete process record even when we fail to kill so that we don't get stuck on it */
2180 goto out;
2181 }
2182
2183 set_process_group_and_prio(pid, SP_FOREGROUND, ANDROID_PRIORITY_HIGHEST);
2184
2185 last_kill_tm = *tm;
2186
2187 inc_killcnt(procp->oomadj);
2188
2189 if (ki) {
2190 kill_st.kill_reason = ki->kill_reason;
2191 kill_st.thrashing = ki->thrashing;
2192 kill_st.max_thrashing = ki->max_thrashing;
2193 killinfo_log(procp, min_oom_score, rss_kb, swap_kb, ki->kill_reason, mi, wi, tm);
2194 ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2195 "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb,
2196 ki->kill_desc);
2197 } else {
2198 kill_st.kill_reason = NONE;
2199 kill_st.thrashing = 0;
2200 kill_st.max_thrashing = 0;
2201 killinfo_log(procp, min_oom_score, rss_kb, swap_kb, NONE, mi, wi, tm);
2202 ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2203 "kb swap", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb);
2204 }
2205
2206 kill_st.uid = static_cast<int32_t>(uid);
2207 kill_st.taskname = taskname;
2208 kill_st.oom_score = procp->oomadj;
2209 kill_st.min_oom_score = min_oom_score;
2210 kill_st.free_mem_kb = mi->field.nr_free_pages * page_k;
2211 kill_st.free_swap_kb = mi->field.free_swap * page_k;
2212 stats_write_lmk_kill_occurred(&kill_st, mem_st);
2213
2214 ctrl_data_write_lmk_kill_occurred((pid_t)pid, uid);
2215
2216 result = rss_kb / page_k;
2217
2218 out:
2219 /*
2220 * WARNING: After pid_remove() procp is freed and can't be used!
2221 * Therefore placed at the end of the function.
2222 */
2223 pid_remove(pid);
2224 return result;
2225 }
2226
2227 /*
2228 * Find one process to kill at or above the given oom_score_adj level.
2229 * Returns size of the killed process.
2230 */
find_and_kill_process(int min_score_adj,struct kill_info * ki,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm)2231 static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi,
2232 struct wakeup_info *wi, struct timespec *tm) {
2233 int i;
2234 int killed_size = 0;
2235 bool lmk_state_change_start = false;
2236 bool choose_heaviest_task = kill_heaviest_task;
2237
2238 for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
2239 struct proc *procp;
2240
2241 if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {
2242 /*
2243 * If we have to choose a perceptible process, choose the heaviest one to
2244 * hopefully minimize the number of victims.
2245 */
2246 choose_heaviest_task = true;
2247 }
2248
2249 while (true) {
2250 procp = choose_heaviest_task ?
2251 proc_get_heaviest(i) : proc_adj_lru(i);
2252
2253 if (!procp)
2254 break;
2255
2256 killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm);
2257 if (killed_size >= 0) {
2258 if (!lmk_state_change_start) {
2259 lmk_state_change_start = true;
2260 stats_write_lmk_state_changed(STATE_START);
2261 }
2262 break;
2263 }
2264 }
2265 if (killed_size) {
2266 break;
2267 }
2268 }
2269
2270 if (lmk_state_change_start) {
2271 stats_write_lmk_state_changed(STATE_STOP);
2272 }
2273
2274 return killed_size;
2275 }
2276
get_memory_usage(struct reread_data * file_data)2277 static int64_t get_memory_usage(struct reread_data *file_data) {
2278 int64_t mem_usage;
2279 char *buf;
2280
2281 if ((buf = reread_file(file_data)) == NULL) {
2282 return -1;
2283 }
2284
2285 if (!parse_int64(buf, &mem_usage)) {
2286 ALOGE("%s parse error", file_data->filename);
2287 return -1;
2288 }
2289 if (mem_usage == 0) {
2290 ALOGE("No memory!");
2291 return -1;
2292 }
2293 return mem_usage;
2294 }
2295
record_low_pressure_levels(union meminfo * mi)2296 void record_low_pressure_levels(union meminfo *mi) {
2297 if (low_pressure_mem.min_nr_free_pages == -1 ||
2298 low_pressure_mem.min_nr_free_pages > mi->field.nr_free_pages) {
2299 if (debug_process_killing) {
2300 ALOGI("Low pressure min memory update from %" PRId64 " to %" PRId64,
2301 low_pressure_mem.min_nr_free_pages, mi->field.nr_free_pages);
2302 }
2303 low_pressure_mem.min_nr_free_pages = mi->field.nr_free_pages;
2304 }
2305 /*
2306 * Free memory at low vmpressure events occasionally gets spikes,
2307 * possibly a stale low vmpressure event with memory already
2308 * freed up (no memory pressure should have been reported).
2309 * Ignore large jumps in max_nr_free_pages that would mess up our stats.
2310 */
2311 if (low_pressure_mem.max_nr_free_pages == -1 ||
2312 (low_pressure_mem.max_nr_free_pages < mi->field.nr_free_pages &&
2313 mi->field.nr_free_pages - low_pressure_mem.max_nr_free_pages <
2314 low_pressure_mem.max_nr_free_pages * 0.1)) {
2315 if (debug_process_killing) {
2316 ALOGI("Low pressure max memory update from %" PRId64 " to %" PRId64,
2317 low_pressure_mem.max_nr_free_pages, mi->field.nr_free_pages);
2318 }
2319 low_pressure_mem.max_nr_free_pages = mi->field.nr_free_pages;
2320 }
2321 }
2322
upgrade_level(enum vmpressure_level level)2323 enum vmpressure_level upgrade_level(enum vmpressure_level level) {
2324 return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ?
2325 level + 1 : level);
2326 }
2327
downgrade_level(enum vmpressure_level level)2328 enum vmpressure_level downgrade_level(enum vmpressure_level level) {
2329 return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ?
2330 level - 1 : level);
2331 }
2332
/*
 * Identifies a breached zone watermark. Values are ordered so that a lower
 * value means a lower watermark; callers compare them with < and >.
 * WMARK_NONE means no watermark is breached.
 */
enum zone_watermark {
    WMARK_MIN = 0,
    WMARK_LOW = 1,
    WMARK_HIGH = 2,
    WMARK_NONE = 3
};
2339
/* Watermark totals (in pages) accumulated over all present zones. */
struct zone_watermarks {
    /* Sum of per-zone high watermarks plus zone protection */
    long high_wmark;
    /* Sum of per-zone low watermarks plus zone protection */
    long low_wmark;
    /* Sum of per-zone min watermarks plus zone protection */
    long min_wmark;
};
2345
2346 /*
2347 * Returns lowest breached watermark or WMARK_NONE.
2348 */
get_lowest_watermark(union meminfo * mi,struct zone_watermarks * watermarks)2349 static enum zone_watermark get_lowest_watermark(union meminfo *mi,
2350 struct zone_watermarks *watermarks)
2351 {
2352 int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;
2353
2354 if (nr_free_pages < watermarks->min_wmark) {
2355 return WMARK_MIN;
2356 }
2357 if (nr_free_pages < watermarks->low_wmark) {
2358 return WMARK_LOW;
2359 }
2360 if (nr_free_pages < watermarks->high_wmark) {
2361 return WMARK_HIGH;
2362 }
2363 return WMARK_NONE;
2364 }
2365
calc_zone_watermarks(struct zoneinfo * zi,struct zone_watermarks * watermarks)2366 void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
2367 memset(watermarks, 0, sizeof(struct zone_watermarks));
2368
2369 for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
2370 struct zoneinfo_node *node = &zi->nodes[node_idx];
2371 for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
2372 struct zoneinfo_zone *zone = &node->zones[zone_idx];
2373
2374 if (!zone->fields.field.present) {
2375 continue;
2376 }
2377
2378 watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
2379 watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
2380 watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
2381 }
2382 }
2383 }
2384
calc_swap_utilization(union meminfo * mi)2385 static int calc_swap_utilization(union meminfo *mi) {
2386 int64_t swap_used = mi->field.total_swap - mi->field.free_swap;
2387 int64_t total_swappable = mi->field.active_anon + mi->field.inactive_anon +
2388 mi->field.shmem + swap_used;
2389 return total_swappable > 0 ? (swap_used * 100) / total_swappable : 0;
2390 }
2391
mp_event_psi(int data,uint32_t events,struct polling_params * poll_params)2392 static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
2393 enum reclaim_state {
2394 NO_RECLAIM = 0,
2395 KSWAPD_RECLAIM,
2396 DIRECT_RECLAIM,
2397 };
2398 static int64_t init_ws_refault;
2399 static int64_t prev_workingset_refault;
2400 static int64_t base_file_lru;
2401 static int64_t init_pgscan_kswapd;
2402 static int64_t init_pgscan_direct;
2403 static int64_t swap_low_threshold;
2404 static bool killing;
2405 static int thrashing_limit = thrashing_limit_pct;
2406 static struct zone_watermarks watermarks;
2407 static struct timespec wmark_update_tm;
2408 static struct wakeup_info wi;
2409 static struct timespec thrashing_reset_tm;
2410 static int64_t prev_thrash_growth = 0;
2411 static bool check_filecache = false;
2412 static int max_thrashing = 0;
2413
2414 union meminfo mi;
2415 union vmstat vs;
2416 struct timespec curr_tm;
2417 int64_t thrashing = 0;
2418 bool swap_is_low = false;
2419 enum vmpressure_level level = (enum vmpressure_level)data;
2420 enum kill_reasons kill_reason = NONE;
2421 bool cycle_after_kill = false;
2422 enum reclaim_state reclaim = NO_RECLAIM;
2423 enum zone_watermark wmark = WMARK_NONE;
2424 char kill_desc[LINE_MAX];
2425 bool cut_thrashing_limit = false;
2426 int min_score_adj = 0;
2427 int swap_util = 0;
2428 long since_thrashing_reset_ms;
2429 int64_t workingset_refault_file;
2430
2431 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2432 ALOGE("Failed to get current time");
2433 return;
2434 }
2435
2436 record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2437
2438 bool kill_pending = is_kill_pending();
2439 if (kill_pending && (kill_timeout_ms == 0 ||
2440 get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms))) {
2441 /* Skip while still killing a process */
2442 wi.skipped_wakeups++;
2443 goto no_kill;
2444 }
2445 /*
2446 * Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are
2447 * supported and death notification already caused waiting to stop.
2448 */
2449 stop_wait_for_proc_kill(!kill_pending);
2450
2451 if (vmstat_parse(&vs) < 0) {
2452 ALOGE("Failed to parse vmstat!");
2453 return;
2454 }
2455 /* Starting 5.9 kernel workingset_refault vmstat field was renamed workingset_refault_file */
2456 workingset_refault_file = vs.field.workingset_refault ? : vs.field.workingset_refault_file;
2457
2458 if (meminfo_parse(&mi) < 0) {
2459 ALOGE("Failed to parse meminfo!");
2460 return;
2461 }
2462
2463 /* Reset states after process got killed */
2464 if (killing) {
2465 killing = false;
2466 cycle_after_kill = true;
2467 /* Reset file-backed pagecache size and refault amounts after a kill */
2468 base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2469 init_ws_refault = workingset_refault_file;
2470 thrashing_reset_tm = curr_tm;
2471 prev_thrash_growth = 0;
2472 }
2473
2474 /* Check free swap levels */
2475 if (swap_free_low_percentage) {
2476 if (!swap_low_threshold) {
2477 swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
2478 }
2479 swap_is_low = mi.field.free_swap < swap_low_threshold;
2480 }
2481
2482 /* Identify reclaim state */
2483 if (vs.field.pgscan_direct > init_pgscan_direct) {
2484 init_pgscan_direct = vs.field.pgscan_direct;
2485 init_pgscan_kswapd = vs.field.pgscan_kswapd;
2486 reclaim = DIRECT_RECLAIM;
2487 } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
2488 init_pgscan_kswapd = vs.field.pgscan_kswapd;
2489 reclaim = KSWAPD_RECLAIM;
2490 } else if (workingset_refault_file == prev_workingset_refault) {
2491 /*
2492 * Device is not thrashing and not reclaiming, bail out early until we see these stats
2493 * changing
2494 */
2495 goto no_kill;
2496 }
2497
2498 prev_workingset_refault = workingset_refault_file;
2499
2500 /*
2501 * It's possible we fail to find an eligible process to kill (ex. no process is
2502 * above oom_adj_min). When this happens, we should retry to find a new process
2503 * for a kill whenever a new eligible process is available. This is especially
2504 * important for a slow growing refault case. While retrying, we should keep
2505 * monitoring new thrashing counter as someone could release the memory to mitigate
2506 * the thrashing. Thus, when thrashing reset window comes, we decay the prev thrashing
2507 * counter by window counts. If the counter is still greater than thrashing limit,
2508 * we preserve the current prev_thrash counter so we will retry kill again. Otherwise,
2509 * we reset the prev_thrash counter so we will stop retrying.
2510 */
2511 since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm);
2512 if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) {
2513 long windows_passed;
2514 /* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */
2515 prev_thrash_growth = (workingset_refault_file - init_ws_refault) * 100
2516 / (base_file_lru + 1);
2517 windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS);
2518 /*
2519 * Decay prev_thrashing unless over-the-limit thrashing was registered in the window we
2520 * just crossed, which means there were no eligible processes to kill. We preserve the
2521 * counter in that case to ensure a kill if a new eligible process appears.
2522 */
2523 if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) {
2524 prev_thrash_growth >>= windows_passed;
2525 }
2526
2527 /* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */
2528 base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2529 init_ws_refault = workingset_refault_file;
2530 thrashing_reset_tm = curr_tm;
2531 thrashing_limit = thrashing_limit_pct;
2532 } else {
2533 /* Calculate what % of the file-backed pagecache refaulted so far */
2534 thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1);
2535 }
2536 /* Add previous cycle's decayed thrashing amount */
2537 thrashing += prev_thrash_growth;
2538 if (max_thrashing < thrashing) {
2539 max_thrashing = thrashing;
2540 }
2541
2542 /*
2543 * Refresh watermarks once per min in case user updated one of the margins.
2544 * TODO: b/140521024 replace this periodic update with an API for AMS to notify LMKD
2545 * that zone watermarks were changed by the system software.
2546 */
2547 if (watermarks.high_wmark == 0 || get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000) {
2548 struct zoneinfo zi;
2549
2550 if (zoneinfo_parse(&zi) < 0) {
2551 ALOGE("Failed to parse zoneinfo!");
2552 return;
2553 }
2554
2555 calc_zone_watermarks(&zi, &watermarks);
2556 wmark_update_tm = curr_tm;
2557 }
2558
2559 /* Find out which watermark is breached if any */
2560 wmark = get_lowest_watermark(&mi, &watermarks);
2561
2562 /*
2563 * TODO: move this logic into a separate function
2564 * Decide if killing a process is necessary and record the reason
2565 */
2566 if (cycle_after_kill && wmark < WMARK_LOW) {
2567 /*
2568 * Prevent kills not freeing enough memory which might lead to OOM kill.
2569 * This might happen when a process is consuming memory faster than reclaim can
2570 * free even after a kill. Mostly happens when running memory stress tests.
2571 */
2572 kill_reason = PRESSURE_AFTER_KILL;
2573 strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
2574 } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
2575 /*
2576 * Device is too busy reclaiming memory which might lead to ANR.
2577 * Critical level is triggered when PSI complete stall (all tasks are blocked because
2578 * of the memory congestion) breaches the configured threshold.
2579 */
2580 kill_reason = NOT_RESPONDING;
2581 strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
2582 } else if (swap_is_low && thrashing > thrashing_limit_pct) {
2583 /* Page cache is thrashing while swap is low */
2584 kill_reason = LOW_SWAP_AND_THRASHING;
2585 snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
2586 "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
2587 mi.field.free_swap * page_k, swap_low_threshold * page_k, thrashing);
2588 /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2589 if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2590 min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2591 }
2592 check_filecache = true;
2593 } else if (swap_is_low && wmark < WMARK_HIGH) {
2594 /* Both free memory and swap are low */
2595 kill_reason = LOW_MEM_AND_SWAP;
2596 snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
2597 PRId64 "kB < %" PRId64 "kB)", wmark < WMARK_LOW ? "min" : "low",
2598 mi.field.free_swap * page_k, swap_low_threshold * page_k);
2599 /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2600 if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2601 min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2602 }
2603 } else if (wmark < WMARK_HIGH && swap_util_max < 100 &&
2604 (swap_util = calc_swap_utilization(&mi)) > swap_util_max) {
2605 /*
2606 * Too much anon memory is swapped out but swap is not low.
2607 * Non-swappable allocations created memory pressure.
2608 */
2609 kill_reason = LOW_MEM_AND_SWAP_UTIL;
2610 snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap utilization"
2611 " is high (%d%% > %d%%)", wmark < WMARK_LOW ? "min" : "low",
2612 swap_util, swap_util_max);
2613 } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
2614 /* Page cache is thrashing while memory is low */
2615 kill_reason = LOW_MEM_AND_THRASHING;
2616 snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
2617 PRId64 "%%)", wmark < WMARK_LOW ? "min" : "low", thrashing);
2618 cut_thrashing_limit = true;
2619 /* Do not kill perceptible apps unless thrashing at critical levels */
2620 if (thrashing < thrashing_critical_pct) {
2621 min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2622 }
2623 check_filecache = true;
2624 } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
2625 /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
2626 kill_reason = DIRECT_RECL_AND_THRASHING;
2627 snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
2628 PRId64 "%%)", thrashing);
2629 cut_thrashing_limit = true;
2630 /* Do not kill perceptible apps unless thrashing at critical levels */
2631 if (thrashing < thrashing_critical_pct) {
2632 min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2633 }
2634 check_filecache = true;
2635 } else if (check_filecache) {
2636 int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;
2637
2638 if (file_lru_kb < filecache_min_kb) {
2639 /* File cache is too low after thrashing, keep killing background processes */
2640 kill_reason = LOW_FILECACHE_AFTER_THRASHING;
2641 snprintf(kill_desc, sizeof(kill_desc),
2642 "filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing",
2643 file_lru_kb, filecache_min_kb);
2644 min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2645 } else {
2646 /* File cache is big enough, stop checking */
2647 check_filecache = false;
2648 }
2649 }
2650
2651 /* Kill a process if necessary */
2652 if (kill_reason != NONE) {
2653 struct kill_info ki = {
2654 .kill_reason = kill_reason,
2655 .kill_desc = kill_desc,
2656 .thrashing = (int)thrashing,
2657 .max_thrashing = max_thrashing,
2658 };
2659 int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm);
2660 if (pages_freed > 0) {
2661 killing = true;
2662 max_thrashing = 0;
2663 if (cut_thrashing_limit) {
2664 /*
2665 * Cut thrasing limit by thrashing_limit_decay_pct percentage of the current
2666 * thrashing limit until the system stops thrashing.
2667 */
2668 thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
2669 }
2670 }
2671 }
2672
2673 no_kill:
2674 /* Do not poll if kernel supports pidfd waiting */
2675 if (is_waiting_for_kill()) {
2676 /* Pause polling if we are waiting for process death notification */
2677 poll_params->update = POLLING_PAUSE;
2678 return;
2679 }
2680
2681 /*
2682 * Start polling after initial PSI event;
2683 * extend polling while device is in direct reclaim or process is being killed;
2684 * do not extend when kswapd reclaims because that might go on for a long time
2685 * without causing memory pressure
2686 */
2687 if (events || killing || reclaim == DIRECT_RECLAIM) {
2688 poll_params->update = POLLING_START;
2689 }
2690
2691 /* Decide the polling interval */
2692 if (swap_is_low || killing) {
2693 /* Fast polling during and after a kill or when swap is low */
2694 poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2695 } else {
2696 /* By default use long intervals */
2697 poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
2698 }
2699 }
2700
mp_event_common(int data,uint32_t events,struct polling_params * poll_params)2701 static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
2702 unsigned long long evcount;
2703 int64_t mem_usage, memsw_usage;
2704 int64_t mem_pressure;
2705 union meminfo mi;
2706 struct zoneinfo zi;
2707 struct timespec curr_tm;
2708 static unsigned long kill_skip_count = 0;
2709 enum vmpressure_level level = (enum vmpressure_level)data;
2710 long other_free = 0, other_file = 0;
2711 int min_score_adj;
2712 int minfree = 0;
2713 static struct reread_data mem_usage_file_data = {
2714 .filename = MEMCG_MEMORY_USAGE,
2715 .fd = -1,
2716 };
2717 static struct reread_data memsw_usage_file_data = {
2718 .filename = MEMCG_MEMORYSW_USAGE,
2719 .fd = -1,
2720 };
2721 static struct wakeup_info wi;
2722
2723 if (debug_process_killing) {
2724 ALOGI("%s memory pressure event is triggered", level_name[level]);
2725 }
2726
2727 if (!use_psi_monitors) {
2728 /*
2729 * Check all event counters from low to critical
2730 * and upgrade to the highest priority one. By reading
2731 * eventfd we also reset the event counters.
2732 */
2733 for (int lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) {
2734 if (mpevfd[lvl] != -1 &&
2735 TEMP_FAILURE_RETRY(read(mpevfd[lvl],
2736 &evcount, sizeof(evcount))) > 0 &&
2737 evcount > 0 && lvl > level) {
2738 level = static_cast<vmpressure_level>(lvl);
2739 }
2740 }
2741 }
2742
2743 /* Start polling after initial PSI event */
2744 if (use_psi_monitors && events) {
2745 /* Override polling params only if current event is more critical */
2746 if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
2747 poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2748 poll_params->update = POLLING_START;
2749 }
2750 }
2751
2752 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2753 ALOGE("Failed to get current time");
2754 return;
2755 }
2756
2757 record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2758
2759 if (kill_timeout_ms &&
2760 get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms)) {
2761 /*
2762 * If we're within the no-kill timeout, see if there's pending reclaim work
2763 * from the last killed process. If so, skip killing for now.
2764 */
2765 if (is_kill_pending()) {
2766 kill_skip_count++;
2767 wi.skipped_wakeups++;
2768 return;
2769 }
2770 /*
2771 * Process is dead, stop waiting. This has no effect if pidfds are supported and
2772 * death notification already caused waiting to stop.
2773 */
2774 stop_wait_for_proc_kill(true);
2775 } else {
2776 /*
2777 * Killing took longer than no-kill timeout. Stop waiting for the last process
2778 * to die because we are ready to kill again.
2779 */
2780 stop_wait_for_proc_kill(false);
2781 }
2782
2783 if (kill_skip_count > 0) {
2784 ALOGI("%lu memory pressure events were skipped after a kill!",
2785 kill_skip_count);
2786 kill_skip_count = 0;
2787 }
2788
2789 if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) {
2790 ALOGE("Failed to get free memory!");
2791 return;
2792 }
2793
2794 if (use_minfree_levels) {
2795 int i;
2796
2797 other_free = mi.field.nr_free_pages - zi.totalreserve_pages;
2798 if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) {
2799 other_file = (mi.field.nr_file_pages - mi.field.shmem -
2800 mi.field.unevictable - mi.field.swap_cached);
2801 } else {
2802 other_file = 0;
2803 }
2804
2805 min_score_adj = OOM_SCORE_ADJ_MAX + 1;
2806 for (i = 0; i < lowmem_targets_size; i++) {
2807 minfree = lowmem_minfree[i];
2808 if (other_free < minfree && other_file < minfree) {
2809 min_score_adj = lowmem_adj[i];
2810 break;
2811 }
2812 }
2813
2814 if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
2815 if (debug_process_killing) {
2816 ALOGI("Ignore %s memory pressure event "
2817 "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)",
2818 level_name[level], other_free * page_k, other_file * page_k,
2819 (long)lowmem_minfree[lowmem_targets_size - 1] * page_k);
2820 }
2821 return;
2822 }
2823
2824 goto do_kill;
2825 }
2826
2827 if (level == VMPRESS_LEVEL_LOW) {
2828 record_low_pressure_levels(&mi);
2829 }
2830
2831 if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) {
2832 /* Do not monitor this pressure level */
2833 return;
2834 }
2835
2836 if ((mem_usage = get_memory_usage(&mem_usage_file_data)) < 0) {
2837 goto do_kill;
2838 }
2839 if ((memsw_usage = get_memory_usage(&memsw_usage_file_data)) < 0) {
2840 goto do_kill;
2841 }
2842
2843 // Calculate percent for swappinness.
2844 mem_pressure = (mem_usage * 100) / memsw_usage;
2845
2846 if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) {
2847 // We are swapping too much.
2848 if (mem_pressure < upgrade_pressure) {
2849 level = upgrade_level(level);
2850 if (debug_process_killing) {
2851 ALOGI("Event upgraded to %s", level_name[level]);
2852 }
2853 }
2854 }
2855
2856 // If we still have enough swap space available, check if we want to
2857 // ignore/downgrade pressure events.
2858 if (mi.field.free_swap >=
2859 mi.field.total_swap * swap_free_low_percentage / 100) {
2860 // If the pressure is larger than downgrade_pressure lmk will not
2861 // kill any process, since enough memory is available.
2862 if (mem_pressure > downgrade_pressure) {
2863 if (debug_process_killing) {
2864 ALOGI("Ignore %s memory pressure", level_name[level]);
2865 }
2866 return;
2867 } else if (level == VMPRESS_LEVEL_CRITICAL && mem_pressure > upgrade_pressure) {
2868 if (debug_process_killing) {
2869 ALOGI("Downgrade critical memory pressure");
2870 }
2871 // Downgrade event, since enough memory available.
2872 level = downgrade_level(level);
2873 }
2874 }
2875
2876 do_kill:
2877 if (low_ram_device) {
2878 /* For Go devices kill only one task */
2879 if (find_and_kill_process(level_oomadj[level], NULL, &mi, &wi, &curr_tm) == 0) {
2880 if (debug_process_killing) {
2881 ALOGI("Nothing to kill");
2882 }
2883 }
2884 } else {
2885 int pages_freed;
2886 static struct timespec last_report_tm;
2887 static unsigned long report_skip_count = 0;
2888
2889 if (!use_minfree_levels) {
2890 /* Free up enough memory to downgrate the memory pressure to low level */
2891 if (mi.field.nr_free_pages >= low_pressure_mem.max_nr_free_pages) {
2892 if (debug_process_killing) {
2893 ALOGI("Ignoring pressure since more memory is "
2894 "available (%" PRId64 ") than watermark (%" PRId64 ")",
2895 mi.field.nr_free_pages, low_pressure_mem.max_nr_free_pages);
2896 }
2897 return;
2898 }
2899 min_score_adj = level_oomadj[level];
2900 }
2901
2902 pages_freed = find_and_kill_process(min_score_adj, NULL, &mi, &wi, &curr_tm);
2903
2904 if (pages_freed == 0) {
2905 /* Rate limit kill reports when nothing was reclaimed */
2906 if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
2907 report_skip_count++;
2908 return;
2909 }
2910 }
2911
2912 /* Log whenever we kill or when report rate limit allows */
2913 if (use_minfree_levels) {
2914 ALOGI("Reclaimed %ldkB, cache(%ldkB) and free(%" PRId64 "kB)-reserved(%" PRId64 "kB) "
2915 "below min(%ldkB) for oom_score_adj %d",
2916 pages_freed * page_k,
2917 other_file * page_k, mi.field.nr_free_pages * page_k,
2918 zi.totalreserve_pages * page_k,
2919 minfree * page_k, min_score_adj);
2920 } else {
2921 ALOGI("Reclaimed %ldkB at oom_score_adj %d", pages_freed * page_k, min_score_adj);
2922 }
2923
2924 if (report_skip_count > 0) {
2925 ALOGI("Suppressed %lu failed kill reports", report_skip_count);
2926 report_skip_count = 0;
2927 }
2928
2929 last_report_tm = curr_tm;
2930 }
2931 if (is_waiting_for_kill()) {
2932 /* pause polling if we are waiting for process death notification */
2933 poll_params->update = POLLING_PAUSE;
2934 }
2935 }
2936
init_mp_psi(enum vmpressure_level level,bool use_new_strategy)2937 static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
2938 int fd;
2939
2940 /* Do not register a handler if threshold_ms is not set */
2941 if (!psi_thresholds[level].threshold_ms) {
2942 return true;
2943 }
2944
2945 fd = init_psi_monitor(psi_thresholds[level].stall_type,
2946 psi_thresholds[level].threshold_ms * US_PER_MS,
2947 PSI_WINDOW_SIZE_MS * US_PER_MS);
2948
2949 if (fd < 0) {
2950 return false;
2951 }
2952
2953 vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
2954 vmpressure_hinfo[level].data = level;
2955 if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
2956 destroy_psi_monitor(fd);
2957 return false;
2958 }
2959 maxevents++;
2960 mpevfd[level] = fd;
2961
2962 return true;
2963 }
2964
destroy_mp_psi(enum vmpressure_level level)2965 static void destroy_mp_psi(enum vmpressure_level level) {
2966 int fd = mpevfd[level];
2967
2968 if (fd < 0) {
2969 return;
2970 }
2971
2972 if (unregister_psi_monitor(epollfd, fd) < 0) {
2973 ALOGE("Failed to unregister psi monitor for %s memory pressure; errno=%d",
2974 level_name[level], errno);
2975 }
2976 maxevents--;
2977 destroy_psi_monitor(fd);
2978 mpevfd[level] = -1;
2979 }
2980
init_psi_monitors()2981 static bool init_psi_monitors() {
2982 /*
2983 * When PSI is used on low-ram devices or on high-end devices without memfree levels
2984 * use new kill strategy based on zone watermarks, free swap and thrashing stats
2985 */
2986 bool use_new_strategy =
2987 property_get_bool("ro.lmk.use_new_strategy", low_ram_device || !use_minfree_levels);
2988
2989 /* In default PSI mode override stall amounts using system properties */
2990 if (use_new_strategy) {
2991 /* Do not use low pressure level */
2992 psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
2993 psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
2994 psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
2995 }
2996
2997 if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
2998 return false;
2999 }
3000 if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
3001 destroy_mp_psi(VMPRESS_LEVEL_LOW);
3002 return false;
3003 }
3004 if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
3005 destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3006 destroy_mp_psi(VMPRESS_LEVEL_LOW);
3007 return false;
3008 }
3009 return true;
3010 }
3011
init_mp_common(enum vmpressure_level level)3012 static bool init_mp_common(enum vmpressure_level level) {
3013 int mpfd;
3014 int evfd;
3015 int evctlfd;
3016 char buf[256];
3017 struct epoll_event epev;
3018 int ret;
3019 int level_idx = (int)level;
3020 const char *levelstr = level_name[level_idx];
3021
3022 /* gid containing AID_SYSTEM required */
3023 mpfd = open(MEMCG_SYSFS_PATH "memory.pressure_level", O_RDONLY | O_CLOEXEC);
3024 if (mpfd < 0) {
3025 ALOGI("No kernel memory.pressure_level support (errno=%d)", errno);
3026 goto err_open_mpfd;
3027 }
3028
3029 evctlfd = open(MEMCG_SYSFS_PATH "cgroup.event_control", O_WRONLY | O_CLOEXEC);
3030 if (evctlfd < 0) {
3031 ALOGI("No kernel memory cgroup event control (errno=%d)", errno);
3032 goto err_open_evctlfd;
3033 }
3034
3035 evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
3036 if (evfd < 0) {
3037 ALOGE("eventfd failed for level %s; errno=%d", levelstr, errno);
3038 goto err_eventfd;
3039 }
3040
3041 ret = snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, levelstr);
3042 if (ret >= (ssize_t)sizeof(buf)) {
3043 ALOGE("cgroup.event_control line overflow for level %s", levelstr);
3044 goto err;
3045 }
3046
3047 ret = TEMP_FAILURE_RETRY(write(evctlfd, buf, strlen(buf) + 1));
3048 if (ret == -1) {
3049 ALOGE("cgroup.event_control write failed for level %s; errno=%d",
3050 levelstr, errno);
3051 goto err;
3052 }
3053
3054 epev.events = EPOLLIN;
3055 /* use data to store event level */
3056 vmpressure_hinfo[level_idx].data = level_idx;
3057 vmpressure_hinfo[level_idx].handler = mp_event_common;
3058 epev.data.ptr = (void *)&vmpressure_hinfo[level_idx];
3059 ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, evfd, &epev);
3060 if (ret == -1) {
3061 ALOGE("epoll_ctl for level %s failed; errno=%d", levelstr, errno);
3062 goto err;
3063 }
3064 maxevents++;
3065 mpevfd[level] = evfd;
3066 close(evctlfd);
3067 return true;
3068
3069 err:
3070 close(evfd);
3071 err_eventfd:
3072 close(evctlfd);
3073 err_open_evctlfd:
3074 close(mpfd);
3075 err_open_mpfd:
3076 return false;
3077 }
3078
destroy_mp_common(enum vmpressure_level level)3079 static void destroy_mp_common(enum vmpressure_level level) {
3080 struct epoll_event epev;
3081 int fd = mpevfd[level];
3082
3083 if (fd < 0) {
3084 return;
3085 }
3086
3087 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &epev)) {
3088 // Log an error and keep going
3089 ALOGE("epoll_ctl for level %s failed; errno=%d", level_name[level], errno);
3090 }
3091 maxevents--;
3092 close(fd);
3093 mpevfd[level] = -1;
3094 }
3095
/*
 * Epoll handler for the in-kernel lowmemorykiller event descriptor.
 * None of the handler arguments are used; the call is simply forwarded to
 * poll_kernel() with the global kpoll_fd.
 */
static void kernel_event_handler(int data __unused, uint32_t events __unused,
                                 struct polling_params *poll_params __unused) {
    poll_kernel(kpoll_fd);
}
3100
init_monitors()3101 static bool init_monitors() {
3102 /* Try to use psi monitor first if kernel has it */
3103 use_psi_monitors = property_get_bool("ro.lmk.use_psi", true) &&
3104 init_psi_monitors();
3105 /* Fall back to vmpressure */
3106 if (!use_psi_monitors &&
3107 (!init_mp_common(VMPRESS_LEVEL_LOW) ||
3108 !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
3109 !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
3110 ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
3111 return false;
3112 }
3113 if (use_psi_monitors) {
3114 ALOGI("Using psi monitors for memory pressure detection");
3115 } else {
3116 ALOGI("Using vmpressure for memory pressure detection");
3117 }
3118 return true;
3119 }
3120
destroy_monitors()3121 static void destroy_monitors() {
3122 if (use_psi_monitors) {
3123 destroy_mp_psi(VMPRESS_LEVEL_CRITICAL);
3124 destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3125 destroy_mp_psi(VMPRESS_LEVEL_LOW);
3126 } else {
3127 destroy_mp_common(VMPRESS_LEVEL_CRITICAL);
3128 destroy_mp_common(VMPRESS_LEVEL_MEDIUM);
3129 destroy_mp_common(VMPRESS_LEVEL_LOW);
3130 }
3131 }
3132
init(void)3133 static int init(void) {
3134 static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
3135 struct reread_data file_data = {
3136 .filename = ZONEINFO_PATH,
3137 .fd = -1,
3138 };
3139 struct epoll_event epev;
3140 int pidfd;
3141 int i;
3142 int ret;
3143
3144 page_k = sysconf(_SC_PAGESIZE);
3145 if (page_k == -1)
3146 page_k = PAGE_SIZE;
3147 page_k /= 1024;
3148
3149 epollfd = epoll_create(MAX_EPOLL_EVENTS);
3150 if (epollfd == -1) {
3151 ALOGE("epoll_create failed (errno=%d)", errno);
3152 return -1;
3153 }
3154
3155 // mark data connections as not connected
3156 for (int i = 0; i < MAX_DATA_CONN; i++) {
3157 data_sock[i].sock = -1;
3158 }
3159
3160 ctrl_sock.sock = android_get_control_socket("lmkd");
3161 if (ctrl_sock.sock < 0) {
3162 ALOGE("get lmkd control socket failed");
3163 return -1;
3164 }
3165
3166 ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
3167 if (ret < 0) {
3168 ALOGE("lmkd control socket listen failed (errno=%d)", errno);
3169 return -1;
3170 }
3171
3172 epev.events = EPOLLIN;
3173 ctrl_sock.handler_info.handler = ctrl_connect_handler;
3174 epev.data.ptr = (void *)&(ctrl_sock.handler_info);
3175 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
3176 ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
3177 return -1;
3178 }
3179 maxevents++;
3180
3181 has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
3182 use_inkernel_interface = has_inkernel_module;
3183
3184 if (use_inkernel_interface) {
3185 ALOGI("Using in-kernel low memory killer interface");
3186 if (init_poll_kernel()) {
3187 epev.events = EPOLLIN;
3188 epev.data.ptr = (void*)&kernel_poll_hinfo;
3189 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) {
3190 ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
3191 close(kpoll_fd);
3192 kpoll_fd = -1;
3193 } else {
3194 maxevents++;
3195 /* let the others know it does support reporting kills */
3196 property_set("sys.lmk.reportkills", "1");
3197 }
3198 }
3199 } else {
3200 if (!init_monitors()) {
3201 return -1;
3202 }
3203 /* let the others know it does support reporting kills */
3204 property_set("sys.lmk.reportkills", "1");
3205 }
3206
3207 for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
3208 procadjslot_list[i].next = &procadjslot_list[i];
3209 procadjslot_list[i].prev = &procadjslot_list[i];
3210 }
3211
3212 memset(killcnt_idx, KILLCNT_INVALID_IDX, sizeof(killcnt_idx));
3213
3214 /*
3215 * Read zoneinfo as the biggest file we read to create and size the initial
3216 * read buffer and avoid memory re-allocations during memory pressure
3217 */
3218 if (reread_file(&file_data) == NULL) {
3219 ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno));
3220 }
3221
3222 /* check if kernel supports pidfd_open syscall */
3223 pidfd = TEMP_FAILURE_RETRY(pidfd_open(getpid(), 0));
3224 if (pidfd < 0) {
3225 pidfd_supported = (errno != ENOSYS);
3226 } else {
3227 pidfd_supported = true;
3228 close(pidfd);
3229 }
3230 ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported" );
3231
3232 return 0;
3233 }
3234
polling_paused(struct polling_params * poll_params)3235 static bool polling_paused(struct polling_params *poll_params) {
3236 return poll_params->paused_handler != NULL;
3237 }
3238
resume_polling(struct polling_params * poll_params,struct timespec curr_tm)3239 static void resume_polling(struct polling_params *poll_params, struct timespec curr_tm) {
3240 poll_params->poll_start_tm = curr_tm;
3241 poll_params->poll_handler = poll_params->paused_handler;
3242 poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3243 poll_params->paused_handler = NULL;
3244 }
3245
call_handler(struct event_handler_info * handler_info,struct polling_params * poll_params,uint32_t events)3246 static void call_handler(struct event_handler_info* handler_info,
3247 struct polling_params *poll_params, uint32_t events) {
3248 struct timespec curr_tm;
3249
3250 poll_params->update = POLLING_DO_NOT_CHANGE;
3251 handler_info->handler(handler_info->data, events, poll_params);
3252 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3253 if (poll_params->poll_handler == handler_info) {
3254 poll_params->last_poll_tm = curr_tm;
3255 }
3256
3257 switch (poll_params->update) {
3258 case POLLING_START:
3259 /*
3260 * Poll for the duration of PSI_WINDOW_SIZE_MS after the
3261 * initial PSI event because psi events are rate-limited
3262 * at one per sec.
3263 */
3264 poll_params->poll_start_tm = curr_tm;
3265 poll_params->poll_handler = handler_info;
3266 break;
3267 case POLLING_PAUSE:
3268 poll_params->paused_handler = handler_info;
3269 poll_params->poll_handler = NULL;
3270 break;
3271 case POLLING_RESUME:
3272 resume_polling(poll_params, curr_tm);
3273 break;
3274 case POLLING_DO_NOT_CHANGE:
3275 if (get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
3276 /* Polled for the duration of PSI window, time to stop */
3277 poll_params->poll_handler = NULL;
3278 }
3279 break;
3280 }
3281 }
3282
mainloop(void)3283 static void mainloop(void) {
3284 struct event_handler_info* handler_info;
3285 struct polling_params poll_params;
3286 struct timespec curr_tm;
3287 struct epoll_event *evt;
3288 long delay = -1;
3289
3290 poll_params.poll_handler = NULL;
3291 poll_params.paused_handler = NULL;
3292
3293 while (1) {
3294 struct epoll_event events[MAX_EPOLL_EVENTS];
3295 int nevents;
3296 int i;
3297
3298 if (poll_params.poll_handler) {
3299 bool poll_now;
3300
3301 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3302 if (poll_params.update == POLLING_RESUME) {
3303 /* Just transitioned into POLLING_RESUME, poll immediately. */
3304 poll_now = true;
3305 nevents = 0;
3306 } else {
3307 /* Calculate next timeout */
3308 delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
3309 delay = (delay < poll_params.polling_interval_ms) ?
3310 poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
3311
3312 /* Wait for events until the next polling timeout */
3313 nevents = epoll_wait(epollfd, events, maxevents, delay);
3314
3315 /* Update current time after wait */
3316 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3317 poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
3318 poll_params.polling_interval_ms);
3319 }
3320 if (poll_now) {
3321 call_handler(poll_params.poll_handler, &poll_params, 0);
3322 }
3323 } else {
3324 if (kill_timeout_ms && is_waiting_for_kill()) {
3325 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3326 delay = kill_timeout_ms - get_time_diff_ms(&last_kill_tm, &curr_tm);
3327 /* Wait for pidfds notification or kill timeout to expire */
3328 nevents = (delay > 0) ? epoll_wait(epollfd, events, maxevents, delay) : 0;
3329 if (nevents == 0) {
3330 /* Kill notification timed out */
3331 stop_wait_for_proc_kill(false);
3332 if (polling_paused(&poll_params)) {
3333 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3334 poll_params.update = POLLING_RESUME;
3335 resume_polling(&poll_params, curr_tm);
3336 }
3337 }
3338 } else {
3339 /* Wait for events with no timeout */
3340 nevents = epoll_wait(epollfd, events, maxevents, -1);
3341 }
3342 }
3343
3344 if (nevents == -1) {
3345 if (errno == EINTR)
3346 continue;
3347 ALOGE("epoll_wait failed (errno=%d)", errno);
3348 continue;
3349 }
3350
3351 /*
3352 * First pass to see if any data socket connections were dropped.
3353 * Dropped connection should be handled before any other events
3354 * to deallocate data connection and correctly handle cases when
3355 * connection gets dropped and reestablished in the same epoll cycle.
3356 * In such cases it's essential to handle connection closures first.
3357 */
3358 for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
3359 if ((evt->events & EPOLLHUP) && evt->data.ptr) {
3360 ALOGI("lmkd data connection dropped");
3361 handler_info = (struct event_handler_info*)evt->data.ptr;
3362 ctrl_data_close(handler_info->data);
3363 }
3364 }
3365
3366 /* Second pass to handle all other events */
3367 for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
3368 if (evt->events & EPOLLERR) {
3369 ALOGD("EPOLLERR on event #%d", i);
3370 }
3371 if (evt->events & EPOLLHUP) {
3372 /* This case was handled in the first pass */
3373 continue;
3374 }
3375 if (evt->data.ptr) {
3376 handler_info = (struct event_handler_info*)evt->data.ptr;
3377 call_handler(handler_info, &poll_params, evt->events);
3378 }
3379 }
3380 }
3381 }
3382
issue_reinit()3383 int issue_reinit() {
3384 int sock;
3385
3386 sock = lmkd_connect();
3387 if (sock < 0) {
3388 ALOGE("failed to connect to lmkd: %s", strerror(errno));
3389 return -1;
3390 }
3391
3392 enum update_props_result res = lmkd_update_props(sock);
3393 switch (res) {
3394 case UPDATE_PROPS_SUCCESS:
3395 ALOGI("lmkd updated properties successfully");
3396 break;
3397 case UPDATE_PROPS_SEND_ERR:
3398 ALOGE("failed to send lmkd request: %s", strerror(errno));
3399 break;
3400 case UPDATE_PROPS_RECV_ERR:
3401 ALOGE("failed to receive lmkd reply: %s", strerror(errno));
3402 break;
3403 case UPDATE_PROPS_FORMAT_ERR:
3404 ALOGE("lmkd reply is invalid");
3405 break;
3406 case UPDATE_PROPS_FAIL:
3407 ALOGE("lmkd failed to update its properties");
3408 break;
3409 }
3410
3411 close(sock);
3412 return res == UPDATE_PROPS_SUCCESS ? 0 : -1;
3413 }
3414
update_props()3415 static void update_props() {
3416 /* By default disable low level vmpressure events */
3417 level_oomadj[VMPRESS_LEVEL_LOW] =
3418 property_get_int32("ro.lmk.low", OOM_SCORE_ADJ_MAX + 1);
3419 level_oomadj[VMPRESS_LEVEL_MEDIUM] =
3420 property_get_int32("ro.lmk.medium", 800);
3421 level_oomadj[VMPRESS_LEVEL_CRITICAL] =
3422 property_get_int32("ro.lmk.critical", 0);
3423 debug_process_killing = property_get_bool("ro.lmk.debug", false);
3424
3425 /* By default disable upgrade/downgrade logic */
3426 enable_pressure_upgrade =
3427 property_get_bool("ro.lmk.critical_upgrade", false);
3428 upgrade_pressure =
3429 (int64_t)property_get_int32("ro.lmk.upgrade_pressure", 100);
3430 downgrade_pressure =
3431 (int64_t)property_get_int32("ro.lmk.downgrade_pressure", 100);
3432 kill_heaviest_task =
3433 property_get_bool("ro.lmk.kill_heaviest_task", false);
3434 low_ram_device = property_get_bool("ro.config.low_ram", false);
3435 kill_timeout_ms =
3436 (unsigned long)property_get_int32("ro.lmk.kill_timeout_ms", 100);
3437 use_minfree_levels =
3438 property_get_bool("ro.lmk.use_minfree_levels", false);
3439 per_app_memcg =
3440 property_get_bool("ro.config.per_app_memcg", low_ram_device);
3441 swap_free_low_percentage = clamp(0, 100, property_get_int32("ro.lmk.swap_free_low_percentage",
3442 DEF_LOW_SWAP));
3443 psi_partial_stall_ms = property_get_int32("ro.lmk.psi_partial_stall_ms",
3444 low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
3445 psi_complete_stall_ms = property_get_int32("ro.lmk.psi_complete_stall_ms",
3446 DEF_COMPLETE_STALL);
3447 thrashing_limit_pct = max(0, property_get_int32("ro.lmk.thrashing_limit",
3448 low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
3449 thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay",
3450 low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
3451 thrashing_critical_pct = max(0, property_get_int32("ro.lmk.thrashing_limit_critical",
3452 thrashing_limit_pct * 2));
3453 swap_util_max = clamp(0, 100, property_get_int32("ro.lmk.swap_util_max", 100));
3454 filecache_min_kb = property_get_int64("ro.lmk.filecache_min_kb", 0);
3455 }
3456
main(int argc,char ** argv)3457 int main(int argc, char **argv) {
3458 if ((argc > 1) && argv[1] && !strcmp(argv[1], "--reinit")) {
3459 if (property_set(LMKD_REINIT_PROP, "0")) {
3460 ALOGE("Failed to reset " LMKD_REINIT_PROP " property");
3461 }
3462 return issue_reinit();
3463 }
3464
3465 update_props();
3466
3467 ctx = create_android_logger(KILLINFO_LOG_TAG);
3468
3469 if (!init()) {
3470 if (!use_inkernel_interface) {
3471 /*
3472 * MCL_ONFAULT pins pages as they fault instead of loading
3473 * everything immediately all at once. (Which would be bad,
3474 * because as of this writing, we have a lot of mapped pages we
3475 * never use.) Old kernels will see MCL_ONFAULT and fail with
3476 * EINVAL; we ignore this failure.
3477 *
3478 * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
3479 * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
3480 * in pages.
3481 */
3482 /* CAP_IPC_LOCK required */
3483 if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
3484 ALOGW("mlockall failed %s", strerror(errno));
3485 }
3486
3487 /* CAP_NICE required */
3488 struct sched_param param = {
3489 .sched_priority = 1,
3490 };
3491 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) {
3492 ALOGW("set SCHED_FIFO failed %s", strerror(errno));
3493 }
3494 }
3495
3496 mainloop();
3497 }
3498
3499 android_log_destroy(&ctx);
3500
3501 ALOGI("exiting");
3502 return 0;
3503 }
3504