1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "lowmemorykiller"
18 
19 #include <dirent.h>
20 #include <errno.h>
21 #include <inttypes.h>
22 #include <pwd.h>
23 #include <sched.h>
24 #include <signal.h>
25 #include <stdbool.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <sys/cdefs.h>
29 #include <sys/epoll.h>
30 #include <sys/eventfd.h>
31 #include <sys/mman.h>
32 #include <sys/pidfd.h>
33 #include <sys/resource.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/time.h>
38 #include <sys/types.h>
39 #include <time.h>
40 #include <unistd.h>
41 
42 #include <cutils/properties.h>
43 #include <cutils/sched_policy.h>
44 #include <cutils/sockets.h>
45 #include <liblmkd_utils.h>
46 #include <lmkd.h>
47 #include <log/log.h>
48 #include <log/log_event_list.h>
49 #include <log/log_time.h>
50 #include <private/android_filesystem_config.h>
51 #include <psi/psi.h>
52 #include <system/thread_defs.h>
53 
54 #include "statslog.h"
55 
56 #define BPF_FD_JUST_USE_INT
57 #include "BpfSyscallWrappers.h"
58 
/*
 * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces
 * to profile and correlate with OOM kills.
 *
 * Both variants of TRACE_KILL_START()/TRACE_KILL_END() expand to a single
 * expression WITHOUT a trailing semicolon; the caller supplies it. This keeps
 * the two build configurations interchangeable and makes the macros safe to
 * use as the sole statement of an if/else branch.
 */
#ifdef LMKD_TRACE_KILLS

#define ATRACE_TAG ATRACE_TAG_ALWAYS
#include <cutils/trace.h>

#define TRACE_KILL_START(pid) ATRACE_INT(__FUNCTION__, pid)
#define TRACE_KILL_END()      ATRACE_INT(__FUNCTION__, 0)

#else /* LMKD_TRACE_KILLS */

/* Tracing disabled: evaluate pid (to avoid unused warnings) and do nothing */
#define TRACE_KILL_START(pid) ((void)(pid))
#define TRACE_KILL_END() ((void)0)

#endif /* LMKD_TRACE_KILLS */
77 
78 #ifndef __unused
79 #define __unused __attribute__((__unused__))
80 #endif
81 
82 #define MEMCG_SYSFS_PATH "/dev/memcg/"
83 #define MEMCG_MEMORY_USAGE "/dev/memcg/memory.usage_in_bytes"
84 #define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
85 #define ZONEINFO_PATH "/proc/zoneinfo"
86 #define MEMINFO_PATH "/proc/meminfo"
87 #define VMSTAT_PATH "/proc/vmstat"
88 #define PROC_STATUS_TGID_FIELD "Tgid:"
89 #define PROC_STATUS_RSS_FIELD "VmRSS:"
90 #define PROC_STATUS_SWAP_FIELD "VmSwap:"
91 #define LINE_MAX 128
92 
93 #define PERCEPTIBLE_APP_ADJ 200
94 
95 /* Android Logger event logtags (see event.logtags) */
96 #define KILLINFO_LOG_TAG 10195355
97 
98 /* gid containing AID_SYSTEM required */
99 #define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
100 #define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"
101 
102 #define ARRAY_SIZE(x)   (sizeof(x) / sizeof(*(x)))
103 #define EIGHT_MEGA (1 << 23)
104 
105 #define TARGET_UPDATE_MIN_INTERVAL_MS 1000
106 #define THRASHING_RESET_INTERVAL_MS 1000
107 
108 #define NS_PER_MS (NS_PER_SEC / MS_PER_SEC)
109 #define US_PER_MS (US_PER_SEC / MS_PER_SEC)
110 
111 /* Defined as ProcessList.SYSTEM_ADJ in ProcessList.java */
112 #define SYSTEM_ADJ (-900)
113 
114 #define STRINGIFY(x) STRINGIFY_INTERNAL(x)
115 #define STRINGIFY_INTERNAL(x) #x
116 
117 /*
118  * PSI monitor tracking window size.
119  * PSI monitor generates events at most once per window,
120  * therefore we poll memory state for the duration of
121  * PSI_WINDOW_SIZE_MS after the event happens.
122  */
123 #define PSI_WINDOW_SIZE_MS 1000
124 /* Polling period after PSI signal when pressure is high */
125 #define PSI_POLL_PERIOD_SHORT_MS 10
126 /* Polling period after PSI signal when pressure is low */
127 #define PSI_POLL_PERIOD_LONG_MS 100
128 
129 #define min(a, b) (((a) < (b)) ? (a) : (b))
130 #define max(a, b) (((a) > (b)) ? (a) : (b))
131 
132 #define FAIL_REPORT_RLIMIT_MS 1000
133 
134 /*
135  * System property defaults
136  */
137 /* ro.lmk.swap_free_low_percentage property defaults */
138 #define DEF_LOW_SWAP 10
139 /* ro.lmk.thrashing_limit property defaults */
140 #define DEF_THRASHING_LOWRAM 30
141 #define DEF_THRASHING 100
142 /* ro.lmk.thrashing_limit_decay property defaults */
143 #define DEF_THRASHING_DECAY_LOWRAM 50
144 #define DEF_THRASHING_DECAY 10
145 /* ro.lmk.psi_partial_stall_ms property defaults */
146 #define DEF_PARTIAL_STALL_LOWRAM 200
147 #define DEF_PARTIAL_STALL 70
148 /* ro.lmk.psi_complete_stall_ms property defaults */
149 #define DEF_COMPLETE_STALL 700
150 
151 #define LMKD_REINIT_PROP "lmkd.reinit"
152 
153 /* default to old in-kernel interface if no memory pressure events */
154 static bool use_inkernel_interface = true;
155 static bool has_inkernel_module;
156 
/* memory pressure levels */
enum vmpressure_level {
    VMPRESS_LEVEL_LOW = 0,
    VMPRESS_LEVEL_MEDIUM,
    VMPRESS_LEVEL_CRITICAL,
    VMPRESS_LEVEL_COUNT /* number of levels; used to size the per-level arrays */
};

/* Level names, listed in the same order as enum vmpressure_level */
static const char *level_name[] = {
    "low",
    "medium",
    "critical"
};
170 
171 struct {
172     int64_t min_nr_free_pages; /* recorded but not used yet */
173     int64_t max_nr_free_pages;
174 } low_pressure_mem = { -1, -1 };
175 
/*
 * One PSI threshold entry: which kind of stall is tracked and the stall
 * threshold in milliseconds (the psi_thresholds table describes its values
 * as "ms out of 1sec").
 */
struct psi_threshold {
    enum psi_stall_type stall_type; /* e.g. PSI_SOME or PSI_FULL */
    int threshold_ms;
};
180 
181 static int level_oomadj[VMPRESS_LEVEL_COUNT];
182 static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
183 static bool pidfd_supported;
184 static int last_kill_pid_or_fd = -1;
185 static struct timespec last_kill_tm;
186 
187 /* lmkd configurable parameters */
188 static bool debug_process_killing;
189 static bool enable_pressure_upgrade;
190 static int64_t upgrade_pressure;
191 static int64_t downgrade_pressure;
192 static bool low_ram_device;
193 static bool kill_heaviest_task;
194 static unsigned long kill_timeout_ms;
195 static bool use_minfree_levels;
196 static bool per_app_memcg;
197 static int swap_free_low_percentage;
198 static int psi_partial_stall_ms;
199 static int psi_complete_stall_ms;
200 static int thrashing_limit_pct;
201 static int thrashing_limit_decay_pct;
202 static int thrashing_critical_pct;
203 static int swap_util_max;
204 static int64_t filecache_min_kb;
205 static bool use_psi_monitors = false;
206 static int kpoll_fd;
207 static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
208     { PSI_SOME, 70 },    /* 70ms out of 1sec for partial stall */
209     { PSI_SOME, 100 },   /* 100ms out of 1sec for partial stall */
210     { PSI_FULL, 70 },    /* 70ms out of 1sec for complete stall */
211 };
212 
213 static android_log_context ctx;
214 
/*
 * Requested change of the polling state, stored in polling_params.update.
 * NOTE(review): the code consuming these values is outside this chunk; the
 * meaning of each value is presumably what its name says - confirm there.
 */
enum polling_update {
    POLLING_DO_NOT_CHANGE,
    POLLING_START,
    POLLING_PAUSE,
    POLLING_RESUME,
};
221 
/*
 * Data used for periodic polling for the memory state of the device.
 * Note that when system is not polling poll_handler is set to NULL,
 * when polling starts poll_handler gets set and is reset back to
 * NULL when polling stops.
 */
struct polling_params {
    /* handler invoked while polling; NULL when the system is not polling */
    struct event_handler_info* poll_handler;
    /* presumably the handler saved while polling is paused - TODO confirm */
    struct event_handler_info* paused_handler;
    /* presumably when the current polling session started - TODO confirm */
    struct timespec poll_start_tm;
    /* presumably when the last poll was performed - TODO confirm */
    struct timespec last_poll_tm;
    /* polling period in milliseconds */
    int polling_interval_ms;
    /* pending polling state change request, see enum polling_update */
    enum polling_update update;
};
236 
/*
 * data required to handle events:
 * a callback plus an integer associated with it.
 */
struct event_handler_info {
    /* presumably handed back to handler as its first argument - TODO confirm */
    int data;
    /* callback; receives an int, an event mask and the polling parameters */
    void (*handler)(int data, uint32_t events, struct polling_params *poll_params);
};
242 
/* data required to handle socket events */
struct sock_event_handler_info {
    /* socket fd; data sockets are set to -1 by ctrl_data_close() */
    int sock;
    /* PID of the peer, recorded by ctrl_data_read() from the sender credentials */
    pid_t pid;
    /* bitmask of (1 << LMK_ASYNC_EVENT_*) notifications sent to this connection */
    uint32_t async_event_mask;
    /* event handler associated with this socket */
    struct event_handler_info handler_info;
};
250 
251 /* max supported number of data connections (AMS, init, tests) */
252 #define MAX_DATA_CONN 3
253 
254 /* socket event handler data */
255 static struct sock_event_handler_info ctrl_sock;
256 static struct sock_event_handler_info data_sock[MAX_DATA_CONN];
257 
258 /* vmpressure event handler data */
259 static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT];
260 
261 /*
262  * 1 ctrl listen socket, 3 ctrl data socket, 3 memory pressure levels,
263  * 1 lmk events + 1 fd to wait for process death
264  */
265 #define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1)
266 static int epollfd;
267 static int maxevents;
268 
269 /* OOM score values used by both kernel and framework */
270 #define OOM_SCORE_ADJ_MIN       (-1000)
271 #define OOM_SCORE_ADJ_MAX       1000
272 
273 static int lowmem_adj[MAX_TARGETS];
274 static int lowmem_minfree[MAX_TARGETS];
275 static int lowmem_targets_size;
276 
277 /* Fields to parse in /proc/zoneinfo */
278 /* zoneinfo per-zone fields */
279 enum zoneinfo_zone_field {
280     ZI_ZONE_NR_FREE_PAGES = 0,
281     ZI_ZONE_MIN,
282     ZI_ZONE_LOW,
283     ZI_ZONE_HIGH,
284     ZI_ZONE_PRESENT,
285     ZI_ZONE_NR_FREE_CMA,
286     ZI_ZONE_FIELD_COUNT
287 };
288 
289 static const char* const zoneinfo_zone_field_names[ZI_ZONE_FIELD_COUNT] = {
290     "nr_free_pages",
291     "min",
292     "low",
293     "high",
294     "present",
295     "nr_free_cma",
296 };
297 
298 /* zoneinfo per-zone special fields */
299 enum zoneinfo_zone_spec_field {
300     ZI_ZONE_SPEC_PROTECTION = 0,
301     ZI_ZONE_SPEC_PAGESETS,
302     ZI_ZONE_SPEC_FIELD_COUNT,
303 };
304 
305 static const char* const zoneinfo_zone_spec_field_names[ZI_ZONE_SPEC_FIELD_COUNT] = {
306     "protection:",
307     "pagesets",
308 };
309 
310 /* see __MAX_NR_ZONES definition in kernel mmzone.h */
311 #define MAX_NR_ZONES 6
312 
/*
 * Per-zone values parsed from /proc/zoneinfo. The same storage can be
 * accessed by name through 'field' or by index through 'arr'; the members of
 * 'field' are declared in the same order as enum zoneinfo_zone_field.
 */
union zoneinfo_zone_fields {
    struct {
        int64_t nr_free_pages;
        int64_t min;
        int64_t low;
        int64_t high;
        int64_t present;
        int64_t nr_free_cma;
    } field;
    int64_t arr[ZI_ZONE_FIELD_COUNT];
};

/* Data collected for a single memory zone */
struct zoneinfo_zone {
    union zoneinfo_zone_fields fields;
    /* values of the "protection:" special field, at most one per zone */
    int64_t protection[MAX_NR_ZONES];
    /* presumably the largest value in protection[] - TODO confirm in the parser */
    int64_t max_protection;
};
330 
331 /* zoneinfo per-node fields */
332 enum zoneinfo_node_field {
333     ZI_NODE_NR_INACTIVE_FILE = 0,
334     ZI_NODE_NR_ACTIVE_FILE,
335     ZI_NODE_FIELD_COUNT
336 };
337 
338 static const char* const zoneinfo_node_field_names[ZI_NODE_FIELD_COUNT] = {
339     "nr_inactive_file",
340     "nr_active_file",
341 };
342 
343 union zoneinfo_node_fields {
344     struct {
345         int64_t nr_inactive_file;
346         int64_t nr_active_file;
347     } field;
348     int64_t arr[ZI_NODE_FIELD_COUNT];
349 };
350 
351 struct zoneinfo_node {
352     int id;
353     int zone_count;
354     struct zoneinfo_zone zones[MAX_NR_ZONES];
355     union zoneinfo_node_fields fields;
356 };
357 
358 /* for now two memory nodes is more than enough */
359 #define MAX_NR_NODES 2
360 
361 struct zoneinfo {
362     int node_count;
363     struct zoneinfo_node nodes[MAX_NR_NODES];
364     int64_t totalreserve_pages;
365     int64_t total_inactive_file;
366     int64_t total_active_file;
367 };
368 
369 /* Fields to parse in /proc/meminfo */
370 enum meminfo_field {
371     MI_NR_FREE_PAGES = 0,
372     MI_CACHED,
373     MI_SWAP_CACHED,
374     MI_BUFFERS,
375     MI_SHMEM,
376     MI_UNEVICTABLE,
377     MI_TOTAL_SWAP,
378     MI_FREE_SWAP,
379     MI_ACTIVE_ANON,
380     MI_INACTIVE_ANON,
381     MI_ACTIVE_FILE,
382     MI_INACTIVE_FILE,
383     MI_SRECLAIMABLE,
384     MI_SUNRECLAIM,
385     MI_KERNEL_STACK,
386     MI_PAGE_TABLES,
387     MI_ION_HELP,
388     MI_ION_HELP_POOL,
389     MI_CMA_FREE,
390     MI_FIELD_COUNT
391 };
392 
393 static const char* const meminfo_field_names[MI_FIELD_COUNT] = {
394     "MemFree:",
395     "Cached:",
396     "SwapCached:",
397     "Buffers:",
398     "Shmem:",
399     "Unevictable:",
400     "SwapTotal:",
401     "SwapFree:",
402     "Active(anon):",
403     "Inactive(anon):",
404     "Active(file):",
405     "Inactive(file):",
406     "SReclaimable:",
407     "SUnreclaim:",
408     "KernelStack:",
409     "PageTables:",
410     "ION_heap:",
411     "ION_heap_pool:",
412     "CmaFree:",
413 };
414 
/*
 * Values parsed from /proc/meminfo. The members of 'field' up to and
 * including cma_free are declared in the same order as enum meminfo_field,
 * so they overlay 'arr'. 'arr' has only MI_FIELD_COUNT elements, therefore
 * the calculated members at the end of 'field' are NOT reachable through it.
 */
union meminfo {
    struct {
        int64_t nr_free_pages;
        int64_t cached;
        int64_t swap_cached;
        int64_t buffers;
        int64_t shmem;
        int64_t unevictable;
        int64_t total_swap;
        int64_t free_swap;
        int64_t active_anon;
        int64_t inactive_anon;
        int64_t active_file;
        int64_t inactive_file;
        int64_t sreclaimable;
        int64_t sunreclaimable;
        int64_t kernel_stack;
        int64_t page_tables;
        int64_t ion_heap;
        int64_t ion_heap_pool;
        int64_t cma_free;
        /* fields below are calculated rather than read from the file */
        int64_t nr_file_pages;
        int64_t total_gpu_kb;
    } field;
    /* indexed by enum meminfo_field */
    int64_t arr[MI_FIELD_COUNT];
};
442 
/* Fields to parse in /proc/vmstat */
enum vmstat_field {
    VS_FREE_PAGES,
    VS_INACTIVE_FILE,
    VS_ACTIVE_FILE,
    VS_WORKINGSET_REFAULT,
    VS_WORKINGSET_REFAULT_FILE,
    VS_PGSCAN_KSWAPD,
    VS_PGSCAN_DIRECT,
    VS_PGSCAN_DIRECT_THROTTLE,
    VS_FIELD_COUNT /* number of parsed fields, keep last */
};

/*
 * Names of the /proc/vmstat fields, indexed by enum vmstat_field.
 * The table is sized by VS_FIELD_COUNT (not the meminfo field count) so that
 * it has exactly one non-NULL entry per enum value.
 */
static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
    "nr_free_pages",
    "nr_inactive_file",
    "nr_active_file",
    "workingset_refault",
    "workingset_refault_file",
    "pgscan_kswapd",
    "pgscan_direct",
    "pgscan_direct_throttle",
};

/*
 * Values parsed from /proc/vmstat. The members of 'field' are declared in
 * the same order as enum vmstat_field and share storage with 'arr'.
 */
union vmstat {
    struct {
        int64_t nr_free_pages;
        int64_t nr_inactive_file;
        int64_t nr_active_file;
        int64_t workingset_refault;
        int64_t workingset_refault_file;
        int64_t pgscan_kswapd;
        int64_t pgscan_direct;
        int64_t pgscan_direct_throttle;
    } field;
    int64_t arr[VS_FIELD_COUNT];
};
480 
/* Outcome of match_field() */
enum field_match_result {
    NO_MATCH,      /* the name is not one of the known field names */
    PARSE_FAIL,    /* the name is known but its value could not be parsed */
    PARSE_SUCCESS  /* the name is known and its value was parsed */
};
486 
/*
 * Node of a doubly-linked list. The list helpers (adjslot_insert,
 * adjslot_remove, adjslot_tail) dereference next/prev without NULL checks
 * and treat a head whose prev points to itself as an empty list.
 */
struct adjslot_list {
    struct adjslot_list *next;
    struct adjslot_list *prev;
};
491 
/* Record of a process tracked by lmkd */
struct proc {
    /* links the record into the procadjslot_list entry for its oomadj */
    struct adjslot_list asl;
    int pid;
    /* closed by pid_remove() when >= 0, unless it is the fd being waited on */
    int pidfd;
    uid_t uid;
    /* oom_score_adj value; selects the list via ADJTOSLOT() */
    int oomadj;
    pid_t reg_pid; /* PID of the process that registered this record */
    /* next record in the same pidhash bucket */
    struct proc *pidhash_next;
};
501 
/* State of a file that is repeatedly read with reread_file() */
struct reread_data {
    /* path that reread_file() opens when fd is -1 */
    const char* const filename;
    /* cached file descriptor; must be -1 while the file is not open */
    int fd;
};
506 
/*
 * Hash table of process records keyed by pid. Records in the same bucket are
 * chained through proc.pidhash_next. pid_hashfn() masks with
 * (PIDHASH_SZ - 1), so PIDHASH_SZ has to stay a power of two.
 */
#define PIDHASH_SZ 1024
static struct proc *pidhash[PIDHASH_SZ];
#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))

/*
 * ADJTOSLOT() shifts an oom_score_adj value so that OOM_SCORE_ADJ_MIN maps to
 * slot 0; procadjslot_list has one list head per possible value.
 */
#define ADJTOSLOT(adj) ((adj) + -OOM_SCORE_ADJ_MIN)
#define ADJTOSLOT_COUNT (ADJTOSLOT(OOM_SCORE_ADJ_MAX) + 1)
static struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT];
514 
/* capacity of the killcnt array */
#define MAX_DISTINCT_OOM_ADJ 32
#define KILLCNT_INVALID_IDX 0xFF
/*
 * Because killcnt array is sparse a two-level indirection is used
 * to keep the size small. killcnt_idx stores index of the element in
 * killcnt array. Index KILLCNT_INVALID_IDX indicates an unused slot.
 */
static uint8_t killcnt_idx[ADJTOSLOT_COUNT]; /* one entry per oom_score_adj slot */
static uint16_t killcnt[MAX_DISTINCT_OOM_ADJ];
/* presumably the next unused index in killcnt - TODO confirm at the use site */
static int killcnt_free_idx = 0;
/* presumably the number of kills across all slots - TODO confirm at the use site */
static uint32_t killcnt_total = 0;
526 
527 /* PAGE_SIZE / 1024 */
528 static long page_k;
529 
530 static void update_props();
531 static bool init_monitors();
532 static void destroy_monitors();
533 
/*
 * Limits value to the [low, high] range.
 * If the bounds cross (low > high) the result is low.
 */
static int clamp(int low, int high, int value) {
    const int capped = (value < high) ? value : high;

    return (capped > low) ? capped : low;
}
537 
/*
 * Parses a base-10 signed integer found at the beginning of str.
 * Leading whitespace is skipped by strtoll() and any characters following
 * the number (for example a " kB" suffix) are ignored.
 *
 * Returns true and stores the value in *ret on success. Returns false and
 * leaves *ret untouched when str does not start with a number or when the
 * number does not fit into int64_t.
 */
static bool parse_int64(const char* str, int64_t* ret) {
    char* endptr;
    long long val;

    /*
     * strtoll() saturates on overflow and reports it only through errno,
     * which it never clears on success, so reset it before the call.
     */
    errno = 0;
    val = strtoll(str, &endptr, 10);
    if (str == endptr || errno == ERANGE) {
        return false;
    }
    /* only relevant on platforms where long long is wider than 64 bits */
    if (val > INT64_MAX || val < INT64_MIN) {
        return false;
    }
    *ret = (int64_t)val;
    return true;
}
547 
/*
 * Searches the first field_count entries of field_names for an entry equal
 * to name (exact, case-sensitive comparison).
 * Returns the index of the first match or -1 when there is none.
 */
static int find_field(const char* name, const char* const field_names[], int field_count) {
    int idx = 0;

    while (idx < field_count) {
        if (strcmp(name, field_names[idx]) == 0) {
            return idx;
        }
        ++idx;
    }
    return -1;
}
556 
match_field(const char * cp,const char * ap,const char * const field_names[],int field_count,int64_t * field,int * field_idx)557 static enum field_match_result match_field(const char* cp, const char* ap,
558                                    const char* const field_names[],
559                                    int field_count, int64_t* field,
560                                    int *field_idx) {
561     int i = find_field(cp, field_names, field_count);
562     if (i < 0) {
563         return NO_MATCH;
564     }
565     *field_idx = i;
566     return parse_int64(ap, field) ? PARSE_SUCCESS : PARSE_FAIL;
567 }
568 
569 /*
570  * Read file content from the beginning up to max_len bytes or EOF
571  * whichever happens first.
572  */
read_all(int fd,char * buf,size_t max_len)573 static ssize_t read_all(int fd, char *buf, size_t max_len)
574 {
575     ssize_t ret = 0;
576     off_t offset = 0;
577 
578     while (max_len > 0) {
579         ssize_t r = TEMP_FAILURE_RETRY(pread(fd, buf, max_len, offset));
580         if (r == 0) {
581             break;
582         }
583         if (r == -1) {
584             return -1;
585         }
586         ret += r;
587         buf += r;
588         offset += r;
589         max_len -= r;
590     }
591 
592     return ret;
593 }
594 
595 /*
596  * Read a new or already opened file from the beginning.
597  * If the file has not been opened yet data->fd should be set to -1.
598  * To be used with files which are read often and possibly during high
599  * memory pressure to minimize file opening which by itself requires kernel
600  * memory allocation and might result in a stall on memory stressed system.
601  */
reread_file(struct reread_data * data)602 static char *reread_file(struct reread_data *data) {
603     /* start with page-size buffer and increase if needed */
604     static ssize_t buf_size = PAGE_SIZE;
605     static char *new_buf, *buf = NULL;
606     ssize_t size;
607 
608     if (data->fd == -1) {
609         /* First-time buffer initialization */
610         if (!buf && (buf = static_cast<char*>(malloc(buf_size))) == nullptr) {
611             return NULL;
612         }
613 
614         data->fd = TEMP_FAILURE_RETRY(open(data->filename, O_RDONLY | O_CLOEXEC));
615         if (data->fd < 0) {
616             ALOGE("%s open: %s", data->filename, strerror(errno));
617             return NULL;
618         }
619     }
620 
621     while (true) {
622         size = read_all(data->fd, buf, buf_size - 1);
623         if (size < 0) {
624             ALOGE("%s read: %s", data->filename, strerror(errno));
625             close(data->fd);
626             data->fd = -1;
627             return NULL;
628         }
629         if (size < buf_size - 1) {
630             break;
631         }
632         /*
633          * Since we are reading /proc files we can't use fstat to find out
634          * the real size of the file. Double the buffer size and keep retrying.
635          */
636         if ((new_buf = static_cast<char*>(realloc(buf, buf_size * 2))) == nullptr) {
637             errno = ENOMEM;
638             return NULL;
639         }
640         buf = new_buf;
641         buf_size *= 2;
642     }
643     buf[size] = 0;
644 
645     return buf;
646 }
647 
claim_record(struct proc * procp,pid_t pid)648 static bool claim_record(struct proc* procp, pid_t pid) {
649     if (procp->reg_pid == pid) {
650         /* Record already belongs to the registrant */
651         return true;
652     }
653     if (procp->reg_pid == 0) {
654         /* Old registrant is gone, claim the record */
655         procp->reg_pid = pid;
656         return true;
657     }
658     /* The record is owned by another registrant */
659     return false;
660 }
661 
remove_claims(pid_t pid)662 static void remove_claims(pid_t pid) {
663     int i;
664 
665     for (i = 0; i < PIDHASH_SZ; i++) {
666         struct proc* procp = pidhash[i];
667         while (procp) {
668             if (procp->reg_pid == pid) {
669                 procp->reg_pid = 0;
670             }
671             procp = procp->pidhash_next;
672         }
673     }
674 }
675 
ctrl_data_close(int dsock_idx)676 static void ctrl_data_close(int dsock_idx) {
677     struct epoll_event epev;
678 
679     ALOGI("closing lmkd data connection");
680     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, data_sock[dsock_idx].sock, &epev) == -1) {
681         // Log a warning and keep going
682         ALOGW("epoll_ctl for data connection socket failed; errno=%d", errno);
683     }
684     maxevents--;
685 
686     close(data_sock[dsock_idx].sock);
687     data_sock[dsock_idx].sock = -1;
688 
689     /* Mark all records of the old registrant as unclaimed */
690     remove_claims(data_sock[dsock_idx].pid);
691 }
692 
ctrl_data_read(int dsock_idx,char * buf,size_t bufsz,struct ucred * sender_cred)693 static ssize_t ctrl_data_read(int dsock_idx, char* buf, size_t bufsz, struct ucred* sender_cred) {
694     struct iovec iov = {buf, bufsz};
695     char control[CMSG_SPACE(sizeof(struct ucred))];
696     struct msghdr hdr = {
697             NULL, 0, &iov, 1, control, sizeof(control), 0,
698     };
699     ssize_t ret;
700     ret = TEMP_FAILURE_RETRY(recvmsg(data_sock[dsock_idx].sock, &hdr, 0));
701     if (ret == -1) {
702         ALOGE("control data socket read failed; %s", strerror(errno));
703         return -1;
704     }
705     if (ret == 0) {
706         ALOGE("Got EOF on control data socket");
707         return -1;
708     }
709 
710     struct ucred* cred = NULL;
711     struct cmsghdr* cmsg = CMSG_FIRSTHDR(&hdr);
712     while (cmsg != NULL) {
713         if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDENTIALS) {
714             cred = (struct ucred*)CMSG_DATA(cmsg);
715             break;
716         }
717         cmsg = CMSG_NXTHDR(&hdr, cmsg);
718     }
719 
720     if (cred == NULL) {
721         ALOGE("Failed to retrieve sender credentials");
722         /* Close the connection */
723         ctrl_data_close(dsock_idx);
724         return -1;
725     }
726 
727     memcpy(sender_cred, cred, sizeof(struct ucred));
728 
729     /* Store PID of the peer */
730     data_sock[dsock_idx].pid = cred->pid;
731 
732     return ret;
733 }
734 
ctrl_data_write(int dsock_idx,char * buf,size_t bufsz)735 static int ctrl_data_write(int dsock_idx, char* buf, size_t bufsz) {
736     int ret = 0;
737 
738     ret = TEMP_FAILURE_RETRY(write(data_sock[dsock_idx].sock, buf, bufsz));
739 
740     if (ret == -1) {
741         ALOGE("control data socket write failed; errno=%d", errno);
742     } else if (ret == 0) {
743         ALOGE("Got EOF on control data socket");
744         ret = -1;
745     }
746 
747     return ret;
748 }
749 
750 /*
751  * Write the pid/uid pair over the data socket, note: all active clients
752  * will receive this unsolicited notification.
753  */
ctrl_data_write_lmk_kill_occurred(pid_t pid,uid_t uid)754 static void ctrl_data_write_lmk_kill_occurred(pid_t pid, uid_t uid) {
755     LMKD_CTRL_PACKET packet;
756     size_t len = lmkd_pack_set_prockills(packet, pid, uid);
757 
758     for (int i = 0; i < MAX_DATA_CONN; i++) {
759         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_KILL) {
760             ctrl_data_write(i, (char*)packet, len);
761         }
762     }
763 }
764 
765 /*
766  * Write the kill_stat/memory_stat over the data socket to be propagated via AMS to statsd
767  */
stats_write_lmk_kill_occurred(struct kill_stat * kill_st,struct memory_stat * mem_st)768 static void stats_write_lmk_kill_occurred(struct kill_stat *kill_st,
769                                           struct memory_stat *mem_st) {
770     LMK_KILL_OCCURRED_PACKET packet;
771     const size_t len = lmkd_pack_set_kill_occurred(packet, kill_st, mem_st);
772     if (len == 0) {
773         return;
774     }
775 
776     for (int i = 0; i < MAX_DATA_CONN; i++) {
777         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) {
778             ctrl_data_write(i, packet, len);
779         }
780     }
781 
782 }
783 
stats_write_lmk_kill_occurred_pid(int pid,struct kill_stat * kill_st,struct memory_stat * mem_st)784 static void stats_write_lmk_kill_occurred_pid(int pid, struct kill_stat *kill_st,
785                                               struct memory_stat *mem_st) {
786     kill_st->taskname = stats_get_task_name(pid);
787     if (kill_st->taskname != NULL) {
788         stats_write_lmk_kill_occurred(kill_st, mem_st);
789     }
790 }
791 
792 /*
793  * Write the state_changed over the data socket to be propagated via AMS to statsd
794  */
stats_write_lmk_state_changed(enum lmk_state state)795 static void stats_write_lmk_state_changed(enum lmk_state state) {
796     LMKD_CTRL_PACKET packet_state_changed;
797     const size_t len = lmkd_pack_set_state_changed(packet_state_changed, state);
798     if (len == 0) {
799         return;
800     }
801     for (int i = 0; i < MAX_DATA_CONN; i++) {
802         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) {
803             ctrl_data_write(i, (char*)packet_state_changed, len);
804         }
805     }
806 }
807 
poll_kernel(int poll_fd)808 static void poll_kernel(int poll_fd) {
809     if (poll_fd == -1) {
810         // not waiting
811         return;
812     }
813 
814     while (1) {
815         char rd_buf[256];
816         int bytes_read = TEMP_FAILURE_RETRY(pread(poll_fd, (void*)rd_buf, sizeof(rd_buf), 0));
817         if (bytes_read <= 0) break;
818         rd_buf[bytes_read] = '\0';
819 
820         int64_t pid;
821         int64_t uid;
822         int64_t group_leader_pid;
823         int64_t rss_in_pages;
824         struct memory_stat mem_st = {};
825         int16_t oom_score_adj;
826         int16_t min_score_adj;
827         int64_t starttime;
828         char* taskname = 0;
829 
830         int fields_read =
831                 sscanf(rd_buf,
832                        "%" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64
833                        " %" SCNd16 " %" SCNd16 " %" SCNd64 "\n%m[^\n]",
834                        &pid, &uid, &group_leader_pid, &mem_st.pgfault, &mem_st.pgmajfault,
835                        &rss_in_pages, &oom_score_adj, &min_score_adj, &starttime, &taskname);
836 
837         /* only the death of the group leader process is logged */
838         if (fields_read == 10 && group_leader_pid == pid) {
839             ctrl_data_write_lmk_kill_occurred((pid_t)pid, (uid_t)uid);
840             mem_st.process_start_time_ns = starttime * (NS_PER_SEC / sysconf(_SC_CLK_TCK));
841             mem_st.rss_in_bytes = rss_in_pages * PAGE_SIZE;
842 
843             struct kill_stat kill_st = {
844                 .uid = static_cast<int32_t>(uid),
845                 .kill_reason = NONE,
846                 .oom_score = oom_score_adj,
847                 .min_oom_score = min_score_adj,
848                 .free_mem_kb = 0,
849                 .free_swap_kb = 0,
850             };
851             stats_write_lmk_kill_occurred_pid(pid, &kill_st, &mem_st);
852         }
853 
854         free(taskname);
855     }
856 }
857 
init_poll_kernel()858 static bool init_poll_kernel() {
859     kpoll_fd = TEMP_FAILURE_RETRY(open("/proc/lowmemorykiller", O_RDONLY | O_NONBLOCK | O_CLOEXEC));
860 
861     if (kpoll_fd < 0) {
862         ALOGE("kernel lmk event file could not be opened; errno=%d", errno);
863         return false;
864     }
865 
866     return true;
867 }
868 
pid_lookup(int pid)869 static struct proc *pid_lookup(int pid) {
870     struct proc *procp;
871 
872     for (procp = pidhash[pid_hashfn(pid)]; procp && procp->pid != pid;
873          procp = procp->pidhash_next)
874             ;
875 
876     return procp;
877 }
878 
adjslot_insert(struct adjslot_list * head,struct adjslot_list * new_element)879 static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new_element)
880 {
881     struct adjslot_list *next = head->next;
882     new_element->prev = head;
883     new_element->next = next;
884     next->prev = new_element;
885     head->next = new_element;
886 }
887 
adjslot_remove(struct adjslot_list * old)888 static void adjslot_remove(struct adjslot_list *old)
889 {
890     struct adjslot_list *prev = old->prev;
891     struct adjslot_list *next = old->next;
892     next->prev = prev;
893     prev->next = next;
894 }
895 
adjslot_tail(struct adjslot_list * head)896 static struct adjslot_list *adjslot_tail(struct adjslot_list *head) {
897     struct adjslot_list *asl = head->prev;
898 
899     return asl == head ? NULL : asl;
900 }
901 
proc_slot(struct proc * procp)902 static void proc_slot(struct proc *procp) {
903     int adjslot = ADJTOSLOT(procp->oomadj);
904 
905     adjslot_insert(&procadjslot_list[adjslot], &procp->asl);
906 }
907 
proc_unslot(struct proc * procp)908 static void proc_unslot(struct proc *procp) {
909     adjslot_remove(&procp->asl);
910 }
911 
proc_insert(struct proc * procp)912 static void proc_insert(struct proc *procp) {
913     int hval = pid_hashfn(procp->pid);
914 
915     procp->pidhash_next = pidhash[hval];
916     pidhash[hval] = procp;
917     proc_slot(procp);
918 }
919 
pid_remove(int pid)920 static int pid_remove(int pid) {
921     int hval = pid_hashfn(pid);
922     struct proc *procp;
923     struct proc *prevp;
924 
925     for (procp = pidhash[hval], prevp = NULL; procp && procp->pid != pid;
926          procp = procp->pidhash_next)
927             prevp = procp;
928 
929     if (!procp)
930         return -1;
931 
932     if (!prevp)
933         pidhash[hval] = procp->pidhash_next;
934     else
935         prevp->pidhash_next = procp->pidhash_next;
936 
937     proc_unslot(procp);
938     /*
939      * Close pidfd here if we are not waiting for corresponding process to die,
940      * in which case stop_wait_for_proc_kill() will close the pidfd later
941      */
942     if (procp->pidfd >= 0 && procp->pidfd != last_kill_pid_or_fd) {
943         close(procp->pidfd);
944     }
945     free(procp);
946     return 0;
947 }
948 
949 /*
950  * Write a string to a file.
951  * Returns false if the file does not exist.
952  */
writefilestring(const char * path,const char * s,bool err_if_missing)953 static bool writefilestring(const char *path, const char *s,
954                             bool err_if_missing) {
955     int fd = open(path, O_WRONLY | O_CLOEXEC);
956     ssize_t len = strlen(s);
957     ssize_t ret;
958 
959     if (fd < 0) {
960         if (err_if_missing) {
961             ALOGE("Error opening %s; errno=%d", path, errno);
962         }
963         return false;
964     }
965 
966     ret = TEMP_FAILURE_RETRY(write(fd, s, len));
967     if (ret < 0) {
968         ALOGE("Error writing %s; errno=%d", path, errno);
969     } else if (ret < len) {
970         ALOGE("Short write on %s; length=%zd", path, ret);
971     }
972 
973     close(fd);
974     return true;
975 }
976 
get_time_diff_ms(struct timespec * from,struct timespec * to)977 static inline long get_time_diff_ms(struct timespec *from,
978                                     struct timespec *to) {
979     return (to->tv_sec - from->tv_sec) * (long)MS_PER_SEC +
980            (to->tv_nsec - from->tv_nsec) / (long)NS_PER_MS;
981 }
982 
983 /* Reads /proc/pid/status into buf. */
read_proc_status(int pid,char * buf,size_t buf_sz)984 static bool read_proc_status(int pid, char *buf, size_t buf_sz) {
985     char path[PATH_MAX];
986     int fd;
987     ssize_t size;
988 
989     snprintf(path, PATH_MAX, "/proc/%d/status", pid);
990     fd = open(path, O_RDONLY | O_CLOEXEC);
991     if (fd < 0) {
992         return false;
993     }
994 
995     size = read_all(fd, buf, buf_sz - 1);
996     close(fd);
997     if (size < 0) {
998         return false;
999     }
1000     buf[size] = 0;
1001     return true;
1002 }
1003 
1004 /* Looks for tag in buf and parses the first integer */
parse_status_tag(char * buf,const char * tag,int64_t * out)1005 static bool parse_status_tag(char *buf, const char *tag, int64_t *out) {
1006     char *pos = buf;
1007     while (true) {
1008         pos = strstr(pos, tag);
1009         /* Stop if tag not found or found at the line beginning */
1010         if (pos == NULL || pos == buf || pos[-1] == '\n') {
1011             break;
1012         }
1013         pos++;
1014     }
1015 
1016     if (pos == NULL) {
1017         return false;
1018     }
1019 
1020     pos += strlen(tag);
1021     while (*pos == ' ') ++pos;
1022     return parse_int64(pos, out);
1023 }
1024 
proc_get_size(int pid)1025 static int proc_get_size(int pid) {
1026     char path[PATH_MAX];
1027     char line[LINE_MAX];
1028     int fd;
1029     int rss = 0;
1030     int total;
1031     ssize_t ret;
1032 
1033     /* gid containing AID_READPROC required */
1034     snprintf(path, PATH_MAX, "/proc/%d/statm", pid);
1035     fd = open(path, O_RDONLY | O_CLOEXEC);
1036     if (fd == -1)
1037         return -1;
1038 
1039     ret = read_all(fd, line, sizeof(line) - 1);
1040     if (ret < 0) {
1041         close(fd);
1042         return -1;
1043     }
1044     line[ret] = '\0';
1045 
1046     sscanf(line, "%d %d ", &total, &rss);
1047     close(fd);
1048     return rss;
1049 }
1050 
proc_get_name(int pid,char * buf,size_t buf_size)1051 static char *proc_get_name(int pid, char *buf, size_t buf_size) {
1052     char path[PATH_MAX];
1053     int fd;
1054     char *cp;
1055     ssize_t ret;
1056 
1057     /* gid containing AID_READPROC required */
1058     snprintf(path, PATH_MAX, "/proc/%d/cmdline", pid);
1059     fd = open(path, O_RDONLY | O_CLOEXEC);
1060     if (fd == -1) {
1061         return NULL;
1062     }
1063     ret = read_all(fd, buf, buf_size - 1);
1064     close(fd);
1065     if (ret < 0) {
1066         return NULL;
1067     }
1068     buf[ret] = '\0';
1069 
1070     cp = strchr(buf, ' ');
1071     if (cp) {
1072         *cp = '\0';
1073     }
1074 
1075     return buf;
1076 }
1077 
cmd_procprio(LMKD_CTRL_PACKET packet,int field_count,struct ucred * cred)1078 static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred *cred) {
1079     struct proc *procp;
1080     char path[LINE_MAX];
1081     char val[20];
1082     int soft_limit_mult;
1083     struct lmk_procprio params;
1084     bool is_system_server;
1085     struct passwd *pwdrec;
1086     int64_t tgid;
1087     char buf[PAGE_SIZE];
1088 
1089     lmkd_pack_get_procprio(packet, field_count, &params);
1090 
1091     if (params.oomadj < OOM_SCORE_ADJ_MIN ||
1092         params.oomadj > OOM_SCORE_ADJ_MAX) {
1093         ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
1094         return;
1095     }
1096 
1097     if (params.ptype < PROC_TYPE_FIRST || params.ptype >= PROC_TYPE_COUNT) {
1098         ALOGE("Invalid PROCPRIO process type argument %d", params.ptype);
1099         return;
1100     }
1101 
1102     /* Check if registered process is a thread group leader */
1103     if (read_proc_status(params.pid, buf, sizeof(buf))) {
1104         if (parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid) && tgid != params.pid) {
1105             ALOGE("Attempt to register a task that is not a thread group leader "
1106                   "(tid %d, tgid %" PRId64 ")", params.pid, tgid);
1107             return;
1108         }
1109     }
1110 
1111     /* gid containing AID_READPROC required */
1112     /* CAP_SYS_RESOURCE required */
1113     /* CAP_DAC_OVERRIDE required */
1114     snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
1115     snprintf(val, sizeof(val), "%d", params.oomadj);
1116     if (!writefilestring(path, val, false)) {
1117         ALOGW("Failed to open %s; errno=%d: process %d might have been killed",
1118               path, errno, params.pid);
1119         /* If this file does not exist the process is dead. */
1120         return;
1121     }
1122 
1123     if (use_inkernel_interface) {
1124         stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)));
1125         return;
1126     }
1127 
1128     /* lmkd should not change soft limits for services */
1129     if (params.ptype == PROC_TYPE_APP && per_app_memcg) {
1130         if (params.oomadj >= 900) {
1131             soft_limit_mult = 0;
1132         } else if (params.oomadj >= 800) {
1133             soft_limit_mult = 0;
1134         } else if (params.oomadj >= 700) {
1135             soft_limit_mult = 0;
1136         } else if (params.oomadj >= 600) {
1137             // Launcher should be perceptible, don't kill it.
1138             params.oomadj = 200;
1139             soft_limit_mult = 1;
1140         } else if (params.oomadj >= 500) {
1141             soft_limit_mult = 0;
1142         } else if (params.oomadj >= 400) {
1143             soft_limit_mult = 0;
1144         } else if (params.oomadj >= 300) {
1145             soft_limit_mult = 1;
1146         } else if (params.oomadj >= 200) {
1147             soft_limit_mult = 8;
1148         } else if (params.oomadj >= 100) {
1149             soft_limit_mult = 10;
1150         } else if (params.oomadj >=   0) {
1151             soft_limit_mult = 20;
1152         } else {
1153             // Persistent processes will have a large
1154             // soft limit 512MB.
1155             soft_limit_mult = 64;
1156         }
1157 
1158         snprintf(path, sizeof(path), MEMCG_SYSFS_PATH
1159                  "apps/uid_%d/pid_%d/memory.soft_limit_in_bytes",
1160                  params.uid, params.pid);
1161         snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);
1162 
1163         /*
1164          * system_server process has no memcg under /dev/memcg/apps but should be
1165          * registered with lmkd. This is the best way so far to identify it.
1166          */
1167         is_system_server = (params.oomadj == SYSTEM_ADJ &&
1168                             (pwdrec = getpwnam("system")) != NULL &&
1169                             params.uid == pwdrec->pw_uid);
1170         writefilestring(path, val, !is_system_server);
1171     }
1172 
1173     procp = pid_lookup(params.pid);
1174     if (!procp) {
1175         int pidfd = -1;
1176 
1177         if (pidfd_supported) {
1178             pidfd = TEMP_FAILURE_RETRY(pidfd_open(params.pid, 0));
1179             if (pidfd < 0) {
1180                 ALOGE("pidfd_open for pid %d failed; errno=%d", params.pid, errno);
1181                 return;
1182             }
1183         }
1184 
1185         procp = static_cast<struct proc*>(calloc(1, sizeof(struct proc)));
1186         if (!procp) {
1187             // Oh, the irony.  May need to rebuild our state.
1188             return;
1189         }
1190 
1191         procp->pid = params.pid;
1192         procp->pidfd = pidfd;
1193         procp->uid = params.uid;
1194         procp->reg_pid = cred->pid;
1195         procp->oomadj = params.oomadj;
1196         proc_insert(procp);
1197     } else {
1198         if (!claim_record(procp, cred->pid)) {
1199             char buf[LINE_MAX];
1200             char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1201             /* Only registrant of the record can remove it */
1202             ALOGE("%s (%d, %d) attempts to modify a process registered by another client",
1203                 taskname ? taskname : "A process ", cred->uid, cred->pid);
1204             return;
1205         }
1206         proc_unslot(procp);
1207         procp->oomadj = params.oomadj;
1208         proc_slot(procp);
1209     }
1210 }
1211 
cmd_procremove(LMKD_CTRL_PACKET packet,struct ucred * cred)1212 static void cmd_procremove(LMKD_CTRL_PACKET packet, struct ucred *cred) {
1213     struct lmk_procremove params;
1214     struct proc *procp;
1215 
1216     lmkd_pack_get_procremove(packet, &params);
1217 
1218     if (use_inkernel_interface) {
1219         /*
1220          * Perform an extra check before the pid is removed, after which it
1221          * will be impossible for poll_kernel to get the taskname. poll_kernel()
1222          * is potentially a long-running blocking function; however this method
1223          * handles AMS requests but does not block AMS.
1224          */
1225         poll_kernel(kpoll_fd);
1226 
1227         stats_remove_taskname(params.pid);
1228         return;
1229     }
1230 
1231     procp = pid_lookup(params.pid);
1232     if (!procp) {
1233         return;
1234     }
1235 
1236     if (!claim_record(procp, cred->pid)) {
1237         char buf[LINE_MAX];
1238         char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1239         /* Only registrant of the record can remove it */
1240         ALOGE("%s (%d, %d) attempts to unregister a process registered by another client",
1241             taskname ? taskname : "A process ", cred->uid, cred->pid);
1242         return;
1243     }
1244 
1245     /*
1246      * WARNING: After pid_remove() procp is freed and can't be used!
1247      * Therefore placed at the end of the function.
1248      */
1249     pid_remove(params.pid);
1250 }
1251 
cmd_procpurge(struct ucred * cred)1252 static void cmd_procpurge(struct ucred *cred) {
1253     int i;
1254     struct proc *procp;
1255     struct proc *next;
1256 
1257     if (use_inkernel_interface) {
1258         stats_purge_tasknames();
1259         return;
1260     }
1261 
1262     for (i = 0; i < PIDHASH_SZ; i++) {
1263         procp = pidhash[i];
1264         while (procp) {
1265             next = procp->pidhash_next;
1266             /* Purge only records created by the requestor */
1267             if (claim_record(procp, cred->pid)) {
1268                 pid_remove(procp->pid);
1269             }
1270             procp = next;
1271         }
1272     }
1273 }
1274 
cmd_subscribe(int dsock_idx,LMKD_CTRL_PACKET packet)1275 static void cmd_subscribe(int dsock_idx, LMKD_CTRL_PACKET packet) {
1276     struct lmk_subscribe params;
1277 
1278     lmkd_pack_get_subscribe(packet, &params);
1279     data_sock[dsock_idx].async_event_mask |= 1 << params.evt_type;
1280 }
1281 
inc_killcnt(int oomadj)1282 static void inc_killcnt(int oomadj) {
1283     int slot = ADJTOSLOT(oomadj);
1284     uint8_t idx = killcnt_idx[slot];
1285 
1286     if (idx == KILLCNT_INVALID_IDX) {
1287         /* index is not assigned for this oomadj */
1288         if (killcnt_free_idx < MAX_DISTINCT_OOM_ADJ) {
1289             killcnt_idx[slot] = killcnt_free_idx;
1290             killcnt[killcnt_free_idx] = 1;
1291             killcnt_free_idx++;
1292         } else {
1293             ALOGW("Number of distinct oomadj levels exceeds %d",
1294                 MAX_DISTINCT_OOM_ADJ);
1295         }
1296     } else {
1297         /*
1298          * wraparound is highly unlikely and is detectable using total
1299          * counter because it has to be equal to the sum of all counters
1300          */
1301         killcnt[idx]++;
1302     }
1303     /* increment total kill counter */
1304     killcnt_total++;
1305 }
1306 
get_killcnt(int min_oomadj,int max_oomadj)1307 static int get_killcnt(int min_oomadj, int max_oomadj) {
1308     int slot;
1309     int count = 0;
1310 
1311     if (min_oomadj > max_oomadj)
1312         return 0;
1313 
1314     /* special case to get total kill count */
1315     if (min_oomadj > OOM_SCORE_ADJ_MAX)
1316         return killcnt_total;
1317 
1318     while (min_oomadj <= max_oomadj &&
1319            (slot = ADJTOSLOT(min_oomadj)) < ADJTOSLOT_COUNT) {
1320         uint8_t idx = killcnt_idx[slot];
1321         if (idx != KILLCNT_INVALID_IDX) {
1322             count += killcnt[idx];
1323         }
1324         min_oomadj++;
1325     }
1326 
1327     return count;
1328 }
1329 
cmd_getkillcnt(LMKD_CTRL_PACKET packet)1330 static int cmd_getkillcnt(LMKD_CTRL_PACKET packet) {
1331     struct lmk_getkillcnt params;
1332 
1333     if (use_inkernel_interface) {
1334         /* kernel driver does not expose this information */
1335         return 0;
1336     }
1337 
1338     lmkd_pack_get_getkillcnt(packet, &params);
1339 
1340     return get_killcnt(params.min_oomadj, params.max_oomadj);
1341 }
1342 
cmd_target(int ntargets,LMKD_CTRL_PACKET packet)1343 static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) {
1344     int i;
1345     struct lmk_target target;
1346     char minfree_str[PROPERTY_VALUE_MAX];
1347     char *pstr = minfree_str;
1348     char *pend = minfree_str + sizeof(minfree_str);
1349     static struct timespec last_req_tm;
1350     struct timespec curr_tm;
1351 
1352     if (ntargets < 1 || ntargets > (int)ARRAY_SIZE(lowmem_adj))
1353         return;
1354 
1355     /*
1356      * Ratelimit minfree updates to once per TARGET_UPDATE_MIN_INTERVAL_MS
1357      * to prevent DoS attacks
1358      */
1359     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1360         ALOGE("Failed to get current time");
1361         return;
1362     }
1363 
1364     if (get_time_diff_ms(&last_req_tm, &curr_tm) <
1365         TARGET_UPDATE_MIN_INTERVAL_MS) {
1366         ALOGE("Ignoring frequent updated to lmkd limits");
1367         return;
1368     }
1369 
1370     last_req_tm = curr_tm;
1371 
1372     for (i = 0; i < ntargets; i++) {
1373         lmkd_pack_get_target(packet, i, &target);
1374         lowmem_minfree[i] = target.minfree;
1375         lowmem_adj[i] = target.oom_adj_score;
1376 
1377         pstr += snprintf(pstr, pend - pstr, "%d:%d,", target.minfree,
1378             target.oom_adj_score);
1379         if (pstr >= pend) {
1380             /* if no more space in the buffer then terminate the loop */
1381             pstr = pend;
1382             break;
1383         }
1384     }
1385 
1386     lowmem_targets_size = ntargets;
1387 
1388     /* Override the last extra comma */
1389     pstr[-1] = '\0';
1390     property_set("sys.lmk.minfree_levels", minfree_str);
1391 
1392     if (has_inkernel_module) {
1393         char minfreestr[128];
1394         char killpriostr[128];
1395 
1396         minfreestr[0] = '\0';
1397         killpriostr[0] = '\0';
1398 
1399         for (i = 0; i < lowmem_targets_size; i++) {
1400             char val[40];
1401 
1402             if (i) {
1403                 strlcat(minfreestr, ",", sizeof(minfreestr));
1404                 strlcat(killpriostr, ",", sizeof(killpriostr));
1405             }
1406 
1407             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_minfree[i] : 0);
1408             strlcat(minfreestr, val, sizeof(minfreestr));
1409             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_adj[i] : 0);
1410             strlcat(killpriostr, val, sizeof(killpriostr));
1411         }
1412 
1413         writefilestring(INKERNEL_MINFREE_PATH, minfreestr, true);
1414         writefilestring(INKERNEL_ADJ_PATH, killpriostr, true);
1415     }
1416 }
1417 
ctrl_command_handler(int dsock_idx)1418 static void ctrl_command_handler(int dsock_idx) {
1419     LMKD_CTRL_PACKET packet;
1420     struct ucred cred;
1421     int len;
1422     enum lmk_cmd cmd;
1423     int nargs;
1424     int targets;
1425     int kill_cnt;
1426     int result;
1427 
1428     len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred);
1429     if (len <= 0)
1430         return;
1431 
1432     if (len < (int)sizeof(int)) {
1433         ALOGE("Wrong control socket read length len=%d", len);
1434         return;
1435     }
1436 
1437     cmd = lmkd_pack_get_cmd(packet);
1438     nargs = len / sizeof(int) - 1;
1439     if (nargs < 0)
1440         goto wronglen;
1441 
1442     switch(cmd) {
1443     case LMK_TARGET:
1444         targets = nargs / 2;
1445         if (nargs & 0x1 || targets > (int)ARRAY_SIZE(lowmem_adj))
1446             goto wronglen;
1447         cmd_target(targets, packet);
1448         break;
1449     case LMK_PROCPRIO:
1450         /* process type field is optional for backward compatibility */
1451         if (nargs < 3 || nargs > 4)
1452             goto wronglen;
1453         cmd_procprio(packet, nargs, &cred);
1454         break;
1455     case LMK_PROCREMOVE:
1456         if (nargs != 1)
1457             goto wronglen;
1458         cmd_procremove(packet, &cred);
1459         break;
1460     case LMK_PROCPURGE:
1461         if (nargs != 0)
1462             goto wronglen;
1463         cmd_procpurge(&cred);
1464         break;
1465     case LMK_GETKILLCNT:
1466         if (nargs != 2)
1467             goto wronglen;
1468         kill_cnt = cmd_getkillcnt(packet);
1469         len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
1470         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
1471             return;
1472         break;
1473     case LMK_SUBSCRIBE:
1474         if (nargs != 1)
1475             goto wronglen;
1476         cmd_subscribe(dsock_idx, packet);
1477         break;
1478     case LMK_PROCKILL:
1479         /* This command code is NOT expected at all */
1480         ALOGE("Received unexpected command code %d", cmd);
1481         break;
1482     case LMK_UPDATE_PROPS:
1483         if (nargs != 0)
1484             goto wronglen;
1485         update_props();
1486         if (!use_inkernel_interface) {
1487             /* Reinitialize monitors to apply new settings */
1488             destroy_monitors();
1489             result = init_monitors() ? 0 : -1;
1490         } else {
1491             result = 0;
1492         }
1493         len = lmkd_pack_set_update_props_repl(packet, result);
1494         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) {
1495             ALOGE("Failed to report operation results");
1496         }
1497         if (!result) {
1498             ALOGI("Properties reinitilized");
1499         } else {
1500             /* New settings can't be supported, crash to be restarted */
1501             ALOGE("New configuration is not supported. Exiting...");
1502             exit(1);
1503         }
1504         break;
1505     default:
1506         ALOGE("Received unknown command code %d", cmd);
1507         return;
1508     }
1509 
1510     return;
1511 
1512 wronglen:
1513     ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
1514 }
1515 
ctrl_data_handler(int data,uint32_t events,struct polling_params * poll_params __unused)1516 static void ctrl_data_handler(int data, uint32_t events,
1517                               struct polling_params *poll_params __unused) {
1518     if (events & EPOLLIN) {
1519         ctrl_command_handler(data);
1520     }
1521 }
1522 
get_free_dsock()1523 static int get_free_dsock() {
1524     for (int i = 0; i < MAX_DATA_CONN; i++) {
1525         if (data_sock[i].sock < 0) {
1526             return i;
1527         }
1528     }
1529     return -1;
1530 }
1531 
ctrl_connect_handler(int data __unused,uint32_t events __unused,struct polling_params * poll_params __unused)1532 static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
1533                                  struct polling_params *poll_params __unused) {
1534     struct epoll_event epev;
1535     int free_dscock_idx = get_free_dsock();
1536 
1537     if (free_dscock_idx < 0) {
1538         /*
1539          * Number of data connections exceeded max supported. This should not
1540          * happen but if it does we drop all existing connections and accept
1541          * the new one. This prevents inactive connections from monopolizing
1542          * data socket and if we drop ActivityManager connection it will
1543          * immediately reconnect.
1544          */
1545         for (int i = 0; i < MAX_DATA_CONN; i++) {
1546             ctrl_data_close(i);
1547         }
1548         free_dscock_idx = 0;
1549     }
1550 
1551     data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
1552     if (data_sock[free_dscock_idx].sock < 0) {
1553         ALOGE("lmkd control socket accept failed; errno=%d", errno);
1554         return;
1555     }
1556 
1557     ALOGI("lmkd data connection established");
1558     /* use data to store data connection idx */
1559     data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
1560     data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
1561     data_sock[free_dscock_idx].async_event_mask = 0;
1562     epev.events = EPOLLIN;
1563     epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
1564     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
1565         ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
1566         ctrl_data_close(free_dscock_idx);
1567         return;
1568     }
1569     maxevents++;
1570 }
1571 
1572 /*
1573  * /proc/zoneinfo parsing routines
1574  * Expected file format is:
1575  *
1576  *   Node <node_id>, zone   <zone_name>
1577  *   (
1578  *    per-node stats
1579  *       (<per-node field name> <value>)+
1580  *   )?
1581  *   (pages free     <value>
1582  *       (<per-zone field name> <value>)+
1583  *    pagesets
1584  *       (<unused fields>)*
1585  *   )+
1586  *   ...
1587  */
zoneinfo_parse_protection(char * buf,struct zoneinfo_zone * zone)1588 static void zoneinfo_parse_protection(char *buf, struct zoneinfo_zone *zone) {
1589     int zone_idx;
1590     int64_t max = 0;
1591     char *save_ptr;
1592 
1593     for (buf = strtok_r(buf, "(), ", &save_ptr), zone_idx = 0;
1594          buf && zone_idx < MAX_NR_ZONES;
1595          buf = strtok_r(NULL, "), ", &save_ptr), zone_idx++) {
1596         long long zoneval = strtoll(buf, &buf, 0);
1597         if (zoneval > max) {
1598             max = (zoneval > INT64_MAX) ? INT64_MAX : zoneval;
1599         }
1600         zone->protection[zone_idx] = zoneval;
1601     }
1602     zone->max_protection = max;
1603 }
1604 
zoneinfo_parse_zone(char ** buf,struct zoneinfo_zone * zone)1605 static int zoneinfo_parse_zone(char **buf, struct zoneinfo_zone *zone) {
1606     for (char *line = strtok_r(NULL, "\n", buf); line;
1607          line = strtok_r(NULL, "\n", buf)) {
1608         char *cp;
1609         char *ap;
1610         char *save_ptr;
1611         int64_t val;
1612         int field_idx;
1613         enum field_match_result match_res;
1614 
1615         cp = strtok_r(line, " ", &save_ptr);
1616         if (!cp) {
1617             return false;
1618         }
1619 
1620         field_idx = find_field(cp, zoneinfo_zone_spec_field_names, ZI_ZONE_SPEC_FIELD_COUNT);
1621         if (field_idx >= 0) {
1622             /* special field */
1623             if (field_idx == ZI_ZONE_SPEC_PAGESETS) {
1624                 /* no mode fields we are interested in */
1625                 return true;
1626             }
1627 
1628             /* protection field */
1629             ap = strtok_r(NULL, ")", &save_ptr);
1630             if (ap) {
1631                 zoneinfo_parse_protection(ap, zone);
1632             }
1633             continue;
1634         }
1635 
1636         ap = strtok_r(NULL, " ", &save_ptr);
1637         if (!ap) {
1638             continue;
1639         }
1640 
1641         match_res = match_field(cp, ap, zoneinfo_zone_field_names, ZI_ZONE_FIELD_COUNT,
1642             &val, &field_idx);
1643         if (match_res == PARSE_FAIL) {
1644             return false;
1645         }
1646         if (match_res == PARSE_SUCCESS) {
1647             zone->fields.arr[field_idx] = val;
1648         }
1649         if (field_idx == ZI_ZONE_PRESENT && val == 0) {
1650             /* zone is not populated, stop parsing it */
1651             return true;
1652         }
1653     }
1654     return false;
1655 }
1656 
zoneinfo_parse_node(char ** buf,struct zoneinfo_node * node)1657 static int zoneinfo_parse_node(char **buf, struct zoneinfo_node *node) {
1658     int fields_to_match = ZI_NODE_FIELD_COUNT;
1659 
1660     for (char *line = strtok_r(NULL, "\n", buf); line;
1661          line = strtok_r(NULL, "\n", buf)) {
1662         char *cp;
1663         char *ap;
1664         char *save_ptr;
1665         int64_t val;
1666         int field_idx;
1667         enum field_match_result match_res;
1668 
1669         cp = strtok_r(line, " ", &save_ptr);
1670         if (!cp) {
1671             return false;
1672         }
1673 
1674         ap = strtok_r(NULL, " ", &save_ptr);
1675         if (!ap) {
1676             return false;
1677         }
1678 
1679         match_res = match_field(cp, ap, zoneinfo_node_field_names, ZI_NODE_FIELD_COUNT,
1680             &val, &field_idx);
1681         if (match_res == PARSE_FAIL) {
1682             return false;
1683         }
1684         if (match_res == PARSE_SUCCESS) {
1685             node->fields.arr[field_idx] = val;
1686             fields_to_match--;
1687             if (!fields_to_match) {
1688                 return true;
1689             }
1690         }
1691     }
1692     return false;
1693 }
1694 
zoneinfo_parse(struct zoneinfo * zi)1695 static int zoneinfo_parse(struct zoneinfo *zi) {
1696     static struct reread_data file_data = {
1697         .filename = ZONEINFO_PATH,
1698         .fd = -1,
1699     };
1700     char *buf;
1701     char *save_ptr;
1702     char *line;
1703     char zone_name[LINE_MAX + 1];
1704     struct zoneinfo_node *node = NULL;
1705     int node_idx = 0;
1706     int zone_idx = 0;
1707 
1708     memset(zi, 0, sizeof(struct zoneinfo));
1709 
1710     if ((buf = reread_file(&file_data)) == NULL) {
1711         return -1;
1712     }
1713 
1714     for (line = strtok_r(buf, "\n", &save_ptr); line;
1715          line = strtok_r(NULL, "\n", &save_ptr)) {
1716         int node_id;
1717         if (sscanf(line, "Node %d, zone %" STRINGIFY(LINE_MAX) "s", &node_id, zone_name) == 2) {
1718             if (!node || node->id != node_id) {
1719                 /* new node is found */
1720                 if (node) {
1721                     node->zone_count = zone_idx + 1;
1722                     node_idx++;
1723                     if (node_idx == MAX_NR_NODES) {
1724                         /* max node count exceeded */
1725                         ALOGE("%s parse error", file_data.filename);
1726                         return -1;
1727                     }
1728                 }
1729                 node = &zi->nodes[node_idx];
1730                 node->id = node_id;
1731                 zone_idx = 0;
1732                 if (!zoneinfo_parse_node(&save_ptr, node)) {
1733                     ALOGE("%s parse error", file_data.filename);
1734                     return -1;
1735                 }
1736             } else {
1737                 /* new zone is found */
1738                 zone_idx++;
1739             }
1740             if (!zoneinfo_parse_zone(&save_ptr, &node->zones[zone_idx])) {
1741                 ALOGE("%s parse error", file_data.filename);
1742                 return -1;
1743             }
1744         }
1745     }
1746     if (!node) {
1747         ALOGE("%s parse error", file_data.filename);
1748         return -1;
1749     }
1750     node->zone_count = zone_idx + 1;
1751     zi->node_count = node_idx + 1;
1752 
1753     /* calculate totals fields */
1754     for (node_idx = 0; node_idx < zi->node_count; node_idx++) {
1755         node = &zi->nodes[node_idx];
1756         for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
1757             struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx];
1758             zi->totalreserve_pages += zone->max_protection + zone->fields.field.high;
1759         }
1760         zi->total_inactive_file += node->fields.field.nr_inactive_file;
1761         zi->total_active_file += node->fields.field.nr_active_file;
1762     }
1763     return 0;
1764 }
1765 
1766 /* /proc/meminfo parsing routines */
meminfo_parse_line(char * line,union meminfo * mi)1767 static bool meminfo_parse_line(char *line, union meminfo *mi) {
1768     char *cp = line;
1769     char *ap;
1770     char *save_ptr;
1771     int64_t val;
1772     int field_idx;
1773     enum field_match_result match_res;
1774 
1775     cp = strtok_r(line, " ", &save_ptr);
1776     if (!cp) {
1777         return false;
1778     }
1779 
1780     ap = strtok_r(NULL, " ", &save_ptr);
1781     if (!ap) {
1782         return false;
1783     }
1784 
1785     match_res = match_field(cp, ap, meminfo_field_names, MI_FIELD_COUNT,
1786         &val, &field_idx);
1787     if (match_res == PARSE_SUCCESS) {
1788         mi->arr[field_idx] = val / page_k;
1789     }
1790     return (match_res != PARSE_FAIL);
1791 }
1792 
read_gpu_total_kb()1793 static int64_t read_gpu_total_kb() {
1794     static int fd = android::bpf::bpfFdGet(
1795             "/sys/fs/bpf/map_gpu_mem_gpu_mem_total_map", BPF_F_RDONLY);
1796     static constexpr uint64_t kBpfKeyGpuTotalUsage = 0;
1797     uint64_t value;
1798 
1799     if (fd < 0) {
1800         return 0;
1801     }
1802 
1803     return android::bpf::findMapEntry(fd, &kBpfKeyGpuTotalUsage, &value)
1804             ? 0
1805             : (int32_t)(value / 1024);
1806 }
1807 
meminfo_parse(union meminfo * mi)1808 static int meminfo_parse(union meminfo *mi) {
1809     static struct reread_data file_data = {
1810         .filename = MEMINFO_PATH,
1811         .fd = -1,
1812     };
1813     char *buf;
1814     char *save_ptr;
1815     char *line;
1816 
1817     memset(mi, 0, sizeof(union meminfo));
1818 
1819     if ((buf = reread_file(&file_data)) == NULL) {
1820         return -1;
1821     }
1822 
1823     for (line = strtok_r(buf, "\n", &save_ptr); line;
1824          line = strtok_r(NULL, "\n", &save_ptr)) {
1825         if (!meminfo_parse_line(line, mi)) {
1826             ALOGE("%s parse error", file_data.filename);
1827             return -1;
1828         }
1829     }
1830     mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached +
1831         mi->field.buffers;
1832     mi->field.total_gpu_kb = read_gpu_total_kb();
1833 
1834     return 0;
1835 }
1836 
1837 /* /proc/vmstat parsing routines */
vmstat_parse_line(char * line,union vmstat * vs)1838 static bool vmstat_parse_line(char *line, union vmstat *vs) {
1839     char *cp;
1840     char *ap;
1841     char *save_ptr;
1842     int64_t val;
1843     int field_idx;
1844     enum field_match_result match_res;
1845 
1846     cp = strtok_r(line, " ", &save_ptr);
1847     if (!cp) {
1848         return false;
1849     }
1850 
1851     ap = strtok_r(NULL, " ", &save_ptr);
1852     if (!ap) {
1853         return false;
1854     }
1855 
1856     match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
1857         &val, &field_idx);
1858     if (match_res == PARSE_SUCCESS) {
1859         vs->arr[field_idx] = val;
1860     }
1861     return (match_res != PARSE_FAIL);
1862 }
1863 
vmstat_parse(union vmstat * vs)1864 static int vmstat_parse(union vmstat *vs) {
1865     static struct reread_data file_data = {
1866         .filename = VMSTAT_PATH,
1867         .fd = -1,
1868     };
1869     char *buf;
1870     char *save_ptr;
1871     char *line;
1872 
1873     memset(vs, 0, sizeof(union vmstat));
1874 
1875     if ((buf = reread_file(&file_data)) == NULL) {
1876         return -1;
1877     }
1878 
1879     for (line = strtok_r(buf, "\n", &save_ptr); line;
1880          line = strtok_r(NULL, "\n", &save_ptr)) {
1881         if (!vmstat_parse_line(line, vs)) {
1882             ALOGE("%s parse error", file_data.filename);
1883             return -1;
1884         }
1885     }
1886 
1887     return 0;
1888 }
1889 
/* Reason passed to record_wakeup_time() for a wakeup */
enum wakeup_reason {
    /* wakeup caused by an event; resets the per-event counters */
    Event,
    /* any other wakeup; counted in wakeups_since_event */
    Polling
};
1894 
/* Wakeup bookkeeping maintained by record_wakeup_time() */
struct wakeup_info {
    /* time of the most recent wakeup */
    struct timespec wakeup_tm;
    /* time of the wakeup before the most recent one */
    struct timespec prev_wakeup_tm;
    /* time of the most recent wakeup whose reason was Event */
    struct timespec last_event_tm;
    /* number of non-Event wakeups recorded since the last Event wakeup */
    int wakeups_since_event;
    /* reset to 0 on every Event wakeup; not incremented by record_wakeup_time() */
    int skipped_wakeups;
};
1902 
1903 /*
1904  * After the initial memory pressure event is received lmkd schedules periodic wakeups to check
1905  * the memory conditions and kill if needed (polling). This is done because pressure events are
1906  * rate-limited and memory conditions can change in between events. Therefore after the initial
1907  * event there might be multiple wakeups. This function records the wakeup information such as the
1908  * timestamps of the last event and the last wakeup, the number of wakeups since the last event
1909  * and how many of those wakeups were skipped (some wakeups are skipped if previously killed
1910  * process is still freeing its memory).
1911  */
record_wakeup_time(struct timespec * tm,enum wakeup_reason reason,struct wakeup_info * wi)1912 static void record_wakeup_time(struct timespec *tm, enum wakeup_reason reason,
1913                                struct wakeup_info *wi) {
1914     wi->prev_wakeup_tm = wi->wakeup_tm;
1915     wi->wakeup_tm = *tm;
1916     if (reason == Event) {
1917         wi->last_event_tm = *tm;
1918         wi->wakeups_since_event = 0;
1919         wi->skipped_wakeups = 0;
1920     } else {
1921         wi->wakeups_since_event++;
1922     }
1923 }
1924 
killinfo_log(struct proc * procp,int min_oom_score,int rss_kb,int swap_kb,int kill_reason,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm)1925 static void killinfo_log(struct proc* procp, int min_oom_score, int rss_kb,
1926                          int swap_kb, int kill_reason, union meminfo *mi,
1927                          struct wakeup_info *wi, struct timespec *tm) {
1928     /* log process information */
1929     android_log_write_int32(ctx, procp->pid);
1930     android_log_write_int32(ctx, procp->uid);
1931     android_log_write_int32(ctx, procp->oomadj);
1932     android_log_write_int32(ctx, min_oom_score);
1933     android_log_write_int32(ctx, (int32_t)min(rss_kb, INT32_MAX));
1934     android_log_write_int32(ctx, kill_reason);
1935 
1936     /* log meminfo fields */
1937     for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
1938         android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
1939     }
1940 
1941     /* log lmkd wakeup information */
1942     android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->last_event_tm, tm));
1943     android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->prev_wakeup_tm, tm));
1944     android_log_write_int32(ctx, wi->wakeups_since_event);
1945     android_log_write_int32(ctx, wi->skipped_wakeups);
1946     android_log_write_int32(ctx, (int32_t)min(swap_kb, INT32_MAX));
1947     android_log_write_int32(ctx, (int32_t)mi->field.total_gpu_kb);
1948 
1949     android_log_write_list(ctx, LOG_ID_EVENTS);
1950     android_log_reset(ctx);
1951 }
1952 
proc_adj_lru(int oomadj)1953 static struct proc *proc_adj_lru(int oomadj) {
1954     return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
1955 }
1956 
proc_get_heaviest(int oomadj)1957 static struct proc *proc_get_heaviest(int oomadj) {
1958     struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
1959     struct adjslot_list *curr = head->next;
1960     struct proc *maxprocp = NULL;
1961     int maxsize = 0;
1962     while (curr != head) {
1963         int pid = ((struct proc *)curr)->pid;
1964         int tasksize = proc_get_size(pid);
1965         if (tasksize < 0) {
1966             struct adjslot_list *next = curr->next;
1967             pid_remove(pid);
1968             curr = next;
1969         } else {
1970             if (tasksize > maxsize) {
1971                 maxsize = tasksize;
1972                 maxprocp = (struct proc *)curr;
1973             }
1974             curr = curr->next;
1975         }
1976     }
1977     return maxprocp;
1978 }
1979 
set_process_group_and_prio(int pid,SchedPolicy sp,int prio)1980 static void set_process_group_and_prio(int pid, SchedPolicy sp, int prio) {
1981     DIR* d;
1982     char proc_path[PATH_MAX];
1983     struct dirent* de;
1984 
1985     snprintf(proc_path, sizeof(proc_path), "/proc/%d/task", pid);
1986     if (!(d = opendir(proc_path))) {
1987         ALOGW("Failed to open %s; errno=%d: process pid(%d) might have died", proc_path, errno,
1988               pid);
1989         return;
1990     }
1991 
1992     while ((de = readdir(d))) {
1993         int t_pid;
1994 
1995         if (de->d_name[0] == '.') continue;
1996         t_pid = atoi(de->d_name);
1997 
1998         if (!t_pid) {
1999             ALOGW("Failed to get t_pid for '%s' of pid(%d)", de->d_name, pid);
2000             continue;
2001         }
2002 
2003         if (setpriority(PRIO_PROCESS, t_pid, prio) && errno != ESRCH) {
2004             ALOGW("Unable to raise priority of killing t_pid (%d): errno=%d", t_pid, errno);
2005         }
2006 
2007         if (set_cpuset_policy(t_pid, sp)) {
2008             ALOGW("Failed to set_cpuset_policy on pid(%d) t_pid(%d) to %d", pid, t_pid, (int)sp);
2009             continue;
2010         }
2011     }
2012     closedir(d);
2013 }
2014 
is_kill_pending(void)2015 static bool is_kill_pending(void) {
2016     char buf[24];
2017 
2018     if (last_kill_pid_or_fd < 0) {
2019         return false;
2020     }
2021 
2022     if (pidfd_supported) {
2023         return true;
2024     }
2025 
2026     /* when pidfd is not supported base the decision on /proc/<pid> existence */
2027     snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd);
2028     if (access(buf, F_OK) == 0) {
2029         return true;
2030     }
2031 
2032     return false;
2033 }
2034 
is_waiting_for_kill(void)2035 static bool is_waiting_for_kill(void) {
2036     return pidfd_supported && last_kill_pid_or_fd >= 0;
2037 }
2038 
stop_wait_for_proc_kill(bool finished)2039 static void stop_wait_for_proc_kill(bool finished) {
2040     struct epoll_event epev;
2041 
2042     if (last_kill_pid_or_fd < 0) {
2043         return;
2044     }
2045 
2046     if (debug_process_killing) {
2047         struct timespec curr_tm;
2048 
2049         if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2050             /*
2051              * curr_tm is used here merely to report kill duration, so this failure is not fatal.
2052              * Log an error and continue.
2053              */
2054             ALOGE("Failed to get current time");
2055         }
2056 
2057         if (finished) {
2058             ALOGI("Process got killed in %ldms",
2059                 get_time_diff_ms(&last_kill_tm, &curr_tm));
2060         } else {
2061             ALOGI("Stop waiting for process kill after %ldms",
2062                 get_time_diff_ms(&last_kill_tm, &curr_tm));
2063         }
2064     }
2065 
2066     if (pidfd_supported) {
2067         /* unregister fd */
2068         if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev)) {
2069             // Log an error and keep going
2070             ALOGE("epoll_ctl for last killed process failed; errno=%d", errno);
2071         }
2072         maxevents--;
2073         close(last_kill_pid_or_fd);
2074     }
2075 
2076     last_kill_pid_or_fd = -1;
2077 }
2078 
kill_done_handler(int data __unused,uint32_t events __unused,struct polling_params * poll_params)2079 static void kill_done_handler(int data __unused, uint32_t events __unused,
2080                               struct polling_params *poll_params) {
2081     stop_wait_for_proc_kill(true);
2082     poll_params->update = POLLING_RESUME;
2083 }
2084 
start_wait_for_proc_kill(int pid_or_fd)2085 static void start_wait_for_proc_kill(int pid_or_fd) {
2086     static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler };
2087     struct epoll_event epev;
2088 
2089     if (last_kill_pid_or_fd >= 0) {
2090         /* Should not happen but if it does we should stop previous wait */
2091         ALOGE("Attempt to wait for a kill while another wait is in progress");
2092         stop_wait_for_proc_kill(false);
2093     }
2094 
2095     last_kill_pid_or_fd = pid_or_fd;
2096 
2097     if (!pidfd_supported) {
2098         /* If pidfd is not supported just store PID and exit */
2099         return;
2100     }
2101 
2102     epev.events = EPOLLIN;
2103     epev.data.ptr = (void *)&kill_done_hinfo;
2104     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) {
2105         ALOGE("epoll_ctl for last kill failed; errno=%d", errno);
2106         close(last_kill_pid_or_fd);
2107         last_kill_pid_or_fd = -1;
2108         return;
2109     }
2110     maxevents++;
2111 }
2112 
/*
 * Describes why a kill is being requested; filled in by the pressure
 * handler and consumed by kill_one_process() for logging and statistics.
 */
struct kill_info {
    enum kill_reasons kill_reason;  /* reason code, copied into the kill statistics and event log */
    const char *kill_desc;          /* human-readable reason, printed in the "Kill ..." log line */
    int thrashing;                  /* thrashing value at the time of the kill decision */
    int max_thrashing;              /* highest thrashing value tracked by the caller */
};
2119 
2120 /* Kill one process specified by procp.  Returns the size (in pages) of the process killed */
kill_one_process(struct proc * procp,int min_oom_score,struct kill_info * ki,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm)2121 static int kill_one_process(struct proc* procp, int min_oom_score, struct kill_info *ki,
2122                             union meminfo *mi, struct wakeup_info *wi, struct timespec *tm) {
2123     int pid = procp->pid;
2124     int pidfd = procp->pidfd;
2125     uid_t uid = procp->uid;
2126     char *taskname;
2127     int r;
2128     int result = -1;
2129     struct memory_stat *mem_st;
2130     struct kill_stat kill_st;
2131     int64_t tgid;
2132     int64_t rss_kb;
2133     int64_t swap_kb;
2134     char buf[PAGE_SIZE];
2135 
2136     if (!read_proc_status(pid, buf, sizeof(buf))) {
2137         goto out;
2138     }
2139     if (!parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid)) {
2140         ALOGE("Unable to parse tgid from /proc/%d/status", pid);
2141         goto out;
2142     }
2143     if (tgid != pid) {
2144         ALOGE("Possible pid reuse detected (pid %d, tgid %" PRId64 ")!", pid, tgid);
2145         goto out;
2146     }
2147     // Zombie processes will not have RSS / Swap fields.
2148     if (!parse_status_tag(buf, PROC_STATUS_RSS_FIELD, &rss_kb)) {
2149         goto out;
2150     }
2151     if (!parse_status_tag(buf, PROC_STATUS_SWAP_FIELD, &swap_kb)) {
2152         goto out;
2153     }
2154 
2155     taskname = proc_get_name(pid, buf, sizeof(buf));
2156     // taskname will point inside buf, do not reuse buf onwards.
2157     if (!taskname) {
2158         goto out;
2159     }
2160 
2161     mem_st = stats_read_memory_stat(per_app_memcg, pid, uid, rss_kb * 1024, swap_kb * 1024);
2162 
2163     TRACE_KILL_START(pid);
2164 
2165     /* CAP_KILL required */
2166     if (pidfd < 0) {
2167         start_wait_for_proc_kill(pid);
2168         r = kill(pid, SIGKILL);
2169     } else {
2170         start_wait_for_proc_kill(pidfd);
2171         r = pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
2172     }
2173 
2174     TRACE_KILL_END();
2175 
2176     if (r) {
2177         stop_wait_for_proc_kill(false);
2178         ALOGE("kill(%d): errno=%d", pid, errno);
2179         /* Delete process record even when we fail to kill so that we don't get stuck on it */
2180         goto out;
2181     }
2182 
2183     set_process_group_and_prio(pid, SP_FOREGROUND, ANDROID_PRIORITY_HIGHEST);
2184 
2185     last_kill_tm = *tm;
2186 
2187     inc_killcnt(procp->oomadj);
2188 
2189     if (ki) {
2190         kill_st.kill_reason = ki->kill_reason;
2191         kill_st.thrashing = ki->thrashing;
2192         kill_st.max_thrashing = ki->max_thrashing;
2193         killinfo_log(procp, min_oom_score, rss_kb, swap_kb, ki->kill_reason, mi, wi, tm);
2194         ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2195               "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb,
2196               ki->kill_desc);
2197     } else {
2198         kill_st.kill_reason = NONE;
2199         kill_st.thrashing = 0;
2200         kill_st.max_thrashing = 0;
2201         killinfo_log(procp, min_oom_score, rss_kb, swap_kb, NONE, mi, wi, tm);
2202         ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2203               "kb swap", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb);
2204     }
2205 
2206     kill_st.uid = static_cast<int32_t>(uid);
2207     kill_st.taskname = taskname;
2208     kill_st.oom_score = procp->oomadj;
2209     kill_st.min_oom_score = min_oom_score;
2210     kill_st.free_mem_kb = mi->field.nr_free_pages * page_k;
2211     kill_st.free_swap_kb = mi->field.free_swap * page_k;
2212     stats_write_lmk_kill_occurred(&kill_st, mem_st);
2213 
2214     ctrl_data_write_lmk_kill_occurred((pid_t)pid, uid);
2215 
2216     result = rss_kb / page_k;
2217 
2218 out:
2219     /*
2220      * WARNING: After pid_remove() procp is freed and can't be used!
2221      * Therefore placed at the end of the function.
2222      */
2223     pid_remove(pid);
2224     return result;
2225 }
2226 
2227 /*
2228  * Find one process to kill at or above the given oom_score_adj level.
2229  * Returns size of the killed process.
2230  */
find_and_kill_process(int min_score_adj,struct kill_info * ki,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm)2231 static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi,
2232                                  struct wakeup_info *wi, struct timespec *tm) {
2233     int i;
2234     int killed_size = 0;
2235     bool lmk_state_change_start = false;
2236     bool choose_heaviest_task = kill_heaviest_task;
2237 
2238     for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
2239         struct proc *procp;
2240 
2241         if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {
2242             /*
2243              * If we have to choose a perceptible process, choose the heaviest one to
2244              * hopefully minimize the number of victims.
2245              */
2246             choose_heaviest_task = true;
2247         }
2248 
2249         while (true) {
2250             procp = choose_heaviest_task ?
2251                 proc_get_heaviest(i) : proc_adj_lru(i);
2252 
2253             if (!procp)
2254                 break;
2255 
2256             killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm);
2257             if (killed_size >= 0) {
2258                 if (!lmk_state_change_start) {
2259                     lmk_state_change_start = true;
2260                     stats_write_lmk_state_changed(STATE_START);
2261                 }
2262                 break;
2263             }
2264         }
2265         if (killed_size) {
2266             break;
2267         }
2268     }
2269 
2270     if (lmk_state_change_start) {
2271         stats_write_lmk_state_changed(STATE_STOP);
2272     }
2273 
2274     return killed_size;
2275 }
2276 
get_memory_usage(struct reread_data * file_data)2277 static int64_t get_memory_usage(struct reread_data *file_data) {
2278     int64_t mem_usage;
2279     char *buf;
2280 
2281     if ((buf = reread_file(file_data)) == NULL) {
2282         return -1;
2283     }
2284 
2285     if (!parse_int64(buf, &mem_usage)) {
2286         ALOGE("%s parse error", file_data->filename);
2287         return -1;
2288     }
2289     if (mem_usage == 0) {
2290         ALOGE("No memory!");
2291         return -1;
2292     }
2293     return mem_usage;
2294 }
2295 
record_low_pressure_levels(union meminfo * mi)2296 void record_low_pressure_levels(union meminfo *mi) {
2297     if (low_pressure_mem.min_nr_free_pages == -1 ||
2298         low_pressure_mem.min_nr_free_pages > mi->field.nr_free_pages) {
2299         if (debug_process_killing) {
2300             ALOGI("Low pressure min memory update from %" PRId64 " to %" PRId64,
2301                 low_pressure_mem.min_nr_free_pages, mi->field.nr_free_pages);
2302         }
2303         low_pressure_mem.min_nr_free_pages = mi->field.nr_free_pages;
2304     }
2305     /*
2306      * Free memory at low vmpressure events occasionally gets spikes,
2307      * possibly a stale low vmpressure event with memory already
2308      * freed up (no memory pressure should have been reported).
2309      * Ignore large jumps in max_nr_free_pages that would mess up our stats.
2310      */
2311     if (low_pressure_mem.max_nr_free_pages == -1 ||
2312         (low_pressure_mem.max_nr_free_pages < mi->field.nr_free_pages &&
2313          mi->field.nr_free_pages - low_pressure_mem.max_nr_free_pages <
2314          low_pressure_mem.max_nr_free_pages * 0.1)) {
2315         if (debug_process_killing) {
2316             ALOGI("Low pressure max memory update from %" PRId64 " to %" PRId64,
2317                 low_pressure_mem.max_nr_free_pages, mi->field.nr_free_pages);
2318         }
2319         low_pressure_mem.max_nr_free_pages = mi->field.nr_free_pages;
2320     }
2321 }
2322 
upgrade_level(enum vmpressure_level level)2323 enum vmpressure_level upgrade_level(enum vmpressure_level level) {
2324     return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ?
2325         level + 1 : level);
2326 }
2327 
downgrade_level(enum vmpressure_level level)2328 enum vmpressure_level downgrade_level(enum vmpressure_level level) {
2329     return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ?
2330         level - 1 : level);
2331 }
2332 
/*
 * Identifies the lowest zone watermark that free memory has dropped below.
 * The numeric order is significant: callers compare values with < and >,
 * so a smaller value means a more severe breach.
 */
enum zone_watermark {
    WMARK_MIN = 0,  /* free memory is below the min watermark */
    WMARK_LOW,      /* free memory is below the low watermark */
    WMARK_HIGH,     /* free memory is below the high watermark */
    WMARK_NONE      /* no watermark is breached */
};
2339 
/*
 * System-wide watermark thresholds, filled in by calc_zone_watermarks() as
 * sums over all present zones and compared against free memory by
 * get_lowest_watermark().
 * NOTE(review): values are presumably in pages, like nr_free_pages - confirm.
 */
struct zone_watermarks {
    long high_wmark;  /* sum of (max_protection + high) over present zones */
    long low_wmark;   /* sum of (max_protection + low) over present zones */
    long min_wmark;   /* sum of (max_protection + min) over present zones */
};
2345 
2346 /*
2347  * Returns lowest breached watermark or WMARK_NONE.
2348  */
get_lowest_watermark(union meminfo * mi,struct zone_watermarks * watermarks)2349 static enum zone_watermark get_lowest_watermark(union meminfo *mi,
2350                                                 struct zone_watermarks *watermarks)
2351 {
2352     int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;
2353 
2354     if (nr_free_pages < watermarks->min_wmark) {
2355         return WMARK_MIN;
2356     }
2357     if (nr_free_pages < watermarks->low_wmark) {
2358         return WMARK_LOW;
2359     }
2360     if (nr_free_pages < watermarks->high_wmark) {
2361         return WMARK_HIGH;
2362     }
2363     return WMARK_NONE;
2364 }
2365 
calc_zone_watermarks(struct zoneinfo * zi,struct zone_watermarks * watermarks)2366 void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
2367     memset(watermarks, 0, sizeof(struct zone_watermarks));
2368 
2369     for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
2370         struct zoneinfo_node *node = &zi->nodes[node_idx];
2371         for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
2372             struct zoneinfo_zone *zone = &node->zones[zone_idx];
2373 
2374             if (!zone->fields.field.present) {
2375                 continue;
2376             }
2377 
2378             watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
2379             watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
2380             watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
2381         }
2382     }
2383 }
2384 
calc_swap_utilization(union meminfo * mi)2385 static int calc_swap_utilization(union meminfo *mi) {
2386     int64_t swap_used = mi->field.total_swap - mi->field.free_swap;
2387     int64_t total_swappable = mi->field.active_anon + mi->field.inactive_anon +
2388                               mi->field.shmem + swap_used;
2389     return total_swappable > 0 ? (swap_used * 100) / total_swappable : 0;
2390 }
2391 
mp_event_psi(int data,uint32_t events,struct polling_params * poll_params)2392 static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
2393     enum reclaim_state {
2394         NO_RECLAIM = 0,
2395         KSWAPD_RECLAIM,
2396         DIRECT_RECLAIM,
2397     };
2398     static int64_t init_ws_refault;
2399     static int64_t prev_workingset_refault;
2400     static int64_t base_file_lru;
2401     static int64_t init_pgscan_kswapd;
2402     static int64_t init_pgscan_direct;
2403     static int64_t swap_low_threshold;
2404     static bool killing;
2405     static int thrashing_limit = thrashing_limit_pct;
2406     static struct zone_watermarks watermarks;
2407     static struct timespec wmark_update_tm;
2408     static struct wakeup_info wi;
2409     static struct timespec thrashing_reset_tm;
2410     static int64_t prev_thrash_growth = 0;
2411     static bool check_filecache = false;
2412     static int max_thrashing = 0;
2413 
2414     union meminfo mi;
2415     union vmstat vs;
2416     struct timespec curr_tm;
2417     int64_t thrashing = 0;
2418     bool swap_is_low = false;
2419     enum vmpressure_level level = (enum vmpressure_level)data;
2420     enum kill_reasons kill_reason = NONE;
2421     bool cycle_after_kill = false;
2422     enum reclaim_state reclaim = NO_RECLAIM;
2423     enum zone_watermark wmark = WMARK_NONE;
2424     char kill_desc[LINE_MAX];
2425     bool cut_thrashing_limit = false;
2426     int min_score_adj = 0;
2427     int swap_util = 0;
2428     long since_thrashing_reset_ms;
2429     int64_t workingset_refault_file;
2430 
2431     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2432         ALOGE("Failed to get current time");
2433         return;
2434     }
2435 
2436     record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2437 
2438     bool kill_pending = is_kill_pending();
2439     if (kill_pending && (kill_timeout_ms == 0 ||
2440         get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms))) {
2441         /* Skip while still killing a process */
2442         wi.skipped_wakeups++;
2443         goto no_kill;
2444     }
2445     /*
2446      * Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are
2447      * supported and death notification already caused waiting to stop.
2448      */
2449     stop_wait_for_proc_kill(!kill_pending);
2450 
2451     if (vmstat_parse(&vs) < 0) {
2452         ALOGE("Failed to parse vmstat!");
2453         return;
2454     }
2455     /* Starting 5.9 kernel workingset_refault vmstat field was renamed workingset_refault_file */
2456     workingset_refault_file = vs.field.workingset_refault ? : vs.field.workingset_refault_file;
2457 
2458     if (meminfo_parse(&mi) < 0) {
2459         ALOGE("Failed to parse meminfo!");
2460         return;
2461     }
2462 
2463     /* Reset states after process got killed */
2464     if (killing) {
2465         killing = false;
2466         cycle_after_kill = true;
2467         /* Reset file-backed pagecache size and refault amounts after a kill */
2468         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2469         init_ws_refault = workingset_refault_file;
2470         thrashing_reset_tm = curr_tm;
2471         prev_thrash_growth = 0;
2472     }
2473 
2474     /* Check free swap levels */
2475     if (swap_free_low_percentage) {
2476         if (!swap_low_threshold) {
2477             swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
2478         }
2479         swap_is_low = mi.field.free_swap < swap_low_threshold;
2480     }
2481 
2482     /* Identify reclaim state */
2483     if (vs.field.pgscan_direct > init_pgscan_direct) {
2484         init_pgscan_direct = vs.field.pgscan_direct;
2485         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2486         reclaim = DIRECT_RECLAIM;
2487     } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
2488         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2489         reclaim = KSWAPD_RECLAIM;
2490     } else if (workingset_refault_file == prev_workingset_refault) {
2491         /*
2492          * Device is not thrashing and not reclaiming, bail out early until we see these stats
2493          * changing
2494          */
2495         goto no_kill;
2496     }
2497 
2498     prev_workingset_refault = workingset_refault_file;
2499 
2500      /*
2501      * It's possible we fail to find an eligible process to kill (ex. no process is
2502      * above oom_adj_min). When this happens, we should retry to find a new process
2503      * for a kill whenever a new eligible process is available. This is especially
2504      * important for a slow growing refault case. While retrying, we should keep
2505      * monitoring new thrashing counter as someone could release the memory to mitigate
2506      * the thrashing. Thus, when thrashing reset window comes, we decay the prev thrashing
2507      * counter by window counts. If the counter is still greater than thrashing limit,
2508      * we preserve the current prev_thrash counter so we will retry kill again. Otherwise,
2509      * we reset the prev_thrash counter so we will stop retrying.
2510      */
2511     since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm);
2512     if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) {
2513         long windows_passed;
2514         /* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */
2515         prev_thrash_growth = (workingset_refault_file - init_ws_refault) * 100
2516                             / (base_file_lru + 1);
2517         windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS);
2518         /*
2519          * Decay prev_thrashing unless over-the-limit thrashing was registered in the window we
2520          * just crossed, which means there were no eligible processes to kill. We preserve the
2521          * counter in that case to ensure a kill if a new eligible process appears.
2522          */
2523         if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) {
2524             prev_thrash_growth >>= windows_passed;
2525         }
2526 
2527         /* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */
2528         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2529         init_ws_refault = workingset_refault_file;
2530         thrashing_reset_tm = curr_tm;
2531         thrashing_limit = thrashing_limit_pct;
2532     } else {
2533         /* Calculate what % of the file-backed pagecache refaulted so far */
2534         thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1);
2535     }
2536     /* Add previous cycle's decayed thrashing amount */
2537     thrashing += prev_thrash_growth;
2538     if (max_thrashing < thrashing) {
2539         max_thrashing = thrashing;
2540     }
2541 
2542     /*
2543      * Refresh watermarks once per min in case user updated one of the margins.
2544      * TODO: b/140521024 replace this periodic update with an API for AMS to notify LMKD
2545      * that zone watermarks were changed by the system software.
2546      */
2547     if (watermarks.high_wmark == 0 || get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000) {
2548         struct zoneinfo zi;
2549 
2550         if (zoneinfo_parse(&zi) < 0) {
2551             ALOGE("Failed to parse zoneinfo!");
2552             return;
2553         }
2554 
2555         calc_zone_watermarks(&zi, &watermarks);
2556         wmark_update_tm = curr_tm;
2557     }
2558 
2559     /* Find out which watermark is breached if any */
2560     wmark = get_lowest_watermark(&mi, &watermarks);
2561 
2562     /*
2563      * TODO: move this logic into a separate function
2564      * Decide if killing a process is necessary and record the reason
2565      */
2566     if (cycle_after_kill && wmark < WMARK_LOW) {
2567         /*
2568          * Prevent kills not freeing enough memory which might lead to OOM kill.
2569          * This might happen when a process is consuming memory faster than reclaim can
2570          * free even after a kill. Mostly happens when running memory stress tests.
2571          */
2572         kill_reason = PRESSURE_AFTER_KILL;
2573         strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
2574     } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
2575         /*
2576          * Device is too busy reclaiming memory which might lead to ANR.
2577          * Critical level is triggered when PSI complete stall (all tasks are blocked because
2578          * of the memory congestion) breaches the configured threshold.
2579          */
2580         kill_reason = NOT_RESPONDING;
2581         strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
2582     } else if (swap_is_low && thrashing > thrashing_limit_pct) {
2583         /* Page cache is thrashing while swap is low */
2584         kill_reason = LOW_SWAP_AND_THRASHING;
2585         snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
2586             "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
2587             mi.field.free_swap * page_k, swap_low_threshold * page_k, thrashing);
2588         /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2589         if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2590             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2591         }
2592         check_filecache = true;
2593     } else if (swap_is_low && wmark < WMARK_HIGH) {
2594         /* Both free memory and swap are low */
2595         kill_reason = LOW_MEM_AND_SWAP;
2596         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
2597             PRId64 "kB < %" PRId64 "kB)", wmark < WMARK_LOW ? "min" : "low",
2598             mi.field.free_swap * page_k, swap_low_threshold * page_k);
2599         /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2600         if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2601             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2602         }
2603     } else if (wmark < WMARK_HIGH && swap_util_max < 100 &&
2604                (swap_util = calc_swap_utilization(&mi)) > swap_util_max) {
2605         /*
2606          * Too much anon memory is swapped out but swap is not low.
2607          * Non-swappable allocations created memory pressure.
2608          */
2609         kill_reason = LOW_MEM_AND_SWAP_UTIL;
2610         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap utilization"
2611             " is high (%d%% > %d%%)", wmark < WMARK_LOW ? "min" : "low",
2612             swap_util, swap_util_max);
2613     } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
2614         /* Page cache is thrashing while memory is low */
2615         kill_reason = LOW_MEM_AND_THRASHING;
2616         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
2617             PRId64 "%%)", wmark < WMARK_LOW ? "min" : "low", thrashing);
2618         cut_thrashing_limit = true;
2619         /* Do not kill perceptible apps unless thrashing at critical levels */
2620         if (thrashing < thrashing_critical_pct) {
2621             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2622         }
2623         check_filecache = true;
2624     } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
2625         /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
2626         kill_reason = DIRECT_RECL_AND_THRASHING;
2627         snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
2628             PRId64 "%%)", thrashing);
2629         cut_thrashing_limit = true;
2630         /* Do not kill perceptible apps unless thrashing at critical levels */
2631         if (thrashing < thrashing_critical_pct) {
2632             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2633         }
2634         check_filecache = true;
2635     } else if (check_filecache) {
2636         int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;
2637 
2638         if (file_lru_kb < filecache_min_kb) {
2639             /* File cache is too low after thrashing, keep killing background processes */
2640             kill_reason = LOW_FILECACHE_AFTER_THRASHING;
2641             snprintf(kill_desc, sizeof(kill_desc),
2642                 "filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing",
2643                 file_lru_kb, filecache_min_kb);
2644             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2645         } else {
2646             /* File cache is big enough, stop checking */
2647             check_filecache = false;
2648         }
2649     }
2650 
2651     /* Kill a process if necessary */
2652     if (kill_reason != NONE) {
2653         struct kill_info ki = {
2654             .kill_reason = kill_reason,
2655             .kill_desc = kill_desc,
2656             .thrashing = (int)thrashing,
2657             .max_thrashing = max_thrashing,
2658         };
2659         int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm);
2660         if (pages_freed > 0) {
2661             killing = true;
2662             max_thrashing = 0;
2663             if (cut_thrashing_limit) {
2664                 /*
2665                  * Cut thrasing limit by thrashing_limit_decay_pct percentage of the current
2666                  * thrashing limit until the system stops thrashing.
2667                  */
2668                 thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
2669             }
2670         }
2671     }
2672 
2673 no_kill:
2674     /* Do not poll if kernel supports pidfd waiting */
2675     if (is_waiting_for_kill()) {
2676         /* Pause polling if we are waiting for process death notification */
2677         poll_params->update = POLLING_PAUSE;
2678         return;
2679     }
2680 
2681     /*
2682      * Start polling after initial PSI event;
2683      * extend polling while device is in direct reclaim or process is being killed;
2684      * do not extend when kswapd reclaims because that might go on for a long time
2685      * without causing memory pressure
2686      */
2687     if (events || killing || reclaim == DIRECT_RECLAIM) {
2688         poll_params->update = POLLING_START;
2689     }
2690 
2691     /* Decide the polling interval */
2692     if (swap_is_low || killing) {
2693         /* Fast polling during and after a kill or when swap is low */
2694         poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2695     } else {
2696         /* By default use long intervals */
2697         poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
2698     }
2699 }
2700 
/*
 * Memory pressure event handler installed by init_mp_common() for vmpressure
 * events and by init_mp_psi() when the new kill strategy is not selected.
 *
 * data        - pressure level the handler was registered with, interpreted
 *               as enum vmpressure_level
 * events      - epoll event mask; zero when invoked from the polling path
 *               (the wakeup is then recorded as Polling instead of Event)
 * poll_params - polling state; update and polling_interval_ms are written
 *               here to request polling changes from the caller
 *
 * Depending on configuration the kill threshold (min_score_adj) comes either
 * from the minfree table (use_minfree_levels) or from level_oomadj[] after
 * optional level upgrade/downgrade based on memcg usage and free swap.
 */
static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
    unsigned long long evcount;
    int64_t mem_usage, memsw_usage;
    int64_t mem_pressure;
    union meminfo mi;
    struct zoneinfo zi;
    struct timespec curr_tm;
    /* Number of events ignored while a previous kill was still pending */
    static unsigned long kill_skip_count = 0;
    enum vmpressure_level level = (enum vmpressure_level)data;
    long other_free = 0, other_file = 0;
    int min_score_adj;
    int minfree = 0;
    /* Static so the state in these structs persists across invocations */
    static struct reread_data mem_usage_file_data = {
        .filename = MEMCG_MEMORY_USAGE,
        .fd = -1,
    };
    static struct reread_data memsw_usage_file_data = {
        .filename = MEMCG_MEMORYSW_USAGE,
        .fd = -1,
    };
    static struct wakeup_info wi;

    if (debug_process_killing) {
        ALOGI("%s memory pressure event is triggered", level_name[level]);
    }

    if (!use_psi_monitors) {
        /*
         * Check all event counters from low to critical
         * and upgrade to the highest priority one. By reading
         * eventfd we also reset the event counters.
         */
        for (int lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) {
            if (mpevfd[lvl] != -1 &&
                TEMP_FAILURE_RETRY(read(mpevfd[lvl],
                                   &evcount, sizeof(evcount))) > 0 &&
                evcount > 0 && lvl > level) {
                level = static_cast<vmpressure_level>(lvl);
            }
        }
    }

    /* Start polling after initial PSI event */
    if (use_psi_monitors && events) {
        /* Override polling params only if current event is more critical */
        if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
            poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
            poll_params->update = POLLING_START;
        }
    }

    if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
        ALOGE("Failed to get current time");
        return;
    }

    record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);

    if (kill_timeout_ms &&
        get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms)) {
        /*
         * If we're within the no-kill timeout, see if there's pending reclaim work
         * from the last killed process. If so, skip killing for now.
         */
        if (is_kill_pending()) {
            kill_skip_count++;
            wi.skipped_wakeups++;
            return;
        }
        /*
         * Process is dead, stop waiting. This has no effect if pidfds are supported and
         * death notification already caused waiting to stop.
         */
        stop_wait_for_proc_kill(true);
    } else {
        /*
         * Killing took longer than no-kill timeout. Stop waiting for the last process
         * to die because we are ready to kill again.
         */
        stop_wait_for_proc_kill(false);
    }

    /* Report, once, how many events were dropped while the last kill was pending */
    if (kill_skip_count > 0) {
        ALOGI("%lu memory pressure events were skipped after a kill!",
              kill_skip_count);
        kill_skip_count = 0;
    }

    if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) {
        ALOGE("Failed to get free memory!");
        return;
    }

    if (use_minfree_levels) {
        int i;

        /* Free pages beyond the reserve, and file pages excluding shmem/unevictable/swapcache */
        other_free = mi.field.nr_free_pages - zi.totalreserve_pages;
        if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) {
            other_file = (mi.field.nr_file_pages - mi.field.shmem -
                          mi.field.unevictable - mi.field.swap_cached);
        } else {
            other_file = 0;
        }

        /*
         * Take the oom_score_adj of the first minfree entry that both counters
         * are below; OOM_SCORE_ADJ_MAX + 1 is the "no entry matched" sentinel.
         */
        min_score_adj = OOM_SCORE_ADJ_MAX + 1;
        for (i = 0; i < lowmem_targets_size; i++) {
            minfree = lowmem_minfree[i];
            if (other_free < minfree && other_file < minfree) {
                min_score_adj = lowmem_adj[i];
                break;
            }
        }

        if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
            if (debug_process_killing) {
                ALOGI("Ignore %s memory pressure event "
                      "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)",
                      level_name[level], other_free * page_k, other_file * page_k,
                      (long)lowmem_minfree[lowmem_targets_size - 1] * page_k);
            }
            return;
        }

        goto do_kill;
    }

    if (level == VMPRESS_LEVEL_LOW) {
        record_low_pressure_levels(&mi);
    }

    if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) {
        /* Do not monitor this pressure level */
        return;
    }

    /* If either usage value cannot be read, skip the upgrade/downgrade logic and kill */
    if ((mem_usage = get_memory_usage(&mem_usage_file_data)) < 0) {
        goto do_kill;
    }
    if ((memsw_usage = get_memory_usage(&memsw_usage_file_data)) < 0) {
        goto do_kill;
    }

    // Calculate percent for swappiness.
    // NOTE(review): only negative memsw_usage is filtered above; a value of 0
    // would divide by zero here - confirm get_memory_usage() cannot return 0.
    mem_pressure = (mem_usage * 100) / memsw_usage;

    if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) {
        // We are swapping too much.
        if (mem_pressure < upgrade_pressure) {
            level = upgrade_level(level);
            if (debug_process_killing) {
                ALOGI("Event upgraded to %s", level_name[level]);
            }
        }
    }

    // If we still have enough swap space available, check if we want to
    // ignore/downgrade pressure events.
    if (mi.field.free_swap >=
        mi.field.total_swap * swap_free_low_percentage / 100) {
        // If the pressure is larger than downgrade_pressure lmk will not
        // kill any process, since enough memory is available.
        if (mem_pressure > downgrade_pressure) {
            if (debug_process_killing) {
                ALOGI("Ignore %s memory pressure", level_name[level]);
            }
            return;
        } else if (level == VMPRESS_LEVEL_CRITICAL && mem_pressure > upgrade_pressure) {
            if (debug_process_killing) {
                ALOGI("Downgrade critical memory pressure");
            }
            // Downgrade event, since enough memory available.
            level = downgrade_level(level);
        }
    }

do_kill:
    if (low_ram_device) {
        /* For Go devices kill only one task */
        /*
         * NOTE(review): this path always uses level_oomadj[level]; a min_score_adj
         * computed from the minfree table above is not used here - confirm intended.
         */
        if (find_and_kill_process(level_oomadj[level], NULL, &mi, &wi, &curr_tm) == 0) {
            if (debug_process_killing) {
                ALOGI("Nothing to kill");
            }
        }
    } else {
        int pages_freed;
        /* State for rate limiting "nothing reclaimed" reports across invocations */
        static struct timespec last_report_tm;
        static unsigned long report_skip_count = 0;

        if (!use_minfree_levels) {
            /* Free up enough memory to downgrade the memory pressure to low level */
            if (mi.field.nr_free_pages >= low_pressure_mem.max_nr_free_pages) {
                if (debug_process_killing) {
                    ALOGI("Ignoring pressure since more memory is "
                        "available (%" PRId64 ") than watermark (%" PRId64 ")",
                        mi.field.nr_free_pages, low_pressure_mem.max_nr_free_pages);
                }
                return;
            }
            min_score_adj = level_oomadj[level];
        }

        pages_freed = find_and_kill_process(min_score_adj, NULL, &mi, &wi, &curr_tm);

        if (pages_freed == 0) {
            /* Rate limit kill reports when nothing was reclaimed */
            if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
                report_skip_count++;
                return;
            }
        }

        /* Log whenever we kill or when report rate limit allows */
        if (use_minfree_levels) {
            ALOGI("Reclaimed %ldkB, cache(%ldkB) and free(%" PRId64 "kB)-reserved(%" PRId64 "kB) "
                "below min(%ldkB) for oom_score_adj %d",
                pages_freed * page_k,
                other_file * page_k, mi.field.nr_free_pages * page_k,
                zi.totalreserve_pages * page_k,
                minfree * page_k, min_score_adj);
        } else {
            ALOGI("Reclaimed %ldkB at oom_score_adj %d", pages_freed * page_k, min_score_adj);
        }

        if (report_skip_count > 0) {
            ALOGI("Suppressed %lu failed kill reports", report_skip_count);
            report_skip_count = 0;
        }

        last_report_tm = curr_tm;
    }
    if (is_waiting_for_kill()) {
        /* pause polling if we are waiting for process death notification */
        poll_params->update = POLLING_PAUSE;
    }
}
2936 
init_mp_psi(enum vmpressure_level level,bool use_new_strategy)2937 static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
2938     int fd;
2939 
2940     /* Do not register a handler if threshold_ms is not set */
2941     if (!psi_thresholds[level].threshold_ms) {
2942         return true;
2943     }
2944 
2945     fd = init_psi_monitor(psi_thresholds[level].stall_type,
2946         psi_thresholds[level].threshold_ms * US_PER_MS,
2947         PSI_WINDOW_SIZE_MS * US_PER_MS);
2948 
2949     if (fd < 0) {
2950         return false;
2951     }
2952 
2953     vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
2954     vmpressure_hinfo[level].data = level;
2955     if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
2956         destroy_psi_monitor(fd);
2957         return false;
2958     }
2959     maxevents++;
2960     mpevfd[level] = fd;
2961 
2962     return true;
2963 }
2964 
destroy_mp_psi(enum vmpressure_level level)2965 static void destroy_mp_psi(enum vmpressure_level level) {
2966     int fd = mpevfd[level];
2967 
2968     if (fd < 0) {
2969         return;
2970     }
2971 
2972     if (unregister_psi_monitor(epollfd, fd) < 0) {
2973         ALOGE("Failed to unregister psi monitor for %s memory pressure; errno=%d",
2974             level_name[level], errno);
2975     }
2976     maxevents--;
2977     destroy_psi_monitor(fd);
2978     mpevfd[level] = -1;
2979 }
2980 
init_psi_monitors()2981 static bool init_psi_monitors() {
2982     /*
2983      * When PSI is used on low-ram devices or on high-end devices without memfree levels
2984      * use new kill strategy based on zone watermarks, free swap and thrashing stats
2985      */
2986     bool use_new_strategy =
2987         property_get_bool("ro.lmk.use_new_strategy", low_ram_device || !use_minfree_levels);
2988 
2989     /* In default PSI mode override stall amounts using system properties */
2990     if (use_new_strategy) {
2991         /* Do not use low pressure level */
2992         psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
2993         psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
2994         psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
2995     }
2996 
2997     if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
2998         return false;
2999     }
3000     if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
3001         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3002         return false;
3003     }
3004     if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
3005         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3006         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3007         return false;
3008     }
3009     return true;
3010 }
3011 
init_mp_common(enum vmpressure_level level)3012 static bool init_mp_common(enum vmpressure_level level) {
3013     int mpfd;
3014     int evfd;
3015     int evctlfd;
3016     char buf[256];
3017     struct epoll_event epev;
3018     int ret;
3019     int level_idx = (int)level;
3020     const char *levelstr = level_name[level_idx];
3021 
3022     /* gid containing AID_SYSTEM required */
3023     mpfd = open(MEMCG_SYSFS_PATH "memory.pressure_level", O_RDONLY | O_CLOEXEC);
3024     if (mpfd < 0) {
3025         ALOGI("No kernel memory.pressure_level support (errno=%d)", errno);
3026         goto err_open_mpfd;
3027     }
3028 
3029     evctlfd = open(MEMCG_SYSFS_PATH "cgroup.event_control", O_WRONLY | O_CLOEXEC);
3030     if (evctlfd < 0) {
3031         ALOGI("No kernel memory cgroup event control (errno=%d)", errno);
3032         goto err_open_evctlfd;
3033     }
3034 
3035     evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
3036     if (evfd < 0) {
3037         ALOGE("eventfd failed for level %s; errno=%d", levelstr, errno);
3038         goto err_eventfd;
3039     }
3040 
3041     ret = snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, levelstr);
3042     if (ret >= (ssize_t)sizeof(buf)) {
3043         ALOGE("cgroup.event_control line overflow for level %s", levelstr);
3044         goto err;
3045     }
3046 
3047     ret = TEMP_FAILURE_RETRY(write(evctlfd, buf, strlen(buf) + 1));
3048     if (ret == -1) {
3049         ALOGE("cgroup.event_control write failed for level %s; errno=%d",
3050               levelstr, errno);
3051         goto err;
3052     }
3053 
3054     epev.events = EPOLLIN;
3055     /* use data to store event level */
3056     vmpressure_hinfo[level_idx].data = level_idx;
3057     vmpressure_hinfo[level_idx].handler = mp_event_common;
3058     epev.data.ptr = (void *)&vmpressure_hinfo[level_idx];
3059     ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, evfd, &epev);
3060     if (ret == -1) {
3061         ALOGE("epoll_ctl for level %s failed; errno=%d", levelstr, errno);
3062         goto err;
3063     }
3064     maxevents++;
3065     mpevfd[level] = evfd;
3066     close(evctlfd);
3067     return true;
3068 
3069 err:
3070     close(evfd);
3071 err_eventfd:
3072     close(evctlfd);
3073 err_open_evctlfd:
3074     close(mpfd);
3075 err_open_mpfd:
3076     return false;
3077 }
3078 
destroy_mp_common(enum vmpressure_level level)3079 static void destroy_mp_common(enum vmpressure_level level) {
3080     struct epoll_event epev;
3081     int fd = mpevfd[level];
3082 
3083     if (fd < 0) {
3084         return;
3085     }
3086 
3087     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &epev)) {
3088         // Log an error and keep going
3089         ALOGE("epoll_ctl for level %s failed; errno=%d", level_name[level], errno);
3090     }
3091     maxevents--;
3092     close(fd);
3093     mpevfd[level] = -1;
3094 }
3095 
/*
 * Epoll handler registered by init() for kpoll_fd. All handler arguments are
 * ignored; the work is delegated to poll_kernel() on kpoll_fd.
 */
static void kernel_event_handler(int data __unused, uint32_t events __unused,
                                 struct polling_params *poll_params __unused) {
    poll_kernel(kpoll_fd);
}
3100 
init_monitors()3101 static bool init_monitors() {
3102     /* Try to use psi monitor first if kernel has it */
3103     use_psi_monitors = property_get_bool("ro.lmk.use_psi", true) &&
3104         init_psi_monitors();
3105     /* Fall back to vmpressure */
3106     if (!use_psi_monitors &&
3107         (!init_mp_common(VMPRESS_LEVEL_LOW) ||
3108         !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
3109         !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
3110         ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
3111         return false;
3112     }
3113     if (use_psi_monitors) {
3114         ALOGI("Using psi monitors for memory pressure detection");
3115     } else {
3116         ALOGI("Using vmpressure for memory pressure detection");
3117     }
3118     return true;
3119 }
3120 
destroy_monitors()3121 static void destroy_monitors() {
3122     if (use_psi_monitors) {
3123         destroy_mp_psi(VMPRESS_LEVEL_CRITICAL);
3124         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3125         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3126     } else {
3127         destroy_mp_common(VMPRESS_LEVEL_CRITICAL);
3128         destroy_mp_common(VMPRESS_LEVEL_MEDIUM);
3129         destroy_mp_common(VMPRESS_LEVEL_LOW);
3130     }
3131 }
3132 
init(void)3133 static int init(void) {
3134     static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
3135     struct reread_data file_data = {
3136         .filename = ZONEINFO_PATH,
3137         .fd = -1,
3138     };
3139     struct epoll_event epev;
3140     int pidfd;
3141     int i;
3142     int ret;
3143 
3144     page_k = sysconf(_SC_PAGESIZE);
3145     if (page_k == -1)
3146         page_k = PAGE_SIZE;
3147     page_k /= 1024;
3148 
3149     epollfd = epoll_create(MAX_EPOLL_EVENTS);
3150     if (epollfd == -1) {
3151         ALOGE("epoll_create failed (errno=%d)", errno);
3152         return -1;
3153     }
3154 
3155     // mark data connections as not connected
3156     for (int i = 0; i < MAX_DATA_CONN; i++) {
3157         data_sock[i].sock = -1;
3158     }
3159 
3160     ctrl_sock.sock = android_get_control_socket("lmkd");
3161     if (ctrl_sock.sock < 0) {
3162         ALOGE("get lmkd control socket failed");
3163         return -1;
3164     }
3165 
3166     ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
3167     if (ret < 0) {
3168         ALOGE("lmkd control socket listen failed (errno=%d)", errno);
3169         return -1;
3170     }
3171 
3172     epev.events = EPOLLIN;
3173     ctrl_sock.handler_info.handler = ctrl_connect_handler;
3174     epev.data.ptr = (void *)&(ctrl_sock.handler_info);
3175     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
3176         ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
3177         return -1;
3178     }
3179     maxevents++;
3180 
3181     has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
3182     use_inkernel_interface = has_inkernel_module;
3183 
3184     if (use_inkernel_interface) {
3185         ALOGI("Using in-kernel low memory killer interface");
3186         if (init_poll_kernel()) {
3187             epev.events = EPOLLIN;
3188             epev.data.ptr = (void*)&kernel_poll_hinfo;
3189             if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) {
3190                 ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
3191                 close(kpoll_fd);
3192                 kpoll_fd = -1;
3193             } else {
3194                 maxevents++;
3195                 /* let the others know it does support reporting kills */
3196                 property_set("sys.lmk.reportkills", "1");
3197             }
3198         }
3199     } else {
3200         if (!init_monitors()) {
3201             return -1;
3202         }
3203         /* let the others know it does support reporting kills */
3204         property_set("sys.lmk.reportkills", "1");
3205     }
3206 
3207     for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
3208         procadjslot_list[i].next = &procadjslot_list[i];
3209         procadjslot_list[i].prev = &procadjslot_list[i];
3210     }
3211 
3212     memset(killcnt_idx, KILLCNT_INVALID_IDX, sizeof(killcnt_idx));
3213 
3214     /*
3215      * Read zoneinfo as the biggest file we read to create and size the initial
3216      * read buffer and avoid memory re-allocations during memory pressure
3217      */
3218     if (reread_file(&file_data) == NULL) {
3219         ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno));
3220     }
3221 
3222     /* check if kernel supports pidfd_open syscall */
3223     pidfd = TEMP_FAILURE_RETRY(pidfd_open(getpid(), 0));
3224     if (pidfd < 0) {
3225         pidfd_supported = (errno != ENOSYS);
3226     } else {
3227         pidfd_supported = true;
3228         close(pidfd);
3229     }
3230     ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported" );
3231 
3232     return 0;
3233 }
3234 
/*
 * Returns true when a handler is stored in poll_params->paused_handler,
 * i.e. polling was paused and can be resumed with resume_polling().
 */
static bool polling_paused(struct polling_params *poll_params) {
    return poll_params->paused_handler != NULL;
}
3238 
resume_polling(struct polling_params * poll_params,struct timespec curr_tm)3239 static void resume_polling(struct polling_params *poll_params, struct timespec curr_tm) {
3240     poll_params->poll_start_tm = curr_tm;
3241     poll_params->poll_handler = poll_params->paused_handler;
3242     poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3243     poll_params->paused_handler = NULL;
3244 }
3245 
call_handler(struct event_handler_info * handler_info,struct polling_params * poll_params,uint32_t events)3246 static void call_handler(struct event_handler_info* handler_info,
3247                          struct polling_params *poll_params, uint32_t events) {
3248     struct timespec curr_tm;
3249 
3250     poll_params->update = POLLING_DO_NOT_CHANGE;
3251     handler_info->handler(handler_info->data, events, poll_params);
3252     clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3253     if (poll_params->poll_handler == handler_info) {
3254         poll_params->last_poll_tm = curr_tm;
3255     }
3256 
3257     switch (poll_params->update) {
3258     case POLLING_START:
3259         /*
3260          * Poll for the duration of PSI_WINDOW_SIZE_MS after the
3261          * initial PSI event because psi events are rate-limited
3262          * at one per sec.
3263          */
3264         poll_params->poll_start_tm = curr_tm;
3265         poll_params->poll_handler = handler_info;
3266         break;
3267     case POLLING_PAUSE:
3268         poll_params->paused_handler = handler_info;
3269         poll_params->poll_handler = NULL;
3270         break;
3271     case POLLING_RESUME:
3272         resume_polling(poll_params, curr_tm);
3273         break;
3274     case POLLING_DO_NOT_CHANGE:
3275         if (get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
3276             /* Polled for the duration of PSI window, time to stop */
3277             poll_params->poll_handler = NULL;
3278         }
3279         break;
3280     }
3281 }
3282 
/*
 * Daemon event loop; never returns.
 *
 * Each iteration waits on epollfd in one of three ways:
 *  - while a poll handler is active: until the next polling deadline, then
 *    the poll handler is called with events == 0 if the interval elapsed;
 *  - while waiting for a killed process: until the kill timeout expires,
 *    after which waiting stops and paused polling is resumed;
 *  - otherwise: indefinitely.
 * Received events are then dispatched in two passes (hang-ups first).
 */
static void mainloop(void) {
    struct event_handler_info* handler_info;
    struct polling_params poll_params;
    struct timespec curr_tm;
    struct epoll_event *evt;
    long delay = -1;

    poll_params.poll_handler = NULL;
    poll_params.paused_handler = NULL;

    while (1) {
        struct epoll_event events[MAX_EPOLL_EVENTS];
        int nevents;
        int i;

        if (poll_params.poll_handler) {
            bool poll_now;

            clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
            if (poll_params.update == POLLING_RESUME) {
                /* Just transitioned into POLLING_RESUME, poll immediately. */
                poll_now = true;
                nevents = 0;
            } else {
                /* Calculate next timeout */
                /*
                 * Wait for the remainder of the interval, or for a full interval
                 * when the last poll is already at least one interval old.
                 */
                delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
                delay = (delay < poll_params.polling_interval_ms) ?
                    poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;

                /* Wait for events until the next polling timeout */
                nevents = epoll_wait(epollfd, events, maxevents, delay);

                /* Update current time after wait */
                clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
                poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
                    poll_params.polling_interval_ms);
            }
            if (poll_now) {
                /* events == 0 marks this call as a poll rather than an epoll event */
                call_handler(poll_params.poll_handler, &poll_params, 0);
            }
        } else {
            if (kill_timeout_ms && is_waiting_for_kill()) {
                clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
                delay = kill_timeout_ms - get_time_diff_ms(&last_kill_tm, &curr_tm);
                /* Wait for pidfds notification or kill timeout to expire */
                nevents = (delay > 0) ? epoll_wait(epollfd, events, maxevents, delay) : 0;
                if (nevents == 0) {
                    /* Kill notification timed out */
                    stop_wait_for_proc_kill(false);
                    if (polling_paused(&poll_params)) {
                        clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
                        /* Flag the resume so the next iteration polls immediately */
                        poll_params.update = POLLING_RESUME;
                        resume_polling(&poll_params, curr_tm);
                    }
                }
            } else {
                /* Wait for events with no timeout */
                nevents = epoll_wait(epollfd, events, maxevents, -1);
            }
        }

        if (nevents == -1) {
            /* An interrupted wait is retried silently; other errors are logged first */
            if (errno == EINTR)
                continue;
            ALOGE("epoll_wait failed (errno=%d)", errno);
            continue;
        }

        /*
         * First pass to see if any data socket connections were dropped.
         * Dropped connection should be handled before any other events
         * to deallocate data connection and correctly handle cases when
         * connection gets dropped and reestablished in the same epoll cycle.
         * In such cases it's essential to handle connection closures first.
         */
        /*
         * NOTE(review): every EPOLLHUP event is treated as a dropped data
         * connection here - confirm no other registered fd can report EPOLLHUP.
         */
        for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
            if ((evt->events & EPOLLHUP) && evt->data.ptr) {
                ALOGI("lmkd data connection dropped");
                handler_info = (struct event_handler_info*)evt->data.ptr;
                ctrl_data_close(handler_info->data);
            }
        }

        /* Second pass to handle all other events */
        for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
            if (evt->events & EPOLLERR) {
                ALOGD("EPOLLERR on event #%d", i);
            }
            if (evt->events & EPOLLHUP) {
                /* This case was handled in the first pass */
                continue;
            }
            if (evt->data.ptr) {
                handler_info = (struct event_handler_info*)evt->data.ptr;
                call_handler(handler_info, &poll_params, evt->events);
            }
        }
    }
}
3382 
issue_reinit()3383 int issue_reinit() {
3384     int sock;
3385 
3386     sock = lmkd_connect();
3387     if (sock < 0) {
3388         ALOGE("failed to connect to lmkd: %s", strerror(errno));
3389         return -1;
3390     }
3391 
3392     enum update_props_result res = lmkd_update_props(sock);
3393     switch (res) {
3394     case UPDATE_PROPS_SUCCESS:
3395         ALOGI("lmkd updated properties successfully");
3396         break;
3397     case UPDATE_PROPS_SEND_ERR:
3398         ALOGE("failed to send lmkd request: %s", strerror(errno));
3399         break;
3400     case UPDATE_PROPS_RECV_ERR:
3401         ALOGE("failed to receive lmkd reply: %s", strerror(errno));
3402         break;
3403     case UPDATE_PROPS_FORMAT_ERR:
3404         ALOGE("lmkd reply is invalid");
3405         break;
3406     case UPDATE_PROPS_FAIL:
3407         ALOGE("lmkd failed to update its properties");
3408         break;
3409     }
3410 
3411     close(sock);
3412     return res == UPDATE_PROPS_SUCCESS ? 0 : -1;
3413 }
3414 
update_props()3415 static void update_props() {
3416     /* By default disable low level vmpressure events */
3417     level_oomadj[VMPRESS_LEVEL_LOW] =
3418         property_get_int32("ro.lmk.low", OOM_SCORE_ADJ_MAX + 1);
3419     level_oomadj[VMPRESS_LEVEL_MEDIUM] =
3420         property_get_int32("ro.lmk.medium", 800);
3421     level_oomadj[VMPRESS_LEVEL_CRITICAL] =
3422         property_get_int32("ro.lmk.critical", 0);
3423     debug_process_killing = property_get_bool("ro.lmk.debug", false);
3424 
3425     /* By default disable upgrade/downgrade logic */
3426     enable_pressure_upgrade =
3427         property_get_bool("ro.lmk.critical_upgrade", false);
3428     upgrade_pressure =
3429         (int64_t)property_get_int32("ro.lmk.upgrade_pressure", 100);
3430     downgrade_pressure =
3431         (int64_t)property_get_int32("ro.lmk.downgrade_pressure", 100);
3432     kill_heaviest_task =
3433         property_get_bool("ro.lmk.kill_heaviest_task", false);
3434     low_ram_device = property_get_bool("ro.config.low_ram", false);
3435     kill_timeout_ms =
3436         (unsigned long)property_get_int32("ro.lmk.kill_timeout_ms", 100);
3437     use_minfree_levels =
3438         property_get_bool("ro.lmk.use_minfree_levels", false);
3439     per_app_memcg =
3440         property_get_bool("ro.config.per_app_memcg", low_ram_device);
3441     swap_free_low_percentage = clamp(0, 100, property_get_int32("ro.lmk.swap_free_low_percentage",
3442         DEF_LOW_SWAP));
3443     psi_partial_stall_ms = property_get_int32("ro.lmk.psi_partial_stall_ms",
3444         low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
3445     psi_complete_stall_ms = property_get_int32("ro.lmk.psi_complete_stall_ms",
3446         DEF_COMPLETE_STALL);
3447     thrashing_limit_pct = max(0, property_get_int32("ro.lmk.thrashing_limit",
3448         low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
3449     thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay",
3450         low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
3451     thrashing_critical_pct = max(0, property_get_int32("ro.lmk.thrashing_limit_critical",
3452         thrashing_limit_pct * 2));
3453     swap_util_max = clamp(0, 100, property_get_int32("ro.lmk.swap_util_max", 100));
3454     filecache_min_kb = property_get_int64("ro.lmk.filecache_min_kb", 0);
3455 }
3456 
main(int argc,char ** argv)3457 int main(int argc, char **argv) {
3458     if ((argc > 1) && argv[1] && !strcmp(argv[1], "--reinit")) {
3459         if (property_set(LMKD_REINIT_PROP, "0")) {
3460             ALOGE("Failed to reset " LMKD_REINIT_PROP " property");
3461         }
3462         return issue_reinit();
3463     }
3464 
3465     update_props();
3466 
3467     ctx = create_android_logger(KILLINFO_LOG_TAG);
3468 
3469     if (!init()) {
3470         if (!use_inkernel_interface) {
3471             /*
3472              * MCL_ONFAULT pins pages as they fault instead of loading
3473              * everything immediately all at once. (Which would be bad,
3474              * because as of this writing, we have a lot of mapped pages we
3475              * never use.) Old kernels will see MCL_ONFAULT and fail with
3476              * EINVAL; we ignore this failure.
3477              *
3478              * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
3479              * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
3480              * in pages.
3481              */
3482             /* CAP_IPC_LOCK required */
3483             if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
3484                 ALOGW("mlockall failed %s", strerror(errno));
3485             }
3486 
3487             /* CAP_NICE required */
3488             struct sched_param param = {
3489                     .sched_priority = 1,
3490             };
3491             if (sched_setscheduler(0, SCHED_FIFO, &param)) {
3492                 ALOGW("set SCHED_FIFO failed %s", strerror(errno));
3493             }
3494         }
3495 
3496         mainloop();
3497     }
3498 
3499     android_log_destroy(&ctx);
3500 
3501     ALOGI("exiting");
3502     return 0;
3503 }
3504