1 /*
2  * Copyright (c) 2015, The Android Open Source Project
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *   * Redistributions of source code must retain the above copyright
9  *     notice, this list of conditions and the following disclaimer.
10  *   * Redistributions in binary form must reproduce the above copyright
11  *     notice, this list of conditions and the following disclaimer
12  *     in the documentation and/or other materials provided with the
13  *     distribution.
14  *   * Neither the name of Google, Inc. nor the names of its contributors
15  *     may be used to endorse or promote products derived from this
16  *     software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
25  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
28  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <binder/IBinder.h>
33 #include <binder/IServiceManager.h>
34 #include <binder/Parcel.h>
35 
36 #include <ctime>
37 #include <cutils/properties.h>
38 #include <signal.h>
39 #include <stdbool.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43 
44 #include <sys/resource.h>
45 #include <sys/stat.h>
46 #include <sys/time.h>
47 #include <sys/types.h>
48 #include <unistd.h>
49 
50 #include <utils/Log.h>
51 #include <utils/String8.h>
52 #include <utils/Trace.h>
53 #include <zlib.h>
54 
55 using namespace android;
56 
57 #ifdef LOG_TAG
58 #undef LOG_TAG
59 #endif
60 
61 #define LOG_TAG "anrdaemon"
62 
63 static const int check_period = 1;              // in sec
64 static const int tracing_check_period = 500000; // in micro sec
65 static const int cpu_stat_entries = 7;          // number of cpu stat entries
66 static const int min_buffer_size = 16;
67 static const int max_buffer_size = 2048;
68 static const char *min_buffer_size_str = "16";
69 static const char *max_buffer_size_str = "2048";
70 static const int time_buf_size = 20;
71 static const int path_buf_size = 60;
72 
73 typedef struct cpu_stat {
74     unsigned long utime, ntime, stime, itime;
75     unsigned long iowtime, irqtime, sirqtime, steal;
76     unsigned long total;
77 } cpu_stat_t;
78 
79 /*
80  * Logging on/off threshold.
81  * Uint: 0.01%; default to 99.90% cpu.
82  */
83 static int idle_threshold = 10;
84 
85 static bool quit = false;
86 static bool suspend= false;
87 static bool dump_requested = false;
88 static bool err = false;
89 static char err_msg[100];
90 static bool tracing = false;
91 
92 static const char *buf_size_kb = "2048";
93 static const char *apps = "";
94 static uint64_t tag = 0;
95 
96 static cpu_stat_t new_cpu;
97 static cpu_stat_t old_cpu;
98 
99 /* Log certain kernel activity when enabled */
100 static bool log_sched = false;
101 static bool log_stack = false;
102 static bool log_irq   = false;
103 static bool log_sync  = false;
104 static bool log_workq = false;
105 
106 /* Paths for debugfs controls*/
107 static const char* dfs_trace_output_path =
108     "/d/tracing/trace";
109 static const char* dfs_irq_path =
110     "/d/tracing/events/irq/enable";
111 static const char* dfs_sync_path =
112     "/d/tracing/events/sync/enable";
113 static const char* dfs_workq_path =
114     "/d/tracing/events/workqueue/enable";
115 static const char* dfs_stack_path =
116     "/d/tracing/options/stacktrace";
117 static const char* dfs_sched_switch_path =
118     "/d/tracing/events/sched/sched_switch/enable";
119 static const char* dfs_sched_wakeup_path =
120     "/d/tracing/events/sched/sched_wakeup/enable";
121 static const char* dfs_control_path =
122     "/d/tracing/tracing_on";
123 static const char* dfs_buffer_size_path =
124     "/d/tracing/buffer_size_kb";
125 static const char* dfs_tags_property = "debug.atrace.tags.enableflags";
126 static const char* dfs_apps_property = "debug.atrace.app_cmdlines";
127 
128 /*
129  * Read accumulated cpu data from /proc/stat
130  */
get_cpu_stat(cpu_stat_t * cpu)131 static void get_cpu_stat(cpu_stat_t *cpu) {
132     FILE *fp = NULL;
133     const char *params = "cpu  %lu %lu %lu %lu %lu %lu %lu %*d %*d %*d\n";
134 
135     if ((fp = fopen("/proc/stat", "r")) == NULL) {
136         err = true;
137         snprintf(err_msg, sizeof(err_msg), "can't read from /proc/stat with errno %d", errno);
138     } else {
139         if (fscanf(fp, params, &cpu->utime, &cpu->ntime,
140                 &cpu->stime, &cpu->itime, &cpu->iowtime, &cpu->irqtime,
141                 &cpu->sirqtime) != cpu_stat_entries) {
142             /*
143              * If failed in getting status, new_cpu won't be updated and
144              * is_heavy_loaded() will return false.
145              */
146             ALOGE("Error in getting cpu status. Skipping this check.");
147             fclose(fp);
148             return;
149         }
150 
151         cpu->total = cpu->utime + cpu->ntime + cpu->stime + cpu->itime
152             + cpu->iowtime + cpu->irqtime + cpu->sirqtime;
153 
154         fclose(fp);
155     }
156 }
157 
158 /*
159  * Calculate cpu usage in the past interval.
160  * If tracing is on, increase the idle threshold by 1.00% so that we do not
161  * turn on and off tracing frequently when the cpu load is right close to
162  * threshold.
163  */
is_heavy_load(void)164 static bool is_heavy_load(void) {
165     unsigned long diff_idle, diff_total;
166     int threshold = idle_threshold + (tracing?100:0);
167     get_cpu_stat(&new_cpu);
168     diff_idle = new_cpu.itime - old_cpu.itime;
169     diff_total = new_cpu.total - old_cpu.total;
170     old_cpu = new_cpu;
171     return (diff_idle * 10000 < diff_total * threshold);
172 }
173 
174 /*
175  * Force the userland processes to refresh their property for logging.
176  */
dfs_poke_binder(void)177 static void dfs_poke_binder(void) {
178     sp<IServiceManager> sm = defaultServiceManager();
179     Vector<String16> services = sm->listServices();
180     for (size_t i = 0; i < services.size(); i++) {
181         sp<IBinder> obj = sm->checkService(services[i]);
182         if (obj != NULL) {
183             Parcel data;
184             obj->transact(IBinder::SYSPROPS_TRANSACTION, data, NULL, 0);
185         }
186     }
187 }
188 
189 /*
190  * Enable/disable a debugfs property by writing 0/1 to its path.
191  */
dfs_enable(bool enable,const char * path)192 static int dfs_enable(bool enable, const char* path) {
193     int fd = open(path, O_WRONLY);
194     if (fd == -1) {
195         err = true;
196         snprintf(err_msg, sizeof(err_msg), "Can't open %s. Error: %d", path, errno);
197         return -1;
198     }
199     const char* control = (enable?"1":"0");
200     ssize_t len = strlen(control);
201     int max_try = 10; // Fail if write was interrupted for 10 times
202     while (write(fd, control, len) != len) {
203         if (errno == EINTR && max_try-- > 0) {
204             usleep(100);
205             continue;
206         }
207 
208         err = true;
209         snprintf(err_msg, sizeof(err_msg), "Error %d in writing to %s.", errno, path);
210     }
211     close(fd);
212     return (err?-1:0);
213 }
214 
215 /*
216  * Set the userland tracing properties.
217  */
dfs_set_property(uint64_t mtag,const char * mapp,bool enable)218 static void dfs_set_property(uint64_t mtag, const char* mapp, bool enable) {
219     char buf[64];
220     snprintf(buf, sizeof(buf), "%#" PRIx64, mtag);
221     if (property_set(dfs_tags_property, buf) < 0) {
222         err = true;
223         snprintf(err_msg, sizeof(err_msg), "Failed to set debug tags system properties.");
224     }
225 
226     if (strlen(mapp) > 0
227             && property_set(dfs_apps_property, mapp) < 0) {
228         err = true;
229         snprintf(err_msg, sizeof(err_msg), "Failed to set debug applications.");
230     }
231 
232     if (log_sched) {
233         dfs_enable(enable, dfs_sched_switch_path);
234         dfs_enable(enable, dfs_sched_wakeup_path);
235     }
236     if (log_stack) {
237         dfs_enable(enable, dfs_stack_path);
238     }
239     if (log_irq) {
240         dfs_enable(enable, dfs_irq_path);
241     }
242     if (log_sync) {
243         dfs_enable(enable, dfs_sync_path);
244     }
245     if (log_workq) {
246         dfs_enable(enable, dfs_workq_path);
247     }
248 }
249 
250 /*
251  * Dump the log in a compressed format for systrace to visualize.
252  * Create a dump file "dump_of_anrdaemon.<current_time>" under /data/misc/anrd
253  */
dump_trace()254 static void dump_trace()
255 {
256     time_t now = time(0);
257     struct tm  tstruct;
258     char time_buf[time_buf_size];
259     char path_buf[path_buf_size];
260     const char* header = " done\nTRACE:\n";
261     ssize_t header_len = strlen(header);
262 
263     ALOGI("Started to dump ANRdaemon trace.");
264 
265     tstruct = *localtime(&now);
266     strftime(time_buf, time_buf_size, "%Y-%m-%d.%X", &tstruct);
267     snprintf(path_buf, path_buf_size, "/data/misc/anrd/dump_of_anrdaemon.%s", time_buf);
268     int output_fd = creat(path_buf, S_IRWXU);
269     if (output_fd == -1) {
270         ALOGE("Failed to create %s. Dump aborted.", path_buf);
271         return;
272     }
273 
274     if (write(output_fd, header, strlen(header)) != header_len) {
275         ALOGE("Failed to write the header.");
276         close(output_fd);
277         return;
278     }
279 
280     int trace_fd = open(dfs_trace_output_path, O_RDWR);
281     if (trace_fd == -1) {
282         ALOGE("Failed to open %s. Dump aborted.", dfs_trace_output_path);
283         close(output_fd);
284         return;
285     }
286 
287     z_stream zs;
288     uint8_t *in, *out;
289     int result, flush;
290 
291     memset(&zs, 0, sizeof(zs));
292     result = deflateInit(&zs, Z_DEFAULT_COMPRESSION);
293     if (result != Z_OK) {
294         ALOGE("error initializing zlib: %d\n", result);
295         close(trace_fd);
296         close(output_fd);
297         return;
298     }
299 
300     const size_t bufSize = 64*1024;
301     in = (uint8_t*)malloc(bufSize);
302     out = (uint8_t*)malloc(bufSize);
303     flush = Z_NO_FLUSH;
304 
305     zs.next_out = out;
306     zs.avail_out = bufSize;
307 
308     do {
309         if (zs.avail_in == 0) {
310             result = read(trace_fd, in, bufSize);
311             if (result < 0) {
312                 ALOGE("error reading trace: %s", strerror(errno));
313                 result = Z_STREAM_END;
314                 break;
315             } else if (result == 0) {
316                 flush = Z_FINISH;
317             } else {
318                 zs.next_in = in;
319                 zs.avail_in = result;
320             }
321         }
322 
323         if (zs.avail_out == 0) {
324             result = write(output_fd, out, bufSize);
325             if ((size_t)result < bufSize) {
326                 ALOGE("error writing deflated trace: %s", strerror(errno));
327                 result = Z_STREAM_END;
328                 zs.avail_out = bufSize;
329                 break;
330             }
331             zs.next_out = out;
332             zs.avail_out = bufSize;
333         }
334 
335     } while ((result = deflate(&zs, flush)) == Z_OK);
336 
337     if (result != Z_STREAM_END) {
338         ALOGE("error deflating trace: %s\n", zs.msg);
339     }
340 
341     if (zs.avail_out < bufSize) {
342         size_t bytes = bufSize - zs.avail_out;
343         result = write(output_fd, out, bytes);
344         if ((size_t)result < bytes) {
345             ALOGE("error writing deflated trace: %s", strerror(errno));
346         }
347     }
348 
349     result = deflateEnd(&zs);
350     if (result != Z_OK) {
351         ALOGE("error cleaning up zlib: %d\n", result);
352     }
353 
354     free(in);
355     free(out);
356 
357     close(trace_fd);
358     close(output_fd);
359 
360     ALOGI("Finished dump. Output file stored at: %s", path_buf);
361 }
362 
363 /*
364  * Start logging when cpu usage is high. Meanwhile, moniter the cpu usage and
365  * stop logging when it drops down.
366  */
start_tracing(void)367 static void start_tracing(void) {
368     ALOGD("High cpu usage, start logging.");
369 
370     if (dfs_enable(true, dfs_control_path) != 0) {
371         ALOGE("Failed to start tracing.");
372         return;
373     }
374     tracing = true;
375 
376     /* Stop logging when cpu usage drops or the daemon is suspended.*/
377     do {
378         usleep(tracing_check_period);
379     } while (!suspend && !dump_requested && is_heavy_load());
380 
381     if (dfs_enable(false, dfs_control_path) != 0) {
382         ALOGE("Failed to stop tracing.");
383         err = true;
384         return;
385     }
386     tracing = false;
387 
388     if (suspend) {
389         ALOGI("trace stopped due to suspend. Send SIGCONT to resume.");
390     } else if (dump_requested) {
391         ALOGI("trace stopped due to dump request.");
392         dump_trace();
393         dump_requested = false;
394     } else {
395         ALOGD("Usage back to low, stop logging.");
396     }
397 }
398 
399 /*
400  * Set the tracing log buffer size.
401  * Note the actual buffer size will be buf_size_kb * number of cores.
402  */
set_tracing_buffer_size(void)403 static int set_tracing_buffer_size(void) {
404     int fd = open(dfs_buffer_size_path, O_WRONLY);
405     if (fd == -1) {
406         err = true;
407         snprintf(err_msg, sizeof(err_msg), "Can't open atrace buffer size file under /d/tracing.");
408         return -1;
409     }
410     ssize_t len = strlen(buf_size_kb);
411     if (write(fd, buf_size_kb, len) != len) {
412         err = true;
413         snprintf(err_msg, sizeof(err_msg), "Error in writing to atrace buffer size file.");
414     }
415     close(fd);
416     return (err?-1:0);
417 
418 }
419 
420 /*
421  * Main loop to moniter the cpu usage and decided whether to start logging.
422  */
start(void)423 static void start(void) {
424     if ((set_tracing_buffer_size()) != 0)
425         return;
426 
427     dfs_set_property(tag, apps, true);
428     dfs_poke_binder();
429 
430     get_cpu_stat(&old_cpu);
431     sleep(check_period);
432 
433     while (!quit && !err) {
434         if (!suspend && is_heavy_load()) {
435             /*
436              * Increase process priority to make sure we can stop logging when
437              * necessary and do not overwrite the buffer
438              */
439             setpriority(PRIO_PROCESS, 0, -20);
440             start_tracing();
441             setpriority(PRIO_PROCESS, 0, 0);
442         }
443         sleep(check_period);
444     }
445     return;
446 }
447 
448 /*
449  * If trace is not running, dump trace right away.
450  * If trace is running, request to dump trace.
451  */
request_dump_trace()452 static void request_dump_trace()
453 {
454     if (!tracing) {
455         dump_trace();
456     } else if (!dump_requested) {
457         dump_requested = true;
458     }
459 }
460 
handle_signal(int signo)461 static void handle_signal(int signo)
462 {
463     switch (signo) {
464         case SIGQUIT:
465             suspend = true;
466             quit = true;
467             break;
468         case SIGSTOP:
469             suspend = true;
470             break;
471         case SIGCONT:
472             suspend = false;
473             break;
474         case SIGUSR1:
475             request_dump_trace();
476     }
477 }
478 
479 /*
480  * Set the signal handler:
481  * SIGQUIT: Reset debugfs and tracing property and terminate the daemon.
482  * SIGSTOP: Stop logging and suspend the daemon.
483  * SIGCONT: Resume the daemon as normal.
484  * SIGUSR1: Dump the logging to a compressed format for systrace to visualize.
485  */
register_sighandler(void)486 static void register_sighandler(void)
487 {
488     struct sigaction sa;
489     sigset_t block_mask;
490 
491     sigemptyset(&block_mask);
492     sigaddset (&block_mask, SIGQUIT);
493     sigaddset (&block_mask, SIGSTOP);
494     sigaddset (&block_mask, SIGCONT);
495     sigaddset (&block_mask, SIGUSR1);
496 
497     sa.sa_flags = 0;
498     sa.sa_mask = block_mask;
499     sa.sa_handler = handle_signal;
500     sigaction(SIGQUIT, &sa, NULL);
501     sigaction(SIGSTOP, &sa, NULL);
502     sigaction(SIGCONT, &sa, NULL);
503     sigaction(SIGUSR1, &sa, NULL);
504 }
505 
show_help(void)506 static void show_help(void) {
507 
508     fprintf(stderr, "usage: ANRdaemon [options] [categoris...]\n");
509     fprintf(stdout, "Options includes:\n"
510                     "   -a appname  enable app-level tracing for a comma "
511                        "separated list of cmdlines\n"
512                     "   -t N        cpu threshold for logging to start "
513                         "(uint = 0.01%%, min = 5000, max = 9999, default = 9990)\n"
514                     "   -s N        use a trace buffer size of N KB "
515                         "default to 2048KB\n"
516                     "   -h          show helps\n");
517     fprintf(stdout, "Categoris includes:\n"
518                     "   am         - activity manager\n"
519                     "   sm         - sync manager\n"
520                     "   input      - input\n"
521                     "   dalvik     - dalvik VM\n"
522                     "   audio      - Audio\n"
523                     "   gfx        - Graphics\n"
524                     "   rs         - RenderScript\n"
525                     "   hal        - Hardware Modules\n"
526                     "   irq        - kernel irq events\n"
527                     "   sched      - kernel scheduler activity\n"
528                     "   stack      - kernel stack\n"
529                     "   sync       - kernel sync activity\n"
530                     "   workq      - kernel work queues\n");
531     fprintf(stdout, "Control includes:\n"
532                     "   SIGQUIT: terminate the process\n"
533                     "   SIGSTOP: suspend all function of the daemon\n"
534                     "   SIGCONT: resume the normal function\n"
535                     "   SIGUSR1: dump the current logging in a compressed form\n");
536     exit(0);
537 }
538 
get_options(int argc,char * argv[])539 static int get_options(int argc, char *argv[]) {
540     int opt = 0;
541     int threshold;
542     while ((opt = getopt(argc, argv, "a:s:t:h")) >= 0) {
543         switch(opt) {
544             case 'a':
545                 apps = optarg;
546                 break;
547             case 's':
548                 if (atoi(optarg) > max_buffer_size)
549                     buf_size_kb = max_buffer_size_str;
550                 else if (atoi(optarg) < min_buffer_size)
551                     buf_size_kb = min_buffer_size_str;
552                 else
553                     buf_size_kb = optarg;
554                 break;
555             case 't':
556                 threshold = atoi(optarg);
557                 if (threshold > 9999 || threshold < 5000) {
558                     fprintf(stderr, "logging threshold should be 5000-9999\n");
559                     return 1;
560                 }
561                 idle_threshold = 10000 - threshold;
562                 break;
563             case 'h':
564                 show_help();
565                 break;
566             default:
567                 fprintf(stderr, "Error in getting options.\n"
568                         "run \"%s -h\" for usage.\n", argv[0]);
569                 return 1;
570         }
571     }
572 
573     for (int i = optind; i < argc; i++) {
574         if (strcmp(argv[i], "am") == 0) {
575             tag |= ATRACE_TAG_ACTIVITY_MANAGER;
576         } else if (strcmp(argv[i], "input") == 0) {
577             tag |= ATRACE_TAG_INPUT;
578         } else if (strcmp(argv[i], "sm") == 0) {
579             tag |= ATRACE_TAG_SYNC_MANAGER;
580         } else if (strcmp(argv[i], "dalvik") == 0) {
581             tag |= ATRACE_TAG_DALVIK;
582         } else if (strcmp(argv[i], "gfx") == 0) {
583             tag |= ATRACE_TAG_GRAPHICS;
584         } else if (strcmp(argv[i], "audio") == 0) {
585             tag |= ATRACE_TAG_AUDIO;
586         } else if (strcmp(argv[i], "hal") == 0) {
587             tag |= ATRACE_TAG_HAL;
588         } else if (strcmp(argv[i], "rs") == 0) {
589             tag |= ATRACE_TAG_RS;
590         } else if (strcmp(argv[i], "sched") == 0) {
591             log_sched = true;
592         } else if (strcmp(argv[i], "stack") == 0) {
593             log_stack = true;
594         } else if (strcmp(argv[i], "workq") == 0) {
595             log_workq = true;
596         } else if (strcmp(argv[i], "irq") == 0) {
597             log_irq = true;
598         } else if (strcmp(argv[i], "sync") == 0) {
599             log_sync = true;
600         } else {
601             fprintf(stderr, "invalid category: %s\n"
602                     "run \"%s -h\" for usage.\n", argv[i], argv[0]);
603             return 1;
604         }
605     }
606 
607     /* If nothing is enabled, don't run */
608     if (!tag && !log_sched && !log_stack && !log_workq && !log_irq && !log_sync) {
609         ALOGE("Specify at least one category to trace.");
610         return 1;
611     }
612 
613     return 0;
614 }
615 
main(int argc,char * argv[])616 int main(int argc, char *argv[])
617 {
618     if(get_options(argc, argv) != 0)
619         return 1;
620 
621     if (daemon(0, 0) != 0)
622         return 1;
623 
624     register_sighandler();
625 
626     /* Clear any the trace log file by overwrite it with a new file */
627     int fd = creat(dfs_trace_output_path, 0);
628     if (fd == -1) {
629         ALOGE("Faield to open and cleaup previous log");
630         return 1;
631     }
632     close(fd);
633 
634     ALOGI("ANRdaemon starting");
635     start();
636 
637     if (err)
638         ALOGE("ANRdaemon stopped due to Error: %s", err_msg);
639 
640     ALOGI("ANRdaemon terminated.");
641 
642     return (err?1:0);
643 }
644