1 /*
2  * Copyright (C) 2016 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <stdbool.h>
18 #include <arpa/nameser.h>
19 #include <string.h>
20 
21 #include <async_safe/log.h>
22 
23 #include "isc/eventlib.h"
24 #include "resolv_stats.h"
25 
/* Compile-time switch: set to a non-zero value to enable the logging guarded by if (DBG). */
26 #define DBG 0
27 
/*
 * Calculate the round-trip time in milliseconds from start time t0 to end time t1.
 *
 * Each timestamp is truncated to whole milliseconds before subtracting, i.e. the result is
 * ms(t1) - ms(t0). The seconds are subtracted *before* being scaled to milliseconds, and the
 * arithmetic is done in long long, so that large absolute timestamps cannot overflow the
 * intermediate value on targets where long / time_t are only 32 bits wide.
 *
 * t1: end time.
 * t0: start time.
 * Returns the elapsed time in milliseconds (negative if t1 precedes t0).
 */
int
_res_stats_calculate_rtt(const struct timespec* t1, const struct timespec* t0) {
    // Whole-second part of the difference, scaled to ms only after the subtraction.
    long long delta_sec = (long long) t1->tv_sec - (long long) t0->tv_sec;
    // Divide ns by one million to get ms; each side is truncated separately.
    long long delta_ms = (long long) (t1->tv_nsec / 1000000) - (long long) (t0->tv_nsec / 1000000);
    return (int) (delta_sec * 1000 + delta_ms);
}
36 
37 /* Create a sample for calculating server reachability statistics. */
38 void
_res_stats_set_sample(struct __res_sample * sample,time_t now,int rcode,int rtt)39 _res_stats_set_sample(struct __res_sample* sample, time_t now, int rcode, int rtt)
40 {
41     if (DBG) {
42         async_safe_format_log(ANDROID_LOG_INFO, "libc", "rcode = %d, sec = %d", rcode, rtt);
43     }
44     sample->at = now;
45     sample->rcode = rcode;
46     sample->rtt = rtt;
47 }
48 
49 /* Clears all stored samples for the given server. */
50 void
_res_stats_clear_samples(struct __res_stats * stats)51 _res_stats_clear_samples(struct __res_stats* stats)
52 {
53     stats->sample_count = stats->sample_next = 0;
54 }
55 
56 /* Aggregates the reachability statistics for the given server based on on the stored samples. */
57 void
android_net_res_stats_aggregate(struct __res_stats * stats,int * successes,int * errors,int * timeouts,int * internal_errors,int * rtt_avg,time_t * last_sample_time)58 android_net_res_stats_aggregate(struct __res_stats* stats, int* successes, int* errors,
59         int* timeouts, int* internal_errors, int* rtt_avg, time_t* last_sample_time)
60 {
61     int s = 0;   // successes
62     int e = 0;   // errors
63     int t = 0;   // timouts
64     int ie = 0;  // internal errors
65     long rtt_sum = 0;
66     time_t last = 0;
67     int rtt_count = 0;
68     for (int i = 0 ; i < stats->sample_count ; ++i) {
69         // Treat everything as an error that the code in send_dg() already considers a
70         // rejection by the server, i.e. SERVFAIL, NOTIMP and REFUSED. Assume that NXDOMAIN
71         // and NOTAUTH can actually occur for user queries. NOERROR with empty answer section
72         // is not treated as an error here either. FORMERR seems to sometimes be returned by
73         // some versions of BIND in response to DNSSEC or EDNS0. Whether to treat such responses
74         // as an indication of a broken server is unclear, though. For now treat such responses,
75         // as well as unknown codes as errors.
76         switch (stats->samples[i].rcode) {
77         case NOERROR:
78         case NOTAUTH:
79         case NXDOMAIN:
80             ++s;
81             rtt_sum += stats->samples[i].rtt;
82             ++rtt_count;
83             break;
84         case RCODE_TIMEOUT:
85             ++t;
86             break;
87         case RCODE_INTERNAL_ERROR:
88             ++ie;
89             break;
90         case SERVFAIL:
91         case NOTIMP:
92         case REFUSED:
93         default:
94             ++e;
95             break;
96         }
97     }
98     *successes = s;
99     *errors = e;
100     *timeouts = t;
101     *internal_errors = ie;
102     /* If there was at least one successful sample, calculate average RTT. */
103     if (rtt_count) {
104         *rtt_avg = rtt_sum / rtt_count;
105     } else {
106         *rtt_avg = -1;
107     }
108     /* If we had at least one sample, populate last sample time. */
109     if (stats->sample_count > 0) {
110         if (stats->sample_next > 0) {
111             last = stats->samples[stats->sample_next - 1].at;
112         } else {
113             last = stats->samples[stats->sample_count - 1].at;
114         }
115     }
116     *last_sample_time = last;
117 }
118 
119 bool
_res_stats_usable_server(const struct __res_params * params,struct __res_stats * stats)120 _res_stats_usable_server(const struct __res_params* params, struct __res_stats* stats) {
121     int successes = -1;
122     int errors = -1;
123     int timeouts = -1;
124     int internal_errors = -1;
125     int rtt_avg = -1;
126     time_t last_sample_time = 0;
127     android_net_res_stats_aggregate(stats, &successes, &errors, &timeouts, &internal_errors,
128             &rtt_avg, &last_sample_time);
129     if (successes >= 0 && errors >= 0 && timeouts >= 0) {
130         int total = successes + errors + timeouts;
131         if (DBG) {
132             async_safe_format_log(ANDROID_LOG_DEBUG, "libc", "NS stats: S %d + E %d + T %d + I %d "
133                  "= %d, rtt = %d, min_samples = %d\n", successes, errors, timeouts, internal_errors,
134                  total, rtt_avg, params->min_samples);
135         }
136         if (total >= params->min_samples && (errors > 0 || timeouts > 0)) {
137             int success_rate = successes * 100 / total;
138             if (DBG) {
139                 async_safe_format_log(ANDROID_LOG_DEBUG, "libc", "success rate %d%%\n",
140                                       success_rate);
141             }
142             if (success_rate < params->success_threshold) {
143                 // evNowTime() is used here instead of time() to stay consistent with the rest of
144                 // the code base
145                 time_t now = evNowTime().tv_sec;
146                 if (now - last_sample_time > params->sample_validity) {
147                     // Note: It might be worth considering to expire old servers after their expiry
148                     // date has been reached, however the code for returning the ring buffer to its
149                     // previous non-circular state would induce additional complexity.
150                     if (DBG) {
151                         async_safe_format_log(ANDROID_LOG_INFO, "libc",
152                             "samples stale, retrying server\n");
153                     }
154                     _res_stats_clear_samples(stats);
155                 } else {
156                     if (DBG) {
157                         async_safe_format_log(ANDROID_LOG_INFO, "libc",
158                             "too many resolution errors, ignoring server\n");
159                     }
160                     return 0;
161                 }
162             }
163         }
164     }
165     return 1;
166 }
167 
168 void
android_net_res_stats_get_usable_servers(const struct __res_params * params,struct __res_stats stats[],int nscount,bool usable_servers[])169 android_net_res_stats_get_usable_servers(const struct __res_params* params,
170         struct __res_stats stats[], int nscount, bool usable_servers[]) {
171     unsigned usable_servers_found = 0;
172     for (int ns = 0; ns < nscount; ns++) {
173         bool usable = _res_stats_usable_server(params, &stats[ns]);
174         if (usable) {
175             ++usable_servers_found;
176         }
177         usable_servers[ns] = usable;
178     }
179     // If there are no usable servers, consider all of them usable.
180     // TODO: Explore other possibilities, such as enabling only the best N servers, etc.
181     if (usable_servers_found == 0) {
182         for (int ns = 0; ns < nscount; ns++) {
183             usable_servers[ns] = true;
184         }
185     }
186 }
187