1 /* 2 * Copyright (C) 2016 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #define LOG_TAG "resolv" 17 18 #include <arpa/nameser.h> 19 #include <stdbool.h> 20 #include <string.h> 21 22 #include <android-base/logging.h> 23 24 #include "stats.h" 25 26 // Calculate the round-trip-time from start time t0 and end time t1. 27 int res_stats_calculate_rtt(const timespec* t1, const timespec* t0) { 28 // Divide ns by one million to get ms, multiply s by thousand to get ms (obvious) 29 long ms0 = t0->tv_sec * 1000 + t0->tv_nsec / 1000000; 30 long ms1 = t1->tv_sec * 1000 + t1->tv_nsec / 1000000; 31 return (int) (ms1 - ms0); 32 } 33 34 // Create a sample for calculating server reachability statistics. 35 void res_stats_set_sample(res_sample* sample, time_t now, int rcode, int rtt) { 36 LOG(INFO) << __func__ << ": rcode = " << rcode << ", sec = " << rtt; 37 sample->at = now; 38 sample->rcode = rcode; 39 sample->rtt = rtt; 40 } 41 42 /* Clears all stored samples for the given server. */ 43 void _res_stats_clear_samples(res_stats* stats) { 44 stats->sample_count = stats->sample_next = 0; 45 } 46 47 /* Aggregates the reachability statistics for the given server based on on the stored samples. */ 48 void android_net_res_stats_aggregate(res_stats* stats, int* successes, int* errors, int* timeouts, 49 int* internal_errors, int* rtt_avg, time_t* last_sample_time) { 50 int s = 0; // successes 51 int e = 0; // errors 52 int t = 0; // timouts 53 int ie = 0; // internal errors 54 long rtt_sum = 0; 55 time_t last = 0; 56 int rtt_count = 0; 57 for (int i = 0; i < stats->sample_count; ++i) { 58 // Treat everything as an error that the code in send_dg() already considers a 59 // rejection by the server, i.e. SERVFAIL, NOTIMP and REFUSED. Assume that NXDOMAIN 60 // and NOTAUTH can actually occur for user queries. NOERROR with empty answer section 61 // is not treated as an error here either. FORMERR seems to sometimes be returned by 62 // some versions of BIND in response to DNSSEC or EDNS0. Whether to treat such responses 63 // as an indication of a broken server is unclear, though. For now treat such responses, 64 // as well as unknown codes as errors. 65 switch (stats->samples[i].rcode) { 66 case NOERROR: 67 case NOTAUTH: 68 case NXDOMAIN: 69 ++s; 70 rtt_sum += stats->samples[i].rtt; 71 ++rtt_count; 72 break; 73 case RCODE_TIMEOUT: 74 ++t; 75 break; 76 case RCODE_INTERNAL_ERROR: 77 ++ie; 78 break; 79 case SERVFAIL: 80 case NOTIMP: 81 case REFUSED: 82 default: 83 ++e; 84 break; 85 } 86 } 87 *successes = s; 88 *errors = e; 89 *timeouts = t; 90 *internal_errors = ie; 91 /* If there was at least one successful sample, calculate average RTT. */ 92 if (rtt_count) { 93 *rtt_avg = rtt_sum / rtt_count; 94 } else { 95 *rtt_avg = -1; 96 } 97 /* If we had at least one sample, populate last sample time. */ 98 if (stats->sample_count > 0) { 99 if (stats->sample_next > 0) { 100 last = stats->samples[stats->sample_next - 1].at; 101 } else { 102 last = stats->samples[stats->sample_count - 1].at; 103 } 104 } 105 *last_sample_time = last; 106 } 107 108 // Returns true if the server is considered usable, i.e. if the success rate is not lower than the 109 // threshold for the stored stored samples. If not enough samples are stored, the server is 110 // considered usable. 111 static bool res_stats_usable_server(const res_params* params, res_stats* stats) { 112 int successes = -1; 113 int errors = -1; 114 int timeouts = -1; 115 int internal_errors = -1; 116 int rtt_avg = -1; 117 time_t last_sample_time = 0; 118 android_net_res_stats_aggregate(stats, &successes, &errors, &timeouts, &internal_errors, 119 &rtt_avg, &last_sample_time); 120 if (successes >= 0 && errors >= 0 && timeouts >= 0) { 121 int total = successes + errors + timeouts + internal_errors; 122 LOG(INFO) << __func__ << ": NS stats: S " << successes << " + E " << errors << " + T " 123 << timeouts << " + I " << internal_errors << " = " << total 124 << ", rtt = " << rtt_avg << ", min_samples = " << unsigned(params->min_samples); 125 if (total >= params->min_samples) { 126 int success_rate = successes * 100 / total; 127 LOG(INFO) << __func__ << ": success rate " << success_rate; 128 if (success_rate < params->success_threshold) { 129 time_t now = time(NULL); 130 if (now - last_sample_time > params->sample_validity) { 131 // Note: It might be worth considering to expire old servers after their expiry 132 // date has been reached, however the code for returning the ring buffer to its 133 // previous non-circular state would induce additional complexity. 134 LOG(INFO) << __func__ << ": samples stale, retrying server"; 135 _res_stats_clear_samples(stats); 136 } else { 137 LOG(INFO) << __func__ << ": too many resolution errors, ignoring server"; 138 return 0; 139 } 140 } 141 } 142 } 143 return 1; 144 } 145 146 int android_net_res_stats_get_usable_servers(const res_params* params, res_stats stats[], 147 int nscount, bool usable_servers[]) { 148 unsigned usable_servers_found = 0; 149 for (int ns = 0; ns < nscount; ns++) { 150 bool usable = res_stats_usable_server(params, &stats[ns]); 151 if (usable) { 152 ++usable_servers_found; 153 } 154 usable_servers[ns] = usable; 155 } 156 // If there are no usable servers, consider all of them usable. 157 // TODO: Explore other possibilities, such as enabling only the best N servers, etc. 158 if (usable_servers_found == 0) { 159 for (int ns = 0; ns < nscount; ns++) { 160 usable_servers[ns] = true; 161 } 162 } 163 return (usable_servers_found == 0) ? nscount : usable_servers_found; 164 } 165