1 //
2 // Copyright (C) 2013 The Android Open Source Project
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "shill/traffic_monitor.h"
18
19 #include <base/bind.h>
20 #include <base/strings/stringprintf.h>
21 #include <netinet/in.h>
22
23 #include "shill/device.h"
24 #include "shill/device_info.h"
25 #include "shill/event_dispatcher.h"
26 #include "shill/logging.h"
27 #include "shill/socket_info_reader.h"
28
29 using base::StringPrintf;
30 using std::string;
31 using std::vector;
32
33 namespace shill {
34
35 namespace Logging {
36 static auto kModuleLogScope = ScopeLogger::kLink;
ObjectID(Device * d)37 static string ObjectID(Device* d) { return d->link_name(); }
38 }
39
40 // static
41 const uint16_t TrafficMonitor::kDnsPort = 53;
42 const int64_t TrafficMonitor::kDnsTimedOutThresholdSeconds = 15;
43 const int TrafficMonitor::kMinimumFailedSamplesToTrigger = 2;
44 const int64_t TrafficMonitor::kSamplingIntervalMilliseconds = 5000;
45
TrafficMonitor(const DeviceRefPtr & device,EventDispatcher * dispatcher)46 TrafficMonitor::TrafficMonitor(const DeviceRefPtr& device,
47 EventDispatcher* dispatcher)
48 : device_(device),
49 dispatcher_(dispatcher),
50 socket_info_reader_(new SocketInfoReader),
51 accummulated_congested_tx_queues_samples_(0),
52 connection_info_reader_(new ConnectionInfoReader),
53 accummulated_dns_failures_samples_(0) {
54 }
55
~TrafficMonitor()56 TrafficMonitor::~TrafficMonitor() {
57 Stop();
58 }
59
Start()60 void TrafficMonitor::Start() {
61 SLOG(device_.get(), 2) << __func__;
62 Stop();
63
64 sample_traffic_callback_.Reset(base::Bind(&TrafficMonitor::SampleTraffic,
65 base::Unretained(this)));
66 dispatcher_->PostDelayedTask(sample_traffic_callback_.callback(),
67 kSamplingIntervalMilliseconds);
68 }
69
Stop()70 void TrafficMonitor::Stop() {
71 SLOG(device_.get(), 2) << __func__;
72 sample_traffic_callback_.Cancel();
73 ResetCongestedTxQueuesStats();
74 ResetDnsFailingStats();
75 }
76
ResetCongestedTxQueuesStats()77 void TrafficMonitor::ResetCongestedTxQueuesStats() {
78 accummulated_congested_tx_queues_samples_ = 0;
79 }
80
ResetCongestedTxQueuesStatsWithLogging()81 void TrafficMonitor::ResetCongestedTxQueuesStatsWithLogging() {
82 SLOG(device_.get(), 2) << __func__ << ": Tx-queues decongested";
83 ResetCongestedTxQueuesStats();
84 }
85
BuildIPPortToTxQueueLength(const vector<SocketInfo> & socket_infos,IPPortToTxQueueLengthMap * tx_queue_lengths)86 void TrafficMonitor::BuildIPPortToTxQueueLength(
87 const vector<SocketInfo>& socket_infos,
88 IPPortToTxQueueLengthMap* tx_queue_lengths) {
89 SLOG(device_.get(), 3) << __func__;
90 string device_ip_address = device_->ipconfig()->properties().address;
91 for (const auto& info : socket_infos) {
92 SLOG(device_.get(), 4) << "SocketInfo(IP="
93 << info.local_ip_address().ToString()
94 << ", TX=" << info.transmit_queue_value()
95 << ", State=" << info.connection_state()
96 << ", TimerState=" << info.timer_state();
97 if (info.local_ip_address().ToString() != device_ip_address ||
98 info.transmit_queue_value() == 0 ||
99 info.connection_state() != SocketInfo::kConnectionStateEstablished ||
100 (info.timer_state() != SocketInfo::kTimerStateRetransmitTimerPending &&
101 info.timer_state() !=
102 SocketInfo::kTimerStateZeroWindowProbeTimerPending)) {
103 SLOG(device_.get(), 4) << "Connection Filtered.";
104 continue;
105 }
106 SLOG(device_.get(), 3) << "Monitoring connection: TX="
107 << info.transmit_queue_value()
108 << " TimerState=" << info.timer_state();
109
110 string local_ip_port =
111 StringPrintf("%s:%d",
112 info.local_ip_address().ToString().c_str(),
113 info.local_port());
114 (*tx_queue_lengths)[local_ip_port] = info.transmit_queue_value();
115 }
116 }
117
IsCongestedTxQueues()118 bool TrafficMonitor::IsCongestedTxQueues() {
119 SLOG(device_.get(), 4) << __func__;
120 vector<SocketInfo> socket_infos;
121 if (!socket_info_reader_->LoadTcpSocketInfo(&socket_infos) ||
122 socket_infos.empty()) {
123 SLOG(device_.get(), 3) << __func__ << ": Empty socket info";
124 ResetCongestedTxQueuesStatsWithLogging();
125 return false;
126 }
127 bool congested_tx_queues = true;
128 IPPortToTxQueueLengthMap curr_tx_queue_lengths;
129 BuildIPPortToTxQueueLength(socket_infos, &curr_tx_queue_lengths);
130 if (curr_tx_queue_lengths.empty()) {
131 SLOG(device_.get(), 3) << __func__ << ": No interesting socket info";
132 ResetCongestedTxQueuesStatsWithLogging();
133 } else {
134 for (const auto& length_entry : old_tx_queue_lengths_) {
135 IPPortToTxQueueLengthMap::iterator curr_tx_queue_it =
136 curr_tx_queue_lengths.find(length_entry.first);
137 if (curr_tx_queue_it == curr_tx_queue_lengths.end() ||
138 curr_tx_queue_it->second < length_entry.second) {
139 congested_tx_queues = false;
140 // TODO(armansito): If we had a false positive earlier, we may
141 // want to correct it here by invoking a "connection back to normal
142 // callback", so that the OutOfCredits property can be set to
143 // false.
144 break;
145 }
146 }
147 if (congested_tx_queues) {
148 ++accummulated_congested_tx_queues_samples_;
149 SLOG(device_.get(), 2) << __func__
150 << ": Congested tx-queues detected ("
151 << accummulated_congested_tx_queues_samples_
152 << ")";
153 }
154 }
155 old_tx_queue_lengths_ = curr_tx_queue_lengths;
156
157 return congested_tx_queues;
158 }
159
ResetDnsFailingStats()160 void TrafficMonitor::ResetDnsFailingStats() {
161 accummulated_dns_failures_samples_ = 0;
162 }
163
ResetDnsFailingStatsWithLogging()164 void TrafficMonitor::ResetDnsFailingStatsWithLogging() {
165 SLOG(device_.get(), 2) << __func__ << ": DNS queries restored";
166 ResetDnsFailingStats();
167 }
168
IsDnsFailing()169 bool TrafficMonitor::IsDnsFailing() {
170 SLOG(device_.get(), 4) << __func__;
171 vector<ConnectionInfo> connection_infos;
172 if (!connection_info_reader_->LoadConnectionInfo(&connection_infos) ||
173 connection_infos.empty()) {
174 SLOG(device_.get(), 3) << __func__ << ": Empty connection info";
175 } else {
176 // The time-to-expire counter is used to determine when a DNS request
177 // has timed out. This counter is the number of seconds remaining until
178 // the entry is removed from the system IP connection tracker. The
179 // default time is 30 seconds. This is too long of a wait. Instead, we
180 // want to time out at |kDnsTimedOutThresholdSeconds|. Unfortunately,
181 // we cannot simply look for entries less than
182 // |kDnsTimedOutThresholdSeconds| because we will count the entry
183 // multiple times once its time-to-expire is less than
184 // |kDnsTimedOutThresholdSeconds|. To ensure that we only count an
185 // entry once, we look for entries in this time window between
186 // |kDnsTimedOutThresholdSeconds| and |kDnsTimedOutLowerThresholdSeconds|.
187 const int64_t kDnsTimedOutLowerThresholdSeconds =
188 kDnsTimedOutThresholdSeconds - kSamplingIntervalMilliseconds / 1000;
189 string device_ip_address = device_->ipconfig()->properties().address;
190 for (const auto& info : connection_infos) {
191 if (info.protocol() != IPPROTO_UDP ||
192 info.time_to_expire_seconds() > kDnsTimedOutThresholdSeconds ||
193 info.time_to_expire_seconds() <= kDnsTimedOutLowerThresholdSeconds ||
194 !info.is_unreplied() ||
195 info.original_source_ip_address().ToString() != device_ip_address ||
196 info.original_destination_port() != kDnsPort)
197 continue;
198
199 ++accummulated_dns_failures_samples_;
200 SLOG(device_.get(), 2) << __func__
201 << ": DNS failures detected ("
202 << accummulated_dns_failures_samples_ << ")";
203 return true;
204 }
205 }
206 ResetDnsFailingStatsWithLogging();
207 return false;
208 }
209
SampleTraffic()210 void TrafficMonitor::SampleTraffic() {
211 SLOG(device_.get(), 3) << __func__;
212
213 // Schedule the sample callback first, so it is possible for the network
214 // problem callback to stop the traffic monitor.
215 dispatcher_->PostDelayedTask(sample_traffic_callback_.callback(),
216 kSamplingIntervalMilliseconds);
217
218 if (IsCongestedTxQueues() &&
219 accummulated_congested_tx_queues_samples_ ==
220 kMinimumFailedSamplesToTrigger) {
221 LOG(WARNING) << "Congested tx queues detected, out-of-credits?";
222 network_problem_detected_callback_.Run(kNetworkProblemCongestedTxQueue);
223 } else if (IsDnsFailing() &&
224 accummulated_dns_failures_samples_ ==
225 kMinimumFailedSamplesToTrigger) {
226 LOG(WARNING) << "DNS queries failing, out-of-credits?";
227 network_problem_detected_callback_.Run(kNetworkProblemDNSFailure);
228 }
229 }
230
231 } // namespace shill
232