1 // 2 // Copyright (C) 2015 The Android Open Source Project 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 // 16 17 #ifndef SHILL_CONNECTION_DIAGNOSTICS_H_ 18 #define SHILL_CONNECTION_DIAGNOSTICS_H_ 19 20 #include <map> 21 #include <memory> 22 #include <string> 23 #include <vector> 24 25 #include <base/callback.h> 26 #include <base/cancelable_callback.h> 27 #include <base/memory/weak_ptr.h> 28 29 #include "shill/portal_detector.h" 30 #include "shill/refptr_types.h" 31 32 namespace shill { 33 34 class ArpClient; 35 class ByteString; 36 class DeviceInfo; 37 class DNSClient; 38 class DNSClientFactory; 39 class Error; 40 class EventDispatcher; 41 class HTTPURL; 42 class IcmpSession; 43 class IcmpSessionFactory; 44 class Metrics; 45 class RoutingTable; 46 struct RoutingTableEntry; 47 class RTNLHandler; 48 class RTNLListener; 49 class RTNLMessage; 50 51 // The ConnectionDiagnostics class implements facilities to diagnose problems 52 // that a connection encounters reaching a specific URL. 53 // 54 // Given a connection and a URL, ConnectionDiagnostics performs the following 55 // actions: 56 // (A) Start portal detection on the connection using the given URL. 57 // (B) If portal detection ends in the content phase, the connection is 58 // either functioning, or we are trapped in a captive portal. END. 59 // (C) If the portal detection ends in the DNS phase and failed for any 60 // reason other than a timeout, we have found a DNS server issue. END. 61 // (D) If the portal detection ends in the DNS phase and failed because of a 62 // timeout, ping all DNS servers. 63 // (E) If none of the DNS servers reply to pings, then we might have a 64 // problem issue reaching DNS servers. Send a request to the kernel 65 // for a route the first DNS server on our list (step M). 66 // (F) If at least one DNS server replies to pings, and we have DNS 67 // retries left, attempt DNS resolution again using the pingable DNS 68 // servers. 69 // (G) If at least one DNS server replies to pings but we are out of DNS 70 // retries, the DNS servers are at fault. END. 71 // (H) If portal detection ends in any other phase (i.e. HTTP or Connection) 72 // resolve the IP of the target web server via DNS. 73 // (I) If DNS resolution fails because of a timeout, ping all DNS 74 // servers (step D). 75 // (J) If DNS resolution fails for any other reason, we have found a 76 // DNS server issue. END. 77 // (K) Otherwise, ping the IP address of the target web server. 78 // (L) If ping is successful, we can reach the target web server. We 79 // might have a HTTP issue or a broken portal. END. 80 // (M) If ping is unsuccessful, we send a request to the kernel for 81 // a route to the IP address of the target web server. 82 // (N) If no route is found, a routing issue has been found. 83 // END. 84 // (O) If a route is found, and the destination is a local IPv6 85 // address, look for a neighbor table entry. 86 // (P) If a neighbor table entry is found, then this 87 // gateway/web server appears to be on the local 88 // network, but is not responding to pings. END. 89 // (Q) If a neighbor table entry is not found, then either 90 // this gateway/web server does not exist on the local 91 // network, or there are link layer issues. 92 // (R) If a route is found and the destination is a remote 93 // address, ping the local gateway. 94 // (S) If the local gateway respond to pings, then we have 95 // found an upstream connectivity problem or gateway 96 // problem. END. 97 // (T) If the local gateway is at an IPv6 address and does 98 // not respond to pings, look for a neighbor table 99 // entry (step O). 100 // (U) If the local gateway is at an IPv4 address and does 101 // not respond to pings, check for an ARP table entry 102 // for its address (step V). 103 // (V) Otherwise, if a route is found and the destination is a 104 // local IPv4 address, look for an ARP table entry for it. 105 // (W) If an ARP table entry is found, then this gateway/ 106 // web server appears to be on the local network, but is 107 // not responding to pings. END. 108 // (X) If an ARP table entry is not found, check for IP 109 // address collision in the local network by sending out 110 // an ARP request for the local IP address of this 111 // connection. 112 // (Y) If a reply is received, an IP collision has been 113 // detected. END. 114 // (Z) If no reply was received, no IP address collision 115 // was detected. Since we are here because ARP and 116 // ping failed, either the web server or gateway 117 // does not actually exist on the local network, or 118 // there is a link layer issue. END. 119 // 120 // TODO(samueltan): Step F: if retry succeeds, remove the unresponsive DNS 121 // servers so Chrome does not try to use them. 122 // TODO(samueltan): Step X: find ways to disambiguate the cause (e.g. can we see 123 // packets from other hosts?). 124 class ConnectionDiagnostics { 125 public: 126 // The ConnectionDiagnostics::kEventNames string array depends on this enum. 127 // Any changes to this enum should be synced with that array. 128 enum Type { 129 kTypePortalDetection = 0, 130 kTypePingDNSServers = 1, 131 kTypeResolveTargetServerIP = 2, 132 kTypePingTargetServer = 3, 133 kTypePingGateway = 4, 134 kTypeFindRoute = 5, 135 kTypeArpTableLookup = 6, 136 kTypeNeighborTableLookup = 7, 137 kTypeIPCollisionCheck = 8 138 }; 139 140 // The ConnectionDiagnostics::kPhaseNames string array depends on this enum. 141 // Any changes to this enum should be synced with that array. 142 enum Phase { 143 kPhaseStart = 0, 144 kPhaseEnd = 1, 145 // End phases specific to kTypePortalDetection. 146 kPhasePortalDetectionEndContent = 2, 147 kPhasePortalDetectionEndDNS = 3, 148 kPhasePortalDetectionEndOther = 4 149 }; 150 151 // The ConnectionDiagnostics::kResultNames string array depends on this enum. 152 // Any changes to this enum should be synced with that array. 153 enum Result { 154 kResultSuccess = 0, 155 kResultFailure = 1, 156 kResultTimeout = 2 157 }; 158 159 struct Event { EventEvent160 Event(Type type_in, Phase phase_in, Result result_in, 161 const std::string& message_in) 162 : type(type_in), 163 phase(phase_in), 164 result(result_in), 165 message(message_in) {} 166 Type type; 167 Phase phase; 168 Result result; 169 std::string message; 170 }; 171 172 // The result of the diagnostics is a string describing the connection issue 173 // detected (if any), and list of events (e.g. routing table 174 // lookup, DNS resolution) performed during the diagnostics. 175 using ResultCallback = 176 base::Callback<void(const std::string&, const std::vector<Event>&)>; 177 178 // Metrics::NotifyConnectionDiagnosticsIssue depends on these kIssue strings. 179 // Any changes to these strings should be synced with that Metrics function. 180 static const char kIssueIPCollision[]; 181 static const char kIssueRouting[]; 182 static const char kIssueHTTPBrokenPortal[]; 183 static const char kIssueDNSServerMisconfig[]; 184 static const char kIssueDNSServerNoResponse[]; 185 static const char kIssueNoDNSServersConfigured[]; 186 static const char kIssueDNSServersInvalid[]; 187 static const char kIssueNone[]; 188 static const char kIssueCaptivePortal[]; 189 static const char kIssueGatewayUpstream[]; 190 static const char kIssueGatewayNotResponding[]; 191 static const char kIssueServerNotResponding[]; 192 static const char kIssueGatewayArpFailed[]; 193 static const char kIssueServerArpFailed[]; 194 static const char kIssueInternalError[]; 195 static const char kIssueGatewayNoNeighborEntry[]; 196 static const char kIssueServerNoNeighborEntry[]; 197 static const char kIssueGatewayNeighborEntryNotConnected[]; 198 static const char kIssueServerNeighborEntryNotConnected[]; 199 200 ConnectionDiagnostics(ConnectionRefPtr connection, 201 EventDispatcher* dispatcher, 202 Metrics* metrics, 203 const DeviceInfo* device_info, 204 const ResultCallback& result_callback); 205 ~ConnectionDiagnostics(); 206 207 // Starts diagnosing problems that |connection_| encounters reaching 208 // |url_string|. 209 bool Start(const std::string& url_string); 210 211 // Skips the portal detection initiated in ConnectionDiagnostics::Start and 212 // performs further diagnostics based on the |result| from a completed portal 213 // detection attempt. 214 bool StartAfterPortalDetection(const std::string& url_string, 215 const PortalDetector::Result& result); 216 217 void Stop(); 218 219 // Returns a string representation of |event|. 220 static std::string EventToString(const Event& event); 221 running()222 bool running() { return running_; } 223 224 private: 225 friend class ConnectionDiagnosticsTest; 226 227 static const int kMaxDNSRetries; 228 static const int kRouteQueryTimeoutSeconds; 229 static const int kArpReplyTimeoutSeconds; 230 static const int kNeighborTableRequestTimeoutSeconds; 231 static const int kDNSTimeoutSeconds; 232 233 // Create a new Event with |type|, |phase|, |result|, and an empty message, 234 // and add it to the end of |diagnostic_events_|. 235 void AddEvent(Type type, Phase phase, Result result); 236 237 // Same as ConnectionDiagnostics::AddEvent, except that the added event 238 // contains the string |message|. 239 void AddEventWithMessage(Type type, Phase phase, Result result, 240 const std::string& message); 241 242 // Calls |result_callback_|, then stops connection diagnostics. 243 // |diagnostic_events_| and |issue| are passed as arguments to 244 // |result_callback_| to report the results of the diagnostics. 245 void ReportResultAndStop(const std::string &issue); 246 247 void StartAfterPortalDetectionInternal(const PortalDetector::Result& result); 248 249 // Attempts to resolve the IP address of |target_url_| using |dns_servers|. 250 void ResolveTargetServerIPAddress( 251 const std::vector<std::string>& dns_servers); 252 253 // Pings all the DNS servers of |connection_|. 254 void PingDNSServers(); 255 256 // Finds a route to the host at |address| by querying the kernel's routing 257 // table. 258 void FindRouteToHost(const IPAddress& address); 259 260 // Finds an ARP table entry for |address| by querying the kernel's ARP table. 261 void FindArpTableEntry(const IPAddress& address); 262 263 // Finds a neighbor table entry for |address| by requesting an RTNL neighbor 264 // table dump, and looking for a matching neighbor table entry for |address| 265 // in ConnectionDiagnostics::OnNeighborMsgReceived. 266 void FindNeighborTableEntry(const IPAddress& address); 267 268 // Checks for an IP collision by sending out an ARP request for the local IP 269 // address assigned to |connection_|. 270 void CheckIpCollision(); 271 272 // Starts an IcmpSession with |address|. Called when we want to ping the 273 // target web server or local gateway. 274 void PingHost(const IPAddress& address); 275 276 // Called after each IcmpSession started in 277 // ConnectionDiagnostics::PingDNSServers finishes or times out. The DNS server 278 // that was pinged can be uniquely identified with |dns_server_index|. 279 // Attempts to resolve the IP address of |target_url_| again if at least one 280 // DNS server was pinged successfully, and if |num_dns_attempts_| has not yet 281 // reached |kMaxDNSRetries|. 282 void OnPingDNSServerComplete(int dns_server_index, 283 const std::vector<base::TimeDelta>& result); 284 285 // Called after the DNS IP address resolution on started in 286 // ConnectionDiagnostics::ResolveTargetServerIPAddress completes. 287 void OnDNSResolutionComplete(const Error& error, const IPAddress& address); 288 289 // Called after the IcmpSession started in ConnectionDiagnostics::PingHost on 290 // |address_pinged| finishes or times out. |ping_event_type| indicates the 291 // type of ping that was started (gateway or target web server), and |result| 292 // is the result of the IcmpSession. 293 void OnPingHostComplete(Type ping_event_type, const IPAddress& address_pinged, 294 const std::vector<base::TimeDelta>& result); 295 296 // This I/O callback is triggered whenever the ARP reception socket has data 297 // available to be received. 298 void OnArpReplyReceived(int fd); 299 300 // Called if no replies to the ARP request sent in 301 // ConnectionDiagnostics::CheckIpCollision are received within 302 // |kArpReplyTimeoutSeconds| seconds. 303 void OnArpRequestTimeout(); 304 305 // Called when replies are received to the neighbor table dump request issued 306 // in ConnectionDiagnostics::FindNeighborTableEntry. 307 void OnNeighborMsgReceived(const IPAddress& address_queried, 308 const RTNLMessage& msg); 309 310 // Called if no neighbor table entry for |address_queried| is received within 311 // |kNeighborTableRequestTimeoutSeconds| of issuing a dump request in 312 // ConnectionDiagnostics::FindNeighborTableEntry. 313 void OnNeighborTableRequestTimeout(const IPAddress& address_queried); 314 315 // Called upon receiving a reply to the routing table query issued in 316 // ConnectionDiagnostics::FindRoute. 317 void OnRouteQueryResponse(int interface_index, 318 const RoutingTableEntry& entry); 319 320 // Called if no replies to the routing table query issued in 321 // ConnectionDiagnostics::FindRoute are received within 322 // |kRouteQueryTimeoutSeconds|. 323 void OnRouteQueryTimeout(); 324 325 // Utility function that returns true iff the event in |diagnostic_events_| 326 // that is |num_events_ago| before the last event has a matching |type|, 327 // |phase|, and |result|. 328 bool DoesPreviousEventMatch(Type type, Phase phase, Result result, 329 size_t num_events_ago); 330 331 base::WeakPtrFactory<ConnectionDiagnostics> weak_ptr_factory_; 332 EventDispatcher* dispatcher_; 333 Metrics* metrics_; 334 RoutingTable* routing_table_; 335 RTNLHandler* rtnl_handler_; 336 337 // The connection being diagnosed. 338 ConnectionRefPtr connection_; 339 340 // Used to get the MAC address of the device associated with |connection_|. 341 const DeviceInfo* device_info_; 342 343 // The MAC address of device associated with |connection_|. 344 ByteString local_mac_address_; 345 346 DNSClientFactory* dns_client_factory_; 347 std::unique_ptr<DNSClient> dns_client_; 348 std::unique_ptr<PortalDetector> portal_detector_; 349 std::unique_ptr<ArpClient> arp_client_; 350 std::unique_ptr<IcmpSession> icmp_session_; 351 352 // The URL being diagnosed. Stored in unique_ptr so that it can be cleared 353 // when we stop diagnostics. 354 std::unique_ptr<HTTPURL> target_url_; 355 356 // Used to ping multiple DNS servers in |connection_| in parallel. 357 IcmpSessionFactory* icmp_session_factory_; 358 std::map<int, std::unique_ptr<IcmpSession>> 359 id_to_pending_dns_server_icmp_session_; 360 std::vector<std::string> pingable_dns_servers_; 361 362 int num_dns_attempts_; 363 bool running_; 364 365 ResultCallback result_callback_; 366 base::CancelableCallback<void(int, const RoutingTableEntry&)> 367 route_query_callback_; 368 base::CancelableClosure route_query_timeout_callback_; 369 base::CancelableClosure arp_reply_timeout_callback_; 370 base::CancelableClosure neighbor_request_timeout_callback_; 371 372 // IOCallback that fires when the socket associated with |arp_client_| has a 373 // packet to be received. Calls ConnectionDiagnostics::OnArpReplyReceived. 374 std::unique_ptr<IOHandler> receive_response_handler_; 375 376 std::unique_ptr<RTNLListener> neighbor_msg_listener_; 377 378 // Record of all diagnostic events that occurred, sorted in order of 379 // occurrence. 380 std::vector<Event> diagnostic_events_; 381 382 DISALLOW_COPY_AND_ASSIGN(ConnectionDiagnostics); 383 }; 384 385 } // namespace shill 386 387 #endif // SHILL_CONNECTION_DIAGNOSTICS_H_ 388