1 //
2 // Copyright (C) 2015 The Android Open Source Project
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #ifndef SHILL_CONNECTION_DIAGNOSTICS_H_
18 #define SHILL_CONNECTION_DIAGNOSTICS_H_
19 
20 #include <map>
21 #include <memory>
22 #include <string>
23 #include <vector>
24 
25 #include <base/callback.h>
26 #include <base/cancelable_callback.h>
27 #include <base/memory/weak_ptr.h>
28 
29 #include "shill/portal_detector.h"
30 #include "shill/refptr_types.h"
31 
32 namespace shill {
33 
// Forward declarations of the shill types that ConnectionDiagnostics names in
// its member-function signatures and data members.
// NOTE(review): ByteString is held by value (|local_mac_address_|), which
// requires the complete type; presumably one of the headers included above
// supplies it -- confirm.
34 class ArpClient;
35 class ByteString;
36 class DeviceInfo;
37 class DNSClient;
38 class DNSClientFactory;
39 class Error;
40 class EventDispatcher;
41 class HTTPURL;
42 class IcmpSession;
43 class IcmpSessionFactory;
44 class Metrics;
45 class RoutingTable;
46 struct RoutingTableEntry;
47 class RTNLHandler;
48 class RTNLListener;
49 class RTNLMessage;
50 
51 // The ConnectionDiagnostics class implements facilities to diagnose problems
52 // that a connection encounters reaching a specific URL.
53 //
54 // Given a connection and a URL, ConnectionDiagnostics performs the following
55 // actions:
56 // (A) Start portal detection on the connection using the given URL.
57 //     (B) If portal detection ends in the content phase, the connection is
58 //         either functioning, or we are trapped in a captive portal. END.
59 //     (C) If the portal detection ends in the DNS phase and failed for any
60 //         reason other than a timeout, we have found a DNS server issue. END.
61 //     (D) If the portal detection ends in the DNS phase and failed because of a
62 //         timeout, ping all DNS servers.
63 //         (E) If none of the DNS servers reply to pings, then we might have a
64 //             problem reaching the DNS servers. Send a request to the kernel
65 //             for a route to the first DNS server on our list (step M).
66 //         (F) If at least one DNS server replies to pings, and we have DNS
67 //             retries left, attempt DNS resolution again using the pingable DNS
68 //             servers.
69 //         (G) If at least one DNS server replies to pings but we are out of DNS
70 //             retries, the DNS servers are at fault. END.
71 //     (H) If portal detection ends in any other phase (i.e. HTTP or Connection)
72 //         resolve the IP of the target web server via DNS.
73 //         (I) If DNS resolution fails because of a timeout, ping all DNS
74 //             servers (step D).
75 //         (J) If DNS resolution fails for any other reason, we have found a
76 //             DNS server issue. END.
77 //         (K) Otherwise, ping the IP address of the target web server.
78 //             (L) If ping is successful, we can reach the target web server. We
79 //                 might have a HTTP issue or a broken portal. END.
80 //             (M) If ping is unsuccessful, we send a request to the kernel for
81 //                 a route to the IP address of the target web server.
82 //                 (N) If no route is found, a routing issue has been found.
83 //                     END.
84 //                 (O) If a route is found, and the destination is a local IPv6
85 //                     address, look for a neighbor table entry.
86 //                     (P) If a neighbor table entry is found, then this
87 //                         gateway/web server appears to be on the local
88 //                         network, but is not responding to pings. END.
89 //                     (Q) If a neighbor table entry is not found, then either
90 //                         this gateway/web server does not exist on the local
91 //                         network, or there are link layer issues.
92 //                 (R) If a route is found and the destination is a remote
93 //                     address, ping the local gateway.
94 //                     (S) If the local gateway responds to pings, then we have
95 //                         found an upstream connectivity problem or gateway
96 //                         problem. END.
97 //                     (T) If the local gateway is at an IPv6 address and does
98 //                         not respond to pings, look for a neighbor table
99 //                         entry (step O).
100 //                     (U) If the local gateway is at an IPv4 address and does
101 //                         not respond to pings, check for an ARP table entry
102 //                         for its address (step V).
103 //                 (V) Otherwise, if a route is found and the destination is a
104 //                     local IPv4 address, look for an ARP table entry for it.
105 //                     (W) If an ARP table entry is found, then this gateway/
106 //                         web server appears to be on the local network, but is
107 //                         not responding to pings. END.
108 //                     (X) If an ARP table entry is not found, check for IP
109 //                         address collision in the local network by sending out
110 //                         an ARP request for the local IP address of this
111 //                         connection.
112 //                         (Y) If a reply is received, an IP collision has been
113 //                             detected. END.
114 //                         (Z) If no reply was received, no IP address collision
115 //                             was detected. Since we are here because ARP and
116 //                             ping failed, either the web server or gateway
117 //                             does not actually exist on the local network, or
118 //                             there is a link layer issue. END.
119 //
120 // TODO(samueltan): Step F: if retry succeeds, remove the unresponsive DNS
121 // servers so Chrome does not try to use them.
122 // TODO(samueltan): Step X: find ways to disambiguate the cause (e.g. can we see
123 // packets from other hosts?).
124 class ConnectionDiagnostics {
      // NOTE(review): every line of this listing starts with a number, and two
      // lines (the Event constructor and running()) additionally start with
      // `EventEvent160` / `running()222`. These look like residue from a
      // code-browser export rather than C++ -- confirm and strip before compiling.
125  public:
126   // The ConnectionDiagnostics::kEventNames string array depends on this enum.
127   // Any changes to this enum should be synced with that array.
      // Identifies which diagnostic step an Event describes.
128   enum Type {
129     kTypePortalDetection = 0,
130     kTypePingDNSServers = 1,
131     kTypeResolveTargetServerIP = 2,
132     kTypePingTargetServer = 3,
133     kTypePingGateway = 4,
134     kTypeFindRoute = 5,
135     kTypeArpTableLookup = 6,
136     kTypeNeighborTableLookup = 7,
137     kTypeIPCollisionCheck = 8
138   };
139
140   // The ConnectionDiagnostics::kPhaseNames string array depends on this enum.
141   // Any changes to this enum should be synced with that array.
      // Stage of a diagnostic step that an Event describes.
142   enum Phase {
143     kPhaseStart = 0,
144     kPhaseEnd = 1,
145     // End phases specific to kTypePortalDetection.
146     kPhasePortalDetectionEndContent = 2,
147     kPhasePortalDetectionEndDNS = 3,
148     kPhasePortalDetectionEndOther = 4
149   };
150
151   // The ConnectionDiagnostics::kResultNames string array depends on this enum.
152   // Any changes to this enum should be synced with that array.
      // Outcome that an Event records for a diagnostic step.
153   enum Result {
154     kResultSuccess = 0,
155     kResultFailure = 1,
156     kResultTimeout = 2
157   };
158
      // One entry in the diagnostics record: the step (|type|), its |phase|, its
      // |result|, and a free-form |message| (empty when added through AddEvent).
159   struct Event {
        // Copies each argument into the member of the same name.
160     EventEvent160     Event(Type type_in, Phase phase_in, Result result_in,
161           const std::string& message_in)
162         : type(type_in),
163           phase(phase_in),
164           result(result_in),
165           message(message_in) {}
166     Type type;
167     Phase phase;
168     Result result;
169     std::string message;
170   };
171
172   // The result of the diagnostics is a string describing the connection issue
173   // detected (if any), and list of events (e.g. routing table
174   // lookup, DNS resolution) performed during the diagnostics.
175   using ResultCallback =
176       base::Callback<void(const std::string&, const std::vector<Event>&)>;
177
178   // Metrics::NotifyConnectionDiagnosticsIssue depends on these kIssue strings.
179   // Any changes to these strings should be synced with that Metrics function.
180   static const char kIssueIPCollision[];
181   static const char kIssueRouting[];
182   static const char kIssueHTTPBrokenPortal[];
183   static const char kIssueDNSServerMisconfig[];
184   static const char kIssueDNSServerNoResponse[];
185   static const char kIssueNoDNSServersConfigured[];
186   static const char kIssueDNSServersInvalid[];
187   static const char kIssueNone[];
188   static const char kIssueCaptivePortal[];
189   static const char kIssueGatewayUpstream[];
190   static const char kIssueGatewayNotResponding[];
191   static const char kIssueServerNotResponding[];
192   static const char kIssueGatewayArpFailed[];
193   static const char kIssueServerArpFailed[];
194   static const char kIssueInternalError[];
195   static const char kIssueGatewayNoNeighborEntry[];
196   static const char kIssueServerNoNeighborEntry[];
197   static const char kIssueGatewayNeighborEntryNotConnected[];
198   static const char kIssueServerNeighborEntryNotConnected[];
199
      // Creates a diagnostics object for |connection|. |result_callback| is
      // presumably stored in |result_callback_|, which ReportResultAndStop runs
      // with the detected issue and the recorded events.
      // NOTE(review): |dispatcher|, |metrics| and |device_info| are raw pointers;
      // presumably they are borrowed and must outlive this object -- confirm
      // against the definition, which is not visible in this header.
200   ConnectionDiagnostics(ConnectionRefPtr connection,
201                         EventDispatcher* dispatcher,
202                         Metrics* metrics,
203                         const DeviceInfo* device_info,
204                         const ResultCallback& result_callback);
205   ~ConnectionDiagnostics();
206
207   // Starts diagnosing problems that |connection_| encounters reaching
208   // |url_string|.
      // NOTE(review): the meaning of the bool return is not stated here;
      // presumably false means diagnostics could not be started -- confirm.
209   bool Start(const std::string& url_string);
210
211   // Skips the portal detection initiated in ConnectionDiagnostics::Start and
212   // performs further diagnostics based on the |result| from a completed portal
213   // detection attempt.
214   bool StartAfterPortalDetection(const std::string& url_string,
215                                  const PortalDetector::Result& result);
216
      // Stops connection diagnostics. Per the |target_url_| comment below, the
      // target URL is cleared when diagnostics stop; any other teardown is not
      // visible in this header.
217   void Stop();
218
219   // Returns a string representation of |event|.
220   static std::string EventToString(const Event& event);
221
      // Returns |running_|.
      // NOTE(review): this accessor does not modify state, so it could be
      // declared const.
222   running()222   bool running() { return running_; }
223
224  private:
      // Grants the test fixture access to the private members below.
225   friend class ConnectionDiagnosticsTest;
226
      // Limits and timeouts referenced by the callbacks documented below; their
      // values are defined outside this header.
      // NOTE(review): kDNSTimeoutSeconds is not referenced by any comment in this
      // header; presumably it bounds each DNS resolution attempt -- confirm.
227   static const int kMaxDNSRetries;
228   static const int kRouteQueryTimeoutSeconds;
229   static const int kArpReplyTimeoutSeconds;
230   static const int kNeighborTableRequestTimeoutSeconds;
231   static const int kDNSTimeoutSeconds;
232
233   // Create a new Event with |type|, |phase|, |result|, and an empty message,
234   // and add it to the end of |diagnostic_events_|.
235   void AddEvent(Type type, Phase phase, Result result);
236
237   // Same as ConnectionDiagnostics::AddEvent, except that the added event
238   // contains the string |message|.
239   void AddEventWithMessage(Type type, Phase phase, Result result,
240                            const std::string& message);
241
242   // Calls |result_callback_|, then stops connection diagnostics.
243   // |diagnostic_events_| and |issue| are passed as arguments to
244   // |result_callback_| to report the results of the diagnostics.
245   void ReportResultAndStop(const std::string &issue);
246
      // NOTE(review): presumably the shared continuation that
      // StartAfterPortalDetection hands |result| to once the URL has been
      // accepted -- confirm against the definition.
247   void StartAfterPortalDetectionInternal(const PortalDetector::Result& result);
248
249   // Attempts to resolve the IP address of |target_url_| using |dns_servers|.
250   void ResolveTargetServerIPAddress(
251       const std::vector<std::string>& dns_servers);
252
253   // Pings all the DNS servers of |connection_|.
254   void PingDNSServers();
255
256   // Finds a route to the host at |address| by querying the kernel's routing
257   // table.
258   void FindRouteToHost(const IPAddress& address);
259
260   // Finds an ARP table entry for |address| by querying the kernel's ARP table.
261   void FindArpTableEntry(const IPAddress& address);
262
263   // Finds a neighbor table entry for |address| by requesting an RTNL neighbor
264   // table dump, and looking for a matching neighbor table entry for |address|
265   // in ConnectionDiagnostics::OnNeighborMsgReceived.
266   void FindNeighborTableEntry(const IPAddress& address);
267
268   // Checks for an IP collision by sending out an ARP request for the local IP
269   // address assigned to |connection_|.
270   void CheckIpCollision();
271
272   // Starts an IcmpSession with |address|. Called when we want to ping the
273   // target web server or local gateway.
274   void PingHost(const IPAddress& address);
275
276   // Called after each IcmpSession started in
277   // ConnectionDiagnostics::PingDNSServers finishes or times out. The DNS server
278   // that was pinged can be uniquely identified with |dns_server_index|.
279   // Attempts to resolve the IP address of |target_url_| again if at least one
280   // DNS server was pinged successfully, and if |num_dns_attempts_| has not yet
281   // reached |kMaxDNSRetries|.
282   void OnPingDNSServerComplete(int dns_server_index,
283                                const std::vector<base::TimeDelta>& result);
284
285   // Called after the DNS IP address resolution started in
286   // ConnectionDiagnostics::ResolveTargetServerIPAddress completes.
287   void OnDNSResolutionComplete(const Error& error, const IPAddress& address);
288
289   // Called after the IcmpSession started in ConnectionDiagnostics::PingHost on
290   // |address_pinged| finishes or times out. |ping_event_type| indicates the
291   // type of ping that was started (gateway or target web server), and |result|
292   // is the result of the IcmpSession.
293   void OnPingHostComplete(Type ping_event_type, const IPAddress& address_pinged,
294                           const std::vector<base::TimeDelta>& result);
295
296   // This I/O callback is triggered whenever the ARP reception socket has data
297   // available to be received.
298   void OnArpReplyReceived(int fd);
299
300   // Called if no replies to the ARP request sent in
301   // ConnectionDiagnostics::CheckIpCollision are received within
302   // |kArpReplyTimeoutSeconds| seconds.
303   void OnArpRequestTimeout();
304
305   // Called when replies are received to the neighbor table dump request issued
306   // in ConnectionDiagnostics::FindNeighborTableEntry.
307   void OnNeighborMsgReceived(const IPAddress& address_queried,
308                              const RTNLMessage& msg);
309
310   // Called if no neighbor table entry for |address_queried| is received within
311   // |kNeighborTableRequestTimeoutSeconds| of issuing a dump request in
312   // ConnectionDiagnostics::FindNeighborTableEntry.
313   void OnNeighborTableRequestTimeout(const IPAddress& address_queried);
314
315   // Called upon receiving a reply to the routing table query issued in
316   // ConnectionDiagnostics::FindRouteToHost.
317   void OnRouteQueryResponse(int interface_index,
318                             const RoutingTableEntry& entry);
319
320   // Called if no replies to the routing table query issued in
321   // ConnectionDiagnostics::FindRouteToHost are received within
322   // |kRouteQueryTimeoutSeconds|.
323   void OnRouteQueryTimeout();
324
325   // Utility function that returns true iff the event in |diagnostic_events_|
326   // that is |num_events_ago| before the last event has a matching |type|,
327   // |phase|, and |result|.
328   bool DoesPreviousEventMatch(Type type, Phase phase, Result result,
329                               size_t num_events_ago);
330
      // NOTE(review): base::WeakPtrFactory is conventionally declared as the last
      // data member so that outstanding weak pointers are invalidated before the
      // other members are destroyed; here it is declared first. Confirm the
      // ordering is intentional before relying on weak-pointer-bound callbacks
      // during destruction.
331   base::WeakPtrFactory<ConnectionDiagnostics> weak_ptr_factory_;
      // NOTE(review): raw pointers; ownership is not visible in this header --
      // presumably borrowed.
332   EventDispatcher* dispatcher_;
333   Metrics* metrics_;
334   RoutingTable* routing_table_;
335   RTNLHandler* rtnl_handler_;
336
337   // The connection being diagnosed.
338   ConnectionRefPtr connection_;
339
340   // Used to get the MAC address of the device associated with |connection_|.
341   const DeviceInfo* device_info_;
342
343   // The MAC address of device associated with |connection_|.
344   ByteString local_mac_address_;
345
      // Helper objects for the individual diagnostic steps; the unique_ptr
      // members are owned by this object.
346   DNSClientFactory* dns_client_factory_;
347   std::unique_ptr<DNSClient> dns_client_;
348   std::unique_ptr<PortalDetector> portal_detector_;
349   std::unique_ptr<ArpClient> arp_client_;
350   std::unique_ptr<IcmpSession> icmp_session_;
351
352   // The URL being diagnosed. Stored in unique_ptr so that it can be cleared
353   // when we stop diagnostics.
354   std::unique_ptr<HTTPURL> target_url_;
355
356   // Used to ping multiple DNS servers in |connection_| in parallel.
357   IcmpSessionFactory* icmp_session_factory_;
358   std::map<int, std::unique_ptr<IcmpSession>>
359       id_to_pending_dns_server_icmp_session_;
360   std::vector<std::string> pingable_dns_servers_;
361
      // DNS attempt counter that OnPingDNSServerComplete compares against
      // |kMaxDNSRetries|.
362   int num_dns_attempts_;
      // Value reported by running().
363   bool running_;
364
      // Callback run by ReportResultAndStop, followed by the cancelable callbacks
      // for the route query and for the route, ARP and neighbor-table timeouts.
365   ResultCallback result_callback_;
366   base::CancelableCallback<void(int, const RoutingTableEntry&)>
367       route_query_callback_;
368   base::CancelableClosure route_query_timeout_callback_;
369   base::CancelableClosure arp_reply_timeout_callback_;
370   base::CancelableClosure neighbor_request_timeout_callback_;
371
372   // IOCallback that fires when the socket associated with |arp_client_| has a
373   // packet to be received.  Calls ConnectionDiagnostics::OnArpReplyReceived.
374   std::unique_ptr<IOHandler> receive_response_handler_;
375
      // NOTE(review): presumably the RTNL listener that delivers neighbor table
      // messages to OnNeighborMsgReceived -- confirm against
      // FindNeighborTableEntry's definition.
376   std::unique_ptr<RTNLListener> neighbor_msg_listener_;
377
378   // Record of all diagnostic events that occurred, sorted in order of
379   // occurrence.
380   std::vector<Event> diagnostic_events_;
381
382   DISALLOW_COPY_AND_ASSIGN(ConnectionDiagnostics);
383 };
384 
385 }  // namespace shill
386 
387 #endif  // SHILL_CONNECTION_DIAGNOSTICS_H_
388