/*
 * Copyright (C) 2014 Square, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.squareup.okhttp.sample;

import com.squareup.okhttp.Cache;
import com.squareup.okhttp.HttpUrl;
import com.squareup.okhttp.MediaType;
import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;
import com.squareup.okhttp.internal.NamedRunnable;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Fetches HTML from a requested URL, follows the links, and repeats.
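 *
 * <p>Worker threads take URLs from a shared queue, fetch them through a
 * caching {@link OkHttpClient}, and enqueue every link they find. A sketch
 * of the command-line usage implemented by {@code main} (the cache directory
 * and seed URL here are placeholders):
 * <pre>{@code
 *   java com.squareup.okhttp.sample.Crawler /tmp/crawler-cache https://example.com/
 * }</pre>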
 */
public final class Crawler {
  private final OkHttpClient client;

  /** URLs that have already been fetched, so no URL is fetched twice. */
  private final Set<HttpUrl> fetchedUrls = Collections.synchronizedSet(
      new LinkedHashSet<HttpUrl>());

  /** URLs discovered but not yet fetched, shared by all worker threads. */
  private final LinkedBlockingQueue<HttpUrl> queue = new LinkedBlockingQueue<>();

  /** Fetch count per hostname, used to cap visits to any single host. */
  private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();

  public Crawler(OkHttpClient client) {
    this.client = client;
  }

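  /**
   * Starts {@code threadCount} worker threads that each drain the shared
   * queue. The workers block waiting for new URLs, so the crawl runs until
   * the process is killed.
   */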
  private void parallelDrainQueue(int threadCount) {
    ExecutorService executor = Executors.newFixedThreadPool(threadCount);
    for (int i = 0; i < threadCount; i++) {
      executor.execute(new NamedRunnable("Crawler %s", i) {
        @Override protected void execute() {
          try {
            drainQueue();
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      });
    }
    executor.shutdown();
  }

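  /**
   * Repeatedly takes the next URL from the queue and fetches it, skipping
   * URLs that were already fetched. Failed fetches are logged and skipped.
   */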
  private void drainQueue() throws Exception {
    // take() blocks until a URL is available and never returns null, so this
    // loop runs until the thread is interrupted.
    for (HttpUrl url; (url = queue.take()) != null; ) {
      if (!fetchedUrls.add(url)) {
        continue; // Already fetched.
      }

      try {
        fetch(url);
      } catch (IOException e) {
        System.out.printf("XXX: %s %s%n", url, e);
      }
    }
  }

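  /**
   * Fetches {@code url}, prints the response code and whether the response
   * came from the network or the cache, then enqueues every link found in
   * the returned HTML. At most 100 URLs are fetched per hostname.
   */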
  public void fetch(HttpUrl url) throws IOException {
    // Skip hosts that we've visited many times. putIfAbsent keeps one shared
    // counter per hostname, so the increment is safe across worker threads.
    AtomicInteger hostnameCount = new AtomicInteger();
    AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount);
    if (previous != null) hostnameCount = previous;
    if (hostnameCount.incrementAndGet() > 100) return;

    Request request = new Request.Builder()
        .url(url)
        .build();
    Response response = client.newCall(request).execute();
    String responseSource = response.networkResponse() != null
        ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")")
        : "(cache)";
    int responseCode = response.code();

    System.out.printf("%03d: %s %s%n", responseCode, url, responseSource);

    String contentType = response.header("Content-Type");
    if (responseCode != 200 || contentType == null) {
      response.body().close();
      return;
    }

    // Only parse responses that declare an HTML media type; skip images,
    // scripts, and other content that Jsoup can't usefully parse.
    MediaType mediaType = MediaType.parse(contentType);
    if (mediaType == null || !mediaType.subtype().equalsIgnoreCase("html")) {
      response.body().close();
      return;
    }

    Document document = Jsoup.parse(response.body().string(), url.toString());
    for (Element element : document.select("a[href]")) {
      String href = element.attr("href");
      // Resolve relative links against the request URL, which reflects any
      // redirects that were followed.
      HttpUrl link = response.request().httpUrl().resolve(href);
      if (link != null) queue.add(link);
    }
  }

  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.out.println("Usage: Crawler <cache dir> <root>");
      return;
    }

    int threadCount = 20;
    long cacheByteCount = 1024L * 1024L * 100L; // 100 MiB of response cache.

    OkHttpClient client = new OkHttpClient();
    Cache cache = new Cache(new File(args[0]), cacheByteCount);
    client.setCache(cache);

    Crawler crawler = new Crawler(client);
    crawler.queue.add(HttpUrl.parse(args[1]));
    crawler.parallelDrainQueue(threadCount);
  }
}