/*
 * Copyright (C) 2014 Square, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.squareup.okhttp.sample;

import com.squareup.okhttp.Cache;
import com.squareup.okhttp.HttpUrl;
import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;
import com.squareup.okhttp.internal.NamedRunnable;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Fetches HTML from a requested URL, follows the links, and repeats.
 */
public final class Crawler {
  private final OkHttpClient client;
  private final Set<HttpUrl> fetchedUrls = Collections.synchronizedSet(
      new LinkedHashSet<HttpUrl>());
  private final LinkedBlockingQueue<HttpUrl> queue = new LinkedBlockingQueue<>();
  private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();

  public Crawler(OkHttpClient client) {
    this.client = client;
  }

  private void parallelDrainQueue(int threadCount) {
    ExecutorService executor = Executors.newFixedThreadPool(threadCount);
    for (int i = 0; i < threadCount; i++) {
      executor.execute(new NamedRunnable("Crawler %s", i) {
        @Override protected void execute() {
          try {
            drainQueue();
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      });
    }
    executor.shutdown();
  }

  private void drainQueue() throws Exception {
    // take() blocks rather than returning null, so each worker loops until the process exits.
    for (HttpUrl url; (url = queue.take()) != null; ) {
      // Set.add() returns false if the URL was already crawled; skip duplicates.
      if (!fetchedUrls.add(url)) {
        continue;
      }

      try {
        fetch(url);
      } catch (IOException e) {
        System.out.printf("XXX: %s %s%n", url, e);
      }
    }
  }

  public void fetch(HttpUrl url) throws IOException {
    // Skip hosts that we've visited many times. putIfAbsent() atomically installs a fresh
    // counter for this host, or returns the one another thread installed first.
    AtomicInteger hostnameCount = new AtomicInteger();
    AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount);
    if (previous != null) hostnameCount = previous;
    if (hostnameCount.incrementAndGet() > 100) return;

    Request request = new Request.Builder()
        .url(url)
        .build();
    Response response = client.newCall(request).execute();
    String responseSource = response.networkResponse() != null
        ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")")
        : "(cache)";
    int responseCode = response.code();

    System.out.printf("%03d: %s %s%n", responseCode, url, responseSource);

    String contentType = response.header("Content-Type");
    if (responseCode != 200 || contentType == null) {
      response.body().close();
      return;
    }

    Document document = Jsoup.parse(response.body().string(), url.toString());
    for (Element element : document.select("a[href]")) {
      String href = element.attr("href");
      HttpUrl link = response.request().httpUrl().resolve(href);
      if (link != null) queue.add(link);
    }
  }

  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.out.println("Usage: Crawler <cache dir> <root>");
      return;
    }

    int threadCount = 20;
    long cacheByteCount = 1024L * 1024L * 100L; // 100 MiB response cache.

    OkHttpClient client = new OkHttpClient();
    Cache cache = new Cache(new File(args[0]), cacheByteCount);
    client.setCache(cache);

    Crawler crawler = new Crawler(client);
    crawler.queue.add(HttpUrl.parse(args[1]));
    crawler.parallelDrainQueue(threadCount);
  }
}