/*
 * Copyright (C) 2014 Square, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.squareup.okhttp.sample;

import com.squareup.okhttp.Cache;
import com.squareup.okhttp.HttpUrl;
import com.squareup.okhttp.MediaType;
import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;
import com.squareup.okhttp.internal.NamedRunnable;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

38c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller/**
39c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller * Fetches HTML from a requested URL, follows the links, and repeats.
40c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller */
41c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerpublic final class Crawler {
42c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  private final OkHttpClient client;
4371b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller  private final Set<HttpUrl> fetchedUrls = Collections.synchronizedSet(
4471b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller      new LinkedHashSet<HttpUrl>());
4571b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller  private final LinkedBlockingQueue<HttpUrl> queue = new LinkedBlockingQueue<>();
46a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller  private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();
47c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
48c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  public Crawler(OkHttpClient client) {
49c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    this.client = client;
50c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
51c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
52c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  private void parallelDrainQueue(int threadCount) {
53c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    ExecutorService executor = Executors.newFixedThreadPool(threadCount);
54c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    for (int i = 0; i < threadCount; i++) {
55a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller      executor.execute(new NamedRunnable("Crawler %s", i) {
56a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller        @Override protected void execute() {
57c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller          try {
58c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller            drainQueue();
59c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller          } catch (Exception e) {
60c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller            e.printStackTrace();
61c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller          }
62c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller        }
63c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      });
64c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
65c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    executor.shutdown();
66c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
67c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
68c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  private void drainQueue() throws Exception {
6971b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller    for (HttpUrl url; (url = queue.take()) != null; ) {
70c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      if (!fetchedUrls.add(url)) {
71c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller        continue;
72c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      }
73c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
74c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      try {
75c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller        fetch(url);
76c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      } catch (IOException e) {
77c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller        System.out.printf("XXX: %s %s%n", url, e);
78c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      }
79c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
80c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
81c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
8271b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller  public void fetch(HttpUrl url) throws IOException {
83a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller    // Skip hosts that we've visited many times.
84a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller    AtomicInteger hostnameCount = new AtomicInteger();
8571b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller    AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount);
86a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller    if (previous != null) hostnameCount = previous;
87a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller    if (hostnameCount.incrementAndGet() > 100) return;
88a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller
89e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller    Request request = new Request.Builder()
90e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller        .url(url)
91e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller        .build();
92e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller    Response response = client.newCall(request).execute();
93e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller    String responseSource = response.networkResponse() != null
94a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller        ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")")
95e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller        : "(cache)";
96e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller    int responseCode = response.code();
97c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
98c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    System.out.printf("%03d: %s %s%n", responseCode, url, responseSource);
99c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
100e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller    String contentType = response.header("Content-Type");
101c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    if (responseCode != 200 || contentType == null) {
102e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller      response.body().close();
103c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      return;
104c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
105c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
106e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller    Document document = Jsoup.parse(response.body().string(), url.toString());
107c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    for (Element element : document.select("a[href]")) {
108c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      String href = element.attr("href");
10971b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller      HttpUrl link = response.request().httpUrl().resolve(href);
110c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      if (link != null) queue.add(link);
111c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
112c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
113c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
114c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  public static void main(String[] args) throws IOException {
115c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    if (args.length != 2) {
116c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      System.out.println("Usage: Crawler <cache dir> <root>");
117c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      return;
118c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
119c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
120c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    int threadCount = 20;
121c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    long cacheByteCount = 1024L * 1024L * 100L;
122c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
123c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    OkHttpClient client = new OkHttpClient();
124e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller    Cache cache = new Cache(new File(args[0]), cacheByteCount);
125e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller    client.setCache(cache);
126c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
127c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    Crawler crawler = new Crawler(client);
12871b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller    crawler.queue.add(HttpUrl.parse(args[1]));
129c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    crawler.parallelDrainQueue(threadCount);
130c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
131c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller}
132