/*
 * Copyright (C) 2014 Square, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerpackage com.squareup.okhttp.sample;
17c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
18c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport com.squareup.okhttp.HttpResponseCache;
19c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport com.squareup.okhttp.MediaType;
20c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport com.squareup.okhttp.OkHttpClient;
21c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport com.squareup.okhttp.internal.http.OkHeaders;
22c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.io.File;
23c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.io.IOException;
24c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.io.InputStream;
25c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.net.HttpURLConnection;
26c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.net.MalformedURLException;
27c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.net.URL;
28c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.nio.charset.Charset;
29c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.Collections;
30c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.LinkedHashSet;
31c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.Set;
32c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.concurrent.ExecutorService;
33c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.concurrent.Executors;
34c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.concurrent.LinkedBlockingQueue;
35c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport org.jsoup.Jsoup;
36c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport org.jsoup.nodes.Document;
37c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport org.jsoup.nodes.Element;
38c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
39c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller/**
40c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller * Fetches HTML from a requested URL, follows the links, and repeats.
41c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller */
42c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerpublic final class Crawler {
43c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  public static final Charset UTF_8 = Charset.forName("UTF-8");
44c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
45c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  private final OkHttpClient client;
46c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  private final Set<URL> fetchedUrls = Collections.synchronizedSet(new LinkedHashSet<URL>());
47c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  private final LinkedBlockingQueue<URL> queue = new LinkedBlockingQueue<URL>();
48c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
49c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  public Crawler(OkHttpClient client) {
50c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    this.client = client;
51c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
52c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
53c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  private void parallelDrainQueue(int threadCount) {
54c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    ExecutorService executor = Executors.newFixedThreadPool(threadCount);
55c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    for (int i = 0; i < threadCount; i++) {
56c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      executor.execute(new Runnable() {
57c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller        @Override public void run() {
58c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller          try {
59c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller            drainQueue();
60c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller          } catch (Exception e) {
61c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller            e.printStackTrace();
62c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller          }
63c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller        }
64c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      });
65c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
66c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    executor.shutdown();
67c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
68c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
69c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  private void drainQueue() throws Exception {
70c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    for (URL url; (url = queue.take()) != null; ) {
71c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      if (!fetchedUrls.add(url)) {
72c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller        continue;
73c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      }
74c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
75c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      try {
76c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller        fetch(url);
77c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      } catch (IOException e) {
78c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller        System.out.printf("XXX: %s %s%n", url, e);
79c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      }
80c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
81c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
82c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
83c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  public void fetch(URL url) throws IOException {
84c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    HttpURLConnection connection = client.open(url);
85c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    String responseSource = connection.getHeaderField(OkHeaders.RESPONSE_SOURCE);
86c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    String contentType = connection.getHeaderField("Content-Type");
87c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    int responseCode = connection.getResponseCode();
88c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
89c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    System.out.printf("%03d: %s %s%n", responseCode, url, responseSource);
90c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
91c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    if (responseCode >= 400) {
92c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      connection.getErrorStream().close();
93c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      return;
94c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
95c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
96c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    InputStream in = connection.getInputStream();
97c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    if (responseCode != 200 || contentType == null) {
98c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      in.close();
99c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      return;
100c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
101c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
102c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    MediaType mediaType = MediaType.parse(contentType);
103c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    Document document = Jsoup.parse(in, mediaType.charset(UTF_8).name(), url.toString());
104c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    for (Element element : document.select("a[href]")) {
105c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      String href = element.attr("href");
106c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      URL link = parseUrl(url, href);
107c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      if (link != null) queue.add(link);
108c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
109c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
110c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    in.close();
111c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
112c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
113c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  private URL parseUrl(URL url, String href) {
114c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    try {
115c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      URL result = new URL(url, href);
116c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      return result.getProtocol().equals("http") || result.getProtocol().equals("https")
117c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller          ? result
118c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller          : null;
119c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    } catch (MalformedURLException e) {
120c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      return null;
121c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
122c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
123c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
124c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  public static void main(String[] args) throws IOException {
125c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    if (args.length != 2) {
126c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      System.out.println("Usage: Crawler <cache dir> <root>");
127c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller      return;
128c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    }
129c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
130c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    int threadCount = 20;
131c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    long cacheByteCount = 1024L * 1024L * 100L;
132c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
133c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    OkHttpClient client = new OkHttpClient();
134c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    HttpResponseCache httpResponseCache = new HttpResponseCache(new File(args[0]), cacheByteCount);
135c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    client.setOkResponseCache(httpResponseCache);
136c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller
137c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    Crawler crawler = new Crawler(client);
138c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    crawler.queue.add(new URL(args[1]));
139c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller    crawler.parallelDrainQueue(threadCount);
140c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller  }
141c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller}
142