/*
 * Copyright (C) 2014 Square, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.squareup.okhttp.sample;

import com.squareup.okhttp.Cache;
import com.squareup.okhttp.HttpUrl;
import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;
import com.squareup.okhttp.internal.NamedRunnable;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Fetches HTML from a requested URL, follows the links, and repeats.
 */
public final class Crawler {
  private final OkHttpClient client;
  private final Set<HttpUrl> fetchedUrls = Collections.synchronizedSet(
      new LinkedHashSet<HttpUrl>());
  private final LinkedBlockingQueue<HttpUrl> queue = new LinkedBlockingQueue<>();
  private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();

  public Crawler(OkHttpClient client) {
    this.client = client;
  }

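  /**
   * Starts {@code threadCount} worker threads that each drain the queue. The workers block on an
   * empty queue, so the crawler keeps running until the process is killed.
   */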
  private void parallelDrainQueue(int threadCount) {
    ExecutorService executor = Executors.newFixedThreadPool(threadCount);
    for (int i = 0; i < threadCount; i++) {
      executor.execute(new NamedRunnable("Crawler %s", i) {
        @Override protected void execute() {
          try {
            drainQueue();
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      });
    }
    executor.shutdown();
  }

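  /**
   * Takes URLs from the queue until interrupted, skipping any URL that has already been fetched
   * and logging (rather than rethrowing) IOExceptions from individual fetches.
   */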
  private void drainQueue() throws Exception {
    for (HttpUrl url; (url = queue.take()) != null; ) {
      if (!fetchedUrls.add(url)) {
        continue;
      }

      try {
        fetch(url);
      } catch (IOException e) {
        System.out.printf("XXX: %s %s%n", url, e);
      }
    }
  }

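  /**
   * Fetches {@code url}, reports whether the response was served from the network or the cache,
   * and enqueues every link found in the HTML body.
   */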
  public void fetch(HttpUrl url) throws IOException {
    // Limit crawling to 100 URLs per host.
    AtomicInteger hostnameCount = new AtomicInteger();
    AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount);
    if (previous != null) hostnameCount = previous;
    if (hostnameCount.incrementAndGet() > 100) return;

    Request request = new Request.Builder()
        .url(url)
        .build();
    Response response = client.newCall(request).execute();
    String responseSource = response.networkResponse() != null
        ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")")
        : "(cache)";
    int responseCode = response.code();

    System.out.printf("%03d: %s %s%n", responseCode, url, responseSource);

    String contentType = response.header("Content-Type");
    if (responseCode != 200 || contentType == null) {
      response.body().close();
      return;
    }

    Document document = Jsoup.parse(response.body().string(), url.toString());
    for (Element element : document.select("a[href]")) {
      String href = element.attr("href");
      HttpUrl link = response.request().httpUrl().resolve(href);
      if (link != null) queue.add(link); // resolve() returns null for unsupported or malformed URLs.
    }
  }

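  /** Usage: {@code Crawler <cache dir> <root>}. */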
  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.out.println("Usage: Crawler <cache dir> <root>");
      return;
    }

    int threadCount = 20;
    long cacheByteCount = 1024L * 1024L * 100L; // 100 MiB of response cache.

    OkHttpClient client = new OkHttpClient();
    Cache cache = new Cache(new File(args[0]), cacheByteCount);
    client.setCache(cache);

    Crawler crawler = new Crawler(client);
    crawler.queue.add(HttpUrl.parse(args[1]));
    crawler.parallelDrainQueue(threadCount);
  }
}