/*
 * Copyright (C) 2014 Square, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.squareup.okhttp.sample;

import com.squareup.okhttp.Cache;
import com.squareup.okhttp.HttpUrl;
import com.squareup.okhttp.MediaType;
import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;
import com.squareup.okhttp.internal.NamedRunnable;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

Fuller/** 39c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller * Fetches HTML from a requested URL, follows the links, and repeats. 40c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller */ 41c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerpublic final class Crawler { 42c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller private final OkHttpClient client; 4371b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller private final Set<HttpUrl> fetchedUrls = Collections.synchronizedSet( 4471b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller new LinkedHashSet<HttpUrl>()); 4571b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller private final LinkedBlockingQueue<HttpUrl> queue = new LinkedBlockingQueue<>(); 46a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>(); 47c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 48c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller public Crawler(OkHttpClient client) { 49c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller this.client = client; 50c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 51c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 52c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller private void parallelDrainQueue(int threadCount) { 53c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller ExecutorService executor = Executors.newFixedThreadPool(threadCount); 54c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller for (int i = 0; i < threadCount; i++) { 55a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller executor.execute(new NamedRunnable("Crawler %s", i) { 56a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller @Override protected void execute() { 57c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller try { 58c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller drainQueue(); 59c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } catch (Exception e) { 60c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller e.printStackTrace(); 
61c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 62c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 63c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller }); 64c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 65c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller executor.shutdown(); 66c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 67c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 68c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller private void drainQueue() throws Exception { 6971b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller for (HttpUrl url; (url = queue.take()) != null; ) { 70c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller if (!fetchedUrls.add(url)) { 71c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller continue; 72c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 73c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 74c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller try { 75c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller fetch(url); 76c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } catch (IOException e) { 77c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller System.out.printf("XXX: %s %s%n", url, e); 78c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 79c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 80c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 81c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 8271b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller public void fetch(HttpUrl url) throws IOException { 83a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller // Skip hosts that we've visited many times. 
84a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller AtomicInteger hostnameCount = new AtomicInteger(); 8571b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount); 86a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller if (previous != null) hostnameCount = previous; 87a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller if (hostnameCount.incrementAndGet() > 100) return; 88a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller 89e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller Request request = new Request.Builder() 90e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller .url(url) 91e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller .build(); 92e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller Response response = client.newCall(request).execute(); 93e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller String responseSource = response.networkResponse() != null 94a2cab72aa5ff730ba2ae987b45398faafffeb505Neil Fuller ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")") 95e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller : "(cache)"; 96e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller int responseCode = response.code(); 97c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 98c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller System.out.printf("%03d: %s %s%n", responseCode, url, responseSource); 99c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 100e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller String contentType = response.header("Content-Type"); 101c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller if (responseCode != 200 || contentType == null) { 102e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller response.body().close(); 103c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller return; 104c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 105c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 106e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller 
Document document = Jsoup.parse(response.body().string(), url.toString()); 107c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller for (Element element : document.select("a[href]")) { 108c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller String href = element.attr("href"); 10971b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller HttpUrl link = response.request().httpUrl().resolve(href); 110c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller if (link != null) queue.add(link); 111c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 112c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 113c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 114c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller public static void main(String[] args) throws IOException { 115c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller if (args.length != 2) { 116c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller System.out.println("Usage: Crawler <cache dir> <root>"); 117c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller return; 118c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 119c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 120c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller int threadCount = 20; 121c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller long cacheByteCount = 1024L * 1024L * 100L; 122c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 123c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller OkHttpClient client = new OkHttpClient(); 124e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller Cache cache = new Cache(new File(args[0]), cacheByteCount); 125e78f117bcbd6b57d783737107f445ef75ecb474aNeil Fuller client.setCache(cache); 126c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 127c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller Crawler crawler = new Crawler(client); 12871b9f47b26fb57ac3e436a19519c6e3ec70e86ebNeil Fuller crawler.queue.add(HttpUrl.parse(args[1])); 129c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller crawler.parallelDrainQueue(threadCount); 
130c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 131c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller} 132