/*
 * Copyright (C) 2014 Square, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller */ 16c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerpackage com.squareup.okhttp.sample; 17c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 18c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport com.squareup.okhttp.HttpResponseCache; 19c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport com.squareup.okhttp.MediaType; 20c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport com.squareup.okhttp.OkHttpClient; 21c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport com.squareup.okhttp.internal.http.OkHeaders; 22c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.io.File; 23c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.io.IOException; 24c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.io.InputStream; 25c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.net.HttpURLConnection; 26c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.net.MalformedURLException; 27c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.net.URL; 28c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.nio.charset.Charset; 29c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.Collections; 30c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.LinkedHashSet; 31c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.Set; 32c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.concurrent.ExecutorService; 33c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.concurrent.Executors; 34c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport java.util.concurrent.LinkedBlockingQueue; 35c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport org.jsoup.Jsoup; 36c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport org.jsoup.nodes.Document; 37c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerimport org.jsoup.nodes.Element; 38c6bd683320121544811f481709b3fdbcbe9b3866Neil 
Fuller 39c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller/** 40c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller * Fetches HTML from a requested URL, follows the links, and repeats. 41c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller */ 42c6bd683320121544811f481709b3fdbcbe9b3866Neil Fullerpublic final class Crawler { 43c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller public static final Charset UTF_8 = Charset.forName("UTF-8"); 44c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 45c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller private final OkHttpClient client; 46c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller private final Set<URL> fetchedUrls = Collections.synchronizedSet(new LinkedHashSet<URL>()); 47c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller private final LinkedBlockingQueue<URL> queue = new LinkedBlockingQueue<URL>(); 48c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 49c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller public Crawler(OkHttpClient client) { 50c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller this.client = client; 51c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 52c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 53c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller private void parallelDrainQueue(int threadCount) { 54c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller ExecutorService executor = Executors.newFixedThreadPool(threadCount); 55c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller for (int i = 0; i < threadCount; i++) { 56c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller executor.execute(new Runnable() { 57c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller @Override public void run() { 58c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller try { 59c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller drainQueue(); 60c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } catch (Exception e) { 61c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller e.printStackTrace(); 
62c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 63c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 64c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller }); 65c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 66c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller executor.shutdown(); 67c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 68c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 69c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller private void drainQueue() throws Exception { 70c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller for (URL url; (url = queue.take()) != null; ) { 71c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller if (!fetchedUrls.add(url)) { 72c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller continue; 73c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 74c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 75c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller try { 76c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller fetch(url); 77c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } catch (IOException e) { 78c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller System.out.printf("XXX: %s %s%n", url, e); 79c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 80c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 81c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 82c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 83c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller public void fetch(URL url) throws IOException { 84c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller HttpURLConnection connection = client.open(url); 85c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller String responseSource = connection.getHeaderField(OkHeaders.RESPONSE_SOURCE); 86c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller String contentType = connection.getHeaderField("Content-Type"); 87c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller int responseCode = connection.getResponseCode(); 88c6bd683320121544811f481709b3fdbcbe9b3866Neil 
Fuller 89c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller System.out.printf("%03d: %s %s%n", responseCode, url, responseSource); 90c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 91c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller if (responseCode >= 400) { 92c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller connection.getErrorStream().close(); 93c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller return; 94c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 95c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 96c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller InputStream in = connection.getInputStream(); 97c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller if (responseCode != 200 || contentType == null) { 98c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller in.close(); 99c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller return; 100c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 101c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 102c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller MediaType mediaType = MediaType.parse(contentType); 103c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller Document document = Jsoup.parse(in, mediaType.charset(UTF_8).name(), url.toString()); 104c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller for (Element element : document.select("a[href]")) { 105c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller String href = element.attr("href"); 106c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller URL link = parseUrl(url, href); 107c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller if (link != null) queue.add(link); 108c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 109c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 110c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller in.close(); 111c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 112c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 113c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller private URL parseUrl(URL url, String href) { 
114c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller try { 115c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller URL result = new URL(url, href); 116c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller return result.getProtocol().equals("http") || result.getProtocol().equals("https") 117c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller ? result 118c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller : null; 119c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } catch (MalformedURLException e) { 120c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller return null; 121c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 122c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 123c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 124c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller public static void main(String[] args) throws IOException { 125c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller if (args.length != 2) { 126c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller System.out.println("Usage: Crawler <cache dir> <root>"); 127c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller return; 128c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 129c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 130c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller int threadCount = 20; 131c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller long cacheByteCount = 1024L * 1024L * 100L; 132c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 133c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller OkHttpClient client = new OkHttpClient(); 134c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller HttpResponseCache httpResponseCache = new HttpResponseCache(new File(args[0]), cacheByteCount); 135c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller client.setOkResponseCache(httpResponseCache); 136c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller 137c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller Crawler crawler = new Crawler(client); 138c6bd683320121544811f481709b3fdbcbe9b3866Neil 
Fuller crawler.queue.add(new URL(args[1])); 139c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller crawler.parallelDrainQueue(threadCount); 140c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller } 141c6bd683320121544811f481709b3fdbcbe9b3866Neil Fuller} 142