/* * Copyright (C) 2014 Square, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.squareup.okhttp.sample; import com.squareup.okhttp.Cache; import com.squareup.okhttp.HttpUrl; import com.squareup.okhttp.OkHttpClient; import com.squareup.okhttp.Request; import com.squareup.okhttp.Response; import com.squareup.okhttp.internal.NamedRunnable; import java.io.File; import java.io.IOException; import java.util.Collections; import java.util.LinkedHashSet; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.atomic.AtomicInteger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; /** * Fetches HTML from a requested URL, follows the links, and repeats. */ public final class Crawler { private final OkHttpClient client; private final Set fetchedUrls = Collections.synchronizedSet( new LinkedHashSet()); private final LinkedBlockingQueue queue = new LinkedBlockingQueue<>(); private final ConcurrentHashMap hostnames = new ConcurrentHashMap<>(); public Crawler(OkHttpClient client) { this.client = client; } private void parallelDrainQueue(int threadCount) { ExecutorService executor = Executors.newFixedThreadPool(threadCount); for (int i = 0; i < threadCount; i++) { executor.execute(new NamedRunnable("Crawler %s", i) { @Override protected void execute() { try { drainQueue(); } catch (Exception e) { e.printStackTrace(); } } }); } executor.shutdown(); } private void drainQueue() throws Exception { for (HttpUrl url; (url = queue.take()) != null; ) { if (!fetchedUrls.add(url)) { continue; } try { fetch(url); } catch (IOException e) { System.out.printf("XXX: %s %s%n", url, e); } } } public void fetch(HttpUrl url) throws IOException { // Skip hosts that we've visited many times. AtomicInteger hostnameCount = new AtomicInteger(); AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount); if (previous != null) hostnameCount = previous; if (hostnameCount.incrementAndGet() > 100) return; Request request = new Request.Builder() .url(url) .build(); Response response = client.newCall(request).execute(); String responseSource = response.networkResponse() != null ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")") : "(cache)"; int responseCode = response.code(); System.out.printf("%03d: %s %s%n", responseCode, url, responseSource); String contentType = response.header("Content-Type"); if (responseCode != 200 || contentType == null) { response.body().close(); return; } Document document = Jsoup.parse(response.body().string(), url.toString()); for (Element element : document.select("a[href]")) { String href = element.attr("href"); HttpUrl link = response.request().httpUrl().resolve(href); if (link != null) queue.add(link); } } public static void main(String[] args) throws IOException { if (args.length != 2) { System.out.println("Usage: Crawler "); return; } int threadCount = 20; long cacheByteCount = 1024L * 1024L * 100L; OkHttpClient client = new OkHttpClient(); Cache cache = new Cache(new File(args[0]), cacheByteCount); client.setCache(cache); Crawler crawler = new Crawler(client); crawler.queue.add(HttpUrl.parse(args[1])); crawler.parallelDrainQueue(threadCount); } }