123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)// Copyright 2014 The Chromium Authors. All rights reserved.
223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)// found in the LICENSE file.
423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include <sstream>
623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "base/command_line.h"
823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "base/files/scoped_temp_dir.h"
923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "base/message_loop/message_loop.h"
1023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "base/path_service.h"
1123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "base/run_loop.h"
12cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)#include "base/strings/string_number_conversions.h"
13f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "base/strings/string_split.h"
1423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "components/dom_distiller/content/distiller_page_web_contents.h"
15f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "components/dom_distiller/core/article_entry.h"
16116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch#include "components/dom_distiller/core/distilled_page_prefs.h"
1723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "components/dom_distiller/core/distiller.h"
1823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "components/dom_distiller/core/dom_distiller_service.h"
1923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "components/dom_distiller/core/dom_distiller_store.h"
2023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "components/dom_distiller/core/proto/distilled_article.pb.h"
2123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "components/dom_distiller/core/proto/distilled_page.pb.h"
2223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "components/dom_distiller/core/task_tracker.h"
23f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "components/leveldb_proto/proto_database.h"
24f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "components/leveldb_proto/proto_database_impl.h"
25116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch#include "components/pref_registry/testing_pref_service_syncable.h"
2623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "content/public/browser/browser_context.h"
2723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "content/public/browser/browser_thread.h"
28effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch#include "content/public/test/content_browser_test.h"
2923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "content/shell/browser/shell.h"
30f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "google/protobuf/io/coded_stream.h"
31f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
3223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "net/dns/mock_host_resolver.h"
33cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)#include "third_party/dom_distiller_js/dom_distiller.pb.h"
3423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)#include "ui/base/resource/resource_bundle.h"
3523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
3623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)using content::ContentBrowserTest;
3723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
3823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)namespace dom_distiller {
3923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
4023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)namespace {
4123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
// Command-line switch names understood by the content extractor. They are
// declared as const arrays (rather than pointers to const) so the symbols
// themselves cannot be re-pointed at run time.

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of logging it.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Integer debug level forwarded to the distiller options.
const char kDebugLevel[] = "debug-level";

// Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8;
66f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
6723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)scoped_ptr<DomDistillerService> CreateDomDistillerService(
6823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    content::BrowserContext* context,
6923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    const base::FilePath& db_path) {
7023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  scoped_refptr<base::SequencedTaskRunner> background_task_runner =
7123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
7223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)          content::BrowserThread::GetBlockingPool()->GetSequenceToken());
7323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
7423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  // TODO(cjhopman): use an in-memory database instead of an on-disk one with
7523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  // temporary directory.
76f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
77f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
78f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)          background_task_runner));
7923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  scoped_ptr<DomDistillerStore> dom_distiller_store(new DomDistillerStore(
80f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      db.PassAs<leveldb_proto::ProtoDatabase<ArticleEntry> >(), db_path));
8123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
8223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  scoped_ptr<DistillerPageFactory> distiller_page_factory(
8323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      new DistillerPageWebContentsFactory(context));
8423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
8523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      new DistillerURLFetcherFactory(context->GetRequestContext()));
86cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
87cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  dom_distiller::proto::DomDistillerOptions options;
88cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
89cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    options.set_extract_text_only(true);
90cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  }
91cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  int debug_level = 0;
92cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
93cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)      base::StringToInt(
94cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)          base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
95cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)              kDebugLevel),
96cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)          &debug_level)) {
97cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    options.set_debug_level(debug_level);
98cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  }
995c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  scoped_ptr<DistillerFactory> distiller_factory(
100cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)      new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options));
10123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
102116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  // Setting up PrefService for DistilledPagePrefs.
103116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  user_prefs::TestingPrefServiceSyncable* pref_service =
104116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      new user_prefs::TestingPrefServiceSyncable();
105116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
106116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
10723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  return scoped_ptr<DomDistillerService>(new DomDistillerService(
10823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      dom_distiller_store.PassAs<DomDistillerStoreInterface>(),
1095c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu      distiller_factory.Pass(),
110116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      distiller_page_factory.Pass(),
111116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      scoped_ptr<DistilledPagePrefs>(
112116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch          new DistilledPagePrefs(pref_service))));
11323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)}
11423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
11523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)void AddComponentsResources() {
11623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  base::FilePath pak_file;
11723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  base::FilePath pak_dir;
11823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  PathService::Get(base::DIR_MODULE, &pak_dir);
11923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  pak_file = pak_dir.Append(FILE_PATH_LITERAL("components_resources.pak"));
12023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
12123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      pak_file, ui::SCALE_FACTOR_NONE);
12223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)}
12323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
124f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)bool WriteProtobufWithSize(
125f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    const google::protobuf::MessageLite& message,
126f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    google::protobuf::io::ZeroCopyOutputStream* output_stream) {
127f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  google::protobuf::io::CodedOutputStream coded_output(output_stream);
128f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
129f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  // Write the size.
130f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  const int size = message.ByteSize();
131f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  coded_output.WriteLittleEndian32(size);
132f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  message.SerializeWithCachedSizes(&coded_output);
133f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  return !coded_output.HadError();
134f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)}
135c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch
136f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)std::string GetReadableArticleString(
137f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    const DistilledArticleProto& article_proto) {
138f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  std::stringstream output;
139f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  output << "Article Title: " << article_proto.title() << std::endl;
140f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  output << "# of pages: " << article_proto.pages_size() << std::endl;
141f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  for (int i = 0; i < article_proto.pages_size(); ++i) {
142f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    const DistilledPageProto& page = article_proto.pages(i);
143f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    output << "Page " << i << std::endl;
144f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    output << "URL: " << page.url() << std::endl;
145f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    output << "Content: " << page.html() << std::endl;
1461320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    if (page.has_debug_info() && page.debug_info().has_log())
1471320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      output << "Log: " << page.debug_info().log() << std::endl;
14823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
149f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  return output.str();
15023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)}
15123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
15223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)}  // namespace
15323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
15423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)class ContentExtractionRequest : public ViewRequestDelegate {
15523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) public:
1565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  void Start(DomDistillerService* service, const gfx::Size& render_view_size,
1575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)             base::Closure finished_callback) {
15823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    finished_callback_ = finished_callback;
1595c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu    viewer_handle_ =
1605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)        service->ViewUrl(this,
1615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                         service->CreateDefaultDistillerPage(render_view_size),
1625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                         url_);
16323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
16423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
16523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  DistilledArticleProto GetArticleCopy() {
16623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    return *article_proto_;
16723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
16823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
169f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
17023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      const CommandLine& command_line) {
171f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    ScopedVector<ContentExtractionRequest> requests;
17223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    if (command_line.HasSwitch(kUrlSwitch)) {
173f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      GURL url;
17423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
17523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      url = GURL(url_string);
176f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      if (url.is_valid()) {
177f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        requests.push_back(new ContentExtractionRequest(url));
178f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      }
179f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    } else if (command_line.HasSwitch(kUrlsSwitch)) {
180f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
181f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      std::vector<std::string> urls;
182f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      base::SplitString(urls_string, ' ', &urls);
183f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      for (size_t i = 0; i < urls.size(); ++i) {
184f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        GURL url(urls[i]);
185f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (url.is_valid()) {
186f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)          requests.push_back(new ContentExtractionRequest(url));
187f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else {
188f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)          ADD_FAILURE() << "Bad url";
189f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
190f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      }
19123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    }
192f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    if (requests.empty()) {
19323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      ADD_FAILURE() << "No valid url provided";
19423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    }
195f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
196f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return requests.Pass();
19723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
19823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
19923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) private:
20023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  ContentExtractionRequest(const GURL& url) : url_(url) {}
20123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
20223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  virtual void OnArticleUpdated(ArticleDistillationUpdate article_update)
20323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      OVERRIDE {}
20423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
20523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  virtual void OnArticleReady(const DistilledArticleProto* article_proto)
20623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)      OVERRIDE {
20723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    article_proto_ = article_proto;
2081320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    CHECK(article_proto->pages_size()) << "Failed extracting " << url_;
20923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    base::MessageLoop::current()->PostTask(
21023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)        FROM_HERE,
21123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)        finished_callback_);
21223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
21323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
21423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  const DistilledArticleProto* article_proto_;
21523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  scoped_ptr<ViewerHandle> viewer_handle_;
21623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  GURL url_;
21723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  base::Closure finished_callback_;
21823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)};
21923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
22023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)class ContentExtractor : public ContentBrowserTest {
221f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) public:
222f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  ContentExtractor()
223f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      : pending_tasks_(0),
224f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        max_tasks_(kMaxExtractorTasks),
225f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        next_request_(0),
226f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        output_data_(),
227f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        protobuf_output_stream_(
228f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            new google::protobuf::io::StringOutputStream(&output_data_)) {}
229f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
23023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  // Change behavior of the default host resolver to avoid DNS lookup errors, so
23123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  // we can make network calls.
23223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  virtual void SetUpOnMainThread() OVERRIDE {
233c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch    if (!CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
234c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch      EnableDNSLookupForThisTest();
235c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch    }
23623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    CHECK(db_dir_.CreateUniqueTempDir());
23723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    AddComponentsResources();
23823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
23923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
24023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  virtual void TearDownOnMainThread() OVERRIDE {
24123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    DisableDNSLookupForThisTest();
24223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
24323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
24423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) protected:
24523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  // Creates the DomDistillerService and creates and starts the extraction
24623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  // request.
24723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  void Start() {
24823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    content::BrowserContext* context =
24923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)        shell()->web_contents()->GetBrowserContext();
25023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    service_ = CreateDomDistillerService(context,
25123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)                                         db_dir_.path());
25223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    const CommandLine& command_line = *CommandLine::ForCurrentProcess();
253f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    requests_ = ContentExtractionRequest::CreateForCommandLine(command_line);
254f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    PumpQueue();
255f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  }
256f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
257f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  void PumpQueue() {
258f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
259f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      requests_[next_request_]->Start(
260f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)          service_.get(),
2615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)          shell()->web_contents()->GetContainerBounds().size(),
262f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)          base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
263f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      ++next_request_;
264f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      ++pending_tasks_;
265f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
26623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
26723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
26823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) private:
26923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  // Change behavior of the default host resolver to allow DNS lookup
27023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  // to proceed instead of being blocked by the test infrastructure.
27123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  void EnableDNSLookupForThisTest() {
27223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    // mock_host_resolver_override_ takes ownership of the resolver.
27323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    scoped_refptr<net::RuleBasedHostResolverProc> resolver =
27423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)        new net::RuleBasedHostResolverProc(host_resolver());
27523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    resolver->AllowDirectLookup("*");
27623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    mock_host_resolver_override_.reset(
27723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)        new net::ScopedDefaultHostResolverProc(resolver.get()));
27823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
27923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
28023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  // We need to reset the DNS lookup when we finish, or the test will fail.
28123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  void DisableDNSLookupForThisTest() {
28223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    mock_host_resolver_override_.reset();
28323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
28423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
285f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  void FinishRequest() {
286f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    --pending_tasks_;
287f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    if (next_request_ == requests_.size() && pending_tasks_ == 0) {
288f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      Finish();
289f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    } else {
290f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      PumpQueue();
291f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
292f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  }
293f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
294f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  void DoArticleOutput() {
295f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    for (size_t i = 0; i < requests_.size(); ++i) {
296f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
297f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
298f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        WriteProtobufWithSize(article, protobuf_output_stream_.get());
299f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      } else {
300f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        output_data_ += GetReadableArticleString(article) + "\n";
301f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      }
302f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
303f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
304f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
305f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      base::FilePath filename =
306f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)          CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
307f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      ASSERT_EQ(
308f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)          (int)output_data_.size(),
309f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)          base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
310f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    } else {
311f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      VLOG(0) << output_data_;
312f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
313f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  }
314f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
31523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  void Finish() {
316f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    DoArticleOutput();
317f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    requests_.clear();
31823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    service_.reset();
31923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    base::MessageLoop::current()->PostTask(
32023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)        FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
32123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  }
32223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
323f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  size_t pending_tasks_;
324f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  size_t max_tasks_;
325f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  size_t next_request_;
326f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
32723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  base::ScopedTempDir db_dir_;
32823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
32923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  scoped_ptr<DomDistillerService> service_;
330f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  ScopedVector<ContentExtractionRequest> requests_;
331f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
332f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  std::string output_data_;
333f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
33423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)};
33523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
33623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
33723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  Start();
33823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)  base::RunLoop().Run();
33923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)}
34023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)
34123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)}  // namespace dom_distiller
342