// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <sstream>
#include <string>
#include <vector>

#include "base/command_line.h"
#include "base/files/scoped_temp_dir.h"
#include "base/message_loop/message_loop.h"
#include "base/path_service.h"
#include "base/run_loop.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_split.h"
#include "components/dom_distiller/content/distiller_page_web_contents.h"
#include "components/dom_distiller/core/article_entry.h"
#include "components/dom_distiller/core/distilled_page_prefs.h"
#include "components/dom_distiller/core/distiller.h"
#include "components/dom_distiller/core/dom_distiller_service.h"
#include "components/dom_distiller/core/dom_distiller_store.h"
#include "components/dom_distiller/core/proto/distilled_article.pb.h"
#include "components/dom_distiller/core/proto/distilled_page.pb.h"
#include "components/dom_distiller/core/task_tracker.h"
#include "components/leveldb_proto/proto_database.h"
#include "components/leveldb_proto/proto_database_impl.h"
#include "components/pref_registry/testing_pref_service_syncable.h"
#include "content/public/browser/browser_context.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/test/content_browser_test.h"
#include "content/shell/browser/shell.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
#include "net/dns/mock_host_resolver.h"
#include "third_party/dom_distiller_js/dom_distiller.pb.h"
#include "ui/base/resource/resource_bundle.h"

using content::ContentBrowserTest;
3723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 3823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)namespace dom_distiller { 3923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 4023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)namespace { 4123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 42c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch// The url to distill. 4323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)const char* kUrlSwitch = "url"; 4423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 45f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)// A space-separated list of urls to distill. 46f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)const char* kUrlsSwitch = "urls"; 47f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 48c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch// Indicates that DNS resolution should be disabled for this test. 49c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdochconst char* kDisableDnsSwitch = "disable-dns"; 50c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch 51c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch// Will write the distilled output to the given file instead of to stdout. 52c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdochconst char* kOutputFile = "output-file"; 53c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch 54c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch// Indicates to output a serialized protocol buffer instead of human-readable 55c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch// output. 56c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdochconst char* kShouldOutputBinary = "output-binary"; 57c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch 58cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)// Indicates to output only the text of the article and not the enclosing html. 
59cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)const char* kExtractTextOnly = "extract-text-only"; 60cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) 61cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)// Indicates to include debug output. 62cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)const char* kDebugLevel = "debug-level"; 63cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) 64f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)// Maximum number of concurrent started extractor requests. 65f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)const int kMaxExtractorTasks = 8; 66f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 6723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)scoped_ptr<DomDistillerService> CreateDomDistillerService( 6823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) content::BrowserContext* context, 6923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) const base::FilePath& db_path) { 7023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) scoped_refptr<base::SequencedTaskRunner> background_task_runner = 7123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( 7223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) content::BrowserThread::GetBlockingPool()->GetSequenceToken()); 7323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 7423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) // TODO(cjhopman): use an in-memory database instead of an on-disk one with 7523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) // temporary directory. 
76f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( 77f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( 78f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) background_task_runner)); 7923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) scoped_ptr<DomDistillerStore> dom_distiller_store(new DomDistillerStore( 80f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) db.PassAs<leveldb_proto::ProtoDatabase<ArticleEntry> >(), db_path)); 8123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 8223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) scoped_ptr<DistillerPageFactory> distiller_page_factory( 8323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) new DistillerPageWebContentsFactory(context)); 8423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory( 8523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) new DistillerURLFetcherFactory(context->GetRequestContext())); 86cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) 87cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) dom_distiller::proto::DomDistillerOptions options; 88cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) { 89cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) options.set_extract_text_only(true); 90cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) } 91cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) int debug_level = 0; 92cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && 93cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) base::StringToInt( 94cedac228d2dd51db4b79ea1e72c7f249408ee061Torne 
(Richard Coles) base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( 95cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) kDebugLevel), 96cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) &debug_level)) { 97cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) options.set_debug_level(debug_level); 98cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) } 995c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu scoped_ptr<DistillerFactory> distiller_factory( 100cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options)); 10123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 102116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch // Setting up PrefService for DistilledPagePrefs. 103116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch user_prefs::TestingPrefServiceSyncable* pref_service = 104116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch new user_prefs::TestingPrefServiceSyncable(); 105116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); 106116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch 10723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) return scoped_ptr<DomDistillerService>(new DomDistillerService( 10823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) dom_distiller_store.PassAs<DomDistillerStoreInterface>(), 1095c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu distiller_factory.Pass(), 110116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch distiller_page_factory.Pass(), 111116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch scoped_ptr<DistilledPagePrefs>( 112116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch new DistilledPagePrefs(pref_service)))); 11323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)} 11423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 11523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)void 
AddComponentsResources() { 11623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) base::FilePath pak_file; 11723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) base::FilePath pak_dir; 11823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) PathService::Get(base::DIR_MODULE, &pak_dir); 11923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) pak_file = pak_dir.Append(FILE_PATH_LITERAL("components_resources.pak")); 12023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath( 12123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) pak_file, ui::SCALE_FACTOR_NONE); 12223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)} 12323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 124f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)bool WriteProtobufWithSize( 125f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const google::protobuf::MessageLite& message, 126f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) google::protobuf::io::ZeroCopyOutputStream* output_stream) { 127f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) google::protobuf::io::CodedOutputStream coded_output(output_stream); 128f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 129f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) // Write the size. 
130f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const int size = message.ByteSize(); 131f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) coded_output.WriteLittleEndian32(size); 132f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) message.SerializeWithCachedSizes(&coded_output); 133f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) return !coded_output.HadError(); 134f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)} 135c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch 136f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)std::string GetReadableArticleString( 137f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const DistilledArticleProto& article_proto) { 138f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) std::stringstream output; 139f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) output << "Article Title: " << article_proto.title() << std::endl; 140f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) output << "# of pages: " << article_proto.pages_size() << std::endl; 141f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) for (int i = 0; i < article_proto.pages_size(); ++i) { 142f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const DistilledPageProto& page = article_proto.pages(i); 143f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) output << "Page " << i << std::endl; 144f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) output << "URL: " << page.url() << std::endl; 145f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) output << "Content: " << page.html() << std::endl; 1461320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci if (page.has_debug_info() && page.debug_info().has_log()) 1471320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci output << "Log: " << page.debug_info().log() << std::endl; 14823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 
149f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) return output.str(); 15023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)} 15123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 15223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)} // namespace 15323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 15423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)class ContentExtractionRequest : public ViewRequestDelegate { 15523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) public: 1565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) void Start(DomDistillerService* service, const gfx::Size& render_view_size, 1575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) base::Closure finished_callback) { 15823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) finished_callback_ = finished_callback; 1595c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu viewer_handle_ = 1605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) service->ViewUrl(this, 1615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) service->CreateDefaultDistillerPage(render_view_size), 1625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) url_); 16323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 16423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 16523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) DistilledArticleProto GetArticleCopy() { 16623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) return *article_proto_; 16723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 16823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 169f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) static ScopedVector<ContentExtractionRequest> CreateForCommandLine( 17023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) const CommandLine& command_line) { 171f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne 
(Richard Coles) ScopedVector<ContentExtractionRequest> requests; 17223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) if (command_line.HasSwitch(kUrlSwitch)) { 173f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) GURL url; 17423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); 17523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) url = GURL(url_string); 176f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (url.is_valid()) { 177f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) requests.push_back(new ContentExtractionRequest(url)); 178f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 179f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (command_line.HasSwitch(kUrlsSwitch)) { 180f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); 181f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) std::vector<std::string> urls; 182f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) base::SplitString(urls_string, ' ', &urls); 183f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) for (size_t i = 0; i < urls.size(); ++i) { 184f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) GURL url(urls[i]); 185f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (url.is_valid()) { 186f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) requests.push_back(new ContentExtractionRequest(url)); 187f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { 188f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ADD_FAILURE() << "Bad url"; 189f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 190f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 19123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 
192f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (requests.empty()) { 19323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) ADD_FAILURE() << "No valid url provided"; 19423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 195f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 196f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) return requests.Pass(); 19723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 19823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 19923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) private: 20023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) ContentExtractionRequest(const GURL& url) : url_(url) {} 20123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 20223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) virtual void OnArticleUpdated(ArticleDistillationUpdate article_update) 20323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) OVERRIDE {} 20423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 20523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) virtual void OnArticleReady(const DistilledArticleProto* article_proto) 20623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) OVERRIDE { 20723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) article_proto_ = article_proto; 2081320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci CHECK(article_proto->pages_size()) << "Failed extracting " << url_; 20923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) base::MessageLoop::current()->PostTask( 21023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) FROM_HERE, 21123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) finished_callback_); 21223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 21323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 21423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) const 
DistilledArticleProto* article_proto_; 21523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) scoped_ptr<ViewerHandle> viewer_handle_; 21623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) GURL url_; 21723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) base::Closure finished_callback_; 21823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)}; 21923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 22023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)class ContentExtractor : public ContentBrowserTest { 221f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) public: 222f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ContentExtractor() 223f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) : pending_tasks_(0), 224f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) max_tasks_(kMaxExtractorTasks), 225f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) next_request_(0), 226f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) output_data_(), 227f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) protobuf_output_stream_( 228f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) new google::protobuf::io::StringOutputStream(&output_data_)) {} 229f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 23023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) // Change behavior of the default host resolver to avoid DNS lookup errors, so 23123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) // we can make network calls. 
23223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) virtual void SetUpOnMainThread() OVERRIDE { 233c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch if (!CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) { 234c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch EnableDNSLookupForThisTest(); 235c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch } 23623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) CHECK(db_dir_.CreateUniqueTempDir()); 23723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) AddComponentsResources(); 23823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 23923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 24023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) virtual void TearDownOnMainThread() OVERRIDE { 24123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) DisableDNSLookupForThisTest(); 24223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 24323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 24423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) protected: 24523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) // Creates the DomDistillerService and creates and starts the extraction 24623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) // request. 
24723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) void Start() { 24823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) content::BrowserContext* context = 24923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) shell()->web_contents()->GetBrowserContext(); 25023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) service_ = CreateDomDistillerService(context, 25123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) db_dir_.path()); 25223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) const CommandLine& command_line = *CommandLine::ForCurrentProcess(); 253f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) requests_ = ContentExtractionRequest::CreateForCommandLine(command_line); 254f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) PumpQueue(); 255f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 256f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 257f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) void PumpQueue() { 258f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { 259f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) requests_[next_request_]->Start( 260f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) service_.get(), 2615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) shell()->web_contents()->GetContainerBounds().size(), 262f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); 263f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ++next_request_; 264f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ++pending_tasks_; 265f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 26623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 26723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 
26823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) private: 26923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) // Change behavior of the default host resolver to allow DNS lookup 27023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) // to proceed instead of being blocked by the test infrastructure. 27123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) void EnableDNSLookupForThisTest() { 27223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) // mock_host_resolver_override_ takes ownership of the resolver. 27323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) scoped_refptr<net::RuleBasedHostResolverProc> resolver = 27423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) new net::RuleBasedHostResolverProc(host_resolver()); 27523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) resolver->AllowDirectLookup("*"); 27623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) mock_host_resolver_override_.reset( 27723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) new net::ScopedDefaultHostResolverProc(resolver.get())); 27823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 27923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 28023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) // We need to reset the DNS lookup when we finish, or the test will fail. 
28123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) void DisableDNSLookupForThisTest() { 28223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) mock_host_resolver_override_.reset(); 28323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 28423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 285f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) void FinishRequest() { 286f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --pending_tasks_; 287f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (next_request_ == requests_.size() && pending_tasks_ == 0) { 288f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) Finish(); 289f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { 290f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) PumpQueue(); 291f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 292f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 293f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 294f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) void DoArticleOutput() { 295f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) for (size_t i = 0; i < requests_.size(); ++i) { 296f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const DistilledArticleProto& article = requests_[i]->GetArticleCopy(); 297f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) { 298f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) WriteProtobufWithSize(article, protobuf_output_stream_.get()); 299f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { 300f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) output_data_ += GetReadableArticleString(article) + "\n"; 301f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 302f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 
303f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 304f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) { 305f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) base::FilePath filename = 306f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile); 307f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ASSERT_EQ( 308f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) (int)output_data_.size(), 309f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) base::WriteFile(filename, output_data_.c_str(), output_data_.size())); 310f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { 311f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) VLOG(0) << output_data_; 312f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 313f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 314f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 31523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) void Finish() { 316f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) DoArticleOutput(); 317f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) requests_.clear(); 31823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) service_.reset(); 31923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) base::MessageLoop::current()->PostTask( 32023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) FROM_HERE, base::MessageLoop::QuitWhenIdleClosure()); 32123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) } 32223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 323f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) size_t pending_tasks_; 324f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) size_t max_tasks_; 325f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne 
(Richard Coles) size_t next_request_; 326f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 32723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) base::ScopedTempDir db_dir_; 32823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_; 32923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) scoped_ptr<DomDistillerService> service_; 330f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ScopedVector<ContentExtractionRequest> requests_; 331f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 332f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) std::string output_data_; 333f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; 33423730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)}; 33523730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 33623730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { 33723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) Start(); 33823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) base::RunLoop().Run(); 33923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)} 34023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) 34123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)} // namespace dom_distiller 342