// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
5f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "components/dom_distiller/core/distiller_page.h"
6f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
7a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/bind.h"
8cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)#include "base/json/json_writer.h"
9f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "base/logging.h"
10a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/message_loop/message_loop.h"
111320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci#include "base/metrics/histogram.h"
12cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)#include "base/strings/string_util.h"
13cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)#include "base/strings/utf_string_conversions.h"
141320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci#include "base/time/time.h"
151320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci#include "grit/components_resources.h"
16cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)#include "third_party/dom_distiller_js/dom_distiller.pb.h"
17cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)#include "third_party/dom_distiller_js/dom_distiller_json_converter.h"
18a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "ui/base/resource/resource_bundle.h"
19f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "url/gurl.h"
20f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
21f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)namespace dom_distiller {
22f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
23cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)namespace {
24cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
25cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)const char* kOptionsPlaceholder = "$$OPTIONS";
26cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
27cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)std::string GetDistillerScriptWithOptions(
28cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    const dom_distiller::proto::DomDistillerOptions& options) {
29cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  std::string script = ResourceBundle::GetSharedInstance()
30cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)                           .GetRawDataResource(IDR_DISTILLER_JS)
31cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)                           .as_string();
32cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  if (script.empty()) {
33cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    return "";
34cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  }
35cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
36cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  scoped_ptr<base::Value> options_value(
37cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)      dom_distiller::proto::json::DomDistillerOptions::WriteToValue(options));
38cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  std::string options_json;
39cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  if (!base::JSONWriter::Write(options_value.get(), &options_json)) {
40cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    NOTREACHED();
41cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  }
42cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  size_t options_offset = script.find(kOptionsPlaceholder);
43cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  DCHECK_NE(std::string::npos, options_offset);
44cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  DCHECK_EQ(std::string::npos,
45cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)            script.find(kOptionsPlaceholder, options_offset + 1));
46cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  script =
47cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)      script.replace(options_offset, strlen(kOptionsPlaceholder), options_json);
48cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  return script;
49cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)}
50cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
51cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)}
52cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
53f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)DistillerPageFactory::~DistillerPageFactory() {}
54f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
550529e5d033099cbfc42635f6f6183833b09dff6eBen MurdochDistillerPage::DistillerPage() : ready_(true) {}
56f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
57a02191e04bc25c4935f804f2c080ae28663d096dBen MurdochDistillerPage::~DistillerPage() {}
58f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
59cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)void DistillerPage::DistillPage(
60cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    const GURL& gurl,
61cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    const dom_distiller::proto::DomDistillerOptions options,
62cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    const DistillerPageCallback& callback) {
630529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  DCHECK(ready_);
640529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // It is only possible to distill one page at a time. |ready_| is reset when
650529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // the callback to OnDistillationDone happens.
660529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  ready_ = false;
67a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch  distiller_page_callback_ = callback;
68cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  DistillPageImpl(gurl, GetDistillerScriptWithOptions(options));
69f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}
70f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
710529e5d033099cbfc42635f6f6183833b09dff6eBen Murdochvoid DistillerPage::OnDistillationDone(const GURL& page_url,
720529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch                                       const base::Value* value) {
730529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  DCHECK(!ready_);
740529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  ready_ = true;
75cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
761320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  scoped_ptr<dom_distiller::proto::DomDistillerResult> distiller_result(
771320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      new dom_distiller::proto::DomDistillerResult());
781320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  bool found_content;
791320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  if (value->IsType(base::Value::TYPE_NULL)) {
801320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    found_content = false;
811320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  } else {
821320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    found_content =
831320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        dom_distiller::proto::json::DomDistillerResult::ReadFromValue(
841320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci            value, distiller_result.get());
851320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    if (!found_content) {
861320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      DVLOG(1) << "Unable to parse DomDistillerResult.";
871320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    } else {
881320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      if (distiller_result->has_timing_info()) {
891320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        const dom_distiller::proto::TimingInfo& timing =
901320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci            distiller_result->timing_info();
911320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        if (timing.has_markup_parsing_time()) {
921320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          UMA_HISTOGRAM_TIMES(
931320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci              "DomDistiller.Time.MarkupParsing",
941320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci              base::TimeDelta::FromMillisecondsD(timing.markup_parsing_time()));
951320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        }
961320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        if (timing.has_document_construction_time()) {
971320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          UMA_HISTOGRAM_TIMES(
981320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci              "DomDistiller.Time.DocumentConstruction",
991320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci              base::TimeDelta::FromMillisecondsD(
1001320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci                  timing.document_construction_time()));
1011320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        }
1021320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        if (timing.has_article_processing_time()) {
1031320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          UMA_HISTOGRAM_TIMES(
1041320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci              "DomDistiller.Time.ArticleProcessing",
1051320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci              base::TimeDelta::FromMillisecondsD(
1061320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci                  timing.article_processing_time()));
1071320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        }
1081320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        if (timing.has_formatting_time()) {
1091320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          UMA_HISTOGRAM_TIMES(
1101320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci              "DomDistiller.Time.Formatting",
1111320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci              base::TimeDelta::FromMillisecondsD(timing.formatting_time()));
1121320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        }
1131320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        if (timing.has_total_time()) {
1141320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          UMA_HISTOGRAM_TIMES(
1151320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci              "DomDistiller.Time.DistillationTotal",
1161320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci              base::TimeDelta::FromMillisecondsD(timing.total_time()));
1171320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        }
1181320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      }
1191320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      if (distiller_result->has_statistics_info()) {
1201320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        const dom_distiller::proto::StatisticsInfo& statistics =
1211320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci            distiller_result->statistics_info();
1221320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        if (statistics.has_word_count()) {
1231320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          UMA_HISTOGRAM_CUSTOM_COUNTS(
1241320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci            "DomDistiller.Statistics.WordCount",
1251320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci            statistics.word_count(),
1261320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci            1, 4000, 50);
1271320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        }
128a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch      }
1295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    }
130a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch  }
131cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
132cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  base::MessageLoop::current()->PostTask(
133cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)      FROM_HERE,
1341320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      base::Bind(distiller_page_callback_,
1351320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci                 base::Passed(&distiller_result),
1361320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci                 found_content));
137f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}
138f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
139f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}  // namespace dom_distiller
140