1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/dom_distiller/core/distiller.h"
6
7#include <map>
8#include <vector>
9
10#include "base/auto_reset.h"
11#include "base/bind.h"
12#include "base/callback.h"
13#include "base/location.h"
14#include "base/message_loop/message_loop.h"
15#include "base/strings/string_number_conversions.h"
16#include "base/strings/utf_string_conversions.h"
17#include "base/values.h"
18#include "components/dom_distiller/core/distiller_page.h"
19#include "components/dom_distiller/core/distiller_url_fetcher.h"
20#include "components/dom_distiller/core/proto/distilled_article.pb.h"
21#include "components/dom_distiller/core/proto/distilled_page.pb.h"
22#include "net/url_request/url_request_context_getter.h"
23
namespace {

// Default cap on the number of distilled pages that make up one article.
// DistillerImpl copies this into its per-instance page limit at construction.
const size_t kMaxPagesInArticle = 32;

}  // namespace
28
29namespace dom_distiller {
30
31DistillerFactoryImpl::DistillerFactoryImpl(
32    scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
33    const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
34    : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
35      dom_distiller_options_(dom_distiller_options) {
36}
37
38DistillerFactoryImpl::~DistillerFactoryImpl() {}
39
40scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
41  scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
42      *distiller_url_fetcher_factory_, dom_distiller_options_));
43  return distiller.PassAs<Distiller>();
44}
45
46DistillerImpl::DistilledPageData::DistilledPageData() {}
47
48DistillerImpl::DistilledPageData::~DistilledPageData() {}
49
50DistillerImpl::DistillerImpl(
51    const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
52    const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
53    : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
54      dom_distiller_options_(dom_distiller_options),
55      max_pages_in_article_(kMaxPagesInArticle),
56      destruction_allowed_(true),
57      weak_factory_(this) {
58}
59
60DistillerImpl::~DistillerImpl() {
61  DCHECK(destruction_allowed_);
62}
63
64void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
65  max_pages_in_article_ = max_num_pages;
66}
67
68bool DistillerImpl::AreAllPagesFinished() const {
69  return started_pages_index_.empty() && waiting_pages_.empty();
70}
71
72size_t DistillerImpl::TotalPageCount() const {
73  return waiting_pages_.size() + started_pages_index_.size() +
74         finished_pages_index_.size();
75}
76
77void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
78  if (!IsPageNumberInUse(page_num) && url.is_valid() &&
79      TotalPageCount() < max_pages_in_article_ &&
80      seen_urls_.find(url.spec()) == seen_urls_.end()) {
81    waiting_pages_[page_num] = url;
82  }
83}
84
85bool DistillerImpl::IsPageNumberInUse(int page_num) const {
86  return waiting_pages_.find(page_num) != waiting_pages_.end() ||
87         started_pages_index_.find(page_num) != started_pages_index_.end() ||
88         finished_pages_index_.find(page_num) != finished_pages_index_.end();
89}
90
91DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
92    const {
93  DCHECK_LT(index, pages_.size());
94  DistilledPageData* page_data = pages_[index];
95  DCHECK(page_data);
96  return page_data;
97}
98
99void DistillerImpl::DistillPage(const GURL& url,
100                                scoped_ptr<DistillerPage> distiller_page,
101                                const DistillationFinishedCallback& finished_cb,
102                                const DistillationUpdateCallback& update_cb) {
103  DCHECK(AreAllPagesFinished());
104  distiller_page_ = distiller_page.Pass();
105  finished_cb_ = finished_cb;
106  update_cb_ = update_cb;
107
108  AddToDistillationQueue(0, url);
109  DistillNextPage();
110}
111
112void DistillerImpl::DistillNextPage() {
113  if (!waiting_pages_.empty()) {
114    std::map<int, GURL>::iterator front = waiting_pages_.begin();
115    int page_num = front->first;
116    const GURL url = front->second;
117
118    waiting_pages_.erase(front);
119    DCHECK(url.is_valid());
120    DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
121    DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
122    seen_urls_.insert(url.spec());
123    pages_.push_back(new DistilledPageData());
124    started_pages_index_[page_num] = pages_.size() - 1;
125    distiller_page_->DistillPage(
126        url,
127        dom_distiller_options_,
128        base::Bind(&DistillerImpl::OnPageDistillationFinished,
129                   weak_factory_.GetWeakPtr(),
130                   page_num,
131                   url));
132  }
133}
134
135void DistillerImpl::OnPageDistillationFinished(
136    int page_num,
137    const GURL& page_url,
138    scoped_ptr<proto::DomDistillerResult> distiller_result,
139    bool distillation_successful) {
140  DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
141  if (distillation_successful) {
142    DCHECK(distiller_result.get());
143    DistilledPageData* page_data =
144        GetPageAtIndex(started_pages_index_[page_num]);
145    page_data->distilled_page_proto =
146        new base::RefCountedData<DistilledPageProto>();
147    page_data->page_num = page_num;
148    if (distiller_result->has_title()) {
149      page_data->distilled_page_proto->data.set_title(
150          distiller_result->title());
151    }
152    page_data->distilled_page_proto->data.set_url(page_url.spec());
153    if (distiller_result->has_distilled_content() &&
154        distiller_result->distilled_content().has_html()) {
155      page_data->distilled_page_proto->data.set_html(
156          distiller_result->distilled_content().html());
157    }
158    if (distiller_result->has_debug_info() &&
159        distiller_result->debug_info().has_log()) {
160      page_data->distilled_page_proto->data.mutable_debug_info()->set_log(
161          distiller_result->debug_info().log());
162    }
163
164    if (distiller_result->has_pagination_info()) {
165      proto::PaginationInfo pagination_info =
166          distiller_result->pagination_info();
167      if (pagination_info.has_next_page()) {
168        GURL next_page_url(pagination_info.next_page());
169        if (next_page_url.is_valid()) {
170          // The pages should be in same origin.
171          DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
172          AddToDistillationQueue(page_num + 1, next_page_url);
173        }
174      }
175
176      if (pagination_info.has_prev_page()) {
177        GURL prev_page_url(pagination_info.prev_page());
178        if (prev_page_url.is_valid()) {
179          DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
180          AddToDistillationQueue(page_num - 1, prev_page_url);
181        }
182      }
183    }
184
185    for (int img_num = 0; img_num < distiller_result->image_urls_size();
186         ++img_num) {
187      std::string image_id =
188          base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
189      FetchImage(page_num, image_id, distiller_result->image_urls(img_num));
190    }
191
192    AddPageIfDone(page_num);
193    DistillNextPage();
194  } else {
195    started_pages_index_.erase(page_num);
196    RunDistillerCallbackIfDone();
197  }
198}
199
200void DistillerImpl::FetchImage(int page_num,
201                               const std::string& image_id,
202                               const std::string& item) {
203  DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
204  DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
205  DistillerURLFetcher* fetcher =
206      distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
207  page_data->image_fetchers_.push_back(fetcher);
208
209  fetcher->FetchURL(item,
210                    base::Bind(&DistillerImpl::OnFetchImageDone,
211                               weak_factory_.GetWeakPtr(),
212                               page_num,
213                               base::Unretained(fetcher),
214                               image_id));
215}
216
217void DistillerImpl::OnFetchImageDone(int page_num,
218                                     DistillerURLFetcher* url_fetcher,
219                                     const std::string& id,
220                                     const std::string& response) {
221  DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
222  DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
223  DCHECK(page_data->distilled_page_proto.get());
224  DCHECK(url_fetcher);
225  ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
226      std::find(page_data->image_fetchers_.begin(),
227                page_data->image_fetchers_.end(),
228                url_fetcher);
229
230  DCHECK(fetcher_it != page_data->image_fetchers_.end());
231  // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
232  // callback is invoked by the |url_fetcher|.
233  page_data->image_fetchers_.weak_erase(fetcher_it);
234  base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
235
236  DistilledPageProto_Image* image =
237      page_data->distilled_page_proto->data.add_image();
238  image->set_name(id);
239  image->set_data(response);
240
241  AddPageIfDone(page_num);
242}
243
244void DistillerImpl::AddPageIfDone(int page_num) {
245  DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
246  DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
247  DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
248  if (page_data->image_fetchers_.empty()) {
249    finished_pages_index_[page_num] = started_pages_index_[page_num];
250    started_pages_index_.erase(page_num);
251    const ArticleDistillationUpdate& article_update =
252        CreateDistillationUpdate();
253    DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
254    update_cb_.Run(article_update);
255    RunDistillerCallbackIfDone();
256  }
257}
258
259const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
260    const {
261  bool has_prev_page = false;
262  bool has_next_page = false;
263  if (!finished_pages_index_.empty()) {
264    int prev_page_num = finished_pages_index_.begin()->first - 1;
265    int next_page_num = finished_pages_index_.rbegin()->first + 1;
266    has_prev_page = IsPageNumberInUse(prev_page_num);
267    has_next_page = IsPageNumberInUse(next_page_num);
268  }
269
270  std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> >
271      update_pages;
272  for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin();
273       it != finished_pages_index_.end();
274       ++it) {
275    update_pages.push_back(pages_[it->second]->distilled_page_proto);
276  }
277  return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
278}
279
280void DistillerImpl::RunDistillerCallbackIfDone() {
281  DCHECK(!finished_cb_.is_null());
282  if (AreAllPagesFinished()) {
283    bool first_page = true;
284    scoped_ptr<DistilledArticleProto> article_proto(
285        new DistilledArticleProto());
286    // Stitch the pages back into the article.
287    for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
288         it != finished_pages_index_.end();) {
289      DistilledPageData* page_data = GetPageAtIndex(it->second);
290      *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
291
292      if (first_page) {
293        article_proto->set_title(page_data->distilled_page_proto->data.title());
294        first_page = false;
295      }
296
297      finished_pages_index_.erase(it++);
298    }
299
300    pages_.clear();
301    DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
302              max_pages_in_article_);
303
304    DCHECK(pages_.empty());
305    DCHECK(finished_pages_index_.empty());
306
307    base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
308                                                       false);
309    finished_cb_.Run(article_proto.Pass());
310    finished_cb_.Reset();
311  }
312}
313
314}  // namespace dom_distiller
315