1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/dom_distiller/core/distiller.h"
6
7#include <map>
8#include <vector>
9
10#include "base/auto_reset.h"
11#include "base/bind.h"
12#include "base/callback.h"
13#include "base/location.h"
14#include "base/message_loop/message_loop.h"
15#include "base/strings/string_number_conversions.h"
16#include "base/strings/utf_string_conversions.h"
17#include "base/values.h"
18#include "components/dom_distiller/core/distiller_page.h"
19#include "components/dom_distiller/core/distiller_url_fetcher.h"
20#include "components/dom_distiller/core/proto/distilled_article.pb.h"
21#include "components/dom_distiller/core/proto/distilled_page.pb.h"
22#include "net/url_request/url_request_context_getter.h"
23
namespace {
// Default upper bound on the number of distilled pages in one article. It
// seeds DistillerImpl::max_pages_in_article_, which callers may override via
// DistillerImpl::SetMaxNumPagesInArticle().
const size_t kMaxPagesInArticle = 32;
}
28
29namespace dom_distiller {
30
31DistillerFactoryImpl::DistillerFactoryImpl(
32    scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
33    const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
34    : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
35      dom_distiller_options_(dom_distiller_options) {
36}
37
38DistillerFactoryImpl::~DistillerFactoryImpl() {}
39
40scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
41  scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
42      *distiller_url_fetcher_factory_, dom_distiller_options_));
43  return distiller.PassAs<Distiller>();
44}
45
46DistillerImpl::DistilledPageData::DistilledPageData() {}
47
48DistillerImpl::DistilledPageData::~DistilledPageData() {}
49
50DistillerImpl::DistillerImpl(
51    const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
52    const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
53    : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
54      dom_distiller_options_(dom_distiller_options),
55      max_pages_in_article_(kMaxPagesInArticle),
56      destruction_allowed_(true),
57      weak_factory_(this) {
58}
59
60DistillerImpl::~DistillerImpl() {
61  DCHECK(destruction_allowed_);
62}
63
64void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
65  max_pages_in_article_ = max_num_pages;
66}
67
68bool DistillerImpl::AreAllPagesFinished() const {
69  return started_pages_index_.empty() && waiting_pages_.empty();
70}
71
72size_t DistillerImpl::TotalPageCount() const {
73  return waiting_pages_.size() + started_pages_index_.size() +
74         finished_pages_index_.size();
75}
76
77void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
78  if (!IsPageNumberInUse(page_num) && url.is_valid() &&
79      TotalPageCount() < max_pages_in_article_ &&
80      seen_urls_.find(url.spec()) == seen_urls_.end()) {
81    waiting_pages_[page_num] = url;
82  }
83}
84
85bool DistillerImpl::IsPageNumberInUse(int page_num) const {
86  return waiting_pages_.find(page_num) != waiting_pages_.end() ||
87         started_pages_index_.find(page_num) != started_pages_index_.end() ||
88         finished_pages_index_.find(page_num) != finished_pages_index_.end();
89}
90
91DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
92    const {
93  DCHECK_LT(index, pages_.size());
94  DistilledPageData* page_data = pages_[index];
95  DCHECK(page_data);
96  return page_data;
97}
98
99void DistillerImpl::DistillPage(const GURL& url,
100                                scoped_ptr<DistillerPage> distiller_page,
101                                const DistillationFinishedCallback& finished_cb,
102                                const DistillationUpdateCallback& update_cb) {
103  DCHECK(AreAllPagesFinished());
104  distiller_page_ = distiller_page.Pass();
105  finished_cb_ = finished_cb;
106  update_cb_ = update_cb;
107
108  AddToDistillationQueue(0, url);
109  DistillNextPage();
110}
111
112void DistillerImpl::DistillNextPage() {
113  if (!waiting_pages_.empty()) {
114    std::map<int, GURL>::iterator front = waiting_pages_.begin();
115    int page_num = front->first;
116    const GURL url = front->second;
117
118    waiting_pages_.erase(front);
119    DCHECK(url.is_valid());
120    DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
121    DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
122    seen_urls_.insert(url.spec());
123    pages_.push_back(new DistilledPageData());
124    started_pages_index_[page_num] = pages_.size() - 1;
125    distiller_page_->DistillPage(
126        url,
127        dom_distiller_options_,
128        base::Bind(&DistillerImpl::OnPageDistillationFinished,
129                   weak_factory_.GetWeakPtr(),
130                   page_num,
131                   url));
132  }
133}
134
135void DistillerImpl::OnPageDistillationFinished(
136    int page_num,
137    const GURL& page_url,
138    scoped_ptr<DistilledPageInfo> distilled_page,
139    bool distillation_successful) {
140  DCHECK(distilled_page.get());
141  DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
142  if (distillation_successful) {
143    DistilledPageData* page_data =
144        GetPageAtIndex(started_pages_index_[page_num]);
145    page_data->distilled_page_proto =
146        new base::RefCountedData<DistilledPageProto>();
147    page_data->page_num = page_num;
148    page_data->distilled_page_proto->data.set_title(distilled_page->title);
149    page_data->distilled_page_proto->data.set_url(page_url.spec());
150    page_data->distilled_page_proto->data.set_html(distilled_page->html);
151
152    GURL next_page_url(distilled_page->next_page_url);
153    if (next_page_url.is_valid()) {
154      // The pages should be in same origin.
155      DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
156      AddToDistillationQueue(page_num + 1, next_page_url);
157    }
158
159    GURL prev_page_url(distilled_page->prev_page_url);
160    if (prev_page_url.is_valid()) {
161      DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
162      AddToDistillationQueue(page_num - 1, prev_page_url);
163    }
164
165    for (size_t img_num = 0; img_num < distilled_page->image_urls.size();
166         ++img_num) {
167      std::string image_id =
168          base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
169      FetchImage(page_num, image_id, distilled_page->image_urls[img_num]);
170    }
171
172    AddPageIfDone(page_num);
173    DistillNextPage();
174  } else {
175    started_pages_index_.erase(page_num);
176    RunDistillerCallbackIfDone();
177  }
178}
179
180void DistillerImpl::FetchImage(int page_num,
181                               const std::string& image_id,
182                               const std::string& item) {
183  DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
184  DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
185  DistillerURLFetcher* fetcher =
186      distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
187  page_data->image_fetchers_.push_back(fetcher);
188
189  fetcher->FetchURL(item,
190                    base::Bind(&DistillerImpl::OnFetchImageDone,
191                               weak_factory_.GetWeakPtr(),
192                               page_num,
193                               base::Unretained(fetcher),
194                               image_id));
195}
196
197void DistillerImpl::OnFetchImageDone(int page_num,
198                                     DistillerURLFetcher* url_fetcher,
199                                     const std::string& id,
200                                     const std::string& response) {
201  DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
202  DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
203  DCHECK(page_data->distilled_page_proto);
204  DCHECK(url_fetcher);
205  ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
206      std::find(page_data->image_fetchers_.begin(),
207                page_data->image_fetchers_.end(),
208                url_fetcher);
209
210  DCHECK(fetcher_it != page_data->image_fetchers_.end());
211  // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
212  // callback is invoked by the |url_fetcher|.
213  page_data->image_fetchers_.weak_erase(fetcher_it);
214  base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
215
216  DistilledPageProto_Image* image =
217      page_data->distilled_page_proto->data.add_image();
218  image->set_name(id);
219  image->set_data(response);
220
221  AddPageIfDone(page_num);
222}
223
224void DistillerImpl::AddPageIfDone(int page_num) {
225  DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
226  DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
227  DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
228  if (page_data->image_fetchers_.empty()) {
229    finished_pages_index_[page_num] = started_pages_index_[page_num];
230    started_pages_index_.erase(page_num);
231    const ArticleDistillationUpdate& article_update =
232        CreateDistillationUpdate();
233    DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
234    update_cb_.Run(article_update);
235    RunDistillerCallbackIfDone();
236  }
237}
238
239const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
240    const {
241  bool has_prev_page = false;
242  bool has_next_page = false;
243  if (!finished_pages_index_.empty()) {
244    int prev_page_num = finished_pages_index_.begin()->first - 1;
245    int next_page_num = finished_pages_index_.rbegin()->first + 1;
246    has_prev_page = IsPageNumberInUse(prev_page_num);
247    has_next_page = IsPageNumberInUse(next_page_num);
248  }
249
250  std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> >
251      update_pages;
252  for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin();
253       it != finished_pages_index_.end();
254       ++it) {
255    update_pages.push_back(pages_[it->second]->distilled_page_proto);
256  }
257  return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
258}
259
260void DistillerImpl::RunDistillerCallbackIfDone() {
261  DCHECK(!finished_cb_.is_null());
262  if (AreAllPagesFinished()) {
263    bool first_page = true;
264    scoped_ptr<DistilledArticleProto> article_proto(
265        new DistilledArticleProto());
266    // Stitch the pages back into the article.
267    for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
268         it != finished_pages_index_.end();) {
269      DistilledPageData* page_data = GetPageAtIndex(it->second);
270      *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
271
272      if (first_page) {
273        article_proto->set_title(page_data->distilled_page_proto->data.title());
274        first_page = false;
275      }
276
277      finished_pages_index_.erase(it++);
278    }
279
280    pages_.clear();
281    DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
282              max_pages_in_article_);
283
284    DCHECK(pages_.empty());
285    DCHECK(finished_pages_index_.empty());
286
287    base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
288                                                       false);
289    finished_cb_.Run(article_proto.Pass());
290    finished_cb_.Reset();
291  }
292}
293
294}  // namespace dom_distiller
295