1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/utility/importer/bookmark_html_reader.h"
6
7#include "base/callback.h"
8#include "base/files/file_util.h"
9#include "base/i18n/icu_string_conversions.h"
10#include "base/strings/string_number_conversions.h"
11#include "base/strings/string_split.h"
12#include "base/strings/string_util.h"
13#include "base/time/time.h"
14#include "chrome/common/importer/imported_bookmark_entry.h"
15#include "chrome/common/importer/imported_favicon_usage.h"
16#include "chrome/utility/importer/favicon_reencode.h"
17#include "net/base/data_url.h"
18#include "net/base/escape.h"
19#include "url/gurl.h"
20#include "url/url_constants.h"
21
22namespace {
23
// Fetches the given |attribute| value from the |attribute_list|, which is
// expected to contain entries of the form NAME="value". Returns true if the
// attribute is present with a properly terminated quoted value, in which case
// |value| receives the text between the quotes (possibly empty, and with any
// backslash escapes left untouched). Returns false, leaving |value| unchanged,
// if the attribute is missing or its value has no closing quote.
bool GetAttribute(const std::string& attribute_list,
                  const std::string& attribute,
                  std::string* value) {
  const char kQuote[] = "\"";

  size_t begin = attribute_list.find(attribute + "=" + kQuote);
  if (begin == std::string::npos)
    return false;  // Can't find the attribute.

  // Step over the attribute name, the '=' and the opening quote so that
  // |begin| is the index of the first character of the value.
  begin += attribute.size() + 2;

  // Look for the closing quote starting at |begin| itself: for an empty value
  // (NAME="") the closing quote sits exactly there, and starting one
  // character later would run into the text of the following attribute.
  size_t end = begin;
  while (end < attribute_list.size()) {
    // A quote preceded by a backslash is part of the value. Only look back at
    // characters that belong to the value, never at the opening quote.
    const bool escaped = end > begin && attribute_list[end - 1] == '\\';
    if (attribute_list[end] == '"' && !escaped)
      break;
    ++end;
  }

  // Reaching the end of the list means no closing quote was found. This also
  // covers the case where the opening quote was the very last character.
  if (end >= attribute_list.size())
    return false;  // The value is not quoted.

  *value = attribute_list.substr(begin, end - begin);
  return true;
}
52
53// Given the URL of a page and a favicon data URL, adds an appropriate record
54// to the given favicon usage vector.
55void DataURLToFaviconUsage(
56    const GURL& link_url,
57    const GURL& favicon_data,
58    std::vector<ImportedFaviconUsage>* favicons) {
59  if (!link_url.is_valid() || !favicon_data.is_valid() ||
60      !favicon_data.SchemeIs(url::kDataScheme))
61    return;
62
63  // Parse the data URL.
64  std::string mime_type, char_set, data;
65  if (!net::DataURL::Parse(favicon_data, &mime_type, &char_set, &data) ||
66      data.empty())
67    return;
68
69  ImportedFaviconUsage usage;
70  if (!importer::ReencodeFavicon(
71          reinterpret_cast<const unsigned char*>(&data[0]),
72          data.size(), &usage.png_data))
73    return;  // Unable to decode.
74
75  // We need to make up a URL for the favicon. We use a version of the page's
76  // URL so that we can be sure it will not collide.
77  usage.favicon_url = GURL(std::string("made-up-favicon:") + link_url.spec());
78
79  // We only have one URL per favicon for Firefox 2 bookmarks.
80  usage.urls.insert(link_url);
81
82  favicons->push_back(usage);
83}
84
85}  // namespace
86
87namespace bookmark_html_reader {
88
89void ImportBookmarksFile(
90      const base::Callback<bool(void)>& cancellation_callback,
91      const base::Callback<bool(const GURL&)>& valid_url_callback,
92      const base::FilePath& file_path,
93      std::vector<ImportedBookmarkEntry>* bookmarks,
94      std::vector<ImportedFaviconUsage>* favicons) {
95  std::string content;
96  base::ReadFileToString(file_path, &content);
97  std::vector<std::string> lines;
98  base::SplitString(content, '\n', &lines);
99
100  base::string16 last_folder;
101  bool last_folder_on_toolbar = false;
102  bool last_folder_is_empty = true;
103  bool has_subfolder = false;
104  base::Time last_folder_add_date;
105  std::vector<base::string16> path;
106  size_t toolbar_folder_index = 0;
107  std::string charset;
108  for (size_t i = 0;
109       i < lines.size() &&
110           (cancellation_callback.is_null() || !cancellation_callback.Run());
111       ++i) {
112    std::string line;
113    base::TrimString(lines[i], " ", &line);
114
115    // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries
116    // separator in Firefox that Chrome does not support. Note that there can be
117    // multiple "<HR>" tags at the beginning of a single line.
118    // See http://crbug.com/257474.
119    static const char kHrTag[] = "<HR>";
120    while (StartsWithASCII(line, kHrTag, false)) {
121      line.erase(0, arraysize(kHrTag) - 1);
122      base::TrimString(line, " ", &line);
123    }
124
125    // Get the encoding of the bookmark file.
126    if (internal::ParseCharsetFromLine(line, &charset))
127      continue;
128
129    // Get the folder name.
130    if (internal::ParseFolderNameFromLine(line,
131                                          charset,
132                                          &last_folder,
133                                          &last_folder_on_toolbar,
134                                          &last_folder_add_date)) {
135      continue;
136    }
137
138    // Get the bookmark entry.
139    base::string16 title;
140    base::string16 shortcut;
141    GURL url, favicon;
142    base::Time add_date;
143    base::string16 post_data;
144    bool is_bookmark;
145    // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based
146    //                keywords yet.
147    is_bookmark =
148        internal::ParseBookmarkFromLine(line, charset, &title,
149                                        &url, &favicon, &shortcut,
150                                        &add_date, &post_data) ||
151        internal::ParseMinimumBookmarkFromLine(line, charset, &title, &url);
152
153    if (is_bookmark)
154      last_folder_is_empty = false;
155
156    if (is_bookmark &&
157        post_data.empty() &&
158        (valid_url_callback.is_null() || valid_url_callback.Run(url))) {
159      if (toolbar_folder_index > path.size() && !path.empty()) {
160        NOTREACHED();  // error in parsing.
161        break;
162      }
163
164      ImportedBookmarkEntry entry;
165      entry.creation_time = add_date;
166      entry.url = url;
167      entry.title = title;
168
169      if (toolbar_folder_index) {
170        // The toolbar folder should be at the top level.
171        entry.in_toolbar = true;
172        entry.path.assign(path.begin() + toolbar_folder_index - 1, path.end());
173      } else {
174        // Add this bookmark to the list of |bookmarks|.
175        if (!has_subfolder && !last_folder.empty()) {
176          path.push_back(last_folder);
177          last_folder.clear();
178        }
179        entry.path.assign(path.begin(), path.end());
180      }
181      bookmarks->push_back(entry);
182
183      // Save the favicon. DataURLToFaviconUsage will handle the case where
184      // there is no favicon.
185      if (favicons)
186        DataURLToFaviconUsage(url, favicon, favicons);
187
188      continue;
189    }
190
191    // Bookmarks in sub-folder are encapsulated with <DL> tag.
192    if (StartsWithASCII(line, "<DL>", false)) {
193      has_subfolder = true;
194      if (!last_folder.empty()) {
195        path.push_back(last_folder);
196        last_folder.clear();
197      }
198      if (last_folder_on_toolbar && !toolbar_folder_index)
199        toolbar_folder_index = path.size();
200
201      // Mark next folder empty as initial state.
202      last_folder_is_empty = true;
203    } else if (StartsWithASCII(line, "</DL>", false)) {
204      if (path.empty())
205        break;  // Mismatch <DL>.
206
207      base::string16 folder_title = path.back();
208      path.pop_back();
209
210      if (last_folder_is_empty) {
211        // Empty folder should be added explicitly.
212        ImportedBookmarkEntry entry;
213        entry.is_folder = true;
214        entry.creation_time = last_folder_add_date;
215        entry.title = folder_title;
216        if (toolbar_folder_index) {
217          // The toolbar folder should be at the top level.
218          // Make sure we don't add the toolbar folder itself if it is empty.
219          if (toolbar_folder_index <= path.size()) {
220            entry.in_toolbar = true;
221            entry.path.assign(path.begin() + toolbar_folder_index - 1,
222                              path.end());
223            bookmarks->push_back(entry);
224          }
225        } else {
226          // Add this folder to the list of |bookmarks|.
227          entry.path.assign(path.begin(), path.end());
228          bookmarks->push_back(entry);
229        }
230
231        // Parent folder include current one, so it's not empty.
232        last_folder_is_empty = false;
233      }
234
235      if (toolbar_folder_index > path.size())
236        toolbar_folder_index = 0;
237    }
238  }
239}
240
241namespace internal {
242
243bool ParseCharsetFromLine(const std::string& line, std::string* charset) {
244  const char kCharset[] = "charset=";
245  if (StartsWithASCII(line, "<META", false) &&
246      (line.find("CONTENT=\"") != std::string::npos ||
247          line.find("content=\"") != std::string::npos)) {
248    size_t begin = line.find(kCharset);
249    if (begin == std::string::npos)
250      return false;
251    begin += std::string(kCharset).size();
252    size_t end = line.find_first_of('\"', begin);
253    *charset = line.substr(begin, end - begin);
254    return true;
255  }
256  return false;
257}
258
259bool ParseFolderNameFromLine(const std::string& line,
260                             const std::string& charset,
261                             base::string16* folder_name,
262                             bool* is_toolbar_folder,
263                             base::Time* add_date) {
264  const char kFolderOpen[] = "<DT><H3";
265  const char kFolderClose[] = "</H3>";
266  const char kToolbarFolderAttribute[] = "PERSONAL_TOOLBAR_FOLDER";
267  const char kAddDateAttribute[] = "ADD_DATE";
268
269  if (!StartsWithASCII(line, kFolderOpen, true))
270    return false;
271
272  size_t end = line.find(kFolderClose);
273  size_t tag_end = line.rfind('>', end) + 1;
274  // If no end tag or start tag is broken, we skip to find the folder name.
275  if (end == std::string::npos || tag_end < arraysize(kFolderOpen))
276    return false;
277
278  base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
279                        base::OnStringConversionError::SKIP, folder_name);
280  *folder_name = net::UnescapeForHTML(*folder_name);
281
282  std::string attribute_list = line.substr(arraysize(kFolderOpen),
283      tag_end - arraysize(kFolderOpen) - 1);
284  std::string value;
285
286  // Add date
287  if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
288    int64 time;
289    base::StringToInt64(value, &time);
290    // Upper bound it at 32 bits.
291    if (0 < time && time < (1LL << 32))
292      *add_date = base::Time::FromTimeT(time);
293  }
294
295  if (GetAttribute(attribute_list, kToolbarFolderAttribute, &value) &&
296      LowerCaseEqualsASCII(value, "true"))
297    *is_toolbar_folder = true;
298  else
299    *is_toolbar_folder = false;
300
301  return true;
302}
303
304bool ParseBookmarkFromLine(const std::string& line,
305                           const std::string& charset,
306                           base::string16* title,
307                           GURL* url,
308                           GURL* favicon,
309                           base::string16* shortcut,
310                           base::Time* add_date,
311                           base::string16* post_data) {
312  const char kItemOpen[] = "<DT><A";
313  const char kItemClose[] = "</A>";
314  const char kFeedURLAttribute[] = "FEEDURL";
315  const char kHrefAttribute[] = "HREF";
316  const char kIconAttribute[] = "ICON";
317  const char kShortcutURLAttribute[] = "SHORTCUTURL";
318  const char kAddDateAttribute[] = "ADD_DATE";
319  const char kPostDataAttribute[] = "POST_DATA";
320
321  title->clear();
322  *url = GURL();
323  *favicon = GURL();
324  shortcut->clear();
325  post_data->clear();
326  *add_date = base::Time();
327
328  if (!StartsWithASCII(line, kItemOpen, true))
329    return false;
330
331  size_t end = line.find(kItemClose);
332  size_t tag_end = line.rfind('>', end) + 1;
333  if (end == std::string::npos || tag_end < arraysize(kItemOpen))
334    return false;  // No end tag or start tag is broken.
335
336  std::string attribute_list = line.substr(arraysize(kItemOpen),
337      tag_end - arraysize(kItemOpen) - 1);
338
339  // We don't import Live Bookmark folders, which is Firefox's RSS reading
340  // feature, since the user never necessarily bookmarked them and we don't
341  // have this feature to update their contents.
342  std::string value;
343  if (GetAttribute(attribute_list, kFeedURLAttribute, &value))
344    return false;
345
346  // Title
347  base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
348                        base::OnStringConversionError::SKIP, title);
349  *title = net::UnescapeForHTML(*title);
350
351  // URL
352  if (GetAttribute(attribute_list, kHrefAttribute, &value)) {
353    base::string16 url16;
354    base::CodepageToUTF16(value, charset.c_str(),
355                          base::OnStringConversionError::SKIP, &url16);
356    url16 = net::UnescapeForHTML(url16);
357
358    *url = GURL(url16);
359  }
360
361  // Favicon
362  if (GetAttribute(attribute_list, kIconAttribute, &value))
363    *favicon = GURL(value);
364
365  // Keyword
366  if (GetAttribute(attribute_list, kShortcutURLAttribute, &value)) {
367    base::CodepageToUTF16(value, charset.c_str(),
368                          base::OnStringConversionError::SKIP, shortcut);
369    *shortcut = net::UnescapeForHTML(*shortcut);
370  }
371
372  // Add date
373  if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
374    int64 time;
375    base::StringToInt64(value, &time);
376    // Upper bound it at 32 bits.
377    if (0 < time && time < (1LL << 32))
378      *add_date = base::Time::FromTimeT(time);
379  }
380
381  // Post data.
382  if (GetAttribute(attribute_list, kPostDataAttribute, &value)) {
383    base::CodepageToUTF16(value, charset.c_str(),
384                          base::OnStringConversionError::SKIP, post_data);
385    *post_data = net::UnescapeForHTML(*post_data);
386  }
387
388  return true;
389}
390
391bool ParseMinimumBookmarkFromLine(const std::string& line,
392                                  const std::string& charset,
393                                  base::string16* title,
394                                  GURL* url) {
395  const char kItemOpen[] = "<DT><A";
396  const char kItemClose[] = "</";
397  const char kHrefAttributeUpper[] = "HREF";
398  const char kHrefAttributeLower[] = "href";
399
400  title->clear();
401  *url = GURL();
402
403  // Case-insensitive check of open tag.
404  if (!StartsWithASCII(line, kItemOpen, false))
405    return false;
406
407  // Find any close tag.
408  size_t end = line.find(kItemClose);
409  size_t tag_end = line.rfind('>', end) + 1;
410  if (end == std::string::npos || tag_end < arraysize(kItemOpen))
411    return false;  // No end tag or start tag is broken.
412
413  std::string attribute_list = line.substr(arraysize(kItemOpen),
414      tag_end - arraysize(kItemOpen) - 1);
415
416  // Title
417  base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
418                        base::OnStringConversionError::SKIP, title);
419  *title = net::UnescapeForHTML(*title);
420
421  // URL
422  std::string value;
423  if (GetAttribute(attribute_list, kHrefAttributeUpper, &value) ||
424      GetAttribute(attribute_list, kHrefAttributeLower, &value)) {
425    if (charset.length() != 0) {
426      base::string16 url16;
427      base::CodepageToUTF16(value, charset.c_str(),
428                            base::OnStringConversionError::SKIP, &url16);
429      url16 = net::UnescapeForHTML(url16);
430
431      *url = GURL(url16);
432    } else {
433      *url = GURL(value);
434    }
435  }
436
437  return true;
438}
439
440}  // namespace internal
441
442}  // namespace bookmark_html_reader
443