1/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2/* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 *
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
14 *
15 * The Original Code is the Mork Reader.
16 *
17 * The Initial Developer of the Original Code is
18 * Google Inc.
19 * Portions created by the Initial Developer are Copyright (C) 2006
20 * the Initial Developer. All Rights Reserved.
21 *
22 * Contributor(s):
23 *   Brian Ryner <bryner@brianryner.com> (original author)
24 *
25 * Alternatively, the contents of this file may be used under the terms of
26 * either the GNU General Public License Version 2 or later (the "GPL"), or
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
36 *
37 * ***** END LICENSE BLOCK ***** */
38
39// Source:
40// http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp
41// This file has been converted to google style.
42
43#include "chrome/browser/importer/mork_reader.h"
44
45#include <algorithm>
46
47#include "base/file_path.h"
48#include "base/i18n/icu_string_conversions.h"
49#include "base/logging.h"
50#include "base/message_loop.h"
51#include "base/string_number_conversions.h"
52#include "base/string_util.h"
53#include "chrome/browser/history/history_types.h"
54#include "chrome/browser/importer/firefox_importer_utils.h"
55#include "chrome/browser/importer/importer_bridge.h"
56
namespace {

// Converts a single upper-case hex digit ('0'-'9', 'A'-'F') to its numeric
// value.  Returns -1 for every other character, lower-case digits included.
inline int HexCharToInt(char c) {
  if ('0' <= c && c <= '9')
    return c - '0';
  if ('A' <= c && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

// Unescapes a Mork value and returns the result.  Mork uses $xx escaping
// (two upper-case hex digits) to encode non-ASCII bytes.  Additionally, '$'
// and '\' are backslash-escaped.
//
// Malformed input is handled leniently:
//  - a '$' followed by fewer than two characters is dropped and whatever
//    follows it is copied as ordinary text;
//  - a '$' followed by two characters that are not both hex digits is
//    dropped together with those two characters;
//  - a trailing lone backslash is dropped.
std::string MorkUnescape(const std::string& input) {
  // We optimize for speed over space here -- size the result buffer to
  // the size of the source, which is an upper bound on the size of the
  // unescaped string.
  std::string result;
  const size_t input_length = input.size();
  result.reserve(input_length);

  for (size_t i = 0; i < input_length; i++) {
    const char c = input[i];
    if (c == '\\') {
      // Escaped literal, skip the backslash, append the next character.
      i++;
      if (i < input_length)
        result.push_back(input[i]);
    } else if (c == '$') {
      // Dollar sign denotes a hex-encoded byte.  The bound is written as
      // |i + 2 < input_length| rather than |i < input_length - 2| because the
      // subtraction wraps around for inputs shorter than two characters and
      // would let us read past the end of |input|.
      if (i + 2 < input_length) {
        const int first = HexCharToInt(input[++i]);
        const int second = HexCharToInt(input[++i]);
        if (first >= 0 && second >= 0)
          result.push_back(static_cast<char>((first << 4) | second));
      }
    } else {
      // Regular character, just append.
      result.push_back(c);
    }
  }
  return result;
}

}  // namespace
106
107MorkReader::MorkReader() {
108}
109
110MorkReader::~MorkReader() {
111  // Need to delete all the pointers to vectors we have in the table.
112  for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i)
113    delete i->second;
114}
115
116bool MorkReader::Read(const FilePath& path) {
117  stream_.open(path.value().c_str());
118  if (!stream_.is_open())
119    return false;
120
121  std::string line;
122  if (!ReadLine(&line) ||
123      line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0)
124    return false;  // Unexpected file format.
125
126  IndexMap column_map;
127  while (ReadLine(&line)) {
128    // Trim off leading spaces
129    size_t idx = 0;
130    size_t len = line.size();
131    while (idx < len && line[idx] == ' ')
132      ++idx;
133    if (idx >= len)
134      continue;
135
136    // Look at the line to figure out what section type this is
137    if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) {
138      // Column map.  We begin by creating a hash of column id to column name.
139      StringMap column_name_map;
140      ParseMap(line, idx, &column_name_map);
141
142      // Now that we have the list of columns, we put them into a flat array.
143      // Rows will have value arrays of the same size, with indexes that
144      // correspond to the columns array.  As we insert each column into the
145      // array, we also make an entry in columnMap so that we can look up the
146      // index given the column id.
147      columns_.reserve(column_name_map.size());
148
149      for (StringMap::const_iterator i = column_name_map.begin();
150           i != column_name_map.end(); ++i) {
151        column_map[i->first] = static_cast<int>(columns_.size());
152        MorkColumn col(i->first, i->second);
153        columns_.push_back(col);
154      }
155    } else if (StartsWithASCII(&line[idx], "<(", true)) {
156      // Value map.
157      ParseMap(line, idx, &value_map_);
158    } else if (line[idx] == '{' || line[idx] == '[') {
159      // Table / table row.
160      ParseTable(line, idx, &column_map);
161    } else {
162      // Don't know, hopefully don't care.
163    }
164  }
165  return true;
166}
167
168// Parses a key/value map of the form
169// <(k1=v1)(k2=v2)...>
170bool MorkReader::ParseMap(const std::string& first_line,
171                          size_t start_index,
172                          StringMap* map) {
173  // If the first line is the a=c line (column map), just skip over it.
174  std::string line(first_line);
175  if (StartsWithASCII(line, "< <(a=c)>", true))
176    ReadLine(&line);
177
178  std::string key;
179  do {
180    size_t idx = start_index;
181    size_t len = line.size();
182    size_t token_start;
183
184    while (idx < len) {
185      switch (line[idx++]) {
186        case '(':
187          // Beginning of a key/value pair.
188          if (!key.empty()) {
189            DLOG(WARNING) << "unterminated key/value pair?";
190            key.clear();
191          }
192
193          token_start = idx;
194          while (idx < len && line[idx] != '=')
195            ++idx;
196          key.assign(&line[token_start], idx - token_start);
197          break;
198
199        case '=': {
200          // Beginning of the value.
201          if (key.empty()) {
202            DLOG(WARNING) << "stray value";
203            break;
204          }
205
206          token_start = idx;
207          while (idx < len && line[idx] != ')') {
208            if (line[idx] == '\\')
209              ++idx;  // Skip escaped ')' characters.
210            ++idx;
211          }
212          size_t token_end = std::min(idx, len);
213          ++idx;
214
215          std::string value = MorkUnescape(
216              std::string(&line[token_start], token_end - token_start));
217          (*map)[key] = value;
218          key.clear();
219          break;
220        }
221        case '>':
222          // End of the map.
223          DLOG_IF(WARNING, key.empty()) <<
224              "map terminates inside of key/value pair";
225          return true;
226      }
227    }
228
229    // We should start reading the next line at the beginning.
230    start_index = 0;
231  } while (ReadLine(&line));
232
233  // We ran out of lines and the map never terminated.  This probably indicates
234  // a parsing error.
235  DLOG(WARNING) << "didn't find end of key/value map";
236  return false;
237}
238
239// Parses a table row of the form [123(^45^67)..]
240// (row id 123 has the value with id 67 for the column with id 45).
241// A '^' prefix for a column or value references an entry in the column or
242// value map.  '=' is used as the separator when the value is a literal.
243void MorkReader::ParseTable(const std::string& first_line,
244                            size_t start_index,
245                            const IndexMap* column_map) {
246  std::string line(first_line);
247
248  // Column index of the cell we're parsing, minus one if invalid.
249  int column_index = -1;
250
251  // Points to the current row we're parsing inside of the |table_|, will be
252  // NULL if we're not inside a row.
253  ColumnDataList* current_row = NULL;
254
255  bool in_meta_row = false;
256
257  do {
258    size_t idx = start_index;
259    size_t len = line.size();
260
261    while (idx < len) {
262      switch (line[idx++]) {
263        case '{':
264          // This marks the beginning of a table section.  There's a lot of
265          // junk before the first row that looks like cell values but isn't.
266          // Skip to the first '['.
267          while (idx < len && line[idx] != '[') {
268            if (line[idx] == '{') {
269              in_meta_row = true;  // The meta row is enclosed in { }
270            } else if (line[idx] == '}') {
271              in_meta_row = false;
272            }
273            ++idx;
274          }
275          break;
276
277        case '[': {
278          // Start of a new row.  Consume the row id, up to the first '('.
279          // Row edits also have a table namespace, separated from the row id
280          // by a colon.  We don't make use of the namespace, but we need to
281          // make sure not to consider it part of the row id.
282          if (current_row) {
283            DLOG(WARNING) << "unterminated row?";
284            current_row = NULL;
285          }
286
287          // Check for a '-' at the start of the id.  This signifies that
288          // if the row already exists, we should delete all columns from it
289          // before adding the new values.
290          bool cut_columns;
291          if (idx < len && line[idx] == '-') {
292            cut_columns = true;
293            ++idx;
294          } else {
295            cut_columns = false;
296          }
297
298          // Locate the range of the ID.
299          size_t token_start = idx;  // Index of the first char of the token.
300          while (idx < len &&
301                 line[idx] != '(' &&
302                 line[idx] != ']' &&
303                 line[idx] != ':') {
304            ++idx;
305          }
306          size_t token_end = idx;  // Index of the char following the token.
307          while (idx < len && line[idx] != '(' && line[idx] != ']') {
308            ++idx;
309          }
310
311          if (in_meta_row) {
312            // Need to create the meta row.
313            meta_row_.resize(columns_.size());
314            current_row = &meta_row_;
315          } else {
316            // Find or create the regular row for this.
317            IDString row_id(&line[token_start], token_end - token_start);
318            RowMap::iterator found_row = table_.find(row_id);
319            if (found_row == table_.end()) {
320              // We don't already have this row, create a new one for it.
321              current_row = new ColumnDataList(columns_.size());
322              table_[row_id] = current_row;
323            } else {
324              // The row already exists and we're adding/replacing things.
325              current_row = found_row->second;
326            }
327          }
328          if (cut_columns) {
329            for (size_t i = 0; i < current_row->size(); ++i)
330              (*current_row)[i].clear();
331          }
332          break;
333        }
334
335        case ']':
336          // We're done with the row.
337          current_row = NULL;
338          in_meta_row = false;
339          break;
340
341        case '(': {
342          if (!current_row) {
343            DLOG(WARNING) << "cell value outside of row";
344            break;
345          }
346
347          bool column_is_atom;
348          if (line[idx] == '^') {
349            column_is_atom = true;
350            ++idx;  // This is not part of the column id, advance past it.
351          } else {
352            column_is_atom = false;
353          }
354          size_t token_start = idx;
355          while (idx < len && line[idx] != '^' && line[idx] != '=') {
356            if (line[idx] == '\\')
357              ++idx;  // Skip escaped characters.
358            ++idx;
359          }
360
361          size_t token_end = std::min(idx, len);
362
363          IDString column;
364          if (column_is_atom)
365            column.assign(&line[token_start], token_end - token_start);
366          else
367            column = MorkUnescape(line.substr(token_start,
368                                              token_end - token_start));
369
370          IndexMap::const_iterator found_column = column_map->find(column);
371          if (found_column == column_map->end()) {
372            DLOG(WARNING) << "Column not in column map, discarding it";
373            column_index = -1;
374          } else {
375            column_index = found_column->second;
376          }
377          break;
378        }
379
380        case '=':
381        case '^': {
382          if (column_index == -1) {
383            DLOG(WARNING) << "stray ^ or = marker";
384            break;
385          }
386
387          bool value_is_atom = (line[idx - 1] == '^');
388          size_t token_start = idx - 1;  // Include the '=' or '^' marker.
389          while (idx < len && line[idx] != ')') {
390            if (line[idx] == '\\')
391              ++idx;  // Skip escaped characters.
392            ++idx;
393          }
394          size_t token_end = std::min(idx, len);
395          ++idx;
396
397          if (value_is_atom) {
398            (*current_row)[column_index].assign(&line[token_start],
399                                                token_end - token_start);
400          } else {
401            (*current_row)[column_index] =
402                MorkUnescape(line.substr(token_start, token_end - token_start));
403          }
404          column_index = -1;
405        }
406        break;
407      }
408    }
409
410    // Start parsing the next line at the beginning.
411    start_index = 0;
412  } while (current_row && ReadLine(&line));
413}
414
415bool MorkReader::ReadLine(std::string* line) {
416  line->resize(256);
417  std::getline(stream_, *line);
418  if (stream_.eof() || stream_.bad())
419    return false;
420
421  while (!line->empty() &&  (*line)[line->size() - 1] == '\\') {
422    // There is a continuation for this line.  Read it and append.
423    std::string new_line;
424    std::getline(stream_, new_line);
425    if (stream_.eof())
426      return false;
427    line->erase(line->size() - 1);
428    line->append(new_line);
429  }
430
431  return true;
432}
433
434void MorkReader::NormalizeValue(std::string* value) const {
435  if (value->empty())
436    return;
437  MorkReader::StringMap::const_iterator i;
438  switch (value->at(0)) {
439    case '^':
440      // Hex ID, lookup the name for it in the |value_map_|.
441      i = value_map_.find(value->substr(1));
442      if (i == value_map_.end())
443        value->clear();
444      else
445        *value = i->second;
446      break;
447    case '=':
448      // Just use the literal after the equals sign.
449      value->erase(value->begin());
450      break;
451    default:
452      // Anything else is invalid.
453      value->clear();
454      break;
455  }
456}
457
458// Source:
459// http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp
460
// Columns for entry (non-meta) history rows.  The enumerators are used as
// indexes into |gColumnNames| below, so both lists must stay in the same
// order.
enum {
  kURLColumn = 0,
  kNameColumn,
  kVisitCountColumn,
  kHiddenColumn,
  kTypedColumn,
  kLastVisitColumn,
  kColumnCount  // Keep me last.
};

// Column names as they appear in the file, indexed by the enum above.
static const char* const gColumnNames[kColumnCount] = {
  "URL",            // kURLColumn
  "Name",           // kNameColumn
  "VisitCount",     // kVisitCountColumn
  "Hidden",         // kHiddenColumn
  "Typed",          // kTypedColumn
  "LastVisitDate"   // kLastVisitColumn
};
475
476struct TableReadClosure {
477  explicit TableReadClosure(const MorkReader& r)
478      : reader(r),
479        swap_bytes(false),
480        byte_order_column(-1) {
481    for (int i = 0; i < kColumnCount; ++i)
482      column_indexes[i] = -1;
483  }
484
485  // Backpointers to the reader and history we're operating on.
486  const MorkReader& reader;
487
488  // Whether we need to swap bytes (file format is other-endian).
489  bool swap_bytes;
490
491  // Indexes of the columns that we care about.
492  int column_indexes[kColumnCount];
493  int byte_order_column;
494};
495
496void AddToHistory(MorkReader::ColumnDataList* column_values,
497                  const TableReadClosure& data,
498                  std::vector<history::URLRow>* rows) {
499  std::string values[kColumnCount];
500
501  for (size_t i = 0; i < kColumnCount; ++i) {
502    if (data.column_indexes[i] != -1) {
503      values[i] = column_values->at(data.column_indexes[i]);
504      data.reader.NormalizeValue(&values[i]);
505      // Do not import hidden records.
506      if (i == kHiddenColumn && values[i] == "1")
507        return;
508    }
509  }
510
511  GURL url(values[kURLColumn]);
512
513  if (CanImportURL(url)) {
514    history::URLRow row(url);
515
516    string16 title;
517    if (data.swap_bytes) {
518      base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE,
519                            base::OnStringConversionError::SKIP, &title);
520    } else {
521      base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE,
522                            base::OnStringConversionError::SKIP, &title);
523    }
524    row.set_title(title);
525
526    int count = atoi(values[kVisitCountColumn].c_str());
527    if (count == 0)
528      count = 1;
529    row.set_visit_count(count);
530
531    int64 date;
532    base::StringToInt64(values[kLastVisitColumn], &date);
533    if (date != 0)
534      row.set_last_visit(base::Time::FromTimeT(date / 1000000));
535
536    bool is_typed = (values[kTypedColumn] == "1");
537    if (is_typed)
538      row.set_typed_count(1);
539
540    rows->push_back(row);
541  }
542}
543
544// It sets up the file stream and loops over the lines in the file to
545// parse them, then adds the resulting row set to history.
546void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) {
547  MorkReader reader;
548  reader.Read(file);
549
550  // Gather up the column ids so we don't need to find them on each row
551  TableReadClosure data(reader);
552  const MorkReader::MorkColumnList& columns = reader.columns();
553  for (size_t i = 0; i < columns.size(); ++i) {
554    for (int j = 0; j < kColumnCount; ++j)
555      if (columns[i].name == gColumnNames[j]) {
556        data.column_indexes[j] = static_cast<int>(i);
557        break;
558      }
559    if (columns[i].name == "ByteOrder")
560      data.byte_order_column = static_cast<int>(i);
561  }
562
563  // Determine the byte order from the table's meta-row.
564  const MorkReader::ColumnDataList& meta_row = reader.meta_row();
565  if (!meta_row.empty() && data.byte_order_column != -1) {
566    std::string byte_order = meta_row[data.byte_order_column];
567    if (!byte_order.empty()) {
568      // Note whether the file uses a non-native byte ordering.
569      // If it does, we'll have to swap bytes for PRUnichar values.
570      // "BE" and "LE" are the only recognized values, anything
571      // else is garbage and the file will be treated as native-endian
572      // (no swapping).
573      std::string byte_order_value(byte_order);
574      reader.NormalizeValue(&byte_order_value);
575      data.swap_bytes = (byte_order_value == "BE");
576    }
577  }
578
579  std::vector<history::URLRow> rows;
580  for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i)
581    AddToHistory(i->second, data, &rows);
582  if (!rows.empty())
583    bridge->SetHistoryItems(rows, history::SOURCE_FIREFOX_IMPORTED);
584}
585