mork_reader.cc revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
1/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2/* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 *
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
14 *
15 * The Original Code is the Mork Reader.
16 *
17 * The Initial Developer of the Original Code is
18 * Google Inc.
19 * Portions created by the Initial Developer are Copyright (C) 2006
20 * the Initial Developer. All Rights Reserved.
21 *
22 * Contributor(s):
23 *   Brian Ryner <bryner@brianryner.com> (original author)
24 *
25 * Alternatively, the contents of this file may be used under the terms of
26 * either the GNU General Public License Version 2 or later (the "GPL"), or
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
36 *
37 * ***** END LICENSE BLOCK ***** */
38
39// Source:
40// http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp
41// This file has been converted to google style.
42
43#include "chrome/browser/importer/mork_reader.h"
44
45#include <algorithm>
46
47#include "base/file_path.h"
48#include "base/i18n/icu_string_conversions.h"
49#include "base/logging.h"
50#include "base/message_loop.h"
51#include "base/string_number_conversions.h"
52#include "base/string_util.h"
53#include "base/values.h"
54#include "chrome/browser/history/history_types.h"
55#include "chrome/browser/importer/firefox_importer_utils.h"
56#include "chrome/browser/importer/importer.h"
57#include "chrome/browser/importer/importer_bridge.h"
58
59using base::Time;
60
namespace {

// Converts one uppercase hexadecimal digit ('0'-'9', 'A'-'F') to its numeric
// value.  Returns -1 for every other character, including lowercase 'a'-'f'.
inline int HexCharToInt(char c) {
  if ('0' <= c && c <= '9')
    return c - '0';
  if ('A' <= c && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

// Unescapes a Mork value and returns the result.
//
// Two escape forms are handled:
//   \c    - the character |c| is copied literally (used for '$', '\', ')').
//   $XX   - |XX| are two uppercase hex digits giving one byte of output.
//
// Malformed input is tolerated rather than rejected:
//   - a trailing lone backslash is dropped;
//   - "$XX" with a non-hex digit is dropped entirely (all three characters);
//   - a '$' with fewer than two characters after it is dropped, and the
//     remaining characters are copied as ordinary text.
std::string MorkUnescape(const std::string& input) {
  // We optimize for speed over space here -- size the result buffer to
  // the size of the source, which is an upper bound on the size of the
  // unescaped string.
  std::string result;
  const size_t input_length = input.size();
  result.reserve(input_length);

  for (size_t i = 0; i < input_length; i++) {
    const char c = input[i];
    if (c == '\\') {
      // Escaped literal: skip the backslash, append the next character.
      i++;
      if (i < input_length)
        result.push_back(input[i]);
    } else if (c == '$') {
      // Dollar sign introduces a two-digit hex byte.  The bound is written as
      // |i + 2 < input_length| on purpose: |input_length - 2| would wrap
      // around for inputs shorter than two characters (size_t is unsigned)
      // and let the reads below run past the end of the string.
      if (i + 2 < input_length) {
        const int first = HexCharToInt(input[++i]);
        const int second = HexCharToInt(input[++i]);
        if (first >= 0 && second >= 0)
          result.push_back(static_cast<char>((first << 4) | second));
      }
    } else {
      // Regular character, just append.
      result.push_back(c);
    }
  }
  return result;
}

}  // namespace
110
111MorkReader::MorkReader() {
112}
113
114MorkReader::~MorkReader() {
115  // Need to delete all the pointers to vectors we have in the table.
116  for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i)
117    delete i->second;
118}
119
120bool MorkReader::Read(const FilePath& path) {
121  stream_.open(path.value().c_str());
122  if (!stream_.is_open())
123    return false;
124
125  std::string line;
126  if (!ReadLine(&line) ||
127      line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0)
128    return false;  // Unexpected file format.
129
130  IndexMap column_map;
131  while (ReadLine(&line)) {
132    // Trim off leading spaces
133    size_t idx = 0;
134    size_t len = line.size();
135    while (idx < len && line[idx] == ' ')
136      ++idx;
137    if (idx >= len)
138      continue;
139
140    // Look at the line to figure out what section type this is
141    if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) {
142      // Column map.  We begin by creating a hash of column id to column name.
143      StringMap column_name_map;
144      ParseMap(line, idx, &column_name_map);
145
146      // Now that we have the list of columns, we put them into a flat array.
147      // Rows will have value arrays of the same size, with indexes that
148      // correspond to the columns array.  As we insert each column into the
149      // array, we also make an entry in columnMap so that we can look up the
150      // index given the column id.
151      columns_.reserve(column_name_map.size());
152
153      for (StringMap::const_iterator i = column_name_map.begin();
154           i != column_name_map.end(); ++i) {
155        column_map[i->first] = static_cast<int>(columns_.size());
156        MorkColumn col(i->first, i->second);
157        columns_.push_back(col);
158      }
159    } else if (StartsWithASCII(&line[idx], "<(", true)) {
160      // Value map.
161      ParseMap(line, idx, &value_map_);
162    } else if (line[idx] == '{' || line[idx] == '[') {
163      // Table / table row.
164      ParseTable(line, idx, &column_map);
165    } else {
166      // Don't know, hopefully don't care.
167    }
168  }
169  return true;
170}
171
172// Parses a key/value map of the form
173// <(k1=v1)(k2=v2)...>
174bool MorkReader::ParseMap(const std::string& first_line,
175                          size_t start_index,
176                          StringMap* map) {
177  // If the first line is the a=c line (column map), just skip over it.
178  std::string line(first_line);
179  if (StartsWithASCII(line, "< <(a=c)>", true))
180    ReadLine(&line);
181
182  std::string key;
183  do {
184    size_t idx = start_index;
185    size_t len = line.size();
186    size_t token_start;
187
188    while (idx < len) {
189      switch (line[idx++]) {
190        case '(':
191          // Beginning of a key/value pair.
192          if (!key.empty()) {
193            DLOG(WARNING) << "unterminated key/value pair?";
194            key.clear();
195          }
196
197          token_start = idx;
198          while (idx < len && line[idx] != '=')
199            ++idx;
200          key.assign(&line[token_start], idx - token_start);
201          break;
202
203        case '=': {
204          // Beginning of the value.
205          if (key.empty()) {
206            DLOG(WARNING) << "stray value";
207            break;
208          }
209
210          token_start = idx;
211          while (idx < len && line[idx] != ')') {
212            if (line[idx] == '\\')
213              ++idx;  // Skip escaped ')' characters.
214            ++idx;
215          }
216          size_t token_end = std::min(idx, len);
217          ++idx;
218
219          std::string value = MorkUnescape(
220              std::string(&line[token_start], token_end - token_start));
221          (*map)[key] = value;
222          key.clear();
223          break;
224        }
225        case '>':
226          // End of the map.
227          DLOG_IF(WARNING, key.empty()) <<
228              "map terminates inside of key/value pair";
229          return true;
230      }
231    }
232
233    // We should start reading the next line at the beginning.
234    start_index = 0;
235  } while (ReadLine(&line));
236
237  // We ran out of lines and the map never terminated.  This probably indicates
238  // a parsing error.
239  DLOG(WARNING) << "didn't find end of key/value map";
240  return false;
241}
242
243// Parses a table row of the form [123(^45^67)..]
244// (row id 123 has the value with id 67 for the column with id 45).
245// A '^' prefix for a column or value references an entry in the column or
246// value map.  '=' is used as the separator when the value is a literal.
247void MorkReader::ParseTable(const std::string& first_line,
248                            size_t start_index,
249                            const IndexMap* column_map) {
250  std::string line(first_line);
251
252  // Column index of the cell we're parsing, minus one if invalid.
253  int column_index = -1;
254
255  // Points to the current row we're parsing inside of the |table_|, will be
256  // NULL if we're not inside a row.
257  ColumnDataList* current_row = NULL;
258
259  bool in_meta_row = false;
260
261  do {
262    size_t idx = start_index;
263    size_t len = line.size();
264
265    while (idx < len) {
266      switch (line[idx++]) {
267        case '{':
268          // This marks the beginning of a table section.  There's a lot of
269          // junk before the first row that looks like cell values but isn't.
270          // Skip to the first '['.
271          while (idx < len && line[idx] != '[') {
272            if (line[idx] == '{') {
273              in_meta_row = true;  // The meta row is enclosed in { }
274            } else if (line[idx] == '}') {
275              in_meta_row = false;
276            }
277            ++idx;
278          }
279          break;
280
281        case '[': {
282          // Start of a new row.  Consume the row id, up to the first '('.
283          // Row edits also have a table namespace, separated from the row id
284          // by a colon.  We don't make use of the namespace, but we need to
285          // make sure not to consider it part of the row id.
286          if (current_row) {
287            DLOG(WARNING) << "unterminated row?";
288            current_row = NULL;
289          }
290
291          // Check for a '-' at the start of the id.  This signifies that
292          // if the row already exists, we should delete all columns from it
293          // before adding the new values.
294          bool cut_columns;
295          if (idx < len && line[idx] == '-') {
296            cut_columns = true;
297            ++idx;
298          } else {
299            cut_columns = false;
300          }
301
302          // Locate the range of the ID.
303          size_t token_start = idx;  // Index of the first char of the token.
304          while (idx < len &&
305                 line[idx] != '(' &&
306                 line[idx] != ']' &&
307                 line[idx] != ':') {
308            ++idx;
309          }
310          size_t token_end = idx;  // Index of the char following the token.
311          while (idx < len && line[idx] != '(' && line[idx] != ']') {
312            ++idx;
313          }
314
315          if (in_meta_row) {
316            // Need to create the meta row.
317            meta_row_.resize(columns_.size());
318            current_row = &meta_row_;
319          } else {
320            // Find or create the regular row for this.
321            IDString row_id(&line[token_start], token_end - token_start);
322            RowMap::iterator found_row = table_.find(row_id);
323            if (found_row == table_.end()) {
324              // We don't already have this row, create a new one for it.
325              current_row = new ColumnDataList(columns_.size());
326              table_[row_id] = current_row;
327            } else {
328              // The row already exists and we're adding/replacing things.
329              current_row = found_row->second;
330            }
331          }
332          if (cut_columns) {
333            for (size_t i = 0; i < current_row->size(); ++i)
334              (*current_row)[i].clear();
335          }
336          break;
337        }
338
339        case ']':
340          // We're done with the row.
341          current_row = NULL;
342          in_meta_row = false;
343          break;
344
345        case '(': {
346          if (!current_row) {
347            DLOG(WARNING) << "cell value outside of row";
348            break;
349          }
350
351          bool column_is_atom;
352          if (line[idx] == '^') {
353            column_is_atom = true;
354            ++idx;  // This is not part of the column id, advance past it.
355          } else {
356            column_is_atom = false;
357          }
358          size_t token_start = idx;
359          while (idx < len && line[idx] != '^' && line[idx] != '=') {
360            if (line[idx] == '\\')
361              ++idx;  // Skip escaped characters.
362            ++idx;
363          }
364
365          size_t token_end = std::min(idx, len);
366
367          IDString column;
368          if (column_is_atom)
369            column.assign(&line[token_start], token_end - token_start);
370          else
371            column = MorkUnescape(line.substr(token_start,
372                                              token_end - token_start));
373
374          IndexMap::const_iterator found_column = column_map->find(column);
375          if (found_column == column_map->end()) {
376            DLOG(WARNING) << "Column not in column map, discarding it";
377            column_index = -1;
378          } else {
379            column_index = found_column->second;
380          }
381          break;
382        }
383
384        case '=':
385        case '^': {
386          if (column_index == -1) {
387            DLOG(WARNING) << "stray ^ or = marker";
388            break;
389          }
390
391          bool value_is_atom = (line[idx - 1] == '^');
392          size_t token_start = idx - 1;  // Include the '=' or '^' marker.
393          while (idx < len && line[idx] != ')') {
394            if (line[idx] == '\\')
395              ++idx;  // Skip escaped characters.
396            ++idx;
397          }
398          size_t token_end = std::min(idx, len);
399          ++idx;
400
401          if (value_is_atom) {
402            (*current_row)[column_index].assign(&line[token_start],
403                                                token_end - token_start);
404          } else {
405            (*current_row)[column_index] =
406                MorkUnescape(line.substr(token_start, token_end - token_start));
407          }
408          column_index = -1;
409        }
410        break;
411      }
412    }
413
414    // Start parsing the next line at the beginning.
415    start_index = 0;
416  } while (current_row && ReadLine(&line));
417}
418
419bool MorkReader::ReadLine(std::string* line) {
420  line->resize(256);
421  std::getline(stream_, *line);
422  if (stream_.eof() || stream_.bad())
423    return false;
424
425  while (!line->empty() &&  (*line)[line->size() - 1] == '\\') {
426    // There is a continuation for this line.  Read it and append.
427    std::string new_line;
428    std::getline(stream_, new_line);
429    if (stream_.eof())
430      return false;
431    line->erase(line->size() - 1);
432    line->append(new_line);
433  }
434
435  return true;
436}
437
438void MorkReader::NormalizeValue(std::string* value) const {
439  if (value->empty())
440    return;
441  MorkReader::StringMap::const_iterator i;
442  switch (value->at(0)) {
443    case '^':
444      // Hex ID, lookup the name for it in the |value_map_|.
445      i = value_map_.find(value->substr(1));
446      if (i == value_map_.end())
447        value->clear();
448      else
449        *value = i->second;
450      break;
451    case '=':
452      // Just use the literal after the equals sign.
453      value->erase(value->begin());
454      break;
455    default:
456      // Anything else is invalid.
457      value->clear();
458      break;
459  }
460}
461
462// Source:
463// http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp
464
// Columns for entry (non-meta) history rows.  Each enumerator is used as an
// index into |gColumnNames| below, so the two lists must stay in the same
// order.
enum {
  kURLColumn = 0,
  kNameColumn,
  kVisitCountColumn,
  kHiddenColumn,
  kTypedColumn,
  kLastVisitColumn,
  kColumnCount  // Keep me last.
};

// Name of the Mork column for each enumerator above, in enumerator order.
static const char* const gColumnNames[] = {
  "URL",
  "Name",
  "VisitCount",
  "Hidden",
  "Typed",
  "LastVisitDate"
};
479
480struct TableReadClosure {
481  explicit TableReadClosure(const MorkReader& r)
482      : reader(r),
483        swap_bytes(false),
484        byte_order_column(-1) {
485    for (int i = 0; i < kColumnCount; ++i)
486      column_indexes[i] = -1;
487  }
488
489  // Backpointers to the reader and history we're operating on.
490  const MorkReader& reader;
491
492  // Whether we need to swap bytes (file format is other-endian).
493  bool swap_bytes;
494
495  // Indexes of the columns that we care about.
496  int column_indexes[kColumnCount];
497  int byte_order_column;
498};
499
500void AddToHistory(MorkReader::ColumnDataList* column_values,
501                  const TableReadClosure& data,
502                  std::vector<history::URLRow>* rows) {
503  std::string values[kColumnCount];
504
505  for (size_t i = 0; i < kColumnCount; ++i) {
506    if (data.column_indexes[i] != -1) {
507      values[i] = column_values->at(data.column_indexes[i]);
508      data.reader.NormalizeValue(&values[i]);
509      // Do not import hidden records.
510      if (i == kHiddenColumn && values[i] == "1")
511        return;
512    }
513  }
514
515  GURL url(values[kURLColumn]);
516
517  if (CanImportURL(url)) {
518    history::URLRow row(url);
519
520    string16 title;
521    if (data.swap_bytes) {
522      base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE,
523                            base::OnStringConversionError::SKIP, &title);
524    } else {
525      base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE,
526                            base::OnStringConversionError::SKIP, &title);
527    }
528    row.set_title(title);
529
530    int count = atoi(values[kVisitCountColumn].c_str());
531    if (count == 0)
532      count = 1;
533    row.set_visit_count(count);
534
535    int64 date;
536    base::StringToInt64(values[kLastVisitColumn], &date);
537    if (date != 0)
538      row.set_last_visit(Time::FromTimeT(date / 1000000));
539
540    bool is_typed = (values[kTypedColumn] == "1");
541    if (is_typed)
542      row.set_typed_count(1);
543
544    rows->push_back(row);
545  }
546}
547
548// It sets up the file stream and loops over the lines in the file to
549// parse them, then adds the resulting row set to history.
550void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) {
551  MorkReader reader;
552  reader.Read(file);
553
554  // Gather up the column ids so we don't need to find them on each row
555  TableReadClosure data(reader);
556  const MorkReader::MorkColumnList& columns = reader.columns();
557  for (size_t i = 0; i < columns.size(); ++i) {
558    for (int j = 0; j < kColumnCount; ++j)
559      if (columns[i].name == gColumnNames[j]) {
560        data.column_indexes[j] = static_cast<int>(i);
561        break;
562      }
563    if (columns[i].name == "ByteOrder")
564      data.byte_order_column = static_cast<int>(i);
565  }
566
567  // Determine the byte order from the table's meta-row.
568  const MorkReader::ColumnDataList& meta_row = reader.meta_row();
569  if (!meta_row.empty() && data.byte_order_column != -1) {
570    std::string byte_order = meta_row[data.byte_order_column];
571    if (!byte_order.empty()) {
572      // Note whether the file uses a non-native byte ordering.
573      // If it does, we'll have to swap bytes for PRUnichar values.
574      // "BE" and "LE" are the only recognized values, anything
575      // else is garbage and the file will be treated as native-endian
576      // (no swapping).
577      std::string byte_order_value(byte_order);
578      reader.NormalizeValue(&byte_order_value);
579      data.swap_bytes = (byte_order_value == "BE");
580    }
581  }
582
583  std::vector<history::URLRow> rows;
584  for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i)
585    AddToHistory(i->second, data, &rows);
586  if (!rows.empty())
587    bridge->SetHistoryItems(rows, history::SOURCE_FIREFOX_IMPORTED);
588}
589