mork_reader.cc revision c407dc5cd9bdc5668497f21b26b09d988ab439de
1/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2/* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 *
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
14 *
15 * The Original Code is the Mork Reader.
16 *
17 * The Initial Developer of the Original Code is
18 * Google Inc.
19 * Portions created by the Initial Developer are Copyright (C) 2006
20 * the Initial Developer. All Rights Reserved.
21 *
22 * Contributor(s):
23 *   Brian Ryner <bryner@brianryner.com> (original author)
24 *
25 * Alternatively, the contents of this file may be used under the terms of
26 * either the GNU General Public License Version 2 or later (the "GPL"), or
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
36 *
37 * ***** END LICENSE BLOCK ***** */
38
39// Source:
40// http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp
41// This file has been converted to google style.
42
43#include "chrome/browser/importer/mork_reader.h"
44
45#include <algorithm>
46
47#include "base/file_path.h"
48#include "base/i18n/icu_string_conversions.h"
49#include "base/logging.h"
50#include "base/message_loop.h"
51#include "base/string_util.h"
52#include "chrome/browser/history/history_types.h"
53#include "chrome/browser/importer/firefox_importer_utils.h"
54#include "chrome/browser/importer/importer.h"
55#include "chrome/browser/importer/importer_bridge.h"
56
57using base::Time;
58
namespace {

// Converts a single hex digit to its numeric value.  Only '0'-'9' and the
// upper-case letters 'A'-'F' are recognized; every other character
// (including lower-case 'a'-'f') yields -1.
inline int HexCharToInt(char c) {
  if ('0' <= c && c <= '9')
    return c - '0';
  if ('A' <= c && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

// Unescapes a Mork value and returns the result.
//
// Two escape forms are handled:
//   * "\c"  - the character following the backslash is copied verbatim.  A
//             backslash that is the last character of |input| is dropped.
//   * "$XX" - two upper-case hex digits are replaced by the byte they encode.
//             If either digit is invalid all three characters are dropped.
//             A '$' with fewer than two characters after it is dropped and
//             whatever follows is handled as ordinary input.
// Every other character is copied unchanged.
std::string MorkUnescape(const std::string& input) {
  // We optimize for speed over space here -- size the result buffer to
  // the size of the source, which is an upper bound on the size of the
  // unescaped string.
  std::string result;
  const size_t input_length = input.size();
  result.reserve(input_length);

  for (size_t i = 0; i < input_length; ++i) {
    const char c = input[i];
    if (c == '\\') {
      // Escaped literal, skip the backslash, append the next character.
      ++i;
      if (i < input_length)
        result.push_back(input[i]);
    } else if (c == '$') {
      // Dollar sign denotes a hex-encoded byte.  The bound is written as
      // |i + 2 < input_length| on purpose: |input_length - 2| would wrap
      // around for inputs shorter than two characters and let the reads
      // below run past the end of the string.
      if (i + 2 < input_length) {
        const int first = HexCharToInt(input[i + 1]);
        const int second = HexCharToInt(input[i + 2]);
        i += 2;  // Both digits are consumed whether or not they were valid.
        if (first >= 0 && second >= 0)
          result.push_back(static_cast<char>((first << 4) | second));
      }
    } else {
      // Regular character, just append.
      result.push_back(c);
    }
  }
  return result;
}

}  // namespace
108
109MorkReader::MorkReader() {
110}
111
112MorkReader::~MorkReader() {
113  // Need to delete all the pointers to vectors we have in the table.
114  for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i)
115    delete i->second;
116}
117
118bool MorkReader::Read(const FilePath& path) {
119  stream_.open(path.value().c_str());
120  if (!stream_.is_open())
121    return false;
122
123  std::string line;
124  if (!ReadLine(&line) ||
125      line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0)
126    return false;  // Unexpected file format.
127
128  IndexMap column_map;
129  while (ReadLine(&line)) {
130    // Trim off leading spaces
131    size_t idx = 0;
132    size_t len = line.size();
133    while (idx < len && line[idx] == ' ')
134      ++idx;
135    if (idx >= len)
136      continue;
137
138    // Look at the line to figure out what section type this is
139    if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) {
140      // Column map.  We begin by creating a hash of column id to column name.
141      StringMap column_name_map;
142      ParseMap(line, idx, &column_name_map);
143
144      // Now that we have the list of columns, we put them into a flat array.
145      // Rows will have value arrays of the same size, with indexes that
146      // correspond to the columns array.  As we insert each column into the
147      // array, we also make an entry in columnMap so that we can look up the
148      // index given the column id.
149      columns_.reserve(column_name_map.size());
150
151      for (StringMap::const_iterator i = column_name_map.begin();
152           i != column_name_map.end(); ++i) {
153        column_map[i->first] = static_cast<int>(columns_.size());
154        MorkColumn col(i->first, i->second);
155        columns_.push_back(col);
156      }
157    } else if (StartsWithASCII(&line[idx], "<(", true)) {
158      // Value map.
159      ParseMap(line, idx, &value_map_);
160    } else if (line[idx] == '{' || line[idx] == '[') {
161      // Table / table row.
162      ParseTable(line, idx, &column_map);
163    } else {
164      // Don't know, hopefully don't care.
165    }
166  }
167  return true;
168}
169
170// Parses a key/value map of the form
171// <(k1=v1)(k2=v2)...>
172bool MorkReader::ParseMap(const std::string& first_line,
173                          size_t start_index,
174                          StringMap* map) {
175  // If the first line is the a=c line (column map), just skip over it.
176  std::string line(first_line);
177  if (StartsWithASCII(line, "< <(a=c)>", true))
178    ReadLine(&line);
179
180  std::string key;
181  do {
182    size_t idx = start_index;
183    size_t len = line.size();
184    size_t token_start;
185
186    while (idx < len) {
187      switch (line[idx++]) {
188        case '(':
189          // Beginning of a key/value pair.
190          if (!key.empty()) {
191            DLOG(WARNING) << "unterminated key/value pair?";
192            key.clear();
193          }
194
195          token_start = idx;
196          while (idx < len && line[idx] != '=')
197            ++idx;
198          key.assign(&line[token_start], idx - token_start);
199          break;
200
201        case '=': {
202          // Beginning of the value.
203          if (key.empty()) {
204            DLOG(WARNING) << "stray value";
205            break;
206          }
207
208          token_start = idx;
209          while (idx < len && line[idx] != ')') {
210            if (line[idx] == '\\')
211              ++idx;  // Skip escaped ')' characters.
212            ++idx;
213          }
214          size_t token_end = std::min(idx, len);
215          ++idx;
216
217          std::string value = MorkUnescape(
218              std::string(&line[token_start], token_end - token_start));
219          (*map)[key] = value;
220          key.clear();
221          break;
222        }
223        case '>':
224          // End of the map.
225          DLOG_IF(WARNING, key.empty()) <<
226              "map terminates inside of key/value pair";
227          return true;
228      }
229    }
230
231    // We should start reading the next line at the beginning.
232    start_index = 0;
233  } while (ReadLine(&line));
234
235  // We ran out of lines and the map never terminated.  This probably indicates
236  // a parsing error.
237  DLOG(WARNING) << "didn't find end of key/value map";
238  return false;
239}
240
241// Parses a table row of the form [123(^45^67)..]
242// (row id 123 has the value with id 67 for the column with id 45).
243// A '^' prefix for a column or value references an entry in the column or
244// value map.  '=' is used as the separator when the value is a literal.
245void MorkReader::ParseTable(const std::string& first_line,
246                            size_t start_index,
247                            const IndexMap* column_map) {
248  std::string line(first_line);
249
250  // Column index of the cell we're parsing, minus one if invalid.
251  int column_index = -1;
252
253  // Points to the current row we're parsing inside of the |table_|, will be
254  // NULL if we're not inside a row.
255  ColumnDataList* current_row = NULL;
256
257  bool in_meta_row = false;
258
259  do {
260    size_t idx = start_index;
261    size_t len = line.size();
262
263    while (idx < len) {
264      switch (line[idx++]) {
265        case '{':
266          // This marks the beginning of a table section.  There's a lot of
267          // junk before the first row that looks like cell values but isn't.
268          // Skip to the first '['.
269          while (idx < len && line[idx] != '[') {
270            if (line[idx] == '{') {
271              in_meta_row = true;  // The meta row is enclosed in { }
272            } else if (line[idx] == '}') {
273              in_meta_row = false;
274            }
275            ++idx;
276          }
277          break;
278
279        case '[': {
280          // Start of a new row.  Consume the row id, up to the first '('.
281          // Row edits also have a table namespace, separated from the row id
282          // by a colon.  We don't make use of the namespace, but we need to
283          // make sure not to consider it part of the row id.
284          if (current_row) {
285            DLOG(WARNING) << "unterminated row?";
286            current_row = NULL;
287          }
288
289          // Check for a '-' at the start of the id.  This signifies that
290          // if the row already exists, we should delete all columns from it
291          // before adding the new values.
292          bool cut_columns;
293          if (idx < len && line[idx] == '-') {
294            cut_columns = true;
295            ++idx;
296          } else {
297            cut_columns = false;
298          }
299
300          // Locate the range of the ID.
301          size_t token_start = idx;  // Index of the first char of the token.
302          while (idx < len &&
303                 line[idx] != '(' &&
304                 line[idx] != ']' &&
305                 line[idx] != ':') {
306            ++idx;
307          }
308          size_t token_end = idx;  // Index of the char following the token.
309          while (idx < len && line[idx] != '(' && line[idx] != ']') {
310            ++idx;
311          }
312
313          if (in_meta_row) {
314            // Need to create the meta row.
315            meta_row_.resize(columns_.size());
316            current_row = &meta_row_;
317          } else {
318            // Find or create the regular row for this.
319            IDString row_id(&line[token_start], token_end - token_start);
320            RowMap::iterator found_row = table_.find(row_id);
321            if (found_row == table_.end()) {
322              // We don't already have this row, create a new one for it.
323              current_row = new ColumnDataList(columns_.size());
324              table_[row_id] = current_row;
325            } else {
326              // The row already exists and we're adding/replacing things.
327              current_row = found_row->second;
328            }
329          }
330          if (cut_columns) {
331            for (size_t i = 0; i < current_row->size(); ++i)
332              (*current_row)[i].clear();
333          }
334          break;
335        }
336
337        case ']':
338          // We're done with the row.
339          current_row = NULL;
340          in_meta_row = false;
341          break;
342
343        case '(': {
344          if (!current_row) {
345            DLOG(WARNING) << "cell value outside of row";
346            break;
347          }
348
349          bool column_is_atom;
350          if (line[idx] == '^') {
351            column_is_atom = true;
352            ++idx;  // This is not part of the column id, advance past it.
353          } else {
354            column_is_atom = false;
355          }
356          size_t token_start = idx;
357          while (idx < len && line[idx] != '^' && line[idx] != '=') {
358            if (line[idx] == '\\')
359              ++idx;  // Skip escaped characters.
360            ++idx;
361          }
362
363          size_t token_end = std::min(idx, len);
364
365          IDString column;
366          if (column_is_atom)
367            column.assign(&line[token_start], token_end - token_start);
368          else
369            column = MorkUnescape(line.substr(token_start,
370                                              token_end - token_start));
371
372          IndexMap::const_iterator found_column = column_map->find(column);
373          if (found_column == column_map->end()) {
374            DLOG(WARNING) << "Column not in column map, discarding it";
375            column_index = -1;
376          } else {
377            column_index = found_column->second;
378          }
379          break;
380        }
381
382        case '=':
383        case '^': {
384          if (column_index == -1) {
385            DLOG(WARNING) << "stray ^ or = marker";
386            break;
387          }
388
389          bool value_is_atom = (line[idx - 1] == '^');
390          size_t token_start = idx - 1;  // Include the '=' or '^' marker.
391          while (idx < len && line[idx] != ')') {
392            if (line[idx] == '\\')
393              ++idx;  // Skip escaped characters.
394            ++idx;
395          }
396          size_t token_end = std::min(idx, len);
397          ++idx;
398
399          if (value_is_atom) {
400            (*current_row)[column_index].assign(&line[token_start],
401                                                token_end - token_start);
402          } else {
403            (*current_row)[column_index] =
404                MorkUnescape(line.substr(token_start, token_end - token_start));
405          }
406          column_index = -1;
407        }
408        break;
409      }
410    }
411
412    // Start parsing the next line at the beginning.
413    start_index = 0;
414  } while (current_row && ReadLine(&line));
415}
416
417bool MorkReader::ReadLine(std::string* line) {
418  line->resize(256);
419  std::getline(stream_, *line);
420  if (stream_.eof() || stream_.bad())
421    return false;
422
423  while (!line->empty() &&  (*line)[line->size() - 1] == '\\') {
424    // There is a continuation for this line.  Read it and append.
425    std::string new_line;
426    std::getline(stream_, new_line);
427    if (stream_.eof())
428      return false;
429    line->erase(line->size() - 1);
430    line->append(new_line);
431  }
432
433  return true;
434}
435
436void MorkReader::NormalizeValue(std::string* value) const {
437  if (value->empty())
438    return;
439  MorkReader::StringMap::const_iterator i;
440  switch (value->at(0)) {
441    case '^':
442      // Hex ID, lookup the name for it in the |value_map_|.
443      i = value_map_.find(value->substr(1));
444      if (i == value_map_.end())
445        value->clear();
446      else
447        *value = i->second;
448      break;
449    case '=':
450      // Just use the literal after the equals sign.
451      value->erase(value->begin());
452      break;
453    default:
454      // Anything else is invalid.
455      value->clear();
456      break;
457  }
458}
459
460// Source:
461// http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp
462
// Columns of interest in entry (non-meta) history rows.  Each enumerator is
// the index of the matching column name in |gColumnNames|, so the two lists
// must be kept in the same order.
enum {
  kURLColumn = 0,
  kNameColumn,
  kVisitCountColumn,
  kHiddenColumn,
  kTypedColumn,
  kLastVisitColumn,
  kColumnCount  // Number of columns above.  Keep me last.
};

// Column names as they appear in the file, indexed by the enum above.
static const char * const gColumnNames[] = {
  "URL",            // kURLColumn
  "Name",           // kNameColumn
  "VisitCount",     // kVisitCountColumn
  "Hidden",         // kHiddenColumn
  "Typed",          // kTypedColumn
  "LastVisitDate"   // kLastVisitColumn
};
477
478struct TableReadClosure {
479  explicit TableReadClosure(const MorkReader& r)
480      : reader(r),
481        swap_bytes(false),
482        byte_order_column(-1) {
483    for (int i = 0; i < kColumnCount; ++i)
484      column_indexes[i] = -1;
485  }
486
487  // Backpointers to the reader and history we're operating on.
488  const MorkReader& reader;
489
490  // Whether we need to swap bytes (file format is other-endian).
491  bool swap_bytes;
492
493  // Indexes of the columns that we care about.
494  int column_indexes[kColumnCount];
495  int byte_order_column;
496};
497
498void AddToHistory(MorkReader::ColumnDataList* column_values,
499                  const TableReadClosure& data,
500                  std::vector<history::URLRow>* rows) {
501  std::string values[kColumnCount];
502
503  for (size_t i = 0; i < kColumnCount; ++i) {
504    if (data.column_indexes[i] != -1) {
505      values[i] = column_values->at(data.column_indexes[i]);
506      data.reader.NormalizeValue(&values[i]);
507      // Do not import hidden records.
508      if (i == kHiddenColumn && values[i] == "1")
509        return;
510    }
511  }
512
513  GURL url(values[kURLColumn]);
514
515  if (CanImportURL(url)) {
516    history::URLRow row(url);
517
518    string16 title;
519    if (data.swap_bytes) {
520      base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE,
521                            base::OnStringConversionError::SKIP, &title);
522    } else {
523      base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE,
524                            base::OnStringConversionError::SKIP, &title);
525    }
526    row.set_title(title);
527
528    int count = atoi(values[kVisitCountColumn].c_str());
529    if (count == 0)
530      count = 1;
531    row.set_visit_count(count);
532
533    time_t date = StringToInt64(values[kLastVisitColumn]);
534    if (date != 0)
535      row.set_last_visit(Time::FromTimeT(date/1000000));
536
537    bool is_typed = (values[kTypedColumn] == "1");
538    if (is_typed)
539      row.set_typed_count(1);
540
541    rows->push_back(row);
542  }
543}
544
545// It sets up the file stream and loops over the lines in the file to
546// parse them, then adds the resulting row set to history.
547void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) {
548  MorkReader reader;
549  reader.Read(file);
550
551  // Gather up the column ids so we don't need to find them on each row
552  TableReadClosure data(reader);
553  const MorkReader::MorkColumnList& columns = reader.columns();
554  for (size_t i = 0; i < columns.size(); ++i) {
555    for (int j = 0; j < kColumnCount; ++j)
556      if (columns[i].name == gColumnNames[j]) {
557        data.column_indexes[j] = static_cast<int>(i);
558        break;
559      }
560    if (columns[i].name == "ByteOrder")
561      data.byte_order_column = static_cast<int>(i);
562  }
563
564  // Determine the byte order from the table's meta-row.
565  const MorkReader::ColumnDataList& meta_row = reader.meta_row();
566  if (!meta_row.empty() && data.byte_order_column != -1) {
567    std::string byte_order = meta_row[data.byte_order_column];
568    if (!byte_order.empty()) {
569      // Note whether the file uses a non-native byte ordering.
570      // If it does, we'll have to swap bytes for PRUnichar values.
571      // "BE" and "LE" are the only recognized values, anything
572      // else is garbage and the file will be treated as native-endian
573      // (no swapping).
574      std::string byte_order_value(byte_order);
575      reader.NormalizeValue(&byte_order_value);
576      data.swap_bytes = (byte_order_value == "BE");
577    }
578  }
579
580  std::vector<history::URLRow> rows;
581  for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i)
582    AddToHistory(i->second, data, &rows);
583  if (!rows.empty())
584    bridge->SetHistoryItems(rows);
585}
586