mork_reader.cc revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
1/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2/* ***** BEGIN LICENSE BLOCK ***** 3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 * 5 * The contents of this file are subject to the Mozilla Public License Version 6 * 1.1 (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * http://www.mozilla.org/MPL/ 9 * 10 * Software distributed under the License is distributed on an "AS IS" basis, 11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 * for the specific language governing rights and limitations under the 13 * License. 14 * 15 * The Original Code is the Mork Reader. 16 * 17 * The Initial Developer of the Original Code is 18 * Google Inc. 19 * Portions created by the Initial Developer are Copyright (C) 2006 20 * the Initial Developer. All Rights Reserved. 21 * 22 * Contributor(s): 23 * Brian Ryner <bryner@brianryner.com> (original author) 24 * 25 * Alternatively, the contents of this file may be used under the terms of 26 * either the GNU General Public License Version 2 or later (the "GPL"), or 27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 * in which case the provisions of the GPL or the LGPL are applicable instead 29 * of those above. If you wish to allow use of your version of this file only 30 * under the terms of either the GPL or the LGPL, and not to allow others to 31 * use your version of this file under the terms of the MPL, indicate your 32 * decision by deleting the provisions above and replace them with the notice 33 * and other provisions required by the GPL or the LGPL. If you do not delete 34 * the provisions above, a recipient may use your version of this file under 35 * the terms of any one of the MPL, the GPL or the LGPL. 36 * 37 * ***** END LICENSE BLOCK ***** */ 38 39// Source: 40// http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp 41// This file has been converted to google style. 42 43#include "chrome/browser/importer/mork_reader.h" 44 45#include <algorithm> 46 47#include "base/file_path.h" 48#include "base/i18n/icu_string_conversions.h" 49#include "base/logging.h" 50#include "base/message_loop.h" 51#include "base/string_number_conversions.h" 52#include "base/string_util.h" 53#include "base/values.h" 54#include "chrome/browser/history/history_types.h" 55#include "chrome/browser/importer/firefox_importer_utils.h" 56#include "chrome/browser/importer/importer.h" 57#include "chrome/browser/importer/importer_bridge.h" 58 59using base::Time; 60 61namespace { 62 63// Convert a hex character (0-9, A-F) to its corresponding byte value. 64// Returns -1 if the character is invalid. 65inline int HexCharToInt(char c) { 66 if ('0' <= c && c <= '9') 67 return c - '0'; 68 if ('A' <= c && c <= 'F') 69 return c - 'A' + 10; 70 return -1; 71} 72 73// Unescape a Mork value. Mork uses $xx escaping to encode non-ASCII 74// characters. Additionally, '$' and '\' are backslash-escaped. 75// The result of the unescape is in returned. 76std::string MorkUnescape(const std::string& input) { 77 // We optimize for speed over space here -- size the result buffer to 78 // the size of the source, which is an upper bound on the size of the 79 // unescaped string. 80 std::string result; 81 size_t input_length = input.size(); 82 result.reserve(input_length); 83 84 for (size_t i = 0; i < input_length; i++) { 85 char c = input[i]; 86 if (c == '\\') { 87 // Escaped literal, slip the backslash, append the next character. 88 i++; 89 if (i < input_length) 90 result.push_back(input[i]); 91 } else if (c == '$') { 92 // Dollar sign denotes a hex character. 93 if (i < input_length - 2) { 94 // Would be nice to use ToInteger() here, but it currently 95 // requires a null-terminated string. 96 int first = HexCharToInt(input[++i]); 97 int second = HexCharToInt(input[++i]); 98 if (first >= 0 && second >= 0) 99 result.push_back((first << 4) | second); 100 } 101 } else { 102 // Regular character, just append. 103 result.push_back(input[i]); 104 } 105 } 106 return result; 107} 108 109} // namespace 110 111MorkReader::MorkReader() { 112} 113 114MorkReader::~MorkReader() { 115 // Need to delete all the pointers to vectors we have in the table. 116 for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i) 117 delete i->second; 118} 119 120bool MorkReader::Read(const FilePath& path) { 121 stream_.open(path.value().c_str()); 122 if (!stream_.is_open()) 123 return false; 124 125 std::string line; 126 if (!ReadLine(&line) || 127 line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0) 128 return false; // Unexpected file format. 129 130 IndexMap column_map; 131 while (ReadLine(&line)) { 132 // Trim off leading spaces 133 size_t idx = 0; 134 size_t len = line.size(); 135 while (idx < len && line[idx] == ' ') 136 ++idx; 137 if (idx >= len) 138 continue; 139 140 // Look at the line to figure out what section type this is 141 if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) { 142 // Column map. We begin by creating a hash of column id to column name. 143 StringMap column_name_map; 144 ParseMap(line, idx, &column_name_map); 145 146 // Now that we have the list of columns, we put them into a flat array. 147 // Rows will have value arrays of the same size, with indexes that 148 // correspond to the columns array. As we insert each column into the 149 // array, we also make an entry in columnMap so that we can look up the 150 // index given the column id. 151 columns_.reserve(column_name_map.size()); 152 153 for (StringMap::const_iterator i = column_name_map.begin(); 154 i != column_name_map.end(); ++i) { 155 column_map[i->first] = static_cast<int>(columns_.size()); 156 MorkColumn col(i->first, i->second); 157 columns_.push_back(col); 158 } 159 } else if (StartsWithASCII(&line[idx], "<(", true)) { 160 // Value map. 161 ParseMap(line, idx, &value_map_); 162 } else if (line[idx] == '{' || line[idx] == '[') { 163 // Table / table row. 164 ParseTable(line, idx, &column_map); 165 } else { 166 // Don't know, hopefully don't care. 167 } 168 } 169 return true; 170} 171 172// Parses a key/value map of the form 173// <(k1=v1)(k2=v2)...> 174bool MorkReader::ParseMap(const std::string& first_line, 175 size_t start_index, 176 StringMap* map) { 177 // If the first line is the a=c line (column map), just skip over it. 178 std::string line(first_line); 179 if (StartsWithASCII(line, "< <(a=c)>", true)) 180 ReadLine(&line); 181 182 std::string key; 183 do { 184 size_t idx = start_index; 185 size_t len = line.size(); 186 size_t token_start; 187 188 while (idx < len) { 189 switch (line[idx++]) { 190 case '(': 191 // Beginning of a key/value pair. 192 if (!key.empty()) { 193 DLOG(WARNING) << "unterminated key/value pair?"; 194 key.clear(); 195 } 196 197 token_start = idx; 198 while (idx < len && line[idx] != '=') 199 ++idx; 200 key.assign(&line[token_start], idx - token_start); 201 break; 202 203 case '=': { 204 // Beginning of the value. 205 if (key.empty()) { 206 DLOG(WARNING) << "stray value"; 207 break; 208 } 209 210 token_start = idx; 211 while (idx < len && line[idx] != ')') { 212 if (line[idx] == '\\') 213 ++idx; // Skip escaped ')' characters. 214 ++idx; 215 } 216 size_t token_end = std::min(idx, len); 217 ++idx; 218 219 std::string value = MorkUnescape( 220 std::string(&line[token_start], token_end - token_start)); 221 (*map)[key] = value; 222 key.clear(); 223 break; 224 } 225 case '>': 226 // End of the map. 227 DLOG_IF(WARNING, key.empty()) << 228 "map terminates inside of key/value pair"; 229 return true; 230 } 231 } 232 233 // We should start reading the next line at the beginning. 234 start_index = 0; 235 } while (ReadLine(&line)); 236 237 // We ran out of lines and the map never terminated. This probably indicates 238 // a parsing error. 239 DLOG(WARNING) << "didn't find end of key/value map"; 240 return false; 241} 242 243// Parses a table row of the form [123(^45^67)..] 244// (row id 123 has the value with id 67 for the column with id 45). 245// A '^' prefix for a column or value references an entry in the column or 246// value map. '=' is used as the separator when the value is a literal. 247void MorkReader::ParseTable(const std::string& first_line, 248 size_t start_index, 249 const IndexMap* column_map) { 250 std::string line(first_line); 251 252 // Column index of the cell we're parsing, minus one if invalid. 253 int column_index = -1; 254 255 // Points to the current row we're parsing inside of the |table_|, will be 256 // NULL if we're not inside a row. 257 ColumnDataList* current_row = NULL; 258 259 bool in_meta_row = false; 260 261 do { 262 size_t idx = start_index; 263 size_t len = line.size(); 264 265 while (idx < len) { 266 switch (line[idx++]) { 267 case '{': 268 // This marks the beginning of a table section. There's a lot of 269 // junk before the first row that looks like cell values but isn't. 270 // Skip to the first '['. 271 while (idx < len && line[idx] != '[') { 272 if (line[idx] == '{') { 273 in_meta_row = true; // The meta row is enclosed in { } 274 } else if (line[idx] == '}') { 275 in_meta_row = false; 276 } 277 ++idx; 278 } 279 break; 280 281 case '[': { 282 // Start of a new row. Consume the row id, up to the first '('. 283 // Row edits also have a table namespace, separated from the row id 284 // by a colon. We don't make use of the namespace, but we need to 285 // make sure not to consider it part of the row id. 286 if (current_row) { 287 DLOG(WARNING) << "unterminated row?"; 288 current_row = NULL; 289 } 290 291 // Check for a '-' at the start of the id. This signifies that 292 // if the row already exists, we should delete all columns from it 293 // before adding the new values. 294 bool cut_columns; 295 if (idx < len && line[idx] == '-') { 296 cut_columns = true; 297 ++idx; 298 } else { 299 cut_columns = false; 300 } 301 302 // Locate the range of the ID. 303 size_t token_start = idx; // Index of the first char of the token. 304 while (idx < len && 305 line[idx] != '(' && 306 line[idx] != ']' && 307 line[idx] != ':') { 308 ++idx; 309 } 310 size_t token_end = idx; // Index of the char following the token. 311 while (idx < len && line[idx] != '(' && line[idx] != ']') { 312 ++idx; 313 } 314 315 if (in_meta_row) { 316 // Need to create the meta row. 317 meta_row_.resize(columns_.size()); 318 current_row = &meta_row_; 319 } else { 320 // Find or create the regular row for this. 321 IDString row_id(&line[token_start], token_end - token_start); 322 RowMap::iterator found_row = table_.find(row_id); 323 if (found_row == table_.end()) { 324 // We don't already have this row, create a new one for it. 325 current_row = new ColumnDataList(columns_.size()); 326 table_[row_id] = current_row; 327 } else { 328 // The row already exists and we're adding/replacing things. 329 current_row = found_row->second; 330 } 331 } 332 if (cut_columns) { 333 for (size_t i = 0; i < current_row->size(); ++i) 334 (*current_row)[i].clear(); 335 } 336 break; 337 } 338 339 case ']': 340 // We're done with the row. 341 current_row = NULL; 342 in_meta_row = false; 343 break; 344 345 case '(': { 346 if (!current_row) { 347 DLOG(WARNING) << "cell value outside of row"; 348 break; 349 } 350 351 bool column_is_atom; 352 if (line[idx] == '^') { 353 column_is_atom = true; 354 ++idx; // This is not part of the column id, advance past it. 355 } else { 356 column_is_atom = false; 357 } 358 size_t token_start = idx; 359 while (idx < len && line[idx] != '^' && line[idx] != '=') { 360 if (line[idx] == '\\') 361 ++idx; // Skip escaped characters. 362 ++idx; 363 } 364 365 size_t token_end = std::min(idx, len); 366 367 IDString column; 368 if (column_is_atom) 369 column.assign(&line[token_start], token_end - token_start); 370 else 371 column = MorkUnescape(line.substr(token_start, 372 token_end - token_start)); 373 374 IndexMap::const_iterator found_column = column_map->find(column); 375 if (found_column == column_map->end()) { 376 DLOG(WARNING) << "Column not in column map, discarding it"; 377 column_index = -1; 378 } else { 379 column_index = found_column->second; 380 } 381 break; 382 } 383 384 case '=': 385 case '^': { 386 if (column_index == -1) { 387 DLOG(WARNING) << "stray ^ or = marker"; 388 break; 389 } 390 391 bool value_is_atom = (line[idx - 1] == '^'); 392 size_t token_start = idx - 1; // Include the '=' or '^' marker. 393 while (idx < len && line[idx] != ')') { 394 if (line[idx] == '\\') 395 ++idx; // Skip escaped characters. 396 ++idx; 397 } 398 size_t token_end = std::min(idx, len); 399 ++idx; 400 401 if (value_is_atom) { 402 (*current_row)[column_index].assign(&line[token_start], 403 token_end - token_start); 404 } else { 405 (*current_row)[column_index] = 406 MorkUnescape(line.substr(token_start, token_end - token_start)); 407 } 408 column_index = -1; 409 } 410 break; 411 } 412 } 413 414 // Start parsing the next line at the beginning. 415 start_index = 0; 416 } while (current_row && ReadLine(&line)); 417} 418 419bool MorkReader::ReadLine(std::string* line) { 420 line->resize(256); 421 std::getline(stream_, *line); 422 if (stream_.eof() || stream_.bad()) 423 return false; 424 425 while (!line->empty() && (*line)[line->size() - 1] == '\\') { 426 // There is a continuation for this line. Read it and append. 427 std::string new_line; 428 std::getline(stream_, new_line); 429 if (stream_.eof()) 430 return false; 431 line->erase(line->size() - 1); 432 line->append(new_line); 433 } 434 435 return true; 436} 437 438void MorkReader::NormalizeValue(std::string* value) const { 439 if (value->empty()) 440 return; 441 MorkReader::StringMap::const_iterator i; 442 switch (value->at(0)) { 443 case '^': 444 // Hex ID, lookup the name for it in the |value_map_|. 445 i = value_map_.find(value->substr(1)); 446 if (i == value_map_.end()) 447 value->clear(); 448 else 449 *value = i->second; 450 break; 451 case '=': 452 // Just use the literal after the equals sign. 453 value->erase(value->begin()); 454 break; 455 default: 456 // Anything else is invalid. 457 value->clear(); 458 break; 459 } 460} 461 462// Source: 463// http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp 464 465// Columns for entry (non-meta) history rows 466enum { 467 kURLColumn, 468 kNameColumn, 469 kVisitCountColumn, 470 kHiddenColumn, 471 kTypedColumn, 472 kLastVisitColumn, 473 kColumnCount // Keep me last. 474}; 475 476static const char * const gColumnNames[] = { 477 "URL", "Name", "VisitCount", "Hidden", "Typed", "LastVisitDate" 478}; 479 480struct TableReadClosure { 481 explicit TableReadClosure(const MorkReader& r) 482 : reader(r), 483 swap_bytes(false), 484 byte_order_column(-1) { 485 for (int i = 0; i < kColumnCount; ++i) 486 column_indexes[i] = -1; 487 } 488 489 // Backpointers to the reader and history we're operating on. 490 const MorkReader& reader; 491 492 // Whether we need to swap bytes (file format is other-endian). 493 bool swap_bytes; 494 495 // Indexes of the columns that we care about. 496 int column_indexes[kColumnCount]; 497 int byte_order_column; 498}; 499 500void AddToHistory(MorkReader::ColumnDataList* column_values, 501 const TableReadClosure& data, 502 std::vector<history::URLRow>* rows) { 503 std::string values[kColumnCount]; 504 505 for (size_t i = 0; i < kColumnCount; ++i) { 506 if (data.column_indexes[i] != -1) { 507 values[i] = column_values->at(data.column_indexes[i]); 508 data.reader.NormalizeValue(&values[i]); 509 // Do not import hidden records. 510 if (i == kHiddenColumn && values[i] == "1") 511 return; 512 } 513 } 514 515 GURL url(values[kURLColumn]); 516 517 if (CanImportURL(url)) { 518 history::URLRow row(url); 519 520 string16 title; 521 if (data.swap_bytes) { 522 base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE, 523 base::OnStringConversionError::SKIP, &title); 524 } else { 525 base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE, 526 base::OnStringConversionError::SKIP, &title); 527 } 528 row.set_title(title); 529 530 int count = atoi(values[kVisitCountColumn].c_str()); 531 if (count == 0) 532 count = 1; 533 row.set_visit_count(count); 534 535 int64 date; 536 base::StringToInt64(values[kLastVisitColumn], &date); 537 if (date != 0) 538 row.set_last_visit(Time::FromTimeT(date / 1000000)); 539 540 bool is_typed = (values[kTypedColumn] == "1"); 541 if (is_typed) 542 row.set_typed_count(1); 543 544 rows->push_back(row); 545 } 546} 547 548// It sets up the file stream and loops over the lines in the file to 549// parse them, then adds the resulting row set to history. 550void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) { 551 MorkReader reader; 552 reader.Read(file); 553 554 // Gather up the column ids so we don't need to find them on each row 555 TableReadClosure data(reader); 556 const MorkReader::MorkColumnList& columns = reader.columns(); 557 for (size_t i = 0; i < columns.size(); ++i) { 558 for (int j = 0; j < kColumnCount; ++j) 559 if (columns[i].name == gColumnNames[j]) { 560 data.column_indexes[j] = static_cast<int>(i); 561 break; 562 } 563 if (columns[i].name == "ByteOrder") 564 data.byte_order_column = static_cast<int>(i); 565 } 566 567 // Determine the byte order from the table's meta-row. 568 const MorkReader::ColumnDataList& meta_row = reader.meta_row(); 569 if (!meta_row.empty() && data.byte_order_column != -1) { 570 std::string byte_order = meta_row[data.byte_order_column]; 571 if (!byte_order.empty()) { 572 // Note whether the file uses a non-native byte ordering. 573 // If it does, we'll have to swap bytes for PRUnichar values. 574 // "BE" and "LE" are the only recognized values, anything 575 // else is garbage and the file will be treated as native-endian 576 // (no swapping). 577 std::string byte_order_value(byte_order); 578 reader.NormalizeValue(&byte_order_value); 579 data.swap_bytes = (byte_order_value == "BE"); 580 } 581 } 582 583 std::vector<history::URLRow> rows; 584 for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i) 585 AddToHistory(i->second, data, &rows); 586 if (!rows.empty()) 587 bridge->SetHistoryItems(rows, history::SOURCE_FIREFOX_IMPORTED); 588} 589