// mork_reader.cc revision c407dc5cd9bdc5668497f21b26b09d988ab439de
1/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2/* ***** BEGIN LICENSE BLOCK ***** 3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 * 5 * The contents of this file are subject to the Mozilla Public License Version 6 * 1.1 (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * http://www.mozilla.org/MPL/ 9 * 10 * Software distributed under the License is distributed on an "AS IS" basis, 11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 * for the specific language governing rights and limitations under the 13 * License. 14 * 15 * The Original Code is the Mork Reader. 16 * 17 * The Initial Developer of the Original Code is 18 * Google Inc. 19 * Portions created by the Initial Developer are Copyright (C) 2006 20 * the Initial Developer. All Rights Reserved. 21 * 22 * Contributor(s): 23 * Brian Ryner <bryner@brianryner.com> (original author) 24 * 25 * Alternatively, the contents of this file may be used under the terms of 26 * either the GNU General Public License Version 2 or later (the "GPL"), or 27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 * in which case the provisions of the GPL or the LGPL are applicable instead 29 * of those above. If you wish to allow use of your version of this file only 30 * under the terms of either the GPL or the LGPL, and not to allow others to 31 * use your version of this file under the terms of the MPL, indicate your 32 * decision by deleting the provisions above and replace them with the notice 33 * and other provisions required by the GPL or the LGPL. If you do not delete 34 * the provisions above, a recipient may use your version of this file under 35 * the terms of any one of the MPL, the GPL or the LGPL. 
36 * 37 * ***** END LICENSE BLOCK ***** */ 38 39// Source: 40// http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp 41// This file has been converted to google style. 42 43#include "chrome/browser/importer/mork_reader.h" 44 45#include <algorithm> 46 47#include "base/file_path.h" 48#include "base/i18n/icu_string_conversions.h" 49#include "base/logging.h" 50#include "base/message_loop.h" 51#include "base/string_util.h" 52#include "chrome/browser/history/history_types.h" 53#include "chrome/browser/importer/firefox_importer_utils.h" 54#include "chrome/browser/importer/importer.h" 55#include "chrome/browser/importer/importer_bridge.h" 56 57using base::Time; 58 59namespace { 60 61// Convert a hex character (0-9, A-F) to its corresponding byte value. 62// Returns -1 if the character is invalid. 63inline int HexCharToInt(char c) { 64 if ('0' <= c && c <= '9') 65 return c - '0'; 66 if ('A' <= c && c <= 'F') 67 return c - 'A' + 10; 68 return -1; 69} 70 71// Unescape a Mork value. Mork uses $xx escaping to encode non-ASCII 72// characters. Additionally, '$' and '\' are backslash-escaped. 73// The result of the unescape is in returned. 74std::string MorkUnescape(const std::string& input) { 75 // We optimize for speed over space here -- size the result buffer to 76 // the size of the source, which is an upper bound on the size of the 77 // unescaped string. 78 std::string result; 79 size_t input_length = input.size(); 80 result.reserve(input_length); 81 82 for (size_t i = 0; i < input_length; i++) { 83 char c = input[i]; 84 if (c == '\\') { 85 // Escaped literal, slip the backslash, append the next character. 86 i++; 87 if (i < input_length) 88 result.push_back(input[i]); 89 } else if (c == '$') { 90 // Dollar sign denotes a hex character. 91 if (i < input_length - 2) { 92 // Would be nice to use ToInteger() here, but it currently 93 // requires a null-terminated string. 
94 int first = HexCharToInt(input[++i]); 95 int second = HexCharToInt(input[++i]); 96 if (first >= 0 && second >= 0) 97 result.push_back((first << 4) | second); 98 } 99 } else { 100 // Regular character, just append. 101 result.push_back(input[i]); 102 } 103 } 104 return result; 105} 106 107} // namespace 108 109MorkReader::MorkReader() { 110} 111 112MorkReader::~MorkReader() { 113 // Need to delete all the pointers to vectors we have in the table. 114 for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i) 115 delete i->second; 116} 117 118bool MorkReader::Read(const FilePath& path) { 119 stream_.open(path.value().c_str()); 120 if (!stream_.is_open()) 121 return false; 122 123 std::string line; 124 if (!ReadLine(&line) || 125 line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0) 126 return false; // Unexpected file format. 127 128 IndexMap column_map; 129 while (ReadLine(&line)) { 130 // Trim off leading spaces 131 size_t idx = 0; 132 size_t len = line.size(); 133 while (idx < len && line[idx] == ' ') 134 ++idx; 135 if (idx >= len) 136 continue; 137 138 // Look at the line to figure out what section type this is 139 if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) { 140 // Column map. We begin by creating a hash of column id to column name. 141 StringMap column_name_map; 142 ParseMap(line, idx, &column_name_map); 143 144 // Now that we have the list of columns, we put them into a flat array. 145 // Rows will have value arrays of the same size, with indexes that 146 // correspond to the columns array. As we insert each column into the 147 // array, we also make an entry in columnMap so that we can look up the 148 // index given the column id. 
149 columns_.reserve(column_name_map.size()); 150 151 for (StringMap::const_iterator i = column_name_map.begin(); 152 i != column_name_map.end(); ++i) { 153 column_map[i->first] = static_cast<int>(columns_.size()); 154 MorkColumn col(i->first, i->second); 155 columns_.push_back(col); 156 } 157 } else if (StartsWithASCII(&line[idx], "<(", true)) { 158 // Value map. 159 ParseMap(line, idx, &value_map_); 160 } else if (line[idx] == '{' || line[idx] == '[') { 161 // Table / table row. 162 ParseTable(line, idx, &column_map); 163 } else { 164 // Don't know, hopefully don't care. 165 } 166 } 167 return true; 168} 169 170// Parses a key/value map of the form 171// <(k1=v1)(k2=v2)...> 172bool MorkReader::ParseMap(const std::string& first_line, 173 size_t start_index, 174 StringMap* map) { 175 // If the first line is the a=c line (column map), just skip over it. 176 std::string line(first_line); 177 if (StartsWithASCII(line, "< <(a=c)>", true)) 178 ReadLine(&line); 179 180 std::string key; 181 do { 182 size_t idx = start_index; 183 size_t len = line.size(); 184 size_t token_start; 185 186 while (idx < len) { 187 switch (line[idx++]) { 188 case '(': 189 // Beginning of a key/value pair. 190 if (!key.empty()) { 191 DLOG(WARNING) << "unterminated key/value pair?"; 192 key.clear(); 193 } 194 195 token_start = idx; 196 while (idx < len && line[idx] != '=') 197 ++idx; 198 key.assign(&line[token_start], idx - token_start); 199 break; 200 201 case '=': { 202 // Beginning of the value. 203 if (key.empty()) { 204 DLOG(WARNING) << "stray value"; 205 break; 206 } 207 208 token_start = idx; 209 while (idx < len && line[idx] != ')') { 210 if (line[idx] == '\\') 211 ++idx; // Skip escaped ')' characters. 212 ++idx; 213 } 214 size_t token_end = std::min(idx, len); 215 ++idx; 216 217 std::string value = MorkUnescape( 218 std::string(&line[token_start], token_end - token_start)); 219 (*map)[key] = value; 220 key.clear(); 221 break; 222 } 223 case '>': 224 // End of the map. 
225 DLOG_IF(WARNING, key.empty()) << 226 "map terminates inside of key/value pair"; 227 return true; 228 } 229 } 230 231 // We should start reading the next line at the beginning. 232 start_index = 0; 233 } while (ReadLine(&line)); 234 235 // We ran out of lines and the map never terminated. This probably indicates 236 // a parsing error. 237 DLOG(WARNING) << "didn't find end of key/value map"; 238 return false; 239} 240 241// Parses a table row of the form [123(^45^67)..] 242// (row id 123 has the value with id 67 for the column with id 45). 243// A '^' prefix for a column or value references an entry in the column or 244// value map. '=' is used as the separator when the value is a literal. 245void MorkReader::ParseTable(const std::string& first_line, 246 size_t start_index, 247 const IndexMap* column_map) { 248 std::string line(first_line); 249 250 // Column index of the cell we're parsing, minus one if invalid. 251 int column_index = -1; 252 253 // Points to the current row we're parsing inside of the |table_|, will be 254 // NULL if we're not inside a row. 255 ColumnDataList* current_row = NULL; 256 257 bool in_meta_row = false; 258 259 do { 260 size_t idx = start_index; 261 size_t len = line.size(); 262 263 while (idx < len) { 264 switch (line[idx++]) { 265 case '{': 266 // This marks the beginning of a table section. There's a lot of 267 // junk before the first row that looks like cell values but isn't. 268 // Skip to the first '['. 269 while (idx < len && line[idx] != '[') { 270 if (line[idx] == '{') { 271 in_meta_row = true; // The meta row is enclosed in { } 272 } else if (line[idx] == '}') { 273 in_meta_row = false; 274 } 275 ++idx; 276 } 277 break; 278 279 case '[': { 280 // Start of a new row. Consume the row id, up to the first '('. 281 // Row edits also have a table namespace, separated from the row id 282 // by a colon. We don't make use of the namespace, but we need to 283 // make sure not to consider it part of the row id. 
284 if (current_row) { 285 DLOG(WARNING) << "unterminated row?"; 286 current_row = NULL; 287 } 288 289 // Check for a '-' at the start of the id. This signifies that 290 // if the row already exists, we should delete all columns from it 291 // before adding the new values. 292 bool cut_columns; 293 if (idx < len && line[idx] == '-') { 294 cut_columns = true; 295 ++idx; 296 } else { 297 cut_columns = false; 298 } 299 300 // Locate the range of the ID. 301 size_t token_start = idx; // Index of the first char of the token. 302 while (idx < len && 303 line[idx] != '(' && 304 line[idx] != ']' && 305 line[idx] != ':') { 306 ++idx; 307 } 308 size_t token_end = idx; // Index of the char following the token. 309 while (idx < len && line[idx] != '(' && line[idx] != ']') { 310 ++idx; 311 } 312 313 if (in_meta_row) { 314 // Need to create the meta row. 315 meta_row_.resize(columns_.size()); 316 current_row = &meta_row_; 317 } else { 318 // Find or create the regular row for this. 319 IDString row_id(&line[token_start], token_end - token_start); 320 RowMap::iterator found_row = table_.find(row_id); 321 if (found_row == table_.end()) { 322 // We don't already have this row, create a new one for it. 323 current_row = new ColumnDataList(columns_.size()); 324 table_[row_id] = current_row; 325 } else { 326 // The row already exists and we're adding/replacing things. 327 current_row = found_row->second; 328 } 329 } 330 if (cut_columns) { 331 for (size_t i = 0; i < current_row->size(); ++i) 332 (*current_row)[i].clear(); 333 } 334 break; 335 } 336 337 case ']': 338 // We're done with the row. 339 current_row = NULL; 340 in_meta_row = false; 341 break; 342 343 case '(': { 344 if (!current_row) { 345 DLOG(WARNING) << "cell value outside of row"; 346 break; 347 } 348 349 bool column_is_atom; 350 if (line[idx] == '^') { 351 column_is_atom = true; 352 ++idx; // This is not part of the column id, advance past it. 
353 } else { 354 column_is_atom = false; 355 } 356 size_t token_start = idx; 357 while (idx < len && line[idx] != '^' && line[idx] != '=') { 358 if (line[idx] == '\\') 359 ++idx; // Skip escaped characters. 360 ++idx; 361 } 362 363 size_t token_end = std::min(idx, len); 364 365 IDString column; 366 if (column_is_atom) 367 column.assign(&line[token_start], token_end - token_start); 368 else 369 column = MorkUnescape(line.substr(token_start, 370 token_end - token_start)); 371 372 IndexMap::const_iterator found_column = column_map->find(column); 373 if (found_column == column_map->end()) { 374 DLOG(WARNING) << "Column not in column map, discarding it"; 375 column_index = -1; 376 } else { 377 column_index = found_column->second; 378 } 379 break; 380 } 381 382 case '=': 383 case '^': { 384 if (column_index == -1) { 385 DLOG(WARNING) << "stray ^ or = marker"; 386 break; 387 } 388 389 bool value_is_atom = (line[idx - 1] == '^'); 390 size_t token_start = idx - 1; // Include the '=' or '^' marker. 391 while (idx < len && line[idx] != ')') { 392 if (line[idx] == '\\') 393 ++idx; // Skip escaped characters. 394 ++idx; 395 } 396 size_t token_end = std::min(idx, len); 397 ++idx; 398 399 if (value_is_atom) { 400 (*current_row)[column_index].assign(&line[token_start], 401 token_end - token_start); 402 } else { 403 (*current_row)[column_index] = 404 MorkUnescape(line.substr(token_start, token_end - token_start)); 405 } 406 column_index = -1; 407 } 408 break; 409 } 410 } 411 412 // Start parsing the next line at the beginning. 413 start_index = 0; 414 } while (current_row && ReadLine(&line)); 415} 416 417bool MorkReader::ReadLine(std::string* line) { 418 line->resize(256); 419 std::getline(stream_, *line); 420 if (stream_.eof() || stream_.bad()) 421 return false; 422 423 while (!line->empty() && (*line)[line->size() - 1] == '\\') { 424 // There is a continuation for this line. Read it and append. 
425 std::string new_line; 426 std::getline(stream_, new_line); 427 if (stream_.eof()) 428 return false; 429 line->erase(line->size() - 1); 430 line->append(new_line); 431 } 432 433 return true; 434} 435 436void MorkReader::NormalizeValue(std::string* value) const { 437 if (value->empty()) 438 return; 439 MorkReader::StringMap::const_iterator i; 440 switch (value->at(0)) { 441 case '^': 442 // Hex ID, lookup the name for it in the |value_map_|. 443 i = value_map_.find(value->substr(1)); 444 if (i == value_map_.end()) 445 value->clear(); 446 else 447 *value = i->second; 448 break; 449 case '=': 450 // Just use the literal after the equals sign. 451 value->erase(value->begin()); 452 break; 453 default: 454 // Anything else is invalid. 455 value->clear(); 456 break; 457 } 458} 459 460// Source: 461// http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp 462 463// Columns for entry (non-meta) history rows 464enum { 465 kURLColumn, 466 kNameColumn, 467 kVisitCountColumn, 468 kHiddenColumn, 469 kTypedColumn, 470 kLastVisitColumn, 471 kColumnCount // Keep me last. 472}; 473 474static const char * const gColumnNames[] = { 475 "URL", "Name", "VisitCount", "Hidden", "Typed", "LastVisitDate" 476}; 477 478struct TableReadClosure { 479 explicit TableReadClosure(const MorkReader& r) 480 : reader(r), 481 swap_bytes(false), 482 byte_order_column(-1) { 483 for (int i = 0; i < kColumnCount; ++i) 484 column_indexes[i] = -1; 485 } 486 487 // Backpointers to the reader and history we're operating on. 488 const MorkReader& reader; 489 490 // Whether we need to swap bytes (file format is other-endian). 491 bool swap_bytes; 492 493 // Indexes of the columns that we care about. 
494 int column_indexes[kColumnCount]; 495 int byte_order_column; 496}; 497 498void AddToHistory(MorkReader::ColumnDataList* column_values, 499 const TableReadClosure& data, 500 std::vector<history::URLRow>* rows) { 501 std::string values[kColumnCount]; 502 503 for (size_t i = 0; i < kColumnCount; ++i) { 504 if (data.column_indexes[i] != -1) { 505 values[i] = column_values->at(data.column_indexes[i]); 506 data.reader.NormalizeValue(&values[i]); 507 // Do not import hidden records. 508 if (i == kHiddenColumn && values[i] == "1") 509 return; 510 } 511 } 512 513 GURL url(values[kURLColumn]); 514 515 if (CanImportURL(url)) { 516 history::URLRow row(url); 517 518 string16 title; 519 if (data.swap_bytes) { 520 base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE, 521 base::OnStringConversionError::SKIP, &title); 522 } else { 523 base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE, 524 base::OnStringConversionError::SKIP, &title); 525 } 526 row.set_title(title); 527 528 int count = atoi(values[kVisitCountColumn].c_str()); 529 if (count == 0) 530 count = 1; 531 row.set_visit_count(count); 532 533 time_t date = StringToInt64(values[kLastVisitColumn]); 534 if (date != 0) 535 row.set_last_visit(Time::FromTimeT(date/1000000)); 536 537 bool is_typed = (values[kTypedColumn] == "1"); 538 if (is_typed) 539 row.set_typed_count(1); 540 541 rows->push_back(row); 542 } 543} 544 545// It sets up the file stream and loops over the lines in the file to 546// parse them, then adds the resulting row set to history. 
547void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) { 548 MorkReader reader; 549 reader.Read(file); 550 551 // Gather up the column ids so we don't need to find them on each row 552 TableReadClosure data(reader); 553 const MorkReader::MorkColumnList& columns = reader.columns(); 554 for (size_t i = 0; i < columns.size(); ++i) { 555 for (int j = 0; j < kColumnCount; ++j) 556 if (columns[i].name == gColumnNames[j]) { 557 data.column_indexes[j] = static_cast<int>(i); 558 break; 559 } 560 if (columns[i].name == "ByteOrder") 561 data.byte_order_column = static_cast<int>(i); 562 } 563 564 // Determine the byte order from the table's meta-row. 565 const MorkReader::ColumnDataList& meta_row = reader.meta_row(); 566 if (!meta_row.empty() && data.byte_order_column != -1) { 567 std::string byte_order = meta_row[data.byte_order_column]; 568 if (!byte_order.empty()) { 569 // Note whether the file uses a non-native byte ordering. 570 // If it does, we'll have to swap bytes for PRUnichar values. 571 // "BE" and "LE" are the only recognized values, anything 572 // else is garbage and the file will be treated as native-endian 573 // (no swapping). 574 std::string byte_order_value(byte_order); 575 reader.NormalizeValue(&byte_order_value); 576 data.swap_bytes = (byte_order_value == "BE"); 577 } 578 } 579 580 std::vector<history::URLRow> rows; 581 for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i) 582 AddToHistory(i->second, data, &rows); 583 if (!rows.empty()) 584 bridge->SetHistoryItems(rows); 585} 586