page_set_archive_info.py revision 116680a4aac90f2aa7413d9095a592090648e557
1# Copyright 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import json
6import logging
7import os
8import re
9import shutil
10
11from telemetry.util import cloud_storage
12
13
class PageSetArchiveInfo(object):
  """Bookkeeping for the Web Page Replay (.wpr) archives of a page set.

  The metadata is a JSON file whose 'archives' entry maps a .wpr file name
  (relative to the directory of the metadata file) to the list of page names
  recorded in that archive. This class keeps that mapping, the reverse mapping
  from page name to .wpr file, and knows how to register a freshly recorded
  archive and rewrite the metadata file.
  """

  def __init__(self, file_path, data, ignore_archive=False):
    """Builds the archive info from already parsed metadata.

    Args:
      file_path: Path of the metadata file; .wpr file names are resolved
          relative to its directory, which is created if it is missing.
      data: Parsed metadata; must contain an 'archives' dict mapping .wpr file
          names to lists of page names. The dict is kept by reference and is
          modified when new recordings are added.
      ignore_archive: When True, no attempt is made to fetch the .wpr files
          through cloud_storage.
    """
    self._file_path = file_path
    self._base_dir = os.path.dirname(file_path)

    # Ensure directory exists. A bare file name has an empty dirname, meaning
    # the current directory; os.makedirs('') would raise, so skip it.
    if self._base_dir and not os.path.exists(self._base_dir):
      os.makedirs(self._base_dir)

    # Download all .wpr files.
    if not ignore_archive:
      self._DownloadArchives(data['archives'])

    # Map from the relative path (as it appears in the metadata file) of the
    # .wpr file to a list of page names it supports.
    self._wpr_file_to_page_names = data['archives']

    # Map from the page name to a relative path (as it appears in the metadata
    # file) of the .wpr file.
    self._page_name_to_wpr_file = dict()
    # Find out the wpr file names for each page.
    for wpr_file, page_names in data['archives'].items():
      for page_name in page_names:
        self._page_name_to_wpr_file[page_name] = wpr_file

    # Path of a recording in progress; while set, it overrides every lookup
    # done through WprFilePathForPage.
    self.temp_target_wpr_file_path = None

  def _DownloadArchives(self, wpr_files):
    """Fetches every listed .wpr file via cloud_storage.GetIfChanged.

    Credential and permission errors are logged instead of propagated.

    Args:
      wpr_files: Iterable of .wpr file names relative to the metadata file.
    """
    # TODO(tbarzic): Remove this once http://crbug.com/351143 is diagnosed.
    log_cloud_storage_exception = True
    for wpr_file in wpr_files:
      archive_path = self._WprFileNameToPath(wpr_file)
      try:
        cloud_storage.GetIfChanged(archive_path)
      except (cloud_storage.CredentialsError,
              cloud_storage.PermissionError) as e:
        if os.path.exists(archive_path):
          # If the archive exists, assume the user recorded their own and
          # simply warn.
          logging.warning('Need credentials to update WPR archive: %s',
                          archive_path)
        elif log_cloud_storage_exception:
          # Log access errors only once, as they should stay the same in other
          # iterations.
          log_cloud_storage_exception = False
          logging.warning('Error getting WPR archive %s: %s ',
                          archive_path, e)
          logging.info('HOME: "%s"; USER: "%s"',
                       os.environ.get('HOME', ''), os.environ.get('USER', ''))

  @classmethod
  def FromFile(cls, file_path, ignore_archive=False):
    """Creates the archive info from a JSON metadata file.

    Args:
      file_path: Path of the metadata file.
      ignore_archive: Forwarded to the constructor.

    Returns:
      A new instance. If |file_path| does not exist, a warning is logged and
      the instance starts with no archives.
    """
    if os.path.exists(file_path):
      with open(file_path, 'r') as f:
        data = json.load(f)
      return cls(file_path, data, ignore_archive=ignore_archive)
    # TODO(tbarzic): Remove this once http://crbug.com/351143 is diagnosed.
    logging.warning('Page set archives not found: %s', file_path)
    return cls(file_path, {'archives': {}}, ignore_archive=ignore_archive)

  def WprFilePathForPage(self, page):
    """Returns the absolute path of the archive to use for |page|.

    Args:
      page: Object exposing display_name and url attributes.

    Returns:
      The temporary recording path if one is set; otherwise the path of the
      archive registered for the page, or None if there is none.
    """
    if self.temp_target_wpr_file_path:
      return self.temp_target_wpr_file_path
    wpr_file = self._page_name_to_wpr_file.get(page.display_name, None)
    if wpr_file is None:
      # Some old page sets always use the URL to identify a page rather than the
      # display_name, so try to look for that.
      wpr_file = self._page_name_to_wpr_file.get(page.url, None)
    if wpr_file:
      return self._WprFileNameToPath(wpr_file)
    return None

  def AddNewTemporaryRecording(self, temp_target_wpr_file_path):
    """Remembers the path of the archive that is currently being recorded."""
    self.temp_target_wpr_file_path = temp_target_wpr_file_path

  def AddRecordedPages(self, pages):
    """Promotes the temporary recording to a new numbered archive.

    Assigns every page in |pages| to the new archive, moves the temporary
    recording to its final location, writes its '.sha1' hash file, rewrites
    the metadata file and removes archives that no longer serve any page.

    Args:
      pages: Iterable of objects exposing a display_name attribute.
    """
    # NOTE(review): temp_target_wpr_file_path is not cleared after the move, so
    # WprFilePathForPage keeps returning the old temporary path - confirm that
    # no caller looks pages up after recording.
    (target_wpr_file, target_wpr_file_path) = self._NextWprFileName()
    for page in pages:
      self._SetWprFileForPage(page.display_name, target_wpr_file)
    shutil.move(self.temp_target_wpr_file_path, target_wpr_file_path)

    # Update the hash file.
    with open(target_wpr_file_path + '.sha1', 'wb') as f:
      f.write(cloud_storage.CalculateHash(target_wpr_file_path))
      f.flush()

    self._WriteToFile()
    self._DeleteAbandonedWprFiles()

  def _DeleteAbandonedWprFiles(self):
    """Drops archives without pages from the mapping and from disk."""
    # Update the metadata so that the abandoned wpr files don't have empty page
    # name arrays.
    for wpr_file in self._AbandonedWprFiles():
      del self._wpr_file_to_page_names[wpr_file]
      # Don't fail if we're unable to delete some of the files.
      wpr_file_path = self._WprFileNameToPath(wpr_file)
      try:
        os.remove(wpr_file_path)
      except Exception:
        logging.warning('Failed to delete file: %s', wpr_file_path)

  def _AbandonedWprFiles(self):
    """Returns the list of .wpr file names that have no pages assigned."""
    # items() rather than iteritems() so this runs on Python 2 and Python 3.
    return [wpr_file
            for wpr_file, page_names in self._wpr_file_to_page_names.items()
            if not page_names]

  def _WriteToFile(self):
    """Writes the metadata into the file passed as constructor parameter."""
    metadata = dict()
    metadata['description'] = (
        'Describes the Web Page Replay archives for a page set. Don\'t edit by '
        'hand! Use record_wpr for updating.')
    metadata['archives'] = self._wpr_file_to_page_names.copy()
    # Don't write data for abandoned archives.
    for wpr_file in self._AbandonedWprFiles():
      del metadata['archives'][wpr_file]

    with open(self._file_path, 'w') as f:
      json.dump(metadata, f, indent=4)
      f.flush()

  def _WprFileNameToPath(self, wpr_file):
    """Returns the absolute path of |wpr_file| inside the metadata directory."""
    return os.path.abspath(os.path.join(self._base_dir, wpr_file))

  def _NextWprFileName(self):
    """Creates a new file name for a wpr archive file.

    Returns:
      A (file name, absolute path) tuple; the name uses the common base of the
      known archives and a number one above the highest one in use.

    Raises:
      Exception: A known archive name is not of the form "base_number.wpr", or
          the known archives do not all share the same base.
    """
    # The names are of the format "some_thing_number.wpr". Read the numbers.
    highest_number = -1
    base = None
    for wpr_file in self._wpr_file_to_page_names:
      match = re.match(r'(?P<BASE>.*)_(?P<NUMBER>[0-9]+)\.wpr', wpr_file)
      if not match:
        raise Exception('Illegal wpr file name ' + wpr_file)
      highest_number = max(int(match.group('NUMBER')), highest_number)
      if base and match.group('BASE') != base:
        raise Exception('Illegal wpr file name ' + wpr_file +
                        ', doesn\'t begin with ' + base)
      base = match.group('BASE')
    if not base:
      # If we're creating a completely new info file, use the base name of the
      # page set file.
      base = os.path.splitext(os.path.basename(self._file_path))[0]
    new_filename = '%s_%03d.wpr' % (base, highest_number + 1)
    return new_filename, self._WprFileNameToPath(new_filename)

  def _SetWprFileForPage(self, page_name, wpr_file):
    """For modifying the metadata when we're going to record a new archive.

    Moves |page_name| from the archive it was assigned to (if any) to
    |wpr_file|, keeping both mappings in sync.
    """
    old_wpr_file = self._page_name_to_wpr_file.get(page_name, None)
    if old_wpr_file:
      self._wpr_file_to_page_names[old_wpr_file].remove(page_name)
    self._page_name_to_wpr_file[page_name] = wpr_file
    if wpr_file not in self._wpr_file_to_page_names:
      self._wpr_file_to_page_names[wpr_file] = []
    self._wpr_file_to_page_names[wpr_file].append(page_name)
170