1# Copyright 2012 Google Inc. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#    http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing,
10# software distributed under the License is distributed on an
11# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
12# either express or implied. See the License for the specific
13# language governing permissions and limitations under the License.
14
15"""Helpers shared by cloudstorage_stub and cloudstorage_api."""
16
17
18
19
20
# Names exported by a star-import of this module. CSFileStat is an alias of
# GCSFileStat (see the assignment following the class definition).
__all__ = ['CS_XML_NS',
           'CSFileStat',
           'dt_str_to_posix',
           'local_api_url',
           'LOCAL_GCS_ENDPOINT',
           'local_run',
           'get_access_token',
           'get_metadata',
           'GCSFileStat',
           'http_time_to_posix',
           'memory_usage',
           'posix_time_to_http',
           'posix_to_dt_str',
           'set_access_token',
           'validate_options',
           'validate_bucket_name',
           'validate_bucket_path',
           'validate_file_path',
          ]
40
41
import calendar
import datetime
from email import utils as email_utils
import functools
import logging
import os
import re
48
49try:
50  from google.appengine.api import runtime
51except ImportError:
52  from google.appengine.api import runtime
53
54
# Character set and length limits shared by every bucket pattern below.
_GCS_BUCKET_REGEX_BASE = r'[a-z0-9\.\-_]{3,63}'
# Anchored with '\Z' rather than '$': '$' also matches just before a trailing
# newline, which would let a name such as 'bucket\n' pass validation.
_GCS_BUCKET_REGEX = re.compile(_GCS_BUCKET_REGEX_BASE + r'\Z')
_GCS_BUCKET_PATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'\Z')
# The two patterns below are only anchored at the start (they are used with
# match()), so anything may follow the matched part.
_GCS_PATH_PREFIX_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'.*')
_GCS_FULLPATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'/.*')
# Lower-case header name prefixes that get_metadata() keeps from a response.
_GCS_METADATA = ['x-goog-meta-',
                 'content-disposition',
                 'cache-control',
                 'content-encoding']
# Option name prefixes accepted by validate_options().
_GCS_OPTIONS = _GCS_METADATA + ['x-goog-acl']
# XML namespace used to qualify tag names (see _add_ns).
CS_XML_NS = 'http://doc.s3.amazonaws.com/2006-03-01'
# Path appended to the dev appserver host by local_api_url().
LOCAL_GCS_ENDPOINT = '/_ah/gcs'
# Shared token managed through set_access_token() / get_access_token().
_access_token = ''


# Not referenced in this module; presumably the page size limit for GET bucket
# responses -- confirm against callers.
_MAX_GET_BUCKET_RESULT = 1000
71
72
def set_access_token(access_token):
  """Store the access token shared by all Google Cloud Storage requests.

  Once a token is set, the library always tries to talk to the real
  Google Cloud Storage with it, even on dev appserver. Tokens can expire;
  renewing the token is the caller's responsibility.

  With no token set, the library requests and refreshes a token by itself
  on appserver, and talks to a Google Cloud Storage stub on dev appserver.

  Args:
    access_token: the token str. One way to obtain it is to run
      'gsutil -d ls' and copy the str that follows 'Bearer'.
  """
  # Rebinds the module-level variable that get_access_token() reads.
  global _access_token
  _access_token = access_token
90
91
def get_access_token():
  """Return the token most recently stored by set_access_token.

  Returns:
    The module-level shared access token.
  """
  token = _access_token
  return token
95
96
class GCSFileStat(object):
  """Container for GCS file stat.

  Instances order by filename (see __cmp__) and hash by etag when one is
  set, otherwise by filename (see __hash__).
  """

  def __init__(self,
               filename,
               st_size,
               etag,
               st_ctime,
               content_type=None,
               metadata=None,
               is_dir=False):
    """Initialize.

    For files, the non optional arguments are always set.
    For directories, only filename and is_dir is set.

    Args:
      filename: a Google Cloud Storage filename of form '/bucket/filename'.
      st_size: file size in bytes. long compatible.
      etag: hex digest of the md5 hash of the file's content. str.
        One pair of surrounding double quotes is stripped.
      st_ctime: posix file creation time. float compatible.
      content_type: content type. str.
      metadata: a str->str dict of user specified options when creating
        the file. Possible keys are x-goog-meta-, content-disposition,
        content-encoding, and cache-control.
      is_dir: True if this represents a directory. False if this is a real file.
    """
    self.filename = filename
    self.is_dir = is_dir
    # Directories keep these three as None; files overwrite them below.
    self.st_size = None
    self.st_ctime = None
    self.etag = None
    self.content_type = content_type
    self.metadata = metadata

    if not is_dir:
      # Fall back to int on interpreters that have no separate long builtin.
      try:
        to_long = long
      except NameError:
        to_long = int
      self.st_size = to_long(st_size)
      self.st_ctime = float(st_ctime)
      # The truthiness guard keeps an empty (or missing) etag from raising on
      # the index lookups.
      if etag and etag[0] == '"' and etag[-1] == '"':
        etag = etag[1:-1]
      self.etag = etag

  def __repr__(self):
    if self.is_dir:
      return '(directory: %s)' % self.filename

    return (
        '(filename: %(filename)s, st_size: %(st_size)s, '
        'st_ctime: %(st_ctime)s, etag: %(etag)s, '
        'content_type: %(content_type)s, '
        'metadata: %(metadata)s)' %
        dict(filename=self.filename,
             st_size=self.st_size,
             st_ctime=self.st_ctime,
             etag=self.etag,
             content_type=self.content_type,
             metadata=self.metadata))

  def __cmp__(self, other):
    """Order two stats by filename.

    Args:
      other: another instance of this class.

    Returns:
      1, -1 or 0 when self.filename is greater than, less than or equal to
      other.filename.

    Raises:
      ValueError: if other is not an instance of this class.
    """
    if not isinstance(other, self.__class__):
      # Interpolate eagerly: exceptions do not format their arguments the
      # way logging calls do.
      raise ValueError('Argument to cmp must have the same type. '
                       'Expect %s, got %s' % (self.__class__.__name__,
                                              other.__class__.__name__))
    if self.filename > other.filename:
      return 1
    elif self.filename < other.filename:
      return -1
    return 0

  def __hash__(self):
    # NOTE(review): the hash prefers etag while __cmp__ compares filename, so
    # two stats that compare equal can hash differently. Left unchanged.
    if self.etag:
      return hash(self.etag)
    return hash(self.filename)


# Alias under the shorter name, which is also exported through __all__.
CSFileStat = GCSFileStat
173
174
def get_metadata(headers):
  """Extract user defined options from HTTP response headers.

  Args:
    headers: a mapping of header name to value that provides iteritems().

  Returns:
    A dict holding only the headers whose lower-cased name starts with one
    of the prefixes in _GCS_METADATA. Original key casing is kept.
  """
  user_options = {}
  for name, value in headers.iteritems():
    lowered = name.lower()
    if any(lowered.startswith(prefix) for prefix in _GCS_METADATA):
      user_options[name] = value
  return user_options
179
180
def validate_bucket_name(name):
  """Validate a Google Storage bucket name.

  Args:
    name: a Google Storage bucket name with no prefix or suffix.

  Raises:
    ValueError: if name is empty or does not match the bucket name pattern.
    TypeError: if name is not of type basestring.
  """
  # Emptiness and type checks are shared with the path validators.
  _validate_path(name)
  if not _GCS_BUCKET_REGEX.match(name):
    raise ValueError('Bucket should be 3-63 characters long using only a-z, '
                     '0-9, underscore, dash or dot but got %s' % name)
194
195
def validate_bucket_path(path):
  """Check that path names a Google Cloud Storage bucket.

  Args:
    path: a Google Storage bucket path of the form '/bucket'.

  Raises:
    ValueError: if path is invalid.
  """
  _validate_path(path)
  if _GCS_BUCKET_PATH_REGEX.match(path):
    return
  message = 'Bucket should have format /bucket but got %s' % path
  raise ValueError(message)
209
210
def validate_file_path(path):
  """Check that path names a Google Cloud Storage file.

  Args:
    path: a Google Storage file path of the form '/bucket/filename'.

  Raises:
    ValueError: if path is invalid.
  """
  _validate_path(path)
  if _GCS_FULLPATH_REGEX.match(path):
    return
  message = 'Path should have format /bucket/filename but got %s' % path
  raise ValueError(message)
224
225
def _process_path_prefix(path_prefix):
  """Validate a Google Cloud Storage path prefix and split it in two.

  Args:
    path_prefix: a Google Cloud Storage path prefix of format '/bucket/prefix'
      or '/bucket/' or '/bucket'.

  Raises:
    ValueError: if path is invalid.

  Returns:
    a tuple of /bucket and prefix. prefix can be None.
  """
  _validate_path(path_prefix)
  if not _GCS_PATH_PREFIX_REGEX.match(path_prefix):
    raise ValueError('Path prefix should have format /bucket, /bucket/, '
                     'or /bucket/prefix but got %s.' % path_prefix)
  # Skip the leading '/' and look for the one that ends the bucket name.
  slash_pos = path_prefix.find('/', 1)
  if slash_pos == -1:
    return path_prefix, None
  remainder = path_prefix[slash_pos + 1:]
  # An empty remainder ('/bucket/') is reported as None.
  return path_prefix[:slash_pos], remainder or None
250
251
def _validate_path(path):
  """Run the checks common to every Google Storage path.

  Args:
    path: a Google Storage path. It should have form '/bucket/filename'
      or '/bucket'.

  Raises:
    ValueError: if path is falsy (this check runs before the type check).
    TypeError: if path is not of type basestring.
  """
  if not path:
    raise ValueError('Path is empty')
  if isinstance(path, basestring):
    return
  details = (path.__class__, path)
  raise TypeError('Path should be a string but is %s (%s).' % details)
268
269
def validate_options(options):
  """Check a dict of Google Cloud Storage options.

  Args:
    options: a str->basestring dict of options to pass to Google Cloud Storage.
      A falsy value (None, empty dict) is accepted without further checks.

  Raises:
    ValueError: if option is not supported.
    TypeError: if option is not of type str or value of an option
      is not of type basestring.
  """
  if not options:
    return

  for name, value in options.iteritems():
    if not isinstance(name, str):
      raise TypeError('option %r should be a str.' % name)
    lowered = name.lower()
    supported = any(lowered.startswith(prefix) for prefix in _GCS_OPTIONS)
    if not supported:
      raise ValueError('option %s is not supported.' % name)
    if not isinstance(value, basestring):
      raise TypeError('value %r for option %s should be of type basestring.' %
                      (value, name))
292
293
def http_time_to_posix(http_time):
  """Turn an HTTP date header value into posix time.

  The HTTP time format is described at
  http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1

  Args:
    http_time: time in RFC 2616 format. e.g.
      "Mon, 20 Nov 1995 19:12:08 GMT".

  Returns:
    Secs from unix epoch, or None when http_time is None.
  """
  if http_time is None:
    return None
  time_tuple = email_utils.parsedate_tz(http_time)
  return email_utils.mktime_tz(time_tuple)
309
310
def posix_time_to_http(posix_time):
  """Convert posix time to HTTP header time format.

  Args:
    posix_time: unix time. 0 is the epoch itself and is formatted like any
      other time.

  Returns:
    A datetime str in RFC 2616 format, or None when posix_time is None or
    another non-numeric falsy value.
  """
  # A plain truthiness test would also discard 0, which is a valid time.
  if not posix_time and posix_time != 0:
    return None
  return email_utils.formatdate(posix_time, usegmt=True)
322
323
# strptime/strftime layout of the whole-second part of a GCS datetime str.
_DT_FORMAT = '%Y-%m-%dT%H:%M:%S'


def dt_str_to_posix(dt_str):
  """format str to posix.

  datetime str is of format %Y-%m-%dT%H:%M:%S.%fZ,
  e.g. 2013-04-12T00:22:27.978Z. According to ISO 8601, T is a separator
  between date and time when they are on the same line.
  Z indicates UTC (zero meridian). A str without the fractional part,
  e.g. 2013-04-12T00:22:27Z, is accepted as well.

  A pointer: http://www.cl.cam.ac.uk/~mgk25/iso-time.html

  This is used to parse LastModified node from GCS's GET bucket XML response.

  Args:
    dt_str: A datetime str.

  Returns:
    An int of whole secs from unix epoch; the fractional part of dt_str is
    dropped. By posix definition, epoch is midnight 1970/1/1 UTC.

  Raises:
    ValueError: if the whole-second part does not match the expected format.
  """
  # Only the text before the first '.' is parsed.
  parsable = dt_str.partition('.')[0]
  # With no fractional part the UTC designator is still attached.
  if parsable.endswith('Z'):
    parsable = parsable[:-1]
  dt = datetime.datetime.strptime(parsable, _DT_FORMAT)
  return calendar.timegm(dt.utctimetuple())
349
350
def posix_to_dt_str(posix):
  """Format posix time as a UTC datetime str; reverse of dt_str_to_posix.

  This is used by GCS stub to generate GET bucket XML response.

  Args:
    posix: A float of secs from unix epoch.

  Returns:
    A datetime str. The fractional part is always written as '.000'.
  """
  utc_dt = datetime.datetime.utcfromtimestamp(posix)
  return utc_dt.strftime(_DT_FORMAT) + '.000Z'
365
366
def local_run():
  """Tell whether requests should go to the GCS dev appserver stub.

  Returns:
    True when SERVER_SOFTWARE is unset, or when it starts with
    'Development' or 'testutil' and does not contain 'remote_api'.
    False otherwise.
  """
  software = os.environ.get('SERVER_SOFTWARE')
  if software is None:
    return True
  # remote_api wins over the prefix check below.
  if 'remote_api' in software:
    return False
  return software.startswith(('Development', 'testutil'))
377
378
def local_api_url():
  """Build the URL of the GCS emulation on dev appserver.

  Returns:
    'http://' followed by the HTTP_HOST environment value and
    LOCAL_GCS_ENDPOINT. An unset HTTP_HOST is rendered as 'None'.
  """
  host = os.environ.get('HTTP_HOST')
  return 'http://%(host)s%(endpoint)s' % {'host': host,
                                          'endpoint': LOCAL_GCS_ENDPOINT}
382
383
def memory_usage(method):
  """Log memory usage before and after a method.

  Args:
    method: the callable to decorate.

  Returns:
    A wrapper that logs runtime.memory_usage().current() at INFO level,
    calls method with the given arguments, logs the usage again and returns
    method's result. The wrapper keeps method's name and docstring.
  """
  @functools.wraps(method)
  def wrapper(*args, **kwargs):
    logging.info('Memory before method %s is %s.',
                 method.__name__, runtime.memory_usage().current())
    # If method raises, the exception propagates and no "after" line is
    # logged.
    result = method(*args, **kwargs)
    logging.info('Memory after method %s is %s',
                 method.__name__, runtime.memory_usage().current())
    return result
  return wrapper
394
395
def _add_ns(tagname):
  """Qualify tagname with CS_XML_NS, giving '{namespace}tagname'."""
  return '{%s}%s' % (CS_XML_NS, tagname)
399
400
# Namespace-qualified tag names, each built once at import time by _add_ns.
# Not referenced elsewhere in this module; presumably used by the modules
# that parse or generate the GET bucket XML response -- confirm against
# callers.
_T_CONTENTS = _add_ns('Contents')
_T_LAST_MODIFIED = _add_ns('LastModified')
_T_ETAG = _add_ns('ETag')
_T_KEY = _add_ns('Key')
_T_SIZE = _add_ns('Size')
_T_PREFIX = _add_ns('Prefix')
_T_COMMON_PREFIXES = _add_ns('CommonPrefixes')
_T_NEXT_MARKER = _add_ns('NextMarker')
_T_IS_TRUNCATED = _add_ns('IsTruncated')
410