1# Copyright 2012 Google Inc. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#    http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing,
10# software distributed under the License is distributed on an
11# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
12# either express or implied. See the License for the specific
13# language governing permissions and limitations under the License.
14
15"""Helpers shared by cloudstorage_stub and cloudstorage_api."""
16
17
18
19
20
# Public API of this module: the names exported by a star-import. Every name
# listed here is defined further down in this file.
__all__ = ['CS_XML_NS',
           'CSFileStat',
           'dt_str_to_posix',
           'local_api_url',
           'LOCAL_GCS_ENDPOINT',
           'local_run',
           'get_access_token',
           'get_stored_content_length',
           'get_metadata',
           'GCSFileStat',
           'http_time_to_posix',
           'memory_usage',
           'posix_time_to_http',
           'posix_to_dt_str',
           'set_access_token',
           'validate_options',
           'validate_bucket_name',
           'validate_bucket_path',
           'validate_file_path',
          ]
41
42
import calendar
import datetime
from email import utils as email_utils
import functools
import logging
import os
import re
49
50try:
51  from google.appengine.api import runtime
52except ImportError:
53  from google.appengine.api import runtime
54
55
# Character set and length (3-63) accepted for a bucket name. This fragment
# is shared by all of the path patterns below.
_GCS_BUCKET_REGEX_BASE = r'[a-z0-9\.\-_]{3,63}'
# The two bucket patterns are anchored with \Z rather than $. With match(),
# $ also matches just before a trailing newline, which would let a name such
# as 'bucket\n' pass validation; \Z only matches at the very end.
_GCS_BUCKET_REGEX = re.compile(_GCS_BUCKET_REGEX_BASE + r'\Z')
_GCS_BUCKET_PATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'\Z')
# These two are not anchored at the end, so with match() they only constrain
# how the path starts.
_GCS_PATH_PREFIX_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'.*')
_GCS_FULLPATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'/.*')
# Lowercase header-name prefixes that get_metadata() treats as user metadata.
_GCS_METADATA = ['x-goog-meta-',
                 'content-disposition',
                 'cache-control',
                 'content-encoding']
# Option-name prefixes accepted by validate_options(): the metadata headers
# plus the ACL header.
_GCS_OPTIONS = _GCS_METADATA + ['x-goog-acl']
# XML namespace that _add_ns() puts in front of tag names.
CS_XML_NS = 'http://doc.s3.amazonaws.com/2006-03-01'
# Path that local_api_url() appends to the host.
LOCAL_GCS_ENDPOINT = '/_ah/gcs'
# Module-wide token read and written by get_access_token()/set_access_token().
_access_token = ''


# NOTE(review): not referenced anywhere in this file; presumably the maximum
# number of entries in one GET bucket response -- confirm against the callers.
_MAX_GET_BUCKET_RESULT = 1000
72
73
def set_access_token(access_token):
  """Install a module-wide access token for Google Cloud Storage requests.

  With a token installed, the library always tries to talk to the real
  Google Cloud Storage using that token, even when running on dev appserver.
  Tokens can expire; renewing one is the caller's responsibility.

  Without a token, the library requests and refreshes one automatically on
  appserver, and talks to a Google Cloud Storage stub on dev appserver.

  Args:
    access_token: the token str. One way to obtain it is to run
      'gsutil -d ls' and copy the str that follows 'Bearer'.
  """
  # Rebind the module-level variable instead of creating a local one.
  global _access_token
  _access_token = access_token
91
92
def get_access_token():
  """Return the module-wide access token.

  Returns:
    Whatever was last passed to set_access_token, or the module's initial
    value if it was never called.
  """
  return _access_token
96
97
class GCSFileStat(object):
  """Container for GCS file stat.

  Instances compare and sort by filename only; comparing with an object of a
  different type raises ValueError. The rich comparison methods mirror
  __cmp__ so that ordering and equality also work on Python 3, where __cmp__
  is ignored.
  """

  # long on Python 2 (as before); int on Python 3, which has no long.
  try:
    _int_type = long
  except NameError:
    _int_type = int

  def __init__(self,
               filename,
               st_size,
               etag,
               st_ctime,
               content_type=None,
               metadata=None,
               is_dir=False):
    """Initialize.

    For files, the non optional arguments are always set.
    For directories, only filename and is_dir is set; st_size, st_ctime and
    etag are stored as None whatever was passed in.

    Args:
      filename: a Google Cloud Storage filename of form '/bucket/filename'.
      st_size: file size in bytes. long compatible.
      etag: hex digest of the md5 hash of the file's content. str. One pair
        of surrounding double quotes is stripped if present. An empty or
        None etag is stored unchanged.
      st_ctime: posix file creation time. float compatible.
      content_type: content type. str.
      metadata: a str->str dict of user specified options when creating
        the file. Possible keys are x-goog-meta-, content-disposition,
        content-encoding, and cache-control.
      is_dir: True if this represents a directory. False if this is a real file.
    """
    self.filename = filename
    self.is_dir = is_dir
    self.st_size = None
    self.st_ctime = None
    self.etag = None
    self.content_type = content_type
    self.metadata = metadata

    if not is_dir:
      self.st_size = self._int_type(st_size)
      self.st_ctime = float(st_ctime)
      # The truthiness check keeps an empty etag from raising IndexError on
      # etag[0].
      if etag and etag[0] == '"' and etag[-1] == '"':
        etag = etag[1:-1]
      self.etag = etag

  def __repr__(self):
    if self.is_dir:
      return '(directory: %s)' % self.filename

    return (
        '(filename: %(filename)s, st_size: %(st_size)s, '
        'st_ctime: %(st_ctime)s, etag: %(etag)s, '
        'content_type: %(content_type)s, '
        'metadata: %(metadata)s)' %
        dict(filename=self.filename,
             st_size=self.st_size,
             st_ctime=self.st_ctime,
             etag=self.etag,
             content_type=self.content_type,
             metadata=self.metadata))

  def __cmp__(self, other):
    """Three-way comparison by filename.

    Args:
      other: another instance of this class.

    Returns:
      1, -1 or 0 when self.filename is greater than, less than or equal to
      other.filename.

    Raises:
      ValueError: if other is not an instance of this class.
    """
    if not isinstance(other, self.__class__):
      # The names are interpolated with %; passing them as extra exception
      # arguments would leave the raw '%s' placeholders in the message.
      raise ValueError('Argument to cmp must have the same type. '
                       'Expect %s, got %s' % (self.__class__.__name__,
                                              other.__class__.__name__))
    if self.filename > other.filename:
      return 1
    elif self.filename < other.filename:
      return -1
    return 0

  # Rich comparisons, all delegating to __cmp__ so both Python versions share
  # one ordering and the same ValueError on mismatched types.
  def __eq__(self, other):
    return self.__cmp__(other) == 0

  def __ne__(self, other):
    return self.__cmp__(other) != 0

  def __lt__(self, other):
    return self.__cmp__(other) < 0

  def __le__(self, other):
    return self.__cmp__(other) <= 0

  def __gt__(self, other):
    return self.__cmp__(other) > 0

  def __ge__(self, other):
    return self.__cmp__(other) >= 0

  def __hash__(self):
    # NOTE(review): equality is by filename but the hash prefers etag, so two
    # instances that compare equal can hash differently. Left unchanged to
    # keep existing set/dict behavior; revisit if instances are used as keys.
    if self.etag:
      return hash(self.etag)
    return hash(self.filename)
171
172
# Second public name bound to the same class; both names appear in __all__.
# NOTE(review): presumably kept so code using the older name keeps working --
# confirm before removing.
CSFileStat = GCSFileStat
174
175
def get_stored_content_length(headers):
  """Look up the size (in bytes) of the object as stored in GCS.

  The x-goog-stored-content-length header is expected on every response
  except those from the local dev_appserver, so when it is missing the
  regular content-length header is used instead.

  Args:
    headers: a dict of headers from the http response.

  Returns:
    the stored content length exactly as found in the headers, or None when
    neither header is present.
  """
  stored = headers.get('x-goog-stored-content-length')
  # Only a missing (None) value falls back; any other value is returned as is.
  return headers.get('content-length') if stored is None else stored
193
194
def get_metadata(headers):
  """Get user defined options from HTTP response headers.

  Args:
    headers: a dict of headers from the http response.

  Returns:
    A new dict with only the headers whose lowercased name starts with one
    of the prefixes in _GCS_METADATA. Keys keep their original casing.
  """
  prefixes = tuple(_GCS_METADATA)
  # items() rather than iteritems(): available on any mapping and on both
  # Python 2 and Python 3.
  return dict((k, v) for k, v in headers.items()
              if k.lower().startswith(prefixes))
199
200
def validate_bucket_name(name):
  """Check that a string is an acceptable Google Storage bucket name.

  Args:
    name: a Google Storage bucket name with no prefix or suffix.

  Raises:
    ValueError: if name is invalid.
  """
  # Basic emptiness/type validation first, then the bucket-name pattern.
  _validate_path(name)
  if _GCS_BUCKET_REGEX.match(name) is None:
    message = ('Bucket should be 3-63 characters long using only a-z,'
               '0-9, underscore, dash or dot but got %s' % name)
    raise ValueError(message)
214
215
def validate_bucket_path(path):
  """Check that a string is a Google Cloud Storage bucket path.

  Args:
    path: a Google Storage bucket path. It should have form '/bucket'.

  Raises:
    ValueError: if path is invalid.
  """
  # Basic emptiness/type validation first, then the '/bucket' pattern.
  _validate_path(path)
  if _GCS_BUCKET_PATH_REGEX.match(path) is None:
    message = 'Bucket should have format /bucket but got %s' % path
    raise ValueError(message)
229
230
def validate_file_path(path):
  """Check that a string is a Google Cloud Storage file path.

  Args:
    path: a Google Storage file path. It should have form '/bucket/filename'.

  Raises:
    ValueError: if path is invalid.
  """
  # Basic emptiness/type validation first, then the '/bucket/...' pattern.
  _validate_path(path)
  if _GCS_FULLPATH_REGEX.match(path) is None:
    message = 'Path should have format /bucket/filename but got %s' % path
    raise ValueError(message)
244
245
def _process_path_prefix(path_prefix):
  """Validate a Google Cloud Storage path prefix and split it in two.

  Args:
    path_prefix: a Google Cloud Storage path prefix of format '/bucket/prefix'
      or '/bucket/' or '/bucket'.

  Raises:
    ValueError: if path is invalid.

  Returns:
    a tuple of /bucket and prefix. prefix is None when nothing follows the
    bucket (both '/bucket' and '/bucket/').
  """
  _validate_path(path_prefix)
  if _GCS_PATH_PREFIX_REGEX.match(path_prefix) is None:
    raise ValueError('Path prefix should have format /bucket, /bucket/, '
                     'or /bucket/prefix but got %s.' % path_prefix)

  # Search from index 1 so the leading slash is skipped.
  separator = path_prefix.find('/', 1)
  if separator == -1:
    return path_prefix, None
  remainder = path_prefix[separator + 1:]
  return path_prefix[:separator], remainder or None
270
271
272def _validate_path(path):
273  """Basic validation of Google Storage paths.
274
275  Args:
276    path: a Google Storage path. It should have form '/bucket/filename'
277      or '/bucket'.
278
279  Raises:
280    ValueError: if path is invalid.
281    TypeError: if path is not of type basestring.
282  """
283  if not path:
284    raise ValueError('Path is empty')
285  if not isinstance(path, basestring):
286    raise TypeError('Path should be a string but is %s (%s).' %
287                    (path.__class__, path))
288
289
def validate_options(options):
  """Validate Google Cloud Storage options.

  Args:
    options: a str->basestring dict of options to pass to Google Cloud Storage.
      None or an empty dict is accepted without further checks.

  Raises:
    ValueError: if option is not supported, i.e. its lowercased name does not
      start with one of the prefixes in _GCS_OPTIONS.
    TypeError: if option is not of type str or value of an option
      is not a string (basestring on Python 2, str on Python 3).
  """
  if not options:
    return

  supported = tuple(_GCS_OPTIONS)
  # (str, unicode) on Python 2, i.e. what basestring covers; str on Python 3.
  string_types = (str, type(u''))
  # items() rather than iteritems() so this runs on Python 2 and 3 alike.
  for k, v in options.items():
    if not isinstance(k, str):
      raise TypeError('option %r should be a str.' % k)
    if not k.lower().startswith(supported):
      raise ValueError('option %s is not supported.' % k)
    if not isinstance(v, string_types):
      raise TypeError('value %r for option %s should be of type basestring.' %
                      (v, k))
312
313
def http_time_to_posix(http_time):
  """Turn an HTTP-format time string into posix time.

  See http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
  for http time format.

  Args:
    http_time: time in RFC 2616 format. e.g.
      "Mon, 20 Nov 1995 19:12:08 GMT". May be None.

  Returns:
    Secs from unix epoch, or None when http_time is None.
  """
  if http_time is None:
    return None
  # Parse into a time tuple that carries the zone offset, then convert that
  # tuple to seconds.
  time_tuple = email_utils.parsedate_tz(http_time)
  return email_utils.mktime_tz(time_tuple)
329
330
def posix_time_to_http(posix_time):
  """Convert posix time to HTTP header time format.

  Args:
    posix_time: unix time. May be None.

  Returns:
    A datetime str in RFC 2616 format, or None when posix_time is None.
  """
  # Compare against None explicitly: 0 is a valid posix time (the epoch) and
  # must not be treated as "no time given".
  if posix_time is not None:
    return email_utils.formatdate(posix_time, usegmt=True)
342
343
# strptime/strftime layout for the whole-second part of a timestamp; used by
# both dt_str_to_posix and posix_to_dt_str.
_DT_FORMAT = '%Y-%m-%dT%H:%M:%S'
345
346
def dt_str_to_posix(dt_str):
  """format str to posix.

  datetime str is of format %Y-%m-%dT%H:%M:%S.%fZ,
  e.g. 2013-04-12T00:22:27.978Z. According to ISO 8601, T is a separator
  between date and time when they are on the same line.
  Z indicates UTC (zero meridian). A str without fractional seconds,
  e.g. 2013-04-12T00:22:27Z, is accepted as well.

  A pointer: http://www.cl.cam.ac.uk/~mgk25/iso-time.html

  This is used to parse LastModified node from GCS's GET bucket XML response.

  Args:
    dt_str: A datetime str.

  Returns:
    Whole secs from unix epoch; fractional seconds in dt_str are dropped.
    By posix definition, epoch is midnight 1970/1/1 UTC.

  Raises:
    ValueError: if the whole-second part does not match _DT_FORMAT.
  """
  # Everything before the first '.' is the whole-second part. partition()
  # also copes with a str that has no '.', where two-name unpacking of
  # split('.') would raise.
  parsable = dt_str.partition('.')[0]
  # Without a fractional part the trailing 'Z' is still attached.
  if parsable.endswith('Z'):
    parsable = parsable[:-1]
  dt = datetime.datetime.strptime(parsable, _DT_FORMAT)
  return calendar.timegm(dt.utctimetuple())
369
370
def posix_to_dt_str(posix):
  """Reverse of dt_str_to_posix.

  This is used by GCS stub to generate GET bucket XML response.

  Args:
    posix: A float of secs from unix epoch.

  Returns:
    A datetime str in UTC. The fractional part is always written as '.000'.
  """
  whole_seconds = datetime.datetime.utcfromtimestamp(posix).strftime(_DT_FORMAT)
  return whole_seconds + '.000Z'
385
386
def local_run():
  """Tell whether the GCS dev appserver stub should be used.

  The decision is based on the SERVER_SOFTWARE environment variable.

  Returns:
    True when the variable is unset, or when it starts with 'Development' or
    'testutil' and does not contain 'remote_api'. False otherwise.
  """
  software = os.environ.get('SERVER_SOFTWARE')
  if software is None:
    return True
  # 'remote_api' anywhere in the value wins over the prefix check below.
  if 'remote_api' in software:
    return False
  return software.startswith(('Development', 'testutil'))
397
398
def local_api_url():
  """Build the URL of the GCS emulation on dev appserver.

  Returns:
    'http://' followed by the HTTP_HOST environment variable and
    LOCAL_GCS_ENDPOINT. An unset HTTP_HOST is formatted as 'None'.
  """
  host = os.environ.get('HTTP_HOST')
  return 'http://%s%s' % (host, LOCAL_GCS_ENDPOINT)
402
403
def memory_usage(method):
  """Log memory usage before and after a method.

  Intended for use as a decorator.

  Args:
    method: the callable to wrap.

  Returns:
    A wrapper that logs runtime.memory_usage().current() at info level,
    calls method with the same arguments, logs the figure again and returns
    method's result. If method raises, the second log line is skipped.
  """
  # wraps() copies __name__, __doc__ and friends onto the wrapper so the
  # decorated function still identifies itself correctly.
  @functools.wraps(method)
  def wrapper(*args, **kwargs):
    logging.info('Memory before method %s is %s.',
                 method.__name__, runtime.memory_usage().current())
    result = method(*args, **kwargs)
    logging.info('Memory after method %s is %s',
                 method.__name__, runtime.memory_usage().current())
    return result
  return wrapper
414
415
def _add_ns(tagname):
  """Return tagname prefixed with CS_XML_NS wrapped in curly braces."""
  fields = dict(ns=CS_XML_NS, tag=tagname)
  return '{%(ns)s}%(tag)s' % fields
419
420
# Namespace-qualified tag names, each built once at import time by _add_ns.
# NOTE(review): none of these are referenced in this file; presumably they
# are used by the modules that read or write bucket listing XML -- confirm.
_T_CONTENTS = _add_ns('Contents')
_T_LAST_MODIFIED = _add_ns('LastModified')
_T_ETAG = _add_ns('ETag')
_T_KEY = _add_ns('Key')
_T_SIZE = _add_ns('Size')
_T_PREFIX = _add_ns('Prefix')
_T_COMMON_PREFIXES = _add_ns('CommonPrefixes')
_T_NEXT_MARKER = _add_ns('NextMarker')
_T_IS_TRUNCATED = _add_ns('IsTruncated')
430