1# -*- coding: utf-8 -*-
2# Copyright 2013 Google Inc. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""File and Cloud URL representation classes."""
16
17from __future__ import absolute_import
18
19import os
20import re
21
22from gslib.exception import InvalidUrlError
23
# NOTE: The patterns below that capture an object name or a file path are
# compiled with re.DOTALL. Without it '.' stops at the first newline, so a
# name containing a line break would be silently truncated when parsed.

# Matches provider strings of the form 'gs://'
PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')
# Matches bucket strings of the form 'gs://bucket' (optional trailing slash)
BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$')
# Matches object strings of the form 'gs://bucket/obj'
OBJECT_REGEX = re.compile(
    r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)', re.DOTALL)
# Matches versioned object strings of the form 'gs://bucket/obj#1234'
GS_GENERATION_REGEX = re.compile(
    r'(?P<object>.+)#(?P<generation>[0-9]+)$', re.DOTALL)
# Matches versioned object strings of the form 's3://bucket/obj#NULL'
S3_VERSION_REGEX = re.compile(
    r'(?P<object>.+)#(?P<version_id>.+)$', re.DOTALL)
# Matches file strings of the form 'file://dir/filename'. Any scheme prefix
# ending in '://' is accepted; the remainder is captured as the file path.
FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)', re.DOTALL)
# Regex to disallow buckets violating charset or not [3..255] chars total.
BUCKET_NAME_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9\._-]{1,253}[a-zA-Z0-9]$')
# Regex to disallow buckets with individual DNS labels longer than 63
# (searches for any run of 64 label characters).
TOO_LONG_DNS_NAME_COMP = re.compile(r'[-_a-z0-9]{64}')
# Regex to determine if a string contains any wildcards ('*', '?', '[', ']').
WILDCARD_REGEX = re.compile(r'[*?\[\]]')
43
44
class StorageUrl(object):
  """Abstract base class for file and Cloud Storage URLs.

  Every method and property that raises NotImplementedError here must be
  overridden by subclasses. Equality, inequality and hashing are all derived
  from url_string.
  """

  def Clone(self):
    """Returns a copy of this URL. Must be overridden by subclasses."""
    raise NotImplementedError('Clone not overridden')

  def IsFileUrl(self):
    """Returns True for file URLs. Must be overridden by subclasses."""
    raise NotImplementedError('IsFileUrl not overridden')

  def IsCloudUrl(self):
    """Returns True for cloud URLs. Must be overridden by subclasses."""
    raise NotImplementedError('IsCloudUrl not overridden')

  def IsStream(self):
    """Returns True for stream URLs. Must be overridden by subclasses."""
    raise NotImplementedError('IsStream not overridden')

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns a prefix of this URL that can be used for iterating.

    If this URL contains a trailing slash, it will be stripped to create the
    prefix. This helps avoid infinite looping when prefixes are iterated, but
    preserves other slashes so that objects with '/' in the name are handled
    properly.

    For example, when recursively listing a bucket with the following contents:
      gs://bucket// <-- object named slash
      gs://bucket//one-dir-deep
    a top-level expansion with '/' as a delimiter will result in the following
    URL strings:
      'gs://bucket//' : OBJECT
      'gs://bucket//' : PREFIX
    If we right-strip all slashes from the prefix entry and add a wildcard
    suffix, we will get 'gs://bucket/*' which will produce identical results
    (and infinitely recurse).

    Example return values:
      ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
      ('gs://bucket/', '*') becomes 'gs://bucket/*'
      ('gs://bucket/', None) becomes 'gs://bucket'
      ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
      ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
      ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
           'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
           as a BucketListingObject, so we will not recurse on it as a subdir
           during listing.

    Args:
      wildcard_suffix: If supplied, this wildcard suffix will be appended to the
                       prefix, separated from it by a slash, before being
                       returned.

    Returns:
      A prefix of this URL that can be used for iterating.
    """
    raise NotImplementedError('CreatePrefixUrl not overridden')

  @property
  def url_string(self):
    """Full string form of the URL. Must be overridden by subclasses."""
    raise NotImplementedError('url_string not overridden')

  @property
  def versionless_url_string(self):
    """String form without version info. Must be overridden by subclasses."""
    raise NotImplementedError('versionless_url_string not overridden')

  def __eq__(self, other):
    """Two URLs are equal when both are StorageUrls with equal url_string."""
    return isinstance(other, StorageUrl) and self.url_string == other.url_string

  def __ne__(self, other):
    """Inverse of __eq__.

    Python 2 does not derive != from ==, so without this method two URLs that
    compare equal could also compare unequal.
    """
    return not self.__eq__(other)

  def __hash__(self):
    """Hashes on url_string so that equal URLs hash equally."""
    return hash(self.url_string)
112
113
class _FileUrl(StorageUrl):
  """File URL class providing parsing and convenience methods.

  This class assists with usage and manipulation of an
  (optionally wildcarded) file URL string.  Depending on the string
  contents, this class represents one or more directories or files.

  For File URLs, scheme is always file, bucket_name is always blank,
  and object_name contains the file/directory path.
  """

  def __init__(self, url_string, is_stream=False):
    """Parses a file URL string.

    Args:
      url_string: File path, optionally prefixed with a 'scheme://' string.
      is_stream: Whether this URL represents a stream rather than a file or
                 directory on disk.
    """
    self.scheme = 'file'
    self.bucket_name = ''
    match = FILE_OBJECT_REGEX.match(url_string)
    if match and match.lastindex == 2:
      # Drop the leading 'scheme://' and keep only the path.
      self.object_name = match.group(2)
    else:
      # No scheme prefix present; the whole string is the path.
      self.object_name = url_string
    self.generation = None
    self.is_stream = is_stream
    self.delim = os.sep

  def Clone(self):
    """Returns a new _FileUrl equal to this one, including its stream flag."""
    # The stream flag is not encoded in url_string, so it must be passed on
    # explicitly or the clone of a stream URL would stop being a stream.
    return _FileUrl(self.url_string, is_stream=self.is_stream)

  def IsFileUrl(self):
    return True

  def IsCloudUrl(self):
    return False

  def IsStream(self):
    return self.is_stream

  def IsDirectory(self):
    """Returns True if this is not a stream and the path is a directory."""
    return not self.IsStream() and os.path.isdir(self.object_name)

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns url_string unchanged; wildcard_suffix is ignored."""
    return self.url_string

  @property
  def url_string(self):
    """The URL in 'file://<path>' form."""
    return '%s://%s' % (self.scheme, self.object_name)

  @property
  def versionless_url_string(self):
    """Same as url_string."""
    return self.url_string

  def __str__(self):
    return self.url_string
166
class _CloudUrl(StorageUrl):
  """Parsed representation of a cloud URL string.

  Wraps an (optionally wildcarded) cloud URL string and exposes its parts.
  Depending on the string, an instance stands for a provider ('gs://'),
  one or more buckets ('gs://bucket') or one or more objects
  ('gs://bucket/obj', optionally with a '#<version>' suffix).

  Only string handling happens here; this class makes no cloud storage
  API calls.
  """

  def __init__(self, url_string):
    """Splits url_string into scheme, bucket, object and generation.

    Args:
      url_string: Cloud URL string to parse.

    Raises:
      InvalidUrlError: If a non-wildcarded bucket name in a bucket URL is
                       invalid, or the string matches none of the provider,
                       bucket or object patterns.
    """
    self.scheme = None
    self.bucket_name = None
    self.object_name = None
    self.generation = None
    self.delim = '/'

    # Provider-only URL, e.g. 'gs://'.
    parsed = PROVIDER_REGEX.match(url_string)
    if parsed:
      self.scheme = parsed.group('provider')
      return

    # Bucket URL, e.g. 'gs://bucket' or 'gs://bucket/'.
    parsed = BUCKET_REGEX.match(url_string)
    if parsed:
      self.scheme = parsed.group('provider')
      self.bucket_name = parsed.group('bucket')
      name = self.bucket_name
      # Wildcarded bucket names are exempt from name validation.
      if not ContainsWildcard(name):
        if (not BUCKET_NAME_RE.match(name) or
            TOO_LONG_DNS_NAME_COMP.search(name)):
          raise InvalidUrlError('Invalid bucket name in URL "%s"' % url_string)
      return

    # Object URL, e.g. 'gs://bucket/obj'.
    parsed = OBJECT_REGEX.match(url_string)
    if not parsed:
      raise InvalidUrlError(
          'CloudUrl: URL string %s did not match URL regex' % url_string)
    self.scheme = parsed.group('provider')
    self.bucket_name = parsed.group('bucket')
    self.object_name = parsed.group('object')

    # Each provider has its own '#<version>' suffix syntax; other schemes
    # carry no version information.
    version_parsers = {
        'gs': (GS_GENERATION_REGEX, 'generation'),
        's3': (S3_VERSION_REGEX, 'version_id'),
    }
    parser = version_parsers.get(self.scheme)
    if parser:
      version_regex, version_group = parser
      versioned = version_regex.match(self.object_name)
      if versioned:
        self.object_name = versioned.group('object')
        self.generation = versioned.group(version_group)

  def Clone(self):
    """Returns a new _CloudUrl parsed from this URL's string form."""
    return _CloudUrl(self.url_string)

  def IsFileUrl(self):
    return False

  def IsCloudUrl(self):
    return True

  def IsStream(self):
    raise NotImplementedError('IsStream not supported on CloudUrl')

  def IsBucket(self):
    """True when a bucket name is set and no object name is set."""
    return bool(self.bucket_name) and not self.object_name

  def IsObject(self):
    """True when both a bucket name and an object name are set."""
    return bool(self.bucket_name) and bool(self.object_name)

  def HasGeneration(self):
    """True when a generation / version id was parsed from the URL."""
    return True if self.generation else False

  def IsProvider(self):
    """True when a scheme is set and no bucket name is set."""
    return bool(self.scheme) and not self.bucket_name

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns the versionless URL minus one trailing slash, plus any suffix.

    Args:
      wildcard_suffix: If supplied, appended to the prefix after a '/'.

    Returns:
      Prefix string that can be used for iterating.
    """
    base = StripOneSlash(self.versionless_url_string)
    if not wildcard_suffix:
      return base
    return '%s/%s' % (base, wildcard_suffix)

  @property
  def bucket_url_string(self):
    """The URL of the bucket, always with a trailing slash."""
    return '%s://%s/' % (self.scheme, self.bucket_name)

  @property
  def url_string(self):
    """The full URL, including '#<generation>' when one is present."""
    base = self.versionless_url_string
    if not self.HasGeneration():
      return base
    return base + '#%s' % self.generation

  @property
  def versionless_url_string(self):
    """The URL without any generation / version suffix."""
    if self.IsProvider():
      return '%s://' % self.scheme
    if self.IsBucket():
      return self.bucket_url_string
    return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)

  def __str__(self):
    return self.url_string
267
def _GetSchemeFromUrlString(url_str):
  """Returns the lowercased scheme of url_str, or 'file' if it has none.

  The scheme is everything before the first '://' in the string.
  """
  prefix, separator, _ = url_str.partition('://')
  # File is the default scheme when no '://' separator is present.
  return prefix.lower() if separator else 'file'
277
278
def _GetPathFromUrlString(url_str):
  """Returns what follows the first '://' in url_str.

  If the string contains no '://', the whole string is returned.
  """
  _, separator, remainder = url_str.partition('://')
  return remainder if separator else url_str
287
288
def IsFileUrlString(url_str):
  """Returns True if url_str has the 'file' scheme.

  A string with no '://' separator counts as a file URL, because 'file' is
  the default scheme.
  """
  scheme = _GetSchemeFromUrlString(url_str)
  return scheme == 'file'
293
294
def StorageUrlFromString(url_str):
  """Static factory function for creating a StorageUrl from a string.

  Args:
    url_str: URL string with a 'file', 'gs' or 's3' scheme, or no scheme at
             all (treated as a file path).

  Returns:
    A _FileUrl for the file scheme (flagged as a stream when the path is
    '-'), otherwise a _CloudUrl.

  Raises:
    InvalidUrlError: If the scheme is not one of 'file', 's3' or 'gs'.
  """
  scheme = _GetSchemeFromUrlString(url_str)

  if scheme == 'file':
    # A path of '-' denotes a stream rather than a file on disk.
    reads_stream = _GetPathFromUrlString(url_str) == '-'
    return _FileUrl(url_str, is_stream=reads_stream)
  if scheme in ('s3', 'gs'):
    return _CloudUrl(url_str)
  raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)
307
308
def StripOneSlash(url_str):
  """Removes at most one trailing '/' from url_str.

  Args:
    url_str: String to strip; empty or None values are returned unchanged.

  Returns:
    url_str without its final '/', or url_str itself if it does not end
    with one.
  """
  if not url_str or not url_str.endswith('/'):
    return url_str
  return url_str[:-1]
313
314
def ContainsWildcard(url_string):
  """Reports whether url_string has any wildcard character in it.

  Args:
    url_string: URL string to check.

  Returns:
    True if WILDCARD_REGEX matches anywhere in url_string, else False.
  """
  return WILDCARD_REGEX.search(url_string) is not None
325