1# -*- coding: utf-8 -*-
2# Copyright 2013 Google Inc. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Implementation of Unix-like du command for cloud storage providers."""
16
17from __future__ import absolute_import
18
19import sys
20
21from gslib.boto_translation import S3_DELETE_MARKER_GUID
22from gslib.bucket_listing_ref import BucketListingObject
23from gslib.command import Command
24from gslib.command_argument import CommandArgument
25from gslib.cs_api_map import ApiSelector
26from gslib.exception import CommandException
27from gslib.ls_helper import LsHelper
28from gslib.storage_url import ContainsWildcard
29from gslib.storage_url import StorageUrlFromString
30from gslib.util import MakeHumanReadable
31from gslib.util import NO_MAX
32from gslib.util import UTF8
33
# One-line usage synopsis. Used as the command spec's usage_synopsis and
# embedded verbatim in the SYNOPSIS section of the detailed help text below.
_SYNOPSIS = """
  gsutil du url...
"""

# Full help text for the du command (passed as help_text in
# DuCommand.help_spec). This is user-facing output; the <B>...</B> markers
# delimit section headings.
_DETAILED_HELP_TEXT = ("""
<B>SYNOPSIS</B>
""" + _SYNOPSIS + """


<B>DESCRIPTION</B>
  The du command displays the amount of space (in bytes) being used by the
  objects in the file or object hierarchy under a given URL. The syntax emulates
  the Linux du command (which stands for disk usage). For example, the command:

  gsutil du -s gs://your-bucket/dir

  will report the total space used by all objects under gs://your-bucket/dir and
  any sub-directories.


<B>OPTIONS</B>
  -0          Ends each output line with a 0 byte rather than a newline. This
              can be useful to make the output more easily machine-readable.

  -a          Includes non-current object versions / generations in the listing
              (only useful with a versioning-enabled bucket). Also prints
              generation and metageneration for each listed object.

  -c          Produce a grand total.

  -e          A pattern to exclude from reporting. Example: -e "*.o" would
              exclude any object that ends in ".o". Can be specified multiple
              times.

  -h          Prints object sizes in human-readable format (e.g., 1 KiB,
              234 MiB, 2GiB, etc.)

  -s          Display only a summary total for each argument.

  -X          Similar to -e, but excludes patterns from the given file. The
              patterns to exclude should be one per line.


<B>EXAMPLES</B>
  To list the size of all objects in a bucket:

    gsutil du gs://bucketname

  To list the size of all objects underneath a prefix:

    gsutil du gs://bucketname/prefix/*

  To print the total number of bytes in a bucket, in human-readable form:

    gsutil du -ch gs://bucketname

  To see a summary of the total bytes in the two given buckets:

    gsutil du -s gs://bucket1 gs://bucket2

  To list the size of all objects in a versioned bucket, including objects that
  are not the latest:

    gsutil du -a gs://bucketname

  To list all objects in a bucket, except objects that end in ".bak",
  with each object printed ending in a null byte:

    gsutil du -e "*.bak" -0 gs://bucketname

  To get a total of all buckets in a project with a grand total for an entire
  project:

      gsutil -o GSUtil:default_project_id=project-name du -shc
""")
109
110
class DuCommand(Command):
  """Implementation of gsutil du command.

  Reports the number of bytes used by the objects found under each cloud URL
  argument. Output can be restricted to one summary line per argument (-s)
  and/or extended with a grand total line (-c).
  """

  # Command specification. See base class for documentation.
  command_spec = Command.CreateCommandSpec(
      'du',
      command_name_aliases=[],
      usage_synopsis=_SYNOPSIS,
      min_args=0,
      max_args=NO_MAX,
      supported_sub_args='0ace:hsX:',
      file_url_ok=False,
      provider_url_ok=True,
      urls_start_arg=0,
      gs_api_support=[ApiSelector.XML, ApiSelector.JSON],
      gs_default_api=ApiSelector.JSON,
      argparse_arguments=[
          CommandArgument.MakeZeroOrMoreCloudURLsArgument()
      ]
  )
  # Help specification. See help_provider.py for documentation.
  help_spec = Command.HelpSpec(
      help_name='du',
      help_name_aliases=[],
      help_type='command_help',
      help_one_line_summary='Display object size usage',
      help_text=_DETAILED_HELP_TEXT,
      subcommand_help_text={},
  )

  def _PrintSummaryLine(self, num_bytes, name):
    """Writes a single "<size>  <name>" line to stdout.

    The size is rendered with MakeHumanReadable when -h was given, otherwise
    as a plain decimal byte count. The line is terminated with
    self.line_ending.

    Args:
      num_bytes: Byte count to report.
      name: Label printed after the size (e.g. a URL string or 'total').
    """
    size_string = (MakeHumanReadable(num_bytes)
                   if self.human_readable else str(num_bytes))
    sys.stdout.write('%(size)-10s  %(name)s%(ending)s' % {
        'size': size_string, 'name': name, 'ending': self.line_ending})

  def _PrintInfoAboutBucketListingRef(self, bucket_listing_ref):
    """Prints the size line for one object and returns its contribution.

    An object whose metadata carries S3_DELETE_MARKER_GUID is reported with
    size '0' and a '<DeleteMarker>' suffix on its URL, and contributes neither
    an object nor any bytes to the totals. Nothing is printed when only a
    summary was requested (-s); the returned counts are unaffected by that.

    Args:
      bucket_listing_ref: BucketListing being listed. Its root_object is
          expected to provide `size` and `metadata`.

    Returns:
      Tuple (number of objects, object size)
    """
    obj = bucket_listing_ref.root_object
    url_str = bucket_listing_ref.url_string
    if (obj.metadata and S3_DELETE_MARKER_GUID in
        obj.metadata.additionalProperties):
      size_string = '0'
      num_bytes = 0
      num_objs = 0
      url_str += '<DeleteMarker>'
    else:
      size_string = (MakeHumanReadable(obj.size)
                     if self.human_readable else str(obj.size))
      num_bytes = obj.size
      num_objs = 1

    if not self.summary_only:
      # NOTE(review): the URL is encoded to UTF-8 bytes before formatting,
      # which presumes Python 2 str semantics - confirm before porting.
      sys.stdout.write('%(size)-10s  %(url)s%(ending)s' % {
          'size': size_string,
          'url': url_str.encode(UTF8),
          'ending': self.line_ending})

    return (num_objs, num_bytes)

  def _ReadExcludePatternsFile(self, path):
    """Reads exclude patterns (one per line) for the -X option.

    Args:
      path: Path of the patterns file, or '-' to read from stdin.

    Returns:
      List of the non-empty lines, stripped of surrounding whitespace.
    """
    if path == '-':
      # stdin belongs to the process, not to this command, so it is read but
      # deliberately left open.
      raw_lines = list(sys.stdin)
    else:
      with open(path, 'r') as patterns_file:
        raw_lines = list(patterns_file)
    stripped_lines = (line.strip() for line in raw_lines)
    return [pattern for pattern in stripped_lines if pattern]

  def _ParseDuSubOpts(self):
    """Resets the option attributes and applies self.sub_opts to them."""
    self.line_ending = '\n'
    self.all_versions = False
    self.produce_total = False
    self.human_readable = False
    self.summary_only = False
    self.exclude_patterns = []
    for o, a in self.sub_opts or []:
      if o == '-0':
        self.line_ending = '\0'
      elif o == '-a':
        self.all_versions = True
      elif o == '-c':
        self.produce_total = True
      elif o == '-e':
        self.exclude_patterns.append(a)
      elif o == '-h':
        self.human_readable = True
      elif o == '-s':
        self.summary_only = True
      elif o == '-X':
        self.exclude_patterns.extend(self._ReadExcludePatternsFile(a))

  def _TopLevelIter(self, top_level_storage_url):
    """Expands a URL argument into the top-level listing refs to report on.

    LsHelper expands to objects and prefixes only, so provider and bucket
    URLs are expanded to buckets here first.

    Args:
      top_level_storage_url: StorageUrl built from a command-line argument.

    Returns:
      Iterable of bucket listing refs: one per matching bucket for provider
      and bucket URLs, otherwise a single BucketListingObject wrapping the
      URL itself.
    """
    if top_level_storage_url.IsProvider():
      # Provider URL: use bucket wildcard to iterate over all buckets.
      return self.WildcardIterator(
          '%s://*' % top_level_storage_url.scheme).IterBuckets(
              bucket_fields=['id'])
    if top_level_storage_url.IsBucket():
      return self.WildcardIterator(
          '%s://%s' % (top_level_storage_url.scheme,
                       top_level_storage_url.bucket_name)).IterBuckets(
                           bucket_fields=['id'])
    return [BucketListingObject(top_level_storage_url)]

  def RunCommand(self):
    """Command entry point for the du command.

    Returns:
      0 on success.

    Raises:
      CommandException: if a file URL is given, or if a wildcard URL argument
          expanded to no objects while no exclude patterns were in effect.
    """
    self._ParseDuSubOpts()

    if not self.args:
      # Default to listing all gs buckets.
      self.args = ['gs://']

    total_bytes = 0
    got_nomatch_errors = False

    def _PrintNothing(unused_blr=None):
      pass

    def _PrintDirectory(num_bytes, name):
      if not self.summary_only:
        self._PrintSummaryLine(num_bytes, name)

    for url_arg in self.args:
      top_level_storage_url = StorageUrlFromString(url_arg)
      if top_level_storage_url.IsFileUrl():
        raise CommandException('Only cloud URLs are supported for %s'
                               % self.command_name)
      bucket_listing_fields = ['size']

      ls_helper = LsHelper(
          self.WildcardIterator, self.logger,
          print_object_func=self._PrintInfoAboutBucketListingRef,
          print_dir_func=_PrintNothing,
          print_dir_header_func=_PrintNothing,
          print_dir_summary_func=_PrintDirectory,
          print_newline_func=_PrintNothing, all_versions=self.all_versions,
          should_recurse=True, exclude_patterns=self.exclude_patterns,
          fields=bucket_listing_fields)

      for blr in self._TopLevelIter(top_level_storage_url):
        storage_url = blr.storage_url
        if storage_url.IsBucket() and self.summary_only:
          # For a summary, expand the bucket with a recursive '**' wildcard
          # instead of listing it as a bucket.
          storage_url = StorageUrlFromString(
              storage_url.CreatePrefixUrl(wildcard_suffix='**'))
        _, exp_objs, exp_bytes = ls_helper.ExpandUrlAndPrint(storage_url)
        if (storage_url.IsObject() and exp_objs == 0 and
            ContainsWildcard(url_arg) and not self.exclude_patterns):
          # Remember the failure but keep going so the remaining arguments
          # are still reported; the error is raised at the end.
          got_nomatch_errors = True
        total_bytes += exp_bytes

        if self.summary_only:
          self._PrintSummaryLine(exp_bytes, blr.url_string.rstrip('/'))

    if self.produce_total:
      self._PrintSummaryLine(total_bytes, 'total')

    if got_nomatch_errors:
      raise CommandException('One or more URLs matched no objects.')

    return 0
285