1# -*- coding: utf-8 -*-
2# Copyright 2011 Google Inc. All Rights Reserved.
3# Copyright 2011, Nexenta Systems Inc.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#     http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16"""Implementation of Unix-like cp command for cloud storage providers."""
17
18from __future__ import absolute_import
19
20import os
21import time
22import traceback
23
24from gslib import copy_helper
25from gslib.cat_helper import CatHelper
26from gslib.command import Command
27from gslib.command_argument import CommandArgument
28from gslib.commands.compose import MAX_COMPONENT_COUNT
29from gslib.copy_helper import CreateCopyHelperOpts
30from gslib.copy_helper import ItemExistsError
31from gslib.copy_helper import Manifest
32from gslib.copy_helper import PARALLEL_UPLOAD_TEMP_NAMESPACE
33from gslib.copy_helper import SkipUnsupportedObjectError
34from gslib.cs_api_map import ApiSelector
35from gslib.exception import CommandException
36from gslib.name_expansion import NameExpansionIterator
37from gslib.storage_url import ContainsWildcard
38from gslib.util import CreateLock
39from gslib.util import GetCloudApiInstance
40from gslib.util import IsCloudSubdirPlaceholder
41from gslib.util import MakeHumanReadable
42from gslib.util import NO_MAX
43from gslib.util import RemoveCRLFFromString
44from gslib.util import StdinIterator
45
46_SYNOPSIS = """
47  gsutil cp [OPTION]... src_url dst_url
48  gsutil cp [OPTION]... src_url... dst_url
49  gsutil cp [OPTION]... -I dst_url
50"""
51
52_SYNOPSIS_TEXT = """
53<B>SYNOPSIS</B>
54""" + _SYNOPSIS
55
56_DESCRIPTION_TEXT = """
57<B>DESCRIPTION</B>
58  The gsutil cp command allows you to copy data between your local file
59  system and the cloud, copy data within the cloud, and copy data between
60  cloud storage providers. For example, to copy all text files from the
61  local directory to a bucket you could do:
62
63    gsutil cp *.txt gs://my_bucket
64
65  Similarly, you can download text files from a bucket by doing:
66
67    gsutil cp gs://my_bucket/*.txt .
68
69  If you want to copy an entire directory tree you need to use the -r option:
70
71    gsutil cp -r dir gs://my_bucket
72
73  If you have a large number of files to upload you might want to use the
74  gsutil -m option, to perform a parallel (multi-threaded/multi-processing)
75  copy:
76
77    gsutil -m cp -r dir gs://my_bucket
78
79  You can pass a list of URLs (one per line) to copy on stdin instead of as
80  command line arguments by using the -I option. This allows you to use gsutil
81  in a pipeline to upload or download files / objects as generated by a program,
82  such as:
83
84    some_program | gsutil -m cp -I gs://my_bucket
85
86  or:
87
88    some_program | gsutil -m cp -I ./download_dir
89
90  The contents of stdin can name files, cloud URLs, and wildcards of files
91  and cloud URLs.
92"""
93
94_NAME_CONSTRUCTION_TEXT = """
95<B>HOW NAMES ARE CONSTRUCTED</B>
96  The gsutil cp command strives to name objects in a way consistent with how
97  Linux cp works, which causes names to be constructed in varying ways depending
98  on whether you're performing a recursive directory copy or copying
99  individually named objects; and whether you're copying to an existing or
100  non-existent directory.
101
102  When performing recursive directory copies, object names are constructed
103  that mirror the source directory structure starting at the point of
104  recursive processing. For example, the command:
105
106    gsutil cp -r dir1/dir2 gs://my_bucket
107
108  will create objects named like gs://my_bucket/dir2/a/b/c, assuming
109  dir1/dir2 contains the file a/b/c.
110
111  In contrast, copying individually named files will result in objects named
112  by the final path component of the source files. For example, the command:
113
114    gsutil cp dir1/dir2/** gs://my_bucket
115
116  will create objects named like gs://my_bucket/c.
117
118  The same rules apply for downloads: recursive copies of buckets and
119  bucket subdirectories produce a mirrored filename structure, while copying
120  individually (or wildcard) named objects produce flatly named files.
121
122  Note that in the above example the '**' wildcard matches all names
123  anywhere under dir. The wildcard '*' will match names just one level deep. For
124  more details see 'gsutil help wildcards'.
125
126  There's an additional wrinkle when working with subdirectories: the resulting
127  names depend on whether the destination subdirectory exists. For example,
128  if gs://my_bucket/subdir exists as a subdirectory, the command:
129
130    gsutil cp -r dir1/dir2 gs://my_bucket/subdir
131
132  will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In contrast,
133  if gs://my_bucket/subdir does not exist, this same gsutil cp command will
134  create objects named like gs://my_bucket/subdir/a/b/c.
135
136  Note: If you use the
137  `Google Developers Console <https://console.developers.google.com>`_
138  to create folders, it does so by creating a "placeholder" object that ends
139  with a "/" character. gsutil skips these objects when downloading from the
140  cloud to the local file system, because attempting to create a file that
141  ends with a "/" is not allowed on Linux and MacOS. Because of this, it is
142  recommended that you not create objects that end with "/" (unless you don't
143  need to be able to download such objects using gsutil).
144"""
145
146_SUBDIRECTORIES_TEXT = """
147<B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B>
148  You can use gsutil to copy to and from subdirectories by using a command
149  like:
150
151    gsutil cp -r dir gs://my_bucket/data
152
153  This will cause dir and all of its files and nested subdirectories to be
154  copied under the specified destination, resulting in objects with names like
155  gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket
156  subdirectories by using a command like:
157
158    gsutil cp -r gs://my_bucket/data dir
159
160  This will cause everything nested under gs://my_bucket/data to be downloaded
161  into dir, resulting in files with names like dir/data/a/b/c.
162
163  Copying subdirectories is useful if you want to add data to an existing
164  bucket directory structure over time. It's also useful if you want
165  to parallelize uploads and downloads across multiple machines (often
166  reducing overall transfer time compared with simply running gsutil -m
167  cp on one machine). For example, if your bucket contains this structure:
168
169    gs://my_bucket/data/result_set_01/
170    gs://my_bucket/data/result_set_02/
171    ...
172    gs://my_bucket/data/result_set_99/
173
174  you could perform concurrent downloads across 3 machines by running these
175  commands on each machine, respectively:
176
177    gsutil -m cp -r gs://my_bucket/data/result_set_[0-3]* dir
178    gsutil -m cp -r gs://my_bucket/data/result_set_[4-6]* dir
179    gsutil -m cp -r gs://my_bucket/data/result_set_[7-9]* dir
180
181  Note that dir could be a local directory on each machine, or it could
182  be a directory mounted off of a shared file server; whether the latter
183  performs acceptably may depend on a number of things, so we recommend
184  you experiment and find out what works best for you.
185"""
186
187_COPY_IN_CLOUD_TEXT = """
188<B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B>
189  If both the source and destination URL are cloud URLs from the same
190  provider, gsutil copies data "in the cloud" (i.e., without downloading
191  to and uploading from the machine where you run gsutil). In addition to
192  the performance and cost advantages of doing this, copying in the cloud
193  preserves metadata (like Content-Type and Cache-Control). In contrast,
194  when you download data from the cloud it ends up in a file, which has
195  no associated metadata. Thus, unless you have some way to hold on to
196  or re-create that metadata, downloading to a file will not retain the
197  metadata.
198
199  Copies spanning locations and/or storage classes cause data to be rewritten
200  in the cloud, which may take some time. Such operations can be resumed with
201  the same command if they are interrupted, so long as the command parameters
202  are identical.
203
204  Note that by default, the gsutil cp command does not copy the object
205  ACL to the new object, and instead will use the default bucket ACL (see
206  "gsutil help defacl").  You can override this behavior with the -p
207  option (see OPTIONS below).
208
209  One additional note about copying in the cloud: If the destination bucket has
210  versioning enabled, gsutil cp will by default copy only live versions of the
211  source object(s). For example:
212
213    gsutil cp gs://bucket1/obj gs://bucket2
214
215  will cause only the single live version of of gs://bucket1/obj to be copied
216  to gs://bucket2, even if there are archived versions of gs://bucket1/obj. To
217  also copy archived versions, use the -A flag:
218
219    gsutil cp -A gs://bucket1/obj gs://bucket2
220
221  The gsutil -m flag is disallowed when using the cp -A flag, to ensure that
222  version ordering is preserved.
223"""
224
225_CHECKSUM_VALIDATION_TEXT = """
226<B>CHECKSUM VALIDATION</B>
227  At the end of every upload or download the gsutil cp command validates that
228  the checksum it computes for the source file/object matches the checksum
229  the service computes. If the checksums do not match, gsutil will delete the
230  corrupted object and print a warning message. This very rarely happens, but
231  if it does, please contact gs-team@google.com.
232
233  If you know the MD5 of a file before uploading you can specify it in the
234  Content-MD5 header, which will cause the cloud storage service to reject the
235  upload if the MD5 doesn't match the value computed by the service. For
236  example:
237
238    % gsutil hash obj
239    Hashing     obj:
240    Hashes [base64] for obj:
241            Hash (crc32c):          lIMoIw==
242            Hash (md5):             VgyllJgiiaRAbyUUIqDMmw==
243
244    % gsutil -h Content-MD5:VgyllJgiiaRAbyUUIqDMmw== cp obj gs://your-bucket/obj
245    Copying file://obj [Content-Type=text/plain]...
246    Uploading   gs://your-bucket/obj:                                182 b/182 B
247
248    If the checksum didn't match the service would instead reject the upload and
249    gsutil would print a message like:
250
251    BadRequestException: 400 Provided MD5 hash "VgyllJgiiaRAbyUUIqDMmw=="
252    doesn't match calculated MD5 hash "7gyllJgiiaRAbyUUIqDMmw==".
253
254  Even if you don't do this gsutil will delete the object if the computed
255  checksum mismatches, but specifying the Content-MD5 header has three
256  advantages:
257
258      1. It prevents the corrupted object from becoming visible at all, whereas
259      otherwise it would be visible for 1-3 seconds before gsutil deletes it.
260
261      2. It will definitively prevent the corrupted object from being left in
262      the cloud, whereas the gsutil approach of deleting after the upload
263      completes could fail if (for example) the gsutil process gets ^C'd
264      between upload and deletion request.
265
266      3. It supports a customer-to-service integrity check handoff. For example,
267      if you have a content production pipeline that generates data to be
268      uploaded to the cloud along with checksums of that data, specifying the
269      MD5 computed by your content pipeline when you run gsutil cp will ensure
270      that the checksums match all the way through the process (e.g., detecting
271      if data gets corrupted on your local disk between the time it was written
272      by your content pipeline and the time it was uploaded to GCS).
273
274  Note: The Content-MD5 header is ignored for composite objects, because such
275  objects only have a CRC32C checksum.
276"""
277
278_RETRY_HANDLING_TEXT = """
279<B>RETRY HANDLING</B>
280  The cp command will retry when failures occur, but if enough failures happen
281  during a particular copy or delete operation the command will skip that object
282  and move on. At the end of the copy run if any failures were not successfully
283  retried, the cp command will report the count of failures, and exit with
284  non-zero status.
285
286  Note that there are cases where retrying will never succeed, such as if you
287  don't have write permission to the destination bucket or if the destination
288  path for some objects is longer than the maximum allowed length.
289
290  For more details about gsutil's retry handling, please see
291  "gsutil help retries".
292"""
293
294_RESUMABLE_TRANSFERS_TEXT = """
295<B>RESUMABLE TRANSFERS</B>
296  gsutil automatically uses the Google Cloud Storage resumable upload feature
297  whenever you use the cp command to upload an object that is larger than 2
298  MiB. You do not need to specify any special command line options to make this
299  happen. If your upload is interrupted you can restart the upload by running
300  the same cp command that you ran to start the upload. Until the upload
301  has completed successfully, it will not be visible at the destination object
302  and will not replace any existing object the upload is intended to overwrite.
303  (However, see the section on PARALLEL COMPOSITE UPLOADS, which may leave
304  temporary component objects in place during the upload process.)
305
306  Similarly, gsutil automatically performs resumable downloads (using HTTP
307  standard Range GET operations) whenever you use the cp command, unless the
308  destination is a stream or null. In this case, a partially downloaded
309  temporary file will be visible in the destination directory. Upon completion,
310  the original file is deleted and overwritten with the downloaded contents.
311
312  Resumable uploads and downloads store some state information in a files
313  in ~/.gsutil named by the destination object or file. If you attempt to
314  resume a transfer from a machine with a different directory, the transfer
315  will start over from scratch.
316
317  See also "gsutil help prod" for details on using resumable transfers
318  in production.
319"""
320
321_STREAMING_TRANSFERS_TEXT = """
322<B>STREAMING TRANSFERS</B>
323  Use '-' in place of src_url or dst_url to perform a streaming
324  transfer. For example:
325
326    long_running_computation | gsutil cp - gs://my_bucket/obj
327
328  Streaming uploads using the JSON API (see "gsutil help apis") are buffered in
329  memory and can retry in the event of network flakiness or service errors.
330
331  Streaming transfers (other than uploads using the JSON API) do not support
332  resumable uploads/downloads. If you have a large amount of data to upload
333  (say, more than 100 MiB) it is recommended to write the data to a local file
334  and then copy that file to the cloud rather than streaming it (and similarly
335  for large downloads).
336
337  WARNING: When performing streaming transfers gsutil does not compute a
338  checksum of the uploaded or downloaded data.  Therefore, we recommend that
339  users either perform their own validation of the data or use non-streaming
340  transfers (which perform integrity checking automatically).
341"""
342
343_SLICED_OBJECT_DOWNLOADS_TEXT = """
344<B>SLICED OBJECT DOWNLOADS</B>
345  gsutil automatically uses HTTP Range GET requests to perform "sliced"
346  downloads in parallel for downloads of large objects. This means that, if
347  enabled, disk space for the temporary download destination file will be
348  pre-allocated and byte ranges (slices) within the file will be downloaded in
349  parallel. Once all slices have completed downloading, the temporary file will
350  be renamed to the destination file. No additional local disk space is
351  required for this operation.
352
353  This feature is only available for Google Cloud Storage objects because it
354  requires a fast composable checksum that can be used to verify the data
355  integrity of the slices. Thus, using sliced object downloads also requires a
356  compiled crcmod (see "gsutil help crcmod") on the machine performing the
357  download. If compiled crcmod is not available, normal download will instead
358  be used.
359
360  Note: since sliced object downloads cause multiple writes to occur at various
361  locations on disk, this can degrade performance for disks with slow seek
362  times, especially for large numbers of slices. While the default number of
363  slices is small to avoid this, sliced object download can be completely
364  disabled by setting the "sliced_object_download_threshold" variable in the
365  .boto config file to 0.
366"""
367
368_PARALLEL_COMPOSITE_UPLOADS_TEXT = """
369<B>PARALLEL COMPOSITE UPLOADS</B>
370  gsutil can automatically use
371  `object composition <https://developers.google.com/storage/docs/composite-objects>`_
372  to perform uploads in parallel for large, local files being uploaded to Google
373  Cloud Storage. This means that, if enabled (see next paragraph), a large file
374  will be split into component pieces that will be uploaded in parallel. Those
375  components will then be composed in the cloud, and the temporary components in
376  the cloud will be deleted after successful composition. No additional local
377  disk space is required for this operation.
378
379  Using parallel composite uploads presents a tradeoff between upload
380  performance and download configuration: If you enable parallel composite
381  uploads your uploads will run faster, but someone will need to install a
382  compiled crcmod (see "gsutil help crcmod") on every machine where objects are
383  downloaded by gsutil or other Python applications. For some distributions this
384  is easy (e.g., it comes pre-installed on MacOS), but in some cases users have
385  found it difficult. Because of this at present parallel composite uploads are
386  disabled by default. Google is actively working with a number of the Linux
387  distributions to get crcmod included with the stock distribution. Once that is
388  done we will re-enable parallel composite uploads by default in gsutil.
389
390  Parallel composite uploads should not be used with NEARLINE storage
391  class buckets, as doing this would incur an early deletion charge for each
392  component object.
393
394  To try parallel composite uploads you can run the command:
395
396    gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp bigfile gs://your-bucket
397
398  where bigfile is larger than 150 MiB. When you do this notice that the upload
399  progress indicator continuously updates for several different uploads at once
400  (corresponding to each of the sections of the file being uploaded in
401  parallel), until the parallel upload completes. If you then want to enable
402  parallel composite uploads for all of your future uploads (notwithstanding the
403  caveats mentioned earlier), you can uncomment and set the
404  "parallel_composite_upload_threshold" config value in your .boto configuration
405  file to this value.
406
407  Note that the crcmod problem only impacts downloads via Python applications
408  (such as gsutil). If any users who need to download the data using gsutil or
409  other Python applications can install crcmod, it makes sense to enable
410  parallel composite uploads (see above). For example, if you use gsutil to
411  upload video assets and those assets will only ever be served via a Java
412  application (there are efficient crc32c implementations available in Java), it
413  would make sense to enable parallel composite uploads on your machine.
414
415  If a parallel composite upload fails prior to composition, re-running the
416  gsutil command will take advantage of resumable uploads for those components
417  that failed, and the component objects will be deleted after the first
418  successful attempt. Any temporary objects that were uploaded successfully
419  before gsutil failed will still exist until the upload is completed
420  successfully. The temporary objects will be named in the following fashion:
421
422    <random ID>%s<hash>
423
424  where <random ID> is some numerical value, and <hash> is an MD5 hash (not
425  related to the hash of the contents of the file or object).
426
427  To avoid leaving temporary objects around, you should make sure to check the
428  exit status from the gsutil command.  This can be done in a bash script, for
429  example, by doing:
430
431     gsutil cp ./local-file gs://your-bucket/your-object
432     if [ "$status" -ne "0" ] ; then
433       << Code that handles failures >>
434     fi
435
436  Or, for copying a directory, use this instead:
437
438     gsutil cp -c -L cp.log -r ./dir gs://bucket
439     if [ "$status" -ne "0" ] ; then
440       << Code that handles failures >>
441     fi
442
443  One important caveat is that files uploaded in this fashion are still subject
444  to the maximum number of components limit. For example, if you upload a large
445  file that gets split into %d components, and try to compose it with another
446  object with %d components, the operation will fail because it exceeds the %d
447  component limit. If you wish to compose an object later and the component
448  limit is a concern, it is recommended that you disable parallel composite
449  uploads for that transfer.
450
451  Also note that an object uploaded using this feature will have a CRC32C hash,
452  but it will not have an MD5 hash (and because of that, requires users who
453  download the object to have crcmod installed, as noted earlier). For details
454  see 'gsutil help crc32c'.
455
456  Note that this feature can be completely disabled by setting the
457  "parallel_composite_upload_threshold" variable in the .boto config file to 0.
458""" % (PARALLEL_UPLOAD_TEMP_NAMESPACE, 10, MAX_COMPONENT_COUNT - 9,
459       MAX_COMPONENT_COUNT)
460
461
462_CHANGING_TEMP_DIRECTORIES_TEXT = """
463<B>CHANGING TEMP DIRECTORIES</B>
464  gsutil writes data to a temporary directory in several cases:
465
466  - when compressing data to be uploaded (see the -z option)
467  - when decompressing data being downloaded (when the data has
468    Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z)
469  - when running integration tests (using the gsutil test command)
470
471  In these cases it's possible the temp file location on your system that
472  gsutil selects by default may not have enough space. If you find that
473  gsutil runs out of space during one of these operations (e.g., raising
474  "CommandException: Inadequate temp space available to compress <your file>"
475  during a gsutil cp -z operation), you can change where it writes these
476  temp files by setting the TMPDIR environment variable. On Linux and MacOS
477  you can do this either by running gsutil this way:
478
479    TMPDIR=/some/directory gsutil cp ...
480
481  or by adding this line to your ~/.bashrc file and then restarting the shell
482  before running gsutil:
483
484    export TMPDIR=/some/directory
485
486  On Windows 7 you can change the TMPDIR environment variable from Start ->
487  Computer -> System -> Advanced System Settings -> Environment Variables.
488  You need to reboot after making this change for it to take effect. (Rebooting
489  is not necessary after running the export command on Linux and MacOS.)
490"""
491
492_OPTIONS_TEXT = """
493<B>OPTIONS</B>
494  -a canned_acl  Sets named canned_acl when uploaded objects created. See
495                 'gsutil help acls' for further details.
496
497  -A             Copy all source versions from a source buckets/folders.
498                 If not set, only the live version of each source object is
499                 copied. Note: this option is only useful when the destination
500                 bucket has versioning enabled.
501
502  -c             If an error occurs, continue to attempt to copy the remaining
503                 files. If any copies were unsuccessful, gsutil's exit status
504                 will be non-zero even if this flag is set. This option is
505                 implicitly set when running "gsutil -m cp...". Note: -c only
506                 applies to the actual copying operation. If an error occurs
507                 while iterating over the files in the local directory (e.g.,
508                 invalid Unicode file name) gsutil will print an error message
509                 and abort.
510
511  -D             Copy in "daisy chain" mode, i.e., copying between two buckets
512                 by hooking a download to an upload, via the machine where
513                 gsutil is run. By default, data are copied between two buckets
514                 "in the cloud", i.e., without needing to copy via the machine
515                 where gsutil runs.
516
517                 By default, a "copy in the cloud" when the source is a
518                 composite object will retain the composite nature of the
519                 object. However, Daisy chain mode can be used to change a
520                 composite object into a non-composite object. For example:
521
522                     gsutil cp -D -p gs://bucket/obj gs://bucket/obj_tmp
523                     gsutil mv -p gs://bucket/obj_tmp gs://bucket/obj
524
525                 Note: Daisy chain mode is automatically used when copying
526                 between providers (e.g., to copy data from Google Cloud Storage
527                 to another provider).
528
529  -e             Exclude symlinks. When specified, symbolic links will not be
530                 copied.
531
532  -I             Causes gsutil to read the list of files or objects to copy from
533                 stdin. This allows you to run a program that generates the list
534                 of files to upload/download.
535
536  -L <file>      Outputs a manifest log file with detailed information about
537                 each item that was copied. This manifest contains the following
538                 information for each item:
539
540                 - Source path.
541                 - Destination path.
542                 - Source size.
543                 - Bytes transferred.
544                 - MD5 hash.
545                 - UTC date and time transfer was started in ISO 8601 format.
546                 - UTC date and time transfer was completed in ISO 8601 format.
547                 - Upload id, if a resumable upload was performed.
548                 - Final result of the attempted transfer, success or failure.
549                 - Failure details, if any.
550
551                 If the log file already exists, gsutil will use the file as an
552                 input to the copy process, and will also append log items to
553                 the existing file. Files/objects that are marked in the
554                 existing log file as having been successfully copied (or
555                 skipped) will be ignored. Files/objects without entries will be
556                 copied and ones previously marked as unsuccessful will be
557                 retried. This can be used in conjunction with the -c option to
558                 build a script that copies a large number of objects reliably,
559                 using a bash script like the following:
560
561                   until gsutil cp -c -L cp.log -r ./dir gs://bucket; do
562                     sleep 1
563                   done
564
565                 The -c option will cause copying to continue after failures
566                 occur, and the -L option will allow gsutil to pick up where it
567                 left off without duplicating work. The loop will continue
568                 running as long as gsutil exits with a non-zero status (such a
569                 status indicates there was at least one failure during the
570                 gsutil run).
571
572                 Note: If you're trying to synchronize the contents of a
573                 directory and a bucket (or two buckets), see
574                 'gsutil help rsync'.
575
576  -n             No-clobber. When specified, existing files or objects at the
577                 destination will not be overwritten. Any items that are skipped
578                 by this option will be reported as being skipped. This option
579                 will perform an additional GET request to check if an item
580                 exists before attempting to upload the data. This will save
581                 retransmitting data, but the additional HTTP requests may make
582                 small object transfers slower and more expensive.
583
584  -p             Causes ACLs to be preserved when copying in the cloud. Note
585                 that this option has performance and cost implications when
586                 using  the XML API, as it requires separate HTTP calls for
587                 interacting with ACLs. The performance issue can be mitigated
588                 to some degree by using gsutil -m cp to cause parallel copying.
589                 Also, this option only works if you have OWNER access to all of
590                 the objects that are copied.
591
592                 You can avoid the additional performance and cost of using
593                 cp -p if you want all objects in the destination bucket to end
594                 up with the same ACL by setting a default object ACL on that
595                 bucket instead of using cp -p. See "help gsutil defacl".
596
597                 Note that it's not valid to specify both the -a and -p options
598                 together.
599
600  -R, -r         Causes directories, buckets, and bucket subdirectories to be
601                 copied recursively. If you neglect to use this option for
602                 an upload, gsutil will copy any files it finds and skip any
603                 directories. Similarly, neglecting to specify -r for a download
604                 will cause gsutil to copy any objects at the current bucket
605                 directory level, and skip any subdirectories.
606
607  -U             Skip objects with unsupported object types instead of failing.
608                 Unsupported object types are Amazon S3 Objects in the GLACIER
609                 storage class.
610
611  -v             Requests that the version-specific URL for each uploaded object
612                 be printed. Given this URL you can make future upload requests
613                 that are safe in the face of concurrent updates, because Google
614                 Cloud Storage will refuse to perform the update if the current
615                 object version doesn't match the version-specific URL. See
616                 'gsutil help versions' for more details.
617
618  -z <ext,...>   Applies gzip content-encoding to file uploads with the given
619                 extensions. This is useful when uploading files with
620                 compressible content (such as .js, .css, or .html files)
621                 because it saves network bandwidth and space in Google Cloud
622                 Storage, which in turn reduces storage costs.
623
624                 When you specify the -z option, the data from your files is
625                 compressed before it is uploaded, but your actual files are
626                 left uncompressed on the local disk. The uploaded objects
627                 retain the Content-Type and name of the original files but are
628                 given a Content-Encoding header with the value "gzip" to
629                 indicate that the object data stored are compressed on the
630                 Google Cloud Storage servers.
631
632                 For example, the following command:
633
634                   gsutil cp -z html -a public-read cattypes.html gs://mycats
635
636                 will do all of the following:
637
638                 - Upload as the object gs://mycats/cattypes.html (cp command)
639                 - Set the Content-Type to text/html (based on file extension)
640                 - Compress the data in the file cattypes.html (-z option)
641                 - Set the Content-Encoding to gzip (-z option)
642                 - Set the ACL to public-read (-a option)
643                 - If a user tries to view cattypes.html in a browser, the
644                   browser will know to uncompress the data based on the
645                   Content-Encoding header, and to render it as HTML based on
646                   the Content-Type header.
647
648                 Note that if you download an object with Content-Encoding:gzip
649                 gsutil will decompress the content before writing the local
650                 file.
651"""
652
653_DETAILED_HELP_TEXT = '\n\n'.join([_SYNOPSIS_TEXT,
654                                   _DESCRIPTION_TEXT,
655                                   _NAME_CONSTRUCTION_TEXT,
656                                   _SUBDIRECTORIES_TEXT,
657                                   _COPY_IN_CLOUD_TEXT,
658                                   _CHECKSUM_VALIDATION_TEXT,
659                                   _RETRY_HANDLING_TEXT,
660                                   _RESUMABLE_TRANSFERS_TEXT,
661                                   _STREAMING_TRANSFERS_TEXT,
662                                   _SLICED_OBJECT_DOWNLOADS_TEXT,
663                                   _PARALLEL_COMPOSITE_UPLOADS_TEXT,
664                                   _CHANGING_TEMP_DIRECTORIES_TEXT,
665                                   _OPTIONS_TEXT])
666
667
668CP_SUB_ARGS = 'a:AcDeIL:MNnprRtUvz:'
669
670
671def _CopyFuncWrapper(cls, args, thread_state=None):
672  cls.CopyFunc(args, thread_state=thread_state)
673
674
675def _CopyExceptionHandler(cls, e):
676  """Simple exception handler to allow post-completion status."""
677  cls.logger.error(str(e))
678  cls.op_failure_count += 1
679  cls.logger.debug('\n\nEncountered exception while copying:\n%s\n',
680                   traceback.format_exc())
681
682
683def _RmExceptionHandler(cls, e):
684  """Simple exception handler to allow post-completion status."""
685  cls.logger.error(str(e))
686
687
688class CpCommand(Command):
689  """Implementation of gsutil cp command.
690
691  Note that CpCommand is run for both gsutil cp and gsutil mv. The latter
692  happens by MvCommand calling CpCommand and passing the hidden (undocumented)
693  -M option. This allows the copy and remove needed for each mv to run
694  together (rather than first running all the cp's and then all the rm's, as
695  we originally had implemented), which in turn avoids the following problem
696  with removing the wrong objects: starting with a bucket containing only
697  the object gs://bucket/obj, say the user does:
698    gsutil mv gs://bucket/* gs://bucket/d.txt
699  If we ran all the cp's and then all the rm's and we didn't expand the wildcard
700  first, the cp command would first copy gs://bucket/obj to gs://bucket/d.txt,
701  and the rm command would then remove that object. In the implementation
702  prior to gsutil release 3.12 we avoided this by building a list of objects
703  to process and then running the copies and then the removes; but building
704  the list up front limits scalability (compared with the current approach
705  of processing the bucket listing iterator on the fly).
706  """
707
708  # Command specification. See base class for documentation.
709  command_spec = Command.CreateCommandSpec(
710      'cp',
711      command_name_aliases=['copy'],
712      usage_synopsis=_SYNOPSIS,
713      min_args=1,
714      max_args=NO_MAX,
715      # -t is deprecated but leave intact for now to avoid breakage.
716      supported_sub_args=CP_SUB_ARGS,
717      file_url_ok=True,
718      provider_url_ok=False,
719      urls_start_arg=0,
720      gs_api_support=[ApiSelector.XML, ApiSelector.JSON],
721      gs_default_api=ApiSelector.JSON,
722      supported_private_args=['testcallbackfile='],
723      argparse_arguments=[
724          CommandArgument.MakeZeroOrMoreCloudOrFileURLsArgument()
725      ]
726  )
727  # Help specification. See help_provider.py for documentation.
728  help_spec = Command.HelpSpec(
729      help_name='cp',
730      help_name_aliases=['copy'],
731      help_type='command_help',
732      help_one_line_summary='Copy files and objects',
733      help_text=_DETAILED_HELP_TEXT,
734      subcommand_help_text={},
735  )
736
737  # pylint: disable=too-many-statements
738  def CopyFunc(self, name_expansion_result, thread_state=None):
739    """Worker function for performing the actual copy (and rm, for mv)."""
740    gsutil_api = GetCloudApiInstance(self, thread_state=thread_state)
741
742    copy_helper_opts = copy_helper.GetCopyHelperOpts()
743    if copy_helper_opts.perform_mv:
744      cmd_name = 'mv'
745    else:
746      cmd_name = self.command_name
747    src_url = name_expansion_result.source_storage_url
748    exp_src_url = name_expansion_result.expanded_storage_url
749    src_url_names_container = name_expansion_result.names_container
750    have_multiple_srcs = name_expansion_result.is_multi_source_request
751
752    if src_url.IsCloudUrl() and src_url.IsProvider():
753      raise CommandException(
754          'The %s command does not allow provider-only source URLs (%s)' %
755          (cmd_name, src_url))
756    if have_multiple_srcs:
757      copy_helper.InsistDstUrlNamesContainer(
758          self.exp_dst_url, self.have_existing_dst_container, cmd_name)
759
760    # Various GUI tools (like the GCS web console) create placeholder objects
761    # ending with '/' when the user creates an empty directory. Normally these
762    # tools should delete those placeholders once objects have been written
763    # "under" the directory, but sometimes the placeholders are left around. We
764    # need to filter them out here, otherwise if the user tries to rsync from
765    # GCS to a local directory it will result in a directory/file conflict
766    # (e.g., trying to download an object called "mydata/" where the local
767    # directory "mydata" exists).
768    if IsCloudSubdirPlaceholder(exp_src_url):
769      # We used to output the message 'Skipping cloud sub-directory placeholder
770      # object...' but we no longer do so because it caused customer confusion.
771      return
772
773    if copy_helper_opts.use_manifest and self.manifest.WasSuccessful(
774        exp_src_url.url_string):
775      return
776
777    if copy_helper_opts.perform_mv:
778      if name_expansion_result.names_container:
779        # Use recursion_requested when performing name expansion for the
780        # directory mv case so we can determine if any of the source URLs are
781        # directories (and then use cp -r and rm -r to perform the move, to
782        # match the behavior of Linux mv (which when moving a directory moves
783        # all the contained files).
784        self.recursion_requested = True
785        # Disallow wildcard src URLs when moving directories, as supporting it
786        # would make the name transformation too complex and would also be
787        # dangerous (e.g., someone could accidentally move many objects to the
788        # wrong name, or accidentally overwrite many objects).
789        if ContainsWildcard(src_url.url_string):
790          raise CommandException('The mv command disallows naming source '
791                                 'directories using wildcards')
792
793    if (self.exp_dst_url.IsFileUrl()
794        and not os.path.exists(self.exp_dst_url.object_name)
795        and have_multiple_srcs):
796      os.makedirs(self.exp_dst_url.object_name)
797
798    dst_url = copy_helper.ConstructDstUrl(
799        src_url, exp_src_url, src_url_names_container, have_multiple_srcs,
800        self.exp_dst_url, self.have_existing_dst_container,
801        self.recursion_requested)
802    dst_url = copy_helper.FixWindowsNaming(src_url, dst_url)
803
804    copy_helper.CheckForDirFileConflict(exp_src_url, dst_url)
805    if copy_helper.SrcDstSame(exp_src_url, dst_url):
806      raise CommandException('%s: "%s" and "%s" are the same file - '
807                             'abort.' % (cmd_name, exp_src_url, dst_url))
808
809    if dst_url.IsCloudUrl() and dst_url.HasGeneration():
810      raise CommandException('%s: a version-specific URL\n(%s)\ncannot be '
811                             'the destination for gsutil cp - abort.'
812                             % (cmd_name, dst_url))
813
814    elapsed_time = bytes_transferred = 0
815    try:
816      if copy_helper_opts.use_manifest:
817        self.manifest.Initialize(
818            exp_src_url.url_string, dst_url.url_string)
819      (elapsed_time, bytes_transferred, result_url, md5) = (
820          copy_helper.PerformCopy(
821              self.logger, exp_src_url, dst_url, gsutil_api,
822              self, _CopyExceptionHandler, allow_splitting=True,
823              headers=self.headers, manifest=self.manifest,
824              gzip_exts=self.gzip_exts))
825      if copy_helper_opts.use_manifest:
826        if md5:
827          self.manifest.Set(exp_src_url.url_string, 'md5', md5)
828        self.manifest.SetResult(
829            exp_src_url.url_string, bytes_transferred, 'OK')
830      if copy_helper_opts.print_ver:
831        # Some cases don't return a version-specific URL (e.g., if destination
832        # is a file).
833        self.logger.info('Created: %s', result_url)
834    except ItemExistsError:
835      message = 'Skipping existing item: %s' % dst_url
836      self.logger.info(message)
837      if copy_helper_opts.use_manifest:
838        self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
839    except SkipUnsupportedObjectError, e:
840      message = ('Skipping item %s with unsupported object type %s' %
841                 (exp_src_url.url_string, e.unsupported_type))
842      self.logger.info(message)
843      if copy_helper_opts.use_manifest:
844        self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
845    except copy_helper.FileConcurrencySkipError, e:
846      self.logger.warn('Skipping copy of source URL %s because destination URL '
847                       '%s is already being copied by another gsutil process '
848                       'or thread (did you specify the same source URL twice?) '
849                       % (src_url, dst_url))
850    except Exception, e:
851      if (copy_helper_opts.no_clobber and
852          copy_helper.IsNoClobberServerException(e)):
853        message = 'Rejected (noclobber): %s' % dst_url
854        self.logger.info(message)
855        if copy_helper_opts.use_manifest:
856          self.manifest.SetResult(
857              exp_src_url.url_string, 0, 'skip', message)
858      elif self.continue_on_error:
859        message = 'Error copying %s: %s' % (src_url, str(e))
860        self.op_failure_count += 1
861        self.logger.error(message)
862        if copy_helper_opts.use_manifest:
863          self.manifest.SetResult(
864              exp_src_url.url_string, 0, 'error',
865              RemoveCRLFFromString(message))
866      else:
867        if copy_helper_opts.use_manifest:
868          self.manifest.SetResult(
869              exp_src_url.url_string, 0, 'error', str(e))
870        raise
871    else:
872      if copy_helper_opts.perform_mv:
873        self.logger.info('Removing %s...', exp_src_url)
874        if exp_src_url.IsCloudUrl():
875          gsutil_api.DeleteObject(exp_src_url.bucket_name,
876                                  exp_src_url.object_name,
877                                  generation=exp_src_url.generation,
878                                  provider=exp_src_url.scheme)
879        else:
880          os.unlink(exp_src_url.object_name)
881
882    with self.stats_lock:
883      self.total_elapsed_time += elapsed_time
884      self.total_bytes_transferred += bytes_transferred
885
886  # Command entry point.
887  def RunCommand(self):
888    copy_helper_opts = self._ParseOpts()
889
890    self.total_elapsed_time = self.total_bytes_transferred = 0
891    if self.args[-1] == '-' or self.args[-1] == 'file://-':
892      return CatHelper(self).CatUrlStrings(self.args[:-1])
893
894    if copy_helper_opts.read_args_from_stdin:
895      if len(self.args) != 1:
896        raise CommandException('Source URLs cannot be specified with -I option')
897      url_strs = StdinIterator()
898    else:
899      if len(self.args) < 2:
900        raise CommandException('Wrong number of arguments for "cp" command.')
901      url_strs = self.args[:-1]
902
903    (self.exp_dst_url, self.have_existing_dst_container) = (
904        copy_helper.ExpandUrlToSingleBlr(self.args[-1], self.gsutil_api,
905                                         self.debug, self.project_id))
906
907    name_expansion_iterator = NameExpansionIterator(
908        self.command_name, self.debug,
909        self.logger, self.gsutil_api, url_strs,
910        self.recursion_requested or copy_helper_opts.perform_mv,
911        project_id=self.project_id, all_versions=self.all_versions,
912        continue_on_error=self.continue_on_error or self.parallel_operations)
913
914    # Use a lock to ensure accurate statistics in the face of
915    # multi-threading/multi-processing.
916    self.stats_lock = CreateLock()
917
918    # Tracks if any copies failed.
919    self.op_failure_count = 0
920
921    # Start the clock.
922    start_time = time.time()
923
924    # Tuple of attributes to share/manage across multiple processes in
925    # parallel (-m) mode.
926    shared_attrs = ('op_failure_count', 'total_bytes_transferred')
927
928    # Perform copy requests in parallel (-m) mode, if requested, using
929    # configured number of parallel processes and threads. Otherwise,
930    # perform requests with sequential function calls in current process.
931    self.Apply(_CopyFuncWrapper, name_expansion_iterator,
932               _CopyExceptionHandler, shared_attrs,
933               fail_on_error=(not self.continue_on_error))
934    self.logger.debug(
935        'total_bytes_transferred: %d', self.total_bytes_transferred)
936
937    end_time = time.time()
938    self.total_elapsed_time = end_time - start_time
939
940    # Sometimes, particularly when running unit tests, the total elapsed time
941    # is really small. On Windows, the timer resolution is too small and
942    # causes total_elapsed_time to be zero.
943    try:
944      float(self.total_bytes_transferred) / float(self.total_elapsed_time)
945    except ZeroDivisionError:
946      self.total_elapsed_time = 0.01
947
948    self.total_bytes_per_second = (float(self.total_bytes_transferred) /
949                                   float(self.total_elapsed_time))
950
951    if self.debug == 3:
952      # Note that this only counts the actual GET and PUT bytes for the copy
953      # - not any transfers for doing wildcard expansion, the initial
954      # HEAD/GET request performed to get the object metadata, etc.
955      if self.total_bytes_transferred != 0:
956        self.logger.info(
957            'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)',
958            self.total_bytes_transferred, self.total_elapsed_time,
959            MakeHumanReadable(self.total_bytes_per_second))
960    if self.op_failure_count:
961      plural_str = 's' if self.op_failure_count > 1 else ''
962      raise CommandException('%d file%s/object%s could not be transferred.' % (
963          self.op_failure_count, plural_str, plural_str))
964
965    return 0
966
967  def _ParseOpts(self):
968    perform_mv = False
969    # exclude_symlinks is handled by Command parent class, so save in Command
970    # state rather than CopyHelperOpts.
971    self.exclude_symlinks = False
972    no_clobber = False
973    # continue_on_error is handled by Command parent class, so save in Command
974    # state rather than CopyHelperOpts.
975    self.continue_on_error = False
976    daisy_chain = False
977    read_args_from_stdin = False
978    print_ver = False
979    use_manifest = False
980    preserve_acl = False
981    canned_acl = None
982    # canned_acl is handled by a helper function in parent
983    # Command class, so save in Command state rather than CopyHelperOpts.
984    self.canned = None
985
986    self.all_versions = False
987
988    self.skip_unsupported_objects = False
989
990    # Files matching these extensions should be gzipped before uploading.
991    self.gzip_exts = []
992
993    test_callback_file = None
994
995    # self.recursion_requested initialized in command.py (so can be checked
996    # in parent class for all commands).
997    self.manifest = None
998    if self.sub_opts:
999      for o, a in self.sub_opts:
1000        if o == '-a':
1001          canned_acl = a
1002          self.canned = True
1003        if o == '-A':
1004          self.all_versions = True
1005        if o == '-c':
1006          self.continue_on_error = True
1007        elif o == '-D':
1008          daisy_chain = True
1009        elif o == '-e':
1010          self.exclude_symlinks = True
1011        elif o == '--testcallbackfile':
1012          # File path of a pickled class that implements ProgressCallback.call.
1013          # Used for testing transfer interruptions and resumes.
1014          test_callback_file = a
1015        elif o == '-I':
1016          read_args_from_stdin = True
1017        elif o == '-L':
1018          use_manifest = True
1019          self.manifest = Manifest(a)
1020        elif o == '-M':
1021          # Note that we signal to the cp command to perform a move (copy
1022          # followed by remove) and use directory-move naming rules by passing
1023          # the undocumented (for internal use) -M option when running the cp
1024          # command from mv.py.
1025          perform_mv = True
1026        elif o == '-n':
1027          no_clobber = True
1028        elif o == '-p':
1029          preserve_acl = True
1030        elif o == '-r' or o == '-R':
1031          self.recursion_requested = True
1032        elif o == '-U':
1033          self.skip_unsupported_objects = True
1034        elif o == '-v':
1035          print_ver = True
1036        elif o == '-z':
1037          self.gzip_exts = [x.strip() for x in a.split(',')]
1038    if preserve_acl and canned_acl:
1039      raise CommandException(
1040          'Specifying both the -p and -a options together is invalid.')
1041    if self.all_versions and self.parallel_operations:
1042      raise CommandException(
1043          'The gsutil -m option is not supported with the cp -A flag, to '
1044          'ensure that object version ordering is preserved. Please re-run '
1045          'the command without the -m option.')
1046    return CreateCopyHelperOpts(
1047        perform_mv=perform_mv,
1048        no_clobber=no_clobber,
1049        daisy_chain=daisy_chain,
1050        read_args_from_stdin=read_args_from_stdin,
1051        print_ver=print_ver,
1052        use_manifest=use_manifest,
1053        preserve_acl=preserve_acl,
1054        canned_acl=canned_acl,
1055        skip_unsupported_objects=self.skip_unsupported_objects,
1056        test_callback_file=test_callback_file)
1057