18d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# -*- coding: utf-8 -*-
28d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# Copyright 2014 Google Inc. All Rights Reserved.
38d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi#
48d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# Licensed under the Apache License, Version 2.0 (the "License");
58d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# you may not use this file except in compliance with the License.
68d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# You may obtain a copy of the License at
78d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi#
88d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi#     http://www.apache.org/licenses/LICENSE-2.0
98d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi#
108d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# Unless required by applicable law or agreed to in writing, software
118d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# distributed under the License is distributed on an "AS IS" BASIS,
128d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
138d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# See the License for the specific language governing permissions and
148d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# limitations under the License.
158d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi"""Implementation of Unix-like rsync command."""
168d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
178d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom __future__ import absolute_import
188d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
198d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiimport errno
208d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiimport heapq
218d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiimport io
228d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom itertools import islice
238d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiimport os
248d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiimport re
258d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiimport tempfile
268d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiimport textwrap
278d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiimport traceback
288d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiimport urllib
298d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
308d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom boto import config
318d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiimport crcmod
328d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
338d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib import copy_helper
34cef7893435aa41160dd1255c43cb8498279738ccChris Craikfrom gslib.bucket_listing_ref import BucketListingObject
358d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.cloud_api import NotFoundException
368d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.command import Command
378d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.command import DummyArgChecker
388d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.command_argument import CommandArgument
398d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.copy_helper import CreateCopyHelperOpts
408d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.copy_helper import SkipUnsupportedObjectError
418d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.cs_api_map import ApiSelector
428d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.exception import CommandException
438d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.hashing_helper import CalculateB64EncodedCrc32cFromContents
448d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.hashing_helper import CalculateB64EncodedMd5FromContents
458d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.hashing_helper import SLOW_CRCMOD_WARNING
468d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.plurality_checkable_iterator import PluralityCheckableIterator
478d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.sig_handling import GetCaughtSignals
488d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.sig_handling import RegisterSignalHandler
498d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.storage_url import StorageUrlFromString
508d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.util import GetCloudApiInstance
518d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.util import IsCloudSubdirPlaceholder
528d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.util import TEN_MIB
538d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.util import UsingCrcmodExtension
548d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.util import UTF8
558d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoifrom gslib.wildcard_iterator import CreateWildcardIterator
568d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
578d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
# Usage synopsis for the rsync command. It is also spliced verbatim into the
# SYNOPSIS section of _DETAILED_HELP_TEXT below.
_SYNOPSIS = """
  gsutil rsync [-c] [-C] [-d] [-e] [-n] [-p] [-r] [-U] [-x] src_url dst_url
"""

# Full help text for the rsync command. Section headings use <B>...</B> markup.
# This is a non-raw string, so the regex examples under "-x pattern" write
# '\\.' in order to render a single backslash in the displayed text.
# NOTE(review): the code that consumes this text is outside this chunk --
# confirm how the <B> markup and blank-line layout are rendered before
# reflowing anything.
_DETAILED_HELP_TEXT = ("""
<B>SYNOPSIS</B>
""" + _SYNOPSIS + """


<B>DESCRIPTION</B>
  The gsutil rsync command makes the contents under dst_url the same as the
  contents under src_url, by copying any missing files/objects, and (if the
  -d option is specified) deleting any extra files/objects. For example, to
  make gs://mybucket/data match the contents of the local directory "data"
  you could do:

    gsutil rsync -d data gs://mybucket/data

  To recurse into directories use the -r option:

    gsutil rsync -d -r data gs://mybucket/data

  To copy only new/changed files without deleting extra files from
  gs://mybucket/data leave off the -d option:

    gsutil rsync -r data gs://mybucket/data

  If you have a large number of objects to synchronize you might want to use the
  gsutil -m option, to perform parallel (multi-threaded/multi-processing)
  synchronization:

    gsutil -m rsync -d -r data gs://mybucket/data

  The -m option typically will provide a large performance boost if either the
  source or destination (or both) is a cloud URL. If both source and
  destination are file URLs the -m option will typically thrash the disk and
  slow synchronization down.

  To make the local directory "data" the same as the contents of
  gs://mybucket/data:

    gsutil rsync -d -r gs://mybucket/data data

  To make the contents of gs://mybucket2 the same as gs://mybucket1:

    gsutil rsync -d -r gs://mybucket1 gs://mybucket2

  You can also mirror data across local directories:

    gsutil rsync -d -r dir1 dir2

  To mirror your content across clouds:

    gsutil rsync -d -r gs://my-gs-bucket s3://my-s3-bucket

  Note: If you are synchronizing a large amount of data between clouds you might
  consider setting up a
  `Google Compute Engine <https://cloud.google.com/products/compute-engine>`_
  account and running gsutil there. Since cross-provider gsutil data transfers
  flow through the machine where gsutil is running, doing this can make your
  transfer run significantly faster than running gsutil on your local
  workstation.


<B>BE CAREFUL WHEN USING -d OPTION!</B>
  The rsync -d option is very useful and commonly used, because it provides a
  means of making the contents of a destination bucket or directory match those
  of a source bucket or directory. However, please exercise caution when you
  use this option: It's possible to delete large amounts of data accidentally
  if, for example, you erroneously reverse source and destination. For example,
  if you meant to synchronize a local directory from a bucket in the cloud but
  instead run the command:

    gsutil -m rsync -r -d ./your-dir gs://your-bucket

  and your-dir is currently empty, you will quickly delete all of the objects in
  gs://your-bucket.

  You can also cause large amounts of data to be lost quickly by specifying a
  subdirectory of the destination as the source of an rsync. For example, the
  command:

    gsutil -m rsync -r -d gs://your-bucket/data gs://your-bucket

  would cause most or all of the objects in gs://your-bucket to be deleted
  (some objects may survive if there are any with names that sort lower than
  "data" under gs://your-bucket/data).

  In addition to paying careful attention to the source and destination you
  specify with the rsync command, there are two more safety measures you can
  take when using gsutil rsync -d:

    1. Try running the command with the rsync -n option first, to see what it
       would do without actually performing the operations. For example, if
       you run the command:

         gsutil -m rsync -r -d -n gs://your-bucket/data gs://your-bucket

       it will be immediately evident that running that command without the -n
       option would cause many objects to be deleted.

    2. Enable object versioning in your bucket, which will allow you to restore
       objects if you accidentally delete them. For more details see
       "gsutil help versions".


<B>IMPACT OF BUCKET LISTING EVENTUAL CONSISTENCY</B>
  The rsync command operates by listing the source and destination URLs, and
  then performing copy and remove operations according to the differences
  between these listings. Because bucket listing is eventually (not strongly)
  consistent, if you upload new objects or delete objects from a bucket and then
  immediately run gsutil rsync with that bucket as the source or destination,
  it's possible the rsync command will not see the recent updates and thus
  synchronize incorrectly. You can rerun the rsync operation again later to
  correct the incorrect synchronization.


<B>CHECKSUM VALIDATION AND FAILURE HANDLING</B>
  At the end of every upload or download, the gsutil rsync command validates
  that the checksum of the source file/object matches the checksum of the
  destination file/object. If the checksums do not match, gsutil will delete
  the invalid copy and print a warning message. This very rarely happens, but
  if it does, please contact gs-team@google.com.

  The rsync command will retry when failures occur, but if enough failures
  happen during a particular copy or delete operation the command will skip that
  object and move on. At the end of the synchronization run if any failures were
  not successfully retried, the rsync command will report the count of failures,
  and exit with non-zero status. At this point you can run the rsync command
  again, and it will attempt any remaining needed copy and/or delete operations.

  Note that there are cases where retrying will never succeed, such as if you
  don't have write permission to the destination bucket or if the destination
  path for some objects is longer than the maximum allowed length.

  For more details about gsutil's retry handling, please see
  "gsutil help retries".


<B>CHANGE DETECTION ALGORITHM</B>
  To determine if a file or object has changed gsutil rsync first checks whether
  the source and destination sizes match. If they match, it next checks if their
  checksums match, using checksums if available (see below). Unlike the Unix
  rsync command, gsutil rsync does not use timestamps to determine if the
  file/object changed, because the GCS API does not permit the caller to set an
  object's timestamp (hence, timestamps of identical files/objects cannot be
  made to match).

  Checksums will not be available in two cases:

  1. When synchronizing to or from a file system. By default, gsutil does not
     checksum files, because of the slowdown caused when working with large
     files. You can cause gsutil to checksum files by using the gsutil rsync -c
     option, at the cost of increased local disk I/O and run time when working
     with large files. You should consider using the -c option if your files can
     change without changing sizes (e.g., if you have files that contain fixed
     width data, such as timestamps).

  2. When comparing composite GCS objects with objects at a cloud provider that
     does not support CRC32C (which is the only checksum available for composite
     objects). See 'gsutil help compose' for details about composite objects.


<B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B>
  If both the source and destination URL are cloud URLs from the same provider,
  gsutil copies data "in the cloud" (i.e., without downloading to and uploading
  from the machine where you run gsutil). In addition to the performance and
  cost advantages of doing this, copying in the cloud preserves metadata (like
  Content-Type and Cache-Control). In contrast, when you download data from the
  cloud it ends up in a file, which has no associated metadata. Thus, unless you
  have some way to hold on to or re-create that metadata, synchronizing a bucket
  to a directory in the local file system will not retain the metadata.

  Note that by default, the gsutil rsync command does not copy the ACLs of
  objects being synchronized and instead will use the default bucket ACL (see
  "gsutil help defacl"). You can override this behavior with the -p option (see
  OPTIONS below).


<B>SLOW CHECKSUMS</B>
  If you find that CRC32C checksum computation runs slowly, this is likely
  because you don't have a compiled CRC32c on your system. Try running:

    gsutil ver -l

  If the output contains:

    compiled crcmod: False

  you are running a Python library for computing CRC32C, which is much slower
  than using the compiled code. For information on getting a compiled CRC32C
  implementation, see 'gsutil help crc32c'.


<B>LIMITATIONS</B>
  1. The gsutil rsync command doesn't make the destination object's timestamps
     match those of the source object (it can't; timestamp setting is not
     allowed by the GCS API).

  2. The gsutil rsync command considers only the current object generations in
     the source and destination buckets when deciding what to copy / delete. If
     versioning is enabled in the destination bucket then gsutil rsync's
     overwriting or deleting objects will end up creating versions, but the
     command doesn't try to make the archived generations match in the source
     and destination buckets.



<B>OPTIONS</B>
  -c            Causes the rsync command to compute checksums for files if the
                size of source and destination match, and then compare
                checksums.  This option increases local disk I/O and run time
                if either src_url or dst_url are on the local file system.

  -C            If an error occurs, continue to attempt to copy the remaining
                files. If errors occurred, gsutil's exit status will be non-zero
                even if this flag is set. This option is implicitly set when
                running "gsutil -m rsync...".  Note: -C only applies to the
                actual copying operation. If an error occurs while iterating
                over the files in the local directory (e.g., invalid Unicode
                file name) gsutil will print an error message and abort.

  -d            Delete extra files under dst_url not found under src_url. By
                default extra files are not deleted. Note: this option can
                delete data quickly if you specify the wrong source/destination
                combination. See the help section above,
                "BE CAREFUL WHEN USING -d OPTION!".

  -e            Exclude symlinks. When specified, symbolic links will be
                ignored.

  -n            Causes rsync to run in "dry run" mode, i.e., just outputting
                what would be copied or deleted without actually doing any
                copying/deleting.

  -p            Causes ACLs to be preserved when synchronizing in the cloud.
                Note that this option has performance and cost implications when
                using the XML API, as it requires separate HTTP calls for
                interacting with ACLs. The performance issue can be mitigated to
                some degree by using gsutil -m rsync to cause parallel
                synchronization. Also, this option only works if you have OWNER
                access to all of the objects that are copied.

                You can avoid the additional performance and cost of using
                rsync -p if you want all objects in the destination bucket to
                end up with the same ACL by setting a default object ACL on that
                bucket instead of using rsync -p. See 'gsutil help defacl'.

  -R, -r        Causes directories, buckets, and bucket subdirectories to be
                synchronized recursively. If you neglect to use this option
                gsutil will make only the top-level directory in the source
                and destination URLs match, skipping any sub-directories.

  -U            Skip objects with unsupported object types instead of failing.
                Unsupported object types are Amazon S3 Objects in the GLACIER
                storage class.

  -x pattern    Causes files/objects matching pattern to be excluded, i.e., any
                matching files/objects will not be copied or deleted. Note that
                the pattern is a Python regular expression, not a wildcard (so,
                matching any string ending in 'abc' would be specified using
                '.*abc' rather than '*abc'). Note also that the exclude path is
                always relative (similar to Unix rsync or tar exclude options).
                For example, if you run the command:

                  gsutil rsync -x 'data./.*\\.txt' dir gs://my-bucket

                it will skip the file dir/data1/a.txt.

                You can use regex alternation to specify multiple exclusions,
                for example:

                  gsutil rsync -x '.*\\.txt|.*\\.jpg' dir gs://my-bucket
""")
3328d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
3338d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
class _DiffAction(object):
  """Enumerates the diff_action values carried by a _DiffToApply."""
  COPY = 'copy'
  # Diffs with this action have no source URL populated.
  REMOVE = 'remove'
3378d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
3388d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
# Placeholder that _BuildTmpOutputLine writes for a crc32c/md5 field that is
# not available.
_NA = '-'
# NOTE(review): not referenced in this part of the file; presumably a buffer
# size in bytes -- confirm against its users.
_OUTPUT_BUFFER_SIZE = 64 * 1024
# _FieldedListingIterator logs a progress message each time this many entries
# have been listed.
_PROGRESS_REPORT_LISTING_COUNT = 10000


# Tracks files we need to clean up at end or if interrupted.
_tmp_files = []
3468d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
3478d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
3488d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# pylint: disable=unused-argument
def _HandleSignals(signal_num, cur_stack_frame):
  """Called when rsync command is killed with SIGINT, SIGQUIT or SIGTERM.

  Deletes the temp files tracked in _tmp_files by calling CleanUpTempFiles.

  Args:
    signal_num: Signal number passed to the handler (unused).
    cur_stack_frame: Stack frame passed to the handler (unused).
  """
  CleanUpTempFiles()
3528d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
3538d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def CleanUpTempFiles():
  """Cleans up temp files.

  This function allows the main (RunCommand) function to clean up at end of
  operation, or if gsutil rsync is interrupted (e.g., via ^C). This is necessary
  because tempfile.NamedTemporaryFile doesn't allow the created file to be
  re-opened in read mode on Windows, so we have to use tempfile.mkstemp, which
  doesn't automatically delete temp files.

  Cleanup is best-effort and never raises for ordinary errors: a file that
  cannot be removed (for example because it is already gone) is skipped, and
  the remaining files are still deleted.
  """
  for fname in _tmp_files:
    # Handle failures per file; a single try around the whole loop would stop
    # at the first file that can't be removed and leak all the later ones.
    try:
      os.unlink(fname)
    except Exception:  # pylint: disable=broad-except
      pass
3688d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
3698d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
class _DiffToApply(object):
  """Holds the URLs and the action that describe one pending diff."""

  def __init__(self, src_url_str, dst_url_str, diff_action):
    """Records what has to be done for a single object.

    Args:
      src_url_str: Source URL string; None when diff_action is REMOVE.
      dst_url_str: Destination URL string.
      diff_action: The _DiffAction to apply.
    """
    (self.src_url_str, self.dst_url_str, self.diff_action) = (
        src_url_str, dst_url_str, diff_action)
3848d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
3858d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def _DiffToApplyArgChecker(command_instance, diff_to_apply):
  """Decides whether a diff should be applied, honoring the -e flag.

  Args:
    command_instance: Command instance; its exclude_symlinks and logger
                      attributes are used.
    diff_to_apply: The _DiffToApply under consideration.

  Returns:
    False if symlinks are being excluded and the source of the diff is a
    local symbolic link; True in every other case.
  """
  # REMOVE diffs carry no src URL, so there is nothing to inspect for them.
  is_removal = diff_to_apply.diff_action == _DiffAction.REMOVE
  if is_removal or not command_instance.exclude_symlinks:
    return True
  src_url = StorageUrlFromString(diff_to_apply.src_url_str)
  src_is_symlink = src_url.IsFileUrl() and os.path.islink(src_url.object_name)
  if not src_is_symlink:
    return True
  command_instance.logger.info('Skipping symbolic link %s...', src_url)
  return False
3978d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
3988d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def _ComputeNeededFileChecksums(logger, src_url_str, src_size, src_crc32c,
                                src_md5, dst_url_str, dst_size, dst_crc32c,
                                dst_md5):
  """Computes any file checksums needed by _ObjectsMatch.

  Checksums are only computed for file URLs. For a file, CRC32C is computed
  when the other side has a CRC32C (or is also a file); otherwise MD5 is
  computed when the other side has an MD5. A progress message is logged
  before hashing a file larger than TEN_MIB.

  Args:
    logger: logging.logger for outputting log messages.
    src_url_str: Source URL string.
    src_size: Source size
    src_crc32c: Source CRC32c, or _NA if not available.
    src_md5: Source MD5, or _NA if not available.
    dst_url_str: Destination URL string.
    dst_size: Destination size
    dst_crc32c: Destination CRC32c, or _NA if not available.
    dst_md5: Destination MD5, or _NA if not available.

  Returns:
    (src_crc32c, src_md5, dst_crc32c, dst_md5)
  """
  src_url = StorageUrlFromString(src_url_str)
  dst_url = StorageUrlFromString(dst_url_str)
  if src_url.IsFileUrl():
    if dst_crc32c != _NA or dst_url.IsFileUrl():
      if src_size > TEN_MIB:
        logger.info('Computing CRC32C for %s...', src_url_str)
      with open(src_url.object_name, 'rb') as fp:
        src_crc32c = CalculateB64EncodedCrc32cFromContents(fp)
    # No need to re-test dst_url.IsFileUrl() here: a file destination is
    # always handled by the branch above.
    elif dst_md5 != _NA:
      if src_size > TEN_MIB:
        logger.info('Computing MD5 for %s...', src_url_str)
      with open(src_url.object_name, 'rb') as fp:
        src_md5 = CalculateB64EncodedMd5FromContents(fp)
  if dst_url.IsFileUrl():
    # src_crc32c/src_md5 may have just been computed above when the source is
    # a file too, so the same kind of checksum gets computed on both sides.
    if src_crc32c != _NA:
      if dst_size > TEN_MIB:
        logger.info('Computing CRC32C for %s...', dst_url_str)
      with open(dst_url.object_name, 'rb') as fp:
        dst_crc32c = CalculateB64EncodedCrc32cFromContents(fp)
    elif src_md5 != _NA:
      if dst_size > TEN_MIB:
        logger.info('Computing MD5 for %s...', dst_url_str)
      with open(dst_url.object_name, 'rb') as fp:
        dst_md5 = CalculateB64EncodedMd5FromContents(fp)
  return (src_crc32c, src_md5, dst_crc32c, dst_md5)
4438d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
4448d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def _ListUrlRootFunc(cls, args_tuple, thread_state=None):
  """Worker function for listing files/objects under a URL to be sync'd.

  Outputs sorted list to out_filename, formatted per _BuildTmpOutputLine. We
  sort the listed URLs because we don't want to depend on consistent sort
  order across file systems and cloud providers.

  If listing fails with a non-retryable exception, the error is logged and
  cls.non_retryable_listing_failures is set to 1 instead of re-raising.

  Args:
    cls: Command instance.
    args_tuple: (base_url_str, out_filename, desc), where base_url_str is
                top-level URL string to list; out_filename is name of file to
                which sorted output should be written; desc is 'source' or
                'destination'.
    thread_state: gsutil Cloud API instance to use.
  """
  gsutil_api = GetCloudApiInstance(cls, thread_state=thread_state)
  (base_url_str, out_filename, desc) = args_tuple
  # We sort while iterating over base_url_str, allowing parallelism of batched
  # sorting with collecting the listing.
  # The with statement closes the output file on every exit path, including
  # KeyboardInterrupt/SystemExit, which the except clause below doesn't catch.
  with io.open(out_filename, mode='w', encoding=UTF8) as out_file:
    try:
      _BatchSort(_FieldedListingIterator(cls, gsutil_api, base_url_str, desc),
                 out_file)
    except Exception as e:  # pylint: disable=broad-except
      # Abandon rsync if an exception percolates up to this layer - retryable
      # exceptions are handled in the lower layers, so we got a non-retryable
      # exception (like 404 bucket not found) and proceeding would either be
      # futile or could result in data loss - for example:
      #     gsutil rsync -d gs://non-existent-bucket ./localdir
      # would delete files from localdir.
      # The logger does the formatting, so a formatting problem cannot raise
      # from inside this handler.
      cls.logger.error(
          'Caught non-retryable exception while listing %s: %s',
          base_url_str, e)
      cls.non_retryable_listing_failures = 1
4808d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
4818d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def _LocalDirIterator(base_url):
  """Generates a BucketListingObject per regular file directly in a directory.

  This is used in place of WildcardIterator for non-recursive listing of a
  local directory: the glob-based implementation behind WildcardIterator
  leaves out "dot" files, which must not be skipped when synchronizing to or
  from a local directory.

  Args:
    base_url: URL for the directory over which to iterate.

  Yields:
    BucketListingObject for each file in the directory.
  """
  dir_path = base_url.object_name
  for entry in os.listdir(dir_path):
    entry_path = os.path.join(dir_path, entry)
    # Only files are listed; sub-directories are not descended into.
    if not os.path.isfile(entry_path):
      continue
    yield BucketListingObject(StorageUrlFromString(entry_path), None)
500cef7893435aa41160dd1255c43cb8498279738ccChris Craik
501cef7893435aa41160dd1255c43cb8498279738ccChris Craik
def _FieldedListingIterator(cls, gsutil_api, base_url_str, desc):
  """Generates one _BuildTmpOutputLine-formatted line per listed file/object.

  Cloud sub-directory placeholders, local symlinks (when cls.exclude_symlinks
  is set) and entries whose path relative to base_url_str matches
  cls.exclude_pattern are left out of the output.

  Args:
    cls: Command instance.
    gsutil_api: gsutil Cloud API instance to use for bucket listing.
    base_url_str: The top-level URL string over which to iterate.
    desc: 'source' or 'destination'.

  Yields:
    Output line formatted per _BuildTmpOutputLine.
  """
  base_url = StorageUrlFromString(base_url_str)
  if base_url.scheme != 'file' or cls.recursion_requested:
    glob_suffix = '/**' if cls.recursion_requested else '/*'
    wildcard = base_url_str.rstrip('/\\') + glob_suffix
    listing = CreateWildcardIterator(
        wildcard, gsutil_api, debug=cls.debug, project_id=cls.project_id)
    # Ask only for the fields we need, which reduces bandwidth usage.
    iterator = listing.IterObjects(
        bucket_listing_fields=['crc32c', 'md5Hash', 'name', 'size'])
  else:
    # Non-recursive listing of a local directory.
    iterator = _LocalDirIterator(base_url)

  def _IsSkipped(blr):
    """Returns True if blr must be left out of the listing."""
    url = blr.storage_url
    # GUI tools such as the GCS web console create placeholder objects ending
    # with '/' for empty directories, and sometimes leave them behind after
    # objects have been written "under" the directory. Unless they are
    # filtered out, an rsync from GCS to a local directory runs into a
    # directory/file conflict (e.g., downloading an object called "mydata/"
    # where the local directory "mydata" exists). No message is logged for
    # these any more because it caused customer confusion.
    if IsCloudSubdirPlaceholder(url, blr=blr):
      return True
    if (cls.exclude_symlinks and url.IsFileUrl()
        and os.path.islink(url.object_name)):
      return True
    if not cls.exclude_pattern:
      return False
    # The exclude pattern is matched against the path below base_url_str.
    rel_path = url.url_string[len(base_url_str):]
    if rel_path.startswith(url.delim):
      rel_path = rel_path[1:]
    return bool(cls.exclude_pattern.match(rel_path))

  listed_count = 0
  for blr in iterator:
    if _IsSkipped(blr):
      continue
    listed_count += 1
    if not listed_count % _PROGRESS_REPORT_LISTING_COUNT:
      cls.logger.info('At %s listing %d...', desc, listed_count)
    yield _BuildTmpOutputLine(blr)
5568d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
5578d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def _BuildTmpOutputLine(blr):
  """Formats the temp-file line describing one BucketListingRef.

  Args:
    blr: The BucketListingRef.

  Returns:
    The output line, formatted as _EncodeUrl(URL)<sp>size<sp>crc32c<sp>md5
    where crc32c will only be present for GCS URLs, and md5 will only be
    present for cloud URLs that aren't composite objects. A missing field is
    populated with '-'.

  Raises:
    CommandException: if the URL is neither a file URL nor a cloud URL.
  """
  url = blr.storage_url
  if url.IsFileUrl():
    # No checksums are recorded for local files at listing time.
    size = os.path.getsize(url.object_name)
    crc32c = md5 = _NA
  elif url.IsCloudUrl():
    listed_object = blr.root_object
    size = listed_object.size
    crc32c = listed_object.crc32c or _NA
    md5 = listed_object.md5Hash or _NA
  else:
    raise CommandException('Got unexpected URL type (%s)' % url.scheme)
  fields = (_EncodeUrl(url.url_string), size, crc32c, md5)
  return '%s %d %s %s\n' % fields
5828d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
5838d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def _EncodeUrl(url_string):
  """Quote-plus encodes the UTF8 encoding of url_string.

  Every URL encoding in this module goes through this function.

  Args:
    url_string: String URL to encode.

  Returns:
    encoded URL.
  """
  utf8_url = url_string.encode(UTF8)
  return urllib.quote_plus(utf8_url)
5968d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
5978d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def _DecodeUrl(enc_url_string):
  """Reverses _EncodeUrl: unquotes the string, then decodes it from UTF8.

  Args:
    enc_url_string: String URL to decode.

  Returns:
    decoded URL.
  """
  unquoted_url = urllib.unquote_plus(enc_url_string)
  return unquoted_url.decode(UTF8)
6088d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
6098d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
6108d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi# pylint: disable=bare-except
def _BatchSort(in_iter, out_file):
  """Sorts input lines from in_iter and outputs to out_file.

  Sorts in batches as input arrives, so input file does not need to be loaded
  into memory all at once. Derived from Python Recipe 466302: Sorting big
  files the Python 2.4 way by Nicolas Lehuen.

  Each batch of up to rsync_buffer_lines lines is sorted and written to its
  own chunk file (named after out_file with a numeric suffix); the chunk
  files are then merged into out_file and deleted.

  Sorted format is per _BuildTmpOutputLine. We're sorting on the entire line
  when we could just sort on the first record (URL); but the sort order is
  identical either way.

  Args:
    in_iter: Input iterator.
    out_file: Output file.

  Raises:
    CommandException: if too many open file handles were needed (EMFILE).
    IOError: any other I/O error is re-raised unchanged.
  """
  # Note: If chunk_files gets very large we can run out of open FDs. See .boto
  # file comments about rsync_buffer_lines. If increasing rsync_buffer_lines
  # doesn't suffice (e.g., for someone synchronizing with a really large
  # bucket), an option would be to make gsutil merge in passes, never
  # opening all chunk files simultaneously.
  buffer_size = config.getint('GSUtil', 'rsync_buffer_lines', 32000)
  chunk_files = []
  try:
    while True:
      current_chunk = sorted(islice(in_iter, buffer_size))
      if not current_chunk:
        break
      output_chunk = io.open('%s-%06i' % (out_file.name, len(chunk_files)),
                             mode='w+', encoding=UTF8)
      chunk_files.append(output_chunk)
      # Use write(), not writelines(): the chunk is a single joined string,
      # and writelines() would iterate over it one character at a time.
      output_chunk.write(unicode(''.join(current_chunk)))
      output_chunk.flush()
      # Rewind so the merge below reads the chunk from its beginning.
      output_chunk.seek(0)
    out_file.writelines(heapq.merge(*chunk_files))
  except IOError as e:
    if e.errno == errno.EMFILE:
      raise CommandException('\n'.join(textwrap.wrap(
          'Synchronization failed because too many open file handles were '
          'needed while building synchronization state. Please see the '
          'comments about rsync_buffer_lines in your .boto config file for a '
          'possible way to address this problem.')))
    raise
  finally:
    # Best-effort cleanup. Closing and removing are attempted independently so
    # that a failed close doesn't prevent the chunk file from being removed.
    for chunk_file in chunk_files:
      try:
        chunk_file.close()
      except Exception:  # pylint: disable=broad-except
        pass
      try:
        os.remove(chunk_file.name)
      except Exception:  # pylint: disable=broad-except
        pass
6608d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
6618d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
class _DiffIterator(object):
  """Iterator yielding sequence of _DiffToApply objects.

  Construction lists the source and destination trees into two sorted temp
  files (via command_obj.Apply with _ListUrlRootFunc). Iterating then walks
  both sorted listings in lockstep, yielding a COPY for each source entry that
  is missing from, or differs from, the destination and (when delete_extras is
  set) a REMOVE for each destination entry with no source counterpart.
  """

  def __init__(self, command_obj, base_src_url, base_dst_url):
    """Builds the sorted source and destination listings.

    Args:
      command_obj: The calling command; supplies compute_file_checksums,
                   delete_extras, recursion_requested, logger and Apply.
      base_src_url: URL object for the root of the source tree.
      base_dst_url: URL object for the root of the destination tree.

    Raises:
      CommandException: if command_obj.non_retryable_listing_failures is
                        non-zero after the listing pass.
    """
    self.command_obj = command_obj
    self.compute_file_checksums = command_obj.compute_file_checksums
    self.delete_extras = command_obj.delete_extras
    self.recursion_requested = command_obj.recursion_requested
    self.logger = self.command_obj.logger
    self.base_src_url = base_src_url
    self.base_dst_url = base_dst_url
    self.logger.info('Building synchronization state...')

    (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-src-')
    _tmp_files.append(self.sorted_list_src_file_name)
    (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-dst-')
    _tmp_files.append(self.sorted_list_dst_file_name)
    # Close the file handles; the file will be opened in write mode by
    # _ListUrlRootFunc.
    os.close(src_fh)
    os.close(dst_fh)

    # Build sorted lists of src and dst URLs in parallel. To do this, pass args
    # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc)
    # where base_url_str is the starting URL string for listing.
    args_iter = iter([
        (self.base_src_url.url_string, self.sorted_list_src_file_name,
         'source'),
        (self.base_dst_url.url_string, self.sorted_list_dst_file_name,
         'destination')
    ])

    # Non-zero after the listing pass means a listing failed in a way that
    # cannot be retried (presumably updated by the listing workers, since it
    # is passed to Apply as a shared attribute).
    command_obj.non_retryable_listing_failures = 0
    shared_attrs = ['non_retryable_listing_failures']
    command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler,
                      shared_attrs, arg_checker=DummyArgChecker,
                      parallel_operations_override=True,
                      fail_on_error=True)

    if command_obj.non_retryable_listing_failures:
      raise CommandException('Caught non-retryable exception - aborting rsync')

    self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r')
    self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r')

    # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
    self.sorted_src_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_src_file))
    self.sorted_dst_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_dst_file))

  def _ParseTmpFileLine(self, line):
    """Parses output from _BuildTmpOutputLine.

    Parses into tuple:
      (URL, size, crc32c, md5)
    where crc32c and/or md5 can be _NA.

    Args:
      line: The line to parse; must contain exactly four whitespace-separated
            fields.

    Returns:
      Parsed tuple: (url, size, crc32c, md5), with url decoded via _DecodeUrl
      and size converted to int.
    """
    (encoded_url, size, crc32c, md5) = line.split()
    return (_DecodeUrl(encoded_url), int(size), crc32c, md5.strip())

  def _WarnIfMissingCloudHash(self, url_str, crc32c, md5):
    """Warns if given url_str is a cloud URL and is missing both crc32c and md5.

    Args:
      url_str: URL string to check (called for source and destination).
      crc32c: CRC32c recorded for that URL, or _NA.
      md5: MD5 recorded for that URL, or _NA.

    Returns:
      True if issued warning.
    """
    # One known way this can currently happen is when rsync'ing objects larger
    # than 5 GB from S3 (for which the etag is not an MD5).
    if (StorageUrlFromString(url_str).IsCloudUrl()
        and crc32c == _NA and md5 == _NA):
      self.logger.warn(
          'Found no hashes to validate %s. Integrity cannot be assured without '
          'hashes.', url_str)
      return True
    return False

  def _ObjectsMatch(self, src_url_str, src_size, src_crc32c, src_md5,
                    dst_url_str, dst_size, dst_crc32c, dst_md5):
    """Returns True if src and dst objects are the same.

    Uses size plus whatever checksums are available: sizes must be equal, then
    md5 is compared if both sides have one, otherwise crc32c if both sides
    have one. With no common checksum, equal size alone counts as a match.

    Args:
      src_url_str: Source URL string.
      src_size: Source size
      src_crc32c: Source CRC32c.
      src_md5: Source MD5.
      dst_url_str: Destination URL string.
      dst_size: Destination size
      dst_crc32c: Destination CRC32c.
      dst_md5: Destination MD5.

    Returns:
      True/False.
    """
    # Note: This function is called from __iter__, which is called from the
    # Command.Apply driver. Thus, all checksum computation will be run in a
    # single thread, which is good (having multiple threads concurrently
    # computing checksums would thrash the disk).
    if src_size != dst_size:
      return False
    if self.compute_file_checksums:
      (src_crc32c, src_md5, dst_crc32c, dst_md5) = _ComputeNeededFileChecksums(
          self.logger, src_url_str, src_size, src_crc32c, src_md5, dst_url_str,
          dst_size, dst_crc32c, dst_md5)
    if src_md5 != _NA and dst_md5 != _NA:
      self.logger.debug('Comparing md5 for %s and %s', src_url_str, dst_url_str)
      return src_md5 == dst_md5
    if src_crc32c != _NA and dst_crc32c != _NA:
      self.logger.debug(
          'Comparing crc32c for %s and %s', src_url_str, dst_url_str)
      return src_crc32c == dst_crc32c
    # Warn at most once per pair: only check dst if src did not already warn.
    if not self._WarnIfMissingCloudHash(src_url_str, src_crc32c, src_md5):
      self._WarnIfMissingCloudHash(dst_url_str, dst_crc32c, dst_md5)
    # Without checksums to compare we depend only on basic size comparison.
    return True

  def __iter__(self):
    """Iterates over src/dst URLs and produces a _DiffToApply sequence.

    Yields:
      The _DiffToApply.
    """
    # Strip trailing slashes, if any, so we compute tail length against
    # consistent position regardless of whether trailing slashes were included
    # or not in URL.
    base_src_url_len = len(self.base_src_url.url_string.rstrip('/\\'))
    base_dst_url_len = len(self.base_dst_url.url_string.rstrip('/\\'))
    src_url_str = dst_url_str = None
    # Invariant: After each yield, the URLs in src_url_str, dst_url_str,
    # self.sorted_src_urls_it, and self.sorted_dst_urls_it are not yet
    # processed. Each time we encounter None in src_url_str or dst_url_str we
    # populate from the respective iterator, and we reset one or the other value
    # to None after yielding an action that disposes of that URL.
    while not self.sorted_src_urls_it.IsEmpty() or src_url_str is not None:
      if src_url_str is None:
        (src_url_str, src_size, src_crc32c, src_md5) = self._ParseTmpFileLine(
            self.sorted_src_urls_it.next())
        # Skip past base URL and normalize slashes so we can compare across
        # clouds/file systems (including Windows).
        src_url_str_to_check = _EncodeUrl(
            src_url_str[base_src_url_len:].replace('\\', '/'))
        dst_url_str_would_copy_to = copy_helper.ConstructDstUrl(
            self.base_src_url, StorageUrlFromString(src_url_str), True, True,
            self.base_dst_url, False, self.recursion_requested).url_string
      if dst_url_str is None:
        # Only treat dst as exhausted when there is no pending dst entry. A
        # dst URL read on an earlier pass may still be waiting in dst_url_str
        # after the underlying iterator has run dry, and it must still be
        # compared against the current src (otherwise a matching object would
        # be re-copied and, with -d, subsequently removed).
        if self.sorted_dst_urls_it.IsEmpty():
          # We've reached end of dst URLs, so copy src to dst.
          yield _DiffToApply(
              src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
          src_url_str = None
          continue
        (dst_url_str, dst_size, dst_crc32c, dst_md5) = (
            self._ParseTmpFileLine(self.sorted_dst_urls_it.next()))
        # Skip past base URL and normalize slashes so we can compare across
        # clouds/file systems (including Windows).
        dst_url_str_to_check = _EncodeUrl(
            dst_url_str[base_dst_url_len:].replace('\\', '/'))

      if src_url_str_to_check < dst_url_str_to_check:
        # There's no dst object corresponding to src object, so copy src to dst.
        yield _DiffToApply(
            src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
        src_url_str = None
      elif src_url_str_to_check > dst_url_str_to_check:
        # dst object without a corresponding src object, so remove dst if -d
        # option was specified.
        if self.delete_extras:
          yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
        dst_url_str = None
      else:
        # There is a dst object corresponding to src object, so copy only if
        # the objects don't match; otherwise continue iterating without
        # yielding a _DiffToApply.
        if not self._ObjectsMatch(
            src_url_str, src_size, src_crc32c, src_md5,
            dst_url_str, dst_size, dst_crc32c, dst_md5):
          yield _DiffToApply(src_url_str, dst_url_str, _DiffAction.COPY)
        src_url_str = None
        dst_url_str = None

    # If -d option specified any files/objects left in dst iteration should be
    # removed.
    if not self.delete_extras:
      return
    if dst_url_str is not None:
      yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
      dst_url_str = None
    for line in self.sorted_dst_urls_it:
      (dst_url_str, _, _, _) = self._ParseTmpFileLine(line)
      yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
8718d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def _RsyncFunc(cls, diff_to_apply, thread_state=None):
  """Worker function for performing the actual copy and remove operations.

  Args:
    cls: Calling command object; supplies dryrun, logger and headers.
    diff_to_apply: _DiffToApply naming the action (COPY or REMOVE) and the
                   URL strings it applies to.
    thread_state: Passed through to GetCloudApiInstance.

  Raises:
    CommandException: if diff_to_apply.diff_action is neither REMOVE nor COPY.
  """
  gsutil_api = GetCloudApiInstance(cls, thread_state=thread_state)
  dst_url_str = diff_to_apply.dst_url_str
  dst_url = StorageUrlFromString(dst_url_str)
  if diff_to_apply.diff_action == _DiffAction.REMOVE:
    if cls.dryrun:
      cls.logger.info('Would remove %s', dst_url)
    else:
      cls.logger.info('Removing %s', dst_url)
      if dst_url.IsFileUrl():
        os.unlink(dst_url.object_name)
      else:
        try:
          gsutil_api.DeleteObject(
              dst_url.bucket_name, dst_url.object_name,
              generation=dst_url.generation, provider=dst_url.scheme)
        except NotFoundException:
          # If the object happened to be deleted by an external process, this
          # is fine because it moves us closer to the desired state.
          pass
  elif diff_to_apply.diff_action == _DiffAction.COPY:
    src_url_str = diff_to_apply.src_url_str
    src_url = StorageUrlFromString(src_url_str)
    if cls.dryrun:
      cls.logger.info('Would copy %s to %s', src_url, dst_url)
    else:
      try:
        copy_helper.PerformCopy(cls.logger, src_url, dst_url, gsutil_api, cls,
                                _RsyncExceptionHandler,
                                headers=cls.headers)
      except SkipUnsupportedObjectError as e:
        cls.logger.info('Skipping item %s with unsupported object type %s',
                        src_url, e.unsupported_type)

  else:
    # Format with %s rather than %d: an unexpected action is not guaranteed to
    # be an integer, and %d would raise TypeError instead of reporting it.
    raise CommandException('Got unexpected DiffAction (%s)'
                           % diff_to_apply.diff_action)
9108d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9118d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def _RootListingExceptionHandler(cls, e):
  """Logs an exception raised while listing the URLs to sync.

  Args:
    cls: Calling command object; only its logger is used.
    e: The exception that was caught.
  """
  error_text = str(e)
  cls.logger.error(error_text)
9158d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9168d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
def _RsyncExceptionHandler(cls, e):
  """Records a failed sync operation so status can be reported at the end.

  Logs the exception at error level, bumps cls.op_failure_count, and logs the
  current traceback at debug level.

  Args:
    cls: Calling command object; supplies logger and op_failure_count.
    e: The exception that was caught.
  """
  log = cls.logger
  log.error(str(e))
  cls.op_failure_count += 1
  stack_text = traceback.format_exc()
  log.debug('\n\nEncountered exception while syncing:\n%s\n', stack_text)
9238d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9248d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9258d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoiclass RsyncCommand(Command):
9268d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  """Implementation of gsutil rsync command."""
9278d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9288d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  # Command specification. See base class for documentation.
9298d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  command_spec = Command.CreateCommandSpec(
9308d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      'rsync',
9318d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      command_name_aliases=[],
9328d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      usage_synopsis=_SYNOPSIS,
9338d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      min_args=2,
9348d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      max_args=2,
9358d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      supported_sub_args='cCdenprRUx:',
9368d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      file_url_ok=True,
9378d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      provider_url_ok=False,
9388d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      urls_start_arg=0,
9398d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      gs_api_support=[ApiSelector.XML, ApiSelector.JSON],
9408d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      gs_default_api=ApiSelector.JSON,
9418d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      argparse_arguments=[
9428d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          CommandArgument.MakeNCloudOrFileURLsArgument(2)
9438d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      ]
9448d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  )
9458d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  # Help specification. See help_provider.py for documentation.
9468d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  help_spec = Command.HelpSpec(
9478d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      help_name='rsync',
9488d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      help_name_aliases=['sync', 'synchronize'],
9498d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      help_type='command_help',
9508d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      help_one_line_summary='Synchronize content of two buckets/directories',
9518d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      help_text=_DETAILED_HELP_TEXT,
9528d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      subcommand_help_text={},
9538d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  )
9548d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  total_bytes_transferred = 0
9558d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9568d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  def _InsistContainer(self, url_str, treat_nonexistent_object_as_subdir):
9578d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    """Sanity checks that URL names an existing container.
9588d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9598d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    Args:
9608d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      url_str: URL string to check.
9618d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      treat_nonexistent_object_as_subdir: indicates if should treat a
9628d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi                                          non-existent object as a subdir.
9638d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9648d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    Returns:
9658d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      URL for checked string.
9668d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9678d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    Raises:
9688d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      CommandException if url_str doesn't name an existing container.
9698d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    """
9708d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    (url, have_existing_container) = (
9718d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        copy_helper.ExpandUrlToSingleBlr(url_str, self.gsutil_api, self.debug,
9728d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi                                         self.project_id,
9738d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi                                         treat_nonexistent_object_as_subdir))
9748d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    if not have_existing_container:
9758d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      raise CommandException(
9768d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          'arg (%s) does not name a directory, bucket, or bucket subdir.'
9778d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          % url_str)
9788d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    return url
9798d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9808d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  def RunCommand(self):
9818d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    """Command entry point for the rsync command."""
9828d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    self._ParseOpts()
9838d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    if self.compute_file_checksums and not UsingCrcmodExtension(crcmod):
9848d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      self.logger.warn(SLOW_CRCMOD_WARNING)
9858d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9868d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    src_url = self._InsistContainer(self.args[0], False)
9878d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    dst_url = self._InsistContainer(self.args[1], True)
9888d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9898d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # Tracks if any copy or rm operations failed.
9908d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    self.op_failure_count = 0
9918d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9928d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # List of attributes to share/manage across multiple processes in
9938d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # parallel (-m) mode.
9948d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    shared_attrs = ['op_failure_count']
9958d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9968d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    for signal_num in GetCaughtSignals():
9978d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      RegisterSignalHandler(signal_num, _HandleSignals)
9988d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
9998d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # Perform sync requests in parallel (-m) mode, if requested, using
10008d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # configured number of parallel processes and threads. Otherwise,
10018d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # perform requests with sequential function calls in current process.
10028d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    diff_iterator = _DiffIterator(self, src_url, dst_url)
10038d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    self.logger.info('Starting synchronization')
10048d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    try:
10058d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      self.Apply(_RsyncFunc, diff_iterator, _RsyncExceptionHandler,
10068d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi                 shared_attrs, arg_checker=_DiffToApplyArgChecker,
10078d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi                 fail_on_error=True)
10088d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    finally:
10098d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      CleanUpTempFiles()
10108d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
10118d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    if self.op_failure_count:
10128d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      plural_str = 's' if self.op_failure_count else ''
10138d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      raise CommandException(
10148d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          '%d file%s/object%s could not be copied/removed.' %
10158d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          (self.op_failure_count, plural_str, plural_str))
10168d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
10178d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi  def _ParseOpts(self):
10188d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # exclude_symlinks is handled by Command parent class, so save in Command
10198d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # state rather than CopyHelperOpts.
10208d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    self.exclude_symlinks = False
10218d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # continue_on_error is handled by Command parent class, so save in Command
10228d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # state rather than CopyHelperOpts.
10238d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    self.continue_on_error = False
10248d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    self.delete_extras = False
10258d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    preserve_acl = False
10268d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    self.compute_file_checksums = False
10278d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    self.dryrun = False
10288d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    self.exclude_pattern = None
10298d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    self.skip_unsupported_objects = False
10308d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # self.recursion_requested is initialized in command.py (so it can be
10318d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    # checked in parent class for all commands).
10328d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi
10338d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    if self.sub_opts:
10348d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi      for o, a in self.sub_opts:
10358d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        if o == '-c':
10368d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          self.compute_file_checksums = True
10378d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        # Note: In gsutil cp command this is specified using -c but here we use
10388d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        # -C so we can use -c for checksum arg (to be consistent with Unix rsync
10398d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        # command options).
10408d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        elif o == '-C':
10418d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          self.continue_on_error = True
10428d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        elif o == '-d':
10438d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          self.delete_extras = True
10448d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        elif o == '-e':
10458d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          self.exclude_symlinks = True
10468d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        elif o == '-n':
10478d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          self.dryrun = True
10488d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        elif o == '-p':
10498d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          preserve_acl = True
10508d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        elif o == '-r' or o == '-R':
10518d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          self.recursion_requested = True
10528d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        elif o == '-U':
10538d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          self.skip_unsupported_objects = True
10548d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        elif o == '-x':
10558d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          if not a:
10568d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi            raise CommandException('Invalid blank exclude filter')
10578d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          try:
10588d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi            self.exclude_pattern = re.compile(a)
10598d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi          except re.error:
10608d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi            raise CommandException('Invalid exclude filter (%s)' % a)
10618d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi    return CreateCopyHelperOpts(
10628d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        preserve_acl=preserve_acl,
10638d2b206a675ec20ea07100c35df34e65ee1e45e8Ruchi Kandoi        skip_unsupported_objects=self.skip_unsupported_objects)
1064