1#!/bin/bash -p
2
3# Copyright (c) 2011 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7# usage: dirdiffer.sh old_dir new_dir patch_dir
8#
9# dirdiffer creates a patch directory patch_dir that represents the difference
10# between old_dir and new_dir. patch_dir can be used with dirpatcher to
11# recreate new_dir given old_dir.
12#
13# dirdiffer operates recursively, properly handling ordinary files, symbolic
14# links, and directories, as they are found in new_dir. Symbolic links and
15# directories are always replicated as-is in patch_dir. Ordinary files will
16# be represented at the appropriate location in patch_dir by one of the
17# following:
18#
19#  - a binary diff prepared by goobsdiff that can transform the file at the
20#    same position in old_dir to the version in new_dir, but only when such a
21#    file already exists in old_dir and is an ordinary file. These files are
22#    given a "$gbs" suffix.
23#  - a bzip2-compressed copy of the new file from new_dir; in patch_dir, the
24#    new file will have a "$bz2" suffix.
25#  - a gzip-compressed copy of the new file from new_dir; in patch_dir, the
26#    new file will have a "$gz" suffix.
27#  - an xz/lzma2-compressed copy of the new file from new_dir; in patch_dir,
28#    the new file will have an "$xz" suffix.
29#  - an uncompressed copy of the new file from new_dir; in patch_dir, the
30#    new file will have a "$raw" suffix.
31#
32# The unconventional suffixes are used because they aren't likely to occur in
33# filenames.
34#
35# Of these options, the smallest possible representation is chosen. Note that
36# goobsdiff itself will also compress various sections of a binary diff with
37# bzip2, gzip, or xz/lzma2, or leave them uncompressed, according to which is
38# smallest. The approach of choosing the smallest possible representation is
39# time-consuming but given the choices of compressors results in an overall
40# size reduction of about 3%-5% relative to using bzip2 as the only
41# compressor; bzip2 is generally more effective for these data sets than gzip,
42# and xz/lzma2 more effective than bzip2.
43#
44# For large input files, goobsdiff is also very time-consuming and
45# memory-intensive. The overall "wall clock time" spent preparing a patch_dir
46# representing the differences between Google Chrome's 6.0.422.0 and 6.0.427.0
47# versioned directories from successive weekly dev channel releases on a
48# 2.53GHz dual-core 4GB MacBook Pro is 3 minutes. Reconstructing new_dir with
49# dirpatcher is much quicker; in the above configuration, only 10 seconds are
50# needed for reconstruction.
51#
52# After creating a full patch_dir structure, but before returning, dirpatcher
53# is invoked to attempt to recreate new_dir in a temporary location given
54# old_dir and patch_dir. The recreated new_dir is then compared against the
55# original new_dir as a verification step. Should verification fail, dirdiffer
56# exits with a nonzero status, and patch_dir should not be used.
57#
58# Environment variables:
59# DIRDIFFER_EXCLUDE
60#   When an entry in new_dir matches this regular expression, it will not be
61#   included in patch_dir. All prospective paths in new_dir will be matched
62#   against this regular expression, including directories. If a directory
63#   matches this pattern, dirdiffer will also ignore the directory's contents.
64# DIRDIFFER_NO_DIFF
65#   When an entry in new_dir matches this regular expression, it will not be
#   represented in patch_dir by a $gbs file prepared by goobsdiff. It will only
#   appear as a $bz2, $gz, $xz, or $raw file. Only files in new_dir, not
#   directories, will be matched against this regular expression.
69#
70# Exit codes:
71#  0  OK
72#  1  Unknown failure
73#  2  Incorrect number of parameters
74#  3  Input directories do not exist or are not directories
75#  4  Output directory already exists
76#  5  Parent of output directory does not exist or is not a directory
#  6  An input or output directory contains another
78#  7  Could not create output directory
79#  8  File already exists in output directory
#  9  Found an irregular file (not a directory, file, or symbolic link) in input
81# 10  Could not create symbolic link
82# 11  File copy failed
83# 12  bzip2 compression failed
84# 13  gzip compression failed
85# 14  xz/lzma2 compression failed
86# 15  Patch creation failed
87# 16  Verification failed
88# 17  Could not set mode (permissions)
89# 18  Could not set modification time
90# 19  Invalid regular expression (irregular expression?)
91
# Stop on any unhandled command failure and on any use of an unset variable.
set -o errexit -o nounset

# Environment sanitization. Use a fixed, known-safe PATH and drop variables
# that could alter how the interpreter behaves. The |bash -p| on the #! line
# already keeps BASH_ENV, ENV, and SHELLOPTS (among other features) from
# affecting this script; clearing them here additionally keeps them away from
# any shell scripts run as utility programs. SHELLOPTS is read-only, so it
# can only be unexported, not unset.
PATH="/usr/bin:/bin:/usr/sbin:/sbin"
export PATH
unset BASH_ENV CDPATH ENV GLOBIGNORE IFS POSIXLY_CORRECT
export -n SHELLOPTS

# Globs should include dot-files, and a glob with no matches should expand to
# nothing rather than to itself.
shopt -s dotglob
shopt -s nullglob
105
# find_tool prints the path to use for the executable named |tool_name|.
# Candidate locations are tried in this order:
#  1. the directory containing this script, as derived from ${0};
#  2. if that directory's physical path ends in /src/chrome/installer/mac,
#     the src/out/Release directory of the same tree;
#  3. as above, but src/out/Debug.
# The first candidate that is both a regular file and executable is printed.
# If no candidate qualifies, |tool_name| itself is printed, leaving it to be
# found (or not) by an ordinary ${PATH} search when it's invoked.
find_tool() {
  local tool_name="${1}"

  local here
  here="$(dirname "${0}")"

  local candidate="${here}/${tool_name}"
  if [[ -f "${candidate}" && -x "${candidate}" ]]; then
    echo "${candidate}"
    return
  fi

  # The physical (symlink-free) path decides whether this script lives in a
  # source tree; it's only computed once the sibling lookup has failed.
  local here_phys
  here_phys="$(cd "${here}" && pwd -P)"
  if [[ "${here_phys}" =~ ^(.*)/src/chrome/installer/mac$ ]]; then
    local tree_root="${BASH_REMATCH[1]}"
    local config
    for config in Release Debug; do
      candidate="${tree_root}/src/out/${config}/${tool_name}"
      if [[ -f "${candidate}" && -x "${candidate}" ]]; then
        echo "${candidate}"
        return
      fi
    done
  fi

  echo "${tool_name}"
}
144
# Name of this script, used as a prefix for diagnostics and as the mktemp
# template when verifying.
ME="$(basename "${0}")"
readonly ME

# dirpatcher.sh is taken from the same directory as this script.
DIRPATCHER="$(dirname "${0}")/dirpatcher.sh"
readonly DIRPATCHER

# Tools. goobsdiff and xz are located with find_tool; bzip2 and gzip are
# found by a ${PATH} search.
GOOBSDIFF="$(find_tool goobsdiff)"
readonly GOOBSDIFF
readonly BZIP2="bzip2"
readonly GZIP="gzip"
XZ="$(find_tool xz)"
readonly XZ

# Suffixes appended to file names in patch_dir to identify how each file is
# represented. Single quotes keep the "$" literal.
readonly GBS_SUFFIX='$gbs'
readonly BZ2_SUFFIX='$bz2'
readonly GZ_SUFFIX='$gz'
readonly XZ_SUFFIX='$xz'
readonly PLAIN_SUFFIX='$raw'

# Workaround for http://code.google.com/p/chromium/issues/detail?id=83180#c3
# In bash 4.0, "declare VAR" no longer initializes VAR if not already set.
# The expansions are quoted so that a regular expression supplied by the
# caller is not subjected to word splitting and pathname expansion merely by
# being defaulted here.
: "${DIRDIFFER_EXCLUDE:=}"
: "${DIRDIFFER_NO_DIFF:=}"
165
# err writes the message given as the first argument to standard error,
# prefixed with this script's name and a colon.
err() {
  local message="${1}"

  printf '%s\n' "${ME}: ${message}" 1>&2
}
171
# g_cleanup lists paths that cleanup removes. g_verify_exclude collects the
# paths in new_dir that were skipped because of DIRDIFFER_EXCLUDE, so that
# verification can skip them as well.
declare -a g_cleanup g_verify_exclude

# cleanup is installed as the handler for EXIT and for HUP, INT, QUIT, and
# TERM. It removes everything listed in g_cleanup and then exits with the
# status that was current when it was entered.
cleanup() {
  # This must come first, before any other command replaces ${?}.
  local status=${?}

  # Don't re-enter on the way out, and ignore further signals while removing
  # files.
  trap - EXIT
  trap '' HUP INT QUIT TERM

  # A status of 128 or more is reported as termination by signal.
  if (( status >= 128 )); then
    err "Caught signal $(( status - 128 ))"
  fi

  if (( ${#g_cleanup[@]} > 0 )); then
    rm -rf "${g_cleanup[@]}"
  fi

  exit "${status}"
}
189
# copy_mode_and_time gives |patch_file| (second argument) the same mode as
# |new_file| (first argument) and, unless |patch_file| is a symbolic link,
# the same modification time. Exits with status 17 if the mode can't be set
# and with status 18 if the modification time can't be set.
copy_mode_and_time() {
  local reference="${1}"
  local target="${2}"

  # BSD stat format for the octal mode bits of |reference|. chmod -h applies
  # the mode to |target| itself when it is a symbolic link, not to whatever
  # the link points at.
  local perms
  perms="$(stat "-f%OMp%OLp" "${reference}")"
  chmod -h "${perms}" "${target}" || exit 17

  if [[ -L "${target}" ]]; then
    # No shell tool offers direct access to lutimes, so a symbolic link's
    # modification time can't be copied here. It doesn't need to be: the
    # link was created with rsync, which already copied the timestamp using
    # lutimes.
    return 0
  fi

  touch -r "${reference}" "${target}" || exit 18
}
210
# file_size prints the result of stat's "%z" format (BSD stat syntax for the
# size in bytes) for the file named by the first argument.
file_size() {
  local path="${1}"

  stat -f '%z' "${path}"
}
216
# make_patch_file represents the ordinary file |new_file| in the patch
# directory using the smallest of several candidate forms, each written to
# |patch_file| plus a distinguishing suffix:
#  - an uncompressed copy (PLAIN_SUFFIX),
#  - a bzip2-compressed copy (BZ2_SUFFIX),
#  - a gzip-compressed copy (GZ_SUFFIX),
#  - an xz/lzma2-compressed copy (XZ_SUFFIX),
#  - a goobsdiff patch against |old_file| (GBS_SUFFIX), attempted only when
#    |old_file| is an ordinary file that is not a symbolic link and
#    |new_file| is not matched by a non-empty DIRDIFFER_NO_DIFF.
# Candidates are produced in that order. A candidate replaces the current
# best only when it is strictly smaller, so ties go to the earlier form.
# Exactly one file remains when this function returns, and it receives
# |new_file|'s mode and modification time.
#
# Arguments: old_file new_file patch_file
# Exits: 8 if a candidate's output file already exists, 11 if the copy fails,
# 12/13/14 if bzip2/gzip/xz fails, 15 if goobsdiff fails.
make_patch_file() {
  local old_file="${1}"
  local new_file="${2}"
  local patch_file="${3}"

  # Uncompressed copy: the baseline against which the others are measured.
  # Like every other candidate, it must not overwrite something that's
  # already there.
  local uncompressed_file="${patch_file}${PLAIN_SUFFIX}"
  if [[ -e "${uncompressed_file}" ]]; then
    err "${uncompressed_file} already exists"
    exit 8
  fi
  if ! cp "${new_file}" "${uncompressed_file}"; then
    err "couldn't copy ${new_file} to ${uncompressed_file}"
    exit 11
  fi
  local uncompressed_size
  uncompressed_size="$(file_size "${new_file}")"

  # keep_file and keep_size track the best candidate found so far.
  local keep_file="${uncompressed_file}"
  local keep_size="${uncompressed_size}"

  # bzip2.
  local bz2_file="${patch_file}${BZ2_SUFFIX}"
  if [[ -e "${bz2_file}" ]]; then
    err "${bz2_file} already exists"
    exit 8
  fi
  if ! "${BZIP2}" -9c < "${new_file}" > "${bz2_file}"; then
    err "couldn't compress ${new_file} to ${bz2_file} with ${BZIP2}"
    exit 12
  fi
  local bz2_size
  bz2_size="$(file_size "${bz2_file}")"

  if [[ "${bz2_size}" -ge "${keep_size}" ]]; then
    rm -f "${bz2_file}"
  else
    rm -f "${keep_file}"
    keep_file="${bz2_file}"
    keep_size="${bz2_size}"
  fi

  # gzip.
  local gz_file="${patch_file}${GZ_SUFFIX}"
  if [[ -e "${gz_file}" ]]; then
    err "${gz_file} already exists"
    exit 8
  fi
  if ! "${GZIP}" -9cn < "${new_file}" > "${gz_file}"; then
    err "couldn't compress ${new_file} to ${gz_file} with ${GZIP}"
    exit 13
  fi
  local gz_size
  gz_size="$(file_size "${gz_file}")"

  if [[ "${gz_size}" -ge "${keep_size}" ]]; then
    rm -f "${gz_file}"
  else
    rm -f "${keep_file}"
    keep_file="${gz_file}"
    keep_size="${gz_size}"
  fi

  # xz/lzma2.
  local xz_flags=("-c")

  # If the file looks like a Mach-O file, including a universal/fat file, add
  # the x86 BCJ filter, which results in slightly better compression of x86
  # and x86_64 executables. Mach-O files might contain other architectures,
  # but they aren't currently expected in Chrome. A failure of |file| itself
  # is deliberately ignored: the filter is simply not added.
  local file_output
  file_output="$(file "${new_file}" 2> /dev/null || true)"
  if [[ "${file_output}" =~ Mach-O ]]; then
    xz_flags+=("--x86")
  fi

  # Use an lzma2 encoder. This is equivalent to xz -9 -e, but allows filters
  # to precede the compressor.
  xz_flags+=("--lzma2=preset=9e")

  local xz_file="${patch_file}${XZ_SUFFIX}"
  if [[ -e "${xz_file}" ]]; then
    err "${xz_file} already exists"
    exit 8
  fi
  if ! "${XZ}" "${xz_flags[@]}" < "${new_file}" > "${xz_file}"; then
    err "couldn't compress ${new_file} to ${xz_file} with ${XZ}"
    exit 14
  fi
  local xz_size
  xz_size="$(file_size "${xz_file}")"

  if [[ "${xz_size}" -ge "${keep_size}" ]]; then
    rm -f "${xz_file}"
  else
    rm -f "${keep_file}"
    keep_file="${xz_file}"
    keep_size="${xz_size}"
  fi

  # Binary diff. An empty DIRDIFFER_NO_DIFF means "no restriction" and is
  # never handed to =~: depending on the platform's regular expression
  # library, an empty pattern may be rejected as invalid or may match every
  # path, and the latter would silently disable diffing altogether.
  local diff_allowed="y"
  if [[ -n "${DIRDIFFER_NO_DIFF}" ]] &&
     [[ "${new_file}" =~ ${DIRDIFFER_NO_DIFF} ]]; then
    diff_allowed=""
  fi

  if [[ -n "${diff_allowed}" ]] &&
     [[ -f "${old_file}" ]] && ! [[ -L "${old_file}" ]]; then
    local gbs_file="${patch_file}${GBS_SUFFIX}"
    if [[ -e "${gbs_file}" ]]; then
      err "${gbs_file} already exists"
      exit 8
    fi
    if ! "${GOOBSDIFF}" "${old_file}" "${new_file}" "${gbs_file}"; then
      err "couldn't create ${gbs_file} by comparing ${old_file} to ${new_file}"
      exit 15
    fi
    local gbs_size
    gbs_size="$(file_size "${gbs_file}")"

    if [[ "${gbs_size}" -ge "${keep_size}" ]]; then
      rm -f "${gbs_file}"
    else
      rm -f "${keep_file}"
      keep_file="${gbs_file}"
      keep_size="${gbs_size}"
    fi
  fi

  copy_mode_and_time "${new_file}" "${keep_file}"
}
333
# make_patch_symlink reproduces the symbolic link |new_file| (first argument)
# at |patch_file| (second argument) and then copies its mode. Exits with
# status 10 if the link can't be created.
make_patch_symlink() {
  local link_source="${1}"
  local link_dest="${2}"

  # The obvious approach, reading the target with readlink and recreating
  # the link with ln -s, would lose the link's own timestamp. rsync is the
  # only way to preserve the timestamp of a symbolic link using shell tools.
  rsync -lt "${link_source}" "${link_dest}" || exit 10

  copy_mode_and_time "${link_source}" "${link_dest}"
}
350
# make_patch_dir creates |patch_dir| and fills it with a representation of
# every entry in |new_dir|, recursing into subdirectories. Symbolic links are
# handled by make_patch_symlink, directories by a recursive call, and
# ordinary files by make_patch_file; |old_dir| supplies the corresponding
# old paths. Entries whose path matches a non-empty DIRDIFFER_EXCLUDE are
# skipped and recorded in g_verify_exclude. Finally, |patch_dir| receives
# |new_dir|'s mode and modification time.
#
# Arguments: old_dir new_dir patch_dir
# Exits: 7 if |patch_dir| can't be created, 8 if an output path already
# exists, 9 if an entry is not a symbolic link, directory, or ordinary file.
make_patch_dir() {
  local old_dir="${1}"
  local new_dir="${2}"
  local patch_dir="${3}"

  if ! mkdir "${patch_dir}"; then
    exit 7
  fi

  # This glob relies on the dotglob and nullglob shell options being set so
  # that dot-files are included and an empty directory yields no iterations.
  local new_file
  for new_file in "${new_dir}/"*; do
    # The entry's name relative to new_dir: drop "${new_dir}/".
    local file="${new_file:${#new_dir} + 1}"
    local old_file="${old_dir}/${file}"
    local patch_file="${patch_dir}/${file}"

    # An empty DIRDIFFER_EXCLUDE means "exclude nothing" and is never handed
    # to =~: depending on the platform's regular expression library, an empty
    # pattern may be rejected as invalid or may match every path, and the
    # latter would silently exclude all of new_dir.
    if [[ -n "${DIRDIFFER_EXCLUDE}" ]] &&
       [[ "${new_file}" =~ ${DIRDIFFER_EXCLUDE} ]]; then
      g_verify_exclude+=("${new_file}")
      continue
    fi

    if [[ -e "${patch_file}" ]]; then
      err "${patch_file} already exists"
      exit 8
    fi

    # The symbolic link test comes first so that a link to a directory or
    # file is replicated as a link rather than followed.
    if [[ -L "${new_file}" ]]; then
      make_patch_symlink "${new_file}" "${patch_file}"
    elif [[ -d "${new_file}" ]]; then
      make_patch_dir "${old_file}" "${new_file}" "${patch_file}"
    elif [[ ! -f "${new_file}" ]]; then
      err "can't handle irregular file ${new_file}"
      exit 9
    else
      make_patch_file "${old_file}" "${new_file}" "${patch_file}"
    fi
  done

  # Done last, after the directory has been populated.
  copy_mode_and_time "${new_dir}" "${patch_dir}"
}
390
# verify_patch_dir checks that |patch_dir| really transforms |old_dir| into
# |new_dir|. It runs DIRPATCHER on |old_dir| and |patch_dir| to build a
# directory inside a fresh temporary directory, then uses rsync to compare
# that result against |new_dir|, skipping the paths recorded in
# g_verify_exclude. The temporary directory is registered in g_cleanup while
# it exists, and is removed and unregistered once the comparison has run.
#
# Arguments: old_dir new_dir patch_dir
# Exits: 16 if patch application fails, if rsync fails, or if rsync reports
# any difference.
verify_patch_dir() {
  local old_dir="${1}"
  local new_dir="${2}"
  local patch_dir="${3}"

  local verify_temp_dir verify_dir
  verify_temp_dir="$(mktemp -d -t "${ME}")"
  g_cleanup+=("${verify_temp_dir}")
  verify_dir="${verify_temp_dir}/patched"

  if ! "${DIRPATCHER}" "${old_dir}" "${patch_dir}" "${verify_dir}"; then
    err "patch application for verification failed"
    exit 16
  fi

  # rsync will print a line for any file, directory, or symbolic link that
  # differs or exists only in one directory. As used here, it correctly
  # considers link targets, file contents, permissions, and timestamps.
  local rsync_command=(rsync -clprt --delete --out-format=%n \
                       "${new_dir}/" "${verify_dir}")
  if [[ ${#g_verify_exclude[@]} -gt 0 ]]; then
    local exclude
    for exclude in "${g_verify_exclude[@]}"; do
      # ${g_verify_exclude[@]} contains paths in ${new_dir}. Strip off
      # ${new_dir} from the beginning of each, but leave a leading "/" so that
      # rsync treats them as being at the root of the "transfer."
      rsync_command+=("--exclude" "${exclude:${#new_dir}}")
    done
  fi

  local rsync_output
  if ! rsync_output="$("${rsync_command[@]}")"; then
    err "rsync for verification failed"
    exit 16
  fi

  # The temporary directory was the last thing appended to g_cleanup, so
  # remove it and pop the last element. The index of the last element is one
  # less than the element count, and the argument to unset is quoted so that
  # it can't be treated as a glob pattern.
  rm -rf "${verify_temp_dir}"
  local last_index=$(( ${#g_cleanup[@]} - 1 ))
  unset "g_cleanup[${last_index}]"

  if [[ -n "${rsync_output}" ]]; then
    err "verification failed"
    exit 16
  fi
}
435
# shell_safe_path prints |path| in a form that tools won't mistake for an
# option when it's passed as a command-line argument: a |path| beginning with
# "-" is printed with "./" in front of it, and any other |path| is printed
# unchanged.
shell_safe_path() {
  local path="${1}"

  case "${path}" in
    -*)
      echo "./${path}"
      ;;
    *)
      echo "${path}"
      ;;
  esac
}
447
# dirs_contained returns 0 if either of the two directory paths given as
# arguments is the same as, or lies within, the other, and 1 otherwise. The
# comparison is purely textual, so callers should pass physical paths.
#
# Each path is compared with exactly one trailing "/": this keeps "/a/b" from
# being treated as containing "/a/bc", and stripping an existing trailing "/"
# first means that the root directory "/" is compared as "/" rather than
# "//" and is correctly seen as containing every other absolute path.
dirs_contained() {
  local dir1="${1%/}/"
  local dir2="${2%/}/"

  if [[ "${dir1:0:${#dir2}}" = "${dir2}" ]] ||
     [[ "${dir2:0:${#dir1}}" = "${dir1}" ]]; then
    return 0
  fi

  return 1
}
459
# usage writes a one-line summary of the expected command line to standard
# error.
usage() {
  printf '%s\n' "usage: ${ME} old_dir new_dir patch_dir" 1>&2
}
463
# main validates its arguments and environment, builds patch_dir from old_dir
# and new_dir, and verifies the result.
#
# Arguments: old_dir new_dir patch_dir
# Exits: 3 if old_dir or new_dir is not a directory, 4 if patch_dir exists,
# 5 if patch_dir's parent is not a directory, 6 if any of the three
# directories contains another, 19 if DIRDIFFER_EXCLUDE or DIRDIFFER_NO_DIFF
# is an invalid regular expression; other statuses come from make_patch_dir
# and verify_patch_dir.
main() {
  local old_dir new_dir patch_dir
  old_dir="$(shell_safe_path "${1}")"
  new_dir="$(shell_safe_path "${2}")"
  patch_dir="$(shell_safe_path "${3}")"

  # From here on, any exit or listed signal runs cleanup, which removes
  # whatever has been registered in g_cleanup.
  trap cleanup EXIT HUP INT QUIT TERM

  if ! [[ -d "${old_dir}" ]] || ! [[ -d "${new_dir}" ]]; then
    err "old_dir and new_dir must exist and be directories"
    usage
    exit 3
  fi

  if [[ -e "${patch_dir}" ]]; then
    err "patch_dir must not exist"
    usage
    exit 4
  fi

  local patch_dir_parent
  patch_dir_parent="$(dirname "${patch_dir}")"
  if ! [[ -d "${patch_dir_parent}" ]]; then
    err "patch_dir parent directory must exist and be a directory"
    usage
    exit 5
  fi

  # The weird conditional structure is because the status of the RE comparison
  # needs to be available in ${?} without conflating it with other conditions
  # or negating it. Only a status of 2 from the =~ operator indicates an
  # invalid regular expression. Empty patterns are not checked.

  if [[ -n "${DIRDIFFER_EXCLUDE}" ]]; then
    if [[ "" =~ ${DIRDIFFER_EXCLUDE} ]]; then
      true
    elif [[ ${?} -eq 2 ]]; then
      err "DIRDIFFER_EXCLUDE contains an invalid regular expression"
      exit 19
    fi
  fi

  if [[ -n "${DIRDIFFER_NO_DIFF}" ]]; then
    if [[ "" =~ ${DIRDIFFER_NO_DIFF} ]]; then
      true
    elif [[ ${?} -eq 2 ]]; then
      err "DIRDIFFER_NO_DIFF contains an invalid regular expression"
      exit 19
    fi
  fi

  # Compare physical paths so that symbolic links can't hide containment.
  # patch_dir doesn't exist yet, so its physical path is built from its
  # parent's.
  local old_dir_phys new_dir_phys patch_dir_parent_phys patch_dir_phys
  old_dir_phys="$(cd "${old_dir}" && pwd -P)"
  new_dir_phys="$(cd "${new_dir}" && pwd -P)"
  patch_dir_parent_phys="$(cd "${patch_dir_parent}" && pwd -P)"
  patch_dir_phys="${patch_dir_parent_phys}/$(basename "${patch_dir}")"

  if dirs_contained "${old_dir_phys}" "${new_dir_phys}" ||
     dirs_contained "${old_dir_phys}" "${patch_dir_phys}" ||
     dirs_contained "${new_dir_phys}" "${patch_dir_phys}"; then
    err "directories must not contain one another"
    usage
    exit 6
  fi

  # Register patch_dir so that a failure from here on doesn't leave a
  # partial or unverified patch_dir behind.
  g_cleanup[${#g_cleanup[@]}]="${patch_dir}"

  make_patch_dir "${old_dir}" "${new_dir}" "${patch_dir}"

  verify_patch_dir "${old_dir}" "${new_dir}" "${patch_dir}"

  # Success: patch_dir is to be kept, so nothing may remain registered for
  # removal. The list is emptied outright rather than unsetting one element
  # by computed index, so that no stale entry can cause patch_dir to be
  # deleted by a signal arriving after this point.
  g_cleanup=()
  trap - EXIT
}
538
# Exactly three arguments are required: old_dir, new_dir, and patch_dir.
if (( ${#} != 3 )); then
  usage
  exit 2
fi

# Run, and exit with main's status.
main "${@}"
exit "${?}"
546