1#!/usr/bin/python
2
3"""
4Copyright 2013 Google Inc.
5
6Use of this source code is governed by a BSD-style license that can be
7found in the LICENSE file.
8
Calculate differences between image pairs, and store them in a database.
10"""
11
12import contextlib
13import csv
14import logging
15import os
16import re
17import shutil
18import sys
19import tempfile
20import urllib
21try:
22  from PIL import Image, ImageChops
23except ImportError:
24  raise ImportError('Requires PIL to be installed; see '
25                    + 'http://www.pythonware.com/products/pil/')
26
27# Set the PYTHONPATH to include the tools directory.
28sys.path.append(
29    os.path.join(
30        os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir,
31                        'tools'))
32import find_run_binary
33
# Result of looking up the 'skpdiff' program via find_run_binary; evaluated
# once, when this module is imported.  It is passed as the first element of the
# command line run for every image pair.
# NOTE(review): presumably a filesystem path to the executable; how a missing
# binary is reported depends on find_run_binary -- confirm there.
SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')
35
# Defaults for where images live under the storage root, and how their files
# are named.
DEFAULT_IMAGE_SUFFIX = '.png'
DEFAULT_IMAGES_SUBDIR = 'images'

# Matches every character that is NOT a word character or a hyphen; such
# characters are replaced when locators are turned into filenames.
# Raw string, so that the backslashes reach the regex engine untouched instead
# of relying on Python passing unknown string escapes through.
DISALLOWED_FILEPATH_CHAR_REGEX = re.compile(r'[^\w\-]')

# Subdirectories (under the storage root) that generated images are written to.
DIFFS_SUBDIR = 'diffs'
WHITEDIFFS_SUBDIR = 'whitediffs'

# Number of histogram bins per band (color channel).
VALUES_PER_BAND = 256

# Keys used within DiffRecord dictionary representations.
# NOTE: Keep these in sync with static/constants.js
KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
52
53
class DiffRecord(object):
  """Record of differences between two images.

  Constructing a DiffRecord fetches both images (unless they are already on
  local disk), writes a diff image and a whitediff image under storage_root,
  and computes the metrics exposed by the getters and by as_dict().
  """

  def __init__(self, storage_root,
               expected_image_url, expected_image_locator,
               actual_image_url, actual_image_locator,
               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
               image_suffix=DEFAULT_IMAGE_SUFFIX):
    """Download this pair of images (unless we already have them on local disk),
    and prepare a DiffRecord for them.

    TODO(epoger): Make this asynchronously download images, rather than blocking
    until the images have been downloaded and processed.

    Args:
      storage_root: root directory on local disk within which we store all
          images
      expected_image_url: file or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
      expected_images_subdir: the subdirectory expected images are stored in.
      actual_images_subdir: the subdirectory actual images are stored in.
      image_suffix: the suffix of images.

    Raises:
      RuntimeError: if skpdiff reported no perceptual result for this pair.
      Any exception raised while downloading or opening either image is logged
      and then re-raised; errors from diffing or saving propagate as well.
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)

    # Download the expected/actual images, if we don't have them already.
    # TODO(rmistry): Add a parameter that makes _download_and_open_image raise
    # an exception if images are not found locally (instead of trying to
    # download them).
    expected_image_file = os.path.join(
        storage_root, expected_images_subdir,
        str(expected_image_locator) + image_suffix)
    actual_image_file = os.path.join(
        storage_root, actual_images_subdir,
        str(actual_image_locator) + image_suffix)
    try:
      expected_image = _download_and_open_image(
          expected_image_file, expected_image_url)
    except Exception:
      logging.exception('unable to download expected_image_url %s to file %s' %
                        (expected_image_url, expected_image_file))
      raise
    try:
      actual_image = _download_and_open_image(
          actual_image_file, actual_image_url)
    except Exception:
      logging.exception('unable to download actual_image_url %s to file %s' %
                        (actual_image_url, actual_image_file))
      raise

    # Generate the diff image (absolute diff at each pixel) and
    # max_diff_per_channel.
    diff_image = _generate_image_diff(actual_image, expected_image)
    self._max_diff_per_channel = _max_per_band(diff_image.histogram())

    # Generate the whitediff image (any differing pixels show as white).
    # This is tricky, because when you convert color images to grayscale or
    # black & white in PIL, it has its own ideas about thresholds.
    # We have to force it: if a pixel has any color at all, it's a '1'.
    bands = diff_image.split()
    graydiff_image = ImageChops.lighter(ImageChops.lighter(
        bands[0], bands[1]), bands[2])
    whitediff_image = (graydiff_image.point(lambda p: p > 0 and VALUES_PER_BAND)
                                     .convert('1', dither=Image.NONE))

    # Calculate the perceptual difference percentage, from the very same files
    # that were opened above.
    self._perceptual_difference = self._calculate_perceptual_difference(
        expected_image_file, actual_image_file)

    # Final touches on diff_image: use whitediff_image as an alpha mask.
    # Unchanged pixels are transparent; differing pixels are opaque.
    diff_image.putalpha(whitediff_image)

    # Store the diff and whitediff images generated above.
    diff_image_locator = _get_difference_locator(
        expected_image_locator=expected_image_locator,
        actual_image_locator=actual_image_locator)
    basename = str(diff_image_locator) + image_suffix
    _save_image(diff_image, os.path.join(
        storage_root, DIFFS_SUBDIR, basename))
    _save_image(whitediff_image, os.path.join(
        storage_root, WHITEDIFFS_SUBDIR, basename))

    # Calculate difference metrics.
    (self._width, self._height) = diff_image.size
    # The differing (white) pixels are counted in the last histogram bin.
    self._num_pixels_differing = (
        whitediff_image.histogram()[VALUES_PER_BAND - 1])

  @staticmethod
  def _calculate_perceptual_difference(expected_image_file, actual_image_file):
    """Runs skpdiff on two image files and returns the perceptual difference.

    Args:
      expected_image_file: path on local disk of the expected image
      actual_image_file: path on local disk of the actual image

    Returns: the perceptual difference percentage, as a number between 0 and
        100 (inclusive).  If the CSV holds several rows, the last one wins.

    Raises:
      RuntimeError: if the CSV written by skpdiff contains no result rows.
    """
    perceptual_difference = None
    # skpdiff writes its results into a CSV file; give it a scratch directory
    # that is removed again no matter what happens.
    skpdiff_csv_dir = tempfile.mkdtemp()
    try:
      skpdiff_csv_output = os.path.join(skpdiff_csv_dir, 'skpdiff-output.csv')
      find_run_binary.run_command(
          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
           '--csv', skpdiff_csv_output, '-d', 'perceptual'])
      with open(skpdiff_csv_output) as csv_file:
        for row in csv.DictReader(csv_file):
          # The column header really does start with a space.
          perceptual_similarity = float(row[' perceptual'].strip())
          if not 0 <= perceptual_similarity <= 1:
            # skpdiff outputs -1 if the images are different sizes. Treat any
            # output that does not lie in [0, 1] as having 0% perceptual
            # similarity.
            perceptual_similarity = 0
          # skpdiff returns the perceptual similarity, convert it to get the
          # perceptual difference percentage.
          perceptual_difference = 100 - (perceptual_similarity * 100)
    finally:
      shutil.rmtree(skpdiff_csv_dir)

    if perceptual_difference is None:
      # Fail here rather than handing out a record that is missing one of its
      # metrics and blows up later, when the metric is first read.
      raise RuntimeError(
          'skpdiff reported no perceptual result for %s vs %s' % (
              expected_image_file, actual_image_file))
    return perceptual_difference

  def get_num_pixels_differing(self):
    """Returns the absolute number of pixels that differ."""
    return self._num_pixels_differing

  def get_percent_pixels_differing(self):
    """Returns the percentage of pixels that differ, as a float between
    0 and 100 (inclusive)."""
    return ((float(self._num_pixels_differing) * 100) /
            (self._width * self._height))

  def get_perceptual_difference(self):
    """Returns the perceptual difference percentage."""
    return self._perceptual_difference

  def get_max_diff_per_channel(self):
    """Returns the maximum difference between the expected and actual images
    for each R/G/B channel, as a list."""
    return self._max_diff_per_channel

  def as_dict(self):
    """Returns a dictionary representation of this DiffRecord, as needed when
    constructing the JSON representation."""
    return {
        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
            self.get_percent_pixels_differing(),
        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
    }
204
205
class ImageDiffDB(object):
  """Computes the differences between pairs of images and remembers the
  results, so that they can be looked up again by locator."""

  def __init__(self, storage_root):
    """
    Args:
      storage_root: string; root path under which the DB keeps all of its files
    """
    # Maps (expected_image_locator, actual_image_locator) tuples -- both
    # sanitized -- to the DiffRecord for that pair, or to None if no DiffRecord
    # could be created for it.
    self._diff_dict = {}

    self._storage_root = storage_root

  def add_image_pair(self,
                     expected_image_url, expected_image_locator,
                     actual_image_url, actual_image_locator):
    """Fetch this pair of images (unless they are already on local disk) and
    record a DiffRecord for them.  Pairs that were added before are left alone.

    TODO(epoger): Make this asynchronously download images, rather than blocking
    until the images have been downloaded and processed.
    When we do that, we should probably add a new method that will block
    until all of the images have been downloaded and processed.  Otherwise,
    we won't know when it's safe to start calling get_diff_record().
    jcgregorio notes: maybe just make ImageDiffDB thread-safe and create a
    thread-pool/worker queue at a higher level that just uses ImageDiffDB?

    Args:
      expected_image_url: file or HTTP url the expected image is downloaded
          from
      expected_image_locator: unique ID string the expected image is stored
          under within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file or HTTP url the actual image is downloaded from
      actual_image_locator: unique ID string the actual image is stored under
          within storage_root (probably including a checksum to guarantee
          uniqueness)
    """
    sanitized_expected = _sanitize_locator(expected_image_locator)
    sanitized_actual = _sanitize_locator(actual_image_locator)
    pair_key = (sanitized_expected, sanitized_actual)
    if pair_key in self._diff_dict:
      # Already processed (successfully or not); nothing more to do.
      return

    try:
      record = DiffRecord(
          self._storage_root,
          expected_image_url=expected_image_url,
          expected_image_locator=sanitized_expected,
          actual_image_url=actual_image_url,
          actual_image_locator=sanitized_actual)
    except Exception:
      # No real DiffRecord can be built for this (expected, actual) pair;
      # remember None instead, and the UI will show whatever information we
      # DO have.  Fixes http://skbug.com/2368 .
      logging.exception(
          'got exception while creating a DiffRecord for '
          'expected_image_url=%s , actual_image_url=%s; returning None' % (
              expected_image_url, actual_image_url))
      record = None
    self._diff_dict[pair_key] = record

  def get_diff_record(self, expected_image_locator, actual_image_locator):
    """Looks up the DiffRecord stored for this image pair.

    The result is None if the pair was added but no DiffRecord could be
    created for it.

    Raises a KeyError if the pair was never added.
    """
    sanitized_expected = _sanitize_locator(expected_image_locator)
    sanitized_actual = _sanitize_locator(actual_image_locator)
    return self._diff_dict[(sanitized_expected, sanitized_actual)]
277
278
279# Utility functions
280
def _max_per_band(histogram):
  """Given the histogram of an image, return the maximum value of each band
  (a.k.a. "color channel", such as R/G/B) across the entire image.

  Args:
    histogram: PIL histogram; a flat sequence holding VALUES_PER_BAND counts
        for each band, one band after the other

  Returns the maximum value of each band within the image histogram, as a list.
  A band whose counts are all zero contributes no entry to the list.
  """
  assert(len(histogram) % VALUES_PER_BAND == 0)
  # Explicit floor division and range(), so that this behaves the same under
  # both Python 2 and Python 3.
  num_bands = len(histogram) // VALUES_PER_BAND
  max_per_band = []
  for band in range(num_bands):
    # Assuming that VALUES_PER_BAND is 256...
    #  the 'R' band makes up indices 0-255 in the histogram,
    #  the 'G' band makes up indices 256-511 in the histogram,
    #  etc.
    min_index = band * VALUES_PER_BAND
    # Walk down from the highest possible value; the first value that occurs
    # in at least one pixel is the maximum for this band.
    for value in range(VALUES_PER_BAND - 1, -1, -1):
      if histogram[min_index + value] > 0:
        max_per_band.append(value)
        break
  return max_per_band
306
307
def _generate_image_diff(image1, image2):
  """Computes ImageChops.difference() of two images after converting both to
  RGB, logging the offending images if the diff fails with a ValueError.

  TODO(epoger): Currently, some of the images generated by the bots are RGBA
  and others are RGB.  I'm not sure why that is.  For now, to avoid confusion
  within the UI, convert all to RGB when diffing.

  Args:
    image1: a PIL image object
    image2: a PIL image object

  Returns: per-pixel diffs between image1 and image2, as a PIL image object
  """
  try:
    rgb_image1 = image1.convert('RGB')
    rgb_image2 = image2.convert('RGB')
    return ImageChops.difference(rgb_image1, rgb_image2)
  except ValueError:
    # Say which images were involved, then let the caller deal with it.
    message = 'Error diffing image1 [%s] and image2 [%s].' % (
        repr(image1), repr(image2))
    logging.error(message)
    raise
328
329
def _download_and_open_image(local_filepath, url):
  """Open the image at local_filepath; if there is no file at that path,
  download it from url to that path and then open it.

  If the download fails part-way, the incomplete file is removed again, so that
  a later call retries the download instead of mistaking the truncated file
  for a complete image.

  Args:
    local_filepath: path on local disk where the image should be stored
    url: URL from which we can download the image if we don't have it yet

  Returns: a PIL image object

  Raises: whatever the download or _open_image() raises.
  """
  if not os.path.exists(local_filepath):
    _mkdir_unless_exists(os.path.dirname(local_filepath))
    download_complete = False
    try:
      with contextlib.closing(urllib.urlopen(url)) as url_handle:
        with open(local_filepath, 'wb') as file_handle:
          shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)
      download_complete = True
    finally:
      # A flag plus "finally" (rather than "except Exception") also covers
      # interruptions such as KeyboardInterrupt.
      if not download_complete and os.path.isfile(local_filepath):
        os.remove(local_filepath)
  return _open_image(local_filepath)
346
347
def _open_image(filepath):
  """Wrapper for Image.open(filepath) that yields more useful error messages.

  If the file cannot be loaded as an image, it is deleted from disk before the
  error is re-raised, so that it can be fetched afresh next time.

  Args:
    filepath: path on local disk to load image from

  Returns: a PIL image object

  Raises:
    IOError: if Image.open() could not load an image from filepath.
  """
  try:
    return Image.open(filepath)
  except IOError:
    # If we are unable to load an image from the file, delete it from disk
    # and we will try to fetch it again next time.  Fixes http://skbug.com/2247
    logging.error('IOError loading image file %s ; deleting it.' % filepath)
    # Only remove what is actually a file: os.remove() on a missing path or on
    # a directory would raise an error of its own and hide the IOError that
    # explains what really went wrong.
    if os.path.isfile(filepath):
      os.remove(filepath)
    raise
364
365
def _save_image(image, filepath, format='PNG'):
  """Save an image to disk, first creating the directory it goes into (and any
  missing parent directories).

  Args:
    image: a PIL image object
    filepath: path on local disk to write image to
    format: one of the PIL image formats, listed at
            http://effbot.org/imagingbook/formats.htm
  """
  parent_dir = os.path.dirname(filepath)
  _mkdir_unless_exists(parent_dir)
  image.save(filepath, format)
377
378
def _mkdir_unless_exists(path):
  """Unless path refers to an already-existing directory, create it.

  Missing intermediate directories are created as well.

  Args:
    path: path on local disk; an empty string (such as the os.path.dirname()
        of a bare filename) stands for the current directory, so nothing is
        created for it

  Raises:
    OSError: if the directory does not exist and could not be created.
  """
  if not path or os.path.isdir(path):
    return
  try:
    os.makedirs(path)
  except OSError:
    # Somebody else (another thread or process) may have created the directory
    # between the isdir() check and makedirs().  That is fine; only complain
    # if the directory is still missing.
    if not os.path.isdir(path):
      raise
387
388
def _sanitize_locator(locator):
  """Converts a locator into a string that is safe to use within filenames:
  every character matched by DISALLOWED_FILEPATH_CHAR_REGEX is replaced by an
  underscore.

  Args:
    locator: string, or something that can be represented as a string
  """
  locator_string = str(locator)
  return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', locator_string)
397
398
def _get_difference_locator(expected_image_locator, actual_image_locator):
  """Builds the locator string under which the diffs between expected_image
  and actual_image are stored.

  We must keep this function in sync with getImageDiffRelativeUrl() in
  static/loader.js

  Args:
    expected_image_locator: locator string pointing at expected image
    actual_image_locator: locator string pointing at actual image

  Returns: already-sanitized locator where the diffs between expected and
      actual images can be found
  """
  sanitized_expected = _sanitize_locator(expected_image_locator)
  sanitized_actual = _sanitize_locator(actual_image_locator)
  return "%s-vs-%s" % (sanitized_expected, sanitized_actual)
415