1#!/usr/bin/python
2
3"""
4Copyright 2013 Google Inc.
5
6Use of this source code is governed by a BSD-style license that can be
7found in the LICENSE file.
8
Calculate differences between image pairs, and store them in a database.
10"""
11
12# System-level imports
13import contextlib
14import errno
15import json
16import logging
17import os
18import Queue
19import re
20import shutil
21import tempfile
22import threading
23import time
24import urllib
25
26# Must fix up PYTHONPATH before importing from within Skia
27import rs_fixpypath  # pylint: disable=W0611
28
29# Imports from within Skia
30import find_run_binary
31from py.utils import gs_utils
32
33
# Path to the skpdiff executable, resolved once at import time.
SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')

DEFAULT_IMAGE_SUFFIX = '.png'
DEFAULT_IMAGES_SUBDIR = 'images'
# TODO(epoger): Figure out a better default number of threads; for now,
# using a conservative default value.
DEFAULT_NUM_WORKER_THREADS = 1

# Matches every character that is not a word character or a hyphen.
# Written as a raw string so that the backslashes reach the regex engine
# untouched, instead of relying on Python leaving unknown string escapes alone.
DISALLOWED_FILEPATH_CHAR_REGEX = re.compile(r'[^\w\-]')

# Subdirectories (within the storage root) that hold the generated diff images.
RGBDIFFS_SUBDIR = 'diffs'
WHITEDIFFS_SUBDIR = 'whitediffs'

# Keys used within DiffRecord dictionary representations.
# NOTE: Keep these in sync with static/constants.js
KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
KEY__DIFFERENCES__DIFF_URL = 'diffUrl'
KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl'

# Special values within ImageDiffDB._diff_dict
_DIFFRECORD_FAILED = 'failed'
_DIFFRECORD_PENDING = 'pending'

# How often to report tasks_queue size
QUEUE_LOGGING_GRANULARITY = 1000

# Temporary variable to keep track of how many times we download
# the same file in multiple threads.
# TODO(epoger): Delete this, once we see that the number stays close to 0.
global_file_collisions = 0
67
68
class DiffRecord(object):
  """Record of differences between two images.

  Constructing a DiffRecord downloads the image pair (unless the files are
  already on local disk) and, when both images are present and come from
  different URLs, runs the skpdiff binary to compute the difference metrics
  exposed by the getters below.
  """

  def __init__(self, gs, storage_root,
               expected_image_url, expected_image_locator,
               actual_image_url, actual_image_locator,
               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
               image_suffix=DEFAULT_IMAGE_SUFFIX):
    """Download this pair of images (unless we already have them on local disk),
    and prepare a DiffRecord for them.

    If either URL is empty, or both URLs are identical, the images are still
    downloaded but no diff is computed; in that case none of the difference
    attributes are set on this object.

    Args:
      gs: instance of GSUtils object we can use to download images
      storage_root: root directory on local disk within which we store all
          images
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
      expected_images_subdir: the subdirectory expected images are stored in.
      actual_images_subdir: the subdirectory actual images are stored in.
      image_suffix: the suffix of images.

    Raises:
      Exception: whatever the download, the skpdiff invocation or the parsing
          of its JSON summary raises; download failures are logged first.
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)

    # Download the expected/actual images, if we don't have them already.
    expected_image_file = os.path.join(
        storage_root, expected_images_subdir,
        str(expected_image_locator) + image_suffix)
    actual_image_file = os.path.join(
        storage_root, actual_images_subdir,
        str(actual_image_locator) + image_suffix)
    for image_file, image_url in [
        (expected_image_file, expected_image_url),
        (actual_image_file, actual_image_url)]:
      if image_file and image_url:
        try:
          _download_file(gs, image_file, image_url)
        except Exception:
          logging.exception('unable to download image_url %s to file %s',
                            image_url, image_file)
          raise

    # Return early if we do not need to generate diffs.
    if (expected_image_url == actual_image_url or
        not expected_image_url or not actual_image_url):
      return

    # Get all diff images and values using the skpdiff binary.
    # The JSON summary goes into a throwaway directory; the diff images
    # themselves are written under storage_root so they outlive this call.
    skpdiff_output_dir = tempfile.mkdtemp()
    try:
      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
                                          'skpdiff-output.json')
      skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
      skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
      # Both output directories are handed to skpdiff below, so make sure
      # each of them exists (not just the RGB one).
      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
      _mkdir_unless_exists(skpdiff_whitediff_dir)

      # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
      # instead of calling it separately for each image pair.
      # Pro: we'll incur less overhead from making repeated system calls,
      # spinning up the skpdiff binary, etc.
      # Con: we would have to wait until all image pairs were loaded before
      # generating any of the diffs?
      # Note(stephana): '--longnames' was added to allow for this
      # case (multiple files at once) versus specifying output diffs
      # directly.
      find_run_binary.run_command(
          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
           '--jsonp', 'false',
           '--longnames', 'true',
           '--output', skpdiff_summary_file,
           '--differs', 'perceptual', 'different_pixels',
           '--rgbDiffDir', skpdiff_rgbdiff_dir,
           '--whiteDiffDir', skpdiff_whitediff_dir,
           ])

      # Get information out of the skpdiff_summary_file.
      with open(skpdiff_summary_file) as fp:
        data = json.load(fp)

      # For now, we can assume there is only one record in the output summary,
      # since we passed skpdiff only one pair of images.
      record = data['records'][0]
      self._width = record['width']
      self._height = record['height']
      # Only the file names are kept; the directory part is dropped.
      self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
      self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]

      # TODO: make max_diff_per_channel a tuple instead of a list, because the
      # structure is meaningful (first element is red, second is green, etc.)
      # See http://stackoverflow.com/a/626871
      self._max_diff_per_channel = [
          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
      per_differ_stats = record['diffs']
      for stats in per_differ_stats:
        differ_name = stats['differName']
        if differ_name == 'different_pixels':
          self._num_pixels_differing = stats['pointsOfInterest']
        elif differ_name == 'perceptual':
          perceptual_similarity = stats['result']

      # skpdiff returns the perceptual similarity; convert it to get the
      # perceptual difference percentage.
      # skpdiff outputs -1 if the images are different sizes. Treat any
      # output that does not lie in [0, 1] as having 0% perceptual
      # similarity.
      if not 0 <= perceptual_similarity <= 1:
        perceptual_similarity = 0
      self._perceptual_difference = 100 - (perceptual_similarity * 100)
    finally:
      shutil.rmtree(skpdiff_output_dir)

  # TODO(epoger): Use properties instead of getters throughout.
  # See http://stackoverflow.com/a/6618176
  def get_num_pixels_differing(self):
    """Returns the absolute number of pixels that differ."""
    return self._num_pixels_differing

  def get_percent_pixels_differing(self):
    """Returns the percentage of pixels that differ, as a float between
    0 and 100 (inclusive)."""
    return ((float(self._num_pixels_differing) * 100) /
            (self._width * self._height))

  def get_perceptual_difference(self):
    """Returns the perceptual difference percentage."""
    return self._perceptual_difference

  def get_max_diff_per_channel(self):
    """Returns the maximum difference between the expected and actual images
    for each R/G/B channel, as a list."""
    return self._max_diff_per_channel

  def as_dict(self):
    """Returns a dictionary representation of this DiffRecord, as needed when
    constructing the JSON representation."""
    return {
        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
            self.get_percent_pixels_differing(),
        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
        KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
        KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl,
    }
223
224
225
class ImageDiffDB(object):
  """Calculates differences between image pairs, maintaining a database of
  them for download."""

  def __init__(self, storage_root, gs=None,
               num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
    """
    Args:
      storage_root: string; root path within which the DB will store all of
          its stuff
      gs: instance of GSUtils object we can use to download images
      num_worker_threads: how many threads download images and generate
          diffs simultaneously
    """
    self._storage_root = storage_root
    self._gs = gs

    # State used to report the queue size periodically; the lock guards
    # _last_queue_size_reported.
    self._last_queue_size_reported = None
    self._queue_size_report_lock = threading.RLock()

    # Maps (expected_image_locator, actual_image_locator) tuples to
    # DiffRecords, or to one of the placeholder values _DIFFRECORD_PENDING /
    # _DIFFRECORD_FAILED.
    #
    # Any thread that modifies _diff_dict must first acquire
    # _diff_dict_writelock!
    #
    # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
    # remove items from self._diff_dict if they haven't been accessed for a
    # long time.  We can always regenerate them by diffing the images we
    # previously downloaded to local disk.
    # I guess we should figure out how expensive it is to download vs diff the
    # image pairs... if diffing them is expensive too, we can write these
    # _diff_dict objects out to disk if there's too many to hold in RAM.
    # Or we could use virtual memory to handle that automatically.
    self._diff_dict = {}
    self._diff_dict_writelock = threading.RLock()

    # Queue of pending diff requests, consumed by the worker threads started
    # below.  maxsize must be 0 (unbounded) so that the asynchronous
    # add_image_pair() calls can return as soon as possible.
    self._tasks_queue = Queue.Queue(maxsize=0)
    self._workers = []
    for worker_index in range(num_worker_threads):
      thread = threading.Thread(target=self.worker, args=(worker_index,))
      thread.daemon = True
      thread.start()
      self._workers.append(thread)

  def log_queue_size_if_changed(self, limit_verbosity=True):
    """Log the size of self._tasks_queue, if it has changed since the last call.

    Nothing is logged when the queue has the same size as the last time it
    was reported.

    Args:
      limit_verbosity: if True, only log if the queue size is a multiple of
          QUEUE_LOGGING_GRANULARITY
    """
    # Hold the lock for the whole check-and-update of
    # self._last_queue_size_reported.
    with self._queue_size_report_lock:
      current_size = self._tasks_queue.qsize()
      if current_size == self._last_queue_size_reported:
        return
      if limit_verbosity and (current_size % QUEUE_LOGGING_GRANULARITY != 0):
        return
      logging.info('tasks_queue size is %d' % current_size)
      self._last_queue_size_reported = current_size

  def worker(self, worker_num):
    """Body of a worker thread: pulls tasks off self._tasks_queue forever.

    Each task is turned into a DiffRecord (or _DIFFRECORD_FAILED if creating
    it raised) and stored in self._diff_dict under the task's key.

    Args:
      worker_num: (integer) which worker this is
    """
    while True:
      self.log_queue_size_if_changed()
      key, expected_image_url, actual_image_url = self._tasks_queue.get()
      try:
        outcome = DiffRecord(
            self._gs, self._storage_root,
            expected_image_url=expected_image_url,
            expected_image_locator=key[0],
            actual_image_url=actual_image_url,
            actual_image_locator=key[1])
      except Exception:
        logging.exception(
            'exception while creating DiffRecord for key %s' % str(key))
        outcome = _DIFFRECORD_FAILED
      with self._diff_dict_writelock:
        self._diff_dict[key] = outcome

  @property
  def storage_root(self):
    """Root path handed to the constructor."""
    return self._storage_root

  def add_image_pair(self,
                     expected_image_url, expected_image_locator,
                     actual_image_url, actual_image_locator):
    """Asynchronously prepare a DiffRecord for a pair of images.

    This method will return quickly; calls to get_diff_record() will block
    until the DiffRecord is available (or we have given up on creating it).

    If we already have a DiffRecord for this particular image pair (or one
    has already been requested), no work will be done.

    If expected_image_url (or its locator) is None, just download actual_image.
    If actual_image_url (or its locator) is None, just download expected_image.

    Args:
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))

    # Claim the key while holding the lock, so that only the first request
    # for a given pair is put on the queue.
    with self._diff_dict_writelock:
      is_new_request = key not in self._diff_dict
      if is_new_request:
        self._diff_dict[key] = _DIFFRECORD_PENDING

    if is_new_request:
      self._tasks_queue.put((key, expected_image_url, actual_image_url))
      self.log_queue_size_if_changed()

  def get_diff_record(self, expected_image_locator, actual_image_locator):
    """Returns the DiffRecord for this image pair.

    This call will block until the diff record is available, or we were unable
    to generate it.

    Args:
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)

    Returns the DiffRecord for this image pair, or None if we were unable to
    generate one.
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))

    # Poll once per second until the entry is no longer the pending marker.
    while True:
      diff_record = self._diff_dict[key]
      if diff_record == _DIFFRECORD_PENDING:
        time.sleep(1)
        continue
      break

    if diff_record == _DIFFRECORD_FAILED:
      logging.error(
          'failed to create a DiffRecord for expected_image_locator=%s , '
          'actual_image_locator=%s' % (
              expected_image_locator, actual_image_locator))
      return None
    return diff_record
410
411
412# Utility functions
413
def _download_file(gs, local_filepath, url):
  """Download a file from url to local_filepath, unless it is already there.

  The contents are first written to a per-thread temporary file next to
  local_filepath and then renamed into place.  The temporary file is always
  cleaned up: it is removed when the download fails part-way and when another
  thread has produced local_filepath in the meantime.

  Args:
    gs: instance of GSUtils object, in case the url points at Google Storage
    local_filepath: path on local disk where the image should be stored
    url: HTTP or GS URL from which we can download the image if we don't have
        it yet

  Raises:
    Exception: whatever the underlying download or rename raises.
  """
  global global_file_collisions
  if os.path.exists(local_filepath):
    return
  _mkdir_unless_exists(os.path.dirname(local_filepath))

  # First download the file contents into a unique filename, and
  # then rename that file.  That way, if multiple threads are downloading
  # the same filename at the same time, they won't interfere with each
  # other (they will both download the file, and one will "win" in the end)
  temp_filename = '%s-%d' % (local_filepath,
                             threading.current_thread().ident)
  try:
    if gs_utils.GSUtils.is_gs_url(url):
      (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
      gs.download_file(source_bucket=bucket, source_path=path,
                       dest_path=temp_filename)
    else:
      with contextlib.closing(urllib.urlopen(url)) as url_handle:
        with open(temp_filename, 'wb') as file_handle:
          shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)

    # Rename the file to its real filename.
    # Keep count of how many colliding downloads we encounter;
    # if it's a large number, we may want to change our download strategy
    # to minimize repeated downloads.
    if os.path.exists(local_filepath):
      global_file_collisions += 1
    else:
      os.rename(temp_filename, local_filepath)
  finally:
    # After a successful rename the temp file is gone (ENOENT, ignored).
    # In every other case -- a collision or a failed download -- it would
    # otherwise be left behind on disk.
    try:
      os.remove(temp_filename)
    except OSError as e:
      if e.errno != errno.ENOENT:
        logging.warning('unable to remove temporary file %s: %s',
                        temp_filename, e)
450
451
452def _mkdir_unless_exists(path):
453  """Unless path refers to an already-existing directory, create it.
454
455  Args:
456    path: path on local disk
457  """
458  try:
459    os.makedirs(path)
460  except OSError as e:
461    if e.errno == errno.EEXIST:
462      pass
463
464
def _sanitize_locator(locator):
  """Returns a filename-safe version of a locator.

  Every character matched by DISALLOWED_FILEPATH_CHAR_REGEX is replaced with
  an underscore, so that none of the remaining characters have special
  meaning in filenames.

  Args:
    locator: string, or something that can be represented as a string.
        If None or '', it is returned without modification, because empty
        locators have a particular meaning ("there is no image for this")
  """
  # Falsy locators are passed through untouched.
  if not locator:
    return locator
  locator_text = str(locator)
  return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', locator_text)
478