# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Component for automatically creating masks of changing areas of a website.

Works by repeatedly invoking a browser and scraping the resulting page.
Areas that differ between scrapes are added to the auto-generated mask (a
bilevel image in which black pixels mark changing areas). The mask generator
considers the mask complete when further scrapes produce no new differences.
"""

import os            # Functions for walking the directory tree
import tempfile      # Get a temporary directory to hold intermediates
import time          # Used for sleep() and naming masks by time

import command_line
import drivers
from PIL import Image
from PIL import ImageChops
import scrapers


def CreateCommand(cmdline):
  """Inserts the command and arguments into a command line for parsing."""
  cmd = cmdline.AddCommand(
    ["maskmaker"],
    "Automatically generates a mask from a list of URLs",
    ValidateMaskmaker,
    ExecuteMaskmaker)

  cmd.AddArgument(
    ["-bp", "--browserpath"], "Full path to browser's executable",
    type="readfile", metaname="PATH")
  cmd.AddArgument(
    ["-b", "--browser"], "Which browser to use", type="string",
    default="chrome")
  cmd.AddArgument(
    ["-bv", "--browserver"], "Version of the browser", metaname="VERSION")
  cmd.AddArgument(
    ["-o", "--outdir"], "Directory to store generated masks", metaname="DIR",
    required=True)
  cmd.AddArgument(
    ["-u", "--url"], "URL to generate a mask for")
  cmd.AddArgument(
    ["-l", "--list"], "List of URLs to generate masks for", type="readfile")
  cmd.AddMutualExclusion(["--url", "--list"])
  cmd.AddArgument(
    ["-s", "--startline"], "First line of URL list", type="int")
  cmd.AddArgument(
    ["-e", "--endline"], "Last line of URL list (exclusive)", type="int")
  cmd.AddArgument(
    ["-c", "--count"], "Number of lines of URL file to use", type="int")
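  # The line-range options only make sense when reading URLs from a file;
  # --count (a number of lines) and --endline (an absolute line) are two
  # mutually exclusive ways to bound the range.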
  cmd.AddRequiredGroup(["--url", "--list"])
  cmd.AddDependency("--startline", "--list")
  cmd.AddDependency("--endline", "--list")
  cmd.AddDependency("--count", "--list")
  cmd.AddMutualExclusion(["--count", "--endline"])
  cmd.AddDependency("--count", "--startline")
  cmd.AddArgument(
    ["-t", "--timeout"],
    "Amount of time (in seconds) to wait for the browser to finish loading",
    type="int", default=60)
  cmd.AddArgument(
    ["-w", "--wait"],
    "Amount of time (in seconds) to wait between successive scrapes",
    type="int", default=60)
  cmd.AddArgument(
    ["-sc", "--scrapes"],
    "Number of successive scrapes which must result in no change to a mask "
    "before mask creation is considered complete", type="int", default=10)
  cmd.AddArgument(
    ["-sz", "--size"], "Browser window size", default=(800, 600),
    type="coords")
  cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes")
  cmd.AddArgument(
    ["-gu", "--giveup"],
    "Number of times to scrape before giving up", type="int", default=50)
  cmd.AddArgument(
    ["-th", "--threshold"],
    "Percentage of different pixels (0-100) above which the scrape will be "
    "discarded and the mask not updated", type="int", default=100)
  cmd.AddArgument(
    ["-er", "--errors"],
    "Number of times a scrape can fail before giving up on the URL",
    type="int", default=1)


def ValidateMaskmaker(command):
  """Validate the arguments to maskmaker. Raises ParseError if failed."""
  executables = [".exe", ".com", ".bat"]
  if command["--browserpath"]:
    if os.path.splitext(command["--browserpath"])[1].lower() not in executables:
      raise command_line.ParseError("Browser filename must be an executable")


def ExecuteMaskmaker(command):
  """Performs automatic mask generation."""

  class MaskmakerURL(object):
    """Helper class holding per-URL state for maskmaker."""
    __slots__ = ['url', 'consecutive_successes', 'errors']
    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

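  # Build the list of URLs to generate masks for. When reading from a file,
  # --startline/--endline index the file's lines with Python slice semantics
  # (0-based start, exclusive end); --count is converted into an equivalent
  # endline.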
  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline + command["--count"]
    else:
      endline = command["--endline"]
    with open(command["--list"], "r") as url_file:
      url_list = [MaskmakerURL(url.strip()) for url in
                  url_file.readlines()[startline:endline]]

  complete_list = []
  error_list = []

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  scrape_pass = 0

  scrapedir = command["--scrapedir"]
  if not scrapedir:
    scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Iterate through the list of URLs until every URL either has a completed
  # mask or has failed too many times, or until the giveup limit on scrape
  # passes is exceeded
  while url_list and scrape_pass < command["--giveup"]:
    # Scrape each URL
    for url in url_list:
      print "Processing %r..." % url.url
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask. This is in a loop so we can try to recover
      # from error conditions
      while True:
        try:
          mask = Image.open(mask_filename)
          if mask.size != size:
            print "  %r already exists and is the wrong size! (%r vs %r)" % (
              mask_filename, mask.size, size)
            mask_filename = "%s_%r%s" % (
              mask_filename[:-4], size, mask_filename[-4:])
            print "  Trying again as %r..." % mask_filename
            continue
          break
        except IOError:
          print "  %r does not exist, creating" % mask_filename
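          # A brand-new mask is all white: white (1) pixels take part in
          # comparisons, black (0) pixels are masked out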
          mask = Image.new("1", size, 1)
          mask.save(mask_filename)

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
        scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image
      mask_scrapes = os.listdir(mask_scrape_dir)
      mask_scrapes.sort()

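      # Scrape filenames are timestamps, so the lexicographically first file
      # is the oldest scrape and serves as the baseline. On a URL's first
      # pass no baseline exists yet; the scrape taken below becomes the
      # baseline for later passes.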
      if not mask_scrapes:
        print "  No baseline image found, mask will not be updated"
        baseline = None
      else:
        baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))

      mask_scrape_filename = os.path.join(mask_scrape_dir,
                                          time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
        [url.url], mask_scrape_dir, size, (0, 0),
        command["--timeout"], path=command["--browserpath"],
        filename=mask_scrape_filename)

      if result:
        # Return value other than None means an error
        print "  Scrape failed with error %r" % result
        url.errors += 1
        if url.errors >= errors:
          print "  ** Exceeded maximum error count for this URL, giving up"
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))
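        # Multiplying by the mask zeroes the difference wherever the mask is
        # already black, so masked-out areas cannot trigger further updates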

        # If there is no difference, there's nothing to update
        if max(diff.getextrema()) == (0, 0):
          print "  Scrape identical to baseline, no change in mask"
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print "  ** No change for %d scrapes, done!" % scrapes
        else:
          # convert the difference to black and white, then change all
          # black pixels (where the scrape and the baseline were identical)
          # to white, and all others (where the scrape and the baseline
          # differed) to black.
          #
          # Since the command below is a little unclear, here's how it works:
          #    1. convert("L") converts the RGB image to grayscale
          #    2. point() maps grayscale values (or the individual channels
          #       of an RGB image) to different ones. Because it operates on
          #       individual channels, the grayscale conversion from step 1
          #       is necessary.
          #    3. The "1" second parameter to point() outputs the result as
          #       a monochrome bitmap. If the original RGB image were
          #       converted directly to monochrome, PIL would dither it.
          diff = diff.convert("L").point([255]+[0]*255, "1")

          # count the number of different pixels
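          # (getcolors() on a bilevel image returns (count, value) pairs in
          # ascending pixel order, and this branch is only reached when some
          # pixel differs, so entry 0 is the count of black, differing pixels)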
          diff_pixels = diff.getcolors()[0][0]

          # is this too much?
          diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])
          if diff_pixel_percent > command["--threshold"]:
            print ("  Scrape differed from baseline by %.2f percent, ignoring"
                   % diff_pixel_percent)
          else:
            print "  Scrape differed in %d pixels, updating mask" % diff_pixels
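            # Multiplying the two bilevel images leaves a pixel white only if
            # it is white in both, so each differing pixel goes (and stays)
            # black in the mask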
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Move URLs that are done (mask complete) or that have failed too many
    # times off the work list
    complete_list.extend(
      [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
      [url for url in url_list if url.errors >= errors])
    url_list = [
      url for url in url_list if
      url.consecutive_successes < scrapes and
      url.errors < errors]

    scrape_pass += 1
    print "**Done with scrape pass %d\n" % scrape_pass

    if scrape_pass >= command["--giveup"]:
      print "**Exceeded giveup threshold. Giving up."
    elif url_list:
      print "Waiting %d seconds..." % command["--wait"]
      time.sleep(command["--wait"])

  print
  print "*** MASKMAKER COMPLETE ***"
  print "Summary report:"
  print "  %d masks successfully generated" % len(complete_list)
  for url in complete_list:
    print "    ", url.url
  print "  %d masks failed with too many errors" % len(error_list)
  for url in error_list:
    print "    ", url.url
  if scrape_pass >= command["--giveup"]:
    print ("  %d masks were not completed before "
           "reaching the giveup threshold" % len(url_list))
    for url in url_list:
      print "    ", url.url
273