#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Does scraping for all currently-known versions of Chrome"""

import sys
import types

import pywintypes

from drivers import keyboard
from drivers import mouse
from drivers import windowing


# TODO: chrome.exe has moved; add some logic to locate it. Until then this
# expects a drive substitution (subst) for k: whose root contains chrome.exe.
DEFAULT_PATH = r"k:\chrome.exe"


def InvokeBrowser(path):
  """Find a running Chrome window, or launch Chrome if there is none.

  Args:
    path: full path to browser; only used when no existing window is found

  Returns:
    A tuple of (main window, process handle, address bar, render pane).
    The process handle is None when an existing window was reused.
  """

  # TODO(jhaas): make this work with Vista
  wnds = windowing.FindChildWindows(0, "Chrome_XPFrame")
  if wnds:
    # Reuse an existing instance of the browser. This may not work
    # correctly, especially if the window is behind other windows.
    wnd = wnds[0]
    proc = None
  else:
    # No existing window was found (empty or missing result): invoke Chrome
    (proc, wnd) = windowing.InvokeAndWait(path)

  # Get windows we'll need
  address_bar = windowing.FindChildWindow(wnd, "Chrome_AutocompleteEdit")
  # NOTE(review): GetChromeRenderPane is not defined in the visible part of
  # this file; presumably it is supplied from elsewhere before this function
  # is called -- confirm.
  render_pane = GetChromeRenderPane(wnd)

  return (wnd, proc, address_bar, render_pane)


def Scrape(urls, outdir, size, pos, timeout, kwargs):
  """Invoke a browser, send it to a series of URLs, and save its output.

  Args:
    urls: list of URLs to scrape; a single URL string is also accepted
    outdir: directory to place output (only used when kwargs has no
      "filename" entry)
    size: size of browser window to use
    pos: position of browser window
    timeout: amount of time to wait for each page to load, and for the
      browser process to exit afterwards
    kwargs: dict of miscellaneous options. Keys read here:
      "path": full path to the browser; DEFAULT_PATH if missing or empty
      "filename": output filename, or a callable mapping a URL to one

  Returns:
    None if success, else an error string ("timeout" or "crashed")
  """
  path = kwargs.get("path") or DEFAULT_PATH

  (wnd, proc, address_bar, render_pane) = InvokeBrowser(path)

  # Resize and reposition the frame
  windowing.MoveAndSizeWindow(wnd, pos, size, render_pane)

  # Accept a bare URL string as well as a list. isinstance() also covers
  # string subclasses, which would otherwise be iterated character by
  # character. types.StringTypes is not available on every Python version,
  # hence the fallback.
  string_types = getattr(types, "StringTypes", (str,))
  if isinstance(urls, string_types):
    urls = [urls]

  timedout = False

  # Visit each URL we're given
  for url in urls:
    # Click in the address bar, type the URL, and press Enter
    mouse.ClickInWindow(address_bar)
    keyboard.TypeString(url, 0.1)
    keyboard.TypeString("\n")

    # Wait for the page to finish loading; a negative result is treated as
    # a timeout and stops the run.
    # NOTE(review): (20, 16, 36, 32) is presumably the throbber's rectangle
    # inside the window -- confirm against windowing.WaitForThrobber.
    load_time = windowing.WaitForThrobber(wnd, (20, 16, 36, 32), timeout)
    if load_time < 0:
      timedout = True
      break

    # Scrape the page
    image = windowing.ScrapeWindow(render_pane)

    # Save to disk, letting the caller override the generated filename
    if "filename" in kwargs:
      filename = kwargs["filename"]
      if callable(filename):
        filename = filename(url)
    else:
      filename = windowing.URLtoFilename(url, outdir, ".bmp")
    image.save(filename)

  # proc is only set when we launched the browser ourselves; a reused
  # browser window is left open.
  if proc:
    windowing.SetForegroundWindow(wnd)

    # Send Alt-F4, then wait for process to end
    keyboard.TypeString(r"{\4}", use_modifiers=True)
    if not windowing.WaitForProcessExit(proc, timeout):
      windowing.EndProcess(proc)
      return "crashed"

  if timedout:
    return "timeout"

  return None


def Time(urls, size, timeout, kwargs):
  """Measure how long it takes to load each of a series of URLs

  Args:
    urls: list of URLs to time; a single URL string is also accepted
    size: size of browser window to use
    timeout: amount of time to wait for each page to load, and for the
      browser process to exit when it has to be closed
    kwargs: dict of miscellaneous options. Keys read here:
      "path": full path to the browser; DEFAULT_PATH if missing or empty

  Returns:
    A list of tuples (url, time). "time" is the value reported by
    windowing.WaitForThrobber, or the string "crashed" or "timeout"
  """
  path = kwargs.get("path") or DEFAULT_PATH
  proc = None

  # Accept a bare URL string as well as a list. isinstance() also covers
  # string subclasses, which would otherwise be iterated character by
  # character. types.StringTypes is not available on every Python version,
  # hence the fallback.
  string_types = getattr(types, "StringTypes", (str,))
  if isinstance(urls, string_types):
    urls = [urls]

  ret = []

  # Visit each URL we're given
  for url in urls:
    try:
      # Invoke the browser if necessary (first URL, or after the previous
      # URL timed out or crashed and proc was reset)
      if not proc:
        (wnd, proc, address_bar, render_pane) = InvokeBrowser(path)

        # Resize and reposition the frame
        windowing.MoveAndSizeWindow(wnd, (0, 0), size, render_pane)

      # Click in the address bar, type the URL, and press Enter
      mouse.ClickInWindow(address_bar)
      keyboard.TypeString(url, 0.1)
      keyboard.TypeString("\n")

      # Wait for the page to finish loading; a negative result is treated
      # as a timeout
      load_time = windowing.WaitForThrobber(wnd, (20, 16, 36, 32), timeout)

      if load_time < 0:
        load_time = "timeout"

        # Send an alt-F4 to make the browser close; if this times out,
        # we've probably got a crash
        # NOTE(review): proc is None here when InvokeBrowser reused an
        # existing window, so the calls below then receive None -- confirm
        # how windowing handles that.
        windowing.SetForegroundWindow(wnd)

        keyboard.TypeString(r"{\4}", use_modifiers=True)
        if not windowing.WaitForProcessExit(proc, timeout):
          windowing.EndProcess(proc)
          load_time = "crashed"
        # Force a fresh browser for the next URL
        proc = None
    except pywintypes.error:
      # Any win32 error while driving the browser is recorded as a crash
      proc = None
      load_time = "crashed"

    ret.append((url, load_time))

  # Close the browser if one we launched is still running
  if proc:
    windowing.SetForegroundWindow(wnd)
    keyboard.TypeString(r"{\4}", use_modifiers=True)
    if not windowing.WaitForProcessExit(proc, timeout):
      windowing.EndProcess(proc)

  return ret


def main():
  """Run a small manual test: scrape three sites into a fixed directory.

  Returns:
    0, which the caller uses as the process exit status.
  """
  # We're being invoked rather than imported, so run some tests
  outdir = r"c:\sitecompare\scrapes\chrome\0.1.97.0"
  windowing.PreparePath(outdir)

  # Scrape takes (urls, outdir, size, pos, timeout, kwargs); the timeout and
  # the options dict are required positional arguments.
  # NOTE(review): 20 is presumably in seconds -- confirm against
  # windowing.WaitForThrobber.
  timeout = 20

  # Scrape three sites and save the results
  Scrape([
    "http://www.microsoft.com",
    "http://www.google.com",
    "http://www.sun.com"],
         outdir, (1024, 768), (0, 0), timeout, {})
  return 0


# Run the manual test only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
  sys.exit(main())