#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Does scraping for all currently-known versions of Chrome"""

import pywintypes
import sys
import types

from drivers import keyboard
from drivers import mouse
from drivers import windowing


# TODO: this has moved, use some logic to find it. For now,
# expects a subst k:.
DEFAULT_PATH = r"k:\chrome.exe"

# Rectangle handed to windowing.WaitForThrobber for every page load.
# NOTE(review): presumably the throbber's bounds inside the frame window;
# confirm against windowing.WaitForThrobber.
_THROBBER_RECT = (20, 16, 36, 32)


def _ResolvePath(kwargs):
  """Return kwargs["path"] when present and non-empty, else DEFAULT_PATH."""
  return kwargs.get("path") or DEFAULT_PATH


def _AsUrlList(urls):
  """Wrap a single URL string in a list; return anything else unchanged."""
  # isinstance (rather than an exact type() match) also accepts subclasses
  # of str/unicode.
  if isinstance(urls, types.StringTypes):
    return [urls]
  return urls


def _LoadUrl(wnd, address_bar, url, timeout):
  """Type url into the address bar and wait for the throbber.

  Returns:
    Whatever windowing.WaitForThrobber returns; callers treat a negative
    value as a timeout.
  """
  # Click in the address bar, type the name, and press Enter
  mouse.ClickInWindow(address_bar)
  keyboard.TypeString(url, 0.1)
  keyboard.TypeString("\n")

  # Wait for the page to finish loading
  return windowing.WaitForThrobber(wnd, _THROBBER_RECT, timeout)


def _CloseBrowser(wnd, proc, timeout):
  """Ask the browser to close, ending the process if it does not exit.

  Returns:
    True if windowing.WaitForProcessExit reported success, False if the
    process had to be ended with windowing.EndProcess.
  """
  windowing.SetForegroundWindow(wnd)

  # Send Alt-F4, then wait for process to end
  keyboard.TypeString(r"{\4}", use_modifiers=True)
  if windowing.WaitForProcessExit(proc, timeout):
    return True

  windowing.EndProcess(proc)
  return False


def _OutputFilename(url, outdir, kwargs):
  """Pick the file that the scrape of url is saved to.

  kwargs["filename"] wins when present: if it is callable it is called with
  the URL, otherwise it is used as-is. Without it the name is derived by
  windowing.URLtoFilename(url, outdir, ".bmp").
  """
  if "filename" not in kwargs:
    return windowing.URLtoFilename(url, outdir, ".bmp")

  chosen = kwargs["filename"]
  if callable(chosen):
    return chosen(url)
  return chosen


def InvokeBrowser(path):
  """Invoke the Chrome browser.

  Args:
    path: full path to browser

  Returns:
    A tuple of (main window, process handle, address bar, render pane).
    The process handle is None when an already-open window was reused.
  """

  # Reuse an existing instance of the browser if we can find one. This
  # may not work correctly, especially if the window is behind other windows.

  # TODO(jhaas): make this work with Vista
  wnds = windowing.FindChildWindows(0, "Chrome_XPFrame")
  if wnds:
    wnd = wnds[0]
    proc = None
  else:
    # Invoke Chrome
    (proc, wnd) = windowing.InvokeAndWait(path)

  # Get windows we'll need
  address_bar = windowing.FindChildWindow(wnd, "Chrome_AutocompleteEdit")
  # NOTE(review): GetChromeRenderPane is not defined in this file;
  # presumably it is assigned onto this module by the importing scraper
  # before use -- verify against callers.
  render_pane = GetChromeRenderPane(wnd)

  return (wnd, proc, address_bar, render_pane)


def Scrape(urls, outdir, size, pos, timeout, kwargs):
  """Invoke a browser, send it to a series of URLs, and save its output.

  Args:
    urls: list of URLs to scrape (a single URL string is also accepted)
    outdir: directory to place output
    size: size of browser window to use
    pos: position of browser window
    timeout: amount of time to wait for page to load
    kwargs: miscellaneous keyword args, passed as a dict. "path" overrides
      DEFAULT_PATH; "filename" overrides the output file name and may be a
      callable taking the URL.

  Returns:
    None if success, else an error string ("crashed" or "timeout")
  """
  path = _ResolvePath(kwargs)

  (wnd, proc, address_bar, render_pane) = InvokeBrowser(path)

  # Resize and reposition the frame
  windowing.MoveAndSizeWindow(wnd, pos, size, render_pane)

  # Visit each URL we're given
  timedout = False

  for url in _AsUrlList(urls):
    load_time = _LoadUrl(wnd, address_bar, url, timeout)
    timedout = load_time < 0

    # Stop at the first page that fails to load in time.
    if timedout:
      break

    # Scrape the page
    image = windowing.ScrapeWindow(render_pane)

    # Save to disk
    image.save(_OutputFilename(url, outdir, kwargs))

  # Only close the browser if we launched it ourselves (proc is None when
  # an existing window was reused).
  if proc and not _CloseBrowser(wnd, proc, timeout):
    return "crashed"

  if timedout:
    return "timeout"

  return None


def Time(urls, size, timeout, kwargs):
  """Measure how long it takes to load each of a series of URLs

  Args:
    urls: list of URLs to time (a single URL string is also accepted)
    size: size of browser window to use
    timeout: amount of time to wait for page to load
    kwargs: miscellaneous keyword args, passed as a dict. "path" overrides
      DEFAULT_PATH.

  Returns:
    A list of tuples (url, time). "time" can be "crashed" or "timeout"
  """
  path = _ResolvePath(kwargs)
  proc = None

  ret = []
  # Visit each URL we're given
  for url in _AsUrlList(urls):
    try:
      # Invoke the browser if necessary
      if not proc:
        (wnd, proc, address_bar, render_pane) = InvokeBrowser(path)

        # Resize and reposition the frame
        windowing.MoveAndSizeWindow(wnd, (0, 0), size, render_pane)

      load_time = _LoadUrl(wnd, address_bar, url, timeout)

      if load_time < 0:
        load_time = "timeout"

        # Send an alt-F4 to make the browser close; if this times out,
        # we've probably got a crash
        if not _CloseBrowser(wnd, proc, timeout):
          load_time = "crashed"
        # Forget the handle so the next URL starts a fresh browser.
        proc = None
    except pywintypes.error:
      proc = None
      load_time = "crashed"

    ret.append((url, load_time))

  # Close the browser if the last page left it running.
  if proc:
    _CloseBrowser(wnd, proc, timeout)

  return ret


def main():
  """Run a small smoke test: scrape three sites into a fixed directory."""
  # We're being invoked rather than imported, so run some tests
  path = r"c:\sitecompare\scrapes\chrome\0.1.97.0"
  windowing.PreparePath(path)

  # Scrape three sites and save the results. Scrape requires an explicit
  # timeout and a kwargs dict; 20 is an arbitrary choice for this smoke test
  # (presumably seconds -- confirm against windowing.WaitForThrobber).
  Scrape([
    "http://www.microsoft.com",
    "http://www.google.com",
    "http://www.sun.com"],
    path, (1024, 768), (0, 0), 20, {})
  return 0


if __name__ == "__main__":
  sys.exit(main())