#! /usr/bin/python

# This is a script to extract given named nodes from a dot file, with
# the associated edges.  An edge is kept iff for edge x -> y
# x and y are both nodes specified to be kept.

# Known issues: if a line contains '->' and is not an edge line
# problems will occur.  If node labels do not begin with
# Node this also will not work.  Since this is designed to work
# on DSA dot output and not general dot files this is ok.
# If you want to use this on other files rename the node labels
# to Node[.*] with a script or something.  This also relies on
# the length of a node name being 13 characters (as it is in all
# DSA dot output files).

# Note that the name of the node can be any substring of the actual
# name in the dot file.  Thus if you specify COLLAPSED
# as a parameter this script will pull out all COLLAPSED
# nodes in the file.  Each name is compiled as a regular expression,
# so regex metacharacters in a name are interpreted as such.

# Specifying escape characters in the name like \n also will not work,
# as Python will make it \\n; I'm not really sure how to fix this.

# Currently the script prints the names it is searching for
# to STDOUT, so you can check to see if they are what you intend.

import re
import string
import sys


# Substring that marks a line as an edge line ("x -> y").
ARROW = '->'

# Substring that marks a (non-edge) line as a node line.
NODE_MARKER = 'Node'

# Number of leading characters of an edge endpoint that are compared
# against the kept node names (see the known issues above).
NODE_NAME_LENGTH = 13

# Splits a line into whitespace-delimited fields.
_WHITESPACE = re.compile(r'\s+')

USAGE = ('usage is ./DSAextract <dot_file_to_modify> '
         '<output_file> [list of nodes to extract]')


def _dot_variable_name(line):
    """Return the field that follows the first run of whitespace in *line*.

    For a node line that starts with whitespace this is the dot
    variable name of the node (it will be Node(hex number)).
    Returns None when the line contains no whitespace at all, which can
    only happen on a final line that lacks a trailing newline.
    """
    fields = _WHITESPACE.split(line, maxsplit=2)
    if len(fields) < 2:
        return None
    return fields[1]


def _find_node_ids(lines, patterns):
    """Collect the dot variable names of all nodes matching *patterns*.

    lines    -- iterable of lines of the dot file
    patterns -- compiled regular expressions, one per requested name

    Edge lines are skipped; every other line that matches at least one
    pattern contributes its dot variable name to the returned set.
    """
    node_ids = set()
    for line in lines:
        # filter out the unnecessary checks on all the edge lines
        if ARROW in line:
            continue
        if any(pattern.search(line) for pattern in patterns):
            name = _dot_variable_name(line)
            if name is not None:
                node_ids.add(name)
    return node_ids


def _keep_line(line, node_ids):
    """Return True if *line* belongs in the output file.

    There are three types of lines:
    1) edge lines, kept only when both endpoints are in *node_ids*;
    2) node lines, kept only when the node is in *node_ids*;
    3) support lines (like page size, etc), always kept.

    Note that this is not completely robust: if a non-edge line for
    some reason contains '->' it will be misidentified.  Hand edit the
    file if this happens.
    """
    if ARROW in line:
        endpoints = line.split(ARROW)
        source = endpoints[0].strip()[:NODE_NAME_LENGTH]
        target = endpoints[1].strip()[:NODE_NAME_LENGTH]
        return source in node_ids and target in node_ids
    if NODE_MARKER in line:
        return _dot_variable_name(line) in node_ids
    return True


def main(argv=None):
    """Run the extraction; return the process exit status.

    argv -- argument list in sys.argv layout (program name, input dot
            file, output file, then any number of node names);
            defaults to sys.argv.

    Returns 1 after printing the usage message when fewer than two
    file arguments are given, 0 otherwise.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print(USAGE)
        # Without both file names there is nothing to do.
        return 1

    input_path = argv[1]
    output_path = argv[2]

    # construct a set of node names and the matching compiled
    # regular expressions
    node_name_set = set(argv[3:])
    patterns = [re.compile(name) for name in node_name_set]

    # first pass: find the dot variable names of the requested nodes
    with open(input_path, 'r') as dot_file:
        node_ids = _find_node_ids(dot_file, patterns)

    # show the names being searched for so the user can check them
    print(node_name_set)

    # second pass: copy the lines that should be kept
    with open(output_path, 'w') as output:
        with open(input_path, 'r') as dot_file:
            for line in dot_file:
                if _keep_line(line, node_ids):
                    output.write(line)
    return 0


if __name__ == '__main__':
    sys.exit(main())