xml.awk revision 05be040fdd9fa9d23259d6b6a4aaf4f2aca9c9f2
1# Copyright (C) 2010 The Android Open Source Project
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#      http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14#
15
16# Tiny XML parser implementation in awk.
17#
18# This file is not meant to be used directly, instead copy the
19# functions it defines here into your own script then specialize
20# it appropriately.
21#
22
23# See further below for usage instructions and implementation details.
24#
25
26# ---------------------------- cut here ---------------------------
27
# xml_event: advance to the next tag in the input and describe it.
#
# Returns 1 after setting:
#   XML_TYPE   "BEGIN" for an opening tag, "END" for a closing one
#   XML_TAG    the tag name, uppercased
#   XML_ATTR   lowercased attribute name -> raw (not unescaped) value;
#              emptied at the start of every call
#   XML_RPATH  maintained through _xml_enter() / _xml_exit()
# Returns 0 when getline fails or when a record holds no '<' (end of
# input, or malformed input).  Invalid tag names, attribute names or
# attribute value syntax go through _xml_panic().
#
# Side effects: sets RS to ">" and overwrites $0.
function xml_event () {
    RS=">";
    XML_TAG=XML_TYPE="";
    split("", XML_ATTR);  # portable idiom to empty an array
    while ( 1 ) {
        if (_xml_closing) { # delayed direct tag closure
            # The previous call returned BEGIN for a self-closing tag;
            # emit the matching END now, without reading more input.
            XML_TAG = _xml_closing;
            XML_TYPE = "END";
            _xml_closing = "";
            _xml_exit(XML_TAG);
            return 1;
        }
        if (getline <= 0) return 0; # read new input record (up to next '>')
        _xml_p = index($0, "<"); # get start marker
        if (_xml_p == 0) return 0; # end of file (or malformed input)
        $0 = substr($0, _xml_p) # remove anything before '<' (the '<' is kept)
        # ignore CData / Comments / Processing instructions / Declarations
        # (order matters: the generic "<!" pattern must be tried last)
        if (_xml_in_section("<!\\[[Cc][Dd][Aa][Tt][Aa]\\[", "]]") ||
            _xml_in_section("<!--", "--") ||
            _xml_in_section("<\\?", "\\?") ||
            _xml_in_section("<!", "")) {
            continue;
        }
        if (substr($0, 1, 2) == "</") { # is it a closing tag ?
            XML_TYPE = "END";
            $0 = substr($0, 3);
        } else { # nope, it's an opening one
            XML_TYPE = "BEGIN";
            $0 = substr($0, 2);
        }
        XML_TAG = $0
        sub("[ \n\t/].*$", "", XML_TAG);  # extract tag name
        XML_TAG = toupper(XML_TAG);       # uppercase it
        if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ )  # validate it
            _xml_panic("Invalid tag name: " XML_TAG);
        if (XML_TYPE == "BEGIN") {  # update reverse path
            _xml_enter(XML_TAG);
        } else {
            _xml_exit(XML_TAG);
        }
        # Get rid of the tag name and the spaces after it.  The name stops
        # at '/' as well as at whitespace, so that the '/' of "<foo/>" is
        # left in $0 and is seen by the direct-closure test below.
        sub("^[^ \n\t/]*[ \n\t]*", "", $0);
        # Compare against "" explicitly: a plain truth test would treat a
        # numeric-looking zero remainder such as "0" as false and silently
        # skip it.
        while ($0 != "") { # process attributes
            if ($0 == "/") {  # deal with direct closing tag, e.g. <foo/>
                _xml_closing = XML_TAG; # record delayed tag closure.
                break
            }
            _xml_attrib = $0;
            sub(/=.*$/,"",_xml_attrib);  # extract attribute name
            sub(/^[^=]*/,"",$0);         # remove it from record
            _xml_attrib = tolower(_xml_attrib);
            if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ ) # validate it
                _xml_panic("Invalid attribute name: " _xml_attrib);
            if (substr($0,1,2) == "=\"") { # value is ="something"
                _xml_value = substr($0,3);
                sub(/".*$/,"",_xml_value);
                sub(/^="[^"]*"/,"",$0);
            } else if (substr($0,1,2) == "='") { # value is ='something'
                _xml_value = substr($0,3);
                sub(/'.*$/,"",_xml_value);
                sub(/^='[^']*'/,"",$0);
            } else {
                _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
            }
            XML_ATTR[_xml_attrib] = _xml_value;  # store attribute name/value
            sub(/^[ \t\n]*/,"",$0); # get rid of remaining leading spaces
        }
        return 1; # now return, XML_TYPE/TAG/ATTR/RPATH are set
    }
}
97
# _xml_panic: report a fatal parsing error.
#
#   msg: text printed to standard error.
#
# After printing, the script exits with status 1.
function _xml_panic (msg) {
    print msg > "/dev/stderr";
    exit 1;
}
102
# _xml_in_section: skip an ignorable section starting in the current record.
#
#   sec_begin: regex that must match at the start of $0.
#   sec_end:   regex that must match at the end of a record (the trailing
#              '>' is the record separator and is never part of $0).
#
# Returns 0 when $0 does not start with sec_begin.  Otherwise keeps reading
# records until one ends with sec_end and returns 1; panics if the input
# runs out first.
function _xml_in_section (sec_begin, sec_end) {
    if (match($0, "^" sec_begin) == 0)
        return 0;
    for (;;) {
        # The section may contain '>' characters and so span several records.
        if (match($0, sec_end "$") != 0)
            return 1;
        if ((getline) <= 0)
            _xml_panic("Unexpected EOF: " ERRNO);
    }
}
110
# _xml_enter: push 'tag' onto the front of the reversed element path.
# XML_RPATH keeps a trailing "/" after every component.
function _xml_enter (tag) {
    XML_RPATH = (tag "/") XML_RPATH
}
114
# _xml_exit: pop 'tag' from the front of the reversed element path.
#
#   tag: name of the tag being closed.
#
# The innermost open tag is the part of XML_RPATH before its first "/".
# If it differs from 'tag', the input is improperly nested and the
# script panics.  Otherwise that component and its "/" are removed.
function _xml_exit (tag) {
    _xml_p = index(XML_RPATH, "/");
    _xml_expected = substr(XML_RPATH, 1, _xml_p-1);
    # Check the argument, not the XML_TAG global, so the function does
    # what its signature says whatever the caller passes.
    if (_xml_expected != tag)
        _xml_panic("Unexpected close tag: " tag ", expecting " _xml_expected);
    XML_RPATH = substr(XML_RPATH, _xml_p+1);
}
122
123# ---------------------------- cut here ---------------------------
124
125# USAGE:
126#
127# The functions provided here are used to extract the tags and attributes of a
128# given XML file. They do not support extraction of data, CDATA, comments,
129# processing instructions and declarations at all.
130#
131# You should use this from the BEGIN {} action of your awk script (it will
132# not work from an END {} action).
133#
# Call xml_event() in a while loop. This function returns 1 for each XML
# 'event' encountered, or 0 when the end of input is reached. Note that in
# case of malformed input, an error will be printed and the script will
# force an exit(1).
138#
# After each successful xml_event() call, the following variables will be set:
#
#    XML_TYPE:  type of event: "BEGIN" means an opening tag, "END" a
#               closing one.
143#
144#    XML_TAG:   name of the tag, always in UPPERCASE!
145#
#    XML_ATTR:  a map of the tag's attributes. Only set for "BEGIN" events.
#               All attribute names are in lowercase.
148#
149#               beware: values are *not* unescaped !
150#
151#    XML_RPATH: the _reversed_ element path, using "/" as a separator.
152#               if you are within the <manifest><application> tag, then
153#               it will be set to "APPLICATION/MANIFEST/"
154#               (note the trailing slash).
155#
156
157# This is a simple example that dumps the output of the parsing.
158#
BEGIN {
    # Print one line per event: type, tag and reversed path, followed by
    # the attributes for opening tags.
    while (xml_event() != 0) {
        printf("XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH);
        if (XML_TYPE == "BEGIN")
            for (attr in XML_ATTR)
                printf(" %s='%s'", attr, XML_ATTR[attr]);
        printf("\n");
    }
}
170
171# IMPLEMENTATION DETAILS:
172#
173# 1. '>' as the record separator:
174#
175# RS is set to '>' to use this character as the record separator, instead of
176# the default '\n'. This means that something like the following:
177#
178#   <foo><bar attrib="value">stuff</bar></foo>
179#
180# will be translated into the following successive 'records':
181#
182#  <foo
183#  <bar attrib="value"
184#  stuff</bar
185#  </foo
186#
187# Note that the '>' is never part of the records and thus will not be matched.
188# If the record does not contain a single '<', the input is either
189# malformed XML, or we reached the end of file with data after the last
190# '>'.
191#
192# Newlines in the original input are kept in the records as-is.
193#
194# 2. Getting rid of unwanted stuff:
195#
196# We don't need any of the data within elements, so we get rid of them by
197# simply ignoring anything before the '<' in the current record. This is
198# done with code like this:
199#
200#     p = index($0, "<");       # get index of '<'
201#     if (p == 0) -> return 0;  # malformed input or end of file
#     $0 = substr($0, p);       # remove anything before the '<' (the '<' is kept)
203#
204# We also want to ignore certain sections like CDATA, comments, declarations,
205# etc.. These begin with a certain pattern and end with another one, e.g.
206# "<!--" and "-->" for comments. This is handled by the _xml_in_section()
207# function that accepts two patterns as input:
208#
209#    sec_begin: is the pattern for the start of the record.
210#    sec_end:   is the pattern for the end of the record (minus trailing '>').
211#
# The function deals with the fact that these sections can embed a valid '>'
# and will then span multiple records, i.e. something like:
214#
215#  <!-- A comment with an embedded > right here ! -->
216#
217# will be decomposed into two records:
218#
219#   "<!-- A comment with an embedded "
220#   " right here ! --"
221#
222# The function deals with this case, and exits when such a section is not
223# properly terminated in the input.
224#
225# _xml_in_section() returns 1 if an ignorable section was found, or 0 otherwise.
226#
227# 3. Extracting the tag name:
228#
229# </foo> is a closing tag, and <foo> an opening tag, this is handled
230# by the following code:
231#
232#       if (substr($0, 1, 2) == "</") {
233#           XML_TYPE = "END";
234#           $0 = substr($0, 3);
235#       } else {
236#           XML_TYPE = "BEGIN";
237#           $0 = substr($0, 2);
238#       }
239#
240# which defines XML_TYPE, and removes the leading "</" or "<" from the record.
241# The tag is later extracted and converted to uppercase with:
242#
243#       XML_TAG = $0                      # copy record
244#       sub("[ \n\t/].*$", "", XML_TAG);  # remove anything after tag name
#       XML_TAG = toupper(XML_TAG);       # convert to uppercase
246#       # validate tag
247#       if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) -> panic
248#
249# Then the record is purged from the tag name and the spaces after it:
250#
251#       # get rid of tag and spaces after it in $0
252#       sub("[^ \n\t]*[ \n\t]*", "", $0);
253#
254# 4. Maintaining XML_RPATH:
255#
256# The _xml_enter() and _xml_exit() functions are called to maintain the
257# XML_RPATH variable when entering and exiting specific tags. _xml_exit()
258# will also validate the input, checking proper tag enclosure (or exit(1)
259# in case of error).
260#
261#       if (XML_TYPE == "BEGIN") {
262#           _xml_enter(XML_TAG);
263#       } else {
264#           _xml_exit(XML_TAG);
265#       }
266#
267# 5. Extracting attributes:
268#
269# A loop is implemented to parse attributes, the idea is to get the attribute
270# name, which is always followed by a '=' character:
271#
272#           _xml_attrib = $0;              # copy record.
273#           sub(/=.*$/,"",_xml_attrib);    # get rid of '=' and anything after.
274#           sub(/^[^=]*/,"",$0);           # remove attribute name from $0
275#           _xml_attrib = tolower(_xml_attrib);
276#           if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )
277#               _xml_panic("Invalid attribute name: " _xml_attrib);
278#
279# Now get the value, which is enclosed by either (") or (')
280#
281#          if (substr($0,1,2) == "=\"") {        # if $0 begins with ="
282#               _xml_value = substr($0,3);       # extract value
283#               sub(/".*$/,"",_xml_value);  
284#               sub(/^="[^"]*"/,"",$0);          # remove it from $0
285#           } else if (substr($0,1,2) == "='") { # if $0 begins with ='
286#               _xml_value = substr($0,3);       # extract value
287#               sub(/'.*$/,"",_xml_value);
288#               sub(/^='[^']*'/,"",$0);          # remove it from $0
289#           } else {
290#               -> panic (malformed input)
291#           }
292#
293# After that, we simply store the value into the XML_ATTR associative
294# array, and cleanup $0 from leading spaces:
295#
296#           XML_ATTR[_xml_attrib] = _xml_value;
297#           sub(/^[ \t\n]*/,"",$0);
298#
299#
300# 6. Handling direct tag closure:
301#
# When a tag is closed directly (as in <foo/>), a single '/' will be
# parsed in the attribute parsing loop. We need to record this for the
# next call to xml_event(), since the current one should return a "BEGIN"
# for the "FOO" tag instead.
306#
307# We do this by setting the special _xml_closing variable, as in:
308#
309#          if ($0 == "/") {
310#               # record a delayed tag closure for the next call
311#               _xml_closing = XML_TAG;
312#               break
313#           }
314#
315# This variable is checked at the start of xml_event() like this:
316#
317#       # delayed tag closure - see below
318#       if (_xml_closing) {
319#           XML_TAG = _xml_closing;
320#           XML_TYPE = "END";
321#           _xml_closing = "";
322#           _xml_exit(XML_TAG);
323#           return 1;
324#       }
325#
326# Note the call to _xml_exit() to update XML_RPATH here.
327#
328