# build/awk/xml.awk

# Copyright (C) 2010 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Tiny XML parser implementation in awk.
#
# This file is not meant to be used directly, instead copy the
# functions it defines here into your own script then specialize
# it appropriately.
#

# See further below for usage instructions and implementation details.
#

# ---------------------------- cut here ---------------------------

# xml_event(): read the next XML 'event' (a tag opening or closing) from the
# current input.
#
# Returns 1 when an event was parsed, or 0 when getline fails / reaches end of
# input, or when a record contains no '<' at all.
#
# On a successful return the following globals are set:
#   XML_TYPE   "BEGIN" or "END"
#   XML_TAG    tag name, uppercased
#   XML_ATTR   attribute map (lowercased names, raw values); cleared on
#              every call
#   XML_RPATH  reversed element path, maintained through _xml_enter() and
#              _xml_exit()
#
# Malformed tag names, attribute names or attribute values, and mismatched
# close tags, end the script through _xml_panic().
#
# Side effects: sets RS to ">" and consumes input records with getline,
# overwriting $0.
function xml_event () {
    RS=">";
    XML_TAG=XML_TYPE="";
    split("", XML_ATTR);  # portable way to empty the array
    while ( 1 ) {
        if (_xml_closing) { # delayed direct tag closure
            # The previous call returned BEGIN for a tag written as <foo/>;
            # report the matching END now without reading any input.
            XML_TAG = _xml_closing;
            XML_TYPE = "END";
            _xml_closing = "";
            _xml_exit(XML_TAG);
            return 1;
        }
        if (getline <= 0) return 0; # read new input record
        _xml_p = index($0, "<"); # get start marker
        if (_xml_p == 0) return 0; # end of file (or malformed input)
        $0 = substr($0, _xml_p) # remove anything before '<'
        # ignore CData / Comments / Processing instructions / Declarations
        if (_xml_in_section("<!\\[[Cc][Dd][Aa][Tt][Aa]\\[", "]]") ||
            _xml_in_section("<!--", "--") ||
            _xml_in_section("<\\?", "\\?") ||
            _xml_in_section("<!", "")) {
            continue;
        }
        if (substr($0, 1, 2) == "</") { # is it a closing tag ?
            XML_TYPE = "END";
            $0 = substr($0, 3);
        } else { # nope, it's an opening one
            XML_TYPE = "BEGIN";
            $0 = substr($0, 2);
        }
        XML_TAG = $0
        sub("[ \n\t/].*$", "", XML_TAG);  # extract tag name
        XML_TAG = toupper(XML_TAG);       # uppercase it
        if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ )  # validate it
            _xml_panic("Invalid tag name: " XML_TAG);
        if (XML_TYPE == "BEGIN") {  # update reverse path
            _xml_enter(XML_TAG);
        } else {
            _xml_exit(XML_TAG);
        }
        # Get rid of the tag name and the spaces after it. The name stops at
        # '/' as well as at whitespace, so that the '/' of a direct closure
        # written without a space (<foo/>) is left in the record for the
        # loop below instead of being discarded together with the name.
        sub("^[^ \n\t/]*[ \n\t]*", "", $0);
        # Explicit emptiness test: a plain truthiness test could treat a
        # leftover such as "0" as false and skip it without validation.
        while ($0 != "") { # process attributes
            if ($0 == "/") {  # deal with direct closing tag, e.g. <foo/>
                _xml_closing = XML_TAG; # record delayed tag closure.
                break
            }
            _xml_attrib = $0;
            sub(/=.*$/,"",_xml_attrib);  # extract attribute name
            sub(/^[^=]*/,"",$0);         # remove it from record
            _xml_attrib = tolower(_xml_attrib);
            if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ ) # validate it
                _xml_panic("Invalid attribute name: " _xml_attrib);
            if (substr($0,1,2) == "=\"") { # value is ="something"
                _xml_value = substr($0,3);
                sub(/".*$/,"",_xml_value);
                # sub() returns 0 when there is no closing quote in the record
                if (!sub(/^="[^"]*"/,"",$0))
                    _xml_panic("Unterminated attribute value for " _xml_attrib);
            } else if (substr($0,1,2) == "='") { # value is ='something'
                _xml_value = substr($0,3);
                sub(/'.*$/,"",_xml_value);
                if (!sub(/^='[^']*'/,"",$0))
                    _xml_panic("Unterminated attribute value for " _xml_attrib);
            } else {
                _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
            }
            XML_ATTR[_xml_attrib] = _xml_value;  # store attribute name/value
            sub(/^[ \t\n]*/,"",$0); # get rid of remaining leading spaces
        }
        return 1; # now return, XML_TYPE/TAG/ATTR/RPATH are set
    }
}

# _xml_panic(msg): report a fatal parsing error.
#
# Prints msg to "/dev/stderr" and terminates the script with exit status 1.
function _xml_panic (msg) {
    print msg > "/dev/stderr";
    exit 1;
}

# _xml_in_section(sec_begin, sec_end): skip an ignorable section.
#
#   sec_begin  regexp (as a string) the current record must start with
#   sec_end    regexp (as a string) the last record of the section must end
#              with; the trailing '>' is not part of any record
#
# Returns 0, leaving $0 untouched, when the current record does not start
# with sec_begin. Otherwise reads further records with getline until one
# ends with sec_end, then returns 1. If input runs out first, the script is
# terminated through _xml_panic().
function _xml_in_section (sec_begin, sec_end) {
    if (match($0, "^" sec_begin) == 0)
        return 0;
    for (;;) {
        # the section may already end in the record that started it
        if (match($0, sec_end "$") != 0)
            return 1;
        if (getline <= 0)
            _xml_panic("Unexpected EOF: " ERRNO);
    }
}

# _xml_enter(tag): record that element 'tag' has been entered by pushing
# "tag/" onto the front of the reversed path held in XML_RPATH.
function _xml_enter (tag) {
    XML_RPATH = sprintf("%s/%s", tag, XML_RPATH);
}

# _xml_exit(tag): record that element 'tag' has been closed.
#
# The first component of XML_RPATH (everything before the first "/") is the
# innermost open element. If it is not equal to 'tag', the script is
# terminated through _xml_panic(); otherwise that component and its "/" are
# removed from XML_RPATH.
#
# The check uses the 'tag' argument rather than the XML_TAG global, so the
# function validates whatever the caller asks it to close.
function _xml_exit (tag) {
    _xml_p = index(XML_RPATH, "/");
    # with an empty XML_RPATH, _xml_p is 0 and _xml_expected becomes ""
    _xml_expected = substr(XML_RPATH, 1, _xml_p-1);
    if (_xml_expected != tag)
        _xml_panic("Unexpected close tag: " tag ", expecting " _xml_expected);
    XML_RPATH = substr(XML_RPATH, _xml_p+1);
}

# ---------------------------- cut here ---------------------------

# USAGE:
#
# The functions provided here are used to extract the tags and attributes of a
# given XML file. They do not support extraction of data, CDATA, comments,
# processing instructions and declarations at all.
#
# You should use this from the BEGIN {} action of your awk script (it will
# not work from an END {} action).
#
# Call xml_event() in a while loop. This function returns 1 for each XML
# 'event' encountered, or 0 when the end of input is reached. Note that in
# case of malformed input, an error will be printed and the script will
# force an exit(1).
#
# After each successful xml_event() call, the following variables will be set:
#
#    XML_TYPE:  type of event: "BEGIN" means an opening tag, "END" a
#               closing one.
#
#    XML_TAG:   name of the tag, always in UPPERCASE!
#
#    XML_ATTR:  a map of attributes for the tag. Only set for "BEGIN" types.
#               All attribute names are in lowercase.
#
#               beware: values are *not* unescaped !
#
#    XML_RPATH: the _reversed_ element path, using "/" as a separator.
#               if you are within the <manifest><application> tag, then
#               it will be set to "APPLICATION/MANIFEST/"
#               (note the trailing slash).
#

# This is a simple example that dumps the output of the parsing.
#
# Example driver: print one line per XML event with its type, tag and
# reversed path, followed by the attributes when the event is a "BEGIN".
BEGIN {
    while (xml_event()) {
        printf "XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH;
        if (XML_TYPE != "BEGIN") {
            # closing events carry no attributes: finish the line right away
            printf "\n";
            continue;
        }
        for (attr in XML_ATTR)
            printf " %s='%s'", attr, XML_ATTR[attr];
        printf "\n";
    }
}

# IMPLEMENTATION DETAILS:
#
# 1. '>' as the record separator:
#
# RS is set to '>' to use this character as the record separator, instead of
# the default '\n'. This means that something like the following:
#
#   <foo><bar attrib="value">stuff</bar></foo>
#
# will be translated into the following successive 'records':
#
#  <foo
#  <bar attrib="value"
#  stuff</bar
#  </foo
#
# Note that the '>' is never part of the records and thus will not be matched.
# If the record does not contain a single '<', the input is either
# malformed XML, or we reached the end of file with data after the last
# '>'.
#
# Newlines in the original input are kept in the records as-is.
#
# 2. Getting rid of unwanted stuff:
#
# We don't need any of the data within elements, so we get rid of them by
# simply ignoring anything before the '<' in the current record. This is
# done with code like this:
#
#     p = index($0, "<");       # get index of '<'
#     if (p == 0) -> return 0;  # malformed input or end of file
#     $0 = substr($0, p);       # remove anything before the '<' in record
#
# We also want to ignore certain sections like CDATA, comments, declarations,
# etc.. These begin with a certain pattern and end with another one, e.g.
# "<!--" and "-->" for comments. This is handled by the _xml_in_section()
# function that accepts two patterns as input:
#
#    sec_begin: is the pattern for the start of the record.
#    sec_end:   is the pattern for the end of the record (minus trailing '>').
#
# The function deals with the fact that these sections can embed a valid '>'
# and will then span multiple records, i.e. something like:
#
#  <!-- A comment with an embedded > right here ! -->
#
# will be decomposed into two records:
#
#   "<!-- A comment with an embedded "
#   " right here ! --"
#
# The function deals with this case, and exits when such a section is not
# properly terminated in the input.
#
# _xml_in_section() returns 1 if an ignorable section was found, or 0 otherwise.
#
# 3. Extracting the tag name:
#
# </foo> is a closing tag, and <foo> an opening tag, this is handled
# by the following code:
#
#       if (substr($0, 1, 2) == "</") {
#           XML_TYPE = "END";
#           $0 = substr($0, 3);
#       } else {
#           XML_TYPE = "BEGIN";
#           $0 = substr($0, 2);
#       }
#
# which defines XML_TYPE, and removes the leading "</" or "<" from the record.
# The tag is later extracted and converted to uppercase with:
#
#       XML_TAG = $0                      # copy record
#       sub("[ \n\t/].*$", "", XML_TAG);  # remove anything after tag name
#       XML_TAG = toupper(XML_TAG);       # convert to uppercase
#       # validate tag
#       if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) -> panic
#
# Then the record is purged of the tag name and the spaces after it:
#
#       # get rid of tag and spaces after it in $0
#       sub("[^ \n\t]*[ \n\t]*", "", $0);
#
# 4. Maintaining XML_RPATH:
#
# The _xml_enter() and _xml_exit() functions are called to maintain the
# XML_RPATH variable when entering and exiting specific tags. _xml_exit()
# will also validate the input, checking proper tag enclosure (or exit(1)
# in case of error).
#
#       if (XML_TYPE == "BEGIN") {
#           _xml_enter(XML_TAG);
#       } else {
#           _xml_exit(XML_TAG);
#       }
#
# 5. Extracting attributes:
#
# A loop is implemented to parse attributes, the idea is to get the attribute
# name, which is always followed by a '=' character:
#
#           _xml_attrib = $0;              # copy record.
#           sub(/=.*$/,"",_xml_attrib);    # get rid of '=' and anything after.
#           sub(/^[^=]*/,"",$0);           # remove attribute name from $0
#           _xml_attrib = tolower(_xml_attrib);
#           if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )
#               _xml_panic("Invalid attribute name: " _xml_attrib);
#
# Now get the value, which is enclosed by either (") or (')
#
#          if (substr($0,1,2) == "=\"") {        # if $0 begins with ="
#               _xml_value = substr($0,3);       # extract value
#               sub(/".*$/,"",_xml_value);
#               sub(/^="[^"]*"/,"",$0);          # remove it from $0
#           } else if (substr($0,1,2) == "='") { # if $0 begins with ='
#               _xml_value = substr($0,3);       # extract value
#               sub(/'.*$/,"",_xml_value);
#               sub(/^='[^']*'/,"",$0);          # remove it from $0
#           } else {
#               -> panic (malformed input)
#           }
#
# After that, we simply store the value into the XML_ATTR associative
# array, and cleanup $0 from leading spaces:
#
#           XML_ATTR[_xml_attrib] = _xml_value;
#           sub(/^[ \t\n]*/,"",$0);
#
#
# 6. Handling direct tag closure:
#
# When a tag is closed directly (as in <foo/>), a single '/' will be
# parsed in the attribute parsing loop. We need to record this for the
# next call to xml_event(), since the current one should return a "BEGIN"
# for the "FOO" tag instead.
#
# We do this by setting the special _xml_closing variable, as in:
#
#          if ($0 == "/") {
#               # record a delayed tag closure for the next call
#               _xml_closing = XML_TAG;
#               break
#           }
#
# This variable is checked at the start of xml_event() like this:
#
#       # delayed tag closure - see below
#       if (_xml_closing) {
#           XML_TAG = _xml_closing;
#           XML_TYPE = "END";
#           _xml_closing = "";
#           _xml_exit(XML_TAG);
#           return 1;
#       }
#
# Note the call to _xml_exit() to update XML_RPATH here.
#