# dtdgport.awk
# Reads an XML document from standard input and
# prints a DTD for this document to standard output.
# http://saxon.sourceforge.net/dtdgen.html
# JK 2004-10-09
# JK 2006-03-05
# JK 2007-08-05 Converted for portability from the original dtd_generator.awk
#
# NOTE(review): the BEGIN block below parses the file named by ARGV[1]
# through getXMLEVENT(); it does not read standard input itself, so the
# "standard input" wording above does not match the visible code.
#
# NOTE(review): this copy of the file shows signs of extraction damage:
# several string literals that presumably contained "<...>" markup are now
# empty, and parts of getXMLEVENT() and the header of unescapeXML() are
# missing. Each affected spot is marked below; restore them from the
# upstream file before using this script.

# The program makes an internal list of all the elements
# and attributes that appear in your document, noting how
# they are nested, and noting which elements contain
# character data.
# (Only "STARTELEM" events are recorded by the code below; character data
# events are not inspected here.)
#
# Global tables built in BEGIN:
#   name[depth]         -- element name most recently seen at that depth
#   child[parent, kid]  -- how often kid was seen directly below parent
#   elem[element]       -- how often element was seen
#   attr[element, att]  -- how often att was seen on element
BEGIN {
  while (getXMLEVENT(ARGV[1])) {
    # Remember each element.
    if ( XMLEVENT == "STARTELEM" ) {
      # Remember the parent names of each child node.
      name[XMLDEPTH] = XMLSTARTELEM
      if (XMLDEPTH>1)
        child[name[XMLDEPTH-1], XMLSTARTELEM] ++
      # Count how often the element occurs.
      elem[XMLSTARTELEM] ++
      # Remember all the attributes with the element.
      for (a in XMLATTR)
        attr[XMLSTARTELEM,a] ++
    }
  }
}

END { print_elem(1, name[1]) }
# name[1] is the root

# Print one element (including sub-elements) but only once.
#
# Parameters:
#   depth   -- nesting level, used only to compute the indentation
#   element -- name of the element to print
# Locals (declared as extra parameters): c, atn, chl, n, i, myChildren
# Globals read:    child, attr, elem
# Globals written: already_printed, indent, a
#   (indent and a are not in the parameter list, so they are global)
function print_elem(depth, element, c, atn, chl, n, i, myChildren) {
  # The post-increment marks the element as printed; any later call for the
  # same element returns immediately.
  if (already_printed[element]++)
    return
  # Two spaces of indentation per nesting level below the root.
  indent=sprintf("%*s", 2*depth-2, "")
  # Collect the names of all direct children as "a | b | c".
  myChildren=""
  for (c in child) {
    split(c, chl, SUBSEP)
    if (element == chl[1]) {
      if (myChildren=="")
        myChildren = chl[2]
      else
        myChildren = myChildren " | " chl[2]
    }
  }
  # If an element has no child nodes, declare it as such.
  # NOTE(review): both branches now print only the indentation followed by
  # an empty string; presumably the literals held the element declaration
  # text and were stripped -- restore from upstream.
  if (myChildren=="")
    print indent ""
  else
    print indent ""
  # After the element name itself, list its attributes.
  for (a in attr) {
    split(a, atn, SUBSEP)
    # Treat only those attributes that belong to the current element.
    if (element == atn[1]) {
      # If an attribute occurred each time with its element, notice this.
      # NOTE(review): as above, both print literals are empty here;
      # presumably they held the attribute declaration text -- restore
      # from upstream.
      if (attr[element, atn[2]] == elem[element])
        print indent ""
      else
        print indent ""
    }
  }
  # Now go through the child nodes of this element and print them.
  # The "|" separators are turned into blanks so that split() with the
  # default field separator yields one child name per array slot.
  gsub(/[\|]/, " ", myChildren)
  n=split(myChildren, chl)
  for(i=1; i<=n; i++) {
    print_elem(depth+1, chl[i])
    # Re-populate chl after the recursive call.
    # NOTE(review): chl is declared in the parameter list, so this re-split
    # looks redundant -- confirm before removing.
    split(myChildren, chl)
  }
}

##
# getXMLEVENT( file ):
#    read next xml-data into XMLEVENT,XMLNAME,XMLATTR
#
#    referenced entities are not resolved
# Parameters:
#    file -- path to xml file
#    (the remaining names in the parameter list -- end,p,q,tag,att,accu,
#     mline,mode,S0,ex,dtd -- are local variables)
# External variables:
#    XMLEVENT -- type of item read, e.g. "STARTELEM"(tag), "ENDELEM"(end tag),
#                "COMMENT"(comment), "CHARDATA"(data)
#    XMLNAME  -- value of item, e.g. tagname if type is "STARTELEM" or "ENDELEM"
#    XMLATTR  -- Map of attributes, only set if XMLEVENT=="STARTELEM"
#    XMLPATH  -- Path to current tag, e.g. /TopLevelTag/SubTag1/SubTag2
#    XMLROW   -- current line number in input file
#    XMLERROR -- error text, set on parse error
#    XMLSTARTELEM, XMLENDELEM -- reset to "" on entry; XMLSTARTELEM is set to
#                the tag name together with XMLNAME on a "STARTELEM" event
#    XMLDEPTH -- loaded from _XMLIO[file,"depth"] and incremented on each
#                "STARTELEM" event
# Returns:
#    1 on successful read: XMLEVENT, XMLNAME, XMLATTR are set accordingly
#    "" at end of file or parse error, XMLERROR is set on error
# Private Data:
#    _XMLIO -- buffer, XMLROW, XMLPATH for open files
#
# NOTE(review): the body below is incomplete in this copy (see the marked
# gaps); the return statement and the code that stores state back into
# _XMLIO are not visible here.
##
function getXMLEVENT( file ,end,p,q,tag,att,accu,mline,mode,S0,ex,dtd) {
  # Reset the per-event outputs and empty the attribute map.
  XMLEVENT=XMLNAME=XMLERROR=XMLSTARTELEM=XMLENDELEM = ""
  split("", XMLATTR)
  # Load the state saved for this file: unparsed buffer, line number,
  # element path, depth and the dtd flag.
  S0 = _XMLIO[file,"S0"]
  XMLROW = _XMLIO[file,"line"];
  XMLPATH = _XMLIO[file,"path"];
  XMLDEPTH=_XMLIO[file,"depth"]+0;
  dtd = _XMLIO[file,"dtd"];
  # Run the state machine until an event has been produced; the loop is left
  # early with break at end of input or after XMLERROR has been set.
  while (!XMLEVENT) {
    # Buffer empty: fetch the next input line and re-append the record
    # separator so line ends stay visible to the parser.
    if (S0 == "") {
      if (1 != (getline S0 < file))
        break;
      XMLROW ++;
      S0 = S0 RS;
    }
    if (mode == "") {
      # No item in progress: decide from the head of the buffer what kind of
      # item starts here.
      mline = XMLROW
      accu=""
      p = substr(S0,1,1)
      if (p != "<" && !(dtd && p=="]"))
        mode="CHARDATA"
      else if (p == "]") {
        S0 = substr(S0,2)
        mode="ENDDOCT"
        end=">"
        dtd=0
      }
      # NOTE(review): the next four branches are damaged -- the compared
      # literal is empty and the branch bodies are missing. Restore from
      # upstream.
      else if ( substr(S0,1,4) == "" }
      else if ( substr(S0,1,9) == "" }
      else if (substr(S0,1,9) == "" }
      else if ( substr(S0,1,2) == "" }
      # NOTE(review): the compared literal is empty here as well; given the
      # "?>" terminator it presumably matched the opening of a processing
      # instruction -- confirm against upstream.
      else if (substr(S0,1,2) == "") {
        S0 = substr(S0,3)
        mode = "PROCINST"
        end = "?>"
      }
      # NOTE(review): empty literal again; presumably the two-character
      # opening of an end tag -- confirm against upstream.
      else if ( substr(S0,1,2)=="" ) {
        S0 = substr(S0,3)
        mode = "ENDELEM"
        end = ">";
        # The tag name is everything up to the first blank or ">".
        tag = S0
        sub(/[ \n\r\t>].*$/,"",tag)
        S0 = substr(S0,length(tag)+1)
        # Pop the last component off XMLPATH; ex receives the popped name,
        # i.e. the element that is expected to be closed here.
        ex = XMLPATH
        sub(/\/[^\/]*$/,"",XMLPATH)
        ex = substr(ex, length(XMLPATH)+2)
        if (tag != ex) {
          # NOTE(review): part of this message text looks stripped too.
          XMLERROR = "unexpected close tag <" ex ">.." tag ">"
          break
        }
      }
      else {
        # Anything else that begins with "<" is treated as a start tag.
        S0 = substr(S0,2)
        mode = "STARTELEM"
        # The tag name is everything up to the first blank, "/" or ">".
        tag = S0
        sub(/[ \n\r\t\/>].*$/,"",tag)
        S0 = substr(S0, length(tag)+1)
        if (tag !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
          XMLERROR = "invalid tag name '" tag "'"
          break
        }
        XMLPATH = XMLPATH "/" tag;
      }
    }
    else if (mode == "CHARDATA") { # terminated by "<" or EOF
      p = index(S0, "<")
      # NOTE(review): the source is cut off in the middle of the next
      # condition. Everything from here up to the self-closing check of the
      # "STARTELEM" mode is missing in this copy (the rest of the CHARDATA
      # handling and whatever other modes upstream handles in between).
      if (dtd && (q=index(S0,"]")) && (!p || q
" ) {
        # (inside the "STARTELEM" mode) Two characters are consumed and a
        # "STARTELEM" event is emitted.
        S0 = substr(S0, 3)
        mode = ""
        XMLEVENT = "STARTELEM"
        XMLNAME = XMLSTARTELEM = tag
        XMLDEPTH ++
        # NOTE(review): the first literal is empty; presumably text for a
        # matching end tag was pushed back onto the buffer here -- confirm
        # against upstream.
        S0 = "" tag ">" S0
      }
      else if (substr(S0, 1, 1) == ">" ) {
        # End of the start tag: emit the "STARTELEM" event.
        S0 = substr(S0, 2)
        mode = ""
        XMLEVENT = "STARTELEM"
        XMLNAME = XMLSTARTELEM = tag
        XMLDEPTH ++
      }
      else {
        # Otherwise an attribute name follows; it ends at "=", a blank,
        # "/" or ">".
        att = S0
        sub(/[= \n\r\t\/>].*$/,"",att)
        S0 = substr(S0, length(att) + 1)
        mode = "ATTR"
        if (att !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
          XMLERROR = "invalid attribute name '" att "'"
          break
        }
      }
    }
    else if (mode == "ATTR") {
      # After an attribute name: skip blanks, then expect "=".
      sub(/^[ \n\r\t]*/, "", S0)
      if (S0 == "")
        continue
      if (substr(S0,1,1) == "=" ) {
        S0 = substr(S0,2)
        mode = "EQ"
      }
      else {
        # An attribute without "=" gets its own name as its value.
        XMLATTR[att] = att
        mode = "STARTELEM"
      }
    }
    else if (mode == "EQ") {
      # After "=": skip blanks, then read a quoted or an unquoted value.
      sub(/^[ \n\r\t]*/,"",S0)
      if (S0 == "")
        continue
      end = substr(S0,1,1)
      if (end == "\"" || end == "'") {
        # Quoted value: remember the quote character in end as terminator.
        S0 = substr(S0,2)
        accu = ""
        mode = "VALUE"
      }
      else {
        # Unquoted value: runs up to the next blank, "/" or ">".
        accu = S0
        sub(/[ \n\r\t\/>].*$/,"", accu)
        S0 = substr(S0, length(accu)+1)
        XMLATTR[att] = unescapeXML(accu)
        mode = "STARTELEM"
      }
    }
    else if (mode == "VALUE") { # terminated by end
      if (p = index(S0, end)) {
        # Closing quote found: store the accumulated text plus this piece.
        XMLATTR[att] = accu unescapeXML(substr(S0,1,p-1))
        S0 = substr(S0, p+length(end))
        mode = "STARTELEM"
      }
      else {
        # No closing quote in the buffer: keep the text and read more input.
        accu = accu unescapeXML(S0)
        S0=""
      }
    }
    else if (mode == "STARTDOCT") { # terminated by "[" or ">"
      # NOTE(review): the source is cut off in the middle of the next
      # condition. The rest of getXMLEVENT() and the beginning of
      # unescapeXML() (its "function" header and first statements) are
      # missing in this copy; what follows is the tail of unescapeXML().
      # NOTE(review): as they stand, the two complete gsub() calls below
      # replace "<" with "<" and "&" with a literal "&"; presumably the
      # search patterns were entity references that got decoded during
      # extraction -- restore from upstream.
      if ((q = index(S0, "[")) && (!(p = index(S0,end)) || q", text )
  gsub( "<", "<", text )
  gsub( "&", "\\&", text)
  return text
}

# close xml file
# Closes the input file and deletes the _XMLIO entries kept for it, under
# the keys "S0", "line", "path", "depth", "dtd", "open" and "IND".
function closeXMLEVENT(file) {
  close(file);
  delete _XMLIO[file,"S0"]
  delete _XMLIO[file,"line"]
  delete _XMLIO[file,"path"];
  delete _XMLIO[file,"depth"];
  delete _XMLIO[file,"dtd"]
  delete _XMLIO[file,"open"]
  delete _XMLIO[file,"IND"]
}