""" printFunc.py: print method that can be called after populating DOM in order to see the XML objects in human readable format. Based on the original files from the pyXML 0.8.4 dom/ext module. @author: Urmi @version: $Id$ @copyright: 2004-2008 Nanorex, Inc. See LICENSE file for details. """ from xml.dom import XMLNS_NAMESPACE, XML_NAMESPACE, XHTML_NAMESPACE import string, re, sys from xml.dom import Node from Visitor import Visitor, WalkerInterface ILLEGAL_LOW_CHARS = '[\x01-\x08\x0B-\x0C\x0E-\x1F]' ILLEGAL_HIGH_CHARS = '\xEF\xBF[\xBE\xBF]' XML_ILLEGAL_CHAR_PATTERN = re.compile('%s|%s'%(ILLEGAL_LOW_CHARS, ILLEGAL_HIGH_CHARS)) def PrettyPrint(root, stream=sys.stdout, encoding='UTF-8', indent=' ', preserveElements=None): """ Prints the DOM object to stream """ if not hasattr(root, "nodeType"): return #from xml.dom.ext import Printer nss_hints = SeekNss(root) preserveElements = preserveElements or [] owner_doc = root.ownerDocument or root if hasattr(owner_doc, 'getElementsByName'): #We don't want to insert any whitespace into HTML inline elements #preserveElements = preserveElements + HTML_4_TRANSITIONAL_INLINE print "No provision for HTML Inline elements" visitor = PrintVisitor(stream, encoding, indent, preserveElements, nss_hints) PrintWalker(visitor, root).run() stream.write('\n') return def SeekNss(node, nss=None): """ traverses the tree to seek an approximate set of defined namespaces """ nss = nss or {} for child in node.childNodes: if child.nodeType == Node.ELEMENT_NODE: if child.namespaceURI: nss[child.prefix] = child.namespaceURI for attr in child.attributes.values(): if attr.namespaceURI == XMLNS_NAMESPACE: if attr.localName == 'xmlns': nss[None] = attr.value else: nss[attr.localName] = attr.value elif attr.namespaceURI: nss[attr.prefix] = attr.namespaceURI SeekNss(child, nss) return nss def utf8_to_code(text, encoding): # support for UTF-8 only encoding = string.upper(encoding) if encoding == 'UTF-8': return text def TranslateCdata(characters, encoding='UTF-8', prev_chars='', markupSafe=0, charsetHandler=utf8_to_code): """ charsetHandler is a function that takes a string or unicode object as the first argument, representing the string to be procesed, and an encoding specifier as the second argument. It must return a string or unicode object """ if not characters: return '' new_string = characters #Note: use decimal char entity rep because some browsers are broken #FIXME: This will bomb for high characters. Should, for instance, detect #The UTF-8 for 0xFFFE and put out ￾ if XML_ILLEGAL_CHAR_PATTERN.search(new_string): new_string = XML_ILLEGAL_CHAR_PATTERN.subn( lambda m: '&#%i;' % ord(m.group()), new_string)[0] new_string = charsetHandler(new_string, encoding) return new_string def TranslateCdataAttr(characters): """ Handles normalization and some intelligence about quoting """ if not characters: return '', "'" if "'" in characters: delimiter = '"' new_chars = re.sub('"', '"', characters) else: delimiter = "'" new_chars = re.sub("'", ''', characters) #FIXME: There's more to normalization #Convert attribute new-lines to character entity # characters is possibly shorter than new_chars (no entities) if "\n" in characters: new_chars = re.sub('\n', ' ', new_chars) return new_chars, delimiter class PrintVisitor(Visitor): def __init__(self, stream, encoding, indent='', plainElements=None, nsHints=None, isXhtml=0, force8bit=0): self.stream = stream self.encoding = encoding # Namespaces self._namespaces = [{}] self._nsHints = nsHints or {} # PrettyPrint self._indent = indent self._depth = 0 self._inText = 0 self._plainElements = plainElements or [] # HTML support self._html = None self._isXhtml = isXhtml self.force8bit = force8bit return def visit(self, node): if self._html is None: # Set HTMLDocument flag here for speed #print " leave html tag to be None" self._html = hasattr(node.ownerDocument, 'getElementsByName') nodeType = node.nodeType if node.nodeType == Node.ELEMENT_NODE: return self.visitElement(node) elif node.nodeType == Node.ATTRIBUTE_NODE: return self.visitAttr(node) elif node.nodeType == Node.TEXT_NODE: return self.visitText(node) elif node.nodeType == Node.DOCUMENT_NODE: return self.visitDocument(node) elif node.nodeType == Node.DOCUMENT_TYPE_NODE: return self.visitDocumentType(node) # It has a node type, but we don't know how to handle it raise Exception("Unknown node type: %s" % repr(node)) def _write(self, text): obj = utf8_to_code(text, self.encoding) self.stream.write(obj) return def _tryIndent(self): if not self._inText and self._indent: self._write('\n' + self._indent*self._depth) return def visitDocumentType(self, doctype): if not doctype.systemId and not doctype.publicId: return self._tryIndent() self._write(' | | | # [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] public = "'%s'" % doctype.publicId else: public = '"%s"' % doctype.publicId if doctype.publicId and doctype.systemId: self._write(' PUBLIC %s %s' % (public, system)) elif doctype.systemId: self._write(' SYSTEM %s' % system) if doctype.entities or doctype.notations: print "No support for entities" else: self._write('>') self._inText = 0 return def visitProlog(self): self._write("" % ( self.encoding or 'utf-8' )) self._inText = 0 return def visitNodeList(self, node, exclude=None): for curr in node: curr is not exclude and self.visit(curr) return def visitDocument(self, node): not self._html and self.visitProlog() node.doctype and self.visitDocumentType(node.doctype) self.visitNodeList(node.childNodes, exclude=node.doctype) return def visitText(self, node): text = node.data if self._indent: text = string.strip(text) and text if text: text = TranslateCdata(text, self.encoding) self.stream.write(text) self._inText = 1 return def visitAttr(self, node): if node.namespaceURI == XMLNS_NAMESPACE: # Skip namespace declarations return self._write(' ' + node.name) value = node.value if value or not self._html: text = TranslateCdata(value, self.encoding) text, delimiter = TranslateCdataAttr(text) self.stream.write("=%s%s%s" % (delimiter, text, delimiter)) return def GetAllNs(self,node): #The xml namespace is implicit nss = {'xml': XML_NAMESPACE} if node.nodeType == Node.ATTRIBUTE_NODE and node.ownerElement: return self.GetAllNs(node.ownerElement) if node.nodeType == Node.ELEMENT_NODE: if node.namespaceURI: nss[node.prefix] = node.namespaceURI for attr in node.attributes.values(): if attr.namespaceURI == XMLNS_NAMESPACE: if attr.localName == 'xmlns': nss[None] = attr.value else: nss[attr.localName] = attr.value elif attr.namespaceURI: nss[attr.prefix] = attr.namespaceURI if node.parentNode: #Inner NS/Prefix mappings take precedence over outer ones parent_nss = self.GetAllNs(node.parentNode) parent_nss.update(nss) nss = parent_nss return nss def visitElement(self, node): self._namespaces.append(self._namespaces[-1].copy()) inline = node.tagName in self._plainElements not inline and self._tryIndent() self._write('<%s' % node.tagName) if self._isXhtml or not self._html: namespaces = '' if self._isXhtml: nss = {'xml': XML_NAMESPACE, None: XHTML_NAMESPACE} else: nss = self.GetAllNs(node) if self._nsHints: self._nsHints.update(nss) nss = self._nsHints self._nsHints = {} del nss['xml'] for prefix in nss.keys(): if not self._namespaces[-1].has_key(prefix) or self._namespaces[-1][prefix] != nss[prefix]: nsuri, delimiter = TranslateCdataAttr(nss[prefix]) if prefix: xmlns = " xmlns:%s=%s%s%s" % (prefix, delimiter,nsuri,delimiter) namespaces = namespaces + xmlns else: xmlns = " xmlns=%s%s%s" % (delimiter,nsuri,delimiter) self._namespaces[-1][prefix] = nss[prefix] self._write(namespaces) for attr in node.attributes.values(): self.visitAttr(attr) if len(node.childNodes): self._write('>') self._depth = self._depth + 1 self.visitNodeList(node.childNodes) self._depth = self._depth - 1 if not self._html or (node.tagName not in HTML_FORBIDDEN_END): not (self._inText and inline) and self._tryIndent() self._write('' % node.tagName) elif not self._html: self._write('/>') else: self._write('>') del self._namespaces[-1] self._inText = 0 return class PrintWalker(WalkerInterface): def __init__(self, visitor, startNode): WalkerInterface.__init__(self, visitor) self.start_node = startNode return def step(self): """ There is really no step to printing. It prints the whole thing """ self.visitor.visit(self.start_node) return def run(self): return self.step()