#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
BSXPath.py: XPathEvaluator Extension for BeautifulSoup
"""
__version__ = '0.01e' # based on JavaScript-XPath 0.1.11 (c) 2007 Cybozu Labs, Inc. (http://coderepos.org/share/wiki/JavaScript-XPath)
__date__ = '2009-04-12'
__license__ = 'MIT-style license'
__author__ = 'furyu' # http://furyu.tea-nifty.com/annex/
                     # http://d.hatena.ne.jp/furyu-tei/

# NOTE(review): this copy of the file went through a lossy extraction: line
# breaks/indentation were collapsed and every span between a '<' and a later
# '>' was deleted.  The layout below is reconstructed from the syntax.  In the
# usage text that follows, the '<...>' placeholders and the HTML markup of the
# example were re-inserted from the surrounding descriptions -- confirm them
# against a pristine copy.  Code spans that could not be recovered are kept
# verbatim and flagged with NOTE(review) where they occur.
"""
Usage:
  from BSXPath import BSXPathEvaluator,XPathResult

  #*** PREPARATION (create object)
  document = BSXPathEvaluator(<html>) # BSXPathEvaluator is sub-class of BeautifulSoup
    # html: HTML (text string)

  #*** BASIC OPERATIONS
  result = document.evaluate(<expression>,<node>,None,<type>,None)
    # expression: XPath expression
    # node      : base context-node(document is document-root)
    # type      : XPathResult.<name>
    # name      : ANY_TYPE, NUMBER_TYPE, STRING_TYPE, BOOLEAN_TYPE, UNORDERED_NODE_ITERATOR_TYPE, ORDERED_NODE_ITERATOR_TYPE
    #             UNORDERED_NODE_SNAPSHOT_TYPE, ORDERED_NODE_SNAPSHOT_TYPE, ANY_UNORDERED_NODE_TYPE, FIRST_ORDERED_NODE_TYPE
    # (*) 3rd(resolver) and 5th(result) arguments are not implemented
  length = result.snapshotLength
  node = result.snapshotItem(<number>)

  #*** USEFUL WRAPPER-FUNCTIONS
  nodes = document.getItemList(<expression>[,<node>])
  first = document.getFirstItem(<expression>[,<node>])
    # expression: XPath expression
    # node(optional): base context-node(default: document(document-root))

Examples:
  from BSXPath import BSXPathEvaluator,XPathResult

  html = '<html><head><title>Hello, DOM 3 XPath!</title></head><body><h1>Hello, DOM 3 XPath!</h1><p>This is XPathEvaluator Extension for BeautifulSoup.</p><p>This is based on JavaScript-XPath!</p></body></html>'
  document = BSXPathEvaluator(html)

  result = document.evaluate('//h1/text()[1]',document,None,XPathResult.STRING_TYPE,None)
  print result.stringValue
  # Hello, DOM 3 XPath!

  result = document.evaluate('//h1',document,None,XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,None)
  print result.snapshotLength
  # 1
  print result.snapshotItem(0)
  # <h1>Hello, DOM 3 XPath!</h1>

  nodes = document.getItemList('//p')
  print len(nodes)
  # 2
  print nodes
  # [<p>This is XPathEvaluator Extension for BeautifulSoup.</p>, <p>This is based on JavaScript-XPath!</p>]

  first = document.getFirstItem('//p')
  print first
  # <p>This is XPathEvaluator Extension for BeautifulSoup.</p>

Notice:
  - This is based on JavaScript-XPath (c) 2007 Cybozu Labs, Inc. (http://coderepos.org/share/wiki/JavaScript-XPath)
  - Required:
     - Python 2.5+
     - BeautifulSoup 3.0.7+(recommended) or 3.1.0+
"""
import re,types,math,datetime
#import logging
from BeautifulSoup import *

# Provide a fallback when the star-import above did not bring in
# DEFAULT_OUTPUT_ENCODING (or brought in a falsy one -> left unchanged).
try:
    if DEFAULT_OUTPUT_ENCODING:
        pass
except:
    DEFAULT_OUTPUT_ENCODING='utf-8'


#***** Optional Parameters
USE_NODE_CACHE=True  # cache descendant look-ups on the node (see _getDescendantNodes)
USE_NODE_INDEX=True  # order nodes by their _sortindex attribute instead of walking the tree


#***** General Functions
def throwError(str):
    """Raise ValueError carrying the message *str*."""
    raise ValueError, str


def typeof(obj):
    """Return a JavaScript-like type name for *obj*.

    One of 'boolean', 'number', 'string', 'function' or 'object'.
    """
    # bool is tested first because bool is a subclass of int.
    if isinstance(obj,bool):
        return 'boolean'
    if isinstance(obj,int) or isinstance(obj,float):
        return 'number'
    if isinstance(obj,basestring):
        return 'string'
    if isinstance(obj,types.FunctionType):
        return 'function'
    return 'object'


def isNaN(obj):
    """Return False if *obj* is an int/float or a string float() can parse, else True."""
    if isinstance(obj,int) or isinstance(obj,float):
        return False
    if not isinstance(obj,basestring):
        return True
    if obj.isdigit():
        return False
    try:
        float(obj)
        return False
    except:
        return True


def toNumber(obj):
    """Convert *obj* to a number where possible.

    ints/floats are returned as is; digit-only strings become int; other
    strings become float when float() accepts them.  Anything that cannot be
    converted is returned unchanged.
    """
    if isinstance(obj,int) or isinstance(obj,float):
        return obj
    if isinstance(obj,basestring):
        if obj.isdigit():
            return int(obj)
        try:
            return float(obj)
        except:
            return obj
    return obj


def toBoolean(obj):
    """Return the Python truth value of *obj*."""
    return bool(obj)


def toString(obj):
    """Convert *obj* to unicode text.

    Booleans become u'true'/u'false'; str, int and float go through
    unicode(); every other object is returned unchanged.
    """
    if isinstance(obj,bool):
        return u'true' if obj else u'false'
    if isinstance(obj,str) or isinstance(obj,int) or isinstance(obj,float):
        return unicode(obj)
    return obj


#***** General Classes
class ExtDict(dict):
    """dict whose keys can also be read as attributes (d.key)."""

    def __getattr__(self,name):
        # Called only when normal attribute lookup failed.  The super() call
        # is tried first; when it raises, fall back to a key lookup and raise
        # AttributeError if the key is missing too.
        try:
            attr=super(ExtDict,self).__getattr__(name)
        except:
            if not self.has_key(name):
                raise AttributeError,name
            attr=self.get(name)
        return attr


#***** Common Definitions
# Added to the indentation at each nesting level by the show() methods.
# NOTE(review): the extraction collapsed runs of whitespace, so this literal
# may originally have held more than one space -- confirm.
indent_space=' '

#{ // Regular Expressions
re_has_ualpha=re.compile(r'(?![0-9])[\w]')
re_seqspace=re.compile(r'\s+')
re_firstspace=re.compile(r'^\s')
re_lastspace=re.compile(r'\s$')
#} // end of Regular Expressions

#{ // NodeTypeDOM
# Numeric node-type codes, addressed by their DOM-style names.
NodeTypeDOM=ExtDict({
  'ANY_NODE' :0
, 'ELEMENT_NODE' :1
, 'ATTRIBUTE_NODE' :2
, 'TEXT_NODE' :3
, 'CDATA_SECTION_NODE' :4
, 'ENTITY_REFERENCE_NODE' :5
, 'ENTITY_NODE' :6
, 'PROCESSING_INSTRUCTION_NODE':7
, 'COMMENT_NODE' :8
, 'DOCUMENT_NODE' :9
, 'DOCUMENT_TYPE_NODE' :10
, 'DOCUMENT_FRAGMENT_NODE' :11
, 'NOTATION_NODE' :12
})

# Class name of a parsed node -> node-type code.  'Declaration' maps to
# ANY_NODE and is refined from the declaration text in _nodeType().
NodeTypeBS=ExtDict({
  'BSXPathEvaluator' :NodeTypeDOM.DOCUMENT_NODE
, 'NavigableString' :NodeTypeDOM.TEXT_NODE
, 'CData' :NodeTypeDOM.CDATA_SECTION_NODE
, 'ProcessingInstruction':NodeTypeDOM.PROCESSING_INSTRUCTION_NODE
, 'Comment' :NodeTypeDOM.COMMENT_NODE
, 'Declaration' :NodeTypeDOM.ANY_NODE
, 'Tag' :NodeTypeDOM.ELEMENT_NODE
})
#} // end of NodeTypeDOM


#{ // NodeUtil
def makeNodeUtils():
    """Build and return the pair (NodeUtilBS, NodeUtil) of helper tables.

    Both are ExtDict instances mapping helper names to functions; the
    functions are closures over the regular expressions compiled here.
    Calling makeNU_BS() also patches the BeautifulSoup classes (see below).
    """
    re_type_document_type=re.compile(r'^DOCTYPE\s')
    re_type_entity =re.compile(r'^ENTITY\s')
    re_type_notation =re.compile(r'^NOTATION\s')
    #re_processing_instruction=re.compile(r'^(.*?)\s+(.*?)\?*$')
    re_processing_instruction=re.compile(r'^(.*?)(\s+.*?)\?*$')
    re_declaration_name=re.compile(r'^([^\s]+)\s+([\%]?)\s*([^\s]+)\s')

    def makeNU_BS():
        """Create the node accessors and install them on the BeautifulSoup classes."""

        def _nodeType(node):
            """Return the node-type code of *node*."""
            # Attribute wrappers carry their own nodeType.
            if getattr(node,'nodeType',None)==NodeTypeDOM.ATTRIBUTE_NODE:
                return node.nodeType
            nodeType=NodeTypeBS.get(node.__class__.__name__)
            if nodeType==NodeTypeDOM.ANY_NODE:
                # Declaration: decide by the leading keyword of its text.
                str=NavigableString.encode(node,DEFAULT_OUTPUT_ENCODING)
                if re_type_document_type.search(str):
                    nodeType=NodeTypeDOM.DOCUMENT_TYPE_NODE
                elif re_type_entity.search(str):
                    nodeType=NodeTypeDOM.ENTITY_NODE
                elif re_type_notation.search(str):
                    nodeType=NodeTypeDOM.NOTATION_NODE
            return nodeType

        def _nodeName(node):
            """Return the DOM-style node name of *node*."""
            if getattr(node,'nodeType',None)==NodeTypeDOM.ATTRIBUTE_NODE:
                return node.nodeName.lower()
            nodeType=_nodeType(node)
            if nodeType==NodeTypeDOM.DOCUMENT_NODE:
                return '#document'
            elif nodeType==NodeTypeDOM.TEXT_NODE:
                return '#text'
            elif nodeType==NodeTypeDOM.CDATA_SECTION_NODE:
                return '#cdata-section'
            elif nodeType==NodeTypeDOM.PROCESSING_INSTRUCTION_NODE:
                # Group 1 is the text before the first whitespace run.
                mrslt=re_processing_instruction.search(NavigableString.encode(node,DEFAULT_OUTPUT_ENCODING))
                if mrslt:
                    return mrslt.group(1)
                else:
                    return NavigableString.encode(node,DEFAULT_OUTPUT_ENCODING)
            elif nodeType==NodeTypeDOM.COMMENT_NODE:
                return '#comment'
            elif nodeType==NodeTypeDOM.DOCUMENT_TYPE_NODE or nodeType==NodeTypeDOM.ENTITY_NODE or nodeType==NodeTypeDOM.NOTATION_NODE:
                mrslt=re_declaration_name.search(NavigableString.encode(node,DEFAULT_OUTPUT_ENCODING))
                if mrslt:
                    # NOTE(review): group(2) is the optional '%' of the
                    # pattern; the name token is group(3) -- confirm intent.
                    return mrslt.group(2)
                else:
                    return NavigableString.encode(node,DEFAULT_OUTPUT_ENCODING)
            else:
                return node.name.lower()

        def _nodeValue(node):
            """Return the node value: encoded text for text/CDATA/comment nodes,
            the part after the target for processing instructions, the stored
            value for attribute wrappers, otherwise None."""
            if getattr(node,'nodeType',None)==NodeTypeDOM.ATTRIBUTE_NODE:
                return node.nodeValue
            nodeType=_nodeType(node)
            if nodeType==NodeTypeDOM.CDATA_SECTION_NODE or \
               nodeType==NodeTypeDOM.COMMENT_NODE or \
               nodeType==NodeTypeDOM.TEXT_NODE:
                return NavigableString.encode(node, DEFAULT_OUTPUT_ENCODING)
            if nodeType==NodeTypeDOM.PROCESSING_INSTRUCTION_NODE:
                mrslt=re_processing_instruction.search(NavigableString.encode(node,DEFAULT_OUTPUT_ENCODING))
                if mrslt:
                    return mrslt.group(2)
                else:
                    return None
            return None

        def _nodeAttrValue(node,attrName):
            """Return attribute *attrName* of an element node; None for any other node."""
            if getattr(node,'nodeType',None)==NodeTypeDOM.ATTRIBUTE_NODE:
                return None
            nodeType=_nodeType(node)
            if nodeType!=NodeTypeDOM.ELEMENT_NODE:
                return None
            return node.get(attrName)

        def _parentNode(node):
            """Return the parent (parentNode for attribute wrappers, .parent otherwise)."""
            if getattr(node,'nodeType',None)==NodeTypeDOM.ATTRIBUTE_NODE:
                return node.parentNode
            return node.parent

        def _ownerDocument(node):
            """Return the topmost ancestor of *node*, caching it on node._owner."""
            owner=getattr(node,'_owner',None)
            if owner:
                return owner
            if getattr(node,'nodeType',None)==NodeTypeDOM.ATTRIBUTE_NODE:
                owner=node.parentNode
            else:
                owner=node
            # Climb until a node without a parent is reached.
            while True:
                parent=owner.parent
                if not parent:
                    break
                owner=parent
            # Caching is best effort: failures to set the attribute are ignored.
            try:
                node._owner=owner
            except:
                pass
            return owner

        def pairwise(iterable):
            """Yield consecutive, non-overlapping pairs (a, b), (c, d), ... of *iterable*."""
            # The loop has no exit of its own; it ends when next() raises
            # StopIteration.
            itnext = iter(iterable).next
            while True:
                yield itnext(), itnext()

        def _attributes(node):
            """Return the attribute map of an element node (built lazily), else None."""
            if _nodeType(node)==NodeTypeDOM.ELEMENT_NODE:
                #return node._getAttrMap()
                # NOTE(review): pairwise() pairs up *consecutive items* of
                # node.attrs; this presumably expects a flat
                # name, value, name, value, ... sequence -- verify against
                # the shape of Tag.attrs in the BeautifulSoup version in use.
                if not getattr(node,'attrMap'):
                    node.attrMap=dict(pairwise(node.attrs))
                return node.attrMap
            else:
                return None

        def _contains(node,cnode):
            """Return True if *node* is among the parents of *cnode*.

            Attribute wrappers are replaced by their parent node first.
            """
            if _nodeType(node)==NodeTypeDOM.ATTRIBUTE_NODE:
                node=node.parentNode
            if _nodeType(cnode)==NodeTypeDOM.ATTRIBUTE_NODE:
                cnode=cnode.parentNode
            return node in cnode.findParents()

        def _preceding(node,cnode):
            """Return True if *cnode* is among the previous siblings of *node*."""
            if _nodeType(node)==NodeTypeDOM.ATTRIBUTE_NODE:
                node=node.parentNode
            if _nodeType(cnode)==NodeTypeDOM.ATTRIBUTE_NODE:
                cnode=cnode.parentNode
            #return cnode in node.findAllPrevious()
            return cnode in node.findPreviousSiblings()

        def _following(node,cnode):
            """Return True if *cnode* is among the next siblings of *node*."""
            if _nodeType(node)==NodeTypeDOM.ATTRIBUTE_NODE:
                node=node.parentNode
            if _nodeType(cnode)==NodeTypeDOM.ATTRIBUTE_NODE:
                cnode=cnode.parentNode
            #return cnode in node.findAllNext()
            return cnode in node.findNextSiblings()

        def d_getattr(self,name):
            """Default __getattr__: always raise AttributeError(name)."""
            raise AttributeError,name

        #{ // ExtPageElement
        class ExtPageElement:
            """Mix-in appended to the bases of the BeautifulSoup node classes.

            Supplies DOM-style attributes (nodeType, nodeName, nodeValue,
            parentNode, ownerDocument, attributes) and the get/contains/
            preceding/following helpers.
            """
            def __getattr__(self,name):
                if name=='nodeType':
                    return _nodeType(self)
                if name=='nodeName':
                    return _nodeName(self)
                if name=='nodeValue':
                    return _nodeValue(self)
                if name=='parentNode':
                    return _parentNode(self)
                if name=='ownerDocument':
                    return _ownerDocument(self)
                if name=='attributes':
                    return _attributes(self)
                if name=='get':
                    return self.get
                if name=='contains':
                    return self.contains
                if name=='preceding':
                    return self.preceding
                if name=='following':
                    return self.following
                # Unknown name: raises AttributeError.
                d_getattr(self,name)

            def get(self,key,default=None):
                # *default* is accepted but not used.
                return _nodeAttrValue(self,key)

            def contains(self,cnode):
                return _contains(self,cnode)

            def preceding(self,cnode):
                return _preceding(self,cnode)

            def following(self,cnode):
                return _following(self,cnode)

        # Append the mix-in to the base classes of every node class.
        PageElement.__bases__+=(ExtPageElement,)
        BeautifulSoup.__bases__+=(ExtPageElement,)
        NavigableString.__bases__+=(ExtPageElement,)
        CData.__bases__+=(ExtPageElement,)
        ProcessingInstruction.__bases__+=(ExtPageElement,)
        Comment.__bases__+=(ExtPageElement,)
        Declaration.__bases__+=(ExtPageElement,)
        Tag.__bases__+=(ExtPageElement,)
        #} // ExtPageElement

        # Each _extXxx() below replaces Xxx.__getattr__ with a wrapper that
        # answers the DOM-style names itself and delegates every other name
        # to whatever __getattr__ the class resolved to before (d_getattr
        # when there was none).

        #{ // _extBeautifulSoup
        def _extBeautifulSoup():
            o_getattr=getattr(BeautifulSoup,'__getattr__',d_getattr)
            def e_getattr(self,name):
                if name=='nodeType':
                    return NodeTypeDOM.DOCUMENT_NODE
                if name=='nodeName':
                    return '#document'
                if name=='nodeValue':
                    return None
                if name=='parentNode':
                    return None
                if name=='ownerDocument':
                    return None
                if name=='attributes':
                    return None
                return o_getattr(self,name)
            BeautifulSoup.__getattr__=e_getattr
        _extBeautifulSoup()
        #} // _extBeautifulSoup

        #{ // _extNavigableString
        def _extNavigableString():
            o_getattr=getattr(NavigableString,'__getattr__',d_getattr)
            def e_getattr(self,name):
                if name=='nodeType':
                    return NodeTypeDOM.TEXT_NODE
                if name=='nodeName':
                    return '#text'
                if name=='nodeValue':
                    return NavigableString.encode(self,DEFAULT_OUTPUT_ENCODING)
                if name=='parentNode':
                    return self.parent
                if name=='ownerDocument':
                    return _ownerDocument(self)
                if name=='attributes':
                    return None
                return o_getattr(self,name)
            NavigableString.__getattr__=e_getattr
        _extNavigableString()
        #} // _extNavigableString

        #{ // _extCData
        def _extCData():
            o_getattr=getattr(CData,'__getattr__',d_getattr)
            def e_getattr(self,name):
                if name=='nodeType':
                    return NodeTypeDOM.CDATA_SECTION_NODE
                if name=='nodeName':
                    return '#cdata-section'
                if name=='nodeValue':
                    return NavigableString.encode(self,DEFAULT_OUTPUT_ENCODING)
                if name=='parentNode':
                    return self.parent
                if name=='ownerDocument':
                    return _ownerDocument(self)
                if name=='attributes':
                    return None
                return o_getattr(self,name)
            CData.__getattr__=e_getattr
        _extCData()
        #} // _extCData

        #{ // _extProcessingInstruction
        def _extProcessingInstruction():
            o_getattr=getattr(ProcessingInstruction,'__getattr__',d_getattr)
            def e_getattr(self,name):
                if name=='nodeType':
                    return NodeTypeDOM.PROCESSING_INSTRUCTION_NODE
                if name=='nodeName':
                    mrslt=re_processing_instruction.search(NavigableString.encode(self,DEFAULT_OUTPUT_ENCODING))
                    return mrslt.group(1) if mrslt else NavigableString.encode(self,DEFAULT_OUTPUT_ENCODING)
                if name=='nodeValue':
                    mrslt=re_processing_instruction.search(NavigableString.encode(self,DEFAULT_OUTPUT_ENCODING))
                    return mrslt.group(2) if mrslt else None
                if name=='parentNode':
                    return self.parent
                if name=='ownerDocument':
                    return _ownerDocument(self)
                if name=='attributes':
                    return None
                return o_getattr(self,name)
            ProcessingInstruction.__getattr__=e_getattr
        _extProcessingInstruction()
        #} // _extProcessingInstruction

        #{ // _extComment
        def _extComment():
            o_getattr=getattr(Comment,'__getattr__',d_getattr)
            def e_getattr(self,name):
                if name=='nodeType':
                    return NodeTypeDOM.COMMENT_NODE
                if name=='nodeName':
                    return '#comment'
                if name=='nodeValue':
                    return NavigableString.encode(self, DEFAULT_OUTPUT_ENCODING)
                if name=='parentNode':
                    return self.parent
                if name=='ownerDocument':
                    return _ownerDocument(self)
                if name=='attributes':
                    return None
                return o_getattr(self,name)
            Comment.__getattr__=e_getattr
        _extComment()
        #} // _extComment

        #{ // _extDeclaration
        def _extDeclaration():
            o_getattr=getattr(Declaration,'__getattr__',d_getattr)
            def e_getattr(self,name):
                if name=='nodeType':
                    # Same keyword test as _nodeType() uses for declarations.
                    str=NavigableString.encode(self,DEFAULT_OUTPUT_ENCODING)
                    if re_type_document_type.search(str):
                        return NodeTypeDOM.DOCUMENT_TYPE_NODE
                    elif re_type_entity.search(str):
                        return NodeTypeDOM.ENTITY_NODE
                    elif re_type_notation.search(str):
                        return NodeTypeDOM.NOTATION_NODE
                    else:
                        return NodeTypeDOM.ANY_NODE
                if name=='nodeName':
                    # NOTE(review): group(2) is the optional '%' group, as in
                    # _nodeName() -- confirm intent.
                    mrslt=re_declaration_name.search(NavigableString.encode(self,DEFAULT_OUTPUT_ENCODING))
                    return mrslt.group(2) if mrslt else NavigableString.encode(self,DEFAULT_OUTPUT_ENCODING)
                if name=='nodeValue':
                    return None
                if name=='parentNode':
                    return self.parent
                if name=='ownerDocument':
                    return _ownerDocument(self)
                if name=='attributes':
                    return None
                return o_getattr(self,name)
            Declaration.__getattr__=e_getattr
        _extDeclaration()
        #} // _extDeclaration

        #{ // _extTag
        def _extTag():
            o_getattr=getattr(Tag,'__getattr__',d_getattr)
            def e_getattr(self,name):
                if name=='nodeType':
                    return NodeTypeDOM.ELEMENT_NODE
                if name=='nodeName':
                    return self.name.lower()
                if name=='nodeValue':
                    return None
                if name=='parentNode':
                    return self.parent
                if name=='ownerDocument':
                    return _ownerDocument(self)
                if name=='attributes':
                    return self._getAttrMap()
                return o_getattr(self,name)
            Tag.__getattr__=e_getattr
        _extTag()
        #} // _extTag

        def _it_deepNodes(node):
            """Yield every descendant of *node*, each child before its own descendants."""
            # Nodes without a 'contents' attribute yield nothing.  The loop
            # ends when next() raises StopIteration.
            child_next=iter(getattr(node,'contents',[])).next
            while True:
                child=child_next()
                yield child
                for gchild in _it_deepNodes(child):
                    yield gchild

        return ExtDict({
          'nodeType' :_nodeType
        , 'nodeName' :_nodeName
        , 'nodeValue' :_nodeValue
        , 'nodeAttrValue':_nodeAttrValue
        , 'parentNode' :_parentNode
        , 'ownerDocument':_ownerDocument
        , 'attributes' :_attributes
        , 'contains' :_contains
        , 'preceding' :_preceding
        , 'following' :_following
        , 'it_deepNodes' :_it_deepNodes
        })
        # Unreachable: kept from the original source.
        return

    def makeNU():
        """Create the XPath-level helpers (value conversion and node collection)."""

        def _to(valueType,node):
            """Return the string-value of *node* converted to *valueType*.

            *node* may already be a string.  For attribute wrappers the value
            is nodeValue; for any other node it is the concatenation of all
            descendant text nodes.  *valueType* 'number' and 'boolean' apply
            toNumber()/toBoolean(); any other value returns the string.
            """
            if typeof(node)=='string':
                result=node
            else:
                nodeType=node.nodeType
                if nodeType==NodeTypeDOM.ATTRIBUTE_NODE:
                    result=node.nodeValue
                else:
                    strings=[]
                    for _node in NodeUtilBS.it_deepNodes(node):
                        if _node.nodeType==NodeTypeDOM.TEXT_NODE:
                            strings.append(unicode(_node))
                    result=''.join(strings)
            if valueType=='number':
                return toNumber(result)
            elif valueType=='boolean':
                return toBoolean(result)
            else:
                return result

        def _attrMatch(node,attrName,attrValue):
            """Return True if *node* passes the optional attribute filter.

            No attrName: always True.  attrName without attrValue: the
            attribute must be present with a truthy value.  Both given: the
            attribute must equal attrValue.
            """
            if not attrName or \
               not attrValue and node.get(attrName) or \
               (attrValue and node.get(attrName)==attrValue):
                return True
            else:
                return False

        def _getDescendantNodes(test,node,nodeset,attrName,attrValue,prevNodeset,prevIndex):
            """Push the descendants of *node* accepted by *test* (and by the
            optional attribute filter) onto *nodeset* and return *nodeset*.

            With USE_NODE_CACHE the candidate lists are memoised on
            node._cachemap, keyed by attribute name/value, by 'all', or by
            tag name.
            """
            if prevNodeset:
                prevNodeset.delDescendant(node,prevIndex)
            if USE_NODE_CACHE:
                _cachemap=getattr(node,'_cachemap',None)
                if not _cachemap:
                    _cachemap=node._cachemap=ExtDict({'attrib':ExtDict({}),'all':None,'tag':ExtDict({})})
                if attrValue and attrName:
                    # Candidates: elements whose attribute equals attrValue.
                    _cm=_cachemap.attrib
                    _anmap=_cm.get(attrName)
                    if not _anmap:
                        _anmap=_cm[attrName]=ExtDict({})
                    nodes=_anmap.get(attrValue)
                    if not nodes:
                        nodes=_anmap[attrValue]=[]
                        if getattr(node,'findAll',None):
                            nodes.extend(node.findAll(attrs={attrName:attrValue}))
                    for elm in nodes:
                        if test.match(elm):
                            nodeset.push(elm)
                elif getattr(test,'notOnlyElement',None):
                    # The test may accept non-element nodes: scan everything.
                    nodes=_cachemap.all
                    if not nodes:
                        nodes=_cachemap.all=[]
                        for elm in NodeUtilBS.it_deepNodes(node):
                            nodes.append(elm)
                    for elm in nodes:
                        if NodeUtil.attrMatch(elm,attrName,attrValue) and test.match(elm):
                            nodeset.push(elm)
                else:
                    nodeType=node.nodeType
                    if nodeType==NodeTypeDOM.ELEMENT_NODE or nodeType==NodeTypeDOM.DOCUMENT_NODE:
                        # Element-only test: look up by tag name ('*' = all).
                        _cm=_cachemap.tag
                        name=getattr(test,'name',None)
                        if not name or name=='*':
                            nodes=_cm.get('*')
                            if not nodes:
                                nodes=_cm['*']=node.findAll()
                        else:
                            nodes=_cm.get(name)
                            if not nodes:
                                nodes=_cm[name]=node.findAll([name])
                        for elm in nodes:
                            if NodeUtil.attrMatch(elm,attrName,attrValue):
                                nodeset.push(elm)
            else: # USE_NODE_CACHE is False
                if attrValue and attrName:
                    if getattr(node,'findAll',None):
                        for elm in node.findAll(attrs={attrName:attrValue}):
                            if test.match(elm):
                                nodeset.push(elm)
                elif getattr(test,'notOnlyElement',None):
                    for elm in NodeUtilBS.it_deepNodes(node):
                        if NodeUtil.attrMatch(elm,attrName,attrValue) and test.match(elm):
                            nodeset.push(elm)
                else:
                    nodeType=node.nodeType
                    if nodeType==NodeTypeDOM.ELEMENT_NODE or nodeType==NodeTypeDOM.DOCUMENT_NODE:
                        name=getattr(test,'name',None)
                        if not name or name=='*':
                            nodes=node.findAll()
                        else:
                            nodes=node.findAll([name])
                        for elm in nodes:
                            if NodeUtil.attrMatch(elm,attrName,attrValue):
                                nodeset.push(elm)
            return nodeset

        def _getChildNodes(test,node,nodeset,attrName,attrValue,prevNodeset,prevIndex):
            """Push the direct children of *node* accepted by the attribute
            filter and by *test* onto *nodeset*; return *nodeset*.

            prevNodeset and prevIndex are accepted but not used here.
            """
            contents=getattr(node,'contents',[])
            for elm in contents:
                if NodeUtil.attrMatch(elm,attrName,attrValue) and test.match(elm):
                    nodeset.push(elm)
            return nodeset

        return ExtDict({
          'to' :_to
        , 'attrMatch' :_attrMatch
        , 'getDescendantNodes':_getDescendantNodes
        , 'getChildNodes' :_getChildNodes
        })

    return (makeNU_BS(),makeNU())

(NodeUtilBS,NodeUtil)=makeNodeUtils()
#} // end of NodeUtil


#***** Application Classes
#{ // Lexer
class Lexer(object):
    """Splits an XPath expression into tokens and keeps a cursor over them."""

    def __init__(self,source):
        tokens=self.tokens=[]
        # re_token.sub() is used only to visit every match; matches that
        # begin with whitespace (re_strip) are not recorded as tokens.
        def anlz_token(mrslt):
            token=mrslt.group()
            if not self.re_strip.search(token):
                tokens.append(token)
            return token
        self.re_token.sub(anlz_token,source,count=0)
        self.index=0

    # NOTE(review): SOURCE IS CORRUPTED HERE.  The text between a '<' in
    # peek() and a '>' inside the re_token pattern was deleted, taking the
    # rest of peek(), any further Lexer methods, and the head of the
    # "re_token=re.compile(r'...'" assignment with it.  (Other code in this
    # file calls lexer.next(), lexer.back() and lexer.empty().)  The
    # surviving text is kept verbatim; it is not valid Python and must be
    # restored from a pristine copy.
    def peek(self,i=0):
        token=self.tokens[self.index+i] if self.index+i]=|(?![0-9-])[\w-]+:\*|\s+|.')
    re_strip=re.compile(r'^\s')
#} // end of Lexer


#{ // Ctx
class Ctx(object):
    """Evaluation context: the context node, its position and the context size."""
    def __init__(self,node,position=1,last=1):
        self.node=node
        self.position=position
        self.last=last
#} // end of Ctx


#{ // AttributeWrapper
class AttributeWrapper(object):
    """Represents one attribute of an element as an ATTRIBUTE_NODE node."""

    def __init__(self,name,value,parent):
        self.nodeType=NodeTypeDOM.ATTRIBUTE_NODE
        self.nodeName=name
        self.nodeValue=value
        self.parentNode=parent
        self.ownerElement=parent

    def get(self,key,default=None):
        # Attribute nodes have no attributes of their own.
        return None

    def contains(self,cnode):
        return NodeUtilBS.contains(self,cnode)

    def preceding(self,cnode):
        return NodeUtilBS.preceding(self,cnode)

    def following(self,cnode):
        return NodeUtilBS.following(self,cnode)

    def __str__(self,encoding=DEFAULT_OUTPUT_ENCODING):
        if encoding:
            return self.nodeValue.encode(encoding)
        else:
            return self.nodeValue

    def __unicode__(self):
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    @classmethod
    def getAttributeWrapper(cls,name,value,parent):
        """Return the wrapper for attribute *name* of *parent*.

        Wrappers are cached per parent in parent._mapattr, so repeated calls
        with the same name return the same object.
        """
        _mapattr=getattr(parent,'_mapattr',None)
        if not _mapattr:
            _mapattr=parent._mapattr=ExtDict({})
        if _mapattr.get(name):
            return _mapattr[name]
        _mapattr[name]=cls(name,value,parent)
        return _mapattr[name]
#} // end of AttributeWrapper


#{ // BaseExpr
class BaseExpr(object):
    """Base class of expressions: evaluates itself via self.evaluate(ctx) and
    coerces the result to number, string or boolean."""

    def __init__(self):
        pass

    def number(self,ctx):
        exrs=self.evaluate(ctx)
        # Node-sets convert themselves; primitives go through toNumber().
        if getattr(exrs,'isNodeSet',None):
            result=exrs.number()
        else:
            result=toNumber(exrs)
        return result

    def string(self,ctx):
        exrs=self.evaluate(ctx)
        if getattr(exrs,'isNodeSet',None):
            result=exrs.string()
        else:
            result=toString(exrs)
        return result

    def bool(self,ctx):
        exrs=self.evaluate(ctx)
        if getattr(exrs,'isNodeSet',None):
            result=exrs.bool()
        else:
            result=toBoolean(exrs)
        return result
#} // end of BaseExpr


#{ // BaseExprHasPredicates
class BaseExprHasPredicates(BaseExpr):
    """Base class of expressions that carry a list of [predicate] filters."""

    def __init__(self):
        pass

    def evaluatePredicates(self,nodeset,start=0):
        """Filter *nodeset* in place by self.predicates[start:] and return it.

        A numeric predicate result keeps the node whose position equals it, a
        string result keeps the node unless it is empty, and an object result
        is reduced with its bool() method.  Positions count from the end
        when self.reverse is set.
        """
        reverse=getattr(self,'reverse',False)
        predicates=getattr(self,'predicates',[])
        nodeset.sort()
        l0=len(predicates)
        for i in range(start,l0):
            predicate=predicates[i]
            deleteIndexes=[]
            nodes=nodeset.list()
            l1=len(nodes)
            for j in range(0,l1):
                position=(l1-j) if reverse else (j+1)
                exrs=predicate.evaluate(Ctx(nodes[j],position,l1))
                if typeof(exrs)=='number':
                    exrs=(position==exrs)
                elif typeof(exrs)=='string':
                    exrs=False if exrs=='' else True
                elif typeof(exrs)=='object':
                    exrs=exrs.bool()
                if not exrs:
                    deleteIndexes.append(j)
            # Delete from the highest index down so that the remaining
            # indexes stay valid.
            r=range(0,len(deleteIndexes))
            r.sort(reverse=True)
            for j in r:
                nodeset._del(deleteIndexes[j])
        return nodeset

    @classmethod
    def parsePredicates(cls,lexer,expr):
        """Parse zero or more '[...]' predicates from *lexer* and add each
        one to *expr* via expr.predicate().  Raises ValueError (through
        throwError) on a missing, unclosed or badly terminated predicate."""
        while lexer.peek()=='[':
            lexer.next()
            if lexer.empty():
                throwError(u'missing predicate expr')
            predicate=BinaryExpr.parse(lexer)
            expr.predicate(predicate)
            if lexer.empty():
                throwError(u'unclosed predicate expr')
            if lexer.next() != ']':
                # Step back so the offending token can be read again for the message.
                lexer.back()
                throwError(u'bad token: %s' % (lexer.next()))
#} // end of BaseExprHasPredicates


#{ // BinaryExpr
class BinaryExpr(BaseExpr):
    """Binary operator expression (arithmetic, comparison, 'and', 'or')."""

    def __init__(self,op,left,right):
        self.op=op
        self.left=left
        self.right=right
        self.dataType=BinaryExpr.ops[op][2]
        (lneedContextPosition,rneedContextPosition)=(getattr(left,'needContextPosition',None),getattr(right,'needContextPosition',None))
        (lneedContextNode,rneedContextNode)=(getattr(left,'needContextNode',None),getattr(right,'needContextNode',None))
        self.needContextPosition=lneedContextPosition or rneedContextPosition
        self.needContextNode=lneedContextNode or rneedContextNode
        if op=='=':
            # Mark "<quick attribute> = <context-independent value>" (either
            # way round) by setting quickAttr, attrName and attrValueExpr.
            # NOTE(review): the operands are probed for 'datatype' (lower
            # case) while this class stores 'dataType' -- confirm which
            # spelling the other expression classes use.
            (ldatatype,rdatatype)=(getattr(left,'datatype',None),getattr(right,'datatype',None))
            (lqattr,rqattr)=(getattr(left,'quickAttr',None),getattr(right,'quickAttr',None))
            if not rneedContextNode and not rneedContextPosition and rdatatype!='nodeset' and rdatatype!='void' and lqattr:
                self.quickAttr=True
                self.attrName=left.attrName
                self.attrValueExpr=right
            elif not lneedContextNode and not lneedContextPosition and ldatatype!='nodeset' and ldatatype!='void' and rqattr:
                self.quickAttr=True
                self.attrName=right.attrName
                self.attrValueExpr=left

    def evaluate(self,ctx):
        """Apply the operator's function from BinaryExpr.ops to both operands."""
        result=BinaryExpr.ops[self.op][1](self.left,self.right,ctx)
        return result

    def show(self,indent=''):
        """Return an indented text dump of this expression and its operands."""
        t=''
        t+=indent+'binary: '+self.op+'\n'
        indent+=indent_space
        t+=self.left.show(indent)
        t+=self.right.show(indent)
        return t

    # --- Local Functions
    @staticmethod
    def _compare(op,comp,left,right,ctx):
        """Evaluate both operands and compare them with *comp*.

        node-set vs node-set: True if any pair of string-values satisfies
        comp.  node-set vs primitive: each node is converted to the
        primitive's type first.  Two primitives: for '=' and '!=' compare as
        booleans if either is boolean, as numbers if either is a number,
        otherwise as is; for the other operators compare as numbers.
        """
        left=left.evaluate(ctx)
        right=right.evaluate(ctx)
        if getattr(left,'isNodeSet',None) and getattr(right,'isNodeSet',None):
            lnodes=left.list()
            rnodes=right.list()
            for lnode in lnodes:
                for rnode in rnodes:
                    if comp(NodeUtil.to('string',lnode),NodeUtil.to('string',rnode)):
                        return True
            return False
        if getattr(left,'isNodeSet',None) or getattr(right,'isNodeSet',None):
            if getattr(left,'isNodeSet',None):
                (nodeset,primitive)=(left,right)
            else:
                (nodeset,primitive)=(right,left)
            nodes=nodeset.list()
            type=typeof(primitive)
            # The node value is always passed as the first argument of comp,
            # whichever side the node-set was on.
            for node in nodes:
                if comp(NodeUtil.to(type,node),primitive):
                    return True
            return False
        if op=='=' or op=='!=':
            if typeof(left)=='boolean' or typeof(right)=='boolean':
                return comp(toBoolean(left),toBoolean(right))
            if typeof(left)=='number' or typeof(right)=='number':
                return comp(toNumber(left),toNumber(right))
            return comp(left,right)
        return comp(toNumber(left),toNumber(right))

    # The arithmetic helpers return the string 'NaN' when an operand does not
    # convert to a number, and an int whenever the result is integral.
    def _div(left,right,ctx):
        l=left.number(ctx)
        r=right.number(ctx)
        if typeof(l)!='number' or typeof(r)!='number':
            return 'NaN'
        if r==0:
            # Sign of the result, taken from the operands' 'op' attribute
            # ('+' when an operand has none).
            sign=int(getattr(left,'op','+')+'1')*int(getattr(right,'op','+')+'1')
            if l==0:
                return 'NaN'
            elif sign<0:
                return '-Infinity'
            else:
                return 'Infinity'
        n=float(l) / float(r)
        n1=int(n)
        return n1 if n1==n else n

    def _mod(left,right,ctx):
        l=left.number(ctx)
        r=right.number(ctx)
        if typeof(l)!='number' or typeof(r)!='number':
            return 'NaN'
        if r==0:
            if l==0:
                return 'NaN'
            else:
                return 0
        return l % r

    def _mul(left,right,ctx):
        l=left.number(ctx)
        r=right.number(ctx)
        if typeof(l)!='number' or typeof(r)!='number':
            return 'NaN'
        n=l * r
        n1=int(n)
        return n1 if n1==n else n

    def _add(left,right,ctx):
        l=left.number(ctx)
        r=right.number(ctx)
        if typeof(l)!='number' or typeof(r)!='number':
            return 'NaN'
        n=l + r
        n1=int(n)
        return n1 if n1==n else n

    def _sub(left,right,ctx):
        l=left.number(ctx)
        r=right.number(ctx)
        if typeof(l)!='number' or typeof(r)!='number':
            return 'NaN'
        n=l - r
        n1=int(n)
        return n1 if n1==n else n

    # NOTE(review): SOURCE IS CORRUPTED HERE.  The text from the '<' inside
    # _lt's lambda to the '>' in the operator string of the following _gt
    # definition was deleted, fusing the two functions (the ops table below
    # still refers to both _lt and _gt).  Kept verbatim; not valid Python.
    def _lt(left,right,ctx):
        return BinaryExpr._compare('<',(lambda a,b:a',(lambda a,b:a>b),left,right,ctx)

    def _le(left,right,ctx):
        return BinaryExpr._compare('<=',(lambda a,b:a<=b),left,right,ctx)

    def _ge(left,right,ctx):
        return BinaryExpr._compare('>=',(lambda a,b:a>=b),left,right,ctx)

    def _eq(left,right,ctx):
        return BinaryExpr._compare('=',(lambda a,b:a==b),left,right,ctx)

    def _ne(left,right,ctx):
        return BinaryExpr._compare('!=',(lambda a,b:a!=b),left,right,ctx)

    # '&' and '|' evaluate both operands (no short-circuit).
    def _and(left,right,ctx):
        return left.bool(ctx) & right.bool(ctx)

    def _or(left,right,ctx):
        return left.bool(ctx) | right.bool(ctx)

    # operator -> [precedence, implementation, result data type]
    ops=ExtDict({
      'div':[6,_div,'number' ]
    , 'mod':[6,_mod,'number' ]
    , '*' :[6,_mul,'number' ]
    , '+' :[5,_add,'number' ]
    , '-' :[5,_sub,'number' ]
    , '<' :[4,_lt ,'boolean']
    , '>' :[4,_gt ,'boolean']
    , '<=' :[4,_le ,'boolean']
    , '>=' :[4,_ge ,'boolean']
    , '=' :[3,_eq ,'boolean']
    , '!=' :[3,_ne ,'boolean']
    , 'and':[2,_and,'boolean']
    , 'or' :[1,_or ,'boolean']
    })

    @classmethod
    def parse(cls,lexer):
        """Parse a binary expression from *lexer*, using the precedences in cls.ops."""
        ops=cls.ops
        stack=[]
        index=lexer.index
        while True:
            if lexer.empty():
                throwError(u'missing right expression')
            expr=UnaryExpr.parse(lexer)
            op=lexer.next()
            if not op:
                break
            info=ops.get(op)
            precedence=info and info[0]
            if not precedence:
                # Not a binary operator: push it back and stop.
                lexer.back()
                break
            # NOTE(review): SOURCE IS CORRUPTED HERE.  Everything from the
            # '<' of this 'while 0<...' loop up to a '>' in an
            # 'if deep1>deep2:' test of a later node-ordering comparator was
            # deleted: the rest of parse(), the end of BinaryExpr, every
            # definition in between (UnaryExpr, used above, is among the
            # missing ones) and the head of the comparator and of its
            # enclosing class/method are lost.  The surviving tokens follow
            # verbatim; their indentation is a guess and the result is not
            # valid Python.
            while 0deep2:
                while deep1!=deep2:
                    deep1-=1
                    node1=node1.parentNode
                if node1==node2:
                    return 1
            elif deep2>deep1:
                while deep2!=deep1:
                    deep2-=1
                    node2=node2.parentNode
                if node1==node2:
                    return -1
            # Climb both nodes until they share a parent ...
            while True:
                ancestor1=node1.parentNode
                ancestor2=node2.parentNode
                if ancestor1==ancestor2:
                    break
                node1=ancestor1
                node2=ancestor2
            # ... then scan node1's following siblings for node2.
            while True:
                node1=node1.nextSibling
                if not node1:
                    break
                if node1==node2:
                    return -1
            return 1

        def index_comp(a,b):
            # Compare by _sortindex; attribute wrappers use their parent's.
            if a.nodeType==NodeTypeDOM.ATTRIBUTE_NODE:
                a=a.parentNode
            if b.nodeType==NodeTypeDOM.ATTRIBUTE_NODE:
                b=b.parentNode
            return cmp(a._sortindex,b._sortindex)

        if USE_NODE_INDEX:
            nodes.sort(index_comp)
        else:
            nodes.sort(_comp)

    # NOTE(review): the class this method belongs to is part of the missing
    # span above, and the method itself is cut off in the middle of its last
    # condition (at a '<').  Kept verbatim.
    def reserveDelByNodeID(self,id,offset,reverse):
        _map=self.createIdIndexMap()
        index=_map.get(id)
        # NOTE(review): an index of 0 is falsy and skips this branch --
        # confirm whether that is intended.
        if index:
            if reverse and index<(self.length-offset-1) or not reverse and offset

# NOTE(review): SOURCE IS TRUNCATED HERE.  The remainder of the code and the
# beginning of the change history are missing.  The surviving fragments of the
# change-history comment follow; markup inside them was stripped as well.
#
#   text before
#   text after
#   "
#
#     returned 'True', even though 'False' is right
#   - cope with problems on malformed HTML
#     may convert ' ' to ' ' automatically
#
# 0.01 : 2009-03-25
#   first release
#
#■ End of BSXPath.py