"""Check that every anchor referenced by one HTML file exists in another.

Usage: <script> [REF_HTML [MAIN_HTML]]

REF_HTML defaults to "../html/gcode.html".  MAIN_HTML defaults to REF_HTML
with "gcode" replaced by "gcode_main".  Every fragment ("#name") found in an
<a href=...> of REF_HTML must be defined in MAIN_HTML, either as an
<a name=...> or as an id attribute.  Missing anchors are listed on stdout and
the script exits with status 1.
"""
import os
import sys
import sgmllib
import cookielib
import urllib
import htmlentitydefs

if len(sys.argv) > 1:
    ref = sys.argv[1]
else:
    ref = "../html/gcode.html"

if len(sys.argv) > 2:
    main = sys.argv[2]
else:
    main = ref.replace("gcode", "gcode_main")


def get(attr, attrs, default=""):
    """Return the value paired with `attr` in `attrs`, ignoring key case.

    `attrs` is an iterable of (key, value) pairs.  The first pair whose key
    matches wins; `default` is returned when no key matches.
    """
    attr = attr.lower()
    for k, v in attrs:
        if k.lower() == attr:
            return v
    return default


class MetaHandler:
    """Mixin that records the charset declared by a content-type <meta> tag.

    The detected charset is stored in `self.encoding`; the classes using this
    mixin initialise that attribute to None.
    """

    def do_meta(self, attrs):
        """Handle a <meta> tag and update `self.encoding` when it names a charset."""
        print("meta %s" % (attrs,))
        equiv = get("http-equiv", attrs)
        content = get("content", attrs)
        print("meta %s %s" % (equiv, content))
        # NOTE(review): this comparison is case-sensitive, so a tag written as
        # http-equiv="Content-Type" is skipped -- confirm whether intended.
        if equiv != "content-type":
            return
        words = cookielib.split_header_words([content])
        if not words:
            # Empty content attribute: nothing to parse, so there is no charset.
            return
        attrs = words[0]
        print("meta %s" % (attrs,))
        encoding = get("charset", attrs)
        print("encoding %s" % repr(encoding))
        if encoding == "ASCII":
            encoding = "ISO-8859-1"
        if encoding:
            self.encoding = encoding


class get_refs(sgmllib.SGMLParser, MetaHandler):
    """Parser that collects the fragment part of every <a href="...#frag">.

    After feeding a document, `refs` holds the set of non-empty fragments.
    """
    entitydefs = htmlentitydefs.entitydefs

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        self.refs = set()
        self.encoding = None

    def do_a(self, attrs):
        """Record the fragment of an <a> tag's href, if it has one."""
        href = get('href', attrs)
        if self.encoding:
            href = href.decode(self.encoding)
        href = urllib.unquote(href)
        if "#" in href:
            # Split on the first "#" only: an href holding several "#"
            # characters must not abort the whole check.
            fragment = href.split("#", 1)[1]
            if fragment:
                self.refs.add(fragment)


class get_anchors(sgmllib.SGMLParser, MetaHandler):
    """Parser that collects every anchor name defined in a document.

    Anchors come from <a name=...>, <a id=...> and the id attribute of any
    tag that has no dedicated handler.  After feeding a document, `anchors`
    holds the set of names.
    """
    entitydefs = htmlentitydefs.entitydefs

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        self.anchors = set()
        self.encoding = None

    def unknown_starttag(self, tag, attrs):
        """Treat the id attribute of any other tag as an anchor definition."""
        elem_id = get('id', attrs)
        if elem_id:
            self.do_a([('name', elem_id)])

    def unknown_endtag(self, tag):
        """Ignore end tags; only start tags can define anchors."""
        pass

    def do_a(self, attrs):
        """Record the name (or, failing that, the id) of an <a> tag."""
        name = get('name', attrs, get('id', attrs))
        if self.encoding:
            name = name.decode(self.encoding)
        name = urllib.unquote(name)
        if name:
            self.anchors.add(name)


def _read_file(path):
    """Return the whole content of `path`, closing the file afterwards."""
    with open(path) as f:
        return f.read()


r = get_refs()
r.feed(_read_file(ref))
r = r.refs

a = get_anchors()
a.feed(_read_file(main))
a = a.anchors

missing = r - a
if missing:
    print("Anchors used in %s but not defined in %s:" % (
        os.path.basename(ref), os.path.basename(main)))
    # Sorted so that the report is stable from one run to the next.
    for i in sorted(missing):
        print("\t%r" % i)
    raise SystemExit(1)