#!/usr/bin/python # # -*- coding: utf-8 -*- # vim: set ts=2 sw=2 et sts=2 ai: """This program converts mediawiki format to markdown. It takes a file as an argument and outputs the markdown text to stdout. """ # stdlib imports import cStringIO as StringIO import optparse import sys # External module imports from mwlib.uparser import simpleparse from mwlib.parser import nodes class options: # Should we drop to the debugger on error DEBUGGER = False # Should we die on unknown flags STRICT = True parser = optparse.OptionParser() parser.add_option( "-f", "--file", dest="file", help="wikimedia FILE to read.", metavar="FILE") parser.add_option( "-d", "--debugger", dest="DEBUGGER", action="store_true", default=sys.stdout.isatty() and sys.stdin.isatty(), help="Drop to Python PDB debugger on an error.") parser.add_option( "--no-debugger", dest="DEBUGGER", action="store_false", default=sys.stdout.isatty() and sys.stdin.isatty(), help="Don't drop to Python PDB debugger on an error.") parser.add_option( "-s", "--strict", dest="STRICT", action="store_true", default=options.STRICT, help="Error on known tags, style or other problems.") parser.add_option( "--no-strict", dest="STRICT", action="store_false", default=options.STRICT, help="Error on known tags, style or other problems.") parser.add_option( "-l", "--lenient", dest="STRICT", action="store_true", default=options.STRICT, help="Don't error on known tags, style or other problems.") def debugger(): if options.DEBUGGER: # Flush the output to make sure we see the latest errors/output sys.stderr.write('\n\n') sys.stderr.flush() sys.stdout.write('\n\n') sys.stdout.flush() import traceback, pdb type, value, tb = sys.exc_info() pdb.post_mortem(tb) else: raise class BaseConverter(object): def __init__(self): self.out = "" self.listmode = [] def append(self, s): if isinstance(s, unicode): s = s.encode('utf-8') self.out += s def getvalue(self): out = self.out self.out = "" return out def parse_node(self, node): try: tagname = 'on_'+str(node.tagname).replace('@', '') classname = 'on_'+node.__class__.__name__.lower().replace('@', '') f = getattr(self, tagname, getattr(self, classname, None)) f(node) except AttributeError, e: sys.stderr.write('Unknown node: '+(node.tagname or node.__class__.__name__.lower())) assert not options.STRICT def parse_children(self, node): for child in node.children: self.parse_node(child) on_node = parse_children def on_article(self, node): self.parse_children(node) self.produce_dls() def on_ol(self, node): self.listmode.append('order') self.append('\n') self.parse_children(node) self.listmode.pop(-1) def on_ul(self, node): self.listmode.append('unorder') self.append('\n') self.parse_children(node) self.listmode.pop(-1) def on_text(self, node): self.append(node.asText()) def on_style(self, node): # Definition list? if node.caption == ';': if not hasattr(self, 'dls'): self.dls = [] self.dls.append(node) elif node.caption == ':': # If not part of an definition list, it's an indent... if not hasattr(self, 'dls'): self.on_blockquote(node) else: self.dls.append(node) # italics elif node.caption == "'''": self.on_bold(node) # bold elif node.caption == "''": self.on_italics(node) # underline elif node.caption == "u": self.on_underline(node) else: sys.stderr.write("Unknown style: %s\n" % node.caption) assert not options.STRICT def on_underline(self, node): self.append("") self.parse_children(node) self.append("") # Ignore category links as we don't have anything similar def on_categorylink(self, node): return def on_table(self, node): table_cell_parser = self.__class__() table = [] table_caption = "" table_width = 0 table_column_widths = [] for row in node.children: if isinstance(row, nodes.Caption): table_cell_parser.parse_children(row) output = table_cell_parser.getvalue() table_caption += output continue if len(row.children) > table_width: table_width = len(row.children) while len(table_column_widths) < table_width: table_column_widths.append(0) cells = [] for i, cell in enumerate(row.children): table_cell_parser.parse_children(cell) cell_data = table_cell_parser.getvalue() if table_column_widths[i] < len(cell_data): table_column_widths[i] = len(cell_data) cells.append({'node': cell, 'rendered': cell_data}) args = {'rowtype': row} if len(row.children): args['celltype'] = row.children[-1] if len(cells): args['cells'] = cells table.append(args) for row in table: while len(row['cells']) < table_width: row['cells'].append({'rendered': ''}) self.on_process_table(table_caption, table_column_widths, table) class HTMLConverter(BaseConverter): """During HTML output (such as dl lists and tables) Markdown doesn't work.""" def on_p(self, node): self.append("
") self.parse_children(node) self.append("
") self.produce_dls() def on_italics(self, node): self.append("") self.parse_children(node) self.append("") def on_bold(self, node): self.append("") self.parse_children(node) self.append("") def on_url(self, node): parser = HTMLConverter() parser.parse_children(node) output = parser.getvalue() if not output: output = node.caption self.append("%s" % ( node.caption.replace(' ', '_'), output)) on_namedurl = on_url def on_imagelink(self, node): self.append("'" % ( node.target.replace('Image:', '').replace(' ', '_'), node.asText())) def on_articlelink(self, node): for i in ['jpg', 'png', 'gif']: if node.target.endswith(i): self.on_imagelink(node) break else: self.append("%s" % ( node.caption.replace(' ', '_'), node.caption)) def on_namespacelink(self, node): for i in ['jpg', 'png', 'gif']: if node.target.endswith(i): self.on_imagelink(node) break else: node.caption = node.target self.on_url(node) def produce_dls(self): # We need to specially handle defintion lists if hasattr(self, "dls"): self.append("%s\n' % output) def on_code(self, node): self.append('`') self.parse_children(node) self.append('`') def on_tt(self, node): self.append('') self.parse_children(node) self.append('') def on_p(self, node): self.parse_children(node) self.produce_dls() def produce_dls(self): if hasattr(self, "dls"): self.append("