#!/usr/bin/env python
"""
tokenize-python.py -- print tokenization of one or more input .py files

$Id$

History:

bruce 080616 drafted this from an old script I had at home, py-tokenize.py

russ 080617 Added options to format whitespace and to remove certain kinds of
output, to make it more useful for comparing source files for important changes
while ignoring unimportant changes.
"""

import sys
import tokenize
from os.path import basename
from getopt import getopt, GetoptError
from tokenize import generate_tokens, tok_name
from pprint import pprint

usageMsg = '''usage: %s [-t] [-a] [-l] [-c] [-d] [-s n] [-o] files...
    When multiple files are given, "======= file" separators are inserted.

    -t - Print raw token types and strings.
         (The default is to print tokens in an indented text form with generated
          whitespace.)

    -a - Suppress all, same as "-lcd".
    -l - Suppress printing line breaks within a statement (NL tokens.)
    -c - Suppress printing comments.
    -d - Suppress printing doc strings.

    -s n - Print just the "signature" of each line (the first n words.)
    -o - Suppress printing operators in the signature.

  Examples:
    To check on changes other than comments and file formatting, use the "-a"
    option on the before-and-after versions and diff them.  You can do a whole
    directory with *.py .

      cd before; %s -a *.py > ../before.pytok
      cd ../after; %s -a *.py > ../after.pytok
      cd ..; diff -C 1 before.pytok after.pytok > pytok.diffs

    Use "-aos 2" to concentrate more closely on statement indentation changes.

    indent-diff.gawk filters out *just* indentation changes from the diffs.

      indent-diff.gawk pytok.diffs > pytok.indent-diff
'''


def usage():
    """
    Write the usage message to stderr, substituting the program's base name
    (taken from sys.argv[0]) for each of the three %s slots in usageMsg.
    """
    pgm = basename(sys.argv[0])
    # usageMsg already ends with a newline; one more is appended so the
    # message is followed by a blank line.
    sys.stderr.write(usageMsg % (3 * (pgm,)) + "\n")


# Option variables.  They are set by doOpts() and read by py_tokenize() and
# doText().
printToks = False       # -t: pprint the raw (token name, string) pairs.
noNLs = False           # -l: drop NL tokens and lines that are only a newline.
noComments = False      # -c: drop COMMENT tokens.
noDocStrs = False       # -d: drop STRING tokens that start a logical line.
sigWords = 0            # -s n: keep only the first n words of each line.
noOps = False           # -o: drop OP tokens.
filenames = None        # Non-option arguments left over by getopt.


def doOpts():
    """
    Parse sys.argv[1:] into the module-level option variables and filenames.

    On an unrecognized option, or a -s value that is not an integer, the
    problem and the usage message are written to stderr and the process
    exits with status 2.
    """
    global printToks, noNLs, noComments, noDocStrs, sigWords, noOps, filenames
    try:
        opts, filenames = getopt(sys.argv[1:], "talcds:o")
    except GetoptError as err:
        # str(err) is something like "option -x not recognized".
        sys.stderr.write(str(err) + "\n")
        usage()
        sys.exit(2)

    for opt, val in opts:
        if opt == "-t":
            printToks = True
        elif opt == "-a":
            noNLs = noComments = noDocStrs = True
        elif opt == "-l":
            noNLs = True
        elif opt == "-c":
            noComments = True
        elif opt == "-d":
            noDocStrs = True
        elif opt == "-s":
            try:
                sigWords = int(val)
            except ValueError:
                # Report a bad count the same way as a bad option, rather
                # than dying with a traceback.
                sys.stderr.write(
                    "option -s requires an integer, not %r\n" % (val,))
                usage()
                sys.exit(2)
        elif opt == "-o":
            noOps = True


def improve(tup5):
    """
    Reduce a 5-tuple from generate_tokens to a (token name, text) pair.

    The text of INDENT tokens is replaced by the empty string; doText()
    generates its own indentation.
    """
    typ, text, s, t, line = tup5
    tokname = tok_name[typ]
    if tokname == 'INDENT':
        text = ''
    return (tokname, text)


def _openSource(filename):
    """
    Open a Python source file for reading as text with universal newlines.
    """
    if sys.version_info[0] >= 3:
        # tokenize.open() decodes using the file's coding cookie or BOM.
        # The 'U' mode flag used below is rejected by Python 3.11 and later.
        return tokenize.open(filename)
    return open(filename, 'rU')


def py_tokenize(filename_in, file_out):
    """
    Tokenize the file named filename_in and write the result to file_out.

    With -t (printToks) the list of (token name, text) pairs is
    pretty-printed; otherwise the tokens are formatted as text by doText().
    """
    file_in = _openSource(filename_in)
    try:
        li = list(generate_tokens(file_in.readline))
    finally:
        # Close the input even when tokenizing raises.
        file_in.close()

    # A real list, not map(): pprint needs a sequence on Python 3.
    li2 = [improve(tup5) for tup5 in li]
    if printToks:
        pprint(li2, file_out)
    else:
        doText(li2, file_out)


def doText(tokStrs, file_out):
    """
    Write the tokens back out as text with generated whitespace.

    tokStrs is a sequence of (token name, token string) pairs, as produced
    by improve().  file_out is any object with a write() method.  The
    module-level option variables (noNLs, noComments, noDocStrs, sigWords,
    noOps) select what is left out.  Each INDENT/DEDENT token changes the
    indentation by four spaces.
    """
    prevTok = 'NEWLINE'         # Start out at the beginning of a line.
    prevString = ''
    firstTok = True
    prevUnary = False
    nTok = 1                    # Token number in the line.
    nWords = 0                  # Number of words in the line.
    indent = 0
    nlIndent = 0                # Second-level indent within a statement.
    commentIndent = 0
    lineBuff = ""

    nlToks = ('NEWLINE', 'NL', 'COMMENT')   # Tokens that end an output line.
    wordToks = ('NAME', 'NUMBER', 'STRING')
    noSpaceOps = ('.', ':', '(', '[', '{', '}', ']', ')')

    for (tok, tokString) in tokStrs:
        # ENDMARKER has no text.  Skipping it keeps it from adding a stray
        # space to a final line that is flushed after the loop.
        if tok == 'ENDMARKER':
            continue

        # Various things to ignore.
        if (tok == 'NL' and noNLs or
            tok == 'COMMENT' and noComments or
            prevTok == 'NEWLINE' and tok == 'STRING' and noDocStrs or
            tok != 'NEWLINE' and sigWords > 0 and nWords >= sigWords or
            tok == 'OP' and noOps):
            continue

        # Alter the indentation level.  (These may occur after a NEWLINE token.)
        if tok == 'INDENT':
            indent += 4
            continue
        if tok == 'DEDENT':
            indent -= 4
            continue

        # NEWLINE is the logical end of statement, as opposed to NL, which is
        # mere formatting whitespace.  Comments also end lines.
        if tok in nlToks:
            if not firstTok or tok == 'COMMENT':
                # Indentation for comments.
                if tok == 'COMMENT':
                    if nTok == 1:
                        lineBuff += (indent + commentIndent) * " "
                    else:
                        lineBuff += " "

                # Output the last token on the line, and then the line.
                lineBuff += tokString

                # Some tokenizers put the newline into the COMMENT token;
                # others emit it as a separate NL/NEWLINE token.  With -l
                # that separate token is suppressed, so end the comment's
                # line here.
                if (tok == 'COMMENT' and noNLs and
                    not tokString.endswith("\n")):
                    lineBuff += "\n"

                # Output the line.
                if not noNLs or lineBuff != "\n":
                    file_out.write(lineBuff)
                lineBuff = ""

            # Second-level indent within a statement, after a NL token
            # that isn't at the beginning of a line.
            if tok == 'NL' and nTok > 1:
                nlIndent = 4
            else:
                nlIndent = 0
        else:
            # Generate indentation at the beginning of a line.
            if lineBuff == "":
                lineBuff = (indent + nlIndent) * " "

            # Put spaces around word tokens, but not before commas, after unary
            # ops, or around some special binary ops.
            if (nTok > 1 and       # Never before the first token in the line.
                # When we might put out a space before this token.
                (prevTok in wordToks or tok in wordToks or tok == 'OP') and
                # When we wouldn't put out a space before this token.
                not (prevUnary or
                     (prevTok == 'OP' and
                      (prevString in noSpaceOps and tok != 'OP')) or
                     tok == 'OP' and (tokString == ',' or
                                      tokString in noSpaceOps))):
                lineBuff += " "

            # Output the token.
            lineBuff += tokString

        # Carry over a little bit of context from the last two tokens.
        prevUnary = (tok == 'OP' and tokString != ',' and
                     tokString not in noSpaceOps and
                     prevTok == 'OP' and prevString not in noSpaceOps)

        # Bug: Comments tokens after a colon appear *before* the INDENT, and
        # similarly after pass, continue, and return, *before* the DEDENT.
        if tok in nlToks:
            if prevTok == 'OP' and prevString == ':':
                commentIndent = 4
            elif (prevTok == 'NAME' and
                  prevString in ('pass', 'continue', 'return')):
                commentIndent = -4
            else:
                commentIndent = 0

        # The current token becomes the previous.
        prevTok = tok
        prevString = tokString
        firstTok = False   # So we know there really was a previous token.

        # Reset the token and word counters after a newline.
        if tok in nlToks:
            nTok = 1
            nWords = 0
        else:
            nTok += 1
            if tok in wordToks:
                nWords += 1

    # Flush a final line that no NEWLINE token terminated (a file without a
    # trailing newline, on tokenizers that don't synthesize one).  A buffer
    # holding only indentation is not worth writing.
    if lineBuff.strip():
        file_out.write(lineBuff)


def dofiles(filenames):
    """
    Tokenize each named file to stdout.  When more than one file is given,
    each one is preceded by a "======= [file]" separator.
    """
    for filename in filenames:
        if len(filenames) > 1:
            sys.stdout.write("\n======= [%s]\n\n" % (filename,))
        py_tokenize(filename, sys.stdout)


def run():
    """
    Script entry point: parse the options, then process the named files, or
    show the usage message when no files were given.
    """
    doOpts()
    if filenames:
        dofiles(filenames)
    else:
        usage()


if __name__ == '__main__':
    run()

# end