diff options
author | Russell D. Fish <fish@cs.utah.edu> | 2008-06-19 18:19:33 +0000 |
---|---|---|
committer | Russell D. Fish <fish@cs.utah.edu> | 2008-06-19 18:19:33 +0000 |
commit | 40c76baae020696f879136227dcc4ece11aabc27 (patch) | |
tree | dfca3b8f2dd8309a1d271d4892e49c401e34b75a | |
parent | 5e486b37ec55d8039f3f3659c8b76971ad9ad33e (diff) | |
download | nanoengineer-40c76baae020696f879136227dcc4ece11aabc27.tar.gz nanoengineer-40c76baae020696f879136227dcc4ece11aabc27.zip |
Tools for checking large whitespace/line-length changes.

- tokenize-python.py - Added options to format whitespace and to remove
  certain kinds of output, to make it more useful for comparing source
  files for important changes while ignoring unimportant changes.
- indent-diff.gawk - Filter indentation changes from context diffs.
  Only diff line groups with indentation changes are printed.
-rwxr-xr-x | cad/src/tools/Refactoring/indent-diff.gawk | 118 |
-rwxr-xr-x | cad/src/tools/Refactoring/tokenize-python.py | 216 |
2 files changed, 326 insertions, 8 deletions
diff --git a/cad/src/tools/Refactoring/indent-diff.gawk b/cad/src/tools/Refactoring/indent-diff.gawk new file mode 100755 index 000000000..6ed20112c --- /dev/null +++ b/cad/src/tools/Refactoring/indent-diff.gawk @@ -0,0 +1,118 @@ +#!/usr/bin/env gawk -f +# +# indent-diff.gawk -- Filter indentation changes from context diffs. +# Only diff line groups with indentation changes are printed. +# +# Indentation is significant for control structures in Python code. +# On matching change lines (starting with a "!" in context diffs), +# if the indentation is different, the indentation count is inserted +# after the "! " to highlight the change. e.g: +# +# *************** +# *** 33,35 **** +# draw_vane( bond, a1py, a2py, ord_pi_y, rad, col) +# ! 8 if ord_pi_z: +# draw_vane( bond, a1pz, a2pz, ord_pi_z, rad, col) +# --- 33,35 ---- +# draw_vane( bond, a1py, a2py, ord_pi_y, rad, col) +# ! 12 if ord_pi_z: +# draw_vane( bond, a1pz, a2pz, ord_pi_z, rad, col) +# +# Note: currently assumes that indentation is done with spaces, not tabs. + +BEGIN { + debug = 0 ## 1 + # Minimum indentation difference to report. + mindiff = 2 +} + +# Capture the file header lines. +/^\*\*\* [a-zA-Z]/ { + if ( entry ) doGroup(); entry = 0; + ofile = $0; + didHdr = 0; + if ( debug ) print "Starting", ofile; + next; +} +/^\-\-\- [a-zA-Z]/ { nfile = $0; next; } + +# Collect and process diff line groups. +# Line group separator. +/^\*\*\*\*\*\*/ { + if ( entry ) doGroup(); + entry = 1; oline = 1; nline = 0; +} +# Separator between old and new line sections. +/^--- [1-9]/ { nline = 1; } +# Store lines. +{ if ( nline ) nlines[nline++] = $0; else olines[oline++] = $0; next; } +END { if ( entry ) doGroup(); } + +# Process a diff line group within a file diff. +function doGroup() { + # Compare the indentation on old and new lines in a line group. + nonmatches = 0; + for ( o = n = 1; o < oline && n < nline; o++ ) { + osig = sig( ol = olines[o] ); + # Ignore old, non-change ("!") lines. 
+ if ( substr(ol, 1, 1) != "!" ) continue; + # Look for a corresponding new change line. + nonmatched = 1; + oind = indLen(ol); + for ( ; n < nline; n++ ) { + nsig = sig( nl = nlines[n] ); + ##printf "sigs\n %s|%s\n %s|%s\n", osig, ol, nsig, nl; + # Ignore new, non-change ("!") lines. + if ( substr(nl, 1, 1) != "!" ) continue; + if ( nsig != osig ) { + if ( debug ) printf "different signatures\n %s\n %s\n", ol, nl; + } else { + # Matching signatures, compare indentation. + nind = indLen(nl); + if ( nind <= (oind + mindiff) && nind >= (oind - mindiff) ) { + if ( debug ) printf "matched sigs & indentation\n %s\n %s\n", ol, nl; + nonmatched = 0; + } + else { + # Non-match: Insert the indentation lengths in the lines to show where. + olines[o] = substr(olines[o], 1, 2) sprintf("%2d", oind) substr(olines[o], 5); + nlines[n] = substr(nlines[n], 1, 2) sprintf("%2d", nind) substr(nlines[n], 5); + if ( debug ) printf "Different indentation\n %s\n %s\n", olines[o], nlines[n] + } + n++; # Matched signatures, go on to next line. + break; # Out of the new-line loop. + } + } + nonmatches += nonmatched + } + + # Print line groups with indentation that isn't known to match. + if ( nonmatches ) { + if ( ! didHdr ) { print ofile; print nfile; didHdr = 1; } + for ( o = 1; o < oline; o++ ) print olines[o]; + for ( n = 1; n < nline; n++ ) print nlines[n]; + } +} + +# Use the first two words on the line as a signature. +function sig(line) { + notWord = "[^a-zA-Z0-9_]+"; + word = "([a-zA-Z0-9_]+)"; + twoWords = "^" notWord word notWord word ".*"; + oneWord = "^" notWord word ".*"; + ret = gensub(twoWords, "\\1 \\2", 1, line); + + # There may not be two words on the line, or any. + if ( ret == line ) ret = gensub(oneWord, "\\1", 1, line); + if ( ret == line ) ret = "" + + ##print "sig", match(line, pat), ret + return ret; +} + +# The length of indentation on a line in a context diff entry. +function indLen(line) { + # Skip the first two characters, which are prefixed by diff. 
+ return length(gensub("..( *).*", "\\1", 1, line)) +} + diff --git a/cad/src/tools/Refactoring/tokenize-python.py b/cad/src/tools/Refactoring/tokenize-python.py index 1a8204860..28732e03a 100755 --- a/cad/src/tools/Refactoring/tokenize-python.py +++ b/cad/src/tools/Refactoring/tokenize-python.py @@ -8,17 +8,89 @@ History: bruce 080616 drafted this from an old script I had at home, py-tokenize.py -TODO: - -add options to remove certain kinds of output, -to make it more useful for comparing source files for important changes -while ignoring some kinds of changes. +russ 080617 Added options to format whitespace and to remove certain kinds of +output, to make it more useful for comparing source files for important changes +while ignoring unimportant changes. """ import sys +from os.path import basename +from getopt import getopt, GetoptError from tokenize import generate_tokens, tok_name from pprint import pprint +usageMsg = '''usage: %s [-t] [-a] [-l] [-c] [-d] [-s n] [-o] files... + When multiple files are given, "======= file" separators are inserted. + + -t - Print raw token types and strings. (The default is to print + tokens in an indented text form with generated whitespace.) + + -a - Suppress all, same as "-lcd". + -l - Suppress printing line breaks within a statement (NL tokens.) + -c - Suppress printing comments. + -d - Suppress printing doc strings. + + -s n - Print just the "signature" of each line (the first n words.) + -o - Suppress printing operators in the signature. + + Examples: + To check on changes other than comments and file formatting, use the "-a" + option on the before-and-after versions and diff them. You can do a whole + directory with *.py . + cd before; %s -a *.py > ../before.pytok + cd ../after; %s -a *.py > ../after.pytok + cd ..; diff -C 1 before.pytok after.pytok > pytok.diffs + + Use "-aos 2" to concentrate more closely on statement indentation changes. + + indent-diff.gawk filters out *just* indentation changes from the diffs. 
+ indent-diff.gawk pytok.diffs > pytok.indent-diff +''' +def usage(): + pgm = basename(sys.argv[0]) + print >> sys.stderr, usageMsg % (3*(pgm,)) + return + +# Option variables. +printToks = False +noNLs = False +noComments = False +noDocStrs = False +sigWords = 0 +noOps = False +filenames = None + +def doOpts(): + global printToks, noNLs, noComments, noDocStrs, sigWords, noOps, filenames + try: + opts, filenames = getopt(sys.argv[1:], "talcds:o") + except GetoptError, err: + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) + pass + for opt, val in opts: + if opt == "-t": + printToks = True + elif opt == "-a": + noNLs = noComments = noDocStrs = True + elif opt == "-l": + noNLs = True + elif opt == "-c": + noComments = True + elif opt == "-d": + noDocStrs = True + elif opt == "-s": + sigWords = int(val) + pass + elif opt == "-o": + noOps = True + else: + usage() + continue + return + def improve(tup5): typ, text, s, t, line = tup5 tokname = tok_name[typ] @@ -33,7 +105,135 @@ def py_tokenize(filename_in, file_out): li = list(g) file_in.close() li2 = map(improve, li) - pprint(li2, file_out) + if printToks: + pprint(li2, file_out) + else: + doText(li2, file_out) + pass + return + +def doText(tokStrs, file_out): + prevTok = 'NEWLINE' # Start out at the beginning of a line. + prevString = '' + firstTok = True + prevUnary = False + nTok = 1 # Token number in the line. + nWords = 0 # Number of words in the line. + indent = 0 + nlIndent = 0 # Second-level indent within a statement. + commentIndent = 0 + lineBuff = "" + + nlToks = ('NEWLINE', 'NL', 'COMMENT') # Comments have a newline at the end. + wordToks = ('NAME', 'NUMBER', 'STRING') + noSpaceOps = ('.', ':', '(', '[', '{', '}', ']', ')') + + for (tok, tokString) in tokStrs: + # Various things to ignore. 
+ if (tok == 'NL' and noNLs or + tok == 'COMMENT' and noComments or + prevTok == 'NEWLINE' and tok == 'STRING' and noDocStrs or + tok != 'NEWLINE' and sigWords > 0 and nWords >= sigWords or + tok == 'OP' and noOps): + continue + + # Alter the indentation level. (These may occur after a NEWLINE token.) + if tok == 'INDENT': + indent += 4 + continue + if tok == 'DEDENT': + indent -= 4 + continue + + # NEWLINE is the logical end of statement, as opposed to NL, which is + # mere formatting whitespace. Comments also end lines. + if tok in nlToks: + if not firstTok or tok == 'COMMENT': + + # Indentation for comments. + if tok == 'COMMENT': + if nTok == 1: + lineBuff += (indent + commentIndent) * " " + else: + lineBuff += " " + pass + pass + + # Output the last token on the line, and then the line. + lineBuff += tokString + + # Output the line. + if not noNLs or lineBuff != "\n": + file_out.write(lineBuff) + lineBuff = "" + pass + + # Second-level indent within a statement, after a NL token + # that isn't at the beginning of a line. + if tok == 'NL' and nTok > 1: + nlIndent = 4 + else: + nlIndent = 0 + pass + + pass + else: + + # Generate indentation at the beginning of a line. + if lineBuff == "": + lineBuff = (indent + nlIndent) * " " + + # Put spaces around word tokens, but not before commas, after unary + # ops, or around some special binary ops. + if (nTok > 1 and # Never before the first token in the line. + # When we might put out a space before this token. + (prevTok in wordToks or tok in wordToks or tok == 'OP') and + # When we wouldn't put out a space before this token. + not ( prevUnary or + (prevTok == 'OP' and (prevString in noSpaceOps + and tok != 'OP')) or + tok == 'OP' and (tokString == ',' + or tokString in noSpaceOps)) ): + lineBuff += " " + pass + + # Output the token. + lineBuff += tokString + pass + + # Carry over a little bit of context from the last two tokens. 
+ prevUnary = (tok == 'OP' and tokString != ',' and + tokString not in noSpaceOps and + prevTok == 'OP' and prevString not in noSpaceOps) + # Bug: Comments tokens after a colon appear *before* the INDENT, and + # similarly after pass, continue, and return, *before* the DEDENT. + if tok in nlToks: + if prevTok == 'OP' and prevString == ':': + commentIndent = 4 + elif (prevTok == 'NAME' and + prevString in ('pass', 'continue', 'return')): + commentIndent = -4 + else: + commentIndent = 0 + pass + + # The current token becomes the previous. + prevTok = tok + prevString = tokString + firstTok = False # So we know there really was a previous token. + + # Reset the token and word counters after a newline. + if tok in nlToks: + nTok = 1 + nWords = 0 + else: + nTok += 1 + if tok in wordToks: + nWords += 1 + pass + + continue + return def dofiles(filenames): for filename in filenames: @@ -43,11 +243,11 @@ def dofiles(filenames): return def run(): - filenames = sys.argv[1:] + doOpts() if filenames: dofiles(filenames) else: - print >> sys.stderr, "usage: ..." + usage() return if __name__ == '__main__': |