summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRussell D. Fish <fish@cs.utah.edu>2008-06-19 18:19:33 +0000
committerRussell D. Fish <fish@cs.utah.edu>2008-06-19 18:19:33 +0000
commit40c76baae020696f879136227dcc4ece11aabc27 (patch)
treedfca3b8f2dd8309a1d271d4892e49c401e34b75a
parent5e486b37ec55d8039f3f3659c8b76971ad9ad33e (diff)
downloadnanoengineer-40c76baae020696f879136227dcc4ece11aabc27.tar.gz
nanoengineer-40c76baae020696f879136227dcc4ece11aabc27.zip
Tools for checking large whitespace/line-length changes.
tokenize-python.py - Added options to format whitespace and to remove certain kinds of output, to make it more useful for comparing source files for important changes while ignoring unimportant changes. indent-diff.gawk - Filter indentation changes from context diffs. Only diff line groups with indentation changes are printed.
-rwxr-xr-xcad/src/tools/Refactoring/indent-diff.gawk118
-rwxr-xr-xcad/src/tools/Refactoring/tokenize-python.py216
2 files changed, 326 insertions, 8 deletions
diff --git a/cad/src/tools/Refactoring/indent-diff.gawk b/cad/src/tools/Refactoring/indent-diff.gawk
new file mode 100755
index 000000000..6ed20112c
--- /dev/null
+++ b/cad/src/tools/Refactoring/indent-diff.gawk
@@ -0,0 +1,118 @@
+#!/usr/bin/env -S gawk -f
+#
+# indent-diff.gawk -- Filter indentation changes from context diffs.
+# Only diff line groups with indentation changes are printed.
+#
+# Indentation is significant for control structures in Python code.
+# On matching change lines (starting with a "!" in context diffs),
+# if the indentation is different, the indentation count is inserted
+# after the "! " to highlight the change. e.g:
+#
+# ***************
+# *** 33,35 ****
+# draw_vane( bond, a1py, a2py, ord_pi_y, rad, col)
+# ! 8 if ord_pi_z:
+# draw_vane( bond, a1pz, a2pz, ord_pi_z, rad, col)
+# --- 33,35 ----
+# draw_vane( bond, a1py, a2py, ord_pi_y, rad, col)
+# ! 12 if ord_pi_z:
+# draw_vane( bond, a1pz, a2pz, ord_pi_z, rad, col)
+#
+# Note: currently assumes that indentation is done with spaces, not tabs.
+#
+# The interpreter line uses "env -S" because on Linux everything after the
+# interpreter path on a "#!" line is passed as ONE argument; a plain
+# "env gawk -f" would search for a program literally named "gawk -f".
+
+BEGIN {
+ debug = 0 ## 1
+ # Minimum indentation difference to report.
+ mindiff = 2
+}
+
+# Capture the file header lines.
+# A new old-file header flushes any pending line group and clears didHdr so
+# that the header pair is printed again for the first reported group of the
+# new file.
+/^\*\*\* [a-zA-Z]/ {
+ if ( entry ) doGroup(); entry = 0;
+ ofile = $0;
+ didHdr = 0;
+ if ( debug ) print "Starting", ofile;
+ next;
+}
+/^\-\-\- [a-zA-Z]/ { nfile = $0; next; }
+
+# Collect and process diff line groups.
+# The two separator rules below have no "next", so the separator lines
+# themselves are also stored by the catch-all rule that follows them.
+# Line group separator.
+/^\*\*\*\*\*\*/ {
+ if ( entry ) doGroup();
+ entry = 1; oline = 1; nline = 0;
+}
+# Separator between old and new line sections.
+/^--- [1-9]/ { nline = 1; }
+# Store lines.
+# nline stays 0 until the old/new separator is seen, which selects the array.
+{ if ( nline ) nlines[nline++] = $0; else olines[oline++] = $0; next; }
+# Flush the last line group of the input.
+END { if ( entry ) doGroup(); }
+
+# Process a diff line group within a file diff.
+#
+# Pairs each old change ("!") line with the next new change line that has the
+# same signature (see sig()) and compares their indentation.  The group is
+# printed, preceded once per file by the two file header lines, when at least
+# one examined old change line has no same-signature partner whose
+# indentation is within mindiff of its own.
+#
+# Reads the globals olines/oline, nlines/nline, ofile, nfile, didHdr, mindiff
+# and debug.  The loop variables used here are globals as well.
+function doGroup() {
+    # Compare the indentation on old and new lines in a line group.
+    nonmatches = 0;
+    for ( o = n = 1; o < oline && n < nline; o++ ) {
+        osig = sig( ol = olines[o] );
+        # Ignore old, non-change ("!") lines.
+        if ( substr(ol, 1, 1) != "!" ) continue;
+        # Look for a corresponding new change line.
+        nonmatched = 1;
+        oind = indLen(ol);
+        for ( ; n < nline; n++ ) {
+            nsig = sig( nl = nlines[n] );
+            ##printf "sigs\n %s|%s\n %s|%s\n", osig, ol, nsig, nl;
+            # Ignore new, non-change ("!") lines.
+            if ( substr(nl, 1, 1) != "!" ) continue;
+            if ( nsig != osig ) {
+                if ( debug ) printf "different signatures\n %s\n %s\n", ol, nl;
+            } else {
+                # Matching signatures, compare indentation.
+                nind = indLen(nl);
+                if ( nind <= (oind + mindiff) && nind >= (oind - mindiff) ) {
+                    if ( debug ) printf "matched sigs & indentation\n %s\n %s\n", ol, nl;
+                    nonmatched = 0;
+                }
+                else {
+                    # Non-match: Insert the indentation lengths in the lines to show where.
+                    olines[o] = tagIndent(olines[o], oind);
+                    nlines[n] = tagIndent(nlines[n], nind);
+                    if ( debug ) printf "Different indentation\n %s\n %s\n", olines[o], nlines[n]
+                }
+                n++; # Matched signatures, go on to next line.
+                break; # Out of the new-line loop.
+            }
+        }
+        nonmatches += nonmatched
+    }
+
+    # Print line groups with indentation that isn't known to match.
+    if ( nonmatches ) {
+        if ( ! didHdr ) { print ofile; print nfile; didHdr = 1; }
+        for ( o = 1; o < oline; o++ ) print olines[o];
+        for ( n = 1; n < nline; n++ ) print nlines[n];
+    }
+}
+
+# Return line with the indentation count ind written right after the
+# two-character diff prefix.
+#
+# The two-digit count replaces at most the first two indentation spaces, which
+# keeps the code in its original column when the indentation is 2 or more.
+# When the indentation is 0 or 1 there are fewer than two spaces to replace,
+# so only those are dropped; unconditionally cutting two characters would
+# delete the beginning of the code on the line.
+function tagIndent(line, ind,    keep) {
+    keep = (ind < 2) ? ind : 2;
+    return substr(line, 1, 2) sprintf("%2d", ind) substr(line, 3 + keep);
+}
+
+# Use the first two words on the line as a signature.
+#
+# A word is a run of letters, digits and underscores.  Both patterns require
+# at least one non-word character before the first word.
+# Returns "word1 word2" when the two-word pattern matches, "word1" when only
+# the one-word pattern matches, and "" when neither does.
+function sig(line) {
+ notWord = "[^a-zA-Z0-9_]+";
+ word = "([a-zA-Z0-9_]+)";
+ twoWords = "^" notWord word notWord word ".*";
+ oneWord = "^" notWord word ".*";
+ ret = gensub(twoWords, "\\1 \\2", 1, line);
+
+ # There may not be two words on the line, or any.
+ # A result equal to the input is taken to mean the pattern did not match.
+ if ( ret == line ) ret = gensub(oneWord, "\\1", 1, line);
+ if ( ret == line ) ret = ""
+
+ ##print "sig", match(line, pat), ret
+ return ret;
+}
+
+# The length of indentation on a line in a context diff entry.
+#
+# Counts only the space characters that directly follow the first two
+# characters of the line; tab characters are not counted.
+function indLen(line) {
+ # Skip the first two characters, which are prefixed by diff.
+ return length(gensub("..( *).*", "\\1", 1, line))
+}
+
diff --git a/cad/src/tools/Refactoring/tokenize-python.py b/cad/src/tools/Refactoring/tokenize-python.py
index 1a8204860..28732e03a 100755
--- a/cad/src/tools/Refactoring/tokenize-python.py
+++ b/cad/src/tools/Refactoring/tokenize-python.py
@@ -8,17 +8,89 @@ History:
bruce 080616 drafted this from an old script I had at home, py-tokenize.py
-TODO:
-
-add options to remove certain kinds of output,
-to make it more useful for comparing source files for important changes
-while ignoring some kinds of changes.
+russ 080617 Added options to format whitespace and to remove certain kinds of
+output, to make it more useful for comparing source files for important changes
+while ignoring unimportant changes.
"""
import sys
+from os.path import basename
+from getopt import getopt, GetoptError
from tokenize import generate_tokens, tok_name
from pprint import pprint
+usageMsg = '''usage: %s [-t] [-a] [-l] [-c] [-d] [-s n] [-o] files...
+ When multiple files are given, "======= file" separators are inserted.
+
+ -t - Print raw token types and strings. (The default is to print
+ tokens in an indented text form with generated whitespace.)
+
+ -a - Suppress all, same as "-lcd".
+ -l - Suppress printing line breaks within a statement (NL tokens.)
+ -c - Suppress printing comments.
+ -d - Suppress printing doc strings.
+
+ -s n - Print just the "signature" of each line (the first n words.)
+ -o - Suppress printing operators in the signature.
+
+ Examples:
+ To check on changes other than comments and file formatting, use the "-a"
+ option on the before-and-after versions and diff them. You can do a whole
+ directory with *.py .
+ cd before; %s -a *.py > ../before.pytok
+ cd ../after; %s -a *.py > ../after.pytok
+ cd ..; diff -C 1 before.pytok after.pytok > pytok.diffs
+
+ Use "-aos 2" to concentrate more closely on statement indentation changes.
+
+ indent-diff.gawk filters out *just* indentation changes from the diffs.
+ indent-diff.gawk pytok.diffs > pytok.indent-diff
+'''
+def usage():
+ """Print the usage message to stderr, with the program's base name filled in."""
+ pgm = basename(sys.argv[0])
+ # usageMsg has three %s slots; each one receives the program name.
+ print >> sys.stderr, usageMsg % (3*(pgm,))
+ return
+
+# Option variables.
+# These module-level defaults are overwritten by doOpts().
+# -t: py_tokenize() pretty-prints the token list instead of calling doText().
+printToks = False
+# -l (or -a): doText() skips NL tokens.
+noNLs = False
+# -c (or -a): doText() skips COMMENT tokens.
+noComments = False
+# -d (or -a): doText() skips a STRING token that directly follows a NEWLINE.
+noDocStrs = False
+# -s n: when greater than 0, doText() stops printing a line's tokens after n words.
+sigWords = 0
+# -o: doText() skips OP tokens.
+noOps = False
+# The non-option command line arguments, as returned by getopt.
+filenames = None
+
+def doOpts():
+ global printToks, noNLs, noComments, noDocStrs, sigWords, noOps, filenames
+ try:
+ opts, filenames = getopt(sys.argv[1:], "talcds:o")
+ except GetoptError, err:
+ # print help information and exit:
+ print str(err) # will print something like "option -a not recognized"
+ usage()
+ sys.exit(2)
+ pass
+ for opt, val in opts:
+ if opt == "-t":
+ printToks = True
+ elif opt == "-a":
+ noNLs = noComments = noDocStrs = True
+ elif opt == "-l":
+ noNLs = True
+ elif opt == "-c":
+ noComments = True
+ elif opt == "-d":
+ noDocStrs = True
+ elif opt == "-s":
+ sigWords = int(val)
+ pass
+ elif opt == "-o":
+ noOps = True
+ else:
+ usage()
+ continue
+ return
+
def improve(tup5):
typ, text, s, t, line = tup5
tokname = tok_name[typ]
@@ -33,7 +105,135 @@ def py_tokenize(filename_in, file_out):
li = list(g)
file_in.close()
li2 = map(improve, li)
- pprint(li2, file_out)
+ if printToks:
+ pprint(li2, file_out)
+ else:
+ doText(li2, file_out)
+ pass
+ return
+
+def doText(tokStrs, file_out):
+ """Write the tokens in tokStrs to file_out as text with generated whitespace.
+
+ tokStrs is iterated as (tok, tokString) pairs; tok is compared against
+ token names such as 'NEWLINE', 'NL', 'COMMENT', 'INDENT', 'DEDENT' and 'OP'.
+ Text is accumulated in a line buffer that is written to file_out when a
+ NEWLINE, NL or COMMENT token is processed.  The indentation level changes
+ by 4 for every INDENT or DEDENT token.
+
+ The module-level options noNLs, noComments, noDocStrs, sigWords and noOps
+ select tokens that are skipped.
+ """
+ prevTok = 'NEWLINE' # Start out at the beginning of a line.
+ prevString = ''
+ firstTok = True
+ prevUnary = False
+ nTok = 1 # Token number in the line.
+ nWords = 0 # Number of words in the line.
+ indent = 0
+ nlIndent = 0 # Second-level indent within a statement.
+ commentIndent = 0
+ lineBuff = ""
+
+ nlToks = ('NEWLINE', 'NL', 'COMMENT') # Comments have a newline at the end.
+ wordToks = ('NAME', 'NUMBER', 'STRING')
+ noSpaceOps = ('.', ':', '(', '[', '{', '}', ']', ')')
+
+ for (tok, tokString) in tokStrs:
+ # Various things to ignore.
+ # With sigWords > 0, every token except NEWLINE is dropped once the line
+ # already has sigWords words.  Skipped tokens do not update any of the
+ # "previous token" state below.
+ if (tok == 'NL' and noNLs or
+ tok == 'COMMENT' and noComments or
+ prevTok == 'NEWLINE' and tok == 'STRING' and noDocStrs or
+ tok != 'NEWLINE' and sigWords > 0 and nWords >= sigWords or
+ tok == 'OP' and noOps):
+ continue
+
+ # Alter the indentation level. (These may occur after a NEWLINE token.)
+ if tok == 'INDENT':
+ indent += 4
+ continue
+ if tok == 'DEDENT':
+ indent -= 4
+ continue
+
+ # NEWLINE is the logical end of statement, as opposed to NL, which is
+ # mere formatting whitespace. Comments also end lines.
+ if tok in nlToks:
+ # firstTok is True only until the first token that gets past the
+ # skips above, so a newline token in that position is not output; a
+ # comment is always output.
+ if not firstTok or tok == 'COMMENT':
+
+ # Indentation for comments.
+ if tok == 'COMMENT':
+ if nTok == 1:
+ lineBuff += (indent + commentIndent) * " "
+ else:
+ lineBuff += " "
+ pass
+ pass
+
+ # Output the last token on the line, and then the line.
+ lineBuff += tokString
+
+ # Output the line.
+ if not noNLs or lineBuff != "\n":
+ file_out.write(lineBuff)
+ lineBuff = ""
+ pass
+
+ # Second-level indent within a statement, after a NL token
+ # that isn't at the beginning of a line.
+ if tok == 'NL' and nTok > 1:
+ nlIndent = 4
+ else:
+ nlIndent = 0
+ pass
+
+ pass
+ else:
+
+ # Generate indentation at the beginning of a line.
+ if lineBuff == "":
+ lineBuff = (indent + nlIndent) * " "
+
+ # Put spaces around word tokens, but not before commas, after unary
+ # ops, or around some special binary ops.
+ if (nTok > 1 and # Never before the first token in the line.
+ # When we might put out a space before this token.
+ (prevTok in wordToks or tok in wordToks or tok == 'OP') and
+ # When we wouldn't put out a space before this token.
+ not ( prevUnary or
+ (prevTok == 'OP' and (prevString in noSpaceOps
+ and tok != 'OP')) or
+ tok == 'OP' and (tokString == ','
+ or tokString in noSpaceOps)) ):
+ lineBuff += " "
+ pass
+
+ # Output the token.
+ lineBuff += tokString
+ pass
+
+ # Carry over a little bit of context from the last two tokens.
+ # An operator counts as unary here when it directly follows another
+ # operator and neither of the two is a comma or one of noSpaceOps.
+ prevUnary = (tok == 'OP' and tokString != ',' and
+ tokString not in noSpaceOps and
+ prevTok == 'OP' and prevString not in noSpaceOps)
+ # Bug: Comments tokens after a colon appear *before* the INDENT, and
+ # similarly after pass, continue, and return, *before* the DEDENT.
+ if tok in nlToks:
+ if prevTok == 'OP' and prevString == ':':
+ commentIndent = 4
+ elif (prevTok == 'NAME' and
+ prevString in ('pass', 'continue', 'return')):
+ commentIndent = -4
+ else:
+ commentIndent = 0
+ pass
+
+ # The current token becomes the previous.
+ prevTok = tok
+ prevString = tokString
+ firstTok = False # So we know there really was a previous token.
+
+ # Reset the token and word counters after a newline.
+ if tok in nlToks:
+ nTok = 1
+ nWords = 0
+ else:
+ nTok += 1
+ if tok in wordToks:
+ nWords += 1
+ pass
+
+ continue
+ return
def dofiles(filenames):
for filename in filenames:
@@ -43,11 +243,11 @@ def dofiles(filenames):
return
def run():
- filenames = sys.argv[1:]
+ # doOpts() parses the command line and fills in the module-level option
+ # variables, including filenames.
+ doOpts()
if filenames:
dofiles(filenames)
else:
- print >> sys.stderr, "usage: ..."
+ usage()
return
if __name__ == '__main__':