Tools for checking large whitespace/line-length changes.

tokenize-python.py - Added options to format whitespace and to remove certain kinds of output, to make it more useful for comparing source files for important changes while ignoring unimportant changes. indent-diff.gawk - Filter indentation changes from context diffs. Only diff line groups with indentation changes are printed.
author: Russell D. Fish <fish@cs.utah.edu> 2008-06-19 18:19:33 +0000
committer: Russell D. Fish <fish@cs.utah.edu> 2008-06-19 18:19:33 +0000
commit: 40c76baae020696f879136227dcc4ece11aabc27 (patch)
tree: dfca3b8f2dd8309a1d271d4892e49c401e34b75a
parent: 5e486b37ec55d8039f3f3659c8b76971ad9ad33e (diff)
download: nanoengineer-40c76baae020696f879136227dcc4ece11aabc27.tar.gz
nanoengineer-40c76baae020696f879136227dcc4ece11aabc27.zip
2 files changed, 326 insertions, 8 deletions
diff --git a/cad/src/tools/Refactoring/indent-diff.gawk b/cad/src/tools/Refactoring/indent-diff.gawk
new file mode 100755
index 000000000..6ed20112c
--- /dev/null
+++ b/cad/src/tools/Refactoring/indent-diff.gawk
@@ -0,0 +1,118 @@
+#!/usr/bin/env gawk -f
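+# NOTE(review): "env gawk -f" needs a kernel/env that splits "#!" arguments (Linux passes "gawk -f" as one word) -- confirm, or run as "gawk -f indent-diff.gawk".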
+# indent-diff.gawk -- Filter indentation changes from context diffs.
+#    Only diff line groups with indentation changes are printed.
+#
+#   Indentation is significant for control structures in Python code.
+#   On matching change lines (starting with a "!" in context diffs),
+#   if the indentation is different, the indentation count is inserted
+#   after the "! " to highlight the change.  e.g:
+#
+#      ***************
+#      *** 33,35 ****
+#                    draw_vane( bond, a1py, a2py, ord_pi_y, rad, col)
+#      !  8      if ord_pi_z:
+#                    draw_vane( bond, a1pz, a2pz, ord_pi_z, rad, col) 
+#      --- 33,35 ----
+#                    draw_vane( bond, a1py, a2py, ord_pi_y, rad, col)
+#      ! 12          if ord_pi_z:
+#                    draw_vane( bond, a1pz, a2pz, ord_pi_z, rad, col) 
+#
+#   Note: currently assumes that indentation is done with spaces, not tabs.
+
+BEGIN {
+    debug = 0 ## 1  (set to 1 to trace signature and indentation matching)
+    # Indentation tolerance: differences of up to mindiff columns count as a match.
+    mindiff = 2
+}
+
+# Capture the file header lines (marker followed by a letter, unlike "*** 33,35 ****" range lines).
+/^\*\*\* [a-zA-Z]/ {
+    if ( entry ) doGroup(); entry = 0;   # Flush the last group of the previous file.
+    ofile = $0;
+    didHdr = 0;                          # doGroup() prints the header pair on demand.
+    if ( debug ) print "Starting", ofile;
+    next;
+}
+/^\-\-\- [a-zA-Z]/ { nfile = $0; next; }
+
+# Collect and process diff line groups.
+# Line group separator: flush the previous group and start collecting a new one.
+/^\*\*\*\*\*\*/ {
+    if ( entry ) doGroup();
+    entry = 1; oline = 1; nline = 0;     # nline stays 0 while in the old section.
+}
+# Separator between old and new line sections.
+/^--- [1-9]/ { nline = 1; }
+# Store lines, separators included (the two rules just above have no "next").
+{ if ( nline ) nlines[nline++] = $0; else olines[oline++] = $0; next; }
+END { if ( entry ) doGroup(); }         # Flush the final group.
+
+# Process a diff line group: print it unless its "!" lines are known to match in indentation.
+function doGroup() {
+    # Compare the indentation on old and new lines in a line group.
+    nonmatches = 0;
+    for ( o = n = 1; o < oline && n < nline; o++ ) {
+        osig = sig( ol = olines[o] );
+        # Ignore old, non-change ("!") lines.
+        if ( substr(ol, 1, 1) != "!" ) continue;
+        # Look for a corresponding new change line.
+        nonmatched = 1;
+        oind = indLen(ol);
+        for ( ; n < nline; n++ ) {
+            nsig = sig( nl = nlines[n] );
+            ##printf "sigs\n  %s|%s\n  %s|%s\n", osig, ol, nsig, nl;
+            # Ignore new, non-change ("!") lines.
+            if ( substr(nl, 1, 1) != "!" ) continue;
+            if ( nsig != osig ) {
+                if ( debug ) printf "different signatures\n  %s\n  %s\n", ol, nl;
+            } else {
+                # Matching signatures, compare indentation.
+                nind = indLen(nl);
+                if ( nind <= (oind + mindiff) && nind >= (oind - mindiff)  ) {
+                    if ( debug ) printf "matched sigs & indentation\n  %s\n  %s\n", ol, nl;
+                    nonmatched = 0;
+                } else {
+                    # Non-match: Insert the indentation lengths in the lines to show where.
+                    # The count replaces at most two indentation blanks, never any code.
+                    olines[o] = substr(olines[o], 1, 2) sprintf("%2d", oind) substr(olines[o], (oind < 2) ? 3 + oind : 5);
+                    nlines[n] = substr(nlines[n], 1, 2) sprintf("%2d", nind) substr(nlines[n], (nind < 2) ? 3 + nind : 5);
+                    if ( debug ) printf "Different indentation\n  %s\n  %s\n", olines[o], nlines[n]
+                }
+                n++;   # Matched signatures, go on to next line.
+                break; # Out of the new-line loop.
+            }
+        }
+        nonmatches += nonmatched
+    }
+
+    # Print line groups with indentation that isn't known to match.
+    if ( nonmatches ) {
+        if ( ! didHdr ) { print ofile; print nfile; didHdr = 1; }
+        for ( o = 1; o < oline; o++ ) print olines[o];
+        for ( n = 1; n < nline; n++ ) print nlines[n];
+    }
+}
+
+# Return the first two words on the line, joined by one space, as its signature.
+function sig(line) {
+    notWord = "[^a-zA-Z0-9_]+";   # One or more non-word characters (required at the line start).
+    word = "([a-zA-Z0-9_]+)";     # A captured run of letters, digits and underscores.
+    twoWords = "^" notWord word notWord word ".*";
+    oneWord = "^" notWord word ".*";
+    ret = gensub(twoWords, "\\1 \\2", 1, line);
+
+    # There may not be two words on the line, or any; an unchanged result means no match.
+    if ( ret == line ) ret = gensub(oneWord, "\\1", 1, line);
+    if ( ret == line ) ret = ""
+
+    ##print "sig", match(line, pat), ret
+    return ret;
+}
+    
+# The length of indentation (leading spaces only, tabs are not counted) on a line in a context diff entry.
+function indLen(line) {
+    # Skip the first two characters (e.g. "! "), which are prefixed by diff.
+    return length(gensub("..( *).*", "\\1", 1, line))
+}
+
diff --git a/cad/src/tools/Refactoring/tokenize-python.py b/cad/src/tools/Refactoring/tokenize-python.py
index 1a8204860..28732e03a 100755
--- a/cad/src/tools/Refactoring/tokenize-python.py
+++ b/cad/src/tools/Refactoring/tokenize-python.py
@@ -8,17 +8,89 @@ History:
 
 bruce 080616 drafted this from an old script I had at home, py-tokenize.py
 
-TODO:
-
-add options to remove certain kinds of output,
-to make it more useful for comparing source files for important changes
-while ignoring some kinds of changes.
+russ 080617 Added options to format whitespace and to remove certain kinds of
+output, to make it more useful for comparing source files for important changes
+while ignoring unimportant changes.
 """
 import sys
 
+from os.path import basename
+from getopt import getopt, GetoptError
 from tokenize import generate_tokens, tok_name
 from pprint import pprint
 
+usageMsg = '''usage: %s [-t] [-a] [-l] [-c] [-d] [-s n] [-o] files...
+    When multiple files are given, "======= file" separators are inserted.
+
+    -t - Print raw token types and strings. (The default is to print
+         tokens in an indented text form with generated whitespace.)
+
+    -a - Suppress all, same as "-lcd".
+    -l - Suppress printing line breaks within a statement (NL tokens.)
+    -c - Suppress printing comments.
+    -d - Suppress printing doc strings.
+
+    -s n - Print just the "signature" of each line (the first n words.)
+    -o - Suppress printing operators in the signature.
+
+    Examples:
+      To check on changes other than comments and file formatting, use the "-a"
+      option on the before-and-after versions and diff them.  You can do a whole
+      directory with *.py .
+        cd before; %s -a *.py > ../before.pytok
+        cd ../after; %s -a *.py > ../after.pytok
+        cd ..; diff -C 1 before.pytok after.pytok > pytok.diffs
+
+      Use "-aos 2" to concentrate more closely on statement indentation changes.
+
+      indent-diff.gawk filters out *just* indentation changes from the diffs.
+        indent-diff.gawk pytok.diffs > pytok.indent-diff
+'''
+def usage():
+    """Print the usage message on stderr, with the program's base name filled in."""
+    progName = basename(sys.argv[0])
+    print >> sys.stderr, usageMsg % ((progName,) * 3)
+
+# Option variables, set from the command line by doOpts().
+printToks = False                       # -t: pprint the token list instead of generating text.
+noNLs = False                           # -l: skip NL tokens (line breaks within a statement).
+noComments = False                      # -c: skip COMMENT tokens.
+noDocStrs = False                       # -d: skip STRING tokens that directly follow a NEWLINE.
+sigWords = 0                            # -s n: words kept per line; 0 keeps whole lines.
+noOps = False                           # -o: skip OP tokens.
+filenames = None                        # Non-option arguments: the files to process.
+
+def doOpts():
+    """Parse sys.argv into the option globals; complain on stderr and exit(2) on bad options."""
+    global printToks, noNLs, noComments, noDocStrs, sigWords, noOps, filenames
+    try:
+        opts, filenames = getopt(sys.argv[1:], "talcds:o")
+    except GetoptError, err:
+        # Not on stdout, which the documented usage redirects into the output file.
+        print >> sys.stderr, str(err)   # Something like "option -x not recognized".
+        usage()
+        sys.exit(2)
+    for opt, val in opts:
+        if opt == "-t":
+            printToks = True
+        elif opt == "-a":
+            noNLs = noComments = noDocStrs = True
+        elif opt == "-l":
+            noNLs = True
+        elif opt == "-c":
+            noComments = True
+        elif opt == "-d":
+            noDocStrs = True
+        elif opt == "-s":
+            try:
+                sigWords = int(val)
+            except ValueError:          # Report a bad count like any other bad option.
+                print >> sys.stderr, "option -s requires an integer argument, not %r" % val
+                usage()
+                sys.exit(2)
+        elif opt == "-o":
+            noOps = True
+
 def improve(tup5):
     typ, text, s, t, line = tup5
     tokname = tok_name[typ]
@@ -33,7 +105,135 @@ def py_tokenize(filename_in, file_out):
     li = list(g)
     file_in.close()
     li2 = map(improve, li)
-    pprint(li2, file_out)
+    if printToks:
+        pprint(li2, file_out)
+    else:
+        doText(li2, file_out)
+        pass
+    return
+
+def doText(tokStrs, file_out):          # Write (token name, string) pairs as re-indented text.
+    prevTok = 'NEWLINE'                 # Start out at the beginning of a line.
+    prevString = ''
+    firstTok = True                     # Nothing processed yet; suppresses leading blank lines.
+    prevUnary = False                   # Previous token was an operator right after an operator.
+    nTok = 1                            # Token number in the line.
+    nWords = 0                          # Number of words in the line.
+    indent = 0                          # Block indentation, in columns (4 per INDENT).
+    nlIndent = 0                       # Second-level indent within a statement.
+    commentIndent = 0                   # Adjustment for comments that precede their INDENT/DEDENT.
+    lineBuff = ""                       # The output line being assembled.
+    
+    nlToks = ('NEWLINE', 'NL', 'COMMENT') # Comments have a newline at the end.
+    wordToks = ('NAME', 'NUMBER', 'STRING')
+    noSpaceOps = ('.', ':', '(', '[', '{', '}', ']', ')')
+
+    for (tok, tokString) in tokStrs:
+        # Various things to ignore.
+        if (tok == 'NL' and noNLs or
+            tok == 'COMMENT' and noComments or
+            prevTok == 'NEWLINE' and tok == 'STRING' and noDocStrs or
+            tok != 'NEWLINE' and sigWords > 0 and nWords >= sigWords or  # Past the signature.
+            tok == 'OP' and noOps):
+            continue
+
+        # Alter the indentation level.  (These may occur after a NEWLINE token.)
+        if tok == 'INDENT':
+            indent += 4
+            continue
+        if tok == 'DEDENT':
+            indent -= 4
+            continue
+
+        # NEWLINE is the logical end of statement, as opposed to NL, which is
+        # mere formatting whitespace.  Comments also end lines.
+        if tok in nlToks:
+            if not firstTok or tok == 'COMMENT':
+
+                # Indentation for comments.
+                if tok == 'COMMENT':
+                    if nTok == 1:
+                        lineBuff += (indent + commentIndent) * " "
+                    else:
+                        lineBuff += " "
+                        pass
+                    pass
+
+                # Output the last token on the line, and then the line.
+                lineBuff += tokString
+
+                # Output the line.
+                if not noNLs or lineBuff != "\n":
+                    file_out.write(lineBuff)
+                lineBuff = ""
+                pass
+
+                # Second-level indent within a statement, after a NL token
+                # that isn't at the beginning of a line.
+                if tok == 'NL' and nTok > 1:
+                    nlIndent = 4
+                else:
+                    nlIndent = 0
+                pass
+
+            pass
+        else:
+
+            # Generate indentation at the beginning of a line.
+            if lineBuff == "":
+                lineBuff = (indent + nlIndent) * " "
+
+            # Put spaces around word tokens, but not before commas, after unary
+            # ops, or around some special binary ops.
+            if (nTok > 1 and         # Never before the first token in the line.
+                # When we might put out a space before this token.
+                (prevTok in wordToks or tok in wordToks or tok == 'OP') and
+                # When we wouldn't put out a space before this token.
+                not ( prevUnary or
+                      (prevTok == 'OP' and (prevString in noSpaceOps
+                                            and tok != 'OP')) or
+                      tok == 'OP' and (tokString == ','
+                                       or tokString in noSpaceOps)) ):
+                lineBuff += " "
+                pass
+
+            # Output the token.
+            lineBuff += tokString
+            pass
+
+        # Carry over a little bit of context from the last two tokens.
+        prevUnary = (tok == 'OP' and tokString != ',' and
+                     tokString not in noSpaceOps and
+                     prevTok == 'OP' and prevString not in noSpaceOps)
+        # Bug: Comments tokens after a colon appear *before* the INDENT, and
+        # similarly after pass, continue, and return, *before* the DEDENT.
+        if tok in nlToks:
+            if prevTok == 'OP' and prevString == ':':
+                commentIndent = 4
+            elif (prevTok == 'NAME' and
+                  prevString in ('pass', 'continue', 'return')):
+                commentIndent = -4
+        else:
+            commentIndent = 0
+            pass
+            
+        # The current token becomes the previous.
+        prevTok = tok
+        prevString = tokString
+        firstTok = False         # So we know there really was a previous token.
+
+        # Reset the token and word counters after a newline.
+        if tok in nlToks:
+            nTok = 1
+            nWords = 0
+        else:
+            nTok += 1
+            if tok in wordToks:
+                nWords += 1
+            pass
+
+        continue
+    return
 
 def dofiles(filenames):
     for filename in filenames:
@@ -43,11 +243,11 @@ def dofiles(filenames):
     return
 
 def run():
-    filenames = sys.argv[1:]
+    doOpts()
     if filenames:
         dofiles(filenames)
     else:
-        print >> sys.stderr, "usage: ..."
+        usage()
     return
 
 if __name__ == '__main__':
author	Russell D. Fish <fish@cs.utah.edu>	2008-06-19 18:19:33 +0000
committer	Russell D. Fish <fish@cs.utah.edu>	2008-06-19 18:19:33 +0000
commit	40c76baae020696f879136227dcc4ece11aabc27 (patch)
tree	dfca3b8f2dd8309a1d271d4892e49c401e34b75a
parent	5e486b37ec55d8039f3f3659c8b76971ad9ad33e (diff)
download	nanoengineer-40c76baae020696f879136227dcc4ece11aabc27.tar.gz nanoengineer-40c76baae020696f879136227dcc4ece11aabc27.zip