summaryrefslogtreecommitdiff
path: root/cad/src/tools/Refactoring/tokenize-python.py
blob: 28732e03a4cd4032db8123a25e8cadcdd3d661f0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
#!/usr/bin/env python
"""
tokenize-python.py -- print tokenization of one or more input .py files

$Id$

History:

bruce 080616 drafted this from an old script I had at home, py-tokenize.py

russ 080617 Added options to format whitespace and to remove certain kinds of
output, to make it more useful for comparing source files for important changes
while ignoring unimportant changes.
"""
import sys

from os.path import basename
from getopt import getopt, GetoptError
from tokenize import generate_tokens, tok_name
from pprint import pprint

# Usage/help text printed by usage(); all three %s slots are filled with
# the program's basename (see usage()).
usageMsg = '''usage: %s [-t] [-a] [-l] [-c] [-d] [-s n] [-o] files...
    When multiple files are given, "======= file" separators are inserted.

    -t - Print raw token types and strings. (The default is to print
         tokens in an indented text form with generated whitespace.)

    -a - Suppress all, same as "-lcd".
    -l - Suppress printing line breaks within a statement (NL tokens.)
    -c - Suppress printing comments.
    -d - Suppress printing doc strings.

    -s n - Print just the "signature" of each line (the first n words.)
    -o - Suppress printing operators in the signature.

    Examples:
      To check on changes other than comments and file formatting, use the "-a"
      option on the before-and-after versions and diff them.  You can do a whole
      directory with *.py .
        cd before; %s -a *.py > ../before.pytok
        cd ../after; %s -a *.py > ../after.pytok
        cd ..; diff -C 1 before.pytok after.pytok > pytok.diffs

      Use "-aos 2" to concentrate more closely on statement indentation changes.

      indent-diff.gawk filters out *just* indentation changes from the diffs.
        indent-diff.gawk pytok.diffs > pytok.indent-diff
'''
def usage():
    """Print the usage/help message to stderr.

    The program's basename fills all three %s slots in usageMsg.
    """
    pgm = basename(sys.argv[0])
    # sys.stderr.write instead of the Python-2-only "print >>" statement,
    # so the function works under both Python 2 and Python 3.  The extra
    # "\n" reproduces the newline the print statement appended.
    sys.stderr.write(usageMsg % (3 * (pgm,)) + "\n")
    return

# Option variables, set from the command line by doOpts().
printToks = False   # -t: dump raw (token-name, text) pairs instead of text.
noNLs = False       # -l: drop NL tokens (line breaks within a statement).
noComments = False  # -c: drop COMMENT tokens.
noDocStrs = False   # -d: drop doc strings (a STRING right after a NEWLINE).
sigWords = 0        # -s n: print only the first n words of each line.
noOps = False       # -o: drop OP tokens from the signature.
filenames = None    # positional arguments: the .py files to tokenize.

def doOpts():
    """Parse sys.argv into the module-level option variables.

    Sets printToks, noNLs, noComments, noDocStrs, sigWords, noOps and
    filenames.  On an unrecognized option, prints the getopt error and
    the usage message, then exits with status 2.
    """
    global printToks, noNLs, noComments, noDocStrs, sigWords, noOps, filenames
    try:
        opts, filenames = getopt(sys.argv[1:], "talcds:o")
    except GetoptError as err:
        # "except E as err" (not the Python-2-only "except E, err") works
        # on Python 2.6+ and Python 3 alike.
        print(str(err))  # e.g. "option -x not recognized"
        usage()
        sys.exit(2)
    for opt, val in opts:
        if opt == "-t":
            printToks = True
        elif opt == "-a":
            # -a is shorthand for -l -c -d.
            noNLs = noComments = noDocStrs = True
        elif opt == "-l":
            noNLs = True
        elif opt == "-c":
            noComments = True
        elif opt == "-d":
            noDocStrs = True
        elif opt == "-s":
            sigWords = int(val)
        elif opt == "-o":
            noOps = True
        else:
            usage()
        continue
    return

def improve(tup5):
    """Reduce a 5-tuple from generate_tokens to a (token-name, text) pair.

    INDENT text is dropped because indentation is regenerated by doText.
    """
    toktype, text = tup5[0], tup5[1]
    name = tok_name[toktype]
    return (name, '' if name == 'INDENT' else text)

def py_tokenize(filename_in, file_out):
    """Tokenize one Python source file and print it to file_out.

    Output is either the raw (token-name, text) pairs (with -t) or the
    regenerated indented text form produced by doText().
    """
    # try/finally guarantees the file is closed even if tokenization
    # raises.  Mode 'r' replaces the original 'rU': 'rU' was removed in
    # Python 3.11, and plain 'r' already gives universal newlines on
    # Python 3.
    file_in = open(filename_in, 'r')
    try:
        tokens = list(generate_tokens(file_in.readline))
    finally:
        file_in.close()
    # A list comprehension rather than map(): on Python 3, map() returns
    # an iterator, and pprint of a map object would print nothing useful.
    pairs = [improve(tup) for tup in tokens]
    if printToks:
        pprint(pairs, file_out)
    else:
        doText(pairs, file_out)
    return

def doText(tokStrs, file_out):
    """Write (token-name, text) pairs to file_out as indented source text.

    Whitespace (indentation and inter-token spaces) is regenerated rather
    than copied from the original file, so two files that differ only in
    formatting produce identical output.  The module-level option flags
    (noNLs, noComments, noDocStrs, sigWords, noOps) control which tokens
    are dropped; they are expected to have been filtered out of the
    stream here rather than by the caller.
    """
    prevTok = 'NEWLINE'                 # Start out at the beginning of a line.
    prevString = ''
    firstTok = True
    prevUnary = False
    nTok = 1                            # Token number in the line.
    nWords = 0                          # Number of words in the line.
    indent = 0                          # Current block indentation, in spaces.
    nlIndent = 0                       # Second-level indent within a statement.
    commentIndent = 0
    lineBuff = ""

    nlToks = ('NEWLINE', 'NL', 'COMMENT') # Comments have a newline at the end.
    wordToks = ('NAME', 'NUMBER', 'STRING')
    # Operators that never get a space put before or after them.
    noSpaceOps = ('.', ':', '(', '[', '{', '}', ']', ')')

    for (tok, tokString) in tokStrs:
        # Various things to ignore, per the option flags.
        if (tok == 'NL' and noNLs or
            tok == 'COMMENT' and noComments or
            prevTok == 'NEWLINE' and tok == 'STRING' and noDocStrs or
            tok != 'NEWLINE' and sigWords > 0 and nWords >= sigWords or
            tok == 'OP' and noOps):
            continue

        # Alter the indentation level.  (These may occur after a NEWLINE token.)
        if tok == 'INDENT':
            indent += 4
            continue
        if tok == 'DEDENT':
            indent -= 4
            continue

        # NEWLINE is the logical end of statement, as opposed to NL, which is
        # mere formatting whitespace.  Comments also end lines.
        if tok in nlToks:
            if not firstTok or tok == 'COMMENT':

                # Indentation for comments.
                if tok == 'COMMENT':
                    if nTok == 1:
                        # Comment on a line by itself: indent like code,
                        # plus the comment-specific adjustment.
                        lineBuff += (indent + commentIndent) * " "
                    else:
                        # Trailing comment: one space after the code.
                        lineBuff += " "
                        pass
                    pass

                # Output the last token on the line, and then the line.
                lineBuff += tokString

                # Output the line (suppress blank lines when -l is given).
                if not noNLs or lineBuff != "\n":
                    file_out.write(lineBuff)
                lineBuff = ""
                pass

                # Second-level indent within a statement, after a NL token
                # that isn't at the beginning of a line.
                if tok == 'NL' and nTok > 1:
                    nlIndent = 4
                else:
                    nlIndent = 0
                pass

            pass
        else:

            # Generate indentation at the beginning of a line.
            if lineBuff == "":
                lineBuff = (indent + nlIndent) * " "

            # Put spaces around word tokens, but not before commas, after unary
            # ops, or around some special binary ops.
            if (nTok > 1 and         # Never before the first token in the line.
                # When we might put out a space before this token.
                (prevTok in wordToks or tok in wordToks or tok == 'OP') and
                # When we wouldn't put out a space before this token.
                not ( prevUnary or
                      (prevTok == 'OP' and (prevString in noSpaceOps
                                            and tok != 'OP')) or
                      tok == 'OP' and (tokString == ','
                                       or tokString in noSpaceOps)) ):
                lineBuff += " "
                pass

            # Output the token.
            lineBuff += tokString
            pass

        # Carry over a little bit of context from the last two tokens.
        # An op right after another op is treated as unary (e.g. "-" in
        # "x * -y"), so no space follows it.
        prevUnary = (tok == 'OP' and tokString != ',' and
                     tokString not in noSpaceOps and
                     prevTok == 'OP' and prevString not in noSpaceOps)
        # Bug: Comments tokens after a colon appear *before* the INDENT, and
        # similarly after pass, continue, and return, *before* the DEDENT.
        # commentIndent compensates by pre-adjusting the comment's indent.
        if tok in nlToks:
            if prevTok == 'OP' and prevString == ':':
                commentIndent = 4
            elif (prevTok == 'NAME' and
                  prevString in ('pass', 'continue', 'return')):
                commentIndent = -4
        else:
            commentIndent = 0
            pass

        # The current token becomes the previous.
        prevTok = tok
        prevString = tokString
        firstTok = False         # So we know there really was a previous token.

        # Reset the token and word counters after a newline.
        if tok in nlToks:
            nTok = 1
            nWords = 0
        else:
            nTok += 1
            if tok in wordToks:
                nWords += 1
            pass

        continue
    return

def dofiles(filenames):
    """Tokenize each named file to stdout.

    When more than one file is given, a "======= [name]" separator line
    precedes each file so the combined output stays diffable.
    """
    for filename in filenames:
        if len(filenames) > 1:
            # print(single_expression) is valid and identical under both
            # Python 2 and Python 3, unlike the bare print statement.
            print("\n======= [%s]\n" % (filename,))
        py_tokenize(filename, sys.stdout)
    return

def run():
    """Main entry point: parse the options, then tokenize the files.

    Prints the usage message when no input files were given.
    """
    doOpts()
    if not filenames:
        usage()
        return
    dofiles(filenames)
    return

# Run as a script (does nothing when imported as a module).
if __name__ == '__main__':
    run()

# end