1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
|
#!/usr/bin/env python
"""
tokenize-python.py -- print tokenization of one or more input .py files
$Id$
History:
bruce 080616 drafted this from an old script I had at home, py-tokenize.py
russ 080617 Added options to format whitespace and to remove certain kinds of
output, to make it more useful for comparing source files for important changes
while ignoring unimportant changes.
"""
import sys
from os.path import basename
from getopt import getopt, GetoptError
from tokenize import generate_tokens, tok_name
from pprint import pprint
# Usage/help text.  It is a %-format template with three %s slots, each of
# which is meant to receive the program name (once in the synopsis line and
# twice in the example commands).
usageMsg = '''usage: %s [-t] [-a] [-l] [-c] [-d] [-s n] [-o] files...
When multiple files are given, "======= file" separators are inserted.
-t - Print raw token types and strings. (The default is to print
tokens in an indented text form with generated whitespace.)
-a - Suppress all, same as "-lcd".
-l - Suppress printing line breaks within a statement (NL tokens.)
-c - Suppress printing comments.
-d - Suppress printing doc strings.
-s n - Print just the "signature" of each line (the first n words.)
-o - Suppress printing operators in the signature.
Examples:
To check on changes other than comments and file formatting, use the "-a"
option on the before-and-after versions and diff them. You can do a whole
directory with *.py .
cd before; %s -a *.py > ../before.pytok
cd ../after; %s -a *.py > ../after.pytok
cd ..; diff -C 1 before.pytok after.pytok > pytok.diffs
Use "-aos 2" to concentrate more closely on statement indentation changes.
indent-diff.gawk filters out *just* indentation changes from the diffs.
indent-diff.gawk pytok.diffs > pytok.indent-diff
'''
def usage():
    """Write the usage message to standard error.

    The program name is the basename of ``sys.argv[0]``; it is substituted
    into all three ``%s`` slots of ``usageMsg``.
    """
    pgm = basename(sys.argv[0])
    # sys.stderr.write() works unchanged on Python 2 and Python 3, whereas the
    # "print >> sys.stderr, ..." statement is a syntax error on Python 3.
    # The trailing "\n" is the newline the print statement used to add.
    sys.stderr.write(usageMsg % (3 * (pgm,)) + "\n")
# Option variables.  These are module-level defaults; doOpts() overwrites
# them from the command line.
printToks = False  # -t: pretty-print the (token name, text) pairs instead of formatted text.
noNLs = False  # -l or -a: drop NL tokens (line breaks within a statement).
noComments = False  # -c or -a: drop COMMENT tokens.
noDocStrs = False  # -d or -a: drop a STRING token that directly follows a NEWLINE.
sigWords = 0  # -s n: when > 0, stop emitting tokens on a line after n word tokens.
noOps = False  # -o: drop OP tokens.
filenames = None  # Non-option arguments (the files to process); None until doOpts() runs.
def doOpts():
    """Parse ``sys.argv[1:]`` into the module-level option variables.

    Recognized options (getopt spec ``"talcds:o"``):

    * ``-t``    sets ``printToks``
    * ``-a``    sets ``noNLs``, ``noComments`` and ``noDocStrs``
    * ``-l``    sets ``noNLs``
    * ``-c``    sets ``noComments``
    * ``-d``    sets ``noDocStrs``
    * ``-s n``  sets ``sigWords`` to the integer ``n``
    * ``-o``    sets ``noOps``

    The remaining, non-option arguments are stored in ``filenames``.

    On an unrecognized option, a missing option argument, or a ``-s`` value
    that is not an integer, the error and the usage message are written to
    standard error and the process exits with status 2.
    """
    global printToks, noNLs, noComments, noDocStrs, sigWords, noOps, filenames
    try:
        opts, filenames = getopt(sys.argv[1:], "talcds:o")
    except GetoptError:
        # sys.exc_info() is used instead of "except GetoptError, err" (Python 2
        # only) or "except GetoptError as err" (2.6+) so this parses everywhere.
        err = sys.exc_info()[1]
        # Diagnostics go to stderr: stdout is normally redirected into the
        # token listing, where an error message would be lost or pollute it.
        sys.stderr.write("%s\n" % (err,))  # e.g. "option -x not recognized"
        usage()
        sys.exit(2)
    for opt, val in opts:
        if opt == "-t":
            printToks = True
        elif opt == "-a":
            noNLs = noComments = noDocStrs = True
        elif opt == "-l":
            noNLs = True
        elif opt == "-c":
            noComments = True
        elif opt == "-d":
            noDocStrs = True
        elif opt == "-s":
            try:
                sigWords = int(val)
            except ValueError:
                # Report a bad count as a usage error rather than a traceback.
                sys.stderr.write(
                    "option -s requires an integer argument, not %r\n" % (val,))
                usage()
                sys.exit(2)
        elif opt == "-o":
            noOps = True
        else:
            # getopt only returns options named in the spec above, so this is
            # a safety net for an option added there but not handled here.
            usage()
def improve(tup5):
    """Reduce a tokenize 5-tuple to a ``(token name, text)`` pair.

    ``tup5`` is ``(type, text, start, end, line)``; only the first two fields
    are used.  The numeric type is translated through ``tok_name``, and the
    text of an INDENT token is replaced by the empty string.
    """
    # Unpack all five fields so that a tuple of any other length is rejected.
    kind, string, _start, _end, _line = tup5
    name = tok_name[kind]
    return (name, '' if name == 'INDENT' else string)
def py_tokenize(filename_in, file_out):
    """Tokenize the Python source file ``filename_in`` and report on ``file_out``.

    The file is read completely and each token is reduced to a
    ``(token name, text)`` pair by ``improve()``.  If the module-level
    ``printToks`` flag is set, the list of pairs is pretty-printed to
    ``file_out``; otherwise it is passed to ``doText()`` for formatted output.

    Errors from opening the file or from the tokenizer propagate to the
    caller; the input file is closed in either case.
    """
    # 'U' asked for universal newlines on Python 2.  Python 3 text mode
    # translates newlines by default, and Python 3.11 rejects the 'U' flag.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    file_in = open(filename_in, mode)
    try:
        # Consume the token generator while the file is still open.  A real
        # list (not map()) is built so pprint shows the tokens on Python 3 too.
        tokens = [improve(tup5) for tup5 in generate_tokens(file_in.readline)]
    finally:
        # Close the file even when the tokenizer raises on malformed source.
        file_in.close()
    if printToks:
        pprint(tokens, file_out)
    else:
        doText(tokens, file_out)
def doText(tokStrs, file_out):
    """Write the token stream ``tokStrs`` to ``file_out`` as re-spaced text.

    ``tokStrs`` is an iterable of ``(token name, token text)`` pairs.  Text is
    accumulated one output line at a time in a buffer, which is written with
    ``file_out.write()`` when a NEWLINE, NL or COMMENT token is reached.
    INDENT and DEDENT tokens change the current indentation by four columns
    and produce no text of their own.

    The module-level option variables ``noNLs``, ``noComments``, ``noDocStrs``,
    ``sigWords`` and ``noOps`` select tokens to be dropped; see the first test
    inside the loop.
    """
    prevTok = 'NEWLINE' # Start out at the beginning of a line.
    prevString = ''
    firstTok = True
    prevUnary = False
    nTok = 1 # Token number in the line.
    nWords = 0 # Number of words in the line.
    indent = 0
    nlIndent = 0 # Second-level indent within a statement.
    commentIndent = 0
    lineBuff = ""
    nlToks = ('NEWLINE', 'NL', 'COMMENT') # Comments have a newline at the end.
    wordToks = ('NAME', 'NUMBER', 'STRING')
    noSpaceOps = ('.', ':', '(', '[', '{', '}', ']', ')')
    for (tok, tokString) in tokStrs:
        # Various things to ignore.
        # A skipped token does not update prevTok/prevString or the counters,
        # because the "continue" bypasses the bookkeeping at the loop's end.
        if (tok == 'NL' and noNLs or
            tok == 'COMMENT' and noComments or
            prevTok == 'NEWLINE' and tok == 'STRING' and noDocStrs or
            tok != 'NEWLINE' and sigWords > 0 and nWords >= sigWords or
            tok == 'OP' and noOps):
            continue
        # Alter the indentation level. (These may occur after a NEWLINE token.)
        if tok == 'INDENT':
            indent += 4
            continue
        if tok == 'DEDENT':
            indent -= 4
            continue
        # NEWLINE is the logical end of statement, as opposed to NL, which is
        # mere formatting whitespace. Comments also end lines.
        if tok in nlToks:
            # A NEWLINE or NL that is the very first token produces no output.
            if not firstTok or tok == 'COMMENT':
                # Indentation for comments.
                if tok == 'COMMENT':
                    if nTok == 1:
                        lineBuff += (indent + commentIndent) * " "
                    else:
                        lineBuff += " "
                        pass
                    pass
                # Output the last token on the line, and then the line.
                lineBuff += tokString
                # Output the line.
                # (With noNLs set, a line consisting of only "\n" is dropped.)
                if not noNLs or lineBuff != "\n":
                    file_out.write(lineBuff)
                lineBuff = ""
                pass
            # Second-level indent within a statement, after a NL token
            # that isn't at the beginning of a line.
            if tok == 'NL' and nTok > 1:
                nlIndent = 4
            else:
                nlIndent = 0
                pass
            pass
        else:
            # Generate indentation at the beginning of a line.
            if lineBuff == "":
                lineBuff = (indent + nlIndent) * " "
            # Put spaces around word tokens, but not before commas, after unary
            # ops, or around some special binary ops.
            if (nTok > 1 and # Never before the first token in the line.
                # When we might put out a space before this token.
                (prevTok in wordToks or tok in wordToks or tok == 'OP') and
                # When we wouldn't put out a space before this token.
                not ( prevUnary or
                      (prevTok == 'OP' and (prevString in noSpaceOps
                                            and tok != 'OP')) or
                      tok == 'OP' and (tokString == ','
                                       or tokString in noSpaceOps)) ):
                lineBuff += " "
                pass
            # Output the token.
            lineBuff += tokString
            pass
        # Carry over a little bit of context from the last two tokens.
        # An operator that directly follows another operator (neither being a
        # comma or one of noSpaceOps) is treated as unary for the next token.
        prevUnary = (tok == 'OP' and tokString != ',' and
                     tokString not in noSpaceOps and
                     prevTok == 'OP' and prevString not in noSpaceOps)
        # Bug: Comments tokens after a colon appear *before* the INDENT, and
        # similarly after pass, continue, and return, *before* the DEDENT.
        # commentIndent compensates; it is added to indent for a COMMENT
        # token that is first on its line.
        if tok in nlToks:
            if prevTok == 'OP' and prevString == ':':
                commentIndent = 4
            elif (prevTok == 'NAME' and
                  prevString in ('pass', 'continue', 'return')):
                commentIndent = -4
            else:
                commentIndent = 0
            pass
        # The current token becomes the previous.
        prevTok = tok
        prevString = tokString
        firstTok = False # So we know there really was a previous token.
        # Reset the token and word counters after a newline.
        if tok in nlToks:
            nTok = 1
            nWords = 0
        else:
            nTok += 1
            if tok in wordToks:
                nWords += 1
            pass
        continue
    # NOTE(review): anything still in lineBuff here is not written, so text
    # after the last NEWLINE/NL/COMMENT token is discarded.
    return
def dofiles(filenames):
    """Tokenize each file in ``filenames`` in turn, writing to standard output.

    When more than one file name is given, a ``======= [name]`` separator
    line, with a blank line before and after it, precedes each file's output.
    """
    multiple = len(filenames) > 1
    for filename in filenames:
        if multiple:
            # sys.stdout.write() works on Python 2 and 3; the final "\n" is
            # the newline the original print statement appended.
            sys.stdout.write(("\n======= [%s]\n" % (filename,)) + "\n")
        py_tokenize(filename, sys.stdout)
def run():
    """Script entry point: parse the options, then process the named files.

    If the command line names no files, the usage message is shown instead.
    """
    doOpts()
    # Guard clause: nothing to tokenize, so just show how to call the script.
    if not filenames:
        usage()
        return
    dofiles(filenames)
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    run()
# end
|