# Source path: root/cad/src/utilities/parse_utils.py
# Source blob: dc0636c035390fb9929ba3d57abf2554394168e0 (plain)
# Copyright 2006-2007 Nanorex, Inc.  See LICENSE file for details. 
"""
parse_utils.py -- utilities for general parsing, and for parsing streams of python tokens.
Also a prototype "description class" which can be used to represent results of parsing a "description".
Also an example grammar, which can be used for parsing "parameter-set description files".
(FYI: All these things are used to parse "parameter dialog description files", *.desc.)

@author: Bruce
@version: $Id$
@copyright: 2006-2007 Nanorex, Inc.  See LICENSE file for details. 

TODO:

This ought to be split into several files, and generalized, and ThingData renamed
and cleaned up. And optimized (easy), since the parser is probably quadratic time
in input file size, at least when used in the usual way, on a list that comes
from generate_tokens.
"""

from tokenize import generate_tokens
from tokenize import tok_name
import sys

# == basic general parser

# Module-wide debugging switch, read by Seq.parse when it builds the message
# of a SyntaxError: set to True for more info about grammar in syntax error
# messages (the stuck rule and argument are then included, not only the
# input position).
debug_grammar = False

class ParseFail(Exception):
    """
    Raised by a parse rule which does not match at the current position.

    This is the "soft" failure: Alt catches it and tries its next
    alternative, and Seq lets it propagate only when its very first
    argument is the one that failed.
    """
    #e make it a more specific kind of exception?
class SyntaxError(Exception):
    """
    Raised when the input being parsed is definitely malformed (the "hard"
    failure); the message describes where parsing got stuck.

    NOTE: within this module, this class shadows Python's builtin
    SyntaxError of the same name.
    """

def parse(pat, rest):
    """
    Apply the parse rule pat to the input rest.

    Either return (res, newrest), or raise ParseFail or SyntaxError.

    pat must have a .parse(rest) method returning a (res, newrest) pair.
    If pat has a .kws dict with a true 'result' entry, that function is
    applied to res before returning (if pat has no .kws attribute at all,
    res is returned unchanged).

    Any other exception raised while running pat.parse is reported as a
    SyntaxError whose message includes the exception and a description of
    the position in rest. KeyboardInterrupt and SystemExit are never
    converted that way; they propagate unchanged.
    """
    #e other python types with special meanings (for pat), like string or list?
    retval = "<none yet>" # for debugging
    try:
        retval = pat.parse(rest)
        res, newrest = retval
    except (KeyboardInterrupt, SystemExit):
        # a user interrupt or interpreter exit is not a grammar problem;
        # this clause must come first so it wins over the catch-all below
        raise
    except (ParseFail, SyntaxError):
        raise
    except Exception:
        print("retval at exception: %r" % (retval,))
        raise SyntaxError("exception %s %s at %s" % (sys.exc_info()[0], sys.exc_info()[1], describe_where(rest))) ###k ##e need more info, about pat?
        ##e maybe also postprocess the results using another method of pat, or call pat.parse1 to do it all,
        # to support uniform keyword args for postprocess funcs (with access to rest's env)
    try:
        resultfunc = pat.kws.get('result') # this would be cleaner if some pat method did the whole thing...
    except AttributeError:
        # pat has no kws attribute, so there is no result function to apply
        return res, newrest
    if resultfunc:
        try:
            return resultfunc(res), newrest
        except:
            # (bare except is ok here, since we always re-raise)
            print("resultfunc %r failed on res %r" % (resultfunc, res)) # this seems often useful
            raise
    return res, newrest

def parse_top(pat, rest):
    """
    Top-level wrapper around parse which never raises ParseFail or SyntaxError.

    Return whatever parse(pat, rest) returns on success. On failure return
    (message, None), where message is "ParseFail" or 'SyntaxError: ' followed
    by the text of the exception.
    """
    try:
        outcome = parse(pat, rest)
    except ParseFail:
        outcome = ("ParseFail", None)
    except SyntaxError:
        problem = sys.exc_info()[1]
        outcome = ('SyntaxError: ' + str(problem), None)
    return outcome

class ParseRule:
    """
    Base class of parse rules: each subclass is a specific parse-rule
    constructor, so the instances of the subclasses are the parse rules.

    The constructor just records its positional arguments as self.args
    (a tuple) and its keyword arguments as self.kws (a dict), then calls
    self.validate().
    """
    #e need __str__ for use in syntax error messages

    def __init__(self, *args, **kws):
        self.args, self.kws = args, kws
        self.validate()

    def validate(self):
        """
        Hook called at the end of __init__; does nothing here.
        Subclasses can have this check for errors in args and kws,
        preprocess them, etc.
        """
        #e the subclasses in this example might not bother to define this as an error checker

class Seq(ParseRule):
    """
    Parse rule matching each of its args in order, one after another.
    """
    def parse(self, rest):
        """
        Return (list of the results of the args, remaining input).

        If the first arg fails with ParseFail, let that propagate (nothing
        was matched yet). If a later arg fails with ParseFail, raise
        SyntaxError instead, with a message describing the input position.
        """
        results = []
        for position, rule in enumerate(self.args):
            try:
                item, rest = parse(rule, rest)
            except ParseFail:
                if position == 0:
                    raise
                # the long form is mostly useless until we have sensible __str__
                # (and even then, is mainly only useful for debugging grammar):
                if debug_grammar:
                    msg = "Can't complete %s, stuck at arg %d = %s\nat %s" % (self, position + 1, rule, describe_where(rest))
                else:
                    msg = "%s" % (describe_where(rest),)
                raise SyntaxError(msg)
            results.append(item)
        return results, rest

class Alt(ParseRule):
    """
    Parse rule matching the first of its args which matches.
    """
    def parse(self, rest):
        """
        Try each arg in turn on the same input and return the result of the
        first one that doesn't raise ParseFail; raise ParseFail if all do.
        (A SyntaxError from any arg is not caught here.)
        """
        for alternative in self.args:
            try:
                outcome = parse(alternative, rest)
            except ParseFail:
                pass
            else:
                return outcome
        raise ParseFail

# == higher-level general parser utilities

class ForwardDef(ParseRule):
    """
    Placeholder rule, for defining recursive patterns: parsing with it
    just parses with self.pat.

    By convention, arg0 (optional) is some sort of debug name.
    self.pat is not set by this class; it must be set by the caller
    before the rule is used.
    """
    def parse(self, rest):
        target = self.pat
        return parse(target, rest)

def ListOf(pat):
    """
    Return a rule matching 0 or more pats, whose result is a single list
    of the results of those pats.

    Implemented recursively as: Optional(Seq(pat, <this same rule>)).
    """
    placeholder = ForwardDef()
    # The Seq result is the 2-element list [first, list of the others];
    # fix that format to be a single list (warning: quadratic time).
    # note: [first].extend(others) would not work here, since it has value None
    # (and it too would make entire ListOf take quadratic time if it worked)
    one_then_more = Seq(pat, placeholder,
                        result = lambda pair: [pair[0]] + pair[1])
    placeholder.pat = Optional(one_then_more)
    return placeholder

def Optional(pat):
    """
    Return a rule matching either pat, or (if pat fails to match) Nothing.
    """
    alternatives = (pat, Nothing)
    return Alt(*alternatives)

class NothingClass(ParseRule):
    """
    Parse rule which always succeeds without using up any input.
    """
    def parse(self, rest):
        # the fact that the result is [] (rather than e.g. None)
        # is depended on by ListOf's result lambda
        empty_result = []
        return empty_result, rest

# the one instance of NothingClass used by this module (see Optional)
Nothing = NothingClass()

# == some things that are specific for rest being a list of 5-tuples coming from tokenize.generate_tokens

##e ideally, these would be methods of a class which represented this kind of input, and had an efficient .next method,
# and methods for other kinds of access into it; the current implem might be quadratic time in the number of tokens,
# depending on how python lists implement the [1:] operation.

def describe_where(rest):
    """
    Return a multi-line string describing the position of the first token
    in rest, for use in error messages; or "end of input" if rest is empty.

    Assume rest is a list of token 5-tuples as returned by generate_tokens.

    The result has three lines: the line and column numbers, the source
    line (after a fixed prefix), and a '^' placed under the start column.
    """
    if not rest:
        return "end of input"
    first = rest[0]
    srow, scol = first[2]
    line = first[4]
    # prefix of length 8, so that tabs in the source line keep their usual stops
    prefix = "*******>"
    # Pad the caret line with the same tabs the source line has before the
    # start column (a tab counts as one column in scol), and with spaces for
    # everything else, so that the '^' lands under the token however tabs
    # are displayed.
    pad = []
    for char in line[:scol]:
        if char == '\t':
            pad.append('\t')
        else:
            pad.append(' ')
    # the source line can be shorter than scol; fill the rest with spaces
    pad.append(' ' * (scol - len(pad)))
    res1 = "line %d, column %d:" % (srow, scol) # tested! exactly correct (numbering columns from 0, lines from 1)
    res2 = prefix + line.rstrip()
    res3 = prefix + ''.join(pad) + '^'
    return '\n'.join([res1, res2, res3])

def token_name(rest):
    """
    Return the tok_name name of the type of the first token in rest
    (a list of token tuples whose first element is the token type),
    or None if rest is empty.
    """
    if rest:
        first_token = rest[0]
        return tok_name[first_token[0]]
    return None

# Names of the token types which TokenType.parse skips over before trying to match.
# This is really an aspect of our specific grammar.
# note: NL is the tokenizer's non-logical newline (e.g. in a continued or blank line),
# not a syntactically relevant newline
# (for that see Newline below, based on tokentype NEWLINE)
IGNORED_TOKNAMES = ('NL', 'COMMENT')

class TokenType(ParseRule):
    """
    Parse rule matching a single token of a given type (or of one of
    several given types), optionally also required to satisfy a condition.

    args[0] is the token type name, or a list of such names (list has been
    tested but might not be currently used); args[1] (optional) is the
    condition, a function of the would-be result.
    Keyword 'want_tokname' says whether the result is the pair
    (token name, token string) or just the token string; it defaults to
    False for a single name and True for a list of names.
    """
    def validate(self):
        names = self.args[0]
        # let this be a string (one name) or a list (multiple names)
        single_name = type(names) == type("")
        if single_name:
            names = [names]
        self.want_tokname = self.kws.get('want_tokname', not single_name)
        assert type(names) == type([])
        legal_names = tok_name.values()
        for name in names:
            assert type(name) == type("") and name in legal_names, \
                   "illegal tokname: %r (not found in %r)" % \
                   ( name, tok_name.values() )
        self.toknames = names
        if len(self.args) > 1:
            self.cond = self.args[1]
        else:
            # no condition given -- accept any token of the right type
            self.cond = lambda tokstring: True

    def parse(self, rest):
        """
        Assume rest is a list of token 5-tuples as returned by generate_tokens.

        Skip leading tokens whose type name is in IGNORED_TOKNAMES; then, if
        the next token has one of our types and satisfies self.cond, return
        (result, the tokens after it); otherwise raise ParseFail.
        """
        remaining = rest
        while remaining and token_name(remaining) in IGNORED_TOKNAMES:
            # this slicing might be inefficient for long inputs, and for that matter, so might everything else here be
            # note, this filtering is wasted (redone many times at same place) if we end up parsefailing, but that's tolerable for now
            remaining = remaining[1:]
        if not remaining:
            raise ParseFail
        name = token_name(remaining)
        if name not in self.toknames:
            raise ParseFail
        text = remaining[0][1]
        if self.want_tokname:
            res = (name, text)
        else:
            res = text
        if self.cond(res):
            return res, remaining[1:]
        raise ParseFail

def Op( opstring):
    """
    Return a rule matching an 'OP' token whose string equals opstring.
    """
    # Note: no "opstring = opstring" default-argument kluge is needed here,
    # since a nested function can refer to the local variables of the
    # enclosing function (nested scopes).
    def is_this_op(token):
        return token == opstring
    return TokenType('OP', is_this_op)

# == the specific grammar of "parameter-set description files"

# TODO: split this grammar (including IGNORED_TOKNAMES above, somehow) into its own file

# thing = name : arglist
# optional indented things
# ignore everything else (some are errors) (easiest: filter the ok to ignore, stop at an error, print it at end)

def make_number(token, sign = 1): # other signs untested
    """
    Convert token (the string of a number token) to a number, times sign.

    Try the integer type(s) first and float last; return the first
    conversion that works. Raise SyntaxError if none of them does.
    Only conversion errors are treated as "try the next type"; anything
    else (e.g. KeyboardInterrupt) propagates unchanged.
    """
    try:
        converters = (int, long, float)
    except NameError:
        # this Python has no separate long type
        converters = (int, float)
    for convert in converters:
        try:
            return convert(token) * sign
        except (ValueError, TypeError, OverflowError):
            # not a legal literal for this type -- try the next one
            pass
    raise SyntaxError("illegal number: %r" % (token,)) ### flaw: this doesn't include desc of where it happened...
    
# terminals (single tokens)
Name = TokenType('NAME') # (the original bound Name twice to equivalent rules; once is enough)
Colon = Op(':')
Minus = Op('-')
End = TokenType('ENDMARKER')
Newline = TokenType('NEWLINE')
Number = TokenType('NUMBER', result = make_number)
String = TokenType('STRING', result = eval)
    # eval is to turn '"x"' into 'x'; it's safe since the tokenizer promises this is a string literal

# one argument: a number, string, or name, or a negated number
# (an alternative, not used: Arg = TokenType(['NUMBER', 'STRING', 'NAME']))
Arg = Alt(
    Number,
    String,
    Name, #e do STRING & NAME results need to be distinguished?? We'll see...
    # the Seq result is [minus, number]; negate the number
    Seq( Minus, Number, result = lambda pair: - pair[1] ),
)
Arglist = ListOf(Arg) # not comma-sep; whitespace sep is ok (since ws is ignored by tokenizer) ##k test that!

def Indented(pat):
    """
    Return a rule matching pat between an INDENT token and a DEDENT token;
    its result is just the result of pat.
    """
    opener = TokenType('INDENT')
    closer = TokenType('DEDENT')
    # the Seq result is [indent, pat's result, dedent]
    return Seq(opener, pat, closer, result = lambda parts: parts[1])

# A Thing is "name : arglist" on one line, optionally followed by an
# indented list of Things. It's recursive, so it's defined in two steps.
Thing = ForwardDef("Thing")
Thing.pat = Seq(
    Name, Colon, Arglist, Newline, Optional(Indented(ListOf(Thing))),
    # the Seq result is [name, colon, args, newline, subthings]
    result = lambda parts: makeThing(parts[0], parts[2], parts[4])
)

# A whole file is a list of Things and then the end marker;
# the result is just the list of Things.
Whole = Seq(ListOf(Thing), End, result = lambda parts: parts[0])

# ==

# Description objects (prototype)

class attr_interface_to_dict:
    """
    Read-only attribute interface to a dict: obj.name is _dict1['name'],
    or an AttributeError if the dict has no such key or if the name
    starts with '_'.
    """
    # make sure all of our methods and data start with '_'!
    def __init__(self, _dict1):
        self._dict1 = _dict1

    def __getattr__(self, attr): # in class attr_interface_to_dict
        # Names starting with '_' are never looked up in _dict1.
        # Something like this is needed, even if _dict1 contains such an attr,
        # so such an attr (if specially named) won't fool Python into giving us different semantics.
        # But if we decide _dict1 should be able to contain some names of this form, we could make
        # the prohibition tighter as long as it covered all Python special attrnames and our own attr/method names.
        if not attr.startswith('_'):
            try:
                return self._dict1[attr]
            except KeyError:
                pass
        raise AttributeError(attr)

class Info:
    """
    Base class for simple data objects: the constructor records its
    positional arguments as self._args (a tuple) and its keyword arguments
    as self.kws (a dict), then calls self.init().
    """
    # sort of like ParseRule -- unify them?
    def __init__(self, *_args, **kws):
        self._args, self.kws = _args, kws
        self.init()

    def init(self):
        """
        Hook called at the end of __init__; does nothing here.
        """

    def __repr__(self):
        ##k crude approx. (the keyword arguments are not shown)
        return self.__class__.__name__ + "%r" % (self._args,)

class ThingData(Info):
    """
    The data in one parsed thing: its name, its list of args (thingargs),
    and its list of subthings (which are ThingData objects too).

    Construct as ThingData(name, thingargs, subthings).
    """
    # could this be Thing -- that symbol's value would be both a constructor and a parserule... not sure...

    # Class-level defaults, used as-is by instances without subthings:
    # save RAM & time by sharing this when surely empty... could cause bugs if it's altered directly by outside code
    options = {}
    # also shared, also must be replaced if nonempty
    option_attrs = attr_interface_to_dict(options)

    def init(self):
        """
        Unpack the constructor arguments, and collect the option settings
        found among our subthings into self.options.
        """
        self.name, self.thingargs, self.subthings = self._args # possible name conflict: .args vs .thingargs
        #070330 improving the internal documentation:
        # for an option setting like "max = 9999.0":
        #   name = 'max', thingargs = [9999.0], subthings = []
        #   so name is the option name, thingargs contains one member which is the value, subthings is empty.
        # for a subobject:
        #   name = 'parameter', thingargs = ['L2'], subthings = [ThingData()...]
        #   so name is the type (or used by the parent to choose the type), thingargs has one (optional?) member which is the name,
        #   and subthings contains both actual subobjects and option settings.
        # for widget: combobox, two kluges are used: it acts as both a subthing and an option setting,
        # and its own subthings, which look like option settings, also have an order which is preserved (I think).
        # Both kluges apply to everything -- all option settings stay around in the subthings list,
        # and (I think) all subthing typenames get treated as options set to the subthing name.

        self.args = self.thingargs # already assumed twice, in the using code for desc... should translate remaining thingargs -> args
        if self.subthings:
            # we need our own dict (and attr interface to it), not the shared empty ones
            own_options = {}
            self.options = own_options
            self.option_attrs = attr_interface_to_dict(own_options)
            for sub in self.subthings:
                sub.maybe_set_self_as_option_in(own_options)

    def maybe_set_self_as_option_in(self, dict1):
        """
        If self is an option setting, set it in dict1
        (as dict1[our name] = our single arg).
        """
        #e in future might need more args, like an env, or might need to store a formula
        # (which might indicate switching to getattr interface?)
        #
        # Anything with exactly one arg counts, whether or not it has subthings.
        # Counting the ones with subthings is the "simplest hack that could possibly work"
        # to let widget=combobox work as a condition even though it has subthings which
        # further define the combobox. we'll see if doing it generally causes any trouble
        # and/or is useful in other cases. Note that we store only 'combobox' as the value,
        # not a combobox datum extended with those subthings (its items).
        # (As if the cond was really about the class of value?)
        if len(self.thingargs) == 1:
            dict1[self.name] = self.thingargs[0]

    def pprint(self, indent = ""):
        """
        Print self as "name: args", and then print each subthing the same
        way, indented 4 more spaces than self.
        """
        name, args, subthings = self._args
        shown_args = ', '.join([repr(arg) for arg in args])
        print(indent + name + ': ' + shown_args) # works decently except for 1.4099999999999999 rather than 1.41
        deeper = indent + '    '
        for sub in subthings:
            sub.pprint( deeper )

    def kids(self, kinds):
        """
        Return a list of the subthings whose name is in kinds
        (kinds can be one name, or a sequence of names).
        """
        if type(kinds) == type(''):
            kinds = (kinds,)
        # a subthing could be a real kid, or just an assignment; use kinds to tell (assume caller knows what it's doing)
        #e (maybe we even permit multiple assignments and let the last one done *before a kid is made* control that kid????)
        return [kid for kid in self.subthings if kid.name in kinds]

    def isa(self, kind, **conds):
        """
        Predicate: are we of this kind, and do we match conditions like xxx=yyy for our option xxx being yyy?
        (A condition value can be a tuple of acceptable values; see matches.)
        """
        #### LOGIC BUG: for symbolic options, the stored value is a string, the match is a string, all ok.
        # but for booleans, the stored val is 'true' or 'false' -- not ok. How do we get best of all these worlds?? ####
        if self.name != kind:
            return False
        for param, wanted in conds.items():
            #### LOGIC BUG 2 - do we match if we don't store param at all? who supplies defaults?
            # for now: let missing param be the same as a value of None. (this is used in matching widget = ('lineedit',None), etc)
            stored = self.options.get(param, None)
            if not self.matches(stored, wanted):
                return False
        return True

    def matches(self, paramval, valpattern):
        """
        Does paramval equal valpattern, or (if valpattern is a tuple) occur in it?
        """
        same = paramval == valpattern
        # note: 'in' is not using recursive match, just ==
        return same or (type(valpattern) == type(()) and paramval in valpattern)

    def as_expr(self):
        """
        Return an Expr form of self. (Only call after it's fully parsed, since parsing is destructive.)
        [Not yet implemented -- currently does nothing and returns None.]
        """
        # 070330, experimental. Will require exprs module. Not yet called. For now, advise don't call except when a debug_pref is set.
        #e name -> exprhead? using an env? via a Symbol?

def makeThing(name, args, subthings):
    """
    Return ThingData(name, args, subthings), after printing a warning
    if both args and subthings are empty.

    Note: we don't yet know if the ThingData we return will end up as a subobject
    or an option-value-setting of its parent... the parent's init method will call
    thingdata.maybe_set_self_as_option_in(<the parent's options dict>)
    to make and execute that decision.
    """
    if not (args or subthings):
        print("warning: \"%s:\" with no args or subthings" % (name,))
    return ThingData(name, args, subthings)

# == test code (might not be up to date)

if __name__ == '__main__':
    # Test script: tokenize one description file, parse it with the Whole
    # grammar, and print what we got (or the error message).

    from pprint import pprint

    ## filename = "testui.txt"
    filename = "../plugins/CoNTub/HJ-params.desc"

    # Read all the tokens right away, so we can be sure to close the file
    # (even if tokenizing fails).
    desc_file = open(filename, 'rU')
    try:
        tokens = list(generate_tokens(desc_file.readline))
    finally:
        desc_file.close()

    # pprint( tokens ) # works

    if 0: # works
        # print a sorted summary of the tokens
        summary = []

        for token in tokens:
            toktype, tokstring = token[0], token[1]
            summary.append( (toktype, tok_name[toktype], tokstring) )

        summary.sort()
        pprint(summary)

    res, newrest = parse_top(Whole, tokens)
    print("%d chars in res" % len(repr(res))) #3924
    print(res) # might be an error message
    if newrest is not None:
        # parsing succeeded, so res is a list of ThingData
        print(newrest)
        if res:
            # (pprint does its own printing and returns None, so don't print its value)
            res[0].pprint() #k

    print("test done")

# that might be working... now what?
# the language has an ambiguity... exprhead:args {moreargs}, vs option:val meaning option=val.
# we'll turn things into objects, recognize some subthings as those objects and others as assignments (perhaps with decorations).
# (or, we'll decide that all assignments use '=' not ':'. Tried it... decided too hard to casually write the file this way.)

#### possible bugs:
# - I never tried a negative or explicit-positive number -- now, negative works, explicit-positive doesn't but should ###
# - Won't work for args sep by comma or in parens (doesn't yet matter)

# end