1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
|
#!/usr/bin/python
# coding=UTF-8
# Copyright (C) 2010 Christian Siefkes <christian@siefkes.net>.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Tangible Data Format (TDF) is a lightweight data serialization format, see
doc/formats for a description. The API for reading and writing TDF is similar to
the Python standard modules marshal, pickle, and json.
"""
import cStringIO
import re
import sys
# --- Lexical constants of the TDF syntax ---
# Introduces a simple (atomic) list item.
_SIMPLE_ITEM_MARKER = u"-"
# Introduces a complex (compound) list item.
_COMPLEX_ITEM_MARKER = u"+"
# Separates key and value in a simple pair (value is an atom).
_SIMPLE_MAP_SEP = u":"
# Separates key and value in a complex pair (value is a compound).
_COMPLEX_MAP_SEP = u"::"
# Opens an inline list.
_INLINE_LIST_START = u"{"
# Closes an inline list.
_INLINE_LIST_END = u"}"
# Separates items in inline lists.
_INLINE_LIST_SEP = u","
# Opens a bracket key (section header of a complex map).
_BRACKET_KEY_START = u"["
# Closes a bracket key.
_BRACKET_KEY_END = u"]"
# Introduces a comment.
_COMMENT_START = u"#"
# The escape character.
_ESCAPE_CHAR = u"\\"
# Literal true.
_TRUE = u"true"
# Literal false.
_FALSE = u"false"
# Literal null.
_NULL = u"null"
# Characters making up inline whitespace (space and tab).
_INLINE_WS = u" \t"
# --- Precompiled patterns used by the parser ---
# Pattern matching a newline followed (via lookahead) by a printable
# character, i.e. the boundary between two items in a block.
_NL_BEFORE_PRINT = re.compile(ur"\n(?=\S)")
# Pattern string matching whitespace (used as a building block below).
_WS = ur"\s+"
# Pattern matching the start of an inline comment (preceded by whitespace).
_INLINE_COMMENT = re.compile(_WS + re.escape(_COMMENT_START))
# Pattern matching the start of a simple list item ("-" plus whitespace).
_SIMPLE_ITEM_START = re.compile(re.escape(_SIMPLE_ITEM_MARKER) + _WS)
# Pattern matching the start of a complex list item ("+" plus whitespace).
_COMPLEX_ITEM_START = re.compile(re.escape(_COMPLEX_ITEM_MARKER) + _WS)
# Pattern matching a key in a pair: can contain escape sequences and any
# unescaped characters except ":".
_KEY = re.compile(ur"([^:\\]|\\.)*", re.DOTALL)
# Pattern matching an atomic inline list item: can contain escape sequences
# and any unescaped characters except ",{}".
_INLINE_LIST_ATOM = re.compile(ur"([^,{}\\]|\\.)*", re.DOTALL)
# Pattern matching an escape sequence (backslash followed by any other char).
_ESCAPE_SEQ = re.compile(ur"\\(.)", re.DOTALL)
# Pattern matching an integer number.
# NOTE(review): no leading sign is accepted, so e.g. "-5" parses as a
# string -- confirm against the TDF format description in doc/formats.
_INTEGER = re.compile(ur"^(0|[1-9][0-9]*)$")
# Pattern matching a float.
# NOTE(review): only the nan/inf alternative accepts a sign; signed
# ordinary floats fall through to string parsing -- confirm intended.
_FLOAT = re.compile(ur"^([0-9]+\.?[0-9]*([eE][+-]?[0-9]+)?|"
    "\.[0-9]+([eE][+-]?[0-9]+)?|"
    "[+-]?(nan|inf))$")
### Public API ###
def dump(obj, fp, encoding = u"UTF-8", comment = None, complexMap = True,
        indentPrefix = u"  "):
    """
    Serialize *obj* as a TDF formatted stream to *fp* (any object with a
    `write` method).

    *obj* must be a list, tuple, set, or frozenset (all serialized as
    lists) or a dict (serialized as a map); nested compounds plus atomic
    values ([unicode] strings, int/long/float numbers, Booleans, and None)
    are allowed inside.

    *encoding* is applied to the serialized text before writing; pass None
    to write unencoded unicode. The optional *comment* is emitted before
    the object itself. If *complexMap* is True, the outermost dict may be
    written as a complex (Win-INI-style) map; inner maps never are.
    *indentPrefix* (spaces and tabs only, two spaces by default) is the
    whitespace string prepended per indentation level.

    A `ValueError` is thrown if the *indentPrefix* is invalid.
    """
    # All real work happens in the recursive serialization helper
    return _doDump(obj, fp, encoding, comment, complexMap, indentPrefix)
def dumps(obj, comment = None, complexMap = True, indentPrefix = u"  "):
    """
    Serialize *obj* to TDF and return the result as a Unicode string.

    The optional *comment* is serialized before the start of the object.
    If *complexMap* is True, the outmost object is serialized as a complex
    (Win-INI-style) map if possible; inner maps never are. *indentPrefix*
    (spaces and tabs only, two spaces by default) is the whitespace string
    prepended per indentation level.

    A `ValueError` is thrown if the *indentPrefix* is invalid.
    """
    # Buffer the output in memory; encoding None keeps the buffer content
    # as unencoded unicode text throughout
    buf = cStringIO.StringIO()
    dump(obj, buf, None, comment, complexMap, indentPrefix)
    return buf.getvalue()
def load(fp, encoding = u"UTF-8"):
    """
    Return a list or dict deserialized from *fp*, which may be either a
    file-like object with a `read` method containing a TDF document, or the
    name of a file to read (opened with universal newline support).

    *encoding* specifies the character encoding used to decode the raw
    input; if set to None, no decoding is performed.

    Throws a `ValueError` if the TDF document is malformed.
    """
    # BUGFIX: the docstring promised a file-like object, but the code only
    # accepted file names (it called open() on the argument). Accept both,
    # keeping path-string callers (e.g. the __main__ smoke test) working.
    if hasattr(fp, "read"):
        # File-like object: read it directly; the caller remains
        # responsible for closing it (mirrors json.load / pickle.load)
        text = fp.read()
    else:
        # Treat *fp* as a file name (universal newline support)
        infile = open(fp, "rU")
        try:
            text = infile.read()
        finally:
            infile.close()
    if encoding:
        text = text.decode(encoding)
    # Delegate to loads
    return loads(text)
def loads(s):
    """
    Deserialize the TDF document contained in *s* (a `str` or `unicode`
    instance) and return the resulting list or dict.

    Throws a `ValueError` if the TDF document is malformed.
    """
    # All real work happens in the parsing helper
    return _doLoads(s)
### Private helper functions ###
def _doDump(obj, fp, encoding, comment, complexMap, indentPrefix,
        currentIndent = u""):
    """
    Helper function that serializes *obj* as a TDF formatted stream to *fp*.

    *currentIndent* is the accumulated whitespace prefix of the current
    nesting level (empty at the top level); the remaining parameters have
    the same meaning as in `dump`.

    NOTE(review): serialization of lists and of normal (non-complex) maps
    is still unimplemented -- see the TODOs below. Only top-level complex
    maps and nested atoms are currently written.
    """
    # Check that indentPrefix contains only spaces+tabs and is not empty
    if indentPrefix == u"":
        raise ValueError("Indent prefix is empty")
    elif indentPrefix.strip(_INLINE_WS) != u"":
        raise ValueError("Indent prefix contains invalid characters (only "
            "spaces and tabs are allowed): %r" % indentPrefix.strip(_INLINE_WS))
    if comment:
        # Serialize the comment at the begin of the stream, one
        # "# ..." line per input line
        commentlines = comment.splitlines()
        for line in commentlines:
            _writeEncoded(u"%s %s\n" % (_COMMENT_START, line), fp, encoding)
    # Determine type of object
    if isinstance(obj, dict):
        # Serialize dictionary (possibly as complex map)
        if complexMap and currentIndent == u"" and _isComplexMap(obj):
            # Serialize as a complex map (only possible at the top level)
            for k, v in obj.iteritems():
                _dumpComplexKey(k, fp, encoding, indentPrefix, currentIndent)
                # Call myself recursively without increasing the indent
                _doDump(v, fp, encoding, None, complexMap, indentPrefix,
                    currentIndent)
        else:
            # TODO Serialize as a normal map
            for k, v in obj.iteritems():
                pass
            # TODO For lists and maps, call myself recursively, setting
            # currentIndent += indentPrefix.
    elif _isListLike(obj):
        # TODO Serialize list (possibly as an inline list)
        pass
    else:
        # It must be an atom -- atoms are not allowed as the outermost
        # object of a TDF document
        if currentIndent == u"":
            raise TypeError(u"TDF document must be a list or map: %r" % obj)
        _writeEncoded(u"%s" % currentIndent, fp, encoding)
        _dumpAtom(obj, fp, encoding, indentPrefix, currentIndent)
        _writeEncoded(u"\n", fp, encoding)
def _doLoads(s, commentsStripped = False):
    """
    Deserialize *s* into a list or dict (the helper behind `loads`).

    Set *commentsStripped* to True if comment lines and inline comments
    have already been removed from the document.
    """
    if not commentsStripped:
        # Drop whole-line comments and inline comments (the latter must be
        # preceded by whitespace) before any structural parsing happens
        kept = []
        for line in s.splitlines():
            if line.strip().startswith(_COMMENT_START):
                # Whole-line comment: skip it entirely
                continue
            match = _INLINE_COMMENT.search(line)
            if match:
                # Cut off the inline comment and its preceding whitespace
                line = line[:match.start()]
            kept.append(line)
        s = u"\n".join(kept)
    # Dispatch on the first construct found in the document
    if _SIMPLE_ITEM_START.match(s) or _COMPLEX_ITEM_START.match(s):
        return _parseBlockList(s)
    if s.startswith(_INLINE_LIST_START):
        result, rest = _parseInlineList(s)
        if rest:
            raise ValueError(u"Unexpected data after end of inline list: %s" % \
                rest)
        return result
    return _parseMap(s)
def _dumpAtom(atom, fp, encoding, indentPrefix, currentIndent):
    """
    Dump an *atom*. If the atom is a string that spans multiple lines, the
    continuation lines are prefixed by *currentIndent* + *indentPrefix*.
    The first line of the atom is NOT indented; neither is a newline or
    other terminator written after the atom.

    A `TypeError` is thrown if *atom* is not an atomic value (string,
    number, Boolean, or None).
    """
    # BUGFIX: Booleans must be tested BEFORE the number check, because
    # bool is a subclass of int -- the original code serialized True/False
    # through str() as "True"/"False" instead of the TDF literals.
    if isinstance(atom, bool):
        _writeEncoded(_TRUE if atom else _FALSE, fp, encoding)
    elif isinstance(atom, basestring):
        # TODO Serialize string, handling indentation + escapes
        pass
    elif isinstance(atom, (int, long, float)):
        # Serialize number
        _writeEncoded(str(atom), fp, encoding)
    elif atom is None:
        # Serialize null
        _writeEncoded(_NULL, fp, encoding)
    else:
        # Anything else is not a valid TDF atom
        raise TypeError(u"Not a valid atom: %r" % atom)
def _dumpComplexKey(k, fp, encoding, indentPrefix, currentIndent):
    """
    Serialize the bracket key of a complex-map section, e.g. ``[name]``,
    followed by a newline.
    """
    opener = u"%s%s" % (currentIndent, _BRACKET_KEY_START)
    _writeEncoded(opener, fp, encoding)
    # The key itself is serialized like any other atom
    _dumpAtom(k, fp, encoding, indentPrefix, currentIndent)
    _writeEncoded(_BRACKET_KEY_END + u"\n", fp, encoding)
def _isListLike(obj):
    """
    Return True iff *obj* is one of the list-like container types accepted
    by the serializer: list, tuple, set, or frozenset.
    """
    return isinstance(obj, (list, tuple, set, frozenset))
def _isComplexMap(d):
    """
    Test whether the dictionary *d* is suitable for serialization as a
    complex (Win-INI-style) map. Return True iff every value is a compound
    (list-like or dict) and no list-like value is empty.
    """
    for val in d.values():
        if _isListLike(val):
            # List-like values are OK provided they aren't empty
            if len(val) == 0:
                return False
        elif not isinstance(val, dict):
            # Dict values are OK; atomic values disqualify the map.
            # BUGFIX: the original tested the undefined name "obj" here,
            # raising NameError for any non-compound value.
            return False
    # All values passed the test
    return True
def _parseAtom(s):
    """
    Deserialize a single atom (literal, number, or string) from *s*.
    """
    stripped = s.strip()
    # Literals first
    if stripped == _TRUE:
        return True
    if stripped == _FALSE:
        return False
    if stripped == _NULL:
        return None
    # Whole number? int() cannot fail on text the pattern accepted.
    if _INTEGER.match(stripped):
        return int(stripped)
    # Floating point number (incl. nan/inf)? float() cannot fail either.
    if _FLOAT.match(stripped):
        return float(stripped)
    # Everything else is treated as a string (note: the UNstripped input is
    # passed on, since string parsing handles its own whitespace rules)
    return _parseString(s)
def _parseCompound(s):
    """
    Deserialize a compound (list or map) from *s*, first normalizing the
    indentation of its continuation lines.
    """
    # Comments were already removed by the top-level pass, hence True
    return _doLoads(_removeCommonWhitespacePrefix(s), True)
def _parseBlockList(s):
    """
    Deserialize a block list from the string *s*, one element per item.
    """
    return [_parseBlockListItem(item) for item in _splitItems(s)]
def _parseBlockListItem(s):
    """
    Deserialize one block list item from *s*: a simple item marker
    introduces an atom, a complex item marker introduces a nested compound.
    """
    match = _SIMPLE_ITEM_START.match(s)
    if match:
        # Simple item: drop the marker, parse the remainder as an atom
        return _parseAtom(s[match.end():])
    match = _COMPLEX_ITEM_START.match(s)
    if match:
        # Complex item: drop the marker, parse the remainder as a compound
        return _parseCompound(s[match.end():])
    raise ValueError(u"Malformed block list item: %r" % s)
def _parseComplexMap(items):
    """
    Return a complex (Win-INI-style) map deserialized from the list of
    *items* (which must not be empty).

    Each bracket key such as ``[name]`` opens a new section; the following
    items are collected as that section's value, either as a list (block
    list items) or as a map (key/value pairs). A section without values
    becomes an empty map. A `ValueError` is raised for a duplicate key, a
    section mixing list items with pairs, or a document whose first item is
    not a bracket key.
    """
    result = {}
    first = True
    key = None
    values = None
    for item in items:
        stripped = item.strip()
        isKey = stripped.startswith(_BRACKET_KEY_START) and \
            stripped.endswith(_BRACKET_KEY_END)
        # The very first item must open a section
        if first and not isKey:
            raise ValueError(
                u"First item of complex map must be a bracket key: %s" % item)
        if isKey:
            # BUGFIX: the original cleared the "first" flag unconditionally
            # BEFORE this branch, so the first bracket key tried to store
            # values for a not-yet-defined key and raised NameError.
            if first:
                first = False
            else:
                # Store the values collected for the previous key
                if not values:
                    # Zero values represent an empty map (not an empty list)
                    values = {}
                if key in result:
                    raise ValueError(
                        u"Complex map contains the key [%s] twice" % key)
                result[key] = values
                values = None
            # Strip the brackets and parse the new key
            key = _parseAtom(stripped[1:-1])
        else:
            # Parse a value belonging to the current section
            if _SIMPLE_ITEM_START.match(stripped) or \
                    _COMPLEX_ITEM_START.match(stripped):
                # The section's values form a list
                v = _parseBlockListItem(item)
                if not values:
                    values = []
                elif isinstance(values, dict):
                    raise ValueError(
                        u"Cannot mix list items and key/value pairs in the "
                        u"same section [%s] of a complex map: %s" % (key, item))
                values.append(v)
            else:
                # The section's values form a map
                k, v = _parsePair(item)
                if not values:
                    values = {}
                elif isinstance(values, list):
                    raise ValueError(
                        u"Cannot mix list items and key/value pairs in the "
                        u"same section [%s] of a complex map: %s" % (key, item))
                values[k] = v
    # Store the final key/value pair and return
    if not values:
        # Zero values represent an empty map (not an empty list)
        values = {}
    if key in result:
        raise ValueError(
            u"Complex map contains the key [%s] twice" % key)
    result[key] = values
    return result
def _parseInlineList(s):
    """
    Parse an inline list deserialized from a string *s*. Returns two
    values: the *parsed* list (a list object) as well as a *rest* (a string
    object) if there is non-whitespace content in the string after the end
    of the parsed list--otherwise *rest* will be the empty string.

    Throws a `ValueError` if *s* is not a well-formed inline list.
    """
    stripped = s.strip()
    if not stripped.startswith(_INLINE_LIST_START):
        raise ValueError("Not a valid inline list: %s" % stripped)
    # Remove opening delimiter and surrounding whitespace
    content = stripped[len(_INLINE_LIST_START):]
    content = content.strip()
    reachedEOL = False
    parsed = []
    # Find and process the next item, until content is empty
    while content and not reachedEOL:
        # BUGFIX: the original tested "stripped" here (which ALWAYS starts
        # with "{"), so every list triggered bogus recursion and failed.
        if content.startswith(_INLINE_LIST_START):
            # We found an inner list: call this function recursively
            inner, rest = _parseInlineList(content)
            parsed.append(inner)
            content = rest
        else:
            # Pattern is guaranteed to match (maybe the empty string)
            rawItem = _INLINE_LIST_ATOM.match(content).group()
            item = _parseAtom(rawItem)
            # BUGFIX: the original appended the undefined name "inner"
            # here instead of the just-parsed item (NameError)
            parsed.append(item)
            # Remove parsed item
            content = content[len(rawItem):]
        if content.startswith(_INLINE_LIST_SEP):
            # Strip delimiter and any following whitespace
            content = content[len(_INLINE_LIST_SEP):].lstrip()
        elif content.startswith(_INLINE_LIST_END):
            # Looks like we're done
            reachedEOL = True
            # Strip delimiter and any following whitespace
            content = content[len(_INLINE_LIST_END):].lstrip()
        else:
            raise ValueError(u"Invalid characters in inline list: %s" % content)
    # Return parsed items and the remaining content, if any
    return parsed, content
def _parseMap(s):
    """
    Deserialize a (simple or complex) map from the string *s*.
    """
    items = _splitItems(s)
    if items and items[0].startswith(_BRACKET_KEY_START):
        # A leading bracket key signals a complex (Win-INI-style) map
        return _parseComplexMap(items)
    # Otherwise every item is a plain key/value pair
    result = {}
    for item in items:
        key, value = _parsePair(item)
        if key in result:
            raise ValueError(u"Map contains the key %r twice" % key)
        result[key] = value
    return result
def _parsePair(s):
    """
    Deserialize a single (key, value) tuple from the string *s*.
    """
    # The key pattern always matches something (possibly the empty string):
    # everything up to the first unescaped ":"
    rawKey = _KEY.match(s).group()
    key = _parseAtom(rawKey)
    rest = s[len(rawKey):]
    # "::" must be checked before ":" since it starts with the same char
    if rest.startswith(_COMPLEX_MAP_SEP):
        # Value is a compound
        value = _parseCompound(rest[len(_COMPLEX_MAP_SEP):])
    elif rest.startswith(_SIMPLE_MAP_SEP):
        # Value is an atom
        value = _parseAtom(rest[len(_SIMPLE_MAP_SEP):])
    else:
        raise ValueError(u"Not a valid pair: %s" % s)
    return key, value
def _parseString(s):
    """
    Return a string deserialized from *s*, unindenting continuation lines
    and unescaping escaped characters as necessary. A single (unescaped)
    trailing backslash protects one character of final whitespace, or
    stands for a newline if the string has no trailing whitespace at all.
    """
    # Strip leading whitespace and unindent continuation lines
    s = _removeCommonWhitespacePrefix(s)
    # Strip final whitespace, if any
    stripped = s.rstrip()
    # Experimentally strip final backslashes, if any
    backslashStripped = stripped.rstrip(_ESCAPE_CHAR)
    lenDiff = len(stripped) - len(backslashStripped)
    if lenDiff % 2 == 1:
        # stripped ends in an odd number of backslashes:
        # the last of them escapes the following whitespace
        if s != stripped:
            # Strip the backslash and re-add the first whitespace character.
            # The backslash sits at index len(stripped) of *s* after the
            # slice, so the protected whitespace char is one position later.
            # BUGFIX: the original indexed s[lenStripped:lenStripped+1],
            # which re-added the backslash itself instead of the whitespace.
            stripped = stripped[0:-1]
            lenStripped = len(stripped)
            stripped += s[lenStripped + 1:lenStripped + 2]
        else:
            # No trailing whitespace: the backslash denotes a newline
            stripped = stripped[0:-1] + "\n"
    # Handle other escape sequences (removing the backslash)
    stripped = _ESCAPE_SEQ.sub(u"\\1", stripped)
    return stripped
def _removeCommonWhitespacePrefix(s):
    """
    Strip a uniform amount of indentation from the second and further lines of
    the string *s*, equal to the minimum indentation of all non-blank lines
    after the first line. Any indentation in the first line (i.e., up to the
    first newline) is insignificant and removed. Relative indentation of later
    lines is retained. Blank lines are removed from the beginning and end
    of the string.

    Raises a `ValueError` if some line's indentation shares no common
    prefix with the indentation seen so far (e.g. tabs vs. spaces).

    NOTE(review): assumes *s* contains at least one line (``lines[0]`` is
    accessed unconditionally) -- an empty string raises IndexError; confirm
    against the callers.

    The algorithm is based on the "Handling Docstring Indentation"
    algorithm from `PEP 257
    <http://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation>`_.
    """
    # Split into a list of lines:
    lines = s.splitlines()
    # Determine common whitespace prefix (first line doesn't count):
    commonPrefix = None
    for line in lines[1:]:
        stripped = line.lstrip(_INLINE_WS)
        strippedChars = len(line) - len(stripped)
        linePrefix = line[0:strippedChars]
        # Blank lines don't influence the common prefix
        if stripped:
            if commonPrefix == None:
                # First non-blank line determines the initial common prefix
                commonPrefix = linePrefix
            else:
                if len(commonPrefix) > len(linePrefix):
                    # Ensure that linePrefix is a prefix of commonPrefix
                    if commonPrefix.startswith(linePrefix):
                        # Shorten common prefix
                        commonPrefix = linePrefix
                    else:
                        raise ValueError(
                            u"Indentation error: Initial whitespace of line %r "
                            u"is incompatible with preceding indentation %r" % \
                            (line, commonPrefix))
                else:
                    # Ensure that commonPrefix is a prefix of linePrefix
                    if not linePrefix.startswith(commonPrefix):
                        raise ValueError(
                            u"Indentation error: Initial whitespace of line %r "
                            u"is incompatible with preceding indentation %r" % \
                            (line, commonPrefix))
    # Remove indentation (first line is special: fully left-stripped)
    trimmed = [lines[0].lstrip(_INLINE_WS)]
    # Skip this if commonPrefix is None (no continuation lines)
    if commonPrefix != None:
        indent = len(commonPrefix)
        for line in lines[1:]:
            trimmed.append(line[indent:])
    # Strip off trailing and leading blank lines:
    while trimmed and not trimmed[-1]:
        trimmed.pop()
    while trimmed and not trimmed[0]:
        trimmed.pop(0)
    # Return a single string:
    return u'\n'.join(trimmed)
def _splitItems(s):
    """
    Split the string *s* into a list of items and return them. A new item
    begins at every newline that is immediately followed by a printable
    character; lines starting with whitespace are continuation lines and
    stay part of the preceding item. Empty items are discarded.
    """
    return [item for item in _NL_BEFORE_PRINT.split(s) if item]
def _writeEncoded(s, fp, encoding = None):
    """
    Write the string *s* to the file-like object *fp*, encoding it with the
    given character *encoding* first. If *encoding* is None the string is
    written as-is.
    """
    if encoding is None:
        fp.write(s)
    else:
        fp.write(s.encode(encoding))
# TODO Temporary test code
# Smoke test: parse a sample TDF document and print the resulting object.
# (Python 2 print statement; the paths are relative to the working dir.)
if __name__ == "__main__" :
    ##print load(u"../sample/build/com.tangiblebit.drawdio.buildit/process.tb")
    ##print load(u"../sample/objects/com.tangiblebit.drawdio/object.tb")
    print load(u"../sample/use/com.tangiblebit.drawdio.use/process.tb")
|