1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
|
#!/usr/bin/python
# coding=UTF-8
# Copyright (C) 2010 Christian Siefkes <christian@siefkes.net>.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Tangible Data Format (TDF) is a lightweight data serialization format, see
doc/formats for a description. The API for reading and writing TDF is similar to
the Python standard modules marshal, pickle, and json.
"""
import cStringIO
import re
import sys
# --- Lexical constants of the TDF syntax ---
# Introduces a simple (atomic) list item.
_SIMPLE_ITEM_MARKER = u"-"
# Introduces a complex (compound) list item.
_COMPLEX_ITEM_MARKER = u"+"
# Separates key and value in a simple pair (value is an atom).
_SIMPLE_MAP_SEP = u":"
# Separates key and value in a complex pair (value is a compound).
_COMPLEX_MAP_SEP = u"::"
# Opens an inline list.
_INLINE_LIST_START = u"{"
# Closes an inline list.
_INLINE_LIST_END = u"}"
# Separates items in inline lists.
_INLINE_LIST_SEP = u","
# Opens a bracket key (section header of a complex map).
_BRACKET_KEY_START = u"["
# Closes a bracket key.
_BRACKET_KEY_END = u"]"
# Introduces a comment.
_COMMENT_START = u"#"
# The escape character.
_ESCAPE_CHAR = u"\\"
# Literal true.
_TRUE = u"true"
# Literal false.
_FALSE = u"false"
# Literal null.
_NULL = u"null"
# Characters making up inline whitespace (space and tab).
_INLINE_WS = u" \t"
# --- Precompiled patterns used by the parser ---
# Pattern matching a newline followed (via lookahead) by a printable
# character, i.e. the boundary between two items in a block.
_NL_BEFORE_PRINT = re.compile(ur"\n(?=\S)")
# Pattern string matching whitespace (used as a building block below).
_WS = ur"\s+"
# Pattern matching the start of an inline comment (preceded by whitespace).
_INLINE_COMMENT = re.compile(_WS + re.escape(_COMMENT_START))
# Pattern matching the start of a simple list item ("-" plus whitespace).
_SIMPLE_ITEM_START = re.compile(re.escape(_SIMPLE_ITEM_MARKER) + _WS)
# Pattern matching the start of a complex list item ("+" plus whitespace).
_COMPLEX_ITEM_START = re.compile(re.escape(_COMPLEX_ITEM_MARKER) + _WS)
# Pattern matching a key in a pair: can contain escape sequences and any
# unescaped characters except ":".
_KEY = re.compile(ur"([^:\\]|\\.)*", re.DOTALL)
# Pattern matching an atomic inline list item: can contain escape sequences
# and any unescaped characters except ",{}".
_INLINE_LIST_ATOM = re.compile(ur"([^,{}\\]|\\.)*", re.DOTALL)
# Pattern matching an escape sequence (backslash followed by any other char).
_ESCAPE_SEQ = re.compile(ur"\\(.)", re.DOTALL)
# Pattern matching an integer number.
# NOTE(review): no leading sign is accepted, so e.g. "-5" parses as a
# string -- confirm against the TDF format description in doc/formats.
_INTEGER = re.compile(ur"^(0|[1-9][0-9]*)$")
# Pattern matching a float.
# NOTE(review): only the nan/inf alternative accepts a sign; signed
# ordinary floats fall through to string parsing -- confirm intended.
_FLOAT = re.compile(ur"^([0-9]+\.?[0-9]*([eE][+-]?[0-9]+)?|"
    "\.[0-9]+([eE][+-]?[0-9]+)?|"
    "[+-]?(nan|inf))$")
### Public API ###
def dump(obj, fp, encoding = u"UTF-8", comment = None, complexMap = True,
        indentPrefix = u"  "):
    """
    Serialize *obj* as a TDF formatted stream to *fp* (any object with a
    `write` method).

    *obj* must be a list, tuple, set, or frozenset (all serialized as
    lists) or a dict (serialized as a map); nested compounds plus atomic
    values ([unicode] strings, int/long/float numbers, Booleans, and None)
    are allowed inside.

    *encoding* is applied to the serialized text before writing; pass None
    to write unencoded unicode. The optional *comment* is emitted before
    the object itself. If *complexMap* is True, the outermost dict may be
    written as a complex (Win-INI-style) map; inner maps never are.
    *indentPrefix* (spaces and tabs only, two spaces by default) is the
    whitespace string prepended per indentation level.

    A `ValueError` is thrown if the *indentPrefix* is invalid.
    """
    # All real work happens in the recursive serialization helper
    return _doDump(obj, fp, encoding, comment, complexMap, indentPrefix)
def dumps(obj, comment = None, complexMap = True, indentPrefix = u"  "):
    """
    Serialize *obj* to TDF and return the result as a Unicode string.

    The optional *comment* is serialized before the start of the object.
    If *complexMap* is True, the outmost object is serialized as a complex
    (Win-INI-style) map if possible; inner maps never are. *indentPrefix*
    (spaces and tabs only, two spaces by default) is the whitespace string
    prepended per indentation level.

    A `ValueError` is thrown if the *indentPrefix* is invalid.
    """
    # Buffer the output in memory; encoding None keeps the buffer content
    # as unencoded unicode text throughout
    buf = cStringIO.StringIO()
    dump(obj, buf, None, comment, complexMap, indentPrefix)
    return buf.getvalue()
def load(fp, encoding = u"UTF-8"):
    """
    Return a list or dict deserialized from *fp*, which may be either a
    file-like object with a `read` method containing a TDF document, or the
    name of a file to read (opened with universal newline support).

    *encoding* specifies the character encoding used to decode the raw
    input; if set to None, no decoding is performed.

    Throws a `ValueError` if the TDF document is malformed.
    """
    # BUGFIX: the docstring promised a file-like object, but the code only
    # accepted file names (it called open() on the argument). Accept both,
    # keeping path-string callers (e.g. the __main__ smoke test) working.
    if hasattr(fp, "read"):
        # File-like object: read it directly; the caller remains
        # responsible for closing it (mirrors json.load / pickle.load)
        text = fp.read()
    else:
        # Treat *fp* as a file name (universal newline support)
        infile = open(fp, "rU")
        try:
            text = infile.read()
        finally:
            infile.close()
    if encoding:
        text = text.decode(encoding)
    # Delegate to loads
    return loads(text)
def loads(s):
    """
    Deserialize the TDF document contained in *s* (a `str` or `unicode`
    instance) and return the resulting list or dict.

    Throws a `ValueError` if the TDF document is malformed.
    """
    # All real work happens in the parsing helper
    return _doLoads(s)
### Private helper functions ###
def _doDump(obj, fp, encoding, comment, complexMap, indentPrefix,
        currentIndent = u""):
    """
    Helper function that serializes *obj* as a TDF formatted stream to *fp*.

    *currentIndent* is the accumulated whitespace prefix of the current
    nesting level (empty at the top level); the remaining parameters have
    the same meaning as in `dump`.

    NOTE(review): serialization of lists and of normal (non-complex) maps
    is still unimplemented -- see the TODOs below. Only top-level complex
    maps and nested atoms are currently written.
    """
    # Check that indentPrefix contains only spaces+tabs and is not empty
    if indentPrefix == u"":
        raise ValueError("Indent prefix is empty")
    elif indentPrefix.strip(_INLINE_WS) != u"":
        raise ValueError("Indent prefix contains invalid characters (only "
            "spaces and tabs are allowed): %r" % indentPrefix.strip(_INLINE_WS))
    if comment:
        # Serialize the comment at the begin of the stream, one
        # "# ..." line per input line
        commentlines = comment.splitlines()
        for line in commentlines:
            _writeEncoded(u"%s %s\n" % (_COMMENT_START, line), fp, encoding)
    # Determine type of object
    if isinstance(obj, dict):
        # Serialize dictionary (possibly as complex map)
        if complexMap and currentIndent == u"" and _isComplexMap(obj):
            # Serialize as a complex map (only possible at the top level)
            for k, v in obj.iteritems():
                _dumpComplexKey(k, fp, encoding, indentPrefix, currentIndent)
                # Call myself recursively without increasing the indent
                _doDump(v, fp, encoding, None, complexMap, indentPrefix,
                    currentIndent)
        else:
            # TODO Serialize as a normal map
            for k, v in obj.iteritems():
                pass
            # TODO For lists and maps, call myself recursively, setting
            # currentIndent += indentPrefix.
    elif _isListLike(obj):
        # TODO Serialize list (possibly as an inline list)
        pass
    else:
        # It must be an atom -- atoms are not allowed as the outermost
        # object of a TDF document
        if currentIndent == u"":
            raise TypeError(u"TDF document must be a list or map: %r" % obj)
        _writeEncoded(u"%s" % currentIndent, fp, encoding)
        _dumpAtom(obj, fp, encoding, indentPrefix, currentIndent)
        _writeEncoded(u"\n", fp, encoding)
def _doLoads(s, commentsStripped = False):
    """
    Deserialize *s* into a list or dict (the helper behind `loads`).

    Set *commentsStripped* to True if comment lines and inline comments
    have already been removed from the document.
    """
    if not commentsStripped:
        # Drop whole-line comments and inline comments (the latter must be
        # preceded by whitespace) before any structural parsing happens
        kept = []
        for line in s.splitlines():
            if line.strip().startswith(_COMMENT_START):
                # Whole-line comment: skip it entirely
                continue
            match = _INLINE_COMMENT.search(line)
            if match:
                # Cut off the inline comment and its preceding whitespace
                line = line[:match.start()]
            kept.append(line)
        s = u"\n".join(kept)
    # Dispatch on the first construct found in the document
    if _SIMPLE_ITEM_START.match(s) or _COMPLEX_ITEM_START.match(s):
        return _parseBlockList(s)
    if s.startswith(_INLINE_LIST_START):
        result, rest = _parseInlineList(s)
        if rest:
            raise ValueError(u"Unexpected data after end of inline list: %s" % \
                rest)
        return result
    return _parseMap(s)
def _dumpAtom(atom, fp, encoding, indentPrefix, currentIndent):
    """
    Dump an *atom*. If the atom is a string that spans multiple lines, the
    continuation lines are prefixed by *currentIndent* + *indentPrefix*.
    The first line of the atom is NOT indented; neither is a newline or
    other terminator written after the atom.

    A `TypeError` is thrown if *atom* is not an atomic value (string,
    number, Boolean, or None).
    """
    # BUGFIX: Booleans must be tested BEFORE the number check, because
    # bool is a subclass of int -- the original code serialized True/False
    # through str() as "True"/"False" instead of the TDF literals.
    if isinstance(atom, bool):
        _writeEncoded(_TRUE if atom else _FALSE, fp, encoding)
    elif isinstance(atom, basestring):
        # TODO Serialize string, handling indentation + escapes
        pass
    elif isinstance(atom, (int, long, float)):
        # Serialize number
        _writeEncoded(str(atom), fp, encoding)
    elif atom is None:
        # Serialize null
        _writeEncoded(_NULL, fp, encoding)
    else:
        # Anything else is not a valid TDF atom
        raise TypeError(u"Not a valid atom: %r" % atom)
def _dumpComplexKey(k, fp, encoding, indentPrefix, currentIndent):
    """
    Serialize the bracket key of a complex-map section, e.g. ``[name]``,
    followed by a newline.
    """
    opener = u"%s%s" % (currentIndent, _BRACKET_KEY_START)
    _writeEncoded(opener, fp, encoding)
    # The key itself is serialized like any other atom
    _dumpAtom(k, fp, encoding, indentPrefix, currentIndent)
    _writeEncoded(_BRACKET_KEY_END + u"\n", fp, encoding)
def _isListLike(obj):
    """
    Return True iff *obj* is one of the list-like container types accepted
    by the serializer: list, tuple, set, or frozenset.
    """
    return isinstance(obj, (list, tuple, set, frozenset))
def _isComplexMap(d):
    """
    Test whether the dictionary *d* is suitable for serialization as a
    complex (Win-INI-style) map. Return True iff every value is a compound
    (list-like or dict) and no list-like value is empty.
    """
    for val in d.values():
        if _isListLike(val):
            # List-like values are OK provided they aren't empty
            if len(val) == 0:
                return False
        elif not isinstance(val, dict):
            # Dict values are OK; atomic values disqualify the map.
            # BUGFIX: the original tested the undefined name "obj" here,
            # raising NameError for any non-compound value.
            return False
    # All values passed the test
    return True
def _parseAtom(s):
    """
    Deserialize a single atom (literal, number, or string) from *s*.
    """
    stripped = s.strip()
    # Literals first
    if stripped == _TRUE:
        return True
    if stripped == _FALSE:
        return False
    if stripped == _NULL:
        return None
    # Whole number? int() cannot fail on text the pattern accepted.
    if _INTEGER.match(stripped):
        return int(stripped)
    # Floating point number (incl. nan/inf)? float() cannot fail either.
    if _FLOAT.match(stripped):
        return float(stripped)
    # Everything else is treated as a string (note: the UNstripped input is
    # passed on, since string parsing handles its own whitespace rules)
    return _parseString(s)
def _parseCompound(s):
    """
    Deserialize a compound (list or map) from *s*, first normalizing the
    indentation of its continuation lines.
    """
    # Comments were already removed by the top-level pass, hence True
    return _doLoads(_removeCommonWhitespacePrefix(s), True)
def _parseBlockList(s):
    """
    Deserialize a block list from the string *s*, one element per item.
    """
    return [_parseBlockListItem(item) for item in _splitItems(s)]
def _parseBlockListItem(s):
    """
    Deserialize one block list item from *s*: a simple item marker
    introduces an atom, a complex item marker introduces a nested compound.
    """
    match = _SIMPLE_ITEM_START.match(s)
    if match:
        # Simple item: drop the marker, parse the remainder as an atom
        return _parseAtom(s[match.end():])
    match = _COMPLEX_ITEM_START.match(s)
    if match:
        # Complex item: drop the marker, parse the remainder as a compound
        return _parseCompound(s[match.end():])
    raise ValueError(u"Malformed block list item: %r" % s)
def _parseComplexMap(items):
    """
    Return a complex (Win-INI-style) map deserialized from the list of
    *items* (which must not be empty).

    Each bracket key such as ``[name]`` opens a new section; the following
    items are collected as that section's value, either as a list (block
    list items) or as a map (key/value pairs). A section without values
    becomes an empty map. A `ValueError` is raised for a duplicate key, a
    section mixing list items with pairs, or a document whose first item is
    not a bracket key.
    """
    result = {}
    first = True
    key = None
    values = None
    for item in items:
        stripped = item.strip()
        isKey = stripped.startswith(_BRACKET_KEY_START) and \
            stripped.endswith(_BRACKET_KEY_END)
        # The very first item must open a section
        if first and not isKey:
            raise ValueError(
                u"First item of complex map must be a bracket key: %s" % item)
        if isKey:
            # BUGFIX: the original cleared the "first" flag unconditionally
            # BEFORE this branch, so the first bracket key tried to store
            # values for a not-yet-defined key and raised NameError.
            if first:
                first = False
            else:
                # Store the values collected for the previous key
                if not values:
                    # Zero values represent an empty map (not an empty list)
                    values = {}
                if key in result:
                    raise ValueError(
                        u"Complex map contains the key [%s] twice" % key)
                result[key] = values
                values = None
            # Strip the brackets and parse the new key
            key = _parseAtom(stripped[1:-1])
        else:
            # Parse a value belonging to the current section
            if _SIMPLE_ITEM_START.match(stripped) or \
                    _COMPLEX_ITEM_START.match(stripped):
                # The section's values form a list
                v = _parseBlockListItem(item)
                if not values:
                    values = []
                elif isinstance(values, dict):
                    raise ValueError(
                        u"Cannot mix list items and key/value pairs in the "
                        u"same section [%s] of a complex map: %s" % (key, item))
                values.append(v)
            else:
                # The section's values form a map
                k, v = _parsePair(item)
                if not values:
                    values = {}
                elif isinstance(values, list):
                    raise ValueError(
                        u"Cannot mix list items and key/value pairs in the "
                        u"same section [%s] of a complex map: %s" % (key, item))
                values[k] = v
    # Store the final key/value pair and return
    if not values:
        # Zero values represent an empty map (not an empty list)
        values = {}
    if key in result:
        raise ValueError(
            u"Complex map contains the key [%s] twice" % key)
    result[key] = values
    return result
def _parseInlineList(s):
    """
    Parse an inline list deserialized from a string *s*. Returns two
    values: the *parsed* list (a list object) as well as a *rest* (a string
    object) if there is non-whitespace content in the string after the end
    of the parsed list--otherwise *rest* will be the empty string.

    Throws a `ValueError` if *s* is not a well-formed inline list.
    """
    stripped = s.strip()
    if not stripped.startswith(_INLINE_LIST_START):
        raise ValueError("Not a valid inline list: %s" % stripped)
    # Remove opening delimiter and surrounding whitespace
    content = stripped[len(_INLINE_LIST_START):]
    content = content.strip()
    reachedEOL = False
    parsed = []
    # Find and process the next item, until content is empty
    while content and not reachedEOL:
        # BUGFIX: the original tested "stripped" here (which ALWAYS starts
        # with "{"), so every list triggered bogus recursion and failed.
        if content.startswith(_INLINE_LIST_START):
            # We found an inner list: call this function recursively
            inner, rest = _parseInlineList(content)
            parsed.append(inner)
            content = rest
        else:
            # Pattern is guaranteed to match (maybe the empty string)
            rawItem = _INLINE_LIST_ATOM.match(content).group()
            item = _parseAtom(rawItem)
            # BUGFIX: the original appended the undefined name "inner"
            # here instead of the just-parsed item (NameError)
            parsed.append(item)
            # Remove parsed item
            content = content[len(rawItem):]
        if content.startswith(_INLINE_LIST_SEP):
            # Strip delimiter and any following whitespace
            content = content[len(_INLINE_LIST_SEP):].lstrip()
        elif content.startswith(_INLINE_LIST_END):
            # Looks like we're done
            reachedEOL = True
            # Strip delimiter and any following whitespace
            content = content[len(_INLINE_LIST_END):].lstrip()
        else:
            raise ValueError(u"Invalid characters in inline list: %s" % content)
    # Return parsed items and the remaining content, if any
    return parsed, content
def _parseMap(s):
    """
    Deserialize a (simple or complex) map from the string *s*.
    """
    items = _splitItems(s)
    if items and items[0].startswith(_BRACKET_KEY_START):
        # A leading bracket key signals a complex (Win-INI-style) map
        return _parseComplexMap(items)
    # Otherwise every item is a plain key/value pair
    result = {}
    for item in items:
        key, value = _parsePair(item)
        if key in result:
            raise ValueError(u"Map contains the key %r twice" % key)
        result[key] = value
    return result
def _parsePair(s):
    """
    Deserialize a single (key, value) tuple from the string *s*.
    """
    # The key pattern always matches something (possibly the empty string):
    # everything up to the first unescaped ":"
    rawKey = _KEY.match(s).group()
    key = _parseAtom(rawKey)
    rest = s[len(rawKey):]
    # "::" must be checked before ":" since it starts with the same char
    if rest.startswith(_COMPLEX_MAP_SEP):
        # Value is a compound
        value = _parseCompound(rest[len(_COMPLEX_MAP_SEP):])
    elif rest.startswith(_SIMPLE_MAP_SEP):
        # Value is an atom
        value = _parseAtom(rest[len(_SIMPLE_MAP_SEP):])
    else:
        raise ValueError(u"Not a valid pair: %s" % s)
    return key, value
def _parseString(s):
    """
    Return a string deserialized from *s*, unindenting continuation lines
    and unescaping escaped characters as necessary. A single (unescaped)
    trailing backslash protects one character of final whitespace, or
    stands for a newline if the string has no trailing whitespace at all.
    """
    # Strip leading whitespace and unindent continuation lines
    s = _removeCommonWhitespacePrefix(s)
    # Strip final whitespace, if any
    stripped = s.rstrip()
    # Experimentally strip final backslashes, if any
    backslashStripped = stripped.rstrip(_ESCAPE_CHAR)
    lenDiff = len(stripped) - len(backslashStripped)
    if lenDiff % 2 == 1:
        # stripped ends in an odd number of backslashes:
        # the last of them escapes the following whitespace
        if s != stripped:
            # Strip the backslash and re-add the first whitespace character.
            # The backslash sits at index len(stripped) of *s* after the
            # slice, so the protected whitespace char is one position later.
            # BUGFIX: the original indexed s[lenStripped:lenStripped+1],
            # which re-added the backslash itself instead of the whitespace.
            stripped = stripped[0:-1]
            lenStripped = len(stripped)
            stripped += s[lenStripped + 1:lenStripped + 2]
        else:
            # No trailing whitespace: the backslash denotes a newline
            stripped = stripped[0:-1] + "\n"
    # Handle other escape sequences (removing the backslash)
    stripped = _ESCAPE_SEQ.sub(u"\\1", stripped)
    return stripped
def _removeCommonWhitespacePrefix(s):
    """
    Strip a uniform amount of indentation from the second and further lines of
    the string *s*, equal to the minimum indentation of all non-blank lines
    after the first line. Any indentation in the first line (i.e., up to the
    first newline) is insignificant and removed. Relative indentation of later
    lines is retained. Blank lines are removed from the beginning and end
    of the string.

    Raises a `ValueError` if some line's indentation shares no common
    prefix with the indentation seen so far (e.g. tabs vs. spaces).

    NOTE(review): assumes *s* contains at least one line (``lines[0]`` is
    accessed unconditionally) -- an empty string raises IndexError; confirm
    against the callers.

    The algorithm is based on the "Handling Docstring Indentation"
    algorithm from `PEP 257
    <http://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation>`_.
    """
    # Split into a list of lines:
    lines = s.splitlines()
    # Determine common whitespace prefix (first line doesn't count):
    commonPrefix = None
    for line in lines[1:]:
        stripped = line.lstrip(_INLINE_WS)
        strippedChars = len(line) - len(stripped)
        linePrefix = line[0:strippedChars]
        # Blank lines don't influence the common prefix
        if stripped:
            if commonPrefix == None:
                # First non-blank line determines the initial common prefix
                commonPrefix = linePrefix
            else:
                if len(commonPrefix) > len(linePrefix):
                    # Ensure that linePrefix is a prefix of commonPrefix
                    if commonPrefix.startswith(linePrefix):
                        # Shorten common prefix
                        commonPrefix = linePrefix
                    else:
                        raise ValueError(
                            u"Indentation error: Initial whitespace of line %r "
                            u"is incompatible with preceding indentation %r" % \
                            (line, commonPrefix))
                else:
                    # Ensure that commonPrefix is a prefix of linePrefix
                    if not linePrefix.startswith(commonPrefix):
                        raise ValueError(
                            u"Indentation error: Initial whitespace of line %r "
                            u"is incompatible with preceding indentation %r" % \
                            (line, commonPrefix))
    # Remove indentation (first line is special: fully left-stripped)
    trimmed = [lines[0].lstrip(_INLINE_WS)]
    # Skip this if commonPrefix is None (no continuation lines)
    if commonPrefix != None:
        indent = len(commonPrefix)
        for line in lines[1:]:
            trimmed.append(line[indent:])
    # Strip off trailing and leading blank lines:
    while trimmed and not trimmed[-1]:
        trimmed.pop()
    while trimmed and not trimmed[0]:
        trimmed.pop(0)
    # Return a single string:
    return u'\n'.join(trimmed)
def _splitItems(s):
    """
    Split the string *s* into a list of items and return them. A new item
    begins at every newline that is immediately followed by a printable
    character; lines starting with whitespace are continuation lines and
    stay part of the preceding item. Empty items are discarded.
    """
    return [item for item in _NL_BEFORE_PRINT.split(s) if item]
def _writeEncoded(s, fp, encoding = None):
    """
    Write the string *s* to the file-like object *fp*, encoding it with the
    given character *encoding* first. If *encoding* is None the string is
    written as-is.
    """
    if encoding is None:
        fp.write(s)
    else:
        fp.write(s.encode(encoding))
# TODO Temporary test code
# Smoke test: parse a sample TDF document and print the resulting object.
# (Python 2 print statement; the paths are relative to the working dir.)
if __name__ == "__main__" :
    ##print load(u"../sample/build/com.tangiblebit.drawdio.buildit/process.tb")
    ##print load(u"../sample/objects/com.tangiblebit.drawdio/object.tb")
    print load(u"../sample/use/com.tangiblebit.drawdio.use/process.tb")
|