#!/usr/bin/python # coding=UTF-8 # Copyright (C) 2010 Christian Siefkes . # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . """ Tangible Data Format (TDF) is a lightweight data serialization format, see doc/formats for a description. The API for reading and writing TDF is similar to the Python standard modules marshal, pickle, and json. """ import cStringIO import re import sys # Introduces a simple list item. _SIMPLE_ITEM_MARKER = u"-" # Introduces a complex list item. _COMPLEX_ITEM_MARKER = u"+" # Separates key and value in a simple pair. _SIMPLE_MAP_SEP = u":" # Separates key and value in a complex pair. _COMPLEX_MAP_SEP = u"::" # Opens an inline list. _INLINE_LIST_START = u"{" # Closes an inline list. _INLINE_LIST_END = u"}" # Separates items in inline lists. _INLINE_LIST_SEP = u"," # Opens a bracket key. _BRACKET_KEY_START = u"[" # Closes a bracket key. _BRACKET_KEY_END = u"]" # Introduces a comment. _COMMENT_START = u"#" # The escape character _ESCAPE_CHAR = u"\\" # Literal true. _TRUE = u"true" # Literal false. _FALSE = u"false" # Literal null. _NULL = u"null" # Characters making up inline whitespace (space and tab). _INLINE_WS = u" \t" # Pattern matching a newline following (via lookahead) by a printable character. _NL_BEFORE_PRINT = re.compile(ur"\n(?=\S)") # Pattern string matching whitespace _WS = ur"\s+" # Pattern matching the start of an inline comment (preceded by whitespace) _INLINE_COMMENT = re.compile(_WS + re.escape(_COMMENT_START)) # Pattern matching the start of a simple list item _SIMPLE_ITEM_START = re.compile(re.escape(_SIMPLE_ITEM_MARKER) + _WS) # Pattern matching the start of a complex list item _COMPLEX_ITEM_START = re.compile(re.escape(_COMPLEX_ITEM_MARKER) + _WS) # Pattern matching a key in a pair: can contain escape sequences and any # unescaped characters except ":" _KEY = re.compile(ur"([^:\\]|\\.)*", re.DOTALL) # Pattern matching an atomic inline list item: can contain escape sequences and # any unescaped characters except ",{}" _INLINE_LIST_ATOM = re.compile(ur"([^,{}\\]|\\.)*", re.DOTALL) # Pattern matching an escape sequence (backslash followed by any other char) _ESCAPE_SEQ = re.compile(ur"\\(.)", re.DOTALL) # Pattern matching an integer number _INTEGER = re.compile(ur"^(0|[1-9][0-9]*)$") # Pattern matching a float _FLOAT = re.compile(ur"^([0-9]+\.?[0-9]*([eE][+-]?[0-9]+)?|" "\.[0-9]+([eE][+-]?[0-9]+)?|" "[+-]?(nan|inf))$") ### Public API ### def dump(obj, fp, encoding = u"UTF-8", comment = None, complexMap = True, indentPrefix = u" "): """ Serialize *obj* as a TDF formatted stream to *fp* (a file-like object with a `write` method). *obj* must be a list, tuple, or [frozen] set (all serialized as lists) or dict (serialized as map); it can contain nested lists (etc.) and dicts as well as [Unicode] strings, numbers (int, float, long), Booleans, and None values. *encoding* specifies the character encoding to use for string objects; if set to None, no encoding is performed. The optional *comment* is serialized before the start of the object. If *complexMap* is True, the outmost object is serialized as a complex map (Win-INI-style) if possible. Inner maps are never serialized as complex maps. The *indentPrefix* specifies the string of whitespace preceding each deeper indentation level. Only spaces and tabs are allowed. By default, two spaces are used. A `ValueError` is thrown if the *indentPrefix* is invalid. """ # Delegate return _doDump(obj, fp, encoding, comment, complexMap, indentPrefix) def dumps(obj, comment = None, complexMap = True, indentPrefix = u" "): """ Return a TDF formatted Unicode string containing the serialized *obj*. The optional *comment* is serialized before the start of the object. If *complexMap* is True, the outmost object is serialized as a complex map (Win-INI-style) if possible. Inner maps are never serialized as complex maps. The *indentPrefix* specifies the string of whitespace preceding each deeper indentation level. Only spaces and tabs are allowed. By default, two spaces are used. A `ValueError` is thrown if the *indentPrefix* is invalid. """ # Create string buffer object and delegate output = cStringIO.StringIO() dump(obj, output, None, comment, complexMap,indentPrefix) return output.getvalue() def load(fp, encoding = u"UTF-8"): """ Return a list or dict deserialized from *fp* (a file-like object with a `read` method containing a TDF document). *encoding* specifies the character encoding to use for decoding string objects; if set to None, no decoding is performed. Throws a `ValueError` if the TDF document is malformed. """ # Read whole contents of input file (using universal newline support) infile = open(fp, "rU") try: if encoding: text = infile.read().decode(encoding) else: text = infile.read() finally: infile.close() # Delegate to loads method return loads(text) def loads(s): """ Return a list or dict deserialized from *s* (a `str` or `unicode` instance containing a TDF document). Throws a `ValueError` if the TDF document is malformed. """ # Delegate return _doLoads(s) ### Private helper functions ### def _doDump(obj, fp, encoding, comment, complexMap, indentPrefix, currentIndent = u""): """ Helper function that serializes *obj* as a TDF formatted stream to *fp*. """ # Check that indentPrefix contains only spaces+tabs and it not empty if indentPrefix == u"": raise ValueError("Indent prefix is empty") elif indentPrefix.strip(_INLINE_WS) != u"": raise ValueError("Indent prefix contains invalid characters (only " "spaces and tabs are allowed): %r" % indentPrefix.strip(_INLINE_WS)) if comment: # Serialize the comment at the begin of the stream commentlines = comment.splitlines() for line in commentlines: _writeEncoded(u"%s %s\n" % (_COMMENT_START, line), fp, encoding) # Determine type of object if isinstance(obj, dict): # Serialize dictionary (possibly as complex map) if complexMap and currentIndent == u"" and _isComplexMap(obj): # Serialize as a complex map for k, v in obj.iteritems(): _dumpComplexKey(k, fp, encoding, indentPrefix, currentIndent) # Call myself recursively without increasing the indent _doDump(v, fp, encoding, None, complexMap, indentPrefix, currentIndent) else: # TODO Serialize as a normal map for k, v in obj.iteritems(): pass # TODO For lists and maps, call myself recursively, setting # currentIndent += indentPrefix. elif _isListLike(obj): # TODO Serialize list (possibly as an inline list) pass else: # It must be an atom if currentIndent == u"": raise TypeError(u"TDF document must be a list or map: %r" % obj) _writeEncoded(u"%s" % currentIndent, fp, encoding) _dumpAtom(obj, fp, encoding, indentPrefix, currentIndent) _writeEncoded(u"\n", fp, encoding) def _doLoads(s, commentsStripped = False): """ Helper function that returns a list or dict deserialized from *s*. Set *commentsStripped* to True if comments have already been stripped from TDF document. """ if not commentsStripped: # Discard all comments prior to further parsing lines = [] for line in s.splitlines(): if line.strip().startswith(_COMMENT_START): continue # Skip comment lines # Check if the line contains an inline comment # (must be preceded by whitespace) match = _INLINE_COMMENT.search(line) if match: # Discard inline comment and preceding whitespace line = line[0:match.start()] lines.append(line) # Rejoin lines in a single string s = u"\n".join(lines) if _SIMPLE_ITEM_START.match(s) or _COMPLEX_ITEM_START.match(s): return _parseBlockList(s) elif s.startswith(_INLINE_LIST_START): result, rest = _parseInlineList(s) if rest: raise ValueError(u"Unexpected data after end of inline list: %s" % \ rest) return result else: return _parseMap(s) def _dumpAtom(atom, fp, encoding, indentPrefix, currentIndent): """ Dump an *atom*. If the atom is a string that spans multiple lines, the continuation lines are prefixed by *currentIndent* + *indentPrefix*. The first line of the atom is NOT indented; neither is a newline or other terminator written after the atom. A *TypeError* is thrown if *atom* is not an atomic value (string, number, Boolean, or None). """ if isinstance(atom, basestring): # TODO Serialize string, handling indentation + escapes pass elif isinstance(atom, int) or isinstance(atom, long) or \ isinstance(atom, float): # Serialize number _writeEncoded(str(atom), fp, encoding) elif atom == True: # Serialize true _writeEncoded(_TRUE, fp, encoding) elif atom == False: # Serialize false _writeEncoded(_FALSE, fp, encoding) elif atom == None: # Serialize null _writeEncoded(_NULL, fp, encoding) else: # Throw TypeError raise TypeError(u"Not a valid atom: %r" % atom) def _dumpComplexKey(k, fp, encoding, indentPrefix, currentIndent): """ Dump a key in a complex map. """ _writeEncoded(u"%s%s" % (currentIndent, _BRACKET_KEY_START), fp, encoding) _dumpAtom(k, fp, encoding, indentPrefix, currentIndent) _writeEncoded(u"%s\n" % _BRACKET_KEY_END, fp, encoding) def _isListLike(obj): """ Check whether *obj* is a list-like object. Return True iff *obj* is a list, tuple, set, or frozen set. """ return isinstance(obj, list) or isinstance(obj, tuple) or \ isinstance(obj, set) or isinstance(obj, frozenset) def _isComplexMap(d): """ Test whether a dictionary is suitable for serialization as a complex map. Return True iff all values are compounds (lists or similar, or dictionaries) and none of them is the empty list. """ for val in d.values(): if _isListLike(val): # Lists are OK provided they aren't empty if len(val) == 0: return False elif not isinstance(obj, dict): # Dictionaries are OK, other objects aren't return False # it's OK, all values have passed the test return True def _parseAtom(s): """ Returns an atom deserialized from *s*. """ stripped = s.strip() # Is it a literal? if stripped == _TRUE: result = True elif stripped == _FALSE : result = False elif stripped == _NULL: result = None # Is it an int (or long) number? elif _INTEGER.match(stripped): # This should always work without error result = int(stripped) # Is it a float number? elif _FLOAT.match(stripped): # This should always work without error result = float(stripped) else: # It must be a string result = _parseString(s) return result def _parseCompound(s): """ Returns an compound deserialized from *s*. """ # Remove the common whitespace prefix of all lines (except the first) s = _removeCommonWhitespacePrefix(s) # Delegate return _doLoads(s, True) def _parseBlockList(s): """ Return a block list deserialized from a string *s*. """ # Split string in items items = _splitItems(s) result = [] for item in items: result.append(_parseBlockListItem(item)) return result def _parseBlockListItem(s): """ Return a block list item deserialized from *s*. """ simpleMatch = _SIMPLE_ITEM_START.match(s) if simpleMatch: # This is a simple list item: # remove the matched list item marker and parse the contained atom s = s[len(simpleMatch.group()):] return _parseAtom(s) complexMatch = _COMPLEX_ITEM_START.match(s) if complexMatch: # This is a complex list item: # remove the matched list item marker and parse the contained compound s = s[len(complexMatch.group()):] return _parseCompound(s) else: raise ValueError(u"Malformed block list item: %r" % s) def _parseComplexMap(items): """ Return a complex map deserialized from the list of *items* (which must not be empty). """ result = {} first = True values = None for item in items: stripped = item.strip() isKey = stripped.startswith(_BRACKET_KEY_START) and \ stripped.endswith(_BRACKET_KEY_END) # Check that first item is a bracket key if first and not isKey: raise ValueError( u"First item of complex map must be a bracket key: %s" % item) first = False if isKey: if first: first = False else: # Store values collected for previous key if not values: # Zero values represent an empty map (not an empty list) values = {} if key in result: raise ValueError( u"Complex map contains the key [%s] twice" % key) result[key] = values values = None # Strip brackets and parse new key key = _parseAtom(stripped[1:-1]) else: # Parse value if _SIMPLE_ITEM_START.match(stripped) or \ _COMPLEX_ITEM_START.match(stripped): # Values must be a list v = _parseBlockListItem(item) if not values: values = [] elif isinstance(values, dict): raise ValueError( u"Cannot mix list items and key/value pairs in the " u"same section [%s] of a complex map: %s" % (key, item)) values.append(v) else: # Values must be a map k, v = _parsePair(item) if not values: values = {} elif isinstance(values, list): raise ValueError( u"Cannot mix list items and key/value pairs in the " u"same section [%s] of a complex map: %s" % (key, item)) values[k] = v # Store final key/value pair and return if not values: # Zero values represent an empty map (not an empty list) values = {} if key in result: raise ValueError( u"Complex map contains the key [%s] twice" % key) result[key] = values return result def _parseInlineList(s): """ Parse an inline list deserialized from a string *s*. Returns two values: the *parsed* list (a list object) as well as a *rest* (a string object) if there is non-whitespace content in the string after the end of the parsed list--otherwise *rest* will be the empty string. """ stripped = s.strip() if not stripped.startswith(_INLINE_LIST_START): raise ValueError("Not a valid inline list: %s" % stripped) # Remove opening delimiter and surrounding whitespace content = stripped[len(_INLINE_LIST_START):] content = content.strip() reachedEOL = False parsed = [] # Find and process the next item, until content is empty while content and not reachedEOL: if stripped.startswith(_INLINE_LIST_START): # We found an inner list: call this function recursively inner, rest = _parseInlineList(content) parsed.append(inner) content = rest else: # Pattern is guaranteed to match (maybe the empty string) rawItem = _INLINE_LIST_ATOM.match(content).group() item = _parseAtom(rawItem) parsed.append(inner) # Remove parsed item content = content[len(rawItem):] if content.startswith(_INLINE_LIST_SEP): # Strip delimiter and any following whitespace content = content[len(_INLINE_LIST_SEP):].lstrip() elif content.startswith(_INLINE_LIST_END): # Looks like we're done reachedEOL = True # Strip delimiter and any following whitespace content = content[len(_INLINE_LIST_END):].lstrip() else: raise ValueError(u"Invalid characters in inline list: %s" % content) # Return parsed items and the remaining content, if any return parsed, content def _parseMap(s): """ Return a map deserialized from a string *s*. """ # Split string in items items = _splitItems(s) if items and items[0].startswith(_BRACKET_KEY_START): # It's a complex map return _parseComplexMap(items) else: result = {} for item in items: key, value = _parsePair(item) if key in result: raise ValueError(u"Map contains the key %r twice" % key) else: result[key] = value return result def _parsePair(s): """ Return a (key, value) tuple deserialized from the string *s*. """ # Match key (up to first unescaped ":")--this will always matching something # (though it may be the empty string) rawKey = _KEY.match(s).group() key = _parseAtom(rawKey) # Look at rest of string rest = s[len(rawKey):] if rest.startswith(_COMPLEX_MAP_SEP): # Value is a compound value = _parseCompound(rest[len(_COMPLEX_MAP_SEP):]) elif rest.startswith(_SIMPLE_MAP_SEP): # Value is an atom value = _parseAtom(rest[len(_SIMPLE_MAP_SEP):]) else: raise ValueError(u"Not a valid pair: %s" % s) return key, value def _parseString(s): """ Return a string deserialized from *s*, unindenting continuation lines and unescaping escaped characters as necessary. """ # Strip leading whitespace and unindent continuation lines s = _removeCommonWhitespacePrefix(s) # Strip final whitespace, if any stripped = s.rstrip() # Experimentally strip final backslashes, if any backslashStripped = stripped.rstrip(_ESCAPE_CHAR) lenDiff = len(stripped) - len(backslashStripped) if lenDiff % 2 == 1: # stripped ends in an odd number of backslashes: # the last of them escapes the following whitespace if s != stripped: # Strip the backslash and re-add the first whitespace-character stripped = stripped[0:-1] lenStripped = len(stripped) stripped += s[lenStripped:lenStripped + 1] else: # Strip the backslash and add a newline stripped = stripped[0:-1] + "\n" # Handle other escape sequences (removing the backslash) stripped = _ESCAPE_SEQ.sub(ur"\1", stripped) return stripped def _removeCommonWhitespacePrefix(s): """ Strip a uniform amount of indentation from the second and further lines of the string *s*, equal to the minimum indentation of all non-blank lines after the first line. Any indentation in the first line (i.e., up to the first newline) is insignificant and removed. Relative indentation of later lines is retained. Blank lines should be removed from the beginning and end of the string. The algorithm is based on the "Handling Docstring Indentation" algorithm from `PEP 257 `_. """ # Split into a list of lines: lines = s.splitlines() # Determine common whitespace prefix (first line doesn't count): commonPrefix = None for line in lines[1:]: stripped = line.lstrip(_INLINE_WS) strippedChars = len(line) - len(stripped) linePrefix = line[0:strippedChars] if stripped: if commonPrefix == None: # First empty line determines initial common prefix commonPrefix = linePrefix else: if len(commonPrefix) > len(linePrefix): # Ensure that linePrefix is a prefix of commonPrefix if commonPrefix.startswith(linePrefix): # Shorten common prefix commonPrefix = linePrefix else: raise ValueError( u"Indentation error: Initial whitespace of line %r " u"is incompatible with preceding indentation %r" % \ (line, commonPrefix)) else: # Ensure that commonPrefix is a prefix of linePrefix if not linePrefix.startswith(commonPrefix): raise ValueError( u"Indentation error: Initial whitespace of line %r " u"is incompatible with preceding indentation %r" % \ (line, commonPrefix)) # Remove indentation (first line is special) trimmed = [lines[0].lstrip(_INLINE_WS)] # Skip this if commonPrefix is None (no continuation lines) if commonPrefix != None: indent = len(commonPrefix) for line in lines[1:]: trimmed.append(line[indent:]) # Strip off trailing and leading blank lines: while trimmed and not trimmed[-1]: trimmed.pop() while trimmed and not trimmed[0]: trimmed.pop(0) # Return a single string: return u'\n'.join(trimmed) def _splitItems(s): """ Split from the string *s* into a list of items and return the items. Each new item is preceded by a linebreak and starts without any initial whitespace (lines with initial whitespace are considered continuation lines and become part of the preceding item). Empty items will be deleted. """ items = _NL_BEFORE_PRINT.split(s) result = [] # Discard empty items for item in items: if item: result.append(item) return result def _writeEncoded(s, fp, encoding = None): """ Writes the string *s* to the file-like object *fp*, using the specified character *encoding*. If *encoding* is None, no encoding is performed. """ if encoding == None: fp.write(s) else: fp.write(s.encode(encoding)) # TODO Temporary test code if __name__ == "__main__" : ##print load(u"../sample/build/com.tangiblebit.drawdio.buildit/process.tb") ##print load(u"../sample/objects/com.tangiblebit.drawdio/object.tb") print load(u"../sample/use/com.tangiblebit.drawdio.use/process.tb")