#!/usr/bin/python
# coding=UTF-8
# Copyright (C) 2010 Christian Siefkes <christian@siefkes.net>.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
Tangible Data Format (TDF) is a lightweight data serialization format, see
doc/formats for a description. The API for reading and writing TDF is similar to
the Python standard modules marshal, pickle, and json.
"""

import cStringIO
import re
import sys

# Introduces a simple list item.
_SIMPLE_ITEM_MARKER = u"-"
# Introduces a complex list item.
_COMPLEX_ITEM_MARKER = u"+"
# Separates key and value in a simple pair.
_SIMPLE_MAP_SEP = u":"
# Separates key and value in a complex pair.
_COMPLEX_MAP_SEP = u"::"
# Opens an inline list.
_INLINE_LIST_START = u"{"
# Closes an inline list.
_INLINE_LIST_END = u"}"
# Separates items in inline lists.
_INLINE_LIST_SEP = u","
# Opens a bracket key.
_BRACKET_KEY_START = u"["
# Closes a bracket key.
_BRACKET_KEY_END = u"]"
# Introduces a comment.
_COMMENT_START = u"#"
# The escape character
_ESCAPE_CHAR = u"\\"
# Literal true.
_TRUE = u"true"
# Literal false.
_FALSE = u"false"
# Literal null.
_NULL = u"null"
# Characters making up inline whitespace (space and tab).
_INLINE_WS = u" \t"

# Pattern matching a newline following (via lookahead) by a printable character.
_NL_BEFORE_PRINT = re.compile(ur"\n(?=\S)")
# Pattern string matching whitespace
_WS = ur"\s+"
# Pattern matching the start of an inline comment (preceded by whitespace)
_INLINE_COMMENT = re.compile(_WS + re.escape(_COMMENT_START))
# Pattern matching the start of a simple list item
_SIMPLE_ITEM_START = re.compile(re.escape(_SIMPLE_ITEM_MARKER) + _WS)
# Pattern matching the start of a complex list item
_COMPLEX_ITEM_START = re.compile(re.escape(_COMPLEX_ITEM_MARKER) + _WS)
# Pattern matching a key in a pair: can contain escape sequences and any
# unescaped characters except ":"
_KEY = re.compile(ur"([^:\\]|\\.)*", re.DOTALL)
# Pattern matching an atomic inline list item: can contain escape sequences and
# any unescaped characters except ",{}"
_INLINE_LIST_ATOM = re.compile(ur"([^,{}\\]|\\.)*", re.DOTALL)
# Pattern matching an escape sequence (backslash followed by any other char)
_ESCAPE_SEQ = re.compile(ur"\\(.)", re.DOTALL)
# Pattern matching an integer number
_INTEGER = re.compile(ur"^(0|[1-9][0-9]*)$")
# Pattern matching a float
_FLOAT = re.compile(ur"^([0-9]+\.?[0-9]*([eE][+-]?[0-9]+)?|"
                        "\.[0-9]+([eE][+-]?[0-9]+)?|"
                        "[+-]?(nan|inf))$")

### Public API ###

def dump(obj, fp, encoding = u"UTF-8", comment = None, complexMap = True,
         indentPrefix = u"  "):
    """
    Serialize *obj* as a TDF formatted stream to *fp* (a file-like object with
    a `write` method). *obj* must be a list, tuple, or [frozen] set (all
    serialized as lists) or dict (serialized as map); it can contain nested
    lists (etc.) and dicts as well as [Unicode] strings, numbers (int, float,
    long), Booleans, and None values.

    *encoding* specifies the character encoding to use for string objects;
    if set to None, no encoding is performed.

    The optional *comment* is serialized before the start of the object.

    If *complexMap* is True, the outmost object is serialized as a complex map
    (Win-INI-style) if possible. Inner maps are never serialized as complex
    maps.

    The *indentPrefix* specifies the string of whitespace preceding each deeper
    indentation level. Only spaces and tabs are allowed. By default, two spaces
    are used.

    A `ValueError` is thrown if the *indentPrefix* is invalid.
    """
    # Delegate
    return _doDump(obj, fp, encoding, comment, complexMap, indentPrefix)

def dumps(obj, comment = None, complexMap = True, indentPrefix = u"  "):
    """
    Return a TDF formatted Unicode string containing the serialized *obj*.

    The optional *comment* is serialized before the start of the object.

    If *complexMap* is True, the outmost object is serialized as a complex map
    (Win-INI-style) if possible. Inner maps are never serialized as complex
    maps.

    The *indentPrefix* specifies the string of whitespace preceding each deeper
    indentation level. Only spaces and tabs are allowed. By default, two spaces
    are used.

    A `ValueError` is thrown if the *indentPrefix* is invalid.
    """
    # Create string buffer object and delegate
    output = cStringIO.StringIO()
    dump(obj, output, None, comment, complexMap,indentPrefix)
    return output.getvalue()

def load(fp, encoding = u"UTF-8"):
    """
    Return a list or dict deserialized from *fp* (a file-like object with
    a `read` method containing a TDF document).

    *encoding* specifies the character encoding to use for decoding string
    objects; if set to None, no decoding is performed.

    Throws a `ValueError` if the TDF document is malformed.
    """
    # Read whole contents of input file (using universal newline support)
    infile = open(fp, "rU")
    try:
        if encoding:
            text = infile.read().decode(encoding)
        else:
            text = infile.read()
    finally:
        infile.close()
    # Delegate to loads method
    return loads(text)

def loads(s):
    """
    Return a list or dict deserialized from *s* (a `str` or `unicode` instance
    containing a TDF document).

    Throws a `ValueError` if the TDF document is malformed.
    """
    # Delegate
    return _doLoads(s)

### Private helper functions ###

def _doDump(obj, fp, encoding, comment, complexMap, indentPrefix,
            currentIndent = u""):
    """
    Helper function that serializes *obj* as a TDF formatted stream to *fp*.
    """
    # Check that indentPrefix contains only spaces+tabs and it not empty
    if indentPrefix == u"":
        raise ValueError("Indent prefix is empty")
    elif indentPrefix.strip(_INLINE_WS) != u"":
        raise ValueError("Indent prefix contains invalid characters (only "
            "spaces and tabs are allowed): %r" % indentPrefix.strip(_INLINE_WS))

    if comment:
        # Serialize the comment at the begin of the stream
        commentlines = comment.splitlines()
        for line in commentlines:
            _writeEncoded(u"%s %s\n" % (_COMMENT_START, line), fp, encoding)

    # Determine type of object
    if isinstance(obj, dict):
        # Serialize dictionary (possibly as complex map)
        if complexMap and currentIndent == u"" and _isComplexMap(obj):
            # Serialize as a complex map
            for k, v in obj.iteritems():
                _dumpComplexKey(k, fp, encoding, indentPrefix, currentIndent)
                # Call myself recursively without increasing the indent
                _doDump(v, fp, encoding, None, complexMap, indentPrefix,
                        currentIndent)
        else:
            # TODO Serialize as a normal map
            for k, v in obj.iteritems():
                pass
        # TODO For lists and maps, call myself recursively, setting
        # currentIndent += indentPrefix.
    elif _isListLike(obj):
        # TODO Serialize list (possibly as an inline list)
        pass
    else:
        # It must be an atom
        if currentIndent == u"":
            raise TypeError(u"TDF document must be a list or map: %r" % obj)
        _writeEncoded(u"%s" % currentIndent, fp, encoding)
        _dumpAtom(obj, fp, encoding, indentPrefix, currentIndent)
        _writeEncoded(u"\n", fp, encoding)

def _doLoads(s, commentsStripped = False):
    """
    Helper function that returns a list or dict deserialized from *s*.

    Set *commentsStripped* to True if comments have already been stripped from
    TDF document.
    """
    if not commentsStripped:
        # Discard all comments prior to further parsing
        lines = []
        for line in s.splitlines():
            if line.strip().startswith(_COMMENT_START):
                continue # Skip comment lines
            # Check if the line contains an inline comment
            # (must be preceded by whitespace)
            match = _INLINE_COMMENT.search(line)
            if match:
                # Discard inline comment and preceding whitespace
                line = line[0:match.start()]
            lines.append(line)
        # Rejoin lines in a single string
        s = u"\n".join(lines)
    if _SIMPLE_ITEM_START.match(s) or _COMPLEX_ITEM_START.match(s):
        return _parseBlockList(s)
    elif s.startswith(_INLINE_LIST_START):
        result, rest = _parseInlineList(s)
        if rest:
            raise ValueError(u"Unexpected data after end of inline list: %s" % \
                             rest)
        return result
    else:
        return _parseMap(s)

def _dumpAtom(atom, fp, encoding, indentPrefix, currentIndent):
    """
    Dump an *atom*. If the atom is a string that spans multiple lines, the
    continuation lines are prefixed by *currentIndent* + *indentPrefix*.
    The first line of the atom is NOT indented; neither is a newline or other
    terminator written after the atom.

    A *TypeError* is thrown if *atom* is not an atomic value (string, number,
    Boolean, or None).
    """
    if isinstance(atom, basestring):
        # TODO Serialize string, handling indentation + escapes
        pass
    elif isinstance(atom, int) or isinstance(atom, long) or \
            isinstance(atom, float):
        # Serialize number
        _writeEncoded(str(atom), fp, encoding)
    elif atom == True:
        # Serialize true
        _writeEncoded(_TRUE, fp, encoding)
    elif atom == False:
        # Serialize false
        _writeEncoded(_FALSE, fp, encoding)
    elif atom == None:
        # Serialize null
        _writeEncoded(_NULL, fp, encoding)
    else:
        # Throw TypeError
        raise TypeError(u"Not a valid atom: %r" % atom)

def _dumpComplexKey(k, fp, encoding, indentPrefix, currentIndent):
    """
    Dump a key in a complex map.
    """
    _writeEncoded(u"%s%s" % (currentIndent, _BRACKET_KEY_START), fp, encoding)
    _dumpAtom(k, fp, encoding, indentPrefix, currentIndent)
    _writeEncoded(u"%s\n" % _BRACKET_KEY_END, fp, encoding)

def _isListLike(obj):
    """
    Check whether *obj* is a list-like object. Return True iff *obj* is a list,
    tuple, set, or frozen set.
    """
    return isinstance(obj, list) or isinstance(obj, tuple) or \
            isinstance(obj, set) or isinstance(obj, frozenset)

def _isComplexMap(d):
    """
    Test whether a dictionary is suitable for serialization as a complex map.
    Return True iff all values are compounds (lists or similar, or dictionaries)
    and none of them is the empty list.
    """
    for val in d.values():
        if _isListLike(val):
            # Lists are OK provided they aren't empty
            if len(val) == 0:
                return False
        elif not isinstance(obj, dict):
            # Dictionaries are OK, other objects aren't
            return False
    # it's OK, all values have passed the test
    return True

def _parseAtom(s):
    """
    Returns an atom deserialized from *s*.
    """
    stripped = s.strip()
    # Is it a literal?
    if stripped == _TRUE:
        result = True
    elif stripped == _FALSE :
        result = False
    elif stripped == _NULL:
        result = None
    # Is it an int (or long) number?
    elif _INTEGER.match(stripped):
        # This should always work without error
        result = int(stripped)
    # Is it a float number?
    elif _FLOAT.match(stripped):
        # This should always work without error
        result = float(stripped)
    else:
        # It must be a string
        result = _parseString(s)
    return result

def _parseCompound(s):
    """
    Returns an compound deserialized from *s*.
    """
    # Remove the common whitespace prefix of all lines (except the first)
    s = _removeCommonWhitespacePrefix(s)
    # Delegate
    return _doLoads(s, True)

def _parseBlockList(s):
    """
    Return a block list deserialized from a string *s*.
    """
    # Split string in items
    items = _splitItems(s)
    result = []
    for item in items:
        result.append(_parseBlockListItem(item))
    return result

def _parseBlockListItem(s):
    """
    Return a block list item deserialized from *s*.
    """
    simpleMatch = _SIMPLE_ITEM_START.match(s)
    if simpleMatch:
        # This is a simple list item:
        # remove the matched list item marker and parse the contained atom
        s = s[len(simpleMatch.group()):]
        return _parseAtom(s)
    complexMatch = _COMPLEX_ITEM_START.match(s)
    if complexMatch:
        # This is a complex list item:
        # remove the matched list item marker and parse the contained compound
        s = s[len(complexMatch.group()):]
        return _parseCompound(s)
    else:
        raise ValueError(u"Malformed block list item: %r" % s)

def _parseComplexMap(items):
    """
    Return a complex map deserialized from the list of *items* (which must not
    be empty).
    """
    result = {}
    first = True
    values = None
    for item in items:
        stripped = item.strip()
        isKey = stripped.startswith(_BRACKET_KEY_START) and \
                stripped.endswith(_BRACKET_KEY_END)

        # Check that first item is a bracket key
        if first and not isKey:
            raise ValueError(
                u"First item of complex map must be a bracket key: %s" % item)
            first = False

        if isKey:
            if first:
                first = False
            else:
                # Store values collected for previous key
                if not values:
                    # Zero values represent an empty map (not an empty list)
                    values = {}
                if key in result:
                    raise ValueError(
                        u"Complex map contains the key [%s] twice" % key)
                result[key] = values
                values = None
            # Strip brackets and parse new key
            key = _parseAtom(stripped[1:-1])
        else:
            # Parse value
            if _SIMPLE_ITEM_START.match(stripped) or \
                    _COMPLEX_ITEM_START.match(stripped):
                # Values must be a list
                v = _parseBlockListItem(item)
                if not values:
                    values = []
                elif isinstance(values, dict):
                    raise ValueError(
                        u"Cannot mix list items and key/value pairs in the "
                        u"same section [%s] of a complex map: %s" % (key, item))
                values.append(v)
            else:
                # Values must be a map
                k, v = _parsePair(item)
                if not values:
                    values = {}
                elif isinstance(values, list):
                    raise ValueError(
                        u"Cannot mix list items and key/value pairs in the "
                        u"same section [%s] of a complex map: %s" % (key, item))
                values[k] = v

    # Store final key/value pair and return
    if not values:
        # Zero values represent an empty map (not an empty list)
        values = {}
    if key in result:
        raise ValueError(
            u"Complex map contains the key [%s] twice" % key)
    result[key] = values
    return result

def _parseInlineList(s):
    """
    Parse an inline list deserialized from a string *s*. Returns two values:
    the *parsed* list (a list object) as well as a *rest* (a string object) if
    there is non-whitespace content in the string after the end of the parsed
    list--otherwise *rest* will be the empty string.
    """
    stripped = s.strip()
    if not stripped.startswith(_INLINE_LIST_START):
        raise ValueError("Not a valid inline list: %s" % stripped)
    # Remove opening delimiter and surrounding whitespace
    content = stripped[len(_INLINE_LIST_START):]
    content = content.strip()
    reachedEOL = False
    parsed = []
    # Find and process the next item, until content is empty
    while content and not reachedEOL:
        if stripped.startswith(_INLINE_LIST_START):
            # We found an inner list: call this function recursively
            inner, rest = _parseInlineList(content)
            parsed.append(inner)
            content = rest
        else:
            # Pattern is guaranteed to match (maybe the empty string)
            rawItem = _INLINE_LIST_ATOM.match(content).group()
            item = _parseAtom(rawItem)
            parsed.append(inner)
            # Remove parsed item
            content = content[len(rawItem):]
        if content.startswith(_INLINE_LIST_SEP):
             # Strip delimiter and any following whitespace
            content = content[len(_INLINE_LIST_SEP):].lstrip()
        elif content.startswith(_INLINE_LIST_END):
            # Looks like we're done
            reachedEOL = True
             # Strip delimiter and any following whitespace
            content = content[len(_INLINE_LIST_END):].lstrip()
        else:
            raise ValueError(u"Invalid characters in inline list: %s" % content)
    # Return parsed items and the remaining content, if any
    return parsed, content

def _parseMap(s):
    """
    Return a map deserialized from a string *s*.
    """
    # Split string in items
    items = _splitItems(s)
    if items and items[0].startswith(_BRACKET_KEY_START):
        # It's a complex map
        return _parseComplexMap(items)
    else:
        result = {}
        for item in items:
            key, value = _parsePair(item)
            if key in result:
                raise ValueError(u"Map contains the key %r twice" % key)
            else:
                result[key] = value
        return result

def _parsePair(s):
    """
    Return a (key, value) tuple deserialized from the string *s*.
    """
    # Match key (up to first unescaped ":")--this will always matching something
    # (though it may be the empty string)
    rawKey = _KEY.match(s).group()
    key = _parseAtom(rawKey)
    # Look at rest of string
    rest = s[len(rawKey):]
    if rest.startswith(_COMPLEX_MAP_SEP):
        # Value is a compound
        value = _parseCompound(rest[len(_COMPLEX_MAP_SEP):])
    elif rest.startswith(_SIMPLE_MAP_SEP):
        # Value is an atom
        value = _parseAtom(rest[len(_SIMPLE_MAP_SEP):])
    else:
        raise ValueError(u"Not a valid pair: %s" % s)
    return key, value

def _parseString(s):
    """
    Return a string deserialized from *s*, unindenting continuation lines and
    unescaping escaped characters as necessary.
    """
    # Strip leading whitespace and unindent continuation lines
    s = _removeCommonWhitespacePrefix(s)
    # Strip final whitespace, if any
    stripped = s.rstrip()
    # Experimentally strip final backslashes, if any
    backslashStripped = stripped.rstrip(_ESCAPE_CHAR)
    lenDiff = len(stripped) - len(backslashStripped)
    if lenDiff % 2 == 1:
        # stripped ends in an odd number of backslashes:
        # the last of them escapes the following whitespace
        if s != stripped:
            # Strip the backslash and re-add the first whitespace-character
            stripped = stripped[0:-1]
            lenStripped = len(stripped)
            stripped += s[lenStripped:lenStripped + 1]
        else:
            # Strip the backslash and add a newline
            stripped = stripped[0:-1] + "\n"
    # Handle other escape sequences (removing the backslash)
    stripped = _ESCAPE_SEQ.sub(ur"\1", stripped)
    return stripped

def _removeCommonWhitespacePrefix(s):
    """
    Strip a uniform amount of indentation from the second and further lines of
    the string *s*, equal to the minimum indentation of all non-blank lines
    after the first line. Any indentation in the first line (i.e., up to the
    first newline) is insignificant and removed. Relative indentation of later
    lines is retained. Blank lines should be removed from the beginning and end
    of the string. The algorithm is based on the "Handling Docstring Indentation"
    algorithm from `PEP 257
    <http://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation>`_.
    """
    # Split into a list of lines:
    lines = s.splitlines()
    # Determine common whitespace prefix (first line doesn't count):
    commonPrefix = None
    for line in lines[1:]:
        stripped = line.lstrip(_INLINE_WS)
        strippedChars = len(line) - len(stripped)
        linePrefix = line[0:strippedChars]
        if stripped:
            if commonPrefix == None:
                # First empty line determines initial common prefix
                commonPrefix = linePrefix
            else:
                if len(commonPrefix) > len(linePrefix):
                    # Ensure that linePrefix is a prefix of commonPrefix
                    if commonPrefix.startswith(linePrefix):
                        # Shorten common prefix
                        commonPrefix = linePrefix
                    else:
                        raise ValueError(
                            u"Indentation error: Initial whitespace of line %r "
                            u"is incompatible with preceding indentation %r" % \
                            (line, commonPrefix))
                else:
                    # Ensure that commonPrefix is a prefix of linePrefix
                    if not linePrefix.startswith(commonPrefix):
                        raise ValueError(
                            u"Indentation error: Initial whitespace of line %r "
                            u"is incompatible with preceding indentation %r" % \
                            (line, commonPrefix))
    # Remove indentation (first line is special)
    trimmed = [lines[0].lstrip(_INLINE_WS)]
    # Skip this if commonPrefix is None (no continuation lines)
    if commonPrefix != None:
        indent = len(commonPrefix)
        for line in lines[1:]:
            trimmed.append(line[indent:])
    # Strip off trailing and leading blank lines:
    while trimmed and not trimmed[-1]:
        trimmed.pop()
    while trimmed and not trimmed[0]:
        trimmed.pop(0)
    # Return a single string:
    return u'\n'.join(trimmed)

def _splitItems(s):
    """
    Split from the string *s* into a list of items and return the items.
    Each new item is preceded by a linebreak and starts without any initial
    whitespace (lines with initial whitespace are considered continuation lines
    and become part of the preceding item).

    Empty items will be deleted.
    """
    items = _NL_BEFORE_PRINT.split(s)
    result = []
    # Discard empty items
    for item in items:
        if item:
            result.append(item)
    return result

def _writeEncoded(s, fp, encoding = None):
    """
    Writes the string *s* to the file-like object *fp*, using the specified
    character *encoding*. If *encoding* is None, no encoding is performed.
    """
    if encoding == None:
        fp.write(s)
    else:
        fp.write(s.encode(encoding))

# TODO Temporary test code
if __name__ == "__main__" :
    ##print load(u"../sample/build/com.tangiblebit.drawdio.buildit/process.tb")
    ##print load(u"../sample/objects/com.tangiblebit.drawdio/object.tb")
    print load(u"../sample/use/com.tangiblebit.drawdio.use/process.tb")