From 73f1a583ef801e056c4b764b129adddd11545716 Mon Sep 17 00:00:00 2001
From: Arne Neumann <github@arne.cl>
Date: Tue, 29 Sep 2015 15:46:15 +0200
Subject: [PATCH 1/3] fix #123: pattern sentence instance can be converted into
 an NLTK tree

---
 pattern/text/tree.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pattern/text/tree.py b/pattern/text/tree.py
index cbe7f2d4..f2bd6baa 100644
--- a/pattern/text/tree.py
+++ b/pattern/text/tree.py
@@ -1566,7 +1566,7 @@ def nltk_tree(sentence):
     """ Returns an NLTK nltk.tree.Tree object from the given Sentence.
         The NLTK module should be on the search path somewhere.
     """
-    from nltk import tree
+    from nltk.tree import Tree
     def do_pnp(pnp):
         # Returns the PNPChunk (and the contained Chunk objects) in NLTK bracket format.
         s = ' '.join([do_chunk(ch) for ch in pnp.chunks])
@@ -1591,7 +1591,7 @@ def do_chunk(ch):
             T.append(do_pnp(ch.pnp))
             v.append(ch.pnp)
     T.append(')')
-    return tree.bracket_parse(' '.join(T))
+    return Tree.fromstring(' '.join(T))
 
 ### GRAPHVIZ DOT ###################################################################################
 

From d5ef99442ac2cbe06129f8a27dc1c54cb26d4001 Mon Sep 17 00:00:00 2001
From: Arne Neumann <github@arne.cl>
Date: Wed, 30 Sep 2015 11:28:27 +0200
Subject: [PATCH 2/3] added parse_sentence(). parses tokenized sentence, result
 incl. offsets

---
 pattern/text/__init__.py | 100 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/pattern/text/__init__.py b/pattern/text/__init__.py
index 39ce5558..bbe80270 100644
--- a/pattern/text/__init__.py
+++ b/pattern/text/__init__.py
@@ -59,6 +59,17 @@ def encode_string(v, encoding="utf-8"):
         return v
     return str(v)
 
+def split_span(s):
+    """ Yields a (token, (start, end)) tuple for each token in the given string.
+        Tokens are maximal runs of non-whitespace characters;
+        start and end are the indices of the token's first and last character
+        (so end is inclusive, unlike a slice bound).
+        Source: http://stackoverflow.com/a/9518903
+    """
+    nonspace = re.compile(r"\S+")
+    for m in nonspace.finditer(s):
+        yield m.group(0), (m.start(), m.end() - 1)
+
 decode_utf8 = decode_string
 encode_utf8 = encode_string
 
@@ -915,6 +926,95 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma
         s = TaggedString(s, format, language=kwargs.get("language", self.language))
         return s
 
+    def add_offsets(self, tokens, offsets):
+        """
+        Appends character offsets to each token's annotations (in place).
+
+        Parameters
+        ----------
+        tokens : list of list of str
+            one list per token, holding the word and its tags,
+            e.g. [u'Schlusspunkt', 'NN']
+        offsets : list of (int, int) tuples
+            one (start, end) tuple per token, giving its position
+            in the original, unparsed string.
+
+        Returns
+        -------
+        tokens : list of list of str
+            the same lists, each extended with its offsets converted
+            to strings, e.g. [u'Schlusspunkt', 'NN', '10', '22']
+        """
+        assert len(tokens) == len(offsets)
+        for index, annotations in enumerate(tokens):
+            annotations.extend(map(str, offsets[index]))
+        return tokens
+
+    def parse_sentence(self, s, tokenize=False, tags=True, chunks=True, relations=False, lemmata=False, offsets=True, encoding="utf-8", **kwargs):
+        """ Takes a string (one sentence) and returns a tagged Unicode string (TaggedString).
+            The sentence is split on whitespace only; tokenize is accepted but not used.
+            With tags=True, part-of-speech tags are parsed (NN, VB, IN, ...).
+            With chunks=True, phrase chunk tags are parsed (NP, VP, PP, PNP, ...).
+            With relations=True, semantic role labels are parsed (SBJ, OBJ).
+            With lemmata=True, word lemmata are parsed.
+            With offsets=True, each token gets its start and (inclusive) end index in s.
+            With collapse=False (or split=True), the raw list of tokens is returned instead.
+            Optional parameters are passed to the tagger, chunker, labeler and lemmatizer.
+        """
+        assert isinstance(s, basestring), \
+            "This method only works on a sentence given as a single string."
+        # Tokenizer (whitespace only). The spans get their own names: rebinding
+        # "offsets" here would make it impossible to honor offsets=False.
+        spans = list(split_span(s))  # a list (not zip-unpacking) copes with empty input
+        positions = [span for _, span in spans]
+        # Unicode.
+        tokens = [decode_string(w, encoding) if isinstance(w, str) else w
+                  for w, _ in spans]
+        # Tagger (required by chunker, labeler & lemmatizer).
+        if tags or chunks or relations or lemmata:
+            tokens = self.find_tags(tokens, **kwargs)
+        else:
+            tokens = [[w] for w in tokens]
+        # Chunker.
+        if chunks or relations:
+            tokens = self.find_chunks(tokens, **kwargs)
+        # Labeler.
+        if relations:
+            tokens = self.find_labels(tokens, **kwargs)
+        # Lemmatizer.
+        if lemmata:
+            tokens = self.find_lemmata(tokens, **kwargs)
+        if offsets:
+            tokens = self.add_offsets(tokens, positions)
+
+        # Slash-formatted tagged string.
+        # With collapse=False (or split=True), returns raw list
+        # (this output is not usable by tree.Text).
+        if not kwargs.get("collapse", True) or kwargs.get("split", False):
+            return tokens
+        # Construct TaggedString.format.
+        # (this output is usable by tree.Text).
+        format = ["word"]
+        if tags:
+            format.append("part-of-speech")
+        if chunks:
+            format.extend(("chunk", "preposition"))
+        if relations:
+            format.append("relation")
+        if lemmata:
+            format.append("lemma")
+        if offsets:
+            format.extend(("start-pos", "end-pos"))
+        # Collapse raw list.
+        # Tokens are separated by spaces, tags by slashes.
+        # Slashes in words are encoded with &slash;
+        for i, token in enumerate(tokens):
+            tokens[i][0] = token[0].replace("/", "&slash;")
+            tokens[i] = "/".join(tokens[i])
+        s = " ".join(tokens)
+        s = TaggedString(s, format, language=kwargs.get("language", self.language))
+        return s
+
 #--- TAGGED STRING ---------------------------------------------------------------------------------
 # Pattern.parse() returns a TaggedString: a Unicode string with "tags" and "language" attributes.
 # The pattern.text.tree.Text class uses this attribute to determine the token format and

From 3a2625b6a52eb0ab1783a615584c8df651051ad7 Mon Sep 17 00:00:00 2001
From: Arne Neumann <github@arne.cl>
Date: Wed, 30 Sep 2015 16:43:12 +0200
Subject: [PATCH 3/3] added parsetree_with_offsets()

- will generate a Text(), which will contain one Sentence(),
  where each word is annotated with string offsets

>>> for w in s:
        print w, w.custom_tags

Word(u'F\xfcr/IN') {'start-pos': u'0', 'end-pos': u'2'}
Word(u'den/DT') {'start-pos': u'4', 'end-pos': u'6'}
---
 pattern/text/de/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pattern/text/de/__init__.py b/pattern/text/de/__init__.py
index 8af4add6..079b6529 100644
--- a/pattern/text/de/__init__.py
+++ b/pattern/text/de/__init__.py
@@ -248,6 +248,11 @@ def parsetree(s, *args, **kwargs):
     """
     return Text(parse(s, *args, **kwargs))
 
+def parsetree_with_offsets(s, *args, **kwargs):
+    """ Returns a parsed Text for the given sentence string, built from parser.parse_sentence(). """
+    tagged = parser.parse_sentence(s, *args, **kwargs)
+    return Text(tagged)
+
 def tree(s, token=[WORD, POS, CHUNK, PNP, REL, LEMMA]):
     """ Returns a parsed Text from the given parsed string.
     """