100 changes: 100 additions & 0 deletions pattern/text/__init__.py
@@ -59,6 +59,17 @@ def encode_string(v, encoding="utf-8"):
return v
return str(v)

def split_span(s):
    """ Splits a string on whitespace and yields a
        (token string, (start index, stop index)) tuple for each token.
        The stop index is inclusive, i.e., the index of the token's last character.
        Source: http://stackoverflow.com/a/9518903
    """
    for match in re.finditer(r"\S+", s):
        span = match.span()
        yield match.group(0), (span[0], span[1] - 1)
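
A quick doctest-style sanity check (illustrative input; note the inclusive stop index, span[1] - 1):

>>> list(split_span(u"Ein kurzer Satz."))
[(u'Ein', (0, 2)), (u'kurzer', (4, 9)), (u'Satz.', (11, 15))]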

decode_utf8 = decode_string
encode_utf8 = encode_string

@@ -915,6 +926,95 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma
s = TaggedString(s, format, language=kwargs.get("language", self.language))
return s

def add_offsets(self, tokens, offsets):
    """ Appends character offsets to tokens.

        Parameters
        ----------
        tokens : list of list of str
            Each list represents a token and its annotations,
            e.g. [u'Schlusspunkt', 'NN'].
        offsets : list of (int, int) tuples
            Each tuple represents the (start, end) position of the
            corresponding token in the original, unparsed string.

        Returns
        -------
        tokens : list of list of str
            Each list represents a token and its annotations (incl. their
            offsets as strings), e.g. [u'Schlusspunkt', 'NN', '10', '22'].
    """
    assert len(tokens) == len(offsets)
    for i, token in enumerate(tokens):
        token.extend(str(val) for val in offsets[i])
    return tokens
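
Round-tripping the docstring's own example (a sketch; `parser` stands for any Parser instance):

>>> parser.add_offsets([[u'Schlusspunkt', 'NN']], [(10, 22)])
[[u'Schlusspunkt', 'NN', '10', '22']]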

def parse_sentence(self, s, tokenize=False, tags=True, chunks=True, relations=False, lemmata=False, offsets=True, encoding="utf-8", **kwargs):
    """ Takes a string (a single sentence) and returns a tagged Unicode string (TaggedString).
        With tags=True, part-of-speech tags are parsed (NN, VB, IN, ...).
        With chunks=True, phrase chunk tags are parsed (NP, VP, PP, PNP, ...).
        With relations=True, semantic role labels are parsed (SBJ, OBJ).
        With lemmata=True, word lemmata are parsed.
        With offsets=True, character start/end offsets are appended to each token.
        Optional parameters are passed to
        the tokenizer, tagger, chunker, labeler and lemmatizer.
    """
    assert isinstance(s, basestring), \
        "This method only works on a sentence given as a single string."
    # Tokenizer. split_span() is always used here so that character offsets
    # can be tracked (the tokenize flag is currently unused).
    # The spans are bound to a separate name so they do not shadow
    # the offsets parameter.
    tokens, offset_spans = zip(*split_span(s))
    # Unicode: decode byte strings. A plain `for token in tokens: token = ...`
    # loop would only rebind the loop variable and discard the decoded value.
    tokens = [decode_string(token, encoding) if isinstance(token, str) else token
              for token in tokens]
    # Tagger (required by chunker, labeler & lemmatizer).
    if tags or chunks or relations or lemmata:
        tokens = self.find_tags(tokens, **kwargs)
    else:
        tokens = [[w] for w in tokens]
    # Chunker.
    if chunks or relations:
        tokens = self.find_chunks(tokens, **kwargs)
    # Labeler.
    if relations:
        tokens = self.find_labels(tokens, **kwargs)
    # Lemmatizer.
    if lemmata:
        tokens = self.find_lemmata(tokens, **kwargs)
    if offsets:
        tokens = self.add_offsets(tokens, offset_spans)
    # With collapse=False (or split=True), return the raw list
    # (this output is not usable by tree.Text).
    if not kwargs.get("collapse", True) \
            or kwargs.get("split", False):
        return tokens
    # Construct TaggedString.format
    # (this output is usable by tree.Text).
    format = ["word"]
    if tags:
        format.append("part-of-speech")
    if chunks:
        format.extend(("chunk", "preposition"))
    if relations:
        format.append("relation")
    if lemmata:
        format.append("lemma")
    if offsets:
        format.extend(("start-pos", "end-pos"))
    # Collapse the raw list into a slash-formatted tagged string.
    # Tokens are separated by spaces, tags by slashes.
    # Slashes in words are encoded with &slash;
    for i, token in enumerate(tokens):
        tokens[i][0] = token[0].replace("/", "&slash;")
        tokens[i] = "/".join(tokens[i])
    s = " ".join(tokens)
    s = TaggedString(s, format, language=kwargs.get("language", self.language))
    return s
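
A hedged usage sketch (tag values depend on the language-specific tagger; `parser` stands for any Parser instance). Since tokens are joined by spaces and tags by slashes, the last two fields of each token are its character offsets:

s = parser.parse_sentence(u"Der Schlusspunkt fehlt.")
for token in s.split(" "):
    fields = token.split("/")
    word, start, stop = fields[0], int(fields[-2]), int(fields[-1])
    # `word` spans the original string at [start:stop + 1] (stop is inclusive).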

#--- TAGGED STRING ---------------------------------------------------------------------------------
# Pattern.parse() returns a TaggedString: a Unicode string with "tags" and "language" attributes.
# The pattern.text.tree.Text class uses this attribute to determine the token format and
5 changes: 5 additions & 0 deletions pattern/text/de/__init__.py
@@ -248,6 +248,11 @@ def parsetree(s, *args, **kwargs):
"""
return Text(parse(s, *args, **kwargs))

def parsetree_with_offsets(s, *args, **kwargs):
    """ Returns a parsed Text (with character offsets) from the given string.
        Note: parse_sentence() only accepts a single sentence as input.
    """
    return Text(parser.parse_sentence(s, *args, **kwargs))
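
A minimal usage sketch (assumes the German tagger data is available; the sentence is illustrative):

>>> from pattern.text.de import parsetree_with_offsets
>>> t = parsetree_with_offsets(u"Der Schlusspunkt fehlt.")  # one sentence only
>>> sentence = t[0]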

def tree(s, token=[WORD, POS, CHUNK, PNP, REL, LEMMA]):
""" Returns a parsed Text from the given parsed string.
"""
4 changes: 2 additions & 2 deletions pattern/text/tree.py
@@ -1566,7 +1566,7 @@ def nltk_tree(sentence):
""" Returns an NLTK nltk.tree.Tree object from the given Sentence.
The NLTK module should be on the search path somewhere.
"""
- from nltk import tree
+ from nltk.tree import Tree
def do_pnp(pnp):
# Returns the PNPChunk (and the contained Chunk objects) in NLTK bracket format.
s = ' '.join([do_chunk(ch) for ch in pnp.chunks])
@@ -1591,7 +1591,7 @@ def do_chunk(ch):
T.append(do_pnp(ch.pnp))
v.append(ch.pnp)
T.append(')')
- return tree.bracket_parse(' '.join(T))
+ return Tree.fromstring(' '.join(T))
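
Context for this change: bracket_parse() is gone from nltk.tree in NLTK 3; Tree.fromstring() is its replacement and parses the same bracketed format. A minimal check (the bracket string is illustrative):

>>> from nltk.tree import Tree
>>> Tree.fromstring('(S (NP Der Hund) (VP bellt))')
Tree('S', [Tree('NP', ['Der', 'Hund']), Tree('VP', ['bellt'])])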

### GRAPHVIZ DOT ###################################################################################
