From 73f1a583ef801e056c4b764b129adddd11545716 Mon Sep 17 00:00:00 2001
From: Arne Neumann
Date: Tue, 29 Sep 2015 15:46:15 +0200
Subject: [PATCH 1/3] fix #123: pattern sentence instance can be converted into an NLTK tree

---
 pattern/text/tree.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pattern/text/tree.py b/pattern/text/tree.py
index cbe7f2d4..f2bd6baa 100644
--- a/pattern/text/tree.py
+++ b/pattern/text/tree.py
@@ -1566,7 +1566,7 @@ def nltk_tree(sentence):
     """ Returns an NLTK nltk.tree.Tree object from the given Sentence.
         The NLTK module should be on the search path somewhere.
     """
-    from nltk import tree
+    from nltk.tree import Tree
     def do_pnp(pnp):
         # Returns the PNPChunk (and the contained Chunk objects) in NLTK bracket format.
         s = ' '.join([do_chunk(ch) for ch in pnp.chunks])
@@ -1591,7 +1591,7 @@ def do_chunk(ch):
             T.append(do_pnp(ch.pnp))
             v.append(ch.pnp)
     T.append(')')
-    return tree.bracket_parse(' '.join(T))
+    return Tree.fromstring(' '.join(T))
 
 ### GRAPHVIZ DOT ###################################################################################
 
From d5ef99442ac2cbe06129f8a27dc1c54cb26d4001 Mon Sep 17 00:00:00 2001
From: Arne Neumann
Date: Wed, 30 Sep 2015 11:28:27 +0200
Subject: [PATCH 2/3] added parse_sentence(). parses tokenized sentence, result incl. offsets

---
Review notes on parse_sentence() (this section is not part of the commit message):
 * the boolean `offsets` flag is no longer overwritten by the token spans,
   so offsets=False is honoured
 * the sentence is decoded before it is split; the former per-token loop only
   rebound its loop variable and decoded nothing
 * an empty or whitespace-only sentence no longer fails while unpacking zip()
 * collapse=False / split=True returns the raw token list, not the input string
 * the number of added lines is unchanged, so hunk headers and diffstat still
   hold; the blob ids on the "index" line refer to the version before review

 pattern/text/__init__.py | 100 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/pattern/text/__init__.py b/pattern/text/__init__.py
index 39ce5558..bbe80270 100644
--- a/pattern/text/__init__.py
+++ b/pattern/text/__init__.py
@@ -59,6 +59,17 @@ def encode_string(v, encoding="utf-8"):
         return v
     return str(v)
 
+def split_span(s):
+    """
+    split a string on whitespace and yield a (token string, (start index,
+    stop index)) tuple for each token; the stop index is inclusive.
+
+    source: http://stackoverflow.com/a/9518903
+    """
+    for match in re.finditer(r"\S+", s):
+        span = match.span()
+        yield match.group(0), (span[0], span[1] - 1)
+
 decode_utf8 = decode_string
 encode_utf8 = encode_string
 
@@ -915,6 +926,95 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma
         s = TaggedString(s, format, language=kwargs.get("language", self.language))
         return s
 
+    def add_offsets(self, tokens, offsets):
+        """
+        appends the offsets to the annotations of each token (in place).
+
+        Parameters
+        ----------
+        tokens : list of list of str
+            each list represents a token and its annotations,
+            e.g. [u'Schlusspunkt', 'NN']
+        offsets: list of (int, int) tuples
+            each tuple represents the (start, end) position of the
+            corresponding token in the original, unparsed string.
+
+        Returns
+        -------
+        tokens : list of list of str
+            each list represents a token and its annotations (incl. their
+            offsets as strings), e.g. [u'Schlusspunkt', 'NN', '10', '22']
+        """
+        assert len(tokens) == len(offsets)
+        for i, token in enumerate(tokens):
+            token.extend(str(val) for val in offsets[i])
+        return tokens
+
+    def parse_sentence(self, s, tokenize=False, tags=True, chunks=True, relations=False, lemmata=False, offsets=True, encoding="utf-8", **kwargs):
+        """ Takes a string (sentence) and returns a tagged Unicode string (TaggedString).
+            With tags=True, part-of-speech tags are parsed (NN, VB, IN, ...).
+            With chunks=True, phrase chunk tags are parsed (NP, VP, PP, PNP, ...).
+            With relations=True, semantic role labels are parsed (SBJ, OBJ).
+            With lemmata=True, word lemmata are parsed.
+            With offsets=True, the start and (inclusive) end index of each token are added.
+            Optional parameters are passed to the tagger, chunker, labeler and lemmatizer.
+        """
+        assert isinstance(s, basestring), \
+            "This method only works on a sentence given as a single string."
+        # Unicode: decode the sentence before splitting, so offsets refer to the decoded string.
+        if isinstance(s, str):
+            s = decode_string(s, encoding)
+        # Tokenizer: whitespace split (tokenize is not used); an empty sentence has no tokens.
+        spans = list(split_span(s))
+        words, positions = zip(*spans) if spans else ((), ())
+
+        # Tagger (required by chunker, labeler & lemmatizer).
+        if tags or chunks or relations or lemmata:
+            tokens = self.find_tags(words, **kwargs)
+        else:
+            tokens = [[w] for w in words]
+        # Chunker.
+        if chunks or relations:
+            tokens = self.find_chunks(tokens, **kwargs)
+        # Labeler.
+        if relations:
+            tokens = self.find_labels(tokens, **kwargs)
+        # Lemmatizer.
+        if lemmata:
+            tokens = self.find_lemmata(tokens, **kwargs)
+        if offsets:
+            tokens = self.add_offsets(tokens, positions)
+
+        # Slash-formatted tagged string.
+        # With collapse=False (or split=True), returns raw list
+        # (this output is not usable by tree.Text).
+        if not kwargs.get("collapse", True) \
+        or kwargs.get("split", False):
+            return tokens
+        # Construct TaggedString.format.
+        # (this output is usable by tree.Text).
+        format = ["word"]
+        if tags:
+            format.append("part-of-speech")
+        if chunks:
+            format.extend(("chunk", "preposition"))
+        if relations:
+            format.append("relation")
+        if lemmata:
+            format.append("lemma")
+        if offsets:
+            format.extend(("start-pos", "end-pos"))
+        # Collapse raw list.
+        # Sentences are separated by newlines, tokens by spaces, tags by slashes.
+        # Slashes in words are encoded with &slash;
+        for i, token in enumerate(tokens):
+            tokens[i][0] = token[0].replace("/", "&slash;")
+            tokens[i] = "/".join(tokens[i])
+        s = " ".join(tokens)
+        #~ s = "\n".join(s)
+        s = TaggedString(s, format, language=kwargs.get("language", self.language))
+        return s
+
 #--- TAGGED STRING ---------------------------------------------------------------------------------
 # Pattern.parse() returns a TaggedString: a Unicode string with "tags" and "language" attributes.
 # The pattern.text.tree.Text class uses this attribute to determine the token format and
From 3a2625b6a52eb0ab1783a615584c8df651051ad7 Mon Sep 17 00:00:00 2001
From: Arne Neumann
Date: Wed, 30 Sep 2015 16:43:12 +0200
Subject: [PATCH 3/3] added parsetree_with_offsets()

- will generate a Text(), which will contain one Sentence(), where each word
  is annotated with string offsets

>>> for w in s: print w, w.custom_tags
Word(u'F\xfcr/IN') {'start-pos': u'0', 'end-pos': u'2'}
Word(u'den/DT') {'start-pos': u'4', 'end-pos': u'6'}
---
 pattern/text/de/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pattern/text/de/__init__.py b/pattern/text/de/__init__.py
index 8af4add6..079b6529 100644
--- a/pattern/text/de/__init__.py
+++ b/pattern/text/de/__init__.py
@@ -248,6 +248,11 @@ def parsetree(s, *args, **kwargs):
     """
     return Text(parse(s, *args, **kwargs))
 
+def parsetree_with_offsets(s, *args, **kwargs):
+    """ Returns a parsed Text (with offsets) from the given string.
+    """
+    return Text(parser.parse_sentence(s, *args, **kwargs))
+
 def tree(s, token=[WORD, POS, CHUNK, PNP, REL, LEMMA]):
     """ Returns a parsed Text from the given parsed string.
     """