100 changes: 100 additions & 0 deletions pattern/text/__init__.py
@@ -59,6 +59,17 @@ def encode_string(v, encoding="utf-8"):
return v
return str(v)

def split_span(s):
    """ Splits a string on whitespace and yields a
        (token string, (start index, stop index)) tuple for each token.
        The stop index is inclusive, i.e., the index of the token's last character.
        Source: http://stackoverflow.com/a/9518903
    """
    for match in re.finditer(r"\S+", s):
        span = match.span()
        yield match.group(0), (span[0], span[1] - 1)
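
A quick doctest-style sanity check (illustrative input; note the inclusive stop index, span[1] - 1):

>>> list(split_span(u"Ein kurzer Satz."))
[(u'Ein', (0, 2)), (u'kurzer', (4, 9)), (u'Satz.', (11, 15))]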

decode_utf8 = decode_string
encode_utf8 = encode_string

@@ -915,6 +926,95 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma
s = TaggedString(s, format, language=kwargs.get("language", self.language))
return s

def add_offsets(self, tokens, offsets):
    """ Appends character offsets to tokens.

        Parameters
        ----------
        tokens : list of list of str
            Each list represents a token and its annotations,
            e.g. [u'Schlusspunkt', 'NN'].
        offsets : list of (int, int) tuples
            Each tuple represents the (start, end) position of the
            corresponding token in the original, unparsed string.

        Returns
        -------
        tokens : list of list of str
            Each list represents a token and its annotations (incl. their
            offsets as strings), e.g. [u'Schlusspunkt', 'NN', '10', '22'].
    """
    assert len(tokens) == len(offsets)
    for i, token in enumerate(tokens):
        token.extend(str(val) for val in offsets[i])
    return tokens
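
Round-tripping the docstring's own example (a sketch; `parser` stands for any Parser instance):

>>> parser.add_offsets([[u'Schlusspunkt', 'NN']], [(10, 22)])
[[u'Schlusspunkt', 'NN', '10', '22']]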

def parse_sentence(self, s, tokenize=False, tags=True, chunks=True, relations=False, lemmata=False, offsets=True, encoding="utf-8", **kwargs):
    """ Takes a string (a single sentence) and returns a tagged Unicode string (TaggedString).
        With tags=True, part-of-speech tags are parsed (NN, VB, IN, ...).
        With chunks=True, phrase chunk tags are parsed (NP, VP, PP, PNP, ...).
        With relations=True, semantic role labels are parsed (SBJ, OBJ).
        With lemmata=True, word lemmata are parsed.
        With offsets=True, character start/end offsets are appended to each token.
        Optional parameters are passed to
        the tokenizer, tagger, chunker, labeler and lemmatizer.
    """
    assert isinstance(s, basestring), \
        "This method only works on a sentence given as a single string."
    # Tokenizer. split_span() is always used here so that character offsets
    # can be tracked (the tokenize flag is currently unused).
    # The spans are bound to a separate name so they do not shadow
    # the offsets parameter.
    tokens, offset_spans = zip(*split_span(s))
    # Unicode: decode byte strings. A plain `for token in tokens: token = ...`
    # loop would only rebind the loop variable and discard the decoded value.
    tokens = [decode_string(token, encoding) if isinstance(token, str) else token
              for token in tokens]
    # Tagger (required by chunker, labeler & lemmatizer).
    if tags or chunks or relations or lemmata:
        tokens = self.find_tags(tokens, **kwargs)
    else:
        tokens = [[w] for w in tokens]
    # Chunker.
    if chunks or relations:
        tokens = self.find_chunks(tokens, **kwargs)
    # Labeler.
    if relations:
        tokens = self.find_labels(tokens, **kwargs)
    # Lemmatizer.
    if lemmata:
        tokens = self.find_lemmata(tokens, **kwargs)
    if offsets:
        tokens = self.add_offsets(tokens, offset_spans)
    # With collapse=False (or split=True), return the raw list
    # (this output is not usable by tree.Text).
    if not kwargs.get("collapse", True) \
            or kwargs.get("split", False):
        return tokens
    # Construct TaggedString.format
    # (this output is usable by tree.Text).
    format = ["word"]
    if tags:
        format.append("part-of-speech")
    if chunks:
        format.extend(("chunk", "preposition"))
    if relations:
        format.append("relation")
    if lemmata:
        format.append("lemma")
    if offsets:
        format.extend(("start-pos", "end-pos"))
    # Collapse the raw list into a slash-formatted tagged string.
    # Tokens are separated by spaces, tags by slashes.
    # Slashes in words are encoded with &slash;
    for i, token in enumerate(tokens):
        tokens[i][0] = token[0].replace("/", "&slash;")
        tokens[i] = "/".join(tokens[i])
    s = " ".join(tokens)
    s = TaggedString(s, format, language=kwargs.get("language", self.language))
    return s
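
A hedged usage sketch (tag values depend on the language-specific tagger; `parser` stands for any Parser instance). Since tokens are joined by spaces and tags by slashes, the last two fields of each token are its character offsets:

s = parser.parse_sentence(u"Der Schlusspunkt fehlt.")
for token in s.split(" "):
    fields = token.split("/")
    word, start, stop = fields[0], int(fields[-2]), int(fields[-1])
    # `word` spans the original string at [start:stop + 1] (stop is inclusive).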

#--- TAGGED STRING ---------------------------------------------------------------------------------
# Pattern.parse() returns a TaggedString: a Unicode string with "tags" and "language" attributes.
# The pattern.text.tree.Text class uses this attribute to determine the token format and
5 changes: 5 additions & 0 deletions pattern/text/de/__init__.py
@@ -248,6 +248,11 @@ def parsetree(s, *args, **kwargs):
"""
return Text(parse(s, *args, **kwargs))

def parsetree_with_offsets(s, *args, **kwargs):
    """ Returns a parsed Text (with character offsets) from the given string.
        Note: parse_sentence() only accepts a single sentence as input.
    """
    return Text(parser.parse_sentence(s, *args, **kwargs))
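
A minimal usage sketch (assumes the German tagger data is available; the sentence is illustrative):

>>> from pattern.text.de import parsetree_with_offsets
>>> t = parsetree_with_offsets(u"Der Schlusspunkt fehlt.")  # one sentence only
>>> sentence = t[0]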

def tree(s, token=[WORD, POS, CHUNK, PNP, REL, LEMMA]):
""" Returns a parsed Text from the given parsed string.
"""
4 changes: 2 additions & 2 deletions pattern/text/tree.py
@@ -1566,7 +1566,7 @@ def nltk_tree(sentence):
""" Returns an NLTK nltk.tree.Tree object from the given Sentence.
The NLTK module should be on the search path somewhere.
"""
- from nltk import tree
+ from nltk.tree import Tree
def do_pnp(pnp):
# Returns the PNPChunk (and the contained Chunk objects) in NLTK bracket format.
s = ' '.join([do_chunk(ch) for ch in pnp.chunks])
@@ -1591,7 +1591,7 @@ def do_chunk(ch):
T.append(do_pnp(ch.pnp))
v.append(ch.pnp)
T.append(')')
- return tree.bracket_parse(' '.join(T))
+ return Tree.fromstring(' '.join(T))
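
Context for this change: bracket_parse() is gone from nltk.tree in NLTK 3; Tree.fromstring() is its replacement and parses the same bracketed format. A minimal check (the bracket string is illustrative):

>>> from nltk.tree import Tree
>>> Tree.fromstring('(S (NP Der Hund) (VP bellt))')
Tree('S', [Tree('NP', ['Der', 'Hund']), Tree('VP', ['bellt'])])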

### GRAPHVIZ DOT ###################################################################################
