diff --git a/.travis.yml b/.travis.yml index 06c20368..4e412ba3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,39 +2,24 @@ language: python python: - "2.6" - - "2.7.13" + - "2.7" - "3.5" - "3.6" -matrix: - allow_failures: - - python: "2.6" - - python: "3.5" - - python: "3.6" - before_install: - export TZ=Europe/Brussels - pip install --upgrade setuptools - - pip install --quiet pytest pytest-cov pytest-xdist + - pip install --upgrade pytest pytest-cov pytest-xdist install: - - python setup.py install --quiet + - python setup.py install script: - pytest --cov -after_script: - - pip install --quiet coveralls - - coveralls - -branches: - only: - - development - - python3 +# after_script: +# - pip install coveralls +# - coveralls notifications: - email: false - -# You can connect to MySQL/MariaDB using the username "travis" or "root" and a blank password. -services: - - mysql + - email: false diff --git a/pattern/text/__init__.py b/pattern/text/__init__.py index d36951a3..5dd93996 100644 --- a/pattern/text/__init__.py +++ b/pattern/text/__init__.py @@ -919,7 +919,7 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma s[i][j][0] = s[i][j][0].replace("/", "&slash;") s[i][j] = "/".join(s[i][j]) s[i] = " ".join(s[i]) - s = "\n".join(s) + s = u"\n".join(s) s = TaggedString(s, format, language=kwargs.get("language", self.language)) return s diff --git a/pattern/text/en/wordnet/__init__.py b/pattern/text/en/wordnet/__init__.py index 4b265e5d..d80115c9 100644 --- a/pattern/text/en/wordnet/__init__.py +++ b/pattern/text/en/wordnet/__init__.py @@ -35,8 +35,8 @@ os.environ["WNHOME"] = os.path.join(MODULE, CORPUS) os.environ["WNSEARCHDIR"] = os.path.join(MODULE, CORPUS, "dict") -from pywordnet import wordnet as wn -from pywordnet import wntools +from .pywordnet import wordnet as wn +from .pywordnet import wntools # The bundled version of PyWordNet has custom fixes. # - line 365: check if lexnames exist. 
diff --git a/pattern/text/en/wordnet/pywordnet/wntools.py b/pattern/text/en/wordnet/pywordnet/wntools.py index 7593eeea..4e44e133 100755 --- a/pattern/text/en/wordnet/pywordnet/wntools.py +++ b/pattern/text/en/wordnet/pywordnet/wntools.py @@ -32,7 +32,7 @@ __author__ = "Oliver Steele " __version__ = "2.0" -from wordnet import * +from .wordnet import * # # Domain utilities # def _requireSource(entity): if not hasattr(entity, 'pointers'): if isinstance(entity, Word): - raise TypeError(`entity` + " is not a Sense or Synset. Try " + `entity` + "[0] instead.") + raise TypeError(repr(entity) + " is not a Sense or Synset. Try " + repr(entity) + "[0] instead.") else: - raise TypeError(`entity` + " is not a Sense or Synset") + raise TypeError(repr(entity) + " is not a Sense or Synset") def tree(source, pointerType): """ @@ -337,7 +337,8 @@ def trySubstitutions(trySubstitutions, # workaround for lack of nested closures # Testing # def _test(reset=0): - import doctest, wntools + import doctest + from . import wntools if reset: doctest.master = None # This keeps doctest from complaining after a reload. return doctest.testmod(wntools) diff --git a/pattern/text/en/wordnet/pywordnet/wordnet.py b/pattern/text/en/wordnet/pywordnet/wordnet.py index 7410756e..1fa0ca24 100755 --- a/pattern/text/en/wordnet/pywordnet/wordnet.py +++ b/pattern/text/en/wordnet/pywordnet/wordnet.py @@ -42,7 +42,6 @@ import string import os from os import environ -from types import IntType, ListType, StringType, TupleType # @@ -210,15 +209,15 @@ class Word: def __init__(self, line): """Initialize the word from a line of a WN POS file.""" - tokens = string.split(line) - ints = map(int, tokens[int(tokens[3]) + 4:]) - self.form = string.replace(tokens[0], '_', ' ') + tokens = line.split() + ints = list(map(int, tokens[int(tokens[3]) + 4:])) + self.form = tokens[0].replace('_', ' ') "Orthographic representation of the word." 
- self.pos = _normalizePOS(tokens[1]) + self.pos = _normalizePOS(tokens[1]) "Part of speech. One of NOUN, VERB, ADJECTIVE, ADVERB." - self.taggedSenseCount = ints[1] + self.taggedSenseCount = ints[1] "Number of senses that are tagged." - self._synsetOffsets = ints[2:ints[0]+2] + self._synsetOffsets = ints[2:ints[0]+2] def getPointers(self, pointerType=None): """Pointers connect senses and synsets, not words. @@ -231,17 +230,17 @@ def getPointerTargets(self, pointerType=None): raise self.getPointers.__doc__ def getSenses(self): - """Return a sequence of senses. - - >>> N['dog'].getSenses() - ('dog' in {noun: dog, domestic dog, Canis familiaris}, 'dog' in {noun: frump, dog}, 'dog' in {noun: dog}, 'dog' in {noun: cad, bounder, blackguard, dog, hound, heel}, 'dog' in {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, 'dog' in {noun: pawl, detent, click, dog}, 'dog' in {noun: andiron, firedog, dog, dog-iron}) - """ - if not hasattr(self, '_senses'): - def getSense(offset, pos=self.pos, form=self.form): - return getSynset(pos, offset)[form] - self._senses = tuple(map(getSense, self._synsetOffsets)) - del self._synsetOffsets - return self._senses + """Return a sequence of senses. + + >>> N['dog'].getSenses() + ('dog' in {noun: dog, domestic dog, Canis familiaris}, 'dog' in {noun: frump, dog}, 'dog' in {noun: dog}, 'dog' in {noun: cad, bounder, blackguard, dog, hound, heel}, 'dog' in {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, 'dog' in {noun: pawl, detent, click, dog}, 'dog' in {noun: andiron, firedog, dog, dog-iron}) + """ + if not hasattr(self, '_senses'): + def getSense(offset, pos=self.pos, form=self.form): + return getSynset(pos, offset)[form] + self._senses = tuple(map(getSense, self._synsetOffsets)) + del self._synsetOffsets + return self._senses # Deprecated. Present for backwards compatability. 
def senses(self): @@ -253,70 +252,70 @@ def senses(self): return self.getSense() def isTagged(self): - """Return 1 if any sense is tagged. - - >>> N['dog'].isTagged() - 1 - """ - return self.taggedSenseCount > 0 + """Return 1 if any sense is tagged. + + >>> N['dog'].isTagged() + 1 + """ + return self.taggedSenseCount > 0 def getAdjectivePositions(self): - """Return a sequence of adjective positions that this word can - appear in. These are elements of ADJECTIVE_POSITIONS. - - >>> ADJ['clear'].getAdjectivePositions() - [None, 'predicative'] - """ - positions = {} - for sense in self.getSenses(): - positions[sense.position] = 1 - return positions.keys() + """Return a sequence of adjective positions that this word can + appear in. These are elements of ADJECTIVE_POSITIONS. + + >>> ADJ['clear'].getAdjectivePositions() + [None, 'predicative'] + """ + positions = {} + for sense in self.getSenses(): + positions[sense.position] = 1 + return positions.keys() adjectivePositions = getAdjectivePositions # backwards compatability def __cmp__(self, other): - """ - >>> N['cat'] < N['dog'] - 1 - >>> N['dog'] < V['dog'] - 1 - """ - return _compareInstances(self, other, ('pos', 'form')) + """ + >>> N['cat'] < N['dog'] + 1 + >>> N['dog'] < V['dog'] + 1 + """ + return _compareInstances(self, other, ('pos', 'form')) def __str__(self): - """Return a human-readable representation. - - >>> str(N['dog']) - 'dog(n.)' - """ - abbrs = {NOUN: 'n.', VERB: 'v.', ADJECTIVE: 'adj.', ADVERB: 'adv.'} - return self.form + "(" + abbrs[self.pos] + ")" + """Return a human-readable representation. + + >>> str(N['dog']) + 'dog(n.)' + """ + abbrs = {NOUN: 'n.', VERB: 'v.', ADJECTIVE: 'adj.', ADVERB: 'adv.'} + return self.form + "(" + abbrs[self.pos] + ")" def __repr__(self): - """If ReadableRepresentations is true, return a human-readable - representation, e.g. 'dog(n.)'. - - If ReadableRepresentations is false, return a machine-readable - representation, e.g. "getWord('dog', 'noun')". 
- """ - if ReadableRepresentations: - return str(self) - return "getWord" + `(self.form, self.pos)` - + """If ReadableRepresentations is true, return a human-readable + representation, e.g. 'dog(n.)'. + + If ReadableRepresentations is false, return a machine-readable + representation, e.g. "getWord('dog', 'noun')". + """ + if ReadableRepresentations: + return str(self) + return "getWord" + repr((self.form, self.pos)) + # # Sequence protocol (a Word's elements are its Senses) # def __nonzero__(self): - return 1 + return 1 def __len__(self): - return len(self.getSenses()) + return len(self.getSenses()) def __getitem__(self, index): - return self.getSenses()[index] + return self.getSenses()[index] def __getslice__(self, i, j): - return self.getSenses()[i:j] + return self.getSenses()[i:j] class Synset: @@ -354,157 +353,157 @@ class Synset: def __init__(self, pos, offset, line): "Initialize the synset from a line off a WN synset file." - self.pos = pos + self.pos = pos "part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB." - self.offset = offset + self.offset = offset """integer offset into the part-of-speech file. 
Together with pos, this can be used as a unique id.""" - tokens = string.split(line[:string.index(line, '|')]) - self.ssType = tokens[2] - self.gloss = string.strip(line[string.index(line, '|') + 1:]) + tokens = line[:string.index(line, '|')].split() + self.ssType = tokens[2] + self.gloss = line[string.index(line, '|') + 1:].strip() self.lexname = Lexname.lexnames and Lexname.lexnames[int(tokens[1])] or [] - (self._senseTuples, remainder) = _partition(tokens[4:], 2, string.atoi(tokens[3], 16)) - (self._pointerTuples, remainder) = _partition(remainder[1:], 4, int(remainder[0])) - if pos == VERB: - (vfTuples, remainder) = _partition(remainder[1:], 3, int(remainder[0])) - def extractVerbFrames(index, vfTuples): - return tuple(map(lambda t:string.atoi(t[1]), filter(lambda t,i=index:string.atoi(t[2],16) in (0, i), vfTuples))) - senseVerbFrames = [] - for index in range(1, len(self._senseTuples) + 1): - senseVerbFrames.append(extractVerbFrames(index, vfTuples)) - self._senseVerbFrames = senseVerbFrames - self.verbFrames = tuple(extractVerbFrames(None, vfTuples)) + (self._senseTuples, remainder) = _partition(tokens[4:], 2, string.atoi(tokens[3], 16)) + (self._pointerTuples, remainder) = _partition(remainder[1:], 4, int(remainder[0])) + if pos == VERB: + (vfTuples, remainder) = _partition(remainder[1:], 3, int(remainder[0])) + def extractVerbFrames(index, vfTuples): + return tuple(map(lambda t:string.atoi(t[1]), filter(lambda t,i=index:string.atoi(t[2],16) in (0, i), vfTuples))) + senseVerbFrames = [] + for index in range(1, len(self._senseTuples) + 1): + senseVerbFrames.append(extractVerbFrames(index, vfTuples)) + self._senseVerbFrames = senseVerbFrames + self.verbFrames = tuple(extractVerbFrames(None, vfTuples)) """A sequence of integers that index into VERB_FRAME_STRINGS. These list the verb frames that any Sense in this synset participates in. (See also Sense.verbFrames.) Defined only for verbs.""" def getSenses(self): - """Return a sequence of Senses. 
- - >>> N['dog'][0].getSenses() - ('dog' in {noun: dog, domestic dog, Canis familiaris},) - """ - if not hasattr(self, '_senses'): - def loadSense(senseTuple, verbFrames=None, synset=self): - return Sense(synset, senseTuple, verbFrames) - if self.pos == VERB: - self._senses = tuple(map(loadSense, self._senseTuples, self._senseVerbFrames)) - del self._senseVerbFrames - else: - self._senses = tuple(map(loadSense, self._senseTuples)) - del self._senseTuples - return self._senses + """Return a sequence of Senses. + + >>> N['dog'][0].getSenses() + ('dog' in {noun: dog, domestic dog, Canis familiaris},) + """ + if not hasattr(self, '_senses'): + def loadSense(senseTuple, verbFrames=None, synset=self): + return Sense(synset, senseTuple, verbFrames) + if self.pos == VERB: + self._senses = tuple(map(loadSense, self._senseTuples, self._senseVerbFrames)) + del self._senseVerbFrames + else: + self._senses = tuple(map(loadSense, self._senseTuples)) + del self._senseTuples + return self._senses senses = getSenses def getPointers(self, pointerType=None): - """Return a sequence of Pointers. + """Return a sequence of Pointers. If pointerType is specified, only pointers of that type are returned. In this case, pointerType should be an element of POINTER_TYPES. 
- - >>> N['dog'][0].getPointers()[:5] - (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt}) - >>> N['dog'][0].getPointers(HYPERNYM) - (hypernym -> {noun: canine, canid},) - """ - if not hasattr(self, '_pointers'): - def loadPointer(tuple, synset=self): - return Pointer(synset.offset, tuple) - self._pointers = tuple(map(loadPointer, self._pointerTuples)) - del self._pointerTuples - if pointerType == None: - return self._pointers - else: - _requirePointerType(pointerType) - return filter(lambda pointer, type=pointerType: pointer.type == type, self._pointers) + + >>> N['dog'][0].getPointers()[:5] + (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt}) + >>> N['dog'][0].getPointers(HYPERNYM) + (hypernym -> {noun: canine, canid},) + """ + if not hasattr(self, '_pointers'): + def loadPointer(tuple, synset=self): + return Pointer(synset.offset, tuple) + self._pointers = tuple(map(loadPointer, self._pointerTuples)) + del self._pointerTuples + if pointerType == None: + return self._pointers + else: + _requirePointerType(pointerType) + return filter(lambda pointer, type=pointerType: pointer.type == type, self._pointers) pointers = getPointers # backwards compatability def getPointerTargets(self, pointerType=None): - """Return a sequence of Senses or Synsets. - + """Return a sequence of Senses or Synsets. + If pointerType is specified, only targets of pointers of that type are returned. In this case, pointerType should be an element of POINTER_TYPES. 
- - >>> N['dog'][0].getPointerTargets()[:5] - [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}] - >>> N['dog'][0].getPointerTargets(HYPERNYM) - [{noun: canine, canid}] - """ - return map(Pointer.target, self.getPointers(pointerType)) + + >>> N['dog'][0].getPointerTargets()[:5] + [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}] + >>> N['dog'][0].getPointerTargets(HYPERNYM) + [{noun: canine, canid}] + """ + return map(Pointer.target, self.getPointers(pointerType)) pointerTargets = getPointerTargets # backwards compatability def isTagged(self): - """Return 1 if any sense is tagged. - - >>> N['dog'][0].isTagged() - 1 - >>> N['dog'][1].isTagged() - 0 - """ - return len(filter(Sense.isTagged, self.getSenses())) > 0 + """Return 1 if any sense is tagged. + + >>> N['dog'][0].isTagged() + 1 + >>> N['dog'][1].isTagged() + 0 + """ + return len(filter(Sense.isTagged, self.getSenses())) > 0 def __str__(self): - """Return a human-readable representation. - - >>> str(N['dog'][0].synset) - '{noun: dog, domestic dog, Canis familiaris}' - """ - return "{" + self.pos + ": " + string.joinfields(map(lambda sense:sense.form, self.getSenses()), ", ") + "}" + """Return a human-readable representation. + + >>> str(N['dog'][0].synset) + '{noun: dog, domestic dog, Canis familiaris}' + """ + return "{" + self.pos + ": " + string.joinfields(map(lambda sense:sense.form, self.getSenses()), ", ") + "}" def __repr__(self): - """If ReadableRepresentations is true, return a human-readable - representation, e.g. 'dog(n.)'. - - If ReadableRepresentations is false, return a machine-readable - representation, e.g. "getSynset(pos, 1234)". 
- """ - if ReadableRepresentations: - return str(self) - return "getSynset" + `(self.pos, self.offset)` + """If ReadableRepresentations is true, return a human-readable + representation, e.g. 'dog(n.)'. + + If ReadableRepresentations is false, return a machine-readable + representation, e.g. "getSynset(pos, 1234)". + """ + if ReadableRepresentations: + return str(self) + return "getSynset" + repr((self.pos, self.offset)) def __cmp__(self, other): - return _compareInstances(self, other, ('pos', 'offset')) + return _compareInstances(self, other, ('pos', 'offset')) # # Sequence protocol (a Synset's elements are its senses). # def __nonzero__(self): - return 1 + return 1 def __len__(self): - """ - >>> len(N['dog'][0].synset) - 3 - """ - return len(self.getSenses()) + """ + >>> len(N['dog'][0].synset) + 3 + """ + return len(self.getSenses()) def __getitem__(self, idx): - """ - >>> N['dog'][0].synset[0] == N['dog'][0] - 1 - >>> N['dog'][0].synset['dog'] == N['dog'][0] - 1 - >>> N['dog'][0].synset[N['dog']] == N['dog'][0] - 1 - >>> N['cat'][6] - 'cat' in {noun: big cat, cat} - """ - senses = self.getSenses() - if isinstance(idx, Word): - idx = idx.form - if isinstance(idx, StringType): - idx = _index(idx, map(lambda sense:sense.form, senses)) or \ - _index(idx, map(lambda sense:sense.form, senses), _equalsIgnoreCase) - return senses[idx] + """ + >>> N['dog'][0].synset[0] == N['dog'][0] + 1 + >>> N['dog'][0].synset['dog'] == N['dog'][0] + 1 + >>> N['dog'][0].synset[N['dog']] == N['dog'][0] + 1 + >>> N['cat'][6] + 'cat' in {noun: big cat, cat} + """ + senses = self.getSenses() + if isinstance(idx, Word): + idx = idx.form + if isinstance(idx, str): + idx = _index(idx, map(lambda sense:sense.form, senses)) or \ + _index(idx, map(lambda sense:sense.form, senses), _equalsIgnoreCase) + return senses[idx] def __getslice__(self, i, j): - return self.getSenses()[i:j] class Sense: @@ -525,7 +524,7 @@ class Sense: VERB_FRAME_STRINGS. 
These list the verb frames that this Sense partipates in. Defined only for verbs. - >>> decide = V['decide'][0].synset # first synset for 'decide' + >>> decide = V['decide'][0].synset # first synset for 'decide' >>> decide[0].verbFrames (8, 2, 26, 29) >>> decide[1].verbFrames @@ -536,124 +535,124 @@ class Sense: def __init__(sense, synset, senseTuple, verbFrames=None): "Initialize a sense from a synset's senseTuple." - # synset is stored by key (pos, synset) rather than object - # reference, to avoid creating a circular reference between - # Senses and Synsets that will prevent the vm from - # garbage-collecting them. - sense.pos = synset.pos + # synset is stored by key (pos, synset) rather than object + # reference, to avoid creating a circular reference between + # Senses and Synsets that will prevent the vm from + # garbage-collecting them. + sense.pos = synset.pos "part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB" - sense.synsetOffset = synset.offset + sense.synsetOffset = synset.offset "synset key. This is used to retrieve the sense." - sense.verbFrames = verbFrames + sense.verbFrames = verbFrames """A sequence of integers that index into VERB_FRAME_STRINGS. These list the verb frames that this Sense partipates in. 
Defined only for verbs.""" - (form, idString) = senseTuple - sense.position = None - if '(' in form: - index = string.index(form, '(') - key = form[index + 1:-1] - form = form[:index] - if key == 'a': - sense.position = ATTRIBUTIVE - elif key == 'p': - sense.position = PREDICATIVE - elif key == 'ip': - sense.position = IMMEDIATE_POSTNOMINAL - else: - raise "unknown attribute " + key - sense.form = string.replace(form, '_', ' ') + (form, idString) = senseTuple + sense.position = None + if '(' in form: + index = string.index(form, '(') + key = form[index + 1:-1] + form = form[:index] + if key == 'a': + sense.position = ATTRIBUTIVE + elif key == 'p': + sense.position = PREDICATIVE + elif key == 'ip': + sense.position = IMMEDIATE_POSTNOMINAL + else: + raise "unknown attribute " + key + sense.form = string.replace(form, '_', ' ') "orthographic representation of the Word this is a Sense of." def __getattr__(self, name): - # see the note at __init__ about why 'synset' is provided as a - # 'virtual' slot - if name == 'synset': - return getSynset(self.pos, self.synsetOffset) + # see the note at __init__ about why 'synset' is provided as a + # 'virtual' slot + if name == 'synset': + return getSynset(self.pos, self.synsetOffset) elif name == 'lexname': return self.synset.lexname - else: - raise AttributeError(name) + else: + raise AttributeError(name) def __str__(self): - """Return a human-readable representation. - - >>> str(N['dog']) - 'dog(n.)' - """ - return `self.form` + " in " + str(self.synset) + """Return a human-readable representation. + + >>> str(N['dog']) + 'dog(n.)' + """ + return repr(self.form) + " in " + str(self.synset) def __repr__(self): - """If ReadableRepresentations is true, return a human-readable - representation, e.g. 'dog(n.)'. - - If ReadableRepresentations is false, return a machine-readable - representation, e.g. "getWord('dog', 'noun')". 
- """ - if ReadableRepresentations: - return str(self) - return "%s[%s]" % (`self.synset`, `self.form`) + """If ReadableRepresentations is true, return a human-readable + representation, e.g. 'dog(n.)'. + + If ReadableRepresentations is false, return a machine-readable + representation, e.g. "getWord('dog', 'noun')". + """ + if ReadableRepresentations: + return str(self) + return "%s[%s]" % (repr(self.synset), repr(self.form)) def getPointers(self, pointerType=None): - """Return a sequence of Pointers. - + """Return a sequence of Pointers. + If pointerType is specified, only pointers of that type are returned. In this case, pointerType should be an element of POINTER_TYPES. - - >>> N['dog'][0].getPointers()[:5] - (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt}) - >>> N['dog'][0].getPointers(HYPERNYM) - (hypernym -> {noun: canine, canid},) - """ - senseIndex = _index(self, self.synset.getSenses()) - def pointsFromThisSense(pointer, selfIndex=senseIndex): - return pointer.sourceIndex == 0 or pointer.sourceIndex - 1 == selfIndex - return filter(pointsFromThisSense, self.synset.getPointers(pointerType)) + + >>> N['dog'][0].getPointers()[:5] + (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt}) + >>> N['dog'][0].getPointers(HYPERNYM) + (hypernym -> {noun: canine, canid},) + """ + senseIndex = _index(self, self.synset.getSenses()) + def pointsFromThisSense(pointer, selfIndex=senseIndex): + return pointer.sourceIndex == 0 or pointer.sourceIndex - 1 == selfIndex + return filter(pointsFromThisSense, self.synset.getPointers(pointerType)) pointers = getPointers # backwards compatability def getPointerTargets(self, pointerType=None): - """Return a sequence of 
Senses or Synsets. - + """Return a sequence of Senses or Synsets. + If pointerType is specified, only targets of pointers of that type are returned. In this case, pointerType should be an element of POINTER_TYPES. - - >>> N['dog'][0].getPointerTargets()[:5] - [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}] - >>> N['dog'][0].getPointerTargets(HYPERNYM) - [{noun: canine, canid}] - """ - return map(Pointer.target, self.getPointers(pointerType)) + + >>> N['dog'][0].getPointerTargets()[:5] + [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}] + >>> N['dog'][0].getPointerTargets(HYPERNYM) + [{noun: canine, canid}] + """ + return map(Pointer.target, self.getPointers(pointerType)) pointerTargets = getPointerTargets # backwards compatability def getSenses(self): - return self, + return self, senses = getSenses # backwards compatability def isTagged(self): - """Return 1 if any sense is tagged. - - >>> N['dog'][0].isTagged() - 1 - >>> N['dog'][1].isTagged() - 0 - """ - word = self.word() - return _index(self, word.getSenses()) < word.taggedSenseCount + """Return 1 if any sense is tagged. 
+ + >>> N['dog'][0].isTagged() + 1 + >>> N['dog'][1].isTagged() + 0 + """ + word = self.word() + return _index(self, word.getSenses()) < word.taggedSenseCount def getWord(self): - return getWord(self.form, self.pos) + return getWord(self.form, self.pos) word = getWord # backwards compatability def __cmp__(self, other): - def senseIndex(sense, synset=self.synset): - return _index(sense, synset.getSenses(), testfn=lambda a,b: a.form == b.form) - return _compareInstances(self, other, ('synset',)) or cmp(senseIndex(self), senseIndex(other)) + def senseIndex(sense, synset=self.synset): + return _index(sense, synset.getSenses(), testfn=lambda a,b: a.form == b.form) + return _compareInstances(self, other, ('synset',)) or cmp(senseIndex(self), senseIndex(other)) class Pointer: @@ -668,23 +667,23 @@ class Pointer: """ _POINTER_TYPE_TABLE = { - '!': ANTONYM, + '!': ANTONYM, '@': HYPERNYM, '~': HYPONYM, '~i': HYPONYM, # Tom De Smedt, 2006: '@i': HYPERNYM, # yields a KeyError otherwise - '=': ATTRIBUTE, + '=': ATTRIBUTE, '^': ALSO_SEE, '*': ENTAILMENT, '>': CAUSE, - '$': VERB_GROUP, - '#m': MEMBER_MERONYM, + '$': VERB_GROUP, + '#m': MEMBER_MERONYM, '#s': SUBSTANCE_MERONYM, '#p': PART_MERONYM, - '%m': MEMBER_HOLONYM, + '%m': MEMBER_HOLONYM, '%s': SUBSTANCE_HOLONYM, '%p': PART_HOLONYM, - '&': SIMILAR, + '&': SIMILAR, '<': PARTICIPLE_OF, '\\': PERTAINYM, # New in wn 2.0: @@ -698,51 +697,51 @@ class Pointer: } def __init__(self, sourceOffset, pointerTuple): - (type, offset, pos, indices) = pointerTuple - self.type = Pointer._POINTER_TYPE_TABLE[type] + (type, offset, pos, indices) = pointerTuple + self.type = Pointer._POINTER_TYPE_TABLE[type] """One of POINTER_TYPES.""" - self.sourceOffset = sourceOffset - self.targetOffset = int(offset) - self.pos = _normalizePOS(pos) + self.sourceOffset = sourceOffset + self.targetOffset = int(offset) + self.pos = _normalizePOS(pos) """part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB""" - indices = string.atoi(indices, 16) - 
self.sourceIndex = indices >> 8 - self.targetIndex = indices & 255 + indices = string.atoi(indices, 16) + self.sourceIndex = indices >> 8 + self.targetIndex = indices & 255 def getSource(self): - synset = getSynset(self.pos, self.sourceOffset) - if self.sourceIndex: - return synset[self.sourceIndex - 1] - else: - return synset + synset = getSynset(self.pos, self.sourceOffset) + if self.sourceIndex: + return synset[self.sourceIndex - 1] + else: + return synset source = getSource # backwards compatability def getTarget(self): - synset = getSynset(self.pos, self.targetOffset) - if self.targetIndex: - return synset[self.targetIndex - 1] - else: - return synset + synset = getSynset(self.pos, self.targetOffset) + if self.targetIndex: + return synset[self.targetIndex - 1] + else: + return synset target = getTarget # backwards compatability def __str__(self): - return self.type + " -> " + str(self.target()) + return self.type + " -> " + str(self.target()) def __repr__(self): - if ReadableRepresentations: - return str(self) - return "<" + str(self) + ">" + if ReadableRepresentations: + return str(self) + return "<" + str(self) + ">" def __cmp__(self, other): - diff = _compareInstances(self, other, ('pos', 'sourceOffset')) - if diff: - return diff - synset = self.source() - def pointerIndex(sense, synset=synset): - return _index(sense, synset.getPointers(), testfn=lambda a,b: not _compareInstances(a, b, ('type', 'sourceIndex', 'targetIndex'))) - return cmp(pointerIndex(self), pointerIndex(other)) + diff = _compareInstances(self, other, ('pos', 'sourceOffset')) + if diff: + return diff + synset = self.source() + def pointerIndex(sense, synset=synset): + return _index(sense, synset.getPointers(), testfn=lambda a,b: not _compareInstances(a, b, ('type', 'sourceIndex', 'targetIndex'))) + return cmp(pointerIndex(self), pointerIndex(other)) # Loading the lexnames @@ -764,7 +763,7 @@ def __str__(self): def setupLexnames(): if os.path.exists(os.path.join(WNSEARCHDIR, 'lexnames')): 
for l in open(os.path.join(WNSEARCHDIR, 'lexnames')).readlines(): - i,name,category = string.split(l) + i,name,category = l.split() Lexname(name,PartsOfSpeech[int(category)-1]) setupLexnames() @@ -797,59 +796,59 @@ class Dictionary: """ def __init__(self, pos, filenameroot): - self.pos = pos + self.pos = pos """part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB""" - self.indexFile = _IndexFile(pos, filenameroot) - self.dataFile = [(open(f, _FILE_OPEN_MODE), os.stat(f)[6]) for f in _dataFilePathname(filenameroot)] # Tom De Smedt, 2011 + self.indexFile = _IndexFile(pos, filenameroot) + self.dataFile = [(open(f, _FILE_OPEN_MODE), os.stat(f)[6]) for f in _dataFilePathname(filenameroot)] # Tom De Smedt, 2011 def __repr__(self): - dictionaryVariables = {N: 'N', V: 'V', ADJ: 'ADJ', ADV: 'ADV'} - if dictionaryVariables.get(self): - return self.__module__ + "." + dictionaryVariables[self] - return "<%s.%s instance for %s>" % (self.__module__, "Dictionary", self.pos) + dictionaryVariables = {N: 'N', V: 'V', ADJ: 'ADJ', ADV: 'ADV'} + if dictionaryVariables.get(self): + return self.__module__ + "." 
+ dictionaryVariables[self] + return "<%s.%s instance for %s>" % (self.__module__, "Dictionary", self.pos) def getWord(self, form, line=None): - key = string.replace(string.lower(form), ' ', '_') - pos = self.pos - def loader(key=key, line=line, indexFile=self.indexFile): - line = line or indexFile.get(key) - return line and Word(line) - word = _entityCache.get((pos, key), loader) - if word: - return word - else: - raise KeyError("%s is not in the %s database" % (`form`, `pos`)) + key = string.replace(string.lower(form), ' ', '_') + pos = self.pos + def loader(key=key, line=line, indexFile=self.indexFile): + line = line or indexFile.get(key) + return line and Word(line) + word = _entityCache.get((pos, key), loader) + if word: + return word + else: + raise KeyError("%s is not in the %s database" % (repr(form), repr(pos))) def getSynset(self, offset): - pos = self.pos - def loader(pos=pos, offset=offset, dataFile=self.dataFile): - return Synset(pos, offset, _lineAt(dataFile, offset)) - return _entityCache.get((pos, offset), loader) + pos = self.pos + def loader(pos=pos, offset=offset, dataFile=self.dataFile): + return Synset(pos, offset, _lineAt(dataFile, offset)) + return _entityCache.get((pos, offset), loader) def _buildIndexCacheFile(self): - self.indexFile._buildIndexCacheFile() + self.indexFile._buildIndexCacheFile() # # Sequence protocol (a Dictionary's items are its Words) # def __nonzero__(self): - """Return false. (This is to avoid scanning the whole index file - to compute len when a Dictionary is used in test position.) - - >>> N and 'true' - 'true' - """ - return 1 + """Return false. (This is to avoid scanning the whole index file + to compute len when a Dictionary is used in test position.) + + >>> N and 'true' + 'true' + """ + return 1 def __len__(self): - """Return the number of index entries. 
- - >>> len(ADJ) - 21435 - """ - if not hasattr(self, 'length'): - self.length = len(self.indexFile) - return self.length + """Return the number of index entries. + + >>> len(ADJ) + 21435 + """ + if not hasattr(self, 'length'): + self.length = len(self.indexFile) + return self.length def __getslice__(self, a, b): results = [] @@ -863,22 +862,22 @@ def __getslice__(self, a, b): return results def __getitem__(self, index): - """If index is a String, return the Word whose form is - index. If index is an integer n, return the Word - indexed by the n'th Word in the Index file. - - >>> N['dog'] - dog(n.) - >>> N[0] - 'hood(n.) - """ - if isinstance(index, StringType): - return self.getWord(index) - elif isinstance(index, IntType): - line = self.indexFile[index] - return self.getWord(string.replace(line[:string.find(line, ' ')], '_', ' '), line) - else: - raise TypeError("%s is not a String or Int" % `index`) + """If index is a String, return the Word whose form is + index. If index is an integer n, return the Word + indexed by the n'th Word in the Index file. + + >>> N['dog'] + dog(n.) + >>> N[0] + 'hood(n.) + """ + if isinstance(index, str): + return self.getWord(index) + elif isinstance(index, int): + line = self.indexFile[index] + return self.getWord(string.replace(line[:string.find(line, ' ')], '_', ' '), line) + else: + raise TypeError("%s is not a String or Int" % repr(index)) # # Dictionary protocol @@ -887,32 +886,32 @@ def __getitem__(self, index): # def get(self, key, default=None): - """Return the Word whose form is _key_, or _default_. - - >>> N.get('dog') - dog(n.) - >>> N.get('inu') - """ - try: - return self[key] - except LookupError: - return default + """Return the Word whose form is _key_, or _default_. + + >>> N.get('dog') + dog(n.) 
+ >>> N.get('inu') + """ + try: + return self[key] + except LookupError: + return default def keys(self): - """Return a sorted list of strings that index words in this - dictionary.""" - return self.indexFile.keys() + """Return a sorted list of strings that index words in this + dictionary.""" + return self.indexFile.keys() def has_key(self, form): - """Return true iff the argument indexes a word in this dictionary. - - >>> N.has_key('dog') - 1 - >>> N.has_key('inu') - 0 - """ - return self.indexFile.has_key(form) - + """Return true iff the argument indexes a word in this dictionary. + + >>> N.has_key('dog') + 1 + >>> N.has_key('inu') + 0 + """ + return self.indexFile.has_key(form) + def __contains__(self, form): return self.indexFile.has_key(form.encode("utf-8", "ignore")) # Tom De Smedt, 2013 @@ -921,23 +920,23 @@ def __contains__(self, form): # def _testKeys(self): - """Verify that index lookup can find each word in the index file.""" - print("Testing: " + repr(self)) - file = open(self.indexFile.file.name, _FILE_OPEN_MODE) - counter = 0 - while 1: - line = file.readline() - if line == '': break - if line[0] != ' ': - key = string.replace(line[:string.find(line, ' ')], '_', ' ') - if (counter % 1000) == 0: - print("%s..." % (key,)) - import sys - sys.stdout.flush() - counter = counter + 1 - self[key] - file.close() - print("done.") + """Verify that index lookup can find each word in the index file.""" + print("Testing: " + repr(self)) + file = open(self.indexFile.file.name, _FILE_OPEN_MODE) + counter = 0 + while 1: + line = file.readline() + if line == '': break + if line[0] != ' ': + key = string.replace(line[:string.find(line, ' ')], '_', ' ') + if (counter % 1000) == 0: + print("%s..." 
% (key,)) + import sys + sys.stdout.flush() + counter = counter + 1 + self[key] + file.close() + print("done.") class _IndexFile: @@ -945,69 +944,69 @@ class _IndexFile: Sequence and Dictionary interface to a sorted index file.""" def __init__(self, pos, filenameroot): - self.pos = pos - self.file = open(_indexFilePathname(filenameroot), _FILE_OPEN_MODE) - self.offsetLineCache = {} # Table of (pathname, offset) -> (line, nextOffset) - self.rewind() - self.shelfname = os.path.join(WNSEARCHDIR, pos + ".pyidx") - try: - import shelve - self.indexCache = shelve.open(self.shelfname, 'r') - except: - pass + self.pos = pos + self.file = open(_indexFilePathname(filenameroot), _FILE_OPEN_MODE) + self.offsetLineCache = {} # Table of (pathname, offset) -> (line, nextOffset) + self.rewind() + self.shelfname = os.path.join(WNSEARCHDIR, pos + ".pyidx") + try: + import shelve + self.indexCache = shelve.open(self.shelfname, 'r') + except: + pass def rewind(self): - self.file.seek(0) - while 1: - offset = self.file.tell() - line = self.file.readline() - if (line[0] != ' '): - break - self.nextIndex = 0 - self.nextOffset = offset + self.file.seek(0) + while 1: + offset = self.file.tell() + line = self.file.readline() + if (line[0] != ' '): + break + self.nextIndex = 0 + self.nextOffset = offset # # Sequence protocol (an _IndexFile's items are its lines) # def __nonzero__(self): - return 1 + return 1 def __len__(self): - if hasattr(self, 'indexCache'): - return len(self.indexCache) - self.rewind() - lines = 0 - while 1: - line = self.file.readline() - if line == "": - break - lines = lines + 1 - return lines + if hasattr(self, 'indexCache'): + return len(self.indexCache) + self.rewind() + lines = 0 + while 1: + line = self.file.readline() + if line == "": + break + lines = lines + 1 + return lines def __nonzero__(self): - return 1 + return 1 def __getitem__(self, index): - if isinstance(index, StringType): - if hasattr(self, 'indexCache'): - return self.indexCache[index] - return 
binarySearchFile(self.file, index, self.offsetLineCache, 8) - elif isinstance(index, IntType): - if hasattr(self, 'indexCache'): - return self.get(self.keys[index]) - if index < self.nextIndex: - self.rewind() - while self.nextIndex <= index: - self.file.seek(self.nextOffset) - line = self.file.readline() - if line == "": - raise IndexError("index out of range") - self.nextIndex = self.nextIndex + 1 - self.nextOffset = self.file.tell() - return line - else: - raise TypeError("%s is not a String or Int" % `index`) - + if isinstance(index, str): + if hasattr(self, 'indexCache'): + return self.indexCache[index] + return binarySearchFile(self.file, index, self.offsetLineCache, 8) + elif isinstance(index, int): + if hasattr(self, 'indexCache'): + return self.get(self.keys[index]) + if index < self.nextIndex: + self.rewind() + while self.nextIndex <= index: + self.file.seek(self.nextOffset) + line = self.file.readline() + if line == "": + raise IndexError("index out of range") + self.nextIndex = self.nextIndex + 1 + self.nextOffset = self.file.tell() + return line + else: + raise TypeError("%s is not a String or Int" % repr(index)) + # # Dictionary protocol # @@ -1015,62 +1014,62 @@ def __getitem__(self, index): # def get(self, key, default=None): - try: - return self[key] - except LookupError: - return default + try: + return self[key] + except LookupError: + return default def keys(self): - if hasattr(self, 'indexCache'): - keys = self.indexCache.keys() - keys.sort() - return keys - else: - keys = [] - self.rewind() - while 1: - line = self.file.readline() - if not line: break + if hasattr(self, 'indexCache'): + keys = self.indexCache.keys() + keys.sort() + return keys + else: + keys = [] + self.rewind() + while 1: + line = self.file.readline() + if not line: break key = line.split(' ', 1)[0] - keys.append(key.replace('_', ' ')) - return keys + keys.append(key.replace('_', ' ')) + return keys def has_key(self, key): - key = key.replace(' ', '_') # test case: V['haze 
over'] - if hasattr(self, 'indexCache'): - return self.indexCache.has_key(key) - return self.get(key) != None + key = key.replace(' ', '_') # test case: V['haze over'] + if hasattr(self, 'indexCache'): + return key in self.indexCache + return self.get(key) != None # # Index file # def _buildIndexCacheFile(self): - import shelve - import os - print("Building %s:" % (self.shelfname,)) - tempname = self.shelfname + ".temp" - try: - indexCache = shelve.open(tempname) - self.rewind() - count = 0 - while 1: - offset, line = self.file.tell(), self.file.readline() - if not line: break - key = line[:string.find(line, ' ')] - if (count % 1000) == 0: - print("%s..." % (key,)) - import sys - sys.stdout.flush() - indexCache[key] = line - count = count + 1 - indexCache.close() - os.rename(tempname, self.shelfname) - finally: - try: os.remove(tempname) - except: pass - print("done.") - self.indexCache = shelve.open(self.shelfname, 'r') + import shelve + import os + print("Building %s:" % (self.shelfname,)) + tempname = self.shelfname + ".temp" + try: + indexCache = shelve.open(tempname) + self.rewind() + count = 0 + while 1: + offset, line = self.file.tell(), self.file.readline() + if not line: break + key = line[:line.find(' ')] + if (count % 1000) == 0: + print("%s..."
% (key,)) + import sys + sys.stdout.flush() + indexCache[key] = line + count = count + 1 + indexCache.close() + os.rename(tempname, self.shelfname) + finally: + try: os.remove(tempname) + except: pass + print("done.") + self.indexCache = shelve.open(self.shelfname, 'r') # @@ -1097,20 +1096,20 @@ def getSynset(pos, offset): def _requirePointerType(pointerType): if pointerType not in POINTER_TYPES: - raise TypeError(`pointerType` + " is not a pointer type") + raise TypeError(repr(pointerType) + " is not a pointer type") return pointerType def _compareInstances(a, b, fields): """"Return -1, 0, or 1 according to a comparison first by type, then by class, and finally by each of fields.""" # " <- for emacs if not hasattr(b, '__class__'): - return cmp(type(a), type(b)) + return cmp(type(a), type(b)) elif a.__class__ != b.__class__: - return cmp(a.__class__, b.__class__) + return cmp(a.__class__, b.__class__) for field in fields: - diff = cmp(getattr(a, field), getattr(b, field)) - if diff: - return diff + diff = cmp(getattr(a, field), getattr(b, field)) + if diff: + return diff return 0 def _equalsIgnoreCase(a, b): @@ -1128,7 +1127,7 @@ def _equalsIgnoreCase(a, b): # def _dataFilePathname(filenameroot): if os.name in ('dos', 'nt'): - path = os.path.join(WNSEARCHDIR, filenameroot + ".dat") + path = os.path.join(WNSEARCHDIR, filenameroot + ".dat") if os.path.exists(path): return [path] # Tom De Smedt, 2011 @@ -1138,7 +1137,7 @@ def _dataFilePathname(filenameroot): def _indexFilePathname(filenameroot): if os.name in ('dos', 'nt'): - path = os.path.join(WNSEARCHDIR, filenameroot + ".idx") + path = os.path.join(WNSEARCHDIR, filenameroot + ".idx") if os.path.exists(path): return path return os.path.join(WNSEARCHDIR, "index." 
+ filenameroot) @@ -1155,30 +1154,30 @@ #if count > 20: # raise "infinite loop" lastState = start, end - middle = (start + end) / 2 - if cache.get(middle): - offset, line = cache[middle] - else: - file.seek(max(0, middle - 1)) - if middle > 0: - file.readline() - offset, line = file.tell(), file.readline() - if currentDepth < cacheDepth: - cache[middle] = (offset, line) + middle = (start + end) // 2 + if cache.get(middle): + offset, line = cache[middle] + else: + file.seek(max(0, middle - 1)) + if middle > 0: + file.readline() + offset, line = file.tell(), file.readline() + if currentDepth < cacheDepth: + cache[middle] = (offset, line) #print(start, middle, end, offset, line) - if offset > end: - assert end != middle - 1, "infinite loop" - end = middle - 1 - elif line[:keylen] == key:# and line[keylen + 1] == ' ': - return line + if offset > end: + assert end != middle - 1, "infinite loop" + end = middle - 1 + elif line[:keylen] == key:# and line[keylen + 1] == ' ': + return line #elif offset == end: # return None - elif line > key: - assert end != middle - 1, "infinite loop" - end = middle - 1 - elif line < key: - start = offset + len(line) - 1 - currentDepth = currentDepth + 1 + elif line > key: + assert end != middle - 1, "infinite loop" + end = middle - 1 + elif line < key: + start = offset + len(line) - 1 + currentDepth = currentDepth + 1 thisState = start, end if lastState == thisState: # detects the condition where we're searching past the end @@ -1211,12 +1210,12 @@ def _index(key, sequence, testfn=None, keyfn=None): """ index = 0 for element in sequence: - value = element - if keyfn: - value = keyfn(value) - if (not testfn and value == key) or (testfn and testfn(value, key)): - return index - index = index + 1 + value = element + if keyfn: + value = keyfn(value) + if (not testfn and value == key) or (testfn and testfn(value, key)): + return index + index = index + 1 return None def
_partition(sequence, size, count): @@ -1229,7 +1228,7 @@ def _partition(sequence, size, count): partitions = [] for index in range(0, size * count, size): - partitions.append(sequence[index:index + size]) + partitions.append(sequence[index:index + size]) return (partitions, sequence[size * count:]) @@ -1274,49 +1273,49 @@ class _LRUCache: but the two implementations aren't directly comparable.""" def __init__(this, capacity): - this.capacity = capacity - this.clear() + this.capacity = capacity + this.clear() def clear(this): - this.values = {} - this.history = {} - this.oldestTimestamp = 0 - this.nextTimestamp = 1 + this.values = {} + this.history = {} + this.oldestTimestamp = 0 + this.nextTimestamp = 1 def removeOldestEntry(this): - while this.oldestTimestamp < this.nextTimestamp: - if this.history.get(this.oldestTimestamp): - key = this.history[this.oldestTimestamp] - del this.history[this.oldestTimestamp] - del this.values[key] - return - this.oldestTimestamp = this.oldestTimestamp + 1 + while this.oldestTimestamp < this.nextTimestamp: + if this.history.get(this.oldestTimestamp): + key = this.history[this.oldestTimestamp] + del this.history[this.oldestTimestamp] + del this.values[key] + return + this.oldestTimestamp = this.oldestTimestamp + 1 def setCapacity(this, capacity): - if capacity == 0: - this.clear() - else: - this.capacity = capacity - while len(this.values) > this.capacity: - this.removeOldestEntry() + if capacity == 0: + this.clear() + else: + this.capacity = capacity + while len(this.values) > this.capacity: + this.removeOldestEntry() def get(this, key, loadfn=None): - value = None - if this.values: - pair = this.values.get(key) - if pair: - (value, timestamp) = pair - del this.history[timestamp] - if value == None: - value = loadfn and loadfn() - if this.values != None: - timestamp = this.nextTimestamp - this.nextTimestamp = this.nextTimestamp + 1 - this.values[key] = (value, timestamp) - this.history[timestamp] = key - if len(this.values) > 
this.capacity: - this.removeOldestEntry() - return value + value = None + if this.values: + pair = this.values.get(key) + if pair: + (value, timestamp) = pair + del this.history[timestamp] + if value == None: + value = loadfn and loadfn() + if this.values != None: + timestamp = this.nextTimestamp + this.nextTimestamp = this.nextTimestamp + 1 + this.values[key] = (value, timestamp) + this.history[timestamp] = key + if len(this.values) > this.capacity: + this.removeOldestEntry() + return value class _NullCache: @@ -1324,10 +1323,10 @@ class _NullCache: LRUCache implements), but doesn't store any values.""" def clear(): - pass + pass def get(this, key, loadfn=None): - return loadfn and loadfn() + return loadfn and loadfn() DEFAULT_CACHE_CAPACITY = 1000 @@ -1340,7 +1339,7 @@ def disableCache(): def enableCache(): """Enable the entity cache.""" if not isinstance(_entityCache, LRUCache): - _entityCache = _LRUCache(size) + _entityCache = _LRUCache(size) def clearCache(): """Clear the entity cache.""" @@ -1378,36 +1377,36 @@ def _initializePOSTables(): _POSNormalizationTable = {} _POStoDictionaryTable = {} for pos, abbreviations in ( - (NOUN, "noun n n."), - (VERB, "verb v v."), - (ADJECTIVE, "adjective adj adj. a s"), - (ADVERB, "adverb adv adv. r")): - tokens = string.split(abbreviations) - for token in tokens: - _POSNormalizationTable[token] = pos - _POSNormalizationTable[string.upper(token)] = pos + (NOUN, "noun n n."), + (VERB, "verb v v."), + (ADJECTIVE, "adjective adj adj. a s"), + (ADVERB, "adverb adv adv. 
r")): + tokens = abbreviations.split() + for token in tokens: + _POSNormalizationTable[token] = pos + _POSNormalizationTable[token.upper()] = pos for dict in Dictionaries: - _POSNormalizationTable[dict] = dict.pos - _POStoDictionaryTable[dict.pos] = dict + _POSNormalizationTable[dict] = dict.pos + _POStoDictionaryTable[dict.pos] = dict _initializePOSTables() def _normalizePOS(pos): norm = _POSNormalizationTable.get(pos) if norm: - return norm - raise TypeError(`pos` + " is not a part of speech type") + return norm + raise TypeError(repr(pos) + " is not a part of speech type") def _dictionaryFor(pos): pos = _normalizePOS(pos) dict = _POStoDictionaryTable.get(pos) if dict == None: - raise RuntimeError("The " + `pos` + " dictionary has not been created") + raise RuntimeError("The " + repr(pos) + " dictionary has not been created") return dict def buildIndexFiles(): for dict in Dictionaries: - dict._buildIndexCacheFile() + dict._buildIndexCacheFile() # @@ -1417,7 +1416,7 @@ def buildIndexFiles(): def _testKeys(): #This is slow, so don't do it as part of the normal test procedure. for dictionary in Dictionaries: - dictionary._testKeys() + dictionary._testKeys() def _test(reset=0): import doctest, wordnet diff --git a/pattern/text/fr/__init__.py b/pattern/text/fr/__init__.py index 4b715470..03b3b316 100644 --- a/pattern/text/fr/__init__.py +++ b/pattern/text/fr/__init__.py @@ -112,7 +112,7 @@ def penntreebank2universal(token, tag): "lorsqu'": "lorsqu' ", "puisqu'": "puisqu' ", # Same rule for Unicode apostrophe, see also Parser.find_tokens(): - ur"(l|c|d|j|m|n|qu|s|t|jusqu|lorsqu|puisqu)’": u"\\1’ " + u"(l|c|d|j|m|n|qu|s|t|jusqu|lorsqu|puisqu)’": u"\\1’ " } replacements.update(((k.upper(), v.upper()) for k, v in replacements.items())) @@ -261,4 +261,4 @@ def positive(s, threshold=0.1, **kwargs): # python -m pattern.fr xml -s "C'est l'exception qui confirme la règle." 
-OTCL if __name__ == "__main__": - commandline(parse) \ No newline at end of file + commandline(parse) diff --git a/pattern/text/search.py b/pattern/text/search.py index d5e818ea..13fee503 100644 --- a/pattern/text/search.py +++ b/pattern/text/search.py @@ -10,6 +10,12 @@ import re import itertools +try: + # Python 3 style map function + from itertools import imap as map +except ImportError: + pass + from past.builtins import basestring #--- TEXT, SENTENCE AND WORD ----------------------------------------------------------------------- @@ -179,7 +185,7 @@ def variations(iterable, optional=lambda x: False): v = tuple(iterable[i] for i in range(len(v)) if not v[i]) a.add(v) # Longest-first. - return sorted(a, cmp=lambda x, y: len(y) - len(x)) + return sorted(a, key=len, reverse=True) #### TAXONOMY ###################################################################################### @@ -247,7 +253,7 @@ def clear(self): def iterkeys(self): return reversed(self._o) def itervalues(self): - return itertools.imap(self.__getitem__, reversed(self._o)) + return map(self.__getitem__, reversed(self._o)) def iteritems(self): return iter(zip(self.iterkeys(), self.itervalues())) diff --git a/pattern/text/tree.py b/pattern/text/tree.py index 04588dd8..3017ad09 100644 --- a/pattern/text/tree.py +++ b/pattern/text/tree.py @@ -127,8 +127,12 @@ def __iter__(self): #--- WORD ------------------------------------------------------------------------------------------ + class Word(object): + # `type` is used as a reference name, so we need the original type builtin + type_builtin = type + def __init__(self, sentence, string, lemma=None, type=None, index=0): """ A word in the sentence. - lemma: base form of the word; "was" => "be". @@ -136,10 +140,10 @@ def __init__(self, sentence, string, lemma=None, type=None, index=0): - chunk: the chunk (or phrase) this word belongs to. - index: the index in the sentence.
""" - if not isinstance(string, unicode): + + if not isinstance(string, self.type_builtin(u"")): try: string = string.decode("utf-8") # ensure Unicode - except: - pass + except: pass self.sentence = sentence self.index = index self.string = string # "was" @@ -155,8 +159,8 @@ def copy(self, chunk=None, pnp=None): self.sentence, self.string, self.lemma, - self.type, - self.index + type=self.type, + index=self.index ) w.chunk = chunk w.pnp = pnp @@ -584,7 +588,7 @@ def _is_tokenstring(string): # The class mbsp.TokenString stores the format of tags for each token. # Since it comes directly from MBSP.parse(), this format is always correct, # regardless of the given token format parameter for Sentence() or Text(). - return isinstance(string, unicode) and hasattr(string, "tags") + return isinstance(string, type(u"")) and hasattr(string, "tags") class Sentence(object): @@ -597,7 +601,7 @@ def __init__(self, string="", token=[WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA], if _is_tokenstring(string): token, language = string.tags, getattr(string, "language", language) # Convert to Unicode. - if not isinstance(string, unicode): + if not isinstance(string, type(u"")): for encoding in (("utf-8",), ("windows-1252",), ("utf-8", "ignore")): try: string = string.decode(*encoding) except: @@ -837,7 +841,7 @@ def _do_word(self, word, lemma=None, type=None): # Improve 3rd person singular "'s" lemma to "be", e.g., as in "he's fine". if lemma == "'s" and type in ("VB", "VBZ"): lemma = "be" - self.words.append(Word(self, word, lemma, type, index=len(self.words))) + self.words.append(Word(self, word, lemma, type=type, index=len(self.words))) def _do_chunk(self, type, role=None, relation=None, iob=None): """ Adds a new Chunk to the sentence, or adds the last word to the previous chunk. 
@@ -976,7 +980,7 @@ def indexof(self, value, tag=WORD): match = lambda a, b: a.endswith("*") and b.startswith(a[:-1]) or a==b indices = [] for i in range(len(self.words)): - if match(value, unicode(self.get(i, tag))): + if match(value, u"%s" % (self.get(i, tag))): indices.append(i) return indices @@ -1364,7 +1368,7 @@ def parse_xml(sentence, tab="\t", id=""): word.type and ' %s="%s"' % (XML_TYPE, xml_encode(word.type)) or '', word.lemma and ' %s="%s"' % (XML_LEMMA, xml_encode(word.lemma)) or '', (" "+" ".join(['%s="%s"' % (k,v) for k,v in word.custom_tags.items() if v != None])).rstrip(), - xml_encode(unicode(word)), + xml_encode(u"%s" % (word)), XML_WORD )) if not chunk: @@ -1427,11 +1431,11 @@ def get(self, k, default=""): # This is a fallback if for some reason we fail to import MBSP.TokenString, # e.g., when tree.py is part of another project. -class TaggedString(unicode): +class TaggedString(str): def __new__(cls, string, tags=["word"], language="en"): - if isinstance(string, unicode) and hasattr(string, "tags"): + if isinstance(string, type(u"")) and hasattr(string, "tags"): tags, language = string.tags, getattr(string, "language", language) - s = unicode.__new__(cls, string) + s = str.__new__(cls, string) s.tags = list(tags) s.language = language return s diff --git a/pattern/web/__init__.py b/pattern/web/__init__.py index eb8f3636..97d0a1f5 100644 --- a/pattern/web/__init__.py +++ b/pattern/web/__init__.py @@ -76,7 +76,7 @@ import oauth import locale -import BeautifulSoup +import bs4 try: # Import persistent Cache. @@ -3002,10 +3002,10 @@ def sort(terms=[], context="", service=GOOGLE, license=None, strict=True, prefix # L. 
Richardson (2004), http://www.crummy.com/software/BeautifulSoup/ SOUP = ( - BeautifulSoup.BeautifulSoup, - BeautifulSoup.Tag, - BeautifulSoup.NavigableString, - BeautifulSoup.Comment + bs4.BeautifulSoup, + bs4.Tag, + bs4.NavigableString, + bs4.Comment ) NODE, TEXT, COMMENT, ELEMENT, DOCUMENT = \ @@ -3020,7 +3020,7 @@ def __init__(self, html, type=NODE, **kwargs): All DOM nodes can be navigated in the same way (e.g. Node.parent, Node.children, ...) """ self.type = type - self._p = not isinstance(html, SOUP) and BeautifulSoup.BeautifulSoup(u(html), **kwargs) or html + self._p = not isinstance(html, SOUP) and bs4.BeautifulSoup(u(html), **kwargs) or html @property def _beautifulSoup(self): @@ -3033,13 +3033,13 @@ def __eq__(self, other): def _wrap(self, x): # Navigating to other nodes yields either Text, Element or None. - if isinstance(x, BeautifulSoup.Comment): + if isinstance(x, bs4.Comment): return Comment(x) - if isinstance(x, BeautifulSoup.Declaration): + if isinstance(x, bs4.Declaration): return Text(x) - if isinstance(x, BeautifulSoup.NavigableString): + if isinstance(x, bs4.NavigableString): return Text(x) - if isinstance(x, BeautifulSoup.Tag): + if isinstance(x, bs4.Tag): return Element(x) @property @@ -3233,7 +3233,7 @@ def declaration(self): """ Yields the declaration, as a TEXT Node or None. 
""" for child in self.children: - if isinstance(child._p, BeautifulSoup.Declaration): + if isinstance(child._p, bs4.Declaration): return child @property diff --git a/setup.py b/setup.py index da5a3428..3beff5bb 100644 --- a/setup.py +++ b/setup.py @@ -128,9 +128,9 @@ 'future', 'backports.csv', 'mysqlclient', - 'BeautifulSoup', + 'beautifulsoup4', 'feedparser', - 'pdfminer', + 'pdfminer.six', 'python-docx', 'cherrypy' ], diff --git a/test/test_text.py b/test/test_text.py index d11c1c80..2e455f87 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -228,8 +228,8 @@ def test_dict(self): self.assertEqual(s(v)[1], +1.0) self.assertEqual(s(v).assessments[0], ([":-("], -0.75, 1.0, "mood")) self.assertEqual(s(v).assessments[1], ([":-)"], +0.50, 1.0, "mood")) - print "pattern.text.Sentiment.assessments" - + print("pattern.text.Sentiment.assessments") + def test_bag_of_words(self): # Assert weighted average polarity and subjectivity for bag-of-words with weighted features. from pattern.vector import BagOfWords # Alias for pattern.vector.Document.