diff --git a/docs/html/pattern-web.html b/docs/html/pattern-web.html index a624585d..b783b638 100644 --- a/docs/html/pattern-web.html +++ b/docs/html/pattern-web.html @@ -83,7 +83,7 @@
The example below downloads an image.
The extension() helper function parses the file extension from a file name:
>>> from pattern.web import URL, extension
->>>
+>>>
>>> url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
>>> f = open('test' + extension(url.page), 'wb') # save as test.gif
>>> f.write(url.download())
@@ -97,7 +97,7 @@ URL mime-type
The URL.mimetype can be used to check the type of document at the given URL. This is more reliable than sniffing the filename extension (which may be omitted).
>>> from pattern.web import URL, MIMETYPE_IMAGE
->>>
+>>>
>>> url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
>>> print url.mimetype in MIMETYPE_IMAGE
@@ -222,7 +222,7 @@ Asynchronous requests
The example below executes a Google query without halting the main program. Instead, it displays a "busy" message (e.g., a progress bar updated in the application's event loop) until request.done.
>>> from pattern.web import asynchronous, time, Google
->>>
+>>>
>>> request = asynchronous(Google().search, 'holy grail', timeout=4)
>>> while not request.done:
>>> time.sleep(0.1)
@@ -236,14 +236,14 @@ Search engine + web services
The SearchEngine object has a number of subclasses that can be used to query different web services (e.g., Google, Wikipedia). SearchEngine.search() returns a list of Result objects for a given query string – similar to a search field and a results page in a browser.
engine = SearchEngine(license=None, throttle=1.0, language=None)
engine.license # Service license key.
engine.throttle # Time between requests (being nice to server).
-engine.language # Restriction for Result.language (e.g., 'en').
+engine.language # Restriction for Result.language (e.g., 'en').
engine.search(query,
type = SEARCH, # SEARCH | IMAGE | NEWS
start = 1, # Starting page.
count = 10, # Results per page.
size = None, # Image size: TINY | SMALL | MEDIUM | LARGE
cached = True) # Cache locally?
Note: SearchEngine.search() takes the same optional parameters as URL.download().
Google, Bing, Twitter, Facebook, Wikipedia, Flickr
-SearchEngine is subclassed by Google, Yahoo, Bing, DuckDuckGo, Twitter, Facebook, Wikipedia, Wiktionary, Wikia, DBPedia, Flickr and Newsfeed. The constructors take the same parameters:
+SearchEngine is subclassed by Google, Bing, DuckDuckGo, Twitter, Facebook, Wikipedia, Wiktionary, Wikia, DBPedia, Flickr and Newsfeed. The constructors take the same parameters:
engine = Google(license=None, throttle=0.5, language=None)
engine = Bing(license=None, throttle=0.5, language=None)
engine = Twitter(license=None, throttle=0.5, language=None)
engine = Facebook(license=None, throttle=1.0, language='en')
engine = Wikipedia(license=None, throttle=5.0, language=None)
engine = Flickr(license=None, throttle=5.0, language=None)
Each search engine has different settings for the search() method. For example, Twitter.search() returns up to 3000 results for a given query (30 queries with 100 results each, or 300 queries with 10 results each). It has a limit of 150 queries per 15 minutes. Each call to search() counts as one query.
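For example, a minimal sketch (the query string is illustrative) that collects as many tweets as possible while staying under the rate limit, catching SearchEngineLimitError if it is exceeded:
>>> from pattern.web import Twitter, SearchEngineLimitError
>>>
>>> engine = Twitter()
>>> tweets = []
>>> try:
>>>     for page in range(1, 31): # 30 queries x 100 results = 3,000.
>>>         tweets.extend(engine.search('holy grail', start=page, count=100))
>>> except SearchEngineLimitError:
>>>     pass # More than 150 queries in 15 minutes.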
@@ -272,14 +272,6 @@ Google, Bing, Twitter, Facebook, Wikipedia, Flickr
0.5
-Yahoo
-SEARCH | NEWS | IMAGE 1 3
-1-1000/count
-1-50
-paid
-0.5
-
-
DuckDuckGo
SEARCH
1
@@ -353,14 +345,13 @@ Google, Bing, Twitter, Facebook, Wikipedia, Flickr
-1 Google, Bing and Yahoo are paid services – see further how to obtain a license key.
+1 Google and Bing are paid services – see further how to obtain a license key.
2 Bing.search(type=NEWS) has a count of 1-15.
-3 Yahoo.search(type=IMAGES) has a count of 1-35.
Web service license key
Some services require a license key. They may work without one, but this implies that you share a public license key (and query limit) with other users of the pattern.web module. If the query limit is exceeded, SearchEngine.search() raises a SearchEngineLimitError.
- Google is a paid service ($1 for 200 queries), with 100 free queries per day. When you obtain a license key (follow the link below), activate "Custom Search API" and "Translate API" under "Services" and look up the key under "API Access".
- Bing is a paid service ($1 for 500 queries), with 5,000 free queries per month.
-- Yahoo is a paid service ($1 for 1250 queries) that requires an OAuth key + secret, which can be passed as a tuple: Yahoo(license=(key, secret)).
-Obtain a license key: Google, Bing, Yahoo, Twitter, Facebook, Flickr.
+Obtain a license key: Google, Bing, Twitter, Facebook, Flickr.
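For example, a minimal sketch (with a hypothetical license key) that passes the key to the constructor and catches the limit error explicitly:
>>> from pattern.web import Google, SearchEngineLimitError
>>>
>>> engine = Google(license='XXXX') # Hypothetical license key.
>>> try:
>>>     results = engine.search('holy grail')
>>> except SearchEngineLimitError:
>>>     print 'query limit exceeded'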
Web service request throttle
A SearchEngine.search() request takes a minimum amount of time to complete, as outlined in the table above. This is intended as etiquette towards the server providing the service. Raise the throttle value if you plan to run multiple queries in batch. Wikipedia requests are especially intensive. If you plan to mine a lot of data from Wikipedia, download the Wikipedia database instead.
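A sketch of a batch setup with a raised throttle (the value and article titles are illustrative):
>>> from pattern.web import Wikipedia
>>>
>>> engine = Wikipedia(throttle=10.0) # Wait at least 10 seconds between requests.
>>> for title in ('cat', 'dog', 'bird'):
>>>     article = engine.search(title, cached=True)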
@@ -379,7 +370,7 @@ Search Engine results
>>> from pattern.web import Bing, SEARCH, plaintext
->>>
+>>>
>>> engine = Bing(license=None) # Enter your license key.
>>> for i in range(1,5):
>>> for result in engine.search('holy handgrenade', type=SEARCH, start=i):
@@ -392,13 +383,13 @@ Search Engine results
Since SearchEngine.search() takes the same optional parameters as URL.download() it is easy to disable local caching, set a proxy server, a throttle (minimum time) or a timeout (maximum time).
>>> from pattern.web import Google
->>>
+>>>
>>> engine = Google(license=None) # Enter your license key.
>>> for result in engine.search('tim', cached=False, proxy=('proxy.com', 'https')):
>>> print result.url
>>> print result.text
Image search
-For Flickr, Bing and Yahoo, image URLs retrieved with search(type=IMAGE) can be filtered by setting the size to TINY, SMALL, MEDIUM, LARGE or None (any size). Images may be subject to copyright.
+For Flickr and Bing, image URLs retrieved with search(type=IMAGE) can be filtered by setting the size to TINY, SMALL, MEDIUM, LARGE or None (any size). Images may be subject to copyright.
For Flickr, use search(copyright=False) to retrieve results with no copyright restrictions (either public domain or Creative Commons by-sa).
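For example, a sketch (the query is illustrative) that retrieves small, copyright-free images from Flickr:
>>> from pattern.web import Flickr, IMAGE, SMALL
>>>
>>> engine = Flickr(license=None) # Enter your license key.
>>> for result in engine.search('kittens', type=IMAGE, size=SMALL, copyright=False):
>>>     print result.url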
For Twitter, each result has a Result.profile property with the URL to the user's profile picture.
@@ -407,7 +398,7 @@ Google translate
Google.translate() returns the translated string in the given language.
Google.identify() returns a (language code, confidence)-tuple for a given string.
>>> from pattern.web import Google
->>>
+>>>
>>> s = "C'est un lapin, lapin de bois. Quoi? Un cadeau."
>>> g = Google()
>>> print g.translate(s, input='fr', output='en', cached=False)
@@ -424,7 +415,7 @@ Twitter search
>>> from pattern.web import Twitter
>>>
>>> t = Twitter()
->>> i = None
+>>> i = None
>>> for j in range(3):
>>> for tweet in t.search('win', start=i, count=10):
>>> print tweet.text
@@ -436,10 +427,10 @@ Twitter streams
Twitter.stream() returns an endless, live stream of Result objects. A Stream is a Python list that accumulates each time Stream.update() is called:
>>> from pattern.web import Twitter, time
->>>
+>>>
>>> s = Twitter().stream('#fail')
>>> for i in range(10):
->>> time.sleep(1)
+>>> time.sleep(1)
>>> s.update(bytes=1024)
>>> print s[-1].text if s else ''
To clear the accumulated list, call Stream.clear().
@@ -477,11 +468,11 @@ Wikipedia article sections
section.string # Section plaintext unicode string.
section.content # Section string minus title.
section.level # Section nested depth (from 0).
-section.links # List of titles of linked articles.
+section.links # List of titles of linked articles.
section.tables # List of WikipediaTable objects.
The following example downloads a Wikipedia article and prints the title of each section, indented according to the section level:
>>> from pattern.web import Wikipedia
->>>
+>>>
>>> article = Wikipedia().search('cat')
>>> for section in article.sections:
>>> print repr(' ' * section.level + section.title)
@@ -525,7 +516,7 @@ DBPedia
DBPedia is a database of structured information mined from Wikipedia and stored as (subject, predicate, object)-triples (e.g., cat is-a animal). DBPedia can be queried with SPARQL, where subject, predicate and/or object can be given as ?variables. The Result objects in the list returned from DBPedia.search() have the variables as additional properties:
>>> from pattern.web import DBPedia
->>>
+>>>
>>> sparql = '\n'.join((
>>> 'prefix dbo: <http://dbpedia.org/ontology/>',
>>> 'select ?person ?place where {',
@@ -547,25 +538,25 @@ Facebook posts, comments & likes
Facebook.search(id, type=COMMENTS) retrieves comments for a given post's Result.id. You can also pass the id of a post or a comment to Facebook.search(id, type=LIKES) to retrieve users that liked it.
>>> from pattern.web import Facebook, NEWS, COMMENTS, LIKES
->>>
+>>>
>>> fb = Facebook(license='your key')
>>> me = fb.profile(id=None) # (id, name, date, gender, locale, likes)-tuple
->>>
+>>>
>>> for post in fb.search(me[0], type=NEWS, count=100):
>>> print repr(post.id)
>>> print repr(post.text)
>>> print repr(post.url)
>>> if post.comments > 0:
->>> print '%i comments' % post.comments
+>>> print '%i comments' % post.comments
>>> print [(r.text, r.author) for r in fb.search(post.id, type=COMMENTS)]
>>> if post.likes > 0:
->>> print '%i likes' % post.likes
+>>> print '%i likes' % post.likes
>>> print [r.author for r in fb.search(post.id, type=LIKES)]
u'530415277_10151455896030278'
u'Tom De Smedt likes CLiPS Research Center'
u'http://www.facebook.com/CLiPS.UA'
-1 likes
+1 likes
[(u'485942414773810', u'CLiPS Research Center')]
...
The maximum count for COMMENTS and LIKES is 1000 (by default, 10).
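For example, a sketch (assuming a valid license key) that retrieves all comments for the post id shown in the output above in a single call:
>>> from pattern.web import Facebook, COMMENTS
>>>
>>> fb = Facebook(license='your key')
>>> comments = fb.search(u'530415277_10151455896030278', type=COMMENTS, count=1000)
>>> print len(comments)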
@@ -575,7 +566,7 @@ RSS + Atom newsfeeds
The Newsfeed object is a wrapper for Mark Pilgrim's Universal Feed Parser. Newsfeed.search() takes the URL of an RSS or Atom news feed and returns a list of Result objects.
>>> from pattern.web import Newsfeed
->>>
+>>>
>>> NATURE = 'http://www.nature.com/nature/current_issue/rss/index.html'
>>> for result in Newsfeed().search(NATURE)[:5]:
>>> print repr(result.title)
@@ -597,7 +588,7 @@ Web sort
sort(
terms = [], # List of search terms.
context = '', # Term used for sorting.
- service = GOOGLE, # GOOGLE | BING | YAHOO | FLICKR
+ service = GOOGLE, # GOOGLE | BING | FLICKR
license = None, # Service license key.
strict = True, # Wrap query in quotes?
prefix = False, # context + term or term + context?
@@ -605,13 +596,13 @@ Web sort
Now let's see who is more dangerous:
>>> from pattern.web import sort
->>>
+>>>
>>> results = sort(terms=[
->>> 'arnold schwarzenegger',
->>> 'chuck norris',
->>> 'dolph lundgren',
+>>> 'arnold schwarzenegger',
+>>> 'chuck norris',
+>>> 'dolph lundgren',
>>> 'steven seagal',
->>> 'sylvester stallone',
+>>> 'sylvester stallone',
>>> 'mickey mouse'], context='dangerous', prefix=True)
>>>
>>> for weight, term in results:
@@ -649,7 +640,7 @@ HTML to plaintext
The following example downloads an HTML document and keeps a minimal amount of formatting (headings, bold, links).
>>> from pattern.web import URL, plaintext
->>>
+>>>
>>> s = URL('http://www.clips.ua.ac.be').download()
>>> s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
>>> print s
@@ -742,12 +733,12 @@ Node
>>> from pattern.web import Element
->>>
+>>>
>>> div = Element('<div> <a>1st</a> <a>2nd</a> </div>')
>>> print div('a:first-child')
>>> print div('a:first-child')[0].source
-[Element(tag='a')]
+[Element(tag='a')]
<a>1st</a>
DOM
The top-level element in the Document Object Model.
@@ -756,7 +747,7 @@ DOM
dom.body # <body> Element.
The following example retrieves the most recent reddit entries. The pattern.web module does not include a reddit search engine, but we can parse entries directly from the HTML source. This is called screen scraping, and many websites will strongly dislike it.
>>> from pattern.web import URL, DOM, plaintext
->>>
+>>>
>>> url = URL('http://www.reddit.com/top/')
>>> dom = DOM(url.download(cached=True))
>>> for e in dom('div.entry')[:3]: # Top 3 reddit entries.
@@ -765,13 +756,13 @@ DOM
u'Invisible Kitty'
u'Naturally, he said yes.'
-u"I'd just like to remind everyone that /r/minecraft exists and not everyone wants"
+u"I'd just like to remind everyone that /r/minecraft exists and not everyone wants"
"to have 10 Minecraft posts a day on their front page."
Absolute URLs
Links parsed from the DOM can be relative (e.g., starting with "../" instead of "http://").
To get the absolute URL, you can use the abs() function in combination with URL.redirect:
>>> from pattern.web import URL, DOM, abs
->>>
+>>>
>>> url = URL('http://www.clips.ua.ac.be')
>>> dom = DOM(url.download())
>>> for link in dom('a'):
@@ -783,13 +774,13 @@ PDF Parser
The PDF object (based on PDFMiner) parses the source text from a PDF file.
>>> from pattern.web import URL, PDF
->>>
+>>>
>>> url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
>>> pdf = PDF(url.download())
>>> print pdf.string
CLiPS Technical Report series 002 September 7, 2010
-Tom De Smedt, Vincent Van Asch, Walter Daelemans
+Tom De Smedt, Vincent Van Asch, Walter Daelemans
Computational Linguistics & Psycholinguistics Research Center
...
URLs linking to a PDF document can be identified with: URL.mimetype in MIMETYPE_PDF.
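For example (the URL is hypothetical):
>>> from pattern.web import URL, MIMETYPE_PDF
>>>
>>> url = URL('http://www.example.com/paper.pdf') # Hypothetical URL.
>>> print url.mimetype in MIMETYPE_PDF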
@@ -803,7 +794,7 @@ Crawler
crawler.history # Dictionary of (domain, time last visited)-items.
crawler.visited # Dictionary of URLs visited.
crawler.sort # FIFO | LIFO (how new links are queued).
-crawler.done # True when all links have been visited.crawler.push(link, priority=1.0, sort=LIFO)
+crawler.done # True when all links have been visited.
crawler.push(link, priority=1.0, sort=LIFO)
crawler.pop(remove=True)
crawler.next # Yields the next scheduled link = Crawler.pop(False)
crawler.crawl(method=DEPTH) # DEPTH | BREADTH | None.
crawler.priority(link, method=DEPTH)
crawler.follow(link)
@@ -832,7 +823,7 @@ Crawler
>>> from pattern.web import Crawler
>>>
->>> class Polly(Crawler):
+>>> class Polly(Crawler):
>>> def visit(self, link, source=None):
>>> print 'visited:', repr(link.url), 'from:', link.referrer
>>> def fail(self, link):
@@ -852,19 +843,19 @@ Crawler
Crawl function
The crawl() function returns an iterator that yields (Link, source)-tuples. When it is idle (e.g., waiting for the delay on a domain) it yields (None, None).
crawl(
- links = [],
- domains = [],
- delay = 20.0,
- sort = FIFO,
+ links = [],
+ domains = [],
+ delay = 20.0,
+ sort = FIFO,
method = DEPTH, **kwargs)
>>> from pattern.web import crawl
->>>
+>>>
>>> for link, source in crawl('http://www.clips.ua.ac.be/', delay=3, throttle=3):
>>> print link
Link(url=u'http://www.clips.ua.ac.be/')
Link(url=u'http://www.clips.ua.ac.be/#navigation')
-Link(url=u'http://www.clips.ua.ac.be/computational-linguistics')
+Link(url=u'http://www.clips.ua.ac.be/computational-linguistics')
...
@@ -889,7 +880,7 @@ E-mail
The following example retrieves spam e-mails containing the word "wish":
>>> from pattern.web import Mail, GMAIL, SUBJECT
->>>
+>>>
>>> gmail = Mail(username='...', password='...', service=GMAIL)
>>> print gmail.folders.keys()
@@ -914,8 +905,8 @@ Locale
geocode(location) # 'Brussels' => (50.83, 4.33, u'nl', u'Belgium')
This is useful in combination with the geo parameter for Twitter.search() to obtain regional tweets:
>>> from pattern.web import Twitter
->>> from pattern.web.locale import geocode
->>>
+>>> from pattern.web.locale import geocode
+>>>
>>> twitter = Twitter(language='en')
>>> for tweet in twitter.search('restaurant', geo=geocode('Brussels')[:2]):
>>> print tweet.text
diff --git a/examples/01-web/01-google.py b/examples/01-web/01-google.py
index 1fbd2ac7..1fc9c1b4 100644
--- a/examples/01-web/01-google.py
+++ b/examples/01-web/01-google.py
@@ -6,8 +6,8 @@
# The pattern.web module has a SearchEngine class,
# with a SearchEngine.search() method that yields a list of Result objects.
# Each Result has url, title, text, language, author and date properties.
-# Subclasses of SearchEngine include:
-# Google, Bing, Yahoo, Twitter, Facebook, Wikipedia, Wiktionary, Flickr, ...
+# Subclasses of SearchEngine include:
+# Google, Bing, Twitter, Facebook, Wikipedia, Wiktionary, Flickr, ...
# This example retrieves results from Google based on a given query.
# The Google search engine can handle SEARCH type searches.
@@ -17,7 +17,7 @@
# The pattern.web module uses a test account by default,
# with 100 free queries per day shared by all Pattern users.
# If this limit is exceeded, SearchEngineLimitError is raised.
-# You should obtain your own license key at:
+# You should obtain your own license key at:
# https://code.google.com/apis/console/
# Activate "Custom Search API" under "Services" and get the key under "API Access".
# Then use Google(license=[YOUR_KEY]).search().
diff --git a/examples/01-web/11-facebook.py b/examples/01-web/11-facebook.py
index a633cb9c..4924237c 100644
--- a/examples/01-web/11-facebook.py
+++ b/examples/01-web/11-facebook.py
@@ -15,7 +15,7 @@
# 1) Searching for public status updates.
# Search for all status updates that contain the word "horrible".
-try:
+try:
# We'll store the status updates in a Datasheet.
# A Datasheet is a table of rows and columns that can be exported as a CSV-file.
# In the first column, we'll store a unique id for each status update.
@@ -33,7 +33,7 @@
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
-for status in fb.search("horrible", count=25, cached=False):
+for status in fb.search(262588213843476, count=25, cached=False):
print "=" * 100
print status.id
print status.text
@@ -59,8 +59,8 @@
if license != "":
fb = Facebook(license)
# Facebook.profile() returns a dictionary with author info.
- # By default, this is your own profile.
- # You can also supply the id of another profile,
+ # By default, this is your own profile.
+ # You can also supply the id of another profile,
# or the name of a product page.
me = fb.profile()["id"]
for status in fb.search(me, type=NEWS, count=30, cached=False):
diff --git a/examples/01-web/15-sort.py b/examples/01-web/15-sort.py
index 8134ccfc..0c369253 100644
--- a/examples/01-web/15-sort.py
+++ b/examples/01-web/15-sort.py
@@ -1,29 +1,29 @@
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
-from pattern.web import GOOGLE, YAHOO, BING, sort
+from pattern.web import GOOGLE, BING, sort
# The pattern.web module includes an interesting sort() algorithm.
# It classifies search terms according to a search engine's total results count.
# When a context is defined, it sorts according to relevancy to the context:
# sort(terms=["black", "green", "red"], context="Darth Vader") =>
-# yields "black" as the best candidate,
+# yields "black" as the best candidate,
# because "black Darth Vader" yields more search results.
results = sort(
terms = [
- "arnold schwarzenegger",
- "chuck norris",
- "dolph lundgren",
+ "arnold schwarzenegger",
+ "chuck norris",
+ "dolph lundgren",
"steven seagal",
- "sylvester stallone",
+ "sylvester stallone",
"mickey mouse",
],
context = "dangerous", # Term used for sorting.
- service = BING, # GOOGLE, YAHOO, BING, ...
+ service = BING, # GOOGLE, BING, ...
license = None, # You should supply your own API license key for the given service.
- strict = True, # Wraps the query in quotes, i.e. 'mac sweet'.
+ strict = True, # Wraps the query in quotes, i.e. 'mac sweet'.
reverse = True, # Reverses term and context: 'sweet mac' instead of 'mac sweet'.
cached = True)
-
+
for weight, term in results:
print "%5.2f" % (weight * 100) + "%", term
\ No newline at end of file
diff --git a/pattern/web/__init__.py b/pattern/web/__init__.py
index b1390eab..343154b1 100644
--- a/pattern/web/__init__.py
+++ b/pattern/web/__init__.py
@@ -58,7 +58,7 @@
MODULE = os.path.dirname(os.path.realpath(__file__))
except:
MODULE = ""
-
+
if sys.version > "3":
long = int
@@ -68,8 +68,8 @@
# assigned to these code points.
GREMLINS = set([
- 0x0152, 0x0153, 0x0160, 0x0161, 0x0178, 0x017E, 0x017D, 0x0192, 0x02C6,
- 0x02DC, 0x2013, 0x2014, 0x201A, 0x201C, 0x201D, 0x201E, 0x2018, 0x2019,
+ 0x0152, 0x0153, 0x0160, 0x0161, 0x0178, 0x017E, 0x017D, 0x0192, 0x02C6,
+ 0x02DC, 0x2013, 0x2014, 0x201A, 0x201C, 0x201D, 0x201E, 0x2018, 0x2019,
0x2020, 0x2021, 0x2022, 0x2026, 0x2030, 0x2039, 0x203A, 0x20AC, 0x2122
])
@@ -81,7 +81,7 @@ def fix(s, ignore=""):
if not isinstance(s, unicode):
s = s.decode("utf-8")
# If this doesn't work,
- # copy & paste string in a Unicode .txt,
+ # copy & paste string in a Unicode .txt,
# and then pass open(f).read() to fix().
u = []
i = 0
@@ -1048,7 +1048,7 @@ def description(self):
@property
def likes(self):
return self.votes
-
+
@property
def retweets(self):
return self.shares
@@ -1077,7 +1077,7 @@ def __setitem__(self, k, v):
def setdefault(self, k, v=None):
return dict.setdefault(self, u(k), self._format(v))
-
+
def update(self, *args, **kwargs):
dict.update(self, [(u(k), self._format(v)) for k, v in dict(*args, **kwargs).items()])
@@ -1233,89 +1233,6 @@ def identify(self, string, **kwargs):
data = u(data.get("language")), float(data.get("confidence"))
return data
-#--- YAHOO -----------------------------------------------------------------------------------------
-# Yahoo! Search is a web search engine owned by Yahoo! Inc.
-# Yahoo! BOSS ("Build Your Own Search Service") is a paid service.
-# http://developer.yahoo.com/search/
-
-YAHOO = "http://yboss.yahooapis.com/ysearch/"
-YAHOO_LICENSE = api.license["Yahoo"]
-
-class Yahoo(SearchEngine):
-
- def __init__(self, license=None, throttle=0.5, language=None):
- SearchEngine.__init__(self, license or YAHOO_LICENSE, throttle, language)
-
- def _authenticate(self, url):
- url.query.update({
- "oauth_version": "1.0",
- "oauth_nonce": oauth.nonce(),
- "oauth_timestamp": oauth.timestamp(),
- "oauth_consumer_key": self.license[0],
- "oauth_signature_method": "HMAC-SHA1"
- })
- url.query["oauth_signature"] = oauth.sign(url.string.split("?")[0], url.query,
- method = url.method,
- secret = self.license[1]
- )
- return url
-
- def search(self, query, type=SEARCH, start=1, count=10, sort=RELEVANCY, size=None, cached=True, **kwargs):
- """ Returns a list of results from Yahoo for the given query.
- - type : SEARCH, IMAGE or NEWS,
- - start: maximum 1000 results => start 1-100 with count=10, 1000/count,
- - count: maximum 50, or 35 for images.
- There is no daily limit, however Yahoo BOSS is a paid service.
- """
- if type not in (SEARCH, IMAGE, NEWS):
- raise SearchEngineTypeError
- if type == SEARCH:
- url = YAHOO + "web"
- if type == IMAGE:
- url = YAHOO + "images"
- if type == NEWS:
- url = YAHOO + "news"
- if not query or count < 1 or start < 1 or start > 1000 / count:
- return Results(YAHOO, query, type)
- # 1) Create request URL.
- url = URL(url, method=GET, query={
- "q": query.replace(" ", "+"),
- "start": 1 + (start-1) * count,
- "count": min(count, type==IMAGE and 35 or 50),
- "format": "json"
- })
- # 2) Restrict language.
- if self.language is not None:
- market = locale.market(self.language)
- if market:
- url.query["market"] = market.lower()
- # 3) Authenticate.
- url = self._authenticate(url)
- # 4) Parse JSON response.
- kwargs.setdefault("unicode", True)
- kwargs.setdefault("throttle", self.throttle)
- try:
- data = url.download(cached=cached, **kwargs)
- except HTTP401Authentication:
- raise HTTP401Authentication("Yahoo %s API is a paid service" % type)
- except HTTP403Forbidden:
- raise SearchEngineLimitError
- data = json.loads(data)
- data = data.get("bossresponse") or {}
- data = data.get({SEARCH:"web", IMAGE:"images", NEWS:"news"}[type], {})
- results = Results(YAHOO, query, type)
- results.total = int(data.get("totalresults") or 0)
- for x in data.get("results", []):
- r = Result(url=None)
- r.url = self.format(x.get("url", x.get("clickurl")))
- r.title = self.format(x.get("title"))
- r.text = self.format(x.get("abstract"))
- r.date = self.format(x.get("date"))
- r.author = self.format(x.get("source"))
- r.language = self.format(x.get("language") and \
- x.get("language").split(" ")[0] or self.language or "")
- results.append(r)
- return results
#--- BING ------------------------------------------------------------------------------------------
# Bing is a web search engine owned by Microsoft.
@@ -1638,12 +1555,12 @@ def search(self, query, type=SEARCH, start=1, count=10, sort=RELEVANCY, size=Non
# If search() is called again with start+1, start from this id.
if isinstance(start, (int, long, float)):
k = (query, kwargs.get("geo"), kwargs.get("date"), int(start), count)
- if results:
- self._pagination[k] = str(int(results[-1].id) - 1)
+ if results:
+ self._pagination[k] = str(int(results[-1].id) - 1)
else:
self._pagination[k] = id
return results
-
+
def profile(self, query, start=1, count=10, **kwargs):
""" Returns a list of results for the given author id, alias or search query.
"""
@@ -1869,13 +1786,13 @@ def index(self, namespace=0, start=None, count=100, cached=True, **kwargs):
start = data.get("query-continue", {}).get("allpages", {})
start = start.get("apcontinue", start.get("apfrom", -1))
raise StopIteration
-
+
# Backwards compatibility.
list = index
def search(self, query, type=SEARCH, start=1, count=10, sort=RELEVANCY, size=None, cached=True, **kwargs):
""" With type=SEARCH, returns a MediaWikiArticle for the given query (case-sensitive).
- With type=ALL, returns a list of results.
+ With type=ALL, returns a list of results.
Each result.title is the title of an article that contains the given query.
"""
if type not in (SEARCH, ALL, "*"):
@@ -2063,7 +1980,7 @@ def plaintext(self, **kwargs):
@property
def html(self):
return self.source
-
+
@property
def src(self):
return self.source
@@ -2100,7 +2017,7 @@ def source(self):
@property
def html(self):
return self.source
-
+
@property
def src(self):
return self.source
@@ -2142,7 +2059,7 @@ def tables(self):
p = self.article._plaintext
f = find_between
for s in f(b[0], b[1], self.source):
- t = self.article.parser.MediaWikiTable(self,
+ t = self.article.parser.MediaWikiTable(self,
title = p((f(r"", "", s) + [""])[0]),
source = b[0] + s + b[1])
# 1) Parse and content and format it as plain text.
@@ -2186,7 +2103,7 @@ def plaintext(self, **kwargs):
@property
def html(self):
return self.source
-
+
@property
def src(self):
return self.source
@@ -2493,7 +2410,7 @@ def search(self, query, type=SPARQL, start=1, count=10, sort=RELEVANCY, size=Non
# Flickr is a popular image hosting and video hosting website.
# http://www.flickr.com/services/api/
-FLICKR = "http://api.flickr.com/services/rest/"
+FLICKR = "https://api.flickr.com/services/rest/"
FLICKR_LICENSE = api.license["Flickr"]
INTERESTING = "interesting"
@@ -2624,23 +2541,22 @@ def _token(self):
"client_secret": "81ff4204e73ecafcd87635a3a3683fbe"
}).download().split("=")[1]
- def search(self, query, type=SEARCH, start=1, count=10, cached=False, **kwargs):
+ def search(self, query, type=NEWS, start=1, count=10, cached=False, **kwargs):
""" Returns a list of results from Facebook public status updates for the given query.
- query: string, or Result.id for NEWS and COMMENTS,
- - type : SEARCH,
+ - type : NEWS,
- start: 1,
- - count: maximum 100 for SEARCH and NEWS, 1000 for COMMENTS and LIKES.
+ - count: maximum 100 for NEWS, 1000 for COMMENTS and LIKES.
There is an hourly limit of +-600 queries (actual amount undisclosed).
"""
- # Facebook.search(type=SEARCH) returns public posts + author.
# Facebook.search(type=NEWS) returns posts for the given author (id | alias | "me").
# Facebook.search(type=COMMENTS) returns comments for the given post id.
# Facebook.search(type=LIKES) returns authors for the given author, post or comments.
# Facebook.search(type=FRIENDS) returns authors for the given author.
# An author is a Facebook user or other entity (e.g., a product page).
- if type not in (SEARCH, NEWS, COMMENTS, LIKES, FRIENDS):
+ if type not in (NEWS, COMMENTS, LIKES, FRIENDS):
raise SearchEngineTypeError
- if type in (SEARCH, NEWS):
+ if type in (NEWS,):
max = 100
if type in (COMMENTS, LIKES):
max = 1000
@@ -2651,15 +2567,6 @@ def search(self, query, type=SEARCH, start=1, count=10, cached=False, **kwargs):
if isinstance(query, FacebookResult):
query = query.id
# 1) Construct request URL.
- if type == SEARCH:
- url = FACEBOOK + type
- url = URL(url, method=GET, query={
- "q": query,
- "type": "post",
- "access_token": self.license,
- "offset": (start-1) * min(count, max),
- "limit": (start-0) * min(count, max)
- })
if type in (NEWS, FEED, COMMENTS, LIKES, FRIENDS):
url = FACEBOOK + (u(query) or "me").replace(FACEBOOK, "") + "/" + type.replace("news", "feed")
url = URL(url, method=GET, query={
@@ -2667,10 +2574,10 @@ def search(self, query, type=SEARCH, start=1, count=10, cached=False, **kwargs):
"offset": (start-1) * min(count, max),
"limit": (start-0) * min(count, max),
})
- if type in (SEARCH, NEWS, FEED):
+ if type in (NEWS, FEED):
url.query["fields"] = ",".join((
- "id", "from", "name", "story", "message", "link", "picture", "created_time", "shares",
- "comments.limit(1).summary(true)",
+ "id", "from", "name", "story", "message", "link", "picture", "created_time", "shares",
+ "comments.limit(1).summary(true)",
"likes.limit(1).summary(true)"
))
# 2) Parse JSON response.
@@ -2737,7 +2644,7 @@ def profile(self, id=None, **kwargs):
locale = data.get("hometown", {}).get("name", ""),
votes = int(data.get("likes", 0)) # (for product pages)
)
-
+
page = profile
#--- PRODUCT REVIEWS -------------------------------------------------------------------------------
@@ -2876,8 +2783,6 @@ def query(string, service=GOOGLE, **kwargs):
service = service.lower()
if service in (GOOGLE, "google", "g"):
engine = Google
- if service in (YAHOO, "yahoo", "y!"):
- engine = Yahoo
if service in (BING, "bing"):
engine = Bing
if service in (DUCKDUCKGO, "duckduckgo", "ddg"):
@@ -2907,7 +2812,6 @@ def query(string, service=GOOGLE, **kwargs):
SERVICES = {
GOOGLE : Google,
- YAHOO : Yahoo,
BING : Bing,
TWITTER : Twitter,
WIKIPEDIA : Wikipedia,
@@ -2924,7 +2828,7 @@ def sort(terms=[], context="", service=GOOGLE, license=None, strict=True, prefix
yields "black" as the best candidate, because "black Darth Vader" is more common in search results.
- terms : list of search terms,
- context : term used for sorting,
- - service : web service name (GOOGLE, YAHOO, BING),
+ - service : web service name (GOOGLE, BING),
- license : web service license id,
- strict : when True the query constructed from term + context is wrapped in quotes.
"""
@@ -3021,7 +2925,7 @@ def traverse(self, visit=lambda node: None):
""" Executes the visit function on this node and each of its child nodes.
"""
visit(self); [node.traverse(visit) for node in self.children]
-
+
def remove(self, child):
""" Removes the given child node (and all nested nodes).
"""
@@ -3043,7 +2947,7 @@ def __str__(self):
return bytestring(self.__unicode__())
def __unicode__(self):
return u(self._p)
-
+
def __call__(self, *args, **kwargs):
pass
@@ -3235,7 +3139,7 @@ def __repr__(self):
def _encode_space(s):
return s.replace(" ", "")
-
+
def _decode_space(s):
return s.replace("", " ")
@@ -3253,11 +3157,11 @@ def __init__(self, s):
s = s.replace(".", " .") # .class
s = s.replace(":", " :") # :pseudo-element
s = s.replace("[", " [") # [attribute="value"]
- s = re.sub(r"\[.*?\]",
- lambda m: re.sub(r" (\#|\.|\:)", "\\1", m.group(0)), s)
- s = re.sub(r"\[.*?\]",
+ s = re.sub(r"\[.*?\]",
+ lambda m: re.sub(r" (\#|\.|\:)", "\\1", m.group(0)), s)
+ s = re.sub(r"\[.*?\]",
lambda m: _encode_space(m.group(0)), s)
- s = re.sub(r":contains\(.*?\)",
+ s = re.sub(r":contains\(.*?\)",
lambda m: _encode_space(m.group(0)), s)
s = s.split(" ")
self.tag, self.id, self.classes, self.pseudo, self.attributes = (
@@ -3297,7 +3201,7 @@ def _first_child(self, e):
for e in e.children:
if isinstance(e, Element):
return e
-
+
def _next_sibling(self, e):
""" Returns the first next sibling Element of the given element.
"""
@@ -3305,7 +3209,7 @@ def _next_sibling(self, e):
e = e.next
if isinstance(e, Element):
return e
-
+
def _previous_sibling(self, e):
""" Returns the last previous sibling Element of the given element.
"""
@@ -3313,7 +3217,7 @@ def _previous_sibling(self, e):
e = e.previous
if isinstance(e, Element):
return e
-
+
def _contains(self, e, s):
""" Returns True if string s occurs in the given element (case-insensitive).
"""
@@ -3383,9 +3287,9 @@ def __init__(self, s):
s = re.sub(r" *\> *", " >", s)
s = re.sub(r" *\< *", " <", s)
s = re.sub(r" *\+ *", " +", s)
- s = re.sub(r"\[.*?\]",
+ s = re.sub(r"\[.*?\]",
lambda m: _encode_space(m.group(0)), s)
- s = re.sub(r":contains\(.*?\)",
+ s = re.sub(r":contains\(.*?\)",
lambda m: _encode_space(m.group(0)), s)
self.append([])
for s in s.split(" "):
@@ -3739,7 +3643,7 @@ class DocumentParserError(Exception):
pass
class DocumentParser(object):
-
+
def __init__(self, path, *args, **kwargs):
""" Parses a text document (e.g., .pdf or .docx),
given as a file path or a string.
@@ -3760,7 +3664,7 @@ def _parse(self, path, *args, **kwargs):
""" Returns a plaintext Unicode string parsed from the given document.
"""
return plaintext(decode_utf8(self.open(path).read()))
-
+
@property
def string(self):
return self.content
@@ -3810,7 +3714,7 @@ class DOCXError(DocumentParserError):
pass
class DOCX(DocumentParser):
-
+
def _parse(self, path, *args, **kwargs):
from docx.docx import opendocx
from docx.docx import getdocumenttext
@@ -3830,7 +3734,7 @@ def parsepdf(path, *args, **kwargs):
""" Returns the content as a Unicode string from the given .pdf file.
"""
return PDF(path, *args, **kwargs).content
-
+
def parsedocx(path, *args, **kwargs):
""" Returns the content as a Unicode string from the given .docx file.
"""
@@ -3853,7 +3757,7 @@ def parsedoc(path, format=None):
return parsehtml(path)
# Brute-force approach if the format is unknown.
for f in (parsepdf, parsedocx, parsehtml):
- try:
+ try:
return f(path)
except:
pass
diff --git a/pattern/web/api.py b/pattern/web/api.py
index 5dfd0d54..882b456c 100644
--- a/pattern/web/api.py
+++ b/pattern/web/api.py
@@ -1,6 +1,6 @@
#--- API LICENSE CONFIGURATION -----------------------------------------------------------------------
# Default license keys used by pattern.web.SearchEngine to contact different API's.
-# Google and Yahoo are paid services for which you need a personal license + payment method.
+# Google is a paid service for which you need a personal license + payment method.
# The default Google license is for testing purposes (= 100 daily queries).
# Wikipedia, Twitter and Facebook are free.
# Bing, Flickr and ProductsWiki use licenses shared among all Pattern users.
@@ -12,9 +12,6 @@
license["Bing"] = \
"VnJEK4HTlntE3SyF58QLkUCLp/78tkYjV1Fl3J7lHa0="
-license["Yahoo"] = \
- ("", "") # OAuth (key, secret)
-
license["DuckDuckGo"] = \
None
diff --git a/test/test.py b/test/test.py
index 40b08a09..3483cb5e 100644
--- a/test/test.py
+++ b/test/test.py
@@ -18,8 +18,8 @@
#---------------------------------------------------------------------------------------------------
# Run all tests.
# pattern.db tests require a valid username and password for MySQL.
-# pattern.web tests require a working internet connection
-# and API license keys (see pattern.web.api.py) for Google and Yahoo API's.
+# pattern.web tests require a working internet connection
+# and an API license key (see pattern.web.api.py) for the Google API.
def suite():
suite = unittest.TestSuite()
diff --git a/test/test_web.py b/test/test_web.py
index 269a6822..95b2d92b 100644
--- a/test/test_web.py
+++ b/test/test_web.py
@@ -15,10 +15,10 @@
#---------------------------------------------------------------------------------------------------
class TestCache(unittest.TestCase):
-
+
def setUp(self):
pass
-
+
def test_cache(self):
# Assert cache unicode.
k, v = "test", u"ünîcødé"
@@ -28,11 +28,11 @@ def test_cache(self):
self.assertEqual(web.cache.age(k), 0)
del web.cache[k]
print("pattern.web.Cache")
-
+
#---------------------------------------------------------------------------------------------------
class TestUnicode(unittest.TestCase):
-
+
def setUp(self):
# Test data with different (or wrong) encodings.
self.strings = (
@@ -43,7 +43,7 @@ def setUp(self):
"ünîcøde",
u"אוניקאָד"
)
-
+
def test_decode_utf8(self):
# Assert unicode.
for s in self.strings:
@@ -55,7 +55,7 @@ def test_encode_utf8(self):
for s in self.strings:
self.assertTrue(isinstance(web.encode_utf8(s), str))
print("pattern.web.encode_utf8()")
-
+
def test_fix(self):
# Assert fix for common Unicode mistakes.
self.assertEqual(web.fix(u"cliché"), u"cliché")
@@ -66,7 +66,7 @@ def test_fix(self):
#---------------------------------------------------------------------------------------------------
class TestURL(unittest.TestCase):
-
+
def setUp(self):
# Test a live URL that has fast response time
self.live = "http://www.google.com/"
@@ -83,7 +83,7 @@ def setUp(self):
"query": {"q": 1},
"anchor": "anchor"
}
-
+
def test_asynchrous(self):
# Assert asynchronous function call (returns 1).
v = web.asynchronous(lambda t: time.sleep(t) or 1, 0.2)
@@ -91,25 +91,25 @@ def test_asynchrous(self):
time.sleep(0.1)
self.assertEqual(v.value, 1)
print("pattern.web.asynchronous()")
-
+
def test_extension(self):
# Assert filename extension.
v = web.extension(os.path.join("pattern", "test", "test-web.py.zip"))
self.assertEqual(v, ".zip")
print("pattern.web.extension()")
-
+
def test_urldecode(self):
# Assert URL decode (inverse of urllib.urlencode).
v = web.urldecode("?user=me&page=1&q=&")
self.assertEqual(v, {"user": "me", "page": 1, "q": None})
print("pattern.web.urldecode()")
-
+
def test_proxy(self):
# Assert URL proxy.
v = web.proxy("www.proxy.com", "https")
self.assertEqual(v, ("www.proxy.com", "https"))
print("pattern.web.proxy()")
-
+
def test_url_parts(self):
# Assert URL._parse and URL.parts{}.
v = web.URL(self.url)
@@ -125,7 +125,7 @@ def test_url_parts(self):
(web.ANCHOR, self.parts["anchor"])):
self.assertEqual(v.parts[a], b)
print("pattern.web.URL.parts")
-
+
def test_url_query(self):
# Assert URL.query and URL.querystring.
v = web.URL(self.url)
@@ -142,7 +142,7 @@ def test_url_query(self):
self.assertEqual(v.query, q[0])
print("pattern.web.URL.query")
print("pattern.web.URL.querystring")
-
+
def test_url_string(self):
# Assert URL._set_string().
v = web.URL("")
@@ -151,7 +151,7 @@ def test_url_string(self):
self.assertEqual(v.parts[web.DOMAIN], "domain.com")
self.assertEqual(v.parts[web.PATH], [])
print("pattern.web.URL.string")
-
+
def test_url(self):
# Assert URL.copy().
v = web.URL(self.url)
@@ -171,7 +171,7 @@ def test_url(self):
self.assertEqual(v.query, self.parts["query"])
self.assertEqual(v.anchor, self.parts["anchor"])
print("pattern.web.URL")
-
+
def test_url_open(self):
# Assert URLError.
v = web.URL(self.live.replace("http://", "htp://"))
@@ -190,7 +190,7 @@ def test_url_open(self):
self.assertTrue(v.open(user_agent=web.MOZILLA, referrer=web.REFERRER) != None)
print("pattern.web.URL.exists")
print("pattern.web.URL.open()")
-
+
def test_url_download(self):
t = time.time()
v = web.URL(self.live).download(cached=False, throttle=0.25, unicode=True)
@@ -200,19 +200,19 @@ def test_url_download(self):
# Assert download rate limiting.
self.assertTrue(t >= 0.25)
print("pattern.web.URL.download()")
-
+
def test_url_mimetype(self):
# Assert URL MIME-type.
v = web.URL(self.live).mimetype
self.assertTrue(v in web.MIMETYPE_WEBPAGE)
print("pattern.web.URL.mimetype")
-
+
def test_url_headers(self):
# Assert URL headers.
v = web.URL(self.live).headers["content-type"].split(";")[0]
self.assertEqual(v, "text/html")
print("pattern.web.URL.headers")
-
+
def test_url_redirect(self):
# Assert URL redirected URL (this depends on where you are).
# In Belgium, it yields "http://www.google.be/".
@@ -233,12 +233,12 @@ def test_abs(self):
v = web.abs(a, base=b)
self.assertEqual(v, b+c+a) # http://domain.com/#anchor
print("pattern.web.abs()")
-
+
def test_base(self):
# Assert base URL domain name.
self.assertEqual(web.base("http://domain.com/home.html"), "domain.com")
print("pattern.web.base()")
-
+
def test_oauth(self):
# Assert OAuth algorithm.
data = {
@@ -247,7 +247,7 @@ def test_oauth(self):
"oauth_nonce": "0",
"oauth_timestamp": 0,
"oauth_consumer_key": "key",
- "oauth_signature_method": "HMAC-SHA1"
+ "oauth_signature_method": "HMAC-SHA1"
}
v = web.oauth.sign("http://yboss.yahooapis.com/ysearch/web", data, secret="secret")
self.assertEqual(v, "RtTu8dxSp3uBzSbsuLAXIWOKfyI=")
@@ -256,10 +256,10 @@ def test_oauth(self):
#---------------------------------------------------------------------------------------------------
class TestPlaintext(unittest.TestCase):
-
+
def setUp(self):
pass
-
+
def test_find_urls(self):
# Assert URL finder with common URL notations.
for url in (
@@ -277,7 +277,7 @@ def test_find_urls(self):
self.assertEqual(web.find_urls("http://domain.net\">domain")[0], "http://domain.net")
self.assertEqual(web.find_urls("domain.com, domain.net"), ["domain.com", "domain.net"])
print("pattern.web.find_urls()")
-
+
def test_find_email(self):
# Assert e-mail finder with common e-mail notations.
s = "firstname.last+name@domain.ac.co.uk"
@@ -288,7 +288,7 @@ def test_find_email(self):
v = web.find_email("("+",".join(s)+")")
self.assertEqual(v, s)
print("pattern.web.find_email()")
-
+
def test_find_between(self):
# Assert search between open tag and close tag.
s = ""
@@ -299,7 +299,7 @@ def test_find_between(self):
v = web.find_between("a", "b", s)
self.assertEqual(v, ["0", "1"])
print("pattern.web.find_between()")
-
+
def test_strip_tags(self):
# Assert HTML parser and tag stripper.
for html, plain in (
@@ -314,19 +314,19 @@ def test_strip_tags(self):
v = web.strip_tags("text", exclude={"a": ["href"]})
self.assertEqual(v, "text")
print("pattern.web.strip_tags()")
-
+
def test_strip_element(self):
# Assert strip elements.
v = web.strip_element("
text ", "p")
self.assertEqual(v, " ")
print("pattern.web.strip_element()")
-
+
def test_strip_between(self):
# Assert strip elements.
v = web.strip_between("
", "
text ")
self.assertEqual(v, " text ")
print("pattern.web.strip_between()")
-
+
def test_strip_javascript(self):
# Assert strip ")
@@ -338,7 +338,7 @@ def test_strip_inline_css(self):
v = web.strip_inline_css(" ")
self.assertEqual(v, " ")
print("pattern.web.strip_inline_css()")
-
+
def test_strip_comments(self):
# Assert strip elements.
v = web.strip_comments(" ")
@@ -350,19 +350,19 @@ def test_strip_forms(self):
v = web.strip_forms(" ")
self.assertEqual(v, " ")
print("pattern.web.strip_forms()")
-
+
def test_encode_entities(self):
# Assert HTML entity encoder (e.g., "&" => "&amp;")
for a, b in (
- ("É", "É"),
- ("&", "&"),
- ("<", "<"),
- (">", ">"),
+ ("É", "É"),
+ ("&", "&"),
+ ("<", "<"),
+ (">", ">"),
('"', """),
("'", "'")):
self.assertEqual(web.encode_entities(a), b)
print("pattern.web.encode_entities()")
-
+
def test_decode_entities(self):
# Assert HTML entity decoder (e.g., "&amp;" => "&")
for a, b in (
@@ -373,7 +373,7 @@ def test_decode_entities(self):
("&foo;", "&foo;")):
self.assertEqual(web.decode_entities(a), b)
print("pattern.web.decode_entities()")
-
+
def test_collapse_spaces(self):
# Assert collapse multiple spaces.
for a, b in (
@@ -386,7 +386,7 @@ def test_collapse_spaces(self):
# Assert preserve indentation.
self.assertEqual(web.collapse_spaces(" . \n", indentation=True), " .")
print("pattern.web.collapse_spaces()")
-
+
def test_collapse_tabs(self):
# Assert collapse multiple tabs to 1 space.
for a, b in (
@@ -398,7 +398,7 @@ def test_collapse_tabs(self):
# Assert preserve indentation.
self.assertEqual(web.collapse_tabs("\t\t .\t\n", indentation=True), "\t\t .")
print("pattern.web.collapse_tabs()")
-
+
def test_collapse_linebreaks(self):
# Assert collapse multiple linebreaks.
for a, b in (
@@ -409,9 +409,9 @@ def test_collapse_linebreaks(self):
(" \n .", "\n .")):
self.assertEqual(web.collapse_linebreaks(a), b)
print("pattern.web.collapse_linebreaks()")
-
+
def test_plaintext(self):
- # Assert plaintext:
+ # Assert plaintext:
# - strip
@@ -744,7 +733,7 @@ def setUp(self):