我需要一个获取谷歌adsense收入的python脚本,我找到了adsense scraper: http://pypi.python.org/pypi/adsense_scraper/0.5 它使用Twill和html5lib来刮取谷歌的adsense收入数据。当我使用它时,我收到此错误消息:
Traceback (most recent call last):
File "adsense_scraper.py", line 163, in <module>
data = main()
File "adsense_scraper.py", line 154, in main
b = get_adsense(login, password)
File "adsense_scraper.py", line 128, in get_adsense
b.submit()
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
self._journey('open', request)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
r = func(*args, **kwargs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
"http", request, response, code, msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
"refresh", msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
response = urlopen(self, req, data)
File "C:\Python26\lib\urllib2.py", line 406, in _open 'unknown_open', req)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain result = func(*args)
File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
重要的是:
urllib2.URLError: <urlopen error unknown url type: 'http>
有人可以告诉我错误在哪里吗?有没有更好的方法来通过python获取数据?感谢
答案 0 :(得分:1)
包中有几个错误,你只提到了第一个错误
1)斜纹包不能正确处理谷歌的重定向,添加
newurl = newurl.strip( "'" )
到twill / other_packages / _mechanize_dist / _http.py:108之前
newurl = _rfc3986.clean_url(newurl, "latin-1")
修复了
2)你必须在adsense中设置正确的语言 - 英语
3)orignal adsense_scraper
中存在一些问题#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill
Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master
Usage::
from adsense_scraper import get_adsense, get_time_period
b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
rows = get_time_period(b, 'yesterday')
# The summary data is always the first row with channel == ''
print 'I earned this much yesterday: $%(earnings)s' % rows[0]
"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree
try:
from html5lib import HTMLParser
import twill.commands
except ImportError:
print >>sys.stderr, """\
adsense_scraper has dependencies::
Twill 0.9 http://twill.idyll.org/
html5lib 0.11 http://code.google.com/p/html5lib/
Try this::
$ easy_install twill html5lib
"""
raise SystemExit()
__version__ = '0.5'
SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true<mpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"
OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="
TIME_PERIODS = [
'today',
'yesterday',
'thismonth',
'lastmonth',
'sincelastpayment',
]
def parse_decimal(s):
"""Return an int or decimal.Decimal given a human-readable number
"""
light_stripped = s.strip(u'\u20ac')
stripped = light_stripped.replace(',', '.').rstrip('%').lstrip('$')
try:
int(stripped)
return light_stripped
except ValueError:
pass
try:
float(stripped)
return light_stripped
except ValueError:
return decimal.Decimal(stripped)
def parse_summary_table(doc):
"""
Parse the etree doc for summarytable, returns::
[{'channel': unicode,
'impressions': int,
'clicks': int,
'ctr': decimal.Decimal,
'ecpm': decimal.Decimal,
'earnings': decimal.Decimal}]
"""
for t in doc.findall('.//table'):
if t.attrib.get('id') == 'summarytable':
break
else:
raise ValueError("summary table not found")
res = []
FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
for row in t.findall('.//tr'):
celltext = []
for c in row.findall('td'):
tail = ''
# adsense inserts an empty span if a row has a period in it, so
# get the children and find the tail element to append to the text
if c.find('a') and c.find('a').getchildren():
tail = c.find('a').getchildren()[0].tail or ''
celltext.append('%s%s' % ((c.text or c.findtext('a') or '').strip(), tail.strip()))
celltext = filter( lambda x: x != "" , celltext )
if len(celltext) != len(FIELDS):
continue
try:
value_cols = map(parse_decimal, celltext)
except decimal.InvalidOperation:
continue
res.append(dict(zip(FIELDS, value_cols)))
return res
def get_adsense(login, password):
"""Returns a twill browser instance after having logged in to AdSense
with *login* and *password*.
The returned browser will have all of the appropriate cookies set but may
not be at the exact page that you want data from.
"""
b = twill.commands.get_browser()
b.go(SERVICE_LOGIN_BOX_URL)
for form in b.get_all_forms():
try:
form['Email'] = login
form['Passwd'] = password
except ValueError:
continue
else:
break
else:
raise ValueError("Could not find login form on page")
b._browser.select_form(predicate=lambda f: f is form)
b.submit()
return b
def get_time_period(b, period):
"""Returns the parsed summarytable for the time period *period* given
*b* which should be the result of a get_adsense call. *period* must be
a time period that AdSense supports:
``'today'``, ``'yesterday'``, ``'thismonth'``,
``'lastmonth'``, ``'sincelastpayment'``.
"""
b.go(OVERVIEW_URL + period)
# The cElementTree treebuilder doesn't work reliably enough
# to use directly, so we parse and then dump into cElementTree.
doc = cElementTree.fromstring(HTMLParser().parse(b.get_html()).toxml())
return parse_summary_table(doc)
def main():
try:
login, password = sys.argv[1:]
except ValueError:
raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
twill.set_output(StringIO())
twill.commands.reset_browser()
b = get_adsense(login, password)
data = {}
for period in TIME_PERIODS:
data[period] = get_time_period(b, period)
pprint.pprint(data)
twill.set_output(None)
return data
if __name__ == '__main__':
data = main()