我想制作一个网络抓取工具,以便从网站下载 HTML,但我不太了解 re 模块,所以卡住了。
import urllib2
def download(url):
    """Fetch ``url`` and return the response body, or None on failure.

    A ``urllib2.URLError`` is reported on stdout instead of being
    propagated to the caller.
    """
    print("Downloading: " + url)
    try:
        return urllib2.urlopen(url).read()
    except urllib2.URLError as err:
        print("Download error: ", err.reason)
    return None
# Field names scraped from a country page; each one is substituted into the
# ``places_<field>__row`` row id that re_scraper() searches for.
FIELD = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone',
         'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
import re


def re_scraper(html):
    """Extract every FIELD value from a country page with regular expressions.

    Args:
        html: Page markup as a text string.

    Returns:
        A dict mapping each name in FIELD to the text inside the
        ``<td class="w2p_fw">`` cell of that field's row, or to None when
        no such row is found in ``html``.
    """
    results = {}
    for field in FIELD:
        match = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html)
        # group(1) is the captured cell text; group() would return the whole
        # matched markup starting at the <tr> tag. re.search returns None
        # when the row is absent, so guard before dereferencing the match.
        results[field] = match.group(1) if match else None
    return results
import time
# Benchmark: run each scraper NUM_ITERATIONS times over the same downloaded
# page and print the total wall-clock time per scraper.
NUM_ITERATIONS = 1000
html = download("http://example.webscraping.com/view/Afghanistan-1")
scrapers = [
    ('Regular expressions', re_scraper),
    ('BeautifulSoup', bs_scraper),
    ('Lxml', lxml_scraper),
]
for name, scraper in scrapers:
    uses_regex = scraper == re_scraper
    start = time.time()
    for _ in range(NUM_ITERATIONS):
        if uses_regex:
            # Clear the re module's pattern cache before each run
            # (presumably so cached compilation does not skew the timing).
            re.purge()
        result = scraper(html)
        assert result['area'] == '647,500 square kilometres'
    elapsed = time.time() - start
    print('%s: %.2f seconds' % (name, elapsed))
错误消息:
File "E:/���/Projects/new.py", line 20, in re_scraper
results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).group()
AttributeError: 'NoneType' object has no attribute 'group'
HTML是:
<tr id="places_area__row"><td class="w2p_fl"><label for="places_area" id="places_area__label">Area: </label></td><td class="w2p_fw">647,500 square kilometres</td>
我测试过代码,发现 HTML 和正则表达式都没有问题。问题可能出在 field 或 FIELD 上。我认为是它们的类型导致了这个错误,但我该如何解决呢?