此脚本抛出索引错误
from urlparse import urlparse
from multiprocessing.pool import Pool
import re
import urllib2
def btl_test(url):
    """Fetch ``url`` and return the text of its first ``<title>`` element.

    Args:
        url: Address of the page to download.

    Returns:
        The first capture of ``<title>(.*?)</title>`` found in the
        response body.

    Raises:
        IndexError: If the response body contains no match for the
            title pattern.
    """
    page = urllib2.urlopen(url).read()
    # NOTE: re.findall returns an empty list when nothing matches, so the
    # unconditional [0] below is what raises the IndexError shown in the
    # traceback. It is left as-is because it is the subject of the question.
    page1 = (re.findall(r'<title>(.*?)<\/title>',page)[0])
    return page1
# Pages whose titles are to be fetched.
url = ["http://google.com","http://example.com","http://yahoo.com","http://linkedin.com","http://facebook.com","http://orkut.com","http://oosing.com","http://pinterets.com"]
nprocs = 100 # nprocs is the number of processes to run
# Pool of nprocs worker processes.
ParsePool = Pool(nprocs)
# Apply btl_test to every URL. map() blocks until all results are in and
# re-raises in the parent process any exception raised by a worker call.
ParsedURLS = ParsePool.map(btl_test,url)
print ParsedURLS
输出:
Traceback (most recent call last):
File "multithread1.py", line 15, in <module>
ParsedURLS = ParsePool.map(btl_test,url)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 558, in get
raise self._value
IndexError: list index out of range
以上是错误消息
问题出在哪里,解决方案是什么?
答案 0 :(得分:2)
有可能某些 URL 返回的页面中没有 <title> 标签,此时 re.findall 返回空列表,对其取 [0] 就会抛出 IndexError。
因此,请将原来的代码:
def btl_test(url):
    """Fetch ``url`` and return the text of its first ``<title>`` element.

    This is the original, unguarded version.

    Args:
        url: Address of the page to download.

    Returns:
        The first capture of ``<title>(.*?)</title>`` found in the
        response body.

    Raises:
        IndexError: If the response body contains no match for the
            title pattern.
    """
    page = urllib2.urlopen(url).read()
    # Indexing the findall result directly fails with IndexError when the
    # list is empty, i.e. when the page has no matching <title> element.
    page1 = (re.findall(r'<title>(.*?)<\/title>',page)[0])
    return page1
改为:
def btl_test(url):
    """Fetch ``url`` and return the text of its first ``<title>`` element.

    Args:
        url: Address of the page to download.

    Returns:
        The first capture of ``<title>(.*?)</title>`` found in the
        response body, or the string ``"None"`` when the body contains
        no match.
    """
    page = urllib2.urlopen(url).read()
    titles = re.findall(r'<title>(.*?)<\/title>', page)
    # An empty list means the page has no matching <title> element. Fall
    # back to the placeholder string instead of indexing, which would
    # raise IndexError.
    return titles[0] if titles else "None"