我有以下代码:
import requests
from bs4 import BeautifulSoup
reloadData
它在运行时返回错误,代码如下:
def hltvmatch_spider(max_offset):
    """Print the absolute URL of every link on the HLTV match listing pages.

    Pages are requested at offsets 0, 50, 100, ... for as long as the offset
    stays below ``max_offset``. Each page is parsed with ``html.parser`` and
    every ``<a>`` tag that carries an ``href`` attribute is printed, resolved
    against the site root.

    Args:
        max_offset: Exclusive upper bound for the ``offset`` query parameter.
    """
    # Imported locally so the snippet runs on both Python 3 and Python 2.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    base = 'http://www.hltv.org/'
    for offset in range(0, max_offset, 50):
        url = 'http://www.hltv.org/?pageid=188&offset=' + str(offset)
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.text, 'html.parser')
        # href=True skips anchors without an href attribute; for those,
        # link.get('href') returns None and "str + None" raises TypeError.
        for link in soup.find_all('a', href=True):
            # urljoin keeps already-absolute hrefs intact instead of
            # blindly prefixing them with the site root.
            print(urljoin(base, link['href']))


hltvmatch_spider(1)
我需要修改哪些地方?感谢帮助!
答案 0 :(得分:1)
您应该设置 href=True,这样只会匹配带有 href 属性的锚点(<a> 标签);当锚点没有 href 属性时,调用 .get("href")
将返回 None,而 None 无法与字符串拼接:
for link in soup.findAll('a', href=True):
正如我在评论中所说,抓取所有锚点可能并不是你想要的,因为并非每个链接都适合直接拼接基础网址。下面的代码只从主要内容区域中获取所有锚标签:
# Narrow the search to the first main-content box, then gather the href of
# every anchor inside it that has one.
main_box = soup.select_one("div.covMainBoxContent")
hrefs = []
for anchor in main_box.select("a[href]"):
    hrefs.append(anchor["href"])
print(hrefs)
这会给你:
['/?pageid=188&matchid=31342', '/?pageid=179&teamid=4411', '/?pageid=179&teamid=5995', '/?pageid=188&eventid=2135', '/?pageid=188&matchid=31343', '/?pageid=179&teamid=4548', '/?pageid=179&teamid=6865', '/?pageid=188&eventid=2254', '/?pageid=188&matchid=31339', '/?pageid=179&teamid=4548', '/?pageid=179&teamid=6865', '/?pageid=188&eventid=2254', '/?pageid=188&matchid=31338', '/?pageid=179&teamid=5995', '/?pageid=179&teamid=6736', '/?pageid=188&eventid=2135', '/?pageid=188&matchid=31341', '/?pageid=179&teamid=6620', '/?pageid=179&teamid=6807', '/?pageid=188&eventid=2238', '/?pageid=188&matchid=31340', '/?pageid=179&teamid=6807', '/?pageid=179&teamid=6620', '/?pageid=188&eventid=2238', '/?pageid=188&matchid=31329', '/?pageid=179&teamid=5995', '/?pageid=179&teamid=6736', '/?pageid=188&eventid=2135', '/?pageid=188&matchid=31336', '/?pageid=179&teamid=6998', '/?pageid=179&teamid=4602', '/?pageid=188&eventid=2262', '/?pageid=188&matchid=31334', '/?pageid=179&teamid=4602', '/?pageid=179&teamid=6998', '/?pageid=188&eventid=2262', '/?pageid=188&matchid=31331', '/?pageid=179&teamid=4674', '/?pageid=179&teamid=6133', '/?pageid=188&eventid=2254', '/?pageid=188&matchid=31330', '/?pageid=179&teamid=6133', '/?pageid=179&teamid=4674', '/?pageid=188&eventid=2254', '/?pageid=188&matchid=31333', '/?pageid=179&teamid=4501', '/?pageid=179&teamid=6686', '/?pageid=188&eventid=2232', '/?pageid=188&matchid=31332', '/?pageid=179&teamid=4501', '/?pageid=179&teamid=6686', '/?pageid=188&eventid=2232', '/?pageid=188&matchid=31319', '/?pageid=179&teamid=4411', '/?pageid=179&teamid=6615', '/?pageid=188&eventid=2135', '/?pageid=188&matchid=31321', '/?pageid=179&teamid=6133', '/?pageid=179&teamid=5929', '/?pageid=188&eventid=2262', '/?pageid=188&matchid=31320', '/?pageid=179&teamid=5929', '/?pageid=179&teamid=6133', '/?pageid=188&eventid=2262', '/?pageid=188&matchid=31318', '/?pageid=179&teamid=6615', '/?pageid=179&teamid=4411', '/?pageid=188&eventid=2135', '/?pageid=188&matchid=31328', 
'/?pageid=179&teamid=6222', '/?pageid=179&teamid=6408', '/?pageid=188&eventid=2232', '/?pageid=188&matchid=31327', '/?pageid=179&teamid=6408', '/?pageid=179&teamid=6222', '/?pageid=188&eventid=2232', '/?pageid=188&matchid=31326', '/?pageid=179&teamid=6621', '/?pageid=179&teamid=6968', '/?pageid=188&eventid=2252', '/?pageid=188&matchid=31325', '/?pageid=179&teamid=6621', '/?pageid=179&teamid=6968', '/?pageid=188&eventid=2252', '/?pageid=188&matchid=31324', '/?pageid=179&teamid=6619', '/?pageid=179&teamid=6785', '/?pageid=188&eventid=2252', '/?pageid=188&matchid=31322', '/?pageid=179&teamid=6619', '/?pageid=179&teamid=6785', '/?pageid=188&eventid=2252', '/?pageid=188&matchid=31317', '/?pageid=179&teamid=4548', '/?pageid=179&teamid=6407', '/?pageid=188&eventid=2254', '/?pageid=188&matchid=31316', '/?pageid=179&teamid=4548', '/?pageid=179&teamid=6407', '/?pageid=188&eventid=2254', '/?pageid=188&matchid=31315', '/?pageid=179&teamid=6995', '/?pageid=179&teamid=7009', '/?pageid=188&eventid=2253', '/?pageid=188&matchid=31306', '/?pageid=179&teamid=6995', '/?pageid=179&teamid=7009', '/?pageid=188&eventid=2253', '/?pageid=188&matchid=31314', '/?pageid=179&teamid=4501', '/?pageid=179&teamid=6686', '/?pageid=188&eventid=2262', '/?pageid=188&matchid=31310', '/?pageid=179&teamid=6686', '/?pageid=179&teamid=4501', '/?pageid=188&eventid=2262', '/?pageid=188&matchid=31304', '/?pageid=179&teamid=6889', '/?pageid=179&teamid=6994', '/?pageid=188&eventid=2253', '/?pageid=188&matchid=31302', '/?pageid=179&teamid=6994', '/?pageid=179&teamid=6889', '/?pageid=188&eventid=2253', '/?pageid=188&matchid=31313', '/?pageid=179&teamid=6865', '/?pageid=179&teamid=4688', '/?pageid=188&eventid=2255', '/?pageid=188&matchid=31312', '/?pageid=179&teamid=4688', '/?pageid=179&teamid=6865', '/?pageid=188&eventid=2255', '/?pageid=188&matchid=31311', '/?pageid=179&teamid=4688', '/?pageid=179&teamid=6865', '/?pageid=188&eventid=2255', '/?pageid=188&matchid=31309', '/?pageid=179&teamid=4602', 
'/?pageid=179&teamid=6408', '/?pageid=188&eventid=2232', '/?pageid=188&matchid=31308', '/?pageid=179&teamid=4602', '/?pageid=179&teamid=6408', '/?pageid=188&eventid=2232', '/?pageid=188&matchid=31307', '/?pageid=179&teamid=4602', '/?pageid=179&teamid=6408', '/?pageid=188&eventid=2232', '/?pageid=188&matchid=31305', '/?pageid=179&teamid=6133', '/?pageid=179&teamid=4548', '/?pageid=188&eventid=2254', '/?pageid=188&matchid=31303', '/?pageid=179&teamid=4548', '/?pageid=179&teamid=6133', '/?pageid=188&eventid=2254', '/?pageid=188&matchid=31294', '/?pageid=179&teamid=4869', '/?pageid=179&teamid=6137', '/?pageid=188&eventid=2176', '/?pageid=188&matchid=31293', '/?pageid=179&teamid=6137', '/?pageid=179&teamid=4869', '/?pageid=188&eventid=2176', '/?pageid=188&matchid=31292', '/?pageid=179&teamid=4869', '/?pageid=179&teamid=6137', '/?pageid=188&eventid=2176', '/?pageid=188&matchid=31291', '/?pageid=179&teamid=4548', '/?pageid=179&teamid=6407', '/?pageid=188&eventid=2232', '/?pageid=188&matchid=31290', '/?pageid=179&teamid=4548', '/?pageid=179&teamid=6407', '/?pageid=188&eventid=2232', '/?pageid=188&matchid=31289', '/?pageid=179&teamid=4548', '/?pageid=179&teamid=6407', '/?pageid=188&eventid=2232', '/?pageid=188&matchid=31301', '/?pageid=179&teamid=5996', '/?pageid=179&teamid=6981', '/?pageid=188&eventid=2273', '/?pageid=188&matchid=31300', '/?pageid=179&teamid=5996', '/?pageid=179&teamid=6981', '/?pageid=188&eventid=2273', '/?pageid=188&matchid=31299', '/?pageid=179&teamid=5996', '/?pageid=179&teamid=6869', '/?pageid=188&eventid=2273', '/?pageid=188&matchid=31298', '/?pageid=179&teamid=5996', '/?pageid=179&teamid=6869', '/?pageid=188&eventid=2273', '/?pageid=188&matchid=31297', '/?pageid=179&teamid=6981', '/?pageid=179&teamid=6792', '/?pageid=188&eventid=2273']
如果选择器不够具体,您还会得到如下这样的链接:
http://static.hltv.org//images/category/5.gif
对于这类绝对链接,用 + 在前面拼接 http://www.hltv.org
是无法正常工作的。
要只获取第一列(即 date 列)下的链接,我们可以将 a[href*=matchid=]
添加到 select 的选择器中。这个页面的 html 布局不太容易解析,因为没有我认为可靠的方式来定位数据,不过 matchid=
只出现在该列的 href 中,因此可以用它来筛选。
# Fetch the listing page. The parser is named explicitly so the result does
# not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(
    requests.get("http://www.hltv.org/?pageid=188&offset=1").content,
    "html.parser",
)
# The attribute value is quoted because "matchid=" is not a valid CSS
# identifier; left unquoted, strict selector parsers reject the selector.
cont = soup.select('div.covMainBoxContent a[href*="matchid="]')
print([a["href"] for a in cont])
要获取完整的网址(URL),您需要将基础网址与 href 拼接:
# urljoin lives in urllib.parse on Python 3 and in urlparse on Python 2.
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin

base = "http://www.hltv.org/"
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(
    requests.get("http://www.hltv.org/?pageid=188&offset=1").content,
    "html.parser",
)
# The attribute value is quoted because "matchid=" is not a valid CSS
# identifier; left unquoted, strict selector parsers reject the selector.
cont = soup.select('div.covMainBoxContent a[href*="matchid="]')
# Resolve each site-relative href against the base to get a full URL.
print([urljoin(base, a["href"]) for a in cont])
这给了你:
['http://www.hltv.org/?pageid=188&matchid=31342', 'http://www.hltv.org/?pageid=188&matchid=31343', 'http://www.hltv.org/?pageid=188&matchid=31339', 'http://www.hltv.org/?pageid=188&matchid=31338', 'http://www.hltv.org/?pageid=188&matchid=31341', 'http://www.hltv.org/?pageid=188&matchid=31340', 'http://www.hltv.org/?pageid=188&matchid=31329', 'http://www.hltv.org/?pageid=188&matchid=31336', 'http://www.hltv.org/?pageid=188&matchid=31334', 'http://www.hltv.org/?pageid=188&matchid=31331', 'http://www.hltv.org/?pageid=188&matchid=31330', 'http://www.hltv.org/?pageid=188&matchid=31333', 'http://www.hltv.org/?pageid=188&matchid=31332', 'http://www.hltv.org/?pageid=188&matchid=31319', 'http://www.hltv.org/?pageid=188&matchid=31321', 'http://www.hltv.org/?pageid=188&matchid=31320', 'http://www.hltv.org/?pageid=188&matchid=31318', 'http://www.hltv.org/?pageid=188&matchid=31328', 'http://www.hltv.org/?pageid=188&matchid=31327', 'http://www.hltv.org/?pageid=188&matchid=31326', 'http://www.hltv.org/?pageid=188&matchid=31325', 'http://www.hltv.org/?pageid=188&matchid=31324', 'http://www.hltv.org/?pageid=188&matchid=31322', 'http://www.hltv.org/?pageid=188&matchid=31317', 'http://www.hltv.org/?pageid=188&matchid=31316', 'http://www.hltv.org/?pageid=188&matchid=31315', 'http://www.hltv.org/?pageid=188&matchid=31306', 'http://www.hltv.org/?pageid=188&matchid=31314', 'http://www.hltv.org/?pageid=188&matchid=31310', 'http://www.hltv.org/?pageid=188&matchid=31304', 'http://www.hltv.org/?pageid=188&matchid=31302', 'http://www.hltv.org/?pageid=188&matchid=31313', 'http://www.hltv.org/?pageid=188&matchid=31312', 'http://www.hltv.org/?pageid=188&matchid=31311', 'http://www.hltv.org/?pageid=188&matchid=31309', 'http://www.hltv.org/?pageid=188&matchid=31308', 'http://www.hltv.org/?pageid=188&matchid=31307', 'http://www.hltv.org/?pageid=188&matchid=31305', 'http://www.hltv.org/?pageid=188&matchid=31303', 'http://www.hltv.org/?pageid=188&matchid=31294', 
'http://www.hltv.org/?pageid=188&matchid=31293', 'http://www.hltv.org/?pageid=188&matchid=31292', 'http://www.hltv.org/?pageid=188&matchid=31291', 'http://www.hltv.org/?pageid=188&matchid=31290', 'http://www.hltv.org/?pageid=188&matchid=31289', 'http://www.hltv.org/?pageid=188&matchid=31301', 'http://www.hltv.org/?pageid=188&matchid=31300', 'http://www.hltv.org/?pageid=188&matchid=31299', 'http://www.hltv.org/?pageid=188&matchid=31298', 'http://www.hltv.org/?pageid=188&matchid=31297']