Please bear with me. I am new to Python - but having a lot of fun with it. I am trying to write a web-scraping script that collects the election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. Now I want Python to follow each of the 92 links and gather 9 pieces of information from every page, but I am stuck. Hopefully you can give me a hint.
Here is my code:
import urllib2
from bs4 import BeautifulSoup

# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())

my_list = []
all_links = soup.find_all("a")
for link in all_links:
    link2 = link["href"]
    my_list.append(link2)

for i in my_list[1:93]:
    print i
The output shows all the links that I would like to follow and gather information from. How do I do that?
Answer 0 (score: 5)
Here is my solution using lxml. It is similar to the BeautifulSoup approach:
import requests
from lxml import html

page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[@class="LetterGroup"]//a/@href')  # grab all links
print 'Length of all links = ', len(my_list)
my_list is now a list of all the links. You can use a for loop to scrape the information inside each page.
We can loop through each of the links and extract information from every page. The example below handles only the table at the top of the page:
table_information = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    table_key = tree.xpath('//td[@class="statusHeader"]/text()')
    table_value = tree.xpath('//td[@class="statusText"]/text()') + tree.xpath('//td[@class="statusText"]/a/text()')
    table_information.append(zip([t]*len(table_key), table_key, table_value))
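As a quick sanity check, the nested result can be flattened for inspection (a sketch; it assumes the two xpath queries above returned matching key/value lists):

# Sketch: print each scraped field as "url | key | value".
for page in table_information:
    for url, key, value in page:
        print '%s | %s | %s' % (url, key, value)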
For the table further down the page:
table_information_below = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    l1 = tree.xpath('//tr[@class="tableRowPrimary"]/td[@class="StemmerNu"]/text()')
    l2 = tree.xpath('//tr[@class="tableRowSecondary"]/td[@class="StemmerNu"]/text()')
    table_information_below.append([t]+l1+l2)
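If you want to persist the result, a minimal sketch could write one semicolon-separated line per page (the ISO-8859-1 encoding is an assumption, borrowed from the question's final code):

import codecs

# Sketch: one semicolon-separated line per scraped page; encoding is an assumption.
with codecs.open('votes.txt', 'w', encoding='iso-8859-1') as out:
    for row in table_information_below:
        out.write(';'.join(row) + '\r\n')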
Hope this helps!
Answer 1 (score: 4)
A simple approach is to iterate over your list of URLs and parse them one by one:
for url in my_list:
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    # then parse each page individually here
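For example, a minimal per-page parser could reuse the statusHeader/statusText class names from the lxml answer above (an untested sketch; those class names may not match every page):

results = {}
for url in my_list:
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    # pair each header cell with its value cell; the class names are assumptions
    keys = [td.get_text(strip=True) for td in soup.find_all('td', class_='statusHeader')]
    values = [td.get_text(strip=True) for td in soup.find_all('td', class_='statusText')]
    results[url] = dict(zip(keys, values))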
Alternatively, you can use Futures to speed things up significantly:
from requests_futures.sessions import FuturesSession

def my_parse_function(html):
    """Use this function to parse each page"""
    soup = BeautifulSoup(html, 'html.parser')
    all_paragraphs = soup.find_all('p')
    return all_paragraphs

session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]

# future.result() is a requests Response, so pass its content to the parser
page_results = [my_parse_function(future.result().content) for future in futures]
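Since these requests run in parallel, it is also worth guarding against slow or failing pages. A hedged sketch:

# Sketch: same idea with a timeout and basic error handling per future.
page_results = []
for future in futures:
    try:
        response = future.result(timeout=30)  # concurrent.futures timeout
        page_results.append(my_parse_function(response.content))
    except Exception as exc:
        print 'request failed: %s' % exc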
Answer 2 (score: 2)
This is how I would approach the problem:
import requests
from bs4 import BeautifulSoup

def spider():
    url = "http://www.kmdvalg.dk/main"
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for link in soup.findAll('div', {'class': 'LetterGroup'}):
        anc = link.find('a')
        href = anc.get('href')
        print(anc.getText())
        print(href)
        # call a second function from here that is similar to this one (with url set to href)
        spider2(href)
        print("\n")

def spider2(linktofollow):
    url = linktofollow
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for link in soup.findAll('tr', {'class': 'tableRowPrimary'}):
        anc = link.find('td')
        print(anc.getText())
    print("\n")

spider()
It is not finished... I only grab a single element from each table, but you get the idea of how it should work.
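To pull every cell in a row rather than just the first td, spider2 could be extended like this (a sketch under the same assumptions about the page markup):

def spider2(linktofollow):
    soup = BeautifulSoup(requests.get(linktofollow).text, 'html.parser')
    for row in soup.findAll('tr', {'class': 'tableRowPrimary'}):
        # gather the text of every cell in the row, not just the first one
        cells = [td.getText().strip() for td in row.findAll('td')]
        print(cells)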
Answer 3 (score: 1)
Here is my final code, which runs smoothly. Let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs

f = codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1")
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())

liste = []
alle_links = soup.find_all("a")
for link in alle_links:
    link2 = link["href"]
    liste.append(link2)

for url in liste[1:93]:
    soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
    tds = soup.findAll('td')
    stemmernu = soup.findAll('td', class_='StemmerNu')
    print >> f, tds[5].string, ";", tds[12].string, ";", tds[14].string, ";", tds[16].string, ";", \
        stemmernu[0].string, ";", stemmernu[1].string, ";", stemmernu[2].string, ";", \
        stemmernu[3].string, ";", stemmernu[6].string, ";", stemmernu[8].string, ";", '\r\n'
f.close()
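One way to make it smarter (a sketch, not tested against the live site): look fields up by the header text next to them instead of hard-coded indices like tds[5], which silently break if the page layout shifts.

def page_to_dict(soup):
    # Sketch: map each statusHeader cell to its statusText neighbour so fields
    # are fetched by name; the class names are taken from the answers above.
    keys = [td.get_text(strip=True) for td in soup.findAll('td', class_='statusHeader')]
    values = [td.get_text(strip=True) for td in soup.findAll('td', class_='statusText')]
    return dict(zip(keys, values))

# usage inside the loop above: info = page_to_dict(soup)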