This code is meant to loop through all of the result pages, then loop through the results table on each of those pages and grab all the data from the table, plus some information stored outside the table.
However, the resulting CSV file doesn't seem to have any sensible organization: each row has different categories of information in different columns. What I'm after is for every row to contain all of the categories of information defined (date, party, start date, end date, electoral district, registered association, whether the candidate was elected, candidate name, address, and financial agent). Some of this data is stored in the table on each page, while the rest (date, party, district, registered association) sits outside the table and needs to be associated with every candidate in every table row on each page. In addition, there doesn't seem to be any output at all for "elected", "address", or "financial agent", and I'm not sure where I've gone wrong.
I would really appreciate any help figuring out how to fix my code to achieve this output. Here it is:
from bs4 import BeautifulSoup
import requests
import re
import csv

url = "http://www.elections.ca/WPAPPS/WPR/EN/NC?province=-1&distyear=2013&district=-1&party=-1&pageno={}&totalpages=55&totalcount=1368&secondaryaction=prev25"

rows = []

for i in range(1, 56):
    print(i)
    r = requests.get(url.format(i))
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    links = []

    for link in soup.find_all('a', href=re.compile('selectedid=')):
        links.append("http://www.elections.ca" + link.get('href'))

    for link in links:
        r = requests.get(link)
        data = r.text
        cat = BeautifulSoup(data, "html.parser")
        header = cat.find_all('span')
        tables = cat.find_all("table")[0].find_all("td")

        rows.append({
            #"date":
            header[2].contents[0],
            #"party":
            re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
            #"start_date":
            header[3].contents[0],
            #"end_date":
            header[5].contents[0],
            #"electoral district":
            re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
            #"registered association":
            re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
            #"elected":
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="elected/1")[0].contents[0]).strip(),
            #"name":
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="name/1")[0].contents[0]).strip(),
            #"address":
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="address/1")[0].contents[0]).strip(),
            #"financial_agent":
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="fa/1")[0].contents[0]).strip()
        })

with open('scrapeOutput.csv', 'w') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerows(rows)
Answer 0 (score: 1)
I think your dictionaries are a bit messed up: you aren't assigning any keys. (Without key/value pairs, a {...} literal is actually a Python set, so its values come out in an arbitrary order, which is why your columns end up scrambled.) As a reminder, if you convert a dictionary to a list, python will sort the entries alphabetically by key. But with the csv library you can write the csv easily without having to do any of that.
So, assign the keys:
rows.append({
    "date":
        header[2].contents[0],
    "party":
        re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
    "start_date":
        header[3].contents[0],
    "end_date":
        header[5].contents[0],
    "electoral district":
        re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
    "registered association":
        re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
    "elected":
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="elected/1")[0].contents[0]).strip(),
    "name":
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="name/1")[0].contents[0]).strip(),
    "address":
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="address/1")[0].contents[0]).strip(),
    "financial_agent":
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="fa/1")[0].contents[0]).strip()
})
Then write your csv with DictWriter:
with open('scrapeOutput.csv', 'w') as f_output:
    csv_output = csv.DictWriter(f_output, rows[0].keys())
    csv_output.writeheader()  # Write header to understand the csv
    csv_output.writerows(rows)
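If the column order matters (the question lists a specific order of categories), a small variant sketch passes an explicit fieldnames list instead of rows[0].keys(); the key names are the same ones assigned above:

with open('scrapeOutput.csv', 'w', newline='') as f_output:
    # Fix the column order explicitly instead of relying on rows[0].keys()
    fieldnames = ["date", "party", "start_date", "end_date",
                  "electoral district", "registered association",
                  "elected", "name", "address", "financial_agent"]
    csv_output = csv.DictWriter(f_output, fieldnames=fieldnames)
    csv_output.writeheader()
    csv_output.writerows(rows)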
I tested this and it works, but note that some of your fields, such as the address or elected fields, are empty :)
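As for those empty elected and address fields: judging by the markup targeted in the scrapy-based answer below, the elected cell holds an <img> marking the winner rather than text, and the address cell contains nested tags, so contents[0] only picks up whitespace. A rough, untested sketch of that idea, placed inside the per-link loop where cat is the parsed detail page:

table = cat.find_all("table")[0]
# The "elected" column is signalled by an image, not by cell text
elected_cell = table.find_all("td", headers="elected/1")[0]
elected = elected_cell.find("img", alt=re.compile("won this nomination contest")) is not None
# Join all of the text inside the address cell instead of taking contents[0]
address = re.sub("[\n\r/]", "", table.find_all("td", headers="address/1")[0].get_text()).strip()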
See ya!
Answer 1 (score: 1)
If you want to crawl the site, you may want to take a look at CrawlSpider from scrapy. I am also using lxml.html, simply because it offers more flexibility.

To install these libraries, you can use:

pip install scrapy
pip install lxml

To create a basic scrapy project, you can use the command:

scrapy startproject elections

Then add the spider and the items:

elections/spiders/spider.py
from scrapy.spiders import CrawlSpider, Rule
from elections.items import ElectionsItem
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from lxml import html

class ElectionsSpider(CrawlSpider):
    name = "elections"
    allowed_domains = ["elections.ca"]
    start_urls = ["http://www.elections.ca/WPAPPS/WPR/EN/NC/Details?province=-1&distyear=2013&district=-1&party=-1&pageno=1&totalpages=55&totalcount=1372&viewall=1"]

    rules = (
        Rule(LxmlLinkExtractor(
                allow = ('http://www.elections.ca/WPAPPS/WPR/EN/NC/Details.*'),
            ),
            callback='parse_item',
            follow=True
        ),
    )

    def unindent(self, string):
        return ''.join(map(str.strip, string.encode('utf8').splitlines(1)))

    def parse_item(self, response):
        item = ElectionsItem()

        original_html = Selector(response).extract()
        lxml_obj = html.fromstring(original_html)

        for entry in lxml_obj.xpath('.//fieldset[contains(@class,"wpr-detailgroup")]'):

            date = entry.xpath('.//legend[contains(@class,"wpr-ltitle")]/span[contains(@class,"date")]')
            if date:
                item['date'] = self.unindent(date[0].text.strip())

            party = entry.xpath('.//legend[contains(@class,"wpr-ltitle")]')
            if party:
                item['party'] = self.unindent(party[0].text.strip())

            start_date = entry.xpath('.//div[contains(@class,"group")]/span[contains(@class,"date")][1]')
            if start_date:
                item['start_date'] = self.unindent(start_date[0].text.strip())

            end_date = entry.xpath('.//div[contains(@class,"group")]/span[contains(@class,"date")][2]')
            if end_date:
                item['end_date'] = self.unindent(end_date[0].text.strip())

            electoral_district = entry.xpath('.//div[contains(@class,"wpr-title")][contains(text(),"Electoral district:")]')
            if electoral_district:
                item['electoral_district'] = self.unindent(electoral_district[0].tail.strip())

            registered_association = entry.xpath('.//div[contains(@class,"wpr-title")][contains(text(),"Registered association:")]')
            if registered_association:
                item['registered_association'] = self.unindent(registered_association[0].tail.strip())

            for candidate in entry.xpath('.//table[contains(@class, "wpr-datatable")]//tr[not(@class)]'):

                item['elected'] = len(candidate.xpath('.//img[contains(@alt, "contestant won this nomination contest")]'))

                candidate_name = candidate.xpath('.//td[contains(@headers,"name")]')
                if candidate_name:
                    item['candidate_name'] = self.unindent(candidate_name[0].text.strip())

                item['address'] = self.unindent(candidate.xpath('.//td[contains(@headers,"address")]')[0].text_content().strip())
                item['financial_agent'] = self.unindent(candidate.xpath('.//td[contains(@headers,"fa")]')[0].text_content().strip())

                yield item
elections/items.py
from scrapy.item import Item, Field

class ElectionsItem(Item):
    date = Field()
    party = Field()
    start_date = Field()
    end_date = Field()
    electoral_district = Field()
    registered_association = Field()
    elected = Field()
    candidate_name = Field()
    address = Field()
    financial_agent = Field()
elections/settings.py
BOT_NAME = 'elections'

SPIDER_MODULES = ['elections.spiders']
NEWSPIDER_MODULE = 'elections.spiders'

ITEM_PIPELINES = {
    'elections.pipelines.ElectionsPipeline': 300,
}
elections/pipelines.py

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exporters import CsvItemExporter

class ElectionsPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_ads.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
You can run the spider with the command:

scrapy runspider elections/spiders/spider.py

from the root of your project. It should create an elections.csv file in the root of the project.
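If the custom pipeline feels like more machinery than you need, scrapy's built-in feed export is an alternative sketch; run from the root of the project:

scrapy crawl elections -o elections.csv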
Answer 2 (score: 1)
I would suggest writing each row to the output CSV file as you go, rather than waiting until the end. It is also better to hold the data in a list rather than a dictionary; that way the ordering of the columns is preserved.
from bs4 import BeautifulSoup
import requests
import re
import csv

url = "http://www.elections.ca/WPAPPS/WPR/EN/NC?province=-1&distyear=2013&district=-1&party=-1&pageno={}&totalpages=55&totalcount=1368&secondaryaction=prev25"

with open('scrapeOutput.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)

    for i in range(1, 56):
        print(i)
        r = requests.get(url.format(i))
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
        links = []

        for link in soup.find_all('a', href=re.compile('selectedid=')):
            links.append("http://www.elections.ca" + link.get('href'))

        for link in links:
            r = requests.get(link)
            data = r.text
            cat = BeautifulSoup(data, "html.parser")
            header = cat.find_all('span')
            tables = cat.find_all("table")[0].find_all("td")

            row = [
                #"date":
                header[2].contents[0],
                #"party":
                re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
                #"start_date":
                header[3].contents[0],
                #"end_date":
                header[5].contents[0],
                #"electoral district":
                re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
                #"registered association":
                re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
                #"elected":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="elected/1")[0].contents[0]).strip(),
                #"name":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="name/1")[0].contents[0]).strip(),
                #"address":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="address/1")[0].contents[0]).strip(),
                #"financial_agent":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="fa/1")[0].contents[0]).strip()]

            csv_output.writerow(row)
            print(row)
This results in a CSV file that starts as follows:
"December 08, 2016",Green Party,"September 21, 2016","December 08, 2016",Calgary Midnapore,b'Calgary Midnapore',,Ryan Zedic,,
"November 29, 2016",NDP-New Democratic Party,"August 24, 2016","November 29, 2016",Ottawa--Vanier,b'Ottawa--Vanier',,Emilie Taman,,
"September 28, 2016",Green Party,"September 04, 2016","September 28, 2016",Medicine Hat--Cardston--Warner,b'Medicine Hat--Cardston--Warner',,Kelly Dawson,,