Web scraping: messy CSV output

Time: 2016-12-20 08:21:39

Tags: python html python-3.x csv web-scraping

This code is meant to loop over all of the result pages, then over the results table on each page, and pull all of the data out of the table along with some information stored outside the table.

However, the resulting CSV file doesn't seem to have any sensible organization: each row has different categories of information in different columns. What I'm after is for every row to contain all of the defined categories of information (date, party, start date, end date, electoral district, registered association, whether the candidate was elected, candidate name, address, and financial agent). Some of this data is stored in the table on each page, while the rest (date, party, electoral district, registered association) sits outside the table and needs to be associated with every candidate in every table row on each page. In addition, there doesn't appear to be any output at all for "elected", "address", or "financial agent", and I'm not sure where I've gone wrong.

I would really appreciate any help figuring out how to fix my code to achieve this output. Here it is:

from bs4 import BeautifulSoup
import requests
import re
import csv

url = "http://www.elections.ca/WPAPPS/WPR/EN/NC?province=-1&distyear=2013&district=-1&party=-1&pageno={}&totalpages=55&totalcount=1368&secondaryaction=prev25"

rows = []

for i in range(1, 56):
    print(i)
    r  = requests.get(url.format(i))
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    links = []

    for link in soup.find_all('a', href=re.compile('selectedid=')):
        links.append("http://www.elections.ca" + link.get('href'))

    for link in links:
        r  = requests.get(link)
        data = r.text
        cat = BeautifulSoup(data, "html.parser")
        header = cat.find_all('span')
        tables = cat.find_all("table")[0].find_all("td")        

        rows.append({
            #"date": 
            header[2].contents[0],
            #"party": 
            re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
            #"start_date": 
            header[3].contents[0],
            #"end_date": 
            header[5].contents[0],
            #"electoral district": 
            re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
            #"registered association": 
            re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
            #"elected": 
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="elected/1")[0].contents[0]).strip(),
            #"name": 
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="name/1")[0].contents[0]).strip(),
            #"address": 
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="address/1")[0].contents[0]).strip(),
            #"financial_agent": 
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="fa/1")[0].contents[0]).strip()
        })

with open('scrapeOutput.csv', 'w') as f_output:
   csv_output = csv.writer(f_output)
   csv_output.writerows(rows)

3 Answers:

Answer 0 (score: 1)

I think your dictionaries are a bit messed up: you never assign the keys (they are commented out, so what you are actually building is a set, not a dict). Keep in mind as well that a plain dict does not guarantee any particular column order when you iterate over it. With the csv library's DictWriter, though, you can write the CSV easily without having to worry about any of that.

So assign the keys:

rows.append({
        "date": 
        header[2].contents[0],
        "party": 
        re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
        "start_date": 
        header[3].contents[0],
        "end_date": 
        header[5].contents[0],
        "electoral district": 
        re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
        "registered association": 
        re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
        "elected": 
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="elected/1")[0].contents[0]).strip(),
        "name": 
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="name/1")[0].contents[0]).strip(),
        "address": 
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="address/1")[0].contents[0]).strip(),
        "financial_agent": 
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="fa/1")[0].contents[0]).strip()
    })

Then write your CSV with DictWriter:

with open('scrapeOutput.csv', 'w') as f_output:
    csv_output = csv.DictWriter(f_output, rows[0].keys())
    csv_output.writeheader() # Write header to understand the csv
    csv_output.writerows(rows)

I tested this and it works, but note that some of your fields, such as address or elected, are empty :)
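If you also want those empty columns filled in, one possible tweak (a rough sketch, untested against the live page, and borrowing the "winner image" observation from the scrapy answer below) is to loop over the table rows, read each cell with get_text() instead of contents[0], and treat a candidate as elected when the cell contains an image:

table = cat.find_all("table")[0]
for tr in table.find_all("tr"):
    name_td = tr.find("td", headers="name/1")
    if name_td is None:
        continue  # header or spacer row with no candidate cell
    name = name_td.get_text(" ", strip=True)
    # these cells may hold several text nodes, so get_text() is safer than contents[0]
    address = tr.find("td", headers="address/1").get_text(" ", strip=True)
    agent = tr.find("td", headers="fa/1").get_text(" ", strip=True)
    # the "elected" cell is marked with an image rather than text
    elected_td = tr.find("td", headers="elected/1")
    elected = bool(elected_td and elected_td.find("img"))

That also gives you one record per candidate row, which you can then combine with the page-level fields (date, party, district, association) before appending to rows.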

See ya!

Answer 1 (score: 1)

If you want to go the crawling route, you might want to have a look at CrawlSpider from scrapy. I also use lxml.html here, simply because it offers a bit more flexibility.

To install these libraries, you can use:

pip install scrapy

pip install lxml

To build a basic scrapy project, you can use the command:

scrapy startproject elections

Then add the spider and items:

elections/spiders/spider.py

from scrapy.spiders import CrawlSpider, Rule
from elections.items import ElectionsItem
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector

from lxml import html

class ElectionsSpider(CrawlSpider):
    name = "elections"
    allowed_domains = ["elections.ca"]
    start_urls = ["http://www.elections.ca/WPAPPS/WPR/EN/NC/Details?province=-1&distyear=2013&district=-1&party=-1&pageno=1&totalpages=55&totalcount=1372&viewall=1"]

    rules = (

        Rule(LxmlLinkExtractor(
                allow = ('http://www.elections.ca/WPAPPS/WPR/EN/NC/Details.*'),
            ),
            callback='parse_item',
            follow=True
        ),


      )

    def unindent(self, string):
        # strip the indentation from every line of the extracted text and re-join
        return ''.join(map(str.strip, string.splitlines(True)))

    def parse_item(self, response):

        item = ElectionsItem()

        original_html = Selector(response).extract()

        lxml_obj = html.fromstring(original_html)

        for entry in lxml_obj.xpath('.//fieldset[contains(@class,"wpr-detailgroup")]'):


            date = entry.xpath('.//legend[contains(@class,"wpr-ltitle")]/span[contains(@class,"date")]')
            if date:
                item['date'] = self.unindent(date[0].text.strip())
            party = entry.xpath('.//legend[contains(@class,"wpr-ltitle")]')
            if party:
                item['party'] = self.unindent(party[0].text.strip())
            start_date = entry.xpath('.//div[contains(@class,"group")]/span[contains(@class,"date")][1]')
            if start_date:
                item['start_date'] = self.unindent(start_date[0].text.strip())
            end_date = entry.xpath('.//div[contains(@class,"group")]/span[contains(@class,"date")][2]')
            if end_date:
                item['end_date'] = self.unindent(end_date[0].text.strip())
            electoral_district = entry.xpath('.//div[contains(@class,"wpr-title")][contains(text(),"Electoral district:")]')
            if electoral_district:
                item['electoral_district'] = self.unindent(electoral_district[0].tail.strip())
            registered_association = entry.xpath('.//div[contains(@class,"wpr-title")][contains(text(),"Registered association:")]')
            if registered_association:
                item['registered_association'] = self.unindent(registered_association[0].tail.strip())

            for candidate in entry.xpath('.//table[contains(@class, "wpr-datatable")]//tr[not(@class)]'):

                item['elected'] = len(candidate.xpath('.//img[contains(@alt, "contestant won this nomination contest")]'))
                candidate_name = candidate.xpath('.//td[contains(@headers,"name")]')
                if candidate_name:
                    item['candidate_name'] = self.unindent(candidate_name[0].text.strip())
                item['address'] = self.unindent(candidate.xpath('.//td[contains(@headers,"address")]')[0].text_content().strip())
                item['financial_agent'] = self.unindent(candidate.xpath('.//td[contains(@headers,"fa")]')[0].text_content().strip())

                yield item
  

elections/items.py

from scrapy.item import Item, Field

class ElectionsItem(Item):

    date = Field()
    party = Field()
    start_date = Field()
    end_date = Field()
    electoral_district = Field()
    registered_association = Field()
    elected = Field()
    candidate_name = Field()
    address = Field()
    financial_agent = Field()
  

elections/settings.py

BOT_NAME = 'elections'

SPIDER_MODULES = ['elections.spiders']
NEWSPIDER_MODULE = 'elections.spiders'

ITEM_PIPELINES = {
   'elections.pipelines.ElectionsPipeline': 300,
}

elections/pipelines.py
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exporters import CsvItemExporter

class ElectionsPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_ads.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

You can run the spider with the command:

scrapy runspider elections/spiders/spider.py

from the root of the project. It should create the output CSV (elections_ads.csv, given the filename pattern used in the pipeline) in the root of your project.
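As an aside, if all you need is the CSV, scrapy's built-in feed export can write it directly and the custom pipeline becomes optional:

scrapy runspider elections/spiders/spider.py -o elections.csv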

Answer 2 (score: 1)

I would recommend writing each row to your output CSV file as you go, rather than waiting until the very end. Also, it is better to hold the data in a list rather than a dictionary: that way the column order is preserved.

from bs4 import BeautifulSoup
import requests
import re
import csv


url = "http://www.elections.ca/WPAPPS/WPR/EN/NC?province=-1&distyear=2013&district=-1&party=-1&pageno={}&totalpages=55&totalcount=1368&secondaryaction=prev25"

with open('scrapeOutput.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)

    for i in range(1, 56):
        print(i)
        r  = requests.get(url.format(i))
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
        links = []

        for link in soup.find_all('a', href=re.compile('selectedid=')):
            links.append("http://www.elections.ca" + link.get('href'))

        for link in links:
            r  = requests.get(link)
            data = r.text
            cat = BeautifulSoup(data, "html.parser")
            header = cat.find_all('span')
            tables = cat.find_all("table")[0].find_all("td")        

            row = [
                #"date": 
                header[2].contents[0],
                #"party": 
                re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
                #"start_date": 
                header[3].contents[0],
                #"end_date": 
                header[5].contents[0],
                #"electoral district": 
                re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
                #"registered association": 
                re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
                #"elected": 
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="elected/1")[0].contents[0]).strip(),
                #"name": 
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="name/1")[0].contents[0]).strip(),
                #"address": 
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="address/1")[0].contents[0]).strip(),
                #"financial_agent": 
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="fa/1")[0].contents[0]).strip()]

            csv_output.writerow(row)    
            print(row)

This gives you a CSV that starts as follows:

"December 08, 2016",Green Party,"September 21, 2016","December 08, 2016",Calgary Midnapore,b'Calgary Midnapore',,Ryan Zedic,,
"November 29, 2016",NDP-New Democratic Party,"August 24, 2016","November 29, 2016",Ottawa--Vanier,b'Ottawa--Vanier',,Emilie Taman,,
"September 28, 2016",Green Party,"September 04, 2016","September 28, 2016",Medicine Hat--Cardston--Warner,b'Medicine Hat--Cardston--Warner',,Kelly Dawson,,