将每个表写入CSV,并在每个表之前写入其对应的标题

时间:2018-09-26 22:14:51

标签: python python-3.x csv

我正在尝试使用BS4抓取一些数据,然后将其写入CSV。我希望写入CSV的格式与这个网站(this website)上的排版类似。

也就是说,应先写入一个h3标题,紧接着是该标题对应的表格,然后是下一个h3标题及其表格,依此类推。但我得到的输出却是这样的:

Total rainfall in millimetres for SherkinIsland 
Mean temperature in degrees Celsius for SherkinIsland   
Mean 10cm soil temperature for SherkinIsland at 0900 UTC    
Global Solar Radiation in Joules/cm2 for SherkinIsland  
Potential Evapotranspiration (mm) for SherkinIsland 
Evaporation (mm) for SherkinIsland  
Notes on the Data
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,199.1,67.2,116.6,129.3,93.0,17.2,48.8,62.5,82.1,,,,815.8
2017,66.7,78.5,132.7,14.6,39.2,112.3,89.9,78.6,150.8,115.5,51.9,147.5,1078.2
2016,185.8,113.0,61.5,68.8,59.4,61.5,69.7,111.1,111.1,64.4,43.3,78.3,1027.9
2015,106.6,78.0,88.9,18.5,110.0,77.4,127.0,87.0,121.2,52.8,107.7,292.7,1267.8
mean,132.7,101.4,94.7,73.7,73.7,75.1,78.0,88.3,92.4,127.6,120.1,130.3,1188.0
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,8.2,6.1,5.7,9.2,12.1,15.4,17.1,15.0,13.6,,,,11.4
2017,8.1,8.2,9.2,9.8,12.2,14.0,14.9,14.6,13.5,12.6,9.4,8.1,11.2
2016,8.4,7.0,7.5,8.5,12.0,14.3,14.4,15.2,14.5,12.3,8.0,9.5,11.0
2015,7.5,6.5,7.7,9.4,10.9,12.9,14.2,14.3,13.8,12.3,11.2,10.3,10.9
mean,7.5,7.5,8.4,9.4,11.7,13.9,15.5,15.7,14.3,12.0,9.5,8.0,11.1
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,6.9,4.7,5.1,9.5,13.4,17.3,19.0,16.2,n/a,,,,11.6
2017,7.5,7.9,8.4,10.2,12.9,15.4,16.2,15.3,13.4,12.3,8.5,6.8,11.3
2016,7.4,5.9,6.6,8.5,13.0,15.6,15.8,15.8,14.6,11.8,7.7,8.8,11.0
2015,6.6,5.3,6.8,9.3,11.7,14.5,14.8,14.7,13.1,11.2,10.6,9.6,10.7
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,9023,15831,29709,42026,58669,67070,65526,44784,29711,,,,362349
2017,8345,14868,28307,43479,57060,59325,57794,46218,33526,15375,11157,7084,382538
2016,7262,16452,27956,48481,60218,56262,53776,48503,25866,19137,12859,5660,382432
2015,8882,13475,30056,50190,55679,57207,57047,49551,33798,19483,8962,5121,389451
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,21.8,28.5,34.9,49.9,76.3,98.8,104.6,64.5,42.9,,,,522.2
2017,20.6,25.2,40.5,59.4,75.1,80.5,79.1,63.5,46.3,26.2,38.7,18.7,573.8
2016,20.8,27.3,39.7,61.4,77.3,81.1,73.7,68.6,43.9,39.0,23.5,21.0,577.3
2015,23.5,21.0,38.1,59.8,67.1,73.3,76.1,66.2,53.0,34.4,25.6,24.1,562.2
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,30.5,41.0,55.4,81.0,116.3,143.1,147.9,96.8,64.3,,,,776.3
2017,27.1,37.8,64.0,88.6,117.8,127.9,122.2,97.5,71.3,39.2,46.4,24.6,864.4
2016,28.7,41.0,61.1,96.8,118.9,122.4,112.7,104.8,64.3,52.8,30.3,26.7,860.5
2015,32.7,31.1,60.5,95.8,113.2,115.7,120.8,101.4,75.9,47.2,35.1,32.8,862.2
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a

Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total
2018,226,262,303,188,115,38,12,30,N/A,N/A,N/A,N/A,N/A
2017,228,206,195,170,105,55,34,37,63,90,183,230,1596
2016,220,247,247,210,112,44,44,28,41,99,226,185,1702
2015,247,253,243,182,143,82,48,46,57,100,130,162,1693

我的源代码是:

import time
from os import getcwd
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas

import time, re
import csv
import uuid


class metEiren():
    """Headless-Firefox scraper for the met.ie monthly-data page.

    Creating an instance starts the browser and publishes it, together with
    the download directory, as the module globals ``driver`` and
    ``downloadDir``. ``scrap`` then dumps the "Sherkin Island" page to a CSV
    file.
    """

    def __init__(self):
        """Start headless Firefox and load the monthly-data index page."""
        global downloadDir, driver

        print("hurray33")
        downloadDir = ""

        # Profile preferences controlling how the browser handles downloads.
        profile = webdriver.FirefoxProfile()
        for pref, value in (
            ("browser.download.folderList", 2),
            ("browser.download.manager.showWhenStarting", False),
            ("browser.download.dir", downloadDir),
            ("browser.helperApps.neverAsk.saveToDisk", "text/csv"),
        ):
            profile.set_preference(pref, value)

        headless = Options()
        headless.add_argument("--headless")

        driver = webdriver.Firefox(firefox_profile=profile, firefox_options=headless)
        driver.get("https://www.met.ie/climate/available-data/monthly-data")

    def scrap(self):
        """Open the "Sherkin Island" page and write its content to a CSV file.

        The file is named with a random UUID and placed under ``downloadDir``.
        All ``<h3>`` texts are collected into one list and the rows of *all*
        tables into a second, flat list. For every heading the file then
        receives the heading (split on commas) followed by every collected
        row, so each heading is followed by the rows of all tables rather
        than only its own table.
        """
        driver.get("https://www.met.ie/climate/available-data/monthly-data")
        driver.execute_script("window.scrollTo(0, 1000)")
        station_link = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.LINK_TEXT, "Sherkin Island"))
        )
        station_link.click()
        # Fixed pause after the click before the page source is read.
        time.sleep(2)

        out_path = downloadDir + str(uuid.uuid4()) + ".csv"
        page = BeautifulSoup(driver.page_source, 'html.parser')

        titles = []
        for heading in page.find_all('h3'):
            print(heading.text)
            titles.append(heading.text)

        # One flat list holding the rows of every table on the page.
        all_rows = [
            [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
            for tbl in page.find_all('table')
            for tr in tbl.find_all('tr')
        ]

        with open(out_path, 'w', newline='') as out:
            writer = csv.writer(
                out,
                delimiter=',',
                quoting=csv.QUOTE_NONE,
                escapechar=',',
                lineterminator='\n',
            )
            for title in titles:
                writer.writerow(title.strip(',').split(','))
                writer.writerows(all_rows)

if __name__ == '__main__':
    obj = metEiren()
    try:
        obj.scrap()
    finally:
        # metEiren.__init__ publishes the browser as the module global
        # `driver`; quit it so the headless Firefox process does not
        # outlive the script, even when scraping fails.
        driver.quit()

任何帮助将不胜感激,谢谢

1 个答案:

答案 0 :(得分:2)

您需要为每个表单独保存它的行,得到一个"由表组成的列表",而不是把所有表的行都追加到同一个列表中。然后,您可以使用zip()将标题与表一一配对,每次取出一个标题及其对应的表,并依次写入输出的CSV文件。这比使用range()按索引遍历更好。

import time
from os import getcwd
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas


import time, re
import csv
import uuid


class metEiren():
    """Scrape the met.ie "Sherkin Island" monthly-data page into a CSV file.

    Creating an instance starts a headless Firefox. The browser and the
    download directory are stored on the instance and, for backward
    compatibility, also published as the module globals ``driver`` and
    ``downloadDir``. Call ``close()`` when finished to shut the browser down.
    """

    def __init__(self):
        """Start headless Firefox and load the monthly-data index page."""
        global downloadDir, driver

        print("hurray33")
        downloadDir = ""
        self.download_dir = downloadDir

        # Profile preferences controlling how the browser handles downloads.
        fp = webdriver.FirefoxProfile()
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference("browser.download.manager.showWhenStarting", False)
        fp.set_preference("browser.download.dir", downloadDir)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")

        options = Options()
        options.add_argument("--headless")

        # NOTE(review): `firefox_profile` / `firefox_options` are the keyword
        # names this script was written against; confirm they are still
        # accepted by the installed Selenium version.
        driver = webdriver.Firefox(firefox_profile=fp, firefox_options=options)
        self.driver = driver
        driver.get("https://www.met.ie/climate/available-data/monthly-data")

    def close(self):
        """Quit the browser started by ``__init__``."""
        self.driver.quit()

    def scrape(self):
        """Write every heading of the station page followed by its table.

        Opens the "Sherkin Island" link, collects the ``<h3>`` texts and the
        rows of each ``<table>``, and writes them pairwise to a CSV file
        named with a random UUID under the download directory.

        Returns:
            The path of the CSV file that was written.
        """
        browser = self.driver
        browser.get("https://www.met.ie/climate/available-data/monthly-data")
        browser.execute_script("window.scrollTo(0, 1000)")
        wait = WebDriverWait(browser, 10)
        link = wait.until(EC.presence_of_element_located((By.LINK_TEXT, "Sherkin Island")))
        link.click()
        # Fixed pause after the click before the page source is read.
        time.sleep(2)

        filname = self.download_dir + str(uuid.uuid4()) + ".csv"
        soup = BeautifulSoup(browser.page_source, 'html.parser')

        headers = self._extract_headers(soup)
        tables = self._extract_tables(soup)
        self._write_sections(filname, headers, tables)
        return filname

    @staticmethod
    def _extract_headers(soup):
        """Return the whitespace-stripped text of every ``<h3>`` in *soup*."""
        headers = []
        for h in soup.find_all('h3'):
            print(h.text)
            headers.append(h.text.strip())
        return headers

    @staticmethod
    def _extract_tables(soup):
        """Return one list of rows (each a list of cell texts) per ``<table>``."""
        tables = []
        for table in soup.find_all('table'):
            rows = []
            for row in table.find_all('tr'):
                cells = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
                # Only keep a row if it is non-empty.
                if cells:
                    rows.append(cells)
            tables.append(rows)
        return tables

    @staticmethod
    def _write_sections(path, headers, tables):
        """Write each header as a one-cell row followed by its table's rows.

        Headers and tables are paired positionally with ``zip``, so any
        surplus headers or tables beyond the shorter sequence are not written.
        """
        # Default (minimal) quoting is used deliberately. The previous
        # QUOTE_NONE + escapechar=',' combination wrote a cell such as "1,5"
        # as "1,,5", which a standard CSV reader sees as an extra empty
        # column. NOTE(review): newer Python releases also appear to reject
        # an escapechar equal to the delimiter outright; confirm.
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, lineterminator='\n')
            for header, table in zip(headers, tables):
                writer.writerow([header])
                writer.writerows(table)

if __name__ == '__main__':
    obj = metEiren()
    try:
        obj.scrape()
    finally:
        # metEiren.__init__ publishes the browser as the module global
        # `driver`; quit it so the headless Firefox process does not
        # outlive the script, even when scraping fails.
        driver.quit()

这将为您提供如下输出:

Total rainfall in millimetres for SherkinIsland
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,199.1,67.2,116.6,129.3,93.0,17.2,48.8,62.5,82.1,,,,815.8
2017,66.7,78.5,132.7,14.6,39.2,112.3,89.9,78.6,150.8,115.5,51.9,147.5,1078.2
2016,185.8,113.0,61.5,68.8,59.4,61.5,69.7,111.1,111.1,64.4,43.3,78.3,1027.9
2015,106.6,78.0,88.9,18.5,110.0,77.4,127.0,87.0,121.2,52.8,107.7,292.7,1267.8
mean,132.7,101.4,94.7,73.7,73.7,75.1,78.0,88.3,92.4,127.6,120.1,130.3,1188.0
Mean temperature in degrees Celsius for SherkinIsland
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,8.2,6.1,5.7,9.2,12.1,15.4,17.1,15.0,13.6,,,,11.4
2017,8.1,8.2,9.2,9.8,12.2,14.0,14.9,14.6,13.5,12.6,9.4,8.1,11.2
2016,8.4,7.0,7.5,8.5,12.0,14.3,14.4,15.2,14.5,12.3,8.0,9.5,11.0
2015,7.5,6.5,7.7,9.4,10.9,12.9,14.2,14.3,13.8,12.3,11.2,10.3,10.9
mean,7.5,7.5,8.4,9.4,11.7,13.9,15.5,15.7,14.3,12.0,9.5,8.0,11.1
Mean 10cm soil temperature for SherkinIsland at 0900 UTC
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,6.9,4.7,5.1,9.5,13.4,17.3,19.0,16.2,n/a,,,,11.6
2017,7.5,7.9,8.4,10.2,12.9,15.4,16.2,15.3,13.4,12.3,8.5,6.8,11.3
2016,7.4,5.9,6.6,8.5,13.0,15.6,15.8,15.8,14.6,11.8,7.7,8.8,11.0
2015,6.6,5.3,6.8,9.3,11.7,14.5,14.8,14.7,13.1,11.2,10.6,9.6,10.7
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Global Solar Radiation in Joules/cm2 for SherkinIsland
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,9023,15831,29709,42026,58669,67070,65526,44784,29711,,,,362349
2017,8345,14868,28307,43479,57060,59325,57794,46218,33526,15375,11157,7084,382538
2016,7262,16452,27956,48481,60218,56262,53776,48503,25866,19137,12859,5660,382432
2015,8882,13475,30056,50190,55679,57207,57047,49551,33798,19483,8962,5121,389451
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Potential Evapotranspiration (mm) for SherkinIsland
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,21.8,28.5,34.9,49.9,76.3,98.8,104.6,64.5,42.9,,,,522.2
2017,20.6,25.2,40.5,59.4,75.1,80.5,79.1,63.5,46.3,26.2,38.7,18.7,573.8
2016,20.8,27.3,39.7,61.4,77.3,81.1,73.7,68.6,43.9,39.0,23.5,21.0,577.3
2015,23.5,21.0,38.1,59.8,67.1,73.3,76.1,66.2,53.0,34.4,25.6,24.1,562.2
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Evaporation (mm) for SherkinIsland
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,30.5,41.0,55.4,81.0,116.3,143.1,147.9,96.8,64.3,,,,776.3
2017,27.1,37.8,64.0,88.6,117.8,127.9,122.2,97.5,71.3,39.2,46.4,24.6,864.4
2016,28.7,41.0,61.1,96.8,118.9,122.4,112.7,104.8,64.3,52.8,30.3,26.7,860.5
2015,32.7,31.1,60.5,95.8,113.2,115.7,120.8,101.4,75.9,47.2,35.1,32.8,862.2
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Notes on the Data
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total
2018,226,262,303,188,115,38,12,30,N/A,N/A,N/A,N/A,N/A
2017,228,206,195,170,105,55,34,37,63,90,183,230,1596
2016,220,247,247,210,112,44,44,28,41,99,226,185,1702
2015,247,253,243,182,143,82,48,46,57,100,130,162,1693