从记录列表将时间转换为ISO:BS4 Python

时间:2018-10-25 14:38:12

标签: python datetime beautifulsoup

我正在抓取网站信息,并且能够检索所有信息。检索所有信息后,我将它们存储到列表中。

该列表包含此格式的时间戳25-10-2018 09:00,我想将其转换为ISO。我知道dateutil可以用于此目的,但我找不到所有值的正确方法。有人可以帮忙吗?

源代码:

List<ChildWrapper> children

输出:

# -*- coding utf-8 -*-
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
import time
import os
import shutil
from bs4 import BeautifulSoup
import uuid
import csv
import dateutil.parser as parser
import pandas as pd

from selenium.webdriver.support.select import Select


class crawlOcean():

    def __init__(self):
        print("hurray33")
        global downloadDir
        global uFileName
        global filname
        downloadDir = ""
        uFileName = str(uuid.uuid4())
        filname = downloadDir + uFileName + ".csv"
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 1000)
        fp = webdriver.FirefoxProfile()
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference("browser.download.manager.showWhenStarting", False)
        fp.set_preference("browser.download.dir", downloadDir)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "attachment/csv")
        options = Options()
        options.add_argument("--headless")
        self.driver = webdriver.Firefox(firefox_profile=fp)
        #self.driver = webdriver.Firefox()
        print("hurray")
        self.driver.implicitly_wait(15)
        self.driver.get("http://www.epa.ie/hydronet/#Water%20Levels")
        self.verificationErrors = []
        self.accept_next_alert = True

  def crawl(self):
        print("see")
        driver = self.driver
        driver.execute_script("window.scrollTo(0, 800)")
        driver.find_element_by_id("dijit_MenuItem_3_text").click()
        driver.find_element_by_xpath('//td[.="All"]').click()
        driver.find_element_by_xpath('//td[.="Active EPA/LA (239)"]').click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        headers = []
        for m in soup.find_all("th"):
            headers.append(m.get_text())
        print(headers)
        content = []
        finalContent = []
        filtter = ['km²']
        new_data = [[c.text.rstrip(" km²") for c in i.find_all('td')]  for i in soup.find_all('table')[5::]]
        print(new_data)
        """filtter = ['km²']
        for table in soup.find_all("table")[5::]:
            for row in table.find_all("tr"):
                contentCells = []
                for cells in row.find_all("td"):
                    contentCells.append(cells.get_text())
            content.append(contentCells)
        for idx, v in enumerate(content):
            for t in filtter:
                content[idx] = [i.replace(t, '') for i in content[idx]]
                #.append(timerecorded)
        print(content)"""
        print(content)
        with open(filname, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(new_data)
        driver.close()



if __name__ == '__main__':
    obj = crawlOcean()
    obj.crawl()

谢谢

2 个答案:

答案 0 :(得分:1)

尝试将以下内容应用于您的contentCells列表

import re
from datetime import datetime as dt

contentCells = [dt.strptime(i, '%d-%m-%Y %H:%M').strftime('%d-%m-%YT%H:%MZ') if re.match("\d{2}-\d{2}-\d{4}\s\d{2}:\d{2}", i) else i for i in contentCells]

答案 1 :(得分:0)

from datetime import datetime
datetime.strptime('25-10-2018 09:00', '%d-%m-%Y %H:%M').isoformat()

它没有Z,因为原始日期时间没有时区信息,但是您可以根据需要添加它。