我正在抓取网站信息,并且能够检索所有信息。检索所有信息后,我将它们存储到列表中。
该列表包含此格式的时间戳25-10-2018 09:00,我想将其转换为ISO。我知道dateutil可以用于此目的,但我找不到所有值的正确方法。有人可以帮忙吗?
源代码:
输出:
# -*- coding utf-8 -*-
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
import time
import os
import shutil
from bs4 import BeautifulSoup
import uuid
import csv
import dateutil.parser as parser
import pandas as pd
from selenium.webdriver.support.select import Select
class crawlOcean():
    """Scrape the EPA hydronet water-levels table and save it as a CSV file.

    Drives a headless Firefox browser to the EPA hydronet page, extracts the
    station tables with BeautifulSoup and writes them to a uniquely named
    CSV file (random UUID) in ``download_dir``.
    """

    # Unit suffix attached to catchment-area cells on the page.
    _UNIT_SUFFIX = " km²"

    def __init__(self):
        """Configure pandas display options, output paths and the web driver."""
        # Instance attributes replace the original module-level globals.
        self.download_dir = ""
        self.ufile_name = str(uuid.uuid4())
        # Output CSV: <download_dir>/<random-uuid>.csv
        self.filname = self.download_dir + self.ufile_name + ".csv"

        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 1000)

        fp = webdriver.FirefoxProfile()
        fp.set_preference("browser.download.folderList", 2)  # 2 = custom dir
        fp.set_preference("browser.download.manager.showWhenStarting", False)
        fp.set_preference("browser.download.dir", self.download_dir)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "attachment/csv")

        options = Options()
        options.add_argument("--headless")
        # BUG FIX: `options` was built but never passed to the driver, so the
        # browser was never actually headless.
        self.driver = webdriver.Firefox(firefox_profile=fp, options=options)
        self.driver.implicitly_wait(15)
        self.driver.get("http://www.epa.ie/hydronet/#Water%20Levels")
        self.verificationErrors = []
        self.accept_next_alert = True

    @staticmethod
    def _strip_unit(text):
        """Remove a trailing ' km²' unit from a cell value, if present.

        BUG FIX: the original used ``str.rstrip(" km²")`` which strips any
        trailing run of the characters ' ', 'k', 'm', '²' — e.g. ``'2km'``
        became ``'2'``. Suffix removal only drops the exact unit string.
        """
        if text.endswith(crawlOcean._UNIT_SUFFIX):
            return text[:-len(crawlOcean._UNIT_SUFFIX)]
        return text

    def crawl(self):
        """Navigate to the station list, scrape every table and write the CSV."""
        driver = self.driver
        try:
            driver.execute_script("window.scrollTo(0, 800)")
            # Open the "Water Levels" menu and select the full active-station set.
            driver.find_element_by_id("dijit_MenuItem_3_text").click()
            driver.find_element_by_xpath('//td[.="All"]').click()
            driver.find_element_by_xpath('//td[.="Active EPA/LA (239)"]').click()

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            headers = [th.get_text() for th in soup.find_all("th")]

            # The first 5 tables are page chrome; one row per data table,
            # each row holding that table's cell texts (unit suffix removed).
            rows = [
                [self._strip_unit(td.get_text()) for td in table.find_all('td')]
                for table in soup.find_all('table')[5:]
            ]

            with open(self.filname, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(headers)
                writer.writerows(rows)
        finally:
            # quit() (not close()) ends the whole session, so the geckodriver
            # process is not leaked even if scraping raises.
            driver.quit()
if __name__ == '__main__':
    # Build the scraper (starts the browser) and run a single crawl pass.
    crawler = crawlOcean()
    crawler.crawl()
谢谢
答案 0(得分:1):
尝试将以下内容应用于您的 contentCells 列表:
import re
from datetime import datetime as dt

# Matches a complete 'DD-MM-YYYY HH:MM' timestamp (raw string per regex convention).
_TS_PATTERN = re.compile(r"\d{2}-\d{2}-\d{4}\s\d{2}:\d{2}")


def convert_timestamps(contentCells):
    """Return contentCells with 'DD-MM-YYYY HH:MM' values converted to ISO 8601.

    Non-timestamp cells pass through unchanged.

    BUG FIX: the original output format '%d-%m-%YT%H:%MZ' kept day-month-year
    order, which is NOT ISO 8601 — ISO 8601 is year-first ('%Y-%m-%dT%H:%M').
    Also uses fullmatch instead of match so strings that merely *start* with a
    timestamp are passed through instead of crashing strptime with ValueError.
    NOTE(review): the trailing 'Z' asserts UTC, which the scraped page does
    not guarantee — confirm the site's timezone before relying on it.
    """
    return [
        dt.strptime(cell, '%d-%m-%Y %H:%M').strftime('%Y-%m-%dT%H:%MZ')
        if _TS_PATTERN.fullmatch(cell) else cell
        for cell in contentCells
    ]
答案 1(得分:0):
from datetime import datetime

# Parse the scraped 'DD-MM-YYYY HH:MM' sample, then render it in ISO 8601.
fmt = '%d-%m-%Y %H:%M'
datetime.strptime('25-10-2018 09:00', fmt).isoformat()
它没有 Z,因为原始日期时间没有时区信息,但您可以根据需要添加它。