我正在抓取一个网站上的日期。日期在浏览器中以可用的格式显示,但是当我从网站中提取数据字符串时,格式却发生了变化。以MM/DD/YYYY格式获取日期的最简单方法是什么?
在网站上,日期显示为:“12/05/2013 9:26 PM GMT”;而当我用下面的脚本提取它时,它显示为:“Thu Dec 05 16:26:24 EST 2013 GMT”。我只想获取“12/05/2013”这个值。
#Import libraries
import urllib2
from bs4 import BeautifulSoup
import datetime
#create output document
# Append mode: every run adds one line to the file. The with-statement
# guarantees the file is closed even if the download or parsing below raises.
with open('CarbonPrice.txt','a') as f:
    #create soup
    soup = BeautifulSoup(urllib2.urlopen('https://www.theice.com/marketdata/DelayedMarkets.shtml?productId=3418&hubId=4080').read())
    table = soup.find('table', {"class":"data default borderless"})
    #Find and record time
    # Stays None unless a 'Time' header is found, so the write at the end
    # never touches an unbound name.
    timevar = None
    try:
        th = table.find('th')
        td = table.find('td', text = 'Dec13')
        # Step through the 2nd..5th headers and, in lockstep, the cells that
        # follow the 'Dec13' cell. This pairing assumes 'Dec13' sits in the
        # first column, so the cell N steps after it is under header N+1.
        for _ in range(4):
            th = th.findNext('th')
            td = td.findNext('td')
            # Guard against an empty <th>, which has no contents[0].
            if th.contents and th.contents[0] == 'Time':
                timevar = td.contents[0]
                break
    except (AttributeError, IndexError):
        # find()/findNext() return None when nothing matches (missing table,
        # header or 'Dec13' cell), and an empty cell has no contents[0].
        timevar = None
    # Exactly one line per run: the raw time string, or 'Error'.
    f.write('Error' if timevar is None else timevar)
    f.write('\n')
答案 0 :(得分:2)
这是一种方法:
>>> import time
>>> date_time = 'Thu Dec 05 16:26:24 EST 2013 GMT'
>>> year = time.strptime(date_time, "%a %b %d %H:%M:%S EST %Y GMT").tm_year
>>> month = time.strptime(date_time, "%a %b %d %H:%M:%S EST %Y GMT").tm_mon
>>> day = time.strptime(date_time, "%a %b %d %H:%M:%S EST %Y GMT").tm_mday
>>> print("%i/%i/%i"%(month, day, year))
12/5/2013
答案 1 :(得分:1)
您的代码中存在各种问题。您应该尝试使用loop,这样您就不需要重复相同的代码五次了。
对于BeautifulSoup,您可以使用函数find_all代替find来查找标记的所有匹配项。
而BeautifulSoup显然以特定格式解析时间,因此执行任务的一种方法是解析BeautifulSoup返回的字符串。
我已经改变了很多代码:
#Import libraries
import urllib2
from bs4 import BeautifulSoup
import datetime
#create soup
soup = BeautifulSoup(urllib2.urlopen('https://www.theice.com/marketdata/DelayedMarkets.shtml?productId=3418&hubId=4080').read())
table = soup.find('table', {"class":"data default borderless"})
#Find and record time
# Column index of the 'Time' header. None (rather than -1) when the header is
# absent, so a missing header can never silently select the last column.
time_idx = None
# soup.find() returns None when the table is not on the page.
header_cells = table.find_all('th') if table is not None else []
for idx, th in enumerate(header_cells):
    # Find the column index of Time
    if th.get_text() == 'Time':
        time_idx = idx
        break
timevar = []
if time_idx is not None:
    for tr in table.find_all('tr'):
        # Extract the content of each column in a list
        td_contents = [td.get_text() for td in tr.find_all('td')]
        # If this row matches our requirement, take the Time column
        # (skipping rows too short to have that column).
        if 'Dec13' in td_contents and time_idx < len(td_contents):
            time_str = td_contents[time_idx]
            # This will capture Thu Dec 05 16:26:24 EST 2013 GMT, convert to datetime object
            # NOTE(review): 'EST' is matched literally; a different zone
            # abbreviation (e.g. during daylight saving) would raise
            # ValueError here - confirm what the site emits.
            time_obj = datetime.datetime.strptime(time_str,'%a %b %d %H:%M:%S EST %Y GMT')
            # Explicit MM/DD/YYYY; '%x' is locale-dependent and typically
            # gives a two-digit year.
            timevar.append(time_obj.strftime('%m/%d/%Y'))
#create output document
with open('CarbonPrice.txt','a') as f:
    # One line per run: the formatted date, or 'Error' if nothing was found.
    f.write(timevar[0] if timevar else 'Error')
    f.write('\n')