我有一个Excel电子表格,我准备迁移到Access,日期列有多种格式的条目,例如:1963年至1969年,1968年8月至1968年9月,1972年3月7日,7月24日, 1980年10月2日,1980年8月29日,1946年7月等,并且“未注明日期”。我将把关键(地图编号)和日期列的列拉成csv并写回csv。 我可以删除4位数的年份,但不是范围。而且我很难理解如何在手工重新格式化的情况下提取数天和2位数。我的代码不是很优雅,可能不是最佳实践:
import csv, xlwt, re
# create new Excel document and add sheet
# from tempfile import TemporaryFile
from xlwt import Workbook
book = Workbook()
sheet1 = book.add_sheet('Sheet 1')
# populate first row with header
sheet1.write(0,0,"Year")
sheet1.write(0,1,"Map")
sheet1.write(0,2,"As Entered")
# count variable for populating sheet
rowCount=0
# open csv file and read
with open('C:\dateTestMSDOs.csv', 'rb') as f:
reader=csv.reader(f)
for row in reader:
map = row[0] # first row is map number
dateRaw = row[1] # second row is raw date as entered
# write undated and blank entries
if dateRaw == 'undated':
yearStr = '0000'
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
if dateRaw == '':
yearStr = 'NoEntry'
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
# search and write instances of four consecutive digits
try:
year = re.search(r'\d\d\d\d', dateRaw)
yearStr= year.group()
#print yearStr, map, dateRaw
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
# if none exist flag for cleaning spreadsheet and print
except:
#print 'Nope', map, dateRaw
rowCount +=1
yearStr='Format'
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
yearStr=''
dateRaw=''
book.save('D:\dateProperty.xls')
print "Done!"
我想将日期和月份写入其他列,并提取范围条目的第二个4位日期。
答案 0 :(得分:1)
您可以尝试使用dateutil
。我认为你仍然需要以不同的方式处理一些更难的格式。请参阅下面的示例实现:
<强>代码:强>
import dateutil.parser as dateparser
date_list = ['1963 to 1969',
'Aug. 1968 to Sept. 1968',
'Mar-73',
'24-Jul',
'Oct. 2 1980',
'Aug 29, 1980',
'July 1946',
'undated']
for d in date_list:
if 'to' in d:
a, b = d.split('to')
# Get the higher number. Use min to get lower of two.
print max(dateparser.parse(a.strip()).year, dateparser.parse(b.strip()).year)
elif d == 'undated':
print '0000'
else:
yr = dateparser.parse(d).year
print yr
<强>结果:强>
1969
1968
1973
2014
1980
1980
1946
0000
[Finished in 0.4s]
我能看到的只有明显的问题是24-Jul
返回2014
的日期,因为解析器假设当前的日,月或年代替缺少的组件,即。如果今天是本月的20日等,Mar-73
将成为1973-03-20
答案 1 :(得分:0)
不完全确定这是不是你想要的但我只是用了一个简单的&#34;正则表达式搜索,然后遍历匹配的组,应用定义的给定函数。如果找到匹配项,则调用的函数(在regex_groups变量中找到)应返回包含以下键的字典:start_day, start_month, start_year, end_day, end_month, end_year
然后你可以用这些价值做任何你喜欢的事情。绝对不是最干净的解决方案,但据我所知它可行。
#!/usr/local/bin/python2.7
import re
# Crazy regex
regex_pattern = '(?:(\d{4}) to (\d{4}))|(?:(\w+)\. (\d{4}) to (\w+)\. (\d{4}))|(?:(\w+)-(\d{2}))|(?:(\d{2})-(\w+))|(?:(\w+)\. (\d+), (\d{4}))|(?:(\w+) (\d+), (\d{4}))|(?:(\w+) (\d{4}))|(?:(\d{4}))'
date_strings = [
'1963 to 1969',
'Aug. 1968 to Sept. 1968',
'1972',
'Mar-73',
'24-Jul',
'Oct. 2, 1980',
'Aug 29, 1980',
'July 1946',
]
# Here you set the group matching functions that will be called for a matching group
regex_groups = {
(1,2): lambda group_matches: {
'start_day': '', 'start_month': '', 'start_year': group_matches[0],
'end_day': '', 'end_month': '', 'end_year': group_matches[1]
},
(3,4,5,6): lambda group_matches: {
'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
'end_day': '', 'end_month': group_matches[2], 'end_year': group_matches[3]
},
(7,8): lambda group_matches: {
'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
'end_day': '', 'end_month': '', 'end_year': ''
},
(9,10): lambda group_matches: {
'start_day': group_matches[1], 'start_month': '', 'start_year': group_matches[0],
'end_day': '', 'end_month': '', 'end_year': ''
},
(11,12,13): lambda group_matches: {
'start_day': group_matches[1], 'start_month': group_matches[0], 'start_year': group_matches[2],
'end_day': '', 'end_month': '', 'end_year': ''
},
(14,15,16): lambda group_matches: {
'start_day': group_matches[1], 'start_month': group_matches[0], 'start_year': group_matches[2],
'end_day': '', 'end_month': '', 'end_year': ''
},
(17,18): lambda group_matches: {
'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
'end_day': '', 'end_month': '', 'end_year': ''
},
(19,): lambda group_matches: {
'start_day': '', 'start_month': '', 'start_year': group_matches[0],
'end_day': '', 'end_month': '', 'end_year': ''
},
}
for ds in date_strings:
matches = re.search(regex_pattern, ds)
start_month = ''
start_year = ''
end_month = ''
end_year = ''
for regex_group, group_func in regex_groups.items():
group_matches = [matches.group(sub_group_num) for sub_group_num in regex_group]
if all(group_matches):
match_data = group_func(group_matches)
print
print 'Matched:', ds
print '%s to %s' % ('-'.join([match_data['start_day'], match_data['start_month'], match_data['start_year']]), '-'.join([match_data['end_day'], match_data['end_month'], match_data['end_year']]))
# match_data is a dictionary with keys:
# * start_day
# * start_month
# * start_year
# * end_day
# * end_month
# * end_year
# If a group doesn't contain one of those items, then it is set to a blank string
输出:
Matched: 1963 to 1969
--1963 to --1969
Matched: Aug. 1968 to Sept. 1968
-Aug-1968 to -Sept-1968
Matched: 1972
--1972 to --
Matched: Mar-73
-Mar-73 to --
Matched: 24-Jul
Jul--24 to --
Matched: Oct. 2, 1980
2-Oct-1980 to --
Matched: Aug 29, 1980
29-Aug-1980 to --
Matched: July 1946
-July-1946 to --
答案 2 :(得分:0)
您可以使用正则表达式定义日期的所有可能情况,例如:
import re
s = ['1963 to 1969', 'Aug. 1968 to Sept. 1968',
'1972', 'Mar-73', '03-Jun', '24-Jul', 'Oct. 2, 1980', 'Oct. 26, 1980',
'Aug 29 1980', 'July 1946']
def get_year(date):
mm = re.findall("\d{4}", date)
if mm:
return mm
mm = re.search("\w+-(\d{2})", date)
if mm:
return [mm.group(1)]
def get_month(date):
mm = re.findall("[A-Z][a-z]+", date)
if mm:
return mm
def get_day(date):
d_expr = ["(\d|\d{2})\-[A-Z][a-z]+","[A-Z][a-z]+[\. ]+(\d|\d{2}),"]
for expr in d_expr:
mm = re.search(expr, date)
if mm:
return [mm.group(1)]
d = {}
m = {}
y = {}
for idx, date in enumerate(s):
d[idx] = get_day(date)
m[idx] = get_month(date)
y[idx] = get_year(date)
print "Year Dict: ", y
print "Month Dict: ", m
print "Day Dict: ", d
结果你会得到几天,几个月和几年的字典。它们可用于填充行。
输出:
Year Dict: {0: ['1963', '1969'], 1: ['1968', '1968'], 2: ['1972'], 3: ['73'], 4: None, 5: None, 6: ['1980'], 7: ['1980'], 8: ['1980'], 9: ['1946']}
Month Dict: {0: None, 1: ['Aug', 'Sept'], 2: None, 3: ['Mar'], 4: ['Jun'], 5: ['Jul'], 6: ['Oct'], 7: ['Oct'], 8: ['Aug'], 9: ['July']}
Day Dict: {0: None, 1: None, 2: None, 3: None, 4: ['03'], 5: ['24'], 6: ['2'], 7: ['26'], 8: None, 9: None}
答案 3 :(得分:0)
感谢您提出的创新建议。在考虑之后,我们决定从数据库中可搜索的内容中删除日期和月份,因为只有相对少量的数据具有该级别的详细信息。这是我用来从冗长而凌乱的列表中提取和生成所需数据的代码。
import csv, xlwt, re
# create new Excel document and add sheet
from xlwt import Workbook
book = Workbook()
sheet1 = book.add_sheet('Sheet 1')
# populate first row with header
sheet1.write(0,0,"MapYear_(Parsed)")
sheet1.write(0,1,"Map_Number")
sheet1.write(0,2,"As_Entered")
# count variable for populating sheet
rowCount=0
# open csv file and read
yearStr = ''
with open('C:\mapsDateFix.csv', 'rb') as f:
reader=csv.reader(f)
for row in reader:
map = row[0] # first row is map number
dateRaw = row[1] # second row is raw date as entered
# write undated and blank entries
if dateRaw == 'undated':
yearStr = 'undated'
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
#yearStr=''
if yearStr != 'undated':
if dateRaw == '':
yearStr = 'NoEntry'
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
#yearStr=''
# search and write instances of four consecutive digits
if yearStr != dateRaw:
try:
year = re.search(r'\d\d\d\d', dateRaw)
yearStr= year.group()
#print yearStr, map, dateRaw
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
# if none exist flag for cleaning spreadsheet and print
except:
#print 'Nope', map, dateRaw
rowCount +=1
yearStr='Format'
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
yearStr=''
dateRaw=''
book.save('D:\dateProperty.xls')
print "Done!"