我正在尝试使用正则表达式从文本文件中提取日期。 文本文件中日期行的示例:
1530Z 1 FEB 1990
使用的正则表达式:
# Matches a date header line such as "1530Z 1 FEB 1990".
# The time suffix in the data is an upper-case "Z"; the pattern has no
# IGNORECASE flag, so a lower-case "[z]" class would never match and the
# date would never be picked up.
date_matcher = re.compile(r"^([0-9]{4}Z.[0-9]+.[A-Z]{3}.[0-9]{4})")
我试图修改我正在使用的代码,以便从正则表达式匹配到的行中提取日期和时间。下面是该代码:
# get just the data lines, without headers.
def get_data_lines(path):
    """Return the data lines of a ``.dat`` file, each paired with a date.

    The file is read line by line and blank lines are skipped.  A line
    matched by the module-level ``data_matcher`` is appended to the result
    as ``(line, dt)``, where ``dt`` is the ``datetime`` built from the most
    recent line matched by ``date_matcher`` (``None`` if no date line has
    been seen yet).  A date line is split on whitespace and its last four
    tokens are read as time (``HHMM`` with an optional ``Z``), day, month
    abbreviation and year.

    Args:
        path: Path of the file to read.

    Returns:
        A list of ``(line, dt)`` tuples; empty when ``path`` is not an
        existing file whose name ends in ``.dat``.
    """
    # where we are putting data lines (no header lines)
    data_lines = []

    # The suffix test must be applied to the string form of the path:
    # str(path.endswith('.dat')) yields "True"/"False", both of which are
    # truthy, so it never filtered anything out.
    if not (os.path.isfile(str(path)) and str(path).endswith('.dat')):
        return data_lines

    with open(path) as f:
        dt = None
        for line in f:
            # check that line isn't empty
            if not line.strip():
                continue

            # the compiled matcher returns a match object, or None if
            # no match was found.
            if data_matcher.match(line):
                data_lines.append((line, dt))
                continue

            if not date_matcher.match(line):
                continue

            date = line.split()[-4:]
            if len(date) == 4:
                hhmm, day, month, year = date
                # fix the date bits
                hhmm = hhmm.replace('Z', '')
                hour, minutes = re.findall('..', hhmm)
                dt = datetime(
                    int(year),
                    strptime(month, '%b').tm_mon,
                    int(day),
                    int(hour),
                    int(minutes),
                )
    return data_lines
dt = datetime(year,month,day,int(hour),int(minutes)) 在原代码中是同一行,只是排版后看起来不是这样,所以我认为有必要在这里指出。
我知道问题出在dt = None。当我从要拉出的文件目录中打印出所有日期时,它只打印与我拥有的日期一样多的文件。
预期的结果是将dt变量创建为空,并在遇到它们时将其替换为日期。
因此,对于此示例,我想要的是:1530 1 2 1990
该行:1530Z 1 FEB 1990
并且能够从我赋值得到的对象 dt 中取出月、日、年和时间。
答案 0 :(得分:2)
这是我通过修改正则表达式模式得到的解决方案。我将其替换为 date_matcher = re.compile(r"((\d{4})[Z]).*(\d{1,2}).(\w{3}).(\d{4})"),它会给出您想要的结果。
在这里,我使用 re.sub 把日期整理成您想要的样子(即比原始日期更易读):它删除 Z 字符,把月份名称改为相应的月份数字,并删除字符串中间多余的空格。
import os
import re
from datetime import datetime
from time import strptime
# Data lines start with two whitespace characters followed by 0, 1 or 2.
# Raw string so that "\s" reaches the regex engine instead of being treated
# as an (invalid) string escape.
data_matcher = re.compile(r'^(\s\s[0-2])')
# Date lines look like "1530Z 1 FEB 1990".
# Groups: 1 = time with Z, 2 = time, 3 = day, 4 = month name, 5 = year.
# The filler before the day is non-greedy: a greedy ".*" swallows the first
# digit of a two-digit day, so "12" would be captured as "2".
date_matcher = re.compile(r"((\d{4})[Z]).*?(\d{1,2}).(\w{3}).(\d{4})")
def _clean_date_text(text):
    """Return ``text`` with the date made more readable.

    If ``date_matcher`` finds a date, its month abbreviation is replaced by
    the month number.  The ``Z`` after a four-digit time is dropped and runs
    of whitespace are collapsed to single spaces.
    """
    found = date_matcher.search(text)
    if found:
        try:
            month_number = str(strptime(found.group(4), '%b').tm_mon)
        except ValueError:
            # The three characters captured are not a month abbreviation;
            # leave them as they are.
            pass
        else:
            # Replace only the captured month, not every 3-character word.
            text = text[:found.start(4)] + month_number + text[found.end(4):]
    text = re.sub(r'(\d{4})Z', r'\1', text)  # Remove Z character
    return re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace


def get_data_lines(path):
    """Collect ``(line, dt)`` pairs from a ``.dat`` file.

    Blank lines are skipped.  For every other line:

    * if ``data_matcher`` matches, the raw line is paired with a cleaned-up
      string copy of itself (see ``_clean_date_text``), and the list
      collected so far is printed;
    * otherwise, if ``date_matcher`` matches, the last four whitespace
      separated tokens are read as time, day, month abbreviation and year,
      and the token list is paired with the resulting ``datetime``.

    Lines matching neither pattern are ignored.

    Args:
        path: Path of the file to read.

    Returns:
        A list of ``(line, dt)`` tuples; empty when ``path`` is not an
        existing file whose name ends in ``.dat``.
    """
    # where we are putting data lines (no header lines)
    data_lines = []

    # Test the suffix on the string form of the path:
    # str(path.endswith('.dat')) is "True"/"False", which is always truthy.
    if not (os.path.isfile(str(path)) and str(path).endswith('.dat')):
        return data_lines

    with open(path) as f:
        for line in f:
            # check that line isn't empty
            if not line.strip():
                continue

            if data_matcher.match(line):
                data_lines.append((line, _clean_date_text(line)))
                print('Data Lines: ', data_lines)
            elif date_matcher.match(line):
                # Only lines recognised as dates are parsed, so ordinary
                # header lines cannot trigger a ValueError in int()/strptime.
                tokens = line.split()
                date = tokens[-4:]
                if len(date) == 4:
                    hhmm, day, month, year = date
                    # fix the date bits
                    hhmm = hhmm.replace('Z', '')
                    hour, minutes = re.findall('..', hhmm)
                    dt = datetime(
                        int(year),
                        strptime(month, '%b').tm_mon,
                        int(day),
                        int(hour),
                        int(minutes),
                    )
                    data_lines.append((tokens, dt))
    return data_lines