I am trying to pull files from S3 based on the ID and date in the filename.
Naming convention:
All filenames follow the same pattern: ID_NAME_DATE.csv
Example: 9919USEN_File_20180216.csv
Example: 9919GBEN_File_20180211.csv
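(For reference, the whole convention can be captured with a single regular expression; this is only a sketch, and the 4-digit-plus-4-letter ID and 8-digit date widths are read off the examples above.)

import re

# Sketch only: one pattern for the ID_NAME_DATE.csv convention shown above.
FILENAME_RE = re.compile(r'^(?P<id>[0-9]{4}[A-Za-z]{4})_[A-Za-z]+_(?P<date>[0-9]{8})\.csv$')

m = FILENAME_RE.match('9919USEN_File_20180216.csv')
if m:
    print(m.group('id'), m.group('date'))   # prints: 9919USEN 20180216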
Code:
import boto3
import re

def downloadFiletest():
    # connect to s3
    client = boto3.resource(u's3', aws_access_key_id=u'KEY',
                            aws_secret_access_key=u'TOKEN')
    # used for downloading
    s3 = boto3.client(u's3', aws_access_key_id=u'KEY',
                      aws_secret_access_key=u'TOKEN')
    dateIdReg = '[0-9]{8}'
    dateSuffix = re.compile(dateIdReg)
    print(u"= S3 Client Connected =")
    # configure s3 bucket
    bucket = client.Bucket(u'us-eu-Bucket')
    b_folder = "/folder/example/"
    c_folder = b_folder.lower() + '/'
    files_not_found = True
    for cList in bucket.objects.filter(Prefix=b_folder):
        cFiles = cList.key
        print('file : ', cFiles)
        for fileId in cFiles.lower():
            files_not_found = False
            f = fileId.rstrip()
            print(f)
            fileidreg = '[0-9]{4}[a-zA-Z]{4}'
            FileID = re.compile(fileidreg)
            if FileID.match(f) and dateSuffix.match(f):
                print(u'cList.key.lower(): ', cList.key.lower())
                old_file = cList.key
                dot_index = old_file.find(u'.')
                print(u'old dot file name: ', dot_index)
                file_ext = old_file[dot_index:]
                cfile = fileId + '_file_' + dateSuffix + file_ext
                tmp_path = "/tmp/folder/" + cfile
                b_path = cVal + cfile
                print(u'b path : ', b_path)
                s3.download_file("us-eu-Bucket", b_path, tmp_path)
                print("TEMP PATH: ", tmp_path)
    if files_not_found:
        print("ALERT", "No file in {0}/{1}".format(bucket, b_folder))

downloadFiletest()
Error:
The script skips over fileId in cFiles.lower() and exits.
Goal:
Pull files from S3 and download them to tmp_path so they can be used as needed.
When pulling files, I want the script to pick a file based on its ID and date. For example (a sketch of this selection logic follows the rules below):
Rule (pseudo):
If S3 has both 9919USEN_File_20180216.csv and 9919USEN_File_20180217.csv, pick 9919USEN_File_20180217.csv to download. Likewise, do not pick 991USEN_File_2018.csv, because it matches neither fileidreg = '[0-9]{4}[a-zA-Z]{4}' nor dateIdReg = '[0-9]{8}'.
Rule (visual):
9919USEN_File_20180217.csv > 9919USEN_File_20180216.csv [due to date]
9919USEN_File_20180217.csv > 991USEN_File_2018.csv [due to incorrect ID and date]
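(To make the rule concrete, here is a sketch of that selection logic over a plain list of keys; the key names are just the examples above, and this is not meant as the final script.)

import re

# Keep only keys whose filename part matches the ID and date patterns,
# then keep the newest date per ID (YYYYMMDD strings compare correctly as text).
PATTERN = re.compile(r'(?P<id>[0-9]{4}[A-Za-z]{4})_[A-Za-z]+_(?P<date>[0-9]{8})\.csv$')

def pick_latest(keys):
    latest = {}  # fileID -> (fileDate, key)
    for key in keys:
        m = PATTERN.search(key)
        if not m:  # e.g. 991USEN_File_2018.csv is rejected here
            continue
        file_id, file_date = m.group('id'), m.group('date')
        if file_id not in latest or file_date > latest[file_id][0]:
            latest[file_id] = (file_date, key)
    return [key for _, key in latest.values()]

print(pick_latest(['folder/example/9919USEN_File_20180216.csv',
                   'folder/example/9919USEN_File_20180217.csv',
                   'folder/example/991USEN_File_2018.csv']))
# -> ['folder/example/9919USEN_File_20180217.csv']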
Answer:
Solution:
The problem was how the original code was structured. I restructured it and wrapped the work in a try/except block. I also used FileIDPrefix.search instead of FileIDPrefix.match, since match only matches at the start of the string, which does not suit this problem (the ID sits in the middle of the S3 key).
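(A small illustration of the difference: match only matches at the beginning of the string, while search scans the whole string, so only search finds the ID inside a full S3 key. The key below is just an example.)

import re

fileIDPrefix = re.compile('[0-9]{4}[a-zA-Z]{4}')
key = 'folder/example/9919USEN_File_20180216.csv'

print(fileIDPrefix.match(key))    # None - the key starts with 'folder/', not the ID
print(fileIDPrefix.search(key))   # a match object - the ID is found mid-key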
Final solution:
import boto3
import re
import sys

# connect to s3
client = boto3.resource(u's3', aws_access_key_id=u'KEY',
                        aws_secret_access_key=u'TOKEN')
# used for downloading
s3 = boto3.client(u's3', aws_access_key_id=u'KEY',
                  aws_secret_access_key=u'TOKEN')

bucketname = u'us-eu-Bucket'

def downloadFiletest():
    date = '[0-9]{8}'                   # fileDate regex
    dateSuffix = re.compile(date)       # regex used to check the date of the file
    reg = '[0-9]{4}[a-zA-Z]{4}'         # fileID regex
    fileIDPrefix = re.compile(reg)      # regex used to check the fileID of the filename
    folder = u"/folder/example/"        # directory
    bucket = client.Bucket(bucketname)  # bucket
    try:
        for cuList in bucket.objects.filter(Prefix=folder):  # filter to the folder
            filenames = cuList.key      # key of the object we would like to use
            print(filenames)
            # specific locations of the fileID and the date within the key
            fileID = filenames[33:41]
            fileDate = filenames[51:59]
            # check the length of each value so it can be verified later
            lenf = len(fileID)
            lenG = len(fileDate)
            old_file = cuList.key
            dot_index = old_file.find(u'.')
            file_ext = old_file[dot_index:]
            # check that the key matches our rules; if it does, proceed
            if fileIDPrefix.search(filenames) and dateSuffix.search(filenames):
                filename = fileID + u'_file_' + fileDate + file_ext
                tmp_path = "/tmp/mpcmt/" + filename
                file_path = folder + filename
                s3.download_file(bucketname, file_path, tmp_path)
                return filename, tmp_path, fileID, fileDate
            # check the number of characters in the key to see if it matches what is expected
            if dot_index > 59 or dot_index < 59:
                print('File has wrong fileID or wrong date')
            if lenG > 8 or lenG < 8:
                print('File has wrong fileDate format')
            if lenf > 8 or lenf < 8:
                print('File has wrong fileID')
    except Exception as e:  # report an error and exit if the file doesn't exist
        print("ALERT", "No file in {0}/{1}".format(bucket, folder))
        # There was some issue / error / problem and that is why the program is exiting.
        print("No file in {0}/{1}".format(bucket, folder), file=sys.stderr)
        print("Exception: %s" % str(e), file=sys.stderr)
        sys.exit(1)

downloadFiletest()
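(Since downloadFiletest returns (filename, tmp_path, fileID, fileDate) on success and None when nothing in the folder matches, the bare call at the end could instead unpack the result, roughly like this:)

result = downloadFiletest()
if result is not None:
    filename, tmp_path, fileID, fileDate = result
    print('Downloaded', filename, 'to', tmp_path)
else:
    print('No matching file was found.')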