我目前使用此代码从EML文件中提取附件。我想知道我是否可以将附件链接到邮件(EML文件)。也就是说,添加eml文件名作为附件名称前缀。 所以我可以知道附件属于什么邮件。 谢谢
import os, re
import email
import argparse
import olefile
def extractAttachment(msg, eml_files, output_path):
#print len(msg.get_payload())
#print msg.get_payload()
if len(msg.get_payload()) > 2:
if isinstance(msg.get_payload(), str):
try:
extractOLEFormat(eml_files, output_path)
except IOError:
#print 'Could not process %s. Try manual extraction.' % (eml_files)
#print '\tHeader of file: %s\n' % (msg.get_payload()[:8])
pass
elif isinstance(msg.get_payload(), list):
count = 0
while count < len(msg.get_payload()):
payload = msg.get_payload()[count]
#récupérer les pièces jointes
filename = payload.get_filename()
#os.rename(filename,'rrrrr'+filename)
#filename=os.path.join(str(filename), str(eml_files))
if filename is not None:
try:
magic = payload.get_payload(decode=True)[:4]
except TypeError:
magic = "None"
# Print the magic deader and the filename for reference.
printIT(eml_files, magic, filename)
# Write the payload out.
writeFile(filename, payload, output_path)
count += 1
elif len(msg.get_payload()) == 2:
payload = msg.get_payload()[1]
filename = payload.get_filename()
try:
magic = payload.get_payload(decode=True)[:4]
except TypeError:
magic = "None"
# Print the magic deader and the filename for reference.
printIT(eml_files, magic, filename)
# Write the payload out.
writeFile(filename, payload, output_path)
elif len(msg.get_payload()) == 1:
attachment = msg.get_payload()[0]
payload = attachment.get_payload()[1]
filename = attachment.get_payload()[1].get_filename()
try:
magic = payload.get_payload(decode=True)[:4]
except TypeError:
magic = "None"
# Print the magic deader and the filename for reference.
printIT(eml_files, magic, filename)
# Write the payload out.
writeFile(filename, payload, output_path)
#else:
# print 'Could not process %s\t%s' % (eml_files, len(msg.get_payload()))
def extractOLEFormat(eml_files, output_path):
data = '__substg1.0_37010102'
filename = olefile.OleFileIO(eml_files)
msg = olefile.OleFileIO(eml_files)
attachmentDirs = []
for directories in msg.listdir():
if directories[0].startswith('__attach') and directories[0] not in attachmentDirs:
attachmentDirs.append(directories[0])
for dir in attachmentDirs:
filename = [dir, data]
if isinstance(filename, list):
filenames = "/".join(filename)
filename = msg.openstream(dir + '/' + '__substg1.0_3707001F').read().replace('\000', '')
payload = msg.openstream(filenames).read()
magic = payload[:4]
# Print the magic deader and the filename for reference.
printIT(eml_files, magic, filename)
# Write the payload out.
writeOLE(filename, payload, output_path)
#filename = str(eml_files)+"--"+str(filename)
def printIT(eml_files, magic, filename):
filename = str(eml_files)+"--"+str(filename)
print ('Email Name: %s\n\tMagic: %s\n\tSaved File as: %s\n' % (eml_files, magic, filename))
def writeFile(filename, payload, output_path):
filename = str(eml_files)+"--"+str(filename)
try:
file_location = output_path + filename
open(os.path.join(file_location), 'wb').write(payload.get_payload(decode=True))
except (TypeError, IOError):
pass
def writeOLE(filename, payload, output_path):
open(os.path.join(output_path + filename), 'wb')
def main():
parser = argparse.ArgumentParser(description='Attempt to parse the attachment from EML messages.')
parser.add_argument('-p', '--path',default='C:\\Users\\hamd\\Desktop\\TEX\\emails' ,help='eml')#Path to EML files
parser.add_argument('-o', '--out', default='C:\\Users\\hamd\\Desktop\\TEX\\PJ\\eml_files\\',help='pj')#Path to write attachments to.
args = parser.parse_args()
if args.path:
input_path = args.path
else:
print ("You need to specify a path to your EML files.")
exit(0)
if args.out:
output_path = args.out
else:
print ("You need to specify a path to write your attachments to.")
exit(0)
for root, subdirs, files in os.walk(input_path):
for file_names in files:
eml_files = os.path.join(root, file_names)
msg = email.message_from_file(open(eml_files))
extractAttachment(msg, eml_files, output_path)
if __name__ == "__main__":
main()
答案 0 :(得分:0)
我试着把它写成评论,但太长了。我不会给出一个完整的解决方案,但我会解释这个想法。
一个可能的解决方案是创建一个指向提取附件的硬链接,为硬链接提供与 EML 文件相同的名称。如果同一个 EML 文件中有更多附件,可以附加一个增量后缀:
whatever.eml (original email file)
whatever_001.attch (hard link to first extracted attachment)
whatever_002.attch (hard link to second extracted attachment)
...
这样:
在 Python 中,您可以简单地使用以下命令创建硬链接:
import os
os.link(existing_target_file, new_link_name)