我正在尝试将电子邮件数据解析到一个 pandas 数据框(DataFrame)中,但查看该数据框时,大部分电子邮件正文似乎丢失了。
我尝试在写入数据框之前打印正文,内容看起来已被正确解析;但是用 loc 将其写入数据框之后,正文就被截断了。
from bs4 import BeautifulSoup
from html2text import HTML2Text
import pandas as pd
import easyimap
import getpass
import email
import base64
import os
import email
import mimetypes
from datetime import datetime
from email.utils import parsedate_to_datetime
def to_text(html, rehtml=False):
    """Convert an HTML string to plain text with html2text.

    The converter is configured to ignore links, images, anchors and
    emphasis markup, to skip internal links and to leave links unwrapped.

    Args:
        html: HTML markup to convert.
        rehtml: When true, every newline in the result is replaced by
            ``<br/>`` and all backslashes are removed.

    Returns:
        The converted text with leading and trailing spaces, tabs,
        newlines and carriage returns stripped.
    """
    converter = HTML2Text()
    options = {
        'wrap_links': False,
        'skip_internal_links': True,
        'inline_links': True,
        'ignore_anchors': True,
        'ignore_images': True,
        'ignore_emphasis': True,
        'ignore_links': True,
    }
    for option, value in options.items():
        setattr(converter, option, value)

    plain = converter.handle(html).strip(' \t\n\r')
    if not rehtml:
        return plain
    return plain.replace('\n', '<br/>').replace('\\', '')
# Ask for the IMAP password interactively instead of storing it in the file.
imap_password = getpass.getpass()
# The backslash is written as '\\' because '\e' is not a valid escape
# sequence; the resulting string is unchanged.
user = 'pmccabe@tradevela.com\\edc-notifications'
host = 'outlook.office365.com'
password = imap_password

folders = (
    '"INBOX/Americas/Not Raised"', '"INBOX/Americas/Raised"',
    '"INBOX/APAC/Not Raised"', '"INBOX/APAC/Raised"',
    '"INBOX/Consolidated/Not Raised"', '"INBOX/Consolidated/Raised"',
    '"INBOX/EMEA"', '"INBOX/EMEA/Not Raised"', '"INBOX/EMEA/Raised"',
)
df_columns = [
    'Subject', 'Sender', 'From', 'To', 'Body', 'References', 'content_type',
    'local_date_time', 'Classification', 'in_reply_to', 'return_path',
    'mime_version', 'message_id', 'folder_name',
]


def _local_timestamp(date_header):
    """Return a mail Date header as a local-time 'YYYY-MM-DD HH:MM:SS' string."""
    parsed = parsedate_to_datetime(date_header)
    return datetime.fromtimestamp(parsed.timestamp()).strftime('%Y-%m-%d %H:%M:%S')


def _mail_to_record(mail, mailbox):
    """Build one data-frame row (a dict keyed by column name) from a fetched mail."""
    # The classification is read from the 'Keywords' header of the raw message.
    email_message = email.message_from_bytes(mail.raw)
    return {
        'Body': to_text(mail.body, rehtml=False),
        'Subject': mail.title,
        'Sender': mail.sender,
        'From': mail.from_addr,
        'To': mail.to,
        'References': mail.references,
        'content_type': mail.content_type,
        'local_date_time': _local_timestamp(mail.date),
        'Classification': email_message['Keywords'],
        'in_reply_to': mail.in_reply_to,
        'return_path': mail.return_path,
        'mime_version': mail.mime_version,
        'message_id': mail.message_id,
        'folder_name': mailbox,
    }


def _safe_path_component(text):
    """Replace path separators so that *text* names a single path level."""
    return text.replace('/', '_').replace('\\', '_')


def _part_filename(part, counter):
    """Return the part's own file name, or a generated one for unnamed parts."""
    filename = part.get_filename()
    if filename:
        # Attachment names are chosen by the sender; never let them add
        # directory levels to the output path.
        return _safe_path_component(filename)
    content_type = part.get_content_type()
    ext = mimetypes.guess_extension(content_type)
    if not ext:
        # 'html' is tested first because 'text/html' also contains 'text'.
        if 'html' in content_type:
            ext = '.html'
        elif 'text' in content_type:
            ext = '.txt'
        else:
            ext = '.bin'
    return 'msg-part-%08d%s' % (counter, ext)


def _dump_failed_mail(mail):
    """Write each non-multipart part of *mail* under ./emails/<date>/<subject>/."""
    email_message = email.message_from_bytes(mail.raw)
    # The directory name is computed once, before the loop over the parts.
    try:
        date_dir = _local_timestamp(mail.date)
    except (TypeError, ValueError):
        # An unparsable Date header may be the reason the mail failed.
        date_dir = 'unknown-date'
    # NOTE(review): the timestamp contains ':' characters, which Windows does
    # not accept in path components -- confirm the target platform.
    save_path = os.path.join(os.getcwd(), "emails", date_dir,
                             _safe_path_component(mail.title or 'untitled'))
    os.makedirs(save_path, exist_ok=True)

    counter = 1
    for part in email_message.walk():
        if part.get_content_maintype() == "multipart":
            continue
        filename = _part_filename(part, counter)
        counter += 1
        payload = part.get_payload(decode=True)
        if payload is None:
            # No decodable payload for this part; nothing to write.
            continue
        with open(os.path.join(save_path, filename), 'wb') as fp:
            fp.write(payload)


# Rows are collected in plain lists and turned into a data frame once at the
# end. Writing rows with df.loc[mail_id] let mails from different folders that
# share an id overwrite each other, and grew the frame one row at a time.
records = []
record_ids = []
for mailbox in folders:
    # Connect with read_only=True to ensure the mail is not marked as read.
    imapper = easyimap.connect(host, user, password, mailbox, read_only=True)
    try:
        # Fetch each mail up to the limit and convert it to a row.
        for mail_id in imapper.listids(limit=5000):
            try:
                mail = imapper.mail(mail_id, include_raw=True)
            except Exception as exc:
                # Without the message there is nothing to dump; report and go on.
                print('Could not fetch mail %s from %s: %r' % (mail_id, mailbox, exc))
                continue
            try:
                record = _mail_to_record(mail, mailbox)
            except Exception as exc:
                # A mail that cannot be converted is saved to disk for
                # inspection and left out of the data frame.
                print('Could not parse mail %s from %s: %r' % (mail_id, mailbox, exc))
                _dump_failed_mail(mail)
            else:
                records.append(record)
                record_ids.append(mail_id)
    finally:
        # Always close the connection, even if a folder fails part-way.
        imapper.quit()

# The index still holds the mail ids (they may repeat across folders);
# the 'folder_name' column tells such rows apart.
df = pd.DataFrame(records, columns=df_columns, index=record_ids)
预期结果:数据框应包含每封电子邮件的完整正文内容。
答案 0 :(得分:0)
更新 Jupyter Notebook 之后,此问题已解决。