添加到数据框时电子邮件正文被切断

时间:2019-07-23 14:24:26

标签: pandas dataframe email

我正在尝试将电子邮件数据解析为数据框,但是当我查看该数据框时,大部分电子邮件正文似乎消失了。

我尝试在添加到数据框之前打印正文,它似乎已正确解析,但是当我使用loc将其添加到数据框时,它会被截断。

from bs4 import BeautifulSoup
from html2text import HTML2Text

import pandas as pd

import easyimap
import getpass
import email
import base64
import os
import email
import mimetypes

from datetime import datetime
from email.utils import parsedate_to_datetime




def to_text(html, rehtml=False):
    """Convert an HTML string to plain text using html2text.

    Links, images, anchors and emphasis markers are dropped from the
    output, and leading/trailing spaces, tabs and line breaks are stripped.

    Args:
        html: The HTML markup to convert.
        rehtml: When true, line breaks in the converted text are turned
            into ``<br/>`` tags and all backslashes are removed.

    Returns:
        The converted text.
    """
    converter = HTML2Text()
    options = {
        'wrap_links': False,
        'skip_internal_links': True,
        'inline_links': True,
        'ignore_anchors': True,
        'ignore_images': True,
        'ignore_emphasis': True,
        'ignore_links': True,
    }
    for option, value in options.items():
        setattr(converter, option, value)

    plain = converter.handle(html).strip(' \t\n\r')
    if not rehtml:
        return plain
    return plain.replace('\n', '<br/>').replace('\\', '')

# Ask for the IMAP password interactively so it is not stored in the script.
imap_password = getpass.getpass()


# The login contains a literal backslash.  It is written as a raw string
# because "\e" is not a valid escape sequence: a plain literal only works by
# accident and triggers a DeprecationWarning (SyntaxWarning on Python 3.12+).
user = r'pmccabe@tradevela.com\edc-notifications'
host = 'outlook.office365.com'
password = imap_password

# Mailbox folders to read.  Each name carries its own double quotes because
# the folder names contain spaces.
folders = (
    '"INBOX/Americas/Not Raised"',
    '"INBOX/Americas/Raised"',
    '"INBOX/APAC/Not Raised"',
    '"INBOX/APAC/Raised"',
    '"INBOX/Consolidated/Not Raised"',
    '"INBOX/Consolidated/Raised"',
    '"INBOX/EMEA"',
    '"INBOX/EMEA/Not Raised"',
    '"INBOX/EMEA/Raised"',
)


# Column layout of the (initially empty) result frame.
_DF_COLUMNS = [
    'Subject',
    'Sender',
    'From',
    'To',
    'Body',
    'References',
    'content_type',
    'local_date_time',
    'Classification',
    'in_reply_to',
    'return_path',
    'mime_version',
    'message_id',
    'folder_name',
]
df = pd.DataFrame(columns=_DF_COLUMNS)


def _local_timestamp(date_header):
    """Format an RFC 2822 date header as 'YYYY-MM-DD HH:MM:SS' in local time."""
    parsed = parsedate_to_datetime(date_header)
    return datetime.fromtimestamp(parsed.timestamp()).strftime('%Y-%m-%d %H:%M:%S')


def _safe_component(name, fallback):
    """Return name as a single path component, or fallback if unusable.

    Subjects and attachment names are supplied by the sender, so path
    separators are replaced and the special names '.' and '..' are rejected
    to keep every file inside the dump directory.
    """
    cleaned = (name or '').replace('/', '_').replace('\\', '_').strip()
    return fallback if cleaned in ('', '.', '..') else cleaned


def _part_filename(part, counter):
    """Choose the file name under which one MIME part is saved."""
    filename = _safe_component(part.get_filename(), '')
    if filename:
        return filename
    content_type = part.get_content_type()
    # Test 'html' before 'text': "text/html" contains both words and would
    # otherwise always be saved as .txt.
    if 'html' in content_type:
        ext = '.html'
    elif 'text' in content_type:
        ext = '.txt'
    else:
        ext = mimetypes.guess_extension(content_type) or '.bin'
    return 'msg-part-%08d%s' % (counter, ext)


def _dump_failed_mail(mail):
    """Save every non-multipart MIME part of mail under ./emails/<date>/<subject>/."""
    email_message = email.message_from_bytes(mail.raw)
    try:
        stamp = _local_timestamp(mail.date)
    except (TypeError, ValueError, OverflowError, OSError):
        # A missing or malformed Date header may be the very reason the
        # mail could not be parsed; it must not prevent the dump as well.
        stamp = 'unknown-date'
    # NOTE(review): the timestamp contains ':' which is not allowed in
    # Windows path names -- confirm the target platform.
    save_path = os.path.join(os.getcwd(), "emails", stamp,
                             _safe_component(mail.title, 'untitled'))
    os.makedirs(save_path, exist_ok=True)

    counter = 1
    for part in email_message.walk():
        if part.get_content_maintype() == "multipart":
            continue
        payload = part.get_payload(decode=True)
        # get_payload(decode=True) returns None when there is nothing to decode.
        if payload is not None:
            with open(os.path.join(save_path, _part_filename(part, counter)), 'wb') as fp:
                fp.write(payload)
        counter += 1


for mailbox in folders:

    # Connect to mailbox read_only = True to ensure the mail is not marked as read.
    imapper = easyimap.connect(host, user, password, mailbox, read_only=True)
    try:
        # Fetch each mail up to the limit and add one row per mail to the dataframe.
        for mail_id in imapper.listids(limit=5000):
            try:
                mail = imapper.mail(mail_id, include_raw=True)
            except Exception as exc:
                # Nothing was downloaded, so there is nothing to dump either.
                print('Could not fetch mail %s from %s: %r' % (mail_id, mailbox, exc))
                continue

            try:
                # The raw message is parsed again to read the Keywords header,
                # which provides the classification.
                email_message = email.message_from_bytes(mail.raw)
                row = {
                    'Subject': mail.title,
                    'Sender': mail.sender,
                    'From': mail.from_addr,
                    'To': mail.to,
                    # Convert the (HTML) body to plain text.
                    'Body': to_text(mail.body, rehtml=False),
                    'References': mail.references,
                    'content_type': mail.content_type,
                    # Date header converted to local time.
                    'local_date_time': _local_timestamp(mail.date),
                    'Classification': email_message['Keywords'],
                    'in_reply_to': mail.in_reply_to,
                    'return_path': mail.return_path,
                    'mime_version': mail.mime_version,
                    'message_id': mail.message_id,
                    'folder_name': mailbox,
                }
            except Exception as exc:
                # Best effort: a mail that cannot be parsed is written to disk
                # for inspection and the run continues with the next one.
                print('Could not parse mail %s from %s: %r' % (mail_id, mailbox, exc))
                try:
                    _dump_failed_mail(mail)
                except Exception as dump_exc:
                    print('Could not dump mail %s from %s: %r' % (mail_id, mailbox, dump_exc))
                continue

            # The folder name is part of the row label so that the same id
            # appearing in two folders cannot overwrite an earlier row
            # (presumably ids are only unique within one folder -- confirm
            # against the easyimap documentation).  The row is added only
            # once every field has been read, so no half-filled rows remain.
            df.loc['%s:%s' % (mailbox, mail_id)] = pd.Series(row)
    finally:
        # Close the connection to this folder even if the loop was interrupted.
        imapper.quit()

数据框应包含所有电子邮件正文内容

1 个答案:

答案 0 :(得分:0)

更新了Jupyter笔记本,它已解决了此问题。