Question

好的，我使用 imaplib 从 Gmail 加载电子邮件，但当我尝试解析邮件时，它没有把任何内容拆分成可用的格式。我怀疑这是因为在处理过程中的某个环节，'<' 或 '>' 被添加到了原始邮件中。

以下是我调用该方法后调试器向我显示的内容：[图片：enter image description here] 正如您所看到的，它并没有把任何内容解析成可用的格式。

以下是我使用的代码:(注意：.replace('>', '')似乎对最终结果没有影响。）

# Connect to Gmail over IMAP/SSL, log in and select the inbox.
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login('myEmail@gmail.com', 'password')
mail.list()
mail.select('inbox')
# Search for every message; data[0] is split on whitespace below to obtain
# the individual message ids.
typ, data = mail.search(None, 'ALL')
ids = data[0]
id_list = ids.split()

# get the most recent email id
latest_email_id = int( id_list[-1] )

# iterate through 5 message ids in descending order, from latest_email_id-10
# down to latest_email_id-14 (the stop value latest_email_id-15 is excluded)
# the '-1' dictates reverse looping order

for i in range( latest_email_id -10, latest_email_id-15, -1 ):
    typ, data = mail.fetch( str(i), '(RFC822)' )

    # Only the tuple entries of the fetch response are processed; their
    # second element is what gets parsed as the message.
    for response_part in data:
        if isinstance(response_part, tuple):
            # NOTE(review): if response_part[1] is bytes (as imaplib returns
            # on Python 3), str() yields the "b'...'" repr instead of the
            # message text, which would explain the unparsed result;
            # email.message_from_bytes(response_part[1]) would avoid both
            # the str() call and the replace() calls -- confirm against the
            # Python version in use.
            msg = str(response_part[1]).replace('<', '')
            msg = msg.replace('>', '')
            msg = email.message_from_string(msg)
            #msg = feedparser.parse(response_part[1])
            # Header lookups on the parsed message.
            varSubject = msg['subject']
            varFrom = msg['from']

“python email.message_from_string() parse problems”和“Parsing email with Python”这两个问题与我的问题非常相似甚至相同（我认为），他们通过修改电子邮件本身解决了问题；但我是直接从 Google 的服务器读取邮件的，因此不确定该对邮件做什么修改才能修复它，因为删除所有的 '<' 和 '>' 显然行不通。

那么，如何修复从 imaplib 读取的电子邮件，以便可以使用 email.message_from_string() 轻松读取？（或者任何其他改进/可能的解决方案也可以，因为我并不能 100% 确定 '<' 和 '>' 就是真正的问题所在，我只是根据其他已提出的问题进行猜测。）

干杯

Answer 1

你不应该自己去解析 <、> 以及它们之间的数据——这就像解析 HTML 一样，而且要复杂得多。已经有现成的解决方案了。
以下是我的代码，它可以读取带附件的邮件，提取可供后续使用的数据，并将其处理成人和代码都可读的格式。如您所见，所有工作都是由第三方模块完成的。

from datetime import datetime
import email
import email.header
import email.utils
import imaplib
from os import path

import html2text

class MailClient(object):
    """Small IMAP client that reads the newest message of the inbox.

    The connection is opened and authenticated in the constructor.
    ``ReadLatest`` returns the newest message as a plain dict and stores
    ``application/*`` attachments in ``attachment_dir``.

    Written for Python 3, where imaplib hands back ``bytes``; raw messages
    that arrive as ``str`` are still accepted.  HTML-to-text conversion
    relies on the third-party ``html2text`` module imported at file level.
    """

    # Class-level defaults keep the zero-argument constructor working
    # exactly as before; pass arguments to ``__init__`` to override them.
    host = 'your.server.com'
    user = 'login@domain.com'
    password = 'p@s$w0rd'
    attachment_dir = 'your/folder'

    def __init__(self, host=None, user=None, password=None,
                 attachment_dir=None):
        """Connect to the IMAP server over SSL and log in.

        Args:
            host: IMAP server name; defaults to the class attribute.
            user: Login name; defaults to the class attribute.
            password: Password; defaults to the class attribute.
            attachment_dir: Folder attachments are written to; defaults to
                the class attribute.

        Raises:
            Exception: If the server rejects the login.
        """
        if host is not None:
            self.host = host
        if user is not None:
            self.user = user
        if password is not None:
            self.password = password
        if attachment_dir is not None:
            self.attachment_dir = attachment_dir
        self.m = imaplib.IMAP4_SSL(self.host)
        self.Login()

    def Login(self):
        """Authenticate on the open connection.

        Raises:
            Exception: If the server answers with anything but ``OK``.
        """
        result, data = self.m.login(self.user, self.password)
        if result != 'OK':
            raise Exception("Error connecting to mailbox: {}".format(data))

    def ReadLatest(self, delete=True):
        """Fetch, parse and optionally flag-for-deletion the newest message.

        Args:
            delete: When true, the message is flagged ``\\Deleted`` after it
                has been parsed and its attachments have been written.

        Returns:
            ``None`` when the inbox is empty, otherwise a dict with the keys
            ``From`` (address), ``From name``, ``Time`` (``datetime`` or
            ``None`` when the Date header is missing or unparseable),
            ``To``, ``Subject`` (decoded text), ``Text`` (last text part
            converted with html2text) and ``File`` (list of saved
            attachment paths, or ``None`` when there are none).

        Raises:
            Exception: If selecting the inbox or fetching the message fails.
        """
        result, data = self.m.select("inbox")
        if result != 'OK':
            raise Exception("Error reading inbox: {}".format(data))
        # SELECT reports the number of messages; that number is also the
        # sequence number of the newest one.  int() accepts both the bytes
        # returned on Python 3 and a plain str.
        try:
            count = int(data[0])
        except (IndexError, TypeError, ValueError):
            raise Exception("Error reading inbox: {}".format(data))
        if count == 0:
            return None
        latest = str(count)

        result, data = self.m.fetch(latest, "(RFC822)")
        if result != 'OK':
            raise Exception("Error reading email: {}".format(data))
        if not data or not isinstance(data[0], tuple):
            # The server answered OK but sent no message literal.
            raise Exception("Error reading email: {}".format(data))

        message = self._parse_raw(data[0][1])
        sender_name, sender_address = email.utils.parseaddr(
            message['From'] or '')
        res = {
            'From': sender_address,
            'From name': sender_name,
            'Time': self._parse_date(message['Date']),
            'To': message['To'],
            'Subject': self._decode_header(message['Subject']),
            'Text': '',
            'File': None,
        }

        for part in message.walk():
            maintype = part.get_content_maintype()
            if maintype == 'multipart':
                continue
            if maintype == 'text':
                # Every text part is treated as HTML; a later part
                # overwrites an earlier one.
                res['Text'] = html2text.html2text(self._decode_text(part))
            elif maintype == 'application' and part.get_filename():
                fname = self._attachment_path(part.get_filename())
                if fname is None:
                    continue
                with open(fname, 'wb') as attachment:
                    attachment.write(part.get_payload(decode=True) or b'')
                if res['File']:
                    res['File'].append(fname)
                else:
                    res['File'] = [fname]

        # Flag only after everything was extracted, so a parsing or disk
        # error does not cost the message.
        if delete:
            self.m.store(latest, '+FLAGS', '\\Deleted')

        return res

    def Close(self):
        """Close the mailbox and log out; safe to call more than once."""
        conn = getattr(self, 'm', None)
        if conn is None:
            return
        self.m = None
        for action in (conn.close, conn.logout):
            try:
                action()
            except (imaplib.IMAP4.error, OSError):
                # Best-effort cleanup: close() is refused when no mailbox
                # is selected and the socket may already be gone.
                pass

    def __del__(self):
        self.Close()

    @staticmethod
    def _parse_raw(raw):
        """Build a message object from the raw RFC822 data (bytes or str)."""
        if isinstance(raw, bytes) and not isinstance(raw, str):
            return email.message_from_bytes(raw)
        return email.message_from_string(raw)

    @staticmethod
    def _parse_date(value):
        """Convert a Date header to a local ``datetime``; ``None`` if unusable."""
        parsed = email.utils.parsedate_tz(value) if value else None
        if parsed is None:
            return None
        return datetime.fromtimestamp(email.utils.mktime_tz(parsed))

    @staticmethod
    def _decode_header(value):
        """Decode an RFC 2047 header into text, joining all fragments."""
        if not value:
            return ''
        pieces = []
        for chunk, charset in email.header.decode_header(value):
            if isinstance(chunk, bytes):
                try:
                    chunk = chunk.decode(charset or 'ascii', 'replace')
                except LookupError:
                    # Unknown charset label in the header.
                    chunk = chunk.decode('utf-8', 'replace')
            pieces.append(chunk)
        return ''.join(pieces)

    @staticmethod
    def _decode_text(part):
        """Return the decoded payload of a text part as text."""
        payload = part.get_payload(decode=True) or b''
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, 'replace')
        except LookupError:
            return payload.decode('utf-8', 'replace')

    def _attachment_path(self, filename):
        """Map an attachment name to a path inside ``attachment_dir``.

        The name comes from the email and is untrusted, so directory
        components are dropped; ``None`` is returned when nothing usable
        remains.
        """
        safe_name = path.basename(filename.replace('\\', '/'))
        if safe_name in ('', '.', '..'):
            return None
        return path.join(self.attachment_dir, safe_name)

email.message_from_string() 和 imaplib 添加 '<' '>'

1 个答案: