为什么我从mbox获取一个空DB?

时间:2017-09-06 18:24:46

标签: python python-2.7 sqlite mbox

所以我有这个代码,它接受mbox文件,然后将它们转换为sqlite数据库。但是,我得到一个空的数据库。 无论我使用哪个mbox,结果数据库都是12kb。当我尝试查看数据库/分析它时,它会产生任何结果,没有任何信息或任何东西,尽管表和密钥都已生成。但里面没有任何信息。 可能是什么问题呢?是不是脚本没有选择mbox,或者循环内部出错?

import mailbox
import os
import email
from bs4 import BeautifulSoup
import sqlite3

# Current working directory
cwd = os.path.dirname(os.path.realpath(__file__))

# First find all the mbox files using os.walk
mbox_path = cwd + '/mbox_files'
mbox_files = []
pattern = '*.mbox'

for root, dirs, files in os.walk(mbox_path):
    for filename in fnmatch.filter(files, pattern):
        mbox_files.append((filename, os.path.join(root, filename)))

# Now process each message in the folder
for mbox_file in mbox_files:
    src_mbox = mailbox.mbox(mbox_file[1])
    for msg in src_mbox:
        sender = name_email(msg['From'])
        recipient = name_email(msg['To'])
        b = email.message_from_string(str(msg))
        if b.is_multipart():
            for payload in b.get_payload():
                p = payload.get_payload()
                if isinstance(p,list):
                    html_text = p[0]
                else:
                    html_text = p

        try:
            # Remove any HTML tags, and any inline styles
            soup = BeautifulSoup(str(html_text))
            [s.extract() for s in soup('style')]
            text = soup.text.strip()
        except:
            pass
        # Just in case we get a plain text email

        else:
            text = b.get_payload()
        row = [
                    None,
                    sender[0],
                    sender[1],
                    recipient[0],
                    recipient[1],
                    msg['Subject'],
                    topic,
                    msg['Date'],
                    msg['Message-ID'],
                    text
                ]
    cur.execute("INSERT INTO emails VALUES(?,?,?,?,?,?,?,?,?,?);", row)



# A litle utility function that separates name and email from strings like '"Some Name" <some@email.com>'
def name_email(s):
    if not s:
        return None, None
    pieces = s.split('<')
    if len(pieces) > 1:
        name = pieces[0].replace('"','').strip()
        email = pieces[1].replace('>','')
    else:
        name = None
        email = pieces[0].replace('>','')
    return name,email

conn = sqlite3.connect(cwd + '/test.db')
cur = conn.cursor()

# Create the table. 
cur.execute("DROP TABLE IF EXISTS emails")    
cur.execute("CREATE TABLE emails(id INTEGER PRIMARY KEY, sender_name TEXT, sender_email TEXT, recipient_name TEXT, recipient_email TEXT, subject TEXT, conversation_topic TEXT, message_date TEXT, message_id TEXT, text_body TEXT)")
cur.execute("CREATE INDEX index_sender_name ON emails (sender_name)")

Mbox samples

0 个答案:

没有答案