所以我有这个代码,它接受mbox文件,然后将它们转换为sqlite数据库。但是,我得到一个空的数据库。 无论我使用哪个mbox,结果数据库都是12kb。当我尝试查看数据库/分析它时,它会产生任何结果,没有任何信息或任何东西,尽管表和密钥都已生成。但里面没有任何信息。 可能是什么问题呢?是不是脚本没有选择mbox,或者循环内部出错?
import mailbox
import os
import email
from bs4 import BeautifulSoup
import sqlite3
# Current working directory
cwd = os.path.dirname(os.path.realpath(__file__))
# First find all the mbox files using os.walk
mbox_path = cwd + '/mbox_files'
mbox_files = []
pattern = '*.mbox'
for root, dirs, files in os.walk(mbox_path):
for filename in fnmatch.filter(files, pattern):
mbox_files.append((filename, os.path.join(root, filename)))
# Now process each message in the folder
for mbox_file in mbox_files:
src_mbox = mailbox.mbox(mbox_file[1])
for msg in src_mbox:
sender = name_email(msg['From'])
recipient = name_email(msg['To'])
b = email.message_from_string(str(msg))
if b.is_multipart():
for payload in b.get_payload():
p = payload.get_payload()
if isinstance(p,list):
html_text = p[0]
else:
html_text = p
try:
# Remove any HTML tags, and any inline styles
soup = BeautifulSoup(str(html_text))
[s.extract() for s in soup('style')]
text = soup.text.strip()
except:
pass
# Just in case we get a plain text email
else:
text = b.get_payload()
row = [
None,
sender[0],
sender[1],
recipient[0],
recipient[1],
msg['Subject'],
topic,
msg['Date'],
msg['Message-ID'],
text
]
cur.execute("INSERT INTO emails VALUES(?,?,?,?,?,?,?,?,?,?);", row)
# A litle utility function that separates name and email from strings like '"Some Name" <some@email.com>'
def name_email(s):
if not s:
return None, None
pieces = s.split('<')
if len(pieces) > 1:
name = pieces[0].replace('"','').strip()
email = pieces[1].replace('>','')
else:
name = None
email = pieces[0].replace('>','')
return name,email
conn = sqlite3.connect(cwd + '/test.db')
cur = conn.cursor()
# Create the table.
cur.execute("DROP TABLE IF EXISTS emails")
cur.execute("CREATE TABLE emails(id INTEGER PRIMARY KEY, sender_name TEXT, sender_email TEXT, recipient_name TEXT, recipient_email TEXT, subject TEXT, conversation_topic TEXT, message_date TEXT, message_id TEXT, text_body TEXT)")
cur.execute("CREATE INDEX index_sender_name ON emails (sender_name)")