我想获取电子邮件正文,但不要HTML格式。我试过用BeautifulSoup和 data = re.sub(r'<.*?>', '', html2) 来删除HTML标签,但它无法正常工作,我无法接收发给申请人的电子邮件。
def message_new(self, cr, uid, msg, custom_values=None, context=None):
    """Create the document for an incoming email, using a plain-text body.

    Overrides mail_thread message_new that is called by the mailgateway
    through message_process. The sender header is split into a display
    name and an address, the message body is reduced to its text content,
    and both are passed as default values to the parent implementation.

    :param cr: database cursor, forwarded unchanged to the parent method
    :param uid: user id, forwarded unchanged to the parent method
    :param msg: mapping describing the parsed email; the keys read here are
        'from', 'body', 'subject', 'cc', 'author_id' and 'priority'
    :param custom_values: optional dict of values that take precedence over
        the defaults computed from the email
    :param context: forwarded unchanged to the parent method
    :return: whatever the parent ``message_new`` returns
    """
    # Local import: the file's import header is not visible from this block.
    from email.utils import parseaddr

    if custom_values is None:
        custom_values = {}

    # parseaddr copes with both "Name <addr>" and a bare "addr"; the former
    # split('<')[1] raised IndexError for bare addresses and AttributeError
    # when the header was missing, so the message was never processed.
    sender_name, sender_email = parseaddr(msg.get('from') or '')

    # Keep only the text nodes of the (possibly HTML) body.
    html_body = msg.get('body') or ''
    data = BeautifulSoup(html_body).get_text()

    defaults = {
        'name': msg.get('subject') or _("No Subject"),
        # Fall back to the address when the header carries no display name.
        'partner_name': sender_name or sender_email,
        'email_from': sender_email,
        'email_cc': msg.get('cc'),
        'user_id': False,
        'partner_id': msg.get('author_id', False),
        'description': data,
    }
    if msg.get('priority'):
        defaults['priority'] = msg.get('priority')
    # Caller-supplied values win over anything derived from the email.
    defaults.update(custom_values)
    return super(hr_applicant, self).message_new(
        cr, uid, msg, custom_values=defaults, context=context)
答案 0 :(得分:0)
我一直在使用HTMLParser并取得了相同的成功:剥离HTML标记但保留数据。
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects only the text data."""

    def __init__(self):
        # Run the base initialiser rather than only reset(): it calls
        # reset() itself and, on Python 3, also sets the parser options
        # that feed() reads. The explicit base-class call (instead of
        # super()) also works where HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments, in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, joined into a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of *html* with all markup removed.

    :param html: string containing HTML content
    :return: the concatenated text found outside of tags
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing input it cannot classify yet (for
    # example text ending in an unfinished "&..." reference); close()
    # forces the parser to process whatever is still buffered.
    s.close()
    return s.get_data()
`strip_tags` 需要一个包含HTML内容的字符串。
答案 1 :(得分:0)
请尝试使用Python的re库从文本中删除HTML标签。
import re
# Matches one complete tag. "[^>]*" (unlike ".*?") also crosses newlines, so
# tags whose attributes are wrapped over several lines are removed as well.
text = re.compile(r'<[^>]*>')
# NOTE(review): `self.body` is presumably the HTML string of the message and
# this line is meant to live inside a method — confirm against the caller.
message = text.sub('', self.body)