我正在尝试使用下面的代码中的计数器运行循环。
import sys
import imaplib
import getpass
import email
import email.header
import datetime
from bs4 import BeautifulSoup
import re
from lxml import etree, html
from HTMLParser import HTMLParser
EMAIL_ACCOUNT = "sample@gmail.com"
EMAIL_FOLDER = "INBOX"
def stringify_children(node):
from lxml.etree import tostring
from itertools import chain
parts = ([node.text] +
list(chain(*([c.text, tostring(c), c.tail] for c in node.getchildren()))) +
[node.tail])
# filter removes possible Nones in texts and tails
return ''.join(filter(None, parts))
def process_mailbox(M):
"""
Do something with emails messages in the folder.
For the sake of this example, print some headers.
"""
rv, data = M.search(None, "ALL")
if rv != 'OK':
print "No messages found!"
return
for num in data[0].split():
rv, data = M.fetch(num, '(RFC822)')
if rv != 'OK':
print "ERROR getting message", num
return
msg = email.message_from_string(data[0][1])
if msg.is_multipart():
html = None
print "Checking for html or text"
for part in msg.get_payload():
if part.get_content_charset() is None:
charset = chardet.detect(srt(part))['encoding']
else:
charset = part.get_content_charset()
if part.get_content_type() == 'text/plain':
text = unicode(part.get_payload(decode=True),str(charset),"ignore").encode('utf8','replace')
f = open('email.txt', 'w')
f.write(text)
f.close
if part.get_content_type() == 'text/html':
html = unicode(part.get_payload(decode=True),str(charset),"ignore").encode('utf8','replace')
f = open('email.html','w')
f.write(html)
f.close
if part.get('Content-Disposition') is None:
continue
filename = part.get_filename()
if not os.path.isfile(filename) :
fp = open(filename, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
return 0
if html is None:
return text.strip()
else:
return html.strip()
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def get_data(self):
return ''.join(self.fed)
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
def parse_html():
htmldoc = open('email.html', 'r+')
doc = htmldoc.read()
doc = re.sub('foobar', 'bar', doc)
soup = BeautifulSoup(open("email.html"))
VALID_TAGS = ['iframe', 'video', 'o>', 'li', 'sub', 'sup', 'source', 'br', 'h3', 'h4', 'h6', 'hr', 'q', 'mark','wbr', 'audio','strong', 'em', 'p','ul', 'li', 'br', 'blockquote', 'pre', 'del', 'h3', 'body', 'header', 'html', 'title', 'div', 'img', 'a']
for tag in soup.findAll(True):
if tag.name == 'i':
tag.name = 'em'
elif tag.name == 'cite':
tag.name = 'em'
elif tag.name == 'b':
tag.name = 'strong'
elif tag.name == 'kdb':
tag.name = 'strong'
elif tag.name == 'var':
tag.name = 'strong'
elif tag.name == 'aside':
tag.name = 'blackquote'
elif tag.name == 'code':
tag.name = 'pre'
elif tag.name == 'samp':
tag.name = 'pre'
elif tag.name == 's':
tag.name = 'del'
elif tag.name == 'h1':
tag.name = 'h3'
elif tag.name == 'h2':
tag.name = 'h3'
pretty_soup = soup.prettify()
docstring = str(pretty_soup)
tree = etree.fromstring(docstring)
walkAll = tree.iterchildren()
count = 0
for elt in walkAll:
if count <= 300:
child = stringify_children(elt)
childtext = strip_tags(child)
childstring = childtext.replace(" ", "")
for i in childstring:
count = count + len(i)
print count
else:
root = etree.Element("elt")
root.text = ''
root.tail = ''
etroot = tree.getroottree()
results = etree.tostring(etroot)
htmldoc.write(results)
M = imaplib.IMAP4_SSL('imap.gmail.com')
try:
rv, data = M.login(EMAIL_ACCOUNT, getpass.getpass())
except imaplib.IMAP4.error:
print "LOGIN FAILED!!! "
sys.exit(1)
print rv, data
rv, mailboxes = M.list()
if rv == 'OK':
print "Mailboxes:"
print mailboxes
rv, data = M.select(EMAIL_FOLDER)
if rv == 'OK':
print "Processing mailbox...\n"
process_mailbox(M)
M.close()
parse_html()
else:
print "ERROR: Unable to open mailbox ", rv
M.logout()
但是我的循环被破坏了,我不明白为什么。当计数达到300时,应该执行else语句中的代码,但计数器继续迭代元素。
我希望在计数达到300之后对每个元素执行else语句。结果将影响到300之后的元素,并且在计数达到300之前迭代的元素不会受到影响。我在else语句中放了一个打印“test”来测试它并且它没有打印出来。
此代码的目的是获取电子邮件并计算电子邮件中的文本,当总文本计数达到某个参数时(在本例中为300),否则执行并清除剩余元素中的文本。我随机使用了300。文本限制也可以是50。
count = 0
for elt in walkAll:
if count <= 300:
child = stringify_children(elt)
childtext = strip_tags(child)
childstring = childtext.replace(" ", "")
for i in childstring:
count = count + len(i)
print count
else:
root = etree.Element("elt")
root.text = ''
root.tail = ''
修改
我可以在count = count + len(i)之后插入一个if break条件,这将停止循环,如下所示:
count = 0
for elt in walkAll:
if count <= 300:
child = stringify_children(elt)
childtext = strip_tags(child)
childstring = childtext.replace(" ", "")
for i in childstring:
count = count + len(i)
if count >= 300:
break
print count
else:
root = etree.Element("elt")
root.text = ''
root.tail = ''
这会破坏计数,但else语句仍然无法运行
答案 0 :(得分:0)
不确定我是否完全理解你想要做的事情,但根据你上面的代码,这应该有用。
for elt in walkAll:
count = 0
child = stringify_children(elt)
childtext = strip_tags(child)
childstring = childtext.replace(" ", "")
for s in childstring:
count+=1
if count >= 300:
root = etree.Element("elt")
root.text = ''
root.tail = ''