Question

我正在尝试使用下面的代码中的计数器运行循环。

import sys
import imaplib
import getpass
import email
import email.header
import datetime
from bs4 import BeautifulSoup 
import re
from lxml import etree, html
from HTMLParser import HTMLParser


# Account used for the IMAP login; the password is prompted for with getpass.
EMAIL_ACCOUNT = "sample@gmail.com"
# Mailbox that is selected before the messages are processed.
EMAIL_FOLDER = "INBOX"


def stringify_children(node):
    """
    Return the inner markup of *node* as one string.

    The result is ``node.text``, followed by the serialisation of every
    direct child of *node*, followed by ``node.tail``.  Missing (None)
    text or tail values are skipped.
    """
    from lxml.etree import tostring

    parts = [node.text]
    # tostring() already serialises a child's own text, its descendants and
    # (with the default with_tail=True) its tail.  Adding child.text and
    # child.tail separately, as the previous version did, duplicated that text.
    parts.extend(tostring(child) for child in node)
    parts.append(node.tail)
    # filter removes possible Nones in texts and tails
    return ''.join(filter(None, parts))

def process_mailbox(M):
    """
    Do something with emails messages in the folder.  
    For the sake of this example, print some headers.
    """

    rv, data = M.search(None, "ALL")
    if rv != 'OK':
        print "No messages found!"
        return
    for num in data[0].split(): 
        rv, data = M.fetch(num, '(RFC822)')
        if rv != 'OK':
            print "ERROR getting message", num
            return

        msg = email.message_from_string(data[0][1])

        if msg.is_multipart():
            html = None
            print "Checking for html or text"
            for part in msg.get_payload():
                if part.get_content_charset() is None:
                    charset = chardet.detect(srt(part))['encoding']
                else:
                    charset = part.get_content_charset()
                if part.get_content_type() == 'text/plain':
                    text = unicode(part.get_payload(decode=True),str(charset),"ignore").encode('utf8','replace')
                    f = open('email.txt', 'w')
                    f.write(text)
                    f.close
                if part.get_content_type() == 'text/html':
                    html = unicode(part.get_payload(decode=True),str(charset),"ignore").encode('utf8','replace')
                    f = open('email.html','w')
                    f.write(html)
                    f.close
                if part.get('Content-Disposition') is None:
                    continue

                filename = part.get_filename()

                if not os.path.isfile(filename) :
                    fp = open(filename, 'wb')
                    fp.write(part.get_payload(decode=True))
                    fp.close()
                    return 0

            if html is None:
                return text.strip()
            else:
                return html.strip()


class MLStripper(HTMLParser):
    """HTML parser that throws the markup away and keeps only the text."""

    def __init__(self):
        # Run the base initializer rather than calling reset() directly so
        # that all parser state is set up; on Python 3 feed() reads
        # attributes that only HTMLParser.__init__ creates.  The explicit
        # base-class call (instead of super()) also works with the old-style
        # HTMLParser class of Python 2.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Collect one run of character data found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return everything collected so far, joined in document order."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text content of *html* with every tag removed."""
    stripper = MLStripper()
    stripper.feed(html)
    text_only = stripper.get_data()
    return text_only





def parse_html():
    """
    Normalise the tags in email.html and count its text.

    Reads email.html from the current directory, parses it with
    BeautifulSoup, renames a fixed set of tags (for example i/cite -> em,
    b -> strong, h1/h2 -> h3), re-parses the prettified markup with
    lxml.etree and walks the root's children while keeping a running
    character count.  The serialised tree is finally written through the
    file handle opened at the top of the function.

    Takes no arguments and returns None.
    """
    htmldoc = open('email.html', 'r+')
    doc = htmldoc.read()
    # NOTE(review): the substituted text is never used again -- the soup
    # below is built from a second open() of the file, not from `doc`.
    doc = re.sub('foobar', 'bar', doc)
    soup = BeautifulSoup(open("email.html"))
    # NOTE(review): VALID_TAGS is not referenced anywhere else in this
    # function, and 'o>' does not look like a tag name.
    VALID_TAGS = ['iframe', 'video', 'o>', 'li', 'sub', 'sup', 'source', 'br', 'h3', 'h4', 'h6', 'hr', 'q', 'mark','wbr', 'audio','strong', 'em', 'p','ul', 'li', 'br', 'blockquote', 'pre', 'del', 'h3', 'body', 'header', 'html', 'title', 'div', 'img', 'a']

    # Rename tags in place so the document uses a smaller set of tag names.
    # NOTE(review): 'kdb' is presumably meant to be 'kbd', and 'blackquote'
    # presumably 'blockquote' (the spelling used in VALID_TAGS) -- confirm.
    for tag in soup.findAll(True):
        if tag.name == 'i':
            tag.name = 'em'
        elif tag.name == 'cite':
            tag.name = 'em'
        elif tag.name == 'b':
            tag.name = 'strong'
        elif tag.name == 'kdb':
            tag.name = 'strong'
        elif tag.name == 'var':
            tag.name = 'strong'
        elif tag.name == 'aside':
            tag.name = 'blackquote'
        elif tag.name == 'code':
            tag.name = 'pre'
        elif tag.name == 'samp':
            tag.name = 'pre'
        elif tag.name == 's':
            tag.name = 'del'
        elif tag.name == 'h1':
            tag.name = 'h3'
        elif tag.name == 'h2':
            tag.name = 'h3'

    pretty_soup = soup.prettify()
    docstring = str(pretty_soup)
    # Re-parse the prettified markup with lxml; `tree` is the root element.
    tree = etree.fromstring(docstring)
    # NOTE(review): iterchildren() yields only the root's direct children,
    # not every element in the document, so the loop below runs once per
    # top-level child -- confirm this is the intended granularity.
    walkAll = tree.iterchildren() 

    count = 0
    for elt in walkAll:
        # The limit is tested once per element, before any of that element's
        # text is counted, so count can run far past 300 inside one element.
        if count <= 300: 
            child = stringify_children(elt)
            childtext = strip_tags(child)
            # Only ' ' is removed; newlines and tabs are still counted below.
            childstring = childtext.replace(" ", "")
            # Each i is a single character, so this adds 1 per character.
            for i in childstring:
                count = count + len(i)
                print count

        else:
            # NOTE(review): this builds a brand-new element tagged "elt" that
            # is not attached to `tree`; the loop variable elt is not
            # modified, so nothing in the document is cleared here.
            root = etree.Element("elt")
            root.text = ''
            root.tail = ''

    etroot = tree.getroottree()
    results = etree.tostring(etroot)
    # NOTE(review): htmldoc was read to the end above and never rewound, so
    # this write presumably lands after the existing content rather than
    # replacing it; the handle is also never closed in this function.
    htmldoc.write(results)





M = imaplib.IMAP4_SSL('imap.gmail.com')

try:
    rv, data = M.login(EMAIL_ACCOUNT, getpass.getpass())
except imaplib.IMAP4.error:
    print "LOGIN FAILED!!! "
    sys.exit(1)

print rv, data

rv, mailboxes = M.list()
if rv == 'OK':
    print "Mailboxes:"
    print mailboxes

rv, data = M.select(EMAIL_FOLDER)
if rv == 'OK':
    print "Processing mailbox...\n"
    process_mailbox(M)
    M.close()
    parse_html()
else:
    print "ERROR: Unable to open mailbox ", rv

M.logout()

但是我的循环无法正常工作，我不明白为什么。当计数达到300时，应该执行else语句中的代码，但循环仍在继续遍历元素并计数。

我希望在计数达到300之后，对其后的每个元素都执行else语句。这样只有300之后的元素会受到影响，而在计数达到300之前遍历过的元素不受影响。我在else语句中加了一句 print "test" 来测试，但它没有被打印出来。

此代码的目的是获取电子邮件并统计其中文本的字符数；当总字符数达到某个上限（本例中为300）时，执行else分支，清除剩余元素中的文本。300是我随意选的，上限也可以是50。

count = 0
for elt in walkAll:
    if count <= 300: 
       child = stringify_children(elt)
       childtext = strip_tags(child)
       childstring = childtext.replace(" ", "")
           for i in childstring:
               count = count + len(i)
               print count

    else:
        root = etree.Element("elt")
        root.text = ''
        root.tail = ''

修改

我可以在 count = count + len(i) 之后插入一个带 break 的 if 条件，这样可以停止内层循环，如下所示：

# Variant of the counting loop with an early exit from the character loop.
count = 0
for elt in walkAll:
    # 300 <= 300 is still True, so an element visited right after the break
    # below takes this branch again; the else branch needs count > 300.
    if count <= 300: 
        child = stringify_children(elt)
        childtext = strip_tags(child)
        childstring = childtext.replace(" ", "")
        for i in childstring:
            count = count + len(i)
            # break leaves only this inner character loop; the outer loop
            # carries on with the next elt.
            if count >= 300:
                break
            print count

    else:
        # NOTE(review): `root` is a newly created element that is not part
        # of the tree; elt itself is not changed by this branch.
        root = etree.Element("elt")
        root.text = ''
        root.tail = ''

这样可以中断计数，但else语句仍然不会执行。

Answer 1

不确定我是否完全理解你想要做的事情，但根据你上面的代码，这应该有用。

limit = 300  # arbitrary cut-off; the question says it could just as well be 50

# The total runs across all elements, so it is initialised once, outside the
# loop (resetting it per element would restart the count every time).
count = 0
for elt in walkAll:
    if count > limit:
        # Limit already passed: blank this element and everything nested in
        # it, in place.  Creating a new etree.Element here would only build a
        # detached node and leave the parsed tree unchanged.
        for node in elt.iter():
            node.text = ''
            node.tail = ''
        continue

    child = stringify_children(elt)
    childtext = strip_tags(child)
    childstring = childtext.replace(" ", "")
    count += len(childstring)

# NOTE: in the question walkAll is tree.iterchildren(), which yields only the
# root's direct children.  If the whole text sits inside one of them, no
# element is left to blank once the limit is passed; walk a deeper level of
# the tree if finer-grained truncation is wanted.

Python：在for循环内嵌套的if/else中使用break

1 个答案: