Python:嵌套在for循环中的if/else里如何跳出(break)循环

时间:2014-05-03 07:57:01

标签: python if-statement for-loop counter

我正在尝试使用下面的代码中的计数器运行循环。

import sys
import imaplib
import getpass
import email
import email.header
import datetime
from bs4 import BeautifulSoup 
import re
from lxml import etree, html
from HTMLParser import HTMLParser


# Account name handed to the IMAP login call at the bottom of the script.
EMAIL_ACCOUNT = "sample@gmail.com"
# Name of the mailbox that is selected and processed after login.
EMAIL_FOLDER = "INBOX"


def stringify_children(node):
    """
    Return the inner markup of ``node`` as a single string.

    The result is the node's leading text, followed by the serialisation of
    every child (each child's own tail text included), followed by the
    node's own tail.

    node -- an lxml element.
    """
    from lxml.etree import tostring

    # tostring() already serialises a child's inner text and, by default
    # (with_tail=True), the tail that follows it.  Adding c.text and c.tail
    # separately, as the previous version did, duplicated both.
    parts = [node.text]
    parts.extend(tostring(child) for child in node)
    parts.append(node.tail)
    # filter removes possible Nones in texts and tails
    return ''.join(filter(None, parts))

def process_mailbox(M):
    """
    Do something with emails messages in the folder.  
    For the sake of this example, print some headers.
    """

    rv, data = M.search(None, "ALL")
    if rv != 'OK':
        print "No messages found!"
        return
    for num in data[0].split(): 
        rv, data = M.fetch(num, '(RFC822)')
        if rv != 'OK':
            print "ERROR getting message", num
            return

        msg = email.message_from_string(data[0][1])

        if msg.is_multipart():
            html = None
            print "Checking for html or text"
            for part in msg.get_payload():
                if part.get_content_charset() is None:
                    charset = chardet.detect(srt(part))['encoding']
                else:
                    charset = part.get_content_charset()
                if part.get_content_type() == 'text/plain':
                    text = unicode(part.get_payload(decode=True),str(charset),"ignore").encode('utf8','replace')
                    f = open('email.txt', 'w')
                    f.write(text)
                    f.close
                if part.get_content_type() == 'text/html':
                    html = unicode(part.get_payload(decode=True),str(charset),"ignore").encode('utf8','replace')
                    f = open('email.html','w')
                    f.write(html)
                    f.close
                if part.get('Content-Disposition') is None:
                    continue

                filename = part.get_filename()

                if not os.path.isfile(filename) :
                    fp = open(filename, 'wb')
                    fp.write(part.get_payload(decode=True))
                    fp.close()
                    return 0

            if html is None:
                return text.strip()
            else:
                return html.strip()


class MLStripper(HTMLParser):
    """HTML parser that keeps only the character data it is fed."""

    def __init__(self):
        # Collected text fragments, in the order the parser reports them.
        self.fed = []
        self.reset()

    def handle_data(self, data):
        """Record one run of character data."""
        self.fed += [data]

    def get_data(self):
        """Return every recorded fragment concatenated into one string."""
        collected = ''
        return collected.join(self.fed)

def strip_tags(html):
    """Feed ``html`` through an MLStripper and return the collected text."""
    stripper = MLStripper()
    stripper.feed(html)
    text_only = stripper.get_data()
    return text_only





def parse_html(limit=300):
    """
    Normalise the tags in 'email.html' and blank the text past a size limit.

    The file is parsed with BeautifulSoup, a fixed set of tag names is
    mapped onto equivalents (e.g. i -> em, b -> strong, h1/h2 -> h3), and
    the prettified result is re-parsed with lxml.  The tree is then walked
    in document order while counting non-whitespace characters; once the
    running total exceeds ``limit``, every later text and tail is replaced
    by an empty string.  The serialised tree replaces the file's content.

    limit -- number of non-whitespace characters to keep before blanking
             starts (default 300).  The piece of text that crosses the
             limit is kept whole.
    """
    tag_aliases = {
        'i': 'em',
        'cite': 'em',
        'b': 'strong',
        'kbd': 'strong',
        'var': 'strong',
        'aside': 'blockquote',
        'code': 'pre',
        'samp': 'pre',
        's': 'del',
        'h1': 'h3',
        'h2': 'h3',
    }

    with open('email.html', 'r') as source:
        soup = BeautifulSoup(source)

    for tag in soup.findAll(True):
        tag.name = tag_aliases.get(tag.name, tag.name)

    # prettify() yields unicode; encode explicitly so non-ASCII mail does
    # not fail in an implicit ASCII conversion.
    tree = etree.fromstring(soup.prettify().encode('utf-8'))

    count = 0
    # 'start' fires before an element's children and 'end' after them, so
    # handling .text on start and .tail on end follows document order.
    for event, elt in etree.iterwalk(tree, events=('start', 'end')):
        if event == 'start':
            if elt.tag in (etree.Comment, etree.ProcessingInstruction):
                # The .text of these nodes is not document text.
                continue
            attr = 'text'
        else:
            attr = 'tail'
        piece = getattr(elt, attr)
        if not piece:
            continue
        if count > limit:
            # Modify the element that is actually in the tree; building a
            # new Element here would leave the document unchanged.
            setattr(elt, attr, '')
        else:
            count += len(''.join(piece.split()))

    results = etree.tostring(tree.getroottree())
    # Reopen for writing so the result replaces the old content instead of
    # being appended after it.
    with open('email.html', 'w') as target:
        target.write(results)





M = imaplib.IMAP4_SSL('imap.gmail.com')

try:
    rv, data = M.login(EMAIL_ACCOUNT, getpass.getpass())
except imaplib.IMAP4.error:
    print "LOGIN FAILED!!! "
    sys.exit(1)

print rv, data

rv, mailboxes = M.list()
if rv == 'OK':
    print "Mailboxes:"
    print mailboxes

rv, data = M.select(EMAIL_FOLDER)
if rv == 'OK':
    print "Processing mailbox...\n"
    process_mailbox(M)
    M.close()
    parse_html()
else:
    print "ERROR: Unable to open mailbox ", rv

M.logout()

但是我的循环没有按预期工作,我不明白为什么。当计数达到300时,应该执行else分支中的代码,但程序仍在继续遍历元素并累加计数。

我希望在计数达到300之后对每个元素执行else语句。结果将影响到300之后的元素,并且在计数达到300之前迭代的元素不会受到影响。我在else语句中放了一个打印“test”来测试它并且它没有打印出来。

此代码的目的是获取电子邮件并统计其中的文本字符数;当总字符数达到某个阈值时(在本例中为300),就执行else分支,清除剩余元素中的文本。300是我随意选的,这个文本上限也可以是50。

count = 0
for elt in walkAll:
    if count <= 300: 
       child = stringify_children(elt)
       childtext = strip_tags(child)
       childstring = childtext.replace(" ", "")
           for i in childstring:
               count = count + len(i)
               print count

    else:
        root = etree.Element("elt")
        root.text = ''
        root.tail = ''

修改

我可以在count = count + len(i)之后插入一个if break条件,这将停止循环,如下所示:

# Running total of non-space characters seen so far.
count = 0
for elt in walkAll:
    if count <= 300: 
        child = stringify_children(elt)
        childtext = strip_tags(child)
        childstring = childtext.replace(" ", "")
        # i is a single character, so count grows by exactly 1 per pass.
        for i in childstring:
            count = count + len(i)
            if count >= 300:
                # break leaves only this inner character loop; the outer
                # loop over walkAll carries on with the next element.
                # NOTE(review): count is exactly 300 at this point, so
                # `count <= 300` is still true for the next element.
                break
            print count

    else:
        # A new, detached element is created and emptied here; elt itself
        # is never modified.
        root = etree.Element("elt")
        root.text = ''
        root.tail = ''

这样可以中断(break)计数,但else分支仍然不会执行

1 个答案:

答案 0 :(得分:0)

不确定我是否完全理解你想要做的事情,但根据你上面的代码,这应该有用。

# The total must accumulate across elements, so it is initialised once,
# outside the loop.
count = 0
for elt in walkAll:
    if count >= 300:
        # Limit reached on an earlier element: blank this element and
        # everything nested inside it.  The elements of the tree are
        # modified directly; a new etree.Element would not be part of it.
        for node in elt.iter():
            node.text = ''
            node.tail = ''
        continue
    child = stringify_children(elt)
    childtext = strip_tags(child)
    childstring = childtext.replace(" ", "")
    count += len(childstring)