Question

我尝试编写一个定义类的代码：Webpage和crwaler。对象：构建通用搜索引擎。我在我的电脑上找到了一个带有几个＆＃34; web_pages＆＃34;的文件夹。以html文件的形式。 .im对所有代码感到抱歉，但我不知道如何在没有它的情况下使这个问题具体化。

import re
import os

def remove_html_tags(s):
    tag = False
    quote = False
    out = ""

    for c in s:
            if c == '<' and not quote:
                tag = True
            elif c == '>' and not quote:
                tag = False
            elif (c == '"' or c == "'") and tag:
                quote = not quote
            elif not tag:
                out = out + c

    return out


def lev(s1, s2):
    return lev_iter(s1, s2, dict())

def lev_iter(s1, s2, mem):

    (i,j) = (len(s1), len(s2))
    if (i,j) in mem:
        return mem[(i,j)]

    s1_low = s1.lower()
    s2_low = s2.lower()
    if len(s1_low) == 0 or len(s2_low) == 0:
        return max(len(s1_low), len(s2_low))
    d1 = lev_iter(s1_low[:-1], s2_low, mem) + 1
    d2 = lev_iter(s1_low, s2_low[:-1], mem) + 1
    last = 0 if s1_low[-1] == s2_low[-1] else 1
    d3 = lev_iter(s1_low[:-1], s2_low[:-1], mem) + last
    result = min(d1, d2, d3)

    mem[(i,j)] = result

    return result




""" A Class that holds data on a Web page """
class WebPage:

    def __init__(self, filename):

        self.filename = filename

    def process(self):

        f = open(self.filename,'r')
        LINE_lst = f.readlines()

        self.info = {}

        for i in range(len(LINE_lst)):
            LINE_lst[i] = LINE_lst[i].strip(' \n\t')
            LINE_lst[i] = remove_html_tags(LINE_lst[i])
        lines = LINE_lst[:]
        for line in lines:
            if len(line) == 0:
                LINE_lst.remove(line)
        self.body = ' '.join(LINE_lst[1:])
        self.title = LINE_lst[0]
        f.close()

    def __str__(self):
        return self.title + '\n' + self.body

    def __repr__(self):
        return self.title

    def __eq__(self,other):
        n = lev(self.body,other.body)
        k = len(self.body)
        m = len(other.body)
        return float(n)/max(k,m) <= 0.15

    def __lt__(self,other):
        return self.title < other.title

""" A Class that crawls the web """     
class Crawler:
    def __init__(self, directory):

        self.folder = directory

    def crawl(self):

        pages = [f for f in os.listdir(self.folder) if f.endswith('.html')]

        final_list = []

        for page in pages:

            page = WebPage(self.folder + '\\' + page)
            page.process()

            for k in range(len(final_list)+1):

                if k == len(final_list):

                    final_list.append(page)

                elif page == final_list[k]:
                    if page < final_list[k]:
                        final_list.remove(final_list[k])
                        final_list.append(page)

        self.crawl = final_list

嗯..除了crawl方法之外，一切都有效。也许我做错了我不知道。我想在k等于final_list的长度时打破循环，但不打破包含它的循环。任何sugestions？

Answer 1

这就是break关键字所做的。它只会打破最里面的循环。

如何打破循环而不破坏包含它的循环？

1 个答案: