我尝试编写一个定义类的代码:Webpage和crwaler。 对象:构建通用搜索引擎。 我在我的电脑上找到了一个带有几个" web_pages"的文件夹。以html文件的形式。 .im对所有代码感到抱歉,但我不知道如何在没有它的情况下使这个问题具体化。
import re
import os
def remove_html_tags(s):
tag = False
quote = False
out = ""
for c in s:
if c == '<' and not quote:
tag = True
elif c == '>' and not quote:
tag = False
elif (c == '"' or c == "'") and tag:
quote = not quote
elif not tag:
out = out + c
return out
def lev(s1, s2):
return lev_iter(s1, s2, dict())
def lev_iter(s1, s2, mem):
(i,j) = (len(s1), len(s2))
if (i,j) in mem:
return mem[(i,j)]
s1_low = s1.lower()
s2_low = s2.lower()
if len(s1_low) == 0 or len(s2_low) == 0:
return max(len(s1_low), len(s2_low))
d1 = lev_iter(s1_low[:-1], s2_low, mem) + 1
d2 = lev_iter(s1_low, s2_low[:-1], mem) + 1
last = 0 if s1_low[-1] == s2_low[-1] else 1
d3 = lev_iter(s1_low[:-1], s2_low[:-1], mem) + last
result = min(d1, d2, d3)
mem[(i,j)] = result
return result
""" A Class that holds data on a Web page """
class WebPage:
def __init__(self, filename):
self.filename = filename
def process(self):
f = open(self.filename,'r')
LINE_lst = f.readlines()
self.info = {}
for i in range(len(LINE_lst)):
LINE_lst[i] = LINE_lst[i].strip(' \n\t')
LINE_lst[i] = remove_html_tags(LINE_lst[i])
lines = LINE_lst[:]
for line in lines:
if len(line) == 0:
LINE_lst.remove(line)
self.body = ' '.join(LINE_lst[1:])
self.title = LINE_lst[0]
f.close()
def __str__(self):
return self.title + '\n' + self.body
def __repr__(self):
return self.title
def __eq__(self,other):
n = lev(self.body,other.body)
k = len(self.body)
m = len(other.body)
return float(n)/max(k,m) <= 0.15
def __lt__(self,other):
return self.title < other.title
""" A Class that crawls the web """
class Crawler:
def __init__(self, directory):
self.folder = directory
def crawl(self):
pages = [f for f in os.listdir(self.folder) if f.endswith('.html')]
final_list = []
for page in pages:
page = WebPage(self.folder + '\\' + page)
page.process()
for k in range(len(final_list)+1):
if k == len(final_list):
final_list.append(page)
elif page == final_list[k]:
if page < final_list[k]:
final_list.remove(final_list[k])
final_list.append(page)
self.crawl = final_list
嗯..除了crawl
方法之外,一切都有效。
也许我做错了我不知道。
我想在k等于final_list的长度时打破循环,但不打破包含它的循环。
任何sugestions?
答案 0 :(得分:6)
这就是break
关键字所做的。它只会打破最里面的循环。