Every time I change the object's limit attribute and then call the crawl function, the code terminates. However, if I don't touch the limit attribute, the code works:
from api import Crawler

if __name__ == "__main__":
    mycrawler = Crawler("http://metasozluk.com/")
    mycrawler.crawl()
Let me explain the code. This is the (so-called) simple crawler I'm trying to write. The limit[0] attribute is the limit on the number of pages to crawl. Whenever the crawl function finishes a page, as you can see at line 54 of api.py, limit[1] is incremented by 1. From the 26th to the 31st lines of api.py, I check whether a limit is set and, if it is, whether limit[0] and limit[1] are equal (and return from the function if they are).
However, if I set the limit in app.py, the code does not run the mycrawler.crawl() function and just terminates, while if I don't, it works. I have no idea what's wrong here.
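
In other words, the failing variant of app.py is presumably just the following (the limit value 5 is only an example, taken from the snippet at the bottom of the answer's code below):

from api import Crawler

if __name__ == "__main__":
    mycrawler = Crawler("http://metasozluk.com/")
    mycrawler.limit[0] = 5  # setting a limit here is what makes crawl() stop immediately
    mycrawler.crawl()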
What I want to do is this:

Answer 0 (score: 1):

EDIT: Sorry, I (think I) had to edit the code a bit. :/

Does this work for you?
import re, requests, logging, os, time, json
from bs4 import BeautifulSoup as bs

class Crawler(object):
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(os.getcwd()+"/logs/{}.log".format( str(int(round(time.time()*1000))) ))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, *args):
        self.logger.info("Initializing Crawler object on '{}' domain with {} urls.".format(args[0], str(args)))
        self.domain = args[0]
        self.urls = list(args)
        self.crawled = []
        self.limit = [0, 0]
        self.dump_file = "urls.json"
    def crawl(self):
        # while there are urls left in self.urls
        while self.urls:
            # Check the limit at the top of every iteration, so it also works
            # when limit[0] is set after the object has been constructed.
            if self.limit[0] != 0:
                if self.limit[0] == self.limit[1]:
                    self.logger.info("Limit reached, writing to file and returning.")
                    with open(self.dump_file, "w") as dump_file:
                        dump_file.write(json.dumps(self.urls))
                    return
            try:
                self.logger.info("Connecting to {}...".format(self.urls[0]))
                response = requests.get(self.urls[0])
                response.encoding = "utf-8"
                self.logger.info("Analyzing page structure...")
                soup = bs(response.text, "html.parser")
                links = soup.find_all("a", {"href": re.compile("^/")})
                hrefs = [x.attrs["href"] for x in links]
                self.logger.info("Checking whether links have already been crawled...")
                for href in hrefs:
                    if self.domain[0:-1]+href in self.crawled:
                        self.logger.warning("{} already crawled.".format(str(href)))
                    else:
                        self.urls.append(self.domain[0:-1]+href)
                self.crawled.append(self.urls[0])
                # Remove the url we just processed from the front of the queue
                # (popping from the reversed slice self.urls[::-1] would only
                # modify a copy and leave self.urls unchanged).
                self.urls.pop(0)
                self.limit[1] += 1
                self.logger.info("Reached count {}/{}".format(str(self.limit[1]), str(self.limit[0])))
            except Exception as e:
                self.logger.error("Crawling function raised an error, passing: {}".format(str(e)))
            if len(self.urls) == 0:
                self.logger.info("No url left to crawl, writing to file and returning.")
                with open(self.dump_file, "w+") as dump_file:
                    dump_file.write(json.dumps(self.urls))
                return
if __name__ == "__main__":
    mycrawler = Crawler("http://metasozluk.com/")
    mycrawler.limit[0] = 5
    mycrawler.crawl()
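
As a side note, with the limit check performed at the top of each loop iteration, setting mycrawler.limit[0] = 5 before calling crawl() should stop the crawl after five pages and dump whatever is left of the queue to urls.json (the name comes from self.dump_file). A quick, optional way to inspect that dump afterwards would be something like:

import json

# Load the frontier that crawl() wrote when the limit was reached
with open("urls.json") as f:
    remaining = json.load(f)
print("{} urls left in the queue".format(len(remaining)))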