我开发了一个Web抓取工具,它可以浏览一个类似Facebook的网站(Lang-8)上的用户个人资料页面并保存所需的数据。但是,我不知道如何实现这样一种机制:万一PC崩溃,代码能够从它最后扫描的那个个人资料处恢复运行。
import requests
from bs4 import BeautifulSoup
# Walk Lang-8 profile pages in numeric order, starting at profile 1, and hand
# the journal listing of every user who studies Polish to open_article().
# NOTE(review): `max_profiles` is not defined anywhere in this snippet, and
# `open_article` is defined further down the file; both must be bound before
# this loop runs - confirm against the full script.
profile = 1
while profile <= max_profiles:
    url = "http://lang-8.com/" + str(profile)
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, features="html.parser")
    # Each <dd class="studying_lang_name"> holds one language the user studies.
    for lang in soup.findAll('dd', {'class': 'studying_lang_name'}):
        lang1 = str(lang.string)
        # The comparison is against the exact text, surrounding newlines
        # included.
        if lang1 == "\n\nPolish\n":
            journal = str(url) + "/journals"
            open_article(journal)
    # Advance once per profile, whether or not it matched.
    profile += 1
def open_article(url2):
    """Visit pages 1-4 of a journal listing and save every linked entry.

    Args:
        url2: URL of a journal listing; ``?page=N`` is appended for each
            page requested.

    Each ``<h3 class="journal_title">`` on a page is expected to wrap a link
    to a journal entry; that link is passed to ``file_create``.
    """
    for in_page in range(1, 5):
        source_code = requests.get(url2 + "?page=" + str(in_page))
        soup = BeautifulSoup(source_code.text, features="html.parser")
        for link in soup.findAll('h3', {'class': 'journal_title'}):
            anchor = link.find('a')
            href1 = anchor.get("href") if anchor is not None else None
            if href1 is None:
                # A title without a usable link would otherwise raise
                # AttributeError or pass the string "None" on as a URL.
                continue
            file_create(str(href1))
def file_create(linked):
    """Download one journal entry and write its original text to a file.

    Args:
        linked: URL of the journal entry page.

    The output file name is the last three characters of ``linked`` in
    reverse order, then ``_``, the text of the page's ``corrections_num``
    element, and ``_.txt``. The file is written as UTF-8 in the current
    working directory. Nothing is written when the page has no
    ``corrections_num`` element.
    """
    source_code = requests.get(linked)
    soup = BeautifulSoup(source_code.text, features="html.parser")

    # NOTE(review): the original indentation was lost, so it is unclear
    # whether the two loops were nested or sequential. This version uses the
    # last corrections element for every body block - confirm.
    correction_items = soup.findAll('li', {'class': 'corrections_num'})
    if not correction_items:
        # Without this guard `corrections` would be unbound below.
        return
    corrections = correction_items[-1].text

    file_name = (linked[-1] + linked[-2] + linked[-3] + "_" + corrections +
                 "_.txt")
    for content in soup.findAll('div', {'id': 'body_show_ori'}):
        # `with` closes the file even if the write raises.
        with open(file_name, 'w', encoding='utf-8') as fout:
            fout.write(content.text)
答案 0 :(得分:0)
每抓取完一个个人资料后,我会创建并更新一个进度文件。
在你的 `profile += 1` 之后添加类似下面的内容:
# Record the current value of `profile` so a restarted run can pick up from
# it. `with` closes the file even if the write raises.
with open("progress.txt", "w") as fprogress:
    fprogress.write("%d" % profile)
然后在程序启动时,把原来将 profile 设置为 1 的那一行改为:
# Resume from the profile number stored in progress.txt, or start at 1.
try:
    with open("progress.txt", "r") as fprogress:
        profile = int(fprogress.read())
except (FileNotFoundError, ValueError):
    # Either there is no progress file yet (first run), or its content is
    # not a valid integer - for example it was left empty by a crash in the
    # middle of a write. In both cases start from the first profile.
    profile = 1