I'm trying to scrape data from several URLs, hash the scraped results, compare the hashes, and notify myself whenever new content is published on a given site. I've managed to hash the content of each URL, but I'm having trouble comparing the hashes. What is the best way to solve this? The relevant part of the code is below:
import argparse
import datetime
import difflib
import hashlib
import logging
import time

import html2text
import requests
from bs4 import BeautifulSoup as Soup


for i in accounts:
    url_data = requests.get(i)
    url_data = Soup(url_data.content, "html5lib")
    url_data = url_data.find("div", attrs={"class": "results"})
    url_data = str(url_data)
    url_data = html2text.html2text(url_data)
    list_data = list([url_data])
    for l in list_data:
        # print(l)
        url_data = l.encode("utf-8")
        md5_hash = hashlib.md5()
        md5_hash.update(url_data)
        result = md5_hash.hexdigest()
        logger.info(("Start Monitoring... hash "
                     "{url_hash}").format(url_hash=result))
        time.sleep(args.time)
        # This is where the comparison goes wrong: result is compared
        # to itself, so this branch is always taken.
        if(result == result):
            logger.info("Nothing has changed")
            # return False
        else:
            logger.info("Something has changed")
            date = datetime.datetime.now().strftime("%d.%m.%Y %H:%M:%S")
            print(f"The Url {l} has changed at {date}.")
            if(not args.nodiff):
                new_result = ""
                new_content = list_data
                # SequenceMatcher is given only one sequence here,
                # so there is nothing to diff against.
                s = difflib.SequenceMatcher(None, new_content)
                for tag, i1, i2, j1, j2 in s.get_opcodes():
                    if(tag == "insert" or tag == "replace"):
                        new_result += new_content[j1:j2]
                diff = new_result
                logger.info("{diff}".format(**locals()))
                print(diff)
                # return True
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--time",
                        help="seconds between checks (default: 600)",
                        default=20, type=int)
    parser.add_argument("-nd", "--nodiff",
                        help="show no difference", action="store_true")
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s %(message)s",
                        datefmt="%d.%m.%Y %H:%M:%S")
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    scrape_urls()
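For reference, the comparison fails because `result == result` compares the new hash with itself, and `difflib.SequenceMatcher` is only given one sequence. Below is a minimal sketch of one way to fix both, keeping the previous hash and previous text per URL; the `fetch_text` helper, the `monitor` function, and the dictionary names are illustrative, not from the original code:

import difflib
import hashlib
import time

import html2text
import requests
from bs4 import BeautifulSoup as Soup


def fetch_text(url):
    # Illustrative helper: scrape the "results" div and flatten it to text.
    page = requests.get(url)
    soup = Soup(page.content, "html5lib")
    return html2text.html2text(str(soup.find("div", attrs={"class": "results"})))


def monitor(accounts, interval):
    last_hash = {}  # url -> hex digest from the previous check
    last_text = {}  # url -> text from the previous check, kept for diffing
    while True:
        for url in accounts:
            text = fetch_text(url)
            digest = hashlib.md5(text.encode("utf-8")).hexdigest()
            if url not in last_hash:
                print("Baseline for {}: {}".format(url, digest))
            elif digest == last_hash[url]:
                print("Nothing has changed on {}".format(url))
            else:
                print("Something has changed on {}".format(url))
                # Diffing needs two sequences: the old text and the new text.
                s = difflib.SequenceMatcher(None, last_text[url], text)
                for tag, i1, i2, j1, j2 in s.get_opcodes():
                    if tag in ("insert", "replace"):
                        print(text[j1:j2])
            last_hash[url] = digest
            last_text[url] = text
        time.sleep(interval)

The first pass for a URL only records a baseline; every later pass compares against the hash recorded on the previous pass rather than against itself.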
Update:
import pandas as pd


class scrape():
    def __init__(self, url):
        self.list_data = url
        self.url_hash = self.hash()
        self.get_content = self.scrape_urls()
        logger.info(("Start Monitoring... hash "
                     "{url_hash}").format(url_hash=self.url_hash))

    def hash(self):
        # Note: self.list_data is the URL string, so this loop iterates
        # over its characters and returns after hashing only the first one.
        for l in self.list_data:
            a = hashlib.md5()
            a.update(l.encode("utf-8"))
            return a.hexdigest()

    def scrape_urls(self):
        url_data = requests.get(self.list_data)
        url_data = Soup(url_data.content, "html5lib")
        url_data = url_data.find("div", attrs={"class": "results"})
        url_data = str(url_data)
        url_data = html2text.html2text(url_data)
        list_data = list([url_data])
        for l in list_data:
            time.sleep(args.time)
            # Note: both sides hash the URL, not the scraped content,
            # so this comparison can never report a change.
            if(self.hash() == self.url_hash):
                logger.info("Nothing has changed")
                return False
            else:
                logger.info("Something has changed")
                date = datetime.datetime.now().strftime("%d.%m.%Y %H:%M:%S")
                print(f"The Url {self.list_data} has changed at {date}.")
                return True
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--time",
                        help="seconds between checks (default: 600)",
                        default=5, type=int)
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s %(message)s",
                        datefmt="%d.%m.%Y %H:%M:%S")
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # read_excel takes no index argument (that belongs to to_excel)
    links_list = pd.read_excel('urls_list.xlsx')
    loc_subset = links_list.to_dict('records')
    accounts = []
    for l in loc_subset:
        url = str(l['link '])
        accounts.append(url)
    for i in accounts:
        print(i)
        url = scrape(i)
        time.sleep(args.time)
        while True:
            if(url.scrape_urls()):
                break
            time.sleep(args.time)
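In the update the same problem remains: `hash()` digests the URL string (and only its first character, because of the early `return`), so `self.hash() == self.url_hash` always compares the URL against itself. Below is a minimal sketch of one way to restructure the class so the baseline hash comes from the scraped content and each `scrape_urls()` call re-scrapes and compares; method names other than `scrape_urls` are illustrative, and it assumes the same imports and logger setup as the snippet above:

import datetime
import hashlib
import logging

import html2text
import requests
from bs4 import BeautifulSoup as Soup

logger = logging.getLogger()


class scrape():
    def __init__(self, url):
        self.url = url
        # Hash the scraped content once to record a baseline.
        self.url_hash = self.hash(self.get_content())
        logger.info("Start Monitoring... hash {}".format(self.url_hash))

    def get_content(self):
        page = requests.get(self.url)
        soup = Soup(page.content, "html5lib")
        return html2text.html2text(str(soup.find("div", attrs={"class": "results"})))

    def hash(self, text):
        return hashlib.md5(text.encode("utf-8")).hexdigest()

    def scrape_urls(self):
        # Re-scrape and compare the new hash against the stored baseline.
        new_hash = self.hash(self.get_content())
        if new_hash == self.url_hash:
            logger.info("Nothing has changed")
            return False
        logger.info("Something has changed")
        date = datetime.datetime.now().strftime("%d.%m.%Y %H:%M:%S")
        print("The Url {} has changed at {}.".format(self.url, date))
        self.url_hash = new_hash  # new baseline for the next check
        return True

The `__main__` loop from the update then works unchanged: `scrape(i)` records the baseline, and each `url.scrape_urls()` call returns True only when the content's hash actually differs.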