Hashing web-scraping results with Beautiful Soup

Time: 2018-12-14 20:44:47

Tags: python-3.x hash web-scraping beautifulsoup

I am trying to scrape data from several URLs, hash the scraped results, compare the hashes, and notify myself whenever new content is posted on a given site. I have managed to hash each URL's content, but I am having trouble comparing the hashes. What is the best way to solve this? The relevant part of the code is below:

    import argparse
    import datetime
    import difflib
    import hashlib
    import logging
    import time

    import html2text
    import requests
    from bs4 import BeautifulSoup as Soup


    def scrape_urls():
        # accounts: the list of URLs to monitor (built elsewhere)
        for i in accounts:
            url_data = requests.get(i)
            url_data = Soup(url_data.content, "html5lib")
            url_data = url_data.find("div", attrs={"class": "results"})
            url_data = str(url_data)
            url_data = html2text.html2text(url_data)

            list_data = [url_data]

            for l in list_data:
                md5_hash = hashlib.md5()
                md5_hash.update(l.encode("utf-8"))
                result = md5_hash.hexdigest()

                logger.info(("Start Monitoring... hash "
                             "{url_hash}").format(url_hash=result))

                time.sleep(args.time)

                # This is where I am stuck: `result` is compared with
                # itself, so the condition is always True.
                if result == result:
                    logger.info("Nothing has changed")
                    # return False
                else:
                    logger.info("Something has changed")
                    date = datetime.datetime.now().strftime("%d.%m.%Y %H:%M:%S")
                    print("The Url {l} has changed at {date}.".format(l=l, date=date))
                    if not args.nodiff:
                        new_result = ""
                        new_content = l
                        # SequenceMatcher wants two sequences; here only
                        # the new content is available to diff against.
                        s = difflib.SequenceMatcher(None, new_content)
                        for tag, i1, i2, j1, j2 in s.get_opcodes():
                            if tag in ("insert", "replace"):
                                new_result += new_content[j1:j2]
                        diff = new_result
                        logger.info("{diff}".format(**locals()))
                        print(diff)
                    # return True


    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        parser.add_argument("-t", "--time",
                            help="seconds between checks (default: 600)",
                            default=20, type=int)
        parser.add_argument("-nd", "--nodiff", help="show no difference",
                            action="store_true")

        args = parser.parse_args()

        logging.basicConfig(format="%(asctime)s %(message)s",
                            datefmt="%d.%m.%Y %H:%M:%S")
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)

        scrape_urls()
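
In other words, the comparison I am after needs state from the previous polling round: remember the last hash (and, for the diff, the last text) for each URL, then compare the new values against those. Below is a minimal sketch of that pattern; the `previous` dict and the `check_for_change` helper are hypothetical names, not part of my script:

    import difflib
    import hashlib

    # Hypothetical store that survives between polling rounds:
    # url -> (last md5 hex digest, last text)
    previous = {}

    def check_for_change(url, new_text):
        """Return the newly added text, or None if nothing changed."""
        new_hash = hashlib.md5(new_text.encode("utf-8")).hexdigest()
        old_hash, old_text = previous.get(url, (None, ""))
        previous[url] = (new_hash, new_text)
        if old_hash is None or new_hash == old_hash:
            return None  # first visit, or content unchanged
        # SequenceMatcher diffs *two* sequences: old text against new text
        matcher = difflib.SequenceMatcher(None, old_text, new_text)
        return "".join(new_text[j1:j2]
                       for tag, i1, i2, j1, j2 in matcher.get_opcodes()
                       if tag in ("insert", "replace"))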

Update

    class scrape():

        def __init__(self, url):
            self.list_data = url
            self.url_hash = self.hash()
            self.get_content = self.scrape_urls()
            logger.info(("Start Monitoring... hash "
                         "{url_hash}").format(url_hash=self.url_hash))

        def hash(self):
            # NOTE: self.list_data is a single URL string, so this loop
            # hashes one character at a time and keeps only the last digest
            for l in self.list_data:
                a = hashlib.md5()
                a.update(l.encode("utf-8"))

            return a.hexdigest()

        def scrape_urls(self):
            url_data = requests.get(self.list_data)
            url_data = Soup(url_data.content, "html5lib")
            url_data = url_data.find("div", attrs={"class": "results"})
            url_data = str(url_data)
            url_data = html2text.html2text(url_data)

            list_data = [url_data]

            for l in list_data:

                time.sleep(args.time)

                # hash() digests the URL, not the scraped content, so this
                # comparison never reports a change
                if self.hash() == self.url_hash:
                    logger.info("Nothing has changed")
                    return False
                else:
                    logger.info("Something has changed")
                    date = datetime.datetime.now().strftime("%d.%m.%Y %H:%M:%S")
                    print("The Url {url} has changed at {date}.".format(
                        url=self.list_data, date=date))
                    return True


    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        parser.add_argument("-t", "--time",
                            help="seconds between checks (default: 600)",
                            default=5, type=int)

        args = parser.parse_args()

        logging.basicConfig(format="%(asctime)s %(message)s",
                            datefmt="%d.%m.%Y %H:%M:%S")
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)

        # load the list of URLs to monitor from the spreadsheet
        links_list = pd.read_excel('urls_list.xlsx')
        loc_subset = links_list.to_dict('records')

        accounts = []
        for l in loc_subset:
            url = str(l['link '])
            accounts.append(url)

        for i in accounts:
            print(i)
            url = scrape(i)
            time.sleep(args.time)
            while True:
                if url.scrape_urls():
                    break
                time.sleep(args.time)
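
For reference, this is roughly the shape I think the class needs: hash the scraped content rather than the URL, and keep the previous digest on the instance so each poll has something real to compare against. This is only a sketch under those assumptions; `Monitor`, `fetch`, and `check` are made-up names:

    import hashlib

    import html2text
    import requests
    from bs4 import BeautifulSoup as Soup


    class Monitor:
        def __init__(self, url):
            self.url = url
            self.last_hash = None  # digest from the previous poll

        def fetch(self):
            # same scraping steps as above: grab the results div as text
            page = requests.get(self.url)
            soup = Soup(page.content, "html5lib")
            results = soup.find("div", attrs={"class": "results"})
            return html2text.html2text(str(results))

        def check(self):
            # hash the *content*, then compare with the stored digest
            new_hash = hashlib.md5(self.fetch().encode("utf-8")).hexdigest()
            changed = self.last_hash is not None and new_hash != self.last_hash
            self.last_hash = new_hash
            return changed

With something like this, the polling loop in __main__ would just call check() on each round and act whenever it returns True.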

0 Answers:

No answers