Python抓取JSON比较文件

时间:2018-11-30 13:29:40

标签: python json web-scraping

这是我的python代码,它两次在网页上搜索以获取产品详细信息并将数据保存在.json文件中。它应该检查新文件中的密钥是否更改并打印更改内容,但是出现以下错误。

错误:

 Traceback (most recent call last):
    File "x.py", line 84, in <module>
    compare()
    File "x.py", line 76, in compare
    for key in b.keys():
    AttributeError: 'NoneType' object has no attribute 'keys'

代码:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import cfscrape
import requests
from bs4 import BeautifulSoup as bs
import re
from pprint import pprint
import json

s = requests.Session()
s = cfscrape.create_scraper()

products = []
products1 = []

def x():
    r = s.get("https://www.oneblockdown.it/it/calzature-sneakers", headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36"})
    soup = bs(r.content, "html.parser")

    js = [x.text for x in soup.find_all('script', {'type': 'text/javascript'}) if "var preloadedItems =" in x.text][0]
    js = js.replace('var preloadedItems = ', '')
    js = js[:js.find("}];")]+"}]".strip()
    data = json.loads(js)
    for product in data:
        product_id = product["id"]
        product_title = product["title"]
        product_link = product["permalink"]
        product_price = product["displayPrice"]
        product_available = product["isAvailable"]
        product_size = product["attributes"]
        products.append({
            "product_id": product_id,
            "product_title": product_title,
            "product_link": product_link,
            "product_price": product_price,
            "product_available": product_available,
            "product_size": product_size
        })

    with open('data.json', 'w') as f:
        json.dump(products, f, indent = 4)
        f.close()

def y():
    r1 = s.get("https://www.oneblockdown.it/it/calzature-sneakers", 
    headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36"})
    soup = bs(r1.content, "html.parser")

    js = [x.text for x in soup.find_all('script', {'type': 'text/javascript'}) if "var preloadedItems =" in x.text][0]
    js = js.replace('var preloadedItems = ', '')
    js = js[:js.find("}];")]+"}]".strip()
    data1 = json.loads(js)
    for product in data1:
        product_id = product["id"]
        product_title = product["title"]
        product_link = product["permalink"]
        product_price = product["displayPrice"]
        product_available = product["isAvailable"]
        product_size = product["attributes"]
        products1.append({
            "product_id": product_id,
            "product_title": product_title,
            "product_link": product_link,
            "product_price": product_price,
            "product_available": product_available,
            "product_size": product_size
        })

    with open('data1.json', 'w') as f:
        json.dump(products, f, indent = 4)
        f.close()


def compare():
    while True:
        a = x()
        b = y()
        for key in b.keys():
            value = b[key]
            if key not in a:
                print(key, value)
            else:
                if a[key] != value:
                    print("for key {} values are different".format(key))

compare()

我选择了这种方法,但我不知道是否有更好的方法可用于此目的。

1 个答案:

答案 0 :(得分:0)

您不会从DocumentTypex()方法返回任何内容。因此,y()a的类型为b

您很可能希望从Noneproducts中删除x()列表,因此在该方法中添加return语句。

赞:

y()