如何在 Python 中删除 JSON 字符串中的一部分

时间:2017-01-31 15:41:23

标签: python json

我有一个用 Python 写的爬虫,其中一个函数会获取包含 dataLayer 的 script 标签,对其中的字符串进行格式化处理,然后将其转换为 JSON。

这是dataLayer:

dataLayer = [{

    "site": {
        "isMobile": false,
        "source": (function() {
            var userAgent = navigator.userAgent.toLocaleLowerCase();
            var source = "web";
            resultMatch = userAgent.match(/\[olx-source\/(\w+);/);
            if (resultMatch) {
                [, source] = resultMatch;
            }
            return source;
        })()
    },
    "page": {
        "pageType": "ad_detail",
        "detail": {
            "parent_category_id": "2000",
            "category_id": "2020",
            "state_id": "2",
            "region_id": "31",

            "ad_id": "354269527",
            "list_id": "295567499",
            "city_id": "9190",
            "zipcode":"32146045",

        },

        "adDetail": {
            "adID": "354269527",
            "listID": "295567499",
            "sellerName": "Glauber Marlon",
            "adDate": "2017-01-23 18:35:26",
        },

    },
    "session": {
        "user": {
            "userID": "",
            "loginType": ""
        }
    },

    "pageType": "Ad_detail",
    "abtestingEnable" : "1",



// Listing information

"listingCategory": "2020",


// Ad information
"adId": "354269527",
"state": "2",
"region": "31",
"category": "2020",

"pictures": "14",
"listId": "295567499",

//Account Information

"loggedUser":"0",

"referrer": "",

//User Information


}];

这是我用来格式化并转换为 JSON 的函数:

# Turn the text of one <script> element into a string for json.loads.
# NOTE(review): page_ad is defined outside this snippet — presumably a
# BeautifulSoup document (it exposes findAll); confirm against the caller.

# Take the text of the script element at index 25 and replace every single
# quote with a double quote.
s = page_ad.findAll('script')[25].text.replace('\'', '"')
# NOTE(review): the next line is the asker's own note written with `//`.
# It is not valid Python and has to be removed (or turned into a `#`
# comment) before this snippet can run.
// if print s this line and put in JsonLint show error in function. 
s = re.search(r'\{.+\}', s, re.DOTALL).group() # greedy + DOTALL: keep everything from the first `{` to the last `}`
s = re.sub(r'//.+\n', '', s) # drop `//` comments up to the end of the line (would also hit a `//` inside a string value)
s = re.sub(r'\s+', '', s) # remove ALL whitespace, including spaces inside string values ("Glauber Marlon" -> "GlauberMarlon")
s = re.sub(r',}', '}', s) # drop the trailing comma left in front of a closing `}`

# This call is where the traceback's ValueError is raised: none of the steps
# above removes the "source" value, which is a JavaScript function expression
# rather than JSON.
dataLayer = json.loads(s)

这是转换前的json:

{  
    "site":{  
        "isMobile":false,
        "source":(function()      {  
            varuserAgent=navigator.userAgent.toLocaleLowerCase();varsource="web";resultMatch=userAgent.match(/\         [  
                olx-source\/(\w+);/);if(resultMatch)            {  
                [  
                ,
                source
                ]               =resultMatch;
            }            returnsource;
        }         )()
    },
    "page":{  
        "pageType":"ad_detail",
        "detail":{  
            "parent_category_id":"2000",
            "category_id":"2020",
            "state_id":"2",
            "region_id":"31",
            "ad_id":"354269527",
            "list_id":"295567499",
            "city_id":"9190",
            "zipcode":"32146045"
        },
        "adDetail":{  
            "adID":"354269527",
            "listID":"295567499",
            "sellerName":"GlauberMarlon",
            "adDate":"2017-01-2318:35:26"
        }
    },
    "session":{  
        "user":{  
            "userID":"",
            "loginType":""
        }
    },
    "pageType":"Ad_detail",
    "abtestingEnable":"1",
    "listingCategory":"2020",
    "adId":"354269527",
    "state":"2",
    "region":"31",
    "category":"2020",
    "pictures":"14",
    "listId":"295567499",
    "loggedUser":"0",
    "referrer":""
}

我想删除出错所在的 "site" 这一项。

Traceback (most recent call last):
  File "crawler_olx_0.1.py", line 182, in <module>
    run(link_base)
  File "crawler_olx_0.1.py", line 52, in run
    vehicleInformation = getVehicleInformation(page_ad)
  File "crawler_olx_0.1.py", line 81, in getVehicleInformation
    dataLayer = json.loads(s)
  File "/usr/lib/python2.7/json/__init__.py", line 339, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python2.7/json/decoder.py", line 364, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/usr/lib/python2.7/json/decoder.py", line 382, in raw_decode
    raise ValueError("No JSON object could be decoded")
ValueError: No JSON object could be decoded

1 个答案:

答案 0 :(得分:0)

抓取时直接执行 JavaScript 会更好(例如使用 Selenium 等)。下面的做法可能比较笨(基于 Python 2.7,因为我没有安装 Python 3 版的 PyV8)。

 // Handler for "message" events on printWindow: focuses that window and
 // calls its print(). The event argument `val` is accepted but not used.
 // NOTE(review): printWindow is defined outside this snippet — presumably
 // a Window reference; confirm.
 var PrintThis = function (val) {
       printWindow.focus();
       printWindow.print();

    }

    // Register the handler on printWindow, then post the message "Print" to it.
    printWindow.addEventListener("message", PrintThis);
    printWindow.postMessage("Print", "*");
    // In this case I am able to get the data in PrintThis(),
    // but it is also not working.

返回:

# Evaluate the page's JavaScript "dataLayer" literal inside a V8 context so
# that embedded JS expressions (the "source" function call) are executed,
# then let V8 serialise the result to real JSON for Python to parse.
import json  # was missing: json.dumps / json.loads are used below

import PyV8

jsCtx = PyV8.JSContext()
jsCtx.enter()

# Raw string: the JS regex backslashes (\[, \/, \w) must reach V8 unchanged
# instead of being treated as Python string escapes.
JsFilledJSON = r"""dataLayer = [
    {
        "site": {
            "isMobile": false,
            "source": (function() {
                var userAgent = navigator.userAgent.toLocaleLowerCase();
                var source = "web";
                resultMatch = userAgent.match(/\[olx-source\/(\w+);/);
                if (resultMatch) {
                    [, source] = resultMatch;
                }
                return source;
            })()
        },
        "page": {
            "pageType": "ad_detail",
            "detail": {
                "parent_category_id": "2000",
                "category_id": "2020",
                "state_id": "2",
                "region_id": "31",

                "ad_id": "354269527",
                "list_id": "295567499",
                "city_id": "9190",
                "zipcode":"32146045",

            }
        }
    }
]
"""

try:
    # The embedded function reads navigator.userAgent, which only exists in a
    # browser, so a stub object is defined before the literal is evaluated.
    jsCtx.eval('var navigator = { "userAgent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3" }')
    # JSON.stringify runs inside V8 and returns a strict-JSON string. The
    # snippet is passed as a plain string, exactly like the eval call above,
    # so no Python 2-only .decode() is needed.
    x = jsCtx.eval("JSON.stringify(%s)" % JsFilledJSON)
finally:
    # Always leave the context that was entered above, even if eval raises.
    jsCtx.leave()

# print() with a single argument behaves the same on Python 2 and Python 3.
print(json.dumps(json.loads(x), indent=4, sort_keys=True))

不过,您可以使用 PyV8 或其他任何 Python 的 JavaScript 包装器来执行这种夹杂 JS 的 JSON。PyV8 的二进制文件可以在 GitHub 上找到。