网页抓取内容类型:JSON

时间:2021-04-11 18:45:05

标签: python json web-scraping beautifulsoup python-requests

我试图使用 BeautifulSoup 从这个页面(here)抓取数据,结果只得到了一个空列表 []。问题在于我要抓取的数据并不在网页源代码(view page source)中。在开发者工具(Developer Tools)> Network 中可以看到,响应的 content-type 为 JSON。所以我尝试使用以下代码:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import json

# Search page URL; the same address is repeated in the 'Referer' header below.
url = 'https://ngc.taleo.net/careersection/ng_pro_intl_aujobs/jobsearch.ftl?lang=en_GB&location=756140022608&radiusType=K&searchExpanded=true&radius=1&portal=34140031600&_ga=2.197392303.1699610010.1604351575-1311873605.1579627290'

# A Session is created, but only the single POST below is sent through it.
s = requests.Session()

# Cookies sent along with the request.
# NOTE(review): the '^%^' sequences look like Windows cmd escaping left over
# from a copied cURL command -- confirm, and strip the carets if so.
cookies = {
    'locale': 'en-GB',
    '_gcl_au': '1.1.79711829.1614933155',
    '_ga': 'GA1.2.693390019.1614933178',
    '__atssc': 'google^%^3B1',
    '_gid': 'GA1.2.1213481278.1618077337',
    '__atuvc': '1^%^7C10^%^2C0^%^7C11^%^2C9^%^7C12^%^2C14^%^7C13^%^2C28^%^7C14',
    '__atuvs': '6071e67dc413e3d6001',
}

# Request headers; 'Content-Type' announces a JSON request body.
headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'sec-ch-ua': '^\\^Google',
    'tzname': 'Asia/Calcutta',
    'sec-ch-ua-mobile': '?0',
    'tz': 'GMT+05:30',
    'Content-Type': 'application/json',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'User-Agent': '###MY USER AGENT HERE####',
    'X-Requested-With': 'XMLHttpRequest',
    'Origin': 'https://ngc.taleo.net',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://ngc.taleo.net/careersection/ng_pro_intl_aujobs/jobsearch.ftl?lang=en_GB&location=756140022608&radiusType=K&searchExpanded=true&radius=1&portal=34140031600&_ga=2.197392303.1699610010.1604351575-1311873605.1579627290',
    'Accept-Language': 'en-US,en;q=0.9',
}

# NOTE: 'params' is built here but is never passed to the request below.
params = (
    ('lang', 'en_GB'),
    ('portal', '34140031600'),
)
# Request body as one string. It starts with '^{' and contains carets
# throughout, so as written it is not valid JSON.
# NOTE(review): presumably the same cmd-escaping residue as in the cookies.
data = '^{^\\^multilineEnabled^\\^:true,^\\^sortingSelection^\\^:^{^\\^sortBySelectionParam^\\^:^\\^3^\\^,^\\^ascendingSortingOrder^\\^:^\\^false^\\^^},^\\^fieldData^\\^:^{^\\^fields^\\^:^{^\\^KEYWORD^\\^:^\\^^\\^,^\\^LOCATION^\\^:^\\^756140022608^\\^,^\\^JOB_TITLE^\\^:^\\^^\\^^},^\\^valid^\\^:true^},^\\^filterSelectionParam^\\^:^{^\\^searchFilterSelections^\\^:^[^{^\\^id^\\^:^\\^POSTING_DATE^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^LOCATION^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_FIELD^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_TYPE^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_SCHEDULE^\\^,^\\^selectedValues^\\^:^[^]^}^]^},^\\^advancedSearchFiltersSelectionParam^\\^:^{^\\^searchFilterSelections^\\^:^[^{^\\^id^\\^:^\\^ORGANIZATION^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^LOCATION^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_FIELD^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_NUMBER^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^URGENT_JOB^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^EMPLOYEE_STATUS^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^STUDY_LEVEL^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^WILL_TRAVEL^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_SHIFT^\\^,^\\^selectedValues^\\^:^[^]^}^]^},^\\^pageNo^\\^:1^}'


# POST the body and decode the reply as JSON in a single expression; .json()
# raises JSONDecodeError when the response body is not JSON.
# 'response' therefore holds the decoded payload, not the Response object.
response = s.post(url, headers=headers, cookies=cookies, data=data).json()

#res_json = json.loads(response)
#print(response.status_code)

但在 response 那一行,我收到了如下错误:raise JSONDecodeError("Expecting value", s, err.value) from None json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

非常感谢您对此的任何帮助!

不幸的是,我目前只能使用 requests 或其他流行的 Python 库。

提前致谢..

1 个答案:

答案 0 :(得分:0)

您需要发送 JSON 数据:使用 Python 的 requests 模块时,可以通过 json=data 参数传入。您还需要把数据写成字典格式:

import requests

# Query the job-search REST endpoint and print the decoded JSON reply.
#
# The request body is passed through ``json=`` so that requests serialises
# the dictionary itself and sets the JSON Content-Type header.

# REST endpoint that the search request is posted to.
SEARCH_URL = "https://ngc.taleo.net/careersection/rest/jobboard/searchjobs"

# Seconds to wait for the server before giving up; without a timeout
# requests may block indefinitely.
REQUEST_TIMEOUT = 30


def _empty_filters(filter_ids):
    """Return one search-filter entry with no selected values per id."""
    return [
        {"id": filter_id, "selectedValues": []}
        for filter_id in filter_ids
    ]


r = requests.post(
    SEARCH_URL,
    params={
        "lang": "en_GB",
        "location": "756140022608",
        "radiusType": "K",
        "searchExpanded": "true",
        "radius": "1",
        "portal": "34140031600",
    },
    headers={
        "tzname": "Asia/Calcutta",
        "tz": "GMT+05:30",
    },
    json={
        "multilineEnabled": True,
        "sortingSelection": {
            "sortBySelectionParam": "3",
            # Kept as the string "false", exactly as in the original payload.
            "ascendingSortingOrder": "false",
        },
        "fieldData": {
            "fields": {
                "KEYWORD": "",
                "LOCATION": "756140022608",
                "JOB_TITLE": "",
            },
            "valid": True,
        },
        "filterSelectionParam": {
            "searchFilterSelections": _empty_filters([
                "POSTING_DATE",
                "LOCATION",
                "JOB_FIELD",
                "JOB_TYPE",
                "JOB_SCHEDULE",
            ]),
        },
        "advancedSearchFiltersSelectionParam": {
            "searchFilterSelections": _empty_filters([
                "ORGANIZATION",
                "LOCATION",
                "JOB_FIELD",
                "JOB_NUMBER",
                "URGENT_JOB",
                "EMPLOYEE_STATUS",
                "STUDY_LEVEL",
                "WILL_TRAVEL",
                "JOB_SHIFT",
            ]),
        },
        "pageNo": 1,
    },
    timeout=REQUEST_TIMEOUT,
)

# Fail with a clear HTTPError on a 4xx/5xx reply instead of letting an error
# page surface later as a JSONDecodeError.
r.raise_for_status()

print(r.json())