无法使用 BeautifulSoup 从网页中提取 div 标签?

时间:2019-12-14 07:42:26

标签: python-3.x selenium web-scraping beautifulsoup

我正在尝试使用 BeautifulSoup 从以下链接中提取一些信息:https://aiesec.org/opportunity/1212595 。我需要的是项目的名称和开始日期。但是,我无法提取该名称,结果总是 None。

 title = soup.find(lambda tag: tag.name == 'div' and tag['class'] == ['opportunity-tile', ''])

进一步分析后,我发现它甚至没有获取到任何 div 标签,因为下面的语句返回的结果为空:

print(soup.find_all("div"))

我哪里做错了?

1 个答案:

答案 0 :(得分:0)

<text>Header Component</text>
<text>{{propA}}</text>

输出:

import requests

# Fetch one opportunity from the AIESEC GraphQL endpoint and print its title
# and key dates. The script POSTs the query as JSON to the API rather than
# parsing HTML.

# HTTP headers sent with the API request.
# NOTE(review): the Authorization value is hard-coded in source; presumably it
# is a token copied from the site's own requests -- confirm it is safe to
# publish and consider loading it from configuration instead.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
    'Authorization': 'e316ebe109dd84ed16734e5161a2d236d0a7e6daf499941f7c110078e3c75493',
}

# GraphQL document for the "OpportunityQuery" operation. Kept as a
# triple-quoted string so the text stays readable and cannot be broken by
# line wrapping; the leading backslash suppresses the initial newline so the
# runtime value starts directly with "query".
OPPORTUNITY_QUERY = """\
query OpportunityQuery($id: ID, $cdn_region: String) {
  getOpportunity(id: $id) {
    application_processing_time
    applied_to
    applied_to_with
    applications_close_date
    available_openings
    backgrounds {
      constant_id
      constant_name
      option
      __typename
    }
    branch {
      id
      address_detail {
        id
        city
        country
        __typename
      }
      company {
        id
        name
        profile_photo(cdn_region: $cdn_region)
        __typename
      }
      __typename
    }
    cover_photo(cdn_region: $cdn_region)
    description
    duration
    project_duration
    earliest_start_date
    google_place_id
    home_lc {
      id
      email
      full_name
      parent {
        id
        name
        __typename
      }
      __typename
    }
    id
    is_favourited
    is_gep
    languages {
      constant_id
      constant_name
      option
      __typename
    }
    lat
    latest_end_date
    lng
    legal_info {
      health_insurance_info
      visa_duration
      visa_link
      visa_type
      __typename
    }
    location
    logistics_info {
      accommodation_covered
      accommodation_provided
      food_covered
      __typename
    }
    nationalities {
      constant_id
      constant_name
      option
      __typename
    }
    office_footfall_for_exchange
    openings
    opportunity_cost
    opportunity_questions {
      edges {
        node {
          id
          __typename
        }
        __typename
      }
      __typename
    }
    organisation {
      id
      name
      __typename
    }
    percentage_of_fulfillment
    programme {
      id
      short_name_display
      __typename
    }
    remark
    reviews
    role_info {
      selection_process
      learning_points_list
      __typename
    }
    sdg_info {
      id
      sdg_target {
        description
        goal_index
        id
        parent {
          id
          __typename
        }
        target
        __typename
      }
      __typename
    }
    selection_processes(first: 50) {
      edges {
        cursor
        node {
          id
          title
          no_of_days
          __typename
        }
        __typename
      }
      __typename
    }
    skills {
      constant_id
      constant_name
      option
      __typename
    }
    specifics_info {
      computer
      expected_work_schedule
      ef_test_required
      salary
      salary_currency {
        id
        alphabetic_code
        __typename
      }
      salary_periodicity
      saturday_work
      __typename
    }
    status
    study_levels {
      id
      name
      __typename
    }
    title
    transparent_fee_details {
      covers_accomodation
      covers_administrative_costs
      covers_leadership_spaces
      covers_pickup
      sponsored_by
      __typename
    }
    __typename
  }
}
"""

# JSON request body: operation name, its variables, and the query document.
data = {
    "operationName": "OpportunityQuery",
    "variables": {"id": "1212595", "cdn_region": "Global"},
    "query": OPPORTUNITY_QUERY,
}

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() surfaces HTTP errors before the body is parsed.
response = requests.post(
    'https://gis-api.aiesec.org/graphql',
    json=data,
    headers=headers,
    timeout=30,
)
response.raise_for_status()
r = response.json()

# Look the opportunity object up once and print the fields of interest.
opportunity = r['data']['getOpportunity']
print(opportunity['title'])
print(opportunity['earliest_start_date'])
print(opportunity['applications_close_date'])
print(opportunity['latest_end_date'])