使用python抓取标题网址时出现错误

时间:2020-03-16 09:40:50

标签: python web-scraping beautifulsoup

我写了一段代码来抓取标题 URL,但在提取标题 URL 时遇到了错误,请帮忙指点一下。这是我的代码:

import requests
from bs4 import BeautifulSoup
# import pandas as pd
# import pandas as pd
import csv


def get_page(url):
    """Fetch ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend, or
        ``None`` when the server answers with a non-OK status.
    """
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
        # Return explicitly: previously ``soup`` was never bound on this
        # path, so the trailing ``return soup`` raised UnboundLocalError.
        return None
    # BeautifulSoup arguments: 1. the HTML text, 2. the parser name.
    return BeautifulSoup(response.text, 'html.parser')


def get_index_data(soup):
    """Print every ``<a class="body_link_11">`` element found in ``soup``.

    Args:
        soup: Parsed page exposing ``find_all``. An object without that
            method (for example ``None``) is treated as a page with no
            matching links, so an empty list is printed.
    """
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except AttributeError:
        # ``soup`` is not a parsed document. Only this case is swallowed;
        # a bare ``except`` would also hide KeyboardInterrupt and real bugs.
        titles_link = []
    print(titles_link)


def main():
    """Download the first search-results page and print its title links."""
    collection = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/"
    query = "searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    page = get_page(collection + query)
    get_index_data(page)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

1 个答案:

答案 0 :(得分:0)

如果要获取所有链接,请尝试以下操作:

mvn clean package install -DskipTests

修改后的代码:

def get_page(url):
    """Download ``url`` and return its HTML as a parsed tree.

    Args:
        url: Address of the page to request.

    Returns:
        A ``BeautifulSoup`` object created with ``html.parser``, or ``None``
        if the response status is not OK.
    """
    response = requests.get(url)
    if response.ok:
        # BeautifulSoup arguments: 1. the HTML text, 2. the parser name.
        return BeautifulSoup(response.text, 'html.parser')
    print('server responded:', response.status_code)
    # ``soup`` used to be unbound on this path, which made the function
    # raise UnboundLocalError instead of reporting the failed request.
    return None

def get_index_data(soup):
    """Print the absolute URLs of the title links found in ``soup``.

    A link is kept only when it is an ``<a class="body_link_11">`` element
    that carries both an ``item_id`` attribute and an ``href``.

    Args:
        soup: Parsed page exposing ``find_all``. If it does not (for example
            it is ``None``), nothing is printed.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except AttributeError:
        # Not a parsed document; as before, print nothing in this case.
        return

    titles_link_output = []
    for link in titles_link:
        try:
            # All titles with valid links will have an item_id.
            item_id = link.attrs.get('item_id')
            href = link.attrs.get('href')
        except AttributeError:
            # Element without an attribute mapping: skip it.
            continue
        # Require an href too; previously a missing one produced the bogus
        # URL "http://cgsc.cdmhost.comNone".
        if item_id and href:
            titles_link_output.append(urljoin("http://cgsc.cdmhost.com", href))
    print(titles_link_output)

def main():
    """Fetch page 1 of the collection search and print its title URLs."""
    mainurl = (
        "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/"
        "searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    )
    soup = get_page(mainurl)
    get_index_data(soup)

# Run the scraper. This call is unguarded, so it also executes on import.
main()