我想使用python抓取所有标题的网址

时间:2020-03-16 07:49:28

标签: python web-scraping beautifulsoup

我写了一个代码来获取所有标题url,但是有一些问题,例如它显示None值。那你能帮我吗?

这是我的代码:

import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
    return soup

def get_index_data(soup):
    try:
        titles_link = soup.find_all('div',class_="marginTopTextAdjuster")
    except:
        titles_link = []
    urls = [item.get('href') for item in titles_link]
    print(urls)

def main():
    #url = "http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1"
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    #get_page(url)
    get_index_data(get_page(mainurl))
    #write_csv(data,url)


if __name__ == '__main__':
    main()

4 个答案:

答案 0 :(得分:1)

您正在尝试获取href标签的div属性。而是尝试选择所有a标签。它们似乎具有通用的类属性body_link_11

使用titles_link = soup.find_all('a',class_="body_link_11")代替titles_link = soup.find_all('div',class_="marginTopTextAdjuster")

答案 1 :(得分:0)

使用requestsBeautifulSoup的简单方法:

import requests
from bs4 import BeautifulSoup

req = requests.get(url)  # url stands for the page's url you want to find
soup = BeautifulSoup(req.text, "html.parser")  # req.text is the complete html of the page

print(soup.title.string)  # soup.title will give you the title of the page but with the <title> tags so .string removes them

答案 2 :(得分:0)

尝试一下。

from simplified_scrapy import SimplifiedDoc,req,utils
url = 'http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1'
html = req.get(url)
doc  = SimplifiedDoc(html)
lst = doc.selects('div.marginTopTextAdjuster').select('a')
titles_link = [(utils.absoluteUrl(url,a.href),a.text) for a in lst if a]
print (titles_link)

结果:

[('http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1', 'Civil Affairs Handbook, Japan, section 1a: population statistics.'), ('http://cgsc.cdmhost.com/cdm/landingpage/collection/p4013coll8', 'World War II Operational Documents'), ('http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2385/rec/2', 'Army Air Forces Program 1943.'),...

答案 3 :(得分:0)

url = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
req = requests.get(url) 
soup = BeautifulSoup(req.text, "lxml") 

titles_link = []
titles_div = soup.find_all('div', attrs={'class': 'marginTopTextAdjuster'})
for link in titles_div:
    tag = link.find_all('a', href=True)
    try:
        if tag[0].attrs.get('item_id', None):
            titles_link.append({tag[0].text: tag[0].attrs.get('href', None)})
    except IndexError:
        continue

print(titles_link)

输出:

[{'Civil Affairs Handbook, Japan, section 1a: population statistics.': '/cdm/singleitem/collection/p4013coll8/id/2653/rec/1'}, {'Army Air Forces Program 1943.': '/cdm/singleitem/collection/p4013coll8/id/2385/rec/2'}, {'Casualty report number II.': '/cdm/singleitem/collection/p4013coll8/id/3309/rec/3'}, {'Light armored division, proposed March 1943.': '/cdm/singleitem/collection/p4013coll8/id/2425/rec/4'}, {'Tentative troop list by type units for Blacklist operations.': '/cdm/singleitem/collection/p4013coll8/id/150/rec/5'}, {'Chemical Warfare Service: history of training, part 2, schooling of commissioned officers.': '/cdm/compoundobject/collection/p4013coll8/id/2501/rec/6'}, {'Horses in the German Army (1941-1945).': '/cdm/compoundobject/collection/p4013coll8/id/2495/rec/7'}, {'Unit history: 38 (MECZ) cavalry rcn. sq.': '/cdm/singleitem/collection/p4013coll8/id/3672/rec/8'}, {'Operations in France: December 1944, 714th Tank Battalion.': '/cdm/singleitem/collection/p4013coll8/id/3407/rec/9'}, {'G-3 Reports : Third Infantry Division. (22 Jan- 30 Mar 44)': '/cdm/singleitem/collection/p4013coll8/id/4393/rec/10'}, {'Summary of operations, 1 July thru 31 July 1944.': '/cdm/singleitem/collection/p4013coll8/id/3445/rec/11'}, {'After action report 36th Armored Infantry Regiment, 3rd Armored Division, Nov 1944 thru April 1945.': '/cdm/singleitem/collection/p4013coll8/id/3668/rec/12'}, {'Unit history, 38th Mechanized Cavalry Reconnaissance Squadron, 9604 thru 9665.': '/cdm/singleitem/collection/p4013coll8/id/3703/rec/13'}, {'Redeployment: occupation forces in Europe series, 1945-1946.': '/cdm/singleitem/collection/p4013coll8/id/2952/rec/14'}, {'Twelfth US Army group directives. Annex no. 1.': '/cdm/singleitem/collection/p4013coll8/id/2898/rec/15'}, {'After action report, 749th Tank Battalion: Jan, Feb, Apr - 8 May 45.': '/cdm/singleitem/collection/p4013coll8/id/3502/rec/16'}, {'743rd Tank Battalion, S3 journal history.': '/cdm/singleitem/collection/p4013coll8/id/3553/rec/17'}, {'History of military training, WAAC / WAC training.': '/cdm/singleitem/collection/p4013coll8/id/4052/rec/18'}, {'After action report, 756th Tank Battalion.': '/cdm/singleitem/collection/p4013coll8/id/3440/rec/19'}, {'After action report 92nd Cavalry Recon Squadron Mechanized 12th Armored Division, Jan thru May 45.': '/cdm/singleitem/collection/p4013coll8/id/3583/rec/20'}]