我写了一段代码来获取所有标题的 URL,但遇到了一些问题,例如它输出的全是 None 值。你能帮我看看吗?
这是我的代码:
import requests
from bs4 import BeautifulSoup
import csv
def get_page(url):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the server
        answers with a non-OK status (the status code is printed).
    """
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
        # Return None explicitly so the caller always receives a defined
        # value instead of hitting an unbound local on the error path.
        return None
    # Arguments: 1. the HTML markup, 2. the parser to use.
    return BeautifulSoup(response.text, 'html.parser')
def get_index_data(soup):
    """Print and return the link targets found in the result-title containers.

    The ``href`` attribute lives on ``<a>`` elements, not on the surrounding
    ``<div class="marginTopTextAdjuster">`` containers, so the anchors inside
    each container are inspected rather than the containers themselves.

    Args:
        soup: Parsed document exposing ``find_all``, or ``None`` when the
            page could not be fetched.

    Returns:
        A list with the ``href`` value of every anchor that has one; empty
        when *soup* is ``None`` or nothing matches.
    """
    if soup is None:
        urls = []
    else:
        urls = [
            anchor['href']
            for container in soup.find_all('div', class_="marginTopTextAdjuster")
            # href=True skips anchors without a target, so no None entries.
            for anchor in container.find_all('a', href=True)
        ]
    print(urls)
    return urls
def main():
    """Fetch the first search-results page and hand it to get_index_data."""
    # Example of a single-item page in the same collection:
    # http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    page = get_page(mainurl)
    get_index_data(page)


if __name__ == "__main__":
    main()
答案 0 :(得分:1)
您正在尝试获取 div 标签的 href 属性,而 div 标签没有这个属性,所以得到的是 None。请改为选择所有 a 标签,它们似乎都带有相同的 class 属性 body_link_11。
使用 titles_link = soup.find_all('a',class_="body_link_11")
代替 titles_link = soup.find_all('div',class_="marginTopTextAdjuster")
答案 1 :(得分:0)
使用 requests 和 BeautifulSoup 的简单方法:
import requests
from bs4 import BeautifulSoup
# `url` stands for the address of the page you want to inspect.
response = requests.get(url)
# response.text is the complete HTML of the page.
document = BeautifulSoup(response.text, "html.parser")
# document.title is the <title> tag itself; .string strips the tags.
page_title = document.title.string
print(page_title)
答案 2 :(得分:0)
尝试一下。
from simplified_scrapy import SimplifiedDoc,req,utils
# Search-results page to scrape.
url = 'http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1'
page_html = req.get(url)
document = SimplifiedDoc(page_html)
anchors = document.selects('div.marginTopTextAdjuster').select('a')
# Pair each link target (resolved against `url`) with its text,
# skipping falsy entries.
titles_link = []
for anchor in anchors:
    if anchor:
        titles_link.append((utils.absoluteUrl(url, anchor.href), anchor.text))
print(titles_link)
结果:
[('http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1', 'Civil Affairs Handbook, Japan, section 1a: population statistics.'), ('http://cgsc.cdmhost.com/cdm/landingpage/collection/p4013coll8', 'World War II Operational Documents'), ('http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2385/rec/2', 'Army Air Forces Program 1943.'),...
答案 3 :(得分:0)
# Search-results page whose title links are collected.
url = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
titles_link = []
for container in soup.find_all('div', attrs={'class': 'marginTopTextAdjuster'}):
    anchors = container.find_all('a', href=True)
    if not anchors:
        # Container holds no link at all; nothing to record.
        continue
    first = anchors[0]
    # Only a first anchor carrying a truthy item_id attribute is recorded.
    if first.attrs.get('item_id', None):
        titles_link.append({first.text: first.attrs.get('href', None)})
print(titles_link)
输出:
[{'Civil Affairs Handbook, Japan, section 1a: population statistics.': '/cdm/singleitem/collection/p4013coll8/id/2653/rec/1'}, {'Army Air Forces Program 1943.': '/cdm/singleitem/collection/p4013coll8/id/2385/rec/2'}, {'Casualty report number II.': '/cdm/singleitem/collection/p4013coll8/id/3309/rec/3'}, {'Light armored division, proposed March 1943.': '/cdm/singleitem/collection/p4013coll8/id/2425/rec/4'}, {'Tentative troop list by type units for Blacklist operations.': '/cdm/singleitem/collection/p4013coll8/id/150/rec/5'}, {'Chemical Warfare Service: history of training, part 2, schooling of commissioned officers.': '/cdm/compoundobject/collection/p4013coll8/id/2501/rec/6'}, {'Horses in the German Army (1941-1945).': '/cdm/compoundobject/collection/p4013coll8/id/2495/rec/7'}, {'Unit history: 38 (MECZ) cavalry rcn. sq.': '/cdm/singleitem/collection/p4013coll8/id/3672/rec/8'}, {'Operations in France: December 1944, 714th Tank Battalion.': '/cdm/singleitem/collection/p4013coll8/id/3407/rec/9'}, {'G-3 Reports : Third Infantry Division. (22 Jan- 30 Mar 44)': '/cdm/singleitem/collection/p4013coll8/id/4393/rec/10'}, {'Summary of operations, 1 July thru 31 July 1944.': '/cdm/singleitem/collection/p4013coll8/id/3445/rec/11'}, {'After action report 36th Armored Infantry Regiment, 3rd Armored Division, Nov 1944 thru April 1945.': '/cdm/singleitem/collection/p4013coll8/id/3668/rec/12'}, {'Unit history, 38th Mechanized Cavalry Reconnaissance Squadron, 9604 thru 9665.': '/cdm/singleitem/collection/p4013coll8/id/3703/rec/13'}, {'Redeployment: occupation forces in Europe series, 1945-1946.': '/cdm/singleitem/collection/p4013coll8/id/2952/rec/14'}, {'Twelfth US Army group directives. Annex no. 
1.': '/cdm/singleitem/collection/p4013coll8/id/2898/rec/15'}, {'After action report, 749th Tank Battalion: Jan, Feb, Apr - 8 May 45.': '/cdm/singleitem/collection/p4013coll8/id/3502/rec/16'}, {'743rd Tank Battalion, S3 journal history.': '/cdm/singleitem/collection/p4013coll8/id/3553/rec/17'}, {'History of military training, WAAC / WAC training.': '/cdm/singleitem/collection/p4013coll8/id/4052/rec/18'}, {'After action report, 756th Tank Battalion.': '/cdm/singleitem/collection/p4013coll8/id/3440/rec/19'}, {'After action report 92nd Cavalry Recon Squadron Mechanized 12th Armored Division, Jan thru May 45.': '/cdm/singleitem/collection/p4013coll8/id/3583/rec/20'}]