I'm trying to scrape the subject headings from a web page in my script, but it only pulls the last subject on the page and adds it to every record in my JSON file. How can I make it loop through each h2 tag so that each record gets the correct subject?
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import json
uci_urls = ['http://ocw.uci.edu/courses']
data =[]
#opening up connection and grabbing page
for uci_url in uci_urls:
    uClient = urlopen(uci_url)
    page_html = uClient.read()
    uClient.close()
    #html parsing
    page_soup = soup(page_html, "html.parser")
    #grabs info for each textbook
    containers = page_soup.findAll("div", {"class": "panel-heading"})
    subjects = page_soup.findAll("h2")
    for container in containers:
        item = {}
        item['type'] = "Course"
        item['title'] = container.h3.text
        item['author'] = "University of California Irvine"
        item['link'] = "http://ocw.uci.edu/courses" + container.a["href"]
        item['source'] = "UCI Open"
        for subject in subjects:
            item['subject'] = subject.text
        item['base_url'] = "http://ocw.uci.edu/"
        data.append(item)  # add the item to the list

with open("./json/uci.json", "w") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)
Answer 0 (score: 0)
Grabbing all the headings on the page at once won't help: the inner `for subject in subjects` loop reassigns item['subject'] on every pass, so every record ends up with the last h2 on the page. Instead, iterate over each parent div that contains both the subject heading and its course panels, and read the h2 once per group:
#html parsing
page_soup = soup(page_html, "html.parser")

for tab_pane in page_soup.select('.col-sm-9.col-md-9.tab-content > .tab-pane'):
    subject = tab_pane.h2.text
    for container in tab_pane.findAll("div", {"class": "panel-heading"}):
        item = {}
        item['type'] = "Course"
        item['title'] = container.h3.text
        item['author'] = "University of California Irvine"
        item['link'] = "http://ocw.uci.edu/courses" + container.a["href"]
        item['source'] = "UCI Open"
        item['subject'] = subject
        item['base_url'] = "http://ocw.uci.edu/"
        data.append(item)  # add the item to the list
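
For reference, here is one way the corrected loop could slot back into the full script from the question, keeping the same imports, URL list, and JSON output. This is just a sketch; it assumes the page layout matched by the CSS selector above.

from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import json

uci_urls = ['http://ocw.uci.edu/courses']
data = []

for uci_url in uci_urls:
    # open the connection and grab the page
    uClient = urlopen(uci_url)
    page_html = uClient.read()
    uClient.close()

    page_soup = soup(page_html, "html.parser")

    # each tab pane groups one subject heading (h2) with its course panels
    for tab_pane in page_soup.select('.col-sm-9.col-md-9.tab-content > .tab-pane'):
        subject = tab_pane.h2.text
        for container in tab_pane.findAll("div", {"class": "panel-heading"}):
            data.append({
                'type': "Course",
                'title': container.h3.text,
                'author': "University of California Irvine",
                'link': "http://ocw.uci.edu/courses" + container.a["href"],
                'source': "UCI Open",
                'subject': subject,
                'base_url': "http://ocw.uci.edu/",
            })

with open("./json/uci.json", "w") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)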