循环遍历标签 - BeautifulSoup

时间:2018-05-12 19:41:08

标签: python json beautifulsoup

我想在脚本中从网页抓取主题(subject)标题,但它只取到了页面上的最后一个主题,并把它写进了 JSON 文件的每一条记录里。怎样才能让它遍历每个 h2 标签,为每条记录添加正确的主题?

from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import json

# Pages to scrape; each URL in this list is fetched and parsed in turn.
uci_urls = ['http://ocw.uci.edu/courses']

# Accumulates one dict per course across every page processed below.
data =[]
# Open a connection to each page and read its raw HTML.
for uci_url in uci_urls:
    uClient = urlopen(uci_url)
    page_html = uClient.read()
    uClient.close()

    # Parse the downloaded HTML using the "html.parser" backend.
    page_soup = soup(page_html, "html.parser")

    # Collect every "panel-heading" div and every <h2> on the page.
    # NOTE: these are two independent, page-wide lists; nothing here ties a
    # given panel to the <h2> it appears under.
    containers = page_soup.findAll("div",{"class":"panel-heading"})
    subjects = page_soup.findAll("h2")

    for container in containers:
        item = {}
        item['type'] = "Course"
        item['title'] = container.h3.text
        item['author'] = "University of California Irvine"
        item['link'] = "http://ocw.uci.edu/courses" + container.a["href"]
        item['source'] = "UCI Open"
        # BUG (the behavior this question is about): the loop below reassigns
        # item['subject'] once per <h2> on the page, so after it finishes the
        # key always holds the text of the LAST <h2>, for every container.
        # If the page has no <h2> at all, the 'subject' key is never set.
        for subject in subjects:
           item['subject'] = subject.text
        item['base_url'] = "http://ocw.uci.edu/"
        data.append(item) # add the finished record to the results list

    # This sits inside the URL loop, so the file is rewritten ("w" mode) after
    # each page with everything accumulated in `data` so far.
    # open() does not create directories: ./json/ must already exist.
    # NOTE(review): ensure_ascii=False emits non-ASCII characters as-is, but no
    # encoding= is passed to open(), so the platform default applies — confirm
    # that is UTF-8 where this runs.
    with open("./json/uci.json", "w") as writeJSON:
       json.dump(data, writeJSON, ensure_ascii=False)

1 个答案:

答案 0 :(得分:0)

一次性抓取页面中的所有标题(h2)并不能解决问题,因为这样无法知道每门课程属于哪个标题。您需要逐个获取同时包含所需标题和课程面板的父 div,然后在每个父 div 内部读取标题并遍历其中的课程面板。

# Parse the downloaded HTML into a searchable tree.
page_soup = soup(page_html, "html.parser")

# Walk the tab panes one at a time: the subject is read from the pane's own
# <h2>, and only the panel headings found inside that same pane are paired
# with it, so each course record gets the heading it actually sits under.
subject_panes = page_soup.select('.col-sm-9.col-md-9.tab-content > .tab-pane')
for pane in subject_panes:
    subject_name = pane.h2.text
    course_headings = pane.findAll("div", {"class": "panel-heading"})
    for heading in course_headings:
        # Build the record in one literal; key order matches the JSON output
        # order of the original field-by-field assignments.
        data.append({
            'type': "Course",
            'title': heading.h3.text,
            'author': "University of California Irvine",
            'link': "http://ocw.uci.edu/courses" + heading.a["href"],
            'source': "UCI Open",
            'subject': subject_name,
            'base_url': "http://ocw.uci.edu/",
        })