我是python和web-scraping的新手。我想抓一个网站(链接是网址)。我收到了一个错误"' NoneType'对象不可迭代",使用下面代码的最后一行。有谁可以指出可能出错的地方?
D:\home\site\wwwroot
以下是回溯错误:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = 'https://labtestsonline.org/tests-index'
soup = BeautifulSoup(requests.get(url).content, 'lxml')
# Function to get hyper-links for all test components
hyperlinks = []
def parseUrl(url):
global hyperlinks
page = requests.get(url).content
soup = BeautifulSoup(page, 'lxml')
for a in soup.findAll('div',{'class':'field-content'}):
a = a.find('a')
href = urlparse.urljoin(Url,a.get('href'))
hyperlinks.append(href)
parseUrl(url)
# function to get header and common questions for each test component
def header(url):
page = requests.get(url).content
soup = BeautifulSoup(page, 'lxml')
h = []
commonquestions = []
for head in soup.find('div',{'class':'field-item'}).find('h1'):
heading = head.get_text()
h.append(heading)
for q in soup.find('div',{'id':'Common_Questions'}):
questions = q.get_text()
commonquestions.append(questions)
for i in range(0, len(hyperlinks)):
header(hyperlinks[i])
答案 0 :(得分:0)
soup.find('div',{'class':'field-item'}).find('h1')
正在返回None
。首先检查函数在循环之前是否返回任何内容。
类似的东西:
heads = soup.find('div',{'class':'field-item'}).find('h1')
if heads:
for head in heads:
# remaining code
答案 1 :(得分:0)
试试这个。它应该解决你现在遇到的问题。我用css选择器来完成工作。
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
link = 'https://labtestsonline.org/tests-index'
page = requests.get(link)
soup = BeautifulSoup(page.content, 'lxml')
for a in soup.select('.field-content a'):
new_link = urljoin(link,a.get('href')) ##joining broken urls so as to reuse these
response = requests.get(new_link) ##sending another http requests
sauce = BeautifulSoup(response.text,'lxml')
for item in sauce.select("#Common_Questions .field-item"):
print(item.text)
print("<<<<<<<<<>>>>>>>>>>>")