嗨,我正在从一个网站上抓取数据,这段代码在相当长的一段时间内运行良好,但现在我收到了这条错误消息:“'NoneType' object has no attribute 'text'”('NoneType' 对象没有属性 'text'),并且它不再抓取任何数据。他们一定在网站上做了一些更改,但我不知道改了什么。
错误出现在这一行:“data_page = soup_page.find('script', text=r_page).text”
这是我的代码:
from bs4 import BeautifulSoup as bs
import urllib.request
from urllib.request import urlopen
import requests
import time
import re
from datetime import datetime
import pandas as pd
import json
# Question snippet, reproduced exactly as posted: the paste lost its
# indentation, and the `try:` below has no matching `except` in the visible
# text, so this is not runnable as shown.
# Single-iteration loop; `seite` is only used in the progress message.
for seite in range(1):
print("Loop " + str(seite) + " startet.")
# NOTE(review): `df` is not used anywhere in the visible snippet.
df = pd.DataFrame()
# Accumulates the '@id' values collected from the embedded JSON.
l=[]
try:
page = ("https://www.immobilienscout24.de/Suche/radius/neubauwohnung-kaufen?centerofsearchaddress=Krefeld;47799;Grenzstra%C3%9Fe;;;Bockum&geocoordinates=51.33798;6.58608;1.0&enteredFrom=result_list")
print(page)
# The request is sent without any custom HTTP headers.
res_page = requests.get(page)
soup_page = bs(res_page.content, 'lxml')
# One pattern does double duty: it selects the <script> tag and captures the
# text that follows "resultListModel:".
r_page = re.compile(r'resultListModel:(.*)')
# This is the line the question reports as failing: find() returns None when
# no <script> matches the pattern, so `.text` raises AttributeError.
data_page = soup_page.find('script', text=r_page).text
# Drop the trailing comma so the captured text parses as standalone JSON.
script_page = r_page.findall(data_page)[0].rstrip(',')
results_page = json.loads(script_page)
for item in results_page['searchResponseModel']['resultlist.resultlist']['resultlistEntries'][0]['resultlistEntry']:
l.append(item['@id'])
if 'similarObjects' in item:
# Iterating yields dicts if 'similarObject' is a list, or key names if it is
# a single dict; the two branches below cover both shapes.
for i in item['similarObjects'][0]['similarObject']:
if isinstance(i,dict):
l.append(i['@id'])
elif i == '@id':
l.append(item['similarObjects'][0]['similarObject'][i])
# De-duplicate the collected ids (ordering is not preserved).
l = list(set(l))
答案 0 :(得分:1)
如果您未指定 User-Agent 和 Accept-Language 这两个 HTTP 标头,服务器将返回 CAPTCHA 页面:
# Fetch one ImmobilienScout24 result page and print the de-duplicated listing
# ids found in the JSON embedded in its "resultListModel" <script> tag.
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Without browser-like User-Agent and Accept-Language headers the server
# answers with a CAPTCHA page instead of the result list.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
    'Accept-Language': 'en-US,en;q=0.5'
}

# Compiled once outside the loop: the pattern both selects the <script> tag
# and captures the text that follows "resultListModel:".
r_page = re.compile(r'resultListModel:(.*)')

for seite in range(1):
    print("Loop " + str(seite) + " startet.")
    df = pd.DataFrame()
    l = []
    page = ("https://www.immobilienscout24.de/Suche/radius/neubauwohnung-kaufen?centerofsearchaddress=Krefeld;47799;Grenzstra%C3%9Fe;;;Bockum&geocoordinates=51.33798;6.58608;1.0&enteredFrom=result_list")
    res_page = requests.get(page, headers=headers)
    soup_page = BeautifulSoup(res_page.content, 'lxml')

    # find() returns None when no <script> matches (for example when a
    # CAPTCHA page was served); fail with a clear message instead of an
    # AttributeError on None.
    script_tag = soup_page.find('script', string=r_page)
    if script_tag is None:
        raise RuntimeError(
            "No <script> containing 'resultListModel' was found in the "
            "response; the server may have returned a CAPTCHA page."
        )
    data_page = script_tag.string

    # Drop the trailing comma so the captured text parses as standalone JSON.
    script_page = r_page.findall(data_page)[0].rstrip(',')
    results_page = json.loads(script_page)

    for item in results_page['searchResponseModel']['resultlist.resultlist']['resultlistEntries']:
        item = item['resultlistEntry']
        l.append(item['@id'])
        if 'similarObjects' in item:
            similar = item['similarObjects'][0]['similarObject']
            # Iterating yields dicts if `similar` is a list, or key names if
            # it is a single dict; handle both shapes.
            for i in similar:
                if isinstance(i, dict):
                    l.append(i['@id'])
                elif i == '@id':
                    l.append(similar[i])

    # De-duplicate the collected ids (ordering is not preserved).
    l = list(set(l))
    print(l)
输出:
['119256589', '119215242', '119254488', '119256425', '119254296', '119256175', '119240835']