Here is my code. My goal is to scrape multiple domains. I put the domains in the url array, but the scraping does not work.
The code finds the links but does not parse or scrape the articles.
This is the output when my code runs: ('Toplam link sayisi : ', 387) ('Tekil linkler: ', 146)
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import codecs
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "accept-charset": "cp1254,ISO-8859-9,utf-8;q=0.7,*;q=0.3",
    "accept-encoding": "gzip,deflate,sdch",
    "accept-language": "tr,tr-TR,en-US,en;q=0.8",
}
# haber_oku: fetch a single article page and return its body text
def haber_oku(haber_url):
    r = requests.get(haber_url, headers=headers)
    if r.status_code != 200:
        return
    soup = BeautifulSoup(r.content)
    result = soup.find("div", {'itemprop': 'articleBody'})
    if result:
        return result.get_text()
    else:
        result = soup.find("div", {'itemprop': 'description'})
        if result:
            return result.get_text()
    return
# scrape_hurriyet: collect article links from a front page, read each article,
# filter by keywords, and write the matches to abc<N>.txt files
def scrape_hurriyet(keywords, detay_goster, url):
    if len(keywords) > 0:
        keywords = keywords.split(',')
    s = 0
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        print("request reddedildi")
        return
    soup = BeautifulSoup(r.content)
    results = soup.findAll("a")
    print ("Toplam link sayisi : ", len(results))
    liste_link = []
    liste_text = []
    haberler = []
    for result in results:
        h = result.get('href')
        t = result.get_text()
        if h is not None:
            if str(h).find('http://www.hurriyet.com.tr/') or str(h).find('http://www.milliyet.com.tr/spor') >= 0:
                if h not in liste_link:
                    if h.find('.asp') or h.find('.htm') > 0:
                        liste_link.append(h)
                        liste_text.append(t)
    print ("Tekil linkler: ", len(liste_link))
    i = 0
    while i < len(liste_link):
        h = liste_link[i]
        t = liste_text[i]
        haber = haber_oku(h)
        if haber is not None:
            haber = BeautifulSoup(haber).get_text()
            ok = 0
            found = ""
            if len(keywords) == 0:
                haberler.append(haber)
            else:
                for keyword in keywords:
                    print ('----------------------')
                    if haber.find(keyword) >= 0:
                        found = found + " " + keyword
                        ok += 1
                if ok > 0:
                    print ("3", h, t, found)
                    if detay_goster is True:
                        haberler.append(haber)
        i += 1
    k = 0
    while k < len(haberler):
        f = codecs.open("abc" + str(k+1) + ".txt", encoding='utf-8', mode='w+')
        f.write(haberler[k])
        f.close()
        k += 1
keywords = ''
url = ['http://www.hurriyet.com.tr/', 'http://www.milliyet.com.tr/']
s = 0
while s < len(url):
    scrape_hurriyet(keywords, True, url[s])
    s += 1
Here is the exception:
Traceback (most recent call last):
File "C:/Users/KerimCaner/PycharmProjects/Hurriyet/hurriyet.py", line 94, in <module>
scrape_hurriyet(keywords, True, url[s])
File "C:/Users/KerimCaner/PycharmProjects/Hurriyet/hurriyet.py", line 62, in scrape_hurriyet
haber = haber_oku(h)
File "C:/Users/KerimCaner/PycharmProjects/Hurriyet/hurriyet.py", line 17, in haber_oku
r = requests.get(haber_url, headers=headers)
File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\api.py", line 69, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\api.py", line 50, in request
response = session.request(method=method, url=url, **kwargs)
File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\sessions.py", line 465, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\sessions.py", line 567, in send
adapter = self.get_adapter(url=request.url)
File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\sessions.py", line 641, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'javascript:;'
Answer (score: 0):
The error you are getting, requests.exceptions.InvalidSchema: No connection adapters were found for 'javascript:;',
means you are trying to request a piece of JavaScript. You are currently following every URL found in the anchor tags, but you need to filter out the javascript: URLs. You should replace the following line:
if h is not None:
with something like this:
if h is not None and not(h.startswith("javascript")):
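For a more general filter, here is a minimal sketch (assuming Python 2, as in the question; the helper name is_http_link is made up for illustration) that keeps only absolute http/https links, so javascript:, mailto: and similar hrefs never reach requests:

from urlparse import urlparse  # Python 2; on Python 3 use: from urllib.parse import urlparse

def is_http_link(h):
    # Keep only absolute http/https links; drops 'javascript:;', 'mailto:', '#' anchors, etc.
    if h is None:
        return False
    return urlparse(h).scheme in ("http", "https")

You would then test if is_http_link(h): in place of the if h is not None: check. Note that this also drops relative links such as /spor, which would otherwise raise requests.exceptions.MissingSchema when passed straight to requests.get.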