Python多域爬虫InvalidSchema异常

时间:2015-05-07 07:58:07

标签: python web-crawler multiple-domains

这是我的代码。我的目标是抓取多个域名。我在url数组中设置了域名,但我无法抓取。

代码可以找到网址但不解析或抓取。

这是结果:我的代码运行 ('总链接数:',387) ('新闻链接:',146)

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import codecs

headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "accept-charset": "cp1254,ISO-8859-9,utf-8;q=0.7,*;q=0.3",
    "accept-encoding": "gzip,deflate,sdch",
    "accept-language": "tr,tr-TR,en-US,en;q=0.8",
}

def haber_oku(haber_url):
    r = requests.get(haber_url, headers=headers)
    if r.status_code != 200:
        return
    soup = BeautifulSoup(r.content)
    result = soup.find("div", {'itemprop': 'articleBody'})
    if result:
        return result.get_text()
    else:
        result = soup.find("div", {'itemprop': 'description'})
        if result:
            return result.get_text()
    return

def scrape_hurriyet(keywords, detay_goster, url):

    if len(keywords) > 0:
        keywords = keywords.split(',')
    s = 0

    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        print("request reddedildi")
        return

    soup = BeautifulSoup(r.content)
    results = soup.findAll("a")
    print ("Toplam link sayisi : ", len(results))
    liste_link = []
    liste_text = []
    haberler = []
    for result in results:
        h = result.get('href')
        t = result.get_text()
        if h is not None:
            if str(h).find('http://www.hurriyet.com.tr/') or str(h).find('http://www.milliyet.com.tr/spor') >= 0:
                if h not in liste_link:
                    if h.find('.asp') or h.find('.htm') > 0:
                        liste_link.append(h)
                        liste_text.append(t)

    print ("Tekil linkler: ", len(liste_link))
    i = 0
    while i < len(liste_link):
        h = liste_link[i]
        t = liste_text[i]
        haber = haber_oku(h)

        if haber is not None:
            haber = BeautifulSoup(haber).get_text()
            ok = 0
            found = ""

            if len(keywords) == 0:
                haberler.append(haber)

            else:
                for keyword in keywords:
                    print ('----------------------')
                    if haber.find(keyword) >= 0:
                        found = found + " " + keyword
                        ok += 1
                    if ok > 0:
                        print ("3", h, t, found)
                    if detay_goster is True:
                        haberler.append(haber)
        i += 1
    k = 0
    while k < len(haberler):
        f = codecs.open("abc" + str(k+1) + ".txt", encoding='utf-8', mode='w+')
        f.write(haberler[k])
        k += 1
    f.close()

keywords = ''
url = ['http://www.hurriyet.com.tr/', 'http://www.milliyet.com.tr/']
s = 0
while s < len(url):
    scrape_hurriyet(keywords, True, url[s])
    s += 1

他们是例外:

Traceback (most recent call last):
  File "C:/Users/KerimCaner/PycharmProjects/Hurriyet/hurriyet.py", line 94, in <module>
    scrape_hurriyet(keywords, True, url[s])
  File "C:/Users/KerimCaner/PycharmProjects/Hurriyet/hurriyet.py", line 62, in scrape_hurriyet
    haber = haber_oku(h)
  File "C:/Users/KerimCaner/PycharmProjects/Hurriyet/hurriyet.py", line 17, in haber_oku
    r = requests.get(haber_url, headers=headers)
  File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\api.py", line 69, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\api.py", line 50, in request
    response = session.request(method=method, url=url, **kwargs)
  File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\sessions.py", line 465, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\sessions.py", line 567, in send
    adapter = self.get_adapter(url=request.url)
  File "C:\Users\KerimCaner\AppData\Roaming\Python\Python27\site-packages\requests\sessions.py", line 641, in get_adapter
    raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'javascript:;'

1 个答案:

答案 0 :(得分:0)

您收到的错误:requests.exceptions.InvalidSchema: No connection adapters were found for 'javascript:;'表示您正在尝试抓取一段javascript。您当前正在抓取锚标记中的所有网址,但您需要过滤掉javascript-url。您应该替换以下行:

    if h is not None:

有这样的事情:

    if h is not None and not(h.startswith("javascript")):