Error when I try to scrape the detail pages for images

Time: 2016-10-08 01:59:29

Tags: python parsing beautifulsoup screen-scraping feedparser

I want to grab images from the detail pages of a website. I'm using the "links" field of the RSS entries to get the URLs. Here is my code:

import random

import feedparser
import requests
from bs4 import BeautifulSoup

# `app` is the existing Celery application instance
@app.task
def pan_task():
    url = 'http://feeds.example.com/reuters/technologyNews'
    name = 'noticiassin'
    live_leaks = [i for i in feedparser.parse(url).entries][:10]
    the_count = len(live_leaks)
    ky = feedparser.parse(url).keys()
    oky = [i.keys() for i in feedparser.parse(url).entries][1] # shows what I can pull

    def make_soup(url):
        def swappo():
            user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
            user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
            user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
            user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '

            agent_list = [user_one, user_two, user_thr, user_for]
            a = random.choice(agent_list)
            return a
        headers = {
            "user-agent": swappo(),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "accept-encoding": "gzip,deflate,sdch",
            "accept-language": "en-US,en;q=0.8",
        }
        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        # comment = soupdata.find('a').get('src')
        # para = comment.find_all('p')
        # kids = [child.text for child in para]
        # blu = str(kids).strip('[]')
        return soupdata

    try:
        live_entries = [{'href': live_leak.links[0]['href']} for live_leak in live_leaks]
        o = make_soup(live_entries)
    except IndexError:
        print('error check logs')
        live_entries = []

    return print(o)

But when I run it I get this error:

[2016-10-07 21:10:58,019: ERROR/MainProcess] Task blog.tasks.pan_task[f43ed360-c06e-4a4b-95ab-4f44a4564afa] raised unexpected: InvalidSchema("No connection adapters were found for '[{'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/AA1uAIpygjQ/us-apple-samsung-elec-appeal-idUSKCN1271LF'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/Nz28cqiuS0Y/us-google-pixel-advertising-idUSKCN12721U'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/POLoFj22hc4/us-yahoo-nsa-order-idUSKCN12800D'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/eF-XlhlQl-s/us-fcc-dataservices-idUSKCN1271RB'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/hNf9IQ3rXjw/us-autonomous-nauto-idUSKCN1271FX'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/NXkk5WfWVhM/us-sony-sensors-idUSKCN1270EC'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/gdBvoarqQro/us-yahoo-discrimination-lawsuit-idUSKCN12800K'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/nt8K--27bDg/us-thomsonreuters-ceo-idUSKCN1271DQ'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/f8z3eQg2Fpo/us-snapchat-ipo-idUSKCN12627S'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/rr4vdLsC11Y/us-samsung-elec-results-idUSKCN1262NO'}]'",)
Traceback (most recent call last):
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/celery/app/trace.py", line 240, in trace_task
    R = retval = fun(*args, **kwargs)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/celery/app/trace.py", line 438, in __protected_call__
    return self.run(*args, **kwargs)
  File "/Users/ray/Desktop/myheroku/practice/src/blog/tasks.py", line 134, in pan_task
    o = make_soup(live_entries)
  File "/Users/ray/Desktop/myheroku/practice/src/blog/tasks.py", line 124, in make_soup
    the_comments_page = requests.get(url, headers=headers)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/api.py", line 67, in get
    return request('get', url, params=params, **kwargs)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/api.py", line 53, in request
    return session.request(method=method, url=url, **kwargs)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 468, in request
    resp = self.send(prep, **send_kwargs)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 570, in send
    adapter = self.get_adapter(url=request.url)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 644, in get_adapter
    raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for '[{'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/AA1uAIpygjQ/us-apple-samsung-elec-appeal-idUSKCN1271LF'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/Nz28cqiuS0Y/us-google-pixel-advertising-idUSKCN12721U'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/POLoFj22hc4/us-yahoo-nsa-order-idUSKCN12800D'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/eF-XlhlQl-s/us-fcc-dataservices-idUSKCN1271RB'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/hNf9IQ3rXjw/us-autonomous-nauto-idUSKCN1271FX'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/NXkk5WfWVhM/us-sony-sensors-idUSKCN1270EC'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/gdBvoarqQro/us-yahoo-discrimination-lawsuit-idUSKCN12800K'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/nt8K--27bDg/us-thomsonreuters-ceo-idUSKCN1271DQ'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/f8z3eQg2Fpo/us-snapchat-ipo-idUSKCN12627S'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/rr4vdLsC11Y/us-samsung-elec-results-idUSKCN1262NO'}]'

Why doesn't this work? I use the same function in another program.

1 Answer:

Answer 0 (score: -1):

`requests.get()` expects a single URL string, but you are passing it the whole `live_entries` list, which is why requests raises `InvalidSchema`. Call `make_soup()` once per feed entry instead, something like this:

@app.task
def pan_task():
    url = 'http://feeds.example.com/reuters/technologyNews'
    name = 'noticiassin'
    live_leaks = [i for i in feedparser.parse(url).entries][:10]
    the_count = len(live_leaks)
    ky = feedparser.parse(url).keys()
    oky = [i.keys() for i in feedparser.parse(url).entries][1] # shows what I can pull

    def make_soup(url):
        def swappo():
            user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
            user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
            user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
            user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '

            agent_list = [user_one, user_two, user_thr, user_for]
            a = random.choice(agent_list)
            return a
        headers = {
            "user-agent": swappo(),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "accept-encoding": "gzip,deflate,sdch",
            "accept-language": "en-US,en;q=0.8",
        }
        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        # comment = soupdata.find('div')
        # para = comment.find_all('p')
        # kids = [child.text for child in para]
        # blu = str(kids).strip('[]')
        return soupdata

    live_entries = []
    try:
        for live_leak in live_leaks:
            live_entries.append(make_soup(live_leak.links[0]['href']))
            # Do whatever you need to do with each soup object here
    except IndexError:
        print('error check logs')
        live_entries = []
    return live_entries
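
Since the original goal was to pull images from each detail page, here is a minimal follow-up sketch showing how the returned soup objects could be mined for image sources. The `extract_image_urls` helper and the sample HTML are hypothetical, and it assumes the detail pages expose their pictures as ordinary `<img>` tags:

import urllib.parse

from bs4 import BeautifulSoup


def extract_image_urls(soupdata, base_url=''):
    # Hypothetical helper: collect absolute image URLs from one parsed page.
    urls = []
    for img in soupdata.find_all('img'):
        src = img.get('src')
        if src:
            # Resolve relative paths against the page URL when one is given.
            urls.append(urllib.parse.urljoin(base_url, src))
    return urls


if __name__ == '__main__':
    sample = '<html><body><img src="/pics/a.jpg"><img src="http://example.com/b.png"></body></html>'
    soup = BeautifulSoup(sample, 'html5lib')
    print(extract_image_urls(soup, base_url='http://example.com/article'))
    # -> ['http://example.com/pics/a.jpg', 'http://example.com/b.png']

With the fixed task above, the same helper could simply be mapped over the list that pan_task() returns.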