I want to get images from the detail pages of a website. I'm using the RSS "links" field to get the links. Here is my code:
@app.task
def pan_task():
    url = 'http://feeds.example.com/reuters/technologyNews'
    name = 'noticiassin'
    live_leaks = [i for i in feedparser.parse(url).entries][:10]
    the_count = len(live_leaks)
    ky = feedparser.parse(url).keys()
    oky = [i.keys() for i in feedparser.parse(url).entries][1]  # shows what I can pull

    def make_soup(url):
        def swappo():
            user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
            user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
            user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
            user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '
            agent_list = [user_one, user_two, user_thr, user_for]
            a = random.choice(agent_list)
            return a

        headers = {
            "user-agent": swappo(),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "accept-encoding": "gzip,deflate,sdch",
            "accept-language": "en-US,en;q=0.8",
        }

        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        # comment = soupdata.find('a').get('src')
        # para = comment.find_all('p')
        # kids = [child.text for child in para]
        # blu = str(kids).strip('[]')
        return soupdata

    try:
        live_entries = [{'href': live_leak.links[0]['href']} for live_leak in live_leaks]
        o = make_soup(live_entries)
    except IndexError:
        print('error check logs')
        live_entries = []
    return print(o)
But when I try it, I get this error:
[2016-10-07 21:10:58,019: ERROR/MainProcess] Task blog.tasks.pan_task[f43ed360-c06e-4a4b-95ab-4f44a4564afa] raised unexpected: InvalidSchema("No connection adapters were found for '[{'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/AA1uAIpygjQ/us-apple-samsung-elec-appeal-idUSKCN1271LF'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/Nz28cqiuS0Y/us-google-pixel-advertising-idUSKCN12721U'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/POLoFj22hc4/us-yahoo-nsa-order-idUSKCN12800D'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/eF-XlhlQl-s/us-fcc-dataservices-idUSKCN1271RB'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/hNf9IQ3rXjw/us-autonomous-nauto-idUSKCN1271FX'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/NXkk5WfWVhM/us-sony-sensors-idUSKCN1270EC'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/gdBvoarqQro/us-yahoo-discrimination-lawsuit-idUSKCN12800K'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/nt8K--27bDg/us-thomsonreuters-ceo-idUSKCN1271DQ'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/f8z3eQg2Fpo/us-snapchat-ipo-idUSKCN12627S'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/rr4vdLsC11Y/us-samsung-elec-results-idUSKCN1262NO'}]'",)
Traceback (most recent call last):
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/celery/app/trace.py", line 240, in trace_task
    R = retval = fun(*args, **kwargs)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/celery/app/trace.py", line 438, in __protected_call__
    return self.run(*args, **kwargs)
  File "/Users/ray/Desktop/myheroku/practice/src/blog/tasks.py", line 134, in pan_task
    o = make_soup(live_entries)
  File "/Users/ray/Desktop/myheroku/practice/src/blog/tasks.py", line 124, in make_soup
    the_comments_page = requests.get(url, headers=headers)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/api.py", line 67, in get
    return request('get', url, params=params, **kwargs)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/api.py", line 53, in request
    return session.request(method=method, url=url, **kwargs)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 468, in request
    resp = self.send(prep, **send_kwargs)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 570, in send
    adapter = self.get_adapter(url=request.url)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 644, in get_adapter
    raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for '[{'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/AA1uAIpygjQ/us-apple-samsung-elec-appeal-idUSKCN1271LF'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/Nz28cqiuS0Y/us-google-pixel-advertising-idUSKCN12721U'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/POLoFj22hc4/us-yahoo-nsa-order-idUSKCN12800D'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/eF-XlhlQl-s/us-fcc-dataservices-idUSKCN1271RB'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/hNf9IQ3rXjw/us-autonomous-nauto-idUSKCN1271FX'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/NXkk5WfWVhM/us-sony-sensors-idUSKCN1270EC'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/gdBvoarqQro/us-yahoo-discrimination-lawsuit-idUSKCN12800K'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/nt8K--27bDg/us-thomsonreuters-ceo-idUSKCN1271DQ'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/f8z3eQg2Fpo/us-snapchat-ipo-idUSKCN12627S'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/rr4vdLsC11Y/us-samsung-elec-results-idUSKCN1262NO'}]'
Why doesn't this work? I use the same function in another program.
Answer 0 (score: -1)
requests.get() expects a single URL string, but your code builds live_entries as a list of dicts and passes that whole list to make_soup(), which is why requests raises InvalidSchema. Loop over the feed entries and pass each href individually, like this:
import random

import feedparser
import requests
from bs4 import BeautifulSoup

# `app` is your existing Celery instance from blog/tasks.py


@app.task
def pan_task():
    url = 'http://feeds.example.com/reuters/technologyNews'
    name = 'noticiassin'
    live_leaks = [i for i in feedparser.parse(url).entries][:10]
    the_count = len(live_leaks)
    ky = feedparser.parse(url).keys()
    oky = [i.keys() for i in feedparser.parse(url).entries][1]  # shows what I can pull

    def make_soup(url):
        def swappo():
            # rotate through a few user agents so requests don't all look identical
            user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
            user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
            user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
            user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '
            agent_list = [user_one, user_two, user_thr, user_for]
            return random.choice(agent_list)

        headers = {
            "user-agent": swappo(),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "accept-encoding": "gzip,deflate,sdch",
            "accept-language": "en-US,en;q=0.8",
        }

        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        # comment = soupdata.find('div')
        # para = comment.find_all('p')
        # kids = [child.text for child in para]
        # blu = str(kids).strip('[]')
        return soupdata

    live_entries = []
    try:
        for live_leak in live_leaks:
            # pass one URL string at a time, not the whole list
            live_entries.append(make_soup(live_leak.links[0]['href']))
            # do whatever else you need with each soup here
    except IndexError:
        print('error check logs')
        live_entries = []
    return live_entries
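Each element of live_entries is then a parsed BeautifulSoup document for one article page, so you can pull the image URLs out of it with find_all(). A minimal sketch, assuming the article images are plain <img> tags (the helper name extract_image_urls is mine, and the selector will need adjusting to the site's actual markup):

def extract_image_urls(soups):
    """Collect the src attribute of every <img> tag found in each soup."""
    image_urls = []
    for soup in soups:
        for img in soup.find_all('img'):
            src = img.get('src')
            if src:  # skip <img> tags without a src attribute
                image_urls.append(src)
    return image_urls

# usage, e.g. on the task's return value:
#   soups = pan_task()
#   print(extract_image_urls(soups))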