I am trying to scrape a web page. Here is my code:
import requests
from bs4 import BeautifulSoup

## getting links from a web page in a form of list
def get_list(url) :
    for i in range(262) :
        url = "http://pann.nate.com/search/talk?searchType=A&q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80&page=1".format(i)
        response = requests.get(url)
        my_soup = BeautifulSoup(response.content, "html.parser")
    return my_soup.body.select("a")

list = get_list(url)
## getting the links and make as a list
def get_link(list) :
    ls = []
    for i in range(0, len(list)) :
        link = list[i].get('href')
        ls.append(link)
    return ls

ls = get_link(list)
## scraping the content from each link
def get_text(link) :
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    text = soup.select_one("#contentArea").text
    return text

text = get_text(ls)
However, when I run "text", I get this error message:

In [30]: text = get_text(ls)
Traceback (most recent call last):
File "<ipython-input-30-a4e9a7e8cd0f>", line 1, in <module>
text = get_text(ls)
File "<ipython-input-29-1d89fe03762f>", line 2, in get_text
response = requests.get(link)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 508, in request
resp = self.send(prep, **send_kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 612, in send
adapter = self.get_adapter(url=request.url)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 703, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
InvalidSchema: No connection adapters were found for
'['http://www.nate.com/?f=pann', 'http://pann.nate.com/',
'http://news.nate.com/', 'http://sports.news.nate.com/index',
'http://news.nate.com/ent/index', 'http://www.nate.com/',
'http://www.nate.com/sitemap/', '#', '#', '#', 'http://pann.nate.com/',
'http://pann.nate.com/talk', 'http://pann.nate.com/talk/imageTheme/index',
'http://pann.nate.com/fantalk', 'http://pann.nate.com/video', '#',
'http://pann.nate.com/search?q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80',
'http://pann.nate.com/search/talk?q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80',
'http://pann.nate.com/search/fantalk?q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80', 'http://pann.nate.com/search/video?q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80', 'http://pann.nate.com/talk/342035421',
'http://pann.nate.com/talk/342035421', 'http://pann.nate.com/talk/c20025',
'http://pann.nate.com/search/talk?q=%E3%85%87%E3%85%87&searchType=N',
'http://pann.nate.com/talk/342038400',
'http://pann.nate.com/talk/342038400', 'http://pann.nate.com/talk/c20038',
'http://pann.nate.com/search/talk?q=%E3%85%87%E3%85%87&searchType=N',
'http://pann.nate.com/talk/341991386',
The message keeps going on like this... What seems to be the problem? Thank you very much for your help.
Answer 0 (score: 0)
In Python, .format does some magic. For example, if you run the following code:
"{}".format("foo")
you get "foo" as the result.
In this particular case, you seem to be formatting something without actually changing anything (I doubt you intended to request the same URL multiple times; if you did, the format is not needed in the first place).
Apart from that, I cannot reproduce the problem. I get a 200 response when I try this.
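To make that concrete, here is a minimal sketch (using the same URL as in the question; the names base and template are just for illustration) showing that .format only substitutes where a {} placeholder appears:

base = "http://pann.nate.com/search/talk?searchType=A&q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80&page=1"
template = "http://pann.nate.com/search/talk?searchType=A&q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80&page={}"

for i in range(3) :
    print(base.format(i))      # no {} placeholder, so the string comes back unchanged: ...&page=1
    print(template.format(i))  # the placeholder is replaced by i: ...&page=0, ...&page=1, ...&page=2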
Answer 1 (score: 0)
You need the http:// protocol indicator at the start of your URL.
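As a rough illustration (the links variable below is hypothetical), requests.get only accepts a single URL string that begins with a supported scheme such as http://; passing anything else, including a whole list of URLs, is what produces the InvalidSchema error shown above:

import requests

links = ['http://pann.nate.com/', 'http://news.nate.com/']

# requests.get(links) would fail: the list is turned into the string
# "['http://pann.nate.com/', ...]", which does not start with http://,
# so requests raises InvalidSchema: No connection adapters were found.

# Requesting each string in the list one at a time works:
for link in links :
    response = requests.get(link)
    print(link, response.status_code)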
Answer 2 (score: 0)
url = "http://pann.nate.com/search/talk?searchType=A&q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80&page=1".format(i)
Try:
url = "http://pann.nate.com/search/talk?
searchType=A&q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80&page={}".format(i)
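For context, a minimal sketch (an assumption about how the loop might then look, not the asker's exact code) of using that placeholder so each iteration fetches a different page:

import requests
from bs4 import BeautifulSoup

for i in range(1, 263) :   # pages 1 through 262
    url = "http://pann.nate.com/search/talk?searchType=A&q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80&page={}".format(i)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    # each page now yields its own set of <a> tags
    print(url, len(soup.body.select("a")))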
Answer 3 (score: 0)
I actually solved the problem. I just needed to specify the exact part of the HTML that contains #contentArea. I modified my code as follows:
import requests
from bs4 import BeautifulSoup

for i in range(1, 262) :
    url = "http://pann.nate.com/search/talk?q=%EB%AF%B8%EC%84%B8%EB%A8%BC%EC%A7%80&page={}".format(i)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    list = soup.find_all("ul", {"class":"s_list"})
    for i in range(1, len(list)) :
        a_list = list[i].select("dt")
        for i in range(1, len(a_list)) :
            mylist = a_list[i].select("a")
            ln = []
            for i in range(1, len(mylist)) :
                link = mylist[i].get('href')
                if len(link) > 2 and "javascript" not in link:
                    ln.append(link)
            for i in range(1, len(ln)) :
                my_response = requests.get(ln[i])
                mysoup = BeautifulSoup(my_response.content, "html.parser")
                filename = "natepan_dust.csv"
                f = open(filename, "w", encoding='UTF-8')
                headers = "title, time, content/n"
                f.write("headers")
                for i in range(1, len(mysoup)) :
                    a_title = mysoup.select("h4")
                    title = a_title[0].text
                    a_time = mysoup.findAll("span", {"class":"date"})
                    time = a_time[0].text
                    content = mysoup.select_one("#contentArea").text
                    print("title: " + title)
                    print("time: " + time)
                    print("content: " + content)
                    f.write(title.replace(","," ") + "," + time + "," + content.replace(","," ") + "\n")
                f.close()
However! I have another problem. I cannot iterate through the pages from 1 to 262: the CSV file I end up with only contains the data from the last page. What am I doing wrong?