我想使用以下代码抓取Naver博客,只会抓取第一页上的帖子1到10。 11~20,21~30 ....如何编辑继续抓取
import sys
from bs4 import BeautifulSoup
import requests
import csv
BASE_URL = "https://search.naver.com/search.naver?where=post&sm=tab_pge&query=%ED%99%94%EC%A0%95%EC%B2%9C&st=sim&date_option=8&date_from=20160101&date_to=20161231&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=p%3Afrom20160101to20161231&ie=utf8&start="
f = open("park01.csv", 'w', newline='')
wr =csv.writer(f)
for i in range(100):
URL_with_page_num = BASE_URL + str(1 + i*10)
response = requests.get(BASE_URL)
response.status_code
print (response.status_code)
dom = BeautifulSoup(response.content, "html.parser")
post_elements = dom.select("li.sh_blog_top")
for post_element in post_elements:
title_element = post_element.select_one("a.sh_blog_title")
passage_element = post_element.select_one("dd.sh_blog_passage")
title = title_element.text
url = title_element.get("href")
passage = passage_element.text
data=[title, url, passage]
wr.writerow(data)
f.close()
答案 0 :(得分:2)
我想问题出现在下面的代码中 -
for i in range(100):
URL_with_page_num = BASE_URL + str(1 + i*10)
response = requests.get(BASE_URL)
在上面代码的最后一行中放置URL_with_page_num
代替BASE_URL
response = requests.get(URL_with_page_num)