I plan to scrape several pages in a row from a list of URLs, using the code below.
Is there a smart way to replace the manually entered terms in desired_google_queries with a reference to an extensive list of queries (which could be a CSV or Excel file)?
from bs4 import BeautifulSoup
import urllib.request
import csv

desired_google_queries = ['Word', 'lifdsst', 'yvou', 'should', 'load']

for query in desired_google_queries:
    url = 'http://google.com/search?q=' + query
    req = urllib.request.Request(url, headers={'User-Agent': 'Magic Browser'})
    response = urllib.request.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    resultStats = soup.find(id="resultStats").string
    print(resultStats)

with open('queries.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['query', 'resultStats'])
    for query in desired_google_queries:
        ...
        spamwriter.writerow([query, resultStats])
Answer (score: 1)
You can put the scraping logic into a function and then call it for each query read in from the .csv file.
from bs4 import BeautifulSoup
import urllib.request
import csv


def scrape_site(query):
    url = 'http://google.com/search?q=' + query
    req = urllib.request.Request(url, headers={'User-Agent': 'Magic Browser'})
    response = urllib.request.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    resultStats = soup.find(id="resultStats").string
    return resultStats


#####################################################
# Read in queries from .csv to desired_google_queries
# (assumes an input file, here 'queries.csv', with one
# query per row and no header row)
with open('queries.csv', newline='') as infile:
    reader = csv.reader(infile)
    desired_google_queries = [row[0] for row in reader if row]

# Write the results to a separate file ('results.csv' is an
# arbitrary name) so the input list is not overwritten
with open('results.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['query', 'resultStats'])
    for query in desired_google_queries:
        resultStats = scrape_site(query)
        spamwriter.writerow([query, resultStats])
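
Since the question also mentions Excel: a minimal sketch of the read-in step using pandas, assuming the queries sit in a column named 'query' of a file 'queries.xlsx' (both names are hypothetical). It requires pandas plus an Excel engine such as openpyxl to be installed.

import pandas as pd

# Assumed layout: a sheet with a header row containing a 'query' column
df = pd.read_excel('queries.xlsx')
desired_google_queries = df['query'].dropna().astype(str).tolist()

The resulting list can be passed to the same loop over scrape_site as in the CSV version above.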