我是Python新手,我正在努力将一个函数的值传递回同一个函数。
上下文:我正在开发一个从 Reddit 抓取数据的网络爬虫。我能够抓取帖子和帖子的评论,并把下一页的链接(href)存进名为 `next_page` 的变量中。我的问题是:我无法把 `next_page` 传回 `getPosts`。
非常感谢任何人提供的任何指导!
import requests
import re
from bs4 import BeautifulSoup
import time
from fake_useragent import UserAgent
import datetime
#sets default endcoding
import sys
from collections import deque
reload(sys)
sys.setdefaultencoding('UTF8')
#global variables
ua = UserAgent()
dt=datetime.datetime.now()
next_page=[]
comment_url=[]
url_list=[]
############################################
#Declare functions
def getComments(comment_url):
return ('')
def getNext(next_page):
return(next_page)
############################################
#print standard output
#sys.stdout=open('reddit_output.txt','w')
############################################
print "Crawl Date: ", dt.strftime('%B %d %Y'),"\n"
print "First Post"
#############################################
####################################################################################################################################
#capture individual search results
def getPosts(next_page):
global url_list
time.sleep(2)
ua=UserAgent()
headers={'user-agent': ua.random}
#iterate through list_object of urls
url_list=["https://www.reddit.com/r/HealthInsurance/search?q=health+insurance&sort=new&t=all", next_page]
url=url_list[0]
response=requests.get(url, headers=headers)
html=response.text
soup=BeautifulSoup(html,'lxml')
comment_url=[]
#clears original url from url_list
del url_list[0]
#capture just the search results content
content=soup.findAll(class_='listing search-result-listing')[1]
#add next_page link to url_list
#capture individual posts and link to the comments
for results in content.findAll(class_='contents'):
for post in results.findAll(class_=' search-result search-result-link no-linkflair '):
for post_elements in post:
#find all 'comments' links and store in list
comment_url.append(post.find(class_="search-result-meta").find('a').attrs['href'])
#header= post.find(class_='search-results-header')
title=post.findAll(class_='search-result-header')
meta=post.findAll(class_='search-result-meta')
##print post and comments if there are any
##print '|'
#getComments(comment_url)
#capture next page link
next_page=soup.find(class_="nav-buttons").findAll('a')[0].attrs['href']
url_list.insert(0,next_page)
return comment_url
print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!next search results page!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
print (next_page)
getPosts('')
###################################################################################################################################
def getComments(comment_url):
url= getPosts(comment_url)
##iterate through comment_list
for link in url:
time.sleep(2)
ua=UserAgent()
headers={'user-agent': ua.random}
response=requests.get(link, headers=headers)
html=response.text
soup=BeautifulSoup(html, 'lxml')
original_post=soup.find(class_='sitetable linklisting')
content=soup.find(class_="sitetable nestedlisting")
#print comments
print "Original Post##########################################\n"
print original_post.get_text()
print "Comments###############################################\n"
print content.get_text()
print '|'
print "New Post++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"
getComments("")
####################################################################################################################################
#passes the next page to getPosts
print "/////////////////////next search results page//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////"
print url_list[0]
getNext(next_page)
def getNext(next_page):
url=(url_list[0])
print url
getPosts(url)
getNext(next_page)
答案 0 :(得分:0)
在函数 `getPosts()` 的开头添加 `global next_page`,而不是把它作为参数传入,这样函数内部就能直接更新全局变量 `next_page` 的值。另外,请把全局变量 `next_page` 声明为字符串 `''`,而不是列表 `[]`。