在python中传递一个列表值

时间:2016-11-06 20:19:49

标签: python web-crawler parameter-passing

我是Python新手,我正在尝试把一个函数返回的值再传回同一个函数,以便循环调用它。

上下文:我正在开发一个从reddit中抓取数据的网络爬虫。我能够抓住帖子,对帖子发表评论,并将下一页href存储在名为next_page的变量中。我的问题是我无法将next_page传递回getPosts

非常感谢任何人提供的任何指导!

import requests
import re
from bs4 import BeautifulSoup
import time
from fake_useragent import UserAgent
import datetime
#Set the default string encoding to UTF-8 (Python 2 only: reload(sys)
#restores the setdefaultencoding hook that site.py normally deletes).
import sys
from collections import deque
reload(sys)
sys.setdefaultencoding('UTF8')
#Module-level state shared by the crawler functions below.
ua = UserAgent()  # random User-Agent generator (fake_useragent)
dt=datetime.datetime.now()  # crawl timestamp printed in the report header
next_page=[]  # href of the next search-results page -- NOTE(review): the accepted answer suggests initialising this as '' rather than []
comment_url=[]  # hrefs of per-post comment pages
url_list=[]  # work queue of search-result URLs; url_list[0] is the next page to crawl
############################################
#Declare functions
def getComments(comment_url):
    #Placeholder comment scraper; the full implementation appears later
    #in the file and rebinds this name. Always yields an empty string.
    return ''
def getNext(next_page):
    #Identity placeholder: hand the next-page href straight back to the
    #caller. Redefined with real behaviour later in the file.
    return next_page
############################################
#print standard output
#Uncomment the next line to redirect all crawler output to a file:
#sys.stdout=open('reddit_output.txt','w')
############################################
print "Crawl Date: ", dt.strftime('%B %d %Y'),"\n"
print "First Post"
#############################################



####################################################################################################################################
#capture individual search results

def getPosts(next_page):
    global url_list
    time.sleep(2)
    ua=UserAgent()
    headers={'user-agent': ua.random}
    #iterate through list_object of urls
    url_list=["https://www.reddit.com/r/HealthInsurance/search?q=health+insurance&sort=new&t=all", next_page]
    url=url_list[0]
    response=requests.get(url, headers=headers)
    html=response.text
    soup=BeautifulSoup(html,'lxml')
    comment_url=[]
    #clears original url from url_list  
    del url_list[0]     
#capture just the search results content
    content=soup.findAll(class_='listing search-result-listing')[1]
#add next_page link to url_list
#capture individual posts and link to the comments  
    for results in content.findAll(class_='contents'):
                    for post in results.findAll(class_=' search-result search-result-link no-linkflair '):
                        for post_elements in post:
                        #find all 'comments' links and store in list
                            comment_url.append(post.find(class_="search-result-meta").find('a').attrs['href'])
                        #header= post.find(class_='search-results-header')
                            title=post.findAll(class_='search-result-header')
                            meta=post.findAll(class_='search-result-meta')
                        ##print post and comments if there are any
                            ##print '|'         
                    #getComments(comment_url)       
#capture next page link     
    next_page=soup.find(class_="nav-buttons").findAll('a')[0].attrs['href']
    url_list.insert(0,next_page)
    return comment_url
    print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!next search results page!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
    print (next_page)

getPosts('')  # seed the crawl: '' means "start at the first search-results page"
################################################################################################################################### 
def getComments(comment_url):
    url= getPosts(comment_url)
    ##iterate through comment_list
    for link in url:
        time.sleep(2)
        ua=UserAgent()
        headers={'user-agent': ua.random}
        response=requests.get(link, headers=headers)
        html=response.text
        soup=BeautifulSoup(html, 'lxml')
        original_post=soup.find(class_='sitetable linklisting')
        content=soup.find(class_="sitetable nestedlisting")

        #print comments
        print "Original Post##########################################\n"
        print original_post.get_text()
        print "Comments###############################################\n"
        print content.get_text()
        print '|'
        print "New Post++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"


getComments("")  # crawl page 1 and print every post with its comments

####################################################################################################################################
#passes the next page to getPosts
print "/////////////////////next search results page//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////"
print url_list[0]

#NOTE(review): at this point the name getNext still refers to the identity
#stub defined near the top of the file (the real getNext is defined below),
#so this call does nothing useful.
getNext(next_page)

def getNext(next_page):
    url=(url_list[0])
    print url
    getPosts(url)
getNext(next_page)  # crawl the page queued in url_list[0]; the argument is ignored

1 个答案:

答案 0 :(得分:0)

在函数 getPosts() 的开头添加 `global next_page`,并去掉同名的函数参数(Python 不允许同一个名字既是参数又被声明为 global),这样函数内部对 next_page 的赋值就会更新全局变量 next_page 的值。另外,把全局变量 next_page 初始化为字符串 `''` 而不是列表 `[]`。