我想知道如何使用 Beautiful Soup 收集谷歌搜索结果中的所有网址及其页面源代码,逐一访问这些网址,然后再转到下一个谷歌搜索结果页面。
这是我要从中收集网址的搜索页面: https://www.google.com/search?q=site%3Awww.rashmi.com&rct=j ,截图见: http://www.rashmi.com/blog/wp-content/uploads/2014/11/screencapture-www-google-com-search-1433026719960.png
这是我尝试的代码
def getPageLinks(page):
    """Return the hrefs of all anchors on *page* that mention www.rashmi.com.

    Args:
        page: Parsed document exposing ``find_all('a')``; each returned
            element must expose ``get('href')``.

    Returns:
        A list of href strings containing ``'www.rashmi.com/'``, in document
        order. Anchors with a missing or empty href are skipped.
    """
    links = []
    for link in page.find_all('a'):
        url = link.get('href')
        # Skip anchors without an href before doing the substring test.
        if url and 'www.rashmi.com/' in url:
            links.append(url)
    return links
def Links(url):
    """Parse the query string of *url* into a dict.

    Args:
        url: URL string whose query component should be decoded.

    Returns:
        The mapping produced by ``parse_qs``: parameter name -> list of
        values. An URL without a query string yields an empty dict.
    """
    pUrl = urlparse(url)
    # parse_qs returns a dict keyed by parameter name, so indexing it with
    # the integer 0 always raised KeyError; return the whole mapping.
    return parse_qs(pUrl.query)
def PagesVisit(browser, printInfo):
# Fragment as posted in the question: repeatedly loads a Google results URL
# for site:www.rashmi.com through `browser`. Only `browser.get` is used here
# (presumably a Selenium WebDriver - confirm). `printInfo` is not referenced
# in the visible lines.
pageIndex = 1
visited = []  # not used in the visible lines
time.sleep(5)
# NOTE(review): no break/return is visible, so the loop as posted never exits.
while True:
# NOTE(review): str(pageIndex) is concatenated directly onto the `ei` value
# and `start` is fixed at 10, so each iteration presumably requests the same
# result offset - confirm the intended paging parameter.
browser.get("https://www.google.com/search?q=site:www.rashmi.com&ei=50hqVdCqJozEogS7uoKADg" + str(pageIndex)+"&start=10&sa=N")
pList = []  # not used in the visible lines
count = 0  # not used in the visible lines
pageIndex += 1
答案 0 :(得分:0)
试试这个应该有效。
def getPageLinks(page):
    """Collect every anchor href on *page* that points at www.rashmi.com.

    Args:
        page: Parsed document exposing ``find_all('a')``; each returned
            element must expose ``get('href')``.

    Returns:
        A list of href strings containing ``'www.rashmi.com/'``, in document
        order. Anchors whose href is missing or empty are ignored.
    """
    hrefs = (anchor.get('href') for anchor in page.find_all('a'))
    # The truthiness test drops None/'' before the substring check runs.
    return [href for href in hrefs if href and 'www.rashmi.com/' in href]
def Links(url):
    """Decode the query string of *url*.

    Args:
        url: URL string to inspect.

    Returns:
        The dict produced by ``parse_qs`` for the URL's query component
        (parameter name -> list of values); empty when there is no query.
    """
    query = urlparse(url).query
    return parse_qs(query)
def PagesVisit(browser, printInfo):
    """Step through the Google result pages for ``site:www.rashmi.com``.

    Loads the results ten at a time (offsets 0, 10, ..., 490), parsing each
    page source with BeautifulSoup, then closes the browser and returns.

    Args:
        browser: Driver object exposing ``get(url)``, ``page_source`` and
            ``close()`` (presumably a Selenium WebDriver - confirm).
        printInfo: Not referenced in this snippet; kept so existing callers
            keep working.
    """
    start = 0
    # NOTE(review): visited, pList, count and page are assigned but never read
    # in the posted snippet; kept as placeholders for the per-page link
    # handling that the snippet does not show.
    visited = []
    time.sleep(5)
    while True:
        # The offset must be sent as the `start` query parameter; the posted
        # code appended a bare "&<n>" with no key.
        browser.get(
            "https://www.google.com/search?q=site:www.rashmi.com"
            "&ei=V896VdiLEcPmUsK7gdAH&start=" + str(start) + "&sa=N"
        )
        pList = []
        count = 0
        # Random sleep to make sure everything loads
        time.sleep(random.randint(1, 5))
        page = BeautifulSoup(browser.page_source)
        start += 10
        if start >= 500:
            browser.close()
            # Leave the loop: the browser is closed, so a further get() would
            # be issued against a dead session.
            break