编辑:重新编写了正确格式的代码。谢谢你的提示!
我试图抓住一个有多个网页的网站。我尝试使用循环来更改网址并打印出来。第一页工作,循环进入第二个循环,我看到page2的打印,但我没有得到任何输出,我似乎进入一个不会死的循环。我在这里错了什么想法?
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
import requests
from bs4 import BeautifulSoup
# Build the leaderboard URL for each page (1 and 2) and print it.
# The original converted the URL to a list of characters and overwrote the
# char at hard-coded index 129 (the trailing page digit) — fragile: any
# change to the URL length silently corrupts the wrong character.  Format
# the page number into the URL instead; printed output is identical.
BASE_URL = ("https://games.crossfit.com/leaderboard?competition=1&year=2017"
            "&division=2&scaled=0&sort=0&fittest=1&fittest1=0&occupation=0")
for num in range(1, 3):
    varurl = '{0}&page={1}'.format(BASE_URL, num)
    print(varurl)
class Render(QWebPage):
    """Load *url* in a headless QWebPage and block until the page finishes.

    After construction, the rendered frame is available as ``self.frame``.
    """

    def __init__(self, url):
        # Only ONE QApplication may exist per process.  Creating a second
        # one on the second loop iteration is why the script hangs after
        # page 1 — reuse the existing instance if there is one.
        self.app = QApplication.instance() or QApplication(sys.argv)
        QWebPage.__init__(self)
        # Connect the finished signal BEFORE starting the load, so a fast
        # load cannot complete unobserved.
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished calls app.quit().
        self.app.exec_()

    def _loadFinished(self, result):
        # Keep a reference to the rendered frame, then stop the event loop
        # so __init__ can return to the caller.
        self.frame = self.mainFrame()
        self.app.quit()
# Render the last generated URL and print every score list item.
url = varurl
page = Render(url)
html = page.frame.toHtml()
soup = BeautifulSoup(html)
# Walk: #containerOverlay > #leaderboard > ul.scores > li
for overlay in soup.find_all('div', {'id': 'containerOverlay'}):
    for board in overlay.find_all('div', {'id': 'leaderboard'}):
        for score_list in board.find_all('ul', {'class': 'scores'}):
            for entry in score_list.find_all('li'):
                print(entry.text)
更新#1 我采取了三联的建议(谢谢!)我添加了一个打印,看看我在循环中的位置,看到网址的变化。循环永远不会打印第二个URL,我们似乎进入无限循环的虚无。
# Update #1: build the page URLs up front, then render and scrape each one.
baseurl = ("https://games.crossfit.com/leaderboard?competition=1&year=2017&division=2&scaled=0&sort=0&fittest=1&fittest1=0&occupation=0")
urls = ['{0}&page={1}'.format(baseurl, page_num) for page_num in range(1, 3)]
for url in urls:
    renderer = Render(url)
    html = renderer.frame.toHtml()
    print(url)
    soup = BeautifulSoup(html)
    # Same traversal as before: overlay -> leaderboard -> scores -> items.
    for overlay in soup.find_all('div', {'id': 'containerOverlay'}):
        for board in overlay.find_all('div', {'id': 'leaderboard'}):
            for score_list in board.find_all('ul', {'class': 'scores'}):
                for entry in score_list.find_all('li'):
                    print(entry.text)
更新#2 使用来自互联网的一些提示重写。最后那个遍历网址的循环仍存在同样的问题:当 URL 列表长度大于 2 时,抓取部分似乎不会被执行。
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
import requests
from bs4 import BeautifulSoup
class Render(QWebPage):
    """Sequentially load a list of URLs in one headless QWebPage.

    For each finished page the callback ``cb(url, html)`` is invoked with
    the final URL and the rendered HTML.  The constructor blocks until
    every URL has been processed.
    """

    def __init__(self, urls, cb):
        # A single QApplication drives all page loads.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        # Signal must be connected before the first load is started.
        self.loadFinished.connect(self._loadFinished)
        self.urls = urls          # remaining URLs; consumed front-to-back
        self.cb = cb              # per-page callback: cb(url, html)
        self.crawl()              # kick off the first load
        self.app.exec_()          # blocks until crawl() calls app.quit()

    def crawl(self):
        # Pop and load the next URL; when the queue is empty, stop the
        # event loop so __init__ returns.
        if self.urls:
            url = self.urls.pop(0)
            print ('Downloading', url)
            self.mainFrame().load(QUrl(url))
        else:
            self.app.quit()

    def _loadFinished(self, result):
        # Hand the rendered page to the callback, then chain to the next URL.
        frame = self.mainFrame()
        url = str(frame.url().toString())
        html = frame.toHtml()
        self.cb(url, html)
        self.crawl()
def scrape(url, html):
    """Render callback: append every leaderboard score in *html* to file2.txt.

    Also prints len(html) as a crude progress/diagnostic indicator.
    Fixes vs. original: removed the dead ``pass`` statement, and the output
    file is now opened once per page instead of once per score line.
    """
    soup = BeautifulSoup(html)
    # Open once in append mode; each score is written on its own line.
    with open('file2.txt', 'a', encoding='utf-8') as out:
        for overlay in soup.find_all('div', {'id': 'containerOverlay'}):
            for board in overlay.find_all('div', {'id': 'leaderboard'}):
                for score_list in board.find_all('ul', {'class': 'scores'}):
                    for entry in score_list.find_all('li'):
                        out.write(entry.text + "\n")
    print (len(html))
# Build the two page URLs and drive them through Render with the scrape
# callback; Render blocks until all pages are processed.
baseurl = ("https://games.crossfit.com/leaderboard?competition=1&year=2017&division=2&scaled=0&sort=0&fittest=1&fittest1=0&occupation=0")
urls = ['{0}&page={1}'.format(baseurl, n) for n in range(1, 3)]
r = Render(urls, cb=scrape)
答案 0(得分:0)
我认为你的循环有缩进问题所以当它进入for循环时,里面没有任何东西。
# Quoted from the question, reformatted: the suggestion is that the loop
# body was empty due to missing indentation.
for num in range(1, 3):
    nums = str(num)
    s[129] = nums
    varurl = ''.join(s)
    print(varurl)
试试这个。我认为这会奏效。
答案 1(得分:0)
你应该生成一个URL列表,然后循环遍历它。
# Generate the full URL list first, then iterate over it.
baseurl = "https://games.crossfit.com/leaderboard?competition=1&" \
          "year=2017&division=2&scaled=0&sort=0&fittest=1&fittest1=0&occupation=0"
urls = ['{0}&page={1}'.format(baseurl, p) for p in range(1, 3)]
for url in urls:
    r = Render(url)
    html = r.frame.toHtml()
    # ... etc
如果您不熟悉列表推导式,上面对 urls 的赋值也可以用普通循环手写出来:
# Equivalent of the list comprehension, written as an explicit loop.
urls = []
for page in range(1, 3):
    urls.append('{0}&page={1}'.format(baseurl, page))
然后,当然,将URL列表保留在内存中是不必要的,如果您计划循环遍历多个页面,则可能会出现问题。
# Streaming variant: build each page URL on the fly instead of keeping
# the whole list in memory.
for page in range(1, 3):
    current_url = '{0}&page={1}'.format(baseurl, page)
    r = Render(current_url)
    html = r.frame.toHtml()
    # ... etc