I want to scrape a predefined set of links. I want to scrape hiking tours from https://www.outdooractive.com/de/ for a specific area, so I defined that area with 20 links. So far, so good. I can fetch the data for a single link, but when I try to loop through the list of pages it only processes one link. I hope it is just a gap in my logic. I would be very happy if someone could help me.
Here is my code (shortened to three of the links, not all of them):
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
import codecs
webliste = []
webliste.append('https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.54301,48.94731')
webliste.append("https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.5443,48.88763")
webliste.append("https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.4589,48.93163")
for Page in webliste:
    class Page(QWebEnginePage):
        def __init__(self, url):
            self.app = QApplication(sys.argv)
            QWebEnginePage.__init__(self)
            self.html = ''
            self.loadFinished.connect(self._on_load_finished)
            self.load(QUrl(url))
            self.app.exec_()

        def _on_load_finished(self):
            self.html = self.toHtml(self.Callable)
            print('Load finished')

        def Callable(self, html_str):
            self.html = html_str
            self.app.quit()

    page = Page(webliste[0+1])
filename = "WandertourenLinks.csv"
f = codecs.open(filename, "w","utf-8")
headers ="Tour Name" + ";" + "Länge" + ";" + "Zeit" + ";" + "Aufstieg" + ";" + "Abstieg" + ";" + "Link zur Tour"+ ";" + "Anbieter\n"
f.write(headers)
def main():
    soup = bs.BeautifulSoup(page.html, 'html.parser')
    containers = soup.findAll("div", {"class": "oax_dp_snippet"})
    print("Anzahl der gefundenen touren", len(containers))

    # loop over every tour snippet on the page
    for container in containers:
        tour_container = container.findAll("span", {"dir": "auto"})
        cont = tour_container[0].text
        print("Name der Tour: ", cont)
        tour_name = cont

        tour_data = container.findAll("div", {"class": "oax_tour_data oax_fl"})
        leange = tour_data[0].text.strip()
        zeit = tour_data[1].text.strip()
        aufstieg = tour_data[2].text.strip()
        abstieg = tour_data[3].text.strip()
        print("Länge der Tour: ", leange)
        print("Zeit der Tour: ", zeit)
        print("Aufstieg der Tour: ", aufstieg)
        print("Abstieg der Tour: ", abstieg)

        link = container.a["href"]
        print("Link zur Tour: ", link)

        tour_anbieter = container.findAll("div", {"class": "oax_var_pos oax_var_pos_bottom oax_font_smaller oax_line_height_14 oax_ellipsis"})
        anbieter = tour_anbieter[0].text.strip()
        print("Tourenanbieter: ", anbieter)

        f.write(tour_name + ";" + leange + ";" + zeit + ";" + aufstieg + ";" + abstieg + ";" + link + ";" + anbieter + "\n")

    f.close()

if __name__ == '__main__':
    main()
Answer 0 (score: 0)
@Steve Haigh thanks, the second hint you gave me was the best one. Now everything works. I know it is not very elegant, but it works ;) (A slightly more general variant is sketched after the listing below.)
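For readers comparing the two listings: the original version only ever fetched one page, because the `class Page(QWebEnginePage)` statement inside `for Page in webliste:` rebinds the loop variable on every pass, and the page object is only ever built from the fixed index `webliste[0+1]`. A tiny, self-contained illustration of that rebinding (toy names, unrelated to the scraper):

urls = ["a", "b", "c"]
for url in urls:
    class url:          # the class statement immediately rebinds the loop name 'url'
        pass
    print(url)          # prints <class '__main__.url'> every time, never "a", "b" or "c"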
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
import codecs
class Page(QWebEnginePage):
    """Loads a URL in a QtWebEngine page and stores the rendered HTML in self.html."""

    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()          # block until Callable() quits the event loop

    def _on_load_finished(self):
        self.html = self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        self.html = html_str
        self.app.quit()
webliste = []
webliste.append('https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.32663,49.07201')
webliste.append("https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.30002,49.0945")
webliste.append("https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.22097,49.11664")
filename = "WandertourenLinks.csv"
f = codecs.open(filename, "w","utf-8")
headers ="Tour Name" + ";" + "Länge" + ";" + "Zeit" + ";" + "Aufstieg" + ";" + "Abstieg" + ";" + "Link zur Tour"+ ";" + "Anbieter\n"
f.write(headers)
def main():
    for i in range(3):                       # one pass per URL in webliste
        page = Page(webliste[i])
        soup = bs.BeautifulSoup(page.html, 'html.parser')
        containers = soup.findAll("div", {"class": "oax_dp_snippet"})
        print("Anzahl der gefundenen touren", len(containers))

        for container in containers:
            tour_container = container.findAll("span", {"dir": "auto"})
            cont = tour_container[0].text
            print("Name der Tour: ", cont)
            tour_name = cont

            tour_data = container.findAll("div", {"class": "oax_tour_data oax_fl"})
            leange = tour_data[0].text.strip()
            zeit = tour_data[1].text.strip()
            aufstieg = tour_data[2].text.strip()
            abstieg = tour_data[3].text.strip()
            print("Länge der Tour: ", leange)
            print("Zeit der Tour: ", zeit)
            print("Aufstieg der Tour: ", aufstieg)
            print("Abstieg der Tour: ", abstieg)

            link = container.a["href"]
            print("Link zur Tour: ", link)

            tour_anbieter = container.findAll("div", {"class": "oax_var_pos oax_var_pos_bottom oax_font_smaller oax_line_height_14 oax_ellipsis"})
            anbieter = tour_anbieter[0].text.strip()
            print("Tourenanbieter: ", anbieter)

            f.write(tour_name + ";" + leange + ";" + zeit + ";" + aufstieg + ";" + abstieg + ";" + link + ";" + anbieter + "\n")

    f.close()

if __name__ == '__main__':
    main()
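The working version above hard-codes range(3) and assembles each CSV row by joining strings with ";". As a follow-up on that design choice, here is a minimal sketch (not the answerer's code) of a main() that loops over every entry of webliste, so all twenty area links can simply be appended to the list, and that lets Python's csv module handle the delimiter and quoting. It assumes the Page class and the webliste list from the listing above are defined in the same module.

import csv
import bs4 as bs

def main():
    # newline="" avoids blank lines on Windows; the csv writer quotes any field
    # that happens to contain the ";" delimiter.
    with open("WandertourenLinks.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerow(["Tour Name", "Länge", "Zeit", "Aufstieg", "Abstieg",
                         "Link zur Tour", "Anbieter"])
        for url in webliste:                          # every area link, not just the first three
            page = Page(url)                          # render the JavaScript-driven page
            soup = bs.BeautifulSoup(page.html, "html.parser")
            for container in soup.find_all("div", {"class": "oax_dp_snippet"}):
                name = container.find("span", {"dir": "auto"}).text
                data = [d.text.strip() for d in container.find_all("div", {"class": "oax_tour_data oax_fl"})]
                link = container.a["href"]
                anbieter = container.find("div", {"class": "oax_var_pos oax_var_pos_bottom oax_font_smaller oax_line_height_14 oax_ellipsis"}).text.strip()
                writer.writerow([name] + data[:4] + [link, anbieter])

if __name__ == "__main__":
    main()

This keeps the output format the same (semicolon-separated) while making the number of links and the CSV escaping independent of the scraping loop; the selectors are unchanged, so they may still need updating if outdooractive.com changes its markup.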