我需要刮擦Soccerway.com,当我在比赛的每个链接中选择日期(例如2011-2013)时遇到问题,但问题仅保存了最后一个日期2012-2013,而不是2011-2012和2012-2013但只有最后一个日期。
from time import sleep
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
def get_urls_season(url_path):
driver = webdriver.Chrome()
driver.fullscreen_window()
driver.get("https://us.soccerway.com" + url_path)
click_privacy_policy(driver)
date = date_selector(driver)
#url_list = cycle_through_game_weeks(driver)
url_list.reverse()
driver.quit()
print("=" * 100)
print(f"{len(set(url_list))} find")
if input("con? (y/n): ") != "y":
exit()
return url_list
def date_selector(driver):
inptdate='2010-2012'
startdate=inptdate.split('-')[0]
enddate=inptdate.split('-')[1]
while int(startdate)< int(enddate):
textstring=str(startdate) + "/" + str(int(startdate)+1)
print(textstring)
driver.find_element_by_xpath("//select[@name='season_id']/option[text()='" + textstring +"']").click()
startdate=int(startdate)+1
url_list = cycle_through_game_weeks(driver)
def click_privacy_policy(driver):
try:
driver.find_element_by_class_name("qc-cmp-button").click()
except NoSuchElementException:
pass
def cycle_through_game_weeks(driver):
season_urls = get_fixture_urls(innerhtml_soup(driver))
while is_previous_button_enabled(driver):
click_previous_button(driver)
sleep(2)
urls = get_fixture_urls(innerhtml_soup(driver))
urls.reverse()
season_urls += urls
return season_urls
def is_previous_button_enabled(driver):
return driver.find_element_by_id(
"page_competition_1_block_competition_matches_summary_5_previous"
).get_attribute("class") != "previous disabled"
def click_previous_button(driver):
driver.find_element_by_id(
"page_competition_1_block_competition_matches_summary_5_previous"
).click()
def get_fixture_urls(soup):
urls = []
for elem in soup.select(".info-button.button > a"):
urls.append(urlparse(elem.get("href")).path)
return urls
def innerhtml_soup(driver):
html = driver.find_element_by_tag_name("html").get_attribute("innerHTML")
soup = BeautifulSoup(html, "html.parser")
return soup
我需要抓取所有日期2011-2013 2011-2012 2012-2013不只是最后日期 而且我找不到问题的原因。
答案 0 :(得分:0)
如果我正确理解了代码,问题就出在这里:
Command: kudusync -v 50 -f /home/site/repository -t /home/site/wwwroot -n /home/site/deployments/e96456e2542d0f1229a95843063e79969b9e050e/manifest -p /opt/Kudu/Scripts/firstDeploymentManifest -i ".git;.hg;.deployment;.deploy.sh"
Error: ENOENT: no such file or directory, stat '/home/site/repository/env/include/python3.5m'
\n/opt/Kudu/Scripts/starter.sh kudusync -v 50 -f /home/site/repository -t /home/site/wwwroot -n /home/site/deployments/e96456e2542d0f1229a95843063e79969b9e050e/manifest -p /opt/Kudu/Scripts/firstDeploymentManifest -i ".git;.hg;.deployment;.deploy.sh"
在每次迭代中,您都用新的url_list覆盖新的url_list,最简单的解决方案是:
url_list = cycle_through_game_weeks(driver)
更加优雅和有效:
url_list += cycle_through_game_weeks(driver)
通过这种方式,您可以在url_list [0]下获取第一年的值,在url_list [1]下获取第二年的值,依此类推