好的,这是我的代码:
import calendar
import math
import time
import urllib
import urllib.parse

import progressbar
import requests
from lxml import etree
from lxml import html
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
使用selenium
# Filesystem location of the geckodriver executable handed to Selenium.
path_to_driver = '/home/vladislav/Shit/geckodriver'
# Start a Firefox session driven through that executable.
browser = webdriver.Firefox(
    executable_path=path_to_driver,
)
创建一个 dict 用于存储数据,并创建进度条:
# Dictionary that accumulates the collected data.
DataDict = dict()
# One progress bar each for the year, month and week loops; the total is
# declared as unknown, so the bars only show the latest value passed to them.
barY, barM, barW = (
    progressbar.ProgressBar(max_value=progressbar.UnknownLength)
    for _ in range(3)
)
在循环中生成参数,用这些参数构建网址,并通过 browser.get 发送请求:
def _week_ranges(year, month):
    """Return the four (start_day, end_day) windows that cover a month.

    The first three windows are fixed seven-day spans (1-7, 8-14, 15-21);
    the fourth runs from the 22nd to the last day of the month.
    """
    last_day = calendar.monthrange(year, month)[1]
    return [(1, 7), (8, 14), (15, 21), (22, last_day)]


# Base address of the news page; it carries no query string of its own, so
# the date window is always appended after a '?'.
_NEWS_URL = 'http://www.finam.ru/profile/moex-akcii/aeroflot/news'

for year in range(2014, 2016):
    barY.update(year)
    for month in range(1, 13):
        barM.update(month)
        # Exactly four windows per month: any further iteration would only
        # re-request the 22nd-to-end window and add nothing new.
        for week, (start_day, end_day) in enumerate(_week_ranges(year, month)):
            barW.update(week)
            start_date = '{}-{:02d}-{:02d}'.format(year, month, start_day)
            end_date = '{}-{:02d}-{:02d}'.format(year, month, end_day)
            params = {'end-date': end_date, 'start-date': start_date}
            url = _NEWS_URL + '?' + urllib.parse.urlencode(params)
            browser.get(url)
            try:
                news_list = browser.find_element_by_class_name('news-list')
                news_list_text = news_list.text.split('\n')
            except NoSuchElementException:
                # The page has no 'news-list' element for this window.
                continue
            except WebDriverException as exc:
                # Keep the scrape best-effort, but say what went wrong.
                print("Skipping {}: {}".format(url, exc))
                continue
            # Lines are consumed in pairs: the even-indexed line is the key
            # and the line after it is the value; a trailing odd line is
            # dropped.
            pairs = dict(zip(news_list_text[::2], news_list_text[1::2]))
            if pairs:
                DataDict.update(pairs)
                print("Found! Adding news to the dictionary!")
答案 0 :(得分:0)
好的,问题出在一个广告横幅上,它会在发出几次请求之后出现。解决方案就是等待(
try:
# Body of the outer `try:` that opens just above this line: load the page
# for the current URL.
browser.get(url)
try:
# Take the text of the element with class 'news-list' and split it into lines.
news_list = browser.find_element_by_class_name('news-list')
news_list_text = news_list.text
news_list_text = news_list_text.split('\n')
# Store consecutive line pairs in DataDict: the even-indexed line is the key,
# the line after it is the value.
for i in range(int(len(news_list_text)/2)):
DataDict.update({news_list_text[2*i]:news_list_text[2*i+1]})
#print("Found! Adding news to the dictionary!")
except:
# Any error raised while reading the list is ignored.
pass
# NOTE(review): indentation was lost in this snippet, so it is unclear which
# block this 10-second pause belongs to — confirm against the original answer.
time.sleep(10)
except:
# Presumably the handler of the outer try (e.g. a failure in browser.get).
print("perchaps this shitty AD?")
# Second attempt at reading the news list; note that the page is not
# requested again here, so it reads whatever the browser currently shows.
try:
news_list = browser.find_element_by_class_name('news-list')
news_list_text = news_list.text
news_list_text = news_list_text.split('\n')
for i in range(int(len(news_list_text)/2)):
DataDict.update({news_list_text[2*i]:news_list_text[2*i+1]})
#print("Found! Adding news to the dictionary!")
except:
# Errors on the second attempt are ignored as well.
pass
),直到横幅消失,再次发送请求!:
event_id = a.attr("href").match(/\?code=(\d+)/)[1]