I am learning web-scraping techniques, and this is what I am trying to do:
The problem is that I have tested my code on a single link and it works fine, but when I try it on the multi-level job it fails in a way I cannot understand: it only scrapes parts of each link. I would like to know whether there is a logic error somewhere in my code. Please help. The code follows:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://tieba.baidu.com']
    main_url = 'http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8'
    username = ""
    password = ""

    def __init__(self, username=username, password=password):
        # options = webdriver.ChromeOptions()
        # options.add_argument('headless')
        # options.add_argument('window-size=1200x600')
        self.driver = webdriver.Chrome()  # chrome_options=options)
        self.username = username
        self.password = password
    # checked
    def logIn(self):
        elem = self.driver.find_element_by_css_selector('#com_userbar > ul > li.u_login > div > a')
        elem.click()
        wait = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#TANGRAM__PSP_10__footerULoginBtn')))
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__footerULoginBtn')
        elem.click()
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__userName')
        elem.send_keys(self.username)
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__password')
        elem.send_keys(self.password)
        self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit').click()
    # basic checked
    def parse(self, response):
        self.driver.get(response.url)
        self.logIn()
        # wait for the verification code to be entered by hand
        time.sleep(20)
        self.driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')
        # try the first page first
        for url in self.driver.find_elements_by_css_selector('a.j_th_tit'):
            # new_url = response.urljoin(url)
            new_url = url.get_attribute("href")
            yield scrapy.Request(url=new_url, callback=self.parse_sub)
    # checked
    def pageScroll(self, url):
        self.log('I am scrolling ' + url)
        self.driver.get(url)
        SCROLL_PAUSE_TIME = 0.5
        SCROLL_LENGTH = 1200
        page_height = int(self.driver.execute_script("return document.body.scrollHeight"))
        scrollPosition = 0
        while scrollPosition < page_height:
            scrollPosition = scrollPosition + SCROLL_LENGTH
            self.driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
            time.sleep(SCROLL_PAUSE_TIME)
        time.sleep(1.2)
    def parse_sub(self, response):
        self.log('I visited ' + response.url)
        self.pageScroll(response.url)
        for sel in self.driver.find_elements_by_css_selector('div.l_post.j_l_post.l_post_bright'):
            name = sel.find_element_by_css_selector('.d_name').text
            try:
                content = sel.find_element_by_css_selector('.j_d_post_content').text
            except:
                content = ''
            replys = []
            for i in sel.find_elements_by_xpath('.//div[@class="lzl_cnt"]'):
                user1 = i.find_element_by_xpath('.//a[@username]')
                user1 = self.driver.execute_script("return arguments[0].firstChild.textContent", user1)
                try:
                    user2 = i.find_element_by_xpath('.//span[@class="lzl_content_main"]/a[@username]')
                    user2 = self.driver.execute_script("return arguments[0].firstChild.textContent", user2)
                except:
                    user2 = name
                span = i.find_element_by_xpath('.//span[@class="lzl_content_main"]')
                reply = self.driver.execute_script('return arguments[0].lastChild.textContent;', span)
                # tuple(user1, user2, reply) raises TypeError; build the tuple directly
                replys.append((user1, user2, reply))
            yield {"topic": response.css(".core_title_txt::text").extract(), "name": name, "content": content, "replys": replys}
        # follow to the next page
        # next_sel = self.driver.find_element_by_css_selector('#thread_theme_7 a:nth-child(3)')
        # next_url_name = next_sel.text
        # if next_sel and next_url_name == '下一页':
        #     next_url = next_sel.get_attribute('href')
        #     yield scrapy.Request(url=next_url, callback=self.parse_sub)
Answer 0 (score: 0)
It looks like you are using a hard-coded class rather than a generic link selector, and are therefore only getting back one link in

for url in self.driver.find_elements_by_css_selector('a.j_th_tit')

The class name j_th_tit appears to be dynamically generated, and it may not be the same for every anchor (a) tag.

You could try

for url in self.driver.find_elements_by_css_selector('a')

to get all of the links on the page.
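For reference, here is a minimal sketch of that suggestion, using the same Selenium find_elements_by_* API as the question. The '/p/' filter is my assumption about Tieba's thread-URL layout, not something taken from the question; adjust it to whatever the real thread links look like.

from selenium import webdriver

# Sketch: grab every anchor, then filter by href instead of by class name.
# Assumption: Tieba thread links contain '/p/' in their URL.
driver = webdriver.Chrome()
driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')

thread_urls = []
for a in driver.find_elements_by_css_selector('a'):
    href = a.get_attribute('href')
    if href and '/p/' in href:  # anchors without an href are skipped
        thread_urls.append(href)

print(thread_urls)
driver.quit()

Filtering on the href rather than on a class name keeps the loop working even if the class name is regenerated between page loads.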