Iterating a search over many search terms using Selenium Python

Time: 2018-09-06 00:11:19

Tags: python selenium

I'm trying to run a search on LexisNexis and scrape the results. I need the results from every page, so I want Selenium to perform the search, scrape the data, then click "Next" and do it again. On top of that, I want it to search multiple terms: for example, search the term "law" and do everything I just described, then search the term "medicaid" and do it all again, and so on.
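Roughly, the flow I have in mind looks like this (just a sketch; run_search, scrape_page, and go_to_next_page are placeholder names for pieces I haven't fully worked out, not real functions):

for term in ['law', 'medicaid']:          #search each term in turn
    run_search(browser, term)             #fill in the search form and submit it
    while True:
        scrape_page(browser)              #scrape the current page of results
        if not go_to_next_page(browser):  #click "Next"; stop on the last page
            break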

Here is my code:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
#from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
#import requests
#import re
import csv
import numpy as np
#import pandas as pd

###############################################################################
#CLICKING AND SEARCH
###############################################################################

browser = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')
browser.implicitly_wait(5)

#Goes to library website and finds database
browser.get('https://sfx.carli.illinois.edu/sfxuiu?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&ctx_enc=info:ofi/enc:UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/sfxit.com:azlist&sfx.ignore_date_threshold=1&rft.object_id=63750000000001351&svc.fulltext=yes')
browser.find_element_by_link_text('LEXIS NEXIS DATABASES').click()

alert = browser.switch_to.alert
alert.accept()

browser.close()
browser.switch_to.window(browser.window_handles[0])

#Login to NexisUni through university library ONLY WHEN NOT ON CAMPUS
browser.find_element_by_id('j_username').send_keys('USERNAME')
browser.find_element_by_id('j_password').send_keys('PASS')
browser.find_element_by_name('_eventId_proceed').click()

#click on advanced search on NexisUni homepage
#EC expects a (By, locator) tuple, not a call on By.XPATH
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[1]/header/div[3]/ul/li[1]/button')))
advancedSearch = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[1]/header/div[3]/ul/li[1]/button')
advancedSearch.click()

#Selecting Specific Content Type
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/button')))
Select_Content = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/button')
Select_Content.click()

#Choose News
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/aside/div[2]/ul[2]/li[2]/button')))
Choose_News = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/aside/div[2]/ul[2]/li[2]/button')
Choose_News.click()

#Type in Search Term
browser.find_element_by_xpath('//*[@id="headline"]').send_keys('Law')

#Type in Publication
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="publication"]')))
Pub = browser.find_element_by_xpath('//*[@id="publication"]')
Pub.send_keys('The Associated Press')

#input date range
select = Select(browser.find_element_by_id('date'))
select.select_by_visible_text('Date is after')
browser.find_element_by_id('dateFrom').send_keys('01/01/1980')

#click on Search
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/footer/span/button[1]')))
Search = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/footer/span/button[1]')
Search.click()

###############################################################################
#SCRAPING
###############################################################################
scd = browser.page_source
soup = BeautifulSoup(scd, "lxml")

HEADLINES = soup.findAll('a', attrs={"data-action":"title"})
headlines=[]
for H in HEADLINES:
    headlines.append(H.text.strip())

DETAILS = soup.findAll('div', attrs={"class":"dataInfo translate"})
details = []
for D in DETAILS:
    details.append(D.text.strip())


#Split the details blob into date and source; these splits depend on the
#page's exact whitespace, so they are fragile if the layout changes
Dates1 = [i.split('\t\t\t\t\t\n\n',2)[1] for i in details]
Dates = [i.split('\n',1)[0] for i in Dates1]

Source1 = [i.split('\t\t\t\t\t\n\n',1)[1] for i in details]
Source = [i.split('\n',1)[1] for i in Source1]


News = zip(headlines,Dates,Source)

result_path = "/Users/danashaat/Desktop/data.csv"
with open(result_path, 'a', newline='') as result_file:
    newswriter = csv.writer(result_file)
    for row in News:
        newswriter.writerow(row)

#Next Page:
while True:
    #find_elements (plural) returns an empty list instead of raising when the
    #"Next" link is gone, so the length check works; an explicit presence wait
    #here would raise TimeoutException on the last page instead
    Next = browser.find_elements_by_xpath('/html/body/main/main/div[2]/div/div[2]/div[2]/form/div[2]/nav/ol/li[7]/a')
    if len(Next) < 1:
        print("No more pages left")
        break
    else:
        Next[0].click()

It's not working the way I want it to! I'm not sure why :/

1 answer:

Answer 0 (score: 0)

The while loop at the end of your code keeps clicking the "Next" button until the "No more pages left" message shows up. It never calls the SCRAPING part of your code again after clicking "Next". So your code will only scrape the first page and save those zipped rows, and then just keep clicking "Next" until it exits.

Perhaps you should make a scraping function and call it inside the while loop, after each click of the "Next" button.
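For example, something along these lines (only a sketch: scrape_page just repackages the BeautifulSoup logic from your question, and the XPath and file path are copied from it, so verify both against the actual page):

def scrape_page(browser, result_path):
    #Parse the current results page and append one CSV row per headline
    soup = BeautifulSoup(browser.page_source, "lxml")
    headlines = [h.text.strip() for h in soup.findAll('a', attrs={"data-action": "title"})]
    details = [d.text.strip() for d in soup.findAll('div', attrs={"class": "dataInfo translate"})]
    dates = [i.split('\t\t\t\t\t\n\n', 2)[1].split('\n', 1)[0] for i in details]
    sources = [i.split('\t\t\t\t\t\n\n', 1)[1].split('\n', 1)[1] for i in details]
    with open(result_path, 'a', newline='') as f:
        csv.writer(f).writerows(zip(headlines, dates, sources))

#Scrape the current page first, then try to move to the next one
while True:
    scrape_page(browser, "/Users/danashaat/Desktop/data.csv")
    Next = browser.find_elements_by_xpath('/html/body/main/main/div[2]/div/div[2]/div[2]/form/div[2]/nav/ol/li[7]/a')
    if len(Next) < 1:
        print("No more pages left")
        break
    Next[0].click()

The same loop can then sit inside an outer for loop over your search terms, re-running the search form once per term.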