I'm trying to run a search on LexisNexis and scrape the results. I need results from every page, so I want Selenium to run the search, scrape the data, click "Next", and do it again. I also want it to search for multiple terms. For example, I want it to search for the term "law", do everything I just described, then search for the term "medicaid", do the same thing, and so on.
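Roughly, this is the flow I'm after (pseudocode only; run_search, scrape_current_page, no_next_button, and click_next are just placeholders):
for term in ['law', 'medicaid', ...]:
    run_search(term)
    while True:
        scrape_current_page()
        if no_next_button:
            break
        click_next()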
Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
#from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
#import requests
#import re
import csv
import numpy as np
#import pandas as pd
###############################################################################
#CLICKING AND SEARCH
###############################################################################
browser = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')
browser.implicitly_wait(5)
#Goes to library website and finds database
browser.get('https://sfx.carli.illinois.edu/sfxuiu?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&ctx_enc=info:ofi/enc:UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/sfxit.com:azlist&sfx.ignore_date_threshold=1&rft.object_id=63750000000001351&svc.fulltext=yes')
browser.find_element_by_link_text('LEXIS NEXIS DATABASES').click()
alert = browser.switch_to.alert
alert.accept()
browser.close()
browser.switch_to.window(browser.window_handles[0])
#Login to NexisUni through university library ONLY WHEN NOT ON CAMPUS
browser.find_element_by_id('j_username').send_keys('USERNAME')
browser.find_element_by_id('j_password').send_keys('PASS')
browser.find_element_by_name('_eventId_proceed').click()
#click on advanced search on NexisUni homepage
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[1]/header/div[3]/ul/li[1]/button')))
advancedSearch = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[1]/header/div[3]/ul/li[1]/button')
advancedSearch.click()
#Selecting Specific Content Type
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/button')))
Select_Content = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/button')
Select_Content.click()
#Choose News
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/aside/div[2]/ul[2]/li[2]/button')))
Choose_News = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/aside/div[2]/ul[2]/li[2]/button')
Choose_News.click()
#Type in Search Term
browser.find_element_by_xpath('//*[@id="headline"]').send_keys('Law')
#Type in Publication
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="publication"]')))
Pub = browser.find_element_by_xpath('//*[@id="publication"]')
Pub.send_keys('The Associated Press')
#input date range
select = Select(browser.find_element_by_id('date'))
select.select_by_visible_text('Date is after')
browser.find_element_by_id('dateFrom').send_keys('01/01/1980')
#click on Search
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/footer/span/button[1]')))
Search = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/footer/span/button[1]')
Search.click()
###############################################################################
#SCRAPING
###############################################################################
scd = browser.page_source
soup = BeautifulSoup(scd, "lxml")
HEADLINES = soup.findAll('a', attrs={"data-action":"title"})
headlines=[]
for H in HEADLINES:
    headlines.append(H.text.strip())
DETAILS = soup.findAll('div', attrs={"class":"dataInfo translate"})
details = []
for D in DETAILS:
    details.append(D.text.strip())
#Split each details blob on the page's tab/newline layout to pull out the date and the source name
Dates1 = [i.split('\t\t\t\t\t\n\n',2)[1] for i in details]
Dates = [i.split('\n',1)[0] for i in Dates1]
Source1 = [i.split('\t\t\t\t\t\n\n',1)[1] for i in details]
Source = [i.split('\n',1)[1] for i in Source1]
News = zip(headlines,Dates,Source)
result_path = "/Users/danashaat/Desktop/data.csv"
with open(result_path, 'a', newline='') as f:
    newswriter = csv.writer(f)
    for row in News:
        newswriter.writerow(row)
#Next Page:
while True:
    #find_elements (plural) returns a list, so len() works and an empty list means there is no "Next" link
    Next = browser.find_elements_by_xpath('/html/body/main/main/div[2]/div/div[2]/div[2]/form/div[2]/nav/ol/li[7]/a')
    if len(Next) < 1:
        print("No more pages left")
        break
    else:
        Next[0].click()
It's not working the way I want it to, and I'm not sure why :/
Answer 0 (score: 0)
The while loop at the end of your code just keeps clicking the "Next" button until the "No more pages left" message appears. After clicking "Next", it never calls the "SCRAPING" part of your code again. So your code scrapes only the first page, writes that one zip of results to the CSV, and then keeps clicking "Next" until it exits.
Perhaps you should factor the scraping into a function and call it inside the while loop, after each click of the "Next" button.
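Something along these lines (a minimal, untested sketch: scrape_page and scrape_all_pages are names I made up, while the XPaths and parsing are lifted straight from your code):
def scrape_page(browser):
    #Parse the current results page into (headline, date, source) rows
    soup = BeautifulSoup(browser.page_source, "lxml")
    headlines = [h.text.strip() for h in soup.findAll('a', attrs={"data-action": "title"})]
    details = [d.text.strip() for d in soup.findAll('div', attrs={"class": "dataInfo translate"})]
    dates = [i.split('\t\t\t\t\t\n\n', 2)[1].split('\n', 1)[0] for i in details]
    sources = [i.split('\t\t\t\t\t\n\n', 1)[1].split('\n', 1)[1] for i in details]
    return zip(headlines, dates, sources)

def scrape_all_pages(browser, csv_path):
    next_xpath = '/html/body/main/main/div[2]/div/div[2]/div[2]/form/div[2]/nav/ol/li[7]/a'
    with open(csv_path, 'a', newline='') as f:
        writer = csv.writer(f)
        while True:
            #Scrape BEFORE looking for the "Next" link, so the last page is not skipped
            for row in scrape_page(browser):
                writer.writerow(row)
            next_links = browser.find_elements_by_xpath(next_xpath)
            if not next_links:
                print("No more pages left")
                break
            next_links[0].click()
            #Crude wait for the new page to load; EC.staleness_of on an old element would be more robust
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'a[data-action="title"]')))
With that in place, the outer loop over your search terms just reruns the advanced-search steps with the next term ('Law', then 'Medicaid', etc.) and calls scrape_all_pages(browser, "/Users/danashaat/Desktop/data.csv") after each search.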