I've built a scraper that works well at processing the data, but for some reason it only scrapes the first page of the search I need.
I have two functions: one finds the elements I want on the page, and the other looks for the NEXT link and clicks it if it exists; otherwise, the scraper just prints that page and moves on. Here's what I'm using:
from __future__ import print_function
import fileinput
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import sys
reload(sys)
sys.setdefaultencoding('utf8')
letters = ["x"]
for letter in letters:
    try:
        driver = webdriver.Chrome()
        #driver.set_window_size(1120, 550)
        driver.get("http://sam.gov")
        driver.find_element_by_css_selector("a.button[title='Search Records']").click()
    except:
        print("Failed for " + letter)
        pass
    else:
        driver.find_element_by_id('q').send_keys(letter)
        driver.find_element_by_id('RegSearchButton').click()

        def findRecords():
            bsObj = BeautifulSoup(driver.page_source, "html.parser")
            tableList = bsObj.find_all("table", {"class":"width100 menu_header_top_emr"})
            tdList = bsObj.find_all("td", {"class":"menu_header width100"})
            for table, td in zip(tableList, tdList):
                a = table.find_all("span", {"class":"results_body_text"})
                b = td.find_all("span", {"class":"results_body_text"})
                hf = open("sam.csv", 'a')
                hf.write(', '.join(tag.get_text().strip() for tag in a + b) + '\n')

        def crawl():
            if driver.find_element_by_id('anch_16'):
                print("Found next button")
                findRecords()
                driver.find_element_by_id('anch_16').click()
                print("Going to next page")
            else:
                print("Scraping last page for " + letter)
                findRecords()
                print("Done scraping letter " + letter + "\nNow cleaning results file...")
                seen = set()  # set for fast O(1) amortized lookup
                for line in fileinput.FileInput('sam.csv', inplace=1):
                    if line in seen: continue  # skip duplicate
                    seen.add(line)
                    print(line)
                print("Scraping and cleaning done for " + letter)

        crawl()
        driver.quit()
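
Note that find_element_by_id raises NoSuchElementException when no element matches, rather than returning a falsy value, so the if test in crawl() can never take the else branch; crawl() is also only invoked once, so nothing ever advances past the first page. Below is a minimal sketch of how crawl() could loop over the pages instead, assuming the NEXT link keeps the id anch_16 on every results page (I haven't verified that against sam.gov):

    from selenium.common.exceptions import NoSuchElementException

    def crawl():
        # Scrape every results page, stopping when no NEXT link is found.
        while True:
            findRecords()
            try:
                # Raises NoSuchElementException if the link is absent,
                # so the presence check must be a try/except, not an if.
                next_link = driver.find_element_by_id('anch_16')  # assumed stable id
            except NoSuchElementException:
                print("Scraped last page for " + letter)
                break
            print("Going to next page")
            next_link.click()

With this shape, the duplicate-cleaning pass over sam.csv would run once after the loop; hf should also be closed (or opened with a with block) so buffered rows reach the file before the fileinput pass reads it.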