我的研究项目需要我从这个网站(http://epub.sipo.gov.cn/)下载专利数据。
我必须输入一个日期(例如 2009.4.30),然后网站会返回专利信息列表。每页只返回 3 个条目,而我想下载所有条目,所以我需要向下滚动。
现在我尝试使用 Selenium 进行滚动:先用 XPath 找到"转到"(页码跳转)区域,然后用下面的代码提交页码请求以转到下一页,但它失败了。我也尝试过 send_keys(Keys.RETURN)。如何从页面获取其余的数据?
#-*- coding: utf-8 -*-
# download data from http://epub.sipo.gov.cn/
# Scrape patent listings from epub.sipo.gov.cn for every day in a date range.
# For each day, the raw HTML of every result page is appended to one file
# named "<year>.<month>.<day>.txt" inside OUTPUT_DIR.
#
# print(x) with a single argument is used throughout because it behaves the
# same under the Python 2 print statement and the Python 3 print function.
OUTPUT_DIR = "/Users/jasondou/Google Drive/data/patent/"
SEARCH_URL = "http://epub.sipo.gov.cn/"
# Container of the pagination controls on the result page.
PAGER_XPATH = "/html/body/div[3]/div[2]/div[4]"

time1 = time.time()
driver = webdriver.Firefox()
a = date(2009, 5, 30)
b = date(2009, 6, 9)
print("hello")
try:
    for dt in rrule(DAILY, dtstart=a, until=b):
        print(dt.year)
        print(dt.month)
        print(dt.day)
        # Use a distinct name so the imported `date` class is not clobbered.
        date_str = "%d.%d.%d" % (dt.year, dt.month, dt.day)
        filename = date_str + ".txt"
        # `with` guarantees the file is closed even if a Selenium call raises.
        with open(OUTPUT_DIR + filename, "wb") as f:
            # Reload the search page for every date so the box starts empty.
            driver.get(SEARCH_URL)
            elem = driver.find_element_by_id("soso_text")
            elem.send_keys(date_str)
            elem.send_keys(Keys.RETURN)
            # NOTE(review): page_source is read immediately after submitting;
            # nothing here waits for the result page to load - confirm that
            # the captured HTML is the result page and not the search page.
            # Write the encoded page in one call instead of byte by byte.
            f.write(driver.page_source.encode('utf-8'))
            print("hello here")
            # The text of the pager's 7th link is parsed as the total number
            # of result pages for this date.
            nextpage = driver.find_element_by_xpath(PAGER_XPATH + "/a[7]")
            print("hello 0")
            page_count_text = nextpage.get_attribute("innerHTML")
            print(page_count_text)
            totalnum = int(page_count_text)
            for i in range(2, totalnum + 1):
                print("i: ")
                print(i)
                # Look the element up again on every iteration and actually
                # call find_element_by_xpath (the call used to be split over
                # two lines, leaving `turnto` bound to the method itself).
                # NOTE(review): this XPath selects a <span>; send_keys
                # normally needs a focusable input - presumably the page
                # number box is a child of this span, verify against the
                # page markup.
                turnto = driver.find_element_by_xpath(PAGER_XPATH + "/span")
                turnto.send_keys(str(i))
                turnto.send_keys(Keys.ENTER)
                f.write(driver.page_source.encode('utf-8'))
                print("hello2")
finally:
    # Always shut the browser down so no Firefox process is left behind.
    driver.quit()
time2 = time.time()
print(time2 - time1)