我正在开发一个项目,我正在使用scrapy和selenium在python中构建一个屏幕抓取器,然后使用屏幕scraper的输出并使用xlsxwriter将其写入excel文件。然而,似乎我的scrapy物品总是空着。我不确定此时发生了什么,所以任何帮助都表示赞赏。请注意,在发布此处之前,脚本中有一些网址,目录和其他一些敏感信息。所以一些链接和目录可能看起来很奇怪。
excel输出的屏幕截图:
Screen Scraper File Output
蜘蛛:
import os
import time
from datetime import date
from ScreenScraper.items import *
from scrapy import *
from scrapy.http import FormRequest
from scrapy.loader import ItemLoader
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from xlsxwriter import *
class CquentiaSpider(Spider):
name = 'cquentia'
allowed_domains = ['linktotable.com']
start_urls = ['www.linktotable.com/login'] #had to strip directories ,urls, usernames, and passwords from script.
login_user = 'Example'
login_pass = 'Example'
accnt_ID_1 = 'Example'
phantomjspath = r'C:\Users\[User]\Documents\Visual Studio 2015\Projects\ScreenScraper\Scraper\selenium\webdriver\phantomjs\bin\phantomjs.exe'
def __init__(self, name = None, **kwargs):
self.browser = webdriver.PhantomJS(executable_path=self.phantomjspath)
return super(CquentiaSpider, self).__init__(name, **kwargs)
def parse(self, response):
self.browser.get(response.url)
username = self.browser.find_element_by_name('username')
password = self.browser.find_element_by_name('password')
login = self.browser.find_element_by_name('submit')
username.send_keys(self.login_user)
password.send_keys(self.login_pass)
login.click()
time.sleep(1.5)
self.browser.get('www.linktotable.com/search')
accnt_id = self.browser.find_element_by_name('accnId')
search = self.browser.find_element_by_name('accnSrch')
accnt_id.send_keys(self.accnt_ID_1)
search.click()
time.sleep(1.5)
select = Selector(text=self.browser.page_source)
get_table_count = count()
get_table_count['row_count'] = select.xpath('//*[@id="otTable"]/thead/tr/td[1]/text()').extract() [0]
count_final = int(get_table_count['row_count']) + 1
patient = cquentiaPatientItems()
patient['Pt_First_Name'] = select.xpath('//*[@id="ptFNm"]/text()').extract()
patient['Pt_Last_Name'] = select.xpath('//*[@id="ptLNm"]/text()').extract()
patient['Client_ID_Name'] = select.xpath('//*[@id="clnNm"]/text()').extract()
patient['DOS'] = select.xpath('//*[@id="dos"]/text()').extract()
patient_First_Name = str(patient['Pt_First_Name'])
patient_Last_Name = str(patient['Pt_Last_Name'])
patient_Client_ID = str(patient['Client_ID_Name'])
patient_DOS = str(patient['DOS'])
header_layout = ['First Name:', 'Last Name:', 'Client Name:', 'DOS:']
header_data = [patient_First_Name, patient_Last_Name, patient_Client_ID, patient_DOS]
table_header_layout = ['Test ID', 'Name', 'Mod 1', 'Mod 2', 'Mod 3', 'Mod 4', 'Proc Code', 'Name','Units Billed $', 'Billed $', 'Gross $', 'Expect $', 'Price Method', 'Payor ID', 'POS', 'Rendering Phys']
workbook = Workbook('%s_spider.xlsx' % (self.name))
worksheet = workbook.add_worksheet()
row = 0
col = 0
for value in header_layout:
worksheet.write(row, 0, value)
row = 1 + row
row = 0
for value in header_data:
worksheet.write(row, 1, value)
row = 1 + row
row = 6
for value in table_header_layout:
worksheet.write(row, col, value)
col = 1 + col
col = 0
row = 7
for number in range(0, count_final):
table = Table()
table['Test_ID'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[2]/text()' % (number)).extract()
table['Name_1'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[3]/text()' % (number)).extract()
table['Name_2'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[9]/text()' % (number)).extract()
table['Mod_1'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[4]/text()' % (number)).extract()
table['Mod_2'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[5]/text()' % (number)).extract()
table['Mod_3'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[6]/text()' % (number)).extract()
table['Mod_4'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[7]/text()' % (number)).extract()
table['Proc_Code'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[8]/text()' % (number)).extract()
table['Units_Billed'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[10]/text()' % (number)).extract()
table['Billed'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[11]/text()' % (number)).extract()
table['Gross'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[12]/text()' % (number)).extract()
table['Expect'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[13]/text()' % (number)).extract()
table['Price_Method'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[15]/text()' % (number)).extract()
table['Payor_ID'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[16]/text()' % (number)).extract()
table['POS'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[22]/text()' % (number)).extract()
table['Rendering_Phys'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[23]/text()' % (number)).extract()
table_Test_ID = str(table['Test_ID'])
table_Name_1 = str(table['Name_1'])
table_Name_2 = str(table['Name_2'])
table_Mod_1 = str(table['Mod_1'])
table_Mod_2 = str(table['Mod_2'])
table_Mod_3 = str(table['Mod_3'])
table_Mod_4 = str(table['Mod_4'])
table_Proc_Code = str(table['Proc_Code'])
table_Units_Billed = str(table['Units_Billed'])
table_Billed = str(table['Billed'])
table_Gross = str(table['Gross'])
table_Expect = str(table['Expect'])
table_Price_Method = str(table['Price_Method'])
table_Payor_ID = str(table['Payor_ID'])
table_POS = str(table['POS'])
table_Rendering_Phys = str(table['Rendering_Phys'])
table_data = [table_Test_ID, table_Name_1, table_Mod_1, table_Mod_2, table_Mod_3, table_Mod_4, table_Proc_Code, table_Name_2, table_Units_Billed, table_Billed, table_Gross, table_Expect, table_Price_Method, table_Payor_ID, table_POS, table_Rendering_Phys]
for text in table_data:
worksheet.write(row, col, text)
col = 1 + col
row = 1 + row
col = 0
workbook.close()
产品:
from scrapy import *
class count(Item):
row_count = Field()
class Table(Item):
Test_ID = Field(serializer=str)
Name_1 = Field(serializer=str)
Name_2 = Field(serializer=str)
Mod_1 = Field(serializer=str)
Mod_2 = Field(serializer=str)
Mod_3 = Field(serializer=str)
Mod_4 = Field(serializer=str)
Proc_Code = Field(serializer=int)
Units_Billed = Field(serializer=str)
Billed = Field(serializer=str)
Gross = Field(serializer=str)
Expect = Field(serializer=str)
Price_Method = Field(serializer=str)
Payor_ID = Field(serializer=str)
POS = Field(serializer=str)
Rendering_Phys = Field(serializer=str)
class cquentiaPatientItems(Item):
Pt_First_Name = Field(serializer=str)
Pt_Last_Name = Field(serializer=str)
Client_ID_Name = Field(serializer=str)
DOS = Field()