我正在尝试从我的大学网站中提取一些项目的数据。这是我的代码。但是项目的字段不包含任何数据。
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
import scrapy
from vasavi.items import VasaviItem
class MySpider(InitSpider):
name = 'myspider'
allowed_domains = ['domainsite']
login_page = 'domainsite/index.aspx'
start_urls = ['domainsite/My_Info.aspx']
def init_request(self):
return Request(url=self.login_page, callback=self.login)
def login(self, response):
"""Generate a login request."""
return FormRequest.from_response(response,
formdata={'txtLoginID': 'srichakra', 'txtPWD': '12345'},
callback=self.check_login_response)
def check_login_response(self, response):
if "SRI CHAKRA GOUD" in response.body:
self.log("Successfully logged in. Let's start crawling!")
# Now the crawling can begin..
return self.initialized()
def parse(self, response):
print "Parsing"
item = VasaviItem()
ur = response.url
print ur
item['rollno'] = response.xpath('//*[@id="divStudInfo"]/table/tbody/tr[2]/td[1]/text()').extract()
item['name'] = response.css('#divStudInfo > table > tbody > tr:nth-child(3) > td:nth-child(2)::text').extract()
item['Marks'] = response.xpath('//*[@id="divStudySummary"]/table/tbody/tr[3]/td[9]/a/text()').extract()
yield item
我不允许在这里发布超过2个网址,所以我用域名替换了所有http://www.domain.com
输出:
2015-01-03 18:45:06+0530 [myspider] INFO: Spider opened
2015-01-03 18:45:06+0530 [myspider] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-01-03 18:45:06+0530 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2015-01-03 18:45:06+0530 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
2015-01-03 18:45:07+0530 [myspider] DEBUG: Crawled (200) <GET domainsite> (referer: None)
2015-01-03 18:45:09+0530 [myspider] DEBUG: Redirecting (302) to <GET domainsite/My_Info.aspx> from <POST domainsite/index.aspx>
2015-01-03 18:45:15+0530 [myspider] DEBUG: Crawled (200) <GET domainsite/My_Info.aspx> (referer: domainsite/index.aspx)
2015-01-03 18:45:15+0530 [myspider] DEBUG: Successfully logged in. Let's start crawling!
2015-01-03 18:45:21+0530 [myspider] DEBUG: Crawled (200) <GET domainsite/My_Info.aspx>(referer: domainsite/My_Info.aspx)
Parsing
domainsite/My_Info.aspx
2015-01-03 18:45:21+0530 [myspider] DEBUG: Scraped from <200 domainsite/My_Info.aspx>
{'rollno': [], 'Marks': [], 'name': []}
2015-01-03 18:45:21+0530 [myspider] INFO: Closing spider (finished)
2015-01-03 18:45:21+0530 [myspider] INFO: Stored json feed (1 items) in: vce.json
2015-01-03 18:45:21+0530 [myspider] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1370,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 3,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 92491,
'downloader/response_count': 4,
'downloader/response_status_count/200': 3,
'downloader/response_status_count/302': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 1, 3, 13, 15, 21, 528000),
'item_scraped_count': 1,
'log_count/DEBUG': 8,
'log_count/INFO': 8,
'request_depth_max': 2,
'response_received_count': 3,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'start_time': datetime.datetime(2015, 1, 3, 13, 15, 6, 518000)}
2015-01-03 18:45:21+0530 [myspider] INFO: Spider closed (finished)
答案 0 :(得分:1)
正如其他评论者所说,你真的需要显示HTML输入。如果我不得不猜测,我说tbody
并不真正出现在页面上 - 请参阅例如this question或this question。 tbody
出现在您显示的两个路径表达式中以及CSS中。
要测试此假设,请跳过表达式中的tbody
元素:
item['rollno'] = response.xpath('//*[@id="divStudInfo"]/table//tr[2]/td[1]/text()').extract()