I am learning how to use Scrapy, and I have run into a problem: my spider doesn't scrape any information from the website I chose. Here is my spider's code:
from scrapy.spider import Spider
from scrapy.selector import Selector
from reddit.items import RedditItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class RedditSpider(Spider):
    name = "reddit"
    allowed_domains = ['http://www.reddit.com']
    start_urls = ["http://www.reddit.com/r/funny/comments/3arta6/awkward_moment_seal/"]

    rules = (
        Rule(SgmlLinkExtractor(allow=r'Items'), callback='parse_item', follow=True),
    )

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//*[@id="siteTable_t3_3arta6"]')
        items = []
        for site in sites:
            item = RedditItem()
            item['author'] = site.xpath('a/text()').extract()
            item['score_unvoted'] = site.xpath('//span[contains(@class, "score_unvoted")]/text()').extract()
            item['usertext'] = site.xpath('//*[@id="form-t1_csfkjb86q9"]/text()').extract()
            yield item
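From the docs, my understanding is that the XPath expressions can be checked interactively in scrapy shell, roughly like this (the siteTable_t3_3arta6 id is just what I copied from the browser inspector, so it may be the wrong thing to target):

scrapy shell "http://www.reddit.com/r/funny/comments/3arta6/awkward_moment_seal/"

>>> from scrapy.selector import Selector
>>> sel = Selector(response)
>>> # does the container the spider loops over match anything at all?
>>> sel.xpath('//*[@id="siteTable_t3_3arta6"]')
>>> # does the relative author path find any text inside it?
>>> sel.xpath('//*[@id="siteTable_t3_3arta6"]/a/text()').extract()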
Here is my items.py:
from scrapy.item import Item, Field
import sys

if "C:\\Python27" not in sys.path:
    sys.path.append("C:\\Python27")


class RedditItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = Field()
    score_unvoted = Field()
    usertext = Field()
This is what happens when I run scrapy crawl reddit in the terminal: the spider starts and finishes without scraping anything.
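For what it's worth, my end goal is to save the scraped items with the built-in feed export, with something like this (items.json is just a file name I picked):

scrapy crawl reddit -o items.json -t json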
Any help would be great.