我是新来的,这是我第一次使用scrapy而且我真的需要帮助。我知道以前曾经问过,我确实尝试了很多解决方案,但都没有。
我的管道文件:
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import NotConfigured
from scrapy.exceptions import DropItem
from scrapy.http import Request
from projetpfe.items import ProjetpfeItem
class MySQLStorePipeline(object):
def __init__(self):
try:
self.conn= MySQLdb.connect(user='root', passwd='root123', host='localhost', db='pressebam', use_unicode=True, charset='utf8')
self.cursor = self.conn.cursor()
self.cursor.execute("CREATE TABLE IF NOT EXISTS scrapeddata2( idscrapedData INT NOT NULL AUTO_INCREMENT PRIMARY KEY, nomOrganePresse VARCHAR(200), titreArticle VARCHAR(200), url VARCHAR(200), nomJournaliste VARCHAR(200), jour VARCHAR(100), annee VARCHAR(100), categorie VARCHAR(100), contenuArticle VARCHAR(5000), lienImage VARCHAR(200)) ")
self.conn.commit()
except (AttributeError, MySQLdb.OperationalError), e:
raise e
def process_item(self, item, spider):
try:
self.cursor.execute( "INSERT INTO scrapeddata2 ( nomOrganePresse, titreArticle, url, jour, contenuArticle, lienImage) VALUES (%s, %s, %s,%s,%s, %s)",
(item['OrganePresse'],
item['Titre'],
item['URL'],
item['Jour'],
item['Contenu'],
item['LienImage'] ))
self.conn.commit()
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return item
这是我的蜘蛛文件
import urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from projetpfe.items import ProjetpfeItem
class ProjetpfeSpider(CrawlSpider):
name = 'telquel'
start_urls = ['http://telquel.ma'] # urls from which the spider will start crawling
rules = [Rule(SgmlLinkExtractor(allow=[r'page/\d+']), follow=True),
# r'page/\d+' : regular expression for http://telquelpage/X URLs
Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}/\d{2}/\w+']), callback='parse_telquel')]
# r'\d{4}/\d{2}/\w+' : regular expression for http://telquel.ma/YYYY/MM/title URLs
def parse_telquel(self, response):
hxs = HtmlXPathSelector(response)
item = ProjetpfeItem()
# XPath selector for title
item['Titre'] = hxs.select("//h1[@class='article-title']/text()").extract()
item['LienImage'] = hxs.select("//div[@class='main-article-content']//img[@class='setborder']/@src").extract()
item['OrganePresse'] = hxs.select("//img[@class='logo']/@alt").extract()
item['Jour'] = hxs.select("//div[@class='calendar-date']/text()").extract()
item['Contenu'] = hxs.select("//div[@class='shortcode-content']").extract()
item['URL'] = hxs.select("/html/head/link[5]/@href").extract()
return item
这是设置文件
BOT_NAME = 'projetpfe'
SPIDER_MODULES = ['projetpfe.spiders']
NEWSPIDER_MODULE = 'projetpfe.spiders'
ITEM_PIPELINES = {'projetpfe.pipelines.MySQLStorePipeline' : 300}
最后是我的物品
from scrapy.item import Item, Field
class ProjetpfeItem(Item):
OrganePresse = Field()
Titre = Field()
Journaliste = Field()
Jour = Field()
Annee = Field()
Categorie = Field()
Contenu = Field()
LienImage = Field()
URL = Field()
所以蜘蛛工作正常但是nada存储在数据库中。 HELP !!!