我正在使用scrapy来抓取网站数据,但我想将其存储在数据库中。
我目前的代码是这样的:
def start_requests(self):
    """Kick off the crawl: emit one initial Request per configured start URL.

    Bug fixes vs. the original:
    - the original passed the whole ``self.start_urls`` *list* as the
      request URL, but ``scrapy.Request`` expects a single URL string;
    - a comma was missing after ``callback=self.parse`` (syntax error).
    """
    for url in self.start_urls:
        yield scrapy.Request(url, callback=self.parse,
                             meta={"use_splash": False})
def parse(self, response):
    """Build one job item per ``<li>`` listing, then follow each job link
    so ``description`` can fill in the detail-page fields.

    Note: ``.extract()`` returns a *list* of strings for every field;
    downstream code (and the DB layer) must expect lists, not scalars.
    """
    for sel in response.xpath('//li'):
        item = ProjectjomkerjaItem()
        item['title'] = sel.xpath('a/div[@class="position"]/div[@id="job-title-job-listing"]/strong/text()').extract()
        item['company'] = sel.xpath('a/div[@class="position"]/div[@class="company"]/strong/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()
        item['job_type'] = sel.xpath('a/ul[@class="meta"]/li/text()').extract()
        for link in item['link']:
            # urljoin makes relative hrefs absolute; scrapy.Request
            # raises ValueError when given a relative URL.
            yield scrapy.Request(response.urljoin(link),
                                 meta={'item': item},
                                 callback=self.description)
def description(self, response):
    """Complete the item started in ``parse`` with detail-page fields and
    yield the finished item.

    Fix: ``HtmlXPathSelector`` is deprecated (removed in modern Scrapy);
    the response object itself exposes ``.xpath()`` with identical
    semantics, so the extra selector wrapper is dropped.
    """
    item = response.meta['item']
    item['location'] = response.xpath('//h2[@class="page-subtitle"]/span[@class="job-location"]/text()').extract()
    item['salary'] = response.xpath('//h2[@class="page-subtitle"]/span[@class="company-social-title"]/text()').extract()
    item['job_description'] = response.xpath('//div[@class="job-desc"]/div[@class="show-more-inner"]/span[@class="no_translate"]/ul/li/text()').extract()
    item['more_job_description'] = response.xpath('//div[@class="job-desc"]/div[@class="show-more-inner"]/span[@class="no_translate"]/p/text()').extract()
    item['others'] = response.xpath('//div/span[@class="no_translate"]/text()').extract()
    item['about_company'] = response.xpath('//div/span[@class="no_translate"]/span[@id="job_detail_"]/text()').extract()
    yield item
“about_company”和“company”等一些项目属于table2,其余项目属于另一个table1。 Table2将有一个job_id,它是table1的唯一ID。
我如何实现这一目标?
（PS - 我正在使用带有SQLAlchemy的PostgreSQL。）
修改 这是我在pipelines.py中做的另一个尝试:
from sqlalchemy.orm import sessionmaker
from models import Jobs, db_connect, create_jobs_table
from models import CompanyDetails, db_connect, create_company_details_table
from projectjomkerja.items import ProjectjomkerjaMainItem
from projectjomkerja.items import ProjectjomkerjaSecondaryItem
class ProjectjomkerjaPipeline(object):
    """Persist scraped items into PostgreSQL via SQLAlchemy.

    Main items go to the jobs table, secondary items to the
    company-details table; ``process_item2`` stores a linked pair where
    the child row carries the parent's generated primary key.
    """

    def __init__(self):
        """Initialize the database connection and sessionmaker, and
        create both tables if they do not yet exist."""
        engine = db_connect()
        create_jobs_table(engine)
        create_company_details_table(engine)
        self.Session = sessionmaker(bind=engine)

    def _persist(self, obj):
        # Shared add/commit/rollback/close boilerplate for a single row;
        # removes the duplication the original had in both branches.
        session = self.Session()
        try:
            session.add(obj)
            session.commit()
        except:  # re-raised below; bare except so rollback always runs
            session.rollback()
            raise
        finally:
            session.close()

    def process_item(self, item, spider):
        """Route each item to its table based on the item's class."""
        if isinstance(item, ProjectjomkerjaMainItem):
            self._persist(Jobs(**item))
        elif isinstance(item, ProjectjomkerjaSecondaryItem):
            self._persist(CompanyDetails(**item))
        return item

    def process_item2(self, item, spider):
        """Store a main item plus its nested secondary item (under the
        ``item2`` key) in ONE session, linking them via the generated id.

        Fix vs. the original: model constructors were called with a
        positional item and nothing was ever flushed, so no ``job_id``
        existed.  ``session.flush()`` sends the INSERT and populates the
        parent's primary key without ending the transaction.
        """
        session = self.Session()
        try:
            job = Jobs(**item)
            session.add(job)
            session.flush()  # assigns job.id; transaction stays open
            # NOTE(review): assumes the Jobs PK attribute is ``id`` and
            # CompanyDetails accepts a ``job_id`` kwarg — confirm in models.py.
            session.add(CompanyDetails(job_id=job.id, **item['item2']))
            session.commit()
        except:  # re-raised below; bare except so rollback always runs
            session.rollback()
            raise
        finally:
            session.close()
        return item
仍然,我没有得到我的job_id。
答案 0（得分：0）
你需要在Scrapy管道（pipeline）中处理这件事。你可以创建两个项目（Item）类，每个表对应一个。在管道中，检查传入项目的类型，并据此把它存入对应的表。根据你的描述，代码大致如下：
class Item1(Item):
    """Fields destined for table1 (the jobs table)."""
    location = Field()
    salary = Field()
    job_description = Field()
    more_job_description = Field()
    others = Field()
    title = Field()
    link = Field()
    job_type = Field()
class Item2(Item):
    """Fields destined for table2 (keyed back to table1 by job_id)."""
    company = Field()
    about_company = Field()
class SQLAlchemyPipeline(object):
    """Illustrative pipeline sketch from the answer.

    NOTE: ``save_in_table1`` / ``save_in_table2`` are *pseudocode
    placeholders*, not real functions — they stand for "insert this item
    into the corresponding table and return the generated primary key".
    """
    def process_item(self, item, spider):
        # Dispatch on the item's class to pick the target table.
        if isinstance(item, Item1):
            save_in_table1
        if isinstance(item, Item2):
            save_in_table2
    def process_item2(self, item, spider):
        # Store a parent row first, then the child row carrying the
        # parent's generated id (table2.job_id -> table1's PK).
        item1 = item
        item2 = item1['item2']
        job_id = save_in_table1(item1)
        save_in_table2(item2, job_id)
答案 1（得分：0）