In my spider file, I'm trying to pass extra fields from a JSON file through to the pipeline file so they can be included in a database insert or update. How do I do that? What is the correct approach?

In the spider I have two nested for loops, which I know is incorrect. The goal is for the pipelines file to select from the database: if a row already exists for the primary key (url), call an update; if not, call an insert. The function get_data searches the database for the row and then calls set_data_update or set_insert_data. Thanks for the help!
myspider.py:
import scrapy
import json
import sys

from ..items import AmazonItem


class MySpider(scrapy.Spider):
    name = 'price_monitor'
    newlist = []
    start_urls = []
    itemdatalist = []

    with open('C:\\Users\\Documents\\python_virtual\\price_monitor\\price_monitor\\products.json') as f:
        data = json.load(f)
        itemdatalist = data['itemdata']
        # print(type(data['itemdata']))
        for item in itemdatalist:
            start_urls.append(item['url'])

    def start_requests(self):
        for item in MySpider.start_urls:
            yield scrapy.Request(url=item, callback=self.parse)

    def parse(self, response):
        for url in MySpider.start_urls:
            scrapeitem = AmazonItem()
            title = response.css('span#productTitle::text').extract_first()
            title = title.strip()
            price = response.css('span#priceblock_ourprice::text').extract_first()
            scrapeitem['title'] = title
            scrapeitem['price'] = price
            for item in MySpider.data['itemdata']:
                url = item['url']
                name = item['name']
                email = item['email']
                scrapeitem['url'] = url
                scrapeitem['name'] = name
                scrapeitem['email'] = email
                yield scrapeitem
pipelines.py:
import sqlite3


class PriceMonitorPipeline(object):

    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = sqlite3.connect("price_monitor.db")
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        self.get_data(item)
        return item

    def get_data(self, item):
        """ Check if the row already exists for this url """
        new_price = ''
        self.curr.execute("""select url, new_price from price_monitor WHERE url=url""",
                          {'url': item['url']})
        rows = self.curr.fetchone()
        print("Printing rows")
        print(rows)
        rows_url = rows[0]
        print("Rows url")
        print(rows_url)
        new_price = rows[1]

        for item['url'] in rows_url:
            if item['url'] == rows_url:
                print("calling db func")
                self.set_data_update(item, rows_url, new_price)
            else:
                print("DB Calling Insert")
                self.set_insert_data(item)

    def set_insert_data(self, item):
        self.curr.execute(""" insert into price_monitor values (?, ?, ?, ?, ?, ?)""", (
            item['url'],
            item['title'],
            item['name'],
            item['email'],
            item['price'],
            item['price'],
        ))
        self.conn.commit()

    def set_data_update(self, item, rows_url, new_price):
        old_price = new_price
        self.curr.execute("""update price_monitor SET old_price=?, new_price=?
                          WHERE url=?""",
                          (old_price, item['price'], rows_url))
        print("DB Update ran")
        self.conn.commit()
The JSON file I pull the URLs and other data from:
{
  "itemdata": [
    {
      "url": "https://www.amazon.com/dp/B07GWKT87L/?coliid=I36XKNB8MLE3&colid=KRASGH7290D0&psc=0&ref_=lv_ov_lig_dp_it#customerReview",
      "title": "coffee_maker_black_and_decker",
      "name": "Cobi Maguire",
      "email": "cobi@noemail.com"
    },
    {
      "url": "https://www.amazon.com/Hamilton-Beach-46310-Programmable-Coffee/dp/B07684BPLB/ref=sr_1_10?keywords=coffee+maker&qid=1559098604&s=home-garden&sr=1-10",
      "title": "coffee_maker_hamilton_beach",
      "name": "Ryan Murphy",
      "email": "ryan@noemail.com"
    }
  ]
}
Answer 0 (score: 1):
You can use response.meta:
import scrapy
import json

from ..items import AmazonItem


class MySpider(scrapy.Spider):
    name = 'price_monitor'

    def start_requests(self):
        with open('C:\\Users\\Documents\\python_virtual\\price_monitor\\price_monitor\\products.json') as f:
            data = json.load(f)
        itemdatalist = data['itemdata']
        for item in itemdatalist:
            yield scrapy.Request(url=item['url'], callback=self.parse, meta={'item': item})

    def parse(self, response):
        item = response.meta["item"]
        scrapeitem = AmazonItem()
        title = response.css('span#productTitle::text').extract_first()
        title = title.strip()
        price = response.css('span#priceblock_ourprice::text').extract_first()
        scrapeitem['title'] = title
        scrapeitem['price'] = price
        scrapeitem['url'] = item["url"]      # from response.meta; the pipeline keys on url
        scrapeitem['name'] = item["name"]    # from response.meta
        scrapeitem['email'] = item["email"]  # from response.meta
        yield scrapeitem
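As a side note, newer Scrapy versions (1.7+) also offer cb_kwargs for passing data to a callback, but meta works fine here.

That covers passing the extra fields to the pipeline. The pipeline's get_data also needs fixing before the update-or-insert logic behaves: WHERE url=url compares the column with itself, so it matches every row; fetchone() returns None when there is no match, which crashes rows[0]; and for item['url'] in rows_url iterates over the characters of the url string. A minimal sketch of a corrected get_data for PriceMonitorPipeline, assuming the six-column price_monitor table (url, title, name, email, old_price, new_price) implied by the insert statement:

    def get_data(self, item):
        """ Update the row if this url already exists, otherwise insert it. """
        # ":url" is a named placeholder bound from the dict below; the original
        # "WHERE url=url" compared the column to itself and matched every row.
        self.curr.execute(
            "SELECT url, new_price FROM price_monitor WHERE url = :url",
            {'url': item['url']})
        row = self.curr.fetchone()
        if row is None:
            # No row yet for this url: insert a fresh one
            self.set_insert_data(item)
        else:
            # Row exists: its current new_price becomes old_price in the update
            self.set_data_update(item, row[0], row[1])

set_insert_data and set_data_update can then stay as they are, since each already commits its own statement.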