I am trying to run Scrapy from a Python script. I have almost succeeded (I think), but something is not working. In my code there is a line like run_spider(quotes5). quotes5 is the name of the spider that I used to run from cmd with scrapy crawl quotes5. Any help? The error is that quotes5 is not defined.
Here is my code:
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
import json
import csv
import re
from crochet import setup
from importlib import import_module
from scrapy.utils.project import get_project_settings

setup()

def run_spider(spiderName):
    module_name = "WS_Vardata.spiders.{}".format(spiderName)
    scrapy_var = import_module(module_name)    # do some dynamic import of selected spider
    spiderObj = scrapy_var.QuotesSpider()      # get mySpider-object from spider module
    crawler = CrawlerRunner(get_project_settings())  # from Scrapy docs
    crawler.crawl(spiderObj)

run_spider(quotes5)
The spider code (quotes_spider.py):
import scrapy
import json
import csv
import re

class QuotesSpider(scrapy.Spider):
    name = "quotes5"

    def start_requests(self):
        with open('input.csv', 'r') as csvf:
            urlreader = csv.reader(csvf, delimiter=',', quotechar='"')
            for url in urlreader:
                if url[0] == "y":
                    yield scrapy.Request(url[1])
        #with open('so_52069753_out.csv', 'w') as csvfile:
            #fieldnames = ['Category', 'Type', 'Model', 'SK']
            #writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            #writer.writeheader()

    def parse(self, response):
        regex = re.compile(r'"product"\s*:\s*(.+?\})', re.DOTALL)
        regex1 = re.compile(r'"pathIndicator"\s*:\s*(.+?\})', re.DOTALL)
        source_json1 = response.xpath("//script[contains(., 'var digitalData')]/text()").re_first(regex)
        source_json2 = response.xpath("//script[contains(., 'var digitalData')]/text()").re_first(regex1)
        model_code = response.xpath('//script').re_first('modelCode.*?"(.*)"')
        if source_json1 and source_json2:
            source_json1 = re.sub(r'//[^\n]+', "", source_json1)
            source_json2 = re.sub(r'//[^\n]+', "", source_json2)
            product = json.loads(source_json1)
            path = json.loads(source_json2)
            product_category = product["pvi_type_name"]
            product_type = product["pvi_subtype_name"]
            product_model = path["depth_5"]
            product_name = product["model_name"]
        if source_json1 and source_json2:
            source1 = source_json1[0]
            source2 = source_json2[0]
            with open('output.csv', 'a', newline='') as csvfile:
                fieldnames = ['Category', 'Type', 'Model', 'Name', 'SK']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                if product_category:
                    writer.writerow({'Category': product_category, 'Type': product_type, 'Model': product_model, 'Name': product_name, 'SK': model_code})
Answer 0 (score: 2)
Since the error says that quotes5 is not defined, you need to define quotes5 before passing it to the method. Or try something like this:
run_spider("quotes5")
Edit:
import WS_Vardata.spiders.quotes_spider as quote_spider_module

def run_spider(spiderName):
    # get the class from within the module
    spiderClass = getattr(quote_spider_module, spiderName)
    # create the object and you're good to go
    spiderObj = spiderClass()
    crawler = CrawlerRunner(get_project_settings())  # from Scrapy docs
    crawler.crawl(spiderObj)

run_spider("QuotesSpider")
This script should be run from the same directory as WS_Vardata, so in your case:
- TEST
  | the_code.py
  | WS_Vardata
    | spiders
      | quotes_spider <= containing the QuotesSpider class
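Note that CrawlerRunner.crawl() only schedules the crawl and returns a Deferred, so a plain script can reach its end before the spider finishes. Since crochet is already imported in the question, its wait_for decorator is one way to block until the crawl completes. A minimal sketch along those lines, assuming the project layout above (an illustration, not the only way to do it):

from crochet import setup, wait_for
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
import WS_Vardata.spiders.quotes_spider as quote_spider_module

setup()  # let crochet manage the Twisted reactor in a background thread

@wait_for(timeout=600.0)  # block the calling thread until the crawl finishes (or times out)
def run_spider(spiderName):
    spiderClass = getattr(quote_spider_module, spiderName)
    crawler = CrawlerRunner(get_project_settings())
    # crawl() also accepts the spider class directly; it returns a Deferred,
    # which wait_for turns into a blocking call
    return crawler.crawl(spiderClass)

run_spider("QuotesSpider")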