我是scrapy的新手,在运行蜘蛛爬取behance时遇到了问题。我的蜘蛛代码如下:
import scrapy
from scrapy.selector import Selector
from behance.items import BehanceItem
from selenium import webdriver
from scrapy.http import TextResponse
from scrapy.crawler import CrawlerProcess
class DmozSpider(scrapy.Spider):
    """Render a Behance gallery page with Selenium and extract the
    HD image source URLs into a :class:`BehanceItem`.

    Selenium is used because the ``data-hd-src`` attributes are
    injected by JavaScript and are absent from the raw HTTP response.
    """

    name = "behance"
    # allowed_domains = ["behance.com"]
    start_urls = [
        "https://www.behance.net/gallery/29535305/Mind-Your-Monsters",
    ]

    def __init__(self, *args, **kwargs):
        # Fix: forward args to scrapy.Spider.__init__ — overriding it
        # without calling super() breaks spider-argument handling.
        super(DmozSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def parse(self, response):
        """Re-fetch *response.url* through Selenium and yield one item
        whose ``link`` field holds the list of HD image URLs."""
        self.driver.get(response.url)
        # Wrap the JS-rendered DOM so Scrapy selectors work on it.
        response = TextResponse(url=response.url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        item = BehanceItem()
        item['link'] = response.xpath(
            "//div[@class='js-project-module-image-hd project-module "
            "module image project-module-image']/@data-hd-src").extract()
        yield item

    def closed(self, reason):
        # Fix: quit the browser when the spider finishes; otherwise the
        # Firefox process is leaked on every run.
        self.driver.quit()
# Guard the entry point: starting a crawl as an import side effect is
# what makes `scrapy crawl` and plain imports of this module misbehave.
if __name__ == "__main__":
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(DmozSpider)
    process.start()  # blocks until the crawl finishes
当我运行我的抓取工具时,我在命令行上遇到以下错误
Traceback (most recent call last):
  File "/home/davy/behance/behance/spiders/behance_spider.py", line 3, in &lt;module&gt;
    from behance.items import BehanceItem
ImportError: No module named behance.items
我的目录结构:
behance/
├── behance
│ ├── __init__.py
│ ├── items.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ └── behance_spider.py
└── scrapy.cfg
答案 0（得分：1）
尝试使用此命令运行蜘蛛:
scrapy crawl behance
或者更改你的蜘蛛文件:
import scrapy
from scrapy.selector import Selector
from behance.items import BehanceItem
from selenium import webdriver
from scrapy.http import TextResponse
from scrapy.crawler import CrawlerProcess
class BehanceSpider(scrapy.Spider):
    """Render a Behance gallery page with Selenium and extract the
    HD image source URLs into a :class:`BehanceItem`.

    Selenium is used because the ``data-hd-src`` attributes are
    injected by JavaScript and are absent from the raw HTTP response.
    """

    name = "behance"
    # Fix: the start URL is on behance.net, not behance.com — with the
    # wrong domain the OffsiteMiddleware would filter followed requests.
    allowed_domains = ["behance.net"]
    start_urls = [
        "https://www.behance.net/gallery/29535305/Mind-Your-Monsters",
    ]

    def __init__(self, *args, **kwargs):
        # Fix: forward args to scrapy.Spider.__init__ — overriding it
        # without calling super() breaks spider-argument handling.
        super(BehanceSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def parse(self, response):
        """Re-fetch *response.url* through Selenium and yield one item
        whose ``link`` field holds the list of HD image URLs."""
        self.driver.get(response.url)
        # Wrap the JS-rendered DOM so Scrapy selectors work on it.
        response = TextResponse(url=response.url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        item = BehanceItem()
        item['link'] = response.xpath(
            "//div[@class='js-project-module-image-hd project-module "
            "module image project-module-image']/@data-hd-src").extract()
        yield item

    def closed(self, reason):
        # Fix: quit the browser when the spider finishes; otherwise the
        # Firefox process is leaked on every run.
        self.driver.quit()
在 settings.py 文件所在的目录中创建另一个 Python 文件 run.py:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
# Guard the entry point so importing run.py does not start a crawl.
if __name__ == "__main__":
    # get_project_settings() loads settings.py, so the project's item
    # pipelines and module path (behance.items) resolve correctly.
    process = CrawlerProcess(get_project_settings())
    # Crawl by registered spider name rather than importing the class.
    process.crawl("behance")
    process.start()  # blocks until the crawl finishes
现在像运行普通 Python 脚本一样运行该文件即可: python run.py
答案 1（得分：0）
您可以将它添加到您的python路径:
export PYTHONPATH=$PYTHONPATH:/home/davy/behance/