这就是项目树的样子:
connection
pipelines.py
rym_chart_scraper
├───scrapy.cfg
├───rym_chart_scraper
│ ├───__init__.py
│ ├───items.py
│ ├───models.py
├───pipelines.py
├───settings.py
├───spiders
├───my_spider.py
├───__init__.py
pipelines.py:
# Use the package-qualified import so the pipeline resolves when Scrapy is
# launched from the project root: "from models import ..." only works when
# the current working directory is the inner package itself, which is why
# the interactive import succeeds but "scrapy crawl" raises ImportError.
from rym_chart_scraper.models import TopAlbums, db_connect, create_topalbums_table
from sqlalchemy.orm import sessionmaker
class TopAlbumPipeline:
    """Scrapy item pipeline that persists scraped albums via SQLAlchemy."""

    def __init__(self):
        # Build the engine once, ensure the table exists, and keep a session
        # factory; a fresh Session is opened per item in process_item().
        engine = db_connect()
        create_topalbums_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """Insert one item as a TopAlbums row; re-raise on failure.

        Returns the item unchanged so later pipelines still receive it.
        """
        session = self.Session()
        topalbums = TopAlbums(**item)
        try:
            session.add(topalbums)
            session.commit()
        except Exception:
            # Roll back the failed transaction, then let Scrapy see the
            # error. (A bare "except:" would also trap SystemExit and
            # KeyboardInterrupt, which should propagate untouched.)
            session.rollback()
            raise
        finally:
            session.close()
        return item
models.py:
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime
import settings
# Declarative base shared by all ORM models in this module; its metadata is
# what create_topalbums_table() materializes.
Base = declarative_base()
def db_connect():
    """Build a SQLAlchemy engine from the DATABASE dict in settings."""
    db_url = URL(**settings.DATABASE)
    return create_engine(db_url)
def create_topalbums_table(engine):
    # Idempotent: create_all only creates tables that do not already exist.
    Base.metadata.create_all(engine)
class TopAlbums(Base):
    """ORM model for one row of the scraped top-albums chart."""

    __tablename__ = 'top_albums'

    id = Column(Integer, primary_key=True)
    # Explicit column names mirror the scraped item field names, which lets
    # the pipeline construct rows directly with TopAlbums(**item).
    Artist = Column('Artist', String)
    Album = Column('Album', String)
    Chart_year = Column('Chart_year', String)
    Genre = Column('Genre', String)
    Ratings = Column('Ratings', Integer)
    Reviews = Column('Reviews', Integer)
    Date = Column('Date', DateTime)
蜘蛛 (spiders/my_spider.py):
from scrapy import Spider, Request
from rym_chart_scraper.utility import find_between, listToString
from rym_chart_scraper.items import TopAlbumChartItem
from datetime import datetime
class TopAlbumChartSpider(Spider):
    """Crawl rateyourmusic.com's all-time top-album chart, page by page."""

    name = "top_music_charts"
    allowed_domains = ['rateyourmusic.com']
    start_urls = [
        "https://rateyourmusic.com/charts/top/album/all-time"
    ]
    # Pages fetched so far; parse() stops following "next" links at 30 pages.
    n_pages = 1

    def parse(self, response):
        """Yield one item per chart entry, then follow the next-page link."""
        for album, stats in zip(response.css('div.chart_main'),
                                response.css('div.chart_stats')):
            # NOTE(review): item construction elided in the original
            # snippet; `item` is presumably a TopAlbumChartItem built from
            # `album` and `stats` — confirm against the full source.
            ...
            yield item

        # Pull the href from the "next" anchor in one selector query:
        # extract_first() returns None when the link is absent, whereas the
        # original indexing ([0]) on an empty selector list would raise
        # IndexError on the last page.
        next_page = response.css('a.navlinknext::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            self.n_pages += 1
            if self.n_pages < 31:
                yield Request(next_page, callback=self.parse)
当我运行下面的命令时,我收到导入错误 (ImportError):
scrapy crawl top_music_charts
从命令行运行实际的蜘蛛时会报错,但在 main 中以交互方式导入 models 却不会出错。是项目结构有问题吗?还是其他什么愚蠢的错误?出于某种原因,我一直想不明白这一点。