Disclaimer: I'm new to Scrapy.
I have a working version of the spider, but it only picks up a single image. How can I follow a link to a new page, take the image from it, add it to the rest of the item fields, and save everything to the database? In this case there can be any number of images per post.
At the end of the code I have written in comments what I want to do, and there is a rough sketch of it right after the code. I have also made a visual scheme.
Spider code, with the comments at the end:
import scrapy
from urllib.parse import urljoin
from scrapy_splash import SplashRequest

from ..items import MyItem  # adjust to wherever MyItem is defined


class MySpider(scrapy.Spider):
    name = 'MySpider'
    download_delay = 20
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/page/']
    visited_urls = []

    # Take the post urls from the listing page
    def parse(self, response):
        lua_script = """
        function main(splash, args)
            assert(splash:go(splash.args.url))
            -- requires Splash 2.3
            while not splash:select(".image-group") do
                splash:wait(40)
            end
            return {html=splash:html()}
        end
        """
        if response.url not in self.visited_urls:
            self.visited_urls.append(response.url)
            for post_link in response.xpath('//*[@id="search-results"]/div/div/article/ul/li/@data-url').extract():
                url = urljoin(response.url, post_link)
                yield SplashRequest(url, self.parse_post, endpoint='execute', args={'lua_source': lua_script})
            next_pages = response.xpath('//*[contains(@rel, "next")]/@href').extract()
            if next_pages:
                next_page_url = urljoin(response.url + '/', next_pages[-1])
                yield response.follow(next_page_url, callback=self.parse)

    def parse_post(self, response):
        items = MyItem()
        link = response.xpath('//*[@property="og:url"]/@content').extract()
        title = response.xpath('//*[@id="product-header"]/h1/text()').extract()
        tags = response.xpath('//*[@id="widget-tags"]/div/a/@data-tag').extract()
        category = 'Category'
        price = response.xpath('//*[@name="twitter:data1"]/@content').extract()
        items['link'] = ''.join(link).strip()
        items['title'] = ''.join(title).strip()
        items['tags'] = ', '.join(tags).strip()
        items['category'] = category
        items['price'] = ''.join(price).strip()
        yield items
        ### Go to page(url = item['link'] + '/images/screenshot1')
        # Take ImageLink = response.xpath('//*[@class="preview"]/img/@src').extract()
        # Add the fields already collected in parse_post (link, title, tags, category, price)
        # yield all fields (link, title, tags, category, price, imagelink) so they end up in the database
        ### Then do the same for item['link'] + '/images/screenshot2', '/images/screenshot3', ...
        # ... and repeat for as many screenshot pages as the post has (the number differs from post to post)
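In case it is clearer as code than as comments, here is a minimal sketch of the chaining I have in mind. It assumes the partially filled item can be handed to the next callback through Request.meta, that the screenshot pages really live at item['link'] + '/images/screenshotN', and that MyItem has an imagelink field; the parse_image name is just a placeholder I made up.

    # Inside the same MySpider class (import copy would also be needed at the
    # top of the file). A plain scrapy.Request is used here for simplicity;
    # the screenshot pages may need a SplashRequest just like the post pages.
    def parse_post(self, response):
        items = MyItem()
        items['link'] = ''.join(response.xpath('//*[@property="og:url"]/@content').extract()).strip()
        # ... fill title, tags, category, price exactly as above ...

        # Instead of yielding the item straight away, follow the first
        # screenshot page and carry the partially filled item in the meta.
        yield scrapy.Request(
            items['link'] + '/images/screenshot1',
            callback=self.parse_image,
            meta={'items': items, 'screenshot': 1},
        )

    def parse_image(self, response):
        items = response.meta['items']
        screenshot = response.meta['screenshot']

        image_link = response.xpath('//*[@class="preview"]/img/@src').extract_first()
        if image_link:
            # One item per screenshot; copy so the yielded items don't share state.
            full_item = copy.deepcopy(items)
            full_item['imagelink'] = image_link
            yield full_item  # link, title, tags, category, price, imagelink -> database

            # Try the next screenshot page; the chain stops when a page has no
            # preview image (or when the request comes back as a 404).
            yield scrapy.Request(
                items['link'] + '/images/screenshot{}'.format(screenshot + 1),
                callback=self.parse_image,
                meta={'items': items, 'screenshot': screenshot + 1},
            )

Is passing the item through meta like this the right approach, or would it be better to collect all the image links into one field and yield a single item per post?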