I made a simple Scrapy script that scrapes job listings from https://www.jobs2careers.com and exports the data to a CSV file. The problem is that when I put more than one URL in start_urls, the data gets overwritten.
I have also tried a few other Python libraries, such as openpyxl. Maybe the problem comes from running multiple spiders?
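For context, the spider below imports QuotetutorialItem from items.py, which is not shown in the question. A minimal definition consistent with the fields the spider sets would look something like this (an assumption inferred from the spider code, not the asker's actual file):

import scrapy

class QuotetutorialItem(scrapy.Item):
    # fields inferred from what the spider assigns; the real items.py may differ
    url = scrapy.Field()
    industry = scrapy.Field()
    state = scrapy.Field()
    company_name = scrapy.Field()
    job_title = scrapy.Field()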
import scrapy
from scrapy.selector import Selector
from ..items import QuotetutorialItem


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    n = 1
    start_urls = ['https://www.jobs2careers.com/results3.php?q=Fashion&l=Miami%2C+FL&s=00']

    def parse(self, response):
        items = QuotetutorialItem()
        s = Selector(response)
        quote = s.xpath("//div[@id='jobs_container']/div")
        for q in quote:
            url = response.url
            industry = q.xpath("//input[@id='search_q']/@value").get()
            state = q.xpath("//*[@id='results_controller']/div[1]/div[1]/div/div/div/div/a[3]/text()").get()
            company_name = q.xpath(".//div[@class='companyname']/span[@class='company']/text()").get()
            job_title = q.xpath(".//div[@class='title title1 hidden-xs']/text()").get()

            items['url'] = url
            items['industry'] = industry
            items['state'] = state
            items['company_name'] = company_name
            items['job_title'] = job_title
            yield items

        num = int(response.xpath("//h1[@class='result2']//text()").get().split("\n")[0].replace(',', ''))
        if num > 1000:
            num = 1000
        total = int(num) // 10 + 1

        np = response.url
        np = np[np.rfind('=') + 1:]
        next_page = response.url.replace(np, str(self.n * 10))
        if self.n < total:
            self.n += 1
            yield response.follow(next_page, callback=self.parse)
Answer 0 (score: 0)
The data is not actually being overwritten here. You are using self.n to limit pagination, so in total you only get 1000 items. Both start URLs schedule the same parse callback, and those callbacks all increment the spider's self.n attribute asynchronously. The first URL moves self.n from 1 to 2, the second moves it from 2 to 3, then the first moves it from 3 to 4, and so on. Because it is asynchronous there is no guarantee it happens in exactly that order, but something like it happens on every run.
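One way to avoid that shared state is to carry the page offset along with each request instead of storing it on the spider. A rough sketch (assuming the same s= offset pagination as the question, with the item extraction omitted; names are illustrative, not from the original answer):

import scrapy

class MetaPaginationSpider(scrapy.Spider):
    name = 'jobs_meta_pagination'  # illustrative name
    start_urls = [
        'https://www.jobs2careers.com/results3.php?q=Fashion&l=Miami%2C+FL&s=00',
        'https://www.jobs2careers.com/results3.php?q=Healthcare&l=Chicago%2C+IL&s=00',
    ]

    def parse(self, response):
        # ... extract and yield the items exactly as in the question ...

        # The offset of the page just parsed travels with the request in meta,
        # so each start URL paginates independently instead of sharing self.n.
        offset = response.meta.get('offset', 0)
        next_offset = offset + 10
        if next_offset < 1000:
            next_page = response.url.replace('s=%02d' % offset, 's=%d' % next_offset)
            yield response.follow(next_page, callback=self.parse,
                                  meta={'offset': next_offset})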
Answer 1 (score: 0)
As pwinz said, this is caused by the logic that moves to the next page. The code below builds the next page's URL from the current page's URL and stops once it reaches 1000 jobs or the total number of jobs.
import scrapy
from scrapy.selector import Selector
from ..items import JobsItem
from urllib.parse import urlparse, urlunparse, urlencode, parse_qsl


class QuotesSpider(scrapy.Spider):
    name = 'jobscareers'
    start_urls = [
        'https://www.jobs2careers.com/results3.php?q=Healthcare&l=Chicago%2C+IL&s=00',
        'https://www.jobs2careers.com/results3.php?q=Fashion&l=Miami%2C+FL&s=00',
    ]

    def parse(self, response):
        items = JobsItem()
        s = Selector(response)
        quote = s.xpath("//div[@id='jobs_container']/div")
        for q in quote:
            url = response.url
            industry = q.xpath("//input[@id='search_q']/@value").get()
            state = q.xpath("//*[@id='results_controller']/div[1]/div[1]/div/div/div/div/a[3]/text()").get()
            company_name = q.xpath(".//div[@class='companyname']/span[@class='company']/text()").get()
            job_title = q.xpath(".//div[@class='title title1 hidden-xs']/text()").get()

            items['url'] = url
            items['industry'] = industry
            items['state'] = state
            items['company_name'] = company_name
            items['job_title'] = job_title
            yield items

        num = int(response.xpath("//h1[@class='result2']//text()").get().split("\n")[0].replace(',', ''))

        # Build the next page's URL from the current URL's "s" (offset) query parameter.
        parsed_url = urlparse(response.url)
        query = dict(parse_qsl(parsed_url.query))
        try:
            s = int(query['s'])
        except (ValueError, KeyError):
            s = 0
        if s < 1000 and s <= num:
            new_query = query
            new_query['s'] = str(s + 10)
            next_url = urlunparse(parsed_url._replace(query=urlencode(new_query)))
            yield scrapy.Request(next_url, callback=self.parse)
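Whichever pagination fix is used, the CSV export itself is easiest to leave to Scrapy's built-in feed exports rather than writing the file by hand, for example (using the spider name from this answer):

scrapy crawl jobscareers -o jobs.csv

Every item yielded by the spider, from all start URLs, then ends up in the same file.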