I have a list of ~2211 start URLs, and Scrapy crawls some of them, but not all. When I set start_urls to a single URL, it crawls that URL; when the URL is part of the large list, Scrapy does not crawl it.
Is there a limit set on start_urls?
My code:
from pymongo import MongoClient
import re
from scrapy.selector import Selector
#from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from mongo.items import MongoItem
import scrapy
import json
from scrapy.http import Request
from bs4 import BeautifulSoup as BS
uri = "mongodb://asdf@asdf.ac.commerce.com:23423423/"
client = MongoClient(uri)
db = client['page_content']
collection3 = db['category_page_content']
copyblocks3 = collection3.distinct('cwc')
copyblockss = str(copyblocks3)
hrefs = re.findall(r'href=[\'"]?([^\'" >]+)', copyblockss)
class MongoSpider(scrapy.Spider):
    name = "collections3"
    allowed_domains = ["www.ecommerce.com"]
    handle_httpstatus_list = [502, 503, 504, 400, 408, 404]
    start_urls = hrefs

    def parse(self, response):
        hxs = Selector(response)
        sites = response.selector.xpath('//html')
        items = []

        if response.status == 404:
            for site in sites:
                item = MongoItem()
                item['url'] = response.url
                item['status'] = response.status
                item['original_url'] = response.meta.get('redirect_urls', [response.url])[0]
                items.append(item)

                htmlvar = item['original_url']
                change_list = list(collection3.find({"cwc":{"$regex":htmlvar}}))

                alldata = dict()
                cwcblockdic = ""
                for a in change_list:
                    alldata.update(a)
                    ids = alldata['_id']
                    cwcblock = alldata['cwc']
                    cwcblockdic = cwcblockdic + cwcblock

                soup = BS(cwcblockdic)
                wholehref = soup.find(href=htmlvar)
                try:
                    anchortext = soup.findAll(href=htmlvar)[0].text
                except:
                    anchortext = wholehref.get_text()
                soup.find(href=htmlvar).replaceWith(anchortext)
                soup = str(soup)
                newlist = soup.replace('<html><body>', '').replace('</body></html>','')

                print "this is the anchor:", anchortext
                print "this is the href:", wholehref
                print "this is newlist:", newlist
                print "this is the id:", ids
                print "this is pagetype: CP"

                for item in change_list:
                    item['cwc'] = newlist
                    collection3.update({'_id':ids}, {"$set":{"cwc":item['cwc']}}, upsert=False)
            return items

        elif hxs.xpath('/html/head/title/text()[contains(.,"invalid")]'):
            for site in sites:
                item = MongoItem()
                item['url'] = response.url
                item['status'] = response.status
                item['original_url'] = response.meta.get('redirect_urls', [response.url])[0]
                items.append(item)

                htmlvar = item['original_url']
                change_list = list(collection3.find({"cwc":{"$regex":htmlvar}}))

                alldata = dict()
                cwcblockdic = ""
                for a in change_list:
                    alldata.update(a)
                    ids = alldata['_id']
                    cwcblock = alldata['cwc']
                    cwcblockdic = cwcblockdic + cwcblock

                soup = BS(cwcblockdic)
                wholehref = soup.find(href=htmlvar)
                try:
                    anchortext = soup.findAll(href=htmlvar)[0].text
                except:
                    anchortext = wholehref.get_text()
                soup.find(href=htmlvar).replaceWith(anchortext)
                soup = str(soup)
                newlist = soup.replace('<html><body>', '').replace('</body></html>','')

                print "this is the anchor:", anchortext
                print "this is the href:", wholehref
                print "this is newlist:", newlist
                print "this is the id:", ids
                print "this is pagetype: CP"

                for item in change_list:
                    item['cwc'] = newlist
                    collection3.update({'_id':ids}, {"$set":{"cwc":item['cwc']}}, upsert=False)
            return items

        elif hxs.xpath('//head/link[@rel="canonical"]/@href[contains(.,"invalid-category-id")]'):
            for site in sites:
                item = MongoItem()
                item['url'] = response.url
                item['status'] = response.status
                item['original_url'] = response.meta.get('redirect_urls', [response.url])[0]
                items.append(item)

                htmlvar = item['original_url']
                change_list = list(collection3.find({"cwc":{"$regex":htmlvar}}))

                alldata = dict()
                cwcblockdic = ""
                for a in change_list:
                    alldata.update(a)
                    ids = alldata['_id']
                    cwcblock = alldata['cwc']
                    cwcblockdic = cwcblockdic + cwcblock

                soup = BS(cwcblockdic)
                wholehref = soup.find(href=htmlvar)
                try:
                    anchortext = soup.findAll(href=htmlvar)[0].text
                except:
                    anchortext = wholehref.get_text()
                soup.find(href=htmlvar).replaceWith(anchortext)
                soup = str(soup)
                newlist = soup.replace('<html><body>', '').replace('</body></html>','')

                print "this is the anchor:", anchortext
                print "this is the href:", wholehref
                print "this is newlist:", newlist
                print "this is the id:", ids
                print "this is pagetype: CP"

                for item in change_list:
                    item['cwc'] = newlist
                    collection3.update({'_id':ids}, {"$set":{"cwc":item['cwc']}}, upsert=False)
            return items

        else:
            if hxs.xpath('//*[@class="result-summary-container"]/text()[contains(.,"Showing 0 of")]'):
                for site in sites:
                    item = MongoItem()
                    item['url'] = response.url
                    item['status'] = response.status
                    item['original_url'] = response.meta.get('redirect_urls', [response.url])[0]
                    items.append(item)

                    htmlvar = item['original_url']
                    change_list = list(collection3.find({"cwc":{"$regex":htmlvar}}))

                    alldata = dict()
                    cwcblockdic = ""
                    for a in change_list:
                        alldata.update(a)
                        ids = alldata['_id']
                        cwcblock = alldata['cwc']
                        cwcblockdic = cwcblockdic + cwcblock

                    soup = BS(cwcblockdic)
                    wholehref = soup.find(href=htmlvar)
                    try:
                        anchortext = soup.findAll(href=htmlvar)[0].text
                    except:
                        anchortext = wholehref.get_text()
                    soup.find(href=htmlvar).replaceWith(anchortext)
                    soup = str(soup)
                    newlist = soup.replace('<html><body>', '').replace('</body></html>','')

                    print "this is the anchor:", anchortext
                    print "this is the href:", wholehref
                    print "this is newlist:", newlist
                    print "this is the id:", ids
                    print "this is pagetype: CP"

                    for item in change_list:
                        item['cwc'] = newlist
                        collection3.update({'_id':ids}, {"$set":{"cwc":item['cwc']}}, upsert=False)
                return items
Answer 0 (score: 1)
This may be only one of the reasons, but it is still a valid one: your list of start URLs contains duplicates:
>>> urls = [...] # list of urls you've posted
>>> len(urls)
2221
>>> len(set(urls))
1177
By default, Scrapy filters out duplicate requests.
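If you want every unique URL to be crawled while keeping the default dupefilter, one option is simply to de-duplicate the list before assigning it to start_urls. A minimal sketch under the question's setup (the unique helper is hypothetical, not part of the original code; copyblockss and hrefs come from the question's module-level code):

# Hypothetical order-preserving de-duplication of the extracted hrefs.
def unique(urls):
    seen = set()
    deduped = []
    for url in urls:
        if url not in seen:
            seen.add(url)
            deduped.append(url)
    return deduped

hrefs = unique(re.findall(r'href=[\'"]?([^\'" >]+)', copyblockss))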
Answer 1 (score: 0)
If you build the requests yourself in start_requests(), you can pass dont_filter=True so that Scrapy does not filter out duplicate requests:

def start_requests(self):
    ''' Your start url logic '''
    yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
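For the spider in the question, a minimal sketch of that approach might look like the following (it assumes the module-level hrefs list from the question's code and is not the original implementation):

# Sketch: build requests explicitly instead of relying on start_urls,
# passing dont_filter=True so repeated URLs are not dropped by the dupefilter.
class MongoSpider(scrapy.Spider):
    name = "collections3"
    allowed_domains = ["www.ecommerce.com"]
    handle_httpstatus_list = [502, 503, 504, 400, 408, 404]

    def start_requests(self):
        for url in hrefs:
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # ... same parse logic as in the question ...
        pass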