我正在尝试使用 scrapy 创建一个网络爬虫。我沿用了之前用过的模板，但似乎无法解析页面内容。我可以看到爬虫进入了 YouTube 并打开了视频页面，但从那里它提取不到标题、描述或任何内容——解析总是失败。
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import log
from krakenkrawler.items import KrakenItem
class AttractionSpider(CrawlSpider):
name = "thekraken"
allowed_domains = ["youtube.com"]
start_urls = [
"http://www.youtube.com/?gl=GB&hl=en-GB"
]
rules = ()
def __init__(self, name=None, **kwargs):
super(AttractionSpider, self).__init__(name, **kwargs)
self.items_buffer = {}
self.base_url = "http://www.youtube.com"
from scrapy.conf import settings
settings.overrides['DOWNLOAD_TIMEOUT'] = 360
def parse(self, response):
print "Start scrapping Attractions...."
try:
hxs = HtmlXPathSelector(response)
links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
if not links:
return
log.msg("No Data to scrap")
for link in links:
v_url = ''.join( link.extract() )
if not v_url:
continue
else:
_url = self.base_url + v_url
yield Request( url= _url, callback=self.parse_details )
except Exception as e:
log.msg("Parsing failed for URL {%s}"%format(response.request.url))
raise
def parse_details(self, response):
print "Start scrapping Detailed Info...."
try:
hxs = HtmlXPathSelector(response)
l_venue = KrakenItem()
v_name = hxs.select("//*[@id='eow-title'].text").extract()
if not v_name:
v_name = hxs.select("//*[@id='eow-title'].text").extract()
l_venue["name"] = v_name[0].strip()
base = hxs.select("//*[@id='content']/div[7]")
if base.extract()[0].strip() == "<div style=\"clear:both\"></div>":
base = hxs.select("//*[@id='content']/div[8]")
elif base.extract()[0].strip() == "<div style=\"padding-top:10px;margin-top:10px;border-top:1px dotted #DDD;\">\n You must be logged in to add a tip\n </div>":
base = hxs.select("//*[@id='content']/div[6]")
x_datas = base.select("div[1]/b").extract()
v_datas = base.select("div[1]/text()").extract()
i_d = 0;
if x_datas:
for x_data in x_datas:
print "data is:" + x_data.strip()
if x_data.strip() == "<b>Address:</b>":
l_venue["address"] = v_datas[i_d].strip()
if x_data.strip() == "<b>Contact:</b>":
l_venue["contact"] = v_datas[i_d].strip()
if x_data.strip() == "<b>Operating Hours:</b>":
l_venue["hours"] = v_datas[i_d].strip()
if x_data.strip() == "<b>Website:</b>":
l_venue["website"] = (base.select("//*[@id='watch-actions-share-panel']/div/div[2]/div[2]/span[1]/input/text()").extract())[0].strip()
i_d += 1
v_photo = base.select("img/@src").extract()
if v_photo:
l_venue["photo"] = v_photo[0].strip()
v_desc = base.select("div[3]/text()").extract()
if v_desc:
desc = ""
for dsc in v_desc:
desc += dsc
l_venue["desc"] = desc.strip()
v_video = hxs.select("//*[@id='content']/iframe/@src").extract()
if v_video:
l_venue["video"] = v_video[0].strip()
yield l_venue
except Exception as e:
log.msg("Parsing failed for URL {%s}"%format(response.request.url))
raise
提前感谢。
答案 0（得分：0）
问题在于您要查找的结构 "//h3[@class='yt-lockup-title']//a/@href" 并非在所有页面中都存在。
我修改了您的代码以验证打开了哪些页面以及提取了哪些数据:
class AttractionSpider(CrawlSpider):
name = "thekraken"
bot_name = 'kraken'
allowed_domains = ["youtube.com"]
start_urls = ["http://www.youtube.com/?gl=GB&hl=en-GB"]
rules = (
Rule(SgmlLinkExtractor(allow=('')), callback='parse_items',follow= True),
)
def parse_items(self, response):
print "Start scrapping Attractions...."
print response.url
try :
hxs = HtmlXPathSelector(response)
links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
for link in links:
v_url = ''.join( link.extract() )
print v_url
if not links:
log.msg("No Data to scrap")
except :
pass
结果是这样的:
开始抓取景点.... http://www.youtube.com/watch?v=GBdCbciGLK0
开始抓取景点.... http://www.youtube.com/watch?v=BxUjDpnSHyc&list=TL4PEfm95Wz3k
开始抓取景点.... http://www.youtube.com/watch?v=T-CZW4YjAig
开始抓取景点.... https://www.youtube.com/user/ComedyShortsGamer
/watch?v=TdICODRvAhc&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=CDGzm5edrlw&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=F2oR5KS54JM&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=LHRzOIvqmQI&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=F4iqiM6h-2U&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=ug3UPIvWlvU&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=msiZs6lIZ9w&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=Jh6A3DoOLBg&list=UUrqsNpKuDQZreGaxBL_a5Jg
在没有提取到结果的内页中，不存在 "yt-lockup-title" 这个类。简而言之，你必须改进你的蜘蛛。