我一直在尝试用 Scrapy(XPath)从 KBB 页面的 HTML 中提取 script 标签里的数据。我的主要问题是如何定位到正确的 div 和 script 标签。我是 XPath 新手,非常感谢任何帮助!


 
 <script type="text/javascript" src="http://s1.kbb.com/combine/IncentivesPilotJs/949332058"></script>
 <input type="hidden" id="ResaleValueUrl" value="/ymmt/resalevalue/?vehicleid=392396"/>
 <input type="hidden" id="Intent" value="buy-used"/>
 <!--[if lt IE 9]>
 <script>
 window.FlashCanvasOptions = {
 swfPath: "/js/canvas/FlashCanvas/UCMarketMeter/"
 };
 </script>
 <script type="text/javascript" src="http://s1.kbb.com/combine/YmmtMarketMeterFlashCanvasJs/795892638"></script>
 <![endif]-->
 <script type="text/javascript" src="http://s1.kbb.com/combine/YMMTOverview/1527402533"></script>
 <script type="text/javascript" src="http://s1.kbb.com/combine/YmmtPricingOverviewBuyUsedJs/-1416499456"></script>

 <script language="javascript" type="text/javascript">
 $(document).ready(function(){
 KBB.Vehicle.Pages.PricingOverview.Buyers.setup({
 //Workaround until we get cross domain working for Flash
 imageDir: window.FlashCanvasOptions ? "/Content/images" : "http://file.kelleybluebookimages.com/kbb/images/marketmeter",
 vehicleId: "392396",
 zipCode: "78701",
 mileage: "10000",
 intent: "buy-used",
 priceType: "retail",
 condition: "good",
 options: "392396|53635|78701|100|10|",
 price: "17074",
 manufacturer: "Nissan",
 model: "Altima",
 year: "2014",
 style: "2.5 S Sedan 4D",
 category: "",
 hasCpo: true,
 meetsCpoReq: true,
 showOthersPaid: false,
 data: {
 "values": {
 "cpo": {
 "priceMin": 17335.0,
 "price": 18275.0,
 "priceMax": 19214.0
 },
 "fpp": {
 "priceMin": 15286.0,
 "price": 17074.0,
 "priceMax": 18861.0
 },
 "privatepartyexcellent": {
 "priceMin": 0.0,
 "price": 16064.0,
 "priceMax": 0.0
 },
 "privatepartyfair": {
 "priceMin": 0.0,
 "price": 14081.0,
 "priceMax": 0.0
 },
 "privatepartygood": {
 "priceMin": 0.0,
 "price": 15454.0,
 "priceMax": 0.0
 },
 "privatepartyverygood": {
 "priceMin": 0.0,
 "price": 15715.0,
 "priceMax": 0.0
 },
 "retail": {
 "priceMin": 0.0,
 "price": 17875.0,
 "priceMax": 0.0
 }
 },
 “timAmount”:0.0,
 “monthlyPayments”:{
 “cpo”:{
 “vehiclePrice”:18275.0,
 “率”:2.9,
 “条款”:60.0,
 “taxAndTitle”:6.5,
 “downPay”:0.0,
 “金额”:348.0
 },
 “fpp”:{
 “vehiclePrice”:17074.0,
 “率”:4.9,
 “条款”:60.0,
 “taxAndTitle”:6.5,
 “downPay”:0.0,
 “金额”:342.0
 },
 “privatepartyexcellent”:{
 “vehiclePrice”:16064.0,
 “率”:4.9,
 “条款”:60.0,
 “taxAndTitle”:6.5,
 “downPay”:0.0,
 “金额”:322.0
 },
 “privatepartyfair”:{
 “vehiclePrice”:14081.0,
 “率”:4.9,
 “条款”:60.0,
 “taxAndTitle”:6.5,
 “downPay”:0.0,
 “金额”:282.0
 },
 “privatepartygood”:{
 “vehiclePrice”:15454.0,
 “率”:4.9,
 “条款”:60.0,
 “taxAndTitle”:6.5,
 “downPay”:0.0,
 “金额”:309.0
 },
 “privatepartyverygood”:{
 “vehiclePrice”:15715.0,
 “率”:4.9,
 “条款”:60.0,
 “taxAndTitle”:6.5,
 “downPay”:0.0,
 “金额”:315.0
 },
 “零售”:{
 “vehiclePrice”:17875.0,
 “率”:4.9,
 “条款”:60.0,
 “taxAndTitle”:6.5,
 “downPay”:0.0,
 “金额”:358.0
 }
 },
 “规模”:{
 “scaleLow”:14081.0,
 “scaleHigh”:19214.0
 },
 “交易”:{
 “下面”:7,
 “介于”之间:17,
 “上方”:3
 }
 },
 adPriceRanges: {"AdPriceRange":[{"PriceMin":0,"PriceMax":8499,"AdPRValue":1},{"PriceMin":8500,"PriceMax":18499,"AdPRValue":2},{"PriceMin":18500,"PriceMax":23499,"AdPRValue":3},{"PriceMin":23500,"PriceMax":28499,"AdPRValue":4},{"PriceMin":28500,"PriceMax":33499,"AdPRValue":5},{"PriceMin":33500,"PriceMax":38499,"AdPRValue":6},{"PriceMin":38500,"PriceMax":43499,"AdPRValue":7},{"PriceMin":43500,"PriceMax":48499,"AdPRValue":8},{"PriceMin":48500,"PriceMax":53499,"AdPRValue":9},{"PriceMin":53500,"PriceMax":63499,"AdPRValue":10},{"PriceMin":63500,"PriceMax":73499,"AdPRValue":11},{"PriceMin":73500,"PriceMax":1000000,"AdPRValue":12}]}});
 });
 $( '脚音符')隐藏();
 $(window).on('popstate',function(){
 KBB.Vehicle.Pages.PricingOverview.Buyers.stateChangeHandler();
});
 </script>


 Scrapy代码:

from scrapy.spider import BaseSpider
from scrapy.selector import Selector
import scrapy

from kbb.items import kbbItem

 class kbbSpider(scrapy.Spider):
     """Spider that reads used-car pricing from a kbb.com vehicle page.

     The prices are not present as HTML elements. They are embedded in an
     inline ``<script>`` as the ``data`` option passed to
     ``KBB.Vehicle.Pages.PricingOverview.Buyers.setup({...})``, so the spider
     locates that script, cuts the ``data`` object out with a regular
     expression and parses it as JSON instead of slicing the page text by
     position.
     """

     name = "kbb"
     allowed_domains = ["kbb.com"]
     start_urls = [
         "http://www.kbb.com/nissan/altima/2014/25-s-sedan-4d/?vehicleid=392396&intent=buy-used&10000&good&pricetype=retail"
     ]

     # JavaScript call whose argument carries the pricing data.
     _SETUP_CALL = "KBB.Vehicle.Pages.PricingOverview.Buyers.setup"
     # Captures the object literal that follows ``data: `` and ends right
     # before the ``adPriceRanges`` option of the same setup() call.
     _DATA_PATTERN = (
         r"KBB\.Vehicle\.Pages\.PricingOverview\.Buyers\.setup\("
         r".*?data: ({.*?}),\W+adPriceRanges"
     )

     def parse(self, response):
         """Build a ``kbbItem`` from the pricing data embedded in the page.

         Args:
             response: the Scrapy response for one of ``start_urls``.

         Returns:
             A ``kbbItem`` with ``priceMin`` set to the numeric
             ``values.fpp.priceMin`` entry of the embedded data, or ``None``
             when the pricing script is missing or cannot be parsed.
         """
         # Local imports keep this block self-contained; both are stdlib.
         import json
         import re

         pattern = re.compile(self._DATA_PATTERN, re.MULTILINE | re.DOTALL)
         script_text = Selector(response).xpath(
             "//script[contains(., '%s')]/text()" % self._SETUP_CALL
         )
         matches = script_text.re(pattern)
         if not matches:
             self.log("pricing setup script not found on %s" % response.url)
             return None

         # NOTE(review): this JS comment appears in the setup() call of the
         # sample page; strip it in case it falls inside the captured object,
         # because JSON does not allow comments. Confirm against a live page.
         raw = matches[0].replace(
             "//Workaround until we get cross domain working for Flash", ""
         )

         try:
             pricing = json.loads(raw)
             fpp_prices = pricing['values']['fpp']
             price_min = fpp_prices['priceMin']
         except (ValueError, KeyError):
             # The page layout changed or the captured text is not valid JSON.
             self.log("could not read pricing data on %s" % response.url)
             return None

         item = kbbItem()
         item['priceMin'] = price_min
         # Other entries (e.g. fpp_prices['price'], fpp_prices['priceMax'],
         # pricing['values']['retail']['price']) can be copied the same way
         # once kbbItem declares matching fields.
         return item



 我最终想把 fpp 中的 priceMin、price 和 priceMax 字段,以及 retail 中的 price 字段填充到我的 item 里。目前我是靠索引来取这些值的,但想知道是否有更简单的方法。
答案 0 :(得分:7)
问题在于所需的数据位于 JavaScript 代码中。此外,你目前依赖索引的做法非常脆弱且不可靠。
思路是:先找到包含所需数据的 script 标签,用正则表达式(regular expressions)提取出包含价格的对象/字典,再借助 json 模块把该对象加载为 Python 字典,最后从中取出所需的信息。
下面是在交互式 shell 中的演示:
In [1]: import re
In [2]: import json
In [3]: pattern = re.compile(r"KBB\.Vehicle\.Pages\.PricingOverview\.Buyers\.setup\(.*?data: ({.*?}),\W+adPriceRanges", re.MULTILINE | re.DOTALL)
In [4]: data = response.xpath("//script[contains(., 'KBB.Vehicle.Pages.PricingOverview.Buyers.setup')]/text()").re(pattern)[0]
In [5]: data = data.replace("//Workaround until we get cross domain working for Flash", "")
In [6]: data_obj = json.loads(data)
In [7]: data_obj['values']['fpp']
Out[7]: {u'price': 15569.0, u'priceMax': 17356.0, u'priceMin': 13781.0}
In [8]: data_obj['values']['retail']
Out[8]: {u'price': 16370.0, u'priceMax': 0.0, u'priceMin': 0.0}