在我的 Scrapy 脚本中,我抓取网页源代码,并将输出保存为 XML 格式。在该 XML 输出中,内容的开头是:
"<!DOCTYPE html> <!--[if IE 7]><html lang="en" "
但是我希望 XML 输出的开头是:
" <![CDATA[<html lang="en" "
我该如何在 Scrapy 脚本中得到这样的输出?
我的代码如下:
import scrapy
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from Dell.items import DellItem
from scrapy.http.request import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CSVFeedSpider
class DellSpider(scrapy.Spider):
    """Spider that stores the raw source of each start URL in an item.

    For every downloaded response, ``parse`` returns one item whose
    ``source`` field holds the page body.
    """

    name = "dell"
    allowed_domains = ["dell.com"]
    start_urls = (
        'http://jobs.dell.com/united-states-jobs/',
    )

    def parse(self, response):
        """Return an item carrying the body of ``response``.

        Args:
            response: The response Scrapy downloaded for the request.

        Returns:
            The item found under ``response.meta['item']`` when the request
            carried one, otherwise a new ``DellItem``; in both cases its
            ``source`` field is set to ``response.body``.
        """
        # Requests built from ``start_urls`` carry no 'item' in their meta,
        # so indexing meta directly would raise KeyError; fall back to a
        # fresh item while still honouring one passed in by a caller.
        item = response.meta.get('item')
        if item is None:
            item = DellItem()
        # Scrapy has already downloaded the page, so use its body directly
        # rather than fetching ``response.url`` a second time.
        item['source'] = response.body
        return item