我如何使用scrapy在python中提取确切的html内容

时间:2019-03-07 07:25:44

标签: python-3.x web-scraping scrapy python-requests

我是一个草率的初学者,我无法获得确切的html内容来提取所需的划分。

如何使用scrapy在python中提取确切的html内容?我尝试使用requests库和scrapy请求。但是无法获取页面的确切HTML内容。我想在scrapy笔记本中使用Spyder / Jupiter

尝试了以下不同的请求,但是没有用。

# My Code:

import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = 'https://www.rbauction.com/construction?cid=3279191388',
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)


from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())

process.crawl(QuotesSpider)
process.start()
process.stop()
import requests
from scrapy.http import TextResponse
import scrapy

url = 'https://www.rbauction.com/construction?cid=3279191388'
a = scrapy.Request(url)
from scrapy.http import HtmlResponse
a = HtmlResponse(url)
a.xpath('//*[@id="rba--category-page"]/div[3]/div[2]/div[1]/dl/dd[6]/a').extract()
import requests
import json
from lxml import html
import requests
source = requests.get("https://www.rbauction.com/construction?cid=3279191388")
tree = html.fromstring(source.content)
import urllib.request

opener = urllib.request.FancyURLopener({})
url = "https://www.rbauction.com/construction?cid=3279191388"
f = opener.open(url)
content = f.read()
import scrapy
class QuotesSpider(scrapy.Spider):
     name = "quotes"
     def start_requests(self):
           url = [
               'https://www.rbauction.com/construction?cid=3279191388'
           ]
           for url1 in url:
               yield scrapy.Request(url=url1, callback=self.parse)

def parse(self, response):
     for href in response.xpath('//*[@id="rba--category-page"]/div[3]/div[2]/div[1]/dl/dd[6]/a'):
         full_url = response.urljoin(href.extract())
         yield scrapy.Request(full_url, callback=self.parse_item)



from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings`enter code here`

process = CrawlerProcess(get_project_settings())

    #enter code here

process.crawl(QuotesSpider)
process.start() # the script will block here until the crawling is finished
process.stop()

0 个答案:

没有答案