I'm a beginner and I can't get the exact HTML content I need in order to extract the required div. How do I extract a page's exact HTML content in Python using Scrapy? I tried the requests library and Scrapy requests, but I couldn't get the page's exact HTML content. I want to run Scrapy from a Spyder / Jupyter notebook.
I tried the different approaches below, but none of them worked.
# My Code:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = ['https://www.rbauction.com/construction?cid=3279191388']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Save the raw response body to a local HTML file.
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

process = CrawlerProcess(get_project_settings())
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl is finished
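One pitfall when running this from a Spyder / Jupyter notebook: CrawlerProcess drives Twisted's reactor, which cannot be restarted inside the same Python kernel, so re-running the cell raises ReactorNotRestartable. A minimal sketch of catching that error (the try/except wrapper is my addition, not part of the original attempt):

from twisted.internet.error import ReactorNotRestartable

try:
    process.start()
except ReactorNotRestartable:
    # A reactor from an earlier run of this cell is still installed;
    # restart the kernel (or run the crawl in a subprocess) to crawl again.
    pass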
#
import scrapy
from scrapy.http import HtmlResponse

url = 'https://www.rbauction.com/construction?cid=3279191388'
a = scrapy.Request(url)  # a Request object alone never downloads anything
a = HtmlResponse(url)    # constructed with an empty body, so XPath matches nothing
a.xpath('//*[@id="rba--category-page"]/div[3]/div[2]/div[1]/dl/dd[6]/a').extract()
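If the goal is to run Scrapy selectors over a page fetched with requests, the downloaded bytes have to be passed into the response object; a minimal sketch, assuming the element exists in the raw (non-JavaScript) HTML:

import requests
from scrapy.http import HtmlResponse

url = 'https://www.rbauction.com/construction?cid=3279191388'
r = requests.get(url)
# Wrap the downloaded body so .xpath() runs against real content.
response = HtmlResponse(url=url, body=r.content)
response.xpath('//*[@id="rba--category-page"]/div[3]/div[2]/div[1]/dl/dd[6]/a').extract()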
#
import requests
from lxml import html

source = requests.get("https://www.rbauction.com/construction?cid=3279191388")
tree = html.fromstring(source.content)
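To check whether the target node exists in the raw HTML at all, the same XPath from the attempts above can be run against the lxml tree:

# Returns an empty list if the <dd> link is not present in the raw,
# non-JavaScript-rendered HTML that requests downloads.
nodes = tree.xpath('//*[@id="rba--category-page"]/div[3]/div[2]/div[1]/dl/dd[6]/a')
print(len(nodes))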
#
import urllib.request

url = "https://www.rbauction.com/construction?cid=3279191388"
opener = urllib.request.FancyURLopener({})  # deprecated since Python 3.3
f = opener.open(url)
content = f.read()
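For reference, the non-deprecated equivalent using urllib.request.urlopen, as a minimal sketch:

import urllib.request

url = "https://www.rbauction.com/construction?cid=3279191388"
with urllib.request.urlopen(url) as f:
    content = f.read()  # raw bytes of the server response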
#
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = ['https://www.rbauction.com/construction?cid=3279191388']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Follow the link inside the targeted <dd>; select @href so that
        # urljoin() receives a URL string rather than the <a> element's HTML.
        for href in response.xpath('//*[@id="rba--category-page"]/div[3]/div[2]/div[1]/dl/dd[6]/a/@href'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_item)

    def parse_item(self, response):
        # Placeholder: the original snippet referenced parse_item without defining it.
        pass

process = CrawlerProcess(get_project_settings())
process.crawl(QuotesSpider)
process.start()  # the script will block here until the crawling is finished
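A quick way to tell whether the real problem is JavaScript rendering (a common cause when the browser shows an element that none of these downloads contain) is to search the raw HTML for the target id; a minimal sketch:

import requests

raw = requests.get('https://www.rbauction.com/construction?cid=3279191388').text
# If this prints False, the element is injected by JavaScript after page load,
# so plain requests/Scrapy downloads will never contain it.
print('rba--category-page' in raw)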