我有这段代码从网址中提取大量内容并遵循一些链接,但我想在网站上获取所有现有文本。在response.css()中,是否有一个允许您选择每个现有标记的属性?
import scrapy
from bs4 import BeautifulSoup
import nltk
import lxml.html
import pandas as pd
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class QuotesSpider(scrapy.Spider):
    """Crawl the start URLs, save all visible body text to one CSV-named
    file per site, and recursively follow links found inside <li> items.

    To grab *all* text on a page, select every descendant text node:
    ``response.css('body').xpath('.//text()')`` (equivalently
    ``response.css('body *::text')``).

    NOTE: the original code built ``Rule``/``LinkExtractor`` objects, but
    those only have an effect on ``CrawlSpider`` subclasses — on a plain
    ``scrapy.Spider`` they are dead code, so they have been removed.
    """

    name = "dialpad"

    # Must be a class attribute of bare domain names (not full URLs),
    # otherwise Scrapy ignores it and follows links off-site.
    allowed_domains = ["dialpad.com", "domo.com"]

    def start_requests(self):
        """Yield one initial request per start URL."""
        # Pass in your URLs here.
        urls = [
            'https://www.dialpad.com/',
            'https://www.domo.com/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Append every text node of the page to the site's file and
        follow each link found in a list item."""
        # Site name, e.g. "dialpad" out of "https://www.dialpad.com/".
        page = response.url.split(".")[1]
        filename = 'quotes-thing-{}.csv'.format(page)
        # Open in append mode so text from every crawled page of the
        # site accumulates; 'w' would overwrite the file on each page.
        with open(filename, 'a') as f:
            for node in response.css('body').xpath('.//text()'):
                f.write(node.extract())
        # Follow every link found inside a list item.
        for href in response.css("li > a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse)
答案 0（得分：3）：
下面的代码会列出数据中出现的所有标签（tag）名称。
如果只想提取某个特定属性（例如 'href'），则应在项目中改用针对该属性的选择器；通用做法如下：
# Example: collect the name of every tag present in an HTML document.
# (The original snippet's string contained "... " REPL continuation
# prompts pasted by accident; they are removed here.)
data = """
<html><head>
"""
# An explicit parser avoids bs4's GuessedAtParserWarning and keeps the
# parse result deterministic across environments.
soup = BeautifulSoup(data, "html.parser")
tag_list = []
for tag in soup.find_all():
    print(tag.name)
    tag_list.append(tag.name)