我的scrapy文件工作正常,它会爬网所有页面并返回抓取的数据。但是,现在我想将我的scrapy与flask集成在一起,希望从用户输入中获取URL,而我面临着AttributeError: 'str' object has no attribute 'text'
问题。
我已经尝试过crawlerProcess,crawlRunner和子流程,但是所有这些方法都卡在all_links = LxmlLinkExtractor(allow=(), unique=True, canonicalize=True).extract_links(URL)
这一行代码中。
这是我的scrap脚代码,任何帮助将不胜感激
ScrapeBot.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
import time
driver = webdriver.PhantomJS()
class scrapingBotSpider(CrawlSpider):
name = 'scrapingBot'
allowed_domains = ''
start_urls = ''
crawl_count = 0
max_url_value = 1000
failed_url = []
rules = [Rule(LxmlLinkExtractor(allow=()), callback="parse_item", follow=True)]
def parse_item(self, response):
self.crawl_count += 1
print("!!!!!!!!!!!!!!!!!!!")
print("RESPONSE ===========>", response)
print("Parsing {} page, Crawl_Count={}".format(response, self.crawl_count))
URL = response
all_links = LxmlLinkExtractor(allow=(), unique=True, canonicalize=True).extract_links(URL)
print("==============////////=====LXMLLINKEXTRACTOR=========/////////////////////")
print(all_links)
driver = webdriver.PhantomJS()
items = []
for link in all_links:
is_allowed = False
for allowed_domain in self.allowed_domains:
if allowed_domain in link.url:
is_allowed = True
if is_allowed:
print("=========>>")
driver.get(link.url)
# scrape here
from_title = response.xpath("//title/text()").extract()
items.append({ "title": from_title})
return items
# run spider
process = CrawlerProcess()
process.start()
process.crawl(scrapingBotSpider)
Flask.py
rom flask import Flask, request, Response, render_template, make_response
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapeWeb.spiders.scrapeBot import scrapingBotSpider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
import requests
app = Flask(__name__)
@app.route('/')
def crawl():
"""
Ui for crawling a website
:return:
"""
return render_template('scrper.html')
@app.route('/result', methods=['POST', 'GET'])
def result():
if request.method == 'POST':
driver = webdriver.PhantomJS()
link = request.form.get('url')
allow_url = request.form.get('domain')
scrape = scrapingBotSpider(CrawlSpider)
scrape.start_urls = link
scrape.allowed_domains = allow_url
print(scrape.start_urls, scrape.allowed_domains)
return scrape.parse_item(link)
if __name__ == "__main__":
app.run(host="0.0.0.0", port="5000", debug=True)
追踪
!!!!!!!!!!!!!!!!!!!
RESPONSE ===========> https://www.example.com/
Parsing https://www.example.com/ page, Crawl_Count=1
2020-09-08 13:20:41 [werkzeug] INFO: 127.0.0.1 - - [08/Sep/2020 13:20:41] "POST /result HTTP/1.1" 500 -
return scrape.parse_item(link)
File "/Users/user/tech/scrapeWeb/scrapeWeb/spiders/scrapeBot.py", line 61, in parse_item
all_links = LxmlLinkExtractor(allow=(), unique=True, canonicalize=True).extract_links(URL)
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/linkextractors/lxmlhtml.py", line 151, in extract_links
base_url = get_base_url(response)
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/utils/response.py", line 21, in get_base_url
text = response.text[0:4096]
AttributeError: 'str' object has no attribute 'text'
2020-09-08 13:20:41 [werkzeug] INFO: 127.0.0.1 - - [08/Sep/2020 13:20:41] "GET /result?__debugger__=yes&cmd=resource&f=debugger.js HTTP/1.1" 200 -
2020-09-08 13:20:41 [werkzeug] INFO: 127.0.0.1 - - [08/Sep/2020 13:20:41] "GET /result?__debugger__=yes&cmd=resource&f=style.css HTTP/1.1" 200 -
2020-09-08 13:20:41 [werkzeug] INFO: 127.0.0.1 - - [08/Sep/2020 13:20:41] "GET /result?__debugger__=yes&cmd=resource&f=jquery.js HTTP/1.1" 200 -
2020-09-08 13:20:41 [werkzeug] INFO: 127.0.0.1 - - [08/Sep/2020 13:20:41] "GET /result?__debugger__=yes&cmd=resource&f=console.png HTTP/1.1" 200 -