Question

我的scrapy文件工作正常，它会爬网所有页面并返回抓取的数据。但是，现在我想将我的scrapy与flask集成在一起，希望从用户输入中获取URL，而我面临着AttributeError: 'str' object has no attribute 'text'问题。我已经尝试过crawlerProcess，crawlRunner和子流程，但是所有这些方法都卡在all_links = LxmlLinkExtractor(allow=(), unique=True, canonicalize=True).extract_links(URL)这一行代码中。这是我的scrap脚代码，任何帮助将不胜感激

ScrapeBot.py

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
import time

driver = webdriver.PhantomJS()

class scrapingBotSpider(CrawlSpider):
    name = 'scrapingBot'
    allowed_domains = ''
    start_urls = ''
    crawl_count = 0
    max_url_value = 1000
    failed_url = []
    rules = [Rule(LxmlLinkExtractor(allow=()), callback="parse_item", follow=True)]
    
def parse_item(self, response):
        self.crawl_count += 1
        print("!!!!!!!!!!!!!!!!!!!")
        print("RESPONSE ===========>", response)
        print("Parsing {} page, Crawl_Count={}".format(response, self.crawl_count))
        URL = response
        all_links = LxmlLinkExtractor(allow=(), unique=True, canonicalize=True).extract_links(URL)
        print("==============////////=====LXMLLINKEXTRACTOR=========/////////////////////")
        print(all_links)
        driver = webdriver.PhantomJS()
        items = []
        for link in all_links:
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
                if is_allowed:
                    print("=========>>")
                    driver.get(link.url)
                   # scrape here
                   from_title = response.xpath("//title/text()").extract()
                   items.append({ "title": from_title})

        return items

# run spider
process = CrawlerProcess()
process.start()
process.crawl(scrapingBotSpider)

Flask.py

rom flask import Flask, request, Response, render_template, make_response
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapeWeb.spiders.scrapeBot import scrapingBotSpider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
import requests

app = Flask(__name__)


@app.route('/')
def crawl():
    """
    Ui for crawling a website
    :return:
    """
    return render_template('scrper.html')

@app.route('/result', methods=['POST', 'GET'])
def result():
    if request.method == 'POST':
        driver = webdriver.PhantomJS()
        link = request.form.get('url')
        allow_url = request.form.get('domain')
         scrape = scrapingBotSpider(CrawlSpider)
        scrape.start_urls = link
        scrape.allowed_domains = allow_url
        print(scrape.start_urls, scrape.allowed_domains)

        return scrape.parse_item(link)

if __name__ == "__main__":
    app.run(host="0.0.0.0", port="5000", debug=True)

追踪

!!!!!!!!!!!!!!!!!!!
RESPONSE ===========> https://www.example.com/
Parsing https://www.example.com/ page, Crawl_Count=1
2020-09-08 13:20:41 [werkzeug] INFO: 127.0.0.1 - - [08/Sep/2020 13:20:41] "POST /result HTTP/1.1" 500 -
   return scrape.parse_item(link)
  File "/Users/user/tech/scrapeWeb/scrapeWeb/spiders/scrapeBot.py", line 61, in parse_item
    all_links = LxmlLinkExtractor(allow=(), unique=True, canonicalize=True).extract_links(URL)
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/linkextractors/lxmlhtml.py", line 151, in extract_links
    base_url = get_base_url(response)
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/utils/response.py", line 21, in get_base_url
    text = response.text[0:4096]
AttributeError: 'str' object has no attribute 'text'
2020-09-08 13:20:41 [werkzeug] INFO: 127.0.0.1 - - [08/Sep/2020 13:20:41] "GET /result?__debugger__=yes&cmd=resource&f=debugger.js HTTP/1.1" 200 -
2020-09-08 13:20:41 [werkzeug] INFO: 127.0.0.1 - - [08/Sep/2020 13:20:41] "GET /result?__debugger__=yes&cmd=resource&f=style.css HTTP/1.1" 200 -
2020-09-08 13:20:41 [werkzeug] INFO: 127.0.0.1 - - [08/Sep/2020 13:20:41] "GET /result?__debugger__=yes&cmd=resource&f=jquery.js HTTP/1.1" 200 -
2020-09-08 13:20:41 [werkzeug] INFO: 127.0.0.1 - - [08/Sep/2020 13:20:41] "GET /result?__debugger__=yes&cmd=resource&f=console.png HTTP/1.1" 200 -

AttributeError：“ str”对象在返回烧瓶的输出时没有属性“ text” =>

0 个答案: