在运行scrapy之前插入多个输入字段

时间:2019-03-17 21:56:01

标签: python pyqt scrapy pyqt5

我参考了一个与我的GUI应用程序类似的Stack Overflow回答。不过我的scrapy应用程序有些不同:运行该程序时,系统会提示用户输入要搜索的关键字

看起来像这样

enter image description here

我试图将这种逻辑放在GUI上,但是不确定如何执行。

这是gui的样子。

enter image description here

我希望能够输入字段,以便用户在处理scrapy脚本之前可以输入所需的信息。

这是scrapy脚本的一部分

my_spider.py

import scrapy
import sys
import random
import csv
from scrape.items import Item
from var_dump import var_dump


# Read the search keyword and six locations from standard input.
# NOTE(review): these input() calls sit at module top level, so they execute
# as soon as this module is loaded, before any spider class is instantiated.
search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second Location:")
third_location = input("Third Location:")
fourth_location = input("Fourth Location:")
fifth_location = input("Fifth Location:")
sixth_location = input("Sixth Location:")




# city = [
#     "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "Fort Worth", 
#     "San Diego", "Dallas", "San Jose", "Austin", "Columbus", "Indianapolis",  "Seattle", "St. Paul", "Nashville", 
#     "Louisville", "Plano"
# ]

# rancity = random.choice(city)


class YellowSpider(scrapy.Spider):


    name = "yellow"

    # start_urls = [
    #     "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location
    #     # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location,
    #     # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location
    # ]

    def start_requests(self):
        """Yield one search request per location, each with its own callback.

        The search keyword and the six locations are read from the
        module-level variables of this file. The query values are URL-encoded
        so that spaces, ``&`` and other reserved characters typed by the user
        cannot corrupt the query string.
        """
        # Imported locally so the method does not depend on the file header.
        from urllib.parse import urlencode

        base_url = "https://www.yellowpages.com/search?"
        # Same order and same location/callback pairing as before.
        targets = (
            (location, self.parse),
            (second_location, self.parse2),
            (third_location, self.parse3),
            (fourth_location, self.parse4),
            (fifth_location, self.parse5),
            (sixth_location, self.parse6),
        )
        for place, callback in targets:
            query = urlencode({
                "search_terms": search_item,
                "geo_location_terms": place,
            })
            yield scrapy.Request(base_url + query, callback)
        # yield scrapy.Request('http://www.example.com/3.html', self.parse)

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and the de-duplication lists.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so the base class is initialised properly and spider arguments
        (``scrapy crawl ... -a key=value``) are accepted instead of raising
        ``TypeError``. Calling it with no arguments still works.
        """
        super(YellowSpider, self).__init__(*args, **kwargs)
        # Values already encountered; presumably consulted by the profile
        # callbacks (not visible in this excerpt) to skip duplicates.
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def parse(self, response):
        """Follow every business link on a result page, then the pagination.

        Business profile links are handled by ``self.businessprofile``;
        pagination links are fed back into this method.
        """
        profile_links = response.css('div.v-card a.business-name::attr(href)')
        yield from (
            response.follow(link, self.businessprofile) for link in profile_links
        )

        page_links = response.css('div.pagination a::attr(href)')
        yield from (response.follow(link, self.parse) for link in page_links)

    def parse2(self, response):
        """Follow every business link on a result page, then the pagination.

        Business profile links are handled by ``self.businessprofile2``;
        pagination links are fed back into this method.
        """
        profile_links = response.css('div.v-card a.business-name::attr(href)')
        for link in profile_links:
            yield response.follow(link, self.businessprofile2)

        page_links = response.css('div.pagination a::attr(href)')
        for link in page_links:
            yield response.follow(link, self.parse2)

    def parse3(self, response):
        """Follow every business link on a result page, then the pagination.

        Business profile links are handled by ``self.businessprofile3``;
        pagination links are fed back into this method.
        """
        profile_selector = 'div.v-card a.business-name::attr(href)'
        for link in response.css(profile_selector):
            yield response.follow(link, self.businessprofile3)

        pagination_selector = 'div.pagination a::attr(href)'
        for link in response.css(pagination_selector):
            yield response.follow(link, self.parse3)
        ........

这是GUI

main.py

from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets

class ScrapyWorker(QtCore.QObject):
    """Run the ``scrapy`` command in a QProcess and relay its output.

    Signals:
        logChanged(str): text read from the process output.
        started(): forwarded from the underlying QProcess.
        finished(): forwarded from the underlying QProcess.
    """
    logChanged = QtCore.pyqtSignal(str)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        # Merge stderr into stdout so one read returns all of the output.
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.setProgram('scrapy')
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, spider):
        """Start ``scrapy crawl <spider>`` with *project* as working directory."""
        self._process.setWorkingDirectory(project)
        self._process.setArguments(['crawl', spider])
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        """Emit the newly available process output through ``logChanged``."""
        # errors='replace': a read can end in the middle of a multi-byte
        # character; a strict decode would raise inside this slot.
        data = self._process.readAllStandardOutput().data().decode(errors='replace')
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        """Kill the running process."""
        self._process.kill()

    def spiders(self, project):
        """Return the whitespace-split output of ``scrapy list`` run in *project*.

        A local event loop is used to wait for the command. An empty list is
        returned when the command produces no output, including when it
        could not be started.
        """
        process = QtCore.QProcess()
        process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        process.setWorkingDirectory(project)
        loop = QtCore.QEventLoop()
        process.finished.connect(loop.quit)
        # finished() is not emitted when the program fails to start (e.g.
        # scrapy is not on PATH); without this the loop would never quit.
        process.errorOccurred.connect(loop.quit)
        process.start('scrapy', ['list'])
        # If start() already failed synchronously, the error signal fired
        # before the loop existed, so there is nothing left to wait for.
        if process.state() != QtCore.QProcess.NotRunning:
            loop.exec_()
        return process.readAllStandardOutput().data().decode(errors='replace').split()

class MainWindow(QtWidgets.QMainWindow):
    """Window that selects a Scrapy project, starts/stops a crawl and shows its log."""

    def __init__(self, parent=None):
        super(MainWindow, self).__init__(parent)

        self.project_le = QtWidgets.QLineEdit()
        self.project_button = QtWidgets.QPushButton('Select Project')
        # NOTE(review): the combobox is filled in select_project() and read
        # in on_checked(), but it is never added to a layout in this class.
        self.spider_combobox = QtWidgets.QComboBox()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()
        self.input = QtWidgets.QLineEdit()
        self.input1 = QtWidgets.QLineEdit()
        self.input2 = QtWidgets.QLineEdit()
        self.input3 = QtWidgets.QLineEdit()
        self.input4 = QtWidgets.QLineEdit()
        self.input5 = QtWidgets.QLineEdit()
        self.input6 = QtWidgets.QLineEdit()

        container = QtWidgets.QWidget()
        self.setCentralWidget(container)
        root_layout = QtWidgets.QVBoxLayout(container)

        # First row: project path field and the button that opens the dialog.
        project_row = QtWidgets.QHBoxLayout()
        project_row.addWidget(self.project_le)
        project_row.addWidget(self.project_button)
        root_layout.addLayout(project_row)

        # One "label + line edit" row per input field, in display order.
        labelled_inputs = (
            ("Input The Search Item :", self.input),
            ("Location :", self.input1),
            ("Location 2 :", self.input2),
            ("Location 3 :", self.input3),
            ("Location 4 :", self.input4),
            ("Location 5 :", self.input5),
            ("Location 6 :", self.input6),
        )
        for caption, editor in labelled_inputs:
            row = QtWidgets.QHBoxLayout()
            row.addWidget(QtWidgets.QLabel(caption))
            row.addWidget(editor, 1)
            root_layout.addLayout(row)

        root_layout.addWidget(self.start_stop_button)
        root_layout.addWidget(self.text_edit)

        # Start stays disabled until select_project() finds at least one spider.
        self.start_stop_button.setEnabled(False)

        self.scrapy_worker = ScrapyWorker(self)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        # Un-check the toggle button when the process ends.
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))

        self.start_stop_button.toggled.connect(self.on_checked)
        self.project_button.clicked.connect(self.select_project)
        self.resize(640, 480)

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Start the selected spider when checked, stop the worker otherwise."""
        if not state:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()
            return

        cfg_path = self.project_le.text()
        project_dir = QtCore.QFileInfo(cfg_path).dir().absolutePath()
        self.scrapy_worker.run(project_dir, self.spider_combobox.currentText())
        self.start_stop_button.setText('Stop')

    @QtCore.pyqtSlot()
    def select_project(self):
        """Ask for a .cfg file and load the spiders of its directory."""
        cfg_path, _ = QtWidgets.QFileDialog.getOpenFileName(
            self,
            "Select .cfg file",
            QtCore.QDir.currentPath(),
            "Configure File (*.cfg)"
        )
        if not cfg_path:
            return

        self.project_le.setText(cfg_path)
        project_dir = QtCore.QFileInfo(cfg_path).dir().absolutePath()
        available = self.scrapy_worker.spiders(project_dir)
        self.spider_combobox.clear()
        self.spider_combobox.addItems(available)
        self.start_stop_button.setEnabled(bool(available))

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Append *text* at the end of the log, then restore the previous cursor."""
        saved_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(saved_cursor)

if __name__ == '__main__':
    import sys

    # Create the application, show the main window and run the event loop.
    application = QtWidgets.QApplication(sys.argv)
    application.setStyle('fusion')
    window = MainWindow()
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)

1 个答案:

答案 0 :(得分:2)

首先,您必须修改爬虫(spider),使其直接从命令行参数接收数据,而不是使用input()函数:

yellowpage_spider.py

import json
import scrapy
from scrape.items import Item

class YellowSpider(scrapy.Spider):
    """Spider that searches yellowpages.com for one term in several locations.

    The search is configured through a ``parameters`` attribute holding a
    JSON object with the keys ``search_item`` and ``locations``; it is
    expected to come from ``scrapy crawl yellow -a parameters='...'``.
    """
    name = "yellow"

    def __init__(self, *args, **kwargs):
        super(YellowSpider, self).__init__(*args, **kwargs)
        # Only seen_emails is used in this class (see businessprofile).
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def start_requests(self):
        """Yield one search request per requested location.

        Nothing is yielded when the spider has no ``parameters`` attribute.
        """
        # Imported locally so the class does not depend on the file header.
        from urllib.parse import urlencode

        if not hasattr(self, 'parameters'):
            return
        parameters = json.loads(self.parameters)
        search_item = parameters['search_item']
        locations = parameters['locations']
        for location in locations:
            # URL-encode the user-supplied values so spaces, '&', '#', etc.
            # cannot break the query string.
            query = urlencode({
                "search_terms": search_item,
                "geo_location_terms": location,
            })
            url = "https://www.yellowpages.com/search?" + query
            yield scrapy.Request(url=url, callback=self.parse, meta={'location': location})

    def parse(self, response):
        """Follow business links and pagination links of a result page.

        The location carried in ``response.meta`` is passed on to every
        follow-up request.
        """
        location = response.meta['location']
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile, meta={'location': location})

        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse, meta={'location': location})

    def businessprofile(self, response):
        """Build an item from a business page and yield it per unseen e-mail.

        The item is yielded once for each e-mail address not seen before,
        and only when the item also has a phone number and a website.
        """
        location = response.meta['location']
        for business in response.css('header#main-header'):
            item = Item()
            item['business_name'] = business.css('div.sales-info h1::text').extract()

            websites = business.css('a.secondary-btn.website-link::attr(href)').extract()
            # NOTE(review): this stores the list's repr without the brackets,
            # so the quotes around each URL are kept. Left unchanged because
            # consumers of the output may rely on the current format.
            item['website'] = str(websites).strip('[]')

            item['location'] = location

            hrefs = business.css('a.email-business::attr(href)').extract()
            # Drops the first 7 characters of each href - presumably the
            # "mailto:" prefix; confirm against the page markup.
            item['email'] = [href[7:] for href in hrefs]

            item['phonenumber'] = business.css('p.phone::text').extract_first()

            # Without both a phone number and a website nothing is yielded.
            if not (item['phonenumber'] and item['website']):
                continue

            for email in item['email']:
                if email not in self.seen_emails:
                    self.seen_emails.append(email)
                    yield item

然后前面的代码需要一个名为parameters的参数:

scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany", "brazil"]}'

因此,在GUI中,我们现在必须根据GUI中的输入来构造这个参数:

gui.py

import os
import json
from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets
import utils


dir_path = os.path.dirname(os.path.abspath(__file__))
icons_dir = os.path.join(dir_path, 'assets', 'icons')


class ScrapyWorker(QtCore.QObject):
    """Run an external command in a QProcess and relay its output.

    Signals:
        logChanged(str): text read from the process output.
        started(): forwarded from the underlying QProcess.
        finished(): forwarded from the underlying QProcess.
    """
    logChanged = QtCore.pyqtSignal(str)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        # Merge stderr into stdout so one read returns all of the output.
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, program, arguments):
        """Start *program* with *arguments*, using *project* as working directory."""
        self._process.setWorkingDirectory(project)
        # Honour the caller-supplied program; it used to be ignored in
        # favour of a hard-coded 'scrapy'.
        self._process.setProgram(program)
        self._process.setArguments(arguments)
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        """Emit the newly available process output through ``logChanged``."""
        # errors='replace': a read can end in the middle of a multi-byte
        # character; a strict decode would raise inside this slot.
        data = self._process.readAllStandardOutput().data().decode(errors='replace')
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        """Kill the running process."""
        self._process.kill()

class LocationWidget(QtWidgets.QWidget):
    """Vertical list of location rows, each a line edit plus a tool button.

    The button of the last row adds a new row; the button of any other row
    removes that row.
    """

    def __init__(self, parent=None):
        super(LocationWidget, self).__init__(parent)
        self.widgets = []
        self.setContentsMargins(0, 0, 0, 0)
        self.lay = QtWidgets.QVBoxLayout(self)
        self.lay.setContentsMargins(0, 0, 0, 0)
        self.lay.addStretch()
        self.create_row()

    def create_row(self):
        """Append a new row (line edit + button showing the 'add' icon)."""
        row = QtWidgets.QWidget()
        row.setContentsMargins(0, 0, 0, 0)
        row_layout = QtWidgets.QHBoxLayout(row)
        row_layout.setContentsMargins(0, 0, 0, 0)

        tool_button = QtWidgets.QToolButton(clicked=self.on_clicled)
        tool_button.setFocusPolicy(QtCore.Qt.NoFocus)

        row_layout.addWidget(QtWidgets.QLineEdit())
        row_layout.addWidget(tool_button)

        tool_button.setIconSize(QtCore.QSize(24, 24))
        tool_button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))

        self.widgets.append(row)
        # Index -1 places the row at the end of the layout.
        self.lay.insertWidget(-1, row)

    @QtCore.pyqtSlot()
    def on_clicled(self):
        """Add a row if the last row's button was clicked, else remove that row."""
        clicked_row = self.sender().parentWidget()
        last_index = self.lay.count() - 1
        if self.lay.indexOf(clicked_row) == last_index:
            self.create_row()
        else:
            self.lay.removeWidget(clicked_row)
            clicked_row.deleteLater()
            self.widgets.remove(clicked_row)

        # Refresh the icons: 'remove' on every row except the last, which
        # gets 'add'.
        for row in self.widgets[:-1]:
            row.findChild(QtWidgets.QToolButton).setIcon(
                QtGui.QIcon(os.path.join(icons_dir, 'remove.png')))
        self.widgets[-1].findChild(QtWidgets.QToolButton).setIcon(
            QtGui.QIcon(os.path.join(icons_dir, 'add.png')))

    def get_locations(self):
        """Return the non-empty texts of the rows' line edits, in row order."""
        texts = (row.findChild(QtWidgets.QLineEdit).text() for row in self.widgets)
        return [text for text in texts if text]

class YellowWidget(QtWidgets.QMainWindow):
    """Main window: search field, location rows, start/stop button and log view."""

    def __init__(self, parent=None):
        super(YellowWidget, self).__init__(parent)
        self.setWindowTitle('Yellow Pages Scrapper')
        self.scrapy_worker = ScrapyWorker(self)
        self.search_item_le = QtWidgets.QLineEdit()
        self.location_widget = LocationWidget()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()

        container = QtWidgets.QWidget()
        self.setCentralWidget(container)
        grid = QtWidgets.QGridLayout(container)

        search_label = QtWidgets.QLabel("<b>Search:</b>")
        grid.addWidget(search_label, 0, 0)
        grid.addWidget(self.search_item_le, 0, 1)

        locations_label = QtWidgets.QLabel("<b>Locations:</b>")
        grid.addWidget(locations_label, 1, 0, alignment=QtCore.Qt.AlignTop|QtCore.Qt.AlignLeft)
        grid.addWidget(self.location_widget, 1, 1, alignment=QtCore.Qt.AlignTop)

        # The button and the log view each span both columns.
        grid.addWidget(self.start_stop_button, 2, 0, 1, 2)
        grid.addWidget(self.text_edit, 3, 0, 1, 2)

        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        # Un-check the toggle button when the process ends.
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))
        self.start_stop_button.toggled.connect(self.on_checked)

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Start a crawl from the form values when checked, stop it otherwise."""
        if not state:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()
            return

        # Equivalent command line:
        # scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany"]}'
        keyword = self.search_item_le.text()
        places = self.location_widget.get_locations()
        directory, program, args = utils.create_arguments(keyword, places)
        self.scrapy_worker.run(directory, program, args)
        self.start_stop_button.setText('Stop')

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Append *text* at the end of the log, then restore the previous cursor."""
        saved_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(saved_cursor)

if __name__ == '__main__':
    import sys

    # Create the application, show a 640x480 window and run the event loop.
    application = QtWidgets.QApplication(sys.argv)
    application.setStyle('fusion')
    window = YellowWidget()
    window.resize(640, 480)
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)

我使用了utils.py文件中的一个函数:

import os
import json

def create_arguments(search_item, locations):
    """Build the working directory, program and arguments for the crawl.

    Returns a ``(directory, program, arguments)`` tuple where *directory* is
    the ``scrape`` folder next to this file, *program* is ``'scrapy'`` and
    *arguments* runs the ``yellow`` spider with a single ``-a`` argument
    ``parameters=<JSON>`` carrying *search_item* and *locations*.
    """
    module_path = os.path.abspath(__file__)
    project_dir = os.path.join(os.path.dirname(module_path), 'scrape')
    payload = json.dumps({"search_item": search_item, "locations": locations})
    crawl_args = ['crawl', 'yellow', "-a", 'parameters={}'.format(payload)]
    return project_dir, 'scrapy', crawl_args

获得以下内容:

enter image description here

完整的项目是here