我参考的是一个与我的 GUI 应用程序类似的 Stack Overflow 回答。我的 Scrapy 应用程序有点不同:执行该应用程序时,系统会提示用户输入要搜索的关键字。
看起来像这样
我试图将这种逻辑放在GUI上,但是不确定如何执行。
这是gui的样子。
我希望能够输入字段,以便用户在处理scrapy脚本之前可以输入所需的信息。
这是 Scrapy 脚本的一部分
my_spider.py
import scrapy
import sys
import random
import csv
from scrape.items import Item
from var_dump import var_dump
# Interactive prompts executed at module-import time: when Scrapy loads this
# spider module, the user is asked for a search term and up to six locations
# before the crawl starts.
# NOTE(review): module-level input() blocks any non-interactive runner (such
# as a GUI launching `scrapy crawl` via QProcess) — this is the problem the
# accepted answer below solves by passing the values as a spider argument.
search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second Location:")
third_location = input("Third Location:")
fourth_location = input("Fourth Location:")
fifth_location = input("Fifth Location:")
sixth_location = input("Sixth Location:")
# Commented-out experiment: pick a random city instead of prompting the user.
# city = [
# "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "Fort Worth",
# "San Diego", "Dallas", "San Jose", "Austin", "Columbus", "Indianapolis", "Seattle", "St. Paul", "Nashville",
# "Louisville", "Plano"
# ]
# rancity = random.choice(city)
class YellowSpider(scrapy.Spider):
    """Yellow Pages spider seeded from the module-level input() prompts."""

    # Spider name used on the command line: `scrapy crawl yellow`.
    name = "yellow"
    # Earlier approach: seed the crawl via start_urls built from the prompts.
    # Superseded by start_requests() below, which attaches a distinct callback
    # per location.
    # start_urls = [
    # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location
    # # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location,
    # # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location
    # ]
def start_requests(self):
    """Kick off one Yellow Pages search per prompted location.

    Each location gets its own callback (parse .. parse6) so results from
    different locations are handled by separate pipelines.
    """
    prefix = "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms="
    searches = [
        (location, self.parse),
        (second_location, self.parse2),
        (third_location, self.parse3),
        (fourth_location, self.parse4),
        (fifth_location, self.parse5),
        (sixth_location, self.parse6),
    ]
    for place, callback in searches:
        yield scrapy.Request(prefix + place, callback)
    # yield scrapy.Request('http://www.example.com/3.html', self.parse)
def __init__(self):
    """Initialise per-crawl de-duplication lists (values already emitted)."""
    for tracker in ('seen_business_names', 'seen_phonenumbers',
                    'seen_websites', 'seen_emails'):
        setattr(self, tracker, [])
def parse(self, response):
    """Follow each business-profile link on a result page, then paginate."""
    profile_links = response.css('div.v-card a.business-name::attr(href)')
    for link in profile_links:
        yield response.follow(link, self.businessprofile)
    pagination_links = response.css('div.pagination a::attr(href)')
    for link in pagination_links:
        yield response.follow(link, self.parse)
def parse2(self, response):
    """Result-page handler for the second location; paginates via itself."""
    profile_links = response.css('div.v-card a.business-name::attr(href)')
    for link in profile_links:
        yield response.follow(link, self.businessprofile2)
    pagination_links = response.css('div.pagination a::attr(href)')
    for link in pagination_links:
        yield response.follow(link, self.parse2)
def parse3(self, response):
    """Result-page handler for the third location; paginates via itself."""
    profile_links = response.css('div.v-card a.business-name::attr(href)')
    for link in profile_links:
        yield response.follow(link, self.businessprofile3)
    pagination_links = response.css('div.pagination a::attr(href)')
    for link in pagination_links:
        yield response.follow(link, self.parse3)
........
这是GUI
main.py
from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets
class ScrapyWorker(QtCore.QObject):
    """Runs the `scrapy` CLI as a QProcess and relays its output and
    lifecycle as Qt signals so the GUI stays responsive."""

    logChanged = QtCore.pyqtSignal(str)  # chunk of merged stdout/stderr text
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        # Merge stderr into stdout so a single signal carries the whole log.
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.setProgram('scrapy')
        # Re-emit the process lifecycle as this object's own signals.
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, spider):
        """Start `scrapy crawl <spider>` with the project directory as cwd."""
        self._process.setWorkingDirectory(project)
        self._process.setArguments(['crawl', spider])
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        """Forward any pending process output as a logChanged signal."""
        data = self._process.readAllStandardOutput().data().decode()
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        """Forcefully terminate the running crawl."""
        self._process.kill()

    def spiders(self, project):
        """Return the spider names reported by `scrapy list` (blocking).

        Blocks with a local event loop until the child process finishes.
        """
        process = QtCore.QProcess()
        process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        process.setWorkingDirectory(project)
        loop = QtCore.QEventLoop()
        process.finished.connect(loop.quit)
        # Bug fix: if the `scrapy` executable cannot be started at all,
        # `finished` never fires and the local event loop would spin forever;
        # quit on process errors too.
        process.errorOccurred.connect(loop.quit)
        process.start('scrapy', ['list'])
        loop.exec_()
        return process.readAllStandardOutput().data().decode().split()
class MainWindow(QtWidgets.QMainWindow):
    """GUI front-end: pick a scrapy project (.cfg), choose a spider, and
    start/stop crawls while streaming the log into a text browser."""

    def __init__(self, parent=None):
        super(MainWindow, self).__init__(parent)
        self.project_le = QtWidgets.QLineEdit()
        self.project_button = QtWidgets.QPushButton('Select Project')
        self.spider_combobox = QtWidgets.QComboBox()
        # Checkable button: checked means a crawl is running (see on_checked).
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()
        # Search-term field plus six location fields.  Kept as the individual
        # attributes input, input1..input6 for backward compatibility.
        self.input = QtWidgets.QLineEdit()
        for i in range(1, 7):
            setattr(self, 'input%d' % i, QtWidgets.QLineEdit())

        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)
        lay = QtWidgets.QVBoxLayout(central_widget)

        hlay = QtWidgets.QHBoxLayout()
        hlay.addWidget(self.project_le)
        hlay.addWidget(self.project_button)
        lay.addLayout(hlay)

        hlay2 = QtWidgets.QHBoxLayout()
        hlay2.addWidget(QtWidgets.QLabel("Input The Search Item :"))
        hlay2.addWidget(self.input, 1)
        lay.addLayout(hlay2)

        # One labelled row per location field, built in a loop instead of the
        # original six copy-pasted hlay3..hlay8 blocks.
        labels = ["Location :", "Location 2 :", "Location 3 :",
                  "Location 4 :", "Location 5 :", "Location 6 :"]
        for i, text in enumerate(labels, start=1):
            row = QtWidgets.QHBoxLayout()
            row.addWidget(QtWidgets.QLabel(text))
            row.addWidget(getattr(self, 'input%d' % i), 1)
            lay.addLayout(row)

        lay.addWidget(self.start_stop_button)
        lay.addWidget(self.text_edit)

        # Disabled until a project with at least one spider is selected.
        self.start_stop_button.setEnabled(False)

        self.scrapy_worker = ScrapyWorker(self)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        # Un-check the button automatically when the crawl finishes.
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))
        self.start_stop_button.toggled.connect(self.on_checked)
        self.project_button.clicked.connect(self.select_project)
        self.resize(640, 480)

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Toggled on: launch the selected spider; toggled off: kill it."""
        if state:
            filename = self.project_le.text()
            finfo = QtCore.QFileInfo(filename)
            directory = finfo.dir().absolutePath()
            self.scrapy_worker.run(directory, self.spider_combobox.currentText())
            self.start_stop_button.setText('Stop')
        else:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()

    @QtCore.pyqtSlot()
    def select_project(self):
        """Ask for a scrapy .cfg file and populate the spider combobox."""
        filename, _ = QtWidgets.QFileDialog.getOpenFileName(
            self,
            "Select .cfg file",
            QtCore.QDir.currentPath(),
            "Configure File (*.cfg)"
        )
        if filename:
            self.project_le.setText(filename)
            finfo = QtCore.QFileInfo(filename)
            directory = finfo.dir().absolutePath()
            spiders = self.scrapy_worker.spiders(directory)
            self.spider_combobox.clear()
            self.spider_combobox.addItems(spiders)
            # Idiom fix: bool(spiders) instead of `True if spiders else False`.
            self.start_stop_button.setEnabled(bool(spiders))

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Append log text at the end, preserving the user's cursor position."""
        prev_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(prev_cursor)
if __name__ == '__main__':
    import sys

    # Build the application, apply the Fusion style, and run the event loop.
    application = QtWidgets.QApplication(sys.argv)
    application.setStyle('fusion')
    main_window = MainWindow()
    main_window.show()
    sys.exit(application.exec_())
答案 0 :(得分:2)
首先,您必须修改蜘蛛程序,使其可以直接从命令行接收参数,而避免使用 input() 方法:
yellowpage_spider.py
import json
import scrapy
from scrape.items import Item
class YellowSpider(scrapy.Spider):
    """Yellow Pages spider parameterized from the command line.

    Invoke as:
        scrapy crawl yellow -a parameters='{"search_item": "...", "locations": [...]}'

    The GUI builds that argument string; no input() prompts are needed.
    """

    name = "yellow"

    def __init__(self, *args, **kwargs):
        super(YellowSpider, self).__init__(*args, **kwargs)
        # Values already emitted, used to de-duplicate scraped items.
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def start_requests(self):
        """Build one search request per location from the -a parameters JSON."""
        if not hasattr(self, 'parameters'):
            # No -a parameters=... supplied: nothing to crawl.
            return
        parameters = json.loads(self.parameters)
        search_item = parameters['search_item']
        locations = parameters['locations']
        for location in locations:
            url = "https://www.yellowpages.com/search?search_terms={}&geo_location_terms={}".format(search_item, location)
            # Carry the location through meta so downstream callbacks can tag items.
            yield scrapy.Request(url=url, callback=self.parse, meta={'location': location})

    def parse(self, response):
        """Follow business profiles and pagination, propagating the location."""
        location = response.meta['location']
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile, meta={'location': location})
        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse, meta={'location': location})

    def businessprofile(self, response):
        """Extract an Item from a profile page; emit once per unseen email,
        only for records that also have a phone number and website."""
        location = response.meta['location']
        for business in response.css('header#main-header'):
            item = Item()
            item['business_name'] = business.css('div.sales-info h1::text').extract()
            w = business.css('a.secondary-btn.website-link::attr(href)').extract()
            item['website'] = str(w).strip('[]')
            item['location'] = location
            s = business.css('a.email-business::attr(href)').extract()
            # Strip the leading "mailto:" (7 chars).  Bug fix: the original
            # comprehension variable was named `item`, shadowing the Item.
            item['email'] = [addr[7:] for addr in s]
            item['phonenumber'] = business.css('p.phone::text').extract_first()
            for x in item['email']:
                # Flattened the original four nested ifs; same evaluation order.
                if x not in self.seen_emails and item['email'] and item['phonenumber'] and item['website']:
                    self.seen_emails.append(x)
                    yield item
然后前面的代码需要一个名为parameters
的参数:
scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany", "brazil"]}'
因此,在GUI中,我们现在必须使用GUI输入来形成条目:
gui.py
import os
import json
from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets
import utils
# Resolve paths relative to this file so the app works from any working dir.
dir_path = os.path.dirname(os.path.abspath(__file__))
# Directory holding the add.png / remove.png toolbutton icons.
icons_dir = os.path.join(dir_path, 'assets', 'icons')
class ScrapyWorker(QtCore.QObject):
    """Runs a crawler process via QProcess, re-emitting its merged output
    and lifecycle events as Qt signals."""

    logChanged = QtCore.pyqtSignal(str)  # chunk of merged stdout/stderr text
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        # Merge stderr into stdout so one signal carries the whole log.
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, program, arguments):
        """Launch `program arguments...` with `project` as the working dir.

        Bug fix: the `program` parameter was previously ignored and 'scrapy'
        was hardcoded via setProgram.
        """
        self._process.setWorkingDirectory(project)
        self._process.setProgram(program)
        self._process.setArguments(arguments)
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        """Forward any pending process output as a logChanged signal."""
        data = self._process.readAllStandardOutput().data().decode()
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        """Forcefully terminate the running crawl."""
        self._process.kill()
class LocationWidget(QtWidgets.QWidget):
    """A growable vertical list of location line-edits.

    Every row has a tool button: the last row's button adds a new row
    (add.png), every other row's button removes its own row (remove.png).
    """

    def __init__(self, parent=None):
        super(LocationWidget, self).__init__(parent)
        self.lay = QtWidgets.QVBoxLayout(self)
        self.lay.setContentsMargins(0, 0, 0, 0)
        self.lay.addStretch()
        self.setContentsMargins(0, 0, 0, 0)
        # Row container widgets, in visual order.
        self.widgets = []
        # Start with one empty row.
        self.create_row()

    def create_row(self):
        """Append one line-edit + tool-button row to the layout."""
        widget = QtWidgets.QWidget()
        widget.setContentsMargins(0, 0, 0, 0)
        hlay = QtWidgets.QHBoxLayout(widget)
        hlay.setContentsMargins(0, 0, 0, 0)
        lineedit = QtWidgets.QLineEdit()
        # NOTE(review): 'on_clicled' is a typo of 'on_clicked'; kept as-is
        # because the slot below uses the same name.
        button = QtWidgets.QToolButton(clicked=self.on_clicled)
        button.setFocusPolicy(QtCore.Qt.NoFocus)
        hlay.addWidget(lineedit)
        hlay.addWidget(button)
        button.setIconSize(QtCore.QSize(24, 24))
        # A freshly created row is always the last one, so it gets the add icon.
        button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))
        self.widgets.append(widget)
        self.lay.insertWidget(-1, widget)

    @QtCore.pyqtSlot()
    def on_clicled(self):
        """Add a row (last row's button) or remove the clicked row (others)."""
        button = self.sender()
        widget = button.parentWidget()
        if self.lay.indexOf(widget) == (self.lay.count()-1):
            # Button of the last row: grow the list.
            self.create_row()
        else:
            # Button of a middle row: delete that row.
            self.lay.removeWidget(widget)
            widget.deleteLater()
            self.widgets.remove(widget)
        # Refresh icons: every row shows 'remove', then the last is reset to 'add'.
        for widget in self.widgets:
            button = widget.findChild(QtWidgets.QToolButton)
            button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'remove.png')))
        self.widgets[-1].findChild(QtWidgets.QToolButton).setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))

    def get_locations(self):
        """Return the non-empty texts of all rows, in display order."""
        locations = []
        for widget in self.widgets:
            le = widget.findChild(QtWidgets.QLineEdit)
            if le.text():
                locations.append(le.text())
        return locations
class YellowWidget(QtWidgets.QMainWindow):
    """Main window: a search-term field, a dynamic location list, and a
    start/stop button that drives the ScrapyWorker process."""

    def __init__(self, parent=None):
        super(YellowWidget, self).__init__(parent)
        self.setWindowTitle('Yellow Pages Scrapper')
        self.scrapy_worker = ScrapyWorker(self)
        self.search_item_le = QtWidgets.QLineEdit()
        self.location_widget = LocationWidget()
        # Checkable button: checked means a crawl is running (see on_checked).
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()
        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)
        lay = QtWidgets.QGridLayout(central_widget)
        lay.addWidget(QtWidgets.QLabel("<b>Search:</b>"), 0, 0)
        lay.addWidget(self.search_item_le, 0, 1)
        lay.addWidget(QtWidgets.QLabel("<b>Locations:</b>"), 1, 0, alignment=QtCore.Qt.AlignTop|QtCore.Qt.AlignLeft)
        lay.addWidget(self.location_widget, 1, 1, alignment=QtCore.Qt.AlignTop)
        lay.addWidget(self.start_stop_button, 2, 0, 1, 2)
        lay.addWidget(self.text_edit, 3, 0, 1, 2)
        self.start_stop_button.toggled.connect(self.on_checked)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        # Un-check the button automatically when the crawl finishes.
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Toggled on: build the crawl command and start it; off: kill it."""
        if state:
            # e.g.: scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany"]}'
            search_item = self.search_item_le.text()
            locations = self.location_widget.get_locations()
            directory, program, args = utils.create_arguments(search_item, locations)
            self.scrapy_worker.run(directory, program, args)
            self.start_stop_button.setText('Stop')
        else:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Append log text at the end, preserving the user's cursor position."""
        prev_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(prev_cursor)
if __name__ == '__main__':
    import sys

    # Build the application, apply the Fusion style, and run the event loop.
    application = QtWidgets.QApplication(sys.argv)
    application.setStyle('fusion')
    window = YellowWidget()
    window.resize(640, 480)
    window.show()
    sys.exit(application.exec_())
我使用了utils.py文件中的一个函数:
import os
import json
def create_arguments(search_item, locations, spider='yellow'):
    """Build the (working_directory, program, argument_list) for a crawl run.

    Parameters
    ----------
    search_item : str
        Term passed to the spider as ``search_item``.
    locations : list of str
        Locations passed to the spider as ``locations``.
    spider : str, optional
        Spider name to crawl (default ``'yellow'``).  Generalized from the
        original hardcoded name so the helper works for other spiders too.

    Returns
    -------
    tuple
        ``(directory, program, args)`` suitable for QProcess: the scrapy
        project directory (the ``scrape`` folder next to this file), the
        program name, and the ``crawl`` argument list with the parameters
        serialized as a JSON ``-a`` argument.
    """
    program = 'scrapy'
    dir_path = os.path.dirname(os.path.abspath(__file__))
    directory = os.path.join(dir_path, 'scrape')
    d = {"search_item": search_item, "locations": locations}
    # JSON keeps the argument unambiguous; the spider json.loads() it back.
    argument = 'parameters={}'.format(json.dumps(d))
    return directory, program, ['crawl', spider, "-a", argument]
获得以下内容:
完整的项目是here。