我创建了一个抓取工具,用来从 https://www.lawinsider.com 网站抓取数据。我已经成功地用 QThread 在单独的线程中按页获取数据进行单次迭代,而不会阻塞当前的 GUI 界面。
在本次抓取中,我首先根据所选类别用 BeautifulSoup 抓取了合同列表(该列表是分页设计的),因此我必须针对所选类别继续抓取剩余的合同,这一部分我也成功完成了。此后,当我在下一次迭代中尝试获取所有合同链接时,出现了 NoneType
类型错误,并且合同列表处显示 "can not parse HTML"。
这是我的代码:
from bs4 import BeautifulSoup
from PyQt4 import QtGui
from PyQt4.QtCore import QThread, SIGNAL
from fake_useragent import UserAgent
from urllib import request, parse
import sys
from PyDB import LawInsiderDB
import time
# threading tool
# CONSTANTS
# Base host of the target site; relative contract links are joined onto it.
HOST = "https://www.lawinsider.com"
# Listing endpoint for contracts filtered by category ("tag").
CONTRACTS = "{host}/contracts/tagged".format(host=HOST)
# Delay (seconds) between contract fetches so the site is not hammered.
SLEEP = 3
这是我的线程部分,在其中,我以如下数组(列表)格式传递合同清单:
self.contracts = ["/contracts/hV6Qu7IcxJIXGMj6dkdaw/healthcare-acquisition/1326190/2018-11-27", "/contracts/1TwRbPflPyk61lMcUnraDP/national-commerce-corp/1609951/2018-11-26"]
传递合同列表后,我的代码使用全局对象 HOST 形成 contracts_url 。并使用动态用户代理生成器使用 urllib 插件从提供的URL中获取数据,这样我的抓取工具就不会被目标网站阻止:
# Worker thread: downloads and stores each contract without blocking the GUI.
class GetContractsThread(QThread):
    def __init__(self, contracts):
        """Keep the scrape context (category id + pending links) and a DB handle."""
        QThread.__init__(self)
        # catVars dict from CategoryScrap: category_id, contracts list, cursors, ...
        self.contracts = contracts
        # Last parsed BeautifulSoup document (or '' after a failed fetch).
        self.contract_html = None
        self.db = LawInsiderDB()

    def __del__(self):
        # Block until the thread has finished before the wrapper goes away.
        self.wait()
def get_contract_html(self, url=None):
# Fetch one contract page (HOST + relative url) and parse it into
# self.contract_html. A random User-Agent is sent per request so the
# scraper is less likely to be blocked by the target site.
# Contract HOST URL to call
contract_url = HOST + url
full_url = "{0}".format(contract_url)
# user agent implementation
try:
user_agent = UserAgent(use_cache_server=False)
request_url = request.Request(full_url, headers={'User-Agent': user_agent.random})
html = request.urlopen(request_url)
# html_text = html.text
self.contract_html = BeautifulSoup(html, 'html.parser')
except:
# NOTE(review): bare except hides network/parse errors, and on failure
# contract_html becomes the string '' -- the parsing code below will
# then fail on ''.head / ''.find.  Consider logging sys.exc_info().
self.contract_html = ''
从提供的链接中解析HTML之后,我将从解析的HTML中获取标题和主要内容,并保存到数据库中以备将来使用
# Prase
# Extract the page title and the main contract body from the parsed HTML.
try:
title = self.contract_html.head.title.string
content = self.contract_html.find("div", class_="contract-content")
except:
# NOTE(review): if the fetch failed, content is set to the string '' and
# content.text below raises AttributeError -- that error is unhandled.
title = ''
content = ''
# need to save into database
# Single quotes are stripped because the value is spliced into SQL below.
formated_content = str(content.text).replace("'", "")
# data = str({'title': title, 'url': url})
# save data
# NOTE(review): string-formatted SQL is injection-prone; prefer
# parameterized queries if LawInsiderDB supports them.
contracts = "INSERT INTO contracts(cat_id, document, title) " \
"VALUES({category_id}, '{document}', '{title}')"\
.format(category_id=self.contracts['category_id'], document=formated_content, title=title)
# end
if self.db.save_data(contracts):
# save contract link into database
# Record the link so it is filtered out on the next pagination pass.
save_link = "INSERT INTO contracts_link_history(`category_id`, `contract_id`, `contract_link`) " \
"VALUES({category_id}, 0, '{contract_link}')" \
.format(category_id=self.contracts['category_id'], contract_link=url)
if self.db.save_data(save_link):
return "Saved Contract Link : {url}".format(url=url)
return "Not Saved Contract Link: {url}".format(url=url)
else:
return "Not Saved: {url}".format(url=url)
下面的代码将运行我的线程,直到合同列表完成:
def run(self):
    """Thread entry point: scrape every pending contract link in turn."""
    # self.contracts is the catVars dict; 'contracts' holds relative links.
    for link in self.contracts['contracts']:
        status = self.get_contract_html(link)
        # Old-style PyQt4 signal: tell the GUI one contract has finished.
        self.emit(SIGNAL('add_completed_contract(QString)'), status)
        # Throttle requests between contracts.
        self.sleep(SLEEP)
# end
下面的代码在我开始抓取时调用,该代码负责创建和调用在上一节中创建的QThread。让我逐个方法解释我的代码方法
1)类声明
# Scraping Class
class CategoryScrap:
在 init 方法中: 第1行: self.db 数据库实例以执行查询
第2行: self.ui MainWindow对象与主UI交互
第3行: self.catVars 将所有与抓取相关的数据存储到一个对象中
第4行: self.html ,用于存储抓取到的html
第5行: self.start_thread 以存储线程类对象
def __init__(self, ui=None):
    """Hold the DB handle, the main-window UI, and all per-scrape state."""
    self.db = LawInsiderDB()        # query helper
    self.ui = ui                    # MainWindow, for progress output
    # class objects
    # All scrape state lives in one dict (host, urls, cursors, contracts...).
    self.catVars = {"host": HOST, "contracts_url": CONTRACTS}
    self.html = None                # HTML of the most recently fetched page
    self.start_thread = None        # running GetContractsThread, if any
此方法用于在首次传递网站URL时在 self.catVars ['url'] 对象中设置URL。网址就像
https://www.lawinsider.com/contracts/tagged/employment-agreement
def setUrl(self, url=None):
    """Store the category listing URL and kick off the category check."""
    if url is None:
        # No URL supplied -- warn the user instead of failing silently.
        QtGui.QMessageBox.warning(None, "Invalid", "Invalid Domain URL")
        return
    self.catVars['url'] = url
    # check category is new
    self.check_category()
我还以类别形式管理我的抓取内容,以便我可以识别哪些抓取内容属于哪个类别;下面这个方法检查类别是否已经存在,或者是否必须创建新类别:
如果类别是新类别,则通过调用 self.get_new_cursor() 和 self.create_category() 方法来抓取类别URL并保存光标和新类别。光标是分页点,它存储在 self.catVars 中,如下所示:
self.catVars['current_cursor'] = "#####"
self.catVars['next_cursor'] = "#####"
在上述光标 next_cursor 中,该字符通过在开始的讨论中传入给定的类别URL来获取下一页合同。
如果类别已经可用,则光标点将被最后保存到给定类别URL的数据库中的光标
def check_category(self):
# Derive the category slug from the URL, look it up in the DB, and either
# create it (new category) or restore its saved pagination cursors
# (existing category), then collect pending links and start the worker.
# get category name
category = self.catVars['url'].split('/')[-1]
self.ui.processingBox.append("Checking category : {title}".format(title=category))
# NOTE(review): string-formatted SQL; a slug containing a quote would
# break the query -- prefer parameterized queries if available.
sql = "SELECT id,name FROM category WHERE `name` LIKE '%{title}%'".format(title=category)
is_category = self.db.get_single_data(sql)
# checking category
if is_category is None:
new_category_msg = "This is new category. Do you want to create ?"
qm = QtGui.QMessageBox
response = qm.question(None, 'New', new_category_msg, qm.Yes | qm.No)
# if user agreed to create new category
if response == qm.Yes:
if self.create_category(category):
# get new cursor if categry is new
self.scrap_url()
self.get_new_cursor()
# NOTE(review): returning here appears to skip the
# get_all_contracts_links()/start_category_scraping() calls at
# the bottom for brand-new categories -- confirm intended flow.
return True
else:
self.ui.processingBox.append("Scraping stopped")
return False
else:
self.ui.processingBox.append("Category '{title}' already exists.".format(title=category))
# if category exists get category information
self.catVars['category_id'] = is_category['id']
self.catVars['category_name'] = is_category['name']
# get existing cursor
# print(is_category)
# Latest cursor pair saved for this category (newest row wins).
last_cursor_sql = "SELECT id, category_id, current_cursor, next_cursor FROM contract_category_history " \
"WHERE `category_id`={category_id} " \
"ORDER BY `id` DESC LIMIT 1".format(category_id=self.catVars['category_id'])
# print(self.db.get_single_data(last_cursor_sql))
try:
last_cursor_data = self.db.get_single_data(last_cursor_sql)
self.catVars['current_cursor'] = last_cursor_data['current_cursor']
self.catVars['next_cursor'] = last_cursor_data['next_cursor']
# scrap url if already exists
if last_cursor_data['current_cursor'] == '':
self.scrap_url()
else:
# Resume the listing from the stored cursor position.
category_url = "{contracts}/{category_name}?cursor={cursor}".format(
contracts=CONTRACTS, category_name=is_category['name'],
cursor=last_cursor_data['current_cursor'])
self.catVars['url'] = category_url
self.scrap_url()
except:
# NOTE(review): bare except also swallows the TypeError raised when no
# cursor row exists (last_cursor_data is None) -- handle that case
# explicitly rather than reporting it as a generic error.
QtGui.QMessageBox.warning(None, "Error", "Can not get category please check log.")
# get all contracts from saved and online
self.get_all_contracts_links()
# start scraping
self.start_category_scraping()
当类别为new时,上面将调用此方法
def create_category(self, category):
    """Insert a new category row and cache its id/name in catVars.

    Returns True on success, False when the insert failed (LawInsiderDB
    writes the error to error.log).  The failure branch previously fell
    off the end and returned an implicit None; callers truth-test the
    result, so an explicit False is backward-compatible and clearer.
    """
    # NOTE(review): string-formatted SQL -- safe only while category names
    # come from URL slugs; prefer parameterized queries if available.
    new_category = "INSERT INTO `category`(`name`) VALUES('{title}')".format(title=category)
    if self.db.save_data(new_category):
        self.ui.processingBox.append("New category : {title} created.".format(title=category))
        self.catVars['category_id'] = self.db.cursor.lastrowid
        self.catVars['category_name'] = category
        return True
    else:
        QtGui.QMessageBox.warning(None, "Error", "Follow error.log file created in setup directory.")
        return False
此方法抓取提供的URL并将结果存储到 self.html 对象中,以便在执行解析时,我的代码可以直接从类上下文中获取。
def scrap_url(self):
    """Download self.catVars['url'] and keep the page HTML in self.html.

    The response body is read() into bytes immediately: urlopen() returns
    a one-shot stream, and this page is parsed twice (get_new_cursor() and
    get_all_contracts_links()).  Storing the stream itself meant the second
    parse saw an exhausted response -- the cause of the "NoneType" /
    "can not parse HTML" errors on the next pagination cycle.

    Always returns True; on failure self.html stays None and the error
    type is printed (matching the original best-effort behavior).
    """
    self.html = None
    # Random User-Agent per request so the scraper is not blocked.
    user_agent = UserAgent(use_cache_server=False)
    print("Scrap Url: ", self.catVars['url'])
    request_url = request.Request(self.catVars['url'], headers={'User-Agent': user_agent.random})
    try:
        response = request.urlopen(request_url)
        # bytes, not the stream: reusable by every later parse
        self.html = response.read()
    except Exception:
        print("Error: ", sys.exc_info()[0])
    return True
此方法解析 html,并将新光标保存到数据库中其所属的类别下。
def get_new_cursor(self):
# Parse the freshly fetched listing page (self.html), pull the pagination
# cursors out of the "pagination-append" container, and persist them.
try:
html = self.get_parsed_html(self.html)
# fetch new cursor
# First descendant carrying each data-* attribute wins ([0]).
html_cursors = html.find('div', id="pagination-append")
data_cursor = [item['data-cursor'] for item in html_cursors.find_all() if 'data-cursor' in item.attrs][0]
data_next_cursor = [item['data-next-cursor'] for item in html_cursors.find_all() if 'data-next-cursor' in item.attrs][0]
# store cursor into cursor object
self.catVars['current_cursor'] = data_cursor
self.catVars['next_cursor'] = data_next_cursor
# save new cursor
if self.save_cursors():
self.ui.processingBox.append("New cursor point : {title} created.".format(title=self.catVars['category_name']))
except:
# if any occurs
# NOTE(review): if self.html is the raw urlopen() response, parsing it
# here consumes the stream, so any later parse of the same response
# sees nothing -- read() the response once when it is fetched.
self.ui.processingBox.append("Invalid html to parse")
return True
此方法将获取到的 html 转换为 BeautifulSoup 对象,以便我的代码可以轻松地从 html 中提取内容
def get_parsed_html(self, html=None):
    """Return a BeautifulSoup document for *html* (falling back to self.html).

    The original branched the wrong way round: when an argument WAS
    supplied it ignored it and parsed self.html, and when no argument was
    supplied it parsed None -- which is exactly the "can not parse HTML" /
    NoneType failure.  Parse the argument when given, self.html otherwise.
    """
    source = html if html is not None else self.html
    return BeautifulSoup(source, 'html.parser')
此方法保存新的光标
def save_cursors(self):
    """Persist the current/next pagination cursors for the active category.

    Returns True on success, False on failure -- including DB exceptions,
    which previously fell through to an implicit None after the print.
    Callers truth-test the result, so this is backward-compatible.
    """
    save_cursor_sql = "INSERT INTO contract_category_history(category_id, current_cursor, next_cursor) " \
                      "VALUES({}, '{}', '{}')".format(self.catVars['category_id'],
                                                      self.catVars['current_cursor'],
                                                      self.catVars['next_cursor'])
    try:
        # save_data already yields a truthy/falsy result; normalize to bool.
        return bool(self.db.save_data(save_cursor_sql))
    except Exception:
        print("Can not save new record", sys.exc_info()[0])
        return False
此方法将刮除所有 contracts 链接,并保存到 self.catVars ['contracts'] 对象中,该对象在上面的“线程”部分中进行了描述:
# get all contracts
def get_all_contracts_links(self):
    """Collect contract links from the current listing page, drop the ones
    already saved for this category, and store the remainder in
    self.catVars['contracts'].  Always returns True."""
    try:
        html = self.get_parsed_html(self.html)
        # save contracts link from url
        contract_html = html.find('div', id="pagination-append")
        online_contracts = [a.get('href') for a in contract_html.find_all('a')]
        print(online_contracts)
    except Exception:
        online_contracts = []
        # The 4th argument of QMessageBox.warning() is a buttons flag, not
        # extra message text -- passing sys.exc_info()[0] there raised a
        # TypeError inside this handler.  Fold the exception into the
        # message string instead.
        QtGui.QMessageBox.warning(None, "Error",
                                  "Html parse error: {0}".format(sys.exc_info()[0]))
    # saved contract list
    saved_contract_sql = "SELECT id, category_id, contract_link FROM `contracts_link_history` " \
                         "WHERE `category_id`={category_id}".format(category_id=self.catVars['category_id'])
    try:
        rows = self.db.get_all_data(saved_contract_sql)
        saved_contract_list = [row['contract_link'] for row in rows] if rows is not None else []
    except Exception:
        saved_contract_list = []
    # filter online and saved contract list
    self.catVars['contracts'] = list(set(online_contracts) - set(saved_contract_list))
    return True
此方法启动我的线程,并将解析的合约传递到我的 Thread 当所有合同都完成时,调用 self.scrap_complete()方法;当一个合同废件一次完成时,则调用 self.add_completed_contract 方法。
def start_category_scraping(self):
# Launch the worker thread over the pending contract links and wire its
# signals to the GUI; disable Start / enable Stop while it runs.
if len(self.catVars['contracts']):
self.start_thread = GetContractsThread(self.catVars)
self.start_thread.start()
# Old-style PyQt4 connections: thread-finished and per-contract signals.
QtGui.QMainWindow.connect(self.start_thread, SIGNAL("finished()"), self.scrap_complete)
QtGui.QMainWindow.connect(self.start_thread, SIGNAL("add_completed_contract(QString)"),
self.completed_contract)
self.ui.stopScrapBtn.setEnabled(True)
# NOTE(review): terminate() kills the thread abruptly; a cooperative
# stop flag would be safer for DB writes in progress.
self.ui.stopScrapBtn.clicked.connect(self.start_thread.terminate)
# We don't want to enable user to start another thread while this one is
# running so we disable the start button.
self.ui.startScrapBtn.setEnabled(False)
else:
QtGui.QMessageBox.information(None, "Info", "No contract in list.")
return True
抓取完成时被调用。一轮抓取完成后,我想再次调用所有抓取方法,以便可以继续抓取,直到合同全部抓取完毕或被用户强行停止。
问题:
但是在第二次迭代中,我没有像第一次开始抓取时那样得到合同链接。我可以得到下一页合同的光标,但随后尝试获取 contracts_links 时收到 HTML NoneType 错误。
要获取下一个contract_links,请遵循以下URL
在以下代码中, category_url 构成了上述URL链接:
def scrap_complete(self):
# Runs when the worker thread finishes a batch: advance to the next
# pagination cursor, re-fetch the listing page, and collect new links.
print("Scraping Complete")
time.sleep(SLEEP)
# get next cursor
category_url = "{contracts}/{category_name}?cursor={cursor}".format(
contracts=CONTRACTS, category_name=self.catVars['category_name'],
cursor=self.catVars['next_cursor'])
# next url
self.catVars['url'] = category_url
self.scrap_url()
# print(self.html.read())
# NOTE(review): get_new_cursor() parses self.html; if scrap_url() stored
# the raw urlopen() response, that parse exhausts the stream and the
# get_all_contracts_links() call below then sees empty HTML -- the
# reported NoneType error.  Store response.read() in scrap_url() instead.
self.get_new_cursor()
# get all contracts from saved and online
self.get_all_contracts_links()
# print("New Contracts: ", self.catVars['contracts'])
# # start scraping
# self.start_category_scraping()
def completed_contract(self, contract):
    """Slot: log the status string emitted for one finished contract."""
    print("Complete: ", contract)
如果您发现我的英语语法错误,请考虑一下,我会尽力在这里详细说明。