我正在尝试从字符串中某个确定的位置开始提取子字符串,直到遇到特殊字符 " 为止。这是字符串:
import sys
from PyQt5 import QtCore, QtGui, QtWidgets, QtPrintSupport, QtWebEngineWidgets
class main(QtWebEngineWidgets.QWebEngineView):
    """Web view with a "print" context-menu entry and pop-up window support.

    On construction the view appends itself to the caller-supplied
    ``windows`` list (presumably so that spawned views stay referenced --
    confirm with the caller) and starts loading a fixed URL.
    """

    # Window types that are served by opening another ``main`` view.
    _POPUP_TYPES = (
        QtWebEngineWidgets.QWebEnginePage.WebBrowserTab,
        QtWebEngineWidgets.QWebEnginePage.WebDialog,
    )

    def __init__(self, windows, parent=None):
        """Create the view, register it in ``windows`` and load the page.

        Args:
            windows: Mutable list shared by all views; ``self`` is appended.
            parent: Optional parent widget passed to the Qt base class.
        """
        super(main, self).__init__(parent)
        self._windows = windows
        self._windows.append(self)
        self.load(QtCore.QUrl("https://twitter.com/CodeRentor"))

    # Menu items on mouse Right click for Print
    def contextMenuEvent(self, event):
        """Show a one-item menu at the cursor; "print" opens the preview."""
        cmenu = QtWidgets.QMenu(self)
        printAct = cmenu.addAction("print")
        action = cmenu.exec_(self.mapToGlobal(event.pos()))
        if action == printAct:
            self._preview()

    def createWindow(self, windows):
        """Return the view that hosts a window requested by the page.

        For browser-tab and dialog requests a new ``main`` view sharing this
        view's window list is created, flagged delete-on-close, resized to
        900x780, shown and returned.  Any other window type is delegated to
        the base class.

        Args:
            windows: The ``QWebEnginePage.WebWindowType`` being requested.
        """
        # Both pop-up kinds were handled by identical code, so they share
        # one branch.
        if windows in self._POPUP_TYPES:
            webView = main(self._windows)
            webView.setAttribute(QtCore.Qt.WA_DeleteOnClose, True)
            webView.resize(900, 780)
            webView.show()
            return webView
        return super(main, self).createWindow(windows)

    def _preview(self):
        """Open a modal print-preview dialog rendered by ``_request``."""
        dialog = QtPrintSupport.QPrintPreviewDialog()
        dialog.paintRequested.connect(self._request)
        dialog.exec_()

    def _request(self, printer):
        """Paint a snapshot of this widget at the origin of ``printer``.

        Args:
            printer: The ``QPrinter`` supplied by the preview dialog.
        """
        # The earlier printer.newPage() call is gone: it ran before the
        # painter was started, and QPrinter.newPage() always fails on an
        # inactive printer.
        painter = QtGui.QPainter()
        if not painter.begin(printer):
            # Nothing can be drawn if the paint device refuses the painter.
            return
        try:
            screen = self.grab()
            painter.drawPixmap(0, 0, screen)
        finally:
            painter.end()
if __name__ == "__main__":
    # The QApplication has to exist before the first widget is created.
    application = QtWidgets.QApplication(sys.argv)
    # List handed to every view; each view appends itself on construction.
    open_views = []
    browser = main(open_views)
    browser.show()
    # Run the Qt event loop and propagate its exit status to the shell.
    exit_code = application.exec_()
    sys.exit(exit_code)
我要提取的部分是关键字,即从 data-keyword=" 之后开始、到下一个 " 符号为止的内容,在本例中为:aa battery plus
但是,当我用 \b 分隔符和方括号来限定左右两侧的字符串时,我只得到了一个字母。
我尝试使用re.findall()方法
# Fragment: build a QTextDocument from the page HTML and print it.
# NOTE(review): `self` and `printer` come from an enclosing method that is
# not shown here.
myDocument = QTextDocument()
myDocument.setHtml(self.currentHtml) # <= self.currentHtml is the current page converted to Html
font = QFont()
myDocument.setDefaultFont(font)
# NOTE(review): the returned style sheet is discarded, so this call has no
# visible effect in this fragment.
myDocument.defaultStyleSheet()
# Use the printer's page rectangle as the document page size.
myDocument.setPageSize(QSizeF(printer.pageRect().size()))
myDocument.print_(printer)
这就是我得到的:
# Sample suggestion markup from the question; the wanted value is the
# content of the data-keyword attribute.
element = '<div class="s-suggestion" data-alias="aps" data-crid="2AZHZA23OLYLF" data-isfb="false" data-issc="false" data-keyword="aa battery plus" data-nid="" data-reftag="nb_sb_ss_i_6_2" data-store="" data-type="a9" id="issDiv5"><span class="s-heavy"></span>ab<span class="s-heavy">reva cold sore treatment</span></div>'
如何仅提取关键字?即:aa battery plus
答案 0 :(得分:3)
如果要提取两个字符串之间的文本,则需要使用这种正则表达式格式。
import re

# Sample markup whose data-keyword attribute value is wanted.
element = '<div class="s-suggestion" data-alias="aps" data-crid="2AZHZA23OLYLF" data-isfb="false" data-issc="false" data-keyword="aa batteries plus" data-nid="" data-reftag="nb_sb_ss_i_6_2" data-store="" data-type="a9" id="issDiv5"><span class="s-heavy"></span>ab<span class="s-heavy">reva cold sore treatment</span></div>'

# Non-greedy group: capture everything up to the first closing quote.
keyword_pattern = re.compile(r'data-keyword="(.*?)"')
found = keyword_pattern.search(element)
z = found.group(1)
print(z)
答案 1 :(得分:3)
使用Regex解析HTML不是一个好主意。相反,您可以使用类似BeautifulSoup的html解析器。
例如:
from bs4 import BeautifulSoup

# Sample markup whose data-keyword attribute value is wanted.
element = '<div class="s-suggestion" data-alias="aps" data-crid="2AZHZA23OLYLF" data-isfb="false" data-issc="false" data-keyword="aa battery plus" data-nid="" data-reftag="nb_sb_ss_i_6_2" data-store="" data-type="a9" id="issDiv5"><span class="s-heavy"></span>ab<span class="s-heavy">reva cold sore treatment</span></div>'

soup = BeautifulSoup(element, "html.parser")
# Find the suggestion <div>, then read the attribute like a dictionary key.
suggestion = soup.find("div", class_="s-suggestion")
print(suggestion["data-keyword"])
输出:
aa battery plus
答案 2 :(得分:1)
您不需要正则表达式。
您可以使用内置函数find(substring,begin,end)
来简单地搜索 "data-keyword" 的索引。然后依次搜索其后两个双引号的索引,并对它们之间的文本进行切片。
# Position of the attribute name inside the markup.
attr_pos = element.find('data-keyword')
# The value sits between the next two double quotes after the name.
open_quote = element.find('"', attr_pos)
close_quote = element.find('"', open_quote + 1)
# Slice strictly between the two quotes.
z = element[open_quote + 1:close_quote]
有关find function的更多信息。
答案 3 :(得分:1)
虽然使用正则表达式并不是最好的办法(我们或许更应该使用 this method 来解决这个问题),但如果必须这样做,下面这个表达式在这里也许可行:
data-keyword="\s*([^"]+?)\s*"
还可以删除所需数据前后的多余空格。
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re

# Capture the data-keyword value; the \s* on both sides keeps whitespace
# just inside the quotes out of group 1.
regex = r"data-keyword=\"\s*([^\"]+?)\s*\""

# Two sample elements: one with a plain value, one padded with spaces.
test_str = ("<div class=\"s-suggestion\" data-alias=\"aps\" data-crid=\"2AZHZA23OLYLF\" data-isfb=\"false\" data-issc=\"false\" data-keyword=\"aa batteries plus\" data-nid=\"\" data-reftag=\"nb_sb_ss_i_6_2\" data-store=\"\" data-type=\"a9\" id=\"issDiv5\"><span class=\"s-heavy\"></span>ab<span class=\"s-heavy\">reva cold sore treatment</span></div>\n"
    "<div class=\"s-suggestion\" data-alias=\"aps\" data-crid=\"2AZHZA23OLYLF\" data-isfb=\"false\" data-issc=\"false\" data-keyword=\" aa batteries plus \" data-nid=\"\" data-reftag=\"nb_sb_ss_i_6_2\" data-store=\"\" data-type=\"a9\" id=\"issDiv5\"><span class=\"s-heavy\"></span>ab<span class=\"s-heavy\">reva cold sore treatment</span></div>")

matches = re.finditer(regex, test_str, re.MULTILINE)

# Report every match with its span, followed by each capture group.
for matchNum, match in enumerate(matches, start=1):
    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))

    # Capture groups are numbered from 1, so iterate 1..N directly instead
    # of reassigning the loop variable inside the loop.
    for groupNum in range(1, len(match.groups()) + 1):
        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
Match 1 was found at 105-137: data-keyword="aa batteries plus"
Group 1 found at 119-136: aa batteries plus
Match 2 was found at 417-458: data-keyword=" aa batteries plus "
Group 1 found at 435-452: aa batteries plus
jex.im可视化正则表达式:
答案 4 :(得分:1)
虽然我完全同意上一个答案,但是您也可以考虑下一个解决方案:
# Keep what follows the last 'data-keyword="' marker, then cut it off at the
# first '" data-nid' that comes after it.
element.rpartition('data-keyword="')[2].partition('" data-nid')[0]
当您需要解析“结构化”输入时,这可能被认为非常方便...
答案 5 :(得分:1)
您可以使用re.findall()
函数:
import re

# Sample markup whose data-keyword attribute value is wanted.
element = '<div class="s-suggestion" data-alias="aps" data-crid="2AZHZA23OLYLF" data-isfb="false" data-issc="false" data-keyword="aa battery plus" data-nid="" data-reftag="nb_sb_ss_i_6_2" data-store="" data-type="a9" id="issDiv5"><span class="s-heavy"></span>ab<span class="s-heavy">reva cold sore treatment</span></div>'

# findall returns the captured group of every match; take the first one.
keyword_re = re.compile(r'data-keyword="(.*?)"')
all_keywords = keyword_re.findall(element)
output = all_keywords[0]
print(output)
输出
aa battery plus