我正在尝试从网站解析的内容中,对每个段落中的关键字 w1 和 w2 进行完全匹配。w1 来自一个 250 个单词的列表,而 w2 来自一个 150 个单词的列表,因此共有(250 × 150)种不同的组合。对每种组合执行这种正则表达式字符串匹配大约需要 1–2 秒,因此整个过程的执行时间相当长。我可以使用哪些技巧来加快速度?我尝试过多进程处理,但似乎没有带来任何明显的改进。
def filterByKeyWordsinParagraph(self, row, w1, w2):
    """Check whether any paragraph of `row` contains both whole words.

    Parameters
    ----------
    row : mapping with keys 'formatted_content' (list of paragraph strings),
        'url' and 'content'.
    w1, w2 : keyword strings; looked up (case-sensitively) in
        self.objectMap / self.elementMap for their ids.

    Returns
    -------
    pd.Series indexed by ['URL', 'Content', 'Proximity', 'ObjectID',
    'ObjectName', 'ElementID', 'ElementName'] — populated from the row and
    self.proximity when some paragraph contains both w1 and w2 as whole
    words (case-insensitive), otherwise all NaN.

    Raises
    ------
    KeyError if w1 not in self.objectMap or w2 not in self.elementMap
    (raised whether or not a paragraph matches, as in the original).
    """
    # Two independent word-boundary searches replace the original anchored
    # double-lookahead pattern: same acceptance condition, much cheaper.
    # re.escape guards keywords containing regex metacharacters ('C++').
    # NOTE: whitespace re-normalization (" ".join(p.split())) was dropped —
    # \b boundaries are unaffected by runs of whitespace.
    pat1 = re.compile(r'\b{0}\b'.format(re.escape(w1.lower())))
    pat2 = re.compile(r'\b{0}\b'.format(re.escape(w2.lower())))

    # Id lookups stay unconditional so a missing key raises exactly as before.
    objectid = self.objectMap[w1]
    elementid = self.elementMap[w2]

    cols = ['URL', 'Content', 'Proximity', 'ObjectID', 'ObjectName', 'ElementID', 'ElementName']

    # any() short-circuits on the first matching paragraph instead of
    # materializing the full filtered list just to test its length.
    matched = any(
        pat1.search(lowered) and pat2.search(lowered)
        for lowered in (p.lower() for p in row['formatted_content'])
    )
    if matched:
        return pd.Series([row['url'], row['content'], self.proximity, objectid, w1, elementid, w2], index=cols)
    return pd.Series([np.nan] * 7, index=cols)
def matchByParagraph(self, obj, ele):
    """Run the paragraph keyword filter for one (obj, ele) pair in parallel.

    Splits self.splitData into 10 dask partitions, applies
    filterByKeyWordsinParagraph row-wise on each partition using the
    process scheduler, and returns only the rows that matched (those whose
    'Proximity' column is non-null), with a fresh integer index.
    """
    frame = dd.from_pandas(self.splitData, npartitions=10)

    def run_partition(df):
        # Row-wise application of the filter within a single partition.
        return df.apply(lambda row: self.filterByKeyWordsinParagraph(row, obj, ele), axis=1)

    result = frame.map_partitions(run_partition).compute(scheduler='processes')
    kept = result[pd.notnull(result['Proximity'])]
    return kept.reset_index(drop=True)