我有一个sentences列表,其中大约有500,000个句子;还有一个concepts列表,其中大约有13,000,000个概念。对于每个句子,我都想按句子顺序从sentences中提取concepts,并将其写入输出。


import re

# Sample input: every sentence is scanned for each of the known concepts.
sentences = ['data mining is the process of discovering patterns in large data sets involving methods at the intersection of machine learning statistics and database systems',
             'data mining is an interdisciplinary subfield of computer science and statistics with an overall goal to extract information from a data set and transform the information into a comprehensible structure for further use',
             'data mining is the analysis step of the knowledge discovery in databases process or kdd']

concepts = ['data mining', 'database systems', 'databases process',
            'interdisciplinary subfield', 'information', 'knowledge discovery',
            'methods', 'machine learning', 'patterns', 'process']

# One result list per sentence, in the same order as `sentences`.
output = []
counting = 0

# Escape each concept so it is matched literally rather than as regex syntax.
re_concepts = [re.escape(t) for t in concepts]

# A single alternation pattern lets one left-to-right scan find every concept;
# the bound `findall` is kept so the loop below avoids repeated attribute lookups.
find_all_concepts = re.compile('|'.join(re_concepts), flags=re.DOTALL).findall

for sentence in sentences:
    # findall returns the non-overlapping matches in the order they occur.
    output.append(find_all_concepts(sentence))

print(output)


输出为:[['data mining', 'process', 'patterns', 'methods', 'machine learning', 'database systems'], ['data mining', 'interdisciplinary subfield', 'information', 'information'], ['data mining', 'knowledge discovery', 'databases process']]

不过,输出中各个列表的先后顺序并不重要,例如输出也可以是下面两种形式之一:


[['data mining', 'interdisciplinary subfield', 'information', 'information'], ['data mining', 'knowledge discovery', 'databases process'], ['data mining', 'process', 'patterns', 'methods', 'machine learning', 'database systems']]

[['data mining', 'knowledge discovery', 'databases process'], ['data mining', 'interdisciplinary subfield', 'information', 'information'], ['data mining', 'process', 'patterns', 'methods', 'machine learning', 'database systems']]



3 个答案:

答案 0 :(得分:2)




import re
import queue
import threading

sentences = ['data mining is the process of discovering patterns in large data sets involving methods at the intersection of machine learning statistics and database systems',
             'data mining is an interdisciplinary subfield of computer science and statistics with an overall goal to extract information from a data set and transform the information into a comprehensible structure for further use',
             'data mining is the analysis step of the knowledge discovery in databases process or kdd']

concepts = ['data mining', 'database systems', 'databases process',
            'interdisciplinary subfield', 'information', 'knowledge discovery',
            'methods', 'machine learning', 'patterns', 'process']

# Escape each concept so it is matched literally rather than as regex syntax.
re_concepts = [re.escape(t) for t in concepts]

# One alternation pattern; `findall` returns matches in order of occurrence.
find_all_concepts = re.compile('|'.join(re_concepts), flags=re.DOTALL).findall


def do_find_all_concepts(q_in, l_out):
    """Worker loop: take sentences from *q_in* and append their concepts to *l_out*.

    Runs forever; it is meant to be the target of a daemon thread.  Every item
    taken from the queue is acknowledged with ``task_done()`` so that
    ``q_in.join()`` in the producer can return.  Results are appended in
    completion order, which need not match the input order.
    """
    while True:
        sentence = q_in.get()
        try:
            l_out.append(find_all_concepts(sentence))
        finally:
            # Acknowledge even on failure, otherwise q_in.join() would hang.
            q_in.task_done()


# Queue with default maxsize of 0, infinite queue size
sentences_q = queue.Queue()
output = []

# any reasonable number of workers
num_threads = 2
for _ in range(num_threads):
    worker = threading.Thread(target=do_find_all_concepts, args=(sentences_q, output))
    # once there's nothing but daemon threads left, Python exits the program
    worker.daemon = True
    worker.start()

# put all the input on the queue
for s in sentences:
    sentences_q.put(s)

# wait for the entire queue to be processed
sentences_q.join()
print(output)

import re
import queue
import multiprocessing

sentences = [
    'data mining is the process of discovering patterns in large data sets involving methods at the intersection of machine learning statistics and database systems',
    'data mining is an interdisciplinary subfield of computer science and statistics with an overall goal to extract information from a data set and transform the information into a comprehensible structure for further use',
    'data mining is the analysis step of the knowledge discovery in databases process or kdd']

concepts = ['data mining', 'database systems', 'databases process',
            'interdisciplinary subfield', 'information', 'knowledge discovery',
            'methods', 'machine learning', 'patterns', 'process']

# Escape each concept so it is matched literally rather than as regex syntax.
re_concepts = [re.escape(t) for t in concepts]

# One alternation pattern; `findall` returns matches in order of occurrence.
find_all_concepts = re.compile('|'.join(re_concepts), flags=re.DOTALL).findall


def do_find_all_concepts(q_in, q_out):
    """Drain *q_in*, putting the concepts found in each sentence on *q_out*.

    Uses a non-blocking ``get`` and returns as soon as it raises
    ``queue.Empty``, so the input must already be queued before this runs.
    """
    while True:
        try:
            sentence = q_in.get(False)
        except queue.Empty:
            return
        q_out.put(find_all_concepts(sentence))


if __name__ == '__main__':
    # default maxsize of 0, infinite queue size
    sentences_q = multiprocessing.Queue()
    output_q = multiprocessing.Queue()

    # put all the input on the queue before the workers start: a worker stops
    # at the first queue.Empty, so it must not find the queue still unfilled
    for s in sentences:
        sentences_q.put(s)

    # any reasonable number of workers; the function runs as each worker's
    # initializer, so the work happens while the pool starts up
    num_processes = 2
    pool = multiprocessing.Pool(num_processes, do_find_all_concepts, (sentences_q, output_q))

    # wait for the entire queue to be processed
    # NOTE(review): a non-blocking get on a multiprocessing.Queue can raise
    # queue.Empty while items are still in flight, and joining workers before
    # their output queue is drained can block once the output is large --
    # confirm against the multiprocessing docs before using on the full data.
    pool.close()
    pool.join()
    while not output_q.empty():
        print(output_q.get())


答案 1 :(得分:1)

这里有两个使用concurrent.futures.ProcessPoolExecutor的解决方案,它们会将任务分配到不同的进程。您的任务似乎受CPU限制,而不是受I/O限制,因此线程可能无济于事。

import re
import concurrent.futures

# using the lists in your example

re_concepts = [re.escape(t) for t in concepts]
all_concepts = re.compile('|'.join(re_concepts), flags=re.DOTALL)

def f(sequence, regex=all_concepts):
    """Return all non-overlapping matches of *regex* in *sequence*.

    *regex* defaults to the module-level compiled pattern ``all_concepts``.
    """
    return regex.findall(sequence)

if __name__ == '__main__':

    # Variant 1: submit one task per sentence and collect the results as they
    # finish, so out1 is in completion order rather than input order.
    out1 = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [executor.submit(f, s) for s in sentences]
        for future in concurrent.futures.as_completed(futures):
            try:
                result = future.result()
            except Exception as e:
                # Report the failed task and keep collecting the others.
                print(e)
            else:
                out1.append(result)

    # Variant 2: Executor.map yields results in the order of the inputs.
    out2 = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for result in executor.map(f, sentences):
            out2.append(result)

Executor.map()有一个chunksize参数:文档说,每次发送包含多于一个可迭代项的块可能是有益的。需要重构该函数才能适应这一点。我用一个仅返回所收到内容的函数进行了测试,但是无论我指定的chunksize是多少,测试函数每次都只返回单个项目。真让人费解。

def h(sequence):
    """Return *sequence* unchanged (identity function)."""
    return sequence



# Test data: a set of random 20-character strings drawn from string.printable
# (13,000,000 draws; any duplicates collapse in the set).
# Relies on `random` and `string` being imported.
data =set(''.join(random.choice(string.printable) for _ in range(20)) for _ in range(13000000))

将该集合pickle(序列化)到io.BytesIO流大约需要7.5秒,而从io.BytesIO流中unpickle(反序列化)则需要9秒。如果使用多进程解决方案,将concepts对象(无论何种形式)pickle到硬盘上,然后让每个进程从硬盘上unpickle,而不是每次创建新进程时都在IPC的两端进行pickle/unpickle,可能会有好处,绝对值得测试——YMMV(效果因情况而异)。pickle后的集合在我的硬盘上占380 MB。



答案 2 :(得分:1)


我认为您可以通过把concepts做成一个集合(在最初构造时就用集合,或者由您的列表转换而来)来缩短搜索时间,然后将每个句子拆分成由一到十个(连续)单词组成的字符串,并测试它们是否是该集合的成员。




# Example: a sentence split into its overlapping windows of four consecutive words.
'data mining is the process of discovering patterns in large data sets involving methods at the intersection of machine learning statistics and database systems'
# becomes
[('data', 'mining', 'is', 'the'),
 ('mining', 'is', 'the', 'process'),
 ('is', 'the', 'process', 'of'),
 ('the', 'process', 'of', 'discovering'),
 ('process', 'of', 'discovering', 'patterns'),
 ('of', 'discovering', 'patterns', 'in'),
 ('discovering', 'patterns', 'in', 'large'),
 ('patterns', 'in', 'large', 'data'),
 ('in', 'large', 'data', 'sets'),
 ('large', 'data', 'sets', 'involving'),
 ('data', 'sets', 'involving', 'methods'),
 ('sets', 'involving', 'methods', 'at'),
 ('involving', 'methods', 'at', 'the'),
 ('methods', 'at', 'the', 'intersection'),
 ('at', 'the', 'intersection', 'of'),
 ('the', 'intersection', 'of', 'machine'),
 ('intersection', 'of', 'machine', 'learning'),
 ('of', 'machine', 'learning', 'statistics'),
 ('machine', 'learning', 'statistics', 'and'),
 ('learning', 'statistics', 'and', 'database'),
 ('statistics', 'and', 'database', 'systems')]


# A set gives constant-time (on average) membership tests, unlike a list.
concepts = set(concepts)
sentence = sentence.split()
# Matching word strings, grouped by length: one-word hits first, then two, then three.
found = []
#one word
for meme in sentence:
    if meme in concepts:
        found.append(meme)
#two words
for meme in zip(sentence, sentence[1:]):
    phrase = ' '.join(meme)
    if phrase in concepts:
        found.append(phrase)
#three words
for meme in zip(sentence, sentence[1:], sentence[2:]):
    phrase = ' '.join(meme)
    if phrase in concepts:
        found.append(phrase)


from itertools import tee
def nwise(iterable, n=2):
    """Return an iterator over overlapping n-tuples of consecutive items.

    s -> (s0,s1), (s1,s2), (s2, s3), ... for n=2
    """
    branches = tee(iterable, n)
    # Branch k has to start k items in, so discard k items from it up front
    # (branch 0 is left untouched because range(0) is empty).
    for offset, branch in enumerate(branches):
        for _ in range(offset):
            next(branch, None)
    return zip(*branches)

这样一来,处理过程如下所示:

sentence = sentence.strip().split()
for n in [1,2,3,4,5,6,7,8,9,10]:
    for meme in nwise(sentence,n):
        if ' '.join(meme) in concepts:
            #keep meme

我制作了一组13e6个随机字符串,每个字符串包含20个字符,以近似concepts:


import random, string
data =set(''.join(random.choice(string.printable) for _ in range(20)) for _ in range(13000000))

在data中测试一个四字符或四十字符的字符串是否为成员,大约需要60纳秒。一个一百个单词的句子包含955个由一到十个单词组成的字符串,因此搜索该句子大约需要60微秒。

您的示例中的第一句话'data mining is the process of discovering patterns in large data sets involving methods at the intersection of machine learning statistics and database systems'有195个可能的概念(由一到十个单词组成的字符串)。以下两个函数的计时大致相同:f约为140微秒,g约为150微秒:



def f(sentence, data=data, nwise=nwise):
    '''iterate over memes in sentence and see if they are in data'''
    sentence = sentence.strip().split()
    found = []
    for n in [1,2,3,4,5,6,7,8,9,10]:
        for meme in nwise(sentence,n):
            meme = ' '.join(meme)
            if meme in data:
                found.append(meme)
    return found

def g(sentence, data=data, nwise=nwise):
    '''make a set of the memes in sentence then find its intersection with data'''
    sentence = sentence.strip().split()
    test_strings = set(' '.join(meme) for n in range(1,11) for meme in nwise(sentence,n))
    found = test_strings.intersection(data)
    return found

在对您的示例数据进行测试后,我发现如果某个概念在一个句子中出现两次,g就无法正确处理。

因此,下面把所有内容整合在一起,并按概念在每个句子中出现的顺序列出。f的新版本将花费更长的时间,但是增加的时间应该相对较少。如果可能的话,您能否发表评论,告诉我它比原始版本慢了多少?(我很好奇。)
