具有以下代码,以使用StanfordDependencyParser
和multiprocessing
包提取sn-gram:
import multiprocessing
from nltk.parse.stanford import StanfordDependencyParser
parser = StanfordDependencyParser(model,jar)
def S2Gram(twt):
words = []
pos_tags = []
try:
result = parser.raw_parse(twt)
dep = next(result)
s_bigrams = list(dep.triples())
for sgram in s_bigrams:
words.append(sgram[0][0]+' '+sgram[2][0])
pos_tags.append(sgram[0][1]+' '+sgram[2][1])
except AssertionError:
pass
return (words,pos_tags)
def process_raw(df):
res = df['tweet_text'].apply(lambda x: pd.Series(S2Gram(x)))
return res
if __name__ == '__main__':
num_processes = multiprocessing.cpu_count()
p = multiprocessing.Pool(processes=num_processes)
split_dfs = np.array_split(df,num_processes)
pool_results = p.map(process_raw, split_dfs)
p.close()
p.join()
parts = pd.concat(pool_results, axis=0)
parts.columns = ['s2gram_words_raw','s2gram_tags_raw']
df = pd.concat([df, parts], axis=1)
它可以工作几个小时,但突然出现以下错误:
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/dependencygraph.py", line 351, in _parse
cell_extractor = extractors[cell_number]
KeyError: 12
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "~/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "~/anaconda3/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
File "<ipython-input-15-184e525bf1e7>", line 27, in process_raw
res = df['tweet_text'].apply(lambda x: pd.Series(S2Gram(x)))
File "`/anaconda3/lib/python3.6/site-packages/pandas/core/series.py", line 3192, in apply
mapped = lib.map_infer(values, f, convert=convert_dtype)
File "pandas/_libs/src/inference.pyx", line 1472, in pandas._libs.lib.map_infer
File "<ipython-input-15-184e525bf1e7>", line 27, in <lambda>
res = df['tweet_text'].apply(lambda x: pd.Series(S2Gram(x)))
File "<ipython-input-15-184e525bf1e7>", line 12, in S2Gram
result = parser.raw_parse(twt)
File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/stanford.py", line 158, in raw_parse
return next(self.raw_parse_sents([sentence], verbose))
File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/stanford.py", line 180, in raw_parse_sents
self._execute(cmd, '\n'.join(sentences), verbose)
File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/stanford.py", line 110, in _parse_trees_output
res.append(iter([self._make_tree('\n'.join(cur_lines))]))
File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/stanford.py", line 411, in _make_tree
return DependencyGraph(result, top_relation_label='root')
File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/dependencygraph.py", line 90, in __init__
top_relation_label=top_relation_label,
File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/dependencygraph.py", line 355, in _parse
'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
ValueError: Number of tab-delimited fields (12) not supported by CoNLL(10) or Malt-Tab(4) format
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-15-184e525bf1e7> in <module>()
32 p = multiprocessing.Pool(processes=num_processes)
33 split_dfs = np.array_split(df,num_processes)
---> 34 pool_results = p.map(process_raw, split_dfs)
35 p.close()
36 p.join()
~/anaconda3/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
264 in a list that is returned.
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
268 def starmap(self, func, iterable, chunksize=None):
~/anaconda3/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
ValueError: Number of tab-delimited fields (12) not supported by CoNLL(10) or Malt-Tab(4) format
任何解决此问题的帮助将不胜感激。