ValueError:CoNLL(10)或Malt-Tab(4)格式不支持制表符分隔的字段数(12)

时间:2019-02-12 23:35:15

标签: python multiprocessing stanford-nlp

具有以下代码,以使用StanfordDependencyParsermultiprocessing包提取sn-gram:

import multiprocessing
from nltk.parse.stanford import StanfordDependencyParser

parser = StanfordDependencyParser(model,jar)

def S2Gram(twt):
    words = []
    pos_tags = []
    try:
        result = parser.raw_parse(twt)
        dep = next(result)
        s_bigrams = list(dep.triples())
        for sgram in s_bigrams:
            words.append(sgram[0][0]+' '+sgram[2][0])
            pos_tags.append(sgram[0][1]+' '+sgram[2][1])
    except AssertionError:
        pass
    return (words,pos_tags)

def process_raw(df):
    res = df['tweet_text'].apply(lambda x: pd.Series(S2Gram(x)))
    return res

if __name__ == '__main__':
    num_processes = multiprocessing.cpu_count()
    p = multiprocessing.Pool(processes=num_processes)
    split_dfs = np.array_split(df,num_processes)
    pool_results = p.map(process_raw, split_dfs)
    p.close()
    p.join()
    parts = pd.concat(pool_results, axis=0)
    parts.columns = ['s2gram_words_raw','s2gram_tags_raw']
    df = pd.concat([df, parts], axis=1)

它可以工作几个小时,但突然出现以下错误:

RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/dependencygraph.py", line 351, in _parse
cell_extractor = extractors[cell_number]
KeyError: 12

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "~/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "~/anaconda3/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
  File "<ipython-input-15-184e525bf1e7>", line 27, in process_raw
    res = df['tweet_text'].apply(lambda x: pd.Series(S2Gram(x)))
  File "`/anaconda3/lib/python3.6/site-packages/pandas/core/series.py", line 3192, in apply
mapped = lib.map_infer(values, f, convert=convert_dtype)
  File "pandas/_libs/src/inference.pyx", line 1472, in pandas._libs.lib.map_infer
  File "<ipython-input-15-184e525bf1e7>", line 27, in <lambda>
res = df['tweet_text'].apply(lambda x: pd.Series(S2Gram(x)))
  File "<ipython-input-15-184e525bf1e7>", line 12, in S2Gram
result = parser.raw_parse(twt)
  File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/stanford.py", line 158, in raw_parse
return next(self.raw_parse_sents([sentence], verbose))
  File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/stanford.py", line 180, in raw_parse_sents
self._execute(cmd, '\n'.join(sentences), verbose)
  File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/stanford.py", line 110, in _parse_trees_output
res.append(iter([self._make_tree('\n'.join(cur_lines))]))
  File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/stanford.py", line 411, in _make_tree
return DependencyGraph(result, top_relation_label='root')
  File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/dependencygraph.py", line 90, in __init__
top_relation_label=top_relation_label,
  File "~/anaconda3/lib/python3.6/site-packages/nltk/parse/dependencygraph.py", line 355, in _parse
'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
ValueError: Number of tab-delimited fields (12) not supported by CoNLL(10) or Malt-Tab(4) format
"""

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
<ipython-input-15-184e525bf1e7> in <module>()
     32     p = multiprocessing.Pool(processes=num_processes)
     33     split_dfs = np.array_split(df,num_processes)
---> 34     pool_results = p.map(process_raw, split_dfs)
     35     p.close()
     36     p.join()

~/anaconda3/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
    264         in a list that is returned.
    265         '''
--> 266         return self._map_async(func, iterable, mapstar, chunksize).get()
    267 
    268     def starmap(self, func, iterable, chunksize=None):

~/anaconda3/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
    642             return self._value
    643         else:
--> 644             raise self._value
    645 
    646     def _set(self, i, obj):

ValueError: Number of tab-delimited fields (12) not supported by CoNLL(10) or Malt-Tab(4) format

任何解决此问题的帮助将不胜感激。

0 个答案:

没有答案