#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import nltk
import re
from nltk.tree import *
from nltk.chunk.util import tagstr2tree
from nltk import word_tokenize, pos_tag
text = "Yarın, Mehmet ile birlikte Ankara'da ki Nüfus Müdürlüğü'ne, Aziz
Yıldırım ile birlikte, Şükrü Saraçoğlu Stadı'na gideceğiz.".decode("utf-8")
tagged_text = pos_tag(word_tokenize(text))
tagged_text2 = word_tokenize(text)
grammar = "NP:{<NNP>+}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(tagged_text)
for tree in result:
print(tree)
wrapped = "(ROOT "+ str(result) + " )" # Add a "root" node at the top
trees = nltk.Tree.fromstring(wrapped, read_leaf=lambda x: x.split("/")[0])
for tree in trees:
print(tree.leaves())
for tree2 in result:
print(nltk.Tree.fromstring(str(tree2), read_leaf=lambda x: x.split("/")[0]))
The Output:
(NP Yar\u0131n/NNP)
(u',', ',')
(NP Mehmet/NNP)
(u'ile', 'NN')
(u'birlikte', 'NN')
(NP Ankara'da/NNP ki/NNP Nufus/NNP Mudurlugu'ne/NNP)
(u',', ',')
(NP Aziz/NNP Y\u0131ld\u0131r\u0131m/NNP)
(u'ile', 'NN')
(u'birlikte', 'NN')
(u',', ',')
(NP Sukru/NNP Saracoglu/NNP Stad\u0131'na/NNP)
(u'gidece\u011fiz', 'NN')
(u'.', '.')
['Yar\\u0131n', ',', 'Mehmet', 'ile', 'birlikte', "Ankara'da", 'ki', 'Nufus', "Mudurlugu'ne", ',', 'Aziz', 'Y\\u0131ld\\u0131r\\u0131m', 'ile', 'birlikte', ',', 'Sukru', 'Saracoglu', "Stad\\u0131'na", 'gidecegiz', '.']
(NP Yar\u0131n)
(u',', ',')
(NP Mehmet)
(u'ile', 'NN')
(u'birlikte', 'NN')
(NP Ankara'da ki Nufus Mudurlugu'ne)
(u',', ',')
(NP Aziz Y\u0131ld\u0131r\u0131m)
(u'ile', 'NN')
(u'birlikte', 'NN')
(u',', ',')
(NP Sukru Saracoglu Stad\u0131'na)
(u'gidece\u011fiz', 'NN')
(u'.', '.')
I referenced from :How can I remove POS tags before slashes in nltk?
I want to grouping proper names and remove the tags but when i used the solution it effects the whole text and after that my chunk parse is gone. I really tried the understand the tree structure but how can i apply the the removing function in for statement. I want my Output like:
My desired output:
[Yar\u0131n]
[,]
[Mehmet]
[ile]
[birlikte]
[Ankara'da ki Nufus Mudurlugu'ne]
...
...
Also i can't deal with utf-8 as you see my output is full of non-ascii characters. How can i deal with it ?
EDIT:
for i in range(len(tree)):
arr.append(nltk.Tree.fromstring(str(tree[i]), read_leaf=lambda x: x.split("/")[0]).leaves())
print(arr[i])
I found what shoul i write in the code but now i have the following error. I think i can't append punctuations on my array.
['Yar\\u0131n']
Traceback (most recent call last):
File "./chunk2.py", line 61, in <module>
arr.append(nltk.Tree.fromstring(str(tree[i]), read_leaf=lambda x: x.split("/")[0]).leaves())
File "/usr/local/lib/python2.7/dist-packages/nltk/tree.py", line 630, in fromstring
cls._parse_error(s, match, open_b)
File "/usr/local/lib/python2.7/dist-packages/nltk/tree.py", line 675, in _parse_error
raise ValueError(msg)
ValueError: Tree.read(): expected u'(' but got ','
at index 0.
","
^
答案 0 :(得分:2)
它比你意识到的效率更低。您正在生成一个解析树,将其转换为字符串,将其包裹起来就好像它是多个树(它不是),然后将包装好的字符串解析回树中。只要您拥有解析树result
,就停止并删除POS标签。
nltk树是一种列表,因此只需迭代树的分支并从叶元组中删除POS标记。要获得所需的格式,还需要在不是NP的单词周围添加一层包装:
...
>>> result = cp.parse(tagged_text)
>>> terms = []
>>> for e in result:
if isinstance(e, tuple):
terms.append([ e[0] ])
else:
terms.append([w for w, t in e])
>>> pprint.pprint(terms)
[['Yarın'],
[','],
['Mehmet'],
['ile'],
['birlikte'],
["Ankara'da", 'ki', 'Nüfus', "Müdürlüğü'ne"],
[','],
['Aziz', 'Yıldırım'],
...
答案 1 :(得分:0)
for i in range(len(tree)):
try:
arr.append(nltk.Tree.fromstring(str(tree[i]), read_leaf=lambda x: x.split("/")[0]).leaves())
print(arr[i])
except ValueError:
arr.append(tree[i])
print(arr[i])
效率不高,但能提供我想要的输出。