我试图将一段用印地语写的段落拆分成句子。问题在于,并非该段落中的所有句子都以“।”(danda,有时被写作“|”)结尾,因此直接使用 split() 的办法行不通。有人能推荐一种解决方案吗?
以下是段落:
विउपयोगपपपउपयोगउपयोगउपयोगउपयोगउपयोगउपयोगउपयोगउपयोगउपयोगउपयोगउपयोगउपयोगउपयोगउपयोग भवसमसमसमसमाााााासमसमसमसमसमसमसम ीवपीीीीीीीीीीी,,,,,,,,,,,2001 p>ाठठजएंपहैं。p p p p
这是我的代码:
import codecs
import re
class Tokenizer():
    """Clean a block of Hindi text and split it into sentences.

    The text is normalised by ``cleanText`` (digits and a fixed set of
    punctuation marks are removed) and then cut into sentences by
    ``getSentence``.  Sentences are split on the danda, the double danda,
    the ASCII pipe, ``?`` and ``!``, because not every sentence in real
    text ends with a danda.

    Attributes:
        text: The cleaned text (always a text string, never bytes).
        sentences: The most recent result of ``getSentence``.
        bsentences: Sentences obtained by splitting ``text`` on terminators.
        asentences: Extra sentences appended after ``bsentences``; nothing
            in this class populates it, so it stays empty unless a caller
            fills it.
        final_sentences: ``bsentences + asentences`` as of the last
            ``getSentence`` call.
        final_list: Kept for compatibility; never written by this class.
        stopwords: Stop words loaded by the last ``getSentence`` call.
    """

    # Single characters removed outright: , " : ' . and the curly double
    # quotes U+201C / U+201D.  The original code stripped the straight
    # double quote twice in a row; presumably the second call was meant for
    # the typographic quotes, so both are covered here.
    _STRIP_TABLE = dict.fromkeys(map(ord, u",\":'.\u201c\u201d"))

    # re.UNICODE so Devanagari digits are stripped as well as ASCII ones,
    # identically on Python 2 and Python 3.
    _DIGIT_RUN = re.compile(u"\\d+", re.UNICODE)

    # Danda (U+0964), double danda (U+0965), pipe, question mark and
    # exclamation mark.  '.' cannot be used as a terminator because
    # cleanText removes it before splitting.
    _TERMINATORS = re.compile(u"[\u0964\u0965|?!]+")

    def __init__(self, text):
        """Store ``text`` (UTF-8 bytes or a text string) and clean it."""
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        self.text = text
        self.stopwords = []
        self._reset_sentences()
        self.cleanText()

    def _reset_sentences(self):
        """Empty every sentence list so stale results never leak through."""
        self.sentences = []
        self.final_list = []
        self.bsentences = []
        self.asentences = []
        self.final_sentences = []

    def readFromFile(self, filename):
        """Replace the current text with the UTF-8 contents of ``filename``.

        Raises:
            IOError / OSError: If the file cannot be opened or read.
        """
        # The context manager closes the handle even if read() fails.
        with codecs.open(filename, encoding='utf-8') as f:
            self.text = f.read()
        self._reset_sentences()
        self.cleanText()

    def print_sentences(self, sentences=None):
        """Print one sentence per line.

        Args:
            sentences: Iterable of text strings to print.  Defaults to the
                result of the last ``getSentence`` call.
        """
        if sentences is None:
            sentences = self.sentences
        for sentence in sentences:
            # On Python 2 a unicode object is not a ``str`` and must be
            # encoded before printing; on Python 3 text prints directly.
            if not isinstance(sentence, str):
                sentence = sentence.encode('utf-8')
            print(sentence)

    def cleanText(self):
        """Strip digits and selected punctuation from ``self.text`` in place."""
        text = self._DIGIT_RUN.sub(u'', self.text)
        text = text.translate(self._STRIP_TABLE)
        # Doubled curly single quotes are removed only as pairs, as in the
        # original; a lone curly single quote is left alone.
        for pair in (u"\u2018\u2018", u"\u2019\u2019"):
            text = text.replace(pair, u'')
        self.text = text

    def getSentence(self, stopwords_file="stopwords.txt"):
        """Split the cleaned text into sentences and return them.

        The stop-word file is still read (one word per line, UTF-8) and
        exposed as ``self.stopwords``.  The original code built a filtered
        list from it that was always empty and never used, so no filtering
        is applied to the sentences.

        Args:
            stopwords_file: Path of the stop-word list.

        Returns:
            A list of non-empty, whitespace-stripped sentences without
            their terminating punctuation.

        Raises:
            IOError / OSError: If ``stopwords_file`` cannot be read.
        """
        with codecs.open(stopwords_file, encoding='utf-8') as sw:
            words = (line.strip() for line in sw)
            self.stopwords = [word for word in words if word]

        pieces = (piece.strip() for piece in self._TERMINATORS.split(self.text))
        self.bsentences = [piece for piece in pieces if piece]
        # Recomputed on every call: the original evaluated this once in
        # __init__, while both lists were still empty.
        self.final_sentences = self.bsentences + self.asentences
        self.sentences = self.final_sentences
        return self.final_sentences

    # Plural alias, kept because existing callers use this spelling.
    getSentences = getSentence
# Driver: load sample.txt, split it into sentences and print them.
t = Tokenizer('')
t.readFromFile('sample.txt')
# The method is named getSentence (singular), and print_sentences takes the
# list to print, so the result is passed straight through.
t.print_sentences(t.getSentence())
答案 0 :(得分:0)
def sentencesplit_hindi(paragraph):
    """Split a Hindi paragraph into sentences.

    A sentence boundary is any run of whitespace that directly follows a
    danda (U+0964), a pipe ``|``, ``!`` or ``?``.  The terminator stays
    attached to the sentence it ends.

    Args:
        paragraph: The text to split (a text string, not bytes).

    Returns:
        A list of the non-empty sentences; an empty paragraph gives ``[]``.
    """
    import re

    # A plain u"" literal with an escaped backslash replaces the original
    # ur"" prefix, which is a SyntaxError on Python 3.  The danda is written
    # as an escape so the pattern does not depend on the file's encoding.
    sentence_enders = re.compile(u"(?<=[|!?\u0964])\\s+", re.UNICODE)

    # Whitespace after the final terminator would otherwise yield a
    # trailing empty string, so empty fragments are dropped.
    return [part for part in sentence_enders.split(paragraph) if part]