我有一个程序,先将文本拆分成句子,再将句子拆分成单词,然后统计每个句子中各词性(part of speech)的数量,并将结果写入 csv 文件。问题是:我需要按类别对句子进行分类。输入是一系列句子,我想根据每个句子末尾的标点符号来确定它的类型:如果是陈述句(肯定句),csv 中的标志(FLAG)为 0;如果是疑问句,标志为 1。我该怎么实现?
这是代码:
# -*- coding: utf-8 -*-
import json
import pymorphy2
import csv
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import re
# To analyse a file instead of the sample text, load it like this:
#     with open('kuprin.txt', 'r') as myfile:
#         text = myfile.read().replace('\n', '')
# NOTE(review): the sample has no space after the sentence-ending punctuation;
# sent_tokenize may return it as a single sentence -- confirm with real input.
text = "Hi!How are you?My name is Jack.What is your name?"

# Part-of-speech tags counted per sentence, in CSV column order.
POS_TAGS = (
    "ADJF", "NOUN", "INTJ", "ADJS", "COMP", "VERB",
    "INFN", "PRTF", "PRTS", "GRND", "NUMR",
    "ADVB", "NPRO", "PRED", "PREP", "CONJ", "PRCL",
)

# Full CSV header; FLAG is last so the header lines up with every data row.
CSV_COLUMNS = ("id", "sentences") + POS_TAGS + ("FLAG",)

# Characters that may legitimately follow the closing punctuation mark.
_CLOSERS = "\"'»)]”’"
# Characters that can terminate a sentence.
_TERMINATORS = ".!?…"


def sentence_flag(sentence):
    """Return 1 when *sentence* is a question, otherwise 0.

    The decision is based on the run of terminating punctuation at the end
    of the sentence (ignoring trailing whitespace and closing quotes or
    brackets): if that run contains a question mark the sentence is treated
    as interrogative, so "What?!" counts as a question too.
    """
    stripped = sentence.rstrip().rstrip(_CLOSERS).rstrip()
    tail = stripped[len(stripped.rstrip(_TERMINATORS)):]
    return 1 if "?" in tail else 0


def new_record(index, sentence):
    """Build the row for one sentence with all POS counters set to zero.

    Keys are inserted in ``CSV_COLUMNS`` order; ``FLAG`` is filled in from
    the sentence's closing punctuation.
    """
    record = {"id": index, "sentences": sentence}
    record.update(dict.fromkeys(POS_TAGS, 0))
    record["FLAG"] = sentence_flag(sentence)
    return record


def analyse(text, morph):
    """Split *text* into sentences and count parts of speech in each one.

    *morph* is the analyzer whose ``parse(word)[0].tag.POS`` supplies the
    tag for a word. Returns a list of row dictionaries (see ``new_record``).
    """
    records = []
    # dict.fromkeys drops repeated sentences, as the previous set() did, but
    # keeps them in text order so ids are stable between runs.
    for sentence in dict.fromkeys(sent_tokenize(text)):
        if not sentence.strip():
            continue
        # Tokenize properly so punctuation is not glued to the words.
        words = word_tokenize(sentence)
        print(words)
        record = new_record(len(records), sentence)
        for word in words:
            pos = morph.parse(word)[0].tag.POS
            # Skips tokens without a POS (None) and any tag with no column.
            if pos not in POS_TAGS:
                continue
            print(word + "---" + str(pos))
            record[pos] += 1
        records.append(record)
    return records


def write_json(records, path):
    """Dump *records* to *path* as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, sort_keys=False, indent=4,
                  separators=(',', ': '))


def write_csv(records, path):
    """Write *records* to *path* as CSV with a ``CSV_COLUMNS`` header row."""
    # newline='' is what the csv module expects for files it writes to.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
        writer.writeheader()
        writer.writerows(records)


def main():
    """Analyse the sample text and write test.json and test.csv."""
    morph = pymorphy2.MorphAnalyzer()
    json_data = analyse(text, morph)
    for el in json_data:
        print(el)
    write_json(json_data, r"test.json")
    for el in json_data:
        print([el[column] for column in CSV_COLUMNS])
    write_csv(json_data, r"test.csv")


if __name__ == "__main__":
    main()