我正在预处理文本数据。数据保存在一个CSV文件(输入文件)中。现在,我想打开并读取这些数据,对其进行预处理,然后将结果保存到另一个CSV文件(输出文件)中。我尝试了不同的方法,也在互联网和StackOverflow上查找过,但没有一个答案能解决我的问题。
import re, string, unicodedata
import nltk
import csv
import inflect
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Module-level NLTK helpers. WordNetLemmatizer has to be instantiated: binding
# the bare class would make lem.lemmatize(word) treat the word as `self`.
ps = PorterStemmer()
lem = WordNetLemmatizer()
# Read every cell of the input CSV into one string. The input handle is only
# needed while reading, so it is scoped to this `with` block.
# newline='' lets the csv module handle line endings itself.
with open('file.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    raw_text = ' '.join(cell for row in reader for cell in row)

# The results are written at the end of the script, so the output file has to
# outlive this block; it is opened explicitly instead of in the `with` above.
out_file = open('new_file.csv', 'w', newline='')
writer = csv.writer(out_file)

# NOTE(review): assumes every column holds text to preprocess -- narrow the
# join above to the relevant column if that is not the case.
text = nltk.word_tokenize(raw_text)
def non_ascii(text):
    """Strip non-ASCII characters from every token.

    Each token is NFKD-normalized first so that accented letters decompose
    into a base letter plus combining marks; the ASCII round-trip then keeps
    the base letter and drops everything that cannot be encoded.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list with one (possibly empty) ASCII-only string per token.
    """
    return [
        unicodedata.normalize('NFKD', word)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        for word in text
    ]
def remove_punct(text):
    """Remove punctuation from every token and drop tokens left empty.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list of tokens containing only word characters and whitespace;
        tokens that consisted solely of punctuation are omitted.
    """
    # Compile once instead of looking the pattern up for every token.
    punct = re.compile(r'[^\w\s]')
    stripped = (punct.sub('', word) for word in text)
    return [word for word in stripped if word]
def to_lower(text):
    """Lower-case every token.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list with each token converted via ``str.lower``.
    """
    return [word.lower() for word in text]
def replace_numbers(text):
    """Replace all-digit tokens with their spelled-out form.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list in which every token for which ``str.isdigit()`` is true
        has been passed through ``inflect``'s ``number_to_words``; all other
        tokens are kept unchanged.
    """
    p = inflect.engine()
    return [
        p.number_to_words(word) if word.isdigit() else word
        for word in text
    ]
def remove_stopwords(text):
    """Drop English stop words from the token list.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list containing only the tokens that are not in NLTK's English
        stop-word list. The comparison is exact (case-sensitive).
    """
    # Build the stop-word set once: the original rebuilt the list for every
    # token and then scanned it linearly.
    stop_words = set(stopwords.words('english'))
    return [word for word in text if word not in stop_words]
def stem_words(text):
    """Stem every token with NLTK's Lancaster stemmer.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list with one stem per input token, in the same order.
    """
    stemmer = LancasterStemmer()
    return [stemmer.stem(word) for word in text]
def lemmatize_verbs(text):
    """Lemmatize every token as a verb with NLTK's WordNet lemmatizer.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list with one lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos='v') for word in text]
def normalize(text):
    """Run the token-cleaning pipeline.

    Steps, in order: strip non-ASCII characters, remove punctuation,
    lower-case, spell out digit tokens, remove English stop words.

    Args:
        text: Iterable of string tokens.

    Returns:
        The cleaned list of tokens.
    """
    # NOTE(review): numbers are spelled out after punctuation stripping, so
    # any punctuation inside the generated number words would remain --
    # confirm this ordering is intended.
    text = non_ascii(text)
    text = remove_punct(text)
    text = to_lower(text)
    text = replace_numbers(text)
    text = remove_stopwords(text)
    return text
# Run the cleaning pipeline over the tokenized input.
text = normalize(text)
def stem_lemmatize(text):
    """Produce both the stemmed and the verb-lemmatized form of the tokens.

    Args:
        text: Iterable of string tokens.

    Returns:
        A ``(stems, lemmas)`` tuple of two lists, each computed independently
        from the same input tokens.
    """
    stems = stem_words(text)
    lemmas = lemmatize_verbs(text)
    return stems, lemmas
stems, lemmas = stem_lemmatize(text)
print('Stemmed:\n', stems)
print('\nLemmatized:\n', lemmas)

# writerow() requires the row to write; calling it with no argument raises
# TypeError and saves nothing. Write the stems as one CSV row and the lemmas
# as the next. Using the file object as a context manager closes it as soon
# as both rows are written, which flushes them to disk.
with out_file:
    writer.writerow(stems)
    writer.writerow(lemmas)
# csv_file is managed by its own `with` statement, so it needs no explicit
# close() here.
我要将结果保存到CSV文件中。
答案 0 :(得分:0)
取决于您希望如何在输出CSV文件中存储数据,可以考虑使用 csv.writer 对象的 .writerow() 方法。
您可以将信息按行写入,每次调用写入一行,例如:
writer.writerow(["rowNumber", "day", "dollars"])
注意:writerow() 必须传入一个包含该行各字段的列表(或其他可迭代对象),不能不带参数调用。
我发现了一个有用的网站,它也可以帮助您了解如何写入CSV文件。