我正在使用 Python 3 编写代码,并且得到了一个包含正面“词语”的列表,但其中有些项带有空格:
posWords = ['beautiful', 'love', 'happy day', 'enjoy', 'smelling flowers']
但是,我需要从中分析正面词的文本已被拆分成单词列表,其中每一项都不含空格:
wordList = ['I', 'enjoy', 'smelling', 'flowers', 'on', 'a', 'happy', 'day']
我想遍历 wordList,当算法遇到在 posWords 中以合并形式存在的单词(例如 'happy day')时,删除 wordList 中对应的各个单词('happy'、'day'),然后在 wordList 中加入合并后的版本。
所以最后,wordList 必须看起来像这样:
wordList = ['I', 'enjoy', 'smelling flowers', 'on', 'a', 'happy day']
大更新:
因为我答应过会向大家同步进展,所以这是到目前为止的代码。这有点棘手,因为在我的正面词和负面词列表中,短语最多包含三个词,所以我需要弄清楚如何处理这种情况。我意识到(也多亏了你们给我的回答,再次感谢!),我必须把待分析文本中的所有单词组合成由 3 个、2 个或 1 个单词构成的字符串项,并放入列表,这样才能检查这些项是否也出现在我的正面词列表和负面词列表中。下面是我目前的代码。它有点笨拙,有很多复制粘贴……我打算整理一下,但是我很累,周末也刚开始,所以请不要嫌弃!(欢迎提出建议)
from textblob import TextBlob
# Read the negative lexicon: one lower-cased phrase per line of the file.
# The `with` block closes the file handle as soon as the list is built.
with open("neg_dutch_2.txt") as negatives:
    neg_list = [line.lower().strip("\n") for line in negatives]

# Buckets for the negative phrases, keyed by how many spaces each contains;
# they are filled by neg_how_many_spaces().
neg_no_space = []
neg_one_space = []
neg_two_spaces = []
neg_three_spaces = []
count = 0
def neg_how_many_spaces(neg_list, neg_no_space, neg_one_space, neg_two_spaces,
                        neg_three_spaces, count):
    """Sort the negative phrases into buckets by their number of spaces.

    Each phrase of ``neg_list`` is appended (in place) to exactly one of the
    bucket lists: one, two or three spaces go to the matching bucket, every
    other count (zero, or more than three) goes to ``neg_no_space``.
    Empty strings, e.g. from blank lines in the word file, are skipped.

    Args:
        neg_list: phrases to classify.
        neg_no_space: bucket for phrases without a 1-3 space count.
        neg_one_space: bucket for phrases with exactly one space.
        neg_two_spaces: bucket for phrases with exactly two spaces.
        neg_three_spaces: bucket for phrases with exactly three spaces.
        count: kept for backward compatibility; it no longer influences the
            classification (a non-zero value used to skew the first phrase).

    Returns:
        A tuple ``(neg_list, neg_no_space, neg_one_space, neg_two_spaces,
        neg_three_spaces, count)`` where ``count`` is 0 once at least one
        phrase was looked at, otherwise the value that was passed in.
    """
    buckets = {1: neg_one_space, 2: neg_two_spaces, 3: neg_three_spaces}
    for phrase in neg_list:
        # The counter is reset after every phrase, so it ends at zero.
        count = 0
        if not phrase:
            # Nothing to classify; indexing an empty string would raise.
            continue
        buckets.get(phrase.count(" "), neg_no_space).append(phrase)
    # Parentheses are required for the return value to span two lines.
    return (neg_list, neg_no_space, neg_one_space, neg_two_spaces,
            neg_three_spaces, count)
# Distribute the negative phrases over the buckets (lists are filled in place).
neg_how_many_spaces(neg_list, neg_no_space, neg_one_space,
                    neg_two_spaces, neg_three_spaces, count)

# Read the positive lexicon: one lower-cased phrase per line of the file.
# The `with` block closes the file handle as soon as the list is built.
with open("pos_dutch_2.txt") as positives:
    pos_list = [line.lower().strip("\n") for line in positives]

# Buckets for the positive phrases, keyed by how many spaces each contains;
# they are filled by pos_how_many_spaces().
pos_no_space = []
pos_one_space = []
pos_two_spaces = []
pos_three_spaces = []
count = 0
def pos_how_many_spaces(pos_list, pos_no_space, pos_one_space, pos_two_spaces,
                        pos_three_spaces, count):
    """Sort the positive phrases into buckets by their number of spaces.

    Each phrase of ``pos_list`` is appended (in place) to exactly one of the
    bucket lists: one, two or three spaces go to the matching bucket, every
    other count (zero, or more than three) goes to ``pos_no_space``.
    Empty strings, e.g. from blank lines in the word file, are skipped.

    Args:
        pos_list: phrases to classify.
        pos_no_space: bucket for phrases without a 1-3 space count.
        pos_one_space: bucket for phrases with exactly one space.
        pos_two_spaces: bucket for phrases with exactly two spaces.
        pos_three_spaces: bucket for phrases with exactly three spaces.
        count: kept for backward compatibility; it no longer influences the
            classification (a non-zero value used to skew the first phrase).

    Returns:
        A tuple ``(pos_list, pos_no_space, pos_one_space, pos_two_spaces,
        pos_three_spaces, count)`` where ``count`` is 0 once at least one
        phrase was looked at, otherwise the value that was passed in.
    """
    buckets = {1: pos_one_space, 2: pos_two_spaces, 3: pos_three_spaces}
    for phrase in pos_list:
        # The counter is reset after every phrase, so it ends at zero.
        count = 0
        if not phrase:
            # Nothing to classify; indexing an empty string would raise.
            continue
        buckets.get(phrase.count(" "), pos_no_space).append(phrase)
    # Parentheses are required for the return value to span two lines.
    return (pos_list, pos_no_space, pos_one_space, pos_two_spaces,
            pos_three_spaces, count)
# Distribute the positive phrases over the buckets (lists are filled in place).
pos_how_many_spaces(pos_list, pos_no_space, pos_one_space,
                    pos_two_spaces, pos_three_spaces, count)

# Read the whole article file; the `with` block closes the handle afterwards.
with open("nrc_sample.TXT") as article_file:
    text = article_file.read()
# TextBlob is used to separate the text into words.
blob = TextBlob(text)

# these are words that are bound to the meta-data of the articles file
ruis = ["DOCUMENTS", "SECTION", "LENGTH", "LOAD-DATE", "LANGUAGE",
        "PUBLICATION-TYPE", "JOURNAL-CODE", "BYLINE", "All", "Rights",
        "Reserved", "Copyright", "krant", "Krant", "KRANT", "blz"]

# make a list for all the words in the articles
word_list = []
# and store every word in that list
for word in blob.words:
    # NOTE(review): this is a substring test, so any word that merely
    # contains a noise term (e.g. "All") is dropped too -- confirm intended.
    if not any(x in word for x in ruis):
        word = word.lower()
        # keep purely alphabetic tokens only
        if word.isalpha():
            word_list.append(word)
def _ngrams(words, size):
    """Return every run of `size` consecutive words, joined by single spaces.

    The range ends at ``len(words) - size + 1`` so the final window (the one
    that contains the last word) is included; a list shorter than `size`
    yields an empty result.
    """
    return [" ".join(words[i:i + size]) for i in range(len(words) - size + 1)]


# variables for the frequencies of negative and positive words in articles
amount_pos = 0
amount_neg = 0

# Every window of four, three, two and one consecutive article words.
phrases_four = _ngrams(word_list, 4)
phrases_three = _ngrams(word_list, 3)
phrases_two = _ngrams(word_list, 2)
phrases_one = _ngrams(word_list, 1)

# PHRASE 4: compare with the negative phrases that contain three spaces
count = 0
for phrase in phrases_four:
    print("phrase4", count, phrase)
    count += 1
    for neg in neg_three_spaces:
        if phrase == neg:
            print("negatief woord^")
            amount_neg += 1
print(amount_neg)

# PHRASE 3: compare with the negative phrases that contain two spaces
count = 0
for phrase in phrases_three:
    print("phrase3", count, phrase)
    count += 1
    for neg in neg_two_spaces:
        if phrase == neg:
            print("negatief woord^")
            amount_neg += 1
print(amount_neg)

# PHRASE 2: compare with the negative phrases that contain one space
# (the counter is only kept for readability while debugging)
count = 0
for phrase in phrases_two:
    count += 1
    for neg in neg_one_space:
        if phrase == neg:
            amount_neg += 1
print(amount_neg)

# JUST A WORD: compare with the negative entries that contain no space
count = 0
for phrase in phrases_one:
    print("phrase1", count, phrase)
    count += 1
    for neg in neg_no_space:
        if phrase == neg:
            print("negatief woord^")
            amount_neg += 1
print(amount_neg)
答案 0 :(得分:1)
这是一种实现方法:
posWords = ['beautiful', 'love', 'happy day', 'enjoy', 'smelling flowers']
wordList = ['I', 'enjoy', 'smelling', 'flowers', 'on', 'a', 'happy', 'day']


def merge_phrases(words, phrases):
    """Return a copy of `words` with known multi-word phrases merged.

    Wherever the words of a phrase from `phrases` appear consecutively in
    `words`, they are replaced by the phrase itself. Matching is done on
    whole words, so "happy day" does not match inside "unhappy day", and the
    result does not depend on the order of `phrases`. When several phrases
    start at the same position the longest one wins.

    Args:
        words: list of single words (not modified).
        phrases: candidate phrases; entries with a single word are ignored.

    Returns:
        A new list with every phrase occurrence collapsed into one item.
    """
    # Keep only real phrases, as (list of words, original phrase) pairs,
    # longest first so the greedy scan prefers the longest match.
    multi_word = [(p.split(), p) for p in phrases if len(p.split()) > 1]
    multi_word.sort(key=lambda pair: len(pair[0]), reverse=True)

    merged = []
    position = 0
    while position < len(words):
        for tokens, phrase in multi_word:
            if words[position:position + len(tokens)] == tokens:
                merged.append(phrase)
                position += len(tokens)
                break
        else:
            # No phrase starts here: keep the single word as it is.
            merged.append(words[position])
            position += 1
    return merged


wordList = merge_phrases(wordList, posWords)
print(wordList)
输出:
['I', 'enjoy', 'smelling flowers', 'on', 'a', 'happy day']
答案 1 :(得分:1)
这是另一种适用于任何短语长度的方法:
posWords = ['beautiful', 'love', 'happy day', 'enjoy', 'smelling flowers']
wordList = ['I', 'enjoy', 'smelling', 'flowers', 'on', 'a', 'happy', 'day']


def merge_known_phrases(words, phrases):
    """Collapse consecutive words of `words` that form a known phrase.

    The list is modified in place: every run of items equal to the words of
    a multi-word phrase, in the same order, is replaced by that phrase.
    Phrases whose words are missing or not adjacent are left alone instead
    of raising, and phrases of any length are supported.

    Args:
        words: list of single words; updated in place.
        phrases: candidate phrases; entries without a space are ignored.

    Returns:
        The same `words` list, for convenience.
    """
    for phrase in phrases:
        parts = phrase.split(' ')
        size = len(parts)
        if size < 2:
            continue
        start = 0
        # len(words) is re-evaluated each round because merging shrinks it.
        while start <= len(words) - size:
            if words[start:start + size] == parts:
                # Slice assignment swaps the whole run for the single phrase.
                words[start:start + size] = [phrase]
            start += 1
    return words


merge_known_phrases(wordList, posWords)
输出将是:
['I', 'enjoy', 'smelling flowers', 'on', 'a', 'happy day']
但是,例如,如果我们输入以下内容:
posWords = ['beautiful', 'love', 'happy day', 'enjoy','smelling flowers on']
wordList = ['I', 'enjoy', 'smelling', 'flowers', 'on', 'a', 'happy', 'day']
输出将是:
['I', 'enjoy', 'smelling flowers on', 'a', 'happy day']
答案 2 :(得分:0)
您可以执行以下操作:
In [711]: s = ''.join(posWords)
In [712]: s
Out[712]: 'beautifullovehappy dayenjoysmelling flowers'
In [672]: n = []
In [673]: for i in wordList:
...: if i in s:
...: n.append(i)
...:
In [713]: n
Out[713]: ['enjoy', 'smelling', 'flowers', 'a', 'happy', 'day']
In [740]: for c, i in enumerate(n):
...: if c+1 < len(n):
...: word = n[c] + ' ' + n[c+1]
...: if word in posWords:
...: ix1 = wordList.index(n[c])
...: del wordList[ix1: ix1+2]
...: wordList.insert(ix1,word)
...:
In [710]: wordList
Out[710]: ['I', 'enjoy', 'smelling flowers', 'on', 'a', 'happy day']
让我知道这是否有帮助。
答案 3 :(得分:0)
另一种方法:
>>> m=["good bad", "enjoy", "play"]
>>> l=["good", "bad", "happy", "delight"]
>>>
>>> for e in m:
... tmp = e.split(" ")
... if(len(tmp) > 1):
... l = [ent for ent in l if ent not in tmp]
... l.append(" ".join(tmp))
...
>>>
>>> l
['happy', 'delight', 'good bad']