我对 Python 编程还比较陌生,目前正在编写一个词频分析程序:它可以分析整本书,给出每个词的出现频率以及该词在全文中出现的所有位置。程序已经可以正常运行,但耗时很长。有人能就提高这段代码的效率给些建议吗?此外,也欢迎大家就后续的分析方向提出其他想法。感谢您的反馈!
import re
import string
import numpy as nmp
def occurances(word, fulltext):
    """Return the 1-based positions at which ``word`` occurs in ``fulltext``.

    Args:
        word: Token to look for. It is compared for exact equality against
            each token, so case and attached punctuation matter
            ("end." does not match "end").
        fulltext: Either the text itself (anything with a ``split()``
            method, normally a ``str``), which is split on whitespace, or
            an already-tokenized iterable of words. Passing a pre-split
            list lets a caller that looks up many words split the text
            only once.

    Returns:
        A list of positions in ascending order, where the first token of
        the text is position 1. The list is empty when the word never
        occurs.
    """
    split = getattr(fulltext, "split", None)
    tokens = split() if split is not None else fulltext
    return [
        position
        for position, token in enumerate(tokens, start=1)
        if token == word
    ]
# Write every entry of match_pattern (defined earlier in the script;
# presumably the list of words matched in the text -- confirm) to
# positions.txt, tagged with its running index, and tally how often each
# word occurs in the frequency dictionary along the way.
#
# Opening in "w" mode already empties any previous contents, so no explicit
# truncate is needed, and the with-block closes the file even if a write
# raises part-way through.
with open("positions.txt", "w") as text_file:
    for iterationvar, word in enumerate(match_pattern):
        frequency[word] = frequency.get(word, 0) + 1
        # Entries are written back to back as word[index], with no
        # separator or newline between them.
        text_file.write("{0}[{1}]".format(word, iterationvar))

# TODO: build a dictionary holding, for each word, the list of every
# position where it occurred in the book (to look at sentiment over time),
# and a word-length dictionary for graphing. Sentiment analysis will be
# easier once a sentiment can be assigned to each word.
# Write one line per distinct word to patterns.txt, most frequent word
# first, in the form:  word,[pos1, pos2, ...]
# where each position is the 1-based index of a matching token among the
# whitespace-delimited tokens of fulltext.
frequency_list = frequency.keys()

print("sorting now")
sort_words = sorted(frequency.items(), key=lambda x: x[1], reverse=True)
total = len(sort_words)
print("processing", total, "words...")

# Index the text once instead of rescanning (and re-splitting) the whole
# book for every distinct word. Each token maps to the ascending list of
# its 1-based positions, which is exactly what a per-word scan of
# fulltext.split() would produce.
positions_by_token = {}
for position, token in enumerate(fulltext.split(), start=1):
    positions_by_token.setdefault(token, []).append(position)

# Progress milestones expressed as integer loop counts, so that they fire
# for any total rather than only when total happens to divide evenly.
# The thirds keep their original (approximate) 30/60 percent labels.
# Milestones that round down to 0 are skipped for very small inputs.
milestones = [
    (threshold, message)
    for threshold, message in (
        (total // 10, "10 percent complete"),
        (total // 4, "25 percent complete"),
        (total // 3, "30 percent complete"),
        (total // 2, "50 percent complete"),
        (2 * total // 3, "60 percent complete"),
        (3 * total // 4, "75 percent complete"),
    )
    if threshold > 0
]

# "w" mode empties the previous contents; the with-block guarantees the
# file is closed even if a write fails.
with open("patterns.txt", "w") as text_file:
    for t, (entry_word, _occurrence_count) in enumerate(sort_words):
        # A word that never appears as a bare whitespace-delimited token
        # (for example because punctuation is attached to it in the text)
        # gets an empty list.
        positions = positions_by_token.get(entry_word, [])
        text_file.write("{0},{1}\n".format(entry_word, positions))

        # Lightweight heartbeat: one dot for every 200 words written.
        if t % 200 == 0:
            print(".", end="")
        for threshold, message in milestones:
            if t == threshold:
                print(message)