This text-mining Python program needs to be more efficient

Posted: 2020-07-22 22:09:14

Tags: python performance text nlp statistics

I am fairly new to Python programming, and I am working on a word-frequency analysis program that analyzes an entire book and reports each word's frequency as well as every position where it appears in the full text. I have it working, but it takes a very long time. Does anyone have suggestions for making this code more efficient? Any other ideas for directions the analysis could take are also welcome. Thanks for your feedback!

import re
import string
import numpy as nmp

def occurances(word, fulltext):
    # this function receives a word and returns a list of all of the occurrences of that word
    # it will not be efficient, but it will work
    locations = []  # will hold the positions where matching words are found within 1984.txt
    # fulltextfile = open('1984.txt', 'r')
    # fulltext = fulltextfile.read().lower()  # make lowercase to parse correctly
    # for speed this could be done globally, saving a new file with the lowercasing already done

    #check for word in index and append to index with position
    #position = re.search(word, index)

    fulltext = fulltext.split()  # separate the text into its words for analysis; the loop below
    # now iterates over words, not letters
    position = 0  # initialization for the position counter
    count = 0  # counts the matches appended to the list; it will equal the number of
    # occurrences of the word in the string
    for item in fulltext:
        position += 1
        if item == word:
            # word has been found: append the location to the list
            # print(item, " @ ", position)

            # ----- the two lines below are alternative ways of saving the position

            locations.append(position)  # save position in locations at index = count
            # locations[count] = position  # save position in locations at index = count

            count += 1  # found one occurrence
    
    #fulltextfile.close()
    return locations
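
# A faster alternative I am sketching but have not wired in yet: build an index of every
# word's positions in a single pass, instead of re-splitting the whole book for each word.
from collections import defaultdict

def build_position_index(text):
    # word -> list of 1-based positions, filled in one pass over the text
    index = defaultdict(list)
    for pos, token in enumerate(text.split(), start=1):
        index[token].append(pos)
    return index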

text_file = open("positions.txt", "w")
text_file.truncate(0)#clearing the previous contents

iterationvar = 0
for word in match_pattern:
    count = frequency.get(word, 0)  # working here
    # here = position.get(word,)
    frequency[word] = count + 1
    # position[word].append(y)
    text_file.write("{0}".format(word))
    text_file.write("[{0}]".format(iterationvar))
    # this is the original below
    """ text_file.write("{0},".format(word))
    text_file.write("{0}\n".format(iterationvar)) """
    iterationvar += 1
    # y += 1
    # I am working on the loop above. Goal: get a dictionary full of lists, one list per word,
    # holding every position where it occurred in the book. This gives an idea of sentiment
    # over time. Copy the syntax of the frequency dictionary above, but store a list instead
    # of the 0 on the right.

# also add a word-length dictionary so that can be graphed... but positions first.
# then sentiment analysis will be easier once we can assign a sentiment to each word.
text_file.close()

frequency_list = frequency.keys()
 
text_file = open("patterns.txt", "w")
text_file.truncate(0)#clearing the previous contents

# uncomment this section (down to the end of the for loop) to display the frequencies in the command line
#for words in frequency_list:
    #print(words, frequency[words])

print("sorting now")

sort_words = sorted(frequency.items(), key=lambda x: x[1], reverse=True)
t = 0
total = len(sort_words)

#math declarations

print("processing word: ",word, "in ",total," lines...")
for i in sort_words:
    #print(i[0], i[1])
    var1 = i[0]
    var2 = i[1]
    var3 = occurances(i[0],fulltext) #this is highly inefficient and is taking a long time. 
    text_file.write("{0},".format(var1))
    #text_file.write("{0},".format(var2))
    text_file.write("{0}\n".format(var3))
    if t % 200 == 0:
        print(".",end="")
    
    # progress milestones (integer division so the equality check can actually match,
    # with labels that match the fractions being tested)
    if t == total // 4:
        print("25 percent complete")
    if t == total // 3:
        print("33 percent complete")
    if t == total // 2:
        print("50 percent complete")
    if t == 2 * total // 3:
        print("67 percent complete")
    if t == 3 * total // 4:
        print("75 percent complete")

    t+=1

    
#include a position in the book

#print("starting occurances")
#occurancesarray = occurances("just")
#print(occurancesarray)
#print(wordsarray)

# successfully implemented the occurrences function. Now iterate across the index and save the
# array of positions for each word into a matrix.



text_file.close()
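
For completeness, here is the kind of single-pass restructuring I have been considering but have not tested yet: split the book once, record every word's position as you go, and derive the frequencies from the lengths of those position lists, so the per-word occurances() rescan disappears. The analyze_book name and the exact output format are placeholders, and it still relies on plain whitespace splitting like the code above, so punctuation handling would still need to be added.

from collections import defaultdict

def analyze_book(path):
    # read and lowercase the book once
    with open(path, 'r') as f:
        text = f.read().lower()

    # one pass over the words: record each word's 1-based position as we go
    positions = defaultdict(list)
    for pos, token in enumerate(text.split(), start=1):
        positions[token].append(pos)

    # frequency falls out of the index for free
    frequency = {w: len(locs) for w, locs in positions.items()}
    return positions, frequency

positions, frequency = analyze_book('1984.txt')

# write the same patterns.txt layout: word, then its list of positions, most frequent first
with open('patterns.txt', 'w') as out:
    for w, count in sorted(frequency.items(), key=lambda x: x[1], reverse=True):
        out.write("{0},{1}\n".format(w, positions[w]))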

0 Answers:

No answers