使用python 3.4.3进行文本文件索引

时间:2015-05-17 13:23:39

标签: python python-3.x indexing

我尝试编写Python 3.4代码来从外部索引文本文档 这是我的尝试。运行时出现错误信息:

  

未定义原始输入

我想要的是:

  1. 用于标记出python 34文件夹
  2. 的文档
  3. 删除停用词
  4. 来阻止
  5. 索引
  6. 代码:

      import string
    
      def RemovePunc():
            line = []
            i = 0
            text_input = ""
            total_text_input = "C:Users\Kelil\Desktop\IRS_Assignment\project.txt"
            #This part removes the punctuation and converts input text to lowercase
            while i != 1:
                text_input = raw_input 
                if text_input == ".":
                    i = 1
                else:
                    new_char_string = "" 
                    for char in text_input:
                        if char in string.punctuation:
                            char = " "
    
                        new_char_string = new_char_string + char
    
                    line = line + [new_char_string.lower()]
                    #This is a list with all of the text that was entered in
                    total_text_input = (total_text_input + new_char_string).lower()
            return line
    
    def RemoveStopWords(line):
            line_stop_words = []
            stop_words = ['a','able','about','across','after','all','almost','also','am','among',
                 'an','and','any','are','as','at','be','because','been','but','by','can',
                 'cannot','could','dear','did','do','does','either','else','ever','every',
                 'for','from','get','got','had','has','have','he','her','hers','him','his',
                 'how','however','i','if','in','into','is','it','its','just','least','let',
                 'like','likely','may','me','might','most','must','my','neither','no','nor',
               'not','of','off','often','on','only','or','other','our','own','rather','said',
                 'say','says','she','should','since','so','some','than','that','the','their',
                 'them','then','there','these','they','this','tis','to','too','twas','us',
                 'wants','was','we','were','what','when','where','which','while','who',
                 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your']
            #this part removes the stop words for the list of inputs
            line_stop_words = []
            sent = ""
            word = ""
            test = []
            for sent in line:
                word_list = string.split(sent)
                new_string = ""
                for word in word_list:
                    if word  not in stop_words:
                        new_string = new_string + word + " "
                new_string = string.split(new_string)
                line_stop_words = line_stop_words + [new_string]
            return(line_stop_words)
    def StemWords(line_stop_words):
            leaf_words = "s","es","ed","er","ly","ing"
            i=0
            while i < 6:    
                count = 0
                length = len(leaf_words[i])
                while count < len(line_stop_words):
                    line = line_stop_words[count]
                    count2 = 0
                    while count2 < len(line):
                        #line is the particular list(or line) that we are dealing with, count if the specific word
                        if leaf_words[i] == line[count2][-length:]:
                            line[count2] = line[count2][:-length]
                        count2 = count2 + 1
                    line_stop_words[count] = line
                    count2 = 0
                    count = count + 1
                count = 0
                i = i + 1
            return(line_stop_words)
    def indexDupe(lineCount,occur):
            if str(lineCount) in occur:
                return True
            else:
                return False
    def Indexing(line_stop_words):
            line_limit = len(line_stop_words)
            index = []
            line_count = 0
            while line_count < line_limit:
                for x in line_stop_words[line_count]:
                    count = 0
                    while count <= len(index):
                        if count == len(index):
                            index = index + [[x,[str(line_count+1)]]]
                            break
                        else:
                            if x == index[count][0]:
                                if indexDupe(line_count+1,index[count][1]) == False:
                                    index[count][1] += str(line_count+1)
                                break
    
                        count = count + 1
                line_count = line_count + 1
            return(index)
    def OutputIndex(index):
    
            print ("Index:")
            count = 0
            indexLength = len(index)
            while count < indexLength:
                print (index[count][0],)
                count2 = 0
                lineOccur = len(index[count][1])
                while count2 < lineOccur:
                    print (index[count][1][count2],)
                    if count2 == lineOccur -1:
                        print ("")
                        break
                    else:
                        print (",",)
                    count2 += 1
    
                count += 1
    line = RemovePunc()   
    line_stop_words = RemoveStopWords(line)
    line_stop_words = StemWords(line_stop_words)    
    index = Indexing(line_stop_words)
    OutputIndex(index)
    

1 个答案:

答案 0 :(得分:0)

@smichak已经在评论中提出了正确的答案。在Python 3中,raw_input被重命名为input。所以你想要:

text_input = input()

不要忘记这些括号,因为你想调用这个函数。