Question

我尝试编写Python 3.4代码来从外部索引文本文档这是我的尝试。运行时出现错误信息：

未定义原始输入

我想要的是：

用于标记出python 34文件夹
删除停用词
来阻止
索引

代码：

  import string

  def RemovePunc():
        line = []
        i = 0
        text_input = ""
        total_text_input = "C:Users\Kelil\Desktop\IRS_Assignment\project.txt"
        #This part removes the punctuation and converts input text to lowercase
        while i != 1:
            text_input = raw_input 
            if text_input == ".":
                i = 1
            else:
                new_char_string = "" 
                for char in text_input:
                    if char in string.punctuation:
                        char = " "

                    new_char_string = new_char_string + char

                line = line + [new_char_string.lower()]
                #This is a list with all of the text that was entered in
                total_text_input = (total_text_input + new_char_string).lower()
        return line

def RemoveStopWords(line):
        line_stop_words = []
        stop_words = ['a','able','about','across','after','all','almost','also','am','among',
             'an','and','any','are','as','at','be','because','been','but','by','can',
             'cannot','could','dear','did','do','does','either','else','ever','every',
             'for','from','get','got','had','has','have','he','her','hers','him','his',
             'how','however','i','if','in','into','is','it','its','just','least','let',
             'like','likely','may','me','might','most','must','my','neither','no','nor',
           'not','of','off','often','on','only','or','other','our','own','rather','said',
             'say','says','she','should','since','so','some','than','that','the','their',
             'them','then','there','these','they','this','tis','to','too','twas','us',
             'wants','was','we','were','what','when','where','which','while','who',
             'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your']
        #this part removes the stop words for the list of inputs
        line_stop_words = []
        sent = ""
        word = ""
        test = []
        for sent in line:
            word_list = string.split(sent)
            new_string = ""
            for word in word_list:
                if word  not in stop_words:
                    new_string = new_string + word + " "
            new_string = string.split(new_string)
            line_stop_words = line_stop_words + [new_string]
        return(line_stop_words)
def StemWords(line_stop_words):
        leaf_words = "s","es","ed","er","ly","ing"
        i=0
        while i < 6:    
            count = 0
            length = len(leaf_words[i])
            while count < len(line_stop_words):
                line = line_stop_words[count]
                count2 = 0
                while count2 < len(line):
                    #line is the particular list(or line) that we are dealing with, count if the specific word
                    if leaf_words[i] == line[count2][-length:]:
                        line[count2] = line[count2][:-length]
                    count2 = count2 + 1
                line_stop_words[count] = line
                count2 = 0
                count = count + 1
            count = 0
            i = i + 1
        return(line_stop_words)
def indexDupe(lineCount,occur):
        if str(lineCount) in occur:
            return True
        else:
            return False
def Indexing(line_stop_words):
        line_limit = len(line_stop_words)
        index = []
        line_count = 0
        while line_count < line_limit:
            for x in line_stop_words[line_count]:
                count = 0
                while count <= len(index):
                    if count == len(index):
                        index = index + [[x,[str(line_count+1)]]]
                        break
                    else:
                        if x == index[count][0]:
                            if indexDupe(line_count+1,index[count][1]) == False:
                                index[count][1] += str(line_count+1)
                            break

                    count = count + 1
            line_count = line_count + 1
        return(index)
def OutputIndex(index):

        print ("Index:")
        count = 0
        indexLength = len(index)
        while count < indexLength:
            print (index[count][0],)
            count2 = 0
            lineOccur = len(index[count][1])
            while count2 < lineOccur:
                print (index[count][1][count2],)
                if count2 == lineOccur -1:
                    print ("")
                    break
                else:
                    print (",",)
                count2 += 1

            count += 1
line = RemovePunc()   
line_stop_words = RemoveStopWords(line)
line_stop_words = StemWords(line_stop_words)    
index = Indexing(line_stop_words)
OutputIndex(index)

Answer 1

@smichak已经在评论中提出了正确的答案。在Python 3中，raw_input被重命名为input。所以你想要：

text_input = input()

不要忘记这些括号，因为你想调用这个函数。

使用python 3.4.3进行文本文件索引

1 个答案: