这个程序的目的是将作者签名(写作风格)与他们的作品相匹配,以便弄清楚作者是谁。我基本上已经完成了程序,但是当我去运行它时,除了第一个提示要求文件名称之外,它什么也没做。我无法弄清楚为什么它不继续使用该文件。它给了我一个“找不到该文件”的打印消息,但文件IS在正确的目录中,所以我不知道发生了什么。
相对来说相当大的程序,所以我不指望人们真正深入,但我不能自己弄明白。
import os.path, math
def clean_up(s):
''' Return a version of string str in which all letters have been
converted to lowercase and punctuation characters have been stripped
from both ends. Inner punctuation is left untouched. '''
punctuation = '''!"',;:.-?)([]<>*#\n\t\r'''
result = s.lower().strip(punctuation)
return result
def average_word_length(text):
''' Return the average length of all words in text. Do not
include surrounding punctuation in words.
text is a non-empty list of strings each ending in \n.
At least one line in text contains a word.'''
words = text.split()
for word in words:
average=sum(len(word) for word in words)/len(words)
return average
def type_token_ratio(text):
''' Return the type token ratio (TTR) for this text.
TTR is the number of different words divided by the total number of words.
text is a non-empty list of strings each ending in \n.
At least one line in text contains a word. '''
uniquewords=dict()
words=0
for line in text.splitlines():
line=line.strip().split()
for word in line:
words+=1
if word in uniquewords:
uniquewords[word]+=1
else:
uniquewords[word]=1
TTR= len(uniquewords)/words
return TTR
def hapax_legomana_ratio(text):
''' Return the hapax_legomana ratio for this text.
This ratio is the number of words that occur exactly once divided
by the total number of words.
text is a list of strings each ending in \n.
At least one line in text contains a word.'''
uniquewords = dict()
words = 0
for line in text.splitlines():
line = line.strip().split()
for word in line:
words += 1
word = word.replace(',', '').strip()
if word in uniquewords:
uniquewords[word] -= 1
else:
uniquewords[word] = 1
unique_count = 0
for each in uniquewords:
if uniquewords[each] == 1:
unique_count += 1
HLR = unique_count/words
return HLR
def split_on_separators(original, separators):
''' Return a list of non-empty, non-blank strings from the original string
determined by splitting the string on any of the separators.
separators is a string of single-character separators.'''
result = []
newstring=''
for char in original:
if char in separators:
result.append(newstring)
newstring=''
if '' in result:
result.remove('')
else:
newstring+=char
return result
def average_sentence_length(text):
''' Return the average number of words per sentence in text.
text is guaranteed to have at least one sentence.
Terminating punctuation defined as !?.
A sentence is defined as a non-empty string of non-terminating
punctuation surrounded by terminating punctuation
or beginning or end of file. '''
words=0
Sentences=0
for line in text.split():
words+=1
sentence=split_on_separators(text,'?!.')
for sep in sentence:
Sentences+=1
ASL=words/Sentences
return ASL
def avg_sentence_complexity(text):
'''Return the average number of phrases per sentence.
Terminating punctuation defined as !?.
A sentence is defined as a non-empty string of non-terminating
punctuation surrounded by terminating punctuation
or beginning or end of file.
Phrases are substrings of a sentences separated by
one or more of the following delimiters ,;: '''
Sentences=0
Phrases=0
sentence=split_on_separators(text,'?!.')
for sep in sentence:
Sentences+=1
Phrase=split_on_separators(text, ',;:')
for n in Phrase:
Phrases+=1
ASC=Phrases/Sentences
return ASC
def get_valid_filename(prompt):
'''Use prompt (a string) to ask the user to type the name of a file. If
the file does not exist, keep asking until they give a valid filename.
Return the name of that file.'''
filename = input(prompt)
if os.path.isfile(filename) == False:
print ("That file does not exist.")
filename = input(prompt)
return filename
def read_directory_name(prompt):
'''Use prompt (a string) to ask the user to type the name of a directory. If
the directory does not exist, keep asking until they give a valid directory.
'''
dirname = input(prompt)
if os.path.isfile(dirname)== False:
print ("That directory does not exist.")
dirname=input(prompt)
return dirname
def compare_signatures(sig1, sig2, weight):
'''Return a non-negative real number indicating the similarity of two
linguistic signatures. The smaller the number the more similar the
signatures. Zero indicates identical signatures.
sig1 and sig2 are 6 element lists with the following elements
0 : author name (a string)
1 : average word length (float)
2 : TTR (float)
3 : Hapax Legomana Ratio (float)
4 : average sentence length (float)
5 : average sentence complexity (float)
weight is a list of multiplicative weights to apply to each
linguistic feature. weight[0] is ignored.
'''
i=1
while i <=5:
result +=(abs(sig1[i]-sig2[i]))*weight[i]
i+=1
return result
def read_signature(filename):
'''Read a linguistic signature from filename and return it as
list of features. '''
file = open(filename, 'r')
# the first feature is a string so it doesn't need casting to float
result = [file.readline()]
# all remaining features are real numbers
for line in file:
result.append(float(line.strip()))
return result
if __name__ == '__main__':
prompt = 'enter the name of the file with unknown author:'
mystery_filename = get_valid_filename(prompt)
# readlines gives us the file as a list of strings each ending in '\n'
text = open(mystery_filename, 'r').read()
text.close()
# calculate the signature for the mystery file
mystery_signature = [mystery_filename]
mystery_signature.append(average_word_length(text))
mystery_signature.append(type_token_ratio(text))
mystery_signature.append(hapax_legomana_ratio(text))
mystery_signature.append(average_sentence_length(text))
mystery_signature.append(avg_sentence_complexity(text))
weights = [0, 11, 33, 50, 0.4, 4]
prompt = 'enter the path to the directory of signature files: '
dir = read_directory_name(prompt)
# every file in this directory must be a linguistic signature
files=os.listdir(dir)
# to do: Get list of all files in directory name we just got.
# store it in a list called 'files'
# we will assume that there is at least one signature in that directory
this_file = files[0]
signature = read_signature('{} {}'.format(dir,this_file))
best_score = compare_signatures(mystery_signature, signature, weights)
best_author = signature[0]
for this_file in files[1:]:
signature = read_signature('{} {}'.format(dir,this_file))
score = compare_signatures(mystery_signature, signature, weights)
if score < best_score:
best_score = score
best_author = signature[0]
print( "best author match: {} with score {}".format(best_author, best_score))
答案 0 :(得分:1)
在以下行(出现两次)中,通过将dir部分和文件名部分与空格连接来生成路径。
signature = read_signature('{} {}'.format(dir,this_file))
# ^
您应该使用os.sep
加入它们。
signature = read_signature('{}{}{}'.format(dir, os.sep, this_file))
或更优选地,使用os.path.join
:
signature = read_signature(os.path.join(dir, this_file))
答案 1 :(得分:0)
您确定它报告“该文件不存在”。而不是“那个目录不存在。”?
在read_directory_name(prompt)
中,作者使用os.path.isfile(path)
,其文档为“如果路径是现有常规文件,则返回True。(...)”。
由于您要查找的目录需要os.path.isdir
或os.path.exists
。