此程序从两个.CSV文件中提取数据,这些文件链接在此处: https://drive.google.com/folderview?id=0B1SjPejhqNU-bVkzYlVHM2oxdGs&usp=sharing
它应该在两个文件中的每个文件中以逗号后面查找任何内容,但我的范围逻辑在某种程度上是错误的。我正在向第101行运行追溯错误:
“第101行,在calc_corr中:sum_smokers_value = sum_smokers_value + float(s_percent_smokers_data [r] [1]) IndexError:列表索引超出范围“
我认为它会在[k] [1]出现的其他时间做同样的事情。
如果有办法解决这个问题,请提前多多感谢。 到目前为止,该计划是:# this program opens two files containing data and runs a corralation calculation
import math
def main():
try:
print('does smoking directly lead to lung cancer?')
print('''let's find out, shall we?''''')
print('to do so, this program will find correlation between the instances of smokers, and the number of people with lung cancer.')
percent_smokers, percent_cancer = retrieve_csv()
s_percent_smokers_data, c_percent_cancer_data = read_csv(percent_smokers, percent_cancer)
correlation = calc_corr(s_percent_smokers_data, c_percent_cancer_data,)
print('r_value =', corretation)
except IOError as e:
print(str(e))
print('this program has been cancelled. run it again.')
def retrieve_csv():
num_times_failed = 0
percent_smokers_opened = False
percent_cancer_opened = False
while((not percent_smokers_opened) or (not percent_cancer_opened)) and (num_times_failed < 5):
try:
if not percent_smokers_opened:
percent_smokers_input = input('what is the name of the file containing the percentage of smokers per state?')
percent_smokers = open(percent_smokers_input, 'r')
percent_smokers_opened = True
if not percent_cancer_opened:
percent_cancer_input = input('what is the name of the file containing the number of cases of lung cancer contracted?')
percent_cancer = open(percent_cancer_input, 'r')
percent_cancer_opened = True
except IOError:
print('a file was not located. try again.')
num_times_failed = num_times_failed + 1
if not percent_smokers_opened or not percent_cancer_opened:
raise IOError('you have failed too many times.')
else:
return(percent_smokers, percent_cancer)
def read_csv(percent_smokers, percent_cancer):
s_percent_smokers_data = []
c_percent_cancer_data = []
empty_list = ''
percent_smokers.readline()
percent_cancer.readline()
eof = False
while not eof:
smoker_list = percent_smokers.readline()
cancer_list = percent_cancer.readline()
if smoker_list == empty_list and cancer_list == empty_list:
eof = True
elif smoker_list == empty_list:
raise IOError('smokers file error')
elif cancer_list == empty_list:
raise IOError('cancer file error')
else:
s_percent_smokers_data.append(smoker_list.strip().split(','))
c_percent_cancer_data.append(cancer_list.strip().split(','))
return (s_percent_smokers_data, c_percent_cancer_data)
def calc_corr(s_percent_smokers_data, c_percent_cancer_data):
sum_smokers_value = sum_cancer_cases_values = 0
sum_smokers_sq = sum_cancer_cases_sq = 0
sum_value_porducts = 0
numbers = len(s_percent_smokers_data)
for k in range(0, numbers):
sum_smokers_value = sum_smokers_value + float(s_percent_smokers_data[k][1])
sum_cancer_cases_values = sum_cancer_cases_values + float(c_percent_cancer_data[k][1])
sum_smokers_sq = sum_smokers_sq + float(s_percent_smokers_data[k][1]) ** 2
sum_cancer_cases_sq = sum_cancer_cases_sq + float(c_percent_cancer_data[k][1]) ** 2
sum_value_products = sum_value_products + float(percent_smokers[k][1]) ** float(percent_cancer[k][1])
numerator_value = (numbers * sum_value_products) - (sum_smokers_value * sum_cancer_cases_values)
denominator_value = math.sqrt(abs((numbers * sum_smokers_sq) - (sum_smokers_value ** 2)) * ((numbers * sum_cancer_cases_sq) - (sum_cancer_cases_values ** 2)))
return numerator_value / denominator_value
main()
答案 0 :(得分:0)
数据文件的每一行中的值不是逗号分隔,而是以制表符分隔。您需要更改','
分割的'\t'
分隔符。或者也许使用csv
模块并告诉它您的分隔符是'\t'
。您可以在the documentation中详细了解csv
模块。