我在这里有很多解决方案,但我已经完全解决了这些问题,但仍无法解决我的问题。我试图打印每个图形的rSquared,但我得到错误'局部变量" m"在分配之前引用。请帮忙!我知道这里的间距是关闭的。我在跑步者中拥有所有这一切。谢谢!
def readData(fileName):
hsGPA = [] #High School GPA
mathSAT = [] #Math SAT scores
crSAT = [] #Verbal SAT scores
collegeGPA = [] #College GPA
compGPA=[] #ComputerScience GPA
FullList=[]
inputFile = open(fileName, 'r', encoding = 'utf-8')
for line in inputFile:
FullList=line.split(',')
hsGPA.append(float(FullList[0]))
mathSAT.append(int(FullList[1]))
crSAT.append(int(FullList[2]))
collegeGPA.append(float(FullList[3]))
compGPA.append(float(FullList[4]))
return hsGPA, mathSAT, crSAT, collegeGPA, compGPA
def plotData(hsGPA, mathSAT, crSAT, collegeGPA,compGPA):
GPA1 = [] #High School GPA
Score1 = [] #Math SAT scores
Score2= [] #Verbal SAT scores
GPA2 = [] #College GPA
GPA3=[] #ComputerScience GPA
hsGPA, mathGPA, crSAT, collegeGPA,compGPA = readData('satFINAL.txt')
pyplot.figure(1)
pyplot.subplot(5,1,1)
for line in range(len(hsGPA)):
GPA1.append(line)
pyplot.plot(GPA1,hsGPA)
pyplot.subplot(5,1,2)
for line in range(len(mathSAT)):
Score1.append(line)
pyplot.plot(Score1,mathSAT)
pyplot.subplot(5,1,3)
for line in range(len(crSAT)):
Score2.append(line)
pyplot.plot(Score2,crSAT)
pyplot.subplot(5,1,4)
for line in range(len(collegeGPA)):
GPA2.append(line)
pyplot.plot(GPA2,collegeGPA)
pyplot.subplot(5,1,5)
for line in range(len(compGPA)):
GPA3.append(line)
pyplot.plot(GPA3,compGPA)
pyplot.show()
def LinearRegression(xList, yList):
'''
This function finds the constants in the y = mx+b, or linear regression
forumula
xList - a list of the x values
yList - a list of the y values
m - the slope f the line
b - where the line intercepts the y axis
'''
n = len(xList)
sumX = 0
sumXX = 0
sumXY = 0
sumY = 0
for index in range(n):
sumX += xList[index]
sumXY += xList[index] * yList[index]
sumXX += xList[index]**2
sumY += yList[index]
#the components needed to find m and b
m = (n*(sumXY - (sumX*sumY)))/(n*(sumXX - (sumX**2)))
b = (sumY - (m*sumX))/n
#actually implements formula
return m, b
def plotRegression(x,y, xLabel, yLabel):
pyplot.scatter(x,y)
m,b = LinearRegression(x,y)
minX = min(x)
maxX = max(x)
pyplot.plot([minX, maxX], [m * minX + b, m * maxX + b], color ='red')
pyplot.xlabel(xLabel)
pyplot.ylabel(yLabel)
pyplot.show()
def rSquared(x,y):
n = len(x)
R=0
sumS=0
sumT=0
sumY=0
for index in range(n):
a=(y[index]-((m*x[index])+b))**2
sumS = sumS+a
for index in range(len(y)):
sumY = sumY= y[index]
MeanY= sumY/(len(y))
e=(y[index]-MeanY)**2
sumT = sumT+e
m,b= LinearRegression(x, y)
RG=1-(sumS/sumT)
def main():
data = readData('satFINAL.txt')
print(data)
plotData(*data)
hsGPA, mathSAT, crSAT, collegeGPA,compGPA = data
# added ScoreT calculation here
ScoreT = [sum(x) for x in zip(mathSAT, crSAT)]
plotRegression(hsGPA,collegeGPA, 'highGPA', 'collegeGPA')
plotRegression(mathSAT,collegeGPA, 'mathSAT' , 'collegeGPA')
plotRegression(crSAT,collegeGPA, 'crSAT' , 'collegeGPA')
plotRegression(ScoreT,collegeGPA, 'Math and CR SAT' , 'collegeGPA')
plotRegression(mathSAT,crSAT, 'mathSAT', 'CR SAT')
plotRegression(mathSAT,compGPA, 'mathSAT', 'CompGPA')
plotRegression(hsGPA,compGPA, 'HsGPA', 'CompGPA')
plotRegression(ScoreT,compGPA, 'SATscore ', 'CompGPA')
print(rSquared(hsGPA,collegeGPA))
main()
答案 0 :(得分:3)
很难说 - 你的缩进搞砸了,而且你的代码很多,你实际上没有给出错误跟踪(这实际上会识别出错误所在的行!) - 但看起来,在rSquared
的定义中,您在为a=(y[index]-((m*x[index])+b))**2
分配值之前调用了m
。
编辑:我经历了很多重复的代码并重构为循环;现在希望它更具可读性。我还针对linear_regression
交叉检查了scipy.stats.linregress
函数并获得了相同的结果;我未已验证r_squared
,因此您应该检查一下。
import matplotlib.pyplot as plt
# column indices
HS, MATH, VERBAL, COLLEGE, COMPSCI = range(5)
# column labels
LABELS = ["High school GPA", "Math SAT", "Verbal SAT", "College GPA", "CompSci GPA"]
# column data types
DTYPES = [ float, int, int, float, float ]
def read_columns(fname, encoding="utf-8", separator=",", dtypes=None):
"""
Return columns of data from a file
If dtypes is specified, convert each column to the given data type
"""
# read rows from data file
with open(fname, encoding=encoding) as inf:
rows = [line.split(separator) for line in inf]
# transpose to columns
cols = zip(*rows)
# apply data types
if dtypes is not None:
cols = [[dtype(cell) for cell in col] for dtype,col in zip(dtypes,cols)]
return cols
def linear_regression(xs, ys):
"""
Return the linear regression constants m,b
in the least-squares best fit to y = m*x+b
"""
# if you have SciPy you can use scipy.stats.linregress instead
n = len(xs)
xsum = sum(xs)
ysum = sum(ys)
xxsum = sum(x*x for x in xs)
xysum = sum(x*y for x,y in zip(xs, ys))
m = (n * xysum - xsum * ysum) / (n * xxsum - xsum * xsum)
b = (ysum - m * xsum) / n
return m, b
def r_squared(xs, ys):
m, b = linear_regression(xs, ys)
ysum, n = sum(ys), len(ys)
ymean = ysum / n
ssum = sum((y - (m * x + b))**2 for x,y in zip(xs, ys))
tsum = sum((y - ymean)**2 for y in ys)
return 1 - ssum / tsum
def plot_regression(xs, xlabel, ys, ylabel):
m, b = linear_regression(xs, ys)
min_, max_ = min(xs), max(xs)
plt.scatter(xs, ys)
plt.plot([min_, max_], [m * min_ + b, m * max_ + b], "r")
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.show()
def main():
# read data
scores = read_columns("satFINAL.txt", dtypes=DTYPES)
# add composite math-and-verbal score
MATH_VERBAL = 5
LABELS.append("Math+Verbal SAT")
DTYPES.append(int)
scores.append([math+verbal for math,verbal in zip(scores[MATH], scores[VERBAL])])
# do raw score plots
plt.figure(1)
num_figs = len(LABELS)
# draw subplots
for fig, column, nums in zip(range(num_figs), LABELS, scores):
plt.subplot(num_figs, 1, fig+1)
plt.plot(range(len(nums)), nums)
plt.xlabel(LABELS[fig])
# display results
plt.show()
# do regression plots
regressions = [
(HS, COLLEGE),
(MATH, COLLEGE),
(VERBAL, COLLEGE),
(MATH_VERBAL, COLLEGE),
(MATH, VERBAL),
(MATH, COMPSCI),
(HS, COMPSCI),
(MATH_VERBAL, COMPSCI)
]
for x,y in regressions:
print("r**2 for {} and {}: {}".format(LABELS[x], LABELS[y], r_squared(scores[x], scores[y])))
plot_regression(scores[x], LABELS[x], scores[y], LABELS[y])
if __name__=="__main__":
main()