So I'm learning Apache Spark from a Python tutorial on Udemy, in which the instructor runs the Spark code from an IDE called Canopy. I want to run the Spark code directly from the command prompt instead, which I can do with the findspark module, but no matter which file I run, it always runs the same single file. I'm new to this, so I don't know why.

Right now, when I run `python movie-recommendations-als.py` it runs that particular script, but when I try to run `python ratings-counter.py` it also runs movie-recommendations-als.py.

Here are the two scripts provided by the tutorial. First, movie-recommendations-als.py:
```
import findspark
findspark.init("C:/spark")

import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating

def loadMovieNames():
    # Build a dictionary mapping movie ID -> movie name
    movieNames = {}
    with open("ml-100k/uitem.txt", encoding='ascii', errors="ignore") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')

print("\nLoading movie names...")
nameDict = loadMovieNames()

data = sc.textFile("file:///SparkCourse/ml-100k/u.data")
ratings = data.map(lambda l: l.split()).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

# Build the recommendation model using Alternating Least Squares
#print("\nTraining recommendation model...")
rank = 10
# Lowered numIterations to ensure it works on lower-end systems
numIterations = 6
model = ALS.train(ratings, rank, numIterations)

userID = 10

print("1.My Movies\n2.My Recommendations")
o = int(input())  # read the menu choice; 'o' was undefined in the code as pasted

if o == 1:
    print("\nRatings for user ID " + str(userID) + ":")
    userRatings = ratings.filter(lambda l: l[0] == userID)
    for rating in userRatings.collect():
        print(nameDict[int(rating[1])] + ": " + str(rating[2]))
elif o == 2:
    print("\nTop 10 recommendations:")
    recommendations = model.recommendProducts(userID, 10)
    for recommendation in recommendations:
        print("Movie Name: " + nameDict[int(recommendation[1])] + " score " + str(recommendation[2]))
print("THIS IS THE END OF IT")
```
And here is ratings-counter.py:
```
import findspark
import os
findspark.init("C:/spark")

from pyspark import SparkConf, SparkContext
import collections

conf = SparkConf().setMaster("local").setAppName("RatingsHistogram")
sc = SparkContext(conf=conf)

lines = sc.textFile("file:///SparkCourse/ml-100k/u.data")
ratings = lines.map(lambda x: x.split()[2])  # extract the rating column
result = ratings.countByValue()

# Sort by rating value and print a count for each
sortedResults = collections.OrderedDict(sorted(result.items()))
for key, value in sortedResults.items():
    print("%s %i" % (key, value))
```