I am trying to use Python to read several JSON files from my Bluemix Object Storage into a Jupyter notebook. I have followed the examples I found, but I still get a "No such file or directory" error.
Here is the code that should authenticate with the Object Storage and identify the files:
# Set up Spark
from pyspark import SparkContext
from pyspark import SparkConf

if ('config' not in globals()):
    config = SparkConf().setAppName('warehousing_sql').setMaster('local')
if ('sc' not in globals()):
    sc = SparkContext(conf=config)

# Set the Hadoop configuration.
def set_hadoop_config(name, credentials):
    prefix = "fs.swift.service." + name
    hconf = sc._jsc.hadoopConfiguration()
    hconf.set(prefix + ".auth.url", credentials['auth_url'] + '/v3/auth/tokens')
    hconf.set(prefix + ".auth.endpoint.prefix", "endpoints")
    hconf.set(prefix + ".tenant", credentials['project_id'])
    hconf.set(prefix + ".username", credentials['user_id'])
    hconf.set(prefix + ".password", credentials['password'])
    hconf.setInt(prefix + ".http.port", 8080)
    hconf.set(prefix + ".region", credentials['region'])
    hconf.setBoolean(prefix + ".public", True)

# Data Sources (generated by Insert to code)
credentials = {
    'auth_url': 'https://identity.open.softlayer.com',
    'project': '***',
    'project_id': '****',
    'region': 'dallas',
    'user_id': '****',
    'domain_id': '****',
    'domain_name': '****',
    'username': '****',
    'password': """****""",
    'filename': 'Warehousing-data.json',
    'container': 'notebooks',
    'tenantId': '****'
}

set_hadoop_config('spark', credentials)

# The data files should now be accessible through URLs of the form
# swift://notebooks.spark/filename.json
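A minimal check that this configuration actually took effect (a sketch, assuming the container 'notebooks' and service name 'spark' from above) would be to read the file directly through Spark, which resolves swift:// URLs via the Hadoop configuration:

# Hypothetical sanity check, not part of my original notebook:
test_rdd = sc.textFile("swift://notebooks.spark/Warehousing-data.json")
print(test_rdd.take(1))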
Here is the calling code:
...
resource_path= "swift://notebooks.spark/"
Warehousing_data_json = "Warehousing-data.json"
Warehousing_sales_data_nominal_scenario_json = "Warehousing-sales_data-nominal_scenario.json"
...
Here is the error: IOError: [Errno 2] No such file or directory: 'swift://notebooks.spark/Warehousing-data.json'
Sorry if this seems like a newbie question (I admit I am one), but I think it is needlessly complicated to set this up, and very bad form to rely on the undocumented method SparkContext._jsc.hadoopConfiguration().
Added in response to Hobert's and Sven's comments:
Thanks, Hobert. I do not understand your comment about the definition of "swift://notebooks**.spark**/". Unless I misunderstand the logic of the sample I followed (which is essentially the same as what Sven shows in his response), this path comes from the call to sc._jsc.hadoopConfiguration(), but it is hard to know what that call actually does, since the HadoopConfiguration class is not documented.
I also do not understand the alternatives of "using/adding that definition for the Hadoop configuration" or "Alternatively, … use the swift client within Spark to access the JSON." I think I would prefer the latter, since I am not otherwise using Hadoop in my notebook. Please point me to a more detailed explanation of these alternatives.
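For what it is worth, my current understanding of how the pieces fit together (an assumption pieced together from the sample, not from any official documentation) is that the Hadoop swift driver maps the URL components onto the configuration keys set above:

# Assumed mapping (illustrative comments only):
# swift://<container>.<service>/<object>
#   <container> -> the Object Storage container, e.g. 'notebooks'
#   <service>   -> the <name> in the "fs.swift.service.<name>.*" keys,
#                  i.e. 'spark' from set_hadoop_config('spark', credentials)
#   <object>    -> the object (file) name, e.g. 'Warehousing-data.json'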
Thanks, Sven. You are correct that I did not show the actual reading of the JSON files. The reading actually happens inside a method that is part of the DOcplexcloud API. Here is the relevant code from my notebook:
resource_path= "swift://notebooks.spark/"
Warehousing_data_json = "Warehousing-data.json"
Warehousing_sales_data_nominal_scenario_json = "Warehousing-sales_data-nominal_scenario.json"
resp = client.execute(input= [{'name': "warehousing.mod",
'file': StringIO(warehousing_data_dotmod + warehousing_inputs + warehousing_dotmod + warehousing_outputs)},
{'name': Warehousing_data_json,
'filename': resource_path + Warehousing_data_json},
{'name': Warehousing_sales_data_nominal_scenario_json,
'filename': resource_path + Warehousing_sales_data_nominal_scenario_json}],
output= "results.json",
load_solution= True,
log= "solver.log",
gzip= True,
waittime= 300,
delete_on_completion= True)
Here is the stack trace:
IOError Traceback (most recent call last)
<ipython-input-8-67cf709788b3> in <module>()
29 gzip= True,
30 waittime= 300,
---> 31 delete_on_completion= True)
32
33 result = WarehousingResult(json.loads(resp.solution.decode("utf-8")))
/gpfs/fs01/user/sbf1-4c17d3407da8d0-a7ea98a5cc6d/.local/lib/python2.7/site-packages/docloud/job.pyc in execute(self, input, output, load_solution, log, delete_on_completion, timeout, waittime, gzip, parameters)
496 # submit job
497 jobid = self.submit(input=input, timeout=timeout, gzip=gzip,
--> 498 parameters=parameters)
499 response = None
500 completed = False
/gpfs/fs01/user/sbf1-4c17d3407da8d0-a7ea98a5cc6d/.local/lib/python2.7/site-packages/docloud/job.pyc in submit(self, input, timeout, gzip, parameters)
436 gzip=gzip,
437 timeout=timeout,
--> 438 parameters=parameters)
439 # run model
440 self.execute_job(jobid, timeout=timeout)
/gpfs/fs01/user/sbf1-4c17d3407da8d0-a7ea98a5cc6d/.local/lib/python2.7/site-packages/docloud/job.pyc in create_job(self, **kwargs)
620 self.upload_job_attachment(job_id,
621 attid=inp.name,
--> 622 data=inp.get_data(),
623 gzip=gzip)
624 return job_id
/gpfs/fs01/user/sbf1-4c17d3407da8d0-a7ea98a5cc6d/.local/lib/python2.7/site-packages/docloud/job.pyc in get_data(self)
110 data = self.data
111 if self.filename is not None:
--> 112 with open(self.filename, "rb") as f:
113 data = f.read()
114 if self.file is not None:
IOError: [Errno 2] No such file or directory: 'swift://notebooks.spark/Warehousing-data.json'
This notebook works fine when I run it locally and resource_path is a path on my own machine.
Sven, your code looks very similar to mine, which closely follows the sample I copied, so I do not understand why yours works and mine does not.
I have verified that the files exist on my Instance_objectstore. It therefore seems that swift://notebooks.spark/ does not point to this object store, which has been a mystery to me from the start. Again, the HadoopConfiguration class is not documented, so there is no way to know how it establishes the association between the URL and the object store.
Answer 0 (score: 0)
The error message you received, IOError: [Errno 2] No such file or directory: 'swift://notebooks.spark/Warehousing-data.json',
means that there is no such file at that path. I think the setup of the Hadoop configuration was successful; otherwise you would have received an error message complaining about missing credential settings.
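Note also where the stack trace fails: in docloud's get_data(), the file is read with Python's built-in open(), which only understands local filesystem paths. Only Spark/Hadoop methods such as sc.textFile() resolve swift:// URLs through the Hadoop configuration, so the same URL can work in one place and fail in another. A minimal illustration (hypothetical, reusing the names from the question):

# open() bypasses Hadoop entirely and treats the URL as a local path:
# open("swift://notebooks.spark/Warehousing-data.json", "rb")   # raises IOError
# sc.textFile() goes through the Hadoop swift driver instead:
# sc.textFile("swift://notebooks.spark/Warehousing-data.json").take(1)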
I have tested the following code in a Python notebook on Bluemix and it worked for me. I took the sample code from the latest sample notebook demonstrating how to load data from Bluemix Object Storage V3.
The method to set the Hadoop configuration:
def set_hadoop_config(credentials):
    """This function sets the Hadoop configuration with given credentials,
    so it is possible to access data using SparkContext"""
    prefix = "fs.swift.service." + credentials['name']
    hconf = sc._jsc.hadoopConfiguration()
    hconf.set(prefix + ".auth.url", credentials['auth_url'] + '/v3/auth/tokens')
    hconf.set(prefix + ".auth.endpoint.prefix", "endpoints")
    hconf.set(prefix + ".tenant", credentials['project_id'])
    hconf.set(prefix + ".username", credentials['user_id'])
    hconf.set(prefix + ".password", credentials['password'])
    hconf.setInt(prefix + ".http.port", 8080)
    hconf.set(prefix + ".region", credentials['region'])
    hconf.setBoolean(prefix + ".public", True)
Insert the credentials of the associated Bluemix Object Storage V3:
credentials_1 = {
    'auth_url': 'https://identity.open.softlayer.com',
    'project': '***',
    'project_id': '***',
    'region': 'dallas',
    'user_id': '***',
    'domain_id': '***',
    'domain_name': '***',
    'username': '***',
    'password': """***""",
    'filename': 'people.json',
    'container': 'notebooks',
    'tenantId': '***'
}
Set the Hadoop configuration with the given credentials:
credentials_1['name'] = 'spark'
set_hadoop_config(credentials_1)
Read the JSON file into an RDD using sc.textFile() and print out the first 3 rows:
data_rdd = sc.textFile("swift://" + credentials_1['container'] + "." + credentials_1['name'] + "/" + credentials_1['filename'])
data_rdd.take(3)
Output:
[u'{"name":"Michael"}',
u'{"name":"Andy", "age":30}',
u'{"name":"Justin", "age":19}']
Read the JSON file into a DataFrame using sqlContext.read.json() and print out the first 3 rows:
data_df = sqlContext.read.json("swift://" + credentials_1['container'] + "." + credentials_1['name'] + "/" + credentials_1['filename'])
data_df.take(3)
Output:
[Row(age=None, name=u'Michael'),
Row(age=30, name=u'Andy'),
Row(age=19, name=u'Justin')]
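As a short follow-up sketch (my assumption, reusing the same data_df; registerTempTable is the Spark 1.x API, matching the Python 2.7 environment in the stack trace), the DataFrame route also gives you the inferred schema and SQL access:

data_df.printSchema()                # shows the fields inferred from the JSON
data_df.registerTempTable("people")  # expose the DataFrame to SQL queries
sqlContext.sql("SELECT name FROM people WHERE age IS NOT NULL").show()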
Answer 1 (score: 0)
I found a better solution at https://developer.ibm.com/recipes/tutorials/using-ibm-object-storage-in-bluemix-with-python/ with example code at https://github.com/saviosaldanha/IBM_Object_Store_Python_Example/blob/master/storage_recipe_example.py
Here is the revised code:
import swiftclient
# Import the package module itself rather than "from keystoneclient import client",
# which would shadow the DOcplexcloud 'client' variable used further below.
import keystoneclient

# Object Store credentials (generated by Insert to code)
credentials = {
    'auth_url': 'https://identity.open.softlayer.com',
    'project': '***',
    'project_id': '***',
    'region': 'dallas',
    'user_id': '***',
    'domain_id': '***',
    'domain_name': '***',
    'username': '***',
    'password': """***""",
    'filename': 'Warehousing-data.json',
    'container': 'notebooks',
    'tenantId': '***'
}

# Establish a connection to the Bluemix Object Store
# (note that the dictionary keys must be quoted strings)
connection = swiftclient.Connection(
    key=credentials['password'],
    authurl=credentials['auth_url'],
    auth_version='3',
    os_options={"project_id": credentials['project_id'],
                "user_id": credentials['user_id'],
                "region_name": credentials['region']})

# The data files should now be accessible through calls of the form
# connection.get_object(credentials['container'], fileName)[1]
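A quick way to verify the connection and see what is actually in the container (a sketch using the standard swiftclient API):

# get_container() returns (response_headers, list_of_objects); each object
# is a dict with keys such as 'name' and 'bytes'
for obj in connection.get_container(credentials['container'])[1]:
    print("%s (%d bytes)" % (obj['name'], obj['bytes']))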
The files are then accessed as:
Warehousing_data_json = "Warehousing-data.json"
Warehousing_sales_data_nominal_scenario_json = "Warehousing-sales_data-nominal_scenario.json"

# get_object() returns (headers, content); the [1] element is the raw file
# content, so it is passed to DOcplexcloud as an in-memory 'file' rather than
# as a 'filename' (which would again be opened with the local open()).
resp = client.execute(input=[{'name': "warehousing.mod",
                              'file': StringIO(warehousing_data_dotmod + warehousing_inputs + warehousing_dotmod + warehousing_outputs)},
                             {'name': Warehousing_data_json,
                              'file': StringIO(connection.get_object(credentials['container'], Warehousing_data_json)[1])},
                             {'name': Warehousing_sales_data_nominal_scenario_json,
                              'file': StringIO(connection.get_object(credentials['container'], Warehousing_sales_data_nominal_scenario_json)[1])}],
                      output="results.json",
                      load_solution=True,
                      log="solver.log",
                      gzip=True,
                      waittime=300,
                      delete_on_completion=True)
The question is how to load the swiftclient and keystoneclient libraries in Bluemix? Pip does not seem to work in notebooks. Does anyone know how to handle this?
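One approach that may be worth trying (untested here in this Bluemix environment): Jupyter notebooks usually allow shelling out to pip from a cell with a user-level install:

!pip install --user python-swiftclient python-keystoneclient

After a kernel restart the packages should be importable, since the stack trace above shows that .local/lib/python2.7/site-packages (where --user installs land) is already on the path.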