Question

我正在尝试访问和修改从Google Cloud Functions中的Google Cloud Storage提取的换行JSON文件中的数据。尽管结果不是JSON中的数据，但结果始终显示为数字。

我看到blob对象的download_as_string（）返回字节（https://googleapis.github.io/google-cloud-python/latest/_modules/google/cloud/storage/blob.html#Blob.download_as_string），但是在我看到的任何引用中，每个人都可以正常访问其数据。

我正在Cloud Functions中进行此操作，但我认为我的问题将适用于任何GCP工具。

下面的示例仅应加载换行JSON数据，将其添加到列表中，选择前两个字典条目，转换回换行JSON并在GCS上输出为JSON文件。下面列出了示例，代码和错误的输出。

示例换行JSON输入

{"Website": "Google", "URL": "Google.com", "ID": 1}
{"Website": "Bing", "URL": "Bing.com", "ID": 2}
{"Website": "Yahoo", "URL": "Yahoo.com", "ID": 3}
{"Website": "Yandex", "URL": "Yandex.com", "ID": 4}

云功能中的代码

import requests
import json
import csv
from datetime import datetime, timedelta
import sys
from collections import OrderedDict
import os
import random

from google.cloud import bigquery
from google.cloud import storage

def importData(request, execution):
    # Read the data from Google Cloud Storage
    read_storage_client = storage.Client()

    # Set buckets and filenames
    bucket_name = "sample_bucket"
    filename = 'sample_json_output.json'

    # get bucket with name
    bucket = read_storage_client.get_bucket('sample_bucket')
    # get bucket data as blob
    blob = bucket.get_blob('sample_json.json')
    # download as string
    json_data = blob.download_as_string()

    # create list 
    website_list = []
    for u,y in enumerate(json_data):
        website_list.append(y)

    # select first two
    website_list = website_list[0:2]

    # Create new-line JSON
    results_ready = '\n'.join(json.dumps(item) for item in website_list)

    # Write the data to Google Cloud Storage
    write_storage_client = storage.Client()

    write_storage_client.get_bucket(bucket_name) \
        .blob(filename) \
        .upload_from_string(results_ready)

sample_json_output.json文件中的当前输出

123
34

预期产量

{"Website": "Google", "URL": "Google.com", "ID": 1}
{"Website": "Bing", "URL": "Bing.com", "ID": 2}

更新6/6：如果我直接从 download_to_string blob写入文件，则它会完美写入JSON文件，但我需要事先访问内容。

import requests
import json
import csv
from datetime import datetime, timedelta
import sys
from collections import OrderedDict
import os
import random

from google.cloud import bigquery
from google.cloud import storage

def importData(request, execution):

    # Read the data from Google Cloud Storage
    read_storage_client = storage.Client()

    # Set buckets and filenames
    bucket_name = "sample_bucket"
    filename = 'sample_json_output.json'

    # get bucket with name
    bucket = read_storage_client.get_bucket('sample_bucket')

    # get bucket data as blob
    blob = bucket.get_blob('sample_json.json')

    # convert to string
    json_data = blob.download_as_string()


    # Write the data to Google Cloud Storage
    write_storage_client = storage.Client()

    write_storage_client.get_bucket(bucket_name) \
        .blob(filename) \
        .upload_from_string(json_data)

更新6/6输出

{"Website": "Google", "URL": "Google.com", "ID": 1}
{"Website": "Bing", "URL": "Bing.com", "ID": 2}
{"Website": "Yahoo", "URL": "Yahoo.com", "ID": 3}
{"Website": "Yandex", "URL": "Yandex.com", "ID": 4}

Answer 1

在下面的代码和用于换行JSON的ndjson库中，我能够使用与您自己类似的方法来获得所需的结果。

import requests
import json
import ndjson
import csv
from datetime import datetime, timedelta
import sys
from collections import OrderedDict
import os
import random

from google.cloud import bigquery
from google.cloud import storage

def importData(request, execution):

    # Read the data from Google Cloud Storage
    read_storage_client = storage.Client()

    # Set buckets and filenames
    bucket_name = "bucket-name"
    filename = "sample_json_output.json"

    # get bucket with name
    bucket = read_storage_client.get_bucket(bucket_name)

    # get bucket data as blob
    blob = bucket.get_blob("sample_json.json")

    # convert to string
    json_data_string = blob.download_as_string()

    json_data = ndjson.loads(json_data_string)

    list = []
    for item in json_data:
        list.append(item)

    list1 = list[0:2]

    result = ""
    for item in list1:
        result = result + str(item) + "\n"


    # Write the data to Google Cloud Storage
    write_storage_client = storage.Client()

    write_storage_client.get_bucket(bucket_name) \
        .blob(filename) \
        .upload_from_string(result)

Answer 2

当您在 json_data 中读取blob时，您将得到一个字节对象，并对其进行迭代时，将获得每个字符的数字表示形式。在下面的示例中，该示例从bytes对象创建了字典列表

json_data                                                                                                                                                                                                 
b'{"Website": "Google", "URL": "Google.com", "ID": 1}\n{"Website": "Bing", "URL": "Bing.com", "ID": 2}\n{"Website": "Yahoo", "URL": "Yahoo.com", "ID": 3}\n{"Website": "Yandex", "URL": "Yandex.com", "ID": 4}\n'

type(json_data)                                                                                                                                                                                           
bytes

website_list = [json.loads(row.decode('utf-8')) for row in json_data.split(b'\n') if row]                                                                                                                 

website_list                                                                                                                                                                                              
[{'Website': 'Google', 'URL': 'Google.com', 'ID': 1},
 {'Website': 'Bing', 'URL': 'Bing.com', 'ID': 2},
 {'Website': 'Yahoo', 'URL': 'Yahoo.com', 'ID': 3},
 {'Website': 'Yandex', 'URL': 'Yandex.com', 'ID': 4}]

从Python中的download_as_string访问blob对象中的数据

2 个答案: