Question

我自己一直在努力解决这个问题并且失去了生产力。

我正在使用 Python 通过 API 打开会话并提取其中的数据。URL 的格式为：

http://api.kivaws.org/v1/teams/2/loans.json

路径中的 "2" 是表示团队 ID 的占位符，加载的页面包含该团队发放的所有贷款。不必在意这些的具体含义；只需知道我的代码通过修改这个 URL 来遍历各个团队。代码如下：

import urllib.request as urllib
import json
import time

# Crawl the Kiva API team by team, strip every loan record down to its
# identifying fields, tag it with the team id, and dump the collected
# records to a JSON file every BATCH_SIZE teams.

# Loan fields removed from every record; all other fields are kept.
# Not every loan carries every field, so removal tolerates missing keys.
UNWANTED_FIELDS = (
    'name', 'lender_count', 'loan_amount', 'sector', 'description',
    'status', 'funded_amount', 'image', 'activity', 'use', 'location',
    'posted_date', 'borrower_count', 'bonus_credit_eligibility', 'tags',
    'basket_amount', 'planned_expiration_date', 'themes',
    'currency_exchange_loss_amount', 'video',
)

# Number of teams read between two dumps to disk.
BATCH_SIZE = 3


def fetch_json(address):
    """Download *address* and return the parsed JSON document.

    The response is closed as soon as it has been read.  Network failures
    surface as ``OSError`` (``URLError`` is a subclass); malformed JSON
    surfaces as ``ValueError``.
    """
    with urllib.urlopen(address) as handle:
        return json.loads(handle.read().decode('utf-8'))


def strip_loans(loans, team_id):
    """Drop the unwanted fields from each loan in place and add ``team_id``."""
    for loan in loans:
        for field in UNWANTED_FIELDS:
            loan.pop(field, None)
        loan['team_id'] = team_id


team_loans = {}

url = "http://api.kivaws.org/v1/teams/"
# Teams ultimately 1 - 11885
for i in range(1, 4):
    team_url = url + str(i) + "/loans.json"

    try:
        data = fetch_json(team_url)
    except OSError:
        # Only network-level failures are skipped; Ctrl-C still interrupts.
        print("Could not handle url")
        continue

    # The first page reports how many pages exist for this team.
    num_pages = data['paging']['pages']
    loans = data['loans']
    strip_loans(loans, i)

    # More than one page: pull the remaining pages and append their loans.
    for page in range(2, num_pages + 1):
        print("Pulling loan data from team " + str(i) + "...")
        page_data = fetch_json(team_url + "?page=" + str(page))
        strip_loans(page_data['loans'], i)
        loans.extend(page_data['loans'])
        time.sleep(1)

    # Record the loans by team in the dict.
    team_loans[i] = loans
    if loans:
        print("===Data added to the team_loan dictionary===")
    else:
        print("!!!FAILURE to add data to team_loan dictionary!!!")

    # Dump to file every BATCH_SIZE teams.
    # NOTE: team_loans is not cleared after a dump, so each file contains
    # every team read so far, not just the latest batch.
    if i % BATCH_SIZE == 0:
        filename = "data" + str(i - BATCH_SIZE) + "-" + str(i) + ".json"
        with open(filename, "w") as outfile:
            print("===Now writing team " + str(i) + " data to outfile===")
            json.dump(team_loans, outfile, sort_keys=True, indent=2,
                      ensure_ascii=True)

    # Be polite to the API between teams.
    time.sleep(1)

# The message uses the same "-" separator as the file name written above.
print('Done! Check your outfile (data' + str(i - BATCH_SIZE) + '-' + str(i) + '.json)')

这确实是业余水平、乱七八糟的意大利面条式代码。基本上，API 页面中包含很多数据，但我只想要其中三个元素（各个 ID）。代码可以正常运行。问题在于我得到的数据结构，这也是本文的关键所在。下面是一个例子：

    {
      "1": [
        {
          "id": 434361,
          "partner_id": 225,
          "team_id": 1
        },
        {
          "id": 431287,
          "partner_id": 122,
          "team_id": 1
        }
      ],
      "2": [
        {
          "id": 1164263,
          "partner_id": 381,
          "team_id": 2
        },
        {
          "id": 1154377,
          "partner_id": 121,
          "team_id": 2
        }
      ],
      "3": [
        {
          "id": 1164263,
          "partner_id": 381,
          "team_id": 3
        },
        {
          "id": 1154377,
          "partner_id": 121,
          "team_id": 3
        }
      ]
    }

为什么这个 JSON 结构有问题？请注意，每个团队 ID 都对应一个由键值对对象组成的列表，而这些列表都嵌套在外层的 JSON 字典中。我不希望每个团队各有一个列表，我只想要这些列表中包含的所有对象。这是为了导入数据库表。数据应该如下所示：

{
  {
    "id": 434361,
    "partner_id": 225,
    "team_id": 1
  },
  {
    "id": 431287,
    "partner_id": 122,
    "team_id": 1
  },
  {
    "id": 1164263,
    "partner_id": 381,
    "team_id": 2
  },
  {
    "id": 1154377,
    "partner_id": 121,
    "team_id": 2
  },
  {
    "id": 1164263,
    "partner_id": 381,
    "team_id": 3
  },
  {
    "id": 1154377,
    "partner_id": 121,
    "team_id": 3
  }
}

以我有限的字典知识来看，如果我删除这些团队键（在我们的示例中是 "1"、"2" 和 "3"），它们各自列表中的内容也会一并被删除，最终得到一个空的 JSON 字典。

因此我一直试图手动删除构成列表的那些字符串（比如用一个正则表达式去掉 '"77" [' 以及 '}],'，同时还要用适当的字符串替换它们，以保持 JSON 的有效性）。出于显而易见的原因，这非常令人头疼。我这是在和数据较劲。然而，我还没有找到别的办法。

到目前为止，我还没有成功。如有需要澄清的地方请随时提问，我知道这篇帖子很长。谢谢！

Answer 1

只需获取值并展平它们：

import pprint

# Sample of the nested structure: team id -> list of loan records.
data = {'1': [{'id': 434361, 'partner_id': 225, 'team_id': 1},
       {'id': 431287, 'partner_id': 122, 'team_id': 1}],
 '2': [{'id': 1164263, 'partner_id': 381, 'team_id': 2},
       {'id': 1154377, 'partner_id': 121, 'team_id': 2}],
 '3': [{'id': 1164263, 'partner_id': 381, 'team_id': 3},
       {'id': 1154377, 'partner_id': 121, 'team_id': 3}]}

# Flatten the per-team lists into a single list of loan records.
# A nested comprehension visits each loan once, whereas sum(lists, [])
# copies the accumulated list on every addition.
flattened = [loan for loans in data.values() for loan in loans]

pprint.pprint(flattened)

输出：

[{'id': 1164263, 'partner_id': 381, 'team_id': 3},
 {'id': 1154377, 'partner_id': 121, 'team_id': 3},
 {'id': 1164263, 'partner_id': 381, 'team_id': 2},
 {'id': 1154377, 'partner_id': 121, 'team_id': 2},
 {'id': 434361, 'partner_id': 225, 'team_id': 1},
 {'id': 431287, 'partner_id': 122, 'team_id': 1}]

请注意，这是一个列表。你用大括号给出的最终输出是一个集合（set），在这里并不容易使用（字典不可哈希，无法作为集合的元素），可能也没什么用处。

Python：拉动后重塑JSON

1 个答案: