在Python中去掉多余字符并从JSON中提取数组值

时间:2018-09-04 09:02:23

标签: python regex

我有以下JSON字符串,想把其中的值提取到一个Python列表中。我已经能取到id_list字符串,但我希望得到其中的每个值,并且每个值的末尾都不带冒号“:”:

编辑: 不能使用python json库。 我的方法(以前从未使用过很多正则表达式):https://regex101.com/r/qxYe9N/1

我想将表达式与re.filterall(EXPR,jsonstr)一起使用以接收如下列表:

result = ["B01M8QSY16", "B017XBDBI6", ...more ]

{
  "ajax": {
    "params": {
      "asinMetadataKeys": "adId",
      "featureId": "SimilaritiesCarousel",
      "reftagPrefix": "pd_sbs_60",
      "widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
      "imageHeight": 160,
      "linkGetParameters": "{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"ac83cd73-b019-11e8-99c8-33d23753c678\",\"pf_rd_r\":\"H21WNBAW5EGZX90ND4PN\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"e6DPw\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"xg8TH\"}",
      "faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
      "auiDeviceType": "desktop",
      "imageWidth": 160,
      "schemaVersion": 2,
      "productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
      "forceFreshWin": 0,
      "productDataFlavor": "Faceout",
      "relatedRequestID": "H21WNBAW5EGZX90ND4PN",
      "maxLineCount": 6
    },
    "id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
    "url": "/gp/p13n-shared/faceout-partial",
    "id_param_name": "asins"
  },
  "baseAsin": "B01GL56060",
  "name": "desktop-dp-sims_session-similarities",
  "set_size": 57
}

编辑:

原始字符串:

{"ajax":{"params":{"asinMetadataKeys":"adId","featureId":"SimilaritiesCarousel","reftagPrefix":"pd_sbs_193","widgetTemplateClass":"PI::Similarities::ViewTemplates::Carousel::Desktop","imageHeight":160,"linkGetParameters":"{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"e672bcd4-b03e-11e8-8dbb-41abd883f66d\",\"pf_rd_r\":\"X5Z293FJ403CC225M759\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"CrGGS\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"ktYgt\"}","faceoutTemplateClass":"PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout","auiDeviceType":"desktop","imageWidth":160,"schemaVersion":2,"productDetailsTemplateClass":"PI::P13N::ViewTemplates::ProductDetails::Desktop::Base","forceFreshWin":0,"productDataFlavor":"Faceout","relatedRequestID":"X5Z293FJ403CC225M759","maxLineCount":6},"id_list":["B07BHS22V6:","B00ITJNHX6:","B07DDGCLZ1:","B017XYQ4X2:","B01LYA8CLG:","B0747T62HS:","B00LHT0I78:","B071D5LL18:","B071NPLTRS:","B00CFMRFO0:","B01N4X1EL9:","B077R4WZ46:","B00YTZSTVY:","B073V5T8G2:","B00CFMRI7E:","B01ARIYIPM:","B0747X16FY:","B00ZWNPJVA:","B01N4WZ4AL:","B00BU662AU:","B07C2NYVMP:","B01FD7ZOB4:","B017M17VTC:","B00YTZST0K:","B07CVSJG6H:","B00V63GQBC:","B00NYBAJJY:","B01MCZ2ZQC:","B078BSJ8TV:","B077QXWJBR:","B07BL5FWVP:","B00N8SPSSU:","B01LXMVFGI:","B06ZY83D2Z:","B00ZQYY9TI:","B0761HT6JJ:","B06XRWB686:","B075XHDQ85:","B01LYJMK02:","B018JWYKRE:","B0759W61P6:","B078ZKNGRS:","B013BJBZBE:","B01LYMTVY2:","B072VMTVGZ:","B077QXW1Z9:","B07CMB96BX:","B07BNXNMZ5:","B01N3CY4Y3:","B018JX3J7U:","B0747T5MY1:","B07CQPTFDB:","B077QW292J:","B00LHT0GLQ:","B01C4B17XG:","B019WD74F4:"],"url":"/gp/p13n-shared/faceout-partial","id_param_name":"asins"},"baseAsin":"B01LS24R2U","name":"desktop-dp-sims_session-similarities","set_size":56}

4 个答案:

答案 0 :(得分:2)

直接使用Python的json库即可:

import json

# Sample document from the question, with the "linkGetParameters" entry left out.
j1 = """{
  "ajax": {
    "params": {
      "asinMetadataKeys": "adId",
      "featureId": "SimilaritiesCarousel",
      "reftagPrefix": "pd_sbs_60",
      "widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
      "imageHeight": 160,
      "faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
      "auiDeviceType": "desktop",
      "imageWidth": 160,
      "schemaVersion": 2,
      "productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
      "forceFreshWin": 0,
      "productDataFlavor": "Faceout",
      "relatedRequestID": "H21WNBAW5EGZX90ND4PN",
      "maxLineCount": 6
    },
    "id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
    "url": "/gp/p13n-shared/faceout-partial",
    "id_param_name": "asins"
  },
  "baseAsin": "B01GL56060",
  "name": "desktop-dp-sims_session-similarities",
  "set_size": 57
}"""

# Decode the JSON text into nested Python dicts and lists.
d1 = json.loads(j1)

# Walk down to the "id_list" array and remove the ":" from every entry.
id_list = [raw_id.replace(":", "") for raw_id in d1["ajax"]["id_list"]]
# Bare expression: echoes the list when run in an interactive session.
id_list

输出:

['B01M8QSY16',
 'B017XBDBI6',
 ...
 'B00R25QZHS']

我不得不删除“ linkGetParameters:...”行,因为它似乎不符合json。

答案 1 :(得分:1)

如果您确定属性"id_list"始终采用类似的格式(写在同一行,并且逗号和冒号之后都只有一个空格),那么不使用json模块也可以这样做:

[  # build the resulting list from…
    piece
    for piece in re.split(  # …the pieces obtained by splitting on…
        r':(?:,\s)?',  # …a colon, optionally followed by a comma and one whitespace character
        re.search(  # search…
            r'(?<="id_list": \[)((?:"[^"]+:"(?:,\s*)?)+)', j1)  # …for the quoted items right after `"id_list": [`
        .group()  # take the whole match
        .replace('"', ''),  # and drop all double quotes
    )
    if piece  # keep only the non-empty pieces
]
['B01M8QSY16', 'B017XBDBI6', 'B01GL5MYCE', 'B0751DHYXC', 'B01AHWOH54', 'B01M7XYENW', 'B01N7FKKXV', 'B07C1NLKS5', 'B00R25QZDC', 'B01AJB1VFW', 'B079K773M7', 'B07DX3W41P', 'B01GL5606A', 'B07654YLSB', 'B01GFL6MZE', 'B00WLI5E3M', 'B01CTE28DG', 'B01BELELVC', 'B00ZY7H91M', 'B077TPG2WK', 'B01G503MC6', 'B01LYZFC4V', 'B00ID9UQYK', 'B07C3T52LB', 'B07DX39RNS', 'B076551MZP', 'B0761RWKPQ', 'B00T8FD9YM', 'B07653JBYS', 'B07G316H74', 'B01FSEBC9K', 'B014QKBVH0', 'B01BVA2I4S', 'B01CVOZNAE', 'B07D19JDH9', 'B018ACDMJK', 'B00V0H83YW', 'B07C432PK3', 'B07B9P4T4V', 'B076H4WWLK', 'B077G3Y86F', 'B077Z7XLJF', 'B01NCFB2BB', 'B01M4I7FMC', 'B01BEVFJCM', 'B01FSEBC8G', 'B07DXCTKB6', 'B01NBHYAR0', 'B07DGWJ887', 'B00SLP58SU', 'B01N55H5AE', 'B013AZCPLS', 'B076PC3NYV', 'B01BVA2JHE', 'B07FF38J8C', 'B07DHGTS81', 'B00R25QZHS']

这是密集且几乎不可读的代码;按原样使用,或者如果需要的话,我可以更合理地细分逻辑。

答案 2 :(得分:1)

看到您无法使用JSON库,可以在此处尝试此表达式(在Python3上测试):

# Locate the "id_list" array (with or without whitespace after the colon),
# split its body on commas, then trim quotes, colons and spaces from each item.
# The array is expected to sit on a single line, since "." does not match newlines.
result = [ item.strip(' ":') for item in re.search(r'"id_list":\s*\[(.*?)\]', jsonstr).group(1).split(",") ]

(其中jsonstr是包含所有原始JSON代码的字符串)。

为便于理解,以上代码使用了

  1. re.search(不是您建议的re.filterall)来大致定位并选择该行,
  2. group以缩小选择范围,
  3. split将字符串转换为列表,然后
  4. strip修剪掉每个列表项中不必要的字符

为您提供ID列表,例如您在问题中指定的ID。

答案 3 :(得分:1)

首先,正如Florian H所说,要想使用Python的json模块,您应当向数据来源方索取有效的JSON;提供JSON的一方本就应该提供有效的JSON……

编辑:JSON似乎有效,请参见下文

不过,在尝试用the json module来满足您的需求时,我注意到解析问题来自linkGetParameters值中被转义的双引号。我猜这段JSON字符串是原样复制/粘贴进代码的,这很可能就是JSON解析问题的根源:把这段JSON直接粘贴到普通的Python字符串字面量中时,Python会把反斜杠当作双引号的转义符(\" 变成 "),而不是保留反斜杠和双引号这两个字符。要测试这段JSON内容,必须把它放进raw string(即以r为前缀的字符串)中:

import json

# Raw string literal (r prefix): every \" below stays as backslash + quote,
# which is what the JSON parser needs to read the nested "linkGetParameters" value.
json_ = r"""{
  "ajax": {
    "params": {
      "asinMetadataKeys": "adId",
      "featureId": "SimilaritiesCarousel",
      "reftagPrefix": "pd_sbs_60",
      "widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
      "imageHeight": 160,
      "linkGetParameters": "{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"ac83cd73-b019-11e8-99c8-33d23753c678\",\"pf_rd_r\":\"H21WNBAW5EGZX90ND4PN\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"e6DPw\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"xg8TH\"}",
      "faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
      "auiDeviceType": "desktop",
      "imageWidth": 160,
      "schemaVersion": 2,
      "productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
      "forceFreshWin": 0,
      "productDataFlavor": "Faceout",
      "relatedRequestID": "H21WNBAW5EGZX90ND4PN",
      "maxLineCount": 6
    },
    "id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
    "url": "/gp/p13n-shared/faceout-partial",
    "id_param_name": "asins"
  },
  "baseAsin": "B01GL56060",
  "name": "desktop-dp-sims_session-similarities",
  "set_size": 57
}"""

# Parse the document, then slice off the last character (the ":") of each id.
result = json.loads(json_)
# print(...) with a single argument runs on both Python 2 and Python 3;
# on Python 3 the items are shown without the u'' prefix.
print([id_[:-1] for id_ in result['ajax']['id_list']])
# [u'B01M8QSY16', u'B017XBDBI6', u'B01GL5MYCE', u'B0751DHYXC', u'B01AHWOH54', u'B01M7XYENW', u'B01N7FKKXV', u'B07C1NLKS5', u'B00R25QZDC', u'B01AJB1VFW', u'B079K773M7', u'B07DX3W41P', u'B01GL5606A', u'B07654YLSB', u'B01GFL6MZE', u'B00WLI5E3M', u'B01CTE28DG', u'B01BELELVC', u'B00ZY7H91M', u'B077TPG2WK', u'B01G503MC6', u'B01LYZFC4V', u'B00ID9UQYK', u'B07C3T52LB', u'B07DX39RNS', u'B076551MZP', u'B0761RWKPQ', u'B00T8FD9YM', u'B07653JBYS', u'B07G316H74', u'B01FSEBC9K', u'B014QKBVH0', u'B01BVA2I4S', u'B01CVOZNAE', u'B07D19JDH9', u'B018ACDMJK', u'B00V0H83YW', u'B07C432PK3', u'B07B9P4T4V', u'B076H4WWLK', u'B077G3Y86F', u'B077Z7XLJF', u'B01NCFB2BB', u'B01M4I7FMC', u'B01BEVFJCM', u'B01FSEBC8G', u'B07DXCTKB6', u'B01NBHYAR0', u'B07DGWJ887', u'B00SLP58SU', u'B01N55H5AE', u'B013AZCPLS', u'B076PC3NYV', u'B01BVA2JHE', u'B07FF38J8C', u'B07DHGTS81', u'B00R25QZHS']

一旦检索到id_list,您就可以使用string slicing删除每个ID的最后一个字符。

在使用原始来源的JSON内容而不是乱七八糟的字符串时,您应该不会遇到这种转义问题。


如果确实无法使用json模块,那么在假设每个id始终为10个字符长的前提下,可以这样做:

import re

# NOTE: the name `json` shadows the standard-library module of the same name
# if both are used in one session; it is kept so the snippet reads as before.
# This is a normal (non-raw) literal, so each \" collapses to a plain double
# quote. That is harmless here because the text is only scanned with a regex.
json = """{
  "ajax": {
    "params": {
      "asinMetadataKeys": "adId",
      "featureId": "SimilaritiesCarousel",
      "reftagPrefix": "pd_sbs_60",
      "widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
      "imageHeight": 160,
      "linkGetParameters": "{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"ac83cd73-b019-11e8-99c8-33d23753c678\",\"pf_rd_r\":\"H21WNBAW5EGZX90ND4PN\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"e6DPw\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"xg8TH\"}",
      "faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
      "auiDeviceType": "desktop",
      "imageWidth": 160,
      "schemaVersion": 2,
      "productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
      "forceFreshWin": 0,
      "productDataFlavor": "Faceout",
      "relatedRequestID": "H21WNBAW5EGZX90ND4PN",
      "maxLineCount": 6
    },
    "id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
    "url": "/gp/p13n-shared/faceout-partial",
    "id_param_name": "asins"
  },
  "baseAsin": "B01GL56060",
  "name": "desktop-dp-sims_session-similarities",
  "set_size": 57
}"""

# https://regex101.com/r/qxYe9N/11
# Capture every quoted token made of exactly 10 uppercase letters/digits that
# is followed by ':"'; the capture group leaves out the quotes and the colon.
id_re = re.compile(r'"([A-Z0-9]{10}):"')
result = id_re.findall(json)

# print(...) with a single argument runs on both Python 2 and Python 3.
print(result)
# ['B01M8QSY16', 'B017XBDBI6', 'B01GL5MYCE', 'B0751DHYXC', 'B01AHWOH54', 'B01M7XYENW', 'B01N7FKKXV', 'B07C1NLKS5', 'B00R25QZDC', 'B01AJB1VFW', 'B079K773M7', 'B07DX3W41P', 'B01GL5606A', 'B07654YLSB', 'B01GFL6MZE', 'B00WLI5E3M', 'B01CTE28DG', 'B01BELELVC', 'B00ZY7H91M', 'B077TPG2WK', 'B01G503MC6', 'B01LYZFC4V', 'B00ID9UQYK', 'B07C3T52LB', 'B07DX39RNS', 'B076551MZP', 'B0761RWKPQ', 'B00T8FD9YM', 'B07653JBYS', 'B07G316H74', 'B01FSEBC9K', 'B014QKBVH0', 'B01BVA2I4S', 'B01CVOZNAE', 'B07D19JDH9', 'B018ACDMJK', 'B00V0H83YW', 'B07C432PK3', 'B07B9P4T4V', 'B076H4WWLK', 'B077G3Y86F', 'B077Z7XLJF', 'B01NCFB2BB', 'B01M4I7FMC', 'B01BEVFJCM', 'B01FSEBC8G', 'B07DXCTKB6', 'B01NBHYAR0', 'B07DGWJ887', 'B00SLP58SU', 'B01N55H5AE', 'B013AZCPLS', 'B076PC3NYV', 'B01BVA2JHE', 'B07FF38J8C', 'B07DHGTS81', 'B00R25QZHS']