I'm trying to parse a JSON item of the following format:
DataStore.prime('ws-stage-stat',
{ against: 0, field: 2, stageId: 9155, teamId: 26, type: 8 },
[[['goal','fastbreak','leftfoot',[1]],['goal','openplay','leftfoot',[2]],
['goal','openplay','rightfoot',[1]],['goal','owngoal','leftfoot',[1]],
['goal','penalty','rightfoot',[1]],['miss','corner','header',[6]],
['miss','corner','leftfoot',[2]],['miss','corner','rightfoot',[2]],
['miss','crossedfreekick','header',[1]],['miss','openplay','header',[4]],
['miss','openplay','leftfoot',[11]],['miss','openplay','rightfoot',[27]]]]
The items in quotes denote the type of goal scored or chance missed as listed on the site; the numbers represent volume. I'm assuming this is a JSON array of arrays with mixed text and numeric data. What I want to do is break it out into Python variables of the format:

var1 = "'goal','fastbreak','leftfoot'"
var2 = 1

...and repeat that pattern for all elements.
The code that retrieves this data structure is:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from scrapy.spider import BaseSpider
from scrapy import log
from scrapy.cmdline import execute
from scrapy.utils.markup import remove_tags
import time
import re
import json
import requests


class ExampleSpider(CrawlSpider):
    name = "goal2"
    allowed_domains = ["whoscored.com"]
    start_urls = ["http://www.whoscored.com/Teams/32/Statistics/England-Manchester-United"]
    download_delay = 5

    rules = [Rule(SgmlLinkExtractor(allow=('http://www.whoscored.com/Teams/32/Statistics/England-Manchester-United'),
                                    deny=('/News', '/Graphics', '/Articles', '/Live', '/Matches', '/Explanations',
                                          '/Glossary', 'ContactUs', 'TermsOfUse', 'Jobs', 'AboutUs', 'RSS'),),
                  follow=False, callback='parse_item')]

    def parse_item(self, response):
        url = 'http://www.whoscored.com/stagestatfeed'
        params = {
            'against': '0',
            'field': '2',
            'stageId': '9155',
            'teamId': '32',
            'type': '8'
        }
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
                   'X-Requested-With': 'XMLHttpRequest',
                   'Host': 'www.whoscored.com',
                   'Referer': 'http://www.whoscored.com/'}

        responser = requests.get(url, params=params, headers=headers)

        print responser.text
I've checked the type of responser.text using print type(responser.text), which returned a result of 'unicode'. Does this mean that the object is now a set of nested Python lists? If so, how would I parse it so that it returns the data in the format I'm after?
Thanks
Answer 0 (score: 2)
That's not JSON. JSON doesn't allow single-quoted strings, and it has no constructor calls like that. See the official grammar.
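For instance (my illustration, not part of the original answer), the standard json module rejects this kind of input outright:

import json

try:
    json.loads("['goal','fastbreak']")
except ValueError as exc:
    print(exc)  # single-quoted strings are not valid JSON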
You really want to figure out what the actual format is and parse it appropriately. Or, better, if you have any control over the code that produces this output, change it to something that's easy (and safe and efficient) to parse.
At any rate, this looks like it could be the repr of Python objects (in particular, a DataStore.prime object being constructed with a string, a dict, and a list as arguments), so you could probably parse it with eval. Whether that's a good idea (possibly with some kind of sanitizing first) depends on where you're getting the data from and what your security requirements are.
Or it could just as easily be JavaScript code, or any of various other scripting languages. (Most of them have similar structures with similar syntax, which is exactly why they all map easily between JSON and native data; JSON is basically a subset of the literals of most scripting languages.)
A slightly safer and saner solution is to explicitly parse the top level yourself, then use ast.literal_eval to parse the string, dict, and list components.
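A minimal sketch of that idea, applied just to the nested list (the only piece the question needs; the slicing regex and the shortened sample text are my illustration, not part of the original answer):

import re
from ast import literal_eval

raw = """DataStore.prime('ws-stage-stat',
    { against: 0, field: 2, stageId: 9155, teamId: 26, type: 8 },
    [[['goal','fastbreak','leftfoot',[1]],['miss','openplay','rightfoot',[27]]]])"""

# the [[...]] portion happens to be a valid Python literal, so once it is
# sliced out of the JavaScript call, literal_eval builds real nested lists
# without executing any code (unlike eval)
shots = literal_eval(re.search(r"\[\[.*\]\]", raw, re.DOTALL).group(0))
for entry in shots[0]:
    print(entry)  # ['goal', 'fastbreak', 'leftfoot', [1]] ...

Note that literal_eval can't handle the dict component as-is: { against: 0, ... } has unquoted keys, which is JavaScript, not a Python literal.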
A possibly-overcomplicated solution would be to write a real custom parser.
The best solution, however, is to change the source to give you something more useful. Even if you really do want to pass Python objects around unsafely, pickle is better than repr plus eval. But most likely, that isn't what you wanted to do in the first place.
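For illustration only (none of this is in the original answer), the pickle round trip looks like this; keep in mind that unpickling untrusted data is every bit as dangerous as eval:

import pickle

# serialize and restore a Python object without going through repr/eval;
# only do this when you control both the writer and the reader
payload = pickle.dumps({'against': 0, 'field': 2, 'stageId': 9155})
print(pickle.loads(payload))  # {'against': 0, 'field': 2, 'stageId': 9155}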
Answer 1 (score: 1)
One option would be to use a regular expression here:
import re
data = """
DataStore.prime('ws-stage-stat',
{ against: 0, field: 2, stageId: 9155, teamId: 26, type: 8 },
[[['goal','fastbreak','leftfoot',[1]],['goal','openplay','leftfoot',[2]],
['goal','openplay','rightfoot',[1]],['goal','owngoal','leftfoot',[1]],
['goal','penalty','rightfoot',[1]],['miss','corner','header',[6]],
['miss','corner','leftfoot',[2]],['miss','corner','rightfoot',[2]],
['miss','crossedfreekick','header',[1]],['miss','openplay','header',[4]],
['miss','openplay','leftfoot',[11]],['miss','openplay','rightfoot',[27]]]]
"""
# parse js
pattern = re.compile("\[([^\[]+?),\[(\d+)\]\]")
print pattern.findall(data)
This prints:
[
("'goal','fastbreak','leftfoot'", '1'),
("'goal','openplay','leftfoot'", '2'),
...
("'miss','openplay','rightfoot'", '27')
]
The pattern \[([^\[]+?),\[(\d+)\]\] basically matches the groups inside square brackets. The parentheses capture the parts of the matched string we care about, and the backslashes escape characters that have a special meaning in regular expressions, such as [ and ].
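As a follow-up (my addition, not from the original answer), the pairs findall() returns map directly onto the var1/var2 layout the question asks for. This reuses pattern and data from the snippet above, with an OrderedDict standing in for separate variables:

from collections import OrderedDict

result = OrderedDict()
for i, (labels, volume) in enumerate(pattern.findall(data)):
    result["var{}".format(2 * i + 1)] = labels       # "'goal','fastbreak','leftfoot'"
    result["var{}".format(2 * i + 2)] = int(volume)  # 1
print(result)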
Another option, since this looks like part of JavaScript code, would be to use a JavaScript parser. I've successfully used the slimit module for this sort of thing.
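A minimal sketch of that route (my own illustration; slimit's parser and node visitor are real, but the traversal below is my assumption about how to pull out the strings and numbers, and it expects data to hold the complete DataStore.prime(...) call, closing parenthesis included):

from slimit import ast
from slimit.parser import Parser
from slimit.visitors import nodevisitor

tree = Parser().parse(data)  # parse the JavaScript into an AST
for node in nodevisitor.visit(tree):
    if isinstance(node, ast.String):
        print(node.value)  # "'goal'", "'fastbreak'", ...
    elif isinstance(node, ast.Number):
        print(node.value)  # '0', '2', ..., '1', '27'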
Answer 2 (score: 1)
Running your code and using responser.text, you can split the text to get the list of data, then use an OrderedDict to store the data you want.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import requests
class ExampleSpider(CrawlSpider):
    name = "goal2"
    allowed_domains = ["whoscored.com"]
    start_urls = ["http://www.whoscored.com/Teams/32/Statistics/England-Manchester-United"]
    download_delay = 5

    rules = [Rule(SgmlLinkExtractor(allow=('http://www.whoscored.com/Teams/32/Statistics/England-Manchester-United'),
                                    deny=('/News', '/Graphics', '/Articles', '/Live', '/Matches', '/Explanations',
                                          '/Glossary', 'ContactUs', 'TermsOfUse', 'Jobs', 'AboutUs', 'RSS'),),
                  follow=False, callback='parse_item')]

    def parse_item(self, response):
        url = 'http://www.whoscored.com/stagestatfeed'
        params = {
            'against': '0',
            'field': '2',
            'stageId': '9155',
            'teamId': '32',
            'type': '8'
        }
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
                   'X-Requested-With': 'XMLHttpRequest',
                   'Host': 'www.whoscored.com',
                   'Referer': 'http://www.whoscored.com/'}

        responser = requests.get(url, params=params, headers=headers)

        resp = responser.text
        from ast import literal_eval
        from collections import OrderedDict

        d = OrderedDict()
        # the feed returns the nested [[...]] list as a single
        # whitespace-free token, so splitting on whitespace isolates it
        for line in resp.split():
            if line.startswith("[[["):
                break
        l = literal_eval(line)  # safely evaluate the token into nested lists
        count = 1
        for sub_ele in l[0]:
            print sub_ele[-1]
            # join the text labels, e.g. "goal, fastbreak, leftfoot"
            d["var{}".format(count)] = ", ".join(sub_ele[:-1])
            count += 1
            print sub_ele[-1][0], count
            if sub_ele[-1][0]:
                # the trailing [n] holds the volume for that shot type
                d["var{}".format(count)] = sub_ele[-1][0]
                count += 1
        print d
Output:

OrderedDict([('var1', 'goal, corner, rightfoot'), ('var2', 1),
             ('var3', 'goal, directfreekick, leftfoot'), ('var4', 1),
             ('var5', 'goal, openplay, leftfoot'), ('var6', 2),
             ('var7', 'goal, openplay, rightfoot'), ('var8', 2),
             ('var9', 'miss, corner, header'), ('var10', 5),
             ('var11', 'miss, corner, rightfoot'), ('var12', 1),
             ('var13', 'miss, directfreekick, leftfoot'), ('var14', 1),
             ('var15', 'miss, directfreekick, rightfoot'), ('var16', 2),
             ('var17', 'miss, openplay, header'), ('var18', 4),
             ('var19', 'miss, openplay, leftfoot'), ('var20', 14),
             ('var21', 'miss, openplay, rightfoot'), ('var22', 16)])