将JSON文件加载到Python 3.6中

时间:2018-02-01 14:10:53

标签: python json

我试图将一个包含许多推文的JSON文件加载到Python中,但总是收到如下错误:

ValueError(errmsg("Extra data", s, end, len(s)))

这是JSON文件:

{
    "contributors": null,
    "coordinates": null,
    "created_at": "Thu Feb 01 14:02:44 +0000 2018",
    "entities": {
        "hashtags": [
            {
                "indices": [
                    100,
                    114
                ],
                "text": "البلطجي_جبران"
            },
            {
                "indices": [
                    115,
                    132
                ],
                "text": "نبيه_بري_خط_احمر"
            }
        ],
        "symbols": [],
        "urls": [],
        "user_mentions": []
    },
    "favorite_count": 0,
    "favorited": false,
    "geo": null,
    "id": 959064478078955520,
    "id_str": "959064478078955520",
    "in_reply_to_screen_name": null,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "is_quote_status": false,
    "lang": "ar",
    "metadata": {
        "iso_language_code": "ar",
        "result_type": "recent"
    },
    "place": null,
    "retweet_count": 0,
    "retweeted": false,
    "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
    "text": "لو كانت حتخلص هيك من الاول بهالبساطة، ليش لتروعوا الامنين وتشلوا حياة الناس. وين الاعتذار المنتظر. ##البلطجي_جبران #نبيه_بري_خط_احمر",
    "truncated": false,
    "user": {
        "contributors_enabled": false,
        "created_at": "Wed Sep 28 16:22:51 +0000 2016",
        "default_profile": true,
        "default_profile_image": false,
        "description": "طالب هندسة ميكانيكية وطالب في كلية الحقوق _ العلوم السياسية..مهتم بالأحداث السياسة وبقضايا المجتمع..محب للأدب والثقافة الإسلامية",
        "entities": {
            "description": {
                "urls": []
            }
        },
        "favourites_count": 99,
        "follow_request_sent": false,
        "followers_count": 32,
        "following": false,
        "friends_count": 94,
        "geo_enabled": false,
        "has_extended_profile": true,
        "id": 781167295645245440,
        "id_str": "781167295645245440",
        "is_translation_enabled": false,
        "is_translator": false,
        "lang": "en",
        "listed_count": 0,
        "location": "Lebanon",
        "name": "Mujahed Dkmak",
        "notifications": false,
        "profile_background_color": "F5F8FA",
        "profile_background_image_url": null,
        "profile_background_image_url_https": null,
        "profile_background_tile": false,
        "profile_banner_url": "https://pbs.twimg.com/profile_banners/781167295645245440/1517268329",
        "profile_image_url": "http://pbs.twimg.com/profile_images/958118925824352256/_oZgUrap_normal.jpg",
        "profile_image_url_https": "https://pbs.twimg.com/profile_images/958118925824352256/_oZgUrap_normal.jpg",
        "profile_link_color": "1DA1F2",
        "profile_sidebar_border_color": "C0DEED",
        "profile_sidebar_fill_color": "DDEEF6",
        "profile_text_color": "333333",
        "profile_use_background_image": true,
        "protected": false,
        "screen_name": "Mucahit_Dkmk",
        "statuses_count": 167,
        "time_zone": null,
        "translator_type": "none",
        "url": null,
        "utc_offset": null,
        "verified": false
    }
}{
    "contributors": null,
    "coordinates": null,
    "created_at": "Thu Feb 01 14:02:29 +0000 2018",
    "entities": {
        "hashtags": [
            {
                "indices": [
                    73,
                    87
                ],
                "text": "البلطجي_جبران"
            }
        ],
        "symbols": [],
        "urls": [],
        "user_mentions": [
            {
                "id": 15621444,
                "id_str": "15621444",
                "indices": [
                    3,
                    15
                ],
                "name": "Jerry Maher",
                "screen_name": "jerrymahers"
            }
        ]
    },
    "favorite_count": 0,
    "favorited": false,
    "geo": null,
    "id": 959064414073901056,
    "id_str": "959064414073901056",
    "in_reply_to_screen_name": null,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "is_quote_status": false,
    "lang": "ar",
    "metadata": {
        "iso_language_code": "ar",
        "result_type": "recent"
    },
    "place": null,
    "retweet_count": 112,
    "retweeted": false,
    "retweeted_status": {
        "contributors": null,
        "coordinates": null,
        "created_at": "Thu Feb 01 07:37:12 +0000 2018",
        "entities": {
            "hashtags": [
                {
                    "indices": [
                        56,
                        70
                    ],
                    "text": "البلطجي_جبران"
                }
            ],
            "symbols": [],
            "urls": [
                {
                    "display_url": "twitter.com/i/web/status/9…",
                    "expanded_url": "https://twitter.com/i/web/status/958967454281207808",
                    "indices": [
                        117,
                        140
                    ],
                    "url": ""
                }
            ],
            "user_mentions": []
        },
        "favorite_count": 118,
        "favorited": false,
        "geo": null,
        "id": 958967454281207808,
        "id_str": "958967454281207808",
        "in_reply_to_screen_name": null,
        "in_reply_to_status_id": null,
        "in_reply_to_status_id_str": null,
        "in_reply_to_user_id": null,
        "in_reply_to_user_id_str": null,
        "is_quote_status": false,
        "lang": "ar",
        "metadata": {
            "iso_language_code": "ar",
            "result_type": "recent"
        },
        "place": {
            "attributes": {},
            "bounding_box": {
                "coordinates": [
                    [
                        [
                            12.453064,
                            55.613717
                        ],
                        [
                            12.6522012,
                            55.613717
                        ],
                        [
                            12.6522012,
                            55.731834
                        ],
                        [
                            12.453064,
                            55.731834
                        ]
                    ]
                ],
                "type": "Polygon"
            },
            "contained_within": [],
            "country": "Denmark",
            "country_code": "DK",
            "full_name": "Copenhagen, Denmark",
            "id": "936b83f20956cd4c",
            "name": "Copenhagen",
            "place_type": "city",
            "url": "https://api.twitter.com/1.1/geo/id/936b83f20956cd4c.json"
        },
        "retweet_count": 112,
        "retweeted": false,
        "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
        "text": "ما لم يتم تسريبه من كلام وزير خارجية التيار الوطني الحر #البلطجي_جبران هو تطاوله على الرئيس الشهيد رفيق الحريري وعه…",
        "truncated": true,
        "user": {
            "contributors_enabled": false,
            "created_at": "Sun Jul 27 16:49:04 +0000 2008",
            "default_profile": false,
            "default_profile_image": false,
            "description": "Daniel known as Jerry, CEO @SawtBeirut , a Journalist based in Europe & a father of 1 under 10!  #جيري_ماهر",
            "entities": {
                "description": {
                    "urls": [
                        {
                            "display_url": "Fb.com/sawtbeirut",
                            "expanded_url": "http://Fb.com/sawtbeirut",
                            "indices": [
                                98,
                                121
                            ],
                            "url": ""
                        }
                    ]
                },
                "url": {
                    "urls": [
                        {
                            "display_url": "sbeirut.com",
                            "expanded_url": "http://sbeirut.com",
                            "indices": [
                                0,
                                23
                            ],
                            "url": ""
                        }
                    ]
                }
            },
            "favourites_count": 4014,
            "follow_request_sent": false,
            "followers_count": 241866,
            "following": false,
            "friends_count": 39,
            "geo_enabled": true,
            "has_extended_profile": true,
            "id": 15621444,
            "id_str": "15621444",
            "is_translation_enabled": true,
            "is_translator": false,
            "lang": "en",
            "listed_count": 1011,
            "location": "Sweden | مملكة السويد",
            "name": "Jerry Maher",
            "notifications": false,
            "profile_background_color": "DADDC0",
            "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/591745396025602048/8WfYJY91.jpg",
            "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/591745396025602048/8WfYJY91.jpg",
            "profile_background_tile": false,
            "profile_banner_url": "https://pbs.twimg.com/profile_banners/15621444/1478685711",
            "profile_image_url": "http://pbs.twimg.com/profile_images/938835270593421314/hWzZM4NV_normal.jpg",
            "profile_image_url_https": "https://pbs.twimg.com/profile_images/938835270593421314/hWzZM4NV_normal.jpg",
            "profile_link_color": "646D7E",
            "profile_sidebar_border_color": "E2EAEF",
            "profile_sidebar_fill_color": "E2EAEF",
            "profile_text_color": "6C961C",
            "profile_use_background_image": true,
            "protected": false,
            "screen_name": "jerrymahers",
            "statuses_count": 7101,
            "time_zone": "Santiago",
            "translator_type": "none",
            "url": "",
            "utc_offset": -10800,
            "verified": true
        }
    },
    "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
    "text": "RT @jerrymahers: ما لم يتم تسريبه من كلام وزير خارجية التيار الوطني الحر #البلطجي_جبران هو تطاوله على الرئيس الشهيد رفيق الحريري وعهده السي…",
    "truncated": false,
    "user": {
        "contributors_enabled": false,
        "created_at": "Thu Jan 23 18:10:34 +0000 2014",
        "default_profile": true,
        "default_profile_image": false,
        "description": "‏كلمتان خفيفتان على اللسان\n ثقيلتان في الميزان حبيبتان للرحمن\n\nسُبحَانَﷲ͜وَبحَمدِه     سُبحِاْنﷲ͜ﭑلْعظِيم",
        "entities": {
            "description": {
                "urls": []
            }
        },
        "favourites_count": 227,
        "follow_request_sent": false,
        "followers_count": 666,
        "following": false,
        "friends_count": 96,
        "geo_enabled": false,
        "has_extended_profile": false,
        "id": 2299720641,
        "id_str": "2299720641",
        "is_translation_enabled": false,
        "is_translator": false,
        "lang": "ar",
        "listed_count": 6,
        "location": "",
        "name": "عاصفة الحزم . AM",
        "notifications": false,
        "profile_background_color": "C0DEED",
        "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
        "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
        "profile_background_tile": false,
        "profile_banner_url": "https://pbs.twimg.com/profile_banners/2299720641/1445001218",
        "profile_image_url": "http://pbs.twimg.com/profile_images/909051784861306880/hlVPyEeq_normal.jpg",
        "profile_image_url_https": "https://pbs.twimg.com/profile_images/909051784861306880/hlVPyEeq_normal.jpg",
        "profile_link_color": "1DA1F2",
        "profile_sidebar_border_color": "C0DEED",
        "profile_sidebar_fill_color": "DDEEF6",
        "profile_text_color": "333333",
        "profile_use_background_image": true,
        "protected": false,
        "screen_name": "thowaibi",
        "statuses_count": 54627,
        "time_zone": null,
        "translator_type": "none",
        "url": null,
        "utc_offset": null,
        "verified": false
    }
}

我使用普通的 json.load 和 json.loads 方法加载它,但都只得到上面的错误。我搜索了解决方案,找到一种做法:用循环逐行读取文件,把当前行与后续各行拼接起来,检查拼接后的字符串是否是合法的JSON;如果是,就加载它并跳出循环。但即使这样也不起作用:光标一直读到文件末尾,却什么也没有加载出来。

顺便说一下,我不得不删除文件中的一些URL,因为Stack Overflow不允许我发布它们。

1 个答案:

答案 0 :(得分:1)

您的单个文件中包含多个JSON对象,但它们没有被放在同一个JSON数组里,因此整个文件不是有效的JSON。

请看这一行:

}{

这就是出现“Extra data”(多余数据)错误的原因:解析器读完第一个对象后,发现后面还有内容。被删除的URL并不是问题所在。

如果可能的话,我建议使用正确的JSON语法重新生成该文件:例如把所有对象放进一个数组(`[{...}, {...}]`),或者每行只写一个对象(JSON Lines 格式),然后逐行用 json.loads 解析。