需要Python方面的帮助:从推文中提取文本

时间:2014-09-16 23:46:10

标签: python python-2.7 python-3.x

我有一条推文,想从中提取转推数、粉丝数、关注数和收藏数。推文看起来像:"retweet_count":0,中间是一些随机文本,"followers_count":27,又是一些随机文本,"favourites_count":100,

如何从这样一段文本中提取这些数字,并把它们输出在同一行?期望的输出:

0 27 100

格式化后的JSON文档(提问者曾将其包含在原始问题中,但随后删除了):

{
    "created_at": "Tue Feb 18 02:15:22 +0000 2014",
    "id": 435598383483322368,
    "id_str": "435598383483322368",
    "text": "Iran: Khamenei says nuclear talks will ‘lead nowhere http://t.co/t9d5WsDmBb #Aljazeera #NBC #CampLiberty #FREETHE7 #FOXNEWS #syria #CNN #AFP",
    "source": "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Mobile Web (M2)</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 1350553807,
        "id_str": "1350553807",
        "name": "S IranChange",
        "screen_name": "SIranChange",
        "location": "",
        "url": null,
        "description": "سال س - سال سرنگونی آخوندها و رهایی میهن\r\n -- S for Sarnegooni (Farsi) - Regime Change  -- Support People's Resistance in Iran - IranNCR - @Maryam_Rajavi",
        "protected": false,
        "followers_count": 582,
        "friends_count": 144,
        "listed_count": 5,
        "created_at": "Sun Apr 14 00:04:31 +0000 2013",
        "favourites_count": 742,
        "utc_offset": null,
        "time_zone": null,
        "geo_enabled": false,
        "verified": false,
        "statuses_count": 43040,
        "lang": "en",
        "contributors_enabled": false,
        "is_translator": false,
        "is_translation_enabled": false,
        "profile_background_color": "C0DEED",
        "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
        "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
        "profile_background_tile": false,
        "profile_image_url": "http://pbs.twimg.com/profile_images/3517290015/343d943432a93f0353485044d56715de_normal.jpeg",
        "profile_image_url_https": "https://pbs.twimg.com/profile_images/3517290015/343d943432a93f0353485044d56715de_normal.jpeg",
        "profile_link_color": "0084B4",
        "profile_sidebar_border_color": "C0DEED",
        "profile_sidebar_fill_color": "DDEEF6",
        "profile_text_color": "333333",
        "profile_use_background_image": true,
        "default_profile": true,
        "default_profile_image": false,
        "following": null,
        "follow_request_sent": null,
        "notifications": null
    },
    "geo": null,
    "coordinates": null,
    "place": null,
    "contributors": null,
    "retweet_count": 0,
    "favorite_count": 0,
    "entities": {
        "hashtags": [
            {
                "text": "Aljazeera",
                "indices": [
                    76,
                    86
                ]
            },
            {
                "text": "NBC",
                "indices": [
                    87,
                    91
                ]
            },
            {
                "text": "CampLiberty",
                "indices": [
                    92,
                    104
                ]
            },
            {
                "text": "FREETHE7",
                "indices": [
                    105,
                    114
                ]
            },
            {
                "text": "FOXNEWS",
                "indices": [
                    115,
                    123
                ]
            },
            {
                "text": "syria",
                "indices": [
                    124,
                    130
                ]
            },
            {
                "text": "CNN",
                "indices": [
                    131,
                    135
                ]
            },
            {
                "text": "AFP",
                "indices": [
                    136,
                    140
                ]
            }
        ],
        "symbols": [],
        "urls": [
            {
                "url": "http://t.co/t9d5WsDmBb",
                "expanded_url": "http://is.gd/7YeC5Y",
                "display_url": "is.gd/7YeC5Y",
                "indices": [
                    53,
                    75
                ]
            }
        ],
        "user_mentions": []
    },
    "favorited": false,
    "retweeted": false,
    "possibly_sensitive": false,
    "filter_level": "medium",
    "lang": "en"
}

1 个答案:

答案 0 :(得分:2)

通过 json.loads()

import json

# Raw tweet payload exactly as delivered by the API.
#
# The literal MUST be a raw string (r"""..."""): the payload contains JSON
# escape sequences (\" \/ \r\n \uXXXX). In a normal string literal Python
# would process them first -- e.g. \" collapses to a bare ", which terminates
# the "source" value early and makes json.loads() raise. With a raw string
# the backslashes reach the JSON parser untouched and it decodes them itself.
#
# The payload is also kept on a single line: a physical line break inside the
# literal would put a raw newline inside a JSON string value, which strict
# JSON parsing rejects as an invalid control character.
text = r"""{"created_at":"Tue Feb 18 02:15:22 +0000 2014","id":435598383483322368,"id_str":"435598383483322368","text":"Iran: Khamenei says nuclear talks will \u2018lead nowhere http:\/\/t.co\/t9d5WsDmBb #Aljazeera #NBC #CampLiberty #FREETHE7 #FOXNEWS #syria #CNN #AFP","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eMobile Web (M2)\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1350553807,"id_str":"1350553807","name":"S IranChange","screen_name":"SIranChange","location":"","url":null,"description":"\u0633\u0627\u0644 \u0633 - \u0633\u0627\u0644 \u0633\u0631\u0646\u06af\u0648\u0646\u06cc \u0622\u062e\u0648\u0646\u062f\u0647\u0627 \u0648 \u0631\u0647\u0627\u06cc\u06cc \u0645\u06cc\u0647\u0646\r\n -- S for Sarnegooni (Farsi) - Regime Change  -- Support People's Resistance in Iran - IranNCR - @Maryam_Rajavi","protected":false,"followers_count":582,"friends_count":144,"listed_count":5,"created_at":"Sun Apr 14 00:04:31 +0000 2013","favourites_count":742,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":43040,"lang":"en","contributors_enabled":false,"is_translator":false,"is_translation_enabled":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/3517290015\/343d943432a93f0353485044d56715de_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/3517290015\/343d943432a93f0353485044d56715de_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"Aljazeera","indices":[76,86]},{"text":"NBC","indices":[87,91]},{"text":"CampLiberty","indices":[92,104]},{"text":"FREETHE7","indices":[105,114]},{"text":"FOXNEWS","indices":[115,123]},{"text":"syria","indices":[124,130]},{"text":"CNN","indices":[131,135]},{"text":"AFP","indices":[136,140]}],"symbols":[],"urls":[{"url":"http:\/\/t.co\/t9d5WsDmBb","expanded_url":"http:\/\/is.gd\/7YeC5Y","display_url":"is.gd\/7YeC5Y","indices":[53,75]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"}"""

# Parse the payload into nested dicts/lists.
js = json.loads(text)

# The follower/friend/favourite counters live on the nested "user" object.
# NOTE: "retweet_count" is a top-level key of the tweet (js['retweet_count']),
# not part of "user".
user = js['user']

# Write the three user counters on one line, separated by single spaces.
# str.format (rather than an f-string) keeps the script usable on Python 2.7.
with open('file_to_write_to.txt', 'w') as f:
    f.write('{} {} {}'.format(user['followers_count'], user['friends_count'], user['favourites_count']))