读取嵌套的 JSON 文档时出现错误 KeyError: 'retweeted_status'

时间:2016-10-22 11:35:27

标签: python pandas elasticsearch tweepy

# Importing libraries and API
import sys
import re,string
import json
from elasticsearch import Elasticsearch

# Create the Elasticsearch client used by the search further down.
# No arguments are passed, so the library's default connection settings
# apply (presumably a local node -- confirm for your cluster).
es = Elasticsearch()


def strip_links(text):

    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')    
    return text

def strip_all_entities(text):
    """Drop @mentions and #hashtags from *text* and blank out punctuation.

    Every punctuation character other than ``@`` and ``#`` is first turned
    into a space. The text is then split on whitespace, and any word whose
    first character is ``@`` or ``#`` is discarded. The remaining words are
    returned joined by single spaces.
    """
    kept_prefixes = ['@', '#']

    # Blank out punctuation, but leave the entity markers so the words they
    # introduce can still be recognised and removed below.
    for char in string.punctuation:
        if char in kept_prefixes:
            continue
        text = text.replace(char, ' ')

    # split() never yields empty or whitespace-padded tokens, so indexing
    # the first character is always safe.
    return ' '.join(
        token for token in text.split() if token[0] not in kept_prefixes
    )

# Fetch documents from the "tweets" index. The call has to be a single
# expression: with the argument list on its own line, `res` would be bound
# to the method itself and the next line would not be valid Python.
res = es.search(index="tweets", doc_type='rumors',
                body={"query": {"match_all": {}}})

for hit in res['hits']['hits']:
    source = hit['_source']

    if 'text' in source:
        print(strip_all_entities(strip_links(source['text'].encode("utf-8"))))

        # 'retweeted_status' is not present on every document, so indexing
        # it unconditionally raises KeyError on the first hit without it.
        # `or {}` also covers a key that is present but null.
        retweeted = source.get('retweeted_status') or {}
        if 'text' in retweeted:
            print(strip_all_entities(strip_links(retweeted['text'].encode("utf-8"))))

错误:

我说所有的黑色从车到里面从装备到钉子

Traceback (most recent call last):
  File "/Users/ConnectorNLP.py", line 35, in <module>
    print strip_all_entities(strip_links(hit['_source']['retweeted_status']['text'].encode("utf-8")))
KeyError: 'retweeted_status'

JSON:

    "_source": {
      "contributors": null,
      "truncated": false,
      "text": "RT @Ionerivan: i miss the old days, no stress, no worries, always a good time.",
      "is_quote_status": false,
      "in_reply_to_status_id": null,
      "id": 789575690161049605,
      "favorite_count": 0,
      "source": "<a href=\"https://about.twitter.com/products/tweetdeck\" rel=\"nofollow\">TweetDeck</a>",
      "retweeted": false,
      "coordinates": null,
      "timestamp_ms": "1477084488798",
      "entities": {
        "user_mentions": [
          {
            "id": 756674882922881024,
            "indices": [
              3,
              13
            ],
            "id_str": "756674882922881024",
            "screen_name": "Ionerivan",
            "name": "ⅈvan"
          }
        ],
        "symbols": [],
        "hashtags": [],
        "urls": []
      },
      "in_reply_to_screen_name": null,
      "id_str": "789575690161049605",
      "retweet_count": 0,
      "in_reply_to_user_id": null,
      "favorited": false,
      "retweeted_status": {
        "contributors": null,
        "truncated": false,
        "text": "i miss the old days, no stress, no worries, always a good time.",

0 个答案:

没有答案