#Importing libaray and Api
import sys
import re,string
import json
from elasticsearch import Elasticsearch
# create instance of elasticsearch
es = Elasticsearch()
def strip_links(text):
link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
links = re.findall(link_regex, text)
for link in links:
text = text.replace(link[0], ', ')
return text
def strip_all_entities(text):
entity_prefixes = ['@','#']
for separator in string.punctuation:
if separator not in entity_prefixes :
text = text.replace(separator,' ')
words = []
for word in text.split():
word = word.strip()
if word:
if word[0] not in entity_prefixes:
words.append(word)
return ' '.join(words)
res = es.search
(index="tweets", doc_type='rumors',body={"query": {"match_all": {}}})
for hit in res['hits']['hits']:
if 'text' in hit['_source']:
print strip_all_entities(strip_links(hit['_source']['text'].encode("utf-8")))
print strip_all_entities(strip_links(hit['_source']['retweeted_status']['text'].encode("utf-8")))
错误:
我说所有的黑色从车到里面从装备到钉子
Traceback (most recent call last):
File "/Users/ConnectorNLP.py", line 35, in <module>
print strip_all_entities(strip_links(hit['_source']['retweeted_status']['text'].encode("utf-8")))
KeyError: 'retweeted_status'
JSON:
"_source": {
"contributors": null,
"truncated": false,
"text": "RT @Ionerivan: i miss the old days, no stress, no worries, always a good time.",
"is_quote_status": false,
"in_reply_to_status_id": null,
"id": 789575690161049605,
"favorite_count": 0,
"source": "<a href=\"https://about.twitter.com/products/tweetdeck\" rel=\"nofollow\">TweetDeck</a>",
"retweeted": false,
"coordinates": null,
"timestamp_ms": "1477084488798",
"entities": {
"user_mentions": [
{
"id": 756674882922881024,
"indices": [
3,
13
],
"id_str": "756674882922881024",
"screen_name": "Ionerivan",
"name": "ⅈvan"
}
],
"symbols": [],
"hashtags": [],
"urls": []
},
"in_reply_to_screen_name": null,
"id_str": "789575690161049605",
"retweet_count": 0,
"in_reply_to_user_id": null,
"favorited": false,
"retweeted_status": {
"contributors": null,
"truncated": false,
"text": "i miss the old days, no stress, no worries, always a good time.",