我正在尝试设置tweepy以流式传输到Elasticsearch,但是,我似乎在没有使用hashtag或location的情况下流式传输示例推文有问题,我尝试过steam.sample()但是这似乎给了我错误:
{u'delete': {u'status': {u'user_id_str': u'1538141671', u'user_id': 1538141671, u'id': 972190631614406656, u'id_str': u'972190631614406656'}, u'timestamp_ms': u'1520623506593'}}
Traceback (most recent call last):
File "sentiment2.py", line 98, in <module>
stream.sample()
File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 419, in sample
self._start(async)
File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 361, in _start
self._run()
File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 294, in _run
raise exception
KeyError: 'text'
或此错误:
File "sentiment2.py", line 98, in <module>
stream.sample()
File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 419, in sample
self._start(async)
File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 361, in _start
self._run()
File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 294, in _run
raise exception
IndexError: list index out of range
这些错误不一定会立即发生,我可以看到一些推文被打印到控制台,但是由于elasticsearch索引中的文档数量没有增加,因此它们都没有被实际编入索引。
此外,我似乎在从JSON对象获取主题标签时遇到问题,当我更改为通过过滤的主题标签进行搜索以测试检索它时,我得到此错误,我相信它是某种不兼容的对象类型但不是确定如何解决这个问题?
File "sentiment2.py", line 99, in <module>
stream.filter(track=['#EUref', '#Brexit'])
File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 445, in filter
self._start(async)
File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 361, in _start
self._run()
File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 294, in _run
raise exception
elasticsearch.exceptions.RequestError: TransportError(400, u'mapper_parsing_exception', u'object mapping for [hashtags] tried to parse field [hashtags] as object, but found a concrete value')
我的代码:
import json
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from textblob import TextBlob
from elasticsearch import Elasticsearch
from datetime import datetime
# import twitter keys and tokens
from config import *
# create instance of elasticsearch
es = Elasticsearch()
indexName = "test_new_fields"
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
class TweetStreamListener(StreamListener):
# on success
def on_data(self, data):
# decode json
dict_data = json.loads(data) # data is a json string
print(dict_data)
# pass tweet into TextBlob
tweet = TextBlob(dict_data["text"])
# determine if sentiment is positive, negative, or neutral
if tweet.sentiment.polarity < 0:
sentiment = "negative"
elif tweet.sentiment.polarity == 0:
sentiment = "neutral"
else:
sentiment = "positive"
# output polarity sentiment and tweet text
print (str(tweet.sentiment.polarity) + " " + sentiment + " " + dict_data["text"])
coord = dict_data["coordinates"]
if coord is not None:
coord = dict_data["coordinates"]
lan = dict_data["coordinates"][0]
lat = dict_data["coordinates"][1]
else:
coord = "None"
es.indices.put_settings(index=indexName, body={"index.blocks.write":False})
# add text and sentiment info to elasticsearch
es.index(index=indexName,
doc_type="test-type",
body={"author": dict_data["user"]["screen_name"],
"date": dict_data["created_at"], # unfortunately this gets stored as a string
"location": dict_data["user"]["location"], # user location
"followers": dict_data["user"]["followers_count"],
"friends": dict_data["user"]["friends_count"],
"time_zone": dict_data["user"]["time_zone"],
"lang": dict_data["user"]["lang"],
#"timestamp": float(dict_data["timestamp_ms"]), # double not recognised as date
"timestamp": dict_data["timestamp_ms"],
"datetime": datetime.now(),
"message": dict_data["text"],
"hashtags": dict_data["entities"]["hashtags"][0]["text"],
#"retweetCount": dict_data["'retweet_count'"],
"polarity": tweet.sentiment.polarity,
"subjectivity": tweet.sentiment.subjectivity,
"sentiment": sentiment,
# handle geo data
"coordinates": coord
# if coord is not None:
# "coordinates": dict_data["coordinates"]
# "lan": dict_data["coordinates"][0]
# "lat": dict_data["coordinates"][1]
# else:
# "coordinates": "None"
})
return True
# on failure
def on_error(self, status):
print (status)
if __name__ == '__main__':
# create instance of the tweepy tweet stream listener
listener = TweetStreamListener()
# set twitter keys/tokens
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# create instance of the tweepy stream
stream = Stream(auth, listener)
stream.sample()
# search twitter for these keywords
#stream.filter(track=['#EUref', '#Brexit'])
映射:
{
"test_new_fields" : {
"mappings" : {
"test-type" : {
"properties" : {
"author" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"coordinates" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"country" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"countrycode" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"date" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"datetime" : {
"type" : "date"
},
"followers" : {
"type" : "long"
},
"friends" : {
"type" : "long"
},
"geoEnabled" : {
"type" : "boolean"
},
"hashtags" : {
"properties" : {
"indices" : {
"type" : "long"
},
"text" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
},
"lang" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"location" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"message" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"polarity" : {
"type" : "float"
},
"sentiment" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"subjectivity" : {
"type" : "float"
},
"time_zone" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"timestamp" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
答案 0 :(得分:0)
您的hashtags
字段是一个对象字段,其中有一个名为indices
的子字段 - 为什么要构建一个只有字段的对象?它毫无意义
"hashtags" : {
"properties" : {
"indices" : {
"type" : "long"
}
如果您不想更改索引,则必须在编制索引时声明子字段:
"hashtags": {"indices": int(dict_data["entities"]["hashtags"][0]["text"])},
#"retweetCount": dict_data["'retweet_count'"],
"polarity": tweet.sentiment.polarity,
但是,如果可以的话,我建议你让你的hashtags字段不是由长字段组成的对象,而是直接用长字段