Question

我正在尝试设置tweepy以流式传输到Elasticsearch，但是，我似乎在没有使用hashtag或location的情况下流式传输示例推文有问题，我尝试过steam.sample（）但是这似乎给了我错误：

    {u'delete': {u'status': {u'user_id_str': u'1538141671', u'user_id': 1538141671, u'id': 972190631614406656, u'id_str': u'972190631614406656'}, u'timestamp_ms': u'1520623506593'}}
Traceback (most recent call last):
  File "sentiment2.py", line 98, in <module>
    stream.sample()
  File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 419, in sample
    self._start(async)
  File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 361, in _start
    self._run()
  File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 294, in _run
    raise exception
KeyError: 'text'

或此错误：

  File "sentiment2.py", line 98, in <module>
    stream.sample()
  File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 419, in sample
    self._start(async)
  File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 361, in _start
    self._run()
  File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 294, in _run
    raise exception
IndexError: list index out of range

这些错误不一定会立即发生，我可以看到一些推文被打印到控制台，但是由于elasticsearch索引中的文档数量没有增加，因此它们都没有被实际编入索引。

此外，我似乎在从JSON对象获取主题标签时遇到问题，当我更改为通过过滤的主题标签进行搜索以测试检索它时，我得到此错误，我相信它是某种不兼容的对象类型但不是确定如何解决这个问题？

 File "sentiment2.py", line 99, in <module>
    stream.filter(track=['#EUref', '#Brexit'])
  File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 445, in filter
    self._start(async)
  File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 361, in _start
    self._run()
  File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 294, in _run
    raise exception
elasticsearch.exceptions.RequestError: TransportError(400, u'mapper_parsing_exception', u'object mapping for [hashtags] tried to parse field [hashtags] as object, but found a concrete value')

我的代码：

import json
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from textblob import TextBlob
from elasticsearch import Elasticsearch
from datetime import datetime

# import twitter keys and tokens
from config import *

# create instance of elasticsearch
es = Elasticsearch()

indexName = "test_new_fields"

consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''


class TweetStreamListener(StreamListener):

    # on success
    def on_data(self, data):

        # decode json
        dict_data = json.loads(data) # data is a json string
        print(dict_data)
        # pass tweet into TextBlob
        tweet = TextBlob(dict_data["text"])
        # determine if sentiment is positive, negative, or neutral
        if tweet.sentiment.polarity < 0:
            sentiment = "negative"
        elif tweet.sentiment.polarity == 0:
            sentiment = "neutral"
        else:
            sentiment = "positive"

        # output polarity sentiment and tweet text
        print (str(tweet.sentiment.polarity) + " " + sentiment + " " + dict_data["text"])
        coord = dict_data["coordinates"]
        if coord is not None:
            coord = dict_data["coordinates"]
            lan = dict_data["coordinates"][0]
            lat = dict_data["coordinates"][1]
        else:
            coord = "None"

    es.indices.put_settings(index=indexName, body={"index.blocks.write":False})
        # add text and sentiment info to elasticsearch
        es.index(index=indexName,
                 doc_type="test-type",
                 body={"author": dict_data["user"]["screen_name"],
                       "date": dict_data["created_at"], # unfortunately this gets stored as a string
                       "location": dict_data["user"]["location"], # user location
                       "followers": dict_data["user"]["followers_count"],
                       "friends": dict_data["user"]["friends_count"],
                       "time_zone": dict_data["user"]["time_zone"],
                       "lang": dict_data["user"]["lang"],
                       #"timestamp": float(dict_data["timestamp_ms"]), # double not recognised as date 
                       "timestamp": dict_data["timestamp_ms"],
                       "datetime": datetime.now(),
                       "message": dict_data["text"],
                       "hashtags": dict_data["entities"]["hashtags"][0]["text"],
                       #"retweetCount": dict_data["'retweet_count'"],
                       "polarity": tweet.sentiment.polarity,
                       "subjectivity": tweet.sentiment.subjectivity,
                       "sentiment": sentiment,
                       # handle geo data
                       "coordinates": coord
                      # if coord is not None:
                       #     "coordinates": dict_data["coordinates"]
                        #    "lan": dict_data["coordinates"][0]
                         #   "lat": dict_data["coordinates"][1]
                #       else:
                 #           "coordinates": "None"
                       })
        return True

    # on failure
    def on_error(self, status):
        print (status)

if __name__ == '__main__':

    # create instance of the tweepy tweet stream listener
    listener = TweetStreamListener()

    # set twitter keys/tokens
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    # create instance of the tweepy stream
    stream = Stream(auth, listener)
    stream.sample()

    # search twitter for these keywords
    #stream.filter(track=['#EUref', '#Brexit'])

映射：

{
  "test_new_fields" : {
    "mappings" : {
      "test-type" : {
        "properties" : {
          "author" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "coordinates" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "country" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "countrycode" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "date" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "datetime" : {
            "type" : "date"
          },
          "followers" : {
            "type" : "long"
          },
          "friends" : {
            "type" : "long"
          },
          "geoEnabled" : {
            "type" : "boolean"
          },
          "hashtags" : {
            "properties" : {
              "indices" : {
                "type" : "long"
              },
              "text" : {
                "type" : "text",
                "fields" : {
                  "keyword" : {
                    "type" : "keyword",
                    "ignore_above" : 256
                  }
                }
              }
            }
          },
          "lang" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "location" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "message" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "polarity" : {
            "type" : "float"
          },
          "sentiment" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "subjectivity" : {
            "type" : "float"
          },
          "time_zone" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "timestamp" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          }
        }
      }
    }
  }
}

Answer 1

您的hashtags字段是一个对象字段，其中有一个名为indices的子字段 - 为什么要构建一个只有字段的对象？它毫无意义

      "hashtags" : {
        "properties" : {
          "indices" : {
            "type" : "long"
          }

如果您不想更改索引，则必须在编制索引时声明子字段：

"hashtags": {"indices": int(dict_data["entities"]["hashtags"][0]["text"])},
#"retweetCount": dict_data["'retweet_count'"],
"polarity": tweet.sentiment.polarity,

但是，如果可以的话，我建议你让你的hashtags字段不是由长字段组成的对象，而是直接用长字段

tweepy elasticsearch - 流式样本推文和主题标签

1 个答案: