Saving a dictionary of tweets to a JSON file produces an empty dictionary

Date: 2015-04-10 09:49:48

Tags: python json python-2.7 twitter dictionary

I am trying to collect some localized tweets and store them on my hard drive as a dictionary of tweets. On some iterations of the fetchsamples function the saved dictionary is forced into an empty state, even though data is added to the dictionary during the for loop (see the output below).

I have tried different encodings and passing the "w" and "wb" flags to my save call, but it did not help.

I tried to reproduce it with random strings (to make it easier for people to check my code) but could not. I am not sure what in the tweet structure or in my code is causing this behaviour.

Note: I have added a code snippet for debugging, to catch the moment the dictionary is forced into an empty state.

import oauth2 as oauth
import urllib2 as urllib
import json
import pickle
import os

api_key = "Insert api_key here"
api_secret = "Insert api_secret here"
access_token_key = "Insert access_token_key"
access_token_secret = "Insert access_token_secret"

_debug = 0

oauth_token    = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)

signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

http_method = "GET"

http_handler  = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

def twitterreq(url, method, parameters):
    req = oauth.Request.from_consumer_and_token(oauth_consumer,
                                                token=oauth_token,
                                                http_method=http_method,
                                                http_url=url, 
                                                parameters=parameters)

    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
    headers = req.to_header()

    if http_method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        url = req.to_url()

    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)

    response = opener.open(url, encoded_post_data)

    return response

def fetchsamples():

    url = "https://stream.twitter.com/1/statuses/sample.json"
    url = "https://stream.twitter.com/1/statuses/filter.json?locations=-0.489,51.28,0.236,51.686"
    parameters = []
    response = twitterreq(url, "GET", parameters)

    data = {}
    count = 1
    for line in response:        
        try:
            strip = json.loads(line.strip())
            if strip['coordinates'] != None:
                data[count] = strip

                count += 1

                if count % 10 == 0: 
                    print count, len(data.keys())

        except Exception as e:
            # Print error and store in a log file
            print e            
            with open("/Temp/Data/error.log","w") as log:
                log.write(str(e))

        # If 100 tweets have passed save the file
        if count % 100 == 0:
            print "Before saving: ", len(data.keys())
            fp =  open("/Temp/Data/"+str(count/100)+".json","w")
            json.dump(data,fp,encoding="latin-1")
            fp.close()

            # This code is for debug purposes, to catch when the
            # dictionary is forced into an empty state
            if os.path.getsize("/Temp/Data/"+str(count/100)+".json") < 10:
                print "After saving: ", len(data.keys())
                return data
            else:
                data = {}

data = fetchsamples()

This produces the following output with no errors. The data dictionary is empty.

100 99
Before saving:  99
110 10
120 20
130 30
140 40
150 50
160 60
170 70
180 80
190 90
200 100
Before saving:  100
Before saving:  0
After saving:  0
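The symptom can be reproduced without the Twitter stream at all. The sketch below is illustrative (the `simulate` function, the fake `stream`, and the `saves` list standing in for files on disk are all my invention, not the asker's code); it feeds in JSON lines where some "tweets" have no coordinates, and shows how `count` getting stuck on a multiple of 100 causes the emptied dictionary to be "saved" again:

```python
import json

def simulate(lines):
    data = {}
    saves = []   # stands in for the sizes of the files written to disk
    count = 1
    for line in lines:
        tweet = json.loads(line)
        if tweet['coordinates'] is not None:
            data[count] = tweet
            count += 1
        # count is NOT incremented for tweets without coordinates, so once
        # it reaches a multiple of 100 it stays there...
        if count % 100 == 0:
            saves.append(len(data))
            data = {}   # ...and every following no-coordinate tweet
                        # triggers another "save" of the now-empty dict

    return saves

# 99 geotagged tweets bring count to 100; then two tweets without
# coordinates each hit the count % 100 == 0 branch again
stream = ['{"coordinates": [0, 0]}'] * 99 + ['{"coordinates": null}'] * 2
print(simulate(stream))   # -> [99, 0, 0]
```

The first save holds 99 tweets; the two repeat saves overwrite it with an empty dictionary, which matches the `Before saving:  0` line in the output above.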

2 Answers:

Answer 0 (score: 0)

The dictionary is empty because after every 100 iterations you either set data = {} or the dictionary is already empty. If I understand correctly, you need a second dictionary, one that you never empty, and push the items into that one.

import oauth2 as oauth
import urllib2 as urllib
import json
import pickle
import os

api_key = "Insert api_key here"
api_secret = "Insert api_secret here"
access_token_key = "Insert access_token_key"
access_token_secret = "Insert access_token_secret"

_debug = 0

oauth_token    = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)

signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

http_method = "GET"

http_handler  = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

def twitterreq(url, method, parameters):
    req = oauth.Request.from_consumer_and_token(oauth_consumer,
                                                token=oauth_token,
                                                http_method=http_method,
                                                http_url=url, 
                                                parameters=parameters)

    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
    headers = req.to_header()

    if http_method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        url = req.to_url()

    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)

    response = opener.open(url, encoded_post_data)

    return response

def fetchsamples():

    url = "https://stream.twitter.com/1/statuses/sample.json"
    url = "https://stream.twitter.com/1/statuses/filter.json?locations=-0.489,51.28,0.236,51.686"
    parameters = []
    response = twitterreq(url, "GET", parameters)

    data = {}
    allData = {}
    count = 1
    for line in response:        
        try:
            strip = json.loads(line.strip())
            if strip['coordinates'] != None:
                data[count] = strip
                allData[count] = strip

                count += 1

                if count % 10 == 0: 
                    print count, len(data.keys())

        except Exception as e:
            # Print error and store in a log file
            print e            
            with open("/Temp/Data/error.log","w") as log:
                log.write(str(e))

        # If 100 tweets have passed save the file
        if count % 100 == 0:
            print "Before saving: ", len(data.keys())
            fp =  open("/Temp/Data/"+str(count/100)+".json","w")
            json.dump(data,fp,encoding="latin-1")
            fp.close()

            # Return data if the file is empty and stop
            if os.path.getsize("/Temp/Data/"+str(count/100)+".json") < 10:
                print "After saving: ", len(data.keys())
                return allData
            else:
                data = {}

data = fetchsamples()

Answer 1 (score: 0)

The problem was the way I incremented the count value. Since count is only incremented when strip["coordinates"] != None, if I received a tweet with strip["coordinates"] == None the count did not increase, but count % 100 == 0 was still True and data was already {}, which means the original non-empty file was replaced with an empty one.

The solution is to increment count after saving, for example:

    if count % 100 == 0:
        print "Before saving: ", len(data.keys())
        fp =  open("/Temp/Data/"+str(count/100)+".json","w")
        json.dump(data,fp,encoding="latin-1")
        fp.close()

        count += 1
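To check that the fix behaves as described, here is the same illustrative simulation as above with the extra increment added after the save (again, `simulate_fixed`, the fake `stream`, and the `saves` list are my stand-ins, not the real stream code):

```python
import json

def simulate_fixed(lines):
    data = {}
    saves = []   # stands in for the sizes of the files written to disk
    count = 1
    for line in lines:
        tweet = json.loads(line)
        if tweet['coordinates'] is not None:
            data[count] = tweet
            count += 1
        if count % 100 == 0:
            saves.append(len(data))
            data = {}
            count += 1   # the fix: leave the multiple of 100 immediately

    return saves

# 99 geotagged tweets followed by two tweets without coordinates
stream = ['{"coordinates": [0, 0]}'] * 99 + ['{"coordinates": null}'] * 2
print(simulate_fixed(stream))   # -> [99]
```

With the fix, the save branch cannot fire again until 100 more geotagged tweets arrive, so the file is written exactly once with the full dictionary.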