我使用下面显示的部分代码从 Twitter 获取推文,并将它们存储在 "backup.txt" 中。我还创建了一个文件 "tweets3.csv",用来保存每条推文的一些特定字段。但我发现有些推文的文本完全相同(重复)。我怎样才能从 CSV 文件中删除这些重复的推文?
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
import json
import csv
# Twitter API credentials, left blank in this listing.
# ckey/csecret are handed to OAuthHandler(); atoken/asecret to set_access_token().
ckey, csecret = '', ''
atoken, asecret = '', ''
class listener(StreamListener):
    """Stream listener that archives each message and logs tweet fields to CSV.

    Every decoded message is appended to ``backup.txt``; nine selected fields
    of the message are appended as one row to ``tweets3.csv``.
    """

    def on_data(self, data):
        """Process one raw JSON message received from the stream.

        Args:
            data: JSON text of a single stream message.

        Returns:
            True when the message was handled (including the case where the
            CSV row could not be encoded). None, after a 3 second pause, when
            processing raised an exception (for example a message without a
            "text" key).
        """
        try:
            all_data = json.loads(data)

            # Raw archive of everything received; the `with` block closes the
            # file, so no explicit close() is needed.
            with open("backup.txt", 'a') as backup:
                backup.write(str(all_data) + "\n")

            user = all_data["user"]
            fields = [
                all_data["text"],
                all_data["id"],
                all_data["timestamp_ms"],
                user["screen_name"],
                user["id"],
                all_data["created_at"],
                user["followers_count"],
                user["following"],
                user["statuses_count"],
            ]
            # Format as unicode *before* encoding: in Python 2, str() on a
            # unicode value containing non-ASCII characters raises
            # UnicodeEncodeError, which used to discard such tweets.
            contentlist = [
                u"{0}".format(value).encode("utf-8") for value in fields
            ]
            print(contentlist)

            # Binary append mode is what the Python 2 csv module expects; the
            # `with` block guarantees the handle is closed after every row
            # instead of leaking one open file per message.
            with open("tweets3.csv", 'ab') as f:
                wrt = csv.writer(f, dialect='excel')
                try:
                    wrt.writerow(contentlist)
                except UnicodeEncodeError:
                    # Best effort: skip a row that cannot be written.
                    return True
            return True
        except Exception as e:
            # Exception (not BaseException) so that Ctrl-C / SystemExit can
            # still stop the program.
            print('failed on data %s %s' % (type(e), str(e)))
            time.sleep(3)

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print("Error status:" + str(status))
# Build the OAuth handler from the ckey/csecret pair, then attach the
# atoken/asecret pair to it.
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
# Create the stream with a fresh listener instance as its callback object.
twitterStream = Stream(auth, listener())
# Request messages tracking the keyword "zikavirus" with languages=['en'].
# NOTE(review): presumably this call blocks while the stream is running --
# confirm against the tweepy version in use.
twitterStream.filter(track=["zikavirus"], languages=['en'])
答案 0 :(得分:2)
我写了一段使用列表的代码:每处理一条推文时都会检查该列表,如果推文的文本不在列表中,就把它添加到列表里。
# Defines a list - It stores all unique tweets
tweetChecklist = []

# All your tweets. I represent them as a list to test the code
AllTweets = ["Hello", "HelloFoo", "HelloBar", "Hello", "hello", "Bye"]

# Goes over all "tweets"
for current_tweet in AllTweets:
    # If tweet doesn't exist in the list
    if current_tweet not in tweetChecklist:
        tweetChecklist.append(current_tweet)
        # Do what you want with this tweet, it won't appear two times...

# Prints ['Hello', 'HelloFoo', 'HelloBar', 'hello', 'Bye']
# Note that the second Hello doesn't show up - It's what you want
# However, it's case sensitive.
# (The list printed here must be tweetChecklist; the previously used name
# was never defined and raised NameError.)
print(tweetChecklist)

# Clear the list
tweetChecklist = []
我认为应用我的解决方案之后,您的代码应该如下所示:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
import json
import csv
# Texts of the tweets seen so far; used to recognise duplicates.
# Empty this list once all tweets have been fetched.
tweetChecklist = list()

# Twitter API credentials, left blank in this listing.
# ckey/csecret are handed to OAuthHandler(); atoken/asecret to set_access_token().
ckey, csecret = '', ''
atoken, asecret = '', ''
class listener(StreamListener):
    """Stream listener that archives each message and logs unique tweets to CSV.

    Every decoded message is appended to ``backup.txt``. A row of nine
    selected fields is appended to ``tweets3.csv`` only when the tweet text
    has not been seen before; seen texts are remembered in the module-level
    ``tweetChecklist`` list.
    """

    def on_data(self, data):
        """Process one raw JSON message received from the stream.

        Args:
            data: JSON text of a single stream message.

        Returns:
            True when the message was handled (written, skipped as a
            duplicate, or skipped because the CSV row could not be encoded).
            None, after a 3 second pause, when processing raised an exception
            (for example a message without a "text" key).
        """
        try:
            all_data = json.loads(data)

            # Raw archive of everything received, duplicates included; the
            # `with` block closes the file, so no explicit close() is needed.
            with open("backup.txt", 'a') as backup:
                backup.write(str(all_data) + "\n")

            user = all_data["user"]
            fields = [
                all_data["text"],
                all_data["id"],
                all_data["timestamp_ms"],
                user["screen_name"],
                user["id"],
                all_data["created_at"],
                user["followers_count"],
                user["following"],
                user["statuses_count"],
            ]
            # Format as unicode *before* encoding: in Python 2, str() on a
            # unicode value containing non-ASCII characters raises
            # UnicodeEncodeError, which used to discard such tweets.
            contentlist = [
                u"{0}".format(value).encode("utf-8") for value in fields
            ]
            text = contentlist[0]

            # A tweet whose text was already stored is a duplicate: skip it.
            if text in tweetChecklist:
                return True
            # Remember the text itself so later tweets with the same text are
            # filtered out. (Appending an undefined name here used to raise
            # NameError for every tweet, so nothing reached the CSV.)
            tweetChecklist.append(text)

            # Now, do the work reserved for unique tweets.
            print(contentlist)
            # Binary append mode is what the Python 2 csv module expects; the
            # `with` block guarantees the handle is closed after every row
            # instead of leaking one open file per message.
            with open("tweets3.csv", 'ab') as f:
                wrt = csv.writer(f, dialect='excel')
                try:
                    wrt.writerow(contentlist)
                except UnicodeEncodeError:
                    # Best effort: skip a row that cannot be written.
                    return True
            return True
        except Exception as e:
            # Exception (not BaseException) so that Ctrl-C / SystemExit can
            # still stop the program.
            print('failed on data %s %s' % (type(e), str(e)))
            time.sleep(3)

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print("Error status:" + str(status))
# Build the OAuth handler from the ckey/csecret pair, then attach the
# atoken/asecret pair to it.
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
# Create the stream with a fresh listener instance as its callback object.
twitterStream = Stream(auth, listener())
# Request messages tracking the keyword "zikavirus" with languages=['en'].
# NOTE(review): presumably this call blocks while the stream is running --
# confirm against the tweepy version in use.
twitterStream.filter(track=["zikavirus"], languages=['en'])