以下是我在代码之前使用的原始数据。 (我的代码拨打电话后,我从Twitter API获取此数据)
{"metadata":{"result_type":"recent","iso_language_code":"et"}
"created_at":"Tue Dec 03 01:41:53 +0000 2013","id":407686093790662656,"id_str":"407686093790662656","text":"@emblems123 justinbieberfan12599@gamil.com","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":407677310821613569,"in_reply_to_status_id_str":"407677310821613569","in_reply_to_user_id":2201997043,"in_reply_to_user_id_str":"2201997043","in_reply_to_screen_name":"emblems123","user":{"id":1220098345,"id_str":"1220098345","name":"PYD","screen_name":"bieberfan12599","location":
我运行以下代码:
import csv
import json
import oauth2 as oauth
import urllib
import sys
import requests
import time
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_KEY = ""
ACCESS_SECRET = ""
class TwitterSearch:
def __init__(self,
ckey = CONSUMER_KEY,
csecret = CONSUMER_SECRET,
akey = ACCESS_KEY,
asecret = ACCESS_SECRET,
query = 'https://api.twitter.com/1.1/search/tweets.{mode}?{query}'
):
consumer = oauth.Consumer(key=ckey, secret=csecret)
access_token = oauth.Token(key=akey, secret=asecret)
self.client = oauth.Client(consumer, access_token)
self.query = query
def search(self, q, mode='json', **queryargs):
queryargs['q'] = q
query = urllib.urlencode(queryargs)
return self.client.request(self.query.format(query=query, mode=mode))
def write_csv(fname, rows, header=None, append=False, **kwargs):
filemode = 'ab' if append else 'wb'
with open(fname, filemode) as outf:
out_csv = csv.writer(outf, **kwargs)
if header:
out_csv.writerow(header)
out_csv.writerows(rows)
def main():
ts = TwitterSearch()
response, data = ts.search('@gmail.com', result_type='recent')
js = json.loads(data)
search_terms = ['@gmail.com']
matches = []
for term in search_terms:
match = [word for word in js if term in word]
matches.append(match)
messages = ([msg['created_at'], msg['text'], msg['user']['id'], matches[0]] for msg in js.get('statuses', []))
write_csv('twitter_gmail.csv', messages, append=True)
if __name__ == '__main__':
main()
这是.csv中的输出:
Fri Dec 13 03:49:06 +0000 2013,I need some HARD TRAP beats producers help me out here...louiethefifthonline@gmail.com,490060971,[]
我的问题是我希望它只打印解析后的JS文本中的电子邮件地址。我尝试过split()但是我不能用表达式来做。似乎无论我做什么它总是空白“[]”
我真的想知道如何让它只打印'text'中的电子邮件地址作为行的一部分。
答案 0 :(得分:0)
假设您拥有字符串中的数据,那么您可以使用regex
提取电子邮件:
import re
string = "Fri Dec 13 03:49:06 +0000 2013,I need some HARD TRAP beats producers help me out here...louiethefifthonline@gmail.com,490060971,[]"
regex = "\w+@\w+\.com"
match = re.findall(regex,string)
print match
包含所有匹配项的输出,在这种情况下一个:
['louiethefifthonline@gmail.com']
即使您使用string
函数将dict
转换为字符串,也要将str()
替换为您获得的原始数据的字符串:
string = str({"metadata":{"result_type":"recent","iso_language_code":"et"},
"created_at":"Tue Dec 03 01:41:53 +0000 2013","id":407686093790662656,"id_str":"407686093790662656","text":"@emblems123 justinbieberfan12599@gamil.com","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":False,"in_reply_to_status_id":407677310821613569,"in_reply_to_status_id_str":"407677310821613569","in_reply_to_user_id":2201997043,"in_reply_to_user_id_str":"2201997043","in_reply_to_screen_name":"emblems123","user":{"id":1220098345,"id_str":"1220098345","name":"PYD","screen_name":"bieberfan12599","location":"NY"}})
您仍然可以获得预期的输出:
['justinbieberfan12599@gamil.com']