我的代码使用python 3.6.1

时间:2017-06-29 22:50:44

标签: python json twitter sentiment-analysis

以下是代码:

import json
import re

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
with open('mytweets.json',  mode='r', encoding='utf-8') as f:
    for line in f:
        #line = f.readline()
        tweet = json.loads(line) 
    print(preprocess(tweet['text'])) 

运行后显示问题: Getting the problem after running the codes

问题的解决方案是什么?如何从json格式成功读取数据和标记推文?

以下是mytweets.json的一些示例

{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007261674602496,"id_str":"878007261674602496","text":"RT @wreckitroy: Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try t\u2026 ","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":632645991,"id_str":"632645991","name":"meche","screen_name":"mercedessreyes","location":null,"url":null,"description":"I mean, really it's same me, it's old me \u2022 FSU '21 \u2022 https:\/\/vsco.co\/onlymeche","protected":false,"verified":false,"followers_count":1039,"friends_count":352,"listed_count":6,"favourites_count":21860,"statuses_count":21676,"created_at":"Wed Jul 11 04:06:28 +0000 2012","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FCEBB6","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/762423763\/6c7d56ca20260816f75c10759208b283.png","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/762423763\/6c7d56ca20260816f75c10759208b283.png","profile_background_tile":true,"profile_link_color":"CE7834","profile_sidebar_border_color":"F0A830","profile_sidebar_fill_color":"78C0A8","profile_text_color":"5E412F","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/876886584087502848\/9WSQDm8F_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/876886584087502848\/9WSQDm8F_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/632645991\/1497147929","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Jun 21 02:57:42 +0000 2017","id":877359845074018304,"id_str":"877359845074018304","text":"Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try t\u2026 https:\/\/t.co\/lUJzY60Sn8","display_text_range":[0,140],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2341390003,"id_str":"2341390003","name":"roy","screen_name":"wreckitroy","location":"Fresno, CA","url":null,"description":"She said I'm looking like a bad man, smooth criminal. \ud83c\udf43 \/ snapchat\/instagram: thericharrow","protected":false,"verified":false,"followers_count":4831,"friends_count":1103,"listed_count":23,"favourites_count":79829,"statuses_count":1012,"created_at":"Thu Feb 13 04:30:59 +0000 2014","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/876941549874978816\/eTGFmh8u_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/876941549874978816\/eTGFmh8u_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/2341390003\/1498157548","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877359034621468672,"quoted_status_id_str":"877359034621468672","quoted_status":{"created_at":"Wed Jun 21 02:54:29 +0000 2017","id":877359034621468672,"id_str":"877359034621468672","text":"When you trying so hard to getvout the friend zone\ud83d\ude02\ud83d\ude02 https:\/\/t.co\/i8yFNbGDNn","display_text_range":[0,52],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":844510650,"id_str":"844510650","name":"\u3164","screen_name":"DaddyGunPlay","location":null,"url":null,"description":"One of the best Contoller players dont @. Bo2 is surperior #JellyFam\ud83c\udf47","protected":false,"verified":false,"followers_count":325,"friends_count":276,"listed_count":3,"favourites_count":1795,"statuses_count":5009,"created_at":"Mon Sep 24 23:51:03 +0000 2012","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"3B94D9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/844510650\/1496174936","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877210813462740992,"quoted_status_id_str":"877210813462740992","is_quote_status":true,"retweet_count":45,"favorite_count":138,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/i8yFNbGDNn","expanded_url":"https:\/\/twitter.com\/wreckitroy\/status\/877210813462740992","display_url":"twitter.com\/wreckitroy\/sta\u2026","indices":[53,76]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"extended_tweet":{"full_text":"Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try to find the truth. \ud83d\ude09 https:\/\/t.co\/fv4Kqvv2sb","display_text_range":[0,134],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/fv4Kqvv2sb","expanded_url":"https:\/\/twitter.com\/daddygunplay\/status\/877359034621468672","display_url":"twitter.com\/daddygunplay\/s\u2026","indices":[135,158]}],"user_mentions":[],"symbols":[]}},"retweet_count":2496,"favorite_count":12594,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/lUJzY60Sn8","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/877359845074018304","display_url":"twitter.com\/i\/web\/status\/8\u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"quoted_status_id":877359034621468672,"quoted_status_id_str":"877359034621468672","quoted_status":{"created_at":"Wed Jun 21 02:54:29 +0000 2017","id":877359034621468672,"id_str":"877359034621468672","text":"When you trying so hard to getvout the friend zone\ud83d\ude02\ud83d\ude02 https:\/\/t.co\/i8yFNbGDNn","display_text_range":[0,52],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":844510650,"id_str":"844510650","name":"\u3164","screen_name":"DaddyGunPlay","location":null,"url":null,"description":"One of the best Contoller players dont @. Bo2 is surperior #JellyFam\ud83c\udf47","protected":false,"verified":false,"followers_count":325,"friends_count":276,"listed_count":3,"favourites_count":1795,"statuses_count":5009,"created_at":"Mon Sep 24 23:51:03 +0000 2012","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"3B94D9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/844510650\/1496174936","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877210813462740992,"quoted_status_id_str":"877210813462740992","is_quote_status":true,"retweet_count":45,"favorite_count":138,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/i8yFNbGDNn","expanded_url":"https:\/\/twitter.com\/wreckitroy\/status\/877210813462740992","display_url":"twitter.com\/wreckitroy\/sta\u2026","indices":[53,76]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"","expanded_url":null,"indices":[133,133]}],"user_mentions":[{"screen_name":"wreckitroy","name":"roy","id":2341390003,"id_str":"2341390003","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218426"}

{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007262320754692,"id_str":"878007262320754692","text":"It makes me feel some type of way now bree got another lil boy friend","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":47587983,"id_str":"47587983","name":"Kee Gotti","screen_name":"_BadGalKee","location":"Columbus, OH","url":null,"description":"\u2022 Instagram|_badgalkee \u2022 SnapChat| kbabiy","protected":false,"verified":false,"followers_count":1107,"friends_count":639,"listed_count":12,"favourites_count":1160,"statuses_count":28359,"created_at":"Tue Jun 16 09:46:12 +0000 2009","utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/850590447261167616\/MuywFrn8_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/850590447261167616\/MuywFrn8_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/47587983\/1487216863","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218580"}

{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007263310393344,"id_str":"878007263310393344","text":"I liked a @YouTube video https:\/\/t.co\/Znu4govqDi My Friend is in LOVE ...","source":"\u003ca href=\"http:\/\/www.google.com\/\" rel=\"nofollow\"\u003eGoogle\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":42287518,"id_str":"42287518","name":"David","screen_name":"iceman120","location":"FT LAUDERDALE, FL","url":"http:\/\/www.youtube.com\/iceman120dl","description":"\ue10e\ue10eOH YOU WANT SOME OF THIS\ue12f\ue12f\ue12f\ue12f\ue10e\ue10e","protected":false,"verified":false,"followers_count":4667,"friends_count":361,"listed_count":69,"favourites_count":134,"statuses_count":69716,"created_at":"Sun May 24 21:43:04 +0000 2009","utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/53704022\/ahamericanflag72.br.jpg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/53704022\/ahamericanflag72.br.jpg","profile_background_tile":false,"profile_link_color":"D60000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"1C1939","profile_text_color":"777777","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/511261204120363008\/DuNoXOXB_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/511261204120363008\/DuNoXOXB_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/42287518\/1375147278","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/Znu4govqDi","expanded_url":"http:\/\/youtu.be\/up6u1hzWHHc?a","display_url":"youtu.be\/up6u1hzWHHc?a","indices":[25,48]}],"user_mentions":[{"screen_name":"YouTube","name":"YouTube","id":10228272,"id_str":"10228272","indices":[10,18]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218816"}

1 个答案:

答案 0 :(得分:0)

您已发布样本,据我所知,您只需要跳过空行。

OLD ANSWER BELWAN

你应该用这种方式解析json:

...
with open('mytweets.json', mode='r', encoding='utf-8') as f:
    tweet = json.load(f)
    ...

json.load()接受file-like object作为第一个参数。

您目前要做的是逐行读取文件并尝试将每行解析为单独的JSON字符串,并且文件似乎已格式化,因此您在任何行中都没有完整的json。

你可能希望迭代你文件中的推文列表(如果我的猜测是正确的),而不是文本行并在循环中调用print(preprocess())