import fileinput
import json
import sys
import os
from collections import defaultdict
line = [] # set to list
tw = 0 # set total words to int
tuw = 0 # set total unique words to int
lexd = 0 # set total lexical diversity to int
awpt = 0 # set average words per tweet to int
line_counter = 0
inputfilename = sys.argv[1] # read the first system argument as the input file name
word_count = defaultdict(int) # set word_count to the default dictionary
for line in fileinput.input([inputfilename]): # FOR each line in the input file
line = line.strip(); # strip any blank lines and throw them out
if not line: continue # if the file does contain a blank line still: in the case of EOF then continue
tweettext = json.loads(line).get('text') # load the line with json.loads and get the "text" field
if not json.loads(line).get('text'): continue # if the line does not contain json data then continue
words = tweettext.split() # split the words from the single line into individual dicts
tw += len(words) # total words counter
line_counter += 1 # total lines counter
print line_counter # so we know what line we're on
for word in words: # FOR each word in the individual line "text" corpus
word_count[word]+=1 # Take the word_count dict, insert the words and incriment
tuw = len(set(word_count)) # calculate the total number of unique words
lexd += 1.0*tuw/tw # calculate the lexical diversity
awpt = 1.0*tuw/line_counter # calc average number of words per tweet
print word_count # print the word list dictionary
print "total number of words", tw # print the total number of words
print "total uniq words", tuw # print the total number of unique words
print "total corpus lexical diversity", lexd # print the total lexical diversity of the entire corpus
print "average number of words per tweet", awpt # print the average number of words per tweet
{"favorited": false, "in_reply_to_user_id": 213741147, "contributors": null, "truncated": false, "text": "@Rafinha_Angelo sim sim, manda o print l\u00e1 HUSAHUS!", "created_at": "Tue Feb 14 00:30:59 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": "169216950453542912", "coordinates": null, "in_reply_to_user_id_str": "213741147", "entities": {"user_mentions": [{"indices": [0, 15], "screen_name": "Rafinha_Angelo", "id": 213741147, "name": "Rafael A. Figueiredo", "id_str": "213741147"}], "hashtags": [], "urls": []}, "in_reply_to_status_id": 169216950453542912, "id_str": "169217034821976067", "in_reply_to_screen_name": "Rafinha_Angelo", "user": {"follow_request_sent": null, "profile_use_background_image": true, "profile_background_image_url_https": "https://si0.twimg.com/images/themes/theme9/bg.gif", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1769152407/223_normal.JPG", "profile_sidebar_fill_color": "252429", "is_translator": false, "id": 67115876, "profile_text_color": "666666", "followers_count": 310, "profile_sidebar_border_color": "181A1E", "location": "Somewhere.", "default_profile_image": false, "listed_count": 0, "utc_offset": -10800, "statuses_count": 6027, "description": "it's like one more day, with no more things !", "friends_count": 106, "profile_link_color": "2FC2EF", "profile_image_url": "http://a2.twimg.com/profile_images/1769152407/223_normal.JPG", "notifications": null, "show_all_inline_media": false, "geo_enabled": true, "profile_background_color": "1A1B1F", "id_str": "67115876", "profile_background_image_url": "http://a1.twimg.com/images/themes/theme9/bg.gif", "screen_name": "Guiii_Fernandes", "lang": "en", "profile_background_tile": false, "favourites_count": 112, "name": "Guilherme Fernandes", "url": "http://facebook.com/GuiiFernandes", "created_at": "Wed Aug 19 20:43:05 +0000 2009", "contributors_enabled": false, "time_zone": "Brasilia", "protected": false, "default_profile": false, "following": null}, "place": null, "retweet_count": 0, "geo": null, "id": 169217034821976067, "source": "web"}
{"favorited": false, "in_reply_to_user_id": null, "contributors": null, "retweeted_status": {"favorited": false, "in_reply_to_user_id": null, "contributors": null, "truncated": false, "text": "On the stage in Vegas for the last few days of rehearsal...this is epic! Going to be a huge show. I like! (said in Borat voice). Xoxo, JM", "created_at": "Mon Feb 13 23:27:08 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [], "hashtags": [], "urls": []}, "in_reply_to_status_id": null, "id_str": "169200965151494144", "place": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 69751644, "description": "", "verified": true, "profile_image_url_https": "https://si0.twimg.com/profile_images/387138234/1_normal.jpg", "profile_sidebar_fill_color": "5c5c5c", "is_translator": false, "geo_enabled": false, "profile_text_color": "333333", "followers_count": 473162, "profile_sidebar_border_color": "00e35f", "id_str": "69751644", "default_profile_image": false, "location": "Los Angeles", "utc_offset": -28800, "statuses_count": 5380, "profile_background_color": "00e35f", "friends_count": 10730, "profile_link_color": "05bcff", "profile_image_url": "http://a0.twimg.com/profile_images/387138234/1_normal.jpg", "notifications": null, "show_all_inline_media": false, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/72720138/green.jpg", "profile_background_image_url": "http://a0.twimg.com/profile_background_images/72720138/green.jpg", "screen_name": "jamesmaslow", "lang": "en", "profile_background_tile": false, "favourites_count": 1, "name": "james maslow", "url": "http://www.JamesMaslow.com", "created_at": "Sat Aug 29 01:32:02 +0000 2009", "contributors_enabled": false, "time_zone": "Pacific Time (US & Canada)", "protected": false, "default_profile": false, "following": null, "listed_count": 8348}, "in_reply_to_screen_name": null, "retweet_count": 465, "geo": null, "id": 169200965151494144, "source": "<a href=\"http://www.osfoora.com\" rel=\"nofollow\">Osfoora for iPhone</a>"}, "truncated": true, "text": "RT @jamesmaslow: On the stage in Vegas for the last few days of rehearsal...this is epic! Going to be a huge show. I like! (said in Bora ...", "created_at": "Tue Feb 14 00:30:59 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [{"indices": [3, 15], "id_str": "69751644", "id": 69751644, "name": "james maslow", "screen_name": "jamesmaslow"}], "hashtags": [], "urls": []}, "in_reply_to_status_id": null, "id_str": "169217034817765377", "place": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 466873377, "description": "Totally dedicate for @1LoganHenderson MINE perfect BTBoy!!!! *--* Rusher for the infinity and beyond and much more beyond!!! Since 01/17/12 =*", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1781262618/_20120126_023206_424_normal.gif", "profile_sidebar_fill_color": "940a2d", "is_translator": false, "geo_enabled": false, "profile_text_color": "eb4466", "followers_count": 103, "profile_sidebar_border_color": "d61153", "id_str": "466873377", "default_profile_image": false, "location": "", "utc_offset": -7200, "statuses_count": 3730, "profile_background_color": "070808", "friends_count": 154, "profile_link_color": "de243d", "profile_image_url": "http://a2.twimg.com/profile_images/1781262618/_20120126_023206_424_normal.gif", "notifications": null, "show_all_inline_media": false, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/421884815/tumblr_lz7316OE041rnvmm7o1_500.jpg", "profile_background_image_url": "http://a3.twimg.com/profile_background_images/421884815/tumblr_lz7316OE041rnvmm7o1_500.jpg", "screen_name": "Logiehbear", "lang": "en", "profile_background_tile": true, "favourites_count": 209, "name": "BBFFF da Laryh!!", "url": null, "created_at": "Tue Jan 17 21:53:17 +0000 2012", "contributors_enabled": false, "time_zone": "Mid-Atlantic", "protected": false, "default_profile": false, "following": null, "listed_count": 1}, "in_reply_to_screen_name": null, "retweet_count": 465, "geo": null, "id": 169217034817765377, "source": "web"}
defaultdict(<type 'int'>, {u'be': 1, u'is': 1, u'Going': 1, u'in': 2, u'I': 1, u'(said': 1, u'RT': 1, u'huge': 1, u'for': 1, u'l\xe1': 1, u'few': 1, u'Vegas': 1, u'manda': 1, u'print': 1, u'sim,': 1, u'sim': 1, u'On': 1, u'to': 1, u'like!': 1, u'HUSAHUS!': 1, u'rehearsal...this': 1, u'@jamesmaslow:': 1, u'...': 1, u'epic!': 1, u'stage': 1, u'a': 1, u'show.': 1, u'last': 1, u'of': 1, u'days': 1, u'o': 1, u'@Rafinha_Angelo': 1, u'the': 2, u'Bora': 1})
total number of words 36
total uniq words 34
total corpus lexical diversity 0.944444444444
average number of words per tweet 17.0
Traceback (most recent call last):
File "lex.py", line 21, in <module>
tweettext = json.loads(line).get('text') # load the line with json.loads and get the "text" field
File "/usr/lib64/python2.7/json/__init__.py", line 326, in loads
return _default_decoder.decode(s)
File "/usr/lib64/python2.7/json/decoder.py", line 366, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/usr/lib64/python2.7/json/decoder.py", line 382, in raw_decode
obj, end = self.scan_once(s, idx)
ValueError: Unterminated string starting at: line 1 column 1531 (char 1531)
我已经使用Jesse Harris的解决方案解决了这个问题,如果json.loads出错,可以包含异常。
import fileinput
import json
import sys
import os
from collections import defaultdict
line = []
tw = 0
tuw = 0
lexd = 0
awpt = 0
line_counter = 0
inputfilename = sys.argv[1]
word_count = defaultdict(int)
for line in fileinput.input([inputfilename]):
line = line.strip();
if not line: continue
tweettext = json.loads(line).get('text')
if not json.loads(line).get('text'): continue
words = tweettext.split()
tw += len(words)
line_counter += 1
print line_counter
for word in words:
print "Problem Line: " + line
tuw = len(set(word_count))
lexd += 1.0*tuw/tw
awpt = 1.0*tuw/line_counter
# print word_count
print "total number of words", tw
print "total uniq words", tuw
print "total corpus lexical diversity", lexd
print "average number of words per tweet", awpt