这是我的代码。
数据的形状:
data_dict.items()
Out[57]:
[('Sympathetic', defaultdict(<type 'int'>, {'2011-10-06': 1})),
('protest', defaultdict(<type 'int'>, {'2011-10-06': 16})),
('occupycanada', defaultdict(<type 'int'>, {'2011-10-06': 1})),
('hating', defaultdict(<type 'int'>, {'2011-10-06': 1})),
('AND', defaultdict(<type 'int'>, {'2011-10-06': 4})),
('c', defaultdict(<type 'int'>, {'2011-10-06': 2})),
...]
data_dict被定义为
data_dict = defaultdict(lambda: defaultdict(int))
我想构建一个数据帧,如下所示:
columns = ['word','date',"number"]
word date number
"Sympathetic" '2011-10-06' 1
"protest" '2011-10-06' 16
'occupycanada' '2011-10-06' 1
'hating' '2011-10-06' 1
'AND' '2011-10-06' 4
'c' '2011-10-06' 2
...
我尝试用 pandas 这样实现:
import pandas as pd
# Walk every (word, date) pair of the nested data_dict and try to show it as a
# one-row DataFrame with the columns word / date / number.
for d in data_dict:
    for date in data_dict[d]:
        # `data` is a flat list of three scalars: [word, date, count].
        data=[d,date,data_dict[d][date]]
        # NOTE(review): this is the statement the ValueError is reported on.
        # The flat list is apparently not read as a single 3-column row, so
        # its shape does not match the three column labels. `dat` is also
        # rebound on every iteration, so it never accumulates rows.
        dat = pd.DataFrame(data, columns = ['word','date',"number"])
        print dat
但是当我运行此代码时,出现以下错误:
ValueError Traceback (most recent call last)
<ipython-input-56-80b3affa34fe> in <module>()
3 for date in data_dict[d]:
4 data=[d,date,data_dict[d][date]]
----> 5 dat = pd.DataFrame(data, columns = ['word','date',"number"])
6 print dat
....
ValueError: Shape of passed values is (1, 3), indices imply (3, 3)
我该如何解决?
有关data_dict的其他代码:
from collections import defaultdict
import csv
import re
import sys
def flushPrint(s):
    """Write ``s`` at the start of the current console line and flush.

    A carriage return is emitted first, followed by the ``%s`` rendering of
    ``s``; standard output is then flushed so the text shows up immediately.

    Args:
        s: Any object; it is rendered with ``%s`` string formatting.
    """
    # Wrap ``s`` in a one-element tuple: a bare tuple on the right-hand side
    # of ``%`` is unpacked as several format arguments and raises TypeError.
    sys.stdout.write('\r%s' % (s,))
    sys.stdout.flush()
# Nested counter: data_dict[word][date] -> number of occurrences of `word`
# in tweets whose Day column equals `date`.
data_dict = defaultdict(lambda: defaultdict(int))
error_num = 0  # rows that could not be processed
line_num = 0   # CSV records parsed so far
total_num = 0  # physical lines read from the file
chunkSize = 10000000  # size hint handed to readlines() for each chunk

# Compiled once, outside the per-row loop: the first pattern strips URLs and
# @mentions from the tweet text, the second splits the rest into words.
url_or_mention = re.compile(
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+|(?:@[\w_]+)')
word_sep = re.compile(r"\W+")

# NOTE: as in the original, the runtime semantics target Python 2 (byte
# strings from an 'rb' file are fed to csv.reader); only the syntax was made
# version-neutral. The `with` block guarantees the file gets closed.
with open('D:/Data/ows/ows_sample.txt', 'rb') as bigfile:
    chunk = bigfile.readlines(chunkSize)
    while chunk:
        total_num += len(chunk)
        # NUL characters are removed before parsing (presumably because
        # csv.reader rejects them -- confirm against the csv docs).
        lines = csv.reader((line.replace('\x00', '') for line in chunk),
                           delimiter=',', quotechar='"')
        for row in lines:
            line_num += 1
            if line_num % 1000000 == 0:
                flushPrint(line_num)
            try:
                text = url_or_mention.sub("", row[1])
                date = row[3]
            except IndexError as e:
                # Rows with too few columns are reported, counted and skipped.
                print(e)
                error_num += 1
                continue
            # Only 10-character Day values (e.g. '2011-10-06') are counted.
            # The check does not depend on the word, so it is done once per
            # row instead of once per token.
            if len(date) != 10:
                continue
            # NOTE(review): re.split() yields '' when the text starts or ends
            # with a non-word character; those empty tokens are still counted,
            # exactly as before. Add `if word` here to drop them.
            for word in word_sep.split(text):
                data_dict[word][date] += 1
        chunk = bigfile.readlines(chunkSize)
print('%s %s %s' % (line_num, total_num, error_num))
示例数据
['"Twitter ID",Text,"Profile Image URL",Day,Hour,Minute,"Created At",Geo,"From User","From User ID",Language,"To User","To User ID",Source\n',
'121813144174727168,"RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE RT !!HELP!!!!",,2011-10-06,5,4,"2011-10-06 05:04:51",N;,Anonops_Cop,401240477,en,,0,"<a href=">web</a>"\n',
'121813146137657344,"@jamiekilstein @allisonkilkenny Interesting interview (never aired, wonder why??) by Fox with #ows protester 2011-10-06,5,4,"2011-10-06 05:04:51",N;,KittyHybrid,34532053,en,jamiekilstein,2149053,"<a href=">web</a>"\n',
'121813150000619521,"@Seductivpancake Right! Those guys have a victory condition: regime change. #ows doesn\'t seem to have a goal I can figure out.",2011-10-06,5,4,"2011-10-06 05:04:52",N;,nerdsherpa,95067344,en,Seductivpancake,19695580,"<a href="nofollow">Echofon</a>"\n',
'121813150701072385,"RT @bembel "Occupy Wall Street" als linke Antwort auf die Tea Party? #OccupyWallStreet #OWS",2011-10-06,5,4,"2011-10-06 05:04:52",N;,hamudistan,35862923,en,,0,"<a href="rel="nofollow">Plume\xc2\xa0\xc2\xa0</a>"\n',
'121813163778899968,"#ows White shirt= Brown shirt.",2011-10-06,5,4,"2011-10-06 05:04:56",N;,kl_knox,419580636,en,,0,"<a href=">web</a>"\n',
'121813169999065088,"RT @TheNewDeal: The #NYPD are Out of Control. Is This a Free Country or a Middle-East Dictatorship? #OccupyWallStreet #OWS #p2",2011-10-06,5,4,"2011-10-06 05:04:57",N;,vickycrampton,32151083,en,,0,"<a href=">web</a>"\n',
答案 0 :(得分:1)
我会这样做:
# -*- coding: utf-8 -*-
from collections import defaultdict, Counter
import string
import pandas as pd
# prepare translate table, which replaces every punctuation character and
# digit with a space
chars2remove = list(string.punctuation + string.digits)
transl_tab = str.maketrans({char: ' ' for char in chars2remove})
# replace 'new line' and 'carriage return' characters with spaces
transl_tab[ord('\n')] = ' '
transl_tab[ord('\r')] = ' '


def tokenize(s):
    """Split ``s`` into lower-case words.

    Punctuation, digits, carriage returns and new lines are turned into
    spaces via ``transl_tab``; the text is then lower-cased and split on
    whitespace.

    Args:
        s: The text to tokenize. Anything that is not a ``str`` (for example
            the NaN that pandas uses for an empty CSV cell, or ``None``) is
            treated as empty text.

    Returns:
        A list of lower-case word strings; empty when there are no words.
    """
    # Missing values are not strings and have no .translate(); return no
    # tokens instead of raising AttributeError.
    if not isinstance(s, str):
        return []
    return s.translate(transl_tab).lower().split()
chunksize = 100
fn = r'D:\temp\.data\ows-sample.txt'

#
# read `Day` and `Text` columns from the source CSV file
#

# not-chunked version
#df = pd.read_csv(fn, usecols=['Text','Day'])

# "chunked" version - will prepare a list of "reduced" DFs,
# containing word counts in the form: "{'we': 1, 'stand': 1, 'and': 1}"
dfs = []
for df in pd.read_csv(fn, usecols=['Text', 'Day'], chunksize=chunksize):
    # empty Text cells are read as NaN; turn them into '' so that tokenize()
    # always receives a string
    word_counts = df['Text'].fillna('').apply(lambda x: Counter(tokenize(x)))
    # group DF by date and count words for each unique day, summing up counters
    dfs.append(df.assign(count=word_counts)
                 .groupby('Day', as_index=False)['count'].sum()
               )

# The same Day can occur in several chunks, so the partial counters have to be
# summed per day once more; otherwise the result would contain several rows
# for one (Day, word) pair.
# Then convert the {'word1': count, 'word2': count} counters into columns.
tmp = (pd.concat(dfs, ignore_index=True)
         .groupby('Day')['count'].sum()
         .apply(pd.Series)
         .reset_index()
       )
tmp['Day'] = pd.to_datetime(tmp['Day'])

# free up memory
del dfs

# transform (melt) columns into desired columns: [Day, word, number];
# words that do not occur on a given day get the count 0
rslt = (pd.melt(tmp, id_vars='Day', var_name='word', value_name='number')
          .fillna(0)
        )

# delete temporary DF from memory
del tmp

# save results as HDF5 file
rslt.to_hdf('d:/temp/.data/twit_words.h5', 'twit_words', mode='a',
            format='t', complib='zlib', complevel=4)

# save results as CSV file
rslt.to_csv('d:/temp/.data/twit_words.csv.gz', index=False,
            encoding='utf_8', compression='gzip')
针对该样本数据进行测试:
In [254]: pd.melt(new, id_vars='Day', var_name='word', value_name='number').fillna(0)
Out[254]:
Day word number
0 2011-11-13 a 4.0
1 2011-11-14 a 9.0
2 2011-11-15 a 92.0
3 2011-11-16 a 111.0
4 2011-11-17 a 93.0
5 2011-11-18 a 141.0
6 2011-11-19 a 77.0
7 2011-11-20 a 58.0
8 2011-11-21 a 29.0
9 2011-11-22 a 70.0
10 2011-11-23 a 55.0
11 2011-11-24 a 49.0
12 2011-11-25 a 41.0
13 2011-11-26 a 67.0
14 2011-11-27 a 27.0
15 2011-11-28 a 34.0
16 2011-11-29 a 23.0
17 2011-11-30 a 33.0
18 2011-12-01 a 26.0
19 2011-12-02 a 32.0
20 2011-12-03 a 46.0
21 2011-12-04 a 29.0
22 2011-12-05 a 22.0
23 2011-12-06 a 60.0
24 2011-12-07 a 32.0
25 2011-12-08 a 33.0
26 2011-12-09 a 16.0
27 2011-11-13 aa 0.0
28 2011-11-14 aa 0.0
29 2011-11-15 aa 0.0
... ... ... ...
答案 1 :(得分:1)
您可以在原始代码的末尾添加几行,直接把这个字典转换成数据帧,做法很简单:
# Flatten the nested {word: {date: count}} mapping into one row per
# (word, date) pair. Taking only the first key/value of each inner dict would
# drop the counts of every word that occurs on more than one date.
rows = [(word, date, number)
        for word, per_date in data_dict.items()
        for date, number in per_date.items()]
df = pd.DataFrame(rows, columns=['word', 'date', 'number'])
完整的程序如下:
from collections import defaultdict
import csv
import re
import sys
import pandas as pd
def flushPrint(s):
    """Write ``s`` at the start of the current console line and flush.

    A carriage return is emitted first, followed by the ``%s`` rendering of
    ``s``; standard output is then flushed so the text shows up immediately.

    Args:
        s: Any object; it is rendered with ``%s`` string formatting.
    """
    # Wrap ``s`` in a one-element tuple: a bare tuple on the right-hand side
    # of ``%`` is unpacked as several format arguments and raises TypeError.
    sys.stdout.write('\r%s' % (s,))
    sys.stdout.flush()
# Nested counter: data_dict[word][date] -> number of occurrences of `word`
# in tweets whose Day column equals `date`.
data_dict = defaultdict(lambda: defaultdict(int))
error_num = 0  # rows that could not be processed
line_num = 0   # CSV records parsed so far
total_num = 0  # physical lines read from the file
chunkSize = 10000000  # size hint handed to readlines() for each chunk

# Compiled once, outside the per-row loop: the first pattern strips URLs and
# @mentions from the tweet text, the second splits the rest into words.
url_or_mention = re.compile(
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+|(?:@[\w_]+)')
word_sep = re.compile(r"\W+")

# NOTE: as in the original, the runtime semantics target Python 2 (byte
# strings from an 'rb' file are fed to csv.reader); only the syntax was made
# version-neutral. The `with` block guarantees the file gets closed.
with open('data.txt', 'rb') as bigfile:
    chunk = bigfile.readlines(chunkSize)
    while chunk:
        total_num += len(chunk)
        # NUL characters are removed before parsing (presumably because
        # csv.reader rejects them -- confirm against the csv docs).
        lines = csv.reader((line.replace('\x00', '') for line in chunk),
                           delimiter=',', quotechar='"')
        for row in lines:
            line_num += 1
            if line_num % 1000000 == 0:
                flushPrint(line_num)
            try:
                text = url_or_mention.sub("", row[1])
                date = row[3]
            except IndexError as e:
                # Rows with too few columns are reported, counted and skipped.
                print(e)
                error_num += 1
                continue
            # Only 10-character Day values (e.g. '2011-10-06') are counted.
            # The check does not depend on the word, so it is done once per
            # row instead of once per token.
            if len(date) != 10:
                continue
            # NOTE(review): re.split() yields '' when the text starts or ends
            # with a non-word character; those empty tokens are still counted,
            # exactly as before. Add `if word` here to drop them.
            for word in word_sep.split(text):
                data_dict[word][date] += 1
        chunk = bigfile.readlines(chunkSize)
print('%s %s %s' % (line_num, total_num, error_num))

# Flatten the nested {word: {date: count}} mapping into one row per
# (word, date) pair. Taking only the first key/value of each inner dict would
# drop the counts of every word that occurs on more than one date.
rows = [(word, date, number)
        for word, per_date in data_dict.items()
        for date, number in per_date.items()]
df = pd.DataFrame(rows, columns=['word', 'date', 'number'])
print(df)
结果:
word date number
0 RT 2011-10-06 2
1 HELICOPTERS 2011-10-06 1
2 HELP 2011-10-06 1
3 2011-10-06 1
4 KETTLING 2011-10-06 1
5 OWS 2011-10-06 1
6 OCCUPYWALLSTREET 2011-10-06 1
7 PARK 2011-10-06 1
8 PROTESTERS 2011-10-06 1
9 ALERT 2011-10-06 1
10 OCCUPYNY 2011-10-06 1
11 COPS 2011-10-06 1
12 ARE 2011-10-06 1
13 W 2011-10-06 1
14 IN 2011-10-06 1
15 PLEASE 2011-10-06 1
16 PADDYWAGONS 2011-10-06 1
17 AND 2011-10-06 1