# pandas DataFrame ValueError: Shape of passed values is (1, 3), indices imply (3, 3)(传递值的形状为 (1, 3),而索引要求的形状为 (3, 3))

时间:2016-05-11 03:54:37

标签: python pandas dataframe

这是我的代码。

数据的形状:

data_dict.items()
Out[57]:
[('Sympathetic', defaultdict(<type 'int'>, {'2011-10-06': 1})),
 ('protest', defaultdict(<type 'int'>, {'2011-10-06': 16})),
 ('occupycanada', defaultdict(<type 'int'>, {'2011-10-06': 1})),
 ('hating', defaultdict(<type 'int'>, {'2011-10-06': 1})),
 ('AND', defaultdict(<type 'int'>, {'2011-10-06': 4})),
 ('c', defaultdict(<type 'int'>, {'2011-10-06': 2})),
 ...]

data_dict被定义为

data_dict = defaultdict(lambda: defaultdict(int)) 

我想构建一个数据帧,如下所示:

columns = ['word','date',"number"]

word date number
"Sympathetic" '2011-10-06' 1
"protest" '2011-10-06' 16
'occupycanada' '2011-10-06' 1
'hating' '2011-10-06' 1
'AND' '2011-10-06' 4
'comunity' '2011-10-06' 2 
...

我尝试过这样做,使用pandas:

import pandas as pd
for d in data_dict:
    for date in data_dict[d]:
        data=[d,date,data_dict[d][date]]  # flat list of three scalars, not a list of rows
        dat = pd.DataFrame(data, columns = ['word','date',"number"])  # fails: pandas reads a flat list as one column (3 rows x 1 column) while 3 column names are given; `dat` is also rebuilt on every iteration
        print dat

但是当我运行这段代码时,出现了以下错误:

ValueError                                Traceback (most recent call last)
<ipython-input-56-80b3affa34fe> in <module>()
      3     for date in data_dict[d]:
      4         data=[d,date,data_dict[d][date]]
----> 5         dat = pd.DataFrame(data, columns = ['word','date',"number"])
      6         print dat
....
ValueError: Shape of passed values is (1, 3), indices imply (3, 3)

我该如何解决?

有关data_dict的其他代码:

from collections import defaultdict
import csv
import re
import sys
def flushPrint(s):
    """Overwrite the current console line with ``s``.

    Writes a carriage return followed by ``s`` (formatted with ``%s``) to
    standard output and flushes immediately so the text is shown right away.
    """
    out = sys.stdout
    out.write('\r')
    out.write('%s' % s)
    out.flush()

# Nested counter: data_dict[word][date] -> number of occurrences.
data_dict = defaultdict(lambda: defaultdict(int)) 
# Counters printed at the end: failed rows, parsed CSV rows, raw lines read.
error_num = 0
line_num = 0
total_num = 0

# NOTE(review): the file handle is never closed in this snippet.
bigfile = open('D:/Data/ows/ows_sample.txt', 'rb')
# Size hint for readlines(): each call returns whole lines totalling roughly
# this many bytes, so the file is processed chunk by chunk.
chunkSize = 10000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    # Strip NUL bytes from every line before handing the chunk to the CSV parser.
    lines = csv.reader((line.replace('\x00','') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num+=1
        # Progress indicator every million parsed rows.
        if line_num%1000000==0:
            flushPrint(line_num)
        try:
            # Remove URLs and @mentions from the second field
            # (the "Text" column in the sample data shown in the question).
            i[1]= re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+|(?:@[\w_]+)', "", i[1])
            # Split on runs of non-word characters; re.split can yield empty
            # strings at the edges, and those are counted like any other token.
            tweets=re.split(r"\W+",i[1])
            # Fourth field (the "Day" column in the sample data).
            date=i[3]
            for word in tweets: # error
                # Only count when the date field is exactly 10 characters long
                # (presumably YYYY-MM-DD -- confirm against the data).
                if len(date)==10:
                    data_dict[word][date] += 1
        except Exception, e:
            # Python 2 syntax. Any failure on a row (e.g. a row with too few
            # fields) is printed and counted, then processing continues.
            print e
            error_num+=1
            pass
    chunk = bigfile.readlines(chunkSize) 
print line_num, total_num,error_num

示例数据

['"Twitter ID",Text,"Profile Image URL",Day,Hour,Minute,"Created At",Geo,"From User","From User ID",Language,"To User","To User ID",Source\n',

 '121813144174727168,"RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE RT !!HELP!!!!",,2011-10-06,5,4,"2011-10-06 05:04:51",N;,Anonops_Cop,401240477,en,,0,"&lt;a href=&quot;&gt;web&lt;/a&gt;"\n',

 '121813146137657344,"@jamiekilstein @allisonkilkenny Interesting interview (never aired, wonder why??) by Fox with #ows protester 2011-10-06,5,4,"2011-10-06 05:04:51",N;,KittyHybrid,34532053,en,jamiekilstein,2149053,"&lt;a href=&quot;&gt;web&lt;/a&gt;"\n',

 '121813150000619521,"@Seductivpancake Right! Those guys have a victory condition: regime change. #ows doesn\'t seem to have a goal I can figure out.",2011-10-06,5,4,"2011-10-06 05:04:52",N;,nerdsherpa,95067344,en,Seductivpancake,19695580,"&lt;a href=&quot;nofollow&quot;&gt;Echofon&lt;/a&gt;"\n',

 '121813150701072385,"RT @bembel &quot;Occupy Wall Street&quot; als linke Antwort auf die Tea Party? #OccupyWallStreet #OWS",2011-10-06,5,4,"2011-10-06 05:04:52",N;,hamudistan,35862923,en,,0,"&lt;a href=&quot;rel=&quot;nofollow&quot;&gt;Plume\xc2\xa0\xc2\xa0&lt;/a&gt;"\n',

 '121813163778899968,"#ows White shirt= Brown shirt.",2011-10-06,5,4,"2011-10-06 05:04:56",N;,kl_knox,419580636,en,,0,"&lt;a href=&quot;&gt;web&lt;/a&gt;"\n',

 '121813169999065088,"RT @TheNewDeal: The #NYPD are Out of Control. Is This a Free Country or a Middle-East Dictatorship? #OccupyWallStreet #OWS #p2",2011-10-06,5,4,"2011-10-06 05:04:57",N;,vickycrampton,32151083,en,,0,"&lt;a href=&quot;&gt;web&lt;/a&gt;"\n',

2 个答案:

答案 0 :(得分:1)

我会这样做:

# -*- coding: utf-8 -*-
from collections import defaultdict, Counter
import string
import pandas as pd

# Translation table that turns every punctuation mark and digit into a space.
chars2remove = list(string.punctuation + string.digits)
transl_tab = str.maketrans(dict.fromkeys(chars2remove, ' '))
# Line feed (10) and carriage return (13) are mapped to spaces as well.
transl_tab.update({ord('\n'): ' ', ord('\r'): ' '})


def tokenize(s):
    """Return the lowercase, whitespace-separated tokens of ``s``.

    Punctuation, digits and line breaks are replaced by spaces (via
    ``transl_tab``) before the text is lowercased and split.
    """
    spaced = s.translate(transl_tab)
    return spaced.lower().split()

# number of CSV rows read per chunk
chunksize = 100
fn = r'D:\temp\.data\ows-sample.txt'

#
# read `Day` and `Text` columns from the source CSV file
#

# not-chunked version
#df = pd.read_csv(fn, usecols=['Text','Day'])

# "chunked" version - will prepare a list of "reduced" DFs,
# containing word counts in the form: "{'we': 1, 'stand': 1, 'and': 1}"
dfs = []
for df in pd.read_csv(fn, usecols=['Text','Day'], chunksize=chunksize):
    # group DF by date and count words for each unique day, summing up counters;
    # empty tweets are read as NaN, so replace them with '' before tokenizing
    dfs.append(df.assign(count=df['Text']
                                 .fillna('')
                                 .apply(lambda x: Counter(tokenize(x))))
                 .groupby('Day', as_index=False)['count'].sum()
    )

# convert sets of {'word1': count, 'word2': count} into columns
tmp = (pd.concat(dfs, ignore_index=True)
         .set_index('Day')['count']
         .apply(pd.Series)
         .reset_index()
         # a day whose rows span several chunks shows up once per chunk:
         # add the partial counts together so that each day is a single row
         .groupby('Day', as_index=False).sum()
)
tmp['Day'] = pd.to_datetime(tmp['Day'])

# free up memory
del dfs

# transform (melt) columns into desired columns: [Day, word, number]
rslt = (pd.melt(tmp, id_vars='Day', var_name='word', value_name='number')
          .fillna(0)
)

# delete temporary DF from memory
del tmp

# save results as HDF5 file (`key` is passed by keyword: positional use is
# deprecated in recent pandas versions)
rslt.to_hdf('d:/temp/.data/twit_words.h5', key='twit_words', mode='a',
            format='t', complib='zlib', complevel=4)

# save results as CSV file
rslt.to_csv('d:/temp/.data/twit_words.csv.gz', index=False,
            encoding='utf_8', compression='gzip')

使用这份样本数据进行测试:

In [254]: pd.melt(new, id_vars='Day', var_name='word', value_name='number').fillna(0)
Out[254]:
               Day            word  number
0       2011-11-13               a     4.0
1       2011-11-14               a     9.0
2       2011-11-15               a    92.0
3       2011-11-16               a   111.0
4       2011-11-17               a    93.0
5       2011-11-18               a   141.0
6       2011-11-19               a    77.0
7       2011-11-20               a    58.0
8       2011-11-21               a    29.0
9       2011-11-22               a    70.0
10      2011-11-23               a    55.0
11      2011-11-24               a    49.0
12      2011-11-25               a    41.0
13      2011-11-26               a    67.0
14      2011-11-27               a    27.0
15      2011-11-28               a    34.0
16      2011-11-29               a    23.0
17      2011-11-30               a    33.0
18      2011-12-01               a    26.0
19      2011-12-02               a    32.0
20      2011-12-03               a    46.0
21      2011-12-04               a    29.0
22      2011-12-05               a    22.0
23      2011-12-06               a    60.0
24      2011-12-07               a    32.0
25      2011-12-08               a    33.0
26      2011-12-09               a    16.0
27      2011-11-13              aa     0.0
28      2011-11-14              aa     0.0
29      2011-11-15              aa     0.0
...            ...             ...     ...

答案 1 :(得分:1)

您可以在原始代码的基础上添加几行,直接利用已有的字典来构建 DataFrame,做法很简单:

# Flatten the nested dict into one row per (word, date) pair.
# Picking only the first key/value of each inner dict would silently drop every
# additional date recorded for a word (and indexing dict views fails on Python 3).
rows = [(word, date, number)
        for word, dates in data_dict.items()
        for date, number in dates.items()]
df = pd.DataFrame(rows, columns=['word', 'date', 'number'])

完整的程序将是:

from collections import defaultdict
import csv
import re
import sys
import pandas as pd

def flushPrint(s):
    """Overwrite the current console line with ``s``.

    Writes a carriage return followed by ``s`` (formatted with ``%s``) to
    standard output and flushes immediately so the text is shown right away.
    """
    out = sys.stdout
    out.write('\r')
    out.write('%s' % s)
    out.flush()


# Nested counter: data_dict[word][date] -> number of occurrences.
data_dict = defaultdict(lambda: defaultdict(int)) 
# Counters printed at the end: failed rows, parsed CSV rows, raw lines read.
error_num = 0
line_num = 0
total_num = 0

# NOTE(review): the file handle is never closed in this snippet.
bigfile = open('data.txt', 'rb')
# Size hint for readlines(): each call returns whole lines totalling roughly
# this many bytes, so the file is processed chunk by chunk.
chunkSize = 10000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    # Strip NUL bytes from every line before handing the chunk to the CSV parser.
    lines = csv.reader((line.replace('\x00','') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num+=1
        # Progress indicator every million parsed rows.
        if line_num%1000000==0:
            flushPrint(line_num)
        try:
            # Remove URLs and @mentions from the second field
            # (the "Text" column in the sample data shown in the question).
            i[1]= re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+|(?:@[\w_]+)', "", i[1])
            # Split on runs of non-word characters; re.split can yield empty
            # strings at the edges, and those are counted like any other token.
            tweets=re.split(r"\W+",i[1])
            # Fourth field (the "Day" column in the sample data).
            date=i[3]
            for word in tweets: # error
                # Only count when the date field is exactly 10 characters long
                # (presumably YYYY-MM-DD -- confirm against the data).
                if len(date)==10:
                    data_dict[word][date] += 1
        except Exception, e:
            # Python 2 syntax. Any failure on a row (e.g. a row with too few
            # fields) is printed and counted, then processing continues.
            print e
            error_num+=1
            pass
    chunk = bigfile.readlines(chunkSize) 
print line_num, total_num,error_num


# Flatten the nested dict into one row per (word, date) pair.
# Picking only the first key/value of each inner dict would silently drop every
# additional date recorded for a word (and indexing dict views fails on Python 3).
rows = [(word, date, number)
        for word, dates in data_dict.items()
        for date, number in dates.items()]
df = pd.DataFrame(rows, columns=['word', 'date', 'number'])

print df

结果:

                word        date  number
0                 RT  2011-10-06       2
1        HELICOPTERS  2011-10-06       1
2               HELP  2011-10-06       1
3                     2011-10-06       1
4           KETTLING  2011-10-06       1
5                OWS  2011-10-06       1
6   OCCUPYWALLSTREET  2011-10-06       1
7               PARK  2011-10-06       1
8         PROTESTERS  2011-10-06       1
9              ALERT  2011-10-06       1
10          OCCUPYNY  2011-10-06       1
11              COPS  2011-10-06       1
12               ARE  2011-10-06       1
13                 W  2011-10-06       1
14                IN  2011-10-06       1
15            PLEASE  2011-10-06       1
16       PADDYWAGONS  2011-10-06       1
17               AND  2011-10-06       1