我已使用sudo pip install pandas --upgrade
更新了我的pandas软件包,我尝试使用virtualenv
环境但是当我尝试使用pandas
绘制图表时仍然收到以下错误matplotlib
我已经通过终端和spyder
IDE测试了代码。
这是我的数据:
Call ID Mins Call Number Name Join Time
1434600 (01/02) 185
000-000-0000 Unspecified 4:59:12 AM
000-000-0000 Unspecified 4:59:17 AM
000-000-0000 Unspecified 5:00:36 AM
000-000-0000 Unspecified 5:02:14 AM
000-000-0000 Unspecified 5:11:07 AM
000-000-0000 Unspecified 4:58:11 AM
这是我的代码:
import pandas as pd
import datetime
def calculate(row):
"""
Calculates the join date and leave date for the session
Also give the time till a user was attending session
"""
date = row['Date']
join = row['Join Time']
leave = row['Leave Time']
year = "2015"
fmt = "%m/%d/%Y %I:%M:%S %p"
date = date.replace('(', '').replace(')', '') + '/' + year
join = date + ' ' + join
leave = date + ' ' + leave
join = datetime.datetime.strptime(join, fmt)
leave = datetime.datetime.strptime(leave, fmt)
time = (leave - join) / 60
# strangely the seconds giving the value for minutes
# normally we get seconds then convert to seconds
mins = time.seconds
return join, leave, mins
# read the csv file into pandas dataframe
conf_df = pd.read_csv('data/all_report.csv')
# show all the columns inside the dataframe
print list(conf_df.columns.values)
# remove the first line as it contains only summary info
conf_df = conf_df[1:]
# remove the column, doesn't contain any value
# he axis=1 specifier, that means that the application is done at a row,
# rather than a column level.
conf_df = conf_df.drop('Conference Code', 1)
conf_df.dropna(subset=['Call Number'], how='all')
# fill the conference id across the user details
conf_df['Call ID'] = conf_df['Call ID'].fillna(method='ffill')
# remove rows which don't contain the users
conf_df.dropna(subset=['Call Number'], inplace=True)
conf_df["Conference_ID"], conf_df["Date"] = zip(*conf_df["Call ID"].str.split().tolist())
# remove the Call ID column
conf_df = conf_df.drop('Call ID', 1)
# computing all the values necessary for the analysis
conf_df['Join Time'], conf_df['Leave Time'], conf_df['Mins'] = zip(*conf_df.apply(lambda row: calculate(row),axis=1))
print conf_df.head(8)
conf_df
conf_df.groupby(conf_df["Join Time"]).count().plot(kind="bar")
#conf_df.groupby(conf_df["Join Time"].dt.month).count().plot(kind="bar")
conf_df.plot(y=conf_df['Join Time'])
# selecting the months and plotting the bar-graph
months = conf_df.groupby(conf_df["Join Time"].dt.month)
months.size()
months.size().plot("line")
# getting the date time
days = conf_df.groupby(conf_df["Join Time"].dt.date)
days.size()
days.size().plot("bar")
# selecting the session only on Wednesday, Friday and Saturday
conf_df_wfs = conf_df[(conf_df["Join Time"].dt.weekday == 2) | (conf_df["Join Time"].dt.weekday == 4) | (conf_df["Join Time"].dt.weekday == 5)]
conf_df_wfs.groupby(conf_df["Join Time"].dt.weekday).size().plot("bar")
# saving the pandas dataframe to csv
conf_df.to_csv('data/processed_data.csv')
错误:
File "analyze_data.py", line 68, in <module>
conf_df.plot(y=conf_df['Join Time'])
File "/usr/local/lib/python2.7/dist-packages/pandas/tools/plotting.py", line 3671, in __call__
sort_columns=sort_columns, **kwds)
File "/usr/local/lib/python2.7/dist-packages/pandas/tools/plotting.py", line 2556, in plot_frame
**kwds)
File "/usr/local/lib/python2.7/dist-packages/pandas/tools/plotting.py", line 2370, in _plot
series = data[y].copy() # Don't modify
File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 1963, in __getitem__
return self._getitem_array(key)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 2007, in _getitem_array
indexer = self.ix._convert_to_indexer(key, axis=1)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.py", line 1150, in _convert_to_indexer
raise KeyError('%s not in index' % objarr[mask])
KeyError: "['2015-01-01T23:59:12.000000000-0500' '2015-01-01T23:59:17.000000000-0500'\n '2015-01-02T00:00:36.000000000-0500' ...,\n '2015-11-25T00:06:59.000000000-0500' '2015-11-25T00:16:50.000000000-0500'\n '2015-11-24T23:56:52.000000000-0500'] not in index"