我有数据avito_trend.csv的文件,我想打印
的条形图void dragMouse(GLint x, GLint y) {
x = setX(x);
y = setY(y);
glMatrixMode(GL_PROJECTION);
if (mouseMode == 2) {
printf("드래그로 도형을 옮깁니다.\n");
glPushMatrix();
glTranslatef((mouseX - x), (mouse - y), 0.0);
glutPostRedisplay();
glPopMatrix();
}
else if (mouseMode == 3) {
angle += -(x - mouseX)*10.0 / 3.14;
glRotatef(angle, 0.0, 0.0, 1.0);
}
else if (mouseMode == 4) {
scale *= (1.0 + (y - mouseY) / 100.0);
glScalef(scale, scale, 0.0);
}
glutPostRedisplay();
}
我使用以下代码:
import pandas as pd
import itertools
# Load the CSV; parse_dates=[2] asks pandas to parse the column at position 2
# as datetimes (presumably `used_at`, which is accessed through `.dt` further
# down - confirm the column order of the file).
df = pd.read_csv("avito_trend.csv", parse_dates=[2])
def f(df):
    """Count the IDs shared by every pair of distinct addresses in *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'address'`` and ``'ID'``.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of addresses, in the order produced by
        ``itertools.combinations`` over ``df['address'].unique()``:
        column ``'a'`` is the number of distinct IDs seen on both addresses,
        column ``'b'`` is the pair joined as ``"<first> and <second>"``.
        Every row carries index label ``0``, as before.  When *df* has fewer
        than two distinct addresses an empty frame with the same two columns
        is returned (previously ``pd.concat`` raised on the empty list).
    """
    addresses = df['address'].unique()

    # Build each address's ID set once instead of once per pair.
    ids_by_address = {
        address: set(df.loc[df['address'].isin([address]), 'ID'])
        for address in addresses
    }

    rows = [
        {
            'a': len(ids_by_address[first] & ids_by_address[second]),
            'b': ' and '.join((first, second)),
        }
        for first, second in itertools.combinations(addresses, 2)
    ]

    if not rows:
        # Nothing to pair up: keep the output schema but with no rows.
        return pd.DataFrame({
            'a': pd.Series(dtype='int64'),
            'b': pd.Series(dtype='object'),
        })

    # index of all zeros mirrors the original one-row-frame concatenation.
    return pd.DataFrame(rows, columns=['a', 'b'], index=[0] * len(rows))
# Apply f to each calendar year of `used_at`, drop the inner index level that
# f's rows carry, and turn the year back into an ordinary column.
result = df.groupby([df['used_at'].dt.year]).apply(f).reset_index(drop=True, level=1).reset_index()
# print(...) with a single argument works on both Python 2 and Python 3;
# the bare `print result` statement is a syntax error on Python 3.
print(result)
used_at a b
0 2014 1364 avito.ru and e1.ru
1 2014 1716 avito.ru and drom.ru
2 2014 1602 avito.ru and auto.ru
3 2014 299 avito.ru and avtomarket.ru
4 2014 579 avito.ru and am.ru
5 2014 602 avito.ru and irr.ru/cars
6 2014 424 avito.ru and cars.mail.ru/sale
7 2014 634 e1.ru and drom.ru
8 2014 475 e1.ru and auto.ru
9 2014 139 e1.ru and avtomarket.ru
10 2014 224 e1.ru and am.ru
11 2014 235 e1.ru and irr.ru/cars
12 2014 154 e1.ru and cars.mail.ru/sale
13 2014 874 drom.ru and auto.ru
14 2014 247 drom.ru and avtomarket.ru
15 2014 394 drom.ru and am.ru
16 2014 423 drom.ru and irr.ru/cars
17 2014 292 drom.ru and cars.mail.ru/sale
18 2014 243 auto.ru and avtomarket.ru
19 2014 408 auto.ru and am.ru
20 2014 409 auto.ru and irr.ru/cars
21 2014 330 auto.ru and cars.mail.ru/sale
22 2014 133 avtomarket.ru and am.ru
23 2014 139 avtomarket.ru and irr.ru/cars
24 2014 105 avtomarket.ru and cars.mail.ru/sale
25 2014 223 am.ru and irr.ru/cars
26 2014 166 am.ru and cars.mail.ru/sale
27 2014 197 irr.ru/cars and cars.mail.ru/sale
28 2015 1153 avito.ru and e1.ru
29 2015 1473 avito.ru and auto.ru
30 2015 1491 avito.ru and drom.ru
31 2015 403 avito.ru and irr.ru/cars
32 2015 205 avito.ru and avtomarket.ru
33 2015 256 avito.ru and cars.mail.ru/sale
34 2015 262 avito.ru and am.ru
35 2015 451 e1.ru and auto.ru
36 2015 539 e1.ru and drom.ru
37 2015 148 e1.ru and irr.ru/cars
38 2015 105 e1.ru and avtomarket.ru
39 2015 105 e1.ru and cars.mail.ru/sale
40 2015 99 e1.ru and am.ru
41 2015 799 auto.ru and drom.ru
42 2015 288 auto.ru and irr.ru/cars
43 2015 162 auto.ru and avtomarket.ru
44 2015 195 auto.ru and cars.mail.ru/sale
45 2015 224 auto.ru and am.ru
46 2015 277 drom.ru and irr.ru/cars
47 2015 175 drom.ru and avtomarket.ru
48 2015 189 drom.ru and cars.mail.ru/sale
49 2015 187 drom.ru and am.ru
50 2015 73 irr.ru/cars and avtomarket.ru
51 2015 94 irr.ru/cars and cars.mail.ru/sale
52 2015 102 irr.ru/cars and am.ru
53 2015 48 avtomarket.ru and cars.mail.ru/sale
54 2015 72 avtomarket.ru and am.ru
55 2015 73 cars.mail.ru/sale and am.ru
我希望得到this graph之类的东西。
我该怎么做?
我需要让每一对网站的2014年和2015年的数值并排显示在一起。
相反,我需要列ax = result.plot(width=0.5, kind='barh', stacked=True)
答案 0 :(得分:1)
正如@ user308827已经说过的那样,我也会使用seaborn,但我会采用不同的方式:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Script: count site-to-site transitions per year and draw them as a
# horizontal grouped bar chart with seaborn.
pd.set_option('display.expand_frame_repr', False)

cols = ['ID', 'address', 'used_at']
df = pd.read_csv(r'D:\data\gDrive\data\.stack.overflow\data\avito_trend.csv.gz',
                 parse_dates=['used_at'], usecols=cols)

# Order every ID's rows chronologically so neighbouring rows of one ID are
# consecutive visits.
df.sort_values(['ID', 'used_at', 'address'], inplace=True)

# Look one row back *within the same ID*.  A plain shift() over the whole
# frame would pair the last row of one ID with the first row of the next,
# and the resulting (often negative) time difference would pass the
# `<= 10min` filter below.
df['prev_address'] = df.groupby('ID')['address'].shift()
df['time_diff'] = df.groupby('ID')['used_at'].diff()

# Keep only rows where the address changed ...
df = df[df['address'] != df['prev_address']]
# ... within ten minutes of the previous row (the first row of each ID has
# no previous row, so its NaT difference is dropped here).
df = df[df['time_diff'] <= pd.Timedelta('10min')]

tmp = df[['ID', 'address', 'prev_address']] \
    .groupby(['address', 'prev_address', df.used_at.dt.year]) \
    .count() \
    .reset_index()

# remove `df` from memory
del df

tmp['visit_from'] = tmp['prev_address'] + ' -> ' + tmp['address']

# keep only 'interesting' columns
tmp = tmp[['visit_from', 'used_at', 'ID']]
tmp.columns = ['visit_from', 'year', 'visits']

# save temporary groupped CSV file
#fn = r'D:\data\gDrive\data\.stack.overflow\data\avito_grp.csv'
#tmp.to_csv(fn, index=False)

# show all
#df = tmp

# show only those sites with visits >= 100 (within both years)
df = tmp[tmp.groupby('visit_from')['visits'].transform('sum') >= 100].reset_index()

# prepare sorted index
idx = df.groupby('visit_from')['visits'].transform('sum').sort_values(ascending=False).index

# 'apply' index
df = df.reindex(idx)

# add 'total' column (sum of visits for all years)
#df['total'] = df.groupby('visit_from')['visits'].transform('sum')

################################################
#
# SeaBorn plotting
#
sns.set(style="darkgrid")
sns.set_color_codes("pastel")

f, ax = plt.subplots(figsize=(16, 12))
ax = sns.barplot(x='visits', y='visit_from', hue='year', data=df, saturation=0.8)
plt.xlabel('Visits')

# add annotations: write each bar's value just past its right end
for p in ax.patches:
    width = p.get_width()
    # A bar with no data may have a NaN width (seems to depend on the
    # seaborn version - confirm); int(NaN) would raise, so skip it.
    if pd.isna(width):
        continue
    ax.text(width + 3, p.get_y() + p.get_height(),
            str(int(width)), fontsize=8)

plt.show()
PS:你感兴趣的部分从 `SeaBorn plotting` 注释处开始。
评论
答案 1 :(得分:0)
尝试seaborn
来自https://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.barplot.html
import seaborn as sns
# Switch seaborn to its "whitegrid" style.
sns.set_style("whitegrid")
# Load the example dataset named "tips" through seaborn.
tips = sns.load_dataset("tips")
# Bar plot with `day` on the x axis and `total_bill` on the y axis.
ax = sns.barplot(x="day", y="total_bill", data=tips)
对于堆积条形图: https://gist.github.com/randyzwitch/b71d47e0d380a1a6bef9