我有一个这样的 pandas Series:
# Draws a faceted bar chart of births per week whose x-axis has long, labelled
# major ticks and short, unlabelled minor ticks. ggplot2 builds the plot; grid
# supplies unit()/unit.c() and the final drawing calls.
library(ggplot2)
library(grid)

# Defining breaks and labels, minor and major:
range.f <- range(unique(df1$weeks))
minor.f <- 1 # every 1 week, NOTE: range.f[2] should be divisible by minor.f!
major.f <- 5 # every 5 weeks; must be a whole multiple of minor.f
breaks.f <- seq(range.f[1], range.f[2], minor.f)

# Blank out every label except the 1st, (nth + 1)th, (2 * nth + 1)th, ...
# The logical mask `1:nth != 1` has length nth and is recycled along x.
# (lite version of https://stackoverflow.com/a/34533473/6574038,
# works better for me than `insert_minor()`)
every_nth.lt <- function(x, nth) {
  x[1:nth != 1] <- ""
  x
}

# Labels are derived from the breaks themselves so that both vectors always
# have the same length (scale_x_continuous() requires this), whatever the
# first week number and minor step are. Only every (major.f / minor.f)-th
# break keeps its label.
labels.f <- every_nth.lt(breaks.f, major.f / minor.f)
# Number of minor ticks that follow each major tick.
n_minor.f <- major.f / minor.f - 1

# Normal plot:
p.f <- ggplot(df1, aes(weeks, births)) +
  geom_bar(stat = "identity", fill = "#F48024") +
  theme_bw() +
  scale_x_continuous(breaks = breaks.f, labels = labels.f) +
  coord_cartesian(xlim = range.f) +
  facet_wrap(year ~ .) +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(margin = margin(t = 5, unit = "pt"))
  )

# Manipulating plot:
g.f <- ggplotGrob(p.f)
# Positions of the bottom axes (one per facet panel that draws an x-axis).
axis_idx.f <- grep("^axis-b", g.f$layout$name)
xaxis.f <- g.f$grobs[axis_idx.f] # get x-axes
# NOTE(review): children[[2]] is assumed to be the part of each axis that holds
# the tick marks; this depends on ggplot2's internal grob layout - confirm for
# the installed version.
ticks.f <- lapply(lapply(xaxis.f, "["),
                  function(x) x$children[[2]]) # get ticks
marks.f <- lapply(lapply(ticks.f, "["),
                  function(x) x[1]$grobs) # get tick marks

# Editing y-positions of tick marks. Each tick is a pair of y coordinates
# (start, end): one major tick reaching 6pt below the top of the axis area,
# followed by n_minor.f minor ticks reaching only 3pt.
# NOTE(review): this describes a single major/minor cycle; it presumably
# relies on grid recycling the unit vector along all ticks - confirm.
marks.f <- lapply(marks.f, function(x) {
  x[[1]]$y <- unit.c(unit(1, "npc") - unit(6, "pt"),
                     unit(1, "npc"),
                     rep(unit.c(unit(1, "npc") - unit(3, "pt"),
                                unit(1, "npc")), n_minor.f))
  x
})

# Putting tick marks back into plot, innermost object first.
for (i in seq_along(ticks.f)) {
  ticks.f[[i]]$grobs[[1]] <- marks.f[[i]][[1]]
}
for (i in seq_along(xaxis.f)) {
  xaxis.f[[i]]$children[[2]] <- ticks.f[[i]]
}
g.f$grobs[axis_idx.f] <- xaxis.f

# Drawing the plot:
grid.newpage()
grid.draw(g.f)
我想创建 4 个独热编码(one-hot)变量:把该 Series 按四分位数划分为 4 组,并标明每一行的值落在哪个四分位区间。结果应该是这样的:
# Simulated data: 10,000 random dates, each 1 to 1,095 days after 2014-01-01,
# paired with a random birth count between 0 and 10.
tmp <- data.frame(
  date = as.Date("2014-01-01") + sample(1:1095, 10000, replace = TRUE),
  births = sample(0:10, 10000, replace = TRUE)
)
# Calendar year of each date, stored as a factor.
tmp[["year"]] <- factor(format(tmp$date, "%Y"))
# One row per (date, year) combination with the births summed.
df1 <- aggregate(births ~ date + year, data = tmp, FUN = sum)
rm(tmp) # the raw draws are no longer needed
# Week index: floor each date to the start of its week, take the `%W`
# week-of-year number and shift it by one.
# NOTE(review): `format` looks redundant if df1$date is already a Date - confirm.
df1[["weeks"]] <- 1 + as.integer(strftime(
  lubridate::floor_date(as.Date(df1$date, format = "%m/%d/%Y"), unit = "week"),
  "%W"
))
我知道数字不完全匹配,这只是给出所需输出的直观示例。
我已经尝试过了:
0 1787
1 4789
2 1350
3 1476
4 0
5 747
6 307
7 147
8 221
9 -88
10 9374
11 264
12 1109
13 502
14 360
15 194
16 4073
17 2317
18 -221
20 0
21 16
22 106
29 105
30 4189
31 171
32 42
但这只会输出这四个值:
0 1787 Q1 Q2 Q3 Q4
1 4789 0 0 0 0
2 1350 0 0 0 1
3 1476 1 0 0 0
4 0 0 1 0 0
5 747 0 0 1 0
6 307 1 0 1 0
7 147 0 1 0 1
我也尝试过这个:
series.quantile[0.25, 0.5, 0.75, 1]
但是这会导致以下错误:
0.25 67
0.50 442.5
0.75 1477.75
1.00 71188
。
达成目标的最好方法是什么?
非常感谢您
答案 0 :(得分:1)
我想你可以试试看。
# One-hot encode every value of `series` by the quartile interval it falls in,
# producing the indicator columns Q1..Q4 next to the original values.
df = pd.DataFrame({'Series': series})
# Interval edges: min, 25%, 50%, 75%, max (positions 0..4 after reset_index).
quantiles = df['Series'].quantile([0, 0.25, 0.5, 0.75, 1]).to_frame('quantiles').reset_index(drop=True)
for quant, Q in enumerate(['Q1', 'Q2', 'Q3', 'Q4'], start=1):
    lower = quantiles.quantiles[quant - 1]
    upper = quantiles.quantiles[quant]
    # Intervals are (lower, upper], except Q1, which is closed on the left:
    # with a strict `>` on the lowest edge the series minimum would match no
    # quartile and end up with an all-zero row.
    above_lower = (df.Series >= lower) if quant == 1 else (df.Series > lower)
    df[Q] = np.where(above_lower & (df.Series <= upper), 1, 0)
这应该会得到:
Series Q1 Q2 Q3 Q4
0 1787 0 0 0 1
1 4789 0 0 0 1
2 1350 0 0 1 0
3 1476 0 0 0 1
4 0 1 0 0 0
5 747 0 0 1 0
6 307 0 0 1 0
7 147 0 1 0 0
8 221 0 1 0 0
9 -88 1 0 0 0
10 9374 0 0 0 1
11 264 0 1 0 0
12 1109 0 0 1 0
13 502 0 0 1 0
14 360 0 0 1 0
15 194 0 1 0 0
16 4073 0 0 0 1
17 2317 0 0 0 1
18 -221 0 0 0 0
19 0 1 0 0 0
20 16 1 0 0 0
21 106 0 1 0 0
22 105 1 0 0 0
23 4189 0 0 0 1
24 171 0 1 0 0
25 42 1 0 0 0
答案 1 :(得分:1)
下面使用 pandas.qcut 和 pandas.get_dummies 的代码应该可行:
# Label each value of `series` with the quartile it falls in (Q1..Q4).
quantiles = pd.qcut(series,
                    [0, 0.25, 0.5, 0.75, 1],
                    labels=['Q1', 'Q2', 'Q3', 'Q4'])
# One indicator column per label. dtype=int keeps the columns as 0/1; without
# it pandas >= 2.0 returns booleans (True/False) instead.
dummies = pd.get_dummies(quantiles, dtype=int)
# NOTE(review): assumes `df` is the frame holding the original values and
# shares its index with `series` - confirm against the caller.
pd.concat([df, dummies], axis=1)
输出:
Series Q1 Q2 Q3 Q4
0 1787 0 0 0 1
1 4789 0 0 0 1
2 1350 0 0 1 0
3 1476 0 0 0 1
4 0 1 0 0 0
5 747 0 0 1 0
6 307 0 0 1 0
7 147 0 1 0 0
8 221 0 1 0 0
9 -88 1 0 0 0
10 9374 0 0 0 1
11 264 0 1 0 0
12 1109 0 0 1 0
13 502 0 0 1 0
14 360 0 0 1 0
15 194 0 1 0 0
16 4073 0 0 0 1
17 2317 0 0 0 1
18 -221 1 0 0 0
20 0 1 0 0 0
21 16 1 0 0 0
22 106 0 1 0 0
29 105 1 0 0 0
30 4189 0 0 0 1
31 171 0 1 0 0
32 42 1 0 0 0