我有一个股票价格的数据框。其中可能存在重复值,因此在执行合并(merge)操作时,数据会失控。我希望只要任意一列中出现重复值,就给重复的值加上一个很小的增量。
例如表格
library(rgeos) # gBuffer
# Example outline: a data frame with numeric columns `x` and `y` (row.names declares 78 rows).
# The first and last rows are both (6, 5). The rows are used further down as polygon
# vertices (via as.matrix(dat)) and as the base layer of the plot.
dat <- structure(list(x = c(6, 5.98, 5.94, 5.86, 5.75, 5.62, 5.47, 5.31, 5.13, -4.87, -5.04, -5.22, -5.39, -5.55, -5.69, -5.81, -5.9, -5.96, -6, -6, -6, -5.96, -5.9, -5.81, -5.69, -5.55, -5.39, -5.22, -5.04, -3.04, -2.87, -2.69, -2.53, -2.38, -2.25, -2.14, -2.06, -2.02, -2, -2, -1.96, -1.9, -1.81, -1.69, -1.55, -1.39, -1.22, -1.04, -0.87, 1.13, 1.31, 1.47, 1.62, 1.75, 1.86, 1.94, 1.98, 2, 2, 2, 2.04, 2.1, 2.19, 2.31, 2.45, 2.61, 2.78, 2.96, 4.96, 5.13, 5.31, 5.47, 5.62, 5.75, 5.86, 5.94, 5.98, 6), y = c(5, 5.18, 5.35, 5.51, 5.66, 5.78, 5.88, 5.95, 5.99, 5.99, 6, 5.97, 5.92, 5.83, 5.72, 5.59, 5.43, 5.27, 5.09, -4.91, -5.09, -5.27, -5.43, -5.59, -5.72, -5.83, -5.92, -5.97, -6, -6, -5.99, -5.95, -5.88, -5.78, -5.66, -5.51, -5.35, -5.18, -5, -1.91, -1.73, -1.57, -1.41, -1.28, -1.17, -1.08, -1.03, -1, -1.01, -1.01, -1.05, -1.12, -1.22, -1.34, -1.49, -1.65, -1.82, -2, -4.91, -5.09, -5.27, -5.43, -5.59, -5.72, -5.83, -5.92, -5.97, -6, -6, -5.99, -5.95, -5.88, -5.78, -5.66, -5.51, -5.35, -5.18, 5)), row.names = c(NA, -78L), class = "data.frame")
# "Shrink-wrap" the outline: wrap the vertices of `dat` in a single-polygon
# sp object, then buffer that polygon with a negative width (-0.5).
sp <- sp::SpatialPolygons(
  list(
    sp::Polygons(list(sp::Polygon(as.matrix(dat))), ID = "dat")
  )
)
sp2 <- gBuffer(sp, width = -0.5)
# Take the coordinate matrix of the first ring of the first polygon in the
# buffered result and turn it back into a data frame.
buffered_ring <- sp2@polygons[[1]]@Polygons[[1]]
dat2 <- as.data.frame(buffered_ring@coords)
# Manual alternative to the buffer: shrink every vertex of `dat` towards the
# origin in polar coordinates, with an extra per-axis correction for vertices
# whose coordinate is small relative to the maximum on that axis.
#
# Inputs : `dat` with numeric columns `x` and `y`.
# Outputs: `x`, `y` (the transformed coordinates) and `dat3`, a data frame
#          holding them, one row per row of `dat`.
REDUCTION_FACTOR <- 0.92      # radial scale applied to every vertex
CONVEX_SHIFT_CUTOFF <- 0.55   # fraction of max_x / max_y below which the extra correction applies
max_x <- max(dat$x)
max_y <- max(dat$y)

# Polar form of all vertices at once. The computation is vectorised rather
# than looped: `1:length(dat$x)` would iterate over c(1, 0) for a zero-row
# `dat`, and growing `x`/`y` with c() on every iteration is quadratic.
theta <- atan2(dat$y, dat$x)
r <- REDUCTION_FACTOR * sqrt(dat$x^2 + dat$y^2)

# Per-axis correction: x is stretched and y is compressed by the same amount,
# but only for vertices whose |coordinate| is below the cutoff on that axis.
shift <- 4 * (1 - REDUCTION_FACTOR)
x_scale <- ifelse(abs(dat$x) < CONVEX_SHIFT_CUTOFF * max_x, 1 + shift, 1)
y_scale <- ifelse(abs(dat$y) < CONVEX_SHIFT_CUTOFF * max_y, 1 - shift, 1)

# Back to Cartesian coordinates.
x <- r * cos(theta) * x_scale
y <- r * sin(theta) * y_scale
dat3 <- data.frame(x = x, y = y)
library(ggplot2) # just for vis here
# Overlay the three outlines in one plot, each drawn as a path plus its
# vertices: `dat` in the default colour, `dat2` in red and `dat3` in blue.
ggplot(data = dat, mapping = aes(x = x, y = y)) +
  geom_path() +
  geom_point() +
  geom_path(data = dat2, colour = "red") +
  geom_point(data = dat2, colour = "red") +
  geom_path(data = dat3, colour = "blue") +
  geom_point(data = dat3, colour = "blue")
在上表中,High 列和 low 列都出现了重复值。因此我想给重复的值加上一个很小的量,例如 0.0025。
期望输出
|Date| High| low|
|:--|:---:|---:|
|1-12-2020| 515|505|
|2-12-2020| 525|515|
|3-12-2020| 515| 510|
|4-12-2020|530 |505|
我应该使用什么函数来解决这个问题?谢谢。
答案 0 :(得分:1)
解决方案是这样的。您的数据框是 ddf
Date High low
0 1-12-2020 515 505
1 2-12-2020 525 515
2 3-12-2020 515 510
3 4-12-2020 530 505
并这样做
# Flag every row whose 'High' value occurs more than once; keep=False marks
# the first occurrence as well as the later ones.
mask = ddf['High'].duplicated(keep=False)
# 1-based position of each row within its group of equal 'High' values.
rank_in_group = ddf.groupby('High').cumcount() + 1
# Add that position to the flagged rows only.
ddf.loc[mask, 'High'] += rank_in_group
返回
Date High low
0 1-12-2020 516.0 505
1 2-12-2020 525.0 515
2 3-12-2020 517.0 510
3 4-12-2020 530.0 505
答案 1 :(得分:1)
将 `df.duplicated` 与 `df.loc` 结合使用:
In [1421]: df.loc[df.duplicated('High'), 'High'] = df.loc[df.duplicated('High'), 'High'] + 0.0025
In [1423]: df.loc[df.duplicated('low'), 'low'] = df.loc[df.duplicated('low'), 'low'] + 0.0025
In [1424]: df
Out[1424]:
Date High low
0 1-12-2020 515.0000 505.0000
1 2-12-2020 525.0000 515.0000
2 3-12-2020 515.0025 510.0000
3 4-12-2020 530.0000 505.0025