更改 pandas 中重复项的值

时间:2020-12-19 05:03:06

标签: python pandas

我有一个股票价格数据框。其中可能存在重复值,因此在执行合并(merge)时,数据会失控。我希望每当任何一列中出现重复项时,都将其增加一个很小的量。

例如表格

library(rgeos)   # gBuffer

# Example outline: 78 (x, y) vertices stored as a data.frame. The first and last
# rows are both (6, 5), so the path is a closed ring.
dat <- structure(list(x = c(6, 5.98, 5.94, 5.86, 5.75, 5.62, 5.47, 5.31, 5.13, -4.87, -5.04, -5.22, -5.39, -5.55, -5.69, -5.81, -5.9, -5.96, -6, -6, -6, -5.96, -5.9, -5.81, -5.69, -5.55, -5.39, -5.22, -5.04, -3.04, -2.87, -2.69, -2.53, -2.38, -2.25, -2.14, -2.06, -2.02, -2, -2, -1.96, -1.9, -1.81, -1.69, -1.55, -1.39, -1.22, -1.04, -0.87, 1.13, 1.31, 1.47, 1.62, 1.75, 1.86, 1.94, 1.98, 2, 2, 2, 2.04, 2.1, 2.19, 2.31, 2.45, 2.61, 2.78, 2.96, 4.96, 5.13, 5.31, 5.47, 5.62, 5.75, 5.86, 5.94, 5.98, 6), y = c(5, 5.18, 5.35, 5.51, 5.66, 5.78, 5.88, 5.95, 5.99, 5.99, 6, 5.97, 5.92, 5.83, 5.72, 5.59, 5.43, 5.27, 5.09, -4.91, -5.09, -5.27, -5.43, -5.59, -5.72, -5.83, -5.92, -5.97, -6, -6, -5.99, -5.95, -5.88, -5.78, -5.66, -5.51, -5.35, -5.18, -5, -1.91, -1.73, -1.57, -1.41, -1.28, -1.17, -1.08, -1.03, -1, -1.01, -1.01, -1.05, -1.12, -1.22, -1.34, -1.49, -1.65, -1.82, -2, -4.91, -5.09, -5.27, -5.43, -5.59, -5.72, -5.83, -5.92, -5.97, -6, -6, -5.99, -5.95, -5.88, -5.78, -5.66, -5.51, -5.35, -5.18, 5)), row.names = c(NA, -78L), class = "data.frame")

# "Shrink-wrap" variant: wrap the vertices of `dat` in an sp polygon object,
# buffer it with a negative width (-0.5), and read the coordinates of the
# first ring of the buffered result back into a data frame.
sp <- sp::SpatialPolygons(
  list(
    sp::Polygons(list(sp::Polygon(as.matrix(dat))), ID = "dat")
  )
)
sp2 <- rgeos::gBuffer(sp, width = -0.5)
dat2 <- as.data.frame(
  slot(slot(slot(sp2, "polygons")[[1]], "Polygons")[[1]], "coords")
)


# Polar-coordinate shrink of the outline in `dat`.
#
# Every vertex is converted to polar form about the origin and its radius is
# multiplied by REDUCTION_FACTOR. Vertices whose |x| (resp. |y|) is below
# CONVEX_SHIFT_CUTOFF times the largest x (resp. y) value additionally have
# that coordinate rescaled: x by (1 + 4 * (1 - REDUCTION_FACTOR)) and y by
# (1 - 4 * (1 - REDUCTION_FACTOR)).
#
# The computation is vectorized over all rows. The earlier index loop used
# `1:length(dat$x)`, which iterates over c(1, 0) for a zero-row input and then
# fails at `if`, and it grew `x`/`y` with c() on every pass.
REDUCTION_FACTOR <- 0.92
CONVEX_SHIFT_CUTOFF <- 0.55

# Largest signed coordinate on each axis (not the largest absolute value).
max_x <- max(dat$x)
max_y <- max(dat$y)

# Polar form of every vertex, with the radius already reduced.
r <- REDUCTION_FACTOR * sqrt(dat$x^2 + dat$y^2)
theta <- atan2(dat$y, dat$x)

# Per-vertex extra scale factors; 1 leaves the coordinate unchanged.
x_gain <- ifelse(
  abs(dat$x) < CONVEX_SHIFT_CUTOFF * max_x,
  1 + 4 * (1 - REDUCTION_FACTOR),
  1
)
y_gain <- ifelse(
  abs(dat$y) < CONVEX_SHIFT_CUTOFF * max_y,
  1 - 4 * (1 - REDUCTION_FACTOR),
  1
)

x <- r * cos(theta) * x_gain
y <- r * sin(theta) * y_gain

dat3 <- data.frame(x = x, y = y)

library(ggplot2) # just for vis here
# Overlay the three outlines, each as a path plus points: the original `dat`
# in the default colour, the buffered `dat2` in red, and the polar-shrunk
# `dat3` in blue. All layers share the x/y aesthetic mapping.
ggplot(data = dat, mapping = aes(x = x, y = y)) +
  geom_path() +
  geom_point() +
  geom_path(data = dat2, colour = "red") +
  geom_point(data = dat2, colour = "red") +
  geom_path(data = dat3, colour = "blue") +
  geom_point(data = dat3, colour = "blue")

在上表中,High 列和 low 列都存在重复值。因此我想给重复项加上一个非常小的数值,例如 0.0025。

期望输出

|Date| High| low|
|:--|:---:|---:|
|1-12-2020| 515|505|
|2-12-2020| 525|515|
|3-12-2020| 515| 510|
|4-12-2020|530 |505|

我应该使用什么函数来解决这个问题 谢谢

2 个答案:

答案 0 :(得分:1)

解决方案是这样的。您的数据框是 ddf

        Date  High  low
0  1-12-2020   515  505
1  2-12-2020   525  515
2  3-12-2020   515  510
3  4-12-2020   530  505

并这样做

# Flag every row whose 'High' value occurs more than once; keep=False marks
# all occurrences, including the first one.
mask = ddf['High'].duplicated(keep=False)
# cumcount() numbers the rows inside each group of equal 'High' values from 0,
# and .add(1) shifts that to 1, 2, ...; the flagged rows are increased by that
# per-group position.
# NOTE(review): this bumps duplicates by whole units (1, 2, ...), not by the
# small 0.0025 step the question asks for.
ddf.loc[mask, 'High'] += ddf.groupby('High').cumcount().add(1)

返回

        Date   High  low
0  1-12-2020  516.0  505
1  2-12-2020  525.0  515
2  3-12-2020  517.0  510
3  4-12-2020  530.0  505

答案 1 :(得分:1)

将 df.duplicated 与 df.loc 一起使用:

In [1421]: df.loc[df.duplicated('High'), 'High'] = df.loc[df.duplicated('High'), 'High'] + 0.0025

In [1423]: df.loc[df.duplicated('low'), 'low'] = df.loc[df.duplicated('low'), 'low'] + 0.0025

In [1424]: df
Out[1424]: 
        Date      High       low
0  1-12-2020  515.0000  505.0000
1  2-12-2020  525.0000  515.0000
2  3-12-2020  515.0025  510.0000
3  4-12-2020  530.0000  505.0025