如何合并具有不同行数和重叠索引的两个pandas DataFrame?

时间:2019-10-04 14:48:46

标签: python pandas dataframe concat

我正在尝试合并两个行数不同的 pandas DataFrame:它们有一部分索引值是共有的,另一部分则只出现在其中一个 DataFrame 中。我想把它们合并成一个 DataFrame。

我正在使用'concat'方法获得结果

library(tidyr)
library(ggplot2)
library(ggiraph)
library(ggdendro)
library(heatmaply)

# mydata <- cor(mtcars)
#' Create a random numeric matrix with labelled rows and columns
#'
#' Values are drawn independently from a uniform distribution on [-2, 2].
#' Rows are named `row_1`, `row_2`, ... and columns `col_1`, `col_2`, ...
#'
#' @param n_row Number of rows (a single number >= 1). Defaults to 50.
#' @param n_col Number of columns (a single number >= 1). Defaults to 50.
#' @return A numeric matrix with `n_row` rows and `n_col` columns.
create_data <- function(n_row = 50L, n_col = 50L) {
  stopifnot(
    is.numeric(n_row), length(n_row) == 1L, n_row >= 1,
    is.numeric(n_col), length(n_col) == 1L, n_col >= 1
  )

  # A single runif() call filled column-major keeps the default 50 x 50 case
  # identical (for a given seed) to the previous hard-coded implementation.
  values <- runif(n_row * n_col, min = -2, max = 2)

  matrix(
    values,
    nrow = n_row,
    ncol = n_col,
    dimnames = list(
      paste0("row_", seq_len(n_row)),
      paste0("col_", seq_len(n_col))
    )
  )
}

#' Build an interactive clustered heatmap with marginal dendrograms
#'
#' Rows and columns of `mydata` are each clustered with average-linkage
#' hierarchical clustering on `dist()` distances. Cells are drawn as
#' interactive tiles (tooltip + data id) ordered by the clustering, the column
#' dendrogram is drawn above the top row and the row dendrogram to the right
#' of the last column.
#'
#' The long-format plotting data is built with base R only, so the function
#' needs nothing beyond ggplot2, ggiraph and ggdendro.
#'
#' @param mydata A numeric matrix (or all-numeric data frame). Row and column
#'   names are used as labels; when missing, positional indices are used.
#' @return An interactive htmlwidget produced by `girafe()`.
gg2heat <- function(mydata) {
  n_rows <- nrow(mydata)
  n_cols <- ncol(mydata)

  # Labels: fall back to positional indices when the input has no dimnames.
  row_names <- rownames(mydata)
  if (is.null(row_names)) {
    row_names <- as.character(seq_len(n_rows))
  }
  col_names <- colnames(mydata)
  if (is.null(col_names)) {
    col_names <- as.character(seq_len(n_cols))
  }

  # Hierarchical clustering of rows and of columns; the leaf order defines
  # the axis order of the heatmap.
  hc_rows <- hclust(dist(mydata), method = "average")
  order_r <- row_names[hc_rows$order]
  hc_cols <- hclust(dist(t(mydata)), method = "average")
  order_c <- col_names[hc_cols$order]

  # Long-format data: one record per matrix cell (column-major), then sorted
  # by the clustered row/column order.
  values <- as.matrix(mydata)
  row_label <- row_names[as.vector(row(values))]
  col_label <- col_names[as.vector(col(values))]
  measure <- as.vector(values)

  expr_set <- data.frame(
    rowvar = factor(row_label, levels = order_r),
    colvar = factor(col_label, levels = order_c),
    measure = measure,
    tooltip = sprintf(
      "Row: %s<br/>Col: %s<br/>measure: %.02f",
      row_label, col_label, measure
    ),
    data_id = sprintf("%s_%s", row_label, col_label),
    stringsAsFactors = FALSE
  )
  expr_set <- expr_set[order(expr_set$rowvar, expr_set$colvar), , drop = FALSE]

  # Column dendrogram segments, shifted up so they sit above the last row.
  data_c <- segment(dendro_data(as.dendrogram(hc_cols), type = "rectangle"))
  data_c$y <- data_c$y + n_rows + .5
  data_c$yend <- data_c$yend + n_rows + .5

  # Row dendrogram segments, rotated by swapping x/y and shifted right so
  # they sit beyond the last column.
  data_r <- segment(dendro_data(as.dendrogram(hc_rows), type = "rectangle"))
  data_r$x_ <- data_r$y + n_cols + .5
  data_r$xend_ <- data_r$yend + n_cols + .5
  data_r$y_ <- data_r$x
  data_r$yend_ <- data_r$xend

  # Heatmap tiles plus both dendrograms.
  p <- ggplot(data = expr_set, aes(x = colvar, y = rowvar)) +
    geom_tile_interactive(
      aes(fill = measure, tooltip = tooltip, data_id = data_id),
      colour = "white"
    ) +
    scale_fill_gradient(low = "white", high = "#BC120A") +
    geom_segment(
      data = data_c,
      mapping = aes(x = x, y = yend, xend = xend, yend = y),
      colour = "gray20", linewidth = .2
    ) +
    geom_segment(
      data = data_r,
      mapping = aes(x = x_, y = y_, xend = xend_, yend = yend_),
      colour = "gray20", linewidth = .2
    ) +
    coord_equal()

  # Cosmetics.
  p <- p +
    theme_minimal() +
    theme(
      legend.position = "right",
      panel.grid.minor = element_line(color = "transparent"),
      panel.grid.major = element_line(color = "transparent"),
      axis.ticks.length = unit(2, units = "mm"),
      plot.title = element_text(face = "bold", hjust = 0.5, size = 12),
      axis.title = element_text(size = 9, colour = "gray30"),
      axis.text.y = element_text(hjust = 1, size = 5, colour = "gray40"),
      axis.text.x = element_text(
        angle = 90, hjust = 1, size = 5, colour = "gray40"
      ),
      legend.title = element_text(face = "bold", hjust = 0.5, size = 8),
      legend.text = element_text(size = 6)
    )

  # girafe() is the supported replacement for the deprecated ggiraph().
  girafe(ggobj = p)
}

# Benchmark: time gg2heat() against heatmaply::heatmaply() on fresh random
# data and compare the elapsed-time distributions.
n_runs <- 20L

# Preallocate the result vectors instead of growing them inside the loop.
htmp_gg <- numeric(n_runs)
htmp_maply <- numeric(n_runs)

for (i in seq_len(n_runs)) {
  df <- create_data()
  htmp_gg[i] <- system.time(gg2heat(df))[["elapsed"]]
  htmp_maply[i] <- system.time(
    heatmaply::heatmaply(df, hclust_method = "ave")
  )[["elapsed"]]
}
rm(df)

# Long format: one row per (method, run) with its elapsed time in seconds.
score <- data.frame(
  method = rep(c("htmp_gg", "htmp_maply"), each = n_runs),
  time = c(htmp_gg, htmp_maply),
  stringsAsFactors = FALSE
)

# Violin plot of the timings per method, with the median marked as a point.
p <- ggplot(score, aes(x = method, y = time, fill = method)) +
  geom_violin() +
  stat_summary(fun = median, geom = "point", size = 2, color = "black")
print(p)

例如我有以下两个数据帧(df1和df2)

 result=pd.concat([df1,df2]).sort_index()

并希望获得此结果:

index         value1
2019-01-01    1
2019-02-01    2
2019-03-01    3
2019-04-01    4
2019-05-01    5
2019-12-01    17

index          value2
2019-01-01     2
2019-02-01     6
2019-08-01     9
2019-09-01     7.5
2019-10-01     11

但是当我使用上面的 `concat` 时,得到的却是类似下面的结果:共有的索引被重复列出,而不是被合并到同一行。

 index      value1    value2
2019-01-01    1       2
2019-02-01    2       6
2019-03-01    3       NaN
2019-04-01    4       NaN   
2019-05-01    5       NaN
2019-08-01    NaN     9
2019-09-01    NaN     7.5
2019-10-01    NaN      11
2019-12-01    17      NaN

任何人都可以指出我在这里做错了什么,或者如何获得我想要的结果?

0 个答案:

没有答案