Merge data frames in for loop

时间:2017-08-04 12:49:30

标签: r apply lapply

I have two data frames. The first is data:

# Positions to annotate: one row per position (`pos`) on a chromosome (`chrom`)
data <- data.frame(
  chrom = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L),
  pos = c(
    10L, 200L, 134L, 400L, 600L, 1000L, 20L, 33L, 40L, 45L, 50L, 55L,
    100L, 123L
  )
)

And tss_locations:

# One row per gene: the chromosome it lies on and its tss position.
# The factor levels are listed explicitly so that they keep the same
# (alphabetical) order regardless of locale.
tss_locations <- data.frame(
  gene = factor(
    paste0("gene", 1:11),
    levels = c(
      "gene1", "gene10", "gene11", "gene2", "gene3", "gene4", "gene5",
      "gene6", "gene7", "gene8", "gene9"
    )
  ),
  chrom = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L),
  tss = c(5L, 10L, 23L, 1340L, 313L, 88L, 44L, 57L, 88L, 74L, 127L)
)

And a function to find the closest match between a position in data (pos) and a position in tss_locations (tss). I would like to make comparisons only where the chrom value is the same for both pos and tss.

So far, I am achieving this by first filtering both data frames to select the same chrom, and then running them through my function:

# Keep only chromosome 1 in both tables. Note that this replaces `data` and
# `tss_locations` with their filtered versions.
# NOTE(review): filter() is presumably dplyr's - it is not defined here.
data <- filter(data, chrom == 1)
tss_locations <- filter(tss_locations, chrom == 1)

# Look up the nearest tss for every position. fun2 reads the filtered
# `tss_locations` and must already be defined when this line runs.
dist2tss <- lapply(X = data[["pos"]], FUN = fun2)

#' Find the tss closest to a position
#'
#' @param p A single numeric position.
#' @param tss_table Data frame with `tss`, `chrom` and `gene` columns to
#'   search. Defaults to `tss_locations`, looked up when the function is
#'   called, so `fun2(p)` keeps working as before.
#' @return An unnamed list of five elements: `p`, the closest `tss`, the
#'   signed distance `p - tss`, and the `chrom` and `gene` of that row (both
#'   as character). Every element after `p` is `NA` when `tss_table` has no
#'   usable `tss` value.
fun2 <- function(p, tss_table = tss_locations) {
  # Get index of nearest tss (which.min() picks the first one on ties)
  index <- which.min(abs(tss_table$tss - p))
  # which.min() returns integer(0) for an empty or all-NA column. Indexing
  # with NA instead gives length-one NA fields, so the result still has one
  # value per field and binds cleanly into a row.
  if (length(index) == 0L) {
    index <- NA_integer_
  }

  # Get corresponding tss, chrom and gene
  closest_tss <- tss_table$tss[index]
  chrom <- as.character(tss_table$chrom[index])
  gene <- as.character(tss_table$gene[index])

  # Calculate distance (negative when p lies before the tss)
  dist <- p - closest_tss
  list(p, closest_tss, dist, chrom, gene)
}

# Convert to data frame
# do.call(rbind, ...) on a list of lists gives a list-matrix with one row per
# position and one column per element returned by fun2.
dist2tss <- as.data.frame(do.call(rbind, dist2tss))
colnames(dist2tss) <- c(
  "snp", "closest_tss", "min_dist", "chrom", "closest_gene"
)
# as.data.frame() leaves every column as a list; flatten each one to a plain
# vector so the columns can be plotted, sorted and grouped directly.
dist2tss[] <- lapply(dist2tss, unlist)
dist2tss$min_dist <- as.numeric(dist2tss$min_dist)

I've tried putting this in a for loop, i.e.:

# NOTE(review): as written, this loop does not yield per-chromosome results:
#  - in the sample data `chrom` is an integer column, so levels(data$chrom)
#    is NULL and the body never runs;
#  - even with a valid sequence, the two assignments below overwrite `data`
#    and `tss_locations`, so later iterations would filter tables that have
#    already been reduced to the first chromosome.
# `...` stands for the rest of the workflow, which the original elides.
for (c in levels(data$chrom)){

  data<-filter(data, chrom == c)
  tss_locations<-filter(tss_locations, chrom == c)
  ...
}

But I'm not sure how to get this to return a data frame for each chromosome, and then merge them together, so that I can plot the global distances.

Any help appreciated

1 个答案:

答案 0 :(得分:1)

This is a bit of a toy example, but the idea is to build a list inside the loop and call do.call(rbind, ...) on it at the end (this assumes all the data frames have the same column structure):

# Three small data frames that share the same columns
df1 <- data.frame(cell = c(1, 2, 3), val = c(345, 123, 466))
df2 <- data.frame(cell = c(67, 3, 2), val = c(234, 234, 56))
df3 <- data.frame(cell = c(3, 67, 23), val = c(23, 8, 34))

# One slot per data frame, filled in by the loop
l <- vector("list", 3)

for (i in seq_len(3)) {
  # Fetch df1, df2, df3 by name and store it in the matching slot
  df.now <- get(paste0("df", i))
  l[[i]] <- df.now
}

# Stack the collected data frames row-wise
do.call(rbind, l)

In the context of your code, you build the data frame for each chromosome as you did in the initial workflow and, at the end of each loop iteration, populate a list element with that new data frame. Then do.call(rbind, ...) them all together, something like this:

l <- list()

# Keep the unfiltered tss table. fun2 reads `tss_locations`, so inside the
# loop that name has to hold the current chromosome's rows only.
all_tss <- tss_locations

# unique() rather than levels(): levels() is NULL for a non-factor column
# such as the integer `chrom` in the sample data, so the loop would never run.
for (chr in unique(data$chrom)) {
  # Filter into new objects so the full tables survive for later iterations
  data_chr <- filter(data, chrom == chr)
  tss_locations <- filter(all_tss, chrom == chr)
  # No tss on this chromosome: nothing to measure against, so skip it
  if (nrow(tss_locations) == 0L) {
    next
  }

  dist2tss <- lapply(data_chr$pos, fun2)
  dist2tss <- as.data.frame(do.call(rbind, dist2tss))
  colnames(dist2tss) <- c(
    "snp", "closest_tss", "min_dist", "chrom", "closest_gene"
  )
  # as.data.frame() leaves list columns; flatten them to plain vectors
  dist2tss[] <- lapply(dist2tss, unlist)
  dist2tss$min_dist <- as.numeric(dist2tss$min_dist)

  # Key by chromosome so each result can also be inspected on its own
  l[[as.character(chr)]] <- dist2tss
}

# Put the full tss table back now that the per-chromosome lookups are done
tss_locations <- all_tss

# unname() stops rbind() from prefixing the row names with the list names
dist2tss <- do.call(rbind, unname(l))