Question

我有2个数据框，有一些匹配的列（污染物）。第一个数据框包含观察结果，而第二个数据框包含某些污染物的不同阈值。

这里是两个数据帧的一小部分：

dput(df1)
structure(list(sample = structure(27:76, .Label = c("A_1", "A_2", 
"A_LS", "A_PC", "A_PM", "B_1", "B1_1", "B1_2", "B1-8_PC", "B1-8_PM", 
"B1_LS", "B1_PC", "B1_PM", "B_2", "B2_1", "B2_2", "B2-8_PC", 
"B2-8_PM", "B2_LS", "B2_PC", "B2_PM", "B_LS", "B_PC", "B_PM", 
"C_1", "C_2", "C386", "C387", "C388", "C389", "C390", "C391", 
"C392", "C393", "C394", "C395", "C396", "C397", "C398", "C399", 
"C400", "C401", "C402", "C403", "C404", "C405", "C406", "C407", 
"C408", "C409", "C410", "C411", "C412", "C413", "C414", "C415", 
"C416", "C417", "C418", "C419", "C420", "C421", "C422", "C423", 
"C424", "C425", "C426", "C427", "C428", "C429", "C430", "C431", 
"C432", "C433", "C434", "C435", "C436", "C437", "C438", "C439", 
"C440", "C441", "C442", "C443", "C444", "C445", "C446", "C447", 
"C448", "C449", "C450", "C451", "C452", "C453", "C454", "C455", 
"C456", "C457", "C458", "C459", "C460", "C461", "C462", "C463", 
"C464", "C465", "C466", "C467", "C468", "C469", "C470", "C471", 
"C472", "C473", "C474", "C475", "C476", "C477", "C478", "C479", 
"C480", "C481", "C482", "C483", "C484", "C485", "C486", "C487", 
"C488", "C489", "C490", "C491", "C492", "C493", "C494", "C495", 
"C496", "C497", "C498", "C499", "C500", "C501", "C502", "C503", 
"C504", "C505", "C506", "C507", "C508", "C509", "C510", "C511", 
"C512", "C513", "C514", "C515", "C516", "C517", "C518", "C519", 
"C520", "C521", "C522", "C523", "C524", "C-8_PC", "C-8_PM", "D_1", 
"D_2", "E_1", "E_2", "F_1", "F_2"), class = "factor"), As = c(9, 
8.75, 13.5, 7.75, 7.6, 8.33, 8, 8.75, 7.4, 8.25, 8.17, 7.75, 
7.6, 7.5, 7.2, 8, 7.83, 7.75, 7, 7.5, 8.17, 8.75, 6.67, 7, 5.83, 
6.75, 5.6, 6.4, 6.2, 6.2, 6.2, 6.25, 7, 6, 6, 6.4, 6, 5.8, 5.6, 
6, 5.8, 7.25, 8.8, 8.5, 8, 8.25, 8.25, 8.5, 8.25, 8.25), Al = c(30245, 
38060, 36280, 24355, 27776, 35190, 38733.8, 36400, 29624, 33699.75, 
32163.33, 30645.75, 31373, 26647.5, 19987.6, 32210, 27158, 24220.25, 
18598.5, 23081.75, 29393, 26800.5, 22581.67, 29290, 29651.67, 
20947.5, 19762.6, 23815, 32784.8, 20696.2, 26880.6, 25087.75, 
19497.2, 21794, 32232, 24253.4, 20034, 21270, 22510, 15170.25, 
8956.6, 21612.25, 35828, 30006.25, 27128.75, 25835, 31118.75, 
35614.5, 37440.25, 33736.75), Hg = c(0.25, 0.35, 0.48, 1.03, 
1.12, 0.2, 1.14, 0.4, 2, 0.48, 0.85, 0.18, 0.76, 0.4, 0.48, 0.35, 
0.32, 0.33, 0.4, 0.13, 0.15, 0.13, 0.87, 0.12, 0.03, 0.33, 0.2, 
0.22, 0.04, 0.16, 0.1, 0.18, 0.11, 0.08, 0.03, 0.06, 0.06, 0.1, 
0.03, 0.07, 0.03, 0.1, 0.08, 0.11, 0.1, 0.13, 0.08, 0.12, 0.07, 
0.09)), .Names = c("sample", "As", "Al", "Hg"), row.names = c(NA, 
50L), class = "data.frame")

和

dput(df2)
structure(list(As = c(25L, 32L), Hg = c(0.4, 0.8), Cr = c(100L, 
360L), Element = structure(c(1L, 3L), .Label = c("LCB", "LCB_pelite", 
"LCL"), class = "factor")), .Names = c("As", "Hg", "Cr", "Element"
), row.names = c(NA, -2L), class = "data.frame")

实际上原始数据框架更大，但是这个子集提供了这个想法。

我现在想要的是在第3个数据帧中输入第一个df的每个元素的值超过第二个df中包含的阈值。

请注意df2中有 2个不同的阈值（对于每个元素），df2中有一些元素在df1中不匹配（例如 Cr ）。

我试过写一个for循环，但我一次只能为1个元素做这个：

for (i in df2$As)  {
    print(length(which(df1$As > i)))
}

我也试过使用嵌套for循环但没有成功..

Answer 1

我很确定这看起来不太好，但我认为它有效。我添加了一些额外的行来匹配两个数据框中找到的元素，在这种情况下只有1.它可能会对您的完整数据进行一些更改：

df1.2 <- rbind(df1, df1) #Duplicate the df1 to compare to each threshold value
df1.2 <- df1.2[order(df1.2$sample),] #Order by sample again
cols2 <- na.omit(match(colnames(df1), colnames(df2)))[[1]] #Get the columns of df2 which are in df1
cols1 <- na.omit(match(colnames(df2), colnames(df1)))[[1]] #Get the columns of df1 which are in df2
df2.2 <- df2[rep(1:2, nrow(df1)),cols2] #Replicates df2 the number of times to allow matching the thresholds to each sample, once for each threshold
exceeds <- df1.2[,cols1]>df2.2 #Make the comparions and return a boolean
sum(exceeds) #You will need colSums() for more than one column

对于样本数据，从答案中也不清楚ir引用的元素，但如果多个元素匹配且结果是矩阵，则不会发生这种情况。

也许有一种更优雅的方式，无需复制数据帧，不必担心元素匹配的数量。

Answer 2

df3=data.frame(Pollutant="Z",LCB=0,LCL=0,stringsAsFactors=FALSE)
for (p in names(df1)[-1]) {
  if(p %in% names(df2)[1:(length(df2)-1)]) {
    df3 = rbind(df3,c(p,sum(df1[p]>df2[[p]][1]),sum(df1[p]>df2[[p]][2])))
  }
}
df3=df3[-1,]
df3

更新

啊，每个新行都是一个角色向量。完成：

str(df3)
df3$LCB=as.numeric(df3$LCB)
df3$LCL=as.numeric(df3$LCL)
str(df3)

Answer 3

这个怎么样？

foo <- function(x, y) {
    sapply(x, function(i) sum(y>i))
}

cols = c("As", "Hg")
mapply(foo, df2[cols], df1[cols])
#      As Hg
# [1,]  0 10
# [2,]  0  6

如有必要，请将其转换为data.frame。

for循环以查找不同数据帧之间的阈值

3 个答案: