我试图根据我使用tidyr的spread()函数创建的数据帧绘制一些相关图。当我使用spread()函数时,它在新数据帧中产生了NA。这是有道理的,因为原数据框中不同参数的浓度值是在不同时间段记录的。
当我使用spread()函数时,它给了我这样的数据帧(样本数据):
# Sample data: a data frame literal with 20 rows. In every row exactly one of
# the three measurement columns (Chloride, Specific conductance, Total
# dissolved solids) holds a value; the other two are NA.
structure(list(orgid = c("11NPSWRD", "11NPSWRD", "11NPSWRD",
"11NPSWRD", "11NPSWRD", "11NPSWRD", "11NPSWRD", "11NPSWRD", "11NPSWRD",
"11NPSWRD", "11NPSWRD", "11NPSWRD", "11NPSWRD", "11NPSWRD", "11NPSWRD",
"11NPSWRD", "11NPSWRD", "11NPSWRD", "11NPSWRD", "11NPSWRD"),
locid = c("11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2",
"11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2",
"11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2",
"11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2",
"11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2",
"11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2",
"11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2", "11NPSWRD-MORR_NPS_PR2"
), stdate = structure(c(9891, 9891, 9891, 9920, 9920, 9920,
9949, 9949, 9949, 9978, 9978, 9978, 10011, 10011, 10011,
10067, 10067, 10073, 10073, 10073), class = "Date"), sttime = structure(c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), class = c("hms",
"difftime"), units = "secs"), valunit = c("uS/cm", "mg/l",
"mg/l", "uS/cm", "mg/l", "mg/l", "uS/cm", "mg/l", "mg/l",
"uS/cm", "mg/l", "mg/l", "uS/cm", "mg/l", "mg/l", "uS/cm",
"mg/l", "uS/cm", "mg/l", "mg/l"), swqs = c("FW2-TP", "FW2-TP",
"FW2-TP", "FW2-TP", "FW2-TP", "FW2-TP", "FW2-TP", "FW2-TP",
"FW2-TP", "FW2-TP", "FW2-TP", "FW2-TP", "FW2-TP", "FW2-TP",
"FW2-TP", "FW2-TP", "FW2-TP", "FW2-TP", "FW2-TP", "FW2-TP"
), WMA = c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), year = c(1997L, 1997L, 1997L,
1997L, 1997L, 1997L, 1997L, 1997L, 1997L, 1997L, 1997L, 1997L,
1997L, 1997L, 1997L, 1997L, 1997L, 1997L, 1997L, 1997L),
Chloride = c(NA, 35, NA, NA, 45, NA, NA, 30, NA, NA, 30,
NA, NA, 30, NA, NA, NA, NA, 35, NA), `Specific conductance` = c(224,
NA, NA, 248, NA, NA, 204, NA, NA, 166, NA, NA, 189, NA, NA,
119, NA, 194, NA, NA), `Total dissolved solids` = c(NA, NA,
101, NA, NA, 115, NA, NA, 96, NA, NA, 79, NA, NA, 89, NA,
56, NA, NA, 92)), .Names = c("orgid", "locid", "stdate",
"sttime", "valunit", "swqs", "WMA", "year", "Chloride", "Specific conductance",
"Total dissolved solids"), row.names = c(NA, 20L), class = "data.frame")
我遇到的问题是,当我尝试创建相关图时,它只给我一个点。我猜这是因为数据帧中有NA。但是当我尝试过滤掉NA时,得到的却是一个包含0个观测值的数据框。任何帮助将不胜感激!
用于创建相关图的示例代码:
# Builds a scatter plot with a linear-fit line and stores it in plot1.
# NOTE(review): the quoted names inside aes() are string constants, not column
# references, so every row is mapped to the same x/y value; referring to the
# column with a space in its name would need backticks instead of quotes.
plot1<-ggplot(data=df,aes(x="Specific conductance",y="Chloride"))+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+
geom_point()
答案 0 :(得分:2)
您需要删除NA,并合并具有相同日期(Date)的行:
library(tidyverse)
library(data.table)
library(ggpmisc)

# Clean up column names by replacing every space with an underscore.
# str_replace() would only replace the first space in each name, leaving
# "Total_dissolved solids"; str_replace_all() handles all of them.
df <- df %>%
  select_all(~ str_replace_all(., " ", "_"))

# Remove NAs and collapse rows which have the same date: within each stdate
# group, drop the NAs from every column, then keep the first row per date.
DT <- data.table(df)
DT2 <- unique(DT[, lapply(.SD, na.omit), by = stdate], by = "stdate")

# One formula shared by the fitted line and the equation label so they agree.
formula1 <- y ~ x

# Scatter plot with a linear fit, annotated with the fitted equation and R^2.
ggplot(data = DT2, aes(x = Specific_conductance, y = Chloride)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, formula = formula1) +
  stat_poly_eq(aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~~")),
    label.x.npc = "left", label.y.npc = "top",
    formula = formula1, parse = TRUE, size = 6
  ) +
  theme_bw(base_size = 14)
由reprex package(v0.2.0.9000)于2018-09-10创建。
答案 1 :(得分:1)
一个快速但粗糙的解决方案是直接修改您已有的数据:按特定的列将数据与其自身合并,
并只保留两个值都不为NA的匹配项。
# Self-join the data on the sampling keys so each row is paired with every
# row sharing the same organisation, location and date.
# (These key columns are only a guess at what has to match between
# Conductance and Chloride.)
df2 <- merge(x = df, y = df, by = c("orgid", "locid", "stdate"))
# The self-join produces every combination of rows per key, duplicates
# included; keep only the pairs where both measurements are present.
df2 <- df2[!is.na(df2$Chloride.x) & !is.na(df2$`Specific conductance.y`), ]
# Scatter plot with a linear fit
ggplot(data = df2, mapping = aes(x = `Specific conductance.y`, y = Chloride.x)) +
  geom_smooth(formula = y ~ x, method = "lm", color = "black", se = FALSE) +
  geom_point()