在这里我导入数据,对它进行一些操作(这可能不是问题/修复所在的位置)
前两行为我的剪裁设置了我的参数。
lab_var_num <- (0:24)
times_var <-c(0,100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300,2400,2500)
all_files_ls <- read_csv("~/Desktop/bioinformatic_work/log_parse_files/sorted_by_habitat/all_trap/all_files_la_selva_log.csv")
#Eliminate bad data and capture in separate dataframe- "bad" data contained within all_files_ls_bad
all_files_ls_bad<-subset(all_files_ls,all_files_ls$temp<10|all_files_ls$temp>50)
all_files_ls <-subset(all_files_ls,all_files_ls$temp>10&all_files_ls$temp<50)
# convert our character data to date data- then change to POSIXct data type.
# all_dates <- strptime(all_files_ls$date,format="%m/%d/%Y")
# Data needs to be put into a cosnistant format of %m/%d/%Y before you can coerce it
# into POSIXct, or any other, data otherwise it will spit out errors.
all_files_ls$date <- strptime(all_files_ls$date,format="%m/%d/%Y")
all_files_ls$date <- as.POSIXct(all_files_ls$date)
# Create wet and dry season data sets.
all_files_ls_w <- subset(all_files_ls,date>="2015-05-01"&date<="2015-12-31"|date>="2016-05-01"&date<="2016-12-31")
all_files_ls_s <- subset(all_files_ls,date>="2015-01-01"&date<="2015-4-30"|date>="2016-01-01"&date<="2016-04-30")
# Subset into canopy and understory dataframes.
all_files_ls_s_c <- subset(all_files_ls_s,canopy_understory=="c"|canopy_understory=="C")
all_files_ls_s_u <- subset(all_files_ls_s,canopy_understory=="u"|canopy_understory=="U")
all_files_ls_w_c <- subset(all_files_ls_w,canopy_understory=="c"|canopy_understory=="C")
all_files_ls_w_u <- subset(all_files_ls_w,canopy_understory=="u"|canopy_understory=="U")
all_files_ls_s_c_summ <- all_files_ls_s_c %>% group_by(time)%>% summarise(standard_deviation = sd(temp,na.rm=TRUE),mean = mean(temp,na.rm=TRUE))
all_files_ls_s_u_summ <- all_files_ls_s_u %>% group_by(time)%>% summarise(standard_deviation = sd(temp,na.rm=TRUE),mean = mean(temp,na.rm=TRUE))
all_files_ls_w_c_summ <- all_files_ls_w_c %>% group_by(time)%>% summarise(standard_deviation = sd(temp,na.rm=TRUE),mean = mean(temp,na.rm=TRUE))
all_files_ls_w_u_summ <- all_files_ls_w_u %>% group_by(time)%>% summarise(standard_deviation = sd(temp,na.rm=TRUE),mean = mean(temp,na.rm=TRUE))
以下是我的剪辑功能:
all_files_ls_s_c_summ$time <- cut(as.numeric(all_files_ls_s_c_summ$time),breaks=c(times_var),labels = lab_var_num,include.lowest = TRUE)
all_files_ls_s_u_summ$time <- cut(as.numeric(all_files_ls_s_u_summ$time),breaks=c(times_var),labels = lab_var_num,include.lowest = TRUE)
all_files_ls_w_c_summ$time <- cut(as.numeric(all_files_ls_w_c_summ$time),breaks=c(times_var),labels = lab_var_num,include.lowest = TRUE)
all_files_ls_w_u_summ$time <- cut(as.numeric(all_files_ls_w_u_summ$time),breaks=c(times_var),labels = lab_var_num,include.lowest = TRUE)
当我检查剪切功能产生的数据时,我有超过我想要的24个类别。
以下是一些示例数据:
trap serial_no file_name canopy_understory date time temp humidity
1 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 600 20.1 <NA>
2 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 800 25.5 <NA>
3 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1000 29.0 <NA>
4 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1200 28.0 <NA>
5 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1400 28.5 <NA>
6 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1601 27.5 <NA>
7 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1803 25.5 <NA>
8 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 2001 23.5 <NA>
9 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 2200 22.5 <NA>
10 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-29 000 21.5 <NA>
11 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0159 23.6 <NA>
12 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0359 24.1 <NA>
13 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0559 24.1 <NA>
14 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0759 24.6 <NA>
15 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0959 24.6 <NA>
16 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1159 26.1 <NA>
17 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1359 26.6 <NA>
18 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1559 25.6 <NA>
19 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1759 24.1 <NA>
20 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1959 24.1 <NA>
此样本数据可能存在问题,因为我无法提供足够的数据集快照(太大),这可能是问题的高度可变性。
这里是切割产生的一个数据帧:
"","time","standard_deviation","mean"
"1","0",0.864956566100052,23.5574468085106
"2","0",1.14440510857225,22.81103515625
"3","0",0.984904980117555,22.2286831812256
"4","0",1.08678357585325,22.3990654205607
"5","1",1.05145037946718,22.0769704433498
"6","1",1.12960402993109,22.3836754643206
"7","2",1.03725039998279,21.7559322033898
"8","2",1.1068790873174,21.9357894736842
"9","3",1.12097157902533,21.6717980295567
"10","3",1.19621923944834,22.00751953125
"11","4",1.07458677721861,21.4380704041721
"12","4",1.13677253853809,21.6116959064328
"13","5",1.17900504899409,21.4315270935961
"14","5",1.28653071505367,21.79990234375
"15","6",1.20354620166699,21.9286831812256
"16","6",1.31676108631382,22.2322429906542
"17","7",1.86260704732764,23.7655596555966
"18","7",1.77861521566506,24.20419921875
"19","8",2.46883855937697,25.7301298701299
"20","8",2.46920498327612,26.1562427071179
"21","9",2.68395795782085,27.1479115479115
"22","0",0.949097628789142,23.3553191489362
"23","9",2.79945910162021,27.6413533834586
"24","10",2.79930128034239,27.7833981841764
"25","10",2.90435941493285,28.4350606394708
"26","11",2.79704441144441,28.2748466257669
"27","11",2.84178392019108,28.8
"28","12",2.88487423989003,28.5626131953428
"29","12",3.09977843678832,29.2737596471885
"30","13",2.78609514613334,28.6300613496933
"31","13",2.9274394403559,29.0124410933082
"32","14",2.46471466241151,28.0413748378729
"33","14",2.64014509330527,28.5502750275027
"34","15",2.24926437332819,27.1096296296296
"35","15",2.3886068967475,27.4907634307257
"36","16",1.9467999768684,26.0171875
"37","16",1.96854340222531,26.4749174917492
"38","17",1.43673026552318,24.7727385377943
"39","17",1.49178257598373,25.1431279620853
"40","18",1.23662593572858,24.0101694915254
"41","18",1.36276616154878,24.3736434108527
"42","19",1.07197213445298,23.5255266418835
"43","1",0.99431780638411,23.0787234042553
"44","19",1.13453791853054,23.854174573055
"45","20",1.01855291267246,23.1731421121252
"46","20",1.10799364301127,23.4543743078627
"47","21",0.998989468534969,22.9889714993804
"48","21",1.0452391633029,23.2751423149905
"49","22",0.993841145023006,22.6971316818774
"50","22",1.08423014353774,22.9405524861878
"51","23",1.01856406998964,22.517843866171
"52","2",1.03074836073784,22.8872340425532
"53","3",1.10188636506543,22.7382978723404
"54","4",1.11782711780932,22.5787234042553
"55","5",1.06571756649915,22.6106382978723
"56","6",1.16909794681656,23.8127659574468
"57","7",1.28653814110936,26.2702127659574
"58","8",1.39470055539637,28.0787234042553
我正在使用group_by获取每个时间点的摘要数据。然后尝试使用剪切来使其在特定时间附近的每个数据点分配到该时间。因此,如果时间是1801,则它与1800组合在一起.group_by函数仅将具有相同“时间”的每个数据点放在一起。我想要完成的是将每个附近的时间点组合在一起。
我无法弄清楚为什么当我期望得到24个时,我会得到58个类别。
答案 0 :(得分:1)
您可以只按多个变量进行分组,而不是将data.frame的一部分保存为单独的文件并对它们执行相同的操作。您可以使用lubridate::month
从每个日期提取月份(在基础R中可以使用strptime(df$date, '%Y-%m-%d')$mon + 1
),这样您只需使用ifelse
创建新的分组变量,而不是带有重复标签的cut
(这将导致R&gt; = 3.4.0中的错误)。设置所有分组变量后,汇总很简单,DRY。
library(dplyr)
df %>% group_by(canopy_understory, # Group by canopy/understory factor
# Extract numeric month from date. If less than 5, make `season` "s" else "w", and group by it.
season = ifelse(lubridate::month(date) < 5, 's', 'w'),
# Cut time by 0,100,200,...,2400, and group by the factor returned.
hour = cut(time, seq(0, 2400, 100), include.lowest = TRUE)) %>%
summarise(temp_mean = mean(temp), # For each group, calc mean and sd of temp.
temp_sd = sd(temp))
#> # A tibble: 20 x 5
#> # Groups: canopy_understory, season [?]
#> canopy_understory season hour temp_mean temp_sd
#> <fctr> <chr> <fctr> <dbl> <dbl>
#> 1 c w [0,100] 21.5 NA
#> 2 c w (500,600] 20.1 NA
#> 3 c w (700,800] 25.5 NA
#> 4 c w (900,1e+03] 29.0 NA
#> 5 c w (1.1e+03,1.2e+03] 28.0 NA
#> 6 c w (1.3e+03,1.4e+03] 28.5 NA
#> 7 c w (1.6e+03,1.7e+03] 27.5 NA
#> 8 c w (1.8e+03,1.9e+03] 25.5 NA
#> 9 c w (2e+03,2.1e+03] 23.5 NA
#> 10 c w (2.1e+03,2.2e+03] 22.5 NA
#> 11 u s (100,200] 23.6 NA
#> 12 u s (300,400] 24.1 NA
#> 13 u s (500,600] 24.1 NA
#> 14 u s (700,800] 24.6 NA
#> 15 u s (900,1e+03] 24.6 NA
#> 16 u s (1.1e+03,1.2e+03] 26.1 NA
#> 17 u s (1.3e+03,1.4e+03] 26.6 NA
#> 18 u s (1.5e+03,1.6e+03] 25.6 NA
#> 19 u s (1.7e+03,1.8e+03] 24.1 NA
#> 20 u s (1.9e+03,2e+03] 24.1 NA
样本数据的标准偏差为NA
,因为每组中只有一个观察值,但它应该可以在较大的数据上正常工作。
数据
df <- structure(list(trap = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("LS_trap_10c",
"LS_trap_10u"), class = "factor"), serial_no = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("7C000000395C1641", "9F00000039641541"
), class = "factor"), file_name = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("trap10c_7C000000395C1641_150809.csv", "trap10u_9F00000039641541_160110.csv"
), class = "factor"), canopy_understory = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("c", "u"), class = "factor"), date = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("2015-05-28", "2015-05-29", "2016-01-01"
), class = "factor"), time = c(600L, 800L, 1000L, 1200L, 1400L,
1601L, 1803L, 2001L, 2200L, 0L, 159L, 359L, 559L, 759L, 959L,
1159L, 1359L, 1559L, 1759L, 1959L), temp = c(20.1, 25.5, 29,
28, 28.5, 27.5, 25.5, 23.5, 22.5, 21.5, 23.6, 24.1, 24.1, 24.6,
24.6, 26.1, 26.6, 25.6, 24.1, 24.1), humidity = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "<NA>", class = "factor")), .Names = c("trap",
"serial_no", "file_name", "canopy_understory", "date", "time",
"temp", "humidity"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20"))