Creating 24 categories for a time variable using cut

Posted: 2017-05-12 21:51:05

Tags: r dataframe cut categorical-data

Here I import the data and do some manipulation on it (this is probably not where the problem/fix lies).

The first two lines set up the parameters for my cut.

lab_var_num <- (0:24) 
times_var <-c(0,100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300,2400,2500)
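A quick sanity check on these parameters: cut() expects exactly length(breaks) - 1 labels, so the 26 break points above pair with the 25 labels 0 through 24.

length(times_var) - 1   # 25 intervals
length(lab_var_num)     # 25 labels (0 through 24)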


all_files_ls <- read_csv("~/Desktop/bioinformatic_work/log_parse_files/sorted_by_habitat/all_trap/all_files_la_selva_log.csv")
# Eliminate bad data and capture it in a separate dataframe - "bad" data is kept in all_files_ls_bad
all_files_ls_bad<-subset(all_files_ls,all_files_ls$temp<10|all_files_ls$temp>50)
all_files_ls <-subset(all_files_ls,all_files_ls$temp>10&all_files_ls$temp<50)

# Convert our character data to date data, then change to the POSIXct data type.
# all_dates <- strptime(all_files_ls$date,format="%m/%d/%Y")
# Data needs to be put into a consistent format of %m/%d/%Y before you can coerce it
# into POSIXct (or any other date type), otherwise it will spit out errors.

all_files_ls$date <- strptime(all_files_ls$date,format="%m/%d/%Y")
all_files_ls$date <- as.POSIXct(all_files_ls$date)
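# A minimal illustration of the conversion above (the example date string is assumed, not from the data):
# as.POSIXct(strptime("05/28/2015", format="%m/%d/%Y"))   # -> POSIXct "2015-05-28", printed in the session's time zone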
# Create wet and dry season data sets.
all_files_ls_w <- subset(all_files_ls,date>="2015-05-01"&date<="2015-12-31"|date>="2016-05-01"&date<="2016-12-31")
all_files_ls_s <- subset(all_files_ls,date>="2015-01-01"&date<="2015-04-30"|date>="2016-01-01"&date<="2016-04-30")


# Subset into canopy and understory dataframes.

all_files_ls_s_c <- subset(all_files_ls_s,canopy_understory=="c"|canopy_understory=="C")
all_files_ls_s_u <- subset(all_files_ls_s,canopy_understory=="u"|canopy_understory=="U")

all_files_ls_w_c <- subset(all_files_ls_w,canopy_understory=="c"|canopy_understory=="C")
all_files_ls_w_u <- subset(all_files_ls_w,canopy_understory=="u"|canopy_understory=="U")

all_files_ls_s_c_summ <- all_files_ls_s_c %>% group_by(time)%>% summarise(standard_deviation = sd(temp,na.rm=TRUE),mean = mean(temp,na.rm=TRUE))
all_files_ls_s_u_summ <- all_files_ls_s_u %>% group_by(time)%>% summarise(standard_deviation = sd(temp,na.rm=TRUE),mean = mean(temp,na.rm=TRUE))
all_files_ls_w_c_summ <- all_files_ls_w_c %>% group_by(time)%>% summarise(standard_deviation = sd(temp,na.rm=TRUE),mean = mean(temp,na.rm=TRUE))
all_files_ls_w_u_summ <- all_files_ls_w_u %>% group_by(time)%>% summarise(standard_deviation = sd(temp,na.rm=TRUE),mean = mean(temp,na.rm=TRUE))

Here are my cut calls:

all_files_ls_s_c_summ$time <- cut(as.numeric(all_files_ls_s_c_summ$time),breaks=c(times_var),labels = lab_var_num,include.lowest = TRUE)
all_files_ls_s_u_summ$time <- cut(as.numeric(all_files_ls_s_u_summ$time),breaks=c(times_var),labels = lab_var_num,include.lowest = TRUE)
all_files_ls_w_c_summ$time <- cut(as.numeric(all_files_ls_w_c_summ$time),breaks=c(times_var),labels = lab_var_num,include.lowest = TRUE)
all_files_ls_w_u_summ$time <- cut(as.numeric(all_files_ls_w_u_summ$time),breaks=c(times_var),labels = lab_var_num,include.lowest = TRUE)

When I inspect the data produced by the cut calls, I end up with more than the 24 categories I want.

Here is some sample data:

  trap        serial_no                           file_name canopy_understory       date  time  temp humidity
1  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28   600  20.1     <NA>
2  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28   800  25.5     <NA>
3  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1000  29.0     <NA>
4  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1200  28.0     <NA>
5  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1400  28.5     <NA>
6  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1601  27.5     <NA>
7  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1803  25.5     <NA>
8  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  2001  23.5     <NA>
9  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  2200  22.5     <NA>
10 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-29   000  21.5     <NA>
11  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0159  23.6     <NA>
12  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0359  24.1     <NA>
13  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0559  24.1     <NA>
14  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0759  24.6     <NA>
15  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0959  24.6     <NA>
16  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1159  26.1     <NA>
17  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1359  26.6     <NA>
18  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1559  25.6     <NA>
19  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1759  24.1     <NA>
20 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1959  24.1     <NA>

There may be an issue with this sample data, since I can't provide a large enough snapshot of the dataset (it's too big) to capture how variable it is.

Here is one of the dataframes produced by the cut:

"","time","standard_deviation","mean"
"1","0",0.864956566100052,23.5574468085106
"2","0",1.14440510857225,22.81103515625
"3","0",0.984904980117555,22.2286831812256
"4","0",1.08678357585325,22.3990654205607
"5","1",1.05145037946718,22.0769704433498
"6","1",1.12960402993109,22.3836754643206
"7","2",1.03725039998279,21.7559322033898
"8","2",1.1068790873174,21.9357894736842
"9","3",1.12097157902533,21.6717980295567
"10","3",1.19621923944834,22.00751953125
"11","4",1.07458677721861,21.4380704041721
"12","4",1.13677253853809,21.6116959064328
"13","5",1.17900504899409,21.4315270935961
"14","5",1.28653071505367,21.79990234375
"15","6",1.20354620166699,21.9286831812256
"16","6",1.31676108631382,22.2322429906542
"17","7",1.86260704732764,23.7655596555966
"18","7",1.77861521566506,24.20419921875
"19","8",2.46883855937697,25.7301298701299
"20","8",2.46920498327612,26.1562427071179
"21","9",2.68395795782085,27.1479115479115
"22","0",0.949097628789142,23.3553191489362
"23","9",2.79945910162021,27.6413533834586
"24","10",2.79930128034239,27.7833981841764
"25","10",2.90435941493285,28.4350606394708
"26","11",2.79704441144441,28.2748466257669
"27","11",2.84178392019108,28.8
"28","12",2.88487423989003,28.5626131953428
"29","12",3.09977843678832,29.2737596471885
"30","13",2.78609514613334,28.6300613496933
"31","13",2.9274394403559,29.0124410933082
"32","14",2.46471466241151,28.0413748378729
"33","14",2.64014509330527,28.5502750275027
"34","15",2.24926437332819,27.1096296296296
"35","15",2.3886068967475,27.4907634307257
"36","16",1.9467999768684,26.0171875
"37","16",1.96854340222531,26.4749174917492
"38","17",1.43673026552318,24.7727385377943
"39","17",1.49178257598373,25.1431279620853
"40","18",1.23662593572858,24.0101694915254
"41","18",1.36276616154878,24.3736434108527
"42","19",1.07197213445298,23.5255266418835
"43","1",0.99431780638411,23.0787234042553
"44","19",1.13453791853054,23.854174573055
"45","20",1.01855291267246,23.1731421121252
"46","20",1.10799364301127,23.4543743078627
"47","21",0.998989468534969,22.9889714993804
"48","21",1.0452391633029,23.2751423149905
"49","22",0.993841145023006,22.6971316818774
"50","22",1.08423014353774,22.9405524861878
"51","23",1.01856406998964,22.517843866171
"52","2",1.03074836073784,22.8872340425532
"53","3",1.10188636506543,22.7382978723404
"54","4",1.11782711780932,22.5787234042553
"55","5",1.06571756649915,22.6106382978723
"56","6",1.16909794681656,23.8127659574468
"57","7",1.28653814110936,26.2702127659574
"58","8",1.39470055539637,28.0787234042553

I'm using group_by to get summary data for each time point, and then trying to use cut so that every data point near a particular time is assigned to that time. So if a time is 1801, it should be grouped with 1800. The group_by function only groups data points whose "time" values are exactly equal; what I want is to group nearby time points together.
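For reference, a minimal sketch (the values are made up, not from the dataset) of how cut() bins raw times with the breaks and labels defined earlier:

cut(c(1800, 1801), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
# returns the labels 17 and 18: with the default right-closed intervals,
# 1800 falls in (1700,1800] while 1801 falls in (1800,1900].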

I can't figure out why I'm getting 58 categories when I expect 24.

1 answer:

Answer 0 (score: 1)

Rather than saving parts of the data.frame as separate objects and running the same operations on each of them, you can just group by multiple variables. You can use lubridate::month to extract the month from each date (in base R, strptime(df$date, '%Y-%m-%d')$mon + 1 works), so you only need ifelse to create a new grouping variable instead of a cut with repeated labels (which will throw an error in R >= 3.4.0). With all the grouping variables set, the summarising is simple and DRY.
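As a quick check of the two month-extraction approaches just mentioned (the example dates are assumed, not taken from the question's data):

d <- as.Date(c("2015-05-28", "2016-01-01"))
lubridate::month(d)                # 5 1
strptime(d, "%Y-%m-%d")$mon + 1    # 5 1 ($mon is zero-based, hence the + 1)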

library(dplyr)

df %>% group_by(canopy_understory,    # Group by canopy/understory factor
                # Extract numeric month from date. If less than 5, make `season` "s" else "w", and group by it.
                season = ifelse(lubridate::month(date) < 5, 's', 'w'), 
                # Cut time by 0,100,200,...,2400, and group by the factor returned.
                hour = cut(time, seq(0, 2400, 100), include.lowest = TRUE)) %>% 
    summarise(temp_mean = mean(temp),    # For each group, calc mean and sd of temp.
              temp_sd = sd(temp))

#> # A tibble: 20 x 5
#> # Groups: canopy_understory, season [?]
#>    canopy_understory season              hour temp_mean temp_sd
#>               <fctr>  <chr>            <fctr>     <dbl>   <dbl>
#>  1                 c      w           [0,100]      21.5      NA
#>  2                 c      w         (500,600]      20.1      NA
#>  3                 c      w         (700,800]      25.5      NA
#>  4                 c      w       (900,1e+03]      29.0      NA
#>  5                 c      w (1.1e+03,1.2e+03]      28.0      NA
#>  6                 c      w (1.3e+03,1.4e+03]      28.5      NA
#>  7                 c      w (1.6e+03,1.7e+03]      27.5      NA
#>  8                 c      w (1.8e+03,1.9e+03]      25.5      NA
#>  9                 c      w   (2e+03,2.1e+03]      23.5      NA
#> 10                 c      w (2.1e+03,2.2e+03]      22.5      NA
#> 11                 u      s         (100,200]      23.6      NA
#> 12                 u      s         (300,400]      24.1      NA
#> 13                 u      s         (500,600]      24.1      NA
#> 14                 u      s         (700,800]      24.6      NA
#> 15                 u      s       (900,1e+03]      24.6      NA
#> 16                 u      s (1.1e+03,1.2e+03]      26.1      NA
#> 17                 u      s (1.3e+03,1.4e+03]      26.6      NA
#> 18                 u      s (1.5e+03,1.6e+03]      25.6      NA
#> 19                 u      s (1.7e+03,1.8e+03]      24.1      NA
#> 20                 u      s   (1.9e+03,2e+03]      24.1      NA

The standard deviations are NA because there is only one observation per group in the sample data, but this should work fine on the larger dataset.
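(For completeness, sd() of a single value is NA in R, which is what produces the NAs above:)

sd(21.5)   # NA: the standard deviation of one observation is undefined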

Data

df <- structure(list(trap = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("LS_trap_10c", 
    "LS_trap_10u"), class = "factor"), serial_no = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L), .Label = c("7C000000395C1641", "9F00000039641541"
    ), class = "factor"), file_name = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
    ), .Label = c("trap10c_7C000000395C1641_150809.csv", "trap10u_9F00000039641541_160110.csv"
    ), class = "factor"), canopy_understory = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L), .Label = c("c", "u"), class = "factor"), date = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L), .Label = c("2015-05-28", "2015-05-29", "2016-01-01"
    ), class = "factor"), time = c(600L, 800L, 1000L, 1200L, 1400L, 
    1601L, 1803L, 2001L, 2200L, 0L, 159L, 359L, 559L, 759L, 959L, 
    1159L, 1359L, 1559L, 1759L, 1959L), temp = c(20.1, 25.5, 29, 
    28, 28.5, 27.5, 25.5, 23.5, 22.5, 21.5, 23.6, 24.1, 24.1, 24.6, 
    24.6, 26.1, 26.6, 25.6, 24.1, 24.1), humidity = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L), .Label = "<NA>", class = "factor")), .Names = c("trap", 
    "serial_no", "file_name", "canopy_understory", "date", "time", 
    "temp", "humidity"), class = "data.frame", row.names = c("1", 
    "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", 
    "14", "15", "16", "17", "18", "19", "20"))