如何将数据转换为长格式?

时间:2021-07-23 21:30:47

标签: r

我有一个这样的数据集:

structure(list(X = 1:100, username = c("Ohh_Ziggy", "WEAVYwonder_", 
"mcbadlon", "NaazhimSupreme", "nivanacampos", "nrazaliyah", "Bigfamlife", 
"_CurlyFryGuy_", "Jackie_montes", "SplendentSweven", "EclecticCoding", 
"RevTinTin", "MiyaDior", "RudyJb1024", "xkimmygirlx", "Crescen04324237", 
"wsrphoto", "Keioney_kisses", "preciousrubie", "Brenderzzz_", 
"kikilovescats", "samanthacraig15", "cappir", "ChildOUniverse", 
"PeachiesPromo", "Bierkast", "terri2kool", "LattaZakyra", "Yildiiiiiiiiz", 
"d1no_nugget", "brendochendo", "thehulkster", "Nick_Miles_", 
"__aubssss", "Warrior_Maiden", "napitupulu_a", "lizziemonkhouse", 
"smutwiizard", "tayzer6", "SimStrength", "popkis", "AnnetteVitelli3", 
"Karabo_Mtaung", "ankushthebest10", "ducksandchucks", "May_leita", 
"anna_michaux", "WayCatPub", "orodelancs", "CoastCyclist", "bluemoon357", 
"danieldurrans", "james_southcott", "CarmelScotsNews", "JanJoostBouwman", 
"100ProofWoman", "undersiegexo", "parrothead34", "amoodyknapp", 
"Maverick1914", "lilac_bun", "TaterSaladJD", "ItsBittie", "gengen0309", 
"MainMandarin", "napitupulu_a", "Ipeethree3", "KLobstar", "simplyaracelii", 
"stankloaf", "CllrKenPollock", "OnlySimphiwe", "IanSmalley3", 
"KathrynwithaY", "msmrocks", "marge_cord", "m_melodias", "spaceboosh", 
"huegolden", "Crampedsultana", "siempreAM0nae", "Herbie555", 
"SandraRPearce1", "reverend_thom", "natashametzler", "charlesdlcruz", 
"Ho8Go8L1N", "_JLovee", "Felstedboy", "Its_Jack_Brooo", "CuddiCAPALOT", 
"iamesrvan", "PickledGingerBC", "boogy1228", "Honeylotus333", 
"ThisOffendsMeTV", "Just_Jones33", "damnnndor", "dbrown13", "DaYy_ShiNeBRITE"
), compound = c(-0.1027, -0.3612, 0.5574, -0.886, 0.4738, 0, 
0.9277, 0, -0.6077, 0.5023, 0.5635, 0, 0, -0.4767, -0.8248, -0.4678, 
-0.296, 0.0094, 0, 0.9274, 0.6124, -0.6664, 0, 0.6486, 0.6116, 
0.5399, 0.8926, 0, 0.6792, 0.9768, 0, 0.2732, -0.7073, 0.892, 
-0.7783, 0.3818, 0, -0.6739, 0.7314, 0.4588, -0.2411, 0, -0.2212, 
0.2023, -0.2244, 0.296, -0.4417, -0.7003, 0.2946, -0.6808, 0, 
0, -0.0387, -0.3816, 0.5106, 0.296, 0.6739, 0.5487, -0.2023, 
0.5229, -0.9559, 0.6369, 0.743, 0, 0, 0.3818, 0, -0.1531, 0.4278, 
0, 0.555, 0, 0.3182, 0, 0.4939, 0.1531, 0, 0.802, 0.7717, 0.624, 
-0.7184, -0.128, 0.1027, 0.6209, 0.3109, -0.6249, 0.4118, -0.6249, 
-0.296, 0.5083, 0.5994, 0.6124, 0.4199, -0.8069, 0.4767, -0.5423, 
0.5994, 0.9153, -0.296, -0.8417), Date = c("2020-03-30", "2020-03-30", 
"2020-03-26", "2020-03-26", "2020-03-21", "2020-03-20", "2020-03-18", 
"2020-03-18", "2020-03-18", "2020-03-18", "2020-03-16", "2020-03-16", 
"2020-03-16", "2020-03-05", "2020-03-05", "2020-02-27", "2020-02-27", 
"2020-02-26", "2020-02-21", "2020-02-14", "2020-03-15", "2020-02-09", 
"2020-02-07", "2020-02-02", "2020-03-31", "2020-03-30", "2020-03-21", 
"2020-03-20", "2020-02-08", "2020-02-07", "2020-03-20", "2020-03-11", 
"2020-03-01", "2020-02-29", "2020-02-27", "2020-03-25", "2020-03-23", 
"2020-02-17", "2020-02-17", "2020-02-15", "2020-03-31", "2020-03-31", 
"2020-03-27", "2020-03-25", "2020-03-24", "2020-03-22", "2020-03-20", 
"2020-03-18", "2020-03-15", "2020-03-14", "2020-03-13", "2020-03-01", 
"2020-02-29", "2020-02-11", "2020-02-02", "2020-03-15", "2020-03-31", 
"2020-03-22", "2020-03-22", "2020-03-19", "2020-02-22", "2020-02-22", 
"2020-02-11", "2020-03-27", "2020-03-27", "2020-03-25", "2020-02-18", 
"2020-03-17", "2020-03-13", "2020-03-10", "2020-03-30", "2020-03-25", 
"2020-02-17", "2020-02-15", "2020-03-17", "2020-03-17", "2020-02-29", 
"2020-03-26", "2020-03-22", "2020-03-18", "2020-03-18", "2020-03-16", 
"2020-03-15", "2020-02-11", "2020-03-20", "2020-03-18", "2020-03-15", 
"2020-03-14", "2020-02-18", "2020-02-18", "2020-02-16", "2020-02-13", 
"2020-03-29", "2020-03-25", "2020-03-22", "2020-02-24", "2020-02-18", 
"2020-02-15", "2020-03-11", "2020-03-31"), agegroup = c("YA", 
"YA", "YA", "YA", "YA", "YA", "YA", "MA", "YA", "YA", "MA", "YA", 
"YA", "YA", "YA", "YA", "MA", "YA", "YA", "YA", "YA", "YA", "YA", 
"YA", "YA", "YA", "YA", "YA", "YA", "YA", "YA", "MA", "OA", "YA", 
"YA", "YA", "YA", "YA", "YA", "YA", "YA", "YA", "YA", "YA", "YA", 
"YA", "YA", "YA", "MA", "MA", "MA", "YA", "YA", "MA", "YA", "YA", 
"YA", "YA", "MA", "YA", "YA", "YA", "YA", "YA", "YA", "YA", "YA", 
"YA", "YA", "YA", "MA", "MA", "YA", "YA", "MA", "YA", "YA", "YA", 
"YA", "OA", "YA", "YA", "YA", "YA", "YA", "YA", "YA", "YA", "MA", 
"YA", "YA", "YA", "MA", "YA", "YA", "YA", "YA", "YA", "MA", "YA"
)), row.names = c(NA, 100L), class = "data.frame")

一个用户名可能有一对多的观察结果。 我希望生成这样的长格式输出:

按日期所在的区间对时间进行编码:

日期从 2020-03-01 到 2020-05-31 编码为时间“1”

日期从 2020-06-01 到 2020-08-31 编码为时间“2”

日期从 2020-09-01 到 2021-02-28 编码为时间“3”

日期从 2021-03-01 到 2021-04-30 编码为时间“4”

如果有超过 2 个分数落在时间范围内,计算平均值。

我想看到的输出是: 对于每个用户名,他们将有四个观察结果。

Username   Agegroup    Score    Time   Date
A            YA         NA       1     Date in the original dataset
A            YA         0.1      2     Date in the original dataset
A            YA         NA       3     Date in the original dataset
A            YA         NA       4     ...
B            MA         NA       1     ...
B            MA         NA       2     ...
B            MA         0.2      3     ...
B            MA         0.2      4     ...

2 个答案:

答案 0 :(得分:1)

这应该有效

library(dplyr)

# Parse the ISO date strings so they compare as dates, not as text.
df$Date <- as.Date(df$Date)

# Label each observation with the period (1-4) requested in the question.
# Dates outside all four periods (e.g. February 2020) get time = NA.
df <- df %>%
  mutate(time = case_when(
    Date >= as.Date("2020-03-01") & Date <= as.Date("2020-05-31") ~ 1L,
    Date >= as.Date("2020-06-01") & Date <= as.Date("2020-08-31") ~ 2L,
    Date >= as.Date("2020-09-01") & Date <= as.Date("2021-02-28") ~ 3L,
    Date >= as.Date("2021-03-01") & Date <= as.Date("2021-04-30") ~ 4L
  ))

# Mean score per user, age group and period. Observations without a period
# are dropped, and .groups = "drop" returns an ungrouped result.
df2 <- df %>%
  filter(!is.na(time)) %>%
  group_by(username, agegroup, time) %>%
  summarize(score = mean(compound), .groups = "drop")

# Every user crossed with all four periods, whether or not data exists for
# that period (building the grid from observed periods would omit the rest).
expanded_df <- expand.grid(
  username = unique(df$username),
  time = 1:4,
  stringsAsFactors = FALSE
)

# Attach each user's age group, then the scores; user/period combinations
# with no observations keep score = NA.
df3 <- expanded_df %>%
  left_join(distinct(df, username, agegroup), by = "username") %>%
  left_join(df2, by = c("username", "agegroup", "time")) %>%
  arrange(username, time)

enter image description here

答案 1 :(得分:0)

使用 cut 将 Date 分成 4 个区间。对每个 username、agegroup 和 Time 的组合计算 compound 的平均值。使用 complete 为每个 username 补全所有 4 个 Time 值。

library(dplyr)
library(tidyr)

# Build one row per user and period (1-4) holding the mean compound score.
res <- df %>%
  mutate(
    Date = as.Date(Date),
    # cut() on Dates makes left-closed, right-open intervals, so the final
    # break is the day AFTER period 4 ends; otherwise 2021-04-30 is excluded.
    # labels = FALSE returns the integer interval index (1-4), NA if outside.
    Time = cut(
      Date,
      as.Date(c(
        "2020-03-01", "2020-06-01", "2020-09-01", "2021-03-01", "2021-05-01"
      )),
      labels = FALSE
    )
  ) %>%
  group_by(username, agegroup, Time) %>%
  summarise(Score = mean(compound, na.rm = TRUE), .groups = "drop") %>%
  # nesting() keeps each user's own agegroup on the rows that complete() adds,
  # instead of leaving agegroup as NA for periods without data.
  complete(nesting(username, agegroup), Time = 1:4) %>%
  # Drop the leftover rows for dates that fall outside every period.
  filter(!is.na(Time))

res