Question

我有一个数据集，其中每一行对应一位部长的ID；每一行都有若干日期列（日、月、年），分别记录该部长进入政府（begin）和离开政府（exit）的日期。以下是当前数据的示例：

# Example input: one row per minister, with the entry (begin_*) and exit
# (exit_*) dates split into day / month / year columns stored as strings.
data <- data.frame(
  id_min = c("1030015", "1030028"),
  begin_day = c("29", "4"),
  begin_month = c("12", "1"),
  begin_year = c("2019", "2020"),
  exit_day = c("3", "10"),
  exit_month = c("1", "1"),
  exit_year = c("2020", "2020"),
  stringsAsFactors = FALSE
)

我想扩展“开始日期”和“退出日期”之间的行，并创建一个新列（number_days）来计算部长在政府任职的天数。数据集中的其他变量（此处省略）只需在扩展后的数据中重复即可。这是我期望得到的数据框输出：

我已经能够把日期拼成 %Y-%m-%d 格式，并用以下代码创建新列“number_days”：

# Glue the year / month / day strings together and parse them as Dates.
data[["begin_date"]] <- as.Date(
  paste(data$begin_year, data$begin_month, data$begin_day, sep = "-"),
  format = "%Y-%m-%d"
)
data[["exit_date"]] <- as.Date(
  paste(data$exit_year, data$exit_month, data$exit_day, sep = "-"),
  format = "%Y-%m-%d"
)
# Subtracting two Dates gives a difftime in days; this is the elapsed time
# between the two dates, so the begin day itself is not counted.
data[["number_days"]] <- data[["exit_date"]] - data[["begin_date"]]

但是我没有成功扩展两个日期（begin_date和exit_date）之间的行。我试图使用tidyr::complete()来做到这一点。

Answer 1

一种<string.h>解决方案：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Build a new string in which every character of string1 is followed by a
 * complete copy of string2, e.g. ("ab", "12") -> "a12b12".
 *
 * Both arguments must be valid NUL-terminated strings.
 *
 * Returns a malloc'ed, NUL-terminated string that the caller must free(),
 * or NULL if the allocation fails or the result length would not fit in a
 * size_t. An empty string1 yields an empty (but valid) string.
 */
char *CreateString(const char *string1, const char *string2) {
    size_t size1 = strlen(string1);
    size_t size2 = strlen(string2);
    size_t step = size2 + 1; /* one character of string1 plus all of string2 */

    /* Refuse lengths where size1 * step + 1 would wrap around size_t and
     * lead to an undersized allocation. */
    if (size1 != 0 && step > ((size_t)-1 - 1) / size1) {
        return NULL;
    }

    size_t size = size1 * step;
    char *stringptr = malloc(size + 1);

    if (stringptr) {
        for (size_t i = 0; i < size; i += step) {
            stringptr[i] = *string1++;
            /* Length is already known, so copy without re-scanning string2. */
            memcpy(stringptr + i + 1, string2, size2);
        }
        /* Terminate explicitly: when string1 is empty the loop never runs,
         * and the buffer would otherwise be returned uninitialised. */
        stringptr[size] = '\0';
    }
    return stringptr;
}

/* Demo driver: interleave "123" after each letter of "chocolate" and print
 * the result, or report an allocation failure. Always exits with status 0. */
int main() {
    char base[] = "chocolate";
    char filler[] = "123";
    char *combined = CreateString(base, filler);

    if (combined == NULL) {
        printf("cannot allocate memory\n");
        return 0;
    }

    printf("%s\n", combined);
    free(combined);
    return 0;
}

Answer 2

您也可以使用tsibble软件包中的fill_gaps()函数。

library(tsibble)
library(dplyr)
library(tidyr)

# Expand each minister's spell into one row per calendar day by turning the
# begin/exit dates into a keyed time series and filling the gaps between them.
data %>%
  mutate(
    # Parse the split year/month/day strings into Date columns.
    exit_date = as.Date(paste(exit_year, exit_month, exit_day, sep = "-")),
    begin_date = as.Date(paste(begin_year, begin_month, begin_day, sep = "-")),
    # + 1 so that both the begin day and the exit day are counted.
    number_days = as.numeric(exit_date - begin_date) + 1
  ) %>%
  # Stack exit_date / begin_date into two rows per minister: "event" records
  # which column the value came from, "date" holds the value itself.
  pivot_longer(cols = ends_with("date"), names_to = "event",values_to = "date") %>%
  # Treat the result as one time series per id_min, indexed by date.
  # NOTE(review): a spell whose begin and exit dates coincide would give two
  # rows with the same key and index, which as_tsibble() presumably rejects
  # -- confirm before using on such data.
  as_tsibble(key = id_min, index = date) %>%
  # Insert a row for every date missing between the begin and exit rows; the
  # non-key columns of the inserted rows are NA at this point.
  fill_gaps() %>%
  # Overwrite begin_* with the components of each row's own date. format()
  # returns zero-padded strings ("01"), unlike the unpadded input values.
  mutate(begin_day = format(date, "%d"),
         begin_month = format(date, "%m"),
         begin_year = format(date, "%Y")) %>%
  as_tibble() %>%
  # Drop the helper columns created for the reshaping.
  select(-event, -date) %>%
  # Carry the last non-missing value of every column down into the inserted
  # rows. This fill is not grouped: it relies on the first row of each id_min
  # being an original (complete) row.
  fill(everything())

# A tibble: 13 x 8
   id_min  begin_day begin_month begin_year exit_day exit_month exit_year number_days
   <chr>   <chr>     <chr>       <chr>      <chr>    <chr>      <chr>           <dbl>
 1 1030015 29        12          2019       3        1          2020                6
 2 1030015 30        12          2019       3        1          2020                6
 3 1030015 31        12          2019       3        1          2020                6
 4 1030015 01        01          2020       3        1          2020                6
 5 1030015 02        01          2020       3        1          2020                6
 6 1030015 03        01          2020       3        1          2020                6
 7 1030028 04        01          2020       10       1          2020                7
 8 1030028 05        01          2020       10       1          2020                7
 9 1030028 06        01          2020       10       1          2020                7
10 1030028 07        01          2020       10       1          2020                7
11 1030028 08        01          2020       10       1          2020                7
12 1030028 09        01          2020       10       1          2020                7
13 1030028 10        01          2020       10       1          2020                7

Answer 3

一种方法是使用uncount函数：

library(dplyr)
library(tidyr)
library(lubridate)
# Expand every spell into one row per calendar day with tidyr::uncount().
# Each day is derived from the source row's own start date plus the copy
# index, so ministers with several spells (repeated id_min) expand correctly
# and no grouping is needed.
data %>%
  mutate(
    start_date = mdy(paste(begin_month, begin_day, begin_year, sep = "-")),
    end_date = mdy(paste(exit_month, exit_day, exit_year, sep = "-")),
    # Both the begin day and the exit day count as days in office, hence + 1.
    number_days = as.integer(end_date - start_date + 1)
  ) %>%
  # Repeat each row number_days times. `.id` numbers the copies 1..n within
  # their source row; `.remove = FALSE` keeps number_days in the output.
  uncount(number_days, .remove = FALSE, .id = ".day_index") %>%
  mutate(
    # Calendar day represented by this copy of the row.
    .date = start_date + (.day_index - 1L),
    begin_day = day(.date),
    begin_month = month(.date),
    begin_year = year(.date)
  ) %>%
  # Drop the helper columns; only the original columns and number_days remain.
  dplyr::select(-start_date, -end_date, -.day_index, -.date) %>%
  as_tibble()
# A tibble: 13 x 8
   id_min  begin_day begin_month begin_year exit_day exit_month exit_year number_days
   <chr>       <int>       <int>      <int> <chr>    <chr>      <chr>           <int>
 1 1030015        29          12       2019 3        1          2020                6
 2 1030015        30          12       2019 3        1          2020                6
 3 1030015        31          12       2019 3        1          2020                6
 4 1030015         1           1       2020 3        1          2020                6
 5 1030015         2           1       2020 3        1          2020                6
 6 1030015         3           1       2020 3        1          2020                6
 7 1030028         4           1       2020 10       1          2020                7
 8 1030028         5           1       2020 10       1          2020                7
 9 1030028         6           1       2020 10       1          2020                7
10 1030028         7           1       2020 10       1          2020                7
11 1030028         8           1       2020 10       1          2020                7
12 1030028         9           1       2020 10       1          2020                7
13 1030028        10           1       2020 10       1          2020                7

在R中展开开始日期和结束日期之间的行，并计算天数

3 个答案: