我试图将海量数据转换为数据库,但要使其有序化似乎太麻烦了。
以下是示例数据。
# Sample input (dput output): a 10-row data frame with two factor columns.
# V1 holds the range text (the empty string on rows without a range) and
# V2 holds the label that goes with it ("CTH" or "WO", otherwise empty).
structure(list(V1 = structure(c(2L, 1L, 1L, 3L, 1L, 4L, 1L, 1L,
1L, 5L), .Label = c("", "01.01 ? 01.06 ",
"02.01 ? 02.10 ", "03.01 ? 03.07 ",
"Chapter 4 (04.01~04.10)"), class = "factor"), V2 = structure(c(2L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 3L), .Label = c("", "CTH", "WO"
), class = "factor")), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-10L))
我想要得到的结果如下:
# Desired result (dput output): a 33-row data frame with one row per code.
# V1 is numeric (1.01 ... 4.1) and V2 is a factor with levels "CTH" and "WO".
structure(list(V1 = c(1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 2.01,
2.02, 2.03, 2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 3.01, 3.02,
3.03, 3.04, 3.05, 3.06, 3.07, 4.01, 4.02, 4.03, 4.04, 4.05, 4.06,
4.07, 4.08, 4.09, 4.1), V2 = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("CTH",
"WO"), class = "factor")), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-33L))
你可以发现A列的每个单元格都包含一个区间值。它是我想要制作的数据库中若干单元格的缩写。
我应该用C列中相同的代码来标记这些单元格 - 例如,01.01 对应 CTH,01.02 对应 CTH,01.03 对应 CTH,依此类推直到 01.06。
但是有几个问题。
让我知道如何用R来解决这些问题。
谢谢大家
答案 0 :(得分:1)
它并不是特别优雅,但它可以完成工作。
library(dplyr)
library(stringr)
library(purrr)
library(tidyr)

# Expand every "NN.MM ? NN.KK" range found in V1 into one row per code and
# repeat the V2 label of that range on each generated row.
# `input` is expected to be the two-column (V1, V2) data frame of the question.
input %>%
  # Rows with an empty V1 carry no range, so drop them.
  filter(V1 != "") %>%
  # Extract the "dd.dd" endpoints from the text and glue them together as
  # "low,high" so that separate() can split them into two columns.
  mutate(lim = str_extract_all(V1, "\\d{2}\\.\\d{2}"),
         lim = map_chr(lim, paste0, collapse = ",")) %>%
  separate(lim,
           into = c("low", "high"),
           sep = ",") %>%
  # Split the lower endpoint into the part before the dot (`prefix`) and the
  # two digits after it (`low`).
  separate(low,
           into = c("prefix", "low"),
           sep = "\\.") %>%
  # Keep only the digits after the dot of the upper endpoint. The column is
  # called `high`; referring to `hight` here fails with "object not found".
  mutate(high = str_replace(high, "^.+\\.", ""),
         id = row_number()) %>%
  select(id, V2, prefix, low, high) %>%
  # One single-row data frame per range.
  split(f = list(.$id)) %>%
  map(.f = function(rdf) {
    # low:high enumerates the two-digit suffixes; dividing by 100 turns them
    # into the decimal part that is added to the prefix.
    suffix <- (as.numeric(rdf$low):as.numeric(rdf$high)) / 100
    data.frame(V1 = as.numeric(rdf$prefix) + suffix,
               V2 = rep(rdf$V2, length(suffix)),
               stringsAsFactors = FALSE)
  }) %>%
  bind_rows()
答案 1 :(得分:0)
这是一种混合使用基础 R、stringr 和 data.table 的方法。
library(stringr)
library(data.table)

# Sample data from the question: V1 holds the range text, V2 the label.
aa <- structure(list(V1 = structure(c(2L, 1L, 1L, 3L, 1L, 4L, 1L, 1L, 1L, 5L),
.Label = c("", "01.01 ? 01.06 ", "02.01 ? 02.10 ", "03.01 ? 03.07 ",
"Chapter 4 (04.01~04.10)"), class = "factor"),
V2 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 3L),
.Label = c("", "CTH", "WO"), class = "factor")), .Names = c("V1", "V2"),
class = "data.frame", row.names = c(NA, -10L))

# Keep only the rows that carry a range.
aa <- aa[aa$V1 != "", ]

# Work on trimmed character strings instead of the factor column.
a1 <- str_trim(as.character(aa$V1))

# Split every description at the range separator ("?" or "~"): column 1 holds
# the text with the lower endpoint, column 2 the text with the upper one.
cc <- str_split_fixed(a1, "\\?|~", 2)

# Pull the "d.d" number out of every cell and keep the endpoints numeric.
# matrix() restores the n x 2 shape even when a single row is left.
cc2 <- matrix(as.numeric(str_extract(as.vector(cc), "\\d+\\.\\d+")), ncol = 2)

# Labels stay in their own vector so they do not coerce the endpoints to text.
v2_labels <- as.character(aa$V2)

# One data.table per range; seq_len() yields no iterations for zero rows.
# round() removes the floating-point drift of stepping by 0.01.
range_col <- lapply(seq_len(nrow(cc2)), function(i) {
  data.table(V1 = round(seq(cc2[i, 1], cc2[i, 2], by = 0.01), 2),
             V2 = v2_labels[i])
})

rr <- rbindlist(range_col)
结果为:
V1 V2
1: 1.01 CTH
2: 1.02 CTH
3: 1.03 CTH
4: 1.04 CTH
5: 1.05 CTH
6: 1.06 CTH
7: 2.01 CTH
8: 2.02 CTH
9: 2.03 CTH
10: 2.04 CTH
11: 2.05 CTH
12: 2.06 CTH
13: 2.07 CTH
14: 2.08 CTH
15: 2.09 CTH
16: 2.10 CTH
17: 3.01 CTH
18: 3.02 CTH
19: 3.03 CTH
20: 3.04 CTH
21: 3.05 CTH
22: 3.06 CTH
23: 3.07 CTH
24: 4.01 WO
25: 4.02 WO
26: 4.03 WO
27: 4.04 WO
28: 4.05 WO
29: 4.06 WO
30: 4.07 WO
31: 4.08 WO
32: 4.09 WO
33: 4.10 WO
答案 2 :(得分:0)
# Sample input from the question: V1 holds the range text, V2 the label.
df1 <- structure(list(V1 = structure(c(2L, 1L, 1L, 3L, 1L, 4L, 1L, 1L,
1L, 5L), .Label = c("", "01.01 ? 01.06 ",
"02.01 ? 02.10 ", "03.01 ? 03.07 ",
"Chapter 4 (04.01~04.10)"), class = "factor"), V2 = structure(c(2L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 3L), .Label = c("", "CTH", "WO"
), class = "factor")), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA, -10L))

# Desired result from the question; not used by the pipeline below.
df2 <- structure(list(V1 = c(1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 2.01,
2.02, 2.03, 2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 3.01, 3.02,
3.03, 3.04, 3.05, 3.06, 3.07, 4.01, 4.02, 4.03, 4.04, 4.05, 4.06,
4.07, 4.08, 4.09, 4.1), V2 = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("CTH",
"WO"), class = "factor")), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA, -33L))

library(dplyr)
library(stringr)
library(purrr)
library(tidyr)

df1 %>%
  # Rows with an empty V1 carry no range.
  filter(V1 != "") %>%
  # Extract the "dd.dd" endpoints and glue them together as "low,high".
  mutate(lim = str_extract_all(V1, "\\d{2}\\.\\d{2}"),
         lim = map_chr(lim, paste0, collapse = ",")) %>%
  # convert = TRUE lets separate() type the two new columns.
  separate(lim, into = c("low", "high"), sep = ",", convert = TRUE) %>%
  # Build one comma-separated string of codes per range. map2_chr() keeps
  # `value` a plain character column instead of a list column.
  mutate(value = map2_chr(
    low, high,
    ~ paste0(seq(.x, .y, by = 0.01), collapse = ",")
  )) %>%
  select(V2, value) %>%
  # One row per code, splitting on the comma inserted above.
  separate_rows(value, sep = ",", convert = TRUE)
答案 3 :(得分:0)
这种方法也考虑到了单元格中只有一个数字的情况……这里假设您的数据名为 df:
library(tidyverse)

# Expand the ranges in df$V1 into one row per code, labelled with df$V2.
# `df` is expected to be the two-column (V1, V2) data frame of the question.
# A row that contains a single number is treated as a range of length one.
tibble(
  # extract all matching numbers, i.e. 12.34, as one character vector per row
  lim = str_extract_all(as.character(df$V1), pattern = "\\d{2}\\.\\d{2}"),
  # add info of second column
  second = as.character(df$V2)
) %>%
  # drop rows in which no number was found
  filter(lengths(lim) > 0) %>%
  mutate(
    # the first match is the lower endpoint ...
    from = as.double(map_chr(lim, 1)),
    # ... the last match the upper one; with a single match both coincide
    to = as.double(map_chr(lim, ~ .x[length(.x)])),
    # build every sequence row by row, so that rows with identical endpoints
    # and label cannot be merged into one call to seq();
    # round() removes the floating-point drift of stepping by 0.01
    first = map2(from, to, ~ round(seq(from = .x, to = .y, by = .01), 2))
  ) %>%
  # one row per generated code
  unnest(first) %>%
  select(first, second)