从R中的字符串中提取特定模式

时间:2016-03-19 00:15:48

标签: r

我想从以下导入的数据中提取一系列数字(来自后者的中间2位数字和后4位数字)以创建新列表Date

 1. 251502 091412  
 2. 81502 091549  
 3. 591601 011207  
 4. 591601 011213  
 5. 591502 1026025  
 6. 591601 011203  
 7. 591601 011206  
 8. 631601 120302

我为测试运行编写了以下代码并且运行良好:

x <- c("251502 091412", "591502 1026025")
t = do.call(rbind, strsplit(x, " "))
t1 = substring(rev(t[,1]), 3, 4) 
t2 = substring(t[, 2], 1, 4)
t = paste("20", t1, t2, sep = "")
ar <- as.Date(t, "%Y%m%d")

但是,当我导入更大的数据集时,如果无法应用。我有什么遗漏的吗?

---------------------------- UPDATE ------------------ ----------------------

我想我没有清楚地问这个问题。我想要做的是从UPC中提取数字并将它们转移到date。如果我手动输入UPC,上面的代码工作正常,但是在导入UPC时无法编译。

> sample <- read.csv("~/Documents/Kokko/sample/sample.csv")
> sample
   sell.date Qty            UPC  Price
1    1/12/16   1  251502 091412 139.04
2    1/12/16   1   81502 091549 318.00
3    1/13/16   1  591601 011207 998.00
4    1/13/16   1  591601 011213 338.00
5    1/13/16   1 591502 1026025 131.60
6    1/13/16   1  591601 011203 338.00
7    1/13/16   1  591601 011206 998.00
8    1/13/16   1  631601 120302 298.00
9    1/13/16   1  631601 010523 278.00
10   1/13/16   1  591601 011212 338.00
11   1/13/16   1  591601 122121 378.00
12   1/13/16   1  691502 082036 369.60
13   1/13/16   1  951502 072421 658.00
14   1/14/16   1  591601 011207 998.00
15   1/14/16   1  591601 011212 338.00
16   2/13/16   1  631601 120302 298.00
17            NA                    NA
18            NA                    NA
> 
> library(data.table)
> d <- fread("~/Documents/Kokko/sample/sample.csv", select = c("UPC"))
> x <- as.character(d)
> x
[1] "c(\"251502 091412\", \"81502 091549\", \"591601 011207\", \"591601    011213\", \"591502 1026025\", \"591601 011203\", \"591601 011206\", \"631601 120302\", \"631601 010523\", \"591601 011212\", \"591601 122121\", \"691502 082036\", \"951502 072421\", \"591601 011207\", \"591601 011212\", \"631601 120302\", \"\", \"\")"
> t = do.call(rbind, strsplit(x, " "))
> t1 = substring(rev(t[,1]), 3, 4) 
> t2 = substring(t[, 2], 1, 4)
> t = paste("20", t1, t2, sep = "")
> ar <- as.Date(t, "%Y%m%d")
> ar
[1] NA`

2 个答案:

答案 0 :(得分:2)

您需要索引第一个数字末尾的第一个数字,因为它们的长度不一样:

# read in data as lines of text
x <- readLines(textConnection(' 1. 251502 091412  
                                2. 81502 091549  
                                3. 591601 011207  
                                4. 591601 011213  
                                5. 591502 1026025  
                                6. 591601 011203  
                                7. 591601 011206  
                                8. 631601 120302'))

# split on 1+ whitespace characters
t <- do.call(rbind, strsplit(x, '\\s+'))

# index t1 from the end of the string using `nchar`
t1 <- substr(t[,3], nchar(t[,3])-3, nchar(t[,3])-2) 
t2 <- substr(t[,4], 1, 4)
t <- paste0(t1, t2)           # simplified with `paste0` for `sep = ""`, chop "20" and
ar <- as.Date(t, "%y%m%d")    # use `%y` instead of `%Y` here

ar
# [1] "2015-09-14" "2015-09-15" "2016-01-12" "2016-01-12" "2015-10-26" "2016-01-12" "2016-01-12"
# [8] "2016-12-03"

答案 1 :(得分:0)

我们可以使用sub

一步完成此操作
as.Date(sub("^.+(.{2}).{2}\\s+(.{4}).*", "\\1\\2", 
                      sample$UPC), "%y%m%d")
#[1] "2015-09-14" "2015-09-15" "2016-01-12" "2016-01-12" "2015-10-26"
#[6] "2016-01-12" "2016-01-12" "2016-12-03" "2016-01-05" "2016-01-12"
#[11] "2016-12-21" "2015-08-20" "2015-07-24" "2016-01-12" "2016-01-12"
#[16] "2016-12-03" NA           NA          

数据

sample <- structure(list(sell.date = c("1/12/16", "1/12/16", "1/13/16", 
"1/13/16", "1/13/16", "1/13/16", "1/13/16", "1/13/16", "1/13/16", 
"1/13/16", "1/13/16", "1/13/16", "1/13/16", "1/14/16", "1/14/16", 
"2/13/16", "", ""), Qty = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA), UPC = c("251502 091412", 
"81502 091549", "591601 011207", "591601 011213", "591502 1026025", 
"591601 011203", "591601 011206", "631601 120302", "631601 010523", 
"591601 011212", "591601 122121", "691502 082036", "951502 072421", 
"591601 011207", "591601 011212", "631601 120302", "", ""), Price = c(139.04, 
318, 998, 338, 131.6, 338, 998, 298, 278, 338, 378, 369.6, 658, 
998, 338, 298, NA, NA)), .Names = c("sell.date", "Qty", "UPC", 
"Price"), class = "data.frame", row.names = c("1", "2", "3", 
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", 
"16", "17", "18"))