# Sample input. Each element holds one or more entries of the form
# "<year> [Est] <category> @ $<amount>/Widget", separated by "," or ";".
# The category is Tax, Int, or "Combined Tax/Int"; the unit is spelled both
# "Widget" and "widget".
x <- c("2001 Tax @ $25.19/Widget, 2002 Est Tax @ $10.68/Widget; 2000 Est Int @ $55.67/Widget",
"1999 Tax @ $81.16/Widget",
"1998 Tax @ $52.72/Widget; 2001 Est Int @ $62.49/Widget",
"1994 Combined Tax/Int @ $68.33/widget; 1993 Est Int @ $159.67/Widget",
"1993 Combined Tax/Int @ $38.33/widget; 1992 Est Int @ $159.67/Widget",
"2006 Tax @ $129.21/Widget, 1991 Est Tax @ $58.19/Widget; 1991 Est Int @ $30.95/Widget")
等等。浏览更大的向量可以看出,大多数条目以分号或逗号分隔,并且只使用有限的几个术语——年份、Tax、Int、Combined、Est——条目之间偶尔会有细微差异(例如 “;” 与 “,”,或 “Widget” 与 “widget”)。
[id] [year] [number] [cat] [est]
row1 2001 25.19 Tax
row1 2002 10.68 Tax Est
row1 2000 55.67 Int Est
row2 1999 81.16 Tax
row3 1998 52.72 Tax
row3 2001 62.49 Int Est
[id] [1999tax] [2001tax] [2002esttax] [2000estint]
row1 0 25.19 10.68 55.67
row2 81.16 0 0 0
希望这样说得通——我最终需要把这些数据放进回归模型。
# Split every element of the sample vector `x` into its individual entries.
# Entries are delimited by ";" or ","; the separator itself is dropped, so
# pieces that followed a separator keep their leading space.
pieces.of.x <- strsplit(x, "[;,]")
head(pieces.of.x)
[1] "2001 Tax @ $25.19/Widget" " 2002 Est Tax @ $10.68/Widget" " 2000 Est Int @ $55.67/Widget"
[1] "1999 Tax @ $81.16/Widget"
[1] "1998 Tax @ $52.72/Widget" " 2001 Est Int @ $62.49/Widget"
[1] "1994 Combined Tax/Int @ $68.33/widget" " 1993 Est Int @ $159.67/Widget"
[1] "1993 Combined Tax/Int @ $38.33/widget" " 1992 Est Int @ $159.67/Widget"
[1] "2006 Tax @ $129.21/Widget" " 1991 Est Tax @ $58.19/Widget" " 1991 Est Int @ $30.95/Widget"
答案 0 :(得分:2)
# Pull every "<year> <label> @ $<amount>" entry out of each element of x.
matches <- regmatches(x, gregexpr("[0-9]{4} [^@]+@ \\$[0-9.]+", x))
# Number of entries found per element of x; used below to build the row id.
lengths <- vapply(matches, length, integer(1))
z <- unlist(matches)
# Capture groups: 1 = year, 2 = label (e.g. "Est Tax"), 3 = amount.
# regexec() results start with the full match, so the groups sit at 2..4.
z <- regmatches(z, regexec("([0-9]{4}) ([^@]+) @ \\$([0-9.]+)", z))
# One row per entry, columns year / number / cat (all character).
df <- t(vapply(
  z,
  function(m) c(year = m[2], number = m[4], cat = m[3]),
  c(year = "", number = "", cat = "")
))
# id is the index of the element of x that the entry came from.
df <- data.frame(
  id = rep(seq_along(x), times = lengths),
  df,
  stringsAsFactors = FALSE
)
# Flag estimates before the label is reduced to its last word.
df$est <- ifelse(grepl("Est", df$cat), "Est", "")
# Keep only the trailing token of the label that contains no space or "/"
# (so "Est Tax" -> "Tax" and "Combined Tax/Int" -> "Int").
df$cat <- regmatches(df$cat, regexpr("[^ /]+$", df$cat))
# id year number cat est
# 1 1 2001 25.19 Tax
# 2 1 2002 10.68 Tax Est
# 3 1 2000 55.67 Int Est
# 4 2 1999 81.16 Tax
# 5 3 1998 52.72 Tax
# 6 3 2001 62.49 Int Est
# 7 4 1994 68.33 Int
# 8 4 1993 159.67 Int Est
# 9 5 1993 38.33 Int
# 10 5 1992 159.67 Int Est
# 11 6 2006 129.21 Tax
# 12 6 1991 58.19 Tax Est
# 13 6 1991 30.95 Int Est
答案 1 :(得分:2)
# NOTE: str_match() comes from stringr and data.table() from data.table;
# neither package is attached in the visible code, so load both first.

# Split the row entries
x <- strsplit(x, "[,;]")
# Generate the entry identifiers: the index of the source element, repeated
# once per entry it contains.
id <- rep(seq_along(x), lengths(x))
# Extract the desired values
x <- unlist(x, recursive = FALSE)
year.re <- "(^\\s?([[:digit:]]{4})\\s)"
value.re <- "[$]([[:digit:]]+[.][[:digit:]]{2})[/]"
object.re <- "[/]([[:alnum:]]+)$"
Cats <- c("Tax", "Int", "Combination")
x <- lapply(x, function(str) {
  # 1 = Tax only, 2 = Int only, 3 = both; 0 means neither word is present.
  cat.idx <- grepl("Tax", str) * 1 + grepl("Int", str) * 2
  c(
    # Column 3 of the str_match() result is the inner group: the four digits
    # without the optional leading and the trailing whitespace.
    Year = str_match(str, year.re)[3],
    # Keep the field (as NA) when no category matches so rows stay aligned.
    Category = if (cat.idx > 0) Cats[cat.idx] else NA_character_,
    Estimate = grepl("Est", str),
    Value = str_match(str, value.re)[2],
    Object = str_match(str, object.re)[2]
  )
})
# Create a data object.
data.table(ID = id, do.call(rbind, x), key = c("Year"))
## ID Year Category Estimate Value Object
## 1: 6 1991 Tax TRUE 58.19 Widget
## 2: 6 1991 Int TRUE 30.95 Widget
## 3: 5 1992 Int TRUE 159.67 Widget
## 4: 4 1993 Int TRUE 159.67 Widget
## 5: 5 1993 Combination FALSE 38.33 widget
## 6: 4 1994 Combination FALSE 68.33 widget
## 7: 3 1998 Tax FALSE 52.72 Widget
## 8: 2 1999 Tax FALSE 81.16 Widget
## 9: 1 2000 Int TRUE 55.67 Widget
## 10: 3 2001 Int TRUE 62.49 Widget
## 11: 1 2001 Tax FALSE 25.19 Widget
## 12: 1 2002 Tax TRUE 10.68 Widget
## 13: 6 2006 Tax FALSE 129.21 Widget
答案 2 :(得分:1)
# One string per entry: split on "," or ";" and flatten.
x <- unlist(strsplit(x, ",|;"))
# Tokens of interest per entry, in order of appearance: runs of digits/dots
# (year, amount) and the keywords Tax / Int / Est.
bits <- regmatches(x, gregexpr("(\\d|\\.)+|(Tax|Int|Est)", x))
df <- do.call(rbind, lapply(bits, function(info) {
  data.frame(
    year = info[[1]],              # first token is taken as the year
    number = tail(info, 1)[[1]],   # last token is taken as the amount
    # Tax is checked first, so an entry containing both Tax and Int
    # is labelled "Tax".
    cat = if ("Tax" %in% info) "Tax" else "Int",
    est = if ("Est" %in% info) "Est" else ""
  )
}))
df$cat <- factor(df$cat)
df$est <- factor(df$est)
year number cat est
1 2001 25.19 Tax
2 2002 10.68 Tax Est
3 2000 55.67 Int Est
4 1999 81.16 Tax
5 1998 52.72 Tax
答案 3 :(得分:0)
# Extract every run of digits and dots from each element of x. The "." must
# be part of the pattern, otherwise an amount such as 25.19 is split into
# "25" and "19".
regmatches(x, gregexpr("(\\d|\\.)+", x))
[1] "2001" "25.19" "2002" "10.68" "2000" "55.67"
[1] "1999" "81.16"
[1] "1998" "52.72" "2001" "62.49"
[1] "1994" "68.33" "1993" "159.67"
[1] "1993" "38.33" "1992" "159.67"
[1] "2006" "129.21" "1991" "58.19" "1991" "30.95"
# Flatten x into one string per entry (entries are delimited by "," or ";").
x <- unlist(strsplit(x, ',|;'))
# Runs of digits and dots found in each entry.
nums <- regmatches(x, gregexpr('(\\d|\\.)+', x))
# Fill a two-column numeric table row by row and name the columns.
# Assumes every entry contributes exactly two tokens (year, amount).
df <- setNames(
  data.frame(matrix(as.numeric(unlist(nums)), byrow = TRUE, ncol = 2)),
  c('Year', 'Number')
)
Year Number
1 2001 25.19
2 2002 10.68
3 2000 55.67
4 1999 81.16
5 1998 52.72