我正在尝试基于几个属性将一个数据框拆分为多个数据框: a. 识别包含标点符号的列名,即 (1)、(2)、(3) 等; b. 在识别出的两个列位置之间拆分数据框,
例如第一个识别出的位置 = 2,第二个识别出的位置 = 11
预期输出= NewDataSet [,2:10],依此类推
这是我们每月接收的数据形式,我们需要对其进行整理,以使其与SQL数据库兼容
# Sample of the monthly export (looks like dput() output): a 3-row data frame
# (row.names = c(NA, -3L)). After `Document ID` come three repeated groups of
# nine columns, each opening with a `Rental unit (n)` column, followed by six
# trailing columns (Longitude ... `Language of DocSet`).
# Column names such as `Unit` and `Gross area` repeat across groups, and
# several columns are stored as factors (integer codes plus .Label levels).
# The value is not assigned to a name here.
structure(list(`Document ID` = c(159812L, 159822L, 170083L),
`Rental unit (1)` = 1:3, `Gross area` = structure(1:3, .Label = c("1,000.00",
"1,001.00", "1,002.00"), class = "factor"), Unit = structure(c(1L,
1L, 1L), .Label = "sq ft", class = "factor"), `Net area` = structure(c(1L,
1L, 1L), .Label = "n/a", class = "factor"), Unit = c(NA,
NA, NA), `Floor no.` = structure(c(1L, 1L, 1L), .Label = "n/a", class = "factor"),
Unit = c(NA, NA, NA), `Start date` = structure(1:3, .Label = c("6/3/2008",
"7/20/2007", "n/a"), class = "factor"), `End date` = structure(c(2L,
1L, 3L), .Label = c("6/29/2025", "6/30/2028", "n/a"), class = "factor"),
`Rental unit (2)` = 3:5, `Gross area` = structure(1:3, .Label = c("1,000.00",
"1,001.00", "1,002.00"), class = "factor"), Unit = structure(c(1L,
1L, 1L), .Label = "sq ft", class = "factor"), `Net area` = structure(c(1L,
1L, 1L), .Label = "n/a", class = "factor"), Unit = c(NA,
NA, NA), `Floor no.` = structure(c(1L, 1L, 1L), .Label = "n/a", class = "factor"),
Unit = c(NA, NA, NA), `Start date` = structure(1:3, .Label = c("6/3/2008",
"7/20/2007", "n/a"), class = "factor"), `End date` = structure(c(2L,
1L, 3L), .Label = c("6/29/2025", "6/30/2028", "n/a"), class = "factor"),
`Rental unit (3)` = 5:7, `Gross area` = structure(1:3, .Label = c("1,000.00",
"1,001.00", "1,002.00"), class = "factor"), Unit = structure(c(1L,
1L, 1L), .Label = "sq ft", class = "factor"), `Net area` = structure(c(1L,
1L, 1L), .Label = "n/a", class = "factor"), Unit = c(NA,
NA, NA), `Floor no.` = structure(c(1L, 1L, 1L), .Label = "n/a", class = "factor"),
Unit = c(NA, NA, NA), `Start date` = structure(1:3, .Label = c("6/3/2008",
"7/20/2007", "n/a"), class = "factor"), `End date` = structure(c(2L,
1L, 3L), .Label = c("6/29/2025", "6/30/2028", "n/a"), class = "factor"),
Longitude = c(NA, NA, NA), Latitude = c(NA, NA, NA), `Orga Unit` = structure(c(2L,
2L, 1L), .Label = c("SESAC and Sublease", " 2018 - Real Estate Lease Demo"
), class = "factor"), `Workflow state` = structure(c(1L,
1L, 1L), .Label = "R1 + R2 done", class = "factor"), `Name of DocSet` = structure(c(3L,
1L, 2L), .Label = c("ii - 1000 - Target", "SESAC", "Stop & Shop executed lease 060308"
), class = "factor"), `Language of DocSet` = structure(c(2L,
2L, 1L), .Label = c("en", "en_US"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
基于列位置的多个数据框
我从下面提到的代码开始:
# Load the monthly export. check.names = FALSE keeps the header names exactly
# as they appear in the file instead of rewriting them into syntactic names.
newFile <- read.csv(file = "sample.csv", check.names = FALSE)
# Indices of every column whose name contains at least one punctuation
# character. Note that this also matches names such as "Floor no." (the dot),
# not only the "(1)", "(2)", "(3)" markers.
vecLoc <- grep(pattern = "[[:punct:]]", x = colnames(newFile))
答案 0 :(得分:1)
# Reproducible copy of the question's sample, bound to `data`: a 3-row data
# frame (row.names = c(NA, -3L)). After `Document ID` (column 1) come three
# repeated groups of nine columns, each opening with a `Rental unit (n)`
# column (columns 2, 11 and 20), followed by six trailing columns
# (Longitude ... `Language of DocSet`), 34 columns in total.
# Column names such as `Unit` and `Gross area` repeat across groups, and
# several columns are stored as factors (integer codes plus .Label levels).
data=structure(list(`Document ID` = c(159812L, 159822L, 170083L),
`Rental unit (1)` = 1:3, `Gross area` = structure(1:3, .Label = c("1,000.00",
"1,001.00", "1,002.00"), class = "factor"), Unit = structure(c(1L,
1L, 1L), .Label = "sq ft", class = "factor"), `Net area` = structure(c(1L,
1L, 1L), .Label = "n/a", class = "factor"), Unit = c(NA,
NA, NA), `Floor no.` = structure(c(1L, 1L, 1L), .Label = "n/a", class = "factor"),
Unit = c(NA, NA, NA), `Start date` = structure(1:3, .Label = c("6/3/2008",
"7/20/2007", "n/a"), class = "factor"), `End date` = structure(c(2L,
1L, 3L), .Label = c("6/29/2025", "6/30/2028", "n/a"), class = "factor"),
`Rental unit (2)` = 3:5, `Gross area` = structure(1:3, .Label = c("1,000.00",
"1,001.00", "1,002.00"), class = "factor"), Unit = structure(c(1L,
1L, 1L), .Label = "sq ft", class = "factor"), `Net area` = structure(c(1L,
1L, 1L), .Label = "n/a", class = "factor"), Unit = c(NA,
NA, NA), `Floor no.` = structure(c(1L, 1L, 1L), .Label = "n/a", class = "factor"),
Unit = c(NA, NA, NA), `Start date` = structure(1:3, .Label = c("6/3/2008",
"7/20/2007", "n/a"), class = "factor"), `End date` = structure(c(2L,
1L, 3L), .Label = c("6/29/2025", "6/30/2028", "n/a"), class = "factor"),
`Rental unit (3)` = 5:7, `Gross area` = structure(1:3, .Label = c("1,000.00",
"1,001.00", "1,002.00"), class = "factor"), Unit = structure(c(1L,
1L, 1L), .Label = "sq ft", class = "factor"), `Net area` = structure(c(1L,
1L, 1L), .Label = "n/a", class = "factor"), Unit = c(NA,
NA, NA), `Floor no.` = structure(c(1L, 1L, 1L), .Label = "n/a", class = "factor"),
Unit = c(NA, NA, NA), `Start date` = structure(1:3, .Label = c("6/3/2008",
"7/20/2007", "n/a"), class = "factor"), `End date` = structure(c(2L,
1L, 3L), .Label = c("6/29/2025", "6/30/2028", "n/a"), class = "factor"),
Longitude = c(NA, NA, NA), Latitude = c(NA, NA, NA), `Orga Unit` = structure(c(2L,
2L, 1L), .Label = c("SESAC and Sublease", " 2018 - Real Estate Lease Demo"
), class = "factor"), `Workflow state` = structure(c(1L,
1L, 1L), .Label = "R1 + R2 done", class = "factor"), `Name of DocSet` = structure(c(3L,
1L, 2L), .Label = c("ii - 1000 - Target", "SESAC", "Stop & Shop executed lease 060308"
), class = "factor"), `Language of DocSet` = structure(c(2L,
2L, 1L), .Label = c("en", "en_US"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
因此,我找到所需列的位置的方式如下:
# Locate the first column of each repeated group: the columns whose names
# carry a numbered marker such as "(1)", "(2)", "(3)".
# The pattern requires digits between literal parentheses, so names that only
# contain other punctuation (e.g. the dot in "Floor no.", which the
# "[[:punct:]]" approach from the question also picked up) are not matched,
# and neither is a stray "*" or a lone parenthesis.
split_locations <- grep("\\([0-9]+\\)", colnames(data))
因此,根据上面创建的 split_locations,我得到的列号为 c(2, 11, 20)——希望这是正确的。
这只是3个拆分,因此您可以简单地进行以下操作:
# Simple approach for exactly three groups: slice `data` by hard-coded
# column ranges, each range starting at one of the split columns.
df1 <- data[, 2:10]           # first group
df2 <- data[, 11:19]          # second group
df3 <- data[, 20:ncol(data)]  # third group through the last column of `data`
但是,如果上述做法对于您的实际分析而言过于简单,您可以执行以下操作:
# Generalized split: create df1, df2, ... from the start columns held in
# `split_locations` (expected to be ascending column indices of `data`).
# Group i runs from its own start column up to the column just before the
# next group's start; the last group runs through the final column of `data`.
split_locations <- c(split_locations, ncol(data)) # append the final column index as the closing boundary
iterate_to <- length(split_locations) - 1L # number of groups to create
for (i in seq_len(iterate_to)) { # seq_len() runs zero times when there are no groups
  first_col <- split_locations[i]
  # Every group except the last stops one column short of the next start so
  # that adjacent groups do not share a column; the last group keeps the
  # final column itself.
  if (i < iterate_to) {
    last_col <- split_locations[i + 1L] - 1L
  } else {
    last_col <- split_locations[i + 1L]
  }
  # assign() creates the variables 'df1', 'df2', ... by name; drop = FALSE
  # keeps a one-column group as a data frame instead of a bare vector.
  assign(paste0("df", i), data[, first_col:last_col, drop = FALSE])
}
上面的代码将拆分后的列分配给不同的数据框:列 2:10 分配给 df1,列 11:19 分配给 df2,依此类推。做法是取 split_locations 中第 i 个位置到第 i+1 个位置之间的列。
希望这有道理。
如果需要,您也可以使用i+1