我有一些房屋的空气质量数据。我想从嵌套在如下文件结构的文件夹中的 .csv 文件里递归地导入数据。我遇到的问题是:虽然我可以把这些文件导入为一个列表,但列表中不包含文件夹名或文件名(房间名称或日期),我不知道如何提取房间名称、月份或年份,并在创建列表时将其关联到对应的数据,以便之后处理数据、进行绘图等。
有谁能帮我从文件/文件夹名称中提取门牌号、房间名称、月份和年份,并使用 rbind 创建一个带有 ID 列的 data.frame,用该列表示门牌号、房间、月份和年份?
此代码适用于一个文件夹:
# List every CSV below the working directory. `pattern` is a regular
# expression (not a shell glob), so the extension is escaped and anchored.
filenames <- list.files(pattern = "\\.csv$", full.names = TRUE,
                        recursive = TRUE)
# Read each file into its own data.frame
ldf <- lapply(filenames, read.csv)
# Stack the data.frames row-wise. The three calls below look like alternative
# spellings of the same step; each one overwrites `df`.
df <- dplyr::bind_rows(ldf)
df <- purrr::map_df(ldf, dplyr::bind_rows)
df <- purrr::map_df(ldf, ~.x)
文件结构:
.
├── House 01
| ├── Kitchen
| | ├──House 01 kitchen Apr 2019.csv
| | ├──House 01 kitchen December 2019.csv
| | ├──House 01 kitchen February 2018.csv
| └── Living room
| | ├──House 01 living room Apr 2019.csv
| | ├──House 01 living room December 2019.csv
| | ├──House 01 living room February 2018.csv
├── House 02
| ├── Kitchen
| | ├──House 02 kitchen Apr 2019.csv
| | ├──House 02 kitchen December 2019.csv
| | ├──House 02 kitchen February 2018.csv
| └── Living room
| | ├──House 02 living room Apr 2019.csv
| | ├──House 02 living room December 2019.csv
| | ├──House 02 living room February 2018.csv
每个csv文件如下所示:
我理想的输出是带有列的data.frame: 时间,日期,var1,var2,var2,houseNum,roomName
# List every CSV below the working directory; `pattern` is a regular
# expression, so the extension is escaped and anchored to the end of the name.
filenames <- list.files(pattern = "\\.csv$", full.names = TRUE,
                        recursive = TRUE)
>filenames
[1] "./House 01 Bedroom/House 01 bedroom Apr 2019.csv"
[2] "./House 01 Bedroom/House 01 bedroom December 2018.csv"
[3] "./House 01 Bedroom/House 01 bedroom February 2019.csv"
[4] "./House 01 Bedroom/House 01 bedroom January 2018.csv"
[5] "./House 01 Bedroom/House 01 bedroom March 2019.csv"
[6] "./House 01 Bedroom/House 01 bedroom May 2019.csv"
[7] "./House 01 Bedroom/House 01 bedroom November 2018.csv"
[8] "./House 01 Bedroom/House 01 bedroom October 2018.csv"
[9] "./House 01 Kitchen/House 01 kit Apr 2019.csv"
[10] "./House 01 Kitchen/House 01 kit May 2019.csv"
[11] "./House 01 Kitchen/House 01 kitchen December 2018.csv"
[12] "./House 01 Kitchen/House 01 kitchen February 2019.csv"
[13] "./House 01 Kitchen/House 01 kitchen January 2019.csv"
[14] "./House 01 Kitchen/House 01 kitchen March 2019.csv"
[15] "./House 01 Kitchen/House 01 kitchen November 2018.csv"
[16] "./House 01 Kitchen/House 01 kitchen October 2018.csv"
[17] "./House 01 Living room/House 01 Liv Apr 2019.csv"
[18] "./House 01 Living room/House 01 Liv May 2019.csv"
[19] "./House 01 Living room/House 01 living room December 2018.csv"
[20] "./House 01 Living room/House 01 living room February 2018.csv"
[21] "./House 01 Living room/House 01 living room January 2018.csv"
[22] "./House 01 Living room/House 01 living room March 2018.csv"
[23] "./House 01 Living room/House 01 living room November 2018.csv"
[24] "./House 01 Living room/House 01 living room October 2018.csv"
>
library(stringr)

# Read every CSV, then remove the column named "X" from each data.frame
ldf <- lapply(filenames, read.csv)
ldf <- lapply(ldf, function(y) {
  y["X"] <- NULL
  y
})

# Name of the directory that directly contains each file,
# e.g. "House 01 Bedroom"
dn <- basename(dirname(filenames))

# Give every data.frame the same column names
cnamez <- c("Time", "DevTime", "pm2.5", "Temp", "RH", "CO2", "VOC.ppb",
            "allpol")
ldf <- lapply(ldf, setNames, cnamez)

# Label each data.frame with its directory name and stack them, keeping the
# label in an "ID" column
names(ldf) <- dn
AllData <- dplyr::bind_rows(ldf, .id = "ID")

# Split "House 01 Bedroom" into "House", "01" and the room name; n = 3 keeps
# multi-word room names such as "Living room" in one piece
tmp <- str_split_fixed(AllData$ID, " ", 3)
tmp <- tmp[, -1, drop = FALSE] # Drops the literal "House"
AllData$House <- tmp[, 1] # House number
AllData$Room <- tmp[, 2] # Room name
AllData$ID <- NULL # The ID column is no longer needed
head(AllData)
Time DevTime pm2.5 Temp RH CO2 VOC.ppb allpol House Room
1 1554073200 01/04/2019 00:00 7.320007 18.7700 48.9200 452.0 125 7.320007 01 Bedroom
2 1554073500 01/04/2019 00:05 7.550003 18.7595 48.9190 451.0 125 7.550003 01 Bedroom
3 1554073800 01/04/2019 00:10 8.240021 18.7270 48.9600 453.0 126 8.382878 01 Bedroom
4 1554074100 01/04/2019 00:15 14.450012 18.7205 48.9815 452.5 126 14.592871 01 Bedroom
5 1554074400 01/04/2019 00:20 19.740020 18.7050 48.9930 463.0 129 20.311450 01 Bedroom
6 1554074700 01/04/2019 00:25 17.210022 18.6995 48.9875 468.0 130 17.924307 01 Bedroom
答案 0 :(得分:4)
使用您提供的 SharePoint 链接,我制作了一个紧凑的最小可复现示例(MCVE),它涵盖了整个数据集中的大多数异常情况。主要问题是空的 data.frame;尽管并非所有这样的文件都在文件名中标有 (no data),但要找出它们并不难。虽然丢弃空的 data.frame 很容易,但我选择用一行 NA 填充来保留它们。如果觉得这些行碍事,也很容易将其删除。
我已经在完整的数据集上进行了尝试,并且效果很好。
# set.seed(2)
# filenames <- list.files("Foobot", recursive=TRUE, full.names=TRUE)
# filenames[sample(length(filenames), 5)][c(1, 4, 5)]
# ldf <- lapply(filenames, read.csv, stringsAsFactors=FALSE)
# s <- sapply(ldf, nrow) != 0
# ldf[s] <- lapply(ldf[s], function(x) x[sample(nrow(x), sample(2:3)),])
# ldf <- lapply(ldf, "rownames<-", NULL)
# Three example file paths of the form
# "Foobot/<house folder>/<room folder>/<file>.csv"
filenames <- c(
"Foobot/House 04 foobot data/House 04 bedroom/House 04 bed Mar 2019.csv",
"Foobot/House 03 foobot data/House 03 Living room/House 03 Liv May 2019.csv",
"Foobot/House 18 foobot data/House 18 living room/House 18 liv Feb 2019.csv")
# Hand-built stand-ins for the three files, in the same order as `filenames`:
#   1. zero rows, nine columns (including X)
#   2. two rows, eight columns (no X column)
#   3. three rows, nine columns (X is all NA)
ldf <- list(structure(list(time..s.=logical(0), Device.Local.Time=logical(0),
pm..ugm3.=logical(0), tmp..C.=logical(0), hum..pc.=logical(0),
co2..ppm.=logical(0), voc..ppb.=logical(0), allpollu....=logical(0),
X=logical(0)), class="data.frame", row.names=integer(0)),
structure(list(time..s.=c(1557342000L, 1556863500L),
Device.Local.Time=c("08/05/2019 20:00", "03/05/2019 07:05"),
pm..ugm3.=c(18.660004, 43.5), tmp..C.=c(17.73, 17.5), hum..pc.=c(55.947,
50.739), co2..ppm.=c(1187, 1003), voc..ppb.=c(328, 277),
allpollu....=c(45.99334, 59.928574)), row.names=c(NA, -2L),
class="data.frame"), structure(list(time..s.=c(1549291500L, 1550995200L,
1550111100L), Device.Local.Time=c("04/02/2019 14:45", "24/02/2019 08:00",
"14/02/2019 02:25"), pm..ugm3.=c(13.76001, 8.4700165, 11), tmp..C.=c(21.407,
16.972, 20.918), hum..pc.=c(48.643997, 55.678, 52.008), co2..ppm.=c(643, 910,
738), voc..ppb.=c(178, 251.5, 204.5), allpollu....=c(21.331438, 26.541447,
22.357143), X=c(NA, NA, NA)), row.names=c(NA, -3L), class="data.frame"))
# Row and column counts per data.frame; the first one has no rows
sapply(ldf, dim)
# [,1] [,2] [,3]
# [1,] 0 2 3
# [2,] 9 8 9
# Pad every data.frame to at least one row: each column's length is set to
# max(1, number of rows), so an empty column becomes a single NA
ldf <- lapply(ldf, function(frame) {
  target_len <- max(c(1, nrow(frame)))
  padded_cols <- lapply(frame, function(column) {
    length(column) <- target_len
    column
  })
  data.frame(padded_cols, stringsAsFactors = FALSE)
})
# Extract metadata from the directory names. The room folder is the directory
# holding the file and the house folder is its parent, so this works for any
# path depth and for a single file.
room_dir <- basename(dirname(filenames))
house_dir <- basename(dirname(dirname(filenames)))
dn <- cbind(
  sub("^(House [0-9]+) .*", "\\1", house_dir),
  tolower(sub("^House [0-9]+ ", "", room_dir))
)
# Extract metadata from the base names: month and year are the last two
# space-separated words once the extension is removed
bn <- strsplit(sub("\\.csv$", "", basename(filenames)), " ")
bn <- t(vapply(bn, function(parts) tail(parts, 2L), character(2)))
# Combine and create Date column
dtf <- data.frame(dn, bn, stringsAsFactors = FALSE)
colnames(dtf) <- c("House", "Room", "Month", "Year")
# The day is fixed to the 1st. On input %b matches abbreviated and full month
# names, but only those of the current locale.
dtf$Date <- as.Date(paste(dtf$Month, dtf$Year, 1), format = "%b %Y %d")
# Intersection across all vectors of a list: folds a pairwise intersection
# over the elements of `x`, from left to right
intsect <- function(x) {
  common <- function(a, b) {
    hits <- match(a, b, nomatch = 0L)
    unique(b[hits])
  }
  Reduce(common, x)
}
# Column names shared by every sensor data.frame, and the metadata names
ldf.cn <- intsect(lapply(ldf, colnames))
dtf.cn <- colnames(dtf)
# Pair each metadata row with its sensor data.frame; cbind() repeats the
# single metadata row for every sensor row. Only the shared sensor columns
# are kept so that all pieces line up for rbind().
ldf.cbind <- Map(function(meta, sensor) {
  combined <- cbind(c(meta), sensor, stringsAsFactors = FALSE)
  combined[, c("House", "Room", "Date", ldf.cn)]
}, split(dtf, seq_len(nrow(dtf))), ldf)
# Bind list of data.frames to one tall data.frame
ldf.rbind <- do.call(rbind, ldf.cbind)
# Parse the "day/month/year hour:minute" strings into date-times
ldf.rbind[["Device.Local.Time"]] <- as.POSIXct(
  ldf.rbind[["Device.Local.Time"]],
  format = "%d/%m/%Y %H:%M"
)
# Sanity check: report the first class of every column
vapply(ldf.rbind[1, ], function(column) class(column)[1], character(1))
# House Room Date time..s.
# "character" "character" "Date" "integer"
# Device.Local.Time pm..ugm3. tmp..C. hum..pc.
# "POSIXct" "numeric" "numeric" "numeric"
# co2..ppm. voc..ppb. allpollu....
# "numeric" "numeric" "numeric"
# Show three randomly chosen rows of the combined data.frame
ldf.rbind[sample.int(nrow(ldf.rbind), 3), ]
# House Room Date time..s. Device.Local.Time
# 1 House 04 bedroom 2019-03-01 NA <NA>
# 3.3 House 18 living room 2019-02-01 1550111100 2019-02-14 02:25:00
# 2.2 House 03 living room 2019-05-01 1556863500 2019-05-03 07:05:00
# pm..ugm3. tmp..C. hum..pc. co2..ppm. voc..ppb. allpollu....
# 1 NA NA NA NA NA NA
# 3.3 11.0 20.918 52.008 738 204.5 22.35714
# 2.2 43.5 17.500 50.739 1003 277.0 59.92857
答案 1 :(得分:1)
以下内容可以让您接近目标。在 purrr 的 map 中用 readr 的 read_csv 导入各个 CSV。用 paths 设置列表的名称,然后使用参数 .id = "path" 绑定数据框,从而加入一列列表元素名称。用 tidyr 的 separate 按 "/" 拆分路径。使用 stringr 的 str_remove,并以其他变量作为模式,从文件名中删除多余的字符串。最后,再调用一次 separate 来拆分文件名的剩余部分:
library(tidyverse)

# Relative paths of every CSV below the working directory
paths <- list.files(pattern = "\\.csv$", recursive = TRUE)

map(paths, read_csv) %>%
  # Name each data frame by its path so bind_rows() records it in `path`
  set_names(paths) %>%
  bind_rows(.id = "path") %>%
  # Paths are expected to look like "<house>/<room>/<file>"
  separate(path, into = c("house", "room", "file"), sep = "/") %>%
  mutate(
    # Strip the house and room names (ignoring case) and the extension from
    # the file name, leaving the month and year
    file = file %>%
      str_remove(regex(house, ignore_case = TRUE)) %>%
      str_remove(regex(room, ignore_case = TRUE)) %>%
      str_remove("\\.csv$") %>%
      str_trim(),
    # Must come after `file`, which still needs the full house folder name
    house = parse_number(house)
  ) %>%
  separate(file, into = c("month", "year"), convert = TRUE)
使用一些合成数据,返回如下结果:
# A tibble: 4 x 8
house room month year time var1 var2 var3
<dbl> <chr> <chr> <int> <drtn> <chr> <chr> <chr>
1 1 Kitchen Apr 2019 02:00 blah bleh bluh
2 1 Living room June 2018 12:00 blah bleh bluh
3 2 Kitchen July 2019 08:00 blah bleh bluh
4 2 Living room January 2016 16:00 blah bleh bluh