从CSV文件中解析并提取数据

时间:2017-11-10 14:28:30

标签: r csv

我正在尝试从此csv文件中提取数据 -

"NA,NA,NA,\"KMI for Lessons Learnt\",NA,\"0\",\"0\",\"7\",\"7\",\"0\",\"7\",\"7\",\"7\""
"NA,NA,NA,\"KMI for Knowledge Documents\",NA,\"0\",\"0\",\"7\",\"10\",\"0\",\"4.1999999999999993\",\"10\",\"10\""
"(Annual Basis)\",NA,\"KMI for Innovation (Overall for IPR & MIA/MRA)\",NA,\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"8.5399999999999991\""
"NA,NA,NA,\"KMI for IPR\",NA,\"10\",\"10\",\"0\",\"3\",\"10\",\"10\",\"3\",\"8.6\""
"NA,NA,NA,\"KMI for MIA/MRA\",NA,\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"8.1999999999999993\""
"NA,NA,NA,\"KMI for Knowledge gained from ext. experts\",NA,\"10\",\"0\",\"0\",\"0\",\"10\",\"10\",\"0\",\"10\""
"NA,NA,NA,\"KMI for Technical papers publication\",NA,\"10\",\"0\",\"10\",\"0\",\"10\",\"10\",\"10\",\"10\""
"NA,NA,NA,\"Overall KMI\",NA,\"5.5\",\"2.5\",\"4.6500000000000004\",\"4.5\",\"5.5\",\"8.09\",\"6\",\"8.8849999999999998\""

我只想要行名称和数值,并跳过其中的NA字符串和连字符。输出应该类似于:

\"KMI for Lessons Learnt\",\"0\",\"0\",\"7\",\"7\",\"0\",\"7\",\"7\",\"7\""

也就是说,从“KMI”字符串开始读取,并删除所有出现的NA和连字符。

任何帮助都将不胜感激。

感谢。

我已经编辑了这个CSV文件。希望它看起来不像以前那么难看。

NA,NA,NA,KMI for Lessons Learnt,NA,0,0,7,7,0,7,7,7
NA,NA,NA,KMI for Knowledge Documents,NA,0,0,7,10,0,4.1999999999999993,10,10
(Annual Basis),NA,KMI for Innovation (Overall for IPR & MIA/MRA),NA,NA,NA,NA,NA,NA,NA,NA,8.5399999999999991
NA,NA,NA,KMI for IPR,NA,10,10,0,3,10,10,3,8.6
NA,NA,NA,KMI for MIA/MRA,NA,NA,NA,NA,NA,NA,NA,NA,8.1999999999999993
NA,NA,NA,KMI for Knowledge gained from ext. experts,NA,10,0,0,0,10,10,0,10
NA,NA,NA,KMI for Technical papers publication,NA,10,0,10,0,10,10,10,10
NA,NA,NA,Overall KMI ,NA,5.5,2.5,4.6500000000000004,4.5,5.5,8.09,6,8.8849999999999998

2 个答案:

答案 0 :(得分:1)

这很难看,但数据也是如此:

library(data.table)

# Read the raw file as a character vector, one element per line.
x <- readLines('testFile.csv')

# Convert one raw line of the file into a one-row data frame of its fields.
#
# line: a single character string (one line of the input file).
# Returns a data.frame with one row and one character column (V1, V2, ...)
# per remaining comma-separated field.
cleanLines <- function(line) {
  # The leading '.*' is greedy, so everything before the LAST "KMI" on the
  # line is discarded; a line without "KMI" is left untouched.
  from_kmi <- gsub('.*(KMI.*)', '\\1', line)

  # Drop every ",NA" field first, then strip the quote/backslash characters
  # matched by the bracket class.
  cleaned <- gsub('[\\"]', '', gsub(',NA', '', from_kmi))

  fields <- strsplit(cleaned, ',')[[1]]

  # Fill a 1 x n matrix so that each field ends up in its own column.
  row_mat <- matrix(ncol = length(fields))
  row_mat[1, ] <- fields

  as.data.frame(row_mat, stringsAsFactors = FALSE)
}

# Parse every line into its own one-row data frame.
parsed <- lapply(x, cleanLines)
# Stack the per-line rows into a single data.table.
df <- rbindlist(parsed)

输出df:

                                               V1  V2  V3                 V4  V5  V6                 V7 V8                 V9
1:                         KMI for Lessons Learnt   0   0                  7   7   0                  7  7                  7
2:                    KMI for Knowledge Documents   0   0                  7  10   0 4.1999999999999993 10                 10
3: KMI for Innovation (Overall for IPR & MIA/MRA)   -   -                  -   -   -                  -  - 8.5399999999999991
4:                                    KMI for IPR  10  10                  0   3  10                 10  3                8.6
5:                                KMI for MIA/MRA   -   -                  -   -   -                  -  - 8.1999999999999993
6:     KMI for Knowledge gained from ext. experts  10   0                  0   0  10                 10  0                 10
7:           KMI for Technical papers publication  10   0                 10   0  10                 10 10                 10
8:                                            KMI 5.5 2.5 4.6500000000000004 4.5 5.5               8.09  6 8.8849999999999998

答案 1 :(得分:0)

library(stringi)
library(tidyverse)

# Read the file as a character vector of lines.
raw_lines <- read_lines("so.csv")
# Remove every literal backslash.
unescaped <- stri_replace_all_fixed(raw_lines, "\\", "")
# Strip a double quote at the very start and/or very end of each line.
bare_lines <- stri_replace_all_regex(unescaped, '(^"|"$)', '')
# Re-join the lines and let read.csv() parse them as headerless CSV text.
csv_text <- paste0(bare_lines, collapse = "\n")
read.csv(text = csv_text, header = FALSE, stringsAsFactors = FALSE)

##   V1 V2 V3                                         V4 V5   V6  V7    V8  V9  V10   V11 V12    V13
## 1 NA NA NA KMI for Knowledge gained from ext. experts NA 10.0 0.0  0.00 0.0 10.0 10.00   0 10.000
## 2 NA NA NA       KMI for Technical papers publication NA 10.0 0.0 10.00 0.0 10.0 10.00  10 10.000
## 3 NA NA NA                                Overall KMI NA  5.5 2.5  4.65 4.5  5.5  8.09   6  8.885