在R中统计Excel单元格内逗号分隔的数字的个数

时间:2015-02-23 04:04:45

标签: r excel

我有一个.csv文件,正在用R读取。其中有一列的单元格内容如下(举例):

单元格C1 = 2,3 C2 = 1,2,3,4 C3 = 1等等......

编辑:C1表示C列第1行。

我只想在R中得到这些单元格里数字的个数(长度)。该怎么做?

有没有人有任何线索?

从excel中读取数据。

# Read the CSV file; the first line is treated as the header row.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data <- read.csv("location", header = TRUE)

我需要计算其中一列数据里每个单元格的长度(即逗号分隔的元素个数)。

V24

1,2,3,4




1,2,3,4

1,4,2,3


1,2,4,3




1,3,2,4

4,3,1,2

这个数据太大了;因此我无法将其粘贴在这里。

snap shot of the data; 12 columns and 35 rows.

编辑1

dput(string_data)
structure(list(v_1 = c(NA, NA, NA, NA, 3L, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
1L, NA, NA, NA, NA, NA, 2L, NA, NA, NA, 1L, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0L, NA, NA, NA, NA, 2L
), v_2 = c(NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 3L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3L, NA, NA, 
NA, NA, NA, 2L, NA, NA, NA, 3L, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, 4L, NA, NA, NA, NA, 2L), v_3 = structure(c(1L, 
1L, 1L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 4L, 
1L, 1L, 1L, 7L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "1,4", "2", 
"2,1", "2,4", "3", "4"), class = "factor"), v_4 = c(NA, NA, NA, 
NA, NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0L, NA, NA, NA, 
NA, NA, NA, 0L, 0L, NA, NA, NA, NA, 0L, 2L, NA, 0L, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA), v_5 = c(NA, NA, NA, NA, NA, 0L, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA, NA, NA, NA, 
2L, 2L, NA, NA, NA, NA, 2L, 0L, NA, 0L, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA), v_6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA), v_7 = c(NA, NA, NA, NA, 0L, NA, NA, NA, NA, NA, NA, NA, 
1L, NA, NA, NA, NA, 0L, NA, NA, 0L, NA, NA, 1L, NA, NA, 0L, 0L, 
NA, NA, NA, NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 0L, 
NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, 1L, NA, NA), v_8 = c(NA, 
NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, 0L, NA, NA, NA, NA, 
1L, NA, NA, 1L, NA, NA, 0L, NA, NA, 1L, 1L, NA, NA, NA, NA, 1L, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0L, 1L, NA, NA, NA, NA, 0L, 
NA, NA, NA, NA, NA, NA, 0L, NA, NA), v_9 = c(NA, NA, NA, NA, 
1L, NA, NA, NA, NA, NA, NA, NA, 4L, NA, NA, NA, NA, 1L, NA, NA, 
3L, NA, NA, 4L, NA, NA, 3L, 3L, NA, NA, NA, NA, 3L, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, 4L, 3L, NA, NA, NA, NA, 4L, NA, NA, NA, 
NA, NA, NA, 4L, NA, NA), v_10 = c(NA, 5L, NA, NA, NA, 0L, 3L, 
NA, 3L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 3L, NA, NA, 3L, NA, NA, NA, 
NA, NA, NA), v_11 = c(NA, 0L, NA, NA, NA, 0L, 2L, NA, 2L, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, 2L, NA, NA, 2L, NA, NA, NA, NA, NA, NA
), v_12 = structure(c(1L, 4L, 1L, 1L, 1L, 1L, 2L, 1L, 3L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("", "3", "4", "4,1,3"), class = "factor")), .Names = c("v_1", 
"v_2", "v_3", "v_4", "v_5", "v_6", "v_7", "v_8", "v_9", "v_10", 
"v_11", "v_12"), class = "data.frame", row.names = c(NA, -58L
))

2 个答案:

答案 0 :(得分:2)

在基础R中,read.table内部用到的一个函数是count.fields,您可以像这样使用它(使用@akrun的样本数据):

# textConnection() returns an already-open connection, which count.fields()
# does not close; keep a handle so it can be closed explicitly afterwards.
con <- textConnection(dat1$V24)
# One field count per element; blank.lines.skip = FALSE keeps empty
# strings in the result (as 0) instead of dropping them.
count.fields(con, sep = ",", blank.lines.skip = FALSE)
#  [1] 0 4 1 0 0 1 4 0 4 0 0 3 1 0 0 1 3
close(con)

把结果中的0替换为NA应该非常简单。

请注意,这与@akrun的方法不同,因为count.fields的用途是计算数据集中应该有多少列。因此," "(单个空格)与空字符串并不相同,所以我的结果中有一些值为1,而@akrun的结果中没有。您可以使用gsub("\\s+", "", dat1$V24)去掉这些空白。

答案 1 :(得分:0)

我们可以用str_trim删除前导/尾随空格(如果有),再用str_count统计元素的个数(假定分隔符为逗号,)。空的行可以通过nzchar找到,只需一点算术,我们就可以把这些元素变成NA:

library(stringr)
dat1$V24 <- str_trim(dat1$V24)
with(dat1, str_count(V24, ',')+1 * NA^!nzchar(V24))
#[1] NA 4 NA NA NA NA 4 NA 4 NA NA 3 NA NA NA NA 3

或者,stringi中的类似函数会更快:

library(stringi)
dat1$V24 <- stri_trim_both(dat1$V24)
with(dat1, stri_count(V24, fixed= ',')+1 * NA^!nzchar(V24))
#[1] NA 4 NA NA NA NA 4 NA 4 NA NA 3 NA NA NA NA 3

 # Column positions 1, 4, 7, ... of dat2 (every third column).
 indx <- seq(from = 1, to = ncol(dat2), by = 3)
 # Per selected column: strip surrounding whitespace, count the
 # comma-separated items, and report NA for cells that end up empty.
 lapply(dat2[indx], function(x) {
   cells <- str_trim(x)
   n_items <- str_count(cells, ',') + 1
   n_items[!nzchar(cells)] <- NA
   n_items
 })
 #$V1
 #[1] 1 4 1 1 3

 #$V4
 #[1]  4  1  2  3 NA

更新

如果要对数据集中每隔3列的列(第1、4、7、…列)执行此操作,选出的列如下:

  # Print the columns of dat2 selected by indx; the expected console
  # output is reproduced in the comment lines below.
  dat2[indx]
  #      V1       V4
  #1       1  1,2,5,6
  #2 1,2,3,4        1
  #3      3       1,2
  #4       1 15,23,24
  #5   1,2,3         

如果缺失的单元格是NA而不是空字符串(例如dat3),可以改用is.na来判断:

  # Per selected column of dat3: strip surrounding whitespace, count the
  # comma-separated items, and keep NA for cells that are missing.
  lapply(dat3[indx], function(x) {
    cells <- str_trim(x)
    n_items <- str_count(cells, ',') + 1
    n_items[is.na(cells)] <- NA
    n_items
  })
  #$V1
  #[1]  1  4  1 NA  3

  #$V4
  #[1]  4 NA  2  3 NA

UPDATE2

dput

UPDATE3

根据string_data的dput输出,只有两列(第3列和第12列)是factor类,其元素是字符串,例如2,4、1,4等。

 # Flag the factor columns. vapply() guarantees exactly one logical per
 # column, whereas sapply()'s return type depends on its input.
 indx1 <- vapply(string_data, is.factor, logical(1))
 # Per factor column: strip surrounding whitespace, count the
 # comma-separated items, and report NA for cells that end up empty.
 lapply(string_data[indx1], function(x) {
   r1 <- str_trim(x)
   n_items <- str_count(r1, ',') + 1
   n_items[!nzchar(r1)] <- NA
   n_items
 })
 #$v_3
 #[1] NA NA NA NA  1 NA NA NA NA NA NA NA NA NA NA  1 NA NA NA NA NA NA NA NA NA
 #[26] NA  1 NA NA NA NA NA  2 NA NA NA  1 NA NA NA NA NA NA NA NA NA NA NA NA NA
 #[51] NA NA  2 NA NA NA NA  2

#$v_12
#[1] NA  3 NA NA NA NA  1 NA  1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
#[26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA  1 NA
#[51] NA  1 NA NA NA NA NA NA

所有其他变量都是integer或logical类。

数据

 # Single-column sample: empty cells, whitespace-only cells and
 # comma-separated number lists, stored as character (not factor).
 dat1 <- data.frame(
   V24 = c(
     "", "1,2,3,4", " ", "", "", " ",
     "1,2,3,4", "", "1,4,2,3", "", "", "1,2,4", " ", "", "", " ",
     "1,3,2"
   ),
   stringsAsFactors = FALSE
 )


 # Five-column sample: V1 and V4 hold comma-separated lists as character
 # strings (one cell has trailing whitespace, one is whitespace only);
 # V2, V3 and V6 are integer sequences.
 dat2 <- data.frame(
   V1 = c("1", "1,2,3,4", "3 ", "1", "1,2,3"),
   V2 = 1:5,
   V3 = 6:10,
   V4 = c("1,2,5,6", "1", "1,2", "15,23,24", " "),
   V6 = 11:15,
   stringsAsFactors = FALSE
 )

 # Four-column sample like dat2, but with missing cells encoded as NA
 # instead of empty or whitespace-only strings.
 dat3 <- data.frame(
   V1 = c("1", "1,2,3,4", "3 ", NA, "1,2,3"),
   V2 = 1:5,
   V3 = 6:10,
   V4 = c("1,2,5,6", NA, "1,2", "15,23,24", NA),
   stringsAsFactors = FALSE
 )