我有一个 .csv 文件,正在用 R 读取。其中有一列的单元格内容如下,假设:
单元格 C1 = 2,3,C2 = 1,2,3,4,C3 = 1,等等……
编辑:C1 表示 C 列第 1 行。
我只想在 R 中得到这些单元格中数字的个数(长度)。如何做到这一点?
有没有人有任何线索?
从excel中读取数据。
# Read the CSV file; the first row is treated as the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data <- read.csv("location", header = TRUE)
我需要计算其中一列数据中每个单元格的长度(元素个数)。
V24
1,2,3,4
1,2,3,4
1,4,2,3
1,2,4,3
1,3,2,4
4,3,1,2
这个数据太大了;因此我无法将其粘贴在这里。
编辑1 :
dput(string_data)
structure(list(v_1 = c(NA, NA, NA, NA, 3L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA, 2L, NA, NA, NA, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0L, NA, NA, NA, NA, 2L
), v_2 = c(NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 3L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3L, NA, NA,
NA, NA, NA, 2L, NA, NA, NA, 3L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 4L, NA, NA, NA, NA, 2L), v_3 = structure(c(1L,
1L, 1L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 4L,
1L, 1L, 1L, 7L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "1,4", "2",
"2,1", "2,4", "3", "4"), class = "factor"), v_4 = c(NA, NA, NA,
NA, NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0L, NA, NA, NA,
NA, NA, NA, 0L, 0L, NA, NA, NA, NA, 0L, 2L, NA, 0L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), v_5 = c(NA, NA, NA, NA, NA, 0L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA, NA, NA, NA,
2L, 2L, NA, NA, NA, NA, 2L, 0L, NA, 0L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), v_6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), v_7 = c(NA, NA, NA, NA, 0L, NA, NA, NA, NA, NA, NA, NA,
1L, NA, NA, NA, NA, 0L, NA, NA, 0L, NA, NA, 1L, NA, NA, 0L, 0L,
NA, NA, NA, NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 0L,
NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, 1L, NA, NA), v_8 = c(NA,
NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, 0L, NA, NA, NA, NA,
1L, NA, NA, 1L, NA, NA, 0L, NA, NA, 1L, 1L, NA, NA, NA, NA, 1L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0L, 1L, NA, NA, NA, NA, 0L,
NA, NA, NA, NA, NA, NA, 0L, NA, NA), v_9 = c(NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA, NA, NA, 4L, NA, NA, NA, NA, 1L, NA, NA,
3L, NA, NA, 4L, NA, NA, 3L, 3L, NA, NA, NA, NA, 3L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 4L, 3L, NA, NA, NA, NA, 4L, NA, NA, NA,
NA, NA, NA, 4L, NA, NA), v_10 = c(NA, 5L, NA, NA, NA, 0L, 3L,
NA, 3L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 3L, NA, NA, 3L, NA, NA, NA,
NA, NA, NA), v_11 = c(NA, 0L, NA, NA, NA, 0L, 2L, NA, 2L, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 2L, NA, NA, 2L, NA, NA, NA, NA, NA, NA
), v_12 = structure(c(1L, 4L, 1L, 1L, 1L, 1L, 2L, 1L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("", "3", "4", "4,1,3"), class = "factor")), .Names = c("v_1",
"v_2", "v_3", "v_4", "v_5", "v_6", "v_7", "v_8", "v_9", "v_10",
"v_11", "v_12"), class = "data.frame", row.names = c(NA, -58L
))
答案 0 :(得分:2)
在基础 R 中,read.table 内部使用的函数是 count.fields,您可以像这样使用它(使用 @akrun 的样本数据):
# Count the comma-separated fields in each element of V24.
# blank.lines.skip = FALSE keeps empty strings in the result (counted as 0),
# so the output has one entry per row.
con <- textConnection(dat1$V24)
count.fields(con, sep = ",", blank.lines.skip = FALSE)
# [1] 0 4 1 0 0 1 4 0 4 0 0 3 1 0 0 1 3
# textConnection() returns an already-open connection, which count.fields()
# leaves open, so close it explicitly to avoid leaking it.
close(con)
把 0 替换为 NA 应该非常简单。
请注意,这与 @akrun 的方法不同,因为该函数的本意是计算数据集中应该有多少列。因此," "(单个空格)与空字符串不同,所以我的结果中出现了值 1,而 @akrun 的结果中没有。您可以使用
# Remove every run of whitespace so that blank-looking cells become "".
gsub("\\s+", "", dat1$V24)
来摆脱这些。
答案 1 :(得分:0)
我们可以用 str_trim 删除前导/尾随空格(如果有),用 str_count 计算元素的数量(假定分隔符为 ,)。
空的行可以通过 nzchar 找到,
再用一点算术,我们就可以把这些元素变成 NA。
library(stringr)
# Strip leading/trailing whitespace so blank cells become empty strings.
dat1$V24 <- str_trim(dat1$V24)
# Elements per cell = number of commas + 1.
# NA^FALSE is 1 and NA^TRUE is NA, so multiplying by NA^(!nzchar(V24)) keeps
# the count for non-empty cells and turns empty cells into NA.
# The parentheses make the intended grouping explicit instead of relying on
# the low precedence of `!`.
with(dat1, (str_count(V24, ",") + 1) * NA^(!nzchar(V24)))
#[1] NA 4 NA NA NA NA 4 NA 4 NA NA 3 NA NA NA NA 3
stringi
library(stringi)
# Strip leading/trailing whitespace so blank cells become empty strings.
dat1$V24 <- stri_trim_both(dat1$V24)
# Elements per cell = number of commas + 1 (fixed, non-regex match).
# NA^FALSE is 1 and NA^TRUE is NA, so the multiplier turns empty cells into NA.
# The parentheses make the intended grouping explicit instead of relying on
# the low precedence of `!`.
with(dat1, (stri_count(V24, fixed = ",") + 1) * NA^(!nzchar(V24)))
#[1] NA 4 NA NA NA NA 4 NA 4 NA NA 3 NA NA NA NA 3
(如上所示,stringi 中的类似函数会更快。)
# Positions of every 3rd column of dat2, starting from the first column.
indx <- seq(1, ncol(dat2), by = 3)
# For each selected column: trim whitespace, then count the comma-separated
# elements (commas + 1). Cells that are empty after trimming become NA because
# NA^TRUE is NA while NA^FALSE is 1.
lapply(dat2[indx], function(x) {
  r1 <- str_trim(x)
  (str_count(r1, ",") + 1) * NA^(!nzchar(r1))
})
#$V1
#[1] 1 4 1 1 3
#$V4
#[1] 4 1 2 3 NA
(如果要对数据集的每第 3 列执行此操作,可以使用上面的代码。)
# Print the columns of dat2 selected by indx; the printed result follows.
dat2[indx]
# V1 V4
#1 1 1,2,5,6
#2 1,2,3,4 1
#3 3 1,2
#4 1 15,23,24
#5 1,2,3
其中,dat2[indx] 的内容如上所示。
# Same count for dat3, whose selected columns contain NA instead of blanks.
# Elements per cell = commas + 1; the NA^is.na(r1) multiplier is NA for
# missing cells and 1 otherwise, so missing cells stay NA.
# The parentheses make the intended grouping explicit.
lapply(dat3[indx], function(x) {
  r1 <- str_trim(x)
  (str_count(r1, ",") + 1) * NA^(is.na(r1))
})
#$V1
#[1] 1 4 1 NA 3
#$V4
#[1] 4 NA 2 3 NA
根据 string_data 的 dput 输出,只有两列(第 3 列和第 12 列)是 factor 类,
它们是字符串元素,例如 2,4、1,4 等。
# Logical mask of the factor columns of string_data. vapply() is used instead
# of sapply() so the result is always a logical vector.
indx1 <- vapply(string_data, is.factor, logical(1))
# For each factor column: trim whitespace, then count the comma-separated
# elements (commas + 1). Empty cells become NA because NA^TRUE is NA while
# NA^FALSE is 1; the parentheses make the intended grouping explicit.
lapply(string_data[indx1], function(x) {
  r1 <- str_trim(x)
  (str_count(r1, ",") + 1) * NA^(!nzchar(r1))
})
#$v_3
#[1] NA NA NA NA 1 NA NA NA NA NA NA NA NA NA NA 1 NA NA NA NA NA NA NA NA NA
#[26] NA 1 NA NA NA NA NA 2 NA NA NA 1 NA NA NA NA NA NA NA NA NA NA NA NA NA
#[51] NA NA 2 NA NA NA NA 2
#$v_12
#[1] NA 3 NA NA NA NA 1 NA 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
#[26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1 NA
#[51] NA 1 NA NA NA NA NA NA
所有其他变量都是 integer 或 logical 类型。
# Sample data used by the snippets in this answer.

# dat1: a single character column mixing empty strings, blank strings and
# comma-separated values.
dat1 <- data.frame(
  V24 = c(
    "", "1,2,3,4", " ", "", "", " ", "1,2,3,4", "", "1,4,2,3", "", "",
    "1,2,4", " ", "", "", " ", "1,3,2"
  ),
  stringsAsFactors = FALSE
)

# dat2: comma-separated character columns (V1, V4) alongside integer columns;
# V4 ends with a blank cell.
dat2 <- data.frame(
  V1 = c("1", "1,2,3,4", "3 ", "1", "1,2,3"),
  V2 = 1:5,
  V3 = 6:10,
  V4 = c("1,2,5,6", "1", "1,2", "15,23,24", " "),
  V6 = 11:15,
  stringsAsFactors = FALSE
)

# dat3: like dat2 but with NA in place of some cells of V1 and V4.
dat3 <- data.frame(
  V1 = c("1", "1,2,3,4", "3 ", NA, "1,2,3"),
  V2 = 1:5,
  V3 = 6:10,
  V4 = c("1,2,5,6", NA, "1,2", "15,23,24", NA),
  stringsAsFactors = FALSE
)