从非结构化数组中提取最小值和最大值

时间:2015-08-13 01:13:13

标签: r

我有以下向量,它显示了变量可以采用的可能值。正如您所看到的,它不是用户友好的,我很难找到一种系统的方法来确定最小值和最大值。有没有人有什么建议?

[211] "-1\n1-960"                                                         "-1\n1-960"                                                        
[213] "-1\n1-960"                                                         "-1\n1\n2\n3"                                                      
[215] "-1\n0\n1\n\n2\n3\n\n4\n\n5"                                        "-1\nF\nG\nH\nP\nR\nS\nU"                                          
[217] "-1\n0\n1\n2\n3"                                                    "-1\n0\n1"                                                         
[219] "-1\n0\n1\n2\n3\n4\n5\n6"                                           "-1\n0-255"                                                        
[221] "-1\n0-255"                                                         "-1\n0-255"                                                        
[223] "-1\n0-255"                                                         "-1\n0-255"                                                        
[225] "-1\n0\n0.01–0.99\n1\n1.01–99.99"                                   "-1\n0\n1\n2\n3\n4\n5\n\n6\n\n7\n8\n\n9\n10\n11\n12"               
[227] "-1\n0\n1\n\n2\n\n3\n4\n5\n\n6"                                     "-1\n0\n1\n2\n\n3\n\n4\n5\n6"    

值 "-1\n1-960" 指的是 1 到 960 之间的取值范围。-1 不代表任何含义,应当忽略;所有字母也应当忽略。

例如:

"-1\n1-960" 
"-1\n0\n1\n\n2\n\n3\n4\n5\n\n6"                                     "-1\n0\n1\n2\n\n3\n\n4\n5\n6" 

应该导致:

max    min
960    1
6      0 
6      0

2 个答案:

答案 0 :(得分:1)

删除前导的 -1 后,您可以按换行符拆分。然后,由于 - 表示范围,您还可以按 - 字符拆分,因为这两个数字正好给出了范围的最小值和最大值。下面是一些代码:

 # Strip the leading "-1" placeholder, split every entry on newlines and on
 # range separators (ASCII hyphen or en dash, "\u2013"), then compare the
 # pieces as numbers. Comparing them as strings sorts lexicographically
 # ("9" > "12") and lets "" or letters become the min/max, so every
 # non-numeric piece is dropped before taking the range.
 lapply(
   strsplit(
     gsub("^-1\n", "", dat),
     "\n|-|\u2013"
   ),
   function(x) {
     # Blank pieces and letters become NA here; the coercion warning is expected.
     values <- suppressWarnings(as.numeric(x))
     values <- values[!is.na(values)]
     # Entries without any number (letters only) have no numeric range.
     if (length(values) == 0L) c(NA_real_, NA_real_) else range(values)
   }
 )

[[1]]
[1]   1 960

[[2]]
[1]   1 960

[[3]]
[1]   1 960

[[4]]
[1] 1 3

[[5]]
[1] 0 5

[[6]]
[1] NA NA

[[7]]
[1] 0 3

[[8]]
[1] 0 1

[[9]]
[1] 0 6

[[10]]
[1]   0 255

[[11]]
[1]   0 255

[[12]]
[1]   0 255

[[13]]
[1]   0 255

[[14]]
[1]   0 255

[[15]]
[1]  0.00 99.99

[[16]]
[1]  0 12

[[17]]
[1] 0 6

[[18]]
[1] 0 6

答案 1 :(得分:0)

使用可能会或可能不是部分答案的其他代码扩展我的评论:

我猜-255是某种缺失值标记。其中一些字符值(目前)可以在R中解析为“数字”值,但如果您尝试解析,则其他值会抛出错误。您对1-960的期望是什么?这是一个表达式,所以既不是数字也不是字符。

# Sample input: each element lists the values a variable may take, separated
# by "\n". Every element starts with "-1"; some pieces are written as ranges
# (e.g. "1-960"), and one element uses an en dash as the range separator.
dat <- c(
  rep("-1\n1-960", times = 3),
  "-1\n1\n2\n3",
  "-1\n0\n1\n\n2\n3\n\n4\n\n5",
  "-1\nF\nG\nH\nP\nR\nS\nU",
  "-1\n0\n1\n2\n3",
  "-1\n0\n1",
  "-1\n0\n1\n2\n3\n4\n5\n6",
  rep("-1\n0-255", times = 5),
  "-1\n0\n0.01–0.99\n1\n1.01–99.99",
  "-1\n0\n1\n2\n3\n4\n5\n\n6\n\n7\n8\n\n9\n10\n11\n12",
  "-1\n0\n1\n\n2\n\n3\n4\n5\n\n6",
  "-1\n0\n1\n2\n\n3\n\n4\n5\n6"
)

# Try to read every entry as whitespace-separated numbers. `scan(text = )`
# manages (and closes) its own text connection, whereas a bare
# `textConnection(x)` passed to scan() is never closed and leaks one
# connection per element. Entries that are not purely numeric (letters,
# ranges such as "1-960") make scan() fail; try() lets the loop continue and
# keeps a "try-error" object for them, with silent = TRUE hiding the messages.
# simplify = FALSE guarantees a list named by the original strings.
scandat <- sapply(
  dat,
  function(x) try(scan(text = x, quiet = TRUE), silent = TRUE),
  simplify = FALSE
)
# So these are the items that could be parsed as numeric:

> scandat[vapply(scandat, function(item) class(item) == "numeric", logical(1))]
$`-1\n1\n2\n3`
[1] -1  1  2  3

$`-1\n0\n1\n\n2\n3\n\n4\n\n5`
[1] -1  0  1  2  3  4  5

$`-1\n0\n1\n2\n3`
[1] -1  0  1  2  3

$`-1\n0\n1`
[1] -1  0  1

$`-1\n0\n1\n2\n3\n4\n5\n6`
[1] -1  0  1  2  3  4  5  6

$`-1\n0\n1\n2\n3\n4\n5\n\n6\n\n7\n8\n\n9\n10\n11\n12`
 [1] -1  0  1  2  3  4  5  6  7  8  9 10 11 12

$`-1\n0\n1\n\n2\n\n3\n4\n5\n\n6`
[1] -1  0  1  2  3  4  5  6

$`-1\n0\n1\n2\n\n3\n\n4\n5\n6`
[1] -1  0  1  2  3  4  5  6

我不打算清理它,但你可以把那些古怪的名称替换成别的东西,这样打印出来会更好看:

> sapply(
+   Filter(function(item) identical(class(item), "numeric"), scandat),
+   function(values) list(minx = min(values), maxx = max(values))
+ )
     -1\n1\n2\n3 -1\n0\n1\n\n2\n3\n\n4\n\n5 -1\n0\n1\n2\n3 -1\n0\n1 -1\n0\n1\n2\n3\n4\n5\n6
minx -1          -1                         -1             -1       -1                     
maxx 3           5                          3              1        6                      
     -1\n0\n1\n2\n3\n4\n5\n\n6\n\n7\n8\n\n9\n10\n11\n12 -1\n0\n1\n\n2\n\n3\n4\n5\n\n6 -1\n0\n1\n2\n\n3\n\n4\n5\n6
minx -1                                                 -1                            -1                         
maxx 12                                                 6                             6