在列值

时间:2015-11-17 20:22:48

标签: r

我使用以下代码:

library(stringi)
library(data.table)
# Take the last alphanumeric run of every RES1 sequence; SRC2$items is
# checked against this set of ids.
v1 <- stri_extract_last_regex(RES1$sequence, '[[:alnum:]]+')
# Exist is 1 when the row's item is one of those ids and 0 otherwise
# (the unary + turns the logical result into an integer).
setDT(SRC2)[, Exist := +(items %chin% v1)]
# Co is, per sequenceID, the share of rows with Exist == 1, rounded to two
# decimals and stored as text. i1 collects the row numbers of every row
# except the last one of each group.
# seq_len(.N - 1L) is empty for a one-row group, whereas 1:(.N - 1) would
# evaluate to c(1, 0) and wrongly pick that group's only row.
i1 <- SRC2[, Co := as.character(round(sum(Exist) / .N, 2)),
           sequenceID][, .I[seq_len(.N - 1L)], sequenceID]$V1
# Blank Co on those rows so the share is shown only on each group's last row.
SRC2[i1, Co := '']

当 RES1 的 "sequence" 列包含普通值时:

"sequence" "support"
    "1" "<{OV50}>"   0.286
    "2" "<{OV148}>"  0.121
    "3" "<{OV46},{OV197}>" 0.065
    "4" "<{OV198},{OV199}, {OV205}>" 0.065

并且 SRC2 的 "items" 列也包含普通值:

  "sequenceID" "transactionID" "eventID" "items"
"1" 42207993       1577          1        "OV50"
"2" 42207993       6048          2        "OV11"
"3" 42207993       1597          3        "OV148"
"4" 57237976       12423         1        "OV46"
"5" 57237976       12589         2        "OV197"

输出:

"sequenceID" "transactionID" "eventID" "items"  "Exist" "Co"
"1" 42207993       1577          1        "OV50"     1
"2" 42207993       6048          2        "OV11"     0
"3" 42207993       1597          3        "OV148"    1       0.67
"4" 57237976       12423         1        "OV46"     0
"5" 57237976       12589         2        "OV197"    1       0.5

此时代码的运行结果完全符合预期。然而,"sequence" 和 "items" 两列的实际取值由很多字符组成,其中还包含特殊字符,例如:

RES1:

                                         sequence    support
1         <{EADFE20F543836047330DEFFB893127AF536560121698ADE2FCE6985E07A40D8 SELECT;DD2E595CF23E65E128560B655E0C6848 SELECT}> 0.286
2                                                                                 <{F73431225ED64969DC4BEBD06092FD6F SELECT}> 0.121
3         <{9AD4E96D7EF2FC7B64407EAF4E2274868B2C0545BFBA9F15F828D9986D484A4D CALL}, {828F0D137BDA57F7F4F02801B0E51FDEDCB610BC734FE31D26399E5CBCE651FF SELECT;5BD81A7A48EDA2B8E56100CE844D3BE7 SELECT}  > 0.065
4                                                 <{1C9AAE933F916BA94B5D2B5FA320E05D85C780CD1A9922E26BC1FB7C422F42B2 SELECT}{3FCC23C2562BE9926049EAF2D88CD3D4 SELECT;314CD91DCA8849C64DCEACBA2E3B65B7 SELECT;09E9146A444AE1C47B8E4139D6D69A48 SELECT},{184E7C8929FC9CEA72EF21D99CDC40D9 SELECT}, {184E7C8929EREREERE72EF21D99CDC40D9 SELECT}> 0.065

和SRC2:

   sequenceID   transactionID   eventID           items
1:   42207993    1577            1             EADFE20F543836047330DEFFB893127AF536560121698ADE2FCE6985E07A40D8 SELECT;DD2E595CF23E65E128560B655E0C6848 SELECT 
2:   42207993    6048            2             15873DB37BF80750C70B68A8778B9DC01D548B6D06E3BF92CADAFF289B3FCAEE CALL
3:   42207993    1597            3             F73431225ED64969DC4BEBD06092FD6F SELECT
4:   57237976    12423           1             9AD4E96D7EF2FC7B64407EAF4E2274868B2C0545BFBA9F15F828D9986D484A4D CALL
5:   57237976    12589           2             828F0D137BDA57F7F4F02801B0E51FDEDCB610BC734FE31D26399E5CBCE651FF SELECT;5BD81A7A48EDA2B8E56100CE844D3BE7 SELECT

所需的输出:

  "sequenceID" "transactionID" "eventID" "items"  "Exist" "Co"
"1" 42207993       1577          1        "<{EADFE20F543836047330DEFFB893127AF536560121698ADE2FCE6985E07A40D8 SELECT;DD2E595CF23E65E128560B655E0C6848 SELECT}>"     1
"2" 42207993       6048          2        "{15873DB37BF80750C70B68A8778B9DC01D548B6D06E3BF92CADAFF289B3FCAEE CALL}"     0
"3" 42207993       1597          3        "{F73431225ED64969DC4BEBD06092FD6F SELECT}"    1       0.67
"4" 57237976       12423         1        "{9AD4E96D7EF2FC7B64407EAF4E2274868B2C0545BFBA9F15F828D9986D484A4D CALL}"     0
"5" 57237976       12589         2        "{828F0D137BDA57F7F4F02801B0E51FDEDCB610BC734FE31D26399E5CBCE651FF SELECT;5BD81A7A48EDA2B8E56100CE844D3BE7 SELECT}"    1       0.5

我应该如何修改代码,才能处理这些特殊字符?(目前代码可以运行,但结果不符合预期。)

可以看出,前后两组表之间是一一对应的,只有 "sequence" 和 "items" 两列的内容发生了变化。因此不需要改变整体逻辑,只需要改变对 "sequence" 和 "items" 两列内容的处理方式。

1 个答案:

答案 0 :(得分:1)

我们可以尝试

# Reduce every string to its last item: split where "}" is directly followed
# by "{" or at any "[", "," or ";", keep the final piece, then drop all
# punctuation and trim surrounding whitespace.
last_item <- function(x) {
  pieces <- strsplit(x, '(?<=\\})(?=\\{)|[[,;]', perl = TRUE)
  trimws(gsub('[[:punct:]]+', '', sapply(pieces, tail, 1)))
}
# Apply the same normalisation to both sides so the values are comparable.
v1 <- last_item(RES1$sequence)
v2 <- last_item(SRC2$items)
# Exist is 1 when the row's normalised item occurs among the normalised
# sequence items and 0 otherwise (unary + converts logical to integer).
setDT(SRC2)[, Exist := +(v2 %chin% v1)]
# Co is, per sequenceID, the share of rows with Exist == 1, rounded to two
# decimals and stored as text. i1 collects the row numbers of every row
# except the last one of each group.
# seq_len(.N - 1L) is empty for a one-row group, whereas 1:(.N - 1) would
# evaluate to c(1, 0) and wrongly pick that group's only row.
i1 <- SRC2[, Co := as.character(round(sum(Exist) / .N, 2)),
           sequenceID][, .I[seq_len(.N - 1L)], sequenceID]$V1
# Blank Co on those rows so the share is shown only on each group's last row.
SRC2[i1, Co := '']