我使用以下代码:
library(stringi)
library(data.table)
# Key for each RES1 sequence: its last alphanumeric run,
# e.g. "OV197" from "<{OV46},{OV197}>".
v1 <- stri_extract_last_regex(RES1$sequence, "[[:alnum:]]+")

# Convert SRC2 to a data.table by reference and flag (1/0) the rows whose
# item appears among the sequence keys.
setDT(SRC2)[, Exist := +(items %chin% v1)]

# Per sequenceID: share of matched rows, rounded to 2 decimals, stored as text.
SRC2[, Co := as.character(round(sum(Exist) / .N, 2)), by = sequenceID]

# Row numbers of every row except the last one in each group.
# seq_len(.N - 1L) is empty for a single-row group, whereas 1:(.N - 1) would
# evaluate to c(1, 0) and wrongly select (and later blank) that only row.
i1 <- SRC2[, .I[seq_len(.N - 1L)], by = sequenceID]$V1

# Keep the ratio only on the last row of each group.
SRC2[i1, Co := ""]
当RES1的"sequence"列包含正常值时:
"sequence" "support"
"1" "<{OV50}>" 0.286
"2" "<{OV148}>" 0.121
"3" "<{OV46},{OV197}>" 0.065
"4" "<{OV198},{OV199}, {OV205}>" 0.065
且SRC2的"items"列包含正常值时:
"sequenceID" "transactionID" "eventID" "items"
"1" 42207993 1577 1 "OV50"
"2" 42207993 6048 2 "OV11"
"3" 42207993 1597 3 "OV148"
"4" 57237976 12423 1 "OV46"
"5" 57237976 12589 2 "OV197"
输出:
"sequenceID" "transactionID" "eventID" "items" "Exist" "Co"
"1" 42207993 1577 1 "OV50" 1
"2" 42207993 6048 2 "OV11" 0
"3" 42207993 1597 3 "OV148" 1 0.67
"4" 57237976 12423 1 "OV46" 0
"5" 57237976 12589 2 "OV197" 1 0.5
代码运行完美。然而,当"sequence"和"items"的值很长并且包含特殊字符时,例如:
RES1:
sequence support
1 <{EADFE20F543836047330DEFFB893127AF536560121698ADE2FCE6985E07A40D8 SELECT;DD2E595CF23E65E128560B655E0C6848 SELECT}> 0.286
2 <{F73431225ED64969DC4BEBD06092FD6F SELECT}> 0.121
3 <{9AD4E96D7EF2FC7B64407EAF4E2274868B2C0545BFBA9F15F828D9986D484A4D CALL}, {828F0D137BDA57F7F4F02801B0E51FDEDCB610BC734FE31D26399E5CBCE651FF SELECT;5BD81A7A48EDA2B8E56100CE844D3BE7 SELECT} > 0.065
4 <{1C9AAE933F916BA94B5D2B5FA320E05D85C780CD1A9922E26BC1FB7C422F42B2 SELECT}{3FCC23C2562BE9926049EAF2D88CD3D4 SELECT;314CD91DCA8849C64DCEACBA2E3B65B7 SELECT;09E9146A444AE1C47B8E4139D6D69A48 SELECT},{184E7C8929FC9CEA72EF21D99CDC40D9 SELECT}, {184E7C8929EREREERE72EF21D99CDC40D9 SELECT}> 0.065
和SRC2:
sequenceID transactionID eventID items
1: 42207993 1577 1 EADFE20F543836047330DEFFB893127AF536560121698ADE2FCE6985E07A40D8 SELECT;DD2E595CF23E65E128560B655E0C6848 SELECT
2: 42207993 6048 2 15873DB37BF80750C70B68A8778B9DC01D548B6D06E3BF92CADAFF289B3FCAEE CALL
3: 42207993 1597 3 F73431225ED64969DC4BEBD06092FD6F SELECT
4: 57237976 12423 1 9AD4E96D7EF2FC7B64407EAF4E2274868B2C0545BFBA9F15F828D9986D484A4D CALL
5: 57237976 12589 2 828F0D137BDA57F7F4F02801B0E51FDEDCB610BC734FE31D26399E5CBCE651FF SELECT;5BD81A7A48EDA2B8E56100CE844D3BE7 SELECT
所需的输出:
"sequenceID" "transactionID" "eventID" "items" "Exist" "Co"
"1" 42207993 1577 1 "<{EADFE20F543836047330DEFFB893127AF536560121698ADE2FCE6985E07A40D8 SELECT;DD2E595CF23E65E128560B655E0C6848 SELECT}>" 1
"2" 42207993 6048 2 "{15873DB37BF80750C70B68A8778B9DC01D548B6D06E3BF92CADAFF289B3FCAEE CALL}" 0
"3" 42207993 1597 3 "{F73431225ED64969DC4BEBD06092FD6F SELECT}" 1 0.67
"4" 57237976 12423 1 "{9AD4E96D7EF2FC7B64407EAF4E2274868B2C0545BFBA9F15F828D9986D484A4D CALL}" 0
"5" 57237976 12589 2 "{828F0D137BDA57F7F4F02801B0E51FDEDCB610BC734FE31D26399E5CBCE651FF SELECT;5BD81A7A48EDA2B8E56100CE844D3BE7 SELECT}" 1 0.5
我需要如何更改代码才能处理特殊字符?(代码可以运行,但结果不符合预期。)
可以看出,两个表之间仍然是一对一的关系,只是"sequence"和"items"列的内容发生了变化。因此无需改变整体逻辑,只需改变对"sequence"和"items"列内容的处理方式。
答案 0 :(得分:1)
我们可以尝试
# Reduce each string to a comparable key:
#   1. split at "}{" boundaries and at any "[", "," or ";" character,
#   2. keep the last piece,
#   3. drop all punctuation and trim surrounding whitespace.
# Returns a character vector the same length as `x`.
last_item_key <- function(x) {
  pieces <- strsplit(x, '(?<=\\})(?=\\{)|[[,;]', perl = TRUE)
  last_piece <- vapply(
    pieces,
    # strsplit() yields character(0) for an empty string; map that to ""
    # so the result always has exactly one key per input element.
    function(p) if (length(p) > 0L) p[[length(p)]] else "",
    character(1)
  )
  trimws(gsub("[[:punct:]]+", "", last_piece))
}

# Build keys the same way on both sides so they can be compared directly.
v1 <- last_item_key(RES1$sequence)
v2 <- last_item_key(SRC2$items)

# Convert SRC2 to a data.table by reference and flag (1/0) the rows whose
# key appears among the sequence keys.
setDT(SRC2)[, Exist := +(v2 %chin% v1)]

# Per sequenceID: share of matched rows, rounded to 2 decimals, stored as text.
SRC2[, Co := as.character(round(sum(Exist) / .N, 2)), by = sequenceID]

# Row numbers of every row except the last one in each group.
# seq_len(.N - 1L) is empty for a single-row group, whereas 1:(.N - 1) would
# evaluate to c(1, 0) and wrongly select (and later blank) that only row.
i1 <- SRC2[, .I[seq_len(.N - 1L)], by = sequenceID]$V1

# Keep the ratio only on the last row of each group.
SRC2[i1, Co := ""]