我有一个包含三个变量的数据框:theta、rho 和 score。我想过滤掉“几乎”重复的行。
例如:如果某一行的 theta 值与另一行相差不超过 10,并且 rho 值相差不超过 15,则可以认为这两行是重复项。然后,我要删除这两个重复项中 score 较低的那一行。
我的想法是:首先用 arrange() 将数据框按 score 降序排列,然后写一个 for() 循环。接着创建一个向量 lines,当某一行与已保留的行都不重复时,就把该行加入 lines。
我真的想避免使用 for() 循环,因为数据框很大(超过 40000 行),这样的循环可能很慢。但我不知道如何使用例如 dplyr 来实现这一目标。
数据样本:
> dput(head(df %>% arrange(desc(score)), 1000))
structure(list(theta = c(112.727272727111, 294.545454545031,
298.18181818139, 294.545454545031, 116.363636363469, 116.363636363469,
116.363636363469, 112.727272727111, 294.545454545031, 298.18181818139,
112.727272727111, 119.999999999828, 116.363636363469, 112.727272727111,
294.545454545031, 298.18181818139, 294.545454545031, 301.818181817748,
294.545454545031, 112.727272727111, 298.18181818139, 116.363636363469,
290.909090908673, 290.909090908673, 116.363636363469, 116.363636363469,
112.727272727111, 298.18181818139, 116.363636363469, 294.545454545031,
119.999999999828, 298.18181818139, 298.18181818139, 119.999999999828,
116.363636363469, 287.272727272315, 294.545454545031, 116.363636363469,
112.727272727111, 112.727272727111, 101.818181818036, 298.18181818139,
290.909090908673, 116.363636363469, 119.999999999828, 298.18181818139,
298.18181818139, 290.909090908673, 109.090909090752, 290.909090908673,
294.545454545031, 109.090909090752, 290.909090908673, 119.999999999828,
290.909090908673, 109.090909090752, 112.727272727111, 301.818181817748,
290.909090908673, 116.363636363469, 101.818181818036, 109.090909090752,
109.090909090752, 116.363636363469, 287.272727272315, 87.2727272726019,
119.999999999828, 112.727272727111, 301.818181817748, 119.999999999828,
119.999999999828, 83.6363636362435, 109.090909090752, 290.909090908673,
109.090909090752, 119.999999999828, 301.818181817748, 116.363636363469,
109.090909090752, 109.090909090752, 283.636363635956, 290.909090908673,
330.909090908615, 109.090909090752, 298.18181818139, 301.818181817748,
116.363636363469, 105.454545454394, 29.0909090908673, 101.818181818036,
105.454545454394, 119.999999999828, 290.909090908673, 116.363636363469,
112.727272727111, 109.090909090752, 269.090909090522, 105.454545454394,
109.090909090752, 290.909090908673, 269.090909090522, 301.818181817748,
301.818181817748, 305.454545454107, 272.727272726881, 283.636363635956,
87.2727272726019, 290.909090908673, 116.363636363469, 301.818181817748,
290.909090908673, 294.545454545031, 239.999999999655, 105.454545454394,
119.999999999828, 301.818181817748, 116.363636363469, 290.909090908673,
83.6363636362435, 327.272727272257, 283.636363635956, 283.636363635956,
254.545454545089, 287.272727272315, 258.181818181447, 119.999999999828,
298.18181818139, 261.818181817806, 87.2727272726019, 290.909090908673,
210.909090908788, 290.909090908673, 105.454545454394, 327.272727272257,
319.99999999954, 65.4545454544514, 287.272727272315, 119.999999999828,
283.636363635956, 287.272727272315, 298.18181818139, 105.454545454394,
279.999999999598, 272.727272726881, 305.454545454107, 287.272727272315,
87.2727272726019, 43.6363636363009, 87.2727272726019, 43.6363636363009,
287.272727272315, 138.18181818162, 119.999999999828, 309.090909090465,
7.27272727271682, 90.9090909089603, 290.909090908673, 301.818181817748,
39.9999999999425, 32.7272727272257, 109.090909090752, 116.363636363469,
279.999999999598, 105.454545454394, 119.999999999828, 276.363636363239,
305.454545454107, 301.818181817748, 101.818181818036, 218.181818181505,
272.727272726881, 272.727272726881, 301.818181817748, 309.090909090465,
279.999999999598, 254.545454545089, 243.636363636014, 229.09090909058,
32.7272727272257, 199.999999999713, 254.545454545089, 272.727272726881,
294.545454545031, 301.818181817748, 283.636363635956, 123.636363636186,
7.27272727271682, 287.272727272315, 87.2727272726019, 254.545454545089,
283.636363635956, 305.454545454107, 65.4545454544514, 330.909090908615,
119.999999999828, 123.636363636186, 283.636363635956, 72.7272727271682,
76.3636363635266, 109.090909090752), rho = c(41.0836002377417,
-45.0465084042093, -37.9532480936103, -43.044158227946, 30.9877673496997,
35.9936427903578, 32.9901175259629, 39.0812500614784, -46.0476834923408,
-29.9438473885574, 40.0824251496101, 23.8936399748773, 26.9830669971733,
42.0847753258733, -42.0429831398144, -36.9520730054787, -41.0418080516828,
-26.8536397978266, -44.0453333160777, 43.0859504140049, -35.950897917347,
28.9854171734365, -50.1354285011879, -51.1366035893195, 34.9924677022261,
27.9842420853049, 46.0894756783997, -34.9497228292154, 25.9818919090416,
-48.0500336686041, 13.8818890935612, -38.9544231817418, -40.9567733580051,
24.8948150630089, 29.9865922615681, -60.2278996099466, -47.0488585804724,
33.9912926140945, 37.0788998852153, 38.0800749733469, 85.3732907252706,
-33.9485477410838, -54.1401288537143, 31.9889424378313, 16.885414357956,
-32.9473726529522, -31.9461975648206, -62.1495295587672, 55.18197515857,
-48.1330783249247, -49.0512087567357, 64.1925509517545, -52.1377786774511,
26.8971652392721, -49.1342534130563, 48.1737495416487, 44.0871255021365,
-27.8548148859583, -59.1460042943724, 24.98071682091, 66.3509640507701,
45.1702242772539, 49.1749246297804, 22.9783666446468, -82.253751548842,
120.686872286734, 12.8807140054296, 48.0918258546629, -21.8477643571686,
10.8783638291664, 14.8830641816928, 131.758025665537, 53.1796249823068,
-60.147179382504, 60.1878505992281, 8.87601365290313, -13.8383636521157,
23.9795417327784, 47.1725744535171, 57.1843253348332, -72.3200714342242,
-53.1389537655828, 91.9951732050117, 40.1643488365959, -27.9414972122942,
-22.8489394453002, 21.9771915565152, 50.2555352073846, 237.136779368164,
55.3380380813223, 70.2790369700168, 21.8912897986141, -67.1554049994253,
20.9760164683836, 45.0883005902681, 52.1784498941752, -119.654988035104,
62.2696362649639, 50.176099717912, -57.1436541181092, -178.724318234869,
-18.8442390927738, -33.8618654147479, -0.73510658654489, -110.579957573771,
-90.3412230205932, 99.6621954359703, -72.1612804400833, 36.9948178784894,
-29.8571650622215, -65.1530548231621, -51.0535589329989, -167.060502489566,
48.2531850311214, 25.8959901511405, -31.8595152384847, 19.974841380252,
-58.1448292062408, 183.819130248381, 62.8740557236012, -91.3423981087248,
-89.3400479324616, -143.897766031004, -59.226724521815, -169.881611464456,
29.900690503667, -42.9591235342683, -150.80781195583, 111.67629649355,
-42.126027796135, -244.156619676724, -68.1565800875569, 82.2931380275962,
72.8858066049173, 45.6767233434207, 169.021874270401, -81.2525764607104,
20.8901147104825, -98.3506237256461, -61.2290746980781, -39.9555982698734,
54.260235559911, -112.44218190202, -119.590533366955, -10.746857467861,
-50.2161487286305, 132.700973344313, 237.190546304581, 121.688047374866,
232.184670863923, -52.2184989048937, -53.6413626642005, 15.8842392698244,
20.3784955094908, 211.865903318108, 154.664384192041, -63.1507046468988,
-28.8559899740899, 236.184390219668, 239.160977781544, 46.1713993653855,
17.9724912039888, -70.3928282004921, 53.2590604717795, 11.879538917298,
-127.531648913209, 0.266068501586671, -3.82661277079962, 102.393267223508,
-228.170369542773, -123.595233719481, -104.572907044981, -25.852464709695,
1.35616883499019, -60.3810773191761, -248.019975196692, -210.085066262338,
-240.190868827689, 237.158627605281, -218.036119344605, -200.964746054506,
-122.59405863135, -56.0594343736569, -20.846589269037, -93.344748284988,
3.78276259922932, 162.80832399966, -54.2208490811569, 175.751502133973,
-180.941244291874, -92.3435731968564, 9.27664429477117, 170.023049358533,
84.9869475880904, 18.8877645342192, -2.22428792956033, -63.3094956410397,
127.900471265421, 127.856219152482, 32.154948131543), score = c(381.781574988754,
376.867929304018, 364.626105647432, 299.691569724508, 294.231324378097,
293.952772362833, 283.460261504884, 282.056364048014, 267.206929374373,
261.168680064558, 252.414542012009, 237.042185845451, 234.078052148331,
229.606725420143, 216.324860493748, 215.558533330939, 210.842537314501,
208.368243554419, 205.181406815045, 204.261096051397, 197.324237979165,
189.521926303097, 188.969964944608, 186.638575913254, 185.140684108428,
183.377726563851, 182.124986813598, 182.045260423294, 181.846212597077,
180.37944359477, 178.789891727661, 176.192227059953, 175.964328485497,
174.846290171029, 171.988837570636, 169.019640895807, 167.9119495256,
167.813177641545, 163.691114744162, 163.450749580731, 162.451649238718,
158.403998247919, 156.155648844112, 153.590137368734, 152.769361891751,
152.520713855705, 151.655777652606, 149.409331820614, 146.472014630364,
146.115633442028, 146.052498649575, 135.864783446716, 134.768563651941,
132.404742423211, 131.51576293036, 130.82284429468, 130.788174510703,
128.424175293775, 126.128019200478, 124.336645293274, 123.597399463198,
121.646401630518, 120.301723278087, 114.561484716175, 114.48553813731,
113.216562649051, 112.226621711484, 111.461463120992, 110.013083445497,
109.576273588556, 109.137012441314, 106.688602963489, 106.400626093183,
106.133536519578, 105.777492905745, 105.688442177713, 104.456641874613,
104.433669269611, 103.314623077763, 102.259746393435, 102.17938530452,
101.670539083899, 100.452688050998, 99.02520484887, 98.4335113782726,
98.0429505018094, 97.4402598820602, 95.7587658236876, 95.2950696240885,
94.9586058341693, 94.3866751251166, 94.3158182244223, 93.7565070140147,
92.127262977829, 91.8548747919413, 91.4386321895478, 91.3768416587042,
91.3010019223121, 91.1345447367969, 90.3128424524546, 89.0661383970599,
89.0055316281083, 88.6971624966804, 87.7983886555544, 87.0306396564274,
85.9204774141763, 85.7965196497019, 85.3142922578242, 84.8425562837965,
84.3735638666951, 83.9731763120239, 82.7466784247688, 82.6092763898421,
82.0587829926224, 81.6777059131652, 81.4293731744454, 80.8438294356826,
80.8211826121344, 80.6709609724947, 80.6465532824821, 80.5178349938752,
80.1999503837411, 80.1719116395444, 79.8942969725428, 79.7436418638969,
79.7277805289744, 79.70684435464, 79.6847128002474, 79.6806763649338,
79.1777583656981, 78.6352312634342, 78.6274361426217, 78.5428920502696,
77.7337176668642, 76.7946629291839, 76.4466244176081, 76.1485811758828,
75.7612036684239, 75.2150823543861, 75.0345313303339, 74.8673067291731,
74.8517038879151, 74.7508858198005, 74.5527292285188, 74.5267512278319,
74.5227005364768, 74.4094234954551, 74.3674893962718, 74.3219691212919,
73.8157794991736, 73.2286453984909, 73.222367796426, 72.8313773620846,
72.7470649807689, 72.5137753468749, 72.5132078673106, 72.5025730252246,
72.4385676060556, 72.1537622540259, 71.9425613728977, 71.6928128941157,
71.5832397214361, 71.5322922095169, 71.2972159994947, 71.1810801604769,
71.1642327631934, 71.076532336903, 71.0515945984565, 70.8404496937105,
70.6690362061286, 70.3221433523102, 70.3120255269573, 70.3017098533045,
70.265603928538, 70.2477297855539, 69.8757560784155, 69.8418754120254,
69.7502537524278, 69.6096619396855, 69.0774383151024, 69.0200227396948,
69.0172508598699, 68.9528342390049, 68.9023657817845, 68.5773542159423,
68.305995695382, 68.1327451195585, 67.9244802616279, 67.6804525353541,
67.5989028542724, 67.5963786570388, 67.4025447304631, 67.3888299703067,
67.0351641892631, 66.9494920024158, 66.2982455895387, 66.2033914475987,
66.0129552398745, 65.9579581030256, 65.6753707562488)), row.names = c(NA,
200L), class = "data.frame")
旁注:数据是通过对图像进行霍夫变换获得的。
编辑:示例数据:
> df
theta rho score
1 10 10 200
2 15 15 150
3 16 16 145
4 25 25 100
5 50 50 50
6 60 60 40
7 70 70 20
8 75 75 5
9 110 110 100
所需的输出:
> df
theta rho score
1 10 10 200
2 50 50 50
3 110 110 100
对于我的实际数据,theta 的间隔为 8,rho 的间隔为 10。
答案 0 :(得分:0)
整洁的方法
所需输出的样本数据
# Sample data from the question: one row per candidate, with an id, the two
# coordinates (theta, rho) and the score used to decide which near-duplicate
# to keep. All columns are integer, matching what read.table() would infer.
df <- data.frame(
  id = 1:9,
  theta = c(10L, 15L, 16L, 25L, 50L, 60L, 70L, 75L, 110L),
  rho = c(10L, 15L, 16L, 25L, 50L, 60L, 70L, 75L, 110L),
  score = c(200L, 150L, 145L, 100L, 50L, 40L, 20L, 5L, 100L)
)
代码
library(tidyverse)

# Neighbour-based filter for "almost duplicated" rows.
# A row is dropped when it lies within 10 of the preceding row in theta order
# OR within 15 of the preceding row in rho order; the result is printed in
# rho order.
# NOTE(review): each flag only looks at the directly preceding row in its own
# sort order, the two flags may refer to different neighbours, and `score` is
# never consulted -- confirm this matches the intended duplicate rule.
df %>%
  # Flag rows whose theta is within 10 of the previous row (sorted by theta).
  # The first row has no predecessor, so its NA comparison becomes FALSE.
  arrange(theta) %>%
  mutate(theta_near = coalesce(abs(theta - lag(theta)) <= 10, FALSE)) %>%
  # Flag rows whose rho is within 15 of the previous row (sorted by rho).
  arrange(rho) %>%
  mutate(rho_near = coalesce(abs(rho - lag(rho)) <= 15, FALSE)) %>%
  # Keep only rows flagged on neither variable, then drop the helper columns.
  filter(!theta_near, !rho_near) %>%
  select(-c(theta_near, rho_near))
输出
# id theta rho score
# 1 1 10 10 200
# 2 5 50 50 50
# 3 9 110 110 100
答案 1 :(得分:0)
参考我之前关于重叠重复项的评论,您可以迭代地执行此操作:先删除得分最高的那一行的所有重复项,然后对剩余行中得分次高的那一行重复同样的步骤,依此类推。下面的代码应该可以做到这一点,在您发布的数据集上最终剩下 27 个点。尽管效率不高,但循环是我能想到的最快、最简单的解法。我确信不用循环也可以做到;如果您有数千万个点,就需要那样做,但在这种情况下确实没有必要。
# Greedy removal of near-duplicate rows, keeping the best-scoring row of each
# group. Rows are visited from highest to lowest score; every other row whose
# theta lies within `thetaInterval` AND whose rho lies within `rhoInterval` of
# the current row is dropped. Rows that survive earlier passes are never
# removed later, because they would already have been dropped if they were
# close to a higher-scoring row.
df1 <- df
thetaInterval <- 8
rhoInterval <- 10

# Sort once: dropping rows with a logical index preserves the relative order,
# so the data frame stays sorted by descending score throughout the loop.
df1 <- df1[order(df1$score, decreasing = TRUE), ]

i <- 0
while (i < nrow(df1)) {
  i <- i + 1
  dups <- abs(df1$theta - df1$theta[i]) <= thetaInterval &
    abs(df1$rho - df1$rho[i]) <= rhoInterval
  # Row i is always at distance 0 from itself; it is the row we want to keep,
  # so exclude it from the removal mask.
  dups[i] <- FALSE
  df1 <- df1[!dups, ]
}