对每个组乘以另一列,将多列相乘并划分

时间:2018-05-24 23:45:19

标签: r dplyr data.table

我有一个具有以下结构的数据框:

# A tibble: 95 x 7
# Groups:   WallReg_2p5 [19]
   CellID_2p5 Y_Coord_2p5Weighting WallReg_2p5 piC_1  piC_2 piC_3 piC_4
        <int>                <dbl> <chr>       <dbl>  <dbl> <dbl> <dbl>
 1       6561                0.915 African      6.55  6.63  5.84  0.766
 2       6278                0.947 African     15.1   5.59  2.15  2.01 
 3       4394                0.971 African     11.4   3.92  0.774 1.47 
 4       4840                0.994 African      4.70  0.962 6.21  3.54 
 5       4105                0.947 African      6.35  2.10  2.25  3.24 
 6       5228                1.000 Amazonian    8.49  5.00  1.92  2.42 
 7       5089                1.000 Amazonian   15.6   6.48  2.53  2.89 
 8       4939                0.998 Amazonian    5.56  2.94  0.389 2.44 
 9       5088                1.000 Amazonian   12.9   5.16  1.99  3.13 
10       4947                0.998 Amazonian    8.05 11.2   2.54  4.61 
# ... with 85 more rows

以下是数据框子集的dput()。我的真实数据集包含10,368行和255,611列

structure(list(CellID_2p5 = c(6561L, 6278L, 4394L, 4840L, 4105L, 
5228L, 5089L, 4939L, 5088L, 4947L, 1710L, 2569L, 1438L, 1175L, 
1840L, 6888L, 7185L, 6031L, 7045L, 7044L, 3432L, 3288L, 3143L, 
3574L, 3577L, 3260L, 1959L, 2568L, 2986L, 2386L, 5551L, 5407L, 
5556L, 4979L, 5694L, 5303L, 4442L, 5587L, 5157L, 4865L, 3294L, 
3009L, 2865L, 2722L, 3151L, 6427L, 6571L, 5996L, 6570L, 6139L, 
3631L, 3920L, 3342L, 3341L, 4064L, 2617L, 2049L, 3346L, 1599L, 
3205L, 7487L, 6612L, 6613L, 7630L, 7916L, 3854L, 3561L, 4290L, 
4138L, 3704L, 4211L, 4068L, 4069L, 4357L, 4648L, 5601L, 5600L, 
5455L, 5456L, 5458L, 3978L, 3822L, 3532L, 3832L, 3834L, 7105L, 
6817L, 6104L, 7963L, 6098L, 3418L, 3424L, 3281L, 3566L, 3273L
), Y_Coord_2p5Weighting = c(0.915311479119447, 0.946930129495106, 
0.971342069813261, 0.99405633822232, 0.946930129495106, 0.999762027079909, 
0.999762027079909, 0.997858923238603, 0.999762027079909, 0.997858923238603, 
0.480988768919388, 0.691513055782269, 0.402746689858737, 0.362438038283702, 
0.518773258160522, 0.876726755707508, 0.831469612302545, 0.971342069813261, 
0.854911870672947, 0.854911870672947, 0.854911870672947, 0.831469612302545, 
0.806444604267483, 0.876726755707508, 0.876726755707508, 0.831469612302545, 
0.555570233019602, 0.691513055782269, 0.779884483092882, 0.659345815100069, 
0.99405633822232, 0.997858923238603, 0.99405633822232, 0.997858923238603, 
0.988361510467761, 0.999762027079909, 0.971342069813261, 0.99405633822232, 
0.999762027079909, 0.99405633822232, 0.831469612302545, 0.779884483092882, 
0.751839807478977, 0.722363962059756, 0.806444604267483, 0.932007869282799, 
0.915311479119447, 0.971342069813261, 0.915311479119447, 0.960049854385929, 
0.896872741532688, 0.932007869282799, 0.854911870672947, 0.854911870672947, 
0.946930129495106, 0.722363962059756, 0.591309648363582, 0.854911870672947, 
0.480988768919388, 0.831469612302545, 0.779884483092882, 0.915311479119447, 
0.915311479119447, 0.751839807478977, 0.691513055782269, 0.915311479119447, 
0.876726755707508, 0.960049854385929, 0.946930129495106, 0.896872741532688, 
0.960049854385929, 0.946930129495106, 0.946930129495106, 0.971342069813261, 
0.988361510467761, 0.99405633822232, 0.99405633822232, 0.997858923238603, 
0.997858923238603, 0.997858923238603, 0.932007869282799, 0.915311479119447, 
0.876726755707508, 0.915311479119447, 0.915311479119447, 0.831469612302545, 
0.876726755707508, 0.960049854385929, 0.659345815100069, 0.960049854385929, 
0.854911870672947, 0.854911870672947, 0.831469612302545, 0.876726755707508, 
0.831469612302545), WallReg_2p5 = c("African", "African", "African", 
"African", "African", "Amazonian", "Amazonian", "Amazonian", 
"Amazonian", "Amazonian", "Arctico-Siberian", "Arctico-Siberian", 
"Arctico-Siberian", "Arctico-Siberian", "Arctico-Siberian", "Australian", 
"Australian", "Australian", "Australian", "Australian", "Chinese", 
"Chinese", "Chinese", "Chinese", "Chinese", "Eurasian", "Eurasian", 
"Eurasian", "Eurasian", "Eurasian", "Guineo-Congolian", "Guineo-Congolian", 
"Guineo-Congolian", "Guineo-Congolian", "Guineo-Congolian", "Indo-Malayan", 
"Indo-Malayan", "Indo-Malayan", "Indo-Malayan", "Indo-Malayan", 
"Japanese", "Japanese", "Japanese", "Japanese", "Japanese", "Madagascan", 
"Madagascan", "Madagascan", "Madagascan", "Madagascan", "Mexican", 
"Mexican", "Mexican", "Mexican", "Mexican", "North American", 
"North American", "North American", "North American", "North American", 
"Novozelandic", "Novozelandic", "Novozelandic", "Novozelandic", 
"Novozelandic", "Oriental", "Oriental", "Oriental", "Oriental", 
"Oriental", "Panamanian", "Panamanian", "Panamanian", "Panamanian", 
"Panamanian", "Papua-Melanesian", "Papua-Melanesian", "Papua-Melanesian", 
"Papua-Melanesian", "Papua-Melanesian", "Saharo-Arabian", "Saharo-Arabian", 
"Saharo-Arabian", "Saharo-Arabian", "Saharo-Arabian", "South American", 
"South American", "South American", "South American", "South American", 
"Tibetan", "Tibetan", "Tibetan", "Tibetan", "Tibetan"), piC_1 = c(6.54637718200684, 
15.1273813247681, 11.4171981811523, 4.70245027542114, 6.35227298736572, 
8.48885822296143, 15.5538415908813, 5.56155681610107, 12.9046697616577, 
8.04517650604248, 2.95071268081665, 21.6441345214844, 11.2329692840576, 
16.1649322509766, 17.2905006408691, 3.43583130836487, 10.0594062805176, 
12.3438568115234, 7.94222640991211, 6.89916276931763, 7.45456171035767, 
8.77329444885254, 14.3378238677979, 3.86588025093079, 12.4889860153198, 
7.18962049484253, 19.2145137786865, 22.0060653686523, 1.86285281181335, 
2.09195709228516, 9.87592029571533, 12.2629871368408, 7.31402492523193, 
0.601671099662781, 6.9998254776001, 20.6269207000732, 6.21515369415283, 
22.039529800415, 8.35955047607422, 9.50113105773926, 7.06818675994873, 
4.63532447814941, 5.81412315368652, 0.996474027633667, 8.32744407653809, 
5.03945255279541, 0.893457889556885, 2.42736291885376, 10.3842725753784, 
3.32475543022156, 8.1105375289917, 6.61336517333984, 4.06754541397095, 
3.31069254875183, 8.05746650695801, 1.24714422225952, 6.44647121429443, 
2.97141313552856, 13.3264999389648, 4.86157178878784, 6.71903085708618, 
20.3318004608154, 20.8287792205811, 10.0042209625244, 12.7859420776367, 
13.6358938217163, 15.9491415023804, 11.4823551177979, 18.6053276062012, 
16.6047229766846, 16.1496143341064, 2.9492039680481, 13.8130388259888, 
18.6300754547119, 14.464674949646, 4.92032289505005, 0.511945068836212, 
3.16324853897095, 13.3062620162964, 9.84803581237793, 1.74625515937805, 
2.54861640930176, 9.97869968414307, 11.2339553833008, 0.865878522396088, 
14.7632684707642, 21.8330593109131, 6.42118740081787, 9.51691722869873, 
13.2857227325439, 4.01672554016113, 10.9487056732178, 13.6308097839355, 
4.69979858398438, 1.83490359783173), piC_2 = c(6.62732124328613, 
5.59194660186768, 3.92186212539673, 0.962285339832306, 2.1002824306488, 
4.99801731109619, 6.4822793006897, 2.94481801986694, 5.16082000732422, 
11.2070302963257, 0.585842967033386, 4.83236265182495, 1.637331366539, 
7.65087461471558, 2.28347945213318, 7.16115474700928, 3.54162955284119, 
5.23653078079224, 2.28897953033447, 2.29887819290161, 0.752622723579407, 
0.653791189193726, 1.5378258228302, 2.15203213691711, 1.64702248573303, 
6.0682373046875, 0.22119003534317, 4.76900386810303, 0.366481363773346, 
6.11435651779175, 10.8921070098877, 7.97591733932495, 6.05282688140869, 
3.74584698677063, 5.75792741775513, 0.471727430820465, 2.75132250785828, 
1.21862363815308, 0.138835281133652, 2.98711204528809, 0.627980709075928, 
0.108154557645321, 0.995486855506897, 2.4163064956665, 0.0193456951528788, 
5.70003795623779, 5.56746625900269, 2.9861011505127, 0.344279021024704, 
0.640789806842804, 9.4457426071167, 7.05727958679199, 3.89853048324585, 
0.340702921152115, 1.17963445186615, 8.93050575256348, 14.796028137207, 
4.88054323196411, 9.28642845153809, 7.68382120132446, 2.27267980575562, 
0.916118919849396, 0.689630210399628, 0.549197673797607, 1.68408465385437, 
1.76007652282715, 3.2269868850708, 0.980833470821381, 5.00142002105713, 
3.41616177558899, 6.74930334091187, 12.0952653884888, 15.2918863296509, 
0.105648428201675, 4.59846162796021, 1.48986113071442, 5.02905178070068, 
5.07208204269409, 4.98251914978027, 4.70810985565186, 2.37468719482422, 
6.78730487823486, 6.18559217453003, 11.6090707778931, 2.91017484664917, 
3.51590204238892, 3.35987615585327, 8.74919319152832, 2.23059439659119, 
0.292922139167786, 5.41262531280518, 8.86936473846436, 8.20160961151123, 
7.33296489715576, 8.42716407775879), piC_3 = c(5.84101867675781, 
2.14856338500977, 0.774434208869934, 6.21446466445923, 2.25056719779968, 
1.9200998544693, 2.52935075759888, 0.38894659280777, 1.98762917518616, 
2.53701376914978, 6.93642854690552, 0.608367025852203, 4.7472562789917, 
1.25435817241669, 4.09390258789062, 5.41882562637329, 0.221905186772346, 
3.72868466377258, 0.763698220252991, 0.783569753170013, 8.32380294799805, 
4.482017993927, 2.38237118721008, 10.7143220901489, 10.1253957748413, 
4.51582384109497, 5.18871164321899, 1.76670265197754, 7.50785446166992, 
6.2304630279541, 8.79040622711182, 7.47595691680908, 1.57976567745209, 
1.46996772289276, 0.894773840904236, 1.30858862400055, 7.34649181365967, 
1.41060519218445, 2.03947067260742, 4.6038031578064, 4.44245910644531, 
0.236538723111153, 0.194929093122482, 0.684483885765076, 0.530747056007385, 
1.89696133136749, 1.94861626625061, 3.36041831970215, 0.0835498198866844, 
2.04665040969849, 7.02379274368286, 2.93551588058472, 5.33355855941772, 
1.59516668319702, 2.19099020957947, 2.88170146942139, 7.42911052703857, 
4.64155960083008, 2.24829292297363, 3.64715957641602, 0.363596022129059, 
1.41882479190826, 0.474381387233734, 2.24125337600708, 4.11492681503296, 
3.44695138931274, 3.08158445358276, 0.218709617853165, 2.44625425338745, 
1.71628797054291, 1.75634157657623, 4.76044988632202, 0.387977868318558, 
1.70636379718781, 1.70855867862701, 3.67641615867615, 0.744896650314331, 
1.09648311138153, 1.37377882003784, 0.200171306729317, 1.4753475189209, 
6.56762170791626, 7.72892284393311, 2.18395304679871, 0.481256455183029, 
0.37385630607605, 4.25140476226807, 6.76727914810181, 4.81376981735229, 
3.8882269859314, 2.90145373344421, 7.48540449142456, 9.90997123718262, 
4.46362543106079, 5.19004011154175), piC_4 = c(0.765519082546234, 
2.01459360122681, 1.4724348783493, 3.53503012657166, 3.23746180534363, 
2.42439723014832, 2.89345812797546, 2.43676805496216, 3.13469624519348, 
4.61154937744141, 4.51843070983887, 0.767921149730682, 5.01102733612061, 
2.94891023635864, 5.20972728729248, 1.1311411857605, 2.22004199028015, 
3.79573369026184, 0.551535904407501, 0.574182093143463, 5.87988710403442, 
5.06349992752075, 3.72144675254822, 8.49415874481201, 4.27884483337402, 
2.48057842254639, 4.45665884017944, 0.667030334472656, 6.93020153045654, 
2.26927351951599, 1.5674192905426, 3.63813829421997, 2.73822736740112, 
0.674351632595062, 1.89532685279846, 4.79139471054077, 1.34277474880219, 
0.564522683620453, 3.33897042274475, 1.42253696918488, 2.7286331653595, 
0.960368096828461, 2.00121903419495, 4.58775472640991, 2.11190366744995, 
0.29313051700592, 0.0706640183925629, 2.87113666534424, 1.36242246627808, 
3.57689785957336, 2.05132532119751, 0.340487778186798, 1.3506361246109, 
0.400035679340363, 1.65728294849396, 5.17583227157593, 6.23331356048584, 
1.60608506202698, 6.12336874008179, 0.46411395072937, 0.205161795020103, 
1.93029391765594, 2.6833176612854, 0.199026927351952, 0.0609574876725674, 
1.12770354747772, 1.49503016471863, 0.299944281578064, 0.302427768707275, 
0.745285212993622, 2.91650176048279, 4.18865776062012, 2.71514081954956, 
1.93356776237488, 1.67894613742828, 1.67655885219574, 3.09425163269043, 
2.87126135826111, 2.42724895477295, 5.48751878738403, 3.4703311920166, 
3.71456289291382, 4.29666662216187, 3.37810254096985, 3.07785415649414, 
1.90873026847839, 3.57397627830505, 0.902793109416962, 3.96058869361877, 
0.35958793759346, 2.9896719455719, 1.81924939155579, 4.22445392608643, 
2.22684979438782, 4.53710412979126)), row.names = c(NA, -95L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), .Names = c("CellID_2p5", "Y_Coord_2p5Weighting", 
"WallReg_2p5", "piC_1", "piC_2", "piC_3", "piC_4"), vars = "WallReg_2p5", drop = TRUE, indices = list(
    0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44, 
    45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89, 
    90:94), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(
    WallReg_2p5 = c("African", "Amazonian", "Arctico-Siberian", 
    "Australian", "Chinese", "Eurasian", "Guineo-Congolian", 
    "Indo-Malayan", "Japanese", "Madagascan", "Mexican", "North American", 
    "Novozelandic", "Oriental", "Panamanian", "Papua-Melanesian", 
    "Saharo-Arabian", "South American", "Tibetan")), row.names = c(NA, 
-19L), class = "data.frame", vars = "WallReg_2p5", drop = TRUE, .Names = "WallReg_2p5"))

我要做的是为每个区域生成所有piC_列的加权值。每列(x)的过程包括3个步骤:

  1. piC_x列中的每一行乘以Y_Coord_2p5Weighting
  2. 中的值
  3. 汇总每个piC_x
  4. 中的加权WallReg_2p5
  5. 将总计piC_x值除以每个Y_Coord_2p5Weighting组的WallReg_2p5值的总和
  6. 经过一些阅读后,data.table显示大型数据集上的dplyrr更快,但我愿意使用任一软件包,甚至是基础data.table。 我曾尝试同时执行这两项操作,但在使用dplyr时收到的结果不正确,而且当我将其应用于完整数据框时,我担心df <- df %>% tbl_df() %>% group_by(WallReg_2p5) %>% summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1)) # A tibble: 19 x 4 WallReg_2p5 meanS minS maxS <chr> <dbl> <dbl> <dbl> 1 African 8.83 4.70 15.1 2 Amazonian 10.1 5.56 15.6 3 Arctico-Siberian 13.9 2.95 21.6 4 Australian 8.14 3.44 12.3 5 Chinese 9.38 3.87 14.3 6 Eurasian 10.5 1.86 22.0 7 Guineo-Congolian 7.41 0.602 12.3 8 Indo-Malayan 13.3 6.22 22.0 9 Japanese 5.37 0.996 8.33 10 Madagascan 4.41 0.893 10.4 11 Mexican 6.03 3.31 8.11 12 North American 5.77 1.25 13.3 13 Novozelandic 14.1 6.72 20.8 14 Oriental 15.3 11.5 18.6 15 Panamanian 13.2 2.95 18.6 16 Papua-Melanesian 6.35 0.512 13.3 17 Saharo-Arabian 5.27 0.866 11.2 18 South American 13.2 6.42 21.8 19 Tibetan 7.03 1.83 13.6 weighted <- df %>% mutate_at(.funs = funs(.*Y_Coord_2p5Weighting), .vars = vars(starts_with("piC_"))) %>% ## multiply by lat weight mutate_at(.funs = funs(sum), .vars = vars(starts_with("piC_"))) %>% ## sum the weighted values mutate_at(.funs = funs(./sum(Y_Coord_2p5Weighting)), .vars = vars(starts_with("piC_"))) ## divide weighted values by sum of weights weighted %>% tbl_df %>% group_by(WallReg_2p5) %>% summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1)) # A tibble: 19 x 4 WallReg_2p5 meanS minS maxS <chr> <dbl> <dbl> <dbl> 1 African 8.82 8.82 8.82 2 Amazonian 10.1 10.1 10.1 3 Arctico-Siberian 14.5 14.5 14.5 4 Australian 8.21 8.21 8.21 5 Chinese 9.32 9.32 9.32 6 Eurasian 9.86 9.86 9.86 7 Guineo-Congolian 7.41 7.41 7.41 8 Indo-Malayan 13.4 13.4 13.4 9 Japanese 5.47 5.47 5.47 10 Madagascan 4.38 4.38 4.38 11 Mexican 6.10 6.10 6.10 12 North American 5.09 5.09 5.09 13 Novozelandic 14.6 14.6 14.6 14 Oriental 15.2 15.2 15.2 15 Panamanian 13.2 13.2 13.2 16 Papua-Melanesian 6.36 6.36 6.36 17 Saharo-Arabian 5.22 5.22 5.22 18 South American 13.2 13.2 13.2 19 Tibetan 7.01 7.01 7.01 的速度。这是我到目前为止所尝试的内容

    dplyr

    dplyr

    使用data.table我得到了正确的值。但是,当我使用df <- df %>% group_by(WallReg_2p5) %>% as.data.table(.) %>% setkey(., WallReg_2p5) is.data.table(df); haskey(df) [1] TRUE [1] TRUE ## same as above df %>% tbl_df %>% group_by(WallReg_2p5) %>% summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1)) # A tibble: 19 x 4 WallReg_2p5 meanS minS maxS <chr> <dbl> <dbl> <dbl> 1 African 8.83 4.70 15.1 2 Amazonian 10.1 5.56 15.6 3 Arctico-Siberian 13.9 2.95 21.6 4 Australian 8.14 3.44 12.3 5 Chinese 9.38 3.87 14.3 6 Eurasian 10.5 1.86 22.0 7 Guineo-Congolian 7.41 0.602 12.3 8 Indo-Malayan 13.3 6.22 22.0 9 Japanese 5.37 0.996 8.33 10 Madagascan 4.41 0.893 10.4 11 Mexican 6.03 3.31 8.11 12 North American 5.77 1.25 13.3 13 Novozelandic 14.1 6.72 20.8 14 Oriental 15.3 11.5 18.6 15 Panamanian 13.2 2.95 18.6 16 Papua-Melanesian 6.35 0.512 13.3 17 Saharo-Arabian 5.27 0.866 11.2 18 South American 13.2 6.42 21.8 19 Tibetan 7.03 1.83 13.6 # https://stackoverflow.com/q/28123098/1710632 indx <- grep("piC_", colnames(df)) for (j in indx) { set(df, i = NULL, j = j, value = df[[j]]*df[["Y_Coord_2p5Weighting"]]) ## multiply by weights set(df, i = NULL, j = j, value = sum(df[[j]])) ## sum the weighted values set(df, i = NULL, j = j, value = df[[j]]/sum(df[["Y_Coord_2p5Weighting"]])) ## divide by sum of weights } ## wrong values df %>% tbl_df %>% group_by(WallReg_2p5) %>% summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1)) # A tibble: 19 x 4 WallReg_2p5 meanS minS maxS <chr> <dbl> <dbl> <dbl> 1 African 9.27 9.27 9.27 2 Amazonian 9.27 9.27 9.27 3 Arctico-Siberian 9.27 9.27 9.27 4 Australian 9.27 9.27 9.27 5 Chinese 9.27 9.27 9.27 6 Eurasian 9.27 9.27 9.27 7 Guineo-Congolian 9.27 9.27 9.27 8 Indo-Malayan 9.27 9.27 9.27 9 Japanese 9.27 9.27 9.27 10 Madagascan 9.27 9.27 9.27 11 Mexican 9.27 9.27 9.27 12 North American 9.27 9.27 9.27 13 Novozelandic 9.27 9.27 9.27 14 Oriental 9.27 9.27 9.27 15 Panamanian 9.27 9.27 9.27 16 Papua-Melanesian 9.27 9.27 9.27 17 Saharo-Arabian 9.27 9.27 9.27 18 South American 9.27 9.27 9.27 19 Tibetan 9.27 9.27 9.27 时,我得到的值不正确。我的代码基于问题here,但显然我做错了。

    data.table

    ?set()

    阅读data.table,声明它无法执行分组操作,但我认为,因为我已经定义了我的组,这个过程可以正常工作。我之前从未使用String fileURL = "https://www.dropbox.com/s/(obfuscated)/stripad.jpg"; ImageLoaderConfiguration config = new ImageLoaderConfiguration.Builder(this).build(); ImageLoader imageLoader = ImageLoader.getInstance(); imageLoader.init(config); ImageView imageview = (ImageView)findViewById(R.id.adtest1); DisplayImageOptions options = new DisplayImageOptions.Builder() .imageScaleType(ImageScaleType.EXACTLY_STRETCHED) .cacheInMemory(true) .bitmapConfig(Bitmap.Config.RGB_565) .build(); imageLoader.displayImage(fileURL, imageview, options); ,所以我们非常感谢任何指导。

0 个答案:

没有答案