假设我有一些航空公司的航班数据。其中一个字段是出发机场(Origin),另一个字段是到达机场(Destination)。我想按航线(即唯一的“出发地—目的地”组合)对观测值进行分组。问题在于,每条航线还需要把对应的返程航线包括在内。例如,如果有一组观测值是从机场A到机场B,而随后的若干观测值是从机场B到机场A,我希望它们都拥有相同的航线ID。
我可以使用SAS,Stata或R.即使是Python,如果它更容易。
可重现的示例数据(reprex)如下:
# Reproducible sample data: a 69-row data frame with the columns
# airl, ORIGIN, DESTINATION, miles, orig_area, dest_area, month, freq,
# seats and year. ORIGIN and DESTINATION hold three-letter airport codes,
# and the same airport pair occurs in both directions (e.g. ABE -> CLT and
# CLT -> ABE).
df1 <- structure(list(airl = c("US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US"), ORIGIN = c("ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "CLT", "CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL"), DESTINATION = c("CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE"), miles = c(480, 480, 480, 480, 480,
480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 54,
54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54,
480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480,
480, 480, 480, 480, 480, 54, 54, 54, 54, 54, 54, 54, 54, 54,
54, 54, 54, 54, 54, 54, 54, 54), orig_area = c(23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23), dest_area = c(36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23), month = c(1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10,
11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1, 2, 3, 4, 5, 6, 7), freq = c(88, 80, 89, 78, 88, 83, 85, 80,
76, 79, 76, 81, 86, 65, 62, 60, 82, 137, 138, 142, 144, 149,
147, 150, 143, 150, 138, 128, 151, 145, 148, 146, 147, 149, 79,
76, 81, 86, 65, 62, 60, 82, 82, 82, 84, 81, 83, 81, 85, 84, 76,
85, 143, 137, 138, 142, 143, 151, 147, 150, 143, 150, 137, 128,
151, 145, 148, 146, 147), seats = c(8146, 7352, 7599, 6920, 6759,
6060, 6189, 5939, 6137, 6504, 6440, 6804, 6862, 5330, 5242, 5068,
6204, 6460, 6276, 6047, 6095, 6306, 6102, 6265, 7085, 7344, 6809,
6348, 6965, 6626, 6893, 6741, 6765, 6865, 6504, 6440, 6804, 6862,
5330, 5242, 5068, 6204, 6104, 6030, 6278, 6034, 6944, 6816, 6544,
6494, 5872, 6544, 6747, 6460, 6276, 6034, 6058, 6380, 6102, 6278,
7085, 7344, 6759, 6348, 6952, 6613, 6919, 6728, 6765), year = c(2009,
2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,
2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010,
2010, 2010, 2010, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2009, 2009, 2009, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010,
2010, 2010, 2010, 2010, 2011, 2011, 2011, 2010, 2010, 2010, 2010,
2010, 2010, 2010, 2010, 2010, 2010, 2011, 2011, 2011, 2011, 2011,
2011, 2011)), .Names = c("airl", "ORIGIN", "DESTINATION", "miles",
"orig_area", "dest_area", "month", "freq", "seats", "year"), class = "data.frame", row.names = c(NA,
69L))
答案 0 :(得分:2)
采用最简单的标签(两个机场代码,按字母顺序列出,以便原点和目的地无关紧要):
# Direction-independent route label: the two airport codes in ascending
# order joined by "-", so A -> B and B -> A receive the same label.
# pmin()/pmax() compare the two columns element-wise in one vectorised
# pass, which avoids apply() coercing the data frame to a matrix and
# calling an R function once per row.
df1$group <- with(df1, {
  origin <- as.character(ORIGIN)
  destination <- as.character(DESTINATION)
  label <- paste(
    pmin(origin, destination),
    pmax(origin, destination),
    sep = "-"
  )
  # A route with a missing endpoint is unknown; do not invent a label.
  label[is.na(origin) | is.na(destination)] <- NA_character_
  label
})
简而言之:只取出两个机场代码列,逐行按字母顺序对代码排序,再用连字符把它们连接起来,最后把结果赋给 group 变量。
> df1$group
[1] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT"
[8] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT"
[15] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL"
[22] "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL"
[29] "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-CLT"
[36] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT"
[43] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT"
[50] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL"
[57] "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL"
[64] "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL"
答案 1 :(得分:1)
在Stata中,您可以利用字符串之间的关系:
* Relational operators can be applied to string values; this assertion passes.
assert "ABE"<"CLT"
是成立的;不过,下面这两条断言
* Both assertions below are expected to fail:
*   - the first is false, because "ABE" is not greater than "CLT";
*   - the second compares a string with a number (incompatible types).
assert "ABE">"CLT"
assert "ABE"<1
都会报错(第一条是因为断言不成立,第二条是因为比较了不兼容的数据类型)。因此,仿照Brian在R中给出的思路,我们可以这样做:
* Build a direction-independent route label: the two airport codes in
* ascending string order, joined by a hyphen. Rows where either code is
* missing, or where both codes are equal, keep an empty route.
generate str7 route = cond(origin < destination, origin + "-" + destination, destination + "-" + origin) if origin != destination & !missing(origin, destination)
* Show every row that did not receive a route label.
list origin destination route if route == ""
当然,这些缺失值检查只是出于过分的谨慎。但谁知道你的数据会有多糟糕呢 :)。
答案 2 :(得分:1)
使用R和data.table的方法(代码简洁,内存效率高):
library(data.table)

# Work on a data.table built from the sample data frame.
DT <- as.data.table(df1)

# Add `id` by reference, computed once per distinct (ORIGIN, DESTINATION)
# pair: the two codes are sorted and joined with "-", so a route and its
# return leg end up with the same id.
DT[, `:=`(id = paste(sort(c(ORIGIN, DESTINATION)), collapse = "-")),
   by = c("ORIGIN", "DESTINATION")]
答案 3 :(得分:0)
SAS解决方案,使用PROC FORMAT。我在这里采取了比你可能需要的更多步骤 - 只需分出每一步,以明确我在做什么。这会为每对分配一个值,并将该对的两个方向分配给相同的值。
你也可以像R的解决方案那样,直接用标签本身作为分组定义:使用一个格式把'ABQ-DEN'或'DEN-ABQ'都转换成'ABQ-DEN'。做法与下面的代码基本相同,只是要在预处理(排序)的数据步中设置label = start,并且需要把INFORMAT改成FORMAT。
proc format;
  /* Illustration only: what the route informat looks like. Both
     directions of a pair map to the same value. This hand-written
     version is an example and NOT used in the final solution. */
  invalue $AIRRT
    'ABE-CLT', 'CLT-ABE' = 1
    'ABE-PHL', 'PHL-ABE' = 2
    'ABQ-DEN', 'DEN-ABQ' = 3
    'ABQ-ELP', 'ELP-ABQ' = 4
    'MDW-MCI'            = 5
  ;
  /* Number -> airport code; only used to create sample data. */
  value AIRPORT
    1 = "ABE"
    2 = "CLT"
    3 = "PHL"
    4 = "ABQ"
    5 = "ELP"
    6 = "DEN"
    7 = "MDW"
    8 = "MCI"
  ;
quit;
/* Create sample data: 100 random origin/destination pairs. */
data have;
  do _t = 1 to 100;
    /* AIRPORT is a numeric format, so it is referenced without a $ prefix
       (same as for destination below). */
    origin=put(ceil(8*ranuni(7)),AIRPORT.);
    /* Redraw until the destination differs from the origin. */
    do until (destination ne origin);
      destination=put(ceil(8*ranuni(7)),AIRPORT.);
    end;
    output;
  end;
run;
/* Preliminary data set for the format: put each pair of codes in sorted
   order and build the combined key "first-second" in START. */
data for_format_pre (keep=start origin destination);
  set have;
  call sortc(origin, destination);
  start = catx('-', origin, destination);
run;
/* Keep a single row per route key. */
proc sort data=for_format_pre nodupkey;
  by start;
run;
/* Final format control data set: every route gets a running group number
   in LABEL and is written twice, once per direction, so both spellings of
   the pair map to the same number. */
data for_format;
  set for_format_pre;
  fmtname = "AIRRT";
  type = 'j';
  label = _n_;
  output;
  start = catx('-', destination, origin);
  output;
run;
*import into formats: PROC FORMAT reads the for_format data set as its control (CNTLIN) input;
proc format cntlin=for_format;
quit;
/* Apply to the data set: build the "origin-destination" key for each row
   and read it through the AIRRT informat to obtain the route group.
   Variables whose names start with an underscore are dropped. */
data want (drop=_:);
  set have;
  combined = catx('-', origin, destination);
  group_sort = input(combined, $AIRRT.);
run;