我正在尝试绘制一系列人口统计因素。每个图都按性别显示某个人口统计变量的频率分布。代码运行正常,但是有些变量(教育、婚姻状况和 SIC2007)的标签是按字母顺序排列的,而不是按有意义的顺序排列的。
数据结构
# Sample data: a structure() literal (the form dput() prints) describing a
# four-row tibble (class "tbl_df"). It is not assigned to a name here; the
# script that follows reads a data frame called `demographics` -- presumably
# this object, TODO confirm.
structure(list(DMSex = c("Male", "Female", "Male", "Male"), Income = c(980,
-8, 3000, 120), IncCat = c("-1", "-8", "-1", "-1"), HrWkAc = c(-1,
-1, -1, -1), ShiftWk = c(-1, -1, -1, -1), ShiftPat = c(-1, -1,
-1, -1), SOC2010C = c("-1", "9.2.3.3", "-1", "-1"), XSOC2010 = c(-1,
9233, -1, -1), IndexNo = c(-1, 1398, -1, -1), ES2010 = c(-1,
7, -1, -1), nssec = c(-1, 13.4, -1, -1), SECFlag = c(-1, 0, -1,
-1), LSOC2000 = c("-1", "9.2.3.3", "-1", "-1"), XSOC2000 = c(-1,
9233, -1, -1), seg = c(-1, 11, -1, -1), sc = c(-1, 5, -1, -1),
SIC2007 = c(-1, 87, -1, -1), Educ = c(1, 1, -1, 2), EducCur = c(10,
1, -1, -1), FinFTEd = c(-1, -1, -1, 1), FinFTEdY = c(-1,
-1, -1, 21), HiQual = c(22, 10, -1, 1), sic20070 = c(-1,
87, -1, -1), dhhtype = c(6, 8, 7, 3), dagegrp = c(2, 3, 3,
3), dmarsta = c("Single, never married", "Single, never married",
"Interview not achieved", "Married/cohabitating"), dhiqual = c(" Secondary",
" A level or equivalent", "Item not applicable", "Degree or higher"
), dnssec8 = c(-1, 8, -1, -1), duresmc = c(14, 15, 11, 16
), dgorpaf = c(7, 8, 5, 10), dukcntr = c(1, 1, 1, 1), dnrkid04 = c(0,
0, 0, 0), dilodefr = c(3, 3, -1, 3), deconact = c(8, 8, -1,
11), dtenure = c(2, 3, 2, 3), dtotac = c(-1, -1, -1, -1),
dtotus = c(-1, -1, -1, -1), dsic = c("Item not applicable",
"Public admin, education and health", "Item not applicable",
"Item not applicable"), dsoc = c(-1, 9, -1, -1), DVAge_category = c("15 to 30",
"15 to 30", "15 to 30", "15 to 30"), Income_category = c("Less than 1000",
"Less than 1000", "1001 to 3000", "Less than 1000"), HoursWorked_category = c("Less than 20 hours",
"Less than 20 hours", "Less than 20 hours", "Less than 20 hours"
)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
# Age variable
# Bin the `dagegrp` codes into age-band labels. findInterval() returns, for
# each code, the index of the half-open interval [lower, upper) of
# `age_breaks` that contains it; that index then selects the label.
# NA codes stay NA.
age_breaks <- c(-Inf, 6, 10, 12, 15, 18, Inf)
age_labels <- c(
  "15 to 30", "31 to 45", "46 to 60", "61 to 75", "76+", "zombie"
)
demographics$DVAge_category <- age_labels[
  findInterval(demographics$dagegrp, age_breaks)
]
# Plain character vector used to build the plotting data frame.
Age <- as.vector(demographics$DVAge_category)
# Gender variable
# Relabel the sex codes: 1 becomes "Male", 2 becomes "Female". Values other
# than 1 and 2 are not relabelled.
sex <- demographics$DMSex
sex[sex == 1] <- "Male"
sex[sex == 2] <- "Female"
demographics$DMSex <- sex
# Plain vector used to build the plotting data frame.
Gender <- as.vector(demographics$DMSex)
# Income variable
# Bin `Income` into labelled bands. findInterval() returns the index of the
# half-open interval [lower, upper) of `income_breaks` containing each value,
# and that index selects the label. NA incomes stay NA.
# NOTE(review): negative values (the sample data contains -8) fall into the
# first band and are labelled "Less than 1000" -- confirm this is intended.
income_breaks <- c(-Inf, 1001, 3001, 6001, 10001, Inf)
income_labels <- c(
  "Less than 1000", "1001 to 3000", "3001 to 6000", "6001 to 10000", "zombie"
)
demographics$Income_category <- income_labels[
  findInterval(demographics$Income, income_breaks)
]
# Plain character vector used to build the plotting data frame.
Income <- as.vector(demographics$Income_category)
# Marital status variable
# Code -> label lookup, applied in the listed order. Each pass overwrites the
# entries of `dmarsta` equal to one code with its label; entries matching no
# code are not relabelled.
marital_labels <- c(
  "-1" = "Interview not achieved",
  "1" = "Single, never married",
  "2" = "Married/cohabitating",
  "3" = "Divorced/widowed"
)
for (code in names(marital_labels)) {
  is_code <- demographics$dmarsta == code
  demographics$dmarsta[is_code] <- marital_labels[[code]]
}
# Plain vector used to build the plotting data frame.
MaritalStatus <- as.vector(demographics$dmarsta)
# Education
# Code -> label lookup for `dhiqual`, applied in the listed order. Each pass
# overwrites the entries equal to one code with its label; entries matching no
# code are not relabelled. Three labels start with a space; they are kept
# exactly as written because they are the values stored in the data.
education_labels <- c(
  "-8" = "Don't know",
  "-1" = "Item not applicable",
  "1" = "Degree or higher",
  "2" = "Higher education",
  "3" = " A level or equivalent",
  "4" = " Secondary",
  "5" = " Other"
)
for (code in names(education_labels)) {
  is_code <- demographics$dhiqual == code
  demographics$dhiqual[is_code] <- education_labels[[code]]
}
# Plain vector used to build the plotting data frame.
Education <- as.vector(demographics$dhiqual)
# Hours worked per week in main job variable
# Bin `dtotac` into labelled bands. findInterval() returns the index of the
# half-open interval [lower, upper) of `hours_breaks` containing each value,
# and that index selects the label. NA hours stay NA.
#   below 0    -> "Not Applicable"
#   [0, 21)    -> "Less than 20 hours"
#   [21, 41)   -> "Between 21 to 40 hours"
#   [41, 61)   -> "Between 41 to 60 hours"
#   61 and up  -> "More than 61 hours"
# Every value of 61 or more gets the top band, so long working weeks are not
# reported as "Not Applicable".
# NOTE(review): negative values are treated as missing-value codes (the sample
# data holds -1 for every row) -- confirm against the survey codebook.
hours_breaks <- c(-Inf, 0, 21, 41, 61, Inf)
hours_labels <- c(
  "Not Applicable", "Less than 20 hours", "Between 21 to 40 hours",
  "Between 41 to 60 hours", "More than 61 hours"
)
demographics$HoursWorked_category <- hours_labels[
  findInterval(demographics$dtotac, hours_breaks)
]
# Plain character vector used to build the plotting data frame.
WorkHours <- as.vector(demographics$HoursWorked_category)
# DV: SIC 2007 industry divisions (grouped)
# Code -> label lookup for `dsic`, applied in the listed order. Each pass
# overwrites the entries equal to one code with its label; entries matching no
# code are not relabelled.
industry_labels <- c(
  "-8" = "Don't know",
  "-1" = "Item not applicable",
  "1" = "Agriculture, forestry and fishing",
  "2" = "Manufacturing",
  "3" = "Energy and water supply",
  "4" = "Construction",
  "5" = "Distribution, hotels and restaurants",
  "6" = "Transport and communication",
  "7" = "Banking and finances",
  "8" = "Public admin, education and health",
  "9" = "Other services"
)
for (code in names(industry_labels)) {
  is_code <- demographics$dsic == code
  demographics$dsic[is_code] <- industry_labels[[code]]
}
# One entry per row: the industry label as a factor whose levels follow the
# code order above rather than alphabetical order. A column must have one
# value per row, so the label list itself cannot be stored as the column.
# Entries of `dsic` that match none of the labels become NA here.
demographics$industry_category <- factor(
  demographics$dsic,
  levels = unname(industry_labels)
)
# Plain character vector used to build the plotting data frame.
SIC2007 <- as.vector(demographics$dsic)
# creating df
df <- data.frame(
  Gender, Age, Education, MaritalStatus, Income, WorkHours, SIC2007
)

# Display order for the x-axis labels. All facets share one discrete x scale,
# so a single ordering covers every variable; each facet only shows the labels
# that occur in it (scales = "free_x"). Labels used by several variables
# ("zombie", "Don't know", "Item not applicable") are placed after the labels
# of the variables that use them, so they are drawn last in their facets.
value_order <- c(
  # Age
  "15 to 30", "31 to 45", "46 to 60", "61 to 75", "76+",
  # Income
  "Less than 1000", "1001 to 3000", "3001 to 6000", "6001 to 10000",
  "zombie",
  # WorkHours
  "Less than 20 hours", "Between 21 to 40 hours", "Between 41 to 60 hours",
  "More than 61 hours", "Not Applicable",
  # MaritalStatus
  "Single, never married", "Married/cohabitating", "Divorced/widowed",
  "Interview not achieved",
  # Education (the leading spaces match the stored labels)
  "Degree or higher", "Higher education", " A level or equivalent",
  " Secondary", " Other",
  # SIC2007
  "Agriculture, forestry and fishing", "Manufacturing",
  "Energy and water supply", "Construction",
  "Distribution, hotels and restaurants", "Transport and communication",
  "Banking and finances", "Public admin, education and health",
  "Other services",
  # Missing-value labels shared by Education and SIC2007
  "Don't know", "Item not applicable"
)

df %>%
  # tidy: reshape every column except Gender into variable/value pairs
  gather(variable, value, -c(Gender)) %>%
  # ggplot2 orders a discrete axis by factor levels and sorts plain character
  # values alphabetically, so turn the labels into a factor with the order
  # above. Labels missing from `value_order` are appended alphabetically
  # instead of being turned into NA.
  mutate(
    value = factor(
      value,
      levels = c(value_order, sort(setdiff(value, value_order)))
    )
  ) %>%
  # group by value, variable, then Gender
  group_by(value, variable, Gender) %>%
  # summarise to get the table cell frequencies
  summarise(freq = n()) %>%
  # Plot
  ggplot(aes(x = value, y = freq, group = Gender)) +
  geom_bar(aes(fill = Gender), stat = "identity", position = "dodge") +
  facet_wrap(~variable, scales = "free_x") +
  theme(
    legend.position = "right",
    axis.text.x = element_text(angle = 60, hjust = 1)
  ) +
  labs(x = "Characteristics", y = "Frequencies")
答案 0 :(得分:1)
在 ggplot2 中,离散坐标轴上的标签是按照 data.frame 中对应列的因子水平(levels)排序的。要(重新)设置图中的顺序,只需按如下方式设置因子水平的顺序:
# Template: replace `...` with the level names, listed in the order they
# should appear in the plot.
df$variable <- factor(df$variable, levels = c(...))
您可以先把整理好的 data.frame 保存下来(而不是直接传给 ggplot 函数),手动设置需要调整的变量的因子水平,然后再绘图。这样做可能效率不高,但应该可以解决问题:
## Make your plotting data.frame
df2 <- df %>%
  gather(variable, value, -c(Gender)) %>%
  group_by(value, variable, Gender) %>%
  summarise(freq = n()) %>%
  # drop the grouping so the `value` column can be replaced afterwards
  ungroup()
## Apply custom order to the MaritalStatus labels.
## The labels are stored in the `value` column (`variable` only holds the
## facet names), so that is the column whose levels are set. To order the
## labels of another variable as well, append them to `custom` in the order
## they should be drawn.
custom <- c(
  "Single, never married", "Married/cohabitating",
  "Divorced/widowed", "Interview not achieved"
)
## Labels not listed in `custom` keep their alphabetical order and come first;
## the custom-ordered labels follow.
other_values <- setdiff(sort(unique(df2$value)), custom)
df2$value <- factor(df2$value, levels = c(other_values, custom))