使用条形图时设置因子水平的顺序

时间:2019-05-03 11:22:52

标签: r dataframe ggplot2

我正在尝试为一系列人口统计变量绘图。每个图按性别显示某个人口统计变量的频数分布。代码运行正常,但教育、婚姻状况和 SIC2007 这几个变量的标签是按字母顺序排列的,而不是按有意义的顺序排列。

数据结构

# Sample of the data, apparently dput() output: a 4-row tibble (row.names =
# c(NA, -4L), class "tbl_df").
# NOTE(review): the expression is not assigned to a name here; presumably it
# is the object the code further down refers to as `demographics` - confirm.
structure(list(DMSex = c("Male", "Female", "Male", "Male"), Income = c(980, 
-8, 3000, 120), IncCat = c("-1", "-8", "-1", "-1"), HrWkAc = c(-1, 
-1, -1, -1), ShiftWk = c(-1, -1, -1, -1), ShiftPat = c(-1, -1, 
-1, -1), SOC2010C = c("-1", "9.2.3.3", "-1", "-1"), XSOC2010 = c(-1, 
9233, -1, -1), IndexNo = c(-1, 1398, -1, -1), ES2010 = c(-1, 
7, -1, -1), nssec = c(-1, 13.4, -1, -1), SECFlag = c(-1, 0, -1, 
-1), LSOC2000 = c("-1", "9.2.3.3", "-1", "-1"), XSOC2000 = c(-1, 
9233, -1, -1), seg = c(-1, 11, -1, -1), sc = c(-1, 5, -1, -1), 
    SIC2007 = c(-1, 87, -1, -1), Educ = c(1, 1, -1, 2), EducCur = c(10, 
    1, -1, -1), FinFTEd = c(-1, -1, -1, 1), FinFTEdY = c(-1, 
    -1, -1, 21), HiQual = c(22, 10, -1, 1), sic20070 = c(-1, 
    87, -1, -1), dhhtype = c(6, 8, 7, 3), dagegrp = c(2, 3, 3, 
    3), dmarsta = c("Single, never married", "Single, never married", 
    "Interview not achieved", "Married/cohabitating"), dhiqual = c(" Secondary", 
    " A level or equivalent", "Item not applicable", "Degree or higher"
    ), dnssec8 = c(-1, 8, -1, -1), duresmc = c(14, 15, 11, 16
    ), dgorpaf = c(7, 8, 5, 10), dukcntr = c(1, 1, 1, 1), dnrkid04 = c(0, 
    0, 0, 0), dilodefr = c(3, 3, -1, 3), deconact = c(8, 8, -1, 
    11), dtenure = c(2, 3, 2, 3), dtotac = c(-1, -1, -1, -1), 
    dtotus = c(-1, -1, -1, -1), dsic = c("Item not applicable", 
    "Public admin, education and health", "Item not applicable", 
    "Item not applicable"), dsoc = c(-1, 9, -1, -1), DVAge_category = c("15 to 30", 
    "15 to 30", "15 to 30", "15 to 30"), Income_category = c("Less than 1000", 
    "Less than 1000", "1001 to 3000", "Less than 1000"), HoursWorked_category = c("Less than 20 hours", 
    "Less than 20 hours", "Less than 20 hours", "Less than 20 hours"
    )), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))

# Age variable ----
#
# Derives two age-band columns from the numeric `dagegrp` code:
#   * `dagegrp_category` - nested ifelse() banding with strict inequalities
#   * `DVAge_category`   - findInterval() banding; this is the one copied
#                          into `Age` for plotting
#
# Fixes:
#   * The first condition used to read `demographics$dagegrp_01`, a column
#     that does not appear in the sample data. `$` on a missing column gives
#     NULL, so the ifelse() test was zero-length and the result could not be
#     assigned as a column. It now reads `dagegrp` like the other conditions.
#   * The `age <- ...` assignments inside ifelse() were dropped; they only
#     left a stray global whose value depended on evaluation order.
#
# NOTE(review): the strict bounds send codes 6, 9, 12 and 15 (and anything
# <= 2 or >= 18) to "zombie", which disagrees with the findInterval() breaks
# below - confirm which banding is intended.
demographics$dagegrp_category <- ifelse(
  demographics$dagegrp > 2 & demographics$dagegrp < 6, "15 to 30",
  ifelse(
    demographics$dagegrp > 6 & demographics$dagegrp < 9, "31 to 45",
    ifelse(
      demographics$dagegrp > 9 & demographics$dagegrp < 12, "46 to 60",
      ifelse(
        demographics$dagegrp > 12 & demographics$dagegrp < 15, "61 to 75",
        ifelse(
          demographics$dagegrp > 15 & demographics$dagegrp < 18, "76+",
          "zombie"
        )
      )
    )
  )
)

# findInterval() returns the index of the left-closed interval each code
# falls into; that index picks the matching label.
demographics$DVAge_category <- c(
  "15 to 30", "31 to 45", "46 to 60", "61 to 75", "76+", "zombie"
)[findInterval(demographics$dagegrp, c(-Inf, 6, 10, 12, 15, 18, Inf))]
Age <- as.vector(demographics$DVAge_category)

# Gender variable ----
# Replace entries equal to 1 with "Male" and entries equal to 2 with
# "Female", then keep a plain-vector copy for the plotting data frame.
demographics$DMSex <- replace(
  demographics$DMSex, demographics$DMSex == 1, "Male"
)
demographics$DMSex <- replace(
  demographics$DMSex, demographics$DMSex == 2, "Female"
)

Gender <- as.vector(demographics$DMSex)

# Income variable ----
#
# Band `Income` into labelled ranges. findInterval() returns the index of the
# left-closed interval each value falls into ([-Inf, 1001), [1001, 3001),
# [3001, 6001), [6001, 10001), [10001, Inf)); that index picks the label.
#
# Fix: a nested ifelse() used to write to `Income_category` immediately
# before this assignment. Its result was overwritten straight away, and its
# strict bounds disagreed with the breaks below (3001 and 6001 fell through
# to "zombie"), so it has been removed. The resulting column is unchanged.
#
# NOTE(review): negative values (the sample data holds -8) land in
# "Less than 1000", as before - confirm whether such codes should be treated
# as missing instead.
demographics$Income_category <- c(
  "Less than 1000", "1001 to 3000", "3001 to 6000", "6001 to 10000", "zombie"
)[findInterval(demographics$Income, c(-Inf, 1001, 3001, 6001, 10001, Inf))]

Income <- as.vector(demographics$Income_category)

# Marital status variable ----
# Replace the codes -1, 1, 2 and 3 in `dmarsta` with their labels, one code
# at a time, then keep a plain-vector copy for the plotting data frame.
demographics$dmarsta <- replace(
  demographics$dmarsta, demographics$dmarsta == -1, "Interview not achieved"
)
demographics$dmarsta <- replace(
  demographics$dmarsta, demographics$dmarsta == 1, "Single, never married"
)
demographics$dmarsta <- replace(
  demographics$dmarsta, demographics$dmarsta == 2, "Married/cohabitating"
)
demographics$dmarsta <- replace(
  demographics$dmarsta, demographics$dmarsta == 3, "Divorced/widowed"
)

MaritalStatus <- as.vector(demographics$dmarsta)

# Education ----
# Replace the codes -8, -1 and 1 to 5 in `dhiqual` with their labels, one
# code at a time. Some labels carry a leading space; they are reproduced
# exactly as they were.
demographics$dhiqual <- replace(
  demographics$dhiqual, demographics$dhiqual == -8, "Don't know"
)
demographics$dhiqual <- replace(
  demographics$dhiqual, demographics$dhiqual == -1, "Item not applicable"
)
demographics$dhiqual <- replace(
  demographics$dhiqual, demographics$dhiqual == 1, "Degree or higher"
)
demographics$dhiqual <- replace(
  demographics$dhiqual, demographics$dhiqual == 2, "Higher education"
)
demographics$dhiqual <- replace(
  demographics$dhiqual, demographics$dhiqual == 3, " A level or equivalent"
)
demographics$dhiqual <- replace(
  demographics$dhiqual, demographics$dhiqual == 4, " Secondary"
)
demographics$dhiqual <- replace(
  demographics$dhiqual, demographics$dhiqual == 5, " Other"
)

Education <- as.vector(demographics$dhiqual)


# Hours worked per week in main job variable ----
#
# Band `dtotac` into labelled ranges. findInterval() returns the index of the
# left-closed interval each value falls into ([-Inf, 21), [21, 41), [41, 61),
# [61, Inf)); that index picks the label.
#
# Fixes:
#   * The breaks used to be c(-Inf, 21, 41, 61, 62, Inf), so only values in
#     [61, 62) were labelled "More than 61 hours" and everything from 62
#     upwards was labelled "Not Applicable". The top band is now open-ended.
#   * A nested ifelse() used to write to `HoursWorked_category` immediately
#     before this assignment; its result was overwritten straight away, so it
#     has been removed.
#
# NOTE(review): negative values (the sample data holds -1 in `dtotac`) land
# in "Less than 20 hours", as before - confirm whether such codes should be
# labelled "Not Applicable" instead.
demographics$HoursWorked_category <- c(
  "Less than 20 hours", "Between 21 to 40 hours", "Between 41 to 60 hours",
  "More than 61 hours"
)[findInterval(demographics$dtotac, c(-Inf, 21, 41, 61, Inf))]

WorkHours <- as.vector(demographics$HoursWorked_category)

# DV: SIC 2007 industry divisions (grouped) ----
# Replace the codes -8, -1 and 1 to 9 in `dsic` with their labels, one code
# at a time.
demographics$dsic[demographics$dsic == -8] <- "Don't know"
demographics$dsic[demographics$dsic == -1] <- "Item not applicable"
demographics$dsic[demographics$dsic == 1] <- "Agriculture, forestry and fishing"
demographics$dsic[demographics$dsic == 2] <- "Manufacturing"
demographics$dsic[demographics$dsic == 3] <- "Energy and water supply"
demographics$dsic[demographics$dsic == 4] <- "Construction"
demographics$dsic[demographics$dsic == 5] <- "Distribution, hotels and restaurants"
demographics$dsic[demographics$dsic == 6] <- "Transport and communication"
demographics$dsic[demographics$dsic == 7] <- "Banking and finances"
demographics$dsic[demographics$dsic == 8] <- "Public admin, education and health"
demographics$dsic[demographics$dsic == 9] <- "Other services"

# Fix: the 11 labels used to be assigned directly as the `industry_category`
# column, which fails unless the data happens to have a matching number of
# rows. The column is now a factor of `dsic` whose levels are those labels in
# code order. The last label also read "Other service", which did not match
# the recoded value "Other services" above.
# NOTE(review): this assumes the label vector was meant to define the
# category order - confirm.
demographics$industry_category <- factor(
  demographics$dsic,
  levels = c(
    "Don't know", "Item not applicable", "Agriculture, forestry and fishing",
    "Manufacturing", "Energy and water supply", "Construction",
    "Distribution, hotels and restaurants", "Transport and communication",
    "Banking and finances", "Public admin, education and health",
    "Other services"
  )
)

SIC2007 <- as.vector(demographics$dsic)


# Build the plotting data frame ----
# One column per vector prepared above.
df <- data.frame(
  Gender, Age, Education, MaritalStatus, Income, WorkHours, SIC2007
)

df %>%
  # Reshape to long form: every column except Gender becomes a
  # (variable, value) pair.
  gather(key = variable, value = value, -Gender) %>%
  # Group by value, variable and then Gender.
  group_by(value, variable, Gender) %>%
  # Count the rows in each group to get the cell frequencies.
  summarise(freq = n()) %>%
  # Plot: dodged bars per Gender, one facet per variable.
  ggplot(aes(x = value, y = freq, group = Gender)) +
  geom_bar(aes(fill = Gender), stat = "identity", position = "dodge") +
  facet_wrap(~variable, scales = "free_x") +
  theme(
    legend.position = "right",
    axis.text.x = element_text(angle = 60, hjust = 1)
  ) +
  labs(x = "Characteristics", y = "Frequencies")

Output

1 个答案:

答案 0 :(得分:1)

在ggplot2中,数据是根据data.frame列的因子级别排序的。 要(重新)设置图中的顺序,只需通过以下方式设置因子的顺序:

# Template only: `...` stands for the level labels, listed in the order they
# should appear in the plot; it must be filled in before this line can run.
df$variable <- factor(df$variable, levels = c(...))

您可以先把用于绘图的 data.frame 保存下来,手动设置需要调整的变量的因子水平,然后再将其传给 ggplot 函数。这样做可能不够高效,但应该可以解决问题:

## Make your plotting data.frame.
## ungroup() drops the grouping left over from summarise(), so the `value`
## column can be replaced below without stale group metadata.
df2 <- df %>%
  gather(variable, value, -c(Gender)) %>%
  group_by(value, variable, Gender) %>%
  summarise(freq = n()) %>%
  ungroup()

## Apply custom order to the MaritalStatus labels.
## Fixes:
##   * The order is applied to `df2$value` (the category labels on the
##     x axis). It was previously applied to `df2$variable`, which holds the
##     names of the gathered columns, so none of its entries matched.
##   * `levels()` of a character column is NULL; the remaining labels are now
##     collected with unique()/setdiff(), so they are kept instead of turning
##     into NA.
##   * The labels are spelled out instead of being picked by position from
##     sort(unique(MaritalStatus)), which yields NA when a status is absent
##     from the data.
custom <- c(
  "Single, never married", "Married/cohabitating",
  "Divorced/widowed", "Interview not achieved"
  # append the labels of other variables here, in the order they should appear
)
other_values <- setdiff(unique(as.character(df2$value)), custom)
df2$value <- factor(df2$value, levels = c(other_values, custom))