DATE obs1 obs2 obs3
1981-01-01 2032.409 3142.46 1741.143
1981-01-02 2023.687 3870.04 1735.256
1981-01-03 2014.274 4126.25 1728.556
1981-01-04 2005.795 2615.91 1722.985
1981-01-05 2000.674 2940.83 1722.317
1981-01-06 1998.477 3258.69 1723.937
1981-01-07 1997.014 3371.6 1724.104
1981-01-08 1995.576 3184.13 1722.624
1981-01-09 1993.706 3540.76 1719.592
1981-01-10 1991.286 3312.43 1715.156
1981-01-11 1988.633 3028.65 1710.141
1981-01-12 1986.147 3212.79 1705.183
1981-01-13 1984.229 3193.23 1700.789
1981-01-14 1982.756 3294.52 1697.785
1981-01-15 1981.548 3553.78 1696.068
1981-01-16 1980.561 3492.28 1694.544
1981-01-17 1979.792 2452.09 1692.36
1981-01-18 1979.224 1873.82 1689.525
1981-01-19 1978.845 3218.28 1686.452
如上图所示,我需要在R中绘制季节性(冬季,春季,夏季和秋季)框图,以获取每日数据。我有10年以上格式的数据用于不同的电台。情节应该在一个图中,每个季节都有多个箱形图。
答案 0 :(得分:2)
使用tidyverse
和lubridate
的解决方案。 tidyverse
包括dplyr
和tidyr
执行数据争论,ggplot2
创建情节。 lubridate
用于处理数据框中的日期。
由于您提供的数据集不是很有用,因为它仅包含1月份的一些记录,因此无法创建显示季节性差异的箱线图,因此我决定创建一个新的示例数据框。我的示例数据框的结构类似于您的数据集,它应该为您提供一些提示作为您现实世界问题的起点。
# Set the seed for reproducibility
set.seed(123)
library(tidyverse)
library(lubridate)
# Create example data frame
dt <- data_frame(DATE = seq(ymd("1980-01-01"), ymd("1989-12-31"), by = 1)) %>%
mutate(obs1 = rnorm(nrow(.), mean = 0, sd = 1),
obs2 = rnorm(nrow(.), mean = 1, sd = 2),
obs3 = rnorm(nrow(.), mean = 2, sd = 3))
head(dt)
# # A tibble: 6 x 4
# DATE obs1 obs2 obs3
# <date> <dbl> <dbl> <dbl>
# 1 1980-01-01 -0.56047565 0.7874145 2.7827006
# 2 1980-01-02 -0.23017749 0.1517417 8.5720252
# 3 1980-01-03 1.55870831 0.7193725 1.3293478
# 4 1980-01-04 0.07050839 0.5454177 0.3253155
# 5 1980-01-05 0.12928774 1.4101239 6.1245771
# 6 1980-01-06 1.71506499 -0.6491910 4.5034395
tail(dt)
# # A tibble: 6 x 4
# DATE obs1 obs2 obs3
# <date> <dbl> <dbl> <dbl>
# 1 1989-12-26 -0.3629796 0.6750946 0.8586325
# 2 1989-12-27 0.1102218 2.8572337 9.8541328
# 3 1989-12-28 -0.2700741 1.7614026 1.9109596
# 4 1989-12-29 0.6920973 0.5275611 -0.4756240
# 5 1989-12-30 0.9282803 1.3811225 1.5222535
# 6 1989-12-31 0.5931301 -1.6638739 4.1157087
示例数据框包含10年的记录,包含3个观察组。每列的值均为正态分布,具有不同的均值和标准差。
第一步是通过将数据集从宽格式转换为长格式来处理数据框,并添加一个显示季节信息的列。
dt2 <- dt %>%
# Convert data frame from lwide format to long format
gather(Observation, Value, -DATE) %>%
# Remove "obs" in the Observation column
mutate(Observation = str_replace(Observation, "obs", "")) %>%
# Convert the DATE column to date class
mutate(DATE = ymd(DATE)) %>%
# Create Month column
mutate(Month = month(DATE)) %>%
# Create Season column
mutate(Season = case_when(
Month %in% c(12, 1, 2) ~ "winter",
Month %in% c(3, 4, 5) ~ "spring",
Month %in% c(6, 7, 8) ~ "summer",
Month %in% c(9, 10, 11) ~ "fall",
TRUE ~ NA_character_
))
之后,我们可以使用ggplot2
创建箱线图。请注意,我使用stat_summary
为每个组添加一条红线来表示平均值。
# Create a boxplot using ggplot2
# Specify the aesthetics
ggplot(dt2, aes(x = Season, y = Value, fill = Observation)) +
# Specify the geom to be boxplot
geom_boxplot() +
# Add a red line to the mean
stat_summary(aes(ymax = ..y.., ymin = ..y..),
fun.y = "mean",
geom = "errorbar", # Use geom_errorbar to add line as mean
color = "red",
width = 0.7,
position = position_dodge(width = 0.75), # Add the line to each group
show.legend = FALSE)
答案 1 :(得分:2)
好的......第一步是建立一个可以检测给定日期下降的季节的函数。幸运的是,我很久以前就已经开发出了可以处理南半球季节的方法(相反)。
该函数没有实现任何健全性检查,因为我在已经清理过的数据集中使用它,但是最终实现一些并不困难(除非您决定在使用之前清理数据集)。它以矢量化方式工作,以最大化Matlab中的计算性能。
就是这样:
function season = GetSeason(date,southern_hemisphere)
if (nargin == 1)
southern_hemisphere = false;
end
[~,month,day] = datevec(date);
offset = month + (day / 100);
winter = (offset < 3.21) | (offset >= 12.22);
spring = ~winter & (offset < 6.21);
summer = ~winter & ~spring & (offset < 9.23);
autumn = ~winter & ~spring & ~summer;
offset(spring) = 0;
offset(summer) = 1;
offset(autumn) = 2;
offset(winter) = 3;
if (southern_hemisphere)
offset = offset + 2;
end
season = mod(offset,4) + 1;
end
现在,您的脚本中的第一步是从数据集文件中提取您的观察结果。为了为您创建一个完整的演示,我创建了一个Excel
数据集。但您也可以使用CSV
数据集,而Matlab处理的代码或其他文件格式几乎没有变化:
% detect the dataset columns format
opts = detectImportOptions('data.xlsx');
% impose a specific format for the dataset columns
opts = setvartype(opts,{'datetime' 'double' 'double' 'double'});
% extract data in a table variable
data = readtable('data.xlsx',opts);
% sanitize the table variable removing the rows with missing or invalid values
data = rmmissing(data);
% sort the table variable rows by date (default first rows, default ascending)
data = sortrows(data);
第二项测试包括获得观察日期的相应季节:
seasons = GetSeason(data.Date);
第三步,假设我们仅针对名为Obs1
的第一列观察结果执行所有此过程:
spring_1 = data.Obs1(seasons == 1);
summer_1 = data.Obs1(seasons == 2);
autumn_1 = data.Obs1(seasons == 3);
winter_1 = data.Obs1(seasons == 4);
第四步也是最后一步是在一个图中绘制每个季节的一个箱图(分组变量groups
必须作为参数传递到boxplot
函数中,以便知道后者如何它必须绘制和使用哪些值的许多框):
groups = [
ones(size(spring_1));
2 * ones(size(summer_1));
3 * ones(size(autumn_1));
4 * ones(size(winter_1));
];
figure();
boxplot([spring_1; summer_1; autumn_1; winter_1],groups);
set(gca,'XTickLabel',{'Spring' 'Summer' 'Autumn' 'Winter'});
结果如下:
更新所有观察的完整工作代码
opts = detectImportOptions('data.xlsx');
opts = setvartype(opts,{'datetime' 'double' 'double' 'double'});
data = readtable('data.xlsx',opts);
data = rmmissing(data);
data = sortrows(data);
seasons = GetSeason(data.Date);
spring_1 = data.Obs1(seasons == 1);
summer_1 = data.Obs1(seasons == 2);
autumn_1 = data.Obs1(seasons == 3);
winter_1 = data.Obs1(seasons == 4);
spring_2 = data.Obs2(seasons == 1);
summer_2 = data.Obs2(seasons == 2);
autumn_2 = data.Obs2(seasons == 3);
winter_2 = data.Obs2(seasons == 4);
spring_3 = data.Obs3(seasons == 1);
summer_3 = data.Obs3(seasons == 2);
autumn_3 = data.Obs3(seasons == 3);
winter_3 = data.Obs3(seasons == 4);
plot_data = [
spring_1;
summer_1;
autumn_1;
winter_1;
spring_2;
summer_2;
autumn_2;
winter_2;
spring_3;
summer_3;
autumn_3;
winter_3
];
plot_groups = [
(1 * ones(size(spring_1))) (1 * ones(size(spring_1)));
(1 * ones(size(summer_1))) (2 * ones(size(summer_1)));
(1 * ones(size(autumn_1))) (3 * ones(size(autumn_1)));
(1 * ones(size(winter_1))) (4 * ones(size(winter_1)));
(2 * ones(size(spring_2))) (5 * ones(size(spring_2)));
(2 * ones(size(summer_2))) (6 * ones(size(summer_2)));
(2 * ones(size(autumn_2))) (7 * ones(size(autumn_2)));
(2 * ones(size(winter_2))) (8 * ones(size(winter_2)));
(3 * ones(size(spring_3))) (9 * ones(size(spring_3)));
(3 * ones(size(summer_3))) (10 * ones(size(summer_3)));
(3 * ones(size(autumn_3))) (11 * ones(size(autumn_3)));
(3 * ones(size(winter_3))) (12 * ones(size(winter_3)))
];
labels_obs = {'' '' '' '' '' '' '' '' '' '' '' ''};
labels_season = repmat({'Spring' 'Summer' 'Autumn' 'Winter'},1,3);
figure('Units','normalized','Position',[0.05 0.1 0.9 0.8]);
boxplot(plot_data,plot_groups, ...
'BoxStyle','outline', ...
'FactorGap',[5 1], ...
'Labels',{labels_obs; labels_season}, ...
'Notch','on');
colors = repmat('wcyg',1,3);
h = findobj(gca,'Tag','Box');
for i = 1:numel(h)
patch(get(h(i),'XData'),get(h(i),'YData'),colors(i),'FaceAlpha',0.5);
end
h = findall(allchild(findall(gca,'Type','hggroup')),'Type','text','String','');
positions = cell2mat(get(h,'pos'));
positions_new = num2cell([mean(reshape(positions(:,1),4,[]))' positions(1:4:end,2:end)],2);
set(h(1:4:end),{'Position'},positions_new,{'String'},{'Observations 3'; 'Observations 2'; 'Observations 1'})
h = findall(allchild(findall(gca,'Type','hggroup')),'Type','text','String','');
delete(h);
结果: