欧盟委员会收集并公布欧洲每周平均运输燃料价格。但是,他们发布数据的方式很不便于处理。点击此链接可以下载 .xls 格式的数据并查看。
我曾发电子邮件询问委员会能否以类似 tidy(整洁数据)的结构发布数据,但他们回复说无意这样做。
是否有人设法以编程方式(programmatically)将此类数据导入 R,或者在导入 R 之前先将其整理好?例如第一个工作表:"价格和税率,每个点击率"。
我通常在Excel中手动将国家/地区表格堆叠在一起,并进行一些其他转换,并将更整洁的数据保存为.csv,然后将其导入R.
提前谢谢。
请注意,数据是按国家/地区提供的。
答案 0 :(得分:1)
# Download the Oil Bulletin price-history workbook and load the first
# worksheet into `dt.body`: a data frame with a Date column followed by
# five numeric columns (exchange rate and four fuel prices).
library(XLConnect)

url <- "http://ec.europa.eu/energy/observatory/reports/Oil_Bulletin_Prices_History.xls"
xls_path <- "data/PriceHistory.xls"

# Worksheet layout constants; adjust these when the workbook changes.
first_body_row <- 7      # first row of `dt` that is kept
last_sheet_row <- 14992  # last worksheet row read, and last row index kept

# download.file() does not create missing directories.
dir.create(dirname(xls_path), showWarnings = FALSE, recursive = TRUE)
# An .xls workbook is binary; without mode = "wb" the download is corrupted
# on Windows, where the default mode is text.
download.file(url = url, destfile = xls_path, mode = "wb")

wb <- XLConnect::loadWorkbook(xls_path)
dt <- readWorksheet(wb, sheet = 1, startRow = 1, endRow = last_sheet_row)

# Keep columns 2 to 7 and give them readable names.
dt.body <- dt[first_body_row:last_sheet_row, 2:7]
dt.head <- c("Date", "ExcRate", "EuroSuper", "GasOilAuto", "GasOilHeat",
             "FuelOil")
colnames(dt.body) <- dt.head

# Keep only rows whose Date text has the same width as the first row's;
# this filters out the per-country header rows embedded in the sheet.
dt.body <- dt.body[nchar(dt.body$Date) == nchar(dt.body$Date[1]), ]
dt.body[["Date"]] <- as.Date(dt.body[["Date"]], format = "%Y-%m-%d %H:%M:%S")

# Every column except Date holds numbers stored as text.
value_cols <- 2:ncol(dt.body)
dt.body[value_cols] <- lapply(dt.body[value_cols], as.numeric)
如果您愿意,可以删除含有 NA 的行,但请留意 ExcRate 列:
# Optional: uncomment to drop every row that contains an NA.
# dt.body <- na.omit(dt.body)

# Reset the row names to the default 1..n sequence.
row.names(dt.body) <- NULL

# Wrap the data frame as a dplyr data-table tbl, then display it.
dt.body <- dplyr::tbl_dt(dt.body)
dt.body
结果如下(执行 na.omit 之后):
Source: local data table [11,054 x 6]
Date ExcRate EuroSuper GasOilAuto GasOilHeat FuelOil
(date) (dbl) (dbl) (dbl) (dbl) (dbl)
1 2016-03-07 1 371.64 387.86 358.08 176
2 2016-02-29 1 370.81 382.03 344.54 171
3 2016-02-22 1 375.81 384.53 339.44 171
4 2016-02-15 1 369.97 372.03 331.05 171
5 2016-02-08 1 382.47 380.36 337.09 178
6 2016-02-01 1 387.47 376.19 337.01 172
7 2016-01-25 1 384.97 370.36 321.67 146
8 2016-01-18 1 392.47 385.36 329.30 165
9 2016-01-11 1 413.31 407.86 346.26 175
10 2016-01-04 1 418.31 412.86 363.64 176
.. ... ... ... ... ... ...
R version 3.2.4 (2016-03-10)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.3 (El Capitan)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] MASS_7.3-45 ggplot2_2.0.0 dplyr_0.4.3 data.table_1.9.7
loaded via a namespace (and not attached):
[1] Rcpp_0.12.3 digest_0.6.9 assertthat_0.1 chron_2.3-47
[5] grid_3.2.4 R6_2.1.2 plyr_1.8.3 gtable_0.1.2
[9] DBI_0.3.1 magrittr_1.5 scales_0.3.0 pls_2.5-0
[13] labeling_0.3 tools_3.2.4 munsell_0.4.3 parallel_3.2.4
[17] colorspace_1.2-6
答案 1 :(得分:0)
您可以使用TheRimalaya的代码(如有必要,可以更改下载目的地)
然后添加此项以使导入的数据整洁:
library(dplyr)
library(tidyr)
library(ggplot2)

# Reshape `dt.body` to long format: the price columns (third column onward)
# are stacked into a `fueltype` / `price` pair, while Date and ExcRate stay
# as identifier columns. Only rows with ExcRate == 1 are kept.
# NOTE(review): ExcRate == 1 presumably selects euro-denominated rows; confirm.
dt.body.tidy <- dt.body %>%
  gather(fueltype, price, 3:ncol(dt.body), -ExcRate) %>%
  filter(ExcRate == 1)

# One panel per fuel type: grey line for the raw prices, thin red line for
# the median price at each date.
ggplot(dt.body.tidy, aes(Date, price)) +
  geom_line(color = "darkgrey") +
  geom_line(stat = "summary", fun.y = "median", color = "red", size = 0.3) +
  facet_wrap(~fueltype, ncol = 1)
请注意,该代码只导入第一个工作表,并且 startRow 和 endRow 的值是硬编码的,不过您可以根据需要调整代码。
答案 2 :(得分:0)
在我之前发布的答案只是部分解决方案。受它们启发,我决定发布自己的答案。
# Download the Oil Bulletin price-history workbook into the working directory.
url <- "http://ec.europa.eu/energy/observatory/reports/Oil_Bulletin_Prices_History.xls"
# An .xls workbook is binary; without mode = "wb" the download is corrupted
# on Windows, where the default mode is text.
download.file(url = url, destfile = "PriceHistory.xls", mode = "wb")
参照 "Save Excel spreadsheet as .csv with R?" 中的做法,可以打开 Excel 文件,并将各个工作表分别保存为以 \t 分隔的单独文件。假设已在工作目录中保存了其中三个工作表("价格和税,每个点击率"、"带税的价格,每个点击率"、"所有税金,按点击率"),并分别命名为 mpc_ex_taxes.txt、mpc.txt、taxes.txt,那么下面的代码应该可以运行。
library(plyr)
library(dplyr)
library(readr)
library(stringr)
library(lubridate)
library(ggplot2)
library(tidyr)
# Import the tab-delimited exports of the workbook (one .txt file per
# worksheet) and stack them into one long data frame, `my_final_fuel_df`,
# with one row per country, date and price component.

# Base names of the exported .txt files; each one also becomes a value of
# the `my_fuel_price_component` column.
my_file_name <-
  c("mpc_ex_taxes", "mpc", "taxes")
# c("mpc")  # single-file variant
my_files <-
  str_c(my_file_name, ".txt")

# One tidy data frame per input file; preallocated instead of grown.
my_file_list <- vector("list", length(my_files))

for (file in seq_along(my_files)) {
  # Read the file line by line (paths are relative to the working directory).
  my_oil <- read_lines(my_files[file])

  # Split every line into its tab-separated fields.
  my_oil_split <- str_split(my_oil, "\t")

  # Number of fields per line; vapply() always returns an integer vector,
  # unlike sapply(), whose return type depends on the input.
  my_len <- vapply(my_oil_split, length, integer(1))

  # Keep only the lines with exactly 9 fields and bind them into a table
  # with columns V1..V9.
  my_data <-
    my_oil_split[which(my_len == 9)] %>%
    ldply(.) %>%
    tbl_df

  # Row offset between a country-code row and the first data row below it.
  my_number_to_add <- 4

  # Country-code rows are those whose first field has exactly two characters.
  # `index_of_data` is the row where that country's data is expected to start.
  my_geos <-
    my_data %>%
    select(V1) %>%
    add_rownames(., var = "index_geo") %>%
    filter(str_length(V1) == 2) %>%
    mutate(index_geo = as.numeric(index_geo),
           index_of_data = index_geo + my_number_to_add) %>%
    rename(geo = V1)

  # Data rows are those whose second field has exactly eight characters.
  # NOTE(review): presumably a dd/mm/yy date; confirm against the export.
  my_date_rows <-
    my_data %>%
    add_rownames(., var = "index_of_data") %>%
    filter(str_length(V2) == 8) %>%
    mutate(index_of_data = as.integer(index_of_data))

  # The final run of data rows has no following run to mark its end, so its
  # end is the last data row of the file.
  my_last_data_row <-
    if (nrow(my_date_rows) > 0) {
      max(my_date_rows$index_of_data)
    } else {
      NA_integer_
    }

  # Collapse consecutive data rows into runs: keep the first row of each run
  # (gap to the previous data row is missing or > 1) and record where the run
  # ends. Before lead(), `index_of_data_end` holds the end of the PREVIOUS
  # run; lead() shifts it onto the run it belongs to. The last run would get
  # NA here and be discarded by complete.cases() below, which silently
  # dropped the last country, so that NA is replaced by the last data row.
  my_data_values <-
    my_date_rows %>%
    mutate(diff_index_of_data = index_of_data - lag(index_of_data, 1),
           index_of_data_end = index_of_data - diff_index_of_data) %>%
    filter(is.na(diff_index_of_data) | diff_index_of_data > 1) %>%
    mutate(index_of_data_end = lead(index_of_data_end, 1),
           index_of_data_end = ifelse(is.na(index_of_data_end),
                                      my_last_data_row,
                                      index_of_data_end)) %>%
    select(matches("^index"))

  # Attach the start/end rows to each country code; countries without a
  # matching run of data rows are dropped.
  my_case <-
    left_join(my_geos, my_data_values, by = "index_of_data") %>%
    filter(complete.cases(.))

  # Slice out the rows belonging to each country, named by country code.
  my_data_list <- lapply(
    seq_along(my_case$geo),
    function(i) {
      my_data[my_case$index_of_data[i]:my_case$index_of_data_end[i], ]
    }
  )
  names(my_data_list) <- my_case$geo

  # Stack the per-country slices and clean the values.
  # The comma-stripping step runs over every column, so `my_date` comes out
  # as character (yyyy-mm-dd); convert it with as.Date() if a Date is needed.
  # NOTE(review): the commas are presumably thousands separators; confirm.
  my_df <-
    ldply(my_data_list) %>%
    tbl_df %>%
    rename(geo = .id) %>%
    select(geo, V2:V7) %>%
    mutate(my_date = as.Date(dmy(V2))) %>%
    select(-V2) %>%
    mutate_each(funs(str_replace_all(., ",", ""))) %>%
    mutate_each(funs(as.numeric), matches("V")) %>%
    select(geo, my_date, everything())

  # Final column names.
  my_names <- c("my_geo", "my_date", "ExcRate", "Gasoline",
                "Diesel", "GasOilHeat", "FuelOil")
  names(my_df) <- my_names

  # Tag every row with the file (price component) it came from.
  my_df_to_save <-
    my_df %>%
    mutate(my_fuel_price_component = my_file_name[file])

  my_file_list[[file]] <- my_df_to_save
}

# Final data frame: all files stacked on top of each other.
my_final_fuel_df <-
  my_file_list %>%
  ldply(.) %>%
  tbl_df
备注:可以通过 as.Date() 函数将字符类型的时间变量 my_date 强制转换为 Date 类。
Source: local data frame [42,495 x 8]
my_geo my_date ExcRate Gasoline Diesel GasOilHeat FuelOil my_fuel_price_component
(chr) (chr) (dbl) (dbl) (dbl) (dbl) (dbl) (chr)
1 AT 2016-03-14 1 387.47 409.53 373.71 191 mpc_ex_taxes
2 AT 2016-03-07 1 371.64 387.86 358.08 176 mpc_ex_taxes
3 AT 2016-02-29 1 370.81 382.03 344.54 171 mpc_ex_taxes
4 AT 2016-02-22 1 375.81 384.53 339.44 171 mpc_ex_taxes
5 AT 2016-02-15 1 369.97 372.03 331.05 171 mpc_ex_taxes
6 AT 2016-02-08 1 382.47 380.36 337.09 178 mpc_ex_taxes
7 AT 2016-02-01 1 387.47 376.19 337.01 172 mpc_ex_taxes
8 AT 2016-01-25 1 384.97 370.36 321.67 146 mpc_ex_taxes
9 AT 2016-01-18 1 392.47 385.36 329.30 165 mpc_ex_taxes
10 AT 2016-01-11 1 413.31 407.86 346.26 175 mpc_ex_taxes
.. ... ... ... ... ... ... ... ...
R version 3.2.4 (2016-03-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
locale:
[1] LC_COLLATE=Slovenian_Slovenia.1250 LC_CTYPE=Slovenian_Slovenia.1250 LC_MONETARY=Slovenian_Slovenia.1250
[4] LC_NUMERIC=C LC_TIME=Slovenian_Slovenia.1250
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tidyr_0.3.0 ggplot2_1.0.1 lubridate_1.5.0 stringr_1.0.0 readr_0.2.2
[6] dplyr_0.4.3 plyr_1.8.3 XLConnect_0.2-11 XLConnectJars_0.2-9
loaded via a namespace (and not attached):
[1] Rcpp_0.12.0 digest_0.6.8 assertthat_0.1 MASS_7.3-44 grid_3.2.4 R6_2.1.1
[7] gtable_0.1.2 DBI_0.3.1 magrittr_1.5 scales_0.3.0 stringi_0.5-5 lazyeval_0.1.10
[13] reshape2_1.4.1 labeling_0.3 proto_0.3-10 tools_3.2.4 munsell_0.4.2 parallel_3.2.4
[19] colorspace_1.2-6 rJava_0.9-7