Question

背景

欧盟委员会收集并公布欧洲每周平均运输燃料价格。但是，他们发布数据的方式用起来非常痛苦。点击此链接（link）可以下载.xls格式的数据并查看它。

我通过电子邮件向委员会询问他们是否可以用类似tidy（整洁数据）的结构发布数据，但他们回答说无意这样做。

问题

是否有人设法以编程方式（programmatically）将此类数据导入R，或者在导入R之前先将其整理好？例如第一个工作表："不含税价格，按国家/地区（per CTR）"。

我尝试了什么

我通常在Excel中手动将国家/地区表格堆叠在一起，并进行一些其他转换，并将更整洁的数据保存为.csv，然后将其导入R.

提前谢谢。

修改

请注意，数据是按国家/地区提供的。

Answer 1

# Download the Oil Bulletin price history workbook into the data/ directory.
url <- "http://ec.europa.eu/energy/observatory/reports/Oil_Bulletin_Prices_History.xls"
# download.file() does not create missing directories, so make sure data/ exists.
dir.create("data", showWarnings = FALSE)
# .xls is a binary format: mode = "wb" prevents the file from being corrupted
# by text-mode line-ending translation on Windows.
download.file(url = url, destfile = "data/PriceHistory.xls", mode = "wb")

# Load the downloaded workbook and read the first worksheet.
# library() stops with an error if XLConnect is missing, whereas require()
# would only return FALSE and let the script fail later.
library(XLConnect)
wb <- XLConnect::loadWorkbook("data/PriceHistory.xls")
dt <- readWorksheet(wb, sheet = 1, startRow = 1, endRow = 14992)

# Keep the hard-coded data region (rows 7-14992, columns 2-7) and give the
# six columns descriptive names.
dt.body <- dt[7:14992, 2:7]
dt.head <- c("Date", "ExcRate", "EuroSuper", "GasOilAuto", "GasOilHeat", "FuelOil")
colnames(dt.body) <- dt.head

# Keep only the rows whose Date string has the same length as the first one.
# which() discards comparisons that evaluate to NA (nchar() of a missing
# string is NA in R >= 3.3.0), so rows with a missing Date are dropped instead
# of coming back as all-NA rows.
dt.body <- dt.body[which(nchar(dt.body$Date) == nchar(dt.body$Date[1])), ]
dt.body[["Date"]] <- as.Date(dt.body[["Date"]], format = "%Y-%m-%d %H:%M:%S")

# Convert every column after Date to numeric.
dt.body[-1] <- lapply(dt.body[-1], as.numeric)

如果愿意，可以删除含有NA的行，但删除前请留意ExcRate列

# Optional: uncomment the next line to drop every row that contains an NA.
# dt.body <- na.omit(dt.body)
# Reset the row names of dt.body.
rownames(dt.body) <- NULL
# Wrap the data frame with dplyr::tbl_dt().
# NOTE(review): tbl_dt() may not be exported by newer dplyr releases —
# confirm against the installed version.
dt.body <- dplyr::tbl_dt(dt.body)

# Print the result.
dt.body

结果是（na.omit之后），

Source: local data table [11,054 x 6]

         Date ExcRate EuroSuper GasOilAuto GasOilHeat FuelOil
       (date)   (dbl)     (dbl)      (dbl)      (dbl)   (dbl)
1  2016-03-07       1    371.64     387.86     358.08     176
2  2016-02-29       1    370.81     382.03     344.54     171
3  2016-02-22       1    375.81     384.53     339.44     171
4  2016-02-15       1    369.97     372.03     331.05     171
5  2016-02-08       1    382.47     380.36     337.09     178
6  2016-02-01       1    387.47     376.19     337.01     172
7  2016-01-25       1    384.97     370.36     321.67     146
8  2016-01-18       1    392.47     385.36     329.30     165
9  2016-01-11       1    413.31     407.86     346.26     175
10 2016-01-04       1    418.31     412.86     363.64     176
..        ...     ...       ...        ...        ...     ...

会话信息：

R version 3.2.4 (2016-03-10)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.3 (El Capitan)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] MASS_7.3-45      ggplot2_2.0.0    dplyr_0.4.3      data.table_1.9.7

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.3      digest_0.6.9     assertthat_0.1   chron_2.3-47    
 [5] grid_3.2.4       R6_2.1.2         plyr_1.8.3       gtable_0.1.2    
 [9] DBI_0.3.1        magrittr_1.5     scales_0.3.0     pls_2.5-0       
[13] labeling_0.3     tools_3.2.4      munsell_0.4.3    parallel_3.2.4  
[17] colorspace_1.2-6

Answer 2

您可以使用TheRimalaya的代码（如有必要，可以更改下载目的地）

然后添加此项以使导入的数据整洁：

library(dplyr)
library(tidyr)
# Reshape to long format (one row per Date / fuel type, with ExcRate kept as
# an identifier column), then keep only the rows where ExcRate equals 1.
dt.body.tidy <- dt.body %>%
  gather(fueltype, price, 3:ncol(dt.body), -ExcRate) %>%
  filter(ExcRate == 1)

library(ggplot2)
# Plot price against Date in one panel per fuel type: the raw series in dark
# grey, with a thin red line of the per-date median on top.
ggplot(data = dt.body.tidy, mapping = aes(x = Date, y = price)) +
  geom_line(color = "darkgrey") +
  geom_line(stat = "summary", fun.y = "median", color = "red", size = 0.3) +
  facet_wrap(~ fueltype, ncol = 1)

此外，它只导入第一个工作表，并且具有硬编码的startrow和endrow值，但您可以根据需要调整代码。

Answer 3

在我之前发布的答案只是部分解决方案。我决定发布受它们启发的我自己的答案。

下载文件

# Download the Oil Bulletin price history workbook into the working directory.
url <- "http://ec.europa.eu/energy/observatory/reports/Oil_Bulletin_Prices_History.xls"
# .xls is a binary format: mode = "wb" prevents the file from being corrupted
# by text-mode line-ending translation on Windows.
download.file(url = url, destfile = "PriceHistory.xls", mode = "wb")

假设

参照 "Save Excel spreadsheet as .csv with R?" 这一问答，可以打开Excel文件，并将各个工作表分别保存为以\t分隔的单独的.csv文件。假设已在工作目录中保存了三个工作表（"不含税价格，按国家/地区"、"含税价格，按国家/地区"、"所有税金，按国家/地区"），并将它们分别命名为mpc_ex_taxes.txt、mpc.txt、taxes.txt，则下面的代码应该可以运行。

使用的库

library(plyr)
library(dplyr)
library(readr)
library(stringr)
library(lubridate)
library(ggplot2)
library(tidyr)

从.csv文件中获取数据

# Base names (without extension) of the exported tab-separated files.
# To process a single file only, use e.g. c("mpc") instead.
my_file_name <- c("mpc_ex_taxes", "mpc", "taxes")

# Full file names: append the ".txt" extension to every base name.
my_files <- str_c(my_file_name, ".txt")

# Parse every exported text file into one tidy data frame per file.
# The result list is preallocated with one slot per input file.
my_file_list <- vector("list", length(my_files))

# Offset between a country-code row and the first data row of that country
# (used below when matching country codes to data blocks).
my_number_to_add <- 4

for (file in seq_along(my_files)) {

  # read data: line by line (assuming you are in working directory)
  my_oil <- read_lines(my_files[file])

  # split every line into its tab-separated fields
  my_oil_split <- str_split(my_oil, "\t")

  # number of fields in each line
  my_len <- vapply(my_oil_split, length, integer(1))

  # keep only the lines with exactly 9 fields and stack them into one table
  # (the columns are referred to as V1, V2, ... below)
  my_data <-
    my_oil_split[which(my_len == 9)] %>%
    ldply(.) %>%
    tbl_df()

  # row index of the last data row (V2 has 8 characters) in the file;
  # it closes the data block of the last country
  last_data_row <- tail(which(str_length(my_data$V2) == 8), 1)

  # country codes are the rows where V1 has exactly 2 characters;
  # index_of_data is the row my_number_to_add below the country-code row
  my_geos <-
    my_data %>%
    select(V1) %>%
    add_rownames(., var = "index_geo") %>%
    filter(str_length(V1) == 2) %>%
    mutate(index_geo = as.numeric(index_geo),
           index_of_data = index_geo + my_number_to_add) %>%
    rename(geo = V1)

  # find where each run of consecutive data rows starts and ends:
  # a run starts where the gap to the previous data row is NA or > 1, and the
  # row before the next run's start is the end of the current run
  my_data_values <-
    my_data %>%
    add_rownames(., var = "index_of_data") %>%
    filter(str_length(V2) == 8) %>%
    mutate(index_of_data = as.integer(index_of_data),
           diff_index_of_data = index_of_data - lag(index_of_data, 1),
           index_of_data_end = index_of_data - diff_index_of_data) %>%
    filter(is.na(diff_index_of_data) | diff_index_of_data > 1) %>%
    mutate(index_of_data_end = lead(index_of_data_end, 1)) %>%
    select(matches("^index"))

  # lead() leaves the end of the final run as NA, which previously caused the
  # last country to be dropped; the final run ends at the last data row
  my_data_values$index_of_data_end[nrow(my_data_values)] <- last_data_row

  # join country codes with the start and end of their data rows; countries
  # without a matching data block are removed
  my_case <-
    left_join(my_geos, my_data_values, by = "index_of_data") %>%
    filter(complete.cases(.))

  # extract the block of data rows of every country (including the last one)
  my_data_list <- lapply(seq_along(my_case$geo), function(i) {
    my_data[my_case$index_of_data[i]:my_case$index_of_data_end[i], ]
  })

  names(my_data_list) <- my_case$geo

  # tidy dataset: stack the country blocks, parse V2 as a day-month-year date,
  # strip commas and convert the remaining V columns to numeric.
  # NOTE: the first mutate_each() is applied to every column, so geo and
  # my_date end up as character as well.
  my_df <-
    ldply(my_data_list) %>%
    tbl_df() %>%
    rename(geo = .id) %>%
    select(geo, V2:V7) %>%
    mutate(my_date = as.Date(dmy(V2))) %>%
    select(-V2) %>%
    mutate_each(funs(str_replace_all(., ",", ""))) %>%
    mutate_each(funs(as.numeric), matches("V")) %>%
    select(geo, my_date, everything())

  # renaming the variables
  my_names <- c("my_geo", "my_date", "ExcRate", "Gasoline",
                "Diesel", "GasOilHeat", "FuelOil")

  names(my_df) <- my_names

  # tag every row with the base name of the file it came from
  my_df_to_save <-
    my_df %>%
    mutate(my_fuel_price_component = my_file_name[file])

  my_file_list[[file]] <- my_df_to_save

}

# Stack the per-file data frames into a single table.
my_final_fuel_df <- tbl_df(ldply(my_file_list))

Tidy Oil Bulletin数据

备注：可以通过as.Date()函数将字符类型的时间变量my_date强制转换为日期（Date）类型。

Source: local data frame [42,495 x 8]

   my_geo    my_date ExcRate Gasoline Diesel GasOilHeat FuelOil my_fuel_price_component
    (chr)      (chr)   (dbl)    (dbl)  (dbl)      (dbl)   (dbl)                   (chr)
1      AT 2016-03-14       1   387.47 409.53     373.71     191            mpc_ex_taxes
2      AT 2016-03-07       1   371.64 387.86     358.08     176            mpc_ex_taxes
3      AT 2016-02-29       1   370.81 382.03     344.54     171            mpc_ex_taxes
4      AT 2016-02-22       1   375.81 384.53     339.44     171            mpc_ex_taxes
5      AT 2016-02-15       1   369.97 372.03     331.05     171            mpc_ex_taxes
6      AT 2016-02-08       1   382.47 380.36     337.09     178            mpc_ex_taxes
7      AT 2016-02-01       1   387.47 376.19     337.01     172            mpc_ex_taxes
8      AT 2016-01-25       1   384.97 370.36     321.67     146            mpc_ex_taxes
9      AT 2016-01-18       1   392.47 385.36     329.30     165            mpc_ex_taxes
10     AT 2016-01-11       1   413.31 407.86     346.26     175            mpc_ex_taxes
..    ...        ...     ...      ...    ...        ...     ...                     ...

会话信息

R version 3.2.4 (2016-03-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

locale:
[1] LC_COLLATE=Slovenian_Slovenia.1250  LC_CTYPE=Slovenian_Slovenia.1250    LC_MONETARY=Slovenian_Slovenia.1250
[4] LC_NUMERIC=C                        LC_TIME=Slovenian_Slovenia.1250    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] tidyr_0.3.0         ggplot2_1.0.1       lubridate_1.5.0     stringr_1.0.0       readr_0.2.2        
[6] dplyr_0.4.3         plyr_1.8.3          XLConnect_0.2-11    XLConnectJars_0.2-9

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.0      digest_0.6.8     assertthat_0.1   MASS_7.3-44      grid_3.2.4       R6_2.1.1        
 [7] gtable_0.1.2     DBI_0.3.1        magrittr_1.5     scales_0.3.0     stringi_0.5-5    lazyeval_0.1.10 
[13] reshape2_1.4.1   labeling_0.3     proto_0.3-10     tools_3.2.4      munsell_0.4.2    parallel_3.2.4  
[19] colorspace_1.2-6 rJava_0.9-7

Oil Bulletin每周燃料数据：输入R

背景

问题