Question

我正在研究阅读月度时间序列数据的方法，这些数据并非整齐排列，并带有“日期”列和“数据”列。例如，SEMI的这个电子表格包含按月和区域组织的数据块，但年份是分开的，在非连续的块中，年份以YYYY形式作为每个块之前的标题。

non-contiguous data

我的目标是将此数据转换为连续的块，其中第1列中的月度日期和第2列：6中的区域数据。将此电子表格导出为制表符分隔文件后（我发现gdata和XLConnect都遇到了您在屏幕截图中看到的合并单元格问题），我将其读入并获取了一个子集，这是下面dput的来源。

我采用的方法是首先使用以下方法剥离空行：

mydf <- mydf[which(grepl("^$", mydf$January) == FALSE),]

然后在Region列中为具有年份的行添加一个标签 - 方便地，它总是出现在第二个（'January'）列中。

mydf[which(nchar(mydf$January) == 4) ,'Region'] <- 'mydate'

下一步是在这些“年”行中填写1月到12月的列，并按月包含日期。我想，一旦我每个月都有一个独特的日期，我就可以使用ddply或其他东西来处理它。

mydf[which(mydf$Region == 'mydate'), 2:13] <- apply(mydf[which(mydf$Region == 'mydate'), 2:13], 1, function(x) as.character(seq(as.Date(paste(x['January'],"-01-01", sep = "")), as.Date(paste(x['January'],"-12-01", sep = "")), by = 'month')))

由于apply函数没有以我希望的方式生成日期，因此我没有按预期工作 - 它们不是按顺序排列的。我非常感谢（a）针对apply步骤的特定修复或（b）指针可能更简单或更容易的替代方法。

以下数据和代码：

mydf <- structure(list(Region = c("", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "", "Americas", "Europe",
"Japan", "Asia Pacific", "Worldwide", "", "", "Americas", "Europe",
"Japan", "Asia Pacific", "Worldwide"), January = c("1980", "413136",
"189577", "34033", "39868", "676614", "", "1981", "445504", "277290",
"33970", "44642", "801406", "", "1982", "445300", "226274", "34404",
"44989", "750967", "", "January", "1983", "457604", "232443",
"34326", "46247", "770621", "", "1984", "731009", "285740", "205644",
"85426", "1307820"), February = c("", "423748", "234818", "35104",
"42398", "736069", "", "", "440225", "274526", "33795", "44005",
"792550", "", "", "438332", "226806", "33359", "44020", "742517",
"", "February", "", "457899", "233560", "32604", "46184", "770247",
"", "", "790963", "307735", "381282", "102791", "1582770"), March = c("",
"436152", "281353", "34456", "46555", "798516", "", "", "434628",
"267259", "33709", "45206", "780802", "", "", "441313", "235612",
"32380", "43600", "752905", "", "March", "", "459498", "234986",
"31544", "48178", "774206", "", "", "856970", "339674", "574527",
"118091", "1889262"), April = c("", "455673", "288710", "34451",
"48585", "827419", "", "", "443285", "264405", "34823", "47192",
"789705", "", "", "465613", "246425", "33618", "46274", "791930",
"", "April", "", "484299", "243867", "32719", "52333", "813218",
"", "", "909873", "364465", "627400", "126954", "2028693"), May = c("",
"474441", "297343", "35092", "51102", "857977", "", "", "451221",
"255887", "35499", "48459", "791065", "", "", "487738", "249522",
"34339", "47727", "819325", "", "May", "", "507807", "246136",
"34708", "59300", "847950", "", "", "969553", "382706", "655862",
"133455", "2141576"), June = c("", "475552", "299427", "35743",
"51440", "862162", "", "", "453152", "242889", "35798", "48147",
"779986", "", "", "488564", "241273", "34360", "48871", "813068",
"", "June", "", "528620", "246710", "37345", "62910", "875586",
"", "", "991274", "388697", "672773", "135550", "2188294"), July = c("",
"473007", "302075", "37771", "51027", "863880", "", "", "454387",
"231097", "35402", "47468", "768353", "", "", "480702", "229555",
"33915", "49112", "793284", "", "July", "", "543063", "241211",
"40403", "66658", "891335", "", "", "1005742", "395852", "683854",
"138853", "2224302"), August = c("", "462125", "294497", "37628",
"49773", "844023", "", "", "450648", "213017", "34363", "46614",
"744642", "", "", "472486", "215763", "32866", "48620", "769734",
"", "August", "", "565034", "236353", "42524", "66853", "910763",
"", "", "1010739", "393337", "691731", "141101", "2236908"),
    September = c("", "461968", "295501", "37310", "50280", "845059",
    "", "", "459276", "215403", "33801", "47297", "755777", "",
    "", "475729", "219643", "33083", "47540", "775994", "", "September",
    "", "593019", "244979", "44108", "70242", "952348", "", "",
    "1035725", "408658", "698992", "141944", "2285320"), October = c("",
    "459862", "296522", "36399", "51220", "844003", "", "", "465096",
    "218792", "34168", "47369", "765424", "", "", "467151", "225828",
    "33667", "47890", "774536", "", "October", "", "618854",
    "259807", "47622", "71345", "997628", "", "", "1033560",
    "421043", "710563", "140154", "2305320"), November = c("",
    "456832", "296283", "35769", "50531", "839415", "", "", "467288",
    "232593", "35039", "47415", "782335", "", "", "461950", "237117",
    "35672", "47285", "782024", "", "November", "", "641864",
    "275099", "50371", "72095", "1039428", "", "", "1008836",
    "441652", "732948", "133861", "2317297"), December = c("",
    "460343", "291348", "35781", "48298", "835771", "", "", "460574",
    "231461", "35971", "47173", "775179", "", "", "462919", "235861",
    "36251", "47974", "783006", "", "December", "", "672533",
    "276525", "54603", "74717", "1078379", "", "", "982210",
    "442448", "731546", "132982", "2289187")), .Names = c("Region",
"January", "February", "March", "April", "May", "June", "July",
"August", "September", "October", "November", "December"), row.names = 29:63, class = "data.frame")

mydf <- mydf[which(grepl("^$", mydf$January) == FALSE),] # remove rows with nothing in the January column
mydf[which(nchar(mydf$January) == 4) ,'Region'] <- 'mydate' # add a row label for 'year' rows

mydf[which(mydf$Region == 'mydate'), 2:13] <- apply(mydf[which(mydf$Region == 'mydate'), 2:13], 1, function(x) as.character(seq(as.Date(paste(x['January'],"-01-01", sep = "")), as.Date(paste(x['January'],"-12-01", sep = "")), by = 'month')))

Answer 1

您可以使用xlsReadWrite和reshape2

 library(xlsReadWrite)
 tdata<-read.xls('GSR1976-June 2012.xls',stringsAsFactors=F)
 tdata[85,2]<-1987 # fix for missing year
 tdata[228,2]<-2007 # fix for missing year
 year.marker<-c(grep('^[[:digit:]]{4}$',tdata[,2]),270)

 temp.df<-NULL

 for(i in seq_along(year.marker)[-length(year.marker)]){
   dum.df<-cbind(tdata[year.marker[i],2],tdata[(year.marker[i]+1):(year.marker[i+1]-2),])
   temp.df<-rbind(temp.df,dum.df)
 }

 names(temp.df)<-c('year','region',month.name)

 df1<-temp.df[!temp.df[,'region']=='',]
 library(reshape2)
 df2<-melt(df1, id.vars=c("region", "year"))

Answer 2

我采取了以下方法：

首先，我将您的文件转换为CSV，然后读取行。我使用grep()查找＆＃34; Americas＆＃34;，这是每组中的第一行。我手动输入了开始和结束年份，但也可能在那里使用了一些grep。

temp = readLines("GSR1976-June 2012.csv")
START = grep("Americas", temp)
YEARS = 1976:2012

之后，我创建了一个data.frame的列表，每年一个。

temp1 = lapply(1:length(YEARS), 
               function(x) read.csv("GSR1976-June 2012.csv",
                                    header=FALSE, skip=START[x]-1,
                                    nrows=5))
names(temp1) = YEARS

然后，我将它们合并为一个data.frame并做了一些清理。

temp2 = do.call(rbind, temp1)
names(temp2) = c("region", "jan", "feb", "mar", "apr", "may", "jun",
                 "jul", "aug", "sep", "oct", "nov", "dec")
temp2$year = rep(YEARS, each=5)

你没有指定你想要做什么类型的重塑，但如果你想要从宽到长，最简单的方法是使用reshape2包：

library(reshape2)
temp3 = melt(temp2, id.vars=c("region", "year"))
list(head(temp3), tail(temp3))
# [[1]]
#         region year variable  value
# 1     Americas 1976      jan     NA
# 2       Europe 1976      jan     NA
# 3        Japan 1976      jan     NA
# 4 Asia Pacific 1976      jan     NA
# 5    Worldwide 1976      jan     NA
# 6     Americas 1977      jan 195638
# 
# [[2]]
#            region year variable    value
# 2215    Worldwide 2011      dec 23832532
# 2216     Americas 2012      dec       NA
# 2217       Europe 2012      dec       NA
# 2218        Japan 2012      dec       NA
# 2219 Asia Pacific 2012      dec       NA
# 2220    Worldwide 2012      dec       NA

然后，对于您正在寻找发声的输出，请使用dcast()：

temp4 = dcast(temp3, year + variable ~ region)
head(temp4)
#   year variable Americas Asia Pacific Europe Japan Worldwide
# 1 1976      jan       NA           NA     NA    NA        NA
# 2 1976      feb       NA           NA     NA    NA        NA
# 3 1976      mar   178295        16761  55602 10805    261463
# 4 1976      apr   178961        16513  60959 11589    268022
# 5 1976      may   187076        17396  62329 12435    279235
# 6 1976      jun   193675        17712  61676 14411    287475

Answer 3

可以使用 XLConnect 直接从Excel文件中轻松处理上述数据集：

require(XLConnect)
require(reshape2)

# Load Excel workbook
wb = loadWorkbook("~/Downloads/GSR1976-June 2012.xls")

# Read data from 1st worksheet, starting at row 7 with predefined column types
data = readWorksheet(wb, sheet = 1, startRow = 7, 
    colTypes = c("character", rep("numeric", 12)))
# Rename first column and keep month names
colnames(data)[1] = "Region"
months = names(data)[-1]

# The data of merged cells (years) is in the first cell of the merged region
years = ifelse(is.na(data$Region), data$January, NA)
idx = !is.na(years)

# Replicate year information to form a new column 'Year'
data$Year = rep(years[idx], times = diff(c(which(idx), length(years) + 1)))

# Remove any rows where 'Region' is missing (^= non-data rows)
data = data[!is.na(data$Region), ]

# Reshape (wide --> long)
data = melt(data, measure.vars = months, variable.name = "Month")

R：重构不规则的时间序列数据，没有明确的唯一日期

3 个答案: