我正在研究阅读月度时间序列数据的方法,这些数据并非整齐排列,并带有“日期”列和“数据”列。例如,SEMI的这个电子表格包含按月和区域组织的数据块,但年份是分开的,在非连续的块中,年份以YYYY形式作为每个块之前的标题。
我的目标是将此数据转换为连续的块,其中第1列中的月度日期和第2列:6中的区域数据。将此电子表格导出为制表符分隔文件后(我发现gdata
和XLConnect
都遇到了您在屏幕截图中看到的合并单元格问题),我将其读入并获取了一个子集,这是下面dput
的来源。
我采用的方法是首先使用以下方法剥离空行:
mydf <- mydf[which(grepl("^$", mydf$January) == FALSE),]
然后在Region列中为具有年份的行添加一个标签 - 方便地,它总是出现在第二个('January')列中。
mydf[which(nchar(mydf$January) == 4) ,'Region'] <- 'mydate'
下一步是在这些“年”行中填写1月到12月的列,并按月包含日期。我想,一旦我每个月都有一个独特的日期,我就可以使用ddply
或其他东西来处理它。
mydf[which(mydf$Region == 'mydate'), 2:13] <- apply(mydf[which(mydf$Region == 'mydate'), 2:13], 1, function(x) as.character(seq(as.Date(paste(x['January'],"-01-01", sep = "")), as.Date(paste(x['January'],"-12-01", sep = "")), by = 'month')))
由于apply
函数没有以我希望的方式生成日期,因此我没有按预期工作 - 它们不是按顺序排列的。我非常感谢(a)针对apply
步骤的特定修复或(b)指针可能更简单或更容易的替代方法。
以下数据和代码:
mydf <- structure(list(Region = c("", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "", "Americas", "Europe",
"Japan", "Asia Pacific", "Worldwide", "", "", "Americas", "Europe",
"Japan", "Asia Pacific", "Worldwide"), January = c("1980", "413136",
"189577", "34033", "39868", "676614", "", "1981", "445504", "277290",
"33970", "44642", "801406", "", "1982", "445300", "226274", "34404",
"44989", "750967", "", "January", "1983", "457604", "232443",
"34326", "46247", "770621", "", "1984", "731009", "285740", "205644",
"85426", "1307820"), February = c("", "423748", "234818", "35104",
"42398", "736069", "", "", "440225", "274526", "33795", "44005",
"792550", "", "", "438332", "226806", "33359", "44020", "742517",
"", "February", "", "457899", "233560", "32604", "46184", "770247",
"", "", "790963", "307735", "381282", "102791", "1582770"), March = c("",
"436152", "281353", "34456", "46555", "798516", "", "", "434628",
"267259", "33709", "45206", "780802", "", "", "441313", "235612",
"32380", "43600", "752905", "", "March", "", "459498", "234986",
"31544", "48178", "774206", "", "", "856970", "339674", "574527",
"118091", "1889262"), April = c("", "455673", "288710", "34451",
"48585", "827419", "", "", "443285", "264405", "34823", "47192",
"789705", "", "", "465613", "246425", "33618", "46274", "791930",
"", "April", "", "484299", "243867", "32719", "52333", "813218",
"", "", "909873", "364465", "627400", "126954", "2028693"), May = c("",
"474441", "297343", "35092", "51102", "857977", "", "", "451221",
"255887", "35499", "48459", "791065", "", "", "487738", "249522",
"34339", "47727", "819325", "", "May", "", "507807", "246136",
"34708", "59300", "847950", "", "", "969553", "382706", "655862",
"133455", "2141576"), June = c("", "475552", "299427", "35743",
"51440", "862162", "", "", "453152", "242889", "35798", "48147",
"779986", "", "", "488564", "241273", "34360", "48871", "813068",
"", "June", "", "528620", "246710", "37345", "62910", "875586",
"", "", "991274", "388697", "672773", "135550", "2188294"), July = c("",
"473007", "302075", "37771", "51027", "863880", "", "", "454387",
"231097", "35402", "47468", "768353", "", "", "480702", "229555",
"33915", "49112", "793284", "", "July", "", "543063", "241211",
"40403", "66658", "891335", "", "", "1005742", "395852", "683854",
"138853", "2224302"), August = c("", "462125", "294497", "37628",
"49773", "844023", "", "", "450648", "213017", "34363", "46614",
"744642", "", "", "472486", "215763", "32866", "48620", "769734",
"", "August", "", "565034", "236353", "42524", "66853", "910763",
"", "", "1010739", "393337", "691731", "141101", "2236908"),
September = c("", "461968", "295501", "37310", "50280", "845059",
"", "", "459276", "215403", "33801", "47297", "755777", "",
"", "475729", "219643", "33083", "47540", "775994", "", "September",
"", "593019", "244979", "44108", "70242", "952348", "", "",
"1035725", "408658", "698992", "141944", "2285320"), October = c("",
"459862", "296522", "36399", "51220", "844003", "", "", "465096",
"218792", "34168", "47369", "765424", "", "", "467151", "225828",
"33667", "47890", "774536", "", "October", "", "618854",
"259807", "47622", "71345", "997628", "", "", "1033560",
"421043", "710563", "140154", "2305320"), November = c("",
"456832", "296283", "35769", "50531", "839415", "", "", "467288",
"232593", "35039", "47415", "782335", "", "", "461950", "237117",
"35672", "47285", "782024", "", "November", "", "641864",
"275099", "50371", "72095", "1039428", "", "", "1008836",
"441652", "732948", "133861", "2317297"), December = c("",
"460343", "291348", "35781", "48298", "835771", "", "", "460574",
"231461", "35971", "47173", "775179", "", "", "462919", "235861",
"36251", "47974", "783006", "", "December", "", "672533",
"276525", "54603", "74717", "1078379", "", "", "982210",
"442448", "731546", "132982", "2289187")), .Names = c("Region",
"January", "February", "March", "April", "May", "June", "July",
"August", "September", "October", "November", "December"), row.names = 29:63, class = "data.frame")
mydf <- mydf[which(grepl("^$", mydf$January) == FALSE),] # remove rows with nothing in the January column
mydf[which(nchar(mydf$January) == 4) ,'Region'] <- 'mydate' # add a row label for 'year' rows
mydf[which(mydf$Region == 'mydate'), 2:13] <- apply(mydf[which(mydf$Region == 'mydate'), 2:13], 1, function(x) as.character(seq(as.Date(paste(x['January'],"-01-01", sep = "")), as.Date(paste(x['January'],"-12-01", sep = "")), by = 'month')))
答案 0 :(得分:3)
您可以使用xlsReadWrite
和reshape2
library(xlsReadWrite)
tdata<-read.xls('GSR1976-June 2012.xls',stringsAsFactors=F)
tdata[85,2]<-1987 # fix for missing year
tdata[228,2]<-2007 # fix for missing year
year.marker<-c(grep('^[[:digit:]]{4}$',tdata[,2]),270)
temp.df<-NULL
for(i in seq_along(year.marker)[-length(year.marker)]){
dum.df<-cbind(tdata[year.marker[i],2],tdata[(year.marker[i]+1):(year.marker[i+1]-2),])
temp.df<-rbind(temp.df,dum.df)
}
names(temp.df)<-c('year','region',month.name)
df1<-temp.df[!temp.df[,'region']=='',]
library(reshape2)
df2<-melt(df1, id.vars=c("region", "year"))
答案 1 :(得分:2)
我采取了以下方法:
首先,我将您的文件转换为CSV,然后读取行。我使用grep()
查找&#34; Americas&#34;,这是每组中的第一行。我手动输入了开始和结束年份,但也可能在那里使用了一些grep
。
temp = readLines("GSR1976-June 2012.csv")
START = grep("Americas", temp)
YEARS = 1976:2012
之后,我创建了一个data.frame
的列表,每年一个。
temp1 = lapply(1:length(YEARS),
function(x) read.csv("GSR1976-June 2012.csv",
header=FALSE, skip=START[x]-1,
nrows=5))
names(temp1) = YEARS
然后,我将它们合并为一个data.frame
并做了一些清理。
temp2 = do.call(rbind, temp1)
names(temp2) = c("region", "jan", "feb", "mar", "apr", "may", "jun",
"jul", "aug", "sep", "oct", "nov", "dec")
temp2$year = rep(YEARS, each=5)
你没有指定你想要做什么类型的重塑,但如果你想要从宽到长,最简单的方法是使用reshape2
包:
library(reshape2)
temp3 = melt(temp2, id.vars=c("region", "year"))
list(head(temp3), tail(temp3))
# [[1]]
# region year variable value
# 1 Americas 1976 jan NA
# 2 Europe 1976 jan NA
# 3 Japan 1976 jan NA
# 4 Asia Pacific 1976 jan NA
# 5 Worldwide 1976 jan NA
# 6 Americas 1977 jan 195638
#
# [[2]]
# region year variable value
# 2215 Worldwide 2011 dec 23832532
# 2216 Americas 2012 dec NA
# 2217 Europe 2012 dec NA
# 2218 Japan 2012 dec NA
# 2219 Asia Pacific 2012 dec NA
# 2220 Worldwide 2012 dec NA
然后,对于您正在寻找发声的输出,请使用dcast()
:
temp4 = dcast(temp3, year + variable ~ region)
head(temp4)
# year variable Americas Asia Pacific Europe Japan Worldwide
# 1 1976 jan NA NA NA NA NA
# 2 1976 feb NA NA NA NA NA
# 3 1976 mar 178295 16761 55602 10805 261463
# 4 1976 apr 178961 16513 60959 11589 268022
# 5 1976 may 187076 17396 62329 12435 279235
# 6 1976 jun 193675 17712 61676 14411 287475
答案 2 :(得分:1)
可以使用 XLConnect 直接从Excel文件中轻松处理上述数据集:
require(XLConnect)
require(reshape2)
# Load Excel workbook
wb = loadWorkbook("~/Downloads/GSR1976-June 2012.xls")
# Read data from 1st worksheet, starting at row 7 with predefined column types
data = readWorksheet(wb, sheet = 1, startRow = 7,
colTypes = c("character", rep("numeric", 12)))
# Rename first column and keep month names
colnames(data)[1] = "Region"
months = names(data)[-1]
# The data of merged cells (years) is in the first cell of the merged region
years = ifelse(is.na(data$Region), data$January, NA)
idx = !is.na(years)
# Replicate year information to form a new column 'Year'
data$Year = rep(years[idx], times = diff(c(which(idx), length(years) + 1)))
# Remove any rows where 'Region' is missing (^= non-data rows)
data = data[!is.na(data$Region), ]
# Reshape (wide --> long)
data = melt(data, measure.vars = months, variable.name = "Month")