将从Datastream下载的数据重塑为面板(长)格式

时间:2016-04-09 14:05:36

标签: r long-integer reshape

我已从Datastream下载了数据,每个工作表对应一个变量。 Current data view - One variable: Price

我想将每个工作表(每个变量)转换为面板格式,以便我可以使用plm()或将数据导出到Stata(我是R的新手),所以它看起来像 Click to view - What I expect to have

一个难题是,我有500多家公司,在R代码中手动逐一写出公司名称(或代码)非常繁琐。

如果您能给出基本的代码示例,而不仅仅是让我去参考R中的reshape函数,我将非常感激。

P.S. 如果这个问题已经有人回答过,很抱歉重复发帖。

1 个答案:

答案 0 :(得分:0)

您当前的数据集是宽(wide)格式,而您需要的是长(long)格式,reshape2包中的melt函数可以很好地完成这一转换。

melt函数的标识变量(id变量)是日期,因为所有公司的日期都相同。

我在下面的演示中使用了如下测试数据集:

# Save price, volume, market value, shares, etc. into individual CSV files.
# Rename the first column to "date" and remove rows 2 and 3, since they are
# not needed.

# Attach the packages with library(): it stops with an error when a package is
# missing, whereas require() only returns FALSE and lets the script fail later
# with a much less obvious message.
library(reshape2)
library(PerformanceAnalytics)

# Demo for price data.
# "path_to_price_csv_file" is a placeholder; point it at your own CSV file.
price_data <- read.csv(
  "path_to_price_csv_file",
  header = TRUE,
  stringsAsFactors = FALSE,
  na.strings = "NA"
)

test_DF <- price_data

# For the demonstration below, test_DF is overwritten with the `managers`
# sample data set: the time index becomes an explicit "date" column and the
# row names are dropped so the result is a plain wide data frame.
data(managers)
test_DF <- data.frame(
  date = as.Date(index(managers), format = "%Y-%m-%d"),
  managers,
  row.names = NULL,
  stringsAsFactors = FALSE
)


# This data is similar in format to your price data

head(test_DF)
# date    HAM1 HAM2    HAM3    HAM4 HAM5 HAM6 EDHEC.LS.EQ SP500.TR US.10Y.TR US.3m.TR
# 1 1996-01-31  0.0074   NA  0.0349  0.0222   NA   NA          NA   0.0340   0.00380  0.00456
# 2 1996-02-29  0.0193   NA  0.0351  0.0195   NA   NA          NA   0.0093  -0.03532  0.00398
# 3 1996-03-31  0.0155   NA  0.0258 -0.0098   NA   NA          NA   0.0096  -0.01057  0.00371
# 4 1996-04-30 -0.0091   NA  0.0449  0.0236   NA   NA          NA   0.0147  -0.01739  0.00428
# 5 1996-05-31  0.0076   NA  0.0353  0.0028   NA   NA          NA   0.0258  -0.00543  0.00443
# 6 1996-06-30 -0.0039   NA -0.0303 -0.0019   NA   NA          NA   0.0038   0.01507  0.00412

# Arguments you would pass to the reshaping function defined below:
# test_data = test_DF                 # replace with the price, volume, shares dataset
# dateColumnName = "date"             # name of your date column
# columnOfInterest1 = "manager"       # for you this will be "Name"
# columnOfInterest2 = "return"        # varies with your input data: price, volume, shares etc.


# Reshape a wide data frame (one date column plus one column per entity) into
# long "panel" format with one row per (entity, date) combination.
#
# Arguments:
#   test_data          Wide data frame; defaults to the global `test_DF`.
#   dateColumnName     Name of the date column, used as the id variable of melt.
#   columnOfInterest1  Name for the output column holding the original column
#                      names (e.g. the company or manager name).
#   columnOfInterest2  Name for the output column holding the values
#                      (e.g. price, volume, return).
#
# Returns:
#   A data frame with the columns index, <columnOfInterest1>, <dateColumnName>
#   and <columnOfInterest2>, where index numbers the entities 1, 2, ... in the
#   order in which they appear in the melted data.
Custom_Melt_DataFrame <- function(test_data = test_DF,
                                  dateColumnName = "date",
                                  columnOfInterest1 = "manager",
                                  columnOfInterest2 = "return") {
  # Fail early with a clear message instead of an obscure error inside melt().
  stopifnot(
    is.data.frame(test_data),
    is.character(dateColumnName),
    length(dateColumnName) == 1L,
    dateColumnName %in% names(test_data)
  )

  # Call melt through its namespace so the function neither depends on the
  # search path nor picks up a masking melt() from another package. The former
  # `stringsAsFactors` argument is dropped: melt() has no such parameter and
  # silently ignored it.
  molten_DF <- reshape2::melt(test_data, id.vars = dateColumnName)
  colnames(molten_DF) <- c(dateColumnName, columnOfInterest1, columnOfInterest2)

  # melt() returns the former column names as a factor; store them as character.
  molten_DF[[columnOfInterest1]] <- as.character(molten_DF[[columnOfInterest1]])

  # Number the entities by first appearance. Deriving the index from the melted
  # labels keeps it aligned with the rows without assuming a particular row
  # count or column layout of the input.
  entity_labels <- molten_DF[[columnOfInterest1]]
  molten_DF$index <- match(entity_labels, unique(entity_labels))

  # Reorder columns: index, entity, date, value.
  molten_DF[, c("index", columnOfInterest1, dateColumnName, columnOfInterest2)]
}

# Reshape the demo data into long format. The arguments repeat the function
# defaults and are spelled out to show what to adapt for your own data.
custom_data <- Custom_Melt_DataFrame(
  test_data = test_DF,
  dateColumnName = "date",
  columnOfInterest1 = "manager",
  columnOfInterest2 = "return"
)


# First ten rows of the long-format result
head(custom_data, n = 10)

#    index manager       date  return
# 1      1    HAM1 1996-01-31  0.0074
# 2      1    HAM1 1996-02-29  0.0193
# 3      1    HAM1 1996-03-31  0.0155
# 4      1    HAM1 1996-04-30 -0.0091
# 5      1    HAM1 1996-05-31  0.0076
# 6      1    HAM1 1996-06-30 -0.0039
# 7      1    HAM1 1996-07-31 -0.0231
# 8      1    HAM1 1996-08-31  0.0395
# 9      1    HAM1 1996-09-30  0.0147
# 10     1    HAM1 1996-10-31  0.0288


# Last ten rows of the long-format result
tail(custom_data, n = 10)
#       index  manager     date  return
# 1311    10 US.3m.TR 2006-03-31 0.00385
# 1312    10 US.3m.TR 2006-04-30 0.00366
# 1313    10 US.3m.TR 2006-05-31 0.00404
# 1314    10 US.3m.TR 2006-06-30 0.00384
# 1315    10 US.3m.TR 2006-07-31 0.00423
# 1316    10 US.3m.TR 2006-08-31 0.00441
# 1317    10 US.3m.TR 2006-09-30 0.00456
# 1318    10 US.3m.TR 2006-10-31 0.00381
# 1319    10 US.3m.TR 2006-11-30 0.00430
# 1320    10 US.3m.TR 2006-12-31 0.00441