有效地规范化和转置数据

时间:2016-03-31 12:23:27

标签: r dataframe

我正在尝试根据每个符号规范化和转换股市数据,我已经创建了这个方法,但我看到函数内部的循环这么慢。有没有办法让它更快或以更好的方式做到?

功能是:

normalize <- function(data, col){
    norm <- function(x){
        ((x - min(x))/ (max(x) - min(x)))
    }
    symbols <- t(unique(data['Symbol']))
    result <- data.frame()

    for (s in symbols)
        result <- rbind(result, norm(data[which(data['Symbol'] == s) , col]))
    result = cbind(t(symbols), result)
    colnames(result) <- c('Symbol', paste0('D', t(unique(data['Date']))))
    rownames(result) <- NULL
    return(result)
}

控制台结果是:

> r = normalize(allData, 'Close')
> head(allData, 20) # allData is the input data frame for the function

   Date  Open  High   Low Close  Volume Adj.Close Symbol
   1     1 41.18 41.31 40.37 40.56 1529200  40.35932      A
   2     2 40.32 40.46 39.70 39.80 2041800  39.60308      A
   3     3 39.81 40.02 39.02 39.18 2080600  38.98615      A
   4     4 39.52 39.81 39.29 39.70 3359700  39.50358      A
   5     5 40.24 40.98 40.18 40.89 2116300  40.68769      A
   6     6 41.00 41.00 40.29 40.59 1643900  40.38917      AA
   7     7 40.61 40.72 39.95 40.11 2770800  39.91155      AA
   8     8 40.47 40.70 39.33 39.55 2013100  39.35432      AA
   9     9 39.03 39.10 38.21 39.06 5134000  38.86674      AA
   10   10 39.06 39.41 37.99 38.01 2628900  37.82194      AA
   11   11 37.83 38.46 37.76 38.25 3004000  38.06075      AAL
   12   12 38.43 38.66 37.76 37.93 5033600  37.74233      AAL
   13   13 37.75 38.41 37.68 38.16 2721600  37.97120      AAL
   14   14 38.51 39.86 38.12 39.65 4856600  39.45382      AAL
   15   15 39.60 39.60 38.76 38.81 1519300  38.61798      AAL
   16   16 38.79 39.17 38.47 39.15 1510900  38.95630      AAP
   17   17 38.70 39.25 38.58 38.75 1703500  38.55828      AAP
   18   18 39.01 39.05 37.96 38.00 2033500  37.81199      AAP
   19   19 38.00 38.47 37.69 38.46 2330000  38.26971      AAP
   20   20 38.01 38.32 37.71 37.77 3054300  37.58313      AAP

> head(r, 4)
    A 1.898358 1.138356 0.518357 1.038358 2.228356
   AA 1.928357 1.448358 0.888356 0.398358 -0.651645
  AAL -0.411643 -0.731643 -0.501643 0.988359 0.148358
  AAP 0.488359 0.08835696 -0.661643 -0.201644 -0.891643

部分实际数据:

> dput(allData)
structure(list(Date = 2:20, Open = c(40.32, 39.81, 39.52, 40.24, 
41, 40.61, 40.47, 39.03, 39.06, 37.83, 38.43, 37.75, 38.51, 39.6, 
38.79, 38.7, 39.01, 38, 38.01), High = c(40.46, 40.02, 39.81, 
40.98, 41, 40.72, 40.7, 39.1, 39.41, 38.46, 38.66, 38.41, 39.86, 
39.6, 39.17, 39.25, 39.05, 38.47, 38.32), Low = c(39.7, 39.02, 
39.29, 40.18, 40.29, 39.95, 39.33, 38.21, 37.99, 37.76, 37.76, 
37.68, 38.12, 38.76, 38.47, 38.58, 37.96, 37.69, 37.71), Close = c(39.8, 
39.18, 39.7, 40.89, 40.59, 40.11, 39.55, 39.06, 38.01, 38.25, 
37.93, 38.16, 39.65, 38.81, 39.15, 38.75, 38, 38.46, 37.77), 
    Volume = c(2041800L, 2080600L, 3359700L, 2116300L, 1643900L, 
    2770800L, 2013100L, 5134000L, 2628900L, 3004000L, 5033600L, 
    2721600L, 4856600L, 1519300L, 1510900L, 1703500L, 2033500L, 
    2330000L, 3054300L), Adj.Close = c(39.60308, 38.98615, 39.50358, 
    40.68769, 40.38917, 39.91155, 39.35432, 38.86674, 37.82194, 
    38.06075, 37.74233, 37.9712, 39.45382, 38.61798, 38.9563, 
        38.55828, 37.81199, 38.26971, 37.58313), Symbol = structure(c(1L, 
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 
    4L, 4L, 4L), .Label = c("A", "AA", "AAL", "AAP"), class = "factor")),     .Names = c("Date", 
"Open", "High", "Low", "Close", "Volume", "Adj.Close", "Symbol"
), class = "data.frame", row.names = c(NA, -19L))

2 个答案:

答案 0 :(得分:0)

使用以下功能更改您的功能:

normalize <- function(data, col){
norm <- function(x){
    ((x - min(x))/ (max(x) - min(x)))
}
symbols <- t(unique(data['Symbol']))

listSymbols<-lapply(symbols,function(s){
    norm(data[as.character(data$Symbol) == s , col])
})
symbolsValue<-as.data.frame(matrix(unlist(listSymbols),nrow=length(symbols)))
symbolsValue$Symbol<-t(symbols)

symbolsValue
}

答案 1 :(得分:0)

虽然这可能不是您正在寻找的,但它确实提供了允许每个符号具有不相等数量的条目的灵活性。输出是规范化值的数组列表

 library(dplyr)
normalize<- function(data, col) {
      norm <- function(x){
        ((x - min(x))/ (max(x) - min(x)))
      }
      sym=unique(data$Symbol)
      r<-sapply(sym, FUN=function(x) {norm(select(filter(data, Symbol==x), one_of(col)))})
      names(r)<-sym
      r
    }

 df<-read.csv("test.csv")  #Your data goes here
 r<-normalize(df, "Close")  #Substitute column of interest here