Question

我正在寻找一种解决方案来加快我的代码的速度。我正在使用大约一个数据集。 100.000行，目前正在使用double for循环。我想这正在减慢我的代码的速度。

Example data:

dt<-structure(list(name = c("Marcus", "Tina", "Jack", "George"), 
  address = c("Oxford Str.", "Oxford Str.", "Waterloo Sq.", 
  "London Str."), number = c(1, 1, 20, 15), suffix = c("a", 
  "a", NA, "b"), child = c("Tina", NA, "George", NA)), .Names = c("name", 
  "address", "number", "suffix", "child"), row.names = c(NA, -4L
  ), class = "data.frame")

Example DataFrame:
     name       address      number   suffix   child
1    Marcus     Oxford Str.  1        a        Tina
2    Tina       Oxford Str.  1        a     
3    Jack       Waterloo Sq. 20                George
4    George     London Str.  15       b

我在每一行进行迭代，以检查孩子是否住在同一地址，并在“输出”新列中添加“ 1”。默认为“ 0”。结果应该是：

Example result:
     name       address      number   suffix   child   output
1    Marcus     Oxford Str.  1        a        Tina    1
2    Tina       Oxford Str.  1        a     
3    Jack       Waterloo Sq. 20                George  0
4    George     London Str.  15       b

我当前的代码：

df$output = 0
n = NROW(df)

for(i in 1:n) {
 childID = df[i,5]
 address = df[i,2]
 number = df[i,3]
 suffix = df[i,4]
   for(j in 1:n) {
       if((childID %in% df[j,1])&(address %in% df[j,2])&(number %in% df[j,3])
         &(suffix %in% df[j,4]))
           (df[i,6] = 1)
    }
}

我尝试将Rcpp与C ++代码一起使用。它也正在工作，但仍然很慢。有什么想法可以加快速度吗？或者我应该接受它来运行它吗？

Answer 1

我会尝试将地址连接起来，然后使用match，如下所示：

# recreate your input (I put NAs where you have blanks)
DF <- 
data.frame(name=c('Marcus','Tina','Jack','George'),
           address=c('Oxford Str.','Oxford Str.','Waterloo Sq.','London Str.'),
           number=c(1,1,20,15),
           suffix=c('a','a',NA,'b'),
           child=c('Tina',NA,'George',NA))

# create a single character address by concatenating address,number and suffix
fulladdr <- paste(DF$address,DF$number,DF$suffix,sep='||')
# initialize output to 0
DF$output <- 0
# set 1 where concatenated addresses match
DF$output[fulladdr[match(DF$child,DF$name)] == fulladdr] <- 1

> DF
    name      address number suffix  child output
1 Marcus  Oxford Str.      1      a   Tina      1
2   Tina  Oxford Str.      1      a   <NA>      0
3   Jack Waterloo Sq.     20   <NA> George      0
4 George  London Str.     15      b   <NA>      0

Answer 2

我已经实现了一个data.table解决方案，对于这个特定的数据集，它的解决方案比@digEmAll解决方案要慢，但是可能还是有帮助的。此外，我提供了一些小型基准测试，在这个小型数据集上并没有真正意义，因此请在更大的基准上进行测试。

library(data.table)
name = c("Marcus", "Tina", "Jack", "George")
address = c("Oxford Str.", "Oxford Str.", "Waterloo Sq.", "London Str.")
number = c(1, 1, 20, 15)
suffix = c("a", "a", "", "b")
child = c("Tina", "", "George", "")

dt <- data.table(name
                 , address
                 ,number
                 ,suffix
                 ,child
                 )
dt[, FullAddr := paste0(address, " " , number, suffix)]
dt[ FullAddr[match(child,name)] == FullAddr, output := 1  ]

dt[is.na(output), output := 0]
dt
   name      address number suffix  child        FullAddr output
1: Marcus  Oxford Str.      1      a   Tina  Oxford Str. 1a      1
2:   Tina  Oxford Str.      1      a         Oxford Str. 1a      0
3:   Jack Waterloo Sq.     20        George Waterloo Sq. 20      0
4: George  London Str.     15      b        London Str. 15b      0

library(microbenchmark)

microbenchmark(
        a = {dt[ FullAddr[match(child,name)] == FullAddr, output := 1  ]}
        , b= {df$output = 0
        n = NROW(df)

        for(i in 1:n) {
                childID = df[i,5]
                address = df[i,2]
                number = df[i,3]
                suffix = df[i,4]
                for(j in 1:n) {
                        if((childID %in% df[j,1])&(address %in% df[j,2])&(number %in% df[j,3])
                           &(suffix %in% df[j,4]))
                                (df[i,6] = 1)
                }
        }}
        , c = df$output[fulladdr[match(df$child,df$name)] == fulladdr] <- 1

       , times = 100L

)

    Unit: microseconds
 expr       min        lq        mean     median         uq        max neval cld
    a   298.842   348.347   427.59415   413.6995   489.4665    903.467   100  a 
    b 15042.275 15494.461 17983.16735 15864.5405 16257.7130 162306.656   100   b
    c    39.847    46.487    58.82731    59.1655    64.7495    165.420   100  a

Answer 3

这是评论中提到的基于hashmap的解决方案：

df <- read.csv(text = 'name,address,number,suffix,child
Marcus,Oxford Str.,1,a,Tina
Tina,Oxford Str.,1,a,     
Jack,Waterloo Sq.,20,,George
George,London Str.,15,b,', stringsAsFactors = FALSE)
df

library(hashmap)
address <- paste(df$address, df$number, df$suffix)
name_address <- hashmap(df$name, address)
child_address <- name_address[[df$child]]
output <- as.integer(child_address == address)
output <- ifelse(is.na(output), '', as.character(output))              

df$output <- output
df

输出：

> df
    name      address number suffix  child output
1 Marcus  Oxford Str.      1      a   Tina      1
2   Tina  Oxford Str.      1      a              
3   Jack Waterloo Sq.     20        George      0
4 George  London Str.     15      b

R：加快双循环速度

3 个答案: