在R中避免使用for循环(带有临时变量)

时间:2016-06-07 01:33:22

标签: r loops for-loop

我需要比较两个名字,看看其中一个是否是另一个的昵称。我在数据框中有两列名称。

Names <- data.frame(In_Name = c("Gary",'John','James','William','Bill','Paul','Tom','Annie','Bella','Sue'),
                Match_Name = c('Garry','Jon','Jimmy','Paul','William','Pablo','Thomas','Anne','Belle','Susan'),stringsAsFactors = F)

Names[] <- lapply(Names, toupper)
Names$Match <- 0

我还有一个昵称表,其中包含一对昵称。在整个集合中,名称可能出现在多对类似的对中(如下面的&#39; Bella&#39;行的情况)

    NickName_Table <- data.frame(Names = c('Garrett,Garret,Gary,Garry'
                                           ,'Ian,John,Johnie,Johnnie,Johnny,Jon'
                                           ,'Jae,James,Jamey,Jay,Jaymes,Jem,Jemmy,Jim,Jimi,Jimmie,Jimmy'
                                           ,'Bill,Billie,Billy,Wil,Will,William,Willie,Willy'
                                           ,'Paul,Pauly,Paulie'
                                           ,'Maas,Thom,Thomas,Tom,Tomas,Tommie,Tommy'
                                           ,'Ann,Anna,Anne,Annette,Annie,Nan,Nancy,Nanette,Nannie,Nanny'
                                           ,'Bella,Belle,Ibbie,Issy,Izzy,Sabella'
                                           ,'Isabella,Isabelle,Bella,Belle'
                                           ,'Sue,Sukie,Susan,Susann,Susanna,Suzie'))
    NickName_Table[] <- lapply(NickName_Table, toupper)

我想避免使用for循环但是我无法解决如何使用函数调用,因为我需要将找到的行存储在temp变量中,以便在第二个名称中搜索它的存在在同一行/ s。我需要为超过一百万对名称执行此操作,并且for循环太慢。我目前的循环是:

library(sqldf)
i=1
for (i in 1:nrow(Names))
{

  first_name <- Names[i,1]
  match_name <- Names[i,2]

  if(!is.na(first_name) & !is.na(match_name) & first_name != match_name)
  {  
    if (nrow(subset(NickName_Table,grepl(first_name,NickName_Table$Names)))>= 1)
    {
      possibleMatch <- subset(NickName_Table,grepl(first_name,NickName_Table$Names))
      temp1 <- unique(as.data.frame(strsplit(gsub(" ", ",",Reduce(paste,unlist(possibleMatch))),","), stringsAsFactors = F))
      colnames(temp1) <- "Names"
      temp2 <- data.frame(match_name, stringsAsFactors = F)
      colnames(temp2) <- "Names_1"

      if(nrow(sqldf("Select a.* from temp1 a left join temp2 b on a.Names=b.Names_1 where b.Names_1 is not NULL"))>= 1)
      {
        Names[i,3] <- 1 
      }
      else
        Names[i,3] <- 0 
    }
    else
      Names[i,3] <- 0 
  }
  else
    Names[i,3] <- 0
}

编辑: 我试图创建一个函数,但问题是昵称表的长度和要比较的字符串是不相等的,因此矢量化比较似乎不起作用。

    functiona <- function (inNames,MatchNames,NickName_Table1){
  if(!is.na(inNames) & !is.na(MatchNames) & inNames != MatchNames)
  { 
    if (length(subset(NickName_Table1,grepl(inNames,NickName_Table1)))>= 1)
    {
      possibleMatch <- subset(NickName_Table1,grepl(inNames,NickName_Table1))
      temp1 <- unique(as.data.frame(strsplit(gsub(" ", ",",Reduce(paste,unlist(possibleMatch))),","), stringsAsFactors = F))
      colnames(temp1) <- "Names"
      temp2 <- data.frame(MatchNames, stringsAsFactors = F)
      colnames(temp2) <- "Names_1"

      if(nrow(sqldf("Select a.* from temp1 a left join temp2 b on a.Names=b.Names_1 where b.Names_1 is not NULL"))>= 1)
      {
        return <- 1 
      }
      else
        return <- 0 
    }
    else
      return <- 0 
  }
  else
    return <- 0
}

c <- mapply(functiona,Names$In_Name,Names$Match_Name,NickName_Table$Names)

3 个答案:

答案 0 :(得分:0)

没有循环!

sapply比循环指数快得多。 merge也更快,特别是data.table

require(data.table)
Names <- data.frame(In_Name = c("Gary",'John','James','William','Bill','Paul','Tom','Annie','Bella','Sue'),
                    Match_Name = c('Garry','Jon','Jimmy','Paul','William','Pablo','Thomas','Anne','Belle','Susan'),stringsAsFactors = F)

Names[] <- lapply(Names, toupper)
Names$Match <- 0

NickName_Table <- data.table(Names = c('Garrett,Garret,Gary,Garry'
                                       ,'Ian,John,Johnie,Johnnie,Johnny,Jon'
                                       ,'Jae,James,Jamey,Jay,Jaymes,Jem,Jemmy,Jim,Jimi,Jimmie,Jimmy'
                                       ,'Bill,Billie,Billy,Wil,Will,William,Willie,Willy'
                                       ,'Paul,Pauly,Paulie'
                                       ,'Maas,Thom,Thomas,Tom,Tomas,Tommie,Tommy'
                                       ,'Ann,Anna,Anne,Annette,Annie,Nan,Nancy,Nanette,Nannie,Nanny'
                                       ,'Bella,Belle,Ibbie,Issy,Izzy,Sabella'
                                       ,'Isabella,Isabelle,Bella,Belle'
                                       ,'Sue,Sukie,Susan,Susann,Susanna,Suzie'))
NickName_Table[] <- lapply(NickName_Table, toupper)

n              <- which(like(NickName_Table$Names,"BELLA"))
tmp            <- as.data.frame(paste(NickName_Table$Names[n[1]], NickName_Table$Names[n[2]])) # either tweak if you have > 2 in other cases or just count columnwise TRUE values in final sapply step below
colnames(tmp)  <- NULL
NickName_Table <- NickName_Table[!which(like(NickName_Table$Names,"BELLA")),]
NickName_Table <- rbind(NickName_Table,tmp)
NickName_Table$no <- 1:nrow(NickName_Table)

Names$nick_row <- sapply(Names$In_Name,FUN = function(x) which(grepl(x, NickName_Table$Names)))
Names          <- merge(x = Names, NickName_Table, by.x = "nick_row", by.y = "no")

Names$Match <- diag(sapply(Names$Match_Name, FUN =  function(x) grepl(x, Names$Names)))
Names$Names    <- NULL
Names$nick_row <- NULL
Names

Names
   In_Name Match_Name Match
1     GARY      GARRY  TRUE
2     JOHN        JON  TRUE
3    JAMES      JIMMY  TRUE
4  WILLIAM       PAUL FALSE
5     BILL    WILLIAM  TRUE
6     PAUL      PABLO FALSE
7      TOM     THOMAS  TRUE
8    ANNIE       ANNE  TRUE
9      SUE      SUSAN  TRUE
10   BELLA      BELLE  TRUE

答案 1 :(得分:0)

假设a)您只想知道In_NameMatch_Name对是否存在于同一行Nickname_Table中,而b)您是否需要知道他们所依赖的行,那么我认为这样就可以了:

## separate the nicknames into individual strings
splitlist <- sapply(NickName_Table, strsplit, ",")

## create a truth table where In_Name and Match_Name both exist on a row of Nickname_Table
truthMatrix <- sapply(1:nrow(Names), function(x) {
  sapply(1:length(splitlist), function(y) {
    match(Names$In_Name[x], splitlist[[y]])>0 & match(Names$Match_Name[x], splitlist[[y]])>0
  })
})

## assign the value as a match if there is at least one anywhere
Names$Match <- ifelse(is.na(apply(truthMatrix, 2, any)), 0, 1)

Names    
#>    In_Name Match_Name Match
#> 1     GARY      GARRY     1
#> 2     JOHN        JON     1
#> 3    JAMES      JIMMY     1
#> 4  WILLIAM       PAUL     0
#> 5     BILL    WILLIAM     1
#> 6     PAUL      PABLO     0
#> 7      TOM     THOMAS     1
#> 8    ANNIE       ANNE     1
#> 9    BELLA      BELLE     1
#> 10     SUE      SUSAN     1

这仍然需要遍历nrow(Names)*nrow(Nickname_Table)值,但可能会有一些矢量化来利用。

为清楚起见,这里是truthMatrix的值:

truthMatrix
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,] TRUE   NA   NA   NA   NA   NA   NA   NA   NA    NA
#> [2,]   NA TRUE   NA   NA   NA   NA   NA   NA   NA    NA
#> [3,]   NA   NA TRUE   NA   NA   NA   NA   NA   NA    NA
#> [4,]   NA   NA   NA   NA TRUE   NA   NA   NA   NA    NA
#> [5,]   NA   NA   NA   NA   NA   NA   NA   NA   NA    NA
#> [6,]   NA   NA   NA   NA   NA   NA TRUE   NA   NA    NA
#> [7,]   NA   NA   NA   NA   NA   NA   NA TRUE   NA    NA
#> [8,]   NA   NA   NA   NA   NA   NA   NA   NA TRUE    NA
#> [9,]   NA   NA   NA   NA   NA   NA   NA   NA TRUE    NA
#> [10,]  NA   NA   NA   NA   NA   NA   NA   NA   NA  TRUE

在那里你可以看到贝拉&#39;贝尔&#39;有两次匹配。

答案 2 :(得分:0)

这可以全部放入单个SQL语句中。我们在restNamesIn_Name前加上逗号并附加逗号,以确保我们不会获得部分匹配,然后离开加入(以确保Match_Name的所有行都是保留)Names使用条件,当NickName_TableIn_Name与同一行Match_Name匹配时,该条件为真。 SQLite函数Names检查其第一个参数是否包含其第二个参数作为子字符串。

instr

,并提供:

sqldf("select distinct In_Name, Match_Name, Names is not null as 'Match' 
       from Names 
       left join (select ',' || Names || ',' as Names from NickName_Table) 
       on instr(Names, ',' || In_Name || ',') and instr(Names, ',' || Match_Name || ',')")