Question

假设我有一个数据框（data frame）df：

resident    faculty    submittedBy    match    caseID    phase

george      sally      george         1        george_1  pre
george      sally      sally          0        george_1  pre
george      sally      george         1        george_1  intra
jane        carl       jane           1        jane_1    pre
jane        carl       carl           0        jane_1    pre
jane        carl       carl           0        jane_1    intra

我想根据以下条件向此数据框添加一列df$response（我认为需要一组嵌套的ifelse，但我很难把它正确地写出来）：

对于给定的X行，如果df$match = 1，

在以下情况下，在df$response中打印“ 1”：

存在任意一行满足df$match = 0，并且该行的df$caseID、df$faculty和df$phase与X行相同。

否则打印“0”。

所以输出应该是这样：

response

1
0
0
1
0
0

因为只有第一行和第四行满足df$match = 1，并且在df$match = 0的行中存在df$caseID、df$faculty和df$phase都与之相同的行。

Answer 1

我们可以使用data.table方法。将'data.frame'转换为'data.table'（setDT(df1)），按'caseID'、'faculty'、'phase'分组，得到'match'中唯一（unique）元素的个数，检查它是否等于2，并创建一个二进制列（“response”）；对于'match'为0的行，将'response'赋值为0。

# Convert df1 from a data.frame to a data.table (df1 is defined elsewhere).
setDT(df1)

或将unique与match一起使用

library(data.table)
# Convert df1 to a data.table, then add `response` per (caseID, faculty, phase)
# group: 1 when the group holds two distinct `match` values and the row's own
# `match` is non-zero, otherwise 0.
setDT(df1)
df1[,
  response := as.integer(uniqueN(match) == 2 & match != 0),
  by = .(caseID, faculty, phase)
]
# `:=` suppresses auto-printing, so an empty `[]` call is used to print.
df1[]
#   resident faculty submittedBy match   caseID phase response
#1:   george   sally      george     1 george_1   pre        1
#2:   george   sally       sally     0 george_1   pre        0
#3:   george   sally      george     1 george_1 intra        0
#4:     jane    carl        jane     1   jane_1   pre        1
#5:     jane    carl        carl     0   jane_1   pre        0
#6:     jane    carl        carl     0   jane_1 intra        0

数据

base R

Answer 2

这就是我要做的

# Read the example data. `header = TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
test <- read.table(text = 'resident    faculty    submittedBy    match    caseID    phase
                   george      sally      george         1        george_1  pre
                   george      sally      sally          0        george_1  pre
                   george      sally      george         1        george_1  intra
                   jane        carl       jane           1        jane_1    pre
                   jane        carl       carl           0        jane_1    pre
                   jane        carl       carl           0        jane_1    intra', header = TRUE)

# Flag rows whose `match` is non-zero and that have a counterpart row with
# `match == 0` sharing the same values in `key_cols`.
#
# df        data.frame containing `match` and every column named in `key_cols`.
# key_cols  character vector of the columns that identify a counterpart row.
#
# Returns a numeric vector (1 = confirmed, 0 = not) with one entry per row;
# numeric(0) for a zero-row input. Assumes `match` contains no NA.
#
# Only rows with `match == 0` count as counterparts. A plain duplicate check
# on the key columns would also accept another `match == 1` row with the same
# key, which is not what the question asks for.
flag_confirmed_matches <- function(df,
                                   key_cols = c("faculty", "caseID", "phase")) {
  # One composite key per row; the unit-separator character keeps adjacent
  # fields from running together into an ambiguous key.
  keys <- do.call(paste, c(unname(as.list(df[key_cols])), sep = "\037"))
  is_zero <- df$match == 0
  as.numeric(!is_zero & keys %in% keys[is_zero])
}

# create the response
resp <- flag_confirmed_matches(test)

Answer 3

使用[]建立索引的速度更快，并且在您的计算机上的开销也较小

# Example data from the question.
df <- data.frame(
  resident = c("george", "george", "george", "jane", "jane", "jane"),
  faculty = c("sally", "sally", "sally", "carl", "carl", "carl"),
  submittedBy = c("george", "sally", "george", "jane", "carl", "carl"),
  match = c(1, 0, 1, 1, 0, 0),
  caseID = c("george_1", "george_1", "george_1", "jane_1", "jane_1", "jane_1"),
  phase = c("pre", "pre", "intra", "pre", "pre", "intra"),
  stringsAsFactors = FALSE
)

# Composite key per row, built once. A separator is required: with sep = ""
# the fields run together, so ("ab", "c") and ("a", "bc") would produce the
# same key and be treated as the same case.
row_key <- paste(df$caseID, df$faculty, df$phase, sep = "\037")
is_zero <- df$match == 0

# 1 when the row's own match is non-zero and some match == 0 row carries the
# same key, otherwise 0. Vectorised, so it also works for a zero-row df and
# avoids rebuilding the keys on every iteration.
response <- as.numeric(!is_zero & row_key %in% row_key[is_zero])

response
[1] 1 0 0 1 0 0

Answer 4

另一种 data.table 方法。按关键变量进行联接（join），并检查各行的取值是否不在match==0的集合中：

library(data.table)
# Convert dat to a data.table so the `:=` and `on=` syntax below applies
# (dat is defined elsewhere).
setDT(dat)

# Start by flagging every row whose own match equals 1.
dat[, response := match==1]
# Anti-join: `!dat[match==0]` together with `on=` selects the rows whose
# (caseID, faculty, phase) combination does NOT occur among the match==0 rows.
# Those rows have no counterpart, so their flag is reset to FALSE.
dat[!dat[match==0], on=c("caseID","faculty","phase"), response := FALSE]

dat
#   resident faculty submittedBy match   caseID phase response
#1:   george   sally      george     1 george_1   pre     TRUE
#2:   george   sally       sally     0 george_1   pre    FALSE
#3:   george   sally      george     1 george_1 intra    FALSE
#4:     jane    carl        jane     1   jane_1   pre     TRUE
#5:     jane    carl        carl     0   jane_1   pre    FALSE
#6:     jane    carl        carl     0   jane_1 intra    FALSE

Answer 5

假设match中只有1和0两种取值，使用dplyr的一种方法是：检查每个caseID、faculty和phase的组合内match是否有两个不同的值（1和0），然后在match为0的行上将response替换为0。

library(dplyr)
# Group by (caseID, faculty, phase), mark every row of a group that holds two
# distinct `match` values, then reset the mark to 0 on rows whose own `match`
# is 0. The result is printed and remains a grouped tibble.
mutate(
  group_by(df, caseID, faculty, phase),
  response = as.integer(n_distinct(match) == 2),
  response = replace(response, match == 0, 0)
)

#  resident faculty submittedBy match caseID   phase response
#  <chr>    <chr>   <chr>       <dbl> <chr>    <chr>    <dbl>
#1 george   sally   george          1 george_1 pre          1
#2 george   sally   sally           0 george_1 pre          0
#3 george   sally   george          1 george_1 intra        0
#4 jane     carl    jane            1 jane_1   pre          1
#5 jane     carl    carl            0 jane_1   pre          0
#6 jane     carl    carl            0 jane_1   intra        0

根据条件匹配R中多列中的值

5 个答案:

数据