假设我有一个名为 df 的数据框:
resident faculty submittedBy match caseID phase
george sally george 1 george_1 pre
george sally sally 0 george_1 pre
george sally george 1 george_1 intra
jane carl jane 1 jane_1 pre
jane carl carl 0 jane_1 pre
jane carl carl 0 jane_1 intra
,我想根据以下参数向此数据帧添加一列df$response
(我认为我需要一组嵌套的ifelses,但我正在努力正确地执行它):
对于给定的第 X 行,如果 df$match = 1,
则仅在以下情况下在 df$response 中写入 "1":
存在任意一行满足 df$match = 0,并且该行的 df$caseID、df$faculty 和 df$phase
与第 X 行相同。否则写入 "0"。
所以输出应该是这样:
response
1
0
0
1
0
0
因为只有第一行和第四行满足 df$match = 1,
并且存在另一行,其 df$caseID、df$faculty 和 df$phase 都与之相同,
且该行的 df$match = 0。
答案 0 :(得分:5)
我们可以使用!pip install --upgrade google-cloud-bigquery
!pip install --upgrade google-api-python-client
!pip install --upgrade google-api-core
!pip install --upgrade google-cloud-vision
# below from https://cloud.google.com/vision/docs/detecting-safe-search
def detect_safe_search_uri(uri):
    """Detects unsafe features in the file located in Google Cloud Storage or
    on the Web.

    Args:
        uri: Location of the image to analyse; it is assigned to
            ``image.source.image_uri`` unchanged.

    Returns:
        None. The likelihood name for each SafeSearch category is printed.
    """
    # Imported locally so that merely defining the function does not require
    # the google-cloud-vision package to be importable.
    from google.cloud import vision

    client = vision.ImageAnnotatorClient()
    image = vision.types.Image()
    image.source.image_uri = uri

    response = client.safe_search_detection(image=image)
    safe = response.safe_search_annotation

    # Names of likelihood from google.cloud.vision.enums
    # The annotation fields are used as indexes into this tuple.
    likelihood_name = ('UNKNOWN', 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE',
                       'LIKELY', 'VERY_LIKELY')
    print('Safe search:')
    print('adult: {}'.format(likelihood_name[safe.adult]))
    print('medical: {}'.format(likelihood_name[safe.medical]))
    print('spoofed: {}'.format(likelihood_name[safe.spoof]))
    print('violence: {}'.format(likelihood_name[safe.violence]))
    print('racy: {}'.format(likelihood_name[safe.racy]))


# Sample URI
detect_safe_search_uri("https://www.victoriassecret.com/p/404x539/tif/1a/d2/1ad2f0b4a7054d81ad178b941f3a8b80/395481036_OM_F.jpg")
方法。将'data.frame'转换为'data.table'(...
...
...
/usr/local/envs/py3env/lib/python3.5/site-packages/pkg_resources/__init__.py in resolve(self, requirements, env, installer, replace_conflicting, extras)
784 # Oops, the "best" so far conflicts with a dependency
785 dependent_req = required_by[req]
--> 786 raise VersionConflict(dist, req).with_context(dependent_req)
787
788 # push the new requirements onto the stack
ContextualVersionConflict: (google-api-core 0.1.4 (/usr/local/envs/py3env/lib/python3.5/site-packages), Requirement.parse('google-api-core[grpc]<2.0.0dev,>=1.6.0'), {'google-cloud-vision'})
),按 'caseID'、'faculty'、'phase' 分组,得到每组中 'match' 的唯一元素个数(uniqueN),
检查它是否等于 2 并据此创建一个二进制列("response");对于 'match' 为 0 的行,将 "response" 设为 0
setDT(df1)
或将unique
与match
一起使用
# Load data.table, which provides setDT(), uniqueN() and the := operator.
library(data.table)
# Turn df1 into a data.table and add a `response` column computed per
# (caseID, faculty, phase) group: a row gets 1 when its group holds exactly
# two distinct `match` values and the row's own `match` is non-zero, else 0.
# The unary + converts the logical result to integer; the trailing []
# makes the updated table print.
# NOTE(review): df1 is not defined in this snippet - presumably it is the
# question's data frame; confirm before running.
setDT(df1)[, response := +((uniqueN(match) == 2) & match != 0),
.(caseID, faculty, phase)][]
# resident faculty submittedBy match caseID phase response
#1: george sally george 1 george_1 pre 1
#2: george sally sally 0 george_1 pre 0
#3: george sally george 1 george_1 intra 0
#4: jane carl jane 1 jane_1 pre 1
#5: jane carl carl 0 jane_1 pre 0
#6: jane carl carl 0 jane_1 intra 0
base R
答案 1 :(得分:3)
这就是我要做的
# Read the example data.
test <- read.table(text = "resident faculty submittedBy match caseID phase
george sally george 1 george_1 pre
george sally sally 0 george_1 pre
george sally george 1 george_1 intra
jane carl jane 1 jane_1 pre
jane carl carl 0 jane_1 pre
jane carl carl 0 jane_1 intra", header = TRUE)

# Compute the response flag for every row of `data`.
#
# A row gets 1 when its own `match` is non-zero AND some row with
# `match == 0` shares the same faculty, caseID and phase; otherwise 0.
#
# data: data frame with columns match, faculty, caseID and phase.
# Returns a numeric vector with one element per row (numeric(0) for an
# empty data frame).
compute_response <- function(data) {
  key_cols <- c("faculty", "caseID", "phase")
  # One composite key per row. The unit-separator character keeps
  # neighbouring fields from running into each other.
  keys <- do.call(paste, c(unname(as.list(data[key_cols])), sep = "\037"))
  is_zero <- data$match == 0
  # Only rows with match == 0 count as partners; comparing against every
  # other row would also flag groups that contain several non-zero rows.
  as.numeric(!is_zero & keys %in% keys[which(is_zero)])
}

# Create the response.
resp <- compute_response(test)
答案 2 :(得分:2)
使用[]
建立索引的速度更快,并且在您的计算机上的开销也较小
# Example data from the question.
df <- data.frame(
  resident = c("george", "george", "george", "jane", "jane", "jane"),
  faculty = c("sally", "sally", "sally", "carl", "carl", "carl"),
  submittedBy = c("george", "sally", "george", "jane", "carl", "carl"),
  match = c(1, 0, 1, 1, 0, 0),
  caseID = c("george_1", "george_1", "george_1", "jane_1", "jane_1", "jane_1"),
  phase = c("pre", "pre", "intra", "pre", "pre", "intra"),
  stringsAsFactors = FALSE
)

# Flag rows whose `match` is non-zero and that have a partner row with
# `match == 0` sharing the same caseID, faculty and phase.
#
# data: data frame with columns match, caseID, faculty and phase.
# Returns a numeric vector of 0/1 with one element per row (numeric(0)
# for an empty data frame).
flag_matched_rows <- function(data) {
  # Build the composite key once instead of on every iteration. A separator
  # is required: with sep = "" the keys "ab" + "c" and "a" + "bc" would be
  # indistinguishable.
  keys <- paste(data$caseID, data$faculty, data$phase, sep = "\037")
  is_zero <- data$match == 0
  as.numeric(!is_zero & keys %in% keys[which(is_zero)])
}

response <- flag_matched_rows(df)
response
[1] 1 0 0 1 0 0
答案 3 :(得分:2)
另一种 data.table 方法。按关键变量(caseID、faculty、phase)进行连接,
并把不在 match==0 集合中的行的 response 设为 FALSE:
# Load data.table for setDT(), := and joins via `on=`.
library(data.table)
# NOTE(review): `dat` is not created in this snippet - presumably it holds
# the question's data frame; confirm before running.
setDT(dat)
# Start with response TRUE exactly where match is 1.
dat[, response := match==1]
# Not-join: select the rows of dat whose (caseID, faculty, phase) key has no
# counterpart among the match==0 rows, and set their response to FALSE.
dat[!dat[match==0], on=c("caseID","faculty","phase"), response := FALSE]
# Print the updated table.
dat
# resident faculty submittedBy match caseID phase response
#1: george sally george 1 george_1 pre TRUE
#2: george sally sally 0 george_1 pre FALSE
#3: george sally george 1 george_1 intra FALSE
#4: jane carl jane 1 jane_1 pre TRUE
#5: jane carl carl 0 jane_1 pre FALSE
#6: jane carl carl 0 jane_1 intra FALSE
答案 4 :(得分:1)
假设 match 中只有 1 和 0 两种取值,使用 dplyr 的一种方法是:
检查每个 caseID、faculty 和 phase 的组合中,match 是否有两个不同的值(1 和 0),
并在 match 为 0 的行中将 response 替换为 0。
# Load dplyr for the pipe, group_by(), mutate() and n_distinct().
library(dplyr)
# Per (caseID, faculty, phase) group: response is 1 when the group holds
# exactly two distinct `match` values, then it is overwritten with 0 on the
# rows whose own `match` is 0.
# NOTE(review): there is no ungroup() call, so the result presumably stays
# grouped; it is also only printed, not assigned back to df.
df %>%
group_by(caseID, faculty, phase) %>%
mutate(response = as.integer(n_distinct(match) == 2),
response = replace(response, match == 0, 0))
# resident faculty submittedBy match caseID phase response
# <chr> <chr> <chr> <dbl> <chr> <chr> <dbl>
#1 george sally george 1 george_1 pre 1
#2 george sally sally 0 george_1 pre 0
#3 george sally george 1 george_1 intra 0
#4 jane carl jane 1 jane_1 pre 1
#5 jane carl carl 0 jane_1 pre 0
#6 jane carl carl 0 jane_1 intra 0