我创建了一个函数,该函数基于三个包含字符A到F的字符串来计算数字分数。通过确定每个字符串的最频繁字符来计算中间分数。最终分数由多数投票计算得出。如果所有三个中间分数都不相同,则最终数字分数将是所有中间分数的平均值。我知道该功能尚未优化-有几种方法可以更好地编写它。任何建议都值得欢迎!
我从以下StackOverflow问题中获得了部分代码... Character frequency in a string
fncLetterValue <- function(strLetter)
{
intValue = 9
if(strLetter=="A") intValue <- 1
if(strLetter=="B") intValue <- 2
if(strLetter=="C") intValue <- 3
if(strLetter=="D") intValue <- 4
if(strLetter=="E") intValue <- 5
if(strLetter=="F") intValue <- 6
return(intValue)
}
fncScore <- function(str1, str2, str3)
{
# determine frequency of letters in str1
A1 <- sum(charToRaw(str1)==charToRaw("A"))
B1 <- sum(charToRaw(str1)==charToRaw("B"))
C1 <- sum(charToRaw(str1)==charToRaw("C"))
D1 <- sum(charToRaw(str1)==charToRaw("D"))
E1 <- sum(charToRaw(str1)==charToRaw("E"))
F1 <- sum(charToRaw(str1)==charToRaw("F"))
Max1 <- max(A1,B1,C1,D1,E1,F1) # determine the maximum frequency
Score1 <- 9
if(A1==Max1) Score1 <- 1
if(B1==Max1) Score1 <- 2
if(C1==Max1) Score1 <- 3
if(D1==Max1) Score1 <- 4
if(E1==Max1) Score1 <- 5
if(F1==Max1) Score1 <- 6
# determine frequency of letters in str2
A2 <- sum(charToRaw(str2)==charToRaw("A"))
B2 <- sum(charToRaw(str2)==charToRaw("B"))
C2 <- sum(charToRaw(str2)==charToRaw("C"))
D2 <- sum(charToRaw(str2)==charToRaw("D"))
E2 <- sum(charToRaw(str2)==charToRaw("E"))
F2 <- sum(charToRaw(str2)==charToRaw("F"))
Max2 <- max(A2,B2,C2,D2,E2,F2) # determine the maximum frequency
Score2 <- 9
if(A2==Max2) Score2 <- 1
if(B2==Max2) Score2 <- 2
if(C2==Max2) Score2 <- 3
if(D2==Max2) Score2 <- 4
if(E2==Max2) Score2 <- 5
if(F2==Max2) Score2 <- 6
# determine frequency of letters in str3
A3 <- sum(charToRaw(str3)==charToRaw("A"))
B3 <- sum(charToRaw(str3)==charToRaw("B"))
C3 <- sum(charToRaw(str3)==charToRaw("C"))
D3 <- sum(charToRaw(str3)==charToRaw("D"))
E3 <- sum(charToRaw(str3)==charToRaw("E"))
F3 <- sum(charToRaw(str3)==charToRaw("F"))
Max3 <- max(A3,B3,C3,D3,E3,F3) # determine the maximum frequency
Score3 <- 9
if(A3==Max3) Score3 <- 1
if(B3==Max3) Score3 <- 2
if(C3==Max3) Score3 <- 3
if(D3==Max3) Score3 <- 4
if(E3==Max3) Score3 <- 5
if(F3==Max3) Score3 <- 6
# get final score by majority voting
dblFinalScore <- 9
if(Score1==Score2 | Score1==Score3) dblFinalScore <- Score1
if(Score2==Score1 | Score2==Score3) dblFinalScore <- Score2
if(Score3==Score1 | Score3==Score2) dblFinalScore <- Score3
if(dblFinalScore==9) dblFinalScore <- mean(c(Score1,Score2,Score3))
return(dblFinalScore)
}
# read csv
setwd("~/Downloads")
df <- read.csv("CompositeScore.csv", header = TRUE)
df$score <- fncScore(df$Vector1, df$Vector2, df$Vector3) #THIS LINE GIVES AN ERROR!
该功能确实可以在控制台中处理一组字符串... 例如fncScore(“ AAAABBBBBBBBB”,“ ABBCCCCCCCCCC”,“ FFFFFFFFFFF”)
[1] 3.666667
但是,相同的功能不适用于数据框。 我收到以下错误: “ charToRaw(str1)中的错误:参数必须是长度为1的字符向量”
我希望在数据框中添加一个带有最终得分的新列。
我在做什么错了?
答案 0 :(得分:0)
如果我正确理解了您的计分系统,我认为这可以满足您使用tidyverse
...
library(tidyverse)
df <- read_csv("CompositeScore.csv")
scores <- function(x){ #function to identify most common letter in each string
map_int(x, ~which.max(str_count(., LETTERS[1:6])))
}
df <- df %>%
gather(key = Vector, value = value, -Observation) %>% #change to 'long' format
mutate(score = scores(value)) %>% #calculate scores
group_by(Observation) %>% #group by Observation for next line
mutate(score = ifelse(sum(score == median(score)) > 1, #if two the same
median(score), #then median
mean(score))) %>% #otherwise mean
spread(key = Vector, value = value) #back to wide format
head(df)
Observation score Vector1 Vector2 Vector3
1 1 3.33 CCEDDBEACBAD ADAABEEAEADD ACEFBAFDFDCB
2 2 3.33 ECBDEFACDAEA AFDEECDBEDFF EBEFCCEAEDFB
3 3 5 BDDDBBAFDFFF BBEEDEDBDCAE FFBADEEFCFFF
4 4 4 FDDFDEFBCBBA FECEEFDDCDAF FDFCDFEFBBCE
5 5 2 DBBEEDCBEECB CBFCAAFEBBCD FCFFBEBEEBDA
6 6 6 CBAEEEDBEBDF DCABCEAEDFFF CEFFFDBCADFC