是否有人知道能把数字的文字表示转换为实际数字的函数?例如把 "twenty thousand three hundred and five" 转换为 20305。我的数据框各行中有用文字写出的数字,希望把它们转换为数值。
在 qdap 包中,可以把用阿拉伯数字表示的数替换为对应的英文单词(例如,1001 变为 "one thousand one"),但不能反向转换:
library(qdap)
replace_number("I like 346457 ice cream cones.")
[1] "I like three hundred forty six thousand four hundred fifty seven ice cream cones."
答案 0 :(得分:14)
下面是一个起点,应该可以处理到几十万以内的数字。
#' Convert an English number phrase into its numeric value.
#'
#' Handles units, teens, tens, "hundred" and "thousand", e.g.
#' "forty six thousand four hundred fifty seven" -> 46457. The spoken
#' shorthand where a number word is followed directly by a teens/tens word
#' ("four fifty seven" -> 457, "nineteen ninety" -> 1990) is supported by
#' treating the first word as a count of hundreds.
#'
#' Matching is case-insensitive. Words may be separated by any run of
#' whitespace or hyphens, and the filler word "and" is ignored.
#'
#' @param word A single character string holding the number phrase.
#' @return A two-element list: the input `word` unchanged, and the numeric
#'   value (0 for an empty phrase).
#' @details Stops with an error when the phrase contains a word that is not
#'   a known number word, instead of silently reusing the previous word's
#'   value.
word2num <- function(word) {
  small_values <- c(
    zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
    six = 6, seven = 7, eight = 8, nine = 9,
    ten = 10, eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
    fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18, nineteen = 19,
    twenty = 20, thirty = 30, forty = 40, fifty = 50,
    sixty = 60, seventy = 70, eighty = 80, ninety = 90
  )
  # Two-digit words: a number word directly in front of one of these is
  # read as hundreds ("four fifty" -> 400 + 50).
  doubles <- names(small_values)[small_values >= 10]

  tokens <- strsplit(tolower(word), "[[:space:]-]+")[[1]]
  tokens <- tokens[!tokens %in% c("", "and")]

  total <- 0            # value of completed "... thousand" groups
  current <- 0          # value accumulated since the last "thousand"
  has_current <- FALSE  # TRUE once `current` has received a number word

  for (i in seq_along(tokens)) {
    token <- tokens[i]
    if (token %in% names(small_values)) {
      value <- small_values[[token]]
      if (i < length(tokens) && tokens[i + 1] %in% doubles) {
        value <- value * 100
      }
      current <- current + value
      has_current <- TRUE
    } else if (token %in% "hundred") {
      # A bare "hundred" counts as one hundred.
      current <- (if (has_current) current else 1) * 100
      has_current <- TRUE
    } else if (token %in% "thousand") {
      if (has_current) {
        total <- total + current * 1000
      } else if (total > 0) {
        # "six thousand thousand": scale what has been read so far.
        total <- total * 1000
      } else {
        # A bare "thousand" counts as one thousand.
        total <- 1000
      }
      current <- 0
      has_current <- FALSE
    } else {
      stop(
        sprintf("word2num: unrecognised number word \"%s\"", token),
        call. = FALSE
      )
    }
  }

  list(word, total + current)
}
结果:
> word2num("fifty seven")
[[1]]
[1] "fifty seven"
[[2]]
[1] 57
> word2num("four fifty seven")
[[1]]
[1] "four fifty seven"
[[2]]
[1] 457
> word2num("six thousand four fifty seven")
[[1]]
[1] "six thousand four fifty seven"
[[2]]
[1] 6457
> word2num("forty six thousand four fifty seven")
[[1]]
[1] "forty six thousand four fifty seven"
[[2]]
[1] 46457
> word2num("forty six thousand four hundred fifty seven")
[[1]]
[1] "forty six thousand four hundred fifty seven"
[[2]]
[1] 46457
> word2num("three forty six thousand four hundred fifty seven")
[[1]]
[1] "three forty six thousand four hundred fifty seven"
[[2]]
[1] 346457
我现在就可以告诉你,它对 word2num("four hundred thousand fifty") 不起作用,因为它不知道如何处理连续出现的 "hundred" 和 "thousand",不过这个算法应该可以改进。如果有人有改进,欢迎直接编辑,或者在自己的答案中基于它继续完善。我只是觉得这是一个挺有趣的问题(有那么一阵子)。
编辑:显然Bill Venables有一个名为english的软件包可能比上面的代码更好。
答案 1 :(得分:-1)
我认为这是一个更好的解决方案。
library(stringdist)
library(gdata)
#Convert numeric words to digits
#' Test whether a word is (approximately) an English number word.
#'
#' @param string Word to test; compared case-insensitively.
#' @param dist Largest string distance that still counts as a match.
#' @param method Distance method passed on to `stringdist()`.
#' @return `TRUE` when at least one number word lies within `dist` of
#'   `string`.
isNumericWord <- function(string, dist = 1, method = "dl") {
  units <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
             "eight", "nine")
  teens <- c("ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
             "sixteen", "seventeen", "eighteen", "nineteen")
  tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy",
            "eighty", "ninety")
  scales <- c("hundred", "thousand", "million", "billion", "trillion")
  vocabulary <- c(units, teens, tens, scales)

  distances <- stringdist(tolower(string), vocabulary, method = method)
  any(distances <= dist)
}
#' Split ordinal number words into a cardinal word plus its ending.
#'
#' Punctuation is replaced by spaces, then every word is compared (fuzzily,
#' case-insensitively) with the ordinals "first" ... "tenth" and
#' "twentieth" ... "ninetieth"; matches are rewritten as e.g. "one st",
#' "three rd", "twenty th". Any other word ending in "th" whose stem is
#' within `dist` of a number word ("sixteenth", "hundredth") is split into
#' "<stem> th".
#'
#' @param string A single character string (only the first element is used).
#' @param dist Largest string distance that still counts as a match.
#' @param method Distance method passed on to `stringdist()`.
#' @return The rewritten words joined by single spaces, or "" when the input
#'   contains no words.
numberTypes <- function(string, dist = 1, method = "dl") {
  nums <- c(
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty",
    "forty", "fifty", "sixty", "seventy", "eighty", "ninety", "hundred",
    "thousand", "million", "billion", "trillion"
  )
  # Irregular ordinals and their split form, applied in this order.
  ordinal_words <- c(
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh",
    "eighth", "ninth", "tenth", "twentieth", "thirtieth", "fortieth",
    "fiftieth", "sixtieth", "seventieth", "eightieth", "ninetieth"
  )
  ordinal_splits <- c(
    "one st", "two nd", "three rd", "four th", "five th", "six th",
    "seven th", "eight th", "nine th", "ten th", "twenty th", "thirty th",
    "forty th", "fifty th", "sixty th", "seventy th", "eighty th",
    "ninety th"
  )
  # For these ordinals a word ending in "y" is never a match, so that e.g.
  # "fifty" (one edit from "fifth") is left alone.
  skip_y_words <- ordinal_words %in% c(
    "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth"
  )

  string <- gsub("[[:punct:]]", " ", string)
  wrdsplit <- strsplit(string, split = " ")[[1]]
  wrdsplit <- wrdsplit[wrdsplit != ""]
  if (length(wrdsplit) == 0) {
    return("")
  }

  for (k in seq_along(ordinal_words)) {
    hit <- stringdist(ordinal_words[k], tolower(wrdsplit), method = method) <=
      dist
    if (skip_y_words[k]) {
      last_char <- tolower(
        substr(wrdsplit, nchar(wrdsplit), nchar(wrdsplit))
      )
      hit <- hit & last_char != "y"
    }
    wrdsplit <- ifelse(hit, ordinal_splits[k], wrdsplit)
  }

  # Handle other number words that end in "th"
  for (i in seq_along(wrdsplit)) {
    n_chars <- nchar(wrdsplit[i])
    substr_end <- substr(wrdsplit[i], n_chars - 1, n_chars)
    substr_beg <- substr(wrdsplit[i], 1, n_chars - 2)
    # The suffix test ignores case, like every other comparison here.
    if (tolower(substr_end) == "th" && n_chars != 2 &&
        any(stringdist(tolower(substr_beg), nums, method = method) <= dist)) {
      wrdsplit[i] <- paste(substr_beg, substr_end, sep = " ")
    }
  }

  # Entries already split above (e.g. "six th") can match again in the loop
  # and pick up a second space, so collapse runs of spaces before returning.
  gsub(" +", " ", paste(wrdsplit, collapse = " "))
}
#Convert number words to digits
#' Replace spelled-out English numbers in a sentence with digits.
#'
#' The sentence is lower-cased, hyphens and punctuation become spaces, and
#' ordinals are split by `numberTypes()` ("first" -> "one st"). Each run of
#' number words is then parsed into one or more numbers and written back as
#' digits; an ordinal ending directly after a number is re-attached, so
#' "twenty first place" becomes "21st place".
#'
#' Words are matched fuzzily: any word within `dist` edits of a number word
#' is treated as that number, and the closest number word wins. NOTE(review):
#' with the default `dist = 1` short everyday words that are one edit away
#' from a number word (for example "to" vs "two") appear to be converted as
#' well; pass `dist = 0` for exact matching.
#'
#' @param string A single character string.
#' @param dist Largest string distance that still counts as a match; also
#'   forwarded to `numberTypes()` and `isNumericWord()`.
#' @param method Distance method passed on to `stringdist()`.
#' @return The converted sentence (lower-cased, single-spaced), or `string`
#'   unchanged when it contains no number words.
Word2Num <- function(string, dist = 1, method = "dl") {
  original <- string
  # An NA input has no words to convert.
  if (length(string) == 1L && is.na(string)) {
    return(original)
  }

  # Define numbers (in lookup-priority order, used to break distance ties)
  one_digits <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
                  six = 6, seven = 7, eight = 8, nine = 9)
  double_digits <- c(eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
                     fifteen = 15, sixteen = 16, seventeen = 17,
                     eighteen = 18, nineteen = 19,
                     ten = 10, twenty = 20, thirty = 30, forty = 40,
                     fifty = 50, sixty = 60, seventy = 70, eighty = 80,
                     ninety = 90)
  large_digits <- c(hundred = 100, thousand = 1000, million = 1e6,
                    billion = 1e9, trillion = 1e12)
  vocabulary <- c(one_digits, double_digits, large_digits)
  endings <- c("rd", "th", "st", "nd")

  # Numeric value of one lower-case word, or NA when it is not a number word.
  word_value <- function(word) {
    if (word %in% endings ||
        !isTRUE(isNumericWord(word, dist = dist, method = method))) {
      return(NA_real_)
    }
    distances <- stringdist(word, names(vocabulary), method = method)
    usable <- !is.na(distances) & distances <= dist
    # A word ending in "y" is never a single digit ("eighty" is one edit
    # away from "eight").
    if (endsWith(word, "y")) {
      usable[seq_along(one_digits)] <- FALSE
    }
    if (!any(usable)) {
      return(NA_real_)
    }
    distances[!usable] <- Inf
    vocabulary[[which.min(distances)]]
  }

  # Split the string into words
  string <- gsub("-", " ", gsub(" & ", " and ", string, ignore.case = TRUE))
  string <- numberTypes(string, dist = dist, method = method)
  wrdsplit <- strsplit(tolower(string), " ")[[1]]
  wrdsplit <- wrdsplit[wrdsplit != ""]
  values <- vapply(wrdsplit, word_value, numeric(1), USE.NAMES = FALSE)
  if (!any(!is.na(values))) {
    return(original)
  }

  n_words <- length(wrdsplit)
  pieces <- character(n_words)  # output tokens; never more than n_words
  n_pieces <- 0L

  # Parser state for the number currently being read.
  in_number <- FALSE
  total <- 0            # completed thousand/million/... groups
  current <- 0          # value accumulated since the last large scale word
  has_current <- FALSE  # TRUE once `current` has received a word
  slot <- "empty"       # tens/units places: "empty", "tens" or "full"
  last_scale <- Inf     # most recent scale word >= 1000

  # One extra iteration past the end flushes a number that ends the sentence.
  for (i in seq_len(n_words + 1L)) {
    value <- if (i <= n_words) values[i] else NA_real_
    is_number <- !is.na(value)
    flushed <- FALSE

    if (in_number) {
      # A scale word always extends the number. A small word extends it only
      # if its place is still free ("twenty" + "one", but not "one" + "two").
      continues <- is_number && (
        value >= 100 || slot == "empty" ||
          (slot == "tens" && value >= 1 && value <= 9)
      )
      if (!continues) {
        n_pieces <- n_pieces + 1L
        # "%.0f" prints every digit, so no scipen option juggling is needed.
        pieces[n_pieces] <- sprintf("%.0f", total + current)
        in_number <- FALSE
        flushed <- TRUE
      }
    }

    if (is_number) {
      if (!in_number) {
        in_number <- TRUE
        total <- 0
        current <- 0
        has_current <- FALSE
        slot <- "empty"
        last_scale <- Inf
      }
      if (value < 100) {
        current <- current + value
        has_current <- TRUE
        slot <- if (value >= 20) "tens" else "full"
      } else if (value < 1000) {
        # "hundred": a bare "hundred" counts as one hundred.
        current <- (if (has_current) current else 1) * value
        has_current <- TRUE
        slot <- "empty"
      } else {
        if (value >= last_scale) {
          # "two thousand million": scale everything read so far.
          total <- (total + current) * value
        } else {
          total <- total + (if (has_current) current else 1) * value
        }
        current <- 0
        has_current <- FALSE
        slot <- "empty"
        last_scale <- value
      }
    } else if (i <= n_words) {
      word <- wrdsplit[i]
      if (flushed && word %in% endings) {
        # Combine the number with its ordinal ending ("21" + "st").
        pieces[n_pieces] <- paste0(pieces[n_pieces], word)
      } else {
        n_pieces <- n_pieces + 1L
        pieces[n_pieces] <- word
      }
    }
  }

  trimws(paste(pieces[seq_len(n_pieces)], collapse = " "))
}