在R中将用文字书写的数字转换为数值

时间:2013-08-20 10:14:17

标签: r text qdap

是否有人知道有什么函数可以把数字的文字表示转换为实际的数值,例如把“twenty thousand three hundred five”(二万零三百零五)转换为20305?我的数据框中有些行里的数字是用文字书写的,希望把它们转换为数值。

在qdap包中,您可以把用阿拉伯数字表示的数替换为单词(例如,1001变为“one thousand one”),但无法进行反向转换:

library(qdap)
replace_number("I like 346457 ice cream cones.")
[1] "I like three hundred forty six thousand four hundred fifty seven ice cream cones."

2 个答案:

答案 0 :(得分:14)

下面是一个起步方案,应该可以处理到几十万以内的数字。

#' Convert an English number phrase (up to the thousands) to a number.
#'
#' Understands the units "zero".."nine", the teens, the tens "ten".."ninety"
#' and the scale words "hundred" and "thousand". Hyphens and the connector
#' "and" are ignored, so "fifty-seven" and "one hundred and five" both parse.
#' The colloquial form in which a number word is directly followed by a
#' teen/tens word is read as hundreds: "four fifty seven" is 457.
#'
#' @param word A character string holding the phrase; only the first element
#'   is used.
#' @return A list of two elements: the input `word`, and the numeric value.
#'   A phrase with no number words yields 0.
#' @details Stops with an error when the phrase contains a word that is not
#'   part of the vocabulary above, instead of silently reusing the previous
#'   word's value.
word2num <- function(word) {
  units <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
             six = 6, seven = 7, eight = 8, nine = 9)
  teens_and_tens <- c(eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
                      fifteen = 15, sixteen = 16, seventeen = 17,
                      eighteen = 18, nineteen = 19,
                      ten = 10, twenty = 20, thirty = 30, forty = 40,
                      fifty = 50, sixty = 60, seventy = 70, eighty = 80,
                      ninety = 90)
  small <- c(units, teens_and_tens)

  # Split on any whitespace run so repeated spaces do not create empty tokens.
  tokens <- strsplit(tolower(gsub("-", " ", word, fixed = TRUE)),
                     "[[:space:]]+")[[1]]
  tokens <- tokens[!tokens %in% c("", "and")]

  unknown <- setdiff(tokens, c(names(small), "hundred", "thousand"))
  if (length(unknown) > 0) {
    stop("word2num: unrecognised number word(s): ",
         paste(unknown, collapse = ", "), call. = FALSE)
  }

  total <- 0        # value of the completed "thousand" groups
  current <- 0      # value of the group being built (below one thousand)
  pending <- FALSE  # TRUE once the current group has received a word
  n_tokens <- length(tokens)

  for (i in seq_along(tokens)) {
    token <- tokens[i]
    if (token == "hundred") {
      # A bare "hundred" counts as one hundred.
      current <- if (pending) current * 100 else 100
      pending <- TRUE
    } else if (token == "thousand") {
      # Close the group: everything said so far in it is a count of thousands.
      multiplier <- if (pending) current else 1
      total <- total + multiplier * 1000
      current <- 0
      pending <- FALSE
    } else {
      value <- small[[token]]
      # Colloquial hundreds: "four fifty seven" means four hundred fifty seven.
      if (i < n_tokens && tokens[i + 1] %in% names(teens_and_tens)) {
        value <- value * 100
      }
      current <- current + value
      pending <- TRUE
    }
  }

  list(word, total + current)
}

结果:

> word2num("fifty seven")
[[1]]
[1] "fifty seven"

[[2]]
[1] 57

> word2num("four fifty seven")
[[1]]
[1] "four fifty seven"

[[2]]
[1] 457

> word2num("six thousand four fifty seven")
[[1]]
[1] "six thousand four fifty seven"

[[2]]
[1] 6457

> word2num("forty six thousand four fifty seven")
[[1]]
[1] "forty six thousand four fifty seven"

[[2]]
[1] 46457

> word2num("forty six thousand four hundred fifty seven")
[[1]]
[1] "forty six thousand four hundred fifty seven"

[[2]]
[1] 46457

> word2num("three forty six thousand four hundred fifty seven")
[[1]]
[1] "three forty six thousand four hundred fifty seven"

[[2]]
[1] 346457

我现在就可以告诉你,它对word2num("four hundred thousand fifty")不起作用,因为它不知道如何处理连续出现的“hundred”和“thousand”,不过这个算法应该可以继续修改。如果有改进,欢迎任何人直接编辑,或在自己的答案中以此为基础继续完善。我只是觉得这是一个很有趣的问题(至少研究了一阵子)。

编辑:显然Bill Venables有一个名为english的软件包可能比上面的代码更好。

答案 1 :(得分:-1)

我认为这是一个更好的解决方案。

    library(stringdist)
    library(gdata)
    #Convert numeric words to digits
# Report whether `string` is within `dist` of any English number word.
#
# string: the word to test; it is lower-cased before comparison.
# dist:   largest string distance that still counts as a match.
# method: distance method, passed through to stringdist().
#
# Returns a single logical: TRUE when at least one vocabulary word
# ("zero" .. "trillion") lies within `dist` of `string`.
isNumericWord <- function(string, dist=1, method="dl") {
  vocabulary <- c("zero","one","two","three","four","five","six","seven","eight","nine",
                  "ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen",
                  "twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety",
                  "hundred","thousand","million","billion","trillion")
  gaps <- stringdist(tolower(string), vocabulary, method=method)
  any(gaps <= dist)
}
# Rewrite ordinal number words as a cardinal word plus a separate suffix,
# e.g. "first" -> "one st", "twentieth" -> "twenty th", "hundredth" -> "hundred th".
#
# string: the text to process; punctuation is replaced by spaces first.
# dist:   largest string distance that still counts as a match.
# method: distance method, passed through to stringdist().
#
# Returns one string with the (possibly rewritten) words joined by single
# spaces, or "" when the input contains no words.
numberTypes <- function(string, dist=1, method="dl") {
  vocabulary <- c("zero","one","two","three","four","five","six","seven","eight","nine",
                  "ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen",
                  "twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety",
                  "hundred","thousand","million","billion","trillion")

  # Punctuation becomes a space; split on single spaces and drop the empties.
  tokens <- strsplit(gsub("[[:punct:]]", " ", string), split=" ")[[1]]
  tokens <- tokens[tokens != ""]

  # Irregular ordinals, applied in this order. Entries flagged in
  # `skip_y_ending` ignore words ending in "y", so that e.g. "sixty" is not
  # mistaken for "sixth".
  ordinals <- c("first", "second", "third", "fourth", "fifth", "sixth",
                "seventh", "eighth", "ninth", "tenth", "twentieth",
                "thirtieth", "fortieth", "fiftieth", "sixtieth",
                "seventieth", "eightieth", "ninetieth")
  expansions <- c("one st", "two nd", "three rd", "four th", "five th",
                  "six th", "seven th", "eight th", "nine th", "ten th",
                  "twenty th", "thirty th", "forty th", "fifty th",
                  "sixty th", "seventy th", "eighty th", "ninety th")
  skip_y_ending <- c(FALSE, FALSE, rep(TRUE, 7), rep(FALSE, 9))

  for (k in seq_along(ordinals)) {
    near <- stringdist(ordinals[k], tolower(tokens), method=method) <= dist
    if (skip_y_ending[k]) {
      final_char <- tolower(substr(tokens, nchar(tokens), nchar(tokens)))
      near <- near & final_char != "y"
    }
    tokens <- ifelse(near, expansions[k], tokens)
  }

  if (length(tokens) == 0) {
    return("")
  }

  # Regular ordinals: a number word followed by "th" (e.g. "hundredth").
  for (k in seq_along(tokens)) {
    token <- tokens[k]
    width <- nchar(token)
    suffix <- substr(token, (width - 1), width)
    stem <- substr(token, 1, (width - 2))
    ends_in_th <- suffix == "th"
    has_stem <- width != 2
    stem_is_number <- any(stringdist(tolower(stem), vocabulary, method=method) <= dist)
    if (ends_in_th & has_stem & stem_is_number) {
      tokens[k] <- paste(stem, suffix, sep=" ")
    }
  }

  gsub("  ", " ", paste(tokens, collapse=" "))
}

#' Replace English number words in a string with digits.
#'
#' The text is normalised (hyphens and " & " expanded, ordinals split by
#' numberTypes(), everything lower-cased), every word is fuzzily matched
#' against the number vocabulary with stringdist(), and each run of adjacent
#' number words is replaced by the number(s) it spells. Ordinal suffixes
#' ("st", "nd", "rd", "th") that directly follow a converted number are glued
#' back on, so "twenty first" becomes "21st".
#'
#' @param string Text to convert; only the first element is used.
#' @param dist Largest string distance at which a word still counts as a
#'   number word. With the default of 1, near misses such as "fourty" match,
#'   but so can short ordinary words that are one edit away from a number word.
#' @param method Distance method passed to stringdist() and numberTypes().
#' @return A single string. If no number word is found, `string` is returned
#'   unchanged; otherwise the result is lower-cased, has punctuation replaced
#'   by spaces (done by numberTypes()) and has digits in place of number words.
#' @details A run of number words that cannot be one number is split into
#'   several ("one two three" gives "1 2 3", "nineteen eighty four" gives
#'   "19 84"). The word "and" is not a number word, so it ends a run.
Word2Num <- function(string, dist = 1, method = "dl") {
  original <- string
  if (length(string) == 0L || is.na(string[[1L]])) {
    return(original)
  }

  # Vocabulary in tie-breaking order: units, teens and tens, then scale words.
  units <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
             six = 6, seven = 7, eight = 8, nine = 9)
  vocab <- c(units,
             eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
             fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18,
             nineteen = 19,
             ten = 10, twenty = 20, thirty = 30, forty = 40, fifty = 50,
             sixty = 60, seventy = 70, eighty = 80, ninety = 90,
             hundred = 100, thousand = 1000, million = 1e6, billion = 1e9,
             trillion = 1e12)

  # Value of the closest vocabulary word within `dist`, or NA if none matches.
  lookup <- function(word) {
    gaps <- stringdist(word, names(vocab), method = method)
    usable <- !is.na(gaps) & gaps <= dist
    # A word ending in "y" is never a unit: keeps "eighty" away from "eight".
    if (endsWith(word, "y")) {
      usable[seq_along(units)] <- FALSE
    }
    if (!any(usable)) {
      return(NA_real_)
    }
    gaps[!usable] <- Inf
    unname(vocab[which.min(gaps)])
  }

  # Turn the values of one run of number words into one or more numbers.
  combine <- function(values) {
    numbers <- numeric(0)
    total <- 0            # completed scale groups (thousands, millions, ...)
    hundreds <- NA_real_  # hundreds part of the group being built
    low <- NA_real_       # part below one hundred of the group being built
    last_scale <- Inf     # scales must strictly decrease within one number
    started <- FALSE

    for (v in values) {
      if (v < 100) {
        fits <- is.na(low) ||
          (low >= 20 && low %% 10 == 0 && v >= 1 && v <= 9)
      } else if (v < 1000) {
        fits <- is.na(hundreds)
      } else {
        fits <- v < last_scale
      }
      if (started && !fits) {
        # The word cannot extend the number so far: emit it and start afresh.
        numbers <- c(numbers, total + sum(hundreds, low, na.rm = TRUE))
        total <- 0
        hundreds <- NA_real_
        low <- NA_real_
        last_scale <- Inf
      }
      started <- TRUE

      if (v < 100) {
        low <- if (is.na(low)) v else low + v
      } else if (v < 1000) {
        # A bare "hundred" counts as one hundred.
        hundreds <- (if (is.na(low)) 1 else low) * 100
        low <- NA_real_
      } else {
        group <- if (is.na(hundreds) && is.na(low)) {
          1
        } else {
          sum(hundreds, low, na.rm = TRUE)
        }
        total <- total + group * v
        hundreds <- NA_real_
        low <- NA_real_
        last_scale <- v
      }
    }
    if (started) {
      numbers <- c(numbers, total + sum(hundreds, low, na.rm = TRUE))
    }
    numbers
  }

  # Split the string into lower-case words.
  string <- gsub("-", " ", gsub(" & ", " and ", string, ignore.case = TRUE))
  string <- numberTypes(string, dist = dist, method = method)
  words <- strsplit(tolower(string), " ")[[1]]
  words <- words[words != ""]

  values <- vapply(words, lookup, numeric(1), USE.NAMES = FALSE)
  is_number <- !is.na(values)
  if (!any(is_number)) {
    return(original)
  }

  # Rebuild the text run by run, by position rather than by matching values,
  # so repeated words elsewhere in the text are never touched.
  runs <- rle(is_number)
  run_end <- cumsum(runs$lengths)
  run_start <- run_end - runs$lengths + 1L
  pieces <- vector("list", length(run_end))
  for (k in seq_along(run_end)) {
    idx <- run_start[k]:run_end[k]
    pieces[[k]] <- if (runs$values[k]) {
      # "%.0f" prints large values in full instead of scientific notation.
      sprintf("%.0f", combine(values[idx]))
    } else {
      words[idx]
    }
  }
  tokens <- unlist(pieces)
  token_is_number <- rep(runs$values, lengths(pieces))

  # Combine numbers with their ordinal endings ("1" + "st" -> "1st").
  after_number <- c(FALSE, head(token_is_number, -1))
  suffix_at <- which(tokens %in% c("rd", "th", "st", "nd") & after_number)
  if (length(suffix_at) > 0) {
    tokens[suffix_at - 1] <- paste0(tokens[suffix_at - 1], tokens[suffix_at])
    tokens <- tokens[-suffix_at]
  }

  trimws(paste(tokens, collapse = " "))
}