superponer - Convertir el número escrito en número en R
tags$style shiny (2)
Aquí tienes un punto de partida que debería servirte para números de hasta cientos de miles.
# Convert an English number phrase into its numeric value.
#
# Arguments:
#   word  A single character string of English number words separated by
#         whitespace, e.g. "forty six thousand four hundred fifty seven".
#         Matching is case-insensitive, hyphens are treated as spaces and the
#         filler word "and" is ignored.
#
# Returns:
#   A two-element list: the input exactly as supplied, and the parsed number.
#
# Details:
#   * A number word directly followed by a teen/tens word counts as hundreds,
#     so "four fifty seven" is 457 and "three forty six thousand" is 346000.
#   * "hundred" and "thousand" are only understood as the very first word or
#     directly after a number word. Anything else (for example the second
#     magnitude in "four hundred thousand fifty", or an unknown word) stops
#     with an error instead of silently reusing the previous token's value.
word2num <- function(word) {
  one_digits <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
                  six = 6, seven = 7, eight = 8, nine = 9)
  teens <- c(eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
             fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18,
             nineteen = 19)
  ten_digits <- c(ten = 10, twenty = 20, thirty = 30, forty = 40, fifty = 50,
                  sixty = 60, seventy = 70, eighty = 80, ninety = 90)
  magnitudes <- c(hundred = 100, thousand = 1000)
  doubles <- c(teens, ten_digits)
  values <- c(one_digits, doubles)

  # Normalise the phrase: "fifty-seven" -> "fifty seven", drop "and" and the
  # empty tokens produced by repeated or leading whitespace.
  cleaned <- gsub("-", " ", tolower(word), fixed = TRUE)
  wsplit <- strsplit(cleaned, "[[:space:]]+")[[1]]
  wsplit <- wsplit[nzchar(wsplit) & wsplit != "and"]

  n <- length(wsplit)
  out <- 0
  i <- 1
  while (i <= n) {
    token <- wsplit[i]
    step <- 1

    # A magnitude word is only a value of its own when it opens the phrase.
    leading_magnitude <- i == 1 && token %in% names(magnitudes)
    if (leading_magnitude) {
      temp <- magnitudes[[token]]
    } else if (token %in% names(values)) {
      temp <- values[[token]]
    } else {
      stop("word2num: unsupported token '", token, "' at position ", i,
           call. = FALSE)
    }

    next_token <- if (i < n) wsplit[i + 1] else ""
    if (next_token %in% names(magnitudes)) {
      scale <- magnitudes[[next_token]]
      # Directly after a previous magnitude the scaled value is added on
      # ("six thousand four hundred"); otherwise everything accumulated so
      # far is scaled together ("forty six thousand").
      after_magnitude <- i > 1 && wsplit[i - 1] %in% names(magnitudes)
      if (after_magnitude) {
        out <- out + scale * temp
      } else {
        out <- scale * (out + temp)
      }
      step <- 2  # the magnitude word has been consumed as well
    } else if (!leading_magnitude && next_token %in% names(doubles)) {
      # Shorthand hundreds: "four fifty seven" means 4 * 100 + 57.
      out <- out + temp * 100
    } else {
      out <- out + temp
    }
    i <- i + step
  }

  list(word, out)
}
Resultados:
> word2num("fifty seven")
[[1]]
[1] "fifty seven"
[[2]]
[1] 57
> word2num("four fifty seven")
[[1]]
[1] "four fifty seven"
[[2]]
[1] 457
> word2num("six thousand four fifty seven")
[[1]]
[1] "six thousand four fifty seven"
[[2]]
[1] 6457
> word2num("forty six thousand four fifty seven")
[[1]]
[1] "forty six thousand four fifty seven"
[[2]]
[1] 46457
> word2num("forty six thousand four hundred fifty seven")
[[1]]
[1] "forty six thousand four hundred fifty seven"
[[2]]
[1] 46457
> word2num("three forty six thousand four hundred fifty seven")
[[1]]
[1] "three forty six thousand four hundred fifty seven"
[[2]]
[1] 346457
Ya puedo adelantar que esto no funcionará para word2num("four hundred thousand fifty"), porque el código no sabe cómo manejar los términos consecutivos "hundred" y "thousand", pero probablemente el algoritmo se pueda modificar. Cualquiera puede sentirse libre de editar esto si tiene mejoras, o de basarse en ello para su propia respuesta. Simplemente me pareció un problema divertido con el que jugar (por un rato).
Edición: aparentemente, Bill Venables tiene un paquete llamado english que puede lograr esto incluso mejor que el código anterior.
¿Alguien conoce una función para convertir la representación en texto de un número en el número correspondiente, por ejemplo, "veinte mil trescientos cinco" en 20305? Tengo números escritos con palabras en las filas de un data frame y quiero convertirlos en números.
En el paquete qdap, puede reemplazar números representados numéricamente con palabras (por ejemplo, 1001 se convierte en mil uno), pero no al revés:
# Example: qdap's replace_number() spells digits out as number words
# (digits -> words only; the result is shown on the line below).
library(qdap)
replace_number("I like 346457 ice cream cones.")
[1] "I like three hundred forty six thousand four hundred fifty seven ice cream cones."
Esto es lo que creo que es una mejor solución.
library(stringdist)
library(gdata)
#Convert numeric words to digits
# Fuzzy check for whether `string` is an English number word.
#
# Arguments:
#   string  Word to test; it is lower-cased before comparison.
#   dist    Largest string distance that still counts as a match.
#   method  Distance method, forwarded to stringdist().
#
# Returns:
#   TRUE when at least one known number word lies within `dist` of `string`.
isNumericWord <- function(string, dist = 1, method = "dl") {
  units <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
             "eight", "nine")
  ten_to_nineteen <- c("ten", "eleven", "twelve", "thirteen", "fourteen",
                       "fifteen", "sixteen", "seventeen", "eighteen",
                       "nineteen")
  tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy",
            "eighty", "ninety")
  magnitudes <- c("hundred", "thousand", "million", "billion", "trillion")
  vocabulary <- c(units, ten_to_nineteen, tens, magnitudes)

  candidate <- tolower(string)
  distances <- stringdist(candidate, vocabulary, method = method)
  any(distances <= dist)
}
# Split English ordinal words into "<cardinal> <suffix>" pairs.
#
# Punctuation is replaced by spaces, then every word that fuzzily matches an
# ordinal is rewritten, e.g. "first" -> "one st", "tenth" -> "ten th",
# "eleventh" -> "eleven th". All other words are returned unchanged.
#
# Arguments:
#   string  Text to process; only its first element is used.
#   dist    Largest string distance that still counts as a match.
#   method  Distance method, forwarded to stringdist().
#
# Returns:
#   A single string of the processed words separated by single spaces, or ""
#   when the input contains no words.
numberTypes <- function(string, dist = 1, method = "dl") {
  nums <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen",
            "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
            "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
            "seventy", "eighty", "ninety", "hundred", "thousand", "million",
            "billion", "trillion")

  # Irregular ordinals and their replacement, applied in this order.
  ordinals <- c(
    first = "one st", second = "two nd", third = "three rd",
    fourth = "four th", fifth = "five th", sixth = "six th",
    seventh = "seven th", eighth = "eight th", ninth = "nine th",
    tenth = "ten th", twelfth = "twelve th",
    twentieth = "twenty th", thirtieth = "thirty th",
    fortieth = "forty th", fiftieth = "fifty th", sixtieth = "sixty th",
    seventieth = "seventy th", eightieth = "eighty th",
    ninetieth = "ninety th"
  )
  # These ordinals are one edit away from a tens word ("sixth"/"sixty"), so
  # words ending in "y" must not be treated as a match for them.
  y_guarded <- c("third", "fourth", "fifth", "sixth", "seventh", "eighth",
                 "ninth")

  string <- gsub("[[:punct:]]", " ", string)
  wrdsplit <- strsplit(string, split = " ")[[1]]
  wrdsplit <- wrdsplit[wrdsplit != ""]
  if (length(wrdsplit) == 0) {
    return("")
  }

  for (ordinal in names(ordinals)) {
    lowered <- tolower(wrdsplit)
    # A correctly spelled cardinal is never an ordinal: without this guard
    # "eight" (one deletion away from "eighth") would become "eight th".
    hit <- stringdist(ordinal, lowered, method = method) <= dist &
      !(lowered %in% nums)
    if (ordinal %in% y_guarded) {
      hit <- hit & !endsWith(lowered, "y")
    }
    wrdsplit[hit] <- ordinals[[ordinal]]
  }

  # Regular ordinals: a number word followed by "th" ("eleventh",
  # "hundredth") is split into the word and its suffix.
  for (i in seq_along(wrdsplit)) {
    token <- wrdsplit[i]
    len <- nchar(token)
    suffix <- substr(token, len - 1, len)
    stem <- substr(token, 1, len - 2)
    if (suffix == "th" && len != 2 &&
        any(stringdist(tolower(stem), nums, method = method) <= dist)) {
      wrdsplit[i] <- paste(stem, suffix, sep = " ")
    }
  }

  # Entries such as "ten th" are re-split by the loop above and end up with
  # two spaces; collapse any run of spaces back to a single one.
  gsub(" +", " ", paste(wrdsplit, collapse = " "))
}
#Convert number words to digits
# Replace English number words inside a sentence with digits.
#
# Arguments:
#   string  Text to convert; it is processed as one sentence.
#   dist    Largest string distance at which a misspelled word still counts
#           as a number word.
#   method  Distance method, forwarded to stringdist() and to the helpers
#           numberTypes() and isNumericWord().
#
# Returns:
#   When at least one number word is found: a single string in which every
#   run of number words is replaced by its value and ordinal suffixes are
#   attached ("twenty first" -> "21st"). The text is lower-cased and goes
#   through numberTypes(), which replaces punctuation with spaces.
#   When no number word is found: `string` exactly as supplied.
#
# Details:
#   Each run of consecutive number words is read left to right: values below
#   100 accumulate, "hundred" multiplies the running value, and "thousand" or
#   larger scales the running value and adds it to the total. Neighbouring
#   number words that cannot belong to the same number ("one two",
#   "nineteen ninety") are emitted as separate numbers.
Word2Num <- function(string, dist = 1, method = "dl") {
  original <- string

  # Vocabulary, in the order in which fuzzy matches are resolved.
  one_digits <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
                  six = 6, seven = 7, eight = 8, nine = 9)
  teens <- c(eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
             fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18,
             nineteen = 19)
  ten_digits <- c(ten = 10, twenty = 20, thirty = 30, forty = 40, fifty = 50,
                  sixty = 60, seventy = 70, eighty = 80, ninety = 90)
  large_digits <- c(hundred = 100, thousand = 1000, million = 1e6,
                    billion = 1e9, trillion = 1e12)
  double_digits <- c(teens, ten_digits)
  suffixes <- c("st", "nd", "rd", "th")

  # Value of the first entry of `table` within `dist` of `token`, else NA.
  first_match <- function(token, table) {
    hit <- stringdist(token, names(table), method = method) <= dist
    if (any(hit)) unname(table[hit][1]) else NA_real_
  }

  # Numeric value of one lower-case word, or NA when it is not a number word.
  token_value <- function(token) {
    if (!isNumericWord(token, dist = dist, method = method)) {
      return(NA_real_)
    }
    value <- NA_real_
    # Words ending in "y" are tens ("eighty"), never single digits ("eight").
    if (!endsWith(token, "y")) {
      value <- first_match(token, one_digits)
    }
    if (is.na(value)) {
      value <- first_match(token, double_digits)
    }
    if (is.na(value)) {
      value <- first_match(token, large_digits)
    }
    value
  }

  # Can `value` continue the number built so far (total + current)?
  can_extend <- function(total, current, value) {
    if (value >= 100) {
      return(TRUE)   # multipliers always attach to what precedes them
    }
    if (value == 0) {
      return(FALSE)  # "zero" never continues another number
    }
    remainder <- current %% 100
    if (remainder == 0) {
      # After "hundred"/"thousand"/...: any value below 100 may follow.
      return(current > 0 || total > 0)
    }
    # After a tens word only a single digit may follow ("twenty one").
    value < 10 && remainder >= 20 && remainder %% 10 == 0
  }

  # Split the string into lower-case words.
  string <- gsub("-", " ", gsub(" & ", " and ", string, ignore.case = TRUE))
  string <- numberTypes(string, dist = dist, method = method)
  wrdsplit <- strsplit(tolower(string), " ")[[1]]
  wrdsplit <- wrdsplit[wrdsplit != ""]

  values <- vapply(wrdsplit, token_value, numeric(1), USE.NAMES = FALSE)
  if (!any(!is.na(values))) {
    return(original)
  }

  # Single pass over the words. `start` is the position of the first word of
  # the number currently being built (0 when none is open). The extra
  # iteration at n + 1 closes a number that ends the sentence.
  n <- length(wrdsplit)
  is_numeral <- logical(n)
  total <- 0
  current <- 0
  start <- 0L
  for (k in seq_len(n + 1L)) {
    value <- if (k <= n) values[k] else NA_real_

    if (start > 0L &&
        (is.na(value) || !can_extend(total, current, value))) {
      # Write the finished number at its first word, blank the other words.
      wrdsplit[start] <- sprintf("%.0f", total + current)
      is_numeral[start] <- TRUE
      if (k - 1L > start) {
        wrdsplit[(start + 1L):(k - 1L)] <- ""
      }
      total <- 0
      current <- 0
      start <- 0L
    }

    if (!is.na(value)) {
      if (start == 0L) {
        start <- k
      }
      if (value == 100) {
        current <- max(current, 1) * 100
      } else if (value > 100) {
        total <- total + max(current, 1) * value
        current <- 0
      } else {
        current <- current + value
      }
    }
  }

  # Drop the blanked words, then attach ordinal suffixes to the number that
  # directly precedes them ("21" "st" -> "21st").
  keep <- wrdsplit != ""
  wrdsplit <- wrdsplit[keep]
  is_numeral <- is_numeral[keep]
  glue_at <- which(is_numeral & (c(wrdsplit[-1], "") %in% suffixes))
  if (length(glue_at) > 0) {
    wrdsplit[glue_at] <- paste0(wrdsplit[glue_at], wrdsplit[glue_at + 1L])
    wrdsplit <- wrdsplit[-(glue_at + 1L)]
  }

  trimws(paste(wrdsplit, collapse = " "))
}