我需要将one
转换为1
,将two
转换为2
,依此类推。
有没有办法用库或类或任何东西来做这件事?
答案 0 :(得分:95)
这段代码的大部分内容是设置numwords dict,这只能在第一次调用时完成。
def text2int(textnum, numwords={}):
if not numwords:
units = [
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen",
]
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
scales = ["hundred", "thousand", "million", "billion", "trillion"]
numwords["and"] = (1, 0)
for idx, word in enumerate(units): numwords[word] = (1, idx)
for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
current = result = 0
for word in textnum.split():
if word not in numwords:
raise Exception("Illegal word: " + word)
scale, increment = numwords[word]
current = current * scale + increment
if scale > 100:
result += current
current = 0
return result + current
print text2int("seven billion one hundred million thirty one thousand three hundred thirty seven")
#7100031337
答案 1 :(得分:12)
如果有人有兴趣,我会修改一个维护字符串其余部分的版本(虽然它可能有bug,但没有测试过多)。
def text2int (textnum, numwords={}):
if not numwords:
units = [
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen",
]
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
scales = ["hundred", "thousand", "million", "billion", "trillion"]
numwords["and"] = (1, 0)
for idx, word in enumerate(units): numwords[word] = (1, idx)
for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
ordinal_endings = [('ieth', 'y'), ('th', '')]
textnum = textnum.replace('-', ' ')
current = result = 0
curstring = ""
onnumber = False
for word in textnum.split():
if word in ordinal_words:
scale, increment = (1, ordinal_words[word])
current = current * scale + increment
if scale > 100:
result += current
current = 0
onnumber = True
else:
for ending, replacement in ordinal_endings:
if word.endswith(ending):
word = "%s%s" % (word[:-len(ending)], replacement)
if word not in numwords:
if onnumber:
curstring += repr(result + current) + " "
curstring += word + " "
result = current = 0
onnumber = False
else:
scale, increment = numwords[word]
current = current * scale + increment
if scale > 100:
result += current
current = 0
onnumber = True
if onnumber:
curstring += repr(result + current)
return curstring
示例:
>>> text2int("I want fifty five hot dogs for two hundred dollars.")
I want 55 hot dogs for 200 dollars.
如果您有“200美元”,可能会出现问题。但是,这真的非常粗糙。
答案 2 :(得分:9)
感谢您的代码片段......节省了我很多时间!
我需要处理一些额外的解析案例,例如序数词(“第一个”,“第二个”),带连字符的词(“一百”)和带连字符的序数词(如“第五十七”),所以我添加了几行:
def text2int(textnum, numwords={}):
if not numwords:
units = [
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen",
]
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
scales = ["hundred", "thousand", "million", "billion", "trillion"]
numwords["and"] = (1, 0)
for idx, word in enumerate(units): numwords[word] = (1, idx)
for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
ordinal_endings = [('ieth', 'y'), ('th', '')]
textnum = textnum.replace('-', ' ')
current = result = 0
for word in textnum.split():
if word in ordinal_words:
scale, increment = (1, ordinal_words[word])
else:
for ending, replacement in ordinal_endings:
if word.endswith(ending):
word = "%s%s" % (word[:-len(ending)], replacement)
if word not in numwords:
raise Exception("Illegal word: " + word)
scale, increment = numwords[word]
current = current * scale + increment
if scale > 100:
result += current
current = 0
return result + current`
答案 3 :(得分:7)
我刚刚向PyPI发布了一个名为word2number的python模块,用于确切的目的。 https://github.com/akshaynagpal/w2n
使用以下方式安装:
pip install word2number
确保您的点数已更新为最新版本。
用法:
from word2number import w2n
print w2n.word_to_num("two million three thousand nine hundred and eighty four")
2003984
答案 4 :(得分:6)
我需要一些不同的东西,因为我的输入是从语音到文本的转换,解决方案并不总是对数字求和。例如,“我的邮政编码是一二三四五”不应转换为“我的邮政编码是15”。
我采用了安德鲁(Andrew)的answer并对其进行了调整,以处理人们强调为错误的其他一些情况,并且还增加了对示例的支持,例如我上面提到的邮政编码。下面显示了一些基本的测试用例,但我确定仍有改进的空间。
def is_number(x):
if type(x) == str:
x = x.replace(',', '')
try:
float(x)
except:
return False
return True
def text2int (textnum, numwords={}):
units = [
'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
'sixteen', 'seventeen', 'eighteen', 'nineteen',
]
tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
scales = ['hundred', 'thousand', 'million', 'billion', 'trillion']
ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
ordinal_endings = [('ieth', 'y'), ('th', '')]
if not numwords:
numwords['and'] = (1, 0)
for idx, word in enumerate(units): numwords[word] = (1, idx)
for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
textnum = textnum.replace('-', ' ')
current = result = 0
curstring = ''
onnumber = False
lastunit = False
lastscale = False
def is_numword(x):
if is_number(x):
return True
if word in numwords:
return True
return False
def from_numword(x):
if is_number(x):
scale = 0
increment = int(x.replace(',', ''))
return scale, increment
return numwords[x]
for word in textnum.split():
if word in ordinal_words:
scale, increment = (1, ordinal_words[word])
current = current * scale + increment
if scale > 100:
result += current
current = 0
onnumber = True
lastunit = False
lastscale = False
else:
for ending, replacement in ordinal_endings:
if word.endswith(ending):
word = "%s%s" % (word[:-len(ending)], replacement)
if (not is_numword(word)) or (word == 'and' and not lastscale):
if onnumber:
# Flush the current number we are building
curstring += repr(result + current) + " "
curstring += word + " "
result = current = 0
onnumber = False
lastunit = False
lastscale = False
else:
scale, increment = from_numword(word)
onnumber = True
if lastunit and (word not in scales):
# Assume this is part of a string of individual numbers to
# be flushed, such as a zipcode "one two three four five"
curstring += repr(result + current)
result = current = 0
if scale > 1:
current = max(1, current)
current = current * scale + increment
if scale > 100:
result += current
current = 0
lastscale = False
lastunit = False
if word in scales:
lastscale = True
elif word in units:
lastunit = True
if onnumber:
curstring += repr(result + current)
return curstring
一些测试...
one two three -> 123
three forty five -> 345
three and forty five -> 3 and 45
three hundred and forty five -> 345
three hundred -> 300
twenty five hundred -> 2500
three thousand and six -> 3006
three thousand six -> 3006
nineteenth -> 19
twentieth -> 20
first -> 1
my zip is one two three four five -> my zip is 12345
nineteen ninety six -> 1996
fifty-seventh -> 57
one million -> 1000000
first hundred -> 100
I will buy the first thousand -> I will buy the 1000 # probably should leave ordinal in the string
thousand -> 1000
hundred and six -> 106
1 million -> 1000000
答案 5 :(得分:4)
这是一个简单的案例方法:
>>> number = {'one':1,
... 'two':2,
... 'three':3,}
>>>
>>> number['two']
2
或者您正在寻找可以处理“一万二千一百七十二”的东西吗?
答案 6 :(得分:3)
如果您要解析的数字数量有限,可以很容易地将其硬编码到字典中。
对于稍微复杂的情况,您可能希望根据相对简单的数字语法自动生成此字典。一些事情(当然,概括......)
for i in range(10):
myDict[30 + i] = "thirty-" + singleDigitsDict[i]
如果您需要更广泛的内容,那么您似乎需要自然语言处理工具。 This article可能是一个很好的起点。
答案 7 :(得分:3)
这是第一个答案中代码的c#实现:
public static double ConvertTextToNumber(string text)
{
string[] units = new string[] {
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen",
};
string[] tens = new string[] {"", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"};
string[] scales = new string[] { "hundred", "thousand", "million", "billion", "trillion" };
Dictionary<string, ScaleIncrementPair> numWord = new Dictionary<string, ScaleIncrementPair>();
numWord.Add("and", new ScaleIncrementPair(1, 0));
for (int i = 0; i < units.Length; i++)
{
numWord.Add(units[i], new ScaleIncrementPair(1, i));
}
for (int i = 1; i < tens.Length; i++)
{
numWord.Add(tens[i], new ScaleIncrementPair(1, i * 10));
}
for (int i = 0; i < scales.Length; i++)
{
if(i == 0)
numWord.Add(scales[i], new ScaleIncrementPair(100, 0));
else
numWord.Add(scales[i], new ScaleIncrementPair(Math.Pow(10, (i*3)), 0));
}
double current = 0;
double result = 0;
foreach (var word in text.Split(new char[] { ' ', '-', '—'}))
{
ScaleIncrementPair scaleIncrement = numWord[word];
current = current * scaleIncrement.scale + scaleIncrement.increment;
if (scaleIncrement.scale > 100)
{
result += current;
current = 0;
}
}
return result + current;
}
public struct ScaleIncrementPair
{
public double scale;
public int increment;
public ScaleIncrementPair(double s, int i)
{
scale = s;
increment = i;
}
}
答案 8 :(得分:2)
def parse_int(string):
ONES = {'zero': 0,
'one': 1,
'two': 2,
'three': 3,
'four': 4,
'five': 5,
'six': 6,
'seven': 7,
'eight': 8,
'nine': 9,
'ten': 10,
'eleven': 11,
'twelve': 12,
'thirteen': 13,
'fourteen': 14,
'fifteen': 15,
'sixteen': 16,
'seventeen': 17,
'eighteen': 18,
'nineteen': 19,
'twenty': 20,
'thirty': 30,
'forty': 40,
'fifty': 50,
'sixty': 60,
'seventy': 70,
'eighty': 80,
'ninety': 90,
}
numbers = []
for token in string.replace('-', ' ').split(' '):
if token in ONES:
numbers.append(ONES[token])
elif token == 'hundred':
numbers[-1] *= 100
elif token == 'thousand':
numbers = [x * 1000 for x in numbers]
elif token == 'million':
numbers = [x * 1000000 for x in numbers]
return sum(numbers)
我认为这段代码更容易理解
使用 1 到 100 万范围内的 700 个随机数进行测试,效果很好
答案 9 :(得分:1)
马克·伯恩斯(Marc Burns)有ruby gem这样做。我最近分叉它,以增加支持多年。您可以拨打ruby code from python。
require 'numbers_in_words'
require 'numbers_in_words/duck_punch'
nums = ["fifteen sixteen", "eighty five sixteen", "nineteen ninety six",
"one hundred and seventy nine", "thirteen hundred", "nine thousand two hundred and ninety seven"]
nums.each {|n| p n; p n.in_numbers}
结果:
"fifteen sixteen"
1516
"eighty five sixteen"
8516
"nineteen ninety six"
1996
"one hundred and seventy nine"
179
"thirteen hundred"
1300
"nine thousand two hundred and ninety seven"
9297
答案 10 :(得分:1)
e_h的C#实现的快速而脏的Java端口(上图)。请注意,两者都返回double,而不是int。
public class Text2Double {
public double Text2Double(String text) {
String[] units = new String[]{
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen",
};
String[] tens = new String[]{"", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"};
String[] scales = new String[]{"hundred", "thousand", "million", "billion", "trillion"};
Map<String, ScaleIncrementPair> numWord = new LinkedHashMap<>();
numWord.put("and", new ScaleIncrementPair(1, 0));
for (int i = 0; i < units.length; i++) {
numWord.put(units[i], new ScaleIncrementPair(1, i));
}
for (int i = 1; i < tens.length; i++) {
numWord.put(tens[i], new ScaleIncrementPair(1, i * 10));
}
for (int i = 0; i < scales.length; i++) {
if (i == 0)
numWord.put(scales[i], new ScaleIncrementPair(100, 0));
else
numWord.put(scales[i], new ScaleIncrementPair(Math.pow(10, (i * 3)), 0));
}
double current = 0;
double result = 0;
for(String word : text.split("[ -]"))
{
ScaleIncrementPair scaleIncrement = numWord.get(word);
current = current * scaleIncrement.scale + scaleIncrement.increment;
if (scaleIncrement.scale > 100) {
result += current;
current = 0;
}
}
return result + current;
}
}
public class ScaleIncrementPair
{
public double scale;
public int increment;
public ScaleIncrementPair(double s, int i)
{
scale = s;
increment = i;
}
}
答案 11 :(得分:1)
进行更改,以便text2int(scale)返回正确的转换。例如,text2int(“百”)=&gt; 100.
import re
numwords = {}
def text2int(textnum):
if not numwords:
units = [ "zero", "one", "two", "three", "four", "five", "six",
"seven", "eight", "nine", "ten", "eleven", "twelve",
"thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
"eighteen", "nineteen"]
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty",
"seventy", "eighty", "ninety"]
scales = ["hundred", "thousand", "million", "billion", "trillion",
'quadrillion', 'quintillion', 'sexillion', 'septillion',
'octillion', 'nonillion', 'decillion' ]
numwords["and"] = (1, 0)
for idx, word in enumerate(units): numwords[word] = (1, idx)
for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5,
'eighth':8, 'ninth':9, 'twelfth':12}
ordinal_endings = [('ieth', 'y'), ('th', '')]
current = result = 0
tokens = re.split(r"[\s-]+", textnum)
for word in tokens:
if word in ordinal_words:
scale, increment = (1, ordinal_words[word])
else:
for ending, replacement in ordinal_endings:
if word.endswith(ending):
word = "%s%s" % (word[:-len(ending)], replacement)
if word not in numwords:
raise Exception("Illegal word: " + word)
scale, increment = numwords[word]
if scale > 1:
current = max(1, current)
current = current * scale + increment
if scale > 100:
result += current
current = 0
return result + current
答案 12 :(得分:1)
使用python软件包: WordToDigits
pip安装单词对数字
它可以找到句子中以单词形式出现的数字,然后将其转换为正确的数字格式。如果存在的话,还要照顾小数部分。 数字的单词表示形式可以在段落的任何地方。
答案 13 :(得分:0)
快速解决方案是使用inflect.py生成翻译字典。
inflect.py有一个number_to_words()
函数,它会将一个数字(例如2
)转换为它的单词格式(例如'two'
)。不幸的是,它没有提供它的反向(这将允许你避免翻译字典路由)。同样,您可以使用该函数来构建翻译词典:
>>> import inflect
>>> p = inflect.engine()
>>> word_to_number_mapping = {}
>>>
>>> for i in range(1, 100):
... word_form = p.number_to_words(i) # 1 -> 'one'
... word_to_number_mapping[word_form] = i
...
>>> print word_to_number_mapping['one']
1
>>> print word_to_number_mapping['eleven']
11
>>> print word_to_number_mapping['forty-three']
43
如果您愿意花一些时间,可以检查一下number_to_words()
函数的inflect.py的内部工作原理并构建自己的代码来动态执行此操作(我没有尝试过这一点)。
答案 14 :(得分:0)
我使用import nltk nltk.download('punkt'),并且可以正常工作。
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
macbeth_text_words = (word_tokenize(macbeth_text))
n_words = len(macbeth_text_words)
unique_words = len(set(macbeth_text_words))
print('Total Words: %d' % n_words)
print('Unique Words: %d' % unique_words)
答案 15 :(得分:0)
我接受了@recursive的logic并转换为Ruby。我还对查找表进行了硬编码,因此它虽然不那么酷,但可以帮助新手了解正在发生的事情。
WORDNUMS = {"zero"=> [1,0], "one"=> [1,1], "two"=> [1,2], "three"=> [1,3],
"four"=> [1,4], "five"=> [1,5], "six"=> [1,6], "seven"=> [1,7],
"eight"=> [1,8], "nine"=> [1,9], "ten"=> [1,10],
"eleven"=> [1,11], "twelve"=> [1,12], "thirteen"=> [1,13],
"fourteen"=> [1,14], "fifteen"=> [1,15], "sixteen"=> [1,16],
"seventeen"=> [1,17], "eighteen"=> [1,18], "nineteen"=> [1,19],
"twenty"=> [1,20], "thirty" => [1,30], "forty" => [1,40],
"fifty" => [1,50], "sixty" => [1,60], "seventy" => [1,70],
"eighty" => [1,80], "ninety" => [1,90],
"hundred" => [100,0], "thousand" => [1000,0],
"million" => [1000000, 0]}
def text_2_int(string)
numberWords = string.gsub('-', ' ').split(/ /) - %w{and}
current = result = 0
numberWords.each do |word|
scale, increment = WORDNUMS[word]
current = current * scale + increment
if scale > 100
result += current
current = 0
end
end
return result + current
end
我一直想处理类似two thousand one hundred and forty-six
的字符串
答案 16 :(得分:0)
此代码适用于系列数据:
import pandas as pd
mylist = pd.Series(['one','two','three'])
mylist1 = []
for x in range(len(mylist)):
mylist1.append(w2n.word_to_num(mylist[x]))
print(mylist1)
答案 17 :(得分:0)
这可以处理印度风格单词中的数字、一些分数、数字和单词的组合以及加法。
def words_to_number(words):
numbers = {"zero":0, "a":1, "half":0.5, "quarter":0.25, "one":1,"two":2,
"three":3, "four":4,"five":5,"six":6,"seven":7,"eight":8,
"nine":9, "ten":10,"eleven":11,"twelve":12, "thirteen":13,
"fourteen":14, "fifteen":15,"sixteen":16,"seventeen":17,
"eighteen":18,"nineteen":19, "twenty":20,"thirty":30, "forty":40,
"fifty":50,"sixty":60,"seventy":70, "eighty":80,"ninety":90}
groups = {"hundred":100, "thousand":1_000,
"lac":1_00_000, "lakh":1_00_000,
"million":1_000_000, "crore":10**7,
"billion":10**9, "trillion":10**12}
split_at = ["and", "plus"]
n = 0
skip = False
words_array = words.split(" ")
for i, word in enumerate(words_array):
if not skip:
if word in groups:
n*= groups[word]
elif word in numbers:
n += numbers[word]
elif word in split_at:
skip = True
remaining = ' '.join(words_array[i+1:])
n+=words_to_number(remaining)
else:
try:
n += float(word)
except ValueError as e:
raise ValueError(f"Invalid word {word}") from e
return n
测试:
print(words_to_number("a million and one"))
>> 1000001
print(words_to_number("one crore and one"))
>> 1000,0001
print(words_to_number("0.5 million one"))
>> 500001.0
print(words_to_number("half million and one hundred"))
>> 500100.0
print(words_to_number("quarter"))
>> 0.25
print(words_to_number("one hundred plus one"))
>> 101
答案 18 :(得分:-2)
This code works only for numbers below 99.
both word to Int and int to word.
(for rest need to implement 10-20 lines of code and simple logic. This is just simple code for beginners)
num=input("Enter the number you want to convert : ")
mydict={'1': 'One', '2': 'Two', '3': 'Three', '4': 'Four', '5': 'Five','6': 'Six', '7': 'Seven', '8': 'Eight', '9': 'Nine', '10': 'Ten','11': 'Eleven', '12': 'Twelve', '13': 'Thirteen', '14': 'Fourteen', '15': 'Fifteen', '16': 'Sixteen', '17': 'Seventeen', '18': 'Eighteen', '19': 'Nineteen'}
mydict2=['','','Twenty','Thirty','Fourty','fifty','sixty','Seventy','Eighty','Ninty']
if num.isdigit():
if(int(num)<20):
print(" :---> "+mydict[num])
else:
var1=int(num)%10
var2=int(num)/10
print(" :---> "+mydict2[int(var2)]+mydict[str(var1)])
else:
num=num.lower();
dict_w={'one':1,'two':2,'three':3,'four':4,'five':5,'six':6,'seven':7,'eight':8,'nine':9,'ten':10,'eleven':11,'twelve':12,'thirteen':13,'fourteen':14,'fifteen':15,'sixteen':16,'seventeen':'17','eighteen':'18','nineteen':'19'}
mydict2=['','','twenty','thirty','fourty','fifty','sixty','seventy','eighty','ninty']
divide=num[num.find("ty")+2:]
if num:
if(num in dict_w.keys()):
print(" :---> "+str(dict_w[num]))
elif divide=='' :
for i in range(0, len(mydict2)-1):
if mydict2[i] == num:
print(" :---> "+str(i*10))
else :
str3=0
str1=num[num.find("ty")+2:]
str2=num[:-len(str1)]
for i in range(0, len(mydict2) ):
if mydict2[i] == str2:
str3=i;
if str2 not in mydict2:
print("----->Invalid Input<-----")
else:
try:
print(" :---> "+str((str3*10)+dict_w[str1]))
except:
print("----->Invalid Input<-----")
else:
print("----->Please Enter Input<-----")