Question

我如何设法创建字母转换矩阵？

我有一个像这样的字母列表：

[u'T', u'i', u'r', u's', u'd', u'a', u'g', u' ', u's', u'k', u'a', u'l', u' ', u'd', u'u', u' ', u'i', u'n', u's', u't', u'a', u'l', u'l', u'e', u'r', u'e', u' ', u'e', u'n', u' ', u'P', u'y', u't', u'h', u'o', u'n', u' ', u'f', u'o', u'r', u't', u'o', u'l', u'k', u'e', u'r', u',', u' ', u'o', u'g', u' ', u'l',u'P', u'l', u'a', u'n', u' ', u'f', u'o', u'r', u' ', u'u', u'g', u'e', u'n', u'D', u'e', u'n', u'n', u'e', u' ', u'u', u'g', u'e', u' ', u'd', u'r', u'e', u'j', u'e', u'r', u' ', u's', u'i', u'g', u' ', u'o', u'm', u' ', u'a', u't', u' ', u'k', u'o', u'm', u'm', u'e', u' ', u'i', u'g', u'a', u'n', u'g', u' ', u'm', u'e', u'd', u' ', u'P', u'y', u't', u'h', u'o', u'n', u'.', u' ', u' ', u'T', u'i', u'r', u's', u'd', u'a', u'g', u' ', u's', u'k', u'a', u'l', u' ', u'd', u'u', u' ', u'i', u'n', u's', u't', u'a', u'l', u'l', u'e', u'r', u'e', u' ', u'e', u'n', u' ', u'P', u'y', u't', u'h', u'o', u'n', u' ', u'f', u'o', u'r', u't', u'o', u'l', u'k', u'e', u'r', u',', u' ', u'o', u'g', u' ', u'l', u'b', u'r', u'e', u' ', u'd', u'e', u'n', u'n', u'e', u' ', u'a', u't', u' ', u'k', u'e', u'n', u'd', u'e', u' ', u'v', u'e', u'd', u' ', u'a', u't', u' ', u'k', u'b', u'r', u'e', u' ', u'n', u'o', u'g', u'l', u'e', u' ', u'p', u'r', u'o', u'g', u'r', u'a', u'm', u'm', u'e', u'r', u'.', u' ', u' ', u'I', u'P', u'y', u't', u'h', u'o', u'n', u' ', u'k', u'a', u'n', u' ', u'a', u'n', u'b', u'e', u'f', u'a', u'l', u'e', u's', u' ', u'd', u'a', u' ', u'd', u'e', u'n', u'n', u'e', u' ', u'f', u'i', u'n', u'd', u'e', u's', u' ', u't', u'i', u'l', u' ', u'L', u'i', u'n']

如何根据此字母列表创建转换矩阵？我有Python transition matrix的以下代码：

 def tmatrix(self, lst):
        b = [[0 for _ in xrange(len(lst))] for _ in xrange(len(lst))]
        for (x,y), c in Counter(zip(lst, lst[1:])).iteritems():
            b[x-1][y-1] = c
        return b

但是我收到以下错误，因为我有一个unicode对象列表而不是整数。 TypeError: unsupported operand type(s) for -: 'unicode' and 'int'。我如何转换代码以支持unicode对象？

Answer 1

您链接的代码使用整数来计算序列。然后可以很容易地将整数转换为索引到转换矩阵中（1转换为索引0等）。

您链接的算法也仅适用于独特元素，其中构建的矩阵为3乘3，而不是10乘10。

您必须为输入列表执行相同的操作：

from collections import Counter, defaultdict
from itertools import count

def tmatrix(self, lst):
    # defaultdict that'll produce a unique index for each unique character
    # encountered in lst
    indices = defaultdict(count().next)
    unique_count = len(set(lst))
    b = [[0 for _ in xrange(unique_count)] for _ in xrange(unique_count)]
    for (x, y), c in Counter(zip(lst, lst[1:])).iteritems():
        b[indices[x]][indices[y]] = c
    return b

这里indices字典将字符映射回输入列表中的索引; itertools.count() instance为字典中尚未存在的任何字符提供自动递增的整数值。

这会为您的输入样本生成29乘29的矩阵：

>>> tmatrix(None, sample)
[[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 2, 0, 0, 0, 2, 0, 0, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 2, 5, 0, 0, 0, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 3, 0, 2, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 2, 1, 0, 5, 0, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 1, 0, 6, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 3, 0, 3, 6, 4, 0, 2, 4, 2, 2, 1, 1, 2, 3, 0, 0, 3, 4, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1],
 [0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 0, 0, 0, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 2, 2, 0, 1, 7, 0, 0, 0, 3, 0, 3, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 6, 2, 2, 0, 0, 11, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 3, 0, 0, 0, 4, 0, 0, 2, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

您可能也希望返回indices映射，因此您知道哪个字符映射到该矩阵中的索引。

Answer 2

你可以成对地使用该字符串（看起来它原来是丹麦语），然后使用Counter作为稀疏矩阵，其中（from，to）作为键：

from collections import Counter
from itertools import tee, izip

data = 'Tirsdag skal du installere en Python fortolker, og lPlan for ugenDenne uge drejer sig om at komme igang med Python.  Tirsdag skal du installere en Python fortolker, og lbre denne at kende ved at kbre nogle programmer.  IPython kan anbefales da denne findes til Lin'
fst, snd = tee(data)
next(snd, '')
matrix = Counter(izip(fst, snd))

然后要获得a-＆gt; b的转换，请使用matrix['a', 'b']等...对于不存在的密钥，您将自动返回0。如果您绝对需要N x N的2D数组，请使用@ Martijn的答案。

Answer 3

这是@Martijn Pieters的有序版本答案：

from collections import Counter, defaultdict
from itertools import count
import numpy as np


def tmatrix(lst):
    """Sorted and normalised transition matrix
    """
    indices = defaultdict(count().next)
    b = np.zeros([len(set(lst)),len(set(lst))])

    Ct = Counter(zip(lst, lst[1:])) # zip together consecutive elements of the list

    for (x, y), c in iter(sorted(Ct.iteritems())): # make sorted iteration to generate sorted trasition matrix
    #print (x,y), c
    b[indices[x]][indices[y]] = float(c)

    res = dict((v,k) for k,v in indices.iteritems())

    b = np.array(b)

    # Normalise 
    for i in range(len(b)):
        b[i] = b[i]/float(b.sum(axis=1)[i])

    return b, indices

计算字母的转换矩阵

3 个答案: