Creating an |N| x |M| matrix from a hashtable

Date: 2016-10-24 01:37:19

Tags: python csv numpy matrix hash

Imagine I have a dictionary/hashtable of pairs of strings (keys) and their respective probabilities (values):

import numpy as np
import random
import uuid

# Creating the N vocabulary and M vocabulary
max_word_len = 20
n_vocab_size = random.randint(8000,10000)
m_vocab_size = random.randint(8000,10000)

def random_word():
    # get_hex() is Python 2 syntax; on Python 3 use uuid.uuid4().hex instead
    return str(uuid.uuid4().get_hex().upper()[0:random.randint(1, max_word_len)])

# Generate some random words.
n_vocab = [random_word() for i in range(n_vocab_size)]
m_vocab = [random_word() for i in range(m_vocab_size)]


# Let's hallucinate probabilities for each word pair.
hashes =  {(n, m): random.random() for n in n_vocab for m in m_vocab}

The hashes hashtable looks something like this:

{('585F', 'B4867'): 0.7582038699473549,
 ('69', 'D98B23C5809A'): 0.7341569569849136,
 ('4D30CB2BF4134', '82ED5FA3A00E4728AC'): 0.9106077161619021,
 ('DD8F8AFA5CF', 'CB'): 0.4609114677237601,
...
}

Imagine that this is the input hashtable I will read from a CSV file, where the first and second columns are the word pairs (keys) of the hashtable and the third column is the probability.

If I were to put the probabilities into some sort of numpy matrix, I would have to do it like this from the hashtable:

n_words, m_words = zip(*hashes.keys())
probs = np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])

Is there another way to get the probabilities from the hashtable into an |N| * |M| matrix without a nested loop through m_vocab and n_vocab?

(Note: I am creating random words and random probabilities here, but imagine that I read the hashtable from a file into that hashtable structure.)

Assume two scenarios, where:

  1. The hashtable comes from a csv file (@bunji's answer addresses this)
  2. The hashtable comes from a pickled dictionary, or the hashtable was computed in some other way before reaching the part that converts it into a matrix.
  3. It is important that the final matrix be queryable; the following is not desirable:

    $ echo -e 'abc\txyz\t0.9\nefg\txyz\t0.3\nlmn\topq\t\0.23\nabc\tjkl\t0.5\n' > test.txt
    
    $ cat test.txt
    abc xyz 0.9
    efg xyz 0.3
    lmn opq .23
    abc jkl 0.5
    
    
    $ python
    Python 2.7.10 (default, Jul 30 2016, 18:31:42) 
    [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] on darwin
    Type "help", "copyright", "credits" or "license" for more information.
    >>> import pandas as pd
    >>> pt = pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack().as_matrix()
    >>> pt
    array([[ 0.5,  nan,  0.9],
           [ nan,  nan,  0.3],
           [ nan,  nan,  nan]])
    >>> pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack()
           2         
    1    jkl opq  xyz
    0                
    abc  0.5 NaN  0.9
    efg  NaN NaN  0.3
    lmn  NaN NaN  NaN
    
    >>> df = pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack()
    
    >>> df
           2         
    1    jkl opq  xyz
    0                
    abc  0.5 NaN  0.9
    efg  NaN NaN  0.3
    lmn  NaN NaN  NaN
    
    >>> df['abc', 'jkl']
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
        return self._getitem_multilevel(key)
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
        loc = self.columns.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1617, in get_loc
        return self._engine.get_loc(key)
      File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
      File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
      File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)
      File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)
    KeyError: ('abc', 'jkl')
    >>> df['abc']['jkl']
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
        return self._getitem_multilevel(key)
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
        loc = self.columns.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
        loc = self._get_level_indexer(key, level=0)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
        loc = level_index.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
        return self._engine.get_loc(self._maybe_cast_indexer(key))
      File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
      File "pandas/index.pyx", line 163, in pandas.index.IndexEngine.get_loc (pandas/index.c:4090)
    KeyError: 'abc'
    
    >>> df[0][2]
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
        return self._getitem_multilevel(key)
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
        loc = self.columns.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
        loc = self._get_level_indexer(key, level=0)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
        loc = level_index.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
        return self._engine.get_loc(self._maybe_cast_indexer(key))
      File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
      File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
      File "pandas/src/hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)
      File "pandas/src/hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)
    KeyError: 0
    
    >>> df[0]
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
        return self._getitem_multilevel(key)
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
        loc = self.columns.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
        loc = self._get_level_indexer(key, level=0)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
        loc = level_index.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
        return self._engine.get_loc(self._maybe_cast_indexer(key))
      File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
      File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
      File "pandas/src/hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)
      File "pandas/src/hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)
    KeyError: 0
    

    The resulting matrix/dataframe needs to be queryable, i.e. able to do something like:

    probs[('585F', 'B4867')] = 0.7582038699473549
    

5 Answers:

Answer 0 (score: 5)

I'm not sure if there is a way to completely avoid the loops, but presumably it can be optimized by using itertools:

import itertools
nested_loop_iter = itertools.product(n_vocab,m_vocab)
#note that because it iterates over n_vocab first we will need to transpose it at the end
probs = np.fromiter(map(hashes.get, nested_loop_iter),dtype=float)
probs.resize((len(n_vocab),len(m_vocab)))
probs = probs.T
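
As a quick sanity check (my own addition, assuming the hashes, n_vocab and m_vocab objects from the question are in scope), the transposed result lines up with the nested-loop matrix:

baseline = np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])
assert np.allclose(probs, baseline)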

Answer 1 (score: 4)

If your end goal is to read data in from a .csv file, it may be easier to just read the file directly using pandas.

import pandas as pd

df = pd.read_csv('coocurence_data.csv', index_col=[0,1], header=None).unstack()
probs = df.as_matrix()

This reads your data from the csv, placing the first two columns into a multi-index, which corresponds to your two sets of words. It then unstacks the multi-index so that you have one set of words as the column labels and the other as the index labels. This gives you your |N| * |M| matrix, which can then be converted into a numpy array with the .as_matrix() function.

This doesn't really address your question of turning the {(n, m): prob} dictionary into a numpy array, but given your stated intentions, it lets you avoid creating that dictionary altogether.

Also, if you are going to read the csv in anyway, reading it with pandas will be faster than using the built-in csv module.
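
A rough way to check this claim yourself (my own sketch, not the benchmarks originally linked in this answer; it assumes the coocurence_data.csv file from the snippet above):

import csv
import timeit
import pandas as pd

def read_with_csv():
    # Build the {(n, m): prob} dict with the stdlib csv module.
    with open('coocurence_data.csv') as f:
        return {(row[0], row[1]): float(row[2]) for row in csv.reader(f)}

def read_with_pandas():
    # Let pandas do the parsing.
    return pd.read_csv('coocurence_data.csv', index_col=[0, 1], header=None)

print(timeit.timeit(read_with_csv, number=10))
print(timeit.timeit(read_with_pandas, number=10))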

Edit:

To query a specific value in the DataFrame based on the row and column labels:

df.loc['xyz', 'abc']

where 'xyz' is a word in your row labels and 'abc' is a word in your column labels. Also take a look at df.loc and df.ix for other ways of querying specific cells in a DataFrame.
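
A minimal sketch against the test.txt example from the question (my own addition; note that with header=None the unstacked columns form a MultiIndex keyed by the value column 2, so either use the full tuple or drop that level first):

import pandas as pd

df = pd.read_csv('test.txt', index_col=[0, 1], header=None, delimiter='\t').unstack()
print(df.loc['abc', (2, 'xyz')])      # full tuple key -> 0.9

df.columns = df.columns.droplevel(0)  # drop the value-column level
print(df.loc['abc', 'xyz'])           # plain labels now work -> 0.9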

Answer 2 (score: 3)

[A short extension of dr-xorile's answer]

Most of the solutions look fine to me. It depends a bit on whether you need speed or convenience.

I agree that you essentially have a matrix in COO sparse format. You may want to look at https://docs.scipy.org/doc/scipy-0.18.1/reference/sparse.html

The only problem is that sparse matrices need integer indices. So as long as your hashes can be expressed quickly as np.int64, this should work. The sparse formats should allow O(1) access to all elements.

(Sorry for the brevity!)

Rough outline

This could be fast, but it is somewhat hacky.

  1. Get the data into a sparse representation. I think coo_matrix is the right choice for holding your 2D hash map (see the sketch after this list).

    a. Load the CSV with numpy.genfromtxt, using e.g. the dtype ['>u8', '>u8', np.float32] to treat the hashes as string representations of unsigned 8-byte integers. If that doesn't work, you can load the strings and convert them with numpy. In the end you have three tables of size N * M, just like your hashtable, and can use the scipy sparse matrix representation of your choice.

    b. If the objects are already in memory, you can use the sparse constructor directly.

  2. To access an element, you need to parse the strings again:

    prob = matrix[np.fromstring(key1, dtype='>u8'), np.fromstring(key2, dtype='>u8')]
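
A minimal end-to-end sketch of this idea (my own addition, assuming the hashes, n_vocab and m_vocab objects from the question; it maps words to integer indices rather than parsing the hash strings, which sidesteps the length restrictions of np.fromstring):

    from scipy.sparse import coo_matrix

    # Map each word to a row/column index.
    n_index = {w: i for i, w in enumerate(n_vocab)}
    m_index = {w: j for j, w in enumerate(m_vocab)}

    rows, cols, vals = [], [], []
    for (n, m), p in hashes.items():
        rows.append(n_index[n])
        cols.append(m_index[m])
        vals.append(p)

    # COO is convenient to build; convert to CSR for fast element access.
    mat = coo_matrix((vals, (rows, cols)),
                     shape=(len(n_vocab), len(m_vocab))).tocsr()

    prob = mat[n_index[n_vocab[0]], m_index[m_vocab[0]]]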
    

Answer 3 (score: 2)

Looping over the entire n_vocab x m_vocab space seems a bit inefficient for a sparse matrix! You could loop over the original hashtable instead. Of course, it would be good to know a few things first:

  1. Do you know the sizes of n_vocab and m_vocab in advance, or are you going to figure them out as you build the matrix?

  2. Do you know whether there are duplicates in the hashtable, and if so, how will you handle them? It looks like hashes is a dictionary, in which case the keys are obviously unique. In practice this probably means you are overwriting on each occurrence, so the last value written is the one that sticks.

  3. In any case, here is a comparison of the two options:

    from collections import defaultdict
    import numpy as np
    
    hashes = defaultdict(float,{('585F', 'B4867'): 0.7582038699473549,
     ('69', 'D98B23C5809A'): 0.7341569569849136,
     ('4D30CB2BF4134', '82ED5FA3A00E4728AC'): 0.9106077161619021,
     ('DD8F8AFA5CF', 'CB'): 0.4609114677237601})
    
    #Double loop approach
    n_vocab, m_vocab = zip(*hashes.keys())
    probs1 = np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])
    
    #Loop through the hash approach
    n_hash = dict()  #Create a hash table to find the correct row number
    for i,n in enumerate(n_vocab):
        n_hash[n] = i
    m_hash = dict()  #Create a hash table to find the correct col number
    for i,m in enumerate(m_vocab):
        m_hash[m] = i
    probs2 = np.zeros((len(n_vocab),len(m_vocab)))
    for (n,m) in hashes: #Loop through the hashes and put the values into the probs table
        probs2[n_hash[n],m_hash[m]] = hashes[(n,m)]
    

    The outputs probs1 and probs2 are, of course, identical:

    >>> probs1
    array([[ 0.73415696,  0.        ,  0.        ,  0.        ],
           [ 0.        ,  0.46091147,  0.        ,  0.        ],
           [ 0.        ,  0.        ,  0.75820387,  0.        ],
           [ 0.        ,  0.        ,  0.        ,  0.91060772]])
    >>> probs2
    array([[ 0.73415696,  0.        ,  0.        ,  0.        ],
           [ 0.        ,  0.46091147,  0.        ,  0.        ],
           [ 0.        ,  0.        ,  0.75820387,  0.        ],
           [ 0.        ,  0.        ,  0.        ,  0.91060772]])
    

    Of course, your probs1 code is very concise. However, the sizes of the loops are very different, and that can make a big difference to the run time (see also the note on duplicates below).
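
One subtlety worth noting (my own addition): zip(*hashes.keys()) yields one entry per key pair, so the n_vocab and m_vocab tuples it produces can contain duplicate words. Deduplicating them first keeps the matrix at |N| x |M|; a sketch assuming the same hashes dictionary:

    n_vocab = sorted(set(n for n, m in hashes))
    m_vocab = sorted(set(m for n, m in hashes))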

Answer 4 (score: 2)

I reduced the sample size in order to compare the different pieces of code quickly. I wrote the dataframe method, which may still loop inside the pandas functions, and compared it against the original code and the itertools code provided by Tadhg McDonald-Jensen. The fastest code is the itertools version.

In [3]: %timeit itertool(hashes,n_vocab,m_vocab)
1000 loops, best of 3: 1.12 ms per loop

In [4]: %timeit baseline(hashes,n_vocab,m_vocab)
100 loops, best of 3: 3.23 ms per loop

In [5]: %timeit dataframeMethod(hashes,n_vocab,m_vocab)
100 loops, best of 3: 5.49 ms per loop

Here is the code I used for the comparison.

import numpy as np
import random
import uuid
import pandas as pd
import itertools

# Creating the N vocabulary and M vocabulary
max_word_len = 20
n_vocab_size = random.randint(80,100)
m_vocab_size = random.randint(80,100)

def random_word(): 
    return str(uuid.uuid4().get_hex().upper()[0:random.randint(1,max_word_len)])

# Generate some random words.
n_vocab = [random_word() for i in range(n_vocab_size)]
m_vocab = [random_word() for i in range(m_vocab_size)]


# Let's hallucinate probabilities for each word pair.
hashes =  {(n, m): random.random() for n in n_vocab for m in m_vocab}

def baseline(hashes,n_vocab,m_vocab):
    n_words, m_words = zip(*hashes.keys())
    probs = np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])
    return probs

def itertool(hashes,n_vocab,m_vocab):
    nested_loop_iter = itertools.product(n_vocab,m_vocab)
    #note that because it iterates over n_vocab first we will need to transpose it at the end
    probs = np.fromiter(map(hashes.get, nested_loop_iter),dtype=float)
    probs.resize((len(n_vocab),len(m_vocab)))
    return probs.T  

def dataframeMethod(hashes,n_vocab,m_vocab):
    # build dataframe from hashes
    id1 = pd.MultiIndex.from_tuples(hashes.keys())
    df=pd.DataFrame(hashes.values(),index=id1)
    # make dataframe with one index and one column
    df2=df.unstack(level=0)
    df2.columns = df2.columns.levels[1]
    return df2.loc[m_vocab,n_vocab].values
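
As a quick consistency check (my own addition, not part of the timing runs), all three functions should produce the same matrix:

p1 = baseline(hashes, n_vocab, m_vocab)
p2 = itertool(hashes, n_vocab, m_vocab)
p3 = dataframeMethod(hashes, n_vocab, m_vocab)
assert np.allclose(p1, p2) and np.allclose(p1, p3)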