Imagine I have a dictionary / hashtable of pairs of strings (keys) and their respective probabilities (values):
import numpy as np
import random
import uuid
# Creating the N vocabulary and M vocabulary
max_word_len = 20
n_vocab_size = random.randint(8000,10000)
m_vocab_size = random.randint(8000,10000)
def random_word():
    return str(uuid.uuid4().get_hex().upper()[0:random.randint(1, max_word_len)])
# Generate some random words.
n_vocab = [random_word() for i in range(n_vocab_size)]
m_vocab = [random_word() for i in range(m_vocab_size)]
# Let's hallucinate probabilities for each word pair.
hashes = {(n, m): random.random() for n in n_vocab for m in m_vocab}
hashes
The hashtable looks like this:
{('585F', 'B4867'): 0.7582038699473549,
('69', 'D98B23C5809A'): 0.7341569569849136,
('4D30CB2BF4134', '82ED5FA3A00E4728AC'): 0.9106077161619021,
('DD8F8AFA5CF', 'CB'): 0.4609114677237601,
...
}
Imagine that this is the input hashtable I would read from a CSV file, where the first and second columns are the word pairs (keys) of the hashtable and the third column is the probability. If I were to put the probabilities into some kind of numpy matrix, I would have to do this from the hashtable:
n_words, m_words = zip(*hashes.keys())
probs = np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])
Is there another way to put the probs into an |N| * |M| matrix from the hashtable without a nested loop through m_vocab and n_vocab?
(Note: I'm creating random words and random probabilities here, but imagine that I have read the hashtable from a file into that hashtable structure.)
Assume two scenarios for where the hashtable comes from; for the case where it comes from a csv file, @bunji's answer resolves the problem. What is important is that the final matrix needs to be queryable, so the following is not desirable:
$ echo -e 'abc\txyz\t0.9\nefg\txyz\t0.3\nlmn\topq\t\0.23\nabc\tjkl\t0.5\n' > test.txt
$ cat test.txt
abc xyz 0.9
efg xyz 0.3
lmn opq .23
abc jkl 0.5
$ python
Python 2.7.10 (default, Jul 30 2016, 18:31:42)
[GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import pandas as pd
>>> pt = pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack().as_matrix()
>>> pt
array([[ 0.5,  nan,  0.9],
       [ nan,  nan,  0.3],
       [ nan,  nan,  nan]])
>>> pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack()
       2
1    jkl  opq  xyz
0
abc  0.5  NaN  0.9
efg  NaN  NaN  0.3
lmn  NaN  NaN  NaN
>>> df = pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack()
>>> df
       2
1    jkl  opq  xyz
0
abc  0.5  NaN  0.9
efg  NaN  NaN  0.3
lmn  NaN  NaN  NaN
>>> df['abc', 'jkl']
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
    return self._getitem_multilevel(key)
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
    loc = self.columns.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1617, in get_loc
    return self._engine.get_loc(key)
  File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
  File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
  File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)
  File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)
KeyError: ('abc', 'jkl')
>>> df['abc']['jkl']
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
    return self._getitem_multilevel(key)
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
    loc = self.columns.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
    loc = self._get_level_indexer(key, level=0)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
    loc = level_index.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
  File "pandas/index.pyx", line 163, in pandas.index.IndexEngine.get_loc (pandas/index.c:4090)
KeyError: 'abc'
>>> df[0][2]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
    return self._getitem_multilevel(key)
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
    loc = self.columns.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
    loc = self._get_level_indexer(key, level=0)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
    loc = level_index.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
  File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
  File "pandas/src/hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)
  File "pandas/src/hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)
KeyError: 0
>>> df[0]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
    return self._getitem_multilevel(key)
  File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
    loc = self.columns.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
    loc = self._get_level_indexer(key, level=0)
  File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
    loc = level_index.get_loc(key)
  File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
  File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
  File "pandas/src/hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)
  File "pandas/src/hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)
KeyError: 0
The resulting matrix/dataframe needs to be queryable, i.e. able to do something like:
probs[('585F', 'B4867')] = 0.7582038699473549
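For illustration, here is a minimal sketch of what I mean by "queryable": a thin wrapper that keeps word-to-index lookup dicts next to the numpy matrix. The QueryableMatrix class is hypothetical, just to make the requirement concrete:

import numpy as np

class QueryableMatrix(object):
    """Hypothetical wrapper: a dense matrix plus word -> index lookups."""
    def __init__(self, n_vocab, m_vocab, matrix):
        self.n_idx = {n: i for i, n in enumerate(n_vocab)}  # row lookups
        self.m_idx = {m: j for j, m in enumerate(m_vocab)}  # column lookups
        self.matrix = matrix  # shape (|N|, |M|), rows indexed by n words

    def __getitem__(self, key):
        n, m = key
        return self.matrix[self.n_idx[n], self.m_idx[m]]

# e.g. probs = QueryableMatrix(n_vocab, m_vocab, mat); probs[('585F', 'B4867')]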
Answer 0 (score: 5)
I'm not sure if there's a way to completely avoid the loops, but it could presumably be optimized by using itertools:
import itertools
nested_loop_iter = itertools.product(n_vocab,m_vocab)
#note that because it iterates over n_vocab first we will need to transpose it at the end
probs = np.fromiter(map(hashes.get, nested_loop_iter),dtype=float)
probs.resize((len(n_vocab),len(m_vocab)))
probs = probs.T
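As a quick spot-check (assuming the hashes, n_vocab and m_vocab from the question are in scope), the transposed result lines up with the dictionary:

# After the transpose, rows correspond to m_vocab and columns to n_vocab,
# matching the nested-loop version in the question.
for i, m in enumerate(m_vocab[:3]):
    for j, n in enumerate(n_vocab[:3]):
        assert probs[i, j] == hashes[(n, m)]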
Answer 1 (score: 4)
If your end goal is to read the data in from a .csv file, it might be easier to read the file directly using pandas.
import pandas as pd
df = pd.read_csv('coocurence_data.csv', index_col=[0,1], header=None).unstack()
probs = df.as_matrix()
This reads your data from the csv, makes the first two columns into a multi-index, which corresponds to your two sets of words. It then unstacks the multi-index so that you have one set of words as the column labels and the other as the index labels. This gives you your |N| * |M| matrix, which can then be converted to a numpy array with the .as_matrix() function.
This doesn't really solve your question of changing the {(n,m): prob} dictionary into a numpy array, but given your intentions, it lets you avoid creating that dictionary altogether.
Also, if you're going to read in the csv anyway, reading it with pandas in the first place will be faster than using the built-in csv module: see these benchmarks.
Edit:
To query a specific value in the DataFrame based on the row and column labels:
df.loc['xyz', 'abc']
其中'xyz'
是您行标签中的单词,'abc'
是您的列标签。另请查看df.loc
和df.ix
,了解查询DataFrame中特定单元格的其他方法。
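Putting it together on the question's test.txt data as a runnable sketch (two liberties taken here: the stray backslash before 0.23 is removed, and a droplevel call is added so that plain .loc lookups work on the unstacked columns):

import pandas as pd
from io import StringIO

data = u"abc\txyz\t0.9\nefg\txyz\t0.3\nlmn\topq\t0.23\nabc\tjkl\t0.5\n"
df = pd.read_csv(StringIO(data), index_col=[0, 1], header=None, delimiter='\t').unstack()
df.columns = df.columns.droplevel(0)  # drop the value-column level so columns are plain words
print(df.loc['abc', 'xyz'])  # 0.9
print(df.loc['abc', 'jkl'])  # 0.5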
Answer 2 (score: 3)
[A short extension of dr-xorile's answer]
Most of the solutions look good to me. Whether you need speed or convenience depends a bit on the use case.
I agree that you basically have a matrix in COO sparse format. You might want to look at https://docs.scipy.org/doc/scipy-0.18.1/reference/sparse.html. The only problem is that sparse matrices need integer indices. So as long as your hashes are small enough to be expressed quickly as np.int64, that should work. The sparse formats should allow O(1) access to all elements.
(Sorry for the brevity!)
This could be fast, but it is somewhat hacky:
Get the data into a sparse representation. I think coo_matrix is the right choice to hold your 2D hash map.
a. Load the CSV with numpy.genfromtxt and use e.g. the dtypes ['>u8', '>u8', np.float32], treating the hashes as string representations of unsigned 8-byte integers. If that doesn't work, you can load the strings and convert them with numpy. Finally, you have three tables of size N*M, just like your hashtable, and can use the scipy sparse matrix representation of your choice.
b. If you already have the objects stored in memory, you can use the sparse constructor directly.
For access, you would need to parse the strings again:
prob = matrix[np.fromstring(key1, dtype='>u8'), np.fromstring(key2, dtype='>u8')]
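If parsing the hash strings as integers turns out to be fiddly, here is a hedged sketch of the same idea with explicit word-to-integer index maps instead (the n_idx/m_idx dicts are my own illustrative addition, and the DOK format is used for the O(1) element access mentioned above):

import numpy as np
from scipy.sparse import coo_matrix

# Explicit word -> integer index maps instead of parsing hashes as uint64.
n_idx = {n: i for i, n in enumerate(n_vocab)}
m_idx = {m: j for j, m in enumerate(m_vocab)}

# A dict iterates in a stable order as long as it is not modified.
rows = [n_idx[n] for (n, m) in hashes]
cols = [m_idx[m] for (n, m) in hashes]
vals = [hashes[key] for key in hashes]

sparse_probs = coo_matrix((vals, (rows, cols)),
                          shape=(len(n_vocab), len(m_vocab)))
dok = sparse_probs.todok()  # dict-of-keys format gives O(1) element access
prob = dok[n_idx['585F'], m_idx['B4867']]  # assumes these words are in the vocabularies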
Answer 3 (score: 2)
Looping over the entire n_vocab x m_vocab space for a sparse matrix seems a little inefficient! You could loop over the original hashtable instead. Of course, it would be good to know a few things first:
Do you know the sizes of n_vocab and m_vocab up front, or are you going to figure them out as you build the matrix?
Do you know whether the hashtable contains any duplicates, and if so, how will you handle them? It looks like hashes is a dictionary, in which case the keys are obviously unique. In practice, that probably means each occurrence overwrites the previous one, so the last value wins.
In any case, here is a comparison of the two options:
from collections import defaultdict
import numpy as np

hashes = defaultdict(float, {('585F', 'B4867'): 0.7582038699473549,
                             ('69', 'D98B23C5809A'): 0.7341569569849136,
                             ('4D30CB2BF4134', '82ED5FA3A00E4728AC'): 0.9106077161619021,
                             ('DD8F8AFA5CF', 'CB'): 0.4609114677237601})

# Double loop approach
n_vocab, m_vocab = zip(*hashes.keys())
probs1 = np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])

# Loop through the hash approach
n_hash = dict()  # Create a hash table to find the correct row number
for i, n in enumerate(n_vocab):
    n_hash[n] = i
m_hash = dict()  # Create a hash table to find the correct col number
for i, m in enumerate(m_vocab):
    m_hash[m] = i
probs2 = np.zeros((len(n_vocab), len(m_vocab)))
for (n, m) in hashes:  # Loop through the hashes and put the values into the probs table
    probs2[n_hash[n], m_hash[m]] = hashes[(n, m)]
The outputs probs1 and probs2 are, of course, identical:
>>> probs1
array([[ 0.73415696,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.46091147,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.75820387,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.91060772]])
>>> probs2
array([[ 0.73415696,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.46091147,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.75820387,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.91060772]])
Of course, your probs1 code is very compact. However, the sizes of the loops differ considerably, and that can make a big difference in running time.
Answer 4 (score: 2)
I reduced the sample size in order to compare the different pieces of code quickly. I wrote a dataframe approach, which may well still loop inside the pandas functions, and compared it against the original code and the itertools code provided by Tadhg McDonald-Jensen. The fastest code is the itertools one.
In [3]: %timeit itertool(hashes,n_vocab,m_vocab)
1000 loops, best of 3: 1.12 ms per loop
In [4]: %timeit baseline(hashes,n_vocab,m_vocab)
100 loops, best of 3: 3.23 ms per loop
In [5]: %timeit dataframeMethod(hashes,n_vocab,m_vocab)
100 loops, best of 3: 5.49 ms per loop
Here is the code I used for the comparison.
import numpy as np
import random
import uuid
import pandas as pd
import itertools

# Creating the N vocabulary and M vocabulary
max_word_len = 20
n_vocab_size = random.randint(80, 100)
m_vocab_size = random.randint(80, 100)

def random_word():
    return str(uuid.uuid4().get_hex().upper()[0:random.randint(1, max_word_len)])

# Generate some random words.
n_vocab = [random_word() for i in range(n_vocab_size)]
m_vocab = [random_word() for i in range(m_vocab_size)]

# Let's hallucinate probabilities for each word pair.
hashes = {(n, m): random.random() for n in n_vocab for m in m_vocab}

def baseline(hashes, n_vocab, m_vocab):
    n_words, m_words = zip(*hashes.keys())
    probs = np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])
    return probs

def itertool(hashes, n_vocab, m_vocab):
    nested_loop_iter = itertools.product(n_vocab, m_vocab)
    # note that because it iterates over n_vocab first we will need to transpose it at the end
    probs = np.fromiter(map(hashes.get, nested_loop_iter), dtype=float)
    probs.resize((len(n_vocab), len(m_vocab)))
    return probs.T

def dataframeMethod(hashes, n_vocab, m_vocab):
    # build dataframe from hashes
    id1 = pd.MultiIndex.from_tuples(hashes.keys())
    df = pd.DataFrame(hashes.values(), index=id1)
    # make dataframe with one index and one column
    df2 = df.unstack(level=0)
    df2.columns = df2.columns.levels[1]
    return df2.loc[m_vocab, n_vocab].values
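As a sanity check (my addition, not part of the timings), all three functions should produce the same |M| x |N| matrix, so the comparison is apples to apples:

# Verify the three approaches agree element-for-element.
assert np.allclose(baseline(hashes, n_vocab, m_vocab),
                   itertool(hashes, n_vocab, m_vocab))
assert np.allclose(baseline(hashes, n_vocab, m_vocab),
                   dataframeMethod(hashes, n_vocab, m_vocab))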