从哈希表创建 |N| x |M| 矩阵

时间:2016-10-24 01:37:19

标签: python csv numpy matrix hash


import numpy as np
import random
import uuid

# Creating the N vocabulary and M vocabulary
# Upper bound on the length of a generated word (read by random_word).
max_word_len = 20
# Each vocabulary size is drawn at random; randint includes both endpoints.
# The two draws consume RNG state in this order.
n_vocab_size = random.randint(8000,10000)
m_vocab_size = random.randint(8000,10000)

def random_word(max_len=None):
    """Return a random word made of upper-case hexadecimal digits.

    The word is a prefix of a freshly generated UUID4's 32-digit hex string,
    with its length drawn uniformly from ``1..max_len`` (inclusive).

    Args:
        max_len: Largest word length to draw. Defaults to the module-level
            ``max_word_len``. Values above 32 cannot yield longer words
            because a UUID has only 32 hex digits.

    Returns:
        A non-empty ``str`` of upper-case hex digits.

    Raises:
        ValueError: If ``max_len`` is smaller than 1 (empty randint range).
    """
    if max_len is None:
        max_len = max_word_len
    # ``UUID.hex`` exists on both Python 2 and 3; ``get_hex()`` was Python 2
    # only and raises AttributeError on Python 3.
    return uuid.uuid4().hex.upper()[:random.randint(1, max_len)]

# Draw the two vocabularies; the loop counter itself is never used.
n_vocab = [random_word() for _ in range(n_vocab_size)]
m_vocab = [random_word() for _ in range(m_vocab_size)]

# Assign a made-up probability to every (n-word, m-word) pair.
# Repeated words collapse onto the same key, so later draws overwrite earlier ones.
hashes = {
    (n_word, m_word): random.random()
    for n_word in n_vocab
    for m_word in m_vocab
}


{('585F', 'B4867'): 0.7582038699473549,
 ('69', 'D98B23C5809A'): 0.7341569569849136,
 ('4D30CB2BF4134', '82ED5FA3A00E4728AC'): 0.9106077161619021,
 ('DD8F8AFA5CF', 'CB'): 0.4609114677237601,



 # Unzip the (n, m) key pairs into two parallel tuples of words.
 n_words, m_words = zip(*hashes)
 # Dense lookup table: one row per word in m_vocab, one column per word in n_vocab.
 probs = np.array([
     [hashes[(n, m)] for n in n_vocab]
     for m in m_vocab
 ])

是否有另一种方法,可以不通过对 m_vocab 和 n_vocab 的嵌套循环,就把哈希表中的概率放入 |N| * |M| 矩阵?



  1. 哈希表来自csv文件(@ bunji的答案解决了这个问题)
  2. 哈希表来自一个经 pickle 序列化的字典。或者哈希表在到达"将其转换为矩阵"这一步之前,已经以其他方式计算出来了。
  3. 重要的是最终矩阵需要是可查询的,以下是不可取的:

    $ echo -e 'abc\txyz\t0.9\nefg\txyz\t0.3\nlmn\topq\t\0.23\nabc\tjkl\t0.5\n' > test.txt
    $ cat test.txt
    abc xyz 0.9
    efg xyz 0.3
    lmn opq .23
    abc jkl 0.5
    $ python
    Python 2.7.10 (default, Jul 30 2016, 18:31:42) 
    [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] on darwin
    Type "help", "copyright", "credits" or "license" for more information.
    >>> import pandas as pd
    >>> pt = pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack().as_matrix()
    >>> pt
    array([[ 0.5,  nan,  0.9],
           [ nan,  nan,  0.3],
           [ nan,  nan,  nan]])
    >>> pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack()
    1    jkl opq  xyz
    abc  0.5 NaN  0.9
    efg  NaN NaN  0.3
    lmn  NaN NaN  NaN
    >>> df = pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack()
    >>> df
    1    jkl opq  xyz
    abc  0.5 NaN  0.9
    efg  NaN NaN  0.3
    lmn  NaN NaN  NaN
    >>> df['abc', 'jkl']
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
        return self._getitem_multilevel(key)
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
        loc = self.columns.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1617, in get_loc
        return self._engine.get_loc(key)
      File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
      File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
      File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)
      File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)
    KeyError: ('abc', 'jkl')
    >>> df['abc']['jkl']
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
        return self._getitem_multilevel(key)
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
        loc = self.columns.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
        loc = self._get_level_indexer(key, level=0)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
        loc = level_index.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
        return self._engine.get_loc(self._maybe_cast_indexer(key))
      File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
      File "pandas/index.pyx", line 163, in pandas.index.IndexEngine.get_loc (pandas/index.c:4090)
    KeyError: 'abc'
    >>> df[0][2]
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
        return self._getitem_multilevel(key)
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
        loc = self.columns.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
        loc = self._get_level_indexer(key, level=0)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
        loc = level_index.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
        return self._engine.get_loc(self._maybe_cast_indexer(key))
      File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
      File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
      File "pandas/src/hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)
      File "pandas/src/hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)
    KeyError: 0
    >>> df[0]
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
        return self._getitem_multilevel(key)
      File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
        loc = self.columns.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
        loc = self._get_level_indexer(key, level=0)
      File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
        loc = level_index.get_loc(key)
      File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
        return self._engine.get_loc(self._maybe_cast_indexer(key))
      File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
      File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
      File "pandas/src/hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)
      File "pandas/src/hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)
    KeyError: 0


    probs[('585F', 'B4867')] = 0.7582038699473549

import itertools
# product() walks n_vocab in the outer loop and m_vocab in the inner loop, so
# the flat sequence of looked-up values is laid out row-major as |N| x |M|.
nested_loop_iter = itertools.product(n_vocab, m_vocab)
# hashes.get returns None for a missing pair, which np.fromiter rejects for a
# float dtype, so every (n, m) pair must be present in hashes.
probs = np.fromiter(map(hashes.get, nested_loop_iter), dtype=float)
# np.fromiter always returns a 1-D array, and transposing a 1-D array is a
# no-op, so give it its |N| x |M| shape first and then transpose to |M| x |N|
# to match the nested-loop version (rows follow m_vocab, columns n_vocab).
probs = probs.reshape(len(n_vocab), len(m_vocab)).T

import pandas as pd

# Read the headerless (word, word, value) rows: the first two columns become a
# two-level row index, and unstack() pivots the inner level into column labels.
df = pd.read_csv('coocurence_data.csv', index_col=[0,1], header=None).unstack()
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same NumPy array on both old and new pandas versions.
probs = df.values

这将从 csv 中读取您的数据,并把前两列放入多级索引(MultiIndex),它们对应于您的两组单词。然后对多级索引执行 unstack,使一组单词成为列标签,另一组成为行索引标签。这样就得到了 |N| * |M| 矩阵,然后可以使用 .as_matrix() 函数将其转换为 numpy 数组(注意:.as_matrix() 在 pandas 1.0 中已被移除,较新版本请改用 .values 或 .to_numpy())。


另外,如果您无论如何都要从 csv 中读取数据,那么先用 pandas 读取会比使用内置的 csv 模块更快:请参阅相关的基准测试(原文链接在转载时已丢失)。



df.loc['xyz', 'abc']


唯一的问题是矩阵需要整数索引。因此,只要您的哈希值能够快速地表示为 np.int64,这种方法就应该有效。稀疏格式应当允许以 $O(1)$ 的时间访问所有元素。




  1. 以稀疏表示形式获取数据。我认为你应该选择coo_matrix来保存你的2D哈希图。

    a. 使用 numpy.genfromtxt 加载 CSV,并使用例如 ['>u8', '>u8', np.float32] 这样的数据类型,把哈希的字符串表示视为无符号 8 字节整数。如果这行不通,您可以先加载字符串,再用 numpy 进行转换。最后,您会得到三个大小为 N * M 的数组(与您的哈希表一一对应),然后用它们构造您所选的 scipy 稀疏矩阵表示。


  2. 要访问,您需要再次解析字符串

    prob = matrix[np.fromstring(key1, dtype='>u8'), np.fromstring(key2, dtype='>u8')]

对于稀疏矩阵遍历整个n_vocab x m_vocab空间似乎有点低效!你可以遍历原始的哈希表。当然,首先要了解一些事情会很好:

  1. 您是否知道n_vocab和m_vocab的大小?或者你打算在构建它时想出来吗?

  2. 您是否知道哈希表中有没有重复的键?如果有,您打算如何处理?看起来 hashes 是一个字典,在这种情况下键显然是唯一的。在实践中,这可能意味着每次都会覆盖写入,因此最后写入的值就是最终的值。

  3. 无论如何,这里是两个选项的比较:

    from collections import defaultdict
    import numpy as np
    # Sample table; defaultdict(float) hands back 0.0 for any pair that was never
    # stored (and inserts that pair as a side effect of the lookup).
    hashes = defaultdict(float, {
        ('585F', 'B4867'): 0.7582038699473549,
        ('69', 'D98B23C5809A'): 0.7341569569849136,
        ('4D30CB2BF4134', '82ED5FA3A00E4728AC'): 0.9106077161619021,
        ('DD8F8AFA5CF', 'CB'): 0.4609114677237601,
    })

    # Approach 1: double loop over the full word grid.
    n_vocab, m_vocab = zip(*hashes.keys())
    # NOTE(review): probs1 is indexed [m][n] while probs2 below is [n][m]; the
    # two agree here because each sample key (n_i, m_i) lands on the diagonal.
    probs1 = np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])

    # Approach 2: a single pass over the hash table.
    # Word -> row number and word -> column number lookup tables.
    n_hash = {word: row for row, word in enumerate(n_vocab)}
    m_hash = {word: col for col, word in enumerate(m_vocab)}
    probs2 = np.zeros((len(n_vocab), len(m_vocab)))
    # Drop every stored value into its (row, column) cell.
    for (n, m), value in hashes.items():
        probs2[n_hash[n], m_hash[m]] = value


    >>> probs1
    array([[ 0.73415696,  0.        ,  0.        ,  0.        ],
           [ 0.        ,  0.46091147,  0.        ,  0.        ],
           [ 0.        ,  0.        ,  0.75820387,  0.        ],
           [ 0.        ,  0.        ,  0.        ,  0.91060772]])
    >>> probs2
    array([[ 0.73415696,  0.        ,  0.        ,  0.        ],
           [ 0.        ,  0.46091147,  0.        ,  0.        ],
           [ 0.        ,  0.        ,  0.75820387,  0.        ],
           [ 0.        ,  0.        ,  0.        ,  0.91060772]])


我减小了样本量,以便快速比较不同的代码。我编写了 DataFrame 方法(它在 pandas 函数内部可能仍然使用了循环),并将其与原始代码以及 Tadhg McDonald-Jensen 提供的 itertools 代码进行比较。最快的是 itertools 版本。

In [3]: %timeit itertool(hashes,n_vocab,m_vocab)
1000 loops, best of 3: 1.12 ms per loop

In [4]: %timeit baseline(hashes,n_vocab,m_vocab)
100 loops, best of 3: 3.23 ms per loop

In [5]: %timeit dataframeMethod(hashes,n_vocab,m_vocab)
100 loops, best of 3: 5.49 ms per loop


import numpy as np
import random
import uuid
import pandas as pd
import itertools

# Creating the N vocabulary and M vocabulary
# Upper bound on the length of a generated word (read by random_word).
max_word_len = 20
# Each vocabulary size is drawn at random; randint includes both endpoints.
# The two draws consume RNG state in this order.
n_vocab_size = random.randint(80,100)
m_vocab_size = random.randint(80,100)

def random_word(max_len=None):
    """Return a random word made of upper-case hexadecimal digits.

    The word is a prefix of a freshly generated UUID4's 32-digit hex string,
    with its length drawn uniformly from ``1..max_len`` (inclusive).

    Args:
        max_len: Largest word length to draw. Defaults to the module-level
            ``max_word_len``. Values above 32 cannot yield longer words
            because a UUID has only 32 hex digits.

    Returns:
        A non-empty ``str`` of upper-case hex digits.

    Raises:
        ValueError: If ``max_len`` is smaller than 1 (empty randint range).
    """
    if max_len is None:
        max_len = max_word_len
    # ``UUID.hex`` exists on both Python 2 and 3; ``get_hex()`` was Python 2
    # only and raises AttributeError on Python 3.
    return uuid.uuid4().hex.upper()[:random.randint(1, max_len)]

# Draw the two vocabularies; the loop counter itself is never used.
n_vocab = [random_word() for _ in range(n_vocab_size)]
m_vocab = [random_word() for _ in range(m_vocab_size)]

# Assign a made-up probability to every (n-word, m-word) pair.
# Repeated words collapse onto the same key, so later draws overwrite earlier ones.
hashes = {
    (n_word, m_word): random.random()
    for n_word in n_vocab
    for m_word in m_vocab
}

def baseline(hashes, n_vocab, m_vocab):
    """Build the probability matrix with a plain nested loop.

    Args:
        hashes: Mapping from ``(n_word, m_word)`` tuples to values.
        n_vocab: Words selecting the columns of the result.
        m_vocab: Words selecting the rows of the result.

    Returns:
        A NumPy array with one row per entry of ``m_vocab`` and one column per
        entry of ``n_vocab``; element ``[i, j]`` is
        ``hashes[(n_vocab[j], m_vocab[i])]``.

    Raises:
        KeyError: If a requested pair is missing from a plain ``dict``.
    """
    # The original also unzipped hashes.keys() into two unused tuples, which
    # was wasted work and raised ValueError on an empty table; it is dropped.
    return np.array([[hashes[(n, m)] for n in n_vocab] for m in m_vocab])

def itertool(hashes, n_vocab, m_vocab):
    """Build the probability matrix by streaming lookups through np.fromiter.

    Args:
        hashes: Mapping from ``(n_word, m_word)`` tuples to float values. Every
            pair from ``n_vocab`` x ``m_vocab`` must be present, because
            ``hashes.get`` returns ``None`` for a missing pair and
            ``np.fromiter`` cannot store that in a float array.
        n_vocab: Words selecting the columns of the result.
        m_vocab: Words selecting the rows of the result.

    Returns:
        A float NumPy array of shape ``(len(m_vocab), len(n_vocab))``, the same
        layout the nested-loop ``baseline`` produces.
    """
    # product() iterates n_vocab in the outer loop and m_vocab in the inner one,
    # so the flat values are laid out row-major as |N| x |M|.
    nested_loop_iter = itertools.product(n_vocab, m_vocab)
    flat = np.fromiter(map(hashes.get, nested_loop_iter), dtype=float)
    # np.fromiter always returns a 1-D array and transposing that is a no-op,
    # so restore the |N| x |M| shape before transposing to |M| x |N|.
    return flat.reshape(len(n_vocab), len(m_vocab)).T

def dataframeMethod(hashes, n_vocab, m_vocab):
    """Build the probability matrix by pivoting the hash table with pandas.

    Args:
        hashes: Non-empty mapping from ``(n_word, m_word)`` tuples to values.
        n_vocab: List of words selecting the columns of the result.
        m_vocab: List of words selecting the rows of the result.

    Returns:
        A NumPy array of shape ``(len(m_vocab), len(n_vocab))`` whose element
        ``[i, j]`` is the value stored for ``(n_vocab[j], m_vocab[i])``.
    """
    # The original body used ``df2`` without ever creating it (NameError); the
    # missing construction step is rebuilt here.
    # Index the values by their (n_word, m_word) key; keys() and values() of
    # the same dict iterate in matching order.
    id1 = pd.MultiIndex.from_tuples(list(hashes.keys()))
    pair_values = pd.Series(list(hashes.values()), index=id1)
    # Pivot the n-word level into columns: rows are m-words, columns n-words.
    df2 = pair_values.unstack(level=0)
    # Label-based selection puts rows/columns in the requested vocabulary order.
    return df2.loc[m_vocab, n_vocab].values