使用Python识别矩阵中的热点

时间:2016-03-18 10:38:03

标签: python numpy

我有一个方阵,其元素是 0 到 1 之间的浮点数。我需要找出紧邻对角线、且值小于某个阈值的 bin。

一旦找出这些 bin,我需要查看对应的列,并在其中搜索值大于阈值的 bin。用我编写的下面这段代码可以很容易地做到:

#!/usr/bin/python

import matplotlib.pyplot as plt
import numpy as np
import argparse
import sys
import re

def InputPar():
    """Parse the command-line options of the tool.

    Returns the argparse namespace; its ``file`` attribute holds the value
    given to the required ``-f`` / ``--file`` option.
    """
    file_option = {
        "metavar": "MATRIX",
        "type": str,
        "required": True,
        "help": "matrix",
    }
    cli = argparse.ArgumentParser(description="Tool for analyzing Hi-C data")
    cli.add_argument("-f", "--file", **file_option)
    return cli.parse_args()

def getRowCount(file):
    """Count the data rows of the matrix file.

    Every line is counted except those that start with "# MASKED".

    Args:
        file: path of the matrix file.

    Returns:
        The number of lines that do not start with "# MASKED".
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(file) as handle:
        return sum(1 for line in handle if not line.startswith("# MASKED"))

def getFieldsCount(file):
    """Count the matrix columns described by the file.

    Only the first line that does not start with "# MASKED" is inspected.
    Spaces are treated like tabs and the line is split on tabs.

    Args:
        file: path of the matrix file.

    Returns:
        The number of fields on that line minus 2, because each row is laid
        out as chr - range - bin1 - bin2 - ...  When the file contains no
        such line the result is -2.
    """
    field_count = 0
    # The context manager guarantees the handle is closed, even on error.
    with open(file) as handle:
        for line in handle:
            if line.startswith("# MASKED"):
                continue
            field_count = len(line.replace(" ", "\t").split("\t"))
            break
    # The two leading fields (chr, range) are not matrix columns.
    return field_count - 2

def getMAD(arr):
    """Return the median of *arr* and its scaled Median Absolute Deviation.

    The MAD is multiplied by the consistency constant 1.4826; for normally
    distributed data and large samples this makes the MAD a good estimate
    of the standard deviation.

    Returns:
        A ``(median, scaled_mad)`` tuple.
    """
    centre = np.median(arr)
    deviations = np.abs(arr - centre)
    scaled_mad = 1.4826 * np.median(deviations)
    return centre, scaled_mad

def getDistribution(a, masked, rows):
    """Summarise and plot the values one step right of the diagonal.

    For every bin ``num`` except the last, the value ``a[num, num+1]`` is
    collected unless ``str(num)`` or ``str(num+1)`` is in *masked* or the
    diagonal value ``a[num, num]`` is 0 (no coverage in the bin).

    A 50-bin histogram of the collected values below ``median + 2*mad`` is
    saved to "distribution.png"; only the upper tail is trimmed for the plot.

    Args:
        a: 2-D array indexed as ``a[row, col]``.
        masked: container of bin indices given as strings.
        rows: number of rows of *a*.

    Returns:
        ``(median, mad)`` of all collected values, as computed by getMAD
        (i.e. before the trimming that is applied for the plot).
    """
    values = []
    # range() exists on both Python 2 and 3 (xrange is Python 2 only);
    # stopping at rows-1 skips the last bin, which has no following bin.
    for num in range(rows - 1):
        if str(num) in masked or str(num + 1) in masked or a[num, num] == 0:
            # row masked, the following row is masked, or no coverage in bin
            continue
        values.append(a[num, num + 1])
    median, mad = getMAD(values)
    # Trim high outliers so they do not stretch the histogram.
    values = np.array(values)
    f_values = values[np.where(values < median + 2*mad)]
    hist, bins = np.histogram(f_values, bins=50)
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    plt.bar(center, hist, align="center", width=width)
    plt.savefig("distribution.png")
    return median, mad

def _readMatrix(path, rows, cols):
    """Load the matrix file at *path* into a (rows, cols) float array.

    The first two fields of every line are skipped.  Lines starting with
    "# MASKED" contribute their remaining fields to the masked set instead
    of a matrix row.

    Returns:
        ``(a, masked)``: the filled array and the set of masked field strings.
    """
    # np.float was only an alias of the builtin float and has been removed
    # from NumPy, so the builtin is used directly.
    a = np.zeros((rows, cols), dtype=float)
    masked = set()
    row_count = 0
    # The context manager closes the file even if a value fails to parse.
    with open(path) as handle:
        for line in handle:
            fields = line.rstrip("\n").replace(" ", "\t").split("\t")[2:]
            if line.startswith("# MASKED"):
                masked.update(fields)
                continue
            for field_count, col in enumerate(fields):
                a[row_count, field_count] = float(col)
            row_count += 1
    return a, masked


def _findCandidates(a, masked, rows, threshold):
    """Map each low bin next to the diagonal to the columns at or above threshold.

    A bin ``num`` qualifies when ``a[num, num+1]`` is below *threshold*.
    Row ``num`` is then scanned from column ``num+2`` onwards and every
    column whose value is >= *threshold* is recorded.

    Returns:
        dict mapping ``num`` to the list of recorded column indices.
    """
    candidates = {}
    # The last bin has no following bin, hence rows - 1.
    for num in range(rows - 1):
        if str(num) in masked or str(num + 1) in masked or a[num, num] == 0:
            continue
        # NOTE(review): for num == 0 the row index num-1 is -1, which NumPy
        # resolves to the LAST row; confirm how the first bin should be
        # treated before changing this condition.
        if not (a[num, num + 1] < threshold and a[num - 1, num + 1]):
            continue
        # Enumerate from num+2 so that the recorded index is the column the
        # value was actually read from.
        for col, value in enumerate(a[num, num + 2:], num + 2):
            # a[num, num+1] < threshold here, so value >= threshold already
            # implies value > a[num, num+1].
            if value >= threshold:
                candidates.setdefault(num, []).append(col)
    return candidates


def main():
    """Read the matrix named on the command line and look for candidates.

    Exits with status 1 after printing an error when the file does not
    describe a square matrix.  Otherwise the matrix is loaded, the
    near-diagonal distribution is computed (and plotted) by getDistribution,
    and the bins below ``median - 2*mad`` are searched.

    Returns:
        dict mapping a bin index to the column indices found for it.
    """
    args = InputPar()
    # get file counts (rows/cols)
    rows = getRowCount(args.file)
    cols = getFieldsCount(args.file)
    if rows != cols:
        # Parenthesised single argument prints the same on Python 2 and 3.
        print("Err -> Col number is not equal to row number (not a square matrix).")
        # Non-zero status so callers can detect the failure.
        sys.exit(1)
    a, masked = _readMatrix(args.file, rows, cols)
    median, mad = getDistribution(a, masked, rows)
    # Only one bin is inspected per position, so this is highly sensitive;
    # the result is meant to be filtered later on.
    threshold = median - 2*mad
    return _findCandidates(a, masked, rows, threshold)

# Run the analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

但是,这段代码一次只检查单个 bin。我需要搜索如下图所示的模式:

enter image description here

你对此有什么想法吗?

0 个答案:

没有答案