我有一个方阵,其元素是 0 到 1 之间的浮点数。我需要找出紧邻对角线的值(即某个 bin 与其下一个 bin 之间的值)小于阈值的那些 bin。
一旦找到这些 bin,我需要查看对应的列,并搜索值大于阈值的 bin。用我编写的如下代码很容易做到:
#!/usr/bin/python
import matplotlib.pyplot as plt
import numpy as np
import argparse
import sys
import re
def InputPar():
    """Parse the command line and return the resulting namespace.

    The only option is the required ``-f`` / ``--file`` argument, exposed as
    the ``file`` attribute of the returned namespace.
    """
    cli = argparse.ArgumentParser(description="Tool for analyzing Hi-C data")
    file_option = dict(metavar="MATRIX", type=str, required=True, help="matrix")
    cli.add_argument("-f", "--file", **file_option)
    parsed = cli.parse_args()
    return parsed
def getRowCount(file):
    """Count the rows of the matrix file.

    Every line of the file is counted except lines that start with
    ``# MASKED``.

    file -- path of the file to read.

    Returns the number of counted lines (0 for an empty file).
    """
    # The context manager closes the handle deterministically, even if
    # reading fails part-way through.
    with open(file) as handle:
        return sum(1 for line in handle if not line.startswith("# MASKED"))
def getFieldsCount(file):
    """Count the matrix columns of the file.

    Only the first line that does not start with ``# MASKED`` is inspected.
    Spaces are treated like tabs, and the line is split on tabs.

    file -- path of the file to read.

    Returns the number of fields on that line minus 2. If the file has no
    such line the result is -2.
    """
    field_count = 0
    # The context manager closes the handle deterministically, even though
    # the loop stops after the first data line.
    with open(file) as handle:
        for line in handle:
            if line.startswith("# MASKED"):
                continue
            field_count = len(line.replace(" ", "\t").split("\t"))
            break
    # Rows are laid out as: chr - range - bin1 - bin2 - ...
    # so the two leading label fields are not matrix columns.
    return field_count - 2
def getMAD(arr):
    """Return the median of *arr* and its scaled Median Absolute Deviation.

    The MAD is multiplied by the consistency constant 1.4826. When the
    underlying distribution is normal, this makes the MAD a good estimate of
    the standard deviation for large samples.

    Returns the tuple ``(median, scaled_mad)``.
    """
    consistency = 1.4826
    center = np.median(arr)
    abs_deviation = np.abs(arr - center)
    spread = consistency * np.median(abs_deviation)
    return center, spread
def getDistribution(a, masked, rows):
    """Summarise the values between each bin and the bin that follows it.

    For every index ``num`` below ``rows - 1`` the value ``a[num, num + 1]``
    is collected, unless ``str(num)`` or ``str(num + 1)`` is in *masked* or
    the diagonal value ``a[num, num]`` is 0 (no coverage in the bin).

    A 50-bin histogram of the collected values below ``median + 2 * MAD`` is
    written to ``distribution.png`` in the current working directory.

    a      -- two-dimensional array indexed as ``a[row, col]``.
    masked -- container of masked bin indices, stored as strings.
    rows   -- number of rows of *a* to scan.

    Returns the tuple ``(median, mad)`` of the collected values, computed
    before the trimming used for the plot.
    """
    values = []
    # The last bin has no following bin, so the scan stops one short.
    for num in range(rows - 1):
        if str(num) in masked or str(num + 1) in masked or a[num, num] == 0:
            # Row masked, following row masked, or no coverage in the bin.
            continue
        values.append(a[num, num + 1])
    median, mad = getMAD(values)

    # Only the plot is trimmed: values at or above median + 2*MAD are left
    # out so that high outliers do not stretch the histogram.
    values = np.array(values)
    f_values = values[values < median + 2 * mad]
    hist, bins = np.histogram(f_values, bins=50)
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2

    # Draw on a dedicated figure and always close it. Drawing on pyplot's
    # implicit current figure would stack bars from repeated calls into the
    # same axes and keep the figure alive.
    fig, ax = plt.subplots()
    try:
        ax.bar(center, hist, align="center", width=width)
        fig.savefig("distribution.png")
    finally:
        plt.close(fig)
    return median, mad
def main():
    """Load the matrix named on the command line and flag candidate bins.

    Steps:
      1. Check that the file is a square matrix; otherwise print an error
         and exit with status 1.
      2. Read the file into a float array. The first two fields of every
         line are skipped. Fields of ``# MASKED`` lines are recorded as
         masked bin indices (kept as strings).
      3. Compute the median and MAD of the values next to the diagonal with
         getDistribution(), which also writes ``distribution.png``.
      4. For each unmasked bin ``num`` with coverage whose next-to-diagonal
         value is below ``median - 2 * MAD``, scan row ``num`` from column
         ``num + 2`` onwards. Record every column whose value is greater
         than that next-to-diagonal value and at least the threshold.

    Returns a dict mapping each flagged bin index to the list of column
    indices recorded for it.
    """
    args = InputPar()

    # Get file counts (rows/cols).
    rows = getRowCount(args.file)
    cols = getFieldsCount(args.file)
    if rows != cols:
        print("Err -> Col number is not equal to row number (not a square matrix).")
        # Non-zero status so callers can tell the run failed.
        sys.exit(1)

    # Builtin float is used because the np.float alias no longer exists in
    # current NumPy releases.
    a = np.zeros((rows, cols), dtype=float)

    # Read the file and fill the matrix.
    row_count = 0
    masked = {}
    with open(args.file) as f:
        for line in f:
            fields = line.rstrip("\n").replace(" ", "\t").split("\t")[2:]
            if line.startswith("# MASKED"):
                for col_masked in fields:
                    masked[col_masked] = 1
                continue
            for field_count, col in enumerate(fields):
                a[row_count, field_count] = float(col)
            row_count += 1

    # Distribution of the values next to the diagonal.
    median, mad = getDistribution(a, masked, rows)

    # Only one bin is inspected at a time, so this step is highly sensitive;
    # the candidates are meant to be filtered later on.
    candidates = {}
    threshold = median - 2 * mad
    # The last bin has no following bin, so the scan stops one short.
    for num in range(rows - 1):
        if str(num) in masked or str(num + 1) in masked or a[num, num] == 0:
            continue
        # NOTE(review): for num == 0 the index num - 1 is -1, which reads the
        # last row of the matrix. Confirm whether that is intended.
        if a[num, num + 1] < threshold and a[num - 1, num + 1]:
            # Outlier. The scan starts at column num + 2, so enumerate from
            # there to record the column each value really comes from.
            for col, value in enumerate(a[num, num + 2:], num + 2):
                if value > a[num, num + 1] and value >= threshold:
                    candidates.setdefault(num, []).append(col)
    return candidates
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
但是,这样我只是逐个 bin 地查看。我需要搜索如下所示的模式:
你对此有什么想法吗?