我有一个制表符分隔的文件,其中包含若干区域以及在这些区域中找到的相应生物实体(我一共检查了 67 个实体,也就是说,每个区域都针对这 67 个实体检查了是否存在及其出现频率)。
我以表格格式提供所有这些数据。
下面给出了样本数据
Region ATF3 BCL3 BCLAF1 BDP1 BRF1 BRF2 Brg1 CCNT2 CEBPB CHD2 CTCF CTCFL E2F6 ELF1
chr1:109102470:109102970 0 0 1 0 0 0 0 1 0 0 4 1 4 1
chr1:110526886:110527386 0 0 0 0 0 0 0 1 1 0 4 1 0 1
chr1:115300671:115301171 0 0 1 0 0 0 0 0 1 1 4 1 1 1
chr1:115323308:115323808 0 0 0 0 0 0 0 1 0 0 2 1 1 0
chr1:11795641:11796141 1 0 0 0 0 0 0 1 2 0 0 0 1 0
chr1:118148103:118148603 0 0 0 0 0 0 0 1 0 0 0 0 0 1
chr1:150521397:150521897 0 0 0 0 0 0 0 2 2 0 6 2 4 0
chr1:150601609:150602109 0 0 0 0 0 0 0 0 3 2 0 0 1 0
chr1:150602098:150602598 0 0 0 0 0 0 0 0 1 1 0 0 0 0
chr1:151119140:151119640 0 0 0 0 0 0 0 1 0 0 0 0 1 0
chr1:151128604:151129104 0 0 0 0 0 0 0 0 0 0 3 0 0 0
chr1:153517729:153518229 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1:153962738:153963238 0 0 0 0 0 0 0 1 1 0 0 0 0 1
chr1:154155682:154156182 0 0 0 0 0 0 0 1 0 0 0 0 1 1
chr1:154155725:154156225 0 0 0 0 0 0 0 1 0 0 0 0 1 1
chr1:154192154:154192654 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1:154192824:154193324 1 0 0 0 0 0 0 1 0 1 0 0 1 1
chr1:154192943:154193443 1 0 0 0 0 0 0 1 0 2 0 0 1 1
chr1:154193273:154193773 1 0 0 0 0 0 0 1 0 2 0 0 2 1
chr1:154193313:154193813 0 0 0 0 0 0 0 1 0 2 0 0 2 1
chr1:155904188:155904688 0 0 0 0 0 0 0 1 0 0 0 0 1 1
chr1:155947966:155948466 0 0 0 0 0 0 0 1 0 0 3 0 0 1
chr1:155948336:155948836 0 0 0 0 0 0 0 1 0 0 5 1 0 1
chr1:156023516:156024016 0 0 0 0 0 0 0 1 0 1 4 1 1 1
chr1:156024016:156024516 0 1 1 0 0 0 0 0 0 2 0 0 1 1
chr1:156163229:156163729 0 0 0 0 0 0 0 0 0 0 2 0 0 1
chr1:160990902:160991402 0 0 0 0 0 0 0 0 0 1 0 0 1 2
chr1:160991133:160991633 0 0 0 0 0 0 0 0 0 1 0 0 1 2
chr1:161474704:161475204 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1:161509530:161510030 0 0 1 1 1 0 0 0 1 0 1 0 0 1
chr1:161590964:161591464 0 0 0 1 1 0 0 0 0 0 0 0 0 0
chr1:169075446:169075946 0 0 0 0 0 0 0 2 0 0 4 0 3 0
chr1:17053279:17053779 0 0 0 1 0 0 0 0 0 1 0 0 0 0
chr1:1709909:1710409 0 0 0 0 0 0 0 2 0 1 0 0 3 1
chr1:1710297:1710797 0 0 0 0 0 0 0 0 0 1 6 0 1 1
现在,我怎样才能把这些数据绘制成一张热图:颜色随频率从浅红渐变到深红,实体不存在(频率为 0)时显示为白色?
还有其他更好的方法来表示这类数据吗?
答案 0 :(得分:4)
import pylab as plt
import numpy as np
# Load the count matrix. The header row is skipped, and the first column
# (region names, which are not numeric) is converted to a dummy 0 so that
# loadtxt can parse every field as a number.
data = np.loadtxt("14318737.txt", skiprows=1, converters={0: lambda x: 0})
# Drop the dummy first column and mask all zero counts so they are not
# painted by the colormap.
plot_data = np.ma.masked_equal(data[:, 1:], 0)
# The colormap is passed by name: plt.cm.get_cmap() was deprecated in
# Matplotlib 3.7 and removed in 3.9, while a name string works everywhere.
plt.imshow(plot_data, cmap="Reds", interpolation="nearest")
plt.colorbar()
plt.show()
我忽略了第一行和第一列(如果标签需要它们,我们需要更改它)。对于剩余数据,所有零值都被屏蔽(因此它们在图中显示为白色),然后将这些数据绘制为颜色编码图。
imshow 还有许多其他参数可以控制结果,例如 origin(lower/upper)和 aspect(auto/equal/某个比值)。
你写的地区 - 你的意思是地理区域?然后,您可能需要查看Basemap Toolkit for Matplotlib以创建颜色编码的地图。
**修改**
新要求,新示例
import pylab as plt
import numpy as np
fn = "14318737.txt"
# The column labels come from the header row; its first field names the
# region column and is dropped.
with open(fn, "r") as f:
    labels = f.readline().rstrip("\n").split()[1:]
# The first column (region names) is not numeric, so it is converted to a
# dummy 0 and sliced away below.
data = np.loadtxt(fn, skiprows=1, converters={0: lambda x: 0})
# Mask zero counts so they are not painted by the colormap.
plot_data = np.ma.masked_equal(data[:, 1:], 0)
# Leave room at the bottom for the rotated column labels.
plt.subplots_adjust(left=0.1, bottom=0.15, right=0.99, top=0.95)
# The colormap is passed by name: plt.cm.get_cmap() was deprecated in
# Matplotlib 3.7 and removed in 3.9.
plt.imshow(plot_data, cmap="Reds", interpolation="nearest", aspect="auto")
# One tick per data column, labelled with the corresponding header field.
plt.xticks(range(len(labels)), labels, rotation=90, va="top", ha="center")
plt.colorbar()
plt.show()
现在我先从第一行读取标签。我在 imshow 调用中添加了关键字参数 aspect,并为每个因子创建了刻度标签。
此外,我使用 subplots_adjust 调整绘图的位置。您可以反复调整这些参数,直到满足您的需求。
现在的结果是:
如果您想为 y 轴设置其他刻度,请像示例中使用 plt.xticks 那样使用 plt.yticks。
答案 1 :(得分:1)
在我另一个答案的评论中,提问者(OP)又提出了一个关于搜索二维聚类的问题。下面给出一种解答。
我使用了取自我的库 eegpy 的方法 find_clusters。它遍历二维数组,找出所有高于/低于给定阈值的聚类。
这是我的代码:
import pylab as plt
import numpy as np
from Queue import Queue
def find_clusters(ar, thres, cmp_type="greater"):
    """Find all connected clusters of a 2d array that pass a threshold test.

    Cells belong to the same cluster when they are linked through direct
    neighbours (up, down, left, right); diagonal neighbours are not joined.

    Parameters
    ----------
    ar : 2d array-like
        Values to test (e.g. a test statistic).
    thres : scalar
        Threshold the values are compared against (strict comparison).
    cmp_type : {"lower", "greater", "abs_greater"}
        "lower" selects ``ar < thres``, "greater" selects ``ar > thres`` and
        "abs_greater" selects ``abs(ar) > thres``.

    Returns
    -------
    list of 2d bool arrays
        One mask per cluster, each with the shape of ``ar``; True marks the
        cells of that cluster. Clusters are listed in the order their first
        cell is met when scanning row by row.

    Raises
    ------
    ValueError
        If ``cmp_type`` is not one of the supported names or ``ar`` is not
        two-dimensional.
    """
    # Local import so the function does not rely on the Python-2-only
    # ``Queue`` module; a deque is a sufficient single-threaded FIFO.
    from collections import deque

    if cmp_type not in ("lower", "greater", "abs_greater"):
        raise ValueError("cmp_type must be in [\"lower\",\"greater\",\"abs_greater\"]")
    ar = np.asanyarray(ar)
    if ar.ndim != 2:
        raise ValueError("ar must be a 2d array, got %d dimension(s)" % ar.ndim)

    if cmp_type == "lower":
        ar_in = ar < thres
    elif cmp_type == "greater":
        ar_in = ar > thres
    else:  # cmp_type == "abs_greater"
        ar_in = abs(ar) > thres
    # The builtin ``bool`` replaces the ``np.bool`` alias removed in NumPy 1.24.
    ar_in = ar_in.astype(bool)

    n_rows, n_cols = ar_in.shape
    already_visited = np.zeros(ar_in.shape, bool)
    clusters = []
    for i_s in range(n_rows):  # i_s as in i_sample
        for i_f in range(n_cols):
            if already_visited[i_s, i_f] or not ar_in[i_s, i_f]:
                continue
            # Unvisited cell that passes the test: flood-fill its cluster.
            mask = np.zeros(ar_in.shape, bool)
            pending = deque([(i_s, i_f)])
            while pending:
                pos_x, pos_y = pending.popleft()
                if already_visited[pos_x, pos_y]:
                    continue
                already_visited[pos_x, pos_y] = True
                if not ar_in[pos_x, pos_y]:
                    # Border cell: remembered as visited but not expanded.
                    continue
                mask[pos_x, pos_y] = True
                # Direct neighbours only, clipped to the array bounds.
                for n_x, n_y in ((pos_x - 1, pos_y), (pos_x + 1, pos_y),
                                 (pos_x, pos_y - 1), (pos_x, pos_y + 1)):
                    if 0 <= n_x < n_rows and 0 <= n_y < n_cols:
                        pending.append((n_x, n_y))
            clusters.append(mask)
    return clusters
fn = "14318737.txt"
# The column labels come from the header row; its first field names the
# region column and is dropped.
with open(fn, "r") as f:
    labels = f.readline().rstrip("\n").split()[1:]
# The first column (region names) is not numeric, so it is converted to a
# dummy 0. It stays in ``data``, so cluster masks carry that extra column 0.
data = np.loadtxt(fn, skiprows=1, converters={0: lambda x: 0})
clusters = find_clusters(data, 0, "greater")
# Mask zero counts so they are not painted by the colormap.
plot_data = np.ma.masked_equal(data[:, 1:], 0)
plt.subplots_adjust(left=0.1, bottom=0.15, right=0.99, top=0.95)
# The extent shifts the image one unit to the right so that plotted column j
# is centred on x = j + 1, matching the column indices of the cluster masks.
# The colormap is passed by name because plt.cm.get_cmap() was removed in
# Matplotlib 3.9.
plt.imshow(plot_data, cmap="Reds", interpolation="nearest", aspect="auto",
           vmin=0,
           extent=[0.5, plot_data.shape[1] + 0.5, plot_data.shape[0] - 0.5, -0.5])
plt.colorbar()
for cl in clusters:
    # Outline each cluster at the 0/1 boundary of its mask. ``int`` replaces
    # the removed ``np.int`` alias and ``linewidths`` is the keyword that
    # contour() actually accepts (``lw`` is not one).
    plt.contour(cl.astype(int), [0.5], colors="k", linewidths=2)
# Exactly one tick per label, at the shifted column centres 1..len(labels).
plt.xticks(np.arange(1, len(labels) + 1), labels, rotation=90, va="top", ha="center")
plt.show()
给出了一个形式的图像:
clusters 是一个由布尔二维数组(True/False)组成的列表。每个数组代表一个集群,其中每个布尔值表示对应的“点”是否属于该集群。您可以在任何后续分析中使用它。
**修改**
现在在群集上有更多乐趣
import pylab as plt
import numpy as np
from Queue import Queue
def find_clusters(ar, thres, cmp_type="greater"):
    """Find all connected clusters of a 2d array that pass a threshold test.

    Cells belong to the same cluster when they are linked through direct
    neighbours (up, down, left, right); diagonal neighbours are not joined.

    Parameters
    ----------
    ar : 2d array-like
        Values to test (e.g. a test statistic).
    thres : scalar
        Threshold the values are compared against (strict comparison).
    cmp_type : {"lower", "greater", "abs_greater"}
        "lower" selects ``ar < thres``, "greater" selects ``ar > thres`` and
        "abs_greater" selects ``abs(ar) > thres``.

    Returns
    -------
    list of 2d bool arrays
        One mask per cluster, each with the shape of ``ar``; True marks the
        cells of that cluster. Clusters are listed in the order their first
        cell is met when scanning row by row.

    Raises
    ------
    ValueError
        If ``cmp_type`` is not one of the supported names or ``ar`` is not
        two-dimensional.
    """
    # Local import so the function does not rely on the Python-2-only
    # ``Queue`` module; a deque is a sufficient single-threaded FIFO.
    from collections import deque

    if cmp_type not in ("lower", "greater", "abs_greater"):
        raise ValueError("cmp_type must be in [\"lower\",\"greater\",\"abs_greater\"]")
    ar = np.asanyarray(ar)
    if ar.ndim != 2:
        raise ValueError("ar must be a 2d array, got %d dimension(s)" % ar.ndim)

    if cmp_type == "lower":
        ar_in = ar < thres
    elif cmp_type == "greater":
        ar_in = ar > thres
    else:  # cmp_type == "abs_greater"
        ar_in = abs(ar) > thres
    # The builtin ``bool`` replaces the ``np.bool`` alias removed in NumPy 1.24.
    ar_in = ar_in.astype(bool)

    n_rows, n_cols = ar_in.shape
    already_visited = np.zeros(ar_in.shape, bool)
    clusters = []
    for i_s in range(n_rows):  # i_s as in i_sample
        for i_f in range(n_cols):
            if already_visited[i_s, i_f] or not ar_in[i_s, i_f]:
                continue
            # Unvisited cell that passes the test: flood-fill its cluster.
            mask = np.zeros(ar_in.shape, bool)
            pending = deque([(i_s, i_f)])
            while pending:
                pos_x, pos_y = pending.popleft()
                if already_visited[pos_x, pos_y]:
                    continue
                already_visited[pos_x, pos_y] = True
                if not ar_in[pos_x, pos_y]:
                    # Border cell: remembered as visited but not expanded.
                    continue
                mask[pos_x, pos_y] = True
                # Direct neighbours only, clipped to the array bounds.
                for n_x, n_y in ((pos_x - 1, pos_y), (pos_x + 1, pos_y),
                                 (pos_x, pos_y - 1), (pos_x, pos_y + 1)):
                    if 0 <= n_x < n_rows and 0 <= n_y < n_cols:
                        pending.append((n_x, n_y))
            clusters.append(mask)
    return clusters
fn = "14318737.txt"
rows = []
with open(fn, "r") as f:
    # The column labels come from the header row; its first field names the
    # region column and is dropped.
    labels = f.readline().rstrip("\n").split()[1:]
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline) instead of adding
            # an empty row that would make the array ragged.
            continue
        # The first field is the region name; the rest are integer counts.
        rows.append([int(v) for v in fields[1:]])
data = np.array(rows)
clusters = find_clusters(data, 0, "greater")
# Keep only clusters with more than five cells, largest first.
large_clusters = sorted((cl for cl in clusters if cl.sum() > 5),
                        key=lambda cl: cl.sum(), reverse=True)
# Mask zero counts so they are not painted by the colormap.
plot_data = np.ma.masked_equal(data, 0)
plt.subplots_adjust(left=0.1, bottom=0.15, right=0.99, top=0.95)
# The colormap is passed by name because plt.cm.get_cmap() was removed in
# Matplotlib 3.9. The extent centres pixel (row, col) on (x=col, y=row), so
# the image lines up with the contour coordinates below.
plt.imshow(plot_data, cmap="Reds", interpolation="nearest", aspect="auto",
           vmin=0,
           extent=[-0.5, plot_data.shape[1] - 0.5, plot_data.shape[0] - 0.5, -0.5])
plt.colorbar()
for cl in large_clusters:
    # Outline each cluster at the 0/1 boundary of its mask. ``int`` replaces
    # the removed ``np.int`` alias and ``linewidths`` is the keyword that
    # contour() actually accepts (``lw`` is not one).
    plt.contour(cl.astype(int), [.5], colors="k", linewidths=2)
# Exactly one tick per label, at the column centres 0..len(labels)-1.
plt.xticks(np.arange(len(labels)), labels, rotation=90, va="top", ha="center")
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("Summary of all large clusters:\n")
print("#\tSize\tIn regions")
for i, cl in enumerate(large_clusters):
    # Indices of the columns that contain at least one cell of the cluster;
    # np.where returns them in ascending order.
    columns_in_cluster = np.where(np.any(cl, axis=0))[0]
    first_label = labels[columns_in_cluster[0]]
    last_label = labels[columns_in_cluster[-1]]
    print("%i\t%i\t%s to %s" % (i, cl.sum(), first_label, last_label))
plt.xlim(-0.5, plot_data.shape[1] - 0.5)
plt.show()
我只保留包含超过五个点的群集,并且只绘制这些群集。您也可以改用每个群集内 data 的总和作为筛选条件。然后我按大小对这些大群集进行降序排序。
最后,我打印所有大型集群的摘要,包括每个集群所跨越的因子(列)名称范围。