我有以下代码,我试图将样本的总体名称 (igsr_samples.tsv) 与其遗传数据 (1000G_chr1_1-1000001_some.vcf) 进行映射。我不确定我是否做得对,但我不断收到错误消息。数据在此驱动器中:https://drive.google.com/drive/folders/1dwNfONcukNYKzQzUwo-Y-5TfTY16pagy?usp=sharing
import random
import matplotlib as matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import vcf
import seaborn as sns
import allel
from plotly.offline import offline
from sklearn.decomposition import PCA
random.seed(42)
def PCA_conventionalSVD(VCF, fileName):
# Read data of chromosome
data = allel.read_vcf(VCF)
df = allel.vcf_to_dataframe(VCF)
geneData = sorted(data['samples'])
# Read data of labels
df_samples = pd.read_csv(fileName, sep='\t')
labelData = df_samples['Sample name'].sort_values(ascending=True)
setIndex = df_samples.iloc
df_samples2 = df_samples.set_index('Sample name’)
keys = ['Sample name', 'Population_name']
dict2 = {x: df_samples[x] for x in keys}
dict2['sample'] = dict2.pop('Sample name')
# reference the sample to the population
# chomosome/position - read line
print(data.keys())
x = pd.DataFrame.from_dict(data, orient='index')
x = x.transpose()
x = x.dropna()
**x['populations'] = x['samples'].map(df_samples)**
if __name__ == '__main__':
# Data window
desired_width = 320
pd.set_option('display.width', desired_width)
np.set_printoptions(linewidth=desired_width)
pd.set_option('display.max_columns', 50)
dataFile = 'data/1000G_chr1_1-1000001_some.vcf'
fileName = 'data/igsr_samples.tsv'
PCA_conventionalSVD(dataFile, fileName)