来自以下zip文件:
wget http://www.nature.com/nature/journal/v498/n7453/extref/nature12172-s1.zip
unzip nature12172-s1.zip
读取Supplementary_Table2.xlsx时,
它有奇怪的基因名称作为行ID,如"2010002N04RIK",这些被解析为日期,而不是常规字符串!!
expression = pd.read_excel('nature12172-s1/Supplementary_Table2.xlsx', "suppTable2Final.txt",
# Need to specify the index column as both the first and the last columns,
# Because the last column is the "Gene Category"
index_col=[0, -1], parse_dates=False, infer_datetime_format=False)
expression.index
MultiIndex(levels=[[100043387, 2013-03-01 00:00:00, 2013-03-02 00:00:00, 2013-03-03 00:00:00, 2013-03-04 00:00:00, 2013-03-05 00:00:00, 2013-03-06 00:00:00, 2013-03-07 00:00:00, u'0610007L01RIK', u'0610007P14RIK', u'0610007P22RIK', u'0610008F07RIK', u'0610009B22RIK', u'0610009D07RIK', u'0610009O20RIK', u'0610010B08RIK', u'0610010F05RIK', u'0610010K06RIK', 2013-03-08 00:00:00, 2013-03-09 00:00:00, 2013-03-10 00:00:00, 2013-03-11 00:00:00, u'0610010K14RIK', u'0610010O12RIK', u'0610011F06RIK', u'0610011L14RIK', u'0610012G03RIK', u'0610012H03RIK', u'0610030E20RIK', u'0610031J06RIK', u'0610037L13RIK', u'0610037P05RIK', u'0610038B21RIK', u'0610039K10RIK', u'0610040B10RIK', u'0610040J01RIK', u'0910001L09RIK', 2013-04-03 00:00:00, 2013-09-01 00:00:00, 2013-09-02 00:00:00, 2013-09-03 00:00:00, 2013-09-04 00:00:00, 2013-09-05 00:00:00, 2013-09-06 00:00:00, 2013-09-07 00:00:00, 2013-09-08 00:00:00, 2013-09-09 00:00:00, 2013-09-10 00:00:00, 2013-09-11 00:00:00, 2013-09-12 00:00:00, 2013-09-14 00:00:00, 2013-09-15 00:00:00, u'1100001G20RIK', u'1110001A16RIK', u'1110001J03RIK', u'1110002B05RIK', u'1110002L01RIK', u'1110002N22RIK', u'1110003E01RIK', u'1110004E09RIK', u'1110004F10RIK', u'1110005A03RIK', u'1110006O24RIK', u'1110007C09RIK', u'1110008F13RIK', u'1110008J03RIK', u'1110008L16RIK', u'1110008P14RIK', u'1110012D08RIK', u'1110012J17RIK', u'1110012L19RIK', u'1110014N23RIK', u'1110017F19RIK', u'1110018G07RIK', u'1110018H23RIK', u'1110018J18RIK', u'1110020A21RIK', u'1110020G09RIK', u'1110021J02RIK', u'1110021L09RIK', u'1110028C15RIK', u'1110031I02RIK', u'1110032A03RIK', u'1110032A04RIK', u'1110032F04RIK', u'1110034A24RIK', u'1110034B05RIK', u'1110034G24RIK', u'1110037F02RIK', u'1110038B12RIK', u'1110038D17RIK', u'1110038F14RIK', u'1110049F12RIK', u'1110051M20RIK', u'1110054O05RIK', u'1110057K04RIK', u'1110058L19RIK', u'1110059E24RIK', u'1110059G10RIK', u'1110059M19RIK', ...], [u'Housekeeping', u'LPS Response']],
labels=[[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, ...], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...]],
names=[u'GENE', u'Gene Category'])
我已尝试过parse_dates=False
和infer_datetime_format=False
的所有组合,文档声称这是默认组合,但这些ID仍然被解析为日期,而不是字符串?
编辑:当我从包含相同数据的文本文件中读取时,不会出现同样的问题:
wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE41nnn/GSE41265/suppl/GSE41265_allGenesTPM.txt.gz
expression = pd.read_table("GSE41265_allGenesTPM.txt.gz", compression="gzip", index_col=0)
expression.index
Index([u'XKR4', u'AB338584', u'B3GAT2', u'NPL', u'T2', u'T', u'PDE10A', u'1700010I14RIK', u'6530411M01RIK', u'PABPC6', u'AK019626', u'AK020722', u'QK', u'B930003M22RIK', u'RGS8', u'PACRG', u'AK038428', u'AK163153', u'PARK2', u'AK080902', u'AGPAT4', u'MAP3K4', u'AK029100', u'PLG', u'SLC22A3', u'RGS16', u'AK021075', u'SLC22A2', u'SLC22A1', u'IGF2R', u'AIRN', u'MAS1', u'MRGPRH', u'PNLDC1', u'MRPL18', u'TCP1', u'RNASEL', u'SNORA20', u'ACAT3', u'ACAT2', u'WTAP', u'SOD2', u'GPR31C', u'TCP10C', u'TTLL2', u'UNC93A', u'GM10512', u'RGSL1', u'SMOK2A', u'SMOK2B', u'AK036897', u'BC068229', u'SMOK(TCR)', u'AK143195', u'AK008572', u'TCTE2', u'MLLT4', u'5830403L16RIK', u'GM7168', u'DACT2', u'SMOC2', u'4930474M22RIK', u'THBS2', u'WDR27', u'AK004434', u'1600012H06RIK', u'PHF10', u'LOC106740', u'GM5531', u'TCTE3', u'9030025P20RIK', u'AK050117', u'GM3435', u'GM10510', u'DLL1', u'TEDDM1', u'FAM120B', u'PSMB1', u'TBP', u'PDCD2', u'PRDM9', u'CHD1', u'RGMB', u'ZFP960', u'AK138383', u'ZFP97', u'GLUL', u'AK164331', u'RIOK2', u'LIX1', u'AK164875', u'LNPEP', u'VMN2R90', u'MIR99B', u'MIRLET7E', u'MIR125A', u'4930546H06RIK', u'AK043564', u'HAS1', u'FPR1', ...], dtype='object')