
时间:2014-09-17 10:14:11

标签: python sql pandas bioinformatics


ch  S-MART  utr5    313 516 .   +   .   blabla
ch  GenBank gene    517 1878    .   +   1   ID=SAOUHSC_00001;Name=SAOUHSC_00001
ch  GenBank mRNA    517 1878    .   +   1   ID=SAOUHSC_00001.t01;Parent=SAOUHSC_00001
ch  GenBank CDS 517 1878    .   +   1   ID=SAOUHSC_00001.p01;Name=SAOUHSC_00001;product=chromosomal 
ch  GenBank exon    517 1878    .   +   1   Parent=SAOUHSC_00001.t01
ch  S-MART  gene    517 1878    .   +   .   blabla
ch  S-MART  operon  1879    2155    .   +   .   blabla
ch  GenBank gene    2156    3289    .   +   1   ID=SAOUHSC_00002;Name=SAOUHSC_00002
ch  GenBank mRNA    2156    3289    .   +   1   ID=SAOUHSC_00002.t01;Parent=SAOUHSC_00002
ch  GenBank CDS 2156    3289    .   +   1   ID=SAOUHSC_00002.p01;Parent=SAOUHSC_00002.t01;Name=SAOUHSC_00002;product=DNA polymerase 
ch  S-MART  utr3    3290    3331    .   +   .   blabla
ch  S-MART  utr5    3649    3669    .   +   .   blabla
ch  GenBank gene    3670    3915    .   +   1   ID=SAOUHSC_00003;Name=SAOUHSC_00003
ch  GenBank CDS 3670    3915    .   +   1   ID=SAOUHSC_00003.p01;Parent=SAOUHSC_00003.t01;Name=SAOUHSC_00003;product=conserved  
ch  S-MART  gene    3670    5024    .   +   .   blabla




import pandas as pd
# Read the tab-separated table "myTable" and label its nine columns.
staphInputGff = pd.read_table(
    "myTable",
    sep='\t',
    names=[
        "seqid", "source", "type",
        "start", "end", "score",
        "strand", "phase", "attributes",
    ],
)
# Pull out the two coordinate columns under short names.
start = staphInputGff["start"]
end = staphInputGff["end"]

def consolidate(start, end):
    """Merge contiguous or overlapping (start, end) pairs into runs.

    Two pairs belong to the same run when the next start is at most one
    past the end reached so far (``next_start <= run_end + 1``), so both
    overlapping and directly adjacent intervals are merged. Exact
    duplicates are absorbed into the run they belong to.

    Args:
        start: Iterable of interval start coordinates, ordered by start.
        end: Iterable of interval end coordinates, parallel to ``start``.

    Returns:
        A list of ``(run_start, run_end)`` tuples, one per merged run.
        Empty input yields an empty list. The inputs are not modified.
    """
    result = []
    run_start = run_end = None
    # Iterate positionally so lists and pandas Series behave the same,
    # regardless of a Series' index labels.
    for cur_start, cur_end in zip(start, end):
        if run_start is None:
            # First pair opens the first run.
            run_start, run_end = cur_start, cur_end
        elif cur_start <= run_end + 1:
            # Contiguous/overlapping: extend the run, but never shrink it
            # when the current interval is nested inside the run.
            if cur_end > run_end:
                run_end = cur_end
        else:
            # Gap found: close the current run and open a new one.
            result.append((run_start, run_end))
            run_start, run_end = cur_start, cur_end
    if run_start is not None:
        result.append((run_start, run_end))  # the final run
    return result


(313, 3331), (3649, 5024)



我需要输出一张只包含特定类型(键)的表——只保留 CDS 行就足够了(我不需要其他行的信息)。期望的输出如下:


ch  GenBank CDS 313 3331    .   +   .   ID=SAOUHSC_00001.p01;Name=SAOUHSC_00001;product=chromosomal
ch  GenBank CDS 3649 5024   .   +   .   ID=SAOUHSC_00003.p01;Parent=SAOUHSC_00003.t01;Name=SAOUHSC_00003;product=conserved



1 个答案:

答案 0 :(得分:1)


# start with the data in a data frame df

     a        b       c  Start   End  f  g  h                                                  i

    0   ch   S-MART    utr5    313   516  .  +  .                                             blabla
    1   ch  GenBank    gene    517  1878  .  +  1                ID=SAOUHSC_00001;Name=SAOUHSC_00001
    2   ch  GenBank    mRNA    517  1878  .  +  1          ID=SAOUHSC_00001.t01;Parent=SAOUHSC_00001
    3   ch  GenBank     CDS    517  1878  .  +  1  ID=SAOUHSC_00001.p01;Name=SAOUHSC_00001;produc...
    4   ch  GenBank    exon    517  1878  .  +  1                           Parent=SAOUHSC_00001.t01
    5   ch   S-MART    gene    517  1878  .  +  .                                             blabla
    6   ch   S-MART  operon   1879  2155  .  +  .                                             blabla
    7   ch  GenBank    gene   2156  3289  .  +  1                ID=SAOUHSC_00002;Name=SAOUHSC_00002
    8   ch  GenBank    mRNA   2156  3289  .  +  1          ID=SAOUHSC_00002.t01;Parent=SAOUHSC_00002
    9   ch  GenBank     CDS   2156  3289  .  +  1  ID=SAOUHSC_00002.p01;Parent=SAOUHSC_00002.t01;...
    10  ch   S-MART    utr3   3290  3331  .  +  .                                             blabla
    11  ch   S-MART    utr5   3649  3669  .  +  .                                             blabla
    12  ch  GenBank    gene   3670  3915  .  +  1                ID=SAOUHSC_00003;Name=SAOUHSC_00003
    13  ch  GenBank     CDS   3670  3915  .  +  1  ID=SAOUHSC_00003.p01;Parent=SAOUHSC_00003.t01;...
    14  ch   S-MART    gene   3670  5024  .  +  .                                             blabla

# If there are duplicate start/end values, discard those that repeat the CDS
cdsSE = df[df['c']=='CDS'][['Start','End']].values
droprows = []  # index labels of the non-CDS rows to discard
for idx in df.index:
    # NOTE(review): `in` on a 2-D NumPy array is true when ANY element matches
    # after broadcasting, so a non-CDS row that shares only its Start or only
    # its End with some CDS row is discarded too (e.g. a row starting at 3670
    # but ending at 5024). Confirm whether an exact (Start, End) match is
    # what is wanted here.
    if (df.loc[idx][['Start','End']].values in cdsSE) and (df.loc[idx]['c'] != 'CDS'):
        droprows.append(idx)
# droprows holds labels, so drop by label rather than re-indexing by position.
df2 = df.drop(droprows)

# Walk through the new data frame. If the next row is contiguous with (or
# overlaps) the present row, fold the present row into the next one and
# mark the present row for deletion.

info_cols = ['a', 'b', 'c', 'f', 'g', 'h', 'i']
droprows = []  # positions (not labels) of rows folded into their successor
for i in range(len(df2) - 1):
    present, following = df2.index[i], df2.index[i+1]
    if (df2['End'].iloc[i] + 1) >= df2['Start'].iloc[i+1]: # to include lesser start values

        # If the present row is CDS,
        # save its information by also moving it to the following row
        if df2['c'].iloc[i] == 'CDS':
            df2.loc[following, info_cols] = df2.loc[present, info_cols]

        # Carry the start value of the present row forward ...
        df2.loc[following, 'Start'] = df2['Start'].iloc[i]

        # ... keep the larger End so a nested row cannot shrink the merged span ...
        if df2['End'].iloc[i] > df2['End'].iloc[i+1]:
            df2.loc[following, 'End'] = df2['End'].iloc[i]

        # ... and mark the present row for deletion
        droprows.append(i)

# And make a new data frame by deleting the unwanted rows
df3 = df2.drop(df2.index[droprows])

    a        b    c  Start   End  f  g  h                                                  i
10  ch  GenBank  CDS    313  3331  .  +  1  ID=SAOUHSC_00001.p01;Name=SAOUHSC_00001;produc...
13  ch  GenBank  CDS   3649  3915  .  +  1  ID=SAOUHSC_00003.p01;Parent=SAOUHSC_00003.t01;...