使用Python根据某些键合并字段

时间:2015-08-18 18:11:04

标签: python

我的表格看起来像这样:

 ID   sam1   sam2  sam3  sam4  sam5
 11   Yes 
 11                 Yes
 11
 22                      Yes
 22
 22                             Yes

我希望结果是这样的:

 ID   sam1   sam2  sam3  sam4  sam5
 11   Yes    NA     Yes   NA    NA
 22   NA     NA     NA   Yes    Yes

我怎么能用python做到这一点?

1 个答案:

答案 0 :(得分:0)

在整个数据处理过程中,解析这种自由文本格式的表格花费的时间最长:

#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
import csv
import re
import sys
from operator import itemgetter

try:
    from itertools import chain, groupby, islice, izip_longest
except ImportError:
    # Python 3 renamed izip_longest to zip_longest; keep the old name bound so
    # the rest of the module works unchanged on both major versions.
    from itertools import chain, groupby, islice
    from itertools import zip_longest as izip_longest

NOT_WHITESPACE_RE = re.compile(r'\S+')


def parse_lines(lines):
    """Split fixed-width text lines into lists of stripped cell strings.

    Column boundaries are taken from the first line: every run of
    non-whitespace characters there starts a new column, and a column extends
    up to the start of the next one (the last column runs to the end of the
    line). Every line, including the first, is then cut at those positions.

    Args:
        lines: Iterable of text lines; the first one is the header.

    Returns:
        A lazy iterator of rows, each a list of stripped strings with one
        entry per header column. Empty input yields an empty iterator.
    """
    lines = iter(lines)
    try:
        header = next(lines)
    except StopIteration:
        # No header means no column layout and therefore no rows.
        return iter(())
    starts = [match.start() for match in NOT_WHITESPACE_RE.finditer(header)]
    # Pair each column start with the following one; the trailing None leaves
    # the last slice open-ended so it reaches the end of every line.
    columns = [
        slice(begin, end) for begin, end in zip(starts, starts[1:] + [None])
    ]
    return (
        [line[column].strip() for column in columns]
        for line in chain([header], lines)
    )


def merge_rows(row_a, row_b):
    """Combine two rows cell by cell, preferring non-empty cells of row_a.

    For each position the cell from ``row_a`` is kept when it is truthy,
    otherwise the cell from ``row_b`` is used. The shorter row is padded
    with empty strings, so the result is as long as the longer input.
    """
    paired_cells = izip_longest(row_a, row_b, fillvalue='')
    return [left if left else right for left, right in paired_cells]


def aggregate_rows(rows):
    """Merge each run of consecutive rows that share the same first cell.

    Rows are grouped with ``itertools.groupby`` on their first element, so
    only adjacent rows with an equal key are combined; rows with the same key
    that are separated by other rows produce separate output rows.

    Args:
        rows: Iterable of rows (sequences of cell strings).

    Returns:
        A lazy iterator yielding one merged list per group, built by folding
        the group's rows together with ``merge_rows``.
    """
    def fold(group):
        # Explicit fold instead of the builtin ``reduce``, which exists only
        # on Python 2; this form behaves the same on Python 2 and 3.
        merged = []
        for row in group:
            merged = merge_rows(merged, row)
        return merged

    return (fold(group) for _, group in groupby(rows, itemgetter(0)))


def replace_empty(rows):
    """Lazily yield copies of the rows with every falsy cell set to 'NA'."""
    def fill(row):
        return [cell if cell else 'NA' for cell in row]

    return (fill(row) for row in rows)


def process_table(lines):
    """Run the full pipeline: parse the lines, merge rows, fill empty cells.

    Returns the iterator produced by ``replace_empty`` over the aggregated,
    parsed rows.
    """
    parsed_rows = parse_lines(lines)
    merged_rows = aggregate_rows(parsed_rows)
    return replace_empty(merged_rows)


def main(path='test.txt'):
    """Read the table stored at ``path`` and write the merged result to stdout.

    Args:
        path: Name of the input text file. Defaults to ``'test.txt'``, the
            file name that was previously hard-coded.

    The processed rows are written to ``sys.stdout`` as tab-separated values.
    """
    with open(path) as lines:
        writer = csv.writer(sys.stdout, delimiter='\t')
        # The rows are produced lazily, so they must be written while the
        # input file is still open.
        writer.writerows(process_table(lines))


# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

使用上面的测试输入时,程序会向标准输出写入以下内容:

ID      sam1    sam2    sam3    sam4    sam5
11      Yes     NA      Yes     NA      NA
22      NA      NA      NA      Yes     Yes