分钟值重复时,如何将数据帧(DataFrame)转换为时间序列

时间:2018-02-20 09:32:24

标签: python dataframe machine-learning

我正在处理临床数据,并希望预测患者每分钟的等待时间。数据(简化后)看起来像这样:

Time(minutes)    PatientSerial       RemainingTime(minutes)
420              1                      5
420              2                      10
420              3                      8
421              1                      4
421              2                      9
421              3                      7

其中420是自午夜以来的分钟数(420 = 上午7:00),我要预测的输出是RemainingTime(来自历史数据)。通常,假设输入是每分钟产生的临床数据,机器学习算法应该每分钟为每位患者给出一个等待时间。但是,同一分钟会在多行中重复出现(每位患者一行),我不清楚在这种情况下应如何把这个数据帧转换为时间序列。

1 个答案:

答案 0 :(得分:0)

为清楚起见:这本来不是一个答案,而是想询问结果应该是什么样子(在问题下方的评论中无法显示表格)。这可能有助于更好地理解如何解决这个问题。编辑1:代码形式的答案见下文。

@Ted:

我想知道您尝试获得的结果是否如下表所示:

Time (min)   MeanWait (min, single default patient)

...          ...
420          5.2
421          4.9
422          4.3
423          4.2
...
...
820          11.39
821          11.41
822          11.41
823          11.09
824          10.7
825          10.69
...          ...

最终结果是应该用Matplotlib输出为PDF查看,还是在程序的GUI中直接显示在屏幕上?如果是这样,请修改您的问题,把这一点包含进去。

编辑1:

根据评论中的讨论,我在下面给出一个脚本,它完成核心工作:“计算”一天中每个时间单位(分钟)的患者平均等待时间。代码中的行内注释说明了每一步在做什么。我想你可以自己实现文件数据的加载和结果的写出;我没有加入matplotlib,这方面在本站和网络上已经有足够多的例子。

import datetime

# Time axis convention: a day runs from minute 0 to minute 1440, after which
# the counter restarts for the following day.
# The input text may use either a per-day scale (0-1440) or a continuous
# scale (e.g. 0-4320 == 3 days) for the x-axis.

# Sample rows for exercising the processing code (day 1 followed by day 2).
# First entry is the header line; every other entry is "<minute> : <wait>".
datas = [
    'Time(minutes)       RemainingTime(minutes)',
    # day 1
    '420              :                      5',
    '420              :                      10',
    '420              :                      8',
    '421              :                      4',
    '421              :                      9',
    '421              :                      7',
    '830              :                      8',
    '830              :                      4',
    # day 2 (the minute value drops back below the previous one)
    '340              :                      3',
    '340              :                      5',
    '340              :                      4',
    '351              :                      10',
    '351              :                      7',
    '420              :                      9',
    '420              :                      7',
]

def sort_data(scr):
    """Group waiting times by absolute minute and count the days covered.

    Each row of *scr* is expected to look like ``"<minute> : <wait>"`` where
    both fields are integers.  Rows that do not split into exactly two
    integer fields around a single ":" (for example a header line) are not
    parsed; they are collected and returned separately.

    Consecutive rows that share the same minute form one bucket.  When the
    minute value drops below the previous one, a new day is assumed to have
    started and all following keys are shifted by another 24 * 60 minutes.

    Progress is echoed to stdout with ``print`` while the rows are read.

    Args:
        scr: iterable of text rows.

    Returns:
        A tuple ``(raw_data, processed_days, data_from_exception)``:

        * ``raw_data`` maps the absolute minute (minute of day plus
          1440 for every earlier day) to the list of waiting times;
        * ``processed_days`` is the number of days seen (starts at 1);
        * ``data_from_exception`` maps a 1-based counter to every row that
          could not be parsed.
    """
    day_in_minutes = 24 * 60

    raw_data = {}
    data_from_exception = {}
    count_exceptions = 0
    processed_days = 1
    elapsed_days_min = 0        # minutes contributed by fully elapsed days

    # Minute-of-day of the bucket currently being filled.  None (not 0) marks
    # "no bucket yet", so that a real timepoint at minute 0 is handled like
    # any other minute.
    day_minute_counter = None
    current_list = []

    for row in scr:
        print(row)

        # Only the parsing can legitimately fail; keep the try body minimal.
        try:
            # A data row contains exactly one ":" separating two integers.
            x_value, y_value = row.split(":")
            x_val = int(x_value.strip(' '))
            y_val = int(y_value.strip(' '))
        except ValueError:
            # Header / axis information or any other non-numeric row.
            count_exceptions += 1
            data_from_exception[count_exceptions] = row
            continue

        if day_minute_counter is None:
            print('Start %s' % x_val)
            day_minute_counter = x_val

        # Same timepoint as the open bucket: just collect the value.
        if x_val == day_minute_counter:
            print('Append %s %s' % (day_minute_counter, x_val))
            current_list.append(y_val)
            continue

        # The timepoint changed, so the open bucket is complete.  Store it
        # using the offset of the day it belongs to (before any rollover).
        raw_data[day_minute_counter + elapsed_days_min] = current_list

        if x_val < day_minute_counter:
            # Time went backwards: the next day has started.
            processed_days += 1
            print('Next Day Marker %s %s %s'
                  % (day_minute_counter, x_val, current_list))
            elapsed_days_min += day_in_minutes
            print('elapsed_day in minutes :  %s' % elapsed_days_min)
        else:
            print('Done %s %s %s' % (day_minute_counter, x_val, current_list))

        # Open a fresh bucket (a new list object, so the stored one is never
        # mutated afterwards) that starts with the value of the current row.
        day_minute_counter = x_val
        current_list = [y_val]

    # End of input: flush the last open bucket, if any row was parsed at all.
    if day_minute_counter is not None:
        raw_data[day_minute_counter + elapsed_days_min] = current_list

    print('\nRaw Data   : %s\nOther info : %s\n '
          % (raw_data, data_from_exception))

    return (raw_data, processed_days, data_from_exception)

def calc_mean(scr):
    """Compute the mean waiting time for every timepoint that has data.

    Args:
        scr: a sequence whose first item is a dict mapping a timepoint
            (minute) to a list of numeric waiting times and whose second
            item is the number of days covered (used only for the
            informational message printed at the start).

    Returns:
        A list of ``(timepoint, mean_wait)`` tuples in ascending timepoint
        order.  Timepoints whose list is empty are skipped.
    """
    data = scr[0]
    days = scr[1]
    minutes = days * 24 * 60
    print('Dataset spans a total of "%s" minutes.\n' % minutes)

    result = []

    # Walk the timepoints that actually exist instead of probing every
    # minute of the span: this keeps minute 0 and any key beyond
    # days * 1440 (continuous time scale), and needs no exception handling
    # for absent minutes.
    for x_datapoint in sorted(data):
        dataset = data[x_datapoint]

        # An empty bucket has no mean; skip it rather than divide by zero.
        if not dataset:
            continue

        # float() keeps true division on interpreters where "/" on two
        # integers truncates.
        meanwait = float(sum(dataset)) / float(len(dataset))
        result.append((x_datapoint, meanwait))

        print('Patient meanwaiting time per Timepoint %s : %.03f'
              % (x_datapoint, meanwait))

    return result

def main():
    """Run the demo: group the sample rows, then print the mean waits.

    Uses the module-level ``datas`` sample as input, passes it through
    ``sort_data`` and ``calc_mean`` and prints the outcome to stdout.
    Nothing is returned.
    """
    # open file code here and use readlines to import data to "datas"
    #
    # datas = ...

    # Timestamp cut to 23 characters, i.e. at most millisecond resolution.
    ct = str(datetime.datetime.now())[0:23]

    # Single-argument print(...) calls behave the same whether print is a
    # statement or a function.
    print('%s --> Collecting patient waittime data from Time Series.\n' % ct)

    sorted_data = sort_data(datas)   # uses the sample data from this script.

    print('Processing data to obtain main values')

    the_result = calc_mean(sorted_data)

    print('\nProcessing Finished. Here is the result :\n\n%s' % the_result)

    # create new file and store result or keep processing to PDF in matplotlib

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()