Question

我有一个列

的数据集

 1445544152817 SEND_MSG  123
 1445544152817 SEND_MSG  123
 1445544152829 SEND_MSG  135
 1445544152829 SEND_MSG  135
 1445544152830 SEND_MSG  135
 1445544152830 GET_QUEUE 12
 1445544152830 SEND_MSG  136
 1445544152830 SEND_MSG  136
 1445544152892 GET_LATEST_MSG_DELETE  26

我将列命名为：timestamp type和response_time 我这样做：

df = read_csv(output_path,names=header_row, sep=' ')

当我输出df时它很好，它给了我文件的所有值。问题？当我做的时候

df = df[df['type'] == 'SEND_MSG']

df有0行！怎么会？这不正确，因为文件和df的行具有type = SEND_MSG

这是我的计划：

warm_up = 100
cool_down = 100


def refine(df):
    start_time = np.min(df['timestamp'])
    #print start_time.columns[0]
    end_time = np.max(df['timestamp'])
    #print end_time.columns[0]
    new_start_time = start_time + (10 * 1000)
    #new_end_time = 0
    df = df[df['timestamp'] > new_start_time]
    #df = df[df['timestamp'] < new_end_time]
    return df


def ci(data):
    n, min_max, mean, var, skew, kurt = scipy.stats.describe(data)
    std = math.sqrt(var)
    error_margin = 1.96 * (std / np.sqrt(n))
    l, h = mean - error_margin, mean + error_margin
    return (l, h)


MSG_TYPE = {
    'SEND_MSG', 'GET_QUEUE', 'GET_LATEST_MSG_DELETE'
}
COLORS = ['r','g','b']


def main():
    output_path = "/Users/ramapriyasridharan/Documents/SystemsLabExperiements/merged.txt"

    xlabel = "Time in minutes"
    ylabel = "Response time in ms"
    header_row = ['timestamp','type','response_time']
    df = read_csv(output_path,names=header_row, sep=' ')
    #df = refine(df)
    min_timestamp = np.min(df['timestamp'])




    df['timestamp'] = df['timestamp'] - min_timestamp
    # convert time to minutes
    df['timestamp'] = np.round(df['timestamp'] / 60000)
    # filter all outlier above 70 seconds reponse times
    #df = df[df['response_time'] < 70 ]
    df['type'] = df['type']
    i = 0
    print df['type']
    for msg in MSG_TYPE:
        print msg
        df = df[df['type'] == msg]
        print len(df)
        response_mean = np.mean(df['response_time'])
        response_median = np.median(df['response_time'])
        response_std = np.std(df['response_time'])
        l,h = ci(df['response_time'])
        max_resp = np.max(df['response_time'])
        print "For msg_type = %s maximum response time %s"%(msg,max_resp)
        print "For msg_type = %s Response time avg = %.3f +- %.3f std = %.3f and Median = %.3f "%(msg,np.round(response_mean,3),np.round(h-response_mean,3),np.round(response_median,3),np.round(response_std,3))
        # round to nearest minute
        #find number of timestamps greater than 100
        #print df[df['response_time'] > 70]
        grp_by_timestamp_df = df.groupby('timestamp')
        mean_resp_per_min = grp_by_timestamp_df['response_time'].mean()
        #print mean_resp_per_min[0:36]
        plt.plot(mean_resp_per_min, 'x-', color=COLORS[i], label='%s requests'%msg, lw=0.5)
        i += 1

    response_mean = np.mean(df['response_time'])
    response_median = np.median(df['response_time'])
    response_std = np.std(df['response_time'])
    l,h = ci(df['response_time'])
    max_resp = np.max(df['response_time'])
    print "For msg_type = %s maximum response time %s"%('ALL',max_resp)
    print "For msg_type = %s Response time avg = %.3f +- %.3f std = %.3f and Median = %.3f "%('ALL',np.round(response_mean,3),np.round(h-response_mean,3),np.round(response_median,3),np.round(response_std,3))
    # round to nearest minute
    #find number of timestamps greater than 100
    #print df[df['response_time'] > 70]
    grp_by_timestamp_df = df.groupby('timestamp')
    mean_resp_per_min = grp_by_timestamp_df['response_time'].mean()
    #print mean_resp_per_min[0:36]

    plt.plot(mean_resp_per_min, 'x-', color='k', label='ALL requests', lw=0.5)
    plt.xlim(xmin=0.0,xmax=30)
    plt.ylim(ymin=0.0,ymax=20)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc="best", fancybox=True, framealpha=0.5)
    plt.grid()
    plt.show()

    #print df['response_time']

编辑：我发现了问题，但没有解决方案

我的实际数据看起来像我以前粘贴的那样，但当我把它放在数据框中时，它看起来像这样，在类型之前有空格

22059    GET_LATEST_MSG_DELETE
22060    GET_LATEST_MSG_DELETE
22061    GET_LATEST_MSG_DELETE
22062    GET_LATEST_MSG_DELETE
22063                GET_QUEUE
22064                GET_QUEUE
22065                GET_QUEUE
22066                GET_QUEUE
22067                GET_QUEUE
22068                GET_QUEUE
22069                GET_QUEUE
22070                GET_QUEUE
22071                GET_QUEUE
22072    GET_LATEST_MSG_DELETE
22073    GET_LATEST_MSG_DELETE
22074    GET_LATEST_MSG_DELETE
22075    GET_LATEST_MSG_DELETE
22076    GET_LATEST_MSG_DELETE
22077    GET_LATEST_MSG_DELETE
22078    GET_LATEST_MSG_DELETE
22079    GET_LATEST_MSG_DELETE
22080    GET_LATEST_MSG_DELETE
22081    GET_LATEST_MSG_DELETE
22082    GET_LATEST_MSG_DELETE

get_queue前面有一个前导空格，如何解决，我的实际数据中不存在此空间

编辑：问题是类型中包含可变大小的元素，我该如何解决？

Answer 1

由于您只需要一个值（SEND_MSG），因此可以执行此操作：

import pandas as pd

df = pd.read_clipboard()
df.columns = ['timestamp', 'type', 'response_time']
print df.loc[df['type'] == 'SEND_MSG']

输出：

       timestamp      type  response_time
0  1445544152817  SEND_MSG            123
1  1445544152829  SEND_MSG            135
2  1445544152829  SEND_MSG            135
3  1445544152830  SEND_MSG            135
5  1445544152830  SEND_MSG            136
6  1445544152830  SEND_MSG            136

重要的一点是：

df.loc[df['type'] == 'SEND_MSG']

pandas数据框查找具有特定列值的所有行？

1 个答案: