使用pandas计算csv中某一列的均值

时间:2015-11-19 00:04:15

标签: python python-3.x numpy pandas data-analysis

我在csv文件中有一些数据如下所示(此处仅显示部分数据)。

SourceID   local_date   local_time  Vge    BSs              PC         hour Type
7208       8/01/2015    11:00:19    15.4    87             +BC_MSG      11  MAIN
11060      8/01/2015    11:01:56    14.9    67             +AB_MSG      11  MAIN
3737       8/01/2015    11:02:09    15.4    88             +AB_MSG      11  MAIN
9683       8/01/2015    11:07:19    14.9    69             +AB_MSG      11  MAIN
9276       8/01/2015    11:07:52    15.4    88             +AB_MSG      11  MAIN
7754       8/01/2015    11:09:26    14.7    62             +AF_MSG      11  MAIN
11111      8/01/2015    11:10:06    15.2    80             +AF_MSG      11  MAIN
9276       8/01/2015    11:10:52    15.4    88             +AB_MSG      11  MAIN
11111      8/01/2015    11:12:56    15.2    80             +AB_MSG      11  MAIN
6148       8/01/2015    11:15:29    15      70             +AB_MSG      11  MAIN
11111      8/01/2015    11:15:56    15.2    80             +AB_MSG      11  MAIN
9866       8/01/2015    11:16:28    4.102   80             +SUB_MSG     11  SUB
9866       8/01/2015    11:16:38    15.1    78             +AH_MSG      11  MAIN
9866       8/01/2015    11:16:38    4.086   78             +SUB_MSG     11  SUB
20729      8/01/2015    11:23:21    11.6    82             +AB_MSG      11  MAIN
9276       8/01/2015    11:25:52    15.4    88             +AB_MSG      11  MAIN
11111      8/01/2015    11:34:16    15.2    80             +AF_MSG      11  MAIN
20190      8/01/2015    11:36:09    11.2    55             +AF_MSG      11  MAIN
7208       8/01/2015    11:37:09    15.3    85             +AB_MSG      11  MAIN
7208       8/01/2015    11:38:39    15.3    86             +AB_MSG      11  MAIN
7754       8/01/2015    11:39:16    14.7    61             +AB_MSG      11  MAIN
8968       8/01/2015    11:39:39    15.5    91             +AB_MSG      11  MAIN
3737       8/01/2015    11:41:09    15.4    88             +AB_MSG      11  MAIN
9683       8/01/2015    11:41:39    14.9    69             +AF_MSG      11  MAIN
20729      8/01/2015    11:44:36    11.6    81             +AB_MSG      11  MAIN
9704       8/01/2015    11:45:20    14.9    68             +AF_MSG      11  MAIN
11111      8/01/2015    11:46:06    4.111   87             +SUB_MSG     11  PAN

我有以下使用pandas处理此输入的python程序

import sys
import csv
import operator
import os
from glob import glob
import fileinput
from relativeDates import *
import datetime
import math
import pprint
import numpy as np
import pandas as pd
from io import StringIO

# Name of the data collection; it is substituted into the folder path below.
COLLECTION = 'NEW'
# Folder holding the CSV files for the selected collection.
BATTERY = r'C:\MyFolder\Analysis\\{}'.format(COLLECTION)
# Input and output CSV paths, both located inside the BATTERY folder.
# (They must be built from BATTERY: no other base-path name is defined here.)
INPUT_FILE = BATTERY + r'\in.csv'
OUTPUT_FILE = BATTERY + r'\out.csv'


# Load only the columns used below. pandas opens and closes the file itself
# when given a path, so no separate file handle is needed.
df = pd.read_csv(
    INPUT_FILE,
    usecols=["SourceID", "local_date", "local_time", "Vge", 'BSs', 'PC'],
    header=0,
)

# Drop rows that are exact repeats across all loaded columns.
df.drop_duplicates(inplace=True)

# Build an "elapsed hour" column: the clock hour taken from local_time, plus
# 24 for every date change seen so far. Dates are numbered in order of first
# appearance, so this works for one, two or more dates in the file.
# NOTE(review): assumes rows are in chronological order and that successive
# distinct dates are consecutive calendar days -- confirm against the data.
day_index, _ = pd.factorize(df['local_date'])
clock_hour = df['local_time'].str.split(':').str[0].astype(int)
# Stored as integers so that grouping sorts hours numerically (9 before 10).
df['hour'] = clock_hour + 24 * day_index

df.set_index(['SourceID', 'local_date', 'local_time', 'Vge'], inplace=True)

# Classify each row by its PC message: anything containing 'SUB' is a SUB
# record, everything else (including a missing PC value) is MAIN.
is_sub = df['PC'].str.contains('SUB', regex=False, na=False)
df['Type'] = np.where(is_sub, 'SUB', 'MAIN')

# Mean BSs for every (hour, Type) combination.
print(df.groupby(['hour', 'Type'])['BSs'].mean())

df.to_csv(OUTPUT_FILE)

我希望随着时间的推移获得BSs字段的平均值。这就是我在上面的print(df.groupby(['hour','Type'])['BSs'].aggregate(np.mean))中尝试做的事情。

然而,有几点需要考虑。

  1. Vge值可根据Type字段归为2种类型(2 types)
  2. 我们获得的Vge值的数量可能因时而异。
  3. 整个数据集为24小时。
  4. 可以从多个SourceID收到Vge值。
  5. 不同SourceID的Vge值之间可能略有不同,但在同一时间间隔(同一小时)内应该有些相似
  6. 在这种情况下,如上所述print(df.groupby(['hour','Type'])['BSs'].aggregate(np.mean))的简单均值计算不充分,因为不同时间段(小时)内的样本数量不同。

    在这种情况下应该使用什么函数?

0 个答案:

没有答案