Python:统计数据框某一列所有行中特定字符的出现次数

时间:2015-08-21 18:49:01

标签: python pandas count

我有一个数据框(df),其中包含 ['toaddress', 'ccaddress', 'body'] 这几列。

我想遍历数据框的索引,以获取 toaddress 和 ccaddress 字段中电子邮件地址数量的最小值、最大值和平均值;地址数量是通过统计这两列每个字段中 '@' 的出现次数来确定的。

如果所有其他方法都失败了,我想我可以使用 df.toaddress.str.contains(r'@').sum() 并将其除以数据框中的行数来获取平均值,但我认为它只会统计至少含有 1 个 @ 符号的行数。

4 个答案:

答案 0 :(得分:1)

您可以使用

df[['toaddress', 'ccaddress']].applymap(lambda x: str.count(x, '@'))

获取每个单元格中'@'的计数。

然后,您只需对结果沿行轴计算 pandas 的 max、min 和 mean。

正如我在原问题下的评论中所说,您已经提到了 df.toaddress.str.contains(r'@').sum();您也可以改用 df.toaddress.str.count(r'@'),而不是我上面展示的方法。

答案 1 :(得分:0)

len(filter(lambda df: df.toaddress.str.contains(r'@'),rows))

甚至

len(filter(lambda df: r'@' in str(df.toaddress), rows))

答案 2 :(得分:0)

也许是这样的

Object.assign()

返回:

#include <iostream>
#include <fstream>
#include <string>
#include <iomanip>
#include <cmath>
#include <cstdlib>
using namespace std;

/// Payroll report generator.
///
/// Reads whitespace-separated employee records from "employee.txt" (opened by
/// the constructor) in the order
///     employeeid  SMH  SSN  hoursworked  hourlyrate  firstname  lastname
/// and, for each record, computes gross pay, tax and net pay, prints one
/// report row, and finally prints the average net pay over all records.
///
/// Standard-library names are spelled with `std::` so the class does not
/// depend on a `using namespace std;` directive being in effect.
class payroll {
    std::ifstream fin;

    // Text fields are std::string rather than fixed-size char buffers so an
    // over-long token in the input file cannot overrun them.
    std::string employeeid;
    std::string firstname;
    std::string lastname;

    char SMH = ' ';  // status code read from the file; 'S'/'s' adds 0.05 to the tax rate
    int SSN = 0;
    int hoursworked = 0;
    int overtimehours = 0;
    double hourlyrate = 0.0;
    double regularpay = 0.0;
    double overtimepay = 0.0;
    double grosspay = 0.0;
    double taxrate = 0.0;
    double taxamount = 0.0;
    double netpay = 0.0;
    double sum = 0.0;      // running total of netpay across records
    double average = 0.0;  // last value computed by findavg()

public:
    /// Opens "employee.txt" for reading. Failure to open is reported later,
    /// by printreport().
    payroll() {
        fin.open("employee.txt");
    }

    ~payroll();

private:
    /// Splits hoursworked into regular and overtime pay (hours above 40 are
    /// paid at 1.5x the hourly rate) and sums them into grosspay.
    void calcgrosspay() {
        if (hoursworked > 40) {
            overtimehours = hoursworked - 40;
            overtimepay = overtimehours * hourlyrate * 1.5;
            regularpay = 40 * hourlyrate;
        } else {
            overtimehours = 0;
            overtimepay = 0;
            regularpay = hoursworked * hourlyrate;
        }
        // Must come after the branches: summing first would use the previous
        // record's (or, for the first record, indeterminate) pay components.
        grosspay = regularpay + overtimepay;
    }

    /// Picks the tax rate from grosspay (>= 500 -> 0.30, > 200 -> 0.20,
    /// otherwise 0.10), adds 0.05 when SMH is 'S'/'s', and computes taxamount.
    void calctax() {
        if (grosspay >= 500) taxrate = .30;
        else if (grosspay > 200.00) taxrate = .20;
        else taxrate = .10;
        if (SMH == 'S' || SMH == 's')
            taxrate = taxrate + .05;
        taxamount = grosspay * taxrate;
    }

    /// Net pay is gross pay minus the tax amount.
    void calcNetpay() {
        netpay = grosspay - taxamount;
    }

    /// Prints the report title and the column-heading banner.
    void printheadings() {
        std::cout << std::setw(49) << "-Payroll Report-" << '\n';
        std::cout << "------------------------------------------------------------------------------" << '\n';
        std::cout << "ID  First Name Last Name Stat SSN HW  HR  OT  OP   GP   Tax  Net" << '\n';
        std::cout << "==============================================================================" << '\n';
        std::cout << "------------------------------------------------------------------------------" << '\n';
    }

    /// Prints one report row for the record currently held in the members.
    ///
    /// NOTE(review): the columns printed here (..., HR, gross, tax rate,
    /// regular pay, OT hours, OT pay, net) do not line up with the heading
    /// text (HW HR OT OP GP Tax Net) - confirm the intended layout.
    void printdata() {
        // A bare `setprecision(2);` statement builds a manipulator and throws
        // it away; it has to be inserted into the stream to take effect.
        std::cout << std::fixed << std::setprecision(2);
        std::cout << std::setw(14) << employeeid
                  << std::setw(16) << firstname
                  << std::setw(15) << lastname
                  << std::setw(6) << SMH
                  << std::setw(5) << SSN
                  << std::setw(6) << hoursworked
                  << std::setw(6) << hourlyrate
                  << std::setw(8) << grosspay
                  << std::setw(6) << taxrate
                  << std::setw(9) << regularpay
                  << std::setw(6) << overtimehours
                  << std::setw(6) << overtimepay
                  << std::setw(9) << netpay << '\n';
    }

    /// Adds the current record's netpay to the running total. The parameter
    /// is unused and kept only so the signature stays the same.
    void findsum(int /*i*/) {
        sum += netpay;
    }

    /// Computes, prints and returns the average net pay over `i` records.
    /// The first parameter is unused (the member `sum` is read instead).
    /// With no records the average is reported as 0 rather than dividing by
    /// zero.
    double findavg(double, int i) {
        average = (i > 0) ? sum / i : 0.0;
        std::cout << '\n' << "The netpay average is " << average << std::endl;
        return average;
    }

public:
    /// Prints the headings once, then one row per record read from the file,
    /// then the average net pay. Reports an error on stderr and prints
    /// nothing else if the input file could not be opened.
    void printreport() {
        if (!fin.is_open()) {
            std::cerr << "Unable to open employee.txt" << std::endl;
            return;
        }

        int i = 0;
        sum = 0.0;  // each report starts from a clean total
        printheadings();
        while (fin >> employeeid >> SMH >> SSN >> hoursworked >> hourlyrate >> firstname >> lastname) {
            calcgrosspay();
            calctax();
            calcNetpay();
            printdata();
            i++;
            findsum(i);
        }
        findavg(sum, i);
    }

};  // end of payroll class

/// Closes the input file. std::ifstream would also close it on destruction;
/// the explicit call is kept from the original.
payroll::~payroll() {
    fin.close();
}

int main() {
    payroll employee;
    employee.printreport();
    system("PAUSE");
}//main

答案 3 :(得分:0)

此答案使用 fake-factory(https://pypi.python.org/pypi/fake-factory)生成测试数据。

import pandas as pd
from random import randint
from faker import Factory
fake = Factory.create()

def emails():
    """Return a list of one to four fake e-mail addresses.

    One address is always generated; ``randint(0, 3)`` then decides how many
    additional addresses are appended.
    """
    addresses = [fake.email()]
    extra_count = randint(0, 3)
    addresses.extend(fake.email() for _ in range(extra_count))
    return addresses

# Build ten synthetic messages. The rows are collected first and the frame is
# constructed once: DataFrame.append was removed in pandas 2.0, and calling it
# in a loop copied the whole frame on every iteration.
rows = [
    {'toaddress': emails(), 'ccaddress': emails(), 'body': fake.text()}
    for _ in range(10)
]
df1 = pd.DataFrame(rows, columns=['toaddress', 'ccaddress', 'body'])

# Each toaddress/ccaddress cell holds a list of addresses, so len() of a cell
# is the number of e-mail addresses in that field.
print('toaddress length is {}'.format([len(x) for x in df1.toaddress.values]))
print('ccaddress length is {}'.format([len(x) for x in df1.ccaddress.values]))

最后两行是统计电子邮件地址数量的部分。我不确定您是否一定要专门检查 '@';也许您可以使用 fake-factory 生成一些测试数据作为示例?