如何使用Pandas extractall()?

时间:2017-01-16 06:30:55

标签: python pandas

我在Pandas数据帧(DataFrame)中存储了几百万条事件日志。其中一列名为data,存储事件日志的详细信息。下面是一个例子:

SubjectUserName=XXXX, SubjectDomainName=XX, TargetUserName=XXXX, TargetDomainName=XX.LOCAL, TargetServerName=XXXX.xx.local, TargetInfo=exchangeMDB/XXXX.xx.local, ProcessName=C:\Windows\System32\rundll32.exe

根据所捕获事件的不同,这数百万行数据中包含的key=value对也各不相同。

我想提取所有的键(key),并统计每个键在整个数据集中出现的次数。例如:

item, count
SubjectUserName, 1000
TargetServerName, 2000
...

我的代码如下所示:

events = eventlogs()
items = events['data'].str.findall('([^ =]+)=')
items = items.value_counts()
return items

我收到以下错误:AssertionError: 1 columns passed, passed data had 25 columns

在这种情况下使用extractall()或其他方法的正确方法是什么?

提前感谢您的协助!

  • 亚历山大。

1 个答案:

答案 0 :(得分:2)

我认为您可以先用split按", "拆分,再用stack和to_frame得到只有一列的df,然后按=拆分,最后使用value_counts:

print (df)
                                                data
0  SubjectUserName=XXXX, SubjectDomainName=XX, Ta...
1  SubjectUserName=XXXX, SubjectDomainName=XX, Ta...

df = df.data.str.split(', ', expand=True).stack().to_frame('data')
splitted = df.data.str.split('=', expand=True)
splitted.columns = ['key','val']
print (splitted)
                   key                               val
0 0    SubjectUserName                              XXXX
  1  SubjectDomainName                                XX
  2     TargetUserName                              XXXX
  3   TargetDomainName                          XX.LOCAL
  4   TargetServerName                     XXXX.xx.local
  5         TargetInfo         exchangeMDB/XXXX.xx.local
  6        ProcessName  C:\Windows\System32\rundll32.exe
1 0    SubjectUserName                              XXXX
  1  SubjectDomainName                                XX
  2     TargetUserName                              XXXX
  3   TargetDomainName                          XX.LOCAL
  4   TargetServerName                     XXXX.xx.local
  5         TargetInfo         exchangeMDB/XXXX.xx.local
  6        ProcessName  C:\Windows\System32\rundll32.exe
items = splitted.key.value_counts().reset_index()
items.columns = ['item','count']
print (items)
                item  count
0    SubjectUserName      2
1   TargetServerName      2
2        ProcessName      2
3   TargetDomainName      2
4  SubjectDomainName      2
5     TargetUserName      2
6         TargetInfo      2

如果data列中没有NaN值,则另一个更快的解决方案是使用DataFrame构造函数和list comprehension:

df = pd.DataFrame([ x.split(', ') for x in df['data'].values.tolist()]).stack().to_frame('data')
print (df)
                                             data
0 0                          SubjectUserName=XXXX
  1                          SubjectDomainName=XX
  2                           TargetUserName=XXXX
  3                     TargetDomainName=XX.LOCAL
  4                TargetServerName=XXXX.xx.local
  5          TargetInfo=exchangeMDB/XXXX.xx.local
  6  ProcessName=C:\Windows\System32\rundll32.exe
1 0                          SubjectUserName=XXXX
  1                          SubjectDomainName=XX
  2                           TargetUserName=XXXX
  3                     TargetDomainName=XX.LOCAL
  4                TargetServerName=XXXX.xx.local
  5          TargetInfo=exchangeMDB/XXXX.xx.local
  6  ProcessName=C:\Windows\System32\rundll32.exe

splitted = pd.DataFrame([ x.split('=') for x in df['data'].values.tolist()])
splitted.columns = ['key','val']
items = splitted.key.value_counts().reset_index()
items.columns = ['item','count']
print (items)
                item  count
0    SubjectUserName      2
1   TargetServerName      2
2        ProcessName      2
3   TargetDomainName      2
4  SubjectDomainName      2
5     TargetUserName      2
6         TargetInfo      2

import Foundation

extension Renter {
    /// A `[String: Any]` snapshot of this renter, keyed by the `UserController` key constants.
    ///
    /// Returns `nil` when any of the required fields is `nil`: `email`, `wantedZipCode`,
    /// `wantedCity`, `wantedState`, `wantedCountry`, `creditRating`, `firstName`,
    /// `lastName` or `id`.
    ///
    /// Optional descriptive fields fall back to the placeholder values given with `??` below.
    /// `wantedPayment` and `wantedBedroomCount` are converted with `Int(...)`.
    ///
    /// Two entries are added only when their source can be read:
    /// - `UserController.kImageURLS`, when `profileImages` casts to `[ProfileImage]`.
    /// - `UserController.kOccupationHistory`, when `occupation` casts to `[Occupation]`.
    ///
    /// These two entries are independent of each other: a renter without profile images
    /// still gets its occupation history.
    var dictionaryRepresentation: [String: Any]? {
        guard let email = email,
            let zipCode = wantedZipCode,
            let city = wantedCity,
            let state = wantedState,
            let country = wantedCountry,
            let creditRating = creditRating,
            let firstName = firstName,
            let lastName = lastName,
            let id = id
            else { return nil }

        // Named differently from the property to avoid shadowing it inside its own getter.
        var representation: [String: Any] = [UserController.kEmail: email,
                UserController.kZipCode: zipCode,
                UserController.kCity: city,
                UserController.kState: state,
                UserController.kCountry: country,
                UserController.kCreditRating: creditRating,
                UserController.kPetsAllowed: wantsPetFriendly,
                UserController.kSmokingAllowed: wantsSmoking,
                UserController.kWasherDryer: wantsWasherDryer,
                UserController.kGarage: wantsGarage,
                UserController.kDishwasher: wantsDishwasher,
                UserController.kBackyard: wantsBackyard,
                UserController.kPool: wantsPool,
                UserController.kGym: wantsGym,
                UserController.kFirstName: firstName,
                UserController.kLastName: lastName,
                UserController.kMonthlyPayment: Int(wantedPayment),
                UserController.kID: id,
                UserController.kBedroomCount: Int(wantedBedroomCount),
                UserController.kBathroomCount: wantedBathroomCount,
                UserController.kBio: bio ?? "No bio available",
                UserController.kStarRating: starRating,
                UserController.kMaritalStatus: maritalStatus ?? "Not specified",
                UserController.kCurrentOccupation: currentOccupation ?? "No occupation yet",
                UserController.kWithinRangeMiles: withinRangeMiles,
                UserController.kBankruptcies: bankruptcies,
                UserController.kCriminalHistory: criminalHistory ?? "",
                UserController.kDriversLicenseNumber: driversLicenceNum ?? "",
                UserController.kDriversLicensePicURL: driversLicensePicURL ?? "",
                UserController.kEvictionHistory: evictionHistory ?? "",
                UserController.kIncome: income ?? 0,
                UserController.kIsStudent: isStudent ?? false,
                UserController.kIsVerified: isVerified ?? false,
                UserController.kPreviousAddress: previousAddress ?? "",
                UserController.kReasonsForLeaving: reasonForLeaving ?? "",
                UserController.kSchool: school ?? "",
                UserController.kStudentID: studentID ?? "",
                UserController.kStudentPhotoIdURL: studentPhotoIDURL ?? ""]

        // NOTE(review): `flatMap` presumably drops nil `imageURL` values here; if so,
        // `compactMap` is the Swift 4.1+ spelling — confirm the property type and the
        // project's toolchain before switching.
        if let profileImageArray = profileImages?.array as? [ProfileImage] {
            let imageURLs = profileImageArray.flatMap({ $0.imageURL })
            representation[UserController.kImageURLS] = imageURLs
        }

        // Evaluated regardless of whether profile images were available above.
        if let occupationHistory = occupation?.allObjects as? [Occupation] {
            let occupationDicts = occupationHistory.flatMap({ $0.dictionaryRepresentation })
            representation[UserController.kOccupationHistory] = occupationDicts
        }

        return representation
    }

}