我在一个 pandas DataFrame 中存储了几百万条事件日志。其中名为 data 的一列存储事件日志的详细信息。下面是一个例子:
SubjectUserName=XXXX, SubjectDomainName=XX, TargetUserName=XXXX, TargetDomainName=XX.LOCAL, TargetServerName=XXXX.xx.local, TargetInfo=exchangeMDB/XXXX.xx.local, ProcessName=C:\Windows\System32\rundll32.exe
根据捕获到的事件不同,这数百万行中包含的 key=value 对也各不相同。
之后,我想提取所有的键(key),并统计每个键在整个数据集中出现的次数。例如:
item, count
SubjectUserName, 1000
TargetServerName, 2000
...
我的代码如下所示:
events = eventlogs()
items = events['data'].str.findall('([^ =]+)=')
items = items.value_counts()
return items
我收到以下错误:AssertionError: 1 columns passed, passed data had 25 columns
。
在这种情况下使用extractall()或其他方法的正确方法是什么?
提前感谢您的协助!
答案 0 :(得分:2)
我认为您可以先使用 split、stack 和 to_frame 得到只有一列的 df,然后按 = 拆分,最后使用 value_counts:
print (df)
data
0 SubjectUserName=XXXX, SubjectDomainName=XX, Ta...
1 SubjectUserName=XXXX, SubjectDomainName=XX, Ta...
df = df.data.str.split(', ', expand=True).stack().to_frame('data')
splitted = df.data.str.split('=', expand=True)
splitted.columns = ['key','val']
print (splitted)
key val
0 0 SubjectUserName XXXX
1 SubjectDomainName XX
2 TargetUserName XXXX
3 TargetDomainName XX.LOCAL
4 TargetServerName XXXX.xx.local
5 TargetInfo exchangeMDB/XXXX.xx.local
6 ProcessName C:\Windows\System32\rundll32.exe
1 0 SubjectUserName XXXX
1 SubjectDomainName XX
2 TargetUserName XXXX
3 TargetDomainName XX.LOCAL
4 TargetServerName XXXX.xx.local
5 TargetInfo exchangeMDB/XXXX.xx.local
6 ProcessName C:\Windows\System32\rundll32.exe
items = splitted.key.value_counts().reset_index()
items.columns = ['item','count']
print (items)
item count
0 SubjectUserName 2
1 TargetServerName 2
2 ProcessName 2
3 TargetDomainName 2
4 SubjectDomainName 2
5 TargetUserName 2
6 TargetInfo 2
如果 data 列中没有 NaN 值,还有另一个更快的解决方案:使用 DataFrame 构造函数配合列表推导式(list comprehension):
df = pd.DataFrame([ x.split(', ') for x in df['data'].values.tolist()])
.stack()
.to_frame('data')
print (df)
data
0 0 SubjectUserName=XXXX
1 SubjectDomainName=XX
2 TargetUserName=XXXX
3 TargetDomainName=XX.LOCAL
4 TargetServerName=XXXX.xx.local
5 TargetInfo=exchangeMDB/XXXX.xx.local
6 ProcessName=C:\Windows\System32\rundll32.exe
1 0 SubjectUserName=XXXX
1 SubjectDomainName=XX
2 TargetUserName=XXXX
3 TargetDomainName=XX.LOCAL
4 TargetServerName=XXXX.xx.local
5 TargetInfo=exchangeMDB/XXXX.xx.local
6 ProcessName=C:\Windows\System32\rundll32.exe
splitted = pd.DataFrame([ x.split('=') for x in df['data'].values.tolist()])
splitted.columns = ['key','val']
items = splitted.key.value_counts().reset_index()
items.columns = ['item','count']
print (items)
item count
0 SubjectUserName 2
1 TargetServerName 2
2 ProcessName 2
3 TargetDomainName 2
4 SubjectDomainName 2
5 TargetUserName 2
6 TargetInfo 2
(以上即为 data 列中没有 NaN 值时的另一个更快的解决方案。)
import Foundation
extension Renter {

    /// A dictionary snapshot of this renter, keyed by the `UserController` key constants.
    ///
    /// Returns `nil` when any of the required fields is missing: `email`,
    /// `wantedZipCode`, `wantedCity`, `wantedState`, `wantedCountry`,
    /// `creditRating`, `firstName`, `lastName` or `id`.
    ///
    /// Optional descriptive fields fall back to placeholder values (for example
    /// `"No bio available"` for `bio`, `""` for most strings, `0` for `income`
    /// and `false` for the boolean flags) so that their keys are always present.
    ///
    /// The image-URL list and the occupation history are each added only when the
    /// corresponding relationship can be cast to the expected element type. The two
    /// are independent: a missing `profileImages` no longer prevents the occupation
    /// history from being included.
    var dictionaryRepresentation: [String: Any]? {
        // Required fields: without all of these no representation is produced.
        guard let email = email,
            let zipCode = wantedZipCode,
            let city = wantedCity,
            let state = wantedState,
            let country = wantedCountry,
            let creditRating = creditRating,
            let firstName = firstName,
            let lastName = lastName,
            let id = id
            else { return nil }

        // Named `representation` rather than `dictionaryRepresentation` so the local
        // does not shadow the property being computed.
        var representation: [String: Any] = [
            UserController.kEmail: email,
            UserController.kZipCode: zipCode,
            UserController.kCity: city,
            UserController.kState: state,
            UserController.kCountry: country,
            UserController.kCreditRating: creditRating,
            UserController.kPetsAllowed: wantsPetFriendly,
            UserController.kSmokingAllowed: wantsSmoking,
            UserController.kWasherDryer: wantsWasherDryer,
            UserController.kGarage: wantsGarage,
            UserController.kDishwasher: wantsDishwasher,
            UserController.kBackyard: wantsBackyard,
            UserController.kPool: wantsPool,
            UserController.kGym: wantsGym,
            UserController.kFirstName: firstName,
            UserController.kLastName: lastName,
            UserController.kMonthlyPayment: Int(wantedPayment),
            UserController.kID: id,
            UserController.kBedroomCount: Int(wantedBedroomCount),
            UserController.kBathroomCount: wantedBathroomCount,
            UserController.kBio: bio ?? "No bio available",
            UserController.kStarRating: starRating,
            UserController.kMaritalStatus: maritalStatus ?? "Not specified",
            UserController.kCurrentOccupation: currentOccupation ?? "No occupation yet",
            UserController.kWithinRangeMiles: withinRangeMiles,
            UserController.kBankruptcies: bankruptcies,
            UserController.kCriminalHistory: criminalHistory ?? "",
            UserController.kDriversLicenseNumber: driversLicenceNum ?? "",
            UserController.kDriversLicensePicURL: driversLicensePicURL ?? "",
            UserController.kEvictionHistory: evictionHistory ?? "",
            UserController.kIncome: income ?? 0,
            UserController.kIsStudent: isStudent ?? false,
            UserController.kIsVerified: isVerified ?? false,
            UserController.kPreviousAddress: previousAddress ?? "",
            UserController.kReasonsForLeaving: reasonForLeaving ?? "",
            UserController.kSchool: school ?? "",
            UserController.kStudentID: studentID ?? "",
            UserController.kStudentPhotoIdURL: studentPhotoIDURL ?? ""
        ]

        // Optional section 1: profile image URLs. `compactMap` replaces the deprecated
        // optional-returning `flatMap` overload and drops images without a URL.
        // NOTE(review): assumes `ProfileImage.imageURL` is optional — confirm.
        if let profileImageArray = profileImages?.array as? [ProfileImage] {
            let imageURLs = profileImageArray.compactMap { $0.imageURL }
            representation[UserController.kImageURLS] = imageURLs
        }

        // Optional section 2: occupation history. Handled independently of the images
        // so that one missing relationship does not suppress the other.
        // NOTE(review): assumes `Occupation.dictionaryRepresentation` is optional — confirm.
        if let occupationHistory = occupation?.allObjects as? [Occupation] {
            let occupationDicts = occupationHistory.compactMap { $0.dictionaryRepresentation }
            representation[UserController.kOccupationHistory] = occupationDicts
        }

        return representation
    }
}