我有一个非常大的文件(> 20GB,下面是用 grep 截取的一部分),它是 LDAP 数据库的转储。
grep 得到的示例输出:
P[containerVrsn:U(0)recordVrsn:U(0)size:U(188)ownGid:G[mdp:U(1090171286)seqNo:U(4920)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1064620)size:U(160)updateVersion:U(3)checksum:U(1696878703)EPC_SubscriberPot:R[userId:S12("123456789123")groups:A1[ix0:S0("")]services:A0[]blacklist_services:A0[]operatorInfo:A1[ix0:S17("roamingAllowed:NO")]pccSubscriberPotRef:M0[]notificationData:A0[]familyId:S0("")trafficIds:A0[]]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(232)ownGid:G[mdp:U(1090171286)seqNo:U(4921)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1064620)size:U(204)updateVersion:U(102)checksum:U(607801120)EPC_SubscriberPot:R[userId:S12("123456789123")groups:A1[ix0:S44("g_monthly_30Mb_pred:1130:26-09-2015T23:21:19")]services:A0[]blacklist_services:A0[]operatorInfo:A1[ix0:S17("roamingAllowed:NO")]pccSubscriberPotRef:M0[]notificationData:A0[]familyId:S0("")trafficIds:A0[]]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(188)ownGid:G[mdp:U(1090171286)seqNo:U(4922)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1064620)size:U(160)updateVersion:U(85)checksum:U(1685673318)EPC_SubscriberPot:R[userId:S12("123456789123")groups:A1[ix0:S0("")]services:A0[]blacklist_services:A0[]operatorInfo:A1[ix0:S17("roamingAllowed:NO")]pccSubscriberPotRef:M0[]notificationData:A0[]familyId:S0("")trafficIds:A0[]]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(232)ownGid:G[mdp:U(1090171286)seqNo:U(4923)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1064620)size:U(204)updateVersion:U(12)checksum:U(841837929)EPC_SubscriberPot:R[userId:S12("123456789123")groups:A1[ix0:S42("g_monthly_30Mb_pred:13:19-09-2015T23:36:25")]services:A0[]blacklist_services:A0[]operatorInfo:A1[ix0:S17("roamingAllowed:NO")]pccSubscriberPotRef:M0[]notificationData:A0[]familyId:S0("")trafficIds:A0[]]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(188)ownGid:G[mdp:U(1090171286)seqNo:U(4924)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1064620)size:U(160)updateVersion:U(17)checksum:U(1715232109)EPC_SubscriberPot:R[userId:S12("123456789123")groups:A1[ix0:S0("")]services:A0[]blacklist_services:A0[]operatorInfo:A1[ix0:S17("roamingAllowed:NO")]pccSubscriberPotRef:M0[]notificationData:A0[]familyId:S0("")trafficIds:A0[]]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(188)ownGid:G[mdp:U(1090171286)seqNo:U(4925)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1064620)size:U(160)updateVersion:U(3)checksum:U(1831293547)EPC_SubscriberPot:R[userId:S12("123456789123")groups:A1[ix0:S0("")]services:A0[]blacklist_services:A0[]operatorInfo:A1[ix0:S17("roamingAllowed:NO")]pccSubscriberPotRef:M0[]notificationData:A0[]familyId:S0("")trafficIds:A0[]]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(188)ownGid:G[mdp:U(1090171286)seqNo:U(4926)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1064620)size:U(160)updateVersion:U(5)checksum:U(1814191975)EPC_SubscriberPot:R[userId:S12("123456789123")groups:A1[ix0:S0("")]services:A0[]blacklist_services:A0[]operatorInfo:A1[ix0:S17("roamingAllowed:NO")]pccSubscriberPotRef:M0[]notificationData:A0[]familyId:S0("")trafficIds:A0[]]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(188)ownGid:G[mdp:U(1090171286)seqNo:U(4927)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1064620)size:U(160)updateVersion:U(7)checksum:U(1714312039)EPC_SubscriberPot:R[userId:S12("123456789123")groups:A1[ix0:S0("")]services:A0[]blacklist_services:A0[]operatorInfo:A1[ix0:S17("roamingAllowed:NO")]pccSubscriberPotRef:M0[]notificationData:A0[]familyId:S0("")trafficIds:A0[]]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(232)ownGid:G[mdp:U(1090171286)seqNo:U(4928)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1064620)size:U(204)updateVersion:U(60)checksum:U(694242598)EPC_SubscriberPot:R[userId:S12("123456789123")groups:A1[ix0:S44("g_monthly_30Mb_pred:1130:25-09-2015T19:50:04")]services:A0[]blacklist_services:A0[]operatorInfo:A1[ix0:S17("roamingAllowed:NO")]pccSubscriberPotRef:M0[]notificationData:A0[]familyId:S0("")trafficIds:A0[]]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(224)ownGid:G[mdp:U(1090171286)seqNo:U(4929)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1064620)size:U(196)updateVersion:U(20)checksum:U(678693063)EPC_SubscriberPot:R[userId:S12("123456789123")groups:A1[ix0:S34("g_m_q_3GB:1170:30-09-2015T21:41:41")]services:A0[]blacklist_services:A0[]operatorInfo:A1[ix0:S17("roamingAllowed:NO")]pccSubscriberPotRef:M0[]notificationData:A0[]familyId:S0("")trafficIds:A0[]]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(420)ownGid:G[mdp:U(1118178710)seqNo:U(4868)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1091971)size:U(392)updateVersion:U(3673)checksum:U(208178972)EPC_UsageControlAccumulatedPot:R[subscriberId:S12("123456789123")usageControlAccum:S382("{\"reportingGroups\":[{\"absoluteAccumulated\":{\"bidirVolume\":31460668,\"previousExpiryDate\":{\"time\":\"25-12-2014T18:19:45\",\"volume\":\"25-12-2014T18:19:45\"}},\"name\":\"5000\",\"restartInfo\":\"25-12-2014T18:19:45\",\"selected\":\"yes\",\"subscriberGroupName\":\"g_monthly_30Mb_pred\",\"subscriptionDate\":\"25-12-2014T18:19:45\",\"validityTime\":0}],\"version\":\"2.1\"}\n")]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(420)ownGid:G[mdp:U(1118178710)seqNo:U(4869)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1091971)size:U(392)updateVersion:U(269)checksum:U(255165461)EPC_UsageControlAccumulatedPot:R[subscriberId:S12("123456789123")usageControlAccum:S382("{\"reportingGroups\":[{\"absoluteAccumulated\":{\"bidirVolume\":20419748,\"previousExpiryDate\":{\"time\":\"19-09-2015T23:36:25\",\"volume\":\"19-09-2015T23:36:25\"}},\"name\":\"5000\",\"restartInfo\":\"19-09-2015T23:36:25\",\"selected\":\"yes\",\"subscriberGroupName\":\"g_monthly_30Mb_pred\",\"subscriptionDate\":\"19-09-2015T23:36:25\",\"validityTime\":0}],\"version\":\"2.1\"}\n")]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(120)ownGid:G[mdp:U(1118178710)seqNo:U(4870)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1091971)size:U(92)updateVersion:U(10)checksum:U(171763356)EPC_UsageControlAccumulatedPot:R[subscriberId:S12("123456789123")usageControlAccum:S46("{\"reportingGroups\":[],\"version\":\"2.1\"}\n")]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(420)ownGid:G[mdp:U(1118178710)seqNo:U(4871)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1091971)size:U(392)updateVersion:U(686)checksum:U(144346640)EPC_UsageControlAccumulatedPot:R[subscriberId:S12("123456789123")usageControlAccum:S382("{\"reportingGroups\":[{\"absoluteAccumulated\":{\"bidirVolume\":29013670,\"previousExpiryDate\":{\"time\":\"13-12-2014T23:19:19\",\"volume\":\"13-12-2014T23:19:19\"}},\"name\":\"5000\",\"restartInfo\":\"13-12-2014T23:19:19\",\"selected\":\"yes\",\"subscriberGroupName\":\"g_monthly_30Mb_pred\",\"subscriptionDate\":\"13-12-2014T23:19:19\",\"validityTime\":0}],\"version\":\"2.1\"}\n")]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(120)ownGid:G[mdp:U(1118178710)seqNo:U(4872)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1091971)size:U(92)updateVersion:U(185)checksum:U(92726418)EPC_UsageControlAccumulatedPot:R[subscriberId:S12("123456789123")usageControlAccum:S46("{\"reportingGroups\":[],\"version\":\"2.1\"}\n")]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(420)ownGid:G[mdp:U(1118178710)seqNo:U(4873)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1091971)size:U(392)updateVersion:U(139)checksum:U(247172114)EPC_UsageControlAccumulatedPot:R[subscriberId:S12("123456789123")usageControlAccum:S382("{\"reportingGroups\":[{\"absoluteAccumulated\":{\"bidirVolume\":31612708,\"previousExpiryDate\":{\"time\":\"29-11-2014T04:14:03\",\"volume\":\"29-11-2014T04:14:03\"}},\"name\":\"5000\",\"restartInfo\":\"29-11-2014T04:14:03\",\"selected\":\"yes\",\"subscriberGroupName\":\"g_monthly_30Mb_pred\",\"subscriptionDate\":\"29-11-2014T04:14:03\",\"validityTime\":0}],\"version\":\"2.1\"}\n")]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(420)ownGid:G[mdp:U(1118178710)seqNo:U(4874)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1091971)size:U(392)updateVersion:U(12196)checksum:U(714375199)EPC_UsageControlAccumulatedPot:R[subscriberId:S12("123456789123")usageControlAccum:S382("{\"reportingGroups\":[{\"absoluteAccumulated\":{\"bidirVolume\":32961591,\"previousExpiryDate\":{\"time\":\"25-09-2015T19:50:04\",\"volume\":\"25-09-2015T19:50:04\"}},\"name\":\"5000\",\"restartInfo\":\"25-09-2015T19:50:04\",\"selected\":\"yes\",\"subscriberGroupName\":\"g_monthly_30Mb_pred\",\"subscriptionDate\":\"25-09-2015T19:50:04\",\"validityTime\":0}],\"version\":\"2.1\"}\n")]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(412)ownGid:G[mdp:U(1118178710)seqNo:U(4875)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1091971)size:U(384)updateVersion:U(64555)checksum:U(3993616217)EPC_UsageControlAccumulatedPot:R[subscriberId:S12("123456789123")usageControlAccum:S372("{\"reportingGroups\":[{\"absoluteAccumulated\":{\"bidirVolume\":29110847,\"previousExpiryDate\":{\"time\":\"30-09-2015T21:41:41\",\"volume\":\"30-09-2015T21:41:41\"}},\"name\":\"5000\",\"restartInfo\":\"30-09-2015T21:41:41\",\"selected\":\"yes\",\"subscriberGroupName\":\"g_m_q_3GB\",\"subscriptionDate\":\"30-09-2015T21:41:41\",\"validityTime\":0}],\"version\":\"2.1\"}\n")]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(120)ownGid:G[mdp:U(1118178710)seqNo:U(4876)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1091971)size:U(92)updateVersion:U(1427)checksum:U(162455704)EPC_UsageControlAccumulatedPot:R[subscriberId:S12("123456789123")usageControlAccum:S46("{\"reportingGroups\":[],\"version\":\"2.1\"}\n")]]]
P[containerVrsn:U(0)recordVrsn:U(0)size:U(420)ownGid:G[mdp:U(1118178710)seqNo:U(4877)]logicalDbNo:U(1)classVrsn:U(1)timeStamp:U(0)dbRecord:T[classNo:U(1091971)size:U(392)updateVersion:U(166)checksum:U(261918226)EPC_UsageControlAccumulatedPot:R[subscriberId:S12("123456789123")usageControlAccum:S382("{\"reportingGroups\":[{\"absoluteAccumulated\":{\"bidirVolume\":31471028,\"previousExpiryDate\":{\"time\":\"31-05-2015T13:14:46\",\"volume\":\"31-05-2015T13:14:46\"}},\"name\":\"5000\",\"restartInfo\":\"31-05-2015T13:14:46\",\"selected\":\"yes\",\"subscriberGroupName\":\"g_monthly_30Mb_pred\",\"subscriptionDate\":\"31-05-2015T13:14:46\",\"validityTime\":0}],\"version\":\"2.1\"}\n")]]]
我找到了一条 sed 命令,它能把这个转储的一部分转换成带分隔符的文本格式。我阅读了 sed 文档并修改了这个脚本,从而能够提取其他所需的信息,
但我遇到了 sed 保存匹配字符串(反向引用)数量的限制(最多 9 个),想知道能否绕过这个限制重复使用它们。
修改后的脚本:
# Split the database dump into two pipe-delimited logs in a single sed pass:
#   file1.log - "userId|group" rows taken from EPC_SubscriberPot records
#   file2.log - "subscriberId|bidirVolume..." rows taken from
#               EPC_UsageControlAccumulatedPot records
# sed is run with -n, so nothing is printed to stdout; rows are only written
# through the w flag of whichever s command matches.
cat ./databaseBackup/database_output.txt | \
sed -n '
# EPC_SubscriberPot records.
# \1 is the userId and \2..\7 are up to six g_ group values. The patterns go
# from six groups down to one, and each writes one "userId|group" row per
# captured group. The last pattern has no g_ capture and writes "userId|".
# After one s command has rewritten the line, the remaining ones are not
# expected to match again, because the rewritten text normally no longer
# contains "userId".
'/EPC_SubscriberPot/' {
s/.*userId.*("\(.*\)")groups.*("\(g_.*\)\(g_.*\)\(g_.*\)\(g_.*\)\(g_.*\)\(g_.*\)")]services.*/\1|\2\n\1|\3\n\1|\4\n\1|\5\n\1|\6\n\1|\7/w file1.log
s/.*userId.*("\(.*\)")groups.*("\(g_.*\)\(g_.*\)\(g_.*\)\(g_.*\)\(g_.*\)")]services.*/\1|\2\n\1|\3\n\1|\4\n\1|\5\n\1|\6/w file1.log
s/.*userId.*("\(.*\)")groups.*("\(g_.*\)\(g_.*\)\(g_.*\)\(g_.*\)")]services.*/\1|\2\n\1|\3\n\1|\4\n\1|\5/w file1.log
s/.*userId.*("\(.*\)")groups.*("\(g_.*\)\(g_.*\)\(g_.*\)")]services.*/\1|\2\n\1|\3\n\1|\4/w file1.log
s/.*userId.*("\(.*\)")groups.*("\(g_.*\)\(g_.*\)")]services.*/\1|\2\n\1|\3/w file1.log
s/.*userId.*("\(.*\)")groups.*("\(g_.*\)")]services.*/\1|\2/w file1.log
s/.*userId.*("\(.*\)")groups.*/\1|/w file1.log
}
# EPC_UsageControlAccumulatedPot records.
# \1 is the subscriberId and \2..\9 are up to eight sections that start at
# "bidirVolume" and end shortly before "subscriptionDate". Together with \1
# the widest pattern uses all nine back-references that sed provides, so
# eight sections per record is the most this script can split out.
# The rows written to file2.log still contain raw text and are reduced
# further by a later sed pass.
'/EPC_UsageControlAccumulatedPot/' {
s/.*subscriberId.*("\(.*\)")usageControlAccum.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*/\1|\2\n\1|\3\n\1|\4\n\1|\5\n\1|\6\n\1|\7\n\1|\8\n\1|\9/w file2.log
s/.*subscriberId.*("\(.*\)")usageControlAccum.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*/\1|\2\n\1|\3\n\1|\4\n\1|\5\n\1|\6\n\1|\7\n\1|\8/w file2.log
s/.*subscriberId.*("\(.*\)")usageControlAccum.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*/\1|\2\n\1|\3\n\1|\4\n\1|\5\n\1|\6\n\1|\7/w file2.log
s/.*subscriberId.*("\(.*\)")usageControlAccum.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*/\1|\2\n\1|\3\n\1|\4\n\1|\5\n\1|\6/w file2.log
s/.*subscriberId.*("\(.*\)")usageControlAccum.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*/\1|\2\n\1|\3\n\1|\4\n\1|\5/w file2.log
s/.*subscriberId.*("\(.*\)")usageControlAccum.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*/\1|\2\n\1|\3\n\1|\4/w file2.log
s/.*subscriberId.*("\(.*\)")usageControlAccum.*\(bidirVolume....*\)."...subscriptionDate.*\(bidirVolume....*\)."...subscriptionDate.*/\1|\2\n\1|\3/w file2.log
s/.*subscriberId.*("\(.*\)")usageControlAccum.*\(bidirVolume....*\)."...subscriptionDate.*/\1|\2/w file2.log
}'
# Second pass over file2.log. Running a separate sed invocation gives a fresh
# set of \1..\9 back-references, which works around the nine-capture limit of
# the first pass. Each row is rewritten to
#   subscriberId|volume|text following "subscriberGroupName"
# and written to file2_1.log. -n suppresses the default output.
# The I flag (case-insensitive match) is a GNU sed extension.
cat ./file2.log | sed -n '
s/\([0-9]*\)|bidirVolume...\([0-9]*\),..*expiryDate.*.subscriberGroupName.....\(.*\)/\1|\2|\3/Iw file2_1.log
'
字符串的简短描述:
问题是:
以下是所需的输出:
# head file1_s.log
123456789123|
123456789123|g_m_q_3GB:17:15-09-2015T19:45:29
123456789123|g_m_q_3GB:17:06-09-2015T17:54:44
123456789123|g_monthly_1Gb_64kb:1160:26-09-2015T16:09:25
123456789123|
# head file2_1.log
123456789123|31460668|g_monthly_30Mb_pred
123456789123|20419748|g_monthly_30Mb_pred
123456789123|29013670|g_monthly_30Mb_pred
123456789123|31612708|g_monthly_30Mb_pred
答案 0 :(得分:0)
按照 Jonathan Leffler 的建议,我改用其他语言继续研究。在此之前我使用的是 sed(正如原问题中所述),解析耗时超过 4 小时:
第一次尝试用的是 perl,但因为处理速度太慢而失败了(问题在于我仍沿用同样的思路——只靠正则表达式)。运行了一整夜,连第一部分的检查(SubscriberPot)都没有完成。
#!/usr/bin/perl
# First attempt: extract "userId|group" rows from the EPC_SubscriberPot
# records of the database dump using regular expressions only, and write
# them to ./out1.log.
#
# NOTE: every candidate pattern below is a chain of greedy ".*" pieces, so a
# line that does not match a pattern makes the regex engine backtrack heavily
# before it gives up. Expect this approach to be slow on a large dump.
use strict;
use warnings;

my $dt1 = localtime;
print "Started: $dt1\n";

my $file     = './databaseBackup/database_output.txt';
my $fileOut1 = './out1.log';

# Three-argument open keeps the mode separate from the file name, and each
# error message names the file that actually failed.
open my $info, '<', $file     or die "Could not open $file: $!";
open my $fo1,  '>', $fileOut1 or die "Could not open $fileOut1: $!";

# One pattern per supported number of entries in the groups array, ordered
# from 9 entries down to 0 so that the widest pattern is tried first.
# Capture 1 is the userId; captures 2 and up are the quoted group values.
my @patterns = map {
    my $packs = '.*\("(.*)"\)' x $_;
    qr/.*userId.*\("(.*?)"\)groups.*\[${packs}\]services.*/;
} reverse 0 .. 9;

LINE:
while (my $line = <$info>) {
    next LINE if $line !~ /EPC_SubscriberPot/;

    PATTERN:
    for my $pattern (@patterns) {
        # Match once and use the captures directly, so a failed match can
        # never leak stale or undefined capture variables into the output.
        my @captures = $line =~ $pattern;
        next PATTERN if !@captures;

        my ($user_id, @groups) = @captures;

        # A record with an empty groups array still gets a "userId|" row.
        @groups = ('') if !@groups;

        print {$fo1} map { "$user_id|$_\n" } @groups;
        last PATTERN;
    }
}

close $info;
# Buffered write errors only surface when the output handle is closed.
close $fo1 or die "Could not close $fileOut1: $!";

my $dt2 = localtime;
print "Finished: $dt2\n";
第二次尝试用的是 awk,这次我换了一种方法——先做部分解析,再对结果做后处理和进一步解析。处理时间缩短到约 25 分钟。
->./readFile.pl
Started: Wed Oct 7 18:47:25 2015
Finished: Wed Oct 7 18:53:26 2015
->cat ./readFile.pl
#!/bin/bash
# Second attempt: parse the dump in stages with awk instead of one large
# regular expression per record.
# Output:
#   repSubsc_1.txt - sorted, de-duplicated "userId|group" rows
#   repAccum_1.txt - sorted, de-duplicated "subscriberId|volume|groupName" rows
# gensub() is a GNU awk extension, so awk must be gawk here.
echo "Clear old data"
rm -f repSubsc.txt
rm -f repAccum.txt
echo "Fetch started: `date "+%Y-%m-%d %T"`"
# egrep passes only the two record types of interest on to awk.
egrep "EPC_UsageControlAccumulatedPot|EPC_SubscriberPot" ./databaseBackup/database_output.txt | awk '
BEGIN{
x=0;
str = "";
# Intermediate (unsorted) report files.
fSubsc="repSubsc.txt";
fAccum="repAccum.txt";
}
{
# --- EPC_SubscriberPot: one row per entry of the groups array ---
if (index($0,"EPC_SubscriberPot")>0) {
# Reduce the record to "userId|<content of the groups array>".
str = gensub(/.*userId.*\("(.*)"\)groups.*\[(.*)\]services.*/, "\\1|\\2", "g", $0);
split(str,arr,"|");
msisdn=arr[1];
str=arr[2];
# Array entries start with "ix", so splitting on it separates them.
# arr[1] is the text before the first "ix"; entries start at index 2.
x=split(str,arr,"ix");
if (x > 0) {
for (i=2;i<=x;i++){
# Entries that are empty or hold ("") produce a row with no group.
if (arr[i]!="" && (arr[i] !~ /.*\(""\)/)) {
# Keep only the quoted value of the entry.
str = gensub(/.*\("(.*)"\)/,"\\1","g",arr[i]);
print msisdn "|" str > fSubsc;
} else {
print msisdn "|" > fSubsc;
}
}
} else {
# split() returned 0: the groups array was empty.
print msisdn "|" > fSubsc;
}
}
# --- EPC_UsageControlAccumulatedPot: one row per reporting group ---
if (index($0,"EPC_UsageControlAccumulatedPot")>0) {
# Reduce the record to "subscriberId|<content of reportingGroups>".
str = gensub(/.*subscriberId.*\("(.*)"\)usageControlAccum.*reportingGroups...\[(.*)\]...version.*/, "\\1|\\2", "g", $0);
split(str,arr,"|");
msisdn=arr[1];
str=arr[2];
# Each reporting group contains "absolute", so splitting on it separates
# the groups; arr[1] is the text before the first one.
x=split(str,arr,"absolute");
if (x > 0) {
for (i=2;i<=x;i++){
if (arr[i]!="" && (arr[i] !~ /.*\(""\)/)) {
# Capture the bidirVolume value and the subscriberGroupName value.
str = gensub(/.*bidirVolume...(.*)...previousExpiryDate.*subscriberGroupName.....(.*).....subscriptionDate.*/,"\\1|\\2","g",arr[i]);
# If the volume capture also swallowed an expiryDate field, keep only
# the part of it before the first comma.
if (index(str,"expiryDate")>0){
split(str,arrIn,"|");
str=substr(arrIn[1],1,index(arrIn[1],",")-1) "|" arrIn[2];
}
print msisdn "|" str > fAccum
} else {
print msisdn "||" > fAccum
}
}
} else {
# split() returned 0: there were no reporting groups.
print msisdn "||" > fAccum
}
}
}
'
echo "Fetch finished: `date "+%Y-%m-%d %T"` -> Sorting started"
# sort -u sorts the rows and drops duplicates.
sort -u repSubsc.txt > repSubsc_1.txt
sort -u repAccum.txt > repAccum_1.txt
echo "Done: `date "+%Y-%m-%d %T"`"
第三次尝试又回到 perl,代码是从 awk 版本翻译并重新整理而来的。这次我很高兴地看到,完整的处理只用了 6 分钟。我知道这还不是终点:我的代码还有很多不足,需要继续修改。但目前它已能满足我的任务......
->./readFile_awk.sh
Clear old data
Fetch started: 2015-10-07 18:55:18
Fetch finished: 2015-10-07 19:19:23 -> Sorting started
Done: 2015-10-07 19:20:07
->cat ./readFile_awk.sh
#!/usr/bin/perl
# Convert the database dump into two pipe-delimited reports:
#   repSubsc.txt - one "userId|group" row per entry of a subscriber's groups
#                  array ("userId|" when the array is empty or the entry
#                  holds an empty string)
#   repAccum.txt - one "subscriberId|bidirVolume|subscriberGroupName" row per
#                  reporting group ("subscriberId||" when there are none or
#                  the group cannot be parsed)
use strict;
use warnings;

my $file   = './databaseBackup/database_output.txt';
my $fSubsc = 'repSubsc.txt';
my $fAccum = 'repAccum.txt';

# Build the report rows for one EPC_SubscriberPot record.
# Takes the raw line; returns a list of "userId|group\n" strings, or an
# empty list when the line does not have the expected userId/groups layout.
sub subscriber_rows {
    my ($line) = @_;

    return if $line !~ /.*userId.*\("(.*)"\)groups.*\[(.*)\]services.*/;
    my ($msisdn, $groups) = ($1, $2);

    # Every array entry carries its value as ("value"). Collecting those
    # directly reads each entry on its own and is not confused by group
    # names that happen to contain "ix".
    my @names = $groups =~ /\("(.*?)"\)/g;

    # An empty groups array still yields a single "userId|" row.
    @names = ('') if !@names;

    return map { "$msisdn|$_\n" } @names;
}

# Build the report rows for one EPC_UsageControlAccumulatedPot record.
# Takes the raw line; returns a list of "subscriberId|volume|groupName\n"
# strings, or an empty list when the line does not have the expected layout.
sub accum_rows {
    my ($line) = @_;

    return
        if $line !~ /.*subscriberId.*\("(.*)"\)usageControlAccum.*reportingGroups...(.*)...version.*/;
    my ($msisdn, $report) = ($1, $2);

    # Every reporting group contains "absolute"; the first piece is the
    # text in front of the first group and is discarded.
    my @entries = split /absolute/, $report;
    shift @entries;
    return "$msisdn||\n" if !@entries;

    my @rows;
    for my $entry (@entries) {
        if ($entry =~ /.*bidirVolume...(.*)...previousExpiryDate.*subscriberGroupName.....(.*).....subscriptionDate.*/) {
            my ($volume, $group_name) = ($1, $2);

            # If the greedy volume capture also swallowed an expiryDate
            # field, keep only the part before the first comma.
            $volume =~ s/,.*// if index($volume, 'expiryDate') >= 0;

            push @rows, "$msisdn|$volume|$group_name\n";
        }
        else {
            push @rows, "$msisdn||\n";
        }
    }
    return @rows;
}

my $dt1 = localtime;
print "Started: $dt1\n";

open my $info, '<', $file   or die "Could not open $file: $!";
open my $fs,   '>', $fSubsc or die "Could not open $fSubsc: $!";
open my $fa,   '>', $fAccum or die "Could not open $fAccum: $!";

while (my $line = <$info>) {
    # index() returns -1 when the substring is absent and 0 for a match at
    # the very start of the line, so the test must be ">= 0".
    if (index($line, 'EPC_SubscriberPot') >= 0) {
        print {$fs} subscriber_rows($line);
    }
    if (index($line, 'EPC_UsageControlAccumulatedPot') >= 0) {
        print {$fa} accum_rows($line);
    }
}

close $info;
# Buffered write errors only surface when the output handles are closed.
close $fs or die "Could not close $fSubsc: $!";
close $fa or die "Could not close $fAccum: $!";

my $dt2 = localtime;
print "Finished: $dt2\n";