使用perl脚本逐行修改大文件

时间:2014-09-11 11:20:08

标签: perl csv

我有两个巨大的.csv文件,一个是大约8 GB,另一个是3.4 GB的文件大小。我想在.csv文件中的每一行中只有几个值。 它花费了大量时间来修改数据并将其复制到新文件中。

任何人都可以帮助修改代码。因此修改将在合理的时间内完成。

以下是代码行:

#!/usr/bin/perl
use strict;
use warnings;

use Text::CSV;

require "$ENV{'SAI_HOME'}/bin/utils/Logging.pl";
require "$ENV{'SAI_HOME'}/bin/utils/Utilities.pl";

my $date1 = `date '+%d-%m-%Y_%H-%M-%Ss'`;
chomp($date1);
our $LOGPATH = "$ENV{'SAI_HOME'}/logs/SP6migrationcsv_$date1.log";
my $status = 0;
log_info("Refer $LOGPATH log file for more information");
my $csv = Text::CSV->new( { binary => 1, eol => $/, sep_char => ',' } );
my $file1 = $ARGV[0] or die "Please provide Subscriber and Subscription CSV files on the command line\n";
my $file2 = $ARGV[1] or die "Please provide Subscriber and Subscription CSV files on the command line\n";
my $subscriberFile   = "";
my $subscriptionFile = "";

if ( ( grep /SUBSCRIBER/i, $file1 ) && ( grep /SUBSCRIPTION/i, $file2 ) ) {
    $subscriberFile   = $file1;
    $subscriptionFile = $file2;
} elsif ( ( grep /SUBSCRIBER/i, $file2 ) && ( grep /SUBSCRIPTION/i, $file1 ) ) {
    $subscriptionFile = $file1;
    $subscriberFile   = $file2;
} else {
    log_error("Invalid CSV files input");
    exit -1;
}
my $SP6DIR = `dirname $0`;
chomp $SP6DIR;
$SP6DIR = "${SP6DIR}/SP6";
`mkdir -p $SP6DIR` or checkExit( $?, "Unable to carete $SP6DIR directory" );
my $newSubscriberFile    = "Subscriber.csv";
my $newSubscriptionFile  = "Subscription.csv";
my $subscriptionimsifile = "$SP6DIR/.IMSI_$newSubscriptionFile";
my $subscriberimsifile   = "$SP6DIR/.IMSI_$newSubscriberFile";
$newSubscriberFile   = "${SP6DIR}/$newSubscriberFile";
$newSubscriptionFile = "${SP6DIR}/$newSubscriptionFile";
`dos2unix $subscriptionFile $subscriberFile 2>/dev/null`
    or checkExit( $?, "Unable to perform dos2unix on input files" );
`cut -d "," -f3 $subscriptionFile > $subscriptionimsifile`
    or checkExit( $?, "Failed to get IMSI details from $subscriptionFile" );
`cut -d "," -f1 $subscriberFile > $subscriberimsifile`
    or checkExit( $?, "Failed to get IMSI details from $subscriberFile" );
my $isSubscriptionHeaderPresesnt = "false";
my $isSubscriberHeaderPresesnt   = "false";
$status = system("head -1 $subscriptionimsifile | grep 'IMSI' >>/dev/null");

if ( $status == 0 ) {
    $isSubscriptionHeaderPresesnt = "true";
}
$status = system("head -1 $subscriberimsifile | grep 'IMSI' >>/dev/null");
if ( $status == 0 ) {
    $isSubscriberHeaderPresesnt = "true";
}
open( my $subscriptionData, '<:encoding(utf8)', $subscriptionFile )
    or die "Could not open '$subscriptionFile' $!\n";
open( NEWSUBSCRIBERDATA,   "> $newSubscriberFile" )   or die "Could not open '$newSubscriberFile' $!\n";
open( NEWSUBSCRIPTIONDATA, "> $newSubscriptionFile" ) or die "Could not open '$newSubscriptionFile' $!\n";
if ( "$isSubscriptionHeaderPresesnt" eq "true" ) {
    my $subscriptionHeader = <$subscriptionData>;
    if ( $csv->parse($subscriptionHeader) ) {
        my @subscriptionHeaderFields = $csv->fields();
        print NEWSUBSCRIPTIONDATA "\"$subscriptionHeaderFields[0]\",\"$subscriptionHeaderFields[2]\",\"$subscriptionHeaderFields[4]\",\"$subscriptionHeaderFields[5]\",\"$subscriptionHeaderFields[6]\",\"$subscriptionHeaderFields[8]\",\"$subscriptionHeaderFields[13]\",\"$subscriptionHeaderFields[14]\",\"$subscriptionHeaderFields[15]\",\"$subscriptionHeaderFields[16]\",\"$subscriptionHeaderFields[17]\",\"$subscriptionHeaderFields[18]\",\"$subscriptionHeaderFields[25]\",\"$subscriptionHeaderFields[26]\",\"$subscriptionHeaderFields[27]\"\n";
        print NEWSUBSCRIBERDATA "\"IMSI\",\"IMEI\",\"MSISDN\",\"$subscriptionHeaderFields[21]\",\"$subscriptionHeaderFields[22]\",\"$subscriptionHeaderFields[12]\",\"$subscriptionHeaderFields[9]\",\"$subscriptionHeaderFields[1]\",\"$subscriptionHeaderFields[0]\"\n";
    } else {
        log_error("Line could not be parsed: $subscriptionHeader\n");
        exit 1;
    }
} else {
    log_only("No header info in subscription file");
}

if ( "$isSubscriptionHeaderPresesnt" eq "false" && "$isSubscriberHeaderPresesnt" eq "true" ) {
    print NEWSUBSCRIBERDATA "\"IMSI\",\"IMEI\",\"MSISDN\",\"CUSTOMER_SEGMENTATION\",\"CUST_SUBCATEGORY\",\"SUBS_TYPE\",\"SUBSCRIPTION_PLAN\",\"CONTRACT_IDREF\",\"SUBSCRIPTION_IDREF\"\n";
} else {
    log_only("No header info in subscriber file");
}
my $subscriberHeader   = "";
my @subscriptionFields = {};
my @subscriberFields   = {};

while ( my $eachSubscriptionLine = <$subscriptionData> ) {
    chomp $eachSubscriptionLine;
    if ( $csv->parse($eachSubscriptionLine) ) {
        @subscriptionFields = $csv->fields();
        $status = system("grep \"^[\\\"]*${subscriptionFields[2]}[\\\"]*\\\$\" $subscriberimsifile >> /dev/null");
        if ( $status == 0 ) {
            my $lastMatchedSubscriberdata = `grep  "^[\\\"]*${subscriptionFields[2]}[\\\"]*," $subscriberFile | tail -1`;
            chomp $lastMatchedSubscriberdata;
            if ( $csv->parse($lastMatchedSubscriberdata) ) {
                @subscriberFields = $csv->fields();
                if ( "${subscriberFields[0]}" eq "${subscriptionFields[2]}" ) {
                    #log_only("Updating \"@subscriberFields\" subscriber details from subscription data");
                    print NEWSUBSCRIBERDATA "\"$subscriberFields[0]\",\"$subscriberFields[1]\",\"$subscriptionFields[2]\",\"$subscriptionFields[21]\",\"$subscriptionFields[22]\",\"$subscriptionFields[12]\",\"$subscriptionFields[9]\",\"$subscriptionFields[1]\",\"$subscriptionFields[0]\"\n";
                } else {
                    log_error("Unable to process @subscriberFields record");
                    exit -1;
                }
            } else {
                log_error("Line could not be parsed: $lastMatchedSubscriberdata\n");
                exit 1;
            }
        } else {
            log_only("Adding new subscriber details from subscription : \"@subscriptionFields\"");
            print NEWSUBSCRIBERDATA "\"$subscriptionFields[2]\",,\"$subscriptionFields[3]\",\"$subscriptionFields[21]\",\"$subscriptionFields[22]\",\"$subscriptionFields[12]\",\"$subscriptionFields[9]\",\"$subscriptionFields[1]\",\"$subscriptionFields[0]\"\n";
        }
        print NEWSUBSCRIPTIONDATA "\"$subscriptionFields[0]\",\"$subscriptionFields[2]\",\"$subscriptionFields[4]\",\"$subscriptionFields[5]\",\"$subscriptionFields[6]\",\"$subscriptionFields[8]\",\"$subscriptionFields[13]\",\"$subscriptionFields[14]\",\"$subscriptionFields[15]\",\"$subscriptionFields[16]\",\"$subscriptionFields[17]\",\"$subscriptionFields[18]\",\"$subscriptionFields[25]\",\"$subscriptionFields[26]\",\"$subscriptionFields[27]\"\n";
    } else {
        log_error("Line could not be parsed: $eachSubscriptionLine\n");
        exit 1;
    }
}
close(NEWSUBSCRIPTIONDATA);
open( my $subscriberData, '<:encoding(utf8)', $subscriberFile ) || die "Could not open '$subscriberFile' $!\n";
if ( "$isSubscriberHeaderPresesnt" eq "true" ) {
    $subscriberHeader = <$subscriberData>;
}
while ( my $eachSubscriberLine = <$subscriberData> ) {
    chomp $eachSubscriberLine;
    if ( $csv->parse($eachSubscriberLine) ) {
        @subscriberFields = $csv->fields();
        $status = system("grep \"^[\\\"]*${subscriberFields[0]}[\\\"]*\\\$\" $subscriptionimsifile >>/dev/null");
        if ( $status != 0 ) {
            log_only(
                "Adding back subscriber details, because unable to get IMSI details from subscription file : \"@subscriberFields\""
            );
            print NEWSUBSCRIBERDATA "\"$subscriberFields[0]\",\"$subscriberFields[1]\",\"$subscriberFields[2]\",\"$subscriberFields[6]\",,\"$subscriberFields[7]\",,,\n";
        }
    } else {
        log_error("Line could not be parsed: $eachSubscriberLine\n");
        exit 1;
    }
}
close(NEWSUBSCRIBERDATA);
`sed -i -e '1 s|SUBSCRIPTION_ID|SUBSCRIPTION_IDREF|g' -e '1 s|SUBS_CATEGORY|SUBSCRIPTION_PLAN|g'  -e '1 s|SUBS_STATE|SUBS_TYPE|g'  -e '1 s|CUST_CATEGORY|CUSTOMER_SEGMENTATION|g' $newSubscriberFile`
    or checkExit( $?, "Unable to update header info in subscriber fi   le" );

1 个答案:

答案 0 :(得分:4)

一般建议:

  • 请勿使用反引号来解析输入。 Perl完全有能力使用while循环和split执行此操作。

  • 拼写错误的变量名称会让你感到困惑。别。 isSubscriptionHeaderPresesnt

  • 混合你的open调用 - 3个参数与词法通常是首选,但不匹配并不好。

  • 使用文本字符串“false”代替布尔值是可怕的。不要这样做。有一天某人会做相当于print "true" if "false"的事情而且它会破裂。

  • 您的脚本将执行的最“昂贵”的操作是读取文件。这几乎总是如此。因此,在grepsed中查找系统调用以外的内容需要完全重新读取您要定位的文件。假设subscriptionFilesubscriberFile是你的大人物,你会多次阅读它们 - 你正在运行一个cut来读取整个事情。一个dos2unix读取整个事物。一个grep读取整个事物。然后你open它,并阅读整个事情。

  • 你的最后一行是sed,它将......完全重新读取你的输出文件,并逐行应用它。