目前我正在使用Perl脚本解析日志文件并以这种格式输出CSV文件:
$ read_scalepack.pl HR00169_07.log
mosaicity 0.52 - 1.59
SG & cell: p21 64.3 78.1 67.0 90.0 96.8 90.0
Resolution Measured Unique Redun. %Complete R-value I/sigI
Overall 50.00-2.40 60827 23031 2.6 89.1 0.073 11.3
First Shell 50.00-5.17 7173 2431 3.0 91.9 0.044 21.1
3 sig Shell 2.85-2.70 6473 2446 2.6 95.4 0.316 3.4
Penult Shell 2.59-2.49 4149 2056 2.0 80.1 0.370 2.4
Last Shell 2.49-2.40 3105 1722 1.8 66.9 0.343 2.1
Rejected observations. Number: 1599 Percent: 2.6
然后我使用很长的方法将它导入到Excel中的单独工作表中,将我想要的格式的值复制到"需要"表格然后删除导入的CSV表格(看起来很痛苦)。我需要的是编辑Perl脚本(我没有经验)或使用shell脚本使CSV文件如下:
p21 64.3 78.1 67.0 90.0 96.8 90.0 0.52-1.59 50.00-2.40 60827 23031 2.6 89.1 0.073 50.00-5.17 7173 2431 3.0 91.9 0.044 2.49-2.40 3105 1722 1.8 66.9 0.343 2.1
基本上是一长串统计数据,而不是带有单词的堆积统计数据。
我在一行中使用的值是(" SG和单元格","镶嵌度","整体"," First Shell&#34 ;," Last Shell")。所以我忽略了一些行,因为它们没用。
我研究了以下文章:Editing CSV Files (Design Implementation)
这个人似乎正在做我想做的事情,但是他们并没有准确地说明我是如何做到的,或者至少是我需要做的部分。
我还附加了Perl脚本的一部分,以防从那个方面接近它,但是我对Perl的经验很少(警告Perl脚本很长)。
#!/usr/bin/perl -w
...
#
#initialize
$int_over_sigma_cutoff = 2.95;
$flag_correct_lp = 0;
# get information on file
if ( defined($last_file) ) {
@stat_list = stat( $images_directory . '/' . $images_dir_list[$last_file] );
$date_time = localtime( $stat_list[9] );
$date_time =~ s/^\s+//; # remove white space at beginning of line
@date_parts = split( / +/, $date_time );
$date = $date_parts[2] . '-' . uc( $date_parts[1] ) . '-' . $date_parts[4];
}
}
return $date;
}
sub read_xscale_lp {
#
my $junk;
my $flag_sg_cell = 0;
my $flag_by_shells = 0;
#
# Read XSCALE.LP log file
while ( <> ) {
#
# get space group
if (/ THE DATA COLLECTION STATISTICS REPORTED BELOW ASSUMES:/) {
$flag_sg_cell = 1;
}
#
# get unit cell parameters
elsif ($flag_sg_cell) {
$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line
@line_parts = split( / +/, $line );
if (/ SPACE_GROUP_NUMBER=/) {
$space_group = &get_space_group_name( $line_parts[1] );
}
elsif (/ UNIT_CELL_CONSTANTS=/) {
@cell = @line_parts[ 1 .. 7 ];
$flag_sg_cell = 0;
}
}
#
# get rejected measurements and total number of measured reflections
elsif (/REFLECTIONS REJECTED/) {
$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line
@line_parts = split( / +/, $line );
$number_rejected = $line_parts[0];
$overall_measured_w_rej = $line_parts[3];
}
#
# test whether column 10 will be Rmrgd-F or CC(1/2)
elsif (/ Rmrgd-F/) {
$column10 = "Rmrgd-F";
}
elsif (m| CC\(1/2\)|) {
$column10 = "CC1/2";
}
#
# Find table and set flag
elsif (/ SUBSET OF INTENSITY DATA WITH SIGNAL\/NOISE >= \-3.0 AS FUNCTION OF RESOLUTION/) {
$flag_by_shells = 1;
$is = 0;
for ( $i = 0; $i <= 2; $i++ ) {
$junk = <>;
}
}
#
# Make sure table flag is turned off
elsif (/ SUBSET OF INTENSITY DATA WITH SIGNAL\/NOISE >= \-2.0 AS FUNCTION OF RESOLUTION/) {
$flag_by_shells = 0;
}
#
# Get information about R-values, I/sigI, etc.
elsif ($flag_by_shells) {
$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line
$line =~ s/%//g; # remove percentage sign from numbers
@line_parts = split( / +/, $line );
#
# Get overall values
if ( $line =~ /total/ ) {
$overall_measured = $line_parts[1];
$overall_unique = $line_parts[2];
# $overall_theoretical[is] = $line_parts[3];
$overall_complete = $line_parts[4];
$overall_rvalue = $line_parts[5] / 100;
$overall_int_over_sig = $line_parts[8];
# $overall_r_redun_i[$is] = $line_parts[9];
if ( $column10 eq 'Rmrgd-F' ) {
# $overall_r_redun_f[$is] = $line_parts[10];
}
elsif ( $column10 eq 'CC1/2' ) {
$line_parts[10] =~ s/\*//;
$overall_cc_onehalf = $line_parts[10] / 100.;
}
$flag_by_shells = 0;
}
#
# Store individuals shells in arrays
else {
if ( $is > 0 ) {
$low_res[$is] = $high_res[ $is - 1 ];
}
$high_res[$is] = $line_parts[0];
$measured[$is] = $line_parts[1];
$unique[$is] = $line_parts[2];
# $theoretical[is] = $line_parts[3];
$complete[$is] = $line_parts[4];
$rvalue[$is] = $line_parts[5] / 100;
$int_over_sig[$is] = $line_parts[8];
# $r_redun_i[$is] = $line_parts[9];
if ( $column10 eq 'Rmrgd-F' ) {
# $r_redun_f[$is] = $line_parts[10];
}
elsif ( $column10 eq 'CC1/2' ) {
$line_parts[10] =~ s/\*//;
$cc_onehalf[$is] = $line_parts[10] / 100.;
}
$is++;
}
}
}
#
# Get overall low and high resolution
if ( !defined( $low_res[0] ) ) {
$low_res[0] = 50.;
}
$overall_low_res = $low_res[0];
$overall_high_res = $high_res[$#high_res];
}
sub read_mrfana {
#
# read autoPROC mrfana file and reduce binning to 10
my $line;
my @line_parts;
my $flag_table = 0;
my $ishell = 0;
my $jshell = 0;
#
while (<>) {
if (/binning will be in/) {
$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);
@line_parts = split( / +/, $line );
$no_of_shells = $line_parts[4];
$no_of_new_shells = $no_of_shells / 10;
if ( $no_of_new_shells != int($no_of_new_shells) ) {
$flag_not_integer = 1;
}
}
elsif ( defined($flag_not_integer) && /binning for run/ ) {
$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);
@line_parts = split( / +/, $line );
$no_of_shells = $line_parts[7];
$no_of_new_shells = $no_of_shells / 10;
if ( $no_of_new_shells != int($no_of_new_shells) ) {
die "$no_of_new_shells not evenly divisible by 10\n";
}
else {
undef($flag_not_integer);
}
}
elsif (/^ Resolution \#uniq \#R\w\w\w /) {
$line = <>;
$flag_table = 1;
}
elsif ( $flag_table == 1 ) {
if (/^ -------/) {
if ( $ishell != 10 ) {
$high_res[$ishell] = $line_parts[2];
if ( $unique[$ishell] > 0 ) {
$rvalue[$ishell] = $rvalue[$ishell] / $unique[$ishell];
$int_over_sig[$ishell] = $int_over_sig[$ishell] / $unique[$ishell];
$complete[$ishell] = 100. * $complete[$ishell] / $unique[$ishell];
$redundancy[$ishell] = $measured[$ishell] / $unique[$ishell];
}
}
else {
next;
}
}
elsif (/^ Total/) {
$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);
@line_parts = split( / +/, $line );
$overall_low_res = $line_parts[1];
$overall_high_res = $line_parts[3];
$overall_unique = $line_parts[4];
$overall_measured = $line_parts[5];
$overall_rvalue = $line_parts[6];
$overall_int_over_sig = $line_parts[13];
$overall_complete = 100. * $line_parts[14];
$overall_redundancy = $line_parts[16];
if ( $#line_parts <= 17 ) {
$number_rejected = 0;
}
elsif ( $#line_parts >= 25 ) {
$number_rejected
= $line_parts[18]
+ $line_parts[19]
+ $line_parts[20]
+ $line_parts[21]
+ $line_parts[22]
+ $line_parts[23]
+ $line_parts[24]
+ $line_parts[25];
}
else {
$number_rejected = $line_parts[18];
}
last;
}
else {
$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);
@line_parts = split( / +/, $line );
if ( $jshell % 10 == 0 ) {
$low_res[$ishell] = $line_parts[0];
$unique[$ishell] = $line_parts[3];
$measured[$ishell] = $line_parts[4];
$rvalue[$ishell] = $line_parts[5] * $line_parts[3];
$int_over_sig[$ishell] = $line_parts[12] * $line_parts[3];
$complete[$ishell] = $line_parts[13] * $line_parts[3];
}
else {
$unique[$ishell] = $unique[$ishell] + $line_parts[3];
$measured[$ishell] = $measured[$ishell] + $line_parts[4];
$rvalue[$ishell] = $rvalue[$ishell] + $line_parts[5] * $line_parts[3];
$int_over_sig[$ishell]
= $int_over_sig[$ishell] + $line_parts[12] * $line_parts[3];
$complete[$ishell] = $complete[$ishell] + $line_parts[13] * $line_parts[3];
}
if ( $jshell % 10 == 9 ) {
$high_res[$ishell] = $line_parts[2];
if ( $unique[$ishell] > 0 ) {
$rvalue[$ishell] = $rvalue[$ishell] / $unique[$ishell];
$int_over_sig[$ishell] = $int_over_sig[$ishell] / $unique[$ishell];
$complete[$ishell] = 100. * $complete[$ishell] / $unique[$ishell];
$redundancy[$ishell] = $measured[$ishell] / $unique[$ishell];
}
$ishell++;
}
$jshell++;
}
}
}
return;
}
sub read_xengen {
#
my $i;
my $junk;
my $flag_sg_cell = 0;
my $flag_redundancy = 0;
my $flag_int_over_sig = 0;
#
while (<>) {
if (/Statistical Treatment of Crystallographic Data for/) {
$flag_sg_cell = 1;
}
#
# read through 2 blank lines and "Crystal Description:"
elsif ($flag_sg_cell) {
#
# get space group
if (/Crystal Spacegroup\:/) {
$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line
@line_parts = split( / +/, $line );
$line_parts[5] =~ s/\,//;
$space_group = &get_space_group_name( $line_parts[5] );
}
#
# get unit cell parameters
elsif (/Real-Space Parameters\:/) {
$_ = <>; # parameters are on next line
$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line
@line_parts = split( / +/, $line );
@cell = @line_parts[ 0 .. 5 ];
$flag_sg_cell = 0;
}
}
elsif (/Summary of Observation Redundancies by Shells\:/) {
for ( $i = 0; $i <= 2; $i++ ) {
$junk = <>;
}
$is = 0;
$flag_redundancy = 1;
}
elsif ($flag_redundancy) {
$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line
@line_parts = split( / +/, $line );
#
# Get overall values
if ( $line =~ /Totals\:/ ) {
# $overall_theoretical = $line_parts[2];
$overall_unique = $line_parts[3];
if ( $line_parts[2] > 0 ) {
$overall_complete = 100 * $line_parts[3] / $line_parts[2];
}
else {
$overall_complete = 0;
}
$overall_measured = $line_parts[13];
$flag_redundancy = 0;
}
else {
if ( $is > 0 ) {
$low_res[$is] = $high_res[ $is - 1 ];
}
$high_res[$is] = $line_parts[0];
# $theoretical[$is] = $line_parts[2];
$unique[$is] = $line_parts[3];
if ( $line_parts[2] > 0 ) {
$complete[$is] = 100 * $line_parts[3] / $line_parts[2];
}
else {
$complete[$is] = 0;
}
$measured[$is] = $line_parts[13];
$is++;
}
}
elsif (/Summary of Reflection Intensities and R-factors by Shells\:/) {
for ( $i = 0; $i <= 3; $i++ ) {
$junk = <>;
}
$is = 0;
$flag_int_over_sig = 1;
}
elsif ($flag_int_over_sig) {
$line = $_;
chomp($line);
$line =~ s/^\s+//; # remove white space at beginning of line
@line_parts = split( / +/, $line );
#
# Get overall values for I/sigI and R-value
if ( $line =~ /Totals\:/ ) {
$overall_int_over_sig = $line_parts[2];
$overall_rvalue = $line_parts[17] / 100;
$flag_int_over_sig = 0;
}
#
# Store individual shells in arrays for I/sigI and R-value
else {
$int_over_sig[$is] = $line_parts[2];
$rvalue[$is] = $line_parts[17] / 100;
$is++;
}
}
}
}
sub check_for_scala_log {
my @dir_pieces = split( /\//, $_[0] );
my $line;
my @line_parts;
my $scala_log;
my $flag_dataset_id = 0;
my $flag_space_group = 0;
my $count = 0;
#
if ( $#dir_pieces > 0 ) {
$scala_log = join( '/', @dir_pieces[ 0 .. $#dir_pieces - 1 ] ) . '/scala.log';
}
else {
$scala_log = 'scala.log';
}
if ( -e $scala_log ) {
open( LOG, "<$scala_log" ) || die "Unable to open $scala_log\n";
while (<LOG>) {
if ( m|Dataset ID, project/crystal/dataset names, cell dimensions, wavelength|
&& $flag_dataset_id == 0 ) {
$flag_dataset_id = 1;
$line = <LOG>; # blank line -- skip
$line = <LOG>; # Dataset ID project name
$line = <LOG>; # crystal name
$line = <LOG>; # dataset name
$line = <LOG>; # cell dimensions
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);
@cell = split( / +/, $line );
$line = <LOG>; # wavelength
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);
$wavelength = $line;
$count = $count + 6;
}
elsif ( /Space group =/ && $flag_space_group == 0 ) {
$flag_space_group = 1;
$line = $_;
$space_group = substr(
$line,
index( $line, "\'" ) + 1,
rindex( $line, "\'" ) - ( index( $line, "\'" ) + 1 ) );
$space_group =~ s/ //g;
$count = $count + 2;
}
$count++;
}
}
}
sub check_for_aimless_log {
my @dir_pieces = split( /\//, $_[0] );
my $line;
my @line_parts;
my $aimless_log;
my $flag_average_unit_cell = 0;
my $flag_space_group = 0;
#
if ( $#dir_pieces > 0 ) {
$aimless_log = join( '/', @dir_pieces[ 0 .. $#dir_pieces - 1 ] ) . '/aimless.log';
}
else {
$aimless_log = 'aimless.log';
}
if ( -e $aimless_log ) {
open( LOG, "<$aimless_log" ) || die "Unable to open $aimless_log\n";
while (<LOG>) {
if ( /Average unit cell/ && $flag_average_unit_cell == 0 ) {
$flag_average_unit_cell = 1;
$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);
@line_parts = split( / +/, $line );
@cell = @line_parts[ 3 .. 8 ];
}
elsif (/Wavelength/) {
$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);
@line_parts = split( / +/, $line );
$wavelength = $line_parts[1];
}
elsif (/Space group\:/) {
$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
chomp($line);
$space_group = substr( $line, index( $line, "\:" ) + 1 );
$space_group =~ s/ //g;
}
elsif (/Average mosaicity/) {
$line = $_;
$line =~ s/ +//; # delete spaces at beginning of line
@line_parts = split( / +/, $line );
$average_mosaicity = $line_parts[2];
if ( $mosaicity_min >= 1000. ) {
$mosaicity_min = $average_mosaicity;
}
if ( $mosaicity_max <= -1000. ) {
$mosaicity_max = $average_mosaicity;
}
}
}
}
}
sub read_scl_in {
#
# read scl.in, extract some information, and store scl.in in list
my $i_zone = 0;
my $line;
my @line_parts;
my $flag_est_err = 0;
my $file;
#
my @dir_pieces = split( /\//, $_[0] );
if ( $#dir_pieces > 0 ) {
$file = join( '/', @dir_pieces[ 0 .. $#dir_pieces - 1 ] ) . '/scl.in';
}
else {
$file = 'scl.in';
}
#
# Open old file
# open(INFILE,"<$file") or die "unable to open $file";
open( INFILE, "<$file" ) or return;
while (<INFILE>) {
$line = $_;
chomp($line);
if (/number of zones/) {
$line =~ s/^\s+//; # remove white space at beginning of line
@line_parts = split( / +/, $line );
$n_zone = $line_parts[3];
}
elsif (/estimated error/) {
$flag_est_err = 1;
}
elsif ( /space group/ && !defined($space_group) ) {
$space_group = 1;
}
elsif ( $flag_est_err == 1 ) {
$line =~ s/^\s+//; # remove white space at beginning of line
@line_parts = split( / +/, $line );
if ( $i_zone == 0 ) {
@est_err = @line_parts;
}
else {
push( @est_err, @line_parts );
}
$i_zone = $i_zone + scalar(@line_parts);
if ( $i_zone == $n_zone ) {
last;
}
}
}
close(INFILE);
}
sub read_output_sca {
#
# read output.sca to get cell parameters when reading scale.out for statistics
my $junk;
my $line;
my @line_parts;
my $file;
#
my @dir_pieces = split( /\//, $_[0] );
if ( $#dir_pieces > 0 ) {
$file = join( '/', @dir_pieces[ 0 .. $#dir_pieces - 1 ] ) . '/output.sca';
}
else {
$file = 'output.sca';
}
#
if ( -e $file ) {
open( SCA, "<$file" );
$junk = <SCA>; # Skip first and second lines
$junk = <SCA>;
$line = <SCA>; # Unit cell and space group are third line
close(SCA);
$line =~ s/^\s+//; # remove white space at beginning of line
@line_parts = split( / +/, $line );
@cell = @line_parts[ 0 .. 5 ]; # Unit cell parameters are first 6 values
$space_group = $line_parts[6]; # Space group is 7th value
}
}
sub get_space_group_name {
#
# Use space group dictionary to translate from number to name
$space_group_number = $_[0];
open( SGDIC, "</ap/bms/mmc/space.dic" );
while (<SGDIC>) {
if (/^(P|A|B|C|I|F|R)/) {
$line = $_;
chomp($line);
if ( substr( $line, 21, 3 ) == $space_group_number ) {
@line_parts = split( / +/, $line );
$space_group = $line_parts[0];
last;
}
}
}
return $space_group;
}
答案 0 :(得分:0)
将Perl脚本的输出放在此处。它将生成您需要的结果
您可以将以前的输出作为参数传递给命令行,或者您可以使用管道将上一个脚本的输出传递给此脚本而不保存文件
use strict;
use warnings 'all';
my @data;
while ( <> ) {
s/\s*-\s*/-/;
my @fields = grep /\d/, split;
pop @fields if /Overall|First Shell/;
if ( /mosaicity|Overall|First Shell|Last Shell/ ) {
push @data, \@fields;
}
elsif ( /SG & cell/ ) {
unshift @data, \@fields;
}
}
print join(' ', map @$_, @data), "\n";
p21 64.3 78.1 67.0 90.0 96.8 90.0 0.52-1.59 50.00-2.40 60827 23031 2.6 89.1 0.073 50.00-5.17 7173 2431 3.0 91.9 0.044 2.49-2.40 3105 1722 1.8 66.9 0.343 2.1