Question

我正在尝试从文件的每一行中提取若干特定字段以及紧跟在每个字段后面的值，而这些字段在各行中出现的顺序经常变化。例如，对于这些行：

if (reader.EndOfStream)
{
    break;
}

我需要能够提取这些字段：

123_abc/previous-2016-04-16-022850/smpp_411_0.log.1.tar.gz:2016-04-14 18:11:46+0100 [WorkerAMQClient,client] Processed outbound message for 123_abc_411: {"transport_name": "123_abc_411", "from_addr_type": null, "group": null, "from_addr": "*123#", "timestamp": "2016-04-14 17:11:46.348000", "helper_metadata": {}, "to_addr": "0007031975326", "to_addr_type": null, "session_id": "1861570762", "transport_metadata": {"abc_mongolia_smpp": {"session_id": "1861570762", "starCode": "123", "requestId": "1534318080", "phase": "2", "clientId": "441", "dcs": "15"}}, "content": "Communication error\n\n0>Back", "session_event": "resume", "routing_metadata": {}, "message_version": "20110921", "transport_type": "smpp", "provider": "abc_mongolia", "in_reply_to": "b613a5fc5c0c4b1b8e1108bb8bd7b946", "message_type": "user_message", "message_id": "1206f738-f0d2-4a8e-9beb-2efeaada77d9"}
123_abc/previous-2016-04-16-022850/smpp_411_0.log.1.tar.gz:2016-04-14 18:11:46+0100 [WorkerAMQClient,client] Working with: <Message payload="{'transport_name': u'123_abc_411', 'from_addr_type': None, 'group': None, 'from_addr': u'*123#', 'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000), 'helper_metadata': {}, 'to_addr': u'0007031975326', 'to_addr_type': None, 'session_id': u'1861570762', 'transport_metadata': {u'abc_mongolia_smpp': {u'session_id': u'1861570762', u'starCode': u'123', u'requestId': u'1534318080', u'phase': u'2', u'clientId': u'441', u'dcs': u'15'}}, 'content': u'Communication error\n\n0>Back', 'session_event': u'resume', 'routing_metadata': {}, 'message_version': u'20110921', 'transport_type': u'smpp', 'provider': u'abc_mongolia', 'in_reply_to': u'b613a5fc5c0c4b1b8e1108bb8bd7b946', 'message_type': u'user_message', 'message_id': u'1206f738-f0d2-4a8e-9beb-2efeaada77d9'}">
123_abc/previous-2016-04-16-022850/smpp_411_0.log.1.tar.gz:2016-04-14 18:11:46+0100 [abracRedis,client] abcmongoliasmppTransport sending outbound message: <Message payload="{'transport_name': u'123_abc_411', 'from_addr_type': None, 'group': None, 'from_addr': u'*123#', 'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000), 'helper_metadata': {}, 'to_addr': u'0007031975326', 'to_addr_type': None, 'session_id': u'1861570762', 'transport_metadata': {u'abc_mongolia_smpp': {u'session_id': u'1861570762', u'starCode': u'123', u'requestId': u'1534318080', u'phase': u'2', u'clientId': u'441', u'dcs': u'15'}}, 'content': u'Communication error\n\n0>Back', 'session_event': u'resume', 'routing_metadata': {}, 'message_version': u'20110921', 'transport_type': u'smpp', 'provider': u'abc_mongolia', 'in_reply_to': u'b613a5fc5c0c4b1b8e1108bb8bd7b946', 'message_type': u'user_message', 'message_id': u'1206f738-f0d2-4a8e-9beb-2efeaada77d9'}">

123_dfg/smpp_37_0.log:2016-04-16 16:24:59+0100 [abracRedis,client] OUTGOING >> {'body': {'mandatory_parameters': {'priority_flag': 0, 'dest_addr_npi': 1, 'source_addr': '123', 'protocol_id': 0, 'replace_if_present_flag': 0, 'registered_delivery': True, 'dest_addr_ton': 1, 'source_addr_npi': 0, 'schedule_delivery_time': '', 'sm_default_msg_id': 0, 'sm_length': 0, 'esm_class': 0, 'data_coding': 0, 'service_type': '', 'source_addr_ton': 0, 'validity_period': '', 'destination_addr': '0007084023687', 'short_message': 'Communication error\n\n0>Back'}, 'optional_parameters': [{'length': 0, 'tag': 'smpp_service_op', 'value': '02'}, {'length': 0, 'tag': 'its_session_info', 'value': '3522'}]}, 'header': {'command_status': 'ESME_ROK', 'command_length': 0, 'sequence_number': 835674, 'command_id': 'submit_sm'}}
123_dfg/smpp_37_0.log:2016-04-16 17:02:40+0100 [WorkerAMQClient,client] Processed outbound message for 123_dfg_37: {"transport_name": "123_dfg_37", "from_addr_type": null, "group": null, "from_addr": "123", "timestamp": "2016-04-16 16:02:40.832000", "helper_metadata": {}, "to_addr": "0008081741472", "to_addr_type": null, "session_id": "d9dac229dec5499286890fa1c81aa16a", "transport_metadata": {"session_info": "5070"}, "content": "Communication error\n\n0>Back", "session_event": "resume", "routing_metadata": {}, "message_version": "20110921", "transport_type": "smpp", "provider": "123_dfg_37", "in_reply_to": "fd3e29028fb04f089fe764ba94d7d9af", "message_type": "user_message", "message_id": "b33737d5-39a7-478b-9680-31c54c15e3b2"}
123_dfg/smpp_37_0.log:2016-04-16 17:02:40+0100 [abracRedis,client] OUTGOING >> {'body': {'mandatory_parameters': {'priority_flag': 0, 'dest_addr_npi': 1, 'source_addr': '123', 'protocol_id': 0, 'replace_if_present_flag': 0, 'registered_delivery': True, 'dest_addr_ton': 1, 'source_addr_npi': 0, 'schedule_delivery_time': '', 'sm_default_msg_id': 0, 'sm_length': 0, 'esm_class': 0, 'data_coding': 0, 'service_type': '', 'source_addr_ton': 0, 'validity_period': '', 'destination_addr': '0008081741472', 'short_message': 'Communication error\n\n0>Back'}, 'optional_parameters': [{'length': 0, 'tag': 'smpp_service_op', 'value': '02'}, {'length': 0, 'tag': 'its_session_info', 'value': '5070'}]}, 'header': {'command_status': 'ESME_ROK', 'command_length': 0, 'sequence_number': 835793, 'command_id': 'submit_sm'}}

并将它们输出在同一行上，原始文件中的每一行对应一行输出。

我一直在尝试这类 awk 单行命令的各种变体：

'to_addr': u'0007031975326' (or 000
'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000),
'content': u'Communication error\n\n0>Back'

但结果始终不理想。能否请人告诉我如何用 awk 或 sed 来实现？

谢谢！

Answer 1

这是一个perl过滤器，可以解决这个问题：

**filter.pl**

#!/usr/bin/env perl

use warnings;
use strict;
use JSON::XS;

use constant DEFAULT => 'unknown';

# Return the given value, or DEFAULT when it is undefined or the empty string.
# Unlike a plain `||`, a legitimate "0" value is kept.
sub value_or_default {
    my ($value) = @_;
    return (defined $value && length $value) ? $value : DEFAULT;
}

# Read every input line (files named on the command line, or STDIN), pull the
# JSON-like payload out of it, massage Python-repr payloads into valid JSON,
# and print the to_addr / timestamp / content fields on one output line.
# Lines whose payload cannot be parsed are reported on STDERR and skipped
# instead of aborting the whole run.
while (my $line = <>) {
    next unless $line =~ m|\w|;                    # Skip empty lines

    # Remove everything that is not a part of the JSON string.
    # The literal braces are escaped: an unescaped "{" in a pattern is
    # deprecated (and an error in some contexts) on newer perls.
    $line =~ s|^[^{]*\{|{|;
    $line =~ s|\}[^}]*$|}|;

    my $is_unicode = 0;                            # Python style unicode strings
    if ($line =~ m|: u'\w+'|) {
        $is_unicode = 1;
        # Transform u'foo' -> "u|foo|".  The look-behind keeps the rewrite from
        # firing on a "u" that merely ends a word or a quoted key (e.g. 'menu').
        $line =~ s/(?<![\w'])u'([^']*)'/"u|$1|"/g;
    }

    $line =~ s|'|"|g;                              # Key names need double quotes
    $line =~ s|: (\w+),|: "$1",|g;                 # quote unquoted constructs
    $line =~ s|(datetime\.datetime\(.+?\)),|"$1",|; # quote datetime

    # decode_json dies on malformed input; trap that so one bad line
    # (for example a log line without any {...} payload) does not stop the filter.
    my $data = eval { decode_json($line) };
    if (ref $data ne 'HASH') {
        my $why = $@ || "payload is not a JSON object\n";
        warn "Skipping input line $.: $why";
        next;
    }

    my ($to_addr, $ts, $content);
    if ($data->{body}) {
        # header/body format: no timestamp is available in this layout
        my $params = $data->{body}->{mandatory_parameters};
        $to_addr = value_or_default($params->{destination_addr});
        $ts      = DEFAULT;
        $content = value_or_default($params->{short_message});
    } else {
        # Normal message
        $to_addr = value_or_default($data->{to_addr});
        $ts      = value_or_default($data->{timestamp});
        $content = value_or_default($data->{content});
    }

    for ($to_addr, $ts, $content) {
        if ($is_unicode) {
            # Un-Transform "u|foo|" -> u'foo'.  Anchored on the whole value so
            # that text containing "|" or newlines is restored intact.
            s/^u\|(.*)\|\z/u'$1'/s;
        }
        s|\n|\\n|g; # Escape newlines so they show up as '\n' in the output
    }

    print "'to_addr': $to_addr, 'timestamp': $ts, 'content': $content\n";
}

运行

cat logfile | perl filter.pl
'to_addr': 0007031975326, 'timestamp': 2016-04-14 17:11:46.348000, 'content': Communication error\n\n0>Back
'to_addr': u'0007031975326', 'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000), 'content': u'Communication error\n\n0>Back'
'to_addr': u'0007031975326', 'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000), 'content': u'Communication error\n\n0>Back'
'to_addr': 0007084023687, 'timestamp': unknown, 'content': Communication error\n\n0>Back
'to_addr': 0008081741472, 'timestamp': 2016-04-16 16:02:40.832000, 'content': Communication error\n\n0>Back
'to_addr': 0008081741472, 'timestamp': unknown, 'content': Communication error\n\n0>Back

Answer 2

每当输入中有 name = value 对时，最好先创建一个 name2value（名称到值）数组，然后按名称打印这些值。例如，使用 GNU awk（需要它来支持 split() 的第 4 个参数）：

$ cat tst.awk
# tst.awk (GNU awk: uses gensub(), \s and the 4-argument form of split()).
#
# For every input record, build a name -> value map (n2v) from the
# single-quoted 'name': value pairs, print the three wanted fields on one
# line, then print every pair found in that record, one per line.
BEGIN { OFS=", " }

# Blank records carry no pairs; skip them rather than print bare names.
NF == 0 { next }

{
    # Reset all per-record state.  Without clearing names/numNames the list
    # of names would keep growing and each record would re-print the names
    # collected from every earlier record.
    delete n2v
    delete names
    numNames = 0

    # Drop the trailing '}">' that closes a <Message payload="..."> wrapper.
    sub(/}">\s*$/,"")

    # Temporarily replace the quotes of u'...' values with RS (which cannot
    # occur inside a record) so that only the names stay single-quoted.
    $0 = gensub(/u'([^']+)'/,"u" RS "\\1" RS,"g")

    # s[i] receives each quoted name; f[i+1] receives the text that follows it.
    split($0,f,/'[^']+'/,s)
    for (i=1; i in s; i++) {
        gsub(RS,"'",f[i+1])                     # restore the value's quotes
        sub(/[,[:space:]`]*$/,"",f[i+1])        # trim the trailing separator
        n2v[s[i]] = f[i+1]
        names[++numNames] = s[i]
    }

    print nv("'to_addr'"), nv("'timestamp'"), nv("'content'")

    print ""
    for (nameNr=1; nameNr<=numNames; nameNr++) {
        print nv(names[nameNr])
    }
}

# Return the name immediately followed by its stored value text.
function nv(name) { return name n2v[name] }

$ awk -f tst.awk file
'to_addr': u'0007031975326', 'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000), 'content': u'Communication error\n\n0>Back'

'transport_name': u'123_abc_411'
'from_addr_type': None
'group': None
'from_addr': u'*123#'
'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000)
'helper_metadata': {}
'to_addr': u'0007031975326'
'to_addr_type': None
'session_id': u'1861570762'
'transport_metadata': {u'abc_mongolia_smpp': {u'session_id': u'1861570762', u'starCode': u'123', u'requestId': u'1534318080', u'phase': u'2', u'clientId': u'441', u'dcs': u'15'}}
'content': u'Communication error\n\n0>Back'
'session_event': u'resume'
'routing_metadata': {}
'message_version': u'20110921'
'transport_type': u'smpp'
'provider': u'abc_mongolia'
'in_reply_to': u'b613a5fc5c0c4b1b8e1108bb8bd7b946'
'message_type': u'user_message'
'message_id': u'1206f738-f0d2-4a8e-9beb-2efeaada77d9'

提取多个特定字段及其下一个字段

2 个答案: