我正在尝试从文件中的行中提取字段和下一个匹配项,其中所有字段都经常更改顺序。例如,在这些行中,
if (reader.EndOfStream)
{
break;
}
我需要能够提取这些字段:
123_abc/previous-2016-04-16-022850/smpp_411_0.log.1.tar.gz:2016-04-14 18:11:46+0100 [WorkerAMQClient,client] Processed outbound message for 123_abc_411: {"transport_name": "123_abc_411", "from_addr_type": null, "group": null, "from_addr": "*123#", "timestamp": "2016-04-14 17:11:46.348000", "helper_metadata": {}, "to_addr": "0007031975326", "to_addr_type": null, "session_id": "1861570762", "transport_metadata": {"abc_mongolia_smpp": {"session_id": "1861570762", "starCode": "123", "requestId": "1534318080", "phase": "2", "clientId": "441", "dcs": "15"}}, "content": "Communication error\n\n0>Back", "session_event": "resume", "routing_metadata": {}, "message_version": "20110921", "transport_type": "smpp", "provider": "abc_mongolia", "in_reply_to": "b613a5fc5c0c4b1b8e1108bb8bd7b946", "message_type": "user_message", "message_id": "1206f738-f0d2-4a8e-9beb-2efeaada77d9"}
123_abc/previous-2016-04-16-022850/smpp_411_0.log.1.tar.gz:2016-04-14 18:11:46+0100 [WorkerAMQClient,client] Working with: <Message payload="{'transport_name': u'123_abc_411', 'from_addr_type': None, 'group': None, 'from_addr': u'*123#', 'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000), 'helper_metadata': {}, 'to_addr': u'0007031975326', 'to_addr_type': None, 'session_id': u'1861570762', 'transport_metadata': {u'abc_mongolia_smpp': {u'session_id': u'1861570762', u'starCode': u'123', u'requestId': u'1534318080', u'phase': u'2', u'clientId': u'441', u'dcs': u'15'}}, 'content': u'Communication error\n\n0>Back', 'session_event': u'resume', 'routing_metadata': {}, 'message_version': u'20110921', 'transport_type': u'smpp', 'provider': u'abc_mongolia', 'in_reply_to': u'b613a5fc5c0c4b1b8e1108bb8bd7b946', 'message_type': u'user_message', 'message_id': u'1206f738-f0d2-4a8e-9beb-2efeaada77d9'}">
123_abc/previous-2016-04-16-022850/smpp_411_0.log.1.tar.gz:2016-04-14 18:11:46+0100 [abracRedis,client] abcmongoliasmppTransport sending outbound message: <Message payload="{'transport_name': u'123_abc_411', 'from_addr_type': None, 'group': None, 'from_addr': u'*123#', 'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000), 'helper_metadata': {}, 'to_addr': u'0007031975326', 'to_addr_type': None, 'session_id': u'1861570762', 'transport_metadata': {u'abc_mongolia_smpp': {u'session_id': u'1861570762', u'starCode': u'123', u'requestId': u'1534318080', u'phase': u'2', u'clientId': u'441', u'dcs': u'15'}}, 'content': u'Communication error\n\n0>Back', 'session_event': u'resume', 'routing_metadata': {}, 'message_version': u'20110921', 'transport_type': u'smpp', 'provider': u'abc_mongolia', 'in_reply_to': u'b613a5fc5c0c4b1b8e1108bb8bd7b946', 'message_type': u'user_message', 'message_id': u'1206f738-f0d2-4a8e-9beb-2efeaada77d9'}">
123_dfg/smpp_37_0.log:2016-04-16 16:24:59+0100 [abracRedis,client] OUTGOING >> {'body': {'mandatory_parameters': {'priority_flag': 0, 'dest_addr_npi': 1, 'source_addr': '123', 'protocol_id': 0, 'replace_if_present_flag': 0, 'registered_delivery': True, 'dest_addr_ton': 1, 'source_addr_npi': 0, 'schedule_delivery_time': '', 'sm_default_msg_id': 0, 'sm_length': 0, 'esm_class': 0, 'data_coding': 0, 'service_type': '', 'source_addr_ton': 0, 'validity_period': '', 'destination_addr': '0007084023687', 'short_message': 'Communication error\n\n0>Back'}, 'optional_parameters': [{'length': 0, 'tag': 'smpp_service_op', 'value': '02'}, {'length': 0, 'tag': 'its_session_info', 'value': '3522'}]}, 'header': {'command_status': 'ESME_ROK', 'command_length': 0, 'sequence_number': 835674, 'command_id': 'submit_sm'}}
123_dfg/smpp_37_0.log:2016-04-16 17:02:40+0100 [WorkerAMQClient,client] Processed outbound message for 123_dfg_37: {"transport_name": "123_dfg_37", "from_addr_type": null, "group": null, "from_addr": "123", "timestamp": "2016-04-16 16:02:40.832000", "helper_metadata": {}, "to_addr": "0008081741472", "to_addr_type": null, "session_id": "d9dac229dec5499286890fa1c81aa16a", "transport_metadata": {"session_info": "5070"}, "content": "Communication error\n\n0>Back", "session_event": "resume", "routing_metadata": {}, "message_version": "20110921", "transport_type": "smpp", "provider": "123_dfg_37", "in_reply_to": "fd3e29028fb04f089fe764ba94d7d9af", "message_type": "user_message", "message_id": "b33737d5-39a7-478b-9680-31c54c15e3b2"}
123_dfg/smpp_37_0.log:2016-04-16 17:02:40+0100 [abracRedis,client] OUTGOING >> {'body': {'mandatory_parameters': {'priority_flag': 0, 'dest_addr_npi': 1, 'source_addr': '123', 'protocol_id': 0, 'replace_if_present_flag': 0, 'registered_delivery': True, 'dest_addr_ton': 1, 'source_addr_npi': 0, 'schedule_delivery_time': '', 'sm_default_msg_id': 0, 'sm_length': 0, 'esm_class': 0, 'data_coding': 0, 'service_type': '', 'source_addr_ton': 0, 'validity_period': '', 'destination_addr': '0008081741472', 'short_message': 'Communication error\n\n0>Back'}, 'optional_parameters': [{'length': 0, 'tag': 'smpp_service_op', 'value': '02'}, {'length': 0, 'tag': 'its_session_info', 'value': '5070'}]}, 'header': {'command_status': 'ESME_ROK', 'command_length': 0, 'sequence_number': 835793, 'command_id': 'submit_sm'}}
并把它们放在同一行输出,原始文件中的每一行对应一行输出。
我一直在尝试这种awk系列的不同变体:
'to_addr': u'0007031975326' (or 000
'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000),
'content': u'Communication error\n\n0>Back'
但始终没有成功。有人能告诉我如何用awk和sed做到这一点吗?
谢谢!
答案 0 :(得分:0)
这是一个perl过滤器,可以解决这个问题:
<strong>filter.pl</strong>
#!/usr/bin/env perl
# filter.pl - print the to_addr, timestamp and content found on each log line.
#
# Input is read line by line from the files named on the command line, or
# from STDIN.  The { ... } payload of each line is massaged into JSON (the
# payload may be real JSON or a Python dict repr with u'...' strings and
# datetime.datetime(...) values), decoded, and the three fields are printed
# on one output line.  A field that is missing or false is printed as
# DEFAULT.  Lines whose payload cannot be decoded are reported on STDERR and
# skipped instead of aborting the whole run.
use warnings;
use strict;
use JSON::XS;
use constant DEFAULT => 'unknown';

# Iterate over every line
while (my $line = <>) {
    next unless $line =~ m|\w+|;    # Skip empty lines

    # Remove everything that is not a part of the JSON string.
    # The literal braces are escaped: depending on the perl version an
    # unescaped '{' in a pattern draws a warning or is rejected outright.
    $line =~ s|^[^{]*\{|{|;
    $line =~ s|\}[^}]*$|}|;

    my $is_unicode = 0;             # Python style unicode strings
    if ($line =~ m|: u'\w+'|) {
        $is_unicode = 1;
        $line =~ s/(u)'([^']+)'/"$1|$2|"/g;    # Transform u'foo' -> "u|foo|"
    }

    $line =~ s|'|"|g;                                 # Key names need double quotes
    $line =~ s|: (\w+),|: "$1",|g;                    # quote unquoted constructs
    $line =~ s|(datetime.datetime\(.+?\)),|"$1",|;    # quote datetime

    # decode_json() dies on malformed input; trap that so a single bad line
    # (for example one without any { ... } payload) does not end the run.
    my $data = eval { decode_json($line) };
    if (ref($data) ne 'HASH') {
        my $reason = $@ || "payload is not a JSON object\n";
        warn "Skipping input line $.: $reason";
        next;
    }

    my ($to_addr, $ts, $content);
    if ($data->{body}) {
        # header/body format: no timestamp is looked up in this layout
        my $params = $data->{body}->{mandatory_parameters};
        $to_addr = $params->{destination_addr} || DEFAULT;
        $ts      = DEFAULT;
        $content = $params->{short_message}    || DEFAULT;
    }
    else {
        # Normal message
        $to_addr = $data->{to_addr}   || DEFAULT;
        $ts      = $data->{timestamp} || DEFAULT;
        $content = $data->{content}   || DEFAULT;
    }

    for ($to_addr, $ts, $content) {
        if ($is_unicode) {
            s/u\|([^|]+)\|/u'$1'/;    # Un-Transform "u|foo|" -> u'foo'
        }
        s|\n|\\n|g;    # Escape newlines so they show up as '\n' in the output
    }

    print "'to_addr': $to_addr, 'timestamp': $ts, 'content': $content\n";
}
运行
cat logfile | perl filter.pl
'to_addr': 0007031975326, 'timestamp': 2016-04-14 17:11:46.348000, 'content': Communication error\n\n0>Back
'to_addr': u'0007031975326', 'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000), 'content': u'Communication error\n\n0>Back'
'to_addr': u'0007031975326', 'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000), 'content': u'Communication error\n\n0>Back'
'to_addr': 0007084023687, 'timestamp': unknown, 'content': Communication error\n\n0>Back
'to_addr': 0008081741472, 'timestamp': 2016-04-16 16:02:40.832000, 'content': Communication error\n\n0>Back
'to_addr': 0008081741472, 'timestamp': unknown, 'content': Communication error\n\n0>Back
答案 1 :(得分:0)
每当输入中含有 name = value 形式的键值对时,最好先建立一个从名称到值的数组(name2value),然后按名称打印这些值。例如,使用GNU awk(需要其 split() 的第4个参数):
$ cat tst.awk
# tst.awk - map every quoted 'name' in a record to the text that follows it,
# then print selected values by name followed by the full name/value listing.
#
# Uses gensub(), the 4th (separators) argument of split() and \s, as the
# original does; the first two are GNU awk extensions.
BEGIN { OFS=", " }
{
    # Start every record from a clean slate.  names[] and numNames must be
    # reset along with n2v, otherwise the listing printed for the second and
    # later records repeats every name collected from the earlier records.
    delete n2v
    delete names
    numNames = 0

    # Drop a trailing }"> (the end of a <Message payload="{...}"> wrapper).
    sub(/}">\s*$/,"")

    # Hide the quotes of u'...' strings behind RS, which cannot occur inside
    # a record, so that only plainly quoted names match the split() regexp.
    $0 = gensub(/u'([^']+)'/,"u" RS "\\1" RS,"g")

    # s[i] is the i-th quoted name; f[i+1] is the text between it and the
    # next name (it starts with the ": " that follows the name).
    split($0,f,/'[^']+'/,s)
    for (i=1; i in s; i++) {
        gsub(RS,"'",f[i+1])                 # put the hidden quotes back
        sub(/[,[:space:]`]*$/,"",f[i+1])    # trim trailing separators
        n2v[s[i]] = f[i+1]
        names[++numNames] = s[i]
    }

    print nv("'to_addr'"), nv("'timestamp'"), nv("'content'")
    print ""
    for (nameNr=1; nameNr<=numNames; nameNr++) {
        print nv(names[nameNr])
    }
}

# Return the name immediately followed by the text recorded for it in n2v.
function nv(name) { return name n2v[name] }
$ awk -f tst.awk file
'to_addr': u'0007031975326', 'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000), 'content': u'Communication error\n\n0>Back'
'transport_name': u'123_abc_411'
'from_addr_type': None
'group': None
'from_addr': u'*123#'
'timestamp': datetime.datetime(2016, 4, 14, 17, 11, 46, 348000)
'helper_metadata': {}
'to_addr': u'0007031975326'
'to_addr_type': None
'session_id': u'1861570762'
'transport_metadata': {u'abc_mongolia_smpp': {u'session_id': u'1861570762', u'starCode': u'123', u'requestId': u'1534318080', u'phase': u'2', u'clientId': u'441', u'dcs': u'15'}}
'content': u'Communication error\n\n0>Back'
'session_event': u'resume'
'routing_metadata': {}
'message_version': u'20110921'
'transport_type': u'smpp'
'provider': u'abc_mongolia'
'in_reply_to': u'b613a5fc5c0c4b1b8e1108bb8bd7b946'
'message_type': u'user_message'
'message_id': u'1206f738-f0d2-4a8e-9beb-2efeaada77d9'