首先,我使用以下代码从文件中提取操作码(opcode),并统计它们的 n-gram:
from collections import Counter
def n_gram_opcodes(source, n):
    """Count the opcode n-grams found in a text file.

    The file is split on whitespace; only tokens that exactly match one of
    the known opcode mnemonics are kept, and every run of ``n`` consecutive
    kept tokens is counted.

    Args:
        source: Path of the file to read.
        n: Length of each n-gram.

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of ``n`` opcode strings)
        to the number of times it occurs. It is empty when fewer than ``n``
        opcodes are found.
    """
    opcodes_of_interest = {
        "add", "call", "cmp", "mov", "jnz", "jmp", "jz", "lea", "pop", "push",
        "retn", "sub", "test", "xor",
    }
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(source) as handle:
        tokens = handle.read().split()
    opcodes = [token for token in tokens if token in opcodes_of_interest]
    # zip() over n progressively shifted views yields the sliding windows and
    # stops at the shortest view, so incomplete trailing windows are dropped.
    shifted_views = [opcodes[offset:] for offset in range(n)]
    return Counter(zip(*shifted_views))
该代码还会统计这些操作码 n-gram 在文件中出现的频率,并以字典(Counter)的形式存储结果,如下所示:
Counter({('mov', 'mov', 'mov'): 18, ('xor', 'mov', 'mov'): 6, ('mov', 'mov', 'pop'): 3, ('mov', 'mov', 'push'): 3, ('pop', 'mov', 'mov'): 3, ('mov', 'call', 'cmp'): 3, ('push', 'pop', 'mov'): 3, ('mov', 'add', 'mov'): 3, ('call', 'mov', 'call'): 3, ('mov', 'mov', 'xor'): 3, ('cmp', 'mov', 'cmp'): 2, ('pop', 'mov', 'add'): 2, ('mov', 'pop', 'mov'): 2, ('mov', 'cmp', 'sub'): 2, ('mov', 'mov', 'sub'): 2, ('mov', 'mov', 'call'): 2})
我想从上面这个字典中取出值(即出现频率),并把它们用于下面的对数似然(log-likelihood)公式。我的问题是:应该如何修改代码,使它能够从任意一个这样的字典中取值,并与下面的代码配合使用?最终结果应当返回数值,并使用 matplotlib 绘制图形。
import math
# Placeholder substituted for zero counts so that the division and the
# logarithm in opcode_llr never operate on an exact 0.
epsilon = 0.0001


def opcode_llr(opcode, freq_table_before, freq_table_after, weighted=False):
    '''
    Score an opcode as (f1 + f2 + f3) * log(t_b / t_a).

    f1, f2 and f3 are the number of times ``opcode`` appears in positions
    0, 1 and 2 of the trigrams in ``freq_table_after``; t_b and t_a are the
    sizes of the "before" and "after" tables. Every one of these quantities
    falls back to ``epsilon`` instead of 0.

    Args:
        opcode: A single opcode mnemonic, e.g., 'mov'
        freq_table_before: The frequency table for opcode trigrams *before*
            extraction.
        freq_table_after: The frequency table for opcode trigrams *after*
            extraction.
        weighted: When False (the default, and the historical behaviour),
            each distinct trigram counts once and the stored frequencies
            are ignored. When True, the stored frequencies are used: t_b
            and t_a become the sums of the table values, and each matching
            trigram contributes its frequency rather than 1.

    The keys for both tables are tuples of string. So, each is of the form
        {
            ('mov', 'mov', 'mov'): 5.0,
            ('mov', 'jmp', 'mov'): 7.0,
            ...
        }
    A ``collections.Counter`` of trigrams has exactly this shape.

    Returns:
        The score as a float.
    '''
    if weighted:
        t_b = sum(freq_table_before.values()) or epsilon
        t_a = sum(freq_table_after.values()) or epsilon
    else:
        t_b = len(freq_table_before) or epsilon
        t_a = len(freq_table_after) or epsilon

    # Compute the opcode counts when occurring in positions 0, 1, 2.
    opcode_counts = [epsilon, epsilon, epsilon]
    for triplet, freq in freq_table_after.items():
        increment = freq if weighted else 1
        for i, comp in enumerate(triplet):
            if comp == opcode:
                opcode_counts[i] += increment

    # float() keeps the ratio a true division on Python 2 as well.
    return sum(opcode_counts) * math.log(float(t_b) / t_a)
答案 0 :(得分:1)
以下是根据 Counter 计算 LLR(对数似然比)的一种通用方法。
from collections import Counter
import random
import math
def CntToLLR(cnt):
    """Convert a table of counts into per-key log-odds.

    For a key with count ``y`` out of ``n`` total samples the value is
    ``log(y) - log(n - y)``, i.e. the log of the odds of seeing that key
    versus any other key.

    Args:
        cnt: A ``Counter`` (or any mapping) from keys to counts.

    Returns:
        A dict with the same keys as ``cnt``. A key that holds every sample
        maps to ``inf`` and a key with a zero count maps to ``-inf``, rather
        than raising a math domain error. If the count and the remainder
        are both zero the value is ``nan``.

    Raises:
        ValueError: If a count, or the remainder ``n - y``, is negative.
    """
    total = sum(cnt.values())  # total number of samples
    neg_inf = float("-inf")
    llr = {}
    for key, count in cnt.items():
        rest = total - count
        # log(0) is taken as its limit, -inf, so the degenerate cases
        # (count == 0 or count == total) yield -inf / +inf.
        log_count = math.log(count) if count else neg_inf
        log_rest = math.log(rest) if rest else neg_inf
        llr[key] = log_count - log_rest
    return llr
# Populate a counter with 100 random draws from 0..9.
cnt = Counter(random.randrange(10) for _ in range(100))
llrs = CntToLLR(cnt)
# Convert the dictionary to a list of (key, value) pairs. items() exists on
# both Python 2 and Python 3, whereas iteritems() is Python 2 only.
llrs = list(llrs.items())