Python parallel hash brute force

Date: 2013-11-25 16:40:36

Tags: python parallel-processing multiprocessing

I adapted this script from one written by another user on this site. I have done my best to understand it, but I am stuck. When I try to crack a hash whose charset is only lowercase letters, it works fine. But when I try to crack a hash containing both digits and letters, it does not work unless I raise the "spc" variable from 1000000 to 100000000.

import itertools
import math
import string
import multiprocessing
import hashlib
import traceback
import sys

def hashstring(string, algorithm):
    return hashlib.new(algorithm, string).hexdigest()

def gen_product(prefix, charset, length):
    for string in itertools.product(charset, repeat=length):
        yield prefix + "".join(string)

def string_generator(prefix, hash, suffix_length, length, charset, hashalg):
    num_done = 0
    if length <= suffix_length:
        assert prefix == ""
        for possible in gen_product("", charset, length):
            if hashstring(possible, hashalg) == hash:
                return possible
    else:
        assert len(prefix) + suffix_length == length
        for possible in gen_product(prefix, charset, suffix_length):
            if hashstring(possible, hashalg) == hash:
                return possible

    return None

def run_string_generator(*args):
    try:
        return string_generator(*args)
    except:
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))

def do_work(pool, hash, charset, length, hashalg, spc=100000000):
    n = len(charset)
    suffix_len = int(math.ceil(math.log(spc) / math.log(n)) - 1)

    max_short_len = min(suffix_len, length)
    for length in range(1, max_short_len + 1):
        result = pool.apply_async(run_string_generator, args = ("", hash, suffix_len, length, charset, hashalg))
        if result.get() != None:
            return result.get()
    for length in range(max_short_len + 1, length + 1):
        for prefix in gen_product("", charset, length - suffix_len):
            result = pool.apply_async(run_string_generator, args = (prefix, hash, suffix_len, length, charset, hashalg))    
            if result.get() != None:
                return result.get()

    return None


def parallel_bruteforce(hash, charset, length, hashalg="md5", spc=1000000, cores=None):
    pool = multiprocessing.Pool(cores)
    result = do_work(pool, hash, charset, length, hashalg, spc)

    pool.close()
    pool.join()

    return result

if __name__ == "__main__":
    print "Starting..."
    #The hash is an md5 encryption of "test1"
    print parallel_bruteforce("5a105e8b9d40e1329780d62ea2265d8a", string.ascii_lowercase +  string.digits, 5, spc=100000000)

Edit: the link to the other post whose code I started from is https://stackoverflow.com/a/20135250/1769995

3 Answers:

Answer 0 (score: 2):

Sorry in advance that I don't have time to explain this now. This is an edit of my earlier answer: it preserves parallelism and stops all the workers "early" if the hash is cracked. In general you don't want to keep passing arguments that never vary between calls, so I do more at module level than you did. The attached code prints:

workers will cycle through the last 3 chars
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
[etc]
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
the plaintext is 'test1'

Here it is. Note that it plays another low-level efficiency trick by materializing PAIRS, a list of all 2-character strings over the alphabet. Over long runs that again saves billions of redundant joins (a short sketch of the idea follows the listing).

import string
import hashlib
from itertools import product

CHARSET = string.ascii_lowercase +  string.digits
MAX_LENGTH = 5
NUM_PROCESSES = None # defaults to all available cores

HASHALG = "md5"
HASH = "5a105e8b9d40e1329780d62ea2265d8a"

PAIRS = ["".join(t) for t in product(CHARSET, repeat=2)]

def make_bases(count):
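    # count // 2 copies of PAIRS, plus one CHARSET in front when count is
    # odd, so that product(*bases) yields all strings of length `count`.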
    bases = [PAIRS] * (count // 2)
    if count & 1:
        bases.insert(0, CHARSET)
    return bases

# string_gen is what the workers run.  Everything else
# runs in the main program.
def string_gen(prefix, suffix_len, length):
    # Generate all strings of length `length` starting with `prefix`.
    # If length > suffix_len, only the last suffix_len characters
    # need to be generated.
    if length <= suffix_len:
        assert prefix == ""
        bases = make_bases(length)
    else:
        assert len(prefix) + suffix_len == length
        bases = make_bases(suffix_len)
    for t in product(*bases):
        result = prefix + "".join(t)
        # do something with result
        if hashlib.new(HASHALG, result).hexdigest() == HASH:
            return result

def record_done(result):
    global all_done, the_secret
    print ".",
    if result is not None:
        print
        the_secret = result
        all_done = True
        pool.close()
        pool.terminate() # stop all workers! we're done

def do_work(pool, strings_per_chunk=1000000):
    global all_done, the_secret
    all_done = False
    the_secret = None
    # What's the most chars we can cycle through without
    # exceeding strings_per_chunk?
    N = len(CHARSET)
    suffix_len = 1
    while N**suffix_len <= strings_per_chunk:
        suffix_len += 1
    suffix_len -= 1
    print "workers will cycle through the last", suffix_len, "chars"

    # There's no point to splitting up very short strings.
    max_short_len = min(suffix_len, MAX_LENGTH)
    for length in range(1, max_short_len + 1):
        pool.apply_async(string_gen, args=("", suffix_len, length),
                         callback=record_done)
        if all_done:
            return
    # And now the longer strings.
    for length in range(max_short_len + 1, MAX_LENGTH + 1):
        for t in product(*make_bases(length - suffix_len)):
            prefix = "".join(t)
            pool.apply_async(string_gen, args=(prefix, suffix_len, length),
                             callback=record_done)
            if all_done:
                return

if __name__ == "__main__":
    import multiprocessing
    pool = multiprocessing.Pool(NUM_PROCESSES)
    do_work(pool)
    pool.close()
    pool.join()
    if the_secret is None:
        print "didn't crack it!"
    else:
        print "the plaintext is", repr(the_secret)

Note: as written, that code is "too parallel" for larger problem sizes and/or very small values of strings_per_chunk. The main program can issue apply_async() calls far faster than the worker processes can deal with them, so the multiprocessing machinery can end up trying to queue billions of work items. Then you can run out of RAM, or other system resources. That's fixable too, of course ;-)

The fix

multiprocessing doesn't expose any way to throttle its internal queues, so "a natural" solution is to add a layer with our own queue. This keeps at most 3 pending tasks per processor on multiprocessing's internal task queue, but blocks the main program from generating more prefixes whenever its own queue grows longer than that. Logic to raise an EarlyExit exception when the hash is cracked has also been added; that's easier and cleaner than fiddling with global flags. What follows is intended to replace everything above from record_done() down:

class EarlyExit(Exception):
    def __init__(self, result):
        Exception.__init__(self)
        self.result = result

class CustomDispatcher:
    def __init__(self, pool):
        from collections import deque
        self.pool = pool
        self.q = deque()

    def queue_work(self, *args):
        while len(self.q) > NUM_PROCESSES * 3:
            # provided the workers have significant work to do,
            # it will "take a long time" to finish the work
            # already queued.  Rather than swamp the mp machinery
            # with even more pending tasks, wait for some to
            # finish first.
            self.unqueue()
        self.q.append(self.pool.apply_async(string_gen, args))

    def unqueue(self):
        if self.q:
            # note:  the main program spends most of its time
            # blocked on the .get(); that's because it can
            # generate prefixes far faster than workers can
            # process them
            result = self.q.popleft().get()
            print ".",
            if result is not None:
                print
                raise EarlyExit(result)

    def drain(self):
        while self.q:
            self.unqueue()

def do_work(dispatch, strings_per_chunk=10000000):
    # What's the most chars we can cycle through without
    # exceeding strings_per_chunk?
    N = len(CHARSET)
    suffix_len = 1
    while N**suffix_len <= strings_per_chunk:
        suffix_len += 1
    suffix_len -= 1
    print "workers will cycle through the last", suffix_len, "chars"
    print "so each dot represents", \
          format(len(CHARSET)**suffix_len, ","), "strings"

    # There's no point to splitting up very short strings.
    max_short_len = min(suffix_len, MAX_LENGTH)
    for length in range(1, max_short_len + 1):
        dispatch.queue_work("", suffix_len, length)
    # And now the longer strings.
    for length in range(max_short_len + 1, MAX_LENGTH + 1):
        for t in product(*make_bases(length - suffix_len)):
            dispatch.queue_work("".join(t), suffix_len, length)
    dispatch.drain()  # check remaining tasks for a winner

if __name__ == "__main__":
    import multiprocessing
    pool = multiprocessing.Pool(NUM_PROCESSES)
    dispatch = CustomDispatcher(pool)
    try:
        do_work(dispatch)
    except EarlyExit as e:
        print "the plaintext is", repr(e.result)
    else:
        print "didn't crack it!"
    pool.close()
    pool.terminate() # stop all workers! we're done
    pool.join()

As the size of the alphabet and/or the length of the longest string generated increases, the combinatorial explosion of possibilities may mean you'll wait forever for a result, but at least with this change you won't run out of RAM - and you'll be using all your cores at close to 100% capacity.
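
For a rough sense of that scale (illustrative arithmetic only, using the 36-character alphabet above):

print format(36 ** 5, ",")   # 60,466,176 strings of length 5
print format(36 ** 8, ",")   # 2,821,109,907,456 strings of length 8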

Answer 1 (score: 1):

Note that:

    result = pool.apply_async(run_string_generator, args = (prefix, hash, suffix_len, length, charset, hashalg))    
    if result.get() != None:
        return result.get()

destroys all parallelism: result.get() blocks until the worker process finishes that task, so at that point only one worker can ever be active. Did you notice that only one of your cores was busy?
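
One possible way to restore the parallelism (a sketch reusing the question's own functions and variables, not a drop-in replacement): submit the tasks for a given length first, and only collect the results afterwards, e.g. inside do_work:

results = [pool.apply_async(run_string_generator,
                            args=(prefix, hash, suffix_len, length, charset, hashalg))
           for prefix in gen_product("", charset, length - suffix_len)]
for r in results:
    found = r.get()   # the other queued tasks keep running on the other cores
    if found is not None:
        return found

Queuing everything eagerly like this is exactly what the other answer warns can swamp multiprocessing's internal queues, so for large prefix spaces some throttling (or a callback-based scheme) is still needed.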

Answer 2 (score: 0):

The first thing I notice is that spc could use a better name - if it were descriptive, we wouldn't have to reverse-engineer its purpose. It is passed from main via parallel_bruteforce to do_work, and is used in only one line:

suffix_len = int(math.ceil(math.log(spc) / math.log(n)) - 1)

This line hints that it has to do with a length, but in a rather convoluted mathematical way. n is the size of the charset. The use of logarithms is appropriate for calculating the length of string needed, over a particular set of digits, to represent a given number of distinct values. For example, log(50000)/log(16) is about 3.9, telling us we need 4 hex digits to count to 50000, which matches well with 0xffff == 65535. Dividing logarithms like this converts between logarithm bases; usually only natural and base-10 logarithms are implemented (log and log10 in Python), but log_x(n) = log(n)/log(x) regardless of the base of log.
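
A quick, illustrative check of that arithmetic:

import math
print math.log(50000) / math.log(16)                  # ~3.9
print int(math.ceil(math.log(50000) / math.log(16)))  # 4 hex digits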

Given that, we can see that spc most likely relates to the search space - the number of attempts to make. The way it is used here must mean the algorithm makes a specified number of attempts, rather than the number required for a particular width. If you have to raise the width, you need to raise spc to match: spc = len(charset)**width. This naturally exposes the exponential nature of brute forcing over wider charsets: it gets slow.
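
A small sketch of that sizing rule for the question's 5-character search over lowercase letters plus digits (illustrative only):

import string

charset = string.ascii_lowercase + string.digits  # 36 symbols
width = 5
spc = len(charset) ** width
print spc  # 60466176: larger than 1000000 but smaller than 100000000,
           # consistent with the behaviour described in the question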