Question

我最近写了一个简短的算法来计算python中的happy numbers。该程序允许您选择一个上限，它将确定它下面的所有快乐数字。为了进行速度比较，我决定将我所知道的算法从python直接翻译成c ++。

令人惊讶的是，c ++版本的运行速度明显慢于python版本。在发现前10,000个满意数字的执行时间之间进行准确的速度测试表明python程序平均在0.59秒内运行，而c ++版本平均在8.5秒内运行。

我将这个速度差异归结为这样一个事实：我必须在c ++版本中为部分计算编写辅助函数（例如确定元素是否在列表/数组/向量中），这些函数已经内置于python语言。

首先，这是否是这种荒谬的速度差异的真正原因，其次，如何更改c ++版本以比python版本更快地执行（在我看来应该如此）。

带有速度测试的两段代码位于：Python Version，C++ Version。谢谢你的帮助。

#include <iostream>
#include <vector>
#include <string>
#include <ctime>
#include <windows.h>

using namespace std;

// Forward declarations of the helpers that are defined after main().
// inVector: true when inQuestion occurs in known (vector passed by value).
bool inVector(int inQuestion, vector<int> known);
// sum: total of all elements of given (vector passed by value).
int sum(vector<int> given);
// pow: integer power computed by repeated multiplication.
int pow(int given, int power);
// calcMain: scans 0..upperBound for happy numbers.
void calcMain(int upperBound);

// Interactive timing driver: repeatedly prompts for an upper bound, runs
// calcMain() on it and prints the elapsed wall-clock time in seconds.
// The loop ends as soon as no further integer can be read from standard input.
int main()
{
    while(true)
    {
        int upperBound;
        cout << "Pick an upper bound: ";
        // Stop on EOF or malformed input; without this check the failed stream
        // would make the loop spin forever without ever reading a new bound.
        if(!(cin >> upperBound))
            break;
        long start, end;
        start = GetTickCount();   // tick count in milliseconds
        calcMain(upperBound);
        end = GetTickCount();
        double seconds = (double)(end-start) / 1000.0;
        cout << seconds << " seconds." << endl << endl;
    }
    return 0;
}

// Finds every happy number in [0, upperBound] and collects them in a local
// vector (the result is only used for timing, so nothing is returned).
//
// For each candidate the digit-square sum is iterated until a value repeats;
// the candidate is happy when the repeated value is 1.
//
// This deliberately stays close to the string-based original (digits are
// obtained through a text conversion), but removes its undefined behaviour:
// the buffer is large enough, it is released with delete[], and a digit is
// converted with arithmetic instead of calling atoi() on a single char.
void calcMain(int upperBound)
{
    vector<int> known;
    for(int i = 0; i <= upperBound; i++)
    {
        bool next = false;
        int current = i;
        vector<int> history;   // every digit-square sum seen for this i
        while(!next)
        {
            // 12 chars hold any 32-bit int in base 10: sign + 10 digits + NUL.
            char* buffer = new char[12];
            itoa(current, buffer, 10);
            string digits = buffer;
            delete[] buffer;   // array form must match new[]
            vector<int> squares;
            for(string::size_type j = 0; j < digits.size(); j++)
            {
                // current is never negative here, so every char is '0'..'9'.
                int digit = digits[j] - '0';
                int square = pow(digit, 2);
                squares.push_back(square);
            }
            int squaresum = sum(squares);
            current = squaresum;
            if(inVector(current, history))
            {
                // A repeated value means the chain has entered its cycle.
                next = true;
                if(current == 1)
                {
                    known.push_back(i);
                    //cout << i << "\t";
                }
            }
            history.push_back(current);
        }
    }
    //cout << "\n\n";
}

// Linear membership test: returns true when inQuestion equals at least one
// element of known, false otherwise (including for an empty vector).
// The vector is received by value, matching the forward declaration.
bool inVector(int inQuestion, vector<int> known)
{
    const vector<int>::size_type count = known.size();
    for(vector<int>::size_type idx = 0; idx < count; ++idx)
    {
        if(known[idx] == inQuestion)
            return true;
    }
    return false;
}

// Adds up all elements of given, front to back, and returns the total
// (0 for an empty vector). The vector is received by value, matching the
// forward declaration.
int sum(vector<int> given)
{
    int total = 0;
    for(vector<int>::size_type idx = 0; idx != given.size(); ++idx)
    {
        total += given[idx];
    }
    return total;
}

// Integer power by repeated multiplication: returns given raised to power.
// A power of 0 (or any non-positive power) yields 1, the empty product;
// the previous version returned given itself in that case.
int pow(int given, int power)
{
    int result = 1;
    for(int i = 0; i < power; i++)
        result *= given;
    return result;
}

#!/usr/bin/env python

import timeit

upperBound = 0

def calcMain():
    """Collect every happy number in 0..upperBound (module-level bound).

    For each candidate the sum of the squared decimal digits is iterated
    until a value repeats; the candidate is recorded when that repeated
    value is 1. The collected list is local and only serves the timing run.
    """
    happy_found = []
    for candidate in range(0, upperBound + 1):
        value = candidate
        seen = []
        while True:
            # Digits are taken from the decimal text of the current value.
            value = sum([pow(int(ch), 2) for ch in str(value)])
            repeated = value in seen
            seen.append(value)
            if repeated:
                if value == 1:
                    happy_found.append(candidate)
                    ##print candidate, "\t",
                break
    ##print "\nend"

# Interactive driver (Python 2 syntax: `print` is a statement here).
# Each round stores the entered bound in the module-level upperBound that
# calcMain() reads, then times exactly one call of calcMain().
# NOTE(review): in Python 2, input() evaluates the typed text as an
# expression -- acceptable for a local benchmark, unsafe for untrusted input.
while True:    
    upperBound = input("Pick an upper bound: ")
    result = timeit.Timer(calcMain).timeit(1)
    print result, "seconds.\n"

Answer 1

对于100000个元素，Python代码需要6.9秒，而C ++最初需要37秒。

我对您的代码进行了一些基本的优化，并设法使C ++代码比Python实现快100倍。它现在在0.06秒内完成100000个元素。这比原始C ++代码快617倍。

最重要的是在发布模式下进行编译，并进行所有优化。在调试模式下，此代码的速度要慢一些。

接下来，我将解释我所做的优化。

将所有向量声明移到循环之外;用clear（）操作替换它们，这比调用构造函数要快得多。
通过乘法：value * value替换对pow（value，2）的调用。
我没有使用方形向量并在其上调用sum，而是仅使用整数对这些值进行求和。
避免所有字符串操作，与整数操作相比，这些操作非常慢。例如，可以通过重复除以10并获取结果值的模数10来计算每个数字的平方，而不是将值转换为字符串，然后将每个字符转换回int。
避免所有矢量副本，首先通过传递引用替换传递值，最后完全消除辅助函数。
消除了一些临时变量。
我忘记了很多小细节。比较你的代码和我的并排，看看我做了什么。

有可能通过使用预先分配的数组而不是向量来优化代码，但这将是一项更多的工作，我将把它作为练习留给读者。：P

以下是优化代码：

#include <iostream>
#include <vector>
#include <string>
#include <ctime>
#include <algorithm>
#include <windows.h>

using namespace std;

// Forward declaration: appends every happy number in [0, upperBound] to known.
void calcMain(int upperBound, vector<int>& known);

// Interactive timing driver: prompts for an upper bound, times one call of
// calcMain(), prints the happy numbers found and then the elapsed seconds.
// The loop ends as soon as no further integer can be read from standard input.
int main()
{
    while(true)
    {
        vector<int> results;
        int upperBound;
        cout << "Pick an upper bound: ";
        // Stop on EOF or malformed input; without this check the failed stream
        // would make the loop spin forever without ever reading a new bound.
        if(!(cin >> upperBound))
            break;
        long start, end;
        start = GetTickCount();   // tick count in milliseconds
        calcMain(upperBound, results);
        end = GetTickCount();
        // Printing happens after the second tick so it is not part of the timing.
        for (size_t i = 0; i < results.size(); ++i) {
            cout << results[i] << ", ";
        }
        cout << endl;
        double seconds = (double)(end-start) / 1000.0;
        cout << seconds << " seconds." << endl << endl;
    }
    return 0;
}

// Appends every happy number in [0, upperBound] to known, in ascending order.
// known is not cleared first; a negative upperBound appends nothing.
//
// For each candidate the digit-square sum is iterated using integer
// arithmetic only, until a value repeats; the candidate is happy when the
// repeated value is 1.
void calcMain(int upperBound, vector<int>& known)
{
    vector<int> seen;   // reused across candidates to avoid reallocation
    for(int candidate = 0; candidate <= upperBound; ++candidate)
    {
        seen.clear();
        int value = candidate;
        bool repeated = false;
        while(!repeated)
        {
            // Sum of the squares of the decimal digits of value.
            int digitSquares = 0;
            for(int rest = value; rest > 0; rest /= 10)
            {
                const int digit = rest % 10;
                digitSquares += digit * digit;
            }
            value = digitSquares;

            repeated = find(seen.begin(), seen.end(), value) != seen.end();
            if(!repeated)
                seen.push_back(value);
        }
        if(value == 1)
            known.push_back(candidate);
    }
}

Answer 2

我在另一个单独的答案（a separate answer）中给出了一个新的、速度更快的版本，因此本答案已被弃用。

我重写了你的算法，只要它找到快乐或不快乐的数字就会进行缓存。我还尝试尽可能地将其设为pythonic，例如通过创建单独的函数digits()和happy()。很抱歉使用Python 3，但我也可以从中展示一些有用的东西。

此版本快得多：它只需 1.7s，而原始程序需要 18s，快了约 10 倍（好吧，我的 MacBook 又老又慢 :)）。

#!/usr/bin/env python3

from timeit import Timer
from itertools import count

print_numbers = False
upperBound = 10**5  # Default value, can be overidden by user.


def digits(x:'nonnegative number') -> "yields number's digits":
    """Yield the decimal digits of x, least significant first.

    Nothing is yielded for 0. Because this is a generator, the ValueError
    for a negative argument is raised when iteration starts.
    """
    if not (x >= 0):
        raise ValueError('Number should be nonnegative')
    remaining = x
    while remaining:
        remaining, last = divmod(remaining, 10)
        yield last


def happy(number, known = {1}, happies = {1}) -> 'True/None':
    '''Tell whether number is happy, caching everything learned on the way.

    The default arguments act as static caches shared by all calls:
    known holds every number already classified (happy or unhappy) and
    happies holds only the happy ones. Callers may pass their own sets;
    they are then responsible for keeping 1 in known and happies a subset
    of known (the check is left out because the subset test is expensive).

    Returns the membership test on happies for an already-known number,
    True for a newly found happy number, and None for a newly found
    unhappy one.

    '''
    if number in known:
        return number in happies

    # Follow the chain until it meets a classified number or loops on itself.
    trail = set()
    current = number
    while True:
        trail.add(current)
        current = sum(d**2 for d in digits(current))
        if current in known or current in trail:
            break

    # Every number on the trail shares the fate of the value it ended on.
    known.update(trail)
    if current not in happies:
        return None
    happies.update(trail)
    return True


def calcMain():
    """Build the set of happy numbers below the module-level upperBound.

    The set is printed only when the module-level print_numbers flag is set.
    """
    found = set()
    for candidate in range(upperBound):
        if happy(candidate):
            found.add(candidate)
    if print_numbers:
        print(found)


if __name__ == '__main__':
    # Ask for the bound; an empty reply keeps the default shown in the prompt
    # (the `or repr(upperBound)` branch feeds the default back through eval).
    # NOTE(review): eval() executes whatever the user types as Python code --
    # acceptable for a local benchmark, unsafe for untrusted input.
    upperBound = eval(
            input("Pick an upper bound [default {0}]: "
                    .format(upperBound)).strip()
            or repr(upperBound))
    # Time exactly one run of calcMain().
    result = Timer(calcMain).timeit(1)
    print ('This computation took {0} seconds'.format(result))

Answer 3

看起来你正在将值向量传递给其他函数。这将是一个显着的减速，因为程序实际上会在向量传递给函数之前制作完整的矢量副本。要解决此问题，请将常量引用传递给向量而不是副本。所以而不是：

int sum(vector<int> given)

使用：

int sum(const vector<int>& given)

执行此操作时，您将无法再使用vector :: iterator，因为它不是常量。你需要用vector :: const_iterator替换它。

您也可以传入非常量引用，但在这种情况下，您根本不需要修改参数。

Answer 4

我可以看到你有很多不必要的堆分配

例如：

while(!next)
        {
            char* buffer = new char[10];

这看起来并不是非常优化。因此，您可能希望预先分配数组并在循环中使用它。这是一种易于发现和完成的基本优化技术。它也可能变得一团糟，所以要小心。

你也在使用atoi（）函数，我真的不知道它是否真的被优化了。也许做模数10并得到数字可能更好（你必须衡量你，我没有测试这个）。

您进行线性搜索（inVector）的事实可能不好。用std :: set替换矢量数据结构可能会加快速度。 hash_set也可以做到这一点。

但我认为最糟糕的问题是字符串和该循环内堆上的东西的这种分配。这看起来不太好。我会先尝试那些地方。

Answer 5

这是我的第二个答案;对于值<= 10**6，缓存诸如平方和之类的内容：

        happy_list[sq_list[x%happy_base] + sq_list[x//happy_base]]

即，

该号码分为3位+ 3位
预先计算的表用于获取两个部分的平方和
添加了这两个结果
咨询预先计算的表格以获得数字的快乐：

我不认为Python版本可以比这更快（好吧，如果你扔掉旧版本，即try:开销，它会快10％）。

我认为这是一个优秀的问题，它确实显示了

必须快速的事情应该用C
然而，通常你不需要快速的东西（即使你需要程序运行一天，它会少于程序员优化它的总时间）。
用Python编写程序更容易，更快捷
但是对于某些问题，尤其是计算问题，C ++解决方案（如上所述）实际上比尝试优化Python程序更具可读性和美观性。

好的，在这里（第2版现在......）：

#!/usr/bin/env python3
'''Provides slower and faster versions of a function to compute happy numbers.

slow_happy() implements the algorithm as in the definition of happy
numbers (but also caches the results).

happy() uses the precomputed lists of sums of squares and happy numbers
to return result in just 3 list lookups and 3 arithmetic operations for
numbers less than 10**6; it falls back to slow_happy() for big numbers.

Utilities: digits() generator, my_timeit() context manager.

'''


from time import time  # For my_timeit.
from random import randint # For example with random number.

upperBound = 10**5  # Default value, can be overridden by user.


class my_timeit:
    '''Minimal timing context manager.

    The clock starts when the object is constructed; on leaving the
    with-block the elapsed seconds are substituted into message (a
    str.format template) and printed. Exceptions are not suppressed.
    '''

    def __init__(self, message):
        self.message, self.start = message, time()

    def __enter__(self):
        return self

    def __exit__(self, *data):
        elapsed = time() - self.start
        print(self.message.format(elapsed))


def digits(x:'nonnegative number') -> "yields number's digits":
    """Generate the decimal digits of x starting with the lowest one.

    Zero produces no digits. A negative argument raises ValueError once
    the generator is first advanced.
    """
    if not (x >= 0):
        raise ValueError('Number should be nonnegative')
    while x:
        x, digit = divmod(x, 10)
        yield digit


def slow_happy(number, known = {1}, happies = {1}) -> 'True/None':
    '''Tell whether number is happy by following its chain, with caching.

    The default arguments act as static caches shared by all calls:
    known holds every number already classified (happy or unhappy) and
    happies holds only the happy ones. Callers may pass their own sets;
    they must then keep {1} <= happies <= known themselves (the check is
    left out because the subset test is expensive).

    Returns the membership test on happies for an already-known number,
    True for a newly found happy number, and None for a newly found
    unhappy one.

    '''
    if number in known:
        return number in happies

    # Follow the chain until it meets a classified number or loops on itself.
    trail = set()
    current = number
    while True:
        trail.add(current)
        current = sum(d**2 for d in digits(current))
        if current in known or current in trail:
            break

    # Every number on the trail shares the fate of the value it ended on.
    known.update(trail)
    if current not in happies:
        return None
    happies.update(trail)
    return True


# This will define new happy() to be much faster ------------------------.

# Build the lookup tables once at import time and report how long it took.
with my_timeit('Preparation time was {0} seconds.\n'):

    LogAbsoluteUpperBound = 6 # The maximum possible number is 10**this.
    # happy_list[s] is slow_happy(s) for every digit-square sum s that a
    # number of LogAbsoluteUpperBound digits can produce (at most 81 per digit).
    happy_list = [slow_happy(x)
                  for x in range(81*LogAbsoluteUpperBound + 1)]
    # Numbers are split into a low part (x % happy_base) and a high part
    # (x // happy_base); with the value above, happy_base is 10**3.
    happy_base = 10**((LogAbsoluteUpperBound + 1)//2)
    # sq_list[v] is the digit-square sum of v for 0 <= v <= happy_base.
    sq_list = [sum(d**2 for d in digits(x))
               for x in range(happy_base + 1)]

    def happy(x):
        '''Tell if the number is happy, optimized for smaller numbers.

        This function works fast for numbers <= 10**LogAbsoluteUpperBound:
        the digit-square sums of the low and high parts are looked up in
        sq_list, added, and the total is looked up in happy_list.
        Returns whatever slow_happy() returned for that total.

        '''
        # NOTE(review): for negative x, x//happy_base is negative and Python
        # indexes sq_list from the end instead of raising -- callers should
        # pass nonnegative numbers only.
        try:
            return happy_list[sq_list[x%happy_base] + sq_list[x//happy_base]]
        except IndexError:
            # The high part is beyond the table: fall back to the slow path.
            return slow_happy(x)

# End of happy()'s redefinition -----------------------------------------.


def calcMain(print_numbers, upper_bound):
    """List the happy numbers in 0..upper_bound (inclusive), ascending.

    The list is printed only when print_numbers is true.
    """
    happies = list(filter(happy, range(upper_bound + 1)))
    if print_numbers:
        print(happies)


if __name__ == '__main__':
    while True:

        # An empty reply keeps the previous bound; 0 ends the session; a
        # negative number means "use its absolute value and print the list".
        # NOTE(review): eval() executes whatever the user types as Python
        # code -- acceptable for a local benchmark, unsafe for untrusted input.
        upperBound = eval(input(
            "Pick an upper bound [{0} default, 0 ends, negative number prints]: "
            .format(upperBound)).strip() or repr(upperBound))
        if not upperBound:
            break

        with my_timeit('This computation took {0} seconds.'):
            calcMain(upperBound < 0, abs(upperBound))

        # Draw random numbers until a happy one turns up (0 is not happy, so
        # at least one draw happens). Because the loop only exits on a happy
        # number, the message below always reports 'happy'.
        single = 0
        while not happy(single):
            single = randint(1, 10**12)
        print('FYI, {0} is {1}.\n'.format(single,
                    'happy' if happy(single) else 'unhappy')) 

    print('Nice to see you, goodbye!')

Answer 6

嗯，我也给了它一次性。不过，我没有测试甚至编译。

数值计划的一般规则：

永远不要将数字作为文本处理。这就是使较少的语言比Python慢的原因，所以如果你在C语言中这样做，程序将比Python慢。
如果可以避免使用数据结构，请不要使用它们。您正在构建一个数组只是为了添加数字。最好保持总计。
保持STL参考的副本打开，以便您可以使用它而不是编写自己的函数。

// Finds every happy number in [0, upperBound] and collects them in a local
// vector (nothing is returned; the commented-out output shows how to print).
//
// For each candidate the digit-square sum is iterated with integer
// arithmetic; each new value is appended to history, and the loop stops as
// soon as the newest value already occurs among the earlier entries.
void calcMain(int upperBound)
{
    vector<int> known;
    for(int i = 0; i <= upperBound; i++)
    {
        int current = i;
        vector<int> history;
        do
        {
            // Was `squaresum = 0` with no type and no semicolon, which did
            // not compile.
            int squaresum = 0;
            for ( ; current; current /= 10 )
            {
                int digit = current % 10;
                squaresum += digit * digit;
            }
            current = squaresum;
            history.push_back(current);
            // end() - 1 excludes the value that was just appended.
        } while ( ! count(history.begin(), history.end() - 1, current) );

        if(current == 1)
        {
            known.push_back(i);
            //cout << i << "\t";
        }

    }
    //cout << "\n\n";
}

Answer 7

通过查看我能够多快找到这些数字来获得关于这个问题的更多关闭，我写了一个Dr_Asik算法的多线程C ++实现。对于这种实现是多线程的事实，有两件事是很重要的。

线程越多并不一定执行得越快；根据要计算的数字数量，每种情况都有一个最合适的折中点（happy medium）。
如果比较使用一个线程运行的此版本与原始版本之间的时间，那么可能导致时间差异的唯一因素是启动线程的开销和可变系统性能问题。否则，算法是相同的。

此实现的代码（算法的所有功能归于Dr_Asik）为here。另外，我写了一些速度测试，并对每个测试进行了双重检查，以帮助支持这3点。

计算首个100,000,000个满意的数字：

原始版本 - 39.061 / 39.000（Dr_Asik 的原始实现）
1 个线程 - 39.000 / 39.079
2 个线程 - 19.750 / 19.890
10 个线程 - 11.872 / 11.888
30 个线程 - 10.764 / 10.827
50 个线程 - 10.624 / 10.561 <-
100 个线程 - 11.060 / 11.216
500 个线程 - 13.385 / 12.527

从这些结果来看，最合适的折中点大约是 50 个线程，上下浮动十个左右。

Answer 8

其他优化：通过使用数组并用循环索引直接访问（而不是在向量中搜索），再加上缓存先前计算过的平方和，以下代码（受 Asik 博士的回答启发，但可能并未充分优化）比原始 C++ 代码快 2445 倍，比 Python 代码快 400 倍。

#include <iostream>
#include <windows.h>
#include <vector>

// Appends every happy number in [0, upperBound] to known, in ascending order.
// known is not cleared first; a negative upperBound appends nothing.
//
// Two tables replace searching:
//   history[s] == i  means value s was already reached while testing i, so
//                    no per-candidate reset is needed;
//   cache[v]         is the digit-square sum of v (0 = not computed yet),
//                    kept for every v inside [0, upperBound].
//
// Fixes over the previous version: the tables are vectors (no leak), history
// starts at -1 instead of being read uninitialized, and history is always
// large enough -- for a one-digit bound the chain leaves the old 81-slot
// table (e.g. 29 -> 85).
void calcMain(int upperBound, std::vector<int>& known)
{
    if (upperBound < 0)
        return;   // empty range; also avoids a negative table size

    int numDigits = 0;
    for (int rest = upperBound; rest > 0; rest /= 10)
        numDigits++;

    // Largest digit-square sum of a candidate is 81 per digit. Every later
    // chain value stays at or below 243 (the digit-square sum of any number
    // up to 243 is at most 163), so 243 is the minimum safe table bound.
    int maxSlots = numDigits * 9 * 9;
    if (maxSlots < 243)
        maxSlots = 243;

    std::vector<int> history(maxSlots + 1, -1);
    std::vector<int> cache(static_cast<std::vector<int>::size_type>(upperBound) + 1, 0);

    for (int i = 0; i <= upperBound; i++)
    {
        int current = i;
        while (true)
        {
            int sum = 0;
            const bool inRange = current <= upperBound;
            if (inRange)
                sum = cache[current];

            if (sum == 0)
            {
                // Not cached (or current is 0, whose sum really is 0).
                for (int temp = current; temp > 0; temp /= 10)
                {
                    const int digit = temp % 10;
                    sum += digit * digit;
                }
                if (inRange)
                    cache[current] = sum;
            }

            current = sum;
            if (history[current] == i)
            {
                // The chain repeated: i is happy exactly when it repeats on 1.
                if (current == 1)
                    known.push_back(i);
                break;
            }
            history[current] = i;
        }
    }
}

// Interactive timing driver: prompts for an upper bound, times one call of
// calcMain(), prints the happy numbers found and then the elapsed seconds.
// The loop ends as soon as no further integer can be read from standard input.
int main()
{
    while(true)
    {
        int upperBound;
        std::vector<int> known;
        std::cout << "Pick an upper bound: ";
        // Stop on EOF or malformed input; without this check the failed stream
        // would make the loop spin forever without ever reading a new bound.
        if (!(std::cin >> upperBound))
            break;
        long start, end;
        start = GetTickCount();   // tick count in milliseconds
        calcMain(upperBound, known);
        end = GetTickCount();
        // Printing happens after the second tick so it is not part of the timing.
        for (size_t i = 0; i < known.size(); ++i) {
            std::cout << known[i] << ", ";
        }
        double seconds = (double)(end-start) / 1000.0;
        std::cout << std::endl << seconds << " seconds." << std::endl << std::endl;
    }
    return 0;
}

Answer 9

我不是C ++优化方面的专家，但我相信速度差异可能是因为Python列表在开始时预先分配了更多空间，而C ++向量必须重新分配，并且每次增长时都可能复制。

至于GMan关于find的评论，我相信Python“in”运算符也是线性搜索并且速度相同。

修改

另外我刚注意到你自己实现了 pow 函数。没有必要这样做，标准库的版本可能更快。

Answer 10

这是另一种依赖记忆已经探索过的数字的方法。我得到一个因子x4-5，这对于DrAsik的1000和1000000的代码是奇怪的稳定，我期望缓存在我们探索的数字越多时效率越高。否则，已经应用了相同类型的经典优化。顺便说一句，如果编译器接受NRVO （/ RNVO？我永远不会记住确切的术语）或rvalue引用，我们就不需要将该向量作为 out 参数传递。

注意：微观优化仍然是可能的恕我直言，而且缓存是天真的，因为它分配的内存比实际需要的多得多。

// Classification of a number during the happy-number search.
enum Status {
    never_seen,      // not visited yet (also the default for cache entries)
    being_explored,  // on the chain that is currently being followed
    happy,
    unhappy
};

// Printable names, indexed by Status value (same order as the enum).
char const* toString[] = { "never_seen", "being_explored", "happy", "unhappy" };


// Returns the sum of the squares of the decimal digits of i (0 for i == 0).
inline size_t sum_squares(size_t i) {
    size_t total = 0;
    for (size_t rest = i; rest != 0; rest /= 10) {
        const size_t d = rest % 10;
        total += d * d;
    }
    return total;
}

// Growable table mapping a number to its Status.
// Reading an index beyond the current size yields never_seen; writing to
// such an index first grows the table, filling the gap with never_seen.
struct Cache {
    // Starts with dim entries, all never_seen.
    Cache(size_t dim) : m_cache(dim, never_seen) {}

    // Records status for n, enlarging the table when n is out of range.
    void set(size_t n, Status status) {
        if (n >= m_cache.size()) {
            m_cache.resize(n+1, never_seen);
        }
        m_cache[n] = status;
        // std::cout << "(c[" << n << "]<-"<<toString[status] << ")";
    }

    // Read-only lookup; never grows the table.
    Status operator[](size_t n) const {
        return n < m_cache.size() ? m_cache[n] : never_seen;
    }

private:
    std::vector<Status> m_cache;
};

// Replaces the contents of happy_numbers with every happy number in
// [1, upper_bound], in ascending order.
//
// Each number's chain of digit-square sums is followed until it reaches 1 or
// a number whose status is already cached. Every number visited on the way is
// marked being_explored, so running into such a mark means the chain loops
// and is therefore unhappy. All numbers on the chain then receive the final
// verdict, which makes later lookups immediate.
void search_happy_lh(size_t upper_bound, std::vector<size_t> & happy_numbers)
{
    happy_numbers.clear();
    if (upper_bound == 0) {
        // Empty range: previously 1 was reported even though it exceeds the bound.
        return;
    }
    happy_numbers.reserve(upper_bound); // it doesn't improve much the performances

    Cache cache(upper_bound+1);
    std::vector<size_t> current_stack;   // numbers on the chain being followed

    cache.set(1,happy);
    happy_numbers.push_back(1);
    for (size_t i = 2; i<=upper_bound ; ++i) {
        // std::cout << "\r" << i << std::flush;
        current_stack.clear();
        size_t s= i;
        while ( s != 1 && cache[s]==never_seen)
        {
            current_stack.push_back(s);
            cache.set(s, being_explored);
            s = sum_squares(s);
            // std::cout << " - " << s << std::flush;
        }
        // Stopping on a being_explored or unhappy value means the chain never
        // reaches 1; otherwise it ended on 1 or on a known happy number.
        const Status update_with = (cache[s]==being_explored ||cache[s]==unhappy) ? unhappy : happy;
        // std::cout << " => " << s << ":" << toString[update_with] << std::endl;
        for (size_t j=0; j!=current_stack.size(); ++j) {
            cache.set(current_stack[j], update_with);
        }
        if (cache[i] == happy) {
            happy_numbers.push_back(i);
        }
    }
}

Answer 11

在无聊的时候偶然发现了这个页面并且认为我用js打高尔夫球。算法是我自己的，我没有彻底检查过我自己计算以外的任何事情（所以它可能是错的）。它计算出第一个1e7个满意的数字并将它们存储在h中。如果要更改它，请同时更改7。

// Golfed search for the happy numbers below m; results accumulate in h.
// m = exclusive limit, C = 7*81 (largest digit-square sum of a 7-digit number),
// h = happy numbers found (seeded with 1), U = sparse table of numbers marked
// unhappy (seeded with U[4]), n = candidate under test, w = current chain value.
m=1e7,C=7*81,h=[1],t=true,U=[,,,,t],n=w=2;
while(n<m){
// Replace w by the sum of the squares of its decimal digits.
z=w,s=0;while(z)y=z%10,s+=y*y,z=0|z/10;w=s;
// Chain hit a marked value: n is unhappy (recorded in U only while n<C); next n.
// Otherwise, once the chain value drops below n, n is pushed as happy; next n.
if(U[w]){if(n<C)U[n]=t;w=++n;}else if(w<n)h.push(n),w=++n;}

这将在控制台或浏览器中为您打印前1000个项目：

// Take at most the first 1000 entries of h for display.
o=h.slice(0,m>1e3?1e3:m);
// Without a `document` object call print(o); otherwise write the entries,
// one per line, through document.write.
(!this.document?print(o):document.load=document.write(o.join('\n')));

功能部分的155个字符，它似乎与Asik博士在firefox或v8上提供的速度快*（运行时间d8 happygolf时，系统上原始python程序的350-400倍速度） .js或js -a -j -p happygolf.js in spidermonkey）我将敬畏分析技能，任何人都可以弄清楚为什么这个算法做得很好而没有引用更长，评论，fortran版本。

我对它的速度感到好奇，所以我学会了fortran以获得相同算法的比较，如果有任何明显的新手错误，请善待它，这是我的第一个fortran程序。 http://pastebin.com/q9WFaP5C 它是静态记忆，所以为了公平对待其他人，它是在一个自编译的shell脚本中，如果你没有gcc / bash / etc剥离预处理器和bash的东西在顶部，手动设置宏并将其编译为fortran95。

即使你包括编译时间，它也会胜过大多数其他人。如果你没有，它的速度大约是原始python版本的3000-3500倍（并且扩展速度> C ++ *的40,000倍，尽管我没有运行任何版本（C ++程序）。

令人惊讶的是，我在fortran版本中尝试过的许多优化（包括一些类似于循环展开，由于效果和可读性小而我从粘贴版本中删除）对js版本是不利的。这个练习表明，现代的跟踪编译器是非常好的（在精心优化的静态内存fortran的7-10倍之内），如果你走开他们并且不尝试任何棘手的东西。摆脱他们的方式，并试图做一些棘手的事情最后，这是一个更好，更递归的js版本。

// Returns the sum of the squares of the decimal digits of x.
// Each pass adds the square of the last digit (x % 10) to the running
// total, then integer-divides x by 10; it repeats until x is 0.
function sumsq(x) {
  var total = 0;
  for (var rest = x; rest; rest = 0 | rest / 10) {
    var digit = rest % 10;
    total += digit * digit;
  }
  return total;
}
// Boolean cache for happy(), indexed by number.
// Seeded with the terminating happy number (1) and with 4, an unhappy
// number taken from the terminating sequence.
var H=[];
H[1] = true;
H[4] = false;
// Test if a number is happy, memoizing every answer in H.
// A cached answer is returned directly. Otherwise one round of sumsq is
// performed; the answer for x is the cached answer for that sum, or the
// result of recursing on it when the sum is not cached either.
function happy(x) {
  // Already classified: return whatever is in the cache.
  if (H[x] !== undefined) {
    return H[x];
  }
  var w = sumsq(x);
  // A number that maps to itself and is not cached (this happens for 0)
  // can never reach 1; without this guard happy(0) recursed forever.
  if (w === x) {
    return (H[x] = false);
  }
  // Set and return the cached value of the sum, or recurse when it is unknown.
  return (H[x] = H[w] !== undefined ? H[w] : happy(w));
}
// Main program loop: collect every happy number from 1 up to (but not
// including) 1e7 into hN, in ascending order.
var hN = [], i = 1;
while (i < 1e7) {
  if (happy(i)) {
    hN.push(i);
  }
  i++;
}

令人惊讶的是，即使它是相当高的级别，它几乎与spidermonkey中的命令式算法（优化开启）和v8中的关闭（1.2倍）一样好。

故事的道德我想，如果它很重要，花点时间考虑一下你的算法。此外，高级语言已经有很多开销（有时候还有自己的技巧来减少它），所以有时候做一些更直接的或者利用它们的高级功能也同样快。微优化也并不总是有帮助。

*除非我的python安装异常缓慢，否则直接时间有点无意义，因为这是第一代eee。时代是：
forts版12s，没有输出，1e8个满意的数字。
对于fortran版本40s，通过gzip到磁盘输出管道两个js版本都是8-12s。 1e7个满意的数字，没有完全优化的输出两个js版本1e7的10-100s，优化少/没有优化（取决于没有优化的定义，100s是eval（））没有输出

我有兴趣在真实的计算机上查看这些程序的时间。

Answer 12

这里有一些值得思考的问题：如果选择运行1979算法来查找2009计算机中的素数或1979年计算机上的2009算法，您会选择哪种算法？

古代硬件上的新算法将是一个巨大的优势选择。看看你的“助手”功能。

Answer 13

可能有很多优化：

（1）使用const引用

// Linear membership test on a vector passed by const reference (no copy):
// returns true when inQuestion equals at least one element of known.
bool inVector(int inQuestion, const vector<int>& known)
{
    vector<int>::const_iterator pos = known.begin();
    while(pos != known.end() && *pos != inQuestion)
        ++pos;
    return pos != known.end();
}

// Adds up all elements of given, front to back, and returns the total
// (0 for an empty vector). The vector is passed by const reference (no copy).
int sum(const vector<int>& given)
{
    int total = 0;
    for(vector<int>::size_type k = 0; k != given.size(); ++k)
        total += given[k];
    return total;
}

（2）使用倒计时循环

// Integer power by repeated multiplication using a count-down loop:
// returns given raised to power, and 1 when power is 0.
// A negative power also yields 1; the previous `while(power--)` test never
// saw zero in that case and kept multiplying until the counter wrapped.
int pow(int given, int power)
{
    int current = 1;
    while(power-- > 0)
        current *= given;
    return current;
}

或者，正如其他人所说，使用标准库代码。

（3）不要在不需要的地方分配缓冲区

        // Collect the square of every decimal digit of current without any
        // string buffer: digits are peeled off with % 10 and / 10.
        vector<int> squares;
        for (int temp = current; temp != 0; temp /= 10)
        {
            squares.push_back(pow(temp % 10, 2));
        }

Answer 14

通过与PotatoSwatter类似的优化，我有10000个数字的时间从1.063秒下降到0.062秒（除了我用原始的标准sprintf替换了itoa）。

使用所有内存优化（不按值传递容器 - 在C ++中，您必须明确决定是否需要副本或引用;移动从内部循环分配内存的操作;如果已经有数字一个char缓冲区，将它复制到std :: string等是什么意思）我把它降到了0.532。

剩下的时间来自使用％10来访问数字，而不是将数字转换为字符串。

我想可能会有另一个算法级别优化（你找到一个幸福数字时遇到的数字本身也是幸福的数字？）但是我不知道获得了多少（在那里没有那么多的快乐数字）第一名）这个优化也不在Python版本中。

顺便说一句，通过不使用字符串转换和列表到方形数字，我将Python版本从0.825秒降低到0.33。

Answer 15

这是一个C＃版本：

using System;
using System.Collections.Generic;
using System.Text;

namespace CSharp
{
  /// <summary>
  /// Console benchmark that finds the happy numbers below a user-supplied bound.
  /// </summary>
  class Program
  {
    /// <summary>
    /// Prompts for an upper bound, times one run of CalcHappyNumbers and
    /// prints the elapsed seconds. Repeats until standard input is exhausted.
    /// </summary>
    static void Main (string [] args)
    {
      while (true)
      {
        Console.Write ("Pick an upper bound: ");

        String
          input = Console.ReadLine ();

        // ReadLine returns null at end of input; stop instead of reporting
        // a parse error forever.
        if (input == null)
        {
          break;
        }

        uint
          upper_bound;

        if (uint.TryParse (input, out upper_bound))
        {
          // Stopwatch is meant for measuring elapsed time; DateTime.Now is a
          // wall-clock reading.
          System.Diagnostics.Stopwatch
            timer = System.Diagnostics.Stopwatch.StartNew ();

          CalcHappyNumbers (upper_bound);

          timer.Stop ();

          Console.WriteLine ("Time taken = " + timer.Elapsed.TotalSeconds + " seconds.");
        }
        else
        {
          Console.WriteLine ("Error in input, unable to parse '" + input + "'.");
        }
      }
    }

    /// <summary>Classification of a number during the search.</summary>
    enum State
    {
      Happy,
      Sad,
      Unknown   // currently being resolved (on the active chain)
    }

    /// <summary>
    /// Classifies every number from 2 up to (but not including) upper_bound.
    /// The happy ones end up as keys of a local dictionary; the commented-out
    /// code shows how to print them.
    /// </summary>
    static void CalcHappyNumbers (uint upper_bound)
    {
      SortedDictionary<uint, State>
        happy = new SortedDictionary<uint, State> ();

      SortedDictionary<uint, bool>
        happy_numbers = new SortedDictionary<uint, bool> ();

      happy [1] = State.Happy;
      happy_numbers [1] = true;

      for (uint current = 2 ; current < upper_bound ; ++current)
      {
        FindState (ref happy, ref happy_numbers, current);
      }

      //foreach (KeyValuePair<uint, bool> pair in happy_numbers)
      //{
      //  Console.Write (pair.Key.ToString () + ", ");
      //}

      //Console.WriteLine ("");
    }

    /// <summary>
    /// Returns the state of value, resolving it recursively through its
    /// digit-square sum when it is not cached yet. Every number resolved on
    /// the way is cached in happy, and happy ones are added to happy_numbers.
    /// </summary>
    static State FindState (ref SortedDictionary<uint, State> happy, ref SortedDictionary<uint,bool> happy_numbers, uint value)
    {
      State
        current_state;

      if (happy.TryGetValue (value, out current_state))
      {
        if (current_state == State.Unknown)
        {
          // The chain came back to a number that is still being resolved,
          // so it loops without reaching 1. Return Sad as well as storing it;
          // previously Unknown was returned and cached for the whole chain.
          happy [value] = current_state = State.Sad;
        }
      }
      else
      {
        // Mark as in progress so that a loop back to this value is detected.
        happy [value] = current_state = State.Unknown;

        uint
          new_value = 0;

        // Sum of the squares of the decimal digits of value.
        for (uint i = value ; i != 0 ; i /= 10)
        {
          uint
            lsd = i % 10;

          new_value += lsd * lsd;
        }

        if (new_value == 1)
        {
          current_state = State.Happy;
        }
        else
        {
          current_state = FindState (ref happy, ref happy_numbers, new_value);
        }

        if (current_state == State.Happy)
        {
          happy_numbers [value] = true;
        }

        happy [value] = current_state;
      }

      return current_state;
    }
  }
}

我将它与Dr_Asik的C ++代码进行了比较。对于100000的上限，C ++版本在大约2.9秒内运行，C＃版本在0.35秒内运行。两者都是使用Dev Studio 2005使用默认版本构建选项编译的，并且都是从命令提示符执行的。

Answer 16


#!/usr/bin/env python

import timeit

upperBound = 0

def calcMain():
    """Collect every happy number in 0..upperBound (module-level bound).

    Digits are split off arithmetically with divmod and the values seen for
    each candidate are kept in a set. The candidate is recorded when the
    first repeated value is 1. The result set is local (timing only).
    """
    happy_found = set()
    for candidate in xrange(0, upperBound + 1):
        value = candidate
        seen = set()
        while True:
            # Sum of the squares of the decimal digits of value.
            total = 0
            while value > 0:
                value, last = divmod(value, 10)
                total += last * last
            value = total
            if value in seen:
                if value == 1:
                    happy_found.add(candidate)
                break
            seen.add(value)

# Interactive driver (Python 2 syntax: `print` is a statement here).
# Each round stores the entered bound in the module-level upperBound that
# calcMain() reads, then times exactly one call of calcMain().
# NOTE(review): in Python 2, input() evaluates the typed text as an
# expression -- acceptable for a local benchmark, unsafe for untrusted input.
while True:
    upperBound = input("Pick an upper bound: ")
    result = timeit.Timer(calcMain).timeit(1)
    print result, "seconds.\n"

我对你原来的python代码示例进行了一些小改动，使得代码性能提高了16倍。我做出的改变将100,000箱从大约9.64秒提高到大约3.38秒。

主要的变化是把“对 10 取模”和累加的操作放进一个 while 循环里执行。我还做了另外几处改动，它们各自只把执行时间缩短了零点几秒。第一个小改动是把主 for 循环从 range 生成的列表改为 xrange 迭代器。第二个小改动是用 set 类代替 list 类来保存 known 和 history 两个变量。我还尝试了生成器（迭代器）推导式和预先计算平方值，但它们都对效率产生了负面影响。看起来我运行的 Python 版本较慢，或者我的处理器比其他一些回答者的要慢。如果有人把我的 Python 代码与同一算法的某个优化过的 C++ 版本做计时比较，我会对结果很感兴趣。我也尝试过使用 python 的 -O 和 -OO 优化选项，但它们的效果与预期相反。

Answer 17

为什么每个人都在c ++版本中使用矢量？查找时间是O（N）。

即使它不如python集有效，也可以使用std :: set。查找时间为O（log（N））。

Python和C ++之间不同寻常的速度差异

17 个答案: