假设我有一个std::tuple<T...>
,我希望能够高效地访问其第n个元素,其中n仅在运行时已知。由于类型T...
是异构的,我所能得到的只是void *
,这对我来说已经足够了。以下是我目前的实现:
// Returns a type-erased pointer to the idx-th element of tuple `t`, where idx
// is only known at run time. `Indexes...` supplies the element indices used
// to build the offset table.
template <size_t ... Indexes, class Tuple>
void * get_element_pointer(std::index_sequence<Indexes...>, Tuple & t, size_t idx) {
// Byte distance of every element from the start of the tuple. Because the
// table is a function-local static, it is initialised once, from whichever
// tuple instance is passed on the first call, and reused on later calls.
static size_t offsets[] = {(size_t)(void *)&std::get<Indexes>(t) - (size_t)(void *)(&t)...};
// Add the looked-up offset to the address of the tuple passed to this call.
// idx is not range-checked.
return (void *)((size_t)(void *)(&t) + offsets[idx]);
}
然后将其称为:
// Usage: std::index_sequence_for<T...> yields the indices 0 .. sizeof...(T)-1.
get_element_pointer(std::index_sequence_for<T...>{}, some_tuple, some_index);
这样做的要点是静态创建一个size_t
数组offsets
,其中包含每个元组元素的偏移列表。然后,在运行时,我可以查找偏移量并将其添加到传递的元组中。
我的解决方案有两个问题:
offsets
是在第一次调用此函数时创建的,并且是根据当时传入的那个元组实例计算出来的。我觉得这有点奇怪。我可以创建一个Tuple
类型的临时对象来计算偏移量,但该类型可能不可默认构造。或者,我可以将nullptr
转换为Tuple *
,但std::get<Indexes>(*(Tuple *)(nullptr))
显然是未定义行为(UB)。
(size_t)(void *)(&t)
和(void *)((size_t)(void *)(&t) + offsets[idx])
这些指针转换,是我能找到的唯一能让编译器不发出警告的写法。我知道在涉及虚函数等情况时,指针转换可能很微妙,而且非常关键,所以我担心自己遗漏了什么。您认为我的解决方案可以接受吗?您能想到一个更简单、指针转换更少的解决方案吗?
答案 0 :(得分:3)
看了解决方案之后,我把你对性能的担忧铭记于心,并决定看看我们是否能做得更好。
有趣的是,我尝试借助constexpr进行优化时,不同的编译器给出了不同的结果。
我将比较gcc 5.3和apple clang的输出:
这是我的解决方案:
#include <iostream>
#include <memory>
#include <tuple>
#include <utility>
// Type-erased accessor for one tuple element.
//   Tuple - the tuple type being inspected
//   Index - compile-time position of the element inside the tuple
//   t     - the tuple instance to look into
// Returns the address of the Index-th element of `t` as a void*.
template<class Tuple, size_t Index>
void* get_address(Tuple& t)
{
    auto& element = std::get<Index>(t);
    // std::addressof bypasses any operator& overloaded by the element type.
    return static_cast<void*>(std::addressof(element));
}
template <size_t ... Indexes, class Tuple>
constexpr void* get_element_pointer(Tuple & t,
size_t idx,
std::index_sequence<Indexes...>)
{
using function_type = void* (*)(Tuple&);
function_type constexpr ptrs[] =
{
&get_address<Tuple, Indexes>...
};
return ptrs[idx](t);
}
// Convenience overload: builds the full index sequence for Tuple and forwards
// to the three-argument overload.
//   t     - the tuple to look into
//   index - run-time element index
// NOTE(review): noinline presumably keeps this function visible as its own
// symbol in the disassembly - confirm before reusing it elsewhere.
template<class Tuple>
__attribute__((noinline))
constexpr
void * get_element_pointer(Tuple& t, size_t index)
{
    using all_indexes = std::make_index_sequence<std::tuple_size<Tuple>::value>;
    return get_element_pointer(t, index, all_indexes{});
}
// Demo driver: fills a ten-int tuple and prints the element at index 1
// through the type-erased pointer.
int main()
{
    auto x = std::make_tuple(4, 5, 6, 7, 8, 9, 10, 11, 12, 13);

    // The pointer comes back as void*, so it is cast to the known element type.
    void* const element = get_element_pointer(x, 1);
    std::cout << *static_cast<int*>(element) << std::endl;
}
(为清晰起见,使用-O2 -fomit-frame-pointer编译)
clang生成的代码就是:
__Z19get_element_pointerINSt3__15tupleIJiiiiiiiiiiEEEEPvRT_m:
.align 4, 0x90
leaq __ZZ19get_element_pointerIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9EENSt3__15tupleIJiiiiiiiiiiEEEEPvRT0_mNS0_16integer_sequenceImJXspT_EEEEE4ptrs(%rip), %rax
jmpq *(%rax,%rsi,8) ## TAILCALL
按预期方式引用编译时生成的跳转表:
__ZZ19get_element_pointerIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9EENSt3__15tupleIJiiiiiiiiiiEEEEPvRT0_mNS0_16integer_sequenceImJXspT_EEEEE4ptrs:
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm0EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm1EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm2EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm3EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm4EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm5EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm6EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm7EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm8EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm9EEPvRT_
其中每个访问器函数都非常简单(以下是其中一个示例):
__Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm2EEPvRT_:
leaq 8(%rdi), %rax
retq
这正是我预期编译器会生成的代码,也就是“如果让我手写机器码,我会怎么写”。
然而,gcc似乎错过了优化跳转表的机会,而是在使用之前先在内存中把它构建出来!
void* get_element_pointer<std::tuple<int, int, int, int, int, int, int, int, int, int> >(std::tuple<int, int, int, int, int, int, int, int, int, int>&, unsigned long):
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 0ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -88(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 1ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -80(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 2ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -72(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 3ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -64(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 4ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -56(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 5ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -48(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 6ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -40(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 7ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -32(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 8ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -24(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 9ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -16(%rsp)
movq -88(%rsp,%rsi,8), %rax
jmp *%rax
然后才去调用同样简单的访问器函数:
void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 3ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&):
leaq 24(%rdi), %rax
ret
这个结果并不理想,于是我想知道非constexpr实现中的常量折叠是否会做得更好:
// Non-constexpr variant: the accessor table is a function-local static const
// array instead of a local constexpr one.
//   t          - the tuple to look into
//   idx        - run-time element index; it is not range-checked
//   (unnamed)  - index sequence whose values select the accessors in the table
template <size_t ... Indexes, class Tuple>
void* get_element_pointer(Tuple & t,
                          size_t idx,
                          std::index_sequence<Indexes...>)
{
    // Every accessor instantiation shares this signature, so they fit in one array.
    using accessor_fn = void* (*)(Tuple&);

    // One get_address instantiation per index.
    static const accessor_fn ptrs[] = { &get_address<Tuple, Indexes>... };

    return (*ptrs[idx])(t);
}
事实证明确实如此——现在gcc生成的代码与clang为constexpr版本生成的代码相同:
void* get_element_pointer<std::tuple<int, int, int, int, int, int, int, int, int, int> >(std::tuple<int, int, int, int, int, int, int, int, int, int>&, unsigned long):
movq void* get_element_pointer<0ul, 1ul, 2ul, 3ul, 4ul, 5ul, 6ul, 7ul, 8ul, 9ul, std::tuple<int, int, int, int, int, int, int, int, int, int> >(std::tuple<int, int, int, int, int, int, int, int, int, int>&, unsigned long, std::integer_sequence<unsigned long, 0ul, 1ul, 2ul, 3ul, 4ul, 5ul, 6ul, 7ul, 8ul, 9ul>)::ptrs(,%rsi,8), %rax
jmp *%rax
clang做了什么?
__Z19get_element_pointerINSt3__15tupleIJiiiiiiiiiiEEEEPvRT_m:
movq __ZZ19get_element_pointerIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9EENSt3__15tupleIJiiiiiiiiiiEEEEPvRT0_mNS0_16integer_sequenceImJXspT_EEEEE4ptrs@GOTPCREL(%rip), %rax
jmpq *(%rax,%rsi,8) ## TAILCALL
幸运的是同样的结果。
所以这是最终的,可证明是最佳的解决方案:
// Type-erased accessor for one tuple element.
//   Tuple - the tuple type being inspected
//   Index - compile-time position of the element inside the tuple
//   t     - the tuple instance to look into
// Returns the address of the Index-th element of `t` as a void*.
template<class Tuple, size_t Index>
void* get_address(Tuple& t)
{
    auto& element = std::get<Index>(t);
    // std::addressof bypasses any operator& overloaded by the element type.
    return static_cast<void*>(std::addressof(element));
}
// Run-time indexed element access through a static table of per-index accessors.
//   t          - the tuple to look into
//   idx        - run-time element index; it is not range-checked
//   (unnamed)  - index sequence whose values select the accessors in the table
// Returns whatever the accessor stored at position idx returns for `t`.
template <size_t ... Indexes, class Tuple>
void* get_element_pointer(Tuple & t,
                          size_t idx,
                          std::index_sequence<Indexes...>)
{
    // Every accessor instantiation shares this signature, so they fit in one array.
    using accessor_fn = void* (*)(Tuple&);

    // One get_address instantiation per index.
    static const accessor_fn ptrs[] = { &get_address<Tuple, Indexes>... };

    return (*ptrs[idx])(t);
}
// Public entry point: returns a type-erased pointer to the element of `t` at
// run-time position `index`, by forwarding to the table-based overload with
// the full index sequence for Tuple.
//   t     - the tuple to look into
//   index - run-time element index; it is not range-checked here
// Not declared constexpr: the overload it forwards to keeps a function-local
// static table, so a call could never be evaluated at compile time anyway.
// NOTE(review): noinline presumably keeps this function visible as its own
// symbol in the disassembly - confirm before reusing it elsewhere.
template<class Tuple>
__attribute__((noinline))
void * get_element_pointer(Tuple& t, size_t index)
{
    return get_element_pointer(t,
                               index,
                               std::make_index_sequence<std::tuple_size<Tuple>::value>());
}
答案 1 :(得分:2)
为什么不呢:
// Returns a type-erased pointer to the idx-th element of `t`.
//   (unnamed)  - index sequence naming the elements whose addresses are collected
//   t          - the tuple to look into
//   idx        - run-time position in the collected address list; not range-checked
template <size_t ... Indexes, class Tuple>
void* get_element_pointer(std::index_sequence<Indexes...>, Tuple & t, size_t idx) {
    // Collect the address of every element on each call. std::addressof
    // bypasses any operator& overloaded by an element type.
    void* const element_addresses[] = {
        static_cast<void *>(std::addressof(std::get<Indexes>(t)))...
    };
    return element_addresses[idx];
}
请注意,我使用std::addressof
来处理重载了operator &
的恶意类。
至于您提到的警告,您应该将std::size_t
替换为std::intptr_t
和/或char*
:
static std::intptr_t offsets[] = {
reinterpret_cast<char *>(std::addresof(std::get<Indexes>(t)))
- reinterpret_cast<char *>(&t)...
};
static_cast<void *>(reinterpret_cast<char *>(&t) + offsets[idx]);
答案 2 :(得分:0)
就正确性而言,使用第一次传入的实例来计算偏移量在我看来不成问题。您说得对,如果想提前构造一个元组,默认可构造性确实是个问题,不过您同样可以将nullptr
强制转换为tuple*
并使用它。
另外,(void *)((size_t)(void *)(&t) + offsets[idx])
或许可以更简单地写成reinterpret_cast<char*>(&t) + offsets[idx]
。