que - uint32_t c++

Recuento de bits: preprocesador mágico vs moderno C++ (5)

¿Por qué no usar la biblioteca estándar?

#include <bitset> int bits_in(std::uint64_t u) { auto bs = std::bitset<64>(u); return bs.count(); }

ensamblador resultante (Compilado con -O2 -march=native ):

bits_in(unsigned long): xor eax, eax popcnt rax, rdi ret

Vale la pena mencionar en este punto que no todos los procesadores x86 tienen esta instrucción por lo que (al menos con gcc) tendrá que dejarle saber para qué arquitectura compilar.

@tambre mencionó que en realidad, cuando puede, el optimizador irá más allá:

volatile int results[3]; int main() { results[0] = bits_in(255); results[1] = bits_in(1023); results[2] = bits_in(0x8000800080008000); }

ensamblador resultante:

main: mov DWORD PTR results[rip], 8 xor eax, eax mov DWORD PTR results[rip+4], 10 mov DWORD PTR results[rip+8], 4 ret

La vieja escuela de los twiddlers como yo necesita encontrar nuevos problemas para resolver :)

Actualizar

No todos estaban felices de que la solución dependa de la ayuda de la CPU para calcular el conteo de bits. Entonces, ¿qué pasa si utilizamos una tabla autogenerada pero permitimos que el desarrollador configure el tamaño de la misma? (advertencia: tiempo de compilación largo para la versión de tabla de 16 bits)

#include <utility> #include <cstdint> #include <array> #include <numeric> #include <bitset> template<std::size_t word_size, std::size_t...Is> constexpr auto generate(std::integral_constant<std::size_t, word_size>, std::index_sequence<Is...>) { struct popcount_type { constexpr auto operator()(int i) const { int bits = 0; while (i) { i &= i - 1; ++bits; } return bits; } }; constexpr auto popcnt = popcount_type(); return std::array<int, sizeof...(Is)> { {popcnt(Is)...} }; } template<class T> constexpr auto power2(T x) { T result = 1; for (T i = 0; i < x; ++i) result *= 2; return result; } template<class TableWord> struct table { static constexpr auto word_size = std::numeric_limits<TableWord>::digits; static constexpr auto table_length = power2(word_size); using array_type = std::array<int, table_length>; static const array_type& get_data() { static const array_type data = generate(std::integral_constant<std::size_t, word_size>(), std::make_index_sequence<table_length>()); return data; }; }; template<class Word> struct use_table_word { }; template<class Word, class TableWord = std::uint8_t> int bits_in(Word val, use_table_word<TableWord> = use_table_word<TableWord>()) { constexpr auto table_word_size = std::numeric_limits<TableWord>::digits; constexpr auto word_size = std::numeric_limits<Word>::digits; constexpr auto times = word_size / table_word_size; static_assert(times > 0, "incompatible"); auto reduce = [val](auto times) { return (val >> (table_word_size * times)) & (power2(table_word_size) - 1); }; auto const& data = table<TableWord>::get_data(); auto result = 0; for (int i = 0; i < times; ++i) { result += data[reduce(i)]; } return result; } volatile int results[3]; #include <iostream> int main() { auto input = std::uint64_t(1023); results[0] = bits_in(input); results[0] = bits_in(input, use_table_word<std::uint16_t>()); results[1] = bits_in(0x8000800080008000); results[2] = bits_in(34567890); for (int i = 0; i < 3; ++i) { std::cout << results[i] << std::endl; } return 0; }

Actualización final

Esta versión permite el uso de cualquier número de bits en la tabla de búsqueda y admite cualquier tipo de entrada, incluso si es más pequeña que la cantidad de bits en la tabla de búsqueda.

También produce cortocircuitos si los bits altos son cero.

#include <utility> #include <cstdint> #include <array> #include <numeric> #include <algorithm> namespace detail { template<std::size_t bits, typename = void> struct smallest_word; template<std::size_t bits> struct smallest_word<bits, std::enable_if_t<(bits <= 8)>> { using type = std::uint8_t; }; template<std::size_t bits> struct smallest_word<bits, std::enable_if_t<(bits > 8 and bits <= 16)>> { using type = std::uint16_t; }; template<std::size_t bits> struct smallest_word<bits, std::enable_if_t<(bits > 16 and bits <= 32)>> { using type = std::uint32_t; }; template<std::size_t bits> struct smallest_word<bits, std::enable_if_t<(bits > 32 and bits <= 64)>> { using type = std::uint64_t; }; } template<std::size_t bits> using smallest_word = typename detail::smallest_word<bits>::type; template<class WordType, std::size_t bits, std::size_t...Is> constexpr auto generate(std::index_sequence<Is...>) { using word_type = WordType; struct popcount_type { constexpr auto operator()(word_type i) const { int result = 0; while (i) { i &= i - 1; ++result; } return result; } }; constexpr auto popcnt = popcount_type(); return std::array<word_type, sizeof...(Is)> { {popcnt(Is)...} }; } template<class T> constexpr auto power2(T x) { return T(1) << x; } template<std::size_t word_size> struct table { static constexpr auto table_length = power2(word_size); using word_type = smallest_word<word_size>; using array_type = std::array<word_type, table_length>; static const array_type& get_data() { static const array_type data = generate<word_type, word_size>(std::make_index_sequence<table_length>()); return data; }; template<class Type, std::size_t bits> static constexpr auto n_bits() { auto result = Type(); auto b = bits; while(b--) { result = (result << 1) | Type(1); } return result; }; template<class Uint> int operator()(Uint i) const { constexpr auto mask = n_bits<Uint, word_size>(); return get_data()[i & mask]; } }; template<int bits> struct use_bits { static constexpr auto digits = bits; }; template<class T> constexpr auto minimum(T x, T y) { return x < y ? x : y; } template<class Word, class UseBits = use_bits<8>> int bits_in(Word val, UseBits = UseBits()) { using word_type = std::make_unsigned_t<Word>; auto uval = static_cast<word_type>(val); constexpr auto table_word_size = UseBits::digits; constexpr auto word_size = std::numeric_limits<word_type>::digits; auto const& mytable = table<table_word_size>(); int result = 0; while (uval) { result += mytable(uval); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wshift-count-overflow" uval >>= minimum(table_word_size, word_size); #pragma clang diagnostic pop } return result; } volatile int results[4]; #include <iostream> int main() { auto input = std::uint8_t(127); results[0] = bits_in(input); results[1] = bits_in(input, use_bits<4>()); results[2] = bits_in(input, use_bits<11>()); results[3] = bits_in(input, use_bits<15>()); for (auto&& i : results) { std::cout << i << std::endl; } auto input2 = 0xabcdef; results[0] = bits_in(input2); results[1] = bits_in(input2, use_bits<4>()); results[2] = bits_in(input2, use_bits<11>()); results[3] = bits_in(input2, use_bits<15>()); for (auto&& i : results) { std::cout << i << std::endl; } auto input3 = -1; results[0] = bits_in(input3); results[1] = bits_in(input3, use_bits<4>()); results[2] = bits_in(input3, use_bits<11>()); results[3] = bits_in(input3, use_bits<15>()); for (auto&& i : results) { std::cout << i << std::endl; } return 0; }

ejemplo de salida:

7 7 7 7 17 17 17 17 32 32 32 32

La salida de ensamblaje resultante para una llamada a bits_in(int, use_bits<11>()) convierte en:

.L16: mov edx, edi and edx, 2047 movzx edx, WORD PTR table<11ul>::get_data()::data[rdx+rdx] add eax, edx shr edi, 11 jne .L16

Lo cual me parece razonable.

Supongamos que quiero crear una tabla de búsqueda de recuento de bits construida en tiempo de compilación para enteros de 64 bits en trozos de 16 bits. La única forma en que sé hacer esto es el siguiente código:

#define B4(n) n, n + 1, n + 1, n + 2 #define B6(n) B4(n), B4(n + 1), B4(n + 1), B4(n + 2) #define B8(n) B6(n), B6(n + 1), B6(n + 1), B6(n + 2) #define B10(n) B8(n), B8(n + 1), B8(n + 1), B8(n + 2) #define B12(n) B10(n), B10(n + 1), B10(n + 1), B10(n + 2) #define B14(n) B12(n), B12(n + 1), B12(n + 1), B12(n + 2) #define B16(n) B14(n), B14(n + 1), B14(n + 1), B14(n + 2) #define COUNT_BITS B16(0), B16(1), B16(1), B16(2) unsigned int lookup[65536] = { COUNT_BITS };

¿Existe una forma moderna (C ++ 11/14) de obtener el mismo resultado?

Con c ++ 17 puede usar constexpr para construir la tabla de búsqueda en tiempo de compilación. Con el cálculo del conteo de población, la tabla de búsqueda se puede construir de la siguiente manera:

#include <array> #include <cstdint> template<std::size_t N> constexpr std::array<std::uint16_t, N> make_lookup() { std::array<std::uint16_t, N> table {}; for(std::size_t i = 0; i < N; ++i) { std::uint16_t popcnt = i; popcnt = popcnt - ((popcnt >> 1) & 0x5555); popcnt = (popcnt & 0x3333) + ((popcnt >> 2) & 0x3333); popcnt = ((popcnt + (popcnt >> 4)) & 0x0F0F) * 0x0101; table[i] = popcnt >> 8; } return table; }

Uso de muestra:

auto lookup = make_lookup<65536>();

std::array::operator[] es constexpr desde c ++ 17 , con c ++ 14 el ejemplo anterior compila pero no será un verdadero constexpr .

Si desea castigar a su compilador, también puede inicializar el std::array resultante con plantillas variadic. Esta versión también funcionará con c ++ 14 e incluso con c ++ 11 usando el truco de índices .

#include <array> #include <cstdint> #include <utility> namespace detail { constexpr std::uint8_t popcnt_8(std::uint8_t i) { i = i - ((i >> 1) & 0x55); i = (i & 0x33) + ((i >> 2) & 0x33); return ((i + (i >> 4)) & 0x0F); } template<std::size_t... I> constexpr std::array<std::uint8_t, sizeof...(I)> make_lookup_impl(std::index_sequence<I...>) { return { popcnt_8(I)... }; } } /* detail */ template<std::size_t N> constexpr decltype(auto) make_lookup() { return detail::make_lookup_impl(std::make_index_sequence<N>{}); }

Nota: En el ejemplo anterior cambié a los enteros de 8 bits de los enteros de 16 bits.

Salida de ensamblaje

La versión de 8 bits creará solo 256 argumentos de plantilla para la función detail::make_lookup_impl lugar de 65536. Este último es demasiado y excederá la profundidad máxima de instanciación de la plantilla. Si tiene memoria virtual más que suficiente, puede aumentar este máximo con -ftemplate-depth=65536 indicador de compilación en GCC y volver a los enteros de 16 bits.

De todos modos, eche un vistazo a la siguiente demostración e intente cómo la versión de 8 bits cuenta los bits establecidos de un entero de 64 bits.

Demo en vivo

Esta es una solución C ++ 14, construida básicamente en torno al uso de constexpr :

// this struct is a primitive replacement of the std::array that // has no ''constexpr reference operator[]'' in C++14 template<int N> struct lookup_table { int table[N]; constexpr int& operator[](size_t i) { return table[i]; } constexpr const int& operator[](size_t i) const { return table[i]; } }; constexpr int bit_count(int i) { int bits = 0; while (i) { i &= i-1; ++bits; } return bits; } template<int N> constexpr lookup_table<N> generate() { lookup_table<N> table = {}; for (int i = 0; i < N; ++i) table[i] = bit_count(i); return table; } template<int I> struct Check { Check() { std::cout << I << "/n"; } }; constexpr auto table = generate<65536>(); int main() { // checks that they are evaluated at compile-time Check<table[5]>(); Check<table[65535]>(); return 0; }

Versión ejecutable: http://ideone.com/zQB86O

Uno más para la posteridad, creando una tabla de búsqueda usando una solución recursiva (de la profundidad de log (N)). Hace uso de constexpr-if y constexpr-array-operator [], entonces es mucho C ++ 17.

#include <array> template<size_t Target, size_t I = 1> constexpr auto make_table (std::array<int, I> in = {{ 0 }}) { if constexpr (I >= Target) { return in; } else { std::array<int, I * 2> out {{}}; for (size_t i = 0; i != I; ++i) { out[i] = in[i]; out[I + i] = in[i] + 1; } return make_table<Target> (out); } } constexpr auto population = make_table<65536> ();

Véalo compilar aquí: https://godbolt.org/g/RJG1JA

for (int x = 0; x < 65536; x++) { int mask = 1; int bitCount = 0; for (int y = 0; y < 16; y++) { if ((mask << y) & x) bitCount++; } lookup_1[x] = bitCount; }