;-----------------------------------------------------------------------
; unsigned long popcount(int64_t *rdi, int64_t rsi)
;
; Sums the number of set bits over a buffer of 64-bit words.
;
; In:    rdi = pointer to the first qword of the buffer
;        rsi = buffer length in BYTES (signed); must be a multiple of 8.
;              A length that is not a multiple of 8 makes the final load
;              read past the end of the buffer.
;              A length <= 0 returns 0 without touching memory.
; Out:   rax = total count of 1 bits in the buffer
; Clobb: rdi, rsi, rdx, flags
; Needs: a CPU that implements the POPCNT instruction.
;
; Notes: The original author estimated this to be roughly 3x as fast as
;        the preceding version for large arrays on Intel
;        Sandybridge-family, and suggested that unrolling by 2 or so
;        would help further.
;        The end-pointer / negative-index arithmetic has been walked
;        through by hand (no off-by-one for 8- and 16-byte buffers), but
;        this routine is not covered by an automated test here.
;-----------------------------------------------------------------------
        ALIGN   16
        global  popcount
popcount:
        xor     eax, eax                ; total = 0 (writing eax zeroes all of rax)
        add     rdi, rsi                ; rdi = one past the end of the buffer
        neg     rsi                     ; rsi = -length: index counts up toward zero
        jge     .done                   ; length <= 0 (incl. INT64_MIN via OF): nothing to count

        ; do {
.cnt:
        mov     rdx, [rdi + rsi]        ; separate load: gives popcnt a fresh rdx, so it
        popcnt  rdx, rdx                ;   does not depend on rdx from the last iteration
        add     rax, rdx                ; total += bits in this qword
        add     rsi, 8                  ; index += 8
        jl      .cnt                    ; } while (index < 0)

.done:
        ret