C+сборка (nasm) 9,71ГиБ
Я познакомился с программой под названием простое сито из комментария, и был впечатлен тем, что он может так быстро генерировать простые числа. Каким-то странным образом само существование программы побудило меня к мысли, что я должен и могу создать программу, которая быстрее, чем .so
. I don't know why, but really как-то казалось, что это легкий противник, но это не так. За месяц цель создать более быструю программу, чем gp -qf ./file_name.gp
drained so much of my energy, but unfortunately, the best I could reach was about 0.8 times the speed of pari-gp
(1).
Ну, я все равно мог бы сделать быстрее, если бы просто скопировал тот же алгоритм. forprime(p=2, 10^16, print(p))
used, and apply some of optimization techniques I gathered while trying to beat it. Maybe I will do that later in time, but for now I feel that just re-implementing the same algorithm is not a fun task. Also, one of the reasons why I didn't use the same algorithm is because этот алгоритм довольно сложен, и поэтому его трудно реализовать эффективно. Я действительно думал, что более простой подход будет быстрее, но теперь понятно, почему #include <stdio.h>
#include <array>
// ~~ Algorithm Description ~~
// Since sqrt(max) = 100000000 = M, we should precompute primes until M,
// because all factors of N are always smaller or equal to sqrt(N).
// There are 5761455 primes smaller than M.
// After these maneuvers, assuming that `ull' is 8 bytes large, the binary
// will grow by approximately 46 megabytes.
// Notice that this solution largely makes use of the C++ compile-time features
// and the fact that the program's compile-time isn't timed nor the binary size
// isn't measured.
// BUILDING:
// g++ -O3 -std=c++20 pgen.cpp -o pgen -fconstexpr-ops-limit=1000000000000 -fconstexpr-loop-limit=1000000000
typedef unsigned long long ull;
template <std::size_t N>
constexpr std::array<ull, N> get_ptab() {
std::array<ull, N> ptab;
ull gen = 3;
ptab[0] = 2; ptab[1] = 3; ptab[2] = 5;
for(std::size_t i = 7; gen < N; i += 2) {
bool ok = true;
if(i % 3 == 0 || i % 5 == 0)
continue;
for(std::size_t j = 2; j * j <= i; j++) {
if(i % j == 0) {
ok = false;
break;
}
}
if(ok)
ptab[gen++] = i;
}
return ptab;
}
constexpr std::array<ull, 5761455> ptab = get_ptab<5761455>();
int main(void) {
// Step 1: print all the primes that we've hardcoded already.
for(std::size_t i = 0; i < ptab.size(); i++)
printf("%llu\n", ptab[i]);
// Step 2: starting with ptab.back(), go up until max, checking if
// N is divisible by any of the primes we've hardcoded.
// If it is, skip it. If it isn't, print it.
for(std::size_t i = ptab.back(); i < 10000000000000000; i+=2) {
bool ok = true;
for(std::size_t j = 0; j < ptab.size() && i <= ptab[j]; j++) {
if(i % ptab[j] == 0) {
ok = false;
break;
}
}
if(ok)
printf("%llu\n", i);
}
return 0;
}
took the hard route of implementing such a complex algorithm. It was obviously worth it.
В любом случае, более простой маршрут все равно отнял у меня больше месяца свободного времени и много бессонницы. В результате получается просто более медленная программа, чем public final class Main {
private static Object register;
static {
register = BigComplex.ZERO;
}
public static void main(String[] var0) {
ProgramStack stack = new ProgramStack(var0);
while(true) {
for(int var1 = 500; var1 != 0; --var1) {
if (RuntimeHelpers.truthValue(RuntimeMethods.isPrime(register = RuntimeMethods.increment(register)))) {
stack.push(register);
}
}
System.out.print(RuntimeMethods.joinByNewlines(JyxalList.create(stack)));
}
}
}
but it is still enough as a fast program here.
Я не делал никакой оптимизации ввода-вывода, а просто выполнял цикл MethodHandles
s. There are many ways to make the output faster, but I don't have the energy to do it now.
(1) Если вам интересно этот это альтернатива t
which actually counts primes like java -jar Jyxal.v.0.4.1.jar code.txt t
. Разрыв в скорости составляет около 10–30%, и он увеличивается с увеличением диапазона просеивания, поэтому мой выбор алгоритма был неправильным и {500(&›¥æߥ)W⁋₴ # Takes no input
{ # Infinite loop
500( ) # Repeat 500 times
&› # Increment the register
¥æ # And check it for primality
ß # If it is prime...
¥ # Push the number
W # Wrap the stack in a list
⁋ # Join by newlines
₴ # And then print the result
# Implicitly loops back
's was right.
prime.h
{500(&›¥æߥ)W⁋₴
main.c
#!/bin/sh -x
SRC="main.c sieve_7_13.o sieve_17_89.o sieve_93_.o lut.o -lm"
C="gcc"
O="-O3 -march=native"
$C $O -c sieve_7_13.c
$C $O -c sieve_17_89.c
nasm -felf64 sieve_93_.s
nasm -felf64 lut.s
$C $O -or $SRC
rm *.o
лут.с
%define MD30 -2004318071
%define MD30Q -8608480567731124087
%define f rdi
%define st rsi
%define pp r8
%define nsp ecx
%define c r9
%define c_d r9d
%define fc r9
%define p r10
%define p_d r10d
%define f_ r11
%define q r12
%define fp r13
%define psq rbx
%define d rbp
%define d_d ebp
%define md30 rsp
%define md30_d esp
%define w rbx
%define w_d ebx
%macro eb8 16
mov fp, f_
add fp, p
cmp fp, fc
jae _%1_0_
lea rax, [q + q * 2 + ]
lea rbx, [q + q * 4 + ]
lea rdx, [rax * 2 + ]
lea rbp, [q * 8 + ]
lea rsp, [q + q * 8 + ]
lea r14, [rax + q * 8 + ]
lea r15, [rdx + q * 8 + ]
jmp _%1_0_1
_%1_0_0:
mov fp, f_
add fp, p
cmp fp, fc
jae _%1_0_
_%1_0_1:
and byte [f_], ~(1 << %2)
and byte [f_ + rax], ~(1 << %3)
and byte [f_ + rbx], ~(1 << %4)
and byte [f_ + rdx], ~(1 << %5)
and byte [f_ + rbp], ~(1 << %6)
and byte [f_ + rsp], ~(1 << %7)
and byte [f_ + r14], ~(1 << %8)
and byte [f_ + r15], ~(1 << %9)
mov f_, fp
jmp _%1_0_0
%endmacro
%macro eb 3
and byte [f_], ~(1 << %2)
lea f_, [f_ + q * %1 + %3]
cmp f_, fc
jae next_
%endmacro
%macro eb_4 2
and byte [f_], ~(1 << %1)
lea rax, [q + q * 2 + %2]
add f_, rax
cmp f_, fc
jae next_
%endmacro
%macro eb0 2
eb_4 %1, %2
%endmacro
%macro eb1 2
eb 2, %1, %2
%endmacro
%macro eb2 2
eb 1, %1, %2
%endmacro
%macro eb3 2
eb 2, %1, %2
%endmacro
%macro eb4 2
eb 1, %1, %2
%endmacro
%macro eb5 2
eb 2, %1, %2
%endmacro
%macro eb6 2
eb_4 %1, %2
%endmacro
%macro eb7 2
eb 1, %1, %2
%endmacro
extern L1C
extern BC
extern PADD
extern WIX
align 64
L:
dq _1, 0, 0, _7, 0, _11, _13, 0, _17, _19, 0, _23, 0, 0, _29, 0
dq _1_0, _1_1, _1_2, _1_3, _1_4, _1_5, _1_6, _1_7
dq _7_0, _7_1, _7_2, _7_3, _7_4, _7_5, _7_6, _7_7
dq _11_0, _11_1, _11_2, _11_3, _11_4, _11_5, _11_6, _11_7
dq _13_0, _13_1, _13_2, _13_3, _13_4, _13_5, _13_6, _13_7
dq _17_0, _17_1, _17_2, _17_3, _17_4, _17_5, _17_6, _17_7
dq _19_0, _19_1, _19_2, _19_3, _19_4, _19_5, _19_6, _19_7
dq _23_0, _23_1, _23_2, _23_3, _23_4, _23_5, _23_6, _23_7
dq _29_0, _29_1, _29_2, _29_3, _29_4, _29_5, _29_6, _29_7
align 16
global sieve_93_
sieve_93_:
push rbx
push rbp
push r12
push r13
push r14
push r15
movq xmm0, rsp
mov r8, rdx
mov c_d, [rel L1C]
mov eax, [rel BC]
cmp nsp, -1
cmove c_d, eax
mov p_d, [rel pp]
mov psq, p
imul psq, p
mov d, psq
sub d, st
L0:
imul rax, c, 30
cmp d, rax
jge end
mov f_, f
mov md30_d, MD30
cmp psq, st
ja L00
mov rax, st
xor rdx, rdx
imul rbx, p, 30
div rbx
imul rax, 30
mov rbx, rax
mov eax, edx
shr rdx, 32
div p_d
movzx eax, byte [rax + PADD]
add rax, rbx
mov rbx, rax
imul rax, p
sub rax, st
imul rax, md30
shr rax, 36
cmp eax, c_d
jae next
add f_, rax
mov rax, MD30Q
mul rbx
shr rdx, 4
imul rdx, -30
add rdx, rbx
shr edx, 1
movzx w_d, byte [rdx + WIX]
mov eax, p_d
imul rax, md30
shr rax, 36
imul edx, eax, -30
add edx, p_d
shr edx, 1
L1:
shl eax, 1
mov q, rax
add c, f
jmp [rdx * 8 + L]
align 16
_1:
jmp [w * 8 + L + 16 * 8]
align 16
_1_0:
eb8 1, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0
_1_0_:
eb0 0, 0
_1_1:
eb1 1, 0
_1_2:
eb2 2, 0
_1_3:
eb3 3, 0
_1_4:
eb4 4, 0
_1_5:
eb5 5, 0
_1_6:
eb6 6, 0
_1_7:
eb7 7, 1
jmp _1_0
align 16
_7:
jmp [w * 8 + L + 24 * 8]
align 16
_7_0:
eb8 7, 1, 5, 4, 0, 7, 3, 2, 6, 1, 2, 1, 3, 4, 4, 3
_7_0_:
eb0 1, 1
_7_1:
eb1 5, 1
_7_2:
eb2 4, 1
_7_3:
eb3 0, 0
_7_4:
eb4 7, 1
_7_5:
eb5 3, 1
_7_6:
eb6 2, 1
_7_7:
eb7 6, 1
jmp _7_0
align 16
_11:
jmp [w * 8 + L + 32 * 8]
align 16
_11_0:
eb8 11, 2, 4, 0, 6, 1, 7, 3, 5, 2, 4, 0, 6, 6, 6, 6
_11_0_:
eb0 2, 2
_11_1:
eb1 4, 2
_11_2:
eb2 0, 0
_11_3:
eb3 6, 2
_11_4:
eb4 1, 0
_11_5:
eb5 7, 2
_11_6:
eb6 3, 2
_11_7:
eb7 5, 1
jmp _11_0
align 16
_13:
jmp [w * 8 + L + 40 * 8]
align 16
_13_0:
eb8 13, 3, 0, 6, 5, 2, 1, 7, 4, 3, 4, -1, 7, 8, 6, 7
_13_0_:
eb0 3, 3
_13_1:
eb1 0, 1
_13_2:
eb2 6, 1
_13_3:
eb3 5, 2
_13_4:
eb4 2, 1
_13_5:
eb5 1, 1
_13_6:
eb6 7, 3
_13_7:
eb7 4, 1
jmp _13_0
align 16
_17:
jmp [w * 8 + L + 48 * 8]
align 16
_17_0:
eb8 17, 4, 7, 1, 2, 5, 6, 0, 3, 3, 6, 1, 9, 10, 10, 9
_17_0_:
eb0 4, 3
_17_1:
eb1 7, 3
_17_2:
eb2 1, 1
_17_3:
eb3 2, 2
_17_4:
eb4 5, 1
_17_5:
eb5 6, 3
_17_6:
eb6 0, 3
_17_7:
eb7 3, 1
jmp _17_0
align 16
_19:
jmp [w * 8 + L + 56 * 8]
align 16
_19_0:
eb8 19, 5, 3, 7, 1, 6, 0, 4, 2, 4, 6, 0, 10, 12, 10, 10
_19_0_:
eb0 5, 4
_19_1:
eb1 3, 2
_19_2:
eb2 7, 2
_19_3:
eb3 1, 2
_19_4:
eb4 6, 2
_19_5:
eb5 0, 2
_19_6:
eb6 4, 4
_19_7:
eb7 2, 1
jmp _19_0
align 16
_23:
jmp [w * 8 + L + 64 * 8]
align 16
_23_0:
eb8 23, 6, 2, 3, 7, 0 ,4, 5, 1, 5, 8, -1, 13, 14, 12, 13
_23_0_:
eb0 6, 5
_23_1:
eb1 2, 3
_23_2:
eb2 3, 1
_23_3:
eb3 7, 4
_23_4:
eb4 0, 1
_23_5:
eb5 4, 3
_23_6:
eb6 5, 5
_23_7:
eb7 1, 1
jmp _23_0
align 16
_29:
jmp [w * 8 + L + 72 * 8]
align 16
_29_0:
eb8 29, 7, 6, 5, 4, 3, 2, 1, 0, 6, 10, 0, 16, 18, 16, 16
_29_0_:
eb0 7, 6
_29_1:
eb1 6, 4
_29_2:
eb2 5, 2
_29_3:
eb3 4, 4
_29_4:
eb4 3, 2
_29_5:
eb5 2, 4
_29_6:
eb6 1, 6
_29_7:
eb7 0, 1
jmp _29_0
align 16
next_:
sub c, f
next:
dec nsp
jz end
add pp, 4
mov p_d, [rel pp]
mov psq, p
imul psq, p
mov d, psq
sub d, st
jmp L0
align 16
L00:
mov eax, d_d
imul rax, md30
shr rax, 36
add f_, rax
mov eax, p_d
imul rax, md30
shr rax, 36
imul edx, eax, -30
add edx, p_d
shr edx, 1
movzx w_d, byte [rdx + WIX]
jmp L1
align 16
end:
movq rsp, xmm0
pop r15
pop r14
pop r13
pop r12
pop rbp
pop rbx
ret
sieve_7_13.c
#include "prime.h"
#define eb8(c0, c1, c2, c3, c4, c5, c6, c7, d1, d2, d3, d4, d5, d6, d7) do {\
*f_ &= ~(1 << c0);\
f_[q * 3 + d1] &= ~(1 << c1);\
f_[q * 5 + d2] &= ~(1 << c2);\
f_[q * 6 + d3] &= ~(1 << c3);\
f_[q * 8 + d4] &= ~(1 << c4);\
f_[q * 9 + d5] &= ~(1 << c5);\
f_[q * 11 + d6] &= ~(1 << c6);\
f_[q * 14 + d7] &= ~(1 << c7);\
} while (0)
#define eb(m, i, a) do {\
*f_ &= ~(1 << i);\
if ((f_ += q * m + a) >= f + c) return;\
} while (0)
#define eb0(i, a) eb(3, i, a)
#define eb1(i, a) eb(2, i, a)
#define eb2(i, a) eb(1, i, a)
#define eb3(i, a) eb(2, i, a)
#define eb4(i, a) eb(1, i, a)
#define eb5(i, a) eb(2, i, a)
#define eb6(i, a) eb(3, i, a)
#define eb7(i, a) eb(1, i, a)
static void sieve(unsigned char *f, uint64_t st, unsigned p) {
unsigned c = L1C;
uint64_t psq = sq(p);
int64_t d = psq - st;
unsigned char *f_ = f;
unsigned w;
if (__builtin_expect(psq > st, 0)) {
f_ += (unsigned)d / 30;
w = WIX[p % 30 / 2];
} else {
uint64_t p30 = (uint64_t)p * 30;
uint64_t n = st / p30 * 30 + PADD[st % p30 / p];
f_ += (unsigned)(p * n - st) / 30;
w = WIX[n % 30 / 2];
}
unsigned q = p / 30 * 2;
switch (p % 30) {
case 1:
switch (w) {
for (;;) {
case 0:
for (; f_ + p < f + c; f_ += p) {
eb8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0);
}
eb0(0, 0);
case 1:
eb1(1, 0);
case 2:
eb2(2, 0);
case 3:
eb3(3, 0);
case 4:
eb4(4, 0);
case 5:
eb5(5, 0);
case 6:
eb6(6, 0);
case 7:
eb7(7, 1);
}
default:
__builtin_unreachable();
}
case 7:
switch (w) {
for (;;) {
case 0:
for (; f_ + p < f + c; f_ += p) {
eb8(1, 5, 4, 0, 7, 3, 2, 6, 1, 2, 3, 3, 4, 5, 6);
}
eb0(1, 1);
case 1:
eb1(5, 1);
case 2:
eb2(4, 1);
case 3:
eb3(0, 0);
case 4:
eb4(7, 1);
case 5:
eb5(3, 1);
case 6:
eb6(2, 1);
case 7:
eb7(6, 1);
}
default:
__builtin_unreachable();
}
case 11:
switch (w) {
for (;;) {
case 0:
for (; f_ + p < f + c; f_ += p) {
eb8(2, 4, 0, 6, 1, 7, 3, 5, 2, 4, 4, 6, 6, 8, 10);
}
eb0(2, 2);
case 1:
eb1(4, 2);
case 2:
eb2(0, 0);
case 3:
eb3(6, 2);
case 4:
eb4(1, 0);
case 5:
eb5(7, 2);
case 6:
eb6(3, 2);
case 7:
eb7(5, 1);
}
default:
__builtin_unreachable();
}
case 13:
switch (w) {
for (;;) {
case 0:
for (; f_ + p < f + c; f_ += p) {
eb8(3, 0, 6, 5, 2, 1, 7, 4, 3, 4, 5, 7, 8, 9, 12);
}
eb0(3, 3);
case 1:
eb1(0, 1);
case 2:
eb2(6, 1);
case 3:
eb3(5, 2);
case 4:
eb4(2, 1);
case 5:
eb5(1, 1);
case 6:
eb6(7, 3);
case 7:
eb7(4, 1);
}
default:
__builtin_unreachable();
}
case 17:
switch (w) {
for (;;) {
case 0:
for (; f_ + p < f + c; f_ += p) {
eb8(4, 7, 1, 2, 5, 6, 0, 3, 3, 6, 7, 9, 10, 13, 16);
}
eb0(4, 3);
case 1:
eb1(7, 3);
case 2:
eb2(1, 1);
case 3:
eb3(2, 2);
case 4:
eb4(5, 1);
case 5:
eb5(6, 3);
case 6:
eb6(0, 3);
case 7:
eb7(3, 1);
}
default:
__builtin_unreachable();
}
case 19:
switch (w) {
for (;;) {
case 0:
for (; f_ + p < f + c; f_ += p) {
eb8(5, 3, 7, 1, 6, 0, 4, 2, 4, 6, 8, 10, 12, 14, 18);
}
eb0(5, 4);
case 1:
eb1(3, 2);
case 2:
eb2(7, 2);
case 3:
eb3(1, 2);
case 4:
eb4(6, 2);
case 5:
eb5(0, 2);
case 6:
eb6(4, 4);
case 7:
eb7(2, 1);
}
default:
__builtin_unreachable();
}
case 23:
switch (w) {
for (;;) {
case 0:
for (; f_ + p < f + c; f_ += p) {
eb8(6, 2, 3, 7, 0, 4, 5, 1, 5, 8, 9, 13, 14, 17, 22);
}
eb0(6, 5);
case 1:
eb1(2, 3);
case 2:
eb2(3, 1);
case 3:
eb3(7, 4);
case 4:
eb4(0, 1);
case 5:
eb5(4, 3);
case 6:
eb6(5, 5);
case 7:
eb7(1, 1);
}
default:
__builtin_unreachable();
}
case 29:
switch (w) {
for (;;) {
case 0:
for (; f_ + p < f + c; f_ += p) {
eb8(7, 6, 5, 4, 3, 2, 1, 0, 6, 10, 12, 16, 18, 22, 28);
}
eb0(7, 6);
case 1:
eb1(6, 4);
case 2:
eb2(5, 2);
case 3:
eb3(4, 4);
case 4:
eb4(3, 2);
case 5:
eb5(2, 4);
case 6:
eb6(1, 6);
case 7:
eb7(0, 1);
}
default:
__builtin_unreachable();
}
default:
__builtin_unreachable();
}
}
void sieve_17(unsigned char *f, uint64_t st) {
sieve(f, st, 17);
}
void sieve_19(unsigned char *f, uint64_t st) {
sieve(f, st, 19);
}
void sieve_23(unsigned char *f, uint64_t st) {
sieve(f, st, 23);
}
void sieve_29(unsigned char *f, uint64_t st) {
sieve(f, st, 29);
}
void sieve_31(unsigned char *f, uint64_t st) {
sieve(f, st, 31);
}
void sieve_37(unsigned char *f, uint64_t st) {
sieve(f, st, 37);
}
void sieve_41(unsigned char *f, uint64_t st) {
sieve(f, st, 41);
}
void sieve_43(unsigned char *f, uint64_t st) {
sieve(f, st, 43);
}
void sieve_47(unsigned char *f, uint64_t st) {
sieve(f, st, 47);
}
void sieve_53(unsigned char *f, uint64_t st) {
sieve(f, st, 53);
}
void sieve_59(unsigned char *f, uint64_t st) {
sieve(f, st, 59);
}
void sieve_61(unsigned char *f, uint64_t st) {
sieve(f, st, 61);
}
void sieve_67(unsigned char *f, uint64_t st) {
sieve(f, st, 67);
}
void sieve_71(unsigned char *f, uint64_t st) {
sieve(f, st, 71);
}
void sieve_73(unsigned char *f, uint64_t st) {
sieve(f, st, 73);
}
void sieve_79(unsigned char *f, uint64_t st) {
sieve(f, st, 79);
}
void sieve_83(unsigned char *f, uint64_t st) {
sieve(f, st, 83);
}
void sieve_89(unsigned char *f, uint64_t st) {
sieve(f, st, 89);
}
sieve_17_89.c
#include "prime.h"
#define am7(_0, _1, _2, _3, _4, _5, _6) do {\
m0 = M7[_0]; m1 = M7[_1]; m2 = M7[_2]; m3 = M7[_3]; m4 = M7[_4];\
m5 = M7[_5]; m6 = M7[_6];\
} while (0)
#define am11(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10) do {\
m0 = M11[_0]; m1 = M11[_1]; m2 = M11[_2]; m3 = M11[_3]; m4 = M11[_4];\
m5 = M11[_5]; m6 = M11[_6]; m7 = M11[_7]; m8 = M11[_8]; m9 = M11[_9];\
m10 = M11[_10];\
} while (0)
#define am13(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12) do {\
m0 = M13[_0]; m1 = M13[_1]; m2 = M13[_2]; m3 = M13[_3]; m4 = M13[_4];\
m5 = M13[_5]; m6 = M13[_6]; m7 = M13[_7]; m8 = M13[_8]; m9 = M13[_9];\
m10 = M13[_10]; m11 = M13[_11]; m12 = M13[_12];\
} while (0)
void sieve_7_13(unsigned char *f, uint64_t st) {
static const void *L[] = {
&&_7_0, &&_7_1, &&_7_2, &&_7_3, &&_7_4, &&_7_5, &&_7_6,
&&_11_0, &&_11_1, &&_11_2, &&_11_3, &&_11_4, &&_11_5, &&_11_6, &&_11_7,
&&_11_8, &&_11_9, &&_11_10,
&&_13_0, &&_13_1, &&_13_2, &&_13_3, &&_13_4, &&_13_5, &&_13_6, &&_13_7,
&&_13_8, &&_13_9, &&_13_10, &&_13_11, &&_13_12
};
uint64_t stb = st / 30;
v16b *M7 = sieve_7_13_M7;
v16b *M11 = sieve_7_13_M11;
v16b *M13 = sieve_7_13_M13;
v16b m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12;
v16b *vf;
goto *L[stb % 7];
_7_0:
am7(0, 1, 2, 3, 4, 5, 6);
goto _7;
_7_1:
am7(4, 5, 6, 0, 1, 2, 3);
goto _7;
_7_2:
am7(1, 2, 3, 4, 5, 6, 0);
goto _7;
_7_3:
am7(5, 6, 0, 1, 2, 3, 4);
goto _7;
_7_4:
am7(2, 3, 4, 5, 6, 0, 1);
goto _7;
_7_5:
am7(6, 0, 1, 2, 3, 4, 5);
goto _7;
_7_6:
am7(3, 4, 5, 6, 0, 1, 2);
_7:
vf = (v16b *)f;
do {
vf[0] &= m0;
vf[1] &= m1;
vf[2] &= m2;
vf[3] &= m3;
vf[4] &= m4;
vf[5] &= m5;
vf[6] &= m6;
} while ((unsigned char *)(vf += 7) < f + L1C);
goto *(L + 7)[stb % 11];
_11_0:
am11(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
goto _11;
_11_1:
am11(9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8);
goto _11;
_11_2:
am11(7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6);
goto _11;
_11_3:
am11(3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2);
goto _11;
_11_4:
am11(5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4);
goto _11;
_11_5:
am11(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0);
goto _11;
_11_6:
am11(10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9);
goto _11;
_11_7:
am11(8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7);
goto _11;
_11_8:
am11(6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5);
goto _11;
_11_9:
am11(4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3);
goto _11;
_11_10:
am11(2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1);
_11:
vf = (v16b *)f;
do {
vf[0] &= m0;
vf[1] &= m1;
vf[2] &= m2;
vf[3] &= m3;
vf[4] &= m4;
vf[5] &= m5;
vf[6] &= m6;
vf[7] &= m7;
vf[8] &= m8;
vf[9] &= m9;
vf[10] &= m10;
} while ((unsigned char *)(vf += 11) < f + L1C);
goto *(L + 18)[stb % 13];
_13_0:
am13(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
goto _13;
_13_1:
am13(9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8);
goto _13;
_13_2:
am13(5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4);
goto _13;
_13_3:
am13(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0);
goto _13;
_13_4:
am13(10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9);
goto _13;
_13_5:
am13(6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5);
goto _13;
_13_6:
am13(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1);
goto _13;
_13_7:
am13(11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
goto _13;
_13_8:
am13(7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6);
goto _13;
_13_9:
am13(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2);
goto _13;
_13_10:
am13(12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
goto _13;
_13_11:
am13(8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7);
goto _13;
_13_12:
am13(4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3);
_13:
vf = (v16b *)f;
do {
vf[0] &= m0;
vf[1] &= m1;
vf[2] &= m2;
vf[3] &= m3;
vf[4] &= m4;
vf[5] &= m5;
vf[6] &= m6;
vf[7] &= m7;
vf[8] &= m8;
vf[9] &= m9;
vf[10] &= m10;
vf[11] &= m11;
vf[12] &= m12;
} while ((unsigned char *)(vf += 13) < f + L1C);
}
sieve_93_.s
align 16
global PADD
PADD:
db 1, 7, 7, 7, 7, 7, 7, 11, 11, 11, 11, 13, 13, 17, 17, 17, 17
db 19, 19, 23, 23, 23, 23, 29, 29, 29, 29, 29, 29, 31, -1, -1
global WIX
WIX:
db 0, -1, -1, 1, -1, 2, 3, -1, 4, 5, -1, 6, -1, -1, 7, -1
global countPrimes_bf
countPrimes_bf:
dq 0xf93ddbb67e000000, 0x9eeda6eaf31e4fd5, 0xa559dd3bd3d30ce6
dq 0x56a61e78bd92676a, 0x554c2ade2dade356, 0xf8a154039ff0a3d9,
dq 0x3a13f666e944fd2e, 0x54bf11453a2b4cb8, 0x4f8cbcc8b37ac18c,
dq 0xef17c19b71715821, 0x468c83e5081a9654, 0x87588f9265aefb72,
dq 0xa0e3266581d892d2, 0x99eb813c26c73811, 0x4d33f3243e88518d,
dq 0x4c58b42aa71c8b5a, 0xc383dc8219f6264e, 0x02cdcdb50238f12c,
dq 0x307a4c570c944ab2, 0xf8246c44cbf10b43, 0x8dea735ca8950119,
dq 0xc41e22a6502b9624, 0x9c742f3ad40648d1, 0x2e1568bf88056a07,
dq 0x14089851b7e35560, 0x2770494d45aa5a86, 0x618302abcad593d2,
dq 0xada9c22287ce2405, 0xb01689d1784d8c18, 0x522434c0a262c757,
dq 0x4308218d32405aae, 0x60e119d9b6d2b634
global countPrimes_rm
countPrimes_rm:
db 0x01, 0, 0, 0x03, 0, 0x07, 0x0f, 0, 0x1f, 0x3f, 0, 0x7f, 0, 0, 0xff, 0
global sieve_7_13_M7
sieve_7_13_M7:
db 0xfd, 0xdf, 0xef, 0x7e, 0xf7, 0xfb, 0xbf, 0xfd, 0xdf, 0xef, 0x7e, 0xf7
db 0xfb, 0xbf, 0xfd, 0xdf
db 0xef, 0x7e, 0xf7, 0xfb, 0xbf, 0xfd, 0xdf, 0xef, 0x7e, 0xf7, 0xfb, 0xbf
db 0xfd, 0xdf, 0xef, 0x7e
db 0xf7, 0xfb, 0xbf, 0xfd, 0xdf, 0xef, 0x7e, 0xf7, 0xfb, 0xbf, 0xfd, 0xdf
db 0xef, 0x7e, 0xf7, 0xfb
db 0xbf, 0xfd, 0xdf, 0xef, 0x7e, 0xf7, 0xfb, 0xbf, 0xfd, 0xdf, 0xef, 0x7e
db 0xf7, 0xfb, 0xbf, 0xfd
db 0xdf, 0xef, 0x7e, 0xf7, 0xfb, 0xbf, 0xfd, 0xdf, 0xef, 0x7e, 0xf7, 0xfb
db 0xbf, 0xfd, 0xdf, 0xef
db 0x7e, 0xf7, 0xfb, 0xbf, 0xfd, 0xdf, 0xef, 0x7e, 0xf7, 0xfb, 0xbf, 0xfd
db 0xdf, 0xef, 0x7e, 0xf7
db 0xfb, 0xbf, 0xfd, 0xdf, 0xef, 0x7e, 0xf7, 0xfb, 0xbf, 0xfd, 0xdf, 0xef
db 0x7e, 0xf7, 0xfb, 0xbf
global sieve_7_13_M11
sieve_7_13_M11:
db 0xfb, 0xff, 0xef, 0xff, 0xbe, 0xff, 0x7d, 0xff, 0xf7, 0xff, 0xdf, 0xfb
db 0xff, 0xef, 0xff, 0xbe
db 0xff, 0x7d, 0xff, 0xf7, 0xff, 0xdf, 0xfb, 0xff, 0xef, 0xff, 0xbe, 0xff
db 0x7d, 0xff, 0xf7, 0xff
db 0xdf, 0xfb, 0xff, 0xef, 0xff, 0xbe, 0xff, 0x7d, 0xff, 0xf7, 0xff, 0xdf
db 0xfb, 0xff, 0xef, 0xff
db 0xbe, 0xff, 0x7d, 0xff, 0xf7, 0xff, 0xdf, 0xfb, 0xff, 0xef, 0xff, 0xbe
db 0xff, 0x7d, 0xff, 0xf7
db 0xff, 0xdf, 0xfb, 0xff, 0xef, 0xff, 0xbe, 0xff, 0x7d, 0xff, 0xf7, 0xff
db 0xdf, 0xfb, 0xff, 0xef
db 0xff, 0xbe, 0xff, 0x7d, 0xff, 0xf7, 0xff, 0xdf, 0xfb, 0xff, 0xef, 0xff
db 0xbe, 0xff, 0x7d, 0xff
db 0xf7, 0xff, 0xdf, 0xfb, 0xff, 0xef, 0xff, 0xbe, 0xff, 0x7d, 0xff, 0xf7
db 0xff, 0xdf, 0xfb, 0xff
db 0xef, 0xff, 0xbe, 0xff, 0x7d, 0xff, 0xf7, 0xff, 0xdf, 0xfb, 0xff, 0xef
db 0xff, 0xbe, 0xff, 0x7d
db 0xff, 0xf7, 0xff, 0xdf, 0xfb, 0xff, 0xef, 0xff, 0xbe, 0xff, 0x7d, 0xff
db 0xf7, 0xff, 0xdf, 0xfb
db 0xff, 0xef, 0xff, 0xbe, 0xff, 0x7d, 0xff, 0xf7, 0xff, 0xdf, 0xfb, 0xff
db 0xef, 0xff, 0xbe, 0xff
db 0x7d, 0xff, 0xf7, 0xff, 0xdf, 0xfb, 0xff, 0xef, 0xff, 0xbe, 0xff, 0x7d
db 0xff, 0xf7, 0xff, 0xdf
global sieve_7_13_M13
sieve_7_13_M13:
db 0xf7, 0xff, 0xff, 0xfe, 0xbf, 0xdf, 0xff, 0xfb, 0xfd, 0x7f, 0xff, 0xff
db 0xef, 0xf7, 0xff, 0xff
db 0xfe, 0xbf, 0xdf, 0xff, 0xfb, 0xfd, 0x7f, 0xff, 0xff, 0xef, 0xf7, 0xff
db 0xff, 0xfe, 0xbf, 0xdf
db 0xff, 0xfb, 0xfd, 0x7f, 0xff, 0xff, 0xef, 0xf7, 0xff, 0xff, 0xfe, 0xbf
db 0xdf, 0xff, 0xfb, 0xfd
db 0x7f, 0xff, 0xff, 0xef, 0xf7, 0xff, 0xff, 0xfe, 0xbf, 0xdf, 0xff, 0xfb
db 0xfd, 0x7f, 0xff, 0xff
db 0xef, 0xf7, 0xff, 0xff, 0xfe, 0xbf, 0xdf, 0xff, 0xfb, 0xfd, 0x7f, 0xff
db 0xff, 0xef, 0xf7, 0xff
db 0xff, 0xfe, 0xbf, 0xdf, 0xff, 0xfb, 0xfd, 0x7f, 0xff, 0xff, 0xef, 0xf7
db 0xff, 0xff, 0xfe, 0xbf
db 0xdf, 0xff, 0xfb, 0xfd, 0x7f, 0xff, 0xff, 0xef, 0xf7, 0xff, 0xff, 0xfe
db 0xbf, 0xdf, 0xff, 0xfb
db 0xfd, 0x7f, 0xff, 0xff, 0xef, 0xf7, 0xff, 0xff, 0xfe, 0xbf, 0xdf, 0xff
db 0xfb, 0xfd, 0x7f, 0xff
db 0xff, 0xef, 0xf7, 0xff, 0xff, 0xfe, 0xbf, 0xdf, 0xff, 0xfb, 0xfd, 0x7f
db 0xff, 0xff, 0xef, 0xf7
db 0xff, 0xff, 0xfe, 0xbf, 0xdf, 0xff, 0xfb, 0xfd, 0x7f, 0xff, 0xff, 0xef
db 0xf7, 0xff, 0xff, 0xfe
db 0xbf, 0xdf, 0xff, 0xfb, 0xfd, 0x7f, 0xff, 0xff, 0xef, 0xf7, 0xff, 0xff
db 0xfe, 0xbf, 0xdf, 0xff
db 0xfb, 0xfd, 0x7f, 0xff, 0xff, 0xef, 0xf7, 0xff, 0xff, 0xfe, 0xbf, 0xdf
db 0xff, 0xfb, 0xfd, 0x7f
db 0xff, 0xff, 0xef, 0xf7, 0xff, 0xff, 0xfe, 0xbf, 0xdf, 0xff, 0xfb, 0xfd
db 0x7f, 0xff, 0xff, 0xef
section .bss
global L1C
L1C:
resb 4
global BC
BC:
resb 4
build.sh
#include "prime.h"
typedef struct {
uint64_t p;
uint64_t sp;
} np_t;
static np_t fillBase(uint32_t *p, np_t n, unsigned char *f, uint64_t st,
uint64_t end, uint64_t lim) {
#define push(a) do {\
*p_ = st + i * 30 + a;\
if (*p_ < L1C / 8) ++n.sp;\
if (sq(*p_++) > lim) return (np_t){p_ - p + n.p, n.sp};\
} while (0)
p += n.p;
uint32_t *p_ = p;
unsigned i = 0;
do {
if (f[i] & 1 << 0) {
push(1);
}
if (f[i] & 1 << 1) {
push(7);
}
if (f[i] & 1 << 2) {
push(11);
}
if (f[i] & 1 << 3) {
push(13);
}
if (f[i] & 1 << 4) {
push(17);
}
if (f[i] & 1 << 5) {
push(19);
}
if (f[i] & 1 << 6) {
push(23);
}
if (f[i] & 1 << 7) {
push(29);
}
} while (st + ++i * 30 < end);
return (np_t){p_ - p + n.p, n.sp};
#undef push
}
static void sieve(unsigned char *f, uint64_t st, uint32_t *p, unsigned nsp) {
memset(f, -1, BC);
unsigned i = 0;
do {
uint64_t sti30 = st + i * 30;
sieve_7_13(f + i, sti30);
sieve_17(f + i, sti30);
sieve_19(f + i, sti30);
sieve_23(f + i, sti30);
sieve_29(f + i, sti30);
sieve_31(f + i, sti30);
sieve_37(f + i, sti30);
sieve_41(f + i, sti30);
sieve_43(f + i, sti30);
sieve_47(f + i, sti30);
sieve_53(f + i, sti30);
sieve_59(f + i, sti30);
sieve_61(f + i, sti30);
sieve_67(f + i, sti30);
sieve_71(f + i, sti30);
sieve_73(f + i, sti30);
sieve_79(f + i, sti30);
sieve_83(f + i, sti30);
sieve_89(f + i, sti30);
sieve_93_(f + i, sti30, p, nsp);
} while ((i += L1C) < BC);
sieve_93_(f, st, p + nsp, -1);
}
static void print(unsigned char *f, uint64_t st) {
#define p(a) printf("%"PRIu64"\n", st + i * 30 + a);
uint64_t i = 0;
do {
if (f[i] & 1 << 0) {
p(1);
}
if (f[i] & 1 << 1) {
p(7);
}
if (f[i] & 1 << 2) {
p(11);
}
if (f[i] & 1 << 3) {
p(13);
}
if (f[i] & 1 << 4) {
p(17);
}
if (f[i] & 1 << 5) {
p(19);
}
if (f[i] & 1 << 6) {
p(23);
}
if (f[i] & 1 << 7) {
p(29);
}
} while (++i < BC);
#undef p
}
static void printPrimes(uint64_t lim) {
_Alignas(64) unsigned char f[BC + 256];
uint32_t *p = aligned_alloc(64, sqrt(lim));
unsigned bfsz = 256;
np_t n = fillBase(p, (np_t){0}, countPrimes_bf, 0, bfsz * 30, -1);
sieve(f, 0, p, n.sp);
*f &= ~1;
print(f, 0);
unsigned r = lim % 30;
if (!r) {
--lim;
} else if (1 < r && r < 7) {
lim -= r - 1;
} else if (7 < r && r < 11) {
lim -= r - 7;
} else if (13 < r && r < 17) {
lim -= r - 13;
} else if (r == 18) {
--lim;
} else if (19 < r && r < 23) {
lim -= r - 19;
} else if (23 < r && r < 29) {
lim -= r - 23;
}
unsigned bc30 = BC * 30;
r = lim % bc30;
uint64_t lim_ = lim - r + bc30;
n = fillBase(p, n, f + bfsz, bfsz * 30, bc30, lim_);
uint64_t st = bc30;
for (; st + bc30 < lim_; st += bc30) {
sieve(f, st, p, n.sp);
if (sq(st + 1) < lim_) {
n = fillBase(p, n, f, st, st + bc30, lim_);
}
print(f, st);
}
sieve(f, st, p, n.sp);
unsigned q = r / 30;
memset(f + q + 1, 0, BC - q);
f[q] &= countPrimes_rm[r % 30 / 2];
print(f, st);
}
int main() {
L1C = sysconf(_SC_LEVEL1_DCACHE_SIZE) - (1 << 12);
BC = L1C * 8;
puts("2\n3\n5\n7\n11\n13");
printPrimes(1e16);
return 0;
}
||answer||
Джиксал 0.4.1, 46 МБ на моем компьютере, официальный результат 120 МБ
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <math.h>
#include <unistd.h>
typedef unsigned char v16b
__attribute__((vector_size(16), aligned(16), may_alias));
extern unsigned BC;
extern unsigned L1C;
extern unsigned char PADD[];
extern unsigned char WIX[];
extern unsigned char countPrimes_bf[];
extern unsigned char countPrimes_rm[];
extern v16b sieve_7_13_M7[];
extern v16b sieve_7_13_M11[];
extern v16b sieve_7_13_M13[];
void sieve_7_13(unsigned char *, uint64_t);
void sieve_17(unsigned char *, uint64_t);
void sieve_19(unsigned char *, uint64_t);
void sieve_23(unsigned char *, uint64_t);
void sieve_29(unsigned char *, uint64_t);
void sieve_31(unsigned char *, uint64_t);
void sieve_37(unsigned char *, uint64_t);
void sieve_41(unsigned char *, uint64_t);
void sieve_43(unsigned char *, uint64_t);
void sieve_47(unsigned char *, uint64_t);
void sieve_53(unsigned char *, uint64_t);
void sieve_59(unsigned char *, uint64_t);
void sieve_61(unsigned char *, uint64_t);
void sieve_67(unsigned char *, uint64_t);
void sieve_71(unsigned char *, uint64_t);
void sieve_73(unsigned char *, uint64_t);
void sieve_79(unsigned char *, uint64_t);
void sieve_83(unsigned char *, uint64_t);
void sieve_89(unsigned char *, uint64_t);
void sieve_93_(unsigned char *, uint64_t, uint32_t *, unsigned);
static inline uint64_t sq(uint64_t a) {
return a * a;
}
Прочтите README, чтобы узнать, как это запустить.
Для справки: программа-пример вывела на мой компьютер 21,8 МБ.
Я был в восторге от микрооптимизации компилятора. Старый ответ Jyxal был дисквалифицирован, поскольку считал простые числа кратными 5. Новая версия Jyxal почти полностью переписана на Koltin (за исключением математической библиотеки) и содержит новый оптимизатор посткомпиляции (на базе Proguard™). Это позволяет Jyxal перейти на Java 11. Он также оптимизирует циклы. Ненужные нажатия оптимизируются, и по возможности используется стек операндов JVM.
primesieve
Тест на простоту не так сложен, как тест Виксала. Для чисел меньше 9 223 372 036 854 775 807 объект BigComplex преобразуется в long, а затем проверяется последний бит. Если это 0, это означает, что число четное и не простое. Затем, используя оптимизацию, которую я нашел в SO, я использую алгоритм деления методом грубой силы, который пропускает кратные 2 и 3, уменьшая количество делителей, которые мне нужно проверить.
Составлено с
primesieve
main.c
flag disables vectorisation of monads, which removes the need for printf
и позволяет встраивать монаду.
Если вам интересно, вот байт-код, декомпилированный в Java:
primesieve
||answer||
С++
primesieve
Производительность пока неизвестна, поскольку код все еще компилируется на моей машине. Если вам удастся скомпилировать, запустить и рассчитать время, дайте мне знать, насколько быстро он работает.