p511.ans1
======================
-1
1 2 4

=================
p511.in1
======================
2
5 41
3 5

=================
p511.cpp
======================
#include <bits/stdc++.h>

using namespace std;

// Streams a pair as its two members separated by exactly one space.
template<typename T1, typename T2>
ostream& operator<<(ostream& out, const pair<T1, T2>& x) {
    out << x.first;
    out << ' ';
    out << x.second;
    return out;
}

// Extracts a pair by reading its first member and then its second.
template<typename T1, typename T2>
istream& operator>>(istream& in, pair<T1, T2>& x) {
    in >> x.first;
    in >> x.second;
    return in;
}

// Fills every existing slot of the vector, front to back, from the stream.
// The vector is not resized: the caller decides how many values are read.
template<typename T>
istream& operator>>(istream& in, vector<T>& a) {
    for(auto it = a.begin(); it != a.end(); ++it) {
        in >> *it;
    }
    return in;
}

// Prints each element followed by a single space (so a non-empty vector
// leaves a trailing space; an empty vector prints nothing).
template<typename T>
ostream& operator<<(ostream& out, const vector<T>& a) {
    for(const auto& item: a) {
        out << item;
        out << ' ';
    }
    return out;
}

// Parameters of the current test case: the exponent n and the modulus p.
int n;
int p;

// Reads one test case (n first, then p) from standard input.
void read() {
    cin >> n;
    cin >> p;
}

// Computes a^e mod m by binary exponentiation.
//
// a: base, any sign and magnitude; it is reduced into [0, m) up front so the
//    result is always a canonical non-negative residue and no intermediate
//    product exceeds (m - 1)^2.
// e: exponent, expected to be >= 0. A negative exponent is not meaningful
//    here; the loop simply does not run and 1 % m is returned.
// m: modulus, must be positive and small enough that (m - 1)^2 fits in
//    int64_t (roughly m <= 3 * 10^9).
//
// Returns a value in [0, m); in particular 0 when m == 1.
int64_t modpow(int64_t a, int64_t e, int64_t m) {
    // Normalise the base: C++ '%' keeps the sign of the dividend, so a
    // negative base would otherwise leak a negative "residue" to the caller.
    a %= m;
    if(a < 0) {
        a += m;
    }

    int64_t r = 1 % m;
    for(; e > 0; e >>= 1) {
        if(e & 1) {
            r = r * a % m;
        }
        a = a * a % m;
    }
    return r;
}

// Returns the smallest primitive root of the prime p, i.e. the smallest g
// whose powers cover every nonzero residue mod p.
//
// For p == 2 the multiplicative group is {1}, generated by 1, so 1 is
// returned. For any other p the candidates 2, 3, ... are tried in order and
// -1 is returned if none of them qualifies (which does not happen for an odd
// prime).
int primitive_root(int p) {
    // The search below starts at 2 and would find nothing for p = 2; handing
    // the -1 sentinel to callers that use the result as a base is unsafe.
    if(p == 2) {
        return 1;
    }

    // g is a primitive root iff g^d != 1 for every divisor d of phi = p - 1
    // other than phi itself, so collect exactly those proper divisors.
    const int phi = p - 1;
    vector<int> proper_divs;
    for(int i = 1; (int64_t)i * i <= phi; i++) {
        if(phi % i != 0) {
            continue;
        }
        if(i < phi) {
            proper_divs.push_back(i);
        }
        const int paired = phi / i;
        if(paired != i && paired < phi) {
            proper_divs.push_back(paired);
        }
    }

    for(int g = 2; g < p; g++) {
        bool ok = true;
        for(int d: proper_divs) {
            if(modpow(g, d, p) == 1) {
                // g has order dividing d < phi, so it is not a generator.
                ok = false;
                break;
            }
        }
        if(ok) {
            return g;
        }
    }
    return -1;
}

// Extended Euclid on (m, a), tracking only the coefficient of a.
// Returns that coefficient reduced into [0, m); when gcd(a, m) == 1 this is
// the multiplicative inverse of a modulo m.
int64_t mod_inverse(int64_t a, int64_t m) {
    int64_t prev_rem = m;
    int64_t rem = a;
    int64_t prev_coef = 0;
    int64_t coef = 1;

    while(rem != 0) {
        const int64_t quot = prev_rem / rem;
        const int64_t next_rem = prev_rem - quot * rem;
        const int64_t next_coef = prev_coef - quot * coef;

        prev_rem = rem;
        rem = next_rem;
        prev_coef = coef;
        coef = next_coef;
    }

    // The coefficient may be negative; fold it into the canonical range.
    const int64_t folded = prev_coef % m;
    return (folded + m) % m;
}

// Recovers an n-th root of rt^(g*k) modulo p.
//
// We need an exponent e with e * n = g * k (mod p - 1). Dividing through by
// g gives e * (n/g) = k (mod (p-1)/g), hence e = k * inverse(n/g, (p-1)/g).
// The answer is rt^e mod p. The parameter g itself is not needed once the two
// quotients n_div_g and phi_div_g are supplied.
int nth_root(int k, int rt, int g, int n_div_g, int phi_div_g, int p) {
    const int64_t inv = mod_inverse(n_div_g, phi_div_g);
    const int64_t exponent = inv * k % (p - 1);
    return modpow(rt, exponent, p);
}

// Solves one test case using the globals n and p: prints "x y z" with
// x^n + y^n = z^n (mod p) and x, y, z nonzero mod p, or "-1" if no such
// triple exists.
void solve() {
    // Starting with a tiny bit of history, Fermat's last theorem was actually a
    // conjecture from 1637 saying that a^n + b^n = c^n has no integer solutions
    // for n > 2. It was finally proven by Andrew Wiles, but here the modular
    // setting makes things much easier. In particular, Schur proved in 1916
    // that for every n, there is some p_0, such that the above has a solution
    // for p >= p_0. That proof was not constructive, but later works show how
    // to give more insight. An example is:
    //
    //     https://www.scirp.org/pdf/apm20241410_35302479.pdf.
    //
    // Although not directly given, an algorithmic way of finding this would be:
    //
    //     1) The n-th powers mod p form a subgroup of (Z/pZ)*.
    //        Let rt be a primitive root of p. For any prime there is a
    //        primitive root, and also if we look at the smallest ones, they
    //        aren't huge - for example, under 10^6 the largest primitive root
    //        is 73 at p=760321. Checking if rt is a primitive root can be done
    //        by making sure there is no smaller cycle than phi(p) = p - 1. The
    //        complexity of finding this primitive root is then O(sqrt(p) *
    //        max_rt), which for the given constraints is quick. As rt is the
    //        primitive root, every nonzero element is rt^k for some k in [0,
    //        p-2]. Then (rt^k)^n = rt^(kn), and the image of the map x
    //        -> x^n is {rt^(kn) : k in [0, p-2]} = {rt^m : g | m} where g =
    //        gcd(n, p-1). This is a classic result about what values can be
    //        achieved from ax mod q, for a = n, q = phi(p), and x is the k.
    //        Clearly, this subgroup has size (p-1)/g and is generated by rt^g.
    //
    //     2) We can search for two elements a, b in this subgroup with a + b
    //        = 1 (mod p). If we find such a pair, then a = x^n and b = y^n for
    //        some x, y, and we have x^n + y^n = 1 = 1^n (mod p), giving
    //        solution (x, y, 1). Why is it enough to search for a + b = 1 (mod
    //        p)? Say we only had a + b = c. Then (a*c^-1) + (b*c^-1) = 1 mod p,
    //        so we know there is also some a' + b' = 1 (mod p). Note that c^-1
    //        (mod p) will always exist as p is a prime.
    //
    //     3) To find such a pair efficiently, let us iterate through powers of
    //        rt^g. Let st = (rt^g)^cnt for cnt = 1, 2, ... Store each st in a
    //        dictionary. For each st, check if (1 - st) mod p is already in the
    //        table. If so, we found a + b = 1. If we complete the full
    //        cycle (st = 1) without finding a pair, no solution exists.
    //
    //     4) To recover x from x^n = rt^(g*k), we need to find the n-th root.
    //        We want x = rt^e such that e*n = g*k (mod p-1). Dividing by g:
    //        e*(n/g) = k (mod (p-1)/g). Since gcd(n/g, (p-1)/g) = 1, the
    //        inverse exists: e = k * inverse(n/g, (p-1)/g), giving x = rt^e.
    //
    //     5) When does no solution exist? Essentially, when the subgroup of
    //        n-th powers has a small size. In particular, for any x^n, there is
    //        only one y^n that satisfies x^n + y^n = 1 mod p, which means the
    //        numbers [1; p) are partitioned into pairs. To not have a solution,
    //        the subgroup has to not have two numbers from the same pair. The
    //        subgroup is not quite random, but a way of thinking about this is
    //        that the process is similar to the birthday paradox, or in
    //        O(sqrt(N)) time we will either find a solution (match), or the
    //        cycle will be too small and we will repeat meaning we terminate.
    //
    // Combining all of the above, the search itself is expected to take about
    // O(sqrt(N+P) * log(N+P)) steps. Note that the lookup table below is
    // allocated and filled per test case, which costs O(p) on its own.

    int g = gcd(n, p - 1);
    int rt = primitive_root(p);
    if(rt < 0) {
        // Guard against the -1 sentinel from primitive_root: using it as a
        // base would make cur negative and index seen out of range. The only
        // prime with no generator in [2, p) is p = 2, where x = y = z = 1 is
        // the sole candidate and 1 + 1 != 1 (mod 2), so there is no solution.
        cout << -1 << "\n";
        return;
    }
    int rt_g = modpow(rt, g, p);

    // seen[v] = exponent cnt with (rt^g)^cnt == v, or -1 if v was not reached.
    vector<int> seen(p, -1);
    int64_t cur = 1;
    int cnt = 0;

    while(true) {
        cur = cur * rt_g % p;
        seen[cur] = ++cnt;
        if(cur == 1) {
            // The whole subgroup was enumerated without finding a + b = 1.
            cout << -1 << "\n";
            return;
        }
        // The unique partner b with cur + b = 1 (mod p).
        int other = (1 - cur % p + p) % p;
        if(seen[other] != -1) {
            int x = nth_root(cnt, rt, g, n / g, (p - 1) / g, p);
            int y = nth_root(seen[other], rt, g, n / g, (p - 1) / g, p);
            cout << x << " " << y << " " << 1 << "\n";
            return;
        }
    }
}

// Entry point: reads the number of test cases, then reads and solves each one
// in turn.
int main() {
    // Untie C++ streams from C stdio for faster bulk input/output.
    ios_base::sync_with_stdio(false);
    cin.tie(nullptr);

    int T = 1;
    cin >> T;

    int remaining = T;
    while(remaining > 0) {
        read();
        solve();
        --remaining;
    }

    return 0;
}

=================
statement.txt
======================
511. Fermat's Last Theorem
Time limit per test: 0.75 second(s)
Memory limit: 262144 kilobytes
input: standard
output: standard

Given a positive integer n and a positive prime number p, find x, y and z such that x^n + y^n = z^n modulo p and x, y and z are nonzero modulo p, or report that there's no such triple.

Input
The first line of the input file contains the number t of testcases to solve, 1 ≤ t ≤ 1000. Each of the next t lines contains two integers n and p, 3 ≤ n ≤ 10^6, 2 ≤ p ≤ 10^6.

Output
For each input testcase, output one line:
when there exists a solution, output three integers x, y and z, 1 ≤ x, y, z ≤ p-1. If there are multiple solutions, output any.
when there's no solution, output one integer -1.


Example(s)
sample input
sample output
2
5 41
3 5
-1
1 2 4

=================
