普通NTT只能做\(2^k\)的循环卷积,尝试扩展成长度为\(n\)的循环卷积,保证模意义下\(\omega_n\)存在。
不管怎样还是要算点值。推式子:
\[
\begin{align*}
X_i&=\sum_{j=0}^{n-1}x_j\omega_n^{ij}\&=\sum_{j=0}^{n-1}x_j\omega_n^{{i+j\choose2}-{i\choose 2}-{j\choose 2}}\&=\omega_n^{-{i\choose 2}}\sum_{j=0}^{n-1}x_j\omega_n^{-{j\choose 2}}\times \omega_n^{{i+j\choose2}}
\end{align*}
\]
令\(F_i=x_i\omega_n^{-{i\choose 2}},G_i=\omega_n^{{i\choose 2}}\),那么就有\(X_i=\sum_{j=0}^{n-1} F_jG_{i+j}\),就可以任意模数NTT做了。
发现什么性质都观察不出来,于是直接上DP。\(dp_{i,j,x}\)表示前\(i\)层,走了\(j\)步,走到第二维是\(x\)的点(但第一维不一定是\(i\))的方案数,有转移:
\[
dp_{i,j,x}=dp_{i-1,j,x}+\sum_y dp_{i-1,j-1,y}\times w_{y,x}
\]
容易发现这连暴力分都没有……
发现\(n=1\)的时候,\(dp_i\)可以被看作是一个多项式,于是\(dp_L=(1+wx)^L\),可以写一个任意模数NTT。
怎么扩展?我们可以再记一维\(y\),把数组的意义改成在原有的基础上加\(i\)层、多走\(j\)步,从\(x\)走到\(y\)的方案数。你发现此时\(dp_{i,j,x,y}\)可以合并任意的\(k,i-k\)来完成转移,于是可以倍增。
倍增的时候容易发现\(j\)这一维是个循环卷积,可以MTT优化,于是获得了\(O(n^3K\log L\log K)\)的做法。
显然这是过不去的。我们发现题目保证了\(K|(mod-1)\),于是可以倍增的时候只存点值,只有在开头结尾搞个任意长度NTT,做到\(O(n^3K\log L)\)。
常数巨大,被标算踩爆,但是好想。
(然而标解也不怎么难想的样子?)
#include<bits/stdc++.h>
clock_t t=clock();
namespace my_std{
using namespace std;
#define pii pair<int,int>
#define fir first
#define sec second
#define MP make_pair
#define rep(i,x,y) for (int i=(x);i<=(y);i++)
#define drep(i,x,y) for (int i=(x);i>=(y);i--)
#define go(x) for (int i=head[x];i;i=edge[i].nxt)
#define templ template<typename T>
#define sz 273333
typedef long long ll;
typedef double db;
typedef long double ld;
mt19937 rng(chrono::steady_clock::now().time_since_epoch().count());
templ inline T rnd(T l,T r) {return uniform_int_distribution<T>(l,r)(rng);}
templ inline bool chkmax(T &x,T y){return x<y?x=y,1:0;}
templ inline bool chkmin(T &x,T y){return x>y?x=y,1:0;}
templ inline void read(T& t)
{
t=0;char f=0,ch=getchar();double d=0.1;
while(ch>'9'||ch<'0') f|=(ch=='-'),ch=getchar();
while(ch<='9'&&ch>='0') t=t*10+ch-48,ch=getchar();
if(ch=='.'){ch=getchar();while(ch<='9'&&ch>='0') t+=d*(ch^48),d*=0.1,ch=getchar();}
t=(f?-t:t);
}
template<typename T,typename... Args>inline void read(T& t,Args&... args){read(t); read(args...);}
char __sr[1<<21],__z[20];int __C=-1,__zz=0;
inline void Ot(){fwrite(__sr,1,__C+1,stdout),__C=-1;}
inline void print(register int x)
{
if(__C>1<<20)Ot();if(x<0)__sr[++__C]='-',x=-x;
while(__z[++__zz]=x%10+48,x/=10);
while(__sr[++__C]=__z[__zz],--__zz);__sr[++__C]='\n';
}
void file()
{
#ifdef NTFOrz
freopen("a.in","r",stdin);
#endif
}
inline void chktime()
{
#ifdef NTFOrz
cout<<(clock()-t)/1000.0<<'\n';
#endif
}
ll mod;
ll ksm(ll x,int y){ll ret=1;for (;y;y>>=1,x=x*x%mod) if (y&1) ret=ret*x%mod;return ret;}
ll inv(ll x){return ksm(x,mod-2);}
// inline ll mul(ll a,ll b){ll d=(ll)(a*(double)b/mod+0.5);ll ret=a*b-d*mod;if (ret<0) ret+=mod;return ret;}
}
using namespace my_std;
ll g,Wk;
vector<int>pp;
bool check(ll x){for (auto t:pp) if (ksm(x,t)==1) return 0;return 1;}
void getG(int K)
{
pp.push_back(1);
for (ll i=2;i*i<=mod;++i) if ((mod-1)%i==0) pp.push_back(i),pp.push_back((mod-1)/i);
ll x;
do x=rnd(2ll,mod-1); while (!check(x));
g=x;Wk=ksm(g,(mod-1)/K);
}
int n,x,y,L,K;
ll w[4][4];
const ld PI=acos(-1);
struct Complex
{
ld a,b;
Complex(ld x=0,ld y=0){a=x,b=y;}
};
inline Complex operator + (const Complex &x,const Complex &y){return Complex(x.a+y.a,x.b+y.b);}
inline Complex operator - (const Complex &x,const Complex &y){return Complex(x.a-y.a,x.b-y.b);}
inline Complex operator * (const Complex &x,const Complex &y){return Complex(x.a*y.a-x.b*y.b,x.a*y.b+x.b*y.a);}
int r[sz],limit;
void FFT_init(int N)
{
int l=-1;limit=1;
while (limit<=N) limit<<=1,++l;
rep(i,0,limit-1) r[i]=(r[i>>1]>>1)|((i&1)<<l);
}
void FFT(Complex *a,int type)
{
rep(i,0,limit-1) if (i<r[i]) swap(a[i],a[r[i]]);
for (int mid=1;mid<limit;mid<<=1)
{
Complex Wn(cos(PI/mid),type*sin(PI/mid));
for (int len=mid<<1,j=0;j<limit;j+=len)
{
Complex w(1,0);
for (int k=0;k<mid;k++,w=w*Wn)
{
Complex x=a[j+k],y=w*a[j+mid+k];
a[j+k]=x+y;
a[j+k+mid]=x-y;
}
}
}
if (type==-1) rep(i,0,limit-1) a[i].a/=limit;
}
Complex A[sz],B[sz],C[sz],D[sz],E[sz],F[sz],G[sz],H[sz];
void MTT(ll *a,int n,ll *b,int m,ll *ret)
{
FFT_init(n+m);
rep(i,0,limit-1) A[i]=B[i]=C[i]=D[i]=E[i]=F[i]=G[i]=H[i]=Complex(0,0);
rep(i,0,n-1) A[i].a=a[i]>>15,B[i].a=a[i]&0x7fff;
rep(i,0,m-1) C[i].a=b[i]>>15,D[i].a=b[i]&0x7fff;
FFT(A,1);FFT(B,1);FFT(C,1);FFT(D,1);
rep(i,0,limit-1)
{
E[i]=A[i]*C[i];
F[i]=A[i]*D[i];G[i]=B[i]*C[i];
H[i]=B[i]*D[i];
}
FFT(E,-1);FFT(F,-1);FFT(G,-1);FFT(H,-1);
rep(i,0,n+m-2)
{
ll x1=ll(round(E[i].a))%mod,x2=ll(round(F[i].a))%mod,x3=ll(round(G[i].a))%mod,x4=ll(round(H[i].a))%mod;
ret[i]=((x1<<30)%mod+(x2<<15)%mod+(x3<<15)%mod+x4%mod+mod)%mod;
}
}
ll tmp1[sz],tmp2[sz],tmp3[sz];
int calc(int x){return 1ll*x*(x-1)/2%(mod-1);}
void iDFT(ll *a)
{
ll w=inv(Wk);
rep(i,0,K-1) tmp1[i]=a[i]*ksm(w,mod-1-calc(i))%mod;
rep(i,0,K+K) tmp2[i]=ksm(w,calc(i));
reverse(tmp1,tmp1+K);
MTT(tmp1,K,tmp2,K+K,tmp3);
rep(i,0,K-1) a[i]=ksm(w,mod-1-calc(i))*tmp3[i+K-1]%mod*inv(K)%mod;
}
ll dp[3][3][3][sz];
void merge(int C,int A,int B)
{
rep(s,0,n-1) rep(t,0,n-1) rep(p,0,n-1) rep(i,0,K-1) (dp[C][s][t][i]+=dp[A][s][p][i]*dp[B][p][t][i]%mod)%=mod;
rep(s,0,n-1) rep(t,0,n-1) rep(i,0,K-1) (dp[C][s][t][i]+=dp[A][s][t][i]+dp[B][s][t][i])%=mod;
}
void Clear(int p){rep(i,0,n-1) rep(j,0,n-1) rep(k,0,K-1) dp[p][i][j][k]=0;}
void solve(int L,int p)
{
if (L==1)
{
rep(i,0,n-1) rep(j,0,n-1) rep(k,0,K-1) dp[p][i][j][k]=dp[2][i][j][k];
return;
}
int mid=L>>1;solve(mid,p^1);
Clear(p);
merge(p,p^1,p^1);
if (L&1)
{
Clear(p^1);
merge(p^1,p,2);
rep(i,0,n-1) rep(j,0,n-1) rep(k,0,K-1) dp[p][i][j][k]=dp[p^1][i][j][k];
}
}
int main()
{
file();
read(n,K,L,x,y,mod);
--x,--y;getG(K);
rep(i,0,n-1) rep(j,0,n-1) read(w[i][j]);
rep(i,0,n-1) rep(j,0,n-1) rep(k,0,K-1) dp[2][i][j][k]=w[i][j]*ksm(Wk,k)%mod;
solve(L,0);
iDFT(dp[0][x][y]);
if (x==y) (++dp[0][x][x][0])%=mod;
rep(i,0,K-1) printf("%lld\n",dp[0][x][y][i]);
return 0;
}
原文:https://www.cnblogs.com/p-b-p-b/p/11964304.html