binary trick

几乎没用的东西

学了 \(word RAM model\)，不过感觉还是直接分块预处理的方法（比较）快，而且好打

先直接上代码

namespace binary_trick
{//almost in AC0
	// Table-driven bit utilities for 32/64-bit words plus a small 128-bit
	// unsigned integer built from two 64-bit halves.
	//
	// init() MUST be called once before any function that reads the lookup
	// tables (hmw / rev / clz / ctz / lowbit / highbit and their u64_/u128_
	// variants, and the multiplication helpers that use u128_ctz).
	// The u128 operators themselves (bitwise, shifts, comparisons, + - *)
	// do not read any table.
	using u32=unsigned int;
	using ull=unsigned long long;
	// p2(x): 2^x as a 64-bit value (x must be in [0,63]).
#define p2(x) (1ull<<(ull)(x))
	// bget(x,p): bit p of x, as 0 or 1.
#define bget(x,p) (((x)&p2(p))>>(p))
	// bgets(x,l,r): bits l..r of x, shifted down to bit 0 (needs init(); r in [0,63]).
#define bgets(x,l,r) (((x)&all_1[(r)])>>(l))
	const u32 blen=16,hfblen=blen>>1;                     // table word width and half of it
	const u32 base=(1<<blen)-1,hfbase=(1<<(blen>>1))-1;  // 0xFFFF and 0xFF
	const ull max_u32=p2(32)-1;
#define MAX_U32 max_u32
	// 16-bit lookup tables filled by init():
	//   pcnt[i] = number of set bits of i
	//   rv[i]   = i with its 16 bits reversed
	//   bcl[i]  = leading zeros of i as a 16-bit word (bcl[0]==16)
	//   bct[i]  = trailing zeros of i as a 16-bit word (bct[0]==16)
	u32 pcnt[base+5],rv[base+5],bcl[base+5],bct[base+5];
	// all_1[i] (0<=i<=63) has the low i+1 bits set; index 64 is left as 0.
	ull all_1[65];

	// Reverses the low 8 bits of x (bit i <-> bit 7-i); higher bits are ignored.
	inline u32 naive_rev(u32 x)
	{
		u32 r=0;
		for(u32 i=0;i<hfblen;++i)
			if(bget(x,i)) r|=1u<<(hfblen-1u-i);
		return r;
	}

	// Fills every lookup table. Idempotent; call once at program start.
	inline void init()
	{
		pcnt[0]=0;
		for(u32 i=1;i<=base;++i) pcnt[i]=pcnt[i>>1]+(i&1u);

		// 8-bit reversals first, then combine two bytes into a 16-bit reversal.
		for(u32 i=0;i<=hfbase;++i) rv[i]=naive_rev(i);
		for(u32 i=hfbase+1;i<=base;++i) rv[i]=(rv[i&hfbase]<<hfblen)|rv[i>>hfblen];
		// Entries below 256 still hold the 8-bit result; move it to the high byte.
		for(u32 i=0;i<=hfbase;++i) rv[i]<<=hfblen;

		bcl[0]=blen;
		for(u32 i=1;i<=base;++i) bcl[i]=bcl[i>>1]-1u;

		// bct[0]==16 makes ctz(0) come out as the full word width (32/64/128),
		// mirroring clz(0). The recurrence below never reads bct[0].
		bct[0]=blen;
		for(u32 i=1;i<=base;++i) bct[i]=(i&1u)?0u:(bct[i>>1]+1u);

		all_1[0]=1;
		for(u32 i=1;i<=63;++i) all_1[i]=(all_1[i-1]<<1)|1ull;
	}

	// Hamming weight (popcount) of a 32-bit word.
	inline u32 hmw(u32 x)
	{ return pcnt[x>>16]+pcnt[x&base]; }
	// Hamming weight of a 64-bit word.
	inline u32 u64_hmw(ull x)
	{ return pcnt[x>>48]+pcnt[(x>>32)&base]+pcnt[(x>>16)&base]+pcnt[x&base]; }
	// Bit reversal of a 32-bit word.
	inline u32 u32_rev(u32 x)
	{ return (rv[x&base]<<16u)|rv[x>>16u]; }
	// Bit reversal of a 64-bit word.
	inline ull u64_rev(ull x)
	{ return (ull)u32_rev((u32)(x>>32))|((ull)u32_rev((u32)(x&MAX_U32))<<32); }
	// Writes the 32 bits of x to stderr followed by c.
	inline void u32_print(u32 x,char c=' ')
	{ std::cerr<<std::bitset<32>(x)<<c; }
	// Writes the 64 bits of x to stderr followed by c.
	inline void u64_print(ull x,char c=' ')
	{ std::cerr<<std::bitset<64>(x)<<c; }
	// Leading zeros of a 32-bit word; 32 for x==0.
	inline u32 u32_clz(u32 x)
	{ return x>base?bcl[x>>16]:(bcl[x&base]+16u); }
	// Leading zeros of a 64-bit word; 64 for x==0.
	inline u32 u64_clz(ull x)
	{
		if(x>>48) return bcl[x>>48];
		if(x>>32) return 16u+bcl[(x>>32)&base];
		return 32u+u32_clz((u32)(x&MAX_U32));
	}
	// Trailing zeros of a 32-bit word; 32 for x==0.
	inline u32 u32_ctz(u32 x)
	{ return (x&base)?bct[x&base]:(16u+bct[x>>16u]); }
	// Trailing zeros of a 64-bit word; 64 for x==0.
	inline u32 u64_ctz(ull x)
	{
		if(x&base) return bct[x&base];
		if((x>>16)&base) return bct[(x>>16)&base]+16u;
		return u32_ctz((u32)(x>>32))+32u;
	}
	// Lowest / highest set bit as a mask; 0 for x==0 (a shift by the full
	// word width would be undefined behaviour, so zero is handled explicitly).
	inline u32 lowbit(u32 x) { return x?1u<<u32_ctz(x):0u; }
	inline ull lowbit(ull x) { return x?p2(u64_ctz(x)):0ull; }
	inline u32 highbit(u32 x) { return x?1u<<(31u-u32_clz(x)):0u; }
	inline ull highbit(ull x) { return x?p2(63u-u64_clz(x)):0ull; }
	// True iff x+y overflows 64 bits (i.e. the addition carries out).
	// Unsigned addition wraps, so a carry happened exactly when the sum is
	// smaller than an operand.
	inline bool add_in(ull x,ull y)
	{ return x+y<x; }

	// 128-bit unsigned integer stored as two 64-bit halves.
	// All arithmetic is modulo 2^128.
	class u128
	{
	public:
		ull high=0,low=0;
		u128()=default;
		// Stores |x| (the magnitude of x), matching the historical behaviour.
		// Computed in unsigned arithmetic so INT_MIN does not overflow.
		u128(int x):high(0),low(x<0?0ull-(ull)x:(ull)x) {}
		u128(u32 x):high(0),low(x) {}
		u128(ull x):high(0),low(x) {}
		u128(ull hb,ull lb):high(hb),low(lb) {}
		// Writes "<high,low>" in decimal to stderr.
		void part_print() const { std::cerr<<'<'<<high<<','<<low<<'>'; }

		friend u128 operator & (u128 a,u128 b) { return u128(a.high&b.high,a.low&b.low); }
		friend u128& operator &=(u128& a,u128 b) { a.high&=b.high,a.low&=b.low; return a; }
		friend u128 operator | (u128 a,u128 b) { return u128(a.high|b.high,a.low|b.low); }
		friend u128& operator |=(u128& a,u128 b) { a.high|=b.high,a.low|=b.low; return a; }
		friend u128 operator ^ (u128 a,u128 b) { return u128(a.high^b.high,a.low^b.low); }
		friend u128& operator ^=(u128& a,u128 b) { a.high^=b.high,a.low^=b.low; return a; }
		friend bool operator ! (u128 a) { return !(a.low|a.high); }
		friend u128 operator ~ (u128 a) { return u128(~a.high,~a.low); }

		// Logical left shift; shifting by 128 or more yields 0.
		friend u128 operator <<(u128 a,u32 w)
		{
			if(!w) return a;
			if(w>127u) return u128(0ull,0ull);
			// 1<=w<=63: bits leaving the low half enter the high half.
			if(w<64u) return u128((a.high<<w)|(a.low>>(64u-w)),a.low<<w);
			return u128(a.low<<(w-64u),0ull);
		}
		u128& operator <<=(u32 w) { return *this=*this<<w; }
		// Logical right shift; shifting by 128 or more yields 0.
		friend u128 operator >>(u128 a,u32 w)
		{
			if(!w) return a;
			if(w>127u) return u128(0ull,0ull);
			// 1<=w<=63: bits leaving the high half enter the low half.
			if(w<64u) return u128(a.high>>w,(a.low>>w)|(a.high<<(64u-w)));
			return u128(0ull,a.high>>(w-64u));
		}
		u128& operator >>=(u32 w) { return *this=*this>>w; }

		// Comparisons take const references so const objects and temporaries
		// (e.g. unit_128 or a+b) can be compared.
		friend bool operator ==(const u128& a,const u128& b)
		{ return a.high==b.high&&a.low==b.low; }
		friend bool operator !=(const u128& a,const u128& b)
		{ return !(a==b); }
		friend bool operator < (const u128& a,const u128& b)
		{ return a.high!=b.high?a.high<b.high:a.low<b.low; }
		friend bool operator > (const u128& a,const u128& b)
		{ return a.high!=b.high?a.high>b.high:a.low>b.low; }
		friend bool operator <=(const u128& a,const u128& b)
		{ return a.high!=b.high?a.high<b.high:a.low<=b.low; }
		friend bool operator >=(const u128& a,const u128& b)
		{ return a.high!=b.high?a.high>b.high:a.low>=b.low; }

		// Addition modulo 2^128; the carry out of the low half is detected
		// by the wrapped sum being smaller than an operand.
		friend u128 operator + (u128 a,u128 b)
		{
			const ull lo=a.low+b.low;
			return u128(a.high+b.high+(ull)(lo<a.low),lo);
		}
		friend u128& operator +=(u128& a,u128 b) { return a=a+b; }
		// Subtraction modulo 2^128; borrows from the high half when a.low<b.low.
		friend u128 operator - (u128 a,u128 b)
		{ return u128(a.high-b.high-(ull)(a.low<b.low),a.low-b.low); }
		friend u128& operator -=(u128& a,u128 b) { return a=a-b; }

		// explicit: still usable in if/while/!/&&/||, but no longer silently
		// turns a u128 into 0/1 inside comparisons or arithmetic.
		explicit operator bool () const { return (high|low)!=0; }
	};
	const u128 unit_128(0ull,1ull);
	// Writes all 128 bits (high half first) to stderr.
	inline void u128_print(u128 a) { std::cerr<<std::bitset<64>(a.high)<<std::bitset<64>(a.low); }
	// Bit reversal of the full 128-bit value.
	inline u128 u128_rev(u128 x) { return u128(u64_rev(x.low),u64_rev(x.high)); }
	// Leading zeros; 128 for x==0.
	inline u32 u128_clz(u128 x) { return x.high?u64_clz(x.high):64u+u64_clz(x.low); }
	// Trailing zeros; 128 for x==0.
	inline u32 u128_ctz(u128 x) { return x.low?u64_ctz(x.low):u64_ctz(x.high)+64u; }
	// Hamming weight of the full 128-bit value.
	inline u32 u128_hmw(u128 x) { return u64_hmw(x.low)+u64_hmw(x.high); }
	// Lowest / highest set bit as a mask; 0 for x==0 (the shift count is then
	// >=128, which operator<< maps to 0).
	inline u128 u128_lowbit(u128 x) { return unit_128<<u128_ctz(x); }
	inline u128 u128_highbit(u128 x){ return unit_128<<(127u-u128_clz(x)); }

	// Shift-and-add multiplication modulo 2^128, one bit of b per iteration.
	inline u128 turtle_mul(u128 a,u128 b)
	{
		u128 c(0ull,0ull);
		while(b)
		{
			if(b.low&1ull) c+=a;
			a<<=1u,b>>=1u;
		}
		return c;
	}
	// Shift-and-add multiplication modulo 2^128 that jumps straight to the
	// next set bit of b using u128_ctz.
	inline u128 turtle2_mul(u128 a,u128 b)
	{
		u128 c(0ull,0ull);
		while(b)
		{
			const u32 tz=u128_ctz(b);
			a<<=tz;       // align a with the lowest set bit of b
			c+=a;
			b>>=tz+1u;    // consume that bit
			a<<=1u;
		}
		return c;
	}
	// Schoolbook multiplication modulo 2^128 on 32-bit limbs.
	// Limb products whose weight is 2^128 or more are dropped.
	inline u128 slow_mul(u128 A,u128 B)
	{
		const ull a[4]={A.low&MAX_U32,A.low>>32,A.high&MAX_U32,A.high>>32};
		const ull b[4]={B.low&MAX_U32,B.low>>32,B.high&MAX_U32,B.high>>32};
		u128 c(a[1]*b[1],a[0]*b[0]);
		// Weight 2^32: the low 32 bits of each cross product land in the upper
		// part of c.low (and may carry), the rest goes into c.high.
		const ull cross[2]={a[0]*b[1],a[1]*b[0]};
		for(int k=0;k<2;++k)
		{
			const ull add=(cross[k]&MAX_U32)<<32;
			c.high+=(cross[k]>>32)+(ull)add_in(c.low,add);
			c.low+=add;
		}
		// Weight 2^64: only the low 64 bits matter.
		c.high+=a[2]*b[0]+a[0]*b[2];
		// Weight 2^96: only the low 32 bits of the sum survive the shift.
		c.high+=(a[2]*b[1]+a[1]*b[2]+a[3]*b[0]+a[0]*b[3])<<32;
		return c;
	}//it's the suggested way to get a*b
	// Product modulo 2^128.
	inline u128 operator * (u128 a,u128 b) { return slow_mul(a,b); }
} namespace efX_bt=binary_trick;

可以用 \(u128\) 了，但是乘法挺慢的，而且没有除法，所以不能输出，毕竟本来就是给你当 \(bitset\) 用的，不过手动循环展开的好像还是快一点，不过不知道这个和编译器版本有没有关系

预处理的时间不到 \(10ms\)，空间不到 \(1MB\)，应该没有什么会被卡的地方

命名的规范是：前缀 \(\text{u32\_}\)、\(\text{u64\_}\)、\(\text{u128\_}\) 表示该函数作用于对应位宽的整数

有 \(rev=\text{按位翻转},hmw=1\text{的个数},highbit=\text{最高位},lowbit=\text{最低位},clz=\text{前导0的数量},ctz=\text{后缀0数量}\)

完全没用的东西

要计算复杂度，我们需要假设我们做哪些操作是 \(O(1)\) 的，这个就是我们采用的 \(\text{model}\)

一个 \(model\) 不能过于不现实，比如我们一般假设 \(\pm\) 是 \(O(1)\) 的运算，但在高精的情形下这就显然不合理

不合理的原因是没有考虑位长 \(w\)，也就是我们一个 unsigned int 或者 unsigned long long 能存储的二进制位数

考虑了之后，我们认为 \(\operatorname{bitand,bitor,xor,compl,\pm,\times,<,>,=,\&,*}\) 是 \(O(1)\) 的，之所以说是认为，是因为 \(\times\) 其实不能做到 \(O(1)\)，不过这个就涉及到比较底层的设计了，和算法关系不大，因为计算机算乘法也很快，所以就认为 \(\times\) 也是 \(O(1)\) 的了

考虑到现在的 \(\text{cpu}\) ，\(w\) 一般取 \(32\) 或 \(64\)

这个 \(\text{model}\) 就叫 \(\text{word RAM model}\)

还有一个常见的（一般是默认的）假设是 \(w=\Omega(\log n)\)，要不然我们取地址都不是 \(O(1)\) 的，就感觉不太能做

这些操作被称为 \(\text{AC}^0\) 基类，因为还可以扩展出一些其他的操作也是 \(O(1)\) 的

clz&ctz

这个东西直接做是不能做到 \(O(1)\) 的，最 \(\text{naive}\) 的就是按位枚举计数，这样是 \(O(w)\) 的

然后二分就可以做到 \(O(\log w)\)

但如果我们预处理，那么就可以做到 \(O(1)\) 求 \(\text{clz}\)（当然不是上面那种分段打表的做法）

我们从 \((a_1a_2a_3a_4)\) 向 \((a_2a_3a_40)\) 和 \((a_2a_3a_41)\) 连边，然后跑欧拉回路，可以在 \(O(w)\) 的时间内制备出一张类似 \(hash\) 表的东西，问的时候就可以 \(O(1)\) 查 \(\lfloor\log_2x\rfloor\) 了

因为我们假设了 \(w=\Omega(\log n)\) ，所以 \(O(w)\) 的制备时间认为白给的

然后 \(\text{clz}\) 都做了，对于 \(ctz\)，我们只需要 \(O(1)\) 求出 \(lowbit(x)\) 就可以了，写过树状数组就知道这是 \(x\operatorname{bitand}-x\)，不过我们一般认为 \(word\) 是 unsigned 的，所以还需要手动模拟一下补码的转化

popcount

这个又叫 \(\text{Hamming Weight}\)，也是我代码中的命名

直接枚举显然也是 \(O(w)\) 的

然后可以分治，以 \(b\) 为块长分块，每次合并使块长变为 \(2b\)，块内记录块内 \(1\) 的个数，一个块的值域是 \(2^b\)，显然不会溢出

这样需要构造出形如 \(00010001\cdots\) 这样的数，即 \(2^i-1\) 个 \(0\) 后接一个 \(1\)，预处理是 \(O(\log w)\) 的，但是发现分治过程就是这个反过来，所以其实不用预处理，可以在线制备，依然是 \(O(\log w)\) 的（不过这么慢的写法不会有人用就是了）

考虑块的值域是 \(O(2^b)\) 而最后答案不超过 \(O(w)\)，所以试着在 \(b=O(\log w)\) 时就停止分治

之后我们假设问题是在一个序列上，我们有 \(c_0,c_1,c_2\cdots c_{O(\log w)}\) 表示每个块内的答案，我们要求 \(\sum c_i\)，最简单的做法就是卷积，而在一个 \(word\) 内卷积是 \(O(1)\) 的，这时我们发现最后一个块内的值就是答案

既然最后一次可以用卷积提速，那之前的操作也可以用卷积提速，把每次分治合并变为卷积合并，这样每次块长从 \(b\) 变为 \(2^{b-1}\)，只会迭代 \(O(\log^* w)\) 次（\(\log^*\) 表示迭代对数，即反复取对数直到不超过 \(1\) 所需的次数）

\(\text{popcount}\) 被证明了不能在 \(\text{AC}^0\) 里 \(O(1)\) 的（不考虑分段打表这种需要 \(O(2^{w/c})\) 的预处理时间的做法，这样的预处理时间在理论上不可接受）

pack&unpack

我们想把一堆分开的数放在一起（\(\text{pack}\)），或者进行相反的操作（\(\text{unpack}\)）

\(\operatorname{pack}\) 对于数的规模有要求，假设我们有 \(t\) 个块，每个块的末尾有 \(\frac{w}{t^2}\) 的位置存储了我们想要的数据，我们只需要构造出 \(P=0000\cdots001\cdots001\)（有 \(t-1\)个 \(1\)，相邻的 \(1\) 的距离为 \(\frac{(t-1)w}{t^2}-1\)）

原数与 \(P\) 卷积的最高位的块就是我们需要的答案

不难发现再卷一次就可以 \(\text{unpack}\) 了，不过需要忽略无关位，这个预处理出无关位是 \(0\)，有关位为 \(1\) 的数就可以了

posted @ 2022-10-03 19:47 嘉年华_efX 阅读(113) 评论(1) 收藏举报

刷新页面返回顶部

嘉年华_efX

binary trick

binary trick

几乎没用的东西

完全没用的东西

clz&ctz

popcount

pack&unpack

公告