字符串 Hash 解题报告

题单：

题意简述：

给定 $n$ 个 $A$ 串和 $n$ 个 $B$ 串，长度均为 $m$，求最短的区间 $[l,\ r]$ 的长度，

使得不存在一个 $A$ 串 $a$ 和一个 $B$ 串 $b$ 满足 $a[l,\ r]\ =\ b[l,\ r]$。

思路：

可以用 Hash 先算出每个串的前缀哈希数组。尝试从小到大暴力枚举区间长度并匹配，发现会 T 掉三个点（实测 $4 s$）。注意到答案满足单调性：若区间 $[l,\ r]$ 合法，则任何包含它的更长区间也合法，因此可以用二分区间长度代替暴力枚举。

Code：


#include<bits/stdc++.h>
#define int long long

using namespace std;
// Shared state of the first program. Arrays are used 1-indexed by main/check.
int n,m;                         // values read first from input; loop bounds for the string index and the character position
string s1[505],s2[505];          // the two groups of input strings (s1[1..n], s2[1..n])
int f1[505][505],f2[505][505];   // f1[k][i] / f2[k][i]: hash of the first i characters of s1[k] / s2[k], reduced mod `mod`
int ksm[505];                    // ksm[i] = prime^i mod `mod`, filled in main
const int mod=10000019,prime=23; // modulus and base of the polynomial hash
int b[131];                      // character -> digit table; main assigns only 'A','G','C','T' (1..4)

int ans;                         // NOTE(review): not referenced by the visible code of this program

bool check(int len)
{
	for(int l=1;l<=m-len+1;l++)
	{
		int r=l+len-1;
		bool f=1;
		unordered_map<int,bool> mp;
		
		for(int i=1;i<=n;i++)
		{
			int nw=((f1[i][r]-f1[i][l-1]*ksm[len])%mod+mod)%mod;
			mp[nw]=1;
		}
		for(int i=1;i<=n;i++)
		{
			int nw=((f2[i][r]-f2[i][l-1]*ksm[len])%mod+mod)%mod;
			if(mp[nw]) { f=0; break; }
		}
		
//		cout<<len<<" "<<l<<" "<<r<<" "<<f<<"\n";
		
		if(f) return 1;
	}
	return 0;
}

// Reads n, m and the two groups of strings, builds prefix hashes, then binary
// searches the smallest window length accepted by check() and prints it.
// Prints nothing if no length in [1, m] is accepted.
signed main()
{
	// Non-zero digit per nucleotide so that every character contributes to the hash.
	b['A']=1;
	b['G']=2;
	b['C']=3;
	b['T']=4;

	ios::sync_with_stdio(0);
	cin.tie(0);
	cout.tie(0);

	// Powers of the hash base.
	ksm[0]=1;
	for(int i=1;i<505;i++) ksm[i]=ksm[i-1]*prime%mod;

	cin>>n>>m;
	for(int i=1;i<=n;i++) cin>>s1[i];
	for(int i=1;i<=n;i++) cin>>s2[i];

	// Prefix hashes: f[k][i] covers the first i characters of string k.
	for(int k=1;k<=n;k++)
	for(int i=1;i<=m;i++)
	{
		f1[k][i]=(f1[k][i-1]*prime+b[s1[k][i-1]])%mod;
		f2[k][i]=(f2[k][i-1]*prime+b[s2[k][i-1]])%mod;
	}

	// Lower-bound binary search over the length. A window that separates the
	// groups still separates them when extended, so check() is monotone.
	// The search keeps lo..hi inside [1, m], so check() is never called with
	// an out-of-range length (the previous post-scan started at L-2, i.e. at
	// length -1 or 0 for small answers).
	int lo=1,hi=m;
	while(lo<hi)
	{
		int mid=(lo+hi)>>1;
		if(check(mid)) hi=mid;
		else lo=mid+1;
	}

	// lo is the smallest accepted length provided any length is accepted.
	if(check(lo)) cout<<lo;

	return 0;
}

P4591 [TJOI2018] 碱基序列

题意：

from Here

给定一个字符串 $A$ 以及 $n$ 组字符串。现在从每组字符串中都随机选出一个字符串，按组号顺序首尾拼接成字符串 $B$。定义该情况的方案数为 $A$ 与 $B$ 相等的子串总个数。求所有情况的方案数总和。

对于 $30\%$ 的数据，$1\leq k\leq 25$，$1\le |s|\leq 10000$，$1\le a_i\leq 3$。
对于 $100\%$ 的数据，$1\leq k\leq100$，$1\le |s|\leq 10000$，$1\le a_i \leq10$。碱基序列的长度均不超过 $15$。字符集为大写字母。

思路：

（听说是计数 Dp？没看出来）

通过观察数据范围可知，$O(|s| \times \sum a_i)$ 的复杂度（约 $10^4 \times 10^3 = 10^7$）即可通过。

预处理出原串 Hash 值，和每个“碱基序列”的 Hash 值。

由前往后枚举原串的第 $i$ 位，（状态转移方程？）

$ans[i][id]=\sum\limits_{j=1}^{a_{id}} ans[i-len[id][j]][id-1]$，其中只对满足 $i-len[id][j] \ge 0$，且原串中以第 $i$ 位结尾、长度为 $len[id][j]$ 的子串与第 $id$ 组第 $j$ 个串相等（Hash 值相同）的 $j$ 求和。

初始值 $ans[i][0]=1$。

Code：


#include<bits/stdc++.h>
#define int long long
#define ull unsigned long long

using namespace std;

// Shared state of the second program.
ull ksm[10005];        // ksm[i] = prime^i in unsigned 64-bit arithmetic (no explicit modulus)
ull f[10005];          // f[i] = hash of the first i characters of s
int cnt[105];          // cnt[i] = number of strings read for group i
ull h[105][15];        // h[i][j] = hash of the j-th string of group i
int len[105][15];      // len[i][j] = length of the j-th string of group i
//unordered_map<>
const ull mod=1e9+7,prime=79; // mod reduces the counts in ans; prime is the hash base
string s;              // the main string
int k;                 // number of groups
int ans[10005][105];   // ans[r][j]: count accumulated by main for matches ending at position r using groups 1..j

// Reads the next whitespace-delimited string from standard input, records its
// length in len[i][j] and returns its polynomial hash (base `prime`, 'A' maps
// to 1, unsigned 64-bit wrap-around instead of an explicit modulus).
ull Hash(int i,int j)
{
	string word;
	cin>>word;

	len[i][j]=word.size();

	ull value=0;
	for(char c:word)
		value=value*prime+c-'A'+1;
	return value;
}
/*
ull find_hash(int l,int r)
{
	if(l>r) swap(l,r); 
	return ((f[r]-f[l-1]*ksm[r-l+1]%mod)%mod+mod)%mod;
}
*/
// Reads k, the main string and k groups of strings, then counts, modulo `mod`,
// the ways to cover a substring of s by taking one string from each group in
// order. Substring equality is decided by comparing hashes.
signed main()
{
	ios::sync_with_stdio(0);
	cin.tie(0);cout.tie(0);

	// Powers of the hash base (wrap-around arithmetic).
	ksm[0]=1;
	for(int e=1;e<(10005);e++) ksm[e]=ksm[e-1]*prime;

	cin>>k;
	cin>>s;
	const int n=s.size();

	// Prefix hashes of s, 1-indexed.
	for(int pos=1;pos<=n;pos++)
		f[pos]=f[pos-1]*prime+s[pos-1]-'A'+1;

	for(int g=1;g<=k;g++)
	{
		cin>>cnt[g];
		for(int x=1;x<=cnt[g];x++)
			h[g][x]=Hash(g,x);
	}

	// For every end position r, group g and string x of that group: if the
	// string matches s ending at r, extend the counts of group g-1 that end
	// right before the match.
	for(int r=1;r<=n;r++)
	for(int g=1;g<=k;g++)
	for(int x=1;x<=cnt[g];x++)
	{
		const int width=len[g][x];
		if(width>r) continue;            // the string would start before position 1
		const int before=r-width;        // last position preceding the match

		if(f[r]-f[before]*ksm[width]!=h[g][x]) continue;

		// Equivalent to the recurrence with ans[i][0]=1: the first group
		// contributes one fresh start per match.
		ans[r][g]+=ans[before][g-1]+(bool)(g==1);
		ans[r][g]%=mod;
	}

	// Sum over all end positions of complete chains (all k groups used).
	int total=0;
	for(int r=1;r<=n;r++)
	{
		total+=ans[r][k]%mod;
		total%=mod;
	}

	cout<<total;
	return 0;
}

P3167 [CQOI2014] 通配符匹配

题意：

给出一个带通配符的模式串 $S$，再给出 $n$ 个串 $T_i$，对每个 $T_i$ 判断它能否被 $S$ 完整匹配。

其中，* 可以匹配 $0$ 个及以上的任意字符（$\ge 0$），? 可以匹配恰好一个任意字符（$=1$）。

字符串长度不超过 $100000$
$1 \le n \le 100$
通配符个数不超过 $10$

思路：

听说又是 Dp？

我们又通过观赏数据范围，发现通配符个数最多只有 $10$，可以暴搜。$O($能过$)$。

初始化不好写。

如果 * 连续出现，那么这些连续的 * 和只有 $1$ 个 * 是等价的（大优化）。

可以把输入的模式串按通配符断开，算出断开之后每个非通配符段的 Hash 值，然后按题意模拟：从头开始进行匹配，一旦合法就直接跳出递归。

Code：

#include<bits/stdc++.h>
#define int long long
#define ull unsigned long long

using namespace std;

// Shared state of the third program.
ull f[1<<20];        // f[i] = hash of the first i characters of the current text nw (rebuilt by solve)
ull ksm[1<<20];      // ksm[i] = prime^i in unsigned 64-bit arithmetic
const ull prime=79;  // hash base
string a[105];       // pattern pieces a[1..cnt]: each is "*", "?" or a run of ordinary characters
ull h[105];          // h[i] = hash of a[i]; main fills it only for pieces that are not "*" or "?"
int cnt;             // number of pattern pieces
string nw;           // text currently being matched

bool dfs(int p,int num)
{
//	if(p-l+1>a[num].size()) return 0; 
//	cout<<p<<" "<<num<<" "<<cnt<<" "<<"\n"; 
	if(p==nw.size()+1) 
	{
		return num==cnt+(a[cnt]!="*");
	}
	if(num>cnt) return 0;
	
	if(a[num]=="?") return dfs(p+1,num+1);
	else if(a[num]=="*") 
	{
		bool f=0;
		for(int i=p;i<=nw.size()+1;i++)
		{
			f=dfs(i,num+1);
			if(f) return 1;
		}
		return 0;
	}
	else
	{
		if(p+a[num].size()-1>nw.size()) return 0;
//	cout<<l<<" "<<num<<" "<<" "<<"\n";
		if(h[num]==f[p+a[num].size()-1]-f[p-1]*ksm[a[num].size()]) return dfs(p+a[num].size(),num+1);
		return 0;
	}
	return 0;
}

// Handles one query: reads the text into nw, rebuilds its prefix hashes in f
// and prints YES or NO depending on whether the whole pattern matches it.
void solve()
{
	cin>>nw;

	const int n=nw.size();
	for(int i=0;i<n;i++)
		f[i+1]=f[i]*prime+nw[i]-'a'+1;

	puts(dfs(1,1)?"YES":"NO");
}

// Polynomial hash of s with base `prime`, 'a' mapped to 1, evaluated in
// unsigned 64-bit arithmetic (wrap-around acts as the modulus).
ull Hash(string s)
{
	ull acc=0;
	for(char c:s)
		acc=acc*prime+c-'a'+1;
	return acc;
}

signed main()
{
//	freopen("a.in","r",stdin);
	
	//mt19937_64 myrand(time(0));
	
	ios::sync_with_stdio(0);
	cin.tie(0);
	cout.tie(0);
	
	ksm[0]=1;
	for(int i=1;i<(1<<20);i++) ksm[i]=ksm[i-1]*prime;
	
	string s;
	cin>>s;
	
	s+="?";
//	int l=0;
	string ans;
	if(s[0]=='*') a[++cnt]="*";
	else if(s[0]=='?') a[++cnt]="?";
	else ans+=s[0];
	for(int i=1;i<s.size();i++)
	{
		if(s[i]=='*')
		{
			if(ans!="") a[++cnt]=ans;
			if(a[cnt]!="*") a[++cnt]="*";
			ans="";
		}
		
		else if(s[i]=='?')
		{
			if(ans!="") a[++cnt]=ans;
			a[++cnt]="?";
			ans="";
		}
		else ans+=s[i];
	}
	cnt--;
//	cout<<"cnt="<<cnt<<" ";
//	cout<<a[1]<<" ";
//	cout<<cnt<<" "; 
	
	
//	for(int i=1;i<=cnt;i++)
//	{
//		cout<<a[i]<<"\n";
//	}
	
	for(int i=1;i<=cnt;i++)
		if(a[i]!="*"&&a[i]!="?")
			h[i]=Hash(a[i]);
	
	int q;
	cin>>q;
	while(q--) solve();
	
	return 0;
}

P4824 [USACO15FEB] Censoring S

P3121 [USACO15FEB] Censoring G 的弱化版。

题意：

给出一个字符串 $ S $，删去 $ S $ 中第一次出现的子串 $ T $，然后不断重复这一过程，直到 $ S $ 中不存在子串 $ T $ 。

注意：每次删除一个子串后，可能会出现一个新的子串 $ T $ （说白了就是删除之后，两端的字符串有可能会拼接出来一个新的子串 $ T $ ）。

思路：

设 $T$ 长 $len$。

从头开始枚举，每次枚举到一位时，Hash 数组末尾指针 $+1$，现算 $S$ 的 Hash 值，若出现 $T$（Hash 数组里 $len$ 长的 Hash 值与 $T$ 的 Hash 值相等），则把 Hash 数组的末尾位置指针 $-len$，就是把 Hash 数组删去 $len$ 长，就等价于把 $T$ 删除。

可以维护一个链表统计答案：$l[i]$ 指向位置 $i$ 左边第一个没有被删的位置；删除时再用 vis 数组记录哪些位置被删去，输出时只输出 $S$ 中没有被删去的字符。

Code：

#include<bits/stdc++.h>
#define int long long
#define ull unsigned long long

using namespace std;

// Fast-input scaffolding.
// NOTE(review): buf, p1, p2, buffer, op1 and op2 are not referenced anywhere
// else in the visible code of this program.
const int Size=(1<<20)+1;
char buf[Size],*p1=buf,*p2=buf;
char buffer[Size];
int op1=-1;
const int op2=Size-1;
// Redefines getchar() as a buffered reader over In: when the cursor ss has
// reached the end marker tt, In is refilled with up to 1<<20 bytes via fread;
// if nothing could be read the expression yields EOF, otherwise it yields the
// byte at ss and advances ss.
#define getchar()                                                              \
(tt == ss && (tt=(ss=In)+fread(In, 1, 1 << 20, stdin), ss == tt)     \
	? EOF                                                                 \
	: *ss++)
char In[1<<20],*ss=In,*tt=In; // input buffer, read cursor, end of the valid data
// Reads one decimal integer through getchar(). Leading non-digit characters
// are skipped; the number is negated if the character immediately before its
// first digit is '-'. Returns 0 if the input ends before any digit is found.
inline int read()
{
	int x=0,c=getchar(),f=0;
	// Skip to the first digit, remembering whether the last skipped character
	// was a minus sign.
	for(;c>'9'||c<'0';f=c=='-',c=getchar())
		if(c==EOF) return 0;   // EOF is not a digit: without this exit the loop never terminates
	// Accumulate the digits.
	for(;c>='0'&&c<='9';c=getchar())
		x=x*10+(c-'0');
	return f?-x:x;
}
// Prints x in decimal with putchar, preceded by '-' when x is negative.
inline void write(int x)
{
	// Work on the unsigned magnitude: negating the most negative value as a
	// signed integer would overflow.
	unsigned long long mag=x;          // conversion is modulo 2^64
	if(x<0)
	{
		putchar('-');
		mag=0ULL-mag;
	}

	// Collect the digits least-significant first, then emit them reversed.
	char digits[24];
	int used=0;
	do
	{
		digits[used++]=(char)('0'+mag%10);
		mag/=10;
	}
	while(mag);

	while(used) putchar(digits[--used]);
}

// Shared state of the fourth program.
string s,t;                    // the text and the word to be censored
ull f[1<<20];                  // f[i] = hash of the first i characters that are currently kept
int l[1<<20]/*,r[1<<20]*/;     // l[i]: position main follows from i when walking left over kept characters
bool vis[1<<20];               // vis[i] is cleared when position i of s is deleted
const ull prime=131;           // hash base

// Polynomial hash of t with base `prime`, 'a' mapped to 1, evaluated in
// unsigned 64-bit arithmetic (wrap-around acts as the modulus).
ull Hash(string t)
{
	ull acc=0;
	for(char c:t)
		acc=acc*prime+c-'a'+1;
	return acc;
}

// Binary exponentiation: returns x^p reduced modulo 2^64.
// A non-positive exponent yields 1.
unsigned long long KSM(int x,int p)
{
	// Square in unsigned arithmetic: the wrap-around is intended, whereas
	// squaring the signed parameter overflows (undefined behaviour).
	unsigned long long base=x;
	unsigned long long nw=1;

	// p>0 instead of p!=0: a negative p never reaches zero under >>=.
	while(p>0)
	{
		if(p&1) nw*=base;
		p>>=1;
		base*=base;
	}

	return nw;
}

signed main()
{
	ios::sync_with_stdio(0);
	cin.tie(0);cout.tie(0);
	
	cin>>s>>t;
	int n=s.size(),m=t.size();
	ull nw=Hash(t);
	
//	cout<<nw<<"\n";
	
	ull ksm=KSM(prime,m);
//	cout<<ksm<<"\n";
	
	for(int i=0;i<=n+1;i++)
	{
	//	hash;
		l[i]=i-1;
//		r[i]=i+1;
		vis[i]=1;
	}
	l[0]=0;
	
	int tot=0;
	for(int i=1;i<=n;i++)
	{
		tot++;
		
	//	cout<<tot<<" ";
		
		f[tot]=f[tot-1]*prime+s[i-1]-'a'+1;
		
	//	if(tot>=m)cout<<f[tot]-f[tot-m]*ksm<<" "<<nw<<"\n";
		
		if(tot>=m&&f[tot]-f[tot-m]*ksm==nw)
		{
			int p=i;
			for(int cnt=1;cnt<=m;cnt++)
			{
				vis[p]=0;
				p=l[p];
			}
			l[i+1]=p;
		//	r[p]=i+1;
			tot-=m;
		}
		
	}
	
	for(int i=1;i<=n;i++)
	{
		if(vis[i])
		putchar(s[i-1]);
	}
	//mt19937_64 myrand(time(0));
	return 0;
}

P3121 [USACO15FEB] Censoring G

P4824 [USACO15FEB] Censoring S 的强化版。

吐槽:

自己原来推出，经子串长度进行分类，最多 $<500$ 种（假设每个子串长度互不相同，且长度由小到大构成公差为 $1$ 的等差数列，等差数列求和公式可求出最多种类数量），再在每个长度上开个 unordered_map，把这个长度的字符串的 Hash 值塞入以这个长度为下标的 unordered_map，单次查询可以做到 $O(1)$，总时间复杂度是 4e7 的，完全能过。

结果被神秘 mp[x][y] 查找困了 $2$ 个小时，发现讨论区有和我一样的思路，又有代码，一行一行对，突然发现神秘 mp[x].count(y)，改后 $2000 ms -> 50 ms$。再交一遍过。

Update：由于字符串 Hash 值冲突概率极小，可以省去 unordered_map 的长度维，只存 Hash 值是否出现即可。

题意：

与弱化版基本一致，只是匹配串由 $1$ 个改为 $n$ 个。

（单词就是匹配串）。

给出字符串 $S$ 和 $n$ 个单词，每次在 $S$ 中找到最早出现的列表中的单词（最早出现指该单词的开始位置最小），然后从 $S$ 中删除这个单词。重复这个操作直到 $S$ 中没有列表里的单词为止。

列表中的单词不会出现一个单词是另一个单词子串的情况，这意味着每个列表中的单词在 $S$ 中出现的开始位置是互不相同的。

思路：

在吐槽里。

只是把要匹配的 $1$ 个串改为了多个串，每次枚举长度时从大到小（题目要求删最先出现的那个），统计答案时类似弱化版，在链表上跳即可。

Update：可以维护一个类似栈的 ans 数组存结果，每次成功匹配后，栈顶指针减去这个子串的长度；栈顶指针的变动与 tot 完全一致，因此两者可以共用。

Code：

#include<bits/stdc++.h>
#define int long long
#define ull unsigned long long
//#define (1<<17) (1<<17)+1

using namespace std;

// Shared state of the fifth program.
string s,t;                    // s is the text; NOTE(review): this global t is shadowed by a local t in main
ull f[100005];                 // f[i] = hash of the first i characters currently kept
ull ksm[100005];               // ksm[i] = prime^i in unsigned 64-bit arithmetic
const ull prime=131;           // hash base

unordered_map<ull,bool> mp;    // hashes of all words; only key presence is queried
int siz[1005],cnt=0;           // siz[1..cnt]: the distinct word lengths (sorted ascending in main)
bool VIS[100005];              // VIS[len] is set once a word of length len has been seen
//unordered_map<int,bool> VIS; 

int ans[100005];               // kept characters, used as a stack whose height is tot in main

// Reads the text s and q words, then scans s left to right while keeping the
// surviving characters on a stack. Whenever the top of the stack ends with a
// listed word (detected by hash), that word is popped. Prints what remains.
signed main()
{
	ios::sync_with_stdio(0);

	// Powers of the hash base (wrap-around arithmetic).
	ksm[0]=1;
	for(int i=1;i<(100005);i++) ksm[i]=ksm[i-1]*prime;

	cin>>s;
	int n=s.size(),q;
	cin>>q;
	while(q--)
	{
		string t;
		cin>>t;

		// Remember each distinct word length once.
		int len=t.size();
		if(!VIS[len])
		{
			VIS[len]=1;
			siz[++cnt]=len;
		}

		ull nw=0;
		for(int i=0;i<len;i++)
			nw=nw*prime+t[i]-'a'+1;

		mp[nw]=1;
	}

	sort(siz+1,siz+1+cnt);

	int tot=0;                         // current stack height
	for(int i=1;i<=n;i++)
	{
		tot++;

		ans[tot]=s[i-1];
		f[tot]=f[tot-1]*prime+s[i-1]-'a'+1;

		// Try the word lengths from longest to shortest.
		for(int p=cnt;p>=1;p--)
		{
			// A word longer than the stack cannot end here; without this
			// guard f[] would be read at a negative index.
			if(siz[p]>tot) continue;

			if(mp.count(f[tot]-f[tot-siz[p]]*ksm[siz[p]]))
			{
				tot-=siz[p];
				// The stack below the popped word was already checked for
				// matches when its top character was pushed.
				break;
			}
		}
	}

	for(int i=1;i<=tot;i++)
		putchar(ans[i]);

	return 0;
}

posted @ 2025-02-08 19:13 Wy_x 阅读(36) 评论(0) 收藏举报

刷新页面返回顶部

༼ つ ◕_◕ ༽つ Wy-x 的 blog 🤗

凤凰台上凤凰游，凤去台空江自流。

字符串 Hash 解题报告

题单：

P3667 [USACO17OPEN] Bovine Genomics G

题意简述：

思路：

Code：

P4591 [TJOI2018] 碱基序列

题意：

思路：

Code：

P3167 [CQOI2014] 通配符匹配

题意：

思路：

Code：

P4824 [USACO15FEB] Censoring S

题意：

思路：

Code：

P3121 [USACO15FEB] Censoring G

吐槽:

题意：

思路：

Code：

公告