AC解 - Life Forms(POJ#3294)
原题: http://poj.org/problem?id=3294
或者:http://acm.nankai.edu.cn/p1312.html
Description
You may have wondered why most extraterrestrial life forms resemble humans, differing by superficial traits such as height, colour, wrinkles, ears, eyebrows and the like. A few bear no human resemblance; these typically have geometric or amorphous shapes like cubes, oil slicks or clouds of dust.
The answer is given in the 146th episode of Star Trek - The Next Generation, titled The Chase. It turns out that the vast majority of the quadrant's life forms ended up with a large fragment of common DNA.
Given the DNA sequences of several life forms represented as strings of letters, you are to find the longest substring that is shared by more than half of them.
Input
Standard input contains several test cases. Each test case begins with 1 ≤ n ≤ 100, the number of life forms. n lines follow; each contains a string of lower case letters representing the DNA sequence of a life form. Each DNA sequence contains at least one and not more than 1000 letters. A line containing 0 follows the last test case.
Output
For each test case, output the longest string or strings shared by more than half of the life forms. If there are many, output all of them in alphabetical order. If there is no solution with at least one letter, output "?". Leave an empty line between test cases.
Sample Input
3
abcdefg
bcdefgh
cdefghi
3
xxx
yyy
zzz
0
Sample Output
bcdefg
cdefgh
?
分析:本题跟Longest Common Substring的解法类似,也是求多个字符串的最长公共子串,只不过本题中只要求该子串被超过半数的字符串共有即可(而不必是所有n个字符串的公共子串);另外如果有多个等长的公共子串,需要同时按字典顺序输出。
首先构造n个DNA序列合成的字符串的后缀树,然后采用深度优先遍历算法(这里使用后序遍历)找出满足条件的最长公共子串。在深度优先算法中,只需要考虑内部结点下是否有过半的不同后缀(过半后缀个数变量名为dominatedCount)作为它的后代叶子结点。在任何一个内部结点Node中,如果它的所有儿子结点都不满足含过半后缀的条件,就需要重新统计Node自身是否满足该条件,否则只要有任何一个儿子结点满足条件,结点Node就不需要考虑了,因为Node的深度肯定比满足条件的儿子结点小。在具体实现中,需要在每个节点中添加一个标志done来标记该结点是否已经被处理。由于使用了另一篇文中介绍的mcc算法构造后缀树,输出的多个等长的公共子串(如果存在多个的话)已经是字典顺序。另外,结点的深度就是pathlen属性。
代码:
import java.util.ArrayList;
import java.util.BitSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
import java.util.Stack;

/**
 * POJ#3294: Life Forms.
 *
 * <p>Finds the longest substring(s) shared by more than half of the given DNA
 * strings. All strings are concatenated, each followed by its own unique
 * terminator character, and a suffix tree of the concatenation is built with
 * McCreight's algorithm (children kept in lexicographic order). A single
 * post-order walk then reports the deepest internal nodes whose leaves come
 * from at least {@code dominatedCount} different input strings.
 *
 * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
 * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php)
 *
 * @author ljs
 * 2011-07-04
 */
public class Main {

    /** First terminator character (U+0100); string k is terminated by FIRST_TERMINATOR + k. */
    private static final char FIRST_TERMINATOR = '\u0100';

    /** A suffix-tree node; its incoming edge is labelled text[start..end]. */
    private static class SuffixNode {
        private String text;
        private List<SuffixNode> children = new LinkedList<SuffixNode>();

        private SuffixNode link; // suffix link
        private int start;       // first index of the edge label, -1 for the root
        private int end;         // last index (inclusive) of the edge label
        private int pathlen;     // length of the string spelled from the root to this node
        private boolean done;    // set once this node or one of its descendants qualified
        private BitSet owners;   // input strings seen below this node; only valid during traverse()

        public SuffixNode(String text, int start, int end, int pathlen) {
            this.text = text;
            this.start = start;
            this.end = end;
            this.pathlen = pathlen;
        }

        /** Creates the root node. */
        public SuffixNode(String text) {
            this.text = text;
            this.start = -1;
            this.end = -1;
            this.pathlen = 0;
        }

        public int getLength() {
            if (start == -1) {
                return 0;
            } else {
                return end - start + 1;
            }
        }

        public boolean isRoot() {
            return start == -1;
        }

        public String getCoordinate() {
            return "[" + start + ".." + end + "/" + this.pathlen + "]";
        }

        /** Returns the edge label leading into this node ("" for the root). */
        public String getString() {
            if (start != -1) {
                return this.text.substring(start, end + 1);
            } else {
                return "";
            }
        }

        public String toString() {
            return this.getString() + getCoordinate() + "[" + this.done + "]";
        }
    }

    /** Mutable scan state shared between buildSuffixTree, fastscan and slowscan. */
    private static class State {
        private SuffixNode u;     // parent(head)
        private SuffixNode w;     // s(head[i-1])
        private SuffixNode v;     // head[i-1]
        private int j;            // the global index of text starting from 0 to text.length()
        private boolean finished; // is this suffix insertion finished?
    }

    private SuffixNode root;
    private String text;
    private int[] termpos;
    private int dominatedCount;

    /**
     * @param text    concatenation of all input strings, each followed by a unique terminator
     * @param termpos termpos[k] is the index in text of the terminator of string k
     */
    public Main(String text, int[] termpos) {
        this.text = text;
        this.termpos = termpos;
        // "more than half" of the strings
        dominatedCount = termpos.length / 2 + 1;
    }

    /** Builds the suffix tree of {@code text}, inserting one suffix per iteration. */
    private void buildSuffixTree() {
        if (root == null) {
            root = new SuffixNode(text);
            root.link = root; // link to itself
        }

        SuffixNode u = root;
        SuffixNode v = root;
        State state = new State();

        for (int i = 0; i < text.length(); i++) {
            // process each suffix
            SuffixNode s = u.link;

            int uvLen = v.pathlen - u.pathlen;
            if (u.isRoot() && !v.isRoot()) {
                uvLen--;
            }
            int j = s.pathlen + i;

            // init state
            state.u = s;
            state.w = s; // if uvLen = 0
            state.v = s;
            state.j = j;
            state.finished = false;

            // execute fast scan
            if (uvLen > 0) {
                fastscan(state, s, uvLen, j);
            }

            // establish the suffix link with v
            SuffixNode w = state.w;
            v.link = w;

            // execute slow scan
            if (!state.finished) {
                j = state.j;
                // w must be an internal node when state.finished=false, then it
                // must have a suffix link, so u can be updated.
                state.u = w;
                slowscan(state, w, j);
            }

            u = state.u;
            v = state.v;
        }
    }

    /** Slow scan (character by character) until head (= state.v) is found. */
    private void slowscan(State state, SuffixNode currNode, int j) {
        boolean done = false;
        int keyLen = text.length() - j;
        for (int i = 0; i < currNode.children.size(); i++) {
            SuffixNode child = currNode.children.get(i);

            // use min(child.key.length, key.length)
            int childKeyLen = child.getLength();
            int len = childKeyLen < keyLen ? childKeyLen : keyLen;
            int delta = 0;
            for (; delta < len; delta++) {
                if (text.charAt(j + delta) != text.charAt(child.start + delta)) {
                    break;
                }
            }
            if (delta == 0) {
                // this child doesn't match any character with the new key
                // order keys by lexi-order
                if (text.charAt(j) < text.charAt(child.start)) {
                    // e.g. child="e" (currNode="abc"): insert "c^" before "e"
                    int pathlen = text.length() - j + currNode.pathlen;
                    SuffixNode node = new SuffixNode(text, j, text.length() - 1, pathlen);
                    currNode.children.add(i, node);
                    // currNode is already registered as state.u
                    state.v = currNode;
                    state.finished = true;
                    done = true;
                    break;
                } else {
                    // key.charAt(0) > child.key.charAt(0)
                    // the largest new key is appended after iterating all children
                    continue;
                }
            } else {
                // current child's key partially matches with the new key
                if (delta == len) {
                    if (keyLen > childKeyLen) {
                        // a suffix tree with unique terminators can't have the other two cases
                        // e.g. child="ab", insert "abc^": descend into child
                        state.u = child;
                        j += childKeyLen;
                        state.j = j;
                        slowscan(state, child, j);
                    }
                } else {
                    // 0 < delta < len
                    // e.g. child="abc", insert "abd^": split into "ab" -> {"c", "d^"}

                    // insert the new node: ab
                    int nodepathlen = child.pathlen - (child.getLength() - delta);
                    SuffixNode node = new SuffixNode(text,
                            child.start, child.start + delta - 1, nodepathlen);
                    node.children = new LinkedList<SuffixNode>();

                    int tailpathlen = (text.length() - (j + delta)) + nodepathlen;
                    SuffixNode tail = new SuffixNode(text,
                            j + delta, text.length() - 1, tailpathlen);

                    // update child node: c
                    child.start += delta;
                    if (text.charAt(j + delta) < text.charAt(child.start)) {
                        node.children.add(tail);
                        node.children.add(child);
                    } else {
                        node.children.add(child);
                        node.children.add(tail);
                    }
                    // update parent
                    currNode.children.set(i, node);

                    // currNode is already registered as state.u
                    state.v = node;
                    state.finished = true;
                }
                done = true;
                break;
            }
        }
        if (!done) {
            int pathlen = text.length() - j + currNode.pathlen;
            SuffixNode node = new SuffixNode(text, j, text.length() - 1, pathlen);
            currNode.children.add(node);
            // currNode is already registered as state.u
            state.v = currNode;
            state.finished = true;
        }
    }

    /** Fast scan (comparing first characters only) until w is found. */
    private void fastscan(State state, SuffixNode currNode, int uvLen, int j) {
        for (int i = 0; i < currNode.children.size(); i++) {
            SuffixNode child = currNode.children.get(i);

            if (text.charAt(child.start) == text.charAt(j)) {
                int len = child.getLength();
                if (uvLen == len) {
                    // then we find w; slow scan is needed after this child
                    state.u = child;
                    state.w = child;
                    state.j = j + len;
                } else if (uvLen < len) {
                    // branching and cut child short
                    // e.g. child="abc", uvLen = 2, suffix part "abd^":
                    // split into "ab" -> {"c", "d^"}

                    // insert the new node: ab; child is now c
                    int nodepathlen = child.pathlen - (child.getLength() - uvLen);
                    SuffixNode node = new SuffixNode(text,
                            child.start, child.start + uvLen - 1, nodepathlen);
                    node.children = new LinkedList<SuffixNode>();

                    int tailpathlen = (text.length() - (j + uvLen)) + nodepathlen;
                    SuffixNode tail = new SuffixNode(text,
                            j + uvLen, text.length() - 1, tailpathlen);

                    // update child node: c
                    child.start += uvLen;
                    if (text.charAt(j + uvLen) < text.charAt(child.start)) {
                        node.children.add(tail);
                        node.children.add(child);
                    } else {
                        node.children.add(child);
                        node.children.add(tail);
                    }

                    // update parent
                    currNode.children.set(i, node);

                    // currNode is already registered as state.u
                    state.w = node;
                    state.finished = true;
                    state.v = node;
                } else {
                    // uvLen > len
                    // e.g. child="abc", uvLen = 4, suffix part "abcdefg^":
                    // jump to next node
                    uvLen -= len;
                    state.u = child;
                    j += len;
                    state.j = j;
                    fastscan(state, child, uvLen, j);
                }
                break;
            }
        }
    }

    /**
     * Collects the root-to-node paths of all deepest nodes shared by at least
     * {@code dominatedCount} input strings, in lexicographic order. When no
     * non-empty substring qualifies, the single returned path holds only the root.
     */
    public List<List<SuffixNode>> solve() {
        List<List<SuffixNode>> paths = new ArrayList<List<SuffixNode>>();
        Stack<SuffixNode> stack = new Stack<SuffixNode>();
        stack.push(root);
        traverse(root, 0, paths, stack);
        return paths;
    }

    /**
     * Post-order walk. The set of input strings owning a leaf below each node
     * is merged bottom-up, so every node is visited exactly once instead of
     * re-scanning its subtree for each candidate.
     *
     * @param currNode node to visit; it must already be on top of {@code stack}
     * @param maxDepth best qualifying depth found so far
     * @param paths    receives the root-to-node paths of the best nodes
     * @param stack    current root-to-node path; currNode is popped before returning
     * @return the updated best depth
     */
    public int traverse(SuffixNode currNode, int maxDepth,
            List<List<SuffixNode>> paths, Stack<SuffixNode> stack) {
        BitSet owners = new BitSet(termpos.length);
        boolean childDone = false;

        for (SuffixNode child : currNode.children) {
            stack.push(child);
            maxDepth = traverse(child, maxDepth, paths, stack);
            childDone |= child.done;
            if (child.owners != null) {
                owners.or(child.owners);
                child.owners = null; // merged into the parent; release it
            }
        }

        // visit currNode
        if (currNode.children.isEmpty()) {
            int owner = ownerOfLeaf(currNode);
            if (owner >= 0) {
                owners.set(owner);
            }
        } else if (childDone) {
            // a deeper node already qualified, so this shallower node is
            // out of consideration
            currNode.done = true;
        } else if (owners.cardinality() >= dominatedCount) {
            int depth = currNode.pathlen;
            if (depth > maxDepth) {
                maxDepth = depth;
                paths.clear();
                paths.add(new ArrayList<SuffixNode>(stack)); // register the path
            } else if (depth == maxDepth) {
                paths.add(new ArrayList<SuffixNode>(stack)); // register the path
            }
            currNode.done = true;
        }

        currNode.owners = owners;
        stack.pop();
        return maxDepth;
    }

    /**
     * Returns the index of the input string a leaf belongs to: the first entry
     * of {@code termpos} (in array order) that lies on the leaf's edge label,
     * or -1 if none does.
     */
    private int ownerOfLeaf(SuffixNode leaf) {
        for (int j = 0; j < termpos.length; j++) {
            if (termpos[j] >= leaf.start && termpos[j] <= leaf.end) {
                return j;
            }
        }
        return -1;
    }

    /** Dumps the tree to standard output; for test purposes only. */
    public void printTree() {
        System.out.format("The suffix tree for S = %s is: %n", this.text);
        this.print(0, this.root);
    }

    private void print(int level, SuffixNode node) {
        for (int i = 0; i < level; i++) {
            System.out.format(" ");
        }
        System.out.format("|");
        for (int i = 0; i < level; i++) {
            System.out.format("-");
        }
        System.out.format("%s(%d..%d/%d,%s)%n",
                node.getString(), node.start, node.end, node.pathlen, node.done);
        for (SuffixNode child : node.children) {
            print(level + 1, child);
        }
    }

    /**
     * Reads the next test-case size, skipping blank lines.
     *
     * @return the parsed count, or 0 when the input ends without a terminating "0" line
     */
    private static int readCount(Scanner in) {
        while (in.hasNextLine()) {
            String line = in.nextLine().trim();
            if (!line.isEmpty()) {
                return Integer.parseInt(line);
            }
        }
        return 0;
    }

    /** Solves one test case and prints its answer lines to standard output. */
    private static void printAnswer(String[] lifeforms) {
        int count = lifeforms.length;
        if (count == 1) {
            // a single life form is trivially "more than half"
            System.out.println(lifeforms[0]);
            return;
        }

        // join all strings, each closed by its own terminator above the 'a'..'z' range
        int[] termpos = new int[count];
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < count; j++) {
            sb.append(lifeforms[j]);
            sb.append((char) (FIRST_TERMINATOR + j));
            termpos[j] = sb.length() - 1;
        }

        Main stree = new Main(sb.toString(), termpos);
        stree.buildSuffixTree();

        for (List<SuffixNode> path : stree.solve()) {
            if (path.size() == 1) {
                // only root: no common substring with at least one letter
                System.out.println("?");
                break;
            }
            StringBuilder sbgenes = new StringBuilder();
            for (int i = 1; i < path.size(); i++) {
                sbgenes.append(path.get(i).getString());
            }
            System.out.println(sbgenes);
        }
    }

    /**
     * Reads test cases from standard input until a count of 0 (or end of input)
     * and prints the answers, separated by one empty line.
     */
    public static void main(String[] args) throws Exception {
        Scanner cin = new Scanner(System.in);
        boolean first = true;
        for (int n = readCount(cin); n > 0; n = readCount(cin)) {
            String[] lifeforms = new String[n];
            for (int i = 0; i < n; i++) {
                lifeforms[i] = cin.nextLine().trim();
            }
            if (!first) {
                System.out.println();
            }
            first = false;
            printAnswer(lifeforms);
        }
    }
}
测试数据(注意n可以取值等于1,1 ≤ n ≤ 100):
3
abcdefg
bcdefgh
cdefghi
3
xxx
yyy
zzz
1
abc
2
aaa
aaaa
0
bcdefg
cdefgh
?
abc
aaa