基于词典的正向最大匹配中文分词算法,能实现中英文数字混合分词

基于词典的正向最大匹配中文分词算法,能实现中文、英文、数字的混合分词。比如能分出这样的词:bb霜、3室、乐phone、touch4、mp3、T恤。

第一次写中文分词程序,欢迎拍砖。

publicclass MM2
{
privatestaticfinal Log log = LogFactory.getLog(MM2.class);

privatestatic HashMap<String, Integer> dictionary =null;
privatestaticfinalint WORD_MAX_LENGTH =9;
private Reader reader;

static
{
loadDictionary();
}


public MM2(Reader reader)
{
this.reader = reader;
}


//切分出由中文、字母、数字组成的句子
public ArrayList<Sentence> getSentence() throws IOException
{
ArrayList
<Sentence> list=new ArrayList<Sentence>();
StringBuffer cb
=new StringBuffer();
int d=reader.read();
int offset=0;
boolean b=false;
while(d>-1)
{
int type=Character.getType(d);
if(type==2|| type==9|| type==5)
{
d
=toAscii(d);
cb.append((
char)d);
}

else
{
b
=true;
}

d
=reader.read();
if(d==-1|| b)
{
if(d==-1) offset++;
b
=false;
char[] ioBuffer =newchar[cb.length()];
cb.getChars(
0, cb.length(), ioBuffer, 0);
Sentence sen
=new Sentence(ioBuffer,offset-cb.length());
list.add(sen);
cb.setLength(
0);
}

offset
++;
}

return list;
}


//将句子切分出词
public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
{
ArrayList
<Token> tokenlist=new ArrayList<Token>();
for(Sentence sen:list)
{
StringBuffer word
=new StringBuffer();
int offset=sen.getStartOffset();
int bufferIndex =0;
char c;
boolean b=false;
while(bufferIndex<sen.getText().length)
{
offset
++;
c
=sen.getText()[bufferIndex++];
if(word.length()==0)
word.append(c);
else
{
String temp
= (word.toString() + c).intern();
if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
word.append(c);
elseif(dictionary.containsKey(temp) && bufferIndex<sen.getText().length)
word.append(c);
else
{
bufferIndex
--;
offset
--;
while(word.length()>1&& dictionary.get(word.toString())!=null&& dictionary.get(word.toString())==2)
{
word.deleteCharAt(word.length()
-1);
bufferIndex
--;
offset
--;
}

b
=true;
}

}

if(b || bufferIndex==sen.getText().length)
{
Token token
=new Token(word.toString(),offset-word.length(),offset,"word");
word.setLength(
0);
tokenlist.add(token);
b
=false;
}

}

}

return tokenlist;
}


//将相连的单个英文或数字组合成词
public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
{
ArrayList
<Token> tokenlist=new ArrayList<Token>();
Token word
=null;
for(int i=0;i<list.size();i++)
{
Token t
=list.get(i);
if(t.getWord().length()==1&& Character.getType((int)t.getWord().charAt(0))!=5)
{
if(word==null)
word
=t;
elseif(word.getEnd()==t.getStart())
{
word.setEnd(t.getEnd());
word.setWord(word.getWord()
+t.getWord());
}

else
{
tokenlist.add(word);
word
=t;
}

}

elseif(word!=null)
{
tokenlist.add(word);
word
=null;
tokenlist.add(t);
}

else
tokenlist.add(t);
}

if(word!=null)
tokenlist.add(word);
return tokenlist;
}


//双角转单角
publicstaticint toAscii(int codePoint)
{
if((codePoint>=65296&& codePoint<=65305) //0-9
|| (codePoint>=65313&& codePoint<=65338) //A-Z
|| (codePoint>=65345&& codePoint<=65370) //a-z
)
{
codePoint
-=65248;
}

return codePoint;
}


//加载词典
publicstaticvoid loadDictionary()
{
if (dictionary ==null)
{
dictionary
=new HashMap<String, Integer>();
InputStream is
=null;
BufferedReader br
=null;
try
{
is
=new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
br
=new BufferedReader(new InputStreamReader(is, "UTF-8"));
String word
=null;
while ((word = br.readLine()) !=null)
{
word
=word.toLowerCase();
if ((word.indexOf("#") ==-1) && (word.length() <= WORD_MAX_LENGTH))
{
dictionary.put(word.intern(),
1);
int i = word.length()-1;
while(i >=2)
{
String temp
= word.substring(0, i).intern();
if (!dictionary.containsKey(temp))
dictionary.put(temp,
2);
i
--;
}

}

}

}

catch (Exception e)
{
log.info(e);
}

finally
{
try
{
if(br!=null)
br.close();
if(is!=null)
is.close();
}

catch (IOException e)
{
log.info(e);
}

}

}

}


publicstatic String[] segWords(Reader input)
{
ArrayList
<String> list=new ArrayList<String>();
try
{
MM2 f
=new MM2(input);
ArrayList
<Token> tlist= f.getNewToken(f.getToken(f.getSentence()));
for(Token t:tlist)
{
list.add(t.getWord());
}

}

catch(IOException e)
{
log.info(e);
}

return (String[])list.toArray(new String[0]);
}


publicstaticvoid main(String[] args)
{
String[] cc
=MM2.segWords(new StringReader("ibm商务机t60p".toLowerCase()));
for(String c:cc)
{
System.out.println(c);
}

}

}
posted @ 2011-11-11 16:46  爱开卷360  阅读(2541)  评论(0)  编辑  收藏  举报