• 博客园logo
  • 会员
  • 众包
  • 新闻
  • 博问
  • 闪存
  • 赞助商
  • HarmonyOS
  • Chat2DB
    • 搜索
      所有博客
    • 搜索
      当前博客
  • 写随笔 我的博客 短消息 简洁模式
    用户头像
    我的博客 我的园子 账号设置 会员中心 简洁模式 ... 退出登录
    注册 登录
思想人生从关注生活开始
博客园    首页    新随笔    联系   管理    订阅  订阅

中文分词代码(此代码为作者多年经验总结,以前发表过VB,PB版本)

/*
 * created by yzh 2004.5.12
 * 请大家引用时保留这段作者声明,此代码为开源代码;使用不受限制。
 * 中文分词代码
 *此代码为作者多年经验总结,以前发表过VB,PB版本
*/

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Locale;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * Lexicon-driven segmenter that inserts a separator between the words of a
 * line of Chinese text.
 *
 * <p>The lexicon and the number / foreign-name character sets are read from
 * class-path resources when an instance is created. Instances are obtained
 * through {@link #getGBSegmenter()} (simplified characters) or
 * {@link #getBig5Segmenter()} (traditional characters); each is created on
 * first use and cached until {@link #reset()} is called.
 *
 * <p>A resource that cannot be found or read is reported on
 * {@code System.err} and skipped, so the segmenter still works (with a
 * smaller or empty lexicon) instead of failing during construction.
 */
public class ChineseSegmenter {

   // One cached instance per character form. A single shared slot would make
   // getBig5Segmenter() return the simplified instance (or the reverse)
   // whenever the other getter had been called first.
   private static ChineseSegmenter simpSegmenter = null;

   private static ChineseSegmenter tradSegmenter = null;

   // Lexicon value: the key is a complete word.
   private static final String WORD = "1";

   // Lexicon value: the key is only a leading part of a longer word.
   private static final String PREFIX = "2";

   // Maps a word or word prefix (String) to WORD or PREFIX.
   private TreeMap zhwords;

   // One-character strings used in foreign names / in numbers.
   private TreeSet cforeign, cnumbers;

   // Char form
   public final static int TRAD = 0;

   public final static int SIMP = 1;

   public final static int BOTH = 2;

   /**
    * Builds a segmenter for the given character form.
    *
    * @param charform     TRAD, SIMP or BOTH; any other value is treated as
    *                     BOTH
    * @param loadwordfile when false the lexicon is left empty and only the
    *                     number / foreign character sets are loaded
    */
   private ChineseSegmenter(int charform, boolean loadwordfile) {

      cforeign = new TreeSet();
      cnumbers = new TreeSet();

      if (charform == SIMP) {
         loadset(cnumbers, "data/snumbers_u8.txt");
         loadset(cforeign, "data/sforeign_u8.txt");
      } else if (charform == TRAD) {
         loadset(cnumbers, "data/tnumbers_u8.txt");
         loadset(cforeign, "data/tforeign_u8.txt");
      } else { // BOTH
         loadset(cnumbers, "data/snumbers_u8.txt");
         loadset(cforeign, "data/sforeign_u8.txt");
         loadset(cnumbers, "data/tnumbers_u8.txt");
         loadset(cforeign, "data/tforeign_u8.txt");
      }

      zhwords = new TreeMap();

      if (!loadwordfile) {
         return;
      }

      String lexiconfile;
      if (charform == SIMP) {
         lexiconfile = "simplexu8.txt";
      } else if (charform == TRAD) {
         lexiconfile = "tradlexu8.txt";
      } else { // BOTH, consistent with the character-set selection above
         lexiconfile = "bothlexu8.txt";
      }
      loadLexicon(lexiconfile);
   }

   /**
    * Reads the lexicon resource into {@code zhwords}.
    *
    * <p>Lines containing '#', empty lines and lines of five or more
    * characters are ignored. Every accepted line is stored as a complete
    * word; each of its leading parts of length two or more is stored as a
    * prefix unless that key is already present.
    */
   private void loadLexicon(String sourcefile) {
      InputStream worddata = getClass().getResourceAsStream(sourcefile);
      if (worddata == null) {
         // getResourceAsStream signals a missing resource with null.
         System.err.println("Lexicon file not found: " + sourcefile);
         return;
      }

      BufferedReader in = null;
      try {
         in = new BufferedReader(new InputStreamReader(worddata, "UTF8"));
         String newword;
         while ((newword = in.readLine()) != null) {
            int length = newword.length();
            if (length == 0 || length >= 5 || newword.indexOf("#") != -1) {
               continue;
            }

            // A complete word always wins over an earlier prefix marker.
            zhwords.put(newword, WORD);

            for (int end = 2; end < length; end++) {
               String head = newword.substring(0, end);
               if (!zhwords.containsKey(head)) {
                  zhwords.put(head, PREFIX);
               }
            }
         }
      } catch (IOException e) {
         e.printStackTrace();
      } finally {
         closeQuietly(in, worddata);
      }
   }

   /**
    * Closes the reader, or the raw stream when the reader was never created.
    * A failure while closing is ignored because nothing can be done about it.
    */
   private static void closeQuietly(BufferedReader reader, InputStream stream) {
      try {
         if (reader != null) {
            reader.close();
         } else if (stream != null) {
            stream.close();
         }
      } catch (IOException ignored) {
         // Best effort only.
      }
   }

   /** Drops the cached segmenters so the next getter call rebuilds them. */
   public synchronized static void reset() {
      simpSegmenter = null;
      tradSegmenter = null;
   }

   /**
    * Returns the cached simplified-character segmenter, creating it on first
    * use. Note that every call also sets the JVM default locale to
    * {@code Locale.SIMPLIFIED_CHINESE}.
    */
   public synchronized static ChineseSegmenter getGBSegmenter() {
      Locale.setDefault(Locale.SIMPLIFIED_CHINESE);
      if (simpSegmenter == null) {
         simpSegmenter = new ChineseSegmenter(ChineseSegmenter.SIMP, true);
      }
      return simpSegmenter;
   }

   /**
    * Returns the cached traditional-character segmenter, creating it on
    * first use. Note that every call also sets the JVM default locale to
    * {@code Locale.TRADITIONAL_CHINESE}.
    */
   public synchronized static ChineseSegmenter getBig5Segmenter() {
      Locale.setDefault(Locale.TRADITIONAL_CHINESE);
      if (tradSegmenter == null) {
         tradSegmenter = new ChineseSegmenter(ChineseSegmenter.TRAD, true);
      }
      return tradSegmenter;
   }

   /**
    * Adds every non-empty line of the resource that does not contain '#' to
    * {@code targetset}. Problems are reported on {@code System.err} and
    * leave the set with whatever was read so far.
    */
   private void loadset(TreeSet targetset, String sourcefile) {
      InputStream setdata = getClass().getResourceAsStream(sourcefile);
      if (setdata == null) {
         System.err.println("Data file not found: " + sourcefile);
         return;
      }

      BufferedReader in = null;
      try {
         in = new BufferedReader(new InputStreamReader(setdata, "UTF-8"));
         String dataline;
         while ((dataline = in.readLine()) != null) {
            if (dataline.length() == 0 || dataline.indexOf("#") > -1) {
               continue;
            }
            targetset.add(dataline);
         }
      } catch (IOException e) {
         System.err.println("Exception loading data file " + sourcefile
               + " " + e);
         e.printStackTrace();
      } finally {
         closeQuietly(in, setdata);
      }
   }

   /**
    * Tells whether every character of {@code text}, taken as a one-character
    * string, is a member of {@code charset}. True for the empty string.
    */
   private static boolean allCharsIn(TreeSet charset, String text) {
      for (int i = 0; i < text.length(); i++) {
         if (!charset.contains(text.substring(i, i + 1))) {
            return false;
         }
      }
      return true;
   }

   /**
    * @return true when every character of {@code testword} is in the loaded
    *         number character set (also true for the empty string)
    */
   public boolean isNumber(String testword) {
      return allCharsIn(cnumbers, testword);
   }

   /**
    * @return true when every character of {@code testword} is in the loaded
    *         foreign-name character set (also true for the empty string)
    */
   public boolean isAllForeign(String testword) {
      return allCharsIn(cforeign, testword);
   }

   /**
    * @return true when no character of {@code testword} lies in the
    *         CJK Unified Ideographs Unicode block
    */
   public boolean isNotCJK(String testword) {
      for (int i = 0; i < testword.length(); i++) {
         if (Character.UnicodeBlock.of(testword.charAt(i)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
            return false;
         }
      }
      return true;
   }

   /**
    * Segments one line of text.
    *
    * <p>Characters that are neither CJK ideographs nor number characters are
    * copied through unchanged. Runs of CJK / number characters are grown one
    * character at a time; the current word is extended when, in this order,
    * the extended string is a lexicon word, it continues a foreign-name
    * transliteration, it continues a run of number characters, or it is a
    * lexicon prefix that becomes a known entry with the following character.
    * Otherwise the current word is emitted and a new one is started.
    *
    * @param cline     the text to segment
    * @param separator text inserted between adjacent words; it is not added
    *                  where the neighbouring character is whitespace
    * @return the line with separators inserted
    */
   public String segmentLine(String cline, String separator) {
      StringBuffer currentword = new StringBuffer();
      StringBuffer outline = new StringBuffer();
      int clength = cline.length();

      for (int i = 0; i < clength; i++) {
         char currentchar = cline.charAt(i);
         String currentstr = String.valueOf(currentchar);

         boolean segmentable = Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
               || isNumber(currentstr);

         if (!segmentable) {
            // Ordinary character: flush the pending word, then copy it.
            if (currentword.length() > 0) {
               outline.append(currentword.toString());
               if (!Character.isWhitespace(currentchar)) {
                  outline.append(separator);
               }
               currentword.setLength(0);
            }
            outline.append(currentchar);
            continue;
         }

         if (currentword.length() == 0) {
            // First character of a new word; separate it from preceding
            // non-whitespace text.
            if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1))) {
               outline.append(separator);
            }
            currentword.append(currentchar);
            continue;
         }

         String pending = currentword.toString();
         String candidate = pending + currentchar;
         // null when the candidate is not in the lexicon at all.
         Object tag = zhwords.get(candidate);

         boolean extend;
         if (WORD.equals(tag)) {
            // Word is in the lexicon.
            extend = true;
         } else if (isAllForeign(pending)
               && cforeign.contains(currentstr)
               && i + 2 < clength
               && !zhwords.containsKey(cline.substring(i, i + 2))) {
            // Possibly a transliteration of a foreign name.
            extend = true;
         } else if (isNumber(pending) && cnumbers.contains(currentstr)) {
            // Keep all consecutive number characters together.
            extend = true;
         } else if (PREFIX.equals(tag)
               && i + 1 < clength
               && zhwords.containsKey(candidate + cline.charAt(i + 1))) {
            // Starts a longer word in the lexicon.
            extend = true;
         } else {
            extend = false;
         }

         if (extend) {
            currentword.append(currentchar);
         } else {
            // Emit the finished word and start anew with this character.
            outline.append(pending);
            if (!Character.isWhitespace(currentchar)) {
               outline.append(separator);
            }
            currentword.setLength(0);
            currentword.append(currentchar);
         }
      }

      outline.append(currentword.toString());

      return outline.toString();
   }

   /** Small demonstration: segments a sample line and prints the result. */
   public static void main(String[] args) throws Exception {

      ChineseSegmenter seg = ChineseSegmenter.getGBSegmenter();
      System.out.println(seg.segmentLine("Some string in chinese.", " "));

   }

}

 

posted @ 2006-03-24 18:18  JackYang  阅读(1298)  评论(0)    收藏  举报
刷新页面返回顶部
博客园  ©  2004-2025
浙公网安备 33010602011771号 浙ICP备2021040463号-3