Posted on 2008-02-01 09:31
脚印 阅读(1794)
评论(11) 编辑 收藏 网摘 所属分类:
Asp.net
我们在做文本检查时经常需要查找其中的数字,但文本中的数字可能是中文数字字符串(例如“两亿三千万”),必须先转化为阿拉伯数字才能被程序识别。本文提供了一种将中文或英文数字字符串转化为阿拉伯数字的方法。
之所以只以中文和英文为例,是因为我只懂这两种语言;从程序设计的角度来看,该方法适用于多种语种,只要将数据库配置好即可。
中文转化为阿拉伯数字: 可以通用于不要分隔符区分单词的语种 ,例如 中文、日文、韩文等(配置相应数据库即可)
英文转化为阿拉伯数字: 可以通用于用空格分隔单词的语种,例如 英文、法文、德文等(配置相应的数据库即可)
1、 设计语种对应表
|
t_SYS_Num
|
|
fid
|
ftext
|
fvalue
|
ftype
|
|
1
|
零
|
0
|
1
|
|
2
|
一
|
1
|
1
|
|
3
|
二
|
2
|
1
|
|
4
|
三
|
3
|
1
|
|
5
|
四
|
4
|
1
|
|
6
|
五
|
5
|
1
|
|
7
|
六
|
6
|
1
|
|
8
|
七
|
7
|
1
|
|
9
|
八
|
8
|
1
|
|
10
|
九
|
9
|
1
|
|
11
|
壹
|
1
|
1
|
|
12
|
贰
|
2
|
1
|
|
13
|
叁
|
3
|
1
|
|
14
|
肆
|
4
|
1
|
|
15
|
伍
|
5
|
1
|
|
16
|
陆
|
6
|
1
|
|
17
|
柒
|
7
|
1
|
|
18
|
捌
|
8
|
1
|
|
19
|
玖
|
9
|
1
|
|
22
|
one
|
1
|
0
|
|
23
|
two
|
2
|
0
|
|
24
|
three
|
3
|
0
|
|
25
|
four
|
4
|
0
|
|
26
|
five
|
5
|
0
|
|
27
|
six
|
6
|
0
|
|
28
|
seven
|
7
|
0
|
|
29
|
eight
|
8
|
0
|
|
30
|
nine
|
9
|
0
|
|
31
|
ten
|
10
|
0
|
|
32
|
eleven
|
11
|
0
|
|
33
|
twelve
|
12
|
0
|
|
34
|
thirteen
|
13
|
0
|
|
35
|
fourteen
|
14
|
0
|
|
36
|
fifteen
|
15
|
0
|
|
37
|
sixteen
|
16
|
0
|
|
38
|
seventeen
|
17
|
0
|
|
39
|
eighteen
|
18
|
0
|
|
40
|
nineteen
|
19
|
0
|
|
41
|
twenty
|
20
|
0
|
|
42
|
twenty-one
|
21
|
0
|
|
43
|
twenty-two
|
22
|
0
|
|
44
|
twenty-three
|
23
|
0
|
|
45
|
twenty-four
|
24
|
0
|
|
46
|
twenty-five
|
25
|
0
|
|
47
|
twenty-six
|
26
|
0
|
|
48
|
twenty-seven
|
27
|
0
|
|
49
|
twenty-eight
|
28
|
0
|
|
50
|
twenty-nine
|
29
|
0
|
|
51
|
thirty
|
30
|
0
|
|
52
|
thirty-one
|
31
|
0
|
|
53
|
thirty-two
|
32
|
0
|
|
54
|
thirty-three
|
33
|
0
|
|
55
|
thirty-four
|
34
|
0
|
|
56
|
thirty-five
|
35
|
0
|
|
57
|
thirty-six
|
36
|
0
|
|
58
|
thirty-seven
|
37
|
0
|
|
59
|
thirty-eight
|
38
|
0
|
|
60
|
thirty-nine
|
39
|
0
|
|
61
|
forty
|
40
|
0
|
|
62
|
forty-one
|
41
|
0
|
|
63
|
forty-two
|
42
|
0
|
|
64
|
forty-three
|
43
|
0
|
|
65
|
forty-four
|
44
|
0
|
|
66
|
forty-five
|
45
|
0
|
|
67
|
forty-six
|
46
|
0
|
|
68
|
forty-seven
|
47
|
0
|
|
69
|
forty-eight
|
48
|
0
|
|
70
|
forty-nine
|
49
|
0
|
|
71
|
fifty
|
50
|
0
|
|
72
|
fifty-one
|
51
|
0
|
|
73
|
fifty-two
|
52
|
0
|
|
74
|
fifty-three
|
53
|
0
|
|
75
|
fifty-four
|
54
|
0
|
|
76
|
fifty-five
|
55
|
0
|
|
77
|
fifty-six
|
56
|
0
|
|
78
|
fifty-seven
|
57
|
0
|
|
79
|
fifty-eight
|
58
|
0
|
|
80
|
fifty-nine
|
59
|
0
|
|
81
|
sixty
|
60
|
0
|
|
82
|
sixty-one
|
61
|
0
|
|
83
|
sixty-two
|
62
|
0
|
|
84
|
sixty-three
|
63
|
0
|
|
85
|
sixty-four
|
64
|
0
|
|
86
|
sixty-five
|
65
|
0
|
|
87
|
sixty-six
|
66
|
0
|
|
88
|
sixty-seven
|
67
|
0
|
|
89
|
sixty-eight
|
68
|
0
|
|
90
|
sixty-nine
|
69
|
0
|
|
91
|
seventy
|
70
|
0
|
|
92
|
seventy-one
|
71
|
0
|
|
93
|
seventy-two
|
72
|
0
|
|
94
|
seventy-three
|
73
|
0
|
|
95
|
seventy-four
|
74
|
0
|
|
96
|
seventy-five
|
75
|
0
|
|
97
|
seventy-six
|
76
|
0
|
|
98
|
seventy-seven
|
77
|
0
|
|
99
|
seventy-eight
|
78
|
0
|
|
100
|
seventy-nine
|
79
|
0
|
|
101
|
eighty
|
80
|
0
|
|
102
|
eighty-one
|
81
|
0
|
|
103
|
eighty-two
|
82
|
0
|
|
104
|
eighty-three
|
83
|
0
|
|
105
|
eighty-four
|
84
|
0
|
|
106
|
eighty-five
|
85
|
0
|
|
107
|
eighty-six
|
86
|
0
|
|
108
|
eighty-seven
|
87
|
0
|
|
109
|
eighty-eight
|
88
|
0
|
|
110
|
eighty-nine
|
89
|
0
|
|
111
|
ninety
|
90
|
0
|
|
112
|
ninety-one
|
91
|
0
|
|
113
|
ninety-two
|
92
|
0
|
|
114
|
ninety-three
|
93
|
0
|
|
115
|
ninety-four
|
94
|
0
|
|
116
|
ninety-five
|
95
|
0
|
|
117
|
ninety-six
|
96
|
0
|
|
118
|
ninety-seven
|
97
|
0
|
|
119
|
ninety-eight
|
98
|
0
|
|
120
|
ninety-nine
|
99
|
0
|
|
121
|
十
|
10
|
1
|
|
122
|
拾
|
10
|
1
|
fId:主键序号
fText:语种字符串
fValue:对应值
fType:语种分类
|
T_SYS_Unit
|
|
fid
|
funit
|
fvalue
|
flevel
|
fisspace
|
ftype
|
|
1
|
十
|
10
|
1
|
No
|
1
|
|
2
|
百
|
100
|
2
|
No
|
1
|
|
3
|
千
|
1000
|
3
|
No
|
1
|
|
4
|
万
|
10000
|
4
|
Yes
|
1
|
|
5
|
亿
|
100000000
|
5
|
Yes
|
1
|
|
6
|
拾
|
10
|
1
|
No
|
1
|
|
7
|
佰
|
100
|
2
|
No
|
1
|
|
8
|
仟
|
1000
|
3
|
No
|
1
|
|
9
|
萬
|
10000
|
4
|
Yes
|
1
|
|
10
|
億
|
100000000
|
5
|
Yes
|
1
|
|
11
|
hundred
|
100
|
1
|
No
|
0
|
|
12
|
thousand
|
1000
|
2
|
No
|
0
|
|
13
|
million
|
1000000
|
3
|
Yes
|
0
|
|
14
|
billion
|
1000000000
|
4
|
Yes
|
0
|
|
15
|
and
|
1
|
0
|
No
|
0
|
|
16
|
hundreds
|
100
|
1
|
No
|
0
|
|
17
|
thousands
|
1000
|
2
|
No
|
0
|
|
18
|
millions
|
1000000
|
3
|
Yes
|
0
|
|
19
|
billions
|
1000000000
|
4
|
Yes
|
0
|
fId:主键序号
fUnit:单位
fValue:对应的倍数
fLevel:单位排序级别
fIsspace:是否作为分隔符
fType:语种分类
2、 搜索符合原文中的数字字符串
a) 在文本中查找由表t_SYS_Num 的ftext 和表t_SYS_Unit 的funit 连续组成的最长字符串
3、 用递归法分割数字字符串
a) 首先将按照fLevel最大的text进行分割
i. 例如中文: 以“億”、“亿” 进行分割
ii. “三千二百万零二十一亿零五千” ——被分离为 “三千二百万零二十一” 和“五千”
iii. 然后按照下一级的单位进行分离,直到所有的fisspace为True的分离完毕
4、 将万以内的数字字符串转化为数字
5、 将转化得到的数字乘以此前分离出来的单位所对应的倍数(fValue)
6、 将所有的分离开的数字加起来得到最后的数值
代码实现:

/// <summary>
/// Describes one number phrase located in the scanned text, together with
/// its position and its computed numeric value.
/// </summary>
public struct NumString

{
// Index of the entry of the scanned string array in which the phrase was found.
public int Paragraph;
// Zero-based character offset of the phrase inside that entry.
public int Start;
// Number of characters the phrase spans.
public int Length;
// The phrase text itself (the substring selected by Start and Length).
public string numstring;
// Numeric value computed for the phrase.
public long Num;

}


/// <summary>
/// Languages handled by the number-phrase converter. The numeric value of
/// each member is what the SQL in this file compares against the ftype column.
/// </summary>
public enum LanguageType

{
// Selects rows with ftype = 0.
English = 0,
// Selects rows with ftype = 1.
Chinese = 1,
}
/// <summary>
/// Locates the number phrases in <paramref name="content"/> with the scanner
/// that belongs to the requested language.
/// </summary>
/// <param name="content">Text entries to scan.</param>
/// <param name="type">Language of the text.</param>
/// <returns>The list produced by the language-specific scanner.</returns>
private ArrayList GetNumString(string[] content, LanguageType type)
{
    if (type == LanguageType.Chinese)
    {
        return GetNumString_CH(content);
    }

    // English, and any value that is not a known member, use the English scanner.
    return GetNumString_EN(content);
}


/// <summary>
/// Finds English number phrases in every entry of <paramref name="content"/>.
/// A phrase starts at a word listed in t_SYS_Num (ftype 0) and extends over the
/// following words for as long as each of them is listed in t_SYS_Num or
/// t_SYS_Unit (ftype 0). Words are separated by single spaces.
/// </summary>
/// <param name="content">Text entries to scan; null or empty entries are skipped.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="NumString"/> values, one per phrase,
/// in the order in which they were found.
/// </returns>
private ArrayList GetNumString_EN(string[] content)
{
    string sql = "select * from t_SYS_Num where ftype =0";
    DataSet ds = DCBase.ExecuteQuery(sql);
    sql = "select * from t_SYS_Unit where ftype =0";
    DataSet ds1 = DCBase.ExecuteQuery(sql);
    DataView dv1 = ds.Tables[0].DefaultView;
    DataView dv2 = ds1.Tables[0].DefaultView;
    ArrayList nubmstring = new ArrayList();

    for (int i = 0; i < content.Length; i++)
    {
        if (string.IsNullOrEmpty(content[i]))
        {
            continue;
        }

        string[] words = content[i].Split(' ');
        // Character offset of words[j] inside content[i].
        int start = 0;

        for (int j = 0; j < words.Length; j++)
        {
            // A single quote inside the word would terminate the filter literal
            // and make the expression invalid, so it is doubled before use.
            dv1.RowFilter = "ftext='" + words[j].Replace("'", "''") + "'";
            if (dv1.Count == 0)
            {
                // Not a number word: step over it and the space that follows.
                start += words[j].Length + 1;
                continue;
            }

            int length = words[j].Length;
            // Count of additional words absorbed into the phrase.
            int step = 0;
            for (int n = j + 1; n < words.Length; n++)
            {
                string escaped = words[n].Replace("'", "''");
                dv1.RowFilter = "ftext='" + escaped + "'";
                dv2.RowFilter = "funit='" + escaped + "'";
                if (dv1.Count == 0 && dv2.Count == 0)
                {
                    break;
                }

                // The extra 1 accounts for the space in front of the word.
                length += words[n].Length + 1;
                step++;
            }

            if (length > 0)
            {
                NumString ns = new NumString();
                ns.Length = length;
                ns.Start = start;
                ns.Paragraph = i;
                ns.numstring = content[i].Substring(start, length);
                ns.Num = Splite(ns.numstring, LanguageType.English);
                nubmstring.Add(ns);

                // Resume after the last word of the phrase.
                j += step;
                start += length + 1;
            }
            else
            {
                start += words[j].Length + 1;
            }
        }
    }

    return nubmstring;
}


/// <summary>
/// Finds Chinese number phrases in every entry of <paramref name="content"/>.
/// A phrase starts at a character listed in t_SYS_Num (ftype 1) and extends over
/// the following characters for as long as each of them is listed in t_SYS_Num
/// or t_SYS_Unit (ftype 1).
/// </summary>
/// <param name="content">Text entries to scan; null or empty entries are skipped.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="NumString"/> values, one per phrase,
/// in the order in which they were found.
/// </returns>
private ArrayList GetNumString_CH(string[] content)
{
    string sql = "select * from t_SYS_Num where ftype =1";
    DataSet ds = DCBase.ExecuteQuery(sql);
    sql = "select * from t_SYS_Unit where ftype =1";
    DataSet ds1 = DCBase.ExecuteQuery(sql);
    DataView dv1 = ds.Tables[0].DefaultView;
    DataView dv2 = ds1.Tables[0].DefaultView;
    ArrayList nubmstring = new ArrayList();

    for (int i = 0; i < content.Length; i++)
    {
        if (string.IsNullOrEmpty(content[i]))
        {
            continue;
        }

        string content_str = content[i];
        for (int j = 0; j < content_str.Length; j++)
        {
            // A single-quote character would terminate the filter literal and
            // make the expression invalid, so it is doubled before use.
            dv1.RowFilter = "ftext='" + content_str[j].ToString().Replace("'", "''") + "'";
            if (dv1.Count == 0)
            {
                continue;
            }

            int length = 1;
            for (int n = j + 1; n < content_str.Length; n++)
            {
                string escaped = content_str[n].ToString().Replace("'", "''");
                dv1.RowFilter = "ftext='" + escaped + "'";
                dv2.RowFilter = "funit='" + escaped + "'";
                if (dv1.Count == 0 && dv2.Count == 0)
                {
                    break;
                }

                length++;
            }

            NumString ns = new NumString();
            ns.Length = length;
            ns.Start = j;
            ns.Paragraph = i;
            ns.numstring = content_str.Substring(j, length);
            ns.Num = Splite(ns.numstring, LanguageType.Chinese);
            nubmstring.Add(ns);

            // Together with the loop increment this also steps over the character
            // that ended the phrase; that character matched neither table, so it
            // cannot start a new phrase.
            j += length;
        }
    }

    return nubmstring;
}

/// <summary>
/// Converts a list of Chinese number phrases into their numeric values.
/// </summary>
/// <param name="number_CH">List whose elements are <see cref="NumString"/> values.</param>
/// <returns>
/// One value per list element, in list order, computed from each element's
/// <c>numstring</c> text.
/// </returns>
private long[] CH2A(ArrayList number_CH)
{
    long[] values = new long[number_CH.Count];
    int slot = 0;

    foreach (object item in number_CH)
    {
        NumString phrase = (NumString)item;
        values[slot] = Splite(phrase.numstring, LanguageType.Chinese);
        slot++;
    }

    return values;
}


/// <summary>
/// Converts a Chinese or English number phrase to its numeric value, starting
/// the decomposition at the highest separator level configured for the language
/// (the largest flevel among t_SYS_Unit rows with fisspace = true).
/// </summary>
/// <param name="numstring">The number phrase to convert.</param>
/// <param name="type">Language of the phrase.</param>
/// <returns>The numeric value of the phrase.</returns>
private long Splite(string numstring, LanguageType type)
{
    string sql = "select max(flevel) from t_SYS_Unit where fisspace=true and ftype=" + (int)type;
    object obj = DCBase.ExecuteScalar(sql);

    // Level 0 is used when no separator unit is configured for the language.
    int maxlevel = 0;
    if (obj != null && obj != DBNull.Value)
    {
        // Convert rather than unbox: the boxed type of an aggregate depends on
        // the data provider and column type, and unboxing anything other than
        // an Int32 with (int) throws InvalidCastException.
        maxlevel = Convert.ToInt32(obj);
    }

    return Splite(numstring, type, maxlevel);
}


/// <summary>
/// Recursively decomposes a number phrase at separator units (t_SYS_Unit rows
/// with fisspace = true). The phrase is split at a unit of the highest separator
/// level not above <paramref name="maxLevel"/>; the part before the unit is
/// converted and multiplied by the unit's fvalue, and the part after it is
/// converted and added. When no separator level remains, the phrase is handed
/// to Tran2A.
/// </summary>
/// <param name="numstring">The number phrase (or a piece of one) to convert.</param>
/// <param name="type">Language of the phrase.</param>
/// <param name="maxLevel">Highest separator level that may be used for splitting.</param>
/// <returns>The numeric value of the phrase.</returns>
private long Splite(string numstring, LanguageType type, int maxLevel)
{
    // Trim before any branch so that pieces produced by an earlier split
    // (for example " five" from "two million five") reach Tran2A without the
    // surrounding spaces. NOTE(review): Tran2A is defined outside this block.
    numstring = numstring == null ? string.Empty : numstring.Trim();

    string sql = "select flevel from t_SYS_Unit where fisspace=true and ftype =" + (int)type + " and flevel<=" + maxLevel + " order by flevel desc";
    object obj = DCBase.ExecuteScalar(sql);
    if (obj == null || obj == DBNull.Value)
    {
        // No separator level left at or below maxLevel.
        return Tran2A(numstring, type);
    }

    // Convert rather than unbox so any integral column type is accepted.
    int splitLevel = Convert.ToInt32(obj);
    sql = "select * from t_SYS_Unit where fisspace=true and ftype =" + (int)type + " and flevel=" + splitLevel + " order by flevel desc";
    DataSet ds = DCBase.ExecuteQuery(sql);
    DataRowCollection units = ds.Tables[0].Rows;
    if (units.Count == 0)
    {
        return Tran2A(numstring, type);
    }

    // Culture-invariant lower-casing keeps the match independent of the current
    // culture (e.g. the Turkish dotless i).
    string lowered = numstring.ToLowerInvariant();
    foreach (DataRow unit in units)
    {
        string unitText = unit["funit"].ToString();
        if (unitText.Length == 0 || lowered.IndexOf(unitText, StringComparison.Ordinal) < 0)
        {
            continue;
        }

        // Split on the unit text itself instead of substituting a sentinel character.
        string[] parts = lowered.Split(new string[] { unitText }, StringSplitOptions.None);

        // Use the multiplier of the unit that actually matched, read as a long
        // so that multipliers beyond the Int32 range are accepted.
        long multiplier = Convert.ToInt64(unit["fvalue"]);
        return Splite(parts[0], type, maxLevel - 1) * multiplier + Splite(parts[1], type, maxLevel - 1);
    }

    // No unit of this level occurs in the phrase: try the next lower level.
    return Splite(numstring, type, maxLevel - 1);
}

/// <summary>
/// Converts an English number phrase that contains no separator unit.
/// For every non-separator unit (t_SYS_Unit, fisspace = false, ftype 0, highest
/// level first) the word directly in front of the unit's first occurrence is
/// looked up in t_SYS_Num and multiplied by the unit's fvalue. The word that
/// follows the last unit found, if any, is added as the remainder.
/// </summary>
/// <remarks>
/// Only one word in front of each unit is considered, so phrases such as
/// "three hundred twenty thousand" are not fully supported.
/// </remarks>
/// <param name="numstring">Space-separated number words.</param>
/// <returns>The numeric value of the phrase; 0 when nothing is recognised.</returns>
private int EN2A(string numstring)
{
    string sql = "select * from t_SYS_Num where ftype =0";
    DataSet ds = DCBase.ExecuteQuery(sql);
    sql = "select * from t_SYS_Unit where fisspace=false and ftype =0 order by flevel desc";
    DataSet ds1 = DCBase.ExecuteQuery(sql);
    DataTable numbers = ds.Tables[0];
    DataTable units = ds1.Tables[0];

    // Drop empty tokens: a leading or doubled space would otherwise shift the
    // word positions and make " five" evaluate to 0.
    string[] words = (numstring ?? string.Empty).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    int result = 0;
    // Position of the most recently matched unit word; -1 when none matched.
    int lastUnitIndex = -1;

    foreach (DataRow unit in units.Rows)
    {
        string unitText = unit["funit"].ToString();
        int index = -1;
        for (int i = 0; i < words.Length; i++)
        {
            if (string.Equals(words[i], unitText, StringComparison.OrdinalIgnoreCase))
            {
                index = i;
                break;
            }
        }

        // A unit at position 0 has no multiplier word in front of it.
        if (index <= 0)
        {
            continue;
        }

        lastUnitIndex = index;
        result += EN2A_LookupNumber(numbers, words[index - 1]) * Convert.ToInt32(unit["fvalue"]);
    }

    // Remainder: the word right after the last unit (or the first word when
    // the phrase contains no unit at all).
    int remainder = 0;
    if (lastUnitIndex != words.Length - 1)
    {
        remainder = EN2A_LookupNumber(numbers, words[lastUnitIndex + 1]);
    }

    return result + remainder;
}

/// <summary>
/// Returns the fvalue of the t_SYS_Num row whose ftext equals <paramref name="word"/>,
/// ignoring case, or 0 when no row matches.
/// </summary>
private static int EN2A_LookupNumber(DataTable numbers, string word)
{
    foreach (DataRow row in numbers.Rows)
    {
        if (string.Equals(word, row["ftext"].ToString(), StringComparison.OrdinalIgnoreCase))
        {
            return Convert.ToInt32(row["fvalue"]);
        }
    }

    return 0;
}

/// <summary>
/// Converts a Chinese number phrase that contains no separator unit.
/// For every non-separator unit (t_SYS_Unit, fisspace = false, ftype 1, highest
/// level first) the character directly in front of the unit's first occurrence
/// is looked up in t_SYS_Num and multiplied by the unit's fvalue. The digits
/// that follow the last unit found are then added as the remainder.
/// </summary>
/// <remarks>
/// The remainder is read as a run of digits, so a filler zero is skipped
/// ("一百零五" gives 105) and a phrase without any unit is read positionally
/// ("二零零八" gives 2008).
/// </remarks>
/// <param name="numstring">The Chinese number phrase.</param>
/// <returns>The numeric value of the phrase; 0 when nothing is recognised.</returns>
private long CH2A(string numstring)
{
    string sql = "select * from t_SYS_Num where ftype =1";
    DataSet ds = DCBase.ExecuteQuery(sql);
    sql = "select * from t_SYS_Unit where fisspace=false and ftype =1 order by flevel desc";
    DataSet ds1 = DCBase.ExecuteQuery(sql);
    DataTable digits = ds.Tables[0];
    DataTable units = ds1.Tables[0];

    if (numstring == null)
    {
        numstring = string.Empty;
    }

    long result = 0;
    // Position of the most recently matched unit; -1 when none matched.
    int maxindex = -1;

    foreach (DataRow dr in units.Rows)
    {
        string unitText = dr["funit"].ToString();
        if (unitText.Length == 0)
        {
            continue;
        }

        int index = numstring.IndexOf(unitText, StringComparison.Ordinal);

        // A phrase that starts with a unit has an implied leading "一",
        // e.g. "十四" is treated as "一十四".
        if (index == 0)
        {
            numstring = "一" + numstring;
            if (maxindex >= 0)
            {
                // Everything moved one position to the right.
                maxindex++;
            }
            index = numstring.IndexOf(unitText, StringComparison.Ordinal);
        }

        if (index <= 0)
        {
            continue;
        }

        maxindex = index;
        long multiplier = CH2A_LookupDigit(digits, numstring[index - 1]);
        if (multiplier > 0)
        {
            result += multiplier * Convert.ToInt64(dr["fvalue"]);
        }
    }

    // Remainder after the last unit. Reading only the first character here
    // would stop at the filler "零" in phrases such as "一百零五".
    long remainder = 0;
    for (int pos = maxindex + 1; pos < numstring.Length; pos++)
    {
        long digit = CH2A_LookupDigit(digits, numstring[pos]);
        if (digit < 0)
        {
            break;
        }

        remainder = remainder * 10 + digit;
    }

    return result + remainder;
}

/// <summary>
/// Returns the fvalue of the t_SYS_Num row whose ftext equals the given
/// character, or -1 when no row matches.
/// </summary>
private static long CH2A_LookupDigit(DataTable digits, char symbol)
{
    string text = symbol.ToString();
    foreach (DataRow row in digits.Rows)
    {
        if (text == row["ftext"].ToString())
        {
            return Convert.ToInt64(row["fvalue"]);
        }
    }

    return -1;
}