shell grep正则匹配汉字

Shell grep正则匹配中文

测试文本 demo_exe.c，内容如下，需要注意保存的编码格式，对输出到终端有影响：

我们中文操作系统的ANSI编码默认是GBK。

 1 #include<stdio.h>
 2 #include<stdlib.h>
 3 #include <string.h>
 4 #include <errno.h>
 5 #include <locale.h>
 6 #include <dlfcn.h>
 7 
 8 /* Demo: dlopen() a shared object and print the returned handle. Run with:
 9  * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp; /data/local/tmp/demo_exe
10  */
11 int main(int argc, char** argv) {
12 // 这个是中文
13     void *handle = NULL;
14     char* locname = setlocale(LC_ALL, "");  /* "" selects the locale from the environment */
15 // 这个是中文
16 //
17     if ((handle = dlopen(demo_dso_so, RTLD_NOW)) == NULL) {  /* NOTE(review): demo_dso_so is not defined in this listing; presumably the .so path string - confirm */
18         printf("dlopen出错: %s\n", dlerror());
19     }
20     printf("@%s[%s]dlopen return handle = %p.\n", __FILE__, __FUNCTION__, handle);  /* %p, not %#x: handle is a void * */
21 // 这个是
22 // 中文
23     return 0;
24 }

1、匹配特定文字：

$ grep -nP "\xE4\xB8\xAD\xE6\x96\x87|\xD6\xD0\xCE\xC4" ./demo_exe.c
12:// 这个是中文
15:// 这个是中文
22:// 中文

编码	中	文	在线码表
GBK	D6D0	CEC4	http://www.lhelper.org/tech/chinese_internal_code_specification_classified.txt
Unicode	4E2D	6587
UTF-8	%E4%B8%AD	%E6%96%87	http://wenku.baidu.com/link?url=DfbzjKLcRaQ7yVIA_EHVP7mKdVbkggq4hwkCmmO9uR76Jib_5Y1Y_h616NnI21XY_x85YZqN1SQBAdCFQjklS_

GBK码：中=D6D0，文=CEC4

Unicode码：中=4E2D，文=6587

UTF-8码：中=%E4%B8%AD，文=%E6%96%87

2、匹配特定范围文字

$ grep -nP "[\xB0\xA1-\xF7\xFE]+" /home/fangss/c/dynamic_share_object_test/demo_exe.c
12:// 这个是中文
15:// 这个是中文
18: printf("dlopen出错: %s\n", dlerror());
21:// 这个是
22:// 中文

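范围（对应下方码表：GB2312 汉字区首字节为 B0–F7，次字节为 A1–FE，即 B0A1–F7FE）。注意：按字节匹配时，方括号内的 \xB0、\xA1-\xF7、\xFE 是逐字节解析的，上式实际匹配的是落在 A1–F7 或等于 FE 的单个字节；若要严格按双字节匹配 GB2312 汉字，可写作 ([\xB0-\xF7][\xA1-\xFE])+ ：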

● GBK/2: GB2312 汉字

B0 ０ １ ２ ３ ４ ５ ６ ７ ８ ９ Ａ Ｂ Ｃ Ｄ Ｅ Ｆ
Ａ 　 啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘
Ｂ 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱
Ｃ 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋
Ｄ 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑
Ｅ 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮
Ｆ 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥

。。。

F7 ０ １ ２ ３ ４ ５ ６ ７ ８ ９ Ａ Ｂ Ｃ Ｄ Ｅ Ｆ
Ａ 　 鳌 鳍 鳎 鳏 鳐 鳓 鳔 鳕 鳗 鳘 鳙 鳜 鳝 鳟 鳢
Ｂ 靼 鞅 鞑 鞒 鞔 鞯 鞫 鞣 鞲 鞴 骱 骰 骷 鹘 骶 骺
Ｃ 骼 髁 髀 髅 髂 髋 髌 髑 魅 魃 魇 魉 魈 魍 魑 飨
Ｄ 餍 餮 饕 饔 髟 髡 髦 髯 髫 髻 髭 髹 鬈 鬏 鬓 鬟
Ｅ 鬣 麽 麾 縻 麂 麇 麈 麋 麒 鏖 麝 麟 黛 黜 黝 黠
Ｆ 黟 黢 黩 黧 黥 黪 黯 鼢 鼬 鼯 鼹 鼷 鼽 鼾 齄

正则表达式

正则表达式30分钟入门教程

转载来源版本：v2.33 (2013-1-10) 作者：deerchao

Get XRegExp 2.0: minified (3.5 KB gzipped), or with comments. Get the full package or the latest development build at GitHub.

Java中正则，中链接 Regular Expressions of Java Tutorial

Java正则表达式教程

<!--<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- saved from url=(0029)http://tool.chinaz.com/ -->
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>正则表达式在线测试 - 站长工具</title>
    <meta name="keywords" content="正则表达式在线测试,正则表达式测试工具">
    <meta name="description" content="该工具主要针对程序开发人员，通过该工具可以快速准备的判断所写的正则是否能正确匹配相应的字符">
    <link rel="icon" href="http://tool.chinaz.com/Chinaz.ico" type="image/x-icon">
    <!--<link href="http://tool.chinaz.com/template/default/styles/toolsite.css?ver=2011_11" rel="stylesheet" type="text/css">-->
    <!--<script src="http://tool.chinaz.com/template/default/js/globals.js?ver=2011_10" type="text/javascript"></script>-->
    <!--<link rel="Stylesheet" type="text/css" href="http://tool.chinaz.com/template/default/styles/topbar.css">-->
    <script language="JavaScript">
        //if (self != top) { top.location = self.location; }
    </script>
    <style type="text/css">
        ul, ol, li, dl, dd, p, h1, h2, h3, h4, h5, h6, form, fieldset {
            margin: 0;
            padding: 0;
        }

        h1, h2, h3, h4, h5, h6 {
            font-size: 1em;
        }

        ul, ol {
            list-style: none;
        }

        img {
            vertical-align: middle;
            border: 0;
        }

        a {
            color: #0d8c21;
            text-decoration: none;
        }

            a:hover {
                color: red;
                text-decoration: underline;
            }

        select {
            font-size: 15px;
        }

        .blue {
            color: #3333ff;
        }

        .fl {
            float: left;
        }

        .fr {
            float: right;
        }

        .box h1 {
            line-height: 37px;
            height: 37px;
            padding-left: 20px;
            background: url("http://tool.chinaz.com/template/default/images/h1-bg.gif") repeat-x;
            color: #0066CC;
            border: 1px solid #c5e2f2;
            border-bottom: 0;
            font-size: 14px;
            font-weight: normal;
        }

            .box h1 span {
                float: right;
                background: url("http://tool.chinaz.com/template/default/images/h1.gif") no-repeat right;
                padding: 10px 12px 0 0;
                font-weight: normal;
            }

            .box h1 a {
                color: #3333ff;
            }

        .box .titright {
            float: right;
            padding-right: 10px;
        }

        .box .titleft {
            float: left;
        }

        .box .notice {
            color: red;
            margin-bottom: 5px;
            background: none repeat scroll 0% 0% transparent;
            border: 1px solid rgb(197, 226, 242);
        }

        .clear {
            clear: both;
            font-size: 0;
            line-height: 0;
            height: 10px;
        }

        .input {
            border: 1px solid #94c6e1;
            background: #fff;
            color: #22ac38;
            font-weight: bold;
            padding: 5px;
            margin-bottom: 5px;
        }

        .input {
            font-size: 13px;
        }

        .but {
            width: 90px;
            border: 1px solid #c5e2f2;
            background: #cde4f2 url('http://tool.chinaz.com/template/default/images/but.gif') repeat-x 50% top;
            height: 30px;
            margin-left: 5px;
            cursor: pointer;
            margin-bottom: 5px;
        }

        .but2 {
            border: 1px solid #c5e2f2;
            background: #cde4f2 url('http://tool.chinaz.com/template/default/images/but.gif') repeat-x 50% top;
            height: 30px;
            margin-left: 5px;
            cursor: pointer;
            margin-bottom: 5px;
            width: 90px;
        }

        .but3 {
            border: 1px solid #c5e2f2;
            background: #cde4f2 url('http://tool.chinaz.com/template/default/images/but.gif') repeat-x 50% top;
            height: 30px;
            margin-left: 5px;
            cursor: pointer;
            margin-bottom: 5px;
            width: 50px;
        }

        .but4 {
            border: 1px solid #c5e2f2;
            background: #cde4f2 url('http://tool.chinaz.com/template/default/images/but.gif') repeat-x 50% top;
            height: 30px;
            margin-left: 5px;
            cursor: pointer;
            margin-bottom: 5px;
            width: 120px;
        }

        .input1 {
            border: 1px solid #7f9db9;
            background: #fff;
            color: #333;
            font-weight: bold;
            padding: 3px 5px;
            margin-bottom: 5px;
        }

        .but1 {
            border: 1px solid #7f9db9;
            background: #f0f7fd;
            height: 23px;
            margin-left: 5px;
            cursor: pointer;
            overflow: visible;
            padding: 0 15px;
            margin-bottom: 5px;
        }
        /* Page wrapper, applied to <div class="w4648">: 900px-wide column with auto margins.
           The selector was a bare "." (invalid CSS, so the rule was discarded). */
        .w4648 {
            margin: auto;
            width: 900px;
            clear: both;
        }

        td, th {
            border: 1px solid #C0C0C0;
            border-collapse: collapse;
            padding: 5px;
        }

        table {
            border-collapse: collapse;
            border: 1px solid #C0C0C0;
            margin: 0 auto;
        }

        .menu-list {
            z-index: 5;
        }

        #mainbody {
            padding-top: 10px;
            padding-bottom: 10px;
        }

        #condition ul li {
            float: left;
        }

        #search {
            height: 180px;
            width: 99.75%;
        }

        #input {
            height: 375px;
            margin-top: 10px;
            width: 99.75%;
        }

        .smartField {
            border: 1px solid #CCCCCC;
            overflow: auto;
            position: relative;
        }

            .smartField pre, .smartField textarea {
                width: 100%;
                padding: 0;
                margin: 0;
                font: 100% "courier new",monospace;
            }

            .smartField pre {
                text-align: left;
                color: #F9F9F9;
                z-index: 1;
            }

            .smartField textarea {
                background: none repeat scroll 0 0 transparent;
                border: 0 none;
                height: 100%;
                overflow: hidden;
                position: absolute;
                left: 0px;
                top: 0px;
                z-index: 2;
            }

        b, i, u {
            font-style: normal;
            font-weight: normal;
            text-decoration: none;
        }

        #input b {
            background: none repeat scroll 0 0 #FFF000;
            color: #FFF000;
        }

        #input i {
            background: none repeat scroll 0 0 #80C0FF;
            color: #80C0FF;
        }

        #search b {
            background: none repeat scroll 0 0 #AAD1F7;
            color: #AAD1F7;
        }

        #search i {
            background: none repeat scroll 0 0 #F9CA69;
            color: #F9CA69;
        }

            #search i b {
                background: none repeat scroll 0 0 #F7A700;
                color: #F7A700;
            }

            #search i u {
                background: none repeat scroll 0 0 #EFBA4A;
                color: #EFBA4A;
            }

        #search b.g1 {
            background: none repeat scroll 0 0 #D2F854;
            color: #D2F854;
        }

        #search b.g2 {
            background: none repeat scroll 0 0 #9EC70C;
            color: #9EC70C;
        }

        #search b.g3 {
            background: none repeat scroll 0 0 #ECC9F7;
            color: #ECC9F7;
        }

        #search b.g4 {
            background: none repeat scroll 0 0 #54B70B;
            color: #54B70B;
        }

        #search b.g5 {
            background: none repeat scroll 0 0 #B688CF;
            color: #B688CF;
        }

        #search b.err {
            background: none repeat scroll 0 0 #FF4300 !important;
            color: #FF4300 !important;
        }
    </style>
</head>
<body>
    <div class="w4648">
        <!--main-->
        <div class="main">
            <div class="box">
                <div id="b_1">
                    <h1><a style="color: #3333ff;" href="http://tool.chinaz.com/regex/">正则表达式在线测试</a></h1>
                    <div class="box1" style="text-align:center;">
                        <div id="condition">
                            <ul>
                                <li style=" display:none;"><input type="checkbox" checked="checked" id="toolG"><label for="toolG">全局</label></li>
                                <li><input type="checkbox" id="toolI"><label for="toolI">不区分大小写</label></li>
                                <li><input type="checkbox" id="toolM"><label for="toolM">对^$前后换行也支持</label></li>
                                <li><input type="checkbox" id="toolS"><label for="toolS">符号.匹配所有</label></li>
                            </ul>
                            <span><input type="checkbox" checked="checked" id="highSyntax"><label for="highSyntax">对正则着色</label></span>
                            <span><input type="checkbox" checked="checked" id="highMatch"><label for="highMatch">对匹配结果着色</label></span>
                            <span><input type="checkbox" id="invertMatch"><label for="invertMatch">对无匹配结果着色</label></span>
                        </div>
                        <div id="mainbody">
                            <div class="smartField" id="search">
                                <textarea spellcheck="false" tabindex="1" rows="3" cols="100" id="searchText" style="height: 180px; margin-left: 0px; width: 856px;">(\w+\.){2}\w+</textarea>
                            </div>
                            <div class="smartField" id="input" style="height: 180px;">
                                <textarea spellcheck="false" tabindex="2" rows="10" cols="100" id="inputText" style="height: 180px; margin-left: 0px; width: 856px;">tool.chinaz.com|888|http://www.cnblogs.com/Fang3s/p/4338103.html</textarea>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
            <!--<script type="text/javascript" src="http://tool.chinaz.com/template/default/js/regbase.js"></script>-->
            <!--<script type="text/javascript" src="http://tool.chinaz.com/template/default/js/reg.js"></script>-->
            <script type="text/javascript">
                //http://tool.chinaz.com/template/default/js/regbase.js
                /*    
                    XRegExp 0.2.5
                    (c)Steven Levithan
                    MIT license

                    Provides an augmented, cross-browser implementation of regular
                    expressions, including support for additional flags.
                */
                // XRegExp library (minified): defines the global XRegExp constructor with the extra "x", "s" and "k" flags,
                // adds RegExp.prototype.addFlags, and wraps RegExp.prototype.exec / String.prototype.match / String.prototype.replace
                // so regexes carrying _captureNames expose named captures. Also defines XRegExp.cache, XRegExp.overrideNative
                // and an Array.prototype.indexOf fallback.
                // In the "$<digits>" replacement path $1 is a number after `$1 = +$1`, so it is stringified before .split("").
                (function () { if (window.XRegExp) return; var real = { RegExp: RegExp, exec: RegExp.prototype.exec, match: String.prototype.match, replace: String.prototype.replace }; var re = { extended: /(?:[^[#\s\\]+|\\(?:[\S\s]|$)|\[\^?]?(?:[^\\\]]+|\\(?:[\S\s]|$))*]?)+|(\s*#[^\n\r\u2028\u2029]*\s*|\s+)([?*+]|{[0-9]+(?:,[0-9]*)?})?/g, singleLine: /(?:[^[\\.]+|\\(?:[\S\s]|$)|\[\^?]?(?:[^\\\]]+|\\(?:[\S\s]|$))*]?)+|\./g, characterClass: /(?:[^\\[]+|\\(?:[\S\s]|$))+|\[\^?(]?)(?:[^\\\]]+|\\(?:[\S\s]|$))*]?/g, capturingGroup: /(?:[^[(\\]+|\\(?:[\S\s]|$)|\[\^?]?(?:[^\\\]]+|\\(?:[\S\s]|$))*]?|\((?=\?))+|(\()(?:<([$\w]+)>)?/g, namedBackreference: /(?:[^\\[]+|\\(?:[^k]|$)|\[\^?]?(?:[^\\\]]+|\\(?:[\S\s]|$))*]?|\\k(?!<[$\w]+>))+|\\k<([$\w]+)>([0-9]?)/g, replacementVariable: /(?:[^$]+|\$(?![1-9$&`']|{[$\w]+}))+|\$(?:([1-9]\d*|[$&`'])|{([$\w]+)})/g }; XRegExp = function (pattern, flags) { flags = flags || ""; if (flags.indexOf("x") > -1) { pattern = real.replace.call(pattern, re.extended, function ($0, $1, $2) { return $1 ? ($2 || "(?:)") : $0 }) }; var hasNamedCapture = false; if (flags.indexOf("k") > -1) { var captureNames = []; pattern = real.replace.call(pattern, re.capturingGroup, function ($0, $1, $2) { if ($1) { if ($2) hasNamedCapture = true; captureNames.push($2 || null); return "(" } else { return $0 } }); if (hasNamedCapture) { pattern = real.replace.call(pattern, re.namedBackreference, function ($0, $1, $2) { var index = $1 ? captureNames.indexOf($1) : -1; return index > -1 ? "\\" + (index + 1) + ($2 ? "(?:)" + $2 : "") : $0 }) } }; pattern = real.replace.call(pattern, re.characterClass, function ($0, $1) { return $1 ? real.replace.call($0, "]", "\\]") : $0 }); if (flags.indexOf("s") > -1) { pattern = real.replace.call(pattern, re.singleLine, function ($0) { return $0 === "." ? 
"[\\S\\s]" : $0 }) }; var regex = real.RegExp(pattern, real.replace.call(flags, /[sxk]+/g, "")); if (hasNamedCapture) regex._captureNames = captureNames; return regex }; RegExp.prototype.addFlags = function (flags) { flags = (flags || "") + (this.global ? "g" : "") + (this.ignoreCase ? "i" : "") + (this.multiline ? "m" : ""); var regex = new XRegExp(this.source, flags); if (!regex._captureNames && this._captureNames) regex._captureNames = this._captureNames.slice(0); return regex }; RegExp.prototype.exec = function (str) { var result = real.exec.call(this, str); if (!(this._captureNames && result && result.length > 1)) return result; for (var i = 1; i < result.length; i++) { var name = this._captureNames[i - 1]; if (name) result[name] = result[i] }; return result }; String.prototype.match = function (regex) { if (!regex._captureNames || regex.global) return real.match.call(this, regex); return regex.exec(this) }; String.prototype.replace = function (search, replacement) { if (!(search instanceof real.RegExp && search._captureNames)) return real.replace.apply(this, arguments); if (typeof replacement === "function") { return real.replace.call(this, search, function () { arguments[0] = new String(arguments[0]); for (var i = 0; i < search._captureNames.length; i++) { if (search._captureNames[i]) arguments[0][search._captureNames[i]] = arguments[i + 1] }; return replacement.apply(window, arguments) }) } else { return real.replace.call(this, search, function () { var args = arguments; return real.replace.call(replacement, re.replacementVariable, function ($0, $1, $2) { if ($1) { switch ($1) { case "$": return "$"; case "&": return args[0]; case "`": return args[args.length - 1].slice(0, args[args.length - 2]); case "'": return args[args.length - 1].slice(args[args.length - 2] + args[0].length); default: var literalNumbers = ""; $1 = +$1; while ($1 > search._captureNames.length) { literalNumbers = String($1).split("").pop() + literalNumbers; $1 = Math.floor($1 / 10) }; return ($1 
? args[$1] : "$") + literalNumbers } } else if ($2) { var index = search._captureNames.indexOf($2); return index > -1 ? args[index + 1] : $0 } else { return $0 } }) }) } } })(); XRegExp.cache = function (pattern, flags) { var key = "/" + pattern + "/" + (flags || ""); return XRegExp.cache[key] || (XRegExp.cache[key] = new XRegExp(pattern, flags)) }; XRegExp.overrideNative = function () { RegExp = XRegExp }; if (!Array.prototype.indexOf) { Array.prototype.indexOf = function (item, from) { var len = this.length; for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) { if (this[i] === item) return i }; return -1 } }

                // http://tool.chinaz.com/template/default/js/reg.js
                function $(el) { if (el.nodeName) return el; if (typeof el === "string") return document.getElementById(el); return false }; var trim = function () { var lSpace = /^\s\s*/, rSpace = /\s\s*$/; return function (str) { return str.replace(lSpace, "").replace(rSpace, "") } }(); function replaceHtml(el, html) { var oldEl = $(el); var newEl = oldEl.cloneNode(false); newEl.innerHTML = html; oldEl.parentNode.replaceChild(newEl, oldEl); return newEl }; function replaceOuterHtml(el, html) { el = replaceHtml(el, ""); if (el.outerHTML) { var id = el.id, className = el.className, nodeName = el.nodeName; el.outerHTML = "<" + nodeName + " id=\"" + id + "\" class=\"" + className + "\">" + html + "</" + nodeName + ">"; el = $(id) } else { el.innerHTML = html }; return el }; function getElementsByClassName(className, tagName, parentNode) { var els = ($(parentNode) || document).getElementsByTagName(tagName || "*"), results = []; for (var i = 0; i < els.length; i++) { if (hasClass(className, els[i])) results.push(els[i]) }; return results }; function hasClass(className, el) { return XRegExp.cache("(?:^|\\s)" + className + "(?:\\s|$)").test($(el).className) }; function addClass(className, el) { el = $(el); if (!hasClass(className, el)) { el.className = trim(el.className + " " + className) } }; function removeClass(className, el) { el = $(el); el.className = trim(el.className.replace(XRegExp.cache("(?:^|\\s)" + className + "(?:\\s|$)", "g"), " ")) }; function toggleClass(className, el) { if (hasClass(className, el)) { removeClass(className, el) } else { addClass(className, el) } }; function swapClass(oldClass, newClass, el) { removeClass(oldClass, el); addClass(newClass, el) }; function replaceSelection(textbox, str) { if (textbox.setSelectionRange) { var start = textbox.selectionStart, end = textbox.selectionEnd, offset = (start + str.length); textbox.value = (textbox.value.substring(0, start) + str + textbox.value.substring(end)); textbox.setSelectionRange(offset, 
offset) } else if (document.selection) { var range = document.selection.createRange(); range.text = str; range.select() } }; function extend(to, from) { for (var property in from) to[property] = from[property]; return to }; function purge(d) { var a = d.attributes, i, l, n; if (a) { l = a.length; for (i = 0; i < l; i += 1) { n = a[i].name; if (typeof d[n] === 'function') { d[n] = null } } }; a = d.childNodes; if (a) { l = a.length; for (i = 0; i < l; i += 1) { purge(d.childNodes[i]) } } }; var isWebKit = navigator.userAgent.indexOf("WebKit") > -1, isIE, isIE6 = isIE && !window.XMLHttpRequest; var RegexPal = { fields: { search: new SmartField("search"), input: new SmartField("input"), options: { flags: { g: $("toolG"), i: $("toolI"), m: $("toolM"), s: $("toolS") }, highlightSyntax: $("highSyntax"), highlightMatches: $("highMatch"), invertMatches: $("invertMatch") } } }; extend(RegexPal, function () { var f = RegexPal.fields, o = f.options; return { highlightMatches: function () { var re = { matchPair: /`~\{((?:[^}]+|\}(?!~`))*)\}~`((?:[^`]+|`(?!~\{(?:[^}]+|\}(?!~`))*\}~`))*)(?:`~\{((?:[^}]+|\}(?!~`))*)\}~`)?/g, sansTrailingAlternator: /^(?:[^\\|]+|\\[\S\s]?|\|(?=[\S\s]))*/ }; return function () { var search = String(f.search.textbox.value), input = String(f.input.textbox.value); if (XRegExp.cache('<[bB] class="?err"?>').test(f.search.bg.innerHTML) || (!search.length && !o.invertMatches.checked) || !o.highlightMatches.checked) { f.input.clearBg(); return }; try { var searchRegex = new XRegExp(re.sansTrailingAlternator.exec(search)[0], (o.flags.g.checked ? "g" : "") + (o.flags.i.checked ? "i" : "") + (o.flags.m.checked ? "m" : "") + (o.flags.s.checked ? 
"s" : "")) } catch (err) { f.input.clearBg(); return }; if (o.invertMatches.checked) { var output = ("`~{" + input.replace(searchRegex, "}~`$&`~{") + "}~`").replace(XRegExp.cache("`~\\{\\}~`|\\}~``~\\{", "g"), "") } else { var output = input.replace(searchRegex, "`~{$&}~`") }; output = output.replace(XRegExp.cache("[<&>]", "g"), "_").replace(re.matchPair, "<b>$1</b>$2<i>$3</i>"); f.input.setBgHtml(output) } }(), highlightSearchSyntax: function () { if (o.highlightSyntax.checked) { f.search.setBgHtml(parseRegex(f.search.textbox.value)) } else { f.search.clearBg() } } } }()); var parseRegex = function () { var re = { regexToken: /\[\^?]?(?:[^\\\]]+|\\[\S\s]?)*]?|\\(?:0(?:[0-3][0-7]{0,2}|[4-7][0-7]?)?|[1-9][0-9]*|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|c[A-Za-z]|[\S\s]?)|\((?:\?[:=!]?)?|(?:[?*+]|\{[0-9]+(?:,[0-9]*)?\})\??|[^.?*+^${[()|\\]+|./g, characterClassParts: /^(<opening>\[\^?)(<contents>]?(?:[^\\\]]+|\\[\S\s]?)*)(<closing>]?)$/.addFlags("k"), characterClassToken: /[^\\-]+|-|\\(?:[0-3][0-7]{0,2}|[4-7][0-7]?|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|c[A-Za-z]|[\S\s]?)/g, quantifier: /^(?:[?*+]|\{[0-9]+(?:,[0-9]*)?\})\??$/ }, type = { NONE: 0, RANGE_HYPHEN: 1, METACLASS: 2, ALTERNATOR: 3 }; function errorStr(str) { return '<b class="err">' + str + '</b>' }; function getTokenCharCode(token) { if (token.length > 1 && token.charAt(0) === "\\") { var t = token.slice(1); if (XRegExp.cache("^c[A-Za-z]$").test(t)) { return "ABCDEFGHIJKLMNOPQRSTUVWXYZ".indexOf(t.charAt(1).toUpperCase()) + 1 } else if (XRegExp.cache("^(?:x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4})$").test(t)) { return parseInt(t.slice(1), 16) } else if (XRegExp.cache("^(?:[0-3][0-7]{0,2}|[4-7][0-7]?)$").test(t)) { return parseInt(t, 8) } else if (t.length === 1 && "cuxDdSsWw".indexOf(t) > -1) { return false } else if (t.length === 1) { switch (t) { case "b": return 8; case "f": return 12; case "n": return 10; case "r": return 13; case "t": return 9; case "v": return 11; default: return t.charCodeAt(0) } } } else if (token !== "\\") { 
return token.charCodeAt(0) }; return false }; function parseCharacterClass(value) { var output = "", parts = re.characterClassParts.exec(value), parser = re.characterClassToken, lastToken = { rangeable: false, type: type.NONE }, match, m; output += parts.closing ? parts.opening : errorStr(parts.opening); while (match = parser.exec(parts.contents)) { m = match[0]; if (m.charAt(0) === "\\") { if (XRegExp.cache("^\\\\[cux]$").test(m)) { output += errorStr(m); lastToken = { rangeable: lastToken.type !== type.RANGE_HYPHEN } } else if (XRegExp.cache("^\\\\[dsw]$", "i").test(m)) { output += "<b>" + m + "</b>"; lastToken = { rangeable: lastToken.type !== type.RANGE_HYPHEN, type: type.METACLASS } } else if (m === "\\") { output += errorStr(m) } else { output += "<b>" + m.replace(XRegExp.cache("[<&>]"), "_") + "</b>"; lastToken = { rangeable: lastToken.type !== type.RANGE_HYPHEN, charCode: getTokenCharCode(m) } } } else if (m === "-") { if (lastToken.rangeable) { var lastIndex = parser.lastIndex, nextToken = parser.exec(parts.contents); if (nextToken) { var nextTokenCharCode = getTokenCharCode(nextToken[0]); if ((nextTokenCharCode !== false && lastToken.charCode > nextTokenCharCode) || lastToken.type === type.METACLASS || XRegExp.cache("^\\\\[dsw]$", "i").test(nextToken[0])) { output += errorStr("-") } else { output += "<u>-</u>" }; lastToken = { rangeable: false, type: type.RANGE_HYPHEN } } else { if (parts.closing) { output += "-" } else { output += "<u>-</u>"; break } }; parser.lastIndex = lastIndex } else { output += "-"; lastToken = { rangeable: lastToken.type !== type.RANGE_HYPHEN } } } else { output += m.replace(XRegExp.cache("[<&>]", "g"), "_"); lastToken = { rangeable: (m.length > 1 || lastToken.type !== type.RANGE_HYPHEN), charCode: m.charCodeAt(m.length - 1) } } }; return output + parts.closing }; return function (value) { var output = "", capturingGroupCount = 0, groupStyleDepth = 0, openGroups = [], lastToken = { quantifiable: false, type: type.NONE }, match, m; 
function groupStyleStr(str) { return '<b class="g' + groupStyleDepth + '">' + str + '</b>' }; while (match = re.regexToken.exec(value)) { m = match[0]; switch (m.charAt(0)) { case "[": output += "<i>" + parseCharacterClass(m) + "</i>"; lastToken = { quantifiable: true }; break; case "(": if (m.length === 2) { output += errorStr(m) } else { if (m.length === 1) capturingGroupCount++; groupStyleDepth = groupStyleDepth === 5 ? 1 : groupStyleDepth + 1; openGroups.push({ index: output.length + 14, opening: m }); output += groupStyleStr(m) }; lastToken = { quantifiable: false }; break; case ")": if (!openGroups.length) { output += errorStr(")"); lastToken = { quantifiable: false } } else { output += groupStyleStr(")"); lastToken = { quantifiable: !XRegExp.cache("^[=!]").test(openGroups[openGroups.length - 1].opening.charAt(2)), style: "g" + groupStyleDepth }; groupStyleDepth = groupStyleDepth === 1 ? 5 : groupStyleDepth - 1; openGroups.pop() }; break; case "\\": if (XRegExp.cache("^[1-9]").test(m.charAt(1))) { var nonBackrefDigits = "", num = +m.slice(1); while (num > capturingGroupCount) { nonBackrefDigits = XRegExp.cache("[0-9]$").exec(num)[0] + nonBackrefDigits; num = Math.floor(num / 10) }; if (num > 0) { output += "<b>\\" + num + "</b>" + nonBackrefDigits } else { var parts = XRegExp.cache("^\\\\([0-3][0-7]{0,2}|[4-7][0-7]?|[89])([0-9]*)").exec(m); output += "<b>\\" + parts[1] + "</b>" + parts[2] } } else if (XRegExp.cache("^[0bBcdDfnrsStuvwWx]").test(m.charAt(1))) { if (XRegExp.cache("^\\\\[cux]$").test(m)) { output += errorStr(m); lastToken = { quantifiable: false }; break }; output += "<b>" + m + "</b>"; if ("bB".indexOf(m.charAt(1)) > -1) { lastToken = { quantifiable: false }; break } } else if (m === "\\") { output += errorStr(m) } else { output += m.replace(XRegExp.cache("[<&>]"), "_") }; lastToken = { quantifiable: true }; break; default: if (re.quantifier.test(m)) { if (lastToken.quantifiable) { var interval = 
XRegExp.cache("^\\{([0-9]+)(?:,([0-9]*))?").exec(m); if (interval && ((interval[1] > 65535) || (interval[2] && ((interval[2] > 65535) || (+interval[1] > +interval[2]))))) { output += errorStr(m) } else { output += (lastToken.style ? '<b class="' + lastToken.style + '">' : '<b>') + m + '</b>' } } else { output += errorStr(m) }; lastToken = { quantifiable: false } } else if (m === "|") { if (lastToken.type === type.NONE || (lastToken.type === type.ALTERNATOR && !openGroups.length)) { output += errorStr(m) } else { output += openGroups.length ? groupStyleStr("|") : "<b>|</b>" }; lastToken = { quantifiable: false, type: type.ALTERNATOR } } else if ("^$".indexOf(m) > -1) { output += "<b>" + m + "</b>"; lastToken = { quantifiable: false } } else if (m === ".") { output += "<b>.</b>"; lastToken = { quantifiable: true } } else { output += m.replace(XRegExp.cache("[<&>]", "g"), "_"); lastToken = { quantifiable: true } } } }; var numCharsAdded = 0; for (var i = 0; i < openGroups.length; i++) { var errorIndex = openGroups[i].index + numCharsAdded; output = (output.slice(0, errorIndex) + errorStr(openGroups[i].opening) + output.slice(errorIndex + openGroups[i].opening.length)); numCharsAdded += errorStr("").length }; return output } }(); function SmartField(el) { el = $(el); var textboxEl = el.getElementsByTagName("textarea")[0], bgEl = document.createElement("pre"); textboxEl.id = el.id + "Text"; bgEl.id = el.id + "Bg"; el.insertBefore(bgEl, textboxEl); textboxEl.onkeydown = function (e) { SmartField.prototype._onKeyDown(e) }; textboxEl.onkeyup = function (e) { SmartField.prototype._onKeyUp(e) }; if (isIE) el.style.overflowX = "hidden"; if (isWebKit) textboxEl.style.marginLeft = 0; this.field = el; this.textbox = textboxEl; this.bg = bgEl }; extend(SmartField.prototype, { setBgHtml: function (html) { html = html.replace(XRegExp.cache("^\\n"), "\n\n"); this.bg = replaceOuterHtml(this.bg, html + "<br>&nbsp;"); this.setDimensions() }, clearBg: function () { 
this.setBgHtml(this.textbox.value.replace(XRegExp.cache("[<&>]", "g"), "_")) }, setDimensions: function () { this.textbox.style.width = ""; var scrollWidth = this.textbox.scrollWidth, offsetWidth = this.textbox.offsetWidth; this.textbox.style.width = (scrollWidth === offsetWidth ? offsetWidth - 1 : scrollWidth + 8) + "px"; this.textbox.style.height = Math.max(this.bg.offsetHeight, this.field.offsetHeight - 2) + "px" }, _onKeyDown: function (e) { e = e || event; if (!this._filterKeys(e)) return false; var srcEl = e.srcElement || e.target; switch (srcEl) { case RegexPal.fields.search.textbox: setTimeout(function () { RegexPal.highlightSearchSyntax.call(RegexPal) }, 0); break }; if (isWebKit && srcEl.selectionEnd === srcEl.value.length) { srcEl.parentNode.scrollTop = srcEl.scrollHeight }; this._testKeyHold(e) }, _onKeyUp: function (e) { e = e || event; var srcEl = e.srcElement || e.target; this._keydownCount = 0; if (this._matchOnKeyUp) { this._matchOnKeyUp = false; switch (srcEl) { case RegexPal.fields.search.textbox: case RegexPal.fields.input.textbox: RegexPal.highlightMatches(); break } } }, _testKeyHold: function (e) { var srcEl = e.srcElement || e.target; this._keydownCount++; if (this._keydownCount > 2) { RegexPal.fields.input.clearBg(); this._matchOnKeyUp = true } else { switch (srcEl) { case RegexPal.fields.search.textbox: case RegexPal.fields.input.textbox: setTimeout(function () { RegexPal.highlightMatches.call(RegexPal) }, 0); break } } }, _filterKeys: function (e) { var srcEl = e.srcElement || e.target, f = RegexPal.fields; if (this._deadKeys.indexOf(e.keyCode) > -1) return false; if ((e.keyCode === 9) && (srcEl === f.input.textbox || (srcEl === f.search.textbox && !e.shiftKey))) { if (srcEl === f.input.textbox) { if (e.shiftKey) { f.search.textbox.focus() } else { replaceSelection(srcEl, "\t"); if (window.opera) setTimeout(function () { srcEl.focus() }, 0) } } else { f.input.textbox.focus() }; if (e.preventDefault) e.preventDefault(); else e.returnValue 
= false }; return true }, _matchOnKeyUp: false, _keydownCount: 0, _deadKeys: [16, 17, 18, 19, 20, 27, 33, 34, 35, 36, 37, 38, 39, 40, 44, 45, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 144, 145] }); (function () { var f = RegexPal.fields, o = f.options; onresize = function (e) { var isIE1 = !!window.ActiveXObject; var isIE61 = isIE1 && !window.XMLHttpRequest; if (isIE61) f.input.field.style.height = Math.max((window.innerHeight || document.documentElement.clientHeight) - 310, 180) + "px"; else f.input.field.style.height = Math.max((window.innerHeight || document.documentElement.clientHeight) - 610, 180) + "px"; f.search.setDimensions(); f.input.setDimensions() }; onresize(); RegexPal.highlightSearchSyntax(); RegexPal.highlightMatches(); for (var flag in o.flags) { o.flags[flag].onclick = RegexPal.highlightMatches }; o.highlightSyntax.onclick = RegexPal.highlightSearchSyntax; o.highlightMatches.onclick = RegexPal.highlightMatches; o.invertMatches.onclick = RegexPal.highlightMatches; function makeResetter(field) { return function () { field.clearBg(); field.textbox.value = ""; field.textbox.onfocus = null } }; if (f.search.textbox.value == "(\\w+\\.){2}\\w+") { f.search.textbox.onfocus = makeResetter(f.search) }; if (f.input.textbox.value === "tool.chinaz.com|888") { f.input.textbox.onfocus = makeResetter(f.input) } })();
            </script>

            <div class="box">
                <div id="b_14">
                    <h1>工具简介</h1>
                    <div class="box1">
                        <span class="info2" style=" font-size: 14px; line-height: 24px; text-align: left;white-space:normal; width:860px;overflow:hidden;">
                            <span style=" font-weight:bold; color:Red;">正则表达式到底是什么东西？</span><br>在编写处理字符串的程序或网页时，经常会有查找符合某些复杂规则的字符串的需要。正则表达式就是用于描述这些规则的工具。换句话说，正则表达式就是记录文本规则的代码。<br><span style=" font-weight:bold; color:Red;">常用元字符</span><br><table cellspacing="0"><thead><tr><th scope="col">代码</th><th scope="col">说明</th></tr></thead><tbody><tr><td><span class="code">.</span></td><td><span class="desc">匹配除换行符以外的任意字符</span></td></tr><tr><td><span class="code">\w</span></td><td><span class="desc">匹配字母或数字或下划线或汉字</span></td></tr><tr><td><span class="code">\s</span></td><td><span class="desc">匹配任意的空白符</span></td></tr><tr><td><span class="code">\d</span></td><td><span class="desc">匹配数字</span></td></tr><tr><td><span class="code">\b</span></td><td><span class="desc">匹配单词的开始或结束</span></td></tr><tr><td><span class="code">^</span></td><td><span class="desc">匹配字符串的开始</span></td></tr><tr><td><span class="code">$</span></td><td><span class="desc">匹配字符串的结束</span></td></tr></tbody></table><br><span style=" font-weight:bold; color:Red;">常用限定符</span><br><table cellspacing="0"><thead><tr><th scope="col">代码/语法</th><th scope="col">说明</th></tr></thead><tbody><tr><td><span class="code">*</span></td><td><span class="desc">重复零次或更多次</span></td></tr><tr><td><span class="code">+</span></td><td><span class="desc">重复一次或更多次</span></td></tr><tr><td><span class="code">?</span></td><td><span class="desc">重复零次或一次</span></td></tr><tr><td><span class="code">{n}</span></td><td><span class="desc">重复n次</span></td></tr><tr><td><span class="code">{n,}</span></td><td><span class="desc">重复n次或更多次</span></td></tr><tr><td><span class="code">{n,m}</span></td><td><span class="desc">重复n到m次</span></td></tr></tbody></table><br><span style=" font-weight:bold; color:Red;">常用反义词</span><br><table cellspacing="0"><thead><tr><th scope="col">代码/语法</th><th scope="col">说明</th></tr></thead><tbody><tr><td><span class="code">\W</span></td><td><span class="desc">匹配任意不是字母，数字，下划线，汉字的字符</span></td></tr><tr><td><span 
class="code">\S</span></td><td><span class="desc">匹配任意不是空白符的字符</span></td></tr><tr><td><span class="code">\D</span></td><td><span class="desc">匹配任意非数字的字符</span></td></tr><tr><td><span class="code">\B</span></td><td><span class="desc">匹配不是单词开头或结束的位置</span></td></tr><tr><td><span class="code">[^x]</span></td><td><span class="desc">匹配除了x以外的任意字符</span></td></tr><tr><td><span class="code">[^aeiou]</span></td><td><span class="desc">匹配除了aeiou这几个字母以外的任意字符</span></td></tr></tbody></table><br>
                        </span>
                    </div>
                </div>
                <div style=" height:5px;"></div>
            </div>
        </div>
    </div>
</body>
</html>-->

正则表达式在线测试

/* utf-8: 0xc0, 0xe0, 0xf0, 0xf8, 0xfc

char str[] = "hello,中文字", len = strlen(str);
int utf8CharLen;
for (int i = 0, utf8CharLen; i < len; i += utf8CharLen)
{
utf8CharLen = BYTE_WIDTH_UTF8(str[i]);
printf("str[%d] is a word character with %d bytes\n", i, utf8CharLen);
}
*/
/*
 * Sequence length in bytes for every possible first byte of a UTF-8
 * sequence, using the original 1..6 byte scheme.
 * Bytes that cannot start a sequence (continuation bytes 0x80-0xBF and
 * 0xFE/0xFF) are listed as 1.
 */
unsigned char mblen_table_utf8[256] =
{
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Look up the sequence length for lead byte x; the cast keeps a negative
 * plain-char value from indexing below the table. */
#define BYTE_WIDTH_UTF8(x)  (mblen_table_utf8[(unsigned char)(x)])

/*
 * Demo: step through a UTF-8 string one encoded character at a time and
 * print, for each lead byte offset, the byte width BYTE_WIDTH_UTF8 reports.
 * The source file must be saved as UTF-8 for the literal below to contain
 * UTF-8 bytes.
 */
int _tmain(int argc, _TCHAR* argv[])
{
    char str[] = "hello,中文字";
    size_t len = strlen(str);   /* length in bytes, not in characters */
    int utf8CharLen;            /* width of the character starting at str[i] */

    (void)argc;
    (void)argv;

    /* The table never yields 0, so the loop always advances. */
    for (int i = 0; (size_t)i < len; i += utf8CharLen)
    {
        utf8CharLen = BYTE_WIDTH_UTF8(str[i]);
        printf("str[%d] is a word character with %d bytes\n", i, utf8CharLen);
    }
    getchar();  /* keep the console window open until a key is pressed */
    return 0;
}

下面

比如某二进制字节是UTF-8编码的字符串的实际字节，则作用和public String(byte bytes[], String charsetName)取codePoint类似

/* UTF8 utilities */

/* Decode the first UTF-8 sequence found in a buffer.
 *
 * 'str' points at the bytes and 'len' says how many of them are available.
 * On success the decoded value is stored in '*val' and the number of bytes
 * consumed (1..6) is returned. A 'len' of zero or less returns 0. On error
 * '*val' is left untouched and a negative code is returned:
 * -1 = string too short
 * -2 = illegal character
 * -3 = subsequent characters not of the form 10xxxxxx
 * -4 = character encoded incorrectly (not minimal length).
 */

int UTF8_getc(const unsigned char *str, int len, unsigned long *val)
{
    unsigned long decoded, minimum;
    unsigned char lead;
    int width, i;

    if(len <= 0) return 0;
    lead = str[0];

    /* Classify the lead byte: sequence width, payload bits carried by the
     * lead byte, and the smallest value that legitimately needs this width. */
    if(lead < 0x80) {
        width = 1; decoded = lead;        minimum = 0;
    } else if(lead < 0xc0) {
        return -2;                        /* stray continuation byte */
    } else if(lead < 0xe0) {
        width = 2; decoded = lead & 0x1f; minimum = 0x80;
    } else if(lead < 0xf0) {
        width = 3; decoded = lead & 0x0f; minimum = 0x800;
    } else if(lead < 0xf8) {
        width = 4; decoded = lead & 0x07; minimum = 0x10000;
    } else if(lead < 0xfc) {
        width = 5; decoded = lead & 0x03; minimum = 0x200000;
    } else if(lead < 0xfe) {
        width = 6; decoded = lead & 0x01; minimum = 0x4000000;
    } else {
        return -2;                        /* 0xfe and 0xff never start a sequence */
    }

    /* The length check must come first so the loops below stay in bounds. */
    if(len < width) return -1;

    /* Every trailing byte has to look like 10xxxxxx. */
    for(i = 1; i < width; i++) {
        if((str[i] & 0xc0) != 0x80) return -3;
    }

    /* Append six payload bits per trailing byte. */
    for(i = 1; i < width; i++) {
        decoded = (decoded << 6) | (unsigned long)(str[i] & 0x3f);
    }

    /* Reject encodings that are longer than the value requires. */
    if(decoded < minimum) return -4;

    *val = decoded;
    return width;
}

/* Encode 'value' as UTF-8 (1..6 byte scheme) into 'str', a buffer with room
 * for 'len' bytes. Returns the number of bytes the encoding occupies, or -1
 * when 'len' is too small (including 'len' <= 0 with a non-NULL 'str').
 * Passing 'str' as NULL only computes the length; nothing is written and
 * 'len' is ignored. At most 6 bytes are produced.
 */

int UTF8_putc(unsigned char *str, int len, unsigned long value)
{
    unsigned char marker, payload_mask;
    int width, i;

    if(!str) len = 6;    /* length query: pretend the buffer is big enough */
    else if(len <= 0) return -1;

    /* Pick the sequence width plus the lead byte's marker bits and the mask
     * for the payload bits it carries. */
    if(value < 0x80)           { width = 1; marker = 0x00; payload_mask = 0x7f; }
    else if(value < 0x800)     { width = 2; marker = 0xc0; payload_mask = 0x1f; }
    else if(value < 0x10000)   { width = 3; marker = 0xe0; payload_mask = 0x0f; }
    else if(value < 0x200000)  { width = 4; marker = 0xf0; payload_mask = 0x07; }
    else if(value < 0x4000000) { width = 5; marker = 0xf8; payload_mask = 0x03; }
    else                       { width = 6; marker = 0xfc; payload_mask = 0x01; }

    if(len < width) return -1;
    if(!str) return width;

    /* Fill the trailing 10xxxxxx bytes from the end, six bits at a time;
     * whatever remains in 'value' belongs to the lead byte. */
    for(i = width - 1; i > 0; i--) {
        str[i] = (unsigned char)((value & 0x3f) | 0x80);
        value >>= 6;
    }
    str[0] = (unsigned char)((value & payload_mask) | marker);
    return width;
}

http://www.leonerd.org.uk/code/libtickit/doc/

tickit_string_putchar(3)	tickit_string_putchar - append a UTF-8 encoded codepoint to a buffer
tickit_string_seqlen(3)	tickit_string_seqlen - determine the length of a UTF-8 codepoint encoding

/*
 * Decode one UTF-8 sequence (1 to 4 bytes) from the start of 'str'.
 *
 * str - bytes to decode; a NUL byte is treated as end of input
 * len - number of bytes available at 'str'
 * cp  - receives the decoded codepoint; written only on success
 *
 * Returns the number of bytes consumed, or -1 when the buffer is empty, the
 * first byte is NUL or cannot start a sequence, the sequence is truncated,
 * or a trailing byte is not of the form 10xxxxxx. Overlong encodings and
 * surrogate values are not rejected.
 */
static int next_utf8(const char *str, size_t len, uint32_t *cp)
{
  const unsigned char *bytes = (const unsigned char *)str;
  unsigned char b0;
  uint32_t value;
  int nbytes;

  /* Check the length before touching the buffer at all. */
  if(!len)
    return -1;

  b0 = bytes[0];

  if(!b0)
    return -1;
  else if(b0 < 0x80) { // ASCII
    *cp = b0; return 1;
  }
  else if(b0 < 0xc0) // C1 or continuation
    return -1;
  else if(b0 < 0xe0) {
    nbytes = 2; value = b0 & 0x1f;
  }
  else if(b0 < 0xf0) {
    nbytes = 3; value = b0 & 0x0f;
  }
  else if(b0 < 0xf8) {
    nbytes = 4; value = b0 & 0x07;
  }
  else
    return -1;

  if(len < (size_t)nbytes)
    return -1;

  for(int i = 1; i < nbytes; i++) {
    unsigned char b = bytes[i];

    /* Also catches an embedded NUL, since 0x00 is not 10xxxxxx. */
    if((b & 0xc0) != 0x80)
      return -1;

    value = (value << 6) | (uint32_t)(b & 0x3f);
  }

  *cp = value;
  return nbytes;
}

/*
 * Number of bytes (1..6) needed to encode 'codepoint' as UTF-8.
 * Values below 0x80, including negative ones, yield 1.
 */
int tickit_string_seqlen(long codepoint)
{
  /* first_beyond[k] is the smallest value that no longer fits in k+1 bytes */
  static const long first_beyond[] = {
    0x0000080, 0x0000800, 0x0010000, 0x0200000, 0x4000000
  };
  int nbytes = 1;

  while(nbytes < 6 && codepoint >= first_beyond[nbytes - 1])
    nbytes++;

  return nbytes;
}

/*
 * Write the UTF-8 encoding of 'codepoint' into 'str', which has room for
 * 'len' bytes. No NUL terminator is added.
 *
 * Returns the number of bytes the encoding occupies. With a NULL 'str' only
 * that length is returned. If 'len' is too small, -1 is returned, which the
 * size_t return type turns into the largest size_t value.
 */
size_t tickit_string_putchar(char *str, size_t len, long codepoint)
{
  /* Marker bits and payload mask of the lead byte, indexed by nbytes - 1 */
  static const unsigned char lead_marker[6] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
  static const unsigned char lead_mask[6]   = { 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

  int nbytes = tickit_string_seqlen(codepoint);
  if(!str)
    return nbytes;
  if(len < nbytes)
    return -1;

  // Trailing bytes go in last-to-first, six payload bits each
  for(int pos = nbytes - 1; pos > 0; pos--) {
    str[pos] = 0x80 | (codepoint & 0x3f);
    codepoint >>= 6;
  }

  // Whatever is left of the codepoint lands in the lead byte
  if(nbytes >= 1 && nbytes <= 6)
    str[0] = lead_marker[nbytes - 1] | (codepoint & lead_mask[nbytes - 1]);

  return nbytes;
}

由codePoint计算这个值转化为UTF8应该占几个字节

/* The following functions copied and adapted from libtermkey
 *
 * http://www.leonerd.org.uk/code/libtermkey/
 */
/* Number of bytes (1..6) that 'codepoint' occupies once encoded as UTF-8.
 * Anything below 0x80, negative values included, counts as one byte. */
static inline unsigned int utf8_seqlen(long codepoint)
{
  /* Start from the widest form and narrow it while the value still fits. */
  unsigned int nbytes = 6;

  if(codepoint < 0x4000000) nbytes = 5;
  if(codepoint < 0x0200000) nbytes = 4;
  if(codepoint < 0x0010000) nbytes = 3;
  if(codepoint < 0x0000800) nbytes = 2;
  if(codepoint < 0x0000080) nbytes = 1;

  return nbytes;
}

/* Encode 'codepoint' as UTF-8 into 'str' and return the number of bytes
 * written. The buffer is not bounds-checked and is NOT NUL-terminated, so
 * the caller must supply room for utf8_seqlen(codepoint) bytes. */
static int fill_utf8(long codepoint, char *str)
{
  /* Marker bits and payload mask of the lead byte, indexed by nbytes - 1 */
  static const unsigned char lead_marker[6] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
  static const unsigned char lead_mask[6]   = { 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

  int nbytes = utf8_seqlen(codepoint);
  char *slot = str + nbytes;

  // Walk back from the last byte, storing six payload bits per trailing byte
  while(--slot > str) {
    *slot = 0x80 | (codepoint & 0x3f);
    codepoint >>= 6;
  }

  // The remaining high bits belong in the lead byte
  if(nbytes >= 1 && nbytes <= 6)
    str[0] = lead_marker[nbytes - 1] | (codepoint & lead_mask[nbytes - 1]);

  return nbytes;
}
/* end copy */

liblinebreak-2.0 : Line breaking in a Unicode sequence. Designed to be used in a generic text renderer.

typedef unsigned char    utf8_t;        /**< One UTF-8 code unit (a single byte) */
typedef unsigned short    utf16_t;    /**< One UTF-16 code unit; NOTE(review): assumes unsigned short is 16 bits wide */
typedef unsigned int    utf32_t;    /**< One UTF-32 code unit / decoded character; NOTE(review): assumes unsigned int is at least 32 bits wide */

/**
 * Gets the next Unicode character in a UTF-8 sequence.  The index will
 * be advanced to the next complete character, unless the end of string
 * is reached in the middle of a UTF-8 sequence.
 *
 * @param[in]     s        input UTF-8 string
 * @param[in]     len    length of the string in bytes
 * @param[in,out] ip    pointer to the index
 * @return                the Unicode character beginning at the index; or
 *                        #EOS if end of input is encountered
 */
utf32_t lb_get_next_char_utf8(
        const utf8_t *s,
        size_t len,
        size_t *ip)
{
    utf8_t ch;
    utf32_t res;

    assert(*ip <= len);
    if (*ip == len)
        return EOS;
    ch = s[*ip];

    if (ch < 0xC2 || ch > 0xF4)
    {    /* One-byte sequence, tail (should not occur), or invalid */
        *ip += 1;
        return ch;
    }
    else if (ch < 0xE0)
    {    /* Two-byte sequence */
        if (*ip + 2 > len)
            return EOS;
        res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
        *ip += 2;
        return res;
    }
    else if (ch < 0xF0)
    {    /* Three-byte sequence */
        if (*ip + 3 > len)
            return EOS;
        res = ((ch & 0x0F) << 12) +
              ((s[*ip + 1] & 0x3F) << 6) +
              ((s[*ip + 2] & 0x3F));
        *ip += 3;
        return res;
    }
    else
    {    /* Four-byte sequence */
        if (*ip + 4 > len)
            return EOS;
        res = ((ch & 0x07) << 18) +
              ((s[*ip + 1] & 0x3F) << 12) +
              ((s[*ip + 2] & 0x3F) << 6) +
              ((s[*ip + 3] & 0x3F));
        *ip += 4;
        return res;
    }
}

/**
 * Gets the next Unicode character in a UTF-16 sequence.  The index will
 * be advanced to the next complete character, unless the end of string
 * is reached in the middle of a UTF-16 surrogate pair.
 *
 * @param[in]     s        input UTF-16 string
 * @param[in]     len    length of the string in words
 * @param[in,out] ip    pointer to the index
 * @return                the Unicode character beginning at the index; or
 *                        #EOS if end of input is encountered
 */
utf32_t lb_get_next_char_utf16(
        const utf16_t *s,
        size_t len,
        size_t *ip)
{
    utf16_t ch;

    assert(*ip <= len);
    if (*ip == len)
        return EOS;
    ch = s[(*ip)++];

    if (ch < 0xD800 || ch > 0xDBFF)
    {    /* If the character is not a high surrogate */
        return ch;
    }
    if (*ip == len)
    {    /* If the input ends here (an error) */
        --(*ip);
        return EOS;
    }
    if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
    {    /* If the next character is not the low surrogate (an error) */
        return ch;
    }
    /* Return the constructed character and advance the index again */
    return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
}

/**
 * Reads the element at index \a *ip of a UTF-32 string and moves the index
 * to the next element.
 *
 * @param[in]     s        input UTF-32 string
 * @param[in]     len    length of the string in dwords
 * @param[in,out] ip    pointer to the index
 * @return                the Unicode character beginning at the index; or
 *                        #EOS if end of input is encountered
 */
utf32_t lb_get_next_char_utf32(
        const utf32_t *s,
        size_t len,
        size_t *ip)
{
    utf32_t ch;

    assert(*ip <= len);
    if (*ip != len)
    {
        ch = s[*ip];
        *ip += 1;
        return ch;
    }
    return EOS;
}

Java 的 String 内部是 char[]（char 数组），每个 char 为 16 比特，即 2 字节。某个字符（如汉字“中”）按 UTF-8 编码时，字节数组为 {-28, -72, -83}。

在java中用String表示则内部char[]为{0x4e2d}，长度为1；而在c中用char[]或char*表示则直接为{-28, -72, -83}，长度为3

        String one = "中";
        // Number of UTF-16 code units (Java chars) in the string, not bytes.
        int unicodeCodeUnitCount = one.length();
        // Unicode code point of the first character, as a hex string.
        String unicodeCodePointValue = Integer.toHexString(one.codePointAt(0));
        // Bytes of the string when encoded as UTF-8. The charset is named
        // explicitly because the no-argument getBytes() uses the platform
        // default charset, which is not guaranteed to be UTF-8.
        byte[] storedBytesUtf8 = one.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        char char16Bit = one.charAt(0);

        // Character.SIZE is the width of a char in bits.
        System.out.println("sizeof char = " + Character.SIZE + " (bits in Java)");
        System.out.println("one.length() = " + unicodeCodeUnitCount + ", one.codePointAt(0) = " + unicodeCodePointValue);
        System.out.println("one.getBytes().length() = " + storedBytesUtf8.length + " : " + bytesToHexString(storedBytesUtf8)
                + ", " + Arrays.toString(storedBytesUtf8));
        System.out.println("one.charAt(0) = " + char16Bit + " = " + Integer.toBinaryString(char16Bit) + " = "
                + Integer.toHexString(char16Bit));
        // Number of chars needed to represent this code point.
        System.out.println(Character.charCount(char16Bit));

        // Decode the three UTF-8 bytes back into a String, again naming the charset.
        System.out.println(Integer.toHexString(
                new String(new byte[] { -28, -72, -83 }, java.nio.charset.StandardCharsets.UTF_8).charAt(0)));

        // Expected output:
        // sizeof char = 16 (bits in Java)
        // one.length() = 1, one.codePointAt(0) = 4e2d
        // one.getBytes().length() = 3 : e4b8ad, [-28, -72, -83]
        // one.charAt(0) = 中 = 100111000101101 = 4e2d
        // 1
        // 4e2d

Unicode符号范围        | UTF-8编码方式（变长编码）
(十六进制)             | 字节数 | 首字节范围   | 二进制
----------------------+--------+--------------+-----------------------------------------------------
0000 0000 - 0000 007F | 单字节 | [0x00, 0x7F] | 0xxxxxxx
0000 0080 - 0000 07FF | 两字节 | [0xC0, 0xE0) | 110xxxxx 10xxxxxx
0000 0800 - 0000 FFFF | 三字节 | [0xE0, 0xF0) | 1110xxxx 10xxxxxx 10xxxxxx
0001 0000 - 001F FFFF | 四字节 | [0xF0, 0xF8) | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0020 0000 - 03FF FFFF | 五字节 | [0xF8, 0xFC) | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0400 0000 - 7FFF FFFF | 六字节 | [0xFC, 0xFE) | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

Unicode符号范围	UTF-8编码方式（变长编码）
十六进制	Bytes	首字节范围	二进制
0000 0000 - 0000 007F	单字节	[0x00, 0x7F]	0xxxxxxx
0000 0080 - 0000 07FF	两字节	[0xC0, 0xE0)	110xxxxx 10xxxxxx
0000 0800 - 0000 FFFF	三字节	[0xE0, 0xF0)	1110xxxx 10xxxxxx 10xxxxxx
0001 0000 - 001F FFFF	四字节	[0xF0, 0xF8)	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0020 0000 - 03FF FFFF	五字节	[0xF8, 0xFC)	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0400 0000 - 7FFF FFFF	六字节	[0xFC, 0xFE)	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

http://ideone.com/

#include <stdio.h>

typedef unsigned char    utf8_t;        //< One UTF-8 code unit (a single byte)
typedef unsigned short    utf16_t;    //< One UTF-16 code unit; NOTE(review): assumes unsigned short is 16 bits wide
typedef unsigned int    utf32_t;    //< One UTF-32 code unit / decoded code point; NOTE(review): assumes unsigned int is at least 32 bits wide

#define STRING_TOO_SHORT (-1)        // fewer bytes are available than the sequence needs
#define ILLEGAL_CHARACTER (-2)        // first byte does not start with 0xxxxxxx, 110xxxxx, 1110xxxx, etc.
#define UNEXPECTED_CHARACTER (-3)    // a subsequent byte is not of the form 10xxxxxx
// -4 = character encoded incorrectly (not minimal length); no macro is defined for this code.

//   Unicode code point range | UTF-8 encoding (variable length)
//      (hexadecimal)         | bytes | first-byte range | bit pattern
//----------------------+-------+------------+-----------------------------------------------------
//0000 0000 - 0000 007F | 1 byte  [0x00, 0x7F] 0xxxxxxx
//0000 0080 - 0000 07FF | 2 bytes [0xC0, 0xE0) 110xxxxx 10xxxxxx
//0000 0800 - 0000 FFFF | 3 bytes [0xE0, 0xF0) 1110xxxx 10xxxxxx 10xxxxxx
//0001 0000 - 001F FFFF | 4 bytes [0xF0, 0xF8) 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//0020 0000 - 03FF FFFF | 5 bytes [0xF8, 0xFC) 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
//0400 0000 - 7FFF FFFF | 6 bytes [0xFC, 0xFE) 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

// Number of bytes (1..6) needed to encode 'codepoint' as UTF-8.
static inline size_t utf8_seqlen(utf32_t codepoint)
{
    // first_beyond[k] is the smallest code point that no longer fits in k+1 bytes
    static const utf32_t first_beyond[] = {
        0x0000080, 0x0000800, 0x0010000, 0x0200000, 0x4000000
    };
    size_t nbytes = 1;

    while (nbytes < 6 && codepoint >= first_beyond[nbytes - 1])
        nbytes++;

    return nbytes;
}

// Write the UTF-8 encoding of 'codepoint' into 'str' (room for 'len' bytes,
// no NUL terminator added) and return the number of bytes it occupies.
// With a NULL 'str' only that length is returned. If 'len' is too small,
// -1 is returned, which the size_t return type turns into the largest
// size_t value.
static size_t utf8_putc(char *str, size_t len, utf32_t codepoint)
{
    // Marker bits and payload mask of the lead byte, indexed by nbytes - 1
    static const unsigned char lead_marker[6] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
    static const unsigned char lead_mask[6]   = { 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

    int nbytes = utf8_seqlen(codepoint);
    if (!str)
        return nbytes;
    if (len < nbytes)
        return -1;

    // Trailing bytes are filled last-to-first, six payload bits each
    for (int pos = nbytes - 1; pos > 0; pos--) {
        str[pos] = 0x80 | (codepoint & 0x3f);
        codepoint >>= 6;
    }

    // The remaining high bits go into the lead byte
    if (nbytes >= 1 && nbytes <= 6)
        str[0] = lead_marker[nbytes - 1] | (codepoint & lead_mask[nbytes - 1]);

    return nbytes;
}

//static unsigned char utf8_b0masks[] = { 0x1f, 0x0f, 0x07, 0x03, 0x01};
//#define utf8_extra_first_byte(nbytes) utf8_b0masks[nbytes - 2]

// Decode one UTF-8 sequence (1..6 bytes) from the start of 'str'.
//
// str - bytes to decode
// len - number of bytes available at 'str'
// cp  - receives the decoded code point; written only on success
//
// Returns the number of bytes consumed, or one of STRING_TOO_SHORT,
// ILLEGAL_CHARACTER, UNEXPECTED_CHARACTER. Because the return type is the
// unsigned size_t, those negative codes arrive as very large values: compare
// against (size_t)STRING_TOO_SHORT etc., or cast the result to int.
// Overlong (non-minimal) encodings are not rejected.
static size_t utf8_getc(const utf8_t *str, size_t len, utf32_t  *cp)
{
    unsigned char b0, b0mask;
    int nbytes;     // number of bytes occupied by the next UTF-8 encoded character
    utf32_t value;

    // Check the length before reading anything from the buffer.
    if (len == 0)
        return STRING_TOO_SHORT;

    b0 = str[0];

    if (b0 < 0x80) { // ASCII
        *cp = b0;
        return 1;
    }else if (b0 < 0xc0){ // C1 or continuation
        return ILLEGAL_CHARACTER;
    }else if (b0 < 0xe0) {
        nbytes = 2; b0mask = 0x1f;
    }else if (b0 < 0xf0) {
        nbytes = 3; b0mask = 0x0f;
    }else if (b0 < 0xf8) {
        nbytes = 4; b0mask = 0x07;
    }else if (b0 < 0xfc){
        nbytes = 5; b0mask = 0x03;
    }else if (b0 < 0xfe){
        nbytes = 6; b0mask = 0x01;
    }else
        return ILLEGAL_CHARACTER;

    if (len < (size_t)nbytes)
        return STRING_TOO_SHORT;

    // Accumulate locally so *cp is not left half-written on an error.
    value = b0 & b0mask;
    for (int i = 1; i < nbytes; i++) {
        unsigned char b = str[i];
        if ((b & 0xc0) != 0x80)
            return UNEXPECTED_CHARACTER;
        value = (value << 6) | (utf32_t)(b & 0x3f);
    }

    *cp = value;
    return nbytes;
}


int main(void) {
    // your code goes here
    utf32_t cp;
    size_t seqlen = utf8_getc("22", 2, &cp);
    printf("%d, %d", seqlen, cp);
    return 0;
}

posted @ 2015-04-10 16:01 山岚的一缺阅读(14789) 评论(0) 收藏举报

刷新页面返回顶部