DIH处理包含回车符、换行符和HTML标签的文本内容

数据样例:2010-03-19 10:18:06130010543234203guqun09-12月-12liuyin18-6月 -14<P style="MARGIN-TOP: 0px; TEXT-JUSTIFY: inter-ideograph; FONT-SIZE: 12pt; MARGIN-BOTTOM: 0px; TEXT-INDENT: 2em; LINE-HEIGHT: 1.5; FONT-FAMILY: 宋体; TEXT-ALIGN: justify">天翼分享是基于中国电信CDMA网络,为天翼客户提供视频、音频等多媒体信息分享服务的移动互联网产品。多媒体信息可以是客户自己录制的视频、拍摄的照片、制作的动漫或MTV等,也可以是中国电信提供的优质多媒体信息内容。

第一种方法:

使用ScriptTransformer去除制表符、回车符和换行符。

数据源:

<!--
  JDBC data source named "jdbc", pointing at a local Oracle instance.
  NOTE(review): the credentials are stored inline in plain text; consider
  supplying them from outside the config file in a real deployment.
-->
<dataSource
    name="jdbc"
    driver="oracle.jdbc.driver.OracleDriver"
    url="jdbc:oracle:thin:@127.0.0.1:1521:ORCLLI"
    user="kms_user"
    password="kms_user"/>

 

ScriptTransformer脚本模块:

<script>
    <![CDATA[
    /**
     * ScriptTransformer hook: removes every tab, carriage return and line
     * feed from the row's CONTENT column.
     *
     * @param row the current DIH row; read and updated through get()/put()
     * @return the same row object, with CONTENT cleaned when it is present
     */
    function regex(row) {
        var content = row.get('CONTENT');
        // A row whose CONTENT is missing is passed through untouched;
        // handing null to Pattern.matcher() would throw and fail the row.
        if (content == null) {
            return row;
        }
        // The doubled backslashes survive JavaScript string parsing, so the
        // Java regex engine receives [\t\r\n] and resolves the escapes itself.
        var pattern = java.util.regex.Pattern.compile("[\\t\\r\\n]");
        row.put('CONTENT', pattern.matcher(content).replaceAll(""));
        return row;
    }
    ]]>
</script>

 document:

<document>
    <!--
      Root entity: one TM_DETAILS row per indexed document.
      The transformers run in the order listed: the CLOB is turned into a
      string, HTML tags are stripped, the "regex" script function removes
      tab / CR / LF characters, and finally the date column is parsed.
    -->
    <entity name="tm_details" query="select t.docid as id,t.tempid,t.cruser as userid,t.crtime,t.content from TM_DETAILS t  where t.type=2 and  t.docid=10479"
            transformer="ClobTransformer,HTMLStripTransformer,script:regex,DateFormatTransformer">
        <field column="ID" name="id"/>
        <field column="TEMPID" name="tempid"/>

        <!-- Template name looked up by the parent row's TEMPID. -->
        <entity name="template" query="select te.name from kmstemplate  te where te.id=${tm_details.TEMPID}">
            <field column="NAME" name="template"/>
        </entity>

        <!-- Display name of the creating user, looked up by login id. -->
        <entity name="user" query="select msg.name  from tb_sys_loginmsg msg where msg.login_id='${tm_details.USERID}'">
            <field column="NAME" name="cruser"/>
        </entity>

        <field column="CRTIME" name="crtime" dateTimeFormat="yyyy-MM-dd HH:mm:ss"/>

        <!--
          Document title without its ".htm" suffix. An anchored
          regexp_replace is used because Oracle's rtrim(x, '.htm') treats the
          second argument as a character set and would also eat trailing
          '.', 'h', 't' or 'm' characters belonging to the title itself.
        -->
        <entity name="doc" query="select regexp_replace(d.doctitle,'\.htm$') as title  from kmsdocument d where d.docid=${tm_details.ID}">
            <field column="TITLE" name="title" clob="true"/>
        </entity>

        <!-- Body text: CLOB converted to a string, then HTML stripped. -->
        <field column="CONTENT" name="content" clob="true" stripHTML="true"/>
    </entity>
</document>

 

第二种方法:采用RegexTransformer:

document:

<document>
    <!--
      Root entity: one TM_DETAILS row per indexed document.
      The transformers run in the order listed: the CLOB is turned into a
      string, HTML tags are stripped, RegexTransformer removes tab / CR / LF
      characters, and finally the date column is parsed.
    -->
    <entity name="tm_details" query="select t.docid as id,t.tempid,t.cruser as userid,t.crtime,t.content from TM_DETAILS t  where t.type=2 and  t.docid=10479"
            transformer="ClobTransformer,HTMLStripTransformer,RegexTransformer,DateFormatTransformer">
        <field column="ID" name="id"/>
        <field column="TEMPID" name="tempid"/>

        <!-- Template name looked up by the parent row's TEMPID. -->
        <entity name="template" query="select te.name from kmstemplate  te where te.id=${tm_details.TEMPID}">
            <field column="NAME" name="template"/>
        </entity>

        <!-- Display name of the creating user, looked up by login id. -->
        <entity name="user" query="select msg.name  from tb_sys_loginmsg msg where msg.login_id='${tm_details.USERID}'">
            <field column="NAME" name="cruser"/>
        </entity>

        <field column="CRTIME" name="crtime" dateTimeFormat="yyyy-MM-dd HH:mm:ss"/>

        <!--
          Document title without its ".htm" suffix. An anchored
          regexp_replace is used because Oracle's rtrim(x, '.htm') treats the
          second argument as a character set and would also eat trailing
          '.', 'h', 't' or 'm' characters belonging to the title itself.
        -->
        <entity name="doc" query="select regexp_replace(d.doctitle,'\.htm$') as title  from kmsdocument d where d.docid=${tm_details.ID}">
            <field column="TITLE" name="title" clob="true"/>
        </entity>

        <!--
          Body text. XML attribute values are not unescaped the way script
          string literals are, so the pattern uses single backslashes: the
          Java regex engine then reads \t, \r and \n as the control
          characters. A doubled backslash here would match a literal
          backslash followed by "t" and leave real tabs in place.
        -->
        <field column="CONTENT" name="content" clob="true" stripHTML="true" regex="[\t\r\n]" replaceWith=""/>
    </entity>
</document>

 

posted @ 2014-09-28 16:19  勿妄  阅读(701)  评论(0)  编辑  收藏  举报