夜隼

RYSZ

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅  :: 管理
/**
 * Parses HTML read from {@code stream} and converts the resulting W3C DOM
 * into the {@code Document} type produced by {@code DOMReader}.
 *
 * <p>The HTML is parsed by a Xerces {@code DOMParser} driven by an
 * {@code HTMLConfiguration} whose element names are configured to be
 * lower-cased. Any DOCTYPE node is removed from the W3C document before it
 * is handed to {@code DOMReader}.
 *
 * <p>This method does not close {@code stream} itself.
 *
 * @param stream  the HTML input; when {@code null} the method returns {@code null}
 * @param charset name of the charset used to decode {@code stream}; when
 *                {@code StringUtils.isEmpty(charset)} holds, {@code DEFAULT_CHARSET}
 *                is used instead
 * @return the converted document, or {@code null} if {@code stream} is
 *         {@code null} or parsing failed with a {@code SAXException} or
 *         {@code IOException} (the stack trace is printed in that case)
 */
public static Document transferByNeko(InputStream stream, String charset)
    {
        if (stream == null) {
            return null;
        }

        if (StringUtils.isEmpty(charset)) {
            charset = DEFAULT_CHARSET;
        }

        // Original author's note: NekoHTML's own DOMParser converts HTML tag
        // names to upper case, and setting the names/elems and names/attrs
        // properties to "lower" on it made no difference. The workaround is to
        // use the Xerces DOMParser on top of an HTMLConfiguration instead.
        Document doc = null;
        try {
            HTMLConfiguration htmlConfiguration = new HTMLConfiguration();
            htmlConfiguration.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
            org.apache.xerces.parsers.DOMParser parser =
                    new org.apache.xerces.parsers.DOMParser(htmlConfiguration);
            parser.parse(new InputSource(new InputStreamReader(stream, Charset.forName(charset))));

            org.w3c.dom.Document document = parser.getDocument();
            // Drop the DOCTYPE node before converting the document.
            DocumentType documentType = document.getDoctype();
            if (documentType != null) {
                document.removeChild(documentType);
            }
            doc = new DOMReader().read(document);
        } catch (SAXException e) {
            // Best effort: report the failure and fall through to return null.
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }
    

 

posted on 2018-03-23 08:23  夜隼  阅读(390)  评论(0)  编辑  收藏  举报