Robin's Blog

记录积累学习成长

:: :: :: :: :: ::

:: ::

公告

使用HTMLParser提取新闻的例子

最近项目开发中编写了一个每日笑话功能：系统每天晚上自动从 Internet 上的固定网站获取一条新的笑话并保存下来。笑话可在工作台显示，并支持前后滚动查看。该功能通过 HTMLParser（一个纯 Java 编写的 HTML 解析库）实现。小编在这里贴出自己编写的、通过 HTMLParser 解析 HTML 文本抓取新闻的案例。实现思路如下：
设置网络代理
分析网站首页的新闻列表，提取【<div class="hotjd"></div>】中所有新闻地址的HTML内容，返回NodeList
提取标题链接标签，获取标题。检查数据库中是否已存在该新闻：不存在就提取标题对应的内容并保存，跳出循环节点；已存在就提取下一个链接标签
通过标题标签提取相应的内容
去除新闻中href包含cheshi.com的<a>标签
downloadImages方法下载内容中的图片
标签： HTMLParser
代码片段(1)
[文件] HtmlParser.java ~ 8KB    下载(55)
view source
print ?
`001` `import` `java.io.File;`
`002` `import` `java.io.FileNotFoundException;`
`003` `import` `java.io.FileOutputStream;`
`004` `import` `java.io.IOException;`
`005` `import` `java.io.InputStream;`
`006` `import` `java.net.URL;`
`007` `import` `java.sql.Connection;`
`008` `import` `java.sql.DriverManager;`
`009` `import` `java.sql.PreparedStatement;`
`010` `import` `java.sql.ResultSet;`
`011` `import` `java.sql.SQLException;`
`012`
`013` `import` `org.apache.log4j.Logger;`
`014` `import` `org.apache.log4j.PropertyConfigurator;`
`015` `import` `org.htmlparser.Node;`
`016` `import` `org.htmlparser.NodeFilter;`
`017` `import` `org.htmlparser.Parser;`
`018` `import` `org.htmlparser.Tag;`
`019` `import` `org.htmlparser.filters.TagNameFilter;`
`020` `import` `org.htmlparser.tags.LinkTag;`
`021` `import` `org.htmlparser.util.NodeIterator;`
`022` `import` `org.htmlparser.util.NodeList;`
`023` `import` `org.htmlparser.util.ParserException;`
`024` `import` `org.htmlparser.util.SimpleNodeIterator;`
`025`
`026` `/**`
`027` `* 分析www.cheshi.com首页新闻`
`028` `* @author j.li`
`029` `*/`
`030` `public` `class` `HtmlParser {`
`031`     `private` `static` `Logger logger;`
`032`     `private` `Connection conn =` `null;`
`033`     `private` `static` `final` `String SiteName =` `"";`
`034`
`035`     `public` `void` `indexNewsContent(String sitepath)` `throws` `Exception {`
`036`         `logger.info("分析网站【"` `+ sitepath +` `"】首页的新闻列表，内容为【<div class=\"hotjd\"></div>】所有网页新闻地址的HTML内容。");`
`037`         `Parser myParser =` `new` `Parser(sitepath);`
`038`         `myParser.setEncoding("GBK");`
`039`         `NodeList nodeList = myParser.extractAllNodesThatMatch(new` `NodeFilter() {`
`040`             `public` `boolean` `accept(Node node) {`
`041`                 `return` `((node` `instanceof` `Tag)`
`042`                         `&& !((Tag)node).isEndTag()`
`043`                         `&& ((Tag)node).getTagName().equals("DIV")`
`044`                         `&& ((Tag)node).getAttribute("class") !=` `null`
`045`                         `&& ((Tag)node).getAttribute("class").equals("w_box"));`
`046`             `}`
`047`         `});`
`048`         `Node node = nodeList.elementAt(1);`
`049`         `logger.debug(node.toHtml());`
`050`         `extractText(node.toHtml());`
`051`     `}`
`052`
`053`     `public` `void` `extractText(String inputHtml)` `throws` `Exception {`
`054`         `Parser parser = Parser.createParser(inputHtml,` `"GBK");`
`055`         `TagNameFilter filter =` `new` `TagNameFilter("a");`
`056`         `NodeList nodeList = parser.extractAllNodesThatMatch(filter);`
`057`         `NodeIterator it = nodeList.elements();`
`058`         `getConnection();`
`059`         `while` `(it.hasMoreNodes()) {`
`060`             `LinkTag node = (LinkTag) it.nextNode();`
`061`             `String href = node.getLink();`
`062`             `String title = node.getLinkText();`
`063`             `logger.info("分析首页新闻【"+title+"】，链接地址【"+href+"】");`
`064`             `try` `{`
`065`                 `if(!newsExist(title)) {`
`066`                     `insertDataBase(title, extractContent(href));`
`067`                 `}` `else` `{`
`068`                     `logger.info("新闻【"+title+"】数据库中已经存在，忽略进入下一个新闻分析！");`
`069`                 `}`
`070`             `}` `catch` `(SQLException e) {`
`071`                 `logger.error("插入数据库新闻记录异常！"` `+ e.getMessage());`
`072`                 `e.printStackTrace();`
`073`             `}` `catch` `(Exception e) {`
`074`                 `logger.error(e.getMessage());`
`075`                 `logger.info("分析新闻【"+title+"】，链接地址【"+href+"】失败，进入下一个新闻分析。");`
`076`                 `e.printStackTrace();`
`077`             `}`
`078`         `}`
`079`         `closeConnection();`
`080`     `}`
`081`
`082`     `public` `String extractContent(String content)` `throws` `Exception {`
`083`         `try` `{`
`084`             `Parser myParser =` `new` `Parser(content);`
`085`             `myParser.setEncoding("GBK");`
`086`             `NodeList nodeList = myParser.extractAllNodesThatMatch(new` `NodeFilter() {`
`087`                 `public` `boolean` `accept(Node node) {`
`088`                     `return` `((node` `instanceof` `Tag)`
`089`                             `&& !((Tag)node).isEndTag()`
`090`                             `&& ((Tag)node).getTagName().equals("DIV")`
`091`                             `&& ((Tag)node).getAttribute("class") !=` `null`
`092`                             `&& ((Tag)node).getAttribute("class").equals("cs_content"));`
`093`                 `}`
`094`             `});`
`095`             `int` `size = nodeList.size();`
`096`             `Node node = nodeList.elementAt(size -` `1);`
`097`             `content = node.toHtml();`
`098`             `logger.debug("==========extractContent==============");`
`099`             `logger.debug(content);`
`100`         `}` `catch` `(Exception pe) {`
`101`             `logger.error("分析新闻页面出现异常！"` `+ pe.getMessage() +` `"原因可能出现于新闻页面不存在<div class=\"cs_content\"></div>标记。");`
`102`             `throw` `pe;`
`103`         `}`
`104`         `return` `removeTagA(content);`
`105`     `}`
`106`
`107`     `/**`
`108`      `* 去除新闻中href包含cheshi.com的<a>标签`
`109`      `* @param content 分析html内容`
`110`      `* @return 分析处理后的html内容`
`111`      `*/`
`112`     `public` `String removeTagA(String content)` `throws` `ParserException {`
`113`         `Parser myParser =` `new` `Parser(content);`
`114`         `myParser.setEncoding("GBK");`
`115`         `NodeList nodeList = myParser.extractAllNodesThatMatch(new` `TagNameFilter("a"));`
`116`         `SimpleNodeIterator it = nodeList.elements();`
`117`         `while` `(it.hasMoreNodes()) {`
`118`             `LinkTag node = (LinkTag)it.nextNode();`
`119`             `logger.info("移除新闻内容中包含的文字、图片的链接【"+node.toHtml()+"】。");`
`120`             `if(node.getLink().indexOf("cheshi.com") > -1)`
`121`                 `content = content.replace(node.toHtml(), node.getStringText());`
`122`         `}`
`123`         `logger.debug("==========removeTagA==============");`
`124`         `logger.debug(content);`
`125`         `return` `downloadImages(content,` `"D:\\autodata\\upload\\intersite", SiteName +"upload/intersite");`
`126`     `}`
`127`
`128`     `public` `String downloadImages(String content, String uploadImgPath, String localhost)throws` `ParserException {`
`129`         `File f =` `new` `File(uploadImgPath);`
`130`         `if(!f.exists()) {`
`131`             `f.mkdirs();`
`132`         `}`
`133`         `Parser myParser =` `new` `Parser(content);`
`134`         `myParser.setEncoding("GBK");`
`135`         `NodeList nodeList = myParser.extractAllNodesThatMatch(new` `TagNameFilter("img"));`
`136`         `SimpleNodeIterator it = nodeList.elements();`
`137`         `while(it.hasMoreNodes()) {`
`138`             `Tag tag = (Tag)it.nextNode();`
`139`             `String src = tag.getAttribute("src");`
`140`             `String filename = src.substring(src.lastIndexOf("/") +` `1);`
`141`             `InputStream is =` `null;`
`142`             `FileOutputStream fos =` `null;`
`143`             `try` `{`
`144`                 `URL url =` `new` `URL(src);`
`145`                 `is = url.openStream();`
`146`                 `int` `bytesRead =` `0;`
`147`                 `byte[] buff =` `new` `byte[1024];`
`148`                 `fos =` `new` `FileOutputStream(uploadImgPath+"/"+filename);`
`149`                 `while((bytesRead = is.read(buff,` `0, buff.length)) != -1){`
`150`                     `fos.write(buff,` `0, bytesRead);`
`151`                 `}`
`152`                 `content = content.replace(src, localhost +` `"/"` `+ filename);`
`153`             `}` `catch(FileNotFoundException notFoundException) {`
`154`                 `notFoundException.printStackTrace();`
`155`             `}` `catch(IOException ioe) {`
`156`                 `ioe.printStackTrace();`
`157`             `}` `finally` `{`
`158`                 `try` `{`
`159`                     `if(fos !=` `null) fos.close();`
`160`                     `if(is !=` `null) is.close();`
`161`                 `}` `catch(IOException ioe) {`
`162`                     `ioe.printStackTrace();`
`163`                 `}`
`164`             `}`
`165`         `}`
`166`         `logger.debug("=================downloadImages==================");`
`167`         `logger.debug(content);`
`168`         `return` `content;`
`169`     `}`
`170`
`171`     `public` `void` `getConnection() {`
`172`         `try` `{`
`173`             `Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");`
`174`             `String strCon ="jdbc:microsoft:sqlserver://192.168.99.188:12580;databaseName=Project2009;SelectMethod=cursor";`
`175`             `String strUserName =` `"sa";`
`176`             `String strPWD =` `"qsyjcsxdl@@@web2009@@@";`
`177`             `conn = DriverManager.getConnection(strCon, strUserName, strPWD);`
`178`         `}` `catch` `(java.lang.ClassNotFoundException cnfe) {`
`179`             `cnfe.printStackTrace();`
`180`         `}` `catch` `(SQLException se) {`
`181`             `se.printStackTrace();`
`182`         `}`
`183`     `}`
`184`
`185`     `public` `void` `closeConnection() {`
`186`         `try` `{`
`187`             `if(conn!=` `null` `&& !conn.isClosed()) conn.close();`
`188`         `}` `catch` `(SQLException se) {`
`189`             `se.printStackTrace();`
`190`         `}`
`191`     `}`
`192`
`193`     `public` `void` `insertDataBase(String newsTitle, String newsContent)` `throws` `SQLException {`
`194`         `PreparedStatement pstmt =` `null;`
`195`         `try` `{`
`196`             `pstmt = conn.prepareStatement("INSERT INTO FumNews(NewsTitle, NewsContext, NewsState) values(?, ?, ?)");`
`197`             `pstmt.setString(1, newsTitle);`
`198`             `pstmt.setString(2, newsContent);`
`199`             `pstmt.setInt(3,` `1);`
`200`             `pstmt.executeUpdate();`
`201`         `}` `catch(SQLException e) {`
`202`             `throw` `e;`
`203`         `}` `finally` `{`
`204`             `try` `{`
`205`                 `if(pstmt !=` `null) pstmt.close();`
`206`             `}` `catch(SQLException e) {`
`207`                 `e.printStackTrace();`
`208`             `}`
`209`         `}`
`210`     `}`
`211`
`212`     `public` `boolean` `newsExist(String title)` `throws` `SQLException {`
`213`         `PreparedStatement pstmt =` `null;`
`214`         `try` `{`
`215`             `pstmt = conn.prepareStatement("SELECT top 1 NewsId from FumNews where NewsTitle = ?");`
`216`             `pstmt.setString(1, title);`
`217`             `ResultSet rs = pstmt.executeQuery();`
`218`             `return` `rs.next();`
`219`         `}` `catch(SQLException e) {`
`220`             `throw` `e;`
`221`         `}` `finally` `{`
`222`             `try` `{`
`223`                 `if(pstmt !=` `null) pstmt.close();`
`224`             `}` `catch(SQLException e) {`
`225`                 `e.printStackTrace();`
`226`             `}`
`227`         `}`
`228`     `}`
`229`
`230`     `public` `static` `void` `main(String[] args) {`
`231`         `HtmlParser html =` `new` `HtmlParser();`
`232` `//      设置代理链接网络`
`233` `//      System.getProperties().put("proxySet", "true");`
`234` `//      System.getProperties().put("proxyHost", "192.168.99.100");`
`235` `//      System.getProperties().put("proxyPort", "80");`
`236`         `URL url = html.getClass().getResource("log4j.properties");`
`237`         `PropertyConfigurator.configure(url);`
`238`         `logger = Logger.getLogger(HtmlParser.class);`
`239`         `try` `{`
`240`             `html.indexNewsContent("http://www.cheshi.com/");`
`241`         `}` `catch` `(Exception e) {`
`242`             `e.printStackTrace();`
`243`             `logger.error("分析网页遇到错误，原因："+e.getMessage());`
`244`         `}`
`245`         `logger.info("分析网页内容完成。");`
`246`     `}`
`247` `}`

posted on 2012-04-26 11:58 Robin99 阅读(298) 评论(0) 收藏举报

刷新页面返回顶部