代码片段(1)
[文件] HtmlParser.java ~ 8KB 下载(55)
001import java.io.File;
002import java.io.FileNotFoundException;
003import java.io.FileOutputStream;
004import java.io.IOException;
005import java.io.InputStream;
006import java.net.URL;
007import java.sql.Connection;
008import java.sql.DriverManager;
009import java.sql.PreparedStatement;
010import java.sql.ResultSet;
011import java.sql.SQLException;
012
013import org.apache.log4j.Logger;
014import org.apache.log4j.PropertyConfigurator;
015import org.htmlparser.Node;
016import org.htmlparser.NodeFilter;
017import org.htmlparser.Parser;
018import org.htmlparser.Tag;
019import org.htmlparser.filters.TagNameFilter;
020import org.htmlparser.tags.LinkTag;
021import org.htmlparser.util.NodeIterator;
022import org.htmlparser.util.NodeList;
023import org.htmlparser.util.ParserException;
024import org.htmlparser.util.SimpleNodeIterator;
025
026/**
027 * 分析www.cheshi.com首页新闻
028 * @author j.li
029 */
030public class HtmlParser {
031 private static Logger logger;
032 private Connection conn = null;
033 private static final String SiteName = "";
034
035 public void indexNewsContent(String sitepath) throws Exception {
036 logger.info("分析网站【" + sitepath + "】首页的新闻列表,内容为【<div class=\"hotjd\"></div>】所有网页新闻地址的HTML内容。");
037 Parser myParser = new Parser(sitepath);
038 myParser.setEncoding("GBK");
039 NodeList nodeList = myParser.extractAllNodesThatMatch(new NodeFilter() {
040 public boolean accept(Node node) {
041 return ((node instanceof Tag)
042 && !((Tag)node).isEndTag()
043 && ((Tag)node).getTagName().equals("DIV")
044 && ((Tag)node).getAttribute("class") != null
045 && ((Tag)node).getAttribute("class").equals("w_box"));
046 }
047 });
048 Node node = nodeList.elementAt(1);
049 logger.debug(node.toHtml());
050 extractText(node.toHtml());
051 }
052
053 public void extractText(String inputHtml) throws Exception {
054 Parser parser = Parser.createParser(inputHtml, "GBK");
055 TagNameFilter filter = new TagNameFilter("a");
056 NodeList nodeList = parser.extractAllNodesThatMatch(filter);
057 NodeIterator it = nodeList.elements();
058 getConnection();
059 while (it.hasMoreNodes()) {
060 LinkTag node = (LinkTag) it.nextNode();
061 String href = node.getLink();
062 String title = node.getLinkText();
063 logger.info("分析首页新闻【"+title+"】,链接地址【"+href+"】");
064 try {
065 if(!newsExist(title)) {
066 insertDataBase(title, extractContent(href));
067 } else {
068 logger.info("新闻【"+title+"】数据库中已经存在,忽略进入下一个新闻分析!");
069 }
070 } catch (SQLException e) {
071 logger.error("插入数据库新闻记录异常!" + e.getMessage());
072 e.printStackTrace();
073 } catch (Exception e) {
074 logger.error(e.getMessage());
075 logger.info("分析新闻【"+title+"】,链接地址【"+href+"】失败,进入下一个新闻分析。");
076 e.printStackTrace();
077 }
078 }
079 closeConnection();
080 }
081
082 public String extractContent(String content) throws Exception {
083 try {
084 Parser myParser = new Parser(content);
085 myParser.setEncoding("GBK");
086 NodeList nodeList = myParser.extractAllNodesThatMatch(new NodeFilter() {
087 public boolean accept(Node node) {
088 return ((node instanceof Tag)
089 && !((Tag)node).isEndTag()
090 && ((Tag)node).getTagName().equals("DIV")
091 && ((Tag)node).getAttribute("class") != null
092 && ((Tag)node).getAttribute("class").equals("cs_content"));
093 }
094 });
095 int size = nodeList.size();
096 Node node = nodeList.elementAt(size - 1);
097 content = node.toHtml();
098 logger.debug("==========extractContent==============");
099 logger.debug(content);
100 } catch (Exception pe) {
101 logger.error("分析新闻页面出现异常!" + pe.getMessage() + "原因可能出现于新闻页面不存在<div class=\"cs_content\"></div>标记。");
102 throw pe;
103 }
104 return removeTagA(content);
105 }
106
107 /**
108 * 去除新闻中href包含cheshi.com的<a>标签
109 * @param content 分析html内容
110 * @return 分析处理后的html内容
111 */
112 public String removeTagA(String content) throws ParserException {
113 Parser myParser = new Parser(content);
114 myParser.setEncoding("GBK");
115 NodeList nodeList = myParser.extractAllNodesThatMatch(new TagNameFilter("a"));
116 SimpleNodeIterator it = nodeList.elements();
117 while (it.hasMoreNodes()) {
118 LinkTag node = (LinkTag)it.nextNode();
119 logger.info("移除新闻内容中包含的文字、图片的链接【"+node.toHtml()+"】。");
120 if(node.getLink().indexOf("cheshi.com") > -1)
121 content = content.replace(node.toHtml(), node.getStringText());
122 }
123 logger.debug("==========removeTagA==============");
124 logger.debug(content);
125 return downloadImages(content, "D:\\autodata\\upload\\intersite", SiteName +"upload/intersite");
126 }
127
128 public String downloadImages(String content, String uploadImgPath, String localhost)throws ParserException {
129 File f = new File(uploadImgPath);
130 if(!f.exists()) {
131 f.mkdirs();
132 }
133 Parser myParser = new Parser(content);
134 myParser.setEncoding("GBK");
135 NodeList nodeList = myParser.extractAllNodesThatMatch(new TagNameFilter("img"));
136 SimpleNodeIterator it = nodeList.elements();
137 while(it.hasMoreNodes()) {
138 Tag tag = (Tag)it.nextNode();
139 String src = tag.getAttribute("src");
140 String filename = src.substring(src.lastIndexOf("/") + 1);
141 InputStream is = null;
142 FileOutputStream fos = null;
143 try {
144 URL url = new URL(src);
145 is = url.openStream();
146 int bytesRead = 0;
147 byte[] buff = new byte[1024];
148 fos = new FileOutputStream(uploadImgPath+"/"+filename);
149 while((bytesRead = is.read(buff, 0, buff.length)) != -1){
150 fos.write(buff, 0, bytesRead);
151 }
152 content = content.replace(src, localhost + "/" + filename);
153 } catch(FileNotFoundException notFoundException) {
154 notFoundException.printStackTrace();
155 } catch(IOException ioe) {
156 ioe.printStackTrace();
157 } finally {
158 try {
159 if(fos != null) fos.close();
160 if(is != null) is.close();
161 } catch(IOException ioe) {
162 ioe.printStackTrace();
163 }
164 }
165 }
166 logger.debug("=================downloadImages==================");
167 logger.debug(content);
168 return content;
169 }
170
171 public void getConnection() {
172 try {
173 Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
174 String strCon ="jdbc:microsoft:sqlserver://192.168.99.188:12580;databaseName=Project2009;SelectMethod=cursor";
175 String strUserName = "sa";
176 String strPWD = "qsyjcsxdl@@@web2009@@@";
177 conn = DriverManager.getConnection(strCon, strUserName, strPWD);
178 } catch (java.lang.ClassNotFoundException cnfe) {
179 cnfe.printStackTrace();
180 } catch (SQLException se) {
181 se.printStackTrace();
182 }
183 }
184
185 public void closeConnection() {
186 try {
187 if(conn!= null && !conn.isClosed()) conn.close();
188 } catch (SQLException se) {
189 se.printStackTrace();
190 }
191 }
192
193 public void insertDataBase(String newsTitle, String newsContent) throws SQLException {
194 PreparedStatement pstmt = null;
195 try {
196 pstmt = conn.prepareStatement("INSERT INTO FumNews(NewsTitle, NewsContext, NewsState) values(?, ?, ?)");
197 pstmt.setString(1, newsTitle);
198 pstmt.setString(2, newsContent);
199 pstmt.setInt(3, 1);
200 pstmt.executeUpdate();
201 } catch(SQLException e) {
202 throw e;
203 } finally {
204 try {
205 if(pstmt != null) pstmt.close();
206 } catch(SQLException e) {
207 e.printStackTrace();
208 }
209 }
210 }
211
212 public boolean newsExist(String title) throws SQLException {
213 PreparedStatement pstmt = null;
214 try {
215 pstmt = conn.prepareStatement("SELECT top 1 NewsId from FumNews where NewsTitle = ?");
216 pstmt.setString(1, title);
217 ResultSet rs = pstmt.executeQuery();
218 return rs.next();
219 } catch(SQLException e) {
220 throw e;
221 } finally {
222 try {
223 if(pstmt != null) pstmt.close();
224 } catch(SQLException e) {
225 e.printStackTrace();
226 }
227 }
228 }
229
230 public static void main(String[] args) {
231 HtmlParser html = new HtmlParser();
232// 设置代理链接网络
233// System.getProperties().put("proxySet", "true");
234// System.getProperties().put("proxyHost", "192.168.99.100");
235// System.getProperties().put("proxyPort", "80");
236 URL url = html.getClass().getResource("log4j.properties");
237 PropertyConfigurator.configure(url);
238 logger = Logger.getLogger(HtmlParser.class);
239 try {
240 html.indexNewsContent("http://www.cheshi.com/");
241 } catch (Exception e) {
242 e.printStackTrace();
243 logger.error("分析网页遇到错误,原因:"+e.getMessage());
244 }
245 logger.info("分析网页内容完成。");
246 }
247}
代码片段(1)
[文件] HtmlParser.java ~ 8KB 下载(55)
001 | import java.io.File; |
002 | import java.io.FileNotFoundException; |
003 | import java.io.FileOutputStream; |
004 | import java.io.IOException; |
005 | import java.io.InputStream; |
006 | import java.net.URL; |
007 | import java.sql.Connection; |
008 | import java.sql.DriverManager; |
009 | import java.sql.PreparedStatement; |
010 | import java.sql.ResultSet; |
011 | import java.sql.SQLException; |
012 |
013 | import org.apache.log4j.Logger; |
014 | import org.apache.log4j.PropertyConfigurator; |
015 | import org.htmlparser.Node; |
016 | import org.htmlparser.NodeFilter; |
017 | import org.htmlparser.Parser; |
018 | import org.htmlparser.Tag; |
019 | import org.htmlparser.filters.TagNameFilter; |
020 | import org.htmlparser.tags.LinkTag; |
021 | import org.htmlparser.util.NodeIterator; |
022 | import org.htmlparser.util.NodeList; |
023 | import org.htmlparser.util.ParserException; |
024 | import org.htmlparser.util.SimpleNodeIterator; |
025 |
026 | /** |
027 | * 分析www.cheshi.com首页新闻 |
028 | * @author j.li |
029 | */ |
030 | public class HtmlParser { |
031 | private static Logger logger; |
032 | private Connection conn = null; |
033 | private static final String SiteName = ""; |
034 |
035 | public void indexNewsContent(String sitepath) throws Exception { |
036 | logger.info("分析网站【" + sitepath + "】首页的新闻列表,内容为【<div class=\"hotjd\"></div>】所有网页新闻地址的HTML内容。"); |
037 | Parser myParser = new Parser(sitepath); |
038 | myParser.setEncoding("GBK"); |
039 | NodeList nodeList = myParser.extractAllNodesThatMatch(new NodeFilter() { |
040 | public boolean accept(Node node) { |
041 | return ((node instanceof Tag) |
042 | && !((Tag)node).isEndTag() |
043 | && ((Tag)node).getTagName().equals("DIV") |
044 | && ((Tag)node).getAttribute("class") != null |
045 | && ((Tag)node).getAttribute("class").equals("w_box")); |
046 | } |
047 | }); |
048 | Node node = nodeList.elementAt(1); |
049 | logger.debug(node.toHtml()); |
050 | extractText(node.toHtml()); |
051 | } |
052 | |
053 | public void extractText(String inputHtml) throws Exception { |
054 | Parser parser = Parser.createParser(inputHtml, "GBK"); |
055 | TagNameFilter filter = new TagNameFilter("a"); |
056 | NodeList nodeList = parser.extractAllNodesThatMatch(filter); |
057 | NodeIterator it = nodeList.elements(); |
058 | getConnection(); |
059 | while (it.hasMoreNodes()) { |
060 | LinkTag node = (LinkTag) it.nextNode(); |
061 | String href = node.getLink(); |
062 | String title = node.getLinkText(); |
063 | logger.info("分析首页新闻【"+title+"】,链接地址【"+href+"】"); |
064 | try { |
065 | if(!newsExist(title)) { |
066 | insertDataBase(title, extractContent(href)); |
067 | } else { |
068 | logger.info("新闻【"+title+"】数据库中已经存在,忽略进入下一个新闻分析!"); |
069 | } |
070 | } catch (SQLException e) { |
071 | logger.error("插入数据库新闻记录异常!" + e.getMessage()); |
072 | e.printStackTrace(); |
073 | } catch (Exception e) { |
074 | logger.error(e.getMessage()); |
075 | logger.info("分析新闻【"+title+"】,链接地址【"+href+"】失败,进入下一个新闻分析。"); |
076 | e.printStackTrace(); |
077 | } |
078 | } |
079 | closeConnection(); |
080 | } |
081 |
082 | public String extractContent(String content) throws Exception { |
083 | try { |
084 | Parser myParser = new Parser(content); |
085 | myParser.setEncoding("GBK"); |
086 | NodeList nodeList = myParser.extractAllNodesThatMatch(new NodeFilter() { |
087 | public boolean accept(Node node) { |
088 | return ((node instanceof Tag) |
089 | && !((Tag)node).isEndTag() |
090 | && ((Tag)node).getTagName().equals("DIV") |
091 | && ((Tag)node).getAttribute("class") != null |
092 | && ((Tag)node).getAttribute("class").equals("cs_content")); |
093 | } |
094 | }); |
095 | int size = nodeList.size(); |
096 | Node node = nodeList.elementAt(size - 1); |
097 | content = node.toHtml(); |
098 | logger.debug("==========extractContent=============="); |
099 | logger.debug(content); |
100 | } catch (Exception pe) { |
101 | logger.error("分析新闻页面出现异常!" + pe.getMessage() + "原因可能出现于新闻页面不存在<div class=\"cs_content\"></div>标记。"); |
102 | throw pe; |
103 | } |
104 | return removeTagA(content); |
105 | } |
106 | |
107 | /** |
108 | * 去除新闻中href包含cheshi.com的<a>标签 |
109 | * @param content 分析html内容 |
110 | * @return 分析处理后的html内容 |
111 | */ |
112 | public String removeTagA(String content) throws ParserException { |
113 | Parser myParser = new Parser(content); |
114 | myParser.setEncoding("GBK"); |
115 | NodeList nodeList = myParser.extractAllNodesThatMatch(new TagNameFilter("a")); |
116 | SimpleNodeIterator it = nodeList.elements(); |
117 | while (it.hasMoreNodes()) { |
118 | LinkTag node = (LinkTag)it.nextNode(); |
119 | logger.info("移除新闻内容中包含的文字、图片的链接【"+node.toHtml()+"】。"); |
120 | if(node.getLink().indexOf("cheshi.com") > -1) |
121 | content = content.replace(node.toHtml(), node.getStringText()); |
122 | } |
123 | logger.debug("==========removeTagA=============="); |
124 | logger.debug(content); |
125 | return downloadImages(content, "D:\\autodata\\upload\\intersite", SiteName +"upload/intersite"); |
126 | } |
127 |
128 | public String downloadImages(String content, String uploadImgPath, String localhost)throws ParserException { |
129 | File f = new File(uploadImgPath); |
130 | if(!f.exists()) { |
131 | f.mkdirs(); |
132 | } |
133 | Parser myParser = new Parser(content); |
134 | myParser.setEncoding("GBK"); |
135 | NodeList nodeList = myParser.extractAllNodesThatMatch(new TagNameFilter("img")); |
136 | SimpleNodeIterator it = nodeList.elements(); |
137 | while(it.hasMoreNodes()) { |
138 | Tag tag = (Tag)it.nextNode(); |
139 | String src = tag.getAttribute("src"); |
140 | String filename = src.substring(src.lastIndexOf("/") + 1); |
141 | InputStream is = null; |
142 | FileOutputStream fos = null; |
143 | try { |
144 | URL url = new URL(src); |
145 | is = url.openStream(); |
146 | int bytesRead = 0; |
147 | byte[] buff = new byte[1024]; |
148 | fos = new FileOutputStream(uploadImgPath+"/"+filename); |
149 | while((bytesRead = is.read(buff, 0, buff.length)) != -1){ |
150 | fos.write(buff, 0, bytesRead); |
151 | } |
152 | content = content.replace(src, localhost + "/" + filename); |
153 | } catch(FileNotFoundException notFoundException) { |
154 | notFoundException.printStackTrace(); |
155 | } catch(IOException ioe) { |
156 | ioe.printStackTrace(); |
157 | } finally { |
158 | try { |
159 | if(fos != null) fos.close(); |
160 | if(is != null) is.close(); |
161 | } catch(IOException ioe) { |
162 | ioe.printStackTrace(); |
163 | } |
164 | } |
165 | } |
166 | logger.debug("=================downloadImages=================="); |
167 | logger.debug(content); |
168 | return content; |
169 | } |
170 | |
171 | public void getConnection() { |
172 | try { |
173 | Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver"); |
174 | String strCon ="jdbc:microsoft:sqlserver://192.168.99.188:12580;databaseName=Project2009;SelectMethod=cursor"; |
175 | String strUserName = "sa"; |
176 | String strPWD = "qsyjcsxdl@@@web2009@@@"; |
177 | conn = DriverManager.getConnection(strCon, strUserName, strPWD); |
178 | } catch (java.lang.ClassNotFoundException cnfe) { |
179 | cnfe.printStackTrace(); |
180 | } catch (SQLException se) { |
181 | se.printStackTrace(); |
182 | } |
183 | } |
184 | |
185 | public void closeConnection() { |
186 | try { |
187 | if(conn!= null && !conn.isClosed()) conn.close(); |
188 | } catch (SQLException se) { |
189 | se.printStackTrace(); |
190 | } |
191 | } |
192 | |
193 | public void insertDataBase(String newsTitle, String newsContent) throws SQLException { |
194 | PreparedStatement pstmt = null; |
195 | try { |
196 | pstmt = conn.prepareStatement("INSERT INTO FumNews(NewsTitle, NewsContext, NewsState) values(?, ?, ?)"); |
197 | pstmt.setString(1, newsTitle); |
198 | pstmt.setString(2, newsContent); |
199 | pstmt.setInt(3, 1); |
200 | pstmt.executeUpdate(); |
201 | } catch(SQLException e) { |
202 | throw e; |
203 | } finally { |
204 | try { |
205 | if(pstmt != null) pstmt.close(); |
206 | } catch(SQLException e) { |
207 | e.printStackTrace(); |
208 | } |
209 | } |
210 | } |
211 | |
212 | public boolean newsExist(String title) throws SQLException { |
213 | PreparedStatement pstmt = null; |
214 | try { |
215 | pstmt = conn.prepareStatement("SELECT top 1 NewsId from FumNews where NewsTitle = ?"); |
216 | pstmt.setString(1, title); |
217 | ResultSet rs = pstmt.executeQuery(); |
218 | return rs.next(); |
219 | } catch(SQLException e) { |
220 | throw e; |
221 | } finally { |
222 | try { |
223 | if(pstmt != null) pstmt.close(); |
224 | } catch(SQLException e) { |
225 | e.printStackTrace(); |
226 | } |
227 | } |
228 | } |
229 |
230 | public static void main(String[] args) { |
231 | HtmlParser html = new HtmlParser(); |
232 | // 设置代理链接网络 |
233 | // System.getProperties().put("proxySet", "true"); |
234 | // System.getProperties().put("proxyHost", "192.168.99.100"); |
235 | // System.getProperties().put("proxyPort", "80"); |
236 | URL url = html.getClass().getResource("log4j.properties"); |
237 | PropertyConfigurator.configure(url); |
238 | logger = Logger.getLogger(HtmlParser.class); |
239 | try { |
240 | html.indexNewsContent("http://www.cheshi.com/"); |
241 | } catch (Exception e) { |
242 | e.printStackTrace(); |
243 | logger.error("分析网页遇到错误,原因:"+e.getMessage()); |
244 | } |
245 | logger.info("分析网页内容完成。"); |
246 | } |
247 | } |

浙公网安备 33010602011771号