szgzwf

http://www.etsec.com.cn

导航

java网页数据抓取

对于加密的网站我还没有去研究,不知道能不能抓取,现在只是对一些没有加密的网站进行网页数据抓取。刚开始写的时候以为很多网站都能抓取,但是后来发现很多都加密了。本来以为可以通过网页数据监测工具测出某些地址的数据变化,但是这类工具只能监测到一些通过js显示的数据,依然不能抓取到加密的网站。唉,这个问题以后再说吧。

我抓取的网站是手机号查询和身份证查询的网站。身份证查询的原始地址是 http://qq.ip138.com/idsearch/index.asp ,但当你输入自己的身份证号时,地址就会变成 http://qq.ip138.com/idsearch/index.asp?action=idcard&userid=你输入的身份证号&B1=%B2%E9+%D1%AF 。根据这个变化就可以抓取到输入特定身份证号后的网页源代码,再通过解析读取到需要的数据。下面是代码:

 

[java] view plaincopyprint?
  1. import java.net.* ;
  2. import java.io.* ;
  3. import java.util.regex.* ;
  4. publicclass Capture{
  5. publicstaticvoid main(String args[])throws Exception{
  6. System.out.println("*************************手机号查询************************") ;
  7. System.out.println("我的位置是:" + new GrabMobile().grabMobileLocation("15023141745")) ;
  8. System.out.println("手机卡类型是:" + new GrabMobile().grabMobileType("15023141745")) ;
  9. System.out.println("我的邮编是:" + new GrabMobile().grabMobilePost("15023141745")) ;
  10. System.out.println("*************************身份证查询************************") ;
  11. System.out.println("我的性别是:" + new GrabIdentity().grabIdentitySex("362203199208243575")) ;
  12. System.out.println("我的生日是:" + new GrabIdentity().grabIdentityBirth("362203199208243575")) ;
  13. System.out.println("我的家乡是:" + new GrabIdentity().grabIdentityHome("362203199208243575")) ;
  14. }
  15. }
  /**
   * Scrapes the ip138 mobile-number lookup page and cuts single fields out of
   * the returned HTML.
   *
   * <p>Extraction is positional: a field is the text between a start marker and
   * an end marker, adjusted by fixed character offsets. The offsets are measured
   * against the page with all line terminators removed, which is how
   * {@link #fetchPage(String)} returns it.
   *
   * <p>NOTE(review): the markers below spell the gap between the Chinese
   * characters as the literal HTML entity {@code &nbsp;}. The published listing
   * showed a blank there, but the original offset 12 followed by
   * {@code substring(1)} in {@link #grabMobileType(String)} only lines up when
   * the marker is 15 characters long, i.e. written with the entity. Confirm
   * against the live page.
   */
  class GrabMobile {

      /** Lookup endpoint; the phone number is appended as the {@code mobile} value. */
      private static final String LOOKUP_URL =
              "http://www.ip138.com:8080/search.asp?action=mobile&mobile=";

      /** Charset used to decode the response body. */
      private static final String PAGE_CHARSET = "GBK";

      /** One or more consecutive characters in the range U+4E00..U+9FA5. */
      private static final Pattern CHINESE_RUN = Pattern.compile("([\u4e00-\u9fa5]+)");

      /**
       * Returns the home location reported for a mobile number.
       *
       * @param m the mobile number, appended to the lookup URL as-is
       * @return the Chinese characters found in the location cell
       * @throws Exception if the page cannot be downloaded or a marker is missing
       */
      public String grabMobileLocation(String m) throws Exception {
          String page = fetchPage(m);
          String cell = slice(page, "卡号归属地", 42, "卡&nbsp;类&nbsp;型", -33);
          return drawChMob(cell);
      }

      /**
       * Returns the card type reported for a mobile number.
       *
       * @param m the mobile number, appended to the lookup URL as-is
       * @return the Chinese characters of the card-type cell
       * @throws Exception if the page cannot be downloaded, a marker is missing,
       *         or the cell contains no Chinese text
       */
      public String grabMobileType(String m) throws Exception {
          String page = fetchPage(m);
          String cell = slice(page, "卡&nbsp;类&nbsp;型", 12,
                  "<TD align=\"center\">区&nbsp;号</TD>", 0);
          String chinese = drawChMob(cell);
          if (chinese.isEmpty()) {
              throw new IOException("No Chinese text found in the card-type cell");
          }
          // Offset 12 starts inside the label's second entity, so the slice still
          // holds the label's last character; drop it to keep only the value.
          return chinese.substring(1);
      }

      /**
       * Returns the postal code reported for a mobile number.
       *
       * @param m the mobile number, appended to the lookup URL as-is
       * @return the raw text of the postal-code cell (no filtering is applied)
       * @throws Exception if the page cannot be downloaded or a marker is missing
       */
      public String grabMobilePost(String m) throws Exception {
          String page = fetchPage(m);
          return slice(page, "邮&nbsp;编", 40, "更详细的..", -55);
      }

      /**
       * Concatenates every run of Chinese characters found in {@code str}.
       *
       * @param str text to scan, typically an HTML fragment
       * @return all matched runs joined in order, or an empty string if none match
       */
      public String drawChMob(String str) {
          StringBuilder chinese = new StringBuilder();
          Matcher matcher = CHINESE_RUN.matcher(str);
          while (matcher.find()) {
              chinese.append(matcher.group());
          }
          return chinese.toString();
      }

      /**
       * Downloads the lookup page for {@code m} and returns it as one string with
       * the line terminators removed.
       */
      private static String fetchPage(String m) throws IOException {
          HttpURLConnection connection =
                  (HttpURLConnection) new URL(LOOKUP_URL + m).openConnection();
          try (BufferedReader reader = new BufferedReader(
                  new InputStreamReader(connection.getInputStream(), PAGE_CHARSET))) {
              StringBuilder page = new StringBuilder();
              String line;
              // readLine() strips the terminator; the slicing offsets rely on that.
              while ((line = reader.readLine()) != null) {
                  page.append(line);
              }
              return page.toString();
          } finally {
              connection.disconnect();
          }
      }

      /**
       * Returns {@code page.substring(start + startOffset, end + endOffset)} where
       * {@code start} and {@code end} are the first occurrences of the two markers,
       * both searched from the beginning of the page.
       *
       * @throws IOException if a marker is absent or the resulting range is invalid
       */
      private static String slice(String page, String startMarker, int startOffset,
              String endMarker, int endOffset) throws IOException {
          int start = page.indexOf(startMarker);
          int end = page.indexOf(endMarker);
          if (start < 0 || end < 0) {
              throw new IOException("Unexpected page layout: marker not found ("
                      + (start < 0 ? startMarker : endMarker) + ")");
          }
          int from = start + startOffset;
          int to = end + endOffset;
          if (from > to || to > page.length()) {
              throw new IOException("Unexpected page layout: invalid range "
                      + from + ".." + to + " for marker " + startMarker);
          }
          return page.substring(from, to);
      }
  }
  /**
   * Scrapes the ip138 ID-card lookup page and cuts single fields out of the
   * returned HTML.
   *
   * <p>The ID number is sent as a query parameter of a plain {@code http://}
   * request. Extraction is positional: a field is the text between the end of a
   * start marker and the first occurrence of an end marker, taken from the page
   * with all line terminators removed.
   *
   * <p>NOTE(review): two markers below spell the gap before/between the Chinese
   * characters as the literal HTML entity {@code &nbsp;}. The published listing
   * showed a blank there, but its offsets (7 and 31) equal the marker lengths
   * only when the entity is written out. Confirm against the live page.
   */
  class GrabIdentity {

      /** Lookup endpoint; the ID number goes between prefix and suffix. */
      private static final String LOOKUP_URL_PREFIX =
              "http://qq.ip138.com/idsearch/index.asp?action=idcard&userid=";

      /** Fixed trailing query parameter sent with every lookup. */
      private static final String LOOKUP_URL_SUFFIX = "&B1=%B2%E9+%D1%AF";

      /** Charset used to decode the response body. */
      private static final String PAGE_CHARSET = "GBK";

      /** One or more consecutive characters in the range U+4E00..U+9FA5. */
      private static final Pattern CHINESE_RUN = Pattern.compile("([\u4e00-\u9fa5]+)");

      /**
       * Returns the sex reported for an ID-card number.
       *
       * @param userid the ID-card number, inserted into the lookup URL as-is
       * @return the first run of Chinese characters in the sex cell, or the raw
       *         cell text when it contains none
       * @throws Exception if the page cannot be downloaded or a marker is missing
       */
      public String grabIdentitySex(String userid) throws Exception {
          String page = fetchPage(userid);
          return drawCh(sliceAfter(page, "&nbsp;别", "出生日期"));
      }

      /**
       * Returns the birth date reported for an ID-card number.
       *
       * @param userid the ID-card number, inserted into the lookup URL as-is
       * @return the raw text of the birth-date cell
       * @throws Exception if the page cannot be downloaded or a marker is missing
       */
      public String grabIdentityBirth(String userid) throws Exception {
          String page = fetchPage(userid);
          return sliceAfter(page, "出生日期:</td><td class=\"tdc2\">", "</td><tr><tr><td class=");
      }

      /**
       * Returns the issuing place reported for an ID-card number.
       *
       * @param userid the ID-card number, inserted into the lookup URL as-is
       * @return the raw text of the issuing-place cell
       * @throws Exception if the page cannot be downloaded or a marker is missing
       */
      public String grabIdentityHome(String userid) throws Exception {
          String page = fetchPage(userid);
          return sliceAfter(page, "证&nbsp;地:</td><td class=\"tdc2\">",
                  "<br/></td></tr><tr><td class=\"tdc3\" valign=\"top\" align=\"right\">部分或");
      }

      /**
       * Returns the first run of Chinese characters found in {@code str}.
       *
       * @param str text to scan, typically an HTML fragment
       * @return the first matched run, or {@code str} itself when nothing matches
       */
      public String drawCh(String str) {
          Matcher matcher = CHINESE_RUN.matcher(str);
          return matcher.find() ? matcher.group() : str;
      }

      /**
       * Downloads the lookup page for {@code userid} and returns it as one string
       * with the line terminators removed.
       */
      private static String fetchPage(String userid) throws IOException {
          URL url = new URL(LOOKUP_URL_PREFIX + userid + LOOKUP_URL_SUFFIX);
          HttpURLConnection connection = (HttpURLConnection) url.openConnection();
          try (BufferedReader reader = new BufferedReader(
                  new InputStreamReader(connection.getInputStream(), PAGE_CHARSET))) {
              StringBuilder page = new StringBuilder();
              String line;
              // readLine() strips the terminator, so the page becomes a single line.
              while ((line = reader.readLine()) != null) {
                  page.append(line);
              }
              return page.toString();
          } finally {
              connection.disconnect();
          }
      }

      /**
       * Returns the text that starts right after the first occurrence of
       * {@code startMarker} and stops at the first occurrence of {@code endMarker}.
       * Both markers are searched from the beginning of the page.
       *
       * @throws IOException if a marker is absent or the end marker does not come
       *         after the start marker
       */
      private static String sliceAfter(String page, String startMarker, String endMarker)
              throws IOException {
          int start = page.indexOf(startMarker);
          int end = page.indexOf(endMarker);
          if (start < 0 || end < 0) {
              throw new IOException("Unexpected page layout: marker not found ("
                      + (start < 0 ? startMarker : endMarker) + ")");
          }
          int from = start + startMarker.length();
          if (from > end) {
              throw new IOException("Unexpected page layout: end marker precedes "
                      + startMarker);
          }
          return page.substring(from, end);
      }
  }

posted on 2012-11-07 10:54  szgzwf  阅读(354)  评论(0)    收藏  举报