我自己随便写的爬虫

有关爬虫,自己半年前写了一个,写过就忘记了,今天才看到

 

  1 import java.io.IOException;
  2 import java.net.URISyntaxException;
  3 import java.nio.charset.StandardCharsets;
  4 import java.util.ArrayList;
  5 import java.util.Date;
  6 import java.util.HashMap;
  7 import java.util.List;
  8 import java.util.Map;
  9 
 10 import org.apache.commons.lang.time.DateFormatUtils;
 11 import org.apache.http.HttpEntity;
 12 import org.apache.http.ParseException;
 13 import org.apache.http.client.ClientProtocolException;
 14 import org.apache.http.client.methods.CloseableHttpResponse;
 15 import org.apache.http.client.methods.HttpPost;
 16 import org.apache.http.client.utils.URIBuilder;
 17 import org.apache.http.entity.StringEntity;
 18 import org.apache.http.impl.client.CloseableHttpClient;
 19 import org.apache.http.impl.client.HttpClientBuilder;
 20 import org.apache.http.util.EntityUtils;
 21 import org.apache.xerces.util.URI;
 22 import org.jsoup.Jsoup;
 23 import org.jsoup.nodes.Document;
 24 import org.jsoup.nodes.Element;
 25 import org.jsoup.select.Elements;
 26 import org.quartz.Job;
 27 import org.quartz.JobExecutionContext;
 28 import org.quartz.JobExecutionException;
 29 
 30 /*
 31  * @说明
 32  * 由于针对网页取数据,1页有100条数据,二期数据一直是变化的;
 33  * 目前设计是每5分钟抓取一次,所以抓一次,存一次,之前的数据仍旧保留,但是只抓第一页数据。
 34  * 其他排名靠后的数据就不抓了
 35  * 默认排序为 totalvolpct 总成交占比
 36  * 
 37  * 
 38  *                     <th>名称</th>
 39                     <th><a href="#" onclick="return window['sortTable']('symbol', ' ');">代码</a> </th>
 40 <th><a href="#" onclick="return window['sortTable']('totalvol', ' ');">总成交量(万股)</a> </th>
 41 <td><a href="#" onclick="return window['sortTable']('totalvolpct', ' ');">总成交量占比</a> </td>  ----百分比,入库时是去掉百分号入库的
 42 <td><a href="#" onclick="return window['sortTable']('totalamt', ' ');">总成交额(万元)</a> </td>
 43 <td><a href="#" onclick="return window['sortTable']('totalamtpct', ' ');">总成交额占比</a> </td>----百分比,入库时是去掉百分号入库的
 44 <td><a href="#" onclick="return window['sortTable']('avgprice', ' ');">平均成交价(元)</a> </td>
 45 <td><a href="#" onclick="return window['sortTable']('kuvolume', '↓');">主买量(万股)</a>↓</td>
 46 <td><a href="#" onclick="return window['sortTable']('kevolume', ' ');">中性量(万股)</a> </td>
 47 <td><a href="#" onclick="return window['sortTable']('kdvolume', ' ');">主卖量(万股)</a> </td>
 48                     <th>详情 </th>
 49 
 50 
 51 建数据库表时,字段顺序一定要按照上面的顺序来建,否则会有问题
 52 
 53  * 
 54  * 
 55 */
 56 
 57 
 58 public class Crawler implements Job{
 59 
 60     
 61     private String url="http://vip.stock.finance.sina.com.cn/quotes_service/view/cn_bill_sum.php?num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=1";
 62     private String encode="UTF-8";
 63     
 64     public String getUrlData() {
 65 
 66         String out=new String();
 67         
 68         //---大单分析---
 69         //---http://vip.stock.finance.sina.com.cn/quotes_service/view/cn_bill_sum.php?num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=2
 70 
 71         // 获得Http客户端(可以理解为:你得先有一个浏览器;注意:实际上HttpClient与浏览器是不一样的)
 72                 CloseableHttpClient httpClient = HttpClientBuilder.create().build();
 73          
 74 //                // 创建Post请求
 75 
 76                 HttpPost httpPost = new  HttpPost(url);
 77          
 78                 //---下面这句话暂时没有起作用,不知道原因;其实参数是可以不用放在上面的httpPost对象中的
 79                 StringEntity entity = new StringEntity("num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=1", encode);
 80          
 81                 // post请求是将参数放在请求体里面传过去的;这里将entity放入post请求体中
 82                 httpPost.setEntity(entity);
 83          
 84                 httpPost.setHeader("Content-Type", "text/html;charset=utf8");
 85          
 86                 // 响应模型
 87                 CloseableHttpResponse response = null;
 88                 try {
 89                     // 由客户端执行(发送)Post请求
 90                     response = httpClient.execute(httpPost);
 91                     // 从响应模型中获取响应实体
 92                     HttpEntity responseEntity = response.getEntity();
 93          
 94                     System.out.println("响应状态为:" + response.getStatusLine());
 95                     if (responseEntity != null) {
 96                         System.out.println("响应内容长度为:" + responseEntity.getContentLength());
 97                         //System.out.println("响应内容为:" + EntityUtils.toString(responseEntity,"GBK"));
 98                         out=EntityUtils.toString(responseEntity,"GBK");
 99                     }
100                 } catch (ClientProtocolException e) {
101                     e.printStackTrace();
102                 } catch (ParseException e) {
103                     e.printStackTrace();
104                 } catch (IOException e) {
105                     e.printStackTrace();
106                 } finally {
107                     try {
108                         // 释放资源
109                         if (httpClient != null) {
110                             httpClient.close();
111                         }
112                         if (response != null) {
113                             response.close();
114                         }
115                     } catch (IOException e) {
116                         e.printStackTrace();
117                     }
118                 }
119         
120         
121         
122         return out;
123 
124     }
125 
126     
127     //---想直接跳过字符串处理,暂时没成功---//
128     public void DealUrlString(String inStr,org.springframework.jdbc.core.JdbcTemplate db) {
129         
130         String out=new String();
131         
132         Document doc = Jsoup.parseBodyFragment(inStr);
133         
134         Element et = doc.getElementById("divListTemplate");
135         Elements et_tab = et.getElementsByTag("table");
136     
137         Elements trs = et_tab.first().getElementsByTag("tr");
138         
139         
140         System.out.println("====size===="+trs.size());  
141         
142         
143         int n=0;
144         
145         
146         
147         for (Element element : trs) {
148             if(n==0) {
149                 n++;
150             }
151             else {
152                 StringBuffer insert_sql=new StringBuffer();
153                 
154                 insert_sql.append("insert into stock_bigdeal_analyse (cn_name,symbol,totalvol,totalvolpct,totalamt,totalamtpct,avgprice,kuvolume,kevolume,kdvolume,input_time) values ( ");
155                 
156                 Elements ele_ths= element.getElementsByTag("th");
157                 insert_sql.append(  "'" + ele_ths.get(0).text().trim()+"', ");
158                 insert_sql.append(  "'" + ele_ths.get(1).text().replaceAll(" ", "")+"', ");//--看不见的特殊符号,
159                 
160                 
161                 Elements ele_tds= element.getElementsByTag("td");
162               
163                 insert_sql.append( "'" + ele_tds.get(0).text().trim().replaceAll(" ", "").replaceAll(",", "")+"', ");
164                 insert_sql.append(  "'" + ele_tds.get(1).text().trim().replaceAll(" ", "").replaceAll("%", "")+"',");
165                 insert_sql.append(  "'" + ele_tds.get(2).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',");
166                 insert_sql.append(  "'" + ele_tds.get(3).text().trim().replaceAll(" ", "").replaceAll("%", "")+"',");
167                 insert_sql.append(  "'" + ele_tds.get(4).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',");
168                 insert_sql.append(  "'" + ele_tds.get(5).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',");
169                 insert_sql.append(  "'" + ele_tds.get(6).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',");
170                 insert_sql.append(  "'" + ele_tds.get(7).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',");
171                 
172                 insert_sql.append(  "'" +DateFormatUtils.format(new Date(),"yyyyMMddHHmmssSSS")+"') ");
173                 
174               //  System.out.println(insert_sql);
175                 
176                 db.execute(insert_sql.toString());
177                 
178             }
179         }
180         
181         
182         
183     }
184     
185     
186     
187     public  static void main(String args[]) {
188         
189         Crawler cr= new Crawler();
190         
191         SpringDb sd = new SpringDb();
192         org.springframework.jdbc.core.JdbcTemplate db= sd.getJdbc();
193         
194         String data_in = cr.getUrlData();
195         cr.DealUrlString(data_in,db);
196         
197         
198     }
199 
200 
201     @Override
202     public void execute(JobExecutionContext arg0) throws JobExecutionException {
203 
204         Crawler cr= new Crawler();
205         
206         SpringDb sd = new SpringDb();
207         org.springframework.jdbc.core.JdbcTemplate db= sd.getJdbc();
208         
209         String data_in = cr.getUrlData();
210         cr.DealUrlString(data_in,db);
211         
212     }
213     
214     
215     
216     
217     
218 }
posted @ 2020-01-16 16:14  苦行者的刀  阅读(166)  评论(0)  编辑  收藏  举报