JAVA爬虫入门学习

之前分享一个简单的获取数据流的方法,

今天分享下学习到的SpringBoot+Maven的框架

 

首先pom.xml

 1 <!-- Inherit from the Spring Boot parent POM -->
 2     <parent>
 3         <groupId>org.springframework.boot</groupId>
 4         <artifactId>spring-boot-starter-parent</artifactId>
 5         <version>1.5.9.RELEASE</version>
 6     </parent>
 7 
 8     <dependencies>
 9         <!-- BEGIN: run Spring Boot on the Jetty container -->
10         <dependency>
11             <groupId>org.springframework.boot</groupId>
12             <artifactId>spring-boot-starter-web</artifactId>
13             <!-- Exclude the default Tomcat starter so that Jetty can be used instead. -->
14             <exclusions>
15                 <exclusion>
16                     <groupId>org.springframework.boot</groupId>
17                     <artifactId>spring-boot-starter-tomcat</artifactId>
18                 </exclusion>
19             </exclusions>
20         </dependency>
21         <!-- Jetty container. -->
22         <dependency>
23             <groupId>org.springframework.boot</groupId>
24             <artifactId>spring-boot-starter-jetty</artifactId>
25         </dependency>
26         <dependency>
27             <groupId>org.springframework.boot</groupId>
28             <artifactId>spring-boot-starter-test</artifactId>
29             <scope>test</scope>
30         </dependency>
31         <!-- END: run Spring Boot on the Jetty container -->
32 
33         <dependency>
34             <groupId>org.jsoup</groupId>
35             <artifactId>jsoup</artifactId>
36             <version>1.10.3</version>
37         </dependency>
38 
39     </dependencies>
其中Jetty是一个开源的Servlet容器,它为基于Java的Web组件(例如JSP和Servlet)提供运行环境。
Jetty是使用JAVA编写的,它的API以一组JAR包的形式发布。
开发人员可以将Jetty容器实例化成一个对象,可以迅速为一些独立运行(stand-alone)的Java应用提供网络和web连接。
Jetty还提供了HttpClient,下面的代码就是用它的一些方法来发送HTTP请求。
下面是控制器内写的接口
 1 @Controller  /*返回视图页名称*/
 2 //@RestController  返回json数据类型
 3 public class MainController {
 4 
 5     @Autowired//根据类型匹配需要配合Qualifier
 6     //@Qualifier//名称
 7     //@Resource//根据名称自动匹配,区别于@Autowired
 8     public GetCaptcha Captcha;
 9 
10     @GetMapping("Captcha")//等同于下面
11     //@RequestMapping(value = "name",method = RequestMethod.GET)
12     public @ResponseBody String index(HttpServletRequest request){//错误需要另外处理
13 
14         //初始化
15         HttpClient httpClient = Captcha.GetHttpClient();
16         CaptchaData captchaData = new CaptchaData();
17         captchaData.setHttpClient(httpClient);
18 
19         if(httpClient==null){
20             return "创建对象错误,检查代理是否到期";
21         }
22 
23         CaptchaData imgValue = Captcha.GetCaptcha(captchaData);
24 
25         request.getSession().setAttribute("captchaData",imgValue);
26 
27         String htmlText = imgValue.getImg()+"<form action=\"/json\" method=\"POST\">\n" +
28                 "请输入用户名:<br>\n" +
29                 "<input type=\"text\" name=\"name\" value=\"张三\">\n" +
30                 "<br>\n" +
31                 "身份证号:<br>\n" +
32                 "<input type=\"text\" name=\"cardId\" value=\"123456789098765432\">\n" +
33                 "<br>\n" +
34                 "验证码:<br>\n" +
35                 "<input type=\"text\" name=\"captcha\" value=\"\">\n" +
36                 "<br><br>\n" +
37                 "<input type=\"submit\" value=\"提交\">\n" +
38                 "</form> ";
39         return htmlText;//返回视图名称
40     }
41 
42     @PostMapping("json")
43     public @ResponseBody String resp(Userdb user,HttpServletRequest request){
44 
45         CaptchaData captchaData = (CaptchaData) request.getSession().getAttribute("captchaData");
46         HttpClient httpClient = captchaData.getHttpClient();
47 
48         String returnValue = "";
49         try {
50             returnValue = Captcha.PostData(user,captchaData);
51         }catch (Exception e){
52             returnValue = "数据传输出现错误,请检查数据信息";
53         }finally {
54             try{
55                 httpClient.stop();
56             }
57             catch (Exception e){
58                 returnValue += " \n HttpClient关闭异常";
59             }
60         }
61         return returnValue;
62     }
63 }

在控制器中调用的方法分别包括:

获取验证码、发送数据到目标地址、初始化httpClient对象

代码如下:

 1 public CaptchaData GetCaptcha(CaptchaData captchaData) {
 2         HttpClient httpClient = GetHttpClient();
 3         try {
 4             ContentResponse response1 = httpClient.newRequest("https://www.baidu.com").send();
 5             //System.out.println(response1.getContentAsString());
 6             //首页
 7             String TOKEN;
 8             response1 = httpClient.newRequest("XXXXXXXX/index1.do")
 9                     .method(HttpMethod.GET)
10                     .header(HttpHeader.REFERER, "https://XXXXXXXX/")
11                     .send();
12 
13             //点击进入页面
14             response1 = httpClient.newRequest("XXXXXXXX/xxxxx.xxx?xxxx=initLogin")
15                     .method(HttpMethod.GET)
16                     .header(HttpHeader.REFERER, "https://XXXXXXXX/index1.do")
17                     .send();
18 
19             //点击注册页面
20             response1 = httpClient.newRequest("https://XXXXXXXX/xxxxx?method=initReg")
21                     .method(HttpMethod.POST)
22                     .header("Referer", "https://XXXXXXXX/loginreg.jsp")
23                     .send();
24 
25             //获取TOKEN
26             Document documentUrlOne = Jsoup.parse(response1.getContentAsString());
27             Elements firstNode = documentUrlOne.select("div.white-con");
28             Elements formNode = firstNode.select("form[action]");
29             Element inputNode = formNode.select("input").first();
30             TOKEN = inputNode.attr("value");
31             //存入
32             captchaData.setTOKEN(TOKEN);
33             //TOKEN需要在注册提交页面生成的才有效
34             //System.out.println(TOKEN+"---获取org.apache.struts.taglib.html.TOKEN----");
35 
36             //获取验证码。
37             String timeString = String.valueOf(new Date().getTime());
38             String ImgStr = "https://XXXXXXXX/imgrc.do?a=" + timeString;
39 
40             response1 = httpClient.newRequest(ImgStr)
41                     .method(HttpMethod.GET)
42                     .header("Referer", "https://XXXXXXXXXXXXXXXX.do")
43                     .send();
44             //图片类型转换
45             String encodeBase64 = Base64.getEncoder().encodeToString(response1.getContent());
46             //<img src=''/>
47             //String filepath = "XXXXXXXX\a.png";
48             //convertBase64DataToImage(encodeBase64, filepath);
49             //System.out.printf("图片下载成功")
50             captchaData.setImg("<img src='data:image/png;base64," + encodeBase64 + "'/>");
51             captchaData.setResponse1(response1);
52             return captchaData;
53         } catch (Exception e) {
54             e.printStackTrace();
55         } finally {
56             return captchaData;
57         }
58     }

 

/**
 * Submits the user's name, ID number and captcha answer to the target site and reports
 * whether the site returned an error message.
 *
 * <p>All values are URL-encoded before being placed in the query string, so characters
 * such as {@code &}, {@code =}, {@code #} or spaces in user input cannot break or inject
 * parameters. Null values are sent as empty strings.
 *
 * @param User        form data entered by the user
 * @param captchaData exchange holding the token and the HttpClient to send with
 * @return "身份信息正确---END" when the result page shows no error text, the error text
 *         followed by "cw---END" otherwise, or "异常哎" when the request fails
 */
public String PostData(Userdb User, CaptchaData captchaData) {

    String name = User.getName();
    String cardId = User.getCardId();
    String captcha = User.getCaptcha();
    String token = captchaData.getTOKEN();
    HttpClient httpClient = captchaData.getHttpClient();

    try {
        // NOTE(review): values are encoded as UTF-8 — confirm the target site expects UTF-8
        String postdata = "org.apache.struts.taglib.html.TOKEN=" + encodeQueryValue(token) +
                "&method=checkIdentity" +
                "&userInfoVO.name=" + encodeQueryValue(name) +
                "&userInfoVO.certType=0" +
                "&userInfoVO.certNo=" + encodeQueryValue(cardId) +
                "&_%40IMGRC%40_=" + encodeQueryValue(captcha) +
                "&1=on";

        ContentResponse response1 = httpClient.newRequest("https://xxxxxxxxxdo?" + postdata)
                .method(HttpMethod.POST)
                .header("Referer", "https:/xxxxxxxx.do")
                .send();

        // The result page carries its error text in div.erro_div1 span#_error_field_
        Document documentPage = Jsoup.parse(response1.getContentAsString());
        String errorValue = documentPage.select("div.erro_div1")
                .select("span[id=_error_field_]")
                .text();
        if (errorValue.isEmpty()) {
            return "身份信息正确---END";
        }
        return errorValue + "cw---END";
    } catch (Exception e) {
        return "异常哎";
    }
}

/** URL-encodes one query-string value as UTF-8; null becomes the empty string. */
private static String encodeQueryValue(String value) throws java.io.UnsupportedEncodingException {
    return value == null ? "" : java.net.URLEncoder.encode(value, "UTF-8");
}

 


/**
 * Builds, configures and starts a new HttpClient.
 *
 * <p>The client trusts every TLS certificate, follows redirects, uses a connect timeout
 * of 10000, sends a fixed User-Agent and routes its requests through one HTTP proxy.
 *
 * @return the started client, or {@code null} when starting it throws
 */
public HttpClient GetHttpClient() {
    // TLS context that accepts any server certificate
    SslContextFactory trustAllSsl = new SslContextFactory();
    trustAllSsl.setTrustAll(true);

    HttpClient client = new HttpClient(trustAllSsl);
    client.setUserAgentField(new HttpField("User-Agent", "xxxxxxxxxxxxxxxxxxxxxxxxxx"));
    client.setFollowRedirects(true);
    client.setConnectTimeout(10000);

    // Register the proxy on the client's proxy configuration
    client.getProxyConfiguration().getProxies().add(new HttpProxy("xxxxxx", xxx));

    try {
        client.start();
    } catch (Exception e) {
        // Callers treat null as "client could not be created"
        return null;
    }
    return client;
}

1 //下载图片
2     public void convertBase64DataToImage(String base64ImgData, String filePath) throws IOException {
3         BASE64Decoder d = new BASE64Decoder();
4         byte[] bs = d.decodeBuffer(base64ImgData);
5         FileOutputStream os = new FileOutputStream(filePath);
6         os.write(bs);
7         os.close();
8     }

 

最后是application.yml配置
 1 server:
 2     port: 8080
 3 
 4 
 5 spring:
 6     http:
 7         encoding:
 8         charset: utf-8
 9         enabled: true
10         force: true
11     thymeleaf:
12         mode: HTML5
13         encoding: UTF-8
14         content-type: text/html
15         cache: false
16         prefix: classpath:/templates/
17         suffix: .html
18     resources:
19         static-locations: classpath:/static/

 

这样,一个以服务接口形式对外提供的完整爬虫就实现了。

posted on 2018-02-01 11:27  梦林``ysl  阅读(286)  评论(0)  编辑  收藏  举报

导航