// 手机微博(weibo.cn)模拟登录及页面解析

package com.laudandjolynn.test;

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.lang3.StringUtils;
import org.apache.tika.exception.TikaException;
import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

public class WeiboCnUtils {
private final static Logger logger = LoggerFactory
.getLogger(WeiboCnUtils.class);
private final static Pattern PATTERN_SID = Pattern.compile("uid=(\d+)");
private final static Pattern PATTERN_WB_POST_DATE_TIME1 = Pattern
.compile("(\d{2}):(\d{2})");
private final static Pattern PATTERN_WB_POST_DATE_TIME2 = Pattern
.compile("(\d{2})月(\d{2})日\s+(\d{2}:\d{2})");
private final static Pattern PATTERN_WB_POST_DATE_TIME3 = Pattern
.compile("(\d{1,4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})");
private final static String SINA_IMG_HREF_PREFIX = "http://ww1.sinaimg.cn/thumbnail/";

/**
 * Parses a weibo.cn (mobile Weibo) page and logs the fields extracted from
 * every feed entry.
 * <p>
 * The HTML is first normalized with jsoup, then parsed as XML and queried
 * with XPath. Every {@code div} with {@code class='c'} and an {@code id}
 * attribute is treated as one feed entry. The extracted values (ids, nick
 * names, text, picture link, counters, publish time) are only written to
 * the log; nothing is returned or stored.
 * <p>
 * A failure while reading a single entry is logged and that entry is
 * skipped; the remaining entries are still processed.
 * 
 * @param html
 *            raw HTML of the page
 * @throws ParserConfigurationException
 *             if no XML document builder can be created
 * @throws SAXException
 *             if the normalized HTML still cannot be parsed as XML
 * @throws IOException
 *             if reading the document fails
 * @throws XPathExpressionException
 *             if the XPath selecting the feed entries cannot be evaluated
 */
public static void parsePageAtWeibocn(String html) throws ParserConfigurationException,
        SAXException, IOException, XPathExpressionException {
    DocumentBuilderFactory domFactory = DocumentBuilderFactory
            .newInstance();
    domFactory.setIgnoringComments(true);
    domFactory.setValidating(false);
    // NOTE(review): DTD / external-entity handling is left at the parser
    // defaults; confirm whether it should be restricted for untrusted pages.

    // The page HTML may not be well-formed, so jsoup is used to produce
    // markup that the XML parser can handle.
    DocumentBuilder domBuilder = domFactory.newDocumentBuilder();
    Document doc = domBuilder.parse(new InputSource(new StringReader(Jsoup
            .parse(html).html())));

    XPathFactory factory = XPathFactory.newInstance();
    XPath xpath = factory.newXPath();

    NodeList nodes = (NodeList) xpath.evaluate(
            "html/body//div[@class='c' and @id]", doc,
            XPathConstants.NODESET);
    for (int i = 0; i < nodes.getLength(); i++) {
        Node node = nodes.item(i);
        try {
            // Weibo id: the id attribute without its first two characters
            String weiboid = ((String) xpath.evaluate("@id", node,
                    XPathConstants.STRING)).substring(2);
            logger.info("weibo.cn - (" + i + "), weiboid: " + weiboid);

            NodeList children = (NodeList) xpath.evaluate("child::*", node,
                    XPathConstants.NODESET);

            Node feed1 = children.item(0);

            // uid
            String uid = null;
            String weibo = null;
            // Author nick name
            String nickName = ((String) xpath.evaluate(
                    "a[@class='nk']/text()", feed1, XPathConstants.STRING))
                    .trim();
            logger.debug("weibo.cn - (" + i + "), nickname: " + nickName);

            boolean v = false;
            String pic = null;
            long attitudeCount = 0;
            long repostsCount = 0;
            long commentsCount = 0;

            // #########################
            // uid of the forwarded (original) author
            String fuid = null;
            String fweibo = null;
            // Nick name of the forwarded author
            String fnickName = null;
            // Badge flag of the forwarded author
            boolean fv = false;
            String fpic = null;
            long fattitudeCount = 0;
            long frepostsCount = 0;
            long fcommentsCount = 0;
            long fcreatedAt = 0;

            // Badge (verified "V", expert, ...): flagged when an img with an
            // alt attribute is present
            if (xpath.evaluate("img[@alt][1]", feed1, XPathConstants.NODE) != null) {
                v = true;
            }
            logger.debug("weibo.cn - (" + i + "), vip: " + v);

            // Is this entry a forward? Decided by the presence of a 'cmt' span.
            boolean isForward = false;
            Node forwardNode = (Node) xpath.evaluate("span[@class='cmt']",
                    feed1, XPathConstants.NODE);

            if (forwardNode != null) {
                fnickName = ((String) xpath.evaluate("a[@href]/text()",
                        forwardNode, XPathConstants.STRING)).trim();
                logger.debug("weibo.cn - (" + i + "), forward nickname: "
                        + fnickName);
                if (xpath.evaluate("img[@alt and @src]", forwardNode,
                        XPathConstants.NODE) != null) {
                    fv = true;
                }
                isForward = true;
                logger.debug("weibo.cn - (" + i + "), forward: "
                        + isForward);
            }

            // Weibo text
            String tweibo = ((String) xpath.evaluate(
                    "span[@class='ctt']/text()", feed1,
                    XPathConstants.STRING)).trim();

            logger.debug("weibo.cn - (" + i + "), content: " + tweibo);

            // For a forward, the text in the first child belongs to the
            // forwarded post rather than to this one.
            weibo = isForward ? null : tweibo;
            fweibo = isForward ? tweibo : null;

            int childCount = children.getLength();
            if (childCount == 1) {
                // Single child: uid and statistics sit in the same node
                uid = getUid(feed1, xpath);
                logger.debug("weibo.cn - (" + i + "), uid: " + uid);
                getFeedStatistic(feed1, xpath, i);
                continue;
            }

            if (childCount >= 2) {
                // Single picture or picture group?
                boolean hasGroupPic = false;
                String groupPicText = (String) xpath.evaluate(
                        "a[contains(@href,'picAll')]/text()", feed1,
                        XPathConstants.STRING);
                if (!StringUtils.isEmpty(groupPicText)) {
                    // The count is the text between the third character and
                    // the last one
                    int picCount = Integer.valueOf(groupPicText.substring(
                            3, groupPicText.length() - 1));
                    hasGroupPic = true;
                    logger.debug("weibo.cn - (" + i
                            + "), group picture count: " + picCount);
                }

                // Picture and statistics of the original weibo
                Node feed2 = children.item(1);
                String tmpPicSrc = null;
                // Extract the picture link; it stays null when the node
                // carries no usable image reference.
                if (hasGroupPic) {
                    String imageHref = (String) xpath.evaluate(
                            "a[contains(@href,'oripic')]/@href", feed2,
                            XPathConstants.STRING);
                    int nameStart = imageHref.indexOf("u=");
                    if (nameStart != -1) {
                        // Image name is the value of the "u" parameter
                        String imageName = imageHref
                                .substring(nameStart + 2);
                        int index = imageName.indexOf("&");
                        if (index != -1) {
                            imageName = imageName.substring(0, index);
                        }
                        tmpPicSrc = SINA_IMG_HREF_PREFIX + imageName
                                + ".jpg";
                    }
                } else {
                    String src = (String) xpath.evaluate(
                            "a/img[@class='ib']/@src", feed2,
                            XPathConstants.STRING);
                    if (!StringUtils.isEmpty(src)) {
                        // Keep only the file name of the image URL
                        tmpPicSrc = SINA_IMG_HREF_PREFIX
                                + src.substring(src.lastIndexOf("/") + 1);
                    }
                }
                logger.debug("weibo.cn - (" + i + "), picture: "
                        + tmpPicSrc);

                pic = isForward ? null : tmpPicSrc;
                fpic = isForward ? tmpPicSrc : null;

                if (!isForward) {
                    uid = getUid(feed2, xpath);
                    logger.debug("weibo.cn - (" + i + "), uid: " + uid);
                    getFeedStatistic(feed2, xpath, i);
                } else {
                    // Counters are the text between '[' and ']'; a
                    // non-numeric result becomes NaN, whose longValue() is 0.
                    fattitudeCount = ((Number) xpath
                            .evaluate(
                                    "substring-after(substring-before(span[@class='cmt'][1]/text(),']'),'[')",
                                    feed2, XPathConstants.NUMBER))
                            .longValue();
                    frepostsCount = ((Number) xpath
                            .evaluate(
                                    "substring-after(substring-before(span[@class='cmt'][2]/text(),']'),'[')",
                                    feed2, XPathConstants.NUMBER))
                            .longValue();
                    fcommentsCount = ((Number) xpath
                            .evaluate(
                                    "substring-after(substring-before(a[contains(@href,'comment') and @class='cc']/text(), ']'),'[')",
                                    feed2, XPathConstants.NUMBER))
                            .longValue();
                    fuid = getUid(feed2, xpath);
                    logger.debug("weibo.cn - (" + i + "), forward uid:"
                            + fuid + " 赞: " + fattitudeCount + ", 转发: "
                            + frepostsCount + ", 评论: " + fcommentsCount);
                }
            }

            if (childCount == 3) {
                Node feed3 = children.item(2);
                // Forward reason (the forwarder's own comment)
                weibo = ((String) xpath.evaluate("./text()", feed3,
                        XPathConstants.STRING)).trim();

                attitudeCount = ((Number) xpath
                        .evaluate(
                                "substring-after(substring-before((a[contains(@href,'attitude')]|span[@class='cmt'][2])/text(),']'),'[')",
                                feed3, XPathConstants.NUMBER)).longValue();
                repostsCount = ((Number) xpath
                        .evaluate(
                                "substring-after(substring-before(a[contains(@href,'repost')]/text(),']'),'[')",
                                feed3, XPathConstants.NUMBER)).longValue();
                commentsCount = ((Number) xpath
                        .evaluate(
                                "substring-after(substring-before(a[contains(@href,'comment') and @class='cc']/text(),']'),'[')",
                                feed3, XPathConstants.NUMBER)).longValue();

                uid = getUid(feed3, xpath);
                // Publish time
                String postDateTime = ((String) xpath.evaluate(
                        "span[@class='ct']/text()", feed3,
                        XPathConstants.STRING)).trim();
                // NOTE(review): this time is read from the third child yet
                // stored in the "forwarded" variable - confirm which post it
                // belongs to.
                fcreatedAt = getCreatedAt(postDateTime);

                logger.debug("weibo.cn - (" + i + "), uid: " + uid + " 赞: "
                        + attitudeCount + ", 转发: " + repostsCount
                        + ", 评论: " + commentsCount + ", 发表时间: "
                        + postDateTime);
            }
        } catch (Exception e) {
            // One malformed entry must not abort the whole page, but the
            // failure is reported instead of being dropped silently.
            logger.warn("weibo.cn - (" + i + "), failed to parse feed node: "
                    + e.getMessage(), e);
        }

    }

}

/**
 * Reads the like, repost and comment counters and the publish-time text of
 * one feed node and writes them to the debug log.
 * <p>
 * Each counter is the text found between '[' and ']' in the matching
 * element, converted to a number by XPath.
 * 
 * @param feed
 *            node the XPath expressions are evaluated against
 * @param xpath
 *            evaluator to use
 * @param index
 *            position of the feed on the page, used only in the log line
 * @throws XPathExpressionException
 *             if one of the expressions cannot be evaluated
 */
private static void getFeedStatistic(Node feed, XPath xpath, int index)
        throws XPathExpressionException {
    // Likes
    Number likes = (Number) xpath.evaluate(
            "substring-after(substring-before((a[contains(@href,'attitude')]|span[@class='cmt'])/text(),']'),'[')",
            feed, XPathConstants.NUMBER);
    long attitudeCount = likes.longValue();

    // Reposts
    Number reposts = (Number) xpath.evaluate(
            "substring-after(substring-before(a[contains(@href,'repost')]/text(),']'),'[')",
            feed, XPathConstants.NUMBER);
    long repostsCount = reposts.longValue();

    // Comments
    Number comments = (Number) xpath.evaluate(
            "substring-after(substring-before(a[contains(@href,'comment') and @class='cc']/text(),']'),'[')",
            feed, XPathConstants.NUMBER);
    long commentsCount = comments.longValue();

    // Publish time, kept as the trimmed page text
    String rawTime = (String) xpath.evaluate("span[@class='ct']/text()",
            feed, XPathConstants.STRING);
    String postDateTime = rawTime.trim();

    StringBuilder line = new StringBuilder();
    line.append("weibo.cn - (").append(index);
    line.append("), 赞: ").append(attitudeCount);
    line.append(", 转发: ").append(repostsCount);
    line.append(", 评论: ").append(commentsCount);
    line.append(", 发表时间: ").append(postDateTime);
    logger.debug(line.toString());
}

/**
 * Converts the publish-time text of a weibo into epoch milliseconds.
 * <p>
 * The formats are tried from the most specific to the least specific,
 * because the bare {@code HH:mm} pattern also matches inside the two longer
 * formats:
 * <ol>
 * <li>{@code 2013-03-01 11:30:10} - full timestamp</li>
 * <li>{@code 06月01日 12:30} - month and day of the current year</li>
 * <li>{@code 11:30} - a time on the current day</li>
 * </ol>
 * Any other text (for example "n分钟前" or "刚刚") yields the current time.
 * Fields the text does not provide are taken from the current date; seconds
 * and milliseconds that the text does not provide are set to zero. The
 * calendar uses the JVM default time zone.
 * 
 * @param postDateTime
 *            publish-time text as shown on the page; {@code null} is treated
 *            like unrecognized text
 * @return the publish time in milliseconds since the epoch
 * @throws XPathExpressionException
 *             never thrown by this implementation; kept so the signature
 *             stays unchanged
 */
private static long getCreatedAt(String postDateTime)
        throws XPathExpressionException {
    Calendar calendar = Calendar.getInstance();
    if (postDateTime == null) {
        return calendar.getTimeInMillis();
    }

    Matcher timeMatcher = PATTERN_WB_POST_DATE_TIME3.matcher(postDateTime);
    if (timeMatcher.find()) {
        // 2013-03-01 11:30:10 (Calendar months are zero-based)
        calendar.set(Integer.parseInt(timeMatcher.group(1)),
                Integer.parseInt(timeMatcher.group(2)) - 1,
                Integer.parseInt(timeMatcher.group(3)),
                Integer.parseInt(timeMatcher.group(4)),
                Integer.parseInt(timeMatcher.group(5)),
                Integer.parseInt(timeMatcher.group(6)));
        calendar.set(Calendar.MILLISECOND, 0);
        return calendar.getTimeInMillis();
    }

    timeMatcher = PATTERN_WB_POST_DATE_TIME2.matcher(postDateTime);
    if (timeMatcher.find()) {
        // 06月01日 12:30 - group 3 holds "HH:mm" as a single string
        String[] hourMinute = timeMatcher.group(3).split(":");
        calendar.set(calendar.get(Calendar.YEAR),
                Integer.parseInt(timeMatcher.group(1)) - 1,
                Integer.parseInt(timeMatcher.group(2)),
                Integer.parseInt(hourMinute[0]),
                Integer.parseInt(hourMinute[1]), 0);
        calendar.set(Calendar.MILLISECOND, 0);
        return calendar.getTimeInMillis();
    }

    timeMatcher = PATTERN_WB_POST_DATE_TIME1.matcher(postDateTime);
    if (timeMatcher.find()) {
        // 11:00 - a time on the current day
        calendar.set(Calendar.HOUR_OF_DAY,
                Integer.parseInt(timeMatcher.group(1)));
        calendar.set(Calendar.MINUTE,
                Integer.parseInt(timeMatcher.group(2)));
        calendar.set(Calendar.SECOND, 0);
        calendar.set(Calendar.MILLISECOND, 0);
    }
    // Otherwise: "n minutes ago" / "just now" - the current time is used.
    return calendar.getTimeInMillis();
}

/**
 * Extracts the user id from the comment link of a feed node.
 * 
 * @param node
 *            node containing the comment link ({@code a} with class
 *            {@code cc} whose href contains "comment")
 * @param xpath
 *            evaluator to use
 * @return the digits following {@code uid=} in the link's href, or
 *         {@code null} when the href carries no such parameter
 * @throws XPathExpressionException
 *             if the link expression cannot be evaluated
 */
private static String getUid(Node node, XPath xpath)
        throws XPathExpressionException {
    Object commentHref = xpath.evaluate(
            "a[contains(@href,'comment') and @class='cc']/@href", node,
            XPathConstants.STRING);
    // The user id is carried in the link as uid=<digits>
    Matcher uidMatcher = PATTERN_SID.matcher((String) commentHref);
    return uidMatcher.find() ? uidMatcher.group(1) : null;
}

/** URL of the weibo.cn login page that is fetched to obtain the login form. */
private final static String WEIBO_CN_LOGIN_URL = "http://login.weibo.cn/login/?ns=1&revalid=2&backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%CE%A2%B2%A9&vt=";
/** Captures the {@code rand} value from the login form's action URL. */
private final static Pattern PATTERN_RAND_VALUE = Pattern
        .compile("rand=(\\d+)");
/** Results of successful logins, keyed by login name. */
private final static Map<String, Map<String, String>> CACHE_WEIBO_CN = new java.util.concurrent.ConcurrentHashMap<String, Map<String, String>>();

/**
 * Simulates a login at weibo.cn.
 * <p>
 * The login page is fetched first to read the form's {@code rand} value,
 * its hidden fields and the name of the password field; the credentials are
 * then posted and the response headers are inspected for the uid and the
 * session cookies. A successful result is also stored in the login cache.
 * <p>
 * The login is reported as failed when a credential is {@code null}, when
 * the login form or one of its expected fields is missing, when no relevant
 * cookie is returned, or when the returned cookie is marked
 * {@code =deleted}.
 * 
 * @param loginName
 *            Weibo account name
 * @param password
 *            plain-text password
 * @return map containing {@code success} ("true"/"false") and, when
 *         present in the response, {@code cookie}, {@code cookie_expire}
 *         and {@code uid}; never {@code null}
 */
public static Map<String, String> getCookieAndUidAtWeibocn(String loginName,
        String password) {
    Map<String, String> result = new HashMap<String, String>();
    result.put("success", "false");
    if (loginName == null || password == null) {
        return result;
    }
    try {
        Connection conn = Jsoup.connect(WEIBO_CN_LOGIN_URL);
        conn.header("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        conn.header("Accept-Encoding", "gzip, deflate, sdch");
        conn.header("Accept-Language",
                "en-GB,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2,zh-TW;q=0.2");
        conn.header("Cache-Control", "no-cache");
        conn.header("Connection", "Keep-Alive");
        conn.header("Content-Type", "application/x-www-form-urlencoded");
        conn.header("Host", "login.weibo.cn");
        conn.header("Pragma", "no-cache");
        conn.header("Referer", "http://weibo.cn/pub/");
        conn.header(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36");
        Response getResponse = conn.method(Method.GET).execute();
        org.jsoup.nodes.Document doc = getResponse.parse();

        // A missing form is reported as a failed login instead of an
        // unchecked IndexOutOfBoundsException.
        Element form = doc.select("form[method=post]").first();
        if (form == null) {
            logger.warn("weibo.cn login form not found");
            return result;
        }
        Matcher matcher = PATTERN_RAND_VALUE.matcher(form.attr("action"));
        if (!matcher.find()) {
            return result;
        }
        String rand = matcher.group(1);

        // The name of the password field is read from the form itself.
        String passwordField = firstAttr(form, "div input[type=password]",
                "name");
        if (passwordField == null || passwordField.length() == 0) {
            logger.warn("weibo.cn login form has no password field");
            return result;
        }

        Map<String, String> dataMap = new HashMap<String, String>();
        String[] copiedFields = { "backURL", "backTitle", "tryCount", "vk",
                "submit" };
        for (String name : copiedFields) {
            String value = firstAttr(form, "div input[name=" + name + "]",
                    "value");
            if (value == null) {
                logger.warn("weibo.cn login form has no field: " + name);
                return result;
            }
            dataMap.put(name, value);
        }
        dataMap.put("mobile", loginName);
        dataMap.put(passwordField, password);
        dataMap.put("remember", "on");

        String postUrl = "http://login.weibo.cn/login/?rand="
                + rand
                + "&backURL=http%3A%2F%2Fweibo.cn%2F%3Fs2w%3Dlogin&backTitle=%E5%BE%AE%E5%8D%9A&vt=4&revalid=2&ns=1";

        Map<String, String> header = new HashMap<String, String>();
        header.put("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");

        header.put("Accept-Encoding", "gzip, deflate");
        header.put("Accept-Language",
                "en-GB,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2,zh-TW;q=0.2");
        header.put("Cache-Control", "no-cache");
        header.put("Connection", "keep-alive");
        header.put("Content-Type", "application/x-www-form-urlencoded");
        header.put("Host", "login.weibo.cn");
        header.put("Origin", "http://login.weibo.cn");
        header.put("Pragma", "no-cache");
        header.put(
                "Referer",
                "http://login.weibo.cn/login/?ns=1&revalid=2&backURL=http%3A%2F%2Fweibo.cn%2F%3Fs2w%3Dlogin&backTitle=%CE%A2%B2%A9&vt=");
        header.put(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36");
        Map<String, List<String>> responseHeader = postThenGetHeader(
                postUrl, header, dataMap);

        List<String> uidValues = responseHeader.get("X-Log-Uid");
        if (uidValues != null && !uidValues.isEmpty()) {
            result.put("uid", uidValues.get(0));
        }

        List<String> setCookies = responseHeader.get("Set-Cookie");
        if (setCookies != null) {
            StringBuilder cookieBuffer = new StringBuilder();
            for (String v : setCookies) {
                for (String c : v.split(";")) {
                    if (c.contains("gsid") || c.contains("_T_WM")
                            || c.contains("SUB") || c.contains("PHPSESSID")) {
                        cookieBuffer.append(c).append(";");
                    } else if (c.contains("expires")) {
                        // Everything after the first '='; an attribute
                        // without a value is ignored.
                        int eq = c.indexOf('=');
                        if (eq != -1 && eq + 1 < c.length()) {
                            result.put("cookie_expire",
                                    c.substring(eq + 1));
                        }
                    }
                }
            }
            result.put("cookie", cookieBuffer.toString());
        }

        String cookie = result.get("cookie");
        // An empty cookie string means none of the session cookies was
        // returned, so it does not count as a login.
        if (cookie != null && cookie.length() > 0
                && !cookie.contains("=deleted")) {
            result.put("success", "true");
            CACHE_WEIBO_CN.put(loginName, result);
        }
        return result;
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
    }

    return result;
}

/**
 * Returns an attribute of the first element matching a CSS query.
 * 
 * @param scope
 *            element to search in
 * @param cssQuery
 *            jsoup CSS selector
 * @param attribute
 *            attribute name to read
 * @return the attribute value, or {@code null} when nothing matches
 */
private static String firstAttr(Element scope, String cssQuery,
        String attribute) {
    Element match = scope.select(cssQuery).first();
    return match == null ? null : match.attr(attribute);
}

/** Connect and read timeout, in milliseconds, applied to the POST request. */
private final static int POST_TIMEOUT_MILLIS = 15000;

/**
 * Sends a form-encoded POST request and returns the response headers.
 * <p>
 * Redirects are not followed, so the headers are those of the direct
 * response to the POST. Any failure is logged and results in an empty map.
 * 
 * @param url
 *            target URL
 * @param header
 *            request headers to send
 * @param data
 *            form fields written as the request body
 * @return a copy of the response header fields; empty when the request
 *         failed; never {@code null}
 */
private static Map<String, List<String>> postThenGetHeader(String url,
        Map<String, String> header, Map<String, String> data) {
    HttpURLConnection conn = null;
    Map<String, List<String>> resultHeader = new HashMap<String, List<String>>();
    try {
        URL _url = new URL(url);
        conn = (HttpURLConnection) _url.openConnection();
        conn.setRequestMethod("POST");
        conn.setInstanceFollowRedirects(false);
        // Without timeouts an unresponsive server would block the caller
        // indefinitely.
        conn.setConnectTimeout(POST_TIMEOUT_MILLIS);
        conn.setReadTimeout(POST_TIMEOUT_MILLIS);
        conn.setDoOutput(true);
        conn.setDoInput(true);
        for (Map.Entry<String, String> field : header.entrySet()) {
            conn.addRequestProperty(field.getKey(), field.getValue());
        }
        conn.connect();
        writePost(data, conn.getOutputStream());

        resultHeader.putAll(conn.getHeaderFields());
        return resultHeader;
    } catch (Exception e) {
        // Best effort: the caller treats an empty header map as a failure.
        logger.error(e.getMessage(), e);
    } finally {
        if (conn != null) {
            conn.disconnect();
        }
    }
    return resultHeader;
}

/**
 * Writes form fields to a stream as an
 * {@code application/x-www-form-urlencoded} body
 * ({@code key=value&key=value}, UTF-8) and closes the stream.
 * <p>
 * The stream is closed even when writing fails. A {@code null} value is
 * written as an empty string.
 * 
 * @param data
 *            form fields to write
 * @param outputStream
 *            destination; closed by this method
 * @throws IOException
 *             if writing to or closing the stream fails
 */
private static void writePost(Map<String, String> data,
        OutputStream outputStream) throws IOException {
    OutputStreamWriter w = new OutputStreamWriter(outputStream, "UTF-8");
    try {
        boolean first = true;
        for (Map.Entry<String, String> field : data.entrySet()) {
            // Separator goes between pairs, not before the first one
            if (first) {
                first = false;
            } else {
                w.write('&');
            }
            String value = field.getValue();
            w.write(URLEncoder.encode(field.getKey(), "UTF-8"));
            w.write('=');
            w.write(URLEncoder.encode(value == null ? "" : value, "UTF-8"));
        }
    } finally {
        w.close();
    }
}

}

// posted @ 2015-07-17 16:06 eventer 阅读(...) 评论(...) 编辑 收藏