手机微博(weibo.cn)模拟登录及页面解析

package com.laudandjolynn.test;

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.lang3.StringUtils;
import org.apache.tika.exception.TikaException;
import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

public class WeiboCnUtils {
private final static Logger logger = LoggerFactory
.getLogger(WeiboCnUtils.class);
private final static Pattern PATTERN_SID = Pattern.compile("uid=(\d+)");
private final static Pattern PATTERN_WB_POST_DATE_TIME1 = Pattern
.compile("(\d{2})😦\d{2})");
private final static Pattern PATTERN_WB_POST_DATE_TIME2 = Pattern
.compile("(\d{2})月(\d{2})日\s+(\d{2}:\d{2})");
private final static Pattern PATTERN_WB_POST_DATE_TIME3 = Pattern
.compile("(\d{1,4})-(\d{2})-(\d{2})\s+(\d{2})😦\d{2})😦\d{2})");
private final static String SINA_IMG_HREF_PREFIX = "http://ww1.sinaimg.cn/thumbnail/";

/**
 * Parses a weibo.cn page and writes the fields extracted from every feed
 * entry to the log.
 * <p>
 * An entry is every {@code div} below {@code body} that has
 * {@code class='c'} and an {@code id} attribute. The number of child
 * elements of an entry decides where its data is read from:
 * <ul>
 * <li>1 child: uid and counters are read from that child;</li>
 * <li>2 or more children: the picture link is read from the second child,
 * together with either the author's own counters (no repost) or the
 * counters of the reposted weibo (repost);</li>
 * <li>exactly 3 children: the third child supplies the repost reason, the
 * author's own counters, uid and post time.</li>
 * </ul>
 * An entry that cannot be parsed is logged and skipped; the remaining
 * entries are still processed.
 * 
 * @param html
 *            page markup; it is normalized with jsoup before being handed
 *            to the XML parser
 * @throws ParserConfigurationException
 *             if no DOM builder can be created
 * @throws SAXException
 *             if the normalized markup cannot be parsed as XML
 * @throws IOException
 *             if reading the markup fails
 * @throws XPathExpressionException
 *             if the XPath that selects the entries cannot be evaluated
 */
public static void parsePageAtWeibocn(String html) throws ParserConfigurationException,
		SAXException, IOException, XPathExpressionException {
	DocumentBuilderFactory domFactory = DocumentBuilderFactory
			.newInstance();
	domFactory.setIgnoringComments(true);
	domFactory.setValidating(false);
	// NOTE(review): DTD / external-entity handling is left at the parser
	// defaults. If pages may come from an untrusted source consider
	// hardening the factory, after confirming that entities emitted by
	// jsoup still resolve.

	// The page markup is not well-formed, so jsoup is used first to obtain
	// markup that the XML parser can read.
	DocumentBuilder domBuilder = domFactory.newDocumentBuilder();
	Document doc = domBuilder.parse(new InputSource(new StringReader(Jsoup
			.parse(html).html())));

	XPathFactory factory = XPathFactory.newInstance();
	XPath xpath = factory.newXPath();

	NodeList nodes = (NodeList) xpath.evaluate(
			"html/body//div[@class='c' and @id]", doc,
			XPathConstants.NODESET);
	for (int i = 0; i < nodes.getLength(); i++) {
		Node node = nodes.item(i);
		try {
			// Weibo id: the id attribute without its first two characters
			String weiboid = ((String) xpath.evaluate("@id", node,
					XPathConstants.STRING)).substring(2);
			logger.info("weibo.cn - (" + i + "), weiboid: " + weiboid);

			NodeList children = (NodeList) xpath.evaluate("child::*", node,
					XPathConstants.NODESET);

			Node feed1 = children.item(0);

			// ---- fields of the author of this entry ----
			String uid = null;
			String weibo = null;
			// Author nickname
			String nickName = ((String) xpath.evaluate(
					"a[@class='nk']/text()", feed1, XPathConstants.STRING))
					.trim();
			logger.debug("weibo.cn - (" + i + "), nickname: " + nickName);

			boolean v = false;
			String pic = null;
			long attitudeCount = 0;
			long repostsCount = 0;
			long commentsCount = 0;
			long createdAt = 0;

			// ---- fields of the reposted (forwarded) weibo ----
			String fuid = null;
			String fweibo = null;
			// Nickname of the reposted user
			String fnickName = null;
			// Identity flag of the reposted user
			boolean fv = false;
			String fpic = null;
			long fattitudeCount = 0;
			long frepostsCount = 0;
			long fcommentsCount = 0;

			// Identity: verified ("V"), expert, etc.
			if (xpath.evaluate("img[@alt][1]", feed1, XPathConstants.NODE) != null) {
				v = true;
			}
			logger.debug("weibo.cn - (" + i + "), vip: " + v);

			// Is this entry a repost?
			boolean isForward = false;
			Node forwardNode = (Node) xpath.evaluate("span[@class='cmt']",
					feed1, XPathConstants.NODE);

			if (forwardNode != null) {
				fnickName = ((String) xpath.evaluate("a[@href]/text()",
						forwardNode, XPathConstants.STRING)).trim();
				logger.debug("weibo.cn - (" + i + "), forward nickname: "
						+ fnickName);
				if (xpath.evaluate("img[@alt and @src]", forwardNode,
						XPathConstants.NODE) != null) {
					fv = true;
				}
				isForward = true;
				logger.debug("weibo.cn - (" + i + "), forward: "
						+ isForward);
			}

			// Weibo text
			String tweibo = ((String) xpath.evaluate(
					"span[@class='ctt']/text()", feed1,
					XPathConstants.STRING)).trim();

			logger.debug("weibo.cn - (" + i + "), content: " + tweibo);

			// In a repost the text of the first child belongs to the
			// reposted weibo, otherwise to the author.
			weibo = isForward ? null : tweibo;
			fweibo = isForward ? tweibo : null;

			int childCount = children.getLength();
			if (childCount == 1) {
				uid = getUid(feed1, xpath);
				logger.debug("weibo.cn - (" + i + "), uid: " + uid);
				getFeedStatistic(feed1, xpath, i);
				continue;
			}

			if (childCount >= 2) {
				// Single picture or picture group?
				boolean hasGroupPic = false;
				String groupPicText = (String) xpath.evaluate(
						"a[contains(@href,'picAll')]/text()", feed1,
						XPathConstants.STRING);
				if (!StringUtils.isEmpty(groupPicText)) {
					// The count sits between the first three characters
					// and the last character of the link text.
					int picCount = Integer.valueOf(groupPicText.substring(
							3, groupPicText.length() - 1));
					hasGroupPic = true;
					logger.debug("weibo.cn - (" + i
							+ "), group picture count: " + picCount);
				}

				// Second child: weibo picture, statistics of the original
				// weibo
				Node feed2 = children.item(1);
				// Stays null when the second child carries no picture link,
				// instead of becoming a bare prefix URL.
				String tmpPicSrc = null;
				if (hasGroupPic) {
					String imageHref = (String) xpath.evaluate(
							"a[contains(@href,'oripic')]/@href", feed2,
							XPathConstants.STRING);
					int nameStart = imageHref.indexOf("u=");
					if (nameStart != -1) {
						// The image name is the value of the "u" parameter.
						String imageName = imageHref
								.substring(nameStart + 2);
						int index = imageName.indexOf("&");
						if (index != -1) {
							imageName = imageName.substring(0, index);
						}
						tmpPicSrc = SINA_IMG_HREF_PREFIX + imageName
								+ ".jpg";
					}
				} else {
					String src = (String) xpath.evaluate(
							"a/img[@class='ib']/@src", feed2,
							XPathConstants.STRING);
					if (!StringUtils.isEmpty(src)) {
						tmpPicSrc = SINA_IMG_HREF_PREFIX
								+ src.substring(src.lastIndexOf("/") + 1);
					}
				}
				logger.debug("weibo.cn - (" + i + "), picture: "
						+ tmpPicSrc);

				pic = isForward ? null : tmpPicSrc;
				fpic = isForward ? tmpPicSrc : null;

				if (!isForward) {
					uid = getUid(feed2, xpath);
					logger.debug("weibo.cn - (" + i + "), uid: " + uid);
					getFeedStatistic(feed2, xpath, i);
				} else {
					// Counters are the text between '[' and ']'; a value
					// that is not a number evaluates to NaN, which
					// longValue() turns into 0.
					fattitudeCount = ((Number) xpath
							.evaluate(
									"substring-after(substring-before(span[@class='cmt'][1]/text(),']'),'[')",
									feed2, XPathConstants.NUMBER))
							.longValue();
					frepostsCount = ((Number) xpath
							.evaluate(
									"substring-after(substring-before(span[@class='cmt'][2]/text(),']'),'[')",
									feed2, XPathConstants.NUMBER))
							.longValue();
					fcommentsCount = ((Number) xpath
							.evaluate(
									"substring-after(substring-before(a[contains(@href,'comment') and @class='cc']/text(), ']'),'[')",
									feed2, XPathConstants.NUMBER))
							.longValue();
					fuid = getUid(feed2, xpath);
					logger.debug("weibo.cn - (" + i + "), forward uid:"
							+ fuid + " 赞: " + fattitudeCount + ", 转发: "
							+ frepostsCount + ", 评论: " + fcommentsCount);
				}
			}

			if (childCount == 3) {
				Node feed3 = children.item(2);
				// Repost reason
				weibo = ((String) xpath.evaluate("./text()", feed3,
						XPathConstants.STRING)).trim();

				attitudeCount = ((Number) xpath
						.evaluate(
								"substring-after(substring-before((a[contains(@href,'attitude')]|span[@class='cmt'][2])/text(),']'),'[')",
								feed3, XPathConstants.NUMBER)).longValue();
				repostsCount = ((Number) xpath
						.evaluate(
								"substring-after(substring-before(a[contains(@href,'repost')]/text(),']'),'[')",
								feed3, XPathConstants.NUMBER)).longValue();
				commentsCount = ((Number) xpath
						.evaluate(
								"substring-after(substring-before(a[contains(@href,'comment') and @class='cc']/text(),']'),'[')",
								feed3, XPathConstants.NUMBER)).longValue();

				uid = getUid(feed3, xpath);
				// Post time; it is read from the same child as the author's
				// uid and counters, so it is stored with the author fields.
				String postDateTime = ((String) xpath.evaluate(
						"span[@class='ct']/text()", feed3,
						XPathConstants.STRING)).trim();
				createdAt = getCreatedAt(postDateTime);

				logger.debug("weibo.cn - (" + i + "), uid: " + uid + " 赞: "
						+ attitudeCount + ", 转发: " + repostsCount
						+ ", 评论: " + commentsCount + ", 发表时间: "
						+ postDateTime);
			}
		} catch (Exception e) {
			// Best effort: one broken entry must not abort the page, but
			// the failure is reported instead of being dropped silently.
			logger.warn("weibo.cn - (" + i
					+ "), failed to parse feed entry, skipped", e);
		}

	}

}

/**
 * Reads the like, repost and comment counters and the post-time text from
 * one feed node and writes them to the debug log.
 * 
 * @param feed
 *            node the relative XPath expressions are evaluated against
 * @param xpath
 *            evaluator to use
 * @param index
 *            position of the entry on the page, used only in the log line
 * @throws XPathExpressionException
 *             if one of the expressions cannot be evaluated
 */
private static void getFeedStatistic(Node feed, XPath xpath, int index)
		throws XPathExpressionException {
	// Each counter is the text between '[' and ']' of its link or span.
	Number likes = (Number) xpath
			.evaluate(
					"substring-after(substring-before((a[contains(@href,'attitude')]|span[@class='cmt'])/text(),']'),'[')",
					feed, XPathConstants.NUMBER);
	long likeTotal = likes.longValue();

	Number reposts = (Number) xpath
			.evaluate(
					"substring-after(substring-before(a[contains(@href,'repost')]/text(),']'),'[')",
					feed, XPathConstants.NUMBER);
	long repostTotal = reposts.longValue();

	Number comments = (Number) xpath
			.evaluate(
					"substring-after(substring-before(a[contains(@href,'comment') and @class='cc']/text(),']'),'[')",
					feed, XPathConstants.NUMBER);
	long commentTotal = comments.longValue();

	// Post time, kept as the raw text shown on the page
	String rawPostTime = (String) xpath.evaluate(
			"span[@class='ct']/text()", feed, XPathConstants.STRING);
	String postedAtText = rawPostTime.trim();

	StringBuilder line = new StringBuilder("weibo.cn - (");
	line.append(index).append("), 赞: ").append(likeTotal);
	line.append(", 转发: ").append(repostTotal);
	line.append(", 评论: ").append(commentTotal);
	line.append(", 发表时间: ").append(postedAtText);
	logger.debug(line.toString());
}

/**
 * Converts the post-time text of a weibo into epoch milliseconds, using
 * the default time zone.
 * <p>
 * Recognized forms, tried from most to least specific:
 * <ol>
 * <li>{@code 2013-03-01 11:30:10}: full date and time;</li>
 * <li>{@code 06月01日 12:30}: month, day and time; the text carries no
 * year, so the current year is used;</li>
 * <li>{@code 11:30}: time only; the current date is used.</li>
 * </ol>
 * Any other text (for example "n分钟前" or "刚刚") yields the current time.
 * 
 * @param postDateTime
 *            post-time text, must not be null
 * @return the parsed instant in milliseconds since the epoch, or the
 *         current time when the text matches none of the forms
 * @throws XPathExpressionException
 *             never thrown by this implementation; declared so the
 *             signature stays unchanged
 */
private static long getCreatedAt(String postDateTime)
		throws XPathExpressionException {
	Calendar calendar = Calendar.getInstance();

	// The time-only pattern also matches inside the two longer forms, so
	// it has to be tried last.
	Matcher timeMatcher;
	if ((timeMatcher = PATTERN_WB_POST_DATE_TIME3.matcher(postDateTime))
			.find()) {
		// 2013-03-01 11:30:10 (Calendar months are zero-based)
		calendar.set(Integer.parseInt(timeMatcher.group(1)),
				Integer.parseInt(timeMatcher.group(2)) - 1,
				Integer.parseInt(timeMatcher.group(3)),
				Integer.parseInt(timeMatcher.group(4)),
				Integer.parseInt(timeMatcher.group(5)),
				Integer.parseInt(timeMatcher.group(6)));
		calendar.set(Calendar.MILLISECOND, 0);
	} else if ((timeMatcher = PATTERN_WB_POST_DATE_TIME2
			.matcher(postDateTime)).find()) {
		// 06月01日 12:30: group 3 holds "HH:mm" as a single string
		String[] hourAndMinute = timeMatcher.group(3).split(":");
		calendar.set(calendar.get(Calendar.YEAR),
				Integer.parseInt(timeMatcher.group(1)) - 1,
				Integer.parseInt(timeMatcher.group(2)),
				Integer.parseInt(hourAndMinute[0]),
				Integer.parseInt(hourAndMinute[1]), 0);
		calendar.set(Calendar.MILLISECOND, 0);
	} else if ((timeMatcher = PATTERN_WB_POST_DATE_TIME1
			.matcher(postDateTime)).find()) {
		// 11:30: today at the given time
		calendar.set(Calendar.HOUR_OF_DAY,
				Integer.parseInt(timeMatcher.group(1)));
		calendar.set(Calendar.MINUTE,
				Integer.parseInt(timeMatcher.group(2)));
		calendar.set(Calendar.SECOND, 0);
		calendar.set(Calendar.MILLISECOND, 0);
	}
	// else: "n分钟前" or "刚刚", keep the current time
	return calendar.getTimeInMillis();
}

/**
 * Extracts the user id from the comment link of a feed node.
 * 
 * @param node
 *            node whose child link with class {@code cc} and an href
 *            containing "comment" is inspected
 * @param xpath
 *            evaluator to use
 * @return the digits following "uid=" in the link's href, or null when the
 *         href contains no such parameter
 * @throws XPathExpressionException
 *             if the expression cannot be evaluated
 */
private static String getUid(Node node, XPath xpath)
		throws XPathExpressionException {
	Object commentHref = xpath.evaluate(
			"a[contains(@href,'comment') and @class='cc']/@href", node,
			XPathConstants.STRING);
	// The id is carried as the "uid" query parameter of the link.
	Matcher uidMatcher = PATTERN_SID.matcher((String) commentHref);
	return uidMatcher.find() ? uidMatcher.group(1) : null;
}

// Login page that is fetched before the credentials are posted.
private final static String WEIBO_CN_LOGIN_URL = "http://login.weibo.cn/login/?ns=1&revalid=2&backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%CE%A2%B2%A9&vt=";
// Captures the digits of the "rand" parameter in the login form's action.
private final static Pattern PATTERN_RAND_VALUE = Pattern
		.compile("rand=(\\d+)");
// Result maps of successful logins, keyed by login name. The login method
// writes to this cache but the class did not declare it.
private final static Map<String, Map<String, String>> CACHE_WEIBO_CN = new ConcurrentHashMap<String, Map<String, String>>();

/**
 * Simulates a browser login at weibo.cn and collects the resulting cookie
 * and user id.
 * <p>
 * The login page is fetched first; the {@code rand} value of its form
 * action, its hidden inputs and the name of its password field are then
 * sent back in the login POST. A successful result is also stored in
 * {@code CACHE_WEIBO_CN} under the login name.
 * <p>
 * NOTE(review): both requests use plain {@code http://} URLs, so the
 * password is sent unencrypted.
 * 
 * @param loginName
 *            Weibo account
 * @param password
 *            plain-text password
 * @return a map that always contains "success" ("true" or "false") and,
 *         when the response supplied them, "uid", "cookie" and
 *         "cookie_expire"
 */
public static Map<String, String> getCookieAndUidAtWeibocn(String loginName,
		String password) {
	Map<String, String> result = new HashMap<String, String>();
	result.put("success", "false");
	try {
		Connection conn = Jsoup.connect(WEIBO_CN_LOGIN_URL);
		conn.header("Accept",
				"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
		conn.header("Accept-Encoding", "gzip, deflate, sdch");
		conn.header("Accept-Language",
				"en-GB,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2,zh-TW;q=0.2");
		conn.header("Cache-Control", "no-cache");
		conn.header("Connection", "Keep-Alive");
		conn.header("Content-Type", "application/x-www-form-urlencoded");
		conn.header("Host", "login.weibo.cn");
		conn.header("Pragma", "no-cache");
		conn.header("Referer", "http://weibo.cn/pub/");
		conn.header(
				"User-Agent",
				"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36");
		Response getResponse = conn.method(Method.GET).execute();
		org.jsoup.nodes.Document doc = getResponse.parse();

		// A page without the expected form is reported as a failed login
		// rather than raising an unchecked exception.
		Element form = doc.select("form[method=post]").first();
		if (form == null) {
			logger.warn("weibo.cn login page contains no POST form");
			return result;
		}
		String action = form.attr("action");
		Matcher matcher = PATTERN_RAND_VALUE.matcher(action);
		if (!matcher.find()) {
			logger.warn("weibo.cn login form action carries no rand value: "
					+ action);
			return result;
		}
		String rand = matcher.group(1);

		Map<String, String> dataMap = new HashMap<String, String>();
		dataMap.put("backURL",
				requireAttr(form, "div input[name=backURL]", "value"));
		dataMap.put("backTitle",
				requireAttr(form, "div input[name=backTitle]", "value"));
		dataMap.put("mobile", loginName);
		// The name of the password field is taken from the form itself.
		dataMap.put(
				requireAttr(form, "div input[type=password]", "name"),
				password);
		dataMap.put("remember", "on");
		dataMap.put("tryCount",
				requireAttr(form, "div input[name=tryCount]", "value"));
		dataMap.put("vk", requireAttr(form, "div input[name=vk]", "value"));
		dataMap.put("submit",
				requireAttr(form, "div input[name=submit]", "value"));

		String postUrl = "http://login.weibo.cn/login/?rand="
				+ rand
				+ "&backURL=http%3A%2F%2Fweibo.cn%2F%3Fs2w%3Dlogin&backTitle=%E5%BE%AE%E5%8D%9A&vt=4&revalid=2&ns=1";

		Map<String, String> header = new HashMap<String, String>();
		header.put("Accept",
				"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");

		header.put("Accept-Encoding", "gzip, deflate");
		header.put("Accept-Language",
				"en-GB,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2,zh-TW;q=0.2");
		header.put("Cache-Control", "no-cache");
		header.put("Connection", "keep-alive");
		header.put("Content-Type", "application/x-www-form-urlencoded");
		header.put("Host", "login.weibo.cn");
		header.put("Origin", "http://login.weibo.cn");
		header.put("Pragma", "no-cache");
		header.put(
				"Referer",
				"http://login.weibo.cn/login/?ns=1&revalid=2&backURL=http%3A%2F%2Fweibo.cn%2F%3Fs2w%3Dlogin&backTitle=%CE%A2%B2%A9&vt=");
		header.put(
				"User-Agent",
				"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36");
		Map<String, List<String>> responseHeader = postThenGetHeader(
				postUrl, header, dataMap);
		if (responseHeader.containsKey("X-Log-Uid")) {
			result.put("uid", responseHeader.get("X-Log-Uid").get(0));
		}
		if (responseHeader.containsKey("Set-Cookie")) {
			StringBuilder cookieBuffer = new StringBuilder();
			for (String v : responseHeader.get("Set-Cookie")) {
				for (String c : v.split(";")) {
					if (c.contains("gsid") || c.contains("_T_WM")
							|| c.contains("SUB") || c.contains("PHPSESSID")) {
						cookieBuffer.append(c).append(";");
					} else if (c.contains("expires")) {
						// Limit 2 keeps the whole value and never indexes
						// past the end when the '=' is missing.
						String[] nameAndValue = c.split("=", 2);
						if (nameAndValue.length == 2) {
							result.put("cookie_expire", nameAndValue[1]);
						}
					}
				}
			}
			result.put("cookie", cookieBuffer.toString());
		}
		// A Set-Cookie header without any of the expected cookies leaves
		// the cookie string empty; that is not a successful login.
		String cookie = result.get("cookie");
		if (cookie == null || cookie.length() == 0
				|| cookie.contains("=deleted")) {
			result.put("success", "false");
		} else {
			CACHE_WEIBO_CN.put(loginName, result);
			result.put("success", "true");
		}
		return result;
	} catch (IOException e) {
		logger.error(e.getMessage(), e);
	}

	return result;
}

/**
 * Returns an attribute of the first element matching the CSS query below
 * the given element.
 * 
 * @throws IOException
 *             if no element matches, i.e. the login page does not have the
 *             expected layout
 */
private static String requireAttr(Element scope, String cssQuery,
		String attribute) throws IOException {
	Element match = scope.select(cssQuery).first();
	if (match == null) {
		throw new IOException(
				"weibo.cn login page: no element matches '" + cssQuery
						+ "'");
	}
	return match.attr(attribute);
}

/**
 * Sends a form POST and returns the response header fields.
 * <p>
 * Redirects are not followed. Any exception is logged and whatever was
 * collected up to that point is returned, which is an empty map when the
 * request failed before the headers were read.
 * 
 * @param url
 *            target URL
 * @param header
 *            request headers to add to the connection
 * @param data
 *            form fields to write as the request body
 * @return a copy of the response header fields
 */
private static Map<String, List<String>> postThenGetHeader(String url,
		Map<String, String> header, Map<String, String> data) {
	Map<String, List<String>> responseHeaders = new HashMap<String, List<String>>();
	HttpURLConnection connection = null;
	try {
		connection = (HttpURLConnection) new URL(url).openConnection();
		connection.setRequestMethod("POST");
		connection.setInstanceFollowRedirects(false);
		connection.setDoOutput(true);
		connection.setDoInput(true);
		for (Map.Entry<String, String> field : header.entrySet()) {
			connection.addRequestProperty(field.getKey(), field.getValue());
		}
		connection.connect();
		writePost(data, connection.getOutputStream());

		responseHeaders.putAll(connection.getHeaderFields());
	} catch (Exception e) {
		logger.error(e.getMessage(), e);
	} finally {
		if (connection != null) {
			connection.disconnect();
		}
	}
	return responseHeaders;
}

/**
 * Writes the given fields to the stream as a URL-encoded form body
 * ({@code key=value} pairs joined with {@code &}, UTF-8) and closes the
 * stream, also when encoding or writing fails.
 * 
 * @param data
 *            form fields; keys and values must not be null
 * @param outputStream
 *            stream to write to; closed before this method returns
 * @throws IOException
 *             if writing to or closing the stream fails
 */
private static void writePost(Map<String, String> data,
		OutputStream outputStream) throws IOException {
	OutputStreamWriter w = new OutputStreamWriter(outputStream, "UTF-8");
	try {
		boolean first = true;
		for (Map.Entry<String, String> field : data.entrySet()) {
			if (first) {
				first = false;
			} else {
				w.append('&');
			}
			w.write(URLEncoder.encode(field.getKey(), "UTF-8"));
			w.write('=');
			w.write(URLEncoder.encode(field.getValue(), "UTF-8"));
		}
	} finally {
		// Closing also flushes what has been buffered so far.
		w.close();
	}
}

}

posted @ 2015-07-17 16:06  eventer  阅读(1901)  评论(0)  编辑  收藏  举报