import java.io.IOException;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.X509TrustManager;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/*
CREATE TABLE `ing` (
`id` int(11) unsigned NOT NULL,
`url` varchar(500) DEFAULT NULL,
`user` varchar(100) DEFAULT NULL,
`date` varchar(30) DEFAULT NULL,
`content` varchar(5000) DEFAULT NULL,
`lucky` tinyint(4) DEFAULT NULL,
`userlink` varchar(500) DEFAULT NULL,
`mtime` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE `comment` (
`id` int(11) unsigned NOT NULL,
`ingid` int(11) DEFAULT NULL,
`user` varchar(100) DEFAULT NULL,
`content` varchar(5000) DEFAULT NULL,
`date` varchar(30) DEFAULT NULL,
`userlink` varchar(100) DEFAULT NULL,
`mtime` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
* */
public class IngCrawler {
// Installs process-wide HttpsURLConnection defaults that accept every host name
// and every certificate chain.
// NOTE(review): this switches off TLS verification for all HttpsURLConnection
// traffic in the JVM, which allows man-in-the-middle interception. Confirm it is
// really required before shipping; a failure here is only printed, not rethrown.
static {
    try {
        // Host name check that always succeeds.
        HostnameVerifier acceptAnyHost = new HostnameVerifier() {
            public boolean verify(String hostname, SSLSession session) {
                return true;
            }
        };
        HttpsURLConnection.setDefaultHostnameVerifier(acceptAnyHost);

        SSLContext context = SSLContext.getInstance("TLS");
        // Trust manager whose checks never throw, i.e. every chain is accepted.
        X509TrustManager acceptAnyCertificate = new X509TrustManager() {
            public void checkClientTrusted(X509Certificate[] chain, String authType)
                    throws CertificateException {
                // intentionally empty: nothing is rejected
            }

            public void checkServerTrusted(X509Certificate[] chain, String authType)
                    throws CertificateException {
                // intentionally empty: nothing is rejected
            }

            public X509Certificate[] getAcceptedIssuers() {
                return new X509Certificate[0];
            }
        };
        X509TrustManager[] trustManagers = new X509TrustManager[] { acceptAnyCertificate };
        context.init(null, trustManagers, new SecureRandom());
        HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Entry point: schedules a crawl for every status id from
 * {@code Inserter.getNextId()} up to and including {@code Crawler.getLastestId()},
 * then waits for all queued work to finish and stops both thread pools.
 *
 * <p>The pools are created with {@code Executors.newFixedThreadPool}, whose
 * default threads are non-daemon; without the explicit shutdown below the JVM
 * would stay alive forever after the last id has been processed.
 *
 * @param args unused
 * @throws IOException declared for compatibility with existing callers; the body
 *         itself does not throw it
 */
public static void main(String[] args) throws IOException {
    int id = Inserter.getNextId();
    int lastestid = Crawler.getLastestId();
    for (; id <= lastestid; id++) {
        Crawler.crawl("https://ing.cnblogs.com/u/1/status/" + id, id);
    }

    try {
        // Crawl tasks submit work to the inserter pool, so the crawler pool must be
        // fully drained before the inserter pool stops accepting tasks.
        Crawler.crawler.shutdown();
        Crawler.crawler.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        Inserter.inserter.shutdown();
        Inserter.inserter.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
    } catch (InterruptedException e) {
        // Abandon the remaining work and keep the interrupt visible to the caller.
        Crawler.crawler.shutdownNow();
        Inserter.inserter.shutdownNow();
        Thread.currentThread().interrupt();
    }
}
/**
 * One crawled status page plus the comments found on it. Instances are built by
 * {@link #parseIng(Document, String, int)}; fields are plain package-visible data.
 */
static class Ing {
    int id;
    String url;
    String user;
    String date;
    String content;
    boolean lucky;
    String userlink;
    List<Comment> comments = new ArrayList<Comment>();

    /**
     * Renders the status on one line followed by one tab-indented line per comment.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(id).append(" - [").append(date).append("][").append(user).append("] - ").append(content);
        for (Comment c : this.comments) {
            sb.append("\n\t").append(c);
        }
        return sb.toString();
    }

    /**
     * Extracts a status and its comments from a fetched page.
     *
     * @param doc parsed page
     * @param url address the page was fetched from; stored as-is
     * @param id  status id; stored and also used to locate {@code #comment_block_<id>}
     * @return a populated {@code Ing}; when the page contains no
     *         {@code .ing_detail_title} element only {@code id} and {@code url} are set
     */
    static Ing parseIng(Document doc, String url, int id) {
        Ing ing = new Ing();
        ing.id = id;
        ing.url = url;
        if (doc.select(".ing_detail_title").size() == 0) {
            return ing;
        }
        ing.user = doc.select(".ing_item_author").text().trim();
        ing.userlink = doc.select(".ing_item_author").attr("href");
        ing.date = doc.select(".ing_detail_title").text().trim();
        // Keep only what follows the first ':' in the title text, when there is one.
        int colon = ing.date.indexOf(":");
        if (colon != -1) {
            ing.date = ing.date.substring(colon + 1).trim();
        }
        ing.content = doc.select("#ing_detail_body").text().trim();
        ing.lucky = doc.select(".ing_icon_lucky").size() > 0;
        // first() yields null when nothing matches, so a page without a comment block
        // produces a status with no comments instead of an IndexOutOfBoundsException.
        Element commentBlock = doc.select("#comment_block_" + id).first();
        if (commentBlock != null) {
            for (Element e : commentBlock.children()) {
                ing.comments.add(Comment.parseComment(e, id));
            }
        }
        return ing;
    }

    /** A single comment attached to a status. */
    static class Comment {
        int id;
        int ingid;
        String user;
        String content;
        String date;
        String userlink;

        /**
         * Builds a comment from one child element of the comment block.
         *
         * <p>The numeric comment id is read from the element's {@code id} attribute
         * after skipping its first 8 characters. The author link, the
         * {@code .text_green} element and any {@code .gray3} elements are removed
         * from {@code e} so that the remaining text is the comment body; note that
         * this mutates the passed element.
         *
         * @param e     element holding one comment
         * @param ingid id of the status the comment belongs to
         * @return the parsed comment
         * @throws NumberFormatException if the id suffix is not an integer
         * @throws StringIndexOutOfBoundsException if the element id is shorter than 8 characters
         */
        static Comment parseComment(Element e, int ingid) {
            Comment comment = new Comment();
            comment.id = Integer.parseInt(e.id().substring(8));
            comment.ingid = ingid;
            String authorSelector = "#comment_author_" + comment.id;
            comment.user = e.select(authorSelector).text().trim();
            comment.userlink = e.select(authorSelector).attr("href");
            comment.date = e.select(".text_green").attr("title").trim();
            // Strip everything that is not comment text before reading the body.
            e.select(authorSelector).remove();
            e.select(".text_green").remove();
            e.select(".gray3").remove();
            comment.content = e.select("div").text().trim();
            if (comment.content.startsWith(":")) {
                comment.content = comment.content.substring(1).trim();
            }
            return comment;
        }

        @Override
        public String toString() {
            return "[" + user + "] - " + content;
        }
    }
}
/**
 * Task that downloads one status page, parses it and hands the result to
 * {@code Inserter}. Tasks are executed on a shared pool of 10 threads.
 */
static class Crawler implements Runnable {
    static final ExecutorService crawler = Executors.newFixedThreadPool(10);
    final String url;
    final int id;

    /**
     * @param url page to fetch
     * @param id  status id passed through to the parser
     */
    public Crawler(String url, int id) {
        this.url = url;
        this.id = id;
    }

    /**
     * @return the highest status id to crawl; currently a hard-coded constant
     */
    public static int getLastestId() {
        return 1054304;
    }

    /**
     * Queues a crawl of {@code url} on the crawler pool.
     *
     * @param url page to fetch
     * @param id  status id the page belongs to
     */
    public static void crawl(String url, int id) {
        crawler.execute(new Crawler(url, id));
    }

    /**
     * Fetches {@link #url} with the configured cookie and user agent, parses it and
     * submits the result for insertion. Failures are reported on stderr and do not
     * propagate, so one bad page never terminates a pool thread.
     */
    @Override
    public void run() {
        System.out.println("crawl for: " + url);
        try {
            String cookie = "YOUR COOKIE HERE";
            String useragent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36";
            Document page = Jsoup.connect(url).header("cookie", cookie).userAgent(useragent).get();
            Inserter.insert(Ing.parseIng(page, url, id));
        } catch (IOException e) {
            e.printStackTrace();
        } catch (RuntimeException e) {
            // Parsing can throw unchecked exceptions on unexpected markup; report
            // which page caused it instead of letting the worker thread die.
            System.err.println("ERROR - failed to process " + url);
            e.printStackTrace();
        }
    }
}
/**
 * Task that writes one {@code Ing} and its comments to the database.
 *
 * <p>All tasks share a single connection and two prepared statements. Tasks
 * submitted through {@link #insert(Ing)} run on a one-thread pool, so those shared
 * objects and the {@link #no} counter are used by one task at a time.
 */
static class Inserter implements Runnable {
    static ExecutorService inserter = Executors.newFixedThreadPool(1);
    static Connection conn;
    static PreparedStatement pstating, pstatcmt;

    // Opens the connection and prepares both insert statements once. A failure is
    // only printed; conn and the statements then remain null.
    static {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            conn = DriverManager.getConnection(
                    "jdbc:mysql://localhost:3306/ing?useUnicode=true&characterEncoding=utf-8&autoReconnect=true", "root", "");
            pstating = conn
                    .prepareStatement("insert into ing (id,url,user,date,content,lucky,userlink) values (?,?,?,?,?,?,?)");
            pstatcmt = conn
                    .prepareStatement("insert into comment (id,ingid,user,content,date,userlink) values (?,?,?,?,?,?)");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    Ing ing;

    /**
     * @param ing status to persist when this task runs
     */
    public Inserter(Ing ing) {
        this.ing = ing;
    }

    /**
     * Returns the id at which crawling should resume: one more than the largest
     * {@code id} stored in table {@code ing}.
     *
     * @return {@code max(id) + 1}; 1 when the table is empty or the query fails
     *         (the failure is reported on stderr)
     * @throws IllegalStateException if the database connection could not be opened
     *         during class initialisation
     */
    public static int getNextId() {
        if (conn == null) {
            throw new IllegalStateException(
                    "database connection was not initialised; see the earlier stack trace");
        }
        Statement stat = null;
        ResultSet rs = null;
        try {
            stat = conn.createStatement();
            rs = stat.executeQuery("select max(id) as id from ing");
            if (rs.next()) {
                // max(id) is SQL NULL on an empty table; getInt maps that to 0.
                return rs.getInt("id") + 1;
            }
        } catch (SQLException e) {
            System.err.println("ERROR - could not read max(id) from ing, starting at 1 - " + e.getMessage());
        } finally {
            closeQuietly(rs, stat);
        }
        return 1;
    }

    /** Closes the given result set and statement, ignoring nulls and close failures. */
    private static void closeQuietly(ResultSet rs, Statement stat) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException ignored) {
                // nothing useful can be done if close fails
            }
        }
        if (stat != null) {
            try {
                stat.close();
            } catch (SQLException ignored) {
                // nothing useful can be done if close fails
            }
        }
    }

    /** Running count of insert tasks started, used only for console output. */
    static int no = 0;

    /**
     * Queues {@code ing} for insertion on the inserter pool.
     *
     * @param ing status to persist
     */
    public static void insert(Ing ing) {
        inserter.execute(new Inserter(ing));
    }

    /**
     * Prints the status, inserts its row and then one row per comment. The first
     * {@code SQLException} stops the remaining inserts for this status and is
     * reported on stderr.
     */
    @Override
    public void run() {
        System.out.println(++no + ". " + ing);
        try {
            pstating.setInt(1, ing.id);
            pstating.setString(2, ing.url);
            pstating.setString(3, ing.user);
            pstating.setString(4, ing.date);
            pstating.setString(5, ing.content);
            pstating.setInt(6, ing.lucky ? 1 : 0);
            pstating.setString(7, ing.userlink);
            pstating.executeUpdate();
            for (Ing.Comment c : ing.comments) {
                pstatcmt.setInt(1, c.id);
                pstatcmt.setInt(2, c.ingid);
                pstatcmt.setString(3, c.user);
                pstatcmt.setString(4, c.content);
                pstatcmt.setString(5, c.date);
                pstatcmt.setString(6, c.userlink);
                pstatcmt.executeUpdate();
            }
        } catch (SQLException e) {
            System.err.println("ERROR - " + e.getMessage() + " - " + ing);
        }
    }
}
}