import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.jsoup.select.Selector.SelectorParseException;
/**
 * Fetches each category page from both the {@code www.} and the {@code staging.}
 * internetcorkboard.com hosts, extracts a fixed set of metadata/content fields from each,
 * and appends one line to an error file for every field whose value differs between the
 * two hosts.
 */
public class ICBCategoryTest {

    /**
     * Labels of the extracted fields, in the same order as the array returned by
     * {@link #ICB(String)}. These labels are written verbatim to the error file.
     */
    private static final String[] FIELD_NAMES = {
        "title", "descrption", "canonical", "og:site_name", "og:image:width",
        "og:image:height", "og:title", "og:description", "og:url", "og:type",
        "body", "related articles", "you may also like"
    };

    /**
     * Prints a line to standard output.
     *
     * @param s the text to print
     */
    public void print(String s) {
        System.out.println(s);
    }

    /**
     * Downloads the page at {@code URL} and extracts the fields listed in
     * {@code FIELD_NAMES}.
     *
     * <p>Host names inside the canonical link and og:url values are rewritten from
     * {@code www.internetcorkboard.} to {@code staging.internetcorkboard.} so that values
     * taken from the two hosts can be compared with a plain string comparison.
     *
     * <p>HTTP error statuses, connection failures, read timeouts and
     * {@code NullPointerException}s are reported on standard output and do not abort the
     * caller; in those cases the fields that were not extracted remain empty strings.
     *
     * @param URL absolute URL of the page to fetch
     * @return an array with one entry per field in {@code FIELD_NAMES}; a field that is
     *         absent from the page is an empty string, never {@code null}
     * @throws IOException on any other I/O failure while fetching the page
     */
    public String[] ICB(String URL) throws IOException {
        String[] result = new String[FIELD_NAMES.length];
        java.util.Arrays.fill(result, "");
        try {
            Document doc = Jsoup.connect(URL).timeout(120000).get();

            // All selectors below are valid constants, so no selector parse error can occur.
            result[0] = doc.select("title").text();
            // Select the description tag specifically; a bare meta[name] would return the
            // content of whichever named <meta> happens to come first in the document.
            result[1] = attrOf(doc, "meta[name=description]", "content");
            result[2] = toStagingHost(attrOf(doc, "link[rel=canonical]", "href"));
            result[3] = attrOf(doc, "meta[property=og:site_name]", "content");
            result[4] = attrOf(doc, "meta[property=og:image:width]", "content");
            result[5] = attrOf(doc, "meta[property=og:image:height]", "content");
            result[6] = attrOf(doc, "meta[property=og:title]", "content");
            result[7] = attrOf(doc, "meta[property=og:description]", "content");
            result[8] = toStagingHost(attrOf(doc, "meta[property=og:url]", "content"));
            result[9] = attrOf(doc, "meta[property=og:type]", "content");
            result[10] = doc.getElementsByClass("NoAdsBody").text();
            result[11] = doc.getElementsByClass("relatedarticles").text();
            result[12] = doc.getElementsByClass("rgtitle").text();
        } catch (java.lang.NullPointerException e) {
            // NOTE(review): kept from the original as a best-effort guard; the exact source
            // of the NPE is not visible here.
            System.out.println("null " + URL);
        } catch (org.jsoup.HttpStatusException e) {
            System.out.println(e.getStatusCode() + " " + URL);
        } catch (java.net.ConnectException e) {
            System.out.println("Time out :" + URL);
        } catch (java.net.SocketTimeoutException e) {
            // An expired connect/read timeout surfaces as SocketTimeoutException, not
            // ConnectException; without this a single slow page would abort the whole run.
            System.out.println("Time out :" + URL);
        }
        return result;
    }

    /** Returns the given attribute of the first element matching the query that has it, or "". */
    private static String attrOf(Document doc, String cssQuery, String attribute) {
        return doc.select(cssQuery).attr(attribute);
    }

    /** Rewrites the www host to the staging host using a literal (non-regex) replacement. */
    private static String toStagingHost(String url) {
        return url.replace("www.internetcorkboard.", "staging.internetcorkboard.");
    }

    /**
     * Reads category paths line by line from {@code C:/ICBTest/CategoryUrl.txt}, fetches
     * each one from both hosts, and appends a tab-separated
     * {@code Error:<TAB>path<TAB>field} line to {@code C:/ICBTest/CategoryError.txt} for
     * every field that differs.
     *
     * @param args unused
     * @throws IOException if the input file cannot be read, the error file cannot be
     *         written, or a page fetch fails with an I/O error that {@link #ICB(String)}
     *         does not handle
     */
    public static void main(String args[]) throws IOException {
        ArrayList<String[]> wwwResults = new ArrayList<String[]>();
        ArrayList<String[]> stagingResults = new ArrayList<String[]>();
        ArrayList<String> categoryUrls = new ArrayList<String>();
        File urlFile = new File("C:/ICBTest/CategoryUrl.txt");
        File errorFile = new File("C:/ICBTest/CategoryError.txt");
        ICBCategoryTest tester = new ICBCategoryTest();

        // Both files are opened up front (as before) so an unwritable error file fails
        // fast; try/finally guarantees they are closed even if a fetch throws.
        BufferedReader br = new BufferedReader(new FileReader(urlFile));
        try {
            BufferedWriter bw = new BufferedWriter(new FileWriter(errorFile, true));
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    categoryUrls.add(line);
                    wwwResults.add(tester.ICB(
                            "http://www.internetcorkboard.com" + line + "?source=miva"));
                    stagingResults.add(tester.ICB(
                            "http://staging.internetcorkboard.com" + line + "?source=miva"));
                }

                if (wwwResults.size() == stagingResults.size()) {
                    for (int i = 0; i < wwwResults.size(); i++) {
                        String[] www = wwwResults.get(i);
                        String[] staging = stagingResults.get(i);
                        String url = categoryUrls.get(i);
                        for (int j = 0; j < www.length; j++) {
                            if (!www[j].equals(staging[j])) {
                                bw.write("Error:" + "\t" + url + "\t" + FIELD_NAMES[j]);
                                bw.newLine();
                            }
                        }
                    }
                } else {
                    // Message reads "totals do not match".
                    System.out.println("总数不一致");
                }
            } finally {
                // close() flushes any buffered error lines.
                bw.close();
            }
        } finally {
            br.close();
        }
    }
}