package GetUrls;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls {@code http://www.<sitename>.com}: fetches a page, collects the same-site links it
 * contains, and appends one row per fetched page to {@code <sitename>report.txt} stating,
 * for every name read from the {@code jsname} file, whether that name occurs in the page HTML.
 * Pages that fail are appended to {@code <sitename>error.txt} when {@link #main} finishes.
 *
 * <p>Pages of one crawl level are fetched in parallel by a fixed pool of worker threads.
 * The shared lists {@link #AllUrls}, {@link #get} and {@link #Error} are only touched under a
 * private lock while workers may be running.
 */
public class GetIt {
    /** Guards AllUrls, get and Error, which are written by the worker threads. */
    private static final Object STATE_LOCK = new Object();
    /** Serializes appends to the report file so concurrent rows cannot interleave. */
    private static final Object FILE_LOCK = new Object();
    /** Number of worker threads used for each crawl level. */
    private static final int POOL_SIZE = 10;
    /**
     * Link fragments that are never followed (rules for the "smarter" site).
     * An earlier CSUS variant excluded /topic/, /sst/, search.php, .jpg and /track/scripts/ instead.
     */
    private static final String[] EXCLUDED_FRAGMENTS = {
        "/pl--", "/se--", "/sd--", "/sz--", "/cl--", "/scripts/"
    };

    /** Every URL for which a fetch has been started. */
    public static ArrayList<String> AllUrls = new ArrayList<String>();
    /** Links discovered while processing the current crawl level. */
    public static ArrayList<String> get = new ArrayList<String>();
    /** Site name inserted into {@code http://www.<sitename>.com} and into the output file names. */
    public static String sitename = "smarter";
    /** Shared instance used by {@link #main}. */
    public static GetIt a = new GetIt();
    /** Names read from the jsname file; one report column is written per entry. */
    public static ArrayList<String> JsName = new ArrayList<String>();
    /** URLs queued for the next crawl level. */
    public static ArrayList<String> tmp = new ArrayList<String>();
    /** One {@code url<TAB>reason} entry per page that could not be processed. */
    public static ArrayList<String> Error = new ArrayList<String>();
    /** Thread pool of the most recently started crawl level. */
    public static ExecutorService p = null;

    /**
     * Loads the names from {@code ./src/GetAllUrljs/jsname}, crawls the site starting at its
     * home page and finally appends all recorded errors to the error file.
     *
     * @param args unused
     * @throws IOException if the jsname file cannot be read or the error file cannot be written
     */
    public static void main(String args[]) throws IOException {
        BufferedReader in = new BufferedReader(new FileReader(new File("./src/GetAllUrljs/jsname")));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                JsName.add(line);
            }
        } finally {
            // Close the reader even when reading fails.
            in.close();
        }

        String url = "http://www." + sitename + ".com";
        a.getAll(url);
        for (String found : get) {
            if (!AllUrls.contains(found)) {
                tmp.add(found);
            }
        }
        a.getrun();

        if (Error.size() > 0) {
            File errorFile = new File("./src/GetUrls/" + sitename + "error.txt");
            BufferedWriter out = new BufferedWriter(new FileWriter(errorFile, true));
            try {
                for (String entry : Error) {
                    out.write(entry);
                    out.newLine();
                }
            } finally {
                out.close();
            }
        }
    }

    /**
     * Fetches every URL queued in {@link #tmp} on a pool of {@value #POOL_SIZE} threads, then
     * queues the newly discovered, not yet fetched links and repeats until nothing new is found.
     *
     * <p>Each level blocks on {@code awaitTermination} rather than spinning, and levels are
     * processed in a loop rather than by recursion so deep sites cannot exhaust the stack.
     * If the calling thread is interrupted, the pool is stopped, the interrupt flag is restored
     * and the method returns early.
     */
    public void getrun() {
        do {
            ArrayList<String> batch = new ArrayList<String>(tmp);
            tmp.clear();
            synchronized (STATE_LOCK) {
                get.clear();
            }

            p = Executors.newFixedThreadPool(POOL_SIZE);
            for (String url : batch) {
                p.execute(new runer(url));
            }
            p.shutdown();
            try {
                p.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
            } catch (InterruptedException e) {
                p.shutdownNow();
                Thread.currentThread().interrupt();
                return;
            }

            // Taking the lock also makes the workers' writes visible to this thread.
            synchronized (STATE_LOCK) {
                for (String found : get) {
                    if (!AllUrls.contains(found)) {
                        tmp.add(found);
                    }
                }
            }
        } while (!tmp.isEmpty());
    }

    /**
     * Fetches {@code url}, adds its followable same-site links to {@link #get} and appends a
     * report row for the page. Any exception raised while fetching, parsing or reporting is
     * caught and recorded in {@link #Error}.
     *
     * @param url absolute URL of the page to fetch
     * @throws IOException declared for backward compatibility; failures are recorded, not thrown
     */
    public void getAll(String url) throws IOException {
        synchronized (STATE_LOCK) {
            AllUrls.add(url);
        }
        try {
            Document doc = Jsoup.connect(url).timeout(120000).get();
            Elements links = doc.select("a[href]");
            String sitePrefix = "http://www." + sitename;
            String siteRoot = sitePrefix + ".com";

            for (Element link : links) {
                String href = link.attr("href");
                if (href == null || !isFollowable(href)) {
                    continue;
                }
                // startsWith is safe for an empty href, unlike substring(0, 1).
                if (href.startsWith("/")) {
                    addDiscovered(siteRoot + href);
                } else if (href.contains(sitePrefix)) {
                    // contains() also accepts links that begin with the prefix (index 0).
                    addDiscovered(href);
                }
            }

            String html = doc.html();
            int station[] = new int[JsName.size()];
            for (int i = 0; i < station.length; i++) {
                // Original semantics kept: a match at index 0 is not counted.
                station[i] = html.indexOf(JsName.get(i)) > 0 ? 1 : 0;
            }
            witer(url, station);
        } catch (Exception e) {
            // Best-effort crawl: record the failure and carry on with the other pages.
            String reason = e.getMessage() != null ? e.getMessage() : e.getClass().getName();
            synchronized (STATE_LOCK) {
                Error.add(url + "\t" + reason);
            }
        }
    }

    /** Returns true when the link is longer than two characters and has no excluded fragment. */
    private static boolean isFollowable(String href) {
        if (href.length() <= 2) {
            return false;
        }
        for (String fragment : EXCLUDED_FRAGMENTS) {
            if (href.contains(fragment)) {
                return false;
            }
        }
        return true;
    }

    /** Adds the link to {@link #get} unless it is already present. */
    private static void addDiscovered(String link) {
        synchronized (STATE_LOCK) {
            if (!get.contains(link)) {
                get.add(link);
            }
        }
    }

    /**
     * Appends one tab-separated row to {@code ./src/GetUrls/<sitename>report.txt}: the URL
     * followed by every flag in {@code station}, so the column count follows the jsname file.
     *
     * @param url     page the row describes
     * @param station one 0/1 flag per entry of {@link #JsName}
     * @throws IOException if the report file cannot be written
     */
    private void witer(String url, int[] station) throws IOException {
        StringBuilder row = new StringBuilder(url);
        for (int flag : station) {
            row.append('\t').append(flag);
        }
        synchronized (FILE_LOCK) {
            File reportFile = new File("./src/GetUrls/" + sitename + "report.txt");
            BufferedWriter out = new BufferedWriter(new FileWriter(reportFile, true));
            try {
                out.write(row.toString());
                out.newLine();
            } finally {
                out.close();
            }
        }
    }
}
package GetUrls;
import java.io.IOException;
/**
 * Pool task that processes a single URL by handing it to a fresh {@link GetIt} instance.
 */
public class runer implements Runnable {
    /** URL this task will process; set once by the constructor. */
    String url = null;

    /**
     * Creates a task for the given URL.
     *
     * @param s URL passed to {@code GetIt.getAll} when the task runs
     */
    public runer(String s) {
        url = s;
    }

    /**
     * Calls {@code getAll} on a new {@code GetIt} with the stored URL; an
     * {@link IOException} is printed to standard error and otherwise ignored.
     */
    @Override
    public void run() {
        final GetIt crawler = new GetIt();
        try {
            crawler.getAll(this.url);
        } catch (IOException failure) {
            failure.printStackTrace();
        }
    }
}