Java 抓取数据
搜索某网站搜索超链接页面,解决的路径冲突,就是TMD扯出一大堆boss,比如电信啊,工商啊,新浪啊!!! public List<HttpLinkItem> getLink(HttpLink httpLink,List<HttpLinkItem> list,List<HttpLink> baditem) throws IOException { //检查页面是否被查询过 boolean b=false; if(list.size()!=1){ for(HttpLink e:baditem){ if(e.getUrl().equals(httpLink.getUrl())){ b=true; } } if(b){ return list; } } Document doc =null; //防止某页面打不开导致程序中断 try{ doc = Jsoup.connect(httpLink.getUrl()).get(); }catch (Exception e) { } if(doc!=null){ Elements links = doc.select("a[href]"); String url; baditem.add(httpLink); //获取所有链接 for (Element link : links) { url=link.attr("abs:href"); HttpLinkItem item=new HttpLinkItem(); item.setName(trim(link.text(), 35)); item.setUrl(url); item.setLink(httpLink); if(!list.contains(item)){ System.out.println(item.tosString()); list.add(item); getLink(item.toLink(),list, baditem); }else if(!list.contains(item)){ list.add(item); System.out.println(item.tosString()); getLink(item.toLink(),list, baditem); } } } return list; }
|
抓取某网站中通过超链接可达的页面。路径冲突的问题已经解决,但爬取时会顺带扯出一大堆外部大站,比如电信、工商、新浪等!
public List<HttpLinkItem> getLink(HttpLink httpLink,List<HttpLinkItem> list,List<HttpLink> baditem) throws IOException { //检查页面是否被查询过 boolean b=false; if(list.size()!=1){ for(HttpLink e:baditem){ if(e.getUrl().equals(httpLink.getUrl())){ b=true; } } if(b){ return list; } } Document doc =null; //防止某页面打不开导致程序中断 try{ doc = Jsoup.connect(httpLink.getUrl()).get(); }catch (Exception e) { } if(doc!=null){ Elements links = doc.select("a[href]"); String url; baditem.add(httpLink); //获取所有链接 for (Element link : links) { url=link.attr("abs:href"); HttpLinkItem item=new HttpLinkItem(); item.setName(trim(link.text(), 35)); item.setUrl(url); item.setLink(httpLink); if(!list.contains(item)){ System.out.println(item.tosString()); list.add(item); getLink(item.toLink(),list, baditem); }else if(!list.contains(item)){ list.add(item); System.out.println(item.tosString()); getLink(item.toLink(),list, baditem); } } } return list; } |

浙公网安备 33010602011771号