四脚猫下载

思路:

1、 首先是采用curl的方式采集到这个网页的内容

2、 用正则的方式或者html解析器把url分析出来

3、 对每一个url发起请求,如果返回的状态码不是2xx或3xx,就视为异常链接。

<?php  
/**
 * Downloads a web page, extracts the href of every <a> tag in it and groups
 * those links by the HTTP status code each one answers with.
 *
 * Requires the cURL extension.
 */
class http_stat{
    /** @var string URL of the page that was scanned last. */
    public $url;
    /** @var string Raw body of the scanned page. */
    private $document;
    /** @var array List of href values found in the scanned page. */
    private $links;
    /** Not used by any method of this class; kept so the declaration set is unchanged. */
    private $domain;
    /** Not used by any method of this class; kept so the declaration set is unchanged. */
    private $links_stat;

    /**
     * Checks every link found on the page at $url.
     *
     * @param string $url Address of the page to scan.
     * @return array|null Map of HTTP status code => list of hrefs (as written
     *                    in the page) that produced it, plus an optional
     *                    'empty' key counting bare "#" links. A code of 0
     *                    means the request itself failed. Null is returned
     *                    when the page could not be downloaded or was empty.
     */
    public function get_all_link_status($url){
        // Download the page exactly once and reuse the result.
        $document = $this->__get_document($url);
        if($document === false || $document === ''){
            return null;
        }

        $this->url = $url;
        $this->document = $document;
        $this->links = $this->__strip_links($document);

        $res = array();
        foreach($this->links as $val){
            if($val === '#'){
                // Placeholder anchor: count it, there is nothing to request.
                $res['empty'] = isset($res['empty']) ? $res['empty'] + 1 : 1;
                continue;
            }

            if(preg_match('#^https?://#i', $val)){
                // Only a leading scheme makes the link absolute; "http"
                // appearing later (e.g. in a query string) does not.
                $target = $val;
            }else{
                // Simple join against the scanned URL with exactly one slash
                // in between. This is not full RFC 3986 resolution.
                $target = rtrim($this->url, '/').'/'.ltrim($val, '/');
            }

            $state_num = $this->__get_http_status($target);
            $res[$state_num][] = $val;
        }
        return $res;
    }

    /**
     * Sends a body-less request (CURLOPT_NOBODY) and returns the status code.
     *
     * @param string $s_url Absolute URL to probe.
     * @return int HTTP status code reported by cURL, 0 if no response was received.
     */
    private function __get_http_status($s_url){
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $s_url);
        curl_setopt($curl, CURLOPT_HEADER, 1);
        curl_setopt($curl, CURLOPT_NOBODY, 1);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl, CURLOPT_TIMEOUT, 30);
        curl_exec($curl);
        $rtn = curl_getinfo($curl, CURLINFO_HTTP_CODE);
        curl_close($curl);
        return $rtn;
    }

    /**
     * Downloads the body of $url.
     *
     * @param string $url Address to fetch.
     * @return string|false Response body, or false after printing the cURL
     *                      error message when the transfer failed.
     */
    private function __get_document($url){
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        $response = curl_exec($ch);
        if(curl_errno($ch)){
            // Read the message before closing so the handle is released on
            // the failure path as well.
            $error = curl_error($ch);
            curl_close($ch);
            print $error;
            return false;
        }
        curl_close($ch);
        return $response;
    }

    /**
     * Extracts the href value of every <a ... href="..."> tag.
     *
     * @param string $document HTML source to scan.
     * @return array List of non-empty href strings in document order; an
     *               empty array when the document contains no links.
     */
    private function __strip_links($document){
        preg_match_all('|<a(.*?)href="(.*?)"(.*?)>(.*?)</a>|i', $document, $links);
        $match = array();
        // foreach replaces each(), which was removed in PHP 8.0.
        foreach($links[2] as $val){
            if($val !== ''){
                $match[] = $val;
            }
        }
        return $match;
    }
}
// Demo run: scan the Sina home page and dump the links grouped by status code.
$checker = new http_stat();
var_dump($checker->get_all_link_status("http://www.sina.com.cn"));

转自 http://blog.csdn.net/qq43599939/article/details/78168307

posted @ 2017-12-17 20:19  yangchunlong  阅读(144)  评论(0)    收藏  举报