四脚猫下载
思路:
1、 首先是采用curl的方式采集到这个网页的内容
2、 用正则的方式或者html解析器把url分析出来
3、 对于每一个url,进行请求,如果状态不是2xx、3xx等就定义为异常。
<?php

/**
 * Fetches a web page, extracts every <a href> it contains and reports the
 * HTTP status code each link answers with.
 *
 * Written without scalar type declarations or other PHP 7+/8-only syntax so
 * that it keeps running on the old PHP versions the original targeted, while
 * no longer relying on each(), which was removed in PHP 8.0.
 */
class http_stat
{
    /** @var string URL of the page most recently analysed. */
    public $url;

    /** @var string Raw HTML of the page most recently analysed. */
    private $document;

    /** @var array List of href values found in that page. */
    private $links;

    /**
     * Check every link found on the given page.
     *
     * @param string $url Absolute http(s) URL of the page to analyse.
     *
     * @return array|null Map of HTTP status code => list of hrefs (exactly as
     *                    written in the page) that produced it. Status 0 means
     *                    the request failed or the link is not an http(s) URL.
     *                    The extra key 'empty' holds the number of bare "#"
     *                    links. Returns null when the page itself could not be
     *                    downloaded (the cURL error is printed).
     */
    public function get_all_link_status($url)
    {
        // Download the page once; the original fetched it twice.
        $document = $this->fetch_document($url);
        if ($document === false) {
            return null;
        }

        $this->url = $url;
        $this->document = $document;
        $this->links = $this->extract_links($document);

        // Initialised up front so a page without links yields an empty array
        // instead of an undefined variable.
        $res = array();

        foreach ($this->links as $link) {
            if ($link === '#') {
                $res['empty'] = isset($res['empty']) ? $res['empty'] + 1 : 1;
                continue;
            }

            $target = $this->resolve_url($url, $link);
            // Links that cannot be requested over HTTP (mailto:, javascript:,
            // ...) are filed under 0, the same code a failed request reports.
            $status = ($target === null) ? 0 : $this->get_http_status($target);
            $res[$status][] = $link;
        }

        return $res;
    }

    /**
     * Turn an href into an absolute http(s) URL.
     *
     * @param string $base Absolute URL of the page the link was found on.
     * @param string $link Non-empty href value.
     *
     * @return string|null Absolute URL, or null when the link uses a scheme
     *                     other than http/https.
     */
    private function resolve_url($base, $link)
    {
        // Anchored test: "http" appearing later in the link (for example in a
        // query string) must not make a relative link look absolute.
        if (preg_match('#^https?://#i', $link)) {
            return $link;
        }

        // Any other explicit scheme cannot be checked with an HTTP request.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $link)) {
            return null;
        }

        $parts = parse_url($base);
        if ($parts === false || !isset($parts['host'])) {
            // Base URL is not parseable; fall back to plain concatenation.
            return $base . $link;
        }

        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

        // Protocol-relative link: //host/path
        if (strncmp($link, '//', 2) === 0) {
            return $scheme . ':' . $link;
        }

        $origin = $scheme . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }

        // Root-relative link: /path
        if ($link[0] === '/') {
            return $origin . $link;
        }

        // Document-relative link: resolve against the directory of the base.
        $path = isset($parts['path']) ? $parts['path'] : '/';
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);

        return $origin . $dir . $link;
    }

    /**
     * Issue a header-only request and return the HTTP status code.
     *
     * @param string $s_url Absolute URL to probe.
     *
     * @return int HTTP status code, or 0 when no response was received.
     */
    private function get_http_status($s_url)
    {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $s_url);
        curl_setopt($curl, CURLOPT_HEADER, 1);
        curl_setopt($curl, CURLOPT_NOBODY, 1); // headers only, skip the body
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl, CURLOPT_TIMEOUT, 30);
        curl_exec($curl);
        $code = curl_getinfo($curl, CURLINFO_HTTP_CODE);
        curl_close($curl);

        return $code;
    }

    /**
     * Download a page.
     *
     * @param string $url Absolute URL of the page.
     *
     * @return string|false Response body, or false on a cURL error (the error
     *                      message is printed, as the original did).
     */
    private function fetch_document($url)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Same limit as the status probe so a stalled server cannot hang us.
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $response = curl_exec($ch);

        if ($response === false || curl_errno($ch)) {
            $error = curl_error($ch);
            // Release the handle on the failure path as well.
            curl_close($ch);
            print $error;
            return false;
        }

        curl_close($ch);

        return $response;
    }

    /**
     * Extract the href value of every <a> tag in an HTML document.
     *
     * Accepts single- or double-quoted attributes and tags that span several
     * lines. Empty href values are dropped.
     *
     * @param string $document Raw HTML.
     *
     * @return array List of non-empty href strings, in document order.
     */
    private function extract_links($document)
    {
        $pattern = '/<a\s(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/is';
        if (!preg_match_all($pattern, $document, $found)) {
            // No match, or a PCRE error: nothing to report.
            return array();
        }

        $hrefs = array();
        foreach ($found[2] as $href) {
            if ($href !== '') {
                $hrefs[] = $href;
            }
        }

        return $hrefs;
    }
}

$t = new http_stat();
$res = $t->get_all_link_status("http://www.sina.com.cn");
var_dump($res);
转自 http://blog.csdn.net/qq43599939/article/details/78168307

浙公网安备 33010602011771号