PHP 根据 URL 获取网页标题
方法1:
/**
 * Fetches a page with file_get_contents() and returns the text of its <title> element.
 *
 * The body is read through the compress.zlib:// wrapper, so gzip-compressed
 * responses are inflated. If the page declares a non-UTF-8 charset in a <meta>
 * tag, the body is converted to UTF-8 before the title is extracted.
 *
 * NOTE: this file also shows a cURL-based function with the same name; PHP cannot
 * declare both, so load only one of the two variants.
 *
 * NOTE(review): $url is handed to a stream wrapper unchanged, so a local path is
 * read as well as a remote URL. Validate the scheme first if $url is user input.
 *
 * @param string $url Address of the page to read.
 *
 * @return string|false Trimmed title text converted to UTF-8, '' when the page has
 *                      no <title>, or false when the page could not be fetched.
 */
function get_siteurl_curlinfo($url = '')
{
    if (!is_string($url) || $url === '') {
        return false;
    }

    // "@" keeps the original best-effort contract: a failed fetch is reported
    // through the false return value instead of a PHP warning.
    $contents = @file_get_contents('compress.zlib://' . $url);
    if (!$contents) {
        return false;
    }

    // Only look inside <meta> tags, stop at the end of the charset token and
    // ignore case, so "GB2312", 'gbk' and unquoted values are all recognised.
    $charset = '';
    if (preg_match('/<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([a-z0-9_.:-]+)/i', $contents, $found)) {
        $charset = strtolower($found[1]);
    }

    if (!in_array($charset, array('', 'utf-8', 'utf8'), true)) {
        // The charset name comes from the page itself and may be unknown to iconv;
        // in that case iconv() returns false and the bytes are left untouched.
        $converted = @iconv($charset, 'UTF-8//IGNORE', $contents);
        if (is_string($converted) && $converted !== '') {
            $contents = $converted;
        }
    }

    // Lazy match with the "s" flag: stops at the first </title> and accepts
    // titles that span several lines or carry attributes on the tag.
    if (!preg_match('/<title\b[^>]*>(.*?)<\/title\s*>/is', $contents, $found)) {
        return '';
    }

    return trim($found[1]);
}
方法2:
/**
 * Fetches a page with cURL and returns the text of its <title> element.
 *
 * Redirects are followed and a desktop browser User-Agent is sent. The charset is
 * taken from the Content-Type response header, falling back to the page's <meta>
 * declaration; a non-UTF-8 body is converted to UTF-8 before the title is read.
 *
 * NOTE: this file also shows a file_get_contents()-based function with the same
 * name; PHP cannot declare both, so load only one of the two variants.
 *
 * @param string $url         Address of the page to read.
 * @param int    $timeout     Base transfer timeout in seconds; the cURL limit is $timeout + 5.
 * @param int    $conntimeout Seconds to wait for the connection (0 waits indefinitely).
 *
 * @return string|false Trimmed title text converted to UTF-8, '' when the page has
 *                      no <title>, or false when the request failed or returned nothing.
 */
function get_siteurl_curlinfo($url = '', $timeout = 5, $conntimeout = 3)
{
    if (!is_string($url) || $url === '') {
        return false;
    }

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    // No manual "host:" header: cURL derives Host from the URL (also after a
    // redirect), and the old explode('/', $url)[2] broke on scheme-less URLs.
    $header = array(
        'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Referer:' . $url,
        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'upgrade-insecure-requests:1',
    );

    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow "Location:" redirects
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body as a string
    // NOTE(review): certificate and host-name verification are switched off, as in
    // the original code. Titles read over HTTPS can therefore be spoofed.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    // Keep response headers OUT of the returned string, otherwise the charset
    // and title patterns below would scan header text as if it were HTML.
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_NOBODY, 0);
    curl_setopt($ch, CURLOPT_ENCODING, ''); // accept and decode any supported compression
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, (int) $conntimeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, (int) $timeout + 5);
    curl_setopt($ch, CURLOPT_URL, $url);

    $output = curl_exec($ch);
    $curl_info = curl_getinfo($ch);
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch); // has no effect since PHP 8.0, where the handle is an object
    }

    if (!is_string($output) || $output === '') {
        return false;
    }

    // Prefer the charset announced in the Content-Type header, then the <meta> tag.
    $charset = '';
    $charset_token = '/\bcharset\s*=\s*["\']?\s*([a-z0-9_.:-]+)/i';
    if (is_array($curl_info) && isset($curl_info['content_type']) && is_string($curl_info['content_type'])
        && preg_match($charset_token, $curl_info['content_type'], $found)) {
        $charset = strtolower($found[1]);
    } elseif (preg_match('/<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([a-z0-9_.:-]+)/i', $output, $found)) {
        $charset = strtolower($found[1]);
    }

    if (!in_array($charset, array('', 'utf-8', 'utf8'), true)) {
        // Convert exactly once. The charset name is remote input and may be
        // unknown to iconv; then iconv() returns false and the bytes are kept.
        $converted = @iconv($charset, 'UTF-8//IGNORE', $output);
        if (is_string($converted) && $converted !== '') {
            $output = $converted;
        }
    }

    // Lazy match with the "s" flag: stops at the first </title> and accepts
    // titles that span several lines or carry attributes on the tag.
    if (!preg_match('/<title\b[^>]*>(.*?)<\/title\s*>/is', $output, $found)) {
        return '';
    }

    return trim($found[1]);
}
实例:
/**
 * Builds a summary of a web page: title, meta tags, charset and two mimvp-specific flags.
 *
 * When $output is empty the page is fetched from $url and only the title is filled
 * in. Otherwise $output is converted to UTF-8 (charset from the Content-Type in
 * $curl_info, else from the page's <meta> declaration, else a GBK fallback when the
 * bytes are not valid UTF-8) and then parsed. $url is fetched only when it is
 * actually needed: for an empty $output, or when $output contains no <title>.
 *
 * NOTE(review): $url is handed to the compress.zlib:// stream wrapper unchanged, so
 * a local path is read as well as a remote URL. Validate it if it is user input.
 *
 * @param string|false $output    Raw page body, e.g. the result of curl_exec().
 * @param array        $curl_info Result of curl_getinfo(); only 'content_type' is read.
 * @param string       $url       Page address; stored in the result and used for fallback fetches.
 *
 * @return array Keys: url, site_title, site_description, site_keywords,
 *               friend_link_status (1 when the page contains 'https://proxy.mimvp.com'),
 *               site_claim_status (1 when a non-empty mimvp-site-verification meta exists),
 *               site_home_size (byte length of the UTF-8 body), charset, and - only when
 *               $output was non-empty - meta_array (lower-cased meta name => content).
 */
function get_page_info($output, $curl_info = array(), $url = '')
{
    $page_info = array(
        'url' => $url,
        'site_title' => '',
        'site_description' => '',
        'site_keywords' => '',
        'friend_link_status' => 0,
        'site_claim_status' => 0,
        'site_home_size' => 0,
        'charset' => '',
    );

    // Downloads $url once, inflating gzip bodies; '' when there is nothing to fetch.
    $fetch_page = function () use ($url) {
        if (!is_string($url) || $url === '') {
            return '';
        }
        // "@": a failed download is an expected outcome here, not an error.
        $body = @file_get_contents('compress.zlib://' . $url);
        return is_string($body) ? $body : '';
    };

    // Reads the charset declared by <meta charset=...> or <meta ... content="...; charset=...">.
    $sniff_charset = function ($html) {
        if (is_string($html)
            && preg_match('/<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([a-z0-9_.:-]+)/i', $html, $found)) {
            return $found[1];
        }
        return '';
    };

    // Converts $text to UTF-8; returns it unchanged when the charset is UTF-8, empty or unusable.
    $to_utf8 = function ($text, $charset) {
        $charset = trim((string) $charset);
        if (!is_string($text) || $text === ''
            || in_array(strtolower($charset), array('', 'utf-8', 'utf8'), true)) {
            return $text;
        }
        $converted = false;
        try {
            // The charset name is remote input: an unknown name warns on PHP 7
            // (hence "@") and throws ValueError on PHP 8.
            $converted = @mb_convert_encoding($text, 'UTF-8', $charset);
        } catch (\ValueError $e) {
            $converted = false;
        }
        if (!is_string($converted)) {
            $converted = @iconv($charset, 'UTF-8//IGNORE', $text);
        }
        return is_string($converted) ? $converted : $text;
    };

    // Returns the trimmed text of the first <title> element, or '' when there is none.
    $extract_title = function ($html) {
        if (is_string($html) && preg_match('/<\s*title\b[^>]*>(.*?)<\s*\/\s*title\s*>/is', $html, $found)) {
            return trim($found[1]);
        }
        return '';
    };

    // No body supplied: download the page and report only its title.
    if (empty($output)) {
        $contents = $fetch_page();
        $page_info['site_title'] = $extract_title($to_utf8($contents, $sniff_charset($contents)));
        return $page_info;
    }

    // Charset: the HTTP Content-Type header wins, then the page's own <meta> declaration.
    $meta_content_type = '';
    if (is_array($curl_info) && isset($curl_info['content_type']) && is_string($curl_info['content_type'])
        && preg_match('/\bcharset\s*=\s*["\']?\s*([a-z0-9_.:-]+)/i', $curl_info['content_type'], $found)) {
        $meta_content_type = $found[1];
    }
    if ($meta_content_type === '') {
        $meta_content_type = $sniff_charset($output);
    }

    $output = $to_utf8($output, $meta_content_type);

    // Still not valid UTF-8 (missing or wrong declaration): assume GBK.
    if (!mb_check_encoding($output, 'UTF-8')) {
        $output = $to_utf8($output, 'gbk');
    }

    $page_info['site_home_size'] = strlen($output);
    $page_info['charset'] = $meta_content_type;

    // Title: taken from the supplied body; only if that has none, try a fresh download.
    $title = $extract_title($output);
    if ($title === '') {
        $contents = $fetch_page();
        $fallback_charset = $meta_content_type !== '' ? $meta_content_type : $sniff_charset($contents);
        $title = $extract_title($to_utf8($contents, $fallback_charset));
    }
    $page_info['site_title'] = $title;

    // Meta tags: parse each tag's attributes, so attribute order, quote style and
    // extra attributes between name and content no longer matter.
    $meta_array = array('description' => '', 'keywords' => '');
    if (preg_match_all('/<meta\b[^>]*>/i', $output, $tags)) {
        foreach ($tags[0] as $tag) {
            $attribute_pattern = '/([a-z][\w:-]*)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/i';
            if (!preg_match_all($attribute_pattern, $tag, $pairs, PREG_SET_ORDER)) {
                continue;
            }
            $attributes = array();
            foreach ($pairs as $pair) {
                // Exactly one of groups 2-4 took part in the match; trailing unused groups are absent.
                if (isset($pair[4])) {
                    $value = $pair[4];
                } elseif (isset($pair[3])) {
                    $value = $pair[3];
                } else {
                    $value = $pair[2];
                }
                $attributes[strtolower($pair[1])] = $value;
            }
            if (!array_key_exists('content', $attributes)) {
                continue;
            }
            foreach (array('name', 'http-equiv', 'scheme') as $key_attribute) {
                if (isset($attributes[$key_attribute]) && $attributes[$key_attribute] !== '') {
                    $meta_array[strtolower($attributes[$key_attribute])] = $attributes['content'];
                }
            }
        }
    }

    $page_info['site_keywords'] = $meta_array['keywords'];
    $page_info['site_description'] = $meta_array['description'];
    $page_info['meta_array'] = $meta_array;

    // Site claimed: a non-empty <meta name="mimvp-site-verification" content="..."> is present.
    if (!empty($meta_array['mimvp-site-verification'])) {
        $page_info['site_claim_status'] = 1;
    }

    // Friend link: the page mentions the mimvp proxy site anywhere in its source.
    if (strpos($output, 'https://proxy.mimvp.com') !== false) {
        $page_info['friend_link_status'] = 1;
    }

    return $page_info;
}

浙公网安备 33010602011771号