PHP:根据 URL 获取网页标题

方法1:

/**
 * Fetch a page with file_get_contents() and return the text of its <title>.
 *
 * The URL is read through the "compress.zlib://" stream wrapper. Pages that
 * declare a gb2312 or BIG5 charset (in any letter case) are converted to
 * UTF-8 before the title is extracted; every other page is used as-is.
 *
 * NOTE(review): $url is handed straight to a stream wrapper, so a caller that
 * forwards untrusted input could make this read local files - validate the
 * scheme before calling.
 *
 * @param string $url Address of the page to inspect.
 * @return string|null|false The raw text between <title> and </title>,
 *                           null when the page has no <title> element,
 *                           or FALSE when nothing could be fetched.
 */
function get_siteurl_curlinfo($url='') {
    if (!is_string($url) || $url === '') {
        return FALSE;
    }

    $contents = @file_get_contents("compress.zlib://".$url);
    if (!$contents) {
        return FALSE;
    }

    // Capture only the charset token itself (quoted or not). The previous
    // greedy "(.*)>" pattern could swallow the rest of the line and was then
    // searched for encoding names, which misfired on single-line documents.
    $charset = '';
    if (preg_match('/charset\s*=\s*["\']?\s*([\w.:-]+)/i', $contents, $charsets)) {
        $charset = strtolower($charsets[1]);
    }

    // Declared charset label => name handed to iconv().
    $supported = array(
        'gb2312' => 'gb2312',
        'big5'   => 'BIG5',
    );
    if (isset($supported[$charset])) {
        $converted = @iconv($supported[$charset], "utf-8//IGNORE", $contents);
        // iconv() returns false on failure; keep the unconverted page then
        // instead of throwing the whole document away.
        if (is_string($converted) && $converted !== '') {
            $contents = $converted;
        }
    }

    // Lazy match with the "s" flag: stop at the first closing tag (pages with
    // inline SVG carry several <title> elements) and allow multi-line titles.
    if (preg_match('/<title(?:\s[^>]*)?>(.*?)<\/title>/is', $contents, $matches)) {
        return $matches[1];
    }

    return null;
}

 

方法2(使用 cURL;函数名与方法1相同,两者只能保留其一):

/**
 * Download a page with cURL and return the text of its <title> as UTF-8.
 *
 * The charset is taken from the HTTP Content-Type header when present,
 * otherwise from the first "charset=" declaration in the document. Pages
 * declared as gb2312, gbk, gb18030, big5 or Unicode are converted to UTF-8
 * with iconv() before the title is extracted; anything else is used as-is.
 *
 * NOTE(review): TLS peer and host verification are disabled below, as in the
 * original code, so the returned title must not be treated as authenticated.
 * NOTE(review): the URL scheme is not restricted here; validate it upstream
 * if $url can come from untrusted input.
 *
 * @param string $url         Address of the page to inspect.
 * @param int    $timeout     Base time budget in seconds; the whole transfer
 *                            is allowed $timeout + 5 seconds.
 * @param int    $conntimeout Seconds to wait while connecting (0 = no limit).
 * @return string|null The raw text between <title> and </title>, or null when
 *                     the download failed or the page has no <title>.
 */
function get_siteurl_curlinfo($url='', $timeout=5, $conntimeout=3) {
    if (!is_string($url) || $url === '') {
        return null;
    }

    // No explicit Host header: cURL derives it from the URL, and a hard-coded
    // one would also be sent to a different host after a redirect.
    $header = array(
        'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Referer:' . $url,
        'accept:  text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'upgrade-insecure-requests:1',
    );

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);      // follow "Location:" redirects
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);      // return the body as a string
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);  // https: certificate is not verified
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);  // https: host name is not verified
    curl_setopt($ch, CURLOPT_HEADER, 0);              // body only; the header charset is read via curl_getinfo()
    curl_setopt($ch, CURLOPT_NOBODY, 0);              // 0 = download the body as well
    curl_setopt($ch, CURLOPT_ENCODING, '');           // accept and transparently decode compressed responses
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $conntimeout); // seconds to wait while connecting, 0 = wait forever
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout+5);    // maximum seconds the whole request may take
    curl_setopt($ch, CURLOPT_URL, $url);

    $output = curl_exec($ch);
    $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    curl_close($ch);

    // curl_exec() returns false on failure; there is nothing to parse then.
    if (!is_string($output) || $output === '') {
        return null;
    }

    // The HTTP header wins over the in-document declaration.
    $charset_pattern = '/charset\s*=\s*["\']?\s*([\w.:-]+)/i';
    $charset = '';
    if (is_string($content_type) && preg_match($charset_pattern, $content_type, $found)) {
        $charset = strtolower($found[1]);
    } elseif (preg_match($charset_pattern, $output, $found)) {
        $charset = strtolower($found[1]);
    }

    // Lower-cased charset label => name handed to iconv(). One exact lookup
    // replaces the chain of case-sensitive strstr()/strpos() checks, which
    // skipped "<meta charset=gbk>" because strpos() returned 0 there.
    // gb2312 is decoded as GBK, its superset, as browsers do.
    $iconv_names = array(
        'gb2312'  => 'GBK',
        'gbk'     => 'GBK',
        'x-gbk'   => 'GBK',
        'gb18030' => 'GB18030',
        'big5'    => 'BIG5',
        'unicode' => 'Unicode',
    );
    if (isset($iconv_names[$charset])) {
        $converted = @iconv($iconv_names[$charset], "utf-8//IGNORE", $output);
        // Keep the unconverted page if iconv() fails rather than losing it.
        if (is_string($converted) && $converted !== '') {
            $output = $converted;
        }
    }

    // Lazy match with the "s" flag: first <title> only, multi-line allowed.
    if (preg_match('/<title(?:\s[^>]*)?>(.*?)<\/title>/is', $output, $matches)) {
        return $matches[1];
    }

    return null;
}

 

实例:

/**
 * Download $url through the "compress.zlib://" wrapper.
 *
 * @param string $url Address to fetch.
 * @return string The response body, or '' when $url is empty or the fetch failed.
 */
function _get_page_info_fetch($url) {
    if (!is_string($url) || $url === '') {
        return '';
    }
    $body = @file_get_contents("compress.zlib://".$url);
    return is_string($body) ? $body : '';
}

/**
 * Return the first "charset=..." token found in $text (quotes stripped), or ''.
 *
 * @param mixed $text Content-Type header value or HTML source.
 * @return string
 */
function _get_page_info_charset($text) {
    if (is_string($text) && preg_match('/charset\s*=\s*["\']?\s*([\w.:-]+)/i', $text, $found)) {
        return $found[1];
    }
    return '';
}

/**
 * Convert $text from $charset to UTF-8.
 *
 * The text is returned unchanged when $charset is empty, already UTF-8, or
 * not recognised by either mbstring or iconv.
 *
 * @param string $text    Bytes to convert.
 * @param string $charset Declared source charset.
 * @return string
 */
function _get_page_info_to_utf8($text, $charset) {
    $label = strtolower(trim((string) $charset));
    if (!is_string($text) || $text === '' || in_array($label, array('', 'utf-8', 'utf8'), true)) {
        return $text;
    }
    try {
        $converted = @mb_convert_encoding($text, 'UTF-8', $charset);
    } catch (\ValueError $e) {
        // PHP 8 throws for an unknown encoding name; older versions return false.
        $converted = false;
    }
    if (!is_string($converted)) {
        $converted = @iconv($charset, "utf-8//IGNORE", $text);
    }
    return is_string($converted) ? $converted : $text;
}

/**
 * Return the raw text of the first <title> element in $html, or ''.
 *
 * @param mixed $html HTML source.
 * @return string
 */
function _get_page_info_title($html) {
    if (is_string($html) && preg_match('/<\s*title(?:\s[^>]*)?>([\w\W]*?)<\/title\s*>/i', $html, $found)) {
        return $found[1];
    }
    return '';
}

/**
 * Collect title, charset, meta data and mimvp link/claim flags for a page.
 *
 * When $output is empty the page is fetched from $url and only the title is
 * filled in. Otherwise $output is converted to UTF-8 (charset taken from
 * $curl_info['content_type'], else from the document, else GBK is assumed
 * when the bytes are not valid UTF-8) and parsed. $url is fetched only as a
 * fallback when no title can be found in $output.
 *
 * @param string $output    HTML already downloaded by the caller; may be empty.
 * @param array  $curl_info Result of curl_getinfo(); only 'content_type' is read.
 * @param string $url       Address of the page; stored in the result and used
 *                          for the fallback fetch.
 * @return array Keys: url, site_title, site_description, site_keywords,
 *               friend_link_status, site_claim_status, site_home_size, charset,
 *               plus meta_array (lower-cased meta key => content) when $output
 *               was not empty.
 */
function get_page_info($output, $curl_info=array(), $url='') {
    $page_info = array();
    $page_info['url'] = $url;
    $page_info['site_title'] = '';
    $page_info['site_description'] = '';
    $page_info['site_keywords'] = '';
    $page_info['friend_link_status'] = 0;
    $page_info['site_claim_status'] = 0;
    $page_info['site_home_size'] = 0;
    $page_info['charset'] = '';

    if (empty($output)) {
        // The caller has no page body: fetch it here and report the title only.
        $contents = _get_page_info_fetch($url);
        $contents = _get_page_info_to_utf8($contents, _get_page_info_charset($contents));
        $page_info['site_title'] = _get_page_info_title($contents);
        return $page_info;
    }

    // Detect the page charset: HTTP header first, then the document itself.
    $meta_content_type = '';
    if (is_array($curl_info) && isset($curl_info['content_type'])) {
        $meta_content_type = _get_page_info_charset($curl_info['content_type']);
    }
    if ($meta_content_type === '') {
        $meta_content_type = _get_page_info_charset($output);
    }

    // Convert non UTF-8 pages so the extracted text is not garbled.
    $output = _get_page_info_to_utf8($output, $meta_content_type);

    // Still not valid UTF-8 (charset missing or wrong): assume GBK.
    if (!mb_check_encoding($output, 'UTF-8')) {
        $output = _get_page_info_to_utf8($output, 'gbk');
    }

    // Size in bytes of the page after conversion to UTF-8.
    $page_info['site_home_size'] = strlen($output);

    // Title
    $title = _get_page_info_title($output);
    if ($title === '') {
        // No usable title in $output: retry once with a fresh copy of the page
        // read through the zlib wrapper.
        $contents = _get_page_info_fetch($url);
        if ($contents !== '') {
            if ($meta_content_type === '') {
                $meta_content_type = _get_page_info_charset($contents);
            }
            $title = _get_page_info_title(_get_page_info_to_utf8($contents, $meta_content_type));
        }
    }
    $page_info['site_title'] = $title;
    $page_info['charset'] = $meta_content_type;

    // Collect every <meta> tag. Attributes may appear in any order and use
    // double, single or no quotes.
    $meta_array = array();
    $meta_array['description'] = '';
    $meta_array['keywords'] = '';
    $attr_pattern = '/([a-zA-Z_:][\w:.-]*)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/';
    preg_match_all('/<meta\b([^>]*)>/i', $output, $meta_tags);
    foreach ($meta_tags[1] as $meta_str) {
        $attrs = array();
        preg_match_all($attr_pattern, $meta_str, $pairs, PREG_SET_ORDER);
        foreach ($pairs as $pair) {
            // Trailing unmatched groups are omitted, so the last group present
            // is the one that captured the value.
            $value = isset($pair[4]) ? $pair[4] : (isset($pair[3]) ? $pair[3] : $pair[2]);
            $attrs[strtolower($pair[1])] = $value;
        }
        if (!isset($attrs['content'])) {
            continue;
        }
        foreach (array('name', 'http-equiv', 'scheme') as $key_attr) {
            if (isset($attrs[$key_attr]) && $attrs[$key_attr] !== '') {
                $meta_array[strtolower($attrs[$key_attr])] = $attrs['content'];
            }
        }
        // mimvp-site-verification: the site is claimed when the tag has content.
        if (isset($attrs['name'])
            && strtolower($attrs['name']) === 'mimvp-site-verification'
            && !empty($attrs['content'])) {
            $page_info['site_claim_status'] = 1;
        }
    }
    $page_info['site_keywords'] = $meta_array['keywords'];
    $page_info['site_description'] = $meta_array['description'];
    $page_info['meta_array'] = $meta_array;

    // Friend link: the page mentions the mimvp proxy site.
    if (strpos($output, 'https://proxy.mimvp.com') !== false) {
        $page_info['friend_link_status'] = 1;
    }

    return $page_info;
}

 

posted @ 2020-04-13 17:57  huihui2014  阅读(165)  评论(0)    收藏  举报