2个爬虫

<?php
namespace Util\data;
use Util\data\DbUtil;

/**
 * Crawler for the ifeng.com rolling-news list page.
 *
 * Workflow (see update_m()):
 *   1. download the news list page;
 *   2. collect the article links found in its "即时新闻" section;
 *   3. download every article and extract title, body and picture;
 *   4. hand the collected rows to do_sql(), which stores a random sample
 *      of them in the `news` table.
 */
class Index_m
{
  /** Placeholder picture stored when an article has no usable image. */
  const DEFAULT_PIC = 'https://images2018.cnblogs.com/blog/1395514/201805/1395514-20180513105955459-1441660792.jpg';

  /** Maximum number of article links read from the list page. */
  const MAX_LINKS = 20;

  /** Maximum number of randomly chosen articles written per run. */
  const SAMPLE_SIZE = 6;

  /**
   * Crawl the list page and store a random sample of its articles.
   *
   * Articles are skipped when the download fails, when the page is a
   * picture gallery, when no body markers are found, or when the title or
   * body is too short (byte lengths, as measured by strlen()).
   *
   * @return array array($total, $inserted, $duplicates) as returned by do_sql()
   */
  public function update_m(){
    $url = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml';
    $curl = $this->curl($url);

    // URLs of the individual articles
    $urls = $this->geturls($curl);

    $news = [];
    foreach($urls as $value){
      $new_curl = $this->curl($value);
      // Download failed or returned nothing: there is nothing to parse.
      if(!is_string($new_curl) || $new_curl === ''){
        continue;
      }
      // Picture galleries have no article body; skip them.
      if(substr_count($new_curl, 'picBoxPrev')>0){
        echo '图集被删除';
        continue;
      }

      $title = $this->extractTitle($new_curl);

      // The body is looked up fresh for every article, so a page without
      // body markers can never inherit the previous article's text.
      $body = $this->extractBody($new_curl);
      if($body === null){
        continue;
      }

      $img = $this->extractImage($body);

      // Drop stubs: very short bodies or titles are not real articles.
      if(strlen($body)<100||strlen($title)<20){
        continue;
      }

      $news[] = array(
        'news_title'  => trim($title),
        'news_body'   => trim($body),
        'news_pic'    => trim($img),
        // NOTE(review): the key is spelled "autuor"; presumably it matches
        // the column name in the `news` table, so it is left untouched.
        'news_autuor' => '新闻网',
      );
    }
    return $this->do_sql($news);
  }

  /**
   * Extract the article title from the <title> tag, without the site suffix.
   *
   * @param string $html article page
   * @return string title, '' when no <title> element is found
   */
  private function extractTitle($html){
    $title = $this->getKeyWord($html,'<title>','</title>')[0];
    // Compare against false explicitly: a position is not a boolean.
    $x = strpos($title,'_凤凰');
    if($x !== false){
      $title = substr($title,0,$x);
    }
    return $title;
  }

  /**
   * Extract the article body located between one of the known begin
   * markers and the ifeng logo that closes the article.
   *
   * @param string $html article page
   * @return string|null body HTML, or null when the markers are missing
   */
  private function extractBody($html){
    $end = '<span class="ifengLogo"><a';
    // The same marker is used for detection and extraction.
    foreach(array('<!--mainContent begin-->','<!-- 正文begin -->') as $begin){
      if(substr_count($html,$begin)>0){
        $found = $this->getKeyWord($html,$begin,$end);
        return $found[1] === false ? null : $found[0];
      }
    }
    return null;
  }

  /**
   * Pick the first image of the article body.
   *
   * The URL is cut right after a gif/jpeg/jpg extension (dropping anything
   * that follows it). URLs containing Chinese characters, empty URLs and
   * bodies without an image yield the placeholder picture.
   *
   * @param string $body article body HTML
   * @return string image URL
   */
  private function extractImage($body){
    if(substr_count($body,'<img src="') === 0){
      return self::DEFAULT_PIC;
    }
    // The attribute value ends at the next double quote, whatever follows.
    $found = $this->getKeyWord($body,'<img src="','"');
    if($found[1] === false || $found[0] === ''){
      return self::DEFAULT_PIC;
    }
    $img = $found[0];

    // Same precedence as before: gif, then jpeg, then jpg.
    foreach(array('gif','jpeg','jpg') as $ext){
      $index = strpos($img,$ext);
      if($index !== false){
        $img = substr($img,0,$index+strlen($ext));
        break;
      }
    }

    if(preg_match('/[\x{4e00}-\x{9fa5}]/u', $img)>0){
      return self::DEFAULT_PIC;
    }
    return $img;
  }

  /**
   * Write a random sample of the collected news into the `news` table.
   *
   * At most SAMPLE_SIZE rows are picked. A row whose title already exists
   * in the table is counted as a duplicate and not inserted. The title of
   * every inserted (or attempted) row is printed.
   *
   * @param array $news list of rows (news_title, news_body, news_pic, news_autuor)
   * @return array array($total, $inserted, $duplicates)
   */
  public function do_sql($news){
    // rows examined
    $sum = 0;
    // rows inserted
    $succ = 0;
    // rows skipped because the title already exists
    $ready = 0;

    if(!is_array($news) || count($news) === 0){
      return array($sum,$succ,$ready);
    }

    // array_rand() refuses to pick more elements than exist and returns a
    // bare key (not an array) when asked for exactly one.
    $picked = (array) array_rand($news, min(self::SAMPLE_SIZE, count($news)));

    foreach($picked as $k){
      $row = $news[$k];
      $sum++;
      // de-duplicate by title
      if(DbUtil::getdb()->table('news')->where(array('news_title'=>$row['news_title']))->count()>0){
        $ready++;
        continue;
      }
      if(DbUtil::getdb()->table('news')->insert($row)){
        $succ++;
      }
      print_r($row['news_title']);
    }
    return array($sum,$succ,$ready);
  }

  /**
   * Collect the article links of the "即时新闻" section of the list page.
   *
   * @param string $curl HTML of the list page
   * @return array list of article URLs (at most MAX_LINKS, possibly empty)
   */
  public function geturls($curl){
    $urls = [];
    if(!is_string($curl) || $curl === ''){
      return $urls;
    }
    $section = $this->getKeyWord($curl,'<h2>即时新闻</h2>','<div class="clear"></div>');
    if($section[1] === false){
      return $urls;
    }

    $index = 0;
    for($x = 0;$x<self::MAX_LINKS;$x++){
      $arr = $this->getKeyWord($section[0],'<a href="','" target="_blank">',$index);
      // No further link: stop instead of re-reading from the beginning.
      if($arr[1] === false){
        break;
      }
      if($arr[0] !== ''){
        $urls[] = $arr[0];
      }
      $index = $arr[1];
    }
    return $urls;
  }

  /**
   * Cut out the text between two marker strings (crawler helper).
   *
   * @param string $info      text to search (e.g. a web page)
   * @param string $first_key marker that precedes the wanted text
   * @param string $last_key  marker that follows the wanted text
   * @param int    $index     offset at which the search starts (optional)
   * @return array array($keyword, $last_key_start): the trimmed text between
   *               the markers and the offset of the end marker, so a caller
   *               can continue searching from there. When either marker is
   *               missing the result is array('', false).
   */
  public function getKeyWord($info,$first_key,$last_key,$index = 0){
    $not_found = array('', false);
    if(!is_string($info)){
      return $not_found;
    }
    $index = (int)$index;
    // strpos() rejects offsets outside the string.
    if(abs($index) > strlen($info)){
      return $not_found;
    }

    $first_key_start = strpos($info,$first_key,$index);
    if($first_key_start === false){
      return $not_found;
    }
    // The end marker is searched after the start marker, never inside it.
    $content_start = $first_key_start + strlen($first_key);
    $last_key_start = strpos($info,$last_key,$content_start);
    if($last_key_start === false){
      return $not_found;
    }

    $keyword = trim(substr($info,$content_start,$last_key_start-$content_start));
    return array($keyword,$last_key_start);
  }

  /**
   * Download a URL with cURL and return the response body.
   *
   * @param string $url    address to fetch
   * @param string $coding character set of the page; anything other than
   *                       'utf-8' is converted to UTF-8 with iconv()
   * @return string|false response body, or false when the request failed
   */
  public function curl($url,$coding='utf-8') {
    $ch = curl_init();
    if($ch === false){
      return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);         // do not include response headers
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it

    $result = curl_exec($ch);
    // release the cURL handle
    curl_close($ch);

    // Report a failed transfer as-is rather than feeding it to iconv().
    if($result === false){
      return false;
    }
    // pages that are not UTF-8 have to be converted
    if($coding!='utf-8'){
      $result = iconv($coding,"utf-8//IGNORE",$result);
    }
    return $result;
  }

}
?>
posted @ 2018-05-13 15:44 cl94 阅读(139) 评论(0) 收藏举报
刷新页面返回顶部
cl94

2个爬虫

公告