<?php
namespace Util\data;
use Util\data\DbUtil;
/**
 * Crawler model for the ifeng.com real-time news list.
 *
 * Workflow of update_m():
 *   1. download the news list page;
 *   2. collect the article links found on it;
 *   3. download every article and cut out its title, body and first picture;
 *   4. write a random sample of the parsed articles to the `news` table.
 */
class Index_m
{
    /** List page that is crawled for article links. */
    const LIST_URL = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml';

    /** Picture stored when an article has no usable image URL. */
    const DEFAULT_IMAGE = 'https://images2018.cnblogs.com/blog/1395514/201805/1395514-20180513105955459-1441660792.jpg';

    /** Upper bound on the number of links taken from the list page. */
    const MAX_LINKS = 20;

    /** Number of parsed articles that do_sql() samples for insertion. */
    const SAMPLE_SIZE = 6;

    /** Articles whose body is shorter than this many bytes are dropped. */
    const MIN_BODY_BYTES = 100;

    /** Articles whose title is shorter than this many bytes are dropped. */
    const MIN_TITLE_BYTES = 20;

    /** Seconds allowed for establishing a connection. */
    const CONNECT_TIMEOUT = 10;

    /** Seconds allowed for a whole HTTP transfer. */
    const TOTAL_TIMEOUT = 30;

    /**
     * Crawls the list page, parses every linked article and stores a sample.
     *
     * Articles that cannot be downloaded, that are picture galleries, or
     * whose title/body are too short are skipped.
     *
     * @return array [total processed, inserted, duplicates] as returned by do_sql()
     */
    public function update_m()
    {
        $listHtml = $this->curl(self::LIST_URL);

        $news = [];
        foreach ($this->geturls($listHtml) as $link) {
            $html = $this->curl($link);
            // A failed download must not be parsed as if it were a page.
            if (!is_string($html) || $html === '') {
                continue;
            }
            // Picture galleries carry no article text.
            if (substr_count($html, 'picBoxPrev') > 0) {
                echo '图集被删除';
                continue;
            }
            $row = $this->parseArticle($html);
            if ($row !== null) {
                $news[] = $row;
            }
        }

        return $this->do_sql($news);
    }

    /**
     * Cuts title, body and picture out of one article page.
     *
     * @param string $html article page source
     * @return array|null row for the `news` table, or null when the article
     *                    has no recognised body or is too short
     */
    private function parseArticle($html)
    {
        $title = $this->getKeyWord($html, '<title>', '</title>')[0];
        // Drop the site suffix that follows the headline in <title>.
        $suffixAt = strpos($title, '_凤凰');
        if ($suffixAt !== false) {
            $title = substr($title, 0, $suffixAt);
        }

        // Start from an empty body on every article so that a page with
        // neither marker can never inherit text from a previous article.
        $body = '';
        foreach (['<!--mainContent begin-->', '<!-- 正文begin -->'] as $marker) {
            if (substr_count($html, $marker) > 0) {
                // Extract with exactly the marker that was detected.
                $body = $this->getKeyWord($html, $marker, '<span class="ifengLogo"><a')[0];
                break;
            }
        }

        // Lengths are byte counts (strlen), not character counts.
        if (strlen($body) < self::MIN_BODY_BYTES || strlen($title) < self::MIN_TITLE_BYTES) {
            return null;
        }

        return [
            'news_title' => trim($title),
            'news_body' => trim($body),
            'news_pic' => trim($this->extractImage($body)),
            // NOTE(review): 'news_autuor' looks like a typo of "author", but it
            // is used as the row key passed to insert(); left unchanged.
            'news_autuor' => '新闻网',
        ];
    }

    /**
     * Returns the URL of the first picture in an article body.
     *
     * The URL is cut right after the first "gif", "jpeg" or "jpg" it contains
     * (checked in that order). The default picture is returned when the body
     * has no image, the URL cannot be extracted, or it contains characters in
     * the U+4E00..U+9FA5 range.
     *
     * @param string $body article body HTML
     * @return string picture URL
     */
    private function extractImage($body)
    {
        if (substr_count($body, '<img src="') === 0) {
            return self::DEFAULT_IMAGE;
        }

        $img = $this->getKeyWord($body, '<img src="', '">')[0];
        foreach (['gif', 'jpeg', 'jpg'] as $extension) {
            $at = strpos($img, $extension);
            if ($at !== false) {
                $img = substr($img, 0, $at + strlen($extension));
                break;
            }
        }

        if ($img === '' || preg_match('/[\x{4e00}-\x{9fa5}]/u', $img) > 0) {
            return self::DEFAULT_IMAGE;
        }

        return $img;
    }

    /**
     * Writes a random sample of the given articles to the `news` table.
     *
     * At most SAMPLE_SIZE rows are picked; fewer when fewer are supplied.
     * A row is skipped when a row with the same news_title already exists.
     * The title of every row passed to insert() is printed.
     *
     * @param array $news rows as built by update_m()
     * @return array [$sum, $succ, $ready]: rows processed, rows inserted,
     *               rows skipped as duplicates
     */
    public function do_sql($news)
    {
        $sum = 0;   // rows processed
        $succ = 0;  // rows inserted
        $ready = 0; // rows already present

        if (!is_array($news) || count($news) === 0) {
            return array($sum, $succ, $ready);
        }

        // array_rand() rejects a sample larger than the array and returns a
        // single key (not an array) when asked for one element.
        $take = min(self::SAMPLE_SIZE, count($news));
        $pickedKeys = (array) array_rand($news, $take);

        foreach ($pickedKeys as $pickedKey) {
            $row = $news[$pickedKey];
            $sum++;

            // De-duplicate on the title.
            $existing = DbUtil::getdb()->table('news')
                ->where(array('news_title' => $row['news_title']))
                ->count();
            if ($existing > 0) {
                $ready++;
                continue;
            }

            if (DbUtil::getdb()->table('news')->insert($row)) {
                $succ++;
            }
            print_r($row['news_title']);
        }

        return array($sum, $succ, $ready);
    }

    /**
     * Collects the article links of the real-time news section.
     *
     * Only the part of the page between the section heading and the next
     * clear-div is scanned. Scanning stops at the first position where no
     * further link is found, or after MAX_LINKS links.
     *
     * @param string|false $curl list page source as returned by curl()
     * @return array list of link URLs (possibly empty)
     */
    public function geturls($curl)
    {
        $urls = [];
        $section = $this->getKeyWord($curl, '<h2>即时新闻</h2>', '<div class="clear"></div>')[0];

        $offset = 0;
        for ($i = 0; $i < self::MAX_LINKS; $i++) {
            $found = $this->getKeyWord($section, '<a href="', '" target="_blank">', $offset);
            if ($found[1] === false) {
                // No more links; continuing would restart from the beginning.
                break;
            }
            if ($found[0] !== '') {
                $urls[] = $found[0];
            }
            $offset = $found[1];
        }

        return $urls;
    }

    /**
     * Extracts the text between two delimiters.
     *
     * @param string|false $info      text to search, e.g. a page source
     * @param string       $first_key delimiter that precedes the wanted text
     * @param string       $last_key  delimiter that follows the wanted text
     * @param int          $index     byte offset at which the search starts (optional)
     * @return array [trimmed text, byte offset of $last_key]; the offset can be
     *               passed back as $index to continue scanning. ['', false] when
     *               $info is not a non-empty string, $index is out of range, or
     *               either delimiter is missing.
     */
    public function getKeyWord($info, $first_key, $last_key, $index = 0)
    {
        $notFound = array('', false);

        if (!is_string($info) || $info === '') {
            return $notFound;
        }
        $offset = (int) $index;
        if ($offset < 0 || $offset > strlen($info)) {
            return $notFound;
        }

        $firstKeyStart = strpos($info, $first_key, $offset);
        if ($firstKeyStart === false) {
            return $notFound;
        }

        // Search for the end delimiter only after the start delimiter, so the
        // two can never overlap and the length below is never negative.
        $contentStart = $firstKeyStart + strlen($first_key);
        $lastKeyStart = strpos($info, $last_key, $contentStart);
        if ($lastKeyStart === false) {
            return $notFound;
        }

        $keyword = trim(substr($info, $contentStart, $lastKeyStart - $contentStart));

        return array($keyword, $lastKeyStart);
    }

    /**
     * Downloads a URL with cURL and returns the response body.
     *
     * @param string $url    address to fetch
     * @param string $coding charset of the page; anything other than UTF-8
     *                       (case-insensitive) is converted to UTF-8 with iconv
     * @return string|false response body, or false when the transfer or the
     *                      charset conversion fails
     */
    public function curl($url, $coding = 'utf-8')
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);         // body only, no response headers
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
        // Bound the request so one stalled server cannot block the whole crawl.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, self::CONNECT_TIMEOUT);
        curl_setopt($ch, CURLOPT_TIMEOUT, self::TOTAL_TIMEOUT);

        $result = curl_exec($ch);
        curl_close($ch);

        if ($result === false) {
            return false;
        }

        // Pages that are not UTF-8 are transcoded; unconvertible bytes are dropped.
        if (strcasecmp($coding, 'utf-8') !== 0) {
            $result = iconv($coding, "utf-8//IGNORE", $result);
        }

        return $result;
    }
}
?>