<?php
// Collector, stage 1: download the listing page, isolate the <ul> that holds
// the article links, and dump the second <li> plus the fields parsed from it.
// Every step that can fail (download, region match, item lookup) stops the
// script with a message instead of passing false/undefined values onward.

// 1. Address of the listing page to collect from.
$url = 'http://www.zgjiemeng.com/dongwu/';

// 2. Download the listing page. file_get_contents() returns false on failure,
//    which must not be handed to the regex functions below.
$str = file_get_contents($url);
if ($str === false) {
    exit('Failed to download listing page: ' . $url);
}

// Tell the browser to treat the dumped output as UTF-8.
echo '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';

// 3. Regex for the region that holds the article links.
$pattern_qu = '/<ul class=\"list2 clearfix\">([\S\s]*?)<\/ul>/';

// 4. Extract that region. preg_match() returns 1 only when it matched; on 0 or
//    false, $match_url has no index 1.
if (preg_match($pattern_qu, $str, $match_url) !== 1) {
    exit('Article list region not found in: ' . $url);
}

// Split the region into its <li> items (U makes the quantifier ungreedy).
preg_match_all("/<li>(.*)<\/li>/U", $match_url[1], $li_matches);

// Only the second item (index 1) is inspected; make sure it exists first.
if (!isset($li_matches[0][1])) {
    exit('Expected at least two <li> items in the article list.');
}
print_r($li_matches[0][1]);

// Parse that item into title attribute, href attribute and link text.
preg_match_all ("/<li><a target=\"_blank\" title=\"(.*)\" href=\"(.*)\">(.*)<\/a><\/li>/U", $li_matches[0][1], $pat_array);
print_r($pat_array);

// Debugging stop: the script ends here, so any statements below are not executed.
die;
// NOTE(review): this section is unreachable while the bare `die;` directly
// above it remains in place; it looks like leftover debugging code.
// Re-applies the list-region regex to the region captured earlier and
// overwrites $match_url with the result, then dumps it and stops.
// NOTE(review): the region was captured lazily up to the first </ul>, so it
// should not itself contain </ul>; this match therefore looks unable to
// succeed, leaving $match_url[1] unset for step 6 below. Confirm intent.
preg_match($pattern_qu,$match_url[1],$match_url);
var_dump($match_url);
die;
//5. Regex that matches an article link and captures its href value
$pattern_url = '/<a\s+href=\"(.*?)\"\s+title/S';
//6. Collect every article link address found in the captured region
preg_match_all($pattern_url,$match_url[1],$match);
// Dump the collected links, then stop (another debugging halt).
var_dump($match);
die;
// Collector, stage 2: fetch every article address in $match[1], extract its
// <title> element and body fragment, and save them as ./collect/<n>.html.
// NOTE(review): currently unreachable because of the `die;` statements earlier
// in the script.
$num = 1;

// file_put_contents() does not create missing directories, so make sure the
// output directory exists before the loop starts.
if (!is_dir('./collect/') && !mkdir('./collect/', 0777, true) && !is_dir('./collect/')) {
    exit('Unable to create output directory ./collect/');
}

// 7.2 Regex for the article body: everything between the empty "ad" div and
//     the "supports" span (s lets . match newlines).
$con_pattern = '/<div\s+class=\"ad\"><\/div>(.*?)<span\s+id=\"supports\"\s+class=\"praise\"/Ss';
// 7.3 Regex for the article title element.
$title_pattern = '/<title>(.*?)<\/title>/Ss';

// 7. Walk over every article address that was matched.
foreach ($match[1] as $v) {
    // The file number follows the link's position, even when a link is
    // skipped, exactly as the counter advanced on every iteration before.
    $target = './collect/'.$num.'.html';
    $num ++;

    // 7.1 Download the article page; skip it when the download fails.
    $content = file_get_contents($v);
    if ($content === false) {
        error_log('Collector: failed to download ' . $v);
        continue;
    }

    // 7.4 / 7.5 Match body and title; skip pages where either is missing so
    //           no undefined index is read below.
    if (preg_match($con_pattern, $content, $newCon) !== 1
        || preg_match($title_pattern, $content, $newTitle) !== 1) {
        error_log('Collector: unexpected page layout at ' . $v);
        continue;
    }

    // 7.6 Assemble the output: full <title> element, charset declaration, body.
    $newStr = $newTitle[0].'<meta charset="utf-8" />'.$newCon[1];

    // 7.7 Save it and report a failed write instead of ignoring it.
    if (file_put_contents($target, $newStr) === false) {
        error_log('Collector: failed to write ' . $target);
    }
}