PHP 采集常用代码
/**
 * Fetch a URL with cURL and return the response body.
 *
 * The request is sent with a Baiduspider user-agent string and a
 * 10-second connect timeout.
 *
 * @param string $url  Address to request.
 * @param bool   $gzip When true, ask the server for a gzip-compressed
 *                     response and let cURL decode it transparently.
 *
 * @return string|false Response body, or false when the handle could not
 *                      be created or the transfer failed.
 */
function curl_get($url, $gzip=false){
    $curl = curl_init($url);
    if ($curl === false) {
        // Without a handle every curl_setopt() call below would fail;
        // report the problem the same way a failed transfer is reported.
        return false;
    }

    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($curl,CURLOPT_USERAGENT,"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");

    if ($gzip) {
        // Sends "Accept-Encoding: gzip" and enables automatic decoding,
        // so the caller always receives the plain body.
        curl_setopt($curl, CURLOPT_ENCODING, "gzip");
    }

    $content = curl_exec($curl);
    curl_close($curl);

    return $content;
}
/**
 * Return the trimmed text found between two markers.
 *
 * The search starts at the first occurrence of $before; the result ends at
 * the first occurrence of $after that follows it.
 *
 * @param string $before Marker that precedes the wanted text.
 * @param string $after  Marker that follows the wanted text.
 * @param string $str    Text to search.
 *
 * @return string The trimmed text between the markers, or '' when either
 *                marker is empty or cannot be found.
 */
function get_middle($before,$after,$str)/*{{{*/
{
    $before = (string) $before;
    $after = (string) $after;
    $str = (string) $str;

    // An empty marker cannot delimit anything (and explode('') throws a
    // ValueError on PHP 8), so treat it as "not found".
    if ($before === '' || $after === '') {
        return '';
    }

    $start = strpos($str, $before);
    if ($start === false) {
        return '';
    }
    $start += strlen($before);

    // Look for the end marker in everything after the first start marker,
    // not only up to the next occurrence of the start marker.
    $end = strpos($str, $after, $start);
    if ($end === false) {
        return '';
    }

    return trim(substr($str, $start, $end - $start));
}/*}}}*/
用法
// Download the page, then pull out its title.
// NOTE(review): $url is not assigned anywhere in this snippet — it must be set before this line.
$html=curl_get($url);
$title=get_middle('<title>','</title>',$html); // extract the text between <title> and </title>
一些常用的过滤代码
// Remove every <svg ...>...</svg> element, tags and contents included.
// Modifiers: i = case-insensitive, s = "." also matches newlines.
$content= preg_replace( "@<svg(.*?)</svg>@is", "", $content ); // replace each svg element with an empty string
提取某个变量
<a href="/cat/2546">我是大侠</a>
// Collect every <a href="/cat/...">...</a> link in $html.
// Group 1 is the part of the href after "/cat/"; group 2 is the link text.
// Modifiers: i = case-insensitive, s = "." also matches newlines, U = quantifiers are ungreedy by default.
preg_match_all("|<a href=\"/cat/(.*)\">(.*)</a>|isU",$html,$daijiejue);
$c1=$daijiejue[1][0]; // first match, group 1: "2546" for the sample link (the "/cat/" prefix lies outside the capture group)
$c2=$daijiejue[2][0]; // first match, group 2: the link text, "我是大侠" for the sample link
浙公网安备 33010602011771号