<?php
// Page scraper (original page title: 抓取页面)
namespace Doctor\Controller;
use Think\Controller;
class OldTrusteeshipController extends Controller {
    /** Page whose table rows are scraped. */
    const SOURCE_URL = 'http://www.wmdhw.com/it/top1000world.html';

    /** File that receives one "[name,content]" line per row (path is relative to the working directory). */
    const OUTPUT_FILE = './test.txt';

    /**
     * Scrape the table rows of SOURCE_URL.
     *
     * The page is downloaded, converted from gb2312 to utf-8 and split into
     * <tr> elements. For every row except the first one the action dumps the
     * first plain <td> cell (site rank) and the first link target (site URL),
     * then appends "[name,content]\r\n" to OUTPUT_FILE and dumps the result of
     * that write (bytes written, or false on failure).
     *
     * If the page cannot be fetched or converted, a message is dumped and the
     * action stops without touching the output file.
     *
     * @return void
     */
    public function index() {
        $contents = file_get_contents(self::SOURCE_URL);
        if ($contents === false) {
            dump('Unable to fetch ' . self::SOURCE_URL);
            return;
        }

        // Transcode the page to utf-8.
        $html = iconv("gb2312", "utf-8", $contents);
        if ($html === false) {
            dump('Unable to convert the page from gb2312 to utf-8');
            return;
        }

        // Collect every <tr>...</tr> element; nothing to do when none is found.
        if (!preg_match_all('/<tr.*?>(.*?)<\/tr>/ism', $html, $rows)) {
            return;
        }

        foreach ($rows[0] as $index => $rowHtml) {
            // The first row is skipped (presumably the table header; confirm against the page).
            if ($index < 1) {
                continue;
            }

            // Site rank: content of the first attribute-less <td>; null when the row has none.
            $rank = preg_match('/\<td\>(.+?)\<\/td\>/', $rowHtml, $rankMatch) === 1
                ? $rankMatch[1]
                : null;
            dump($rank);

            // First link in the row: href is the site URL, link text is the site name.
            $siteUrl = null;
            $siteName = '';
            if (preg_match('/<a href=\"(.*?)\".*?>(.*?)<\/a>/i', $rowHtml, $linkMatch) === 1) {
                $siteUrl = $linkMatch[1];
                $siteName = $linkMatch[2];
            }
            dump($siteUrl);

            // Site content: first capture of the "<td ...>" pattern; empty when the row has none.
            $content = preg_match('/\<td.+?\>(.+?)\<\/td\>/', $rowHtml, $contentMatch) === 1
                ? $contentMatch[1]
                : '';
            // Drop spaces.
            $content = str_replace(' ', '', $content);
            // Drop every byte in 0x80-0xff. This removes the Chinese text, and with it
            // any other non-ASCII character of the utf-8 string.
            $content = preg_replace('/[\x80-\xff]+/', '', $content);

            $line = '[' . $siteName . ',' . $content . ']' . "\r\n";
            // Append the line and dump the write result.
            dump(file_put_contents(self::OUTPUT_FILE, $line, FILE_APPEND));
        }
    }
}
// 浙公网安备 33010602011771号 (public-security filing number from the footer of the source web page; not part of the program)