抓取页面

<?php

namespace Doctor\Controller;

use Think\Controller;

class OldTrusteeshipController extends Controller {
    /**
     * Scrape a fixed ranking page and append one "[name,value]" line per table row
     * to ./test.txt.
     *
     * Steps:
     *  1. Download the page and convert it from GB2312 to UTF-8.
     *  2. Extract every <tr>...</tr> fragment and skip the first one.
     *  3. For each remaining row, dump the text of the first attribute-less <td>
     *     and the href of the first link, then append
     *     "[<link text>,<first attributed <td> text, spaces and non-ASCII bytes removed>]"
     *     followed by CRLF to ./test.txt.
     *
     * Diagnostic output goes through dump(), which is not defined in this file
     * (presumably the framework's debug helper). Nothing is returned.
     * Rows missing an expected element yield empty fields instead of raising
     * undefined-index notices.
     */
    public function index() {
        $url = 'http://www.wmdhw.com/it/top1000world.html';

        $contents = file_get_contents($url);
        if ($contents === false) {
            // Download failed: there is nothing to parse.
            dump('Failed to fetch ' . $url);
            return;
        }

        // Transcode the page from GB2312 to UTF-8.
        $getcontent = iconv("gb2312", "utf-8", $contents);
        if ($getcontent === false) {
            // iconv() returns false when the input contains an illegal sequence.
            dump('Failed to convert page content from gb2312 to utf-8');
            return;
        }

        // Collect every <tr>...</tr> fragment of the document.
        if (!preg_match_all('/<tr.*?>(.*?)<\/tr>/ism', $getcontent, $matchall)) {
            // false (regex error) or 0 (no rows): nothing to do.
            return;
        }

        foreach ($matchall[0] as $k => $td) {
            // The first matched row is skipped (NOTE(review): presumably the table header).
            if ($k === 0) {
                continue;
            }

            // First <td> without attributes: the site rank according to the original author.
            preg_match_all('/\<td\>(.+?)\<\/td\>/', $td, $plainCells);
            dump(isset($plainCells[1][0]) ? $plainCells[1][0] : null);

            // First link in the row: group 1 is the href (site), group 2 the link text (name).
            preg_match_all('/<a href=\"(.*?)\".*?>(.*?)<\/a>/i', $td, $matches);
            dump(isset($matches[1][0]) ? $matches[1][0] : null);
            $s = isset($matches[2][0]) ? $matches[2][0] : '';

            // First <td> that carries attributes: the site content/description cell.
            preg_match_all('/\<td.+?\>(.+?)\<\/td\>/', $td, $attrCells);
            $a = isset($attrCells[1][0]) ? $attrCells[1][0] : '';
            $a = str_replace(' ', '', $a);                  // strip spaces
            $a = preg_replace('/[\x80-\xff]+/', '', $a);    // strip non-ASCII bytes (e.g. Chinese text)

            $da = '[' . $s . ',' . $a . ']' . "\r\n";

            // Append the record; LOCK_EX keeps concurrent requests from interleaving lines.
            dump(file_put_contents("./test.txt", $da, FILE_APPEND | LOCK_EX));
        }
    }
}

posted @ 2017-10-09 11:24  让双脚&去腾空  阅读(143)  评论(0)    收藏  举报