php中实现的一个curl批处理的实例

curl是利用URL语法在命令行方式下工作的开源文件传输工具

本文在 PHP 中实现了一个 curl 批处理的实例。

代码如下:

  header("Content-Type: text/html; charset=utf-8");

  /**
   * Download several URLs concurrently through a single curl multi handle.
   *
   * Transfers are restricted to http/https because the second batch of URLs is
   * scraped from remote pages and must not be able to reach file:// and similar
   * schemes. Timeouts keep one stalled server from blocking the whole batch.
   *
   * @param array $urls URLs to fetch; the array keys are preserved in the result.
   * @return array Response bodies keyed like $urls; a failed transfer yields ''.
   */
  function fetchAll(array $urls)
  {
      $bodies = array();
      if (!$urls) {
          return $bodies;
      }

      $multi = curl_multi_init();
      $handles = array();
      foreach ($urls as $key => $url) {
          $easy = curl_init();
          curl_setopt_array($easy, array(
              CURLOPT_URL => $url,
              CURLOPT_HEADER => 0,
              CURLOPT_RETURNTRANSFER => 1,
              CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
              CURLOPT_CONNECTTIMEOUT => 10,
              CURLOPT_TIMEOUT => 30,
          ));
          curl_multi_add_handle($multi, $easy);
          $handles[$key] = $easy;
      }

      $active = 0;
      while (true) {
          // Older libcurl builds ask to be called again immediately.
          do {
              $status = curl_multi_exec($multi, $active);
          } while ($status === CURLM_CALL_MULTI_PERFORM);

          if ($status !== CURLM_OK || !$active) {
              break;
          }

          // Block until a socket is ready. -1 means select could not wait
          // (common on some builds); sleep briefly instead of spinning the CPU.
          if (curl_multi_select($multi, 1.0) === -1) {
              usleep(10000);
          }
      }

      foreach ($handles as $key => $easy) {
          $bodies[$key] = (string) curl_multi_getcontent($easy);
          curl_multi_remove_handle($multi, $easy);
      }
      curl_multi_close($multi);

      return $bodies;
  }

  /**
   * Convert a text fragment to UTF-8 when it is detected as a Chinese legacy encoding.
   *
   * UTF-8 is tried first so that text which is already UTF-8 is left untouched;
   * text whose encoding cannot be determined is returned unchanged.
   *
   * @param string $text Raw bytes taken from a downloaded page.
   * @return string The text as UTF-8, or the input when no conversion applies.
   */
  function toUtf8($text)
  {
      $encoding = mb_detect_encoding($text, array('UTF-8', 'GB18030', 'BIG-5'), true);
      if ($encoding === false || $encoding === 'UTF-8') {
          return $text;
      }

      // Convert with mbstring so the detected encoding name is always understood.
      return mb_convert_encoding($text, 'UTF-8', $encoding);
  }

  /* Step 1: fetch the two portal pages and collect the href of every <a> tag. */
  $text = implode('', fetchAll(array(
      'http://www.sina.com.cn',
      'http://www.baidu.com/',
  )));

  // Group 1 is the href value, group 2 the link text. Case-insensitive, and the
  // link text may span several lines.
  $matches = array();
  preg_match_all(
      '/<a\b[^>]*?\shref\s*=\s*[\'"]([^\'"]*)[\'"][^>]*>(.*?)<\/a>/is',
      $text,
      $matches
  );

  // Keep only absolute http(s) links (curl cannot fetch relative or javascript:
  // links) and drop duplicates so they do not use up the 100-page quota.
  $links = array();
  foreach ($matches[1] as $href) {
      $href = trim($href);
      if (strpos($href, '//') === 0) {
          $href = 'http:' . $href; // protocol-relative link
      }
      if (preg_match('#^https?://#i', $href)) {
          $links[$href] = true;
      }
  }
  $links = array_slice(array_keys($links), 0, 100);

  /* Step 2: fetch up to 100 of the linked pages and extract each page's <title>. */
  $titles = array();
  foreach (fetchAll($links) as $html) {
      // Match per page so an unclosed <title> cannot run into the next document.
      if (preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $html, $found)) {
          $titles[] = toUtf8(trim($found[1]));
      }
  }

  // Print the collected titles.
  var_dump($titles);

 

posted @ 2016-04-24 17:05  猿客  Views(1417)  Comments(2)  Edit  收藏  举报