php curl 多线程

PHP curl_multi 并发请求控制(说明:下面第一段代码用于批量查询百度排名时并不好用——超出并发上限而进入排队的请求,其结果无法取回——仅供批评指正;代码左侧的数字是粘贴时带入的行号,不属于代码本身)

  1 <html>
  2 <head>
  3 <meta http-equiv="content-type" content="text/html;charset=utf-8" />
  4 <title>百度关键词排名批量查询</title>
  5 </head>
  6 <body>
  7 <h3>百度关键词排名批量查询</h3>
  8 <form action="/baidu/index.php" method="post">
  9 输入关键词(每行一个)<br />
 10 <textarea name="keyword" style="resize:none;width:243px;height:70px;"></textarea>
 11 <br />输入网址:
 12 <input type="text" name="url" size="20" value="39.net" />(如:39.net 勿加http://)<br />
 13 <input type="submit" name="sub" value="查询" />
 14 </form>
 15 <hr />
 16 </body>
 17 </html>
 18 <?php
 19 if(isset($_POST['sub'])){
 20     $start_time = microtime_float();
 21     $kw = $_POST['keyword'];
 22     $findurl = $_POST['url'];
 23     $httpcurl = new CoreHttpCurl();
 24     $keywords = $httpcurl->get_keywords($kw);    //查询的关键词数组
 25     $urls = $httpcurl->get_urls($keywords);        //百度搜索结果页面,array("关键词"=>"url",)
 26     $ranks = $httpcurl->get($urls,10,$findurl);    //关键词排名,array("关键词"=>"排名",)
 27     $output = "<table border='1' bordercolor='green' cellspacing='0'><tr><th>关键词</th><th>排名</th></tr>";
 28     foreach($ranks as $keyword=>$rank){
 29         $output .= "<tr><td>{$keyword}</td><td>{$rank}</td></tr>";
 30     }
 31     $output .= "</table>";
 32     echo $output;
 33     
 34     $end_time = microtime_float();
 35     $con_time = $end_time - $start_time;
 36     echo "查询耗时:".$con_time;
 37 }
 38 
 39 /**
 40     * 计算耗时
 41 **/
 42 function microtime_float(){
 43     list($usec,$sec) = explode(" ",microtime());
 44     return ((float)$usec+(float)$sec);
 45 }
 46 
 47 class CoreHttpCurl{
 48     protected $keywords = array();    //查询的关键词
 49     protected $findurl = null;        //查询的网站url
 50     protected $urls = array();        //获取到的所有urls请求地址
 51     
 52     protected $http_data = array();            //....
 53     protected $multi_exec_num = 10;            //多列队任务进程数,0表示不限制
 54     static protected $connecttimeout_ms = 3000;    //默认连接超时时间
 55     
 56     function __construct(){
 57     }
 58     
 59     /**
 60         *分析提交的关键词,并拆分成数组
 61     **/
 62     public function get_keywords($keyword){
 63         $keyword = str_replace("\r\n","\n",$keyword);    //换行符替换
 64         $this->keywords = explode("\n",$keyword);        //关键词数组
 65         return $this->keywords;
 66     }
 67     
 68     /**
 69         *获取请求的URL数组
 70         @param array $keywords
 71         @return array $urls        key为关键词,value为对应的查询网址
 72     **/
 73     public function get_urls($keywords){
 74         foreach($keywords as $word){
 75             $this->urls[$word] = "http://www.baidu.com/s?wd={$word}&cl=3&pn=0&rn=50";
 76         }
 77         return $this->urls;
 78     }
 79     
 80     /**
 81         *创建一个 CURL 对象
 82         @param string $url 每个url请求地址
 83         @param int $timeout 超时时间
 84         @return curl_init()
 85     **/
 86     protected function create_curl($url,$timeout){
 87         $ch = curl_init();
 88         curl_setopt($ch, CURLOPT_URL, $url);
 89         curl_setopt($ch, CURLOPT_HEADER, true);
 90         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
 91         curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 92         curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
 93         curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS,CoreHttpCurl::$connecttimeout_ms);
 94         
 95         return $ch;
 96     }
 97     
 98     /**
 99         *支持多线程获取网页
100         @param Array $urls
101         @param int $timeout
102         @return array()
103     **/
104     function request_urls($urls,$timeout){
105         $urls = array_unique($urls);        // 去重
106         if(!$urls) return array();            // $urls不存在,直接返回空数组
107         $mh = curl_multi_init();            // cURL批处理句柄
108         
109         $listener_list = array();            // 监听列表
110         $result = array();                    // 返回的数据
111         $list_num = 0;                        // 总列队数
112         $multi_list = array();                // 排队列表
113         
114         foreach($urls as $kw=>$url){
115             $current = $this->create_curl($url,$timeout);    // 创建一个curl对象
116             if($this->multi_exec_num > 0 && $list_num >= $this->multi_exec_num){
117                 $multi_list[] = $url;        // 加入排队列表
118             }else{
119                 // 列队数控制
120                 curl_multi_add_handle($mh, $current);
121                 $listener_list[$kw] = $current;
122                 $list_num++;
123             }
124             $result[$kw] = null;        //与原文不同,这里使用关键词做键名
125             $this->http_data[$kw] = null;
126         }
127         unset($current);        // 删除已加入队列的
128         $running = null;
129         
130         $done_num = 0;            // 已完成数
131         
132         do{
133             while(($execrun = curl_multi_exec($mh, $running)) == CURLM_CALL_MULTI_PERFORM);
134             if($execrun != CURLM_OK) break;
135             
136             while(($done = curl_multi_info_read($mh)) == true){
137                 foreach ($listener_list as $done_kw=>$listener){
138                     if($listener === $done['handle']){
139                         //获取内容
140                         $this->http_data[$done_kw] = $this->get_data(curl_multi_getcontent($done['handle']),$done['handle']);
141                         
142                         if($this->http_data[$done_kw]['code'] != 200){
143                             $result[$done_kw] = false;
144                         }else{
145                             // 返回内容
146                             $result[$done_kw] = $this->http_data[$done_kw]['data'];
147                         }
148                         
149                         curl_close($done['handle']);                        //关闭已经处理完的 curl 会话
150                         curl_multi_remove_handle($mh, $done['handle']);        //从 $mh 中移除
151                         unset($listener_list[$done_kw],$listener);            //从监听列表中移除
152                         $done_num++;
153                         
154                         //如果还有排队列表,则继续加入
155                         if($multi_list){
156                             $current_url = array_shift($multi_list);            // 获取队列中的第一条url
157                             $current = $this->create_curl($current_url, $timeout);    // 创建 curl 对象
158                             curl_multi_add_handle($mh, $current);                // 加入到队列中
159                             
160                             $listen_list[$current_url] = $current;                // 更新监听队列信息
161                             unset($current);
162                             
163                             $list_num++;                                        //更新队列数
164                         }
165                         break;
166                     }
167                 }
168             }
169             if($done_num >= $list_num) break;
170         }while(true);
171         curl_multi_close($mh);    //关闭列队
172         return $result;
173     }
174     
175     /**
176         * GET方式获取数据,支持多个URL
177     **/
178     public function get($urls, $timeout=10,$findurl){
179         $data = $this->request_urls($urls, $timeout);
180         //$this->clear_set();
181         $ranks = $this->baid_rank($data, $findurl);    //查询排名
182         return $ranks;
183     }
184     
185     /**
186         * 获取内容的函数
187     */
188     protected function get_data($data,$ch){
189         $header_size        = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
190         $result['code']        = curl_getinfo($ch, CURLINFO_HTTP_CODE);
191         $result['data']        = substr($data, $header_size);
192         $result['header']    = explode("\r\n", substr($data, 0, $header_size));
193         $result['time']        = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
194         return $result;
195     }
196     
197     /**
198         * 排名查询
199         @param array $serp 搜索结果返回数据(key为关键词,value为页面源代码)
200         @param string $findurl 查询关键词排名的网站URL,如39.net,勿加http://
201         @return array
202     **/
203     protected function baid_rank(array $serp, $findurl){
204         $ranks = array();
205         foreach($serp as $keyword=>$source){
206             $pattern = "#<span class=\"g\">.*<\/span>#U";
207             preg_match_all($pattern, $source, $m);
208             if(!strpos(implode($m[0]),$findurl)){
209                 $ranks[$keyword] = 0;
210             }else{
211                 foreach($m[0] as $k=>$v){
212                     if(strpos($v, $findurl)){
213                         $ranks[$keyword] = $k+1;
214                         break;
215                     }
216                 }
217             }
218         }
219         return $ranks;
220     }
221     
222     /**
223         *清理设置
224     **/
225 }
226 
227 ?>

 上面的代码不太好用,下面这个版本稍好一些(排队等待的请求保留了对应的关键词键名):

<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
<title>百度关键词排名批量查询</title>
</head>
<body>
<h3>百度关键词排名批量查询</h3>
<form action="test5.php" method="post">
输入关键词(每行一个)<br />
<textarea name="keyword" style="resize:none;width:243px;height:70px;"></textarea>
<br />输入网址:
<input type="text" name="url" size="20" value="39.net" />(如:39.net 勿加http://)<br />
<input type="submit" name="sub" value="查询" />
</form>
<hr />
</body>
</html>
<?php
/**
 * Wget Curl驱动核心
 *
 * @author     jonwang(jonwang@myqee.com)
 * @category   MyQEE
 * @package    System
 * @subpackage Core
 * @copyright  Copyright (c) 2008-2012 myqee.com
 * @license    http://www.myqee.com/license.html
 */
// Large batches can take long, so lift PHP's execution time limit.
set_time_limit(0);

// Form handler: runs only when the query form above was submitted.
// Prints the elapsed time first, then a numbered keyword => rank table.
if(isset($_POST['sub'])){
    $start_time = microtime_float();

    // Read the form fields defensively: a hand-crafted POST may omit them or send arrays.
    $kw = (isset($_POST['keyword']) && is_string($_POST['keyword'])) ? $_POST['keyword'] : '';
    $findurl = (isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';

    $httpcurl = new Core_HttpClient_Driver_Curl();
    $keywords = $httpcurl->get_keywords($kw);        // list of keywords
    $urls = $httpcurl->get_urls($keywords);          // array("keyword" => "search URL")
    $data = $httpcurl->get($urls);                   // array("keyword" => "page source")
    $ranks = $httpcurl->get_rank($data, $findurl);   // array("keyword" => rank)

    $end_time = microtime_float();
    $con_time = $end_time - $start_time;
    echo "查询耗时:".$con_time;

    $output = "<table border='1' bordercolor='green' cellspacing='0'><tr><th>序号</th><th>关键词</th><th>排名</th></tr>";
    $i=1;
    foreach($ranks as $keyword=>$rank){
        // Keywords come straight from the request, so escape them before echoing (XSS).
        $safe_keyword = htmlspecialchars((string)$keyword, ENT_QUOTES, 'UTF-8');
        $safe_rank = htmlspecialchars((string)$rank, ENT_QUOTES, 'UTF-8');
        $output .= "<tr><td>{$i}</td><td>{$safe_keyword}</td><td>{$safe_rank}</td></tr>";
        $i++;
    }
    $output .= "</table>";
    echo $output;
}

/**
 * Return the current Unix timestamp as a float with microsecond precision.
 *
 * Used for timing: call it before and after a piece of work and subtract.
 *
 * @return float seconds since the Unix epoch
 */
function microtime_float(){
    // microtime(true) already returns the float, so no string splitting is needed.
    return microtime(true);
}
 
/**
 * HTTP client on top of curl / curl_multi with helpers for batch keyword-rank lookups.
 *
 * Request settings (cookies, referer, extra options, headers, POST data) are
 * configured through the fluent set_*() methods and are reset after every
 * get()/post() call by clear_set(); the User-Agent set via set_agent() persists.
 */
class Core_HttpClient_Driver_Curl{
    protected $http_data = array();    // key => array(code, data, header, time) of the last batch
    protected $agent;                  // User-Agent set via set_agent(); a random one is used when empty
    protected $cookies;                // cookies set via set_cookies() (array or ready-made string)
    protected $referer;                // referer set via set_referer(); a random one is used when empty
    protected $ip;                     // stored by set_ip(); the code applying it is disabled
    protected $header = array();       // extra request header lines ("Name: value")
    protected $_option = array();      // extra curl options, CURLOPT_* => value
    protected $_post_data = array();   // URL => POST body

    protected $keywords = array();    // keywords parsed by get_keywords()
    protected $urls = array();        // keyword => search URL, built by get_urls()

    /**
     * Maximum number of simultaneous transfers, 0 = unlimited.
     * Author's note: too high gets the client blocked by Baidu (spoofing
     * referer and IP did not seem to help); too low is slow.
     * @var int
     */
    protected $multi_exec_num = 3;

    /**
     * Default connect timeout in milliseconds.
     *
     * @var int
     */
    protected static $connecttimeout_ms = 3000;
    const ERROR_HOST = '请求的URL错误';
    const ERROR_GET = 'GET请求错误';
    const ERROR_POST = 'POST请求错误';

    function __construct(){
    }

    /**
     * Set the User-Agent sent with every request.
     *
     * @param string $agent
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_agent($agent)
    {
        $this->agent = $agent;
        return $this;
    }

    /**
     * Set the cookies for the next request batch.
     *
     * @param array|string $cookies name => value map, or a ready-made cookie string
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_cookies($cookies)
    {
        $this->cookies = $cookies;
        return $this;
    }

    /**
     * Set the referer for the next request batch.
     *
     * @param string $referer
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_referer($referer)
    {
        $this->referer = $referer;
        return $this;
    }

    /**
     * Store an IP address for the next request batch.
     *
     * The code that would apply it is disabled in _create(), so this value is
     * currently not used for requests.
     *
     * @param string $ip
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_ip($ip)
    {
        $this->ip = $ip;
        return $this;
    }

    /**
     * Set an extra curl option for the next request batch.
     *
     * CURLOPT_HTTPHEADER values are accumulated instead of replaced.
     *
     * @param int $key a CURLOPT_* constant
     * @param mixed $value
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_option($key, $value)
    {
        if ( $key===CURLOPT_HTTPHEADER )
        {
            $this->header = array_merge($this->header,(array)$value);
        }
        else
        {
            $this->_option[$key] = $value;
        }
        return $this;
    }

    /**
     * Set the maximum number of simultaneous transfers (0 = unlimited).
     *
     * @param int $num
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_multi_max_num($num=0)
    {
        $this->multi_exec_num = (int)$num;
        return $this;
    }

    /**
     * Send POST requests; supports several URLs at once.
     *
     *   $urls = array
     *   (
     *     'http://www.baidu.com/',
     *     'http://mytest.com/url',
     *     'http://www.abc.com/post',
     *   );
     *   $data = array
     *   (
     *      array('k1'=>'v1','k2'=>'v2'),
     *      array('a'=>1,'b'=>2),
     *      'aa=1&bb=3&cc=3',
     *   );
     *   $client->post($urls,$data);
     *
     * @param string|array $url one URL, or a list of URLs
     * @param string|array $vars POST data; with several URLs, one entry per URL at the same index
     * @param int $timeout total transfer timeout in seconds, default 60
     * @return array key => body string, false for a non-200 response, null when never completed
     */
    public function post($url, $vars, $timeout = 60)
    {
        # POST mode
        $this->set_option( CURLOPT_HTTPHEADER, array('Expect:') );
        $this->set_option( CURLOPT_POST, true );

        if (is_array($url))
        {
            $myvars = array();
            # Use a separate loop variable: reusing $url here would leave only
            # the last URL in $url and the request below would skip the others.
            foreach ($url as $k=>$one_url)
            {
                if (isset($vars[$k]))
                {
                    if (is_array($vars[$k]))
                    {
                        $myvars[$one_url] = http_build_query($vars[$k]);
                    }
                    else
                    {
                        $myvars[$one_url] = $vars[$k];
                    }
                }
            }
        }
        else
        {
            $myvars = array($url=>$vars);
        }
        $this->_post_data = $myvars;

        return $this->get($url,$timeout);
    }

    /**
     * Fetch one or several URLs and reset the per-request settings afterwards.
     *
     * A single URL is wrapped into a one-element array, so the result is
     * always an array keyed like the input.
     *
     * @param string|array $url one URL, or key => URL
     * @param int $timeout total transfer timeout in seconds
     * @return array key => body string, false for a non-200 response, null when never completed
     */
    public function get($url, $timeout = 10)
    {
        $urls = is_array($url) ? $url : array($url);

        $data = $this->request_urls($urls, $timeout);

        $this->clear_set();
        return $data;
    }

    /**
     * Turn a URL without scheme into an absolute URL on the current host.
     *
     * @param string $url
     * @return string the URL unchanged when it has a scheme or no host is known
     */
    protected function resolve_url($url)
    {
        if ( false!==strpos($url, '://') )
        {
            return $url;
        }

        $base = '';
        if ( isset($_SERVER["SCRIPT_URI"]) && preg_match('#^(http(?:s)?\://[^/]+/)#', $_SERVER["SCRIPT_URI"] , $m) )
        {
            $base = $m[1];
        }
        elseif ( !empty($_SERVER['HTTP_HOST']) )
        {
            # SCRIPT_URI is not always provided by the server; fall back to the Host header
            $https = !empty($_SERVER['HTTPS']) && $_SERVER['HTTPS']!=='off';
            $base = ($https ? 'https' : 'http').'://'.$_SERVER['HTTP_HOST'].'/';
        }

        return $base==='' ? $url : $base.ltrim($url,'/');
    }

    /**
     * Create a configured curl handle for one URL.
     *
     * @param string $url request URL
     * @param int $timeout total transfer timeout in seconds
     * @return resource|object curl handle as returned by curl_init()
     */
    protected function _create($url,$timeout)
    {
        $the_url = $this->resolve_url($url);

        /* Disabled in the original: request by IP while sending the real host name.
        if ($this->ip)
        {
            if ( preg_match('#^(http(?:s)?)\://([^/\:]+)(\:[0-9]+)?/#', $the_url.'/',$m) )
            {
                $this->header[] = 'Host: '.$m[2];
                $the_url = $m[1].'://'.$this->ip.$m[3].'/'.substr($the_url,strlen($m[0]));
            }
            $this->header['Client-IP'] = $this->ip;
        } */

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $the_url);
        curl_setopt($ch, CURLOPT_HEADER, true);            # headers are split off again in get_data()
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, self::$connecttimeout_ms);

        if ( preg_match('#^https://#i', $the_url) )
        {
            # NOTE(review): certificate verification is switched off for https,
            # which allows man-in-the-middle attacks; kept as in the original.
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        }

        if ( $this->cookies )
        {
            # http_build_query() only accepts arrays; a ready-made string is sent as is
            $cookie = is_array($this->cookies) ? http_build_query($this->cookies, '', ';') : (string)$this->cookies;
            curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        }

        # Explicit settings win; otherwise a random referer / User-Agent is used per request
        $referer = $this->referer ? $this->referer : "http://www.bd".mt_rand(1,9999).".com/";
        curl_setopt($ch, CURLOPT_REFERER, $referer);

        $agent = $this->agent ? $this->agent : "Mozilla/".mt_rand(1,100)." (Windows NT 6.1; rv:18.0) Gecko/20100101 Firefox/18.0";
        curl_setopt($ch, CURLOPT_USERAGENT, $agent);

        foreach ( $this->_option as $k => $v )
        {
            curl_setopt($ch, $k, $v);
        }

        /* Disabled in the original: send a random client IP in the headers.
        $this->ip = mt_rand(10,100).".".mt_rand(10,200).".".mt_rand(10,200).".".mt_rand(1,200);
        $this->header['CLIENT-IP'] = $this->ip;
        $this->header['X-FORWARDER-FOR'] = $this->ip; */
        if ( $this->header )
        {
            $header = array();
            foreach ($this->header as $item)
            {
                # Keep only the last line for each header name
                if (preg_match('#(^[^:]*):.*$#', $item,$m))
                {
                    $header[$m[1]] = $item;
                }
            }
            curl_setopt($ch, CURLOPT_HTTPHEADER, array_values($header));
        }

        # POST body registered for this URL by post()
        if (isset($this->_post_data[$url]))
        {
            curl_setopt($ch , CURLOPT_POSTFIELDS , $this->_post_data[$url]);
        }

        return $ch;
    }

    /**
     * Take the first waiting URL, create its handle and attach it to the multi handle.
     *
     * @param resource|object $mh curl multi handle
     * @param array $multi_list key => URL still waiting (modified in place)
     * @param array $listener_list key => attached handle (modified in place)
     * @param int $timeout total transfer timeout in seconds
     * @return void
     */
    protected function start_next($mh, array &$multi_list, array &$listener_list, $timeout)
    {
        reset($multi_list);
        $kw = key($multi_list);
        $url = $multi_list[$kw];
        # unlike array_shift(), unset() does not renumber numeric keys (e.g. keyword "2013")
        unset($multi_list[$kw]);

        $handle = $this->_create($url, $timeout);
        curl_multi_add_handle($mh, $handle);
        $listener_list[$kw] = $handle;
    }

    /**
     * Download several URLs concurrently.
     *
     * At most $multi_exec_num transfers run at once (0 = no limit); the
     * remaining URLs wait and are started as earlier transfers finish.
     * Handles are created only when a transfer is actually started.
     *
     * @see http://cn.php.net/manual/en/function.curl-multi-exec.php#88453
     * @param array $urls key => URL; the keys are reused for the result
     * @param int $timeout total transfer timeout in seconds
     * @return array key => body string, false for a non-200 response,
     *               null when the transfer never completed
     */
    protected function request_urls($urls, $timeout = 10)
    {
        # Drop duplicate URLs (the first key for each URL is kept)
        $urls = array_unique($urls);

        if (!$urls)return array();

        $mh = curl_multi_init();

        $listener_list = array();    # key => handle currently attached to $mh
        $multi_list = $urls;         # key => URL not started yet
        $result = array();
        foreach ( $urls as $kw=>$url )
        {
            $result[$kw] = null;
            $this->http_data[$kw] = null;
        }

        $max = $this->multi_exec_num>0 ? $this->multi_exec_num : count($urls);
        while ( $multi_list && count($listener_list)<$max )
        {
            $this->start_next($mh, $multi_list, $listener_list, $timeout);
        }

        $running = null;
        while ( $listener_list )
        {
            do
            {
                $execrun = curl_multi_exec($mh, $running);
            }
            while ( $execrun == CURLM_CALL_MULTI_PERFORM );
            if ( $execrun != CURLM_OK ) break;

            $progressed = false;
            while ( false!==($done = curl_multi_info_read($mh)) )
            {
                $done_kw = array_search($done['handle'], $listener_list, true);
                if ( false===$done_kw ) continue;

                $this->http_data[$done_kw] = $this->get_data(curl_multi_getcontent($done['handle']), $done['handle']);
                if ( $this->http_data[$done_kw]['code'] != 200 )
                {
                    $result[$done_kw] = false;
                }
                else
                {
                    $result[$done_kw] = $this->http_data[$done_kw]['data'];
                }

                # Detach from the multi handle first, then close the easy handle
                curl_multi_remove_handle($mh, $done['handle']);
                curl_close($done['handle']);
                unset($listener_list[$done_kw]);
                $progressed = true;

                # A slot is free again: start the next waiting URL, if any
                if ( $multi_list )
                {
                    $this->start_next($mh, $multi_list, $listener_list, $timeout);
                }
            }

            if ( !$progressed )
            {
                # Nothing in flight and nothing reported: stop instead of spinning forever
                if ( $running<=0 ) break;
                # Sleep until there is socket activity instead of busy-waiting
                if ( curl_multi_select($mh, 1.0)===-1 )
                {
                    usleep(1000);
                }
            }
        }

        # Only reached with leftovers when the loop was aborted: release them
        foreach ( $listener_list as $handle )
        {
            curl_multi_remove_handle($mh, $handle);
            curl_close($handle);
        }
        curl_multi_close($mh);
        return $result;
    }

    /**
     * Return the detailed response data collected so far.
     *
     * @return array key => array(code, data, header, time), or null for unfinished requests
     */
    public function get_resut_data()
    {
        return $this->http_data;
    }

    /**
     * Split a raw response (headers + body) and collect transfer info.
     *
     * @param string|null $data raw response as returned by curl_multi_getcontent()
     * @param resource|object $ch the finished curl handle
     * @return array with keys code, data (body), header (list of lines), time
     */
    protected function get_data($data, $ch)
    {
        $data = (string)$data;    # a failed transfer yields no content
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

        $result = array();
        $result['code']   = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $result['data']   = (string)substr($data, $header_size);
        $result['header'] = explode("\r\n", (string)substr($data, 0, $header_size));
        $result['time']   = curl_getinfo($ch, CURLINFO_TOTAL_TIME);

        return $result;
    }

    /**
     * Reset the per-request settings (options, headers, IP, cookies, referer, POST data).
     *
     * @return void
     */
    protected function clear_set()
    {
        $this->_option = array();
        $this->header = array();
        $this->ip = null;
        $this->cookies = null;
        $this->referer = null;
        $this->_post_data = array();
    }

    /**
     * Split the submitted text into a list of keywords, one per line.
     *
     * Lines are trimmed and blank lines are dropped, so a trailing newline in
     * the textarea does not produce an empty keyword.
     *
     * @param string $keyword raw textarea content
     * @return array list of non-empty keywords
     */
    public function get_keywords($keyword){
        $this->keywords = array();
        foreach(preg_split('/\r\n|\r|\n/', (string)$keyword) as $line){
            $line = trim($line);
            if($line !== ''){
                $this->keywords[] = $line;
            }
        }
        return $this->keywords;
    }

    /**
     * Build the search URL for every keyword.
     *
     * The mapping is rebuilt from scratch on each call.
     *
     * @param array $keywords list of keywords
     * @return array keyword => search URL
     */
    public function get_urls($keywords){
        $this->urls = array();
        foreach((array)$keywords as $word){
            // The keyword is user input and may contain spaces, '&' or non-ASCII text.
            $this->urls[$word] = "http://www.baidu.com/s?wd=".urlencode($word)."&cl=3&pn=0&rn=50";
        }
        return $this->urls;
    }

    /**
     * Find the position of $findurl among the result entries of each page.
     *
     * Every '<span class="g">...</span>' fragment counts as one result; the
     * rank is the 1-based index of the first fragment containing $findurl.
     *
     * @param array $serp keyword => page source (false/null for failed requests)
     * @param string $findurl site to look for, e.g. "39.net" (no "http://")
     * @return array keyword => rank, 0 when not found or the page is unavailable
     */
    public function get_rank(array $serp, $findurl){
        $findurl = (string)$findurl;
        $pattern = "#<span class=\"g\">.*<\/span>#U";
        $ranks = array();
        foreach($serp as $keyword=>$source){
            $ranks[$keyword] = 0;
            if($findurl === '' || !is_string($source) || $source === ''){
                continue;
            }
            if(!preg_match_all($pattern, $source, $m)){
                continue;
            }
            foreach($m[0] as $k=>$v){
                if(strpos($v, $findurl) !== false){
                    $ranks[$keyword] = $k+1;
                    break;
                }
            }
        }
        return $ranks;
    }
}

 防止被百度封 IP 的另一个思路:把搜索请求 URL 中的域名换成 IP,例如把 http://www.baidu.com/ 改成 http://115.239.210.26/。百度的 IP 很多,可以全部整理出来轮换使用,下次再试验是否可行(上面的 curl 类中正好有把域名替换成 IP 的代码,目前处于注释状态)。

posted @ 2013-01-30 13:10  php之路  阅读(1083)  评论(0编辑  收藏  举报