php获取页面并切割页面div内容

亮点：

1、利用php也能实现对页面div的切割处理。这里的做法抛砖引玉，希望读者能够提供更加完美的解决方案。

2、切割处理方法已经封装成一个方法，可以直接引用。

3、顺便加上博客园标签云的截取。//getWebDiv('id="taglist"','http://www.cnblogs.com/Zjmainstay/tag/');

View Code

<?php
    // Send the Content-type header so the echoed markup is served as UTF-8 HTML.
    header("Content-type: text/html; charset=utf-8"); 
    /**
     * Cut one <div> element (including nested divs) out of an HTML page.
     *
     * The first occurrence of $div_id is located with a plain substring search.
     * The nearest "<div" opening tag at or before that position is taken as the
     * start of the element, and nesting is tracked forward until the "</div"
     * that closes it. Pages declaring charset=gb2312 or charset=gbk are
     * converted to UTF-8 before searching.
     *
     * @param string       $div_id Marker text expected inside the opening tag,
     *                             written exactly as in the page, e.g. 'id="taglist"'.
     * @param string|false $url    URL to download; false to search $data instead.
     * @param string|false $data   HTML to search when $url is false.
     * @return string|false The element's markup terminated by '</div>', or false
     *                      when the page cannot be read, the marker is absent,
     *                      no <div> precedes it, or the div is never closed.
     */
    function getWebDiv($div_id,$url=false,$data=false){
        if($url !== false){
            $data = file_get_contents( $url );
        }
        // file_get_contents() yields false on failure; there is nothing to search then.
        if(!is_string($data) || $data === '') return false;
        $div_id = (string)$div_id;
        if($div_id === '') return false;

        // Convert legacy Chinese encodings to UTF-8. A page that declares
        // charset=utf-8 is left untouched (same priority order as before:
        // utf-8, then gb2312, then gbk).
        $charset_pos = stripos($data,'charset');
        if($charset_pos !== false && stripos($data,'charset=utf-8',$charset_pos) === false){
            foreach(array('gb2312','gbk') as $source_charset){
                if(stripos($data,'charset='.$source_charset,$charset_pos) !== false){
                    $converted = iconv($source_charset,'utf-8',$data);
                    // Keep the raw bytes if the conversion fails rather than losing the page.
                    if($converted !== false) $data = $converted;
                    break;
                }
            }
        }

        // strpos() reports a miss with false, never -1.
        $hit = strpos($data,$div_id);
        if($hit === false) return false;

        // Collect opening and closing div tags in one pass, already in document
        // order. Group 1 is '/' for a closing tag and '' for an opening tag.
        // The lookahead keeps longer tag names (e.g. <divider>) from matching.
        if(!preg_match_all('/<(\/?)div(?=[\s\/>])/i',$data,$tags,PREG_OFFSET_CAPTURE|PREG_SET_ORDER)){
            return false;
        }

        // The element starts at the last opening tag located at or before the marker.
        $start_index = -1;
        foreach($tags as $index=>$tag_match){
            if($tag_match[0][1] > $hit) break;
            if($tag_match[1][0] === '') $start_index = $index;
        }
        if($start_index < 0) return false;
        $start = $tags[$start_index][0][1];

        // Walk the following tags: an opening tag goes one level deeper, a closing
        // tag comes back up; a closing tag seen at depth 0 closes the target div.
        $deeper = 0;
        $count = count($tags);
        for($index = $start_index + 1; $index < $count; $index++){
            if($tags[$index][1][0] === ''){
                $deeper++;
            }else if($deeper === 0){
                $length = $tags[$index][0][1] - $start;
                return substr($data,$start,$length).'</div>';
            }else{
                $deeper--;
            }
        }
        return false;    // opening tag without a matching close
    }
    
    // Demo: download the cnblogs tag page and print the div whose tag contains id="taglist".
    echo getWebDiv('id="taglist"','http://www.cnblogs.com/Zjmainstay/tag/');

//End_php

考虑到 id 属性的引号写法不统一（单引号、双引号或不带引号），匹配串由用户按页面源码原样填写，例如 id="u"。

声明：此段 php 只针对带 id 的 div 内容的读取。

——————————————————————————完善：匹配任意可闭合带id标签————————————————————————————————————————————

View Code

 1 <?php
 2     header("Content-type: text/html; charset=utf-8"); 
 3     function getWebTag($tag_id,$url=false,$tag='div',$data=false){
 4         if($url !== false){
 5             $data = file_get_contents( $url );
 6         }
 7         $charset_pos = stripos($data,'charset');
 8         if($charset_pos) {
 9             if(stripos($data,'charset=utf-8',$charset_pos)) {
10                 $data = iconv('utf-8','utf-8',$data);
11             }else if(stripos($data,'charset=gb2312',$charset_pos)) {
12                 $data = iconv('gb2312','utf-8',$data);
13             }else if(stripos($data,'charset=gbk',$charset_pos)) {
14                 $data = iconv('gbk','utf-8',$data);
15             }
16         }
17         
18         preg_match_all('/<'.$tag.'/i',$data,$pre_matches,PREG_OFFSET_CAPTURE);    //获取所有div前缀
19         preg_match_all('/<\/'.$tag.'/i',$data,$suf_matches,PREG_OFFSET_CAPTURE); //获取所有div后缀
20         $hit = strpos($data,$tag_id);
21         if($hit == -1) return false;    //未命中
22         $divs = array();    //合并所有div
23         foreach($pre_matches[0] as $index=>$pre_div){
24             $divs[(int)$pre_div[1]] = 'p';
25             $divs[(int)$suf_matches[0][$index][1]] = 's';    
26         }
27         
28         //对div进行排序
29         $sort = array_keys($divs);
30         asort($sort);
31         
32         $count = count($pre_matches[0]);
33         foreach($pre_matches[0] as $index=>$pre_div){
34             //<div $hit <div+1    时div被命中
35             if(($pre_matches[0][$index][1] < $hit) && ($hit < $pre_matches[0][$index+1][1])){
36                 $deeper = 0;
37                 //弹出被命中div前的div
38                 while(array_shift($sort) != $pre_matches[0][$index][1] && ($count--)) continue;
39                 //对剩余div进行匹配，若下一个为前缀，则向下一层，$deeper加1，
40                 //否则后退一层，$deeper减1，$deeper为0则命中匹配，计算div长度
41                 foreach($sort as $key){
42                     if($divs[$key] == 'p') $deeper++;
43                     else if($deeper == 0) {
44                         $length = $key-$pre_matches[0][$index][1];
45                         break;
46                     }else {
47                         $deeper--;
48                     }
49                 }
50                 $hitDivString = substr($data,$pre_matches[0][$index][1],$length).'</'.$tag.'>';
51                 break;
52             }
53         }
54         return $hitDivString;
55     }
56     
57     echo getWebTag('id="nav"','http://mail.163.com/html/mail_intro/','ul');
58     echo getWebTag('id="homeBanners"','http://mail.163.com/html/mail_intro/');
59     echo getWebTag('id="performance"','http://mail.163.com/html/mail_intro/','section');
60 
61 //End_php

修复：stripos($data,'charset=utf-8',$charset_pos) 加入charset=，避免有些gb2312格式的网页中包含utf-8造成错误。或者用户可以自行修改函数传入一个确定的charset参数。

演示地址：parseDiv

posted @ 2012-08-06 10:05 Zjmainstay 阅读(5125) 评论(0) 收藏举报

刷新页面返回顶部

Zjmainstay

——毕有生之年，去虚，度年华

php获取页面并切割页面div内容

公告