页面抓取利器

博客已经搬家,请访问如下地址:http://www.czhphp.com

抓取页面时,如果不会写正则该怎么办呢?这里给大家推荐一个好工具:simple_html_dom.php。

直接贴代码了,哈哈

 

<?php
define('DBHOST', '');
define('DBUSER', '');
define('DBPW', '');
define('DBNAME', '');
require("simple_html_dom.php");

// Scrape sources: one entry per page to import.
// Keys of every entry:
//   site    - path handed to file_get_html() by snap(); these look like
//             locally saved copies of the pages (TODO confirm)
//   func    - name of the function that is called with the entry itself
//   f       - selector for the category heading elements
//   s       - selector for the element that holds each category's links
//   classid - class id copied onto every group extracted from the page
//   url     - presumably the live address of the page; not read by the
//             code in this file
$sfarr = array();

// game
$sfarr[] = array(
    'site'    => 'game.hao123.htm',
    'func'    => 'snap',
    'f'       => 'dl.fav_container dt.fav_tl',
    's'       => 'dl.fav_container dd.fav_links',
    'classid' => 4,
    'url'     => 'http://game.hao123.com/',
);

// book
$sfarr[] = array(
    'site'    => 'book.hao123.htm',
    'func'    => 'snap',
    'f'       => 'dl.favsites-list dt.fav-title',
    's'       => 'dl.favsites-list dd.fav-links',
    'classid' => 3,
    'url'     => 'http://book.hao123.com/',
);

// shopping
$sfarr[] = array(
    'site'    => 'gouwu.hao123.htm',
    'func'    => 'snap',
    'f'       => 'dl.netbuy-box dt',
    's'       => 'dl.netbuy-box',
    'classid' => 1,
    'url'     => 'http://gouwu.hao123.com/sc/',
);

// video
$sfarr[] = array(
    'site'    => 'video.hao123.htm',
    'func'    => 'snap',
    'f'       => 'div.content-con h2',
    's'       => 'div.content-con ul.content-link',
    'classid' => 2,
    'url'     => 'http://www.hao123.com/video',
);

// ---------------------------------------------------------------------------
// Import driver.
//
// For every configured source, run its extractor function and store the
// result: each extracted group becomes one row in `subclass`, and every link
// of that group becomes one row in `site` that points back at the subclass.
// ---------------------------------------------------------------------------

// Report mysqli failures through return values so that every step below can
// be checked explicitly, regardless of the PHP version's default report mode.
mysqli_report(MYSQLI_REPORT_OFF);

// A single connection serves the whole run.
$con = new mysqli(DBHOST, DBUSER, DBPW, DBNAME);
if ($con->connect_errno) {
    die('Could not connect: ' . $con->connect_error);
}
$con->set_charset('utf8');

// Scraped text is untrusted, so it only ever reaches the database as a bound
// parameter of a prepared statement.
$insertSub  = $con->prepare('insert into subclass (name, classid) values (?, ?)');
$insertSite = $con->prepare('insert into site (name, url, classid, subclassid) values (?, ?, ?, ?)');
if (!$insertSub || !$insertSite) {
    die('Could not prepare statements: ' . $con->error);
}

// bind_param() binds by reference: the statements read whatever these
// variables hold at the moment execute() is called.
$subName    = '';
$siteName   = '';
$siteUrl    = '';
$classid    = 0;
$subclassid = 0;
$insertSub->bind_param('si', $subName, $classid);
$insertSite->bind_param('ssii', $siteName, $siteUrl, $classid, $subclassid);

foreach ($sfarr as $var) {

    // 'func' names the extractor; skip entries whose extractor does not exist.
    if (!is_callable($var['func'])) {
        echo $var['site'] . ' has no callable extractor </br>';
        continue;
    }

    $data = $var['func']($var);
    if (!is_array($data)) {
        // Nothing was extracted from this source.
        continue;
    }

    foreach ($data as $group) {

        // Groups without a heading are not stored.
        $sub = !empty($group['sub']) ? $group['sub'] : '';
        if (!$sub) {
            continue;
        }

        // Square brackets are stripped from the heading before it is saved.
        $subName    = str_replace(array("[", "]"), '', $sub);
        $classid    = !empty($group['classid']) ? (int) $group['classid'] : 0;
        $subclassid = 0;

        if (!$insertSub->execute()) {
            // Without a subclass row there is nothing for the links to refer to.
            echo 'subclass insert failed in:' . $var['site'] . ' ' . $insertSub->error . '</br>';
            continue;
        }

        $subclassid = $insertSub->insert_id;
        echo 'subclassid  '. $subclassid.' has create succ in:'.$var['site'].'</br>';

        // A group may legitimately carry no links at all.
        $links = (isset($group['site']) && is_array($group['site'])) ? $group['site'] : array();
        foreach ($links as $link) {
            $siteName = $link['text'];
            $siteUrl  = $link['href'];
            if (!$insertSite->execute()) {
                echo 'site insert failed in:' . $var['site'] . ' ' . $insertSite->error . '</br>';
            }
        }

        echo $var['site'].'website snap succ </br>';
    }
}

$insertSub->close();
$insertSite->close();
$con->close();

/**
 * Extract the link groups of one page.
 *
 * The page is loaded with file_get_html(). The i-th element matching the
 * heading selector is paired with the i-th element matching the container
 * selector; the heading's text names the group and every anchor inside the
 * container that has both an href and text becomes one link of the group.
 *
 * @param array $data Source description; the keys read here are
 *                    'site' (path passed to file_get_html()),
 *                    'f' (heading selector), 's' (container selector)
 *                    and 'classid' (copied into every group).
 *
 * @return array List of groups, each shaped as
 *               array('sub' => string, 'classid' => mixed, 'site' => array)
 *               where 'site' is a list of array('text' => string,
 *               'href' => string). Empty when the page cannot be loaded or
 *               no heading matches.
 */
function snap($data){

    $all = array();

    $html = file_get_html($data['site']);
    if (!$html) {
        // The page could not be read or parsed; there is nothing to extract.
        return $all;
    }

    // Run both selectors once; entries are paired by their position.
    $headings   = $html->find($data['f']);
    $containers = $html->find($data['s']);

    foreach ($headings as $i => $heading) {

        $ss = array(
            'sub'     => trim($heading->plaintext),
            'classid' => $data['classid'],
            // Always present, so callers can rely on the key even for a
            // group without links.
            'site'    => array(),
        );

        // A heading without a matching container simply has no links.
        if (isset($containers[$i])) {
            foreach ($containers[$i]->find("a") as $a) {

                if ($a->href && $a->plaintext) {
                    $ss['site'][] = array(
                        'text' => trim($a->plaintext),
                        'href' => $a->href,
                    );
                }
            }
        }

        $all[] = $ss;
    }

    return $all;
}



?>

博客已经搬家,请访问如下地址:http://www.czhphp.com

posted @ 2012-08-17 16:12  曹振华  阅读(351)  评论(0)  编辑  收藏  举报