记一次爬取豆瓣电影详情
帮朋友爬取豆瓣电影的介绍里面的内容,废话不多说了,上代码
简单的爬取分为两个文件
fetch.php
<?php
require "./getfunction.php";
// Movie title to look up. A suggestion is accepted only when its "title"
// field is exactly this string.
$name = "复仇者联盟3:无限战争";
// Percent-encode the title before appending it to the query string: it holds
// multi-byte and reserved characters that are not valid raw inside a URL.
$url = "https://movie.douban.com/j/subject_suggest?q=".rawurlencode($name);
$curl = curl_init(); // start a cURL session
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_HEADER, 0);         // response body only, no headers
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
// NOTE(review): the two options below switch OFF certificate and host-name
// verification, which leaves the request open to man-in-the-middle tampering.
// They are kept as the author wrote them so the script keeps working on hosts
// without a CA bundle; enable both once a CA bundle is configured.
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
$tmpInfo = curl_exec($curl); // raw JSON text of the suggestion API, false on failure
if ($tmpInfo === false) {
    // A transport failure is not the same as "no result": report it in the
    // format this script already uses for the detail-page request.
    echo "<br />cURL error number:" .curl_errno($curl);
    echo "<br />cURL error:" . curl_error($curl);
    exit;
}
$tmpInfo = json_decode($tmpInfo);
// Anything other than a JSON array (null on invalid JSON, an error object,
// ...) is treated as an empty suggestion list so the loop below stays safe.
if (!is_array($tmpInfo)) {
    $tmpInfo = [];
}
// Keep only the suggestions whose title is exactly the requested one.
$arrat_res = [];
foreach ($tmpInfo as $v) {
    if (isset($v->title) && $name === $v->title) {
        $arrat_res[] = $v;
    }
}
// No exact match: answer with the "no source information" error and stop.
if (empty($arrat_res)) {
   $data = [
     "code"=>10001,
     "msg"=>"暂无片源信息"
   ];
   echo json_encode($data);die;
}
// Fetch the detail page of the first exact match, reusing the cURL handle of
// the search request. The response is the HTML of the movie page.
$url2 = $arrat_res[0]->url;
curl_setopt_array($curl, [
    CURLOPT_URL            => $url2,
    CURLOPT_HEADER         => 0,     // response body only
    CURLOPT_RETURNTRANSFER => 1,     // hand the body back as a string
    CURLOPT_SSL_VERIFYPEER => false, // certificate verification disabled
    CURLOPT_SSL_VERIFYHOST => false, // host-name verification disabled
]);
$tmpInfo2 = curl_exec($curl);
// A false or empty response aborts the script after printing the cURL error
// number and message.
if (empty($tmpInfo2)) {
    echo "<br />cURL error number:" . curl_errno($curl), "<br />cURL error:" . curl_error($curl);
    exit;
}
// Build a DOM tree from the detail page. The "@" silences the libxml warnings
// that loadHTML() raises for markup that is not well-formed.
$dom = new DOMDocument();
@$dom->loadHTML($tmpInfo2);
// Normalise the tree (merges adjacent text nodes).
$dom->normalize();
// XPath evaluator used for every lookup below.
$xpath = new DOMXPath($dom);

// Joins the nodeValue of every node in the list with ","; "" for an empty list.
$joinNodeValues = function ($nodes) {
    $values = [];
    foreach ($nodes as $node) {
        $values[] = $node->nodeValue;
    }
    return implode(",", $values);
};
// Returns the nodeValue of the first node, or "" when nothing matched, so the
// scalar fields below are always strings and never a DOMNodeList object.
$firstNodeValue = function ($nodes) {
    return $nodes->length > 0 ? $nodes->item(0)->nodeValue : "";
};

// Directors: link texts inside the first #info row.
$directors = $xpath->evaluate("//*[@id='info']/span[1]/span[2]/a/text()");
$directors_res = $joinNodeValues($directors);
// Title (replaces the search term held in $name so far).
$name = $firstNodeValue($xpath->evaluate("//*[@id='content']/h1/span[1]/text()"));
// Year, taken from the second span of the page heading.
$years = $firstNodeValue($xpath->evaluate("//*[@id='content']/h1/span[2]/text()"));
// Poster image URL.
$img = $firstNodeValue($xpath->evaluate("//*[@id='mainpic']/a/img/@src"));
// Release flag: 1 when the rating box says "尚未上映", otherwise 2.
// A missing rating box now yields 2 instead of leaving a DOMNodeList behind.
$is_on = 2;
$release_box = $xpath->evaluate("//*[@id='interest_sectl']/div/div[2]/div/div[2]");
if ($release_box->length > 0 && trim($release_box->item(0)->nodeValue) == "尚未上映") {
    $is_on = 1;
}
// Screenwriters: link texts inside the second #info row.
$screenwriters = $xpath->evaluate("//*[@id='info']/span[2]/span[2]/a/text()");
$screenwriters_res = $joinNodeValues($screenwriters);
// Actors: the whole text of the wrapper span in the third #info row.
$actors = $xpath->query("//*[@id='info']/span[3]/span[2]");
$actors_res = $joinNodeValues($actors);
// Genres: span texts from #info/span[5] up to the "制片国家/地区:" label.
// getRes() returns "num" as "" when no label was found, so it is cast to int
// before being used in arithmetic ("" + 4 is a TypeError on PHP 8).
$sear_res = getFunction::getRes(5,"制片国家/地区:",$xpath);
$types_res = $sear_res["res"];
$num = (int) $sear_res["num"];
// Collect the direct text nodes of #info that contain at least one CJK
// character; they carry the values that are not wrapped in a span.
$attr = [];
$langs = $xpath->evaluate("//*[@id='info']/text()");
foreach ($langs as $text_node) {
    $lang = $text_node->nodeValue;
    if (preg_match('/[\x{4e00}-\x{9fa5}]/u', $lang)>0) {
        $attr [] = $lang;
    }
}
// NOTE(review): the +4 / +1 offsets are the author's; presumably they skip
// the label spans between the sections - confirm against the live markup.
if ($is_on == 1) {
  // Flagged as not yet released: no runtime, release dates run up to "又名:".
  $show_res = "";
  $sear2_res = getFunction::getRes($num+4,"又名:",$xpath);
  $time_res = $sear2_res["res"];
  $num = (int) $sear2_res["num"];
}else{
  // Release dates: spans up to the "片长:" label.
  $sear2_res = getFunction::getRes($num+4,"片长:",$xpath);
  $time_res = $sear2_res["res"];
  $num = (int) $sear2_res["num"];
  // Runtime: spans up to the "又名:" label.
  $sear3_res = getFunction::getRes($num+1,"又名:",$xpath);
  $show_res = $sear3_res["res"];
  $num = (int) $sear3_res["num"];
}
// Map the collected text nodes onto fields. A missing entry becomes ""
// instead of raising an undefined-offset error.
$country = isset($attr[0]) ? $attr[0] : "";
$languages = isset($attr[1]) ? $attr[1] : "";
if (count($attr) == 4) {
  // With four entries the third one is appended to the runtime and the
  // fourth is the alternative title.
  $show_res = $show_res.$attr[2];
  $byname = $attr[3];
}else{
  $byname = isset($attr[2]) ? $attr[2] : "";
}
// Links in #info. With two links the first is stored as the web URL and the
// second as the "imbd" value; with a single link it is stored as "imbd" only.
// Both default to "" so a DOMNodeList is never JSON-encoded.
$first_link = $xpath->evaluate("//*[@id='info']/a[1]/@href");
$second_link = $xpath->evaluate("//*[@id='info']/a[2]/@href");
$first_href = $first_link->length > 0 ? $first_link->item(0)->nodeValue : "";
if ($second_link->length > 0) {
  $urls = $first_href;
  $urlim = $second_link->item(0)->nodeValue;
}else{
  $urls = "";
  $urlim = $first_href;
}
// Assemble the payload. The keys are part of the output format and are kept
// exactly as they were (including the "imbd" spelling).
$final_res = [
  "all_name" => $name.$years,
  "name" => $name,
  "year" => $years,
  "img" => $img,
  "directors" => $directors_res,
  "screenwriters" => $screenwriters_res,
  "actors" => $actors_res,
  "types" => $types_res,
  "web_url" => $urls,
  "country" => $country,
  "languages" => $languages,
  "ontime" => $time_res,
  "showtime" => $show_res,
  "byname" => $byname,
  "imbd" => $urlim
];
// Success envelope: code 0 plus the scraped data.
$return = ["code"=>0, "msg"=>"抓取成功", "data"=>$final_res ];
echo json_encode($return);
getfunction.php
<?php
class getFunction{
  /**
   * Collects the text of consecutive #info/span[i] elements.
   *
   * Spans are probed one index at a time from $start up to 29. Texts are
   * joined with "," until a span equal to $key or to "官方网站:" is seen; from
   * then on nothing more is collected, but every later label match still
   * overwrites the recorded index (the last match wins).
   *
   * @param mixed    $start first span index to probe
   * @param string   $key   label text that ends the collection
   * @param DOMXPath $xpath evaluator bound to the parsed page
   * @return array ["res" => joined texts, "num" => index of the last label
   *               match, or "" when no label was found]
   */
  public static function getRes($start,$key,$xpath){
    $collected = "";
    $labelIndex = "";
    for ($pos = $start; $pos < 30; $pos++) {
      $match = $xpath->query("//*[@id='info']/span[{$pos}]");
      // No span at this index: nothing to inspect.
      if (empty($match->length)) {
        continue;
      }
      $text = $match->item(0)->nodeValue;
      // A label span only records its position.
      if ($text == $key || $text == "官方网站:") {
        $labelIndex = $pos;
        continue;
      }
      // Once a label has been seen, later values are ignored.
      if (!empty($labelIndex)) {
        continue;
      }
      // The value at the starting index opens the list; others are appended.
      $collected = ($pos != $start) ? $collected.",".$text : $text;
    }
    return ["res"=>$collected,"num"=>$labelIndex];
  }
}
 
                    
                     
                    
                 
                    
                


 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号