实验楼的 PHP 比赛题:网页数据提取。
题目的地址:https://www.shiyanlou.com/contests/lou5/challenges
以下代码是题目的答案:
<?php
// Declare the response as UTF-8 HTML; the progress messages echoed below contain Chinese text.
header("Content-Type:text/html;charset=utf-8");
/**
 * Extracts course records from a saved HTML page and stores them in MySQL.
 *
 * For every course the crawler collects the name, the description, the type
 * and a flag telling whether the name is longer than 16 characters, then
 * inserts one row per course into the `course_data` table.
 *
 * Typical use: loadFile() followed by parseContent().
 */
class Crawler
{
    /** @var string HTML being parsed; narrowed to the <body> contents by parseCourseBody(). */
    private $content = '';

    /** @var array Parsed columns, keyed by 'cnames', 'cdescs', 'ctypes' and 'nlongs'. */
    private $data = array();

    /** @var PDO|null Database handle; only set while saveData() is running. */
    static private $mysql;

    /**
     * Announces the start of a crawl.
     */
    public function __construct()
    {
        echo "开始爬取内容....";
    }

    /**
     * Reads the HTML document to parse.
     *
     * When the file cannot be read, file_get_contents() emits its own warning
     * and the content is reset to an empty string, so the parse steps simply
     * find no courses instead of operating on `false`.
     *
     * @param string $file_path Path (or URL) of the HTML document.
     * @return void
     */
    public function loadFile($file_path)
    {
        echo "正在加载文件";
        $loaded = file_get_contents($file_path);
        $this->content = ($loaded === false) ? '' : $loaded;
    }

    /**
     * Narrows the loaded document to the inner HTML of its <body> element.
     *
     * The content is left untouched when no <body> element is found.
     *
     * @return void
     */
    public function parseCourseBody()
    {
        if (!is_string($this->content)) {
            return;
        }
        // preg_match (not preg_match_all) keeps the content a string; the
        // other parse steps pass it to preg_match_all() as the subject.
        if (preg_match('/<body[^>]*>(.*)<\/body>/is', $this->content, $matches)) {
            $this->content = $matches[1];
        }
    }

    /**
     * Runs the whole pipeline: parse every column, then store the rows.
     *
     * @return void
     */
    public function parseContent()
    {
        echo "开始解析内容...<br/>";
        $this->parseCourseBody();
        $this->parseTitle();
        $this->parseDesc();
        $this->parseType();
        $this->titleIsLong();
        $this->saveData();
        echo "解析内容结束!<br/>";
    }

    /**
     * Inserts one `course_data` row per parsed course name.
     *
     * Values are bound through a prepared statement, so quotes or other SQL
     * metacharacters in the scraped text cannot break or alter the query.
     * Database failures are reported as E_USER_WARNING instead of aborting,
     * which mirrors the warning-only behaviour of the legacy mysql_* calls
     * this method originally used.
     *
     * @return void
     */
    public function saveData()
    {
        echo "存入数据库...<br/>";
        $cnames = $this->column('cnames');
        $cdescs = $this->column('cdescs');
        $ctypes = $this->column('ctypes');
        $nlongs = $this->column('nlongs');

        try {
            // Same host, credentials and database name as the original script.
            self::$mysql = new PDO(
                'mysql:host=localhost;dbname=databases;charset=utf8',
                'root',
                'root',
                array(
                    PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
                    PDO::ATTR_EMULATE_PREPARES => false,
                )
            );
            $statement = self::$mysql->prepare(
                'insert into `course_data`(`cname`,`cdesc`,`ctype`,`nlong`) values(?,?,?,?)'
            );
        } catch (PDOException $e) {
            trigger_error('Crawler::saveData: cannot prepare database: ' . $e->getMessage(), E_USER_WARNING);
            self::$mysql = null;
            return;
        }

        foreach ($cnames as $key => $cname) {
            try {
                // Columns are aligned by index; a missing sibling value is stored as NULL.
                $statement->execute(array(
                    $cname,
                    isset($cdescs[$key]) ? $cdescs[$key] : null,
                    isset($ctypes[$key]) ? $ctypes[$key] : null,
                    isset($nlongs[$key]) ? $nlongs[$key] : null,
                ));
            } catch (PDOException $e) {
                // Keep going: one rejected row must not prevent the others from being stored.
                trigger_error('Crawler::saveData: insert failed: ' . $e->getMessage(), E_USER_WARNING);
            }
        }

        // Dropping the last reference closes the connection.
        $statement = null;
        self::$mysql = null;
    }

    /**
     * Collects the contents of every `course-name` div into $data['cnames'].
     *
     * @return void
     */
    public function parseTitle()
    {
        echo "解析课程标题...<br/>";
        $this->data['cnames'] = $this->extractDivContents('course-name');
    }

    /**
     * Collects the contents of every `course-desc` div into $data['cdescs'].
     *
     * @return void
     */
    public function parseDesc()
    {
        echo "解析课程简介...<br/>";
        $this->data['cdescs'] = $this->extractDivContents('course-desc');
    }

    /**
     * Derives the course type from every `course-footer` div into $data['ctypes'].
     *
     * The type is the concatenation of the Chinese characters found in the
     * footer; a footer without any Chinese character yields "免费".
     *
     * @return void
     */
    public function parseType()
    {
        echo "解析课程类型...<br/>";
        $ctypes = array();
        foreach ($this->extractDivContents('course-footer') as $footer) {
            if (preg_match_all('/[\x{4e00}-\x{9fa5}]/u', $footer, $match)) {
                $ctypes[] = join("", $match[0]);
            } else {
                $ctypes[] = "免费";
            }
        }
        $this->data['ctypes'] = $ctypes;
    }

    /**
     * Flags, per course name, whether it is longer than 16 characters.
     *
     * Stores the strings "true"/"false" in $data['nlongs'], index-aligned
     * with $data['cnames'].
     *
     * @return void
     */
    public function titleIsLong()
    {
        echo "判断课程名是否超长...<br/>";
        $nlongs = array();
        foreach ($this->column('cnames') as $key => $cname) {
            // Count characters, not bytes: the page is UTF-8 and names may be Chinese.
            $nlongs[$key] = mb_strlen($cname, 'UTF-8') > 16 ? "true" : "false";
        }
        $this->data['nlongs'] = $nlongs;
    }

    /**
     * Returns the inner HTML of every <div> whose class attribute is exactly $class_name.
     *
     * The div may carry further attributes after `class`. Matching stops at
     * the first closing </div>, so nested divs are not supported.
     *
     * @param string $class_name Value of the class attribute to look for.
     * @return array List of inner-HTML strings in document order; empty when nothing matches.
     */
    private function extractDivContents($class_name)
    {
        if (!is_string($this->content) || $this->content === '') {
            return array();
        }
        $regex = '/<div class="' . preg_quote($class_name, '/') . '"[^>]*>(.*?)<\/div>/is';
        if (preg_match_all($regex, $this->content, $matches)) {
            return $matches[1];
        }
        return array();
    }

    /**
     * Returns a parsed column, or an empty array when it has not been parsed yet.
     *
     * @param string $key One of 'cnames', 'cdescs', 'ctypes', 'nlongs'.
     * @return array
     */
    private function column($key)
    {
        if (isset($this->data[$key]) && is_array($this->data[$key])) {
            return $this->data[$key];
        }
        return array();
    }
}
// Script entry point: parse the saved course page and store its courses.
$crawler = new Crawler();
// The page is expected next to this script as "test.html".
$crawler->loadFile("test.html");
// Parses every column and writes the rows to the database.
$crawler->parseContent();
/**
Table structure
cname (varchar): the full course name
cdesc (varchar): the course description
ctype (varchar): the course type; one of 免费 (free), 会员 (member), 训练营 (bootcamp)
nlong (enum('true','false')): whether the course name is too long; 'true' when the name exceeds 16 characters, otherwise 'false'
create table `course_data`(
`id` int(11) not null auto_increment,
`cname` varchar(255) default null,
`cdesc` varchar(255) default null,
`ctype` varchar(255) default null,
`nlong` enum('true','false') default null,
primary key (`id`)
)engine=InnoDB default charset=utf8;
*/

浙公网安备 33010602011771号