Perl解析当当网图书信息页面

功能描述:

  • 首先判断html文件是否为当当图书的页面,不是则不处理
  • 把图书标题,价格,作者,出版社等信息抽取出来存入文件
  • perl程序运行命令:perl  programfile    html_file_list

原代码奉上:

#!/usr/bin/perl
use HTML::Element;
use HTML::TreeBuilder;
use HTML::Parser;

open DATAFH,">>data" || die "open file failed:$!";
select DATAFH;
foreach my $file_name (@ARGV) {
	unless(-e $file_name){
		print "$file_name文件不存在.\n";
		next;
	}
	##首先判断文件编码是不是UTF-8,如果不是要转换成UTF-8,因为我发现perl的HTML模块好像只能处理UTF-8编码的文件,处理GBK文件时会出现中文乱码。而我们从网上下载的页面几乎都是GBK编码。
	system "enca","-L","zh_CN","-x","UTF-8",$file_name;		##用enca比用iconv的好处在于不需要管原文件是什么编码(即使本来就UFTF-8编码),直接转换成UTF-8编码就是了。
    my $root = HTML::TreeBuilder->new;
    $root->parse_file($file_name);
    $title=$root->find_by_tag_name('title');
    $str_title=$title->as_text();
    if($str_title=~qr/图书 - 当当网$/){
    	print "网页标题:$str_title\n";
    	$div_h1=$root->look_down("_tag","div","class","h1_title book_head");
    	$h1=$div_h1->look_down('_tag','h1');
    	print "图书名称:",$h1->as_text(),"\n";
    	$div_info=$root->look_down("_tag","div","class","info book_r");
    	$price_d=$div_info->look_down("_tag","p","class","price_d");
    	print $price_d->as_text(),"\n";
    	$price_m=$div_info->look_down("_tag","p",
    								"class","price_m");
    	foreach my $node_r ($price_m->content_refs_list){
    		next if ref $$node_r;
    		print $$node_r,"\n";
    	}
    	$div_detail=$div_info->look_down("_tag","div","class","book_detailed","name","__Property_pub");
    	@elements=$div_detail->find_by_tag_name('p');
    	foreach(@elements){
			print $_->as_text(),"\n";
		}
		$clear=$div_detail->look_down("_tag","ul","class","clearfix");
		@lis=$clear->content_list();
		@spans=$lis[0]->content_list();
		print $spans[1]->as_text(),"\n";
		@spans=$lis[2]->content_list();
		print $spans[1]->as_text(),"\n";
		print $spans[2]->as_text(),"\n";
		print "********************************************\n";
		$|=1;
    }
    else{
		print STDOUT "$file_name不是当当网图书信息页面。\n";
		next;
	}
    $tree = $root->delete;
}
close DATAFH;

输出文件里的内容:

posted @ 2011-05-21 20:37  张朝阳  阅读(1405)  评论(1编辑  收藏  举报