孤独的猫

  博客园 :: 首页 :: 新随笔 :: 联系 :: 订阅 :: 管理 ::

抓取网页图像并保存

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sgmllib
import sys
import pycurl
import StringIO
import os

def download(fileName):
    """Fetch a URL with pycurl and return the response body.

    fileName -- the URL to retrieve (despite the name, not a local path).

    Redirects are followed, up to 5 hops.  Any error raised by perform()
    propagates to the caller.
    """
    buf = StringIO.StringIO()
    curl = pycurl.Curl()
    try:
        curl.setopt(pycurl.URL, fileName)
        curl.setopt(pycurl.WRITEFUNCTION, buf.write)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.perform()
    finally:
        # Release the libcurl handle even when the transfer fails; the
        # script calls this once per page, so leaked handles would pile up
        # if it were reused in a loop.
        curl.close()
    return buf.getvalue()

def getTopDirname(name):
    """Return the top-most directory of a URL or path.

    name -- a URL such as 'http://host/a/b' or a plain path.

    For an http/https URL the result is the site root without a trailing
    slash ('http://host').  For anything else the walk stops as soon as
    os.path.dirname() no longer makes progress, and the last component
    reached is returned ('/' for absolute paths, the first component for
    relative ones, '' for the empty string).
    """
    current = name
    parent = os.path.dirname(current)
    while parent not in ('http:', 'https:'):
        # Without a recognised scheme dirname() eventually hits a fixed
        # point ('' or '/'); stop there instead of looping forever.
        if not parent or parent == current:
            break
        current = parent
        parent = os.path.dirname(current)
    return current

def getFullName(fileName, dirname, net):
    """Resolve an image reference against the page it appeared on.

    fileName -- the value of an <img src> attribute.
    dirname  -- directory (or URL prefix) of the page.
    net      -- true when the page was fetched over HTTP.

    Returns the combined path or URL as a string.
    """
    # A reference that already carries its own scheme is complete; joining
    # it onto dirname would yield a bogus "base/http://..." string.
    if fileName.startswith(('http://', 'https://')):
        return fileName
    if net and fileName.startswith('/'):
        # Root-relative reference on a remote page: resolve it against the
        # site root rather than the page's own directory.
        return os.path.join(getTopDirname(dirname), fileName[1:])
    return os.path.join(dirname, fileName)

def getDownloadDir():
    """Return the directory that downloaded images are written to.

    The DOWNLOAD_DIR environment variable wins when it is set and
    non-empty; otherwise a fixed sub-directory of the user's home
    directory is used.
    """
    dwdir = os.getenv('DOWNLOAD_DIR')
    if not dwdir:
        # expanduser() honours $HOME but, unlike os.getenv('HOME'), still
        # yields a string when HOME is unset, so join() cannot fail on None.
        dwdir = os.path.join(os.path.expanduser('~'), '个人/下载/网页图片')
    return dwdir

def createDownloadDir():
    """Create the download directory (and missing parents) if it is absent."""
    target = getDownloadDir()
    if os.path.exists(target):
        # Something already lives at that path; nothing to create.
        return
    os.makedirs(target)

def downloadPicture(fileName, dirname):
    """Download one image and store it in the download directory.

    fileName -- image reference as it appeared in the page's <img src>.
    dirname  -- URL prefix of the page, used to resolve the reference.

    The image is saved under the basename of fileName; an existing file
    with that name is overwritten.  Errors from the transfer or from
    writing the file propagate to the caller.
    """
    fullName = getFullName(fileName, dirname, True)
    # Reuse the shared fetch helper instead of repeating the curl setup.
    data = download(fullName)
    target = os.path.join(getDownloadDir(), os.path.basename(fileName))
    # Image data is binary: 'wb' keeps it byte-exact on platforms that
    # translate line endings in text mode, and the with-block closes the
    # file even if the write fails.
    with open(target, 'wb') as f:
        f.write(data)

class imgSrcLister(sgmllib.SGMLParser):
    """SGML parser that collects the src attribute of <img> tags.

    picture_uris holds each distinct src value once, in the order in
    which it was first encountered.
    """

    def reset(self):
        """Reset the underlying parser and start with an empty URI list."""
        sgmllib.SGMLParser.reset(self)
        self.picture_uris = []

    def start_img(self, attrs):
        """Record the src of an <img> tag; attrs is (name, value) pairs."""
        for name, uri in attrs:
            if name != 'src':
                continue
            if uri in self.picture_uris:
                # Already recorded; keep the list free of duplicates.
                continue
            self.picture_uris.append(uri)

if __name__ == '__main__':
    # Usage: get_html_pictures [-p|--print] PAGE
    # PAGE is an http:// URL or a local HTML file.  Images of a remote page
    # are downloaded unless -p/--print is given; for a local page the
    # resolved image paths are always just printed.

    # The first argument that is not a print flag names the page, so
    # "-p URL" works as well as "URL -p".
    printFlags = ('-p', '--print')
    positional = [arg for arg in sys.argv[1:] if arg not in printFlags]
    if positional:
        fileName = positional[0]
    else:
        fileName = raw_input('Input the file name: ')

    net = fileName.startswith('http://')
    if net:
        c = download(fileName)
    else:
        # Local page: the with-block closes the file even if read() fails.
        with open(fileName, 'r') as f:
            c = f.read()

    lister = imgSrcLister()
    lister.feed(c)

    dirname = os.path.dirname(fileName)
    if net and dirname == 'http:':
        # A bare "http://host" has no path component; dirname() would strip
        # the host itself, so the URL as given is the base.
        dirname = fileName

    printOnly = '-p' in sys.argv or '--print' in sys.argv or not net
    if printOnly:
        for item in lister.picture_uris:
            # Single parenthesised argument: prints the same text whether
            # print is a statement or a function.
            print(getFullName(item, dirname, net))
    else:
        createDownloadDir()
        for item in lister.picture_uris:
            downloadPicture(item, dirname)

[tlcr: 0] [01/02/2010 17:24:14] [tusooa@tusooa-laptop] [~]
>> cat /tmp/test.htm
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/1999/xhtml">
<html>
        <head>
                <title>test</title>
        </head>
        <body>
                <p>
                        <img alt='test pic' src='/usr/share/icons/default.kde4/16x16/mimetypes/application-pdf.png' />
                </p>
        </body>
</html>
[tlcr: 0] [01/02/2010 17:24:41] [tusooa@tusooa-laptop] [~]
>> get_html_pictures /tmp/test.htm
/usr/share/icons/default.kde4/16x16/mimetypes/application-pdf.png
[tlcr: 0] [01/02/2010 17:25:10] [tusooa@tusooa-laptop] [~]
>> cat /tmp/test.htm
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/1999/xhtml">
<html>
        <head>
                <title>test</title>
        </head>
        <body>
                <p>
                        <img alt='test pic' src='usr/share/icons/default.kde4/16x16/mimetypes/application-pdf.png' />
                </p>
        </body>
</html>
[tlcr: 0] [01/02/2010 17:25:12] [tusooa@tusooa-laptop] [~]
>> get_html_pictures /tmp/test.htm
/tmp/usr/share/icons/default.kde4/16x16/mimetypes/application-pdf.png
[tlcr: 0] [01/02/2010 17:26:35] [tusooa@tusooa-laptop] [~]
>> get_html_pictures http://forum.ubuntu.org.cn/ -p
http://forum.ubuntu.org.cn/./styles/UbuntuCN/theme/images/icon_mini_login.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/theme/images/icon_mini_register.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/theme/images/icon_mini_faq.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/imageset/forum_read.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/imageset/icon_topic_latest.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/imageset/forum_read_subforum.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/theme/images/whosonline.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/imageset/forum_unread.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/imageset/forum_read_locked.gif

=========================================================================

#!/usr/bin/perl -w

# Download every image referenced by a double-quoted src="..." attribute
# on a web page.  The page is first fetched with wget into a scratch file,
# then each matching image URL is handed to wget in turn.
#
# Usage: perl getImage.pl URL

use strict;

my $url = $ARGV[0];
defined $url or die "Usage: $0 URL\n";
chomp($url);

# Optional scheme plus host of the page, with a trailing slash.  Relative
# image paths are resolved against this site root (not against the page's
# own directory).
my $url_base = '';
if ($url =~ m{^((?:https?://)?[^/]+)}) {
    $url_base = "$1/";
}

my $page = '/tmp/index.html';

# system() returns the command's wait status, 0 meaning success, so the
# result has to be compared explicitly.  The call is parenthesised because
# in "system LIST || die" the || would bind to the last list element only.
system('/usr/bin/wget', "-O$page", '-q', $url) == 0
    or die "Cannot get the $url page: wget status $?\n";

open(my $fh, '<', $page) or die "Cannot open index.html file: $!";

while (my $line = <$fh>) {
    # /g in a loop: one line of HTML may reference several images.
    while ($line =~ /src=\"([^? ]+\.\w{3,})\"/ig) {
        my $img = $1;

        # Only a leading scheme marks an absolute URL; a path that merely
        # contains "http" somewhere is still relative.
        unless ($img =~ m{^https?://}i) {
            $img =~ s{^/+}{};    # $url_base already ends with a slash
            $img = $url_base . $img;
        }

        # Best effort: with a one-second timeout and a single try some
        # images will fail, so report the failure and carry on.
        system('/usr/bin/wget', '-c', '-T1', '-t1', $img) == 0
            or warn "Cannot get the $img image: wget status $?\n";
        #print "$img\n";
    }
}

close($fh);

使用:

perl getImage.pl 'http://tianxiamm.com/viewthread.php?tid=57326&extra=page%3D1'
posted on 2011-04-19 15:14  孤独的猫  阅读(321)  评论(0)    收藏  举报