抓取网页图像并保存
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sgmllib
import sys
import pycurl
import StringIO
import os
def download(fileName):
    """Fetch the resource at the URL *fileName* and return its body.

    Up to 5 redirects are followed.  The complete response body is buffered
    in memory and returned as a single string.  Errors raised by pycurl
    during the transfer propagate to the caller.
    """
    buf = StringIO.StringIO()
    curl = pycurl.Curl()
    try:
        curl.setopt(pycurl.URL, fileName)
        curl.setopt(pycurl.WRITEFUNCTION, buf.write)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.perform()
    finally:
        # Release the curl handle even when the transfer raises.
        curl.close()
    return buf.getvalue()
def getTopDirname(name):
    """Return the top-level part of *name* (scheme plus host for a URL).

    The path is shortened one component at a time with os.path.dirname until
    the parent is a bare URL scheme ('http:' or 'https:'); the component just
    below it, e.g. 'http://example.com', is returned.

    For a name that is not under one of those schemes, shortening stops as
    soon as os.path.dirname no longer changes the path, and that fixed point
    ('/' for an absolute path, '' for a relative one) is returned.
    """
    schemeHeads = ('http:', 'https:')
    oldName = name
    dirname = os.path.dirname(oldName)
    # The "dirname != oldName" test guarantees termination: without it a
    # name that never reaches a scheme head would loop forever on '/' or ''.
    while dirname not in schemeHeads and dirname != oldName:
        oldName = dirname
        dirname = os.path.dirname(oldName)
    return oldName
def getFullName(fileName, dirname, net):
    """Resolve the image reference *fileName* against *dirname*.

    fileName -- value of an <img> src attribute.
    dirname  -- directory (or URL prefix) of the page that holds the image.
    net      -- true when the page itself was fetched over HTTP.

    Resolution rules, in order:
      * an absolute http:// or https:// reference is returned unchanged;
      * for a network page, a protocol-relative reference ('//host/path')
        gets the 'http:' scheme prepended;
      * for a network page, a reference starting with '/' is joined to the
        top-level name of dirname as computed by getTopDirname();
      * anything else is os.path.join(dirname, fileName), which leaves an
        absolute local path untouched.
    """
    if fileName.startswith(('http://', 'https://')):
        # Already a complete URL; joining it to dirname would corrupt it.
        return fileName
    if net and fileName.startswith('//'):
        # Only http:// pages are fetched by this script, so reuse that scheme.
        return 'http:' + fileName
    if net and fileName.startswith('/'):
        return os.path.join(getTopDirname(dirname), fileName[1:])
    return os.path.join(dirname, fileName)
def getDownloadDir():
    """Return the directory in which downloaded pictures are stored.

    The DOWNLOAD_DIR environment variable is used when it is set and
    non-empty.  Otherwise a fixed sub-directory of the user's home directory
    is returned.  The directory is not created here.
    """
    dwdir = os.getenv('DOWNLOAD_DIR')
    if not dwdir:
        # HOME can be missing from the environment; os.path.join(None, ...)
        # would raise, so let expanduser work out the home directory instead.
        home = os.getenv('HOME') or os.path.expanduser('~')
        dwdir = os.path.join(home, '个人/下载/网页图片')
    return dwdir
def createDownloadDir():
    """Make sure the directory reported by getDownloadDir() exists.

    Nothing happens when the path is already present; otherwise the
    directory and any missing parents are created.
    """
    target = getDownloadDir()
    if os.path.exists(target):
        return
    os.makedirs(target)
def downloadPicture(fileName, dirname):
    """Download one picture and store it in the download directory.

    fileName -- the image reference as found in the page (an <img> src value).
    dirname  -- URL prefix of the page, used to resolve fileName.

    The picture is saved under the base name of fileName inside the directory
    returned by getDownloadDir(); an existing file of that name is
    overwritten.
    """
    fullName = getFullName(fileName, dirname, True)
    # Reuse the shared fetch helper instead of repeating the curl set-up.
    data = download(fullName)
    target = os.path.join(getDownloadDir(), os.path.basename(fileName))
    # Image data is binary, so open with 'wb' to avoid newline translation;
    # the with-block closes the file even if the write fails.
    with open(target, 'wb') as f:
        f.write(data)
class imgSrcLister(sgmllib.SGMLParser):
def reset(self):
sgmllib.SGMLParser.reset(self)
self.picture_uris = []
def start_img(self, attrs):
for key, value in attrs:
if key == 'src':
if not value in self.picture_uris:
self.picture_uris.append(value)
if __name__ == '__main__':
    # Separate the option flags from the positional arguments so that
    # -p / --print may be given before or after the page location.
    printFlags = ('-p', '--print')
    args = sys.argv[1:]
    printOnly = any(flag in args for flag in printFlags)
    positional = [arg for arg in args if arg not in printFlags]
    if positional:
        fileName = positional[0]
    else:
        fileName = raw_input('Input the file name: ')

    # A page is fetched over the network only for http:// locations;
    # anything else is read as a local file.
    net = fileName.startswith('http://')
    if net:
        c = download(fileName)
    else:
        with open(fileName, 'r') as f:
            c = f.read()

    lister = imgSrcLister()
    lister.feed(c)

    dirname = os.path.dirname(fileName)
    if net and dirname == 'http:':
        # The URL has no path component (e.g. 'http://host'), so the URL
        # itself is the prefix against which images are resolved.
        dirname = fileName

    # Local pages are only ever listed; network pages are downloaded unless
    # printing was requested.
    if printOnly or not net:
        for item in lister.picture_uris:
            print(getFullName(item, dirname, net))
    else:
        createDownloadDir()
        for item in lister.picture_uris:
            downloadPicture(item, dirname)
[tlcr: 0] [01/02/2010 17:24:14] [tusooa@tusooa-laptop] [~]
>> cat /tmp/test.htm
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/1999/xhtml">
<html>
<head>
<title>test</title>
</head>
<body>
<p>
<img alt='test pic' src='/usr/share/icons/default.kde4/16x16/mimetypes/application-pdf.png' />
</p>
</body>
</html>
[tlcr: 0] [01/02/2010 17:24:41] [tusooa@tusooa-laptop] [~]
>> get_html_pictures /tmp/test.htm
/usr/share/icons/default.kde4/16x16/mimetypes/application-pdf.png
[tlcr: 0] [01/02/2010 17:25:10] [tusooa@tusooa-laptop] [~]
>> cat /tmp/test.htm
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/1999/xhtml">
<html>
<head>
<title>test</title>
</head>
<body>
<p>
<img alt='test pic' src='usr/share/icons/default.kde4/16x16/mimetypes/application-pdf.png' />
</p>
</body>
</html>
[tlcr: 0] [01/02/2010 17:25:12] [tusooa@tusooa-laptop] [~]
>> get_html_pictures /tmp/test.htm
/tmp/usr/share/icons/default.kde4/16x16/mimetypes/application-pdf.png
[tlcr: 0] [01/02/2010 17:26:35] [tusooa@tusooa-laptop] [~]
>> get_html_pictures http://forum.ubuntu.org.cn/ -p
http://forum.ubuntu.org.cn/./styles/UbuntuCN/theme/images/icon_mini_login.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/theme/images/icon_mini_register.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/theme/images/icon_mini_faq.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/imageset/forum_read.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/imageset/icon_topic_latest.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/imageset/forum_read_subforum.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/theme/images/whosonline.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/imageset/forum_unread.gif
http://forum.ubuntu.org.cn/./styles/UbuntuCN/imageset/forum_read_locked.gif
=========================================================================
#!/usr/bin/perl -w
use strict;

# Download every image referenced by a src="..." attribute of the page whose
# URL is given as the first command-line argument.  wget performs all
# transfers; images are saved by wget into the current directory.

# Describe why the most recent system() call failed, based on $?.
sub failure_reason {
    return "failed to run wget: $!" if $? == -1;
    return 'wget killed by signal ' . ($? & 127) if $? & 127;
    return 'wget exit status ' . ($? >> 8);
}

my $url = $ARGV[0];
defined $url or die "Usage: $0 URL\n";
chomp($url);

# Site root: optional scheme plus host, with exactly one trailing slash.
# Image references that are not complete URLs are appended to it.
my $url_base = '';
if ($url =~ m{^((?:https?://)?[^/]+)}) {
    $url_base = "$1/";
}

my $index_file = '/tmp/index.html';

# system() returns 0 on success, so success must be tested with "== 0".
# The parentheses matter: in "system LIST || die" the || binds to the last
# list element only and the die can never report a wget failure.
system('/usr/bin/wget', "-O$index_file", '-q', $url) == 0
    or die "Cannot get the $url page: " . failure_reason();

open(my $page, '<', $index_file) or die "Cannot open index.html file: $!";
while (my $line = <$page>) {
    # /g picks up every image on the line; excluding the double quote from
    # the character class keeps a match inside a single attribute value.
    while ($line =~ /src=\"([^?\" ]+\.\w{3,})\"/ig) {
        my $img = $1;
        if ($img =~ m{^//}) {
            # Protocol-relative reference: only the scheme is missing.
            $img = "http:$img";
        }
        elsif ($img !~ m{^https?://}i) {
            # Drop leading slashes so the join with $url_base does not
            # produce a double slash.
            (my $path = $img) =~ s{^/+}{};
            $img = $url_base . $path;
        }
        # One failed image should not abort the remaining downloads, so a
        # failure is reported with warn rather than die.
        system('/usr/bin/wget', '-c', '-T1', '-t1', $img) == 0
            or warn "Cannot get the $img image: " . failure_reason() . "\n";
    }
}
close($page);
使用:

浙公网安备 33010602011771号