小爬虫：抓取一个贴吧网页中的图片（Python 2 版本）

#!/usr/bin/python
import re
import urllib

def getHtml(url):
        """Fetch ``url`` and return the raw response body.

        Written against the Python 2 ``urllib`` API (``urllib.urlopen``).

        Args:
            url: Address of the page to download.

        Returns:
            Whatever ``page.read()`` yields for the opened URL.
        """
        page = urllib.urlopen(url)
        try:
                return page.read()
        finally:
                # Release the connection even when read() raises.
                page.close()

def getImg(html):
        """Download every ``.jpg`` image referenced in ``html``.

        An image is any ``src="....jpg"`` attribute value that is immediately
        followed by `` width``. Each file is saved under a relative name
        ``0.jpg``, ``1.jpg``, ... in the order the URLs appear in the page.

        Args:
            html: Page source to scan.

        Returns:
            None.
        """
        # [^"] keeps a match inside one quoted attribute value; a plain ``.*?``
        # could start at a non-jpg src and run on to a later ".jpg" attribute.
        imgre = re.compile(r'src="([^"]*?\.jpg)" width')
        for x, imgurl in enumerate(imgre.findall(html)):
                urllib.urlretrieve(imgurl, '%s.jpg' % x)

# Script entry: fetch the page, then download the images found in it.
getImg(getHtml("http://image.baidu.com/"))

Python 3.4 版本：仿照别人的脚本修改后，运行成功：

#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import urllib.request as urllib2

def getHtml(url):
	"""Download ``url`` and return the response body as bytes.

	Args:
		url: Address of the page to download.

	Returns:
		The undecoded bytes read from the response.
	"""
	# The context manager closes the response even if read() raises.
	with urllib2.urlopen(url) as page:
		return page.read()

def getImage(html):
	"""Download every matching ``.jpg`` image referenced in ``html``.

	A match is a ``src="http://imgsrc....jpg"`` attribute value that is
	immediately followed by `` size``. Images are written to
	``E:\\0.jpg``, ``E:\\1.jpg``, ... in the order the URLs appear.

	Args:
		html: Page source, either UTF-8 encoded bytes or already-decoded text.

	Returns:
		None.

	Raises:
		UnicodeDecodeError: If ``html`` is bytes that are not valid UTF-8.
	"""
	# Accept text that has already been decoded as well as raw bytes.
	if not isinstance(html, str):
		html = html.decode('utf-8')
	# [^"] keeps a match inside one quoted attribute value; a plain ``.*?``
	# could start at a non-jpg src and run on to a later ".jpg" attribute.
	imgre = re.compile(r'src="(http://imgsrc[^"]*?\.jpg)" size')
	# NOTE: the destination is a hard-coded Windows drive path.
	for x, imgurl in enumerate(imgre.findall(html)):
		urllib2.urlretrieve(imgurl, 'E:\\%s.jpg' % x)
# Script entry: fetch the Tieba page, then download the images found in it.
getImage(getHtml("http://tieba.baidu.com/p/4866459683"))

<wiz_tmp_tag id="wiz-table-range-border" contenteditable="false" style="display: none;"></wiz_tmp_tag>

posted @ 2019-12-05 15:27 raisok 阅读(224) 评论(0) 收藏举报

刷新页面返回顶部