爬虫2

#coding=utf-8
import urllib
import re
import os

def getHtml(url):
    """Fetch *url* and return the raw, undecoded response body.

    Args:
        url: Address to open; any scheme the standard URL opener supports.

    Returns:
        Whatever ``read()`` yields for the response: a byte string
        (``str`` on Python 2, ``bytes`` on Python 3).
    """
    # ``urllib.urlopen`` only exists on Python 2; resolve the opener locally
    # so the function behaves the same under either major version.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    page = urlopen(url)
    try:
        return page.read()
    finally:
        # Release the underlying socket/file handle even if read() raises.
        page.close()

def getImg(html, base_url="http://www.cust.edu.cn", save_dir="D:\\img"):
    """Download the ``.JPG`` images referenced in *html* and print its paragraphs.

    Every ``src="../<path>.JPG"`` attribute is turned into an absolute URL by
    appending ``<path>.JPG`` to *base_url*; the image is then saved as
    ``<n>.jpg`` inside *save_dir*, where ``n`` counts from 0 in document
    order.  The relative path and the absolute URL are printed before each
    download.  Afterwards the content of every
    ``<span style="font-family:宋体">`` element is printed, each followed by
    a dashed separator line.

    Args:
        html: Page source.  A byte string is accepted; on Python 3 it is
            decoded as UTF-8 (the encoding this source file is written in)
            so that the text patterns can be applied to it.
        base_url: Scheme and host prepended to each image path.
        save_dir: Directory receiving the downloaded files; created when
            missing and at least one image was found.

    Returns:
        None.  All results are reported through printing and saved files.
    """
    # ``urllib.urlretrieve`` only exists on Python 2; resolve it locally so
    # the function runs under either major version.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # On Python 3 ``bytes`` is a distinct type and cannot be searched with
    # ``str`` patterns.  On Python 2 ``bytes is str`` and the page is left
    # untouched, exactly as before.
    if bytes is not str and isinstance(html, bytes):
        html = html.decode("utf-8", "replace")

    # The dots are escaped so that only a literal "../" prefix is stripped,
    # and ``[^"]`` keeps the match inside a single attribute value.
    img_paths = re.findall(r'src="\.\.([^"]+\.JPG)"', html)

    if img_paths and not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    for index, img_path in enumerate(img_paths):
        print(img_path)
        img_url = base_url + img_path
        print(img_url)
        urlretrieve(img_url, os.path.join(save_dir, '%s.jpg' % index))

    # Non-greedy capture: several spans on one line stay separate items
    # instead of being merged into a single match.
    paragraphs = re.findall(r'<span style="font-family:宋体">(.*?)</span>', html)

    for text in paragraphs:
        print(text)
        print('-----------------------------------')



# Fetch the news article, then download its images and print its paragraphs.
html = getHtml("http://www.cust.edu.cn/lgxw/32913.htm")

# getImg() prints its own output and has no return value, so its result is
# not printed (doing so would only emit a stray "None").
getImg(html)

posted on 2017-06-28 15:58  天才程序猿  阅读(120)  评论(0)  编辑  收藏  举报

导航