import re
import urllib.error
import urllib.parse
import urllib.request
# Crawl the Hupu BBS front page, visit every thread linked from it, and
# append each thread's <title> text to a local result file (one per line).

# Page that lists the threads to crawl.
INDEX_URL = "http://bbs.hupu.com/"
# Base used to resolve the (possibly relative) thread links found on the index.
BASE_URL = 'https://bbs.hupu.com/'
# Output file; opened in append mode so repeated runs accumulate titles.
RESULT_PATH = './result.txt'
# Seconds to wait on a request before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Thread anchors look like: <a href="....html" target="_blank" title=...
# `[^"]*` keeps the capture inside a single href attribute, and the dot
# before "html" is escaped so it only matches a literal ".".
THREAD_LINK_RE = re.compile(r'<a href="([^"]*\.html)" target="_blank" title=')
# Title text, tolerant of surrounding whitespace/newlines inside <title>.
TITLE_RE = re.compile(r'<title>\s*(.*?)\s*</title>', re.S)


def fetch_html(url):
    """Download *url* and return its body decoded as UTF-8.

    Undecodable bytes are dropped rather than raising.

    Raises:
        OSError: on network failure (includes ``urllib.error.URLError``,
            ``urllib.error.HTTPError`` and socket timeouts).
    """
    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode("utf-8", "ignore")


def extract_thread_urls(html, base_url=BASE_URL):
    """Return absolute URLs for every thread link found in *html*.

    Links are resolved against *base_url* with ``urljoin`` so that relative,
    root-relative and already-absolute hrefs all yield a valid URL. Order of
    appearance is preserved and duplicates are kept.
    """
    return [
        urllib.parse.urljoin(base_url, href)
        for href in THREAD_LINK_RE.findall(html)
    ]


def extract_title(html):
    """Return the text of the first ``<title>`` element, or ``None`` if absent."""
    match = TITLE_RE.search(html)
    return match.group(1) if match else None


def report_failure(index, error=None):
    """Print the failure notice for thread number *index* (1-based).

    When *error* carries an HTTP ``code`` and/or a ``reason`` those are
    printed; otherwise the error itself is printed so nothing is lost.
    """
    print(f'爬取第{index}个帖子失败')
    if error is None:
        return
    details = [
        getattr(error, attr) for attr in ("code", "reason")
        if hasattr(error, attr)
    ]
    for detail in details or [error]:
        print(detail)


def main():
    """Fetch the index page, then record the title of every linked thread."""
    thread_urls = extract_thread_urls(fetch_html(INDEX_URL))

    # `with` guarantees the file is closed even if an unexpected error occurs.
    with open(RESULT_PATH, 'a', encoding='utf8') as result_file:
        for index, thread_url in enumerate(thread_urls, start=1):
            print(f'正在爬取第{index}个帖子')
            print(thread_url)

            # Only the network call is guarded: a failed thread is reported
            # and skipped, while file-write errors still surface normally.
            try:
                page = fetch_html(thread_url)
            except OSError as error:
                report_failure(index, error)
                continue

            title = extract_title(page)
            if title is None:
                # A page without a <title> must not abort the whole crawl.
                report_failure(index)
                print('未找到标题')
                continue

            result_file.write(title + '\n')
            print('----打印成功----')


if __name__ == "__main__":
    main()