# Python 2.7
#coding:utf-8
import urllib2
import re
class Tools(object):
    """Clean-up helpers for the raw text captured from a scraped HTML page.

    The compiled patterns are class attributes so they are built once and
    shared by every instance.
    """

    # Literal newlines inside the captured HTML; they are dropped.
    remove_n = re.compile(r'\n')
    # Line-break tags in any common spelling: <br>, <br/>, <br /> and <BR>.
    # Without the optional slash, "<br/>" would fall through to remove_ele
    # and be deleted instead of becoming a newline.
    replace_br = re.compile(r'<br\s*/?>', re.I)
    # Any remaining tag; re.S lets '.' cross a newline inside a tag.
    remove_ele = re.compile(r'<.*?>', re.S)

    def replace_rs(self, rs):
        """Return a cleaned copy of one captured record.

        Args:
            rs: A sequence of at least five items. Items 0 and 2 are
                cleaned; items 1, 3 and 4 are passed through untouched.

        Returns:
            A 5-tuple ``(name, rs[1], content, rs[3], rs[4])`` where
            ``name`` is item 0 without newlines and ``content`` is item 2
            with newlines removed, line-break tags turned into newlines and
            every other tag stripped.
        """
        name = self.remove_n.sub('', rs[0])
        content = self.remove_n.sub('', rs[2])
        # Order matters: line breaks must be converted before the generic
        # tag stripper runs, otherwise they would simply be deleted.
        content = self.replace_br.sub('\n', content)
        content = self.remove_ele.sub('', content)
        return (name, rs[1], content, rs[3], rs[4])
class QSBK(object):
    """Console reader for the "hot" listing of qiushibaike.com.

    Downloads a listing page, extracts the posts with a regular expression
    and prints them one at a time, waiting for the user between posts.
    """

    # Compiled once and shared. The five groups capture, in order: the text
    # inside <h2>, the text following 'Icon">' up to </div>, the text
    # following 'class="content">' up to </span>, and the two
    # <i class="number"> values. start() prints them as user name, age,
    # content, funny count and comment count.
    _ITEM_PATTERN = re.compile(
        r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?Icon">(.*?)</div>.*?class="content">(.*?)</span>.*?<i class="number">(.*?)</i>.*?<i class="number">(.*?)</i>',
        re.S)

    def __init__(self):
        # NOTE(review): get_page() appends the page number directly, which
        # yields ".../hot/page1". Confirm whether the site expects
        # ".../hot/page/1" (i.e. whether a trailing "/" is missing here).
        self.baseURL = 'https://www.qiushibaike.com/hot/page'
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) '
                          'AppleWebKit/534.50 (KHTML, like Gecko) '
                          'Version/5.1 Safari/534.50'}
        self.tool = Tools()

    def get_page(self, num):
        """Download listing page ``num``.

        Args:
            num: Page number; it is converted with ``str`` and appended to
                ``self.baseURL``.

        Returns:
            The raw response body, or ``None`` when the request or the read
            fails (the reason is printed).
        """
        url = self.baseURL + str(num)
        request = urllib2.Request(url, headers=self.headers)
        try:
            response = urllib2.urlopen(request)
            try:
                # Reading inside the try block means a connection dropped
                # mid-transfer is reported like any other failure.
                return response.read()
            finally:
                # Always release the underlying socket.
                response.close()
        except Exception as e:
            # Deliberately broad: any failure is reported and turned into
            # None so the caller can simply stop.
            print('连接糗事百科失败,原因:%s' % e)
            return None

    def get_data(self, html):
        """Extract every post from ``html``.

        Args:
            html: Page source as returned by ``get_page``.

        Returns:
            A list with one cleaned 5-tuple per regex match (see
            ``self.tool.replace_rs``); empty when nothing matches.
        """
        return [self.tool.replace_rs(rs)
                for rs in self._ITEM_PATTERN.findall(html)]

    def start(self):
        """Fetch page 1 and show its posts interactively.

        Before each post the user is prompted: Enter shows the post, ``Q``
        ends the session. Returns immediately if the page cannot be
        downloaded.
        """
        html = self.get_page(1)
        if html is None:
            return
        for rs in self.get_data(html):
            s = raw_input('敲击回车查看下一条段子,输入Q结束:')
            if s == 'Q':
                print('程序结束!')
                break
            print('用户名:%s 年龄:%s 好笑数:%s 评论数:%s' % (rs[0], rs[1], rs[3], rs[4]))
            print(rs[2])
            print('\n')
if __name__ == '__main__':
    # Run the interactive reader only when executed as a script.
    QSBK().start()