__author__ = 'Jeffery Gao'  # module author metadata (was a `==` comparison, which raised NameError)
#coding=utf-8
import urllib2
import re
import os
class Tool:
    """Turn a fragment of post HTML into plain text.

    Each class attribute is a pre-compiled pattern for one family of tags;
    ``repalce`` applies them in a fixed order and strips the result.
    """

    # <img> tags and runs of exactly seven spaces are dropped.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening and closing anchor tags are dropped (the link text is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # These block-level tags become a newline.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # An opening paragraph tag becomes a newline followed by one space.
    replacePara = re.compile(r'<p.*?>')
    # Line breaks become a newline.  The original pattern only knew
    # '<br><br>' and '</br>', so a lone '<br>' or '<br/>' fell through to
    # removeExtraTag and was deleted without a newline, gluing lines together.
    # The original alternatives stay first so a double break is still
    # collapsed into a single newline.
    replaceBR = re.compile(r'<br><br>|</br>|<br\s*/?>')
    # Any tag still left is dropped.
    removeExtraTag = re.compile(r'<.*?>')

    def repalce(self, x):
        """Return ``x`` with HTML tags converted or removed, then stripped.

        The (misspelled) name is kept for backward compatibility; ``replace``
        is provided as a correctly spelled alias.

        :param x: HTML text of one post.
        :return: plain text with leading/trailing whitespace removed.
        """
        # Order matters: the specific replacements must run before the
        # catch-all removeExtraTag deletes whatever tags remain.
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n ", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        return x.strip()

    # Correctly spelled alias for the method above.
    replace = repalce
class BDTB:
    """Download a Baidu Tieba thread page by page and save its posts as text.

    Attributes set by ``__init__``:
        baseURL: thread URL; the query string is appended per request.
        seeLz: the ``?see_lz=<flag>`` query fragment sent with every request.
        defaultTitle: file-name stem used when no thread title is available.
        cutOffRule: separator line written before every post.
        postsAndPageNum: the two values last parsed by ``getTotalPage``
            (reply count, then page count, per the original comments);
            ``[0, 0]`` until a page has been parsed.
        fileFP: output file opened by ``setFileName`` (``None`` until then).
    """

    # The title is read from the ``title`` attribute of the <h3> heading.
    # The original pattern had a stray '>' before ' title=', which closed the
    # tag before its own attribute and therefore could not match a real tag.
    _TITLE_RE = re.compile(r'<h3 class=".*?"\s+title="([\s\S]*?)".*?>')

    # Adjacent raw literals are joined by the compiler.  The original used a
    # backslash-newline inside a raw string, which does NOT continue the line:
    # it puts a literal backslash + newline into the regex.
    _TOTAL_RE = re.compile(
        r'<li class="l_reply_num".*?>[\s\S]*?<span class=.*?>(.*?)</span>'
        r'[\s\S]*?<span class="red">(.*?)</span>')

    # Groups: post content, then the two "tail_info" spans (floor number and
    # reply time, per the original comments).
    # NOTE(review): the content group ends at an opening '<div>' and the spans
    # use class "tail_info"; both are kept from the original -- confirm them
    # against the live page markup.
    _POST_RE = re.compile(
        r'<div id="post_content.*?>([\s\S]*?)<div>[\s\S]*?'
        r'<span class="tail_info">(.*?)</span>[\s\S]*?'
        r'<span class="tail_info">(.*?)</span>')

    # Characters that cannot appear in a file name on common file systems.
    _BAD_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, baseURL='', seeLz=1):
        """Remember the thread URL and the "only original poster" flag.

        :param baseURL: thread URL; a built-in sample thread is used if empty.
        :param seeLz: value sent as the ``see_lz`` query parameter.
        """
        self.baseURL = baseURL or 'https://tieba.baidu.com/p/3138733512'
        self.seeLz = '?see_lz=' + str(seeLz)
        self.defaultTitle = 'NewPost'
        self.cutOffRule = '*' * 60
        self.postsAndPageNum = [0, 0]
        self.fileFP = None

    def _pageURL(self, pageNum):
        """Return the request URL for page ``pageNum``."""
        # The original appended '&pn=' twice ('...&pn=&pn=1').
        return self.baseURL + self.seeLz + '&pn=' + str(pageNum)

    def getPageCode(self, pageNum):
        """Fetch page ``pageNum`` and return its HTML decoded as UTF-8.

        :return: the page text, or ``None`` if the request failed.
        """
        try:
            response = urllib2.urlopen(urllib2.Request(self._pageURL(pageNum)))
            try:
                raw = response.read()
            finally:
                response.close()
        except urllib2.URLError as e:
            # e.reason is not always a string, so format it instead of
            # concatenating; fall back to the exception itself if absent.
            print('Get Page Code Failure:' + str(getattr(e, 'reason', e)))
            return None
        return raw.decode('utf-8')

    def getTitle(self, pageCode):
        """Extract the thread title from ``pageCode``.

        :return: the stripped title, or ``None`` if the page is empty or the
            title heading is not found.
        """
        found = self._TITLE_RE.search(pageCode) if pageCode else None
        if found is None:
            print('Get Title Failure !')
            return None
        return found.group(1).strip()

    def getTotalPage(self, pageCode):
        """Extract the reply count and page count from ``pageCode``.

        On success the pair is also stored in ``self.postsAndPageNum``.

        :return: ``[first span text, second span text]`` as strings, or
            ``None`` if the page is empty or the counters are not found.
        """
        if not pageCode:
            return None
        found = self._TOTAL_RE.search(pageCode)
        if found is None:
            return None
        self.postsAndPageNum = [found.group(1), found.group(2)]
        return self.postsAndPageNum

    def _parseContents(self, pageCode):
        """Return ``[content, floor, time]`` lists for every post in ``pageCode``."""
        items = self._POST_RE.findall(pageCode) if pageCode else []
        if not items:
            print('Re Module Error: getContents')
        return [[item[0], item[1], item[2]] for item in items]

    def getContents(self, pageIndex):
        """Fetch page ``pageIndex`` and return its posts.

        :param pageIndex: page number to download (not the page HTML).
        :return: list of ``[content, floor, time]``; empty when the page could
            not be fetched or contained no recognisable post.
        """
        return self._parseContents(self.getPageCode(pageIndex))

    def setFileName(self, title=''):
        """Open ``<title>.txt`` for writing and write the header lines.

        Falls back to ``self.defaultTitle`` when ``title`` is empty.  The file
        is stored in ``self.fileFP`` and also returned; the caller closes it.

        :param title: thread title, used as file name and first line.
        :return: the open file object.
        """
        # Local import: codecs is not imported at file level.  codecs.open
        # writes UTF-8 on both Python 2 and 3; a plain open() on Python 2
        # raises UnicodeEncodeError as soon as non-ASCII text is written.
        import codecs

        if title:
            stem, heading = title, title
        else:
            # The original misspelled 'fileName' here and raised NameError.
            stem, heading = self.defaultTitle, 'This Post is None'
        # A title may contain path separators or other illegal characters.
        fileName = self._BAD_FILENAME_CHARS.sub('_', stem) + '.txt'
        posts, pages = self.postsAndPageNum[0], self.postsAndPageNum[1]
        self.fileFP = codecs.open(fileName, 'w+', encoding='utf-8')
        self.fileFP.write(heading + '\n')
        # %-formatting copes with the initial int values [0, 0] as well as
        # parsed strings; the original labelled the reply count as "pages".
        self.fileFP.write('Post total %s pages, and %s replies\n' % (pages, posts))
        return self.fileFP

    def witreData(self, contents=None):
        """Append every post in ``contents`` to the open output file.

        Each post is written as a separator line followed by its content,
        floor and time, one per line.  (The misspelled name is kept because
        it is the public interface.)

        :param contents: iterable of ``[content, floor, time]``; ``None`` or
            empty writes nothing.
        """
        for item in contents or []:
            self.fileFP.write(self.cutOffRule + '\n')
            self.fileFP.write(item[0] + '\n')
            self.fileFP.write(item[1] + '\n')
            self.fileFP.write(item[2] + '\n')

    def _totalPages(self, postsAndPageNum):
        """Return the page count as an int, defaulting to 1 if unknown."""
        try:
            return max(int(postsAndPageNum[1]), 1)
        except (TypeError, ValueError, IndexError):
            return 1

    def start(self):
        """Download the whole thread and write it to a text file."""
        pageCode = self.getPageCode(1)
        if pageCode is None:
            # Nothing can be parsed without the first page.
            print('Get first page failure, nothing saved')
            return
        title = self.getTitle(pageCode)
        # On failure keep whatever counts the instance already holds.
        postsAndPageNum = self.getTotalPage(pageCode) or self.postsAndPageNum
        totalPage = self._totalPages(postsAndPageNum)
        self.setFileName(title)
        try:
            print('this post total {0} pages and {1} reply'.format(
                postsAndPageNum[1], postsAndPageNum[0]))
            print('Now is loading page 1')
            # Page 1 is already in memory, so parse it instead of refetching.
            self.witreData(self._parseContents(pageCode))
            for pageNum in range(2, totalPage + 1):
                print('page {0} is finished! Now is loading page {1}'.format(
                    pageNum - 1, pageNum))
                self.witreData(self.getContents(pageNum))
        finally:
            self.fileFP.close()
        print('All Finished')
def main():
    """Ask for a thread URL and the "only original poster" flag, then scrape.

    Both answers are passed to ``BDTB`` exactly as typed.
    """
    threadURL = raw_input("输入帖子网址:")
    onlyPoster = raw_input("是否选择只看楼主(0否1是):")
    BDTB(threadURL, onlyPoster).start()
# Run the interactive scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()