1 # -*- coding: utf-8 -*-
2 import string
3 import urllib2
4 import re
5 from BeautifulSoup import BeautifulSoup
6
class Baidu_Spider(object):
    """Scrape a Baidu Tieba thread-list page and append its threads to a file.

    Every thread found on a list page is written as one line of
    ``spider.txt``.  The fields are, in order: url, "hot" figure, author,
    title and abstract, separated by the BEL control character (0x07).
    """

    def __init__(self, url):
        """Store the list-page URL prefix.

        url -- URL up to (but not including) the page offset; the offset is
               appended by ``baidu_tieba``.
        """
        self.myUrl = url

    def _page_url(self, page):
        """Return the full URL for *page* without modifying ``self.myUrl``."""
        return self.myUrl + str(page)

    def baidu_tieba(self, page):
        """Download one list page and save every thread found on it.

        page -- offset appended to the URL prefix (converted with ``str``).

        Returns the number of threads handed to ``save_data``.
        """
        # Build the URL locally.  Appending to self.myUrl (as the code used to
        # do) made a second call on the same instance request "...pn=050".
        page_url = self._page_url(page)
        print(page_url)

        response = urllib2.urlopen(page_url)
        try:
            # Read the raw page and decode it from GBK before parsing.
            html = response.read().decode("gbk")
        finally:
            # Always release the connection, even if read/decode fails.
            response.close()

        soup = BeautifulSoup(html)
        threads = soup.findAll("div", attrs={"class": 't_con clearfix'})

        saved = 0
        for record in threads:
            # Records without a thread link are never saved, so skip them
            # before parsing the remaining fields.
            url = self.find_url(record)
            if url == "":
                continue
            title = self.find_title(record)
            content = self.find_content(record)
            author = self.find_author(record)
            hot = self.find_hot(record)
            self.save_data(url, title, content, author, hot)
            saved += 1
        return saved

    # ---- helpers extracting the individual fields of one thread record ----

    def find_url(self, record):
        """Return the absolute thread URL of *record*, or '' if it has none."""
        link = record.find("a", attrs={"class": 'j_th_tit'})
        if link is None:
            return ""
        href = link.get('href')
        if href is None:
            return ""
        return 'http://tieba.baidu.com' + href

    def find_author(self, record):
        """Return the author shown for *record*, or '' if none is present.

        Prefers the element following the ``j_user_card`` link; falls back
        to the text of the enclosing author span when there is no such link.
        """
        # NOTE(review): the trailing space in the class name is kept exactly
        # as in the original -- presumably it mirrors the page markup.
        holder = record.find("span", attrs={"class": 'tb_icon_author '})
        if holder is None:
            return ''
        card = holder.find("a", attrs={"class": 'j_user_card'})
        if card is not None and card.next is not None:
            return card.next
        return holder.text

    def find_hot(self, record):
        """Return the text of the ``threadlist_rep_num`` element, or ''."""
        counter = record.find("div", attrs={"class": 'threadlist_rep_num'})
        if counter is None:
            return ''
        return counter.text

    def find_title(self, record):
        """Return the ``title`` attribute of the thread link, or ''."""
        link = record.find("a", attrs={"class": 'j_th_tit'})
        if link is None:
            return ''
        return link.get('title', '')

    def find_content(self, record):
        """Return the abstract text of *record*, or '' if it has none."""
        box = record.find(
            "div", attrs={"class": 'threadlist_abs threadlist_abs_onlyline'})
        if box is None:
            return ''
        first = box.next
        if first is None:
            return ''
        # A bare text node has no ``.text`` attribute; it is its own text.
        return getattr(first, 'text', first)

    def save_data(self, url, title, content, author, hot):
        """Append one thread as a BEL-separated, UTF-8 encoded line."""
        line = (url + "\007" + hot + "\007" + author + "\007" + title
                + "\007" + content + "\n")
        encoded = line.encode('utf-8')
        # ``with`` guarantees the file is closed even if the write fails.
        with open('spider.txt', 'a') as out:
            out.write(encoded)
78
# -------- Program entry point --------
page = 0
print(u'已经启动百度贴吧爬虫')
# The list-page URL prefix is constant, so it is built once; the spider
# appends the page offset to it.
bdurl = 'http://tieba.baidu.com/f?kw=%C9%CF%BA%A3%BD%BB%CD%A8%B4%F3%D1%A7&tp=0&pn='
# Crawl list pages indefinitely: a fresh spider per page, with the offset
# advancing by 50 each time.  There is no stop condition.
while True:
    mySpider = Baidu_Spider(bdurl)
    mySpider.baidu_tieba(page)
    page += 50