1 #! /usr/bin/env python
2 #coding=utf-8
3
4 import requests
5 import re,json
6 import sys,os
7 import Queue,threading
8 from bs4 import BeautifulSoup
9 reload(sys)
10 sys.setdefaultencoding("utf8")
11
def http_req_get(siteurl):
    """Fetch *siteurl* with a fixed set of browser-like headers.

    Returns the requests Response object, or None when the request
    raises (network error, timeout, ...).  Callers must handle None.
    """
    headers = {
        "Host": "www.xuebang.com.cn",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cookie": "__cfduid=da7335f4b0e760976f98697b651fc10041447572288; pgv_pvi=7944819712; deptNumOf11=140; deptNumOf89=60; deptNumOf711=60; commentNumOf11215=1074; deptNumOf1411=56; JSESSIONID=abcqLyMOLKEVDbynTTtev; a2666_pages=1; a2666_times=4; pgv_si=s4040530944; Hm_lvt_8147cdaed425fa804276ea12cd523210=1447572289,1447678990,1447734730; Hm_lpvt_8147cdaed425fa804276ea12cd523210=1447734730; CNZZDATA5928106=cnzz_eid%3D1168227404-1447570407-%26ntime%3D1447729389; Hm_lvt_863e19f68502f1ae0f9af1286bb12475=1447572289,1447678990,1447734730; Hm_lpvt_863e19f68502f1ae0f9af1286bb12475=1447734730; _ga=GA1.3.122575526.1447572289; _gat=1"}
    try:
        # BUG FIX: requests has no default timeout, so a stalled server
        # hung the crawler forever; cap the wait.
        return requests.get(url=siteurl, headers=headers, timeout=30)
    except Exception as e:
        # Best-effort: report the failure and signal it with an explicit
        # None instead of the original opaque 'yichang' print.
        print('request failed for %s: %s' % (siteurl, e))
        return None
32
33
34 #def request_get(siteurl):
35
class LinksParser(object):
    """Wraps a fetched page (a requests Response) in BeautifulSoup and
    extracts departments, teacher-list links and teacher comments.

    NOTE(review): teacher_lst()/comment_teacher() append to the
    module-level lists teacher_lst / teacher_comment_url /
    teacher_all_comment created in __main__ — they are not usable
    before those globals exist.
    """

    def __init__(self, urlobj):
        # urlobj: Response from http_req_get (may be None on failure,
        # in which case this raises — callers wrap in try/except).
        self.urlobj = urlobj
        self.soup = BeautifulSoup(self.urlobj.text, "html.parser")

    # Create the output directory for this university.
    def createDaXueDir(self):
        """Create (if missing) a directory named after the page title,
        next to this script, and return its path (gb18030-encoded)."""
        current_dir = os.path.dirname(sys.argv[0])
        real_path = current_dir + '\\' + self.soup.title.text.encode('gb18030')
        if not os.path.exists(real_path):
            try:
                os.mkdir(real_path)
            except OSError:
                # best-effort: a create race or odd title chars must not abort
                pass
        return real_path

    # Write the university's department (college) names to a file.
    def xueyuan(self, path):
        """Write every department name into <path>/xueyuan.txt, one per line."""
        try:
            # BUG FIX: the original opened `real_path + ...` — a module
            # global — instead of using its own `path` parameter.
            fh = open(path + '/xueyuan.txt', 'wb')
            for line in self.soup.find_all('a', {'class': 'yxcologe'}):
                fh.writelines(line.text.encode('gb18030').strip() + '\n')
            fh.close()
        except Exception:
            pass  # preserve the original best-effort behaviour

    # Collect each department's teacher-list URL.
    def teacher(self, path):
        """Return the list of per-department teacher-list URLs."""
        # Hoisted: query the soup once instead of len()+index per iteration.
        links = self.soup.find_all('a', {'class': 'yxcologe'})
        return [a['href'].encode('gb18030') for a in links]

    # Accumulate every teacher on this page into the module-level lists.
    def teacher_lst(self):
        """Append {'department', 'name'} dicts to the global teacher_lst
        and each teacher's comment-page URL to teacher_comment_url."""
        items = self.soup.find('span', {'class', 'TJszlist'}).find_all('li')
        # Hoisted: the department name comes from the page breadcrumb and
        # is identical for every <li>; compute it once.
        yuanxi = str(self.soup.find('span', {'class', 't_dqwz'}))[-40:]
        yuanxi = yuanxi.split('»')[1].split('<')[0]
        for li in items:
            anchor = li.find('a')
            teacher_lst.append({'department': yuanxi, 'name': anchor['title']})
            teacher_comment_url.append(anchor['href'].encode('gb18030'))

    # Collect every comment for the teacher on this page.
    def comment_teacher(self):
        """Append each comment on the page to the global teacher_all_comment
        and return the whole accumulator as a JSON string; returns the
        literal 'no comments' when the page has none."""
        infos = self.soup.find_all('span', {'class', 'TJR_info'})
        if not infos:
            return 'no comments'
        # The teacher name is page-level, not per-comment: look it up once.
        teacher_name = self.soup.find(color='#0088cc').text
        for info in infos:
            teacher_all_comment.append({
                'teacher_id': teacher_name,
                'comment': info.find('p', {'class', 'TJlycon'}).text,
                'time': info.find('span').string,
            })
        return json.dumps(teacher_all_comment, encoding="UTF-8", ensure_ascii=False)
104
class myThreads(threading.Thread):
    """Worker thread for stage 1: drains department URLs from the shared
    queue and fills the global teacher_lst / teacher_comment_url via
    LinksParser.teacher_lst()."""

    def __init__(self, queue):
        threading.Thread.__init__(self)
        # queue: Queue.Queue of department teacher-list URLs
        self.queue = queue

    def run(self):
        while True:
            try:
                # get_nowait raises Queue.Empty when drained; fetching
                # first avoids the empty()/get race of the original.
                url = self.queue.get_nowait()
            except Exception:
                break  # queue drained — this worker is done
            try:
                res_obj = LinksParser(http_req_get(url))
                res_obj.teacher_lst()
            except Exception as e:
                # BUG FIX: the original broke out of the loop on the first
                # page that failed to parse, killing the whole worker;
                # report and keep draining instead.
                print('failed to process %s: %s' % (url, e))
class commentThreads(threading.Thread):
    """Worker thread for stage 2: drains teacher comment-page URLs and
    appends each page's comment JSON to
    <real_path>/teacher_comment_lst.txt (real_path is a module global
    set in __main__)."""

    def __init__(self, queue):
        threading.Thread.__init__(self)
        # queue: Queue.Queue of teacher comment-page URLs
        self.queue = queue

    def run(self):
        while True:
            try:
                url = self.queue.get_nowait()  # raises Queue.Empty when drained
            except Exception:
                break  # queue drained — this worker is done
            try:
                res_obj = LinksParser(http_req_get(url))
                data = res_obj.comment_teacher()
                # BUG FIX: the original opened with 'wb', truncating the
                # file for every URL so only the last page survived — and
                # leaked the handle on exceptions.  Append via `with`.
                with open(real_path + '/teacher_comment_lst.txt', 'ab') as fh:
                    fh.write(data)
            except Exception as e:
                # Keep draining instead of killing the worker on one bad page.
                print('failed to fetch comments %s: %s' % (url, e))
140
if __name__ == '__main__':
    # School IDs to crawl; each produces its own output directory.
    idlist = [11, 129, 70, 71]
    thread_number = 50  # workers per stage (invariant — hoisted out of the loop)
    for school_id in idlist:
        school_id = str(school_id)
        url = 'http://www.xuebang.com.cn/' + school_id + '/deptlist'
        try:
            urlobj = http_req_get(url)
            # Parse the department-list page for this school.
            response_obj = LinksParser(urlobj)
            # Directory (named after the page title) all output goes into.
            # Module-level so commentThreads.run() can read it.
            real_path = response_obj.createDaXueDir()
            response_obj.xueyuan(real_path)

            # Per-department teacher-list URLs.
            xi_to_teacher = response_obj.teacher(real_path)

            # Shared accumulators the worker threads append to.  (The
            # original `global` statements were no-ops at module level.)
            teacher_lst = []
            teacher_comment_url = []

            # Stage 1: fetch every department's teacher list concurrently.
            queue = Queue.Queue()
            for teacher_page_url in xi_to_teacher:
                queue.put(teacher_page_url)
            threads = [myThreads(queue) for _ in xrange(thread_number)]
            for t in threads:
                t.start()
            for t in threads:
                t.join()

            # Persist the collected teacher list as JSON.
            teacher_lst = json.dumps(teacher_lst, encoding="UTF-8",
                                     ensure_ascii=False)
            try:
                fh = open(real_path + '/teacher_lst.txt', 'wb')
                fh.write(teacher_lst)
                fh.close()
            except Exception:
                pass  # best-effort: keep crawling even if the write fails

            # Stage 2: fetch every teacher's comment page concurrently.
            teacher_all_comment = []
            comment_queue = Queue.Queue()
            for comment_url in teacher_comment_url:
                comment_queue.put(comment_url)
            comment_workers = [commentThreads(comment_queue)
                               for _ in xrange(thread_number)]
            for t in comment_workers:
                t.start()
            for t in comment_workers:
                t.join()
        except Exception as e:
            # BUG FIX: the original bare `except: pass` silently hid every
            # failure; report which school failed and move to the next one.
            print('school %s failed: %s' % (school_id, e))