import base64
import re

import lxml.html
import requests
class Exam_spider:
    """Scraper for the exam page, which is guarded by a JS-computed cookie.

    Flow (see :meth:`run`): request the base URL to obtain a ``session``
    cookie, derive the ``c1``/``c2`` cookies from it, request the second page
    with those cookies, save the raw response and extract the IP list.
    """

    # CSS rules such as ``.ABC{display:none}`` name the classes whose spans
    # are hidden and therefore must not contribute text.
    _HIDDEN_CLASS_RE = re.compile(r'\.([A-Z]+){display:none}')
    # Drops every ``<div ...`` through the end of its line ('.' stops at '\n').
    _DIV_RE = re.compile(r'<div\s.*')
    # Spans hidden through an inline style attribute.
    _HIDDEN_STYLE_SPAN_RE = re.compile(
        r'<span\sstyle="display:none">.*?</span>')

    def __init__(self, timeout=30):
        """
        Create the HTTP session used for both requests.

        :param timeout: seconds to wait for each HTTP request before
            requests raises a timeout error
        """
        self.base_url = 'http://datamining.comratings.com/exam'
        self.timeout = timeout
        self.s = requests.Session()

    def down_first(self):
        """
        Perform the first request.

        :return: value of the ``session`` cookie set by the response, or
            ``None`` when the response carries no such cookie
        """
        res = self.s.get(self.base_url, timeout=self.timeout)
        return res.cookies.get_dict().get('session')

    def down_second(self, cookie):
        """
        Perform the second request.

        :param cookie: the complete cookie mapping required by the page
        :return: raw response body (bytes)
        """
        res = self.s.get(self.base_url + '3', cookies=cookie,
                         timeout=self.timeout)
        return res.content

    def f1(self, a):
        """
        Compute a cookie value that the page normally generates with JS.

        This is standard Base64 applied to the low 8 bits of each
        character's code point.

        :param a: text to encode (the session id, or a slice of it)
        :return: Base64-encoded text
        """
        # Mask to one byte per character, matching the original encoder,
        # which only ever used the low 8 bits of every code point.
        raw = bytes(ord(ch) & 0xff for ch in a)
        return base64.b64encode(raw).decode('ascii')

    def make_cookie(self, sessionid):
        """
        Build the complete cookie from the session id.

        :param sessionid: session id obtained from the first request
        :return: dict with a single ``'Cookie'`` key holding
            ``"session=...; c1=...; c2=..."``
        """
        # NOTE(review): run() passes this dict to requests as ``cookies=``,
        # where each key is a cookie *name*, so it presumably goes out as one
        # cookie literally named "Cookie". The shape is kept unchanged for
        # existing callers; confirm whether separate session/c1/c2 entries
        # were intended.
        parts = [
            "session=" + sessionid,
            "c1=" + self.f1(sessionid[1:4]),
            "c2=" + self.f1(sessionid),
        ]
        return {'Cookie': "; ".join(parts)}

    def save_result(self, result):
        """
        Save the response to ``example_spider_result.html``.

        :param result: response body (bytes) of the second request
        :return: None
        """
        with open('example_spider_result.html', 'wb') as fp:
            fp.write(result)

    def _strip_hidden_markup(self, page):
        """Return *page* with div lines and CSS-hidden spans removed."""
        # Collect the class names first, from the untouched page text.
        hidden_classes = self._HIDDEN_CLASS_RE.findall(page)
        cleaned = self._DIV_RE.sub("", page)
        cleaned = self._HIDDEN_STYLE_SPAN_RE.sub("", cleaned)
        # Handle however many hidden classes the stylesheet declares rather
        # than assuming exactly two. The match is non-greedy so a hidden span
        # cannot swallow visible spans that follow it on the same line.
        for class_name in hidden_classes:
            hidden_span = re.compile(
                r'<span\sclass="' + re.escape(class_name) + r'">.*?</span>')
            cleaned = hidden_span.sub("", cleaned)
        return cleaned

    @staticmethod
    def _group_addresses(fragments):
        """Concatenate text fragments into dotted groups, one per address."""
        addresses = []
        current = ""
        for fragment in fragments:
            # Three dots and a non-dot tail means the fourth part has
            # started, so the next fragment begins a new address.
            if current.count('.') == 3 and not current.endswith('.'):
                addresses.append(current)
                current = ""
            current += fragment
        # Whatever is left after the last fragment is always emitted.
        addresses.append(current)
        return addresses

    def analysis_content(self, result):
        """
        Parse the response and extract the IP addresses.

        The addresses and their count are printed.

        :param result: response body (bytes, UTF-8) of the second request
        :return: list of the extracted address strings
        """
        page = result.decode('utf-8')
        cleaned = self._strip_hidden_markup(page)
        document = lxml.html.fromstring(cleaned.replace("\n", ""))
        fragments = document.xpath('//body/descendant-or-self::text()')
        # The first text node is skipped — presumably it is not part of any
        # address; verify against the live page.
        addresses = self._group_addresses(fragments[1:])
        print(addresses)
        print(len(addresses))
        return addresses

    def run(self):
        """
        Run the whole flow: fetch, build the cookie, fetch again, save, parse.

        :return: None
        :raises RuntimeError: if the first response sets no ``session`` cookie
        """
        session_id = self.down_first()
        if session_id is None:
            raise RuntimeError(
                "first response did not set a 'session' cookie")
        cookie = self.make_cookie(session_id)
        result = self.down_second(cookie)
        # Save before parsing so the raw page is still on disk for inspection
        # if parsing fails.
        self.save_result(result)
        self.analysis_content(result)
if __name__ == '__main__':
    # Script entry point: run the full fetch / parse / save flow once.
    Exam_spider().run()