# 【shikaobang】 python 爬虫脚本 (shikaobang.cn question-bank scraper)
"""
事考帮更新url加密数字后,无法解码。只能用【<div class="title">相关推荐</div>】里面的链接来处理
解决办法:相关推荐是按题目顺序排列,以最后一个为起始网址,不断循环复制加密编码,起到原来的效果
"""
import pandas as pd
import urllib
import urllib2
from bs4 import BeautifulSoup
import codecs
import re
# --- Crawl configuration ----------------------------------------------------
# Starting numeric id for the saved html files (edit this before each run).
a1 = 101500
# First and last encrypted question urlnames, looked up manually on the site.
url_name_start = u'/questionbank/5YmJvWgYm6'
url_name_end = u'/questionbank/G5mbgoM1aX'
# Every urlname collected so far, seeded with the starting one.
urlname_list = [url_name_start]
# a: running count of collected links; b: legacy counter, never used below.
a = 1
b = 1
while True:
url_name = "http://www.shikaobang.cn" + url_name_start
user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/2011122 Ubuntu/10.10 (maverick) Firefox/2.5.1"
request = urllib2.Request(url_name, headers={'User-Agent':user_agent})
html = urllib2.urlopen(request)
html_data = BeautifulSoup(html,"html.parser")
if html_data.find(name='a') is None:
urlname_list.pop()
url_name_start = urlname_list[-1]
print "网页抓取失败,此时网址为:" + url_name_start
continue
for m in html_data.find_all(href=re.compile("/questionbank/")) :
if m['href'] == url_name_end:
urlname_list.append(m['href'])
break
else:
urlname_list.append(m['href'])
a = a + 1
url_name_start = urlname_list[-1]
if url_name_end == url_name_start:
break
print u"网页抓取成功,此时网址为:" + url_name_start
print u"查询结果共" + str(a) + u"条"
print u"最终查询结果共" + str(a) + u"条"
print u'开始爬取网页'
#爬取网页
import pandas as pd
import urllib
import urllib2
from bs4 import BeautifulSoup
import codecs
import time
time_start=time.time()
"""
修改题目对应网页数值
"""
a2 = a1
for i in urlname_list:
try:
url_name = "http://www.shikaobang.cn" + i
user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/2011122 Ubuntu/10.10 (maverick) Firefox/2.5.1"
request = urllib2.Request(url_name, headers={'User-Agent':user_agent})
html = urllib2.urlopen(request)
f = codecs.open('html/sz_'+str(a1),'w')
f.write(html.read())
f.close()
a1 = a1 + 1
except:
print i
pass
continue
print "下次使用该编码作为起始值:" + str((int(a1/100)+1)*100)
print "爬取网页结束,开始处理文本"
# -*- coding: utf-8 -*-  (inert here: coding declarations only act on line 1)
def html_chuli(html):
    """Parse one saved question page and extract the question fields.

    html: unicode html text of a question page.

    Returns a 9-tuple (page description, question type, question text,
    option labels, option bodies, correct answers, news titles, news types,
    news times) when the main content frame exists; otherwise returns the
    string '0' (sentinel kept for backward compatibility — callers detect
    failure via the unpacking error it triggers).
    """
    # Explicit parser for consistency with the Stage-1 crawl (was
    # BeautifulSoup(html), which warns and may pick different parsers on
    # different machines, changing the extracted text).
    html_data = BeautifulSoup(html, "html.parser")
    t_miaosu = html_data.find(attrs={'name': 'description'})['content']  # meta description
    t_news_title = html_data.find_all(attrs={'class': 'news-content-title'})
    t_news_typs = html_data.find_all(attrs={'class': 'news-typs'})
    t_news_time = html_data.find_all(attrs={'class': 'news-time'})
    tdata1 = html_data.find("div", attrs={'class': 'main-content'})  # main question frame
    if tdata1:
        t_leixing = tdata1.select('span')[0].string  # question type
        t_content = tdata1.select('div.question-title')[0].string  # question text
        t_xueze = tdata1.select('div.question-item')  # all answer options
        x_ABCD = []  # option labels (A/B/C/D), one [label] list per option
        x_content = []  # option bodies, one token list per option
        z_xueze = []  # correct answer(s)
        for item in t_xueze:
            # First whitespace token is the label, the rest is the body.
            item_middle = item.get_text().split()
            x_ABCD.append(item_middle[:1])
            x_content.append(item_middle[1:])
        for item in tdata1.select('label.actives'):  # choice questions
            z_xueze.append(item.string)
        for item in tdata1.select('div.question-item.correct i'):  # true/false questions
            z_xueze.append(item.string)
        return t_miaosu, t_leixing, t_content, x_ABCD, x_content, z_xueze, t_news_title, t_news_typs, t_news_time
    else:
        return '0'
#文本处理
import pandas as pd
import urllib
import urllib2
import re
import json
import random
from bs4 import BeautifulSoup
import codecs
"""
修改提取后对应文本编码
"""
for i in range(a2,a1):
try:
with open('html/sz_'+str(i), 'r') as f:
s_1 = ""
s_2 = ""
t_n = ""
contents = f.read().decode("utf-8", "ignore") #处理�
t_miaosu,t_leixing,t_content,x_ABCD,x_content,z_xueze,t_news_title,t_news_typs,t_news_time = html_chuli(contents)
for m in range(len(x_ABCD)):
if x_ABCD[m][0]:
s1 = x_ABCD[m][0]
else:
s1=""
if x_content[m][0]:
s2 = x_content[m][0]
else:
s2=""
s_1 = s_1 + s1 + ":" + s2 + " "
for n in range(len(z_xueze)):
s_2 = s_2 + z_xueze[n].strip()
for z in range(len(t_news_title)):
if t_news_title[z]:
new1 = t_news_title[z].text
else:
new1=""
if t_news_typs[z]:
new2 = t_news_typs[z].text
else:
new2=""
if t_news_time[z]:
new3 = t_news_time[z].text
else:
new3=""
t_n = t_n + new1 + "|" + new2 + "|" + new3 + "&"
if t_leixing is None:
continue
k1 = str(i) + "#" + t_miaosu.replace("\n", "") + "#" + t_leixing + "#" + t_content.replace(" ", "").replace("\n", "") + "#" + s_1.replace("\n", "") + "#" + s_2.replace("\n", "") + "#" + t_n.replace("\n", "")
f1 = codecs.open(u'out/时政202011-20210325.txt','a',encoding="utf-8") #修改导出txt文件编号
f1.write(k1 + "\n")
except:
f2 = codecs.open('out/fail_num.txt','a',encoding="utf-8")
k2 = str(i)
f2.write(k2 + "\n")
print str(i) + u"号html文件导入失败!"
f2.close()
pass
continue
f1.close()
print u"处理完毕!再次执行请修改“输出文件名”,并保存py文件,然后重新开始!!!"
# 此代码仅纪念作用,目前已不可用 (kept for posterity only; no longer works)
# study just for life!
#
# 浙公网安备 33010602011771号 (webpage footer from the original blog post)