#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import urllib.request
import re
import MySQLdb
import socket
# Scheme and host of the crawled site; the fetch helpers prepend it to
# site-relative links.
domain = 'http://www.quanshuwang.com'

# HTTP headers attached to every request built in this file: a single
# desktop-Chrome User-Agent entry.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/55.0.2883.87 Safari/537.36'
    ),
}
def getTypeList(type):
    """Fetch one category index page and return the novels linked from it.

    Args:
        type: Category number; it is substituted into the
            ``/map/<type>.html`` URL of the site.

    Returns:
        A list of ``(href, name)`` tuples, one for every
        ``<a href="/book/..." target="_blank">`` anchor found on the page.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = '%s/map/%s.html' % (domain, type)
    # Hand the headers to the constructor instead of overwriting
    # ``req.headers``: the Request then stores its own copy rather than
    # sharing the module-level dict.
    req = urllib.request.Request(url, headers=headers)
    # The ``with`` block closes the HTTP response (and its socket) even if
    # reading fails.
    with urllib.request.urlopen(req) as res:
        # The site serves GBK. Undecodable bytes are dropped, as
        # getNovelContent already does, so one bad byte cannot abort a crawl.
        html = res.read().decode('gbk', 'ignore')
    pattern = re.compile(r'<a href="(/book/.+?)" target="_blank">(.+?)</a>')
    return pattern.findall(html)
def getNovelList(href):
    """Fetch a novel's index page and return its chapter links.

    Args:
        href: Site-relative path of the novel's index page; it is appended
            to ``domain``.

    Returns:
        A list of ``(href, title_attribute, link_text)`` tuples, one for
        every ``<li><a href=... title=...>...</a></li>`` item on the page.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Hand the headers to the constructor instead of overwriting
    # ``req.headers``: the Request then stores its own copy rather than
    # sharing the module-level dict.
    req = urllib.request.Request(domain + href, headers=headers)
    # The ``with`` block closes the HTTP response (and its socket) even if
    # reading fails.
    with urllib.request.urlopen(req) as res:
        # Undecodable bytes are dropped, as getNovelContent already does, so
        # one bad byte in a chapter list cannot abort the crawl.
        html = res.read().decode('gbk', 'ignore')
    pattern = re.compile(r'<li><a href="(.+?)" title="(.+?)">(.+?)</a></li>')
    return pattern.findall(html)
def getNovelContent(url):
    """Fetch one chapter page and return the raw HTML of the chapter body.

    The body is whatever lies between the page's ``style5();</script>``
    marker and the following ``<script type="text/javascript">style6()``
    marker. The full URL is printed before extraction.

    Args:
        url: Site-relative path of the chapter page; it is appended to
            ``domain``.

    Returns:
        The text between the two markers (first occurrence), unmodified.

    Raises:
        IndexError: If the page does not contain the two markers.
        urllib.error.URLError: If the page cannot be fetched.
    """
    full_url = domain + url
    # Hand the headers to the constructor instead of overwriting
    # ``req.headers``: the Request then stores its own copy rather than
    # sharing the module-level dict.
    req = urllib.request.Request(full_url, headers=headers)
    # The ``with`` block closes the HTTP response (and its socket) even if
    # reading fails.
    with urllib.request.urlopen(req) as res:
        html = res.read().decode('gbk', 'ignore')
    # re.S lets ``.`` span newlines; the chapter body is multi-line.
    pattern = re.compile(
        r'style5\(\);</script>(.*?)<script type="text/javascript">style6\(\)',
        re.S,
    )
    print(full_url)
    match = pattern.search(html)
    if match is None:
        # Same exception type the former ``findall(...)[0]`` produced, but
        # with a message that says which page failed.
        raise IndexError('no chapter content found at %s' % full_url)
    return match.group(1)
class Sql(object):
    """Insert helpers for the ``novel`` and ``chapter`` tables.

    All instances share the single class-level connection ``conn``.
    """

    # The connection is opened when the class body executes, i.e. when this
    # module is loaded.
    # NOTE(review): ``port=x`` refers to a name that is not defined in the
    # visible source, and user/password are 'x' -- these look like redacted
    # placeholders; supply real values before running.
    conn = MySQLdb.connect(host='localhost',port=x,user='x',password='x',db='novel',charset='utf8')

    def addnovels(self, sort, novelname):
        """Insert one row into ``novel`` and return its generated id.

        Args:
            sort: Category name stored in the ``sort`` column.
            novelname: Title stored in the ``novelname`` column.

        Returns:
            The cursor's ``lastrowid`` after the insert.
        """
        cur = self.conn.cursor()
        try:
            # Values are bound by the driver instead of being formatted into
            # the SQL text, so quotes in a title can neither break the
            # statement nor inject SQL.
            cur.execute(
                "insert into novel(sort,novelname) values(%s,%s)",
                (sort, novelname),
            )
            lastrowid = cur.lastrowid
        finally:
            # Release the cursor even when the insert fails.
            cur.close()
        self.conn.commit()
        return lastrowid

    def addchapters(self, novelid, chaptername, content):
        """Insert one row into ``chapter``.

        Args:
            novelid: Id of the owning ``novel`` row.
            chaptername: Chapter title.
            content: Chapter body text.
        """
        cur = self.conn.cursor()
        try:
            # Driver-side binding: chapter text routinely contains quotes,
            # which made the former string-formatted statement fail.
            cur.execute(
                "insert into chapter(novelid,chaptername,content) values(%s,%s,%s)",
                (novelid, chaptername, content),
            )
        finally:
            # Release the cursor even when the insert fails.
            cur.close()
        self.conn.commit()
# Module-level Sql instance; the crawl loop below uses it to store novels
# and chapters.
mysql = Sql()
# Category number (as used in the site's /map/<n>.html URL) -> category name
# stored with every novel of that category.
SORT_NAMES = {
    1: "玄幻魔法",
    2: "武侠修真",
    3: "历史军事",
    4: "女频言情",
    5: "侦探推理",
    6: "网络动漫",
    7: "科幻小说",
    8: "恐怖灵异",
    9: "美文同人",
}


def main():
    """Crawl categories 1-9 and store every novel and chapter in MySQL.

    A failure while fetching or storing a single chapter is printed and the
    crawl moves on to the next chapter.
    """
    # Set the timeout before the first request. It used to be set only
    # after the first chapter had been fetched, so the earliest requests
    # could block forever.
    socket.setdefaulttimeout(30)
    for type_id, sort in SORT_NAMES.items():
        for href, novelname in getTypeList(type_id):
            lastrowid = mysql.addnovels(sort, novelname)
            # Each chapter tuple is (href, title attribute, link text). The
            # link text is the one used as the chapter name, as before.
            for url, _title_attr, title in getNovelList(href):
                try:
                    print("正在爬取------------%s 《%s》 %s"%(sort,novelname,title))
                    # The chapter link replaces 'index.html' in the novel's
                    # index path to form the chapter path.
                    content = getNovelContent(href.replace('index.html', url))
                    mysql.addchapters(novelid=lastrowid, chaptername=title, content=content)
                except Exception as e:
                    # Deliberate best-effort boundary: report the failure
                    # and continue with the next chapter.
                    print("连接中断,发生错误:%s !!!!"%e)


if __name__ == '__main__':
    main()