利用Python爬取大学官网上边的精品文章(对初学者很友好)
一、需求分析
1.包含发布日期,作者,标题,阅读数以及正文。
2.可自动翻页。
3.范围:2020年内
二、实现代码
def import_mysql(data_list):
    """Recreate the ``novel`` table and insert every record of *data_list*.

    The table is dropped and created again on every call, so earlier contents
    are lost.  Each record is inserted and committed on its own; a record
    that fails is rolled back and reported, and the loop continues with the
    next record.

    Args:
        data_list: iterable of mappings.  The keys of each mapping are used
            as column names of ``novel`` (the table defines date, title,
            author, read_count and content) and the values are the data to
            store.
    """
    # NOTE(review): connection settings (including the password) are
    # hard-coded; consider moving them to configuration.
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders', charset='utf8')
    try:
        cursor = db.cursor()
        try:
            # Drop the table if it already exists so every run starts clean.
            cursor.execute("DROP TABLE IF EXISTS novel")
            sql = 'CREATE TABLE IF NOT EXISTS novel (' \
                  'date VARCHAR(255) NOT NULL, ' \
                  'title VARCHAR(255) NOT NULL, ' \
                  'author VARCHAR(255) NOT NULL, ' \
                  'read_count VARCHAR(255) NOT NULL,' \
                  'content VARCHAR(10000) NOT NULL)'
            cursor.execute(sql)
            table = 'novel'
            for data in data_list:
                # Column names cannot be sent as query parameters, so quote
                # them as identifiers (doubling any embedded backtick).
                keys = ', '.join(
                    '`{}`'.format(str(key).replace('`', '``')) for key in data
                )
                values = ', '.join(['%s'] * len(data))
                sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
                try:
                    if cursor.execute(sql, tuple(data.values())):
                        print('Successful import mysql')
                        db.commit()
                except Exception:
                    # Best effort: report, undo this record, keep going.
                    print('Failed import mysql')
                    db.rollback()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if table setup fails.
        db.close()
def __init__(self):
    """Create an empty heap with no root and no cached node references."""
    # Root node of the tree; None while the heap is empty.
    self.__head = None
    # Node used as the parent of the next inserted node.
    self.__node_be_inserted = None
    # Node selected to be detached by the next delete-min operation.
    self.__node_be_moved = None
#采用函数层次架构
#层次一:
def traverse_get_be_inserted_node(self):
    """Find the node that should become the parent of the next insertion.

    Visits every node in pre-order (node, left subtree, right subtree) and
    collects those that still have a free child slot, i.e. whose left OR
    right child is missing.  The candidate with the smallest insert priority
    is stored in ``self.__node_be_inserted``; on a tie the first candidate
    met wins.  For an empty heap ``None`` is stored.
    """
    candidates = []
    # Explicit stack instead of recursion so deep trees cannot hit the
    # interpreter's recursion limit.
    pending = [] if self.__head is None else [self.__head]
    while pending:
        node = pending.pop()
        left_child = node.get_child_node("left")
        right_child = node.get_child_node("right")
        # A node qualifies when either slot is free.  Checking "left" twice
        # would skip nodes that only lack a right child.
        if left_child is None or right_child is None:
            candidates.append(node)
        # Push right first so the left subtree is visited first.
        if right_child is not None:
            pending.append(right_child)
        if left_child is not None:
            pending.append(left_child)
    # min() returns the first minimal element, which keeps the tie rule.
    self.__node_be_inserted = min(
        candidates,
        key=lambda node: node.get_insert_priority(),
        default=None,
    )
def value_float(self, newNode):
    """Move the value stored in *newNode* up until its parent is not larger.

    Only the stored values are swapped between a node and its parent; the
    links between node objects are never changed.

    Args:
        newNode: node whose value should float upwards.

    Returns:
        ``"已经上浮到 root"`` when *newNode* has no parent,
        ``"已经上浮到root"`` when the value reaches a node without a parent
        during the loop, and ``None`` when the loop stops because the parent
        value is not larger.
    """
    node = newNode
    parent = node.get_parent_node()
    if parent is None:
        return "已经上浮到 root"
    while node.get_node_value() < parent.get_node_value():
        # Swap the two stored values, then continue from the parent.
        node_value = node.get_node_value()
        node.set_node_value(parent.get_node_value())
        parent.set_node_value(node_value)
        node = parent
        parent = node.get_parent_node()
        if parent is None:
            return "已经上浮到root"
    return None
def traverse_get_be_moved_node(self):
    """Find the node that delete-min should detach from the tree.

    Visits every node in pre-order and collects those that lack a left or a
    right child.  The candidate with the largest insert priority is stored
    in ``self.__node_be_moved``; on a tie the first candidate met wins.  For
    an empty heap ``None`` is stored.
    """
    candidates = []
    # Explicit stack instead of recursion so deep trees cannot hit the
    # interpreter's recursion limit.
    pending = [] if self.__head is None else [self.__head]
    while pending:
        node = pending.pop()
        left_child = node.get_child_node("left")
        right_child = node.get_child_node("right")
        if left_child is None or right_child is None:
            candidates.append(node)
        # Push right first so the left subtree is visited first.
        if right_child is not None:
            pending.append(right_child)
        if left_child is not None:
            pending.append(left_child)
    # max() returns the first maximal element, which keeps the tie rule.
    self.__node_be_moved = max(
        candidates,
        key=lambda node: node.get_insert_priority(),
        default=None,
    )
def value_down(self):
    """Sink the root value until neither child holds a smaller value.

    At each step the smaller child is chosen (the left child when both
    children hold equal values, or the only child when one is missing).
    The values are swapped only when the child's value is strictly smaller
    than the current one; otherwise the walk stops.  Only stored values are
    exchanged, node links are untouched.

    Returns:
        Always the string ``"value down successful"``, also for an empty
        heap.
    """
    current_node = self.__head
    while current_node is not None:
        left_child = current_node.get_child_node("left")
        right_child = current_node.get_child_node("right")
        if left_child is None and right_child is None:
            break  # reached a leaf
        if right_child is None:
            smaller_child = left_child
        elif left_child is None:
            smaller_child = right_child
        elif right_child.get_node_value() < left_child.get_node_value():
            smaller_child = right_child
        else:
            # Left child wins ties between the two children.
            smaller_child = left_child
        current_value = current_node.get_node_value()
        child_value = smaller_child.get_node_value()
        if not child_value < current_value:
            break  # heap order already holds here
        current_node.set_node_value(child_value)
        smaller_child.set_node_value(current_value)
        current_node = smaller_child
    return "value down successful"
#层次二
def insert(self, value):
    """Add *value* to the heap.

    The first value becomes the root.  Every later value is stored in a new
    node attached to the cached insertion target (left slot first, then the
    right slot), floated upwards with ``value_float`` and followed by a
    refresh of the cached insertion target.

    If the cached target is missing or already has two children, the cache
    is refreshed once before use.  If no usable target is found even then,
    the error message is printed and the value is NOT inserted, so that an
    unattached node is never floated into the tree.

    Args:
        value: the key to store.
    """
    if self.__head is None:  # first node becomes the root
        newNode = Node(value, None, 0)
        self.__head = newNode
        self.__node_be_inserted = newNode
    else:
        parent_node = self.__node_be_inserted
        target_is_full = (
            parent_node is not None
            and parent_node.get_child_node("left") is not None
            and parent_node.get_child_node("right") is not None
        )
        if parent_node is None or target_is_full:
            # The cached target is unusable; recompute it once.
            self.traverse_get_be_inserted_node()
            parent_node = self.__node_be_inserted
        if parent_node is None:
            print("得到的self.__node_be_inserted出现错误")
            return
        newNode = Node(value, parent_node, parent_node.get_insert_priority() + 1)
        if parent_node.get_child_node("left") is None:
            parent_node.append_child_node("left", newNode)
        elif parent_node.get_child_node("right") is None:
            parent_node.append_child_node("right", newNode)
        else:
            print("得到的self.__node_be_inserted出现错误")
            return
    self.value_float(newNode)
    self.traverse_get_be_inserted_node()
def deleMin(self):
    """Remove and return the value stored in the root.

    For a heap with more than one node, the value of the node chosen by
    ``traverse_get_be_moved_node`` is copied into the root, that node is
    detached from its parent, the root value is sunk with ``value_down``
    and the cached insertion target is refreshed.  The refresh matters
    because the detached node may be the cached target; inserting under it
    afterwards would lose the new value.

    Returns:
        The value that was stored in the root, or ``None`` when the heap is
        empty.
    """
    if self.__head is None:
        return None
    min_node = self.__head
    minValue = min_node.get_node_value()
    if min_node.get_child_node("left") is None and min_node.get_child_node("right") is None:
        # Removing the only node empties the heap.
        self.__head = None
        self.__node_be_inserted = None
        return minValue
    self.traverse_get_be_moved_node()
    be_moved_node = self.__node_be_moved
    min_node.set_node_value(be_moved_node.get_node_value())
    parentNode = be_moved_node.get_parent_node()
    # Identity, not equality, decides which slot holds the node.
    if parentNode.get_child_node("left") is be_moved_node:
        side = "left"
    else:
        side = "right"
    parentNode.delete_child_node(side)
    self.value_down()
    # The tree shape changed, so the cached insertion target may be stale.
    self.traverse_get_be_inserted_node()
    return minValue
#层次三
def buildHeap(self, _list):
    """Insert every value of *_list* into the heap in ascending order.

    The values are inserted from a sorted copy, so the caller's sequence is
    left unmodified and any iterable of mutually comparable values is
    accepted.

    Args:
        _list: iterable of values to insert.
    """
    for value in sorted(_list):
        self.insert(value)
#函数层次架构外
def findMin(self):
    """Return the value stored in the root without removing it.

    Returns:
        The root value, or ``None`` when the heap is empty.
    """
    if self.__head is None:
        return None
    return self.__head.get_node_value()
@property
def isEmpty(self):
    """Report whether the heap has no root node.

    Returns:
        The string ``"True"`` when the heap is empty, otherwise the string
        ``"False"``.

    NOTE(review): the result is a string, not a bool, so it is always truthy
    in a boolean context; callers must compare it with ``"True"``.  Kept
    unchanged so existing callers keep working.
    """
    return "True" if self.__head is None else "False"
@property
def size(self):
    """Count the nodes of the heap by visiting every node once.

    Returns:
        The number of nodes reachable from the root; ``0`` for an empty
        heap.
    """
    count = 0
    # Explicit stack instead of recursion so deep trees cannot hit the
    # interpreter's recursion limit.
    pending = [self.__head]
    while pending:
        node = pending.pop()
        if node is None:
            continue
        count += 1
        pending.append(node.get_child_node("left"))
        pending.append(node.get_child_node("right"))
    return count
三、效果展示

搞定,还望各位大佬不吝赐教~

浙公网安备 33010602011771号