# coding=utf-8
# !/usr/bin/env python
'''
author: dangxusheng
desc : 动态分页抓取 游民星空 的资讯
date : 2018-08-29
'''
import requests
from bs4 import BeautifulSoup
import json
import time
# Address of the Gamersky news section.
url = "https://www.gamersky.com/news/"

# HTTP headers passed to requests.get() by once_page_info(): a desktop-browser
# User-Agent plus a Referer pointing at the news section.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/51.0.2704.63 Safari/537.36 Qiyu/2.1.1.1"
    ),
    "Referer": "https://www.gamersky.com/news/",
}
# 获取每一页
def once_page_info(page_index=1):
    """Fetch one page of the news listing and return its entries.

    The paging endpoint answers with JSONP: a JSON object wrapped in a
    ``callback(...)`` call. The object's ``body`` field holds an HTML
    fragment in which every ``<li>`` is treated as one news entry.

    Args:
        page_index: Page number placed in the ``page`` field of the query.

    Returns:
        A list of dicts with the keys ``type``, ``title``, ``info``,
        ``time``, ``visited``, ``comment`` and ``img_url``. ``type``,
        ``title`` and ``info`` fall back to placeholder strings when the
        page gives no text for them; the remaining fields fall back to
        ``None``.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            HTTP status.
        ValueError: If the response is not a JSONP-wrapped JSON document.
        KeyError: If the decoded JSON has no ``body`` field.
    """
    # Cache-busting millisecond timestamp. Computed arithmetically because
    # slicing the float's repr yields fewer than 13 digits whenever the repr
    # has fewer than three decimals.
    time_stramp = str(int(time.time() * 1000))
    request_url = (
        "https://db2.gamersky.com/LabelJsonpAjax.aspx"
        "?callback=jQuery18308266280560965529_1541308409652"
        "&jsondata=%7B%22type%22%3A%22updatenodelabel%22"
        "%2C%22isCache%22%3Atrue%2C%22cacheTime%22%3A60"
        "%2C%22nodeId%22%3A%2211007%22%2C%22isNodeId%22%3A%22true%22"
        "%2C%22page%22%3A" + str(page_index) + "%7D&_=" + time_stramp
    )
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(request_url, headers=headers, timeout=10)
    r.raise_for_status()
    now_page_html = json.loads(_strip_jsonp(r.text))['body']
    soup = BeautifulSoup(now_page_html, 'html.parser')

    ls = []
    for once_li in soup.find_all('li'):
        img = once_li.find('img', attrs={'class': 'pe_u_thumb'})
        ls.append({
            'type': _child_text(once_li, 'a', 'dh', "暂无类型"),
            'title': _child_text(once_li, 'a', 'tt', "暂无标题"),
            'info': _child_text(once_li, 'div', 'txt', "暂无简介"),
            'time': _child_text(once_li, 'div', 'time'),
            'visited': _child_text(once_li, 'div', 'visit gshit'),
            'comment': _child_text(once_li, 'div', 'pls cy_comment'),
            # An entry without a thumbnail (or without a src) yields None.
            'img_url': img.get('src') if img is not None else None,
        })
    return ls


def _strip_jsonp(text):
    """Return the JSON payload found between the outermost parentheses of *text*."""
    start = text.find('(')
    end = text.rfind(')')
    if start == -1 or end < start:
        raise ValueError('response is not a JSONP callback: %r' % text[:80])
    return text[start + 1:end]


def _child_text(parent, tag, css_class, default=None):
    """Return ``.string`` of the first matching child of *parent*, else *default*."""
    node = parent.find(tag, attrs={'class': css_class})
    if node is None or node.string is None:
        return default
    return node.string
# 保存每一个的内容
def save_to_file(all_info):
    """Append the given records to ``./gemersky.txt``, one line per record.

    Each line lists the record's type, title, time, visited, comment,
    img_url and info values, in that order, separated by ``::``.

    Args:
        all_info: Iterable of mappings that provide all seven keys.
    """
    column_order = ('type', 'title', 'time', 'visited', 'comment', 'img_url', 'info')
    with open('./gemersky.txt', 'a', encoding='utf-8') as out:
        for record in all_info:
            fields = [str(record[key]) for key in column_order]
            out.write('::'.join(fields) + '\n')
# Fetch listing pages 1 through 9 in order, appending each page's entries to
# the output file and printing a progress line once the page is saved.
for page_no in range(1, 10):
    save_to_file(once_page_info(page_no))
    print('第%i页下载完成' % page_no)