<Scraping> A collection of scrapers for common websites
#!/usr/bin/python3
# The shebang above tells the OS to invoke the python3 interpreter under /usr/bin when the script is executed directly.
# -*- coding: utf-8 -*-
"""
请求URL分析 https://tieba.baidu.com/f?kw=魔兽世界&ie=utf-8&pn=50
请求方式分析 GET
请求参数分析 pn每页50发生变化,其他参数固定不变
请求头分析 只需要添加User-Agent
"""
# 代码实现流程
# 1. 实现面向对象构建爬虫对象
# 2. 爬虫流程四步骤
# 2.1 获取url列表
# 2.2 发送请求获取响应
# 2.3 从响应中提取数据
# 2.4 保存数据
import requests


class TieBa_Spider():
    def __init__(self, max_page, kw):
        # Initialization
        self.max_page = max_page  # maximum page number
        self.kw = kw              # name of the tieba (forum)
        self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }

    def get_url_list(self):
        """Build the URL list"""
        # pn advances by 50 per page, so build the list in steps of 50
        return [self.base_url.format(self.kw, pn) for pn in range(0, self.max_page * 50, 50)]

    def get_content(self, url):
        """Send the request and return the response body"""
        response = requests.get(
            url=url,
            headers=self.headers
        )
        # print(response.text)
        return response.content

    def save_items(self, content, idx):
        """Save the data extracted from the response"""
        with open('{}.html'.format(idx), 'wb') as f:
            f.write(content)
        return None

    def run(self):
        """Run the spider"""
        # Build the URL list
        url_list = self.get_url_list()
        for idx, url in enumerate(url_list, start=1):
            # Send the request and get the response
            content = self.get_content(url)
            # Save the data, naming each file after the URL's 1-based index
            items = self.save_items(content, idx)
            # For testing
            # print(items)


if __name__ == '__main__':
    # maximum page number, tieba name
    spider = TieBa_Spider(2, "神无月")
    spider.run()
002. JD product comments
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests
import re
import pandas as pd
"""
Request URL analysis:       https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4962&productId=5089225&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1
Request method analysis:    GET
Request parameter analysis: page increases by 1 per page; the other parameters stay fixed
Request header analysis:    no User-Agent needs to be added
"""
# Implementation outline
# 1. Build the spider as a class
# 2. The four standard scraping steps
# 2.1 Build the URL list
# 2.2 Send requests and get responses
# 2.3 Extract data from the responses
# 2.4 Save the data
class JD_Spider():
    def __init__(self, max_page):
        # Initialization
        self.max_page = max_page  # maximum page number
        self.base_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4962&productId=5089225&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1"

    def get_url_list(self):
        """Build the URL list"""
        # page advances by 1 per page, so build the list in steps of 1
        return [self.base_url.format(page) for page in range(0, self.max_page, 1)]

    def get_content(self, url):
        """Send the request and return the response text"""
        response = requests.get(url=url)
        # print(response.text)
        return response.text

    def save_items(self, content):
        """Extract the comment texts from the response and save them"""
        with open('comment_iphone11.txt', 'a', encoding='utf-8') as f:
            pat = '"content":"(.*?)","'
            res = re.findall(pat, content)
            for index, i in enumerate(res):
                i = i.replace('\\n', '')
                # print(i)
                f.write(str(index) + ':' + i)
                f.write('\n')
            f.write('\n')
        return None

    def run(self):
        """Run the spider"""
        # Build the URL list
        url_list = self.get_url_list()
        for index, url in enumerate(url_list):
            # Send the request and get the response
            try:
                print('正在爬第%s页...' % index)
                content = self.get_content(url)
                # Save the data
                self.save_items(content)
            except Exception:
                print('爬取第' + str(index) + '页出现问题')
                continue


if __name__ == '__main__':
    # maximum page number
    spider = JD_Spider(99)
    spider.run()
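Since the response is JSONP (the payload is wrapped in fetchJSON_comment98vv4962(...)), an alternative to the regex above is to strip the wrapper and parse the body as JSON. A minimal sketch, reusing the query parameters from the analysis above; the 'comments' and 'content' field names are assumptions based on what the regex extracts and may change if JD alters the API:

import json
import requests

def fetch_comments(page):
    """Hedged sketch: strip the JSONP wrapper and parse the payload as JSON."""
    url = ("https://club.jd.com/comment/productPageComments.action"
           "?callback=fetchJSON_comment98vv4962&productId=5089225&score=0"
           "&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1").format(page)
    text = requests.get(url).text
    # The body looks like fetchJSON_comment98vv4962({...}); keep only the {...} part
    start, end = text.find('(') + 1, text.rfind(')')
    data = json.loads(text[start:end])
    return [c.get('content', '') for c in data.get('comments', [])]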
As a bonus, build a word cloud from the comments
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud

# Text to be segmented
with open('comment_iphone11.txt', 'r', encoding='utf-8') as f:
    text = f.read()
cut_text = ' '.join(jieba.lcut(text))
print(cut_text)
# Mask image that defines the shape of the word cloud
# (scipy.misc.imread has been removed from SciPy, so the mask is loaded with Pillow + numpy instead)
color_mask = np.array(Image.open("201910051325286.jpg"))
cloud = WordCloud(
    # Note: the font file sits in the same directory as the script
    font_path='FZMWFont.ttf',  # a Chinese font must be set, ideally placed next to the script
    background_color='white',
    mask=color_mask,
    max_words=200,
    max_font_size=5000
)
word_cloud = cloud.generate(cut_text)
plt.imshow(word_cloud)
plt.axis('off')
plt.show()
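If the rendered cloud should also be kept on disk (not done in the original snippet), WordCloud's to_file method can write it out; the output file name below is arbitrary:

# Optional: also save the rendered cloud as an image file
word_cloud.to_file('comment_wordcloud.png')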
Result:

003. Douban Movies Top 250 (three parsing approaches)
# Goal: scrape the movie information from Douban's Top 250 ranking
# Fields: movie title, release date, leading actors, rating, director, one-line review
# Try every parsing approach covered so far: 1) regular expressions, 2) BeautifulSoup, 3) XPath
import requests
import re                      # regular expressions
import json
from bs4 import BeautifulSoup  # BS4
from lxml import etree         # XPath
# Process pool
from multiprocessing import Pool
import multiprocessing
def get_one_page(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None


def zhengze_parse(html):
    """Parse with regular expressions"""
    pattern = re.compile(
        '<em class="">(.*?)</em>.*?<img.*?alt="(.*?)".*?src="(.*?)".*?property="v:average">(.*?)</span>.*?<span>(.*?)</span>.*?'
        + 'class="inq">(.*?)</span>', re.S)
    items = re.findall(pattern, html)
    # print(items)
    # The 125th movie has no one-line summary, so the regex does not match it at all.
    # The fix is simple (extract the summary separately), but it is left as-is here.
    for item in items:
        yield {
            'index': item[0],
            'title': item[1],
            'image': item[2],
            'score': item[3],
            'people': item[4].strip()[:-2],
            'Evaluation': item[5]
        }


def soup_parse(html):
    """Parse with BeautifulSoup"""
    soup = BeautifulSoup(html, 'lxml')
    for data in soup.find_all('div', class_='item'):
        index = data.em.text
        image = data.img['src']
        title = data.img['alt']
        people = data.find_all('span')[-2].text[:-2]
        score = data.find('span', class_='rating_num').text
        # The 125th movie has no one-line summary; use an empty string instead
        if data.find('span', class_='inq'):
            Evaluation = data.find('span', class_='inq').text
        else:
            Evaluation = ''
        yield {
            'index': index,
            'image': image,
            'title': title,
            'people': people,
            'score': score,
            'Evaluation': Evaluation,
        }


def xpath_parse(html):
    """Parse with XPath"""
    html = etree.HTML(html)
    for data in html.xpath('//ol[@class="grid_view"]/li'):
        index = data.xpath('.//em/text()')[0]
        image = data.xpath('.//a/img/@src')[0]
        title = data.xpath('.//a/img/@alt')[0]
        people = data.xpath('.//div[@class="star"]/span[4]/text()')[0][:-2]
        score = data.xpath('.//div[@class="star"]/span[2]/text()')[0]
        # The 125th movie has no one-line summary; use an empty string instead
        if data.xpath('.//p[@class="quote"]/span/text()'):
            Evaluation = data.xpath('.//p[@class="quote"]/span/text()')[0]
        else:
            Evaluation = ''
        yield {
            'index': index,
            'image': image,
            'title': title,
            'people': people,
            'score': score,
            'Evaluation': Evaluation,
        }


def write_to_file(content, flag):
    with open('豆瓣电影TOP250(' + str(flag) + ').txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def search(Num):
    url = 'https://movie.douban.com/top250?start=' + str(Num)
    html = get_one_page(url)
    for item in zhengze_parse(html):
        write_to_file(item, '正则表达式')
    for item in soup_parse(html):
        write_to_file(item, 'BS4')
    for item in xpath_parse(html):
        write_to_file(item, 'xpath')
    page = Num // 25 + 1
    print("正在爬取第" + str(page) + '页')


def main():
    pool = Pool()
    pool.map(search, [i * 25 for i in range(10)])
    # # Without a process pool, just loop over the page offsets instead:
    # for i in range(0, 10):
    #     Num = i * 25
    #     search(Num)
    print("爬取完成")


if __name__ == '__main__':
    # Needed on Windows when the packaged exe uses multiprocessing
    multiprocessing.freeze_support()
    # Entry point
    main()
Package it into a standalone exe:
pyinstaller -F 豆瓣电影排行.py
Result:

004. Toutiao (street-style photo galleries)
# URL assembly
from urllib.parse import urlencode
# HTTP requests
import requests
# File operations
import os
# md5: hash the image bytes so file names never collide
from hashlib import md5
# Process pool
from multiprocessing.pool import Pool
# Delays
import time

base_url = 'https://www.toutiao.com/api/search/content/?'
headers = {
    'Referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
def get_page(offset):
    # https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis
    # Build params from the link above; only offset changes between pages
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = base_url + urlencode(params)
    # Return the response parsed as JSON
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)


def get_images(json):
    if json:
        items = json.get('data')
        for item in items:
            # Title of the post
            title = item.get('title')
            # List of images in the post
            images = item.get('image_list')
            for image in images:
                # Yield one dict per image: image URL plus the post title
                yield {
                    'image': image.get('url'),
                    'title': title,
                }


def save_image(item):
    # Create the target directory if it does not exist yet
    dirs = 'F:\\domo'
    if not os.path.exists(dirs):
        os.mkdir("F:\\domo")
    # Change the current working directory
    os.chdir('F:\\domo')
    # Create a sub-directory named after the item's title if it does not exist yet
    if not os.path.exists(item.get('title')):
        os.mkdir(item.get('title'))
    try:
        # Request the image URL
        response = requests.get(item.get('image'))
        if response.status_code == 200:
            # Build the file name from the md5 of the image bytes
            file_path = '{0}\\{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
            # Write the image in binary mode if it has not been downloaded before
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print("已经下载过这个文件了", file_path)
    except Exception:
        print("图片下载失败")


GROUP_START = 1
GROUP_END = 20


def main(offset):
    json = get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)


if __name__ == '__main__':
    pool = Pool()
    # Build the offset list, 20-400 (20 pages)
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    # Run main() across multiple processes
    pool.map(main, groups)
    # Close the pool
    pool.close()
    # Wait for the remaining workers to finish
    pool.join()
After about 10 pages the API stops returning data; a User-Agent pool is needed (see the sketch after the summary below).
Takeaways:
1. Basic os-module operations:
   os.chdir('path')            -- change the current working directory to path
   os.path.exists('filename')  -- whether the file exists in the current directory: True if it does, False otherwise
   os.mkdir()                  -- create a directory
2. Naming files by their MD5 hash is an effective way to avoid downloading duplicates.
3. A process pool cuts the crawl time considerably.
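A minimal sketch of the User-Agent pool mentioned above: pick a random UA string for each request. The UA strings are illustrative samples only, and get_page() would have to use these rotating headers instead of the fixed module-level ones:

import random

# A small pool of example desktop UA strings (illustrative; swap in your own list)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0',
]

def random_headers():
    """Return a fresh headers dict with a randomly chosen User-Agent."""
    return {
        'Referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'User-Agent': random.choice(USER_AGENTS),
        'X-Requested-With': 'XMLHttpRequest',
    }

# Usage inside get_page(): requests.get(url, headers=random_headers())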
005. Weibo
# URL assembly
from urllib.parse import urlencode
# Strip HTML tags from the post text
from pyquery import PyQuery as pq
# HTTP requests
import requests
# MongoDB connection
from pymongo import MongoClient
# Crawling too fast returns 418 at around page 36, so add some delay
import time

# Connect
client = MongoClient()
# Select the database
db = client['weibo']
# Select the collection
collection = db['weibo_domo2']
max_page = 100


# Store into MongoDB (insert_one replaces the deprecated insert)
def save_to_mongo(result):
    if collection.insert_one(result):
        print("saved to mongo")


# https://m.weibo.cn/api/container/getIndex?containerid=1076032830678474&page=2
# Look for the Ajax request carrying X-Requested-With: XMLHttpRequest
# Base URL; the query string is appended with urlencode
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# https://m.weibo.cn/api/container/getIndex?type=uid&value=1005052830678474&containerid=1005051005052830678474
headers = {
    'host': 'm.weibo.cn',
    # Open the profile on the mobile site, find the link, then parse it
    # 'Referer': 'https://m.weibo.cn/p/1005052830678474',
    'Referer': 'https://m.weibo.cn/u/2202323951',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
def get_page(page):
    params = {
        'type': 'uid',
        'value': '2202323951',
        # 'containerid': '1076032830678474',
        'containerid': '1076032202323951',
        'page': page,
    }
    url = base_url + urlencode(params)
    print(url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # response = json.dump(response.text)
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)


def parse_page(json, page: int):
    if json:
        # Only the cards list under data is needed
        items = json.get('data').get('cards')
        # index is the position of the card on the page
        for index, item in enumerate(items):
            # On page 1 the card at index == 1 has no mblog (it is the "following" card, which we do not need),
            # so looping over it blindly would raise an index error: skip it
            if index == 1 and page == 1:
                continue
            else:
                item = item.get('mblog')
                weibo = {}
                # Weibo post ID
                # "id":"4349509976406880",
                weibo['ID'] = item.get('id')
                # Post text; pyquery strips the HTML tags
                weibo['text'] = pq(item.get('text')).text()
                # Phone model used to post
                weibo['phone'] = item.get('source')
                # Time of posting
                weibo['time'] = item.get('edit_at')
                # Number of likes (attitudes)
                weibo['attitudes'] = item.get('attitudes_count')
                # Number of comments
                weibo['comments'] = item.get('comments_count')
                # Number of reposts
                weibo['reposts'] = item.get('reposts_count')
                yield weibo
if __name__ == '__main__':
    for page in range(1, max_page + 1):
        json = get_page(page)
        # *json unpacks the (json, page) tuple returned above into parse_page's arguments
        results = parse_page(*json)
        time.sleep(3)
        for result in results:
            print(result)
            save_to_mongo(result)
Takeaways:
1. Without a delay, a 418 shows up around page 36-38 (418 "I'm a teapot": the server refuses to brew coffee in a teapot); a small retry helper for this case is sketched after this list.
2. An Ajax feed can contain entries you do not want mixed into the data; e.g. on Weibo page 1, index 1 is the "following" card, not a post.
3. Getting the Ajax data through the mobile site is much easier than through the PC site.
4. mongod must be started with a dbpath (where the data is stored); the inserted documents can then be counted:
   e.g. mongod --dbpath="F:\MongoDB\Server\3.4\data"
   e.g. db.weibo_domo2.find().count()
5. In the end all of 朱子奇's posts were scraped: 959 in total.
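A minimal sketch of the retry idea from point 1, backing off and retrying when the server answers 418 or the connection fails (the function name and parameters are mine, not part of the original script):

import time
import requests

def get_with_backoff(url, headers=None, retries=3, delay=5):
    """Retry a GET a few times, sleeping a bit longer after every failed attempt."""
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response
            print('got status', response.status_code, '- attempt', attempt)
        except requests.ConnectionError as e:
            print('Error', e.args)
        time.sleep(delay * attempt)  # back off a little more on every retry
    return None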
006. Maoyan Movies Top 100
https://www.cnblogs.com/shuimohei/p/10400814.html
007. Baidu Baike
https://www.cnblogs.com/shuimohei/p/10339891.html
008. Douyu live streams
'''
The Ajax requests contain many encrypted parameters that we cannot reproduce directly,
so Selenium is used instead.
'''
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import unittest
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time


class douyu(unittest.TestCase):
    # The set-up hook must be called setUp()
    def setUp(self):
        # self.driver = webdriver.Chrome()
        self.driver = webdriver.PhantomJS()
        self.num = 0
        self.count = 0

    # Test method names must start with "test"
    def testDouyu(self):
        self.driver.get("https://www.douyu.com/directory/all")
        while True:
            soup = bs(self.driver.page_source, "lxml")
            # Room names, returned as a list
            names = soup.find_all("h3", {"class": "DyListCover-intro"})
            # Room popularity ("hot" values), returned as a list
            numbers = soup.find_all("span", {"class": "DyListCover-hot"})
            print(names, numbers)
            for name, number in zip(names, numbers):
                self.num += 1
                print(
                    u"直播间热度: -" + number.get_text().strip() + u"-\t房间名: " + name.get_text().strip() + u'-\t直播数量' + str(
                        self.num))
                result = u"直播间热度: -" + number.get_text().strip() + u"-\t房间名: " + name.get_text().strip() + u'-\t直播数量' + str(
                    self.num)
                with open('123.txt', 'a', encoding='utf-8') as f:
                    f.write(result)
                # self.count += int(number.get_text().strip())
            # If the page source marks the "next page" button as disabled, this is the last page: exit the loop
            if self.driver.page_source.find("dy-Pagination-disabled dy-Pagination-next") != -1:
                break
            # The network can be slow: add a delay, or better, wait until the button is clickable (see the sketch after this snippet)
            time.sleep(1)
            # Keep clicking "next page"
            self.driver.find_element_by_class_name("dy-Pagination-next").click()
            time.sleep(1)

    # Runs after the test finishes
    def tearDown(self):
        # Quit the PhantomJS browser
        print("当前网站直播人数" + str(self.num))
        print("当前网站总热度" + str(self.count))
        self.driver.quit()


if __name__ == "__main__":
    # Launch the unittest runner
    unittest.main()
Selenium is still on the slow side, and the extra delays make it even slower.
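As the comment in the loop suggests, an explicit wait for the "next page" button to become clickable can replace the fixed time.sleep calls. A minimal sketch using Selenium's expected conditions (the class name is the one used above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_page(driver, timeout=10):
    """Wait until the 'next page' button is clickable, then click it."""
    next_button = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "dy-Pagination-next"))
    )
    next_button.click()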
009. Sunshine Hotline petition platform (阳光热线问政平台)
1. Create the project
scrapy startproject dongguan
2. Create the spider
scrapy genspider -t crawl sun wz.sun0769.com
3. items.py
import scrapy


class DongguanItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    data = scrapy.Field()
    num = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()
4. sun.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem


class SunSpider(CrawlSpider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']

    rules = (
        # Follow the pagination links
        Rule(LinkExtractor(allow=r'page=\d+'), follow=True),
        # Parse each detail page
        Rule(LinkExtractor(allow=r'id=\d+'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        print(response.url)
        print(response)
        item = DongguanItem()
        item['title'] = response.xpath('//p[@class="focus-details"]/text()').extract_first()
        item['data'] = response.xpath('//span[@class="fl"]/text()').extract()[0][4:]
        item['num'] = response.xpath('//span[@class="fl"]/text()').extract()[2][3:]
        # normalize-space in the XPath expression strips \r\t\n from the extracted text
        item['content'] = response.xpath('normalize-space(//div[@class="details-box"]/pre/text())').extract_first()
        item['url'] = response.url
        yield item
5. pipelines.py
import json


class DongguanPipeline(object):
    def __init__(self):
        self.filename = open('dongguan.txt', 'wb')

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.filename.write(text.encode('utf-8'))
        return item

    def close_spider(self, spider):
        self.filename.close()
6. settings.py
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'dongguan.pipelines.DongguanPipeline': 300,
}
# Log file name and log level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
7. Run the spider
scrapy crawl sun
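If you prefer to launch the crawl from an IDE instead of the shell, the same main.py trick shown in section 010 below works here too; a minimal sketch:

# main.py -- place it in the project root next to scrapy.cfg
from scrapy import cmdline
cmdline.execute('scrapy crawl sun'.split())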
8. Result

010. Sina news: whole-site crawler by category
1. Create the project
scrapy startproject sina
2. Create the spider
scrapy genspider xinlang sina.com.cn
3. items.py
# -*- coding: utf-8 -*-
import scrapy
import sys, importlib
importlib.reload(sys)


class SinaItem(scrapy.Item):
    # Level 1: title and url of each top-level category
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()
    # Level 2: title and url of each sub-category
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()
    # Local storage: directory path for the sub-category
    subFilename = scrapy.Field()
    # Level 3: article links under the sub-category
    sonUrls = scrapy.Field()
    # Scraped data: article headline and body
    head = scrapy.Field()
    content = scrapy.Field()
4. xinlang.py -- the news pages use too many different layouts, so the parsing below is not exhaustive
# -*- coding: utf-8 -*-
import scrapy
# For creating directories
import os
from sina.items import SinaItem


class XinlangSpider(scrapy.Spider):
    name = 'xinlang'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://news.sina.com.cn/guide/']

    def parse(self, response):
        items = []
        # Use XPath to find the URLs and titles of all top-level categories (19 of them)
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()
        # Find the URLs and titles of all sub-categories (299 of them)
        subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()
        # Loop over all top-level categories
        for i in range(0, len(parentTitle)):
            # Directory path and name for this top-level category
            parentFilename = "./Data/" + parentTitle[i]
            # Create the directory if it does not exist
            if (not os.path.exists(parentFilename)):
                os.makedirs(parentFilename)
            # Loop over all sub-categories
            for j in range(0, len(subUrls)):
                item = SinaItem()
                # Store the title and url of the top-level category
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]
                # Check whether the sub-category url starts with the url of this top-level category
                # (e.g. sports.sina.com.cn and sports.sina.com.cn/nba)
                if_belong = subUrls[j].startswith(item['parentUrls'])
                # If it belongs to this category, store it under the category directory
                if (if_belong):
                    subFilename = parentFilename + '/' + subTitle[j]
                    # Create the directory if it does not exist
                    if (not os.path.exists(subFilename)):
                        os.makedirs(subFilename)
                    # Store the sub-category url, title and filename fields
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    item['subFilename'] = subFilename
                    items.append(item)
        # Send a Request for each sub-category url; the Response, together with the meta data,
        # is handed to the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

    # Recursively request the sub-category pages that came back
    def second_parse(self, response):
        # Pull the meta data out of the Response
        meta_1 = response.meta['meta_1']
        # Collect every link on the sub-category page (all <a> hrefs)
        sonUrls = response.xpath('//a/@href').extract()
        items = []
        for i in range(0, len(sonUrls)):
            # Check that the link starts with the parent category url and ends with .shtml,
            # which means it is an actual news article
            if_belong = sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])
            # If it belongs to this category, copy the fields into one item for transport
            if (if_belong):
                item = SinaItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]
                items.append(item)
        # Send a Request for each article url; the Response, together with the meta data,
        # is handed to the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback=self.detail_parse)

    # Parse the article page: headline and body
    def detail_parse(self, response):
        item = response.meta['meta_2']
        content = ""
        head = response.xpath('//h1[@class="main-title"]/text()').extract()
        content_list = response.xpath('//div[@class="article"]/p/text()').extract()
        # If this layout did not match, fall back through older page layouts
        if len(content_list) < 1:
            # News-center layout, e.g. http://news.sina.com.cn/w/2004-12-20/11314575163s.shtml
            head = response.xpath('//th[@class="f24"]//h1/text()').extract()
            content_list = response.xpath('//td[@class="l17"]/font/p/text()').extract()
        if len(content_list) < 1:
            # http://news.sina.com.cn/c/2012-09-21/092225223127.shtml
            head = response.xpath('//div[@class="blk_content"]/h1/text()').extract()
            content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
        if len(content_list) < 1:
            # http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
            head = response.xpath('//h1[@id="artibodyTitle"]/text()').extract()
            content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
        if len(content_list) < 1:
            # http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
            head = response.xpath('//h1[@class="main-title"]/text()').extract()
            content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
        if len(content_list) < 1:
            # http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
            head = response.xpath('//h1[@id="artibodyTitle"]/font/text()').extract()
            content_list = response.xpath('//div[@id="artibody"]//span/text()').extract()
        if len(head) < 1:
            # Anything that still slipped through the net
            head = ['error']
            content_list = [response.url]
        # Concatenate the text of all the <p> tags into one string
        for content_one in content_list:
            content += content_one
        item['head'] = head
        item['content'] = content
        yield item
5. pipelines.py
import json
from scrapy import signals


class SinaPipeline(object):
    def process_item(self, item, spider):
        sonUrls = item['sonUrls']
        # Build the file name from the middle part of the article url, replacing / with _, and save as .txt
        filename = sonUrls[7:-6].replace('/', '_')
        filename += ".txt"
        fp = open(item['subFilename'] + '/' + filename, 'w', encoding='utf-8')
        fp.write(item['content'])
        fp.close()
        return item
6. settings.py
BOT_NAME = 'sina'
SPIDER_MODULES = ['sina.spiders']
NEWSPIDER_MODULE = 'sina.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.5
ITEM_PIPELINES = {
    'sina.pipelines.SinaPipeline': 300,
}
# Log file name and log level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
7. main.py
Create a main.py in the project root for debugging:
from scrapy import cmdline
cmdline.execute('scrapy crawl xinlang'.split())
8. Run the program
Just run main.py.
9. Results
Only part of the news gets scraped; the parsing is not complete.
Successful requests: 4416
Maximum depth: 2
