1 - Learning Web Scraping from the Ground Up

Web Scraping

Getting Started with Web Scraping

urllib

from urllib import request


url = 'http://www.baidu.com'
# User-Agent: 模拟浏览器,防止服务器反爬
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# 使用request发送请求
# 创建请求对象
req = request.Request(url=url, headers=headers)
# 发送请求
response = request.urlopen(req)
# 响应数据
# print(response.read())  # 二进制
print(response.read().decode())  # 解码,得到字符串
# print(response.info())  # 响应信息
# print(response.status)  # 状态码
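
If the request fails, urlopen raises an exception instead of returning a response. A minimal sketch of catching those errors with urllib.error (the URL is just an example):

from urllib import request, error

try:
    req = request.Request('http://www.baidu.com', headers={'User-Agent': 'Mozilla/5.0'})
    response = request.urlopen(req, timeout=10)
    print(response.status)
except error.HTTPError as e:
    print('HTTP error:', e.code)       # the server answered with a 4xx/5xx status
except error.URLError as e:
    print('network error:', e.reason)  # DNS failure, refused connection, timeout, ...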

Simulating a Baidu search with urllib

from urllib import request
from urllib import parse


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}

def baidu_search(params, key):
    # 百度搜索url
    url = f'https://www.baidu.com/s?{params}'
    # 发送请求
    req = request.Request(url, headers=headers)
    res = request.urlopen(req)
    content = res.read().decode()
    print(content)
    # 保存爬取的数据
    with open(f'{key}.html', 'w', encoding='utf-8') as fp:
        fp.write(content)
        fp.flush()
'''
If you send data to the server, the data argument must be a bytes object carrying the payload; otherwise leave it as None. A POST request must carry data; for a GET request, data stays None.
data = bytes(parse.urlencode({"pro": "value"}), encoding="utf8")
response = request.urlopen("http://www.baidu.com", data=data)
'''


if __name__ == '__main__':
    key = input('请输入要搜索的内容')
    params = {'wd': key}
    params = parse.urlencode(params)  # 解决url中出现中文的问题
    # print(params)  # wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
    baidu_search(params, key)

Scraping 51job with urllib

import re
from urllib import request


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# url
url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,Python,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
# 发送请求
req = request.Request(url, headers=headers)
res = request.urlopen(req)
# 获取数据
content = res.read().decode('gbk')
# 使用正则
pattern = '"jobid_count":"(.*?)"'  # 捕获
result = re.findall(pattern, content, re.S)  # 让.可以匹配换行
print(result)

Downloading an image with urllib

from urllib import request


# 下载图片
request.urlretrieve(
    url='https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1600067380374&di=16addb0b6e336ab847a1403cebc09a43&imgtype=0&src=http%3A%2F%2Fgss0.baidu.com%2F-vo3dSag_xI4khGko9WTAnF6hhy%2Fzhidao%2Fpic%2Fitem%2Fb17eca8065380cd72cbb313da744ad34588281bd.jpg',
    filename='人民币.jpg'
)
request.urlcleanup()  # 清理缓存

Scraping Douban movies with urllib

import json
from urllib import request
import pymysql


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=0'
req = request.Request(url, headers=headers)
res = request.urlopen(req)
# json解析:json反序列化
# json一定要用双引号
# 不能在json中注释
content = res.read().decode()
result = json.loads(content)
'''
# 1.将电影数据存入本地txt文件
movie_list = result['data']
for movie in movie_list:
    title = movie['title']
    url = movie['url']
    with open('douban.txt', 'a', encoding='utf-8') as fp:
        s = str((title, url)) + '\n'
        fp.write(s)
        fp.flush()
'''
# 2.将电影数据存储到MySQL
# 连接MySQL
db = pymysql.connect(
    host='localhost', port=3306,
    user='root', password='nzw19940611',
    database='spider2003', charset='utf8mb4'
)
cur = db.cursor()  # 游标:执行SQL
# 执行SQL
movie_list = result['data']
for movie in movie_list:
    title = movie['title']
    url = movie['url']
    try:
        # sql
        sql = 'insert into tb_douban_movie(movie_title, url) values("%s", "%s")' % (title, url)
        cur.execute(sql)
        db.commit()  # 事务提交
    except Exception as e:
        print('插入失败:', e)
        db.rollback()  # 回滚
print('--插入MySQL完成--')
# content = eval(res.read().decode())
# for i in range(len(content['data'])):
#     with open('豆瓣.txt', 'a', encoding='utf-8') as fp:
#         fp.write(content['data'][i]['title']+'\n')
#         fp.flush()
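
The comments above note that JSON requires double quotes and allows no comments; a quick illustration with the standard json module (not part of the original script):

import json

print(json.loads('{"title": "ok"}'))   # double quotes: parses into a dict
try:
    json.loads("{'title': 'bad'}")     # single quotes: invalid JSON
except json.JSONDecodeError as e:
    print('not valid JSON:', e)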

Using proxy IPs with urllib

import random
from urllib import request
import json


# 先获取芝麻代理ip
url = 'http://http.tiqu.alicdns.com/getip3?num=10&type=2&pro=0&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=&gm=4'
# 请求芝麻代理API
response = request.urlopen(url)
content = response.read().decode()
# print(content)
# json解析,提取ip和port
result = json.loads(content)
ip_list = result['data']
# 把ip格式化后存入proxy_list
proxy_list = []
for ip in ip_list:
    ip_dict = {
        'HTTP': f'{ip["ip"]}:{ip["port"]}'
    }
    proxy_list.append(ip_dict)
# print(proxy_list)  # {'http': 'http://58.218.92.13:6905'}......
# User-Agent pool
UserAgentList = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36"
]
# 获取随机的代理IP
proxy = random.choice(proxy_list)
# 随机的UA
ua = random.choice(UserAgentList)
# 使用代理IP和UA
proxy_handler = request.ProxyHandler(proxies=proxy)  # 构建一个代理
opener = request.build_opener(proxy_handler)  # 使用构建的代理创建一个opener对象
# 发送请求
req = request.Request('http://www.baidu.com')
req.add_header('User-Agent', ua)  # 随机的ua
# 使用带代理的opener对象打开某个url/request
response = opener.open(req)  # 等价于request.urlopen()
res = response.read().decode()
print(res)

requests basics

import requests


# get请求
'''
response = requests.get('http://www.baidu.com')
# print(response)  # <Response [200]>
print(response.text)  # 默认使用utf-8解码,内容字符串
print(response.content)  # 二进制
# print(response.json())  # json解析

# print(response.headers)  # 头部信息
# print(response.cookies)  # 响应的cookie
# print(response.status_code)  # 状态码
'''

'''
# get请求:百度搜索
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
url = 'https://www.baidu.com/s?wd=hello'
response = requests.get(url, headers=headers)
print(response.text)
'''

# post请求:有道翻译
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
kw = input('请输入要翻译的单词:')
# data是post的参数
data = {
    "i": kw,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16000738465941",
    "sign": "bf2e220fb6fe0ec8e03524a390dc0b5c",
    "lts": "1600073846594",
    "bv": "e915c77f633538e8cf44c657fe201ebb",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_CLICKBUTTION"
}
response = requests.post(url, data=data, headers=headers)
result = response.json()  # json解析,解析成字典
src = result['translateResult'][0][0]['src']
tgt = result['translateResult'][0][0]['tgt']
print(src, tgt)

bs4 and xpath

Using proxies with requests

import random
import requests


'''
58.218.200.228:9150
58.218.200.223:4432
58.218.200.226:8256
58.218.200.228:7837
58.218.200.223:8915
'''
# proxy
# requests expects lowercase scheme keys and full proxy URLs
proxy_list = [
    {"http": "http://58.218.200.228:9150"},
    {"http": "http://58.218.200.223:4432"},
    {"http": "http://58.218.200.226:8256"},
    {"http": "http://58.218.200.228:7837"},
    {"http": "http://58.218.200.223:8915"}
]
# 获取随机代理IP
proxy = random.choice(proxy_list)
# 使用代理
res = requests.get('http://www.baidu.com', proxies=proxy)
print(res.text)

Using a session with requests

import requests


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
url = 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=1&s=1&click=0'
# 使用session
session = requests.session()
# 使用session发送请求:保持会话,存储cookie
response = session.get(url, headers=headers)
print(response.text)
# 当继续使用session访问其他url时,会自动携带之前的cookie
url2 = 'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=2&s=27&scrolling=y&log_id=1600483717480.6970&tpl=3_M&isList=1&show_items='
response2 = session.get(url2, headers=headers)
print(response2.text)
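
A minimal sketch showing that a session carries cookies across requests (it uses the public httpbin.org test service, which is not part of the original example):

import requests

s = requests.session()
s.get('https://httpbin.org/cookies/set/token/abc123')  # the server sets a cookie
print(s.get('https://httpbin.org/cookies').json())     # {'cookies': {'token': 'abc123'}}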

Using cookies with requests

import requests


url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
cookies = {
    "PSTM": "1600136817",
    "BDRCVFR[feWj1Vr5u3D]": "mk3SLVN4HKm",
    "BAIDUID": " E922D90277D06E37B8B783C0082C650A:FG=1",
    "delPer": "0",
    "BD_CK_SAM": "1",
    "PSINO": "6",
    "H_PS_PSSID": "7506_32606_1424_7605_32116_31709_26350",
    "BIDUPSID": "89E6649E57A3DC9DABE613D88595BA0D",
    "BDORZ": "B490B5EBF6F3CD402E515D22BCDA1598",
    "BD_UPN": "12314753",
    "COOKIE_SESSION": "16_0_2_5_3_11_0_0_0_2_0_0_67596_0_0_0_1600136510_0_1600136818%7C5%230_0_1600136818%7C1",
    "H_PS_645EC": "3fcbYEWAxGp5VGowaCXsud%2BK436DuYp%2Bu6fs%2FUwAz9UFcCyuSSHqbS7CSMLQBpsMjeN%2F"
}
response = requests.get(url, headers=headers, cookies=cookies)
# print(response.text)
# print(response.cookies)
# 将服务器返回的cookiejar,转换成字典dict
cookie_dict = requests.utils.dict_from_cookiejar(response.cookies)
print(cookie_dict)

Basic usage of bs4

from bs4 import BeautifulSoup
# Install BeautifulSoup4: pip install beautifulsoup4
# Install the lxml HTML parser: pip install lxml


html_doc = """
<html>
    <head>
        <title>呵呵</title>
    </head>
    <body>
    <p class="title">
        <b>哈哈</b>
    </p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="first" class="sister" id="link1">first</a>,
    <a href="second" class="sister" id="link2">second</a> and
    <a href="third" class="sister" id="link3">third</a>;
    </p>
    <p class="story">end</p>
    </body>
</html>
"""
# 使用bs4
# 创建bs4对象
soup = BeautifulSoup(html_doc, 'lxml')
# print(soup)
# print(type(soup))
# tag标签
# print(soup.head)
# print(type(soup.head))  # <class 'bs4.element.Tag'>
# print(soup.title)  # title标签
# print(soup.b)  # 哈哈
# print(soup.body.p.b)
# attribute属性
# print(soup.p.attrs)  # {'class': ['title']}第一个p所有属性
# print(soup.a.attrs)  # {'href': 'first', 'class': ['sister'], 'id': 'link1'}第一个a的所有属性
# print(soup.a.attrs['href'])  # 获取某个属性值
# 文本内容,建议使用text
# print(soup.b.string)  # 哈哈
# print(soup.b.text)  # 哈哈
# print(soup.p.string)  # None
# print(soup.p.text)  # 哈哈
# find_all():找到所有匹配的节点
# print(soup.find_all('p'))  # 所有p节点
# print(soup.find_all('p')[2])
# 根据属性来查找
# print(soup.find_all('p', attrs={'class': 'story'}))
# print(soup.find_all('a', attrs={'id': 'link1'}))
# print(soup.find_all('a', id='link1'))
# print(soup.find_all('a', limit=2))  # 前两个a标签
# print(soup.find_all(['a', 'b']))  # 找所有a标签和b标签
# css选择器
# soup.select()
# print(soup.select('p'))  # 标签选择器
# print(soup.select('#link2'))  # id选择器
# print(soup.select('.sister'))  # class选择器
# print(soup.select('p #link3'))  # 后代选择器
# 从文档中获取所有文字内容
print(soup.get_text())

Parsing stock fund data with bs4

import requests
from bs4 import BeautifulSoup


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# 股票网址
url = 'http://quote.stockstar.com/fund/stock.shtml'
response = requests.get(url, headers=headers)
content = response.content.decode('gb2312')
# print(content)
# bs4解析网页内容
soup = BeautifulSoup(content, 'lxml')
tr_list = soup.select('#datalist tr')
# print(tr_list)
for tr in tr_list:
    s_code = tr.find_all('td')[0].a.text  # 基金代码
    s_name = tr.find_all('td')[1].a.text  # 基金名称
    s_unit = tr.find_all('td')[2].text  # 单位
    s = str((s_code, s_name, s_unit)) + '\n'
    with open('fund.txt', 'a', encoding='utf-8') as fp:
        fp.write(s)
        fp.flush()

Basic usage of xpath

# xpath需要安装lxml
# pip install lxml
from lxml import etree


html_doc = """
<html>
    <head>
        <title>呵呵</title>
    </head>
    <body>
        <ul>
            <li class="item" id="box1">
                <a href="aabb">打仗1</a>
            </li>
            <li class="item" id="box2">
                <a href="aabbcc">打仗2</a>
            </li>
            <li class="item" id="box3">
                <a href="bbccdd">打仗3</a>
            </li>
            <li class="item" id="box4">
                <a href="ddee">打仗4</a>
            </li>
        </ul>
        <p class="item">
            <a href="aabb">打仗5</a>
        </p>
    </body>
</html>
"""
# 使用xpath
# 创建etree对象
mytree = etree.HTML(html_doc)
# print(mytree)  # <Element html at 0x1feda822e08>
# print(type(mytree))  # <class 'lxml.etree._Element'>
# /:子节点
# //:后代节点
# print(mytree.xpath('/html'))  # html标签
# print(mytree.xpath('/html/head'))  # head标签
# print(mytree.xpath('/html/body/ul/li'))  # 所有li标签
# print(mytree.xpath('//li'))  # 所有li标签
# print(mytree.xpath('//li')[1])  # 第二个li标签,得到etree对象
# print(mytree.xpath('//li[2]/@id'))
# text():文本内容
# li_list = mytree.xpath('//li')
# for li in li_list:
#     # 里面的.表示当前节点,不能省略
#     content = li.xpath('./a/text()')  # 文本内容
#     attr = li.xpath('./@id')  # 属性值
#     print(content, attr)
# 谓语:加条件
# 谓词写在[]中
# print(mytree.xpath('//li[1]/a/text()'))  # ['打仗1']
# print(mytree.xpath('//li[last()]/a/text()'))  # ['打仗4']
# print(mytree.xpath('//li[last()-1]/a/text()'))  # ['打仗3'],倒数第二个
# print(mytree.xpath('//li[position()<3]/a/text()'))  # ['打仗1', '打仗2']
# print(mytree.xpath('//li[position()>=3]/a/text()'))  # ['打仗3', '打仗4']
# print(mytree.xpath('//li[@id="box1"]/a/text()'))  # ['打仗1']
# print(mytree.xpath('//li[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4']
# *通配符
# print(mytree.xpath('//*[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']
# |或
# print(mytree.xpath('//li[@class="item"]/a/text() | //p[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']
# 包含contains()
# print(mytree.xpath('//li/a[contains(@href, "aa")]/text()'))  # ['打仗1', '打仗2']
print(mytree.xpath('//li/a[contains(text(), "2")]/text()'))  # ['打仗2']

Parsing stock fund data with xpath

import requests
from lxml import etree


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# 股票网址
url = 'http://quote.stockstar.com/fund/stock.shtml'
response = requests.get(url, headers=headers)
content = response.content.decode('gb2312')
# print(content)
# xpath解析网页内容
mytree = etree.HTML(content)
tr_list = mytree.xpath('//tbody[@id="datalist"]/tr')
for i, tr in enumerate(tr_list):
    f_code = tr.xpath('./td[1]/a/text()')[0]
    f_name = tr.xpath('./td[2]/a/text()')[0]
    f_unit = tr.xpath('./td[3]/text()')[0]
    # csv文件
    with open('fund.csv', 'a', encoding='gb2312') as fp:
        if i==0:
            fp.write('基金代码,基金名称,单位净值\n')
        f = f'{f_code},{f_name},{f_unit}\n'
        fp.write(f)
        fp.flush()

Selenium and CAPTCHA cracking

Cracking CAPTCHAs with Chaojiying

import requests
from hashlib import md5


class Chaojiying_Client(object):
    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }
    def PostPic(self, im, codetype):
        """
        im: 图片字节
        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()
    def ReportError(self, im_id):
        """
        im_id:报错题目的图片ID
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


if __name__ == '__main__':
    chaojiying = Chaojiying_Client('lotuslaw', '******', '908114')
    # 用户中心>>软件ID 生成一个替换 96001
    img = open('../a.jpg', 'rb').read()
    #本地图片文件路径 来替换 a.jpg
    print(chaojiying.PostPic(img, 1902))
    # 1902 验证码类型

Basic usage of Selenium

import time
from selenium import webdriver

# 创建浏览器驱动
# 可以手动配置驱动的路径
# 将chromedriver.exe放到python.exe同目录
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome()
# 打开浏览器
driver.get('http://www.baidu.com')
# 获取网页源码
# print(driver.page_source)
# 关闭
# time.sleep(5)
# driver.close()  # 只关闭一个窗口
# driver.quit()  # 退出,关闭所有窗口
# 百度贴吧
driver.get('https://tieba.baidu.com/index.html')
'''
# 查找元素
wd1 = driver.find_element_by_id('wd1')
# wd1.send_keys('美女', Keys.ENTER)  # 给输入框填充内容,自动按回车
time.sleep(2)
wd1.send_keys('美女')
# 点击按钮
btn = driver.find_element_by_xpath('//a[@class="search_btn search_btn_enter_ba j_enter_ba"]')
# btn.click()
# 获取内容和属性值
print(btn.get_attribute('innerText'))  # innerText, innerHTML
print(wd1.get_attribute('value'))  # 输入框的值
'''
# 执行js
time.sleep(3)
# 执行JS脚本
# driver.execute_script('window.scrollBy(0, 5000)')
for i in range(5):
    driver.execute_script('window.scrollBy(0,5000)')
    time.sleep(2)
# 截图
# driver.save_screenshot('tieba.png')

Logging in to Zhihu with Selenium

import time
from selenium import webdriver


# 知乎登录页面
url = 'https://www.zhihu.com/signin?next=%2F'
# 打开知乎页面
driver = webdriver.Chrome()
driver.get(url)
time.sleep(2)
# 点击qq
driver.find_element_by_xpath('//*[@class="Button Login-socialButton Button--plain"][2]').click()
# 停10秒,手动扫码登录
time.sleep(10)
# 刷新页面
driver.refresh()
# 获取页面
print(driver.page_source)
print(driver.get_cookies())

Headless browser

from selenium import webdriver
from selenium.webdriver import ChromeOptions

# 在正常页面跑通流程后,使用无头浏览器节约资源
options = ChromeOptions()
options.add_argument('--headless')  # 无头浏览器
options.add_argument('--disable-gpu')  # 禁用GPU
# 创建驱动对象
driver = webdriver.Chrome(options=options)
driver.get('http://www.baidu.com')
print(driver.page_source)

Setting a proxy in Selenium

from selenium import webdriver


options = webdriver.ChromeOptions()
# 设置代理IP
options.add_argument('--proxy-server=http://58.218.200.226:8256')
# 创建驱动
driver = webdriver.Chrome(options=options)
driver.get('http://www.baidu.com')
print(driver.page_source)

Renren login via Chaojiying CAPTCHA cracking

from Day03.chaojiying import chaojiying
import requests
import random

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
}

def get_code():
    url = 'http://icode.renren.com/getcode.do?t=web_login&rnd=' + str(random.random())

    # 获取验证码图片
    res = session.get(url, headers=headers)
    content = res.content  # 图片二进制

    # 使用超级鹰破解
    cjy = chaojiying.Chaojiying_Client('lotuslaw', '******', '908114')
    code = cjy.PostPic(content, 1902)
    # print(code)
    return code


def login(code):
    # 登录接口抓取:给一个错误的密码进行登录
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2020831616448'

    login_data = {
        "email": "18566218480",
        "icode": code,
        "origURL": "http://www.renren.com/home",
        "domain": "renren.com",
        "key_id": "1",
        "captcha_type": "web_login",
        "password": "88d7f48bf698c0f1b0dcca94bfb40361c6c82ced70f8cbf0619d725e0341d2e5",
        "rkey": "e8d80414c49ceb424291126858ee6226",
        "f": ''
    }
    # 发送请求
    res = session.post(login_url, data=login_data, headers=headers)
    content = res.text
    print(content)
    
    
# 登录后访问个人中心
def user_center():
    url = 'http://www.renren.com/480089210/profile'
    res = session.get(url, headers=headers)
    print(res.text)


if __name__ == '__main__':
    session = requests.session()
    code = get_code()
    login(code)
    user_center()

Scrapy Basics

  • Scrapy framework overview

    • Scrapy is an application framework written in pure Python for crawling websites and extracting structured data; it is very widely used
    • You only need to implement a few modules to get a working crawler that fetches page content and images
    • Scrapy uses the Twisted asynchronous networking framework (its main rival is Tornado) to handle network I/O, which speeds up downloads without having to write your own async code, and it exposes middleware hooks so all kinds of requirements can be met flexibly
  • Scrapy components

    • Scrapy Engine
      • Handles communication, signals and data flow between the Spider, Item Pipeline, Downloader and Scheduler
    • Scheduler
      • Accepts Requests sent by the engine, organizes and queues them, and hands them back when the engine asks for them
    • Downloader
      • Downloads all Requests sent by the Scrapy Engine and returns the Responses to the engine, which passes them to the Spider
    • Spider
      • Processes all Responses, extracts the data needed for Item fields, and submits the URLs that need following to the engine, which feeds them back into the Scheduler
    • Item Pipeline
      • Processes the Items produced by the Spider: detailed analysis, filtering, storage and other post-processing
    • Downloader Middlewares
      • A component you can extend to customize the download behaviour
    • Spider Middlewares
      • A component that extends and intercepts communication between the engine and the Spider (the Responses going into the Spider and the Requests coming out of it)
  • Installation and usage

    • Install

      • pip install scrapy

    • Usage

      • Create a new project

        • cd into the directory where the project should live
        • scrapy startproject meiju
      • Create a spider

        • cd into the project directory
        • scrapy genspider meijuSpider meijutt.tv
          • meijuSpider is the spider (file) name
          • meijutt.tv is the domain to be crawled
        • Project layout
          • scrapy.cfg
            • Project configuration, mainly base settings for the Scrapy command-line tool (the real crawler settings live in settings.py)
          • items.py
            • Templates for the structured data you extract, similar to Django models
          • pipelines
            • Data-processing behaviour, e.g. persisting the structured data
          • settings.py
            • Configuration file: crawl depth, concurrency, download delay, etc.
          • spiders
            • Spider directory: create files here and write the crawling rules
      • Define the Item

        • class MeijuItem(scrapy.Item):
              name = scrapy.Field()
          
      • Write the spider

        • Change the start URL(s)

        • Process the data

        • def parse(self, response):
              name = response.xpath('//h5/a/text()').get()
              item = MeijuItem()
              item['name'] = name
              yield item
          
      • Enable an Item Pipeline component in settings.py

        • ITEM_PIPELINES = {
             'meiju.pipelines.MeijuPipeline': 300,
          }
          
      • Write a Pipeline to store the extracted Items

        • Quick export

          • Export directly when running the crawl

          • scrapy crawl meijuSpider -o meiju.json
            scrapy crawl meijuSpider -o meiju.csv
            scrapy crawl meijuSpider -o meiju.xml
            
      • Run the spider

        • From the command line

          • scrapy crawl meijuSpider
          • scrapy crawl meijuSpider --nolog
        • Or create a runner file, start.py

        • # run the scrapy command: start the spider
          scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju'])
          # scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju', '--nolog'])
          scrapy.cmdline.execute('scrapy crawl mymeiju'.split())
          # scrapy.cmdline.execute('scrapy crawl mymeiju --nolog'.split())
          

Scraping the meijutt TV-series site with Scrapy

  • mymeiju.py
import scrapy
from ..items import MeijuItem


class MymeijuSpider(scrapy.Spider):
    # 爬虫名:唯一
    name = 'mymeiju'
    # 允许的域名列表
    allowed_domains = ['meijutt.tv']
    # 开始的url列表:启动项目后回直接自动爬取的url列表
    start_urls = ['https://www.meijutt.tv/new100.html']
    # 解析数据方法:
    # 1.当start_urls中的网页请求完成后回自动调用当前的parse方法,并返回响应
    def parse(self, response, **kwargs):
        print('*' * 100)
        # print(response)
        # print(type(response))  # <class 'scrapy.http.response.html.HtmlResponse'>
        # print(response.text)  # 获取文本内容
        # print(response.body)  # 二进制内容
        # print(response.json())  # 解析json
        print('*' * 100)
        # 解析数据:xpath
        li_list = response.xpath('//ul[@class="top-list  fn-clear"]/li')
        for li in li_list:
            # 有3种方式获取内容
            # name = li.xpath('./h5/a/text()').get()
            # name = li.xpath('./h5/a/text()')[0].extract()
            # name = li.xpath('./h5/a/text()').extract_first()
            # name = li.xpath('./span/text()').getall()  # 获取所有匹配的内容,他是一个列表
            name = li.xpath('./h5/a/text()').get()  # 剧名
            state = li.xpath('./span[1]/font/text()').get()  # 状态:级数
            mjzm = li.xpath('./span[2]/em/text()').get()  # 字幕
            mjjq = li.xpath('./span[3]/text()').get()  # 分类
            mjtv = li.xpath('./span[4]/text()').get()  # 电视台
            mjtime = li.xpath('./div[last()]/font/text()').get()  # update time
            if not mjtime:
                mjtime = li.xpath('./div[last()]/text()').get()
            # print(name)
            # item:封装每个数据
            # item = MeijuItem()
            # item['name'] = name  # 不能用点语法
            item = MeijuItem(
                name=name, state=state, mjzm=mjzm,
                mjjq=mjjq, mjtv=mjtv, mjtime=mjtime
            )
            # 生成器,既是迭代器,又是可迭代对象
            yield item
            # 这里的item回传入到pipelines中,需要做两个事情
            # 1.需要在parse方法中yield item
            # 2.需要在settings中将ITEM_PIPELINES设置好
            # yield返回2种值
            # 1.返回item
            # 2.返回Request/FormRequest
  • items.py
import scrapy


# Item: 类似Django种的Model
class MeijuItem(scrapy.Item):
    name = scrapy.Field()
    state = scrapy.Field()
    mjzm = scrapy.Field()
    mjjq = scrapy.Field()
    mjtv = scrapy.Field()
    mjtime = scrapy.Field()
  • pipelines.py
from itemadapter import ItemAdapter


# pipeline:专门用来存储数据
class MeijuPipeline:
    # 开始爬虫:自动调用该函数一次
    def open_spider(self, spider):
        pass
        # 打开文件
        # self.fp = open('meiju.txt', 'a', encoding='utf-8')
        # print('开始爬取......')

    # 关闭爬虫:自动调用
    def close_spider(self, spider):
        pass
        # 关闭文件
        # self.fp.close()
        # print('爬虫结束!')

    # process_item:会被调用很多次(取决于yield item的次数)
    def process_item(self, item, spider):
        # print(spider.name)  # 爬虫名
        # print(f'item:{item}', type(item))
        # 写入文件
        # with open('meiju.txt', 'a', encoding='utf-8') as fp:
        #     fp.write(str(item) + '\n')
        # self.fp.write(str(item) + '\n')
        # print(f'{item["name"]}写入成功')
        return item
  • settings.py
BOT_NAME = 'meiju'
SPIDER_MODULES = ['meiju.spiders']
NEWSPIDER_MODULE = 'meiju.spiders'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
   'meiju.pipelines.MeijuPipeline': 300,
}
  • start.py
import scrapy.cmdline

# 执行scrapy命令:开启爬虫
# scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju'])
# scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju', '--nolog'])

# 使用split
# scrapy.cmdline.execute('scrapy crawl mymeiju'.split())
# scrapy.cmdline.execute('scrapy crawl mymeiju --nolog'.split())

# 快速存储成指定格式的文件
# 支持的文件格式('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
# scrapy.cmdline.execute('scrapy crawl mymeiju -o meiju2.json'.split())
scrapy.cmdline.execute('scrapy crawl mymeiju -o meiju2.csv'.split())

Scraping Dangdang with Scrapy

  • dangdang_spider.py
import scrapy
from ..items import DangdangItem


class DangdangSpiderSpider(scrapy.Spider):
    name = 'dangdang_spider'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/pg1-cp01.01.02.00.00.00.html']
    def parse(self, response, **kwargs):
        li_list = response.xpath('//ul[@id="component_59"]/li')
        for li in li_list:
            book_name = li.xpath('./a/@title').get()
            book_price = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()').get()
            book_author = li.xpath('./p[@class="search_book_author"]/span[1]/a/text()').get()
            book_publishers = li.xpath('./p[@class="search_book_author"]/span[3]/a/text()').get()
            book_star = li.xpath('./p[@class="search_star_line"]/span/span/@style').get()[6:-1]
            book_comment = li.xpath('./p[4]/a/text()').get()
            book_picture = li.xpath('./a/img/@data-original')
            if book_picture:
                book_picture = book_picture.get()
            else:
                book_picture = li.xpath('./a/img/@src').get()
            print(book_picture)
            item = DangdangItem(
                book_name=book_name,
                book_price=book_price,
                book_author=book_author,
                book_publishers=book_publishers,
                book_star=book_star,
                book_comment=book_comment,
                book_picture=book_picture
            )
            yield item
  • items.py
import scrapy


class DangdangItem(scrapy.Item):
    book_name = scrapy.Field()
    book_price = scrapy.Field()
    book_author = scrapy.Field()
    book_publishers = scrapy.Field()
    book_star = scrapy.Field()
    book_comment = scrapy.Field()
    book_picture = scrapy.Field()
  • pipelines.py
import pymysql


class DangdangPipeline:
    def open_spider(self, spider):
        print('开始爬取')
        self.db = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='******',
            database='spider2003',
            charset='utf8'
        )
        self.cur = self.db.cursor()
    def close_spider(self, spider):
        print('爬取结束')
        self.cur.close()
        self.db.close()
    def process_item(self, item, spider):
        # item['name'].replace('"', "'")  # 单引号替换双引号
        sql = 'insert into dangdang(book_name, book_price, book_author, book_publishers, book_star, book_comment, book_picture) values ("%s", "%s", "%s", "%s", "%s", "%s", "%s")' % (item['book_name'], item['book_price'], item['book_author'], item['book_publishers'], item['book_star'], item['book_comment'], item['book_picture'])
        try:
            self.cur.execute(sql)
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
        return item
  • settings.py
BOT_NAME = 'dangdang'
SPIDER_MODULES = ['dangdang.spiders']
NEWSPIDER_MODULE = 'dangdang.spiders'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
   'dangdang.pipelines.DangdangPipeline': 300,
}
  • start.py
import scrapy.cmdline


scrapy.cmdline.execute('scrapy crawl dangdang_spider'.split())

Scrapy Intermediate

  • Scrapy Shell

    • scrapy shell <url> opens an interactive console with the response already downloaded, handy for trying selectors out

  • Selectors

    • Scrapy Selectors have built-in support for XPath and CSS selector expressions
    • A Selector has four basic methods (xpath is the most common); a small sketch follows this list
      • xpath()
        • Takes an XPath expression and returns a selector list of all matching nodes
      • extract()
        • Serializes the node(s) to Unicode strings and returns them as a list; extract_first() returns the first one
      • css()
        • Takes a CSS expression and returns a selector list of all matching nodes, with the same syntax as soup.select() in BeautifulSoup4
      • re()
        • Extracts data using the given regular expression and returns a list of Unicode strings
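      • A minimal standalone sketch (not from the original project) showing these methods on a scrapy Selector:

        from scrapy.selector import Selector

        html = '<ul><li class="a">one</li><li class="a">two</li></ul>'
        sel = Selector(text=html)
        print(sel.xpath('//li/text()').getall())       # ['one', 'two']
        print(sel.css('li.a::text').extract_first())   # 'one'
        print(sel.xpath('//li/text()').re(r'o(\w+)'))  # ['ne']
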
  • The Spider class

    • Overview
      • A Spider defines how a site (or group of sites) is crawled: the crawling actions (e.g. whether to follow links) and how structured data (Items) is extracted from page content. In other words, the Spider is where you define the crawl behaviour and how particular pages are parsed
      • scrapy.Spider is the base class; every spider you write must inherit from it
    • Main methods and their call order (a start_requests() sketch follows this list)
      • __init__()
        • Initializes the spider name and the start_urls list
      • start_requests()
        • Generates the initial Request objects, hands them to Scrapy to download and returns the responses
      • parse(self, response)
        • Parses the response and returns Items or further Requests (with a callback specified)
        • Items go to the Item Pipeline for persistence, while Requests are downloaded by Scrapy and handled by the specified callback (parse() by default), looping until all data has been processed
    • Main attributes and methods
      • name
        • A string naming the spider; must be unique
      • allowed_domains
        • Optional list of domains the spider is allowed to crawl
      • start_urls
        • The initial tuple/list of URLs; when no specific URLs are given, the spider starts crawling from this list
      • start_requests(self)
        • Must return an iterable containing the first Requests the spider will crawl (the default implementation builds them from start_urls)
        • Called when the spider starts crawling, and the place to customize the initial requests
      • parse(self, response)
        • The default callback for Requests that don't specify one; processes the response and produces Items or new Request objects
      • log(self, message[, level, component])
        • Records log messages
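      • A minimal sketch (spider name and URL are placeholders, not from the original) of overriding start_requests() instead of relying on start_urls:

        import scrapy

        class FirstRequestSpider(scrapy.Spider):
            name = 'first_request_demo'

            def start_requests(self):
                # no start_urls: build the first request(s) ourselves
                yield scrapy.Request('http://example.com/page/1', callback=self.parse)

            def parse(self, response, **kwargs):
                self.log(f'fetched {response.url}')
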
  • The CrawlSpider class

    • Overview
      • CrawlSpider is a subclass of Spider
      • A plain Spider is designed to crawl only the pages in start_urls
      • CrawlSpider adds rules that make it easy to follow links, so it is better suited to extracting links from crawled pages and continuing from there (a short sketch follows this list)
        • e.g. automatic pagination
    • LinkExtractors
      • Overview
        • The purpose of a LinkExtractor is to extract links
        • Each LinkExtractor has a single public method, extract_links(), which takes a Response object and returns a list of scrapy.link.Link objects
      • Main parameters
        • allow
          • URLs matching the given regular expression(s) are extracted; if empty, everything matches
        • deny
          • URLs matching this regular expression (or list of them) are never extracted
        • allow_domains
          • Domains whose links will be extracted
        • deny_domains
          • Domains whose links will never be extracted
        • restrict_xpaths
          • XPath expression(s) that, together with allow, restrict where links are extracted from
    • rules
      • Overview
        • rules contains one or more Rule objects; each Rule defines a specific action for crawling the site
        • If several Rules match the same link, the first one defined in the collection is used
      • Main parameters
        • link_extractor
          • A LinkExtractor object defining which links to extract
        • callback
          • The callback used for each link obtained from link_extractor; it receives a response as its first argument (avoid using parse itself)
        • follow
          • A boolean indicating whether links extracted from the responses of this rule should be followed
          • follow=True
            • Follow: links on sub-pages that also match the rule are crawled automatically
        • process_links
          • Name of a spider method called with the list of links obtained from link_extractor
          • Mainly used for filtering
        • process_request
          • Name of a spider method called for every request this rule produces (used to filter requests)
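      • A minimal CrawlSpider sketch (the domain and allow pattern are placeholders, not from the original):

        from scrapy.spiders import CrawlSpider, Rule
        from scrapy.linkextractors import LinkExtractor

        class FollowPagesSpider(CrawlSpider):
            name = 'follow_pages_demo'
            start_urls = ['http://example.com/list/1']
            rules = [
                # follow every /list/<n> link and parse it with parse_item
                Rule(LinkExtractor(allow=(r'/list/\d+',)), callback='parse_item', follow=True),
            ]

            def parse_item(self, response):
                yield {'url': response.url}
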
  • The Robots protocol

    • Overview
      • The Robots protocol (also called the crawler or robot protocol), formally the Robots Exclusion Protocol, is how a website tells search engines which pages may and may not be crawled
      • robots.txt is a plain text file. When a crawler visits a site it first checks for robots.txt in the site root; if it exists, the crawler limits itself to what the file allows, and if it does not, the crawler can access every page that is not password-protected
    • Usage
      • To ignore the robots protocol, change ROBOTSTXT_OBEY = True to False
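      • For reference, a minimal robots.txt looks like this (illustrative content, not from the original):

        User-agent: *
        Disallow: /admin/
        Allow: /
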
  • Deep crawling

    • Extract a link, follow it and crawl, extract more links, follow those and crawl again......

    • yield scrapy.Request(
                      url=href,  # the link to request
                      callback=self.parse_detail,  # callback that receives the response on success
                      meta={'name': name}  # data passed through to parse_detail
                  )
      
      • scrapy.Request crawls asynchronously
    • name = response.meta['name']

      • Retrieves the novel name from meta
      • Passed along level by level
    • yield BiqugeItem(name=name, zj_name=zj_name, zj_content=zj_content)

      • Hands the data to the pipeline
  • Pagination by looping

    • # crawl the next page
      if self.page <= 100:
          print(f'---开始爬取{self.page}页---')
          self.page = self.page + 1
          url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_%d.shtml' % self.page
          yield scrapy.Request(url, callback=self.parse)
      

Scraping Biquge with Scrapy

  • biquege_spider.py
import requests
import scrapy
from ..items import BiqugeItem


class BiqugeSpiderSpider(scrapy.Spider):
    name = 'biquge_spider'
    allowed_domains = ['biquge5200.cc']
    start_urls = ['https://www.biquge5200.cc/xuanhuanxiaoshuo/']
    # 爬取笔趣阁的首页
    def parse(self, response, **kwargs):
        # 解析数据
        li_list = response.xpath('//div[@class="l"]/ul/li')
        for li in li_list:
            name = li.xpath('./span[@class="s2"]/a/text()').get()  # 小说名
            href = li.xpath('./span[@class="s2"]/a/@href').get()  # 小说链接
            # requests:同步
            # print(len(requests.get(href).text))
            # print('-' * 100)
            # 异步:scrapy.Request
            # 请求小说详情页
            yield scrapy.Request(
                url=href,  # url链接
                callback=self.parse_detail,  # 回调函数:请求成功后的响应
                meta={'name': name}  # 传入到parse_detail中的数据
            )
    # 详情页
    def parse_detail(self, response):
        # 取出小说名
        name = response.meta['name']
        # 解析数据
        dd_list = response.xpath('//div[@id="list"]/dl/dd')
        for dd in dd_list:
            zj_name = dd.xpath('./a/text()').get()  # 章节名称
            zj_href = dd.xpath('./a/@href').get()  # 章节内容链接
            # 请求每个章节的小说内容
            yield scrapy.Request(
                url=zj_href,
                callback=self.parse_content,
                meta={'name': name, 'zj_name': zj_name}
            )
    # 小说内容页
    def parse_content(self, response):
        # 取出小说名及章节名
        name = response.meta['name']
        zj_name = response.meta['zj_name']
        # 解析数据
        p_list = response.xpath('//*[@id="content"]/p/text()').getall()
        zj_content = '\n'.join(p_list)
        # item
        # 将数据传入管道
        yield BiqugeItem(name=name, zj_name=zj_name, zj_content=zj_content)
  • items.py
import scrapy


class BiqugeItem(scrapy.Item):
    name = scrapy.Field()
    zj_name = scrapy.Field()
    zj_content = scrapy.Field()
  • pipelines.py
import os
from itemadapter import ItemAdapter


class BiqugePipeline:
    # def __init__(self):
    #     self.path = r'C:\Users\86188\Desktop\Spider\Day05\scrapy_project\biquge\books'
    def process_item(self, item, spider):
        # make sure the book's directory exists, then append the chapter text
        os.makedirs("books/%s" % item['name'], exist_ok=True)
        with open('books/%s/%s.txt' % (item["name"], item["zj_name"]), 'a', encoding='utf-8') as fp:
            fp.write(item["zj_content"])
            fp.flush()
        print(f'item:{item["name"]}-{item["zj_name"]}')
        return item
  • settings.py
BOT_NAME = 'biquge'
SPIDER_MODULES = ['biquge.spiders']
NEWSPIDER_MODULE = 'biquge.spiders'
USER_AGENT = 'biquge (+http://www.yourdomain.com)'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
   'biquge.pipelines.BiqugePipeline': 300,
}
  • start.py
import scrapy.cmdline


scrapy.cmdline.execute('scrapy crawl biquge_spider'.split())

Scraping JD with Scrapy

  • jd_spider.py
import scrapy
from selenium import webdriver
from ..items import JdItem


'''
Referer is one of the common anti-scraping checks.
Its value tells the server which page you navigated from.
For example, when requesting Taobao product reviews, the referer is the product detail page; without a referer the server will not return the reviews.
from fake_useragent import UserAgent
# pretend to be a browser
ua = UserAgent()
headers = {'User-Agent': ua.random}  # usually enough, but hotlink-protected images also need a Referer, as below
# i.e. tell the image server "I'm coming from your own page, now give me the data"
headers = {'User-Agent': ua.random, 'Referer': 'URL of the page that embeds the image'}
# then pass these headers into subsequent requests calls
'''
class JdSpiderSpider(scrapy.Spider):
    name = 'jd_spider'
    allowed_domains = ['jd.com']
    start_urls = [
        # 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=1&s=1&click=0',
        # 'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=4&s=79&scrolling=y&log_id=1600660067305.2410&tpl=3_M&isList=1&show_items=',
        'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=6&s=131&scrolling=y&log_id=1600661434422.8716&tpl=3_M&isList=1&show_items='
    ]
    page1 = 1
    # page2 = 2
    s1 = 1
    # s2 = 27
    def parse(self, response, **kwargs):
        # driver = webdriver.Chrome()
        # driver.execute_script('window.scrollBy(0,10000)')
        li_list = response.xpath('//li[@class="gl-item"]')
        print(len(li_list))
        for li in li_list:
            shoes_name = li.xpath('./div/div[@class="p-img"]/a/@title').get()
            shoes_price = li.xpath('./div/div[@class="p-price"]/strong/i/text()').get()
            shoes_picture = li.xpath('./div/div[@class="p-img"]/a/img/@data-lazy-img').get()
            print(shoes_name, shoes_price, shoes_picture)
            yield JdItem(shoes_name=shoes_name, shoes_price=shoes_price, shoes_picture=shoes_picture)
        # driver.close()
        # if self.page1 <= 10:
        # # if self.page2 <= 200:
        #     print(f'---开始爬取{self.page1}页---')
        #     # print(f'---开始爬取{self.page2}页---')
        #     self.page1 = self.page1 + 2
        #     self.s1 = self.s1 + 52
        #     # self.page2 = self.page2 + 2
        #     # self.s2 = self.s2 + 52
        #     url = f'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page={self.page1}&s={self.s1}&click=0'
        #     # url =  f'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page={self.page2}&s={self.s2}&scrolling=y&log_id=1600431181482.2679&tpl=3_M&isList=1&show_items='
        #
        #
        #     yield scrapy.Request(url, callback=self.parse)
  • items.py
import scrapy


class JdItem(scrapy.Item):
    shoes_name = scrapy.Field()
    shoes_price = scrapy.Field()
    shoes_picture = scrapy.Field()
  • pipelines.py
import pymysql
from itemadapter import ItemAdapter


class JdPipeline:
    def open_spider(self, spider):
        print('连接数据库')
        self.db = pymysql.connect(
            user='root', password='******',database='spider2003'
        )
        self.cur = self.db.cursor()
    def close_spider(self, spider):
        print('关闭连接')
        self.cur.close()
        self.db.close()
    def process_item(self, item, spider):
        sql = 'insert into jd (shoes_name, shoes_price, shoes_picture) values ("%s", "%s", "%s")' % (item['shoes_name'], item['shoes_price'], item['shoes_picture'])
        try:
            self.cur.execute(sql)
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
        return item
  • settings.py
BOT_NAME = 'jd'
SPIDER_MODULES = ['jd.spiders']
NEWSPIDER_MODULE = 'jd.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'referer': 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=3&s=53&click=0'
}
ITEM_PIPELINES = {
   'jd.pipelines.JdPipeline': 300,
}
  • start.py
import scrapy.cmdline


# scrapy.cmdline.execute('scrapy crawl jd_spider --nolog'.split())
scrapy.cmdline.execute('scrapy crawl jd_spider'.split())

Scraping Qiushibaike with Scrapy

  • qsbk_spider.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import QiushibaikeItem
# 导入日志模块
import logging
# 配置日志输出格式
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(module)s - %(message)s"  # 设置输出格式
DATE_FORMAT = "%Y/%m/%d %H:%M:%S"  # 设置时间格式
logging.basicConfig(filename='qsbk.log', filemode='a+', format=LOG_FORMAT, datefmt=DATE_FORMAT)


class QsbkSpiderSpider(CrawlSpider):
# class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']
    rules = [
        Rule(
            LinkExtractor(
                allow=('/text/page/\d+/',),
                restrict_xpaths=('//ul[@class="pagination"]',)
            ),
            callback="parse_item",
            follow=True
        )
    ]
    def parse_item(self, response, **kwargs):
        div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for div in div_list:
            author = div.xpath('./div[@class="author clearfix"]/a[2]/h2/text()').get()
            content = div.xpath('./a[@class="contentHerf"]/div/span/text()').getall()  # 有br换行时,要用getall,但是要处理结果
            logging.info(f'download:{author}')
            yield QiushibaikeItem(author=author, content=content)
  • items.py
import scrapy


class QiushibaikeItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()
  • pipelines.py
import os
import random
from itemadapter import ItemAdapter


class QiushibaikePipeline:
    def process_item(self, item, spider):
        with open('cross_talk/%s-%f.txt' % (item['author'].replace('\n', ''), random.random()), 'w', encoding='utf-8') as fp:
            fp.write((''.join(item['content'])).replace('\n', ''))
            fp.flush()
        return item
  • settings.py
BOT_NAME = 'qiushibaike'
SPIDER_MODULES = ['qiushibaike.spiders']
NEWSPIDER_MODULE = 'qiushibaike.spiders'
USER_AGENT = 'qiushibaike (+http://www.yourdomain.com)'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
   'qiushibaike.pipelines.QiushibaikePipeline': 300,
}
  • start.py
import scrapy.cmdline


# scrapy.cmdline.execute('scrapy crawl qsbk_spider --nolog'.split())
scrapy.cmdline.execute('scrapy crawl qsbk_spider'.split())

Scraping Sina News with Scrapy

  • news_spider.py
import scrapy
from ..items import SinaNewsItem


class NewsSpiderSpider(scrapy.Spider):
    name = 'news_spider'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_1.shtml']
    # 自定义类属性
    page = 1
    def parse(self, response, **kwargs):
        li_list = response.xpath('//ul[@class="list_009"]/li')
        for li in li_list:
            news = li.xpath('./a/text()').get()
            news_time = li.xpath('./span/text()').get()
            news_link = li.xpath('./a/@href').get()

            item = SinaNewsItem(
                news=news,
                news_time=news_time,
                news_link=news_link,
            )
            yield item
        # 爬取下一页
        if self.page <= 100:
            print(f'---开始爬取{self.page}页---')
            self.page = self.page + 1
            url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_%d.shtml' % self.page
            yield scrapy.Request(url, callback=self.parse)
  • items.py
import scrapy


class SinaNewsItem(scrapy.Item):
    news = scrapy.Field()
    news_time = scrapy.Field()
    news_link = scrapy.Field()
  • pipelines.py
import pymysql
from itemadapter import ItemAdapter


class SinaNewsPipeline:
    def open_spider(self, spider):
        print('开始爬取')
        self.db = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='******',
            database='spider2003',
            charset='utf8'
        )
        self.cur = self.db.cursor()
    def close_spider(self, spider):
        print('爬取结束')
        self.cur.close()
        self.db.close()
    def process_item(self, item, spider):
        news = item['news']
        news_time = item['news_time']
        news_link = item['news_link']
        try:
            sql = 'insert into sina_news(news, news_time, news_link) values ("%s", "%s", "%s")' % (news, news_time, news_link)
            self.cur.execute(sql)
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
        return item
  • settings.py
BOT_NAME = 'sina_news'
SPIDER_MODULES = ['sina_news.spiders']
NEWSPIDER_MODULE = 'sina_news.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
   'sina_news.pipelines.SinaNewsPipeline': 300,
}
  • start.py
import scrapy.cmdline


scrapy.cmdline.execute('scrapy crawl news_spider'.split())

Scrapy Advanced

  • Logging

    • Scrapy's built-in log support

      • You can add the following lines anywhere in settings.py; the output becomes much cleaner

        • LOG_ENABLED = True  # enable logging
          LOG_FILE = "mySpider.log"  # log file name
          LOG_LEVEL = "INFO"  # log level

      • Log levels

        • Scrapy provides 5 logging levels
          • CRITICAL - critical errors
          • ERROR - regular errors
          • WARNING - warning messages
          • INFO - informational messages
          • DEBUG - debugging messages
      • logging settings

        • The following settings in settings.py can be used to configure logging
          • LOG_ENABLED
            • Default: True, enables logging
          • LOG_ENCODING
            • Default: 'utf-8', the encoding used for logging
          • LOG_FILE
            • Default: None, the file name for log output in the current directory
          • LOG_LEVEL
            • Default: 'DEBUG', the minimum level to log
        • Scrapy's own log module has been deprecated
    • Use Python's built-in logging module instead

      • import logging
        LOG_FORMAT = "%(asctime)s - %(levelname)s - %(module)s - %(message)s"  # output format
        DATE_FORMAT = "%Y/%m/%d %H:%M:%S"  # timestamp format
        logging.basicConfig(filename='sina.log', filemode='a+', format=LOG_FORMAT, datefmt=DATE_FORMAT)
        logging.warning('something went wrong')
        
  • settings configuration

    • Overview

      • Scrapy settings provide a way to customize Scrapy components, controlling the core, extensions, pipelines and spider components (an example settings.py sketch follows the list of settings below)
    • Settings

      • BOT_NAME

        • Default: 'scrapybot'
        • The name of the bot implemented by the project (also the project name); used to build the default User-Agent and for logging
        • Automatically set when you create a project with startproject
      • CONCURRENT_ITEMS

        • Default: 100
        • Maximum number of items processed concurrently in the Item Processor (i.e. the Item Pipeline) per response
      • CONCURRENT_REQUESTS

        • Default: 16
        • Maximum number of concurrent requests performed by the Scrapy downloader
      • CONCURRENT_REQUESTS_PER_DOMAIN

        • Default: 8
        • Maximum number of concurrent requests to a single website
      • CONCURRENT_REQUESTS_PER_IP

        • Default: 0
        • Maximum number of concurrent requests to a single IP
        • If non-zero, CONCURRENT_REQUESTS_PER_DOMAIN is ignored and this setting is used instead
      • DEFAULT_REQUEST_HEADERS

        • DEFAULT_REQUEST_HEADERS = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en',
          }

        • The default headers for Scrapy HTTP Requests, applied by DefaultHeadersMiddleware

      • DEPTH_LIMIT

        • Default: 0
        • Maximum crawl depth allowed; 0 means unlimited
      • DOWNLOADER

        • Default: 'scrapy.core.downloader.Downloader'
        • The downloader used for the crawl
      • DOWNLOADER_MIDDLEWARES

        • Default: {}
        • A dict of the downloader middlewares enabled in the project and their order
      • DOWNLOADER_MIDDLEWARES_BASE

        • Default: (built-in dict)
        • A dict of the downloader middlewares enabled by default in Scrapy. Never modify this setting in a project
      • DOWNLOAD_DELAY

        • Default: 0. How long the downloader waits before fetching the next page from the same website. Useful for throttling the crawl and reducing server load; decimal values are supported
      • DOWNLOAD_TIMEOUT

        • Default: 180
        • Downloader timeout, in seconds
      • ITEM_PIPELINES

        • Default: {}
        • A dict of the pipelines enabled in the project and their order. Empty by default; values can be arbitrary, but by convention they fall in the 0-1000 range, and lower values run first
      • ITEM_PIPELINES_BASE

        • Default: {}
        • A dict of the pipelines enabled by default. Never modify this setting in a project; modify ITEM_PIPELINES instead
      • LOG_ENABLED

        • Default: True
        • Whether logging is enabled
      • LOG_ENCODING

        • Default: 'utf-8'
        • The encoding used for logging
      • LOG_FILE

        • Default: None
        • The file name for log output; if None, standard error is used
      • LOG_LEVEL

        • Default: 'DEBUG'
        • Available levels: CRITICAL, ERROR, WARNING, INFO, DEBUG
      • LOG_STDOUT

        • Default: False
        • If True, all standard output (and errors) of the process is redirected to the log; e.g. print('hello') will show up in the Scrapy log
      • REDIRECT_MAX_TIMES

        • Default: 20
        • Maximum number of redirects allowed for a request; beyond this limit the request returns whatever it got. For some tasks the Firefox default is used
      • ROBOTSTXT_OBEY

        • Default: True (in the generated project template)
        • If enabled, Scrapy honours the robots.txt policy
      • SCHEDULER

        • Default: 'scrapy.core.scheduler.Scheduler'
        • The scheduler used for the crawl
      • SPIDER_MIDDLEWARES_BASE

        • Default: (built-in dict)
        • A dict of the spider middlewares enabled by default. Never modify this setting in a project; modify SPIDER_MIDDLEWARES instead
      • SPIDER_MODULES

        • Default: []
        • The list of modules where Scrapy looks for spiders
      • URLLENGTH_LIMIT

        • Default: 2083
        • Maximum length of crawled URLs
      • USER_AGENT

        • Default: "Scrapy/VERSION (+http://scrapy.org)"
        • The default User-Agent used when crawling, unless overridden
      • REACTOR_THREADPOOL_MAXSIZE

        • Size of the reactor thread pool, 10 by default
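      • A short settings.py sketch pulling several of the options above together (the values are illustrative, not from the original project):

        # settings.py
        BOT_NAME = 'myproject'
        ROBOTSTXT_OBEY = False                # do not honour robots.txt
        CONCURRENT_REQUESTS = 16              # downloader-wide concurrency
        CONCURRENT_REQUESTS_PER_DOMAIN = 8
        DOWNLOAD_DELAY = 1.5                  # seconds between requests to the same site
        DOWNLOAD_TIMEOUT = 180
        DEFAULT_REQUEST_HEADERS = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en',
        }
        LOG_ENABLED = True
        LOG_FILE = 'myproject.log'
        LOG_LEVEL = 'INFO'
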
  • Custom middleware

    • Middleware hooks

      • process_request(self, request, spider)

        • Called for every request as it passes through the downloader middleware
      • process_response(self, request, response, spider)

        • Called when the downloader has finished the HTTP request and passes the response back to the engine
      • Writing your own

        • Create the middleware classes

        • # random User-Agent
          class RandomUserAgent(object):
              def process_request(self, request, spider):
                  useragent = random.choice(USER_AGENTS)
                  request.headers.setdefault("User-Agent", useragent)

        • # random proxy IP
          class RandomProxy(object):
              def process_request(self, request, spider):
                  proxy = random.choice(PROXIES)
                  request.meta['proxy'] = "http://" + proxy['ip_port']

        • Register the middleware

          • Finally, add your own downloader middleware classes to DOWNLOADER_MIDDLEWARES in settings.py

          • DOWNLOADER_MIDDLEWARES = {
               'baidu.middlewares.BaiduDownloaderMiddleware': 543,
               # custom middlewares
               'baidu.middlewares.UADownloaderMiddleware': 300,
               'baidu.middlewares.ProxyDownloaderMiddleware': 200,
            }
            
            USER_AGENTS = [
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
            ]
            
            PROXIES = [
                {'ip_port': '58.218.200.214:8730'},
                {'ip_port': '58.218.200.247:2359'},
                {'ip_port': '58.218.200.248:8503'},
                {'ip_port': '58.218.200.229:4612'},
                {'ip_port': '58.218.200.214:5570'},
                {'ip_port': '58.218.200.214:8801'},
            ]
            
  • POST requests

    • If the first request is a POST

      • Comment out the start_urls attribute and override the start_requests() method

      • def start_requests(self):
            yield scrapy.FormRequest(
                    url='http://fanyi.baidu.com/sug',
                    formdata={'kw': 'wolf'},
                    callback=self.parse_item
                )
        
    • If the POST is not the first request

      • # either call the requests library directly inside a callback...
        response = requests.post("http://www.baidu.com/", data=data, headers=headers)
        # ...or yield a FormRequest and let Scrapy schedule it
        yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse_item)
        

Scraping Xinpianchang (comprehensive example)

  • xpc_spider.py
import scrapy
from ..items import *


class XpcSpiderSpider(scrapy.Spider):
    name = 'xpc_spider'
    allowed_domains = ['xinpianchang.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']
    def parse(self, response, **kwargs):
        # 解析数据
        # 视频列表数据
        li_list = response.xpath('//ul[@class="video-list"][1]/li')
        for li in li_list:
            # 作品id
            pid = li.xpath('./@data-articleid').get()
            # 作品标题
            title = li.xpath('./div/div[1]/a/p/text()').get()
            # 缩略图
            thumbnail = li.xpath('./a/img/@_src').get()
            category_list = li.xpath('.//div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').getall()
            # 分类
            category = '|'.join(category_list)
            category = category.replace(' ', '').replace('\n', '').replace('\t', '')
            # 发布时间
            created_at = li.xpath('.//p[@class="fs_12"]/text()').get()
            # item
            item = PostsItem()
            item['pid'] = pid
            item['title'] = title
            item['thumbnail'] = thumbnail
            item['category'] = category
            item['created_at'] = created_at
            # 进入详情页
            post_url = f'https://www.xinpianchang.com/a{pid}?from=ArticleList'
            request = scrapy.Request(url=post_url, callback=self.post_detail)
            request.meta['post_item'] = item
            yield request
    # 作品详情页
    def post_detail(self, response):
        post_item = response.meta.get('post_item')
        pid = post_item['pid']
        # 解析数据
        # 作品描述
        description_list = response.xpath('//p[@class="desc line-hide fs_14 c_b_3 fw_300 line-hide-3"]/text()').getall()
        description = ''.join(description_list)
        description = description.replace(' ', '').replace('\n', '').replace('\t', '')
        post_item['description'] = description
        # 播放次数
        play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
        post_item['play_counts'] = play_counts
        # 点赞次数
        like_counts = response.xpath('//span[@class="v-center like-counts fs_12 c_w_f fw_300"]/@data-counts').get()
        post_item['like_counts'] = like_counts
        # 视频数据
        # video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/ryM1l4365Wzwod2V?appKey=61a2f329348b3bf77&extend=userInfo%2CuserStatus'
        vid = response.xpath('//a[@class="collection-star hollow-star"]/@data-vid').get()
        video_url = f'https://mod-api.xinpianchang.com/mod/api/v2/media/{vid}?appKey=61a2f329348b3bf77&extend=userInfo%2CuserStatus'
        # 请求视频数据
        request = scrapy.Request(url=video_url, callback=self.video_detail)
        request.meta['post_item'] = post_item
        yield request
        # 创作者数据
        li_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul/li')
        for li in li_list:
            # 创作者id
            cid = li.xpath('./a/@data-userid').get()
            # item
            composer_item = ComposersItem()
            composer_item['cid'] = cid
            # 创作者url
            composer_url = li.xpath('./a/@href').get()
            composer_url = 'https://www.xinpianchang.com/' + composer_url
            # 访问创作者详情页
            request2 = scrapy.Request(url=composer_url, callback=self.composer_detail)
            request2.meta['composer_item'] = composer_item
            yield request2
            # 版权/角色数据
            cr_item = CopyrightsItem()
            cr_item['pcid'] = f'{pid}_{cid}'
            cr_item['pid'] = pid
            cr_item['cid'] = cid
            cr_item['roles'] = li.xpath('.//span[@class="roles fs_12 fw_300 c_b_9"]/text()').get()
            yield cr_item
        # 评论数据
        comment_url = f'https://app.xinpianchang.com/comments?resource_id={pid}&type=article&page=1&per_page=24'
        yield scrapy.Request(
            url=comment_url,
            callback=self.comment_detail
        )
    # 视频数据
    def video_detail(self, response):
        post_item = response.meta.get('post_item')
        # 解析数据
        content = response.json()
        # 视频预览图
        preview = content['data']['cover']
        # 视频链接
        video = content['data']['resource']['progressive'][0]['url']
        # 视频格式
        video_format = content['data']['resource']['progressive'][0]['mime']
        # 视频时长
        duration = content['data']['duration']
        # item
        post_item['preview'] = preview
        post_item['video'] = video
        post_item['video_format'] = video_format
        post_item['duration'] = duration
        # print(post_item)
        yield post_item
        # 创作者详情页
    def composer_detail(self, response):
        composer_item = response.meta.get('composer_item')
        # banner图
        banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
        banner = banner[banner.find('(')+1: -1]
        # 用户头像
        avatar = response.xpath('//div[@class="banner-wrap"]/div/span/img/@src').get()
        # 是否加V
        verified = response.xpath('//div[@class="banner-wrap"]/div/span/span[contains(@class, "author-v")]').get()
        verified = 'yes' if verified else 'no'
        # 名字
        name = response.xpath('//p[@class="creator-name fs_26 fw_600 c_b_26"]/text()').get()
        # 自我介绍
        intro = response.xpath('//p[@class="creator-desc fs_14 fw_300 c_b_3 line-hide-1"]/text()').get()
        # 被点赞次数
        like_counts = response.xpath('//span[@class="like-counts fw_600 v-center"]/text()').get()
        like_counts = like_counts.replace(',', '')
        # 被关注数量
        fans_counts = response.xpath('//span[@class="fans-counts fw_600 v-center"]/text()').get()
        fans_counts = fans_counts.replace(',', '')
        # 关注数量
        follow_counts = response.xpath('//span[@class="follow-wrap"]/span[@class="fw_600 v-center"]/text()').get()
        follow_counts = follow_counts.replace(',', '')
        # 所在位置
        location = response.xpath('//span[@class="icon-location v-center"]/following-sibling::*/text()').get()
        location = location if location else ''
        # 职业
        career = response.xpath('//span[@class="icon-career v-center"]/following-sibling::*/text()').get()
        career = career if career else ''
        # item
        composer_item['banner'] = banner
        composer_item['avatar'] = avatar
        composer_item['verified'] = verified
        composer_item['name'] = name
        composer_item['intro'] = intro
        composer_item['like_counts'] = like_counts
        composer_item['fans_counts'] = fans_counts
        composer_item['follow_counts'] = follow_counts
        composer_item['location'] = location
        composer_item['career'] = career
        yield composer_item
    # 评论数据
    def comment_detail(self, response):
        content = response.json()
        comment_list = content['data']['list']
        for comment in comment_list:
            # 评论其他评论的数量
            reply = comment.get('referer')
            if reply:
                reply = reply.get('id')
            else:
                reply = 0
            item = CommentsItem(
                commentid=comment['id'],
                pid=comment['resource_id'],
                cid=comment['userid'],
                avatar=comment['userInfo']['avatar'],
                uname=comment['userInfo']['username'],
                created_at=comment['addtime'],
                content=comment['content'],
                like_counts=comment['count_approve'],
                reply=reply
            )
            yield item
  • items.py
from scrapy import Item, Field


# 作品
class PostsItem(Item):
    table_name = 'posts'  # 表名
    pid = Field()
    title = Field()
    thumbnail = Field()
    preview = Field()
    video = Field()
    video_format = Field()
    category = Field()
    duration = Field()
    created_at = Field()
    description = Field()
    play_counts = Field()
    like_counts = Field()
class ComposersItem(Item):
    table_name = 'composers'  # 表名
    cid = Field()
    banner = Field()
    avatar = Field()
    verified = Field()
    name = Field()
    intro = Field()
    like_counts = Field()
    fans_counts = Field()

    follow_counts = Field()
    location = Field()
    career = Field()
class CommentsItem(Item):
    table_name = 'comments'  # 表名
    commentid = Field()
    pid = Field()
    cid = Field()
    avatar = Field()
    uname = Field()
    created_at = Field()
    content = Field()
    like_counts = Field()
    reply = Field()
# 版权:作者在作品中的角色
class CopyrightsItem(Item):
    table_name = 'copyrights'  # 表名
    pcid = Field()
    pid = Field()
    cid = Field()
    roles = Field()
  • pipelines.py
import pymysql
from itemadapter import ItemAdapter


class XpcPipeline:
    def open_spider(self, spider):
        print('---开始存入MySQL---')
        self.db = pymysql.connect(user='root', password='nzw19940611', database='xpc_2020')
        self.cur = self.db.cursor()
    def close_spider(self, spider):
        print('---存入MySQL结束---')
        self.cur.close()
        self.db.close()
    def process_item(self, item, spider):
        # 表名
        table_name = item.table_name
        keys = list(item.keys())
        values = list(item.values())
        # 所有字段组成的字符串
        key_str = ','.join(["`%s`" % key for key in keys])
        # 所有的值组成的字符串
        # value_str = ','.join(['"%s"' % value for value in values])
        value_str = ','.join(["%s"] * len(values))
        # 如果key冲突,则用新数据更新旧数据
        update_str = ','.join(["`{}`=%s".format(key) for key in keys])
        # sql
        sql = 'insert into `{}` ({}) values ({}) on duplicate key update {}'.format(
            table_name,
            key_str,
            value_str,
            update_str
        )
        # 执行sql
        self.cur.execute(sql, values*2)
        self.db.commit()
        print(f'---插入成功:{table_name}---')
        return item
  • settings.py
BOT_NAME = 'xpc'
SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'
USER_AGENT = 'xpc (+http://www.yourdomain.com)'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
   'xpc.pipelines.XpcPipeline': 300,
}
  • start.py
import scrapy.cmdline


scrapy.cmdline.execute('scrapy crawl xpc_spider --nolog'.split())
# scrapy.cmdline.execute('scrapy crawl xpc_spider'.split())