常用数据库分页

Oracle

-- Oracle 12c+ : OFFSET FETCH 语法
SELECT * FROM users 
ORDER BY id 
OFFSET 20 ROWS 
FETCH NEXT 10 ROWS ONLY;

-- Oracle 11g及以下
SELECT * FROM (
    SELECT a.*, ROWNUM rn FROM (
        SELECT * FROM users ORDER BY id
    ) a WHERE ROWNUM <= 30
) WHERE rn > 20;-- 使用ROWNUM(三层嵌套查询)

SELECT * FROM (
    SELECT u.*, ROW_NUMBER() OVER (ORDER BY id) as rn
    FROM users u
) WHERE rn BETWEEN 21 AND 30;-- 使用ROW_NUMBER()

-- 注意:ROWNUM 为一个独立的字段,不能加表的别名修饰(即:t.ROWNUM是错误的)
select ROWNUM  ,  t.*  from emp t where ROWNUM <=5; 


-- 分析函数分页
SELECT * FROM (
    SELECT u.*, 
           RANK() OVER (ORDER BY score DESC) as rank_num,
           DENSE_RANK() OVER (ORDER BY score DESC) as dense_rank_num
    FROM users u
) WHERE rank_num BETWEEN 1 AND 10;-- 使用RANK()和DENSE_RANK()

MySQL

-- 基础分页(LIMIT OFFSET)
SELECT * FROM users ORDER BY id LIMIT 10 OFFSET 20;
SELECT * FROM users ORDER BY id LIMIT 20, 10;-- 简写形式

-- 优化分页
SELECT u.* FROM users u 
INNER JOIN (
    SELECT id FROM users ORDER BY id LIMIT 10 OFFSET 10000
) t ON u.id = t.id; -- 延迟关联优化大偏移量
SELECT * FROM users WHERE id > 100 ORDER BY id LIMIT 10; -- 游标分页

-- 特殊功能
-- 使用SQL_CALC_FOUND_ROWS获取总数(MySQL 5.7及以下)
SELECT SQL_CALC_FOUND_ROWS * FROM users LIMIT 10;
SELECT FOUND_ROWS() as total;

PostgreSQL

-- 基础分页(LIMIT OFFSET)
SELECT * FROM users ORDER BY id LIMIT 10 OFFSET 20;

-- 窗口函数分页
SELECT * FROM (
    SELECT *, ROW_NUMBER() OVER (ORDER BY id) as row_num
    FROM users
) t WHERE row_num BETWEEN 21 AND 30; -- 使用ROW_NUMBER()

SELECT * FROM (
    SELECT *, RANK() OVER (ORDER BY score DESC) as rank_num
    FROM users
) t WHERE rank_num BETWEEN 1 AND 10; -- 使用RANK()处理相同值

-- 游标分页
SELECT * FROM users WHERE id > $1 ORDER BY id LIMIT 10; -- 基于ID的游标分页
SELECT * FROM users 
WHERE (created_at, id) > ($1, $2) 
ORDER BY created_at, id LIMIT 10; -- 基于多字段的游标分页

-- 性能优化
CREATE INDEX idx_users_id ON users(id); -- 使用索引扫描
CREATE INDEX idx_users_cover ON users(id, name, email); -- 覆盖索引

SQL Server

-- 基础分页(SQL Server 2012+)
SELECT * FROM users 
ORDER BY id 
OFFSET 20 ROWS 
FETCH NEXT 10 ROWS ONLY;

-- 旧版本分页(SQL Server 2008及以下)
WITH PagedResults AS (
    SELECT *, ROW_NUMBER() OVER (ORDER BY id) as RowNum
    FROM users
)
SELECT * FROM PagedResults 
WHERE RowNum BETWEEN 21 AND 30;-- 使用ROW_NUMBER()

SELECT TOP 10 * FROM users 
WHERE id NOT IN (
    SELECT TOP 20 id FROM users ORDER BY id
) ORDER BY id;-- 使用TOP和子查询

-- 性能优化
WITH OrderedUsers AS (
    SELECT id, name, email,
           ROW_NUMBER() OVER (ORDER BY id) as RowNum,
           COUNT(*) OVER() as TotalCount
    FROM users
)
SELECT * FROM OrderedUsers 
WHERE RowNum BETWEEN 21 AND 30; -- 使用CTE和窗口函数

SQLite

-- 基础分页
SELECT * FROM users ORDER BY id LIMIT 10 OFFSET 20;

-- 性能优化
CREATE INDEX idx_users_id ON users(id);-- 使用索引
SELECT * FROM users WHERE id > 100 ORDER BY id LIMIT 10;-- 游标分页

--特殊考虑
--1.SQLite的LIMIT性能较好,但大偏移量仍有问题
--2.建议使用游标分页处理大数据集

MongoDB

-- 基础分页
db.users.find().sort({_id: 1}).skip(20).limit(10); -- skip() 和 limit()

-- 游标分页
db.users.find({_id: {$gt: ObjectId("...")}}).sort({_id: 1}).limit(10);-- 基于_id的游标分页
db.users.find({created_at: {$gt: ISODate("2011-01-01")}})
        .sort({created_at: 1}).limit(10);-- 基于自定义字段的游标分页
        
-- 聚合分页

db.users.aggregate([
    {$sort: {_id: 1}},
    {$skip: 20},
    {$limit: 10}
]);-- 使用聚合管道

db.users.aggregate([
    {$facet: {
        data: [{$sort: {_id: 1}}, {$skip: 20}, {$limit: 10}],
        count: [{$count: "total"}]
    }}
]);-- 获取总数和分页数据

Redis

-- 有序集合分页
    -- ZRANGE分页
    ZRANGE leaderboard 0 9 WITHSCORES  -- 前10名
    ZRANGE leaderboard 10 19 WITHSCORES  -- 11-20名
    ZREVRANGE leaderboard 0 9 WITHSCORES -- ZREVRANGE降序分页

-- 列表分页
    -- LRANGE分页
    LRANGE messages 0 9  -- 前10名
    LRANGE messages 10 19  -- 11-20名
        
-- 基于游标的分页
   -- SCAN
   SCAN 0 MATCH user:* COUNT 10

Elasticsearch

// 基础分页
{
    "from": 20,
    "size": 10,
    "query": {"match_all": {}},
    "sort": [{"_id": "asc"}]
}

// 滚动分页
    # 初始请求
    {
        "size": 10,
        "scroll": "1m",
        "query": {"match_all": {}}
    }

    # 后续请求
    {
        "scroll": "1m",
        "scroll_id": "scroll_id_here"
    }
    
    
// Search After 分页
{
    "size": 10,
    "query": {"match_all": {}},
    "search_after": [1234567890000, "some_tie_breaker"],
    "sort": [
        {"timestamp": "asc"},
        {"_id": "asc"}
    ]
}

Hive

-- 基础分页
SELECT * FROM users ORDER BY id LIMIT 10; -- LIMIT 语法(最基本)

SELECT * FROM (
    SELECT *, ROW_NUMBER() OVER (ORDER BY id) as rn
    FROM users
) t WHERE rn BETWEEN 21 AND 30; -- 使用子查询实现OFFSET效果(性能较差)


-- 窗口函数分页
SELECT user_id, name, email
FROM (
    SELECT user_id, name, email,
           ROW_NUMBER() OVER (ORDER BY user_id) as row_num
    FROM users
    WHERE status = 'active'
) ranked_users
WHERE row_num BETWEEN 21 AND 30; -- 使用ROW_NUMBER()进行分页

SELECT user_id, score
FROM (
    SELECT user_id, score,
           RANK() OVER (ORDER BY score DESC) as rank_num
    FROM user_scores
) ranked_scores
WHERE rank_num <= 10; -- 使用RANK()处理并列情况


-- 分区分页
SELECT * FROM user_logs
WHERE dt >= '2011-01-01' AND dt <= '2013-01-31'
ORDER BY log_time
LIMIT 1000; -- 基于分区的分页查询

SELECT user_id, action, log_time
FROM (
    SELECT user_id, action, log_time,
           ROW_NUMBER() OVER (PARTITION BY dt ORDER BY log_time) as rn
    FROM user_logs
    WHERE dt = '2013-01-01'
) partitioned_logs
WHERE rn BETWEEN 1 AND 100; -- 分区内分页


-- 大数据量分页优化
SELECT * FROM large_table
TABLESAMPLE(BUCKET 1 OUT OF 100 ON RAND())
LIMIT 1000; -- 使用TABLESAMPLE进行数据采样

SELECT * FROM events
WHERE event_date >= '2012-01-01' 
  AND event_date < '2012-02-02'
  AND event_time > '2012-01-01 10:00:00'
ORDER BY event_time
LIMIT 10000; -- 基于时间分区的增量分页


-- 性能优化策略
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict; -- 1. 使用分区裁剪

SET hive.vectorized.execution.enabled=true; -- 2. 启用向量化执行

CREATE TABLE users_orc (
    id bigint,
    name string,
    email string
) STORED AS ORC; -- 3. 使用列式存储格式

CREATE INDEX idx_users_id ON TABLE users (id)
AS 'COMPACT'
WITH DEFERRED REBUILD; -- 4. 创建索引(Hive 3.0之前)


-- 流式分页处理
SELECT TRANSFORM(user_id, name, email)
USING 'python pagination_script.py'
AS (user_id, name, email, page_num)
FROM users
DISTRIBUTE BY page_num;-- 使用STREAMING进行大数据集处理


-- 与外部系统集成
-- 导出分页数据到HDFS
INSERT OVERWRITE DIRECTORY '/output/users_page_1'
SELECT * FROM users
ORDER BY id
LIMIT 10000;

-- 使用Sqoop增量导入
sqoop import \
  --connect jdbc:mysql://localhost/db \
  --table users \
  --incremental append \
  --check-column id \
  --last-value 1000 \
  --target-dir /user/hive/warehouse/users
-- Hive分页特殊注意事项

-- 1. Hive不支持标准的OFFSET语法
-- 错误示例:
-- SELECT * FROM users LIMIT 10 OFFSET 20;  -- 不支持

-- 2. 大表分页建议使用采样
SELECT * FROM (
    SELECT *, RAND() as random_val
    FROM large_table
    DISTRIBUTE BY random_val
    SORT BY random_val
) sampled
LIMIT 1000;

-- 3. 使用分桶表优化分页
CREATE TABLE users_bucketed (
    id bigint,
    name string
) CLUSTERED BY (id) INTO 32 BUCKETS
STORED AS ORC;

数据库分页方法选择建议

小数据量(< 10万条)

  • MySQL/PostgreSQL/SQLite: LIMIT OFFSET
  • SQL Server: OFFSET FETCH
  • Oracle: ROWNUM或ROW_NUMBER()

大数据量(> 10万条)

  • 关系型数据库: 游标分页(基于索引字段)
  • NoSQL: 游标分页(基于_id或时间戳)
  • 搜索引擎: Search After 或 Scroll API
  • Hive: 分区分页 + 窗口函数,避免全表扫描

实时数据流

  • 所有数据库: 基于时间戳的游标分页
  • 消息队列: 基于偏移量的消费
  • Hive: 基于时间分区的增量处理

性能优化通用原则

  • 确保排序字段有索引
  • 避免大偏移量的OFFSET
  • 使用覆盖索引减少回表
  • 考虑数据缓存策略
  • Hive特殊优化: 使用分区裁剪、向量化执行、列式存储
  • 根据业务场景选择合适的分页方式
posted @ 2019-09-10 09:23  lvlin241  阅读(98)  评论(0)    收藏  举报