MySQL高级SQL系统梳理指南

数据准备

测试表结构及数据
-- 短视频平台数据分析 - 系统化窗口函数案例
-- 场景:类似抖音/快手的短视频平台用户行为分析
CREATE database if not exists ds DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_bin ;
use ds;
DROP TABLE IF EXISTS videos;
-- 视频发布表
CREATE TABLE videos (
    video_id INT PRIMARY KEY,
    creator_id INT,
    creator_name VARCHAR(50),
    category VARCHAR(20),
    publish_time DATETIME,
    duration INT,           -- 视频时长(秒)
    views INT,             -- 播放量
    likes INT,             -- 点赞数
    comments INT,          -- 评论数
    shares INT             -- 分享数
);

-- 插入测试数据
INSERT INTO ds.videos (video_id, creator_id, creator_name, category, publish_time, duration, views, likes, comments, shares) VALUES
(1, 123, 'Alice', 'Gaming', '2023-05-15 14:23:45', 480, 25000, 1200, 85, 45),
(2, 456, 'Bob', 'Music', '2023-08-22 09:15:30', 240, 180000, 8500, 420, 380),
(3, 789, 'Charlie', 'Comedy', '2023-03-10 20:45:12', 360, 45000, 2800, 150, 95),
(4, 234, 'David', 'Education', '2023-11-05 16:30:20', 720, 95000, 4200, 380, 220),
(5, 567, 'Eva', 'Technology', '2023-07-18 11:20:35', 420, 320000, 15600, 850, 1200),
(6, 890, 'Frank', 'Sports', '2023-09-03 19:45:50', 300, 78000, 3900, 280, 185),
(7, 345, 'Grace', 'Food', '2023-04-28 13:15:25', 600, 125000, 7200, 450, 380),
(8, 678, 'Henry', 'Travel', '2023-12-12 08:30:40', 900, 220000, 11500, 680, 950),
(9, 901, 'Ivy', 'Beauty', '2023-06-07 15:45:15', 180, 85000, 6800, 320, 280),
(10, 112, 'Jack', 'Fashion', '2023-01-25 12:20:30', 240, 55000, 4200, 180, 125),
(11, 334, 'Kate', 'News', '2023-10-14 17:30:45', 480, 380000, 12000, 1200, 850),
(12, 556, 'Leo', 'Science', '2023-02-28 10:45:20', 840, 450000, 28500, 1850, 2200),
(13, 778, 'Mary', 'Health', '2023-08-16 21:15:35', 360, 95000, 5800, 420, 380),
(14, 223, 'Nick', 'Finance', '2023-05-30 14:30:50', 720, 180000, 8900, 680, 520),
(15, 445, 'Olivia', 'Politics', '2023-09-22 16:45:25', 600, 280000, 15200, 1350, 980),
(16, 667, 'Peter', 'Entertainment', '2023-03-18 18:20:40', 420, 125000, 9800, 580, 420),
(17, 889, 'Queen', 'Documentary', '2023-11-08 13:15:55', 1800, 850000, 45000, 3200, 4500),
(18, 111, 'Rose', 'Animation', '2023-07-04 09:30:15', 480, 320000, 18500, 980, 1200),
(19, 333, 'Sam', 'Tutorial', '2023-12-20 15:45:30', 960, 180000, 12800, 850, 680),
(20, 555, 'Tom', 'Review', '2023-04-12 11:20:45', 720, 95000, 7200, 420, 280),
(21, 777, '张三', 'Gaming', '2023-06-25 20:15:20', 360, 68000, 4200, 280, 185),
(22, 999, '李四', 'Music', '2023-01-15 14:30:35', 240, 125000, 8500, 520, 380),
(23, 121, '王五', 'Comedy', '2023-09-08 17:45:50', 300, 45000, 3200, 150, 95),
(24, 343, '赵六', 'Education', '2023-05-22 12:15:25', 600, 180000, 9800, 680, 520),
(25, 565, '孙七', 'Technology', '2023-11-18 19:30:40', 480, 380000, 22500, 1200, 1850),
(26, 787, '周八', 'Sports', '2023-03-28 16:45:15', 420, 95000, 5800, 380, 280),
(27, 909, '吴九', 'Food', '2023-08-14 21:20:30', 720, 220000, 15600, 850, 1200),
(28, 131, '郑十', 'Travel', '2023-12-05 08:15:45', 900, 125000, 8900, 520, 420),
(29, 353, '冯十一', 'Beauty', '2023-07-20 13:30:20', 180, 85000, 7200, 420, 320),
(30, 575, '陈十二', 'Fashion', '2023-02-14 10:45:35', 360, 68000, 4800, 280, 185),
(31, 797, 'Alice', 'Gaming', '2023-10-30 15:20:50', 540, 32000, 1850, 95, 68),
(32, 919, 'Bob', 'Music', '2023-04-18 18:15:25', 300, 155000, 9200, 580, 420),
(33, 141, 'Charlie', 'Comedy', '2023-12-28 11:30:40', 420, 58000, 3800, 185, 125),
(34, 363, 'David', 'Education', '2023-06-12 20:45:15', 780, 125000, 7800, 520, 380),
(35, 585, 'Eva', 'Technology', '2023-01-28 14:20:30', 360, 280000, 18500, 980, 1350),
(36, 807, 'Frank', 'Sports', '2023-09-15 17:15:45', 480, 85000, 5200, 320, 220),
(37, 929, 'Grace', 'Food', '2023-05-08 12:30:20', 600, 180000, 12800, 680, 850),
(38, 151, 'Henry', 'Travel', '2023-11-22 19:45:35', 1200, 320000, 19800, 1200, 1850),
(39, 373, 'Ivy', 'Beauty', '2023-03-16 16:20:50', 240, 95000, 7800, 420, 380),
(40, 595, 'Jack', 'Fashion', '2023-08-05 21:15:25', 300, 68000, 4800, 280, 185),
(41, 817, 'Kate', 'News', '2023-12-18 09:30:40', 480, 450000, 22500, 1850, 2200),
(42, 939, 'Leo', 'Science', '2023-07-10 13:45:15', 960, 580000, 35000, 2800, 3800),
(43, 161, 'Mary', 'Health', '2023-02-25 18:20:30', 420, 125000, 8900, 520, 420),
(44, 383, 'Nick', 'Finance', '2023-10-12 15:15:45', 600, 220000, 15600, 980, 850),
(45, 605, 'Olivia', 'Politics', '2023-04-30 11:30:20', 780, 380000, 25200, 1850, 1950),
(46, 827, 'Peter', 'Entertainment', '2023-09-18 20:45:35', 360, 155000, 12500, 680, 520),
(47, 949, 'Queen', 'Documentary', '2023-06-28 14:20:50', 2400, 950000, 58000, 4200, 5800),
(48, 171, 'Rose', 'Animation', '2023-01-12 17:15:25', 420, 280000, 18800, 1200, 1350),
(49, 393, 'Sam', 'Tutorial', '2023-11-05 12:30:40', 840, 185000, 14500, 850, 680),
(50, 615, 'Tom', 'Review', '2023-08-22 19:45:15', 600, 125000, 9200, 580, 420),
(51, 123, '张三', 'Gaming', '2023-03-15 10:20:30', 480, 85000, 5800, 380, 280),
(52, 456, '李四', 'Music', '2023-12-08 16:15:45', 320, 155000, 11200, 680, 520),
(53, 789, '王五', 'Comedy', '2023-07-25 21:30:20', 240, 58000, 4200, 220, 125),
(54, 234, '赵六', 'Education', '2023-02-18 13:45:35', 720, 220000, 15600, 980, 850),
(55, 567, '孙七', 'Technology', '2023-10-28 18:20:50', 540, 450000, 32500, 2200, 2800),
(56, 890, '周八', 'Sports', '2023-05-15 15:15:25', 360, 125000, 8500, 520, 380),
(57, 345, '吴九', 'Food', '2023-11-30 11:30:40', 600, 280000, 18800, 1200, 1500),
(58, 678, '郑十', 'Travel', '2023-08-18 20:45:15', 960, 185000, 12800, 850, 680),
(59, 901, '冯十一', 'Beauty', '2023-04-05 14:20:30', 180, 95000, 8200, 480, 380),
(60, 112, '陈十二', 'Fashion', '2023-12-22 17:15:45', 300, 78000, 5800, 320, 220),
(61, 334, 'Alice', 'Gaming', '2023-06-10 12:30:20', 420, 45000, 2800, 150, 95),
(62, 556, 'Bob', 'Music', '2023-01-25 19:45:35', 360, 220000, 15600, 850, 1200),
(63, 778, 'Charlie', 'Comedy', '2023-09-12 16:20:50', 300, 68000, 4800, 280, 185),
(64, 223, 'David', 'Education', '2023-05-28 21:15:25', 840, 155000, 9800, 680, 520),
(65, 445, 'Eva', 'Technology', '2023-12-15 08:30:40', 480, 380000, 28500, 1850, 2500),
(66, 667, 'Frank', 'Sports', '2023-03-22 13:45:15', 420, 95000, 6200, 380, 280),
(67, 889, 'Grace', 'Food', '2023-11-08 18:20:30', 720, 250000, 18200, 1200, 1350),
(68, 111, 'Henry', 'Travel', '2023-07-18 15:15:45', 1080, 185000, 13500, 850, 680),
(69, 333, 'Ivy', 'Beauty', '2023-02-28 11:30:20', 240, 125000, 9800, 580, 420),
(70, 555, 'Jack', 'Fashion', '2023-10-15 20:45:35', 360, 85000, 6500, 380, 280),
(71, 777, 'Kate', 'News', '2023-04-22 14:20:50', 600, 520000, 35000, 2800, 3200),
(72, 999, 'Leo', 'Science', '2023-12-05 17:15:25', 900, 680000, 45000, 3800, 4500),
(73, 121, 'Mary', 'Health', '2023-08-28 12:30:40', 480, 155000, 12500, 680, 520),
(74, 343, 'Nick', 'Finance', '2023-06-15 19:45:15', 720, 280000, 19800, 1350, 1200),
(75, 565, 'Olivia', 'Politics', '2023-01-30 16:20:30', 600, 450000, 32500, 2500, 2800),
(76, 787, 'Peter', 'Entertainment', '2023-09-25 21:15:45', 420, 185000, 15200, 850, 680),
(77, 909, 'Queen', 'Documentary', '2023-05-12 09:30:20', 3600, 1200000, 85000, 6500, 8200),
(78, 131, 'Rose', 'Animation', '2023-12-28 13:45:35', 540, 350000, 25200, 1650, 1950),
(79, 353, 'Sam', 'Tutorial', '2023-03-18 18:20:50', 780, 220000, 16800, 1200, 980),
(80, 575, 'Tom', 'Review', '2023-11-02 15:15:25', 480, 155000, 11500, 680, 520),
(81, 797, '张三', 'Gaming', '2023-07-20 11:30:40', 360, 95000, 6800, 420, 320),
(82, 919, '李四', 'Music', '2023-02-15 20:45:15', 240, 185000, 13800, 850, 680),
(83, 141, '王五', 'Comedy', '2023-10-08 14:20:30', 300, 68000, 4800, 280, 185),
(84, 363, '赵六', 'Education', '2023-04-25 17:15:45', 600, 280000, 19800, 1350, 1200),
(85, 585, '孙七', 'Technology', '2023-12-18 12:30:20', 420, 520000, 38500, 2800, 3500),
(86, 807, '周八', 'Sports', '2023-08-05 19:45:35', 480, 125000, 8900, 580, 420),
(87, 929, '吴九', 'Food', '2023-06-22 16:20:50', 720, 320000, 22500, 1500, 1850),
(88, 151, '郑十', 'Travel', '2023-01-18 21:15:25', 840, 220000, 15600, 980, 850),
(89, 373, '冯十一', 'Beauty', '2023-09-30 08:30:40', 180, 115000, 9500, 520, 420),
(90, 595, '陈十二', 'Fashion', '2023-05-15 13:45:15', 320, 85000, 6200, 380, 280),
(91, 817, 'Alice', 'Gaming', '2023-12-08 18:20:30', 540, 58000, 3800, 220, 155),
(92, 939, 'Bob', 'Music', '2023-03-25 15:15:45', 280, 280000, 19800, 1200, 1350),
(93, 161, 'Charlie', 'Comedy', '2023-11-12 11:30:20', 360, 78000, 5500, 320, 220),
(94, 383, 'David', 'Education', '2023-07-28 20:45:35', 720, 185000, 12800, 850, 680),
(95, 605, 'Eva', 'Technology', '2023-02-12 14:20:50', 480, 450000, 32500, 2200, 2800),
(96, 827, 'Frank', 'Sports', '2023-10-30 17:15:25', 420, 115000, 8200, 480, 350),
(97, 949, 'Grace', 'Food', '2023-06-18 12:30:40', 600, 380000, 28500, 1850, 2200),
(98, 171, 'Henry', 'Travel', '2023-01-05 19:45:15', 1200, 250000, 18200, 1200, 1350),
(99, 393, 'Ivy', 'Beauty', '2023-09-22 16:20:30', 240, 145000, 11500, 680, 520),
(100, 615, 'Jack', 'Fashion', '2023-05-08 21:15:45', 360, 95000, 7200, 420, 320);

1. 窗口函数 (Window Functions)

基本语法

SELECT column_name,
       window_function() OVER (
           [PARTITION BY partition_expression]
           [ORDER BY sort_expression]
           [ROWS/RANGE window_frame]
       )
FROM table_name;

常用窗口函数

排名函数

  • ROW_NUMBER: 分页、去重
  • RANK/DENSE_RANK: 排行榜 
  • PERCENT_RANK: 性能评估

ROW_NUMBER - 唯一行号,常用于分页和去重

-- ROW_NUMBER  唯一行号,常用于分页和去重
SELECT
	ROW_NUMBER() OVER(ORDER BY views desc) as row_id,s.*,
    CASE 
	    WHEN ROW_NUMBER() OVER (ORDER BY views DESC) <= 5 THEN '热门推荐'
        WHEN ROW_NUMBER() OVER (ORDER BY views DESC) <=8 THEN '中度推荐'
        WHEN ROW_NUMBER() OVER (ORDER BY views DESC) <=10 THEN '轻度推荐'
	END as rec_tag
FROM ds.videos s;

RANK vs DENSE_RANK - 处理并列排名的两种策略

SELECT creator_name, likes,
       RANK() OVER (ORDER BY likes DESC) as rank_with_gaps,        -- 跳跃式排名:1,2,2,4
       DENSE_RANK() OVER (ORDER BY likes DESC) as rank_no_gaps     -- 连续式排名:1,2,2,3
FROM videos
ORDER BY likes DESC;

PERCENT_RANK - 百分位排名,用于性能评估

SELECT video_id, creator_name, views,
		percent_rank() OVER (ORDER BY views) as percentile
FROM ds.videos s;
SELECT video_id, creator_name, views,
       PERCENT_RANK() OVER (ORDER BY views) as percentile,
       CASE 
           WHEN PERCENT_RANK() OVER (ORDER BY views) >= 0.8 THEN 'Top 20%'
           WHEN PERCENT_RANK() OVER (ORDER BY views) >= 0.5 THEN 'Top 50%'
           ELSE 'Below Average'
       END as performance_tier
FROM videos;

取值函数

  • LAG/LEAD: 环比增长率
  • FIRST/LAST_VALUE: 基准对比
  • NTH_VALUE: 特定位置取值

FIRST_VALUE/LAST_VALUE - 窗口内首尾值

SELECT creator_name, video_id, publish_time, views,
       FIRST_VALUE(views) OVER (PARTITION BY creator_name ORDER BY publish_time) as first_video_views,
       LAST_VALUE(views) OVER (PARTITION BY creator_name ORDER BY publish_time 
                               ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as latest_video_views -- ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING 定义窗口函数的计算范围为整个分区,从第一行到最后一行。
FROM videos
ORDER BY creator_name, publish_time;

LAG/LEAD - 时间序列分析核心

SELECT creator_name, video_id, publish_time, views,
       LAG(views,1) OVER(PARTITION BY creator_name ORDER BY publish_time) as prev_views
FROM videos
ORDER BY creator_name, publish_time;
SELECT creator_name, video_id, publish_time, views,
       LEAD(views,1) OVER(PARTITION BY creator_name ORDER BY publish_time) as next_views
FROM videos
ORDER BY creator_name, publish_time;
增长率计算
SELECT creator_name, video_id, publish_time, views,
       (views - LAG(views,1) OVER(PARTITION BY creator_name ORDER BY publish_time)) * 100.0 / LAG(views,1) OVER(PARTITION BY creator_name ORDER BY publish_time) as growth_rate_pct
FROM videos
ORDER BY creator_name, publish_time;
增长率计算(保留2位小数)
SELECT creator_name, video_id, publish_time, views,
       ROUND(
           (views - LAG(views, 1) OVER (PARTITION BY creator_name ORDER BY publish_time)) * 100.0 / 
           LAG(views, 1) OVER (PARTITION BY creator_name ORDER BY publish_time), 2
       ) as growth_rate_pct
FROM videos
ORDER BY creator_name, publish_time;

NTH_VALUE - 取指定位置的值

SELECT creator_name, views,
       NTH_VALUE(views, 2) OVER (PARTITION BY creator_name ORDER BY views DESC 
                                 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as second_best_views -- ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING 定义窗口函数的计算范围为整个分区,从第一行到最后一行。
FROM videos;

聚合函数

  • SUM() OVER: 累计求和、移动平均
  • AVG() OVER: 个体与群体比较

移动窗口聚合 - 趋势分析

SELECT 
    DATE(publish_time) as date,
    SUM(views) as daily_views,
    -- 累计播放量
    SUM(SUM(views)) OVER (ORDER BY DATE(publish_time)) as cumulative_views
FROM videos 
GROUP BY DATE(publish_time)
ORDER BY date;
SELECT 
    DATE(publish_time) as date,
    SUM(views) as daily_views,
    -- 3日移动平均
    AVG(SUM(views)) OVER (ORDER BY DATE(publish_time) 
                          ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as ma3_views, -- 计算按日期排序的浏览量总和的三行移动平均值。
    -- 累计播放量
    SUM(SUM(views)) OVER (ORDER BY DATE(publish_time)) as cumulative_views
FROM videos 
GROUP BY DATE(publish_time)
ORDER BY date;

分组聚合对比 - 个体与群体比较

SELECT creator_name, category, views,
       AVG(views) OVER (PARTITION BY category) as category_avg_views,
       views - AVG(views) OVER (PARTITION BY category) as vs_category_avg
FROM videos;
SELECT creator_name, category, views,
       SUM(views) OVER (PARTITION BY creator_name) as creator_total_views,
       views * 100.0 / SUM(views) OVER (PARTITION BY creator_name) AS contribution_pct, -- 创作者贡献度
       ROUND(views * 100.0 / SUM(views) OVER (PARTITION BY creator_name),2) AS contribution_pct -- 创作者贡献度(保留2位小数)
FROM videos;

分布函数(例子不明显待完善)

  • NTILE: 用户分层、数据分桶
  • CUME_DIST: 异常检测
SELECT creator_name, views, likes,
       NTILE(3) OVER (ORDER BY views) as views_tier,
       NTILE(4) OVER (ORDER BY likes) as likes_quartile,
       -- 组合分层策略
       CASE 
           WHEN NTILE(3) OVER (ORDER BY views) = 1 THEN '播放量-低'
           WHEN NTILE(3) OVER (ORDER BY views) = 2 THEN '播放量-中'
           ELSE '播放量-高'
       END as views_level
FROM videos;
SELECT video_id, views,
       CUME_DIST() OVER (ORDER BY views) as cumulative_dist,
       CASE 
           WHEN CUME_DIST() OVER (ORDER BY views) > 0.95 THEN '异常高播放量'
           WHEN CUME_DIST() OVER (ORDER BY views) < 0.05 THEN '异常低播放量'
           ELSE '正常范围'
       END as anomaly_flag
FROM videos;

应用场景

  • 内容创作者画像分析
WITH creator_stats AS (
    SELECT 
        creator_name,
        COUNT(*) as video_count,
        AVG(views) as avg_views,
        AVG(likes) as avg_likes,
        SUM(views) as total_views,
        AVG(duration) as avg_duration
    FROM videos 
    GROUP BY creator_name
)
SELECT 
    creator_name,
    video_count,
    avg_views,
    -- 创作者分层
    NTILE(3) OVER (ORDER BY total_views DESC) as creator_tier,
    -- 产出效率排名
    RANK() OVER (ORDER BY avg_views DESC) as efficiency_rank,
    -- 与平台平均的比较
    ROUND(avg_views / AVG(avg_views) OVER (), 2) as vs_platform_avg
FROM creator_stats
ORDER BY total_views DESC;
  • 病毒式传播分析 - 找出爆款视频规律
SELECT 
    video_id,
    creator_name,
    category,
    views,
    likes,
    -- 互动率计算
    ROUND(likes * 100.0 / NULLIF(views, 0), 2) as engagement_rate,
    -- 爆款识别
    CASE 
        WHEN views > AVG(views) OVER () * 2 THEN '超级爆款'
        WHEN views > AVG(views) OVER () * 1.5 THEN '准爆款'
        ELSE '普通视频'
    END as viral_status,
    -- 类别内排名
    RANK() OVER (PARTITION BY category ORDER BY views DESC) as category_rank
FROM videos
ORDER BY views DESC;
  • 用户留存分析 - 模拟连续发布创作者
SELECT 
    creator_name,
    video_id,
    publish_time,
    -- 发布间隔天数
    DATEDIFF(
        publish_time, 
        LAG(publish_time) OVER (PARTITION BY creator_name ORDER BY publish_time)
    ) as days_since_last_post,
    -- 连续发布标识
    CASE 
        WHEN DATEDIFF(publish_time, LAG(publish_time) OVER (PARTITION BY creator_name ORDER BY publish_time)) <= 1 
        THEN '连续创作者'
        ELSE '非连续创作者'
    END as creator_consistency
FROM videos
ORDER BY creator_name, publish_time;

2. 公用表表达式 (CTE) 

基于MySQL 8.x

基础CTE - 替代子查询

  • 简单CTE - 找出高于平均播放量的视频
WITH avg_stats AS (
    SELECT AVG(views) as avg_views, AVG(likes) as avg_likes
    FROM videos
)
SELECT v.video_id, v.creator_name, v.views, v.likes,
       ROUND(v.views / a.avg_views, 2) as views_ratio,
       ROUND(v.likes / a.avg_likes, 2) as likes_ratio
FROM videos v, avg_stats a
WHERE v.views > a.avg_views;
  • 多个CTE - 创作者分层分析
WITH creator_stats AS (
    SELECT creator_name,
           COUNT(*) as video_count,
           SUM(views) as total_views,
           AVG(views) as avg_views
    FROM videos 
    GROUP BY creator_name
),
platform_benchmarks AS (
    SELECT AVG(video_count) as avg_video_count,
           AVG(total_views) as avg_total_views
    FROM creator_stats
)
SELECT cs.creator_name,
       cs.video_count,
       cs.total_views,
       -- 与平台基准比较
       CASE 
           WHEN cs.total_views > pb.avg_total_views THEN '头部创作者'
           WHEN cs.video_count > pb.avg_video_count THEN '活跃创作者'
           ELSE '普通创作者'
       END as creator_tier
FROM creator_stats cs, platform_benchmarks pb
ORDER BY cs.total_views DESC;

递归CTE - 处理层次数据

病毒传播模拟 - 递归计算传播层级

WITH RECURSIVE viral_spread AS (
    -- 初始种子视频(播放量>80000的爆款)
    SELECT video_id, creator_name, views, 1 as spread_level
    FROM videos 
    WHERE views > 80000
    
    UNION ALL
    
    -- 模拟传播:每层传播影响相同创作者的其他视频
    SELECT v.video_id, v.creator_name, v.views, vs.spread_level + 1
    FROM videos v
    INNER JOIN viral_spread vs ON v.creator_name = vs.creator_name
    WHERE vs.spread_level < 3 AND v.video_id != vs.video_id
)
SELECT DISTINCT video_id, creator_name, views, spread_level,
       CASE spread_level
           WHEN 1 THEN '源头爆款'
           WHEN 2 THEN '一度传播'
           WHEN 3 THEN '二度传播'
       END as spread_type
FROM viral_spread
ORDER BY spread_level, views DESC;

窗口函数 + CTE 组合

  • 创作者成长轨迹分析
WITH creator_timeline AS (
    SELECT creator_name, video_id, publish_time, views,
           ROW_NUMBER() OVER (PARTITION BY creator_name ORDER BY publish_time) as video_seq,
           LAG(views) OVER (PARTITION BY creator_name ORDER BY publish_time) as prev_views
    FROM videos
),
growth_analysis AS (
    SELECT *,
           CASE 
               WHEN prev_views IS NULL THEN '首发视频'
               WHEN views > prev_views * 1.5 THEN '大幅增长'
               WHEN views > prev_views THEN '稳步增长'
               ELSE '播放下滑'
           END as growth_trend
    FROM creator_timeline
)
SELECT creator_name,
       COUNT(*) as total_videos,
       SUM(CASE WHEN growth_trend = '大幅增长' THEN 1 ELSE 0 END) as big_growth_count,
       SUM(CASE WHEN growth_trend = '稳步增长' THEN 1 ELSE 0 END) as steady_growth_count,
       ROUND(
           SUM(CASE WHEN growth_trend IN ('大幅增长', '稳步增长') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2
       ) as growth_rate_pct
FROM growth_analysis
WHERE video_seq > 1  -- 排除首发视频
GROUP BY creator_name
ORDER BY growth_rate_pct DESC;
  • 热门内容挖掘 - 多维度分析
WITH category_performance AS (
    SELECT category,
           COUNT(*) as video_count,
           AVG(views) as avg_views,
           AVG(likes) as avg_likes,
           RANK() OVER (ORDER BY AVG(views) DESC) as category_rank
    FROM videos
    GROUP BY category
),
top_videos AS (
    SELECT video_id, creator_name, category, views, likes,
           RANK() OVER (PARTITION BY category ORDER BY views DESC) as rank_in_category
    FROM videos
)
SELECT cp.category,
       cp.avg_views,
       cp.category_rank,
       tv.creator_name as top_creator,
       tv.views as top_video_views,
       ROUND(tv.views / cp.avg_views, 2) as vs_category_avg
FROM category_performance cp
INNER JOIN top_videos tv ON cp.category = tv.category
WHERE tv.rank_in_category = 1
ORDER BY cp.category_rank;

复杂业务场景 - CTE最佳实践

  • 漏斗分析 - 用户行为转化
WITH video_metrics AS (
    SELECT video_id, creator_name, views, likes, comments, shares,
           -- 计算转化率
           ROUND(likes * 100.0 / NULLIF(views, 0), 2) as like_rate,
           ROUND(comments * 100.0 / NULLIF(views, 0), 2) as comment_rate,
           ROUND(shares * 100.0 / NULLIF(views, 0), 2) as share_rate
    FROM videos
),
conversion_tiers AS (
    SELECT *,
           CASE 
               WHEN like_rate >= 5 THEN '高互动'
               WHEN like_rate >= 2 THEN '中互动' 
               ELSE '低互动'
           END as engagement_tier
    FROM video_metrics
)
SELECT engagement_tier,
       COUNT(*) as video_count,
       AVG(views) as avg_views,
       AVG(like_rate) as avg_like_rate,
       AVG(comment_rate) as avg_comment_rate,
       AVG(share_rate) as avg_share_rate
FROM conversion_tiers
GROUP BY engagement_tier
ORDER BY AVG(like_rate) DESC;
  • AB测试效果分析 - 模拟不同时长对播放量的影响
WITH duration_groups AS (
    SELECT video_id, creator_name, duration, views,
           CASE 
               WHEN duration <= 60 THEN '短视频(≤1分钟)'
               WHEN duration <= 180 THEN '中视频(1-3分钟)'
               ELSE '长视频(>3分钟)'
           END as duration_group
    FROM videos
),
group_stats AS (
    SELECT duration_group,
           COUNT(*) as sample_size,
           AVG(views) as avg_views,
           STDDEV(views) as stddev_views
    FROM duration_groups
    GROUP BY duration_group
)
SELECT duration_group,
       sample_size,
       ROUND(avg_views, 0) as avg_views,
       ROUND(stddev_views, 0) as stddev_views,
       -- 置信区间计算
       ROUND(avg_views - 1.96 * stddev_views / SQRT(sample_size), 0) as ci_lower,
       ROUND(avg_views + 1.96 * stddev_views / SQRT(sample_size), 0) as ci_upper
FROM group_stats
ORDER BY avg_views DESC;
  • CTE vs 子查询 性能对比示例  
    • 传统子查询写法(不推荐)  
SELECT creator_name, video_count, avg_views
FROM (
    SELECT creator_name, 
           COUNT(*) as video_count,
           AVG(views) as avg_views
    FROM videos 
    GROUP BY creator_name
) creator_summary
WHERE video_count >= (
    SELECT AVG(video_count) 
    FROM (
        SELECT COUNT(*) as video_count 
        FROM videos 
        GROUP BY creator_name
    ) sub
);

    • CTE优化写法(推荐)  
WITH creator_summary AS (
    SELECT creator_name, 
           COUNT(*) as video_count,
           AVG(views) as avg_views
    FROM videos 
    GROUP BY creator_name
),
benchmark AS (
    SELECT AVG(video_count) as avg_video_count
    FROM creator_summary
)
SELECT cs.creator_name, cs.video_count, cs.avg_views
FROM creator_summary cs, benchmark b
WHERE cs.video_count >= b.avg_video_count;

3. 高级子查询

  • 标量子查询 - 返回单值,用于比较和计算(返回单个值,可用于SELECT/WHERE )
-- 在SELECT中使用标量子查询 - 与全局基准对比
SELECT video_id, creator_name, views,
       (SELECT AVG(views) FROM videos) as platform_avg_views,
       views - (SELECT AVG(views) FROM videos) as vs_avg_diff,
       ROUND(views * 100.0 / (SELECT SUM(views) FROM videos), 2) as market_share_pct
FROM videos
ORDER BY views DESC;
-- 在WHERE中使用标量子查询 - 动态筛选条件
SELECT creator_name, video_id, views, likes
FROM videos
WHERE views > (SELECT AVG(views) FROM videos)  -- 高于平均播放量
ORDER BY views DESC;
-- 相关标量子查询 - 每个创作者与自己历史对比
SELECT creator_name, video_id, views, publish_time,
       (SELECT AVG(views) FROM videos v2 WHERE v2.creator_name = v1.creator_name) as personal_avg,
       (SELECT MAX(views) FROM videos v2 WHERE v2.creator_name = v1.creator_name) as personal_best,
       CASE 
           WHEN views = (SELECT MAX(views) FROM videos v2 WHERE v2.creator_name = v1.creator_name) THEN '个人最佳'
           WHEN views > (SELECT AVG(views) FROM videos v2 WHERE v2.creator_name = v1.creator_name) THEN '超常发挥'
           ELSE '低于平均'
       END as performance_level
FROM videos v1
ORDER BY creator_name, views DESC;
  • 行子查询 - 返回单行,用于行级匹配 
-- 行子查询 - 查找最高播放量的视频完整信息
SELECT creator_name, video_id, views, likes, comments
FROM videos
WHERE (views, likes) = (SELECT MAX(views), MAX(likes) FROM videos);
-- 行子查询 - 多维度最优匹配
SELECT creator_name, category, views, likes,
       views + likes as total_engagement
FROM videos
WHERE (views, likes) >= (SELECT AVG(views), AVG(likes) FROM videos)  -- 双高视频
ORDER BY total_engagement DESC;
  • 列子查询 - 返回单列,配合IN/ANY/ALL
-- IN子查询 - 查找头部创作者的所有视频
SELECT creator_name, video_id, views, category
FROM videos
WHERE creator_name IN (
    SELECT creator_name 
    FROM videos 
    GROUP BY creator_name 
    HAVING SUM(views) > 100000  -- 总播放量超过10万的创作者
)
ORDER BY creator_name, views DESC;
-- ANY/SOME子查询 - 存在性检查
SELECT creator_name, video_id, views
FROM videos v1
WHERE views > ANY (  -- 播放量超过任意一个科技类视频
    SELECT views FROM videos WHERE category = '科技'
)
AND category != '科技'  -- 排除科技类本身
ORDER BY views DESC;
-- ALL子查询 - 全量比较
SELECT creator_name, video_id, views, category
FROM videos
WHERE views > ALL (  -- 播放量超过所有健身类视频
    SELECT views FROM videos WHERE category = '健身'
)
ORDER BY views DESC;
  • 表子查询 - 返回结果集,用于复杂分析
-- 表子查询 + JOIN - 创作者分层分析
SELECT base.creator_name, base.total_views, base.video_count,
       tier.tier_name, tier.min_views, tier.max_views
FROM (
    SELECT creator_name, 
           SUM(views) as total_views,
           COUNT(*) as video_count
    FROM videos 
    GROUP BY creator_name
) base
JOIN (
    SELECT '头部创作者' as tier_name, 150000 as min_views, 999999999 as max_views
    UNION ALL
    SELECT '腰部创作者', 50000, 149999
    UNION ALL  
    SELECT '尾部创作者', 0, 49999
) tier ON base.total_views BETWEEN tier.min_views AND tier.max_views
ORDER BY base.total_views DESC;
-- 复杂表子查询 - 类别竞争力分析
SELECT category_stats.category,
       category_stats.avg_views,
       category_stats.video_count,
       category_stats.top_creator,
       category_stats.competition_index
FROM (
    SELECT category,
           AVG(views) as avg_views,
           COUNT(*) as video_count,
           -- 找到该类别播放量最高的创作者
           (SELECT creator_name FROM videos v2 
            WHERE v2.category = v1.category 
            ORDER BY views DESC LIMIT 1) as top_creator,
           -- 计算竞争指数:标准差/平均值
           ROUND(STDDEV(views) / AVG(views), 2) as competition_index
    FROM videos v1
    GROUP BY category
) category_stats
ORDER BY category_stats.avg_views DESC;
  • EXISTS子查询 - 存在性判断
-- EXISTS - 找出有多个视频的活跃创作者
SELECT DISTINCT creator_name, 
       (SELECT COUNT(*) FROM videos v2 WHERE v2.creator_name = v1.creator_name) as video_count,
       (SELECT MAX(views) FROM videos v2 WHERE v2.creator_name = v1.creator_name) as best_performance
FROM videos v1
WHERE EXISTS (
    SELECT 1 FROM videos v2 
    WHERE v2.creator_name = v1.creator_name 
    AND v2.video_id != v1.video_id  -- 存在其他视频
)
ORDER BY video_count DESC;
-- NOT EXISTS - 找出某类别的独家创作者
SELECT creator_name, category, views
FROM videos v1
WHERE NOT EXISTS (
    SELECT 1 FROM videos v2 
    WHERE v2.creator_name = v1.creator_name 
    AND v2.category != v1.category  -- 不存在其他类别的视频
)
ORDER BY category, views DESC;
-- 找出每个类别中超越该类别平均水平的视频
select a.category,v.creator_name,v.video_id,v.views, a.avs
from (
	select category,avg(views) as avs
	from ds.videos
	group by category
) a, ds.videos v
where v.views > a.avs and v.category = a.category;
-- 子查询 + 窗口函数 - 动态基准比较
SELECT creator_name, video_id, views, category,
       -- 动态计算:与同类别前3名的平均值比较
       (SELECT AVG(views) FROM (
           SELECT views FROM videos v2 
           WHERE v2.category = v1.category 
           ORDER BY views DESC 
           LIMIT 3
       ) top3) as category_top3_avg,
       CASE 
           WHEN views >= (SELECT AVG(views) FROM (
               SELECT views FROM videos v2 
               WHERE v2.category = v1.category 
               ORDER BY views DESC 
               LIMIT 3
           ) top3) THEN '顶级水平'
           WHEN views >= (SELECT AVG(views) FROM videos v2 WHERE v2.category = v1.category) THEN '优秀水平'
           ELSE '有待提升'
       END as performance_rating
FROM videos v1
ORDER BY category, views DESC;
-- 复杂业务场景 - 潜力视频挖掘
SELECT v1.video_id, v1.creator_name, v1.views, v1.likes, v1.category,
       growth_potential,
       engagement_score
FROM videos v1
JOIN (
    SELECT video_id,
           -- 计算增长潜力:与同创作者历史最佳的差距
           (SELECT MAX(views) FROM videos v2 WHERE v2.creator_name = v.creator_name) - views as growth_potential,
           -- 计算互动分数:相对于同类别的表现
           ROUND(
               (likes * 1.0 / NULLIF(views, 0)) / 
               (SELECT AVG(likes * 1.0 / NULLIF(views, 0)) FROM videos v3 WHERE v3.category = v.category) * 100, 
               2
           ) as engagement_score
    FROM videos v
) metrics ON v1.video_id = metrics.video_id
WHERE growth_potential > 0  -- 有增长空间
  AND engagement_score > 100  -- 互动率高于同类别平均
ORDER BY growth_potential DESC, engagement_score DESC;

4. 高级聚合操作

窗口函数(Window Functions)

  • 排名函数
-- 按播放量排名(相同排名会跳跃)
SELECT creator_name, views,
       RANK() OVER (ORDER BY views DESC) as rank_views
FROM videos;

-- 按播放量密集排名(相同排名不跳跃)
SELECT creator_name, views,
       DENSE_RANK() OVER (ORDER BY views DESC) as dense_rank_views
FROM videos;

-- 行号(连续编号)
SELECT creator_name, views,
       ROW_NUMBER() OVER (ORDER BY views DESC) as row_num
FROM videos;

-- 分类别排名
SELECT creator_name, category, views,
       RANK() OVER (PARTITION BY category ORDER BY views DESC) as category_rank
FROM videos;
  • 聚合窗口函数
-- 累计播放量
SELECT creator_name, publish_time, views,
       SUM(views) OVER (ORDER BY publish_time) as cumulative_views
FROM videos;

-- 移动平均(3天窗口)
SELECT creator_name, publish_time, views,
       AVG(views) OVER (ORDER BY publish_time ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as moving_avg
FROM videos;

-- 分组累计
SELECT creator_name, category, views,
       SUM(views) OVER (PARTITION BY category ORDER BY publish_time) as category_cumulative
FROM videos;
  • 偏移函数
-- 获取上一条记录的播放量
SELECT creator_name, views,
       LAG(views, 1) OVER (ORDER BY publish_time) as prev_views,
       views - LAG(views, 1) OVER (ORDER BY publish_time) as views_diff
FROM videos;

-- 获取下一条记录的播放量
SELECT creator_name, views,
       LEAD(views, 1) OVER (ORDER BY publish_time) as next_views
FROM videos;

分组统计与多维分析

ROLLUP - 层级汇总

-- 按创作者和分类进行层级汇总
SELECT creator_name, category, 
       SUM(views) as total_views,
       COUNT(*) as video_count
FROM videos
GROUP BY creator_name, category WITH ROLLUP;

结果说明:

  • NULL就是"加起来"的意思,比如 旅行家,NULL 是旅行家所有视频总和
  • ROLLUP 就是自动加"合计"行,像收银员给你小票加个"总计"

  • 前端展示可以使用的统计数据
with tmp as(
SELECT creator_name, category, 
       SUM(views) as total_views,
       COUNT(*) as video_count
FROM ds.videos
GROUP BY creator_name, category WITH ROLLUP) 
select *
from tmp
where tmp.category is NULL;

条件聚合

  • CASE WHEN 配合聚合
-- 按时长分类统计
SELECT creator_name,
       SUM(CASE WHEN duration < 300 THEN views ELSE 0 END) as short_video_views,
       SUM(CASE WHEN duration >= 300 AND duration < 600 THEN views ELSE 0 END) as medium_video_views,
       SUM(CASE WHEN duration >= 600 THEN views ELSE 0 END) as long_video_views,
       COUNT(CASE WHEN likes/views > 0.05 THEN 1 END) as high_engagement_videos
FROM videos
GROUP BY creator_name;
  • 过滤聚合
-- 使用 FILTER 子句(注意: MySQL不支持,postgresql支持)
SELECT category,
       COUNT(*) as total_videos,
       COUNT(*) FILTER (WHERE views > 10000) as popular_videos,
       AVG(views) FILTER (WHERE duration > 300) as avg_long_video_views
FROM videos
GROUP BY category;

高级分组查询

  • HAVING 与复杂条件
-- 找出平均播放量高且视频数量多的创作者
SELECT creator_name,
       COUNT(*) as video_count,
       AVG(views) as avg_views,
       SUM(likes) as total_likes
FROM videos
GROUP BY creator_name
HAVING COUNT(*) >= 2 
   AND AVG(views) > 50000
   AND SUM(likes)/SUM(views) > 0.01;
  • 子查询配合聚合
-- 找出播放量超过分类平均值的视频
SELECT v.creator_name, v.category, v.views,
       cat_avg.avg_views
FROM videos v
JOIN (
    SELECT category, AVG(views) as avg_views
    FROM videos
    GROUP BY category
) cat_avg ON v.category = cat_avg.category
WHERE v.views > cat_avg.avg_views;

高级分析函数

  • 相关性分析
-- 计算点赞率和分享率的相关性
SELECT category,
       COUNT(*) as sample_size,
       STDDEV(views) as views_std_dev, -- STDDEV(标准差) 算数据的“分散程度”,一组数字(比如浏览量)离平均值有多远,值越大表示数据波动越大
       VAR_POP(duration) as duration_variance
FROM videos
WHERE views > 0
GROUP BY category;

时间序列聚合

  • 时间窗口分析
-- 每月发布视频统计
SELECT DATE_FORMAT(publish_time, '%Y-%m') as month,
       COUNT(*) as videos_count,
       AVG(views) as avg_views,
       SUM(views) as total_views,
       LAG(SUM(views)) OVER (ORDER BY DATE_FORMAT(publish_time, '%Y-%m')) as prev_month_views
FROM videos
GROUP BY DATE_FORMAT(publish_time, '%Y-%m')
ORDER BY month;
  • 滑动窗口计算
-- 7天滑动平均播放量
SELECT publish_time,
       AVG(views) OVER (
           ORDER BY publish_time 
           RANGE BETWEEN INTERVAL 7 DAY PRECEDING AND CURRENT ROW
       ) as week_moving_avg
FROM videos
ORDER BY publish_time;

5. 复杂连接查询

  • 基础连接(左连接、右连接、内连接、外连接)
  • 自连接
-- 找出同一创作者中播放量差异较大的视频对
SELECT v1.video_id as video1_id, v1.views as views1,
       v2.video_id as video2_id, v2.views as views2,
       v1.creator_name,
       ABS(v1.views - v2.views) as views_diff
FROM videos v1
JOIN videos v2 ON v1.creator_id = v2.creator_id
WHERE v1.video_id < v2.video_id  -- 避免重复比较
  AND ABS(v1.views - v2.views) > 5000;

6. 高级字符串处理

核心函数

  • 基础操作
-- 字符串拼接与截取
SELECT 
    CONCAT(creator_name, '-', category) AS display_name,
    SUBSTRING(creator_name, 1, 3) AS prefix,
    LEFT(creator_name, 5) AS short_name,
    SUBSTRING_INDEX(creator_name, ' ', 1) AS first_name
FROM videos;

-- 长度与清理
SELECT 
    TRIM(creator_name) AS clean_name,
    LENGTH(creator_name) AS byte_len,
    CHAR_LENGTH(creator_name) AS char_len,
    UPPER(category) AS category_upper
FROM videos;
  • 查找替换
-- 字符串替换与定位
SELECT 
    REPLACE(category, '_', ' ') AS formatted_category,
    LOCATE(' ', creator_name) AS space_pos,
    REVERSE(creator_name) AS reversed_name
FROM videos;

模式匹配

  • LIKE与通配符
-- 模糊匹配
SELECT * FROM videos WHERE creator_name LIKE '段%';
SELECT * FROM videos WHERE category LIKE '%科技%';
  • 正则表达式
-- 格式验证
SELECT creator_name FROM videos 
WHERE creator_name REGEXP '^[A-Z][a-z]+ [A-Z][a-z]+$';

-- 提取替换
SELECT 
    REGEXP_SUBSTR(creator_name, '[A-Z][a-z]+') AS first_word,
    REGEXP_REPLACE(creator_name, '[0-9]', '') AS no_numbers
FROM videos;

字符串聚合

-- 分组聚合
SELECT 
    category,
    GROUP_CONCAT(creator_name ORDER BY views DESC) AS creators,
    GROUP_CONCAT(DISTINCT creator_name SEPARATOR '|') AS unique_creators
FROM videos GROUP BY category;

条件字符串处理

SELECT 
    creator_name,
    CASE 
        WHEN views > 80000 THEN CONCAT(creator_name, '(热门)')
        WHEN views > 5000 THEN CONCAT(creator_name, '(推荐)')
        ELSE creator_name
    END AS display_creator
FROM videos;

高级技巧

  • 字符串函数组合
-- 复合处理
SELECT 
    CONCAT(
        UPPER(LEFT(creator_name, 1)),
        LOWER(SUBSTRING(creator_name, 2))
    ) AS title_case,
    
    SUBSTRING_INDEX(
        SUBSTRING_INDEX(creator_name, ' ', 2), ' ', -1
    ) AS middle_name
FROM videos;
  • 数据清洗
-- 标准化处理
UPDATE videos SET 
    creator_name = TRIM(REGEXP_REPLACE(creator_name, '\\s+', ' ')),
    category = UPPER(LEFT(category, 1)) + LOWER(SUBSTRING(category, 2));
  • 搜索优化
-- 创建搜索字段
ALTER TABLE videos ADD COLUMN search_text TEXT 
GENERATED ALWAYS AS (
    CONCAT_WS(' ', creator_name, category, 
        CASE WHEN views > 50000 THEN 'popular' ELSE '' END)
) STORED;

-- 创建索引
CREATE INDEX idx_creator_prefix ON videos(creator_name(10));
CREATE FULLTEXT INDEX ft_search ON videos(search_text);

实用函数

  • 自定义搜索
-- 模糊搜索函数
CREATE FUNCTION search_creator(term VARCHAR(50))
RETURNS TEXT READS SQL DATA
BEGIN
    RETURN (
        SELECT GROUP_CONCAT(creator_name)
        FROM videos 
        WHERE creator_name LIKE CONCAT('%', term, '%')
           OR SOUNDEX(creator_name) = SOUNDEX(term)
    );
END;

7. 高级日期时间处理

核心函数

  • 获取当前时间
SELECT 
    NOW(), -- '2018-04-10 13:26:27'
    UNIX_TIMESTAMP(),--  '1752935187'
    CURRENT_TIME(), -- '13:26:27'
    CURDATE(), -- '2018-04-10'
    CURRENT_DATE(),-- '2018-04-10'
    CURRENT_TIMESTAMP(),-- '2018-04-10 13:26:27'
    CURTIME(), -- '13:26:27'
    CURRENT_USER() -- 'root@%'
FROM DUAL;
  • 日期提取
-- 从发布时间提取各部分
SELECT 
    video_id,
    publish_time,
    YEAR(publish_time) AS publish_year,
    EXTRACT(YEAR FROM publish_time) AS pub_year,
    MONTH(publish_time) AS publish_month,
    EXTRACT(MONTH FROM publish_time) AS pub_mth,
    DAY(publish_time) AS publish_day,
    EXTRACT(DAY FROM publish_time) AS pub_day,
    HOUR(publish_time) AS publish_hour,
    EXTRACT(HOUR FROM publish_time) AS pub_hour,
    WEEKDAY(publish_time) AS weekday,
    DAYNAME(publish_time) AS day_name
FROM videos;

日期格式化

-- 格式化显示
SELECT 
    video_id,
    DATE_FORMAT(publish_time, '%Y-%m-%d') AS publish_date,
    DATE_FORMAT(publish_time, '%H:%i:%s') AS publish_time_only,
    DATE_FORMAT(publish_time, '%Y年%m月%d日 %H:%i') AS chinese_format,
    DATE_FORMAT(publish_time, '%W, %M %D, %Y') AS full_format
FROM videos;

日期计算

-- 计算发布时间到现在的间隔
SELECT 
    video_id,
    creator_name,
    publish_time,
    DATEDIFF(NOW(), publish_time) AS days_ago,
    TIMESTAMPDIFF(HOUR, publish_time, NOW()) AS hours_ago,
    TIMESTAMPDIFF(MINUTE, publish_time, NOW()) AS minutes_ago
FROM videos;
-- 日期加减
SELECT 
    video_id,
    publish_time,
    DATE_ADD(publish_time, INTERVAL 7 DAY) AS week_later,
    DATE_SUB(publish_time, INTERVAL 1 MONTH) AS month_before,
    publish_time + INTERVAL 1 YEAR AS next_year,
    publish_time - INTERVAL 2 HOUR AS two_hours_before
FROM videos;

时间段分析

-- 按年月分组统计
SELECT 
    YEAR(publish_time) AS year,
    MONTH(publish_time) AS month,
    COUNT(*) AS video_count,
    AVG(views) AS avg_views,
    SUM(likes) AS total_likes
FROM videos 
GROUP BY YEAR(publish_time), MONTH(publish_time)
ORDER BY year DESC, month DESC;

-- 按小时分析发布规律
SELECT 
    HOUR(publish_time) AS hour,
    COUNT(*) AS video_count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM videos), 2) AS percentage
FROM videos 
GROUP BY HOUR(publish_time)
ORDER BY hour;
-- 工作日周末分析
SELECT 
    CASE 
        WHEN WEEKDAY(publish_time) IN (5, 6) THEN '周末'
        ELSE '工作日'
    END AS day_type,
    COUNT(*) AS video_count,
    AVG(views) AS avg_views,
    AVG(likes) AS avg_likes
FROM videos
GROUP BY day_type;

时间区间查询

-- 最近7天的视频
SELECT * FROM videos 
WHERE publish_time >= DATE_SUB(NOW(), INTERVAL 2 DAY);

-- 本月发布的视频
SELECT * FROM videos 
WHERE YEAR(publish_time) = YEAR(NOW())
  AND MONTH(publish_time) = MONTH(NOW());

-- 指定时间段
SELECT * FROM videos 
WHERE publish_time BETWEEN '2018-01-01' AND '2018-12-31';
-- 根据播放量计算热门时间段
SELECT 
    DATE_FORMAT(publish_time, '%Y-%m') AS month,
    COUNT(*) AS video_count,
    AVG(views) AS avg_views,
    CASE 
        WHEN AVG(views) > 100000 THEN '热门月份'
        WHEN AVG(views) > 50000 THEN '普通月份'
        ELSE '冷门月份'
    END AS month_type
FROM videos 
GROUP BY DATE_FORMAT(publish_time, '%Y-%m')
ORDER BY avg_views DESC;

时长处理

  • 时长格式化
-- 将秒数转换为时分秒格式
SELECT 
    video_id,
    duration,
    CONCAT(
        FLOOR(duration / 3600), ':',
        LPAD(FLOOR((duration % 3600) / 60), 2, '0'), ':',
        LPAD(duration % 60, 2, '0')
    ) AS duration_formatted,
    
    CASE 
        WHEN duration < 60 THEN '短视频'
        WHEN duration < 600 THEN '中等视频'
        ELSE '长视频'
    END AS video_type
FROM videos;
  • 时长统计
-- 时长分析
SELECT 
    CASE 
        WHEN duration < 60 THEN '0-1分钟'
        WHEN duration < 300 THEN '1-5分钟'
        WHEN duration < 600 THEN '5-10分钟'
        WHEN duration < 1800 THEN '10-30分钟'
        ELSE '30分钟以上'
    END AS duration_range,
    COUNT(*) AS video_count,
    AVG(views) AS avg_views,
    AVG(likes) AS avg_likes
FROM videos
GROUP BY duration_range
ORDER BY AVG(views) DESC;

时间序列分析

趋势分析
-- 按周统计发布趋势
SELECT 
    YEAR(publish_time) AS year,
    WEEK(publish_time) AS week,
    COUNT(*) AS video_count,
    AVG(views) AS avg_views,
    LAG(COUNT(*)) OVER (ORDER BY YEAR(publish_time), WEEK(publish_time)) AS prev_week_count
FROM videos
GROUP BY YEAR(publish_time), WEEK(publish_time)
ORDER BY year, week;

-- 累计统计
SELECT 
    DATE_FORMAT(publish_time, '%Y-%m') AS month,
    COUNT(*) AS monthly_count,
    SUM(COUNT(*)) OVER (ORDER BY DATE_FORMAT(publish_time, '%Y-%m')) AS cumulative_count
FROM videos
GROUP BY DATE_FORMAT(publish_time, '%Y-%m')
ORDER BY month;

实用查询模式

热门时间段识别

-- 找出最佳发布时间
SELECT 
    HOUR(publish_time) AS best_hour,
    DAYNAME(publish_time) AS best_day,
    AVG(views) AS avg_views,
    AVG(likes) AS avg_likes,
    COUNT(*) AS sample_count
FROM videos
WHERE publish_time >= DATE_SUB(NOW(), INTERVAL 3 MONTH)
GROUP BY HOUR(publish_time), DAYNAME(publish_time)
HAVING COUNT(*) >= 5
ORDER BY avg_views DESC
LIMIT 10;

复合时间查询

创建时间维度视图

-- 时间分析视图
CREATE VIEW video_time_analysis AS
SELECT 
    video_id,
    creator_name,
    category,
    publish_time,
    views,
    likes,
    
    -- 时间维度
    YEAR(publish_time) AS publish_year,
    QUARTER(publish_time) AS publish_quarter,
    MONTH(publish_time) AS publish_month,
    WEEK(publish_time) AS publish_week,
    DAYOFWEEK(publish_time) AS publish_weekday,
    HOUR(publish_time) AS publish_hour,
    
    -- 时间计算
    DATEDIFF(NOW(), publish_time) AS days_since_publish,
    TIMESTAMPDIFF(HOUR, publish_time, NOW()) AS hours_since_publish,
    
    -- 时间分类
    CASE 
        WHEN HOUR(publish_time) BETWEEN 6 AND 11 THEN '上午'
        WHEN HOUR(publish_time) BETWEEN 12 AND 17 THEN '下午'
        WHEN HOUR(publish_time) BETWEEN 18 AND 23 THEN '晚上'
        ELSE '深夜'
    END AS time_period,
    
    CASE 
        WHEN WEEKDAY(publish_time) IN (5, 6) THEN '周末'
        ELSE '工作日'
    END AS day_type
FROM videos;

8. 存储过程与函数

存储函数

基础计算函数
-- 计算互动率
DELIMITER $$
CREATE FUNCTION calculate_engagement_rate(p_views INT, p_likes INT, p_comments INT, p_shares INT)
RETURNS DECIMAL(5,2)
READS SQL DATA
DETERMINISTIC
BEGIN
    DECLARE engagement_rate DECIMAL(5,2);
    
    IF p_views = 0 OR p_views IS NULL THEN
        RETURN 0;
    END IF;
    
    SET engagement_rate = ((p_likes + p_comments + p_shares) / p_views) * 100;
    RETURN engagement_rate;
END$$
DELIMITER ;

-- 使用函数
SELECT 
    video_id,
    creator_name,
    views,
    calculate_engagement_rate(views, likes, comments, shares) AS engagement_rate
FROM videos;
查看代码
-- 视频热度等级
DELIMITER $$
CREATE FUNCTION get_video_grade(p_views INT, p_likes INT)
RETURNS VARCHAR(10)
READS SQL DATA
DETERMINISTIC
BEGIN
    DECLARE grade VARCHAR(10);
    DECLARE engagement_score INT;
    
    SET engagement_score = p_views + (p_likes * 10);
    
    CASE
        WHEN engagement_score >= 1000000 THEN SET grade = 'S级';
        WHEN engagement_score >= 500000 THEN SET grade = 'A级';
        WHEN engagement_score >= 100000 THEN SET grade = 'B级';
        WHEN engagement_score >= 10000 THEN SET grade = 'C级';
        ELSE SET grade = 'D级';
    END CASE;
    
    RETURN grade;
END$$
DELIMITER ;
时长格式化函数
-- 格式化视频时长
DELIMITER $$
CREATE FUNCTION format_duration(p_duration INT)
RETURNS VARCHAR(20)
DETERMINISTIC
BEGIN
    DECLARE hours INT;
    DECLARE minutes INT;
    DECLARE seconds INT;
    DECLARE result VARCHAR(20);
    
    SET hours = FLOOR(p_duration / 3600);
    SET minutes = FLOOR((p_duration % 3600) / 60);
    SET seconds = p_duration % 60;
    
    IF hours > 0 THEN
        SET result = CONCAT(hours, ':', LPAD(minutes, 2, '0'), ':', LPAD(seconds, 2, '0'));
    ELSE
        SET result = CONCAT(minutes, ':', LPAD(seconds, 2, '0'));
    END IF;
    
    RETURN result;
END$$
DELIMITER ;

存储过程

-- 创作者统计分析
DELIMITER $$
CREATE PROCEDURE analyze_creator_performance(IN p_creator_id INT)
BEGIN
    DECLARE v_total_videos INT;
    DECLARE v_total_views BIGINT;
    DECLARE v_avg_engagement DECIMAL(5,2);
    
    -- 统计视频数量
    SELECT COUNT(*) INTO v_total_videos
    FROM videos WHERE creator_id = p_creator_id;
    
    -- 统计总播放量
    SELECT SUM(views) INTO v_total_views
    FROM videos WHERE creator_id = p_creator_id;
    
    -- 计算平均互动率
    SELECT AVG(calculate_engagement_rate(views, likes, comments, shares)) 
    INTO v_avg_engagement
    FROM videos WHERE creator_id = p_creator_id;
    
    -- 返回结果
    SELECT 
        p_creator_id AS creator_id,
        v_total_videos AS total_videos,
        v_total_views AS total_views,
        v_avg_engagement AS avg_engagement_rate,
        CASE 
            WHEN v_avg_engagement >= 5.0 THEN '优秀'
            WHEN v_avg_engagement >= 3.0 THEN '良好'
            WHEN v_avg_engagement >= 1.0 THEN '一般'
            ELSE '较差'
        END AS performance_level;
END$$
DELIMITER ;

-- 调用过程
CALL analyze_creator_performance(1001);
数据更新过程
-- 批量更新视频等级
DELIMITER $$
CREATE PROCEDURE update_video_grades()
BEGIN
    DECLARE done INT DEFAULT FALSE;
    DECLARE v_video_id INT;
    DECLARE v_views INT;
    DECLARE v_likes INT;
    DECLARE v_grade VARCHAR(10);
    
    DECLARE video_cursor CURSOR FOR 
        SELECT video_id, views, likes FROM videos;
    DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;
    
    -- 添加等级列(如果不存在)
    SET @sql = 'ALTER TABLE videos ADD COLUMN IF NOT EXISTS grade VARCHAR(10)';
    PREPARE stmt FROM @sql;
    EXECUTE stmt;
    DEALLOCATE PREPARE stmt;
    
    OPEN video_cursor;
    
    read_loop: LOOP
        FETCH video_cursor INTO v_video_id, v_views, v_likes;
        IF done THEN
            LEAVE read_loop;
        END IF;
        
        SET v_grade = get_video_grade(v_views, v_likes);
        
        UPDATE videos 
        SET grade = v_grade 
        WHERE video_id = v_video_id;
        
    END LOOP;
    
    CLOSE video_cursor;
    
    SELECT CONCAT('更新完成,共处理 ', ROW_COUNT(), ' 条记录') AS result;
END$$
DELIMITER ;
分页查询过程
-- 分页获取视频列表
DELIMITER $$
CREATE PROCEDURE get_videos_page(
    IN p_page_num INT,
    IN p_page_size INT,
    IN p_category VARCHAR(20),
    IN p_order_by VARCHAR(20)
)
BEGIN
    DECLARE v_offset INT;
    DECLARE v_sql TEXT;
    
    SET v_offset = (p_page_num - 1) * p_page_size;
    
    SET v_sql = 'SELECT 
        video_id,
        creator_name,
        category,
        views,
        likes,
        comments,
        format_duration(duration) AS duration_formatted,
        calculate_engagement_rate(views, likes, comments, shares) AS engagement_rate,
        publish_time
    FROM videos WHERE 1=1';
    
    IF p_category IS NOT NULL AND p_category != '' THEN
        SET v_sql = CONCAT(v_sql, ' AND category = ''', p_category, '''');
    END IF;
    
    CASE p_order_by
        WHEN 'views' THEN SET v_sql = CONCAT(v_sql, ' ORDER BY views DESC');
        WHEN 'likes' THEN SET v_sql = CONCAT(v_sql, ' ORDER BY likes DESC');
        WHEN 'publish_time' THEN SET v_sql = CONCAT(v_sql, ' ORDER BY publish_time DESC');
        ELSE SET v_sql = CONCAT(v_sql, ' ORDER BY video_id DESC');
    END CASE;
    
    SET v_sql = CONCAT(v_sql, ' LIMIT ', p_page_size, ' OFFSET ', v_offset);
    
    SET @sql = v_sql;
    PREPARE stmt FROM @sql;
    EXECUTE stmt;
    DEALLOCATE PREPARE stmt;
END$$
DELIMITER ;

-- 调用分页查询
CALL get_videos_page(1, 10, 'tech', 'views');

条件控制与循环

复杂业务逻辑处理
-- 视频推荐算法
DELIMITER $$
CREATE PROCEDURE recommend_videos(
    IN p_user_category VARCHAR(20),
    IN p_min_views INT,
    OUT p_recommended_count INT
)
BEGIN
    DECLARE v_count INT DEFAULT 0;
    
    DROP TEMPORARY TABLE IF EXISTS temp_recommendations;
    CREATE TEMPORARY TABLE temp_recommendations (
        video_id INT,
        creator_name VARCHAR(50),
        score DECIMAL(10,2)
    );
    
    -- 基于分类推荐
    IF p_user_category IS NOT NULL THEN
        INSERT INTO temp_recommendations
        SELECT 
            video_id,
            creator_name,
            (views * 0.6 + likes * 0.3 + comments * 0.1) as score
        FROM videos 
        WHERE category = p_user_category 
          AND views >= p_min_views
        ORDER BY score DESC
        LIMIT 20;
        
        SET v_count = ROW_COUNT();
    END IF;
    
    -- 如果推荐数量不足,补充热门视频
    IF v_count < 10 THEN
        INSERT INTO temp_recommendations
        SELECT 
            video_id,
            creator_name,
            (views * 0.6 + likes * 0.3 + comments * 0.1) as score
        FROM videos 
        WHERE video_id NOT IN (SELECT video_id FROM temp_recommendations)
          AND views >= p_min_views
        ORDER BY score DESC
        LIMIT (10 - v_count);
    END IF;
    
    SELECT * FROM temp_recommendations ORDER BY score DESC;
    SELECT COUNT(*) INTO p_recommended_count FROM temp_recommendations;
    
    DROP TEMPORARY TABLE temp_recommendations;
END$$
DELIMITER ;

-- 调用推荐过程
SET @rec_count = 0;
CALL recommend_videos('tech', 1000, @rec_count);
SELECT @rec_count AS total_recommendations;

异常处理

带异常处理的数据操作
-- 安全的数据更新过程
DELIMITER $$
CREATE PROCEDURE safe_update_video_stats(
    IN p_video_id INT,
    IN p_new_views INT,
    IN p_new_likes INT,
    OUT p_success BOOLEAN,
    OUT p_message VARCHAR(200)
)
BEGIN
    DECLARE EXIT HANDLER FOR SQLEXCEPTION
    BEGIN
        ROLLBACK;
        GET DIAGNOSTICS CONDITION 1
            p_message = MESSAGE_TEXT;
        SET p_success = FALSE;
    END;
    
    START TRANSACTION;
    
    -- 验证输入参数
    IF p_video_id IS NULL OR p_new_views < 0 OR p_new_likes < 0 THEN
        SET p_success = FALSE;
        SET p_message = '参数错误:视频ID不能为空,播放量和点赞数不能为负数';
        ROLLBACK;
    ELSE
        -- 检查视频是否存在
        IF NOT EXISTS(SELECT 1 FROM videos WHERE video_id = p_video_id) THEN
            SET p_success = FALSE;
            SET p_message = CONCAT('视频ID ', p_video_id, ' 不存在');
            ROLLBACK;
        ELSE
            -- 执行更新
            UPDATE videos 
            SET views = p_new_views, 
                likes = p_new_likes
            WHERE video_id = p_video_id;
            
            COMMIT;
            SET p_success = TRUE;
            SET p_message = '更新成功';
        END IF;
    END IF;
END$$
DELIMITER ;

-- 调用安全更新过程
SET @success = FALSE;
SET @msg = '';
CALL safe_update_video_stats(1001, 50000, 2000, @success, @msg);
SELECT @success AS is_success, @msg AS message;

实用工具过程

数据备份与清理
-- 数据归档过程
DELIMITER $$
CREATE PROCEDURE archive_old_videos(IN p_days_old INT)
BEGIN
    DECLARE v_archive_count INT;
    
    -- 创建归档表(如果不存在)
    CREATE TABLE IF NOT EXISTS videos_archive LIKE videos;
    
    -- 将旧数据移动到归档表
    INSERT INTO videos_archive
    SELECT * FROM videos 
    WHERE publish_time < DATE_SUB(NOW(), INTERVAL p_days_old DAY);
    
    SET v_archive_count = ROW_COUNT();
    
    -- 删除原表中的旧数据
    DELETE FROM videos 
    WHERE publish_time < DATE_SUB(NOW(), INTERVAL p_days_old DAY);
    
    SELECT CONCAT('已归档 ', v_archive_count, ' 条记录') AS result;
END$$
DELIMITER ;
统计报表生成
-- 生成统计报表
DELIMITER $$
CREATE PROCEDURE generate_stats_report()
BEGIN
    -- 总体统计
    SELECT 
        '总体统计' AS report_type,
        COUNT(*) AS total_videos,
        SUM(views) AS total_views,
        SUM(likes) AS total_likes,
        AVG(calculate_engagement_rate(views, likes, comments, shares)) AS avg_engagement
    FROM videos
    
    UNION ALL
    
    -- 分类统计
    SELECT 
        CONCAT('分类-', category) AS report_type,
        COUNT(*) AS total_videos,
        SUM(views) AS total_views,
        SUM(likes) AS total_likes,
        AVG(calculate_engagement_rate(views, likes, comments, shares)) AS avg_engagement
    FROM videos
    GROUP BY category
    
    UNION ALL
    
    -- 创作者TOP5
    SELECT 
        CONCAT('创作者-', creator_name) AS report_type,
        COUNT(*) AS total_videos,
        SUM(views) AS total_views,
        SUM(likes) AS total_likes,
        AVG(calculate_engagement_rate(views, likes, comments, shares)) AS avg_engagement
    FROM videos
    GROUP BY creator_name
    ORDER BY total_views DESC
    LIMIT 5;
END$$
DELIMITER ;

调试与管理

查看存储过程状态
-- 查看已创建的存储过程和函数
SHOW PROCEDURE STATUS WHERE Db = DATABASE();
SHOW FUNCTION STATUS WHERE Db = DATABASE();

-- 查看具体定义
SHOW CREATE PROCEDURE analyze_creator_performance;
SHOW CREATE FUNCTION calculate_engagement_rate;

-- 删除存储过程和函数
DROP PROCEDURE IF EXISTS analyze_creator_performance;
DROP FUNCTION IF EXISTS calculate_engagement_rate;

9. 性能优化技巧

慢查询分析

-- 开启慢查询日志
SET GLOBAL slow_query_log = 'ON';
SET GLOBAL long_query_time = 2;

-- 查看慢查询状态
SHOW GLOBAL STATUS LIKE 'Slow_queries';

-- 分析具体查询性能
explain SELECT DISTINCT
    v1.creator_name as creator,
    v1.category,
    COUNT(v1.video_id) as video_count
FROM videos v1
WHERE YEAR(v1.publish_time) = 2018 
  AND MONTH(v1.publish_time) = 1
GROUP BY v1.creator_id, v1.creator_name, v1.category
ORDER BY COUNT(v1.video_id) DESC;

-- 使用EXPLAIN FORMAT=JSON获取详细信息
EXPLAIN FORMAT=JSON 
SELECT DISTINCT
    v1.creator_name as creator,
    v1.category,
    COUNT(v1.video_id) as video_count
FROM videos v1
WHERE YEAR(v1.publish_time) = 2018 
  AND MONTH(v1.publish_time) = 1
GROUP BY v1.creator_id, v1.creator_name, v1.category
ORDER BY COUNT(v1.video_id) DESC;

表结构优化

数据类型优化
-- 优化数据类型选择
ALTER TABLE videos 
MODIFY COLUMN creator_id INT UNSIGNED,
MODIFY COLUMN views INT UNSIGNED,
MODIFY COLUMN likes MEDIUMINT UNSIGNED,
MODIFY COLUMN comments MEDIUMINT UNSIGNED,
MODIFY COLUMN shares SMALLINT UNSIGNED,
MODIFY COLUMN duration SMALLINT UNSIGNED;

-- 使用ENUM替代VARCHAR(如果值固定)
ALTER TABLE videos 
ADD COLUMN status ENUM('draft', 'published', 'archived', 'deleted') 
DEFAULT 'published';
垂直分割
-- 将不常用字段分离到扩展表
CREATE TABLE video_extended (
    video_id INT PRIMARY KEY,
    description TEXT,
    tags TEXT,
    thumbnail_url VARCHAR(255),
    processing_info JSON,
    FOREIGN KEY (video_id) REFERENCES videos(video_id)
);

-- 热点数据表
CREATE TABLE video_stats (
    video_id INT PRIMARY KEY,
    views INT UNSIGNED,
    likes MEDIUMINT UNSIGNED,
    comments MEDIUMINT UNSIGNED,
    shares SMALLINT UNSIGNED,
    last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    FOREIGN KEY (video_id) REFERENCES videos(video_id)
);

配置优化

-- 查看当前配置
SHOW VARIABLES LIKE 'innodb_buffer_pool_size';
SHOW VARIABLES LIKE 'query_cache%';
SHOW VARIABLES LIKE 'tmp_table_size';

-- 建议配置(需要修改my.cnf)
/*
[mysqld]
# InnoDB缓冲池大小(物理内存的70-80%)
innodb_buffer_pool_size = 2G

# 查询缓存
query_cache_type = 1
query_cache_size = 256M

# 临时表大小
tmp_table_size = 64M
max_heap_table_size = 64M

# 连接数
max_connections = 200
max_connect_errors = 1000

# 日志设置
slow_query_log = 1
long_query_time = 2
*/

【待完善】

索引策略: 复合索引、覆盖索引、前缀索引
查询优化: 避免函数、优化JOIN、改写子查询
表结构: 合适的数据类型、垂直/水平分割
配置调优: 缓冲池、连接数、慢查询
缓存机制: 查询缓存、结果缓存、汇总表
批量操作: 批量插入/更新、事务控制
监控诊断: 性能监控、索引分析、碎片清理
分区策略: 时间分区、哈希分区、范围分区

参考:http://spring.io/guides/gs/rest-service-cors/

posted @ 2018-04-10 13:55  lvlin241  阅读(124)  评论(0)    收藏  举报