基础测试表结构
测试数据库
use `testdb`;
CREATE TABLE users (
user_id INT PRIMARY KEY,
username VARCHAR(50),
email VARCHAR(100),
register_date DATE,
city VARCHAR(50),
age INT,
status VARCHAR(20) DEFAULT 'active'
);
CREATE TABLE products (
product_id INT PRIMARY KEY,
product_name VARCHAR(100),
category VARCHAR(50),
price DECIMAL(10,2),
cost DECIMAL(10,2),
stock_quantity INT,
create_date DATE
);
CREATE TABLE orders (
order_id INT PRIMARY KEY,
user_id INT,
order_date DATETIME,
total_amount DECIMAL(10,2),
status VARCHAR(20),
FOREIGN KEY (user_id) REFERENCES users(user_id)
);
CREATE TABLE order_items (
item_id INT PRIMARY KEY,
order_id INT,
product_id INT,
quantity INT,
unit_price DECIMAL(10,2),
FOREIGN KEY (order_id) REFERENCES orders(order_id),
FOREIGN KEY (product_id) REFERENCES products(product_id)
);
-- 测试数据
INSERT INTO users VALUES
(1, 'Alice', 'alice@email.com', '2023-01-15', 'Beijing', 25, 'active'),
(2, 'Bob', 'bob@email.com', '2023-02-20', 'Shanghai', 30, 'active'),
(3, 'Charlie', 'charlie@email.com', '2023-03-10', 'Guangzhou', 28, 'inactive'),
(4, 'David', 'david@email.com', '2023-04-05', 'Beijing', 35, 'active'),
(5, 'Eva', 'eva@email.com', '2023-05-12', 'Shanghai', 27, 'active'),
(6, 'Frank', 'frank@email.com', '2023-06-18', 'Shenzhen', 32, 'active'),
(7, 'Grace', 'grace@email.com', '2023-07-22', 'Beijing', 29, 'active'),
(8, 'Henry', 'henry@email.com', '2023-08-30', 'Shanghai', 31, 'inactive');
INSERT INTO products VALUES
(101, 'iPhone 14', 'Electronics', 6999.00, 5000.00, 50, '2023-01-01'),
(102, 'MacBook Pro', 'Electronics', 15999.00, 12000.00, 30, '2023-01-01'),
(103, 'Nike Shoes', 'Fashion', 799.00, 400.00, 100, '2023-01-01'),
(104, 'Coffee Maker', 'Home', 299.00, 150.00, 80, '2023-01-01'),
(105, 'Book Set', 'Books', 199.00, 100.00, 200, '2023-01-01'),
(106, 'Samsung TV', 'Electronics', 3999.00, 2800.00, 25, '2023-01-01'),
(107, 'Adidas Jacket', 'Fashion', 599.00, 300.00, 60, '2023-01-01'),
(108, 'Kitchen Knife', 'Home', 89.00, 40.00, 150, '2023-01-01');
INSERT INTO orders VALUES
(1001, 1, '2024-01-10 10:30:00', 7798.00, 'completed'),
(1002, 2, '2024-01-15 14:20:00', 16798.00, 'completed'),
(1003, 1, '2024-01-20 09:15:00', 1098.00, 'completed'),
(1004, 3, '2024-01-25 16:45:00', 498.00, 'pending'),
(1005, 4, '2024-02-01 11:20:00', 7999.00, 'completed'),
(1006, 2, '2024-02-05 13:30:00', 4598.00, 'completed'),
(1007, 5, '2024-02-10 15:10:00', 1398.00, 'completed'),
(1008, 6, '2024-02-15 12:45:00', 888.00, 'completed'),
(1009, 7, '2024-02-20 17:20:00', 6999.00, 'completed'),
(1010, 8, '2024-02-25 10:15:00', 299.00, 'cancelled'),
(1011, 1, '2024-03-01 14:30:00', 3999.00, 'completed'),
(1012, 4, '2024-03-05 16:10:00', 688.00, 'completed');
INSERT INTO order_items VALUES
(1, 1001, 101, 1, 6999.00),
(2, 1001, 103, 1, 799.00),
(3, 1002, 102, 1, 15999.00),
(4, 1002, 103, 1, 799.00),
(5, 1003, 103, 1, 799.00),
(6, 1003, 104, 1, 299.00),
(7, 1004, 104, 1, 299.00),
(8, 1004, 105, 1, 199.00),
(9, 1005, 101, 1, 6999.00),
(10, 1005, 102, 1, 1000.00),
(11, 1006, 106, 1, 3999.00),
(12, 1006, 107, 1, 599.00),
(13, 1007, 103, 1, 799.00),
(14, 1007, 107, 1, 599.00),
(15, 1008, 108, 10, 89.00),
(16, 1009, 101, 1, 6999.00),
(17, 1010, 104, 1, 299.00),
(18, 1011, 106, 1, 3999.00),
(19, 1012, 107, 1, 599.00),
(20, 1012, 108, 1, 89.00);
1. 窗口函数
用户消费排名和累计分析
-- 解决排名、累计、环比等复杂分析需求
-- 原理:在结果集的分区内进行排序和计算,不改变行数
-- 场景:排名、累计计算、移动平均、同比环比
SELECT
u.user_id,
u.username,
u.city,
SUM(o.total_amount) as total_spent,
-- 排名函数
ROW_NUMBER() OVER (ORDER BY SUM(o.total_amount) DESC) as row_num,
RANK() OVER (ORDER BY SUM(o.total_amount) DESC) as rank_num,
DENSE_RANK() OVER (ORDER BY SUM(o.total_amount) DESC) as dense_rank_num,
-- 分组排名
RANK() OVER (PARTITION BY u.city ORDER BY SUM(o.total_amount) DESC) as city_rank,
-- 累计计算
SUM(SUM(o.total_amount)) OVER (ORDER BY SUM(o.total_amount) DESC) as cumulative_spent,
-- 占比计算
ROUND(SUM(o.total_amount) * 100.0 / SUM(SUM(o.total_amount)) OVER(), 2) as percentage
FROM users u
JOIN orders o ON u.user_id = o.user_id
WHERE o.status = 'completed'
GROUP BY u.user_id, u.username, u.city
ORDER BY total_spent DESC;
![]()
月度销售趋势分析
WITH monthly_sales AS (
SELECT
DATE_FORMAT(order_date, '%Y-%m') as month,
SUM(total_amount) as monthly_revenue,
COUNT(*) as order_count
FROM orders
WHERE status = 'completed'
GROUP BY DATE_FORMAT(order_date, '%Y-%m')
)
SELECT
month,
monthly_revenue,
order_count,
-- 环比增长
LAG(monthly_revenue, 1) OVER (ORDER BY month) as prev_month_revenue,
ROUND((monthly_revenue - LAG(monthly_revenue, 1) OVER (ORDER BY month)) /
LAG(monthly_revenue, 1) OVER (ORDER BY month) * 100, 2) as mom_growth,
-- 移动平均
ROUND(AVG(monthly_revenue) OVER (ORDER BY month ROWS BETWEEN 2 PRECEDING AND CURRENT ROW), 2) as ma_3month,
-- 累计收入
SUM(monthly_revenue) OVER (ORDER BY month) as cumulative_revenue
FROM monthly_sales
ORDER BY month;
2. 条件聚合
用户行为分析和订单状态统计
-- 条件聚合 - CASE WHEN + 聚合函数
-- 原理:根据条件对不同数据进行分组聚合
-- 场景:数据透视、多维度统计、业务指标计算
SELECT
u.city,
COUNT(DISTINCT u.user_id) as total_users,
-- 按年龄段统计用户数
SUM(CASE WHEN u.age < 25 THEN 1 ELSE 0 END) as users_under_25,
SUM(CASE WHEN u.age BETWEEN 25 AND 30 THEN 1 ELSE 0 END) as users_25_30,
SUM(CASE WHEN u.age > 30 THEN 1 ELSE 0 END) as users_over_30,
-- 按订单状态统计
COUNT(CASE WHEN o.status = 'completed' THEN 1 END) as completed_orders,
COUNT(CASE WHEN o.status = 'pending' THEN 1 END) as pending_orders,
COUNT(CASE WHEN o.status = 'cancelled' THEN 1 END) as cancelled_orders,
-- 按金额区间统计
SUM(CASE WHEN o.total_amount < 1000 THEN o.total_amount ELSE 0 END) as low_value_sales,
SUM(CASE WHEN o.total_amount >= 1000 AND o.total_amount < 5000 THEN o.total_amount ELSE 0 END) as mid_value_sales,
SUM(CASE WHEN o.total_amount >= 5000 THEN o.total_amount ELSE 0 END) as high_value_sales,
-- 平均客单价
ROUND(AVG(CASE WHEN o.status = 'completed' THEN o.total_amount END), 2) as avg_order_value
FROM users u
LEFT JOIN orders o ON u.user_id = o.user_id
GROUP BY u.city
ORDER BY total_users DESC;
3. CTE:提高复杂查询的可读性和维护性
客户生命周期价值分析
WITH customer_metrics AS (
-- 第一步:计算客户基础指标
SELECT
u.user_id,
u.username,
u.register_date,
MIN(o.order_date) as first_order_date,
MAX(o.order_date) as last_order_date,
COUNT(o.order_id) as total_orders,
SUM(o.total_amount) as total_spent,
AVG(o.total_amount) as avg_order_value,
DATEDIFF(MAX(o.order_date), MIN(o.order_date)) as customer_lifespan_days
FROM users u
LEFT JOIN orders o ON u.user_id = o.user_id AND o.status = 'completed'
GROUP BY u.user_id, u.username, u.register_date
),
customer_segments AS (
-- 第二步:客户分层
SELECT *,
CASE
WHEN total_orders >= 3 AND total_spent >= 10000 THEN 'VIP'
WHEN total_orders >= 2 AND total_spent >= 5000 THEN 'Premium'
WHEN total_orders >= 1 AND total_spent >= 1000 THEN 'Regular'
WHEN total_orders >= 1 THEN 'New'
ELSE 'Inactive'
END as customer_segment,
CASE
WHEN customer_lifespan_days > 60 THEN 'Long-term'
WHEN customer_lifespan_days > 30 THEN 'Medium-term'
WHEN customer_lifespan_days > 0 THEN 'Short-term'
ELSE 'Single-purchase'
END as relationship_length
FROM customer_metrics
)
-- 第三步:最终分析
SELECT
customer_segment,
relationship_length,
COUNT(*) as customer_count,
ROUND(AVG(total_spent), 2) as avg_clv,
ROUND(AVG(avg_order_value), 2) as avg_aov,
ROUND(AVG(total_orders), 2) as avg_frequency,
ROUND(AVG(customer_lifespan_days), 1) as avg_lifespan_days
FROM customer_segments
GROUP BY customer_segment, relationship_length
ORDER BY
CASE customer_segment
WHEN 'VIP' THEN 1
WHEN 'Premium' THEN 2
WHEN 'Regular' THEN 3
WHEN 'New' THEN 4
ELSE 5
END;
![]()
4. 子查询:灵活的数据过滤和动态计算
相关子查询和标量子查询
-- 子查询
-- 原理:在查询中嵌套另一个查询,用于过滤、计算或比较
-- 场景:数据过滤、动态计算、存在性检查
-- 业务:产品销售分析和异常检测
SELECT
p.product_id,
p.product_name,
p.category,
p.price,
-- 标量子查询:计算销售数量
(SELECT COALESCE(SUM(oi.quantity), 0)
FROM order_items oi
JOIN orders o ON oi.order_id = o.order_id
WHERE oi.product_id = p.product_id AND o.status = 'completed') as total_sold,
-- 标量子查询:计算销售收入
(SELECT COALESCE(SUM(oi.quantity * oi.unit_price), 0)
FROM order_items oi
JOIN orders o ON oi.order_id = o.order_id
WHERE oi.product_id = p.product_id AND o.status = 'completed') as total_revenue,
-- 标量子查询:计算利润
(SELECT COALESCE(SUM(oi.quantity * (oi.unit_price - p.cost)), 0)
FROM order_items oi
JOIN orders o ON oi.order_id = o.order_id
WHERE oi.product_id = p.product_id AND o.status = 'completed') as total_profit,
-- 比较子查询:与平均价格比较
CASE
WHEN p.price > (SELECT AVG(price) FROM products WHERE category = p.category)
THEN 'Above Average'
ELSE 'Below Average'
END as price_level,
-- 存在性检查:是否为热销产品
CASE
WHEN EXISTS (
SELECT 1 FROM order_items oi
JOIN orders o ON oi.order_id = o.order_id
WHERE oi.product_id = p.product_id
AND o.status = 'completed'
AND o.order_date >= DATE_SUB(CURRENT_DATE, INTERVAL 30 DAY)
) THEN 'Active'
ELSE 'Inactive'
END as sales_status
FROM products p
ORDER BY total_revenue DESC;
![]()
5. 多表连接:整合多源数据,构建完整视图
JOIN的各种形式
-- 多表连接
-- 原理:根据关联条件组合多个表的数据
-- 场景:关联查询、数据整合、业务报表
-- 业务:完整的销售报表分析
SELECT
o.order_id,
o.order_date,
u.username,
u.city,
u.age,
p.product_name,
p.category,
oi.quantity,
oi.unit_price,
oi.quantity * oi.unit_price as item_total,
o.total_amount as order_total,
p.cost,
(oi.unit_price - p.cost) * oi.quantity as item_profit,
-- 计算利润率
ROUND(((oi.unit_price - p.cost) / oi.unit_price) * 100, 2) as profit_margin,
o.status
FROM orders o
INNER JOIN users u ON o.user_id = u.user_id
INNER JOIN order_items oi ON o.order_id = oi.order_id
INNER JOIN products p ON oi.product_id = p.product_id
WHERE o.status = 'completed'
ORDER BY o.order_date DESC, o.order_id;
-- 案例:用户购买偏好分析(左连接)
SELECT
u.user_id,
u.username,
u.city,
u.age,
COUNT(DISTINCT o.order_id) as total_orders,
COUNT(DISTINCT oi.product_id) as unique_products,
GROUP_CONCAT(DISTINCT p.category) as preferred_categories,
COALESCE(SUM(o.total_amount), 0) as total_spent,
CASE
WHEN COUNT(DISTINCT o.order_id) = 0 THEN 'No Purchase'
WHEN COUNT(DISTINCT o.order_id) = 1 THEN 'One-time Buyer'
WHEN COUNT(DISTINCT o.order_id) <= 3 THEN 'Occasional Buyer'
ELSE 'Frequent Buyer'
END as buyer_type
FROM users u
LEFT JOIN orders o ON u.user_id = o.user_id AND o.status = 'completed'
LEFT JOIN order_items oi ON o.order_id = oi.order_id
LEFT JOIN products p ON oi.product_id = p.product_id
GROUP BY u.user_id, u.username, u.city, u.age
ORDER BY total_spent DESC;
![]()
6. 分组聚合:基础统计分析的核心技术
聚合函数
-- 分组聚合 - GROUP BY + 聚合函数
-- 原理:按指定列分组,对每组应用聚合函数
-- 场景:统计分析、汇总报表、维度分析
-- 业务:多维度销售分析
SELECT
p.category,
DATE_FORMAT(o.order_date, '%Y-%m') as sales_month,
COUNT(DISTINCT o.order_id) as order_count,
COUNT(DISTINCT o.user_id) as customer_count,
SUM(oi.quantity) as total_quantity,
SUM(oi.quantity * oi.unit_price) as total_revenue,
AVG(oi.quantity * oi.unit_price) as avg_item_value,
MIN(oi.unit_price) as min_price,
MAX(oi.unit_price) as max_price,
-- 计算集中度(标准差)
ROUND(STDDEV(oi.quantity * oi.unit_price), 2) as revenue_stddev
FROM products p
JOIN order_items oi ON p.product_id = oi.product_id
JOIN orders o ON oi.order_id = o.order_id
WHERE o.status = 'completed'
GROUP BY p.category, DATE_FORMAT(o.order_date, '%Y-%m')
HAVING total_revenue > 500 -- HAVING过滤聚合结果
ORDER BY p.category, sales_month;
![]()
7. 数据去重
数据去重
-- 数据去重 - DISTINCT 和 GROUP BY
-- 原理:消除重复记录,保证数据唯一性
-- 场景:数据清洗、唯一性统计、去重查询
-- 业务_1:客户去重和活跃度分析
SELECT
COUNT(DISTINCT u.user_id) as total_users,
COUNT(DISTINCT CASE WHEN o.order_id IS NOT NULL THEN u.user_id END) as active_users,
COUNT(DISTINCT u.city) as cities_covered,
COUNT(DISTINCT p.category) as categories_sold,
-- 去重后的平均年龄
ROUND(AVG(DISTINCT u.age), 1) as avg_age,
-- 活跃用户比例
ROUND(COUNT(DISTINCT CASE WHEN o.order_id IS NOT NULL THEN u.user_id END) * 100.0 /
COUNT(DISTINCT u.user_id), 2) as active_user_rate
FROM users u
LEFT JOIN orders o ON u.user_id = o.user_id AND o.status = 'completed'
LEFT JOIN order_items oi ON o.order_id = oi.order_id
LEFT JOIN products p ON oi.product_id = p.product_id;
![]()
产品交叉销售分析
SELECT DISTINCT
p1.product_name as product_a,
p2.product_name as product_b,
COUNT(DISTINCT o.order_id) as co_purchase_count,
COUNT(DISTINCT o.user_id) as co_purchase_users
FROM order_items oi1
JOIN order_items oi2 ON oi1.order_id = oi2.order_id AND oi1.product_id < oi2.product_id
JOIN products p1 ON oi1.product_id = p1.product_id
JOIN products p2 ON oi2.product_id = p2.product_id
JOIN orders o ON oi1.order_id = o.order_id
WHERE o.status = 'completed'
GROUP BY p1.product_id, p1.product_name, p2.product_id, p2.product_name
HAVING co_purchase_count >= 2
ORDER BY co_purchase_count DESC;
8. 排序分页
排序和分页
-- 排序和分页 - ORDER BY + LIMIT
-- 原理:对结果排序并限制返回行数
-- 场景:排行榜、分页查询、Top N分析
-- 业务:销售排行榜和分页 (Top 5 畅销产品)
SELECT
p.product_name,
p.category,
p.price,
SUM(oi.quantity) as total_sold,
SUM(oi.quantity * oi.unit_price) as total_revenue,
COUNT(DISTINCT o.order_id) as order_count
FROM products p
JOIN order_items oi ON p.product_id = oi.product_id
JOIN orders o ON oi.order_id = o.order_id
WHERE o.status = 'completed'
GROUP BY p.product_id, p.product_name, p.category, p.price
ORDER BY total_revenue DESC
LIMIT 5;
![]()
客户消费排行
-- 业务_2:客户消费排行(分页示例)
SELECT
u.username,
u.city,
COUNT(o.order_id) as order_count,
SUM(o.total_amount) as total_spent,
ROUND(AVG(o.total_amount), 2) as avg_order_value,
MAX(o.order_date) as last_order_date
FROM users u
JOIN orders o ON u.user_id = o.user_id
WHERE o.status = 'completed'
GROUP BY u.user_id, u.username, u.city
ORDER BY total_spent DESC, order_count DESC
LIMIT 10 OFFSET 0; -- 第一页,每页10条
![]()
9. 时间函数
时间函数
-- 时间函数 - 日期时间处理
-- 原理:处理时间数据,进行时间计算和格式化
-- 场景:时间序列分析、周期性统计、业务时间逻辑
-- 业务:时间维度销售分析
SELECT
DATE_FORMAT(o.order_date, '%Y') as year,
DATE_FORMAT(o.order_date, '%m') as month,
DAYNAME(o.order_date) as day_of_week,
HOUR(o.order_date) as hour_of_day,
COUNT(*) as order_count,
SUM(o.total_amount) as total_revenue,
-- 计算订单间隔
AVG(TIMESTAMPDIFF(HOUR,
LAG(o.order_date) OVER (PARTITION BY o.user_id ORDER BY o.order_date),
o.order_date)) as avg_hours_between_orders
FROM orders o
WHERE o.status = 'completed'
GROUP BY DATE_FORMAT(o.order_date, '%Y'), DATE_FORMAT(o.order_date, '%m'),
DAYNAME(o.order_date), HOUR(o.order_date)
ORDER BY year, month, FIELD(day_of_week, 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday');
-- 业务_2:客户生命周期分析
SELECT
u.user_id,
u.username,
u.register_date,
MIN(o.order_date) as first_purchase_date,
MAX(o.order_date) as last_purchase_date,
DATEDIFF(MIN(o.order_date), u.register_date) as days_to_first_purchase,
DATEDIFF(MAX(o.order_date), MIN(o.order_date)) + 1 as customer_lifetime_days,
DATEDIFF(CURRENT_DATE, MAX(o.order_date)) as days_since_last_order,
COUNT(o.order_id) as total_orders,
-- 客户状态判断
CASE
WHEN DATEDIFF(CURRENT_DATE, MAX(o.order_date)) <= 30 THEN 'Active'
WHEN DATEDIFF(CURRENT_DATE, MAX(o.order_date)) <= 90 THEN 'At Risk'
ELSE 'Churned'
END as customer_status
FROM users u
LEFT JOIN orders o ON u.user_id = o.user_id AND o.status = 'completed'
GROUP BY u.user_id, u.username, u.register_date
HAVING COUNT(o.order_id) > 0
ORDER BY last_purchase_date DESC;
![]()
10. 字符串函数
字符串函数
-- 字符串函数 - 文本处理
-- 原理:处理字符串数据,进行拼接、截取、格式化
-- 场景:数据清洗、格式化、文本分析
-- 业务:用户和产品信息格式化
SELECT
u.user_id,
-- 字符串拼接
CONCAT(u.username, ' (', u.city, ')') as user_info,
-- 邮箱处理
SUBSTRING_INDEX(u.email, '@', 1) as email_username,
SUBSTRING_INDEX(u.email, '@', -1) as email_domain,
-- 用户名首字母大写
CONCAT(UPPER(LEFT(u.username, 1)), LOWER(SUBSTRING(u.username, 2))) as formatted_name,
-- 城市长度
LENGTH(u.city) as city_length,
-- 年龄段描述
CASE
WHEN u.age < 25 THEN 'Young (Under 25)'
WHEN u.age <= 35 THEN CONCAT('Adult (', u.age, ' years old)')
ELSE 'Senior (Over 35)'
END as age_description,
-- 注册时间格式化
DATE_FORMAT(u.register_date, '%Y年%m月%d日') as register_date_chinese,
-- 购买的产品类别(去重并排序)
GROUP_CONCAT(DISTINCT p.category ORDER BY p.category SEPARATOR ', ') as purchased_categories,
-- 订单摘要
CONCAT(COUNT(DISTINCT o.order_id), ' orders, Total: ¥',
FORMAT(COALESCE(SUM(o.total_amount), 0), 2)) as order_summary
FROM users u
LEFT JOIN orders o ON u.user_id = o.user_id AND o.status = 'completed'
LEFT JOIN order_items oi ON o.order_id = oi.order_id
LEFT JOIN products p ON oi.product_id = p.product_id
GROUP BY u.user_id, u.username, u.email, u.city, u.age, u.register_date
ORDER BY u.user_id;
![]()