Skip to content

聚合函数与 GROUP BY

聚合函数将多行数据汇总成一行,是数据分析的核心工具。结合 GROUP BY,可以做分组统计。


聚合函数一览

函数说明忽略 NULL
COUNT(*)计数(包含 NULL)不适用
COUNT(col)计数(忽略 NULL)✅ 忽略
COUNT(DISTINCT col)去重计数✅ 忽略
SUM(col)求和✅ 忽略
AVG(col)平均值✅ 忽略
MAX(col)最大值✅ 忽略
MIN(col)最小值✅ 忽略
GROUP_CONCAT(col)字符串拼接✅ 忽略
STD / STDDEV(col)标准差✅ 忽略
VARIANCE(col)方差✅ 忽略

COUNT 的三种写法

sql
-- COUNT(*):统计所有行(包括重复和 NULL)
SELECT COUNT(*) FROM orders;  -- 1000

-- COUNT(col):统计非 NULL 的行
SELECT COUNT(phone) FROM users;  -- 忽略 phone 为 NULL 的用户

-- COUNT(DISTINCT col):去重计数
SELECT COUNT(DISTINCT status) FROM orders;  -- 返回有多少种不同的状态

-- COUNT + DISTINCT 配合:组合使用
SELECT COUNT(DISTINCT user_id) AS unique_customers,
       COUNT(DISTINCT product_id) AS unique_products
FROM orders;

COUNT(*) vs COUNT(col)

sql
-- orders 表有 1000 行,其中 50 行的 remark 字段为 NULL
SELECT COUNT(*) FROM orders;       -- 1000(统计所有行)
SELECT COUNT(remark) FROM orders;   -- 950(忽略 NULL)
SELECT COUNT(DISTINCT remark) FROM orders;  -- 100(去重后的非 NULL 值)

SUM / AVG / MAX / MIN

sql
-- 统计订单总金额
SELECT SUM(amount) AS total_amount FROM orders;

-- 平均订单金额
SELECT AVG(amount) AS avg_amount FROM orders;

-- 最高和最低薪资
SELECT MAX(salary) AS max_salary, MIN(salary) AS min_salary FROM employees;

-- 组合使用
SELECT 
    COUNT(*) AS order_count,
    SUM(amount) AS total_amount,
    AVG(amount) AS avg_amount,
    MAX(amount) AS max_amount,
    MIN(amount) AS min_amount
FROM orders
WHERE status = 'completed';

聚合函数忽略 NULL

sql
-- 假设 amount 列有 3 行:100, NULL, 200
SELECT SUM(amount) FROM t;   -- 300(忽略 NULL)
SELECT AVG(amount) FROM t;   -- 150(忽略 NULL,只算2行)
SELECT COUNT(amount) FROM t;  -- 2(忽略 NULL)

-- 如果想把 NULL 当 0 处理:
SELECT SUM(IFNULL(amount, 0)) FROM t;   -- 300
SELECT AVG(IFNULL(amount, 0)) FROM t;   -- 100(NULL→0,共3行)

GROUP BY:分组

基本用法

sql
-- 按状态统计订单数
SELECT 
    status,
    COUNT(*) AS order_count
FROM orders
GROUP BY status;

-- 按月统计收入
SELECT 
    DATE_FORMAT(created_at, '%Y-%m') AS month,
    SUM(amount) AS monthly_revenue,
    COUNT(*) AS order_count
FROM orders
WHERE status = 'paid'
GROUP BY DATE_FORMAT(created_at, '%Y-%m')
ORDER BY month;

GROUP BY + 聚合函数组合

sql
-- 按用户统计消费
SELECT 
    user_id,
    COUNT(*) AS order_count,
    SUM(amount) AS total_spent,
    AVG(amount) AS avg_order_value,
    MAX(created_at) AS last_order_date
FROM orders
GROUP BY user_id
HAVING order_count >= 5  -- 过滤:至少下过 5 单的用户
ORDER BY total_spent DESC;

HAVING:过滤分组

HAVING 和 WHERE 的区别:

子句作用对象执行时机能用聚合函数
WHERE行(分组前过滤)先于 GROUP BY❌ 不能
HAVING分组(分组后过滤)后于 GROUP BY✅ 能
sql
-- 查消费总额超过 10000 的用户
SELECT 
    user_id,
    SUM(amount) AS total_spent
FROM orders
GROUP BY user_id
HAVING SUM(amount) > 10000
ORDER BY total_spent DESC;

-- WHERE 和 HAVING 配合
SELECT 
    status,
    COUNT(*) AS cnt,
    SUM(amount) AS total
FROM orders
WHERE created_at >= '2024-01-01'    -- 先过滤行
GROUP BY status
HAVING COUNT(*) > 100                 -- 再过滤分组
ORDER BY total DESC;

GROUP_CONCAT:字符串聚合

GROUP_CONCAT 将分组内的多行字符串合并成一行:

sql
-- 按部门列出所有员工
SELECT 
    department,
    GROUP_CONCAT(name ORDER BY name SEPARATOR ', ') AS employee_list
FROM employees
GROUP BY department;

-- 结果:
-- Engineering | Alice, Bob, Charlie
-- Sales       | David, Eve, Frank

-- SEPARATOR:分隔符(默认逗号)
SELECT 
    order_id,
    GROUP_CONCAT(product_id SEPARATOR '; ') AS products
FROM order_items
GROUP BY order_id;

-- DISTINCT:去重
SELECT 
    department,
    GROUP_CONCAT(DISTINCT LEFT(name, 1) ORDER BY LEFT(name, 1) SEPARATOR '') AS initials
FROM employees
GROUP BY department;

GROUP_CONCAT 的长度限制

sql
-- 查看默认长度
SHOW VARIABLES LIKE 'group_concat_max_len';  -- 默认 1024

-- 设置全局或会话级别长度
SET SESSION group_concat_max_len = 8192;

多级分组

ROLLUP:合计行

MySQL 不支持 CUBE,但支持 ROLLUP:

sql
-- 按部门和小组统计,再加合计行
SELECT 
    department,
    team,
    COUNT(*) AS headcount,
    SUM(salary) AS total_salary
FROM employees
GROUP BY department, team WITH ROLLUP;

-- 结果:
-- Engineering, Backend, 10, 800000    ← 各组
-- Engineering, NULL, 25, 2000000     ← Engineering 合计
-- NULL, NULL, 50, 5000000         ← 总计
-- 注意:WITH ROLLUP 会在分组层级添加 NULL 行

手动模拟 CUBE(交叉聚合)

sql
-- CUBE:所有分组组合的交叉聚合
-- MySQL 不直接支持,用 UNION ALL 模拟
SELECT department, team, SUM(salary) AS total
FROM employees GROUP BY department, team
UNION ALL
SELECT department, 'ALL' AS team, SUM(salary) AS total
FROM employees GROUP BY department
UNION ALL
SELECT 'ALL' AS department, team, SUM(salary) AS total
FROM employees GROUP BY team
UNION ALL
SELECT 'ALL', 'ALL', SUM(salary) FROM employees;

综合案例

场景一:用户留存分析

sql
-- 计算每月新增用户数和次月留存率
WITH monthly_users AS (
    SELECT 
        DATE_FORMAT(created_at, '%Y-%m') AS month,
        DATE_FORMAT(created_at, '%Y-%m') AS reg_month,
        id AS user_id
    FROM users
),
user_orders AS (
    SELECT 
        DATE_FORMAT(o.created_at, '%Y-%m') AS order_month,
        mu.reg_month,
        COUNT(DISTINCT mu.user_id) AS active_users
    FROM monthly_users mu
    JOIN orders o ON mu.user_id = o.user_id
    GROUP BY DATE_FORMAT(o.created_at, '%Y-%m'), mu.reg_month
)
SELECT 
    mu.month AS register_month,
    COUNT(DISTINCT mu.user_id) AS new_users,
    COUNT(DISTINCT CASE 
        WHEN DATE_FORMAT(o.created_at, '%Y-%m') > mu.month 
        THEN mu.user_id END) AS retained_users,
    COUNT(DISTINCT CASE 
        WHEN DATE_FORMAT(o.created_at, '%Y-%m') = DATE_FORMAT(DATE_ADD(STR_TO_DATE(CONCAT(mu.month, '-01'), INTERVAL 1 MONTH), '%Y-%m')
        THEN mu.user_id END) AS m1_retained
FROM monthly_users mu
LEFT JOIN orders o ON mu.user_id = o.user_id
GROUP BY mu.month
ORDER BY mu.month;

场景二:商品销售排行榜

sql
-- 按品类统计销售额 Top 3 商品
SELECT 
    category,
    product_name,
    total_sales
FROM (
    SELECT 
        p.category,
        p.name AS product_name,
        SUM(oi.quantity * oi.price) AS total_sales,
        ROW_NUMBER() OVER (
            PARTITION BY p.category 
            ORDER BY SUM(oi.quantity * oi.price) DESC
        ) AS rank_in_category
    FROM products p
    JOIN order_items oi ON p.id = oi.product_id
    JOIN orders o ON oi.order_id = o.id
    WHERE o.status = 'completed'
    GROUP BY p.category, p.id, p.name
) ranked
WHERE rank_in_category <= 3
ORDER BY category, total_sales DESC;

小结

聚合函数的核心要点:

要点说明
聚合函数忽略 NULLSUM/AVG/COUNT 都不统计 NULL 值
COUNT(*) vs COUNT(col)* 统计所有行,col 只统计非 NULL
HAVING vs WHEREWHERE 过滤行,HAVING 过滤分组
GROUP_CONCAT将分组内的字符串合并,支持去重和排序

记住:GROUP BY 是聚合的前提(除 COUNT(*) 外),聚合函数将多行变成一行——理解这一点,就理解了聚合的本质。

基于 VitePress 构建