BI分析-计算核心

基于MySQL的BI核心

使用MySQL作为数据仓库及计算核心

源数据表

假设简单源数据表为:

create table 8_2
(
	a int(2) 	null comment "维度1",
	b int(2)	null comment "维度2",
	c int(2) 	null comment "维度3",
	d int(2)	null comment "维度4",
	e int(2) 	null comment "维度5",
	f int(2)	null comment "维度6",
	g int(2) 	null comment "维度7",
	h int(2) 	null comment "维度8"
);

目标表

假设简单目标数据表为:

create table 8_3
(
	a int(2)	null comment "维度1",
	b int(2)	null comment "维度2",
	c int(2)	null comment "维度3",
	d int(2)	null comment "维度4",
	e int(2) 	null comment "维度5",
	f int(2)	null comment "维度6",
	g int(2) 	null comment "维度7",
	h int(2)	null comment "维度8",
	i int 		null comment "指标"
)

深度汇总:with cube函数

在oracle、postgre数据库或者hive数仓工具都有cube函数,该函数就可以完成对数据深度汇总工作

SELECT a, b, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY a,b,c WITH CUBE

等价于

SELECT a, b, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY a,b,c
UNION ALL
SELECT a, b, null, count(1) FROM 8_2 WHERE 1=1 GROUP BY a,b
UNION ALL
SELECT a, null, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY a,c
UNION ALL
SELECT null, b, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY b,c
UNION ALL
SELECT a, null, null, count(1) FROM 8_2 WHERE 1=1 GROUP BY a
UNION ALL
SELECT null, b, null, count(1) FROM 8_2 WHERE 1=1 GROUP BY b
UNION ALL
SELECT NULL, null, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY c
UNION ALL
SELECT null, null, null, count(1) FROM 8_2 WHERE 1=1 

所以group by a,b,c with cube过程为:(a, b, c), (a, b), (a, c), (a), (b), (c), ()

但是mysql不支持cube函数,现在有两种解决方案:

1:mysql支持with rollup函数

SELECT a, b, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY a,b,c WITH ROLLUP;

等价于

SELECT a, b, c, count(1) from 8_2 WHERE 1=1 GROUP BY a,b,c
UNION ALL
SELECT a, b, null, count(1) from 8_2 WHERE 1=1 GROUP BY a,b
UNION ALL
SELECT a, null, null, count(1) from 8_2 WHERE 1=1 GROUP BY a
UNION ALL
SELECT null, null, null, count(1) from 8_2 WHERE 1=1 

所以group by a,b,c with rollup过程为:(a, b, c), (a, b), (a), ()
所以rollup是cube的子集
所以cube(a,b,c) = rollup(a, b, c) union rollup(a, c, b) union rollup(b, a, c)union rollup(b, c, a)union rollup(c, a, b)union rollup(c, b, a)

两个实现对比:
使用with rollup计算次数为N!
实现with cube计算次数为2^N
所以在分析维度大于4个时,实现with cube的计算压力更小

示例代码

简单源数据表简单目标表为例

组合算法类:

package com.ylc.spring_test;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

public class CombineAlgorithm {

    /**
     *存放结果的二维数组
     */
    public Object[][] obj;
    /**
     *src数组的长度
     */
    private int m;
    /**
     *需要获取N个数
     */
    private int n;
    /**
     *临时变量,obj的行数
     */
    private int objLineIndex;

    public CombineAlgorithm(Object[] src, int getNum) throws Exception {
        if (src == null) {
            throw new Exception("原数组为空.");
        }
        if (src.length < getNum) {
            throw new Exception("要取的数据比原数组个数还 大.");
        }
        m = src.length;
        n = getNum;
        /* 初始化 */
        objLineIndex = 0;
        obj = new Object[combination(m, n)][n];
        Object[] tmp = new Object[n];
        combine(src, 0, 0, n, tmp);
    }

    public static List<String> combineAlgorithm(List<String> criteriaArr){
        if(criteriaArr==null||criteriaArr.size()==0){
            return Collections.emptyList();
        }
        ArrayList<String> combine = new ArrayList<>();
        for(int i=0;i<criteriaArr.size();i++){
            CombineAlgorithm ca = null;
            try {
                ca=new CombineAlgorithm(criteriaArr.toArray(),i+1);
            } catch(Exception e){
                e.printStackTrace();
            }
            assert ca != null;
            Object[][] c=ca.getResult();
            for(Object[]objects:c){
                combine.add(Arrays.stream(objects).map(String::valueOf).collect(Collectors.joining(",")));
            }
        }
        return combine;
    }

    /**
     *<p>
     *计算C(m,n)个数=(m!)/(n!*(m-n)!)
     * </p>
     *从M个数中选N个数,函数返回有多少种选法参数m必须大于等于nm=0;n=0;retuan1;
     */
    public int combination(int m, int n) {
        if (m < n) {
            return 0;//如果总数小于取出的数,直接返回
        }
        int k = 1;
        int j = 1;
        //该种算法约掉了分母的(m-n)!,这样分子相乘的个数就是有n个了
        for (int i = n; i >= 1; i--) {
            k = k * m;
            j = j * n;
            m--;
            n--;
        }
        return k / j;
    }


    /**
     *<p>
     *递归算法,把结果写到obj二维数组对象
     *</p>
     */
    private void combine(Object[]src,int srcIndex,int i,int n,Object[]tmp) {
        int j;
        for (j = srcIndex; j < src.length - (n - 1); j++) {
            tmp[i] = src[j];
            if (n == 1) {
                System.arraycopy(tmp, 0, obj[objLineIndex], 0, tmp.length);
                objLineIndex++;
            } else {
                n--;
                i++;
                combine(src, j + 1, i, n, tmp);
                n++;
                i--;
            }
        }
    }
    public Object[][] getResult() {
        return obj;
    }

    public interface ICombineAlgorithm{
        String getCombineDim();
    }
}

组合算法工具类:

package com.ylc.spring_test;

import java.util.ArrayList;
import java.util.List;

public class CombineAlgorithmUtil {
    public static List<String> criteriaArr(CombineAlgorithm.ICombineAlgorithm[] values) {
        List < String > arr = new ArrayList<>();
        for (CombineAlgorithm.ICombineAlgorithm value : values) {
            arr.add(value.getCombineDim());
        }
        return arr;
    }

    public static String getDimSql(List<String>dims,CombineAlgorithm.ICombineAlgorithm[]values) {
        StringBuilder sql = new StringBuilder();
        for (CombineAlgorithm.ICombineAlgorithm value : values) {
            if (dims.contains(value.getCombineDim())) {
                sql.append(value.getCombineDim());
            } else {
                sql.append("-1");
            }
            sql.append(" ").append("AS").append(" ").append(value.getCombineDim()).append(",");
        }
        return sql.toString();
    }
}

维度枚举类:

package com.ylc.spring_test;

import lombok.AllArgsConstructor;
import lombok.Getter;

import java.util.stream.Stream;

@Getter
@AllArgsConstructor
public enum TestEnum implements CombineAlgorithm.ICombineAlgorithm{
    A("`a`"),
    B("`b`"),
    C("`c`"),
    D("`d`"),
    E("`e`"),
    F("`f`"),
    G("`g`"),
    H("`h`")
    ;

    private final String combineDim;
}

Sql语句拼装:

package com.ylc.spring_test;

import java.util.Arrays;
import java.util.List;

public class Test {
    private static final String COUNT_SQL_T=
            "INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT ${dim_sql} count(1) FROM 8_2 where 1=1 GROUP BY ${group_sql}";
    private static final String REPLACE_DIM_SQL="${dim_sql}";
    private static final String REPLACE_GROUP_SQL="${group_sql}";
    public static void main(String[] args) {
        dsStatistics1(COUNT_SQL_T, TestEnum.values());
    }

    protected static void dsStatistics1(String countSql, CombineAlgorithm.ICombineAlgorithm[] enumEntities) {
        statistics(countSql,enumEntities);
    }

    private static void statistics(String countSql, CombineAlgorithm. ICombineAlgorithm[] enumEntities) {
        List<String> listGroupBy = CombineAlgorithm.combineAlgorithm(CombineAlgorithmUtil.criteriaArr(enumEntities));
        for (String sql : listGroupBy) {
            String dimSql = CombineAlgorithmUtil.getDimSql(Arrays.asList(sql.split(",")), enumEntities);
            String s = countSql.replace(REPLACE_DIM_SQL, dimSql);
            s = s.replace(REPLACE_GROUP_SQL, sql);
            System.out.println(s + ";");
        }
    }
}

执行效果如下(部分sql):

INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT `a` AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `a`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,`b` AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `b`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,`c` AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `c`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,`d` AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `d`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,`e` AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `e`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,`f` AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `f`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,`g` AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `g`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,`h` AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `h`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT `a` AS `a`,`b` AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `a`,`b`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT `a` AS `a`,-1 AS `b`,`c` AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `a`,`c`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT `a` AS `a`,-1 AS `b`,-1 AS `c`,`d` AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `a`,`d`;
...

度量数据指数级膨胀解释

源数据经深度汇总后会指数级膨胀,以日常业务为例,不到10W条的元数据,胜读汇总后,数据量达到3000W+,很难给同事,特别是老板解释

解释

这是一个高中数学问题

简单场景

以简单源数据表为例,假设8个维度(a, b, c, d, e, f, g, h),每个维度2个值(0, 1),理想且极端的情况下会产生\(2^8 = 256\)条源数据,

如下,例:

INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,0,0,0);
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,0,0,1);
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,0,1,0);
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,0,1,1);
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,1,0,0);
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,1,0,1);
...

需要将所有维度整体打包聚合,需要执行:

\[C_8^0 + C_8^1 +C_8^2 +C_8^3 +C_8^4 +C_8^5 +C_8^6 +C_8^7 +C_8^8 \\ = 1+8+28+56+70+56+28+8+1=256(条sql) \]

如下,例:

INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT `a` AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `a`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,`b` AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `b`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,`c` AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `c`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,`d` AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `d`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,`e` AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `e`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,`f` AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `f`;
...

会产生:

\[1*2^0+8*2^1+28*2^2+56*2^3+70*2^4+56*2^5+28*2^6+8*2^7+1*2^8\\ =1+16+112+448+1120+1792+1792+1024+256=6561(条集合结果数据) \]

增加维度

假设源数据表9个维度(a, b, c, d, e, f, g, h, i),每个维度2个值(0, 1),理想且极端的情况下会产生\(2^9 = 512\)条源数据(sql略)

假设目标数据表,9个维度(a, b, c, d, e, f, g, h, i),一个指标(j)

需要将所有维度整体打包聚合,需要执行:

\[C_9^0+C_9^1+C_9^2+C_9^3+C_9^4+C_9^5+C_9^6+C_9^7+C_9^8+C_9^9\\ =1+9+36+84+126+126+84+36+9+1 = 512(条sql,sql略) \]

会产生:

\[1*2^0+9*2^1+36*2^2+84*2^3+126*2^4+126*2^5+84*2^6+36*2^7+9*2^8+1*2^9\\ =1+18+144+672+2016+4032+5376+4608+2304+512=19638(条聚合数据) \]

增加维度值

假设源数据表9个维度(a, b, c, d, e, f, g, h, i),每个维度3个值(0, 1, 2),理想且极端的情况下会产生\(3^9 = 19638\)条源数据(sql略)

假设目标数据表,9个维度(a, b, c, d, e, f, g, h, i),一个指标(j)

需要将所有维度整体打包聚合,需要执行:

\[C_9^0+C_9^1+C_9^2+C_9^3+C_9^4+C_9^5+C_9^6+C_9^7+C_9^8+C_9^9\\ =1+9+36+84+126+126+84+36+9+1 = 512(条sql,sql略) \]

会产生:

\[1*3^0+9*3^1+36*3^2+84*3^3+126*3^4+126*3^5+84*3^6+36*3^7+9*3^8+1*3^9\\ =1+27+324+2268+10206+30618+61236+78732+59049+19683 =262144(条聚合数据) \]

总结

实际场景下,10+个维度,每个维度10+个属性是很正常的,所以统计后数据汇大量膨胀

posted @ 2023-06-29 17:56  ylc0x01  阅读(43)  评论(0)    收藏  举报