BI分析-计算核心
基于MySQL的BI核心
使用MySQL作为数据仓库及计算核心
源数据表
假设简单源数据表为:
create table 8_2
(
a int(2) null comment "维度1",
b int(2) null comment "维度2",
c int(2) null comment "维度3",
d int(2) null comment "维度4",
e int(2) null comment "维度5",
f int(2) null comment "维度6",
g int(2) null comment "维度7",
h int(2) null comment "维度8"
);
目标表
假设简单目标数据表为:
create table 8_3
(
a int(2) null comment "维度1",
b int(2) null comment "维度2",
c int(2) null comment "维度3",
d int(2) null comment "维度4",
e int(2) null comment "维度5",
f int(2) null comment "维度6",
g int(2) null comment "维度7",
h int(2) null comment "维度8",
i int null comment "指标"
)
深度汇总:with cube函数
在oracle、postgre数据库或者hive数仓工具都有cube函数,该函数就可以完成对数据深度汇总工作
SELECT a, b, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY a,b,c WITH CUBE
等价于
SELECT a, b, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY a,b,c
UNION ALL
SELECT a, b, null, count(1) FROM 8_2 WHERE 1=1 GROUP BY a,b
UNION ALL
SELECT a, null, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY a,c
UNION ALL
SELECT null, b, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY b,c
UNION ALL
SELECT a, null, null, count(1) FROM 8_2 WHERE 1=1 GROUP BY a
UNION ALL
SELECT null, b, null, count(1) FROM 8_2 WHERE 1=1 GROUP BY b
UNION ALL
SELECT NULL, null, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY c
UNION ALL
SELECT null, null, null, count(1) FROM 8_2 WHERE 1=1
所以group by a,b,c with cube过程为:(a, b, c), (a, b), (a, c), (a), (b), (c), ()
但是mysql不支持cube函数,现在有两种解决方案:
1:mysql支持with rollup函数
SELECT a, b, c, count(1) FROM 8_2 WHERE 1=1 GROUP BY a,b,c WITH ROLLUP;
等价于
SELECT a, b, c, count(1) from 8_2 WHERE 1=1 GROUP BY a,b,c
UNION ALL
SELECT a, b, null, count(1) from 8_2 WHERE 1=1 GROUP BY a,b
UNION ALL
SELECT a, null, null, count(1) from 8_2 WHERE 1=1 GROUP BY a
UNION ALL
SELECT null, null, null, count(1) from 8_2 WHERE 1=1
所以group by a,b,c with rollup过程为:(a, b, c), (a, b), (a), ()
所以rollup是cube的子集
所以cube(a,b,c) = rollup(a, b, c) union rollup(a, c, b) union rollup(b, a, c)union rollup(b, c, a)union rollup(c, a, b)union rollup(c, b, a)
两个实现对比:
使用with rollup计算次数为N!
实现with cube计算次数为2^N
所以在分析维度大于4个时,实现with cube的计算压力更小
示例代码
以简单源数据表和简单目标表为例
组合算法类:
package com.ylc.spring_test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
public class CombineAlgorithm {
/**
*存放结果的二维数组
*/
public Object[][] obj;
/**
*src数组的长度
*/
private int m;
/**
*需要获取N个数
*/
private int n;
/**
*临时变量,obj的行数
*/
private int objLineIndex;
public CombineAlgorithm(Object[] src, int getNum) throws Exception {
if (src == null) {
throw new Exception("原数组为空.");
}
if (src.length < getNum) {
throw new Exception("要取的数据比原数组个数还 大.");
}
m = src.length;
n = getNum;
/* 初始化 */
objLineIndex = 0;
obj = new Object[combination(m, n)][n];
Object[] tmp = new Object[n];
combine(src, 0, 0, n, tmp);
}
public static List<String> combineAlgorithm(List<String> criteriaArr){
if(criteriaArr==null||criteriaArr.size()==0){
return Collections.emptyList();
}
ArrayList<String> combine = new ArrayList<>();
for(int i=0;i<criteriaArr.size();i++){
CombineAlgorithm ca = null;
try {
ca=new CombineAlgorithm(criteriaArr.toArray(),i+1);
} catch(Exception e){
e.printStackTrace();
}
assert ca != null;
Object[][] c=ca.getResult();
for(Object[]objects:c){
combine.add(Arrays.stream(objects).map(String::valueOf).collect(Collectors.joining(",")));
}
}
return combine;
}
/**
*<p>
*计算C(m,n)个数=(m!)/(n!*(m-n)!)
* </p>
*从M个数中选N个数,函数返回有多少种选法参数m必须大于等于nm=0;n=0;retuan1;
*/
public int combination(int m, int n) {
if (m < n) {
return 0;//如果总数小于取出的数,直接返回
}
int k = 1;
int j = 1;
//该种算法约掉了分母的(m-n)!,这样分子相乘的个数就是有n个了
for (int i = n; i >= 1; i--) {
k = k * m;
j = j * n;
m--;
n--;
}
return k / j;
}
/**
*<p>
*递归算法,把结果写到obj二维数组对象
*</p>
*/
private void combine(Object[]src,int srcIndex,int i,int n,Object[]tmp) {
int j;
for (j = srcIndex; j < src.length - (n - 1); j++) {
tmp[i] = src[j];
if (n == 1) {
System.arraycopy(tmp, 0, obj[objLineIndex], 0, tmp.length);
objLineIndex++;
} else {
n--;
i++;
combine(src, j + 1, i, n, tmp);
n++;
i--;
}
}
}
public Object[][] getResult() {
return obj;
}
public interface ICombineAlgorithm{
String getCombineDim();
}
}
组合算法工具类:
package com.ylc.spring_test;
import java.util.ArrayList;
import java.util.List;
public class CombineAlgorithmUtil {
public static List<String> criteriaArr(CombineAlgorithm.ICombineAlgorithm[] values) {
List < String > arr = new ArrayList<>();
for (CombineAlgorithm.ICombineAlgorithm value : values) {
arr.add(value.getCombineDim());
}
return arr;
}
public static String getDimSql(List<String>dims,CombineAlgorithm.ICombineAlgorithm[]values) {
StringBuilder sql = new StringBuilder();
for (CombineAlgorithm.ICombineAlgorithm value : values) {
if (dims.contains(value.getCombineDim())) {
sql.append(value.getCombineDim());
} else {
sql.append("-1");
}
sql.append(" ").append("AS").append(" ").append(value.getCombineDim()).append(",");
}
return sql.toString();
}
}
维度枚举类:
package com.ylc.spring_test;
import lombok.AllArgsConstructor;
import lombok.Getter;
import java.util.stream.Stream;
@Getter
@AllArgsConstructor
public enum TestEnum implements CombineAlgorithm.ICombineAlgorithm{
A("`a`"),
B("`b`"),
C("`c`"),
D("`d`"),
E("`e`"),
F("`f`"),
G("`g`"),
H("`h`")
;
private final String combineDim;
}
Sql语句拼装:
package com.ylc.spring_test;
import java.util.Arrays;
import java.util.List;
public class Test {
private static final String COUNT_SQL_T=
"INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT ${dim_sql} count(1) FROM 8_2 where 1=1 GROUP BY ${group_sql}";
private static final String REPLACE_DIM_SQL="${dim_sql}";
private static final String REPLACE_GROUP_SQL="${group_sql}";
public static void main(String[] args) {
dsStatistics1(COUNT_SQL_T, TestEnum.values());
}
protected static void dsStatistics1(String countSql, CombineAlgorithm.ICombineAlgorithm[] enumEntities) {
statistics(countSql,enumEntities);
}
private static void statistics(String countSql, CombineAlgorithm. ICombineAlgorithm[] enumEntities) {
List<String> listGroupBy = CombineAlgorithm.combineAlgorithm(CombineAlgorithmUtil.criteriaArr(enumEntities));
for (String sql : listGroupBy) {
String dimSql = CombineAlgorithmUtil.getDimSql(Arrays.asList(sql.split(",")), enumEntities);
String s = countSql.replace(REPLACE_DIM_SQL, dimSql);
s = s.replace(REPLACE_GROUP_SQL, sql);
System.out.println(s + ";");
}
}
}
执行效果如下(部分sql):
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT `a` AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `a`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,`b` AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `b`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,`c` AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `c`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,`d` AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `d`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,`e` AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `e`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,`f` AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `f`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,`g` AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `g`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,`h` AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `h`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT `a` AS `a`,`b` AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `a`,`b`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT `a` AS `a`,-1 AS `b`,`c` AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `a`,`c`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT `a` AS `a`,-1 AS `b`,-1 AS `c`,`d` AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `a`,`d`;
...
度量数据指数级膨胀解释
源数据经深度汇总后会指数级膨胀,以日常业务为例,不到10W条的元数据,胜读汇总后,数据量达到3000W+,很难给同事,特别是老板解释
解释
这是一个高中数学问题
简单场景
以简单源数据表为例,假设8个维度(a, b, c, d, e, f, g, h),每个维度2个值(0, 1),理想且极端的情况下会产生\(2^8 = 256\)条源数据,
如下,例:
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,0,0,0);
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,0,0,1);
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,0,1,0);
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,0,1,1);
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,1,0,0);
INSERT INTO 8_2 (a,b,c,d,e,f,g,h) VALUES (0,0,0,0,0,1,0,1);
...
需要将所有维度整体打包聚合,需要执行:
如下,例:
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT `a` AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `a`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,`b` AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `b`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,`c` AS `c`,-1 AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `c`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,`d` AS `d`,-1 AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `d`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,`e` AS `e`,-1 AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `e`;
INSERT INTO 8_3 (a,b,c,d,e,f,g,h,i) SELECT -1 AS `a`,-1 AS `b`,-1 AS `c`,-1 AS `d`,-1 AS `e`,`f` AS `f`,-1 AS `g`,-1 AS `h`, count(1) FROM 8_2 where 1=1 GROUP BY `f`;
...
会产生:
增加维度
假设源数据表9个维度(a, b, c, d, e, f, g, h, i),每个维度2个值(0, 1),理想且极端的情况下会产生\(2^9 = 512\)条源数据(sql略)
假设目标数据表,9个维度(a, b, c, d, e, f, g, h, i),一个指标(j)
需要将所有维度整体打包聚合,需要执行:
会产生:
增加维度值
假设源数据表9个维度(a, b, c, d, e, f, g, h, i),每个维度3个值(0, 1, 2),理想且极端的情况下会产生\(3^9 = 19638\)条源数据(sql略)
假设目标数据表,9个维度(a, b, c, d, e, f, g, h, i),一个指标(j)
需要将所有维度整体打包聚合,需要执行:
会产生:
总结
实际场景下,10+个维度,每个维度10+个属性是很正常的,所以统计后数据汇大量膨胀

浙公网安备 33010602011771号