以下是结合 MySQL(OLTP) 和 ClickHouse(OLAP) 的 Great Expectations 完整落地流程,针对商超线上 / 线下场景设计,包含环境搭建、数据生成、质量验证及结果分析:

1. 环境准备与安装

1.1. 环境准备

image

1.2. 安装依赖工具

# 创建虚拟环境
python3 -m venv ge_env
source ge_env/bin/activate  # Linux/macOS
# 或 ge_env\Scripts\activate  # Windows

# 安装核心依赖(使用清华数据源)
pip3 install urllib3==1.26.6 cryptography==38.0.4 great_expectations==0.17.19 -i https://pypi.tuna.tsinghua.edu.cn/simple      # Great Expectations本体
pip3 install mysql-connector-python  -i https://pypi.tuna.tsinghua.edu.cn/simple  # 连接MySQL
pip3 install clickhouse-connect  -i https://pypi.tuna.tsinghua.edu.cn/simple      # 连接ClickHouse
pip3 install pandas faker  -i https://pypi.tuna.tsinghua.edu.cn/simple            # 数据生成工具
pip3 install "sqlalchemy==1.4.46" -i https://pypi.tuna.tsinghua.edu.cn/simple # sqlalchemy 依赖 ,不能安装2.0以上版本

2. 步骤 1:创建 MySQL 和 ClickHouse 测试表

2.1. MySQL(OLTP)表设计(线上订单表)

-- 连接MySQL(示例命令)
mysql -u root -p

-- 创建数据库
CREATE DATABASE supermarket_oltp;
USE supermarket_oltp;

-- 线上订单表(线上环境)
CREATE TABLE online_orders (
  order_id INT PRIMARY KEY AUTO_INCREMENT,
  user_id INT NOT NULL,
  product_id VARCHAR(20) NOT NULL,
  quantity INT NOT NULL,
  unit_price DECIMAL(10,2) NOT NULL,
  total_amount DECIMAL(10,2) NOT NULL,
  payment_status ENUM('unpaid', 'paid', 'refunded') NOT NULL,
  order_time DATETIME NOT NULL,
  delivery_addr VARCHAR(255)
);

2.2. ClickHouse(OLAP)表设计(线下销售汇总表)

-- 连接ClickHouse(示例命令)
clickhouse-client --host localhost --port 9000

-- 创建数据库
CREATE DATABASE IF NOT EXISTS supermarket_olap;
USE supermarket_olap;

-- 线下销售汇总表(线下环境,按日分区)
CREATE TABLE offline_sales (
  sale_date Date NOT NULL,
  store_id INT NOT NULL,
  product_id VARCHAR(20) NOT NULL,
  total_quantity INT NOT NULL,
  total_revenue DECIMAL(15,2) NOT NULL,
  avg_price DECIMAL(10,2) NOT NULL,
  sale_count INT NOT NULL  -- 当日销售笔数
) ENGINE = MergeTree()
PARTITION BY sale_date
ORDER BY (store_id, product_id);

3. 步骤 2:Python 脚本生成测试数据

创建 generate_test_data.py,生成含正常数据和异常数据的测试集:
 
生成商超测试数据的Python脚本
import mysql.connector
import clickhouse_connect
from faker import Faker
import random
from datetime import datetime, timedelta

# 初始化Faker
fake = Faker('zh_CN')

# 1. 连接MySQL
mysql_conn = mysql.connector.connect(
    host="localhost",
    user="root",
    password="your_mysql_password",  # 替换为你的密码
    database="supermarket_oltp"
)
mysql_cursor = mysql_conn.cursor()

# 2. 连接ClickHouse
ck_client = clickhouse_connect.get_client(
    host="localhost",
    port=8123,
    username="default",
    password=""  # 默认无密码,根据实际情况修改
)

# 生成线上订单数据(MySQL)
def generate_online_orders(count=100):
    orders = []
    for _ in range(count):
        # 正常数据
        user_id = random.randint(1000, 9999)
        product_id = f"SP-{random.randint(1000, 9999)}"
        quantity = random.randint(1, 10)
        unit_price = round(random.uniform(10, 500), 2)
        total_amount = round(quantity * unit_price, 2)
        payment_status = random.choice(['unpaid', 'paid', 'refunded'])
        order_time = fake.date_time_between(start_date='-30d', end_date='now')
        delivery_addr = fake.address()
        
        # 随机插入异常数据(10%概率)
        if random.random() < 0.1:
            if random.random() < 0.5:
                total_amount = -total_amount  # 异常:金额为负
            else:
                payment_status = 'invalid'  # 异常:状态非法
        
        orders.append((
            user_id, product_id, quantity, unit_price,
            total_amount, payment_status, order_time, delivery_addr
        ))
    
    # 插入MySQL
    insert_sql = """
    INSERT INTO online_orders 
    (user_id, product_id, quantity, unit_price, total_amount, 
     payment_status, order_time, delivery_addr)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    mysql_cursor.executemany(insert_sql, orders)
    mysql_conn.commit()
    print(f"已插入 {count} 条线上订单数据到MySQL")

# 生成线下销售数据(ClickHouse)
def generate_offline_sales(count=50):
    sales = []
    start_date = datetime.now() - timedelta(days=30)
    
    for _ in range(count):
        # 正常数据
        sale_date = (start_date + timedelta(days=random.randint(0, 29))).date()
        store_id = random.randint(1, 20)  # 假设20家门店
        product_id = f"SP-{random.randint(1000, 9999)}"
        total_quantity = random.randint(5, 200)
        avg_price = round(random.uniform(20, 1000), 2)
        total_revenue = round(total_quantity * avg_price, 2)
        sale_count = random.randint(3, 50)  # 销售笔数
        
        # 随机插入异常数据(10%概率)
        if random.random() < 0.1:
            if random.random() < 0.5:
                total_quantity = -total_quantity  # 异常:销量为负
            else:
                product_id = f"INVALID-{random.randint(1, 99)}"  # 异常:商品ID格式错误
        
        sales.append({
            "sale_date": sale_date,
            "store_id": store_id,
            "product_id": product_id,
            "total_quantity": total_quantity,
            "total_revenue": total_revenue,
            "avg_price": avg_price,
            "sale_count": sale_count
        })
    
    # 插入ClickHouse
    ck_client.insert("supermarket_olap.offline_sales", sales)
    print(f"已插入 {count} 条线下销售数据到ClickHouse")

if __name__ == "__main__":
    generate_online_orders(100)  # 生成100条线上订单
    generate_offline_sales(50)   # 生成50条线下销售记录
    
    # 关闭连接
    mysql_cursor.close()
    mysql_conn.close()
    ck_client.close()
View Code

运行脚本生成数据

# 替换脚本中的数据库密码后执行
python generate_test_data.py

4. 步骤 3:使用 Great Expectations 验证数据质量

4.1. 初始化 Great Expectations 项目

mkdir ge_supermarket && cd ge_supermarket
great_expectations init  # 生成项目结构
[root@zb-yunweitest-mysql-204-200 ge_supermarket]# great_expectations  --version
great_expectations, version 0.17.19
[root@zb-yunweitest-mysql-204-200 ge_supermarket]# 
[root@zb-yunweitest-mysql-204-200 ge_supermarket]# tree gx
gx
├── checkpoints
├── expectations
├── great_expectations.yml
├── plugins
│   └── custom_data_docs
│       ├── renderers
│       ├── styles
│       │   └── data_docs_custom_styles.css
│       └── views
├── profilers
└── uncommitted
    ├── config_variables.yml
    ├── data_docs
    └── validations

11 directories, 3 files
[root@zb-yunweitest-mysql-204-200 ge_supermarket]# 

4.2. 配置数据源(MySQL 和 ClickHouse)

1)老版本Great Expectations 0.17.19
在 Great Expectations 0.17.19 版本中,datasources 目录默认不会自动创建(新版本才会拆分到独立目录),数据源配置需要直接写在 great_expectations.yml 主配置文件中。
[root@zb-yunweitest-mysql-204-200 gx]# cat great_expectations.yml
config_version: 3.0


datasources:
  mysql_online_orders:
    class_name: Datasource
    module_name: great_expectations.datasource
    execution_engine:
      class_name: SqlAlchemyExecutionEngine
      module_name: great_expectations.execution_engine
      connection_string: mysql+mysqlconnector://root:root@127.0.0.1:3306/supermarket_oltp
    data_connectors:
      default_runtime_data_connector:
        name: default_runtime_data_connector
        class_name: RuntimeDataConnector
        module_name: great_expectations.datasource.data_connector
        batch_identifiers:
          - default_identifier_name
      default_inferred_data_connector:  # 检查点中要引用的名称
        class_name: InferredAssetSqlDataConnector
        module_name: great_expectations.datasource.data_connector
        include_schema_name: false  # 不包含数据库名(因为连接串已指定库)
        introspection_directives:
          schema_name: supermarket_oltp  # 改为MySQL数据库名(即schema名) 
config_variables_file_path: uncommitted/config_variables.yml

# The plugins_directory will be added to your python path for custom modules
# used to override and extend Great Expectations.
plugins_directory: plugins/

stores:
# Stores are configurable places to store things like Expectations, Validations
# Data Docs, and more. These are for advanced users only - most users can simply
# leave this section alone.
#
# Three stores are required: expectations, validations, and
# evaluation_parameters, and must exist with a valid store entry. Additional
# stores can be configured for uses such as data_docs, etc.
  expectations_store:
    class_name: ExpectationsStore
    store_backend:
      class_name: TupleFilesystemStoreBackend
      base_directory: expectations/

  validations_store:
    class_name: ValidationsStore
    store_backend:
      class_name: TupleFilesystemStoreBackend
      base_directory: uncommitted/validations/

  evaluation_parameter_store:
    class_name: EvaluationParameterStore
  checkpoint_store:
    class_name: CheckpointStore
    store_backend:
      class_name: TupleFilesystemStoreBackend
      suppress_store_backend_id: true
      base_directory: checkpoints/

  profiler_store:
    class_name: ProfilerStore
    store_backend:
      class_name: TupleFilesystemStoreBackend
      suppress_store_backend_id: true
      base_directory: profilers/

expectations_store_name: expectations_store
validations_store_name: validations_store
evaluation_parameter_store_name: evaluation_parameter_store
checkpoint_store_name: checkpoint_store

data_docs_sites:
  # Data Docs make it simple to visualize data quality in your project. These
  # include Expectations, Validations & Profiles. The are built for all
  # Datasources from JSON artifacts in the local repo including validations &
  # profiles from the uncommitted directory. Read more at https://docs.greatexpectations.io/docs/terms/data_docs
  local_site:
    class_name: SiteBuilder
    show_how_to_buttons: true
    store_backend:
      class_name: TupleFilesystemStoreBackend
      base_directory: uncommitted/data_docs/local_site/
    site_index_builder:
      class_name: DefaultSiteIndexBuilder

anonymous_usage_statistics:
  data_context_id: aa5ab25d-4235-4e8d-ac07-144f9ab6466b
  enabled: true
fluent_datasources:
  default_pandas_datasource:
    type: pandas
notebooks:
include_rendered_content:
  globally: false
  expectation_suite: false
  expectation_validation_result: false
View Code
2)新版本创建 great_expectations/datasources/supermarket_datasources.yml
Great Expectations数据源配置
name: supermarket_datasources
class_name: Datasource
execution_engine:
  class_name: PandasExecutionEngine
data_connectors:
  # MySQL线上订单数据源
  mysql_online_orders:
    class_name: ConfiguredAssetSqlDataConnector
    credentials:
      drivername: mysql+mysqlconnector
      host: localhost
      port: 3306
      username: root
      password: your_mysql_password  # 替换为你的密码
      database: supermarket_oltp
    assets:
      online_orders:
        table_name: online_orders
        schema_name: supermarket_oltp

  # ClickHouse线下销售数据源
  ck_offline_sales:
    class_name: ConfiguredAssetSqlDataConnector
    credentials:
      drivername: clickhouse+clickhouse_connect
      host: localhost
      port: 8123
      username: default
      password: ""  # 替换为你的密码
      database: supermarket_olap
    assets:
      offline_sales:
        table_name: offline_sales
        schema_name: supermarket_olap
View Code

4.3. 创建期望套件(数据质量规则)

(1)线上订单数据规则(MySQL)
创建 great_expectations/expectations/online_orders_suite.json
线上订单数据质量规则
{
  "expectation_suite_name": "online_orders_suite",
  "data_asset_type": "Dataset",
  "expectations": [
    {
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {"column": "order_id"}  
    },
    {
      "expectation_type": "expect_column_values_to_match_regex",
      "kwargs": {
        "column": "product_id",
        "regex": "^SP-\\d{4}$"  
      }
    },
    {
      "expectation_type": "expect_column_values_to_be_in_set",
      "kwargs": {
        "column": "payment_status",
        "value_set": ["unpaid", "paid", "refunded"] 
      }
    }
  ]
}
View Code
(2)线下销售数据规则(ClickHouse)
创建 great_expectations/expectations/offline_sales_suite.json
线下销售数据质量规则
{
  "data_asset_type": "Dataset",
  "expectations": [
    {
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "sale_date"
      }
    },
    {
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "store_id",
        "min_value": 1,
        "max_value": 20
      }
    },
    {
      "expectation_type": "expect_column_values_to_match_regex",
      "kwargs": {
        "column": "product_id",
        "regex": "^SP-\\d{4}$"
      }
    },
    {
      "expectation_type": "expect_column_values_to_be_greater_than",
      "kwargs": {
        "column": "total_quantity",
        "value": 0
      }
    },
    {
      "expectation_type": "expect_column_values_to_be_greater_than",
      "kwargs": {
        "column": "total_revenue",
        "value": 0
      }
    }
  ],
  "meta": {
    "great_expectations_version": "0.18.11"
  },
  "expectation_suite_name": "offline_sales_suite"
}
View Code

4.4 执行数据质量验证-命令行

[root@zb-yunweitest-mysql-204-200 gx]# great_expectations checkpoint run supermarket_checkpoint
Calculating Metrics: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 93.19it/s]
Validation failed!

Suite Name                                   Status     Expectations met
- online_orders_suite                        ✖ Failed   1 of 3 (33.33 %)

生成HTML文件

[root@zb-yunweitest-mysql-204-200 gx]# great_expectations docs build

The following Data Docs sites will be built:

 - local_site: file:///data/ge_supermarket/gx/uncommitted/data_docs/local_site/index.html

Would you like to proceed? [Y/n]: Y

Building Data Docs...

Done building Data Docs

[root@zb-yunweitest-mysql-204-200 gx]# cd /data/ge_supermarket/gx/uncommitted/data_docs/local_site/
[root@zb-yunweitest-mysql-204-200 local_site]# ll
total 40K
drwxr-xr-x 5 root root  77 Aug 14 15:12 .
drwxr-xr-x 3 root root  24 Aug 14 15:12 ..
drwxr-xr-x 2 root root  38 Aug 14 15:12 expectations
-rw-r--r-- 1 root root 37K Aug 14 15:14 index.html
drwxr-xr-x 5 root root  47 Aug 14 15:12 static
drwxr-xr-x 3 root root  33 Aug 14 15:12 validations
[root@zb-yunweitest-mysql-204-200 local_site]# 

4.4. 执行数据质量验证-Python执行

创建 run_validation.py 脚本:
执行数据质量验证的脚本
from great_expectations.data_context import FileDataContext

# 初始化数据上下文
context = FileDataContext.create(project_root_dir=".")

# 验证MySQL线上订单数据
def validate_online_orders():
    batch_request = context.get_batch_request(
        datasource_name="supermarket_datasources",
        data_connector_name="mysql_online_orders",
        data_asset_name="online_orders"
    )
    
    # 执行验证
    results = context.run_validation_operator(
        "action_list_operator",
        assets_to_validate=[batch_request],
        expectation_suite_name="online_orders_suite"
    )
    
    print("线上订单数据验证完成")
    return results

# 验证ClickHouse线下销售数据
def validate_offline_sales():
    batch_request = context.get_batch_request(
        datasource_name="supermarket_datasources",
        data_connector_name="ck_offline_sales",
        data_asset_name="offline_sales"
    )
    
    # 执行验证
    results = context.run_validation_operator(
        "action_list_operator",
        assets_to_validate=[batch_request],
        expectation_suite_name="offline_sales_suite"
    )
    
    print("线下销售数据验证完成")
    return results

if __name__ == "__main__":
    online_results = validate_online_orders()
    offline_results = validate_offline_sales()
    
    # 生成数据文档(HTML报告)
    context.build_data_docs()
    print("数据质量报告已生成:请查看 great_expectations/uncommitted/data_docs/local_site/")
View Code

运行验证脚本

python run_validation.py

5. 步骤 4:数据质量结果输出与分析

5.1. 查看 HTML 报告

验证完成后,打开自动生成的报告:
# Linux/macOS
open great_expectations/uncommitted/data_docs/local_site/index.html

# Windows
start great_expectations/uncommitted/data_docs/local_site/index.html

5.2. 典型结果分析(商超场景)

(1)线上订单数据问题
  • 常见异常:
    • 10% 的订单出现 total_amount < 0(金额为负),可能是支付系统计算错误。
    • 5% 的订单 payment_status 为 invalid,可能是订单状态流转逻辑漏洞。
    • 2% 的 product_id 格式错误(如 SP-123 位数不足),需检查商品编码生成规则。
(2)线下销售数据问题
  • 常见异常:
    • 8% 的记录 total_quantity < 0(销量为负),可能是退货未正确标记导致统计错误。
    • 6% 的 product_id 为 INVALID-XX,可能是 POS 系统扫码错误或手工录入失误。
    • 1% 的 store_id > 20,可能是新增门店未及时更新系统配置。

5.3. 商超场景改进建议

  • 线上系统:在订单提交接口增加金额校验和状态合法性检查,阻断异常数据写入。
  • 线下系统:优化 POS 机商品扫码逻辑,增加销量为负时的二次确认流程。
  • 数据 pipeline:在 MySQL 同步到 ClickHouse 的 ETL 过程中,增加异常数据过滤环节。

6. 总结

通过 Great Expectations 可实现商超线上 / 线下数据的自动化质量监控:

  1. MySQL(OLTP) 聚焦实时订单的合法性校验,确保交易数据准确。
  2. ClickHouse(OLAP) 侧重汇总数据的一致性检查,保障分析决策可靠。
  3. 结合自动生成的质量报告,可快速定位业务系统漏洞和数据流转问题,提升商超数据可信度。
 posted on 2025-08-13 14:50  xibuhaohao  阅读(21)  评论(0)    收藏  举报