关于python调用Beeline客户端来执行任务的想法
公司用上了华为云,不过有个糟糕的场景是,这东西是内网的,环境和互联网隔离,按着官网的文档,试了下~还是没有配置好开发环境. 不过庆幸的是,华为云的大数据客户端是安装成功的。所以,那就基于Beeline来完成一些简单的任务吧。
import subprocess
import logging
import re
from typing import List, Dict, Any, Optional
def execute_hive_sql(
sql: str,
beeline_path: str = "beeline",
jdbc_url: str = "jdbc:hive2://localhost:10000/default",
user: Optional[str] = None,
password: Optional[str] = None,
additional_args: List[str] = None,
log_level: int = logging.INFO
) -> Dict[str, Any]:
"""
使用Beeline客户端执行Hive SQL并捕获执行日志
Args:
sql: 要执行的SQL语句
beeline_path: Beeline客户端路径
jdbc_url: Hive JDBC连接URL
user: 数据库用户名
password: 数据库密码
additional_args: Beeline额外参数
log_level: 日志级别
Returns:
包含执行结果的字典,包含以下键:
- success: 执行是否成功
- exit_code: 进程退出码
- output: 标准输出内容
- error: 错误输出内容
- execution_time: 执行时间(秒)
- rows_affected: 受影响的行数(如果可用)
"""
# 配置日志
logger = logging.getLogger("hive_executor")
logger.setLevel(log_level)
if not logger.handlers:
ch = logging.StreamHandler()
ch.setLevel(log_level)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
# 构建beeline命令
cmd = [beeline_path, "-u", jdbc_url]
if user:
cmd.extend(["-n", user])
if password:
cmd.extend(["-p", password])
if additional_args:
cmd.extend(additional_args)
# 添加执行SQL的选项
cmd.extend(["-e", sql])
logger.info(f"执行Beeline命令: {' '.join(cmd)}")
try:
# 执行命令并捕获输出
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True
)
# 实时捕获输出
stdout_lines = []
stderr_lines = []
for line in iter(process.stdout.readline, ''):
line = line.strip()
stdout_lines.append(line)
logger.info(line)
for line in iter(process.stderr.readline, ''):
line = line.strip()
stderr_lines.append(line)
logger.error(line)
# 等待进程完成并获取返回码
exit_code = process.wait()
# 解析执行结果
stdout = "\n".join(stdout_lines)
stderr = "\n".join(stderr_lines)
# 尝试提取执行时间
execution_time = None
time_match = re.search(r"Time taken: (\d+\.\d+) seconds", stdout)
if time_match:
execution_time = float(time_match.group(1))
# 尝试提取受影响的行数
rows_affected = None
rows_match = re.search(r"(\d+) rows affected", stdout)
if rows_match:
rows_affected = int(rows_match.group(1))
# 判断执行是否成功
success = exit_code == 0 and not any(
error_keyword in stderr.lower()
for error_keyword in ["error", "exception", "failed"]
)
return {
"success": success,
"exit_code": exit_code,
"output": stdout,
"error": stderr,
"execution_time": execution_time,
"rows_affected": rows_affected
}
except Exception as e:
logger.exception(f"执行Hive SQL时发生错误: {str(e)}")
return {
"success": False,
"exit_code": -1,
"output": "",
"error": f"执行命令时发生异常: {str(e)}",
"execution_time": None,
"rows_affected": None
}
# 使用示例
if __name__ == "__main__":
# 示例1: 执行简单的SELECT语句
result = execute_hive_sql("SELECT * FROM employees LIMIT 10")
print(f"执行状态: {'成功' if result['success'] else '失败'}")
print(f"执行时间: {result['execution_time']}秒")
# 示例2: 执行DDL语句
ddl_result = execute_hive_sql("CREATE TABLE IF NOT EXISTS test_table (id INT, name STRING)")
print(f"DDL执行状态: {'成功' if ddl_result['success'] else '失败'}")

浙公网安备 33010602011771号