Hive
Hive is a data warehouse built on top of HDFS for storing large volumes of data; it can also be understood as a SQL engine on top of MapReduce (MR).

A client submits a SQL statement (for example over HTTP). Hive's SQL engine first tokenizes the statement, then parses it into an abstract syntax tree (ASTNode), and a subclass of BaseSemanticAnalyzer then compiles the tree into executable tasks.
The main BaseSemanticAnalyzer subclasses are:
DDLSemanticAnalyzer handles DDL statements such as CREATE TABLE and DROP TABLE; the tasks it generates mainly operate on the metastore.
SemanticAnalyzer handles queries; the query's conditions are compiled into a series of Operators, which ultimately run inside the reducers.
Query analysis eventually produces a FetchWork; the FetchWork's input path depends on ExecDriver, which is what finally runs the MapReduce job, for example:
    FileInputFormat.setInputPaths(job, currPath);
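
The whole compile-and-run flow lives in org.apache.hadoop.hive.ql.Driver. As a rough sketch of the compile phase, condensed from Driver.run below (here conf, ctx, and command stand for the HiveConf, the Context, and the submitted SQL string):

    ParseDriver pd = new ParseDriver();
    ASTNode tree = pd.parse(command);                     // lex + parse into an AST
    while ((tree.getToken() == null) && (tree.getChildCount() > 0)) {
      tree = (ASTNode) tree.getChild(0);                  // drop empty wrapper nodes
    }
    // choose DDLSemanticAnalyzer, SemanticAnalyzer, ... based on the statement type
    BaseSemanticAnalyzer sem = SemanticAnalyzerFactory.get(conf, tree);
    sem.analyze(tree, ctx);                               // semantic analysis + plan generation
    List<Task<? extends Serializable>> rootTasks = sem.getRootTasks();  // executable task DAG

The full Driver source is listed below.
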
package org.apache.hadoop.hive.ql;
import java.io.DataInput;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.parse.ParseDriver;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.ParseException;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.SemanticAnalyzerFactory;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.history.HiveHistory;
import org.apache.hadoop.hive.ql.history.HiveHistory.Keys;
import org.apache.hadoop.hive.ql.plan.tableDesc;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class Driver implements CommandProcessor {

  static final private Log LOG = LogFactory.getLog("hive.ql.Driver");

  private int maxRows = 100;
  ByteStream.Output bos = new ByteStream.Output();

  private ParseDriver pd;
  private HiveConf conf;
  private DataInput resStream;
  private LogHelper console;
  private Context ctx;
  private BaseSemanticAnalyzer sem;
  public int countJobs(List<Task<? extends Serializable>> tasks) {
    return countJobs(tasks, new ArrayList<Task<? extends Serializable>>());
  }

  /**
   * Count the map-reduce jobs in the task DAG, visiting each task only once.
   */
  public int countJobs(List<Task<? extends Serializable>> tasks,
      List<Task<? extends Serializable>> seenTasks) {
    if (tasks == null) {
      return 0;
    }
    int jobs = 0;
    for (Task<? extends Serializable> task : tasks) {
      if (!seenTasks.contains(task)) {
        seenTasks.add(task);
        if (task.isMapRedTask()) {
          jobs++;
        }
        jobs += countJobs(task.getChildTasks(), seenTasks);
      }
    }
    return jobs;
  }
  /**
   * Return the Thrift DDL string of the result
   */
  public String getSchema() throws Exception {
    if (sem != null && sem.getFetchTask() != null) {
      if (!sem.getFetchTaskInit()) {
        sem.setFetchTaskInit(true);
        sem.getFetchTask().initialize(conf);
      }
      FetchTask ft = (FetchTask) sem.getFetchTask();
      tableDesc td = ft.getTblDesc();
      String tableName = "result";
      List<FieldSchema> lst = MetaStoreUtils.getFieldsFromDeserializer(
          tableName, td.getDeserializer());
      String schema = MetaStoreUtils.getDDLFromFieldSchema(tableName, lst);
      return schema;
    }
    return null;
  }
  /**
   * Return the maximum number of rows returned by getResults
   */
  public int getMaxRows() {
    return maxRows;
  }

  /**
   * Set the maximum number of rows returned by getResults
   */
  public void setMaxRows(int maxRows) {
    this.maxRows = maxRows;
  }
  public boolean hasReduceTasks(List<Task<? extends Serializable>> tasks) {
    if (tasks == null) {
      return false;
    }
    boolean hasReduce = false;
    for (Task<? extends Serializable> task : tasks) {
      if (task.hasReduce()) {
        return true;
      }
      hasReduce = (hasReduce || hasReduceTasks(task.getChildTasks()));
    }
    return hasReduce;
  }
  /**
   * for backwards compatibility with current tests
   */
  public Driver(HiveConf conf) {
    console = new LogHelper(LOG);
    this.conf = conf;
    ctx = new Context(conf);
  }

  public Driver() {
    console = new LogHelper(LOG);
    if (SessionState.get() != null) {
      conf = SessionState.get().getConf();
      ctx = new Context(conf);
    }
  }
  /**
   * Build a query id of the form <user>_<yyyyMMddHHmmss>.
   */
  private String makeQueryId() {
    GregorianCalendar gc = new GregorianCalendar();
    String userid = System.getProperty("user.name");
    return userid + "_" +
        String.format("%1$4d%2$02d%3$02d%4$02d%5$02d%6$02d", gc.get(Calendar.YEAR),
            gc.get(Calendar.MONTH) + 1,
            gc.get(Calendar.DAY_OF_MONTH),
            gc.get(Calendar.HOUR_OF_DAY),
            gc.get(Calendar.MINUTE), gc.get(Calendar.SECOND));
  }
  public int run(String command) {

    boolean noName = StringUtils.isEmpty(conf
        .getVar(HiveConf.ConfVars.HADOOPJOBNAME));
    int maxlen = conf.getIntVar(HiveConf.ConfVars.HIVEJOBNAMELENGTH);

    conf.setVar(HiveConf.ConfVars.HIVEQUERYSTRING, command);

    String queryId = makeQueryId();
    conf.setVar(HiveConf.ConfVars.HIVEQUERYID, queryId);

    try {
      TaskFactory.resetId();
      LOG.info("Starting command: " + command);

      ctx.clear();
      ctx.makeScratchDir();

      if (SessionState.get() != null)
        SessionState.get().getHiveHistory().startQuery(command,
            conf.getVar(HiveConf.ConfVars.HIVEQUERYID));

      resStream = null;

      // Parse the command into an abstract syntax tree
      pd = new ParseDriver();
      ASTNode tree = pd.parse(command);

      while ((tree.getToken() == null) && (tree.getChildCount() > 0)) {
        tree = (ASTNode) tree.getChild(0);
      }

      // Pick the analyzer for this statement type (DDL, query, ...)
      sem = SemanticAnalyzerFactory.get(conf, tree);

      // Do semantic analysis and plan generation
      sem.analyze(tree, ctx);
      LOG.info("Semantic Analysis Completed");

      int jobs = countJobs(sem.getRootTasks());
      if (jobs > 0) {
        console.printInfo("Total MapReduce jobs = " + jobs);
      }

      if (SessionState.get() != null) {
        SessionState.get().getHiveHistory().setQueryProperty(queryId,
            Keys.QUERY_NUM_TASKS, String.valueOf(jobs));
        SessionState.get().getHiveHistory().setIdToTableMap(sem.getIdToTableNameMap());
      }

      String jobname = Utilities.abbreviate(command, maxlen - 6);
      int curJobNo = 0;

      // A very simple runtime that keeps putting runnable tasks on a list;
      // when a job completes, it puts the children at the back of the list
      // while taking the job to run from the front of the list
      Queue<Task<? extends Serializable>> runnable =
          new LinkedList<Task<? extends Serializable>>();

      for (Task<? extends Serializable> rootTask : sem.getRootTasks()) {
        if (runnable.offer(rootTask) == false) {
          LOG.error("Could not insert the first task into the queue");
          return (1);
        }
      }

      while (runnable.peek() != null) {
        Task<? extends Serializable> tsk = runnable.remove();

        if (SessionState.get() != null)
          SessionState.get().getHiveHistory().startTask(queryId, tsk,
              tsk.getClass().getName());

        if (tsk.isMapRedTask()) {
          curJobNo++;
          if (noName) {
            conf.setVar(HiveConf.ConfVars.HADOOPJOBNAME, jobname + "(" + curJobNo
                + "/" + jobs + ")");
          }
        }

        tsk.initialize(conf);
        int exitVal = tsk.execute();

        if (SessionState.get() != null) {
          SessionState.get().getHiveHistory().setTaskProperty(queryId, tsk.getId(),
              Keys.TASK_RET_CODE, String.valueOf(exitVal));
          SessionState.get().getHiveHistory().endTask(queryId, tsk);
        }

        if (exitVal != 0) {
          console.printError("FAILED: Execution Error, return code " + exitVal
              + " from " + tsk.getClass().getName());
          return 9;
        }
        tsk.setDone();

        if (tsk.getChildTasks() == null) {
          continue;
        }

        for (Task<? extends Serializable> child : tsk.getChildTasks()) {
          // Check if the child is runnable
          if (!child.isRunnable()) {
            continue;
          }

          if (runnable.offer(child) == false) {
            LOG.error("Could not add child task to queue");
          }
        }
      }

      if (SessionState.get() != null) {
        SessionState.get().getHiveHistory().setQueryProperty(queryId,
            Keys.QUERY_RET_CODE, String.valueOf(0));
        SessionState.get().getHiveHistory().printRowCount(queryId);
      }
    } catch (SemanticException e) {
      if (SessionState.get() != null)
        SessionState.get().getHiveHistory().setQueryProperty(queryId,
            Keys.QUERY_RET_CODE, String.valueOf(10));
      console.printError("FAILED: Error in semantic analysis: "
          + e.getMessage(), "\n"
          + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return (10);
    } catch (ParseException e) {
      if (SessionState.get() != null)
        SessionState.get().getHiveHistory().setQueryProperty(queryId,
            Keys.QUERY_RET_CODE, String.valueOf(11));
      console.printError("FAILED: Parse Error: " + e.getMessage(), "\n"
          + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return (11);
    } catch (Exception e) {
      if (SessionState.get() != null)
        SessionState.get().getHiveHistory().setQueryProperty(queryId,
            Keys.QUERY_RET_CODE, String.valueOf(12));
      // Has to use full name to make sure it does not conflict with
      // org.apache.commons.lang.StringUtils
      console.printError("FAILED: Unknown exception : " + e.getMessage(), "\n"
          + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return (12);
    } finally {
      if (SessionState.get() != null)
        SessionState.get().getHiveHistory().endQuery(queryId);
      if (noName) {
        conf.setVar(HiveConf.ConfVars.HADOOPJOBNAME, "");
      }
    }

    console.printInfo("OK");
    return (0);
  }
  public boolean getResults(Vector<String> res) {
    if (sem != null && sem.getFetchTask() != null) {
      if (!sem.getFetchTaskInit()) {
        sem.setFetchTaskInit(true);
        sem.getFetchTask().initialize(conf);
      }
      FetchTask ft = (FetchTask) sem.getFetchTask();
      ft.setMaxRows(maxRows);
      return ft.fetch(res);
    }

    if (resStream == null)
      resStream = ctx.getStream();
    if (resStream == null)
      return false;

    int numRows = 0;
    String row = null;

    while (numRows < maxRows) {
      if (resStream == null) {
        if (numRows > 0)
          return true;
        else
          return false;
      }

      bos.reset();
      Utilities.streamStatus ss;
      try {
        ss = Utilities.readColumn(resStream, bos);
        if (bos.getCount() > 0)
          row = new String(bos.getData(), 0, bos.getCount(), "UTF-8");
        else if (ss == Utilities.streamStatus.TERMINATED)
          row = new String();

        if (row != null) {
          numRows++;
          res.add(row);
        }
      } catch (IOException e) {
        console.printError("FAILED: Unexpected IO exception : "
            + e.getMessage());
        res = null;
        return false;
      }

      if (ss == Utilities.streamStatus.EOF)
        resStream = ctx.getStream();
    }
    return true;
  }
  public int close() {
    try {
      // Delete the scratch directory from the context
      ctx.removeScratchDir();
      ctx.clear();
    } catch (Exception e) {
      console.printError("FAILED: Unknown exception : " + e.getMessage(), "\n"
          + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return (13);
    }
    return (0);
  }
}
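
For completeness, a minimal caller would drive this class roughly as follows. This is only a sketch: the HiveConf construction and the query text are placeholders, and error handling is omitted.

    Driver driver = new Driver(new HiveConf(Driver.class));
    int ret = driver.run("SELECT ...");          // compile and execute; 0 means success
    if (ret == 0) {
      Vector<String> rows = new Vector<String>();
      while (driver.getResults(rows)) {          // returns up to maxRows rows per call
        for (String row : rows) {
          System.out.println(row);
        }
        rows.clear();
      }
    }
    driver.close();                              // clean up the scratch directory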