实验6 熟悉Hive的基本操作
实验步骤
(1)创建一个内部表stocks,字段分隔符为英文逗号,表结构如表14-11所示。
表14-11 stocks表结构
|
col_name |
data_type |
|
exchange |
string |
|
symbol |
string |
|
ymd |
string |
|
price_open |
float |
|
price_high |
float |
|
price_low |
float |
|
price_close |
float |
|
volume |
int |
|
price_adj_close |
float |
(2)创建一个外部分区表dividends(分区字段为exchange和symbol),字段分隔符为英文逗号,表结构如表14-12所示。
表14-12 dividends表结构
|
col_name |
data_type |
|
ymd |
string |
|
dividend |
float |
|
exchange |
string |
|
symbol |
string |
(3)从stocks.csv文件向stocks表中导入数据。
(4) 创建一个未分区的外部表dividends_unpartitioned,并从dividends.csv向其中导入数据,表结构如表14-13所示。
表14-13 dividends_unpartitioned表结构
|
col_name |
data_type |
|
ymd |
string |
|
dividend |
float |
|
exchange |
string |
|
symbol |
string |
(5)通过对dividends_unpartitioned的查询语句,利用Hive自动分区特性向分区表dividends各个分区中插入对应数据。
(6)查询IBM公司(symbol=IBM)从2000年起所有支付股息的交易日(dividends表中有对应记录)的收盘价(price_close)。
(7)查询苹果公司(symbol=AAPL)2008年10月每个交易日的涨跌情况,涨显示rise,跌显示fall,不变显示unchange。
(8)查询stocks表中收盘价(price_close)比开盘价(price_open)高得最多的那条记录的交易所(exchange)、股票代码(symbol)、日期(ymd)、收盘价、开盘价及二者差价。
(9)从stocks表中查询苹果公司(symbol=AAPL)年平均调整后收盘价(price_adj_close) 大于50美元的年份及年平均调整后收盘价。
(10)查询每年年平均调整后收盘价(price_adj_close)前三名的公司的股票代码及年平均调整后收盘价。
具体代码
-- (1)创建内部表stocks
CREATE TABLE stocks (
`exchange` STRING,
symbol STRING,
ymd STRING,
price_open FLOAT,
price_high FLOAT,
price_low FLOAT,
price_close FLOAT,
volume INT,
price_adj_close FLOAT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- (2)创建外部分区表dividends
CREATE EXTERNAL TABLE dividends (
ymd STRING,
dividend FLOAT
)
PARTITIONED BY (`exchange` STRING, symbol STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- (3)从本地文件系统加载数据到stocks表
LOAD DATA LOCAL INPATH '/export/server/stocks.csv' INTO TABLE stocks;
-- (4.1)按照CSV文件的实际字段顺序创建表
CREATE EXTERNAL TABLE dividends_unpartitioned (
`exchange` STRING, -- 第一列:交易所
symbol STRING, -- 第二列:股票代码
ymd STRING, -- 第三列:日期
dividend FLOAT -- 第四列:分红金额
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/hive/warehouse/dividends_unpartitioned';
-- (4.2)加载数据
LOAD DATA LOCAL INPATH '/export/server/dividends.csv' INTO TABLE dividends_unpartitioned;
-- (5) 使用动态分区从dividends_unpartitioned向dividends分区表插入数据
-- 增加更多资源给MapReduce任务
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
SET hive.exec.max.dynamic.partitions = 10000;
SET hive.exec.max.dynamic.partitions.pernode = 1000;
-- 大幅增加内存配置
SET mapreduce.map.memory.mb = 4096;
SET mapreduce.reduce.memory.mb = 4096;
SET mapreduce.map.java.opts = -Xmx3072m;
SET mapreduce.reduce.java.opts = -Xmx3072m;
-- 设置更激进的超时和重试参数
SET mapreduce.task.timeout = 1800000;
SET mapreduce.map.maxattempts = 5;
SET mapreduce.reduce.maxattempts = 5;
-- 启用推测执行
SET mapreduce.map.speculative = true;
SET mapreduce.reduce.speculative = true;
-- 然后尝试完整插入
INSERT OVERWRITE TABLE dividends PARTITION (`exchange`, symbol)
SELECT
ymd,
dividend,
`exchange`,
symbol
FROM dividends_unpartitioned;
-- (6) 查询IBM公司(symbol=IBM)从2000年起所有支付股息的交易日的收盘价
SELECT
d.ymd AS dividend_date,
d.dividend,
s.price_close
FROM dividends d
JOIN stocks s ON d.symbol = s.symbol AND d.ymd = s.ymd
WHERE d.symbol = 'IBM'
AND d.ymd >= '2000-01-01'
ORDER BY d.ymd;
-- (7) 查询苹果公司(symbol=AAPL)2008年10月每个交易日的涨跌情况
SELECT
ymd,
price_open,
price_close,
price_close - price_open AS price_change,
CASE
WHEN price_close > price_open THEN 'rise'
WHEN price_close < price_open THEN 'fall'
ELSE 'unchange'
END AS price_trend
FROM stocks
WHERE symbol = 'AAPL'
AND ymd LIKE '2008-10-%'
ORDER BY ymd;
-- (8) 查询stocks表中收盘价比开盘价高得最多的记录
SELECT
`exchange`,
symbol,
ymd,
price_open,
price_close,
(price_close - price_open) AS price_difference
FROM stocks
WHERE (price_close - price_open) = (
SELECT MAX(price_close - price_open)
FROM stocks
WHERE price_close > price_open
);
-- (9) 查询苹果公司年平均调整后收盘价大于50美元的年份
SELECT
SUBSTR(ymd, 1, 4) AS year,
AVG(price_adj_close) AS avg_adj_close
FROM stocks
WHERE symbol = 'AAPL'
GROUP BY SUBSTR(ymd, 1, 4)
HAVING AVG(price_adj_close) > 50
ORDER BY year;
-- (10) 查询每年年平均调整后收盘价前三名的公司
WITH yearly_avg AS (
SELECT
SUBSTR(ymd, 1, 4) AS year,
symbol,
AVG(price_adj_close) AS avg_adj_close
FROM stocks
GROUP BY SUBSTR(ymd, 1, 4), symbol
),
ranked_companies AS (
SELECT
year,
symbol,
avg_adj_close,
ROW_NUMBER() OVER (PARTITION BY year ORDER BY avg_adj_close DESC) AS rank
FROM yearly_avg
)
SELECT
year,
symbol,
avg_adj_close
FROM ranked_companies
WHERE rank <= 3
ORDER BY year, rank;
浙公网安备 33010602011771号