Hive 高阶应用开发示例(一)

Hive的一些常用的高阶开发

内容
   1.开窗函数
   2.行转列,列转行,多行转一行,一行转多行
   3.分组: 增强型group
  4.排序
  5.关联

本次的内容: 内容1 和内容2,采用的是示例数据以及对应的实现。数据可以直接放在Hive中执行。可以直观的观察数据,进而对函数以及相应的功能有所熟悉。

  对于不同场景的数据计算,需要了解SQL的基本语法以及一些高阶用法,并在这些基础上组合出相应的功能。这些都属于工程上的应用,需要多练习。通过构建数据集来验证,可以自己确认一些似是而非的语法;了解Hive的底层原理和代码也是途径之一。构建数据集验证与通过原理分析这两种方法可以相互配合使用,最终的目标之一就是更好地实现业务分析目标。

-- over() 子句:有 order by 时,分区内排序后逐行叠加
-- window 子句:灵活控制窗口的子集

— 
-- Window-frame demo: shows how PARTITION BY, ORDER BY and ROWS BETWEEN change the
-- result of SUM(score) over the same inline sample rows (mem_id, score, createtime).
WITH table_1 AS (
    SELECT '1' AS mem_id, 10 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 2 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 3 AS score, '2020-08-09 09:08:10' AS createtime
    UNION ALL
    SELECT '2' AS mem_id, 6 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '3' AS mem_id, 7 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '3' AS mem_id, 9 AS score, '2020-08-09 09:08:10' AS createtime
)
SELECT
    mem_id
    , score
    -- No ORDER BY: the frame is the whole partition, so every row gets the partition total.
    , SUM(score) OVER (PARTITION BY mem_id) AS pv0
    -- ORDER BY without a frame: defaults to partition start .. current row (running total).
    , SUM(score) OVER (PARTITION BY mem_id ORDER BY createtime) AS pv1
    -- Explicit partition start .. current row; same result as pv1 for this data.
    , SUM(score) OVER (
        PARTITION BY mem_id ORDER BY createtime
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS pv2
    -- Current row plus the 3 rows before it.
    , SUM(score) OVER (
        PARTITION BY mem_id ORDER BY createtime
        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
    ) AS pv3
    -- Current row plus the 3 rows before it and the 1 row after it.
    , SUM(score) OVER (
        PARTITION BY mem_id ORDER BY createtime
        ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING
    ) AS pv4
    -- Current row plus every row after it.
    , SUM(score) OVER (
        PARTITION BY mem_id ORDER BY createtime
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
    ) AS pv5
FROM table_1
;

-- Window-frame demo with MAX(score): same frames as the SUM example, on a single
-- mem_id whose sample rows include two rows sharing the same createtime.
WITH table_1 AS (
    SELECT '1' AS mem_id, 10 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 5 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 2 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 3 AS score, '2020-08-09 09:08:10' AS createtime
)
SELECT
    mem_id
    , score
    , createtime
    -- No ORDER BY: maximum over the whole partition.
    , MAX(score) OVER (PARTITION BY mem_id) AS pv0
    -- ORDER BY without a frame: defaults to partition start .. current row.
    , MAX(score) OVER (PARTITION BY mem_id ORDER BY createtime) AS pv1
    -- Explicit partition start .. current row; same result as pv1 for this data.
    , MAX(score) OVER (
        PARTITION BY mem_id ORDER BY createtime
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS pv2
    -- Current row plus the 3 rows before it.
    , MAX(score) OVER (
        PARTITION BY mem_id ORDER BY createtime
        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
    ) AS pv3
    -- Current row plus the 3 rows before it and the 1 row after it.
    , MAX(score) OVER (
        PARTITION BY mem_id ORDER BY createtime
        ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING
    ) AS pv4
    -- Current row plus every row after it.
    , MAX(score) OVER (
        PARTITION BY mem_id ORDER BY createtime
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
    ) AS pv5
FROM table_1
;

-- 1.排序开窗函数
-- row_number() :从1开始,按照顺序生成分组内记录的序列,row_number()的值不会重复,例如 1 2 3 4
-- dense_rank() :生成数据项在分组中的排名,排名相等时不会在名次中留下空位,例如 1 2 2 3
-- rank() :生成数据项在分组中的排名,排名相等时会在名次中留下空位,例如 1 2 2 4

-- Ranking demo: compares ROW_NUMBER, DENSE_RANK and RANK over the same partition,
-- ordered by createtime ascending and descending. Two sample rows share a createtime
-- so the tie handling of each function is visible.
WITH table_1 AS (
    SELECT '1' AS mem_id, 10 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 5 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 2 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 3 AS score, '2020-08-09 09:08:10' AS createtime
)
SELECT
    mem_id
    , score
    , createtime
    -- Always unique: 1 2 3 4 (the order of the two tied rows is not fixed by ORDER BY createtime).
    , ROW_NUMBER() OVER (PARTITION BY mem_id ORDER BY createtime) AS ROW_NUMBER_pv1
    -- Ties share a rank, no gap afterwards: 1 2 2 3.
    , DENSE_RANK() OVER (PARTITION BY mem_id ORDER BY createtime) AS DENSE_RANK_pv2
    -- Ties share a rank, gap afterwards: 1 2 2 4.
    , RANK() OVER (PARTITION BY mem_id ORDER BY createtime) AS RANK_pv3
    -- Same three functions, ranked from the latest createtime to the earliest.
    , ROW_NUMBER() OVER (PARTITION BY mem_id ORDER BY createtime DESC) AS ROW_NUMBER_desc_pv1
    , DENSE_RANK() OVER (PARTITION BY mem_id ORDER BY createtime DESC) AS DENSE_RANK_desc_pv2
    , RANK() OVER (PARTITION BY mem_id ORDER BY createtime DESC) AS RANK_desc_pv3
FROM table_1
ORDER BY createtime
;

-- NTILE demo: splits each partition's ordered rows into 3 buckets and returns the bucket number.
-- Usage pattern noted by the author:
--   ntile(10) over (partition by t1.grp_cd order by t1.pay_amt desc) Monyrank
WITH table_1 AS (
    SELECT '1' AS mem_id, 10 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 5 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 2 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 3 AS score, '2020-08-09 09:08:10' AS createtime
    UNION ALL
    SELECT '2' AS mem_id, 6 AS score, '2020-08-09 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 7 AS score, '2020-08-09 09:08:10' AS createtime
)
SELECT
    mem_id
    , score
    , createtime
    -- Buckets per member, ordered by time.
    , ntile(3) OVER (PARTITION BY mem_id ORDER BY createtime) AS ntile_pv1
    -- Buckets per score value; every score in the sample is distinct, so each partition has one row.
    , ntile(3) OVER (PARTITION BY score ORDER BY createtime) AS ntile_pv2
    -- Buckets per member, ordered by score.
    , ntile(3) OVER (PARTITION BY mem_id ORDER BY score) AS ntile_pv3
FROM table_1
;

-- 连续天数计算的片段:先算出每条记录距统计日的天数 date_flag,再用 2 的 date_flag 次方按位标记,
-- 求和后转成二进制串,从最低位起第一个 '0' 之前的位数即为连续天数。
-- datediff(from_unixtime(unix_timestamp('${hivevar:statis_date}','yyyyMMdd'),'yyyy-MM-dd'), from_unixtime(unix_timestamp(statis_date,'yyyyMMdd'),'yyyy-MM-dd') ) as date_flag

-- pow(2, date_flag ) AS data_flag
-- conv(CAST(SUM(data_flag ) AS int),10,2) AS continuity_flag
-- locate('0',REVERSE(continuity_flag)) AS continuity_locate,
-- length(continuity_flag) AS continuity_len
-- CASE WHEN continuity_locate= 0 then continuity_len ELSE continuity_locate-1 END AS con_pv_day,

-- 序列函数不支持WINDOW子句
-- CUME_DIST:小于等于当前值的行数 / 分组内总行数,例如统计小于等于当前薪水的人数占总人数的比例
-- PERCENT_RANK:(分组内当前行的RANK值 - 1) / (分组内总行数 - 1)

-- Distribution demo: CUME_DIST and PERCENT_RANK next to ROW_NUMBER and RANK over the
-- same partition, so the ratios can be checked against the rank values by hand.
WITH table_1 AS (
    SELECT '1' AS mem_id, 10 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 5 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 2 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 3 AS score, '2020-08-09 09:08:10' AS createtime
)
SELECT
    mem_id
    , score
    , createtime
    -- 1 2 3 4
    , ROW_NUMBER() OVER (PARTITION BY mem_id ORDER BY createtime) AS ROW_NUMBER_pv1
    -- 1 2 2 4
    , RANK() OVER (PARTITION BY mem_id ORDER BY createtime) AS RANK_pv3
    -- (rows with createtime <= current row's createtime) / (rows in the partition)
    , CUME_DIST() OVER (PARTITION BY mem_id ORDER BY createtime) AS cume_dist_pv
    -- (RANK of the current row - 1) / (rows in the partition - 1)
    , PERCENT_RANK() OVER (PARTITION BY mem_id ORDER BY createtime) AS percent_rank_pv
FROM table_1
ORDER BY createtime
;

-- 1. LEAD(col, n, DEFAULT): value of col from the n-th row after the current row in the window.
--    col is the column; n is optional (default 1, must not be negative); DEFAULT is the value
--    returned when the n-th following row is NULL (NULL if DEFAULT is not specified).
-- 2. LAG(col, n, DEFAULT): value of col from the n-th row before the current row in the window.
--    col is the column; n is optional (default 1, must not be negative); DEFAULT is the value
--    returned when the n-th preceding row is NULL (NULL if DEFAULT is not specified).

-- LEAD / LAG demo: looks ahead and behind within each mem_id partition ordered by createtime,
-- with and without an explicit default for rows that have no such neighbour.
WITH table_1 AS (
    SELECT '1' AS mem_id, 10 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 2 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 3 AS score, '2020-08-09 09:08:10' AS createtime
    UNION ALL
    SELECT '2' AS mem_id, 6 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '3' AS mem_id, 7 AS score, '2020-08-07 09:09:10' AS createtime
    UNION ALL
    SELECT '3' AS mem_id, 9 AS score, '2020-08-09 09:08:10' AS createtime
)
SELECT
    mem_id
    , createtime
    , score
    -- Score two rows ahead; NULL when there is no such row.
    , LEAD(score, 2) OVER (PARTITION BY mem_id ORDER BY createtime) AS lead_2_pv
    -- Score of the next row; NULL when there is no next row.
    , LEAD(score, 1) OVER (PARTITION BY mem_id ORDER BY createtime) AS lead_1_pv
    -- Score of the next row; -9999 when there is no next row.
    , LEAD(score, 1, -9999) OVER (PARTITION BY mem_id ORDER BY createtime) AS lead_1_null_pv
    -- Score of the previous row; -9999 when there is no previous row.
    , LAG(score, 1, -9999) OVER (PARTITION BY mem_id ORDER BY createtime) AS lag_1_pv
FROM table_1
;

-- LAG demo: different offsets, sort directions and default values, on both the numeric
-- score column and the string createtime column. Each output column has a distinct alias.
WITH table_1 AS (
    SELECT '1' AS mem_id, 10 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 2 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 3 AS score, '2020-08-09 09:08:10' AS createtime
    UNION ALL
    SELECT '2' AS mem_id, 6 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '3' AS mem_id, 7 AS score, '2020-08-07 09:09:10' AS createtime
    UNION ALL
    SELECT '3' AS mem_id, 9 AS score, '2020-08-09 09:08:10' AS createtime
)
SELECT
    mem_id
    , createtime
    , score
    -- Score two rows back in ascending time order; -9999 when there is no such row.
    , LAG(score, 2, -9999) OVER (PARTITION BY mem_id ORDER BY createtime) AS lag_2_pv
    -- Descending order: the "previous" row is the next later createtime; -9999 when absent.
    , LAG(score, 1, -9999) OVER (PARTITION BY mem_id ORDER BY createtime DESC) AS lag_1_desc_default_pv
    -- Same as above without a default: NULL when absent.
    , LAG(score, 1) OVER (PARTITION BY mem_id ORDER BY createtime DESC) AS lag_1_desc_pv
    -- createtime is a string, so the default is written as a string literal too.
    , LAG(createtime, 1, '-9999') OVER (PARTITION BY mem_id ORDER BY createtime) AS lag_1_time_pv
    , LAG(createtime, 1, '-9999') OVER (PARTITION BY mem_id ORDER BY createtime DESC) AS lag_1_time_desc_pv
FROM table_1
ORDER BY mem_id, createtime
;

-- FIRST_VALUE:取分组内排序后,截止到当前行的第一个值。
-- 最多接受两个参数:第一个参数是要取第一个值的列;第二个(可选)参数必须是布尔值,默认为 false,如果设置为 true,则跳过空值。
-- LAST_VALUE:取分组内排序后,截止到当前行的最后一个值。
-- 最多接受两个参数:第一个参数是要取最后一个值的列;第二个(可选)参数必须是布尔值,默认为 false,如果设置为 true,则跳过空值。
-- FIRST_VALUE / LAST_VALUE demo, shown next to the ranking functions for the same
-- partition and ordering so the picked rows can be identified by rank.
WITH table_1 AS (
    SELECT '1' AS mem_id, 10 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 2 AS score, '2020-08-08 09:08:10' AS createtime
    UNION ALL
    SELECT '1' AS mem_id, 3 AS score, '2020-08-09 09:08:10' AS createtime
    UNION ALL
    SELECT '2' AS mem_id, 6 AS score, '2020-08-07 09:08:10' AS createtime
    UNION ALL
    SELECT '3' AS mem_id, 7 AS score, '2020-08-07 09:09:10' AS createtime
    UNION ALL
    SELECT '3' AS mem_id, 9 AS score, '2020-08-09 09:08:10' AS createtime
)
SELECT
    mem_id
    , score
    , createtime
    , ROW_NUMBER() OVER (PARTITION BY mem_id ORDER BY createtime) AS ROW_NUMBER_pv1
    , DENSE_RANK() OVER (PARTITION BY mem_id ORDER BY createtime) AS DENSE_RANK_pv2
    , RANK() OVER (PARTITION BY mem_id ORDER BY createtime) AS RANK_pv3
    -- First score in the partition after sorting by createtime.
    , FIRST_VALUE(score) OVER (PARTITION BY mem_id ORDER BY createtime) AS first1
    -- Last score up to the current row (the default frame ends at the current row),
    -- not the last score of the whole partition.
    , LAST_VALUE(score) OVER (PARTITION BY mem_id ORDER BY createtime) AS last1
FROM table_1
ORDER BY createtime
;

-- Many rows -> one row per mem_id, built from collect_list + concat_ws.
-- Author's note: collect_set does not return its elements in a consistent order.
WITH table_1 AS (
    SELECT '1' AS mem_id, 10 AS score, '2020-08-07 09:08:10' AS createtime, '10ab' AS topic_id
    UNION ALL
    SELECT '1' AS mem_id, 2 AS score, '2020-08-08 09:08:10' AS createtime, '2hb' AS topic_id
    UNION ALL
    SELECT '1' AS mem_id, 3 AS score, '2020-08-09 09:08:10' AS createtime, '3fg' AS topic_id
    UNION ALL
    SELECT '2' AS mem_id, 6 AS score, '2020-08-07 09:08:10' AS createtime, '6sf' AS topic_id
    UNION ALL
    SELECT '3' AS mem_id, 7 AS score, '2020-08-07 09:09:10' AS createtime, '7dr' AS topic_id
    UNION ALL
    SELECT '3' AS mem_id, 9 AS score, '2020-08-09 09:08:10' AS createtime, '9ng' AS topic_id
)
SELECT
    mem_id
    -- concat_ws accepts only string / array<string>, so score is cast before it is collected.
    , concat_ws(',', collect_list(CAST(score AS string))) AS order_value
    -- Sorted as strings, i.e. lexicographically ('10' sorts before '2');
    -- the zero-padded variants below sort in numeric order instead.
    , concat_ws(',', sort_array(collect_list(CAST(score AS string)))) AS order_value_sorted
    -- "<score zero-padded to 5 chars>:<topic_id>" per row, as an array.
    , collect_list(
        concat_ws(':', lpad(CAST(score AS string), 5, '0'), CAST(topic_id AS string))
    ) AS demo1
    -- Same array, sorted; the zero padding makes string order match numeric score order.
    , sort_array(
        collect_list(
            concat_ws(':', lpad(CAST(score AS string), 5, '0'), CAST(topic_id AS string))
        )
    ) AS demo2
    -- Sorted array joined into one comma-separated string.
    , concat_ws(
        ','
        , sort_array(
            collect_list(
                concat_ws(':', lpad(CAST(score AS string), 5, '0'), CAST(topic_id AS string))
            )
        )
    ) AS demo3
    -- Same as demo3 with the "<digits>:" sort prefixes stripped: topic_ids ordered by score.
    , regexp_replace(
        concat_ws(
            ','
            , sort_array(
                collect_list(
                    concat_ws(':', lpad(CAST(score AS string), 5, '0'), CAST(topic_id AS string))
                )
            )
        )
        , '\\d+\:'
        , ''
    ) AS data
FROM table_1
GROUP BY mem_id
;

-- One row -> many rows with LATERAL VIEW explode.
-- For a map column the form is: select explode(map_col) as (may_key_col, may_value_col) from table_name
-- posexplode works like explode but additionally outputs the index of each element.
WITH table_1 AS (
    SELECT '1' AS class_id, split('Test400|Test531|Test536', '\\|') AS stu_id, split('60|30|90', '\\|') AS score
    UNION ALL
    SELECT '2' AS class_id, split('Test400|Test531|Test536', '\\|') AS stu_id, split('70|60|70', '\\|') AS score
    UNION ALL
    SELECT '3' AS class_id, split('Test500|Test521|Test536', '\\|') AS stu_id, split('70|60|70', '\\|') AS score
)
SELECT
    class_id
    , stu_id          -- the original array, repeated on every exploded row
    , examples_id1    -- one element of stu_id per output row
FROM table_1
LATERAL VIEW explode(stu_id) examples AS examples_id1
;

-- One row -> many rows while keeping two array columns aligned element by element.
WITH table_1 AS (
    SELECT '1' AS class_id, split('Test400|Test531|Test536', '\\|') AS stu_id, split('60|30|90', '\\|') AS score
    UNION ALL
    SELECT '2' AS class_id, split('Test400|Test531|Test536', '\\|') AS stu_id, split('70|60|70', '\\|') AS score
    UNION ALL
    SELECT '3' AS class_id, split('Test500|Test521|Test536', '\\|') AS stu_id, split('70|60|70', '\\|') AS score
)
SELECT
    class_id
    , sn_name
    , sn_score
FROM table_1
LATERAL VIEW posexplode(stu_id) sn AS sn_index, sn_name
LATERAL VIEW posexplode(score) sc AS sc_index, sn_score
-- The two lateral views yield every (student, score) combination per row;
-- keeping equal indexes pairs each student with the score at the same position.
WHERE sc_index = sn_index
;

-- Rows -> columns: turns each row's tagtype value into three '1'/'0' flag columns.
-- Rows are not aggregated; every input row produces one output row.
WITH table_1 AS (
    SELECT '1' AS mem_id, '开心' AS tagtype, '2020-08-07 09:08:10' AS createtime, '10ab' AS topic_id
    UNION ALL
    SELECT '1' AS mem_id, '开心' AS tagtype, '2020-08-08 09:08:10' AS createtime, '2hb' AS topic_id
    UNION ALL
    SELECT '1' AS mem_id, '有趣' AS tagtype, '2020-08-09 09:08:10' AS createtime, '3fg' AS topic_id
    UNION ALL
    SELECT '2' AS mem_id, '有趣' AS tagtype, '2020-08-07 09:08:10' AS createtime, '6sf' AS topic_id
    UNION ALL
    SELECT '3' AS mem_id, '开心' AS tagtype, '2020-08-07 09:09:10' AS createtime, '7dr' AS topic_id
    UNION ALL
    SELECT '3' AS mem_id, '抗压' AS tagtype, '2020-08-09 09:08:10' AS createtime, '9ng' AS topic_id
)
SELECT
    mem_id
    , CASE WHEN tagtype = '有趣' THEN '1' ELSE '0' END AS import_fun
    , CASE WHEN tagtype = '开心' THEN '1' ELSE '0' END AS import_status
    , CASE WHEN tagtype = '抗压' THEN '1' ELSE '0' END AS import_chara
    , createtime
    , topic_id
FROM table_1
;

 

-- 纵表变横表
-- 源数据,字段:userid flag
--   '张三' AS userid, '收藏' AS flag
--   '张三' AS userid, '购买' AS flag
--   '张三' AS userid, '点击' AS flag
--   '李四' AS userid, '点击' AS flag
--   '李四' AS userid, '收藏' AS flag
-- 结果数据,字段:userid collction purchase click
--   '张三','1','1','1'
--   '李四','1','0','1'
-- 解决方案
-- 使用两种解决方案;第一种:使用union之后max
-- Long -> wide, approach 1: emit one '1'/'0' flag row per (userid, flag) via UNION ALL,
-- then collapse to one row per userid with MAX (as strings, '1' > '0').
WITH t1 AS (
    SELECT '张三' AS userid, '收藏' AS flag
    UNION ALL
    SELECT '张三' AS userid, '购买' AS flag
    UNION ALL
    SELECT '张三' AS userid, '点击' AS flag
    UNION ALL
    SELECT '李四' AS userid, '点击' AS flag
    UNION ALL
    SELECT '李四' AS userid, '收藏' AS flag
)
SELECT
    tt1.userid
    , MAX(tt1.collction) AS collction
    , MAX(tt1.purchase) AS purchase
    , MAX(tt1.click) AS click
FROM (
    -- Rows flagged '收藏' set only the collction column.
    SELECT t1.userid, '1' AS collction, '0' AS purchase, '0' AS click
    FROM t1
    WHERE t1.flag = '收藏'
    UNION ALL
    -- Rows flagged '购买' set only the purchase column.
    SELECT t1.userid, '0' AS collction, '1' AS purchase, '0' AS click
    FROM t1
    WHERE t1.flag = '购买'
    UNION ALL
    -- Rows flagged '点击' set only the click column.
    SELECT t1.userid, '0' AS collction, '0' AS purchase, '1' AS click
    FROM t1
    WHERE t1.flag = '点击'
) tt1
GROUP BY tt1.userid
;
-- Long -> wide, approach 2: start from the distinct userids and LEFT JOIN one
-- per-flag userid list for each output column; a match means the flag is '1'.
WITH t1 AS (
    SELECT '张三' AS userid, '收藏' AS flag
    UNION ALL
    SELECT '张三' AS userid, '购买' AS flag
    UNION ALL
    SELECT '张三' AS userid, '点击' AS flag
    UNION ALL
    SELECT '李四' AS userid, '点击' AS flag
    UNION ALL
    SELECT '李四' AS userid, '收藏' AS flag
)
SELECT
    tt1.userid
    , CASE WHEN tt2.userid IS NOT NULL THEN '1' ELSE '0' END AS collction
    , CASE WHEN tt3.userid IS NOT NULL THEN '1' ELSE '0' END AS purchase
    , CASE WHEN tt4.userid IS NOT NULL THEN '1' ELSE '0' END AS click
FROM (SELECT DISTINCT t1.userid FROM t1) tt1
LEFT JOIN (SELECT DISTINCT t1.userid FROM t1 WHERE t1.flag = '收藏') tt2
    ON tt1.userid = tt2.userid
LEFT JOIN (SELECT DISTINCT t1.userid FROM t1 WHERE t1.flag = '购买') tt3
    ON tt1.userid = tt3.userid
LEFT JOIN (SELECT DISTINCT t1.userid FROM t1 WHERE t1.flag = '点击') tt4
    ON tt1.userid = tt4.userid
;

-- Wide -> long (columns to rows): each flag column equal to '1' becomes one (userid, flag) row.
WITH table_1 AS (
    SELECT '张三' AS userid, '1' AS collction, '1' AS purchase, '1' AS click
    UNION ALL
    SELECT '张三' AS userid, '0' AS collction, '1' AS purchase, '1' AS click
)
SELECT
    t1.userid
    , t1.flag
FROM (
    SELECT userid, '收藏' AS flag FROM table_1 WHERE collction = '1'
    UNION ALL
    SELECT userid, '购买' AS flag FROM table_1 WHERE purchase = '1'
    UNION ALL
    SELECT userid, '点击' AS flag FROM table_1 WHERE click = '1'
) t1
;

本次分享主要是涉及开窗函数以及行列的一些开发内容。后续的一些内容,也是应用开发中的一些比较常见的要注意和区分的点。

本示例参考了一些网上的资料和书本的内容,由于来源未做标记,如有侵删。

 

Tags: