SELECT key , sort_array(collect_list(STRUCT(-cnt, item_cnt_pair))).col2 as item_cnt_list_sorted-- col2 为 STRUCT 中的默认列名, 负号表示倒序排列FROM( SELECT key, cnt , concat(item, ':', cnt) AS item_cnt_pair -- 可以通过 SPLIT 取值-- , STRUCT(item, cnt) AS item_cnt_pair -- 根据 STRUCT 的默认列名取值-- , MAP(item, cnt) AS item_cnt_pair -- 可以利用 map_keys/map_values 取值 FROM ( SELECT key, item, count(1) AS cnt-- FROM db.some_table A FROM ( SELECT 'A'ASkey, 'red'AS item UNION ALL SELECT 'A'ASkey, 'red'AS item UNION ALL SELECT 'A'ASkey, 'blue'AS item UNION ALL SELECT 'A'ASkey, 'blue'AS item UNION ALL SELECT 'A'ASkey, 'yellow'AS item UNION ALL SELECT 'A'ASkey, 'yellow'AS item UNION ALL SELECT 'A'ASkey, 'yellow'AS item UNION ALL SELECT 'B'ASkey, 'yellow'AS item UNION ALL SELECT 'B'ASkey, 'yellow'AS item UNION ALL SELECT 'B'ASkey, 'green'AS item ) A GROUP BYkey, item ) A) AGROUP BY key;-- A, ["yellow:3","blue:2","red:2"]-- B, ["yellow:2","green:1"]
-- 语法lateralView: LATERAL VIEW udtf(expression) tableAlias AS columnAlias (',' columnAlias)*fromClause: FROM baseTable (lateralView)*-- 示例 1: 基本用法SELECT A.a, B.bFROM (SELECT 'id'AS a, array(1,2,3) AS arr) A LATERAL VIEW explode(A.arr) B AS b;-- 产生 3 条数据-- 示例 2: 多个侧视图, 相当于做了笛卡尔积SELECT A.a, B.b, C.cFROM (SELECT 'id'AS a, array(1,2,3) AS arr1, array('a','b','c') AS arr2) A LATERAL VIEW explode(A.arr1) B AS bLATERAL VIEW explode(A.arr2) C AS c;-- 产生 9 条数据-- 示例 3: 多个数组并列, 相当于 Python 中的 zipSELECT A.a, B.b, A.arr2[i] AS cFROM (SELECT 'id'AS a, array(1,2,3) AS arr1, array('a','b','c') AS arr2) A LATERAL VIEW posexplode(A.arr1) B AS i, b;-- 产生 3 条数据-- LATERAL VIEW OUTERSELECT A.a, B.b FROM (SELECT explode(array(1,2,3)) a) A LATERAL VIEW explode(array()) B AS b; -- 结果为空, 相当于 JOIN 中右表为空, 导致整体为空SELECT A.a, B.b FROM (SELECT explode(array(1,2,3)) a) A LATERAL VIEW OUTER explode(array()) B AS b; -- 结果不为空, 相当于 LEFT JOIN, 不影响左表的结果
侧视图 for Presto (CROSS JOIN)
Presto 中与 Hive 三个示例对应的写法
以下 SQL 未经过测试, 其中构造的 array 的方法在 Presto 中可能有问题;
-- 示例 1SELECT A.a, B.bFROM (SELECT 'id'AS a, array(1,2,3) AS arr) A CROSS JOIN UNNEST(A.arr) AS B(b);-- LATERAL VIEW explode(A.arr) B AS b;-- 产生 3 条数据-- 示例 2SELECT A.a, B.b, C.cFROM (SELECT 'id'AS a, array(1,2,3) AS arr1, array('a','b','c') AS arr2) A CROSS JOIN UNNEST(A.arr1) AS B(b)-- LATERAL VIEW explode(A.arr1) B AS bCROSS JOIN UNNEST(A.arr2) AS C(c);-- LATERAL VIEW explode(A.arr2) C AS c;-- 产生 9 条数据-- 示例 3SELECT A.a, B.b, B.c-- SELECT A.a, B.b, A.arr2[i] AS cFROM (SELECT 'id'AS a, array(1,2,3) AS arr1, array('a','b','c') AS arr2) A CROSS JOIN UNNEST(A.arr1, A.arr2) AS B(b, c)-- LATERAL VIEW posexplode(A.arr1) B AS i, b;-- 产生 3 条数据
-- CTE with multiple column aliasesWITH t(x, y) AS (SELECT 1, 2)SELECT * FROM t WHERE x =1 AND y =2;-- CTE in subquerySELECT max(c) FROM ( WITH t(c) AS (SELECT 1) SELECT * FROM t);-- 多表WITH t1 AS ( SELECT ...),t2 AS ( SELECT ...),t3 AS ( SELECT * FROM t1 -- 可以使用前面子查询的结果 WHERE ...)SELECT * FROM t2 JOIN t3 ON ...;-- 嵌套 (好像有的 sql 不支持)WITH t AS ( WITH t2 AS (SELECT 1) SELECT * FROM t2)SELECT * FROM t;-- 嵌套冲突时SET spark.sql.legacy.ctePrecedencePolicy = CORRECTED;-- 如果不开启该设置, 会抛 AnalysisExceptionWITH t AS (SELECT 1), t2 AS ( WITH t AS (SELECT 2) SELECT * FROM t )SELECT * FROM t2;
-- 写法 1: When a = b, returns c; when a = d, returns e; else returns f.CASE a WHEN b THEN c WHEN d THEN e-- WHEN ... THEN ... ELSE fEND-- 写法 2: When a is true, returns b; when c is true, returns d; else returns e.CASE WHEN a THEN b WHEN c THEN d-- WHEN ... THEN ... ELSE eEND
SELECT cookieid, pt, pv,ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY pv DESC) AS rn1, -- 形如 1,2,3,4,5 (最常用) RANK() OVER(PARTITION BY cookieid ORDER BY pv DESC) AS rn2, -- 形如 1,1,3,3,5 DENSE_RANK() OVER(PARTITION BY cookieid ORDER BY pv DESC) AS rn3 -- 形如 1,1,2,2,3FROM (-- 测试数据 SELECT cookieid, T.col2[idx] AS pt, T.col3[idx] AS pv FROM ( SELECT ARRAY( STRUCT('cookie1', '2015-04-10', 1) , STRUCT('cookie1', '2015-04-11', 5) , STRUCT('cookie1', '2015-04-12', 7) , STRUCT('cookie1', '2015-04-13', 3) , STRUCT('cookie1', '2015-04-14', 2) , STRUCT('cookie1', '2015-04-15', 4) , STRUCT('cookie1', '2015-04-16', 4)-- , STRUCT('cookie2', '2015-04-10', 2)-- , STRUCT('cookie2', '2015-04-11', 3)-- , STRUCT('cookie2', '2015-04-12', 5)-- , STRUCT('cookie2', '2015-04-13', 6)-- , STRUCT('cookie2', '2015-04-14', 3)-- , STRUCT('cookie2', '2015-04-15', 9)-- , STRUCT('cookie2', '2015-04-16', 7) ) T ) A LATERAL VIEW posexplode(T.col1) B AS idx, cookieid) A;-- cookieid pt pv rn1 rn2 rn3-- cookie1 2015-04-12 7 1 1 1-- cookie1 2015-04-11 5 2 2 2-- cookie1 2015-04-15 4 3 3 3-- cookie1 2015-04-16 4 4 3 3-- cookie1 2015-04-13 3 5 5 4-- cookie1 2015-04-14 2 6 6 5-- cookie1 2015-04-10 1 7 7 6
切片 (NTILE) TODO
去重 (基于 ROW_NUMBER)
去重最常用的方法是使用 GROUP BY, 但有时不适用, 比如线上存在多个模型的结果, 我们需要最近出现次数最多的一个, 这时使用 ROW_NUMBER 更方便;
-- 场景: 对每个 query, 线上存在多个改写的结果, 现在需要取出最近最多的一个SELECT *FROM ( SELECT *, , ROW_NUMBER() OVER(PARTITION BY query ORDER BY dt DESC, cnt DESC) AS rn-- 注意这里是 PARTITION BY query 而不是 PARTITION BY query, rewrite FROM ( SELECT query, rewrite, dt, count(1) AS cnt FROM db.table WHERE dt > DATA_SUB('${env.today}', $DAYS) GROUP BY1, 2, 3 ) A) AWHERE rn =1
排序分位值
-- 场景:计算 query 的 pv 分位值SELECT query , rn , 1.0* acc_pv / sum_pv AS pr , pv , acc_pv , sum_pvFROM ( SELECT query , pv , ROW_NUMBER() OVER(ORDER BY pv desc) AS rn , SUM(pv) OVER() AS sum_pv , SUM(pv) OVER(ORDER BY pv desc ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as acc_pv FROM ( SELECT stack(5,'A', 100,'B', 40,'C', 30,'D', 20,'E', 10 ) AS (query, pv) ) A) AORDER BY rn;--- result ---query rn pr pv acc_pv sum_pvA 10.5100100200B 20.740140200C 30.8530170200D 40.9520190200E 51.010200200-- pr 列的含义: query A 占了 50% 的流量, A、B 占了 70% 的流量, ...
SELECT lower(c) AS color, count(1) AS cntFROM ( SELECT explode(ARRAY('red', 'red', 'RED', 'Red', 'Blue', 'blue')) AS c) A-- GROUP BY color -- FAILED: Invalid Table Alias or Column ReferenceGROUP BYlower(c) -- OK-- GROUP BY c -- OK, 但不符合预期, 当 c 和 color 同名时非常容易犯这个错误-- GROUP BY 1 -- OK, 需要 SET hive.groupby.position.alias=true;