基本操作
1.查看所有设置
hive (bigdata)>set;
2.查看HDFS 路径信息
hive (bigdata)> dfs -ls /user/hive/warehouse;
3.内部表转外部表
alter table t1 set tblproperties('EXTERNAL'='TRUE');
4.查看分区信息
hive (bigdata)> show partitions ods_score;
5.增加分区
alter table t3 add partition(dt='2020-06-03');
6.修改分区位置
alter table t3 partition(dt='2020-06-01') set location '/user/hive/warehouse/t3/dt=2020-06-03';
7.删除分区:
-- Drop a partition. (The note originally repeated the "set location" statement
-- from item 6 here by copy-paste mistake; the correct command is drop partition.
-- Dropping removes the partition's metadata and, for managed tables, its data.)
alter table t3 drop partition(dt='2020-06-01');
8.分桶规则:
- 分桶字段.hashCode % 分桶数
- 分桶表加载数据时,使用 insert… select … 方式进行
- 网上有资料说使用分桶表需要设置 hive.enforce.bucketing=true,那是Hive1.x 以前的版本;Hive 2.x 中,删除了该参数,始终可以分桶;
插入数据
9.hive 插入数据
-- Insert literal rows into a single partition of tabC.
insert into table tabC partition(month='202001')
values
    (5, 'wangwu', 'BJ'),
    (4, 'lishi', 'SH'),
    (3, 'zhangsan', 'TJ');
-- 插入查询的结果数据
-- Copy one partition's rows into another partition of the same table.
insert into table tabC partition(month='202002')
select id, name, area
from tabC
where month = '202001';
-- 多表(多分区)插入模式
-- Multi-insert: Hive's "from ... insert ... insert ..." syntax scans tabC
-- once and writes two target partitions in a single statement.
from tabC
insert overwrite table tabC partition(month='202003')
select id, name, area where month='202002'
insert overwrite table tabC partition(month='202004')
select id, name, area where month='202002';
-- 使用import 导入数据
-- Import data previously written by "export table" into a table/partition.
import table student2 partition(month='201709')
from '/user/hive/warehouse/export/student';
10.数据导出
-- 将查询结果导出到本地
-- Export query results to a local directory (the directory is overwritten).
insert overwrite local directory '/home/hadoop/data/tabC'
select * from tabC;
-- 将查询结果格式化输出到本地
-- Export to local with an explicit field delimiter (space-separated output).
insert overwrite local directory '/home/hadoop/data/tabC2'
row format delimited fields terminated by ' '
select * from tabC;
-- 将查询结果导出到HDFS
-- Export to HDFS: same as above but without the "local" keyword.
insert overwrite directory '/user/hadoop/data/tabC3'
row format delimited fields terminated by ' '
select * from tabC;
-- dfs 命令导出数据到本地。本质是执行数据文件的拷贝
dfs -get /user/hive/warehouse/mydb.db/tabc/month=202001
/home/hadoop/data/tabC4
窗口函数
11.explode 和 lateral view 的使用
– explode,炸裂函数
-- 就是将一行中复杂的 array 或者 map 结构拆分成多行
-- explode is a UDTF: it turns one array value into one output row per element.
select explode(array('A','B','C')) as col;
-- For a map, explode emits two columns per entry: key and value.
select explode(map('a', 8, 'b', 88, 'c', 888));
-- UDTF's are not supported outside the SELECT clause, nor nested in expressions
-- SELECT pageid, explode(adid_list) AS myCol... is not supported
-- SELECT explode(explode(adid_list)) AS myCol... is not supported
-- lateral view 常与表生成函数explode结合使用
-- lateral view 语法:
lateralView: LATERAL VIEW udtf(expression) tableAlias AS
columnAlias (',' columnAlias)*
fromClause: FROM baseTable (lateralView)*
-- lateral view 的基本使用
-- lateral view + explode: emit one output row per element of the split array,
-- keeping the scalar column alongside each element.
with t1 as (
    select 'OK' cola, split('www.lagou.com', '\\.') colb
)
select cola, colc
from t1
lateral view explode(colb) t2 as colc;
案例:找到每个学生的最好成绩
数据形式:
lisi|Chinese:90,Math:80,English:70
wangwu|Chinese:88,Math:90,English:96
maliu|Chinese:99,Math:65,English:60
hive中创建表:
-- One row per student; subject->score pairs stored in a map column.
-- Matches pipe-delimited data like: lisi|Chinese:90,Math:80,English:70
create table studscore(
name string,
score map<string,string>
) row format delimited fields terminated by '|'
collection items terminated by ','
map keys terminated by ':';
查询表:
hive (test)> select * from studscore;
OK
studscore.name studscore.score
lisi {"Chinese":"90","Math":"80","English":"70"}
wangwu {"Chinese":"88","Math":"90","English":"96"}
maliu {"Chinese":"99","Math":"65","English":"60"}
Time taken: 0.434 seconds, Fetched: 3 row(s)
使用explode 查询:
hive (test)> select explode(score) from studscore;
OK
key value
Chinese 90
Math 80
English 70
Chinese 88
Math 90
English 96
Chinese 99
Math 65
English 60
explode 结合使用:
hive (test)> select name, subject, score1 as score from studscore lateral view explode(score) t1 as subject, score1;
OK
name subject score
lisi Chinese 90
lisi Math 80
lisi English 70
wangwu Chinese 88
wangwu Math 90
wangwu English 96
maliu Chinese 99
maliu Math 65
maliu English 60
Time taken: 0.086 seconds, Fetched: 9 row(s)
最终取得结果:
hive (test)> with tmp as (select name, subject, mark as score from studscore lateral view explode(score) t1 as subject, mark)
> select name ,max(score) from tmp group by name;
Automatically selecting local only mode for query
name _c1
lisi 90
maliu 99
wangwu 96
Time taken: 3.671 seconds, Fetched: 3 row(s)
12.over 关键字
使用窗口函数之前一般要通过over()进行开窗
partition by子句
在over窗口中进行分区,对某一列进行分区统计,窗口的大小就是分区的大小
order by 子句
order by 子句对输入的数据进行排序
sum(score) over() : 对窗口中所有的数据进行求和,窗口大小为全部的行数据
sum(score) over(partition by id) : 根据id 对数据进行分组,对分组后的窗口进行窗口内的统计sum值
sum(score) over(partition by id order by score) : 根据id进行划分窗口,在窗口内按照score 进行排序操作,然后统计sum值 ,默认组内是第一行到当前所有行的和
Window子句
unbounded preceding。组内第一行数据
n preceding。组内当前行的前n行数据
current row。当前行数据
n following。组内当前行的后n行数据
unbounded following。组内最后一行数据
-- rows between ... and ... frame clause examples.
-- The next two queries are equivalent (assuming ename has no ties): with an
-- order by and no explicit frame, the default frame is
-- "range between unbounded preceding and current row" — a running total.
select ename, sal, deptno,
sum(sal) over(partition by deptno order by ename) from
emp;
select ename, sal, deptno,
sum(sal) over(partition by deptno order by ename
rows between unbounded preceding and
current row
)
from emp;
-- Whole-partition total: first row through last row of the group.
select ename, sal, deptno,
sum(sal) over(partition by deptno order by ename
rows between unbounded preceding and
unbounded following
)
from emp;
-- Sliding window: previous row + current row + next row within the group.
select ename, sal, deptno,
sum(sal) over(partition by deptno order by ename
rows between 1 preceding and 1 following
)
from emp;
排名函数
都是从1开始,生成数据项在分组中的排名。
row_number()。排名顺序增加不会重复;如1、2、3、4、... ...
RANK()。 排名相等会在名次中留下空位;如1、2、2、4、5、... ...
DENSE_RANK()。 排名相等会在名次中不会留下空位 ;如1、2、2、3、4、... ...
-- 需求:连续7天登录的用户
-- 1、使用 row_number 在组内给数据编号(rownum)
-- 2、某个值 - rownum = gid,得到结果可以作为后面分组计算的依据
-- 3、根据求得的gid,作为分组条件,求最终结果
-- Step 1: number each user's login days in date order (rownum).
select uid, dt,
       row_number() over (partition by uid order by dt)
from ulogin
where status = 1;
OK
uid dt _wcol0
1 2019-07-11 1
1 2019-07-12 2
1 2019-07-13 3
1 2019-07-14 4
1 2019-07-15 5
1 2019-07-16 6
1 2019-07-17 7
1 2019-07-18 8
2 2019-07-11 1
2 2019-07-12 2
2 2019-07-14 3
2 2019-07-15 4
2 2019-07-17 5
3 2019-07-11 1
3 2019-07-12 2
3 2019-07-13 3
3 2019-07-15 4
3 2019-07-16 5
3 2019-07-17 6
3 2019-07-18 7
-- Step 2: dt - rownum stays constant (gid) across a run of consecutive days.
select uid, dt,
       date_sub(dt, row_number() over (partition by uid order by dt)) gid
from ulogin
where status = 1;
OK
uid dt gid
1 2019-07-11 2019-07-10
1 2019-07-12 2019-07-10
1 2019-07-13 2019-07-10
1 2019-07-14 2019-07-10
1 2019-07-15 2019-07-10
1 2019-07-16 2019-07-10
1 2019-07-17 2019-07-10
1 2019-07-18 2019-07-10
2 2019-07-11 2019-07-10
2 2019-07-12 2019-07-10
2 2019-07-14 2019-07-11
2 2019-07-15 2019-07-11
2 2019-07-17 2019-07-12
3 2019-07-11 2019-07-10
3 2019-07-12 2019-07-10
3 2019-07-13 2019-07-10
3 2019-07-15 2019-07-11
3 2019-07-16 2019-07-11
3 2019-07-17 2019-07-11
3 2019-07-18 2019-07-11
-- Step 3: group by (uid, gid); a group with >= 7 rows is a 7-day streak.
with t1 as (
    select uid, dt,
           date_sub(dt, row_number() over (partition by uid order by dt)) gid
    from ulogin
    where status = 1
)
select uid, count(*) logincount
from t1
group by uid, gid
having logincount >= 7;
OK
uid logincount
1 8
-- 需求:编写sql语句实现每班前三名,分数一样并列,同时求出前三名按名次排序的分差
-- 求解思路:
-- 1、上排名函数,分数一样并列,所以用dense_rank
-- 2、将上一行数据下移,相减即得到分数差
-- 3、处理 NULL
1 1901 90
2 1901 90
3 1901 83
4 1901 60
5 1902 66
6 1902 23
7 1902 99
8 1902 67
9 1902 87
-- Student scores: sno = student number, one row per student.
-- Matches space-delimited data like: 1 1901 90
create table stu(
sno int,
class string,
score int
)row format delimited fields terminated by ' ';
-- Rank within each class; ties share a rank and leave no gaps (dense_rank).
select sno, class, score,
       dense_rank() over (partition by class order by score desc) as rank
from stu;
sno class score rank
2 1901 90 1
1 1901 90 1
3 1901 83 2
4 1901 60 3
7 1902 99 1
9 1902 87 2
8 1902 67 3
5 1902 66 4
6 1902 23 5
-- Top-3 per class (ties included via dense_rank) plus the score gap to the
-- previous-ranked row.
-- NOTE(review): "rank" used as an alias is a reserved keyword in some Hive
-- versions and may need backticks — verify against the target Hive version.
with tmp as (
select sno, class, score,
dense_rank() over (partition by class order by score
desc) as rank
from stu
)
select class, score, rank,
-- lag() is NULL on each class's first row; nvl(..., 0) maps that to 0.
nvl(score - lag(score) over (partition by class order
by score desc), 0) lagscore
from tmp
where rank<=3;
OK
class score rank lagscore
1901 90 1 0
1901 90 1 0
1901 83 2 -7
1901 60 3 -23
1902 99 1 0
1902 87 2 -12
1902 67 3 -20
序列函数
- lag : 返回当前数据行的上一行数据
- lead:返回当前数据行的下一行数据
- first_value : 取分组内排序后,截止到当前行的第一个值。
- last_value : 取分组内排序后,截止到当前行的最后一个值。
- ntile : 将分组的数据按照顺序切分成n片,返回当前切片的值
-- 测试数据 userpv.dat。cid ctime pv
cookie1,2019-04-10,1
cookie1,2019-04-11,5
cookie1,2019-04-12,7
cookie1,2019-04-13,3
cookie1,2019-04-14,2
cookie1,2019-04-15,4
cookie1,2019-04-16,4
cookie2,2019-04-10,2
cookie2,2019-04-11,3
cookie2,2019-04-12,5
cookie2,2019-04-13,6
cookie2,2019-04-14,3
cookie2,2019-04-15,9
cookie2,2019-04-16,7
-- 建表语句
-- Page views per cookie per day; loaded from comma-delimited userpv.dat.
create table userpv(
cid string,
ctime date,
pv int
)
row format delimited fields terminated by ",";
-- lag / lead
-- Previous-day and next-day pv for each cookie, ordered by date.
select cid, ctime, pv,
       lag(pv)  over (partition by cid order by ctime) lagpv,
       lead(pv) over (partition by cid order by ctime) leadpv
from userpv;
cid ctime pv lagpv leadpv
cookie1 2019-04-10 1 NULL 5
cookie1 2019-04-11 5 1 7
cookie1 2019-04-12 7 5 3
cookie1 2019-04-13 3 7 2
cookie1 2019-04-14 2 3 4
cookie1 2019-04-15 4 2 4
cookie1 2019-04-16 4 4 NULL
cookie2 2019-04-10 2 NULL 3
cookie2 2019-04-11 3 2 5
cookie2 2019-04-12 5 3 6
cookie2 2019-04-13 6 5 3
cookie2 2019-04-14 3 6 9
cookie2 2019-04-15 9 3 7
cookie2 2019-04-16 7 9 NULL
Time taken: 3.853 seconds, Fetched: 14 row(s)
-- first_value / last_value
-- First and last pv of each cookie's full partition.
-- The explicit frame matters for last_value: with order by alone the default
-- frame ends at the current row, so last_value would just return the current
-- row's pv instead of the partition's last value.
select cid, ctime, pv,
first_value(pv) over (partition by cid order by ctime
rows between unbounded preceding and unbounded following) as
firstpv,
last_value(pv) over (partition by cid order by ctime
rows between unbounded preceding and unbounded following) as
lastpv
from userpv;
cid ctime pv firstpv lastpv
cookie1 2019-04-10 1 1 4
cookie1 2019-04-11 5 1 4
cookie1 2019-04-12 7 1 4
cookie1 2019-04-13 3 1 4
cookie1 2019-04-14 2 1 4
cookie1 2019-04-15 4 1 4
cookie1 2019-04-16 4 1 4
cookie2 2019-04-10 2 2 7
cookie2 2019-04-11 3 2 7
cookie2 2019-04-12 5 2 7
cookie2 2019-04-13 6 2 7
cookie2 2019-04-14 3 2 7
cookie2 2019-04-15 9 2 7
cookie2 2019-04-16 7 2 7
Time taken: 1.494 seconds, Fetched: 14 row(s)
-- ntile。按照cid进行分组,每组数据分成2份
-- ntile(2): split each cookie's rows (in date order) into 2 near-equal buckets.
select cid, ctime, pv,
ntile(2) over(partition by cid order by ctime) ntile
from userpv;
cid ctime pv ntile
cookie1 2019-04-10 1 1
cookie1 2019-04-11 5 1
cookie1 2019-04-12 7 1
cookie1 2019-04-13 3 1
cookie1 2019-04-14 2 2
cookie1 2019-04-15 4 2
cookie1 2019-04-16 4 2
cookie2 2019-04-10 2 1
cookie2 2019-04-11 3 1
cookie2 2019-04-12 5 1
cookie2 2019-04-13 6 1
cookie2 2019-04-14 3 2
cookie2 2019-04-15 9 2
cookie2 2019-04-16 7 2
Time taken: 1.507 seconds, Fetched: 14 row(s)
行转列
-- 数据:id course
1 java
1 hadoop
1 hive
1 hbase
2 java
2 hive
2 spark
2 flink
3 java
3 hadoop
3 hive
3 kafka
-- 建表加载数据
-- id/course pairs, space-delimited.
create table rowline1(
id string,
course string
)row format delimited fields terminated by ' ';
-- Load the local sample file into the table.
load data local inpath '/root/data/data1.dat' into table
rowline1;
-- 编写sql,得到结果如下(1表示选修,0表示未选修)
id java hadoop hive hbase spark flink kafka
1 1 1 1 1 0 0 0
2 1 0 1 0 1 1 0
3 1 1 1 0 0 0 1
-- 使用case when;group by + sum
-- Pivot rows to columns: one output row per id; 1 = course taken, 0 = not.
-- String literals use single quotes: in ANSI SQL double quotes delimit
-- identifiers, so 'java' is the correct, portable form.
select id,
       sum(case when course = 'java'   then 1 else 0 end) as java,
       sum(case when course = 'hadoop' then 1 else 0 end) as hadoop,
       sum(case when course = 'hive'   then 1 else 0 end) as hive,
       sum(case when course = 'hbase'  then 1 else 0 end) as hbase,
       sum(case when course = 'spark'  then 1 else 0 end) as spark,
       sum(case when course = 'flink'  then 1 else 0 end) as flink,
       sum(case when course = 'kafka'  then 1 else 0 end) as kafka
from rowline1
group by id;