1. 创建数据库,切换数据库
create database testdb; use testdb;
2. 创建管理表
create table emp( empno int, empname string, job string, mgr int, hiredate string, salary double, comm double, deptno int) row format delimited fields terminated by ‘\t‘; 加载数据 load data local inpath ‘/opt/test/emp.txt‘ overwrite into table emp;
101 ‘duan‘ ‘it‘ 1, ‘hiredate‘ 100.0 10.0 1
102 ‘duan2‘ ‘product‘ 1, ‘2018‘ 200.0 20.0 1
3. 创建外部表
hdfs dfs -mkdir /user/hive/warehouse/testdb2.db/emp_ext hdfs dfs -put emp.txt /user/hive/warehouse/testdb2.db/emp_ext/
create external table emp_ext( empno int, empname string, job string, mgr int, hiredate string, salary double, comm double, deptno int) row format delimited fields terminated by ‘\t‘ location ‘/user/hive/warehouse/testdb2.db/emp_ext/‘;
4. 创建分区表
create table emp_part( empno int, empname string, job string, mgr int, hiredate string, salary double, comm double, deptno int) partitioned by (year string, month string) row format delimited fields terminated by ‘\t‘;
注意: 分区字段不能与表中已定义的普通字段重名,否则建表时会报错 FAILED: SemanticException [Error 10035]: Column repeated in partitioning columns
load data local inpath ‘/opt/test/emp.txt‘ into table emp_part partition (year=‘2016‘, month=‘3‘); load data local inpath ‘/opt/test/emp.txt‘ into table emp_part partition (year=‘2016‘, month=‘4‘);
alter table emp_part add partition (year=‘2016‘, month=‘5‘) location ‘/data‘;
load data inpath ‘/emp.txt‘ into table emp_part partition (year=‘2016‘, month=‘6‘);
(1) create-as
create table emp3 as select * from emp;
(2) create-like
create table emp4 like emp; load data local inpath ‘/opt/test/emp.txt‘ overwrite into table emp4;
insert overwrite table emp4 select * from emp;
(1) 指定orc格式
create table emp_orc( empno int, empname string, job string, mgr int, hiredate string, salary double, comm double, deptno int) stored as orc;
指定为非文本格式时无需再指定row format delimited fields terminated by ‘\t‘
插入数据 insert into table emp_orc select * from emp;
create table emp_orc2 like emp_orc; 插入数据 insert overwrite table emp_orc2 select * from emp;
(2) 指定orc+snappy格式
create table emp_orc_snappy( empno int, empname string, job string, mgr int, hiredate string, salary double, comm double, deptno int) stored as orc tblproperties("orc.compression"="snappy"); 插入数据 insert overwrite table emp_orc_snappy select * from emp;
create table emp_orc_snappy2 like emp_orc tblproperties ("orc.compression"="snappy"); insert overwrite table emp_orc_snappy2 select * from emp;
create table emp_orc_snappy3 stored as orc tblproperties("orc.compression"="snappy") as select * from emp;
hive -e "select * from db_hive01.emp"
hive -f emp.hql
hive --hiveconf hive.root.logger=DEBUG,console
insert overwrite local directory ‘/opt/test/local‘ row format delimited fields terminated by ‘\t‘ select * from emp;
如果不指定row format delimited fields terminated by ‘\t‘,字段间使用默认分隔符\001(^A,不可见字符),导出的文件看起来像没有分隔符
hive -e ‘select * from testdb2.emp‘ >> ./emp_export.txt
insert overwrite directory ‘/export_data‘ select * from emp;
hive 0.13.1版本还不支持导出数据到hdfs时指定分隔符row format delimited fields terminated by ‘\t‘
export table emp to ‘/export_data‘;
导出后会生成/export_data/data目录, emp.txt存放在此目录中,即/export_data/data/emp.txt
9. 排序
(1)order by 全局排序
insert overwrite local directory ‘/opt/test/local‘ row format delimited fields terminated by ‘\t‘ select * from emp order by empno;
(2)sort by 与 distribute by
distribute by 类似MR中的partition,对数据进行分区,需结合sort by使用
sort by 在每个reduce内部进行排序,全局不保证有序; distribute by 一定要放在sort by 前面
set mapreduce.job.reduces=3; insert overwrite local directory ‘/opt/test/local‘ row format delimited fields terminated by ‘\t‘ select * from emp distribute by deptno sort by empno;
(3)cluster by
当distribute by和sort by 字段一样的时候,直接使用cluster by
select upper(empname) from emp; select unix_timestamp(trackTime) from bflog limit 3 ; select year(hiredate) from emp ; select month(hiredate) from emp ; select hour(hiredate) from emp ; select substr(hiredate,1,4) from emp ; select split(hiredate,‘-‘)[1] from emp ; select reverse(hiredate) from emp ; select concat(empno,‘-‘,empname) from emp ; case when 条件1 then ... when 条件2 then ... else end
可以使用desc function substr 查看函数说明, substr第二个参数为起始位置index(从1开始计数),第三个参数为截取长度length
11. 自定义UDF
add jar /opt/test/mylower.jar ; CREATE TEMPORARY FUNCTION mylower AS ‘org.gh.hadoop.hive.MyLower‘;
12. 使用正则表达式加载数据字段
create table beifenglog( remote_addr string, remote_user string, time_local string, request string, status string, body_bytes_sent string, request_body string, http_referer string, http_user_agent string, http_x_forwarded_for string, host string) row format serde ‘org.apache.hadoop.hive.contrib.serde2.RegexSerDe‘ with serdeproperties( "input.regex" = "(\\\"[\\d\\.]+\\\") (\\\"[^ ]+\\\") (\\\".*?\\\") (\\\".*?\\\") (\\\"\\d+\\\") (\\\"\\d+\\\") ([^ ]+) (\\\"[^ ]+\\\") (\\\".*?\\\") (\\\"[^ ]+\\\") (\\\"[^ ]+\\\")" ) stored as textfile; 加载原表数据 load data local inpath ‘/opt/test/beifenglog.data‘ overwrite into table beifenglog;
(1)在创建表(无论管理表还是外部表)时,如果没有指定location,可以使用load data加载数据
a) 指定本地目录中的数据,会上传数据文件到hdfs中
b) 指定hdfs中数据文件,如果指定的路径与表所在的目录不一致,则移动数据文件到表目录中
create external table emp_ext2 like emp; load data inpath ‘/emp.txt‘ into table emp_ext2; 会把/emp.txt移动到/user/hive/warehouse/testdb2.db/emp_ext2/目录中
create table emp2 like emp; load data inpath ‘/emp.txt‘ into table emp2; 会把/emp.txt移动到/user/hive/warehouse/testdb2.db/emp2/目录中
(2)create-like时不能指定stored as为其他格式,否则报错
以下操作会报错 FAILED: ParseException line 1:31 missing EOF at ‘stored‘ near ‘emp‘
create table emp_orc2 like emp stored as orc;