用户名,月份,访问次数
A,2015-01,5 A,2015-01,15 B,2015-01,5 A,2015-01,8 B,2015-01,25 A,2015-01,5 A,2015-02,4 A,2015-02,6 B,2015-02,10 B,2015-02,5 A,2015-03,16 A,2015-03,22 B,2015-03,23 B,2015-03,10 B,2015-03,1
use myhive; create external table if not exists t_access( uname string comment ‘用户名‘, umonth string comment ‘月份‘, ucount int comment ‘访问次数‘ ) comment ‘用户访问表‘ row format delimited fields terminated by "," location "/hive/t_access";
load data local inpath "/home/hadoop/access.txt" into table t_access;
select * from t_access;
现要求出:
每个用户截止到每月为止的最大单月访问次数和累计到该月的总访问次数,结果数据格式如下
此结果需要根据用户+月份进行分组
--求当月访问次数 create table tmp_access( name string, mon string, num int ); insert into table tmp_access select uname,umonth,sum(ucount) from t_access t group by t.uname,t.umonth;
select * from tmp_access;
create view tmp_view as select a.name anme,a.mon amon,a.num anum,b.name bname,b.mon bmon,b.num bnum from tmp_access a join tmp_access b on a.name=b.name; select * from tmp_view;
select anme,amon,anum,max(bnum) as max_access,sum(bnum) as sum_access from tmp_view where amon>=bmon group by anme,amon,anum;
use myhive; CREATE TABLE `course` ( `id` int, `sid` int , `course` string, `score` int ) ;
// 插入数据 // 字段解释:id, 学号, 课程, 成绩 INSERT INTO `course` VALUES (1, 1, ‘yuwen‘, 43); INSERT INTO `course` VALUES (2, 1, ‘shuxue‘, 55); INSERT INTO `course` VALUES (3, 2, ‘yuwen‘, 77); INSERT INTO `course` VALUES (4, 2, ‘shuxue‘, 88); INSERT INTO `course` VALUES (5, 3, ‘yuwen‘, 98); INSERT INTO `course` VALUES (6, 3, ‘shuxue‘, 65);
求:所有数学课程成绩 大于 语文课程成绩的学生的学号
create view tmp_course_view as select sid, case course when "shuxue" then score else 0 end as shuxue, case course when "yuwen" then score else 0 end as yuwen from course; select * from tmp_course_view;
create view tmp_course_view1 as select aa.sid, max(aa.shuxue) as shuxue, max(aa.yuwen) as yuwen from tmp_course_view aa group by sid; select * from tmp_course_view1;
select * from tmp_course_view1 where shuxue > yuwen;
数据格式
2010012325
具体数据
2014010216
2014010410
2012010609
2012010812
2012011023
2001010212
2001010411
2013010619
2013010812
2013011023
2008010216
2008010414
2007010619
2007010812
2007011023
2010010216
2010010410
2015010649
2015010812
2015011023
2014010114 2014010216 2014010317 2014010410 2014010506 2012010609 2012010732 2012010812 2012010919 2012011023 2001010116 2001010212 2001010310 2001010411 2001010529 2013010619 2013010722 2013010812 2013010929 2013011023 2008010105 2008010216 2008010337 2008010414 2008010516 2007010619 2007010712 2007010812 2007010999 2007011023 2010010114 2010010216 2010010317 2010010410 2010010506 2015010649 2015010722 2015010812 2015010999 2015011023
数据解释
2010012325表示在2010年01月23日的气温为25度
比如:2010012325表示在2010年01月23日的气温为25度。现在要求使用hive,计算每一年出现过的最大气温的日期+温度。
要计算出每一年的最大气温。我用
select substr(data,1,4),max(substr(data,9,2)) from table2 group by substr(data,1,4);
出来的是 年份 + 温度 这两列数据例如 2015 99
但是如果我是想select 的是:具体每一年最大气温的那一天 + 温度 。例如 20150109 99
请问该怎么执行hive语句。。
group by 只需要substr(data,1,4),
但是select substr(data,1,8),又不在group by 的范围内。
是我陷入了思维死角。一直想不出所以然。。求大神指点一下。
在select 如果所需要的。不在group by的条件里。这种情况如何去分析?
create table tmp_weather as select substr(data,1,4) years,substr(data,5,2) months,substr(data,7,2) days,substr(data,9,2) temp from weather;
select * from tmp_weather;
create table tmp_year_weather as select substr(data,1,4) years,max(substr(data,9,2)) max_temp from weather group by substr(data,1,4);
select * from tmp_year_weather;
select * from tmp_year_weather a join tmp_weather b on a.years=b.years and a.max_temp=b.temp;
id course 1,a 1,b 1,c 1,e 2,a 2,c 2,d 2,f 3,a 3,b 3,c 3,e
表示有id为1,2,3的学生选修了课程a,b,c,d,e,f中其中几门。
create table t_course(id int,course string) row format delimited fields terminated by ",";
load data local inpath "/home/hadoop/course/course.txt" into table t_course;
编写Hive的HQL语句来实现以下结果:表中的1表示选修,表中的0表示未选修
id a b c d e f 1 1 1 1 0 1 0 2 1 0 1 1 0 1 3 1 1 1 0 1 0
第一步:
select collect_set(course) as courses from id_course;
第二步:
set hive.strict.checks.cartesian.product=false; create table id_courses as select t1.id as id,t1.course as id_courses,t2.course courses from ( select id as id,collect_set(course) as course from id_course group by id ) t1 join (select collect_set(course) as course from id_course) t2;
启用严格模式:hive.mapred.mode = strict // Deprecated
hive.strict.checks.large.query = true
该设置会禁用:1. 不指定分页的orderby
2. 对分区表不指定分区进行查询
3. 和数据量无关,只是一个查询模式hive.strict.checks.type.safety = true
严格类型安全,该属性不允许以下操作:1. bigint和string之间的比较
2. bigint和double之间的比较hive.strict.checks.cartesian.product = true
该属性不允许笛卡尔积操作
第三步:得出最终结果:
思路:
拿出course字段中的每一个元素在id_courses中进行判断,看是否存在。
select id, case when array_contains(id_courses, courses[0]) then 1 else 0 end as a, case when array_contains(id_courses, courses[1]) then 1 else 0 end as b, case when array_contains(id_courses, courses[2]) then 1 else 0 end as c, case when array_contains(id_courses, courses[3]) then 1 else 0 end as d, case when array_contains(id_courses, courses[4]) then 1 else 0 end as e, case when array_contains(id_courses, courses[5]) then 1 else 0 end as f from id_courses;
a,01,150 a,01,200 b,01,1000 b,01,800 c,01,250 c,01,220 b,01,6000 a,02,2000 a,02,3000 b,02,1000 b,02,1500 c,02,350 c,02,280 a,03,350 a,03,250
店铺,月份,金额
use class; create table t_store( name string, months int, money int ) row format delimited fields terminated by ",";
load data local inpath "/home/hadoop/store.txt" into table t_store;
编写Hive的HQL语句求出每个店铺的当月销售额和累计到当月的总销售额
(1)按照商店名称和月份进行分组统计
create table tmp_store1 as select name,months,sum(money) as money from t_store group by name,months; select * from tmp_store1;
(2)对tmp_store1 表里面的数据进行自连接
create table tmp_store2 as select a.name aname,a.months amonths,a.money amoney,b.name bname,b.months bmonths,b.money bmoney from tmp_store1 a join tmp_store1 b on a.name=b.name order by aname,amonths; select * from tmp_store2;
(3)比较统计
select aname,amonths,amoney,sum(bmoney) as total from tmp_store2 where amonths >= bmonths group by aname,amonths,amoney;
原文:https://www.cnblogs.com/h-kang/p/10916643.html