第2节 网站点击流项目(下):7、hive的级联求和

mac2022-06-30  64

一、hive级联求和的简单例子:

create table t_salary_detail(username string,month string,salary int)row format delimited fields terminated by ',';

load data local inpath '/export/servers/hivedatas/accumulate/t_salary_detail.dat' into table t_salary_detail;

用户 时间 收到小费金额A,2015-01,5A,2015-01,15B,2015-01,5A,2015-01,8B,2015-01,25A,2015-01,5A,2015-02,4A,2015-02,6B,2015-02,10B,2015-02,5A,2015-03,7A,2015-03,9B,2015-03,11B,2015-03,6

需求:统计每个用户每个月总共获得多少小费

select t.month,t.username,sum(salary) as salSumfrom t_salary_detail t group by t.username,t.month;

+----------+-------------+---------+--+| t.month | t.username | salsum |+----------+-------------+---------+--+| 2015-01 | A | 33 | | 2015-02 | A | 10 | | 2015-03 | A | 16 | | 2015-01 | B | 30 | | 2015-02 | B | 15 | | 2015-03 | B | 17 | +----------+-------------+---------+--+

需求:统计每个用户累计小费select t.username,sum(salary) as salSumfrom t_salary_detail t group by t.username;

+----------+-------------+---------+--+| t.month | t.username | salsum | 累计小费+----------+-------------+---------+--+| 2015-01 | A | 33 | 33| 2015-02 | A | 10 | 43| 2015-03 | A | 16 | 59| 2015-01 | B | 30 | 30| 2015-02 | B | 15 | 45| 2015-03 | B | 17 | 62+----------+-------------+---------+--+

第一步:求每个用户的每个月的小费总和

select t.month,t.username,sum(salary) as salSumfrom t_salary_detail t group by t.username,t.month;

+----------+-------------+---------+--+| t.month | t.username | salsum |+----------+-------------+---------+--+| 2015-01 | A | 33 || 2015-02 | A | 10 || 2015-03 | A | 16 || 2015-01 | B | 30 || 2015-02 | B | 15 || 2015-03 | B | 17 |+----------+-------------+---------+--+

 

第二步:使用inner join 实现自己连接自己

selectA.* ,B.*from(select t.month,t.username,sum(salary) as salSumfrom t_salary_detail t group by t.username,t.month) Ainner join (select t.month,t.username,sum(salary) as salSumfrom t_salary_detail t group by t.username,t.month) Bon A.username = B.username;

+----------+-------------+-----------+----------+-------------+-----------+--+| a.month | a.username | a.salsum | b.month | b.username | b.salsum |+----------+-------------+-----------+----------+-------------+-----------+--+取这一个作为一组| 2015-01 | A | 33 | 2015-01 | A | 33 |

| 2015-01 | A | 33 | 2015-02 | A | 10 || 2015-01 | A | 33 | 2015-03 | A | 16 |取这两个作为一组| 2015-02 | A | 10 | 2015-01 | A | 33 || 2015-02 | A | 10 | 2015-02 | A | 10 |

| 2015-02 | A | 10 | 2015-03 | A | 16 |取这三个作为一组| 2015-03 | A | 16 | 2015-01 | A | 33 || 2015-03 | A | 16 | 2015-02 | A | 10 || 2015-03 | A | 16 | 2015-03 | A | 16 |

| 2015-01 | B | 30 | 2015-01 | B | 30 || 2015-01 | B | 30 | 2015-02 | B | 15 || 2015-01 | B | 30 | 2015-03 | B | 17 || 2015-02 | B | 15 | 2015-01 | B | 30 || 2015-02 | B | 15 | 2015-02 | B | 15 || 2015-02 | B | 15 | 2015-03 | B | 17 || 2015-03 | B | 17 | 2015-01 | B | 30 || 2015-03 | B | 17 | 2015-02 | B | 15 || 2015-03 | B | 17 | 2015-03 | B | 17 |+----------+-------------+-----------+----------+-------------+-----------+--+

每一步相对于上一步的结果

加参数继续变形selectA.* ,B.*from(select t.month,t.username,sum(salary) as salSumfrom t_salary_detail t group by t.username,t.month) Ainner join (select t.month,t.username,sum(salary) as salSumfrom t_salary_detail t group by t.username,t.month) Bon A.username = B.usernamewhere B.month <= A.month;

+----------+-------------+-----------+----------+-------------+-----------+--+| a.month | a.username | a.salsum | b.month | b.username | b.salsum |+----------+-------------+-----------+----------+-------------+-----------+--+| 2015-01 | A | 33 | 2015-01 | A | 33 | 33

| 2015-02 | A | 10 | 2015-01 | A | 33 | 43| 2015-02 | A | 10 | 2015-02 | A | 10 |

| 2015-03 | A | 16 | 2015-01 | A | 33 | 59| 2015-03 | A | 16 | 2015-02 | A | 10 || 2015-03 | A | 16 | 2015-03 | A | 16 |

| 2015-01 | B | 30 | 2015-01 | B | 30 | 30

| 2015-02 | B | 15 | 2015-01 | B | 30 | 45| 2015-02 | B | 15 | 2015-02 | B | 15 |

| 2015-03 | B | 17 | 2015-01 | B | 30 | 62| 2015-03 | B | 17 | 2015-02 | B | 15 || 2015-03 | B | 17 | 2015-03 | B | 17 |+----------+-------------+-----------+----------+-------------+-----------+--+

 

第三步:从第二步的结果中继续通过a.month与a.username进行分组,并对分组后的b.salsum进行累加求和即可

selectA.username,A.month,max(A.salSum),sum(B.salSum) as accumulatefrom(select t.month,t.username,sum(salary) as salSum from t_salary_detail t group by t.username,t.month) Ainner join (select t.month,t.username,sum(salary) as salSum from t_salary_detail t group by t.username,t.month) Bon A.username = B.usernamewhere B.month <= A.monthgroup by A.username,A.monthorder by A.username,A.month;

累计的小费进行求和+-------------+----------+------+-------------+--+| a.username | a.month | _c2 | accumulate |+-------------+----------+------+-------------+--+| A | 2015-01 | 33 | 33 || A | 2015-02 | 10 | 43 || A | 2015-03 | 16 | 59 || B | 2015-01 | 30 | 30 || B | 2015-02 | 15 | 45 || B | 2015-03 | 17 | 62 |+-------------+----------+------+-------------+--+

 

二、路径转换(漏斗模型)

0: jdbc:hive2://node03:10000> select * from ods_click_pageviews limit 10;+---------------------------------------+----------------------------------+----------------------------------+---------------------------------+------------------------------+---------------------------------+------------------------------------+----------------------------------------------------+----------------------------------------------------+--------------------------------------+-----------------------------+------------------------------+--+| ods_click_pageviews.session | ods_click_pageviews.remote_addr | ods_click_pageviews.remote_user | ods_click_pageviews.time_local | ods_click_pageviews.request | ods_click_pageviews.visit_step | ods_click_pageviews.page_staylong | ods_click_pageviews.http_referer | ods_click_pageviews.http_user_agent | ods_click_pageviews.body_bytes_sent | ods_click_pageviews.status | ods_click_pageviews.datestr |+---------------------------------------+----------------------------------+----------------------------------+---------------------------------+------------------------------+---------------------------------+------------------------------------+----------------------------------------------------+----------------------------------------------------+--------------------------------------+-----------------------------+------------------------------+--+| 9ff03c4c-25f4-43fc-b3e0-08ea88b38fcc | 1.80.249.223 | - | 2013-09-18 07:57:33 | /hadoop-hive-intro/ | 1 | 60 | "http://www.google.com.hk/url?sa=t&rct=j&q=hive的安装&source=web&cd=2&ved=0CC4QFjAB&url=http://blog.fens.me/hadoop-hive-intro/&ei=5lw5Uo-2NpGZiQfCwoG4BA&usg=AFQjCNF8EFxPuCMrm7CvqVgzcBUzrJZStQ&bvm=bv.52164340,d.aGc&cad=rjt" | "Mozilla/5.0(WindowsNT5.2;rv:23.0)Gecko/20100101Firefox/23.0" | 14764 | 200 | 20130918 || ba46b150-ca63-47d1-9cdd-e278df01f5d3 | 101.226.167.201 | - | 2013-09-18 09:30:36 | /hadoop-mahout-roadmap/ | 1 | 60 | "http://blog.fens.me/hadoop-mahout-roadmap/" | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" | 10335 | 200 | 20130918 || 30e183c4-e03d-4a5a-b5ba-55fff2ea1be1 | 101.226.167.205 | - | 2013-09-18 09:30:32 | /hadoop-family-roadmap/ | 1 | 60 | "http://blog.fens.me/hadoop-family-roadmap/" | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" | 11715 | 200 | 20130918 || ea77f279-451d-4efa-8a7f-3d321675ad4d | 101.226.169.215 | - | 2013-09-18 10:07:31 | /about | 1 | 60 | "http://blog.fens.me/about" | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" | 5 | 301 | 20130918 || bbe902cb-9496-46fe-b201-2065996373c3 | 110.211.10.14 | - | 2013-09-18 13:31:10 | /hadoop-mahout-roadmap/ | 1 | 60 | "http://f.dataguru.cn/forum.php?mod=viewthread&tid=175501" | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;WOW64;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MALN;InfoPath.2;.NET4.0C;MediaCenterPC6.0)" | 10335 | 200 | 20130918 || 1646b21a-a2d6-40ef-ab7a-56496ba0e493 | 111.161.17.104 | - | 2013-09-18 12:17:25 | /hadoop-hive-intro/ | 1 | 60 | "http://blog.fens.me/series-hadoop-cloud/" | "Mozilla/5.0(WindowsNT6.2;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/29.0.1547.66Safari/537.36" | 14763 | 200 | 20130918 || d8261f93-be31-45a9-82d9-094657157468 | 111.193.224.9 | - | 2013-09-18 07:17:25 | /hadoop-family-roadmap/ | 1 | 60 | "https://www.google.com.hk/" | "Mozilla/5.0(Macintosh;IntelMacOSX10_8_5)AppleWebKit/537.36(KHTML,likeGecko)Chrome/29.0.1547.57Safari/537.36" | 11715 | 200 | 20130918 || a854f511-3ca9-406b-8a34-5c9cc6a61782 | 112.65.193.16 | - | 2013-09-18 08:48:31 | /hadoop-mahout-roadmap/ | 1 | 60 | "-" | "Mozilla/4.0" | 38590 | 200 | 20130918 || 01c87dee-e91c-4ad7-b3cc-0c121bc03806 | 113.107.237.31 | - | 2013-09-18 09:06:46 | /finance-rhive-repurchase/ | 1 | 60 | "-" | "-" | 45271 | 200 | 20130918 || f8660978-9b1e-414b-a075-c04f1cb7197a | 113.90.232.163 | - | 2013-09-19 00:58:00 | /hadoop-mahout-roadmap/ | 1 | 60 | "http://h2w.iask.cn/jump.php?url=http://blog.fens.me/hadoop-mahout-roadmap/" | "Mozilla/5.0(iPhone;CPUiPhoneOS6_0_1likeMacOSX)AppleWebKit/536.26(KHTML,likeGecko)Mobile/10A523" | 10321 | 200 | 20130918 |+---------------------------------------+----------------------------------+----------------------------------+---------------------------------+------------------------------+---------------------------------+------------------------------------+----------------------------------------------------+----------------------------------------------------+--------------------------------------+-----------------------------+------------------------------+--+

求两个指标: 第一个指标:每一步相对于第一步的转化率 第二个指标:每一步相对于上一步的转化率

# 使用模型生成的数据,可以满足我们的转化率的求取load data inpath '/weblog/clickstream/pageviews/click-part-r-00000' overwrite into table ods_click_pageviews partition(datestr='20130920');load data local inpath '/export/servers/hivedatas/click-part-r-00000' overwrite into table ods_click_pageviews partition(datestr='20130920');

-------------------------------------------------------------1、查询每一个步骤的总访问人数

Step1、 /item 1000 相对上一步 相对第一步 1000Step2、 /category 800 0.8 0.8 1800Step3、 /index 500 0.625 0.5 2300Step4、 /order 100 0.2 0.1 2400

 

 

create table dw_oute_numbs as select 'step1' as step,count(distinct remote_addr) as numbs from ods_click_pageviews where datestr='20130920' and request like '/item%'union allselect 'step2' as step,count(distinct remote_addr) as numbs from ods_click_pageviews where datestr='20130920' and request like '/category%'union allselect 'step3' as step,count(distinct remote_addr) as numbs from ods_click_pageviews where datestr='20130920' and request like '/order%'union allselect 'step4' as step,count(distinct remote_addr) as numbs from ods_click_pageviews where datestr='20130920' and request like '/index%';

+---------------------+----------------------+--+| dw_oute_numbs.step | dw_oute_numbs.numbs |+---------------------+----------------------+--+| step1 | 1029 || step2 | 1029 || step3 | 1028 || step4 | 1018 |+---------------------+----------------------+--+

 

 

------------------------------------------------------------------------------2、查询每一步骤相对于路径起点人数的比例--级联查询,自己跟自己join

select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs from dw_oute_numbs rninner join dw_oute_numbs rr;

自join后结果如下图所示:每一步相对于第一步的转化率

+---------+----------+---------+----------+--+| rnstep | rnnumbs | rrstep | rrnumbs |+---------+----------+---------+----------+--+| step1 | 1029 | step1 | 1029 || step2 | 1029 | step1 | 1029 || step3 | 1028 | step1 | 1029 || step4 | 1018 | step1 | 1029 || step1 | 1029 | step2 | 1029 || step2 | 1029 | step2 | 1029 || step3 | 1028 | step2 | 1029 || step4 | 1018 | step2 | 1029 || step1 | 1029 | step3 | 1028 || step2 | 1029 | step3 | 1028 || step3 | 1028 | step3 | 1028 || step4 | 1018 | step3 | 1028 || step1 | 1029 | step4 | 1018 || step2 | 1029 | step4 | 1018 || step3 | 1028 | step4 | 1018 || step4 | 1018 | step4 | 1018 |+---------+----------+---------+----------+--+

过滤只取step1的所有的数据select tempTab.rnnumbs/tempTab.rrnumbs from (select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs from dw_oute_numbs rninner join dw_oute_numbs rr where rr.step = 'step1') tempTab;

 

--每一步的人数/第一步的人数==每一步相对起点人数比例select tmp.rnstep,tmp.rnnumbs/tmp.rrnumbs as ratiofrom(select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs from dw_oute_numbs rninner join dw_oute_numbs rr) tmpwhere tmp.rrstep='step1';

简化sql语句;select a.step,a.numbs/b.numbs compareFirst from dw_oute_numbs a,dw_oute_numbs b where b.step='step1'order by a.step;+---------+---------------------+--+| a.step | comparefirst |+---------+---------------------+--+| step1 | 1.0 || step2 | 1.0 || step3 | 0.9990281827016521 || step4 | 0.989310009718173 |+---------+---------------------+--+

----------------------------------------------------------------------------------3、查询每一步骤相对于上一步骤的漏出率--首先通过自join表过滤出每一步跟上一步的记录

select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs from dw_oute_numbs rninner join dw_oute_numbs rrwhere cast(substr(rn.step,5,1) as int)=cast(substr(rr.step,5,1) as int)-1;

select newTable.rnnumbs/newTable.rrnumbs from (select * from (select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs from dw_oute_numbs rninner join dw_oute_numbs rr ) tmpTablewhere cast(substr(tmpTable.rrStep,5,1) as int ) = cast(substr(tmpTable.rnstep,5,1) as int )-1) newTable

where temTable.rrstep.截串 >= temTable.rnstep.截串

注意:cast为hive的内置函数,主要用于类型的转换用例:select cast(1 as float);select cast('2018-06-22' as date);

+---------+----------+---------+----------+--+| rnstep | rnnumbs | rrstep | rrnumbs |+---------+----------+---------+----------+--+| step1 | 1029 | step2 | 1029 || step2 | 1029 | step3 | 1028 || step3 | 1028 | step4 | 1018 |+---------+----------+---------+----------+--+

--然后就可以非常简单的计算出每一步相对上一步的漏出率select tmp.rrstep as step,tmp.rrnumbs/tmp.rnnumbs as leakage_ratefrom(select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs from dw_oute_numbs rninner join dw_oute_numbs rr) tmpwhere cast(substr(tmp.rnstep,5,1) as int)=cast(substr(tmp.rrstep,5,1) as int)-1;

我的思路:-- cast('123' AS FLOAT);字符串显示转换为浮点数 select a.step,a.numbs,b.step,b.numbs,case when a.numbs is null then '相对上一步' else b.numbs/a.numbs end compareLast from dw_oute_numbs a right outer join dw_oute_numbs b on regexp_replace(b.step,'step','')=regexp_replace(a.step,'step','')+1 --regexp_replace函数改为substr(b.step,5,1)=substr(a.step,5,1)+1,不用正则表达式,可以提高效率order by b.step;+---------+----------+---------+----------+---------------------+--+| a.step | a.numbs | b.step | b.numbs | comparelast |+---------+----------+---------+----------+---------------------+--+| NULL | NULL | step1 | 1029 | 相对前一步 || step1 | 1029 | step2 | 1029 | 1.0 || step2 | 1029 | step3 | 1028 | 0.9990281827016521 || step3 | 1028 | step4 | 1018 | 0.9902723735408561 |+---------+----------+---------+----------+---------------------+--+

-------------------------------------------------------------------------------------4、汇总以上两种指标select abs.step,abs.numbs,abs.rate as abs_ratio,rel.rate as leakage_ratefrom (select tmp.rnstep as step,tmp.rnnumbs as numbs,tmp.rnnumbs/tmp.rrnumbs as ratefrom(select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs from dw_oute_numbs rninner join dw_oute_numbs rr) tmpwhere tmp.rrstep='step1') absleft outer join(select tmp.rrstep as step,tmp.rrnumbs/tmp.rnnumbs as ratefrom(select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs from dw_oute_numbs rninner join dw_oute_numbs rr) tmpwhere cast(substr(tmp.rnstep,5,1) as int)=cast(substr(tmp.rrstep,5,1) as int)-1) relon abs.step=rel.step;

 

转载于:https://www.cnblogs.com/mediocreWorld/p/11108484.html

最新回复(0)