原始数据表:对应mr清洗完之后的数据,而不是原始日志数据
drop table if exists ods_weblog_origin;create table ods_weblog_origin(valid string,remote_addr string,remote_user string,time_local string,request string,status string,body_bytes_sent string,http_referer string,http_user_agent string)partitioned by (datestr string)row format delimitedfields terminated by '\001';
---------------------------------------------------点击流pageview表
drop table if exists ods_click_pageviews;create table ods_click_pageviews(session string,remote_addr string,remote_user string,time_local string,request string,visit_step string,page_staylong string,http_referer string,http_user_agent string,body_bytes_sent string,status string)partitioned by (datestr string)row format delimitedfields terminated by '\001';
-----------------------------------------------点击流visit表
drop table if exists ods_click_stream_visit;create table ods_click_stream_visit(session string,remote_addr string,inTime string,outTime string,inPage string,outPage string,referal string,pageVisits int)partitioned by (datestr string)row format delimitedfields terminated by '\001';
-------------------------------------------维度表示例:drop table if exists t_dim_time;create table t_dim_time(date_key int,year string,month string,day string,hour string) row format delimited fields terminated by ',';
hdfs dfs -mkdir -p /weblog/preprocessedhdfs dfs -mkdir -p /weblog/clickstream/pageviewshdfs dfs -mkdir -p /weblog/clickstream/visitshdfs dfs -mkdir -p /weblog/dim_time
hdfs dfs -put part-m-00000 /weblog/preprocessedhdfs dfs -put part-r-00000 /weblog/clickstream/pageviewshdfs dfs -put part-r-00000 /weblog/clickstream/visitshdfs dfs -put dim_time_dat.txt /weblog/dim_time
设置hive本地模式运行(开启本地模式,解决多个小文件输入的时候,分配资源时间超过数据的计算时间的问题)set hive.exec.mode.local.auto=true;
--------------------------------------------------------------------------
导入清洗结果数据到贴源数据表ods_weblog_origin
load data inpath '/weblog/preprocessed/' overwrite into table ods_weblog_origin partition(datestr='20130918');show partitions ods_weblog_origin;select count(*) from ods_weblog_origin;---------------------------------------------------------------------------导入点击流模型pageviews数据到ods_click_pageviews表
load data inpath '/weblog/clickstream/pageviews' overwrite into table ods_click_pageviews partition(datestr='20130918');select count(*) from ods_click_pageviews;-----------------------------------------------------------------------------导入点击流模型visit数据到ods_click_stream_visit表
load data inpath '/weblog/clickstream/visits' overwrite into table ods_click_stream_visit partition(datestr='20130918');
----------------------------------------------------------------------------------------------------------------------时间维度表数据导入参考数据《dim_time_dat.txt》load data inpath '/weblog/dim_time' overwrite into table t_dim_time;
建表——明细宽表 ods_weblog_detail
drop table ods_weblog_detail;create table ods_weblog_detail(valid string, --有效标识remote_addr string, --来源IPremote_user string, --用户标识time_local string, --访问完整时间daystr string, --访问日期timestr string, --访问时间month string, --访问月day string, --访问日hour string, --访问时request string, --请求的urlstatus string, --响应码body_bytes_sent string, --传输字节数http_referer string, --来源urlref_host string, --来源的hostref_path string, --来源的路径ref_query string, --来源参数queryref_query_id string, --来源参数query的值http_user_agent string --客户终端标识)partitioned by(datestr string);
-------------------------------------------------------------------------------通过查询插入数据到明细宽表 ods_weblog_detail中
分步:--抽取refer_url到中间表 t_ods_tmp_referurl --也就是将来访url分离出host path query query id--eg. http://www.google.com/url?sa=t&rct=j&q=nodejs 异步广播&source=web&cd=1&cad=rja&ved=0CCgQFjAA&url=http://blog.fens.me/nodejs-socketio-chat/&ei=rko5UrylAefOiAe7_IGQBw&usg=AFQjCNG6YWoZsJ_bSj8kTnMHcH51hYQkAA&bvm=bv.52288139,d.aGcdrop table if exists t_ods_tmp_referurl;create table t_ods_tmp_referurl asSELECT a.*,b.*FROM ods_weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as host, path, query, query_id;
--抽取转换time_local字段到中间表明细表 t_ods_tmp_detail --eg. 2013-09-18 06:49:42 drop table if exists t_ods_tmp_detail;create table t_ods_tmp_detail as select b.*,substring(time_local,0,10) as daystr,substring(time_local,12) as tmstr,substring(time_local,6,2) as month,substring(time_local,9,2) as day,substring(time_local,12,2) as hourFrom t_ods_tmp_referurl b;
以上语句可以改写成:insert into table ods_weblog_detail partition(datestr='20130918')select c.valid,c.remote_addr,c.remote_user,c.time_local,substring(c.time_local,0,10) as daystr,substring(c.time_local,12) as tmstr,substring(c.time_local,6,2) as month,substring(c.time_local,9,2) as day,substring(c.time_local,12,2) as hour,c.request,c.status,c.body_bytes_sent,c.http_referer,c.ref_host,c.ref_path,c.ref_query,c.ref_query_id,c.http_user_agentfrom (SELECT a.valid,a.remote_addr,a.remote_user,a.time_local,a.request,a.status,a.body_bytes_sent,a.http_referer,a.http_user_agent,b.ref_host,b.ref_path,b.ref_query,b.ref_query_id FROM ods_weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(a.http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as ref_host, ref_path, ref_query, ref_query_id) c;
show partitions ods_weblog_detail;
转载于:https://www.cnblogs.com/mediocreWorld/p/11107178.html
