-- Target fact table: one row per (market, category, monthid).
-- `d` holds free-form string attributes as a key/value map.
-- ORC gives columnar storage + compression for the ~45M-row load below.
CREATE TABLE IF NOT EXISTS n_data (
    market   STRING,
    category STRING,
    d        MAP<STRING, STRING>,
    monthid  INT,
    value    DOUBLE
)
STORED AS ORC;
我向该表加载了超过 45,000,000 行数据,然后查看了 Hive warehouse 中对应的目录。
结果表由 5 个文件组成,每个文件大小为 10MB–20MB,而 dfs.block.size 设置为 128MB。这对小文件并不理想:虽然小文件不会真正占满整个物理块,但过多的小文件会增加 NameNode 的元数据开销,并在读取时产生过多的分片(split)。
如何让 Hive 把输出文件合并成约 128 MB 的文件? 补充(编辑):插入数据所用的查询如下:
-- Unpivot n_src: emit one row per month key present in the `factperiod` map,
-- looking up that month's value as `fact`.
-- NOTE(review): to address the small-output-files problem, set these before
-- running the INSERT (Hive merges output files up to the target size):
--   SET hive.merge.mapfiles=true;
--   SET hive.merge.mapredfiles=true;
--   SET hive.merge.size.per.task=134217728;          -- target ~128 MB files
--   SET hive.merge.smallfiles.avgsize=134217728;     -- trigger merge pass
INSERT INTO n_data
SELECT
    tmp.market,
    tmp.category,
    tmp.d,
    adTable.monthid,
    tmp.factperiod[adTable.monthid] AS fact
FROM (
    -- Pre-filter rows and materialize the map's key list for explode().
    SELECT
        market,
        category,
        d,
        factperiod,
        map_keys(factperiod) AS month_arr
    FROM n_src
    WHERE market IS NOT NULL
) AS tmp
LATERAL VIEW explode(month_arr) adTable AS monthid;