Monday, November 16, 2015

Hive Cheat Sheet


--Create Partitioned table from a non partitioned table
set hive.exec.dynamic.partition = true;
set hive.exec.dynamic.partition.mode = nonstrict;

create table user.city (id int, name string, countrycode string, district string, population string);
load data local inpath '/home/user/data/cities.pipes' into table user.city;

create external table part_city (id int, name string, district string, population int) partitioned by (countrycode string) location '/user/user/fads/city';

insert overwrite table part_city partition(countrycode) select id, name, district, population, countrycode from city;

***Careful, insert overwrite does some weird stuff to the source table, use insert into instead


--Create Partitioned table from a non partitioned table  (Compressed)
set mapred.output.compress=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GZipCodec;

create external table part_city (id string, name string, district string, population string) partitioned by (countrycode string) row format delimited lines terminated by '\n' stored as orcfile location '/user/user/fads/city';
insert overwrite table part_city partition(countrycode) select id, name, district, population, countrycode from city;

--Equivalent of drop column, in the columns () select the columns you want to retain, data will not be lost
alter table city replace columns(id string, name string, countrycode string, district string, population string);

--Create Table from existing table
create table dupcity like city;

--Get insights how the table was created
show create table city;


--Use UDF, transform and create a new table
add file /path/to/udf.py;
insert into table temp select transform(id, name, countrycode, district, population) using 'python udf.py' AS (id, name, countrycode, district, population) from city;


Examples use the cities database found in mysql example databases

python code for udf used:(Expands few abbreviated  country codes)

import sys

lkp = {'VAT':'Vatican','SYR':'Syria','BRA':'Brazil','PSE':'Palestine'}

for lines in sys.stdin:
        fields = lines.split('\t')
        print '\t'.join(map(lkp.get, fields, fields)).strip()

No comments:

Post a Comment