--Create Partitioned table from a non partitioned table
set hive.exec.dynamic.partition = true;
set hive.exec.dynamic.partition.mode = nonstrict;
create table user.city (id int, name string, countrycode string, district string, population string);
load data local inpath '/home/user/data/cities.pipes' into table user.city;
create external table part_city (id int, name string, district string, population int) partitioned by (countrycode string) location '/user/user/fads/city';
insert overwrite table part_city partition(countrycode) select id, name, district, population, countrycode from city;
***Careful, insert overwrite does some weird stuff to the source table, use insert into instead
--Create Partitioned table from a non partitioned table (Compressed)
set mapred.output.compress=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GZipCodec;
create external table part_city (id string, name string, district string, population string) partitioned by (countrycode string) row format delimited lines terminated by '\n' stored as orcfile location '/user/user/fads/city';
insert overwrite table part_city partition(countrycode) select id, name, district, population, countrycode from city;
--Equivalent of drop column, in the columns () select the columns you want to retain, data will not be lost
alter table city replace columns(id string, name string, countrycode string, district string, population string);
--Create Table from existing table
create table dupcity like city;
--Get insights how the table was created
show create table city;
--Use UDF, transform and create a new table
add file /path/to/udf.py;
insert into table temp select transform(id, name, countrycode, district, population) using 'python udf.py' AS (id, name, countrycode, district, population) from city;
Examples use the cities database found in mysql example databases
python code for udf used:(Expands few abbreviated country codes)
import sys
lkp = {'VAT':'Vatican','SYR':'Syria','BRA':'Brazil','PSE':'Palestine'}
for lines in sys.stdin:
fields = lines.split('\t')
print '\t'.join(map(lkp.get, fields, fields)).strip()
No comments:
Post a Comment