Monday, October 15, 2012

Hive SerDe !

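SerDe stands for Serialisation/Deserialisation; it is one of the most powerful features of Hive/Hadoop. For example, the table definition below uses a RegexSerDe to split raw access-log lines into columns: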
-- t_access_log_part: external, text-backed access-log table, partitioned by c_date.
-- Each line is split by the contrib RegexSerDe. The input.regex below has 15
-- capture groups, which line up one-to-one with the 15 columns in declaration order.
--
-- Column / capture-group guide (column names are French):
--   c_proxy            leading run of letters, digits and hyphens, before the tab
--   c_ip               dotted-quad address right after the tab
--                      (one space-delimited token after the address is skipped)
--   c_timetaken        run of digits
--   c_jour, c_mois, c_annee, c_hour, c_reste_timestamp
--                      pieces of the bracketed timestamp: day / month / year : hour : rest
--   c_commande, c_fichier, c_protocole
--                      first quoted field: leading word, middle part, trailing token
--   c_code_retour      alphanumeric token after the first quoted field (return code)
--   c_size             alphanumeric-or-hyphen token that follows
--   c_reste, c_identifiant
--                      contents of the last two double-quoted fields
--
-- All columns are STRING, no type conversion is done in this statement.
-- No LOCATION clause is given here.
-- NOTE(review): the contrib SerDe class presumably requires the hive-contrib jar
-- to be available to the session - confirm for the target deployment.
CREATE EXTERNAL TABLE t_access_log_part (
    c_proxy            STRING,
    c_ip               STRING,
    c_timetaken        STRING,
    c_jour             STRING,
    c_mois             STRING,
    c_annee            STRING,
    c_hour             STRING,
    c_reste_timestamp  STRING,
    c_commande         STRING,
    c_fichier          STRING,
    c_protocole        STRING,
    c_code_retour      STRING,
    c_size             STRING,
    c_reste            STRING,
    c_identifiant      STRING
)
PARTITIONED BY (
    c_date STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "([a-zA-Z-0-9]*)[^\\t]*\\t(\\d{1,3}[.]\\d{1,3}[.]\\d{1,3}[.]\\d{1,3})[ ][^ ]+[ ](\\d+)[ ]+\\[(\\d+)/([0-9a-zA-Z]+)/([0-9a-zA-Z]+):(\\d+):(.*)\\][ ]\"(\\w+)[ ](.*)[ ]+([A-Za-z0-9/.]+)\"[ ]+([0-9A-Za-z]+)[ ]+([0-9A-Za-z-]+)[ ]+\"(.*)\"[ ]+\"(.*)\"",
    "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s"
)
STORED AS TEXTFILE;


No ETL needed! The raw log lines are parsed into columns when the table is read.