这篇文档介绍DLA支持的文件格式。
CREATE EXTERNAL TABLE IF NOT EXISTS test_avro (
L_ORDERKEY INT,
L_PARTKEY INT,
L_SUPPKEY INT,
L_LINENUMBER INT,
L_QUANTITY DOUBLE,
L_EXTENDEDPRICE DOUBLE,
L_DISCOUNT DOUBLE,
L_TAX DOUBLE,
L_RETURNFLAG STRING,
L_LINESTATUS STRING,
L_SHIPDATE DATE,
L_COMMITDATE DATE,
L_RECEIPTDATE DATE,
L_SHIPINSTRUCT STRING,
L_SHIPMODE STRING,
L_COMMENT STRING
)
STORED AS AVRO
LOCATION 'oss://bucket001/datasets/test/test_avro/';
ORC
CREATE EXTERNAL TABLE test_orc (
L_ORDERKEY INT,
L_PARTKEY INT,
L_SUPPKEY INT,
L_LINENUMBER INT,
L_QUANTITY DOUBLE,
L_EXTENDEDPRICE DOUBLE,
L_DISCOUNT DOUBLE,
L_TAX DOUBLE,
L_RETURNFLAG STRING,
L_LINESTATUS STRING,
L_SHIPDATE DATE,
L_COMMITDATE DATE,
L_RECEIPTDATE DATE,
L_SHIPINSTRUCT STRING,
L_SHIPMODE STRING,
L_COMMENT STRING
)
STORED AS ORC
LOCATION 'oss://bucket001/datasets/test/test_orc';
Parquet
CREATE EXTERNAL TABLE test_parquet_hive_serde (
L_ORDERKEY INT,
L_PARTKEY INT,
L_SUPPKEY INT,
L_LINENUMBER INT,
L_QUANTITY DOUBLE,
L_EXTENDEDPRICE DOUBLE,
L_DISCOUNT DOUBLE,
L_TAX DOUBLE,
L_RETURNFLAG STRING,
L_LINESTATUS STRING,
L_SHIPDATE DATE,
L_COMMITDATE DATE,
L_RECEIPTDATE DATE,
L_SHIPINSTRUCT STRING,
L_SHIPMODE STRING,
L_COMMENT STRING
)
STORED AS PARQUET
LOCATION 'oss://bucket001/datasets/test/test_parquet_hive_serde';
RcFile
CREATE EXTERNAL TABLE test_rcfile (
L_ORDERKEY INT,
L_PARTKEY INT,
L_SUPPKEY INT,
L_LINENUMBER INT,
L_QUANTITY DOUBLE,
L_EXTENDEDPRICE DOUBLE,
L_DISCOUNT DOUBLE,
L_TAX DOUBLE,
L_RETURNFLAG STRING,
L_LINESTATUS STRING,
L_SHIPDATE DATE,
L_COMMITDATE DATE,
L_RECEIPTDATE DATE,
L_SHIPINSTRUCT STRING,
L_SHIPMODE STRING,
L_COMMENT STRING
)
STORED AS RCFILE
LOCATION 'oss://bucket001/datasets/test/test_rcfile';
TextFile: Normal
CREATE EXTERNAL TABLE IF NOT EXISTS test_text_null (
URL STRING,
TITLE STRING
)
STORED AS TEXTFILE
LOCATION 'oss://bucket001/datasets/test/test_null/textfile/';
TextFile: OpenCSV
在使用OpenCSVSerde时需要注意以下几点:
- 用户可以为行的字段指定字段分隔符、字段内容引用符号和转义字符,例如:WITH SERDEPROPERTIES (“separatorChar” = “,”, “quoteChar” = “`”, “escapeChar” = “\” );
- 不支持字段内嵌入的行分割符;
- 所有字段定义STRING类型;
- 其他数据类型的处理,可以在SQL中使用函数进行转换。
CREATE EXTERNAL TABLE test_opencsv (
id STRING,
name STRING,
location STRING,
create_date STRING,
create_timestamp STRING,
longitude STRING,
latitude STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties(
"separatorChar"=",",
"quoteChar"="\"",
"escapeChar"="\\"
)
STORED AS TEXTFILE
LOCATION 'oss://bucket001/datasets/test/test_opencsv';
TextFile: MultiDelimitSerDe
此文件格式的主要特点是对于TextFile里面列分隔符可以使用多个字符来做分隔。
CREATE EXTERNAL TABLE test_csv_multidelimit (
id STRING,
name STRING,
location STRING,
create_date STRING,
create_timestamp STRING,
longitude STRING,
latitude STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe'
with serdeproperties(
"field.delim"="||"
)
STORED AS TEXTFILE
LOCATION 'oss://bucket001/datasets/test/test_multidelimiter';
RegexSerDe
此文件格式的主要使用场景是对一些没有固定格式、需要借助正则表达式来对数据中的字段进行扣取。
CREATE EXTERNAL TABLE IF NOT EXISTS test_regex (
host STRING,
identity STRING,
`user` STRING,
time STRING,
request STRING,
status STRING,
size INT,
referer STRING,
agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
)
STORED AS TEXTFILE
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/test/test_regex';
JSON: org.apache.hadoop.hive.serde2.JsonSerDe
CREATE EXTERNAL TABLE IF NOT EXISTS `customer_case_jiahe`.`single_latin1_broken` (
`id` int ,
`name` string ,
`age` int
)
STORED AS JSON
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/test/customer_case/jiahe/single_latin1_encode_broken_record.log';
JSON: org.apache.hive.hcatalog.data.JsonSerDe
CREATE external TABLE json_table_1 (
docid string,
user_1 struct<
id:INT,
username:string,
name:string,
shippingaddress:struct<
address1:string,
address2:string,
city:string,
state:string
>,
orders:array<
struct<
itemid:INT,
orderdate:string
>
>
>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/test/json/hcatalog_serde/table_1/test_json.json';
JSON EsriJsonSerDe
DLA支持Esri ArcGIS的地理JSON数据文件的SerDe处理,关于这种地理JSON数据格式说明,可以参考:https://github.com/Esri/spatial-framework-for-hadoop/wiki/JSON-Formats
CREATE EXTERNAL TABLE IF NOT EXISTS california_counties
(
Name string,
BoundaryShape binary
)
ROW FORMAT SERDE 'com.esri.hadoop.hive.serde.EsriJsonSerDe'
STORED AS INPUTFORMAT 'com.esri.json.hadoop.EnclosedEsriJsonInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/geospatial/california-counties/'