Closed WanliTian closed 6 years ago
load this parquet file into Hive — all rows are null
hi, @WanliTian my code to create parquet file:
package main
import (
"log"
"github.com/xitongsys/parquet-go/ParquetFile"
"github.com/xitongsys/parquet-go/ParquetWriter"
"github.com/xitongsys/parquet-go/parquet"
)
// Pack is one output row of the parquet file. The `parquet` struct tags
// drive parquet-go's schema generation (column name, physical/converted
// type, and encoding). Pointer fields — presumably this is what makes the
// columns nullable/OPTIONAL (the dump shows DL=1); verify against the
// parquet-go docs.
type Pack struct {
Uid *int64 `parquet:"name=uid,type=INT64"`
Ut *int64 `parquet:"name=ut, type=INT64"`
UniqId *string `parquet:"name=uniq_id, type=UTF8, encoding=PLAIN_DICTIONARY"`
}
// main writes a single Pack row to flat.parquet with SNAPPY compression.
// Fixes over the original: the file handle is closed on every path
// (it leaked when NewParquetWriter failed), and the WriteStop/Close
// errors are no longer silently discarded — WriteStop writes the parquet
// footer, so ignoring its error can leave a corrupt, unreadable file.
func main() {
fw, err := ParquetFile.NewLocalFileWriter("flat.parquet")
if err != nil {
log.Println("Can't create local file", err)
return
}
// Close the underlying file on every return path, logging any failure.
defer func() {
if err := fw.Close(); err != nil {
log.Println("Close error", err)
}
}()
pw, err := ParquetWriter.NewParquetWriter(fw, new(Pack), 4)
if err != nil {
log.Println("Can't create parquet writer", err)
return
}
pw.RowGroupSize = 128 * 1024 * 1024 // 128 MiB per row group
pw.CompressionType = parquet.CompressionCodec_SNAPPY
uid, ut := int64(1), int64(2)
uniqid := "uniqid"
row := Pack{
Uid: &uid,
Ut: &ut,
UniqId: &uniqid,
}
// Pass the struct by value — passing a pointer to it was the original
// bug in this thread that produced all-null rows in Hive.
if err = pw.Write(row); err != nil {
log.Println("Write error", err)
}
// WriteStop flushes buffered data and writes the file footer; its
// error must be checked or a truncated file goes undetected.
if err = pw.WriteStop(); err != nil {
log.Println("WriteStop error", err)
}
}
the generated parquet file
xtzhang@xtzhang-mac:~/testparquet$ parquet-tools cat flat.parquet
uid = 1
ut = 2
uniq_id = uniqid
xtzhang@xtzhang-mac:~/testparquet$ parquet-tools dump flat.parquet
row group 0
--------------------------------------------------------------------------------
uid: INT64 SNAPPY DO:0 FPO:4 SZ:55/53/0.96 VC:1 ENC:BIT_PACKED,PLAIN,RLE
ut: INT64 SNAPPY DO:0 FPO:59 SZ:55/53/0.96 VC:1 ENC:BIT_PACKED,PLAIN,RLE
uniq_id: BINARY SNAPPY DO:114 FPO:139 SZ:56/52/0.93 VC:1 ENC:BIT_PACK [more]...
uid TV=1 RL=0 DL=1
----------------------------------------------------------------------------
page 0: DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 1, [more]... VC:1
ut TV=1 RL=0 DL=1
----------------------------------------------------------------------------
page 0: DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 2, [more]... VC:1
uniq_id TV=1 RL=0 DL=1 DS: 1 DE:PLAIN
----------------------------------------------------------------------------
page 0: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY S [more]... VC:1
INT64 uid
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 1 ***
value 1: R:0 D:1 V:1
INT64 ut
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 1 ***
value 1: R:0 D:1 V:2
BINARY uniq_id
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 1 ***
value 1: R:0 D:1 V:uniqid
the hive create table statements:
hive> show create table test;
OK
CREATE TABLE `test`(
`uid` bigint,
`ut` bigint,
`uniq_id` string)
PARTITIONED BY (
`tag_id` bigint)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://0.0.0.0:9000/user/hive/warehouse/test'
TBLPROPERTIES (
'transient_lastDdlTime'='1536276555')
Time taken: 3.637 seconds, Fetched: 16 row(s)
upload the file to hive
[root@a40bfafb422f /]# hdfs dfs -ls /user/hive/warehouse/test/tag_id=0/
Found 1 items
-rw-r--r-- 1 root supergroup 431 2018-09-06 23:33 /user/hive/warehouse/test/tag_id=0/flat.parquet
hive query result
hive> select * from test;
OK
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
1 2 uniqid 0
Time taken: 5.323 seconds, Fetched: 1 row(s)
everything is ok. So please check your process and confirm the issue :)
@xitongsys
Oops, I got it —
I was passing a pointer to the object to ParquetWriter.Write.
I will change my code.
Is there any plan to support this?
hi, @WanliTian It's already supported in the latest version see here : #95
table

```sql
CREATE TABLE `test`(
  `uid` bigint,
  `ut` bigint,
  `uniq_id` string)
PARTITIONED BY (
  `tag_id` bigint)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
```
struct

```go
type Pack struct {
    Uid    *int64  `parquet:"name=uid,type=INT64"`
    Ut     *int64  `parquet:"name=ut, type=INT64"`
    UniqId *string `parquet:"name=uniq_id, type=UTF8, encoding=PLAIN_DICTIONARY"`
}
```