Open cmackenzie1 opened 1 year ago
package main
import (
	"bytes"
	"fmt"
	"io"
	"reflect"

	"github.com/xitongsys/parquet-go-source/local"
	"github.com/xitongsys/parquet-go/parquet"
	"github.com/xitongsys/parquet-go/tool/schematool"
)
// main reads a Parquet payload into a slice of MySchema and prints the ID and
// A.B fields of each row it collected. Every failure is reported on stdout
// and makes main return early.
//
// NOTE(review): data is not declared in this snippet; it is assumed to be a
// []byte holding the Parquet file contents — confirm where it comes from.
// NOTE(review): verify local.NewLocalFileReader and parquet.NewParquetReader
// against the library documentation; the signatures used here (an io.Reader
// source, a constructor living in package parquet) cannot be confirmed from
// this file.
func main() {
	// Assuming you have a Parquet file stored in a byte slice named 'data'
	src, err := local.NewLocalFileReader(bytes.NewReader(data))
	if err != nil {
		fmt.Println("Error opening Parquet file:", err)
		return
	}
	defer src.Close()

	// Build the row reader on top of the source. The trailing 4 is passed
	// through unchanged — presumably a parallelism setting; TODO confirm.
	pr, err := parquet.NewParquetReader(src, new(MySchema), 4)
	if err != nil {
		fmt.Println("Error creating Parquet reader:", err)
		return
	}
	defer pr.ReadStop()

	// Keep calling Read until it reports io.EOF; any other error aborts.
	// NOTE(review): confirm that Read signals end of data with io.EOF. If it
	// instead fills len(rows) entries per call, rows starts empty and this
	// loop would never terminate.
	var rows []MySchema
readLoop:
	for {
		err = pr.Read(&rows)
		switch {
		case err == io.EOF:
			break readLoop
		case err != nil:
			fmt.Println("Error reading rows:", err)
			return
		}
	}

	// Print the fields of interest for each collected row.
	for i := range rows {
		fmt.Println("ID:", rows[i].ID)
		fmt.Println("A.B:", rows[i].A.B)
		// Access other fields as needed
	}
}
// MySchema is the custom row type passed to the Parquet reader in main; its
// layout is meant to match the structure of the data being read.
// NOTE(review): the struct tags below are interpreted by the parquet library,
// not by this file — confirm the tag syntax against its documentation.
type MySchema struct {
// ID is tagged as column "id" with type INT32.
ID int32 `parquet:"name=id, type=INT32"`
// A is a nested struct tagged as "a" with repetitiontype=REQUIRED.
A struct {
// B is tagged as column "b" with type INT32.
B int32 `parquet:"name=b, type=INT32"`
} `parquet:"name=a, repetitiontype=REQUIRED"`
}
func init() {
// Register the schema with Parquet for marshaling and unmarshaling
parquet.RegisterSchemaFromStruct("MySchema", reflect.TypeOf(MySchema{}))
}
I am looking for more guidance on how to read Parquet files, especially dynamic Parquet files for which there is no corresponding Go struct to read the data into. I looked through some of the unit tests for examples, but wasn't able to find many that access values without first reading the data into a Go struct.
So far I've got the following:
Is that the best way to read them? Are there existing methods to determine whether one column path is a subset of another?