stack-spot / stackspot-studio-data-roadmap

Repositório para que a equipe de criação de conteúdo possa criar as issues do repositório
https://stackspot.com
Apache License 2.0
0 stars 0 forks source link

[DM] H1 - Desenho de Solução para Linhagem de Dados #169

Open luizcarloszup opened 9 months ago

elpasckezup commented 9 months ago

Exemplo de payload enviado pelo SplineAgent

{
    "id": "2d9337b2-96e8-4ce2-a733-2db21b055d6b",
    "operations": {
        "write": {
            "outputSource": "s3://data-lake-curated-326093229706-us-east-1/employee",
            "append": false,
            "id": 0,
            "childIds": [
                1
            ],
            "params": {
                "table": {
                    "identifier": {
                        "table": "employee",
                        "database": "curated_db"
                    },
                    "storage": {
                        "locationUri": "s3://data-lake-curated-326093229706-us-east-1/employee",
                        "compressed": false,
                        "properties": {}
                    }
                }
            },
            "extra": {
                "destinationType": "parquet",
                "name": "CreateDataSourceTableAsSelectCommand"
            }
        },
        "reads": [
            {
                "childIds": [],
                "inputSources": [
                    "s3://data-lake-raw-326093229706-us-east-1/employee"
                ],
                "id": 3,
                "schema": [
                    "0",
                    "1",
                    "2",
                    "3",
                    "4",
                    "5",
                    "6",
                    "7",
                    "8",
                    "9",
                    "10"
                ],
                "params": {
                    "table": {
                        "identifier": {
                            "table": "employee",
                            "database": "raw_db"
                        },
                        "storage": {
                            "locationUri": "s3://data-lake-raw-326093229706-us-east-1/employee",
                            "inputFormat": "org.apache.hadoop.mapred.TextInputFormat",
                            "outputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
                            "serde": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
                            "compressed": false,
                            "properties": {
                                "field.delim": ","
                            }
                        }
                    }
                },
                "extra": {
                    "sourceType": "hive",
                    "name": "HiveTableRelation"
                }
            }
        ],
        "other": [
            {
                "id": 2,
                "childIds": [
                    3
                ],
                "schema": [
                    "0",
                    "1",
                    "2",
                    "3",
                    "4",
                    "5",
                    "6",
                    "7",
                    "8",
                    "9",
                    "10"
                ],
                "params": {
                    "identifier": "spark_catalog.raw_db.employee"
                },
                "extra": {
                    "name": "SubqueryAlias"
                }
            },
            {
                "id": 1,
                "childIds": [
                    2
                ],
                "schema": [
                    "0",
                    "1",
                    "2",
                    "3",
                    "4",
                    "5",
                    "6",
                    "7",
                    "8",
                    "9",
                    "10"
                ],
                "params": {
                    "projectList": [
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "0"
                        },
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "1"
                        },
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "2"
                        },
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "3"
                        },
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "4"
                        },
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "5"
                        },
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "6"
                        },
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "7"
                        },
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "8"
                        },
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "9"
                        },
                        {
                            "_typeHint": "expr.AttrRef",
                            "refId": "10"
                        }
                    ]
                },
                "extra": {
                    "name": "Project"
                }
            }
        ]
    },
    "systemInfo": {
        "name": "spark",
        "version": "3.1.1-amzn-0"
    },
    "agentInfo": {
        "name": "spline",
        "version": "0.6.1"
    },
    "extraInfo": {
        "appName": "nativespark-RawToCurated_employee_optimize-jr_dcdc37e2eb5465b256b414836134f8c78c4a123c461543d79c40d88dbea848dd",
        "dataTypes": [
            {
                "_typeHint": "dt.Simple",
                "id": "9172aa44-dec9-4345-a98e-308670e1dd0a",
                "name": "long",
                "nullable": true
            },
            {
                "_typeHint": "dt.Simple",
                "id": "c4d26ac0-df0e-4018-99c1-9a8eee16697b",
                "name": "string",
                "nullable": true
            }
        ],
        "attributes": [
            {
                "id": "0",
                "name": "employee_id",
                "dataTypeId": "9172aa44-dec9-4345-a98e-308670e1dd0a"
            },
            {
                "id": "1",
                "name": "first_name",
                "dataTypeId": "c4d26ac0-df0e-4018-99c1-9a8eee16697b"
            },
            {
                "id": "2",
                "name": "last_name",
                "dataTypeId": "c4d26ac0-df0e-4018-99c1-9a8eee16697b"
            },
            {
                "id": "3",
                "name": "email",
                "dataTypeId": "c4d26ac0-df0e-4018-99c1-9a8eee16697b"
            },
            {
                "id": "4",
                "name": "phone_number",
                "dataTypeId": "c4d26ac0-df0e-4018-99c1-9a8eee16697b"
            },
            {
                "id": "5",
                "name": "hire_date",
                "dataTypeId": "c4d26ac0-df0e-4018-99c1-9a8eee16697b"
            },
            {
                "id": "6",
                "name": "job_id",
                "dataTypeId": "c4d26ac0-df0e-4018-99c1-9a8eee16697b"
            },
            {
                "id": "7",
                "name": "salary",
                "dataTypeId": "9172aa44-dec9-4345-a98e-308670e1dd0a"
            },
            {
                "id": "8",
                "name": "commission_pct",
                "dataTypeId": "9172aa44-dec9-4345-a98e-308670e1dd0a"
            },
            {
                "id": "9",
                "name": "manager_id",
                "dataTypeId": "9172aa44-dec9-4345-a98e-308670e1dd0a"
            },
            {
                "id": "10",
                "name": "department_id",
                "dataTypeId": "9172aa44-dec9-4345-a98e-308670e1dd0a"
            }
        ]
    }
}
elpasckezup commented 9 months ago

C4 model Producer

https://app.diagrams.net/#G1HoweTEYjLVy5vKmhGscjYJEvNrep1T7V#%7B%22pageId%22%3A%226E0ccBRsCMtLOnT8ir38%22%7D

C4 model Governance Federated

https://app.diagrams.net/#G1LLJpNB27mq9yiT2Ilk6wZYvXT7GRpyAp#%7B%22pageId%22%3A%226E0ccBRsCMtLOnT8ir38%22%7D

elpasckezup commented 9 months ago

Sugestão de modelagem: [Image]

Sugestão de Payload

users

{
    "name": "users",
    "type": "API",
    "fields":[
        {
            "name": "id",
            "metadata": {                  
                "type": "int",
                "required": true
            }
        },
        {
            "name": "username",
            "metadata": {
                "type": "varchar",
                "length": 50,
                "required": true
            }
        },
        {
            "name": "gender",
            "metadata": {
                "type": "char",
                "length": 5,
                "required": true
            }
        },
        {
            "name": "email",
            "metadata": {
                "type": "varchar",
                "length": 250,
                "required": true
            }
        }
    ]
}

sales

{
    "name": "sales",
    "type": "ADAPTER",
    "fields":[
        {
            "name": "sales_id",
            "metadata": {                  
                "type": "int",
                "required": true
            }
        },
        {
            "name": "client_id",
            "metadata": {                  
                "type": "int",
                "required": true
            }
        },
        {
            "name": "gross_value",
            "metadata": {
                "type": "float",
                "scale": 5,
                "precision": 18,
                "required": true
            }
        },
        {
            "name": "net_value",
            "metadata": {
                "type": "float",
                "scale": 5,
                "precision": 18,
                "required": true
            }
        }
    ]
}

user_sales_sum

{
    "name": "user_sales_sum",
    "type": "DATASET",
    "fields":[
        {
            "name": "id",
            "metadata": {                  
                "type": "int",
                "required": true
            },
            "sources": [
                {
                    "name": "users",
                    "fields": [
                        "id"
                    ]
                },
                {
                    "name": "sales",
                    "fields": [
                        "client_id"
                    ]
                }
            ]
        },
        {
            "name": "total_gross_value",
            "metadata": {
                "type": "float",
                "scale": 5,
                "precision": 18,
                "required": true
            },
            "sources": [
                {
                    "name": "sales",
                    "fields": [
                        "gross_value"
                    ]
                }
            ]
        }
    ]
}
elpasckezup commented 9 months ago

Abaixo, referências e exemplos:

Apache TinkerPop · Spline · Local Gremlin Server (Docker) and Sample Write / Read Code · Data Lineage for Data Lake