OpenLineage / OpenLineage

An Open Standard for lineage metadata collection
http://openlineage.io
Apache License 2.0
1.78k stars 309 forks source link

sql extractor: mismatch between table lineage dataset name and column lineage dataset name #1665

Open mobuchowski opened 1 year ago

mobuchowski commented 1 year ago

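For example, in the event below the input datasets are named with the database prefix (e.g. `data.public.jelly_active_daily_users`), while the `inputFields` of the output's `columnLineage` facet reference the same tables without it (e.g. `public.jelly_active_daily_users`):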

{
    "1": {
        "eventType": "COMPLETE",
        "eventTime": "2023-02-27T00:41:18.092707Z",
        "run": {
            "runId": "a5f85048-e0f6-3457-9370-5cc1069c9339",
            "facets": {
                "nominalTime": null,
                "parent": null
            }
        },
        "job": {
            "namespace": "default",
            "name": "postgres.postgres_daily_users_green_jellys_table",
            "facets": {
                "documentation": null,
                "sourceCodeLocation": null,
                "sql": {
                    "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.0.1/integration/airflow",
                    "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet",
                    "query": "\n    CREATE TABLE jelly_daily_users_green_jellys AS \n    SELECT u.name, c.name as color\n    FROM jelly_active_daily_users u\n    JOIN jelly_favorite_colors f ON f.user_uuid = u.user_uuid\n    JOIN jelly_colors c ON c.color_uuid=f.color_uuid\n    WHERE c.name = 'green'\n    "
                }
            }
        },
        "inputs": [{
                "namespace": "postgres://db:5432",
                "name": "data.public.jelly_active_daily_users",
                "facets": {
                    "documentation": null,
                    "schema": {
                        "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.0.1/integration/airflow",
                        "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
                        "fields": [{
                                "name": "user_uuid",
                                "type": "varchar",
                                "description": null
                            },
                            {
                                "name": "name",
                                "type": "varchar",
                                "description": null
                            },
                            {
                                "name": "sum",
                                "type": "int8",
                                "description": null
                            }
                        ]
                    },
                    "dataSource": {
                        "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.0.1/integration/airflow",
                        "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/DataSourceDatasetFacet",
                        "name": "postgres://db:5432",
                        "uri": "postgres://db:5432/data"
                    },
                    "description": null,
                    "lifecycleStateChange": null,
                    "columnLineage": null,
                    "symlinks": null
                }
            },
            {
                "namespace": "postgres://db:5432",
                "name": "data.public.jelly_colors",
                "facets": {
                    "documentation": null,
                    "schema": {
                        "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.0.1/integration/airflow",
                        "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
                        "fields": [{
                                "name": "color_uuid",
                                "type": "varchar",
                                "description": null
                            },
                            {
                                "name": "name",
                                "type": "varchar",
                                "description": null
                            }
                        ]
                    },
                    "dataSource": {
                        "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.0.1/integration/airflow",
                        "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/DataSourceDatasetFacet",
                        "name": "postgres://db:5432",
                        "uri": "postgres://db:5432/data"
                    },
                    "description": null,
                    "lifecycleStateChange": null,
                    "columnLineage": null,
                    "symlinks": null
                }
            },
            {
                "namespace": "postgres://db:5432",
                "name": "data.public.jelly_favorite_colors",
                "facets": {
                    "documentation": null,
                    "schema": {
                        "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.0.1/integration/airflow",
                        "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
                        "fields": [{
                                "name": "user_uuid",
                                "type": "varchar",
                                "description": null
                            },
                            {
                                "name": "color_uuid",
                                "type": "varchar",
                                "description": null
                            }
                        ]
                    },
                    "dataSource": {
                        "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.0.1/integration/airflow",
                        "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/DataSourceDatasetFacet",
                        "name": "postgres://db:5432",
                        "uri": "postgres://db:5432/data"
                    },
                    "description": null,
                    "lifecycleStateChange": null,
                    "columnLineage": null,
                    "symlinks": null
                }
            }
        ],
        "outputs": [{
            "namespace": "postgres://db:5432",
            "name": "data.public.jelly_daily_users_green_jellys",
            "facets": {
                "documentation": null,
                "schema": {
                    "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.0.1/integration/airflow",
                    "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
                    "fields": [{
                            "name": "name",
                            "type": "varchar",
                            "description": null
                        },
                        {
                            "name": "color",
                            "type": "varchar",
                            "description": null
                        }
                    ]
                },
                "dataSource": {
                    "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.0.1/integration/airflow",
                    "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/DataSourceDatasetFacet",
                    "name": "postgres://db:5432",
                    "uri": "postgres://db:5432/data"
                },
                "description": null,
                "lifecycleStateChange": null,
                "columnLineage": {
                    "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.0.1/integration/airflow",
                    "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ColumnLineageDatasetFacet",
                    "fields": {
                        "name": {
                            "inputFields": [{
                                "namespace": "postgres://db:5432",
                                "name": "public.jelly_active_daily_users",
                                "field": "name"
                            }],
                            "transformationDescription": "",
                            "transformationType": ""
                        },
                        "color": {
                            "inputFields": [{
                                "namespace": "postgres://db:5432",
                                "name": "public.jelly_colors",
                                "field": "name"
                            }],
                            "transformationDescription": "",
                            "transformationType": ""
                        }
                    }
                },
                "symlinks": null
            }
        }],
        "producer": "https://github.com/OpenLineage/OpenLineage/tree/0.20.6/integration/airflow"
    }
}
pawel-big-lebowski commented 1 year ago

@mobuchowski, could you give more details on what should be done here? Should this be implemented within the SQL integration of OpenLineage?