catalyst-cooperative / pudl

The Public Utility Data Liberation Project provides analysis-ready energy system data to climate advocates, researchers, policymakers, and journalists.
https://catalyst.coop/pudl
MIT License
465 stars 107 forks source link

Enable visualization of `XbrlCalculationForest` #2689

Closed zaneselvans closed 1 year ago

zaneselvans commented 1 year ago

The XbrlCalculationForestFerc1 class contains a bunch of structural information about how various FERC 1 reported values are related to each other, as well as attribute metadata. To understand what it means, to debug it, and to present the information to others, it would be really nice to be able to visualize the forests (groups of several trees).

image

Nature of the Data

Attributes to Visualize

Interface

Design Thoughts

jdangerx commented 1 year ago

OK, so there's some automatic scaling / parameterization stuff that needs to go into this if we really want to package it up and not make it a pain - I'll get on that after this comment. But..

from pudl.output.ferc1 import XbrlCalculationTreeFerc1
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

def create_tree(depth=0, node_num=1):
    """Build a synthetic binary ``XbrlCalculationTreeFerc1`` for plotting experiments.

    Every node above the leaves gets exactly two children, so the result has
    ``2 ** (depth + 1) - 1`` nodes. Nodes are numbered so that the first
    child subtree starts at ``node_num + 1`` and the second child subtree
    starts at ``node_num + 2 ** depth``.

    Args:
        depth: Number of levels below this node; ``depth <= 0`` yields a leaf.
        node_num: Number embedded in this node's factoid names.

    Returns:
        The root ``XbrlCalculationTreeFerc1`` of the generated tree.
    """
    if depth <= 0:
        children = []
    else:
        children = [
            create_tree(depth=depth - 1, node_num=node_num + 1),
            create_tree(depth=depth - 1, node_num=node_num + 2**depth),
        ]
    return XbrlCalculationTreeFerc1(
        xbrl_factoid=f"reported_{node_num}",
        source_table="table_1",
        # This must be an f-string: without the prefix every node ended up
        # with the literal text "reported_original_{node_num}".
        xbrl_factoid_original=f"reported_original_{node_num}",
        weight=1.0,
        children=children,
    )

def labelmaker(graph, fields):
    """Build a multi-line text label for every node in ``graph``.

    Args:
        graph: Object whose ``nodes()`` method yields indexable, hashable nodes.
        fields: Iterable of ``(field_name, field_index)`` pairs; each pair
            produces one ``"field_name: node[field_index]"`` line.

    Returns:
        Dict mapping each node to its newline-joined label.
    """
    node_labels = {}
    for node in graph.nodes():
        lines = [f"{title}: {node[position]}" for title, position in fields]
        node_labels[node] = "\n".join(lines)
    return node_labels

# Build a depth-3 demo tree, convert it to a networkx graph, and let
# graphviz "dot" lay it out left-to-right.
deep = create_tree(node_num=1, depth=3).to_networkx()
labels = labelmaker(deep, fields=[("Source table", 0), ("Factoid", 1)])
pos = nx.nx_agraph.graphviz_layout(
    deep,
    prog="dot",
    args="-Grankdir=LR -Granksep=0.5 -Gnodesep=1.0",
)

fig, ax = plt.subplots()
# make a figure that's the right size
# scale the axes to that
# First pass: nodes and edges only; labels are drawn separately below so
# they can be shifted away from the node markers.
edges = nx.draw_networkx(deep, pos, ax=ax, with_labels=False)

# Second pass: the same layout shifted vertically by a fixed offset, used
# only for placing the label boxes.
label_offset = -50
label_pos = {
    node: (coords[0], coords[1] + label_offset) for node, coords in pos.items()
}
labels = nx.draw_networkx_labels(
    deep,
    label_pos,
    labels=labels,
    ax=ax,
    bbox=dict(boxstyle="round", facecolor="wheat", alpha=1.0),
)

# Hard-coded figure size and axis bounds for this experiment.
fig.set_figheight(12)
fig.set_figwidth(12)
ax.set_xbound(-100, 1000)
ax.set_ybound(-100, 1000)

gets us image

I think we need to make the layout, then use that to infer the height/width of the figure as well as set the axis bounds accordingly.

jdangerx commented 1 year ago

Here's a snippet that graphs things and seems to make a legible layout:

import itertools

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

from pudl.output.ferc1 import XbrlCalculationTreeFerc1

def create_tree(depth=0, node_num=1):
    """Build a synthetic binary ``XbrlCalculationTreeFerc1`` (visualization testing only).

    Every node above the leaves gets exactly two children, so the result has
    ``2 ** (depth + 1) - 1`` nodes. Nodes are numbered so that the first
    child subtree starts at ``node_num + 1`` and the second child subtree
    starts at ``node_num + 2 ** depth``.

    Args:
        depth: Number of levels below this node; ``depth <= 0`` yields a leaf.
        node_num: Number embedded in this node's factoid names.

    Returns:
        The root ``XbrlCalculationTreeFerc1`` of the generated tree.
    """
    if depth <= 0:
        children = []
    else:
        children = [
            create_tree(depth=depth - 1, node_num=node_num + 1),
            create_tree(depth=depth - 1, node_num=node_num + 2**depth),
        ]
    return XbrlCalculationTreeFerc1(
        xbrl_factoid=f"reported_{node_num}",
        source_table="table_1",
        # This must be an f-string: without the prefix every node ended up
        # with the literal text "reported_original_{node_num}".
        xbrl_factoid_original=f"reported_original_{node_num}",
        weight=1.0,
        children=children,
    )

def labelmaker(graph, fields):
    """Build a multi-line text label for every node in ``graph``.

    Args:
        graph: Object whose ``nodes()`` method yields indexable, hashable nodes.
        fields: Iterable of field names to show, one line per name. Only
            ``"Source table"`` and ``"Factoid"`` are known; any other name
            raises ``KeyError`` when the first node is labelled.

    Returns:
        Dict mapping each node to its newline-joined ``"name: value"`` label.
    """
    # TODO: if we use namedtuples for the nodes instead of unnamed ones, we can avoid this janky index thing.
    # TODO: unfortunately, we strip out everything except for these two fields in the .to_networkx function
    column_of = {"Source table": 0, "Factoid": 1}

    node_labels = {}
    for node in graph.nodes():
        rows = []
        for field in fields:
            rows.append(f"{field}: {node[column_of[field]]}")
        node_labels[node] = "\n".join(rows)
    return node_labels

def bounding_box(*positions, xmargin, ymargin):
    """Compute a padded bounding box around every point in the given layouts.

    Args:
        *positions: One or more dicts whose values are ``(x, y)`` points
            (the keys are ignored).
        xmargin: Padding added on both the left and the right.
        ymargin: Padding added on both the bottom and the top.

    Returns:
        Dict with ``"xbounds"`` and ``"ybounds"`` as ``(min, max)`` tuples and
        ``"width"`` / ``"height"`` as the padded extents.

    Raises:
        ValueError: If no points were supplied at all. Previously this
            surfaced as a bare ``StopIteration`` from ``next()``.
    """
    points = [point for layout in positions for point in layout.values()]
    if not points:
        raise ValueError("bounding_box() needs at least one position to bound")

    xs = [point[0] for point in points]
    ys = [point[1] for point in points]

    # Margins are applied before width/height are derived, so the reported
    # size includes the padding on both sides.
    min_x = min(xs) - xmargin
    max_x = max(xs) + xmargin
    min_y = min(ys) - ymargin
    max_y = max(ys) + ymargin

    return {
        "xbounds": (min_x, max_x),
        "ybounds": (min_y, max_y),
        "width": max_x - min_x,
        "height": max_y - min_y,
    }

def render_tree(graph, fields, xmargin=2, ymargin=1, ranksep=1.0, nodesep=1.2, base_label_offset=-0.35, **kwargs):
    """Draw ``graph`` left-to-right with a label box offset from each node.

    Everything gets passed in, in inches.

    Increase ranksep if labels are too close together LR.
    Increase nodesep if labels are too close together TB.
    Increase x/ymargin if labels are being cut off at the edges.

    Args:
        graph: The networkx graph to lay out and draw.
        fields: Sequence of field names to show in each label. It is passed
            to ``labelmaker``, and its length pushes the label further from
            the node (0.1 per field).
        xmargin: Horizontal padding around the layout; multiplied by the
            figure DPI before use.
        ymargin: Vertical padding around the layout; multiplied by the
            figure DPI before use.
        ranksep: Passed to graphviz ``dot`` as ``-Granksep``.
        nodesep: Passed to graphviz ``dot`` as ``-Gnodesep``.
        base_label_offset: Vertical label offset before the per-field
            adjustment; multiplied by the figure DPI before use.
        **kwargs: Forwarded to ``plt.subplots``. The figure width and height
            are overwritten afterwards from the computed layout bounds.

    Returns:
        The matplotlib figure containing the drawing.
    """
    fig, ax = plt.subplots(**kwargs)
    dpi = fig.dpi

    # Use the ``graph`` argument throughout; the earlier version read a
    # module-level ``deep`` variable here and in both draw calls, so it only
    # worked when the caller happened to have defined that global.
    node_labels = labelmaker(graph, fields=fields)

    # NOTE(review): margins and offsets are scaled by DPI on the assumption
    # that layout coordinates are pixels; graphviz presumably reports points
    # (1/72 in) - confirm the scale factor.
    xmargin_scaled = xmargin * dpi
    ymargin_scaled = ymargin * dpi
    label_offset = (base_label_offset - 0.1 * len(fields)) * dpi

    pos = nx.nx_agraph.graphviz_layout(
        graph,
        prog="dot",
        args=f"-Grankdir=LR -Granksep={ranksep} -Gnodesep={nodesep}",
    )
    label_pos = {
        node: (coords[0], coords[1] + label_offset) for node, coords in pos.items()
    }

    # first draw the nodes + edges
    nx.draw_networkx(
        graph,
        pos,
        ax=ax,
        with_labels=False,
        node_color="lightsteelblue",
    )

    # then draw only the labels, with an offset
    nx.draw_networkx_labels(
        graph,
        label_pos,
        labels=node_labels,
        ax=ax,
        bbox={"boxstyle": "round", "facecolor": "lavenderblush", "alpha": 1.0},
    )

    # set figure size + shape based on the graph layout
    # nx.draw* sets the bounding box automatically, so we need to force it here
    bounds = bounding_box(
        pos, label_pos, xmargin=xmargin_scaled, ymargin=ymargin_scaled
    )
    fig.set_figheight(bounds["height"] / dpi)
    fig.set_figwidth(bounds["width"] / dpi)
    ax.set_xbound(*bounds["xbounds"])
    ax.set_ybound(*bounds["ybounds"])

    return fig

And here's it in action:

# Build a depth-4 demo tree as a networkx graph and render it with both
# label fields shown.
deep = create_tree(node_num=1, depth=4).to_networkx()
fig = render_tree(graph=deep, fields=["Source table", "Factoid"])

image

jdangerx commented 1 year ago

For the node coloring stuff, we could pass in a list of colors to draw_networkx() that corresponds to the individual nodes - something like

def determine_color(node):
    """Return ``"red"`` when the node's first element is ``"table_1"``, else ``"blue"``."""
    return "red" if node[0] == "table_1" else "blue"

# One color per node, in the graph's own iteration order; iterating the graph
# directly avoids building a throwaway list first.
colors = [determine_color(node) for node in graph]

draw_networkx(..., node_color=colors, ...)