Open sdementen opened 4 years ago
I see you have five nodes with x=0.0 and one with x=1.0.
For my Plotly app I'm writing functions that generate x and y coordinates for a go.Sankey, and if some numbers that come out of them happen to be 0.0 or 1.0 the nodes go haywire similarly to what you have in your screenshot. I fixed it for me by making the coordinates go from 0.01 to 0.99.
If you replace your 0.0s by some arbitrarily small positive number and your 1.0s by some number arbitrarily close to 1.0, does it circumvent the bug?
Indeed, thank you! After some testing, it appears the issue is with x=0 (x=1 is fine). It would be good to update the documentation of the Sankey plot to clarify that x (and maybe y) should be in the (0,1] range. It can even go beyond 1 (the element is then placed on the right of the plot, but still in the right order).
Hi all, In my case I have years as the first branch (from 2013 to 2022) and the order they show is, apparently, by size of the branch.
This is my code
# -*- coding: utf-8 -*-
# Import Libraries
import time
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import pandas as pd
import dash
from dash import Input, Output, dcc, html
import dash_bootstrap_components as dbc
# First Data Treatment
# Module-level pipeline: load the CSV, filter it with a fixed (dummy) selection
# and build the initial Sankey figure that is handed to dcc.Graph in the layout.
df_in = pd.read_csv('https://raw.githubusercontent.com/vostpt/ICNF_DATA/main/ICNF_2013_2022_SANKEY.csv')  # generated by updater.py
dummy_year = [2017, 2020]
dummy_district = ['Faro', 'Bragança']

# Cleanup where DISTRITO and CONCELHO have the same value: suffix the CONCELHO
# so the two levels never share a node label.
df_in["CONCELHO"] = np.where(
    df_in["DISTRITO"] == df_in["CONCELHO"],
    df_in["CONCELHO"] + "_concelho",
    df_in["CONCELHO"],
)
# Deal with some duplicate names across source and target
df_in["CONCELHO"] = df_in["CONCELHO"].str.capitalize()
# Sort values in dataframe
df_in = df_in.sort_values(["ANO", "DISTRITO", "CONCELHO"])

# Filter by district and by year. drop=True avoids piling up "index" /
# "level_0" helper columns from repeated reset_index() calls.
df_filter_district = df_in[df_in['DISTRITO'].isin(dummy_district)].reset_index(drop=True)
df_filter = df_filter_district[df_filter_district['ANO'].isin(dummy_year)].reset_index(drop=True)

# More Data Treatment
# Group by ANO and DISTRITO while summing NCCO, renaming columns for readability.
# The aggregation runs on df_filter (not the unfiltered df_in) so that the dummy
# selection above is actually applied, matching what build_graph does.
df = (
    df_filter.groupby(["ANO", "DISTRITO"], as_index=False)["NCCO"]
    .sum()
    .rename(columns={"ANO": "source", "DISTRITO": "target", "NCCO": "value"})
)
# Change ANO type to string so years can be used as node labels
df["source"] = df["source"].astype(int).astype(str)
# Concatenate the previous dataframe with a new dataframe that groups
# DISTRITO and CONCELHO. This can be repeated any number of times to add
# more steps to the Sankey.
df = pd.concat(
    [
        df,
        df_filter.groupby(["DISTRITO", "CONCELHO"], as_index=False)["NCCO"]
        .sum()
        .rename(columns={"DISTRITO": "source", "CONCELHO": "target", "NCCO": "value"}),
    ]
)

# Create Nodes: a Series mapping every distinct label to an integer id
nodes = np.unique(df[["source", "target"]], axis=None)
nodes = pd.Series(index=nodes, data=range(len(nodes)))

# Create Node Colors
# define color scale
colors = px.colors.qualitative.Plotly
# define one random color for every node; iterating the Series yields its
# values, so the mapping is keyed by integer node id
node_colors_mappings = {node: np.random.choice(colors) for node in nodes}
node_colors = [node_colors_mappings[node] for node in nodes]
# NOTE: edge_colors mirrors node_colors and is not passed to the figure below
edge_colors = list(node_colors)

# Plot Graphs
fig = go.Figure(
    go.Sankey(
        node=dict(
            label=nodes.index,
            line=dict(color="white", width=1.0),
            color=node_colors,
        ),
        link={
            # translate the source/target labels into integer node ids
            "source": nodes.loc[df["source"]],
            "target": nodes.loc[df["target"]],
            "value": df["value"],
        },
    )
)
# Update Layout
#fig.update_layout(title_text="FOREST FIRES IN PORTUGAL",
#                  height = 900,
#                  width=1600,
#                  font_size=12)
fig.update_layout(plot_bgcolor='black', paper_bgcolor='black', font=dict(size=10, color='white'),)
# START APP -----------------------------------------------------
app = dash.Dash(
    external_stylesheets=[dbc.themes.CYBORG],
    meta_tags=[{"name": "viewport", "content": "width=device-width, initial-scale=1"}],
)
app.title = 'VOSTPT - ICNF'


def _multi_select(component_id, choices, initial, hint):
    """Build a searchable, clearable multi-value dropdown.

    component_id -- Dash id of the dropdown
    choices      -- iterable of values; each one is used as both label and value
    initial      -- values selected when the page loads
    hint         -- placeholder text shown when nothing is selected
    """
    return dcc.Dropdown(
        id=component_id,
        options=[{"label": choice, "value": choice} for choice in choices],
        optionHeight=35,       # height/space between dropdown options
        value=initial,
        disabled=False,
        multi=True,            # several values may be selected at once
        searchable=True,
        search_value="",
        placeholder=hint,
        clearable=True,        # the user may remove the selected values
        style={"width": "100%"},
    )


# First row: a thick horizontal rule spanning all 12 grid columns
banner_row = dbc.Row(
    [
        dbc.Col(
            html.Hr(
                style={
                    "borderWidth": "2vh",
                    "width": "100%",
                    "borderColor": "#A30000",
                    "opacity": "unset",
                }
            ),
            width={"size": 12},
        ),
    ],
    className="g-0",
)

# Second row: page title and data-source caption (a children list is needed
# to place more than one column in a row)
title_row = dbc.Row(
    [
        dbc.Col(
            html.H3("FOREST FIRES IN PORTUGAL"),
            width={"size": 6, "offset": 0},
        ),
        dbc.Col(
            html.H4("Data ICNF", style={"color": "#A30000"}),
            width={"size": 5, "offset": 0},
        ),
    ],
)

# Third row: year and district selectors
filter_row = dbc.Row(
    [
        dbc.Col(
            _multi_select(
                "dropdown_year",
                df_in.ANO.unique(),
                [2013, 2022],
                "Please select year",
            ),
        ),
        dbc.Col(
            _multi_select(
                "dropdown_district",
                df_in.DISTRITO.unique(),
                ['Aveiro', 'Viseu'],
                "Please select District",
            ),
        ),
    ],
)

# Fourth row: the Sankey graph, seeded with the module-level figure
graph_row = dbc.Row(
    dbc.Col(dcc.Graph(id="sankey", figure=fig))
)

app.layout = dbc.Container(
    [
        banner_row,
        title_row,
        filter_row,
        graph_row,
    ],
)
@app.callback(
    Output(component_id="sankey", component_property="figure"),
    Input(component_id="dropdown_year", component_property="value"),
    Input(component_id="dropdown_district", component_property="value"),
)
def build_graph(dropdown_year, dropdown_district):
    """Rebuild the Sankey figure for the selected years and districts.

    Args:
        dropdown_year: years selected in the "dropdown_year" component
            (list-like, or None when nothing is selected).
        dropdown_district: districts selected in the "dropdown_district"
            component (list-like, or None when nothing is selected).

    Returns:
        A plotly Figure holding a vertically oriented Sankey trace with the
        flows ANO -> DISTRITO -> CONCELHO, weighted by the summed NCCO column.
    """
    # isin() only accepts list-likes, so treat a missing selection as empty
    # instead of letting None raise a TypeError inside the callback.
    selected_years = dropdown_year or []
    selected_districts = dropdown_district or []

    # Data Treatment
    # NOTE: the CSV is fetched again on every callback invocation.
    df_in = pd.read_csv('https://raw.githubusercontent.com/vostpt/ICNF_DATA/main/ICNF_2013_2022_SANKEY.csv')  # generated by updater.py
    # Cleanup where DISTRITO and CONCELHO have the same value
    df_in["CONCELHO"] = np.where(
        df_in["DISTRITO"] == df_in["CONCELHO"],
        df_in["CONCELHO"] + "_concelho",
        df_in["CONCELHO"],
    )
    # Deal with some duplicate names across source and target
    df_in["CONCELHO"] = df_in["CONCELHO"].str.capitalize()
    # Sort values in dataframe
    df_in = df_in.sort_values(["ANO", "DISTRITO", "CONCELHO"])

    # Keep only the rows matching both dropdown selections
    selection = df_in['ANO'].isin(selected_years) & df_in['DISTRITO'].isin(selected_districts)
    df_filter = df_in[selection].reset_index(drop=True)

    # More Data Treatment
    # Group by ANO and DISTRITO while summing NCCO, renaming columns for readability
    df = (
        df_filter.groupby(["ANO", "DISTRITO"], as_index=False)["NCCO"]
        .sum()
        .rename(columns={"ANO": "source", "DISTRITO": "target", "NCCO": "value"})
    )
    # Change ANO type to string so years can be used as node labels
    df["source"] = df["source"].astype(int).astype(str)
    # Append the DISTRITO -> CONCELHO step; further steps can be added the same way
    df = pd.concat(
        [
            df,
            df_filter.groupby(["DISTRITO", "CONCELHO"], as_index=False)["NCCO"]
            .sum()
            .rename(columns={"DISTRITO": "source", "CONCELHO": "target", "NCCO": "value"}),
        ]
    )

    # Create Nodes: a Series mapping every distinct label to an integer id
    nodes = np.unique(df[["source", "target"]], axis=None)
    nodes = pd.Series(index=nodes, data=range(len(nodes)))

    # define color scale
    colors = px.colors.sequential.Plasma_r
    # define one random color for every node
    node_colors = [np.random.choice(colors) for _ in range(len(nodes))]

    # Plot Graphs
    fig = go.Figure(
        go.Sankey(
            node=dict(
                label=nodes.index,
                color=node_colors,
            ),
            link={
                # translate the source/target labels into integer node ids
                "source": nodes.loc[df["source"]],
                "target": nodes.loc[df["target"]],
                "value": df["value"],
            },
        )
    )
    # Update Layout
    fig.update_layout(plot_bgcolor='black', paper_bgcolor='black', font=dict(size=10, color='white'))
    # Update Orientation
    fig.update_traces(orientation="v", selector=dict(type='sankey'))
    return fig
# Script entry point: start the Dash server in debug mode on port 8888
# only when this file is executed directly.
if __name__ == "__main__":
    app.run_server(debug=True, port=8888)
# END APP
One would expect that the years would appear by order, but they don't as you can see per this image:
Is there any way to force the Sankey graph to respect the sorting order of the dataframe?
Hi!
I haven't found a way to force an ordering of the Sankey links through the `node` and `link` attributes in Python code.
In the project I worked on, which dealt with years of study (1º through 12º), the Sankey diagram eventually started to mix them up, and my solution was to manually position the nodes by passing coordinates in the (0, 1] range to the `node['x']` and `node['y']` attributes.
But I have been reading some source code for a little bit. Many visualizations implemented in Plotly.js use the D3.js library, including Sankey, which uses the d3-sankey module. Now, d3-sankey itself seems to respect the ordering of the arrays passed to the sankey.links and sankey.nodes functions by default. The ordering can be further customized through the linkSort and nodeSort functions.
So, I think that Plotly's Sankey diagrams could respect the order of links and nodes in the dataframe, but at the moment Plotly doesn't provide us with the options to do so. I wasn't able to pinpoint where exactly the order is getting mangled yet. I wonder if it is a good idea to open a new issue asking for new sorting attributes in `plotly.graph_objects.Sankey`.
Hi @diogotito, thank you for your feedback. After running some experiments I found out that the ordering is done by the value passed to each node, as you can clearly see in the image I posted (I did other tests just to make sure). Opening an issue might be a good idea, based on what you describe, yes.
I am experiencing this even without any 0's
code to repro here
import plotly.graph_objects as go
# Minimal reproduction: 14 nodes with explicit positions and 14 links.
# Node positions handed to node.x / node.y (one entry per node).
x = [0.06101736788793636, 0.23475059910566495, 0.23475059910566495, 0.23475059910566495, 0.43176371766682314, 0.43176371766682314, 0.43176371766682314, 0.6881809422635077, 0.6881809422635077, 0.6881809422635077, 0.6881809422635077, 0.6881809422635077, 0.6881809422635077, 1.0]
y = [0.22857142857142856, 0.4, 0.22857142857142856, 0.05714285714285714, 0.5714285714285714, 0.4, 0.22857142857142856, 1.0, 0.8285714285714286, 0.6571428571428571, 0.4857142857142857, 0.3142857142857143, 0.14285714285714285, 0.7428571428571429]

# Each node is labelled with its own index ("0", "1", ...), all in one color.
labels = [str(node_id) for node_id in range(len(x))]
colors = 'blue'

# Links as parallel sequences: source node id, target node id, flow value.
source_links = (0, 0, 0, 1, 1, 1, 4, 4, 4, 4, 4, 4, 10, 7)
target_links = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13)
values = [19890, 20082, 60028, 4262, 15628, 0, 815, 887, 2560, 859, 853, 2550, 1504, 1504]

# pad and thickness are deliberately left at their defaults.
node_spec = {
    "line": {"color": "black", "width": 0.5},
    "label": labels,
    "color": colors,
    "x": x,
    "y": y,
}
# The flow values double as the link labels.
link_spec = {
    "source": source_links,
    "target": target_links,
    "value": values,
    "label": values,
}

sankey_trace = go.Sankey(arrangement="fixed", node=node_spec, link=link_spec)
fig = go.Figure(data=[sankey_trace])
fig.update_layout(title_text="repro", font_size=10)
fig.show()
comes out like this
but should look like this
The code below
generates which is not expected as:
I may have misunderstood the (x,y) coordinates logic or it is a bug...
Libraries version: plotly==4.9.0 dash==1.16.3 dash-core-components==1.12.1 dash-html-components==1.1.1