mwaskom / seaborn

Statistical data visualization in Python
https://seaborn.pydata.org
BSD 3-Clause "New" or "Revised" License
12.59k stars 1.93k forks source link

Error bars not attached to actual height of sns.barplot when using bottom arg #3764

Closed droricu closed 1 month ago

droricu commented 1 month ago

I'm creating a stacked bar graph using this code: `fig, ax = plt.subplots(figsize=(30,10))

sns.barplot(x='Sample' , y=Normalized_Summary['>3nt insertion'], data=Normalized_Summary, bottom = AvgNorm1['>3nt deletion']+AvgNorm1['1-3nt deletion']+AvgNorm1['1-3nt insertion'], label='>3nt insertion', color="blue") sns.barplot(x='Sample' , y=Normalized_Summary['1-3nt insertion'], data=Normalized_Summary, bottom = AvgNorm1['>3nt deletion']+AvgNorm1['1-3nt deletion'], label='1-3nt insertion', color="green") sns.barplot(x=Normalized_Summary['Sample'] , y=Normalized_Summary['1-3nt deletion'], data=Normalized_Summary, bottom = AvgNorm1['>3nt deletion'], label='1-3nt deletion', color="orange") sns.barplot(x=Normalized_Summary['Sample'] , y=Normalized_Summary['>3nt deletion'], data=Normalized_Summary, bottom = 0, label='>3nt deletion', color="red") `

Where each Sample shows up multiple times.

And getting this result: image

As you can see, the error bars are not appearing at the actual top of the stacked bar, but at the height of the bar as if it began at 0.

Thanks!

mwaskom commented 1 month ago

Hello, bottom is not a seaborn parameter, it's a parameter of the underlying matplotlib plt.bar function, which does not get the error bar data.

This isn't expected to or advertised to work. Personally, I don't think error bars on stacked bar plots make very much sense, as the error estimates on the "upper" bars should take into account the error in the bars they are stacked on top of to properly represent uncertainty in that value.

VikramJaryal commented 1 month ago

Please refer to the code below:

""" Components for parsing variable assignments and internally representing plot data. """ from future import annotations

from collections.abc import Mapping, Sized from typing import cast import warnings

import pandas as pd from pandas import DataFrame

from seaborn._core.typing import DataSource, VariableSpec, ColumnName from seaborn.utils import _version_predates

class PlotData: """ Data table with plot variable schema and mapping to original names.

Contains logic for parsing variable specification arguments and updating
the table with layer-specific data and/or mappings.

Parameters
----------
data
    Input data where variable names map to vector values.
variables
    Keys are names of plot variables (x, y, ...) each value is one of:

    - name of a column (or index level, or dictionary entry) in `data`
    - vector in any format that can construct a

:class:pandas.DataFrame

Attributes
----------
frame
    Data table with column names having defined plot variables.
names
    Dictionary mapping plot variable names to names in source data

structure(s). ids Dictionary mapping plot variable names to unique data source identifiers.

"""
frame: DataFrame
frames: dict[tuple, DataFrame]
names: dict[str, str | None]
ids: dict[str, str | int]
source_data: DataSource
source_vars: dict[str, VariableSpec]

def __init__(
    self,
    data: DataSource,
    variables: dict[str, VariableSpec],
):
    """Initialize the table by normalizing the source and assigning variables."""
    # Coerce the input into the DataFrame / Mapping / None union first.
    data = handle_data_source(data)
    frame, names, ids = self._assign_variables(data, variables)

    self.frame = frame
    self.names = names
    self.ids = ids

    # The reason we possibly have a dictionary of frames is to support the
    # Plot.pair operation, post scaling, where each x/y variable needs its
    # own frame. This feels pretty clumsy and there are a bunch of places in
    # the client code with awkward if frame / elif frames constructions.
    # It would be great to have a cleaner abstraction here.
    self.frames = {}

    self.source_data = data
    self.source_vars = variables

def __contains__(self, key: str) -> bool:
    """Boolean check on whether a variable is defined in this dataset."""
    # Normal case: a single combined frame holds all plot variables.
    if self.frame is not None:
        return key in self.frame
    # After Plot.pair, data lives in per-subplot frames instead; the
    # variable counts as defined if any of them carries it.
    return any(key in df for df in self.frames.values())

def join(
    self,
    data: DataSource,
    variables: dict[str, VariableSpec] | None,
) -> PlotData:
    """Add, replace, or drop variables and return as a new dataset."""
    # Inherit the original source of the upstream data by default
    if data is None:
        data = self.source_data

    # TODO allow `data` to be a function (that is called on the source data?)

    if not variables:
        variables = self.source_vars

    # Passing var=None implies that we do not want that variable in this layer
    disinherit = [k for k, v in variables.items() if v is None]

    # Create a new dataset with just the info passed here
    new = PlotData(data, variables)

    # -- Update the inherited DataSource with this new information

    drop_cols = [k for k in self.frame if k in new.frame or k in disinherit]
    parts = [self.frame.drop(columns=drop_cols), new.frame]

    # Because we are combining distinct columns, this is perhaps more
    # naturally thought of as a "merge"/"join". But using concat because
    # some simple testing suggests that it is marginally faster.
    frame = pd.concat(parts, axis=1, sort=False, copy=False)

    names = {k: v for k, v in self.names.items() if k not in disinherit}
    names.update(new.names)

    ids = {k: v for k, v in self.ids.items() if k not in disinherit}
    ids.update(new.ids)

    new.frame = frame
    new.names = names
    new.ids = ids

    # Multiple chained operations should always inherit from the original object
    new.source_data = self.source_data
    new.source_vars = self.source_vars

    return new

def _assign_variables(
    self,
    data: DataFrame | Mapping | None,
    variables: dict[str, VariableSpec],
) -> tuple[DataFrame, dict[str, str | None], dict[str, str | int]]:
    """
    Assign values for plot variables given long-form data and/or vector

inputs.

    Parameters
    ----------
    data
        Input data where variable names map to vector values.
    variables
        Keys are names of plot variables (x, y, ...) each value is one

of:

        - name of a column (or index level, or dictionary entry) in

data

def handle_data_source(data: object) -> pd.DataFrame | Mapping | None: """Convert the data source object to a common union representation.""" if isinstance(data, pd.DataFrame) or hasattr(data, "dataframe"):

Check for pd.DataFrame inheritance could be removed once

    # minimal pandas version supports dataframe interchange (1.5.0).
    data = convert_dataframe_to_pandas(data)
elif data is not None and not isinstance(data, Mapping):
    err = f"Data source must be a DataFrame or Mapping, not

{type(data)!r}." raise TypeError(err)

return data

def convert_dataframe_to_pandas(data: object) -> pd.DataFrame:
    """Use the DataFrame exchange protocol, or fail gracefully.

    Pandas DataFrames pass through untouched; other objects are converted
    via `pd.api.interchange.from_dataframe`.

    Raises
    ------
    TypeError
        If the installed pandas lacks the interchange module.
    RuntimeError
        If the interchange conversion itself fails.
    """
    if isinstance(data, pd.DataFrame):
        return data

    if not hasattr(pd.api, "interchange"):
        msg = (
            "Support for non-pandas DataFrame objects requires a version of pandas "
            "that implements the DataFrame interchange protocol. Please upgrade "
            "your pandas version or coerce your data to pandas before passing "
            "it to seaborn."
        )
        raise TypeError(msg)

    if _version_predates(pd, "2.0.2"):
        msg = (
            "DataFrame interchange with pandas<2.0.2 has some known issues. "
            # NOTE: the paste dropped the dunders; this must be pd.__version__.
            f"You are using pandas {pd.__version__}. "
            "Continuing, but it is recommended to carefully inspect the results and to "
            "consider upgrading."
        )
        warnings.warn(msg, stacklevel=2)

    try:
        # This is going to convert all columns in the input dataframe, even though
        # we may only need one or two of them. It would be more efficient to select
        # the columns that are going to be used in the plot prior to interchange.
        # Solving that in general is a hard problem, especially with the objects
        # interface where variables passed in Plot() may only be referenced later
        # in Plot.add(). But noting here in case this seems to be a bottleneck.
        return pd.api.interchange.from_dataframe(data)
    except Exception as err:
        msg = (
            "Encountered an exception when converting data source "
            "to a pandas DataFrame. See traceback above for details."
        )
        raise RuntimeError(msg) from err

On Mon, 14 Oct, 2024, 5:26 pm Michael Waskom, @.***> wrote:

Closed #3764 https://github.com/mwaskom/seaborn/issues/3764 as completed.

— Reply to this email directly, view it on GitHub https://github.com/mwaskom/seaborn/issues/3764#event-14634313077, or unsubscribe https://github.com/notifications/unsubscribe-auth/BA2LFLYBUB6T3YXI4FHRONTZ3OWPPAVCNFSM6AAAAABPVT6WE2VHI2DSMVQWIX3LMV45UABCJFZXG5LFIV3GK3TUJZXXI2LGNFRWC5DJN5XDWMJUGYZTIMZRGMYDONY . You are receiving this because you are subscribed to this thread.Message ID: @.***>