has2k1 / plotnine

A Grammar of Graphics for Python
https://plotnine.org
MIT License
4.01k stars 217 forks source link

Request for geom_sina #221

Closed eric-czech closed 5 years ago

eric-czech commented 5 years ago

Is there already a way to build something like a sina plot with existing layers/stats? If not, geom_sina would be a nice enhancement (a la ggforce).

For reference:

screen shot 2018-10-30 at 10 02 51 am
has2k1 commented 5 years ago

Yes. I had forgotten about it. I reviewed geom_sina when it was submitted to ggplot2.

TyberiusPrime commented 5 years ago

I can't quite nail it...

image

(positions/position_sina.py)

from copy import deepcopy

import numpy as np
import pandas as pd

from ..utils import resolution
from .position import position
from ..stats.stat_density import compute_density

class position_sina(position):
    """
    Spread points horizontally to reduce overplotting.

    The horizontal displacement of each point is scaled by the
    local density of the y values, so the cloud of points follows
    the outline of a violin plot.

    Parameters
    ----------
    width : float
        Overall jitter factor (scaled by the x resolution).
    bins : int
        Number of bins used to look up the local density.
    """
    REQUIRED_AES = {'x', 'y'}

    def __init__(self, width=.5, bins=128):
        self.params = {'width': width, 'bins': bins}

    def setup_params(self, data):
        # Scale the user-supplied width by the resolution of the
        # x values so the jitter is proportional to category spacing.
        params = deepcopy(self.params)
        params['width'] = resolution(data['x']) * params['width']
        return params

    @classmethod
    def compute_layer(cls, data, params, layout):
        # Jitter each (panel, group) subset independently, then
        # stitch the pieces back together.
        pieces = []
        for _, group_df in data.groupby(['PANEL', 'group']):
            original_index = group_df.index
            group_df = group_df.sort_values('y')
            n_bins = params['bins']
            # Kernel density estimate of y over the data range.
            dens = compute_density(
                group_df['y'],
                weight=[1] * len(group_df),
                range=(group_df['y'].min(), group_df['y'].max()),
                kernel='gau',
                bw='normal_reference',
                adjust=1,
                cut=3,
                gridsize=None,
                clip=(-np.inf, np.inf),
                n=n_bins)['density']
            # Normalise to [0, 1] so the density acts as a pure
            # per-point scale on the jitter amplitude.
            dens = dens / dens.max()
            # Rows are sorted by y, so repeating each bin's density
            # `count` times lines up with the sorted rows; reindexing
            # by the original index restores the caller's row order.
            counts, _ = np.histogram(group_df['y'], n_bins)
            scales = [dens.iloc[i]
                      for i, count in enumerate(counts)
                      for _ in range(count)]
            scales = pd.Series(
                scales, index=group_df.index).loc[original_index]
            offsets = np.random.uniform(-1, 1, len(group_df))
            group_df['x'] += offsets * scales * params['width']
            pieces.append(group_df)
        return pd.concat(pieces)

Plotsource:

def rnorm(size, loc, scale):
    """Draw `size` samples from a normal distribution N(loc, scale)."""
    return np.random.normal(loc, scale, size)


a = rnorm(500, 6, 1)
b = rnorm(400, 5, 1.5)

# Bimodal
c = np.hstack([rnorm(200, 3, .7), rnorm(50, 7, 0.4)])

# Trimodal
d = np.hstack([rnorm(200, 2, 0.7), rnorm(300, 5.5, 0.4), rnorm(100, 8, 0.4)])

# One label per sample, in the same order the values are stacked.
labels = (["Unimodal 1"] * len(a)
          + ["Unimodal 2"] * len(b)
          + ["Bimodal"] * len(c)
          + ["Trimodal"] * len(d))
df = pd.DataFrame({
    "Distribution": labels,
    "Value": np.hstack([a, b, c, d]),
})

# Violin outline plus sina-jittered points for comparison.
g = (p9.ggplot(df, p9.aes('Distribution', 'Value'))
     + p9.geom_violin(p9.aes(color='Distribution'))
     + p9.geom_point(p9.aes(color='Distribution'),
                     position=p9.position_sina(bins=4096))
     )
roumail commented 2 years ago

Hello, I'm running into problem with some of the code submitted in the commit 357cea0. Specifically the function that checks for groups using has_groups(data).

def setup_data(self, data):
    """
    Reject a continuous x aesthetic that has no explicit grouping.

    Parameters
    ----------
    data : pandas.DataFrame
        Layer data with at least an 'x' column.

    Returns
    -------
    pandas.DataFrame
        The input data, unchanged.

    Raises
    ------
    TypeError
        If x is continuous, the data has no groups and x takes more
        than one value.
    """
    # Use positional access (.iloc[0]) for the first x value:
    # label-based data.loc[...] raises KeyError when the frame has a
    # discontinuous index (e.g. after filtering without reset_index).
    if (array_kind.continuous(data['x']) and
            not has_groups(data) and
            (data['x'] != data['x'].iloc[0]).any()):
        raise TypeError("Continuous x aesthetic -- did you forget "
                        "aes(group=...)?")
    return data

The error trace is

File ~/user_files/envs//lib/python3.8/site-packages/plotnine/mapping/aes.py:543, in has_groups(data)
    528 """
    529 Check if data is grouped
    530 
   (...)
    539     If True, the data has groups.
    540 """
    541 # If any row in the group column is equal to NO_GROUP, then
    542 # all of them are, and the data has no groups
--> 543 return data.loc[0, 'group'] != NO_GROUP

File ~/user_files/envs//lib/python3.8/site-packages/pandas/core/indexing.py:960, in _LocationIndexer.__getitem__(self, key)
    958     key = tuple(com.apply_if_callable(x, self.obj) for x in key)
    959     if self._is_scalar_access(key):
--> 960         return self.obj._get_value(*key, takeable=self._takeable)
    961     return self._getitem_tuple(key)
    962 else:
    963     # we by definition only have the 0th axis

File ~/user_files/envs//lib/python3.8/site-packages/pandas/core/frame.py:3622, in DataFrame._get_value(self, index, col, takeable)
   3616 engine = self.index._engine
   3618 if not isinstance(self.index, MultiIndex):
   3619     # CategoricalIndex: Trying to use the engine fastpath may give incorrect
   3620     #  results if our categories are integers that dont match our codes
   3621     # IntervalIndex: IntervalTree has no get_loc
-> 3622     row = self.index.get_loc(index)
   3623     return series._values[row]
   3625 # For MultiIndex going through engine effectively restricts us to
   3626 #  same-length tuples; see test_get_set_value_no_partial_indexing

File ~/user_files/envs//lib/python3.8/site-packages/pandas/core/indexes/base.py:3623, in Index.get_loc(self, key, method, tolerance)
   3621     return self._engine.get_loc(casted_key)
   3622 except KeyError as err:
-> 3623     raise KeyError(key) from err
   3624 except TypeError:
   3625     # If we have a listlike key, _check_indexing_error will raise
   3626     #  InvalidIndexError. Otherwise we fall through and re-raise
   3627     #  the TypeError.
   3628     self._check_indexing_error(key)

KeyError: 0

I tried to specify the group aesthetic and a number of other things but it doesn't seem to work.

has2k1 commented 2 years ago

@roumail, please file a new issue with a minimal reproducible example.

idavi-bcs commented 1 year ago

It seems to be caused by having a discontinuous index on the data frame, or maybe not having a value with index == 0. Doing df = df.reset_index(drop=True) fixed the KeyError for me.