Closed eric-czech closed 5 years ago
Yes. I had forgotten about it. I reviewed geom_sina when it was submitted to ggplot2.
I can't quite nail it...
(positions/position_sina.py)
from copy import deepcopy
import numpy as np
import pandas as pd
from ..utils import resolution
from .position import position
from ..stats.stat_density import compute_density
class position_sina(position):
    """
    Jitter points along the x-axis to avoid overplotting

    The jitter width is determined by the local density of
    the y values, similar to the contours of a violin plot.

    Parameters
    ----------
    width : float
        Overall factor of jitter. In ``setup_params`` it is
        multiplied by the resolution of the x values.
    bins : int
        Number of histogram bins the y values of each group
        are placed in, and the number of points at which the
        density of the group is evaluated.
    """
    REQUIRED_AES = {'x', 'y'}

    def __init__(self, width=.5, bins=128):
        self.params = {'width': width, 'bins': bins}

    def setup_params(self, data):
        """
        Return a copy of the parameters with ``width`` scaled
        by the resolution of the x values in ``data``.
        """
        # Work on a copy so that the parameters stored on the
        # instance are not rescaled again on a later call
        params = deepcopy(self.params)
        params['width'] = resolution(data['x']) * params['width']
        return params

    @classmethod
    def compute_layer(cls, data, params, layout):
        """
        Jitter x within every (PANEL, group) subset of ``data``

        Each point is moved by a uniform random amount in
        ``[-1, 1]`` scaled by ``params['width']`` and by the
        density (normalised to a peak of 1) of the histogram
        bin that holds the point's y value.

        Groups for which no usable density can be computed
        (fewer than two distinct y values, or a density peak
        that is zero or not finite) are returned without jitter.
        Rows of each group are returned sorted by y.
        """
        # pd.concat raises on an empty list, nothing to jitter
        if len(data) == 0:
            return data

        nbin = params['bins']
        rejoined = []
        for _, sub_df in data.groupby(['PANEL', 'group']):
            org_index = sub_df.index
            # Sorted so that the points line up with the
            # histogram bins, which are in increasing y order
            sub_df = sub_df.sort_values('y')

            # The density range would have zero width
            if sub_df['y'].nunique() < 2:
                rejoined.append(sub_df)
                continue

            density = compute_density(
                sub_df['y'],
                weight=[1] * len(sub_df),
                range=(sub_df['y'].min(), sub_df['y'].max()),
                kernel='gau',
                bw='normal_reference',
                adjust=1,
                cut=3,
                gridsize=None,
                clip=(-np.inf, np.inf),
                n=nbin)['density']

            # Guard the normalisation against division by zero/NaN
            peak = density.max()
            if not np.isfinite(peak) or peak <= 0:
                rejoined.append(sub_df)
                continue
            density = density / peak

            # hist[i] points (in sorted order) fall into bin i and
            # they all get the i-th density value
            hist, _ = np.histogram(sub_df['y'], nbin)
            d = np.repeat(np.asarray(density)[:len(hist)], hist)
            d = pd.Series(d, index=sub_df.index).loc[org_index]

            jitter = np.random.uniform(
                -1, 1, len(sub_df)) * d * params['width']
            # Addition aligns on the index, so every row gets the
            # jitter computed for its own label
            sub_df['x'] = sub_df['x'] + jitter
            rejoined.append(sub_df)

        # groupby produced no groups
        if not rejoined:
            return data
        return pd.concat(rejoined)
Plot source:
def rnorm(size, loc, scale):
    """
    Draw ``size`` samples from a normal distribution

    Thin wrapper around ``np.random.normal`` that takes the
    number of samples first, followed by the mean (``loc``)
    and the standard deviation (``scale``).
    """
    samples = np.random.normal(loc=loc, scale=scale, size=size)
    return samples
# The samples are drawn in a fixed order (a, b, then the components
# of c, then the components of d).
a = rnorm(500, 6, 1)
b = rnorm(400, 5, 1.5)
# Bimodal
c = np.hstack([rnorm(200, 3, .7), rnorm(50, 7, 0.4)])
# Trimodal
d = np.hstack([rnorm(200, 2, 0.7), rnorm(300, 5.5, 0.4), rnorm(100, 8, 0.4)])

# One label per observation, in the same order the values are stacked
labels = []
for name, sample in [("Unimodal 1", a),
                     ("Unimodal 2", b),
                     ("Bimodal", c),
                     ("Trimodal", d)]:
    labels.extend([name] * len(sample))

df = pd.DataFrame({
    "Distribution": labels,
    "Value": np.hstack([a, b, c, d]),
})

# Violin outlines with the sina-jittered points drawn over them
g = (
    p9.ggplot(df, p9.aes('Distribution', 'Value'))
    + p9.geom_violin(p9.aes(color='Distribution'))
    + p9.geom_point(
        p9.aes(color='Distribution'),
        position=p9.position_sina(bins=4096),
    )
)
Hello, I'm running into a problem with some of the code submitted in commit 357cea0, specifically the function that checks for groups using has_groups(data).
def setup_data(self, data):
    """
    Check that a continuous x aesthetic comes with a grouping

    Parameters
    ----------
    data : dataframe
        Layer data with an ``x`` column.

    Returns
    -------
    out : dataframe
        ``data``, unchanged.

    Raises
    ------
    TypeError
        If x is continuous, the data has no groups and x holds
        more than one distinct value.
    """
    # Compare against the first row by position. A label lookup
    # (.loc) fails when the index has no matching label, e.g. after
    # the dataframe has been filtered
    if (array_kind.continuous(data['x']) and
            not has_groups(data) and
            (data['x'] != data['x'].iloc[0]).any()):
        raise TypeError("Continuous x aesthetic -- did you forget "
                        "aes(group=...)?")
    return data
The error trace is
File ~/user_files/envs//lib/python3.8/site-packages/plotnine/mapping/aes.py:543, in has_groups(data)
528 """
529 Check if data is grouped
530
(...)
539 If True, the data has groups.
540 """
541 # If any row in the group column is equal to NO_GROUP, then
542 # the data all of them are and the data has no groups
--> 543 return data.loc[0, 'group'] != NO_GROUP
File ~/user_files/envs//lib/python3.8/site-packages/pandas/core/indexing.py:960, in _LocationIndexer.__getitem__(self, key)
958 key = tuple(com.apply_if_callable(x, self.obj) for x in key)
959 if self._is_scalar_access(key):
--> 960 return self.obj._get_value(*key, takeable=self._takeable)
961 return self._getitem_tuple(key)
962 else:
963 # we by definition only have the 0th axis
File ~/user_files/envs//lib/python3.8/site-packages/pandas/core/frame.py:3622, in DataFrame._get_value(self, index, col, takeable)
3616 engine = self.index._engine
3618 if not isinstance(self.index, MultiIndex):
3619 # CategoricalIndex: Trying to use the engine fastpath may give incorrect
3620 # results if our categories are integers that dont match our codes
3621 # IntervalIndex: IntervalTree has no get_loc
-> 3622 row = self.index.get_loc(index)
3623 return series._values[row]
3625 # For MultiIndex going through engine effectively restricts us to
3626 # same-length tuples; see test_get_set_value_no_partial_indexing
File ~/user_files/envs//lib/python3.8/site-packages/pandas/core/indexes/base.py:3623, in Index.get_loc(self, key, method, tolerance)
3621 return self._engine.get_loc(casted_key)
3622 except KeyError as err:
-> 3623 raise KeyError(key) from err
3624 except TypeError:
3625 # If we have a listlike key, _check_indexing_error will raise
3626 # InvalidIndexError. Otherwise we fall through and re-raise
3627 # the TypeError.
3628 self._check_indexing_error(key)
KeyError: 0
I tried specifying the group aesthetic and a number of other things, but nothing seems to work.
@roumail, please file a new issue with a minimal reproducible example.
It seems to be caused by having a discontinuous index on the data frame, or maybe by not having a row with index == 0. Doing df = df.reset_index(drop=True) fixed the KeyError for me.
Is there already a way to build something like a sina plot with existing layers/stats? If not, geom_sina would be a nice enhancement (a la ggforce).
For reference: