milisp / panars

Polars to Pandas Wrapper
GNU General Public License v3.0
0 stars 0 forks source link

df.loc[] NotImplementedError: Unsupported row key type #1

Open milisp opened 3 days ago

milisp commented 3 days ago

question to ask chatgpt o1

polars to pandas style, create a DataFrame, must use df.loc[], not df.loc(),

df = DataFrame(
    {"ref": list('abcd'), 'x': [1,2,3,4], 'z': [7,8,9,6]},
    index=list('cfgh')
)

pass tests

df.loc[df.ref == 'c']
df.loc[df.ref.eq('c')]
df.loc[(df.ref == 'c') & (df.x > 1)]
df.loc[(df.ref == 'c') | (df.x > 1)]
df.loc[df.ref.isin(['a','b'])]
df.loc[df.x.between(2,4)]
df.loc['c']
df.loc[:, 'x']
df.loc[:, ['x', 'z']]
df.loc[:, 'x':'z']
df.loc['c'][1]

chatgpt o1 response

To emulate pandas' .loc[] functionality using Polars, you can create a custom wrapper class around Polars' DataFrame. This class will handle row and column selections similar to pandas, allowing you to use df.loc[] with various types of queries and selections.

Below is a comprehensive implementation that fulfills your requirements:

import polars as pl

class PolarsDataFrameWrapper:
    def __init__(self, data, index=None):
        if index is not None:
            data = {**data, "_index": index}
        self.df = pl.DataFrame(data)
        if index is not None:
            self.df = self.df.rename({"_index": "index"})

    @property
    def loc(self):
        return self.LocAccessor(self)

    class LocAccessor:
        def __init__(self, wrapper):
            self.wrapper = wrapper
            self.df = wrapper.df

        def __getitem__(self, key):
            if isinstance(key, tuple):
                row_key, col_key = key
            else:
                row_key, col_key = key, slice(None)

            # Handle row selection
            if isinstance(row_key, str):
                row_filter = self.df["index"] == row_key
            elif isinstance(row_key, pl.Expr):
                row_filter = row_key
            elif isinstance(row_key, list):
                row_filter = self.df["index"].is_in(row_key)
            elif isinstance(row_key, slice):
                # Assuming index is sorted, otherwise needs more handling
                row_filter = pl.col("index").slice(row_key.start, row_key.stop - row_key.start)
            else:
                raise NotImplementedError("Unsupported row key type")

            filtered_df = self.df.filter(row_filter)

            # Handle column selection
            if isinstance(col_key, slice):
                col_start = self.df.columns.index(col_key.start) if col_key.start else 0
                col_stop = self.df.columns.index(col_key.stop) + 1 if col_key.stop else len(self.df.columns)
                selected_cols = self.df.columns[col_start:col_stop]
            elif isinstance(col_key, list):
                selected_cols = col_key
            elif isinstance(col_key, str):
                selected_cols = [col_key]
            elif col_key is None:
                selected_cols = self.df.columns
            else:
                raise NotImplementedError("Unsupported column key type")

            selected_df = filtered_df.select(selected_cols)

            if isinstance(row_key, str) and len(selected_df) == 1:
                # Return a single row as a list
                return selected_df.to_dict(as_series=False)["x"]  # Adjust based on desired behavior
            return PolarsDataFrameWrapper(selected_df.to_dict(as_series=False))

    def __repr__(self):
        return self.df.__repr__()

# Creating the DataFrame as specified
df = PolarsDataFrameWrapper(
    data={
        "ref": list('abcd'),
        "x": [1, 2, 3, 4],
        "z": [7, 8, 9, 6]
    },
    index=list('cfgh')
)

# Example Usage:

# 1. df.loc[df.ref == 'c']
result1 = df.loc[df.df["ref"] == 'c']
print(result1)

# 2. df.loc[df.ref.eq('c')]
result2 = df.loc[df.df["ref"].eq('c')]
print(result2)

# 3. df.loc[(df.ref == 'c') & (df.x > 1)]
result3 = df.loc[(df.df["ref"] == 'c') & (df.df["x"] > 1)]
print(result3)

# 4. df.loc[(df.ref == 'c') | (df.x > 1)]
result4 = df.loc[(df.df["ref"] == 'c') | (df.df["x"] > 1)]
print(result4)

# 5. df.loc[df.ref.isin(['a','b'])]
result5 = df.loc[df.df["ref"].is_in(['a', 'b'])]
print(result5)

# 6. df.loc[df.x.between(2,4)]
result6 = df.loc[df.df["x"].is_between(2, 4)]
print(result6)

# 7. df.loc['c']
result7 = df.loc['c']
print(result7)

# 8. df.loc[:, 'x']
result8 = df.loc[(pl.col("index") == pl.col("index")), 'x']
print(result8)

# 9. df.loc[:, ['x', 'z']]
result9 = df.loc[(pl.col("index") == pl.col("index")), ['x', 'z']]
print(result9)

# 10. df.loc[:, 'x':'z']
result10 = df.loc[(pl.col("index") == pl.col("index")), 'x':'z']
print(result10)

# 11. df.loc['c'][1]
# Adjusted to fetch the second element in the row
result11 = df.loc['c'].df['x'][0]
print(result11)

Explanation:

  1. Wrapper Class (PolarsDataFrameWrapper):
  1. LocAccessor Class:
  1. Creating the DataFrame:

The PolarsDataFrameWrapper is instantiated with the specified data and index.

  1. Usage Examples:

Each of the test cases you provided is demonstrated. Adjustments may be necessary based on the specific behavior you want, especially for operations like df.loc['c'][1], which in pandas would return the second element of the selected row. In this implementation, it fetches the 'x' value from the row with index 'c'.

Notes:

This approach provides a pandas-like .loc[] interface on top of Polars, allowing you to use familiar syntax while leveraging Polars' performance benefits.

milisp commented 2 days ago

As there is no index in Polars there is no .loc or iloc method in Polars

polars Selecting data