Open matquant14 opened 1 year ago
I've never had to do this before, and I'm not sure how good a fit this is to the columnar style...
That said - maybe this code snippet helps:
df.pipe(lambda x: x.select(x.columns[1:], x.columns[0]))
You could turn this into a function too:
def _shifth(self:pl.DataFrame, n:int)->pl.DataFrame:
if n < 0:
n = len(self.columns)+n
return self.select(self.columns[n:] + self.columns[:n])
df.pipe(_shifth, -3)
and register it as a method using the "Extend the API" instructions: https://pola-rs.github.io/polars/py-polars/html/reference/api.html
Thanks for the suggestion. Not quite what I was referring to. I was looking more for shifting rows while keeping columns in tact.
In pandas its as easy as this:
def _shift_columns(df: pd.DataFrame,
start: int,
end: int,
periods: int,
axis: int | str,
fill = str) -> pd.DataFrame:
df.iloc[start:end, :] = df.iloc[start:end, :].shift(periods = periods, axis = axis, fill_value = fill)
return df
Could probably be simplified - but perhaps this helps:
import polars as pl
df = pl.from_repr("""
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 5 ┆ 9 │
│ 2 ┆ 6 ┆ 10 │
│ 3 ┆ 7 ┆ 11 │
│ 4 ┆ 8 ┆ 12 │
└─────┴─────┴─────┘
""")
start = 1
end = 2
periods = 2
fill_value = "moo"
values = [
pl.lit(fill_value) if col is None else col
for col in pl.Series(df.columns).shift(periods)
]
exprs = dict(zip(df.columns, values))
df.update(
df.select(**exprs)
.with_columns(
pl.when(pl.first().cumcount().is_between(start, end))
.then(pl.all())
)
)
shape: (4, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 5 ┆ 9 │
│ moo ┆ moo ┆ 2 │
│ moo ┆ moo ┆ 3 │
│ 4 ┆ 8 ┆ 12 │
└─────┴─────┴─────┘
Thanks @cmdlineluser! I still need familiarize myself w/ the API a little more, but your code helps a lot. The workaround I wrote isn't quite as elegant
def _hshift(df: pl.DataFrame | pl.LazyFrame,
periods: int,
idx: int | List[int],
fill_value: Any = None) -> pl.DataFrame | pl.LazyFrame:
"""
Method to shift columns horizontally w/in dataframe
"""
is_lazy = isinstance(df, pl.LazyFrame)
if is_lazy:
df = df.collect()
rows = df.rows()
if isinstance(idx, list):
for x in idx:
rows[x] = rows[x][-periods:] + rows[x][:-periods]
if fill_value:
rows[x] = list(rows[x])
rows[x][-periods + 1] = fill_value
else:
rows[idx] = rows[idx][-periods:] + rows[idx][:-periods]
if fill_value:
rows[idx] = list(rows[idx])
rows[idx][-periods + 1] = fill_value
df = pl.DataFrame(rows[1:]).transpose(column_names = rows[0])
if is_lazy:
return df.lazy()
else:
return df
I feel this is so wrong. What are you trying to do?
I'm parsing tables from PDF files using tabula-py, which uses JAVA on the backend. Tabula can dump the parsed table into a pandas dataframe, but sometimes it doesn't always parse all the rows correctly. Sometimes the values end up shifted into the wrong column.
I already have a process set up using pandas but I was hoping to fix the parsed table w/ polars (without having to rewrite the entire I/O process), since I've been incorporating it more into my workflows.
For example, after parsing the PDF the dataframe may look like this
| a | b | c |
|--------|--------|-------|
| A | 1/2/00 | 17:00 |
| D | 5/4/18 | 07:00 |
| 6/7/22 | 07:00 | |
but I need to get it to look like this
| a | b | c |
|--------|--------|-------|
| A | 1/2/00 | 17:00 |
| D | 5/4/18 | 07:00 |
| D | 6/7/22 | 07:00 |
With regards to PDF parsing - pdfplumber
may be of interest if you're not aware of it.
https://github.com/jsvine/pdfplumber#extracting-tables
You can customize all of the table finding parameters if the defaults don't extract the data cleanly.
One option that will not perform well (but you might not mind because the data on each page of the pdf is small) is to use transpose
.
1 - transpose rows to columns 2- shift columns 3- transpose back columns to rows
thanks @cmdlineluser. I know of pdfplumber, but haven't used it. Looks like the table extraction is similar/based off Tabula, but I'll take a look.
@lucazanna I did consider that, but I was running into performance issues as you mention. This is my current workaround, after a few iterations:
def _switch_rows(rows: Tuple[Any] | List[Tuple[Any, ...]] | List[Dict[str, Any]],
idx: int | List[int],
periods: int,
fill_value: str | None = None
) -> Tuple[List[Any], ...]:
"""
Switch values in a List or tuple
"""
rows = [list(row) for row in rows]
if isinstance(idx, list):
for x in idx:
rows[x] = rows[x][-periods:] + rows[x][:-periods]
if fill_value:
rows[x][-periods + 1] = fill_value
else:
rows[idx] = rows[idx][-periods:] + rows[idx][:-periods]
if fill_value:
rows[idx][-periods + 1] = fill_value
switched_rows = tuple(list(x) for x in zip(*rows))
return switched_rows
@staticmethod
def _hshift(df: pl.DataFrame | pl.LazyFrame,
periods: int,
idx: int | List[int],
fill_value: str | int | float | None = None) -> pl.DataFrame | pl.LazyFrame:
"""
Method to shift columns horizontally w/in dataframe
"""
columns = df.columns
match df:
case pl.LazyFrame():
rows = Schedule._switch_rows(df.fetch().rows(), idx, periods, fill_value)
ldf = pl.LazyFrame(rows, schema = columns)
return ldf
case pl.DataFrame():
rows = Schedule._switch_rows(df.rows(), idx, periods, fill_value)
df = pl.DataFrame(rows, schema = columns)
return df
Here's a right shift using the logic that the column c is null that only uses expressions, it doesn't decompose the dataframe into rows and put it back together.
df=pl.DataFrame({'a':['A','D','6/7/22'], 'b':['1/2/00','5/4/18','07:00'], 'c':['17:00','07:00',None]})
shape: (3, 3)
┌────────┬────────┬───────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞════════╪════════╪═══════╡
│ A ┆ 1/2/00 ┆ 17:00 │
│ D ┆ 5/4/18 ┆ 07:00 │
│ 6/7/22 ┆ 07:00 ┆ null │
└────────┴────────┴───────┘
df.with_columns(
allcols= pl.when(pl.col('c').is_not_null())
.then(pl.struct(pl.all()))
.otherwise(pl.struct([
pl.col(df.columns[i]).alias(df.columns[i+1])
for i in range(len(df.columns)-1)
]))
).drop(df.columns).unnest('allcols')
shape: (3, 3)
┌──────┬────────┬───────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞══════╪════════╪═══════╡
│ A ┆ 1/2/00 ┆ 17:00 │
│ D ┆ 5/4/18 ┆ 07:00 │
│ null ┆ 6/7/22 ┆ 07:00 │
└──────┴────────┴───────┘
that's just the right shift, if the logic should be that the missing a
should just come down from the previous row then just add a shift term like this:
df.with_columns(
allcols= pl.when(pl.col('c').is_not_null())
.then(pl.struct(pl.all()))
.otherwise(pl.struct([
pl.col(df.columns[i]).alias(df.columns[i+1])
for i in range(len(df.columns)-1)
] + [pl.col(df.columns[0]).shift()]))
).drop(df.columns).unnest('allcols')
shape: (3, 3)
┌─────┬────────┬───────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞═════╪════════╪═══════╡
│ A ┆ 1/2/00 ┆ 17:00 │
│ D ┆ 5/4/18 ┆ 07:00 │
│ D ┆ 6/7/22 ┆ 07:00 │
└─────┴────────┴───────┘
If you want to do it by row index then this should work
df.with_row_count().with_columns(
allcols= pl.when(pl.col('row_nr')!=2)
.then(pl.struct(pl.exclude('row_nr')))
.otherwise(pl.struct([
pl.col(df.columns[i]).alias(df.columns[i+1])
for i in range(len(df.columns)-1)
] + [pl.col(df.columns[0]).shift()]))
).drop(df.columns + ['row_nr']).unnest('allcols')
shape: (3, 3)
┌─────┬────────┬───────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞═════╪════════╪═══════╡
│ A ┆ 1/2/00 ┆ 17:00 │
│ D ┆ 5/4/18 ┆ 07:00 │
│ D ┆ 6/7/22 ┆ 07:00 │
└─────┴────────┴───────┘
Thanks @deanm0000!
Working off @deanm0000's suggestion, think I've got an improved implementation vs. my previous one. might still slow down using row counts, but better...
def _hshift(df: pl.LazyFrame | pl.DataFrame,
idx: int | List[int],
fill_value: str | int | float | None = None) -> pl.LazyFrame | pl.DataFrame:
import polars.selectors as cs
if isinstance(idx, int):
idx = [idx]
columns = cs.expand_selector(df, cs.all())
ndf = (df.with_row_count(name = 'rownum')
.with_columns((pl.when(pl.col('rownum').is_in(idx))
.then(pl.struct([pl.col(columns[i]).alias(columns[i + 1])
for i in range(len(columns) - 1)]
)
)
.otherwise(pl.struct(pl.all().exclude('rownum')))
)
.alias('allcols')
)
.select('allcols')
.unnest('allcols')
)
if fill_value:
return (ndf
.with_row_count(name = 'rownum')
.with_columns(pl.when(pl.col('rownum').is_in(idx))
.then(pl.all().fill_null(fill_value))
.otherwise(pl.all())
)
.select(pl.all().exclude('rownum'))
)
return ndf
Problem description
Pandas permits shifting values along both row and column axes.
Can similar implementation be done to shift rows, left and right? Currently you can only shift up and down. Would be great if there's an option to set the axis it shifts on.