Open prutskov opened 4 years ago
Current implementation of DataFrame.unstack and Series.unstack is slightly faster than default_to_pandas.
DataFrame.unstack
Series.unstack
default_to_pandas
Additional performance information is described here.
System information
Describe the problem
Current implementation of `DataFrame.unstack` and `Series.unstack` is slightly faster than `default_to_pandas`.
Source code / logs
Script to measure
"""Benchmark ``unstack`` on a pandas DataFrame versus a Modin DataFrame.

The script generates (once) a CSV file of random integers, loads it with both
pandas and Modin, installs a two-level row index on each frame and times
``repr(df.unstack())`` for both libraries.
"""

import os
from timeit import default_timer as timer

import numpy as np
import pandas

RAND_LOW = -100
RAND_HIGH = 100
N = 50000
M = 128
MULTILINE = False
TEST_FILENAME = os.path.abspath(
    f"int_dataset-{N},{M},{RAND_LOW},{RAND_HIGH},{MULTILINE}.csv"
)


def generate_data_file(filename, row_n, col_n, multiline_rows=False):
    """Write a CSV of random integers in ``[RAND_LOW, RAND_HIGH)`` to *filename*.

    Parameters
    ----------
    filename : str
        Destination path of the CSV file.
    row_n : int
        Number of rows to generate.
    col_n : int
        Number of columns to generate (named ``col0`` ... ``col{col_n - 1}``).
    multiline_rows : bool, default False
        When true, every tenth column is built from ``row_n // 10`` chunks of
        one multi-line string followed by nine random integers.
    """
    data = {
        f"col{i}": np.concatenate(
            [
                np.concatenate(
                    [
                        ["some\nvery very very\nlong string\nwith many multilines"],
                        np.random.randint(RAND_LOW, RAND_HIGH, 9),
                    ]
                )
                for _ in range(row_n // 10)
            ]
        )
        if (i % 10 == 0 and multiline_rows)
        else np.random.randint(RAND_LOW, RAND_HIGH, row_n)
        for i in range(col_n)
    }
    print("dict generated!")
    df = pandas.DataFrame(data)
    print("dataframe created!")
    # NOTE: the row index is written too, so reading the file back yields one
    # extra leading column in addition to the ``col*`` columns.
    df.to_csv(filename)
    print("csv ready!")


def multiIndex_generator(df, axis=0):
    """Install a two-level MultiIndex on *df* in place and return *df*.

    Parameters
    ----------
    df : DataFrame
        Frame to modify (pandas, or any object with a compatible
        ``index`` / ``columns`` setter).
    axis : int, default 0
        ``0`` replaces the row index with ``(group, position)`` labels, the
        rows being split into 10 consecutive groups. Any other value replaces
        the columns with ``(0, position)`` labels.

    Returns
    -------
    DataFrame
        The same object that was passed in.
    """
    if axis == 0:
        n_rows = len(df.index)
        # Ceil division keeps the number of groups at (most) 10 even when the
        # row count is not a multiple of 10; ``max`` guards the modulo below
        # for an empty frame.
        group_size = max(1, -(-n_rows // 10))
        positions = np.arange(n_rows)
        # Integer arithmetic keeps both levels integral; true division here
        # used to yield float labels and a length mismatch for row counts
        # that are not divisible by 10.
        df.index = pandas.MultiIndex.from_arrays(
            [positions // group_size, positions % group_size]
        )
    else:
        df.columns = pandas.MultiIndex.from_tuples(
            [(0, i) for i in range(len(df.columns))]
        )
    return df


if __name__ == "__main__":
    # Imported lazily so the helpers above stay usable without Modin installed.
    import modin.pandas as pd

    if not os.path.exists(TEST_FILENAME):
        generate_data_file(TEST_FILENAME, N, M, MULTILINE)

    md_df = multiIndex_generator(pd.read_csv(TEST_FILENAME))
    pd_df = multiIndex_generator(pandas.read_csv(TEST_FILENAME))

    print(
        f"DataFrame shape: ({N}, {M}) ~ {os.stat(TEST_FILENAME).st_size // (1024 * 1024)}MB, {pd.DEFAULT_NPARTITIONS} cores"
    )

    # ``repr`` is called on each result so the timed region includes
    # rendering the unstacked frame, not just creating it.
    t1 = timer()
    repr(pd_df.unstack())
    print("PD unstack:", "{:.2f}".format(timer() - t1), "s")

    t1 = timer()
    repr(md_df.unstack())
    print("MD unstack:", "{:.2f}".format(timer() - t1), "s")