srlearn / datasets

srlearn-compatible relational datasets
https://srlearn.github.io/relational-datasets/
MIT License
2 stars 0 forks source link

California Housing Dataset #20

Closed hayesall closed 2 years ago

hayesall commented 2 years ago

Converted from `sklearn.datasets.fetch_california_housing`, roughly as follows:

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import KFold
from sklearn.preprocessing import KBinsDiscretizer

from relational_datasets.convert import from_numpy

# Sketch of converting the scikit-learn California housing data into the
# relational format: discretize the features fold by fold, then hand each
# split to `from_numpy`.
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Names handed to `from_numpy` for every split.
# NOTE(review): presumably one name per feature column with the target name
# ("med_house_val") last -- confirm against the `from_numpy` documentation.
variable_names = [
    "med_inc",
    "house_age",
    "ave_rooms",
    "ave_bedrms",
    "population",
    "ave_occup",
    "latitude",
    "longitude",
    "med_house_val",
]

# Ordinal encoding with five bins per feature; the result is cast to int below.
disc = KBinsDiscretizer(n_bins=5, encode="ordinal")
splitter = KFold(n_splits=5)

for fold_idx, (train_idx, test_idx) in enumerate(splitter.split(X)):
    train_rows, test_rows = X[train_idx], X[test_idx]

    # The discretizer is fitted on the training rows only; the held-out rows
    # are transformed with that same fitted discretizer.
    X_train = disc.fit_transform(train_rows).astype(int)
    X_test = disc.transform(test_rows).astype(int)

    train, modes = from_numpy(X_train, y[train_idx], names=variable_names)
    # The second return value for the test split is discarded.
    test, _ = from_numpy(X_test, y[test_idx], names=variable_names)

# Runs once, after the loop: prints the `modes` left over from the final fold.
mode_text = "\n".join(modes)
print(mode_text)