Closed zuliani99 closed 3 years ago
@zuliani99 Thank you for submitting the issue. To help the Ludwig Team assist you, we need additional information. Please post the complete log output. I also suggest adding the following code fragments:
Add to your imports: `import logging`
When creating the model instance, add the `logging_level` parameter:
model = LudwigModel(create_dict(label, target), logging_level=logging.INFO)
After creating `train`, do the following and post the output:
print(train.dtypes)
print(train.head())
Thank you.
Output from the print commands and ludwig:
date float64
day category
period float64
nswprice float64
nswdemand float64
vicprice float64
vicdemand float64
transfer float64
class string
dtype: object
-----------------------------------
date day period nswprice ... vicprice vicdemand transfer class
39673 0.898323 2 0.531915 0.071664 ... 0.004715 0.601502 0.532895 UP
41263 0.902836 7 0.659574 0.054732 ... 0.003515 0.308389 0.782895 DOWN
26705 0.469271 5 0.361702 0.060106 ... 0.003911 0.487830 0.483772 UP
163 0.000133 5 0.404255 0.089798 ... 0.003467 0.422915 0.414912 UP
39811 0.898456 5 0.404255 0.106791 ... 0.007164 0.661833 0.385526 UP
[5 rows x 9 columns]
date day period ... vicprice vicdemand transfer
26192 0.468785 1 0.680851 ... 0.001836 0.161833 0.698684
15084 0.434052 1 0.255319 ... 0.003467 0.422915 0.414912
13276 0.429185 5 0.595745 ... 0.003467 0.422915 0.414912
16740 0.438609 7 0.765957 ... 0.003467 0.422915 0.414912
28360 0.473873 4 0.851064 ... 0.001836 0.395650 0.743421
[5 rows x 8 columns]
{'input_features': [{'name': 'date', 'type': 'numerical'}, {'name': 'day', 'type': 'category'}, {'name': 'period', 'type': 'numerical'}, {'name': 'nswprice', 'type': 'numerical'}, {'name': 'nswdemand', 'type': 'numerical'}, {'name': 'vicprice', 'type': 'numerical'}, {'name': 'vicdemand', 'type': 'numerical'}, {'name': 'transfer', 'type': 'numerical'}], 'output_features': [{'name': 'class', 'type': 'category'}]}
Experiment name: api_experiment
Model name: run
Output directory: results/api_experiment_run_8
ludwig_version: '0.3.3'
command: ('/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py -f '
'/root/.local/share/jupyter/runtime/kernel-a80d8961-51a5-4e73-9abd-f89315480a1c.json')
random_seed: 42
data_format: "<class 'pandas.core.frame.DataFrame'>"
config: { 'combiner': {'type': 'concat'},
'input_features': [ { 'column': 'date',
'name': 'date',
'proc_column': 'date_mZFLky',
'tied': None,
'type': 'numerical'},
{ 'column': 'day',
'name': 'day',
'proc_column': 'day_mZFLky',
'tied': None,
'type': 'category'},
{ 'column': 'period',
'name': 'period',
'proc_column': 'period_mZFLky',
'tied': None,
'type': 'numerical'},
{ 'column': 'nswprice',
'name': 'nswprice',
'proc_column': 'nswprice_mZFLky',
'tied': None,
'type': 'numerical'},
{ 'column': 'nswdemand',
'name': 'nswdemand',
'proc_column': 'nswdemand_mZFLky',
'tied': None,
'type': 'numerical'},
{ 'column': 'vicprice',
'name': 'vicprice',
'proc_column': 'vicprice_mZFLky',
'tied': None,
'type': 'numerical'},
{ 'column': 'vicdemand',
'name': 'vicdemand',
'proc_column': 'vicdemand_mZFLky',
'tied': None,
'type': 'numerical'},
{ 'column': 'transfer',
'name': 'transfer',
'proc_column': 'transfer_mZFLky',
'tied': None,
'type': 'numerical'}],
'output_features': [ { 'column': 'class',
'dependencies': [],
'loss': { 'class_similarities_temperature': 0,
'class_weights': 1,
'confidence_penalty': 0,
'labels_smoothing': 0,
'robust_lambda': 0,
'type': 'softmax_cross_entropy',
'weight': 1},
'name': 'class',
'proc_column': 'class_mZFLky',
'reduce_dependencies': 'sum',
'reduce_input': 'sum',
'top_k': 3,
'type': 'category'}],
'preprocessing': { 'audio': { 'audio_feature': {'type': 'raw'},
'audio_file_length_limit_in_s': 7.5,
'in_memory': True,
'missing_value_strategy': 'backfill',
'norm': None,
'padding_value': 0},
'bag': { 'fill_value': '<UNK>',
'lowercase': False,
'missing_value_strategy': 'fill_with_const',
'most_common': 10000,
'tokenizer': 'space'},
'binary': { 'fill_value': 0,
'missing_value_strategy': 'fill_with_const'},
'category': { 'fill_value': '<UNK>',
'lowercase': False,
'missing_value_strategy': 'fill_with_const',
'most_common': 10000},
'date': { 'datetime_format': None,
'fill_value': '',
'missing_value_strategy': 'fill_with_const'},
'force_split': False,
'h3': { 'fill_value': 576495936675512319,
'missing_value_strategy': 'fill_with_const'},
'image': { 'in_memory': True,
'missing_value_strategy': 'backfill',
'num_processes': 1,
'resize_method': 'interpolate',
'scaling': 'pixel_normalization'},
'numerical': { 'fill_value': 0,
'missing_value_strategy': 'fill_with_const',
'normalization': None},
'sequence': { 'fill_value': '<UNK>',
'lowercase': False,
'missing_value_strategy': 'fill_with_const',
'most_common': 20000,
'padding': 'right',
'padding_symbol': '<PAD>',
'sequence_length_limit': 256,
'tokenizer': 'space',
'unknown_symbol': '<UNK>',
'vocab_file': None},
'set': { 'fill_value': '<UNK>',
'lowercase': False,
'missing_value_strategy': 'fill_with_const',
'most_common': 10000,
'tokenizer': 'space'},
'split_probabilities': (0.7, 0.1, 0.2),
'stratify': None,
'text': { 'char_most_common': 70,
'char_sequence_length_limit': 1024,
'char_tokenizer': 'characters',
'char_vocab_file': None,
'fill_value': '<UNK>',
'lowercase': True,
'missing_value_strategy': 'fill_with_const',
'padding': 'right',
'padding_symbol': '<PAD>',
'pretrained_model_name_or_path': None,
'unknown_symbol': '<UNK>',
'word_most_common': 20000,
'word_sequence_length_limit': 256,
'word_tokenizer': 'space_punct',
'word_vocab_file': None},
'timeseries': { 'fill_value': '',
'missing_value_strategy': 'fill_with_const',
'padding': 'right',
'padding_value': 0,
'timeseries_length_limit': 256,
'tokenizer': 'space'},
'vector': { 'fill_value': '',
'missing_value_strategy': 'fill_with_const'}},
'training': { 'batch_size': 128,
'bucketing_field': None,
'decay': False,
'decay_rate': 0.96,
'decay_steps': 10000,
'early_stop': 5,
'epochs': 100,
'eval_batch_size': 0,
'gradient_clipping': None,
'increase_batch_size_on_plateau': 0,
'increase_batch_size_on_plateau_max': 512,
'increase_batch_size_on_plateau_patience': 5,
'increase_batch_size_on_plateau_rate': 2,
'learning_rate': 0.001,
'learning_rate_warmup_epochs': 1,
'optimizer': { 'beta_1': 0.9,
'beta_2': 0.999,
'epsilon': 1e-08,
'type': 'adam'},
'reduce_learning_rate_on_plateau': 0,
'reduce_learning_rate_on_plateau_patience': 5,
'reduce_learning_rate_on_plateau_rate': 0.5,
'regularization_lambda': 0,
'regularizer': 'l2',
'staircase': False,
'validation_field': 'combined',
'validation_metric': 'loss'}}
tf_version: '2.4.1'
Using full dataframe
Building dataset (it may take a while)
Log errors:
ValueError Traceback (most recent call last)
<ipython-input-19-441538d59b28> in <module>()
21 model = LudwigModel(create_dict(label, target), logging_level=logging.INFO)
22
---> 23 train_stats = model.train(train)
24
25 predictions = model.predict(test)
13 frames
/usr/local/lib/python3.7/dist-packages/ludwig/api.py in train(self, dataset, training_set, validation_set, test_set, training_set_metadata, data_format, experiment_name, model_name, model_resume_path, skip_save_training_description, skip_save_training_statistics, skip_save_model, skip_save_progress, skip_save_log, skip_save_processed_input, output_directory, random_seed, debug, **kwargs)
415 random_seed=random_seed,
416 devbug=debug,
--> 417 **kwargs,
418 )
419 (training_set,
/usr/local/lib/python3.7/dist-packages/ludwig/api.py in preprocess(self, dataset, training_set, validation_set, test_set, training_set_metadata, data_format, skip_save_processed_input, random_seed, debug, **kwargs)
1259 preprocessing_params=self.config[PREPROCESSING],
1260 backend=self.backend,
-> 1261 random_seed=random_seed
1262 )
1263
/usr/local/lib/python3.7/dist-packages/ludwig/data/preprocessing.py in preprocess_for_training(config, dataset, training_set, validation_set, test_set, training_set_metadata, data_format, skip_save_processed_input, preprocessing_params, backend, random_seed)
1395 preprocessing_params=preprocessing_params,
1396 backend=backend,
-> 1397 random_seed=random_seed
1398 )
1399 training_set, test_set, validation_set, training_set_metadata = processed
/usr/local/lib/python3.7/dist-packages/ludwig/data/preprocessing.py in preprocess_for_training(features, dataset, training_set, validation_set, test_set, training_set_metadata, skip_save_processed_input, preprocessing_params, backend, random_seed)
183 preprocessing_params=preprocessing_params,
184 backend=backend,
--> 185 random_seed=random_seed
186 )
187
/usr/local/lib/python3.7/dist-packages/ludwig/data/preprocessing.py in _preprocess_df_for_training(features, dataset, training_set, validation_set, test_set, training_set_metadata, preprocessing_params, backend, random_seed)
1645 metadata=training_set_metadata,
1646 random_seed=random_seed,
-> 1647 backend=backend
1648 )
1649
/usr/local/lib/python3.7/dist-packages/ludwig/data/preprocessing.py in build_dataset(dataset_df, features, global_preprocessing_parameters, metadata, backend, random_seed)
1013 features,
1014 global_preprocessing_parameters,
-> 1015 backend
1016 )
1017
/usr/local/lib/python3.7/dist-packages/ludwig/data/preprocessing.py in build_metadata(dataset_df, features, global_preprocessing_parameters, backend)
1107 dataset_df,
1108 feature,
-> 1109 preprocessing_parameters
1110 )
1111
/usr/local/lib/python3.7/dist-packages/ludwig/data/preprocessing.py in handle_missing_values(dataset_df, feature, preprocessing_parameters)
1188 if computed_fill_value is not None:
1189 dataset_df[feature[COLUMN]] = dataset_df[feature[COLUMN]].fillna(
-> 1190 computed_fill_value,
1191 )
1192 elif missing_value_strategy in ['backfill', 'bfill', 'pad', 'ffill']:
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in fillna(self, value, method, axis, inplace, limit, downcast)
4533 inplace=inplace,
4534 limit=limit,
-> 4535 downcast=downcast,
4536 )
4537
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in fillna(self, value, method, axis, inplace, limit, downcast)
6056
6057 new_data = self._mgr.fillna(
-> 6058 value=value, limit=limit, inplace=inplace, downcast=downcast
6059 )
6060
/usr/local/lib/python3.7/dist-packages/pandas/core/internals/managers.py in fillna(self, value, limit, inplace, downcast)
584 def fillna(self, value, limit, inplace: bool, downcast) -> "BlockManager":
585 return self.apply(
--> 586 "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
587 )
588
/usr/local/lib/python3.7/dist-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs)
404 applied = b.apply(f, **kwargs)
405 else:
--> 406 applied = getattr(b, f)(**kwargs)
407 result_blocks = _extend_blocks(applied, result_blocks)
408
/usr/local/lib/python3.7/dist-packages/pandas/core/internals/blocks.py in fillna(self, value, limit, inplace, downcast)
1777 def fillna(self, value, limit=None, inplace=False, downcast=None):
1778 values = self.values if inplace else self.values.copy()
-> 1779 values = values.fillna(value=value, limit=limit)
1780 return [
1781 self.make_block_same_class(
/usr/local/lib/python3.7/dist-packages/pandas/core/arrays/categorical.py in fillna(self, value, method, limit)
1719 elif is_hashable(value):
1720 if not isna(value) and value not in self.categories:
-> 1721 raise ValueError("fill value must be in categories")
1722
1723 mask = codes == -1
ValueError: fill value must be in categories
Thank you a lot for the support I appreciate, I also hope you have a great Easter holiday
@zuliani99 Thank you for the additional info. From the error stack trace, the issue appears to be in the pandas library. I've searched for known issues and there may be some candidate issues. To help me pin down the root cause, please post a small sample of the `train` data set that experiences the error. I suggest writing `train` to a CSV file and posting the CSV to this issue. With that, I'm hoping to recreate the error on my side.
@zuliani99 I took a closer look at your code and I just noticed you're using a packaged sklearn dataset. Ignore my request to post a data set. I should be able to recreate the dataset on my side.
If I'm unable to recreate a dataset, I'll reach back out.
Hi @zuliani99, after looking into it in detail, your use case uncovered an issue in the way Ludwig treats the filling of NaN values for pandas category columns. We are working on it to find a clean solution in the codebase, but in the meantime you have a couple of workarounds:
{ 'column': 'day', 'type': 'category', 'preprocessing': {'fill_value': 'Monday'}}
train['day'] = train['day'].astype('object')
or also "string".
@zuliani99 In addition to what @w4nderlust suggested as a short-term workaround, you'll need to remove this line of code:
model.close()
The `close()` API is relevant to an earlier version of Ludwig. This API was removed starting in v0.3, and other breaking changes occurred. Please refer to the current API documentation for the current API behavior.
All right, @w4nderlust, @jimthompson5802, thank you a lot for the support. With all the changes suggested by you two and a few other small changes, the code finally works. By the way, is it recommended to cast categorical features to object features to make Ludwig work? And another question: how can I get the mean accuracy for a classification problem? Should I read the training_statistics.json file from the last api_experiment_run folder created?
Running code:
from ludwig.api import LudwigModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
import logging
def create_dict(label, target):
    """Build a Ludwig configuration dictionary from pandas-inferred dtypes.

    Every column of ``label`` becomes an input feature and the first column
    of ``target`` becomes the single output feature. The Ludwig feature type
    is chosen from ``pd.api.types.infer_dtype`` of the column.

    Args:
        label: DataFrame holding the input feature columns.
        target: DataFrame whose first column is the prediction target.

    Returns:
        A dict with ``input_features``, ``output_features`` and an empty
        ``training`` section. The dict is also printed before returning.
    """
    import warnings

    # One table for inputs and target alike, so that a boolean target is
    # mapped to 'binary' instead of being dropped.
    ludwig_types = {
        "floating": "numerical",
        "integer": "numerical",
        "categorical": "category",
        "boolean": "binary",
        "string": "text",
    }

    def describe(frame, column):
        # Return the feature entry for `column`, or None when its inferred
        # dtype has no mapping (the column is then left out of the config).
        inferred = pd.api.types.infer_dtype(frame[column])
        feature_type = ludwig_types.get(inferred)
        if feature_type is None:
            warnings.warn(
                f"Column {column!r} has unsupported inferred dtype "
                f"{inferred!r}; it is omitted from the configuration."
            )
            return None
        return {"name": column, "type": feature_type}

    model = {"input_features": [], "output_features": [], "training": {}}

    for column in label.columns:
        feature = describe(label, column)
        if feature is not None:
            model["input_features"].append(feature)

    target_feature = describe(target, target.columns[0])
    if target_feature is not None:
        model["output_features"].append(target_feature)

    print(model)
    return model
def ludwig(df):
    """Train a Ludwig model on ``df`` and return predictions on a held-out split.

    The last column of ``df`` is used as the target and all other columns as
    input features. The rows are split 80/20 into a training frame (inputs
    plus target) and a test frame (inputs only).

    Args:
        df: DataFrame whose last column is the prediction target.

    Returns:
        Whatever ``LudwigModel.predict`` returns for the test frame.
    """
    y = df.iloc[:, -1].to_frame()
    X = df.iloc[:, :-1]
    X_train, X_test, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=1
    )

    # Work on copies: the split frames are slices of `df`, and adding the
    # target column to a slice triggers pandas' SettingWithCopyWarning.
    train = X_train.copy()
    train[y_train.columns[0]] = y_train.iloc[:, 0]
    test = X_test.copy()

    # Workaround for "ValueError: fill value must be in categories": Ludwig
    # fills missing values with a constant that is not one of the categories
    # of a pandas `category` column, so cast such columns to object first.
    train = train.astype(
        {col: "object" for col in train.select_dtypes(include="category").columns}
    )
    test = test.astype(
        {col: "object" for col in test.select_dtypes(include="category").columns}
    )

    print(train.dtypes)
    print("-----------------------------------")
    print(train.head())
    print("----------------------------------------------------------------------")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly instead of being wrapped in print().
    test.info()
    print("-----------------------------------")
    print(test.head())

    # The config is built from the original frames, so categorical columns
    # are still declared as 'category' features even though the data passed
    # to Ludwig holds them as object columns.
    model = LudwigModel(create_dict(X, y), logging_level=logging.INFO)
    # The logging level is set on the constructor above; it is not passed to
    # train().
    train_stats = model.train(dataset=train)
    return model.predict(test)
# Fetch OpenML dataset 151 as pandas objects (features frame + target series).
X, y = fetch_openml(data_id=151, as_frame=True, return_X_y=True, cache=True)
y = y.to_frame()

# `df` is the same object as `X`; the target is appended as its last column
# because ludwig() treats the last column as the prediction target.
df = X
df[y.columns[0]] = y

res = ludwig(df)
print(res)
@zuliani99 re: reporting on accuracy.
I suggest you look at Ludwig visualizations. This link shows an example of the `compare_performance` report. This link describes the Ludwig API to generate that report from a Python program. This link is an example Jupyter notebook illustrating how to call the Ludwig visualization APIs.
If the standard Ludwig visualizations are not sufficient for your purposes, it is possible, as you suggest, to read the train/test .json files to extract the metrics. Here is an example of extracting metrics from the .json files and using `seaborn`/`matplotlib` to display the results.
Hi, I'm trying to create a simple prediction on a dataset downloaded from OpenML using their own API. In addition to that, I've created a function to automate the creation of the configuration dictionary. The problem is that when I try to train the model, the error "ValueError: fill value must be in categories" appears, but the train dataframe doesn't contain any NaN values; in fact, train.isna().sum() displays a count of 0 for every column.
This is the entire code:
I'm using Google Colab for this particular task