Open thu-wangz17 opened 4 years ago
Hi!
`only_new=True` will train the model only on the data which is passed to it in that function call.
Traditionally, when active learning was used in combination with more classical methods like random forest, SVM and others (essentially anything which is not a neural network), the entire model was retrained on the old data + new data after the query. Since neural networks train incrementally because of the batch gradient descent, the traditional method is not particularly well suited to them. (It would also be very costly.)
So, when you set `only_new=True`, you basically perform a training step on a single batch.
Thank you for the reply. Actually, I set `train_split=CVSplit(0.2)` in `NeuralNet`, but with active learning the model is only taught with 1 data point, so the validation set will be empty if `only_new=True`. However, when I set `only_new=False`, the code runs.
import torch
from torch import nn
from modAL.models import ActiveLearner
from skorch import NeuralNet
from skorch.dataset import CVSplit
from modAL.uncertainty import uncertainty_sampling
class net(nn.Module):
    """Small fully connected regression network.

    Maps 30 input features to a single output through one hidden layer of
    256 units with a ReLU activation.
    """

    def __init__(self) -> None:
        super().__init__()
        # Keep the attribute name `model`: it determines the state_dict keys
        # ("model.0.weight", ...).
        self.model = nn.Sequential(
            nn.Linear(30, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the network output for a batch `x` whose last dim is 30."""
        return self.model(x)
# Run on the GPU when one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# skorch wrapper around `net`, trained with MSE loss and Adam.
# NOTE: train_split=CVSplit(0.2) splits whatever data is passed to each
# fit call, so it cannot be applied to a single sample.
model = NeuralNet(
    module=net,
    criterion=nn.MSELoss,
    optimizer=torch.optim.Adam,
    lr=3e-4,
    max_epochs=10,
    batch_size=32,
    train_split=CVSplit(0.2),
    verbose=1,
    device=device,
)

# Active learner seeded with 50 random labelled samples.
learner = ActiveLearner(
    estimator=model,
    query_strategy=uncertainty_sampling,
    X_training=torch.rand(50, 30),
    y_training=torch.rand(50, 1),
)

# Unlabelled pool the learner queries from.
pool = torch.rand(20000, 30)
n_queries = 5

for i in range(n_queries):
    print(f'Query no. {i + 1}')
    query_idx, _ = learner.query(pool, n_instances=1)
    learner.teach(
        X=pool[query_idx],
        y=torch.rand(1, 1),
        # Uncommenting the next line fits on the single queried sample only;
        # combined with train_split=CVSplit(0.2) this raises
        # "ValueError: With n_samples=1, test_size=0.2 ...".
        # only_new=True
    )
    # Drop the queried row from the pool. A boolean mask keeps `pool` a
    # torch tensor and avoids numpy, which this script never imports.
    keep = torch.ones(pool.shape[0], dtype=torch.bool)
    keep[torch.as_tensor(query_idx)] = False
    pool = pool[keep]
If `only_new=True`, the following error is raised:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-4-67dbdfd957b1> in <module>
50 X=pool[query_idx],
51 y=torch.rand(1, 1),
---> 52 only_new=True
53 )
54 pool = np.delete(pool, query_idx, axis=0)
c:\users\anaconda3\lib\site-packages\modAL\models\learners.py in teach(self, X, y, bootstrap, only_new, **fit_kwargs)
97 self._fit_to_known(bootstrap=bootstrap, **fit_kwargs)
98 else:
---> 99 self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs)
100
101
c:\users\anaconda3\lib\site-packages\modAL\models\base.py in _fit_on_new(self, X, y, bootstrap, **fit_kwargs)
129
130 if not bootstrap:
--> 131 self.estimator.fit(X, y, **fit_kwargs)
132 else:
133 bootstrap_idx = np.random.choice(range(X.shape[0]), X.shape[0], replace=True)
c:\users\anaconda3\lib\site-packages\skorch\net.py in fit(self, X, y, **fit_params)
852 self.initialize()
853
--> 854 self.partial_fit(X, y, **fit_params)
855 return self
856
c:\users\anaconda3\lib\site-packages\skorch\net.py in partial_fit(self, X, y, classes, **fit_params)
811 self.notify('on_train_begin', X=X, y=y)
812 try:
--> 813 self.fit_loop(X, y, **fit_params)
814 except KeyboardInterrupt:
815 pass
c:\users\anaconda3\lib\site-packages\skorch\net.py in fit_loop(self, X, y, epochs, **fit_params)
715
716 dataset_train, dataset_valid = self.get_split_datasets(
--> 717 X, y, **fit_params)
718 on_epoch_kwargs = {
719 'dataset_train': dataset_train,
c:\users\anaconda3\lib\site-packages\skorch\net.py in get_split_datasets(self, X, y, **fit_params)
1199 if self.train_split:
1200 dataset_train, dataset_valid = self.train_split(
-> 1201 dataset, y, **fit_params)
1202 else:
1203 dataset_train, dataset_valid = dataset, None
c:\users\anaconda3\lib\site-packages\skorch\dataset.py in __call__(self, dataset, y, groups)
323 args = args + (to_numpy(y),)
324
--> 325 idx_train, idx_valid = next(iter(cv.split(*args, groups=groups)))
326 dataset_train = torch.utils.data.Subset(dataset, idx_train)
327 dataset_valid = torch.utils.data.Subset(dataset, idx_valid)
c:\users\anaconda3\lib\site-packages\sklearn\model_selection\_split.py in split(self, X, y, groups)
1327 """
1328 X, y, groups = indexable(X, y, groups)
-> 1329 for train, test in self._iter_indices(X, y, groups):
1330 yield train, test
1331
c:\users\anaconda3\lib\site-packages\sklearn\model_selection\_split.py in _iter_indices(self, X, y, groups)
1435 n_train, n_test = _validate_shuffle_split(
1436 n_samples, self.test_size, self.train_size,
-> 1437 default_test_size=self._default_test_size)
1438
1439 rng = check_random_state(self.random_state)
c:\users\anaconda3\lib\site-packages\sklearn\model_selection\_split.py in _validate_shuffle_split(n_samples, test_size, train_size, default_test_size)
1803 'resulting train set will be empty. Adjust any of the '
1804 'aforementioned parameters.'.format(n_samples, test_size,
-> 1805 train_size)
1806 )
1807
ValueError: With n_samples=1, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
Thus I'm confused about the mechanism with `only_new=False`.
Hi, I have a question about the keyword `only_new` in the `teach` method. The documentation says that, to make sure that you train only on newly queried labels, you should pass `only_new=True` to the `.teach()` method of the learner. So if I set `only_new=False` with PyTorch models in modAL workflows, will the model be retrained from scratch on all the labelled data, or will it continue learning with all the labelled data? If it is the latter, it seems that the model will overfit on the repeated data points, which confuses me. Thank you.