makcedward / nlpaug

Data augmentation for NLP
https://makcedward.github.io/
MIT License
4.43k stars 462 forks source link

aw.RandomWordAug(action='crop', aug_p=0.5, aug_min=0) failing with ValueError #183

Closed amitkml closed 3 years ago

amitkml commented 3 years ago

Hi,

NLP augmentation with the `crop` action fails with the error "Sample larger than population or is negative".


ValueError Traceback (most recent call last)

<ipython-input> in <module>()
      7 print(augmented_text)
      8 train_st_data_crop_aug = train_st_data
----> 9 train_st_data_crop_aug['sentence_aug'] = train_st_data_crop_aug.apply(lambda x: aug.augment(x['sentence']),axis=1) ## Delete a set of contunous word will be removed randomly¶

14 frames

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
   7550             kwds=kwds,
   7551         )
-> 7552         return op.get_result()
   7553
   7554     def applymap(self, func) -> "DataFrame":

/usr/local/lib/python3.6/dist-packages/pandas/core/apply.py in get_result(self)
    178             return self.apply_raw()
    179
--> 180         return self.apply_standard()
    181
    182     def apply_empty_result(self):

/usr/local/lib/python3.6/dist-packages/pandas/core/apply.py in apply_standard(self)
    269
    270     def apply_standard(self):
--> 271         results, res_index = self.apply_series_generator()
    272
    273         # wrap results

/usr/local/lib/python3.6/dist-packages/pandas/core/apply.py in apply_series_generator(self)
    298                 for i, v in enumerate(series_gen):
    299                     # ignore SettingWithCopy here in case the user mutates
--> 300                     results[i] = self.f(v)
    301                     if isinstance(results[i], ABCSeries):
    302                         # If we have a view on v, we need to make a copy because

<ipython-input> in <lambda>(x)
      7 print(augmented_text)
      8 train_st_data_crop_aug = train_st_data
----> 9 train_st_data_crop_aug['sentence_aug'] = train_st_data_crop_aug.apply(lambda x: aug.augment(x['sentence']),axis=1) ## Delete a set of contunous word will be removed randomly¶

/usr/local/lib/python3.6/dist-packages/nlpaug/base_augmenter.py in augment(self, data, n, num_thread)
    113         # Single input with/without multiple input
    114         else:
--> 115             augmented_results = self._parallel_augment(action_fx, clean_data, n=n, num_thread=num_thread)
    116
    117         if len(augmented_results) >= expected_output_num:

/usr/local/lib/python3.6/dist-packages/nlpaug/base_augmenter.py in _parallel_augment(cls, action_fx, data, n, num_thread)
    174     def _parallel_augment(cls, action_fx, data, n, num_thread=2):
    175         pool = ThreadPool(num_thread)
--> 176         results = pool.map(action_fx, [data] * n)
    177         pool.close()
    178         pool.join()

/usr/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
    264         in a list that is returned.
    265         '''
--> 266         return self._map_async(func, iterable, mapstar, chunksize).get()
    267
    268     def starmap(self, func, iterable, chunksize=None):

/usr/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
    642             return self._value
    643         else:
--> 644             raise self._value
    645
    646     def _set(self, i, obj):

/usr/lib/python3.6/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
    117         job, i, func, args, kwds = task
    118         try:
--> 119             result = (True, func(*args, **kwds))
    120         except Exception as e:
    121             if wrap_exception and func is not _helper_reraises_exception:

/usr/lib/python3.6/multiprocessing/pool.py in mapstar(args)
     42
     43 def mapstar(args):
---> 44     return list(map(*args))
     45
     46 def starmapstar(args):

/usr/local/lib/python3.6/dist-packages/nlpaug/augmenter/word/random.py in crop(self, data)
    185         doc = Doc(data, self.tokenizer(data))
    186
--> 187         aug_idxes = self._get_aug_range_idxes(doc.get_original_tokens())
    188         aug_idxes.sort(reverse=True)
    189

/usr/local/lib/python3.6/dist-packages/nlpaug/augmenter/word/word_augmenter.py in _get_aug_range_idxes(self, tokens)
    105             word_idxes = [i for i, _ in enumerate(tokens[aug_cnt-1:])]
    106
--> 107         start_aug_idx = self.sample(word_idxes, 1)[0]
    108         aug_idxes = [start_aug_idx + _*direction for _ in range(aug_cnt)]
    109

/usr/local/lib/python3.6/dist-packages/nlpaug/base_augmenter.py in sample(cls, x, num)
    222     def sample(cls, x, num=None):
    223         if isinstance(x, list):
--> 224             return random.sample(x, num)
    225         elif isinstance(x, int):
    226             return np.random.randint(1, x-1)

/usr/lib/python3.6/random.py in sample(self, population, k)
    318         n = len(population)
    319         if not 0 <= k <= n:
--> 320             raise ValueError("Sample larger than population or is negative")
    321         result = [None] * k
    322         setsize = 21        # size of a small set minus size of an empty list

ValueError: Sample larger than population or is negative
amitkml commented 3 years ago

duplicate