1. Create `make_lmdb_data.py` with the following code:
""" clone from deep-text-recognition-benchmark repositpry https://github.com/clovaai/deep-text-recognition-benchmark/blob/master/create_lmdb_dataset.py, a modified version of CRNN torch repository https://github.com/bgshih/crnn/blob/master/tool/create_dataset.py """
import fire import os import lmdb import cv2
import numpy as np
def checkImageIsValid(imageBin): if imageBin is None: return False imageBuf = np.frombuffer(imageBin, dtype=np.uint8) img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE) imgH, imgW = img.shape[0], img.shape[1] if imgH * imgW == 0: return False return True
def writeCache(env, cache): with env.begin(write=True) as txn: for k, v in cache.items(): txn.put(k, v)
def createDataset(inputPath, gtFile, outputPath, checkValid=True): """ Create LMDB dataset for training and evaluation. ARGS: inputPath : input folder path where starts imagePath outputPath : LMDB output path gtFile : list of image path and label checkValid : if true, check the validity of every image """ os.makedirs(outputPath, exist_ok=True) env = lmdb.open(outputPath, map_size=1099511627) cache = {} cnt = 1
with open(gtFile, 'r', encoding='utf-8') as data:
datalist = data.readlines()
nSamples = len(datalist)
for i in range(nSamples):
imagePath, label = datalist[i].strip('\n').split('\t')
imagePath = os.path.join(inputPath, imagePath)
# # only use alphanumeric data
# if re.search('[^a-zA-Z0-9]', label):
# continue
if not os.path.exists(imagePath):
print('%s does not exist' % imagePath)
continue
with open(imagePath, 'rb') as f:
imageBin = f.read()
if checkValid:
try:
if not checkImageIsValid(imageBin):
print('%s is not a valid image' % imagePath)
continue
except:
print('error occured', i)
with open(outputPath + '/error_image_log.txt', 'a') as log:
log.write('%s-th image data occured error\n' % str(i))
continue
imageKey = 'image-%09d'.encode() % cnt
labelKey = 'label-%09d'.encode() % cnt
cache[imageKey] = imageBin
cache[labelKey] = label.encode()
if cnt % 1000 == 0:
writeCache(env, cache)
cache = {}
print('Written %d / %d' % (cnt, nSamples))
cnt += 1
nSamples = cnt-1
cache['num-samples'.encode()] = str(nSamples).encode()
writeCache(env, cache)
print('Created dataset with %d samples' % nSamples)
if name == 'main': fire.Fire(createDataset)
2. Run the script as follows:
```bash
python make_lmdb_data.py --inputPath image_path --gtFile txt_file.txt --outputPath save_path
```
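For reference, `createDataset` splits each line of `gtFile` on a tab character, so the ground-truth file should contain one image path, a tab, then its label per line, with the image path relative to `inputPath`. A hypothetical example (file names are placeholders):

```
images/word_001.png	hello
images/word_002.png	world
images/word_003.png	12345
```

To sanity-check the resulting LMDB, you can read back the sample count and a label using the same key scheme as the script (a minimal sketch; `save_path` is the output folder from the command above):

```python
import lmdb

# open the LMDB folder created by make_lmdb_data.py in read-only mode
env = lmdb.open('save_path', readonly=True, lock=False)
with env.begin() as txn:
    num_samples = int(txn.get('num-samples'.encode()))
    first_label = txn.get('label-%09d'.encode() % 1).decode()
print('samples:', num_samples, 'first label:', first_label)
```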
Thank you. Just one last thing: I can just replace the LMDB file paths in the cfg with my custom train, test, and validation files, right? Thanks again.
Yes.
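For anyone following along, the change amounts to pointing the dataset roots in the config at the LMDB folders created above. A hypothetical fragment (the actual key names depend on the repo's config files, so treat these as placeholders):

```python
# Hypothetical config fragment -- adapt the key names to the actual cfg.
# Each path is simply an output folder produced by make_lmdb_data.py.
train_root = '/data/my_dataset/train_lmdb'
valid_root = '/data/my_dataset/val_lmdb'
test_root = '/data/my_dataset/test_lmdb'
```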
Hey! I've added an extra lmdb file and made the necessary changes in 'concat-dict'. However, the balance sampler gives me an error (related to oversample), and I didn't quite figure out a way around it. I set the oversample (bool) variable to False, and it seems to train fine with the newly added data.
How does setting oversample to False affect the new dataset? I'm not familiar with balance samplers.
Thank you.
Can you show me the error info?
If you set oversample to True, the shorter datasets will be oversampled to keep each batch balanced. If you set it to False, no oversampling is done, which means the batches may not always be balanced.
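To make that concrete, here is a rough, repo-agnostic sketch of the oversampling idea (not this project's actual balance sampler): indices of the shorter datasets are repeated until every dataset can contribute its share of each batch; with `oversample=False`, this padding step is skipped, so once a shorter dataset runs out the remaining batches are no longer balanced.

```python
import random

def oversample_indices(dataset_lengths, batch_ratios, batch_size):
    """Illustrative sketch only, not this repo's real balance sampler."""
    per_batch = [int(r * batch_size) for r in batch_ratios]
    assert all(n > 0 for n in per_batch), 'each ratio must yield at least one sample per batch'
    # the dataset that lasts longest (after scaling by its share) sets the number of batches
    num_batches = max(-(-length // n) for length, n in zip(dataset_lengths, per_batch))
    index_lists = []
    for length, n in zip(dataset_lengths, per_batch):
        idx = list(range(length))
        # oversample: append extra randomly chosen indices until this dataset
        # can fill its slice of every batch
        while len(idx) < num_batches * n:
            idx += random.sample(range(length), min(length, num_batches * n - len(idx)))
        index_lists.append(idx[:num_batches * n])
    return index_lists
```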
```
2021-04-14 05:34:37,562 - INFO - Use GPU 0
2021-04-14 05:34:37,562 - INFO - Set cudnn deterministic False
2021-04-14 05:34:37,562 - INFO - Set cudnn benchmark True
2021-04-14 05:34:37,562 - INFO - Set seed 1111
2021-04-14 05:34:37,564 - INFO - Build model
2021-04-14 05:34:38,059 - INFO - GResNet init weights
2021-04-14 05:34:38,509 - INFO - AttHead init weights
2021-04-14 05:34:51,723 - INFO - current dataset length is 3096 in /content/gdrive/MyDrive/mlt_hin_drive/training/train_hin/Real/realhin_train
2021-04-14 05:34:53,947 - INFO - current dataset length is 8000 in /content/gdrive/MyDrive/mlt_hin_drive/training/train_hin/Syn/hindi_train
2021-04-14 05:34:55,349 - INFO - current dataset length is 3656 in /content/gdrive/MyDrive/mlt_hin_drive/training/train_hin/MLT
2021-04-14 05:34:55,349 - INFO - The truly used batch ratios are [0.5 0.5]
Traceback (most recent call last):
  File "tools/train.py", line 42, in
```
Here is the error info. Thank you!
It looks like you are using three datasets for training, so you should change batch_ratios in the config accordingly, for example batch_ratios = [0.3, 0.3, 0.4].
I will add an assert to make the error information clearer.
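For example, with the three training LMDBs from the log above, the config change and the kind of assert mentioned here could look like the following sketch (variable names are illustrative, not the repo's actual config keys):

```python
# Illustrative only: three training LMDB roots need three batch ratios.
train_roots = ['Real/realhin_train', 'Syn/hindi_train', 'MLT']
batch_ratios = [0.3, 0.3, 0.4]  # example values from the reply above

# fail early with a clear message instead of a confusing error later
assert len(batch_ratios) == len(train_roots), (
    'got %d batch ratios for %d datasets'
    % (len(batch_ratios), len(train_roots)))
```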
Thank you very much again :)
You're welcome.
Thank you for the awesome work. Can you please tell me how to add custom lmdb folders for training and validation? Thanks