MichalBusta / DeepTextSpotter

285 stars 101 forks source link

pre-training ctc_loss #71

Open ghost opened 6 years ago

ghost commented 6 years ago

Hello, I've been trying to build my own language model.

Before doing this, I want to train my own model and check that it works fine.

A lot of issues posted on this repository say that pre-training is important.

So, I got two pre-training Python scripts (thanks to @Linchenguang; I modified them slightly for my paths and data structure):

  1. text detection part 2. text recognition part.

The first part works fine. But the second part is not working for me.

The training set that I used is : Synthetic Word Dataset (http://www.robots.ox.ac.uk/~vgg/data/text/)

ex) 2_refs_63925

The strange thing is the value of sf (softmax):

at first looks like:

[[[[ 6.74129114e-04 -1.13475189e-05 -1.13475189e-05 ..., -1.13475189e-05 -1.13475189e-05 -1.13475189e-05]]

after few steps:

[[[[ 6.54371595 -0.12445793 -0.12445793 ..., -0.12445793 -0.12445793 -0.12445793]]

Can you give me any advice?

Here is my python script.

# -*- coding: utf-8 -*-
# Pre-training script for the text recognition (CTC) network.
import numpy as np
import sys, os
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so the
# implicit str <-> unicode codec can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
import caffe

# NOTE(review): these sys.path changes run after `import caffe` above, so they
# only influence the imports that follow (models, data, utils, ...).
sys.path.insert(0, '/usr/local/python/')
baseDir = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{0}/build'.format(baseDir))

caffe.set_device(0)  # if we have multiple GPUs, pick the first one
caffe.set_mode_gpu()

import cv2

import math

import re

import random 
# Project-local helper that builds the solver used at the bottom of this script.
from models import create_recognizer_solver

from data import DataLoader

#import vis
import matplotlib.pyplot as plt

import argparse

# import generate_codec_rev as gen

#from validation import validate    
# Not referenced anywhere else in the visible part of this script.
image_no = 0

import utils
from utils import intersect, union, area, get_normalized_image, get_obox

# Candidate target widths in pixels: process_batch resizes every image to a
# height of 32 and to the width picked from this list.
buckets = [54, 80, 124, 182, 272, 410, 614, 922, 1383, 2212]  
# image_sizes / image_size are not referenced in the visible part of this script.
image_sizes = [[352, 352], [416, 416] ] #,[480, 480], [544, 544], [576, 576]]    
image_size = [160, 160]
# Number of solver steps taken so far (incremented in process_batch).
it = 0
# mean_loss / mean_rec are not referenced in the visible part of this script.
mean_loss = 0
mean_rec = 0
# bak is 256 (presumably the previous value of data_batch)
# Maximum number of images read per process_batch call.
data_batch = 384
# Number of entries in f_list; assigned once the training list has been read.
data_total = 0
# Read cursor into f_list, advanced by process_batch.
data_index = 0
# Lines of the training list file; filled in at the bottom of the script.
f_list = []

# Not referenced in the visible part of this script.
valid_interval = 100
# process_batch saves a solver snapshot every snapshot_interval steps.
snapshot_interval = 4000

# Characters that can be encoded as labels; a character's label id is its
# position in this string plus 4.
codec = u' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|}~£ÁČĎÉĚÍŇÓŘŠŤÚŮÝŽáčďéěíňóřšťúůýž'
# Reverse lookup: Unicode code point -> label id (ids start at 4).
codec_rev = dict((ord(symbol), label) for label, symbol in enumerate(codec, 4))
# First label id after the last codec character.
index = 4 + len(codec)

def process_batch( net , optim , args ):
  """Read one batch of word images and run one solver step per width bucket.

  Images are taken from the module-level ``f_list`` starting at ``data_index``
  (wrapping around at ``data_total``), converted to gray scale, resized to a
  height of 32 and one of the ``buckets`` widths, normalized and grouped by
  bucket.  The ground-truth text is the second ``_``-separated field of the
  file name.  Every group is copied into the ``data`` / ``label`` blobs and
  followed by ``net.step(1)``; a snapshot is saved every
  ``snapshot_interval`` steps.

  Args:
    net: solver-like object; ``net.net`` must expose the ``data``, ``label``
      and ``transpose`` blobs, and ``net.step`` / ``net.snapshot`` are called.
    optim: unused; kept so existing callers keep working.
    args: parsed options; ``args.data_dir`` and ``args.debug`` are read.

  Raises:
    ValueError: if the training list is empty (``data_total == 0``).
  """
  global it, data_index

  if data_total == 0:
    raise ValueError('training list is empty, nothing to train on')

  net_ctc = net.net
  bucket_images = {}

  #1 Read images and gts ( circular buffer form for small samples)
  batch_start = data_index
  batch_size = min(data_batch, data_total)
  for img_ind in range(batch_size):
    # Index relative to the batch start only.  Advancing the global cursor
    # inside this loop as well would move the read position by two per image
    # and skip every other sample.
    circ_ind = (batch_start + img_ind) % data_total

    img_path = os.path.join( args.data_dir, f_list[circ_ind].replace('../','') )
    item = img_path.strip()
    img = cv2.imread(item)

    if img is None:
      # Unreadable or missing file: report it and move on to the next sample.
      print("im read error")
      print("img_path:%s" %img_path)
      continue
    if img.ndim == 3 and img.shape[2] == 3:
      # NOTE(review): cv2.imread returns BGR by default, so COLOR_RGB2GRAY
      # swaps the red/blue weights.  Left unchanged so inputs stay consistent
      # with existing snapshots -- confirm whether COLOR_BGR2GRAY is intended.
      img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    H, W = img.shape[0], img.shape[1]

    text = item.split("/")[-1].split("_")[1]
    if isinstance(text, bytes):
      text = text.decode('utf8')

    #2 Adjust image sizes to fixed height and variable width
    width = W * (32.0 / H)

    # Pick the bucket closest to 1.3x the scaled width; bucket 0 is the
    # fallback when no bucket is closer than `width` itself.
    best_diff = width
    bestb = 0
    for b, bucket_width in enumerate(buckets):
      diff = abs(width * 1.3 - bucket_width)
      if diff < best_diff:
        best_diff = diff
        bestb = b

    scaled = cv2.resize(img, (buckets[bestb], 32))
    scaled = np.asarray(scaled, dtype=np.float64)
    delta = scaled.max() - scaled.min()
    if delta > 0:
      # A constant image has delta == 0; dividing would fill it with NaN/inf.
      scaled = scaled / (delta / 2.0)
    scaled -= scaled.mean()

    # Encode the text: codec characters map through codec_rev, characters
    # missing from the codec become label 3, code point 0 becomes label 0.
    gt_labels = []
    for ch in text:
      t_unicode = ord(ch)
      if t_unicode > 0:
        gt_labels.append(codec_rev.get(t_unicode, 3))
      else:
        gt_labels.append(0)

    if scaled.ndim==3:
      print( scaled.shape )
      scaled = cv2.cvtColor(scaled, cv2.COLOR_RGB2GRAY)
    if args.debug:
      cv2.imshow('scaled', scaled)

    group = bucket_images.setdefault(bestb, {'img': [], 'txt': [], 'gt_enc': []})
    group['gt_enc'].append(gt_labels)
    group['txt'].append(text)
    group['img'].append(scaled)
  data_index = (batch_start + batch_size) % data_total

  #3 Transfer the data into the net
  for bucket in bucket_images:
    imtf = np.asarray(bucket_images[bucket]['img'], dtype=np.float64)
    # (N, 32, width) -> (N, 1, 32, width): insert the channel axis.
    imtf = np.reshape(imtf, (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))
    net_ctc.blobs['data'].reshape(imtf.shape[0],imtf.shape[1],imtf.shape[2], imtf.shape[3])
    net_ctc.blobs['data'].data[...] = imtf

    labels = bucket_images[bucket]['gt_enc']
    txt = bucket_images[bucket]['txt']
    # identical length needed: pad every label sequence with 0 up to the longest one
    max_len = max(len(seq) for seq in labels)
    for seq in labels:
      seq.extend([0] * (max_len - len(seq)))

    labels = np.asarray(labels, np.float64)

    net_ctc.blobs['label'].reshape(labels.shape[0], labels.shape[1])
    net_ctc.blobs['label'].data[...] = labels

    #4 Compute forward-backward
    net.step(1)
    it +=1

    #5 Print the raw 'transpose' blob and the decoded first sample next to its ground truth
    sf = net_ctc.blobs['transpose'].data[...]
    print("==================================")
    print(sf)
    labels2 = sf.argmax(3)
    out = utils.print_seq(labels2[:,0, :])
    print(u'{0} <--> {1}'.format(out, txt[0])  )

    if it%snapshot_interval == 0:
      net.snapshot()
      print ( 'it is %d, and snapshot_interval is %d' %(it,snapshot_interval) )
      print( 'snapshot saved')

def train_dir(net, optim, args):
  """Switch Caffe to GPU mode and call process_batch in an endless loop.

  The loop has no exit condition, so this function does not return normally.
  All three arguments are forwarded unchanged to process_batch.
  """
  caffe.set_mode_gpu()
  while 1:
    process_batch(net, optim, args)

# Command-line options: dataset root directory, training list file and a debug switch.
parser = argparse.ArgumentParser()
parser.add_argument('-data_dir', default='/home/hclee/data/SynthText')
parser.add_argument('-train_list', default='/home/hclee/data/SynthText/2908_imlist.txt')
parser.add_argument('-debug', type=int, default=0)

args = parser.parse_args()

# One image path per line.  Blank lines are dropped so they are not treated as
# samples (they would resolve to the data directory itself and fail to load).
with open(args.train_list,'r') as f:
  f_list = [line for line in f if line.strip()]

data_total = len( f_list )
if data_total == 0:
  # process_batch indexes f_list modulo data_total, so an empty list cannot be
  # trained on; stop here with a clear message instead of a ZeroDivisionError.
  parser.error('no image paths found in %s' % args.train_list)

net_ctc_sgd = create_recognizer_solver(args)
# when I use the pre-trained model it works, but when I try to make my own model something goes wrong..
# net_ctc_sgd.net.copy_from('models/model.caffemodel')
net = net_ctc_sgd
train_dir( net ,net_ctc_sgd,args)