tesseract-ocr / tessdoc

Tesseract documentation
https://tesseract-ocr.github.io/tessdoc/
1.77k stars 362 forks source link

Tabular Data - Read Column By Column #33

Open tmgcassidy opened 3 years ago

tmgcassidy commented 3 years ago

Is there any way to leverage Tesseract for table extraction? Below is an example of a table I would need to extract Chinese Characters (Simplified) from.

image

blackholeearth commented 2 years ago

For tesseract.exe, you need to use the hocr output option. It gives the x,y coordinates of the recognized text as HTML.

You have to enable wordLevelSegmentation. (I don't know the exact command-line options.)


For C#, see https://stackoverflow.com/questions/51282214/tesseract-ocr-text-position

In C# code you can specify word-level segmentation.

// How to use
/// <summary>
/// Usage example: OCR an image, assign a column number to every recognized
/// word, then select the words that belong to individual columns.
/// </summary>
void Main()
{
    // Note: this snippet was written on a phone and is a sketch, not tested code.
    // Path of the image to OCR. The original note also suggested passing a
    // Bitmap instead; that only works if GetWordsFromImage accepts one.
    var img = "imgFilePath.jpg";

    var li1 = GetWordsFromImage(img);
    var tessWordLi = AssignColumnNo(li1);

    // Every distinct column number that was assigned.
    var allColumnNos = tessWordLi.Select(x => x.ColumnNo).Distinct().ToList();

    // Get the 1st and 2nd column, etc.
    var wordsThatAreOn_col1 = tessWordLi.Where(x => x.ColumnNo == 1).ToList();
    var wordsThatAreOn_col2 = tessWordLi.Where(x => x.ColumnNo == 2).ToList();
}

// Required class and functions
/// <summary>
/// One recognized word together with its position on the page and the
/// column number assigned to it.
/// </summary>
class tessWord
{
    /// <summary>Recognized text of the word.</summary>
    public string Text;

    /// <summary>Bounding box of the word.</summary>
    public Rectangle rect; // bounds

    /// <summary>
    /// Copy of the bounding box with its Y coordinate set to zero, so that two
    /// such rectangles overlap whenever the words overlap horizontally.
    /// </summary>
    public Rectangle rectYCordZeroed;

    /// <summary>Column number of the word; 0 means "not assigned yet".</summary>
    public int ColumnNo = 0;
}

/// <summary>
/// Runs OCR on an image and returns every recognized word together with its
/// bounding box.
/// </summary>
/// <param name="img">Path of the image file to recognize.</param>
/// <returns>
/// One <c>tessWord</c> per word for which the iterator reported a bounding box.
/// <c>ColumnNo</c> is left at its default; call <c>AssignColumnNo</c> to fill it.
/// </returns>
List<tessWord> GetWordsFromImage(string img)
{
    var tessWord_li = new List<tessWord>();
    var myLevel = Tesseract.PageIteratorLevel.Word;

    // NOTE(review): assumes the charlesw Tesseract .NET wrapper, where the
    // engine processes a Pix and bounding boxes are Tesseract.Rect - confirm
    // against the wrapper version you use.
    using (var pix = Tesseract.Pix.LoadFromFile(img))
    using (var page = Engine.Process(pix))
    using (var iter = page.GetIterator())
    {
        iter.Begin();
        do
        {
            if (iter.TryGetBoundingBox(myLevel, out var bounds))
            {
                var curText = iter.GetText(myLevel);
                var curRect = new Rectangle(bounds.X1, bounds.Y1, bounds.Width, bounds.Height);

                // Add the recognized word to the list.
                var w = new tessWord()
                {
                    Text = curText,
                    rect = curRect,
                    // Same box moved to Y = 0, so that only the horizontal
                    // position decides whether two words overlap.
                    rectYCordZeroed = new Rectangle(curRect.X, 0, curRect.Width, curRect.Height),
                };
                tessWord_li.Add(w);
            }
        } while (iter.Next(myLevel));
    }

    return tessWord_li;
}

/// <summary>
/// Assigns a column number to every word: words whose Y-zeroed rectangles
/// overlap are put into the same column.
/// </summary>
/// <param name="tessWord_li">Words to process; their <c>ColumnNo</c> is modified in place.</param>
/// <returns>The same list instance that was passed in.</returns>
List<tessWord> AssignColumnNo(List<tessWord> tessWord_li)
{
    // Every word is compared with every word (including itself), so a word with
    // a non-empty rectangle always ends up with a column number.
    foreach (var word1 in tessWord_li)
    {
        foreach (var word2 in tessWord_li)
        {
            if (word1.rectYCordZeroed.IntersectsWith(word2.rectYCordZeroed))
            {
                // The two words are in the same column.
                if (word1.ColumnNo == 0)
                {
                    // The word has no column number yet: give it a new one.
                    word1.ColumnNo = tessWord_li.Select(x => x.ColumnNo).Max() + 1;
                }
                word2.ColumnNo = word1.ColumnNo;
            }
        }
    }

    return tessWord_li;
}