Open tmgcassidy opened 3 years ago
For tesseract.exe, you need to use the hOCR output option. It gives the x,y coordinates of the recognized text in HTML.
You have to enable word-level segmentation. (I don't know the exact command-line options.)
For C#, see https://stackoverflow.com/questions/51282214/tesseract-ocr-text-position
You can specify word-level segmentation in C# code.
// How to use
// Example usage: recognise the words in an image, group them into columns,
// then pick out the words belonging to individual columns.
void Main()
{
    // Written on an Android phone, so treat this as an untested sketch.
    // img is a file path ...
    var img = "imgFilePath.jpg";
    // ... or, if GetWordsFromImage is written to accept a bitmap instead:
    // var img = new Bitmap("imgFilePath.jpg");
    var li1 = GetWordsFromImage(img);
    var TessWord_li = AssignColumnNo(li1);
    var AllColumnNos = TessWord_li.Select(x => x.ColumnNo).Distinct().ToList();
    // Get the words of the 1st and 2nd column, etc.
    var wordsThatAreOn_col1 = TessWord_li.Where(x => x.ColumnNo == 1).ToList();
    var wordsThatAreOn_col2 = TessWord_li.Where(x => x.ColumnNo == 2).ToList();
}
// Required class and functions
/// <summary>
/// One recognised word together with its position on the page and the
/// column number assigned to it.
/// </summary>
class tessWord
{
    /// <summary>The recognised text of the word.</summary>
    public string Text;

    /// <summary>Bounding box of the word.</summary>
    public Rectangle rect; // bounds

    /// <summary>
    /// Copy of the bounding box with its Y coordinate set to zero, so that
    /// two words can be compared by horizontal position only.
    /// </summary>
    public Rectangle rectYCordZeroed;

    /// <summary>Column number of the word; 0 means "not assigned yet".</summary>
    public int ColumnNo = 0;
}
/// <summary>
/// Runs OCR on an image and returns one <c>tessWord</c> for every word whose
/// bounding box the iterator can report.
/// </summary>
/// <param name="img">Path of the image file to recognise.</param>
/// <returns>
/// The recognised words in iterator order, each with its text, its bounding
/// box and a copy of that box moved to Y = 0. <c>ColumnNo</c> is left at 0.
/// </returns>
List<tessWord> GetWordsFromImage(string img)
{
    var tessWord_li = new List<tessWord>();
    var myLevel = Tesseract.PageIteratorLevel.Word;

    using (var pix = Tesseract.Pix.LoadFromFile(img))
    using (var page = Engine.Process(pix))
    using (var iter = page.GetIterator())
    {
        iter.Begin();
        do
        {
            // Skip elements for which no bounding box is available.
            if (iter.TryGetBoundingBox(myLevel, out var curRect))
            {
                var curText = iter.GetText(myLevel);

                // Convert the OCR engine's box into a System.Drawing rectangle.
                var bounds = new Rectangle(curRect.X1, curRect.Y1, curRect.Width, curRect.Height);

                // Add the recognised word to the list.
                tessWord_li.Add(new tessWord
                {
                    Text = curText,
                    rect = bounds,
                    // Same box with its Y coordinate set to zero, so that only
                    // the horizontal extent matters when boxes are intersected.
                    rectYCordZeroed = new Rectangle(bounds.X, 0, bounds.Width, bounds.Height),
                });
            }
        } while (iter.Next(myLevel));
    }

    return tessWord_li;
}
/// <summary>
/// Groups words into columns: two words whose Y-zeroed rectangles intersect
/// are given the same <c>ColumnNo</c>.
/// </summary>
/// <param name="tessWord_li">Words to label; their <c>ColumnNo</c> is modified in place.</param>
/// <returns>The same list instance that was passed in.</returns>
/// <remarks>
/// A word's column number is overwritten each time a later word overlaps it,
/// so the numbers in use at the end may not be contiguous (for example,
/// column 1 can disappear when its words are relabelled).
/// </remarks>
List<tessWord> AssignColumnNo(List<tessWord> tessWord_li)
{
    foreach (var word1 in tessWord_li)
    {
        foreach (var word2 in tessWord_li)
        {
            if (word1.rectYCordZeroed.IntersectsWith(word2.rectYCordZeroed))
            {
                // The two words are in the same column.
                if (word1.ColumnNo == 0)
                {
                    // word1 has no column number yet: give it a new one.
                    word1.ColumnNo = tessWord_li.Select(x => x.ColumnNo).Max() + 1;
                }

                word2.ColumnNo = word1.ColumnNo;
            }
        }
    }

    return tessWord_li;
}
Is there any way to leverage Tesseract for table extraction? Below is an example of a table I would need to extract Chinese Characters (Simplified) from.