naptha / tesseract.js

Pure Javascript OCR for more than 100 Languages 📖🎉🖥
http://tesseract.projectnaptha.com/
Apache License 2.0

Parallel processing is not working correctly. #884

Closed Kishlay-notabot closed 8 months ago

Kishlay-notabot commented 8 months ago

Tesseract.js version (version number for npm/GitHub release, or specific commit for repo)
v5.0.4

Describe the bug I am running Node.js v20 and trying parallel processing on multiple images at once, but the code assigns jobs to the workers one by one: only a single worker gets used even if I create 7 of them.

To Reproduce Steps to reproduce the behavior:
This is the code:

const { createWorker, createScheduler } = require('tesseract.js');
const path = require('path');
const fs = require('fs').promises;

async function processImages() {
  const folderPath = path.resolve(__dirname, './testing');

  try {
    console.log('Reading files from the specified folder:', folderPath);
    const files = await fs.readdir(folderPath);

    if (files.length === 0) {
      throw new Error('No image files found in the specified folder.');
    }

    console.log('Number of image files found:', files.length);

    const imageArr = files.map(file => path.join(folderPath, file));

    const scheduler = createScheduler();

    const workerGen = async () => {
      console.log('Creating a worker.');
      const worker = await createWorker("eng", 1, { logger: m => console.log(m), cachePath: "." });
      scheduler.addWorker(worker);
    }

    const workerN = 7;

    console.log(`Creating ${workerN} workers.`);
    const resArr = Array(workerN);
    for (let i = 0; i < workerN; i++) {
      resArr[i] = workerGen();
    }
    await Promise.all(resArr);

    console.log('Processing images and performing OCR:');

    const results = [];

    for (let i = 0; i < imageArr.length; i++) {
      const imagePath = imageArr[i];
      console.log(`Processing image ${i + 1}/${imageArr.length}: ${imagePath}`);
      const out = await scheduler.addJob('recognize', imagePath);
      const result = {
        imageName: path.basename(imagePath),
        words: out.data.words.map(word => ({
          text: word.text,
          confidence: word.confidence.toFixed(2),
          bbox: word.bbox,
        })),
      };
      results.push(result);

      console.log(`Processing of image ${i + 1}/${imageArr.length} complete.`);
    }

    await scheduler.terminate(); //terminate workers

    console.log('OCR processing completed.');

    // Save the results as a JSON file
    const jsonFilePath = path.resolve(__dirname, 'ocr_results.json');
    console.log('Exporting OCR results to JSON file:', jsonFilePath);
    await fs.writeFile(jsonFilePath, JSON.stringify(results, null, 2));

    console.log('OCR results saved to:', jsonFilePath);
  } catch (error) {
    console.error('Error:', error.message);
  }
}

processImages();  

Maybe I have messed up the worker assignment logic.
The code above runs recognition, stores the words with their confidence values and bbox data in an array, and finally dumps everything into a JSON file for further processing and manipulation.

Expected behavior Parallel processing runs successfully across all workers on the images in the folder.

Device Version:

Balearica commented 8 months ago

This code is written in a way that runs recognition jobs one at a time, despite using a scheduler. The function scheduler.addJob returns a promise that resolves when the job completes. The await keyword waits for a promise to resolve. Therefore, await scheduler.addJob('recognize', imagePath) is creating a new recognition job, and waiting until it completes before moving to the next line. The code should be adapted such that the next job is run without waiting for the previous job to finish.
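For reference, a minimal sketch of that approach (assuming the same tesseract.js v5 API and output shape used in the snippet above: createWorker, createScheduler, scheduler.addJob, and out.data.words) might look like this:

const { createWorker, createScheduler } = require('tesseract.js');
const path = require('path');

async function recognizeAll(imageArr, workerN = 7) {
  const scheduler = createScheduler();

  // Spin up the workers in parallel and register them with the scheduler.
  await Promise.all(
    Array.from({ length: workerN }, async () => {
      const worker = await createWorker('eng');
      scheduler.addWorker(worker);
    })
  );

  // Queue every job first (no await inside the loop), then wait for all of
  // them at once. The scheduler distributes the queued jobs across workers.
  const jobs = imageArr.map(imagePath =>
    scheduler.addJob('recognize', imagePath).then(out => ({
      imageName: path.basename(imagePath),
      words: out.data.words.map(word => ({
        text: word.text,
        confidence: word.confidence.toFixed(2),
        bbox: word.bbox,
      })),
    }))
  );
  const results = await Promise.all(jobs);

  await scheduler.terminate();
  return results;
}

Because the jobs are queued before any of them is awaited, the scheduler can hand them out to idle workers and all seven stay busy instead of one.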

Kishlay-notabot commented 8 months ago

Thank you for pointing out the error; I'll try resolving this and close the issue soon. Also, there's a pending pull request from me, please check that out @Balearica