"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const tesseract_js_1 = require("tesseract.js");
const OcrDriverBase_1 = require("../OcrDriverBase");
const time_1 = require("@joplin/utils/time");
const shim_1 = require("../../../shim");
const Logger_1 = require("@joplin/utils/Logger");
const filterOcrText_1 = require("../utils/filterOcrText");
const Resource_1 = require("../../../models/Resource");
const types_1 = require("../../database/types");
// Module-level logger; entries are tagged with the 'OcrDriverTesseract' prefix.
const logger = Logger_1.default.create('OcrDriverTesseract');
// Counter used to give each newly created worker a distinct numeric id. It is
// module-level, so ids are unique across all driver instances in this module.
let workerId_ = 1;
// Flattens a Tesseract bounding box object into a four-element array ordered
// as [x0, x1, y0, y1] (both X coordinates first, then both Y coordinates).
const formatTesseractBoundingBox = (boundingBox) => {
    const { x0, x1, y0, y1 } = boundingBox;
    return [x0, x1, y0, y1];
};
// Minimum line-level confidence required for a recognised line to be kept.
// Lines whose confidence is below this value are skipped by `recognize()`.
//
// 2023-12-13: Empirically, it seems anything below 70 is not usable. Between 70
// and 75 it's hit and miss, but often it's good enough that we should keep the result.
// Above this is usually reliable. Using 70 for now.
//
// 2025-04-03: Changed to 55 to detect text in images that are supported in
// other tools but were not in Joplin.
//
// https://github.com/laurent22/joplin/issues/11608
const minConfidence = 55;
/**
 * OCR driver that recognises text using Tesseract workers.
 *
 * Workers are pooled per language in `workers_`: a worker is marked `busy`
 * while it is processing a file and is reused by later calls once released.
 */
class OcrDriverTesseract extends OcrDriverBase_1.default {
    /**
     * @param tesseract Object exposing `createWorker(language, oem, options)`.
     * @param options Optional paths forwarded to `createWorker` when set:
     *   `workerPath`, `corePath` and `languageDataPath` (passed as `langPath`).
     */
    constructor(tesseract, { workerPath, corePath, languageDataPath }) {
        super();
        this.tesseract_ = null;
        this.languageDataPath_ = null;
        // Maps a language code to the list of workers created for it.
        this.workers_ = {};
        this.tesseract_ = tesseract;
        this.workerPath_ = workerPath;
        this.corePath_ = corePath;
        this.languageDataPath_ = languageDataPath;
    }
    /** Identifier of this driver (`ResourceOcrDriverId.PrintedText`). */
    get driverId() {
        return types_1.ResourceOcrDriverId.PrintedText;
    }
    /**
     * Deletes every entry of the `keyval` object store (in the `keyval-store`
     * IndexedDB database) whose key ends with `.traineddata`.
     *
     * @throws {Error} If `indexedDB` is not available or a request fails.
     */
    static async clearLanguageDataCache() {
        if (typeof indexedDB === 'undefined') {
            throw new Error('Missing indexedDB access!');
        }
        logger.info('Clearing cached language data...');
        const requestAsPromise = (request) => {
            return new Promise((resolve, reject) => {
                request.addEventListener('success', () => { resolve(request.result); });
                request.addEventListener('error', (event) => {
                    // IndexedDB reports the failure reason on the request
                    // itself. The event-level lookup is kept as a fallback.
                    let cause = request.error;
                    if (!cause && event && 'error' in event) {
                        cause = event.error;
                    }
                    if (cause) {
                        reject(new Error(`Request failed: ${cause}`));
                    }
                    else {
                        reject(new Error('Request failed with unknown error.'));
                    }
                });
            });
        };
        const db = await requestAsPromise(indexedDB.open('keyval-store'));
        try {
            const getStore = (mode) => {
                return db.transaction(['keyval'], mode).objectStore('keyval');
            };
            const allKeys = await requestAsPromise(getStore('readonly').getAllKeys());
            const languageDataExtension = '.traineddata';
            // IndexedDB keys are not necessarily strings, so check the type
            // before calling a string method on them.
            const keysToClear = allKeys.filter(key => typeof key === 'string' && key.endsWith(languageDataExtension));
            for (const key of keysToClear) {
                logger.info('Clearing language data with key', key);
                await requestAsPromise(getStore('readwrite').delete(key));
            }
        }
        finally {
            // Always release the connection, even if a request failed.
            db.close();
        }
    }
    /**
     * Returns a worker for `language`, marked as busy. An idle worker from the
     * pool is reused when available; otherwise a new one is created and added
     * to the pool.
     *
     * @param language Language code passed to `createWorker`.
     * @returns An object of the form `{ id, instance, busy }`.
     */
    async acquireWorker(language) {
        if (!this.workers_[language])
            this.workers_[language] = [];
        const existingWorker = this.workers_[language].find(w => !w.busy);
        if (existingWorker) {
            existingWorker.busy = true;
            return existingWorker;
        }
        const createWorkerOptions = {
            workerBlobURL: false,
        };
        if (this.workerPath_)
            createWorkerOptions.workerPath = this.workerPath_;
        if (this.corePath_)
            createWorkerOptions.corePath = this.corePath_;
        if (this.languageDataPath_)
            createWorkerOptions.langPath = this.languageDataPath_;
        const worker = await this.tesseract_.createWorker(language, tesseract_js_1.OEM.LSTM_ONLY, createWorkerOptions);
        const output = {
            id: workerId_++,
            instance: worker,
            busy: true,
        };
        logger.info(`Created worker: ${output.id}`);
        // The pool entry may have been replaced while the worker was being
        // created, so make sure the list exists before pushing.
        if (!this.workers_[language])
            this.workers_[language] = [];
        this.workers_[language].push(output);
        return output;
    }
    /** Terminates every pooled worker and empties the pool. */
    async dispose() {
        for (const [language, workers] of Object.entries(this.workers_)) {
            for (const w of workers) {
                await w.instance.terminate();
            }
            this.workers_[language] = [];
        }
    }
    /**
     * Removes the worker with the given id from the pool and terminates it.
     * Does nothing if no worker has that id.
     *
     * @param id Worker id, as assigned by `acquireWorker`.
     */
    async terminateWorker(id) {
        for (const workers of Object.values(this.workers_)) {
            const idx = workers.findIndex(w => w.id === id);
            if (idx < 0)
                continue;
            // Remove the worker from the pool first so that it can never be
            // handed out again, even if `terminate()` fails.
            const [worker] = workers.splice(idx, 1);
            await worker.instance.terminate();
            break;
        }
    }
    /** Marks a worker as idle so that `acquireWorker` can reuse it. */
    async releaseWorker(worker) {
        worker.busy = false;
    }
    /**
     * Runs OCR on a file and returns the recognised text and word positions.
     *
     * Lines whose confidence is below `minConfidence` are discarded. If the
     * operation takes longer than ten minutes the worker is terminated and the
     * promise is rejected.
     *
     * @param language Language code used to select or create the worker.
     * @param filePath Path of the file passed to the worker.
     * @returns An object with `ocr_text`, `ocr_details`, `ocr_status` and
     *   `ocr_error`.
     * @throws {Error} If the worker cannot be created, recognition fails, or
     *   the operation times out.
     */
    async recognize(language, filePath) {
        // eslint-disable-next-line no-async-promise-executor -- can't think of any way to handle the timeout without using `new Promise`
        return new Promise(async (resolve, reject) => {
            // Exceptions thrown from an async executor do not reject the
            // promise, so everything is wrapped to make sure it always settles.
            try {
                const worker = await this.acquireWorker(language);
                let hasTimedOut = false;
                const terminateTimeout_ = shim_1.default.setTimeout(async () => {
                    // Set the flag before terminating: terminating the worker
                    // may make the pending recognition fail, and that failure
                    // must be reported as a timeout.
                    hasTimedOut = true;
                    try {
                        await this.terminateWorker(worker.id);
                    }
                    catch (error) {
                        logger.warn(`Could not terminate worker ${worker.id} after timeout:`, error);
                    }
                    reject(new Error(`Recognize operation timed out on: ${filePath}`));
                }, 10 * time_1.Minute);
                let result = null;
                try {
                    result = await worker.instance.recognize(filePath, {}, { text: false, blocks: true });
                }
                catch (e) {
                    // The timeout handler owns the rejection and the worker
                    // clean-up once it has fired.
                    if (hasTimedOut)
                        return;
                    shim_1.default.clearTimeout(terminateTimeout_);
                    // The worker may be in an unknown state after a failure,
                    // so discard it now instead of leaving it marked as busy
                    // until the timeout fires.
                    try {
                        await this.terminateWorker(worker.id);
                    }
                    catch (terminateError) {
                        logger.warn(`Could not terminate worker ${worker.id} after failure:`, terminateError);
                    }
                    // Tesseract may reject with a plain string.
                    let error = e;
                    if (typeof e === 'string' || e === null || e === undefined) {
                        error = new Error(String(e));
                    }
                    error.message = `Recognition failed on: ${filePath}: ${error.message}`;
                    reject(error);
                    return;
                }
                if (hasTimedOut)
                    return;
                shim_1.default.clearTimeout(terminateTimeout_);
                await this.releaseWorker(worker);
                const goodParagraphs = [];
                const goodLines = [];
                for (const block of result.data.blocks) {
                    for (const paragraph of block.paragraphs) {
                        const lines = [];
                        for (const line of paragraph.lines) {
                            // If the line confidence is above the threshold we keep the
                            // whole text. The confidence of individual words will vary and
                            // may be below the threshold, but there's a chance they will
                            // still be correct if the line as a whole is well recognised.
                            if (line.confidence < minConfidence)
                                continue;
                            // Y coordinate of the line's baseline at the given X.
                            const lineBaselineAt = (x, top) => {
                                const dy = line.baseline.y1 - line.baseline.y0;
                                const dx = line.baseline.x1 - line.baseline.x0;
                                // Avoid division by zero
                                if (dx === 0) {
                                    return top ? line.baseline.y0 : line.baseline.y1;
                                }
                                else {
                                    const slope = dy / dx;
                                    return slope * (x - line.baseline.x0) + line.baseline.y0;
                                }
                            };
                            const goodWords = line.words
                                .map(w => {
                                const baselineX1 = w.bbox.x0;
                                const baselineY1 = lineBaselineAt(baselineX1, true);
                                const baselineX2 = w.bbox.x1;
                                const baselineY2 = lineBaselineAt(baselineX2, false);
                                const output = {
                                    t: w.text,
                                    bb: formatTesseractBoundingBox(w.bbox),
                                    bl: [baselineX1, baselineX2, baselineY1, baselineY2],
                                };
                                return output;
                            });
                            lines.push({
                                words: goodWords,
                            });
                        }
                        goodParagraphs.push({
                            text: lines.map(l => l.words.map(w => w.t).join(' ')).join('\n'),
                        });
                        goodLines.push(...lines);
                    }
                }
                resolve({
                    // Note that Tesseract provides a `.text` property too, but it's the
                    // concatenation of all lines, even those with a low confidence
                    // score, so we recreate it here based on the good lines.
                    ocr_text: (0, filterOcrText_1.default)(goodParagraphs.map(p => p.text).join('\n')),
                    ocr_details: Resource_1.default.serializeOcrDetails(goodLines),
                    ocr_status: types_1.ResourceOcrStatus.Done,
                    ocr_error: '',
                });
            }
            catch (error) {
                // Covers failures while acquiring the worker or while
                // post-processing the result.
                reject(error);
            }
        });
    }
}
exports.default = OcrDriverTesseract;
//# sourceMappingURL=OcrDriverTesseract.js.map