import { createWorker } from 'tesseract.js';

//@ts-ignore
import * as pdfjsLib from 'pdfjs-dist/build/pdf.mjs';

/**
 * Extracts text from a PDF supplied as a browser `File`.
 *
 * Two strategies are offered:
 *  - {@link PDFExtractor.extractText} reads the text layer embedded in the PDF.
 *  - {@link PDFExtractor.extractTextWithOCR} renders every page to a canvas and
 *    runs Tesseract OCR on the rendered image.
 *
 * Both methods re-read the file and open a fresh pdf.js document on every call,
 * and release that document (and, for OCR, the Tesseract worker) before
 * returning, whether the extraction succeeded or threw.
 */
class PDFExtractor {
  private readonly file: File;

  /**
   * @param file The PDF file whose contents will be read on each extraction.
   */
  constructor(file: File) {
    this.file = file;
  }

  /**
   * Reads the embedded text content of every page.
   *
   * @returns The text of all pages; fragments within a page are joined with a
   * single space, pages are joined with `\n`, and the result is trimmed.
   * @throws Whatever pdf.js rejects with when the document or a page cannot be
   * loaded or read.
   */
  async extractText(): Promise<string> {
    const loadingTask = await this.openDocument();

    try {
      const pdf = await loadingTask.promise;
      const extractedText: string[] = [];

      // pdf.js pages are 1-indexed.
      for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
        const page = await pdf.getPage(pageNum);
        const textContent = await page.getTextContent();

        const pageText = textContent.items
          .map((item: unknown) => PDFExtractor.textOf(item))
          .join(' ');
        extractedText.push(pageText);
      }

      return extractedText.join('\n').trim();
    } finally {
      // Release the document even when a page failed to load or parse.
      await loadingTask.destroy();
    }
  }

  /**
   * Renders every page to an off-DOM canvas and recognises its text with a
   * Tesseract worker created for `'eng'`.
   *
   * @returns The recognised text of all pages joined with `\n`, trimmed.
   * @throws Error when a 2D canvas context cannot be obtained, or whatever
   * pdf.js / Tesseract rejects with during loading, rendering or recognition.
   */
  async extractTextWithOCR(): Promise<string> {
    const loadingTask = await this.openDocument();

    try {
      const pdf = await loadingTask.promise;
      const worker = await createWorker('eng');

      try {
        const extractedText: string[] = [];

        // pdf.js pages are 1-indexed.
        for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
          const page = await pdf.getPage(pageNum);
          // Rendered at 2x scale; presumably to give the OCR engine more
          // pixels per glyph (the original code does not state the reason).
          const viewport = page.getViewport({ scale: 2.0 });

          const canvas = document.createElement('canvas');
          canvas.width = viewport.width;
          canvas.height = viewport.height;

          const context = canvas.getContext('2d');
          if (context === null) {
            // Fail with a clear message instead of handing pdf.js a null context.
            throw new Error(
              `Unable to obtain a 2D canvas context for page ${pageNum}`,
            );
          }

          await page.render({
            canvasContext: context,
            viewport,
          }).promise;

          const {
            data: { text },
          } = await worker.recognize(canvas);
          extractedText.push(text);
        }

        return extractedText.join('\n').trim();
      } finally {
        // Always shut the OCR worker down, including when rendering or
        // recognition threw part-way through the document.
        await worker.terminate();
      }
    } finally {
      await loadingTask.destroy();
    }
  }

  /**
   * Points pdf.js at the CDN-hosted worker script matching the bundled pdf.js
   * version, reads the file into memory and starts loading it.
   *
   * @returns The pdf.js loading task; await its `promise` for the document and
   * call `destroy()` on it when finished.
   */
  private async openDocument() {
    pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.mjs`;

    const pdfData = await this.file.arrayBuffer();
    return pdfjsLib.getDocument({ data: pdfData });
  }

  /**
   * Returns the `str` of a text-content item, or an empty string when the item
   * carries no string `str`. An empty string keeps the joined output the same
   * as joining a missing value would.
   */
  private static textOf(item: unknown): string {
    if (typeof item === 'object' && item !== null && 'str' in item) {
      const { str } = item as { str: unknown };
      return typeof str === 'string' ? str : '';
    }
    return '';
  }
}

export default PDFExtractor;
