PDF to OCR




<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF OCR Converter</title>
    <!-- Bootstrap stylesheet. The "package@version" segment of this URL had been
         destroyed by an e-mail obfuscation filter, leaving an unloadable address.
         NOTE(review): the pinned version is a reconstruction (any 5.x release
         provides the classes used on this page) - confirm the intended release. -->
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <style>
    /* Page colours for the default (light) theme. */
    :root {
        --bg-color: #ffffff;
        --text-color: #212529;
    }

    /* Overrides applied while an ancestor carries data-theme="dark";
       toggleTheme() in the page script sets that attribute on <html>. */
    [data-theme="dark"] {
        --bg-color: #1a1a1a;
        --text-color: #f8f9fa;
    }

    body {
        background-color: var(--bg-color);
        color: var(--text-color);
        min-height: 100vh;
    }

    /* Dashed drop zone around the file picker. */
    .upload-container {
        border: 2px dashed #6c757d;
        border-radius: 15px;
        transition: all 0.3s ease;
    }

    .upload-container:hover {
        border-color: #0d6efd;
        background-color: rgba(13, 110, 253, 0.05);
    }

    /* Preview image stays hidden by default. */
    #preview {
        max-width: 100%;
        height: auto;
        display: none;
    }

    /* Keep the line breaks and spacing of the recognised text. */
    #ocrResult {
        white-space: pre-wrap;
        font-family: monospace;
    }

    /* Progress bar is hidden until the script shows it. */
    .progress {
        height: 25px;
        display: none;
    }
</style>
</head>
<body>
<!-- Primary page content; <main> gives assistive technology a landmark to jump to. -->
<main class="container py-5">
    <h1 class="text-center mb-4">PDF to OCR Converter</h1>

    <!-- Upload area: dragover's default is cancelled so the element accepts drops;
         the drop itself is handed to handleFileDrop(). -->
    <div class="row justify-content-center mb-4">
        <div class="col-md-8">
            <div class="upload-container p-5 text-center"
                 ondragover="event.preventDefault()"
                 ondrop="handleFileDrop(event)">
                <!-- Hidden native picker, opened by the button below. -->
                <input type="file" id="fileInput" accept="application/pdf" hidden>
                <button type="button" class="btn btn-primary mb-3" onclick="document.getElementById('fileInput').click()">
                    Choose PDF File
                </button>
                <p class="mb-0">or drag and drop PDF here</p>
            </div>
        </div>
    </div>

    <!-- Progress indicator; hidden by the stylesheet until the script shows it.
         aria-valuenow is intentionally not set here: without it the bar is
         announced as indeterminate instead of being stuck at a stale value. -->
    <div class="progress mb-4">
        <div class="progress-bar progress-bar-striped progress-bar-animated"
             role="progressbar" aria-label="OCR progress"
             aria-valuemin="0" aria-valuemax="100"
             style="width: 0%"></div>
    </div>

    <div class="row">
        <div class="col-md-6">
            <!-- Off-screen render target the script draws each PDF page into. -->
            <canvas id="pdfCanvas" class="d-none"></canvas>
            <img id="preview" alt="PDF preview">
        </div>
        <div class="col-md-6">
            <div class="card">
                <div class="card-header d-flex justify-content-between align-items-center">
                    <span id="ocrResultLabel">OCR Results</span>
                    <button type="button" class="btn btn-sm btn-outline-secondary" onclick="toggleTheme()">
                        Toggle Theme
                    </button>
                </div>
                <div class="card-body">
                    <!-- Named by the card header so the field is not an unlabeled control. -->
                    <textarea id="ocrResult" class="form-control" rows="15" readonly
                              aria-labelledby="ocrResultLabel"></textarea>
                </div>
            </div>
        </div>
    </div>
</main>

<!-- Third-party libraries, loaded synchronously because the inline script below
     uses their globals (Tesseract, pdfjsLib) as soon as it runs.
     The "package@version" segments of the first two URLs had been destroyed by an
     e-mail obfuscation filter, leaving unloadable addresses.
     NOTE(review): the pinned versions are reconstructions - the Tesseract.js pin
     was chosen because the inline script uses the synchronous createWorker() plus
     load()/loadLanguage()/initialize() call sequence; confirm the intended releases. -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
<script src="https://unpkg.com/tesseract.js@2.1.5/dist/tesseract.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.4.120/pdf.min.js"></script>

<script>
    // Shared Tesseract.js OCR worker, created once when the script loads and used
    // by initializeOCR() (setup) and processPDF() (recognition).
    // NOTE(review): the functions below call methods directly on this value, so
    // the code assumes createWorker() returns the worker itself rather than a
    // Promise - confirm against the Tesseract.js version actually loaded.
    const worker = Tesseract.createWorker();
    // Name of the active colour theme; flipped between 'light' and 'dark' by toggleTheme().
    let currentTheme = 'light';

    // Memoised promise for the one-time worker setup. Lets every caller await the
    // same initialisation instead of racing it or repeating it.
    let ocrReadyPromise = null;

    /**
     * Prepare the shared Tesseract worker for English recognition.
     *
     * Safe to call repeatedly: the setup sequence runs once and later calls get
     * the same promise. If setup fails the memo is cleared so a later call can
     * retry.
     *
     * @returns {Promise<void>} Settles when the worker is ready; rejects if any
     *     setup step fails.
     */
    function initializeOCR() {
        if (!ocrReadyPromise) {
            ocrReadyPromise = (async () => {
                await worker.load();
                await worker.loadLanguage('eng');
                await worker.initialize('eng');
            })();
            // Side handler only: callers awaiting the returned promise still see
            // the rejection. It clears the memo and keeps fire-and-forget calls
            // from failing silently.
            ocrReadyPromise.catch((err) => {
                ocrReadyPromise = null;
                console.error('OCR initialisation failed', err);
            });
        }
        return ocrReadyPromise;
    }

    /**
     * Run OCR over every page of a PDF and put the text into #ocrResult.
     *
     * Each page is rendered at 2x scale into the #pdfCanvas element, exported as
     * a JPEG data URL and passed to the Tesseract worker. Page texts are joined
     * in page order, each followed by a blank line.
     *
     * @param {File} file PDF chosen or dropped by the user. A missing value
     *     (e.g. a cancelled file dialog) is ignored.
     * @returns {Promise<void>} Always resolves; failures are logged and reported
     *     in the result field instead of being thrown.
     */
    async function processPDF(file) {
        if (!file) {
            return;
        }

        const output = document.getElementById('ocrResult');
        let loadingTask = null;

        showProgress(0);
        try {
            // Recognition must not start before the worker has finished loading.
            await initializeOCR();

            const pdfData = await readFileAsArrayBuffer(file);
            loadingTask = window.pdfjsLib.getDocument({ data: pdfData });
            const pdf = await loadingTask.promise;

            const canvas = document.getElementById('pdfCanvas');
            const context = canvas.getContext('2d');
            let textResult = '';

            for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
                const page = await pdf.getPage(pageNum);
                const viewport = page.getViewport({ scale: 2 });

                // Resizing the canvas also clears the previous page's pixels.
                canvas.height = viewport.height;
                canvas.width = viewport.width;

                await page.render({ canvasContext: context, viewport }).promise;
                const imageData = canvas.toDataURL('image/jpeg');

                const result = await worker.recognize(imageData);
                textResult += result.data.text + '\n\n';

                // Advance only after the page is done, so 100% means finished.
                showProgress((pageNum / pdf.numPages) * 100);
            }

            output.value = textResult;
        } catch (err) {
            console.error('PDF OCR failed', err);
            const reason = err && err.message ? err.message : String(err);
            output.value = `Error: could not process the PDF (${reason})`;
        } finally {
            if (loadingTask) {
                // Release the parsed document held by pdf.js.
                loadingTask.destroy();
            }
            // Hide the bar on success and on failure alike.
            hideProgress();
        }
    }

    /**
     * Drop handler for the upload area: starts OCR on the first dropped file
     * when it is a PDF.
     *
     * Drops that carry no file (e.g. dragged text) or a non-PDF file are ignored.
     *
     * @param {DragEvent} e The drop event.
     */
    function handleFileDrop(e) {
        // Stop the browser from navigating to / opening the dropped file.
        e.preventDefault();

        const files = e.dataTransfer ? e.dataTransfer.files : null;
        const file = files && files.length > 0 ? files[0] : null;
        if (!file) {
            return;
        }

        // Accept an empty MIME type when the name ends in .pdf, since the
        // reported type can be blank for a genuine PDF.
        const looksLikePdf = file.type === 'application/pdf' ||
            (file.type === '' && /\.pdf$/i.test(file.name || ''));
        if (looksLikePdf) {
            processPDF(file);
        }
    }

    /**
     * Read a File/Blob fully into memory.
     *
     * @param {Blob} file The file to read.
     * @returns {Promise<ArrayBuffer>} Resolves with the file contents; rejects
     *     if the read fails or is aborted, so callers never wait forever.
     */
    function readFileAsArrayBuffer(file) {
        return new Promise((resolve, reject) => {
            const reader = new FileReader();
            reader.onload = () => resolve(reader.result);
            reader.onerror = () => reject(reader.error || new Error('Failed to read file'));
            reader.onabort = () => reject(new Error('File read was aborted'));
            // A synchronous throw here (e.g. a non-Blob argument) also rejects the
            // promise, because it happens inside the executor.
            reader.readAsArrayBuffer(file);
        });
    }

    /**
     * Reveal the progress bar and set how full it is.
     *
     * @param {number} percent Completion percentage; values outside 0-100 are
     *     clamped and non-numeric values are treated as 0.
     */
    function showProgress(percent) {
        const value = Math.min(100, Math.max(0, Number(percent) || 0));
        const track = document.querySelector('.progress');
        const bar = document.querySelector('.progress-bar');

        // Bootstrap lays .progress out as a flex container; with display:block
        // the text-less bar would collapse to zero height and never be visible.
        track.style.display = 'flex';
        bar.style.width = `${value}%`;
        // Keep the value exposed to assistive technology in step with the width.
        bar.setAttribute('aria-valuenow', String(Math.round(value)));
    }

    /** Hide the progress bar container again. */
    function hideProgress() {
        const progressTrack = document.querySelector('.progress');
        progressTrack.style.display = 'none';
    }

    /**
     * Switch between the light and dark themes by updating currentTheme and
     * mirroring it into the data-theme attribute of the root element.
     */
    function toggleTheme() {
        if (currentTheme === 'light') {
            currentTheme = 'dark';
        } else {
            currentTheme = 'light';
        }
        const rootElement = document.documentElement;
        rootElement.setAttribute('data-theme', currentTheme);
    }

    // Initialize

    // Tell pdf.js where its worker script lives instead of relying on its
    // fallback guess. The URL matches the pdf.js build loaded by this page;
    // an already-configured value is left alone.
    if (window.pdfjsLib && window.pdfjsLib.GlobalWorkerOptions &&
            !window.pdfjsLib.GlobalWorkerOptions.workerSrc) {
        window.pdfjsLib.GlobalWorkerOptions.workerSrc =
            'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.4.120/pdf.worker.min.js';
    }

    // Run OCR whenever the user picks a file through the native dialog.
    document.getElementById('fileInput').addEventListener('change', (e) => {
        const input = e.target;
        const file = input.files && input.files.length > 0 ? input.files[0] : null;
        // Clear the selection so choosing the same file again fires 'change';
        // the File object captured above stays usable.
        input.value = '';
        // Nothing is selected when the dialog is cancelled.
        if (file) {
            processPDF(file);
        }
    });

    // Start loading the OCR engine right away; report a failure rather than
    // leaving an unhandled promise rejection.
    initializeOCR().catch((err) => {
        console.error('Failed to initialise the OCR engine', err);
    });
</script>
</body> </html>