<!-- PDF to OCR -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>PDF OCR Converter</title>
<!--
  Bootstrap stylesheet. The version segment of this URL had been replaced by
  the placeholder "[email protected]" (e-mail obfuscation residue), which made
  the request fail. It is pinned to an explicit Bootstrap 5 release here;
  keep it in step with the Bootstrap bundle script loaded at the end of <body>.
-->
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
<style>
  /* Theme tokens. The light values live on :root; toggling the
     data-theme attribute on <html> to "dark" swaps them. */
  :root {
    --bg-color: #ffffff;
    --text-color: #212529;
  }

  [data-theme="dark"] {
    --bg-color: #1a1a1a;
    --text-color: #f8f9fa;
    --surface-color: #2b2b2b;
    --border-color: #495057;
  }

  body {
    background-color: var(--bg-color);
    color: var(--text-color);
    min-height: 100vh;
  }

  /* Bootstrap paints cards and form controls with their own light
     background, so without these rules the results panel stays white
     after switching to the dark theme. The id selector is used for the
     textarea so the rule also wins over Bootstrap's focus/readonly styles. */
  [data-theme="dark"] .card,
  [data-theme="dark"] #ocrResult {
    background-color: var(--surface-color);
    color: var(--text-color);
    border-color: var(--border-color);
  }

  /* Drop zone */
  .upload-container {
    border: 2px dashed #6c757d;
    border-radius: 15px;
    /* Only the two properties changed on hover are animated. */
    transition: border-color 0.3s ease, background-color 0.3s ease;
  }

  .upload-container:hover {
    border-color: #0d6efd;
    background-color: rgba(13, 110, 253, 0.05);
  }

  /* Page preview image: hidden until script reveals it. */
  #preview {
    max-width: 100%;
    height: auto;
    display: none;
  }

  /* Keep line breaks and spacing of the recognized text. */
  #ocrResult {
    white-space: pre-wrap;
    font-family: monospace;
  }

  /* Progress track: hidden by default, shown by script while processing. */
  .progress {
    height: 25px;
    display: none;
  }
</style>
</head>
<body>
<!-- Primary page content. Ids and classes below are referenced by the
     inline script and stylesheet, so they must stay as they are. -->
<main class="container py-5">
  <h1 class="text-center mb-4">PDF to OCR Converter</h1>

  <!-- Upload area: file picker button plus drag-and-drop target. -->
  <div class="row justify-content-center mb-4">
    <div class="col-md-8">
      <div class="upload-container p-5 text-center"
           ondragover="event.preventDefault()"
           ondrop="handleFileDrop(event)">
        <input type="file" id="fileInput" accept="application/pdf" hidden>
        <button type="button" class="btn btn-primary mb-3" onclick="document.getElementById('fileInput').click()">
          Choose PDF File
        </button>
        <p class="mb-0">or drag and drop PDF here</p>
      </div>
    </div>
  </div>

  <!-- Progress indicator; visibility and width are driven by script. -->
  <div class="progress mb-4">
    <div class="progress-bar progress-bar-striped progress-bar-animated"
         role="progressbar" aria-label="OCR progress"
         aria-valuemin="0" aria-valuemax="100" style="width: 0%"></div>
  </div>

  <div class="row">
    <!-- Left column: off-screen render canvas and the page preview image. -->
    <div class="col-md-6">
      <canvas id="pdfCanvas" class="d-none"></canvas>
      <img id="preview" alt="PDF preview">
    </div>

    <!-- Right column: recognized text. -->
    <div class="col-md-6">
      <div class="card">
        <div class="card-header d-flex justify-content-between align-items-center">
          <!-- A real label gives the read-only textarea an accessible name. -->
          <label for="ocrResult" class="mb-0">OCR Results</label>
          <button type="button" class="btn btn-sm btn-outline-secondary" onclick="toggleTheme()">
            Toggle Theme
          </button>
        </div>
        <div class="card-body">
          <textarea id="ocrResult" class="form-control" rows="15" readonly></textarea>
        </div>
      </div>
    </div>
  </div>
</main>
<!--
  Third-party libraries used by the inline script below.
  The Bootstrap and Tesseract URLs had their "name@version" segment replaced
  by the placeholder "[email protected]" (e-mail obfuscation residue) and could
  not load; explicit versions are pinned here.
  Tesseract.js is pinned to a 2.x release because the inline script calls
  createWorker() synchronously and then load()/loadLanguage()/initialize(),
  which is the 2.x worker API. Re-check that code before changing the major
  version.
-->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
<script src="https://unpkg.com/tesseract.js@2.1.5/dist/tesseract.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.4.120/pdf.min.js"></script>
<script>
// Single Tesseract worker shared by every OCR request on this page.
const worker = Tesseract.createWorker();
let currentTheme = 'light';

// pdf.js parses documents in its own worker script. The URL must match the
// pdf.js build loaded by the page (3.4.120 from cdnjs).
const PDF_WORKER_SRC = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.4.120/pdf.worker.min.js';

// Memoized initialization promise: set on the first initializeOCR() call so
// the worker is prepared once and every caller waits on the same setup.
let ocrReady = null;

/**
 * Prepare the shared Tesseract worker for English recognition.
 *
 * Safe to call repeatedly: the first call starts the setup and later calls
 * return the same promise. If setup fails, the cached promise is dropped so
 * that a later call can try again.
 *
 * @returns {Promise<void>} Settles when the worker is ready (or setup failed).
 */
function initializeOCR() {
  if (ocrReady === null) {
    ocrReady = (async () => {
      await worker.load();
      await worker.loadLanguage('eng');
      await worker.initialize('eng');
    })();
    // Forget a failed attempt; the rejection itself still reaches callers
    // through the promise returned below.
    ocrReady.catch(() => {
      ocrReady = null;
    });
  }
  return ocrReady;
}

/**
 * Render every page of a PDF to the hidden canvas, run OCR on each rendered
 * page, and write the combined text into the #ocrResult textarea.
 *
 * Failures (unreadable file, invalid PDF, OCR error) are logged and reported
 * in the textarea instead of surfacing as an unhandled rejection, and the
 * progress bar is always hidden when the run ends.
 *
 * @param {File} file PDF file chosen or dropped by the user.
 * @returns {Promise<void>}
 */
async function processPDF(file) {
  const output = document.getElementById('ocrResult');
  const preview = document.getElementById('preview');
  let loadingTask = null;

  showProgress(0);
  try {
    // recognize() must not run before the worker has finished loading the
    // language data, so wait for initialization first.
    await initializeOCR();

    const pdfjs = window.pdfjsLib;
    if (!pdfjs.GlobalWorkerOptions.workerSrc) {
      pdfjs.GlobalWorkerOptions.workerSrc = PDF_WORKER_SRC;
    }

    const pdfData = await readFileAsArrayBuffer(file);
    loadingTask = pdfjs.getDocument({ data: pdfData });
    const pdf = await loadingTask.promise;

    const canvas = document.getElementById('pdfCanvas');
    const context = canvas.getContext('2d');
    let textResult = '';

    for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const viewport = page.getViewport({ scale: 2 });
      canvas.height = viewport.height;
      canvas.width = viewport.width;
      await page.render({ canvasContext: context, viewport }).promise;

      const imageData = canvas.toDataURL('image/jpeg');

      // Show the page currently being recognized; #preview is hidden by the
      // stylesheet until it has something to display.
      preview.src = imageData;
      preview.style.display = 'block';

      const result = await worker.recognize(imageData);
      textResult += result.data.text + '\n\n';

      // Advance only after the page is fully processed so the bar does not
      // reach 100% while the last page is still being recognized.
      showProgress((pageNum / pdf.numPages) * 100);
    }

    output.value = textResult;
  } catch (err) {
    console.error('PDF OCR failed:', err);
    const reason = err instanceof Error ? err.message : String(err);
    output.value = `Could not process the PDF: ${reason}`;
  } finally {
    hideProgress();
    if (loadingTask) {
      // Best-effort release of the document and its pdf.js worker.
      try {
        await loadingTask.destroy();
      } catch (cleanupErr) {
        console.warn('PDF cleanup failed:', cleanupErr);
      }
    }
  }
}
/**
 * Drop handler for the upload area: starts OCR for the first dropped file
 * when it is a PDF and ignores every other drop.
 *
 * @param {DragEvent} e Drop event from the upload container.
 */
function handleFileDrop(e) {
  // Stop the browser from navigating to / opening the dropped file.
  e.preventDefault();

  // A drop may carry no files at all (e.g. dragged text or a link).
  const files = e.dataTransfer ? e.dataTransfer.files : null;
  const file = files && files.length > 0 ? files[0] : null;
  if (!file) {
    return;
  }

  // Some platforms report an empty MIME type; fall back to the extension
  // in that case only.
  const isPdf =
    file.type === 'application/pdf' ||
    (file.type === '' && /\.pdf$/i.test(file.name));

  if (isPdf) {
    processPDF(file);
  }
}
/**
 * Read a File/Blob into memory.
 *
 * @param {Blob} file File to read.
 * @returns {Promise<ArrayBuffer>} Resolves with the file contents; rejects
 *   when the read fails or is aborted, so callers never wait forever.
 */
function readFileAsArrayBuffer(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    reader.onerror = () => reject(reader.error || new Error('Failed to read file'));
    reader.onabort = () => reject(new Error('File read was aborted'));
    reader.readAsArrayBuffer(file);
  });
}
/**
 * Reveal the progress bar and set how much of it is filled.
 *
 * @param {number} percent Completion percentage; values outside 0-100 (or
 *   non-numeric values) are clamped into that range.
 */
function showProgress(percent) {
  const value = Math.min(100, Math.max(0, Number(percent) || 0));
  const track = document.querySelector('.progress');
  const bar = document.querySelector('.progress-bar');

  // Bootstrap lays .progress out as a flex container; the empty bar only
  // stretches to the track height under flex, so "block" would collapse it.
  track.style.display = 'flex';
  bar.style.width = `${value}%`;

  // Keep the progressbar role's value in sync for assistive technology.
  bar.setAttribute('aria-valuenow', String(Math.round(value)));
}
/** Hide the progress track again once processing has ended. */
function hideProgress() {
  const track = document.querySelector('.progress');
  track.style.display = 'none';
}
/**
 * Flip the page between the light and dark themes and publish the choice
 * as the data-theme attribute on the root element.
 */
function toggleTheme() {
  if (currentTheme === 'light') {
    currentTheme = 'dark';
  } else {
    currentTheme = 'light';
  }
  const root = document.documentElement;
  root.setAttribute('data-theme', currentTheme);
}
// Initialize: wire the file picker and start preparing the OCR worker.
document.getElementById('fileInput').addEventListener('change', (e) => {
  const input = e.target;
  const file = input.files && input.files.length > 0 ? input.files[0] : null;

  // Clear the selection so choosing the same file again fires "change".
  // The File object obtained above stays usable after the reset.
  input.value = '';

  // The file list can be empty when the picker is dismissed.
  if (!file) {
    return;
  }
  processPDF(file);
});

// Start loading the OCR engine right away; report a failed start instead of
// leaving an unhandled promise rejection.
initializeOCR().catch((err) => {
  console.error('OCR initialization failed:', err);
});
</script>
</body>
</html>