// Extract text from screenshots, photos, PDFs, etc.
/*
 * File-to-text OCR
 * Base: https://imagetotext.my
 * Source: https://whatsapp.com/channel/0029VbB4Kw8EFeXfeExaXc3Q
 * Note: extracts text from a file (image or PDF)
 *
 * Usage:
 *   node ocr.js
 *   ^ run without arguments to print usage information
 */
const fs = require('fs');
const path = require('path');
// ── Config ───────────────────────────────────────────────────────────────────
/** Origin of the OCR service; the endpoint URL and the Referer/Origin headers derive from it. */
const BASE = 'https://imagetotext.my';

/** The one endpoint used for both uploading a file and polling its status. */
const API = new URL('/index.php', BASE).href;

/** Lower-cased file extension (with leading dot) → Content-Type sent for the uploaded part. */
const MIME = {
  '.jpg': 'image/jpeg',
  '.jpeg': 'image/jpeg',
  '.png': 'image/png',
  '.gif': 'image/gif',
  '.webp': 'image/webp',
  '.bmp': 'image/bmp',
  '.tif': 'image/tiff',
  '.tiff': 'image/tiff',
  '.pdf': 'application/pdf',
};

/** Browser-like headers attached to every request made by this module. */
const BASE_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:140.0) Gecko/20100101 Firefox/140.0',
  Accept: '*/*',
  'Accept-Language': 'en-US,en;q=0.5',
  Referer: `${BASE}/`,
  Origin: BASE,
};

/** Resolve (with no value) after `ms` milliseconds. */
const sleep = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
// ── Cookie auto-fetch ─────────────────────────────────────────────────────────
// Cached `login=<value>` pair; null until the first successful fetch (clearCookieCache resets it).
let _cookieCache = null;

/**
 * Split a comma-joined Set-Cookie header value into individual cookie strings.
 *
 * A plain split on "," is wrong because `Expires=Wed, 21 Oct ...` contains a
 * comma too, so the split only happens where the comma is followed by what
 * looks like the start of the next cookie (`name=`).
 *
 * @param {string} raw - Value of `headers.get('set-cookie')`.
 * @returns {string[]} One trimmed string per cookie.
 */
function splitSetCookie(raw) {
  return raw.split(/,\s*(?=[^;,=\s]+=)/).map((s) => s.trim());
}

/**
 * Fetch the homepage once and return the `login=<value>` session cookie.
 * The value is cached in `_cookieCache`; later calls return the cached value
 * without touching the network.
 *
 * @returns {Promise<string>} The cookie as a `login=<value>` pair.
 * @throws {Error} When the homepage response carries no `login` cookie.
 */
async function fetchCookie() {
  if (_cookieCache) return _cookieCache;

  const res = await fetch(BASE, { method: 'GET', headers: BASE_HEADERS, redirect: 'follow' });

  // Prefer Headers.getSetCookie() (one entry per cookie) when the runtime has
  // it; otherwise fall back to splitting the single comma-joined header value.
  const cookies =
    typeof res.headers.getSetCookie === 'function'
      ? res.headers.getSetCookie()
      : splitSetCookie(res.headers.get('set-cookie') ?? '');

  // Keep only the `name=value` part (before the first ";") of the cookie named "login".
  const loginCookie = cookies
    .map((c) => c.split(';')[0].trim())
    .find((c) => c.startsWith('login='));

  if (!loginCookie) throw new Error('Gagal ambil session cookie dari homepage');

  _cookieCache = loginCookie;
  return loginCookie;
}
/**
 * Forget the cached session cookie by resetting `_cookieCache` to null.
 *
 * @returns {void}
 */
function clearCookieCache() {
  _cookieCache = null;
}
// ── Core ─────────────────────────────────────────────────────────────────────
/**
 * Escape a filename for use inside the quoted `filename="..."` parameter of a
 * multipart/form-data part. Double quotes and line breaks are percent-encoded
 * so the name cannot terminate the parameter or inject extra header lines.
 *
 * @param {string} name - Raw filename.
 * @returns {string} Filename safe to place between double quotes.
 */
function escapeMultipartFilename(name) {
  const replacements = { '\r': '%0D', '\n': '%0A', '"': '%22' };
  return String(name).replace(/[\r\n"]/g, (ch) => replacements[ch]);
}

/**
 * Upload one file to the OCR endpoint as multipart/form-data.
 *
 * @param {string|Buffer} input - Path of the file to read, or the file content itself.
 * @param {string} [name] - Filename to report; defaults to the basename of the
 *   path, or "image.png" when `input` is not a string.
 * @returns {Promise<{fileId: string, key: string, url: *}>} Identifiers taken
 *   from the `data` object of the upload response.
 * @throws {Error} When the response is not JSON, reports failure, or lacks
 *   `key` / `file_id`.
 */
async function upload(input, name) {
  const cookie = await fetchCookie();

  const isPath = typeof input === 'string';
  // Read asynchronously so concurrent uploads do not block each other on disk I/O.
  const buf = isPath ? await fs.promises.readFile(input) : input;
  const filename = name || (isPath ? path.basename(input) : 'image.png');
  const mime = MIME[path.extname(filename).toLowerCase()] || 'application/octet-stream';

  // Hand-built multipart body: an "op" text field followed by the binary "file" part.
  const boundary = `----OCR${Date.now()}`;
  const head = Buffer.from(
    `--${boundary}\r\nContent-Disposition: form-data; name="op"\r\n\r\nupload_direct\r\n` +
      `--${boundary}\r\nContent-Disposition: form-data; name="file"; ` +
      `filename="${escapeMultipartFilename(filename)}"\r\nContent-Type: ${mime}\r\n\r\n`
  );
  const tail = Buffer.from(`\r\n--${boundary}--\r\n`);
  const body = Buffer.concat([head, buf, tail]);

  const headers = {
    ...BASE_HEADERS,
    'Cookie': cookie,
    'Content-Type': `multipart/form-data; boundary=${boundary}`,
    'Content-Length': String(body.length),
  };

  const res = await fetch(API, { method: 'POST', headers, body });

  let json;
  try {
    json = await res.json();
  } catch (err) {
    // A non-JSON body is treated like any other failed upload: drop the cookie
    // so the next attempt starts with a fresh session, and report the status.
    clearCookieCache();
    throw new Error(`Upload gagal: respons bukan JSON (HTTP ${res.status})`, { cause: err });
  }

  if (!json?.success) {
    // The cookie may be stale even though it was just fetched: clear the cache and fail.
    clearCookieCache();
    throw new Error(
      json?.error === 'file_too_large'
        ? 'File terlalu besar (max 10 MB)'
        : json?.message || 'Upload gagal'
    );
  }

  // `data` may be absent even when `success` is set; fall through to the validity check.
  const { key, file_id, url } = json.data ?? {};
  if (!key || !file_id) throw new Error('Response upload tidak valid');
  return { fileId: String(file_id), key, url };
}
/**
 * Poll the status endpoint until the OCR task finishes, fails, or times out.
 *
 * Request and JSON-parse failures are treated as transient: the loop keeps
 * polling, and the most recent such failure is attached as `cause` to the
 * timeout error so the caller can see why nothing was ever received.
 *
 * @param {string} fileId - `file_id` query value (as returned by the upload).
 * @param {string} key - `filename` query value (the upload's `key`).
 * @param {number} [timeout=300000] - Overall time budget in milliseconds.
 * @param {number} [interval=2000] - Delay between two polls in milliseconds.
 * @returns {Promise<{text: string, confidence: *, language: *}>} Recognised
 *   text plus whatever confidence/language the response carries (null if absent).
 * @throws {Error} `OCR error: <status>` when the reported status contains
 *   "fail" or "error"; `Timeout setelah <n>s` when the time budget is exceeded.
 */
async function poll(fileId, key, timeout = 5 * 60 * 1000, interval = 2000) {
  const cookie = await fetchCookie();
  const params = new URLSearchParams({
    op: 'status',
    action: 'check_task_status',
    file_id: fileId,
    filename: key,
  });
  const url = `${API}?${params}`;
  const headers = { ...BASE_HEADERS, 'Cookie': cookie };
  const finishedStates = ['completed', 'complete', 'done', 'success'];
  const start = Date.now();
  let lastError = null;

  while (true) {
    const elapsed = Date.now() - start;
    if (elapsed > timeout) {
      throw new Error(
        `Timeout setelah ${Math.round(elapsed / 1000)}s`,
        lastError ? { cause: lastError } : undefined
      );
    }

    // Only the network/parse step is best-effort; status handling happens
    // outside the try so a terminal OCR failure is never mistaken for a retry.
    let data = null;
    try {
      const res = await fetch(url, { headers });
      const json = await res.json();
      // The payload may be wrapped in `data` or sit at the top level.
      data = json?.data || json || {};
      lastError = null;
    } catch (err) {
      lastError = err;
    }

    if (data) {
      const status = String(data.status || '').toLowerCase();
      if (finishedStates.includes(status)) {
        const r = data.result || data;
        return {
          text: r.ocr_text || r.text || '',
          confidence: r.confidence ?? null,
          language: r.language ?? null,
        };
      }
      if (status.includes('fail') || status.includes('error')) {
        throw new Error(`OCR error: ${status}`);
      }
      // Any other status means the task is still running: keep polling.
    }

    await sleep(interval);
  }
}
// ── Public API ────────────────────────────────────────────────────────────────
/**
 * Run the full OCR round trip for one file: upload it, then poll for the result.
 *
 * @param {string|Buffer} input - File path or file content, handed to `upload`.
 * @param {{filename?: string, timeout?: number}} [opts] - `filename` overrides
 *   the reported name; `timeout` is forwarded to `poll`.
 * @returns {Promise<{text: *, confidence: *, language: *, fileId: *}>} The
 *   poll result's text/confidence/language plus the upload's file id.
 */
async function extract(input, opts = {}) {
  // Resolve the name to report: explicit option, else the path's basename, else a fixed default.
  let uploadName = opts.filename;
  if (!uploadName) {
    uploadName = typeof input === 'string' ? path.basename(input) : 'image.png';
  }

  const uploaded = await upload(input, uploadName);
  const outcome = await poll(uploaded.fileId, uploaded.key, opts.timeout);

  return {
    text: outcome.text,
    confidence: outcome.confidence,
    language: outcome.language,
    fileId: uploaded.fileId,
  };
}
/**
 * Run `extract` over many inputs with a bounded number of concurrent workers.
 *
 * Per-file failures do not reject the batch: the corresponding slot becomes
 * `{ file, text: null, error }`. Results keep the order of `inputs`.
 *
 * @param {Array<string|Buffer>} inputs - Files to process.
 * @param {{concurrency?: number}} [opts] - `concurrency` caps the number of
 *   parallel workers (default 5; anything below 1 or non-numeric falls back to
 *   1). The whole object is also forwarded to each `extract` call.
 * @returns {Promise<Array<object>>} One result object per input.
 */
async function extractBatch(inputs, opts = {}) {
  // Nothing to do: skip the cookie round trip entirely.
  if (inputs.length === 0) return [];

  // Fetch the session cookie once up front so the workers all reuse the cached value.
  await fetchCookie();

  // A limit of 0, a negative number or NaN would start no workers at all and
  // leave every result slot empty, so clamp to at least one worker.
  const requested = Number(opts.concurrency ?? 5);
  const limit = requested >= 1 ? Math.floor(requested) : 1;
  const workerCount = Math.min(limit, inputs.length);

  const results = new Array(inputs.length);
  let next = 0;

  // Each worker repeatedly claims the next unprocessed index until none are left.
  const worker = async () => {
    while (next < inputs.length) {
      const i = next++;
      const file = inputs[i];
      try {
        results[i] = { file, ...(await extract(file, opts)) };
      } catch (e) {
        results[i] = { file, text: null, error: e?.message ?? String(e) };
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
module.exports = { extract, extractBatch, upload, poll, fetchCookie, clearCookieCache };
// ── CLI ───────────────────────────────────────────────────────────────────────
// Command-line entry point: runs only when this file is executed directly,
// not when it is loaded through require().
if (require.main === module) {
  const argv = process.argv.slice(2);

  // No arguments at all: print the usage text and stop.
  if (!argv.length) {
    console.log(`
imagetotext.my OCR — Node.js wrapper
Usage:
node ocr.js <file> [options]
node ocr.js <file1> <file2> ... (batch)
Options:
-o <output> Simpan ke file (contoh: -o hasil.json / hasil.txt / hasil.md)
Tanpa -o → langsung print JSON ke terminal
Examples:
node ocr.js foto.png
node ocr.js scan.pdf -o hasil.txt
node ocr.js a.png b.jpg c.png -o batch.json
Note: Cookie session di-fetch otomatis dari homepage, tidak perlu hardcode.
`);
    process.exit(0);
  }

  // Report a fatal error as a one-line JSON object on stderr and exit non-zero.
  const fail = (message) => {
    console.error(JSON.stringify({ error: message }));
    process.exit(1);
  };

  // Split the arguments into input files and the optional `-o <output>` target.
  const files = [];
  let outFile = null;
  for (let i = 0; i < argv.length; i++) {
    if (argv[i] === '-o') {
      // A dangling `-o` is a usage error, not an input file called "-o".
      if (i + 1 >= argv.length) fail('Opsi -o butuh nama file output');
      outFile = argv[++i];
    } else {
      files.push(argv[i]);
    }
  }
  if (!files.length) fail('Tidak ada file input');

  // The output format follows the extension of the -o target: .txt/.md → plain
  // text, anything else (or no -o at all) → JSON.
  const ext = outFile ? path.extname(outFile).toLowerCase() : '.json';
  const isTxt = ext === '.txt' || ext === '.md';

  (async () => {
    try {
      // Inside the try so that a failed cookie fetch is reported like any
      // other error instead of surfacing as an unhandled rejection.
      process.stderr.write('Fetching session cookie...\n');
      await fetchCookie();
      process.stderr.write('Processing...\n');

      const raw = files.length === 1
        ? await extract(files[0])
        : await extractBatch(files);

      let output;
      if (isTxt && files.length === 1) {
        output = raw.text;
      } else if (isTxt) {
        // Failed entries have `text: null`; an empty string is a valid
        // (successful) result and must not be reported as an error.
        output = raw
          .map((r) => `=== ${r.file} ===\n${r.text ?? '[error] ' + r.error}`)
          .join('\n\n');
      } else {
        output = JSON.stringify(raw, null, 2);
      }

      if (outFile) {
        fs.writeFileSync(outFile, output, 'utf8');
        process.stderr.write(`Saved: ${outFile}\n`);
      } else {
        console.log(output);
      }
    } catch (err) {
      fail(err.message);
    }
  })();
}