mirror of
https://github.com/stackblitz-labs/bolt.diy
synced 2025-06-25 09:47:37 +00:00
34 lines
1.1 KiB
TypeScript
34 lines
1.1 KiB
TypeScript
import * as pdfjsLib from 'pdfjs-dist/build/pdf.mjs';
|
|
import pdfWorker from 'pdfjs-dist/build/pdf.worker.mjs';
|
|
import mammoth from 'mammoth/mammoth.browser';
|
|
|
|
// Tell pdf.js which worker script to use, taken from the default import of
// 'pdfjs-dist/build/pdf.worker.mjs' above.
// NOTE(review): pdf.js documents `workerSrc` as a URL string — confirm the bundler
// resolves that import to a URL rather than to a module object.
pdfjsLib.GlobalWorkerOptions.workerSrc = pdfWorker;
|
|
|
|
/**
 * Extracts the plain-text content of a file.
 *
 * - PDF (MIME type `application/pdf` or a `.pdf` extension, case-insensitive):
 *   every page is read with pdf.js; the text items of a page are joined with a
 *   single space and the pages are joined with a newline.
 * - DOCX (the OOXML word-processing MIME type or a `.docx` extension,
 *   case-insensitive): raw text is extracted with mammoth.
 * - Anything else: the file is read as text via `File.text()`.
 *
 * @param file - The file to read.
 * @returns The extracted text.
 * @throws Propagates any rejection from pdf.js, mammoth, or the File API.
 */
export async function extractTextFromFile(file: File): Promise<string> {
  const lowerCaseName = file.name.toLowerCase();

  if (file.type === 'application/pdf' || lowerCaseName.endsWith('.pdf')) {
    const loadingTask = pdfjsLib.getDocument({ data: await file.arrayBuffer() });

    try {
      const pdf = await loadingTask.promise;
      const texts: string[] = [];

      // pdf.js page numbers start at 1.
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const content = await page.getTextContent();

        /*
         * Entries without a `str` field (marked-content markers) contribute an
         * empty string, which is what joining an `undefined` entry produced before.
         */
        texts.push(content.items.map((item) => ('str' in item ? item.str : '')).join(' '));
      }

      return texts.join('\n');
    } finally {
      // Release the document and its worker even when loading or parsing fails.
      await loadingTask.destroy();
    }
  }

  if (
    file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
    lowerCaseName.endsWith('.docx')
  ) {
    const arrayBuffer = await file.arrayBuffer();
    const result = await mammoth.extractRawText({ arrayBuffer });

    return result.value;
  }

  return file.text();
}
|