Spaces:

CultriX
/

Generate-Knowledge-Graphs

Running

App Files Files Community

Generate-Knowledge-Graphs / src /document_processor.py

CultriX

First commit

e86199a 11 months ago

raw

history blame contribute delete

4.37 kB

	import os
	import json
	from typing import List, Dict, Any
	import pdfplumber
	from docx import Document
	from config.settings import Config

	class DocumentProcessor:
	    """Load supported documents as plain text and split the text into chunks.

	    Supported extensions are ``.pdf``, ``.docx``, ``.txt`` and ``.json``.
	    The instance reads three settings from ``self.config``:
	    ``MAX_FILE_SIZE_MB``, ``CHUNK_SIZE`` and ``CHUNK_OVERLAP``.
	    """

	    def __init__(self):
	        """Create a processor backed by a fresh ``Config`` instance."""
	        self.config = Config()

	    def validate_file_size(self, file_path: str) -> bool:
	        """Return True when the file is no larger than ``MAX_FILE_SIZE_MB``.

	        Raises:
	            OSError: If the file does not exist or cannot be accessed.
	        """
	        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
	        return file_size_mb <= self.config.MAX_FILE_SIZE_MB

	    def load_document(self, file_path: str) -> str:
	        """Load a document's text, picking the loader from the file extension.

	        The extension is matched case-insensitively. The size check runs
	        before the extension check.

	        Raises:
	            ValueError: If the file exceeds the size limit or the extension
	                is not one of the supported formats.
	        """
	        if not self.validate_file_size(file_path):
	            raise ValueError(f"File size exceeds {self.config.MAX_FILE_SIZE_MB}MB limit")

	        file_ext = os.path.splitext(file_path)[1].lower()
	        loaders = {
	            '.pdf': self._load_pdf,
	            '.docx': self._load_docx,
	            '.txt': self._load_txt,
	            '.json': self._load_json,
	        }
	        loader = loaders.get(file_ext)
	        if loader is None:
	            raise ValueError(f"Unsupported file format: {file_ext}")
	        return loader(file_path)

	    def _load_pdf(self, file_path: str) -> str:
	        """Return the text of every PDF page, each followed by a newline.

	        Pages for which ``extract_text()`` yields nothing are skipped.
	        """
	        with pdfplumber.open(file_path) as pdf:
	            page_texts = (page.extract_text() for page in pdf.pages)
	            # Joined inside the ``with`` so pages are read while the PDF is open.
	            return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

	    def _load_docx(self, file_path: str) -> str:
	        """Return the text of every DOCX paragraph, each followed by a newline."""
	        doc = Document(file_path)
	        return "".join(f"{paragraph.text}\n" for paragraph in doc.paragraphs)

	    def _load_txt(self, file_path: str) -> str:
	        """Return the content of a UTF-8 text file."""
	        # 'utf-8-sig' decodes plain UTF-8 identically but drops a leading BOM,
	        # which would otherwise end up as '\ufeff' at the start of the text.
	        with open(file_path, 'r', encoding='utf-8-sig') as file:
	            return file.read()

	    def _load_json(self, file_path: str) -> str:
	        """Parse a JSON file and return it re-serialized with 2-space indent.

	        Raises:
	            json.JSONDecodeError: If the file does not contain valid JSON.
	        """
	        # json.load rejects input that still carries a BOM, so strip it here.
	        with open(file_path, 'r', encoding='utf-8-sig') as file:
	            data = json.load(file)
	        # Keep non-ASCII characters readable instead of emitting \uXXXX escapes.
	        return json.dumps(data, indent=2, ensure_ascii=False)

	    def chunk_text(self, text: str) -> List[str]:
	        """Split text into overlapping chunks of at most ``CHUNK_SIZE`` characters.

	        Text that already fits in one chunk is returned unchanged as a
	        single-element list. Otherwise each chunk is cut at the last
	        ``.``, ``!`` or ``?`` in its window when that terminator lies in the
	        second half of the window; chunks are stripped and empty ones are
	        dropped. Consecutive chunks share ``CHUNK_OVERLAP`` characters.

	        Raises:
	            ValueError: If the text needs splitting but ``CHUNK_SIZE`` is
	                not positive.
	        """
	        chunk_size = self.config.CHUNK_SIZE
	        overlap = self.config.CHUNK_OVERLAP
	        text_len = len(text)

	        if text_len <= chunk_size:
	            return [text]
	        if chunk_size <= 0:
	            # A non-positive window can never advance through the text.
	            raise ValueError("CHUNK_SIZE must be a positive integer")

	        chunks = []
	        start = 0

	        while start < text_len:
	            end = min(start + chunk_size, text_len)

	            if end < text_len:
	                # Use the latest terminator of any kind, so an early '.' does
	                # not hide a later '!' or '?'.
	                sentence_end = max(text.rfind(mark, start, end) for mark in '.!?')
	                if sentence_end > start + chunk_size // 2:
	                    end = sentence_end + 1

	            chunk = text[start:end].strip()
	            if chunk:
	                chunks.append(chunk)

	            if end >= text_len:
	                # The text is exhausted; stepping back by the overlap would
	                # only re-emit a tail already contained in this chunk.
	                break

	            next_start = end - overlap
	            # Drop the overlap when it would stall or move the window
	            # backwards (overlap >= size of the chunk just emitted).
	            start = next_start if next_start > start else end

	        return chunks

	    def process_documents(self, file_paths: List[str], batch_mode: bool = False) -> List[Dict[str, Any]]:
	        """Load and chunk documents, returning one result dict per attempted file.

	        Each result has ``file_path``, ``content``, ``chunks`` and ``status``
	        (``'success'`` or ``'error'``); failed files also carry ``error`` with
	        the exception message. Failures are recorded, never raised.

	        When ``batch_mode`` is False only the first path is attempted,
	        whether it succeeds or fails.
	        """
	        results = []

	        for file_path in file_paths:
	            try:
	                content = self.load_document(file_path)
	                chunks = self.chunk_text(content)
	            except Exception as e:
	                # Boundary handler: report the failure in the result list so
	                # one bad file does not abort the rest of a batch.
	                results.append({
	                    'file_path': file_path,
	                    'content': '',
	                    'chunks': [],
	                    'status': 'error',
	                    'error': str(e)
	                })
	            else:
	                results.append({
	                    'file_path': file_path,
	                    'content': content,
	                    'chunks': chunks,
	                    'status': 'success'
	                })

	            if not batch_mode:
	                break  # Process only one file if not in batch mode

	        return results