Spaces:

technophyle
/

code-compass

Runtime error

App Files Files Community

code-compass / src /document_processor.py

technophyle

Sync from GitHub via hub-sync

35c1d2c verified about 5 hours ago

raw

history blame contribute delete

2.94 kB

	import hashlib
	from typing import List, Tuple
	from pathlib import Path
	import pypdf


	class DocumentProcessor:
	# FIX: 512 chars was too small — caused mid-thought splits that hurt faithfulness.
	# 1200 chars with 150 overlap keeps full function signatures + docstrings intact.
	def __init__(self, chunk_size: int = 1200, chunk_overlap: int = 150):
	self.chunk_size = chunk_size
	self.chunk_overlap = chunk_overlap

	def extract_text_from_pdf(self, file_path: str) -> str:
	text = ""
	try:
	with open(file_path, "rb") as file:
	pdf_reader = pypdf.PdfReader(file)
	for page in pdf_reader.pages:
	text += page.extract_text() + "\n"
	except Exception as e:
	raise ValueError(f"Error reading PDF: {str(e)}")
	return text.strip()

	def chunk_text(self, text: str) -> List[str]:
	if not text:
	return []

	chunks = []
	start = 0
	text_length = len(text)

	while start < text_length:
	end = start + self.chunk_size
	chunk = text[start:end]

	if end < text_length:
	# FIX: prefer paragraph breaks > sentence breaks > newlines.
	# Previously only checked period + newline, missing paragraph boundaries.
	last_double_newline = chunk.rfind("\n\n")
	last_period = chunk.rfind(".")
	last_newline = chunk.rfind("\n")

	if last_double_newline > self.chunk_size * 0.4:
	break_point = last_double_newline + 1
	elif last_period > self.chunk_size * 0.5:
	break_point = last_period + 1
	elif last_newline > self.chunk_size * 0.4:
	break_point = last_newline + 1
	else:
	break_point = self.chunk_size

	chunk = chunk[:break_point]
	end = start + break_point

	stripped = chunk.strip()
	if stripped:
	chunks.append(stripped)

	start = end - self.chunk_overlap

	return [c for c in chunks if c]

	def process_document(self, file_path: str) -> Tuple[str, List[str]]:
	file_ext = Path(file_path).suffix.lower()
	if file_ext == ".pdf":
	text = self.extract_text_from_pdf(file_path)
	elif file_ext == ".txt":
	with open(file_path, "r", encoding="utf-8") as f:
	text = f.read()
	else:
	raise ValueError(f"Unsupported file type: {file_ext}")

	chunks = self.chunk_text(text)
	return text, chunks

	@staticmethod
	def compute_file_hash(file_path: str) -> str:
	hash_md5 = hashlib.md5()
	with open(file_path, "rb") as f:
	for chunk in iter(lambda: f.read(4096), b""):
	hash_md5.update(chunk)
	return hash_md5.hexdigest()