| """ |
| Text processing utilities for the efficient-context library. |
| """ |
|
|
| import re |
| from typing import List, Dict, Any |
| import logging |
|
|
| |
| logging.basicConfig(level=logging.INFO) |
| logger = logging.getLogger(__name__) |
|
|
def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences.

    Prefers NLTK's tokenizer when it is installed and functional;
    otherwise degrades to the regex-based fallback splitter.

    Args:
        text: Text to split

    Returns:
        sentences: List of sentences
    """
    # Newlines are treated as ordinary whitespace so that hard-wrapped
    # paragraphs do not break mid-sentence.
    flattened = text.replace('\n', ' ')

    try:
        import nltk
    except ImportError:
        logger.warning("NLTK not available, using fallback sentence splitter")
        return _simple_sentence_split(flattened)

    try:
        return nltk.sent_tokenize(flattened)
    except Exception as e:
        # e.g. missing 'punkt' data — fall back rather than crash.
        logger.warning(f"NLTK sentence tokenizer error: {e}. Using fallback.")
        return _simple_sentence_split(flattened)
|
|
| def _simple_sentence_split(text: str) -> List[str]: |
| """Fallback sentence splitter without dependencies.""" |
| |
| |
| for abbr in ['Mr.', 'Mrs.', 'Dr.', 'vs.', 'e.g.', 'i.e.', 'etc.']: |
| text = text.replace(abbr, abbr.replace('.', '<POINT>')) |
| |
| |
| sentences = re.split(r'(?<=[.!?])\s+', text) |
| |
| |
| sentences = [s.replace('<POINT>', '.') for s in sentences] |
| |
| |
| return [s for s in sentences if s.strip()] |
|
|
def get_sentence_importance(sentences: List[str]) -> List[float]:
    """
    Calculate importance scores for sentences based on heuristics.

    Each sentence is scored as a weighted blend of three signals:
    length (longer sentences up to ~20 words score higher), presence of
    emphasis keywords, and presence of any digit.

    Args:
        sentences: List of sentences to score

    Returns:
        importances: List of importance scores (0.0 to 1.0), one per sentence
    """
    # Hoisted out of the loop: frozenset gives O(1) membership tests.
    keywords = frozenset([
        'important', 'significant', 'key', 'critical', 'crucial',
        'essential', 'main', 'major', 'primary', 'central',
        'result', 'conclusion', 'finding', 'discovered', 'shows',
    ])
    # Punctuation stripped from word edges so that e.g. "important." or
    # "key," still matches a keyword.
    punctuation = '.,;:!?\'"()[]'

    importances = []

    for sentence in sentences:
        words = sentence.split()

        # Length heuristic: saturates at 20 words.
        length_score = min(len(words) / 20, 1.0)

        # Keyword heuristic: 0.2 per keyword occurrence, capped at 0.6.
        keyword_score = 0.0
        for word in words:
            if word.lower().strip(punctuation) in keywords:
                keyword_score += 0.2
        keyword_score = min(keyword_score, 0.6)

        # Numeric heuristic: any digit suggests a concrete fact/figure.
        number_score = 0.2 if re.search(r'\d', sentence) else 0.0

        # Weighted blend; clamped defensively to 1.0.
        score = 0.5 * length_score + 0.3 * keyword_score + 0.2 * number_score
        importances.append(min(score, 1.0))

    return importances
|
|
def calculate_text_overlap(text1: str, text2: str) -> float:
    """
    Calculate simple text overlap between two strings.

    Tokenizes both texts by lowercasing and whitespace-splitting, then
    returns the shared-token count normalized by the smaller vocabulary.

    Args:
        text1: First text
        text2: Second text

    Returns:
        overlap_ratio: Ratio of shared tokens (0.0 to 1.0)
    """
    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    # Either text empty -> no meaningful overlap.
    if not (vocab_a and vocab_b):
        return 0.0

    shared = vocab_a & vocab_b
    smaller_size = min(len(vocab_a), len(vocab_b))
    return len(shared) / smaller_size
|
|