| """ |
| Text processing utilities for the efficient-context library. |
| """ |
|
|
| import re |
| from typing import List, Dict, Any |
| import logging |
|
|
| |
| logging.basicConfig(level=logging.INFO) |
| logger = logging.getLogger(__name__) |
|
|
def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences.

    Prefers NLTK's tokenizer when it is installed and functional;
    otherwise degrades to the regex-based fallback splitter.

    Args:
        text: Text to split

    Returns:
        sentences: List of sentences
    """
    # Newlines are treated as ordinary whitespace so that hard-wrapped
    # paragraphs do not break mid-sentence.
    flattened = text.replace('\n', ' ')

    try:
        import nltk
    except ImportError:
        logger.warning("NLTK not available, using fallback sentence splitter")
        return _simple_sentence_split(flattened)

    try:
        return nltk.sent_tokenize(flattened)
    except Exception as e:
        # e.g. missing 'punkt' data — fall back rather than crash.
        logger.warning(f"NLTK sentence tokenizer error: {e}. Using fallback.")
        return _simple_sentence_split(flattened)
|
|
| def _simple_sentence_split(text: str) -> List[str]: |
| """Fallback sentence splitter without dependencies.""" |
| |
| |
| for abbr in ['Mr.', 'Mrs.', 'Dr.', 'vs.', 'e.g.', 'i.e.', 'etc.']: |
| text = text.replace(abbr, abbr.replace('.', '<POINT>')) |
| |
| |
| sentences = re.split(r'(?<=[.!?])\s+', text) |
| |
| |
| sentences = [s.replace('<POINT>', '.') for s in sentences] |
| |
| |
| return [s for s in sentences if s.strip()] |
|
|
def get_sentence_importance(sentences: List[str]) -> List[float]:
    """
    Calculate importance scores for sentences based on heuristics.

    Each sentence is scored as a weighted blend of three signals:
    length (longer sentences up to ~20 words score higher), presence of
    emphasis keywords, and presence of any digit.

    Args:
        sentences: List of sentences to score

    Returns:
        importances: List of importance scores (0.0 to 1.0), one per sentence
    """
    # Hoisted out of the loop: frozenset gives O(1) membership tests.
    keywords = frozenset([
        'important', 'significant', 'key', 'critical', 'crucial',
        'essential', 'main', 'major', 'primary', 'central',
        'result', 'conclusion', 'finding', 'discovered', 'shows',
    ])
    # Punctuation stripped from word edges so that e.g. "important." or
    # "key," still matches a keyword.
    punctuation = '.,;:!?\'"()[]'

    importances = []

    for sentence in sentences:
        words = sentence.split()

        # Length heuristic: saturates at 20 words.
        length_score = min(len(words) / 20, 1.0)

        # Keyword heuristic: 0.2 per keyword occurrence, capped at 0.6.
        keyword_score = 0.0
        for word in words:
            if word.lower().strip(punctuation) in keywords:
                keyword_score += 0.2
        keyword_score = min(keyword_score, 0.6)

        # Numeric heuristic: any digit suggests a concrete fact/figure.
        number_score = 0.2 if re.search(r'\d', sentence) else 0.0

        # Weighted blend; clamped defensively to 1.0.
        score = 0.5 * length_score + 0.3 * keyword_score + 0.2 * number_score
        importances.append(min(score, 1.0))

    return importances
|
|
def calculate_text_overlap(text1: str, text2: str) -> float:
    """
    Calculate simple text overlap between two strings.

    Tokenizes both texts by lowercasing and whitespace-splitting, then
    returns the shared-token count normalized by the smaller vocabulary.

    Args:
        text1: First text
        text2: Second text

    Returns:
        overlap_ratio: Ratio of shared tokens (0.0 to 1.0)
    """
    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    # Either text empty -> no meaningful overlap.
    if not (vocab_a and vocab_b):
        return 0.0

    shared = vocab_a & vocab_b
    smaller_size = min(len(vocab_a), len(vocab_b))
    return len(shared) / smaller_size
|
|