| """ |
| Unit Tests for Document Intelligence Subsystem |
| |
| Tests core components: |
| - BoundingBox operations |
| - Chunk models |
| - Schema and extraction |
| - Evidence building |
| - Semantic chunking |
| - Extraction validation |
| """ |
|
|
| import pytest |
| from pathlib import Path |
|
|
|
|
class TestBoundingBox:
    """Tests for the BoundingBox model.

    Covers construction, the derived geometry properties, conversion between
    normalized and pixel coordinates, IoU and point containment.
    """

    def test_create_bbox(self):
        """Constructor arguments are readable back as attributes."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=0.1,
            y_min=0.2,
            x_max=0.5,
            y_max=0.6,
            normalized=True,
        )

        # Exact equality on purpose: the test expects the constructor values
        # back verbatim, not the result of a computation.
        assert bbox.x_min == 0.1
        assert bbox.y_min == 0.2
        assert bbox.x_max == 0.5
        assert bbox.y_max == 0.6
        assert bbox.normalized is True

    def test_bbox_properties(self):
        """width, height, area, center and xyxy follow from the corners."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=10,
            y_min=20,
            x_max=50,
            y_max=80,
            normalized=False,
        )

        assert bbox.width == 40
        assert bbox.height == 60
        assert bbox.area == 2400
        assert bbox.center == (30, 50)
        assert bbox.xyxy == (10, 20, 50, 80)

    def test_bbox_to_pixel(self):
        """to_pixel(1000, 800) maps normalized corners to pixel corners."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=0.1,
            y_min=0.2,
            x_max=0.5,
            y_max=0.6,
            normalized=True,
        )

        pixel_bbox = bbox.to_pixel(1000, 800)

        # Compare with a tolerance: the expected values are products of
        # binary floats (e.g. 0.6 * 800) and need not be bit-exact.
        assert pixel_bbox.x_min == pytest.approx(100)
        assert pixel_bbox.y_min == pytest.approx(160)
        assert pixel_bbox.x_max == pytest.approx(500)
        assert pixel_bbox.y_max == pytest.approx(480)
        assert pixel_bbox.normalized is False

    def test_bbox_to_normalized(self):
        """to_normalized(1000, 800) maps pixel corners back into [0, 1]."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=100,
            y_min=160,
            x_max=500,
            y_max=480,
            normalized=False,
        )

        norm_bbox = bbox.to_normalized(1000, 800)

        # Same 0.001 absolute tolerance the test has always used.
        assert norm_bbox.x_min == pytest.approx(0.1, abs=1e-3)
        assert norm_bbox.y_min == pytest.approx(0.2, abs=1e-3)
        assert norm_bbox.x_max == pytest.approx(0.5, abs=1e-3)
        assert norm_bbox.y_max == pytest.approx(0.6, abs=1e-3)
        assert norm_bbox.normalized is True

    def test_bbox_iou(self):
        """Two partially overlapping boxes give an IoU between 0.1 and 0.2."""
        from src.document_intelligence.chunks import BoundingBox

        bbox1 = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        bbox2 = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)

        # Under the usual intersection-over-union definition:
        #   intersection = 50 * 50 = 2500
        #   union        = 10000 + 10000 - 2500 = 17500
        #   IoU          = 2500 / 17500 ~= 0.143
        iou = bbox1.iou(bbox2)
        assert 0.1 < iou < 0.2

    def test_bbox_contains(self):
        """contains() accepts interior and corner points, rejects outside."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)

        assert bbox.contains((50, 50)) is True
        # Both extreme corners are expected to count as inside.
        assert bbox.contains((0, 0)) is True
        assert bbox.contains((100, 100)) is True
        assert bbox.contains((150, 50)) is False
|
|
|
|
class TestDocumentChunk:
    """Exercise the DocumentChunk model."""

    def test_create_chunk(self):
        """A chunk built from explicit fields exposes those same fields."""
        from src.document_intelligence.chunks import (
            BoundingBox,
            ChunkType,
            DocumentChunk,
        )

        region = BoundingBox(
            x_min=0.1,
            y_min=0.2,
            x_max=0.9,
            y_max=0.3,
            normalized=True,
        )
        fields = {
            "chunk_id": "test_chunk_001",
            "doc_id": "doc_001",
            "chunk_type": ChunkType.PARAGRAPH,
            "text": "This is a test paragraph.",
            "page": 1,
            "bbox": region,
            "confidence": 0.95,
            "sequence_index": 0,
        }

        paragraph = DocumentChunk(**fields)

        # Only this subset of the fields is verified.
        for attr in ("chunk_id", "chunk_type", "text", "page", "confidence"):
            assert getattr(paragraph, attr) == fields[attr]

    def test_generate_chunk_id(self):
        """generate_chunk_id is repeatable and yields a 16-character id."""
        from src.document_intelligence.chunks import (
            BoundingBox,
            DocumentChunk,
        )

        region = BoundingBox(
            x_min=0.1,
            y_min=0.2,
            x_max=0.9,
            y_max=0.3,
            normalized=True,
        )
        id_args = {
            "doc_id": "doc_001",
            "page": 1,
            "bbox": region,
            "chunk_type_str": "paragraph",
        }

        # Identical inputs on both calls: the ids must match.
        first_id = DocumentChunk.generate_chunk_id(**id_args)
        second_id = DocumentChunk.generate_chunk_id(**id_args)

        assert first_id == second_id
        assert len(first_id) == 16
|
|
|
|
class TestTableChunk:
    """Tests for the TableChunk model.

    All tests use a 2x2 table; the shared construction boilerplate lives in
    the private helpers below (not collected by pytest because their names
    do not start with ``test``).
    """

    @staticmethod
    def _grid_cells(rows):
        """Return one TableCell per text in *rows*, in row-major order.

        *rows* is a list of rows, each a list of cell texts. Every cell gets
        the same placeholder bounding box (0, 0, 1, 1); the tests that use
        this helper do not inspect cell geometry.
        """
        from src.document_intelligence.chunks import BoundingBox, TableCell

        return [
            TableCell(
                row=row_idx,
                col=col_idx,
                text=text,
                bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1),
            )
            for row_idx, row_texts in enumerate(rows)
            for col_idx, text in enumerate(row_texts)
        ]

    @staticmethod
    def _make_table(cells, text="Table", num_rows=2, num_cols=2):
        """Build the TableChunk fixture shared by every test in this class."""
        from src.document_intelligence.chunks import BoundingBox, TableChunk

        return TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text=text,
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8),
            confidence=0.9,
            sequence_index=0,
            cells=cells,
            num_rows=num_rows,
            num_cols=num_cols,
        )

    def test_create_table_chunk(self):
        """A table keeps its dimensions and all supplied cells."""
        from src.document_intelligence.chunks import (
            TableCell,
            BoundingBox,
        )

        # Header row plus one value row, each cell with its own region.
        cells = [
            TableCell(row=0, col=0, text="Header 1", is_header=True,
                      bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.5, y_max=0.3)),
            TableCell(row=0, col=1, text="Header 2", is_header=True,
                      bbox=BoundingBox(x_min=0.5, y_min=0.2, x_max=0.9, y_max=0.3)),
            TableCell(row=1, col=0, text="Value 1",
                      bbox=BoundingBox(x_min=0.1, y_min=0.3, x_max=0.5, y_max=0.4)),
            TableCell(row=1, col=1, text="Value 2",
                      bbox=BoundingBox(x_min=0.5, y_min=0.3, x_max=0.9, y_max=0.4)),
        ]

        table = self._make_table(cells, text="Table content")

        assert table.num_rows == 2
        assert table.num_cols == 2
        assert len(table.cells) == 4

    def test_table_get_cell(self):
        """get_cell(row, col) returns the cell stored at that position."""
        table = self._make_table(self._grid_cells([["A", "B"], ["C", "D"]]))

        assert table.get_cell(0, 0).text == "A"
        assert table.get_cell(0, 1).text == "B"
        assert table.get_cell(1, 0).text == "C"
        assert table.get_cell(1, 1).text == "D"

    def test_table_to_markdown(self):
        """to_markdown() emits a header row, a separator row and data rows."""
        table = self._make_table(
            self._grid_cells([["Name", "Value"], ["A", "100"]])
        )

        md = table.to_markdown()
        assert "| Name | Value |" in md
        assert "| --- | --- |" in md
        assert "| A | 100 |" in md
|
|
|
|
class TestExtractionSchema:
    """Tests for ExtractionSchema.

    Covers building a schema field by field, converting to and from a JSON
    Schema dictionary, and the preset schema factories.
    """

    def test_create_schema(self):
        """Fields added through the add_*_field helpers are retrievable."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            FieldType,
        )

        schema = ExtractionSchema(name="TestSchema")
        schema.add_string_field("name", "Person name", required=True)
        schema.add_number_field("age", "Person age", required=False, is_integer=True)
        schema.add_date_field("birth_date", "Date of birth")

        assert schema.name == "TestSchema"
        assert len(schema.fields) == 3
        assert schema.get_field("name").required is True
        # is_integer=True is expected to yield an INTEGER-typed field.
        assert schema.get_field("age").field_type == FieldType.INTEGER

    def test_schema_to_json_schema(self):
        """to_json_schema() yields an object schema listing required fields."""
        from src.document_intelligence.extraction import ExtractionSchema

        schema = ExtractionSchema(name="Invoice")
        schema.add_string_field("invoice_number", required=True)
        schema.add_currency_field("total_amount", required=True)

        json_schema = schema.to_json_schema()

        assert json_schema["type"] == "object"
        assert "invoice_number" in json_schema["properties"]
        assert "total_amount" in json_schema["properties"]
        assert "invoice_number" in json_schema["required"]

    def test_schema_from_json_schema(self):
        """from_json_schema() maps properties to fields and honours 'required'."""
        from src.document_intelligence.extraction import ExtractionSchema

        json_schema = {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "Name"},
                "value": {"type": "number", "minimum": 0},
            },
            "required": ["name"],
        }

        schema = ExtractionSchema.from_json_schema(json_schema, name="Test")

        assert len(schema.fields) == 2
        assert schema.get_field("name").required is True
        # "value" is absent from the "required" list above.
        assert schema.get_field("value").required is False

    def test_preset_schemas(self):
        """Each preset factory returns a schema containing its key fields."""
        from src.document_intelligence.extraction import (
            create_invoice_schema,
            create_receipt_schema,
            create_contract_schema,
        )

        invoice = create_invoice_schema()
        assert invoice.get_field("invoice_number") is not None
        assert invoice.get_field("total_amount") is not None

        receipt = create_receipt_schema()
        assert receipt.get_field("merchant_name") is not None

        contract = create_contract_schema()
        assert contract.get_field("effective_date") is not None
|
|
|
|
class TestEvidenceBuilder:
    """Exercise EvidenceBuilder against a single paragraph chunk."""

    def test_create_evidence(self):
        """Evidence points back at its source chunk and quotes the value."""
        from src.document_intelligence.chunks import (
            BoundingBox,
            ChunkType,
            DocumentChunk,
        )
        from src.document_intelligence.grounding import EvidenceBuilder

        region = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3)
        source_chunk = DocumentChunk(
            chunk_id="chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="The total amount is $500.00.",
            page=1,
            bbox=region,
            confidence=0.9,
            sequence_index=0,
        )

        result = EvidenceBuilder().create_evidence(
            chunk=source_chunk,
            value="$500.00",
            field_name="total_amount",
        )

        assert result.chunk_id == "chunk_001"
        assert result.page == 1
        # Either the full formatted amount or just its digits may appear.
        accepted = ("$500.00", "500")
        assert any(token in result.snippet for token in accepted)
|
|
|
|
class TestSemanticChunker:
    """Exercise SemanticChunker.chunk_text on short and long inputs."""

    def test_chunk_text(self):
        """Markdown-like text yields chunks that respect the minimum size."""
        from src.document_intelligence.parsing import ChunkingConfig, SemanticChunker

        settings = ChunkingConfig(
            min_chunk_chars=10,
            max_chunk_chars=100,
            target_chunk_chars=50,
        )
        splitter = SemanticChunker(settings)

        # Two headed sections; the first holds two paragraphs.
        sample = (
            "# Heading 1\n"
            "\n"
            "This is the first paragraph with some text content.\n"
            "\n"
            "This is the second paragraph with more content.\n"
            "\n"
            "# Heading 2\n"
            "\n"
            "Another section with different content.\n"
        )

        pieces = splitter.chunk_text(sample)

        assert len(pieces) > 0
        shortest_allowed = settings.min_chunk_chars
        for piece in pieces:
            assert "text" in piece
            assert len(piece["text"]) >= shortest_allowed

    def test_chunk_long_text(self):
        """Long text is split, with every chunk close to the size cap."""
        from src.document_intelligence.parsing import ChunkingConfig, SemanticChunker

        settings = ChunkingConfig(
            min_chunk_chars=10,
            max_chunk_chars=200,
            target_chunk_chars=100,
        )
        splitter = SemanticChunker(settings)

        # Fifty short sentences, far more than one chunk can hold.
        sample = " ".join(f"This is sentence number {i}." for i in range(50))

        pieces = splitter.chunk_text(sample)

        assert len(pieces) > 1
        # Chunks may overshoot the configured maximum by up to 10%.
        longest_allowed = settings.max_chunk_chars * 1.1
        for piece in pieces:
            assert len(piece["text"]) <= longest_allowed
|
|
|
|
class TestValidation:
    """Tests for extraction validation via ExtractionValidator."""

    def test_validate_extraction(self):
        """A result supplying every required field validates cleanly."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_number_field("value", required=False, is_integer=True)

        result = ExtractionResult(
            data={"name": "Test Name", "value": 42},
            fields=[],
            evidence=[],
            overall_confidence=0.8,
            abstained_fields=[],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is True
        assert validation.error_count == 0

    def test_validate_missing_required(self):
        """A result lacking a required field is reported as invalid."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_string_field("description", required=True)

        # "description" is required but absent from data and listed as
        # abstained.
        result = ExtractionResult(
            data={"name": "Test"},
            fields=[],
            evidence=[],
            overall_confidence=0.5,
            abstained_fields=["description"],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is False
        assert validation.error_count >= 1
|
|
|
|
if __name__ == "__main__":
    # Allow running this file directly. Propagate pytest's exit code so a
    # failing run produces a non-zero process status instead of always 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
|