"""Streamlit front‑end entry‑point."""
import json
import logging
from datetime import datetime

import fitz  # PyMuPDF local import to avoid heavy load on startup
import pandas as pd
import streamlit as st
import yaml
from dotenv import load_dotenv

from config.config_manager import config_manager
from config.settings import settings
from orchestrator.executor import Executor
from orchestrator.planner import Planner
from services.cost_tracker import CostTracker
# Custom stream handler that keeps every formatted record in memory so the
# UI can offer the full log of a run as a download.
class LogCaptureHandler(logging.StreamHandler):
    """Logging handler that buffers formatted log records in memory."""

    def __init__(self):
        super().__init__()
        self.logs = []

    def emit(self, record):
        """Format *record* and append the resulting line to the buffer."""
        try:
            formatted = self.format(record)
        except Exception:
            # Delegate to the standard logging error machinery on failure.
            self.handleError(record)
        else:
            self.logs.append(formatted)

    def get_logs(self):
        """Return all captured lines joined by newlines."""
        return "\n".join(self.logs)

    def clear(self):
        """Forget everything captured so far."""
        self.logs = []
# ---------------------------------------------------------------------------
# Module-level setup: session state defaults, log capture, page config, nav.
# ---------------------------------------------------------------------------

# Initialize session state for storing execution history
if 'execution_history' not in st.session_state:
    st.session_state.execution_history = []
# Initialize session state for field descriptions tables
if 'field_descriptions_table' not in st.session_state:
    st.session_state.field_descriptions_table = []
# Initialize session state for unique indices descriptions table
if 'unique_indices_descriptions_table' not in st.session_state:
    st.session_state.unique_indices_descriptions_table = []
# Initialize session state for fields string (default example field names)
if 'fields_str' not in st.session_state:
    st.session_state.fields_str = "Chain, Percentage, Seq Loc"

# Set up logging capture so each run's logs can be stored with its trace.
log_capture = LogCaptureHandler()
log_capture.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))

# Configure root logger
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
root_logger.addHandler(log_capture)

# Configure specific loggers.
# NOTE(review): these child loggers propagate to the root logger, which also
# has `log_capture` attached, so their records are likely captured twice —
# confirm whether the duplication is intended.
for logger_name in ['orchestrator', 'agents', 'services']:
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    logger.addHandler(log_capture)

load_dotenv()
st.set_page_config(page_title="PDF Field Extractor", layout="wide")

# Sidebar navigation: selects which of the three pages below is rendered.
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", ["Documentation", "Traces", "Execution"])
# Documentation Page: static markdown describing the system.
if page == "Documentation":
    st.title("Deep‑Research PDF Field Extractor")
    # Markdown content is kept at column 0 inside the literal so headings and
    # lists are not rendered as a markdown code block.
    st.markdown("""
## Overview
This system uses a multi-agent architecture to extract fields from PDFs with high accuracy and reliability.
### Core Components
1. **Planner**
   - Generates execution plans using Azure OpenAI
   - Determines optimal extraction strategy
   - Manages task dependencies
2. **Executor**
   - Executes the generated plan
   - Manages agent execution flow
   - Handles context and result management
3. **Agents**
   - `TableAgent`: Extracts text and tables using Azure Document Intelligence
   - `FieldMapper`: Maps fields to values using extracted content
   - `ForEachField`: Controls field iteration flow
### Processing Pipeline
1. **Document Processing**
   - Text and table extraction using Azure Document Intelligence
   - Layout and structure preservation
   - Support for complex document formats
2. **Field Extraction**
   - Document type inference
   - User profile determination
   - Page-by-page scanning
   - Value extraction and validation
3. **Context Building**
   - Document metadata
   - Field descriptions
   - User context
   - Execution history
### Key Features
#### Smart Field Extraction
- Two-step extraction strategy:
  1. Page-by-page scanning for precise extraction
  2. Semantic search fallback if no value found
- Basic context awareness for improved extraction
- Support for tabular data extraction
#### Document Intelligence
- Azure Document Intelligence integration
- Layout and structure preservation
- Table extraction and formatting
- Complex document handling
#### Execution Monitoring
- Detailed execution traces
- Success/failure status
- Comprehensive logging
- Result storage and retrieval
### Technical Requirements
- Azure OpenAI API key
- Azure Document Intelligence endpoint
- Python 3.9 or higher
- Required Python packages (see requirements.txt)
### Getting Started
1. **Upload Your PDF**
   - Click the "Upload PDF" button
   - Select your PDF file
2. **Specify Fields**
   - Enter comma-separated field names
   - Example: `Date, Name, Value, Location`
3. **Optional: Add Field Descriptions**
   - Provide YAML-formatted field descriptions
   - Helps improve extraction accuracy
4. **Run Extraction**
   - Click "Run extraction"
   - Monitor progress in execution trace
   - View results in table format
5. **Download Results**
   - Export as CSV
   - View detailed execution logs
### Support
For detailed technical documentation, please refer to:
- [Architecture Overview](ARCHITECTURE.md)
- [Developer Documentation](DEVELOPER.md)
""")
# Traces Page: lists previous runs with per-run log and result downloads.
elif page == "Traces":
    st.title("Execution Traces")
    if not st.session_state.execution_history:
        st.info("No execution traces available yet. Run an extraction to see traces here.")
    else:
        # Create a DataFrame from the execution history (one row per run).
        history_data = []
        for record in st.session_state.execution_history:
            history_data.append({
                "filename": record["filename"],
                "datetime": record["datetime"],
                "fields": ", ".join(record.get("fields", [])),
                # logs is the joined string produced by LogCaptureHandler.get_logs()
                "logs": record.get("logs", []),
                # results is a DataFrame.to_dict() snapshot, or None if empty
                "results": record.get("results", None)
            })
        history_df = pd.DataFrame(history_data)
        # Display column headers
        col1, col2, col3, col4, col5 = st.columns([2, 2, 3, 1, 1])
        with col1:
            st.markdown("**Filename**")
        with col2:
            st.markdown("**Timestamp**")
        with col3:
            st.markdown("**Fields**")
        with col4:
            st.markdown("**Logs**")
        with col5:
            st.markdown("**Results**")
        st.markdown("---")  # Add a separator line
        # Display the table with download buttons; idx keeps widget keys unique.
        for idx, row in history_df.iterrows():
            col1, col2, col3, col4, col5 = st.columns([2, 2, 3, 1, 1])
            with col1:
                st.write(row["filename"])
            with col2:
                st.write(row["datetime"])
            with col3:
                st.write(row["fields"])
            with col4:
                if row["logs"]:  # Check if we have any logs
                    st.download_button(
                        "Download Logs",
                        row["logs"],  # Use the stored logs
                        file_name=f"logs_{row['filename']}_{row['datetime']}.txt",
                        key=f"logs_dl_{idx}"
                    )
                else:
                    st.write("No Logs")
            with col5:
                if row["results"] is not None:
                    # Rebuild a DataFrame from the stored to_dict() snapshot.
                    results_df = pd.DataFrame(row["results"])
                    st.download_button(
                        "Download Results",
                        results_df.to_csv(index=False),
                        file_name=f"results_{row['filename']}_{row['datetime']}.csv",
                        key=f"results_dl_{idx}"
                    )
                else:
                    st.write("No Results")
            st.markdown("---")  # Add a separator line between rows
# Execution Page: the main workflow (upload → configure → run extraction).
else:  # page == "Execution"
    st.title("Deep‑Research PDF Field Extractor (POC)")
| def flatten_json_response(json_data, fields): | |
| """Flatten the nested JSON response into a tabular structure with dynamic columns.""" | |
| logger = logging.getLogger(__name__) | |
| logger.info("Starting flatten_json_response") | |
| logger.info(f"Input fields: {fields}") | |
| # Handle the case where the response is a string | |
| if isinstance(json_data, str): | |
| logger.info("Input is a string, attempting to parse as JSON") | |
| try: | |
| json_data = json.loads(json_data) | |
| logger.info("Successfully parsed JSON string") | |
| except json.JSONDecodeError as e: | |
| logger.error(f"Failed to parse JSON string: {e}") | |
| return pd.DataFrame(columns=fields) | |
| # If the data is wrapped in an array, get the first item | |
| if isinstance(json_data, list) and len(json_data) > 0: | |
| logger.info("Data is wrapped in an array, extracting first item") | |
| json_data = json_data[0] | |
| # If the data is a dictionary with numeric keys, get the first value | |
| if isinstance(json_data, dict): | |
| keys = list(json_data.keys()) | |
| logger.info(f"Checking dictionary keys: {keys}") | |
| # Check if all keys are integers or string representations of integers | |
| if all(isinstance(k, int) or (isinstance(k, str) and k.isdigit()) for k in keys): | |
| logger.info("Data has numeric keys, extracting first value") | |
| first_key = sorted(keys, key=lambda x: int(x) if isinstance(x, str) else x)[0] | |
| json_data = json_data[first_key] | |
| logger.info(f"Extracted data from key '{first_key}'") | |
| logger.info(f"JSON data keys: {list(json_data.keys()) if isinstance(json_data, dict) else 'Not a dict'}") | |
| # Create a list to store rows | |
| rows = [] | |
| # Get the length of the first array to determine number of rows | |
| if isinstance(json_data, dict) and len(json_data) > 0: | |
| first_field = list(json_data.keys())[0] | |
| num_rows = len(json_data[first_field]) if isinstance(json_data[first_field], list) else 1 | |
| logger.info(f"Number of rows to process: {num_rows}") | |
| # Create a row for each index | |
| for i in range(num_rows): | |
| logger.debug(f"Processing row {i}") | |
| row = {} | |
| for field in fields: | |
| if field in json_data and isinstance(json_data[field], list) and i < len(json_data[field]): | |
| row[field] = json_data[field][i] | |
| logger.debug(f"Field '{field}' value at index {i}: {json_data[field][i]}") | |
| else: | |
| row[field] = None | |
| logger.debug(f"Field '{field}' not found or index {i} out of bounds") | |
| rows.append(row) | |
| else: | |
| logger.error(f"Unexpected data structure: {type(json_data)}") | |
| return pd.DataFrame(columns=fields) | |
| # Create DataFrame with all requested fields as columns | |
| df = pd.DataFrame(rows) | |
| logger.info(f"Created DataFrame with shape: {df.shape}") | |
| logger.info(f"DataFrame columns: {df.columns.tolist()}") | |
| # Ensure columns are in the same order as the fields list | |
| df = df[fields] | |
| logger.info(f"Final DataFrame columns after reordering: {df.columns.tolist()}") | |
| return df | |
    # ========================================================================
    # SECTION 1: FILE UPLOAD
    # ========================================================================
    st.header("📄 Step 1: Upload Document")
    pdf_file = st.file_uploader("Upload PDF", type=["pdf"], help="Select a PDF file to process")
    if pdf_file:
        st.success(f"✅ File uploaded: {pdf_file.name}")

    # ========================================================================
    # SECTION 2: STRATEGY SELECTION
    # ========================================================================
    st.header("🎯 Step 2: Select Extraction Strategy")
    strategy = st.radio(
        "Choose your extraction approach:",
        ["Original Strategy", "Unique Indices Strategy"],
        help="**Original Strategy**: Process document page by page, extracting each field individually. **Unique Indices Strategy**: Process entire document at once using unique combinations of indices.",
        horizontal=True
    )
    if strategy == "Original Strategy":
        st.info("📋 **Original Strategy**: Will extract fields one by one from the document pages.")
    else:
        st.info("🔍 **Unique Indices Strategy**: Will find unique combinations and extract additional fields for each.")

    # ========================================================================
    # SECTION 3: CONFIGURATION (Only for Unique Indices Strategy)
    # ========================================================================
    if strategy == "Unique Indices Strategy":
        st.header("⚙️ Step 3: Configuration")
        # File Type Selection
        col1, col2 = st.columns([3, 1])
        with col1:
            # Get available configurations
            config_names = config_manager.get_config_names()
            selected_config_name = st.selectbox(
                "Select File Type Configuration:",
                config_names,
                format_func=lambda x: config_manager.get_config(x)['name'] if config_manager.get_config(x) else x,
                help="Choose a predefined configuration or create a new one"
            )
        with col2:
            if st.button("🔄 Load Config", help="Load the selected configuration"):
                config = config_manager.get_config(selected_config_name)
                if config:
                    # Update fields
                    st.session_state.fields_str = config.get('fields', '')
                    # Update field descriptions table
                    field_descs = config.get('field_descriptions', {})
                    st.session_state.field_descriptions_table = []
                    for field_name, field_info in field_descs.items():
                        st.session_state.field_descriptions_table.append({
                            'field_name': field_name,
                            'field_description': field_info.get('description', ''),
                            'format': field_info.get('format', ''),
                            'examples': field_info.get('examples', ''),
                            'possible_values': field_info.get('possible_values', '')
                        })
                    # Update unique indices descriptions table
                    unique_descs = config.get('unique_indices_descriptions', {})
                    st.session_state.unique_indices_descriptions_table = []
                    for field_name, field_info in unique_descs.items():
                        st.session_state.unique_indices_descriptions_table.append({
                            'field_name': field_name,
                            'field_description': field_info.get('description', ''),
                            'format': field_info.get('format', ''),
                            'examples': field_info.get('examples', ''),
                            'possible_values': field_info.get('possible_values', '')
                        })
                    st.session_state.last_selected_config = selected_config_name
                    st.success(f"✅ Configuration '{config['name']}' loaded successfully!")
                    # Rerun so the tables below render the freshly loaded rows.
                    st.rerun()
                else:
                    st.error("❌ Failed to load configuration")
        # Clear Configuration Button
        if st.button("🗑️ Clear All Configuration", help="Clear all configuration and start fresh"):
            st.session_state.field_descriptions_table = []
            st.session_state.unique_indices_descriptions_table = []
            st.session_state.fields_str = ""
            st.session_state.last_selected_config = ""
            st.success("✅ Configuration cleared!")
            st.rerun()

        # ====================================================================
        # SECTION 4: FIELD DESCRIPTIONS
        # ====================================================================
        st.subheader("📝 Field Descriptions")
        st.markdown("""
<div style="background-color: #e8f4fd; padding: 1rem; border-radius: 0.5rem; border-left: 4px solid #1f77b4; color: #333;">
<strong>Field Descriptions</strong><br>
Add descriptions for the fields you want to extract. These help the system understand what to look for.
</div>
""", unsafe_allow_html=True)
        # Create the table interface (header row)
        col1, col2, col3, col4, col5, col6 = st.columns([2, 3, 2, 2, 2, 1])
        with col1:
            st.markdown("**Field Name**")
        with col2:
            st.markdown("**Field Description**")
        with col3:
            st.markdown("**Format**")
        with col4:
            st.markdown("**Examples**")
        with col5:
            st.markdown("**Possible Values**")
        with col6:
            st.markdown("**Actions**")
        # Display existing rows; the index i keeps widget keys stable per row.
        for i, row in enumerate(st.session_state.field_descriptions_table):
            col1, col2, col3, col4, col5, col6 = st.columns([2, 3, 2, 2, 2, 1])
            with col1:
                field_name = st.text_input("", value=row.get('field_name', ''), key=f"field_name_{i}")
            with col2:
                field_desc = st.text_input("", value=row.get('field_description', ''), key=f"field_desc_{i}")
            with col3:
                field_format = st.text_input("", value=row.get('format', ''), key=f"field_format_{i}")
            with col4:
                field_examples = st.text_input("", value=row.get('examples', ''), key=f"field_examples_{i}")
            with col5:
                field_possible_values = st.text_input("", value=row.get('possible_values', ''), key=f"field_possible_values_{i}")
            with col6:
                if st.button("🗑️", key=f"delete_{i}", help="Delete this row"):
                    # Remove the row and rerun so indices/keys are rebuilt.
                    st.session_state.field_descriptions_table.pop(i)
                    st.rerun()
            # Update the row in session state with the current widget values.
            st.session_state.field_descriptions_table[i] = {
                'field_name': field_name,
                'field_description': field_desc,
                'format': field_format,
                'examples': field_examples,
                'possible_values': field_possible_values
            }
        # Add new row button
        if st.button("➕ Add Field Description Row"):
            st.session_state.field_descriptions_table.append({
                'field_name': '',
                'field_description': '',
                'format': '',
                'examples': '',
                'possible_values': ''
            })
            st.rerun()

        # ====================================================================
        # SECTION 5: UNIQUE FIELD DESCRIPTIONS
        # ====================================================================
        st.subheader("🔑 Unique Field Descriptions")
        st.markdown("""
<div style="background-color: #fff8e1; padding: 1rem; border-radius: 0.5rem; border-left: 4px solid #ffc107; color: #333;">
<strong>Unique Field Descriptions</strong><br>
Add descriptions for the unique fields that will be used to identify different combinations in the document.
</div>
""", unsafe_allow_html=True)
        # Create the table interface for unique indices (header row)
        col1, col2, col3, col4, col5, col6 = st.columns([2, 3, 2, 2, 2, 1])
        with col1:
            st.markdown("**Field Name**")
        with col2:
            st.markdown("**Field Description**")
        with col3:
            st.markdown("**Format**")
        with col4:
            st.markdown("**Examples**")
        with col5:
            st.markdown("**Possible Values**")
        with col6:
            st.markdown("**Actions**")
        # Display existing rows for unique indices
        for i, row in enumerate(st.session_state.unique_indices_descriptions_table):
            col1, col2, col3, col4, col5, col6 = st.columns([2, 3, 2, 2, 2, 1])
            with col1:
                idx_field_name = st.text_input("", value=row.get('field_name', ''), key=f"unique_field_name_{i}")
            with col2:
                idx_field_desc = st.text_input("", value=row.get('field_description', ''), key=f"unique_field_desc_{i}")
            with col3:
                idx_field_format = st.text_input("", value=row.get('format', ''), key=f"unique_field_format_{i}")
            with col4:
                idx_field_examples = st.text_input("", value=row.get('examples', ''), key=f"unique_field_examples_{i}")
            with col5:
                idx_field_possible_values = st.text_input("", value=row.get('possible_values', ''), key=f"unique_field_possible_values_{i}")
            with col6:
                if st.button("🗑️", key=f"unique_delete_{i}", help="Delete this row"):
                    st.session_state.unique_indices_descriptions_table.pop(i)
                    st.rerun()
            # Update the row in session state with the current widget values.
            st.session_state.unique_indices_descriptions_table[i] = {
                'field_name': idx_field_name,
                'field_description': idx_field_desc,
                'format': idx_field_format,
                'examples': idx_field_examples,
                'possible_values': idx_field_possible_values
            }
        # Add new row button for unique indices
        if st.button("➕ Add Unique Field Description Row"):
            st.session_state.unique_indices_descriptions_table.append({
                'field_name': '',
                'field_description': '',
                'format': '',
                'examples': '',
                'possible_values': ''
            })
            st.rerun()

        # ====================================================================
        # SECTION 6: SAVE CONFIGURATION
        # ====================================================================
        st.subheader("💾 Save Configuration")
        st.markdown("""
<div style="background-color: #e1f5fe; padding: 1rem; border-radius: 0.5rem; border-left: 4px solid #17a2b8; color: #333;">
<strong>Save Current Configuration</strong><br>
Save your current configuration as a new file type for future use.
</div>
""", unsafe_allow_html=True)
        col1, col2 = st.columns([3, 1])
        with col1:
            save_config_name = st.text_input(
                "Configuration Name:",
                placeholder="Enter a name for this configuration (e.g., 'Biotech Report', 'Clinical Data')",
                help="Choose a descriptive name that will appear in the dropdown"
            )
        with col2:
            if st.button("💾 Save Config", help="Save the current configuration"):
                if save_config_name:
                    # Prepare configuration data
                    field_descs = {}
                    for row in st.session_state.field_descriptions_table:
                        if row['field_name']:  # Only include rows with field names
                            field_descs[row['field_name']] = {
                                'description': row['field_description'],
                                'format': row['format'],
                                'examples': row['examples'],
                                'possible_values': row['possible_values']
                            }
                    # Get unique indices descriptions
                    unique_indices_descs = {}
                    for row in st.session_state.unique_indices_descriptions_table:
                        if row['field_name']:  # Only include rows with field names
                            unique_indices_descs[row['field_name']] = {
                                'description': row['field_description'],
                                'format': row['format'],
                                'examples': row['examples'],
                                'possible_values': row['possible_values']
                            }
                    # Get fields from unique indices
                    fields_str = ", ".join([row['field_name'] for row in st.session_state.unique_indices_descriptions_table if row['field_name']])
                    config_data = {
                        'name': save_config_name,
                        'description': f"Configuration for {save_config_name}",
                        'fields': fields_str,
                        'field_descriptions': field_descs,
                        'unique_indices_descriptions': unique_indices_descs
                    }
                    if config_manager.save_config(save_config_name, config_data):
                        st.success(f"✅ Configuration '{save_config_name}' saved successfully!")
                        config_manager.reload_configs()
                        st.rerun()
                    else:
                        st.error("❌ Failed to save configuration")
                else:
                    st.error("❌ Please enter a configuration name")

    # ========================================================================
    # SECTION 7: ORIGINAL STRATEGY CONFIGURATION
    # ========================================================================
    else:  # Original Strategy
        st.header("⚙️ Step 3: Field Configuration")
        fields_str = st.text_input(
            "Fields to Extract (comma-separated):",
            value=st.session_state.fields_str,
            key="fields_input",
            help="Enter the field names you want to extract, separated by commas"
        )
        st.session_state.fields_str = fields_str
    # ========================================================================
    # SECTION 8: EXECUTION
    # ========================================================================
    st.header("🚀 Step 4: Run Extraction")

    # Convert the field-descriptions table to a dict for processing.
    field_descs = {}
    if st.session_state.field_descriptions_table:
        for row in st.session_state.field_descriptions_table:
            if row['field_name']:  # Only include rows with field names
                field_descs[row['field_name']] = {
                    'description': row['field_description'],
                    'format': row['format'],
                    'examples': row['examples'],
                    'possible_values': row['possible_values']
                }

    # Prepare unique indices for Unique Indices Strategy
    unique_indices = None
    unique_indices_descriptions = None
    if strategy == "Unique Indices Strategy":
        # Convert unique indices table to a dict and collect the field names.
        unique_indices_descriptions = {}
        unique_indices = []
        if st.session_state.unique_indices_descriptions_table:
            for row in st.session_state.unique_indices_descriptions_table:
                if row['field_name']:  # Only include rows with field names
                    unique_indices.append(row['field_name'])
                    unique_indices_descriptions[row['field_name']] = {
                        'description': row['field_description'],
                        'format': row['format'],
                        'examples': row['examples'],
                        'possible_values': row['possible_values']
                    }

    # Status indicator summarising what a run would do.
    if pdf_file:
        if strategy == "Original Strategy":
            field_count = len([f.strip() for f in st.session_state.fields_str.split(",") if f.strip()])
            st.info(f"📊 Ready to extract {field_count} fields using Original Strategy")
        else:
            unique_count = len(unique_indices) if unique_indices else 0
            field_count = len(field_descs)
            st.info(f"📊 Ready to extract {field_count} additional fields for {unique_count} unique combinations using Unique Indices Strategy")

    # Run button (disabled until a PDF is uploaded)
    if st.button("🚀 Run Extraction", type="primary", disabled=not pdf_file):
        if not pdf_file:
            st.error("❌ Please upload a PDF file first")
        else:
            # Prepare field list based on strategy
            if strategy == "Original Strategy":
                field_list = [f.strip() for f in st.session_state.fields_str.split(",") if f.strip()]
            else:  # Unique Indices Strategy
                # For Unique Indices Strategy, get additional fields from the field descriptions table
                field_list = []
                if st.session_state.field_descriptions_table:
                    for row in st.session_state.field_descriptions_table:
                        if row['field_name']:  # Only include rows with field names
                            field_list.append(row['field_name'])
            try:
                with st.spinner("Planning …"):
                    # quick text preview to give the LLM document context
                    doc = fitz.open(stream=pdf_file.getvalue(), filetype="pdf")  # type: ignore[arg-type]
                    preview = "\n".join(page.get_text() for page in doc[:10])[:20000]  # first 10 pages, capped at 20k chars
                    # Create a cost tracker for this run
                    cost_tracker = CostTracker()
                    planner = Planner(cost_tracker=cost_tracker)
                    plan = planner.build_plan(
                        pdf_meta={"filename": pdf_file.name},
                        doc_preview=preview,
                        fields=field_list,
                        field_descs=field_descs,
                        strategy=strategy,
                        unique_indices=unique_indices,
                        unique_indices_descriptions=unique_indices_descriptions
                    )
                # Add a visual separator
                st.markdown("---")
                with st.spinner("Executing …"):
                    executor = Executor(settings=settings, cost_tracker=cost_tracker)
                    results, logs = executor.run(plan, pdf_file)
                # Get detailed costs
                costs = executor.cost_tracker.calculate_current_file_costs()
                model_cost = costs["openai"]["total_cost"]
                di_cost = costs["document_intelligence"]["total_cost"]
                # Add debug logging for cost tracking.
                # NOTE(review): `logger` here is the module-level variable left
                # over from the logger-configuration loop at the top of the
                # file (last value: the 'services' logger) — confirm this is
                # intentional and not a missing logging.getLogger(__name__).
                logger.info(f"Cost tracker debug info:")
                logger.info(f"  LLM input tokens: {executor.cost_tracker.llm_input_tokens}")
                logger.info(f"  LLM output tokens: {executor.cost_tracker.llm_output_tokens}")
                logger.info(f"  DI pages: {executor.cost_tracker.di_pages}")
                logger.info(f"  LLM calls count: {len(executor.cost_tracker.llm_calls)}")
                logger.info(f"  Current file costs: {executor.cost_tracker.current_file_costs}")
                logger.info(f"  Calculated costs: {costs}")
                # Display detailed costs table
                st.subheader("Detailed Costs")
                costs_df = executor.cost_tracker.get_detailed_costs_table()
                st.dataframe(costs_df, use_container_width=True)
                st.info(
                    f"LLM input tokens: {executor.cost_tracker.llm_input_tokens}, "
                    f"LLM output tokens: {executor.cost_tracker.llm_output_tokens}, "
                    f"DI pages: {executor.cost_tracker.di_pages}, "
                    f"Model cost: ${model_cost:.4f}, "
                    f"DI cost: ${di_cost:.4f}, "
                    f"Total cost: ${model_cost + di_cost:.4f}"
                )
                # Add detailed logging about what executor returned
                logger.info(f"Executor returned results of type: {type(results)}")
                logger.info(f"Results content: {results}")
                # Check if results is already a DataFrame
                if isinstance(results, pd.DataFrame):
                    logger.info(f"Results is already a DataFrame with shape: {results.shape}")
                    logger.info(f"DataFrame columns: {results.columns.tolist()}")
                    logger.info(f"DataFrame head: {results.head()}")
                    df = results
                else:
                    logger.info("Results is not a DataFrame, calling flatten_json_response")
                    # Process results using flatten_json_response
                    df = flatten_json_response(results, field_list)
                # Log final DataFrame info
                logger.info(f"Final DataFrame shape: {df.shape}")
                logger.info(f"Final DataFrame columns: {df.columns.tolist()}")
                if not df.empty:
                    logger.info(f"Final DataFrame sample: {df.head()}")
                # Store execution in history (consumed by the Traces page)
                execution_record = {
                    "filename": pdf_file.name,
                    "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "fields": field_list,
                    "logs": log_capture.get_logs(),  # Store the actual logs
                    "results": df.to_dict() if not df.empty else None
                }
                st.session_state.execution_history.append(execution_record)
                log_capture.clear()  # Clear logs after storing them
                # ----------------- UI: show execution tree -----------------
                st.subheader("Execution trace")
                for log in logs:
                    indent = " " * 4 * log["depth"]
                    # Add error indicator if there was an error
                    error_indicator = "❌ " if log.get("error") else "✓ "
                    # Use a fixed preview text instead of the result
                    with st.expander(f"{indent}{error_indicator}{log['tool']} – Click to view result"):
                        st.markdown(f"**Args**: `{log['args']}`", unsafe_allow_html=True)
                        if log.get("error"):
                            st.error(f"Error: {log['error']}")
                        # Special handling for IndexAgent output
                        if log['tool'] == "IndexAgent" and isinstance(log["result"], dict):
                            # Display chunk statistics if available
                            if "chunk_stats" in log["result"]:
                                st.markdown("### Chunk Statistics")
                                # Create a DataFrame for better visualization
                                stats_df = pd.DataFrame(log["result"]["chunk_stats"])
                                st.dataframe(stats_df)
                                # Add summary statistics
                                st.markdown("### Summary")
                                st.markdown(f"""
- Total chunks: {len(stats_df)}
- Average chunk length: {stats_df['length'].mean():.0f} characters
- Shortest chunk: {stats_df['length'].min()} characters
- Longest chunk: {stats_df['length'].max()} characters
""")
                                # Add a bar chart of chunk lengths
                                st.markdown("### Chunk Length Distribution")
                                st.bar_chart(stats_df.set_index('chunk_number')['length'])
                        else:
                            st.code(log["result"])
                if not df.empty:
                    st.success("Done ✓")
                    st.dataframe(df)
                    st.download_button("Download CSV", df.to_csv(index=False), "results.csv")
                else:
                    st.warning("No results were extracted. Check the execution trace for errors.")
            except Exception as e:
                # Top-level boundary: log the full traceback and surface the error.
                logging.exception("App error:")
                st.error(f"An error occurred: {e}")