Spaces:
Running
Running
Arif
Updated backend. Added ML system. Supports both MLX (when running in debug mode) and Docker Model Runner (when not in debug mode).
e020ac8
| """ | |
| ML suggestions - anomaly detection and insights | |
| """ | |
| from typing import List, Dict, Any | |
| import logging | |
| import statistics | |
class MLSuggester:
    """Generate heuristic data-quality suggestions from tabular data.

    The input is a list of rows, each row being a dict that maps a column
    name to a value. Every suggestion produced is a dict with the keys
    ``title``, ``description``, ``confidence``, ``action`` and ``category``.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def generate(
        self, data: List[Dict[str, Any]], context: str = None
    ) -> List[Dict[str, Any]]:
        """Run every detector and return the strongest suggestions.

        Args:
            data: Rows to analyse, one dict per row.
            context: Accepted for interface compatibility; not used here.

        Returns:
            At most ten suggestions, ordered by descending ``confidence``.
            An empty list when ``data`` is empty.
        """
        suggestions: List[Dict[str, Any]] = []
        if not data:
            return suggestions

        suggestions.extend(self._check_missing_values(data))
        suggestions.extend(self._detect_outliers(data))
        suggestions.extend(self._detect_imbalances(data))
        suggestions.extend(self._detect_quality_issues(data))

        # list.sort is stable, so equally confident suggestions keep the
        # detector order above.
        suggestions.sort(key=lambda item: item["confidence"], reverse=True)
        return suggestions[:10]

    @staticmethod
    def _column_names(data: List[Dict[str, Any]]) -> List[Any]:
        """Return every key seen in any row, in first-seen order."""
        return list(dict.fromkeys(key for row in data for key in row))

    def _check_missing_values(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Flag columns where many values are ``None`` or the empty string.

        A key that is absent from a row counts as missing for that row.
        Columns with more than 50% missing get the "high" suggestion,
        columns with more than 10% get the regular one.
        """
        suggestions = []
        for col in self._column_names(data):
            missing_count = 0
            for row in data:
                value = row.get(col)
                if value is None or value == "":
                    missing_count += 1
            if missing_count == 0:
                continue

            percentage = (missing_count / len(data)) * 100
            if percentage > 50:
                suggestions.append({
                    "title": f"High Missing Values in {col}",
                    "description": f"{percentage:.1f}% of {col} is missing. Consider data imputation or removal.",
                    "confidence": min(0.95, percentage / 100),
                    "action": "impute_or_remove",
                    "category": "data_quality",
                })
            elif percentage > 10:
                suggestions.append({
                    "title": f"Missing Values in {col}",
                    "description": f"{percentage:.1f}% of {col} is missing. Consider handling these values.",
                    "confidence": min(0.8, percentage / 100),
                    "action": "handle_missing",
                    "category": "data_quality",
                })
        return suggestions

    def _detect_outliers(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Flag numeric columns where more than 5% of values are IQR outliers.

        Quartiles are taken by index from the sorted values (positions
        ``n // 4`` and ``3 * n // 4``); a value is an outlier when it lies
        more than 1.5 * IQR outside that range. Columns with four or fewer
        usable values are skipped.
        """
        suggestions = []
        for col in self._get_numeric_columns(data):
            values = []
            for row in data:
                raw = row.get(col)
                if raw is None:
                    continue
                # _get_numeric_columns accepts anything float() can parse
                # (e.g. "2.5"), so convert here to make the arithmetic below
                # valid for those values as well.
                number = float(raw)
                if number == number:  # NaN != NaN; NaN would break sorting
                    values.append(number)

            if len(values) <= 4:
                continue

            ordered = sorted(values)
            q1 = ordered[len(ordered) // 4]
            q3 = ordered[3 * len(ordered) // 4]
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            outliers = [v for v in ordered if v < lower or v > upper]
            if not outliers:
                continue

            outlier_percentage = (len(outliers) / len(values)) * 100
            if outlier_percentage > 5:
                suggestions.append({
                    "title": f"Outliers Detected in {col}",
                    "description": f"{len(outliers)} outlier(s) ({outlier_percentage:.1f}%) found. Review and handle appropriately.",
                    "confidence": min(0.85, outlier_percentage / 100),
                    "action": "review_outliers",
                    "category": "anomaly",
                })
        return suggestions

    def _detect_imbalances(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Flag low-cardinality columns dominated by a single value.

        A column is considered when its number of distinct non-``None``
        values is below 10% of its non-``None`` count, and flagged when the
        most frequent value accounts for more than 80% of them.
        """
        from collections import Counter

        suggestions = []
        for col in self._column_names(data):
            values = [row.get(col) for row in data if row.get(col) is not None]
            if not values:
                continue
            try:
                counts = Counter(values)
            except TypeError:
                # Unhashable cell values (lists, dicts, ...) cannot be counted.
                self.logger.debug(
                    "Skipping imbalance check for %r: unhashable values", col
                )
                continue

            if len(counts) >= len(values) * 0.1:
                continue  # too many distinct values to be categorical

            # most_common(1) returns [(value, count)]; only the count matters.
            most_common_count = counts.most_common(1)[0][1]
            imbalance_ratio = most_common_count / len(values)
            if imbalance_ratio > 0.8:
                suggestions.append({
                    "title": f"Class Imbalance in {col}",
                    "description": f"One value appears {imbalance_ratio*100:.1f}% of the time. Data is highly imbalanced.",
                    "confidence": min(0.9, imbalance_ratio),
                    "action": "rebalance_data",
                    "category": "pattern",
                })
        return suggestions

    def _detect_quality_issues(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Flag exact duplicate rows.

        Rows are compared through an order-independent fingerprint of their
        ``repr``-ed items, so two equal dicts count as duplicates regardless
        of key insertion order, and unhashable values are still supported.
        """
        suggestions = []
        fingerprints = {
            tuple(sorted((repr(key), repr(value)) for key, value in row.items()))
            for row in data
        }
        duplicate_count = len(data) - len(fingerprints)
        if duplicate_count:
            suggestions.append({
                "title": "Duplicate Records Found",
                "description": f"{duplicate_count} duplicate row(s) detected. Consider deduplication.",
                "confidence": 0.9,
                "action": "deduplicate",
                "category": "data_quality",
            })
        return suggestions

    @staticmethod
    def _get_numeric_columns(data: List[Dict[str, Any]]) -> List[str]:
        """Return the columns whose every non-``None`` value passes ``float()``.

        Declared static so it works both as ``self._get_numeric_columns(data)``
        and as ``MLSuggester._get_numeric_columns(data)``.
        """
        if not data:
            return []

        numeric = []
        for key in MLSuggester._column_names(data):
            try:
                for row in data:
                    value = row.get(key)
                    if value is not None:
                        float(value)
            except (ValueError, TypeError, OverflowError):
                # At least one value is not convertible: not a numeric column.
                continue
            numeric.append(key)
        return numeric