"""
Dual-mode LLM Service
- DEBUG=true: Uses MLX with Apple Silicon GPU
- DEBUG=false: Uses Docker Model Runner (OpenAI-compatible API)
- Fallback: Mock mode if neither available
"""
import asyncio
import logging
import os
from abc import ABC, abstractmethod
from typing import List, Optional
import httpx


logger = logging.getLogger(__name__)


# Import MLX conditionally
try:
    from mlx_lm import load
    from mlx_lm.generate import generate
    HAS_MLX = True
except ImportError:
    HAS_MLX = False



class BaseLLMService(ABC):
    """Abstract base class for LLM services"""
    
    def __init__(self, model_name: str, max_tokens: int, temperature: float):
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.is_loaded = False
        self.is_mock = False
        self.logger = logging.getLogger(__name__)
    
    @abstractmethod
    async def load_model(self) -> bool:
        """Load/initialize the model"""
        pass
    
    @abstractmethod
    async def generate(self, prompt: str) -> str:
        """Generate text from prompt"""
        pass
    
    async def chat(self, messages: List[dict], system_prompt: Optional[str] = None) -> str:
        """Chat interface - converts chat format to prompt format"""
        prompt = self._build_prompt(messages, system_prompt)
        return await self.generate(prompt)
    
    def _build_prompt(self, messages: List[dict], system_prompt: Optional[str] = None) -> str:
        """Build prompt from chat messages"""
        prompt_parts = []
        
        if system_prompt:
            prompt_parts.append(f"System: {system_prompt}\n\n")
        
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            prompt_parts.append(f"{role.capitalize()}: {content}\n")
        
        prompt_parts.append("Assistant: ")
        return "".join(prompt_parts)



class LLMServiceMLX(BaseLLMService):
    """MLX implementation for Apple Silicon (DEBUG=true)"""
    
    def __init__(self, model_name: str, max_tokens: int, temperature: float, device: str):
        super().__init__(model_name, max_tokens, temperature)
        self.device = device
        self.model = None
        self.tokenizer = None
    
    async def load_model(self) -> bool:
        """Load MLX model"""
        if self.is_loaded:
            return True
        
        if not HAS_MLX:
            self.logger.error("❌ MLX not available")
            return False
        
        try:
            self.logger.info(f"πŸ”„ Loading MLX model: {self.model_name}")
            loop = asyncio.get_event_loop()
            self.model, self.tokenizer = await loop.run_in_executor(
                None,
                self._load_model_sync
            )
            self.is_loaded = True
            self.logger.info(f"βœ… MLX model loaded: {self.model_name}")
            return True
        except Exception as e:
            self.logger.error(f"❌ MLX model loading failed: {e}")
            return False
    
    def _load_model_sync(self):
        """Synchronous MLX model loading"""
        if not HAS_MLX:
            raise RuntimeError("MLX not installed")
        
        self.logger.info("πŸ”„ Starting model download/load...")
        model, tokenizer = load(self.model_name)
        self.logger.info("βœ… Model download/load complete")
        return model, tokenizer
    
    async def generate(self, prompt: str) -> str:
        """Generate with MLX"""
        if not self.is_loaded:
            raise RuntimeError("Model not loaded")
        
        try:
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                self._generate_sync,
                prompt
            )
            return response
        except Exception as e:
            self.logger.error(f"❌ MLX generation failed: {e}")
            raise
    
    def _generate_sync(self, prompt: str) -> str:
        """Synchronous text generation with MLX"""
        response = generate(
            model=self.model,
            tokenizer=self.tokenizer,
            prompt=prompt,
            max_tokens=self.max_tokens
        )
        return response




class LLMServiceDockerModelRunner(BaseLLMService):
    """Docker Model Runner implementation - OpenAI-compatible API
    
    Uses stateless HTTP calls to DMR running on host machine.
    Optimal for Apple Silicon GPU acceleration via llama.cpp Metal backend.
    """
    
    def __init__(
        self, 
        model_name: str, 
        max_tokens: int, 
        temperature: float, 
        runner_url: str,
        timeout: int = 300
    ):
        super().__init__(model_name, max_tokens, temperature)
        self.runner_url = runner_url.rstrip("/")  # Remove trailing slash
        self.timeout = timeout
        self.client = None
    
    async def load_model(self) -> bool:
        """Initialize Docker Model Runner connection
        
        Tests connectivity to the DMR HTTP API endpoint.
        DMR itself handles model loading on the host.
        """
        if self.is_loaded:
            return True
        
        try:
            self.logger.info(f"πŸ”„ Connecting to Docker Model Runner: {self.runner_url}")
            self.client = httpx.AsyncClient(timeout=self.timeout)
            
            # OpenAI-compatible endpoint: GET /v1/models
            response = await self.client.get(f"{self.runner_url}/models")
            
            if response.status_code == 200:
                models = response.json()
                self.logger.info(f"βœ… Docker Model Runner connected")
                self.logger.info(f"πŸ“‹ Available models: {models}")
                self.is_loaded = True
                return True
            else:
                self.logger.error(f"❌ Docker Model Runner returned {response.status_code}")
                return False
        except Exception as e:
            self.logger.error(f"❌ Docker Model Runner connection failed: {e}")
            return False
    
    async def generate(self, prompt: str) -> str:
        """Generate with Docker Model Runner (OpenAI-compatible API)
        
        Makes HTTP request to DMR at host.docker.internal:11434
        Model inference happens on host GPU (Apple Metal backend)
        """
        if not self.is_loaded:
            raise RuntimeError("Docker Model Runner not connected")
        
        try:
            payload = {
                "model": self.model_name,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": self.temperature,
                "max_tokens": self.max_tokens,
            }
            
            # OpenAI-compatible endpoint: POST /v1/chat/completions
            response = await self.client.post(
                f"{self.runner_url}/chat/completions",
                json=payload
            )
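            # Expected response body (standard OpenAI chat-completions shape), e.g.:
            #   {"choices": [{"message": {"role": "assistant", "content": "..."}}], ...}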
            
            if response.status_code == 200:
                result = response.json()
                return result["choices"][0]["message"]["content"]
            else:
                self.logger.error(f"❌ Docker Model Runner error: {response.status_code} - {response.text}")
                raise RuntimeError(f"Model Runner error: {response.status_code}")
        except Exception as e:
            self.logger.error(f"❌ Docker Model Runner generation failed: {e}")
            raise
    
    async def __aenter__(self):
        return self
    
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.client:
            await self.client.aclose()
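
    # Usage sketch (illustrative; assumes a reachable runner_url): using the class
    # as an async context manager ensures the underlying httpx client is closed.
    #
    #   async with LLMServiceDockerModelRunner(
    #       model_name="ai/llama3.2:1B-Q4_0",
    #       max_tokens=512,
    #       temperature=0.7,
    #       runner_url="http://host.docker.internal:11434/v1",
    #   ) as svc:
    #       if await svc.load_model():
    #           print(await svc.generate("Hello"))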




class LLMServiceMock(BaseLLMService):
    """Mock implementation as fallback"""
    
    def __init__(self, model_name: str, max_tokens: int, temperature: float):
        super().__init__(model_name, max_tokens, temperature)
        self.is_mock = True
    
    async def load_model(self) -> bool:
        """Mock loading"""
        self.logger.warning("⚠️  Using MOCK mode (no real LLM available)")
        self.is_loaded = True
        return True
    
    async def generate(self, prompt: str) -> str:
        """Generate mock response"""
        return self._generate_mock_response(prompt)
    
    def _generate_mock_response(self, prompt: str) -> str:
        """Generate intelligent mock responses"""
        prompt_lower = prompt.lower()
        
        if "hello" in prompt_lower or "hi" in prompt_lower:
            return "Hello! I'm running in mock mode (no LLM available). I can still help you analyze CSV and Excel files!"
        elif "analyze" in prompt_lower or "data" in prompt_lower:
            return "I can analyze your data with statistical analysis, trend detection, outlier detection, and correlation matrices."
        elif "what can" in prompt_lower or "help" in prompt_lower:
            return "I can help with: 1) Chatting, 2) Uploading files (CSV/Excel), 3) Statistical analysis, 4) Trend detection, 5) Anomaly detection."
        elif "machine learning" in prompt_lower:
            return "Machine learning is about creating algorithms that can learn from data and make predictions without being explicitly programmed."
        else:
            return f"Mock response: I processed your prompt about '{prompt[:40]}...' - please note I'm in mock mode with no real LLM."



def get_llm_service(debug: Optional[bool] = None, mlx_config: Optional[dict] = None, docker_config: Optional[dict] = None, settings=None) -> BaseLLMService:
    """
    Factory function to get appropriate LLM service
    
    Fallback chain: MLX (DEBUG=true) → Docker Model Runner → Mock
    
    Args:
        debug: Force DEBUG mode (True=MLX, False=Docker). If None, reads from env/settings
        mlx_config: Manual MLX config dict
        docker_config: Manual Docker config dict
        settings: Pydantic Settings object with llm config
    
    Returns:
        BaseLLMService: One of MLX, DockerModelRunner, or Mock implementation
    """
    
    # Determine debug mode
    if debug is None:
        debug = os.getenv("DEBUG", "false").lower() == "true"
        if settings and hasattr(settings, "debug"):
            debug = settings.debug
    
    # Try MLX first (if DEBUG=true)
    if debug and HAS_MLX:
        try:
            config = mlx_config or {
                "model_name": "mlx-community/Llama-3.2-3B-Instruct-4bit",
                "max_tokens": 512,
                "temperature": 0.7,
                "device": "auto"
            }
            logger.info("πŸ“Œ Mode: MLX (DEBUG=true) with Apple Silicon GPU")
            return LLMServiceMLX(**config)
        except Exception as e:
            logger.warning(f"⚠️  MLX failed: {e}, falling back to Docker Model Runner")
    
    # Try Docker Model Runner (Metis pattern)
    # Check each source in turn; fall through to the env var if unset
    runner_url = None
    if docker_config:
        runner_url = docker_config.get("runner_url")
    if runner_url is None and settings:
        runner_url = getattr(settings, "runner_url", None)
    if runner_url is None:
        runner_url = os.getenv("MODEL_RUNNER_URL")
    
    if runner_url:
        try:
            model_name = None
            if docker_config:
                model_name = docker_config.get("model_name")
            if model_name is None and settings:
                model_name = getattr(settings, "llm_model", None)
            if model_name is None:
                model_name = os.getenv("MODEL_NAME", "ai/llama3.2:1B-Q4_0")
            
            config = {
                "model_name": model_name,
                "max_tokens": (docker_config or {}).get("max_tokens", 
                    getattr(settings, "llm_max_tokens", 512) if settings else 512),
                "temperature": (docker_config or {}).get("temperature", 
                    getattr(settings, "llm_temperature", 0.7) if settings else 0.7),
                "runner_url": runner_url,
                "timeout": (docker_config or {}).get("timeout", 
                    getattr(settings, "docker_timeout", 300) if settings else 300)
            }
            logger.info(f"πŸ“Œ Mode: Docker Model Runner at {runner_url}")
            logger.info(f"πŸ“Œ Model: {config['model_name']}")
            logger.info(f"βœ… Using host GPU acceleration (llama.cpp Metal backend)")
            return LLMServiceDockerModelRunner(**config)
        except Exception as e:
            logger.warning(f"⚠️  Docker Model Runner failed: {e}, falling back to Mock")
    
    # Fallback to mock
    logger.warning("⚠️  Using MOCK mode (no LLM available)")
    return LLMServiceMock(
        model_name="mock",
        max_tokens=512,
        temperature=0.7
    )
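

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the service API): exercises
# the factory and the fallback chain end-to-end. With MLX unavailable and no
# MODEL_RUNNER_URL set, this should land in mock mode.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo() -> None:
        service = get_llm_service()
        await service.load_model()
        reply = await service.chat(
            [{"role": "user", "content": "Hello"}],
            system_prompt="You are a helpful assistant.",
        )
        print(f"mock={service.is_mock} reply={reply!r}")

    asyncio.run(_demo())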