Commit 89d2b21 · 1 Parent(s): 09c07f9
Arif committed

Docker model runner still error, trying to resolve

Files changed:
- backend/app/config.py                  +7 -17
- backend/app/main.py                    +2 -2
- backend/app/services/llm_service.py    +15 -15
backend/app/config.py  CHANGED

@@ -60,15 +60,15 @@ class Settings(BaseSettings):
 
     # ===== DOCKER MODEL RUNNER MODE (DEBUG=false) =====
     # Metis pattern: stateless HTTP API to DMR on host
-    model_runner_url: str = Field(
-        default="http://host.docker.internal:11434/v1",
+    runner_url: str = Field(
+        default="http://host.docker.internal:11434/engines/llama.cpp/v1",
         env="MODEL_RUNNER_URL",
         description="Docker Model Runner API endpoint (from containers use host.docker.internal)"
     )
-    model_name: str = Field(
-        default="llama3.2:1B-Q4_0",
+    llm_model: str = Field(
+        default="ai/llama3.2:1B-Q4_0",
         env="MODEL_NAME",
-        description="Model name as
+        description="Model name as OCI reference (e.g., ai/llama3.2:1B-Q4_0)"
     )
     docker_timeout: int = Field(
         default=300,
@@ -76,18 +76,6 @@ class Settings(BaseSettings):
         description="Timeout for Docker Model Runner requests (seconds)"
     )
 
-    # ===== BACKWARDS COMPATIBILITY (deprecated) =====
-    # These are kept for backwards compatibility but use new names above
-    @property
-    def docker_model_runner_url(self) -> str:
-        """Backwards compatible alias for model_runner_url"""
-        return self.model_runner_url
-
-    @property
-    def llm_model_name_docker(self) -> str:
-        """Backwards compatible alias for model_name"""
-        return self.model_name
-
     # ===== DATA PROCESSING =====
     max_file_size_mb: int = Field(
         default=50,
@@ -102,6 +90,8 @@ class Settings(BaseSettings):
         env_file = ".env.local"
         case_sensitive = False
         extra = "allow"
+        # Fix Pydantic warning about protected namespaces
+        protected_namespaces = ('settings_',)
 
 
 @lru_cache
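Since the commit message says the runner is still erroring, a quick way to isolate whether the new default URL is reachable at all from inside a container is to hit the same OpenAI-compatible GET /models endpoint the service probes on startup. This is only a debugging sketch: the URL constant mirrors the new default above and would normally come from MODEL_RUNNER_URL; it is not part of the application code.

import asyncio
import httpx

# Sketch: same default as the new runner_url field above; override via MODEL_RUNNER_URL in practice.
RUNNER_URL = "http://host.docker.internal:11434/engines/llama.cpp/v1"

async def main() -> None:
    async with httpx.AsyncClient(timeout=30) as client:
        # OpenAI-compatible model listing, the same endpoint the service calls during startup.
        resp = await client.get(f"{RUNNER_URL.rstrip('/')}/models")
        resp.raise_for_status()
        for model in resp.json().get("data", []):
            print(model.get("id"))  # expect something like ai/llama3.2:1B-Q4_0

asyncio.run(main())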
backend/app/main.py  CHANGED

@@ -35,10 +35,10 @@ async def lifespan(app: FastAPI):
     }
 
     docker_config = {
-        "model_name": settings.
+        "model_name": settings.llm_model,
         "max_tokens": settings.llm_max_tokens,
         "temperature": settings.llm_temperature,
-        "docker_url": settings.
+        "docker_url": settings.runner_url,
         "timeout": settings.docker_timeout
     }
 
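One detail worth double-checking while chasing the remaining error: main.py still passes the endpoint under the key "docker_url", while get_llm_service() in llm_service.py below reads docker_config.get("runner_url"). The self-contained sketch that follows only illustrates that lookup; the literal values are the new defaults from this commit, not anything main.py computes.

# Illustration of the lookup get_llm_service() performs on the dict main.py builds.
docker_config = {
    "model_name": "ai/llama3.2:1B-Q4_0",
    "max_tokens": 512,
    "temperature": 0.7,
    "docker_url": "http://host.docker.internal:11434/engines/llama.cpp/v1",  # key used by main.py
    "timeout": 300,
}

runner_url = docker_config.get("runner_url")  # None: the dict carries "docker_url" instead
print(runner_url)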
backend/app/services/llm_service.py  CHANGED

@@ -152,11 +152,11 @@ class LLMServiceDockerModelRunner(BaseLLMService):
         model_name: str,
         max_tokens: int,
         temperature: float,
-
+        runner_url: str,
         timeout: int = 300
     ):
         super().__init__(model_name, max_tokens, temperature)
-        self.
+        self.runner_url = runner_url.rstrip("/")  # Remove trailing slash
         self.timeout = timeout
         self.client = None
 
@@ -170,11 +170,11 @@ class LLMServiceDockerModelRunner(BaseLLMService):
             return True
 
         try:
-            self.logger.info(f"Connecting to Docker Model Runner: {self.
+            self.logger.info(f"Connecting to Docker Model Runner: {self.runner_url}")
             self.client = httpx.AsyncClient(timeout=self.timeout)
 
             # OpenAI-compatible endpoint: GET /v1/models
-            response = await self.client.get(f"{self.
+            response = await self.client.get(f"{self.runner_url}/models")
 
             if response.status_code == 200:
                 models = response.json()
@@ -208,7 +208,7 @@ class LLMServiceDockerModelRunner(BaseLLMService):
 
             # OpenAI-compatible endpoint: POST /v1/chat/completions
             response = await self.client.post(
-                f"{self.
+                f"{self.runner_url}/chat/completions",
                 json=payload
             )
 
@@ -285,7 +285,7 @@ def get_llm_service(debug: bool = None, mlx_config: dict = None, docker_config:
     # Determine debug mode
     if debug is None:
         debug = os.getenv("DEBUG", "false").lower() == "true"
-        if hasattr(settings, "debug"):
+        if settings and hasattr(settings, "debug"):
            debug = settings.debug
 
    # Try MLX first (if DEBUG=true)
@@ -303,23 +303,23 @@ def get_llm_service(debug: bool = None, mlx_config: dict = None, docker_config:
            logger.warning(f"⚠️ MLX failed: {e}, falling back to Docker Model Runner")
 
    # Try Docker Model Runner (Metis pattern)
-
+    runner_url = None
    if docker_config:
-
+        runner_url = docker_config.get("runner_url")
    elif settings:
-
+        runner_url = getattr(settings, "runner_url", None)
    else:
-
+        runner_url = os.getenv("MODEL_RUNNER_URL")
 
-    if
+    if runner_url:
        try:
            model_name = None
            if docker_config:
                model_name = docker_config.get("model_name")
            elif settings:
-                model_name = getattr(settings, "
+                model_name = getattr(settings, "llm_model", None)
            else:
-                model_name = os.getenv("MODEL_NAME", "llama3.2:1B-Q4_0")
+                model_name = os.getenv("MODEL_NAME", "ai/llama3.2:1B-Q4_0")
 
            config = {
                "model_name": model_name,
@@ -327,11 +327,11 @@ def get_llm_service(debug: bool = None, mlx_config: dict = None, docker_config:
                               getattr(settings, "llm_max_tokens", 512) if settings else 512),
                "temperature": (docker_config or {}).get("temperature",
                               getattr(settings, "llm_temperature", 0.7) if settings else 0.7),
-                "
+                "runner_url": runner_url,
                "timeout": (docker_config or {}).get("timeout",
                               getattr(settings, "docker_timeout", 300) if settings else 300)
            }
-            logger.info(f"Mode: Docker Model Runner at {
+            logger.info(f"Mode: Docker Model Runner at {runner_url}")
            logger.info(f"Model: {config['model_name']}")
            logger.info(f"✅ Using host GPU acceleration (llama.cpp Metal backend)")
            return LLMServiceDockerModelRunner(**config)
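For completeness, a minimal request against the POST /chat/completions endpoint the service uses, kept independent of the service class so it can be run directly while debugging. The payload shape here is the generic OpenAI chat schema and is an assumption; this diff does not show how the service actually builds its payload, and the URL and model name are the new defaults from config.py.

import asyncio
import httpx

RUNNER_URL = "http://host.docker.internal:11434/engines/llama.cpp/v1"  # assumed default from config.py

async def main() -> None:
    payload = {
        "model": "ai/llama3.2:1B-Q4_0",  # OCI reference, matching the new MODEL_NAME default
        "messages": [{"role": "user", "content": "Reply with one short sentence."}],
        "max_tokens": 64,
        "temperature": 0.7,
    }
    async with httpx.AsyncClient(timeout=300) as client:
        resp = await client.post(f"{RUNNER_URL}/chat/completions", json=payload)
        resp.raise_for_status()
        print(resp.json()["choices"][0]["message"]["content"])

asyncio.run(main())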