Oleg Lavrovsky committed
Commit f8c7edf · unverified · 1 Parent(s): 41dfffc

Completion API

Files changed (1):
  1. app.py +16 -11
app.py CHANGED
@@ -2,6 +2,7 @@ from contextlib import asynccontextmanager
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, ValidationError
+from typing import List, Optional
 
 from torch import cuda
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -40,10 +41,16 @@ class ModelResponse(BaseModel):
     confidence: float
     processing_time: float
 
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
 class Completion(BaseModel):
-    model: str
-    prompt: str
-    max_tokens: int = 65536
+    model: str = "apertus"
+    messages: List[ChatMessage]
+    max_tokens: Optional[int] = 512
+    temperature: Optional[float] = 0.1
+    top_p: Optional[float] = 0.9
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
@@ -104,13 +111,10 @@ def fit_to_length(text, min_length=3, max_length=100):
     return text
 
 
-def get_model_reponse(query: str):
+def get_model_reponse(messages_think):
     """Process the text content."""
 
     # Prepare the model input
-    messages_think = [
-        {"role": "user", "content": query}
-    ]
     text = tokenizer.apply_chat_template(
         messages_think,
         tokenize=False,
@@ -144,9 +148,7 @@ async def completion(data: Completion):
         raise HTTPException(status_code=503, detail="Model not loaded")
 
     try:
-        text = fit_to_length(data.prompt, 3, data.max_tokens)
-
-        result = get_model_reponse(text, model)
+        result = get_model_reponse(data.messages)
 
         return {
             "choices": [
@@ -181,7 +183,10 @@ async def predict(q: str):
 
     text = fit_to_length(input_data.text, input_data.min_length, input_data.max_length)
 
-    result = get_model_reponse(text, model)
+    messages_think = [
+        {"role": "user", "content": text}
+    ]
+    result = get_model_reponse(messages_think)
 
     # Checkpoint
     processing_time = time.time() - start_time
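The Completion request now follows the OpenAI-style chat schema: a messages list of role/content pairs replaces the old single prompt string, sampling controls (temperature, top_p) come along, and the max_tokens default drops from 65536 to 512. Below is a minimal client sketch against the new schema; the route path (/completion) and port (7860) are assumptions, since neither appears in this diff.

import requests

# Hypothetical request; every field mirrors the new Completion model.
payload = {
    "model": "apertus",  # default declared on the model
    "messages": [  # replaces the old single "prompt" field
        {"role": "user", "content": "What is the capital of Switzerland?"}
    ],
    "max_tokens": 512,
    "temperature": 0.1,
    "top_p": 0.9,
}

# Base URL and route are assumptions; point this at wherever the app is served.
resp = requests.post("http://localhost:7860/completion", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"])  # the handler returns a "choices" list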
 
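get_model_reponse now receives the chat messages directly instead of wrapping a bare query itself, so the completion handler and the predict route share one path through the tokenizer's chat template. The diff cuts the function off after apply_chat_template, so the sketch below fills in only the standard transformers flow those lines imply; it assumes the module-level model and tokenizer loaded in lifespan, and everything after the template call is an assumption.

def get_model_reponse(messages_think):
    """Process the text content."""
    # Render the conversation with the model's chat template (shown in the diff)
    text = tokenizer.apply_chat_template(
        messages_think,
        tokenize=False,
        add_generation_prompt=True,  # assumed; not visible in the diff
    )
    # Assumed continuation: tokenize, generate, and decode only the new tokens
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=512)
    new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)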