Buckets:
| syntax = "proto3"; | |
| option go_package = "github.com/go-skynet/LocalAI/pkg/grpc/proto"; | |
| option java_multiple_files = true; | |
| option java_package = "io.skynet.localai.backend"; | |
| option java_outer_classname = "LocalAIBackend"; | |
| package backend; | |
// Backend is the single gRPC service declared in this file.
//
// Every rpc is terminated with `;`, which the protobuf grammar treats the
// same as an empty `{}` options body.
service Backend {
  rpc Health(HealthMessage) returns (Reply);
  rpc Free(HealthMessage) returns (Result);
  rpc Predict(PredictOptions) returns (Reply);
  rpc LoadModel(ModelOptions) returns (Result);
  // Takes the same request as Predict and returns a stream of Reply.
  rpc PredictStream(PredictOptions) returns (stream Reply);
  rpc Embedding(PredictOptions) returns (EmbeddingResult);
  rpc GenerateImage(GenerateImageRequest) returns (Result);
  rpc GenerateVideo(GenerateVideoRequest) returns (Result);
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult);
  // Takes the same request as AudioTranscription; the reply is streamed.
  rpc AudioTranscriptionStream(TranscriptRequest)
      returns (stream TranscriptStreamResponse);
  rpc TTS(TTSRequest) returns (Result);
  // Takes the same request as TTS and returns a stream of Reply.
  rpc TTSStream(TTSRequest) returns (stream Reply);
  rpc SoundGeneration(SoundGenerationRequest) returns (Result);
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse);
  rpc Status(HealthMessage) returns (StatusResponse);
  rpc Detect(DetectOptions) returns (DetectResponse);

  rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse);
  rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse);

  rpc VoiceVerify(VoiceVerifyRequest) returns (VoiceVerifyResponse);
  rpc VoiceAnalyze(VoiceAnalyzeRequest) returns (VoiceAnalyzeResponse);
  rpc VoiceEmbed(VoiceEmbedRequest) returns (VoiceEmbedResponse);

  rpc StoresSet(StoresSetOptions) returns (Result);
  rpc StoresDelete(StoresDeleteOptions) returns (Result);
  rpc StoresGet(StoresGetOptions) returns (StoresGetResult);
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult);

  rpc Rerank(RerankRequest) returns (RerankResult);
  rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
  rpc VAD(VADRequest) returns (VADResponse);
  rpc Diarize(DiarizeRequest) returns (DiarizeResponse);

  rpc AudioEncode(AudioEncodeRequest) returns (AudioEncodeResult);
  rpc AudioDecode(AudioDecodeRequest) returns (AudioDecodeResult);
  rpc AudioTransform(AudioTransformRequest) returns (AudioTransformResult);
  // Bidirectional stream; see AudioTransformFrameRequest for the expected
  // message ordering.
  rpc AudioTransformStream(stream AudioTransformFrameRequest)
      returns (stream AudioTransformFrameResponse);

  // AudioToAudioStream is the bidirectional any-to-any S2S RPC. Backends
  // that load a speech-to-speech model consume input audio frames and emit
  // interleaved audio + transcript + tool-call deltas as typed events.
  // Backends without S2S support return UNIMPLEMENTED.
  rpc AudioToAudioStream(stream AudioToAudioRequest)
      returns (stream AudioToAudioResponse);

  rpc ModelMetadata(ModelOptions) returns (ModelMetadataResponse);

  // Fine-tuning RPCs
  rpc StartFineTune(FineTuneRequest) returns (FineTuneJobResult);
  // Progress updates are delivered as a server-side stream.
  rpc FineTuneProgress(FineTuneProgressRequest)
      returns (stream FineTuneProgressUpdate);
  rpc StopFineTune(FineTuneStopRequest) returns (Result);
  rpc ListCheckpoints(ListCheckpointsRequest)
      returns (ListCheckpointsResponse);
  rpc ExportModel(ExportModelRequest) returns (Result);

  // Quantization RPCs
  rpc StartQuantization(QuantizationRequest) returns (QuantizationJobResult);
  // Progress updates are delivered as a server-side stream.
  rpc QuantizationProgress(QuantizationProgressRequest)
      returns (stream QuantizationProgressUpdate);
  rpc StopQuantization(QuantizationStopRequest) returns (Result);
}
// MetricsRequest is the request of the GetMetrics RPC. It is intentionally
// empty.
message MetricsRequest {}

// MetricsResponse is the reply of the GetMetrics RPC.
message MetricsResponse {
  int32 slot_id = 1;
  // Stores the prompt as a JSON string.
  string prompt_json_for_slot = 2;
  float tokens_per_second = 3;
  int32 tokens_generated = 4;
  int32 prompt_tokens_processed = 5;
}
// RerankRequest is the request of the Rerank RPC: one query plus the
// candidate documents.
message RerankRequest {
  string query = 1;
  repeated string documents = 2;
  // NOTE(review): presumably caps the number of returned results; confirm
  // against the backend implementations.
  int32 top_n = 3;
}

// RerankResult is the reply of the Rerank RPC.
message RerankResult {
  Usage usage = 1;
  repeated DocumentResult results = 2;
}

// Usage carries the token counters embedded in RerankResult.
message Usage {
  int32 total_tokens = 1;
  int32 prompt_tokens = 2;
}

// DocumentResult is a single entry of RerankResult.results.
message DocumentResult {
  // NOTE(review): presumably the position of the document in
  // RerankRequest.documents; confirm.
  int32 index = 1;
  string text = 2;
  float relevance_score = 3;
}
// StoresKey is a key of the Stores* RPCs, expressed as a list of floats.
message StoresKey {
  repeated float Floats = 1;
}

// StoresValue is a value of the Stores* RPCs, expressed as raw bytes.
message StoresValue {
  bytes Bytes = 1;
}

// StoresSetOptions is the request of the StoresSet RPC.
// NOTE(review): Keys and Values look like parallel lists paired by index;
// confirm with the store implementation.
message StoresSetOptions {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
}

// StoresDeleteOptions is the request of the StoresDelete RPC.
message StoresDeleteOptions {
  repeated StoresKey Keys = 1;
}

// StoresGetOptions is the request of the StoresGet RPC.
message StoresGetOptions {
  repeated StoresKey Keys = 1;
}

// StoresGetResult is the reply of the StoresGet RPC.
message StoresGetResult {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
}

// StoresFindOptions is the request of the StoresFind RPC: a single key and
// a TopK count.
message StoresFindOptions {
  StoresKey Key = 1;
  int32 TopK = 2;
}

// StoresFindResult is the reply of the StoresFind RPC.
message StoresFindResult {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
  repeated float Similarities = 3;
}
// HealthMessage is the field-less request shared by the Health, Free and
// Status RPCs.
message HealthMessage {
}
// PredictOptions is the request of the Predict, PredictStream, Embedding
// and TokenizeString RPCs. It carries the prompt (or chat messages) together
// with the per-request sampling and runtime settings.
message PredictOptions {
  string Prompt = 1;
  int32 Seed = 2;
  int32 Threads = 3;
  int32 Tokens = 4;
  int32 TopK = 5;
  int32 Repeat = 6;
  int32 Batch = 7;
  int32 NKeep = 8;
  float Temperature = 9;
  float Penalty = 10;
  bool F16KV = 11;
  bool DebugMode = 12;
  repeated string StopPrompts = 13;
  bool IgnoreEOS = 14;
  float TailFreeSamplingZ = 15;
  float TypicalP = 16;
  float FrequencyPenalty = 17;
  float PresencePenalty = 18;
  int32 Mirostat = 19;
  float MirostatETA = 20;
  float MirostatTAU = 21;
  bool PenalizeNL = 22;
  string LogitBias = 23;
  // Number 24 is skipped by the surrounding fields. It is reserved so a
  // future field cannot take it and be misread by peers built against an
  // older schema. NOTE(review): confirm 24 was a retired field.
  reserved 24;
  bool MLock = 25;
  bool MMap = 26;
  bool PromptCacheAll = 27;
  bool PromptCacheRO = 28;
  string Grammar = 29;
  string MainGPU = 30;
  string TensorSplit = 31;
  float TopP = 32;
  string PromptCachePath = 33;
  bool Debug = 34;
  repeated int32 EmbeddingTokens = 35;
  string Embeddings = 36;
  float RopeFreqBase = 37;
  float RopeFreqScale = 38;
  float NegativePromptScale = 39;
  string NegativePrompt = 40;
  int32 NDraft = 41;
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
  repeated string Videos = 45;
  repeated string Audios = 46;
  string CorrelationId = 47;
  // JSON array of available tools/functions for tool calling.
  string Tools = 48;
  // JSON string or object specifying tool choice behavior.
  string ToolChoice = 49;
  // Number of top logprobs to return (maps to OpenAI logprobs parameter).
  int32 Logprobs = 50;
  // Number of top logprobs to return per token (maps to OpenAI top_logprobs
  // parameter).
  int32 TopLogprobs = 51;
  // Generic per-request metadata (e.g., enable_thinking).
  map<string, string> Metadata = 52;
  // Minimum probability sampling threshold (0.0 = disabled).
  float MinP = 53;
}
// ToolCallDelta represents an incremental tool call update from the C++
// parser. Used for both streaming (partial diffs) and non-streaming (final
// tool calls).
message ToolCallDelta {
  // Tool call index (0-based).
  int32 index = 1;
  // Tool call ID (e.g., "call_abc123").
  string id = 2;
  // Function name (set on first appearance).
  string name = 3;
  // Arguments chunk (incremental in streaming, full in non-streaming).
  string arguments = 4;
}

// ChatDelta represents incremental content/reasoning/tool_call updates
// parsed by the C++ backend.
message ChatDelta {
  // Content text delta.
  string content = 1;
  // Reasoning/thinking text delta.
  string reasoning_content = 2;
  // Tool call deltas.
  repeated ToolCallDelta tool_calls = 3;
}
// Reply is the response message containing the result. It is returned by
// Health and Predict, and streamed by PredictStream and TTSStream.
message Reply {
  bytes message = 1;
  int32 tokens = 2;
  int32 prompt_tokens = 3;
  double timing_prompt_processing = 4;
  double timing_token_generation = 5;
  bytes audio = 6;
  // JSON-encoded logprobs data matching OpenAI format.
  bytes logprobs = 7;
  // Parsed chat deltas from C++ autoparser (streaming + non-streaming).
  repeated ChatDelta chat_deltas = 8;
}
// GrammarTrigger wraps a single trigger word; ModelOptions.GrammarTriggers
// holds a list of these.
message GrammarTrigger {
  string word = 1;
}
// ModelOptions is the request of the LoadModel and ModelMetadata RPCs. It
// mixes generic loading settings with backend-specific groups (Diffusers,
// llama.cpp, vllm), marked by the section comments below.
//
// Fields are kept in their original declaration order, which is not the
// numeric order; the field numbers are the wire contract and must not
// change.
message ModelOptions {
  // Numbers 16 and 22-25 are skipped by the fields below. They are reserved
  // so new fields cannot take them and be misread by peers built against an
  // older schema. NOTE(review): confirm these belonged to retired fields.
  reserved 16, 22 to 25;

  string Model = 1;
  int32 ContextSize = 2;
  int32 Seed = 3;
  int32 NBatch = 4;
  bool F16Memory = 5;
  bool MLock = 6;
  bool MMap = 7;
  bool VocabOnly = 8;
  bool LowVRAM = 9;
  bool Embeddings = 10;
  bool NUMA = 11;
  int32 NGPULayers = 12;
  string MainGPU = 13;
  string TensorSplit = 14;
  int32 Threads = 15;
  float RopeFreqBase = 17;
  float RopeFreqScale = 18;
  float RMSNormEps = 19;
  int32 NGQA = 20;
  string ModelFile = 21;

  // Diffusers
  string PipelineType = 26;
  string SchedulerType = 27;
  bool CUDA = 28;
  float CFGScale = 29;
  bool IMG2IMG = 30;
  string CLIPModel = 31;
  string CLIPSubfolder = 32;
  int32 CLIPSkip = 33;
  string ControlNet = 48;
  string Tokenizer = 34;

  // LLM (llama.cpp)
  string LoraBase = 35;
  string LoraAdapter = 36;
  float LoraScale = 42;
  bool NoMulMatQ = 37;
  string DraftModel = 39;
  string AudioPath = 38;

  // vllm
  string Quantization = 40;
  float GPUMemoryUtilization = 50;
  bool TrustRemoteCode = 51;
  bool EnforceEager = 52;
  int32 SwapSpace = 53;
  int32 MaxModelLen = 54;
  int32 TensorParallelSize = 55;
  string LoadFormat = 58;
  bool DisableLogStatus = 66;
  string DType = 67;
  int32 LimitImagePerPrompt = 68;
  int32 LimitVideoPerPrompt = 69;
  int32 LimitAudioPerPrompt = 70;

  string MMProj = 41;
  string RopeScaling = 43;
  float YarnExtFactor = 44;
  float YarnAttnFactor = 45;
  float YarnBetaFast = 46;
  float YarnBetaSlow = 47;
  string Type = 49;
  string FlashAttention = 56;
  bool NoKVOffload = 57;
  string ModelPath = 59;
  repeated string LoraAdapters = 60;
  repeated float LoraScales = 61;
  repeated string Options = 62;
  string CacheTypeKey = 63;
  string CacheTypeValue = 64;
  repeated GrammarTrigger GrammarTriggers = 65;
  bool Reranking = 71;
  repeated string Overrides = 72;

  // EngineArgs carries a JSON-encoded map of backend-native engine arguments
  // applied verbatim to the backend's engine constructor (e.g. vLLM
  // AsyncEngineArgs). Unknown keys produce an error at LoadModel time.
  string EngineArgs = 73;
}
// Result is the generic outcome message returned by many RPCs of the
// Backend service (e.g. LoadModel, Free, TTS, StoresSet): a success flag
// plus a text message.
message Result {
  string message = 1;
  bool success = 2;
}

// EmbeddingResult is the reply of the Embedding RPC.
message EmbeddingResult {
  repeated float embeddings = 1;
}
// TranscriptRequest is the request of the AudioTranscription and
// AudioTranscriptionStream RPCs.
message TranscriptRequest {
  // Numbering starts at 2. Number 1 is reserved so a future field cannot
  // take it and be misread by peers built against an older schema.
  // NOTE(review): confirm 1 was a retired field.
  reserved 1;

  // Audio to transcribe. The voice-message comments elsewhere in this file
  // describe this field as a filesystem path.
  string dst = 2;
  string language = 3;
  uint32 threads = 4;
  bool translate = 5;
  bool diarize = 6;
  string prompt = 7;
  float temperature = 8;
  repeated string timestamp_granularities = 9;
  bool stream = 10;
}
// TranscriptResult is the reply of the AudioTranscription RPC. It is also
// embedded in TranscriptStreamResponse.final_result.
message TranscriptResult {
  repeated TranscriptSegment segments = 1;
  string text = 2;
  string language = 3;
  float duration = 4;
}

// TranscriptStreamResponse is the message streamed by
// AudioTranscriptionStream.
// NOTE(review): presumably `delta` carries incremental text and
// `final_result` is set on the last message; confirm with the backends.
message TranscriptStreamResponse {
  string delta = 1;
  TranscriptResult final_result = 2;
}

// TranscriptWord is a single timed word of a TranscriptSegment.
// NOTE(review): the unit of start/end is not stated in this file.
message TranscriptWord {
  int64 start = 1;
  int64 end = 2;
  string text = 3;
}

// TranscriptSegment is a single timed segment of a TranscriptResult.
// NOTE(review): the unit of start/end is not stated in this file.
message TranscriptSegment {
  int32 id = 1;
  int64 start = 2;
  int64 end = 3;
  string text = 4;
  repeated int32 tokens = 5;
  string speaker = 6;
  repeated TranscriptWord words = 7;
}
// GenerateImageRequest is the request of the GenerateImage RPC.
message GenerateImageRequest {
  int32 height = 1;
  int32 width = 2;
  // Number 3 is skipped by the surrounding fields. It is reserved so a
  // future field cannot take it and be misread by peers built against an
  // older schema. NOTE(review): confirm 3 was a retired field.
  reserved 3;
  int32 step = 4;
  int32 seed = 5;
  string positive_prompt = 6;
  string negative_prompt = 7;
  string dst = 8;
  string src = 9;

  // Diffusers
  string EnableParameters = 10;
  int32 CLIPSkip = 11;

  // Reference images for models that support them (e.g., Flux Kontext).
  repeated string ref_images = 12;
}
// GenerateVideoRequest is the request of the GenerateVideo RPC.
message GenerateVideoRequest {
  string prompt = 1;
  // Negative prompt for video generation.
  string negative_prompt = 2;
  // Path or base64 encoded image for the start frame.
  string start_image = 3;
  // Path or base64 encoded image for the end frame.
  string end_image = 4;
  int32 width = 5;
  int32 height = 6;
  // Number of frames to generate.
  int32 num_frames = 7;
  // Frames per second.
  int32 fps = 8;
  int32 seed = 9;
  // Classifier-free guidance scale.
  float cfg_scale = 10;
  // Number of inference steps.
  int32 step = 11;
  // Output path for the generated video.
  string dst = 12;
}
// TTSRequest is the request of the TTS and TTSStream RPCs.
message TTSRequest {
  string text = 1;
  string model = 2;
  string dst = 3;
  string voice = 4;
  // Declared `optional`, so an unset language can be told apart from "".
  optional string language = 5;
}
// VADRequest is the request of the VAD RPC: audio given as a list of
// floats.
message VADRequest {
  repeated float audio = 1;
}

// VADSegment is one start/end pair of a VADResponse.
// NOTE(review): the unit of start/end is not stated in this file.
message VADSegment {
  float start = 1;
  float end = 2;
}

// VADResponse is the reply of the VAD RPC.
message VADResponse {
  repeated VADSegment segments = 1;
}
// --- Speaker diarization messages ---
//
// Pure speaker diarization: "who spoke when". Returns time-stamped segments
// labelled with cluster IDs (the same string for the same speaker across
// segments). Some backends (e.g. vibevoice.cpp) produce diarization as a
// by-product of ASR and may also fill in `text` per segment; backends with a
// dedicated diarization pipeline (e.g. sherpa-onnx pyannote) leave `text`
// empty and emit only the segmentation.

// DiarizeRequest is the request of the Diarize RPC.
message DiarizeRequest {
  // Path to audio file (HTTP layer materialises uploads to a temp file).
  string dst = 1;
  uint32 threads = 2;
  // Optional; only meaningful for transcription-bundling backends.
  string language = 3;
  // Exact speaker count if known (>0 forces); 0 = auto.
  int32 num_speakers = 4;
  // Hint when auto-detecting; 0 = unset.
  int32 min_speakers = 5;
  // Hint when auto-detecting; 0 = unset.
  int32 max_speakers = 6;
  // Distance threshold when num_speakers unknown; 0 = backend default.
  float clustering_threshold = 7;
  // Discard segments shorter than this (seconds); 0 = backend default.
  float min_duration_on = 8;
  // Merge gaps shorter than this (seconds); 0 = backend default.
  float min_duration_off = 9;
  // When the backend can emit per-segment transcript for free, ask it to
  // populate `text`.
  bool include_text = 10;
}

// DiarizeSegment is one labelled time span of a DiarizeResponse.
message DiarizeSegment {
  int32 id = 1;
  // Seconds.
  float start = 2;
  // Seconds.
  float end = 3;
  // Backend-emitted speaker label (e.g. "0", "SPEAKER_00").
  string speaker = 4;
  // Optional per-segment transcript (empty unless include_text and
  // supported).
  string text = 5;
}

// DiarizeResponse is the reply of the Diarize RPC.
message DiarizeResponse {
  repeated DiarizeSegment segments = 1;
  // Count of distinct speaker labels in `segments`.
  int32 num_speakers = 2;
  // Total audio duration in seconds (0 if unknown).
  float duration = 3;
  // Optional, when the backend bundles transcription.
  string language = 4;
}
// SoundGenerationRequest is the request of the SoundGeneration RPC.
//
// Every field from `duration` onwards is declared `optional`, so receivers
// can tell an unset value from the type's zero value.
message SoundGenerationRequest {
  string text = 1;
  string model = 2;
  string dst = 3;
  optional float duration = 4;
  optional float temperature = 5;
  optional bool sample = 6;
  optional string src = 7;
  optional int32 src_divisor = 8;
  optional bool think = 9;
  optional string caption = 10;
  optional string lyrics = 11;
  optional int32 bpm = 12;
  optional string keyscale = 13;
  optional string language = 14;
  optional string timesignature = 15;
  // Number 16 is skipped by the surrounding fields. It is reserved so a
  // future field cannot take it and be misread by peers built against an
  // older schema. NOTE(review): confirm 16 was a retired field.
  reserved 16;
  optional bool instrumental = 17;
}
// TokenizationResponse is the reply of the TokenizeString RPC.
// NOTE(review): `length` is presumably the number of entries in `tokens`;
// confirm.
message TokenizationResponse {
  int32 length = 1;
  repeated int32 tokens = 2;
}

// MemoryUsageData is the memory report embedded in StatusResponse: a total
// plus a named breakdown.
// NOTE(review): the unit (bytes?) is not stated in this file.
message MemoryUsageData {
  uint64 total = 1;
  map<string, uint64> breakdown = 2;
}
// StatusResponse is the reply of the Status RPC.
message StatusResponse {
  // State enumerates the states a backend can report. UNINITIALIZED is the
  // zero value, so it is what an unset `state` field reads as. ERROR is the
  // only negative value.
  enum State {
    UNINITIALIZED = 0;
    BUSY = 1;
    READY = 2;
    ERROR = -1;
  }

  State state = 1;
  MemoryUsageData memory = 2;
}
// Message is one chat message; PredictOptions.Messages holds a list of
// these.
message Message {
  string role = 1;
  string content = 2;

  // Optional fields for OpenAI-compatible message format.

  // Tool name (for tool messages).
  string name = 3;
  // Tool call ID (for tool messages).
  string tool_call_id = 4;
  // Reasoning content (for thinking models).
  string reasoning_content = 5;
  // Tool calls as JSON string (for assistant messages with tool calls).
  string tool_calls = 6;
}
// DetectOptions is the request of the Detect RPC.
message DetectOptions {
  string src = 1;
  // Text prompt (for SAM 3 PCS mode).
  string prompt = 2;
  // Point coordinates as [x1, y1, label1, x2, y2, label2, ...]
  // (label: 1=pos, 0=neg).
  repeated float points = 3;
  // Box coordinates as [x1, y1, x2, y2, ...].
  repeated float boxes = 4;
  // Detection confidence threshold.
  float threshold = 5;
}

// Detection is a single entry of DetectResponse.Detections.
message Detection {
  float x = 1;
  float y = 2;
  float width = 3;
  float height = 4;
  float confidence = 5;
  string class_name = 6;
  // PNG-encoded binary segmentation mask.
  bytes mask = 7;
}

// DetectResponse is the reply of the Detect RPC.
message DetectResponse {
  repeated Detection Detections = 1;
}
// --- Face recognition messages ---

// FacialArea is a rectangle given as x, y, w and h.
// NOTE(review): coordinate origin and unit are not stated in this file.
message FacialArea {
  float x = 1;
  float y = 2;
  float w = 3;
  float h = 4;
}

// FaceVerifyRequest is the request of the FaceVerify RPC: two images to
// compare.
message FaceVerifyRequest {
  // Base64-encoded image.
  string img1 = 1;
  // Base64-encoded image.
  string img2 = 2;
  // Cosine-distance threshold; 0 = use backend default.
  float threshold = 3;
  // Run MiniFASNet liveness on each image; failed liveness forces
  // verified=false.
  bool anti_spoofing = 4;
}

// FaceVerifyResponse is the reply of the FaceVerify RPC.
message FaceVerifyResponse {
  bool verified = 1;
  // 1 - cosine_similarity.
  float distance = 2;
  float threshold = 3;
  // 0-100.
  float confidence = 4;
  // e.g. "buffalo_l".
  string model = 5;
  FacialArea img1_area = 6;
  FacialArea img2_area = 7;
  float processing_time_ms = 8;
  // Anti-spoofing result when enabled.
  bool img1_is_real = 9;
  float img1_antispoof_score = 10;
  bool img2_is_real = 11;
  float img2_antispoof_score = 12;
}
// FaceAnalyzeRequest is the request of the FaceAnalyze RPC.
message FaceAnalyzeRequest {
  // Base64-encoded image.
  string img = 1;
  // Subset of ["age","gender","emotion","race"]; empty = all-supported.
  repeated string actions = 2;
  bool anti_spoofing = 3;
}

// FaceAnalysis is a single entry of FaceAnalyzeResponse.faces.
message FaceAnalysis {
  FacialArea region = 1;
  float face_confidence = 2;
  float age = 3;
  // "Man" | "Woman".
  string dominant_gender = 4;
  map<string, float> gender = 5;
  // Reserved; empty in MVP.
  string dominant_emotion = 6;
  map<string, float> emotion = 7;
  // Not populated.
  string dominant_race = 8;
  map<string, float> race = 9;
  // Anti-spoofing result when enabled.
  bool is_real = 10;
  float antispoof_score = 11;
}

// FaceAnalyzeResponse is the reply of the FaceAnalyze RPC.
message FaceAnalyzeResponse {
  repeated FaceAnalysis faces = 1;
}
// --- Voice (speaker) recognition messages ---
//
// Analogous to the Face* messages above, but for speaker biometrics.
// Audio fields accept a filesystem path (same convention as
// TranscriptRequest.dst). The HTTP layer materialises base64 / URL /
// data-URI inputs to a temp file before calling the gRPC backend.

// VoiceVerifyRequest is the request of the VoiceVerify RPC: two audio clips
// to compare.
message VoiceVerifyRequest {
  // Path to first audio clip.
  string audio1 = 1;
  // Path to second audio clip.
  string audio2 = 2;
  // Cosine-distance threshold; 0 = use backend default.
  float threshold = 3;
  // Reserved for future AASIST bolt-on.
  bool anti_spoofing = 4;
}

// VoiceVerifyResponse is the reply of the VoiceVerify RPC.
message VoiceVerifyResponse {
  bool verified = 1;
  // 1 - cosine_similarity.
  float distance = 2;
  float threshold = 3;
  // 0-100.
  float confidence = 4;
  // e.g. "speechbrain/spkrec-ecapa-voxceleb".
  string model = 5;
  float processing_time_ms = 6;
}
// VoiceAnalyzeRequest is the request of the VoiceAnalyze RPC.
message VoiceAnalyzeRequest {
  // Path to audio clip.
  string audio = 1;
  // Subset of ["age","gender","emotion"]; empty = all-supported.
  repeated string actions = 2;
}

// VoiceAnalysis is a single entry of VoiceAnalyzeResponse.segments.
message VoiceAnalysis {
  // Segment start time in seconds (0 if single-utterance).
  float start = 1;
  // Segment end time in seconds.
  float end = 2;
  float age = 3;
  string dominant_gender = 4;
  map<string, float> gender = 5;
  string dominant_emotion = 6;
  map<string, float> emotion = 7;
}

// VoiceAnalyzeResponse is the reply of the VoiceAnalyze RPC.
message VoiceAnalyzeResponse {
  repeated VoiceAnalysis segments = 1;
}

// VoiceEmbedRequest is the request of the VoiceEmbed RPC.
message VoiceEmbedRequest {
  // Path to audio clip.
  string audio = 1;
}

// VoiceEmbedResponse is the reply of the VoiceEmbed RPC.
message VoiceEmbedResponse {
  repeated float embedding = 1;
  string model = 2;
}
// ToolFormatMarkers lists the textual markers of a model's tool-call
// format. It is embedded in ModelMetadataResponse.tool_format.
message ToolFormatMarkers {
  // "json_native", "tag_with_json", "tag_with_tagged".
  string format_type = 1;

  // Tool section markers.

  // e.g., "<tool_call>", "[TOOL_CALLS]".
  string section_start = 2;
  // e.g., "</tool_call>".
  string section_end = 3;
  // e.g., "<|tool_call_begin|>".
  string per_call_start = 4;
  // e.g., "<|tool_call_end|>".
  string per_call_end = 5;

  // Function name markers (TAG_WITH_JSON / TAG_WITH_TAGGED).

  // e.g., "<function=".
  string func_name_prefix = 6;
  // e.g., ">".
  string func_name_suffix = 7;
  // e.g., "</function>".
  string func_close = 8;

  // Argument markers (TAG_WITH_TAGGED).

  // e.g., "<param=".
  string arg_name_prefix = 9;
  // e.g., ">".
  string arg_name_suffix = 10;
  string arg_value_prefix = 11;
  // e.g., "</param>".
  string arg_value_suffix = 12;
  // e.g., "\n".
  string arg_separator = 13;

  // JSON format fields (JSON_NATIVE).

  // e.g., "name".
  string name_field = 14;
  // e.g., "arguments".
  string args_field = 15;
  // e.g., "id".
  string id_field = 16;
  bool fun_name_is_key = 17;
  bool tools_array_wrapped = 18;

  reserved 19;

  // Reasoning markers.

  // e.g., "<think>".
  string reasoning_start = 20;
  // e.g., "</think>".
  string reasoning_end = 21;

  // Content markers.
  string content_start = 22;
  string content_end = 23;

  // Args wrapper markers.

  // e.g., "<args>".
  string args_start = 24;
  // e.g., "</args>".
  string args_end = 25;

  // JSON parameter ordering.

  // e.g., "function" (wrapper key in JSON).
  string function_field = 26;
  repeated string parameter_order = 27;

  // Generated ID field (alternative field name for generated IDs).
  // e.g., "call_id".
  string gen_id_field = 28;

  // Call ID markers (position and delimiters for tool call IDs).

  // "none", "pre_func_name", "between_func_and_args", "post_args".
  string call_id_position = 29;
  // e.g., "[CALL_ID]".
  string call_id_prefix = 30;
  // e.g., "".
  string call_id_suffix = 31;
}
// AudioEncodeRequest is the request of the AudioEncode RPC: PCM data plus
// its sample rate, channel count and free-form options.
message AudioEncodeRequest {
  bytes pcm_data = 1;
  int32 sample_rate = 2;
  int32 channels = 3;
  map<string, string> options = 4;
}

// AudioEncodeResult is the reply of the AudioEncode RPC: the encoded
// frames.
message AudioEncodeResult {
  repeated bytes frames = 1;
  int32 sample_rate = 2;
  int32 samples_per_frame = 3;
}

// AudioDecodeRequest is the request of the AudioDecode RPC: the frames to
// decode plus free-form options.
message AudioDecodeRequest {
  repeated bytes frames = 1;
  map<string, string> options = 2;
}

// AudioDecodeResult is the reply of the AudioDecode RPC: the decoded PCM
// data.
message AudioDecodeResult {
  bytes pcm_data = 1;
  int32 sample_rate = 2;
  int32 samples_per_frame = 3;
}
// AudioTransformRequest is the request of the AudioTransform RPC.
//
// Generic audio transform: an audio-in, audio-out operation, optionally
// conditioned on a second reference signal. Concrete transforms include
// AEC + noise suppression + dereverberation (LocalVQE), voice conversion
// (reference = target speaker), pitch shifting, etc.
message AudioTransformRequest {
  // Required, primary input file path.
  string audio_path = 1;
  // Optional auxiliary; empty => zero-fill.
  string reference_path = 2;
  // Required, output file path.
  string dst = 3;
  // Backend-specific tuning.
  map<string, string> params = 4;
}

// AudioTransformResult is the reply of the AudioTransform RPC.
message AudioTransformResult {
  string dst = 1;
  int32 sample_rate = 2;
  int32 samples = 3;
  bool reference_provided = 4;
}
// AudioTransformFrameRequest is the client message of the
// AudioTransformStream RPC.
//
// Bidirectional streaming audio transform. The first message MUST carry a
// Config; subsequent messages carry Frames. A second Config mid-stream
// resets streaming state before the next frame.
message AudioTransformFrameRequest {
  // Exactly one of these is set per message.
  oneof payload {
    AudioTransformStreamConfig config = 1;
    AudioTransformFrame frame = 2;
  }
}

// AudioTransformStreamConfig is the `config` payload of
// AudioTransformFrameRequest.
message AudioTransformStreamConfig {
  // SampleFormat names the PCM sample encoding. F32_LE is the zero value,
  // so it is what an unset `sample_format` reads as.
  enum SampleFormat {
    F32_LE = 0;
    S16_LE = 1;
  }

  SampleFormat sample_format = 1;
  // 0 => backend default.
  int32 sample_rate = 2;
  // 0 => backend default.
  int32 frame_samples = 3;
  map<string, string> params = 4;
  // Reset streaming state before next frame.
  bool reset = 5;
}

// AudioTransformFrame is the `frame` payload of
// AudioTransformFrameRequest.
message AudioTransformFrame {
  // frame_samples samples in stream's format.
  bytes audio_pcm = 1;
  // Empty => zero-fill (silent reference).
  bytes reference_pcm = 2;
}

// AudioTransformFrameResponse is the server message of the
// AudioTransformStream RPC.
message AudioTransformFrameResponse {
  bytes pcm = 1;
  int64 frame_index = 2;
}
// === AudioToAudioStream messages =========================================
//
// Bidirectional stream between the LocalAI core and an any-to-any audio
// model. The client opens the stream with a Config payload, then alternates
// Frame (input audio) and Control (turn boundaries, function-call results,
// session updates) payloads. The server streams back typed events: audio
// frames carry PCM in `pcm`; transcript / tool-call deltas carry JSON in
// `meta`; the stream ends with a `response.done` (success) or `error` event.

// AudioToAudioRequest is the client message of the AudioToAudioStream RPC.
message AudioToAudioRequest {
  // Exactly one of these is set per message.
  oneof payload {
    AudioToAudioConfig config = 1;
    AudioToAudioFrame frame = 2;
    AudioToAudioControl control = 3;
  }
}
// AudioToAudioConfig is the `config` payload of AudioToAudioRequest.
message AudioToAudioConfig {
  // PCM format for client→server audio. 0 => backend default
  // (16 kHz for the LFM2-Audio Conformer encoder).
  int32 input_sample_rate = 1;

  // Preferred server→client audio rate. 0 => backend default
  // (24 kHz for the LFM2-Audio vocoder).
  int32 output_sample_rate = 2;

  // Optional system prompt override. Empty => backend chooses based on
  // mode (e.g. "Respond with interleaved text and audio.").
  string system_prompt = 3;

  // Optional baked-voice id. Models that only ship a fixed set of
  // voices (e.g. LFM2-Audio: us_male/us_female/uk_male/uk_female) match
  // this against their voice table; an empty string keeps the default.
  string voice = 4;

  // JSON-encoded array of tool definitions in OpenAI Chat Completions
  // format. Empty => no tools.
  string tools = 5;

  // Free-form sampling / decoding parameters (temperature, top_k,
  // max_new_tokens, audio_top_k, etc).
  map<string, string> params = 6;

  // True => reset any session-scoped state before processing further
  // frames on this stream. The first Config implicitly resets.
  bool reset = 7;
}
// AudioToAudioFrame is the `frame` payload of AudioToAudioRequest.
message AudioToAudioFrame {
  // Raw PCM s16le mono at config.input_sample_rate. Empty pcm +
  // end_of_input is a valid "user finished speaking" marker without
  // trailing audio.
  bytes pcm = 1;

  // Marks the last frame of a user turn. The backend may begin emitting
  // a response immediately after seeing this.
  bool end_of_input = 2;
}

// AudioToAudioControl is the `control` payload of AudioToAudioRequest.
message AudioToAudioControl {
  // Free-form control event names. Initial set:
  //   "input_audio_buffer.commit"  — user finished speaking
  //   "response.cancel"            — abort in-flight generation
  //   "conversation.item.create"   — inject a non-audio item (e.g.
  //                                  function_call_output as JSON in
  //                                  `payload`)
  //   "session.update"             — re-configure mid-stream
  string event = 1;

  // Event-specific JSON payload.
  bytes payload = 2;
}
// AudioToAudioResponse is the server message of the AudioToAudioStream RPC.
message AudioToAudioResponse {
  // Event identifies what this frame carries. Mirrors the OpenAI Realtime
  // API server-event names where applicable. Initial set:
  //   "response.audio.delta"
  //   "response.audio_transcript.delta"
  //   "response.function_call_arguments.delta"
  //   "response.function_call_arguments.done"
  //   "response.done"
  //   "error"
  string event = 1;

  // Populated when event = response.audio.delta.
  bytes pcm = 2;

  // Populated alongside pcm to identify its rate. 0 => same as the
  // session's negotiated output_sample_rate.
  int32 sample_rate = 3;

  // JSON payload for non-PCM events (transcript chunk, tool args, error
  // body).
  bytes meta = 4;

  // Monotonic per-stream counter, useful for client reordering and
  // debugging.
  int64 sequence = 5;
}
// ModelMetadataResponse is the reply of the ModelMetadata RPC.
message ModelMetadataResponse {
  bool supports_thinking = 1;
  // The rendered chat template with enable_thinking=true (empty if not
  // applicable).
  string rendered_template = 2;
  // Auto-detected tool format markers from differential template analysis.
  ToolFormatMarkers tool_format = 3;
  // Marker the backend expects in the prompt for each multimodal input
  // (images/audio/video). Empty when the backend does not use a marker.
  string media_marker = 4;
}
// Fine-tuning messages

// FineTuneRequest is the request of the StartFineTune RPC. Field numbers
// are allocated in per-section ranges (1-3, 10-13, 20-31, 40-41, 50-55, 60).
message FineTuneRequest {
  // Model identification.

  // HF model name or local path.
  string model = 1;
  // "lora", "loha", "lokr", "full" — what parameters to train.
  string training_type = 2;
  // "sft", "dpo", "grpo", "rloo", "reward", "kto", "orpo",
  // "network_training".
  string training_method = 3;

  // Adapter config (universal across LoRA/LoHa/LoKr for LLM + diffusion).

  // LoRA rank (r), default 16.
  int32 adapter_rank = 10;
  // Scaling factor, default 16.
  int32 adapter_alpha = 11;
  // Default 0.0.
  float adapter_dropout = 12;
  // Layer names to adapt.
  repeated string target_modules = 13;

  // Universal training hyperparameters.

  // Default 2e-4.
  float learning_rate = 20;
  // Default 3.
  int32 num_epochs = 21;
  // Default 2.
  int32 batch_size = 22;
  // Default 4.
  int32 gradient_accumulation_steps = 23;
  // Default 5.
  int32 warmup_steps = 24;
  // 0 = use epochs.
  int32 max_steps = 25;
  // 0 = only save final.
  int32 save_steps = 26;
  // Default 0.01.
  float weight_decay = 27;
  bool gradient_checkpointing = 28;
  // adamw_8bit, adamw, sgd, adafactor, prodigy.
  string optimizer = 29;
  // Default 3407.
  int32 seed = 30;
  // fp16, bf16, fp8, no.
  string mixed_precision = 31;

  // Dataset.

  // HF dataset ID, local file/dir path.
  string dataset_source = 40;
  // train, test, etc.
  string dataset_split = 41;

  // Output.
  string output_dir = 50;
  // Client-assigned or auto-generated.
  string job_id = 51;

  // Resume training from a checkpoint: path to checkpoint dir to resume
  // from.
  string resume_from_checkpoint = 55;

  // Backend-specific AND method-specific extensibility.
  map<string, string> extra_options = 60;
}
// FineTuneJobResult is the reply of the StartFineTune RPC.
message FineTuneJobResult {
  string job_id = 1;
  bool success = 2;
  string message = 3;
}

// FineTuneProgressRequest is the request of the FineTuneProgress RPC.
message FineTuneProgressRequest {
  string job_id = 1;
}

// FineTuneProgressUpdate is the message streamed by the FineTuneProgress
// RPC.
message FineTuneProgressUpdate {
  string job_id = 1;
  int32 current_step = 2;
  int32 total_steps = 3;
  float current_epoch = 4;
  float total_epochs = 5;
  float loss = 6;
  float learning_rate = 7;
  float grad_norm = 8;
  float eval_loss = 9;
  float eta_seconds = 10;
  float progress_percent = 11;
  // queued, caching, loading_model, loading_dataset, training, saving,
  // completed, failed, stopped.
  string status = 12;
  string message = 13;
  // Set when a checkpoint is saved.
  string checkpoint_path = 14;
  // Set when a sample is generated (video/image backends).
  string sample_path = 15;
  // Method-specific metrics.
  map<string, float> extra_metrics = 16;
}

// FineTuneStopRequest is the request of the StopFineTune RPC.
message FineTuneStopRequest {
  string job_id = 1;
  bool save_checkpoint = 2;
}

// ListCheckpointsRequest is the request of the ListCheckpoints RPC.
message ListCheckpointsRequest {
  string output_dir = 1;
}

// ListCheckpointsResponse is the reply of the ListCheckpoints RPC.
message ListCheckpointsResponse {
  repeated CheckpointInfo checkpoints = 1;
}

// CheckpointInfo is a single entry of ListCheckpointsResponse.checkpoints.
// NOTE(review): the format of `created_at` is not stated in this file.
message CheckpointInfo {
  string path = 1;
  int32 step = 2;
  float epoch = 3;
  float loss = 4;
  string created_at = 5;
}
// ExportModelRequest is the request of the ExportModel RPC.
message ExportModelRequest {
  string checkpoint_path = 1;
  string output_path = 2;
  // lora, loha, lokr, merged_16bit, merged_4bit, gguf, diffusers.
  string export_format = 3;
  // For GGUF: q4_k_m, q5_k_m, q8_0, f16, etc.
  string quantization_method = 4;
  // Base model name (for merge operations).
  string model = 5;
  map<string, string> extra_options = 6;
}
// Quantization messages

// QuantizationRequest is the request of the StartQuantization RPC.
message QuantizationRequest {
  // HF model name or local path.
  string model = 1;
  // q4_k_m, q5_k_m, q8_0, f16, etc.
  string quantization_type = 2;
  // Where to write output files.
  string output_dir = 3;
  // Client-assigned job ID.
  string job_id = 4;
  // hf_token, custom flags, etc.
  map<string, string> extra_options = 5;
}

// QuantizationJobResult is the reply of the StartQuantization RPC.
message QuantizationJobResult {
  string job_id = 1;
  bool success = 2;
  string message = 3;
}

// QuantizationProgressRequest is the request of the QuantizationProgress
// RPC.
message QuantizationProgressRequest {
  string job_id = 1;
}

// QuantizationProgressUpdate is the message streamed by the
// QuantizationProgress RPC.
message QuantizationProgressUpdate {
  string job_id = 1;
  float progress_percent = 2;
  // queued, downloading, converting, quantizing, completed, failed,
  // stopped.
  string status = 3;
  string message = 4;
  // Set when completed — path to the output GGUF file.
  string output_file = 5;
  // e.g. file_size_mb, compression_ratio.
  map<string, float> extra_metrics = 6;
}

// QuantizationStopRequest is the request of the StopQuantization RPC.
message QuantizationStopRequest {
  string job_id = 1;
}
Xet Storage Details
- Size:
- 32.5 kB
- Xet hash:
- 781e82fb740c3d72989546f657f1bea6efd9f422885a3c4adb551c1a4887c32e
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.