Buckets:

MisterAI
/

LocalAI_Demo_backends

Files

xet

MisterAI/LocalAI_Demo_backends / cpu-diffusers /backend.proto

MisterAI

1 day ago

download

raw

32.5 kB

	// Wire-contract definition for the LocalAI gRPC backend interface.
	syntax = "proto3";

	// All messages and the Backend service are declared in this package.
	package backend;

	// Code-generation options for the Go and Java targets.
	option go_package = "github.com/go-skynet/LocalAI/pkg/grpc/proto";
	option java_multiple_files = true;
	option java_package = "io.skynet.localai.backend";
	option java_outer_classname = "LocalAIBackend";

	// Backend is the gRPC service defined by this file. An RPC is unary unless
	// its request and/or response type is marked `stream`.
	service Backend {
	  // Health, model lifecycle and text prediction.
	  rpc Health(HealthMessage) returns (Reply) {}
	  rpc Free(HealthMessage) returns (Result) {}
	  rpc Predict(PredictOptions) returns (Reply) {}
	  rpc LoadModel(ModelOptions) returns (Result) {}
	  rpc PredictStream(PredictOptions) returns (stream Reply) {}
	  rpc Embedding(PredictOptions) returns (EmbeddingResult) {}

	  // Image / video generation.
	  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
	  rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}

	  // Speech and sound.
	  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
	  rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
	  rpc TTS(TTSRequest) returns (Result) {}
	  rpc TTSStream(TTSRequest) returns (stream Reply) {}
	  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}

	  // Tokenization and backend status.
	  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
	  rpc Status(HealthMessage) returns (StatusResponse) {}

	  // Detection, face and voice (speaker) recognition.
	  rpc Detect(DetectOptions) returns (DetectResponse) {}
	  rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
	  rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
	  rpc VoiceVerify(VoiceVerifyRequest) returns (VoiceVerifyResponse) {}
	  rpc VoiceAnalyze(VoiceAnalyzeRequest) returns (VoiceAnalyzeResponse) {}
	  rpc VoiceEmbed(VoiceEmbedRequest) returns (VoiceEmbedResponse) {}

	  // Key/value store keyed by float vectors (see StoresKey / StoresValue).
	  rpc StoresSet(StoresSetOptions) returns (Result) {}
	  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
	  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
	  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}

	  rpc Rerank(RerankRequest) returns (RerankResult) {}

	  rpc GetMetrics(MetricsRequest) returns (MetricsResponse) {}

	  rpc VAD(VADRequest) returns (VADResponse) {}

	  rpc Diarize(DiarizeRequest) returns (DiarizeResponse) {}

	  // Audio codec encode / decode.
	  rpc AudioEncode(AudioEncodeRequest) returns (AudioEncodeResult) {}
	  rpc AudioDecode(AudioDecodeRequest) returns (AudioDecodeResult) {}

	  // Audio-in / audio-out transforms: file based, and bidirectional streaming.
	  rpc AudioTransform(AudioTransformRequest) returns (AudioTransformResult) {}
	  rpc AudioTransformStream(stream AudioTransformFrameRequest) returns (stream AudioTransformFrameResponse) {}

	  // AudioToAudioStream is the bidirectional any-to-any speech-to-speech
	  // (S2S) RPC. Backends that load an S2S model consume input audio frames
	  // and emit interleaved audio + transcript + tool-call deltas as typed
	  // events. Backends without S2S support return UNIMPLEMENTED.
	  rpc AudioToAudioStream(stream AudioToAudioRequest) returns (stream AudioToAudioResponse) {}

	  rpc ModelMetadata(ModelOptions) returns (ModelMetadataResponse) {}

	  // Fine-tuning RPCs.
	  rpc StartFineTune(FineTuneRequest) returns (FineTuneJobResult) {}
	  rpc FineTuneProgress(FineTuneProgressRequest) returns (stream FineTuneProgressUpdate) {}
	  rpc StopFineTune(FineTuneStopRequest) returns (Result) {}
	  rpc ListCheckpoints(ListCheckpointsRequest) returns (ListCheckpointsResponse) {}
	  rpc ExportModel(ExportModelRequest) returns (Result) {}

	  // Quantization RPCs.
	  rpc StartQuantization(QuantizationRequest) returns (QuantizationJobResult) {}
	  rpc QuantizationProgress(QuantizationProgressRequest) returns (stream QuantizationProgressUpdate) {}
	  rpc StopQuantization(QuantizationStopRequest) returns (Result) {}
	}

	// MetricsRequest is the request message of the GetMetrics RPC. It currently
	// carries no fields.
	message MetricsRequest {}

	// MetricsResponse is the response message of the GetMetrics RPC.
	message MetricsResponse {
	  int32 slot_id = 1;
	  // Stores the prompt as a JSON string.
	  string prompt_json_for_slot = 2;
	  float tokens_per_second = 3;
	  int32 tokens_generated = 4;
	  int32 prompt_tokens_processed = 5;
	}

	// RerankRequest is the request message of the Rerank RPC: one query and the
	// documents to score against it.
	message RerankRequest {
	  string query = 1;
	  repeated string documents = 2;
	  // NOTE(review): presumably caps the number of returned results — confirm.
	  int32 top_n = 3;
	}

	// RerankResult is the response message of the Rerank RPC: token usage plus
	// one DocumentResult per returned document.
	message RerankResult {
	  Usage usage = 1;
	  repeated DocumentResult results = 2;
	}

	// Usage reports token counts; it is embedded in RerankResult.
	message Usage {
	  int32 total_tokens = 1;
	  int32 prompt_tokens = 2;
	}

	// DocumentResult is a single scored document inside a RerankResult.
	message DocumentResult {
	  // NOTE(review): presumably the position of the document in
	  // RerankRequest.documents — confirm.
	  int32 index = 1;
	  string text = 2;
	  float relevance_score = 3;
	}

	// StoresKey is a key used by the Stores* RPCs, expressed as a list of floats.
	message StoresKey {
	  repeated float Floats = 1;
	}

	// StoresValue is a value used by the Stores* RPCs, carried as raw bytes.
	message StoresValue {
	  bytes Bytes = 1;
	}

	// StoresSetOptions is the request message of the StoresSet RPC.
	// NOTE(review): Keys and Values look like parallel lists paired by index —
	// confirm against the backend implementation.
	message StoresSetOptions {
	  repeated StoresKey Keys = 1;
	  repeated StoresValue Values = 2;
	}

	// StoresDeleteOptions is the request message of the StoresDelete RPC.
	message StoresDeleteOptions {
	  repeated StoresKey Keys = 1;
	}

	// StoresGetOptions is the request message of the StoresGet RPC.
	message StoresGetOptions {
	  repeated StoresKey Keys = 1;
	}

	// StoresGetResult is the response message of the StoresGet RPC.
	// NOTE(review): Keys and Values look like parallel lists paired by index —
	// confirm against the backend implementation.
	message StoresGetResult {
	  repeated StoresKey Keys = 1;
	  repeated StoresValue Values = 2;
	}

	// StoresFindOptions is the request message of the StoresFind RPC: a single
	// query key and a TopK count.
	message StoresFindOptions {
	  StoresKey Key = 1;
	  int32 TopK = 2;
	}

	// StoresFindResult is the response message of the StoresFind RPC.
	// NOTE(review): Keys, Values and Similarities look like parallel lists
	// paired by index — confirm against the backend implementation.
	message StoresFindResult {
	  repeated StoresKey Keys = 1;
	  repeated StoresValue Values = 2;
	  repeated float Similarities = 3;
	}

	// HealthMessage is the empty request message shared by the Health, Free and
	// Status RPCs.
	message HealthMessage {}

	// PredictOptions is the request message of the Predict, PredictStream,
	// Embedding and TokenizeString RPCs. It carries the prompt (or chat
	// messages) together with per-request settings.
	message PredictOptions {
	  // Field number 24 is skipped in the numbering below. It is reserved here so
	  // that a new field cannot silently take it over with a different type.
	  // NOTE(review): presumably it belonged to a removed field — confirm.
	  reserved 24;

	  string Prompt = 1;
	  int32 Seed = 2;
	  int32 Threads = 3;
	  int32 Tokens = 4;
	  int32 TopK = 5;
	  int32 Repeat = 6;
	  int32 Batch = 7;
	  int32 NKeep = 8;
	  float Temperature = 9;
	  float Penalty = 10;
	  bool F16KV = 11;
	  bool DebugMode = 12;
	  repeated string StopPrompts = 13;
	  bool IgnoreEOS = 14;
	  float TailFreeSamplingZ = 15;
	  float TypicalP = 16;
	  float FrequencyPenalty = 17;
	  float PresencePenalty = 18;
	  int32 Mirostat = 19;
	  float MirostatETA = 20;
	  float MirostatTAU = 21;
	  bool PenalizeNL = 22;
	  string LogitBias = 23;
	  bool MLock = 25;
	  bool MMap = 26;
	  bool PromptCacheAll = 27;
	  bool PromptCacheRO = 28;
	  string Grammar = 29;
	  string MainGPU = 30;
	  string TensorSplit = 31;
	  float TopP = 32;
	  string PromptCachePath = 33;
	  bool Debug = 34;
	  repeated int32 EmbeddingTokens = 35;
	  string Embeddings = 36;
	  float RopeFreqBase = 37;
	  float RopeFreqScale = 38;
	  float NegativePromptScale = 39;
	  string NegativePrompt = 40;
	  int32 NDraft = 41;
	  repeated string Images = 42;
	  bool UseTokenizerTemplate = 43;
	  repeated Message Messages = 44;
	  repeated string Videos = 45;
	  repeated string Audios = 46;
	  string CorrelationId = 47;
	  // JSON array of available tools/functions for tool calling.
	  string Tools = 48;
	  // JSON string or object specifying tool choice behavior.
	  string ToolChoice = 49;
	  // Number of top logprobs to return (maps to OpenAI logprobs parameter).
	  int32 Logprobs = 50;
	  // Number of top logprobs to return per token (maps to OpenAI top_logprobs
	  // parameter).
	  int32 TopLogprobs = 51;
	  // Generic per-request metadata (e.g., enable_thinking).
	  map<string, string> Metadata = 52;
	  // Minimum probability sampling threshold (0.0 = disabled).
	  float MinP = 53;
	}

	// ToolCallDelta is one incremental tool-call update produced by the C++
	// parser. The same shape is used while streaming (partial diffs) and for
	// non-streaming replies (final tool calls).
	message ToolCallDelta {
	  // Tool call index (0-based).
	  int32 index = 1;
	  // Tool call ID (e.g., "call_abc123").
	  string id = 2;
	  // Function name (set on first appearance).
	  string name = 3;
	  // Arguments chunk (incremental in streaming, full in non-streaming).
	  string arguments = 4;
	}

	// ChatDelta groups the incremental content, reasoning and tool-call updates
	// parsed by the C++ backend.
	message ChatDelta {
	  // Content text delta.
	  string content = 1;
	  // Reasoning/thinking text delta.
	  string reasoning_content = 2;
	  // Tool call deltas.
	  repeated ToolCallDelta tool_calls = 3;
	}

	// Reply is the response message carrying a prediction result. It is returned
	// by Health and Predict, and streamed by PredictStream and TTSStream.
	message Reply {
	  bytes message = 1;
	  int32 tokens = 2;
	  int32 prompt_tokens = 3;
	  double timing_prompt_processing = 4;
	  double timing_token_generation = 5;
	  bytes audio = 6;
	  // JSON-encoded logprobs data matching OpenAI format.
	  bytes logprobs = 7;
	  // Parsed chat deltas from C++ autoparser (streaming + non-streaming).
	  repeated ChatDelta chat_deltas = 8;
	}

	// GrammarTrigger holds a single trigger word; ModelOptions carries a list of
	// these in its GrammarTriggers field.
	message GrammarTrigger {
	  string word = 1;
	}

	// ModelOptions is the request message of the LoadModel and ModelMetadata
	// RPCs. It bundles the model location with backend-specific load settings;
	// the section comments below name the backend family each group targets.
	message ModelOptions {
	  // Field numbers 16 and 22-25 are skipped in the numbering below. They are
	  // reserved here so that new fields cannot silently take them over with a
	  // different type.
	  // NOTE(review): presumably they belonged to removed fields — confirm.
	  reserved 16, 22 to 25;

	  string Model = 1;
	  int32 ContextSize = 2;
	  int32 Seed = 3;
	  int32 NBatch = 4;
	  bool F16Memory = 5;
	  bool MLock = 6;
	  bool MMap = 7;
	  bool VocabOnly = 8;
	  bool LowVRAM = 9;
	  bool Embeddings = 10;
	  bool NUMA = 11;
	  int32 NGPULayers = 12;
	  string MainGPU = 13;
	  string TensorSplit = 14;
	  int32 Threads = 15;
	  float RopeFreqBase = 17;
	  float RopeFreqScale = 18;
	  float RMSNormEps = 19;
	  int32 NGQA = 20;
	  string ModelFile = 21;

	  // Diffusers
	  string PipelineType = 26;
	  string SchedulerType = 27;
	  bool CUDA = 28;
	  float CFGScale = 29;
	  bool IMG2IMG = 30;
	  string CLIPModel = 31;
	  string CLIPSubfolder = 32;
	  int32 CLIPSkip = 33;
	  string ControlNet = 48;

	  string Tokenizer = 34;

	  // LLM (llama.cpp)
	  string LoraBase = 35;
	  string LoraAdapter = 36;
	  float LoraScale = 42;

	  bool NoMulMatQ = 37;
	  string DraftModel = 39;

	  string AudioPath = 38;

	  // vllm
	  string Quantization = 40;
	  float GPUMemoryUtilization = 50;
	  bool TrustRemoteCode = 51;
	  bool EnforceEager = 52;
	  int32 SwapSpace = 53;
	  int32 MaxModelLen = 54;
	  int32 TensorParallelSize = 55;
	  string LoadFormat = 58;
	  bool DisableLogStatus = 66;
	  string DType = 67;
	  int32 LimitImagePerPrompt = 68;
	  int32 LimitVideoPerPrompt = 69;
	  int32 LimitAudioPerPrompt = 70;

	  string MMProj = 41;

	  string RopeScaling = 43;
	  float YarnExtFactor = 44;
	  float YarnAttnFactor = 45;
	  float YarnBetaFast = 46;
	  float YarnBetaSlow = 47;

	  string Type = 49;

	  string FlashAttention = 56;
	  bool NoKVOffload = 57;

	  string ModelPath = 59;

	  repeated string LoraAdapters = 60;
	  repeated float LoraScales = 61;

	  repeated string Options = 62;

	  string CacheTypeKey = 63;
	  string CacheTypeValue = 64;

	  repeated GrammarTrigger GrammarTriggers = 65;

	  bool Reranking = 71;

	  repeated string Overrides = 72;

	  // EngineArgs carries a JSON-encoded map of backend-native engine arguments
	  // applied verbatim to the backend's engine constructor (e.g. vLLM
	  // AsyncEngineArgs). Unknown keys produce an error at LoadModel time.
	  string EngineArgs = 73;
	}

	// Result is a generic status response (a success flag plus a message). Many
	// Backend RPCs return it, e.g. LoadModel, GenerateImage and TTS.
	message Result {
	  string message = 1;
	  bool success = 2;
	}

	// EmbeddingResult is the response message of the Embedding RPC.
	message EmbeddingResult {
	  repeated float embeddings = 1;
	}

	// TranscriptRequest is the request message of the AudioTranscription and
	// AudioTranscriptionStream RPCs.
	message TranscriptRequest {
	  // Field number 1 is skipped in the numbering below. It is reserved here so
	  // that a new field cannot silently take it over with a different type.
	  // NOTE(review): presumably it belonged to a removed field — confirm.
	  reserved 1;

	  // Filesystem path of the audio to transcribe.
	  string dst = 2;
	  string language = 3;
	  uint32 threads = 4;
	  bool translate = 5;
	  bool diarize = 6;
	  string prompt = 7;
	  float temperature = 8;
	  repeated string timestamp_granularities = 9;
	  bool stream = 10;
	}

	// TranscriptResult is the response message of the AudioTranscription RPC; it
	// is also embedded as the final result of the streaming variant.
	message TranscriptResult {
	  repeated TranscriptSegment segments = 1;
	  string text = 2;
	  string language = 3;
	  float duration = 4;
	}

	// TranscriptStreamResponse is one item streamed by the
	// AudioTranscriptionStream RPC.
	// NOTE(review): presumably `delta` carries incremental text and
	// `final_result` is set once at the end of the stream — confirm.
	message TranscriptStreamResponse {
	  string delta = 1;
	  TranscriptResult final_result = 2;
	}

	// TranscriptWord is a single timed word inside a TranscriptSegment.
	// NOTE(review): the unit of `start` / `end` is not stated in this file —
	// confirm with the producing backend.
	message TranscriptWord {
	  int64 start = 1;
	  int64 end = 2;
	  string text = 3;
	}

	// TranscriptSegment is one timed segment of a TranscriptResult.
	// NOTE(review): the unit of `start` / `end` is not stated in this file —
	// confirm with the producing backend.
	message TranscriptSegment {
	  int32 id = 1;
	  int64 start = 2;
	  int64 end = 3;
	  string text = 4;
	  repeated int32 tokens = 5;
	  string speaker = 6;
	  repeated TranscriptWord words = 7;
	}

	// GenerateImageRequest is the request message of the GenerateImage RPC.
	message GenerateImageRequest {
	  // Field number 3 is skipped in the numbering below. It is reserved here so
	  // that a new field cannot silently take it over with a different type.
	  // NOTE(review): presumably it belonged to a removed field — confirm.
	  reserved 3;

	  int32 height = 1;
	  int32 width = 2;
	  int32 step = 4;
	  int32 seed = 5;
	  string positive_prompt = 6;
	  string negative_prompt = 7;
	  string dst = 8;
	  string src = 9;

	  // Diffusers
	  string EnableParameters = 10;
	  int32 CLIPSkip = 11;

	  // Reference images for models that support them (e.g., Flux Kontext).
	  repeated string ref_images = 12;
	}

	// GenerateVideoRequest is the request message of the GenerateVideo RPC.
	message GenerateVideoRequest {
	  string prompt = 1;
	  // Negative prompt for video generation.
	  string negative_prompt = 2;
	  // Path or base64 encoded image for the start frame.
	  string start_image = 3;
	  // Path or base64 encoded image for the end frame.
	  string end_image = 4;
	  int32 width = 5;
	  int32 height = 6;
	  // Number of frames to generate.
	  int32 num_frames = 7;
	  // Frames per second.
	  int32 fps = 8;
	  int32 seed = 9;
	  // Classifier-free guidance scale.
	  float cfg_scale = 10;
	  // Number of inference steps.
	  int32 step = 11;
	  // Output path for the generated video.
	  string dst = 12;
	}

	// TTSRequest is the request message of the TTS and TTSStream RPCs.
	message TTSRequest {
	  string text = 1;
	  string model = 2;
	  string dst = 3;
	  string voice = 4;
	  // Declared `optional`, so presence is tracked explicitly.
	  optional string language = 5;
	}

	// VADRequest is the request message of the VAD RPC; the audio is passed
	// inline as a list of floats.
	message VADRequest {
	  repeated float audio = 1;
	}

	// VADSegment is one start/end interval reported in a VADResponse.
	message VADSegment {
	  float start = 1;
	  float end = 2;
	}

	// VADResponse is the response message of the VAD RPC.
	message VADResponse {
	  repeated VADSegment segments = 1;
	}

	// --- Speaker diarization messages ---
	//
	// Pure speaker diarization: "who spoke when". Returns time-stamped segments
	// labelled with cluster IDs (the same string for the same speaker across
	// segments). Some backends (e.g. vibevoice.cpp) produce diarization as a
	// by-product of ASR and may also fill in `text` per segment; backends with a
	// dedicated diarization pipeline (e.g. sherpa-onnx pyannote) leave `text`
	// empty and emit only the segmentation.

	// DiarizeRequest is the request message of the Diarize RPC.
	message DiarizeRequest {
	  // Path to audio file (HTTP layer materialises uploads to a temp file).
	  string dst = 1;
	  uint32 threads = 2;
	  // Optional; only meaningful for transcription-bundling backends.
	  string language = 3;
	  // Exact speaker count if known (>0 forces); 0 = auto.
	  int32 num_speakers = 4;
	  // Hint when auto-detecting; 0 = unset.
	  int32 min_speakers = 5;
	  // Hint when auto-detecting; 0 = unset.
	  int32 max_speakers = 6;
	  // Distance threshold when num_speakers unknown; 0 = backend default.
	  float clustering_threshold = 7;
	  // Discard segments shorter than this (seconds); 0 = backend default.
	  float min_duration_on = 8;
	  // Merge gaps shorter than this (seconds); 0 = backend default.
	  float min_duration_off = 9;
	  // When the backend can emit per-segment transcript for free, ask it to
	  // populate `text`.
	  bool include_text = 10;
	}

	// DiarizeSegment is one speaker-labelled time span in a DiarizeResponse.
	message DiarizeSegment {
	  int32 id = 1;
	  // Seconds.
	  float start = 2;
	  // Seconds.
	  float end = 3;
	  // Backend-emitted speaker label (e.g. "0", "SPEAKER_00").
	  string speaker = 4;
	  // Optional per-segment transcript (empty unless include_text and
	  // supported).
	  string text = 5;
	}

	// DiarizeResponse is the response message of the Diarize RPC.
	message DiarizeResponse {
	  repeated DiarizeSegment segments = 1;
	  // Count of distinct speaker labels in `segments`.
	  int32 num_speakers = 2;
	  // Total audio duration in seconds (0 if unknown).
	  float duration = 3;
	  // Optional, when the backend bundles transcription.
	  string language = 4;
	}

	// SoundGenerationRequest is the request message of the SoundGeneration RPC.
	// Every field after `dst` is declared `optional`, so presence is tracked
	// explicitly for those fields.
	message SoundGenerationRequest {
	  // Field number 16 is skipped in the numbering below. It is reserved here
	  // so that a new field cannot silently take it over with a different type.
	  // NOTE(review): presumably it belonged to a removed field — confirm.
	  reserved 16;

	  string text = 1;
	  string model = 2;
	  string dst = 3;
	  optional float duration = 4;
	  optional float temperature = 5;
	  optional bool sample = 6;
	  optional string src = 7;
	  optional int32 src_divisor = 8;
	  optional bool think = 9;
	  optional string caption = 10;
	  optional string lyrics = 11;
	  optional int32 bpm = 12;
	  optional string keyscale = 13;
	  optional string language = 14;
	  optional string timesignature = 15;
	  optional bool instrumental = 17;
	}

	// TokenizationResponse is the response message of the TokenizeString RPC.
	message TokenizationResponse {
	  int32 length = 1;
	  repeated int32 tokens = 2;
	}

	// MemoryUsageData reports memory usage as a total plus a named breakdown; it
	// is embedded in StatusResponse.
	// NOTE(review): the unit (presumably bytes) is not stated in this file.
	message MemoryUsageData {
	  uint64 total = 1;
	  map<string, uint64> breakdown = 2;
	}

	// StatusResponse is the response message of the Status RPC.
	message StatusResponse {
	  // State of the backend. UNINITIALIZED is the zero value and therefore the
	  // default when the field is not set.
	  enum State {
	    UNINITIALIZED = 0;
	    BUSY = 1;
	    READY = 2;
	    ERROR = -1;
	  }

	  State state = 1;
	  MemoryUsageData memory = 2;
	}

	// Message is a single chat message; PredictOptions carries a list of these
	// in its Messages field.
	message Message {
	  string role = 1;
	  string content = 2;

	  // Optional fields for the OpenAI-compatible message format.

	  // Tool name (for tool messages).
	  string name = 3;
	  // Tool call ID (for tool messages).
	  string tool_call_id = 4;
	  // Reasoning content (for thinking models).
	  string reasoning_content = 5;
	  // Tool calls as JSON string (for assistant messages with tool calls).
	  string tool_calls = 6;
	}

	// DetectOptions is the request message of the Detect RPC.
	message DetectOptions {
	  string src = 1;
	  // Text prompt (for SAM 3 PCS mode).
	  string prompt = 2;
	  // Point coordinates as [x1, y1, label1, x2, y2, label2, ...]
	  // (label: 1=pos, 0=neg).
	  repeated float points = 3;
	  // Box coordinates as [x1, y1, x2, y2, ...].
	  repeated float boxes = 4;
	  // Detection confidence threshold.
	  float threshold = 5;
	}

	// Detection is a single detected object inside a DetectResponse.
	message Detection {
	  float x = 1;
	  float y = 2;
	  float width = 3;
	  float height = 4;
	  float confidence = 5;
	  string class_name = 6;
	  // PNG-encoded binary segmentation mask.
	  bytes mask = 7;
	}

	// DetectResponse is the response message of the Detect RPC.
	message DetectResponse {
	  repeated Detection Detections = 1;
	}

	// --- Face recognition messages ---

	// FacialArea is a rectangle (x, y, w, h) locating a face; it is used by
	// FaceVerifyResponse and FaceAnalysis.
	message FacialArea {
	  float x = 1;
	  float y = 2;
	  float w = 3;
	  float h = 4;
	}

	// FaceVerifyRequest is the request message of the FaceVerify RPC.
	message FaceVerifyRequest {
	  // Base64-encoded image.
	  string img1 = 1;
	  // Base64-encoded image.
	  string img2 = 2;
	  // Cosine-distance threshold; 0 = use backend default.
	  float threshold = 3;
	  // Run MiniFASNet liveness on each image; failed liveness forces
	  // verified=false.
	  bool anti_spoofing = 4;
	}

	// FaceVerifyResponse is the response message of the FaceVerify RPC.
	message FaceVerifyResponse {
	  bool verified = 1;
	  // 1 - cosine_similarity.
	  float distance = 2;
	  float threshold = 3;
	  // 0-100.
	  float confidence = 4;
	  // E.g. "buffalo_l".
	  string model = 5;
	  FacialArea img1_area = 6;
	  FacialArea img2_area = 7;
	  float processing_time_ms = 8;
	  // Anti-spoofing result when enabled.
	  bool img1_is_real = 9;
	  float img1_antispoof_score = 10;
	  bool img2_is_real = 11;
	  float img2_antispoof_score = 12;
	}

	// FaceAnalyzeRequest is the request message of the FaceAnalyze RPC.
	message FaceAnalyzeRequest {
	  // Base64-encoded image.
	  string img = 1;
	  // Subset of ["age","gender","emotion","race"]; empty = all-supported.
	  repeated string actions = 2;
	  bool anti_spoofing = 3;
	}

	// FaceAnalysis is the analysis of a single face inside a
	// FaceAnalyzeResponse.
	message FaceAnalysis {
	  FacialArea region = 1;
	  float face_confidence = 2;
	  float age = 3;
	  // "Man" | "Woman".
	  string dominant_gender = 4;
	  map<string, float> gender = 5;
	  // Reserved; empty in MVP.
	  string dominant_emotion = 6;
	  map<string, float> emotion = 7;
	  // Not populated.
	  string dominant_race = 8;
	  map<string, float> race = 9;
	  // Anti-spoofing result when enabled.
	  bool is_real = 10;
	  float antispoof_score = 11;
	}

	// FaceAnalyzeResponse is the response message of the FaceAnalyze RPC.
	message FaceAnalyzeResponse {
	  repeated FaceAnalysis faces = 1;
	}

	// --- Voice (speaker) recognition messages ---
	//
	// Analogous to the Face* messages above, but for speaker biometrics.
	// Audio fields accept a filesystem path (same convention as
	// TranscriptRequest.dst). The HTTP layer materialises base64 / URL /
	// data-URI inputs to a temp file before calling the gRPC backend.

	// VoiceVerifyRequest is the request message of the VoiceVerify RPC.
	message VoiceVerifyRequest {
	  // Path to first audio clip.
	  string audio1 = 1;
	  // Path to second audio clip.
	  string audio2 = 2;
	  // Cosine-distance threshold; 0 = use backend default.
	  float threshold = 3;
	  // Reserved for future AASIST bolt-on.
	  bool anti_spoofing = 4;
	}

	// VoiceVerifyResponse is the response message of the VoiceVerify RPC.
	message VoiceVerifyResponse {
	  bool verified = 1;
	  // 1 - cosine_similarity.
	  float distance = 2;
	  float threshold = 3;
	  // 0-100.
	  float confidence = 4;
	  // E.g. "speechbrain/spkrec-ecapa-voxceleb".
	  string model = 5;
	  float processing_time_ms = 6;
	}

	// VoiceAnalyzeRequest is the request message of the VoiceAnalyze RPC.
	message VoiceAnalyzeRequest {
	  // Path to audio clip.
	  string audio = 1;
	  // Subset of ["age","gender","emotion"]; empty = all-supported.
	  repeated string actions = 2;
	}

	// VoiceAnalysis is the analysis of one audio segment inside a
	// VoiceAnalyzeResponse.
	message VoiceAnalysis {
	  // Segment start time in seconds (0 if single-utterance).
	  float start = 1;
	  // Segment end time in seconds.
	  float end = 2;
	  float age = 3;
	  string dominant_gender = 4;
	  map<string, float> gender = 5;
	  string dominant_emotion = 6;
	  map<string, float> emotion = 7;
	}

	// VoiceAnalyzeResponse is the response message of the VoiceAnalyze RPC.
	message VoiceAnalyzeResponse {
	  repeated VoiceAnalysis segments = 1;
	}

	// VoiceEmbedRequest is the request message of the VoiceEmbed RPC.
	message VoiceEmbedRequest {
	  // Path to audio clip.
	  string audio = 1;
	}

	// VoiceEmbedResponse is the response message of the VoiceEmbed RPC.
	message VoiceEmbedResponse {
	  repeated float embedding = 1;
	  string model = 2;
	}

	// ToolFormatMarkers lists the textual markers and JSON field names that
	// describe a model's tool-call output format. It is embedded in
	// ModelMetadataResponse.
	message ToolFormatMarkers {
	  // "json_native", "tag_with_json", "tag_with_tagged".
	  string format_type = 1;

	  // Tool section markers.

	  // E.g., "<tool_call>", "[TOOL_CALLS]".
	  string section_start = 2;
	  // E.g., "</tool_call>".
	  string section_end = 3;
	  // E.g., "<|tool_call_begin|>".
	  string per_call_start = 4;
	  // E.g., "<|tool_call_end|>".
	  string per_call_end = 5;

	  // Function name markers (TAG_WITH_JSON / TAG_WITH_TAGGED).

	  // E.g., "<function=".
	  string func_name_prefix = 6;
	  // E.g., ">".
	  string func_name_suffix = 7;
	  // E.g., "</function>".
	  string func_close = 8;

	  // Argument markers (TAG_WITH_TAGGED).

	  // E.g., "<param=".
	  string arg_name_prefix = 9;
	  // E.g., ">".
	  string arg_name_suffix = 10;
	  string arg_value_prefix = 11;
	  // E.g., "</param>".
	  string arg_value_suffix = 12;
	  // E.g., "\n".
	  string arg_separator = 13;

	  // JSON format fields (JSON_NATIVE).

	  // E.g., "name".
	  string name_field = 14;
	  // E.g., "arguments".
	  string args_field = 15;
	  // E.g., "id".
	  string id_field = 16;
	  bool fun_name_is_key = 17;
	  bool tools_array_wrapped = 18;
	  reserved 19;

	  // Reasoning markers.

	  // E.g., "<think>".
	  string reasoning_start = 20;
	  // E.g., "</think>".
	  string reasoning_end = 21;

	  // Content markers.
	  string content_start = 22;
	  string content_end = 23;

	  // Args wrapper markers.

	  // E.g., "<args>".
	  string args_start = 24;
	  // E.g., "</args>".
	  string args_end = 25;

	  // JSON parameter ordering.

	  // E.g., "function" (wrapper key in JSON).
	  string function_field = 26;
	  repeated string parameter_order = 27;

	  // Generated ID field (alternative field name for generated IDs).
	  // E.g., "call_id".
	  string gen_id_field = 28;

	  // Call ID markers (position and delimiters for tool call IDs).

	  // "none", "pre_func_name", "between_func_and_args", "post_args".
	  string call_id_position = 29;
	  // E.g., "[CALL_ID]".
	  string call_id_prefix = 30;
	  // E.g., "".
	  string call_id_suffix = 31;
	}

	// AudioEncodeRequest is the request message of the AudioEncode RPC: PCM
	// audio plus its sample rate, channel count and free-form options.
	message AudioEncodeRequest {
	  bytes pcm_data = 1;
	  int32 sample_rate = 2;
	  int32 channels = 3;
	  map<string, string> options = 4;
	}

	// AudioEncodeResult is the response message of the AudioEncode RPC: the
	// encoded frames and their framing parameters.
	message AudioEncodeResult {
	  repeated bytes frames = 1;
	  int32 sample_rate = 2;
	  int32 samples_per_frame = 3;
	}

	// AudioDecodeRequest is the request message of the AudioDecode RPC: encoded
	// frames plus free-form options.
	message AudioDecodeRequest {
	  repeated bytes frames = 1;
	  map<string, string> options = 2;
	}

	// AudioDecodeResult is the response message of the AudioDecode RPC: decoded
	// PCM audio and its framing parameters.
	message AudioDecodeResult {
	  bytes pcm_data = 1;
	  int32 sample_rate = 2;
	  int32 samples_per_frame = 3;
	}

	// AudioTransformRequest is the request message of the AudioTransform RPC.
	//
	// A generic audio transform is an audio-in, audio-out operation that may be
	// conditioned on a second reference signal. Concrete transforms include
	// AEC + noise suppression + dereverberation (LocalVQE), voice conversion
	// (reference = target speaker), pitch shifting, etc.
	message AudioTransformRequest {
	  // Required, primary input file path.
	  string audio_path = 1;
	  // Optional auxiliary; empty => zero-fill.
	  string reference_path = 2;
	  // Required, output file path.
	  string dst = 3;
	  // Backend-specific tuning.
	  map<string, string> params = 4;
	}

	// AudioTransformResult is the response message of the AudioTransform RPC.
	message AudioTransformResult {
	  string dst = 1;
	  int32 sample_rate = 2;
	  int32 samples = 3;
	  bool reference_provided = 4;
	}

	// AudioTransformFrameRequest is the client-to-server item of the
	// bidirectional AudioTransformStream RPC. The first message MUST carry a
	// Config; subsequent messages carry Frames. A second Config mid-stream
	// resets streaming state before the next frame.
	message AudioTransformFrameRequest {
	  // Exactly one of the members below is set per message.
	  oneof payload {
	    AudioTransformStreamConfig config = 1;
	    AudioTransformFrame frame = 2;
	  }
	}

	// AudioTransformStreamConfig is the configuration payload of an
	// AudioTransformFrameRequest.
	message AudioTransformStreamConfig {
	  // Sample encoding of the streamed PCM. F32_LE is the zero value and
	  // therefore the default when the field is not set.
	  enum SampleFormat {
	    F32_LE = 0;
	    S16_LE = 1;
	  }

	  SampleFormat sample_format = 1;
	  // 0 => backend default.
	  int32 sample_rate = 2;
	  // 0 => backend default.
	  int32 frame_samples = 3;
	  map<string, string> params = 4;
	  // Reset streaming state before next frame.
	  bool reset = 5;
	}

	// AudioTransformFrame is the audio payload of an AudioTransformFrameRequest.
	message AudioTransformFrame {
	  // frame_samples samples in stream's format.
	  bytes audio_pcm = 1;
	  // Empty => zero-fill (silent reference).
	  bytes reference_pcm = 2;
	}

	// AudioTransformFrameResponse is the server-to-client item of the
	// AudioTransformStream RPC.
	message AudioTransformFrameResponse {
	  bytes pcm = 1;
	  int64 frame_index = 2;
	}

	// === AudioToAudioStream messages =========================================
	//
	// Bidirectional stream between the LocalAI core and an any-to-any audio
	// model. The client opens the stream with a Config payload, then alternates
	// Frame (input audio) and Control (turn boundaries, function-call results,
	// session updates) payloads. The server streams back typed events: audio
	// frames carry PCM in `pcm`; transcript / tool-call deltas carry JSON in
	// `meta`; the stream ends with a `response.done` (success) or `error` event.

	// AudioToAudioRequest is the client-to-server item of the AudioToAudioStream
	// RPC.
	message AudioToAudioRequest {
	  // Exactly one of the members below is set per message.
	  oneof payload {
	    AudioToAudioConfig config = 1;
	    AudioToAudioFrame frame = 2;
	    AudioToAudioControl control = 3;
	  }
	}

	// AudioToAudioConfig is the configuration payload of an AudioToAudioRequest.
	message AudioToAudioConfig {
	  // PCM format for client→server audio. 0 => backend default
	  // (16 kHz for the LFM2-Audio Conformer encoder).
	  int32 input_sample_rate = 1;

	  // Preferred server→client audio rate. 0 => backend default
	  // (24 kHz for the LFM2-Audio vocoder).
	  int32 output_sample_rate = 2;

	  // Optional system prompt override. Empty => backend chooses based on
	  // mode (e.g. "Respond with interleaved text and audio.").
	  string system_prompt = 3;

	  // Optional baked-voice id. Models that only ship a fixed set of
	  // voices (e.g. LFM2-Audio: us_male/us_female/uk_male/uk_female) match
	  // this against their voice table; an empty string keeps the default.
	  string voice = 4;

	  // JSON-encoded array of tool definitions in OpenAI Chat Completions
	  // format. Empty => no tools.
	  string tools = 5;

	  // Free-form sampling / decoding parameters (temperature, top_k,
	  // max_new_tokens, audio_top_k, etc).
	  map<string, string> params = 6;

	  // True => reset any session-scoped state before processing further
	  // frames on this stream. The first Config implicitly resets.
	  bool reset = 7;
	}

	// AudioToAudioFrame is the input-audio payload of an AudioToAudioRequest.
	message AudioToAudioFrame {
	  // Raw PCM s16le mono at config.input_sample_rate. Empty pcm +
	  // end_of_input is a valid "user finished speaking" marker without
	  // trailing audio.
	  bytes pcm = 1;

	  // Marks the last frame of a user turn. The backend may begin emitting
	  // a response immediately after seeing this.
	  bool end_of_input = 2;
	}

	// AudioToAudioControl is the control-event payload of an
	// AudioToAudioRequest.
	message AudioToAudioControl {
	  // Free-form control event names. Initial set:
	  //   "input_audio_buffer.commit" — user finished speaking
	  //   "response.cancel"           — abort in-flight generation
	  //   "conversation.item.create"  — inject a non-audio item (e.g.
	  //                                 function_call_output as JSON in
	  //                                 `payload`)
	  //   "session.update"            — re-configure mid-stream
	  string event = 1;

	  // Event-specific JSON payload.
	  bytes payload = 2;
	}

	// AudioToAudioResponse is the server-to-client item of the
	// AudioToAudioStream RPC.
	message AudioToAudioResponse {
	  // Event identifies what this frame carries. Mirrors the OpenAI Realtime
	  // API server-event names where applicable. Initial set:
	  //   "response.audio.delta"
	  //   "response.audio_transcript.delta"
	  //   "response.function_call_arguments.delta"
	  //   "response.function_call_arguments.done"
	  //   "response.done"
	  //   "error"
	  string event = 1;

	  // Populated when event = response.audio.delta.
	  bytes pcm = 2;

	  // Populated alongside pcm to identify its rate. 0 => same as the
	  // session's negotiated output_sample_rate.
	  int32 sample_rate = 3;

	  // JSON payload for non-PCM events (transcript chunk, tool args, error
	  // body).
	  bytes meta = 4;

	  // Monotonic per-stream counter, useful for client reordering and
	  // debugging.
	  int64 sequence = 5;
	}

	// ModelMetadataResponse is the response message of the ModelMetadata RPC.
	message ModelMetadataResponse {
	  bool supports_thinking = 1;
	  // The rendered chat template with enable_thinking=true (empty if not
	  // applicable).
	  string rendered_template = 2;
	  // Auto-detected tool format markers from differential template analysis.
	  ToolFormatMarkers tool_format = 3;
	  // Marker the backend expects in the prompt for each multimodal input
	  // (images/audio/video). Empty when the backend does not use a marker.
	  string media_marker = 4;
	}

	// Fine-tuning messages

	// FineTuneRequest is the request message of the StartFineTune RPC. Field
	// numbers are grouped by section, leaving gaps between the groups.
	message FineTuneRequest {
	  // --- Model identification ---

	  // HF model name or local path.
	  string model = 1;
	  // "lora", "loha", "lokr", "full" — what parameters to train.
	  string training_type = 2;
	  // "sft", "dpo", "grpo", "rloo", "reward", "kto", "orpo",
	  // "network_training".
	  string training_method = 3;

	  // --- Adapter config (universal across LoRA/LoHa/LoKr for LLM +
	  // diffusion) ---

	  // LoRA rank (r), default 16.
	  int32 adapter_rank = 10;
	  // Scaling factor, default 16.
	  int32 adapter_alpha = 11;
	  // Default 0.0.
	  float adapter_dropout = 12;
	  // Layer names to adapt.
	  repeated string target_modules = 13;

	  // --- Universal training hyperparameters ---

	  // Default 2e-4.
	  float learning_rate = 20;
	  // Default 3.
	  int32 num_epochs = 21;
	  // Default 2.
	  int32 batch_size = 22;
	  // Default 4.
	  int32 gradient_accumulation_steps = 23;
	  // Default 5.
	  int32 warmup_steps = 24;
	  // 0 = use epochs.
	  int32 max_steps = 25;
	  // 0 = only save final.
	  int32 save_steps = 26;
	  // Default 0.01.
	  float weight_decay = 27;
	  bool gradient_checkpointing = 28;
	  // adamw_8bit, adamw, sgd, adafactor, prodigy.
	  string optimizer = 29;
	  // Default 3407.
	  int32 seed = 30;
	  // fp16, bf16, fp8, no.
	  string mixed_precision = 31;

	  // --- Dataset ---

	  // HF dataset ID, local file/dir path.
	  string dataset_source = 40;
	  // train, test, etc.
	  string dataset_split = 41;

	  // --- Output ---

	  string output_dir = 50;
	  // Client-assigned or auto-generated.
	  string job_id = 51;

	  // --- Resume training from a checkpoint ---

	  // Path to checkpoint dir to resume from.
	  string resume_from_checkpoint = 55;

	  // --- Backend-specific AND method-specific extensibility ---

	  map<string, string> extra_options = 60;
	}

	// FineTuneJobResult is the response message of the StartFineTune RPC.
	message FineTuneJobResult {
	  string job_id = 1;
	  bool success = 2;
	  string message = 3;
	}

	// FineTuneProgressRequest is the request message of the FineTuneProgress
	// RPC; it names the job whose progress is streamed back.
	message FineTuneProgressRequest {
	  string job_id = 1;
	}

	// FineTuneProgressUpdate is one item streamed by the FineTuneProgress RPC.
	message FineTuneProgressUpdate {
	  string job_id = 1;
	  int32 current_step = 2;
	  int32 total_steps = 3;
	  float current_epoch = 4;
	  float total_epochs = 5;
	  float loss = 6;
	  float learning_rate = 7;
	  float grad_norm = 8;
	  float eval_loss = 9;
	  float eta_seconds = 10;
	  float progress_percent = 11;
	  // queued, caching, loading_model, loading_dataset, training, saving,
	  // completed, failed, stopped.
	  string status = 12;
	  string message = 13;
	  // Set when a checkpoint is saved.
	  string checkpoint_path = 14;
	  // Set when a sample is generated (video/image backends).
	  string sample_path = 15;
	  // Method-specific metrics.
	  map<string, float> extra_metrics = 16;
	}

	// FineTuneStopRequest is the request message of the StopFineTune RPC.
	message FineTuneStopRequest {
	  string job_id = 1;
	  bool save_checkpoint = 2;
	}

	// ListCheckpointsRequest is the request message of the ListCheckpoints RPC.
	message ListCheckpointsRequest {
	  string output_dir = 1;
	}

	// ListCheckpointsResponse is the response message of the ListCheckpoints
	// RPC.
	message ListCheckpointsResponse {
	  repeated CheckpointInfo checkpoints = 1;
	}

	// CheckpointInfo describes a single checkpoint inside a
	// ListCheckpointsResponse.
	// NOTE(review): the format of `created_at` is not stated in this file.
	message CheckpointInfo {
	  string path = 1;
	  int32 step = 2;
	  float epoch = 3;
	  float loss = 4;
	  string created_at = 5;
	}

	// ExportModelRequest is the request message of the ExportModel RPC.
	message ExportModelRequest {
	  string checkpoint_path = 1;
	  string output_path = 2;
	  // lora, loha, lokr, merged_16bit, merged_4bit, gguf, diffusers.
	  string export_format = 3;
	  // For GGUF: q4_k_m, q5_k_m, q8_0, f16, etc.
	  string quantization_method = 4;
	  // Base model name (for merge operations).
	  string model = 5;
	  map<string, string> extra_options = 6;
	}

	// Quantization messages

	// QuantizationRequest is the request message of the StartQuantization RPC.
	message QuantizationRequest {
	  // HF model name or local path.
	  string model = 1;
	  // q4_k_m, q5_k_m, q8_0, f16, etc.
	  string quantization_type = 2;
	  // Where to write output files.
	  string output_dir = 3;
	  // Client-assigned job ID.
	  string job_id = 4;
	  // hf_token, custom flags, etc.
	  map<string, string> extra_options = 5;
	}

	// QuantizationJobResult is the response message of the StartQuantization
	// RPC.
	message QuantizationJobResult {
	  string job_id = 1;
	  bool success = 2;
	  string message = 3;
	}

	// QuantizationProgressRequest is the request message of the
	// QuantizationProgress RPC; it names the job whose progress is streamed
	// back.
	message QuantizationProgressRequest {
	  string job_id = 1;
	}

	// QuantizationProgressUpdate is one item streamed by the
	// QuantizationProgress RPC.
	message QuantizationProgressUpdate {
	  string job_id = 1;
	  float progress_percent = 2;
	  // queued, downloading, converting, quantizing, completed, failed, stopped.
	  string status = 3;
	  string message = 4;
	  // Set when completed — path to the output GGUF file.
	  string output_file = 5;
	  // E.g. file_size_mb, compression_ratio.
	  map<string, float> extra_metrics = 6;
	}

	// QuantizationStopRequest is the request message of the StopQuantization
	// RPC.
	message QuantizationStopRequest {
	  string job_id = 1;
	}

Xet Storage Details

Size: 32.5 kB
Xet hash: 781e82fb740c3d72989546f657f1bea6efd9f422885a3c4adb551c1a4887c32e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.