| {#- Prompt template for multi-speaker text-to-speech. | |
| Emits, in order: the system prompt, an optional "Voice input" section with one | |
| placeholder line per speaker that supplied audio, a "Text input" section with one | |
| line per text item, and a "Speech output" header followed by the speech start token. | |
| Context: `messages` is iterated as dicts with a 'role' (speaker label) and a 'content' | |
| sequence whose items carry a 'type' key; items of type 'text' also carry 'text'. | |
| Optional overrides: system_prompt, speech_start_token, speech_end_token, | |
| speech_diffusion_token (each falls back to the default given below). | |
| NOTE(review): the pipe characters wrapping every line look like table-extraction | |
| residue rather than template content; they are kept byte-for-byte here - confirm | |
| against the upstream template before shipping. | |
| -#}{%- set system_prompt = system_prompt | default(" Transform the text provided by various speakers into speech output, utilizing the distinct voice of each respective speaker. | |
| ") -%} | |
| {{ system_prompt -}} | |
| {%- set speech_start_token = speech_start_token | default("<|vision_start|>") %} | |
| {%- set speech_end_token = speech_end_token | default("<|vision_end|>") %} | |
| {%- set speech_diffusion_token = speech_diffusion_token | default("<|vision_pad|>") %} | |
| {#- First pass: collect, in first-seen order, every speaker label that has at least one | |
| audio item. A list with exact-match membership is used instead of a comma-joined | |
| string, so a label such as "1" is not mistaken for already seen after "10", and a | |
| label containing a comma stays in one piece. A namespace is required because plain | |
| `set` inside a for loop does not escape the loop scope. | |
| -#}{%- set ns = namespace(speakers_with_audio=[]) %} | |
| {%- for message in messages %} | |
| {%- set role = message['role'] %} | |
| {%- set content = message['content'] %} | |
| {%- set has_audio = content | selectattr('type', 'equalto', 'audio') | list | length > 0 %} | |
| {#- Empty labels are never recorded, so they produce neither a voice line nor the | |
| section header. The list is rebuilt by concatenation rather than mutated in place, | |
| which also works in environments that forbid mutating methods such as append. | |
| -#}{%- if has_audio and role != "" and role not in ns.speakers_with_audio %} | |
| {%- set ns.speakers_with_audio = ns.speakers_with_audio + [role] %} | |
| {%- endif %} | |
| {%- endfor %} | |
| {#- Voice section: rendered only when at least one speaker supplied audio. Each line | |
| is a start / diffusion / end token triple standing in for that speaker's voice. | |
| -#}{%- if ns.speakers_with_audio %} | |
| {{ " Voice input: | |
| " }} | |
| {%- for speaker in ns.speakers_with_audio %} | |
| Speaker {{ speaker }}:{{ speech_start_token }}{{ speech_diffusion_token }}{{ speech_end_token }}{{ " | |
| " }} | |
| {%- endfor %} | |
| {%- endif %} | |
| Text input:{{ " | |
| " }} | |
| {#- Second pass: one line per text item, in message order, prefixed with the speaker label. | |
| -#}{%- for message in messages %} | |
| {%- set role = message['role'] %} | |
| {%- set text_items = message['content'] | selectattr('type', 'equalto', 'text') | list %} | |
| {%- for item in text_items %} | |
| Speaker {{ role }}: {{ item['text'] }}{{ " | |
| " }} | |
| {%- endfor %} | |
| {%- endfor %} | |
| Speech output:{{ " | |
| " }}{{ speech_start_token }} |