Module: SmartPrompt::SiliconFlow::Voice

Included in:
SmartPrompt::SiliconFlowAdapter
Defined in:
lib/smart_prompt/adapters/siliconflow/voice.rb

Overview

Speech synthesis (CosyVoice2 / MOSS-TTSD), speech recognition (SenseVoiceSmall), and custom-voice cloning management.

Instance Method Summary collapse

Instance Method Details

#delete_voice(uri) ⇒ Object



91
92
93
94
95
96
97
98
99
# File 'lib/smart_prompt/adapters/siliconflow/voice.rb', line 91

# Deletes a previously uploaded custom voice by POSTing its URI to the
# voice-delete endpoint.
#
# @param uri [Object] identifier of the voice to delete; sent as the "uri" field
# @return [Hash] `{ deleted:, uri:, raw: }` where `deleted` is the response's
#   "deleted" value, or `true` when that value is absent/nil
# @raise [LLMAPIError] when an unexpected error occurs; LLMAPIError/Error
#   raised by the HTTP helper are re-raised untouched
def delete_voice(uri)
  SmartPrompt.logger.info "SiliconFlowAdapter: delete voice #{uri}"
  payload = { "uri" => uri }
  result = http_post_json(@voice_delete_url, payload)

  # A missing "deleted" flag counts as success; an explicit false is kept.
  deleted_flag = result["deleted"]
  deleted_flag = true if deleted_flag.nil?

  { deleted: deleted_flag, uri: uri, raw: result }
rescue LLMAPIError, Error
  raise
rescue => failure
  raise LLMAPIError, "Failed to delete SiliconFlow voice: #{failure.message}"
end

#list_voices ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
# File 'lib/smart_prompt/adapters/siliconflow/voice.rb', line 79

# Lists the custom voices available to the account.
#
# The voice entries are looked up under the "result" key, then the "voices"
# key, and finally the response itself is accepted when it is already a bare
# Array. Any other shape is returned unchanged.
#
# @return [Array<Hash>, Object] an array of `{ uri:, name: }` hashes (name is
#   "customName", falling back to "name"), or the raw response when no array
#   of voices could be located
# @raise [LLMAPIError] when an unexpected error occurs; LLMAPIError/Error
#   raised by the HTTP helper are re-raised untouched
def list_voices
  SmartPrompt.logger.info "SiliconFlowAdapter: list voices"
  response = http_get_json(@voice_list_url)

  # Check for a bare Array first: indexing an Array with a String key raises
  # TypeError, which previously made the top-level-array case unreachable.
  items = response.is_a?(Array) ? response : (response["result"] || response["voices"] || response)
  return response unless items.is_a?(Array)

  items.map { |v| { uri: v["uri"], name: v["customName"] || v["name"] } }
rescue LLMAPIError, Error
  raise
rescue => e
  raise LLMAPIError, "Failed to list SiliconFlow voices: #{e.message}"
end

#synthesize_speech(text, voice: nil, model: nil, response_format: "mp3", **opts) ⇒ Object

Returns a base64 data URL for the synthesized audio. SiliconFlow’s /audio/speech returns the raw binary audio stream (NOT base64 / NOT JSON), so we base64-encode it ourselves.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/smart_prompt/adapters/siliconflow/voice.rb', line 9

# Synthesizes speech for +text+ and returns it as a base64 data URL.
#
# The binary payload returned by the speech endpoint helper is base64-encoded
# here and prefixed with `data:audio/<response_format>;base64,`.
#
# @param text [#to_s] text to speak; must not be nil or blank
# @param voice [Object, nil] optional voice; sent only when truthy
# @param model [String, nil] model override; falls back to
#   `@config["tts_model"]`, then "FunAudioLLM/CosyVoice2-0.5B"
# @param response_format [String] audio format, always sent and used in the
#   data URL
# @param opts [Hash] optional :speed, :sample_rate, :gain and :language values,
#   each forwarded only when truthy; other keys are ignored
# @return [String] data URL containing the encoded audio
# @raise [Error] when the text is blank, or when an unexpected error occurs;
#   LLMAPIError/Error raised by the HTTP helper are re-raised untouched
def synthesize_speech(text, voice: nil, model: nil, response_format: "mp3", **opts)
  SmartPrompt.logger.info "SiliconFlowAdapter: TTS"
  input = text.to_s
  raise Error, "Text cannot be empty" if input.strip.empty?

  payload = {
    "model" => model || @config["tts_model"] || "FunAudioLLM/CosyVoice2-0.5B",
    "input" => input
  }
  payload["voice"] = voice if voice
  payload["response_format"] = response_format

  # Optional tuning knobs are copied across only when the caller supplied them.
  %i[speed sample_rate gain language].each do |key|
    value = opts[key]
    payload[key.to_s] = value if value
  end

  encoded = Base64.strict_encode64(http_post_binary(@speech_url, payload))
  "data:audio/#{response_format};base64,#{encoded}"
rescue LLMAPIError, Error
  raise
rescue => failure
  raise Error, "Failed to call SiliconFlow TTS: #{failure.message}"
end

#synthesize_to_file(text, output_path, voice: nil, model: nil, response_format: "mp3", **opts) ⇒ Object



30
31
32
33
34
35
36
37
# File 'lib/smart_prompt/adapters/siliconflow/voice.rb', line 30

# Synthesizes speech via #synthesize_speech and writes the decoded audio bytes
# to +output_path+, creating the parent directory when needed.
#
# @param text [#to_s] text to speak (forwarded to #synthesize_speech)
# @param output_path [String] destination file path
# @param voice [Object, nil] forwarded to #synthesize_speech
# @param model [String, nil] forwarded to #synthesize_speech
# @param response_format [String] forwarded to #synthesize_speech and echoed in
#   the result
# @param opts [Hash] extra keyword options forwarded to #synthesize_speech
# @return [Hash] `{ file_path:, format: }`
def synthesize_to_file(text, output_path, voice: nil, model: nil, response_format: "mp3", **opts)
  encoded_url = synthesize_speech(text, voice: voice, model: model, response_format: response_format, **opts)

  target_dir = File.dirname(output_path)
  FileUtils.mkdir_p(target_dir)

  # Strip the "data:audio/<fmt>;base64," header, leaving only the payload.
  payload = encoded_url.sub(/\Adata:audio\/\w+;base64,/, "")
  File.open(output_path, "wb") { |io| io.write(Base64.decode64(payload)) }

  SmartPrompt.logger.info "SiliconFlow audio saved to #{output_path}"
  { file_path: output_path, format: response_format }
end

#transcribe_audio(audio_file, model: nil, language: nil, **opts) ⇒ Object

Transcribes a local audio file (given by path). Returns `{ text: ... }`. The transcription endpoint takes multipart/form-data with a `file` field.



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/smart_prompt/adapters/siliconflow/voice.rb', line 41

# Transcribes a local audio file through the transcription endpoint, which is
# called as a multipart upload with the audio in the "file" field.
#
# @param audio_file [String] path to an existing local audio file
# @param model [String, nil] model override; falls back to
#   `@config["asr_model"]`, then "FunAudioLLM/SenseVoiceSmall"
# @param language [String, nil] optional language; sent only when truthy
# @param opts [Hash] optional :prompt and :response_format values, each
#   forwarded only when truthy; other keys are ignored
# @return [Hash] `{ text: }` taken from the response's "text" field
# @raise [Error] when the file does not exist, or when an unexpected error
#   occurs; LLMAPIError/Error raised by the HTTP helper are re-raised untouched
def transcribe_audio(audio_file, model: nil, language: nil, **opts)
  SmartPrompt.logger.info "SiliconFlowAdapter: ASR #{File.basename(audio_file)}"
  raise Error, "Audio file not found: #{audio_file}" unless File.exist?(audio_file)

  model_name = model || @config["asr_model"] || "FunAudioLLM/SenseVoiceSmall"
  form = { "model" => model_name }
  form["language"]        = language               if language
  form["prompt"]          = opts[:prompt]          if opts[:prompt]
  form["response_format"] = opts[:response_format] if opts[:response_format]

  # String#delete never returns nil, so test for an empty extension explicitly;
  # otherwise an extension-less file is sent with the invalid type "audio/".
  extension = File.extname(audio_file).downcase.delete(".")
  extension = "wav" if extension.empty?

  response = http_post_multipart(@transcription_url, form, "file", audio_file, "audio/#{extension}")
  { text: response["text"] }
rescue LLMAPIError, Error
  raise
rescue => e
  # Error subclasses are handled by the clause above, so only foreign
  # exceptions reach this point and are wrapped.
  raise Error, "Failed to call SiliconFlow ASR: #{e.message}"
end

#upload_voice(name, audio_file, text: nil, model: nil) ⇒ Object

Uploads a reference audio file to clone a custom voice. SiliconFlow returns a `uri` of the form "speech:…". `customName` (camelCase) is the display name.



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/smart_prompt/adapters/siliconflow/voice.rb', line 62

# Uploads a reference audio file to register a custom (cloned) voice.
#
# The request is a multipart upload with the audio in the "file" field and the
# display name in the camelCase "customName" field.
#
# @param name [#to_s] display name for the voice
# @param audio_file [String] path to an existing local audio file
# @param text [#to_s, nil] optional text sent as the "text" field when truthy
#   (presumably the transcript of the reference audio — confirm against the API)
# @param model [String, nil] model override; falls back to
#   `@config["tts_model"]`, then "FunAudioLLM/CosyVoice2-0.5B"
# @return [Hash] `{ uri:, name:, raw: }`
# @raise [Error] when the file does not exist or an unexpected error occurs
# @raise [LLMAPIError] when the response carries no "uri"; LLMAPIError/Error
#   raised by the HTTP helper are re-raised untouched
def upload_voice(name, audio_file, text: nil, model: nil)
  SmartPrompt.logger.info "SiliconFlowAdapter: upload voice #{name}"
  raise Error, "Audio file not found: #{audio_file}" unless File.exist?(audio_file)

  model_name = model || @config["tts_model"] || "FunAudioLLM/CosyVoice2-0.5B"
  form = { "model" => model_name, "customName" => name.to_s }
  form["text"] = text.to_s if text

  # String#delete never returns nil, so test for an empty extension explicitly;
  # otherwise an extension-less file is sent with the invalid type "audio/".
  extension = File.extname(audio_file).downcase.delete(".")
  extension = "wav" if extension.empty?

  response = http_post_multipart(@voice_upload_url, form, "file", audio_file, "audio/#{extension}")
  raise LLMAPIError, "No uri in SiliconFlow voice upload response: #{response.inspect}" unless response["uri"]

  { uri: response["uri"], name: name.to_s, raw: response }
rescue LLMAPIError, Error
  raise
rescue => e
  # Error subclasses are handled by the clause above, so only foreign
  # exceptions reach this point and are wrapped.
  raise Error, "Failed to upload SiliconFlow voice: #{e.message}"
end