Class: Typecast::SpeechComposer
- Inherits: Object
- Inheritance chain:
  - Object
  - Typecast::SpeechComposer
- Defined in:
- lib/typecast/composer.rb
Instance Method Summary
- #defaults(voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil) ⇒ Object
- #generate ⇒ Object
- #initialize(text_to_speech) ⇒ SpeechComposer (constructor)
  A new instance of SpeechComposer.
- #pause(seconds) ⇒ Object
  Inserts silence between speech segments.
- #say(text, voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil) ⇒ Object
Constructor Details
#initialize(text_to_speech) ⇒ SpeechComposer
Returns a new instance of SpeechComposer.
29 30 31 32 33 |
# File 'lib/typecast/composer.rb', line 29
#
# Builds an empty composer: no queued parts and no default settings yet.
#
# @param text_to_speech [#call] backend that #generate invokes once per
#   speech segment
def initialize(text_to_speech)
  @parts = []
  @defaults = {}
  @text_to_speech = text_to_speech
end
Instance Method Details
#defaults(voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil) ⇒ Object
35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/typecast/composer.rb', line 35
#
# Folds the given settings into the composer-wide defaults. The stored
# defaults are replaced by the result of merge_settings, so repeated calls
# accumulate (exact merge rules live in merge_settings / settings_hash).
#
# @param voice_id [Object, nil] forwarded to settings_hash
# @param model [Object, nil] forwarded to settings_hash
# @param language [Object, nil] forwarded to settings_hash
# @param prompt [Object, nil] forwarded to settings_hash
# @param output [Object, nil] forwarded to settings_hash
# @param seed [Object, nil] forwarded to settings_hash
# @return [SpeechComposer] self, so calls can be chained
def defaults(voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil)
  overrides = settings_hash(
    voice_id: voice_id,
    model: model,
    language: language,
    prompt: prompt,
    output: output,
    seed: seed
  )
  @defaults = merge_settings(@defaults, overrides)
  self
end
#generate ⇒ Object
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/typecast/composer.rb', line 75
#
# Synthesizes every queued part and stitches the results into one WAV
# response.
#
# Speech parts are sent to the text-to-speech backend one by one; the returned
# audio is parsed with parse_wav, passed through trim_silence and appended to
# the output. Pause parts are appended as zero-valued samples, sized from the
# sample rate of the preceding speech segment.
#
# @return [Models::TTSResponse] composed audio, always tagged as
#   Models::AUDIO_WAV
# @raise [ArgumentError] if the plan contains no speech segment
# @raise [ArgumentError] if the default output format is neither WAV nor MP3
# @raise [ArgumentError] if MP3 output is requested (not supported here)
# @raise [ArgumentError] if a pause precedes the first speech segment
# @raise [ArgumentError] if segments come back with differing WAV specs
def generate
  plan = build_plan
  unless plan.any? { |part| part.is_a?(Hash) && part[:kind] == "speech" }
    raise ArgumentError, "at least one speech segment is required"
  end

  output_format = @defaults.dig(:output, :audio_format) || Models::AUDIO_WAV
  unless [Models::AUDIO_WAV, Models::AUDIO_MP3].include?(output_format)
    raise ArgumentError, "unsupported composed speech output format: #{output_format}"
  end
  # MP3 output can never succeed in this implementation, so reject it before
  # spending any backend calls on the individual segments.
  if output_format == Models::AUDIO_MP3
    raise ArgumentError, "ffmpeg is required to encode composed speech as mp3"
  end

  wav_spec = nil
  output_samples = []

  plan.each do |part|
    if part.is_a?(PausePart)
      # Silence length depends on the sample rate, which is only known once a
      # speech segment has been decoded.
      raise ArgumentError, "pause cannot be the first composed part" if wav_spec.nil?

      output_samples.concat(Array.new(seconds_to_samples(part.seconds, wav_spec[:sample_rate]), 0))
      next
    end

    response = @text_to_speech.call(request_from_settings(part[:text], part[:settings]))
    wav = parse_wav(response.audio_data)
    if wav_spec && wav[:spec] != wav_spec
      raise ArgumentError, "all composed WAV segments must use the same PCM format"
    end

    wav_spec = wav[:spec]
    output_samples.concat(trim_silence(wav[:samples]))
  end

  Models::TTSResponse.new(
    audio_data: encode_wav(output_samples, wav_spec),
    duration: output_samples.length.to_f / wav_spec[:sample_rate],
    format: Models::AUDIO_WAV
  )
end
#pause(seconds) ⇒ Object
Inserts silence between speech segments.
seconds is a duration in seconds. Use 0.3 for 300 ms, 3 for 3 seconds.
66 67 68 69 70 71 72 73 |
# File 'lib/typecast/composer.rb', line 66
#
# Queues a stretch of silence between speech segments.
#
# @param seconds [Numeric] duration in seconds (0.3 for 300 ms, 3 for three
#   seconds); must be a real, finite number greater than zero
# @return [SpeechComposer] self, so calls can be chained
# @raise [ArgumentError] if seconds is not a real, finite, positive number
def pause(seconds)
  # Complex is a Numeric without #positive?; the real? check turns it into
  # the documented ArgumentError instead of a NoMethodError.
  valid = seconds.is_a?(Numeric) && seconds.real? && seconds.finite? && seconds.positive?
  raise ArgumentError, "pause seconds must be greater than 0" unless valid

  @parts << PausePart.new(kind: "pause", seconds: seconds.to_f)
  self
end
#say(text, voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil) ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/typecast/composer.rb', line 47
#
# Queues one speech segment. The text is stringified with to_s, and the
# per-call settings are combined with the current defaults via
# merge_settings at the time of the call.
#
# @param text [#to_s] what to speak
# @param voice_id [Object, nil] forwarded to settings_hash
# @param model [Object, nil] forwarded to settings_hash
# @param language [Object, nil] forwarded to settings_hash
# @param prompt [Object, nil] forwarded to settings_hash
# @param output [Object, nil] forwarded to settings_hash
# @param seed [Object, nil] forwarded to settings_hash
# @return [SpeechComposer] self, so calls can be chained
def say(text, voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil)
  spoken = text.to_s
  overrides = settings_hash(
    voice_id: voice_id,
    model: model,
    language: language,
    prompt: prompt,
    output: output,
    seed: seed
  )
  segment = { kind: "speech", text: spoken, settings: merge_settings(@defaults, overrides) }
  @parts.push(segment)
  self
end