Class: Typecast::SpeechComposer

Inherits:
Object
  • Object
show all
Defined in:
lib/typecast/composer.rb

Instance Method Summary collapse

Constructor Details

#initialize(text_to_speech) ⇒ SpeechComposer

Returns a new instance of SpeechComposer.



29
30
31
32
33
# File 'lib/typecast/composer.rb', line 29

# Creates an empty composer: no defaults and no queued parts yet.
#
# @param text_to_speech [#call] synthesis client kept for later use;
#   {#generate} invokes +call+ on it once per speech segment.
def initialize(text_to_speech)
  @parts = []
  @defaults = {}
  @text_to_speech = text_to_speech
end

Instance Method Details

#defaults(voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
# File 'lib/typecast/composer.rb', line 35

# Folds the given settings into the composer-wide defaults.
#
# The six keyword values are passed through +settings_hash+ and the result is
# combined with the current defaults via +merge_settings+; the outcome
# replaces the stored defaults.
#
# @return [self] the composer, so calls can be chained
def defaults(voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil)
  overrides = settings_hash(
    voice_id: voice_id, model: model, language: language,
    prompt: prompt, output: output, seed: seed
  )
  @defaults = merge_settings(@defaults, overrides)
  self
end

#generate ⇒ Object

Raises:

  • (ArgumentError)


75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/typecast/composer.rb', line 75

# Synthesizes every queued speech segment, inserts the requested pauses as
# zero-valued samples, and returns the concatenated audio as one WAV response.
#
# @return [Models::TTSResponse] WAV audio data, its duration, and
#   +Models::AUDIO_WAV+ as the format
# @raise [ArgumentError] if the plan has no speech segment, the default
#   output format is neither WAV nor MP3, MP3 output is requested, a pause
#   precedes the first speech segment, or two segments differ in WAV spec
def generate
  plan = build_plan
  unless plan.any? { |part| part.is_a?(Hash) && part[:kind] == "speech" }
    raise ArgumentError, "at least one speech segment is required"
  end

  output_format = @defaults.dig(:output, :audio_format) || Models::AUDIO_WAV
  unless [Models::AUDIO_WAV, Models::AUDIO_MP3].include?(output_format)
    raise ArgumentError, "unsupported composed speech output format: #{output_format}"
  end

  # MP3 output is always rejected here, so fail before issuing any
  # text-to-speech calls instead of discarding their results afterwards.
  raise ArgumentError, "ffmpeg is required to encode composed speech as mp3" if output_format == Models::AUDIO_MP3

  wav_spec = nil
  output_samples = []
  plan.each do |part|
    if part.is_a?(PausePart)
      # Silence length depends on the sample rate, which is only known once
      # a speech segment has been parsed.
      raise ArgumentError, "pause cannot be the first composed part" if wav_spec.nil?

      output_samples.concat(Array.new(seconds_to_samples(part.seconds, wav_spec[:sample_rate]), 0))
      next
    end

    response = @text_to_speech.call(request_from_settings(part[:text], part[:settings]))
    wav = parse_wav(response.audio_data)
    if wav_spec && wav[:spec] != wav_spec
      raise ArgumentError, "all composed WAV segments must use the same PCM format"
    end

    wav_spec = wav[:spec]
    output_samples.concat(trim_silence(wav[:samples]))
  end

  # NOTE(review): duration is sample count / sample rate, which matches
  # playback time only if +samples+ holds one value per frame (mono) --
  # confirm against parse_wav.
  Models::TTSResponse.new(
    audio_data: encode_wav(output_samples, wav_spec),
    duration: output_samples.length.to_f / wav_spec[:sample_rate],
    format: Models::AUDIO_WAV
  )
end

#pause(seconds) ⇒ Object

Inserts silence between speech segments.

seconds is a duration in seconds. Use 0.3 for 300 ms, 3 for 3 seconds.



66
67
68
69
70
71
72
73
# File 'lib/typecast/composer.rb', line 66

# Queues a stretch of silence to be inserted between speech segments.
#
# @param seconds [Numeric] duration in seconds; must be a real, finite
#   number greater than zero (e.g. 0.3 for 300 ms). Stored as a Float.
# @return [self] the composer, so calls can be chained
# @raise [ArgumentError] if +seconds+ is not a real, finite, positive number
def pause(seconds)
  # Complex is Numeric and answers finite?, but has no positive?; the real?
  # check rejects it with ArgumentError instead of leaking NoMethodError.
  valid = seconds.is_a?(Numeric) && seconds.real? && seconds.finite? && seconds.positive?
  raise ArgumentError, "pause seconds must be greater than 0" unless valid

  @parts << PausePart.new(kind: "pause", seconds: seconds.to_f)
  self
end

#say(text, voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/typecast/composer.rb', line 47

# Queues one speech segment.
#
# The text is stringified with +to_s+. The keyword values go through
# +settings_hash+ and are combined with the current defaults via
# +merge_settings+; the stored defaults themselves are not reassigned.
#
# @param text [#to_s] what to speak
# @return [self] the composer, so calls can be chained
def say(text, voice_id: nil, model: nil, language: nil, prompt: nil, output: nil, seed: nil)
  spoken = text.to_s
  overrides = settings_hash(
    voice_id: voice_id, model: model, language: language,
    prompt: prompt, output: output, seed: seed
  )
  segment = { kind: "speech", text: spoken, settings: merge_settings(@defaults, overrides) }
  @parts << segment
  self
end