Class: SmartPrompt::STTAdapter

Inherits:
LLMAdapter
Defined in:
lib/smart_prompt/stt_adapter.rb

Constant Summary

SUPPORTED_AUDIO_FORMATS =

Supported audio formats

%w[mp3 mp4 mpeg mpga m4a wav webm]
SUPPORTED_LANGUAGES =

Supported languages for speech recognition

%w[zh en ja ko]
MAX_FILE_SIZE =

Maximum file size (25MB)

25 * 1024 * 1024

Instance Attribute Summary

Attributes inherited from LLMAdapter

#last_response

Instance Method Summary

Constructor Details

#initialize(config) ⇒ STTAdapter

Returns a new instance of STTAdapter.



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/smart_prompt/stt_adapter.rb', line 17

# Builds the OpenAI-compatible client used for speech-to-text requests.
#
# Reads "api_key" and "url" from @config after calling super (presumably
# @config is populated by LLMAdapter#initialize from +config+ — verify).
# An api_key string shaped like ENV[...] is evaluated so the real key can be
# taken from the process environment.
#
# @param config [Object] adapter configuration, forwarded unchanged to the superclass
# @raise [LLMAPIError] on OpenAI::ConfigurationError, OpenAI::Error or SocketError
# @raise [Error] on any other StandardError raised while creating the client
def initialize(config)
  super
  api_key = @config["api_key"]
  if api_key.is_a?(String) && api_key.start_with?("ENV[") && api_key.end_with?("]")
    # NOTE(review): eval runs the whole configuration string as Ruby code.
    # This is only safe while the configuration file is fully trusted;
    # consider parsing the variable name and calling ENV.fetch instead.
    api_key = eval(api_key)
  end
  begin
    @client = OpenAI::Client.new(
      access_token: api_key,
      uri_base: @config["url"],
      request_timeout: 120,
    )
  rescue OpenAI::ConfigurationError => e
    SmartPrompt.logger.error "Failed to initialize STT client: #{e.message}"
    raise LLMAPIError, "Invalid STT configuration: #{e.message}"
  rescue OpenAI::Error => e
    SmartPrompt.logger.error "Failed to initialize STT client: #{e.message}"
    raise LLMAPIError, "STT authentication failed: #{e.message}"
  rescue SocketError => e
    SmartPrompt.logger.error "Failed to initialize STT client: #{e.message}"
    raise LLMAPIError, "Network error: Unable to connect to STT API"
  rescue => e
    SmartPrompt.logger.error "Failed to initialize STT client: #{e.message}"
    raise Error, "Unexpected error initializing STT client: #{e.message}"
  else
    # `else` runs only when no exception was raised. An `ensure` clause here
    # would also log success after a failed initialization.
    SmartPrompt.logger.info "Successfully created an STT client."
  end
end

Instance Method Details

#detect_language(text) ⇒ Object

Language detection (basic implementation)



231
232
233
234
235
236
237
238
239
240
241
242
243
244
# File 'lib/smart_prompt/stt_adapter.rb', line 231

# Guesses the language of +text+ from the Unicode ranges of its characters.
#
# Script-specific ranges are tested before the Han ideograph range because
# Japanese (kanji) and Korean (hanja) text can also contain Han characters;
# testing Han first would label such text as Chinese.
#
# @param text [#to_s] text to inspect; nil is treated as an empty string
# @return [String] "ja" if the text contains hiragana/katakana, "ko" if it
#   contains Hangul syllables, "zh" if it contains Han ideographs, otherwise "en"
def detect_language(text)
  SmartPrompt.logger.info "STTAdapter: Detecting language from text"

  sample = text.to_s

  # Simple language detection based on character ranges
  if sample.match?(/[\u3040-\u309f\u30a0-\u30ff]/)
    "ja"
  elsif sample.match?(/[\uac00-\ud7af]/)
    "ko"
  elsif sample.match?(/[\u4e00-\u9fff]/)
    "zh"
  else
    "en"
  end
end

#get_audio_info(audio_file) ⇒ Object

Get audio file information



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/smart_prompt/stt_adapter.rb', line 193

# Collects basic facts about an audio file on disk.
#
# Checks that the file exists, that its lower-cased extension is listed in
# SUPPORTED_AUDIO_FORMATS and that its size does not exceed MAX_FILE_SIZE,
# then returns a summary hash. The duration comes from
# estimate_audio_duration and is only an estimate.
#
# Any StandardError raised along the way (including the validation errors
# raised here) is logged and re-raised as Error with the prefix
# "Error getting audio info: ".
#
# @param audio_file [String] path to the audio file
# @return [Hash] with keys :file_path, :file_name, :file_size, :format,
#   :estimated_duration and :supported (always true on success)
# @raise [Error] if validation fails or any other error occurs
def get_audio_info(audio_file)
  SmartPrompt.logger.info "STTAdapter: Getting audio file information"

  begin
    raise Error, "Audio file not found: #{audio_file}" unless File.exist?(audio_file)

    extension = File.extname(audio_file).downcase.delete(".")
    raise Error, "Unsupported audio format: #{extension}" unless SUPPORTED_AUDIO_FORMATS.include?(extension)

    byte_count = File.size(audio_file)
    if byte_count > MAX_FILE_SIZE
      limit_mb = MAX_FILE_SIZE / (1024 * 1024)
      raise Error, "Audio file too large (max #{limit_mb}MB)"
    end

    # Rough estimate only; the real duration may differ.
    approx_duration = estimate_audio_duration(byte_count, extension)

    {
      file_path: audio_file,
      file_name: File.basename(audio_file),
      file_size: byte_count,
      format: extension,
      estimated_duration: approx_duration,
      supported: true
    }
  rescue => e
    failure = "Error getting audio info: #{e.message}"
    SmartPrompt.logger.error failure
    raise Error, failure
  end
end

#transcribe_audio(audio_file, model: nil, language: nil, prompt: nil, temperature: 0.0, response_format: "json") ⇒ Object

Speech-to-text transcription



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/smart_prompt/stt_adapter.rb', line 47

# Transcribes a local audio file to text.
#
# Validates the arguments with validate_stt_parameters (outside the rescue
# scope, so its errors propagate unchanged), prepares the file with
# prepare_audio_file, sends the request through submit_stt_request and
# stores the raw response in @last_response.
#
# @param audio_file [String] path to the audio file
# @param model [String, nil] model name; falls back to @config["model"]
# @param language [String, nil] language hint, sent only when given
# @param prompt [String, nil] optional prompt, sent only when given
# @param temperature [Float] sampling temperature passed to the API
# @param response_format [String] response format passed to the API
# @return [Hash] with keys :text, :language (the +language+ argument as given,
#   not a detected value), :duration, :file_size and :format
# @raise [LLMAPIError] if the response has no "text" entry, or on OpenAI::Error
# @raise [Error] on any other StandardError during transcription
def transcribe_audio(audio_file, model: nil, language: nil, prompt: nil, temperature: 0.0, response_format: "json")
  SmartPrompt.logger.info "STTAdapter: Transcribing audio to text"

  model_name = model || @config["model"]

  # Validate parameters
  validate_stt_parameters(audio_file, language, response_format)

  begin
    # Prepare audio file
    audio_data = prepare_audio_file(audio_file)

    parameters = {
      model: model_name,
      file: audio_data[:file],
      temperature: temperature,
      response_format: response_format
    }

    # Add optional parameters
    parameters[:language] = language if language
    parameters[:prompt] = prompt if prompt

    # The file entry is left out of the log line.
    SmartPrompt.logger.info "STT parameters: #{parameters.except(:file)}"

    # Custom implementation for STT since OpenAI gem doesn't support audio transcription endpoints
    response = submit_stt_request(parameters)

    @last_response = response

    unless response["text"]
      SmartPrompt.logger.error "No text in STT response"
      raise LLMAPIError, "No text in STT response"
    end

    SmartPrompt.logger.info "STT transcription successful, transcribed #{response['text'].length} characters"

    {
      text: response["text"],
      language: language,
      duration: audio_data[:duration],
      file_size: audio_data[:file_size],
      format: audio_data[:format]
    }
  rescue LLMAPIError
    # Already an API-level error (e.g. the "No text" case above). Without this
    # clause the catch-all below would re-raise it as a generic Error with an
    # "Unexpected error" prefix, hiding its type.
    raise
  rescue OpenAI::Error => e
    SmartPrompt.logger.error "STT API error: #{e.message}"
    raise LLMAPIError, "STT API error: #{e.message}"
  rescue => e
    SmartPrompt.logger.error "Unexpected error during STT transcription: #{e.message}"
    raise Error, "Unexpected error during STT transcription: #{e.message}"
  end
end

#transcribe_audio_url(audio_url, model: nil, language: nil, prompt: nil, temperature: 0.0, response_format: "json") ⇒ Object

Transcribe audio from URL



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# File 'lib/smart_prompt/stt_adapter.rb', line 104

# Transcribes audio referenced by a URL.
#
# Builds the request parameters, sends them through submit_stt_url_request
# and stores the raw response in @last_response. Unlike transcribe_audio,
# no parameter validation is performed here.
#
# @param audio_url [String] location of the audio to transcribe
# @param model [String, nil] model name; falls back to @config["model"]
# @param language [String, nil] language hint, sent only when given
# @param prompt [String, nil] optional prompt, sent only when given
# @param temperature [Float] sampling temperature passed to the API
# @param response_format [String] response format passed to the API
# @return [Hash] with keys :text, :language (the +language+ argument as given)
#   and :audio_url
# @raise [LLMAPIError] if the response has no "text" entry
# @raise [Error] on any other StandardError during transcription
def transcribe_audio_url(audio_url, model: nil, language: nil, prompt: nil, temperature: 0.0, response_format: "json")
  SmartPrompt.logger.info "STTAdapter: Transcribing audio from URL"

  model_name = model || @config["model"]

  begin
    parameters = {
      model: model_name,
      audio_url: audio_url,
      temperature: temperature,
      response_format: response_format
    }

    # Add optional parameters
    parameters[:language] = language if language
    parameters[:prompt] = prompt if prompt

    SmartPrompt.logger.info "STT URL parameters: #{parameters}"

    # Custom implementation for URL-based STT
    response = submit_stt_url_request(parameters)

    @last_response = response

    unless response["text"]
      SmartPrompt.logger.error "No text in STT URL response"
      raise LLMAPIError, "No text in STT URL response"
    end

    SmartPrompt.logger.info "STT URL transcription successful, transcribed #{response['text'].length} characters"

    {
      text: response["text"],
      language: language,
      audio_url: audio_url
    }
  rescue LLMAPIError
    # Keep API-level errors (e.g. the "No text" case above) intact. Without
    # this clause the catch-all below would re-raise them as a generic Error.
    raise
  rescue => e
    SmartPrompt.logger.error "Error in URL transcription: #{e.message}"
    raise Error, "Error in URL transcription: #{e.message}"
  end
end

#transcribe_batch(audio_files, model: nil, language: nil, prompt: nil, temperature: 0.0) ⇒ Object

Batch transcription



149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/smart_prompt/stt_adapter.rb', line 149

# Transcribes several audio files one after another.
#
# Each file is passed to transcribe_audio with the shared options. A failure
# on one file is logged and recorded in the results; it does not stop the
# remaining files from being processed.
#
# @param audio_files [Array<String>] paths of the files to transcribe
# @param model [String, nil] forwarded to transcribe_audio
# @param language [String, nil] forwarded to transcribe_audio
# @param prompt [String, nil] forwarded to transcribe_audio
# @param temperature [Float] forwarded to transcribe_audio
# @return [Hash] with :total_files, :successful, :failed and :results, where
#   each result holds :file, :index, :success and either :transcription or :error
def transcribe_batch(audio_files, model: nil, language: nil, prompt: nil, temperature: 0.0)
  total = audio_files.size
  SmartPrompt.logger.info "STTAdapter: Batch transcribing #{total} audio files"

  outcomes = audio_files.each_with_index.map do |path, position|
    begin
      SmartPrompt.logger.info "Transcribing file #{position + 1}/#{total}: #{File.basename(path)}"

      transcription = transcribe_audio(
        path,
        model: model,
        language: language,
        prompt: prompt,
        temperature: temperature
      )

      { file: path, index: position, transcription: transcription, success: true }
    rescue => e
      SmartPrompt.logger.error "Failed to transcribe #{path}: #{e.message}"
      { file: path, index: position, error: e.message, success: false }
    end
  end

  succeeded, failed = outcomes.partition { |entry| entry[:success] }

  {
    total_files: total,
    successful: succeeded.size,
    failed: failed.size,
    results: outcomes
  }
end