Class: Yosina::TransliterationRecipe

Inherits:
Object
  • Object
show all
Defined in:
lib/yosina/recipes.rb

Overview

Configuration recipe for building transliterator chains.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(kanji_old_new: false, hira_kata: nil, replace_japanese_iteration_marks: false, replace_suspicious_hyphens_to_prolonged_sound_marks: false, replace_combined_characters: false, replace_circled_or_squared_characters: false, replace_ideographic_annotations: false, replace_radicals: false, replace_spaces: false, replace_hyphens: false, replace_mathematical_alphanumerics: false, replace_roman_numerals: false, replace_archaic_hirakatas: false, replace_small_hirakatas: false, convert_historical_hirakatas: nil, combine_decomposed_hiraganas_and_katakanas: false, to_fullwidth: false, to_halfwidth: false, remove_ivs_svs: false, charset: 'unijis_2004') ⇒ TransliterationRecipe

Initialize a new transliterator recipe

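Example (replace_japanese_iteration_marks):
  # Input:  "時々"
  # Output: "時時"
  # Input:  "いすゞ"
  # Output: "いすず"
Example (replace_suspicious_hyphens_to_prolonged_sound_marks):
  # Input:  "スーパ-" (with hyphen-minus)
  # Output: "スーパー" (becomes prolonged sound mark)
Example (replace_circled_or_squared_characters):
  # Input:  "①②③"
  # Output: "(1)(2)(3)"
  # Input:  "㊙㊗"
  # Output: "(秘)(祝)"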

rubocop:disable Metrics/ParameterLists

Parameters:

  • kanji_old_new (Boolean) (defaults to: false)

    Replace old-style kanji glyphs with modern equivalents. Example:

    # Input:  "舊字體の變換"
    # Output: "旧字体の変換"
    
  • hira_kata (String, nil) (defaults to: nil)

    Convert between hiragana and katakana scripts. Example:

    # Input:  "ひらがな" (with 'hira-to-kata')
    # Output: "ヒラガナ"
    # Input:  "カタカナ" (with 'kata-to-hira')
    # Output: "かたかな"
    
  • replace_japanese_iteration_marks (Boolean) (defaults to: false)

    Replace Japanese iteration marks with the characters they represent

  • replace_suspicious_hyphens_to_prolonged_sound_marks (Boolean) (defaults to: false)

    Replace suspicious hyphens with prolonged sound marks

  • replace_combined_characters (Boolean) (defaults to: false)

    Replace combined characters with their corresponding characters. Example:

    # Input:  "㍻" (single character for Heisei era)
    # Output: "平成"
    # Input:  "㈱"
    # Output: "(株)"
    
  • replace_circled_or_squared_characters (Boolean, String) (defaults to: false)

    Replace circled or squared characters with templates

  • replace_ideographic_annotations (Boolean) (defaults to: false)

    Replace ideographic annotations. Example:

    # Input:  "㆖㆘" (ideographic annotations)
    # Output: "上下"
    
  • replace_radicals (Boolean) (defaults to: false)

    Replace Kangxi radicals with CJK ideographs. Example:

    # Input:  "⾔⾨⾷" (Kangxi radicals)
    # Output: "言門食" (CJK ideographs)
    
  • replace_spaces (Boolean) (defaults to: false)

    Replace various space characters. Example:

    # Input:  "A B" (ideographic space U+3000)
    # Output: "A B" (half-width space)
    # Input:  "A B" (non-breaking space U+00A0)
    # Output: "A B" (regular space)
    
  • replace_hyphens (Boolean, Array<String>) (defaults to: false)

    Replace various dash/hyphen symbols. Example:

    # Input:  "2019—2020" (em dash)
    # Output: "2019-2020" (hyphen-minus)
    # Input:  "A–B" (en dash)
    # Output: "A-B"
    
  • replace_mathematical_alphanumerics (Boolean) (defaults to: false)

    Replace mathematical alphanumerics. Example:

    # Input:  "𝐀𝐁𝐂" (mathematical bold)
    # Output: "ABC"
    # Input:  "𝟏𝟐𝟑" (mathematical bold digits)
    # Output: "123"
    
  • replace_roman_numerals (Boolean) (defaults to: false)

    Replace roman numeral characters. Example:

    # Input:  "Ⅲ" (Roman numeral III)
    # Output: "III"
    # Input:  "ⅻ" (Roman numeral xii)
    # Output: "xii"
    
  • combine_decomposed_hiraganas_and_katakanas (Boolean) (defaults to: false)

    Combine decomposed hiraganas/katakanas. Example:

    # Input:  "が" (か + ゙)
    # Output: "が" (single character)
    # Input:  "ヘ゜" (ヘ + ゜)
    # Output: "ペ" (single character)
    
  • to_fullwidth (Boolean, String) (defaults to: false)

    Replace half-width characters with full-width characters. Example:

    # Input:  "ABC123"
    # Output: "ＡＢＣ１２３"
    # Input:  "ｶﾀｶﾅ"
    # Output: "カタカナ"
    
  • to_halfwidth (Boolean, String) (defaults to: false)

    Replace full-width characters with half-width characters. Example:

    # Input:  "ＡＢＣ１２３"
    # Output: "ABC123"
    # Input:  "カタカナ" (with hankaku-kana)
    # Output: "ｶﾀｶﾅ"
    
  • remove_ivs_svs (Boolean, String) (defaults to: false)

    Remove IVS/SVS selectors. Example:

    # Input:  "葛󠄀" (葛 + IVS U+E0100)
    # Output: "葛" (without selector)
    # Input:  "辻󠄀" (辻 + IVS)
    # Output: "辻"
    
  • charset (String) (defaults to: 'unijis_2004')

    Charset for IVS/SVS transliteration



161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/yosina/recipes.rb', line 161

# Initialize a new transliterator recipe.
#
# Each keyword argument is stored as-is in the instance variable of the same
# name. Nothing is validated here; the conflict between to_fullwidth and
# to_halfwidth is only detected later by #build_transliterator_configs.
#
# @param kanji_old_new [Boolean] replace old-style kanji glyphs with modern equivalents
# @param hira_kata [String, nil] convert between hiragana and katakana scripts
# @param replace_japanese_iteration_marks [Boolean] replace Japanese iteration marks
#   with the characters they represent
# @param replace_suspicious_hyphens_to_prolonged_sound_marks [Boolean] replace
#   suspicious hyphens with prolonged sound marks
# @param replace_combined_characters [Boolean] replace combined characters with
#   their corresponding characters
# @param replace_circled_or_squared_characters [Boolean, String] replace circled
#   or squared characters with templates
# @param replace_ideographic_annotations [Boolean] replace ideographic annotations
# @param replace_radicals [Boolean] replace Kangxi radicals with CJK ideographs
# @param replace_spaces [Boolean] replace various space characters
# @param replace_hyphens [Boolean, Array<String>] replace various dash/hyphen symbols
# @param replace_mathematical_alphanumerics [Boolean] replace mathematical alphanumerics
# @param replace_roman_numerals [Boolean] replace roman numeral characters
# @param replace_archaic_hirakatas [Object] NOTE(review): accepted values are not
#   documented alongside the other options; defaults to false
# @param replace_small_hirakatas [Object] NOTE(review): accepted values are not
#   documented alongside the other options; defaults to false
# @param convert_historical_hirakatas [Object, nil] NOTE(review): accepted values
#   are not documented alongside the other options; defaults to nil
# @param combine_decomposed_hiraganas_and_katakanas [Boolean] combine decomposed
#   hiraganas/katakanas
# @param to_fullwidth [Boolean, String] replace half-width with fullwidth characters
# @param to_halfwidth [Boolean, String] replace full-width with half-width characters
# @param remove_ivs_svs [Boolean, String] remove IVS/SVS selectors
# @param charset [String] charset for IVS/SVS transliteration
def initialize(kanji_old_new: false,
               hira_kata: nil,
               replace_japanese_iteration_marks: false,
               replace_suspicious_hyphens_to_prolonged_sound_marks: false,
               replace_combined_characters: false,
               replace_circled_or_squared_characters: false,
               replace_ideographic_annotations: false,
               replace_radicals: false,
               replace_spaces: false,
               replace_hyphens: false,
               replace_mathematical_alphanumerics: false,
               replace_roman_numerals: false,
               replace_archaic_hirakatas: false,
               replace_small_hirakatas: false,
               convert_historical_hirakatas: nil,
               combine_decomposed_hiraganas_and_katakanas: false,
               to_fullwidth: false,
               to_halfwidth: false,
               remove_ivs_svs: false,
               charset: 'unijis_2004')
  # Option name => supplied value, listed in signature order so the instance
  # variables are created in the same order as the keywords are declared.
  settings = {
    kanji_old_new: kanji_old_new,
    hira_kata: hira_kata,
    replace_japanese_iteration_marks: replace_japanese_iteration_marks,
    replace_suspicious_hyphens_to_prolonged_sound_marks: replace_suspicious_hyphens_to_prolonged_sound_marks,
    replace_combined_characters: replace_combined_characters,
    replace_circled_or_squared_characters: replace_circled_or_squared_characters,
    replace_ideographic_annotations: replace_ideographic_annotations,
    replace_radicals: replace_radicals,
    replace_spaces: replace_spaces,
    replace_hyphens: replace_hyphens,
    replace_mathematical_alphanumerics: replace_mathematical_alphanumerics,
    replace_roman_numerals: replace_roman_numerals,
    replace_archaic_hirakatas: replace_archaic_hirakatas,
    replace_small_hirakatas: replace_small_hirakatas,
    convert_historical_hirakatas: convert_historical_hirakatas,
    combine_decomposed_hiraganas_and_katakanas: combine_decomposed_hiraganas_and_katakanas,
    to_fullwidth: to_fullwidth,
    to_halfwidth: to_halfwidth,
    remove_ivs_svs: remove_ivs_svs,
    charset: charset
  }

  settings.each { |option, value| instance_variable_set(:"@#{option}", value) }
end

Instance Attribute Details

#charsetObject

Returns the value of attribute charset.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Charset for IVS/SVS transliteration, as stored in @charset.
#
# @return [String] the stored charset (the constructor default is 'unijis_2004')
def charset
  @charset
end

#combine_decomposed_hiraganas_and_katakanasObject

Returns the value of attribute combine_decomposed_hiraganas_and_katakanas.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Whether decomposed hiraganas/katakanas should be combined, as stored in
# @combine_decomposed_hiraganas_and_katakanas.
#
# @return [Boolean] the stored option value (the constructor default is false)
def combine_decomposed_hiraganas_and_katakanas
  @combine_decomposed_hiraganas_and_katakanas
end

#convert_historical_hirakatasObject

Returns the value of attribute convert_historical_hirakatas.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Returns the value stored in @convert_historical_hirakatas (the constructor
# default is nil).
# NOTE(review): this option has no description in the constructor's parameter
# documentation; confirm the accepted values against the implementation.
def convert_historical_hirakatas
  @convert_historical_hirakatas
end

#hira_kataObject

Returns the value of attribute hira_kata.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Hiragana/katakana conversion setting, as stored in @hira_kata.
#
# @return [String, nil] the stored option value (the constructor default is nil);
#   the constructor examples use 'hira-to-kata' and 'kata-to-hira'
def hira_kata
  @hira_kata
end

#kanji_old_newObject

Returns the value of attribute kanji_old_new.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Whether old-style kanji glyphs should be replaced with modern equivalents,
# as stored in @kanji_old_new.
#
# @return [Boolean] the stored option value (the constructor default is false)
def kanji_old_new
  @kanji_old_new
end

#remove_ivs_svsObject

Returns the value of attribute remove_ivs_svs.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# IVS/SVS selector removal setting, as stored in @remove_ivs_svs.
#
# @return [Boolean, String] the stored option value (the constructor default is false)
def remove_ivs_svs
  @remove_ivs_svs
end

#replace_archaic_hirakatasObject

Returns the value of attribute replace_archaic_hirakatas.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Returns the value stored in @replace_archaic_hirakatas (the constructor
# default is false).
# NOTE(review): this option has no description in the constructor's parameter
# documentation; confirm its semantics against the implementation.
def replace_archaic_hirakatas
  @replace_archaic_hirakatas
end

#replace_circled_or_squared_charactersObject

Returns the value of attribute replace_circled_or_squared_characters.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Setting for replacing circled or squared characters with templates, as
# stored in @replace_circled_or_squared_characters.
#
# @return [Boolean, String] the stored option value (the constructor default is false)
def replace_circled_or_squared_characters
  @replace_circled_or_squared_characters
end

#replace_combined_charactersObject

Returns the value of attribute replace_combined_characters.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Whether combined characters should be replaced with their corresponding
# characters, as stored in @replace_combined_characters.
#
# @return [Boolean] the stored option value (the constructor default is false)
def replace_combined_characters
  @replace_combined_characters
end

#replace_hyphensObject

Returns the value of attribute replace_hyphens.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Dash/hyphen replacement setting, as stored in @replace_hyphens.
#
# @return [Boolean, Array<String>] the stored option value (the constructor
#   default is false)
def replace_hyphens
  @replace_hyphens
end

#replace_ideographic_annotationsObject

Returns the value of attribute replace_ideographic_annotations.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Whether ideographic annotations should be replaced, as stored in
# @replace_ideographic_annotations.
#
# @return [Boolean] the stored option value (the constructor default is false)
def replace_ideographic_annotations
  @replace_ideographic_annotations
end

#replace_japanese_iteration_marksObject

Returns the value of attribute replace_japanese_iteration_marks.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Whether Japanese iteration marks should be replaced with the characters they
# represent, as stored in @replace_japanese_iteration_marks.
#
# @return [Boolean] the stored option value (the constructor default is false)
def replace_japanese_iteration_marks
  @replace_japanese_iteration_marks
end

#replace_mathematical_alphanumericsObject

Returns the value of attribute replace_mathematical_alphanumerics.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Whether mathematical alphanumerics should be replaced, as stored in
# @replace_mathematical_alphanumerics.
#
# @return [Boolean] the stored option value (the constructor default is false)
def replace_mathematical_alphanumerics
  @replace_mathematical_alphanumerics
end

#replace_radicalsObject

Returns the value of attribute replace_radicals.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Whether Kangxi radicals should be replaced with CJK ideographs, as stored in
# @replace_radicals.
#
# @return [Boolean] the stored option value (the constructor default is false)
def replace_radicals
  @replace_radicals
end

#replace_roman_numeralsObject

Returns the value of attribute replace_roman_numerals.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Whether roman numeral characters should be replaced, as stored in
# @replace_roman_numerals.
#
# @return [Boolean] the stored option value (the constructor default is false)
def replace_roman_numerals
  @replace_roman_numerals
end

#replace_small_hirakatasObject

Returns the value of attribute replace_small_hirakatas.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Returns the value stored in @replace_small_hirakatas (the constructor
# default is false).
# NOTE(review): this option has no description in the constructor's parameter
# documentation; confirm its semantics against the implementation.
def replace_small_hirakatas
  @replace_small_hirakatas
end

#replace_spacesObject

Returns the value of attribute replace_spaces.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Whether various space characters should be replaced, as stored in
# @replace_spaces.
#
# @return [Boolean] the stored option value (the constructor default is false)
def replace_spaces
  @replace_spaces
end

#replace_suspicious_hyphens_to_prolonged_sound_marksObject

Returns the value of attribute replace_suspicious_hyphens_to_prolonged_sound_marks.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Whether suspicious hyphens should be replaced with prolonged sound marks, as
# stored in @replace_suspicious_hyphens_to_prolonged_sound_marks.
#
# @return [Boolean] the stored option value (the constructor default is false)
def replace_suspicious_hyphens_to_prolonged_sound_marks
  @replace_suspicious_hyphens_to_prolonged_sound_marks
end

#to_fullwidthObject

Returns the value of attribute to_fullwidth.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Half-width to full-width conversion setting, as stored in @to_fullwidth.
# #build_transliterator_configs raises ArgumentError when this and
# to_halfwidth are both truthy.
#
# @return [Boolean, String] the stored option value (the constructor default is false)
def to_fullwidth
  @to_fullwidth
end

#to_halfwidthObject

Returns the value of attribute to_halfwidth.



56
57
58
# File 'lib/yosina/recipes.rb', line 56

# Full-width to half-width conversion setting, as stored in @to_halfwidth.
# #build_transliterator_configs raises ArgumentError when this and
# to_fullwidth are both truthy.
#
# @return [Boolean, String] the stored option value (the constructor default is false)
def to_halfwidth
  @to_halfwidth
end

Instance Method Details

#build_transliterator_configsArray<Array>

Build transliterator configurations from this recipe

Returns:

  • (Array<Array>)

    Array of transliterator configurations

Raises:

  • (ArgumentError)

    If the recipe contains mutually exclusive options



199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# File 'lib/yosina/recipes.rb', line 199

# Build transliterator configurations from this recipe.
#
# First rejects recipes that enable both to_fullwidth and to_halfwidth. Then a
# fresh TransliteratorConfigListBuilder is passed through every apply_* step in
# a fixed order, each step receiving the value returned by the previous one,
# and the final builder's #build result is returned.
#
# @return [Array<Array>] Array of transliterator configurations
# @raise [ArgumentError] If the recipe contains mutually exclusive options
def build_transliterator_configs
  # Collect every conflict before raising so the message can list them all.
  conflicts = []
  conflicts << 'to_fullwidth and to_halfwidth are mutually exclusive' if to_fullwidth && to_halfwidth
  raise ArgumentError, conflicts.join('; ') if conflicts.any?

  # Steps run top to bottom; the order is part of the contract and must not
  # be rearranged.
  pipeline = %i[
    apply_kanji_old_new
    apply_replace_suspicious_hyphens_to_prolonged_sound_marks
    apply_replace_circled_or_squared_characters
    apply_replace_combined_characters
    apply_replace_ideographic_annotations
    apply_replace_radicals
    apply_replace_spaces
    apply_replace_hyphens
    apply_replace_mathematical_alphanumerics
    apply_replace_roman_numerals
    apply_replace_archaic_hirakatas
    apply_replace_small_hirakatas
    apply_convert_historical_hirakatas
    apply_combine_decomposed_hiraganas_and_katakanas
    apply_to_fullwidth
    apply_hira_kata
    apply_replace_japanese_iteration_marks
    apply_to_halfwidth
    apply_remove_ivs_svs
  ]

  # `send` is used because the apply_* steps may be private helpers.
  pipeline
    .reduce(TransliteratorConfigListBuilder.new) { |builder, step| send(step, builder) }
    .build
end