Class: IsoDoc::I18n

Inherits:

Object

Object
IsoDoc::I18n

show all

Defined in: lib/isodoc/date.rb,
lib/isodoc/i18n.rb,
lib/isodoc/l10n.rb,
lib/isodoc/l10n_cjk.rb,
lib/isodoc/i18n-yaml.rb,
lib/isodoc/i18n/version.rb,
lib/isodoc/liquid/liquid.rb

Defined Under Namespace

Modules: Liquid

Constant Summary collapse

CJK_SCRIPTS =

%w(Hans Hant Jpan Kore).freeze

INFLECTIONS =

{
  number: "sg",
  case: "nom",
  gender: "m",
  person: "3rd",
  voice: "act",
  mood: "ind",
  tense: "pres",
}.freeze

INFLECTION_ORDER =

%i(voice mood tense number case gender person).freeze

ZH_CHAR = Uses the comprehensive CJK definition from metanorma-utils. This includes Han, Katakana, Hiragana, Hangul, Bopomofo and all CJK extensions.

"(#{Metanorma::Utils::CJK})".freeze

LATIN_PUNCT =

/[:,.()\[\];?!-]/.freeze

ZH_NON_PUNCT = CJK character which is not punctuation

"(#{
[
  Metanorma::Utils.singleton_class::HAN,
  Metanorma::Utils.singleton_class::HAN_IDC,
  Metanorma::Utils.singleton_class::KANBUN,
  Metanorma::Utils.singleton_class::CJK_COMPAT_IDEOGRAPHS,
  Metanorma::Utils.singleton_class::HAN_COMPAT_IDEOGRAPHS,
  Metanorma::Utils.singleton_class::HANGUL,
  Metanorma::Utils.singleton_class::HIRAGANA,
  Metanorma::Utils.singleton_class::KATAKANA,
  Metanorma::Utils.singleton_class::BOPOMOFO,
].join("|")})".freeze

ZH1_PUNCT = Condition for converting punctuation to double width, in case of options (Strict condition) CJK before, CJK after, modulo ignorable characters: 1a. CJK character, or start of string. Latin spaces optional.

/(#{ZH_CHAR}|^)(\s*)$/xo.freeze

ZH2_PUNCT = 1b. Latin spaces optional, Latin punct which will also convert to CJK, CJK character, or end of string.

/^\s*#{LATIN_PUNCT}*(#{ZH_CHAR}|$)/xo.freeze

ZH1_NO_SPACE = CJK before, space after: 2a. CJK char, followed by optional Latin punct which will also convert to CJK

/#{ZH_CHAR}#{LATIN_PUNCT}*$/xo.freeze

OPT_PUNCT_SPACE = 2b. optional Latin punct which will also convert to CJK, then space

/^($|#{LATIN_PUNCT}*\s)/xo.freeze

ZH_NUMERALS = Chinese numerals (common + formal/financial forms). Explicit characters are needed because Chinese numeral ideographs are not tagged with the Unicode Number property. Uses alternation instead of a character class to properly include \p{N}.

"(?:[零一二三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟萬億兆]|\\p{N})".freeze

ZH1_DASH = Contexts for converting en-dashes to full-width Before: CJK or start of string, no digits

# Preceding context must end in a CJK character (or be empty) that is NOT a
# numeral. The negative lookbehind is spelled (?<!...); the earlier (?<!=...)
# only rejected a literal "=" followed by a numeral, so numerals before the
# dash were never excluded.
/(#{ZH_CHAR}|^)(?<!#{ZH_NUMERALS})$/xo.freeze

ZH2_DASH = After: no optional digits, CJK or end of string

/^(?!#{ZH_NUMERALS})(#{ZH_CHAR}|$)/xo.freeze

ZH1_NUM_DASH = Before: CJK or start of string, optional digits

/#{ZH_NUMERALS}$/xo.freeze

ZH2_NUM_DASH = After: optional digits, CJK or end of string

/^#{ZH_NUMERALS}/xo.freeze

ZH_PUNCT_CONTEXTS =

[[ZH1_PUNCT, ZH2_PUNCT], [ZH1_NO_SPACE, OPT_PUNCT_SPACE],
[/(\s|^)$/, /^#{ZH_CHAR}/o]].freeze

ZH_PUNCT_AUTOTEXT = map of YAML punct keys to auto-text Latin equivalents

# Keys are the YAML "punct" label names; values are the Latin auto-text
# characters that are replaced by the label's CJK punctuation.
{
  colon: ":",
  comma: ",",
  # "enum-comma": ",", # enum-comma is ambiguous with comma
  semicolon: ";",
  period: ".",
  "close-paren": ")",
  "open-paren": "(",
  "close-bracket": "]",
  "open-bracket": "[",
  "question-mark": "?",
  "exclamation-mark": "!",
  "em-dash": "—",
  "open-quote": "“",
  "close-quote": "”",
  # Opening nested quote is U+2018; it previously duplicated the closing
  # quote U+2019, so opening quotes were never converted and closing quotes
  # were consumed by the "open" mapping first.
  "open-nested-quote": "‘",
  "close-nested-quote": "’",
  ellipse: "…",
}.freeze

VERSION =

"1.4.4".freeze

Instance Attribute Summary collapse

#labels ⇒ Object

Returns the value of attribute labels.
#lang ⇒ Object readonly

Returns the value of attribute lang.
#locale ⇒ Object readonly

Returns the value of attribute locale.
#script ⇒ Object readonly

Returns the value of attribute script.

Class Method Summary collapse

.cjk_extend(text) ⇒ Object
.l10n(text, lang = @lang, script = @script, options = {}) ⇒ Object

Instance Method Summary collapse

#am_pm_i18n(val) ⇒ Object
#bidiwrap(text, lang, script) ⇒ Object
#bidiwrap_vars(lang, script) ⇒ Object
#boolean_conj(list, conn) ⇒ Object
#build_esc_indices(xml, text_nodes) ⇒ Object

Build set of indices for text nodes within <esc> tags Handles both namespaced and non-namespaced <esc> elements.
#build_text_cache(text_nodes, prev_context = nil, foll_context = nil) ⇒ Object

Cache text content once per method call to avoid repeated .text calls. Build text cache with optional prepended/appended context. Also, reduce multiple spaces to single, to avoid misrecognition of space.
#calendar_data ⇒ Object
#cjk_extend(title) ⇒ Object
#cleanup_entities(text, is_xml: true) ⇒ Object
#convert_date_format(fmt) ⇒ Object
#date(value, format) ⇒ Object
#date_i18n(val) ⇒ Object
#day_i18n(val) ⇒ Object
#enum_comma ⇒ Object
#get ⇒ Object
#inflect(word, options) ⇒ Object

can skip category if not present.
#inflect_ordinal(num, term, ord_class) ⇒ Object

ord class is either SpelloutRules or OrdinalRules.
#init_labels(i18nyaml, i18nhash) ⇒ Object
#init_zh_punct_map ⇒ Object

Pre-defined punctuation mappings for efficiency.
#initialize(lang, script, locale: nil, i18nyaml: nil, i18nhash: nil) ⇒ I18n constructor

A new instance of I18n.
#interleave_space_cjk?(text) ⇒ Boolean
#l10_context_valid?(context, idx, delim, regex) ⇒ Boolean
#l10_zh1(text, prev, foll, _script, options) ⇒ Object

note: we can’t differentiate comma from enumeration comma 、 def l10_zh1(text, _script).
#l10n(text, lang = @lang, script = @script, options = {}) ⇒ Object

Function localising spaces and punctuation. options[:prev] and options[:foll] are optional context strings; options[:proportional_mixed_cjk] allows contextual full-width vs half-width punctuation.
#l10n_context(nodes, idx) ⇒ Object

Fallback method for backward compatibility.
#l10n_context_cached(text_cache, idx) ⇒ Object

previous, following context of current text node: do not use just the immediately adjoining text tokens for context deal with spaces and empty text by just concatenating entire context Optimized to avoid O(n²) complexity by using pre-cached text content.
#l10n_context_found_delimiter?(token, delim) ⇒ Boolean
#l10n_fr(text, locale, options) ⇒ Object
#l10n_fr1(text, prev, foll, locale) ⇒ Object
#l10n_gsub(text, prev, foll, delim, regexes) ⇒ Object

text: string we are scanning for instances of delim to replace. prev: string preceding text, as additional token of context. foll: string following text, as additional token of context. delim: delim[0] is the symbol we want to replace, delim[1] its replacement. regexes: a list of regex pairs: the context before the found token, and the context after the found token, under which replacing it with delim[1] is permitted.
#l10n_gsub_context(text, prev, foll, delim) ⇒ Object

split string being scanned, and its contextual tokens before and after, into array of tokens determining whether to replace instances of delim.
#l10n_prep(text, options) ⇒ Object
#l10n_zh(text, script, options) ⇒ Object
#l10n_zh_dash(text, prev, foll) ⇒ Object
#l10n_zh_punct(text, prev, foll, options) ⇒ Object
#l10n_zh_remove_space(text, prev, foll) ⇒ Object
#liquid_init ⇒ Object
#load_yaml(lang, script, i18nyaml = nil, i18nhash = nil) ⇒ Object
#load_yaml1(lang, script) ⇒ Object
#load_yaml2(lang) ⇒ Object

locally defined in calling class.
#merge(new_labels) ⇒ Object
#merge_yaml_files(ret, i18nyaml) ⇒ Object

i18nyaml entries are nominally paths, but callers sometimes pass YAML values (label keys, multi-line prose).
#month_i18n(val) ⇒ Object
#normalise_hash(ret) ⇒ Object
#ordinal_key(term) ⇒ Object
#parse_path(path_expr) ⇒ Object
#populate(keys, vars = {}) ⇒ Object

populate with variables, Liquid, inflections, ordinals/spellout.
#postprocess(labels) ⇒ Object
#resolve_path(path_expr, labels, original_expr) ⇒ Object
#resolve_references(obj, labels) ⇒ Object
#resolve_string_references(str, labels) ⇒ Object
#self_reference_resolve(labels) ⇒ Object
#set(key, val) ⇒ Object
#to_xml(node) ⇒ Object
#tw_cldr_lang ⇒ Object
#tw_cldr_localize(num) ⇒ Object

Constructor Details

#initialize(lang, script, locale: nil, i18nyaml: nil, i18nhash: nil) ⇒ `I18n`

Returns a new instance of I18n.

# File 'lib/isodoc/i18n.rb', line 16

# Set up a localisation helper for one language/script pair.
#
# @param lang [String] language code, kept in @lang
# @param script [String, nil] script code, kept in @script
# @param locale [String, nil] locale, kept in @locale
# @param i18nyaml [Object, nil] forwarded to #init_labels
# @param i18nhash [Object, nil] forwarded to #init_labels
def initialize(lang, script, locale: nil, i18nyaml: nil, i18nhash: nil)
  @lang, @script, @locale = lang, script, locale
  # Calendar for the requested language, plus an English one kept alongside it.
  @cal = calendar_data
  @cal_en = TwitterCldr::Shared::Calendar.new(:en)
  # Entity decoder shared by the clean-up helpers.
  @c = HTMLEntities.new
  init_labels(i18nyaml, i18nhash)
  liquid_init
  self
end

Instance Attribute Details

#labels ⇒ `Object`

Returns the value of attribute labels.



13
14
15

# File 'lib/isodoc/i18n.rb', line 13

# Reader for @labels, the label hash assigned by #init_labels.
#
# @return [Object] the current value of @labels
def labels
  @labels
end

#lang ⇒ `Object` (readonly)

Returns the value of attribute lang.



14
15
16

# File 'lib/isodoc/i18n.rb', line 14

# Reader for @lang, the language given to the constructor.
#
# @return [Object] the current value of @lang
def lang
  @lang
end

#locale ⇒ `Object` (readonly)

Returns the value of attribute locale.



14
15
16

# File 'lib/isodoc/i18n.rb', line 14

# Reader for @locale, the locale given to the constructor (nil by default).
#
# @return [Object] the current value of @locale
def locale
  @locale
end

#script ⇒ `Object` (readonly)

Returns the value of attribute script.



14
15
16

# File 'lib/isodoc/i18n.rb', line 14

# Reader for @script, the script given to the constructor.
#
# @return [Object] the current value of @script
def script
  @script
end

Class Method Details

.cjk_extend(text) ⇒ `Object`



141
142
143

# File 'lib/isodoc/l10n_cjk.rb', line 141

# Class-level convenience wrapper around the instance method #cjk_extend.
#
# The previous body called `cjk_extend(text)` with the class itself as the
# implicit receiver, i.e. it called itself and always ended in
# SystemStackError. It now delegates to an instance.
#
# @param text [String] text to process
# @return [String] result of the instance-level #cjk_extend
def self.cjk_extend(text)
  # NOTE(review): falls back to zh/Hans when no class-level @lang/@script is
  # set; a full instance is built per call. Confirm these defaults with callers.
  new(@lang || "zh", @script || "Hans").cjk_extend(text)
end

.l10n(text, lang = @lang, script = @script, options = {}) ⇒ `Object`



6
7
8

# File 'lib/isodoc/l10n.rb', line 6

# Class-level convenience wrapper around the instance method #l10n.
#
# The previous body called `l10n(...)` with the class itself as the implicit
# receiver, i.e. it called itself and always ended in SystemStackError.
# It now delegates to an instance built for the requested language/script.
#
# @param text [String] text to localise
# @param lang [String, nil] language; defaults to the class-level @lang
# @param script [String, nil] script; defaults to the class-level @script
# @param options [Hash] forwarded unchanged to the instance method
# @return [String, nil] result of the instance-level #l10n
def self.l10n(text, lang = @lang, script = @script, options = {})
  # NOTE(review): a new instance is constructed on every call — confirm this
  # cost is acceptable for class-level callers.
  new(lang, script, locale: options[:locale])
    .l10n(text, lang, script, options)
end

Instance Method Details

#am_pm_i18n(val) ⇒ `Object`

# File 'lib/isodoc/date.rb', line 19

# Replace the AM/PM placeholders in +val+ with the period names held by @cal:
# lower-cased for the P placeholders, upper-cased for the p placeholders.
#
# @param val [String] string containing the placeholders
# @return [String] new string with the placeholders replaced
def am_pm_i18n(val)
  am = @cal.periods[:am]
  pm = @cal.periods[:pm]
  substitutions = [
    [/%\u200cP<am>/, am.downcase],
    [/%\u200cP<pm>/, pm.downcase],
    [/%\u200cp<AM>/, am.upcase],
    [/%\u200cp<PM>/, pm.upcase],
  ]
  substitutions.reduce(val) do |acc, (pattern, replacement)|
    acc.gsub(pattern, replacement)
  end
end

#bidiwrap(text, lang, script) ⇒ `Object`

# File 'lib/isodoc/l10n.rb', line 24

# Wrap +text+ in directional marks when its direction differs from the
# direction of this instance's own script (as reported by #bidiwrap_vars).
#
# @param text [String] text to wrap
# @param lang [String] language of the text
# @param script [String, nil] script of the text
# @return [String] wrapped text, or +text+ itself when directions agree
def bidiwrap(text, lang, script)
  own_script, own_rtl, context_rtl = bidiwrap_vars(lang, script)
  if own_rtl
    return text if context_rtl

    # RTL text inside LTR context: Arabic-script mark or generic RTL mark.
    mark = %w(Arab Aran).include?(own_script) ? "&#x61c;" : "&#x200f;"
    return "#{mark}#{text}#{mark}"
  end
  return text unless context_rtl

  # LTR text inside RTL context.
  "&#x200e;#{text}&#x200e;"
end

#bidiwrap_vars(lang, script) ⇒ `Object`

# File 'lib/isodoc/l10n.rb', line 34

# Work out the script of the text and the right-to-left status of both the
# text and this instance.
#
# @param lang [String] language of the text
# @param script [String, nil] script of the text; the default script of
#   +lang+ is used when nil
# @return [Array] [script of text, text is RTL, instance script is RTL]
def bidiwrap_vars(lang, script)
  text_script = script || Metanorma::Utils.default_script(lang)
  text_rtl = Metanorma::Utils.rtl_script?(text_script)
  own_script = @script || Metanorma::Utils.default_script(@lang)
  own_rtl = Metanorma::Utils.rtl_script?(own_script)
  [text_script, text_rtl, own_rtl]
end

#boolean_conj(list, conn) ⇒ `Object`

# File 'lib/isodoc/i18n.rb', line 58

# Join +list+ with the conjunction +conn+, using the label templates
# "binary_<conn>" (two items) and "multiple_<conn>" (three or more), whose
# %1 and %2 slots receive the leading item(s) and the last item.
#
# Slots are filled in a single pass with a block replacement, so item text
# is inserted verbatim: backslashes in an item are not treated as
# back-references, and a "%2" occurring inside the first item is not
# substituted again. Every occurrence of a slot in the template is filled.
#
# @param list [Array<String>] items to join
# @param conn [String] conjunction name used to pick the label template
# @return [String] "" for an empty list, the item itself for one item,
#   otherwise the filled template
def boolean_conj(list, conn)
  fill = lambda do |template, first, second|
    slots = { "%1" => first, "%2" => second }
    template.gsub(/%[12]/) { |slot| slots[slot] }
  end
  case list.size
  when 0 then ""
  when 1 then list.first
  when 2 then fill.call(@labels["binary_#{conn}"], list[0], list[1])
  else
    template = @labels["multiple_#{conn}"]
    # All but the last item are joined with the enumeration comma and localised.
    head = l10n(list[0..-2].join(enum_comma), @lang, @script)
    fill.call(template, head, list[-1])
  end
end

#build_esc_indices(xml, text_nodes) ⇒ `Object`

Build set of indices for text nodes within <esc> tags Handles both namespaced and non-namespaced <esc> elements

# File 'lib/isodoc/l10n.rb', line 56

# Collect the positions in +text_nodes+ of nodes that sit inside an <esc>
# element, whether or not that element is namespaced.
#
# @param xml [#xpath] parsed fragment to query
# @param text_nodes [Array] text nodes in document order
# @return [Set<Integer>] indices into +text_nodes+
def build_esc_indices(xml, text_nodes)
  plain = xml.xpath(".//esc//text()")
  any_namespace = xml.xpath(".//*[local-name()='esc']//text()")
  escaped = Set.new(plain + any_namespace)
  hits = text_nodes.each_index.select do |pos|
    escaped.include?(text_nodes[pos])
  end
  Set.new(hits)
end

#build_text_cache(text_nodes, prev_context = nil, foll_context = nil) ⇒ `Object`

Cache text content once per method call to avoid repeated .text calls. Build text cache with optional prepended/appended context. Also, reduce multiple spaces to single, to avoid misrecognition of space.

# File 'lib/isodoc/l10n.rb', line 70

# Read the text of every node once, collapsing each whitespace run to a
# single space, and optionally frame the list with surrounding context.
#
# @param text_nodes [Array<#text>] nodes to read
# @param prev_context [String, nil] prepended when given
# @param foll_context [String, nil] appended when given
# @return [Array<String>] cached texts
def build_text_cache(text_nodes, prev_context = nil, foll_context = nil)
  raw = text_nodes.map(&:text)
  cache = []
  cache << prev_context if prev_context
  raw.each { |content| cache << content.gsub(/\s+/, " ") }
  cache << foll_context if foll_context
  cache
end

#calendar_data ⇒ `Object`

# File 'lib/isodoc/i18n.rb', line 34

# Build the TwitterCldr calendar for this instance's language, falling back
# to the English calendar if that raises.
#
# @return [TwitterCldr::Shared::Calendar]
def calendar_data
  begin
    TwitterCldr::Shared::Calendar.new(tw_cldr_lang)
  rescue StandardError
    TwitterCldr::Shared::Calendar.new(:en)
  end
end

#cjk_extend(title) ⇒ `Object`

# File 'lib/isodoc/l10n_cjk.rb', line 145

# Spread a title out by inserting an ideographic space (U+3000) before each
# character whose pairing with the preceding character is accepted by
# #interleave_space_cjk?.
#
# Entities in +title+ are decoded first, and adjacent-character pairs are
# taken from that decoded text. Previously the pairs were read from the
# undecoded +title+ at the same index, so any entity in the title made the
# inspected pair differ from the characters actually being emitted.
#
# @param title [String] title text, possibly containing entities
# @return [String] decoded title with ideographic spaces interleaved
def cjk_extend(title)
  chars = @c.decode(title).chars
  chars.each_with_index.map do |char, idx|
    if idx.zero? || !interleave_space_cjk?(chars[idx - 1] + char)
      char
    else "\u3000#{char}"
    end
  end.join
end

#cleanup_entities(text, is_xml: true) ⇒ `Object`

# File 'lib/isodoc/i18n.rb', line 78

# Decode entities with @c. When +is_xml+ is true the string is split on
# "<" and ">" and only the first piece of each group of four (the piece in
# the text position) is decoded; the other pieces pass through unchanged.
#
# @param text [String] input text
# @param is_xml [Boolean] whether +text+ contains markup to leave alone
# @return [String] decoded text
def cleanup_entities(text, is_xml: true)
  return @c.decode(text) unless is_xml

  groups = text.split(/([<>])/).each_slice(4)
  groups.flat_map do |text_part, *markup|
    [@c.decode(text_part), *markup]
  end.join
end

#convert_date_format(fmt) ⇒ `Object`

# File 'lib/isodoc/date.rb', line 10

# Rewrite a date format string: "%_" becomes a space, and each of the
# directives B b h P p A a (optionally preceded by "^") is turned into a
# placeholder of the form "%<U+200C>[^]X<%X>".
#
# @param fmt [String] format string
# @return [String] rewritten format string
def convert_date_format(fmt)
  spaced = fmt.gsub(/%_/, " ")
  spaced.gsub(/%(\^?)([BbhPpAa])/) do
    caret = Regexp.last_match(1)
    directive = Regexp.last_match(2)
    "%\u200c#{caret}#{directive}<%#{directive}>"
  end
end

#date(value, format) ⇒ `Object`

# File 'lib/isodoc/date.rb', line 5

# Format an ISO 8601 date/time string with +format+ and localise the result.
#
# @param value [String] ISO 8601 date or date-time
# @param format [String] format string, preprocessed by #convert_date_format
# @return [String] localised, formatted date
def date(value, format)
  parsed = DateTime.iso8601(value)
  pattern = convert_date_format(format)
  date_i18n(parsed.strftime(pattern))
end

#date_i18n(val) ⇒ `Object`



15
16
17

# File 'lib/isodoc/date.rb', line 15

# Localise the placeholders in a formatted date: AM/PM first, then month
# names, then day names.
#
# @param val [String] formatted date containing placeholders
# @return [String] localised date string
def date_i18n(val)
  with_periods = am_pm_i18n(val)
  with_months = month_i18n(with_periods)
  day_i18n(with_months)
end

#day_i18n(val) ⇒ `Object`

# File 'lib/isodoc/date.rb', line 37

# Replace day-name placeholders (flags A = wide, a = abbreviated) with the
# names from @cal. English names from @cal_en identify the placeholder; the
# "^" form is replaced by the upper-cased name.
#
# Works on a copy of +val+: the argument is no longer modified in place, so
# frozen strings are accepted (the previous in-place gsub! raised
# FrozenError on them).
#
# @param val [String] string containing the placeholders
# @return [String] new string with day names localised
def day_i18n(val)
  result = val.dup
  { A: :wide, a: :abbreviated }.each do |flag, width|
    @cal_en.calendar_data[:days][:format][width].each do |key, en_name|
      local_name = @cal.calendar_data[:days][:format][width][key]
      result.gsub!(/%\u200c#{flag}<#{en_name}>/, local_name)
      result.gsub!(/%\u200c\^#{flag}<#{en_name}>/, local_name.upcase)
    end
  end
  result
end

#enum_comma ⇒ `Object`

# File 'lib/isodoc/i18n.rb', line 71

# Enumeration separator wrapped in <enum-comma> markup: the "enum-comma"
# punctuation label when one exists and @script is a CJK script, otherwise
# a Latin comma followed by a space.
#
# @return [String]
def enum_comma
  cjk_comma = @labels.dig("punct", "enum-comma")
  if cjk_comma && CJK_SCRIPTS.include?(@script)
    "<enum-comma>#{cjk_comma}</enum-comma>"
  else
    "<enum-comma>,</enum-comma> "
  end
end

#get ⇒ `Object`



129
130
131

# File 'lib/isodoc/i18n-yaml.rb', line 129

# Return the label hash held in @labels.
#
# @return [Object] the current value of @labels
def get
  @labels
end

#inflect(word, options) ⇒ `Object`

can skip category if not present

# File 'lib/isodoc/i18n.rb', line 132

# Look up the inflected form of +word+ under the "inflection" label.
# Categories are walked in INFLECTION_ORDER; a category absent from the
# entry is skipped, and the default from INFLECTIONS is used when
# +options+ does not name a value.
#
# @param word [String] word to inflect
# @param options [Hash] category => value choices
# @return [String] inflected form, or +word+ when none is found
def inflect(word, options)
  entry = @labels.dig("inflection", word)
  return word unless entry
  return entry if entry.is_a?(String)

  INFLECTION_ORDER.each do |category|
    choice = options[category] || INFLECTIONS[category]
    entry = entry[choice] if entry[choice]
    return entry if entry.is_a?(String)
  end
  word
end

#inflect_ordinal(num, term, ord_class) ⇒ `Object`

ord class is either SpelloutRules or OrdinalRules

# File 'lib/isodoc/i18n.rb', line 90

# Render +num+ through the rule-based number formatter, using the rule set
# named by the labels for +ord_class+ (SpelloutRules or OrdinalRules).
# When "ordinal_keys" is non-empty, the rule set is selected by the key
# built from +term+. On any error, retries with num localised directly to
# @lang.
#
# @param num [Integer] number to render
# @param term [Hash] term attributes, passed to #ordinal_key
# @param ord_class [String] "SpelloutRules" or "OrdinalRules"
# @return [String]
def inflect_ordinal(num, term, ord_class)
  keyed = !(@labels["ordinal_keys"].nil? || @labels["ordinal_keys"].empty?)
  lbl = keyed ? @labels[ord_class][ordinal_key(term)] : @labels[ord_class]
  tw_cldr_localize(num).to_rbnf_s(ord_class, lbl)
rescue StandardError
  num.localize(@lang.to_sym).to_rbnf_s(ord_class, lbl)
end

#init_labels(i18nyaml, i18nhash) ⇒ `Object`

# File 'lib/isodoc/i18n.rb', line 40

# Load the labels for @lang/@script into @labels and expose each top-level
# label key as a reader method on the class.
#
# @param i18nyaml [Object, nil] forwarded to #load_yaml
# @param i18nhash [Object, nil] forwarded to #load_yaml
def init_labels(i18nyaml, i18nhash)
  @labels = load_yaml(@lang, @script, i18nyaml, i18nhash)
  @labels["language"] = @lang
  @labels["script"] = @script
  @labels.each_key do |k|
    # Skip keys whose lower-cased name is already a method of this instance
    # or of the class object, so existing methods are not overridden.
    methods.include?(k.downcase.to_sym) ||
      self.class.methods.include?(k.downcase.to_sym) and next
    # Defined on the class, so the reader is shared by all instances; it
    # reads the current label hash through #get on each call.
    self.class.send(:define_method, k.downcase) { get[k] }
  end
end

#init_zh_punct_map ⇒ `Object`

Pre-defined punctuation mappings for efficiency

# File 'lib/isodoc/l10n_cjk.rb', line 78

# Build the punctuation mapping table: one [Latin char, label punctuation,
# ZH_PUNCT_CONTEXTS] triple for each ZH_PUNCT_AUTOTEXT entry that has a
# matching "punct" label.
#
# @return [Array<Array>] list of mapping triples
def init_zh_punct_map
  triples = ZH_PUNCT_AUTOTEXT.map do |key, latin|
    cjk = @labels.dig("punct", key.to_s)
    [latin, cjk, ZH_PUNCT_CONTEXTS] if cjk
  end
  triples.compact
end

#interleave_space_cjk?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/isodoc/l10n_cjk.rb', line 154

# Decide whether a space may be interleaved between the two characters of
# +text+. Returns nil unless +text+ is exactly two characters long, false
# when the pair hits any of the exclusions below, true otherwise.
#
# @param text [String] two-character string
# @return [Boolean, nil]
def interleave_space_cjk?(text)
  return unless text.size == 2

  # Doubled dashes / leaders that must stay together.
  doubled = ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
             "\u22ef\u22ef"]
  return false if doubled.include?(text)

  exclusions = [
    # two digits, two Latin letters, or any whitespace
    /\d\d|\p{Latin}\p{Latin}|[[:space:]]/,
    # opening quote/bracket as first character
    /^[\u2018\u201c(\u3014\[{\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/,
    # closing quote/bracket as last character
    /[\u2019\u201d)\u3015\]}\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/,
    # other punctuation anywhere in the pair
    /[\u3002.\u3001,\u30fb:;\u2010\u301c\u30a0\u2013!?\u203c\u2047\u2048\u2049]/,
  ]
  exclusions.none? { |pattern| pattern.match?(text) }
end

#l10_context_valid?(context, idx, delim, regex) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/isodoc/l10n.rb', line 132

# Check whether the token at +idx+ is the delimiter and sits in a context
# where replacing it is permitted.
#
# @param context [Array<String>] tokens, including leading/trailing context
# @param idx [Integer] position of the candidate token
# @param delim [Array] [delimiter, replacement]
# @param regex [Array<Array<Regexp>>, nil] [before, after] pattern pairs;
#   nil permits the replacement unconditionally
# @return [Object] false when the token is not the delimiter, true when
#   +regex+ is nil, otherwise the first matching pair or nil
def l10_context_valid?(context, idx, delim, regex)
  return false unless l10n_context_found_delimiter?(context[idx], delim)
  return true if regex.nil?

  regex.detect do |before_re, after_re|
    before_re.match?(context[0...idx].join) && # preceding context
      after_re.match?(context[(idx + 1)..-1].join) # following context
  end
end

#l10_zh1(text, prev, foll, _script, options) ⇒ `Object`

note: we can’t differentiate comma from enumeration comma 、def l10_zh1(text, _script)

# File 'lib/isodoc/l10n_cjk.rb', line 101

# Localise one text run for CJK: punctuation first, then space removal,
# then dashes.
#
# @param text [String] text run
# @param prev [String] preceding context
# @param foll [String] following context
# @param _script [String] unused
# @param options [Hash] forwarded to #l10n_zh_punct
# @return [String] localised text
def l10_zh1(text, prev, foll, _script, options)
  punctuated = l10n_zh_punct(text, prev, foll, options)
  despaced = l10n_zh_remove_space(punctuated, prev, foll)
  l10n_zh_dash(despaced, prev, foll)
end

#l10n(text, lang = @lang, script = @script, options = {}) ⇒ `Object`

Function localising spaces and punctuation. options[:prev] and options[:foll] are optional context strings; options[:proportional_mixed_cjk] allows contextual full-width vs half-width punctuation.

# File 'lib/isodoc/l10n.rb', line 14

# Localise spaces and punctuation in +text+ for +lang+/+script+, strip
# <esc> tags, and apply bidirectional wrapping.
#
# The <esc> tags are removed with a non-destructive gsub. The previous
# in-place gsub! modified the caller's string for languages that take
# neither the CJK nor the French path, and raised FrozenError when that
# string was frozen.
#
# @param text [String, nil] text (may contain markup) to localise
# @param lang [String] language; defaults to @lang
# @param script [String, nil] script; defaults to @script
# @param options [Hash] :locale overrides @locale; the hash is also passed
#   on to the language-specific helpers
# @return [String, nil] localised text
def l10n(text, lang = @lang, script = @script, options = {})
  locale = options[:locale] || @locale
  %w(zh ja ko).include?(lang) and
    text = l10n_zh(text, script, options)
  lang == "fr" and
    text = l10n_fr(text, locale || "FR", options)
  text = text&.gsub(/<esc>|<\/esc>/, "") # Strip esc tags without mutating input
  bidiwrap(text, lang, script)
end

#l10n_context(nodes, idx) ⇒ `Object`

Fallback method for backward compatibility

# File 'lib/isodoc/l10n.rb', line 88

# Concatenate the text of all nodes before +idx+ and of all nodes after it.
#
# @param nodes [Array<#text>] text nodes
# @param idx [Integer] position of the current node
# @return [Array<String>] [preceding text, following text]
def l10n_context(nodes, idx)
  joined_text = ->(slice) { slice.map(&:text).join }
  before = joined_text.call(nodes[0...idx])
  after = joined_text.call(nodes[(idx + 1)...(nodes.size)])
  [before, after]
end

#l10n_context_cached(text_cache, idx) ⇒ `Object`

previous, following context of current text node: do not use just the immediately adjoining text tokens for context deal with spaces and empty text by just concatenating entire context Optimized to avoid O(n²) complexity by using pre-cached text content

# File 'lib/isodoc/l10n.rb', line 81

# Concatenate all cached texts before +idx+ and all cached texts after it.
#
# @param text_cache [Array<String>] cached texts
# @param idx [Integer] position of the current entry
# @return [Array<String>] [preceding text, following text]
def l10n_context_cached(text_cache, idx)
  before = text_cache[0...idx]
  after = text_cache[(idx + 1)...text_cache.size]
  [before.join, after.join]
end

#l10n_context_found_delimiter?(token, delim) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/isodoc/l10n.rb', line 141

# Test whether +token+ is the delimiter described by delim[0]: a regex
# match when delim[0] is a Regexp, equality otherwise.
#
# @param token [String] token to test
# @param delim [Array] [delimiter, replacement]
# @return [Boolean]
def l10n_context_found_delimiter?(token, delim)
  target = delim[0]
  return target.match?(token) if target.is_a?(Regexp) # punct to convert

  token == target
end

#l10n_fr(text, locale, options) ⇒ `Object`

# File 'lib/isodoc/l10n.rb', line 94

# Apply French punctuation spacing to every non-escaped text node of +text+.
#
# @param text [String] text, possibly containing markup
# @param locale [String] locale passed on to #l10n_fr1
# @param options [Hash] passed to #l10n_prep
# @return [String] serialised result
def l10n_fr(text, locale, options)
  nodes, cache, fragment, prev, _foll, escaped = l10n_prep(text, options)
  nodes.each_with_index do |node, pos|
    next if escaped.include?(pos) # Skip escaped nodes

    # The cache has one extra leading entry when prev context was supplied.
    cache_pos = prev ? pos + 1 : pos
    before, after = l10n_context_cached(cache, cache_pos)
    decoded = cleanup_entities(node.text, is_xml: false)
    node.content = l10n_fr1(decoded, before, after, locale)
  end
  to_xml(fragment)
end

#l10n_fr1(text, prev, foll, locale) ⇒ `Object`

# File 'lib/isodoc/l10n.rb', line 149

# French spacing for one text run, in three passes: a narrow no-break space
# before closing guillemets and ; ? !, a narrow no-break space after
# opening guillemets, and a space before the colon (narrow no-break for
# locale "CH", no-break otherwise).
#
# @param text [String] text run
# @param prev [String] preceding context
# @param foll [String] following context
# @param locale [String] locale code
# @return [String] text with spacing applied
def l10n_fr1(text, prev, foll, locale)
  colon_gap = locale == "CH" ? "\u202f" : "\u00a0"
  passes = [
    [[/[»›;?!]/, "\u202f\\0"], [[/\p{Alnum}$/, /^(\s|$)/]]],
    [[/[«‹]/, "\\0\u202f"], [[/$/, /^(?!\p{Zs})./]]],
    [[":", "#{colon_gap}\\0"], [[/\p{Alnum}$/, /^(\s|$)/]]],
  ]
  passes.reduce(text) do |acc, (delim, contexts)|
    l10n_gsub(acc, prev, foll, delim, contexts)
  end
end

#l10n_gsub(text, prev, foll, delim, regexes) ⇒ `Object`

text: string we are scanning for instances of delim to replace. prev: string preceding text, as additional token of context. foll: string following text, as additional token of context. delim: delim[0] is the symbol we want to replace, delim[1] its replacement. regexes: a list of regex pairs: the context before the found token, and the context after the found token, under which replacing it with delim[1] is permitted. If regexes is nil, always allow the replacement.

# File 'lib/isodoc/l10n.rb', line 113

# Replace occurrences of delim[0] in +text+ with delim[1] wherever the
# surrounding context satisfies one of the +regexes+ pairs. "\0" in the
# replacement stands for the matched token.
#
# @param text [String] text to scan
# @param prev [String] preceding context
# @param foll [String] following context
# @param delim [Array] [delimiter, replacement]
# @param regexes [Array<Array<Regexp>>, nil] permitted contexts
# @return [String] +text+ itself when there is no replacement or no
#   delimiter occurrence, otherwise the rebuilt text
def l10n_gsub(text, prev, foll, delim, regexes)
  replacement = delim[1]
  return text unless replacement

  tokens = l10n_gsub_context(text, prev, foll, delim)
  return text unless tokens

  # First and last tokens are prev/foll context, not part of the text.
  inner = 1...(tokens.size - 1)
  inner.each do |pos|
    next unless l10_context_valid?(tokens, pos, delim, regexes)

    tokens[pos] = replacement.gsub("\\0", tokens[pos]) # Full-width equivalent
  end
  tokens[inner].join
end

#l10n_gsub_context(text, prev, foll, delim) ⇒ `Object`

split string being scanned, and its contextual tokens before and after, into array of tokens determining whether to replace instances of delim

# File 'lib/isodoc/l10n.rb', line 125

# Split +text+ around occurrences of delim[0] (keeping the delimiters as
# tokens) and frame the non-empty tokens with +prev+ and +foll+.
#
# @param text [String] text to split
# @param prev [String] preceding context, becomes the first token
# @param foll [String] following context, becomes the last token
# @param delim [Array] [delimiter, replacement]
# @return [Array<String>, nil] token list, or nil when the split yields a
#   single piece
def l10n_gsub_context(text, prev, foll, delim)
  pattern = delim[0]
  pattern = Regexp.quote(pattern) unless pattern.is_a?(Regexp)
  pieces = text.split(/(#{pattern})/) # delim to replace
  return if pieces.size == 1

  [prev, pieces.reject(&:empty?), foll].flatten
end

#l10n_prep(text, options) ⇒ `Object`

# File 'lib/isodoc/l10n.rb', line 42

# Parse +text+ as an XML fragment and gather what the localisation passes
# need.
#
# @param text [String] text, possibly containing markup
# @param options [Hash] :prev and :foll give optional surrounding context
# @return [Array] [non-empty text nodes, text cache, parsed fragment,
#   options[:prev], options[:foll], indices of nodes inside <esc>]
def l10n_prep(text, options)
  fragment = Nokogiri::XML::DocumentFragment.parse(text)
  nodes = fragment.xpath(".//text()").reject { |node| node.text.empty? }
  prev_context = options[:prev]
  foll_context = options[:foll]
  cache = build_text_cache(nodes, prev_context, foll_context)

  # Text nodes inside <esc> are located once, up front.
  escaped = build_esc_indices(fragment, nodes)

  [nodes, cache, fragment, prev_context, foll_context, escaped]
end

#l10n_zh(text, script, options) ⇒ `Object`

# File 'lib/isodoc/l10n_cjk.rb', line 85

# Apply CJK localisation to every non-escaped text node of +text+.
#
# @param text [String] text, possibly containing markup
# @param script [String, nil] script; "Hans" when nil
# @param options [Hash] passed to #l10n_prep and #l10_zh1
# @return [String] serialised result
def l10n_zh(text, script, options)
  script ||= "Hans"
  nodes, cache, fragment, prev, _foll, escaped = l10n_prep(text, options)
  nodes.each_with_index do |node, pos|
    next if escaped.include?(pos) # Skip escaped nodes

    # Adjust index if prev context was prepended to the cache
    cache_pos = prev ? pos + 1 : pos
    before, after = l10n_context_cached(cache, cache_pos)
    decoded = cleanup_entities(node.text, is_xml: false)
    node.content = l10_zh1(decoded, before, after, script, options)
  end
  to_xml(fragment)
end

#l10n_zh_dash(text, prev, foll) ⇒ `Object`

# File 'lib/isodoc/l10n_cjk.rb', line 119

# Replace en-dashes with the "en-dash" punctuation label in CJK context,
# then with the "number-en-dash" label in numeral context.
#
# @param text [String] text run
# @param prev [String] preceding context
# @param foll [String] following context
# @return [String] text with dashes replaced
def l10n_zh_dash(text, prev, foll)
  passes = [
    ["en-dash", ZH1_DASH, ZH2_DASH],
    ["number-en-dash", ZH1_NUM_DASH, ZH2_NUM_DASH],
  ]
  passes.reduce(text) do |acc, (label, before_re, after_re)|
    l10n_gsub(acc, prev, foll, ["–", @labels.dig("punct", label)],
              [[before_re, after_re]])
  end
end

#l10n_zh_punct(text, prev, foll, options) ⇒ `Object`

# File 'lib/isodoc/l10n_cjk.rb', line 107

# Convert Latin punctuation in +text+ to the CJK forms from the labels.
# Context rules are applied only when options[:proportional_mixed_cjk] is
# set; otherwise every occurrence is converted.
#
# @param text [String] text run
# @param prev [String] preceding context
# @param foll [String] following context
# @param options [Hash] :proportional_mixed_cjk enables contextual conversion
# @return [String] text with punctuation converted
def l10n_zh_punct(text, prev, foll, options)
  # Mapping table is built once per instance and reused.
  @zh_punct_map ||= init_zh_punct_map
  @zh_punct_map.reduce(text) do |acc, (latin, cjk, contexts)|
    contexts = nil unless options[:proportional_mixed_cjk]
    l10n_gsub(acc, prev, foll, [latin, cjk], contexts)
  end
end

#l10n_zh_remove_space(text, prev, foll) ⇒ `Object`

# File 'lib/isodoc/l10n_cjk.rb', line 126

# Adjust whitespace in CJK text: drop spaces between two CJK characters,
# then treat spaces at CJK/Latin boundaries according to whether the labels
# define a "cjk-latin-separator".
#
# @param text [String] text run
# @param prev [String] preceding context
# @param foll [String] following context
# @return [String] text with whitespace adjusted
def l10n_zh_remove_space(text, prev, foll)
  # Whitespace with a CJK character on both sides is removed.
  text = l10n_gsub(text, prev, foll, [/\s+/, ""],
                   [[/(#{ZH_CHAR})$/o, /^#{ZH_CHAR}/o]])
  if sep = @labels.dig("punct", "cjk-latin-separator")
    # A separator is configured: whitespace between CJK and Latin/number
    # text is replaced by it, in both directions.
    # Skip over punctuation to find Latin letters/numbers
    text = l10n_gsub(text, prev, foll, [/\s+/, sep],
                     [[/#{ZH_CHAR}$/o, /^\p{P}*[\p{Latin}\p{N}]/o]])
    l10n_gsub(text, prev, foll, [/\s+/, sep],
              [[/[\p{Latin}\p{N}]\p{P}*$/o, /^#{ZH_NON_PUNCT}/o]])
  else
    # No separator: whitespace after a CJK character is removed when it is
    # followed by a digit, or by a single ASCII letter that is itself
    # followed by a CJK character or the end of the string.
    l10n_gsub(text, prev, foll, [/\s+/, ""],
              [[/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o]])
  end
end

#liquid_init ⇒ `Object`



30
31
32

# File 'lib/isodoc/i18n.rb', line 30

# Register the ::IsoDoc::I18n::Liquid module as a filter on the default
# Liquid environment.
def liquid_init
  ::Liquid::Environment.default.register_filter(::IsoDoc::I18n::Liquid)
end

#load_yaml(lang, script, i18nyaml = nil, i18nhash = nil) ⇒ `Object`

# File 'lib/isodoc/i18n-yaml.rb', line 8

# Load the built-in labels for +lang+/+script+, merge in caller-supplied
# overrides, and post-process the result. When +i18nyaml+ is given it is
# used and +i18nhash+ is ignored.
#
# @param lang [String] language code
# @param script [String, nil] script code
# @param i18nyaml [Object, nil] YAML file path(s) to merge
# @param i18nhash [Hash, nil] hash to deep-merge
# @return [Hash] post-processed labels
def load_yaml(lang, script, i18nyaml = nil, i18nhash = nil)
  base = load_yaml1(lang, script)
  merged =
    if i18nyaml then merge_yaml_files(base, i18nyaml)
    elsif i18nhash then base.deep_merge(i18nhash)
    else base
    end
  postprocess(merged)
end

#load_yaml1(lang, script) ⇒ `Object`

# File 'lib/isodoc/i18n-yaml.rb', line 109

# Pick the label file name for +lang+: Chinese is qualified by script
# ("Hans" when no script is given); every other language is used as is.
#
# @param lang [String] language code
# @param script [String, nil] script code
# @return [Object] result of #load_yaml2
def load_yaml1(lang, script)
  return load_yaml2(lang) unless "zh" == lang

  load_yaml2(script ? "zh-#{script}" : "zh-Hans")
end

#load_yaml2(lang) ⇒ `Object`

locally defined in calling class

# File 'lib/isodoc/i18n-yaml.rb', line 121

# Load the bundled label file for +lang+, falling back to the English file
# if loading raises.
#
# @param lang [String] language tag used in the file name
# @return [Object] parsed YAML content
def load_yaml2(lang)
  here = File.dirname(__FILE__)
  begin
    YAML.load_file(File.join(here, "../isodoc-yaml/i18n-#{lang}.yaml"))
  rescue StandardError
    YAML.load_file(File.join(here, "../isodoc-yaml/i18n-en.yaml"))
  end
end

#merge(new_labels) ⇒ `Object`



137
138
139

# File 'lib/isodoc/i18n-yaml.rb', line 137

# Deep-merge +new_labels+ into the current labels and store the result in
# @labels.
#
# @param new_labels [Hash] labels to merge in
# @return [Object] the merged labels now held in @labels
def merge(new_labels)
  @labels = @labels.deep_merge(new_labels)
end

#merge_yaml_files(ret, i18nyaml) ⇒ `Object`

i18nyaml entries are nominally paths, but callers sometimes pass YAML values (label keys, multi-line prose). Skip anything that plainly isn’t a real file path before touching the filesystem — `File.exist?` raises on strings containing \0 (Windows + Ruby 3.2).

# File 'lib/isodoc/i18n-yaml.rb', line 19

# Deep-merge each usable YAML file named in +i18nyaml+ into +ret+, in order.
# Entries that are nil, empty, contain a NUL or newline, or do not name an
# existing regular file are skipped.
#
# @param ret [Hash] starting labels
# @param i18nyaml [Object] a path or list of paths
# @return [Hash] merged labels (+ret+ itself when nothing was merged)
def merge_yaml_files(ret, i18nyaml)
  Array(i18nyaml).compact.reduce(ret) do |merged, entry|
    candidate = entry.to_s
    not_a_path = candidate.empty? || candidate.include?("\0") ||
      candidate.include?("\n")
    next merged if not_a_path || !File.file?(candidate)

    merged.deep_merge(YAML.load_file(candidate))
  end
end

#month_i18n(val) ⇒ `Object`

# File 'lib/isodoc/date.rb', line 26

# Replace month-name placeholders (flags B = wide, b and h = abbreviated)
# with the names from @cal. English names from @cal_en identify the
# placeholder; the "^" form is replaced by the upper-cased name.
#
# Works on a copy of +val+: the argument is no longer modified in place, so
# frozen strings are accepted (the previous in-place gsub! raised
# FrozenError on them).
#
# @param val [String] string containing the placeholders
# @return [String] new string with month names localised
def month_i18n(val)
  result = val.dup
  { B: :wide, b: :abbreviated, h: :abbreviated }.each do |flag, width|
    @cal_en.calendar_data[:months][:format][width].each do |key, en_name|
      local_name = @cal.calendar_data[:months][:format][width][key]
      result.gsub!(/%\u200c#{flag}<#{en_name}>/, local_name)
      result.gsub!(/%\u200c\^#{flag}<#{en_name}>/, local_name.upcase)
    end
  end
  result
end

#normalise_hash(ret) ⇒ `Object`

# File 'lib/isodoc/i18n-yaml.rb', line 96

# Recursively normalise a label structure: strings are NFC-normalised and
# passed through #cleanup_entities, hashes are updated in place, arrays are
# mapped to new arrays, anything else is returned unchanged.
#
# @param ret [Object] hash, array, string or other value
# @return [Object] normalised value
def normalise_hash(ret)
  if ret.is_a?(Hash)
    ret.keys.each { |key| ret[key] = normalise_hash(ret[key]) }
    ret
  elsif ret.is_a?(Array)
    ret.map { |item| normalise_hash(item) }
  elsif ret.is_a?(String)
    cleanup_entities(ret.unicode_normalize(:nfc))
  else
    ret
  end
end

#ordinal_key(term) ⇒ `Object`

# File 'lib/isodoc/i18n.rb', line 118

# Build the dotted key that selects an ordinal rule set: one component per
# entry of the "ordinal_keys" label, taken from +term+ or, when absent,
# from the INFLECTIONS defaults.
#
# @param term [Hash] term attributes keyed by category name (String)
# @return [String] components joined with "."
def ordinal_key(term)
  parts = @labels["ordinal_keys"].map do |category|
    term[category.to_s] || INFLECTIONS[category.to_sym]
  end
  parts.join(".")
end

#parse_path(path_expr) ⇒ `Object`

# File 'lib/isodoc/i18n-yaml.rb', line 83

# Split a self-reference path such as .key.subkey or ["key"]["subkey"]
# into its segments. Bracketed segments are stripped of surrounding
# whitespace and quotes.
#
# @param path_expr [String] path expression
# @return [Array<String>] path segments
def parse_path(path_expr)
  without_leading_dot = path_expr.sub(/^\./, "")
  # Each match fills exactly one capture: dotted name or bracket content.
  matches = without_leading_dot.scan(/\.?([\w-]+)|\[([^\]]+)\]/)
  matches.map do |dotted, bracketed|
    dotted || bracketed.strip.gsub(/^["']|["']$/, "")
  end
end

#populate(keys, vars = {}) ⇒ `Object`

populate with variables, Liquid, inflections, ordinals/spellout

# File 'lib/isodoc/i18n.rb', line 52

# Render the label found at +keys+ as a Liquid template. The template sees
# +vars+ plus the whole label hash under "labels".
#
# @param keys [String, Array<String>] key or key path into the labels
# @param vars [Hash] template variables
# @return [String] rendered text
def populate(keys, vars = {})
  ::IsoDoc::I18n::Liquid.set(self)
  source = @labels.dig(*Array(keys))
  template = ::Liquid::Template.parse(source)
  template.render(vars.merge("labels" => @labels))
end

#postprocess(labels) ⇒ `Object`



30
31
32

# File 'lib/isodoc/i18n-yaml.rb', line 30

# Normalise the label structure, then resolve self-references within it.
#
# @param labels [Hash] raw labels
# @return [Hash] processed labels
def postprocess(labels)
  normalised = normalise_hash(labels)
  self_reference_resolve(normalised)
end

#resolve_path(path_expr, labels, original_expr) ⇒ `Object`

# File 'lib/isodoc/i18n-yaml.rb', line 60

# Follow +path_expr+ through +labels+ and return the value found, as a
# string. Hash levels are indexed by key, array levels by a non-negative
# integer index within bounds.
#
# @param path_expr [String] path expression, parsed by #parse_path
# @param labels [Hash] structure to navigate
# @param original_expr [String] original reference text, used in errors
# @return [String] the referenced value converted with to_s
# @raise [RuntimeError] when a key is missing, an index is invalid, or the
#   path continues past a value that is neither a Hash nor an Array
def resolve_path(path_expr, labels, original_expr)
  found = parse_path(path_expr).reduce(labels) do |node, segment|
    if node.is_a?(Hash)
      unless node.key?(segment)
        raise "Self-reference error: Path '#{original_expr}' not found - key '#{segment}' does not exist"
      end

      node[segment]
    elsif node.is_a?(Array)
      index = segment.to_i
      unless segment =~ /^\d+$/ && index >= 0 && index < node.length
        raise "Self-reference error: Path '#{original_expr}' not found - invalid array index '#{segment}'"
      end

      node[index]
    else
      raise "Self-reference error: Path '#{original_expr}' not found - cannot navigate through non-collection type"
    end
  end

  found.to_s
end

#resolve_references(obj, labels) ⇒ `Object`

# File 'lib/isodoc/i18n-yaml.rb', line 38

# Recursively resolve self-references in +obj+: hash values and array items
# are processed into new collections, strings go through
# #resolve_string_references, other values are returned unchanged.
#
# @param obj [Object] value to process
# @param labels [Hash] structure that references are resolved against
# @return [Object] value with references resolved
def resolve_references(obj, labels)
  if obj.is_a?(Hash)
    obj.transform_values { |value| resolve_references(value, labels) }
  elsif obj.is_a?(Array)
    obj.map { |element| resolve_references(element, labels) }
  elsif obj.is_a?(String)
    resolve_string_references(obj, labels)
  else
    obj
  end
end

#resolve_string_references(str, labels) ⇒ `Object`

# File 'lib/isodoc/i18n-yaml.rb', line 51

# Replace each embedded self-reference in +str+ with the value it points
# to in +labels+.
#
# @param str [String] string possibly containing references
# @param labels [Hash] structure that references are resolved against
# @return [String] string with references replaced
def resolve_string_references(str, labels)
  # References look like #{self["key"]["subkey"]} or #{self.key.subkey};
  # spaces around the self expression are allowed.
  reference = /\#\{\s*self([^}]+?)\s*\}/
  str.gsub(reference) do |whole|
    resolve_path(Regexp.last_match(1), labels, whole)
  end
end

#self_reference_resolve(labels) ⇒ `Object`



34
35
36

# File 'lib/isodoc/i18n-yaml.rb', line 34

# Resolve self-references in +labels+, using +labels+ itself as the
# structure the references point into.
#
# @param labels [Hash] label structure
# @return [Object] result of #resolve_references
def self_reference_resolve(labels)
  resolve_references(labels, labels)
end

#set(key, val) ⇒ `Object`



133
134
135

# File 'lib/isodoc/i18n-yaml.rb', line 133

# Store +val+ under +key+ in @labels.
#
# @param key [Object] label key
# @param val [Object] label value
# @return [Object] +val+
def set(key, val)
  @labels[key] = val
end

#to_xml(node) ⇒ `Object`

# File 'lib/isodoc/l10n.rb', line 159

# Serialise +node+ as UTF-8 XML without indentation.
#
# @param node [#to_xml, nil] node to serialise
# @return [String, nil] XML text, or nil when +node+ is nil
def to_xml(node)
  return nil if node.nil?

  node.to_xml(encoding: "UTF-8", indent: 0,
              save_with: Nokogiri::XML::Node::SaveOptions::AS_XML)
end

#tw_cldr_lang ⇒ `Object`

# File 'lib/isodoc/i18n.rb', line 124

# Locale symbol to hand to TwitterCldr: Chinese is mapped by script
# (Hans -> zh-cn, Hant -> zh-tw); everything else is @lang as a symbol.
#
# @return [Symbol]
def tw_cldr_lang
  return @lang.to_sym unless @lang == "zh"

  case @script
  when "Hans" then :"zh-cn"
  when "Hant" then :"zh-tw"
  else @lang.to_sym
  end
end

#tw_cldr_localize(num) ⇒ `Object`

# File 'lib/isodoc/i18n.rb', line 100

# Localise +num+ for this instance's TwitterCldr locale, falling back to
# English if that raises.
#
# @param num [#localize] number to localise
# @return [Object] result of num.localize
def tw_cldr_localize(num)
  begin
    num.localize(tw_cldr_lang)
  rescue StandardError
    num.localize(:en)
  end
end

Class: IsoDoc::I18n

Defined Under Namespace

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lang, script, locale: nil, i18nyaml: nil, i18nhash: nil) ⇒ I18n

Instance Attribute Details

#labels ⇒ Object

#lang ⇒ Object (readonly)

#locale ⇒ Object (readonly)

#script ⇒ Object (readonly)

Class Method Details

.cjk_extend(text) ⇒ Object

.l10n(text, lang = @lang, script = @script, options = {}) ⇒ Object

Instance Method Details

#am_pm_i18n(val) ⇒ Object

#bidiwrap(text, lang, script) ⇒ Object

#bidiwrap_vars(lang, script) ⇒ Object

#boolean_conj(list, conn) ⇒ Object

#build_esc_indices(xml, text_nodes) ⇒ Object

#build_text_cache(text_nodes, prev_context = nil, foll_context = nil) ⇒ Object

#calendar_data ⇒ Object

#cjk_extend(title) ⇒ Object

#cleanup_entities(text, is_xml: true) ⇒ Object

#convert_date_format(fmt) ⇒ Object

#date(value, format) ⇒ Object

#date_i18n(val) ⇒ Object

#day_i18n(val) ⇒ Object

#enum_comma ⇒ Object

#get ⇒ Object

#inflect(word, options) ⇒ Object

#inflect_ordinal(num, term, ord_class) ⇒ Object

#init_labels(i18nyaml, i18nhash) ⇒ Object

#init_zh_punct_map ⇒ Object

#interleave_space_cjk?(text) ⇒ Boolean

#l10_context_valid?(context, idx, delim, regex) ⇒ Boolean

#l10_zh1(text, prev, foll, _script, options) ⇒ Object

#l10n(text, lang = @lang, script = @script, options = {}) ⇒ Object

#l10n_context(nodes, idx) ⇒ Object

#l10n_context_cached(text_cache, idx) ⇒ Object

#l10n_context_found_delimiter?(token, delim) ⇒ Boolean

#l10n_fr(text, locale, options) ⇒ Object

#l10n_fr1(text, prev, foll, locale) ⇒ Object

#l10n_gsub(text, prev, foll, delim, regexes) ⇒ Object

#l10n_gsub_context(text, prev, foll, delim) ⇒ Object

#l10n_prep(text, options) ⇒ Object

#l10n_zh(text, script, options) ⇒ Object

#l10n_zh_dash(text, prev, foll) ⇒ Object

#l10n_zh_punct(text, prev, foll, options) ⇒ Object

#l10n_zh_remove_space(text, prev, foll) ⇒ Object

#liquid_init ⇒ Object

#load_yaml(lang, script, i18nyaml = nil, i18nhash = nil) ⇒ Object

#load_yaml1(lang, script) ⇒ Object

#load_yaml2(lang) ⇒ Object

#merge(new_labels) ⇒ Object

#merge_yaml_files(ret, i18nyaml) ⇒ Object

#month_i18n(val) ⇒ Object

#normalise_hash(ret) ⇒ Object

#ordinal_key(term) ⇒ Object

#parse_path(path_expr) ⇒ Object

#populate(keys, vars = {}) ⇒ Object

#postprocess(labels) ⇒ Object

#resolve_path(path_expr, labels, original_expr) ⇒ Object

#resolve_references(obj, labels) ⇒ Object

#resolve_string_references(str, labels) ⇒ Object

#self_reference_resolve(labels) ⇒ Object

#set(key, val) ⇒ Object

#to_xml(node) ⇒ Object

#tw_cldr_lang ⇒ Object

#tw_cldr_localize(num) ⇒ Object

#initialize(lang, script, locale: nil, i18nyaml: nil, i18nhash: nil) ⇒ `I18n`

#labels ⇒ `Object`

#lang ⇒ `Object` (readonly)

#locale ⇒ `Object` (readonly)

#script ⇒ `Object` (readonly)

.cjk_extend(text) ⇒ `Object`

.l10n(text, lang = @lang, script = @script, options = {}) ⇒ `Object`

#am_pm_i18n(val) ⇒ `Object`

#bidiwrap(text, lang, script) ⇒ `Object`

#bidiwrap_vars(lang, script) ⇒ `Object`

#boolean_conj(list, conn) ⇒ `Object`

#build_esc_indices(xml, text_nodes) ⇒ `Object`

#build_text_cache(text_nodes, prev_context = nil, foll_context = nil) ⇒ `Object`

#calendar_data ⇒ `Object`

#cjk_extend(title) ⇒ `Object`

#cleanup_entities(text, is_xml: true) ⇒ `Object`

#convert_date_format(fmt) ⇒ `Object`

#date(value, format) ⇒ `Object`

#date_i18n(val) ⇒ `Object`

#day_i18n(val) ⇒ `Object`

#enum_comma ⇒ `Object`

#get ⇒ `Object`

#inflect(word, options) ⇒ `Object`

#inflect_ordinal(num, term, ord_class) ⇒ `Object`

#init_labels(i18nyaml, i18nhash) ⇒ `Object`

#init_zh_punct_map ⇒ `Object`

#interleave_space_cjk?(text) ⇒ `Boolean`

#l10_context_valid?(context, idx, delim, regex) ⇒ `Boolean`

#l10_zh1(text, prev, foll, _script, options) ⇒ `Object`

#l10n(text, lang = @lang, script = @script, options = {}) ⇒ `Object`

#l10n_context(nodes, idx) ⇒ `Object`

#l10n_context_cached(text_cache, idx) ⇒ `Object`

#l10n_context_found_delimiter?(token, delim) ⇒ `Boolean`

#l10n_fr(text, locale, options) ⇒ `Object`

#l10n_fr1(text, prev, foll, locale) ⇒ `Object`

#l10n_gsub(text, prev, foll, delim, regexes) ⇒ `Object`

#l10n_gsub_context(text, prev, foll, delim) ⇒ `Object`

#l10n_prep(text, options) ⇒ `Object`

#l10n_zh(text, script, options) ⇒ `Object`

#l10n_zh_dash(text, prev, foll) ⇒ `Object`

#l10n_zh_punct(text, prev, foll, options) ⇒ `Object`

#l10n_zh_remove_space(text, prev, foll) ⇒ `Object`

#liquid_init ⇒ `Object`

#load_yaml(lang, script, i18nyaml = nil, i18nhash = nil) ⇒ `Object`

#load_yaml1(lang, script) ⇒ `Object`

#load_yaml2(lang) ⇒ `Object`

#merge(new_labels) ⇒ `Object`

#merge_yaml_files(ret, i18nyaml) ⇒ `Object`

#month_i18n(val) ⇒ `Object`

#normalise_hash(ret) ⇒ `Object`

#ordinal_key(term) ⇒ `Object`

#parse_path(path_expr) ⇒ `Object`

#populate(keys, vars = {}) ⇒ `Object`

#postprocess(labels) ⇒ `Object`

#resolve_path(path_expr, labels, original_expr) ⇒ `Object`

#resolve_references(obj, labels) ⇒ `Object`

#resolve_string_references(str, labels) ⇒ `Object`

#self_reference_resolve(labels) ⇒ `Object`

#set(key, val) ⇒ `Object`

#to_xml(node) ⇒ `Object`

#tw_cldr_lang ⇒ `Object`

#tw_cldr_localize(num) ⇒ `Object`