Class: Glossarist::ReferenceExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/glossarist/reference_extractor.rb

Defined Under Namespace

Classes: IdentifierResolver, Pattern

Constant Summary collapse

LANG_CODES =
Glossarist::LANG_CODES

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.identifier_resolvers ⇒ Object



27
28
29
# File 'lib/glossarist/reference_extractor.rb', line 27

def identifier_resolvers
  @identifier_resolvers.dup
end

.patterns ⇒ Object



23
24
25
# File 'lib/glossarist/reference_extractor.rb', line 23

def patterns
  @patterns.dup
end

.register_identifier_resolver(prefix, &resolver) ⇒ Object



14
15
16
17
# File 'lib/glossarist/reference_extractor.rb', line 14

def register_identifier_resolver(prefix, &resolver)
  @identifier_resolvers << IdentifierResolver.new(prefix: prefix,
                                                  resolver: resolver)
end

.register_pattern(name:, regex:, &resolver) ⇒ Object



19
20
21
# File 'lib/glossarist/reference_extractor.rb', line 19

def register_pattern(name:, regex:, &resolver)
  @patterns << Pattern.new(name: name, regex: regex, resolver: resolver)
end

Instance Method Details

#extract_from_concept_hash(concept_hash) ⇒ Object



50
51
52
53
54
55
56
# File 'lib/glossarist/reference_extractor.rb', line 50

def extract_from_concept_hash(concept_hash)
  LANG_CODES.flat_map do |lang|
    next [] unless concept_hash[lang].is_a?(Hash)

    extract_from_localized(concept_hash[lang])
  end
end

#extract_from_localized(lc_hash) ⇒ Object



46
47
48
# File 'lib/glossarist/reference_extractor.rb', line 46

def extract_from_localized(lc_hash)
  gather_texts(lc_hash).flat_map { |t| extract_from_text(t) }
end

#extract_from_localized_concept(l10n) ⇒ Object



64
65
66
67
68
69
70
# File 'lib/glossarist/reference_extractor.rb', line 64

def extract_from_localized_concept(l10n)
  texts = []
  l10n.data.definition&.each { |d| texts << d.content if d.content }
  l10n.data.notes&.each { |n| texts << n.content if n.content }
  l10n.data.examples&.each { |e| texts << e.content if e.content }
  texts.flat_map { |t| extract_from_text(t) }
end

#extract_from_managed_concept(concept) ⇒ Object



58
59
60
61
62
# File 'lib/glossarist/reference_extractor.rb', line 58

def extract_from_managed_concept(concept)
  concept.localizations.flat_map do |l10n|
    extract_from_localized_concept(l10n)
  end
end

#extract_from_text(text) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/glossarist/reference_extractor.rb', line 32

def extract_from_text(text)
  return [] unless text.is_a?(String)

  refs = []
  self.class.patterns.each do |pattern|
    text.scan(pattern.regex).each do |captures|
      captures = [captures] unless captures.is_a?(Array)
      ref = pattern.resolver.call(self, *captures)
      refs << ref if ref
    end
  end
  deduplicate(refs)
end

#resolve_by_identifier(identifier, display) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/glossarist/reference_extractor.rb', line 87

def resolve_by_identifier(identifier, display)
  # Check registered identifier resolvers (built-in + custom)
  self.class.identifier_resolvers.each do |ir|
    next unless identifier.start_with?(ir.prefix)

    return ir.resolver.call(self, identifier, display)
  end

  case identifier
  when /\A\d[\d.-]*\z/
    resolve_local(display || identifier, identifier)
  else
    resolve_designation(identifier, display)
  end
end

#resolve_designation(text, display) ⇒ Object



112
113
114
115
116
117
118
119
# File 'lib/glossarist/reference_extractor.rb', line 112

def resolve_designation(text, display)
  ConceptReference.new(
    term: display || text,
    concept_id: nil,
    source: nil,
    ref_type: "designation",
  )
end

#resolve_generic_urn(urn, display) ⇒ Object



144
145
146
147
148
149
150
151
# File 'lib/glossarist/reference_extractor.rb', line 144

def resolve_generic_urn(urn, display)
  ConceptReference.new(
    term: display || "",
    concept_id: nil,
    source: urn,
    ref_type: "urn",
  )
end

#resolve_iec_urn(urn, display) ⇒ Object



121
122
123
124
125
126
127
128
129
130
# File 'lib/glossarist/reference_extractor.rb', line 121

def resolve_iec_urn(urn, display)
  concept_id = extract_iec_concept_id(urn)

  ConceptReference.new(
    term: display || "",
    concept_id: concept_id,
    source: "urn:iec:std:iec:60050",
    ref_type: "urn",
  )
end

#resolve_iso_urn(urn, display) ⇒ Object



132
133
134
135
136
137
138
139
140
141
142
# File 'lib/glossarist/reference_extractor.rb', line 132

def resolve_iso_urn(urn, display)
  if (m = urn.match(/\Aurn:iso:std:iso:(\d+)(?::(.*))?\z/))
    term_id = extract_term_id_from_urn_tail(m[2])
    ConceptReference.new(
      term: display || "",
      concept_id: term_id,
      source: "urn:iso:std:iso:#{m[1]}",
      ref_type: "urn",
    )
  end
end

#resolve_local(term, concept_id) ⇒ Object



103
104
105
106
107
108
109
110
# File 'lib/glossarist/reference_extractor.rb', line 103

def resolve_local(term, concept_id)
  ConceptReference.new(
    term: term.strip,
    concept_id: concept_id.strip,
    source: nil,
    ref_type: "local",
  )
end

#resolve_mention(content) ⇒ Object

Unified concept mention dispatcher. Content is the text inside {…}.



74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/glossarist/reference_extractor.rb', line 74

def resolve_mention(content)
  content = content.strip

  if content.include?(",")
    parts = content.split(",", 2)
    display = parts[0].strip
    identifier = parts[1].strip
    resolve_by_identifier(identifier, display)
  else
    resolve_by_identifier(content, nil)
  end
end