Class: Glossarist::ReferenceExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/glossarist/reference_extractor.rb

Defined Under Namespace

Classes: IdentifierResolver, Pattern

Constant Summary collapse

LANG_CODES =
Glossarist::LANG_CODES

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.identifier_resolvers ⇒ Object



27
28
29
# File 'lib/glossarist/reference_extractor.rb', line 27

# Returns a copy (+dup+) of the resolver registry held in
# +@identifier_resolvers+ rather than the stored object itself.
# NOTE(review): the registry is presumably an Array — entries are appended
# with +<<+ in register_identifier_resolver — confirm where it is initialised.
def identifier_resolvers
  @identifier_resolvers.dup
end

.patterns ⇒ Object



23
24
25
# File 'lib/glossarist/reference_extractor.rb', line 23

# Returns a copy (+dup+) of the pattern registry held in +@patterns+ rather
# than the stored object itself.
# NOTE(review): the registry is presumably an Array — entries are appended
# with +<<+ in register_pattern — confirm where it is initialised.
def patterns
  @patterns.dup
end

.register_identifier_resolver(prefix, &resolver) ⇒ Object



14
15
16
17
# File 'lib/glossarist/reference_extractor.rb', line 14

# Registers a resolver for identifiers beginning with +prefix+.
#
# The block is captured and stored with the prefix in an IdentifierResolver
# entry. resolve_by_identifier later invokes it as
# +resolver.call(extractor, identifier, display)+.
#
# @param prefix [Object] value stored as the entry's +prefix+
# @return [Object] whatever +@identifier_resolvers <<+ returns
def register_identifier_resolver(prefix, &resolver)
  entry = IdentifierResolver.new(prefix: prefix, resolver: resolver)
  @identifier_resolvers << entry
end

.register_pattern(name:, regex:, &resolver) ⇒ Object



19
20
21
# File 'lib/glossarist/reference_extractor.rb', line 19

# Registers a named text pattern together with its resolver block.
#
# The block is captured and stored in a Pattern entry. extract_from_text scans
# text with +regex+ and invokes the block as
# +resolver.call(extractor, *captures)+ for every match.
#
# @param name [Object] value stored as the entry's +name+
# @param regex [Regexp] expression later handed to +String#scan+
# @return [Object] whatever +@patterns <<+ returns
def register_pattern(name:, regex:, &resolver)
  entry = Pattern.new(name: name, regex: regex, resolver: resolver)
  @patterns << entry
end

Instance Method Details

#extract_all_from_managed_concept(concept) ⇒ Object

Extract all reference types from a managed concept.



220
221
222
223
224
# File 'lib/glossarist/reference_extractor.rb', line 220

# Collects every reference found for a managed concept: the references
# returned by extract_from_managed_concept followed by the asset references
# returned by extract_asset_refs_from_concept.
#
# @param concept [Object] forwarded unchanged to both extractors
# @return [Array] text-derived references first, asset references after
def extract_all_from_managed_concept(concept)
  extract_from_managed_concept(concept) + extract_asset_refs_from_concept(concept)
end

#extract_asset_refs_from_concept(concept) ⇒ Object

Extract asset references from model attributes (NonVerbRep, GraphicalSymbol).



168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/glossarist/reference_extractor.rb', line 168

# Collects asset references from the model attributes of each localization.
#
# Two places are inspected per localization, in this order:
# 1. the +images+ of every NonVerbRep in +non_verb_rep+ (FigureImage entries
#    with a non-blank +src+);
# 2. the +terms+ of +data+ that are Designation::GraphicalSymbol and have a
#    non-blank +image+.
# Paths are stripped of surrounding whitespace before being wrapped.
#
# @param concept [Object] must respond to +localizations+
# @return [Array<AssetReference>] one reference per path found, in visit order
def extract_asset_refs_from_concept(concept)
  concept.localizations.each_with_object([]) do |localization, collected|
    # Images attached to non-verbal representations.
    Array(localization.non_verb_rep).each do |rep|
      next unless rep.is_a?(NonVerbRep)

      Array(rep.images).each do |figure|
        next unless figure.is_a?(FigureImage)

        location = figure.src
        next if location.nil? || location.strip.empty?

        collected << AssetReference.new(path: location.strip)
      end
    end

    # Graphical-symbol designations that carry an image path.
    designations = localization.data&.terms || []
    designations.each do |designation|
      next unless designation.is_a?(Designation::GraphicalSymbol)

      picture = designation.image
      next unless picture
      next if picture.strip.empty?

      collected << AssetReference.new(path: picture.strip)
    end
  end
end

#extract_bib_refs_from_concept(concept) ⇒ Object

Extract bibliographic xrefs from model-level source citations.



194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/glossarist/reference_extractor.rb', line 194

# Builds bibliographic references from the source citations of every
# localization (+l10n.all_sources+, following +origin.ref+).
#
# For each citation reference:
# * a non-blank +source+ yields an anchor equal to that source text (as is);
# * when both +source+ and +id+ are truthy, two further anchors are added:
#   "<source> <id>" and the id on its own (+to_s+).
# Citations without an origin or without a ref are skipped. No deduplication
# is performed here.
#
# @param concept [Object] must respond to +localizations+
# @return [Array<BibliographicReference>] anchors in visit order
def extract_bib_refs_from_concept(concept)
  anchors = []

  concept.localizations.each do |localization|
    localization.all_sources.each do |citation|
      cited = citation.origin
      cited &&= cited.ref
      next unless cited

      document = cited.source
      if document && !document.strip.empty?
        anchors << BibliographicReference.new(anchor: document)
      end

      # The combined and id-only anchors need both parts to be present.
      next unless document && cited.id

      ["#{document} #{cited.id}", cited.id.to_s].each do |anchor|
        anchors << BibliographicReference.new(anchor: anchor)
      end
    end
  end

  anchors
end

#extract_from_concept_hash(concept_hash) ⇒ Object



50
51
52
53
54
55
56
# File 'lib/glossarist/reference_extractor.rb', line 50

# Extracts references from a concept given in Hash form.
#
# Each code in LANG_CODES is looked up in +concept_hash+; values that are
# Hashes are passed to extract_from_localized, anything else is ignored.
# Results follow LANG_CODES order, not the order of the hash keys.
#
# @param concept_hash [#[]] mapping of language code to localized data
# @return [Array] flattened references from all localized hashes
def extract_from_concept_hash(concept_hash)
  LANG_CODES.flat_map do |code|
    localized = concept_hash[code]
    localized.is_a?(Hash) ? extract_from_localized(localized) : []
  end
end

#extract_from_localized(lc_hash) ⇒ Object



46
47
48
# File 'lib/glossarist/reference_extractor.rb', line 46

# Extracts references from every text returned by +gather_texts(lc_hash)+:
# each text is run through extract_from_text and the per-text results are
# flattened into a single list.
# NOTE(review): +lc_hash+ is presumably a localized-concept Hash (that is what
# extract_from_concept_hash passes); gather_texts is not visible here, so the
# fields it reads should be confirmed there.
def extract_from_localized(lc_hash)
  gather_texts(lc_hash).flat_map { |t| extract_from_text(t) }
end

#extract_from_localized_concept(l10n) ⇒ Object



64
65
66
# File 'lib/glossarist/reference_extractor.rb', line 64

# Extracts references from each entry of +l10n.text_content+ via
# extract_from_text and returns them as one flat list.
# Non-String entries contribute nothing, because extract_from_text returns []
# for them.
def extract_from_localized_concept(l10n)
  l10n.text_content.flat_map { |t| extract_from_text(t) }
end

#extract_from_managed_concept(concept) ⇒ Object



58
59
60
61
62
# File 'lib/glossarist/reference_extractor.rb', line 58

# Extracts text-derived references from every localization of a concept.
#
# @param concept [Object] must respond to +localizations+
# @return [Array] the results of extract_from_localized_concept for each
#   localization, flattened one level
def extract_from_managed_concept(concept)
  concept.localizations.flat_map { |localization| extract_from_localized_concept(localization) }
end

#extract_from_text(text) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/glossarist/reference_extractor.rb', line 32

# Runs every registered pattern over +text+ and resolves each match.
#
# For each pattern, +String#scan+ yields either an array of capture groups or,
# for a regex without groups, the matched string; both forms are passed to the
# pattern's resolver as +resolver.call(self, *captures)+. Falsy resolver
# results are dropped and the remainder is passed through +deduplicate+.
#
# @param text [Object] anything that is not a String yields []
# @return [Array] deduplicated references, grouped by pattern order
def extract_from_text(text)
  return [] unless text.is_a?(String)

  resolved = self.class.patterns.flat_map do |matcher|
    text.scan(matcher.regex).map do |match|
      # scan returns a bare String when the regex has no capture groups.
      groups = match.is_a?(Array) ? match : [match]
      matcher.resolver.call(self, *groups)
    end
  end

  deduplicate(resolved.select { |candidate| candidate })
end

#resolve_asciidoc_xref(target) ⇒ Object



226
227
228
# File 'lib/glossarist/reference_extractor.rb', line 226

# Wraps an xref target in a BibliographicReference.
#
# @param target [String] the xref target; surrounding whitespace is removed
# @return [BibliographicReference] reference whose anchor is the stripped target
def resolve_asciidoc_xref(target)
  anchor = target.strip
  BibliographicReference.new(anchor: anchor)
end

#resolve_by_identifier(identifier, display) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/glossarist/reference_extractor.rb', line 83

# Dispatches a mention identifier to the appropriate resolver.
#
# Registered identifier resolvers (built-in + custom) are tried first, in
# registration order; the first one whose prefix the identifier starts with
# handles the mention, and its result is returned even when it is nil.
# Otherwise an identifier made of a digit followed by digits, dots or hyphens
# is treated as a local concept id, and anything else as a designation.
#
# @param identifier [String] the identifier part of the mention
# @param display [String, nil] optional display text
# @return [Object, nil] whatever the selected resolver returns
def resolve_by_identifier(identifier, display)
  handler = self.class.identifier_resolvers.find do |candidate|
    identifier.start_with?(candidate.prefix)
  end
  return handler.resolver.call(self, identifier, display) if handler

  case identifier
  when /\A\d[\d.-]*\z/ then resolve_local(display || identifier, identifier)
  else resolve_designation(identifier, display)
  end
end

#resolve_cite_key(identifier, display) ⇒ Object



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/glossarist/reference_extractor.rb', line 117

# Resolves a +cite:+ mention into a ConceptReference with ref_type "cite".
#
# The +cite:+ prefix is removed and the remainder stripped. A key wrapped in
# double quotes is unquoted, and doubled quotes ("") inside it collapse to a
# single quote character.
#
# @param identifier [String] the mention identifier, e.g. +cite:some-key+
# @param display [String, nil] display text; the key itself is used when nil
# @return [ConceptReference, nil] nil when no key remains, either directly
#   after the prefix or after unquoting
def resolve_cite_key(identifier, display)
  key = identifier.delete_prefix("cite:").strip
  return nil if key.empty?

  if key.length >= 2 && key.start_with?('"') && key.end_with?('"')
    key = key[1..-2].gsub('""', '"')
    # A quoted-but-empty key (cite:"") identifies nothing, just like a bare
    # "cite:"; without this check it would produce a reference with an empty
    # concept_id and term.
    return nil if key.empty?
  end

  ConceptReference.new(
    concept_id: key,
    source: nil,
    term: display || key,
    ref_type: "cite",
  )
end

#resolve_designation(text, display) ⇒ Object



108
109
110
111
112
113
114
115
# File 'lib/glossarist/reference_extractor.rb', line 108

# Builds a designation-type ConceptReference (no concept id, no source).
#
# @param text [String] the designation text from the mention
# @param display [String, nil] display text; takes precedence over +text+
# @return [ConceptReference] reference with ref_type "designation"
def resolve_designation(text, display)
  label = display || text
  ConceptReference.new(ref_type: "designation", term: label, concept_id: nil, source: nil)
end

#resolve_generic_urn(urn, display) ⇒ Object



156
157
158
159
160
161
162
163
# File 'lib/glossarist/reference_extractor.rb', line 156

# Builds a urn-type ConceptReference that keeps the whole URN as its source
# and carries no concept id.
#
# @param urn [String] the URN, stored unchanged as +source+
# @param display [String, nil] display text; an empty string is used when nil
# @return [ConceptReference] reference with ref_type "urn"
def resolve_generic_urn(urn, display)
  label = display || ""
  ConceptReference.new(ref_type: "urn", source: urn, concept_id: nil, term: label)
end

#resolve_iec_urn(urn, display) ⇒ Object



133
134
135
136
137
138
139
140
141
142
# File 'lib/glossarist/reference_extractor.rb', line 133

# Builds a urn-type ConceptReference for an IEC URN.
#
# The concept id comes from +extract_iec_concept_id(urn)+; the source is
# always the fixed string "urn:iec:std:iec:60050".
#
# @param urn [String] the IEC URN from the mention
# @param display [String, nil] display text; an empty string is used when nil
# @return [ConceptReference] reference with ref_type "urn"
def resolve_iec_urn(urn, display)
  ConceptReference.new(
    ref_type: "urn",
    source: "urn:iec:std:iec:60050",
    concept_id: extract_iec_concept_id(urn),
    term: display || "",
  )
end

#resolve_image_ref(path) ⇒ Object



230
231
232
# File 'lib/glossarist/reference_extractor.rb', line 230

# Wraps an image path in an AssetReference.
#
# @param path [String] the image path; surrounding whitespace is removed
# @return [AssetReference] reference whose path is the stripped input
def resolve_image_ref(path)
  location = path.strip
  AssetReference.new(path: location)
end

#resolve_iso_urn(urn, display) ⇒ Object



144
145
146
147
148
149
150
151
152
153
154
# File 'lib/glossarist/reference_extractor.rb', line 144

# Builds a urn-type ConceptReference for an ISO URN of the form
# +urn:iso:std:iso:<digits>[:<tail>]+.
#
# The source is the URN up to and including the standard number; the concept
# id is +extract_term_id_from_urn_tail+ applied to the optional tail (nil when
# the URN has no tail).
#
# @param urn [String] the ISO URN from the mention
# @param display [String, nil] display text; an empty string is used when nil
# @return [ConceptReference, nil] nil when the URN does not match the form
def resolve_iso_urn(urn, display)
  parsed = urn.match(/\Aurn:iso:std:iso:(\d+)(?::(.*))?\z/)
  return nil unless parsed

  standard_number, tail = parsed.captures
  ConceptReference.new(
    term: display || "",
    concept_id: extract_term_id_from_urn_tail(tail),
    source: "urn:iso:std:iso:#{standard_number}",
    ref_type: "urn",
  )
end

#resolve_local(term, concept_id) ⇒ Object



99
100
101
102
103
104
105
106
# File 'lib/glossarist/reference_extractor.rb', line 99

# Builds a local-type ConceptReference (no source) from a term and concept id,
# both stripped of surrounding whitespace.
#
# @param term [String] the term text
# @param concept_id [String] the local concept identifier
# @return [ConceptReference] reference with ref_type "local"
def resolve_local(term, concept_id)
  label = term.strip
  local_id = concept_id.strip
  ConceptReference.new(ref_type: "local", source: nil, concept_id: local_id, term: label)
end

#resolve_mention(content) ⇒ Object

Unified concept mention dispatcher. Content is the text inside {…}.



70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/glossarist/reference_extractor.rb', line 70

# Unified concept mention dispatcher. +content+ is the text inside {…}.
#
# The content is split at the first comma into an identifier and an optional
# display text, each stripped of surrounding whitespace, and handed to
# resolve_by_identifier. Without a comma the whole stripped content is the
# identifier and the display is nil.
#
# @param content [String] raw mention content
# @return [Object, nil] whatever resolve_by_identifier returns
def resolve_mention(content)
  identifier, separator, display = content.strip.partition(",")
  return resolve_by_identifier(identifier, nil) if separator.empty?

  display = display.strip
  # A blank display ("{id,}") must become nil: "" is truthy and would defeat
  # the `display || identifier` fallbacks in the resolvers, producing a
  # reference with an empty term.
  resolve_by_identifier(identifier.strip, display.empty? ? nil : display)
end

#resolve_non_verbal_mention(prefix, identifier, display, ref_class) ⇒ Object

Unified non-verbal entity mention resolver for fig:/table:/formula:. Strips the prefix and produces the appropriate reference type.



236
237
238
239
240
241
# File 'lib/glossarist/reference_extractor.rb', line 236

# Unified non-verbal entity mention resolver for fig:/table:/formula:.
#
# Removes +prefix+ from the start of +identifier+, strips the remainder and
# instantiates +ref_class+ with it.
#
# @param prefix [String] the mention prefix to remove
# @param identifier [String] the full mention identifier
# @param display [Object] passed through unchanged as +display:+
# @param ref_class [Class] must accept +entity_id:+ and +display:+ keywords
# @return [Object, nil] a +ref_class+ instance, or nil when nothing is left
#   after removing the prefix
def resolve_non_verbal_mention(prefix, identifier, display, ref_class)
  entity_id = identifier.delete_prefix(prefix).strip
  entity_id.empty? ? nil : ref_class.new(entity_id: entity_id, display: display)
end