Class: Glossarist::ReferenceExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/glossarist/reference_extractor.rb

Defined Under Namespace

Classes: IdentifierResolver, Pattern

Constant Summary collapse

LANG_CODES =
Glossarist::LANG_CODES

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.identifier_resolvers ⇒ Object



27
28
29
# File 'lib/glossarist/reference_extractor.rb', line 27

# Returns a duplicate (via +dup+) of the collection stored in
# +@identifier_resolvers+, so adding to or removing from the returned
# object does not change the stored collection.
def identifier_resolvers
  snapshot = @identifier_resolvers.dup
  snapshot
end

.patterns ⇒ Object



23
24
25
# File 'lib/glossarist/reference_extractor.rb', line 23

# Returns a duplicate (via +dup+) of the collection stored in +@patterns+,
# so adding to or removing from the returned object does not change the
# stored collection.
def patterns
  snapshot = @patterns.dup
  snapshot
end

.register_identifier_resolver(prefix, &resolver) ⇒ Object



14
15
16
17
# File 'lib/glossarist/reference_extractor.rb', line 14

# Wraps +prefix+ and the given block in an IdentifierResolver and appends
# it to +@identifier_resolvers+.
#
# @param prefix value stored as the resolver's +prefix+
# @param resolver block stored as the resolver's +resolver+
# @return the result of appending to +@identifier_resolvers+
def register_identifier_resolver(prefix, &resolver)
  entry = IdentifierResolver.new(prefix: prefix, resolver: resolver)
  @identifier_resolvers << entry
end

.register_pattern(name:, regex:, &resolver) ⇒ Object



19
20
21
# File 'lib/glossarist/reference_extractor.rb', line 19

# Wraps +name+, +regex+ and the given block in a Pattern and appends it to
# +@patterns+.
#
# @param name value stored as the pattern's +name+
# @param regex value stored as the pattern's +regex+
# @param resolver block stored as the pattern's +resolver+
# @return the result of appending to +@patterns+
def register_pattern(name:, regex:, &resolver)
  entry = Pattern.new(name: name, regex: regex, resolver: resolver)
  @patterns << entry
end

Instance Method Details

#extract_all_from_managed_concept(concept) ⇒ Object

Extract all reference types from a managed concept.



199
200
201
202
203
# File 'lib/glossarist/reference_extractor.rb', line 199

# Combines the references found in a managed concept: the result of
# #extract_from_managed_concept first, followed by the result of
# #extract_asset_refs_from_concept, joined with +.
def extract_all_from_managed_concept(concept)
  extract_from_managed_concept(concept) + extract_asset_refs_from_concept(concept)
end

#extract_asset_refs_from_concept(concept) ⇒ Object

Extract asset references from model attributes (NonVerbRep, GraphicalSymbol).



152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/glossarist/reference_extractor.rb', line 152

# Collects asset references from the model attributes of every
# localization of +concept+.
#
# For each localization, in order:
# * every NonVerbRep in +non_verb_rep+ whose +ref+ is non-blank yields its
#   stripped +ref+;
# * every Designation::GraphicalSymbol in +data.terms+ whose +image+ is
#   non-blank yields its stripped +image+.
#
# @param concept object responding to +localizations+
# @return [Array] one AssetReference per collected path
def extract_asset_refs_from_concept(concept)
  concept.localizations.flat_map do |localization|
    paths = []

    # +non_verb_rep+ may be nil, a single object or a list; Array() copes with all.
    Array(localization.non_verb_rep).each do |rep|
      next unless rep.is_a?(NonVerbRep)

      location = rep.ref
      paths << location.strip if location && !location.strip.empty?
    end

    # +data+ and +data.terms+ may both be absent.
    designations = localization.data&.terms || []
    designations.each do |designation|
      next unless designation.is_a?(Designation::GraphicalSymbol)

      image = designation.image
      paths << image.strip if image && !image.strip.empty?
    end

    paths.map { |path| AssetReference.new(path: path) }
  end
end

#extract_bib_refs_from_concept(concept) ⇒ Object

Extract bibliographic xrefs from model-level source citations.



173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/glossarist/reference_extractor.rb', line 173

# Collects bibliographic references from the source citations of every
# localization of +concept+.
#
# For each entry of +all_sources+ that has an +origin+ with a +ref+:
# * the ref's +source+ is emitted as an anchor when it is non-blank;
# * when both +source+ and +id+ are present, "<source> <id>" and the +id+
#   (as a string) are emitted as two further anchors.
#
# NOTE(review): a blank but non-nil +source+ still produces the combined
# anchor, which then starts with whitespace — confirm this is intended.
#
# @param concept object responding to +localizations+
# @return [Array] one BibliographicReference per collected anchor
def extract_bib_refs_from_concept(concept)
  anchors = []

  concept.localizations.each do |localization|
    localization.all_sources.each do |citation|
      origin = citation.origin
      ref = origin && origin.ref
      next unless ref

      standard = ref.source
      anchors << standard if standard && !standard.strip.empty?

      identifier = ref.id
      next unless standard && identifier

      anchors << "#{standard} #{identifier}" << identifier.to_s
    end
  end

  anchors.map { |anchor| BibliographicReference.new(anchor: anchor) }
end

#extract_from_concept_hash(concept_hash) ⇒ Object



50
51
52
53
54
55
56
# File 'lib/glossarist/reference_extractor.rb', line 50

# Extracts references from a concept given as a hash.
#
# Looks up each code in LANG_CODES in +concept_hash+; every value that is a
# Hash is passed to #extract_from_localized, and the results are
# concatenated in LANG_CODES order. Other values contribute nothing.
def extract_from_concept_hash(concept_hash)
  LANG_CODES.flat_map do |lang|
    localized = concept_hash[lang]
    localized.is_a?(Hash) ? extract_from_localized(localized) : []
  end
end

#extract_from_localized(lc_hash) ⇒ Object



46
47
48
# File 'lib/glossarist/reference_extractor.rb', line 46

# Extracts references from a localized-concept hash: every text returned by
# #gather_texts is run through #extract_from_text and the results are
# concatenated.
def extract_from_localized(lc_hash)
  texts = gather_texts(lc_hash)
  texts.flat_map { |text| extract_from_text(text) }
end

#extract_from_localized_concept(l10n) ⇒ Object



64
65
66
# File 'lib/glossarist/reference_extractor.rb', line 64

# Extracts references from a localized concept: every entry of
# +l10n.text_content+ is run through #extract_from_text and the results are
# concatenated.
def extract_from_localized_concept(l10n)
  texts = l10n.text_content
  texts.flat_map { |text| extract_from_text(text) }
end

#extract_from_managed_concept(concept) ⇒ Object



58
59
60
61
62
# File 'lib/glossarist/reference_extractor.rb', line 58

# Extracts references from every localization of +concept+ by calling
# #extract_from_localized_concept on each and concatenating the results.
def extract_from_managed_concept(concept)
  localizations = concept.localizations
  localizations.flat_map { |localization| extract_from_localized_concept(localization) }
end

#extract_from_text(text) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/glossarist/reference_extractor.rb', line 32

# Extracts references from a single piece of text.
#
# Every pattern returned by the class-level +patterns+ is scanned against
# +text+. Each match is handed to the pattern's resolver as
# (self, *captures); truthy results are collected and the list is passed
# through #deduplicate.
#
# @param text the text to scan; anything that is not a String yields []
# @return the result of #deduplicate, or [] for non-String input
def extract_from_text(text)
  return [] unless text.is_a?(String)

  found = []
  self.class.patterns.each do |pattern|
    resolver = pattern.resolver

    text.scan(pattern.regex).each do |match|
      # String#scan yields a plain String when the regex has no groups and
      # an Array of captures otherwise; normalise to an argument list.
      arguments = match.is_a?(Array) ? match : [match]
      resolved = resolver.call(self, *arguments)
      found.push(resolved) if resolved
    end
  end

  deduplicate(found)
end

#resolve_asciidoc_xref(target) ⇒ Object



205
206
207
# File 'lib/glossarist/reference_extractor.rb', line 205

# Builds a BibliographicReference whose anchor is +target+ with surrounding
# whitespace stripped.
def resolve_asciidoc_xref(target)
  anchor = target.strip
  BibliographicReference.new(anchor: anchor)
end

#resolve_by_identifier(identifier, display) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/glossarist/reference_extractor.rb', line 83

# Resolves a mention's identifier to a reference.
#
# Resolution order:
# 1. the first registered identifier resolver whose prefix +identifier+
#    starts with is called with (self, identifier, display);
# 2. an identifier made of a digit followed by only digits, dots and
#    hyphens goes to #resolve_local, using +display+ (or the identifier
#    itself when +display+ is nil) as the term;
# 3. anything else goes to #resolve_designation.
def resolve_by_identifier(identifier, display)
  # Registered identifier resolvers (built-in + custom) take precedence.
  registered = self.class.identifier_resolvers.find do |candidate|
    identifier.start_with?(candidate.prefix)
  end
  return registered.resolver.call(self, identifier, display) if registered

  if /\A\d[\d.-]*\z/.match?(identifier)
    resolve_local(display || identifier, identifier)
  else
    resolve_designation(identifier, display)
  end
end

#resolve_designation(text, display) ⇒ Object



108
109
110
111
112
113
114
115
# File 'lib/glossarist/reference_extractor.rb', line 108

# Builds a ConceptReference of ref_type "designation" with no concept id
# and no source. The term is +display+ when given, otherwise +text+.
def resolve_designation(text, display)
  label = display || text
  ConceptReference.new(term: label, concept_id: nil, source: nil, ref_type: "designation")
end

#resolve_generic_urn(urn, display) ⇒ Object



140
141
142
143
144
145
146
147
# File 'lib/glossarist/reference_extractor.rb', line 140

# Builds a ConceptReference of ref_type "urn" whose source is the URN
# itself and whose concept id is nil. The term is +display+, or an empty
# string when +display+ is nil.
def resolve_generic_urn(urn, display)
  label = display || ""
  ConceptReference.new(term: label, concept_id: nil, source: urn, ref_type: "urn")
end

#resolve_iec_urn(urn, display) ⇒ Object



117
118
119
120
121
122
123
124
125
126
# File 'lib/glossarist/reference_extractor.rb', line 117

# Builds a ConceptReference of ref_type "urn" for an IEC URN. The concept
# id comes from #extract_iec_concept_id and the source is always
# "urn:iec:std:iec:60050". The term is +display+, or an empty string when
# +display+ is nil.
def resolve_iec_urn(urn, display)
  iec_concept_id = extract_iec_concept_id(urn)
  label = display || ""

  ConceptReference.new(
    term: label,
    concept_id: iec_concept_id,
    source: "urn:iec:std:iec:60050",
    ref_type: "urn",
  )
end

#resolve_image_ref(path) ⇒ Object



209
210
211
# File 'lib/glossarist/reference_extractor.rb', line 209

# Builds an AssetReference whose path is +path+ with surrounding whitespace
# stripped.
def resolve_image_ref(path)
  location = path.strip
  AssetReference.new(path: location)
end

#resolve_iso_urn(urn, display) ⇒ Object



128
129
130
131
132
133
134
135
136
137
138
# File 'lib/glossarist/reference_extractor.rb', line 128

# Builds a ConceptReference of ref_type "urn" for an ISO URN of the form
# "urn:iso:std:iso:<digits>[:<tail>]".
#
# The source is "urn:iso:std:iso:<digits>" and the concept id is whatever
# #extract_term_id_from_urn_tail returns for the tail (nil when the URN has
# no tail). The term is +display+, or an empty string when +display+ is nil.
#
# @return a ConceptReference, or nil when +urn+ does not match that form
def resolve_iso_urn(urn, display)
  parsed = urn.match(/\Aurn:iso:std:iso:(\d+)(?::(.*))?\z/)
  return nil unless parsed

  standard_number, tail = parsed.captures
  term_id = extract_term_id_from_urn_tail(tail)

  ConceptReference.new(
    term: display || "",
    concept_id: term_id,
    source: "urn:iso:std:iso:#{standard_number}",
    ref_type: "urn",
  )
end

#resolve_local(term, concept_id) ⇒ Object



99
100
101
102
103
104
105
106
# File 'lib/glossarist/reference_extractor.rb', line 99

# Builds a ConceptReference of ref_type "local" with no source. Both
# +term+ and +concept_id+ are stripped of surrounding whitespace.
def resolve_local(term, concept_id)
  clean_term = term.strip
  clean_id = concept_id.strip
  ConceptReference.new(term: clean_term, concept_id: clean_id, source: nil, ref_type: "local")
end

#resolve_mention(content) ⇒ Object

Unified concept mention dispatcher. Content is the text inside {…}.



70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/glossarist/reference_extractor.rb', line 70

# Unified concept mention dispatcher. +content+ is the text inside {…}.
#
# The content is stripped. Without a comma, the whole content is the
# identifier and there is no display text. With a comma, the part before
# the first comma is the display text and everything after it is the
# identifier (both stripped). Either way the work is delegated to
# #resolve_by_identifier.
def resolve_mention(content)
  trimmed = content.strip
  return resolve_by_identifier(trimmed, nil) unless trimmed.include?(",")

  # Limit of 2 keeps any further commas inside the identifier part.
  display, identifier = trimmed.split(",", 2).map(&:strip)
  resolve_by_identifier(identifier, display)
end