Class: Glossarist::ReferenceExtractor
- Inherits:
-
Object
- Object
- Glossarist::ReferenceExtractor
show all
- Defined in:
- lib/glossarist/reference_extractor.rb
Defined Under Namespace
Classes: IdentifierResolver, Pattern
Constant Summary
collapse
- LANG_CODES =
Glossarist::LANG_CODES
Class Method Summary
collapse
Instance Method Summary
collapse
-
#extract_all_from_managed_concept(concept) ⇒ Object
Extract all reference types from a managed concept.
-
#extract_asset_refs_from_concept(concept) ⇒ Object
Extract asset references from model attributes (NonVerbRep, GraphicalSymbol).
-
#extract_bib_refs_from_concept(concept) ⇒ Object
Extract bibliographic xrefs from model-level source citations.
-
#extract_from_concept_hash(concept_hash) ⇒ Object
-
#extract_from_localized(lc_hash) ⇒ Object
-
#extract_from_localized_concept(l10n) ⇒ Object
-
#extract_from_managed_concept(concept) ⇒ Object
-
#extract_from_text(text) ⇒ Object
-
#resolve_asciidoc_xref(target) ⇒ Object
-
#resolve_by_identifier(identifier, display) ⇒ Object
-
#resolve_cite_key(identifier, display) ⇒ Object
-
#resolve_designation(text, display) ⇒ Object
-
#resolve_generic_urn(urn, display) ⇒ Object
-
#resolve_iec_urn(urn, display) ⇒ Object
-
#resolve_image_ref(path) ⇒ Object
-
#resolve_iso_urn(urn, display) ⇒ Object
-
#resolve_local(term, concept_id) ⇒ Object
-
#resolve_mention(content) ⇒ Object
Unified concept mention dispatcher.
-
#resolve_non_verbal_mention(prefix, identifier, display, ref_class) ⇒ Object
Unified non-verbal entity mention resolver for fig:/table:/formula:.
Class Method Details
.identifier_resolvers ⇒ Object
27
28
29
|
# File 'lib/glossarist/reference_extractor.rb', line 27
# Returns a shallow copy of the registered identifier resolvers.
#
# A duplicate is handed out so that callers adding to or removing from the
# returned list do not alter the registry held in @identifier_resolvers.
#
# @return [Object] duplicate of the @identifier_resolvers list
def identifier_resolvers
  registry = @identifier_resolvers
  registry.dup
end
|
.patterns ⇒ Object
23
24
25
|
# File 'lib/glossarist/reference_extractor.rb', line 23
# Returns a shallow copy of the registered patterns.
#
# A duplicate is handed out so that callers adding to or removing from the
# returned list do not alter the registry held in @patterns.
#
# @return [Object] duplicate of the @patterns list
def patterns
  registry = @patterns
  registry.dup
end
|
.register_identifier_resolver(prefix, &resolver) ⇒ Object
14
15
16
17
|
# File 'lib/glossarist/reference_extractor.rb', line 14
# Registers a resolver block for identifiers that start with +prefix+.
#
# The prefix and block are wrapped in an IdentifierResolver and appended to
# @identifier_resolvers, so resolvers are kept in registration order.
#
# @param prefix the identifier prefix handled by the block
# @param resolver the block stored as the resolver
# @return the @identifier_resolvers list (result of <<)
def register_identifier_resolver(prefix, &resolver)
  entry = IdentifierResolver.new(prefix: prefix, resolver: resolver)
  @identifier_resolvers << entry
end
|
.register_pattern(name:, regex:, &resolver) ⇒ Object
19
20
21
|
# File 'lib/glossarist/reference_extractor.rb', line 19
# Registers a named regex together with the block that resolves its matches.
#
# The three pieces are wrapped in a Pattern and appended to @patterns, so
# patterns are kept in registration order.
#
# @param name the pattern's name
# @param regex the regular expression stored on the pattern
# @param resolver the block stored as the resolver
# @return the @patterns list (result of <<)
def register_pattern(name:, regex:, &resolver)
  entry = Pattern.new(name: name, regex: regex, resolver: resolver)
  @patterns << entry
end
|
Instance Method Details
#extract_all_from_managed_concept(concept) ⇒ Object
Extract all reference types from a managed concept.
220
221
222
223
224
|
# File 'lib/glossarist/reference_extractor.rb', line 220
# Extract all reference types from a managed concept.
#
# NOTE(review): the method name and both callee names were missing from this
# listing. They are restored from the instance method summary and from the
# local variable names (concept_refs / asset_refs) — confirm against
# lib/glossarist/reference_extractor.rb.
#
# @param concept the managed concept to scan
# @return the concept references followed by the asset references
def extract_all_from_managed_concept(concept)
  concept_refs = extract_from_managed_concept(concept)
  asset_refs = extract_asset_refs_from_concept(concept)
  concept_refs + asset_refs
end
|
#extract_asset_refs_from_concept(concept) ⇒ Object
Extract asset references from model attributes (NonVerbRep, GraphicalSymbol).
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
|
# File 'lib/glossarist/reference_extractor.rb', line 168
# Extract asset references from model attributes (NonVerbRep, GraphicalSymbol).
#
# For every localization this collects, in order:
# 1. the +src+ of each FigureImage held by a NonVerbRep in +non_verb_rep+;
# 2. the +image+ of each Designation::GraphicalSymbol in +data.terms+.
# Paths are stripped; nil or blank paths are skipped.
#
# NOTE(review): the method name was missing from this listing and is restored
# from the instance method summary.
#
# @param concept an object responding to +localizations+
# @return [Array<AssetReference>] one reference per non-blank path found
def extract_asset_refs_from_concept(concept)
  refs = []
  concept.localizations.each do |l10n|
    Array(l10n.non_verb_rep).each do |nvr|
      next unless nvr.is_a?(NonVerbRep)

      Array(nvr.images).each do |image|
        next unless image.is_a?(FigureImage)
        next if image.src.nil?

        path = image.src.strip
        refs << AssetReference.new(path: path) unless path.empty?
      end
    end

    # data may be absent on a localization, hence the safe navigation.
    (l10n.data&.terms || []).each do |term|
      next unless term.is_a?(Designation::GraphicalSymbol)
      next unless term.image

      path = term.image.strip
      refs << AssetReference.new(path: path) unless path.empty?
    end
  end
  refs
end
|
#extract_bib_refs_from_concept(concept) ⇒ Object
Extract bibliographic xrefs from model-level source citations.
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
|
# File 'lib/glossarist/reference_extractor.rb', line 194
# Extract bibliographic xrefs from model-level source citations.
#
# Each source of each localization is inspected through +origin.ref+.
# A ref can contribute up to three anchors, in this order:
# * its +source+ text, when that text is not blank;
# * "<source> <id>", when both +source+ and +id+ are present;
# * the +id+ alone (as a string), under the same condition.
# Sources without an origin or without a ref are skipped.
#
# NOTE(review): the method name was missing from this listing and is restored
# from the instance method summary.
#
# @param concept an object responding to +localizations+
# @return [Array<BibliographicReference>] references in discovery order
def extract_bib_refs_from_concept(concept)
  refs = []
  concept.localizations.each do |l10n|
    l10n.all_sources.each do |source|
      origin = source.origin
      next unless origin

      ref = origin.ref
      next unless ref

      source_text = ref.source
      if source_text && !source_text.strip.empty?
        refs << BibliographicReference.new(anchor: source_text)
      end

      # The combined and id-only anchors need both parts of the citation.
      next unless ref.source && ref.id

      refs << BibliographicReference.new(anchor: "#{ref.source} #{ref.id}")
      refs << BibliographicReference.new(anchor: ref.id.to_s)
    end
  end
  refs
end
|
#extract_from_concept_hash(concept_hash) ⇒ Object
50
51
52
53
54
55
56
|
# File 'lib/glossarist/reference_extractor.rb', line 50
# Collects references from every localized section of a concept hash.
#
# Each code in LANG_CODES is looked up in +concept_hash+; entries that are
# not a Hash contribute nothing, the rest are passed on to
# extract_from_localized and the results are concatenated.
#
# NOTE(review): the method name and the callee name were missing from this
# listing; both are restored from the instance method summary (the callee is
# the only summarized method taking a localized hash) — confirm against
# lib/glossarist/reference_extractor.rb.
#
# @param concept_hash a hash indexed by language code
# @return [Array] the references found across all languages
def extract_from_concept_hash(concept_hash)
  LANG_CODES.flat_map do |lang|
    localized = concept_hash[lang]
    next [] unless localized.is_a?(Hash)

    extract_from_localized(localized)
  end
end
|
#extract_from_localized(lc_hash) ⇒ Object
46
47
48
|
# File 'lib/glossarist/reference_extractor.rb', line 46
# Collects references from the texts of one localized concept hash.
#
# The texts returned by gather_texts(lc_hash) are each scanned and the
# per-text results concatenated.
#
# NOTE(review): the method name and the per-text callee name were missing
# from this listing; both are restored from the instance method summary (the
# callee is the only summarized method taking a single text) — confirm
# against lib/glossarist/reference_extractor.rb.
#
# @param lc_hash the localized concept hash
# @return [Array] the references found in all gathered texts
def extract_from_localized(lc_hash)
  gather_texts(lc_hash).flat_map { |text| extract_from_text(text) }
end
|
#extract_from_localized_concept(l10n) ⇒ Object
64
65
66
|
# File 'lib/glossarist/reference_extractor.rb', line 64
# Collects references from the text content of one localized concept.
#
# Every entry of +l10n.text_content+ is scanned and the per-text results are
# concatenated.
#
# NOTE(review): the method name and the per-text callee name were missing
# from this listing; both are restored from the instance method summary —
# confirm against lib/glossarist/reference_extractor.rb.
#
# @param l10n an object responding to +text_content+
# @return [Array] the references found in all of its texts
def extract_from_localized_concept(l10n)
  l10n.text_content.flat_map { |text| extract_from_text(text) }
end
|
#extract_from_managed_concept(concept) ⇒ Object
58
59
60
61
62
|
# File 'lib/glossarist/reference_extractor.rb', line 58
# Collects references from every localization of a managed concept.
#
# NOTE(review): the method name and the per-localization callee name were
# missing from this listing; both are restored from the instance method
# summary (the callee is the only summarized method taking an l10n) —
# confirm against lib/glossarist/reference_extractor.rb.
#
# @param concept an object responding to +localizations+
# @return [Array] the references found across all localizations
def extract_from_managed_concept(concept)
  concept.localizations.flat_map do |l10n|
    extract_from_localized_concept(l10n)
  end
end
|
#extract_from_text(text) ⇒ Object
32
33
34
35
36
37
38
39
40
41
42
43
44
|
# File 'lib/glossarist/reference_extractor.rb', line 32
# Scans a text with every registered pattern and resolves the matches.
#
# For each pattern from +self.class.patterns+ the text is scanned with the
# pattern's regex; each match is handed to the pattern's resolver as
# (extractor, *captures). Resolver results that are nil/false are dropped
# and the remaining references go through +deduplicate+.
#
# NOTE(review): the method name was missing from this listing and is restored
# from the instance method summary.
#
# @param text the text to scan; anything that is not a String yields []
# @return the deduplicated references
def extract_from_text(text)
  return [] unless text.is_a?(String)

  refs = []
  self.class.patterns.each do |pattern|
    text.scan(pattern.regex).each do |captures|
      # String#scan yields plain strings when the regex has no groups;
      # wrap them so the resolver always receives splatted captures.
      captures = [captures] unless captures.is_a?(Array)
      ref = pattern.resolver.call(self, *captures)
      refs << ref if ref
    end
  end
  deduplicate(refs)
end
|
#resolve_asciidoc_xref(target) ⇒ Object
226
227
228
|
# File 'lib/glossarist/reference_extractor.rb', line 226
# Builds a bibliographic reference for an AsciiDoc xref target.
#
# @param target the xref target; surrounding whitespace is removed
# @return [BibliographicReference] reference anchored at the stripped target
def resolve_asciidoc_xref(target)
  anchor = target.strip
  BibliographicReference.new(anchor: anchor)
end
|
#resolve_by_identifier(identifier, display) ⇒ Object
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
# File 'lib/glossarist/reference_extractor.rb', line 83
# Resolves a mention identifier to a reference.
#
# Resolution order:
# 1. the first registered identifier resolver whose prefix the identifier
#    starts with — its result is returned as-is, even when nil;
# 2. identifiers made of a digit followed by digits, dots or hyphens are
#    treated as local concept ids (the display text, if any, is the term);
# 3. anything else is treated as a designation.
#
# @param identifier the identifier part of the mention
# @param display optional display text, or nil
# @return the reference produced by the selected resolver
def resolve_by_identifier(identifier, display)
  registered = self.class.identifier_resolvers.find do |candidate|
    identifier.start_with?(candidate.prefix)
  end
  return registered.resolver.call(self, identifier, display) if registered

  case identifier
  when /\A\d[\d.-]*\z/ then resolve_local(display || identifier, identifier)
  else resolve_designation(identifier, display)
  end
end
|
#resolve_cite_key(identifier, display) ⇒ Object
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
# File 'lib/glossarist/reference_extractor.rb', line 117
# Resolves a "cite:" mention into a ConceptReference of type "cite".
#
# The "cite:" prefix and surrounding whitespace are removed. A key wrapped
# in double quotes is unquoted, with doubled quotes ("") inside it collapsed
# to a single quote character.
#
# The emptiness check runs after unquoting so that an empty quoted key
# (cite:"") is rejected just like a bare empty key, instead of producing a
# reference with an empty concept_id.
#
# @param identifier the raw identifier, normally starting with "cite:"
# @param display optional display text; the key itself is used when nil
# @return [ConceptReference, nil] the reference, or nil when the key is empty
def resolve_cite_key(identifier, display)
  cleaned = identifier.delete_prefix("cite:").strip
  if cleaned.length >= 2 && cleaned.start_with?('"') && cleaned.end_with?('"')
    cleaned = cleaned[1..-2].gsub('""', '"')
  end
  return nil if cleaned.empty?

  ConceptReference.new(
    concept_id: cleaned,
    source: nil,
    term: display || cleaned,
    ref_type: "cite",
  )
end
|
#resolve_designation(text, display) ⇒ Object
108
109
110
111
112
113
114
115
|
# File 'lib/glossarist/reference_extractor.rb', line 108
# Builds a "designation" ConceptReference with no concept id and no source.
#
# @param text the designation text, used as the term when display is nil
# @param display optional display text that takes precedence as the term
# @return [ConceptReference] the designation reference
def resolve_designation(text, display)
  label = display || text
  ConceptReference.new(term: label, concept_id: nil, source: nil, ref_type: "designation")
end
|
#resolve_generic_urn(urn, display) ⇒ Object
156
157
158
159
160
161
162
163
|
# File 'lib/glossarist/reference_extractor.rb', line 156
# Builds a "urn" ConceptReference whose source is the URN itself.
#
# No concept id is derived; the term is the display text, or an empty string
# when no display text is given.
#
# @param urn the URN, stored unchanged as the source
# @param display optional display text
# @return [ConceptReference] the URN reference
def resolve_generic_urn(urn, display)
  label = display || ""
  ConceptReference.new(term: label, concept_id: nil, source: urn, ref_type: "urn")
end
|
#resolve_iec_urn(urn, display) ⇒ Object
133
134
135
136
137
138
139
140
141
142
|
# File 'lib/glossarist/reference_extractor.rb', line 133
# Builds a "urn" ConceptReference for an IEC URN.
#
# The source is always "urn:iec:std:iec:60050"; the term is the display
# text, or an empty string when no display text is given.
#
# NOTE(review): `(urn)` on the first line looks like a helper call whose
# method name was lost from this listing. As written, concept_id is simply
# the urn itself — confirm the intended callee against
# lib/glossarist/reference_extractor.rb.
def resolve_iec_urn(urn, display)
concept_id = (urn)
ConceptReference.new(
term: display || "",
concept_id: concept_id,
source: "urn:iec:std:iec:60050",
ref_type: "urn",
)
end
|
#resolve_image_ref(path) ⇒ Object
230
231
232
|
# File 'lib/glossarist/reference_extractor.rb', line 230
# Builds an asset reference for an image path.
#
# @param path the image path; surrounding whitespace is removed
# @return [AssetReference] reference to the stripped path
def resolve_image_ref(path)
  cleaned_path = path.strip
  AssetReference.new(path: cleaned_path)
end
|
#resolve_iso_urn(urn, display) ⇒ Object
144
145
146
147
148
149
150
151
152
153
154
|
# File 'lib/glossarist/reference_extractor.rb', line 144
# Builds a "urn" ConceptReference for a URN of the form
# "urn:iso:std:iso:<digits>" optionally followed by ":<rest>".
#
# The source is "urn:iso:std:iso:<digits>"; the term is the display text, or
# an empty string when no display text is given. When the URN does not match
# that form the method returns nil (the `if` has no else branch).
#
# NOTE(review): `(m[2])` looks like a helper call whose method name was lost
# from this listing. As written, term_id is the raw text after the standard
# number (nil when absent) — confirm the intended callee against
# lib/glossarist/reference_extractor.rb.
def resolve_iso_urn(urn, display)
# m[1] is the run of digits after "iso:", m[2] the optional remainder.
if (m = urn.match(/\Aurn:iso:std:iso:(\d+)(?::(.*))?\z/))
term_id = (m[2])
ConceptReference.new(
term: display || "",
concept_id: term_id,
source: "urn:iso:std:iso:#{m[1]}",
ref_type: "urn",
)
end
end
|
#resolve_local(term, concept_id) ⇒ Object
99
100
101
102
103
104
105
106
|
# File 'lib/glossarist/reference_extractor.rb', line 99
# Builds a "local" ConceptReference with no source.
#
# @param term the term text; surrounding whitespace is removed
# @param concept_id the concept id; surrounding whitespace is removed
# @return [ConceptReference] the local reference
def resolve_local(term, concept_id)
  label = term.strip
  local_id = concept_id.strip
  ConceptReference.new(term: label, concept_id: local_id, source: nil, ref_type: "local")
end
|
#resolve_mention(content) ⇒ Object
Unified concept mention dispatcher. Content is the text inside {…}.
70
71
72
73
74
75
76
77
78
79
80
81
|
# File 'lib/glossarist/reference_extractor.rb', line 70
# Unified concept mention dispatcher. Content is the text inside {…}.
#
# The content is stripped and split at its first comma: the part before it
# is the identifier and everything after it is the display text (both
# stripped). Without a comma the whole content is the identifier and the
# display text is nil.
#
# @param content the raw mention text
# @return the result of resolve_by_identifier
def resolve_mention(content)
  identifier, comma, display = content.strip.partition(",")
  return resolve_by_identifier(identifier, nil) if comma.empty?

  resolve_by_identifier(identifier.strip, display.strip)
end
|
#resolve_non_verbal_mention(prefix, identifier, display, ref_class) ⇒ Object
Unified non-verbal entity mention resolver for fig:/table:/formula:. Strips the prefix and produces the appropriate reference type.
236
237
238
239
240
241
|
# File 'lib/glossarist/reference_extractor.rb', line 236
# Unified non-verbal entity mention resolver for fig:/table:/formula:.
#
# Removes +prefix+ from the start of the identifier, strips the remainder and
# wraps it in +ref_class+. Nothing is built when the remainder is empty.
#
# @param prefix the prefix to remove from the identifier
# @param identifier the raw identifier
# @param display the display text passed through to the reference
# @param ref_class class instantiated with entity_id: and display:
# @return an instance of ref_class, or nil when no entity id remains
def resolve_non_verbal_mention(prefix, identifier, display, ref_class)
  entity_id = identifier.delete_prefix(prefix).strip
  entity_id.empty? ? nil : ref_class.new(entity_id: entity_id, display: display)
end
|