Class: Relaton::Bipm::SiBrochureParser
- Inherits:
-
Object
- Object
- Relaton::Bipm::SiBrochureParser
- Defined in:
- lib/relaton/bipm/si_brochure_parser.rb
Class Method Summary collapse
-
.parse(data_fetcher) ⇒ Object
Parse documents from the SI brochure dataset and write them to YAML files.
Instance Method Summary collapse
-
#deep_merge(hash1, hash2) ⇒ Hash
Deep merge two hashes.
-
#downcase_all(content) ⇒ Array, ...
Downcase all values in hash or array.
- #extract_editorialgroup(xml) ⇒ Object
-
#fix_si_brochure_id(item) ⇒ void
Update ID of SI brochure.
- #has_committee_contributor?(item) ⇒ Boolean
-
#initialize(data_fetcher) ⇒ SiBrochureParser
constructor
Create new parser.
-
#parse ⇒ Object
Parse SI brochure documents and write them to YAML files.
- #primary_id(item) ⇒ Object
- #update_id(item) ⇒ Object
Constructor Details
#initialize(data_fetcher) ⇒ SiBrochureParser
Create new parser
11 12 13 |
# File 'lib/relaton/bipm/si_brochure_parser.rb', line 11
#
# Create a new parser bound to a data fetcher.
#
# The fetcher is stored wrapped in a WeakRef rather than referenced directly.
#
# @param data_fetcher [Object] the fetcher driving this parser; #parse reads
#   its +errors+, +output+, +ext+ and +index+ and calls its +write_file+
def initialize(data_fetcher)
  @data_fetcher = WeakRef.new(data_fetcher)
end
Class Method Details
.parse(data_fetcher) ⇒ Object
Parse documents from the SI brochure dataset and write them to YAML files
20 21 22 |
# File 'lib/relaton/bipm/si_brochure_parser.rb', line 20
#
# Class-level shortcut: instantiate a parser for the given fetcher and run it.
#
# @param data_fetcher [Object] handed unchanged to the constructor
# @return [Object] whatever the instance method #parse returns
def self.parse(data_fetcher)
  parser = new(data_fetcher)
  parser.parse
end
Instance Method Details
#deep_merge(hash1, hash2) ⇒ Hash
Deep merge two hashes
146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/relaton/bipm/si_brochure_parser.rb', line 146
#
# Recursively merge two hashes; on conflicts +hash2+ takes precedence.
#
# For a key present in both hashes:
# * if both values are hashes, they are merged recursively;
# * if both values are arrays, they are concatenated (+hash1+'s elements
#   first) and de-duplicated, comparing elements through #downcase_all so
#   that entries differing only in letter case count as duplicates;
# * otherwise the value from +hash2+ is used, unless it is +nil+, in which
#   case the value from +hash1+ is kept.
#
# @param hash1 [Hash] base hash
# @param hash2 [Hash] hash whose values win on conflict
# @return [Hash] a new merged hash; neither argument is modified
def deep_merge(hash1, hash2) # rubocop:disable Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity
  hash1.merge(hash2) do |_key, oldval, newval|
    if oldval.is_a?(Hash) && newval.is_a?(Hash)
      deep_merge(oldval, newval)
    elsif oldval.is_a?(Array) && newval.is_a?(Array)
      (oldval + newval).uniq { |i| downcase_all i }
    else
      # Only nil means "no value". An explicit false in hash2 is real data and
      # must not be replaced by hash1's value (which `newval || oldval` did).
      newval.nil? ? oldval : newval
    end
  end
end
#downcase_all(content) ⇒ Array, ...
Downcase all values in hash or array
165 166 167 168 169 170 171 172 |
# File 'lib/relaton/bipm/si_brochure_parser.rb', line 165
#
# Return a copy of +content+ in which every string value is lower-cased.
#
# Hashes and arrays are walked recursively. Hash keys are left untouched;
# only hash values are transformed. Anything that is not a Hash, Array or
# String is returned as is.
#
# @param content [Hash, Array, String, Object] value to normalise
# @return [Hash, Array, String, Object] lower-cased counterpart of +content+
def downcase_all(content)
  return content.transform_values { |value| downcase_all(value) } if content.is_a?(Hash)
  return content.map { |element| downcase_all(element) } if content.is_a?(Array)
  return content.downcase if content.is_a?(String)

  content
end
#extract_editorialgroup(xml) ⇒ Object
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/relaton/bipm/si_brochure_parser.rb', line 117
#
# Build one contributor per +//editorialgroup/committee+ element of the XML.
#
# Each contributor is an organization named after the committee's +acronym+
# attribute, carrying a single "committee" subdivision whose names come from
# the element's +variant+ children (text, +language+ and +script+
# attributes). The contributor's role has type "author" and the description
# "committee".
#
# @param xml [String] XML source to scan
# @return [Array<Relaton::Bib::Contributor>] one entry per committee element
def extract_editorialgroup(xml)
  Nokogiri::XML(xml).xpath("//editorialgroup/committee").map do |node|
    abbrev = node["acronym"]
    variants = node.xpath("variant").map do |variant|
      Relaton::Bib::TypedLocalizedString.new(
        content: variant.text,
        language: variant["language"],
        script: variant["script"],
      )
    end
    committee = Relaton::Bib::Subdivision.new(
      type: "committee",
      name: variants,
      abbreviation: Relaton::Bib::LocalizedString.new(content: abbrev),
    )
    organization = Relaton::Bib::Organization.new(
      name: [Relaton::Bib::TypedLocalizedString.new(content: abbrev)],
      subdivision: [committee],
    )
    author_role = Relaton::Bib::Contributor::Role.new(
      type: "author",
      description: [Relaton::Bib::LocalizedMarkedUpString.new(content: "committee")],
    )
    Relaton::Bib::Contributor.new(organization: organization, role: [author_role])
  end
end
#fix_si_brochure_id(item) ⇒ void
This method returns an undefined value.
Update ID of SI brochure
81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/relaton/bipm/si_brochure_parser.rb', line 81
#
# Normalise the identifiers of an SI brochure item in place.
#
# Steps:
# 1. #update_id rewrites matching BIPM identifiers and marks them primary.
# 2. The primary identifier (see #primary_id) without its leading "BIPM "
#    becomes the document number: an existing docnumber is rewritten only if
#    it is "Brochure", "Brochure Concise" or "Brochure FAQ" (case-insensitive);
#    a missing docnumber is assigned outright.
# 3. The item id becomes the primary identifier with commas, whitespace and
#    hyphens removed.
#
# @param item [Object] bibliographic item exposing +docidentifier+,
#   +docnumber+, +docnumber=+ and +id=+
# @return [void]
def fix_si_brochure_id(item)
  update_id(item)
  primary = primary_id(item)
  number = primary.sub(/^BIPM\s/, "")

  if item.docnumber
    # Mutates the existing docnumber string; no-op when the pattern does not match.
    item.docnumber.sub!(/^Brochure(?:\sConcise|\sFAQ)?$/i, number)
  else
    item.docnumber = number
  end

  item.id = primary.gsub(/[,\s-]/, "")
end
#has_committee_contributor?(item) ⇒ Boolean
111 112 113 114 115 |
# File 'lib/relaton/bipm/si_brochure_parser.rb', line 111
#
# Tell whether the item already has a contributor with a "committee" role.
#
# A contributor qualifies when any of its roles has a description whose
# +content+ equals "committee".
#
# @param item [Object] bibliographic item exposing +contributor+
# @return [Boolean] true if at least one such contributor exists
def has_committee_contributor?(item)
  item.contributor.each do |contributor|
    contributor.role.each do |role|
      return true if role.description.any? { |text| text.content == "committee" }
    end
  end
  false
end
#parse ⇒ Object
Parse SI brochure documents and write them to YAML files
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/relaton/bipm/si_brochure_parser.rb', line 27
#
# Process every +.rxl+ file under +bipm-si-brochure/_site/documents+.
#
# For each file the XML is parsed into an item, committee contributors are
# added from the XML's editorial group when the item has none, identifiers
# are normalised, the item is registered in the fetcher's index and finally
# written through the fetcher. If the output file already exists, the new
# item and the stored one are converted to hashes and combined with
# #deep_merge (new item first, stored item second) before writing.
#
# @return [Array<String>] the list of processed paths (result of +Dir[]#each+)
def parse # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  Dir["bipm-si-brochure/_site/documents/*.rxl"].each do |path|
    puts "Parsing #{path}"
    source = File.read(path, encoding: "UTF-8")
    source = source.force_encoding("UTF-8") unless source.encoding == Encoding::UTF_8
    parsed = Bibdata.from_xml(source)

    # Workaround for a relaton-bib Version#content bug: whitespace between
    # legacy <revision-date>/<draft> children ends up stored in @content and
    # blocks the legacy-fold path. Resetting it lets the getter recompute.
    parsed.version.each do |version|
      raw = version.instance_variable_get(:@content)
      blank_string = raw.is_a?(String) && raw.strip.empty?
      version.instance_variable_set(:@content, nil) if raw.is_a?(Array) || blank_string
    end

    # `&&=` only touches flags that are still truthy.
    @data_fetcher.errors[:si_brochure_title] &&= parsed.title.empty?
    @data_fetcher.errors[:si_brochure_docidentifier] &&= parsed.docidentifier.empty?

    unless has_committee_contributor?(parsed)
      extract_editorialgroup(source).each { |contributor| parsed.contributor << contributor }
    end
    fix_si_brochure_id(parsed)

    # English and French sources ("-en"/"-fr" suffix) share one output file.
    basename = File.join(@data_fetcher.output, File.basename(path).sub(/(?:-(?:en|fr))?\.rxl$/, ""))
    outfile = "#{basename}.#{@data_fetcher.ext}"
    index_key = parsed.docnumber || basename
    @data_fetcher.index.add_or_update Id.new.parse(index_key).to_hash, outfile

    already_written = File.exist?(outfile)
    item = parsed
    if already_written
      stored = Item.from_yaml(File.read(outfile, encoding: "UTF-8"))
      fix_si_brochure_id(stored)
      fresh_hash = YAML.safe_load(parsed.to_yaml)
      stored_hash = YAML.safe_load(stored.to_yaml)
      item = Item.from_yaml(deep_merge(fresh_hash, stored_hash).to_yaml)
    end

    # An existing file is an expected merge target, so the duplicate warning
    # is requested only when the file did not exist before this iteration.
    @data_fetcher.write_file outfile, item, warn_duplicate: !already_written
    puts "Saved to #{outfile}"
  end
end
#primary_id(item) ⇒ Object
105 106 107 108 109 |
# File 'lib/relaton/bipm/si_brochure_parser.rb', line 105
#
# Return the content of the item's primary identifier.
#
# The first identifier that is flagged +primary+ and whose language is either
# unset or "en" is used.
#
# @param item [Object] bibliographic item exposing +docidentifier+
# @return [String] content of the selected identifier
# @raise [NoMethodError] if no identifier satisfies the condition
def primary_id(item)
  chosen = item.docidentifier.find do |identifier|
    identifier.primary && (identifier.language.nil? || identifier.language == "en")
  end
  chosen.content
end
#update_id(item) ⇒ Object
96 97 98 99 100 101 102 103 |
# File 'lib/relaton/bipm/si_brochure_parser.rb', line 96
#
# Rewrite "BIPM Brochure ..." identifiers to "BIPM SI Brochure ..." in place.
#
# Only identifiers of type "BIPM" whose content contains "BIPM Brochure"
# (case-insensitive) are touched: they are flagged as primary and "SI " is
# inserted before the "Brochure" that directly follows a leading "BIPM ".
# The content string itself is mutated.
#
# @param item [Object] bibliographic item exposing +docidentifier+
# @return [Object] the item's +docidentifier+ collection (result of +each+)
def update_id(item)
  item.docidentifier.each do |identifier|
    if identifier.type == "BIPM" && identifier.content&.match?(/BIPM Brochure/i)
      identifier.primary = true
      identifier.content.sub!(/(?<=^BIPM\s)(Brochure)/i, "SI \\1")
    end
  end
end