Class: Relaton::Oasis::DataParser
- Inherits:
-
Object
- Object
- Relaton::Oasis::DataParser
- Includes:
- DataParserUtils
- Defined in:
- lib/relaton/oasis/data_parser.rb
Overview
Parser for OASIS document.
Instance Method Summary collapse
-
#document_part_refs ⇒ Array<String>
Look for “Cite as” references.
-
#initialize(node, errors = {}) ⇒ DataParser
constructor
Initialize parser.
- #link_node ⇒ Object
- #links ⇒ Object
-
#parse ⇒ ItemData
Parse document.
-
#parse_abstract ⇒ Array<Bib::LocalizedMarkedUpString>
Parse abstract.
-
#parse_authorizer ⇒ Object
Parse authorizing committees as contributors.
-
#parse_date ⇒ Array<Bib::Date>
Parse date.
-
#parse_docnumber ⇒ String
Parse document number.
-
#parse_editorialgroup_contributor ⇒ Array<Bib::Contributor>
Parse editorial group as contributors.
-
#parse_link ⇒ Object
Parse document source links.
-
#parse_relation ⇒ Array<Bib::Relation>
Parse relation.
-
#parse_technology_area ⇒ Array<String>
Parse technology areas.
-
#parse_title ⇒ Array<Bib::Title>
Parse title.
- #parts ⇒ Object
-
#parts_to_docid(parts) ⇒ String
Create document identifier from parts references.
- #text ⇒ Object
- #title ⇒ Object
-
#title_to_docid(title) ⇒ String
Create document identifier from title.
Methods included from DataParserUtils
#create_contribution_info, #create_ext, #create_person, #decode_cf_email, #page, #parse_chairs, #parse_contributor, #parse_docid, #parse_doctype, #parse_editors, #parse_editors_from_text, #parse_errata, #parse_part, #parse_spec, #person_affiliation, #person_email, #publisher_oasis, #retry_page
Constructor Details
#initialize(node, errors = {}) ⇒ DataParser
Initialize parser.
12 13 14 15 |
# File 'lib/relaton/oasis/data_parser.rb', line 12

# Initialize parser.
#
# @param node [Object] document node the parser reads from (kept in @node)
# @param errors [Hash] flags hash kept in @errors; the parse_* methods
#   update entries that are already truthy
def initialize(node, errors = {})
  @node, @errors = node, errors
end
Instance Method Details
#document_part_refs ⇒ Array<String>
Look for “Cite as” references.
177 178 179 180 181 182 |
# File 'lib/relaton/oasis/data_parser.rb', line 177

# Look for "Cite as" references.
#
# Collects the text of every element matched by the selectors below, removes
# one or two leading "[" and one trailing "]", and strips whitespace.
#
# @return [Array<String>] cleaned reference labels
def document_part_refs
  selectors = [
    ".standard__grid--cite-as > p > strong",
    "span.Refterm",
    "span.abbrev",
    "span.citationLabel > strong",
  ]
  @node.css(*selectors).map do |ref|
    ref.text.gsub(/^\[{1,2}|\]$/, "").strip
  end
end
#link_node ⇒ Object
142 143 144 145 146 147 |
# File 'lib/relaton/oasis/data_parser.rb', line 142

# First anchor inside a "cite as" paragraph that contains a <strong>
# (directly or inside a <span>). The lookup result is memoized in @link_node.
#
# @return [Object, nil] whatever @node.at returns for the XPath
def link_node
  @link_node ||= @node.at(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]" \
    "/p[strong or span/strong]/a",
  )
end
#links ⇒ Object
207 208 209 210 211 |
# File 'lib/relaton/oasis/data_parser.rb', line 207

# Anchors with an href taken from the first paragraph of the first inner
# <div>; when that paragraph has none, the second paragraph is used instead.
#
# @return [Object] result of @node.xpath (possibly empty)
def links
  primary = @node.xpath("./div/div/div[1]/p[1]/a[@href]")
  return primary unless primary.empty?

  @node.xpath("./div/div/div[1]/p[2]/a[@href]")
end
#parse ⇒ ItemData
Parse document.
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/relaton/oasis/data_parser.rb', line 33

# Parse document.
#
# Runs every field parser in the order listed and hands the results to
# ItemData.new as keyword arguments.
#
# @return [ItemData] bibliographic item built from @node
def parse # rubocop:disable Metrics/MethodLength
  attributes = {
    type: "standard",
    title: parse_title,
    docidentifier: parse_docid,
    source: parse_link,
    docnumber: parse_docnumber,
    date: parse_date,
    contributor: parse_contributor,
    abstract: parse_abstract,
    language: ["en"],
    script: ["Latn"],
    relation: parse_relation,
    ext: create_ext,
  }
  ItemData.new(**attributes)
end
#parse_abstract ⇒ Array<Bib::LocalizedMarkedUpString>
Parse abstract.
83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/relaton/oasis/data_parser.rb', line 83

# Parse abstract.
#
# Joins the description paragraphs with newlines after collapsing runs of
# newlines/tabs inside each paragraph to a single space.
#
# @return [Array<Bib::LocalizedMarkedUpString>] one abstract, or an empty
#   array when the joined text is empty
def parse_abstract
  paragraphs = @node.xpath(
    "./summary/div/div[@class='standard__description']/p",
  )
  content = paragraphs.map { |para| para.text.gsub(/[\n\t]+/, " ").strip }.join("\n")
  abstracts = []
  unless content.empty?
    abstracts << Bib::Abstract.new(content: content, language: "en", script: "Latn")
  end
  # Only an already-truthy flag is rewritten; absent/false entries are left alone.
  @errors[:abstract] &&= abstracts.empty?
  abstracts
end
#parse_authorizer ⇒ Object
Parse authorizing committees as contributors.
127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
# File 'lib/relaton/oasis/data_parser.rb', line 127

# Parse authorizing committees as contributors.
#
# Every anchor under the "standard__details" <div> becomes a contributor
# whose organization carries the anchor text as name and the anchor href as
# URI, with an "authorizer" role described as "Committee".
#
# @return [Array<Bib::Contributor>] one contributor per anchor (may be empty)
def parse_authorizer # rubocop:disable Metrics/MethodLength
  result = @node.xpath("./div[@class='standard__details']/a").map do |a|
    org = Bib::Organization.new(
      name: [Bib::TypedLocalizedString.new(content: a.text.strip)],
      uri: [Bib::Uri.new(type: "uri", content: a[:href])],
    )
    desc = [Bib::LocalizedMarkedUpString.new(content: "Committee")]
    role = Bib::Contributor::Role.new(type: "authorizer", description: desc)
    Bib::Contributor.new(organization: org, role: [role])
  end
  # Only an already-truthy flag is rewritten; absent/false entries are left alone.
  @errors[:authorizer] &&= result.empty?
  result
end
#parse_date ⇒ Array<Bib::Date>
Parse date.
67 68 69 70 71 72 73 74 75 76 |
# File 'lib/relaton/oasis/data_parser.rb', line 67

# Parse date.
#
# Extracts a "DD Month YYYY" style substring from each date element and
# converts it with Date.parse. Date.parse raises when an element has no such
# substring (it is then given an empty string).
#
# @return [Array<Bib::Date>] "issued" dates, one per matched element
def parse_date
  time_nodes = @node.xpath("./summary/div/time[@class='standard__date']")
  dates = time_nodes.map do |time_node|
    issued = Date.parse(time_node.text[/\d{2}\s\w+\s\d{4}/].to_s)
    Bib::Date.new(at: issued.to_s, type: "issued")
  end
  # Only an already-truthy flag is rewritten; absent/false entries are left alone.
  @errors[:date] &&= dates.empty?
  dates
end
#parse_docnumber ⇒ String
Parse document number.
218 219 220 221 222 223 224 225 226 227 228 229 |
# File 'lib/relaton/oasis/data_parser.rb', line 218

# Parse document number.
#
# The strategy depends on how many "Cite as" references were found:
# none - derive the number from the <h2> title text;
# one  - use that reference;
# more - derive it from all references via parts_to_docid.
#
# @return [String] document number
def parse_docnumber
  refs = document_part_refs
  docnumber =
    if refs.empty?
      heading = @node.at("./summary/div/h2").text
      parse_spec(title_to_docid(heading))
    elsif refs.size == 1
      parse_part(parse_spec(refs[0]))
    else
      parts_to_docid(refs)
    end
  # Only an already-truthy flag is rewritten; absent/false entries are left alone.
  @errors[:docnumber] &&= docnumber.nil?
  docnumber
end
#parse_editorialgroup_contributor ⇒ Array<Bib::Contributor>
Parse editorial group as contributors.
103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/relaton/oasis/data_parser.rb', line 103

# Parse editorial group as contributors.
#
# All anchors under the "standard__details" <div> become
# "technical-committee" subdivisions of a single "OASIS" organization, which
# is returned as one contributor with an "author" role.
#
# @return [Array<Bib::Contributor>] one contributor, or an empty array when
#   no anchors are found
def parse_editorialgroup_contributor # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  committees = @node.xpath("./div[@class='standard__details']/a")
  contributors = []
  unless committees.empty?
    subdivisions = committees.map do |anchor|
      Bib::Subdivision.new(
        type: "technical-committee",
        name: [Bib::TypedLocalizedString.new(content: anchor.text.strip)],
      )
    end
    oasis = Bib::Organization.new(
      name: [Bib::TypedLocalizedString.new(content: "OASIS")],
      subdivision: subdivisions,
    )
    role = Bib::Contributor::Role.new(
      type: "author",
      description: [Bib::LocalizedMarkedUpString.new(content: "committee")],
    )
    contributors << Bib::Contributor.new(organization: oasis, role: [role])
  end
  # Only an already-truthy flag is rewritten; absent/false entries are left alone.
  @errors[:editorialgroup_contributor] &&= contributors.empty?
  contributors
end
#parse_link ⇒ Object
Parse document source links.
184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
# File 'lib/relaton/oasis/data_parser.rb', line 184

# Parse document source links.
#
# Returns no links when the document has more than one part. Otherwise each
# anchor from #links becomes a URI whose type is the href's file extension
# ("src" when there is none), with "docx" rewritten to "doc" and "html"
# rewritten to "src".
#
# @return [Array<Bib::Uri>] typed source URIs (may be empty)
def parse_link # rubocop:disable Metrics/MethodLength
  result = if parts.size > 1
             []
           else
             links.map do |l|
               href = l[:href]
               ext = href[/\.(\w+)$/, 1] || "src"
               # Non-mutating substitutions: the "src" fallback is a string
               # literal, which must not be modified in place when string
               # literals are frozen.
               type = ext.sub("docx", "doc").sub("html", "src")
               Bib::Uri.new(type: type, content: href)
             end
           end
  # Only an already-truthy flag is rewritten; absent/false entries are left alone.
  @errors[:link] &&= result.empty?
  result
end
#parse_relation ⇒ Array<Bib::Relation>
Parse relation.
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# File 'lib/relaton/oasis/data_parser.rb', line 154

# Parse relation.
#
# When more than one "cite as" paragraph is found, each becomes a "hasPart"
# relation whose bibitem carries the first identifier produced by
# DataPartParser#parse_docid as its formatted reference.
#
# @return [Array<Bib::Relation>] relations, empty unless there are 2+ parts
def parse_relation # rubocop:disable Metrics/MethodLength
  part_nodes = @node.xpath(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]" \
    "/p[strong or span/strong or b/span]",
  )
  relations =
    if part_nodes.size > 1
      part_nodes.map do |part_node|
        docid = DataPartParser.new(part_node).parse_docid
        ref = Bib::Formattedref.new(content: docid[0].content)
        Bib::Relation.new(type: "hasPart", bibitem: ItemData.new(formattedref: ref))
      end
    else
      []
    end
  # Only an already-truthy flag is rewritten; absent/false entries are left alone.
  @errors[:relation] &&= relations.empty?
  relations
end
#parse_technology_area ⇒ Array<String>
Parse technology areas.
297 298 299 300 301 |
# File 'lib/relaton/oasis/data_parser.rb', line 297

# Parse technology areas.
#
# Delegates to the inherited implementation, passing @node, then records in
# @errors whether the result was empty.
#
# @return [Array<String>] technology areas
def parse_technology_area
  super(@node).tap do |areas|
    # Only an already-truthy flag is rewritten; absent/false entries are left alone.
    @errors[:technology_area] &&= areas.empty?
  end
end
#parse_title ⇒ Array<Bib::Title>
Parse title.
55 56 57 58 59 60 |
# File 'lib/relaton/oasis/data_parser.rb', line 55

# Parse title.
#
# @return [Array<Bib::Title>] a single "main" title in English/Latin script
def parse_title
  main_title = Bib::Title.new(
    type: "main", content: title, language: "en", script: "Latn",
  )
  titles = [main_title]
  # Only an already-truthy flag is rewritten; absent/false entries are left alone.
  @errors[:title] &&= titles.empty?
  titles
end
#parts ⇒ Object
200 201 202 203 204 205 |
# File 'lib/relaton/oasis/data_parser.rb', line 200

# "Cite as" paragraphs that contain a <strong> (directly or inside a
# <span>). The lookup result is memoized in @parts.
#
# @return [Object] result of @node.xpath
def parts
  @parts ||= @node.xpath(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]" \
    "/p[strong or span/strong]",
  )
end
#parts_to_docid(parts) ⇒ String
Create document identifier from parts references.
238 239 240 241 242 243 244 245 246 247 248 249 |
# File 'lib/relaton/oasis/data_parser.rb', line 238

# Create document identifier from parts references.
#
# Starts from the "-"-separated chunks of the first reference and, for each
# further reference, cuts the kept chunks at the first position where the
# two differ (compared case-insensitively). The remaining chunks are joined
# with "-" and passed through parse_spec and parse_part.
#
# @param parts [Array<String>] parts references
# @return [String] document identifier
def parts_to_docid(parts) # rubocop:disable Metrics/AbcSize
  common = parts[0].split("-")
  parts[1..].each do |ref|
    ref.split("-").each_with_index do |chunk, idx|
      next if chunk.casecmp(common[idx])&.zero?

      # First differing chunk: drop it and everything after, then move on
      # to the next reference.
      common.slice!(idx..-1)
      break
    end
  end
  parse_part(parse_spec(common.join("-")))
end
#text ⇒ Object
21 22 23 24 25 26 |
# File 'lib/relaton/oasis/data_parser.rb', line 21

# Stripped text of the first "cite as" paragraph that contains an <em>, <i>,
# <a> or <span>. The value is memoized in @text.
#
# @return [String, nil] paragraph text, or nil when no paragraph is found
def text
  @text ||= begin
    paragraph = @node.at(
      "./div/div/div[contains(@class, 'standard__grid--cite-as')]" \
      "/p[em or i or a or span]",
    )
    paragraph&.text&.strip
  end
end
#title ⇒ Object
17 18 19 |
# File 'lib/relaton/oasis/data_parser.rb', line 17

# Text of the summary <h2> heading, memoized in @title.
#
# @return [String] heading text
def title
  return @title if @title

  @title = @node.at("./summary/div/h2").text
end
#title_to_docid(title) ⇒ String
Create document identifier from title.
258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 |
# File 'lib/relaton/oasis/data_parser.rb', line 258

# Create document identifier from title.
#
# When the title contains parenthesised abbreviations, those are joined with
# "-", suffixed with "-v<version>" when a version is present, and prefixed
# with "ebXML"/"ebMS" when the title mentions one of them.
#
# Otherwise the title (minus a trailing " [OASIS <n>]") is split into words
# and an acronym is assembled: capitalised words contribute their first
# letter, all-caps words and numbers are taken whole, and "Version"/"vN.N"
# start a version chunk. Chunks are joined with "-".
#
# @param title [String] document title
# @return [String] document identifier
def title_to_docid(title) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  abbrs = title.scan(/(?<=\()[^)]+(?=\))/)
  if abbrs.any?
    id = abbrs.map { |abbr| abbr.split.join("-") }.join "-"
    /(?:Version\s|v)(?<ver>[\d.]+)/ =~ title
    id += "-v#{ver}" if ver
    /(?<eb>ebXML|ebMS)/ =~ title
    id = "#{eb}-#{id}" if eb
    id
  else
    # True right after a chunk that is complete, so the next token starts a
    # new chunk instead of extending the current one.
    series_end = false
    title.sub(/\s\[OASIS\s\d+\]$/, "").split(/[,:]?\s|-|(?<=[a-z])(?=[A-Z][a-z])/)
      .each_with_object([""]) do |word, acc|
      # Take the matched text directly instead of reading the `$MATCH`
      # global, which is only set when the English library is loaded.
      if (version = word[/^v[\d.]+/])
        acc << version
        series_end = true
      elsif word.match?(/^Version/)
        acc << "v"
        series_end = false
      elsif word.match?(/^\d|ebXML|ebMS/)
        series_end ? acc << word : acc[-1] += word
        series_end = true
      elsif word.match?(/^\w+$/) && word == word.upcase
        series_end ? acc << word : acc[-1] = word
        series_end = true
      elsif word.match?(/[A-Z]+[a-z]+/)
        series_end ? acc << word[0] : acc[-1] += word[0]
        series_end = false
      end
    end.join "-"
  end
end