Class: Relaton::Oasis::DataParser
- Inherits:
-
Object
- Object
- Relaton::Oasis::DataParser
- Includes:
- DataParserUtils
- Defined in:
- lib/relaton/oasis/data_parser.rb
Overview
Parser for OASIS document.
Constant Summary
Constants included from DataParserUtils
Relaton::Oasis::DataParserUtils::RETRIABLE_PAGE_ERRORS
Instance Method Summary collapse
-
#document_part_refs ⇒ Array<String>
Look for “Cite as” references.
-
#initialize(node, errors = {}, agent: nil) ⇒ DataParser
constructor
Initialize parser.
- #link_node ⇒ Object
- #links ⇒ Object
-
#parse ⇒ ItemData
Parse document.
-
#parse_abstract ⇒ Array<Bib::LocalizedMarkedUpString>
Parse abstract.
-
#parse_authorizer ⇒ Object
Parse committee links as authorizer contributors.
-
#parse_date ⇒ Array<Bib::Date>
Parse date.
-
#parse_docnumber ⇒ String
Parse document number.
-
#parse_editorialgroup_contributor ⇒ Array<Bib::Contributor>
Parse editorial group as contributors.
-
#parse_link ⇒ Object
Parse links to the document.
-
#parse_relation ⇒ Array<Bib::Relation>
Parse relation.
-
#parse_technology_area ⇒ Array<String>
Parse technology areas.
-
#parse_title ⇒ Array<Bib::Title>
Parse title.
- #parts ⇒ Object
-
#parts_to_docid(parts) ⇒ String
Create document identifier from parts references.
- #text ⇒ Object
- #title ⇒ Object
-
#title_to_docid(title) ⇒ String
Create document identifier from title.
Methods included from DataParserUtils
#create_contribution_info, #create_ext, #create_person, #decode_cf_email, #page, #parse_chairs, #parse_contributor, #parse_docid, #parse_doctype, #parse_editors, #parse_editors_from_text, #parse_errata, #parse_part, #parse_spec, #person_affiliation, #person_email, #publisher_oasis, #retry_page
Constructor Details
#initialize(node, errors = {}, agent: nil) ⇒ DataParser
Initialize parser.
12 13 14 15 16 |
# File 'lib/relaton/oasis/data_parser.rb', line 12

# Initialize parser.
#
# @param node [Object] document node; the other methods query it through
#   +at+, +xpath+ and +css+
# @param errors [Hash] per-field flags updated by the +parse_*+ methods
# @param agent [Object, nil] stored as-is and handed to DataPartParser by
#   #parse_relation
def initialize(node, errors = {}, agent: nil)
  @node, @errors = node, errors
  @agent = agent
end
Instance Method Details
#document_part_refs ⇒ Array<String>
Look for “Cite as” references.
178 179 180 181 182 183 |
# File 'lib/relaton/oasis/data_parser.rb', line 178

# Look for "Cite as" references.
#
# Collects the text of every element matched by the selectors below and
# removes the square brackets wrapping each reference label.
#
# @return [Array<String>] reference labels without surrounding brackets
def document_part_refs
  @node.css(
    ".standard__grid--cite-as > p > strong", "span.Refterm", "span.abbrev",
    "span.citationLabel > strong"
  ).map do |ref|
    # Strip first so padding whitespace cannot hide a bracket from the
    # anchors, and remove up to two brackets on BOTH sides ("[[X]]" => "X").
    ref.text.strip.gsub(/^\[{1,2}|\]{1,2}$/, "").strip
  end
end
#link_node ⇒ Object
143 144 145 146 147 148 |
# File 'lib/relaton/oasis/data_parser.rb', line 143

# First anchor inside a "cite as" paragraph that contains a +strong+ or
# +span/strong+ child. A truthy lookup result is cached in +@link_node+.
#
# @return [Object, nil] whatever +@node.at+ returns for the XPath
def link_node
  return @link_node if @link_node

  @link_node = @node.at(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]" \
    "/p[strong or span/strong]/a",
  )
end
#links ⇒ Object
208 209 210 211 212 |
# File 'lib/relaton/oasis/data_parser.rb', line 208

# Anchors carrying an +href+ in the first paragraph of the first inner
# +div+; when that paragraph has none, the second paragraph is used instead.
#
# @return [Object] result of +@node.xpath+ (possibly empty)
def links
  primary = @node.xpath("./div/div/div[1]/p[1]/a[@href]")
  return primary unless primary.empty?

  @node.xpath("./div/div/div[1]/p[2]/a[@href]")
end
#parse ⇒ ItemData
Parse document.
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
# File 'lib/relaton/oasis/data_parser.rb', line 34

# Parse document.
#
# Runs every field parser (in the order listed below) and feeds the results
# to ItemData as keyword arguments.
#
# @return [ItemData] bibliographic item built from the node
def parse # rubocop:disable Metrics/MethodLength
  attributes = {
    type: "standard",
    title: parse_title,
    docidentifier: parse_docid,
    source: parse_link,
    docnumber: parse_docnumber,
    date: parse_date,
    contributor: parse_contributor,
    abstract: parse_abstract,
    language: ["en"],
    script: ["Latn"],
    relation: parse_relation,
    ext: create_ext,
  }
  ItemData.new(**attributes)
end
#parse_abstract ⇒ Array<Bib::LocalizedMarkedUpString>
Parse abstract.
84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/relaton/oasis/data_parser.rb', line 84

# Parse abstract.
#
# Joins the description paragraphs with newlines after collapsing runs of
# newlines/tabs inside each paragraph into a single space.
#
# @return [Array<Bib::LocalizedMarkedUpString>] one abstract, or an empty
#   array when the joined text is empty
def parse_abstract
  paragraphs = @node.xpath(
    "./summary/div/div[@class='standard__description']/p",
  )
  content = paragraphs.map { |para| para.text.gsub(/[\n\t]+/, " ").strip }
    .join("\n")
  abstracts = []
  unless content.empty?
    abstracts << Bib::Abstract.new(
      content: content, language: "en", script: "Latn",
    )
  end
  # An already-set flag is replaced by "was nothing found?"; unset stays unset.
  @errors[:abstract] &&= abstracts.empty?
  abstracts
end
#parse_authorizer ⇒ Object
Parse committee links as authorizer contributors.
128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# File 'lib/relaton/oasis/data_parser.rb', line 128

# Parse committee links as authorizer contributors.
#
# Every anchor under the +standard__details+ div becomes an organization
# (name from the anchor text, URI from its +href+) with the role
# "authorizer" described as "Committee".
#
# @return [Array<Bib::Contributor>] one contributor per committee link
def parse_authorizer # rubocop:disable Metrics/MethodLength
  result = @node.xpath("./div[@class='standard__details']/a").map do |a|
    org = Bib::Organization.new(
      name: [Bib::TypedLocalizedString.new(content: a.text.strip)],
      uri: [Bib::Uri.new(type: "uri", content: a[:href])],
    )
    desc = [Bib::LocalizedMarkedUpString.new(content: "Committee")]
    role = Bib::Contributor::Role.new(type: "authorizer", description: desc)
    Bib::Contributor.new(organization: org, role: [role])
  end
  # An already-set flag is replaced by "was nothing found?"; unset stays unset.
  @errors[:authorizer] &&= result.empty?
  result
end
#parse_date ⇒ Array<Bib::Date>
Parse date.
68 69 70 71 72 73 74 75 76 77 |
# File 'lib/relaton/oasis/data_parser.rb', line 68

# Parse date.
#
# Reads each +time.standard__date+ element and extracts a
# "<day> <month> <year>" fragment from its text. Elements without such a
# fragment are skipped instead of being passed to +Date.parse+ as an empty
# string (which raises). One- and two-digit days are both accepted.
#
# @return [Array<Bib::Date>] "issued" dates in ISO 8601 form
def parse_date
  xpath = "./summary/div/time[@class='standard__date']"
  result = @node.xpath(xpath).each_with_object([]) do |d, dates|
    date_str = d.text[/\d{1,2}\s\w+\s\d{4}/]
    next unless date_str

    dates << Bib::Date.new(at: Date.parse(date_str).to_s, type: "issued")
  end
  # An already-set flag is replaced by "was nothing found?"; unset stays unset.
  @errors[:date] &&= result.empty?
  result
end
#parse_docnumber ⇒ String
Parse document number.
219 220 221 222 223 224 225 226 227 228 229 230 |
# File 'lib/relaton/oasis/data_parser.rb', line 219

# Parse document number.
#
# The number is derived from the "Cite as" references: with none, from the
# +h2+ title; with exactly one, from that reference; with several, from
# what #parts_to_docid builds out of all of them.
#
# @return [String] document number
def parse_docnumber
  refs = document_part_refs
  result =
    if refs.empty?
      parse_spec title_to_docid(@node.at("./summary/div/h2").text)
    elsif refs.size == 1
      parse_part parse_spec(refs[0])
    else
      parts_to_docid refs
    end
  # An already-set flag is replaced by "is the number missing?".
  @errors[:docnumber] &&= result.nil?
  result
end
#parse_editorialgroup_contributor ⇒ Array<Bib::Contributor>
Parse editorial group as contributors.
104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/relaton/oasis/data_parser.rb', line 104

# Parse editorial group as contributors.
#
# All committee links found under the +standard__details+ div become
# "technical-committee" subdivisions of a single "OASIS" organization, which
# is returned as one contributor with the role "author" / "committee".
#
# @return [Array<Bib::Contributor>] one contributor, or an empty array when
#   there are no committee links
def parse_editorialgroup_contributor # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  committee_links = @node.xpath("./div[@class='standard__details']/a")
  contributors = []
  unless committee_links.empty?
    committees = committee_links.map do |link|
      committee_name = [Bib::TypedLocalizedString.new(content: link.text.strip)]
      Bib::Subdivision.new(type: "technical-committee", name: committee_name)
    end
    oasis = Bib::Organization.new(
      name: [Bib::TypedLocalizedString.new(content: "OASIS")],
      subdivision: committees,
    )
    description = [Bib::LocalizedMarkedUpString.new(content: "committee")]
    author = Bib::Contributor::Role.new(
      type: "author", description: description,
    )
    contributors << Bib::Contributor.new(organization: oasis, role: [author])
  end
  # An already-set flag is replaced by "was nothing found?"; unset stays unset.
  @errors[:editorialgroup_contributor] &&= contributors.empty?
  contributors
end
#parse_link ⇒ Object
Parse links to the document.
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
# File 'lib/relaton/oasis/data_parser.rb', line 185

# Parse links to the document.
#
# Returns no links when the document has more than one part. Otherwise each
# anchor from #links becomes a URI whose type is the extension of its
# +href+ ("docx" is rewritten to "doc", "html" to "src"); an +href+ without
# an extension gets the type "src".
#
# @return [Array<Bib::Uri>] typed links, possibly empty
def parse_link # rubocop:disable Metrics/MethodLength
  result =
    if parts.size > 1
      []
    else
      links.map do |l|
        extension = l[:href][/\.(\w+)$/, 1] || "src"
        # Non-mutating +sub+ keeps this working when the "src" fallback
        # literal is frozen (e.g. under frozen_string_literal).
        type = extension.sub("docx", "doc").sub("html", "src")
        Bib::Uri.new(type: type, content: l[:href])
      end
    end
  # An already-set flag is replaced by "was nothing found?"; unset stays unset.
  @errors[:link] &&= result.empty?
  result
end
#parse_relation ⇒ Array<Bib::Relation>
Parse relation.
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# File 'lib/relaton/oasis/data_parser.rb', line 155

# Parse relation.
#
# When more than one "cite as" paragraph is found, each paragraph is turned
# into a "hasPart" relation whose bibitem carries, as formatted reference,
# the content of the first identifier DataPartParser returns for it.
#
# @return [Array<Bib::Relation>] relations, empty for zero or one paragraph
def parse_relation # rubocop:disable Metrics/MethodLength
  part_nodes = @node.xpath(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]" \
    "/p[strong or span/strong or b/span]",
  )
  relations = []
  if part_nodes.size > 1
    relations = part_nodes.map do |part_node|
      part_ids = DataPartParser.new(part_node, @errors, agent: @agent).parse_docid
      ref = Bib::Formattedref.new(content: part_ids[0].content)
      Bib::Relation.new(type: "hasPart", bibitem: ItemData.new(formattedref: ref))
    end
  end
  # An already-set flag is replaced by "was nothing found?"; unset stays unset.
  @errors[:relation] &&= relations.empty?
  relations
end
#parse_technology_area ⇒ Array<String>
Parse technology areas.
298 299 300 301 302 |
# File 'lib/relaton/oasis/data_parser.rb', line 298

# Parse technology areas.
#
# Delegates to the inherited implementation with this parser's node and
# records in +@errors+ whether the result came back empty.
#
# @return [Array<String>] technology areas
def parse_technology_area
  super(@node).tap do |areas|
    # An already-set flag is replaced by "was nothing found?".
    @errors[:technology_area] &&= areas.empty?
  end
end
#parse_title ⇒ Array<Bib::Title>
Parse title.
56 57 58 59 60 61 |
# File 'lib/relaton/oasis/data_parser.rb', line 56

# Parse title.
#
# Wraps the text returned by #title in a single English "main" title.
#
# @return [Array<Bib::Title>] array with exactly one title
def parse_title
  main_title = Bib::Title.new(
    type: "main", content: title, language: "en", script: "Latn",
  )
  titles = [main_title]
  # The array always holds one element, so an already-set flag becomes false.
  @errors[:title] &&= titles.empty?
  titles
end
#parts ⇒ Object
201 202 203 204 205 206 |
# File 'lib/relaton/oasis/data_parser.rb', line 201

# "Cite as" paragraphs that contain a +strong+ or +span/strong+ child.
# A truthy lookup result is cached in +@parts+.
#
# @return [Object] result of +@node.xpath+ for those paragraphs
def parts
  return @parts if @parts

  @parts = @node.xpath(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]" \
    "/p[strong or span/strong]",
  )
end
#parts_to_docid(parts) ⇒ String
Create document identifier from parts references.
239 240 241 242 243 244 245 246 247 248 249 250 |
# File 'lib/relaton/oasis/data_parser.rb', line 239

# Create document identifier from parts references.
#
# Starts from the "-"-separated chunks of the first reference and, for each
# further reference, cuts that list at the first position where the chunks
# differ (compared case-insensitively). The remaining chunks are re-joined
# with "-" and passed through +parse_spec+ and +parse_part+.
#
# NOTE(review): a later reference with FEWER chunks that all match does not
# shorten the list - confirm this is intended.
#
# @param parts [Array<String>] parts references
# @return [String] document identifier
def parts_to_docid(parts) # rubocop:disable Metrics/AbcSize
  first_ref, *other_refs = parts
  shared = other_refs.each_with_object(first_ref.split("-")) do |ref, prefix|
    chunks = ref.split("-")
    mismatch_at = chunks.each_index.find do |idx|
      !chunks[idx].casecmp(prefix[idx])&.zero?
    end
    prefix.slice!(mismatch_at..-1) if mismatch_at
  end
  parse_part parse_spec(shared.join("-"))
end
#text ⇒ Object
22 23 24 25 26 27 |
# File 'lib/relaton/oasis/data_parser.rb', line 22

# Stripped text of the first "cite as" paragraph that has an +em+, +i+,
# +a+ or +span+ child. A truthy result is cached in +@text+.
#
# @return [String, nil] paragraph text, nil when no paragraph matches
def text
  return @text if @text

  paragraph = @node.at(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]" \
    "/p[em or i or a or span]",
  )
  @text = paragraph&.text&.strip
end
#title ⇒ Object
18 19 20 |
# File 'lib/relaton/oasis/data_parser.rb', line 18

# Text of the summary +h2+ heading, cached in +@title+ once truthy.
#
# @return [String] heading text
def title
  return @title if @title

  @title = @node.at("./summary/div/h2").text
end
#title_to_docid(title) ⇒ String
Create document identifier from title.
259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 |
# File 'lib/relaton/oasis/data_parser.rb', line 259

# Create document identifier from title.
#
# When the title contains parenthesised abbreviations, they form the
# identifier, followed by "-v<version>" if a version is present and
# preceded by "ebXML-"/"ebMS-" if the title mentions either. Otherwise an
# acronym is assembled word by word: upper-case words, numbers and
# ebXML/ebMS are kept whole, capitalised words contribute their initial,
# and "Version"/"v<n>" introduce the version chunk.
#
# @param title [String] document title
# @return [String] document identifier
def title_to_docid(title) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  abbrs = title.scan(/(?<=\()[^)]+(?=\))/)
  if abbrs.any?
    id = abbrs.map { |abbr| abbr.split.join("-") }.join "-"
    /(?:Version\s|v)(?<ver>[\d.]+)/ =~ title
    id += "-v#{ver}" if ver
    /(?<eb>ebXML|ebMS)/ =~ title
    id = "#{eb}-#{id}" if eb
    id
  else
    # true once the current chunk is complete and the next token must start
    # a new "-"-separated chunk.
    series_end = false
    words = title.sub(/\s\[OASIS\s\d+\]$/, "")
      .split(/[,:]?\s|-|(?<=[a-z])(?=[A-Z][a-z])/)
    words.each_with_object([""]) do |word, acc|
      # Take the matched text directly rather than via $MATCH, which is only
      # defined after `require "English"`.
      if (version = word[/^v[\d.]+/])
        acc << version
        series_end = true
      elsif word.match?(/^Version/)
        acc << "v"
        series_end = false
      elsif word.match?(/^\d|ebXML|ebMS/)
        series_end ? acc << word : acc[-1] += word
        series_end = true
      elsif word.match?(/^\w+$/) && word == word.upcase
        series_end ? acc << word : acc[-1] = word
        series_end = true
      elsif word.match?(/[A-Z]+[a-z]+/)
        series_end ? acc << word[0] : acc[-1] += word[0]
        series_end = false
      end
    end.join "-"
  end
end