Module: Relaton::Oasis::DataParserUtils
- Included in:
- DataParser, DataPartParser
- Defined in:
- lib/relaton/oasis/data_parser_utils.rb
Overview
Common methods for document and part parsers.
Constant Summary collapse
- RETRIABLE_PAGE_ERRORS =
[ Errno::ETIMEDOUT, Net::OpenTimeout, Ferrum::TimeoutError, Ferrum::PendingConnectionsError, Ferrum::StatusError, ].freeze
Instance Method Summary collapse
- #create_contribution_info(person_node, type, description = []) ⇒ Object
- #create_ext ⇒ Object
- #create_person(name, email = nil, org = nil) ⇒ Object
- #decode_cf_email(encoded) ⇒ Object
- #page ⇒ Object
-
#parse_chairs ⇒ Object
Parse the chairs listed on the document page (returned as "editor" contributors with a "Chair" role description).
-
#parse_contributor ⇒ Array<Bib::Contributor>
Parse contributor.
-
#parse_docid ⇒ Array<Bib::Docidentifier>
Parse document identifier.
-
#parse_doctype ⇒ Doctype
Parse document type.
-
#parse_editors ⇒ Object
Parse editors from the document page, falling back to the listing text when no page is available.
-
#parse_editors_from_text ⇒ Object
Parse editors from the "Edited by …" sentence of the listing text.
-
#parse_errata(id) ⇒ String
Parse document identifier errata.
-
#parse_part(docid) ⇒ String
Parse document identifier part.
-
#parse_spec(num) ⇒ String
Parse document identifier specification.
-
#parse_technology_area(node) ⇒ Array<String>
Parse technology area.
- #person_affiliation(org) ⇒ Object
- #person_email(email) ⇒ Object
- #publisher_oasis ⇒ Object
-
#retry_page(url, agent, retries = 3) ⇒ Nokogiri::HTML::Document, ...
Fetch a page, retrying when a retriable error occurs.
Instance Method Details
#create_contribution_info(person_node, type, description = []) ⇒ Object
130 131 132 133 134 135 136 137 138 139 140 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 130

#
# Build a contributor from a node describing a single person.
#
# The person's name is the text of the node up to the first "(". The first
# two `a[@href]` descendants are passed to #create_person as the email and
# organization links, in that order.
#
# @param person_node [#text, #xpath] node describing the person
# @param type [String] contributor role type
# @param description [Array<String>] role descriptions
#
# @return [Bib::Contributor, nil] nil when the extracted text does not
#   start with a letter or starts with an http(s)/urn scheme
#
def create_contribution_info(person_node, type, description = [])
  name = person_node.text[/^[^(]+/].to_s.strip

  # An empty string never starts with a letter, so this also rejects
  # nodes without any leading text.
  plausible_name = name.match?(/\A\p{L}/) &&
    !name.match?(%r{\A(?:https?://|urn:)})
  return unless plausible_name

  email_link, org_link = person_node.xpath(".//a[@href]")
  person = create_person(name, email_link, org_link)

  role_description = description.map do |content|
    Bib::LocalizedMarkedUpString.new(content: content)
  end
  roles = [Bib::Contributor::Role.new(type: type, description: role_description)]

  Bib::Contributor.new(role: roles, person: person)
end
#create_ext ⇒ Object
287 288 289 290 291 292 293 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 287

#
# Build the flavor-specific extension of the bibliographic item.
#
# NOTE(review): #parse_technology_area is called without arguments here;
# presumably the including parser overrides it with a zero-argument
# version - confirm against DataParser / DataPartParser.
#
# @return [Ext]
#
def create_ext
  doctype = parse_doctype
  areas = parse_technology_area

  Ext.new(doctype: doctype, flavor: "oasis", technology_area: areas)
end
#create_person(name, email = nil, org = nil) ⇒ Object
142 143 144 145 146 147 148 149 150 151 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 142

#
# Build a person from a whitespace-separated name.
#
# The first token becomes the forename and the second the surname.
# NOTE(review): any further tokens are dropped - confirm this is intended
# for names with more than two parts.
#
# @param name [String] full name
# @param email [Object, nil] link node passed to #person_email
# @param org [Object, nil] link node passed to #person_affiliation
#
# @return [Bib::Person]
#
def create_person(name, email = nil, org = nil)
  tokens = name.split
  latin_en = { language: "en", script: "Latn" }

  given = Bib::FullNameType::Forename.new(content: tokens[0], **latin_en)
  family = Bib::LocalizedString.new(content: tokens[1], **latin_en)
  full_name = Bib::FullName.new(surname: family, forename: [given])

  Bib::Person.new(
    name: full_name,
    email: person_email(email),
    affiliation: person_affiliation(org),
  )
end
#decode_cf_email(encoded) ⇒ Object
175 176 177 178 179 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 175

#
# Decode a hex-encoded, XOR-obfuscated email string (the value of a
# `data-cfemail` attribute).
#
# The first decoded byte is the XOR key; every following byte is XOR-ed
# with it to recover one character.
#
# @param encoded [String, nil] hex string
#
# @return [String] decoded text; empty when there is no payload
#   (nil, empty string, or a key byte only)
#
def decode_cf_email(encoded)
  # Splatting keeps `payload` an Array even for empty input, where the
  # previous `bytes[1..]` slice returned nil and raised NoMethodError.
  key, *payload = [encoded.to_s].pack("H*").bytes
  payload.map { |byte| (byte ^ key).chr }.join
end
#page ⇒ Object
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 60

#
# Fetch the linked HTML page once and memoize the result.
#
# Nothing is fetched unless the link node has an href ending in ".html".
# The outcome (including nil) is cached in @page, so later calls never
# fetch again.
#
# @return [Object, nil] the fetched page when the response status is 200,
#   otherwise nil
#
def page
  return @page if defined? @page

  @page = nil
  # Safe navigation: a link node without an href is treated like a
  # non-HTML link instead of raising NoMethodError.
  href = link_node && link_node[:href]
  return @page unless href&.match?(/\.html$/)

  if @agent
    doc = retry_page(href, @agent)
    @page = doc if doc && @agent.last_status == 200
  else
    # No injected agent (e.g. unit tests with VCR cassettes): fall back
    # to a Mechanize request — VCR can intercept it.
    agent = Mechanize.new
    agent.agent.allowed_error_codes = [403, 404, 503]
    resp = retry_page(href, agent)
    @page = resp if resp && resp.code == "200"
  end
  @page
end
#parse_chairs ⇒ Object
Parse the chairs listed on the document page (returned as "editor" contributors with a "Chair" role description).
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 97

#
# Parse chairs from the document page.
#
# Selects the paragraphs located after a paragraph starting with "Chair"
# and before a paragraph starting with "Editor". Each one becomes an
# "editor" contributor described as "Chair". Without a page the result is
# empty.
#
# An @errors[:chairs] flag that is already truthy is replaced with whether
# the result is empty; a falsy flag is left untouched.
#
# @return [Array<Bib::Contributor>]
#
def parse_chairs # rubocop:disable Metrics/MethodLength
  chairs = []
  if page
    between_chair_and_editor =
      "//p[preceding-sibling::p[starts-with(., 'Chair')]]" \
      "[following-sibling::p[starts-with(., 'Editor')]]"
    chairs = page.xpath(between_chair_and_editor).map do |paragraph|
      create_contribution_info(paragraph, "editor", ["Chair"])
    end.compact
  end
  @errors[:chairs] &&= chairs.empty?
  chairs
end
#parse_contributor ⇒ Array<Bib::Contributor>
Parse contributor.
20 21 22 23 24 25 26 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 20

#
# Parse contributor.
#
# Concatenates, in order, the OASIS publisher entry, the editorial group
# contributors, the chairs and the editors.
#
# An @errors[:contributor] flag that is already truthy is replaced with
# whether the result is empty; a falsy flag is left untouched.
#
# @return [Array<Bib::Contributor>]
#
def parse_contributor
  # A stray second "+" here used to be parsed as unary plus on an Array,
  # which raises NoMethodError.
  result = publisher_oasis +
    parse_editorialgroup_contributor +
    parse_chairs +
    parse_editors
  @errors[:contributor] &&= result.empty?
  result
end
#parse_docid ⇒ Array<Bib::Docidentifier>
Parse document identifier.
245 246 247 248 249 250 251 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 245

#
# Parse document identifier.
#
# Builds a single primary identifier of type "OASIS" whose content is
# "OASIS " followed by the value of #parse_docnumber.
#
# An @errors[:docid] flag that is already truthy is replaced with whether
# the result is empty; a falsy flag is left untouched.
#
# @return [Array<Bib::Docidentifier>]
#
def parse_docid
  primary_id = Bib::Docidentifier.new(
    type: "OASIS",
    content: "OASIS #{parse_docnumber}",
    primary: true,
  )
  ids = [primary_id]
  @errors[:docid] &&= ids.empty?
  ids
end
#parse_doctype ⇒ Doctype
Parse document type.
258 259 260 261 262 263 264 265 266 267 268 269 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 258

#
# Parse document type.
#
# The type is chosen by the first pattern found in #text; when none is
# found the type is "standard".
#
# An @errors[:doctype] flag that is already truthy is replaced with whether
# the result is nil; a falsy flag is left untouched.
#
# @return [Doctype]
#
def parse_doctype
  # Ordered pattern => type rules; the first match wins.
  rules = [
    [/OASIS Project Specification/, "specification"],
    [/Committee Specification/, "specification"],
    [/Technical Memorandum/, "memorandum"],
    [/Technical Resolution/, "resolution"],
  ]
  source = text
  _pattern, type = rules.find { |pattern, _| pattern.match?(source) }

  doctype = Doctype.new(content: type || "standard")
  @errors[:doctype] &&= doctype.nil?
  doctype
end
#parse_editors ⇒ Object
rubocop:disable Metrics/MethodLength
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 113

#
# Parse editors.
#
# With a page available, selects the paragraphs whose class contains
# "Contributor" and that sit after a paragraph starting with "Editor" and
# before a paragraph whose class contains "Title". Without a page, falls
# back to #parse_editors_from_text.
#
# An @errors[:editors] flag that is already truthy is replaced with whether
# the result is empty; a falsy flag is left untouched.
#
# @return [Array<Bib::Contributor>]
#
def parse_editors # rubocop:disable Metrics/MethodLength
  editors =
    if page
      editor_paragraphs =
        "//p[contains(@class, 'Contributor')]" \
        "[preceding-sibling::p[starts-with(., 'Editor')]]" \
        "[following-sibling::p[contains(@class, 'Title')]]"
      page.xpath(editor_paragraphs)
        .map { |paragraph| create_contribution_info(paragraph, "editor") }
        .compact
    else
      parse_editors_from_text
    end
  @errors[:editors] &&= editors.empty?
  editors
end
#parse_editors_from_text ⇒ Object
rubocop:disable Metrics/MethodLength
45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 45

#
# Parse editors from the text.
#
# Takes what follows "Edited by " up to the next period and splits it on
# ", ", " and " and ", and ". Each piece becomes an "editor" contributor
# built with #create_person. The result is empty when there is no text or
# no "Edited by" sentence.
#
# An @errors[:editors] flag that is already truthy is replaced with whether
# the result is empty; a falsy flag is left untouched.
#
# @return [Array<Bib::Contributor>]
#
def parse_editors_from_text # rubocop:disable Metrics/MethodLength
  # nil text and a missing byline both collapse to an empty string,
  # which splits into no names at all.
  byline = text.to_s[/(?<=Edited\sby\s)[^.]+/].to_s
  editors = byline.split(/,?\sand\s|,\s/).map do |editor_name|
    roles = [Bib::Contributor::Role.new(type: "editor")]
    Bib::Contributor.new(role: roles, person: create_person(editor_name))
  end
  @errors[:editors] &&= editors.empty?
  editors
end
#parse_errata(id) ⇒ String
Parse document identifier errata.
230 231 232 233 234 235 236 237 238 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 230

#
# Parse document identifier errata.
#
# An identifier that already contains "errata<digits>" (any case) only has
# its first lowercase "errata" capitalized. Otherwise the errata number is
# taken from #title and appended as "-plus-Errata<n>" or "-Errata<n>".
#
# @param id [String] document identifier
#
# @return [String] document identifier with errata if needed
#
def parse_errata(id)
  return id.sub("errata", "Errata") if id.match?(/errata\d+/i)

  heading = title
  # "Plus Errata" has to be tried first: the plain pattern matches it too.
  if (plus = /Plus\sErrata\s(\d+)/.match(heading))
    "#{id}-plus-Errata#{plus[1]}"
  elsif (plain = /Errata\s(\d+)/.match(heading))
    "#{id}-Errata#{plain[1]}"
  else
    id
  end
end
#parse_part(docid) ⇒ String
Parse document identifier part.
214 215 216 217 218 219 220 221 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 214

#
# Parse document identifier part.
#
# An identifier that already contains "Part<digits>" or "Pt<digits>" (any
# case) is returned unchanged. Otherwise a "Part <n>" found in #title is
# appended as "-Pt<n>".
#
# @param docid [String] document identifier
#
# @return [String] document identifier with part if needed
#
def parse_part(docid)
  already_has_part = docid.match?(/(?:Part|Pt)\d+/i)
  # The title is only consulted when the identifier carries no part yet.
  part = already_has_part ? nil : /Part\s(\d+)/.match(title)
  part ? "#{docid}-Pt#{part[1]}" : docid
end
#parse_spec(num) ⇒ String
Parse document identifier specification.
199 200 201 202 203 204 205 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 199

#
# Parse document identifier specification.
#
# Appends "-PS<n>" for an "OASIS Project Specification <n>" or "-CS<n>" for
# a "Committee Specification <n>" found in #text; otherwise the number is
# returned unchanged.
#
# @param num [String] document number
#
# @return [String] document number with specification stage if needed
#
def parse_spec(num)
  source = text
  if (project = /OASIS Project Specification (\d+)/.match(source))
    "#{num}-PS#{project[1]}"
  elsif (committee = /Committee Specification (\d+)/.match(source))
    "#{num}-CS#{committee[1]}"
  else
    num
  end
end
#parse_technology_area(node) ⇒ Array<String>
Parse technology area.
276 277 278 279 280 281 282 283 284 285 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 276

#
# Parse technology area.
#
# Reads the links of the "technology-areas__list" list under the node's
# summary. Each label is stripped, every whitespace character is replaced
# with "-", and the first "development" is capitalized.
#
# An @errors[:technology_area] flag that is already truthy is replaced with
# whether the result is empty; a falsy flag is left untouched.
#
# @param node [#xpath] document node
#
# @return [Array<String>] technology areas
#
def parse_technology_area(node)
  links = node.xpath(
    "./summary/div/div/ul[@class='technology-areas__list']/li/a",
  )
  areas = links.map do |link|
    label = link.text.strip
    label.gsub(/\s/, "-").sub("development", "Development")
  end
  @errors[:technology_area] &&= areas.empty?
  areas
end
#person_affiliation(org) ⇒ Object
181 182 183 184 185 186 187 188 189 190 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 181

#
# Build the affiliation list for a person from an organization link.
#
# The organization name is the link text with each run of line breaks
# replaced by a single space; its URI is the link's href.
#
# @param org [#text, #[], nil] organization link node
#
# @return [Array<Bib::Affiliation>] empty when no link is given
#
def person_affiliation(org)
  return [] unless org

  single_line_name = org.text.gsub(/[\r\n]+/, " ")
  names = [Bib::TypedLocalizedString.new(content: single_line_name)]
  uris = [Bib::Uri.new(type: "uri", content: org[:href])]

  [Bib::Affiliation.new(organization: Bib::Organization.new(name: names, uri: uris))]
end
#person_email(email) ⇒ Object
153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 153

#
# Extract a person's email address from a link.
#
# Two forms are handled: a plain "mailto:" href, and a link containing a
# `span[@data-cfemail]` whose value is decoded with #decode_cf_email.
#
# @param email [#[], #at, nil] link node
#
# @return [Array<String>] one address, or an empty array when the link is
#   missing or carries no usable address
#
def person_email(email)
  return [] unless email

  href = email[:href]
  if href.start_with?("mailto:")
    address = href.split(":")[1]
    # "mailto:" with nothing after it used to produce [nil] (or [""] for
    # "mailto::..."); report no address instead.
    address.to_s.empty? ? [] : [address]
  elsif (cf_email = email.at(".//span[@data-cfemail]"))
    decoded = decode_cf_email(cf_email["data-cfemail"])
    return [] if decoded.empty?

    # Cloudflare obfuscates ASCII email characters in the data-cfemail
    # span but leaves non-ASCII characters (e.g. the Latin "fl" ligature
    # U+FB02) as plain text outside the span. Concatenate any sibling
    # text and NFKC-normalize so ligatures become their ASCII equivalent.
    prefix = cf_email.xpath("./preceding-sibling::node()").map(&:text).join
    suffix = cf_email.xpath("./following-sibling::node()").map(&:text).join
    [(prefix + decoded + suffix).unicode_normalize(:nfkc)]
  else
    []
  end
end
#publisher_oasis ⇒ Object
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 28

#
# Build the fixed contributor entry for OASIS itself.
#
# The organization carries two roles: "authorizer" (described as
# "Standards Development Organization") and "publisher".
#
# @return [Array<Bib::Contributor>] single-element array
#
def publisher_oasis
  oasis = Bib::Organization.new(
    name: [Bib::TypedLocalizedString.new(content: "OASIS")],
    uri: [Bib::Uri.new(type: "uri", content: "https://www.oasis-open.org/")],
  )

  sdo = Bib::LocalizedMarkedUpString.new(
    content: "Standards Development Organization",
  )
  authorizer = Bib::Contributor::Role.new(type: "authorizer", description: [sdo])
  publisher = Bib::Contributor::Role.new(type: "publisher")

  [Bib::Contributor.new(organization: oasis, role: [authorizer, publisher])]
end
#retry_page(url, agent, retries = 3) ⇒ Nokogiri::HTML::Document, ...
Retry to get page.
88 89 90 91 92 93 94 95 |
# File 'lib/relaton/oasis/data_parser_utils.rb', line 88

#
# Retry to get page.
#
# Waits one second before every attempt. When one of
# RETRIABLE_PAGE_ERRORS is raised, the request is attempted again until
# `retries` attempts have been made; then the failure is reported through
# Util.error and nil is returned. Other exceptions are not rescued.
#
# @param url [String] page URL
# @param agent [#get] HTTP agent
# @param retries [Integer] maximum number of attempts
#
# @return [Object, nil] whatever `agent.get` returns, or nil when every
#   attempt failed
#
def retry_page(url, agent, retries = 3)
  sleep 1 # to avoid 429 error
  agent.get url
rescue *RETRIABLE_PAGE_ERRORS => e
  # `retry` re-runs the method body; `retries` keeps its decremented value.
  retry if (retries -= 1).positive?
  Util.error "Failed to get page `#{url}`\n#{e.message}"
  nil
end