Module: Relaton::Oasis::DataParserUtils

Included in:
DataParser, DataPartParser
Defined in:
lib/relaton/oasis/data_parser_utils.rb

Overview

Common methods for document and part parsers.

Instance Method Summary

Instance Method Details

#create_contribution_info(person_node, type, description = []) ⇒ Object



112
113
114
115
116
117
118
119
# File 'lib/relaton/oasis/data_parser_utils.rb', line 112

# Build a contributor record from a node that describes one person.
#
# The person's name is the node text up to the first "(" with surrounding
# whitespace removed. The first two `a[@href]` descendants of the node are
# passed to #create_person as the email link and the organization link.
#
# @param person_node [#text, #xpath] node describing the person
# @param type [String] value for the contributor role's `type` (e.g. "editor")
# @param description [Array<String>] role descriptions, one per string
# @return [Bib::Contributor] contributor carrying exactly one role
def create_contribution_info(person_node, type, description = [])
  person_name = person_node.text[/^[^(]+/].to_s.strip
  email_link, org_link = person_node.xpath(".//a[@href]")
  person = create_person(person_name, email_link, org_link)
  notes = description.map do |content|
    Bib::LocalizedMarkedUpString.new(content: content)
  end
  contributor_role = Bib::Contributor::Role.new(type: type,
                                                description: notes)
  Bib::Contributor.new(role: [contributor_role], person: person)
end

#create_ext ⇒ Object



258
259
260
261
262
263
264
# File 'lib/relaton/oasis/data_parser_utils.rb', line 258

# Assemble the extension object for the document: its doctype, the fixed
# "oasis" flavor and the technology areas.
#
# NOTE(review): `parse_technology_area` is invoked without arguments here;
# confirm the including class provides a definition that accepts that.
#
# @return [Ext]
def create_ext
  attributes = {
    doctype: parse_doctype,
    flavor: "oasis",
    technology_area: parse_technology_area,
  }
  Ext.new(**attributes)
end

#create_person(name, email = nil, org = nil) ⇒ Object



121
122
123
124
125
126
127
128
129
130
# File 'lib/relaton/oasis/data_parser_utils.rb', line 121

# Build a person from a whitespace-separated full name.
#
# The last token is used as the surname and every preceding token becomes a
# forename, so middle names and initials are no longer dropped or mistaken
# for the surname ("Patrick J. Durusau" => forenames "Patrick", "J." and
# surname "Durusau"). A single-token name is kept as a forename with an
# empty surname.
#
# @param name [String] full name
# @param email [Object, nil] link node passed on to #person_email
# @param org [Object, nil] link node passed on to #person_affiliation
# @return [Bib::Person]
def create_person(name, email = nil, org = nil)
  tokens = name.split
  # With fewer than two tokens there is no surname to split off.
  given = tokens.size > 1 ? tokens[0..-2] : [tokens.first]
  family = tokens.size > 1 ? tokens.last : nil
  forenames = given.map do |token|
    Bib::FullNameType::Forename.new(content: token, language: "en",
                                    script: "Latn")
  end
  sn = Bib::LocalizedString.new(content: family, language: "en",
                                script: "Latn")
  fullname = Bib::FullName.new(surname: sn, forename: forenames)
  Bib::Person.new(name: fullname, email: person_email(email),
                  affiliation: person_affiliation(org))
end

#decode_cf_email(encoded) ⇒ Object



146
147
148
149
150
# File 'lib/relaton/oasis/data_parser_utils.rb', line 146

# Decode a hex string in which the first byte is an XOR key and every
# following byte is one byte of the address XORed with that key (the
# `data-cfemail` attribute value read by #person_email).
#
# Nil or empty input yields an empty string instead of raising, and the
# decoded bytes are tagged as UTF-8 so non-ASCII addresses are not returned
# as binary strings.
#
# @param encoded [String, nil] hex-encoded payload
# @return [String] decoded text, "" when there is nothing to decode
def decode_cf_email(encoded)
  key, *payload = [encoded.to_s].pack("H*").bytes
  # With no bytes at all `payload` is empty, so `key` (nil) is never used.
  payload.map { |byte| byte ^ key }.pack("C*").force_encoding(Encoding::UTF_8)
end

#page ⇒ Object



50
51
52
53
54
55
56
57
58
59
# File 'lib/relaton/oasis/data_parser_utils.rb', line 50

# Fetch, once, the HTML page the document link points to.
#
# The outcome is memoized in `@page` whatever it is: a missing link, a
# non-`.html` href, a failed download or a non-200 response are remembered
# as nil, so repeated calls do not trigger new HTTP requests (each of which
# sleeps and may retry inside #retry_page).
#
# @return [Mechanize::Page, nil] the page on a "200" response, otherwise nil
def page
  return @page if defined? @page

  href = link_node && link_node[:href]
  # Assign unconditionally so that `defined? @page` also caches a nil result.
  @page = if href.to_s.match?(/\.html$/)
            agent = Mechanize.new
            # 404 is allowed so it comes back as a response rather than an
            # exception; it is then rejected by the status check below.
            agent.agent.allowed_error_codes = [404]
            resp = retry_page(href, agent)
            resp if resp && resp.code == "200"
          end
end

#parse_chairs ⇒ Object

rubocop:disable Metrics/MethodLength



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/relaton/oasis/data_parser_utils.rb', line 79

# Collect the chairs listed on the fetched HTML page.
#
# Chairs are the paragraphs that come after a paragraph starting with
# "Chair" and before one starting with "Editor". Each is turned into an
# "editor" contributor described as "Chair". Without a page the result is
# empty.
#
# @return [Array] contributors built by #create_contribution_info
def parse_chairs
  chairs = []
  if page
    query = "//p[preceding-sibling::p[starts-with(., 'Chair')]]" \
            "[following-sibling::p[starts-with(., 'Editor')]]"
    chairs = page.xpath(query).map do |node|
      create_contribution_info(node, "editor", ["Chair"])
    end
  end
  # The flag only stays truthy while no chairs have been found.
  @errors[:chairs] &&= chairs.empty?
  chairs
end

#parse_contributor ⇒ Array<Bib::Contributor>

Parse contributor.

Returns:

  • (Array<Bib::Contributor>)

    contributors



10
11
12
13
14
15
16
# File 'lib/relaton/oasis/data_parser_utils.rb', line 10

# Gather every contributor of the document: the OASIS publisher, the
# authorizer, the editorial group, the chairs and the editors, in that order.
#
# @return [Array<Bib::Contributor>] contributors
def parse_contributor
  groups = [
    publisher_oasis,
    parse_authorizer,
    parse_editorialgroup_contributor,
    parse_chairs,
    parse_editors,
  ]
  contributors = groups.reduce(:+)
  # The flag only stays truthy while no contributors have been found.
  @errors[:contributor] &&= contributors.empty?
  contributors
end

#parse_docid ⇒ Array<Bib::Docidentifier>

Parse document identifier.

Returns:

  • (Array<Bib::Docidentifier>)

    document identifier



216
217
218
219
220
221
222
# File 'lib/relaton/oasis/data_parser_utils.rb', line 216

# Build the primary document identifier, "OASIS " followed by the document
# number.
#
# @return [Array<Bib::Docidentifier>] one-element list with the identifier
def parse_docid
  identifier = Bib::Docidentifier.new(
    type: "OASIS",
    content: "OASIS #{parse_docnumber}",
    primary: true,
  )
  docids = [identifier]
  # The list always holds one element, so a truthy flag becomes false here.
  @errors[:docid] &&= docids.empty?
  docids
end

#parse_doctype ⇒ Doctype

Parse document type.

Returns:

  • (Doctype)

    document type



229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/relaton/oasis/data_parser_utils.rb', line 229

# Derive the document type from the document text.
#
# The first matching phrase decides the type; when none matches the type is
# "standard".
#
# @return [Doctype] document type
def parse_doctype
  rules = [
    [/OASIS Project Specification/, "specification"],
    [/Committee Specification/, "specification"],
    [/Technical Memorandum/, "memorandum"],
    [/Technical Resolution/, "resolution"],
  ]
  source = text.to_s
  _pattern, kind = rules.find { |pattern, _| pattern.match?(source) }
  doctype = Doctype.new(content: kind || "standard")
  @errors[:doctype] &&= doctype.nil?
  doctype
end

#parse_editors ⇒ Object

rubocop:disable Metrics/MethodLength



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/relaton/oasis/data_parser_utils.rb', line 95

# Collect the document editors.
#
# With a fetched page, editors are the `Contributor`-class paragraphs found
# after a paragraph starting with "Editor" and before a `Title`-class
# paragraph. Without a page they are taken from the document text instead.
#
# @return [Array] editor contributors
def parse_editors # rubocop:disable Metrics/MethodLength
  if page
    query = [
      "//p[contains(@class, 'Contributor')]",
      "[preceding-sibling::p[starts-with(., 'Editor')]]",
      "[following-sibling::p[contains(@class, 'Title')]]",
    ].join
    editors = page.xpath(query).map do |node|
      create_contribution_info(node, "editor")
    end
  else
    editors = parse_editors_from_text
  end
  # The flag only stays truthy while no editors have been found.
  @errors[:editors] &&= editors.empty?
  editors
end

#parse_editors_from_text ⇒ Object

rubocop:disable Metrics/MethodLength



35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/relaton/oasis/data_parser_utils.rb', line 35

# Extract editors from the phrase following "Edited by " in the document
# text.
#
# The phrase runs up to the first period and is split on ", ", " and " and
# ", and ". Every resulting name becomes an "editor" contributor.
#
# @return [Array<Bib::Contributor>] editors, empty when there is no text or
#   no "Edited by" phrase
def parse_editors_from_text # rubocop:disable Metrics/MethodLength
  source = text
  names = if source
            source.match(/(?<=Edited\sby\s)[^.]+/).to_s
              .split(/,?\sand\s|,\s/)
          else
            []
          end
  editors = names.map do |full_name|
    Bib::Contributor.new(
      role: [Bib::Contributor::Role.new(type: "editor")],
      person: create_person(full_name),
    )
  end
  # The flag only stays truthy while no editors have been found.
  @errors[:editors] &&= editors.empty?
  editors
end

#parse_errata(id) ⇒ String

Parse document identifier errata.

Parameters:

  • id (String)

    document identifier

Returns:

  • (String)

    document identifier with errata if needed



201
202
203
204
205
206
207
208
209
# File 'lib/relaton/oasis/data_parser_utils.rb', line 201

# Add an errata suffix to the identifier when one applies.
#
# An identifier that already contains "errata<digits>" (any letter case) is
# returned with its first lowercase "errata" capitalized. Otherwise the
# errata number is looked up in the title: "Plus Errata N" appends
# "-plus-ErrataN" and "Errata N" appends "-ErrataN".
#
# @param id [String] document identifier
# @return [String] document identifier with errata if needed
def parse_errata(id)
  return id.sub("errata", "Errata") if id.match?(/errata\d+/i)

  heading = title
  if (plus = /Plus\sErrata\s(\d+)/.match(heading))
    "#{id}-plus-Errata#{plus[1]}"
  elsif (plain = /Errata\s(\d+)/.match(heading))
    "#{id}-Errata#{plain[1]}"
  else
    id
  end
end

#parse_part(docid) ⇒ String

Parse document identifier part.

Parameters:

  • docid (String)

    document identifier

Returns:

  • (String)

    document identifier with part if needed



185
186
187
188
189
190
191
192
# File 'lib/relaton/oasis/data_parser_utils.rb', line 185

# Add a part suffix to the identifier when one applies.
#
# An identifier that already contains "Part<digits>" or "Pt<digits>" (any
# letter case) is returned unchanged. Otherwise a "Part N" phrase in the
# title appends "-PtN".
#
# @param docid [String] document identifier
# @return [String] document identifier with part if needed
def parse_part(docid)
  return docid if docid.match?(/(?:Part|Pt)\d+/i)

  part = /Part\s(\d+)/.match(title)
  part ? "#{docid}-Pt#{part[1]}" : docid
end

#parse_spec(num) ⇒ String

Parse document identifier specification.

Parameters:

  • num (String)

    document number

Returns:

  • (String)

    document identifier with specification if needed



170
171
172
173
174
175
176
# File 'lib/relaton/oasis/data_parser_utils.rb', line 170

# Add a specification-stage suffix to the document number when the text
# names one.
#
# "OASIS Project Specification N" appends "-PSN" and is checked first;
# "Committee Specification N" appends "-CSN".
#
# @param num [String] document number
# @return [String] document identifier with specification if needed
def parse_spec(num)
  source = text
  stages = {
    "PS" => /OASIS Project Specification (\d+)/,
    "CS" => /Committee Specification (\d+)/,
  }
  stages.each do |suffix, pattern|
    found = pattern.match(source)
    return "#{num}-#{suffix}#{found[1]}" if found
  end
  num
end

#parse_technology_area(node) ⇒ Array<String>

Parse technology area.

Returns:

  • (Array<String>)

    technology areas



247
248
249
250
251
252
253
254
255
256
# File 'lib/relaton/oasis/data_parser_utils.rb', line 247

# Read the technology areas linked from the node's
# `technology-areas__list` list.
#
# Each link text is stripped, has every whitespace character replaced with
# "-", and has its first "development" capitalized.
#
# @param node [#xpath] node the relative XPath is evaluated against
# @return [Array<String>] technology areas
def parse_technology_area(node)
  links = node.xpath(
    "./summary/div/div/ul[@class='technology-areas__list']/li/a",
  )
  areas = links.map do |link|
    label = link.text.strip
    label.gsub(/\s/, "-").sub("development", "Development")
  end
  # The flag only stays truthy while no areas have been found.
  @errors[:technology_area] &&= areas.empty?
  areas
end

#person_affiliation(org) ⇒ Object



152
153
154
155
156
157
158
159
160
161
# File 'lib/relaton/oasis/data_parser_utils.rb', line 152

# Turn an organization link into a person's affiliation list.
#
# The organization name is the link text with each run of line breaks
# replaced by one space; its URI is the link's `href`.
#
# @param org [#text, #[], nil] organization link
# @return [Array<Bib::Affiliation>] one affiliation, or [] without a link
def person_affiliation(org)
  affiliations = []
  if org
    label = Bib::TypedLocalizedString.new(
      content: org.text.gsub(/[\r\n]+/, " "),
    )
    link = Bib::Uri.new(type: "uri", content: org[:href])
    employer = Bib::Organization.new(name: [label], uri: [link])
    affiliations << Bib::Affiliation.new(organization: employer)
  end
  affiliations
end

#person_email(email) ⇒ Object



132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/relaton/oasis/data_parser_utils.rb', line 132

# Extract a person's email address from a link node.
#
# Two link forms are understood:
# * `mailto:` links - the address is everything after the scheme, without
#   any `?query` part (such as `?subject=...`); a `mailto:` with no address
#   yields no email;
# * links wrapping a `span[@data-cfemail]` - the attribute is decoded with
#   #decode_cf_email.
#
# @param email [#[], #at, nil] link node
# @return [Array<String>] zero or one address
def person_email(email)
  return [] unless email

  href = email[:href].to_s
  if href.start_with?("mailto:")
    # Strip the scheme instead of splitting on ":" so that an empty address
    # does not produce [nil] and later colons do not truncate the value.
    address = href.delete_prefix("mailto:").sub(/\?.*\z/m, "")
    address.empty? ? [] : [address]
  elsif (cf_email = email.at(".//span[@data-cfemail]"))
    decoded = decode_cf_email(cf_email["data-cfemail"])
    decoded.empty? ? [] : [decoded]
  else
    []
  end
end

#publisher_oasis ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/relaton/oasis/data_parser_utils.rb', line 18

# Build the contributor entry for OASIS itself.
#
# The organization carries two roles: "authorizer", described as
# "Standards Development Organization", and "publisher".
#
# @return [Array<Bib::Contributor>] one-element list
def publisher_oasis
  oasis_name = Bib::TypedLocalizedString.new(content: "OASIS")
  oasis_uri = Bib::Uri.new(type: "uri",
                           content: "https://www.oasis-open.org/")
  oasis = Bib::Organization.new(name: [oasis_name], uri: [oasis_uri])
  sdo_note = Bib::LocalizedMarkedUpString.new(
    content: "Standards Development Organization",
  )
  roles = [
    Bib::Contributor::Role.new(type: "authorizer", description: [sdo_note]),
    Bib::Contributor::Role.new(type: "publisher"),
  ]
  [Bib::Contributor.new(organization: oasis, role: roles)]
end

#retry_page(url, agent, retries = 3) ⇒ Mechanize::Page?

Retry to get page.

Parameters:

  • url (String)

    page URL

  • agent (Mechanize)

    HTTP client

  • retries (Integer) (defaults to: 3)

    number of retries

Returns:

  • (Mechanize::Page, nil)

    page or nil



70
71
72
73
74
75
76
77
# File 'lib/relaton/oasis/data_parser_utils.rb', line 70

# Download a page, trying again when the connection times out.
#
# Every attempt is preceded by a one-second pause. `retries` is the total
# number of attempts: the counter is decremented after each timeout and
# another attempt is made only while it is still positive. When the
# attempts are used up the failure is reported through `Util.error` and nil
# is returned. Only `Errno::ETIMEDOUT` and `Net::OpenTimeout` are handled;
# other errors propagate.
#
# @param url [String] page URL
# @param agent [Mechanize] HTTP client
# @param retries [Integer] number of attempts
# @return [Mechanize::Page, nil] page or nil
def retry_page(url, agent, retries = 3)
  attempts_left = retries
  begin
    sleep 1 # to avoid 429 error
    agent.get(url)
  rescue Errno::ETIMEDOUT, Net::OpenTimeout => e
    attempts_left -= 1
    retry if attempts_left.positive?
    Util.error "Failed to get page `#{url}`\n#{e.message}"
    nil
  end
end