Module: Pubid::Iso::Utilities

Defined in:
lib/pubid/iso/utilities.rb

Overview

Utility methods for ISO identifiers

Class Method Summary collapse

Class Method Details

.parse_from_title(title) ⇒ Identifier?

Parse identifier from document title

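This method attempts to extract an ISO identifier from a document title by trying a series of heuristics in order: an identifier at the start of the title, an ISO/IEC identifier embedded anywhere in the title, and finally candidates built from the title's words and separator-delimited parts. The first candidate that parses successfully is returned.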

Examples:

Extract identifier from title

parse_from_title("ISO 9001:2015 Quality management systems")
=> #<Pubid::Iso::Identifiers::InternationalStandard>

Parameters:

  • title (String)

    Document title

Returns:

  • (Identifier, nil)

    Parsed identifier, or nil if not found



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/pubid/iso/utilities.rb', line 22

# Parse identifier from document title.
#
# Builds an ordered list of candidate substrings from the title and returns
# the result of the first one that Pubid::Iso.parse accepts. Candidates are
# generated, in priority order, from:
#
# 1. an identifier-shaped match at the start of the title,
# 2. every ISO/IEC identifier-shaped match anywhere in the title,
# 3. the last 2..8 words of the title (shortest first),
# 4. the first 8..1 words of the title (longest first), i.e. the title with
#    trailing words progressively removed,
# 5. the segments between title separators (" — ", " – ", " - ", ": "),
#    last segment first.
#
# @example Extract identifier from title
#   parse_from_title("ISO 9001:2015 Quality management systems")
#
# @param title [String, nil] Document title
# @return [Identifier, nil] Parsed identifier, or nil if no candidate parses
def parse_from_title(title)
  return nil if title.nil?

  # Surrounding whitespace would otherwise defeat the start-of-title match.
  text = title.strip
  return nil if text.empty?

  candidates = []

  # 1: Title starts with identifier (e.g., "ISO 9001:2015 - Quality...")
  leading = text.match(/^([A-Z]{2,4}(?:\/[A-Z]{2,4})?\s+\d+(?:-\d+)?(?::\d{4})?(?:\/[A-Z]+\s+\d+(?::\d{4})?)?)/i)
  candidates << leading[1] if leading

  # 2: Identifier within title (e.g., "Document on ISO 9001 requirements")
  # Common ISO identifier pattern: ISO/IEC 1234-1:2015
  identifier_pattern = /
    \b
    (?:ISO|IEC)
    (?:\/[A-Z]{2,4})?  # Optional copublisher
    \s+
    \d+(?:-\d+)?       # Number and optional part
    (?::\d{4})?        # Optional year
    (?:\/[A-Z]+\s+\d+(?::\d{4})?)?  # Optional supplement
    \b
  /ix
  # The pattern has no capture groups, so scan yields the matched strings.
  candidates.concat(text.scan(identifier_pattern))

  words = text.split(/\s+/)
  max_words = [words.length, 8].min

  # 3: Identifier at the end of the title: last 2, 3, ... words.
  (2..max_words).each { |count| candidates << words.last(count).join(" ") }

  # 4: Identifier at the start that the regexes above cannot express
  # (e.g. "ISO/IEC TR 29110-1:2016 Software engineering"): drop trailing
  # words one at a time, longest prefix first so the most complete
  # identifier wins.
  max_words.downto(1) { |count| candidates << words.first(count).join(" ") }

  # 5: Segments between title separators. Only dashes surrounded by
  # whitespace and colons followed by whitespace count as separators, so
  # part numbers ("29110-1") and years (":2015") stay attached.
  text.split(/\s+[—–-]\s+|:\s+/).reverse_each do |segment|
    candidates << segment if segment.strip.length >= 5 # Skip very short parts
  end

  # Parsing is the expensive step, so never parse the same string twice.
  candidates.map(&:strip).reject(&:empty?).uniq.each do |candidate|
    return Pubid::Iso.parse(candidate)
  rescue Parslet::ParseFailed
    next
  end

  nil
end