Module: Dommy::Internal::UrlParser

Defined in:
lib/dommy/internal/url_parser.rb

Overview

A WHATWG URL Standard “basic URL parser” (url.spec.whatwg.org/).

Replaces the previous Ruby `URI`-based resolution, which diverged from the spec on empty userinfo, relative resolution against opaque/special bases, port-range validation, leading-colon inputs, and percent-encoding. Produces a `Record` (the spec’s “URL record”); `Dommy::URL` wraps it.

Defined Under Namespace

Classes: Failure, Record

Constant Summary collapse

SPECIAL =

scheme => default port (file has none).

{"ftp" => 21, "file" => nil, "http" => 80, "https" => 443, "ws" => 80, "wss" => 443}.freeze
FORBIDDEN_HOST =
host parsing =====
[0x00, 0x09, 0x0A, 0x0D, 0x20, 0x23, 0x2F, 0x3A, 0x3C, 0x3E,
0x3F, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C].freeze

Class Method Summary collapse

Class Method Details

.c0?(cp) ⇒ Boolean

percent-encode sets =====

Returns:

  • (Boolean)


41
# File 'lib/dommy/internal/url_parser.rb', line 41

# Membership test for the C0 control percent-encode set: true for every
# code point outside the printable ASCII range U+0020..U+007E.
#
# @param cp [Integer] code point to classify
# @return [Boolean]
def c0?(cp)
  !cp.between?(0x20, 0x7E)
end

.double_dot?(seg) ⇒ Boolean

Returns:

  • (Boolean)


306
307
308
# File 'lib/dommy/internal/url_parser.rb', line 306

# Tells whether a path segment is a double-dot segment: "..", or any variant
# in which either dot is written as "%2e" (compared case-insensitively).
#
# @param seg [String] path segment to inspect
# @return [Boolean]
def double_dot?(seg)
  case seg.downcase
  when "..", ".%2e", "%2e.", "%2e%2e" then true
  else false
  end
end

.ends_in_number?(input) ⇒ Boolean

Returns:

  • (Boolean)


118
119
120
121
122
123
124
125
126
127
128
# File 'lib/dommy/internal/url_parser.rb', line 118

# Decides whether the final dot-separated label of `input` looks numeric,
# which is what makes parse_host hand the whole host to the IPv4 parser.
#
# A single trailing empty label (i.e. a trailing ".") is ignored. The final
# label counts as numeric when it is all ASCII digits, or "0x"/"0X" followed
# by zero or more hex digits.
#
# @param input [String] host text
# @return [Boolean]
def ends_in_number?(input)
  labels = input.split(".", -1)
  labels.pop if labels.size > 1 && labels.last == ""

  final = labels.last
  return false if final.nil? || final.empty?

  final.match?(/\A[0-9]+\z/) || final.match?(/\A0[xX][0-9A-Fa-f]*\z/)
end

.forbidden_domain?(cp) ⇒ Boolean

Returns:

  • (Boolean)


79
# File 'lib/dommy/internal/url_parser.rb', line 79

# Membership test for forbidden domain code points: every forbidden host
# code point, plus C0 controls (<= U+001F), "%" (U+0025) and DEL (U+007F).
#
# @param cp [Integer] code point to classify
# @return [Boolean]
def forbidden_domain?(cp)
  return true if forbidden_host?(cp)

  cp <= 0x1F || cp == 0x25 || cp == 0x7F
end

.forbidden_host?(cp) ⇒ Boolean

Returns:

  • (Boolean)


78
# File 'lib/dommy/internal/url_parser.rb', line 78

# Membership test for forbidden host code points, as listed in FORBIDDEN_HOST.
#
# @param cp [Integer] code point to classify
# @return [Boolean]
def forbidden_host?(cp)
  FORBIDDEN_HOST.any? { |forbidden| forbidden == cp }
end

.fragment_set?(cp) ⇒ Boolean

Returns:

  • (Boolean)


42
# File 'lib/dommy/internal/url_parser.rb', line 42

# Membership test for the fragment percent-encode set: the C0 control set
# plus space, '"', "<", ">" and "`".
#
# @param cp [Integer] code point to classify
# @return [Boolean]
def fragment_set?(cp)
  return true if c0?(cp)

  case cp
  when 0x20, 0x22, 0x3C, 0x3E, 0x60 then true
  else false
  end
end

.normalized_windows_drive_letter?(seg) ⇒ Boolean

Returns:

  • (Boolean)


298
# File 'lib/dommy/internal/url_parser.rb', line 298

# Tells whether `seg` is a normalized Windows drive letter: exactly one ASCII
# letter followed by ":" (the "|" spelling is not accepted here).
#
# @param seg [String] path segment to inspect
# @return [Boolean]
def normalized_windows_drive_letter?(seg)
  return false unless seg.length == 2

  letter = seg[0]
  separator = seg[1]
  letter.match?(/[A-Za-z]/) && separator == ":"
end

.parse(input, base_input = nil) ⇒ Object



31
32
33
34
35
36
37
# File 'lib/dommy/internal/url_parser.rb', line 31

# Entry point: parses `input`, optionally resolving it against a base URL.
#
# A nil, false or empty-string base is treated as "no base". A base that is
# already a Record is used as-is; anything else is stringified and parsed
# first (without a base of its own).
#
# @param input [#to_s] the URL text to parse
# @param base_input [Record, #to_s, nil] optional base URL
# @return [Object] whatever `run` returns for the input (the parsed record)
def parse(input, base_input = nil)
  base_record =
    if !base_input || base_input == ""
      nil
    elsif base_input.is_a?(Record)
      base_input
    else
      run(base_input.to_s, nil)
    end

  run(input.to_s, base_record)
end

.parse_host(input, special) ⇒ Object

Raises:



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/dommy/internal/url_parser.rb', line 81

# Parses the host portion of a URL.
#
# * Input starting with "[" must end with "]" and is parsed as IPv6; the
#   result is returned re-wrapped in brackets.
# * For non-special URLs the input is handled by parse_opaque_host.
# * Otherwise the input is percent-decoded, decoded as UTF-8, run through
#   IDNA.to_ascii, checked for forbidden domain code points and, when its
#   last label looks numeric, handed to Ipv4Parser.
#
# @param input [String] raw host text
# @param special [Boolean] true when the URL has a special scheme
# @return [String, Object] the serialized host; for IPv4 this is whatever
#   Ipv4Parser.parse returns
# @raise [Failure] on an unclosed IPv6 literal, an empty host/domain, a
#   domain-to-ASCII error, a forbidden domain code point or an invalid IPv4
#   address
def parse_host(input, special)
  if input.start_with?("[")
    raise Failure, "unclosed IPv6 address" unless input.end_with?("]")

    return "[#{parse_ipv6(input[1...-1])}]"
  end
  return parse_opaque_host(input) unless special
  raise Failure, "empty host" if input.empty?

  # UTF-8-decode-without-BOM the percent-decoded bytes: each malformed
  # sequence must become U+FFFD (which domain-to-ASCII is then expected to
  # reject). Deleting the bad bytes instead would let a host such as
  # "exa%FFmple.com" silently collapse into "example.com".
  domain = percent_decode(input).force_encoding("UTF-8").scrub("\uFFFD")
  ascii =
    begin
      IDNA.to_ascii(domain, check_hyphens: false, verify_dns_length: false)
    rescue IDNA::Error, Punycode::Error => e
      raise Failure, "domain to ASCII: #{e.message}"
    end
  raise Failure, "empty domain" if ascii.empty?
  raise Failure, "forbidden domain code point" if ascii.each_char.any? { |ch| forbidden_domain?(ch.ord) }

  # A host whose last label is numeric must be a valid IPv4 address.
  if ends_in_number?(ascii)
    ip = Ipv4Parser.parse(ascii)
    raise Failure, "invalid IPv4 address" if ip.nil?

    return ip
  end
  ascii
end

.parse_ipv6(input) ⇒ Object

WHATWG IPv6 parser -> compressed serialized string (no brackets).



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# File 'lib/dommy/internal/url_parser.rb', line 131

# WHATWG IPv6 parser -> compressed serialized string (no brackets).
#
# Fills an 8-element array of 16-bit pieces from `input`, handling one "::"
# compression and an optional trailing dotted-decimal IPv4 part, then hands
# the array to serialize_ipv6.
#
# @param input [String] the address text without the surrounding brackets
# @return [String] the result of serialize_ipv6 for the parsed pieces
# @raise [Failure] on any malformed address (see the individual messages)
def parse_ipv6(input)
  address = [0, 0, 0, 0, 0, 0, 0, 0]
  piece_index = 0
  # Index of the piece at which "::" occurred, or nil when none was seen.
  compress = nil
  chars = input.chars
  ptr = 0
  # Character at index i, or nil once past the end of the input.
  c = ->(i) { i < chars.length ? chars[i] : nil }

  # A leading ":" is only valid as the start of "::".
  if c.call(ptr) == ":"
    raise Failure, "IPv6 starts with single colon" unless c.call(ptr + 1) == ":"

    ptr += 2
    piece_index += 1
    compress = piece_index
  end

  while c.call(ptr)
    raise Failure, "too many IPv6 pieces" if piece_index == 8

    # A ":" at the start of a piece is the second colon of "::".
    if c.call(ptr) == ":"
      raise Failure, "multiple IPv6 compressions" unless compress.nil?

      ptr += 1
      piece_index += 1
      compress = piece_index
      next
    end

    # Read up to four hex digits into `value`.
    value = 0
    length = 0
    while length < 4 && c.call(ptr)&.match?(/[0-9A-Fa-f]/)
      value = value * 16 + c.call(ptr).to_i(16)
      ptr += 1
      length += 1
    end

    if c.call(ptr) == "."
      # The digits just read belong to an embedded IPv4 address: rewind and
      # re-read them as decimal.
      raise Failure, "IPv4-in-IPv6 with no digits" if length.zero?

      ptr -= length
      # The IPv4 part fills two pieces, so at most six may precede it.
      raise Failure, "too few pieces for embedded IPv4" if piece_index > 6

      numbers_seen = 0
      while c.call(ptr)
        ipv4_piece = nil
        # Every number after the first must be preceded by ".".
        if numbers_seen.positive?
          if c.call(ptr) == "." && numbers_seen < 4
            ptr += 1
          else
            raise Failure, "invalid embedded IPv4"
          end
        end
        raise Failure, "invalid embedded IPv4 digit" unless c.call(ptr)&.match?(/[0-9]/)

        while c.call(ptr)&.match?(/[0-9]/)
          number = c.call(ptr).to_i
          if ipv4_piece.nil?
            ipv4_piece = number
          elsif ipv4_piece.zero?
            raise Failure, "leading zero in embedded IPv4"
          else
            ipv4_piece = ipv4_piece * 10 + number
          end
          raise Failure, "embedded IPv4 piece > 255" if ipv4_piece > 255

          ptr += 1
        end
        # Two decimal numbers are packed into each 16-bit piece.
        address[piece_index] = address[piece_index] * 0x100 + ipv4_piece
        numbers_seen += 1
        piece_index += 1 if numbers_seen == 2 || numbers_seen == 4
      end
      raise Failure, "incomplete embedded IPv4" unless numbers_seen == 4

      break
    elsif c.call(ptr) == ":"
      ptr += 1
      raise Failure, "trailing colon in IPv6" if c.call(ptr).nil?
    elsif c.call(ptr)
      raise Failure, "invalid IPv6 code point"
    end

    address[piece_index] = value
    piece_index += 1
  end

  if compress
    # Move the pieces parsed after "::" to the end of the array, leaving the
    # zero-filled gap where the compression occurred.
    swaps = piece_index - compress
    piece_index = 7
    while piece_index != 0 && swaps.positive?
      address[piece_index], address[compress + swaps - 1] = address[compress + swaps - 1], address[piece_index]
      piece_index -= 1
      swaps -= 1
    end
  elsif piece_index != 8
    raise Failure, "too few IPv6 pieces"
  end

  serialize_ipv6(address)
end

.parse_opaque_host(input) ⇒ Object

Raises:



112
113
114
115
116
# File 'lib/dommy/internal/url_parser.rb', line 112

# Parses the host of a non-special URL: rejects forbidden host code points,
# then percent-encodes every character in the C0 control set.
#
# @param input [String] raw host text
# @return [String] the percent-encoded opaque host
# @raise [Failure] when the input contains a forbidden host code point
def parse_opaque_host(input)
  input.each_char do |ch|
    raise Failure, "forbidden host code point" if forbidden_host?(ch.ord)
  end

  c0_set = method(:c0?)
  input.each_char.map { |ch| pe(ch, c0_set) }.join
end

.path_set?(cp) ⇒ Boolean

Returns:

  • (Boolean)


45
# File 'lib/dommy/internal/url_parser.rb', line 45

# Membership test for the path percent-encode set: the query set plus
# "?", "^", "`", "{" and "}".
#
# @param cp [Integer] code point to classify
# @return [Boolean]
def path_set?(cp)
  return true if query_set?(cp)

  case cp
  when 0x3F, 0x5E, 0x60, 0x7B, 0x7D then true
  else false
  end
end

.pe(char, set) ⇒ Object

UTF-8 percent-encode a single code point against `set` (a predicate).



49
50
51
52
53
# File 'lib/dommy/internal/url_parser.rb', line 49

# UTF-8 percent-encodes one character when the predicate `set` accepts its
# code point; otherwise returns the character unchanged.
#
# @param char [String] a single character
# @param set [#call] predicate taking a code point (Integer)
# @return [String] `char` itself, or one "%XX" triplet (uppercase hex) per byte
def pe(char, set)
  if set.call(char.ord)
    char.bytes.map { |byte| format("%%%02X", byte) }.join
  else
    char
  end
end

.percent_decode(str) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/dommy/internal/url_parser.rb', line 55

# Percent-decodes `str` byte-wise: every "%" followed by two hex digits is
# replaced by the byte it denotes; any other byte (including a "%" that is
# not followed by two hex digits) is copied through unchanged.
#
# @param str [String] text to decode
# @return [String] a new binary (ASCII-8BIT) string
def percent_decode(str)
  str.b.gsub(/%[0-9A-Fa-f]{2}/) { |triplet| triplet.byteslice(1, 2).hex.chr }
end

.query_set?(cp) ⇒ Boolean

Returns:

  • (Boolean)


43
# File 'lib/dommy/internal/url_parser.rb', line 43

# Membership test for the query percent-encode set: the C0 control set plus
# space, '"', "#", "<" and ">".
#
# @param cp [Integer] code point to classify
# @return [Boolean]
def query_set?(cp)
  return true if c0?(cp)

  case cp
  when 0x20, 0x22, 0x23, 0x3C, 0x3E then true
  else false
  end
end

.run(input, base) ⇒ Object

the basic URL parser state machine =====


320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
# File 'lib/dommy/internal/url_parser.rb', line 320

# Runs the basic URL parser state machine over `input`.
#
# The input is first stripped of leading/trailing C0 controls and spaces and
# of all ASCII tabs/newlines, then consumed one character at a time. `ptr`
# indexes into `chars`; at the bottom of each iteration the loop exits once
# `ptr >= len`, otherwise `ptr` advances by one. A state that executes `next`
# skips that bottom section, so the same character is read again in the new
# state. States that rewind `ptr` rely on the increment that follows.
#
# @param input [String] the URL text to parse
# @param base [Record, nil] an already-parsed base URL, or nil for none
# @return [Record] the populated URL record
# @raise [Failure] when the input cannot be parsed (missing scheme, invalid
#   host or port, ...)
def run(input, base)
  input = input.dup
  # Strip leading/trailing C0 controls and spaces, then remove all
  # ASCII tab/newline.
  input = input.sub(/\A[\x00-\x20]+/, "").sub(/[\x00-\x20]+\z/, "")
  input = input.gsub(/[\t\n\r]/, "")

  chars = input.chars
  len = chars.length
  state = :scheme_start
  # Positional fields as used below: scheme, username, password, host, port,
  # path, query, fragment.
  url = Record.new("", "", "", nil, nil, [], nil, nil)
  buffer = +""
  at_sign_seen = false
  inside_brackets = false
  password_token_seen = false
  ptr = 0

  # Current character, or nil once the pointer is at/after the end (EOF).
  cp = lambda { ptr < len ? chars[ptr] : nil }

  loop do
    c = cp.call

    case state
    when :scheme_start
      if c&.match?(/[A-Za-z]/)
        buffer << c.downcase
        state = :scheme
      else
        state = :no_scheme
        next # reprocess (do not advance)
      end

    when :scheme
      if c&.match?(/[A-Za-z0-9+\-.]/)
        buffer << c.downcase
      elsif c == ":"
        # The buffer is a complete scheme; pick the next state from it.
        url.scheme = buffer
        buffer = +""
        if url.scheme == "file"
          state = :file
        elsif url.special? && base && base.scheme == url.scheme
          state = :special_relative_or_authority
        elsif url.special?
          state = :special_authority_slashes
        elsif input[(ptr + 1)..].to_s.start_with?("/")
          state = :path_or_authority
          ptr += 1
        else
          # Non-special scheme without a leading "/": the path is opaque and
          # is stored as a String rather than an Array of segments.
          url.path = ""
          state = :opaque_path
        end
      else
        # Not a scheme after all: discard it and re-read the whole input.
        buffer = +""
        state = :no_scheme
        ptr = -1 # restart from 0 (advance makes it 0)
      end

    when :no_scheme
      raise Failure, "missing scheme" if base.nil? || (base.opaque_path? && c != "#")

      if base.opaque_path? && c == "#"
        # Fragment-only reference against an opaque-path base.
        url.scheme = base.scheme
        url.path = base.path
        url.query = base.query
        url.fragment = +""
        state = :fragment
      elsif base.scheme != "file"
        state = :relative
        next
      else
        state = :file
        next
      end

    when :special_relative_or_authority
      if c == "/" && input[(ptr + 1)..].to_s.start_with?("/")
        state = :special_authority_ignore_slashes
        ptr += 1
      else
        state = :relative
        next
      end

    when :path_or_authority
      if c == "/"
        state = :authority
      else
        state = :path
        next
      end

    when :relative
      url.scheme = base.scheme
      if c == "/"
        state = :relative_slash
      elsif url.special? && c == "\\"
        state = :relative_slash
      else
        # Inherit everything from the base, then override per the input.
        url.username = base.username
        url.password = base.password
        url.host = base.host
        url.port = base.port
        url.path = base.path.dup
        url.query = base.query
        if c == "?"
          url.query = +""
          state = :query
        elsif c == "#"
          url.fragment = +""
          state = :fragment
        elsif c
          url.query = nil
          shorten_path(url)
          state = :path
          next
        end
      end

    when :relative_slash
      if url.special? && (c == "/" || c == "\\")
        state = :special_authority_ignore_slashes
      elsif c == "/"
        state = :authority
      else
        url.username = base.username
        url.password = base.password
        url.host = base.host
        url.port = base.port
        state = :path
        next
      end

    when :special_authority_slashes
      if c == "/" && input[(ptr + 1)..].to_s.start_with?("/")
        state = :special_authority_ignore_slashes
        ptr += 1
      else
        state = :special_authority_ignore_slashes
        next
      end

    when :special_authority_ignore_slashes
      # Any further "/" or "\" is skipped; the first other character (or EOF)
      # starts the authority.
      if c != "/" && c != "\\"
        state = :authority
        next
      end

    when :authority
      if c == "@"
        # Everything buffered so far is userinfo. An earlier "@" was part of
        # the credentials and is re-added in its percent-encoded form.
        buffer = "%40#{buffer}" if at_sign_seen
        at_sign_seen = true
        buffer.each_char do |ch|
          # The first ":" separates username from password and is dropped.
          if ch == ":" && !password_token_seen
            password_token_seen = true
            next
          end
          encoded = pe(ch, method(:userinfo_set?))
          if password_token_seen
            url.password += encoded
          else
            url.username += encoded
          end
        end
        buffer = +""
      elsif c.nil? || ["/", "?", "#"].include?(c) || (url.special? && c == "\\")
        raise Failure, "empty host with credentials" if at_sign_seen && buffer.empty?

        # Rewind to the start of the buffered text so the host state re-reads it.
        ptr -= (buffer.length + 1)
        buffer = +""
        state = :host
      else
        buffer << c
      end

    when :host, :hostname
      if c == ":" && !inside_brackets
        raise Failure, "empty host" if buffer.empty?

        url.host = parse_host(buffer, url.special?)
        buffer = +""
        state = :port
      elsif c.nil? || ["/", "?", "#"].include?(c) || (url.special? && c == "\\")
        # Step back so the delimiter is read again by the path-start state.
        ptr -= 1
        raise Failure, "empty special host" if url.special? && buffer.empty?

        url.host = parse_host(buffer, url.special?)
        buffer = +""
        state = :path_start
      else
        # Colons inside "[...]" belong to an IPv6 literal, not to a port.
        inside_brackets = true if c == "["
        inside_brackets = false if c == "]"
        buffer << c
      end

    when :port
      if c&.match?(/[0-9]/)
        buffer << c
      elsif c.nil? || ["/", "?", "#"].include?(c) || (url.special? && c == "\\")
        unless buffer.empty?
          port = buffer.to_i
          raise Failure, "port out of range" if port > 65_535

          # The scheme's default port is stored as nil.
          url.port = (port == url.default_port ? nil : port)
          buffer = +""
        end
        state = :path_start
        next
      else
        raise Failure, "invalid port"
      end

    when :file
      url.scheme = "file"
      url.host = ""
      if c == "/" || c == "\\"
        state = :file_slash
      elsif base && base.scheme == "file"
        url.host = base.host
        url.path = base.path.dup
        url.query = base.query
        if c == "?"
          url.query = +""
          state = :query
        elsif c == "#"
          url.fragment = +""
          state = :fragment
        elsif c
          url.query = nil
          # A drive letter in the input replaces the base path entirely;
          # otherwise only the base's last segment is dropped.
          shorten_path(url) unless starts_with_windows_drive_letter?(input[ptr..].to_s)
          url.path = [] if starts_with_windows_drive_letter?(input[ptr..].to_s)
          state = :path
          next
        end
      else
        state = :path
        next
      end

    when :file_slash
      if c == "/" || c == "\\"
        state = :file_host
      else
        if base && base.scheme == "file"
          url.host = base.host
          # Keep the base's drive letter unless the input supplies its own.
          if !starts_with_windows_drive_letter?(input[ptr..].to_s) &&
              base.path[0] && normalized_windows_drive_letter?(base.path[0])
            url.path << base.path[0]
          end
        end
        state = :path
        next
      end

    when :file_host
      if c.nil? || ["/", "\\", "?", "#"].include?(c)
        ptr -= 1
        if buffer.match?(/\A[A-Za-z][:|]\z/)
          # A drive letter in host position is really the first path segment;
          # the buffer is deliberately kept for the path state.
          state = :path
        elsif buffer.empty?
          url.host = ""
          state = :path_start
        else
          host = parse_host(buffer, true)
          host = "" if host == "localhost"
          url.host = host
          buffer = +""
          state = :path_start
        end
      else
        buffer << c
      end

    when :path_start
      if url.special?
        state = :path
        # A leading slash is consumed here; anything else is re-read as path.
        next unless c == "/" || c == "\\"
      elsif c == "?"
        url.query = +""
        state = :query
      elsif c == "#"
        url.fragment = +""
        state = :fragment
      elsif c
        state = :path
        next unless c == "/"
      end

    when :path
      if c.nil? || c == "/" || (url.special? && c == "\\") ||
          c == "?" || c == "#"
        # End of a segment: apply dot-segment handling, then store it.
        if double_dot?(buffer)
          shorten_path(url)
          url.path << "" unless c == "/" || (url.special? && c == "\\")
        elsif single_dot?(buffer)
          url.path << "" unless c == "/" || (url.special? && c == "\\")
        else
          # Normalize a leading file drive letter "C|" to "C:".
          if url.scheme == "file" && url.path.empty? && windows_drive_letter?(buffer)
            buffer[1] = ":"
          end
          url.path << buffer
        end
        buffer = +""
        if c == "?"
          url.query = +""
          state = :query
        elsif c == "#"
          url.fragment = +""
          state = :fragment
        end
      else
        buffer << pe(c, method(:path_set?))
      end

    when :opaque_path
      if c == "?"
        url.query = +""
        state = :query
      elsif c == "#"
        url.fragment = +""
        state = :fragment
      elsif c == " "
        # A space is only percent-encoded when it abuts the end of
        # the opaque path (a following `?`/`#`); an interior space
        # stays literal. (Trailing-at-EOF spaces are already gone
        # via the leading/trailing strip.)
        nxt = chars[ptr + 1]
        url.path += (nxt == "?" || nxt == "#") ? "%20" : " "
      elsif c
        url.path += pe(c, method(:c0?))
      end

    when :query
      if c.nil? || c == "#"
        # The raw query is buffered and encoded in one go when it ends;
        # special schemes use a slightly larger encode set.
        set = url.special? ? method(:special_query_set?) : method(:query_set?)
        url.query += buffer.each_char.map { |ch| pe(ch, set) }.join
        buffer = +""
        if c == "#"
          url.fragment = +""
          state = :fragment
        end
      else
        buffer << c
      end

    when :fragment
      url.fragment += pe(c, method(:fragment_set?)) if c
    end

    # Stop once EOF has been processed; otherwise advance to the next character.
    break if ptr >= len

    ptr += 1
  end

  url
end

.serialize(record, exclude_fragment: false) ⇒ Object



274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
# File 'lib/dommy/internal/url_parser.rb', line 274

# Serializes a URL record to its string form.
#
# @param record [Record] the URL record to serialize
# @param exclude_fragment [Boolean] when true the "#fragment" part is omitted
# @return [String] a new string
def serialize(record, exclude_fragment: false)
  host = record.host
  prefix = +""

  if host
    prefix << "//"
    if record.includes_credentials?
      secret = record.password
      prefix << record.username
      prefix << ":#{secret}" unless secret.empty?
      prefix << "@"
    end
    prefix << host
    port = record.port
    prefix << ":#{port}" if port
  elsif !record.opaque_path? && record.path.is_a?(Array) &&
      record.path.length > 1 && record.path[0] == ""
    # Without a host, a path whose first segment is empty would serialize
    # with a leading "//" and read back as an authority; "/." prevents that.
    prefix << "/."
  end

  result = +"#{record.scheme}:"
  result << prefix
  result << serialize_path(record)

  query = record.query
  result << "?#{query}" if query

  fragment = record.fragment
  result << "##{fragment}" if fragment && !exclude_fragment

  result
end

.serialize_ipv6(pieces) ⇒ Object



231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# File 'lib/dommy/internal/url_parser.rb', line 231

# Serializes eight 16-bit pieces as lowercase hex separated by ":", replacing
# the first longest run of two or more zero pieces with "::".
#
# @param pieces [Array<Integer>] the eight address pieces
# @return [String] the serialized address, without brackets
def serialize_ipv6(pieces)
  # Single pass over indices 0..8; index 8 acts as a sentinel that closes a
  # zero run reaching the end of the address.
  longest_at = nil
  longest_size = 0
  run_at = nil
  0.upto(8) do |idx|
    if idx < 8 && pieces[idx].zero?
      run_at ||= idx
      next
    end
    next if run_at.nil?

    size = idx - run_at
    # Strict comparison keeps the first of several equally long runs.
    if size > longest_size
      longest_at = run_at
      longest_size = size
    end
    run_at = nil
  end
  longest_at = nil if longest_size < 2

  hex = pieces.first(8).map { |piece| piece.to_s(16) }
  text = +""
  if longest_at
    text << hex[0...longest_at].join(":")
    text << "::"
    text << hex[(longest_at + longest_size)..].join(":")
  else
    text << hex.join(":")
  end
  text
end

.serialize_path(record) ⇒ Object

serialization =====


268
269
270
271
272
# File 'lib/dommy/internal/url_parser.rb', line 268

# Serializes a record's path: an opaque path is returned as stored, while a
# segment list becomes "/" + segment for each segment, concatenated.
#
# @param record [Record] the URL record whose path is serialized
# @return [String]
def serialize_path(record)
  if record.opaque_path?
    record.path
  else
    record.path.map { |segment| "/#{segment}" }.join
  end
end

.shorten_path(record) ⇒ Object



310
311
312
313
314
315
316
# File 'lib/dommy/internal/url_parser.rb', line 310

# Removes the last segment of the record's path in place, if there is one.
# The sole segment of a file URL is kept when it is a normalized Windows
# drive letter.
#
# @param record [Record] the URL record whose path Array is mutated
# @return [String, nil] the removed segment, or nil when nothing was removed
def shorten_path(record)
  segments = record.path
  return if segments.empty?

  lone_drive = record.scheme == "file" && segments.length == 1 &&
    normalized_windows_drive_letter?(segments[0])
  segments.pop unless lone_drive
end

.single_dot?(seg) ⇒ Boolean

Returns:

  • (Boolean)


304
# File 'lib/dommy/internal/url_parser.rb', line 304

# Tells whether a path segment is a single-dot segment: "." or "%2e"
# (compared case-insensitively).
#
# @param seg [String] path segment to inspect
# @return [Boolean]
def single_dot?(seg)
  case seg.downcase
  when ".", "%2e" then true
  else false
  end
end

.special_query_set?(cp) ⇒ Boolean

Returns:

  • (Boolean)


44
# File 'lib/dommy/internal/url_parser.rb', line 44

# Membership test for the special-query percent-encode set: the query set
# plus "'" (U+0027).
#
# @param cp [Integer] code point to classify
# @return [Boolean]
def special_query_set?(cp)
  return true if query_set?(cp)

  cp == 0x27
end

.starts_with_windows_drive_letter?(s) ⇒ Boolean

Returns:

  • (Boolean)


299
300
301
302
# File 'lib/dommy/internal/url_parser.rb', line 299

# Tells whether `s` starts with a Windows drive letter: an ASCII letter, then
# ":" or "|", and then either the end of the string or one of "/", "\", "?"
# or "#".
#
# @param s [String] text to inspect
# @return [Boolean]
def starts_with_windows_drive_letter?(s)
  return false if s.length < 2

  letter = s[0]
  separator = s[1]
  return false unless letter.match?(/[A-Za-z]/)
  return false unless separator == ":" || separator == "|"
  return true if s.length == 2

  ["/", "\\", "?", "#"].include?(s[2])
end

.userinfo_set?(cp) ⇒ Boolean

Returns:

  • (Boolean)


46
# File 'lib/dommy/internal/url_parser.rb', line 46

# Membership test for the userinfo percent-encode set: the path set plus
# "/", ":", ";", "=", "@", "[", "\", "]" and "|".
#
# @param cp [Integer] code point to classify
# @return [Boolean]
def userinfo_set?(cp)
  return true if path_set?(cp)

  case cp
  when 0x2F, 0x3A, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x7C then true
  else false
  end
end

.windows_drive_letter?(seg) ⇒ Boolean

helpers for path normalization =====

Returns:

  • (Boolean)


297
# File 'lib/dommy/internal/url_parser.rb', line 297

# Tells whether `seg` is a Windows drive letter: exactly one ASCII letter
# followed by ":" or "|".
#
# @param seg [String] path segment to inspect
# @return [Boolean]
def windows_drive_letter?(seg)
  return false unless seg.length == 2

  letter = seg[0]
  separator = seg[1]
  letter.match?(/[A-Za-z]/) && (separator == ":" || separator == "|")
end