Module: UrlScrubber

Defined in:
lib/url_scrubber.rb,
lib/url_scrubber/version.rb

Constant Summary collapse

VERSION =
"0.8.27"

Class Method Summary collapse

Class Method Details

.check_for_facebook_redirection(uri_str, limit = 5) ⇒ Object



445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
# File 'lib/url_scrubber.rb', line 445

def self.check_for_facebook_redirection(uri_str, limit = 5)
  # Follows HTTP redirects (intended for Facebook URLs) and returns a
  # two-element array [final_url, http_response].
  # Redirects to login pages are reported as a synthetic 401 response and
  # known "soft failure" redirect targets as a synthetic 404.
  #
  # @param uri_str [String] the URL to check
  # @param limit [Integer] maximum number of redirects to follow
  # @raise [RuntimeError] when the redirect limit is exhausted

  # BUG FIX: this array had no name in the original source (" = [")
  # and was referenced later by a bare ".any?" call.
  login_patterns = [
    # pages that require user logins
    %r{^.*/login[^/]*$}
  ]

  failure_patterns = [
    # pages that give 200 codes but actually indicate a not found
    %r{linkedin\.com/home\?report%2Efailure}i
  ]

  raise 'Too many HTTP redirects' if limit <= 0

  # Facebook expects https and a www. host; normalize before requesting.
  uri_str_new = uri_str.sub('http://', 'https://')
  uri_str_new = uri_str_new.sub('https://', 'https://www.') unless uri_str_new.include?("https://www.")

  begin
    # URI::DEFAULT_PARSER.escape works on both Ruby 2 and 3
    # (URI.escape was removed in Ruby 3.0).
    url = URI.parse(URI::DEFAULT_PARSER.escape(uri_str_new))
  rescue URI::InvalidURIError => e
    return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}")]
  end

  # BUG FIX: the original constructed the Net::HTTP object twice.
  http = Net::HTTP.new(url.host, url.port)
  http.open_timeout = 7 # only wait up to 7 seconds for the connection to be established
  http.read_timeout = 10 # and up to 10 seconds for a response
  if url.port == 443
    http.use_ssl = true
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  else
    http.use_ssl = false
  end
  request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })

  begin
    response = http.request(request)
  rescue Timeout::Error
    # Treat a timeout as an unreachable host.
    failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
    return [uri_str_new, failure_response]
  rescue StandardError
    # Any other network-level failure is reported as a 404.
    # (Was "rescue Exception", which would also swallow SignalException.)
    failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
    return [uri_str_new, failure_response]
  end

  if response.is_a? Net::HTTPRedirection
    location = response['location']
    if location[0, 4] == "http"
      if failure_patterns.any? { |pattern| location.match(pattern) }
        # got redirected to a page indicating failure, so act like it's a 404
        failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
        return [uri_str_new, failure_response]
      end

      # BUG FIX: the original tested "redirected_url" here before it was
      # ever assigned; the redirect target is what must be inspected.
      if login_patterns.any? { |pattern| location.match(pattern) }
        # got redirected to a login page. return a 401, but the previous url
        failure_response = Net::HTTPClientError.new('1.1', '401', 'Inaccessible')
        return [uri_str_new, failure_response]
      end

      redirected_url, base_response = check_for_facebook_redirection(location, limit - 1)
      [redirected_url, base_response]
    else
      # Relative redirect: resolve against the current host, then follow.
      redir_url = "http://#{url.host}#{location}"
      redirected_url, base_response = check_for_facebook_redirection(redir_url, limit - 1)
      [redirected_url, base_response]
    end
  else
    [uri_str_new, response]
  end
end

.downcase_domain(url) ⇒ Object



185
186
187
188
189
190
191
192
# File 'lib/url_scrubber.rb', line 185

def self.downcase_domain(url)
  # Lowercase the scheme+host portion of an http:// URL, leaving the
  # path untouched. URLs without an "http://" prefix pass through as-is.
  m = url.match(%r{http://[^/]+}i)
  return url unless m

  m[0].downcase + m.post_match
end

.drop_anchor!(url) ⇒ Object



244
245
246
247
248
# File 'lib/url_scrubber.rb', line 244

def self.drop_anchor!(url)
  # Remove an in-page anchor ("#fragment") from url, mutating it in
  # place, and return the (possibly unchanged) string.
  url.tap { |u| u.sub!(/#.*$/, '') }
end

.drop_url_ampersand!(url) ⇒ Object



232
233
234
235
# File 'lib/url_scrubber.rb', line 232

def self.drop_url_ampersand!(url)
  # Truncate url at the first "&" (in place) and return it.
  url.tap { |u| u.sub!(/\&.*$/, '') }
end

.drop_url_query!(url) ⇒ Object



238
239
240
241
# File 'lib/url_scrubber.rb', line 238

def self.drop_url_query!(url)
  # Truncate url at the first "?" (in place) and return it.
  url.tap { |u| u.sub!(/\?.*$/, '') }
end

.find_identity_from_url(url) ⇒ Object



126
127
128
129
130
# File 'lib/url_scrubber.rb', line 126

def self.find_identity_from_url(url)
  # Scrub url and return its last path segment as the identity, or nil
  # when url is blank or scrubbing yields nothing.
  return nil unless url.present?

  scrubbed = UrlScrubber.scrub(url)
  scrubbed ? scrubbed.split("/").last : nil
end

.find_linkedin_identity_from_url(url) ⇒ Object



133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/url_scrubber.rb', line 133

def self.find_linkedin_identity_from_url(url)
  # Extract the LinkedIn identity (company name, member slug, /pub/
  # segment, or group id) from a LinkedIn URL. Returns nil when url is
  # nil, cannot be scrubbed, or matches no known LinkedIn shape.
  return nil if url.nil?

  scrubbed_url = scrub(url)
  # BUG FIX: the original only nil-guarded the first branches; when
  # scrub returned nil the later "scrubbed_url.include?" branches raised
  # NoMethodError. Guard once, up front.
  return nil if scrubbed_url.nil?

  if linkedin_company_url?(scrubbed_url)
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('http://linkedin.com/in/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('http://linkedin.com/pub/')
    # /pub/ identities are the first path segment after the prefix.
    id_partition = scrubbed_url.partition('linkedin.com/pub/')
    id_partition[2] && id_partition[2] != "" ? drop_url_ampersand!(id_partition[2].split('/').first) : nil
  elsif scrubbed_url.include?('linkedin.com/groups/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('linkedin.com/groups?gid=')
    id_partition = scrubbed_url.partition('linkedin.com/groups?gid=')
    drop_url_ampersand!(id_partition[2])
  end
end

.ideal_form?(url) ⇒ Boolean

Returns:

  • (Boolean)


72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/url_scrubber.rb', line 72

def self.ideal_form?(url)
  # True when url (after scrubbing) is already in the canonical "ideal"
  # shape for its social service. URLs for unrecognized services are
  # always considered ideal; unscrubabble URLs are never ideal.
  #
  # @return [Boolean]
  url = scrub(url)
  return false unless url

  case service_of(url)
  when :vkontakte
    !!url.match(%r{^http://vk\.com/[\w_]+$})
  when :weibo
    !!url.match(%r{^http://weibo\.com/[\w_-]+$})
  when :youtube
    !!url.match(%r{^http://youtube\.com/[\w_-]+$})
  when :twitter
    !!url.match(%r{^http://twitter\.com/[\w_]+$})
  when :facebook
    # BUG FIX: "php?" left "?" acting as a regex quantifier (optional
    # "p"), so "profile.php?id=N" URLs never matched. Escaped to a
    # literal "?", consistent with the linkedin "view\?id=" pattern.
    !!url.match(%r{^http://facebook\.com/(profile\.php\?id=\d+|[\w_\.-]+)$}) || !!url.match(%r{^http://facebook\.com/groups/[\w_\.-]+$})
  when :linkedin
    !!url.match(%r{^http://linkedin\.com/pub/[\w-]+/[\w]+/[\w]+/[\w]+$}) || !!url.match(%r{^http://linkedin\.com/in/[\w_-]+$}) || !!url.match(%r{^http://linkedin\.com/(company/[\w_-]+|profile/view\?id=\d+)$}) || !!url.match(%r{^http://linkedin\.com/(groups\?gid=[0-9]+)$}) || !!url.match(%r{^http://linkedin\.com/(groups/[\w_-]+)$})
  when :google
    !!url.match(%r{^http://plus\.google\.com/(\+[\w_-]+|\d+)$}) || !!url.match(%r{^http://plus\.google\.com/communities/\d+$})
  when :slideshare
    !!url.match(%r{^http://slideshare\.net/[\w_-]+$})
  when :flickr
    !!url.match(%r{^http://flickr\.com/[\w_\@-]+$}) || !!url.match(%r{^http://flickr\.com/groups/[\w_\@\.-]+$})
  when :pinterest
    !!url.match(%r{^http://pinterest\.com/[\w_-]+$})
  when :yelp
    !!url.match(%r{^http://yelp\.com/[\w_-]+$})
  when :vimeo
    # A bare numeric path is a video page, not a profile, so exclude it.
    (!!url.match(%r{^http://vimeo\.com/[\w_-]+$}) && !url.match(%r{/\d+$})) || !!url.match(%r{^http://vimeo\.com/groups/[\w_\.-]+$})
  when :instagram
    !!url.match(%r{^http://instagram\.com/[\w_]+$})
  when :tumblr
    # Tumblr profiles live on a subdomain; a "www." host is not ideal.
    !!url.match(%r{^http://[\w_]+\.tumblr\.com$}) && !url.index("://www.")
  else
    true
  end
end

.linkedin_company_url?(url) ⇒ Boolean

Returns:

  • (Boolean)


112
113
114
115
116
# File 'lib/url_scrubber.rb', line 112

def self.linkedin_company_url?(url)
  # True when the scrubbed form of url is a LinkedIn company page.
  #
  # @return [Boolean]
  scrubbed = scrub(url)
  !!(scrubbed && scrubbed.include?('http://linkedin.com/company/'))
end

.linkedin_personal_url?(url) ⇒ Boolean

Returns:

  • (Boolean)


119
120
121
122
123
# File 'lib/url_scrubber.rb', line 119

def self.linkedin_personal_url?(url)
  # True when the scrubbed form of url is a LinkedIn personal profile
  # (either the /in/ or the legacy /pub/ form).
  #
  # @return [Boolean]
  scrubbed = scrub(url)
  return false unless scrubbed

  ['http://linkedin.com/in/', 'http://linkedin.com/pub/'].any? { |prefix| scrubbed.include?(prefix) }
end

.maps_to_public_url(url) ⇒ Object



165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/url_scrubber.rb', line 165

def self.maps_to_public_url(url)
  # Translate a "business manager" URL (business.facebook.com or
  # business.google.com) into its public-facing equivalent; returns nil
  # for any other URL.
  # NOTE(review): URI.escape was removed in Ruby 3.0 — confirm the
  # supported Ruby version. Domainatrix and String#exclude?
  # (ActiveSupport) come from gems outside this file.
  scrubbed = scrub(url)
  parsed = URI.parse(URI.escape(url)) or return nil
  host = Domainatrix.parse(parsed.host)
  if host.domain == "facebook" && host.subdomain == "business" && scrubbed.exclude?("/settings/business_info")
    public_url = scrubbed.sub("http://business.facebook.com", "http://facebook.com")
  elsif host.domain == "google" && host.subdomain == "business"
    public_url = scrubbed.sub("http://business.google.com", "http://plus.google.com")
  else
    public_url = nil
  end
  public_url
end

.remove_html_tags!(url) ⇒ Object



226
227
228
229
# File 'lib/url_scrubber.rb', line 226

def self.remove_html_tags!(url)
  # Strip any embedded HTML tags (opening or closing) from url, mutating
  # it in place, and return it.
  url.tap { |u| u.gsub!(/<\/?[^>]+>/, '') }
end

.remove_subdomain!(url) ⇒ Object



215
216
217
218
219
220
221
222
223
# File 'lib/url_scrubber.rb', line 215

def self.remove_subdomain!(url)
  # Rewrite well-known "presentation" subdomains (www/ww/w with optional
  # digits, m, mobile, touch) to a bare "http://" host, and map
  # mbasic.facebook.com to facebook.com. Mutates url in place.
  presentation_prefixes = [
    %r{^https?://www?w?\d*\.}i,
    %r{^https?://m\d*\.}i,
    %r{^https?://mobile\d*\.}i,
    %r{^https?://touch\d*\.}i
  ]
  presentation_prefixes.each { |pattern| url.sub!(pattern, 'http://') }
  url.sub!(%r{^https?://mbasic\.facebook\.com}i, 'http://facebook.com')
  url
end

.sc_facebook(url) ⇒ Object

TODO This needs to be rewritten to be independent of the Facebook domain and public suffix used: e.g. facebook.com vs fb.com vs. fb.me



296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# File 'lib/url_scrubber.rb', line 296

def self.sc_facebook(url)
  # Canonicalize the many Facebook URL shapes (pages, groups, pg/, numeric
  # ids, profile.php, home.php, business-manager urls) down to
  # "http://facebook.com/...". The regexes are tried in a deliberate
  # order: most specific forms first, the generic vanity-name regex3 last.
  # NOTE(review): the profile.php branches call
  # check_for_facebook_redirection, which performs live HTTP requests.

  # Strip Facebook's "_rdr" redirect tracking parameter wherever it sits.
  url = url.gsub(/(_rdr=.+&)|(&_rdr=.+$)/,"")

  # Named captures: :url = canonical prefix, :uname = vanity name,
  # :uid = numeric id, :group = set when the path is a group.
  regex1  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(((?<group>groups?)|pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
  regex2  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
  regex3  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(((?<group>groups?)|pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
  regex4  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
  regex5  = /^(?<url>(https?:\/\/)((business|www)\.)?facebook\.com\/(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
  regex6  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/home\/accounts\?business_id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
  regex7  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/latest\/settings\/business_info\?business_id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i

  # If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
  # then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
  if mdata = /^(?<base_url>.+)\/posts\/(?<postid>[0-9]+).*$/.match(url)
    url = mdata[:base_url]
  end

  if url.match("/media/albums") || url.match("/media/set")
    # Media album/set URLs only get their "&..." tail trimmed.
    url = url.match('\&') ? url.split('&',2)[0] : url
  elsif mdata = url.match(regex1)
    # "http://facebook.com/pages/Command-Canada/1434248516885065/timeline"
    url = mdata[:url]
    uname = mdata[:uname]
    uid = mdata[:uid]
  elsif mdata = url.match(regex2)
    # "https://www.facebook.com/profile.php?id=100009574328879"
    url, http_response = check_for_facebook_redirection(mdata[:url])
    uid = mdata[:uid]
  elsif mdata = url.match(regex4)
    # "http://facebook.com/home.php?#!/person.name"
    url = mdata[:url] + mdata[:uname]
    url = drop_url_query!(url)
  elsif mdata = url.match(regex5)
    # "https://www.facebook.com/100009574328879"
    url = "http://facebook.com/" + mdata[:uid]
    uid = mdata[:uid]
  elsif mdata = url.match(regex6)
    # "http://business.facebook.com/home/accounts?business_id=1145724702268347"
    url = mdata[:url]
    uid = mdata[:uid]
  elsif mdata = url.match(regex7)
    # "http://business.facebook.com/latest/settings/business_info?business_id=..."
    url = mdata[:url]
    uid = mdata[:uid]
  elsif mdata = url.match(regex3)
    # "http://facebook.com/TonyMollHomeLoans/timeline"
    # "http://facebook.com/pg/TonyMollHomeLoans/timeline"
    # "https://www.facebook.com/groups/practicewithclaritygroup"
    # Guard against treating the bare container words as a vanity name.
    if ["group", "groups", "page", "pages", "pg"].exclude?(mdata[:uname])
      url = (mdata[:group] ? "http://facebook.com/groups/" : "http://facebook.com/") + mdata[:uname]
      uname = mdata[:uname]
    end
    url = drop_url_query!(url)
  elsif url.include?("facebook.com/profile.php?id=")
    # these were being truncated, they do redirect, but typically a 301 response is generated
    # so the url is returned unchanged.  Better than truncation.
    url, http_response = check_for_facebook_redirection(url)
  else
    url = drop_url_query!(url)
  end

  # Due to the redirection check, "https" and "www." can be re-introduced
  url = url.sub(%r{^https?://www.}i, 'http://')
  url = url.sub(/\?_rdr.*/, '')
  url
end

.sc_flickr(url) ⇒ Object



407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
# File 'lib/url_scrubber.rb', line 407

def self.sc_flickr(url)
  # Canonicalize Flickr URLs: group pages become
  # "http://flickr.com/groups/<name>" (sub-pages trimmed) and user pages
  # (optionally under /photos/ or /people/) become
  # "http://flickr.com/<user>". Anything unrecognized passes through.
  if url.include?('flickr.com/groups/')
    after_groups = url.partition('flickr.com/groups/')[2]
    if after_groups != ""
      name, slash, = after_groups.partition('/')
      # A slash after the group name means sub-page cruft to trim off;
      # otherwise the URL is already in group form.
      return slash != "" ? "http://flickr.com/groups/" + name : url
    end
  end

  user_match = url.match(%r{flickr\.com/(photos/|people/)?([^/]+)})
  return url unless user_match

  "http://flickr.com/#{user_match[2]}"
end

.sc_generic(url) ⇒ Object



439
440
441
442
# File 'lib/url_scrubber.rb', line 439

def self.sc_generic(url)
  # Default scrubbing for unrecognized services: drop the query string
  # (in place) and return the url.
  drop_url_query!(url)
end

.sc_google_plus(url) ⇒ Object



391
392
393
394
395
396
397
398
399
400
401
402
403
404
# File 'lib/url_scrubber.rb', line 391

def self.sc_google_plus(url)
  # Normalize a Google+ URL to either a community page
  # ("http://plus.google.com/communities/<id>") or a profile page
  # ("http://plus.google.com/<id>"), mutating url along the way.
  # NOTE(review): these substring substitutions match anywhere in the
  # string; they assume the fragments only occur in path positions.
  ['com/u/0/b/', 'com/u/0/', 'com/b/'].each { |fragment| url.sub!(fragment, 'com/') }
  ['/photos', '/of', '/albums'].each { |fragment| url.sub!(fragment, '') }

  if url.include?('plus.google.com/communities/')
    m = url.match(/^http:\/\/plus\.google\.com\/communities\/([^\/]+)/)
    m ? "http://plus.google.com/communities/#{m[1]}" : url
  else
    m = url.match(/^http:\/\/plus\.google\.com\/([^\/]+)/)
    m ? "http://plus.google.com/#{m[1]}" : url
  end
end

.sc_linkedin(url) ⇒ Object

TODO This needs to be rewritten to be independent of the LinkedIn domain and public suffix used: e.g. linkedin.com vs lnkd.in vs linkedin.ca



368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
# File 'lib/url_scrubber.rb', line 368

def self.sc_linkedin(url)
  # Canonicalize LinkedIn URLs: unify /companies/ to /company/, trim
  # query strings from company/in/pub/group paths, and rewrite the legacy
  # "groups?home=&gid=" / "groups?homeNewMember=&gid=" forms to
  # "http://linkedin.com/groups?gid=<id>".
  url.sub!('linkedin.com/companies/', 'linkedin.com/company/')

  if url =~ %r{com/(company|in|pub)/} || url.include?('linkedin.com/groups/')
    drop_url_query!(url)
  elsif url.include?('linkedin.com/groups?gid=')
    drop_url_ampersand!(url)
  elsif url.include?('linkedin.com/groups?home=&gid=')
    tail = url.partition('linkedin.com/groups?home=&gid=')[2]
    url = "http://linkedin.com/groups?gid=" + drop_url_ampersand!(tail)
  elsif url.include?('linkedin.com/groups?homeNewMember=&gid=')
    tail = url.partition('linkedin.com/groups?homeNewMember=&gid=')[2]
    url = "http://linkedin.com/groups?gid=" + drop_url_ampersand!(tail)
  end
  url
end

.sc_pinterest(url) ⇒ Object



427
428
429
430
# File 'lib/url_scrubber.rb', line 427

def self.sc_pinterest(url)
  # Pinterest needs no special handling beyond dropping the query string.
  drop_url_query!(url)
end

.sc_twitter(url) ⇒ Object



276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
# File 'lib/url_scrubber.rb', line 276

def self.sc_twitter(url)
  # Normalize Twitter URLs: drop a leading "@" from the handle, collapse
  # status permalinks and search-for-handle URLs down to the profile
  # page, then strip any remaining query string.
  url.sub!('twitter.com/@', 'twitter.com/')

  if (status = url.match(%r{(twitter\.com/[^/]+)/statuses/\d+}))
    url = "http://#{status[1]}"
  end

  if (search = url.match(%r{twitter\.com/search(?:/realtime)?(?:/|\?q=)(?:@|%40)(\S*)$}))
    url = "http://twitter.com/#{search[1]}"
  end

  drop_url_query!(url)
end

.sc_vimeo(url) ⇒ Object



261
262
263
264
265
266
267
268
269
270
271
272
273
# File 'lib/url_scrubber.rb', line 261

def self.sc_vimeo(url)
  # Trim sub-page paths off Vimeo group URLs, yielding
  # "http://vimeo.com/groups/<name>". All other Vimeo URLs pass through
  # unchanged.
  if url.include?('vimeo.com/groups/')
    after_groups = url.partition('vimeo.com/groups/')[2]
    if after_groups != ""
      name, slash, = after_groups.partition('/')
      # Only rewrite when there is actually a sub-page to trim.
      return "http://vimeo.com/groups/" + name if slash != ""
    end
  end
  url
end

.sc_yelp(url) ⇒ Object



433
434
435
436
# File 'lib/url_scrubber.rb', line 433

def self.sc_yelp(url)
  # Yelp needs no special handling beyond dropping the query string.
  drop_url_query!(url)
end

.sc_youtube(url) ⇒ Object



251
252
253
254
255
256
257
258
# File 'lib/url_scrubber.rb', line 251

def self.sc_youtube(url)
  # Rewrite the legacy "profile?user=" form to a plain path, then drop
  # the query string. The /user/ form of the URL is deliberately kept:
  # YouTube allows users to have their own URL which is not a separate
  # channel with its own customUrl.
  url.sub!('youtube.com/profile?user=', 'youtube.com/')
  drop_url_query!(url)
end

.scrub(url) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/url_scrubber.rb', line 9

def self.scrub(url)
  # Normalize a URL to this library's canonical form: force a plain
  # "http://" scheme, strip presentation subdomains, HTML tags, anchors
  # and trailing junk, then apply the per-service special-case rules.
  # Returns nil when no http-like token can be found; blank, app://,
  # TikTok-business and YouTube-video URLs are returned untouched.
  return url if url.blank?
  return url if /^app:\/\//.match(url)  # Do not scrub app-only URLs
  return url if /^https?:\/\/(www.)?business.tiktok\.com\/manage\//.match(url) # Don't scrub tik tok business manager urls, quick fix until we can implement a different solution, https://business.tiktok.com/manage/overview?org_id=6974497704617492482
  return url if /^http(s)?:\/\/(.+\.)?youtube\.com\/watch\?v=(?<vid>[^&]+)/i.match(url) # Don't scrub youtube video urls, just let it through!

  url = url.clone # don't modify the original argument

  # Keep only the first http(s)-looking token (tolerates the "htp"/"htps"
  # typo); bail out with nil when none is present.
  m = url.match(/(htt?ps?:\/\/\S+)/i)
  return nil unless m

  url = m[1]
  url.sub!(/^https/i, 'http')  # canonical scheme is plain http
  url.sub!(/^htp/i, 'http')    # repair the "htp://" typo form
  url.sub!(/\/+$/, '')         # trailing slashes
  url.sub!(/;+$/, '')          # trailing semicolons
  url.sub!('#!/', '')          # old AJAX-crawling hashbang
  url.sub!('%27', '\'')        # un-escape encoded apostrophes
  url = downcase_domain(url)
  remove_subdomain!(url)
  remove_html_tags!(url)
  # CHANGED we depend on the special case methods to decide if and when to drop the query string part of the URL
  url = drop_anchor!(special_cases(url))
  url.sub!(/,+$/, "")    # remove one or more trailing commas at the end of the URL
  url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
  return url
end

.service_of(url) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/url_scrubber.rb', line 39

def self.service_of(url)
  # Map url's registered domain (as parsed by Domainatrix) to a service
  # symbol such as :facebook or :twitter. plus.google.com is recognized
  # by host rather than domain. Returns :other when nothing matches.
  url_parts = Domainatrix.parse(url)
  if url_parts.host.present?
    domain_to_service = {
      'facebook'   => :facebook,
      'fb'         => :facebook,
      'flickr'     => :flickr,
      'instagram'  => :instagram,
      'linkedin'   => :linkedin,
      'pinterest'  => :pinterest,
      'slideshare' => :slideshare,
      'tumblr'     => :tumblr,
      'twitter'    => :twitter,
      'vimeo'      => :vimeo,
      'vk'         => :vkontakte,
      'weibo'      => :weibo,
      'yelp'       => :yelp,
      'youtube'    => :youtube
    }

    service = domain_to_service[url_parts.domain]
    return service if service
    return :google if url_parts.host =~ /\bplus\.google\.com$/
  else
    Rails.logger.debug "No Domain Match"
  end

  :other
end

.special_cases(url) ⇒ Object



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/url_scrubber.rb', line 195

def self.special_cases(url)
  # Dispatch url to its service-specific scrubber; unrecognized services
  # fall through to the generic query-string scrub.
  case service_of(url)
  when :youtube   then sc_youtube(url)
  when :twitter   then sc_twitter(url)
  when :facebook  then sc_facebook(url)
  when :linkedin  then sc_linkedin(url)
  when :google    then sc_google_plus(url)
  when :flickr    then sc_flickr(url)
  when :pinterest then sc_pinterest(url)
  when :vimeo     then sc_vimeo(url)
  when :yelp      then sc_yelp(url)
  else                 sc_generic(url)
  end
end

.valid_url?(url) ⇒ Boolean

Requirements:

  1. must have http/https scheme

  2. no “@” in any of the passed in url string

  3. valid uri as determined by Addressable::URI

Returns:

  • (Boolean)


156
157
158
159
160
161
162
# File 'lib/url_scrubber.rb', line 156

def self.valid_url?(url)
  # True when url parses with an http/https scheme and contains no "@"
  # anywhere in the raw string (rejects embedded credentials and
  # email-like input). Parse failures return false.
  # NOTE(review): the doc above mentions Addressable::URI, but this uses
  # stdlib URI; also URI.escape was removed in Ruby 3.0 — confirm the
  # supported Ruby version.
  schemes = %w(http https)
  parsed = URI.parse(URI.escape(url)) or return false
  schemes.include?(parsed.scheme) && !url.include?("@")
  rescue URI::InvalidURIError
  false
end