Class: Glib::JsonCrawler::Router

Inherits:

Object

Object
Glib::JsonCrawler::Router

show all

Defined in: lib/glib/json_crawler/router.rb

Instance Attribute Summary collapse

#deferred_actions ⇒ Object readonly

Returns the value of attribute deferred_actions.
#host ⇒ Object

Returns the value of attribute host.
#http_actions ⇒ Object readonly

Returns the value of attribute http_actions.
#last_log ⇒ Object readonly

Returns the value of attribute last_log.
#logger ⇒ Object readonly

Returns the value of attribute logger.
#read_only_actions ⇒ Object readonly

deprecated.
#skip_similar_page ⇒ Object

Returns the value of attribute skip_similar_page.

Instance Method Summary collapse

#_puts(text) ⇒ Object
#allowed?(url) ⇒ Boolean
#assert_target_ids_exist(args) ⇒ Object
#begin_page(spec, url) ⇒ Object
#crawl_multiple(views, block) ⇒ Object
#end_page(spec) ⇒ Object
#follow_v2(http, crawler_actions) ⇒ Object

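Runs each crawler action one log level deeper: inside a rolled-back transaction when GLIB_DISABLE_PERMISSION_TEST_SKIP is 'true', otherwise with the __glib_permission_test parameter added to the URL. (The loop previously shown here, `target_actions.each { |action, url| http.get(url, action, {}) }`, appears to be a leftover from an earlier implementation.)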
#initialize ⇒ Router constructor

A new instance of Router.
#last_form ⇒ Object
#log(action, key_data, response = nil) ⇒ Object
#page_spec ⇒ Object
#page_url ⇒ Object
#process_action(http, spec) ⇒ Object
#should_defer_crawl?(action_crawler, args) ⇒ Boolean
#step(http, args) ⇒ Object

Constructor Details

#initialize ⇒ `Router`

Returns a new instance of Router.

# File 'lib/glib/json_crawler/router.rb', line 54

# Sets up a router with empty crawl state.
def initialize
  # Page stack: begin_page pushes onto both arrays, end_page pops them.
  @page_urls = []
  @page_specs = []

  # Recorded [action, url] pairs. @read_only_actions is the deprecated
  # predecessor of @http_actions and is only kept for its reader.
  @http_actions = Set.new
  @read_only_actions = Set.new

  # Log buffer and the indentation depth applied to each logged line.
  @logger = ''
  @depth = -1

  @skip_similar_page = false
  # Rails' default development host, unless a host was already assigned.
  @host ||= 'localhost:3000'

  @visitor = Glib::Json::Traversal::Visitor.new(crawler_test: true)
end

Instance Attribute Details

#deferred_actions ⇒ `Object` (readonly)

Returns the value of attribute deferred_actions.



7
8
9

# File 'lib/glib/json_crawler/router.rb', line 7

# Reader for @deferred_actions.
#
# NOTE(review): the constructor shown for this class never assigns
# @deferred_actions, so this returns nil unless it is set elsewhere -- confirm
# whether this reader is still needed.
def deferred_actions
  @deferred_actions
end

#host ⇒ `Object`

Returns the value of attribute host.



9
10
11

# File 'lib/glib/json_crawler/router.rb', line 9

# Host (including port) that `allowed?` requires a URL to contain.
# The constructor falls back to 'localhost:3000' when none has been assigned.
def host
  @host
end

#http_actions ⇒ `Object` (readonly)

Returns the value of attribute http_actions.



8
9
10

# File 'lib/glib/json_crawler/router.rb', line 8

# Set of recorded `[action, url]` pairs. It is created empty by the constructor
# and filled by `process_action` and `step`.
def http_actions
  @http_actions
end

#last_log ⇒ `Object` (readonly)

Returns the value of attribute last_log.



7
8
9

# File 'lib/glib/json_crawler/router.rb', line 7

# The most recent line built by `log`, without indentation or trailing newline;
# nil until `log` has been called.
def last_log
  @last_log
end

#logger ⇒ `Object` (readonly)

Returns the value of attribute logger.



7
8
9

# File 'lib/glib/json_crawler/router.rb', line 7

# The accumulated log text. Despite the name this is a String: it starts empty
# and `log` appends one indented line per call.
def logger
  @logger
end

#read_only_actions ⇒ `Object` (readonly)

deprecated



6
7
8

# File 'lib/glib/json_crawler/router.rb', line 6

# @deprecated The code that used to add to this set is commented out in
#   `process_action`; actions are recorded in `http_actions` instead, so this
#   set stays as the constructor created it (empty).
def read_only_actions
  @read_only_actions
end

#skip_similar_page ⇒ `Object`

Returns the value of attribute skip_similar_page.



9
10
11

# File 'lib/glib/json_crawler/router.rb', line 9

# Flag that the constructor initialises to false.
# NOTE(review): presumably consulted by `similar_page?`, which is not visible
# here -- confirm.
def skip_similar_page
  @skip_similar_page
end

Instance Method Details

#_puts(text) ⇒ `Object`



29
30
31

# File 'lib/glib/json_crawler/router.rb', line 29

# Prints `text` to standard output, indented two spaces per depth level.
#
# The depth starts at -1 in the constructor, and String#* raises ArgumentError
# for a negative count, so the indent is clamped at zero to keep printing safe
# before any action has increased the depth.
#
# @param text [String] line to print
# @return [nil]
def _puts(text)
  indent = '  ' * [@depth, 0].max
  puts indent + text
end

#allowed?(url) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/glib/json_crawler/router.rb', line 252

# Tells whether `url` may be crawled: it must contain the configured host
# followed by at least one more character, and must not end in ".pdf".
#
# The host is escaped so its dots only match literal dots, the ".pdf" check
# uses a literal dot (so "/mypdf" is not mistaken for a PDF), and the pattern
# is anchored at the end of the string rather than the end of a line.
#
# @param url [String, nil] URL to check; nil is never allowed
# @return [Boolean]
def allowed?(url)
  pattern = /#{Regexp.escape(host.to_s)}.+(?<!\.pdf)\z/
  pattern.match?(url)
end

#assert_target_ids_exist(args) ⇒ `Object`

# File 'lib/glib/json_crawler/router.rb', line 33

# Registers every target id named by `args` with the visitor (with no crawler
# attached) so that it can later be checked that the id really exists within
# the page.
#
# 'targetIds' takes precedence; 'targetId' is only consulted when 'targetIds'
# is absent.
def assert_target_ids_exist(args)
  ids = args['targetIds']

  if ids
    ids.each { |id| @visitor.defer_action(nil, id) }
  else
    single_id = args['targetId']
    @visitor.defer_action(nil, single_id) if single_id
  end
end

#begin_page(spec, url) ⇒ `Object`

# File 'lib/glib/json_crawler/router.rb', line 232

# Marks the start of a page: pushes its spec and URL onto the page stack and
# notifies the visitor. Returns whatever the visitor's `begin_page` returns.
def begin_page(spec, url)
  @page_specs.push(spec)
  @page_urls.push(url)
  @visitor.begin_page(spec)
end

#crawl_multiple(views, block) ⇒ `Object`



228
229
230

# File 'lib/glib/json_crawler/router.rb', line 228

# Hands `views` and `block` to the visitor's `traverse_multiple` and returns
# its result. Note that `block` is passed as an ordinary argument, not as a
# Ruby block.
def crawl_multiple(views, block)
  @visitor.traverse_multiple(views, block)
end

#end_page(spec) ⇒ `Object`

# File 'lib/glib/json_crawler/router.rb', line 238

# Marks the end of a page: drops the top entry of both page stacks and
# notifies the visitor. Returns whatever the visitor's `end_page` returns.
def end_page(spec)
  [@page_specs, @page_urls].each(&:pop)
  @visitor.end_page(spec)
end

#follow_v2(http, crawler_actions) ⇒ `Object`

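Runs every crawler action (`action, url, params`) one log level deeper. When `GLIB_DISABLE_PERMISSION_TEST_SKIP` is `'true'`, each action is executed inside an `ActiveRecord` transaction that is rolled back afterwards; otherwise `__glib_permission_test: true` is added to the URL before the action is executed.

The snippet below appears to be a leftover from an earlier implementation and no longer describes the method:

@depth += 1

target_actions.each do |crawler_action|
  action, url = crawler_action
  http.get(url, action, {})
end

end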

# File 'lib/glib/json_crawler/router.rb', line 198

# Executes each `[action, url, params]` entry of `crawler_actions`, one log
# level deeper than the current depth (the depth is not lowered again here).
#
# The mode is chosen per action from GLIB_DISABLE_PERMISSION_TEST_SKIP:
# * anything but 'true' (skip mode): the permission test parameter is added to
#   the URL, when there is one, before the action runs;
# * 'true' (full mode): the action runs inside a transaction that is always
#   rolled back.
def follow_v2(http, crawler_actions)
  @depth += 1

  crawler_actions.each do |action, url, params|
    unless ENV['GLIB_DISABLE_PERMISSION_TEST_SKIP'] == 'true'
      # Skip mode: tag the request as a permission test.
      url = add_params(url, __glib_permission_test: true) if url.present?
      execute_crawler_action(http, action, url, params)
      next
    end

    # Full mode: roll back after every action so that database state is reset
    # between URL checks (prevents database contamination).
    #
    # This matters most for permission tests, where the user hits every
    # available URL purely to check its permission: a single wrong result
    # (e.g. 403 instead of 200 caused by a side effect of an earlier request)
    # cannot be tolerated.
    #
    # Crawler tests cover only one scenario anyway, so a scenario that shifts
    # because of side effects is acceptable there; the rollback is not applied
    # to them for performance reasons.
    ActiveRecord::Base.transaction do
      execute_crawler_action(http, action, url, params)
      raise ActiveRecord::Rollback
    end
  end
end

#last_form ⇒ `Object`



176
177
178

# File 'lib/glib/json_crawler/router.rb', line 176

# Returns the last entry of the visitor's `forms` collection.
# NOTE(review): presumably the most recently visited form -- confirm against
# the visitor implementation.
def last_form
  @visitor.forms.last
end

#log(action, key_data, response = nil) ⇒ `Object`

# File 'lib/glib/json_crawler/router.rb', line 11

# Records one log line of the form "action :: response code :: key data"
# (missing parts are left out), stores it as the last log line, and appends it
# to the log buffer indented two spaces per depth level.
#
# The depth starts at -1 in the constructor, and String#* raises ArgumentError
# for a negative count, so the indent is clamped at zero to keep logging safe
# before any action has increased the depth.
#
# @param action [String] action name
# @param key_data [String, nil] usually a URL, but not always
# @param response [#code, nil] response whose code is included when present
# @return [String] the updated log buffer
def log(action, key_data, response = nil)
  # Sometimes `key_data` may not be an actual URL, e.g. in the context of
  # dialogs_alert it is the alert message. Only URLs get the permission test
  # parameter stripped.
  if key_data&.start_with?('http://', 'https://')
    key_data = remove_params(key_data, [:__glib_permission_test])
  end

  status = response.present? ? response.code : nil
  @last_log = [action, status, key_data].compact.join(' :: ')

  indent = '  ' * [@depth, 0].max
  @logger += "#{indent}#{@last_log}\n"
end

#page_spec ⇒ `Object`



244
245
246

# File 'lib/glib/json_crawler/router.rb', line 244

# Spec of the innermost page currently open: the top of the stack that
# `begin_page` pushes to and `end_page` pops from. nil when the stack is empty.
def page_spec
  @page_specs.last
end

#page_url ⇒ `Object`



248
249
250

# File 'lib/glib/json_crawler/router.rb', line 248

# URL of the innermost page currently open: the top of the stack that
# `begin_page` pushes to and `end_page` pops from. nil when the stack is empty.
def page_url
  @page_urls.last
end

#process_action(http, spec) ⇒ `Object`

# File 'lib/glib/json_crawler/router.rb', line 100

# Routes one action spec to the crawler that handles its 'action' type.
#
# Target ids named by the spec are registered with the visitor first. A blank
# action, or a page that `similar_page?` reports as similar, is then skipped.
# Otherwise the action is handled one log level deeper; the depth is restored
# afterwards even if the handler raises.
#
# @param http [Object] HTTP client handed on to the crawler classes
# @param spec [Hash, nil] action spec; a nil spec is a no-op
# @return [Integer, nil] the restored depth when an action was handled, nil
#   when the spec was skipped
# @raise [KeyError] when `spec` has no 'action' key
def process_action(http, spec)
  # There is nothing to register or dispatch without a spec.
  return if spec.nil?

  action = spec.fetch('action')
  params = spec

  assert_target_ids_exist(params)

  return unless action.present?
  return if similar_page?(params)

  @depth += 1
  begin
    case action
    when 'initiate_navigation'
      http_actions.add([action, params['url']])
      JsonCrawler::NavInitiate.new(http, params, action)
    when 'runMultiple-v1', 'runMultiple'
      JsonCrawler::RunMultiple.new(http, params, action)
    when 'windows/open-v1', 'dialogs/open-v1', 'windows/reload-v1', 'windows/open',
      'dialogs/open', 'windows/reload', 'windows/openWeb', 'windows/openWeb-v1'
      if allowed?(params['url'])
        http_actions.add([action, params['url']])
        JsonCrawler::WindowsOpen.new(http, params, action)
      else
        # IMPORTANT — do not drop the `http_actions.add` below.
        #
        # This `else` is reached by BOTH genuinely external links AND by
        # same-host file/download endpoints that `allowed?` rejected for the
        # file-extension rule (e.g. *.pdf). Recording the same-host ones is the
        # ONLY way their file authorization ever gets exercised: the permission
        # test replays each recorded action per user and snapshots the response.
        # Remove this line and every file/download endpoint silently falls out
        # of permission coverage -- a wrong authorization on one would then ship
        # unnoticed (exactly the gap this was added to close).
        #
        # We do NOT traverse/download them (no WindowsOpen crawler is created
        # here), and the permission replay runs with inspect_http:false so it
        # never follows the storage redirect. External links are filtered out by
        # `internal_url?`. Behaviour is guarded by
        # test/dummy-app/test/json_crawler/router_test.rb -- if you delete the
        # line below, that test goes red.
        http_actions.add([action, params['url']]) if internal_url?(params['url'])
        log(action, params['url'])
      end
    when 'dialogs/show-v1', 'dialogs/show', 'popovers/show-v1', 'popovers/show'
      JsonCrawler::DialogsShow.new(http, params, action)
    when 'sheets/select-v1', 'sheets/select'
      JsonCrawler::Menu.new(http, params, action)
    when 'http/post-v1', 'http/post'
      JsonCrawler::ActionHttp.new(:post, http, params, action)
    when 'forms/submit-v1', 'forms/submit'
      JsonCrawler::FormsSubmit.new(http, params)
    when 'dialogs/alert-v1', 'dialogs/alert'
      JsonCrawler::DialogsAlert.new(http, params, action)
    when 'dialogs/close-v1', 'dialogs/close', 'popovers/close', 'popovers/close-v1'
      JsonCrawler::DialogsClose.new(http, params, action)
    else
      # Any other action is logged; it is also recorded for replay unless it is
      # a delete or an OAuth dialog.
      unless %w[http/delete-v1 dialogs/oauth-v1 http/delete dialogs/oauth].include?(action)
        http_actions.add([action, params['url']])
      end
      log(action, params['url'])
    end
  ensure
    # Restore the depth even when a crawler raises, so later log lines are not
    # indented one level too deep.
    @depth -= 1
  end

  # Keep the long-standing return value: the depth after it was restored.
  @depth
end

#should_defer_crawl?(action_crawler, args) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/glib/json_crawler/router.rb', line 45

# Defers `action_crawler` with the visitor when `args` names a 'targetId'.
#
# @return [Boolean] true when the crawl was deferred, false when `args` has no
#   'targetId' and nothing was done
def should_defer_crawl?(action_crawler, args)
  target_id = args['targetId']
  return false unless target_id

  @visitor.defer_action(action_crawler, target_id)
  true
end

#step(http, args) ⇒ `Object`

# File 'lib/glib/json_crawler/router.rb', line 67

# Processes one view spec encountered during traversal.
#
# * 'fields/submit' views are submitted through FormsSubmit, one log level
#   deeper, and nothing else is done for them.
# * 'panels/web' views have their URL recorded when it is an internal URL.
# * For every other case (including 'panels/web'), the view's 'onClick' action
#   is processed unless the view is marked rel="nofollow" or disabled.
#
# Anything that is not a Hash is ignored.
#
# @param http [Object] HTTP client handed on to the crawlers
# @param args [Hash] view spec
# @return [Object, nil] result of `process_action` when an onClick was
#   processed, nil otherwise
def step(http, args)
  # Check the type before reading any key: nil or an Array would otherwise
  # raise on `args['view']`.
  return unless args.is_a?(Hash)

  # TODO: Refactor
  case args['view']
  when 'fields/submit-v1', 'fields/submit'
    @depth += 1
    begin
      JsonCrawler::FormsSubmit.new(http, args)
    ensure
      # Restore the depth even when the submit raises.
      @depth -= 1
    end
    return
  when 'panels/web-v1', 'panels/web'
    # A panels/web embeds content by URL in an inline viewer -- a file
    # preview (PDF/image), an inline HTML preview, etc. There's no onClick
    # action to catch here, so when the URL is one of our own (same-host)
    # endpoints we record it directly: the client fetches it to render the
    # panel, so its authorization should be exercised by the permission test
    # (it replays each per user and snapshots the response). We only record
    # (no fetch); external embeds are filtered out by internal_url?.
    url = args['url']
    http_actions.add(['panels/web-v1', url]) if url.present? && internal_url?(url)
  end

  return if args['rel'] == 'nofollow'

  on_click = args.fetch('onClick', nil)
  process_action(http, on_click) if on_click && !args['disabled']
end

Class: Glib::JsonCrawler::Router

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Router

Instance Attribute Details

#deferred_actions ⇒ Object (readonly)

#host ⇒ Object

#http_actions ⇒ Object (readonly)

#last_log ⇒ Object (readonly)

#logger ⇒ Object (readonly)

#read_only_actions ⇒ Object (readonly)

#skip_similar_page ⇒ Object

Instance Method Details

#_puts(text) ⇒ Object

#allowed?(url) ⇒ Boolean

#assert_target_ids_exist(args) ⇒ Object

#begin_page(spec, url) ⇒ Object

#crawl_multiple(views, block) ⇒ Object

#end_page(spec) ⇒ Object

#follow_v2(http, crawler_actions) ⇒ Object

#last_form ⇒ Object

#log(action, key_data, response = nil) ⇒ Object

#page_spec ⇒ Object

#page_url ⇒ Object

#process_action(http, spec) ⇒ Object

#should_defer_crawl?(action_crawler, args) ⇒ Boolean

#step(http, args) ⇒ Object