Class: Arachni::Parser
- Includes:
- UI::Output, Utilities
- Defined in:
- lib/arachni/parser.rb,
lib/arachni/parser/sax.rb,
lib/arachni/parser/document.rb,
lib/arachni/parser/nodes/base.rb,
lib/arachni/parser/nodes/text.rb,
lib/arachni/parser/nodes/comment.rb,
lib/arachni/parser/nodes/element.rb,
lib/arachni/parser/with_children.rb,
lib/arachni/parser/extractors/base.rb,
lib/arachni/parser/nodes/with_value.rb,
lib/arachni/parser/with_children/search.rb,
lib/arachni/parser/nodes/element/with_attributes.rb,
lib/arachni/parser/nodes/element/with_attributes/attributes.rb
Overview
Analyzes HTML code, extracting input vectors and supporting information.
Defined Under Namespace
Modules: Extractors, Nodes, WithChildren Classes: Document, SAX
Constant Summary collapse
- CACHE_SIZES =
{ parse: 50, parse_xml: 50, parse_fragment: 100, html?: 100_000 }
- CACHE =
{}
- WHITELIST =
%w( title base a form frame iframe meta input select option script link area textarea input select button comment !-- )
- IGNORE_REQUEST_HEADERS =
[ HTTP::Client::SEED_HEADER_NAME, 'Content-Length' ]
Instance Attribute Summary collapse
Class Method Summary collapse
- .html?(string) ⇒ Boolean
- .markup?(string) ⇒ Boolean
- .parse(html, options = {}) ⇒ Object
- .parse_fragment(html) ⇒ Object
- .parse_xml(xml) ⇒ Object
- .push_parse(options = {}) ⇒ Object
Instance Method Summary collapse
-
#base ⇒ String
Base `href`, if there is one.
- #body ⇒ Object
-
#body=(string) ⇒ String
Override the #response body for the parsing process.
-
#cookie_jar ⇒ Array<Element::Cookie>
Cookies with which to update the HTTP cookie-jar.
-
#cookies ⇒ Array<Element::Cookie>
Cookies from HTTP headers and response body.
-
#cookies_to_be_audited ⇒ Array<Element::Cookie>
Cookies to be audited.
-
#document ⇒ Arachni::Parser::Document?
Returns a parsed HTML document from the body of the HTTP response or `nil` if the response data wasn't text-based or the response couldn't be parsed.
-
#forms ⇒ Array<Element::Form>
Forms from #document.
- #from_document? ⇒ Boolean
- #from_response? ⇒ Boolean
-
#headers ⇒ Hash
List of valid auditable HTTP header fields.
-
#initialize(resource) ⇒ Parser
constructor
A new instance of Parser.
- #jsons ⇒ Array<Element::JSON>
-
#link ⇒ Element::Link
Link to the page.
-
#link_template ⇒ Element::LinkTemplate
LinkTemplate for the current page.
-
#link_templates ⇒ Array<Element::LinkTemplate>
Links matching OptionsGroups::Audit#link_templates in #document.
-
#link_vars ⇒ Hash
Parameters found in #url.
-
#links ⇒ Array<Element::Link>
Links in #document.
-
#nested_cookies ⇒ Array<Element::NestedCookie>
Nested cookies from #cookies_to_be_audited.
- #page ⇒ Page
-
#paths ⇒ Array<String>
Distinct links to follow.
-
#text? ⇒ Boolean
`true` if the given HTTP response data are text-based, `false` otherwise.
-
#to_absolute(relative_url) ⇒ String
Converts a relative URL to an absolute one.
-
#ui_forms ⇒ Object
Dummy method, only the browser can fill this in.
-
#ui_inputs ⇒ Object
Dummy method, only the browser can fill this in.
- #xmls ⇒ Array<Element::XML>
Methods included from Utilities
#available_port, available_port_mutex, #bytes_to_kilobytes, #bytes_to_megabytes, #caller_name, #caller_path, #cookie_decode, #cookie_encode, #cookies_from_file, #cookies_from_parser, #cookies_from_response, #exception_jail, #exclude_path?, #follow_protocol?, #form_decode, #form_encode, #forms_from_parser, #forms_from_response, #full_and_absolute_url?, #generate_token, #get_path, #hms_to_seconds, #html_decode, #html_encode, #include_path?, #links_from_parser, #links_from_response, #normalize_url, #page_from_response, #page_from_url, #parse_set_cookie, #path_in_domain?, #path_too_deep?, #port_available?, #rand_port, #random_seed, #redundant_path?, #regexp_array_match, #remove_constants, #request_parse_body, #seconds_to_hms, #skip_page?, #skip_path?, #skip_resource?, #skip_response?, #uri_decode, #uri_encode, #uri_parse, #uri_parse_query, #uri_parser, #uri_rewrite
Methods included from UI::Output
#debug?, #debug_level_1?, #debug_level_2?, #debug_level_3?, #debug_level_4?, #debug_off, #debug_on, #disable_only_positives, #included, #mute, #muted?, #only_positives, #only_positives?, #print_bad, #print_debug, #print_debug_backtrace, #print_debug_level_1, #print_debug_level_2, #print_debug_level_3, #print_debug_level_4, #print_error, #print_error_backtrace, #print_exception, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, reset_output_options, #unmute, #verbose?, #verbose_on
Constructor Details
#initialize(resource) ⇒ Parser
Returns a new instance of Parser.
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
# File 'lib/arachni/parser.rb', line 176

# Builds a parser around the given resource.
#
# @param [Document, HTTP::Response, Array] resource
#   * `Document`: kept as the parsed document.
#   * `HTTP::Response`: kept as the response; its URL becomes the parser URL.
#   * `Array`: the first element is handled like a single response, the
#     remaining elements (with `nil`s removed) are kept as secondary
#     responses.
def initialize( resource )
    case resource
        when Document
            @resource = :document
            @document = resource

        when HTTP::Response
            @resource = :response
            @response = resource
            self.url  = @response.url

        when Array
            # Read the primary response with #first instead of #shift so that
            # the caller's array is not modified.
            primary = resource.first

            @secondary_responses = resource[1..-1]
            @secondary_responses.compact! if @secondary_responses

            @resource = :response
            @response = primary
            self.url  = primary.url
    end
end
Instance Attribute Details
#response ⇒ HTTP::Response
170 171 172 |
# File 'lib/arachni/parser.rb', line 170

# @return [HTTP::Response]
#   The value held in `@response`.
def response
    @response
end
Class Method Details
.html?(string) ⇒ Boolean
117 118 119 120 121 122 123 124 125 |
# File 'lib/arachni/parser.rb', line 117

# Asks `_html?` about the string, going through `CACHE[:html?].fetch` with the
# string as the key.
#
# @param [String] string
# @return [Boolean]
#   Whatever `_html?` returns, or `false` if it raises a `StandardError`.
def html?( string )
    CACHE[__method__].fetch( string ) do
        begin
            _html?( string )
        rescue StandardError
            false
        end
    end
end
.markup?(string) ⇒ Boolean
109 110 111 112 113 114 115 |
# File 'lib/arachni/parser.rb', line 109

# @param [String] string
# @return [Boolean]
#   `true` if `Ox.parse` turns the string into an `Ox::Element`, `false` if it
#   produces anything else or raises a `StandardError`.
def markup?( string )
    Ox.parse( string ).is_a?( Ox::Element )
rescue StandardError
    false
end
.parse(html, options = {}) ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 |
# NOTE(review): this listing is collapsed onto one line and is missing
# identifiers: the second parameter name in `( html, = {} )`, the second
# element of the cache key `[html, ]`, both the second assignment target and
# the call in `handler, = ( )`, and the last argument to `Ox.sax_html`.
# The class-method summary gives the signature as `.parse(html, options = {})`.
# Treat the listing as incomplete and check lib/arachni/parser.rb at line 66.
#
# What is still visible: the work is wrapped in `CACHE[__method__].fetch`, the
# HTML is handed to `Ox.sax_html` through a `StringIO`, `SAX::Stop` is rescued
# and ignored, and the block evaluates to `handler.document`.
# File 'lib/arachni/parser.rb', line 66 def parse( html, = {} ) CACHE[__method__].fetch [html, ] do handler, = ( ) begin Ox.sax_html( handler, StringIO.new( html ), ) rescue SAX::Stop end handler.document end end |
.parse_fragment(html) ⇒ Object
94 95 96 97 98 99 100 101 |
# File 'lib/arachni/parser.rb', line 94

# Parses the HTML with `parse` and returns the first child of the result,
# detached from its parent and document. Goes through
# `CACHE[:parse_fragment].fetch` with the HTML as the key.
#
# @param [String] html
# @return [Object]
#   First child of the parsed document, with `parent` and `document` set to
#   `nil`.
def parse_fragment( html )
    CACHE[__method__].fetch( html ) do
        fragment = parse( html ).children.first

        fragment.parent   = nil
        fragment.document = nil

        fragment
    end
end
Instance Method Details
#base ⇒ String
Returns Base `href`, if there is one.
459 460 461 |
# File 'lib/arachni/parser.rb', line 459

# @return [String]
#   `href` of the first `base` node in the document, or `@url` when there is
#   no such value. The result is memoized in `@base`.
def base
    @base ||= begin
        # #document returns `nil` for non-text resources, so only query it
        # when it is actually there.
        doc  = document
        href = doc.nodes_by_name( :base ).map { |b| b['href'] }.first if doc

        href || @url
    end
end
#body ⇒ Object
252 253 254 |
# File 'lib/arachni/parser.rb', line 252

# @return [Object]
#   The overriding body set through {#body=} if there is one; otherwise the
#   body of `@response` when the parser was built from a response, or `nil`.
def body
    return @body if @body

    @response.body if from_response?
end
#body=(string) ⇒ String
Returns Override the #response body for the parsing process.
247 248 249 250 |
# File 'lib/arachni/parser.rb', line 247

# Overrides the {#response} body for the parsing process.
#
# @param [String] string
# @return [String]
def body=( string )
    # Clear the values memoized for the previous body.
    @links    = nil
    @forms    = nil
    @cookies  = nil
    @document = nil

    @body = string
end
#cookie_jar ⇒ Array<Element::Cookie>
Returns Cookies with which to update the HTTP cookie-jar.
434 435 436 437 438 439 440 441 442 443 444 445 |
# NOTE(review): this listing is collapsed onto one line and is missing
# identifiers: the method name after `def` (the heading gives `#cookie_jar`),
# the variable assigned from `Set.new`, the receiver of `.map`, the call
# between `HTTP::Client.` and `.for_url`, the `reject` block parameter and
# receiver, and the left operand of `| from_jar`. Treat it as incomplete and
# check lib/arachni/parser.rb at line 434.
#
# What is still visible: a frozen `@cookie_jar` is returned when already set;
# otherwise entries obtained via `for_url( @url )` are filtered by name with
# `reject`, and a union with `from_jar` is stored in `@cookie_jar`.
# File 'lib/arachni/parser.rb', line 434 def return @cookie_jar.freeze if @cookie_jar from_jar = [] # Make a list of the response cookie names. = Set.new( .map( &:name ) ) from_jar |= HTTP::Client..for_url( @url ). reject { || .include?( .name ) } @cookie_jar = ( | from_jar) end |
#cookies ⇒ Array<Element::Cookie>
Returns Cookies from HTTP headers and response body.
389 390 391 392 393 394 395 396 |
# File 'lib/arachni/parser.rb', line 389

# @return [Array<Element::Cookie>]
#   Cookies from HTTP headers and response body.
#
# Cookies are first read from the response headers. Cookies found by
# `Cookie.from_parser` are merged in only when the resource is text-based and
# `Cookie.in_html?` accepts the body. Later calls return the memoized,
# frozen list.
def cookies
    return @cookies.freeze if @cookies

    @cookies = Cookie.from_headers( @url, @response.headers )
    return @cookies if !text? || !Cookie.in_html?( body )

    @cookies |= Cookie.from_parser( self )
end
#cookies_to_be_audited ⇒ Array<Element::Cookie>
Returns Cookies to be audited.
408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 |
# NOTE(review): this listing is collapsed onto one line and is missing
# identifiers: the method name after `def` (the heading gives
# `#cookies_to_be_audited`), the variable assigned from `Set.new`, the
# receiver of `.map`, the calls between `HTTP::Client.` and `.reject`, the
# receiver of `.include?`, and the left operand of `| from_http_jar`. Treat it
# as incomplete and check lib/arachni/parser.rb at line 408.
#
# What is still visible: a frozen `@cookies_to_be_audited` is returned when
# already set; `[]` is returned when `text?` is false; otherwise every entry
# of the union is duplicated, the copy's `action` is set to `@url`, and the
# copies are stored in `@cookies_to_be_audited`.
# File 'lib/arachni/parser.rb', line 408 def return @cookies_to_be_audited.freeze if @cookies_to_be_audited return [] if !text? # Make a list of the response cookie names. = Set.new( .map(&:name) ) # Grab all cookies from the cookiejar giving preferrence to the ones # specified by the current page, if there are any. from_http_jar = HTTP::Client...reject do |c| .include?( c.name ) end # These cookies are to be audited and thus are dirty and anarchistic, # so they have to contain even cookies completely irrelevant to the # current page. I.e. it contains all cookies that have been observed # since the beginning of the scan @cookies_to_be_audited = ( | from_http_jar).map do |c| dc = c.dup dc.action = @url dc end end |
#document ⇒ Arachni::Parser::Document?
Returns a parsed HTML document from the body of the HTTP response or `nil` if the response data wasn't text-based or the response couldn't be parsed.
260 261 262 263 264 265 |
# File 'lib/arachni/parser.rb', line 260

# @return [Arachni::Parser::Document, nil]
#   The memoized document if there is one; otherwise the result of parsing
#   {#body} with `filter: true`, or `nil` when {#text?} is false.
def document
    return @document if @document

    @document = self.class.parse( body, filter: true ) if text?
end
#forms ⇒ Array<Element::Form>
Returns Forms from #document.
291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 |
# File 'lib/arachni/parser.rb', line 291

# @return [Array<Element::Form>]
#   Forms from #document.
#
# When secondary responses were supplied, each form of this parser is compared
# with the matching form (same `coverage_id` and `name_or_id`) of every
# secondary response that has a non-empty body. An input whose value differs
# between the two and whose field type is `:hidden` is recorded as the form's
# `nonce_name`.
def forms
    return @forms.freeze if @forms
    return [] if !text? || (body && !Form.in_html?( body ))

    primary_forms = Form.from_parser( self )
    # This early exit returns the forms without storing them in @forms.
    return primary_forms if !@secondary_responses

    @secondary_responses.each do |secondary|
        next if secondary.body.to_s.empty?

        Form.from_parser( Parser.new( secondary ) ).each do |other|
            primary_forms.each do |form|
                same_form = "#{form.coverage_id}:#{form.name_or_id}" ==
                    "#{other.coverage_id}:#{other.name_or_id}"
                next unless same_form

                form.inputs.each do |name, value|
                    differs = value != other.inputs[name]
                    next unless differs && form.field_type_for( name ) == :hidden

                    form.nonce_name = name
                end
            end
        end
    end

    @forms = primary_forms
end
#from_document? ⇒ Boolean
241 242 243 |
# File 'lib/arachni/parser.rb', line 241

# @return [Boolean]
#   `true` if `@resource` is `:document`, `false` otherwise.
def from_document?
    @resource == :document
end
#from_response? ⇒ Boolean
237 238 239 |
# File 'lib/arachni/parser.rb', line 237

# @return [Boolean]
#   `true` if `@resource` is `:response`, `false` otherwise.
def from_response?
    @resource == :response
end
#headers ⇒ Hash
It will include common request headers as well as headers from the HTTP request.
Returns List of valid auditable HTTP header fields.
272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 |
# NOTE(review): this listing is collapsed onto one line and is missing the
# call name in `'From' => Options. || ''`. Treat it as incomplete and check
# lib/arachni/parser.rb at line 272.
#
# What is still visible: a hash of default request headers is merged with a
# copy of `response.request.headers` from which every name in
# IGNORE_REQUEST_HEADERS has been deleted; each resulting pair becomes a
# `Header` with `url: @url`, and the frozen array is memoized in `@headers`.
# File 'lib/arachni/parser.rb', line 272 def headers @headers ||= { 'Accept' => 'text/html,application/xhtml+xml,application' + '/xml;q=0.9,*/*;q=0.8', 'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3', 'From' => Options. || '', 'User-Agent' => Options.http.user_agent || '', 'Referer' => @url, 'Pragma' => 'no-cache' }.merge( response.request.headers.dup.tap do |h| IGNORE_REQUEST_HEADERS.each { |k| h.delete k } end ).map { |k, v| Header.new( url: @url, inputs: { k => v } ) }.freeze end |
#jsons ⇒ Array<Element::JSON>
360 361 362 |
# File 'lib/arachni/parser.rb', line 360

# @return [Array<Element::JSON>]
#   The result of `JSON.from_request` for the response's request wrapped in an
#   array, or an empty array when that result is `nil`. Memoized in `@jsons`.
def jsons
    return @jsons if @jsons

    @jsons = [JSON.from_request( @url, response.request )].compact
end
#link ⇒ Element::Link
Returns Link to the page.
321 322 323 324 |
# File 'lib/arachni/parser.rb', line 321

# @return [Element::Link, nil]
#   Link to the page, built from `@url` and {#link_vars}. `nil` when there are
#   no link variables and a response is present that is not a redirection.
def link
    without_vars = link_vars.empty?
    nothing_to_link = without_vars && @response && !@response.redirection?
    return if nothing_to_link

    Link.new( url: @url, inputs: link_vars )
end
#link_template ⇒ Element::LinkTemplate
Returns LinkTemplate for the current page.
328 329 330 331 332 333 334 335 336 337 338 |
# File 'lib/arachni/parser.rb', line 328

# @return [Element::LinkTemplate, nil]
#   LinkTemplate for the current page, or `nil` when
#   `LinkTemplate.extract_inputs` yields no template for `@url`.
def link_template
    template, inputs = LinkTemplate.extract_inputs( @url )
    return unless template

    # `@url` is frozen in place here, it is not copied.
    LinkTemplate.new(
        url:      @url.freeze,
        action:   @url.freeze,
        inputs:   inputs,
        template: template
    )
end
#link_templates ⇒ Array<Element::LinkTemplate>
Returns Links matching OptionsGroups::Audit#link_templates in #document.
351 352 353 354 355 356 357 |
# File 'lib/arachni/parser.rb', line 351

# @return [Array<Element::LinkTemplate>]
#   The page's own {#link_template} (if any), combined with the templates from
#   `LinkTemplate.from_parser` when the resource is text-based. Later calls
#   return the memoized, frozen list.
def link_templates
    return @link_templates.freeze if @link_templates

    textual = text?
    own     = [link_template].compact

    @link_templates = textual ? (own | LinkTemplate.from_parser( self )) : own
end
#link_vars ⇒ Hash
Returns Parameters found in #url.
371 372 373 374 375 |
# File 'lib/arachni/parser.rb', line 371

# @return [Hash]
#   Parameters found in `@url`: the frozen `query_parameters` of the rewritten
#   parsed URL, memoized in `@link_vars`. An empty hash when `uri_parse`
#   returns nothing for `@url`.
def link_vars
    parsed = uri_parse( @url )
    return {} unless parsed

    @link_vars ||= parsed.rewrite.query_parameters.freeze
end
#links ⇒ Array<Element::Link>
Returns Links in #document.
342 343 344 345 346 347 |
# File 'lib/arachni/parser.rb', line 342

# @return [Array<Element::Link>]
#   The page's own {#link} (if any), combined with the links from
#   `Link.from_parser` unless the resource is not text-based or
#   `Link.in_html?` rejects the body. Later calls return the memoized, frozen
#   list.
def links
    return @links.freeze if @links

    own_only = !text? || (body && !Link.in_html?( body ))
    own      = [link].compact

    @links = own_only ? own : (own | Link.from_parser( self ))
end
#nested_cookies ⇒ Array<Element::NestedCookie>
Returns Nested cookies from #cookies_to_be_audited.
400 401 402 403 404 |
# NOTE(review): this listing is collapsed onto one line and is missing
# identifiers: the method name after `def` (the heading gives
# `#nested_cookies`) and both the method and the argument in
# `NestedCookie.( )`. Treat it as incomplete and check
# lib/arachni/parser.rb at line 400.
#
# What is still visible: a frozen `@nested_cookies` is returned when already
# set; otherwise the result of a `NestedCookie` class-level call is stored in
# `@nested_cookies`.
# File 'lib/arachni/parser.rb', line 400 def return @nested_cookies.freeze if @nested_cookies @nested_cookies = NestedCookie.( ) end |
#page ⇒ Page
227 228 229 |
# File 'lib/arachni/parser.rb', line 227

# @return [Page]
#   Page built with this parser (`Page.new( parser: self )`), memoized in
#   `@page`.
def page
    return @page if @page

    @page = Page.new( parser: self )
end
#paths ⇒ Array<String>
Returns Distinct links to follow.
449 450 451 452 453 454 455 |
# File 'lib/arachni/parser.rb', line 449

# @return [Array<String>]
#   Distinct links to follow: the frozen result of `run_extractors`, or a
#   frozen empty array when there is no {#document}. Memoized in `@paths`.
def paths
    return @paths if @paths

    # @paths is set to an empty array before #document is consulted.
    @paths = []
    @paths = document ? run_extractors.freeze : @paths.freeze
end
#text? ⇒ Boolean
Returns `true` if the given HTTP response data are text-based, `false` otherwise.
233 234 235 |
# File 'lib/arachni/parser.rb', line 233

# @return [Boolean]
#   The response's own `text?` when the parser was built from a response,
#   `true` otherwise.
def text?
    return true unless from_response?

    @response.text?
end
#to_absolute(relative_url) ⇒ String
Converts a relative URL to an absolute one.
216 217 218 219 220 221 222 223 224 |
# File 'lib/arachni/parser.rb', line 216

# Converts a relative URL to an absolute one by delegating to the inherited
# `to_absolute`, using {#base} as the reference URL, or `@url` when {#base}
# returns nothing.
#
# @param [String] relative_url
# @return [String]
def to_absolute( relative_url )
    super( relative_url, base || @url )
end
#ui_forms ⇒ Object
Dummy method, only the browser can fill this in.
383 384 385 |
# File 'lib/arachni/parser.rb', line 383

# Dummy method, only the browser can fill this in.
#
# @return [Array]
#   Always a new, empty array.
def ui_forms
    []
end
#ui_inputs ⇒ Object
Dummy method, only the browser can fill this in.
378 379 380 |
# File 'lib/arachni/parser.rb', line 378

# Dummy method, only the browser can fill this in.
#
# @return [Array]
#   Always a new, empty array.
def ui_inputs
    []
end
#xmls ⇒ Array<Element::XML>
365 366 367 |
# File 'lib/arachni/parser.rb', line 365

# @return [Array<Element::XML>]
#   The result of `XML.from_request` for the response's request wrapped in an
#   array, or an empty array when that result is `nil`. Memoized in `@xmls`.
def xmls
    return @xmls if @xmls

    @xmls = [XML.from_request( @url, response.request )].compact
end