Class: Arachni::Parser

Inherits:
Object show all
Includes:
UI::Output, Utilities
Defined in:
lib/arachni/parser.rb,
lib/arachni/parser/sax.rb,
lib/arachni/parser/document.rb,
lib/arachni/parser/nodes/base.rb,
lib/arachni/parser/nodes/text.rb,
lib/arachni/parser/nodes/comment.rb,
lib/arachni/parser/nodes/element.rb,
lib/arachni/parser/with_children.rb,
lib/arachni/parser/extractors/base.rb,
lib/arachni/parser/nodes/with_value.rb,
lib/arachni/parser/with_children/search.rb,
lib/arachni/parser/nodes/element/with_attributes.rb,
lib/arachni/parser/nodes/element/with_attributes/attributes.rb

Overview

Analyzes HTML code, extracting input vectors and supporting information.

Author:

  • Tasos “Zapotek” Laskos <tasos.laskos@arachni-scanner.com>

Defined Under Namespace

Modules: Extractors, Nodes, WithChildren Classes: Document, SAX

Constant Summary collapse

# Per-method cache settings. The keys are the names of the class methods that
# look themselves up in CACHE via `CACHE[__method__]`.
# NOTE(review): the values are presumably maximum entry counts -- confirm
# against the code that builds CACHE.
CACHE_SIZES =
{
    parse:          50,
    parse_xml:      50,
    parse_fragment: 100,
    html?:          100_000
}
# Caches indexed by method name, read by the class methods through
# `CACHE[__method__].fetch`.
# NOTE(review): shown empty here; presumably filled in from CACHE_SIZES
# elsewhere -- confirm.
CACHE =
{}
# NOTE(review): presumably the element names retained when parsing with
# `filter: true` -- confirm against the SAX handler.
WHITELIST =
%w(
    title base a form frame iframe meta input select option script link area
    textarea input select button comment !--
)
# Names of the request headers that #headers deletes from the request's
# headers before merging them into its defaults.
IGNORE_REQUEST_HEADERS =
[
    HTTP::Client::SEED_HEADER_NAME,
    'Content-Length'
]

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Utilities

#available_port, available_port_mutex, #bytes_to_kilobytes, #bytes_to_megabytes, #caller_name, #caller_path, #cookie_decode, #cookie_encode, #cookies_from_file, #cookies_from_parser, #cookies_from_response, #exception_jail, #exclude_path?, #follow_protocol?, #form_decode, #form_encode, #forms_from_parser, #forms_from_response, #full_and_absolute_url?, #generate_token, #get_path, #hms_to_seconds, #html_decode, #html_encode, #include_path?, #links_from_parser, #links_from_response, #normalize_url, #page_from_response, #page_from_url, #parse_set_cookie, #path_in_domain?, #path_too_deep?, #port_available?, #rand_port, #random_seed, #redundant_path?, #regexp_array_match, #remove_constants, #request_parse_body, #seconds_to_hms, #skip_page?, #skip_path?, #skip_resource?, #skip_response?, #uri_decode, #uri_encode, #uri_parse, #uri_parse_query, #uri_parser, #uri_rewrite

Methods included from UI::Output

#debug?, #debug_level_1?, #debug_level_2?, #debug_level_3?, #debug_level_4?, #debug_off, #debug_on, #disable_only_positives, #included, #mute, #muted?, #only_positives, #only_positives?, #print_bad, #print_debug, #print_debug_backtrace, #print_debug_level_1, #print_debug_level_2, #print_debug_level_3, #print_debug_level_4, #print_error, #print_error_backtrace, #print_exception, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, reset_output_options, #unmute, #verbose?, #verbose_on

Constructor Details

#initialize(resource) ⇒ Parser

Returns a new instance of Parser.

Parameters:

  • resource (Document, HTTP::Response, Array<HTTP::Response>)

    Response(s) to analyze and parse. By providing multiple responses the parser will be able to perform some preliminary differential analysis and identify nonce tokens in inputs.



176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/arachni/parser.rb', line 176

# Sets up the parser from a parsed document, a single response or a list of
# responses.
#
# @param resource [Document, HTTP::Response, Array<HTTP::Response>]
#   For a `Document` only `@document` is set. For a response, `@response` and
#   the URL are set. For an `Array`, the first element becomes the primary
#   response and the remaining non-nil elements are kept as
#   `@secondary_responses`.
def initialize( resource )
    case resource

        when Document
            @resource = :document
            @document = resource

        when HTTP::Response
            @resource = :response

            @response = resource
            self.url = @response.url

        when Array
            @secondary_responses = resource[1..-1]
            @secondary_responses.compact! if @secondary_responses

            # Read the primary response with #first rather than #shift so
            # that the caller's Array is left untouched.
            response = resource.first

            @resource = :response

            @response = response
            self.url = response.url
    end
end

Instance Attribute Details

#responseHTTP::Response

Returns:



170
171
172
# File 'lib/arachni/parser.rb', line 170

# @return [HTTP::Response, nil]
#   Current value of `@response` (`nil` when it has not been set).
def response; @response; end

#urlString

Returns:



167
168
169
# File 'lib/arachni/parser.rb', line 167

# @return [String, nil]
#   Current value of `@url` (`nil` when it has not been set).
def url; @url; end

Class Method Details

.html?(string) ⇒ Boolean

Returns:

  • (Boolean)


117
118
119
120
121
122
123
124
125
# File 'lib/arachni/parser.rb', line 117

# Cached wrapper around `_html?`.
#
# @param string [String]
# @return [Boolean]
#   Result of `_html?` for `string`, or `false` when `_html?` raises a
#   `StandardError`. The outcome is looked up in/stored through
#   `CACHE[:html?]`.
def html?( string )
    cache = CACHE[__method__]

    cache.fetch( string ) do
        begin
            _html?( string )
        rescue
            # Treat any failure of the check as "not HTML".
            false
        end
    end
end

.markup?(string) ⇒ Boolean

Returns:

  • (Boolean)


109
110
111
112
113
114
115
# File 'lib/arachni/parser.rb', line 109

# @param string [String]
# @return [Boolean]
#   `true` when `Ox.parse` turns `string` into an `Ox::Element`, `false` when
#   it yields anything else or raises a `StandardError`.
def markup?( string )
    Ox.parse( string ).is_a?( Ox::Element )
rescue
    false
end

.parse(html, options = {}) ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/arachni/parser.rb', line 66

# Parses `html` with `Ox.sax_html`, caching the result per
# `[html, options]` pair in `CACHE[:parse]`.
#
# @param html [String]
# @param options [Hash]
#   Passed to `prepare_ox_options`, which supplies the SAX handler and the
#   options handed to Ox.
# @return [Object] The handler's `document`.
def parse( html, options = {} )
    CACHE[__method__].fetch( [html, options] ) do
        handler, sax_options = prepare_ox_options( options )
        source = StringIO.new( html )

        begin
            Ox.sax_html( handler, source, sax_options )
        rescue SAX::Stop
            # The parse was cut short on purpose; whatever the handler has
            # collected so far is still returned below.
        end

        handler.document
    end
end

.parse_fragment(html) ⇒ Object



94
95
96
97
98
99
100
101
# File 'lib/arachni/parser.rb', line 94

# Parses `html` and returns its first child node, detached from its parent
# and document. Results are cached per `html` in `CACHE[:parse_fragment]`.
#
# @param html [String]
# @return [Object] First child of the parsed document.
def parse_fragment( html )
    CACHE[__method__].fetch( html ) do
        fragment = parse( html ).children.first

        # Detach the node from the tree it was parsed in.
        fragment.parent   = nil
        fragment.document = nil

        fragment
    end
end

.parse_xml(xml) ⇒ Object



103
104
105
106
107
# File 'lib/arachni/parser.rb', line 103

# Parses `xml` with `Nokogiri::XML`, caching the result per input string in
# `CACHE[:parse_xml]`.
#
# @param xml [String]
# @return [Object] Whatever `Nokogiri::XML` returns for `xml`.
def parse_xml( xml )
    cache = CACHE[__method__]
    cache.fetch( xml ) { Nokogiri::XML( xml ) }
end

.push_parse(options = {}) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/arachni/parser.rb', line 79

# Starts a SAX parse that reads from a pipe, so that the caller can feed the
# HTML in incrementally.
#
# The parse job is posted to `push_parse_pool` and reads from the pipe's read
# end; a `SAX::Stop` raised during the parse is swallowed.
#
# @param options [Hash] Passed to `prepare_ox_options`.
# @return [Array]
#   `[writer, handler]` -- the write end of the pipe and the first value
#   returned by `prepare_ox_options`.
def push_parse( options = {} )
    reader, writer = IO.pipe

    handler, sax_options = prepare_ox_options( options )

    push_parse_pool.post do
        begin
            Ox.sax_html( handler, reader, sax_options )
        rescue SAX::Stop
            # Deliberate early stop, nothing to do.
        end
    end

    [writer, handler]
end

Instance Method Details

#baseString

Returns Base `href`, if there is one.

Returns:

  • (String)

    Base `href`, if there is one.



459
460
461
# File 'lib/arachni/parser.rb', line 459

# @return [String, nil]
#   `href` of the first `<base>` element that has one, falling back to `@url`
#   when the document has no such element or when there is no document at
#   all (#document returns `nil` for non-text responses).
def base
    @base ||= begin
        doc = document

        # Skip `<base>` elements without an `href` (e.g. ones that only set a
        # `target`) instead of giving up on the first one.
        href = doc && doc.nodes_by_name( :base ).map { |b| b['href'] }.compact.first

        href || @url
    end
end

#bodyObject



252
253
254
# File 'lib/arachni/parser.rb', line 252

# @return [String, nil]
#   The body set through #body=, otherwise the body of the response when the
#   parser was built from one, otherwise `nil`.
def body
    return @body if @body

    from_response? ? @response.body : nil
end

#body=(string) ⇒ String

Returns Override the #response body for the parsing process.

Returns:



247
248
249
250
# File 'lib/arachni/parser.rb', line 247

# Overrides the body used for parsing and drops the cached links, forms,
# cookies and document.
#
# @param string [String]
# @return [String] The given `string`.
def body=( string )
    @links    = nil
    @forms    = nil
    @cookies  = nil
    @document = nil

    @body = string
end

Returns Cookies with which to update the HTTP cookie-jar.

Returns:



434
435
436
437
438
439
440
441
442
443
444
445
# File 'lib/arachni/parser.rb', line 434

# @return [Array]
#   The page's #cookies followed by those cookies of the HTTP cookie-jar for
#   `@url` whose names the page does not set itself. The result is cached and
#   returned frozen from the second call onwards.
def cookie_jar
    return @cookie_jar.freeze if @cookie_jar

    # Names of the cookies set by the page itself.
    own_names = Set.new( cookies.map( &:name ) )

    jar_cookies = HTTP::Client.cookie_jar.for_url( @url )
    extra       = jar_cookies.reject { |cookie| own_names.include?( cookie.name ) }

    @cookie_jar = cookies | ([] | extra)
end

#cookiesArray<Element::Cookie>

Returns Cookies from HTTP headers and response body.

Returns:



389
390
391
392
393
394
395
396
# File 'lib/arachni/parser.rb', line 389

# @return [Array]
#   Cookies from the response headers, plus the ones `Cookie.from_parser`
#   finds when the response is text and `Cookie.in_html?` accepts the body.
#   The result is cached and returned frozen from the second call onwards.
def cookies
    return @cookies.freeze if @cookies

    @cookies = Cookie.from_headers( @url, @response.headers )

    # Only look at the body when it can actually hold cookies.
    if text? && Cookie.in_html?( body )
        @cookies |= Cookie.from_parser( self )
    end

    @cookies
end

#cookies_to_be_auditedArray<Element::Cookie>

Returns Cookies to be audited.

Returns:



408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
# File 'lib/arachni/parser.rb', line 408

# @return [Array]
#   Copies of the page's #cookies and of every cookie-jar cookie whose name
#   the page does not set, each with its `action` pointed at `@url`. Empty
#   when the response isn't text. The result is cached and returned frozen
#   from the second call onwards.
def cookies_to_be_audited
    return @cookies_to_be_audited.freeze if @cookies_to_be_audited
    return [] unless text?

    # Names of the cookies set by the current page.
    page_cookie_names = Set.new( cookies.map( &:name ) )

    # Take everything from the cookie-jar, giving preference to the cookies
    # specified by the current page, if there are any.
    jar_only = HTTP::Client.cookie_jar.cookies.reject do |c|
        page_cookie_names.include?( c.name )
    end

    # These cookies are meant to be audited, so the list deliberately includes
    # cookies that are completely irrelevant to the current page, i.e. all
    # cookies that have been observed since the beginning of the scan.
    @cookies_to_be_audited = (cookies | jar_only).map do |c|
        c.dup.tap { |copy| copy.action = @url }
    end
end

#documentArachni::Parser::Document?

Returns a parsed HTML document from the body of the HTTP response or `nil` if the response data wasn't text-based or the response couldn't be parsed.

Returns:

  • (Arachni::Parser::Document, nil)

    Returns a parsed HTML document from the body of the HTTP response or `nil` if the response data wasn't text-based or the response couldn't be parsed.



260
261
262
263
264
265
# File 'lib/arachni/parser.rb', line 260

# @return [Arachni::Parser::Document, nil]
#   The already available document, or the body parsed with `filter: true`
#   when the response is text; `nil` otherwise.
def document
    if @document
        @document
    elsif text?
        @document = self.class.parse( body, filter: true )
    end
end

#formsArray<Element::Form>

Returns Forms from #document.

Returns:



291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# File 'lib/arachni/parser.rb', line 291

# @return [Array]
#   Forms extracted by `Form.from_parser`; empty when the response isn't text
#   or `Form.in_html?` rejects the body.
#
#   When secondary responses are available their forms are compared with the
#   ones of this page: a hidden input whose value differs between two
#   matching forms is recorded as that form's `nonce_name`.
#
#   The result is cached and returned frozen from the second call onwards.
def forms
    return @forms.freeze if @forms
    return [] if !text? || (body && !Form.in_html?( body ))

    f = Form.from_parser( self )

    # `Array( nil )` is empty, so without secondary responses the loop is a
    # no-op and the forms still get memoized below instead of being extracted
    # again on every call.
    Array( @secondary_responses ).each do |response|
        next if response.body.to_s.empty?

        Form.from_parser( Parser.new( response ) ).each do |form2|
            f.each do |form|
                next if "#{form.coverage_id}:#{form.name_or_id}" !=
                    "#{form2.coverage_id}:#{form2.name_or_id}"

                form.inputs.each do |k, v|
                    next if v == form2.inputs[k] ||
                        form.field_type_for( k ) != :hidden

                    form.nonce_name = k
                end
            end
        end
    end

    @forms = f
end

#from_document?Boolean

Returns:

  • (Boolean)


241
242
243
# File 'lib/arachni/parser.rb', line 241

# @return [Boolean] `true` when the parser was built from a document.
def from_document?
    :document == @resource
end

#from_response?Boolean

Returns:

  • (Boolean)


237
238
239
# File 'lib/arachni/parser.rb', line 237

# @return [Boolean] `true` when the parser was built from HTTP response(s).
def from_response?
    :response == @resource
end

#headersHash

Note:

It will include common request headers as well as headers from the HTTP request.

Returns List of valid auditable HTTP header fields.

Returns:

  • (Hash)

    List of valid auditable HTTP header fields.



272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
# File 'lib/arachni/parser.rb', line 272

# Builds the auditable header elements for the page.
#
# A set of default request headers is merged with the headers of the
# response's request (minus the ones in IGNORE_REQUEST_HEADERS), the request's
# values taking precedence.
#
# @return [Array]
#   Frozen, cached list with one `Header` per header field.
def headers
    return @headers if @headers

    defaults = {
        'Accept'          => 'text/html,application/xhtml+xml,application' +
            '/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset'  => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
        'From'            => Options.authorized_by  || '',
        'User-Agent'      => Options.http.user_agent || '',
        'Referer'         => @url,
        'Pragma'          => 'no-cache'
    }

    # Work on a copy so that the request's own headers stay intact.
    from_request = response.request.headers.dup
    IGNORE_REQUEST_HEADERS.each { |name| from_request.delete( name ) }

    @headers = defaults.merge( from_request ).map do |name, value|
        Header.new( url: @url, inputs: { name => value } )
    end.freeze
end

#jsonsArray<Element::JSON>

Returns:



360
361
362
# File 'lib/arachni/parser.rb', line 360

# @return [Array]
#   Cached list holding the element `JSON.from_request` builds from the
#   response's request, or an empty list when it returns `nil`.
def jsons
    return @jsons if @jsons

    element = JSON.from_request( @url, response.request )
    @jsons  = element.nil? ? [] : [element]
end

Returns Link to the page.

Returns:



321
322
323
324
# File 'lib/arachni/parser.rb', line 321

# @return [Link, nil]
#   Link for `@url` with #link_vars as its inputs. `nil` when there are no
#   inputs and a response is available which isn't a redirection.
def link
    vars = link_vars

    return if vars.empty? && @response && !@response.redirection?

    Link.new( url: @url, inputs: vars )
end

Returns LinkTemplate for the current page.

Returns:



328
329
330
331
332
333
334
335
336
337
338
# File 'lib/arachni/parser.rb', line 328

# @return [LinkTemplate, nil]
#   Template element for `@url`, or `nil` when
#   `LinkTemplate.extract_inputs` finds no matching template.
def link_template
    template, inputs = LinkTemplate.extract_inputs( @url )
    return unless template

    # Note that this freezes `@url` itself.
    frozen_url = @url.freeze

    LinkTemplate.new(
        url:      frozen_url,
        action:   frozen_url,
        inputs:   inputs,
        template: template
    )
end

Returns Links matching OptionsGroups::Audit#link_templates in #document.

Returns:



351
352
353
354
355
356
357
# File 'lib/arachni/parser.rb', line 351

# @return [Array]
#   The page's own #link_template (if any), plus the ones
#   `LinkTemplate.from_parser` finds when the response is text. The result is
#   cached and returned frozen from the second call onwards.
def link_templates
    return @link_templates.freeze if @link_templates

    if text?
        @link_templates =
            [link_template].compact | LinkTemplate.from_parser( self )
    else
        @link_templates = [link_template].compact
    end
end

Returns Parameters found in #url.

Returns:



371
372
373
374
375
# File 'lib/arachni/parser.rb', line 371

# @return [Hash]
#   Frozen, cached query parameters of the rewritten `@url`, or an empty
#   `Hash` when `uri_parse` can't parse the URL.
def link_vars
    parsed = uri_parse( @url )
    return {} unless parsed

    @link_vars ||= parsed.rewrite.query_parameters.freeze
end

Returns Links in #document.

Returns:



342
343
344
345
346
347
# File 'lib/arachni/parser.rb', line 342

# @return [Array]
#   The page's own #link (if any), plus the ones `Link.from_parser` finds
#   unless the response isn't text or `Link.in_html?` rejects the body. The
#   result is cached and returned frozen from the second call onwards.
def links
    return @links.freeze if @links

    skip_body = !text? || (body && !Link.in_html?( body ))
    own       = [link].compact

    @links = skip_body ? own : own | Link.from_parser( self )
end

#nested_cookiesArray<Element::NestedCookie>

Returns Nested cookies from #cookies_to_be_audited.

Returns:



400
401
402
403
404
# File 'lib/arachni/parser.rb', line 400

# @return [Array]
#   Result of `NestedCookie.from_cookies` for #cookies_to_be_audited; cached
#   and returned frozen from the second call onwards.
def nested_cookies
    if @nested_cookies
        @nested_cookies.freeze
    else
        @nested_cookies = NestedCookie.from_cookies( cookies_to_be_audited )
    end
end

#pagePage

Returns:



227
228
229
# File 'lib/arachni/parser.rb', line 227

# @return [Page] Cached page built from this parser.
def page
    return @page if @page

    @page = Page.new( parser: self )
end

#pathsArray<String>

Returns Distinct links to follow.

Returns:



449
450
451
452
453
454
455
# File 'lib/arachni/parser.rb', line 449

# @return [Array<String>]
#   Frozen, cached result of `run_extractors`, or a frozen empty list when
#   there is no #document.
def paths
  return @paths if @paths

  # Assigned before #document is consulted, as in the original flow.
  @paths = []

  if document
    @paths = run_extractors.freeze
  else
    @paths.freeze
  end
end

#text?Boolean

Returns `true` if the given HTTP response data are text based, `false` otherwise.

Returns:

  • (Boolean)

    `true` if the given HTTP response data are text based, `false` otherwise.



233
234
235
# File 'lib/arachni/parser.rb', line 233

# @return [Boolean]
#   Whatever the response's own `text?` reports when the parser was built
#   from a response, `true` otherwise.
def text?
    !from_response? || @response.text?
end

#to_absolute(relative_url) ⇒ String

Converts a relative URL to an absolute one.

Parameters:

  • relative_url (String)

    URL to convert to absolute.

Returns:



216
217
218
219
220
221
222
223
224
# File 'lib/arachni/parser.rb', line 216

# Converts a relative URL to an absolute one.
#
# @param relative_url [String] URL to convert to absolute.
# @return [String]
#   Result of the inherited `to_absolute`, resolved against #base or, when
#   that is not available, against `@url`.
def to_absolute( relative_url )
    super( relative_url, base || @url )
end

#ui_formsObject

Dummy method, only the browser can fill this in.



383
384
385
# File 'lib/arachni/parser.rb', line 383

# Placeholder, only the browser can fill this in.
#
# @return [Array] Always a new, empty list.
def ui_forms; []; end

#ui_inputsObject

Dummy method, only the browser can fill this in.



378
379
380
# File 'lib/arachni/parser.rb', line 378

# Placeholder, only the browser can fill this in.
#
# @return [Array] Always a new, empty list.
def ui_inputs; []; end

#xmlsArray<Element::XML>

Returns:



365
366
367
# File 'lib/arachni/parser.rb', line 365

# @return [Array]
#   Cached list holding the element `XML.from_request` builds from the
#   response's request, or an empty list when it returns `nil`.
def xmls
    return @xmls if @xmls

    element = XML.from_request( @url, response.request )
    @xmls   = element.nil? ? [] : [element]
end