Class: ContextDev::Models::WebWebCrawlMdParams

Inherits:
Internal::Type::BaseModel show all
Extended by:
Internal::Type::RequestParameters::Converter
Includes:
Internal::Type::RequestParameters
Defined in:
lib/context_dev/models/web_web_crawl_md_params.rb

Overview

Instance Attribute Summary collapse

Attributes included from Internal::Type::RequestParameters

#request_options

Instance Method Summary collapse

Methods included from Internal::Type::RequestParameters::Converter

dump_request

Methods included from Internal::Type::RequestParameters

included

Methods inherited from Internal::Type::BaseModel

==, #==, #[], coerce, #deconstruct_keys, #deep_to_h, dump, fields, hash, #hash, inherited, inspect, #inspect, known_fields, optional, recursively_to_h, required, #to_h, #to_json, #to_s, to_sorbet_type, #to_yaml

Methods included from Internal::Type::Converter

#coerce, coerce, #dump, dump, #inspect, inspect, meta_info, new_coerce_state, type_info

Methods included from Internal::Util::SorbetRuntimeSupport

#const_missing, #define_sorbet_constant!, #sorbet_constant_defined?, #to_sorbet_type, to_sorbet_type

Constructor Details

#initialize(url:, follow_subdomains: nil, include_frames: nil, include_images: nil, include_links: nil, max_age_ms: nil, max_depth: nil, max_pages: nil, parse_pdf: nil, shorten_base64_images: nil, url_regex: nil, use_main_content_only: nil, request_options: {}) ⇒ Object

Some parameter documentation has been truncated, see ContextDev::Models::WebWebCrawlMdParams for more details.

Parameters:

  • url (String)

    The starting URL for the crawl (must include http:// or https:// protocol)

  • follow_subdomains (Boolean) (defaults to: nil)

    When true, follow links on subdomains of the starting URL’s domain (e.g. docs.example.com when starting from example.com). www and apex are always treated as equivalent.

  • include_frames (Boolean) (defaults to: nil)

    When true, the contents of iframes are rendered to Markdown for each crawled page.

  • include_images (Boolean) (defaults to: nil)

    Include image references in the Markdown output

  • include_links (Boolean) (defaults to: nil)

    Preserve hyperlinks in the Markdown output

  • max_age_ms (Integer) (defaults to: nil)

    Return a cached result if a prior scrape for the same parameters exists and is younger than this many milliseconds. Defaults to 1 day (86400000 ms) when omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.

  • max_depth (Integer) (defaults to: nil)

    Maximum link depth from the starting URL (0 = only the starting page)

  • max_pages (Integer) (defaults to: nil)

    Maximum number of pages to crawl. Hard cap: 500.

  • parse_pdf (Boolean) (defaults to: nil)

    When true (default), PDF pages are fetched and their text layer is extracted and converted to Markdown alongside HTML pages. When false, PDF pages are skipped entirely (not included in results and not counted as failures).

  • shorten_base64_images (Boolean) (defaults to: nil)

    Truncate base64-encoded image data in the Markdown output

  • url_regex (String) (defaults to: nil)

    Regex pattern. Only URLs matching this pattern will be followed and scraped.

  • use_main_content_only (Boolean) (defaults to: nil)

    Extract only the main content, stripping headers, footers, sidebars, and navigation

  • request_options (ContextDev::RequestOptions, Hash{Symbol=>Object}) (defaults to: {})


# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 90

Instance Attribute Details

#follow_subdomainsBoolean?

When true, follow links on subdomains of the starting URL’s domain (e.g. docs.example.com when starting from example.com). www and apex are always treated as equivalent.

Returns:

  • (Boolean, nil)


22
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 22

optional :follow_subdomains, ContextDev::Internal::Type::Boolean, api_name: :followSubdomains

#include_framesBoolean?

When true, the contents of iframes are rendered to Markdown for each crawled page.

Returns:

  • (Boolean, nil)


29
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 29

optional :include_frames, ContextDev::Internal::Type::Boolean, api_name: :includeFrames

#include_imagesBoolean?

Include image references in the Markdown output

Returns:

  • (Boolean, nil)


35
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 35

optional :include_images, ContextDev::Internal::Type::Boolean, api_name: :includeImages

#include_linksBoolean?

Preserve hyperlinks in the Markdown output

Returns:

  • (Boolean, nil)


41
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 41

optional :include_links, ContextDev::Internal::Type::Boolean, api_name: :includeLinks

#max_age_msInteger?

Return a cached result if a prior scrape for the same parameters exists and is younger than this many milliseconds. Defaults to 1 day (86400000 ms) when omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.

Returns:

  • (Integer, nil)


49
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 49

optional :max_age_ms, Integer, api_name: :maxAgeMs

#max_depthInteger?

Maximum link depth from the starting URL (0 = only the starting page)

Returns:

  • (Integer, nil)


55
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 55

optional :max_depth, Integer, api_name: :maxDepth

#max_pagesInteger?

Maximum number of pages to crawl. Hard cap: 500.

Returns:

  • (Integer, nil)


61
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 61

optional :max_pages, Integer, api_name: :maxPages

#parse_pdfBoolean?

When true (default), PDF pages are fetched and their text layer is extracted and converted to Markdown alongside HTML pages. When false, PDF pages are skipped entirely (not included in results and not counted as failures).

Returns:

  • (Boolean, nil)


69
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 69

optional :parse_pdf, ContextDev::Internal::Type::Boolean, api_name: :parsePDF

#shorten_base64_imagesBoolean?

Truncate base64-encoded image data in the Markdown output

Returns:

  • (Boolean, nil)


75
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 75

optional :shorten_base64_images, ContextDev::Internal::Type::Boolean, api_name: :shortenBase64Images

#urlString

The starting URL for the crawl (must include http:// or https:// protocol)

Returns:

  • (String)


14
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 14

required :url, String

#url_regexString?

Regex pattern. Only URLs matching this pattern will be followed and scraped.

Returns:

  • (String, nil)


81
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 81

optional :url_regex, String, api_name: :urlRegex

#use_main_content_onlyBoolean?

Extract only the main content, stripping headers, footers, sidebars, and navigation

Returns:

  • (Boolean, nil)


88
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 88

optional :use_main_content_only, ContextDev::Internal::Type::Boolean, api_name: :useMainContentOnly