Class: ContextDev::Models::WebWebCrawlMdParams

Inherits:
Internal::Type::BaseModel
Extended by:
Internal::Type::RequestParameters::Converter
Includes:
Internal::Type::RequestParameters
Defined in:
lib/context_dev/models/web_web_crawl_md_params.rb

Overview

Instance Attribute Summary

Attributes included from Internal::Type::RequestParameters

#request_options

Instance Method Summary

Methods included from Internal::Type::RequestParameters::Converter

dump_request

Methods included from Internal::Type::RequestParameters

included

Methods inherited from Internal::Type::BaseModel

==, #==, #[], coerce, #deconstruct_keys, #deep_to_h, dump, fields, hash, #hash, inherited, inspect, #inspect, known_fields, optional, recursively_to_h, required, #to_h, #to_json, #to_s, to_sorbet_type, #to_yaml

Methods included from Internal::Type::Converter

#coerce, coerce, #dump, dump, #inspect, inspect, meta_info, new_coerce_state, type_info

Methods included from Internal::Util::SorbetRuntimeSupport

#const_missing, #define_sorbet_constant!, #sorbet_constant_defined?, #to_sorbet_type, to_sorbet_type

Constructor Details

#initialize(url:, follow_subdomains: nil, include_images: nil, include_links: nil, max_age_ms: nil, max_depth: nil, max_pages: nil, parse_pdf: nil, shorten_base64_images: nil, url_regex: nil, use_main_content_only: nil, request_options: {}) ⇒ Object

Some parameter documentation has been truncated; see ContextDev::Models::WebWebCrawlMdParams for more details.

Parameters:

  • url (String)

    The starting URL for the crawl (must include http:// or https:// protocol)

  • follow_subdomains (Boolean) (defaults to: nil)

    When true, follow links on subdomains of the starting URL’s domain (e.g. docs.ex

  • include_images (Boolean) (defaults to: nil)

    Include image references in the Markdown output

  • include_links (Boolean) (defaults to: nil)

    Preserve hyperlinks in the Markdown output

  • max_age_ms (Integer) (defaults to: nil)

    Return a cached result if a prior scrape for the same parameters exists and is y

  • max_depth (Integer) (defaults to: nil)

    Maximum link depth from the starting URL (0 = only the starting page)

  • max_pages (Integer) (defaults to: nil)

    Maximum number of pages to crawl. Hard cap: 500.

  • parse_pdf (Boolean) (defaults to: nil)

    When true (default), PDF pages are fetched and their text layer is extracted and

  • shorten_base64_images (Boolean) (defaults to: nil)

    Truncate base64-encoded image data in the Markdown output

  • url_regex (String) (defaults to: nil)

    Regex pattern. Only URLs matching this pattern will be followed and scraped.

  • use_main_content_only (Boolean) (defaults to: nil)

    Extract only the main content, stripping headers, footers, sidebars, and navigat

  • request_options (ContextDev::RequestOptions, Hash{Symbol=>Object}) (defaults to: {})


# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 83

Instance Attribute Details

#follow_subdomains ⇒ Boolean?

When true, follow links on subdomains of the starting URL’s domain (e.g. docs.example.com when starting from example.com). www and apex are always treated as equivalent.

Returns:

  • (Boolean, nil)


22
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 22

optional :follow_subdomains, ContextDev::Internal::Type::Boolean, api_name: :followSubdomains

#include_images ⇒ Boolean?

Include image references in the Markdown output

Returns:

  • (Boolean, nil)


28
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 28

optional :include_images, ContextDev::Internal::Type::Boolean, api_name: :includeImages

#include_links ⇒ Boolean?

Preserve hyperlinks in the Markdown output

Returns:

  • (Boolean, nil)


34
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 34

optional :include_links, ContextDev::Internal::Type::Boolean, api_name: :includeLinks

#max_age_ms ⇒ Integer?

Return a cached result if a prior scrape for the same parameters exists and is younger than this many milliseconds. Defaults to 1 day (86400000 ms) when omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.

Returns:

  • (Integer, nil)


42
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 42

optional :max_age_ms, Integer, api_name: :maxAgeMs

#max_depth ⇒ Integer?

Maximum link depth from the starting URL (0 = only the starting page)

Returns:

  • (Integer, nil)


48
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 48

optional :max_depth, Integer, api_name: :maxDepth

#max_pages ⇒ Integer?

Maximum number of pages to crawl. Hard cap: 500.

Returns:

  • (Integer, nil)


54
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 54

optional :max_pages, Integer, api_name: :maxPages

#parse_pdf ⇒ Boolean?

When true (default), PDF pages are fetched and their text layer is extracted and converted to Markdown alongside HTML pages. When false, PDF pages are skipped entirely (not included in results and not counted as failures).

Returns:

  • (Boolean, nil)


62
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 62

optional :parse_pdf, ContextDev::Internal::Type::Boolean, api_name: :parsePDF

#shorten_base64_images ⇒ Boolean?

Truncate base64-encoded image data in the Markdown output

Returns:

  • (Boolean, nil)


68
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 68

optional :shorten_base64_images, ContextDev::Internal::Type::Boolean, api_name: :shortenBase64Images

#url ⇒ String

The starting URL for the crawl (must include http:// or https:// protocol)

Returns:

  • (String)


14
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 14

required :url, String

#url_regex ⇒ String?

Regex pattern. Only URLs matching this pattern will be followed and scraped.

Returns:

  • (String, nil)


74
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 74

optional :url_regex, String, api_name: :urlRegex

#use_main_content_only ⇒ Boolean?

Extract only the main content, stripping headers, footers, sidebars, and navigation

Returns:

  • (Boolean, nil)


81
# File 'lib/context_dev/models/web_web_crawl_md_params.rb', line 81

optional :use_main_content_only, ContextDev::Internal::Type::Boolean, api_name: :useMainContentOnly